@tuttiai/core 0.8.0 → 0.9.0

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
package/dist/index.d.ts CHANGED
@@ -1,4 +1,4 @@
1
- import { TuttiHooks, TelemetryConfig, TuttiEventType, TuttiEvent, ScoreConfig, AgentResult, Session, LLMProvider, SessionStore, AgentConfig, ChatMessage, Voice, Permission, BudgetConfig, ChatRequest, ChatResponse, StreamChunk } from '@tuttiai/types';
1
+ import { TuttiHooks, TokenUsage, ScoreConfig, TelemetryConfig, TuttiEventType, TuttiEvent, AgentResult, Session, LLMProvider, SessionStore, AgentConfig, ChatMessage, Voice, Permission, BudgetConfig, ChatRequest, ChatResponse, StreamChunk } from '@tuttiai/types';
2
2
  export { AgentConfig, AgentMemoryConfig, AgentResult, BudgetConfig, ChatMessage, ChatRequest, ChatResponse, ContentBlock, HookContext, LLMProvider, MemoryConfig, Permission, ScoreConfig, Session, SessionStore, StopReason, StreamChunk, TelemetryConfig, TextBlock, TokenUsage, Tool, ToolContext, ToolDefinition, ToolMemoryHelpers, ToolResult, ToolResultBlock, ToolUseBlock, TuttiEvent, TuttiEventHandler, TuttiEventType, TuttiHooks, Voice, VoiceContext } from '@tuttiai/types';
3
3
  import pino from 'pino';
4
4
 
@@ -84,6 +84,76 @@ declare function createBlocklistHook(blockedTools: string[]): TuttiHooks;
84
84
  */
85
85
  declare function createMaxCostHook(maxUsd: number): TuttiHooks;
86
86
 
87
+ /** Evaluation framework types. */
88
+
89
+ interface EvalAssertion {
90
+ type: "contains" | "not_contains" | "matches_regex" | "tool_called" | "tool_not_called" | "turns_lte" | "cost_lte";
91
+ value: string | number;
92
+ description?: string;
93
+ }
94
+ interface EvalCase {
95
+ id: string;
96
+ name: string;
97
+ agent_id: string;
98
+ input: string;
99
+ assertions: EvalAssertion[];
100
+ }
101
+ interface EvalSuite {
102
+ name: string;
103
+ cases: EvalCase[];
104
+ }
105
+ interface AssertionResult {
106
+ assertion: EvalAssertion;
107
+ passed: boolean;
108
+ actual: string | number;
109
+ }
110
+ interface EvalResult {
111
+ case_id: string;
112
+ case_name: string;
113
+ passed: boolean;
114
+ score: number;
115
+ output: string;
116
+ turns: number;
117
+ usage: TokenUsage;
118
+ cost_usd: number;
119
+ duration_ms: number;
120
+ assertions: AssertionResult[];
121
+ error?: string;
122
+ }
123
+ interface EvalSummary {
124
+ total: number;
125
+ passed: number;
126
+ failed: number;
127
+ avg_score: number;
128
+ total_cost_usd: number;
129
+ total_duration_ms: number;
130
+ }
131
+ interface EvalReport {
132
+ suite_name: string;
133
+ results: EvalResult[];
134
+ summary: EvalSummary;
135
+ }
136
+
137
+ /** Evaluation runner — executes test suites against a score. */
138
+
139
+ declare class EvalRunner {
140
+ private runtime;
141
+ constructor(score: ScoreConfig);
142
+ run(suite: EvalSuite): Promise<EvalReport>;
143
+ private runCase;
144
+ private checkAssertion;
145
+ private summarize;
146
+ }
147
+
148
+ /** Evaluation report formatting — table, JSON, Markdown. */
149
+
150
+ /** Print a formatted results table to stdout. */
151
+ declare function printTable(report: EvalReport): void;
152
+ /** Serialize the report to a pretty-printed JSON string for storage or CI. */
153
+ declare function toJSON(report: EvalReport): string;
154
+ /** Convert report to a GitHub-friendly markdown table. */
155
+ declare function toMarkdown(report: EvalReport): string;
156
+
87
157
  declare const createLogger: (name: string) => pino.Logger<never, boolean>;
88
158
  declare const logger: pino.Logger<never, boolean>;
89
159
 
@@ -336,4 +406,4 @@ declare class GeminiProvider implements LLMProvider {
336
406
  stream(request: ChatRequest): AsyncGenerator<StreamChunk>;
337
407
  }
338
408
 
339
- export { AgentNotFoundError, AgentRouter, AgentRunner, AnthropicProvider, type AnthropicProviderOptions, AuthenticationError, BudgetExceededError, ContextWindowError, EventBus, GeminiProvider, type GeminiProviderOptions, InMemorySemanticStore, InMemorySessionStore, type MemoryEntry, OpenAIProvider, type OpenAIProviderOptions, PathTraversalError, PermissionError, PermissionGuard, PostgresSessionStore, PromptGuard, ProviderError, RateLimitError, ScoreLoader, ScoreValidationError, SecretsManager, type SemanticMemoryStore, TokenBudget, ToolTimeoutError, TuttiError, TuttiRuntime, TuttiTracer, UrlValidationError, VoiceError, createBlocklistHook, createCacheHook, createLogger, createLoggingHook, createMaxCostHook, defineScore, initTelemetry, logger, shutdownTelemetry, validateScore };
409
+ export { AgentNotFoundError, AgentRouter, AgentRunner, AnthropicProvider, type AnthropicProviderOptions, AuthenticationError, BudgetExceededError, ContextWindowError, type EvalAssertion, type EvalCase, type EvalReport, type EvalResult, EvalRunner, type EvalSuite, type EvalSummary, EventBus, GeminiProvider, type GeminiProviderOptions, InMemorySemanticStore, InMemorySessionStore, type MemoryEntry, OpenAIProvider, type OpenAIProviderOptions, PathTraversalError, PermissionError, PermissionGuard, PostgresSessionStore, PromptGuard, ProviderError, RateLimitError, ScoreLoader, ScoreValidationError, SecretsManager, type SemanticMemoryStore, TokenBudget, ToolTimeoutError, TuttiError, TuttiRuntime, TuttiTracer, UrlValidationError, VoiceError, createBlocklistHook, createCacheHook, createLogger, createLoggingHook, createMaxCostHook, defineScore, toJSON as evalToJSON, toMarkdown as evalToMarkdown, initTelemetry, logger, printTable as printEvalTable, shutdownTelemetry, validateScore };
package/dist/index.js CHANGED
@@ -122,20 +122,21 @@ Only http:// and https:// URLs to public hosts are allowed.`,
122
122
  // src/hooks/index.ts
123
123
  function createLoggingHook(log) {
124
124
  return {
125
- async beforeLLMCall(ctx, request) {
125
+ beforeLLMCall(ctx, request) {
126
126
  log.info({ agent: ctx.agent_name, turn: ctx.turn, model: request.model }, "LLM call");
127
- return request;
127
+ return Promise.resolve(request);
128
128
  },
129
- async afterLLMCall(ctx, response) {
129
+ afterLLMCall(ctx, response) {
130
130
  log.info({ agent: ctx.agent_name, turn: ctx.turn, usage: response.usage }, "LLM response");
131
+ return Promise.resolve();
131
132
  },
132
- async beforeToolCall(ctx, tool, input) {
133
+ beforeToolCall(ctx, tool, input) {
133
134
  log.info({ agent: ctx.agent_name, tool, input }, "Tool call");
134
- return input;
135
+ return Promise.resolve(input);
135
136
  },
136
- async afterToolCall(ctx, tool, result) {
137
+ afterToolCall(ctx, tool, result) {
137
138
  log.info({ agent: ctx.agent_name, tool, is_error: result.is_error }, "Tool result");
138
- return result;
139
+ return Promise.resolve(result);
139
140
  }
140
141
  };
141
142
  }
@@ -144,150 +145,47 @@ function createCacheHook(store) {
144
145
  return tool + ":" + JSON.stringify(input);
145
146
  }
146
147
  return {
147
- async beforeToolCall(_ctx, tool, input) {
148
+ beforeToolCall(_ctx, tool, input) {
148
149
  const cached = store.get(cacheKey(tool, input));
149
- if (cached) return cached;
150
- return input;
150
+ if (cached) return Promise.resolve(cached);
151
+ return Promise.resolve(input);
151
152
  },
152
- async afterToolCall(_ctx, tool, result) {
153
+ afterToolCall(_ctx, tool, result) {
153
154
  if (!result.is_error) {
154
155
  store.set(cacheKey(tool, result.content), result.content);
155
156
  }
156
- return result;
157
+ return Promise.resolve(result);
157
158
  }
158
159
  };
159
160
  }
160
161
  function createBlocklistHook(blockedTools) {
161
162
  const blocked = new Set(blockedTools);
162
163
  return {
163
- async beforeToolCall(_ctx, tool) {
164
- return !blocked.has(tool);
164
+ beforeToolCall(_ctx, tool) {
165
+ return Promise.resolve(!blocked.has(tool));
165
166
  }
166
167
  };
167
168
  }
168
169
  function createMaxCostHook(maxUsd) {
169
170
  let totalCost = 0;
170
- const INPUT_PER_M = 3;
171
- const OUTPUT_PER_M = 15;
171
+ const INPUT_PER_M2 = 3;
172
+ const OUTPUT_PER_M2 = 15;
172
173
  return {
173
- async afterLLMCall(_ctx, response) {
174
- totalCost += response.usage.input_tokens / 1e6 * INPUT_PER_M + response.usage.output_tokens / 1e6 * OUTPUT_PER_M;
174
+ afterLLMCall(_ctx, response) {
175
+ totalCost += response.usage.input_tokens / 1e6 * INPUT_PER_M2 + response.usage.output_tokens / 1e6 * OUTPUT_PER_M2;
176
+ return Promise.resolve();
175
177
  },
176
- async beforeLLMCall(ctx, request) {
178
+ beforeLLMCall(ctx, request) {
177
179
  if (totalCost >= maxUsd) {
178
- throw new Error(
180
+ return Promise.reject(new Error(
179
181
  "Max cost hook: $" + totalCost.toFixed(4) + " exceeds limit $" + maxUsd.toFixed(2) + " for agent " + ctx.agent_name
180
- );
182
+ ));
181
183
  }
182
- return request;
184
+ return Promise.resolve(request);
183
185
  }
184
186
  };
185
187
  }
186
188
 
187
- // src/logger.ts
188
- import pino from "pino";
189
- var createLogger = (name) => pino({
190
- name,
191
- level: process.env.TUTTI_LOG_LEVEL ?? "info",
192
- transport: process.env.NODE_ENV === "production" ? void 0 : {
193
- target: "pino-pretty",
194
- options: {
195
- colorize: true,
196
- translateTime: "HH:MM:ss",
197
- ignore: "pid,hostname"
198
- }
199
- }
200
- });
201
- var logger = createLogger("tutti");
202
-
203
- // src/telemetry.ts
204
- import { trace, SpanStatusCode } from "@opentelemetry/api";
205
- var tracer = trace.getTracer("tutti", "1.0.0");
206
- var TuttiTracer = {
207
- agentRun(agentName, sessionId, fn) {
208
- return tracer.startActiveSpan("agent.run", async (span) => {
209
- span.setAttribute("agent.name", agentName);
210
- span.setAttribute("session.id", sessionId);
211
- try {
212
- const result = await fn();
213
- span.setStatus({ code: SpanStatusCode.OK });
214
- return result;
215
- } catch (err) {
216
- span.setStatus({
217
- code: SpanStatusCode.ERROR,
218
- message: err instanceof Error ? err.message : String(err)
219
- });
220
- throw err;
221
- } finally {
222
- span.end();
223
- }
224
- });
225
- },
226
- llmCall(model, fn) {
227
- return tracer.startActiveSpan("llm.call", async (span) => {
228
- span.setAttribute("llm.model", model);
229
- try {
230
- const result = await fn();
231
- span.setStatus({ code: SpanStatusCode.OK });
232
- return result;
233
- } catch (err) {
234
- span.setStatus({
235
- code: SpanStatusCode.ERROR,
236
- message: err instanceof Error ? err.message : String(err)
237
- });
238
- throw err;
239
- } finally {
240
- span.end();
241
- }
242
- });
243
- },
244
- toolCall(toolName, fn) {
245
- return tracer.startActiveSpan("tool.call", async (span) => {
246
- span.setAttribute("tool.name", toolName);
247
- try {
248
- const result = await fn();
249
- span.setStatus({ code: SpanStatusCode.OK });
250
- return result;
251
- } catch (err) {
252
- span.setStatus({
253
- code: SpanStatusCode.ERROR,
254
- message: err instanceof Error ? err.message : String(err)
255
- });
256
- throw err;
257
- } finally {
258
- span.end();
259
- }
260
- });
261
- }
262
- };
263
-
264
- // src/telemetry-setup.ts
265
- import { NodeSDK } from "@opentelemetry/sdk-node";
266
- import { OTLPTraceExporter } from "@opentelemetry/exporter-trace-otlp-http";
267
- import { getNodeAutoInstrumentations } from "@opentelemetry/auto-instrumentations-node";
268
- var sdk;
269
- function initTelemetry(config) {
270
- if (!config.enabled || sdk) return;
271
- const endpoint = config.endpoint ?? "http://localhost:4318";
272
- const exporter = new OTLPTraceExporter({
273
- url: `${endpoint}/v1/traces`,
274
- headers: config.headers
275
- });
276
- sdk = new NodeSDK({
277
- traceExporter: exporter,
278
- instrumentations: [getNodeAutoInstrumentations({ "@opentelemetry/instrumentation-fs": { enabled: false } })],
279
- serviceName: process.env.OTEL_SERVICE_NAME ?? "tutti"
280
- });
281
- sdk.start();
282
- logger.info({ endpoint }, "OpenTelemetry tracing enabled");
283
- }
284
- async function shutdownTelemetry() {
285
- if (sdk) {
286
- await sdk.shutdown();
287
- sdk = void 0;
288
- }
289
- }
290
-
291
189
  // src/agent-runner.ts
292
190
  import { z } from "zod";
293
191
  import { zodToJsonSchema } from "zod-to-json-schema";
@@ -415,6 +313,83 @@ var TokenBudget = class {
415
313
  }
416
314
  };
417
315
 
316
+ // src/logger.ts
317
+ import pino from "pino";
318
+ var createLogger = (name) => pino({
319
+ name,
320
+ level: process.env.TUTTI_LOG_LEVEL ?? "info",
321
+ transport: process.env.NODE_ENV === "production" ? void 0 : {
322
+ target: "pino-pretty",
323
+ options: {
324
+ colorize: true,
325
+ translateTime: "HH:MM:ss",
326
+ ignore: "pid,hostname"
327
+ }
328
+ }
329
+ });
330
+ var logger = createLogger("tutti");
331
+
332
+ // src/telemetry.ts
333
+ import { trace, SpanStatusCode } from "@opentelemetry/api";
334
+ var tracer = trace.getTracer("tutti", "1.0.0");
335
+ var TuttiTracer = {
336
+ agentRun(agentName, sessionId, fn) {
337
+ return tracer.startActiveSpan("agent.run", async (span) => {
338
+ span.setAttribute("agent.name", agentName);
339
+ span.setAttribute("session.id", sessionId);
340
+ try {
341
+ const result = await fn();
342
+ span.setStatus({ code: SpanStatusCode.OK });
343
+ return result;
344
+ } catch (err) {
345
+ span.setStatus({
346
+ code: SpanStatusCode.ERROR,
347
+ message: err instanceof Error ? err.message : String(err)
348
+ });
349
+ throw err;
350
+ } finally {
351
+ span.end();
352
+ }
353
+ });
354
+ },
355
+ llmCall(model, fn) {
356
+ return tracer.startActiveSpan("llm.call", async (span) => {
357
+ span.setAttribute("llm.model", model);
358
+ try {
359
+ const result = await fn();
360
+ span.setStatus({ code: SpanStatusCode.OK });
361
+ return result;
362
+ } catch (err) {
363
+ span.setStatus({
364
+ code: SpanStatusCode.ERROR,
365
+ message: err instanceof Error ? err.message : String(err)
366
+ });
367
+ throw err;
368
+ } finally {
369
+ span.end();
370
+ }
371
+ });
372
+ },
373
+ toolCall(toolName, fn) {
374
+ return tracer.startActiveSpan("tool.call", async (span) => {
375
+ span.setAttribute("tool.name", toolName);
376
+ try {
377
+ const result = await fn();
378
+ span.setStatus({ code: SpanStatusCode.OK });
379
+ return result;
380
+ } catch (err) {
381
+ span.setStatus({
382
+ code: SpanStatusCode.ERROR,
383
+ message: err instanceof Error ? err.message : String(err)
384
+ });
385
+ throw err;
386
+ } finally {
387
+ span.end();
388
+ }
389
+ });
390
+ }
391
+ };
392
+
418
393
  // src/agent-runner.ts
419
394
  var DEFAULT_MAX_TURNS = 10;
420
395
  var DEFAULT_MAX_TOOL_CALLS = 20;
@@ -1106,6 +1081,33 @@ var PermissionGuard = class {
1106
1081
  }
1107
1082
  };
1108
1083
 
1084
+ // src/telemetry-setup.ts
1085
+ import { NodeSDK } from "@opentelemetry/sdk-node";
1086
+ import { OTLPTraceExporter } from "@opentelemetry/exporter-trace-otlp-http";
1087
+ import { getNodeAutoInstrumentations } from "@opentelemetry/auto-instrumentations-node";
1088
+ var sdk;
1089
+ function initTelemetry(config) {
1090
+ if (!config.enabled || sdk) return;
1091
+ const endpoint = config.endpoint ?? "http://localhost:4318";
1092
+ const exporter = new OTLPTraceExporter({
1093
+ url: `${endpoint}/v1/traces`,
1094
+ headers: config.headers
1095
+ });
1096
+ sdk = new NodeSDK({
1097
+ traceExporter: exporter,
1098
+ instrumentations: [getNodeAutoInstrumentations({ "@opentelemetry/instrumentation-fs": { enabled: false } })],
1099
+ serviceName: process.env.OTEL_SERVICE_NAME ?? "tutti"
1100
+ });
1101
+ sdk.start();
1102
+ logger.info({ endpoint }, "OpenTelemetry tracing enabled");
1103
+ }
1104
+ async function shutdownTelemetry() {
1105
+ if (sdk) {
1106
+ await sdk.shutdown();
1107
+ sdk = void 0;
1108
+ }
1109
+ }
1110
+
1109
1111
  // src/runtime.ts
1110
1112
  var TuttiRuntime = class _TuttiRuntime {
1111
1113
  events;
@@ -1196,6 +1198,209 @@ Supported: "in-memory", "postgres"`,
1196
1198
  }
1197
1199
  };
1198
1200
 
1201
+ // src/eval/runner.ts
1202
+ var INPUT_PER_M = 3;
1203
+ var OUTPUT_PER_M = 15;
1204
+ function estimateCost(inputTokens, outputTokens) {
1205
+ return inputTokens / 1e6 * INPUT_PER_M + outputTokens / 1e6 * OUTPUT_PER_M;
1206
+ }
1207
+ var EvalRunner = class {
1208
+ runtime;
1209
+ constructor(score) {
1210
+ this.runtime = new TuttiRuntime(score);
1211
+ }
1212
+ async run(suite) {
1213
+ const results = [];
1214
+ for (const testCase of suite.cases) {
1215
+ const result = await this.runCase(testCase);
1216
+ results.push(result);
1217
+ }
1218
+ const summary = this.summarize(results);
1219
+ return { suite_name: suite.name, results, summary };
1220
+ }
1221
+ async runCase(testCase) {
1222
+ const toolsCalled = [];
1223
+ const unsubscribeToolStart = this.runtime.events.on("tool:start", (e) => {
1224
+ toolsCalled.push(e.tool_name);
1225
+ });
1226
+ const start = Date.now();
1227
+ let output = "";
1228
+ let turns = 0;
1229
+ let usage = { input_tokens: 0, output_tokens: 0 };
1230
+ let error;
1231
+ try {
1232
+ const result = await this.runtime.run(testCase.agent_id, testCase.input);
1233
+ output = result.output;
1234
+ turns = result.turns;
1235
+ usage = result.usage;
1236
+ } catch (err) {
1237
+ error = err instanceof Error ? err.message : String(err);
1238
+ output = "[error] " + error;
1239
+ }
1240
+ unsubscribeToolStart();
1241
+ const durationMs = Date.now() - start;
1242
+ const costUsd = estimateCost(usage.input_tokens, usage.output_tokens);
1243
+ const assertionResults = testCase.assertions.map(
1244
+ (assertion) => this.checkAssertion(assertion, output, toolsCalled, turns, costUsd)
1245
+ );
1246
+ const passedCount = assertionResults.filter((a) => a.passed).length;
1247
+ const score = testCase.assertions.length > 0 ? passedCount / testCase.assertions.length : error ? 0 : 1;
1248
+ return {
1249
+ case_id: testCase.id,
1250
+ case_name: testCase.name,
1251
+ passed: assertionResults.every((a) => a.passed) && !error,
1252
+ score,
1253
+ output,
1254
+ turns,
1255
+ usage,
1256
+ cost_usd: costUsd,
1257
+ duration_ms: durationMs,
1258
+ assertions: assertionResults,
1259
+ error
1260
+ };
1261
+ }
1262
+ checkAssertion(assertion, output, toolsCalled, turns, costUsd) {
1263
+ const val = assertion.value;
1264
+ switch (assertion.type) {
1265
+ case "contains":
1266
+ return {
1267
+ assertion,
1268
+ passed: output.toLowerCase().includes(String(val).toLowerCase()),
1269
+ actual: output.slice(0, 200)
1270
+ };
1271
+ case "not_contains":
1272
+ return {
1273
+ assertion,
1274
+ passed: !output.toLowerCase().includes(String(val).toLowerCase()),
1275
+ actual: output.slice(0, 200)
1276
+ };
1277
+ case "matches_regex": {
1278
+ const regex = new RegExp(String(val), "i");
1279
+ return {
1280
+ assertion,
1281
+ passed: regex.test(output),
1282
+ actual: output.slice(0, 200)
1283
+ };
1284
+ }
1285
+ case "tool_called":
1286
+ return {
1287
+ assertion,
1288
+ passed: toolsCalled.includes(String(val)),
1289
+ actual: toolsCalled.join(", ") || "(none)"
1290
+ };
1291
+ case "tool_not_called":
1292
+ return {
1293
+ assertion,
1294
+ passed: !toolsCalled.includes(String(val)),
1295
+ actual: toolsCalled.join(", ") || "(none)"
1296
+ };
1297
+ case "turns_lte":
1298
+ return {
1299
+ assertion,
1300
+ passed: turns <= Number(val),
1301
+ actual: turns
1302
+ };
1303
+ case "cost_lte":
1304
+ return {
1305
+ assertion,
1306
+ passed: costUsd <= Number(val),
1307
+ actual: Number(costUsd.toFixed(4))
1308
+ };
1309
+ default:
1310
+ logger.warn({ type: assertion.type }, "Unknown assertion type");
1311
+ return { assertion, passed: false, actual: "unknown assertion type" };
1312
+ }
1313
+ }
1314
+ summarize(results) {
1315
+ const passed = results.filter((r) => r.passed).length;
1316
+ const scores = results.map((r) => r.score);
1317
+ const avgScore = scores.length > 0 ? scores.reduce((a, b) => a + b, 0) / scores.length : 0;
1318
+ const totalCost = results.reduce((s, r) => s + r.cost_usd, 0);
1319
+ const totalDuration = results.reduce((s, r) => s + r.duration_ms, 0);
1320
+ return {
1321
+ total: results.length,
1322
+ passed,
1323
+ failed: results.length - passed,
1324
+ avg_score: Number(avgScore.toFixed(2)),
1325
+ total_cost_usd: Number(totalCost.toFixed(4)),
1326
+ total_duration_ms: totalDuration
1327
+ };
1328
+ }
1329
+ };
1330
+
1331
+ // src/eval/report.ts
1332
+ function pad(str, len) {
1333
+ return str.length >= len ? str.slice(0, len) : str + " ".repeat(len - str.length);
1334
+ }
1335
+ function printTable(report) {
1336
+ const { results, summary } = report;
1337
+ console.log();
1338
+ console.log(" Eval suite: " + report.suite_name + " (" + summary.total + " cases)");
1339
+ console.log();
1340
+ for (const r of results) {
1341
+ const icon = r.passed ? "\x1B[32m\u2714\x1B[0m" : "\x1B[31m\u2717\x1B[0m";
1342
+ const score = r.score.toFixed(2);
1343
+ const cost = "$" + r.cost_usd.toFixed(3);
1344
+ const line = " " + icon + " " + pad(r.case_id, 10) + " " + pad(r.case_name, 28) + " " + pad(score, 6) + " " + r.turns + " turns " + cost;
1345
+ console.log(line);
1346
+ if (!r.passed) {
1347
+ for (const a of r.assertions) {
1348
+ if (!a.passed) {
1349
+ const desc = a.assertion.description ?? a.assertion.type + ": " + String(a.assertion.value);
1350
+ console.log(" \x1B[31m\u21B3 FAIL: " + desc + " (actual: " + String(a.actual).slice(0, 60) + ")\x1B[0m");
1351
+ }
1352
+ }
1353
+ if (r.error) {
1354
+ console.log(" \x1B[31m\u21B3 ERROR: " + r.error.slice(0, 80) + "\x1B[0m");
1355
+ }
1356
+ }
1357
+ }
1358
+ const pct = summary.total > 0 ? Math.round(summary.passed / summary.total * 100) : 0;
1359
+ console.log();
1360
+ console.log(
1361
+ " Results: " + summary.passed + "/" + summary.total + " passed (" + pct + "%) | Avg: " + summary.avg_score.toFixed(2) + " | Total: $" + summary.total_cost_usd.toFixed(3)
1362
+ );
1363
+ console.log();
1364
+ }
1365
+ function toJSON(report) {
1366
+ return JSON.stringify(report, null, 2);
1367
+ }
1368
+ function toMarkdown(report) {
1369
+ const { results, summary } = report;
1370
+ const lines = [];
1371
+ lines.push("## Eval: " + report.suite_name);
1372
+ lines.push("");
1373
+ lines.push("| Status | ID | Name | Score | Turns | Cost |");
1374
+ lines.push("|--------|-----|------|-------|-------|------|");
1375
+ for (const r of results) {
1376
+ const icon = r.passed ? "pass" : "FAIL";
1377
+ lines.push(
1378
+ "| " + icon + " | " + r.case_id + " | " + r.case_name + " | " + r.score.toFixed(2) + " | " + r.turns + " | $" + r.cost_usd.toFixed(3) + " |"
1379
+ );
1380
+ }
1381
+ lines.push("");
1382
+ const pct = summary.total > 0 ? Math.round(summary.passed / summary.total * 100) : 0;
1383
+ lines.push(
1384
+ "**Results:** " + summary.passed + "/" + summary.total + " passed (" + pct + "%) | Avg score: " + summary.avg_score.toFixed(2) + " | Total cost: $" + summary.total_cost_usd.toFixed(3)
1385
+ );
1386
+ const failed = results.filter((r) => !r.passed);
1387
+ if (failed.length > 0) {
1388
+ lines.push("");
1389
+ lines.push("### Failures");
1390
+ lines.push("");
1391
+ for (const r of failed) {
1392
+ lines.push("**" + r.case_id + "** \u2014 " + r.case_name);
1393
+ for (const a of r.assertions.filter((x) => !x.passed)) {
1394
+ const desc = a.assertion.description ?? a.assertion.type + ": " + String(a.assertion.value);
1395
+ lines.push("- " + desc + " (actual: `" + String(a.actual).slice(0, 80) + "`)");
1396
+ }
1397
+ if (r.error) lines.push("- Error: " + r.error);
1398
+ lines.push("");
1399
+ }
1400
+ }
1401
+ return lines.join("\n");
1402
+ }
1403
+
1199
1404
  // src/agent-router.ts
1200
1405
  import { z as z2 } from "zod";
1201
1406
  var AgentRouter = class {
@@ -2050,6 +2255,7 @@ export {
2050
2255
  AuthenticationError,
2051
2256
  BudgetExceededError,
2052
2257
  ContextWindowError,
2258
+ EvalRunner,
2053
2259
  EventBus,
2054
2260
  GeminiProvider,
2055
2261
  InMemorySemanticStore,
@@ -2078,8 +2284,11 @@ export {
2078
2284
  createLoggingHook,
2079
2285
  createMaxCostHook,
2080
2286
  defineScore,
2287
+ toJSON as evalToJSON,
2288
+ toMarkdown as evalToMarkdown,
2081
2289
  initTelemetry,
2082
2290
  logger,
2291
+ printTable as printEvalTable,
2083
2292
  shutdownTelemetry,
2084
2293
  validateScore
2085
2294
  };