syntropylabs-evalkit 0.1.27 → 0.1.29
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +132 -79
- package/dist/index.d.mts +53 -1
- package/dist/index.d.ts +53 -1
- package/dist/index.js +15 -15
- package/dist/index.mjs +12 -12
- package/package.json +1 -1
package/README.md
CHANGED
|
@@ -1,22 +1,25 @@
|
|
|
1
|
-
# EvalKit TypeScript SDK
|
|
1
|
+
# EvalKit — TypeScript SDK
|
|
2
2
|
|
|
3
|
-
OpenTelemetry-based
|
|
4
|
-
|
|
5
|
-
|
|
6
|
-
|
|
7
|
-
> The Python SDK is published as [`syntropylabs-evalkit`](https://pypi.org/project/syntropylabs-evalkit/) on PyPI.
|
|
8
|
-
|
|
9
|
-
## Install
|
|
3
|
+
OpenTelemetry-based tracing and evaluation for Node.js LLM apps, by
|
|
4
|
+
[Syntropy Labs](https://syntropylabs.ai). A single `init()` call auto-instruments
|
|
5
|
+
your LLM providers, databases, and HTTP clients, then ships traces to the platform.
|
|
10
6
|
|
|
11
7
|
```bash
|
|
12
8
|
npm install syntropylabs-evalkit
|
|
13
9
|
```
|
|
14
10
|
|
|
15
|
-
|
|
11
|
+
> The Python SDK ships as [`syntropylabs-evalkit`](https://pypi.org/project/syntropylabs-evalkit/) on PyPI.
|
|
16
12
|
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
|
|
13
|
+
## Contents
|
|
14
|
+
|
|
15
|
+
- [Quick start](#quick-start)
|
|
16
|
+
- [What gets traced](#what-gets-traced)
|
|
17
|
+
- [Framework middleware](#framework-middleware)
|
|
18
|
+
- [Trace your own code](#trace-your-own-code)
|
|
19
|
+
- [Manual spans](#manual-spans)
|
|
20
|
+
- [Offline evaluation](#offline-evaluation)
|
|
21
|
+
- [Scenario simulation](#scenario-simulation)
|
|
22
|
+
- [Configuration](#configuration)
|
|
20
23
|
|
|
21
24
|
## Quick start
|
|
22
25
|
|
|
@@ -24,18 +27,36 @@ npm install openai @anthropic-ai/sdk
|
|
|
24
27
|
import evalkit from "syntropylabs-evalkit";
|
|
25
28
|
|
|
26
29
|
evalkit.init({
|
|
27
|
-
subscriptionKey: process.env.EVALKIT_SUBSCRIPTION_KEY!,
|
|
30
|
+
subscriptionKey: process.env.EVALKIT_SUBSCRIPTION_KEY!, // Dashboard → Settings → Tracing
|
|
28
31
|
serviceName: "my-service",
|
|
29
32
|
});
|
|
30
33
|
|
|
31
|
-
//
|
|
34
|
+
// Every OpenAI / Anthropic / DB / HTTP call from here on is traced automatically.
|
|
32
35
|
```
|
|
33
36
|
|
|
34
|
-
|
|
37
|
+
Call `init()` as early as possible — at the top of your entrypoint, before other
|
|
38
|
+
modules make requests — so auto-instrumentation can hook the libraries.
|
|
39
|
+
|
|
40
|
+
## What gets traced
|
|
41
|
+
|
|
42
|
+
| Category | Captured automatically |
|
|
43
|
+
| ----------- | ------------------------------------------------------------------- |
|
|
44
|
+
| LLM clients | OpenAI, Anthropic, Bedrock, Cohere, Google, Vertex, LangChain |
|
|
45
|
+
| HTTP | `fetch`, `axios`, `node:http` — method, URL, status, latency |
|
|
46
|
+
| Databases | Postgres, MySQL, MongoDB, Redis — query text + latency |
|
|
47
|
+
| Inbound | Every incoming HTTP request becomes a root trace |
|
|
48
|
+
| Your code | Opt-in per function / tool / class, or whole-app on NestJS |
|
|
49
|
+
|
|
50
|
+
Provider SDKs are **optional peer dependencies** — install only what you use:
|
|
51
|
+
|
|
52
|
+
```bash
|
|
53
|
+
npm install openai @anthropic-ai/sdk
|
|
54
|
+
```
|
|
35
55
|
|
|
36
56
|
## Framework middleware
|
|
37
57
|
|
|
38
|
-
Each incoming request becomes a root trace, with downstream LLM/DB/HTTP calls nested
|
|
58
|
+
Each incoming request becomes a root trace, with downstream LLM/DB/HTTP calls nested
|
|
59
|
+
underneath.
|
|
39
60
|
|
|
40
61
|
```ts
|
|
41
62
|
import express from "express";
|
|
@@ -47,118 +68,150 @@ const app = express();
|
|
|
47
68
|
app.use(evalkit.expressMiddleware());
|
|
48
69
|
```
|
|
49
70
|
|
|
50
|
-
|
|
71
|
+
Adapters: `expressMiddleware()`, `fastifyPlugin()`, `koaMiddleware()`,
|
|
72
|
+
`honoMiddleware()`, `hapiPlugin()`, `createNestjsInterceptor()`.
|
|
51
73
|
|
|
52
|
-
|
|
74
|
+
### NestJS — trace the whole app
|
|
53
75
|
|
|
54
|
-
|
|
55
|
-
|
|
76
|
+
NestJS exposes a DI registry, so the SDK can wrap **every** provider/controller method
|
|
77
|
+
for you. One line in `main.ts` — pass the app; the SDK resolves `DiscoveryService`
|
|
78
|
+
itself (no `@nestjs/core` import needed):
|
|
56
79
|
|
|
57
|
-
|
|
58
|
-
|
|
59
|
-
try {
|
|
60
|
-
await embed(docs);
|
|
61
|
-
end("OK", { "result.count": docs.length });
|
|
62
|
-
} catch (e) {
|
|
63
|
-
end("ERROR", { "error.message": String(e) });
|
|
64
|
-
throw e;
|
|
65
|
-
}
|
|
80
|
+
```ts
|
|
81
|
+
evalkit.init({ subscriptionKey: "tk_live_...", serviceName: "orchestrator" });
|
|
66
82
|
|
|
67
|
-
await
|
|
83
|
+
const app = await NestFactory.create(AppModule);
|
|
84
|
+
await evalkit.enableNestjsAutoTrace(app); // ← the only line you add
|
|
85
|
+
await app.listen(5000);
|
|
68
86
|
```
|
|
69
87
|
|
|
70
|
-
|
|
88
|
+
Route metadata (`@Get`, `@Body`, guards) is preserved, and the call never throws.
|
|
89
|
+
|
|
90
|
+
## Trace your own code
|
|
71
91
|
|
|
72
|
-
Auto-instrumentation covers libraries
|
|
73
|
-
a
|
|
92
|
+
Auto-instrumentation covers libraries. For **your** code, opt in — a function, a tool,
|
|
93
|
+
a class, or a whole service object:
|
|
74
94
|
|
|
75
95
|
```ts
|
|
76
96
|
import evalkit, { Traced } from "syntropylabs-evalkit";
|
|
77
97
|
|
|
78
|
-
|
|
79
|
-
const
|
|
98
|
+
const rankResults = evalkit.traceFunction("rank-results", rank); // → function_call span
|
|
99
|
+
const searchWeb = evalkit.traceTool("search_web", (q: string) => runSearch(q)); // → tool_call span
|
|
80
100
|
|
|
81
|
-
//
|
|
82
|
-
const searchWeb = evalkit.traceTool("search_web", (q: string) => runSearch(q));
|
|
83
|
-
|
|
84
|
-
// Every method of a class, APM-style
|
|
85
|
-
@Traced()
|
|
101
|
+
@Traced() // → every method of the class
|
|
86
102
|
class OrderService {
|
|
87
103
|
place(order: Order) { /* ... */ }
|
|
88
104
|
cancel(id: string) { /* ... */ }
|
|
89
105
|
}
|
|
90
106
|
|
|
91
|
-
// Or one method
|
|
92
|
-
class Service {
|
|
93
|
-
@evalkit.TraceMethod()
|
|
94
|
-
async compute() { /* ... */ }
|
|
95
|
-
}
|
|
96
|
-
|
|
97
107
|
// Every function of a service object (parity with Python's trace_module)
|
|
98
108
|
export const orders = evalkit.traceObject({ place, cancel }, { prefix: "orders" });
|
|
99
109
|
```
|
|
100
110
|
|
|
101
|
-
|
|
111
|
+
> A client-side tool the model calls only shows its **output** if you wrap it with
|
|
112
|
+
> `traceTool` — the SDK sees the model's request, not your function's return value.
|
|
113
|
+
> Server-side tools (e.g. OpenAI `web_search`) and LangChain tools are automatic.
|
|
102
114
|
|
|
103
|
-
|
|
104
|
-
method for you — no per-class decorators. One line in `main.ts`: pass the app and
|
|
105
|
-
the SDK resolves `DiscoveryService` itself (no `@nestjs/core` import needed):
|
|
115
|
+
## Manual spans
|
|
106
116
|
|
|
107
117
|
```ts
|
|
108
|
-
|
|
109
|
-
evalkit.init({ subscriptionKey: "tk_live_…", serviceName: "orchestrator" });
|
|
110
|
-
await evalkit.enableNestjsAutoTrace(app); // ← the only line you add
|
|
111
|
-
await app.listen(5000);
|
|
112
|
-
```
|
|
113
|
-
|
|
114
|
-
Route metadata (`@Get`, `@Body`, guards) is preserved, so routing and auth are
|
|
115
|
-
unaffected. The call never throws — if discovery can't be resolved it's a no-op.
|
|
116
|
-
You can still pass a `DiscoveryService` directly if you prefer.
|
|
118
|
+
import { startSpan } from "syntropylabs-evalkit";
|
|
117
119
|
|
|
118
|
-
|
|
119
|
-
|
|
120
|
-
|
|
120
|
+
const { end } = startSpan("embed-documents", { count: 42 });
|
|
121
|
+
try {
|
|
122
|
+
await embed(docs);
|
|
123
|
+
end("OK", { "result.count": docs.length });
|
|
124
|
+
} catch (e) {
|
|
125
|
+
end("ERROR", { "error.message": String(e) });
|
|
126
|
+
throw e;
|
|
127
|
+
}
|
|
121
128
|
|
|
122
|
-
|
|
123
|
-
|
|
124
|
-
> value. Server-side tools (OpenAI `web_search`, …) and LangChain tools are captured
|
|
125
|
-
> automatically.
|
|
129
|
+
await evalkit.flush(); // force-flush before process exit
|
|
130
|
+
```
|
|
126
131
|
|
|
127
132
|
## Offline evaluation
|
|
128
133
|
|
|
134
|
+
Deterministic, local scoring — runs synchronously and is pushed as an `eval_result` span.
|
|
135
|
+
|
|
129
136
|
```ts
|
|
130
137
|
import { evaluate } from "syntropylabs-evalkit";
|
|
131
138
|
|
|
132
|
-
// Deterministic, local scoring — runs synchronously, pushed as an eval_result span
|
|
133
139
|
const { scores } = evaluate({
|
|
134
140
|
output: "The answer is AWS and Azure.",
|
|
135
141
|
expectedTools: ["search", "summarize"],
|
|
136
142
|
toolCalls: [{ name: "search" }, { name: "summarize" }],
|
|
137
143
|
constraints: { requiredTerms: ["AWS", "Azure"] },
|
|
138
144
|
});
|
|
145
|
+
// → { tool_trajectory: 1, tool_f1: 1, tool_correctness: 1, response_match: 1, constraint_compliance: 1 }
|
|
139
146
|
```
|
|
140
147
|
|
|
141
|
-
## Scenario
|
|
148
|
+
## Scenario simulation
|
|
149
|
+
|
|
150
|
+
Generate synthetic-user scenarios from your agent's prompt and tools, replay each one
|
|
151
|
+
against your real agent, then grade the run with LLM-as-judge evaluators.
|
|
142
152
|
|
|
143
153
|
```ts
|
|
144
154
|
import evalkit from "syntropylabs-evalkit";
|
|
145
155
|
|
|
146
|
-
|
|
147
|
-
|
|
156
|
+
evalkit.init({ subscriptionKey: process.env.EVALKIT_SUBSCRIPTION_KEY!, serviceName: "support-bot" });
|
|
157
|
+
|
|
158
|
+
// 1 — generate scenarios (bring your own key for the generation call)
|
|
159
|
+
const scenarios = await evalkit.generateScenarios({
|
|
160
|
+
agentInstructions: SYSTEM_PROMPT,
|
|
161
|
+
tools: ["search_kb", "lookup_order", "create_ticket"],
|
|
162
|
+
count: 5,
|
|
163
|
+
provider: "anthropic",
|
|
164
|
+
apiKey: process.env.ANTHROPIC_API_KEY,
|
|
165
|
+
});
|
|
166
|
+
|
|
167
|
+
// 2 — replay each scenario against your real agent
|
|
168
|
+
const { simulationId, runId, results } = await evalkit.simulateUser({
|
|
169
|
+
scenarios,
|
|
170
|
+
entrypoint: async (ctx) => {
|
|
171
|
+
const { text, toolCalls } = await runAgent(ctx.sessionId, ctx.message);
|
|
172
|
+
return { text, toolCalls };
|
|
173
|
+
},
|
|
174
|
+
tags: ["ci"],
|
|
175
|
+
});
|
|
176
|
+
|
|
177
|
+
// 3 — grade the run against an evaluator collection (LLM-as-judge, BYOK)
|
|
178
|
+
const result = await evalkit.evaluateSimulation({
|
|
179
|
+
simulationId,
|
|
180
|
+
collectionId: "665f0c...", // Dashboard → Evaluators → Collections
|
|
181
|
+
provider: "openai",
|
|
182
|
+
model: "gpt-4o",
|
|
183
|
+
apiKey: process.env.OPENAI_API_KEY!,
|
|
184
|
+
maxTokens: 1024, // optional judge output cap
|
|
185
|
+
});
|
|
186
|
+
|
|
187
|
+
console.log(result.aggregate); // { averageScore, passRate, ... }
|
|
188
|
+
for (const scn of result.scenarios) {
|
|
189
|
+
console.log(scn.name, scn.overallScore, scn.passed);
|
|
190
|
+
for (const m of scn.metrics) console.log(" -", m.ruleName, m.score, m.reason);
|
|
191
|
+
}
|
|
148
192
|
```
|
|
149
193
|
|
|
194
|
+
`evaluateSimulation` returns per-scenario, per-criterion scores with reasons, which
|
|
195
|
+
also appear in the Tracing dashboard.
|
|
196
|
+
|
|
150
197
|
## Configuration
|
|
151
198
|
|
|
152
|
-
| Option
|
|
153
|
-
|
|
|
154
|
-
| `subscriptionKey`
|
|
155
|
-
| `serviceName`
|
|
156
|
-
| `environment`
|
|
157
|
-
| `baseUrl`
|
|
158
|
-
| `apiUrl`
|
|
159
|
-
| `maxBodyBytes`
|
|
199
|
+
| Option | Description |
|
|
200
|
+
| ----------------- | ------------------------------------------------------------- |
|
|
201
|
+
| `subscriptionKey` | Trace-project subscription key (**required**). |
|
|
202
|
+
| `serviceName` | Logical service name attached to every trace. |
|
|
203
|
+
| `environment` | `"development"` \| `"staging"` \| `"production"`. |
|
|
204
|
+
| `baseUrl` | Override the trace ingest endpoint (defaults to hosted). |
|
|
205
|
+
| `apiUrl` | Override the control-plane endpoint (scenario / simulation). |
|
|
206
|
+
| `maxBodyBytes` | Max captured HTTP body size (default 10 MB). |
|
|
207
|
+
|
|
208
|
+
See the exported `EvalKitOptions` type for the full set (`appVersion`, `deviceId`,
|
|
209
|
+
`debug`, batch tuning).
|
|
210
|
+
|
|
211
|
+
## Links
|
|
160
212
|
|
|
161
|
-
|
|
213
|
+
- Website: https://syntropylabs.ai
|
|
214
|
+
- Documentation: https://syntropylabs.ai/docs
|
|
162
215
|
|
|
163
216
|
## License
|
|
164
217
|
|
package/dist/index.d.mts
CHANGED
|
@@ -146,6 +146,9 @@ interface GenerateScenariosOptions {
|
|
|
146
146
|
provider?: string;
|
|
147
147
|
apiKey?: string;
|
|
148
148
|
temperature?: number;
|
|
149
|
+
reasoningEffort?: string;
|
|
150
|
+
maxCompletionTokens?: number;
|
|
151
|
+
maxTokens?: number;
|
|
149
152
|
apiUrl?: string;
|
|
150
153
|
}
|
|
151
154
|
interface SimulateUserOptions {
|
|
@@ -159,6 +162,53 @@ interface SimulateUserOptions {
|
|
|
159
162
|
provider?: string;
|
|
160
163
|
apiKey?: string;
|
|
161
164
|
}
|
|
165
|
+
interface EvaluateSimulationOptions {
|
|
166
|
+
/** The simulationId returned by simulateUser. */
|
|
167
|
+
simulationId: string;
|
|
168
|
+
/** Mongo id of the evaluator collection whose rules to run. */
|
|
169
|
+
collectionId: string;
|
|
170
|
+
/** Judge provider, e.g. "openai", "anthropic", "google". */
|
|
171
|
+
provider: string;
|
|
172
|
+
/** Judge model id, e.g. "gpt-4o". */
|
|
173
|
+
model: string;
|
|
174
|
+
/** BYOK judge key — used for the judge call only, never stored. */
|
|
175
|
+
apiKey: string;
|
|
176
|
+
/** A specific run of the simulation. Defaults to the most recent run. */
|
|
177
|
+
runId?: string;
|
|
178
|
+
/** Judge output token cap (defaults to the backend's value). */
|
|
179
|
+
maxTokens?: number;
|
|
180
|
+
/** Override the control-plane URL (default: the value from init). */
|
|
181
|
+
apiUrl?: string;
|
|
182
|
+
}
|
|
183
|
+
interface SimEvalMetric {
|
|
184
|
+
ruleId: string;
|
|
185
|
+
ruleName: string;
|
|
186
|
+
category: string;
|
|
187
|
+
score: number;
|
|
188
|
+
passed: boolean;
|
|
189
|
+
reason: string;
|
|
190
|
+
}
|
|
191
|
+
interface SimEvalScenario {
|
|
192
|
+
scenarioId: string;
|
|
193
|
+
name: string;
|
|
194
|
+
traceId: string;
|
|
195
|
+
status: string;
|
|
196
|
+
overallScore: number;
|
|
197
|
+
passed: boolean;
|
|
198
|
+
metrics: SimEvalMetric[];
|
|
199
|
+
error?: string;
|
|
200
|
+
}
|
|
201
|
+
interface EvaluateSimulationResult {
|
|
202
|
+
simulationId: string;
|
|
203
|
+
runId: string;
|
|
204
|
+
scenarios: SimEvalScenario[];
|
|
205
|
+
aggregate: {
|
|
206
|
+
averageScore: number;
|
|
207
|
+
passRate: number;
|
|
208
|
+
totalScenarios: number;
|
|
209
|
+
evaluatedScenarios: number;
|
|
210
|
+
};
|
|
211
|
+
}
|
|
162
212
|
|
|
163
213
|
interface HapiPluginOptions {
|
|
164
214
|
name?: string | ((request: any) => string);
|
|
@@ -366,11 +416,13 @@ declare function simulateUser(opts: SimulateUserOptions): Promise<{
|
|
|
366
416
|
runId: string;
|
|
367
417
|
results: any[];
|
|
368
418
|
}>;
|
|
419
|
+
declare function evaluateSimulation(opts: EvaluateSimulationOptions): Promise<EvaluateSimulationResult>;
|
|
369
420
|
declare const _default: {
|
|
370
421
|
init: typeof init;
|
|
371
422
|
evaluate: typeof evaluate;
|
|
372
423
|
generateScenarios: typeof generateScenarios;
|
|
373
424
|
simulateUser: typeof simulateUser;
|
|
425
|
+
evaluateSimulation: typeof evaluateSimulation;
|
|
374
426
|
patchOpenAIClient: typeof patchOpenAIClient;
|
|
375
427
|
patchAnthropicClient: typeof patchAnthropicClient;
|
|
376
428
|
patchBedrockClient: typeof patchBedrockClient;
|
|
@@ -405,4 +457,4 @@ declare const _default: {
|
|
|
405
457
|
flush: typeof flush;
|
|
406
458
|
};
|
|
407
459
|
|
|
408
|
-
export { type AgentTurnResult, EvalKitClient, EvalKitInterceptor, type EvalKitOptions, type ExpressMiddlewareOptions, type FastifyPluginOptions, type GenerateScenariosOptions, type HapiPluginOptions, type HonoMiddlewareOptions, type KoaMiddlewareOptions, type OfflineEvalInput, type OfflineEvalResult, type OfflineMetric, type SimContext, type SimulateUserOptions, type SpanEvent, type TraceEnvelope, TraceMethod, Traced, createNestjsInterceptor, currentTraceId, _default as default, enableNestjsAutoTrace, evaluate, expressMiddleware, fastifyPlugin, flush, generateScenarios, hapiPlugin, honoMiddleware, init, koaMiddleware, langchainHandler, patchAnthropicClient, patchAnthropicVertexClient, patchAxiosClient, patchBedrockClient, patchCohereClient, patchGoogleAIModel, patchGoogleGenAIModels, patchMongooseClient, patchMysql2Client, patchOpenAIClient, patchPgClient, patchRedisClient, patchVertexGenerativeModel, simulateUser, startHttpTrace, startSpan, startTrace, traceFunction, traceObject, traceTool, withTrace };
|
|
460
|
+
export { type AgentTurnResult, EvalKitClient, EvalKitInterceptor, type EvalKitOptions, type EvaluateSimulationOptions, type EvaluateSimulationResult, type ExpressMiddlewareOptions, type FastifyPluginOptions, type GenerateScenariosOptions, type HapiPluginOptions, type HonoMiddlewareOptions, type KoaMiddlewareOptions, type OfflineEvalInput, type OfflineEvalResult, type OfflineMetric, type SimContext, type SimEvalMetric, type SimEvalScenario, type SimulateUserOptions, type SpanEvent, type TraceEnvelope, TraceMethod, Traced, createNestjsInterceptor, currentTraceId, _default as default, enableNestjsAutoTrace, evaluate, evaluateSimulation, expressMiddleware, fastifyPlugin, flush, generateScenarios, hapiPlugin, honoMiddleware, init, koaMiddleware, langchainHandler, patchAnthropicClient, patchAnthropicVertexClient, patchAxiosClient, patchBedrockClient, patchCohereClient, patchGoogleAIModel, patchGoogleGenAIModels, patchMongooseClient, patchMysql2Client, patchOpenAIClient, patchPgClient, patchRedisClient, patchVertexGenerativeModel, simulateUser, startHttpTrace, startSpan, startTrace, traceFunction, traceObject, traceTool, withTrace };
|
package/dist/index.d.ts
CHANGED
|
@@ -146,6 +146,9 @@ interface GenerateScenariosOptions {
|
|
|
146
146
|
provider?: string;
|
|
147
147
|
apiKey?: string;
|
|
148
148
|
temperature?: number;
|
|
149
|
+
reasoningEffort?: string;
|
|
150
|
+
maxCompletionTokens?: number;
|
|
151
|
+
maxTokens?: number;
|
|
149
152
|
apiUrl?: string;
|
|
150
153
|
}
|
|
151
154
|
interface SimulateUserOptions {
|
|
@@ -159,6 +162,53 @@ interface SimulateUserOptions {
|
|
|
159
162
|
provider?: string;
|
|
160
163
|
apiKey?: string;
|
|
161
164
|
}
|
|
165
|
+
interface EvaluateSimulationOptions {
|
|
166
|
+
/** The simulationId returned by simulateUser. */
|
|
167
|
+
simulationId: string;
|
|
168
|
+
/** Mongo id of the evaluator collection whose rules to run. */
|
|
169
|
+
collectionId: string;
|
|
170
|
+
/** Judge provider, e.g. "openai", "anthropic", "google". */
|
|
171
|
+
provider: string;
|
|
172
|
+
/** Judge model id, e.g. "gpt-4o". */
|
|
173
|
+
model: string;
|
|
174
|
+
/** BYOK judge key — used for the judge call only, never stored. */
|
|
175
|
+
apiKey: string;
|
|
176
|
+
/** A specific run of the simulation. Defaults to the most recent run. */
|
|
177
|
+
runId?: string;
|
|
178
|
+
/** Judge output token cap (defaults to the backend's value). */
|
|
179
|
+
maxTokens?: number;
|
|
180
|
+
/** Override the control-plane URL (default: the value from init). */
|
|
181
|
+
apiUrl?: string;
|
|
182
|
+
}
|
|
183
|
+
interface SimEvalMetric {
|
|
184
|
+
ruleId: string;
|
|
185
|
+
ruleName: string;
|
|
186
|
+
category: string;
|
|
187
|
+
score: number;
|
|
188
|
+
passed: boolean;
|
|
189
|
+
reason: string;
|
|
190
|
+
}
|
|
191
|
+
interface SimEvalScenario {
|
|
192
|
+
scenarioId: string;
|
|
193
|
+
name: string;
|
|
194
|
+
traceId: string;
|
|
195
|
+
status: string;
|
|
196
|
+
overallScore: number;
|
|
197
|
+
passed: boolean;
|
|
198
|
+
metrics: SimEvalMetric[];
|
|
199
|
+
error?: string;
|
|
200
|
+
}
|
|
201
|
+
interface EvaluateSimulationResult {
|
|
202
|
+
simulationId: string;
|
|
203
|
+
runId: string;
|
|
204
|
+
scenarios: SimEvalScenario[];
|
|
205
|
+
aggregate: {
|
|
206
|
+
averageScore: number;
|
|
207
|
+
passRate: number;
|
|
208
|
+
totalScenarios: number;
|
|
209
|
+
evaluatedScenarios: number;
|
|
210
|
+
};
|
|
211
|
+
}
|
|
162
212
|
|
|
163
213
|
interface HapiPluginOptions {
|
|
164
214
|
name?: string | ((request: any) => string);
|
|
@@ -366,11 +416,13 @@ declare function simulateUser(opts: SimulateUserOptions): Promise<{
|
|
|
366
416
|
runId: string;
|
|
367
417
|
results: any[];
|
|
368
418
|
}>;
|
|
419
|
+
declare function evaluateSimulation(opts: EvaluateSimulationOptions): Promise<EvaluateSimulationResult>;
|
|
369
420
|
declare const _default: {
|
|
370
421
|
init: typeof init;
|
|
371
422
|
evaluate: typeof evaluate;
|
|
372
423
|
generateScenarios: typeof generateScenarios;
|
|
373
424
|
simulateUser: typeof simulateUser;
|
|
425
|
+
evaluateSimulation: typeof evaluateSimulation;
|
|
374
426
|
patchOpenAIClient: typeof patchOpenAIClient;
|
|
375
427
|
patchAnthropicClient: typeof patchAnthropicClient;
|
|
376
428
|
patchBedrockClient: typeof patchBedrockClient;
|
|
@@ -405,4 +457,4 @@ declare const _default: {
|
|
|
405
457
|
flush: typeof flush;
|
|
406
458
|
};
|
|
407
459
|
|
|
408
|
-
export { type AgentTurnResult, EvalKitClient, EvalKitInterceptor, type EvalKitOptions, type ExpressMiddlewareOptions, type FastifyPluginOptions, type GenerateScenariosOptions, type HapiPluginOptions, type HonoMiddlewareOptions, type KoaMiddlewareOptions, type OfflineEvalInput, type OfflineEvalResult, type OfflineMetric, type SimContext, type SimulateUserOptions, type SpanEvent, type TraceEnvelope, TraceMethod, Traced, createNestjsInterceptor, currentTraceId, _default as default, enableNestjsAutoTrace, evaluate, expressMiddleware, fastifyPlugin, flush, generateScenarios, hapiPlugin, honoMiddleware, init, koaMiddleware, langchainHandler, patchAnthropicClient, patchAnthropicVertexClient, patchAxiosClient, patchBedrockClient, patchCohereClient, patchGoogleAIModel, patchGoogleGenAIModels, patchMongooseClient, patchMysql2Client, patchOpenAIClient, patchPgClient, patchRedisClient, patchVertexGenerativeModel, simulateUser, startHttpTrace, startSpan, startTrace, traceFunction, traceObject, traceTool, withTrace };
|
|
460
|
+
export { type AgentTurnResult, EvalKitClient, EvalKitInterceptor, type EvalKitOptions, type EvaluateSimulationOptions, type EvaluateSimulationResult, type ExpressMiddlewareOptions, type FastifyPluginOptions, type GenerateScenariosOptions, type HapiPluginOptions, type HonoMiddlewareOptions, type KoaMiddlewareOptions, type OfflineEvalInput, type OfflineEvalResult, type OfflineMetric, type SimContext, type SimEvalMetric, type SimEvalScenario, type SimulateUserOptions, type SpanEvent, type TraceEnvelope, TraceMethod, Traced, createNestjsInterceptor, currentTraceId, _default as default, enableNestjsAutoTrace, evaluate, evaluateSimulation, expressMiddleware, fastifyPlugin, flush, generateScenarios, hapiPlugin, honoMiddleware, init, koaMiddleware, langchainHandler, patchAnthropicClient, patchAnthropicVertexClient, patchAxiosClient, patchBedrockClient, patchCohereClient, patchGoogleAIModel, patchGoogleGenAIModels, patchMongooseClient, patchMysql2Client, patchOpenAIClient, patchPgClient, patchRedisClient, patchVertexGenerativeModel, simulateUser, startHttpTrace, startSpan, startTrace, traceFunction, traceObject, traceTool, withTrace };
|