@rudderjs/ai 1.17.3 → 1.18.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +19 -1274
- package/dist/budget-orm/index.d.ts +1 -95
- package/dist/budget-orm/index.d.ts.map +1 -1
- package/dist/budget-orm/index.js +4 -176
- package/dist/budget-orm/index.js.map +1 -1
- package/dist/chat-mentions.d.ts +1 -58
- package/dist/chat-mentions.d.ts.map +1 -1
- package/dist/chat-mentions.js +4 -80
- package/dist/chat-mentions.js.map +1 -1
- package/dist/commands/ai-eval.d.ts +1 -92
- package/dist/commands/ai-eval.d.ts.map +1 -1
- package/dist/commands/ai-eval.js +4 -377
- package/dist/commands/ai-eval.js.map +1 -1
- package/dist/commands/make-agent.d.ts +1 -2
- package/dist/commands/make-agent.d.ts.map +1 -1
- package/dist/commands/make-agent.js +4 -22
- package/dist/commands/make-agent.js.map +1 -1
- package/dist/computer-use/index.d.ts +1 -52
- package/dist/computer-use/index.d.ts.map +1 -1
- package/dist/computer-use/index.js +4 -50
- package/dist/computer-use/index.js.map +1 -1
- package/dist/conversation-orm/index.d.ts +1 -108
- package/dist/conversation-orm/index.d.ts.map +1 -1
- package/dist/conversation-orm/index.js +4 -214
- package/dist/conversation-orm/index.js.map +1 -1
- package/dist/doctor.d.ts +1 -1
- package/dist/doctor.d.ts.map +1 -1
- package/dist/doctor.js +4 -65
- package/dist/doctor.js.map +1 -1
- package/dist/eval/index.d.ts +1 -270
- package/dist/eval/index.d.ts.map +1 -1
- package/dist/eval/index.js +4 -509
- package/dist/eval/index.js.map +1 -1
- package/dist/gateway/index.d.ts +1 -10
- package/dist/gateway/index.d.ts.map +1 -1
- package/dist/gateway/index.js +4 -10
- package/dist/gateway/index.js.map +1 -1
- package/dist/index.d.ts +1 -66
- package/dist/index.d.ts.map +1 -1
- package/dist/index.js +4 -78
- package/dist/index.js.map +1 -1
- package/dist/mcp/index.d.ts +1 -15
- package/dist/mcp/index.d.ts.map +1 -1
- package/dist/mcp/index.js +4 -14
- package/dist/mcp/index.js.map +1 -1
- package/dist/memory-embedding/index.d.ts +1 -120
- package/dist/memory-embedding/index.d.ts.map +1 -1
- package/dist/memory-embedding/index.js +4 -228
- package/dist/memory-embedding/index.js.map +1 -1
- package/dist/memory-orm/index.d.ts +1 -117
- package/dist/memory-orm/index.d.ts.map +1 -1
- package/dist/memory-orm/index.js +4 -186
- package/dist/memory-orm/index.js.map +1 -1
- package/dist/node/index.d.ts +1 -2
- package/dist/node/index.d.ts.map +1 -1
- package/dist/node/index.js +4 -2
- package/dist/node/index.js.map +1 -1
- package/dist/observers.d.ts +1 -129
- package/dist/observers.d.ts.map +1 -1
- package/dist/observers.js +4 -39
- package/dist/observers.js.map +1 -1
- package/dist/react/index.d.ts +1 -15
- package/dist/react/index.d.ts.map +1 -1
- package/dist/react/index.js +4 -15
- package/dist/react/index.js.map +1 -1
- package/dist/server/index.d.ts +1 -1
- package/dist/server/index.d.ts.map +1 -1
- package/dist/server/index.js +4 -1
- package/dist/server/index.js.map +1 -1
- package/package.json +9 -13
- package/boost/guidelines.md +0 -260
- package/boost/skills/ai-agents/SKILL.md +0 -240
- package/boost/skills/ai-tools/SKILL.md +0 -260
- package/dist/agent-run-store.d.ts +0 -161
- package/dist/agent-run-store.d.ts.map +0 -1
- package/dist/agent-run-store.js +0 -98
- package/dist/agent-run-store.js.map +0 -1
- package/dist/agent-sse.d.ts +0 -153
- package/dist/agent-sse.d.ts.map +0 -1
- package/dist/agent-sse.js +0 -282
- package/dist/agent-sse.js.map +0 -1
- package/dist/agent.d.ts +0 -508
- package/dist/agent.d.ts.map +0 -1
- package/dist/agent.js +0 -1538
- package/dist/agent.js.map +0 -1
- package/dist/attachment.d.ts +0 -31
- package/dist/attachment.d.ts.map +0 -1
- package/dist/attachment.js +0 -89
- package/dist/attachment.js.map +0 -1
- package/dist/audio.d.ts +0 -45
- package/dist/audio.d.ts.map +0 -1
- package/dist/audio.js +0 -93
- package/dist/audio.js.map +0 -1
- package/dist/base64.d.ts +0 -7
- package/dist/base64.d.ts.map +0 -1
- package/dist/base64.js +0 -39
- package/dist/base64.js.map +0 -1
- package/dist/budget/pricing.d.ts +0 -124
- package/dist/budget/pricing.d.ts.map +0 -1
- package/dist/budget/pricing.js +0 -175
- package/dist/budget/pricing.js.map +0 -1
- package/dist/budget/storage.d.ts +0 -104
- package/dist/budget/storage.d.ts.map +0 -1
- package/dist/budget/storage.js +0 -0
- package/dist/budget/storage.js.map +0 -1
- package/dist/budget/with-budget.d.ts +0 -119
- package/dist/budget/with-budget.d.ts.map +0 -1
- package/dist/budget/with-budget.js +0 -175
- package/dist/budget/with-budget.js.map +0 -1
- package/dist/cached-embedding.d.ts +0 -14
- package/dist/cached-embedding.d.ts.map +0 -1
- package/dist/cached-embedding.js +0 -44
- package/dist/cached-embedding.js.map +0 -1
- package/dist/computer-use/actions.d.ts +0 -214
- package/dist/computer-use/actions.d.ts.map +0 -1
- package/dist/computer-use/actions.js +0 -48
- package/dist/computer-use/actions.js.map +0 -1
- package/dist/computer-use/errors.d.ts +0 -57
- package/dist/computer-use/errors.d.ts.map +0 -1
- package/dist/computer-use/errors.js +0 -76
- package/dist/computer-use/errors.js.map +0 -1
- package/dist/computer-use/playwright.d.ts +0 -76
- package/dist/computer-use/playwright.d.ts.map +0 -1
- package/dist/computer-use/playwright.js +0 -270
- package/dist/computer-use/playwright.js.map +0 -1
- package/dist/computer-use/tool.d.ts +0 -154
- package/dist/computer-use/tool.d.ts.map +0 -1
- package/dist/computer-use/tool.js +0 -210
- package/dist/computer-use/tool.js.map +0 -1
- package/dist/continuation-validation.d.ts +0 -85
- package/dist/continuation-validation.d.ts.map +0 -1
- package/dist/continuation-validation.js +0 -166
- package/dist/continuation-validation.js.map +0 -1
- package/dist/conversation-persistence.d.ts +0 -46
- package/dist/conversation-persistence.d.ts.map +0 -1
- package/dist/conversation-persistence.js +0 -176
- package/dist/conversation-persistence.js.map +0 -1
- package/dist/conversation.d.ts +0 -11
- package/dist/conversation.d.ts.map +0 -1
- package/dist/conversation.js +0 -55
- package/dist/conversation.js.map +0 -1
- package/dist/eval/fixtures.d.ts +0 -65
- package/dist/eval/fixtures.d.ts.map +0 -1
- package/dist/eval/fixtures.js +0 -110
- package/dist/eval/fixtures.js.map +0 -1
- package/dist/eval/html-reporter.d.ts +0 -25
- package/dist/eval/html-reporter.d.ts.map +0 -1
- package/dist/eval/html-reporter.js +0 -209
- package/dist/eval/html-reporter.js.map +0 -1
- package/dist/eval/json-reporter.d.ts +0 -43
- package/dist/eval/json-reporter.d.ts.map +0 -1
- package/dist/eval/json-reporter.js +0 -40
- package/dist/eval/json-reporter.js.map +0 -1
- package/dist/facade.d.ts +0 -96
- package/dist/facade.d.ts.map +0 -1
- package/dist/facade.js +0 -146
- package/dist/facade.js.map +0 -1
- package/dist/fake.d.ts +0 -201
- package/dist/fake.d.ts.map +0 -1
- package/dist/fake.js +0 -428
- package/dist/fake.js.map +0 -1
- package/dist/file-search.d.ts +0 -168
- package/dist/file-search.d.ts.map +0 -1
- package/dist/file-search.js +0 -158
- package/dist/file-search.js.map +0 -1
- package/dist/files.d.ts +0 -27
- package/dist/files.d.ts.map +0 -1
- package/dist/files.js +0 -44
- package/dist/files.js.map +0 -1
- package/dist/gateway/http-gateway-adapter.d.ts +0 -94
- package/dist/gateway/http-gateway-adapter.d.ts.map +0 -1
- package/dist/gateway/http-gateway-adapter.js +0 -106
- package/dist/gateway/http-gateway-adapter.js.map +0 -1
- package/dist/gateway/sse.d.ts +0 -28
- package/dist/gateway/sse.d.ts.map +0 -1
- package/dist/gateway/sse.js +0 -78
- package/dist/gateway/sse.js.map +0 -1
- package/dist/handoff.d.ts +0 -95
- package/dist/handoff.d.ts.map +0 -1
- package/dist/handoff.js +0 -78
- package/dist/handoff.js.map +0 -1
- package/dist/handoffs-driver.d.ts +0 -58
- package/dist/handoffs-driver.d.ts.map +0 -1
- package/dist/handoffs-driver.js +0 -103
- package/dist/handoffs-driver.js.map +0 -1
- package/dist/image.d.ts +0 -40
- package/dist/image.d.ts.map +0 -1
- package/dist/image.js +0 -109
- package/dist/image.js.map +0 -1
- package/dist/mcp/client-tools.d.ts +0 -39
- package/dist/mcp/client-tools.d.ts.map +0 -1
- package/dist/mcp/client-tools.js +0 -147
- package/dist/mcp/client-tools.js.map +0 -1
- package/dist/mcp/server-from-agent.d.ts +0 -24
- package/dist/mcp/server-from-agent.d.ts.map +0 -1
- package/dist/mcp/server-from-agent.js +0 -113
- package/dist/mcp/server-from-agent.js.map +0 -1
- package/dist/mcp/types.d.ts +0 -64
- package/dist/mcp/types.d.ts.map +0 -1
- package/dist/mcp/types.js +0 -6
- package/dist/mcp/types.js.map +0 -1
- package/dist/memory-extract.d.ts +0 -60
- package/dist/memory-extract.d.ts.map +0 -1
- package/dist/memory-extract.js +0 -163
- package/dist/memory-extract.js.map +0 -1
- package/dist/memory-inject.d.ts +0 -39
- package/dist/memory-inject.d.ts.map +0 -1
- package/dist/memory-inject.js +0 -135
- package/dist/memory-inject.js.map +0 -1
- package/dist/memory.d.ts +0 -55
- package/dist/memory.d.ts.map +0 -1
- package/dist/memory.js +0 -132
- package/dist/memory.js.map +0 -1
- package/dist/middleware.d.ts +0 -18
- package/dist/middleware.d.ts.map +0 -1
- package/dist/middleware.js +0 -72
- package/dist/middleware.js.map +0 -1
- package/dist/node/attachment.d.ts +0 -6
- package/dist/node/attachment.d.ts.map +0 -1
- package/dist/node/attachment.js +0 -35
- package/dist/node/attachment.js.map +0 -1
- package/dist/node/transcription.d.ts +0 -4
- package/dist/node/transcription.d.ts.map +0 -1
- package/dist/node/transcription.js +0 -8
- package/dist/node/transcription.js.map +0 -1
- package/dist/output.d.ts +0 -22
- package/dist/output.d.ts.map +0 -1
- package/dist/output.js +0 -60
- package/dist/output.js.map +0 -1
- package/dist/provider-tools.d.ts +0 -87
- package/dist/provider-tools.d.ts.map +0 -1
- package/dist/provider-tools.js +0 -189
- package/dist/provider-tools.js.map +0 -1
- package/dist/providers/anthropic.d.ts +0 -24
- package/dist/providers/anthropic.d.ts.map +0 -1
- package/dist/providers/anthropic.js +0 -405
- package/dist/providers/anthropic.js.map +0 -1
- package/dist/providers/azure.d.ts +0 -13
- package/dist/providers/azure.d.ts.map +0 -1
- package/dist/providers/azure.js +0 -15
- package/dist/providers/azure.js.map +0 -1
- package/dist/providers/bedrock.d.ts +0 -75
- package/dist/providers/bedrock.d.ts.map +0 -1
- package/dist/providers/bedrock.js +0 -181
- package/dist/providers/bedrock.js.map +0 -1
- package/dist/providers/cohere.d.ts +0 -13
- package/dist/providers/cohere.d.ts.map +0 -1
- package/dist/providers/cohere.js +0 -87
- package/dist/providers/cohere.js.map +0 -1
- package/dist/providers/deepseek.d.ts +0 -12
- package/dist/providers/deepseek.d.ts.map +0 -1
- package/dist/providers/deepseek.js +0 -15
- package/dist/providers/deepseek.js.map +0 -1
- package/dist/providers/elevenlabs.d.ts +0 -98
- package/dist/providers/elevenlabs.d.ts.map +0 -1
- package/dist/providers/elevenlabs.js +0 -229
- package/dist/providers/elevenlabs.js.map +0 -1
- package/dist/providers/google-cache-registry.d.ts +0 -132
- package/dist/providers/google-cache-registry.d.ts.map +0 -1
- package/dist/providers/google-cache-registry.js +0 -209
- package/dist/providers/google-cache-registry.js.map +0 -1
- package/dist/providers/google.d.ts +0 -38
- package/dist/providers/google.d.ts.map +0 -1
- package/dist/providers/google.js +0 -903
- package/dist/providers/google.js.map +0 -1
- package/dist/providers/groq.d.ts +0 -12
- package/dist/providers/groq.d.ts.map +0 -1
- package/dist/providers/groq.js +0 -15
- package/dist/providers/groq.js.map +0 -1
- package/dist/providers/jina.d.ts +0 -13
- package/dist/providers/jina.d.ts.map +0 -1
- package/dist/providers/jina.js +0 -90
- package/dist/providers/jina.js.map +0 -1
- package/dist/providers/mistral.d.ts +0 -13
- package/dist/providers/mistral.d.ts.map +0 -1
- package/dist/providers/mistral.js +0 -46
- package/dist/providers/mistral.js.map +0 -1
- package/dist/providers/ollama.d.ts +0 -11
- package/dist/providers/ollama.d.ts.map +0 -1
- package/dist/providers/ollama.js +0 -15
- package/dist/providers/ollama.js.map +0 -1
- package/dist/providers/openai.d.ts +0 -79
- package/dist/providers/openai.d.ts.map +0 -1
- package/dist/providers/openai.js +0 -792
- package/dist/providers/openai.js.map +0 -1
- package/dist/providers/openrouter.d.ts +0 -43
- package/dist/providers/openrouter.d.ts.map +0 -1
- package/dist/providers/openrouter.js +0 -21
- package/dist/providers/openrouter.js.map +0 -1
- package/dist/providers/voyage.d.ts +0 -91
- package/dist/providers/voyage.d.ts.map +0 -1
- package/dist/providers/voyage.js +0 -166
- package/dist/providers/voyage.js.map +0 -1
- package/dist/providers/xai.d.ts +0 -12
- package/dist/providers/xai.d.ts.map +0 -1
- package/dist/providers/xai.js +0 -15
- package/dist/providers/xai.js.map +0 -1
- package/dist/queue-job.d.ts +0 -100
- package/dist/queue-job.d.ts.map +0 -1
- package/dist/queue-job.js +0 -185
- package/dist/queue-job.js.map +0 -1
- package/dist/react/agent-run.d.ts +0 -111
- package/dist/react/agent-run.d.ts.map +0 -1
- package/dist/react/agent-run.js +0 -107
- package/dist/react/agent-run.js.map +0 -1
- package/dist/react/useAgentRun.d.ts +0 -68
- package/dist/react/useAgentRun.d.ts.map +0 -1
- package/dist/react/useAgentRun.js +0 -125
- package/dist/react/useAgentRun.js.map +0 -1
- package/dist/registry.d.ts +0 -45
- package/dist/registry.d.ts.map +0 -1
- package/dist/registry.js +0 -131
- package/dist/registry.js.map +0 -1
- package/dist/rerank.d.ts +0 -20
- package/dist/rerank.d.ts.map +0 -1
- package/dist/rerank.js +0 -40
- package/dist/rerank.js.map +0 -1
- package/dist/resume-approval.d.ts +0 -30
- package/dist/resume-approval.d.ts.map +0 -1
- package/dist/resume-approval.js +0 -147
- package/dist/resume-approval.js.map +0 -1
- package/dist/sanitize-conversation.d.ts +0 -43
- package/dist/sanitize-conversation.d.ts.map +0 -1
- package/dist/sanitize-conversation.js +0 -85
- package/dist/sanitize-conversation.js.map +0 -1
- package/dist/scoped-tool.d.ts +0 -98
- package/dist/scoped-tool.d.ts.map +0 -1
- package/dist/scoped-tool.js +0 -174
- package/dist/scoped-tool.js.map +0 -1
- package/dist/server/provider.d.ts +0 -22
- package/dist/server/provider.d.ts.map +0 -1
- package/dist/server/provider.js +0 -194
- package/dist/server/provider.js.map +0 -1
- package/dist/similarity-search.d.ts +0 -163
- package/dist/similarity-search.d.ts.map +0 -1
- package/dist/similarity-search.js +0 -147
- package/dist/similarity-search.js.map +0 -1
- package/dist/sub-agent-run-store.d.ts +0 -157
- package/dist/sub-agent-run-store.d.ts.map +0 -1
- package/dist/sub-agent-run-store.js +0 -87
- package/dist/sub-agent-run-store.js.map +0 -1
- package/dist/tool-execution.d.ts +0 -16
- package/dist/tool-execution.d.ts.map +0 -1
- package/dist/tool-execution.js +0 -498
- package/dist/tool-execution.js.map +0 -1
- package/dist/tool-helpers.d.ts +0 -77
- package/dist/tool-helpers.d.ts.map +0 -1
- package/dist/tool-helpers.js +0 -117
- package/dist/tool-helpers.js.map +0 -1
- package/dist/tool.d.ts +0 -216
- package/dist/tool.d.ts.map +0 -1
- package/dist/tool.js +0 -175
- package/dist/tool.js.map +0 -1
- package/dist/transcription.d.ts +0 -42
- package/dist/transcription.d.ts.map +0 -1
- package/dist/transcription.js +0 -77
- package/dist/transcription.js.map +0 -1
- package/dist/types.d.ts +0 -1020
- package/dist/types.d.ts.map +0 -1
- package/dist/types.js +0 -2
- package/dist/types.js.map +0 -1
- package/dist/util/hash.d.ts +0 -11
- package/dist/util/hash.d.ts.map +0 -1
- package/dist/util/hash.js +0 -23
- package/dist/util/hash.js.map +0 -1
- package/dist/vector-stores/index.d.ts +0 -96
- package/dist/vector-stores/index.d.ts.map +0 -1
- package/dist/vector-stores/index.js +0 -153
- package/dist/vector-stores/index.js.map +0 -1
- package/dist/vercel-protocol.d.ts +0 -18
- package/dist/vercel-protocol.d.ts.map +0 -1
- package/dist/vercel-protocol.js +0 -75
- package/dist/vercel-protocol.js.map +0 -1
- package/dist/zod-to-json-schema.d.ts +0 -16
- package/dist/zod-to-json-schema.d.ts.map +0 -1
- package/dist/zod-to-json-schema.js +0 -17
- package/dist/zod-to-json-schema.js.map +0 -1
package/dist/eval/index.d.ts
CHANGED
|
@@ -1,271 +1,2 @@
|
|
|
1
|
-
|
|
2
|
-
* `@rudderjs/ai/eval` — built-in eval framework for #A5 Phase 1.
|
|
3
|
-
*
|
|
4
|
-
* Define a suite of input cases + assertions, run them against any
|
|
5
|
-
* `Agent`, get a console report with pass/fail + cost + tokens. Same
|
|
6
|
-
* `Agent` instances as your app code — one source of truth.
|
|
7
|
-
*
|
|
8
|
-
* @example
|
|
9
|
-
* ```ts
|
|
10
|
-
* // evals/support-agent.eval.ts
|
|
11
|
-
* import { evalSuite, llmJudge, exactMatch, regex } from '@rudderjs/ai/eval'
|
|
12
|
-
* import { SupportAgent } from '../app/Agents/SupportAgent.js'
|
|
13
|
-
*
|
|
14
|
-
* export default evalSuite('SupportAgent', {
|
|
15
|
-
* agent: () => new SupportAgent(),
|
|
16
|
-
* cases: [
|
|
17
|
-
* { name: 'password reset', input: 'How do I reset my password?',
|
|
18
|
-
* assert: llmJudge('mentions a password reset link') },
|
|
19
|
-
* { name: 'price', input: 'How much?', assert: exactMatch('$99/month') },
|
|
20
|
-
* { name: 'support email', input: 'Contact?', assert: regex(/support@/) },
|
|
21
|
-
* ],
|
|
22
|
-
* })
|
|
23
|
-
* ```
|
|
24
|
-
*
|
|
25
|
-
* Run programmatically via `runSuite(suite)` from this entry, or via
|
|
26
|
-
* `pnpm rudder ai:eval` once Phase 2 lands.
|
|
27
|
-
*
|
|
28
|
-
* Built-in metrics: `exactMatch`, `regex`, `llmJudge`, `jsonShape`,
|
|
29
|
-
* `semanticMatch`, `tokenCost`. Compose multiple via `compose(...)`.
|
|
30
|
-
* User-defined metrics work today — any `(response, ctx) =>
|
|
31
|
-
* MetricResult` qualifies.
|
|
32
|
-
*/
|
|
33
|
-
import type { Agent } from '../agent.js';
|
|
34
|
-
import type { AgentResponse } from '../types.js';
|
|
35
|
-
import { z } from 'zod';
|
|
36
|
-
export { reportJson } from './json-reporter.js';
|
|
37
|
-
export type { SuiteJson, SuiteJsonCase } from './json-reporter.js';
|
|
38
|
-
export { stepsFromResponse } from './fixtures.js';
|
|
39
|
-
export type { EvalFixture } from './fixtures.js';
|
|
40
|
-
export { reportHtml } from './html-reporter.js';
|
|
41
|
-
export type { HtmlReportOptions } from './html-reporter.js';
|
|
42
|
-
/**
|
|
43
|
-
* Result of a single assertion. `pass` is the only required field;
|
|
44
|
-
* `score` (0..1) and `reason` are surfaced in reports.
|
|
45
|
-
*/
|
|
46
|
-
export interface MetricResult {
|
|
47
|
-
pass: boolean;
|
|
48
|
-
score?: number;
|
|
49
|
-
reason?: string;
|
|
50
|
-
}
|
|
51
|
-
/**
|
|
52
|
-
* Assertion signature. Sync or async; the runner awaits both.
|
|
53
|
-
*
|
|
54
|
-
* `ctx` carries the case context so user metrics can opt into the
|
|
55
|
-
* input/case-name (e.g. for logging). The built-ins ignore it.
|
|
56
|
-
*/
|
|
57
|
-
export type Metric = (response: AgentResponse, ctx: MetricContext) => MetricResult | Promise<MetricResult>;
|
|
58
|
-
export interface MetricContext {
|
|
59
|
-
/** The case's input string (the same passed to `agent.prompt`). */
|
|
60
|
-
input: string;
|
|
61
|
-
/** Optional case `name` if set on the spec. */
|
|
62
|
-
caseName: string;
|
|
63
|
-
}
|
|
64
|
-
/** A single eval case. */
|
|
65
|
-
export interface EvalCase {
|
|
66
|
-
/** Stable identifier used in reports. Defaults to `case-<index>`. */
|
|
67
|
-
name?: string;
|
|
68
|
-
/** Input passed to `agent.prompt(input)`. */
|
|
69
|
-
input: string;
|
|
70
|
-
/** The assertion. Pass-fail + optional score/reason. */
|
|
71
|
-
assert: Metric;
|
|
72
|
-
/**
|
|
73
|
-
* Per-case agent override. When set, replaces the suite-level
|
|
74
|
-
* `agent` factory for this case (e.g. swap models for a stress
|
|
75
|
-
* test).
|
|
76
|
-
*/
|
|
77
|
-
agent?: () => Agent;
|
|
78
|
-
/**
|
|
79
|
-
* Per-case timeout in ms. Defaults to the suite-level timeout
|
|
80
|
-
* (or no timeout if neither is set).
|
|
81
|
-
*/
|
|
82
|
-
timeout?: number;
|
|
83
|
-
/**
|
|
84
|
-
* Skip this case. Pass `true` to silently skip, or a string for
|
|
85
|
-
* a reason that surfaces in the report.
|
|
86
|
-
*/
|
|
87
|
-
skip?: boolean | string;
|
|
88
|
-
}
|
|
89
|
-
export interface EvalSuiteSpec {
|
|
90
|
-
/** Factory for the agent under test. Called once per case. */
|
|
91
|
-
agent: () => Agent;
|
|
92
|
-
/** The cases to run. */
|
|
93
|
-
cases: EvalCase[];
|
|
94
|
-
/**
|
|
95
|
-
* Suite-wide timeout in ms applied to every case unless the case
|
|
96
|
-
* overrides. Throws cause `pass: false` with the timeout message.
|
|
97
|
-
*/
|
|
98
|
-
timeout?: number;
|
|
99
|
-
/**
|
|
100
|
-
* Optional ownership / context surfaced in the HTML report (#A5
|
|
101
|
-
* Phase 5). Well-known keys (`owner`, `lastReviewed`, `ticket`)
|
|
102
|
-
* get formatted headings; any extra string keys render as a
|
|
103
|
-
* generic key/value row so teams can attach their own metadata.
|
|
104
|
-
*/
|
|
105
|
-
metadata?: EvalMetadata;
|
|
106
|
-
}
|
|
107
|
-
export interface EvalMetadata {
|
|
108
|
-
owner?: string;
|
|
109
|
-
lastReviewed?: string;
|
|
110
|
-
ticket?: string;
|
|
111
|
-
[key: string]: string | undefined;
|
|
112
|
-
}
|
|
113
|
-
export interface EvalSuite {
|
|
114
|
-
name: string;
|
|
115
|
-
spec: EvalSuiteSpec;
|
|
116
|
-
}
|
|
117
|
-
/** Per-case run record collected by {@link runSuite}. */
|
|
118
|
-
export interface CaseResult {
|
|
119
|
-
name: string;
|
|
120
|
-
/** Final result; `'skipped'` skips assertion + cost. */
|
|
121
|
-
status: 'passed' | 'failed' | 'skipped';
|
|
122
|
-
metric?: MetricResult;
|
|
123
|
-
/** Skip reason (when `status === 'skipped'`). */
|
|
124
|
-
reason?: string;
|
|
125
|
-
/** Wall-clock ms for the agent call + assertion. */
|
|
126
|
-
duration: number;
|
|
127
|
-
/**
|
|
128
|
-
* Token usage from the agent's `prompt()` (zero on skip / failure
|
|
129
|
-
* before the call). Includes BOTH the agent under test AND any
|
|
130
|
-
* judge-model calls the assertion made.
|
|
131
|
-
*/
|
|
132
|
-
tokens: number;
|
|
133
|
-
/** USD estimate (see {@link estimateCost}; zero on skip). */
|
|
134
|
-
cost: number;
|
|
135
|
-
/**
|
|
136
|
-
* The case's input string, copied through from `EvalCase.input`
|
|
137
|
-
* for reporters that want to render the prompt alongside the
|
|
138
|
-
* response (#A5 Phase 5 HTML report). Always present — runners
|
|
139
|
-
* always know the input.
|
|
140
|
-
*/
|
|
141
|
-
input: string;
|
|
142
|
-
/**
|
|
143
|
-
* The agent's final assistant text. Absent when the case skipped
|
|
144
|
-
* or the agent threw before producing a response. The HTML
|
|
145
|
-
* reporter renders `<no response>` in that case.
|
|
146
|
-
*/
|
|
147
|
-
responseText?: string;
|
|
148
|
-
}
|
|
149
|
-
/** Full report returned by {@link runSuite}. */
|
|
150
|
-
export interface SuiteReport {
|
|
151
|
-
suite: string;
|
|
152
|
-
cases: CaseResult[];
|
|
153
|
-
passed: number;
|
|
154
|
-
failed: number;
|
|
155
|
-
skipped: number;
|
|
156
|
-
duration: number;
|
|
157
|
-
cost: number;
|
|
158
|
-
tokens: number;
|
|
159
|
-
/** Suite-level metadata (#A5 Phase 5), copied through from the spec. */
|
|
160
|
-
metadata?: EvalMetadata;
|
|
161
|
-
}
|
|
162
|
-
/**
|
|
163
|
-
* Define an eval suite. Returns a frozen `EvalSuite` ready to pass
|
|
164
|
-
* into {@link runSuite} or to default-export from an `evals/*.eval.ts`
|
|
165
|
-
* file (Phase 2's CLI auto-discovers those).
|
|
166
|
-
*
|
|
167
|
-
* The shape is deliberately a function rather than a class — keeps the
|
|
168
|
-
* file's default export trivially serializable (Phase 2 needs to load
|
|
169
|
-
* suites via dynamic import) and avoids the "did you forget `new`?"
|
|
170
|
-
* footgun.
|
|
171
|
-
*/
|
|
172
|
-
export declare function evalSuite(name: string, spec: EvalSuiteSpec): EvalSuite;
|
|
173
|
-
/** Exact string equality against `response.text`. */
|
|
174
|
-
export declare function exactMatch(expected: string): Metric;
|
|
175
|
-
/** Pattern match against `response.text`. */
|
|
176
|
-
export declare function regex(pattern: RegExp): Metric;
|
|
177
|
-
/**
|
|
178
|
-
* LLM-as-judge: ask a small model whether the response satisfies a
|
|
179
|
-
* natural-language criterion. Returns the judge's reasoning in
|
|
180
|
-
* `reason` so failures are debuggable.
|
|
181
|
-
*
|
|
182
|
-
* Design: the judge runs as a one-shot anonymous agent (no recursion
|
|
183
|
-
* concern — default `remembers()` is `false`). Output is shaped via
|
|
184
|
-
* `Output.object({ schema })` for deterministic parsing. Failures
|
|
185
|
-
* (network, parse, unhandled judge error) bubble as `pass: false`
|
|
186
|
-
* with the error in `reason` — a broken judge is not a passing case.
|
|
187
|
-
*
|
|
188
|
-
* Pitfall: the judge model has the same biases as any LLM. Use it
|
|
189
|
-
* for fuzzy "did the answer mention X?" assertions; for exact
|
|
190
|
-
* structural checks prefer `jsonShape` (Phase 3) or `regex`.
|
|
191
|
-
*/
|
|
192
|
-
export declare function llmJudge(criterion: string, opts?: {
|
|
193
|
-
model?: string;
|
|
194
|
-
}): Metric;
|
|
195
|
-
/**
|
|
196
|
-
* Strict structural assertion: parse `response.text` as JSON
|
|
197
|
-
* (stripping ```json fences) and run it through a zod schema.
|
|
198
|
-
*
|
|
199
|
-
* Pairs naturally with `Output.object({ schema })` on the agent —
|
|
200
|
-
* if the agent declares the same schema, this metric verifies the
|
|
201
|
-
* output actually conforms. Failures surface the zod issue path
|
|
202
|
-
* (e.g. `customer.email`) so debugging doesn't require a separate
|
|
203
|
-
* console log.
|
|
204
|
-
*/
|
|
205
|
-
export declare function jsonShape<T>(schema: z.ZodType<T>): Metric;
|
|
206
|
-
/**
|
|
207
|
-
* Embedding-based fuzzy match. Embeds both `reference` and
|
|
208
|
-
* `response.text` via `AI.embed()`, computes cosine similarity,
|
|
209
|
-
* passes when >= `threshold` (default `0.85` — tighter than
|
|
210
|
-
* `EmbeddingUserMemory`'s 0.5 retrieval-rank floor since this is
|
|
211
|
-
* an assertion, not a ranking).
|
|
212
|
-
*
|
|
213
|
-
* Uses ≤ 2 embedding calls per case; embed tokens roll into the
|
|
214
|
-
* case's cost rollup via the same side-channel `llmJudge` uses.
|
|
215
|
-
*
|
|
216
|
-
* Pitfall: requires a provider that implements `createEmbedding()`
|
|
217
|
-
* (openai / google / mistral / cohere / jina). Failures (no
|
|
218
|
-
* provider, network, etc.) surface as `pass: false` with the
|
|
219
|
-
* error in `reason` — a broken embed is not a passing case.
|
|
220
|
-
*/
|
|
221
|
-
export declare function semanticMatch(reference: string, opts?: {
|
|
222
|
-
threshold?: number;
|
|
223
|
-
model?: string;
|
|
224
|
-
}): Metric;
|
|
225
|
-
/**
|
|
226
|
-
* Token budget guard. Passes when `response.usage.totalTokens
|
|
227
|
-
* <= threshold`. Pair with cost-conscious agents to detect prompt-
|
|
228
|
-
* size regressions before they show up as a billing surprise.
|
|
229
|
-
*
|
|
230
|
-
* `response.usage` is the multi-step rollup, so it's meaningful
|
|
231
|
-
* even when the agent runs tools across several provider calls.
|
|
232
|
-
*/
|
|
233
|
-
export declare function tokenCost(threshold: number): Metric;
|
|
234
|
-
/**
|
|
235
|
-
* Compose multiple metrics into one assertion. Runs them in order
|
|
236
|
-
* and short-circuits on the first failure — failure `reason` is
|
|
237
|
-
* surfaced; success returns `{ pass: true, score: 1 }`.
|
|
238
|
-
*
|
|
239
|
-
* @example
|
|
240
|
-
* { input: '…',
|
|
241
|
-
* assert: compose(
|
|
242
|
-
* jsonShape(SummarySchema),
|
|
243
|
-
* tokenCost(800),
|
|
244
|
-
* ),
|
|
245
|
-
* }
|
|
246
|
-
*/
|
|
247
|
-
export declare function compose(...metrics: Metric[]): Metric;
|
|
248
|
-
/**
|
|
249
|
-
* Run every case in the suite, in declaration order. Returns the
|
|
250
|
-
* full report; never throws (assertion errors become `failed` cases,
|
|
251
|
-
* not exceptions).
|
|
252
|
-
*
|
|
253
|
-
* Phase 1 runs serially. Parallel execution lands in a follow-up
|
|
254
|
-
* once we understand the rate-limit shape of real-world judge
|
|
255
|
-
* models — sequential is correct under any rate limit.
|
|
256
|
-
*/
|
|
257
|
-
export declare function runSuite(suite: EvalSuite): Promise<SuiteReport>;
|
|
258
|
-
export { estimateCost, ModelPricing } from '../budget/pricing.js';
|
|
259
|
-
export type { ModelPriceEntry } from '../budget/pricing.js';
|
|
260
|
-
/**
|
|
261
|
-
* Default reporter — prints a colorless ANSI-aware table to a
|
|
262
|
-
* caller-supplied `console`-like sink. Uses Unicode pass/fail glyphs
|
|
263
|
-
* for visual scanning. JSON / HTML reporters land in Phase 2 / 5.
|
|
264
|
-
*
|
|
265
|
-
* Returns the report unchanged so chains compose: `await
|
|
266
|
-
* reportConsole(await runSuite(suite))`.
|
|
267
|
-
*/
|
|
268
|
-
export declare function reportConsole(report: SuiteReport, sink?: {
|
|
269
|
-
log: (s: string) => void;
|
|
270
|
-
}): SuiteReport;
|
|
1
|
+
export * from '@gemstack/ai-sdk/eval';
|
|
271
2
|
//# sourceMappingURL=index.d.ts.map
|
package/dist/eval/index.d.ts.map
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../../src/eval/index.ts"],"names":[],"mappings":"
|
|
1
|
+
{"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../../src/eval/index.ts"],"names":[],"mappings":"AAGA,cAAc,uBAAuB,CAAA"}
|