elasticdash-sdk 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +775 -0
- package/dist/browser-ui.d.ts +43 -0
- package/dist/browser-ui.d.ts.map +1 -0
- package/dist/browser-ui.js +246 -0
- package/dist/browser-ui.js.map +1 -0
- package/dist/capture/event.d.ts +33 -0
- package/dist/capture/event.d.ts.map +1 -0
- package/dist/capture/event.js +2 -0
- package/dist/capture/event.js.map +1 -0
- package/dist/capture/index.d.ts +4 -0
- package/dist/capture/index.d.ts.map +1 -0
- package/dist/capture/index.js +4 -0
- package/dist/capture/index.js.map +1 -0
- package/dist/capture/recorder.d.ts +24 -0
- package/dist/capture/recorder.d.ts.map +1 -0
- package/dist/capture/recorder.js +46 -0
- package/dist/capture/recorder.js.map +1 -0
- package/dist/capture/replay.d.ts +20 -0
- package/dist/capture/replay.d.ts.map +1 -0
- package/dist/capture/replay.js +47 -0
- package/dist/capture/replay.js.map +1 -0
- package/dist/ci/api-client.d.ts +38 -0
- package/dist/ci/api-client.d.ts.map +1 -0
- package/dist/ci/api-client.js +96 -0
- package/dist/ci/api-client.js.map +1 -0
- package/dist/ci/benchmark.d.ts +33 -0
- package/dist/ci/benchmark.d.ts.map +1 -0
- package/dist/ci/benchmark.js +213 -0
- package/dist/ci/benchmark.js.map +1 -0
- package/dist/ci/ed-runner.d.ts +48 -0
- package/dist/ci/ed-runner.d.ts.map +1 -0
- package/dist/ci/ed-runner.js +260 -0
- package/dist/ci/ed-runner.js.map +1 -0
- package/dist/ci/executor.d.ts +13 -0
- package/dist/ci/executor.d.ts.map +1 -0
- package/dist/ci/executor.js +542 -0
- package/dist/ci/executor.js.map +1 -0
- package/dist/ci/git-info.d.ts +17 -0
- package/dist/ci/git-info.d.ts.map +1 -0
- package/dist/ci/git-info.js +102 -0
- package/dist/ci/git-info.js.map +1 -0
- package/dist/ci/index.d.ts +6 -0
- package/dist/ci/index.d.ts.map +1 -0
- package/dist/ci/index.js +4 -0
- package/dist/ci/index.js.map +1 -0
- package/dist/ci/measurement.d.ts +9 -0
- package/dist/ci/measurement.d.ts.map +1 -0
- package/dist/ci/measurement.js +15 -0
- package/dist/ci/measurement.js.map +1 -0
- package/dist/ci/replay.d.ts +31 -0
- package/dist/ci/replay.d.ts.map +1 -0
- package/dist/ci/replay.js +96 -0
- package/dist/ci/replay.js.map +1 -0
- package/dist/ci/reporters/default.d.ts +8 -0
- package/dist/ci/reporters/default.d.ts.map +1 -0
- package/dist/ci/reporters/default.js +46 -0
- package/dist/ci/reporters/default.js.map +1 -0
- package/dist/ci/reporters/index.d.ts +8 -0
- package/dist/ci/reporters/index.d.ts.map +1 -0
- package/dist/ci/reporters/index.js +14 -0
- package/dist/ci/reporters/index.js.map +1 -0
- package/dist/ci/reporters/json.d.ts +8 -0
- package/dist/ci/reporters/json.d.ts.map +1 -0
- package/dist/ci/reporters/json.js +14 -0
- package/dist/ci/reporters/json.js.map +1 -0
- package/dist/ci/reporters/junit.d.ts +8 -0
- package/dist/ci/reporters/junit.d.ts.map +1 -0
- package/dist/ci/reporters/junit.js +48 -0
- package/dist/ci/reporters/junit.js.map +1 -0
- package/dist/ci/runner.d.ts +3 -0
- package/dist/ci/runner.d.ts.map +1 -0
- package/dist/ci/runner.js +187 -0
- package/dist/ci/runner.js.map +1 -0
- package/dist/ci/test-discovery.d.ts +5 -0
- package/dist/ci/test-discovery.d.ts.map +1 -0
- package/dist/ci/test-discovery.js +11 -0
- package/dist/ci/test-discovery.js.map +1 -0
- package/dist/ci/test-loader.d.ts +19 -0
- package/dist/ci/test-loader.d.ts.map +1 -0
- package/dist/ci/test-loader.js +149 -0
- package/dist/ci/test-loader.js.map +1 -0
- package/dist/ci/test-registry.d.ts +42 -0
- package/dist/ci/test-registry.d.ts.map +1 -0
- package/dist/ci/test-registry.js +18 -0
- package/dist/ci/test-registry.js.map +1 -0
- package/dist/ci/trace-schema.d.ts +30 -0
- package/dist/ci/trace-schema.d.ts.map +1 -0
- package/dist/ci/trace-schema.js +66 -0
- package/dist/ci/trace-schema.js.map +1 -0
- package/dist/ci/trace-writer.d.ts +16 -0
- package/dist/ci/trace-writer.d.ts.map +1 -0
- package/dist/ci/trace-writer.js +108 -0
- package/dist/ci/trace-writer.js.map +1 -0
- package/dist/ci/types.d.ts +108 -0
- package/dist/ci/types.d.ts.map +1 -0
- package/dist/ci/types.js +3 -0
- package/dist/ci/types.js.map +1 -0
- package/dist/ci/upload-client.d.ts +74 -0
- package/dist/ci/upload-client.d.ts.map +1 -0
- package/dist/ci/upload-client.js +195 -0
- package/dist/ci/upload-client.js.map +1 -0
- package/dist/cli.d.ts +3 -0
- package/dist/cli.d.ts.map +1 -0
- package/dist/cli.js +716 -0
- package/dist/cli.js.map +1 -0
- package/dist/core/agent-state.d.ts +47 -0
- package/dist/core/agent-state.d.ts.map +1 -0
- package/dist/core/agent-state.js +137 -0
- package/dist/core/agent-state.js.map +1 -0
- package/dist/core/judge-utils.d.ts +22 -0
- package/dist/core/judge-utils.d.ts.map +1 -0
- package/dist/core/judge-utils.js +211 -0
- package/dist/core/judge-utils.js.map +1 -0
- package/dist/core/registry.d.ts +28 -0
- package/dist/core/registry.d.ts.map +1 -0
- package/dist/core/registry.js +52 -0
- package/dist/core/registry.js.map +1 -0
- package/dist/dashboard-server.d.ts +65 -0
- package/dist/dashboard-server.d.ts.map +1 -0
- package/dist/dashboard-server.js +3940 -0
- package/dist/dashboard-server.js.map +1 -0
- package/dist/execution/tool-runner.d.ts +26 -0
- package/dist/execution/tool-runner.d.ts.map +1 -0
- package/dist/execution/tool-runner.js +316 -0
- package/dist/execution/tool-runner.js.map +1 -0
- package/dist/html/dashboard.html +2218 -0
- package/dist/http.d.ts +14 -0
- package/dist/http.d.ts.map +1 -0
- package/dist/http.js +13 -0
- package/dist/http.js.map +1 -0
- package/dist/index.cjs +8102 -0
- package/dist/index.d.ts +61 -0
- package/dist/index.d.ts.map +1 -0
- package/dist/index.js +67 -0
- package/dist/index.js.map +1 -0
- package/dist/interceptors/ai-interceptor.d.ts +26 -0
- package/dist/interceptors/ai-interceptor.d.ts.map +1 -0
- package/dist/interceptors/ai-interceptor.js +756 -0
- package/dist/interceptors/ai-interceptor.js.map +1 -0
- package/dist/interceptors/db-auto.d.ts +8 -0
- package/dist/interceptors/db-auto.d.ts.map +1 -0
- package/dist/interceptors/db-auto.js +217 -0
- package/dist/interceptors/db-auto.js.map +1 -0
- package/dist/interceptors/db.d.ts +23 -0
- package/dist/interceptors/db.d.ts.map +1 -0
- package/dist/interceptors/db.js +137 -0
- package/dist/interceptors/db.js.map +1 -0
- package/dist/interceptors/http.d.ts +28 -0
- package/dist/interceptors/http.d.ts.map +1 -0
- package/dist/interceptors/http.js +356 -0
- package/dist/interceptors/http.js.map +1 -0
- package/dist/interceptors/side-effects.d.ts +7 -0
- package/dist/interceptors/side-effects.d.ts.map +1 -0
- package/dist/interceptors/side-effects.js +72 -0
- package/dist/interceptors/side-effects.js.map +1 -0
- package/dist/interceptors/telemetry-push.d.ts +142 -0
- package/dist/interceptors/telemetry-push.d.ts.map +1 -0
- package/dist/interceptors/telemetry-push.js +463 -0
- package/dist/interceptors/telemetry-push.js.map +1 -0
- package/dist/interceptors/tool.d.ts +2 -0
- package/dist/interceptors/tool.d.ts.map +1 -0
- package/dist/interceptors/tool.js +274 -0
- package/dist/interceptors/tool.js.map +1 -0
- package/dist/interceptors/workflow-ai.d.ts +5 -0
- package/dist/interceptors/workflow-ai.d.ts.map +1 -0
- package/dist/interceptors/workflow-ai.js +382 -0
- package/dist/interceptors/workflow-ai.js.map +1 -0
- package/dist/internals/conditional-recorder.d.ts +21 -0
- package/dist/internals/conditional-recorder.d.ts.map +1 -0
- package/dist/internals/conditional-recorder.js +54 -0
- package/dist/internals/conditional-recorder.js.map +1 -0
- package/dist/internals/mock-resolver.d.ts +146 -0
- package/dist/internals/mock-resolver.d.ts.map +1 -0
- package/dist/internals/mock-resolver.js +427 -0
- package/dist/internals/mock-resolver.js.map +1 -0
- package/dist/matchers/index.d.ts +96 -0
- package/dist/matchers/index.d.ts.map +1 -0
- package/dist/matchers/index.js +668 -0
- package/dist/matchers/index.js.map +1 -0
- package/dist/observability.d.ts +82 -0
- package/dist/observability.d.ts.map +1 -0
- package/dist/observability.js +471 -0
- package/dist/observability.js.map +1 -0
- package/dist/portal-executor.d.ts +30 -0
- package/dist/portal-executor.d.ts.map +1 -0
- package/dist/portal-executor.js +324 -0
- package/dist/portal-executor.js.map +1 -0
- package/dist/portal-server.d.ts +3 -0
- package/dist/portal-server.d.ts.map +1 -0
- package/dist/portal-server.js +279 -0
- package/dist/portal-server.js.map +1 -0
- package/dist/proxy/llm-capture.d.ts +14 -0
- package/dist/proxy/llm-capture.d.ts.map +1 -0
- package/dist/proxy/llm-capture.js +264 -0
- package/dist/proxy/llm-capture.js.map +1 -0
- package/dist/reporter.d.ts +3 -0
- package/dist/reporter.d.ts.map +1 -0
- package/dist/reporter.js +72 -0
- package/dist/reporter.js.map +1 -0
- package/dist/runWorkflowSubprocess.d.ts +14 -0
- package/dist/runWorkflowSubprocess.d.ts.map +1 -0
- package/dist/runWorkflowSubprocess.js +66 -0
- package/dist/runWorkflowSubprocess.js.map +1 -0
- package/dist/runner.d.ts +16 -0
- package/dist/runner.d.ts.map +1 -0
- package/dist/runner.js +138 -0
- package/dist/runner.js.map +1 -0
- package/dist/socket-connector.d.ts +22 -0
- package/dist/socket-connector.d.ts.map +1 -0
- package/dist/socket-connector.js +104 -0
- package/dist/socket-connector.js.map +1 -0
- package/dist/telemetry-batcher.d.ts +56 -0
- package/dist/telemetry-batcher.d.ts.map +1 -0
- package/dist/telemetry-batcher.js +143 -0
- package/dist/telemetry-batcher.js.map +1 -0
- package/dist/test-setup.d.ts +12 -0
- package/dist/test-setup.d.ts.map +1 -0
- package/dist/test-setup.js +13 -0
- package/dist/test-setup.js.map +1 -0
- package/dist/tool-registry.d.ts +31 -0
- package/dist/tool-registry.d.ts.map +1 -0
- package/dist/tool-registry.js +73 -0
- package/dist/tool-registry.js.map +1 -0
- package/dist/tool-runner-worker.d.ts +2 -0
- package/dist/tool-runner-worker.d.ts.map +1 -0
- package/dist/tool-runner-worker.js +215 -0
- package/dist/tool-runner-worker.js.map +1 -0
- package/dist/trace-adapter/context.d.ts +72 -0
- package/dist/trace-adapter/context.d.ts.map +1 -0
- package/dist/trace-adapter/context.js +80 -0
- package/dist/trace-adapter/context.js.map +1 -0
- package/dist/tracing.d.ts +2 -0
- package/dist/tracing.d.ts.map +1 -0
- package/dist/tracing.js +59 -0
- package/dist/tracing.js.map +1 -0
- package/dist/trigger-executor.d.ts +12 -0
- package/dist/trigger-executor.d.ts.map +1 -0
- package/dist/trigger-executor.js +130 -0
- package/dist/trigger-executor.js.map +1 -0
- package/dist/types/portal.d.ts +76 -0
- package/dist/types/portal.d.ts.map +1 -0
- package/dist/types/portal.js +2 -0
- package/dist/types/portal.js.map +1 -0
- package/dist/utils/debug.d.ts +3 -0
- package/dist/utils/debug.d.ts.map +1 -0
- package/dist/utils/debug.js +8 -0
- package/dist/utils/debug.js.map +1 -0
- package/dist/utils/license-error.d.ts +23 -0
- package/dist/utils/license-error.d.ts.map +1 -0
- package/dist/utils/license-error.js +42 -0
- package/dist/utils/license-error.js.map +1 -0
- package/dist/utils/redact.d.ts +7 -0
- package/dist/utils/redact.d.ts.map +1 -0
- package/dist/utils/redact.js +26 -0
- package/dist/utils/redact.js.map +1 -0
- package/dist/workflow-runner-worker.d.ts +2 -0
- package/dist/workflow-runner-worker.d.ts.map +1 -0
- package/dist/workflow-runner-worker.js +329 -0
- package/dist/workflow-runner-worker.js.map +1 -0
- package/dist/workflow-runner.d.ts +14 -0
- package/dist/workflow-runner.d.ts.map +1 -0
- package/dist/workflow-runner.js +34 -0
- package/dist/workflow-runner.js.map +1 -0
- package/docs/agent-coding-instructions.md +138 -0
- package/docs/agent-integration-guide.md +564 -0
- package/docs/agents.md +140 -0
- package/docs/dashboard.md +394 -0
- package/docs/deno.md +69 -0
- package/docs/instrumentation.md +424 -0
- package/docs/langfuse-trace-structure.md +145 -0
- package/docs/matchers.md +173 -0
- package/docs/observability_contract.md +192 -0
- package/docs/observability_mode.md +195 -0
- package/docs/quickstart.md +621 -0
- package/docs/security-compliance.md +566 -0
- package/docs/test-writing-guidelines.md +444 -0
- package/docs/tools.md +165 -0
- package/docs/workflow-modes.md +253 -0
- package/package.json +76 -0
- package/src/browser-ui.ts +281 -0
- package/src/capture/event.ts +30 -0
- package/src/capture/index.ts +3 -0
- package/src/capture/recorder.ts +62 -0
- package/src/capture/replay.ts +55 -0
- package/src/ci/api-client.ts +136 -0
- package/src/ci/benchmark.ts +257 -0
- package/src/ci/ed-runner.ts +351 -0
- package/src/ci/executor.ts +671 -0
- package/src/ci/git-info.ts +127 -0
- package/src/ci/index.ts +5 -0
- package/src/ci/measurement.ts +25 -0
- package/src/ci/replay.ts +127 -0
- package/src/ci/reporters/default.ts +50 -0
- package/src/ci/reporters/index.ts +21 -0
- package/src/ci/reporters/json.ts +18 -0
- package/src/ci/reporters/junit.ts +61 -0
- package/src/ci/runner.ts +208 -0
- package/src/ci/test-discovery.ts +16 -0
- package/src/ci/test-loader.ts +187 -0
- package/src/ci/test-registry.ts +62 -0
- package/src/ci/trace-schema.ts +96 -0
- package/src/ci/trace-writer.ts +107 -0
- package/src/ci/types.ts +115 -0
- package/src/ci/upload-client.ts +300 -0
- package/src/cli.ts +811 -0
- package/src/core/agent-state.ts +162 -0
- package/src/core/judge-utils.ts +232 -0
- package/src/core/registry.ts +92 -0
- package/src/dashboard-server.ts +2047 -0
- package/src/execution/tool-runner.ts +352 -0
- package/src/html/dashboard.html +2218 -0
- package/src/http.ts +13 -0
- package/src/index.ts +138 -0
- package/src/interceptors/ai-interceptor.ts +798 -0
- package/src/interceptors/db-auto.ts +243 -0
- package/src/interceptors/db.ts +156 -0
- package/src/interceptors/http.ts +393 -0
- package/src/interceptors/side-effects.ts +83 -0
- package/src/interceptors/telemetry-push.ts +537 -0
- package/src/interceptors/tool.ts +287 -0
- package/src/interceptors/workflow-ai.ts +419 -0
- package/src/internals/conditional-recorder.ts +63 -0
- package/src/internals/mock-resolver.ts +492 -0
- package/src/matchers/index.ts +824 -0
- package/src/observability.ts +501 -0
- package/src/portal-executor.ts +355 -0
- package/src/portal-server.ts +304 -0
- package/src/proxy/llm-capture.ts +301 -0
- package/src/reporter.ts +81 -0
- package/src/runWorkflowSubprocess.ts +74 -0
- package/src/runner.ts +178 -0
- package/src/socket-connector.ts +117 -0
- package/src/telemetry-batcher.ts +191 -0
- package/src/test-setup.ts +16 -0
- package/src/tool-registry.ts +94 -0
- package/src/tool-runner-worker.ts +244 -0
- package/src/trace-adapter/context.ts +156 -0
- package/src/tracing.ts +62 -0
- package/src/trigger-executor.ts +171 -0
- package/src/types/agent.d.ts +63 -0
- package/src/types/expect.d.ts +81 -0
- package/src/types/modules.d.ts +2 -0
- package/src/types/portal.ts +69 -0
- package/src/utils/debug.ts +8 -0
- package/src/utils/license-error.ts +43 -0
- package/src/utils/redact.ts +25 -0
- package/src/workflow-runner-worker.ts +386 -0
- package/src/workflow-runner.ts +58 -0
|
@@ -0,0 +1,824 @@
|
|
|
1
|
+
import { expect } from 'expect'
|
|
2
|
+
import type { TraceHandle, LLMStep, CustomStep, CustomStepKind } from '../trace-adapter/context.js'
|
|
3
|
+
import { prepareOutputForJudge } from '../core/judge-utils.js'
|
|
4
|
+
|
|
5
|
+
/** Options accepted by the `toHaveLLMStep` matcher. */
interface LLMStepConfig {
  model?: string
  /** Substring searched in prompt + completion. */
  contains?: string
  /** Substring searched only in `step.prompt`. */
  promptContains?: string
  /** Substring searched only in `step.completion`. */
  outputContains?: string
  /** Provider name, e.g. 'openai' | 'claude' | 'gemini' | 'grok'. */
  provider?: string
  /** Match count must equal exactly this value. */
  times?: number
  /** Match count must be >= this value. */
  minTimes?: number
  /** Match count must be <= this value. */
  maxTimes?: number
}
|
|
15
|
+
|
|
16
|
+
/** Options accepted by the `toHaveCustomStep` matcher. */
interface CustomStepConfig {
  kind?: CustomStepKind
  name?: string
  tag?: string
  /** Substring searched in the stringified payload/result/metadata. */
  contains?: string
  /** Substring searched in the result only. */
  resultContains?: string
  /** Substring searched in the payload only. */
  payloadContains?: string
  /** Substring searched in the metadata only. */
  metadataContains?: string
  times?: number
  minTimes?: number
  maxTimes?: number
}
|
|
28
|
+
|
|
29
|
+
/** Options accepted by the `toHavePromptWhere` matcher: filter prompts first, then assert on the filtered set. */
interface PromptWhereConfig {
  /** First filter: keep prompts that contain this substring. */
  filterContains: string
  /** Then assert: filtered prompts must also contain this. */
  requireContains?: string
  /** ...and must NOT contain this. */
  requireNotContains?: string
  /** Exact count of filtered prompts. */
  times?: number
  /** Minimum count of filtered prompts. */
  minTimes?: number
  /** Maximum count of filtered prompts. */
  maxTimes?: number
  /** Optional 0-based index into the filtered prompts to check specifically. */
  index?: number
  /** Optional 1-based alias for `index`. */
  nth?: number
}
|
|
39
|
+
|
|
40
|
+
/** Provider identifiers that have a dedicated branch in `callProviderLLM`. */
type SupportedProvider =
  | 'openai'
  | 'claude'
  | 'gemini'
  | 'grok'
  | 'kimi'
|
|
41
|
+
|
|
42
|
+
/** Controls which judge LLM is called and how it is reached. */
interface SemanticMatchOptions {
  /** Judge provider; `callProviderLLM` falls back to 'openai' when omitted. */
  provider?: SupportedProvider
  /** Model id; when omitted the provider's entry in `defaultModels` is used. */
  model?: string
  /** Optional user-supplied SDK instance. */
  sdk?: unknown
  /** Optional API key override (useful for OpenAI-compatible endpoints). */
  apiKey?: string
  /** Optional base URL override for OpenAI-compatible APIs. */
  baseURL?: string
}
|
|
49
|
+
|
|
50
|
+
/** Which side of an LLM step is evaluated: its prompt or its result. */
type EvaluationTarget =
  | 'prompt'
  | 'result'
|
|
51
|
+
|
|
52
|
+
/**
 * Numeric comparison applied to a judge score.
 * `resolveCondition` rejects configs that set more than one of these fields.
 */
interface EvaluationCondition {
  /** Passes when score > value. */
  greaterThan?: number
  /** Passes when score < value. */
  lessThan?: number
  /** Passes when score >= value. */
  atLeast?: number
  /** Passes when score <= value. */
  atMost?: number
  /** Passes when score === value. */
  equals?: number
}
|
|
59
|
+
|
|
60
|
+
/** Options accepted by the `toEvaluateOutputMetric` matcher. */
interface EvaluateOutputMetricConfig {
  evaluationPrompt: string
  /** 'prompt' or 'result'; default 'result'. */
  target?: EvaluationTarget
  /** 0-based index into LLM steps. */
  index?: number
  /** 1-based alias for `index`. */
  nth?: number
  /** Optional; default is `atLeast: 0.7`. */
  condition?: EvaluationCondition
  provider?: SupportedProvider
  model?: string
  /** Optional SDK instance. */
  sdk?: unknown
  /** Optional API key override (useful for OpenAI-compatible endpoints). */
  apiKey?: string
  /** Optional base URL override for OpenAI-compatible APIs. */
  baseURL?: string
}
|
|
72
|
+
|
|
73
|
+
/**
 * Type guard for TraceHandle.
 *
 * A value qualifies when it is a non-null object exposing both `getLLMSteps`
 * and `getToolCalls` as functions. Trace-aware matchers call this first so
 * that a non-trace argument (e.g. a plain string) gets a clear error message.
 *
 * @param value - anything handed to a matcher as the received value
 * @returns true when `value` has the TraceHandle shape described above
 */
function isTraceHandle(value: unknown): value is TraceHandle {
  if (value === null || typeof value !== 'object') {
    return false
  }
  const candidate = value as TraceHandle
  const hasLLMSteps = typeof candidate.getLLMSteps === 'function'
  const hasToolCalls = typeof candidate.getToolCalls === 'function'
  return hasLLMSteps && hasToolCalls
}
|
|
86
|
+
|
|
87
|
+
/**
 * Per-provider model id that `callProviderLLM` falls back to when the caller
 * does not pass `options.model`.
 */
const defaultModels: Record<SupportedProvider, string> = {
  'openai': 'gpt-4.1',
  'claude': 'claude-3-opus-20240229',
  'gemini': 'gemini-1.5-pro',
  'grok': 'grok-beta',
  'kimi': 'kimi-k2-turbo-preview',
}
|
|
94
|
+
|
|
95
|
+
/** Normalized outcome of one provider call made by `callProviderLLM`. */
export interface LLMCallResult {
  /** Trimmed text of the reply; '' when the response carries no text. */
  content: string
  /** Elapsed wall-clock time of the call in milliseconds (Date.now() delta). */
  durationMs: number
  /** Token counts; undefined when the provider response has no usage section. */
  usage?: {
    inputTokens: number
    outputTokens: number
    totalTokens: number
  }
}
|
|
101
|
+
|
|
102
|
+
/** Maps an OpenAI-style `usage` object (prompt_tokens / completion_tokens / total_tokens) to LLMCallResult.usage. */
function chatCompletionUsage(u: any): LLMCallResult['usage'] {
  if (!u) return undefined
  return {
    inputTokens: u.prompt_tokens ?? 0,
    outputTokens: u.completion_tokens ?? 0,
    totalTokens: u.total_tokens ?? 0,
  }
}

/** Builds an LLMCallResult from a chat-completions style body (`choices[0].message.content`). */
function chatCompletionResult(data: any, startedAt: number): LLMCallResult {
  return {
    content: data?.choices?.[0]?.message?.content?.trim() ?? '',
    durationMs: Date.now() - startedAt,
    usage: chatCompletionUsage(data?.usage),
  }
}

/** Builds an LLMCallResult from an Anthropic Messages body (`content[0].text`); total = input + output tokens. */
function anthropicResult(data: any, startedAt: number): LLMCallResult {
  const u = data?.usage
  const inputTokens = u?.input_tokens ?? 0
  const outputTokens = u?.output_tokens ?? 0
  return {
    content: data?.content?.[0]?.text?.trim() ?? '',
    durationMs: Date.now() - startedAt,
    usage: u ? { inputTokens, outputTokens, totalTokens: inputTokens + outputTokens } : undefined,
  }
}

/** Builds an LLMCallResult from a Gemini generateContent body (`candidates[0].content.parts[0].text`). */
function geminiResult(data: any, startedAt: number): LLMCallResult {
  const u = data?.usageMetadata
  return {
    content: data?.candidates?.[0]?.content?.parts?.[0]?.text?.trim() ?? '',
    durationMs: Date.now() - startedAt,
    usage: u
      ? { inputTokens: u.promptTokenCount ?? 0, outputTokens: u.candidatesTokenCount ?? 0, totalTokens: u.totalTokenCount ?? 0 }
      : undefined,
  }
}

/** POSTs a JSON payload with bearer authentication and returns the raw Response. */
function postChatCompletion(url: string, apiKey: string, payload: unknown): Promise<Response> {
  return fetch(url, {
    method: 'POST',
    headers: {
      Authorization: `Bearer ${apiKey}`,
      'Content-Type': 'application/json',
    },
    body: JSON.stringify(payload),
  })
}

/**
 * Sends one prompt to the selected provider and returns the trimmed reply text,
 * elapsed time and token usage.
 *
 * When `options.sdk` exposes the method expected for the provider
 * (`chat.completions.create` for openai/grok, `messages.create` for claude,
 * `models.generateContent` for gemini) the SDK is used; otherwise the provider's
 * HTTP endpoint is called with `fetch`. The kimi branch always uses `fetch`.
 *
 * @param prompt - user message to send
 * @param options - provider/model/sdk selection. `apiKey` and `baseURL` are only
 *   consulted by the 'openai' branch; the other providers read their key from
 *   the environment (ANTHROPIC_API_KEY, GEMINI_API_KEY or GOOGLE_API_KEY,
 *   GROK_API_KEY, KIMI_API_KEY).
 * @param systemPrompt - system message; for claude and gemini it is prepended to
 *   the user message instead of being sent as a separate role
 * @param maxTokens - output token cap forwarded to the provider
 * @param temperature - accepted for signature compatibility; currently not
 *   forwarded to any provider
 * @returns the normalized call result
 * @throws Error when the required API key is missing, when the HTTP response is
 *   not ok, or when the provider is not supported
 */
export async function callProviderLLM(
  prompt: string,
  options: SemanticMatchOptions = {},
  systemPrompt = 'You are an expert test judge.',
  maxTokens = 32,
  temperature = 0
): Promise<LLMCallResult> {
  const provider: SupportedProvider = options.provider ?? 'openai'
  const sdk = options.sdk as any | undefined
  const resolvedModel = options.model ?? defaultModels[provider]
  const t0 = Date.now()

  // Request body shared by the chat-completions style providers (openai, grok, kimi).
  const chatPayload = {
    model: resolvedModel,
    messages: [
      { role: 'system', content: systemPrompt },
      { role: 'user', content: prompt },
    ],
    max_tokens: maxTokens,
  }
  // claude and gemini receive the system prompt folded into the single user turn.
  const mergedPrompt = `${systemPrompt}\n\n${prompt}`

  switch (provider) {
    case 'openai': {
      if (sdk && sdk.chat?.completions?.create) {
        return chatCompletionResult(await sdk.chat.completions.create(chatPayload), t0)
      }

      const apiKey = options.apiKey ?? process.env.OPENAI_API_KEY
      if (!apiKey) throw new Error('Provide apiKey or set OPENAI_API_KEY for OpenAI-compatible endpoint.')

      // Drop a single trailing slash so the joined URL has exactly one separator.
      const baseURL = (options.baseURL ?? 'https://api.openai.com/v1').replace(/\/$/, '')
      const response = await postChatCompletion(`${baseURL}/chat/completions`, apiKey, chatPayload)
      if (!response.ok) {
        throw new Error(`OpenAI API error: ${response.status} ${response.statusText}`)
      }
      return chatCompletionResult(await response.json(), t0)
    }

    case 'claude': {
      const claudePayload = {
        model: resolvedModel,
        max_tokens: maxTokens,
        messages: [{ role: 'user', content: mergedPrompt }],
      }
      if (sdk && sdk.messages?.create) {
        return anthropicResult(await sdk.messages.create(claudePayload), t0)
      }

      const apiKey = process.env.ANTHROPIC_API_KEY
      if (!apiKey) throw new Error('ANTHROPIC_API_KEY is not set in environment.')

      const response = await fetch('https://api.anthropic.com/v1/messages', {
        method: 'POST',
        headers: {
          'x-api-key': apiKey,
          'anthropic-version': '2023-06-01',
          'content-type': 'application/json',
        },
        body: JSON.stringify(claudePayload),
      })
      if (!response.ok) {
        // Best effort: the body is only used to enrich the error message.
        const errBody = await response.text().catch(() => '')
        throw new Error(`Claude API error: ${response.status} ${response.statusText} (model=${resolvedModel}): ${errBody.substring(0, 200)}`)
      }
      return anthropicResult(await response.json(), t0)
    }

    case 'gemini': {
      const geminiPayload = {
        contents: [{ role: 'user', parts: [{ text: mergedPrompt }] }],
        generationConfig: {
          maxOutputTokens: maxTokens,
        },
      }
      if (sdk && sdk.models?.generateContent) {
        const resp = await sdk.models.generateContent({ model: resolvedModel, ...geminiPayload })
        // Some SDK versions wrap the payload in `.response`, others return it
        // directly; accept both instead of silently yielding empty content.
        return geminiResult(resp?.response ?? resp, t0)
      }

      const apiKey = process.env.GEMINI_API_KEY || process.env.GOOGLE_API_KEY
      if (!apiKey) throw new Error('GEMINI_API_KEY (or GOOGLE_API_KEY) is not set in environment.')

      // The key travels in a header rather than the query string so it does not
      // end up in URLs captured by logs, proxies or error reports.
      const response = await fetch(
        `https://generativelanguage.googleapis.com/v1beta/models/${resolvedModel}:generateContent`,
        {
          method: 'POST',
          headers: {
            'Content-Type': 'application/json',
            'x-goog-api-key': apiKey,
          },
          body: JSON.stringify(geminiPayload),
        }
      )
      if (!response.ok) {
        throw new Error(`Gemini API error: ${response.status} ${response.statusText}`)
      }
      return geminiResult(await response.json(), t0)
    }

    case 'grok': {
      if (sdk && sdk.chat?.completions?.create) {
        return chatCompletionResult(await sdk.chat.completions.create(chatPayload), t0)
      }

      const apiKey = process.env.GROK_API_KEY
      if (!apiKey) throw new Error('GROK_API_KEY is not set in environment.')

      const response = await postChatCompletion('https://api.x.ai/v1/chat/completions', apiKey, chatPayload)
      if (!response.ok) {
        throw new Error(`Grok API error: ${response.status} ${response.statusText}`)
      }
      return chatCompletionResult(await response.json(), t0)
    }

    case 'kimi': {
      const apiKey = process.env.KIMI_API_KEY
      if (!apiKey) throw new Error('KIMI_API_KEY is not set in environment.')

      const response = await postChatCompletion('https://api.moonshot.ai/v1/chat/completions', apiKey, chatPayload)
      if (!response.ok) {
        // Check status before parsing: error bodies are not guaranteed to be
        // JSON, and parsing first would hide the HTTP status behind a SyntaxError.
        const errBody = await response.text().catch(() => '')
        throw new Error(`Kimi API error: ${response.status} ${response.statusText} — ${errBody.substring(0, 500)}`)
      }
      return chatCompletionResult(await response.json(), t0)
    }

    default:
      throw new Error(`Unsupported provider: ${provider}`)
  }
}
|
|
350
|
+
|
|
351
|
+
/**
 * Asks a judge LLM (provider/model/sdk taken from `options`) whether
 * `traceOutput` semantically matches `expected`.
 *
 * @param traceOutput - text produced by the run under test
 * @param expected - description of the expected meaning
 * @param options - forwarded unchanged to `callProviderLLM`
 * @returns true when the judge's reply begins with "YES" (case-insensitive,
 *   ignoring leading quotes, markdown markers or other punctuation)
 */
async function llmJudgeSemanticMatch(
  traceOutput: string,
  expected: string,
  options: SemanticMatchOptions = {}
): Promise<boolean> {
  const prompt = `
You are an expert test judge. Given the following AI trace output and an expected semantic result, answer "YES" if the trace output semantically matches the expectation, otherwise answer "NO".

Trace Output:
${traceOutput}

Expected:
${expected}

Answer only "YES" or "NO".
`.trim()

  const { content } = await callProviderLLM(prompt, options, 'You are an expert test judge.', 8, 0)
  // The prompt shows the answer wrapped in double quotes, so a judge may echo
  // `"YES"` (or `**YES**`); strip leading non-alphanumerics before comparing so
  // such replies are not misread as a mismatch.
  const verdict = content.trim().toUpperCase().replace(/^[^A-Z0-9]+/, '')
  return verdict.startsWith('YES')
}
|
|
372
|
+
|
|
373
|
+
/**
 * Extracts the first decimal number that appears in `text`.
 *
 * Accepts an optional leading minus sign and either `digits[.digits]` or a
 * bare fraction such as `.85`. Without the bare-fraction form a reply of ".85"
 * would be read as 85 instead of 0.85.
 *
 * @param text - free-form text, e.g. a judge model's reply
 * @returns the parsed number, or null when no finite number is found
 */
function parseFirstNumber(text: string): number | null {
  const match = text.match(/-?(?:\d+(?:\.\d+)?|\.\d+)/)
  if (!match) return null
  const num = Number.parseFloat(match[0])
  return Number.isFinite(num) ? num : null
}
|
|
379
|
+
|
|
380
|
+
/**
 * Reduces an EvaluationCondition to the single comparison it specifies.
 *
 * Only fields whose value is a finite number are considered.
 *
 * @param config - optional condition object
 * @returns the one supplied `{ kind, value }`, or `{ kind: 'atLeast', value: 0.7 }`
 *   when no finite numeric field is present
 * @throws Error when more than one finite numeric field is present
 */
function resolveCondition(config?: EvaluationCondition): { kind: keyof EvaluationCondition; value: number } {
  const supplied: Array<[keyof EvaluationCondition, number]> = []
  for (const [key, raw] of Object.entries(config || {})) {
    if (typeof raw === 'number' && Number.isFinite(raw)) {
      supplied.push([key as keyof EvaluationCondition, raw])
    }
  }

  if (supplied.length === 0) {
    return { kind: 'atLeast', value: 0.7 }
  }
  if (supplied.length > 1) {
    throw new Error('Provide only one metric condition (greaterThan, lessThan, atLeast, atMost, equals).')
  }

  const [[kind, value]] = supplied
  return { kind, value }
}
|
|
390
|
+
|
|
391
|
+
/**
 * Test a score against a resolved condition.
 *
 * @param score - The numeric score to test.
 * @param condition - Comparison kind and the threshold to compare against.
 * @returns The result of the comparison named by `condition.kind`; `false`
 *   when the kind is not one of the five recognized names.
 */
function checkCondition(score: number, condition: { kind: keyof EvaluationCondition; value: number }): boolean {
  const { kind, value: threshold } = condition

  if (kind === 'greaterThan') return score > threshold
  if (kind === 'lessThan') return score < threshold
  if (kind === 'atLeast') return score >= threshold
  if (kind === 'atMost') return score <= threshold
  if (kind === 'equals') return score === threshold

  // Unrecognized comparison kinds never pass.
  return false
}
|
|
407
|
+
|
|
408
|
+
// Augment the `expect` package so TypeScript knows about custom matchers
declare module 'expect' {
  interface Matchers<R> {
    /** Assert that the trace recorded LLM step(s) matching the optional filter/count config. */
    toHaveLLMStep(config?: LLMStepConfig): R
    /** Assert that the trace recorded a tool call with exactly this name. */
    toCallTool(toolName: string): R
    /**
     * Ask an LLM judge whether the trace's output semantically matches `expected`.
     * The matcher is implemented as an async function, so the assertion must be awaited.
     */
    toMatchSemanticOutput(expected: string, options?: SemanticMatchOptions): Promise<R>
    /** Assert that the trace recorded custom step(s) matching the optional filter/count config. */
    toHaveCustomStep(config?: CustomStepConfig): R
    /**
     * Filter prompts that contain `filterContains`, then assert additional requirements.
     * Example: prompts containing "A" must also contain "B".
     */
    toHavePromptWhere(config: PromptWhereConfig): R
    /**
     * Evaluate a specific LLM step's prompt or result via an LLM and assert a numeric metric condition (0.0–1.0).
     * The matcher is implemented as an async function, so the assertion must be awaited.
     */
    toEvaluateOutputMetric(config: EvaluateOutputMetricConfig): Promise<R>
  }
}
|
|
426
|
+
|
|
427
|
+
/**
|
|
428
|
+
* Register all AI-specific custom matchers onto the `expect` instance.
|
|
429
|
+
* Call this once on runner startup.
|
|
430
|
+
*/
|
|
431
|
+
export function registerMatchers(): void {
|
|
432
|
+
expect.extend({
|
|
433
|
+
/**
 * Matcher: the trace recorded LLM step(s) satisfying every filter in `config`.
 *
 * `model` and `provider` are compared for exact equality; `contains`,
 * `promptContains` and `outputContains` are case-insensitive substring checks.
 * Count rule: exactly `times` when given; otherwise within
 * [`minTimes` ?? 0, `maxTimes` ?? Infinity] when either bound is given;
 * otherwise at least one matching step.
 */
toHaveLLMStep(trace: TraceHandle, config: LLMStepConfig = {}) {
  if (!isTraceHandle(trace)) {
    const describeMisuse = () =>
      `Expected a TraceHandle (ctx.trace) but received ${typeof trace}.\nUse: expect(ctx.trace).toHaveLLMStep(...)`
    return { pass: false, message: describeMisuse }
  }

  const recorded = trace.getLLMSteps()

  // Each check returns true when its filter is absent or satisfied. They are
  // evaluated in order and stop at the first failure.
  const filters: Array<(step: LLMStep) => boolean> = [
    (step) => !config.model || step.model === config.model,
    (step) => !config.provider || step.provider === config.provider,
    (step) => {
      if (!config.contains) return true
      const searchable = [step.completion, step.prompt, step.contains]
        .filter(Boolean)
        .join(' ')
        .toLowerCase()
      return searchable.includes(config.contains.toLowerCase())
    },
    (step) =>
      !config.promptContains ||
      (step.prompt ?? '').toLowerCase().includes(config.promptContains.toLowerCase()),
    (step) =>
      !config.outputContains ||
      (step.completion ?? '').toLowerCase().includes(config.outputContains.toLowerCase()),
  ]
  const hits = recorded.filter((step: LLMStep) => filters.every((accepts) => accepts(step)))

  const count = hits.length
  const pass =
    config.times !== undefined
      ? count === config.times
      : config.minTimes !== undefined || config.maxTimes !== undefined
        ? count >= (config.minTimes ?? 0) && count <= (config.maxTimes ?? Infinity)
        : count > 0

  const message = () => {
    if (pass) {
      return `Expected trace NOT to have LLM step matching ${JSON.stringify(config)}`
    }
    let stepSummary: string
    if (recorded.length === 0) {
      stepSummary = 'no LLM steps were recorded'
    } else {
      stepSummary = `${count} matching step(s) found; recorded steps: ${JSON.stringify(recorded)}`
    }
    return `Expected trace to have LLM step matching ${JSON.stringify(config)}, but ${stepSummary}`
  }

  return { pass, message }
},
|
|
490
|
+
|
|
491
|
+
/**
 * Matcher: the trace recorded at least one tool call whose `name` equals
 * `toolName` exactly.
 */
toCallTool(trace: TraceHandle, toolName: string) {
  if (!isTraceHandle(trace)) {
    const describeMisuse = () =>
      `Expected a TraceHandle (ctx.trace) but received ${typeof trace}.\nUse: expect(ctx.trace).toCallTool(...)`
    return { pass: false, message: describeMisuse }
  }

  const invoked = trace.getToolCalls()
  const pass = invoked.some(({ name }) => name === toolName)

  const message = () => {
    if (pass) {
      return `Expected trace NOT to call tool "${toolName}"`
    }
    // List what was actually called so the failure is self-explanatory.
    const seen = invoked.map(({ name }) => name)
    let recorded: string
    if (seen.length === 0) {
      recorded = 'no tool calls were recorded'
    } else {
      recorded = `recorded: [${seen.join(', ')}]`
    }
    return `Expected tool "${toolName}" to be called, but ${recorded}`
  }

  return { pass, message }
},
|
|
514
|
+
|
|
515
|
+
/**
 * Async matcher: an LLM judge decides whether the trace's combined output
 * semantically matches `expected`.
 *
 * The combined output is built from every recorded LLM step's `completion`
 * and `contains` values joined with spaces. If the judge call throws, the
 * matcher fails and reports the error message.
 */
async toMatchSemanticOutput(trace: TraceHandle, expected: string, options?: SemanticMatchOptions) {
  if (!isTraceHandle(trace)) {
    const describeMisuse = () =>
      `Expected a TraceHandle (ctx.trace) but received ${typeof trace}.\nUse: expect(ctx.trace).toMatchSemanticOutput(...)`
    return { pass: false, message: describeMisuse }
  }

  // One text fragment per step (possibly empty), then all fragments joined.
  const perStep = trace.getLLMSteps().map((step: LLMStep) => {
    const pieces = [step.completion, step.contains].filter(Boolean)
    return pieces.join(' ')
  })
  const fullOutput = perStep.join(' ').trim()

  let pass: boolean
  try {
    pass = await llmJudgeSemanticMatch(fullOutput, expected, options)
  } catch (err) {
    return {
      pass: false,
      message: () => `LLM semantic match failed: ${(err as Error).message}`,
    }
  }

  const message = () =>
    pass
      ? `Expected trace output NOT to semantically match "${expected}" (LLM judged YES)`
      : `Expected trace output to semantically match "${expected}", but LLM judged NO. Trace output: "${fullOutput || '(empty)'}"`

  return { pass, message }
},
|
|
548
|
+
|
|
549
|
+
/**
 * Async matcher: score one LLM step's prompt or completion with an LLM and
 * compare the score against a numeric condition.
 *
 * Step selection: `config.index` (0-based) if set, else `config.nth` (1-based),
 * else the last recorded step. The text scored is the step's `prompt` when
 * `config.target === 'prompt'`, otherwise its `completion`.
 *
 * The matcher fails (rather than throwing) when: the receiver is not a
 * TraceHandle, `evaluationPrompt` is missing, no LLM steps exist, the selected
 * position is not a valid integer index, the selected text is empty, more than
 * one condition is supplied, the model reply contains no number, the number is
 * outside 0–1, or the LLM call throws.
 */
async toEvaluateOutputMetric(trace: TraceHandle, config: EvaluateOutputMetricConfig) {
  if (!isTraceHandle(trace)) {
    return {
      pass: false,
      message: () =>
        `Expected a TraceHandle (ctx.trace) but received ${typeof trace}.\nUse: expect(ctx.trace).toEvaluateOutputMetric(...)`,
    }
  }
  if (!config || !config.evaluationPrompt) {
    return {
      pass: false,
      message: () => 'toEvaluateOutputMetric requires evaluationPrompt',
    }
  }

  const steps = trace.getLLMSteps()
  if (steps.length === 0) {
    return {
      pass: false,
      message: () => 'No LLM steps recorded; cannot evaluate output metric.',
    }
  }

  const targetIdx = config.index ?? (config.nth !== undefined ? config.nth - 1 : steps.length - 1)
  // A fractional or NaN position passes a plain range check but indexes nothing;
  // reject it here so the property reads below cannot throw on `undefined`.
  if (!Number.isInteger(targetIdx) || targetIdx < 0 || targetIdx >= steps.length) {
    return {
      pass: false,
      message: () => `LLM steps length ${steps.length}, but index/nth points to ${targetIdx}.`,
    }
  }

  const targetStep = steps[targetIdx]
  const targetField: EvaluationTarget = config.target ?? 'result'
  const targetText = targetField === 'prompt' ? targetStep.prompt ?? '' : targetStep.completion ?? ''
  if (!targetText) {
    return {
      pass: false,
      message: () => `Selected LLM step has empty ${targetField}; cannot evaluate.`,
    }
  }

  // resolveCondition throws when several conditions are given; report that as
  // a matcher failure instead of letting it escape.
  let condition: { kind: keyof EvaluationCondition; value: number }
  try {
    condition = resolveCondition(config.condition)
  } catch (err) {
    return {
      pass: false,
      message: () => (err as Error).message,
    }
  }

  const preparedText = prepareOutputForJudge(targetText, config.evaluationPrompt)
  const evalPrompt = `
Evaluation prompt (from user):
${config.evaluationPrompt}

Score the following text strictly between 0 and 1 (inclusive). Respond with only the number.

<output>
${preparedText}
</output>
`.trim()

  try {
    const raw = (await callProviderLLM(
      evalPrompt,
      { provider: config.provider, model: config.model, sdk: config.sdk, apiKey: config.apiKey, baseURL: config.baseURL },
      'You are an evaluation assistant. Return only a number between 0 and 1.',
      16,
      0
    )).content
    const score = parseFirstNumber(raw)
    if (score === null) {
      return {
        pass: false,
        message: () => `Could not parse numeric metric from model response: "${raw}"`,
      }
    }
    if (score < 0 || score > 1) {
      return {
        pass: false,
        message: () => `Metric ${score} is out of allowed range 0.0–1.0 (raw: "${raw}")`,
      }
    }

    const pass = checkCondition(score, condition)
    return {
      pass,
      message: () => {
        if (pass) {
          return `Expected metric NOT to satisfy ${condition.kind} ${condition.value} (score ${score})`
        }
        return `Metric check failed: score ${score} did not satisfy ${condition.kind} ${condition.value}. Raw response: "${raw}"`
      },
    }
  } catch (err) {
    return {
      pass: false,
      message: () => `LLM evaluation failed: ${(err as Error).message}`,
    }
  }
},
|
|
656
|
+
|
|
657
|
+
/**
 * Matcher: the trace recorded custom step(s) satisfying every filter in `config`.
 *
 * `kind` and `name` are compared for exact equality; `tag` must be present in
 * the step's `tags`. `contains` is a case-insensitive substring check over the
 * stringified payload, result and metadata together; `payloadContains`,
 * `resultContains` and `metadataContains` check the respective field only.
 * Count rule: exactly `times` when given; otherwise within
 * [`minTimes` ?? 0, `maxTimes` ?? Infinity] when either bound is given;
 * otherwise at least one matching step.
 */
toHaveCustomStep(trace: TraceHandle, config: CustomStepConfig = {}) {
  if (!isTraceHandle(trace) || typeof (trace as any).getCustomSteps !== 'function') {
    return {
      pass: false,
      message: () =>
        `Expected a TraceHandle (ctx.trace with getCustomSteps) but received ${typeof trace}.\nUse: expect(ctx.trace).toHaveCustomStep(...)`,
    }
  }

  const steps = (trace as any).getCustomSteps() as CustomStep[]

  // Turn any value into searchable text. Always returns a string.
  const matchString = (val: unknown): string => {
    if (val === undefined || val === null) return ''
    if (typeof val === 'string') return val
    try {
      // JSON.stringify yields `undefined` (without throwing) for functions and
      // symbols; fall back to String() so callers can safely call string methods.
      return JSON.stringify(val) ?? String(val)
    } catch {
      // e.g. circular structures or BigInt values
      return String(val)
    }
  }

  const matching = steps.filter((step) => {
    if (config.kind && step.kind !== config.kind) return false
    if (config.name && step.name !== config.name) return false
    if (config.tag && !(step.tags || []).includes(config.tag)) return false

    const payloadStr = matchString(step.payload).toLowerCase()
    const resultStr = matchString(step.result).toLowerCase()
    const metaStr = matchString(step.metadata).toLowerCase()
    const combined = [payloadStr, resultStr, metaStr].filter(Boolean).join(' ')

    if (config.contains && !combined.includes(config.contains.toLowerCase())) return false
    if (config.payloadContains && !payloadStr.includes(config.payloadContains.toLowerCase())) return false
    if (config.resultContains && !resultStr.includes(config.resultContains.toLowerCase())) return false
    if (config.metadataContains && !metaStr.includes(config.metadataContains.toLowerCase())) return false

    return true
  })

  const count = matching.length
  let pass: boolean
  if (config.times !== undefined) {
    pass = count === config.times
  } else if (config.minTimes !== undefined || config.maxTimes !== undefined) {
    const min = config.minTimes ?? 0
    const max = config.maxTimes ?? Infinity
    pass = count >= min && count <= max
  } else {
    pass = count > 0
  }

  return {
    pass,
    message: () => {
      if (pass) {
        return `Expected trace NOT to have custom step matching ${JSON.stringify(config)}`
      }
      const stepSummary =
        steps.length === 0
          ? 'no custom steps were recorded'
          : `${count} matching step(s) found; recorded custom steps: ${JSON.stringify(steps)}`
      return `Expected trace to have custom step matching ${JSON.stringify(config)}, but ${stepSummary}`
    },
  }
},
|
|
722
|
+
|
|
723
|
+
/**
 * Matcher: among LLM-step prompts that contain `config.filterContains`, assert
 * that they also contain `requireContains` and/or do not contain
 * `requireNotContains`. All comparisons are case-insensitive substring checks.
 *
 * Positional mode (`index`, 0-based, or `nth`, 1-based): only the selected
 * filtered prompt is checked; an invalid position fails the matcher.
 * Otherwise: every filtered prompt must satisfy the requirements, and the
 * number of filtered prompts that do must equal `times` when given, or lie
 * within [`minTimes` ?? 0, `maxTimes` ?? Infinity].
 */
toHavePromptWhere(trace: TraceHandle, config: PromptWhereConfig) {
  if (!isTraceHandle(trace)) {
    return {
      pass: false,
      message: () =>
        `Expected a TraceHandle (ctx.trace) but received ${typeof trace}.\nUse: expect(ctx.trace).toHavePromptWhere(...)`,
    }
  }
  if (!config || !config.filterContains) {
    return {
      pass: false,
      message: () => 'toHavePromptWhere requires filterContains',
    }
  }

  const filterNeedle = config.filterContains.toLowerCase()
  const requireNeedle = config.requireContains?.toLowerCase()
  const forbidNeedle = config.requireNotContains?.toLowerCase()

  // True when a prompt contains the required text (if any) and lacks the
  // forbidden text (if any).
  const meetsRequirements = (prompt: string): boolean => {
    const lower = prompt.toLowerCase()
    if (requireNeedle && !lower.includes(requireNeedle)) return false
    if (forbidNeedle && lower.includes(forbidNeedle)) return false
    return true
  }

  const prompts = trace.getLLMSteps().map((s) => s.prompt ?? '')
  const filtered = prompts.filter((p) => p.toLowerCase().includes(filterNeedle))

  // Optional positional check (index or nth)
  const targetIdx = config.index ?? (config.nth !== undefined ? config.nth - 1 : undefined)

  let checked: string[] = []
  let pass = true

  if (targetIdx !== undefined) {
    // A fractional or NaN position passes a plain range check but indexes
    // nothing; reject it so the lookup below cannot yield `undefined`.
    if (!Number.isInteger(targetIdx) || targetIdx < 0 || targetIdx >= filtered.length) {
      return {
        pass: false,
        message: () =>
          `Filtered prompts length ${filtered.length}, but index/nth points to ${targetIdx}. Config: ${JSON.stringify(config)}`,
      }
    }
    const candidate = filtered[targetIdx]
    pass = meetsRequirements(candidate)
    checked = pass ? [candidate] : []
  } else {
    checked = filtered.filter((p) => meetsRequirements(p))
    const count = checked.length

    if (config.times !== undefined) {
      pass = count === config.times
    } else {
      const min = config.minTimes ?? 0
      const max = config.maxTimes ?? Infinity
      pass = count >= min && count <= max
    }

    // Regardless of the count bounds, any filtered prompt that violates
    // requireContains / requireNotContains fails the assertion.
    if (count !== filtered.length) pass = false
  }

  return {
    pass,
    message: () => {
      if (pass) {
        return `Expected prompts NOT to satisfy filter/require combo: ${JSON.stringify(config)}`
      }
      const base = [`Expected prompts filtered by "${config.filterContains}" to satisfy requirements`]
      if (config.requireContains) base.push(`requireContains: "${config.requireContains}"`)
      if (config.requireNotContains) base.push(`requireNotContains: "${config.requireNotContains}"`)
      if (targetIdx !== undefined) {
        base.push(`checked index: ${targetIdx}`, `filtered count: ${filtered.length}`)
      } else {
        base.push(`filtered count: ${filtered.length}, passing count: ${checked.length}`)
        base.push(
          config.times !== undefined
            ? `expected exactly ${config.times}`
            : `expected between ${config.minTimes ?? 0} and ${config.maxTimes ?? Infinity}`,
        )
      }
      return base.filter(Boolean).join('; ')
    },
  }
},
|
|
820
|
+
})
|
|
821
|
+
}
|
|
822
|
+
|
|
823
|
+
// Export our patched expect so users can import it and get the correct type and runtime matchers
|
|
824
|
+
export { expect }
|