elasticdash-sdk 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +775 -0
- package/dist/browser-ui.d.ts +43 -0
- package/dist/browser-ui.d.ts.map +1 -0
- package/dist/browser-ui.js +246 -0
- package/dist/browser-ui.js.map +1 -0
- package/dist/capture/event.d.ts +33 -0
- package/dist/capture/event.d.ts.map +1 -0
- package/dist/capture/event.js +2 -0
- package/dist/capture/event.js.map +1 -0
- package/dist/capture/index.d.ts +4 -0
- package/dist/capture/index.d.ts.map +1 -0
- package/dist/capture/index.js +4 -0
- package/dist/capture/index.js.map +1 -0
- package/dist/capture/recorder.d.ts +24 -0
- package/dist/capture/recorder.d.ts.map +1 -0
- package/dist/capture/recorder.js +46 -0
- package/dist/capture/recorder.js.map +1 -0
- package/dist/capture/replay.d.ts +20 -0
- package/dist/capture/replay.d.ts.map +1 -0
- package/dist/capture/replay.js +47 -0
- package/dist/capture/replay.js.map +1 -0
- package/dist/ci/api-client.d.ts +38 -0
- package/dist/ci/api-client.d.ts.map +1 -0
- package/dist/ci/api-client.js +96 -0
- package/dist/ci/api-client.js.map +1 -0
- package/dist/ci/benchmark.d.ts +33 -0
- package/dist/ci/benchmark.d.ts.map +1 -0
- package/dist/ci/benchmark.js +213 -0
- package/dist/ci/benchmark.js.map +1 -0
- package/dist/ci/ed-runner.d.ts +48 -0
- package/dist/ci/ed-runner.d.ts.map +1 -0
- package/dist/ci/ed-runner.js +260 -0
- package/dist/ci/ed-runner.js.map +1 -0
- package/dist/ci/executor.d.ts +13 -0
- package/dist/ci/executor.d.ts.map +1 -0
- package/dist/ci/executor.js +542 -0
- package/dist/ci/executor.js.map +1 -0
- package/dist/ci/git-info.d.ts +17 -0
- package/dist/ci/git-info.d.ts.map +1 -0
- package/dist/ci/git-info.js +102 -0
- package/dist/ci/git-info.js.map +1 -0
- package/dist/ci/index.d.ts +6 -0
- package/dist/ci/index.d.ts.map +1 -0
- package/dist/ci/index.js +4 -0
- package/dist/ci/index.js.map +1 -0
- package/dist/ci/measurement.d.ts +9 -0
- package/dist/ci/measurement.d.ts.map +1 -0
- package/dist/ci/measurement.js +15 -0
- package/dist/ci/measurement.js.map +1 -0
- package/dist/ci/replay.d.ts +31 -0
- package/dist/ci/replay.d.ts.map +1 -0
- package/dist/ci/replay.js +96 -0
- package/dist/ci/replay.js.map +1 -0
- package/dist/ci/reporters/default.d.ts +8 -0
- package/dist/ci/reporters/default.d.ts.map +1 -0
- package/dist/ci/reporters/default.js +46 -0
- package/dist/ci/reporters/default.js.map +1 -0
- package/dist/ci/reporters/index.d.ts +8 -0
- package/dist/ci/reporters/index.d.ts.map +1 -0
- package/dist/ci/reporters/index.js +14 -0
- package/dist/ci/reporters/index.js.map +1 -0
- package/dist/ci/reporters/json.d.ts +8 -0
- package/dist/ci/reporters/json.d.ts.map +1 -0
- package/dist/ci/reporters/json.js +14 -0
- package/dist/ci/reporters/json.js.map +1 -0
- package/dist/ci/reporters/junit.d.ts +8 -0
- package/dist/ci/reporters/junit.d.ts.map +1 -0
- package/dist/ci/reporters/junit.js +48 -0
- package/dist/ci/reporters/junit.js.map +1 -0
- package/dist/ci/runner.d.ts +3 -0
- package/dist/ci/runner.d.ts.map +1 -0
- package/dist/ci/runner.js +187 -0
- package/dist/ci/runner.js.map +1 -0
- package/dist/ci/test-discovery.d.ts +5 -0
- package/dist/ci/test-discovery.d.ts.map +1 -0
- package/dist/ci/test-discovery.js +11 -0
- package/dist/ci/test-discovery.js.map +1 -0
- package/dist/ci/test-loader.d.ts +19 -0
- package/dist/ci/test-loader.d.ts.map +1 -0
- package/dist/ci/test-loader.js +149 -0
- package/dist/ci/test-loader.js.map +1 -0
- package/dist/ci/test-registry.d.ts +42 -0
- package/dist/ci/test-registry.d.ts.map +1 -0
- package/dist/ci/test-registry.js +18 -0
- package/dist/ci/test-registry.js.map +1 -0
- package/dist/ci/trace-schema.d.ts +30 -0
- package/dist/ci/trace-schema.d.ts.map +1 -0
- package/dist/ci/trace-schema.js +66 -0
- package/dist/ci/trace-schema.js.map +1 -0
- package/dist/ci/trace-writer.d.ts +16 -0
- package/dist/ci/trace-writer.d.ts.map +1 -0
- package/dist/ci/trace-writer.js +108 -0
- package/dist/ci/trace-writer.js.map +1 -0
- package/dist/ci/types.d.ts +108 -0
- package/dist/ci/types.d.ts.map +1 -0
- package/dist/ci/types.js +3 -0
- package/dist/ci/types.js.map +1 -0
- package/dist/ci/upload-client.d.ts +74 -0
- package/dist/ci/upload-client.d.ts.map +1 -0
- package/dist/ci/upload-client.js +195 -0
- package/dist/ci/upload-client.js.map +1 -0
- package/dist/cli.d.ts +3 -0
- package/dist/cli.d.ts.map +1 -0
- package/dist/cli.js +716 -0
- package/dist/cli.js.map +1 -0
- package/dist/core/agent-state.d.ts +47 -0
- package/dist/core/agent-state.d.ts.map +1 -0
- package/dist/core/agent-state.js +137 -0
- package/dist/core/agent-state.js.map +1 -0
- package/dist/core/judge-utils.d.ts +22 -0
- package/dist/core/judge-utils.d.ts.map +1 -0
- package/dist/core/judge-utils.js +211 -0
- package/dist/core/judge-utils.js.map +1 -0
- package/dist/core/registry.d.ts +28 -0
- package/dist/core/registry.d.ts.map +1 -0
- package/dist/core/registry.js +52 -0
- package/dist/core/registry.js.map +1 -0
- package/dist/dashboard-server.d.ts +65 -0
- package/dist/dashboard-server.d.ts.map +1 -0
- package/dist/dashboard-server.js +3940 -0
- package/dist/dashboard-server.js.map +1 -0
- package/dist/execution/tool-runner.d.ts +26 -0
- package/dist/execution/tool-runner.d.ts.map +1 -0
- package/dist/execution/tool-runner.js +316 -0
- package/dist/execution/tool-runner.js.map +1 -0
- package/dist/html/dashboard.html +2218 -0
- package/dist/http.d.ts +14 -0
- package/dist/http.d.ts.map +1 -0
- package/dist/http.js +13 -0
- package/dist/http.js.map +1 -0
- package/dist/index.cjs +8102 -0
- package/dist/index.d.ts +61 -0
- package/dist/index.d.ts.map +1 -0
- package/dist/index.js +67 -0
- package/dist/index.js.map +1 -0
- package/dist/interceptors/ai-interceptor.d.ts +26 -0
- package/dist/interceptors/ai-interceptor.d.ts.map +1 -0
- package/dist/interceptors/ai-interceptor.js +756 -0
- package/dist/interceptors/ai-interceptor.js.map +1 -0
- package/dist/interceptors/db-auto.d.ts +8 -0
- package/dist/interceptors/db-auto.d.ts.map +1 -0
- package/dist/interceptors/db-auto.js +217 -0
- package/dist/interceptors/db-auto.js.map +1 -0
- package/dist/interceptors/db.d.ts +23 -0
- package/dist/interceptors/db.d.ts.map +1 -0
- package/dist/interceptors/db.js +137 -0
- package/dist/interceptors/db.js.map +1 -0
- package/dist/interceptors/http.d.ts +28 -0
- package/dist/interceptors/http.d.ts.map +1 -0
- package/dist/interceptors/http.js +356 -0
- package/dist/interceptors/http.js.map +1 -0
- package/dist/interceptors/side-effects.d.ts +7 -0
- package/dist/interceptors/side-effects.d.ts.map +1 -0
- package/dist/interceptors/side-effects.js +72 -0
- package/dist/interceptors/side-effects.js.map +1 -0
- package/dist/interceptors/telemetry-push.d.ts +142 -0
- package/dist/interceptors/telemetry-push.d.ts.map +1 -0
- package/dist/interceptors/telemetry-push.js +463 -0
- package/dist/interceptors/telemetry-push.js.map +1 -0
- package/dist/interceptors/tool.d.ts +2 -0
- package/dist/interceptors/tool.d.ts.map +1 -0
- package/dist/interceptors/tool.js +274 -0
- package/dist/interceptors/tool.js.map +1 -0
- package/dist/interceptors/workflow-ai.d.ts +5 -0
- package/dist/interceptors/workflow-ai.d.ts.map +1 -0
- package/dist/interceptors/workflow-ai.js +382 -0
- package/dist/interceptors/workflow-ai.js.map +1 -0
- package/dist/internals/conditional-recorder.d.ts +21 -0
- package/dist/internals/conditional-recorder.d.ts.map +1 -0
- package/dist/internals/conditional-recorder.js +54 -0
- package/dist/internals/conditional-recorder.js.map +1 -0
- package/dist/internals/mock-resolver.d.ts +146 -0
- package/dist/internals/mock-resolver.d.ts.map +1 -0
- package/dist/internals/mock-resolver.js +427 -0
- package/dist/internals/mock-resolver.js.map +1 -0
- package/dist/matchers/index.d.ts +96 -0
- package/dist/matchers/index.d.ts.map +1 -0
- package/dist/matchers/index.js +668 -0
- package/dist/matchers/index.js.map +1 -0
- package/dist/observability.d.ts +82 -0
- package/dist/observability.d.ts.map +1 -0
- package/dist/observability.js +471 -0
- package/dist/observability.js.map +1 -0
- package/dist/portal-executor.d.ts +30 -0
- package/dist/portal-executor.d.ts.map +1 -0
- package/dist/portal-executor.js +324 -0
- package/dist/portal-executor.js.map +1 -0
- package/dist/portal-server.d.ts +3 -0
- package/dist/portal-server.d.ts.map +1 -0
- package/dist/portal-server.js +279 -0
- package/dist/portal-server.js.map +1 -0
- package/dist/proxy/llm-capture.d.ts +14 -0
- package/dist/proxy/llm-capture.d.ts.map +1 -0
- package/dist/proxy/llm-capture.js +264 -0
- package/dist/proxy/llm-capture.js.map +1 -0
- package/dist/reporter.d.ts +3 -0
- package/dist/reporter.d.ts.map +1 -0
- package/dist/reporter.js +72 -0
- package/dist/reporter.js.map +1 -0
- package/dist/runWorkflowSubprocess.d.ts +14 -0
- package/dist/runWorkflowSubprocess.d.ts.map +1 -0
- package/dist/runWorkflowSubprocess.js +66 -0
- package/dist/runWorkflowSubprocess.js.map +1 -0
- package/dist/runner.d.ts +16 -0
- package/dist/runner.d.ts.map +1 -0
- package/dist/runner.js +138 -0
- package/dist/runner.js.map +1 -0
- package/dist/socket-connector.d.ts +22 -0
- package/dist/socket-connector.d.ts.map +1 -0
- package/dist/socket-connector.js +104 -0
- package/dist/socket-connector.js.map +1 -0
- package/dist/telemetry-batcher.d.ts +56 -0
- package/dist/telemetry-batcher.d.ts.map +1 -0
- package/dist/telemetry-batcher.js +143 -0
- package/dist/telemetry-batcher.js.map +1 -0
- package/dist/test-setup.d.ts +12 -0
- package/dist/test-setup.d.ts.map +1 -0
- package/dist/test-setup.js +13 -0
- package/dist/test-setup.js.map +1 -0
- package/dist/tool-registry.d.ts +31 -0
- package/dist/tool-registry.d.ts.map +1 -0
- package/dist/tool-registry.js +73 -0
- package/dist/tool-registry.js.map +1 -0
- package/dist/tool-runner-worker.d.ts +2 -0
- package/dist/tool-runner-worker.d.ts.map +1 -0
- package/dist/tool-runner-worker.js +215 -0
- package/dist/tool-runner-worker.js.map +1 -0
- package/dist/trace-adapter/context.d.ts +72 -0
- package/dist/trace-adapter/context.d.ts.map +1 -0
- package/dist/trace-adapter/context.js +80 -0
- package/dist/trace-adapter/context.js.map +1 -0
- package/dist/tracing.d.ts +2 -0
- package/dist/tracing.d.ts.map +1 -0
- package/dist/tracing.js +59 -0
- package/dist/tracing.js.map +1 -0
- package/dist/trigger-executor.d.ts +12 -0
- package/dist/trigger-executor.d.ts.map +1 -0
- package/dist/trigger-executor.js +130 -0
- package/dist/trigger-executor.js.map +1 -0
- package/dist/types/portal.d.ts +76 -0
- package/dist/types/portal.d.ts.map +1 -0
- package/dist/types/portal.js +2 -0
- package/dist/types/portal.js.map +1 -0
- package/dist/utils/debug.d.ts +3 -0
- package/dist/utils/debug.d.ts.map +1 -0
- package/dist/utils/debug.js +8 -0
- package/dist/utils/debug.js.map +1 -0
- package/dist/utils/license-error.d.ts +23 -0
- package/dist/utils/license-error.d.ts.map +1 -0
- package/dist/utils/license-error.js +42 -0
- package/dist/utils/license-error.js.map +1 -0
- package/dist/utils/redact.d.ts +7 -0
- package/dist/utils/redact.d.ts.map +1 -0
- package/dist/utils/redact.js +26 -0
- package/dist/utils/redact.js.map +1 -0
- package/dist/workflow-runner-worker.d.ts +2 -0
- package/dist/workflow-runner-worker.d.ts.map +1 -0
- package/dist/workflow-runner-worker.js +329 -0
- package/dist/workflow-runner-worker.js.map +1 -0
- package/dist/workflow-runner.d.ts +14 -0
- package/dist/workflow-runner.d.ts.map +1 -0
- package/dist/workflow-runner.js +34 -0
- package/dist/workflow-runner.js.map +1 -0
- package/docs/agent-coding-instructions.md +138 -0
- package/docs/agent-integration-guide.md +564 -0
- package/docs/agents.md +140 -0
- package/docs/dashboard.md +394 -0
- package/docs/deno.md +69 -0
- package/docs/instrumentation.md +424 -0
- package/docs/langfuse-trace-structure.md +145 -0
- package/docs/matchers.md +173 -0
- package/docs/observability_contract.md +192 -0
- package/docs/observability_mode.md +195 -0
- package/docs/quickstart.md +621 -0
- package/docs/security-compliance.md +566 -0
- package/docs/test-writing-guidelines.md +444 -0
- package/docs/tools.md +165 -0
- package/docs/workflow-modes.md +253 -0
- package/package.json +76 -0
- package/src/browser-ui.ts +281 -0
- package/src/capture/event.ts +30 -0
- package/src/capture/index.ts +3 -0
- package/src/capture/recorder.ts +62 -0
- package/src/capture/replay.ts +55 -0
- package/src/ci/api-client.ts +136 -0
- package/src/ci/benchmark.ts +257 -0
- package/src/ci/ed-runner.ts +351 -0
- package/src/ci/executor.ts +671 -0
- package/src/ci/git-info.ts +127 -0
- package/src/ci/index.ts +5 -0
- package/src/ci/measurement.ts +25 -0
- package/src/ci/replay.ts +127 -0
- package/src/ci/reporters/default.ts +50 -0
- package/src/ci/reporters/index.ts +21 -0
- package/src/ci/reporters/json.ts +18 -0
- package/src/ci/reporters/junit.ts +61 -0
- package/src/ci/runner.ts +208 -0
- package/src/ci/test-discovery.ts +16 -0
- package/src/ci/test-loader.ts +187 -0
- package/src/ci/test-registry.ts +62 -0
- package/src/ci/trace-schema.ts +96 -0
- package/src/ci/trace-writer.ts +107 -0
- package/src/ci/types.ts +115 -0
- package/src/ci/upload-client.ts +300 -0
- package/src/cli.ts +811 -0
- package/src/core/agent-state.ts +162 -0
- package/src/core/judge-utils.ts +232 -0
- package/src/core/registry.ts +92 -0
- package/src/dashboard-server.ts +2047 -0
- package/src/execution/tool-runner.ts +352 -0
- package/src/html/dashboard.html +2218 -0
- package/src/http.ts +13 -0
- package/src/index.ts +138 -0
- package/src/interceptors/ai-interceptor.ts +798 -0
- package/src/interceptors/db-auto.ts +243 -0
- package/src/interceptors/db.ts +156 -0
- package/src/interceptors/http.ts +393 -0
- package/src/interceptors/side-effects.ts +83 -0
- package/src/interceptors/telemetry-push.ts +537 -0
- package/src/interceptors/tool.ts +287 -0
- package/src/interceptors/workflow-ai.ts +419 -0
- package/src/internals/conditional-recorder.ts +63 -0
- package/src/internals/mock-resolver.ts +492 -0
- package/src/matchers/index.ts +824 -0
- package/src/observability.ts +501 -0
- package/src/portal-executor.ts +355 -0
- package/src/portal-server.ts +304 -0
- package/src/proxy/llm-capture.ts +301 -0
- package/src/reporter.ts +81 -0
- package/src/runWorkflowSubprocess.ts +74 -0
- package/src/runner.ts +178 -0
- package/src/socket-connector.ts +117 -0
- package/src/telemetry-batcher.ts +191 -0
- package/src/test-setup.ts +16 -0
- package/src/tool-registry.ts +94 -0
- package/src/tool-runner-worker.ts +244 -0
- package/src/trace-adapter/context.ts +156 -0
- package/src/tracing.ts +62 -0
- package/src/trigger-executor.ts +171 -0
- package/src/types/agent.d.ts +63 -0
- package/src/types/expect.d.ts +81 -0
- package/src/types/modules.d.ts +2 -0
- package/src/types/portal.ts +69 -0
- package/src/utils/debug.ts +8 -0
- package/src/utils/license-error.ts +43 -0
- package/src/utils/redact.ts +25 -0
- package/src/workflow-runner-worker.ts +386 -0
- package/src/workflow-runner.ts +58 -0
|
@@ -0,0 +1,671 @@
|
|
|
1
|
+
import { executePortalTask, checkToolAvailability, checkAIAvailability } from '../portal-executor.js'
|
|
2
|
+
import { scanTools } from '../execution/tool-runner.js'
|
|
3
|
+
import { callProviderLLM } from '../matchers/index.js'
|
|
4
|
+
import { prepareOutputForJudge } from '../core/judge-utils.js'
|
|
5
|
+
import type { ToolInfo } from '../execution/tool-runner.js'
|
|
6
|
+
import type {
|
|
7
|
+
APITestGroupTest,
|
|
8
|
+
APIExpectation,
|
|
9
|
+
CISingleRunResult,
|
|
10
|
+
CIExpectationResult,
|
|
11
|
+
} from './types.js'
|
|
12
|
+
|
|
13
|
+
// ─── Test Executor ───────────────────────────────────────────
|
|
14
|
+
// Executes a single TestGroupTest: runs it N times, evaluates expectations,
|
|
15
|
+
// determines pass/fail based on pass_threshold.
|
|
16
|
+
|
|
17
|
+
/**
 * Aggregate outcome returned by `executeTest` for one test across all of its runs.
 */
interface ExecutionResult {
  /** Overall verdict, as computed by `determinePassFail` from the threshold, runs and expectation results. */
  passed: boolean
  /** One entry per executed run, in execution order. */
  singleRuns: CISingleRunResult[]
  /** One entry per expectation declared on the test. */
  expectationResults: CIExpectationResult[]
  /** Wall-clock milliseconds from the start of `executeTest` until its result was assembled. */
  durationMs: number
}
|
|
23
|
+
|
|
24
|
+
/**
 * Execute a test (single-step or full-flow) according to its configuration.
 *
 * The test is run `test.run_count` times (1 when unset or 0), one run after the
 * other. Each run is raced against a timer of `test.timeout_ms` milliseconds
 * (30000 when unset or 0). A run that throws or times out is recorded as a failed
 * run carrying the error message; it never aborts the remaining runs.
 *
 * Note: a timeout only stops *waiting* for the run. `Promise.race` cannot cancel
 * the underlying work, so a timed-out run may keep executing in the background.
 *
 * @param test - Test definition (run count, timeout, expectations, pass threshold).
 * @param cwd - Workspace directory used to scan tools and resolve runtime modules.
 * @returns The overall verdict, every single-run result, every expectation result
 *          and the total wall-clock duration in milliseconds.
 */
export async function executeTest(
  test: APITestGroupTest,
  cwd: string,
): Promise<ExecutionResult> {
  const tools = scanTools(cwd)
  const runCount = test.run_count || 1
  const timeoutMs = test.timeout_ms || 30000
  const startTime = Date.now()

  const singleRuns: CISingleRunResult[] = []

  for (let i = 0; i < runCount; i++) {
    const runStart = Date.now()
    let result: CISingleRunResult
    let timeoutHandle: ReturnType<typeof setTimeout> | undefined

    try {
      const runPromise = executeSingleRun(test, cwd, tools, i)
      const timeoutPromise = new Promise<never>((_, reject) => {
        timeoutHandle = setTimeout(
          () => reject(new Error(`Test timed out after ${timeoutMs}ms`)),
          timeoutMs,
        )
      })
      result = await Promise.race([runPromise, timeoutPromise])
    } catch (err) {
      result = {
        runIndex: i + 1,
        passed: false,
        durationMs: Date.now() - runStart,
        inputTokens: 0,
        outputTokens: 0,
        totalTokens: 0,
        output: null,
        trace: null,
        error: err instanceof Error ? err.message : String(err),
      }
    } finally {
      // Always cancel the timer: a pending timeout would otherwise keep the
      // event loop (and therefore the process) alive for up to `timeoutMs`
      // after the run has already settled.
      clearTimeout(timeoutHandle)
    }

    singleRuns.push(result)
  }

  // Evaluate expectations against the single runs
  const expectationResults = await evaluateExpectations(test.expectations, singleRuns)

  // Determine overall pass/fail
  const passed = determinePassFail(test.pass_threshold, singleRuns, expectationResults)

  return {
    passed,
    singleRuns,
    expectationResults,
    durationMs: Date.now() - startTime,
  }
}
|
|
78
|
+
|
|
79
|
+
// ─── Single Run Execution ────────────────────────────────────
|
|
80
|
+
|
|
81
|
+
/**
 * Perform one run of a test by delegating to the executor that matches its type.
 *
 * Tests whose `test_type` is `'single-step'` go to `executeSingleStep`; every
 * other type is treated as a full-flow test and goes to `executeFullFlow`.
 *
 * @param runIndex - Zero-based index of this run.
 */
async function executeSingleRun(
  test: APITestGroupTest,
  cwd: string,
  tools: ToolInfo[],
  runIndex: number,
): Promise<CISingleRunResult> {
  // Timestamp taken before dispatch so the executors can report elapsed time.
  const startedAt = Date.now()
  const execute = test.test_type === 'single-step' ? executeSingleStep : executeFullFlow
  return execute(test, cwd, tools, runIndex, startedAt)
}
|
|
95
|
+
|
|
96
|
+
/**
 * Run a single AI or tool step in isolation through the portal executor.
 *
 * The run is reported as failed, without executing anything, when the test lacks
 * `target_step_type` / `target_step_name` or when the availability check for the
 * step reports it as unavailable.
 *
 * @param runIndex - Zero-based run index; the returned `runIndex` is one-based.
 * @param start - Timestamp (ms) the run started at, used for failure durations.
 */
async function executeSingleStep(
  test: APITestGroupTest,
  cwd: string,
  tools: ToolInfo[],
  runIndex: number,
  start: number,
): Promise<CISingleRunResult> {
  const { target_step_type: stepType, target_step_name: stepName } = test // type is 'ai' or 'tool'

  // Builds the result for a run that could not be executed at all.
  const notExecuted = (error: string): CISingleRunResult => ({
    runIndex: runIndex + 1,
    passed: false,
    durationMs: Date.now() - start,
    inputTokens: 0,
    outputTokens: 0,
    totalTokens: 0,
    output: null,
    trace: null,
    error,
  })

  if (!stepType || !stepName) {
    return notExecuted('Single-step test requires target_step_type and target_step_name.')
  }

  const isAIStep = stepType === 'ai'

  // Pre-validate availability before spending time on execution.
  const availability = isAIStep
    ? checkAIAvailability(undefined, stepName)
    : checkToolAvailability(stepName, cwd, tools)
  if (!availability.available) {
    return notExecuted(`Step unavailable: ${availability.reason}`)
  }

  // Delegate to the portal executor (reuses existing infrastructure).
  const outcome = await executePortalTask(
    {
      taskId: `ci-${test.id}-run-${runIndex}`,
      type: isAIStep ? 'ai' : 'tool',
      name: stepName,
      input: test.mock_input,
      frozenEvents: test.frozen_events as any[],
    },
    cwd,
    tools,
  )

  const usage = outcome.usage
  return {
    runIndex: runIndex + 1,
    passed: outcome.ok,
    durationMs: outcome.durationMs,
    inputTokens: usage?.inputTokens ?? 0,
    outputTokens: usage?.outputTokens ?? 0,
    totalTokens: usage?.totalTokens ?? 0,
    output: outcome.output,
    trace: null,
    error: outcome.error,
  }
}
|
|
158
|
+
|
|
159
|
+
/**
 * Run one full-flow iteration: load the workspace's `ed_workflows` module, pick
 * a workflow function and execute it through `runWorkflow`.
 *
 * Function selection: the export whose name equals `test.target_step_name` when
 * one exists, otherwise the first exported function. The workflow runs in replay
 * mode when the test carries a non-empty `frozen_events` array. Token usage is
 * summed over the `'ai'` events of the returned trace.
 *
 * Any error (missing module, no exported functions, workflow throwing) is turned
 * into a failed run result rather than propagated.
 *
 * @param runIndex - Zero-based run index; the returned `runIndex` is one-based.
 * @param start - Timestamp (ms) the run started at.
 */
async function executeFullFlow(
  test: APITestGroupTest,
  cwd: string,
  tools: ToolInfo[],
  runIndex: number,
  start: number,
): Promise<CISingleRunResult> {
  // Common shape of every unsuccessful outcome; duration is sampled when reported.
  const failed = (error: string): CISingleRunResult => ({
    runIndex: runIndex + 1,
    passed: false,
    durationMs: Date.now() - start,
    inputTokens: 0,
    outputTokens: 0,
    totalTokens: 0,
    output: null,
    trace: null,
    error,
  })

  try {
    const { runWorkflow } = await import('../workflow-runner.js')
    const { resolveRuntimeModule } = await import('../execution/tool-runner.js')
    const { pathToFileURL } = await import('node:url')

    const modulePath = resolveRuntimeModule(cwd, 'ed_workflows')
    if (!modulePath) {
      return failed('Cannot find ed_workflows.ts/js in workspace root.')
    }

    // Load the user's workflow module dynamically.
    const loadedModule = await import(pathToFileURL(modulePath).href)

    // Every exported function is a candidate workflow.
    const candidates = Object.entries(loadedModule).filter(
      ([, exported]) => typeof exported === 'function'
    ) as [string, (...args: unknown[]) => Promise<unknown>][]

    if (candidates.length === 0) {
      return failed('No workflow functions found in ed_workflows.')
    }

    // Prefer the export named after target_step_name; fall back to the first one.
    const firstFn = candidates[0][1]
    const namedFn = test.target_step_name
      ? candidates.find(([exportName]) => exportName === test.target_step_name)?.[1]
      : undefined
    const fn = namedFn ?? firstFn

    const replayHistory = Array.isArray(test.frozen_events) ? test.frozen_events : []

    const { result, trace } = await runWorkflow(
      () => fn(test.workflow_input) as Promise<unknown>,
      {
        replayMode: replayHistory.length > 0,
        history: replayHistory as any[],
        interceptHttp: true,
        interceptSideEffects: true,
      },
    )

    // Accumulate usage over AI events; both long and short field names are accepted.
    let inputTokens = 0
    let outputTokens = 0
    let totalTokens = 0
    if (trace?.events) {
      for (const evt of trace.events) {
        if (evt.type !== 'ai' || !evt.usage) continue
        const usage = evt.usage as any
        inputTokens += usage.inputTokens ?? usage.input ?? 0
        outputTokens += usage.outputTokens ?? usage.output ?? 0
        totalTokens += usage.totalTokens ?? usage.total ?? 0
      }
    }

    return {
      runIndex: runIndex + 1,
      passed: true,
      durationMs: Date.now() - start,
      inputTokens,
      outputTokens,
      totalTokens,
      output: result,
      trace: trace ?? null,
    }
  } catch (err) {
    return failed(err instanceof Error ? err.message : String(err))
  }
}
|
|
260
|
+
|
|
261
|
+
// ─── Expectation Evaluation ──────────────────────────────────
|
|
262
|
+
|
|
263
|
+
/**
 * Evaluate every expectation against the collected runs.
 *
 * Expectations are evaluated strictly one at a time, in declaration order, and
 * the results are returned in that same order.
 */
async function evaluateExpectations(
  expectations: APIExpectation[],
  singleRuns: CISingleRunResult[],
): Promise<CIExpectationResult[]> {
  const evaluated: CIExpectationResult[] = []

  for (const expectation of expectations) {
    evaluated.push(await evaluateExpectation(expectation, singleRuns))
  }

  return evaluated
}
|
|
276
|
+
|
|
277
|
+
/**
 * Evaluate a single expectation by routing it to the evaluator for its `type`.
 *
 * An unrecognised type does not throw; it yields a failed result whose detail
 * names the unknown type.
 */
async function evaluateExpectation(
  exp: APIExpectation,
  singleRuns: CISingleRunResult[],
): Promise<CIExpectationResult> {
  if (exp.type === 'token-budget') return evaluateTokenBudget(exp, singleRuns)
  if (exp.type === 'latency-budget') return evaluateLatencyBudget(exp, singleRuns)
  if (exp.type === 'output-contains') return evaluateOutputContains(exp, singleRuns)
  if (exp.type === 'output-schema') return evaluateOutputSchema(exp, singleRuns)
  if (exp.type === 'tool-called') return evaluateToolCalled(exp, singleRuns)
  if (exp.type === 'determinism') return evaluateDeterminism(exp, singleRuns)
  if (exp.type === 'llm-judge') return evaluateLLMJudge(exp, singleRuns)

  // No evaluator registered for this type.
  return {
    expectationId: exp.id,
    type: exp.type,
    passed: false,
    detail: `Unknown expectation type: ${exp.type}`,
  }
}
|
|
305
|
+
|
|
306
|
+
/**
 * Check token consumption against the expectation's budgets.
 *
 * - `max_tokens_per_run`: each run's `totalTokens` must not exceed it; violations
 *   are reported per run.
 * - `max_total_tokens`: the sum of `totalTokens` over all runs must not exceed it;
 *   a violation fails the expectation without marking any individual run.
 *
 * A budget that is null or undefined is not enforced.
 */
function evaluateTokenBudget(exp: APIExpectation, runs: CISingleRunResult[]): CIExpectationResult {
  const perRunCap = exp.max_tokens_per_run
  const perRun: Record<number, { passed: boolean; detail?: string }> = {}
  let withinBudget = true

  for (const { runIndex, totalTokens } of runs) {
    const overCap = perRunCap != null && totalTokens > perRunCap
    perRun[runIndex] = {
      passed: !overCap,
      detail: overCap ? `total ${totalTokens} > max ${perRunCap}` : undefined,
    }
    if (overCap) withinBudget = false
  }

  if (exp.max_total_tokens != null) {
    let grandTotal = 0
    for (const run of runs) grandTotal += run.totalTokens
    if (grandTotal > exp.max_total_tokens) withinBudget = false
  }

  return {
    expectationId: exp.id,
    type: 'token-budget',
    passed: withinBudget,
    detail: withinBudget ? undefined : 'Token budget exceeded.',
    perRun,
  }
}
|
|
337
|
+
|
|
338
|
+
/**
 * Check run durations against the expectation's latency budgets.
 *
 * - `max_duration_ms`: each run's `durationMs` must not exceed it; violations are
 *   reported per run.
 * - `max_total_duration_ms`: the summed duration of all runs must not exceed it;
 *   a violation fails the expectation without marking any individual run.
 *
 * A budget that is null or undefined is not enforced.
 */
function evaluateLatencyBudget(exp: APIExpectation, runs: CISingleRunResult[]): CIExpectationResult {
  const perRunLimit = exp.max_duration_ms
  const totalLimit = exp.max_total_duration_ms
  const perRun: Record<number, { passed: boolean; detail?: string }> = {}
  let withinBudget = true

  runs.forEach((run) => {
    const tooSlow = perRunLimit != null && run.durationMs > perRunLimit
    if (tooSlow) {
      perRun[run.runIndex] = { passed: false, detail: `${run.durationMs}ms > ${perRunLimit}ms` }
      withinBudget = false
    } else {
      perRun[run.runIndex] = { passed: true, detail: undefined }
    }
  })

  if (totalLimit != null) {
    let elapsedTotal = 0
    for (const run of runs) elapsedTotal += run.durationMs
    if (elapsedTotal > totalLimit) withinBudget = false
  }

  return {
    expectationId: exp.id,
    type: 'latency-budget',
    passed: withinBudget,
    detail: withinBudget ? undefined : 'Latency budget exceeded.',
    perRun,
  }
}
|
|
366
|
+
|
|
367
|
+
/**
 * Check each run's output for required and/or forbidden text.
 *
 * String outputs are searched as-is; any other output is searched in its
 * `JSON.stringify` form (null/undefined outputs are serialized as an empty
 * string, i.e. the two characters `""`). With `case_insensitive` set, both the
 * output and the search texts are lower-cased before comparison.
 *
 * A run passes when its output contains `contains_text` (if set) and does not
 * contain `not_contains_text` (if set). When neither text is set, every run
 * passes without its output being inspected.
 *
 * Outputs that cannot be serialized (circular structures, BigInt values,
 * functions, symbols) fail the run with an explanatory detail instead of
 * throwing and aborting the whole evaluation.
 */
function evaluateOutputContains(exp: APIExpectation, runs: CISingleRunResult[]): CIExpectationResult {
  const perRun: Record<number, { passed: boolean; detail?: string }> = {}
  let allPassed = true

  const normalize = (text: string): string => (exp.case_insensitive ? text.toLowerCase() : text)

  // Returns undefined when the output has no usable text form.
  const toText = (output: unknown): string | undefined => {
    if (typeof output === 'string') return output
    try {
      // JSON.stringify throws on cycles/BigInt and yields undefined for functions/symbols.
      const serialized: unknown = JSON.stringify(output ?? '')
      return typeof serialized === 'string' ? serialized : undefined
    } catch {
      return undefined
    }
  }

  // Search texts are loop-invariant, so normalize them once. Empty strings count as "not set".
  const requiredText = exp.contains_text ? normalize(exp.contains_text) : undefined
  const forbiddenText = exp.not_contains_text ? normalize(exp.not_contains_text) : undefined
  const hasChecks = requiredText !== undefined || forbiddenText !== undefined

  for (const run of runs) {
    let passed = true
    let detail: string | undefined

    if (hasChecks) {
      const outputStr = toText(run.output)
      if (outputStr === undefined) {
        passed = false
        detail = 'Output could not be serialized for text check.'
      } else {
        const haystack = normalize(outputStr)
        if (requiredText !== undefined && !haystack.includes(requiredText)) passed = false
        if (forbiddenText !== undefined && haystack.includes(forbiddenText)) passed = false
        if (!passed) detail = 'Output text check failed.'
      }
    }

    perRun[run.runIndex] = { passed, detail }
    if (!passed) allPassed = false
  }

  return {
    expectationId: exp.id,
    type: 'output-contains',
    passed: allPassed,
    detail: allPassed ? undefined : 'Output contains check failed.',
    perRun,
  }
}
|
|
400
|
+
|
|
401
|
+
/**
 * Basic JSON-schema check of each run's output: top-level `type` and, for
 * objects, the presence of `required` keys. Nested schemas, formats and other
 * keywords are not evaluated.
 *
 * - With no schema on the expectation, every run passes.
 * - A string output is always JSON-parsed first; a string that is not valid JSON
 *   fails the run with "Output is not valid JSON.".
 * - Supported `type` values: object, array, string, number, integer, boolean,
 *   null. Any other (or missing, or non-string) `type` imposes no type constraint.
 * - `required` keys must be own properties of the output object; inherited names
 *   such as `constructor` or `toString` do not satisfy the requirement.
 */
function evaluateOutputSchema(exp: APIExpectation, runs: CISingleRunResult[]): CIExpectationResult {
  const perRun: Record<number, { passed: boolean; detail?: string }> = {}
  let allPassed = true

  const schema = exp.json_schema as Record<string, unknown> | null

  // True when `value` satisfies the JSON-schema primitive type name `type`.
  const matchesType = (value: unknown, type: unknown): boolean => {
    switch (type) {
      case 'object':
        return typeof value === 'object' && value !== null && !Array.isArray(value)
      case 'array':
        return Array.isArray(value)
      case 'string':
        return typeof value === 'string'
      case 'number':
        return typeof value === 'number'
      case 'integer':
        return typeof value === 'number' && Number.isInteger(value)
      case 'boolean':
        return typeof value === 'boolean'
      case 'null':
        return value === null
      default:
        // Unknown or absent type: nothing to enforce.
        return true
    }
  }

  for (const run of runs) {
    if (!schema) {
      perRun[run.runIndex] = { passed: true }
      continue
    }

    let output = run.output
    if (typeof output === 'string') {
      try {
        output = JSON.parse(output)
      } catch {
        perRun[run.runIndex] = { passed: false, detail: 'Output is not valid JSON.' }
        allPassed = false
        continue
      }
    }

    // Check type
    const schemaType = schema.type
    let passed = matchesType(output, schemaType)

    // Check required fields (own properties only, so prototype members never count)
    if (passed && schemaType === 'object' && Array.isArray(schema.required)) {
      const candidate = output as Record<string, unknown>
      passed = (schema.required as string[]).every(
        (key) => Object.prototype.hasOwnProperty.call(candidate, key)
      )
    }

    perRun[run.runIndex] = { passed, detail: passed ? undefined : 'Output does not match schema.' }
    if (!passed) allPassed = false
  }

  return {
    expectationId: exp.id,
    type: 'output-schema',
    passed: allPassed,
    detail: allPassed ? undefined : 'Output schema check failed.',
    perRun,
  }
}
|
|
457
|
+
|
|
458
|
+
/**
 * Verifies which tools each run invoked, based on the run's trace.
 *
 * A run passes when every name in `exp.required_tools` appears among the
 * trace events of type `'tool'` and no name in `exp.forbidden_tools` does.
 * Runs without a trace (or without events) are treated as having called no
 * tools.
 *
 * @param exp - Expectation carrying the optional required/forbidden tool lists.
 * @param runs - Runs whose `trace` is inspected.
 * @returns The aggregated result plus a per-run breakdown keyed by `runIndex`;
 *          failing runs list the tools that were actually called.
 */
function evaluateToolCalled(exp: APIExpectation, runs: CISingleRunResult[]): CIExpectationResult {
  const perRun: Record<number, { passed: boolean; detail?: string }> = {}
  let everyRunOk = true

  for (const run of runs) {
    const trace = run.trace as { events?: Array<{ type: string; name: string }> } | null

    // Collect the names of all tool events, in trace order.
    const calledTools: string[] = []
    for (const event of trace?.events ?? []) {
      if (event.type === 'tool') calledTools.push(event.name)
    }

    const requiredList: string[] = exp.required_tools ?? []
    const forbiddenList: string[] = exp.forbidden_tools ?? []

    const missingRequired = requiredList.some((name: string) => !calledTools.includes(name))
    const usedForbidden = forbiddenList.some((name: string) => calledTools.includes(name))
    const runOk = !missingRequired && !usedForbidden

    perRun[run.runIndex] = {
      passed: runOk,
      detail: runOk ? undefined : `Tools called: [${calledTools.join(', ')}]`,
    }
    everyRunOk = everyRunOk && runOk
  }

  return {
    expectationId: exp.id,
    type: 'tool-called',
    passed: everyRunOk,
    detail: everyRunOk ? undefined : 'Tool call check failed.',
    perRun,
  }
}
|
|
503
|
+
|
|
504
|
+
/**
 * Checks that repeated runs produced the same output.
 *
 * Outputs are compared as strings (non-string outputs are JSON-serialised)
 * against the first run. If they are not all identical and the expectation
 * defines `similarity_threshold`, each later run is instead scored against
 * the first run with `computeStringSimilarity` and passes when its score is
 * not below the threshold. With fewer than two runs the check is skipped and
 * reported as passed.
 *
 * @param exp - Expectation, optionally carrying `similarity_threshold`.
 * @param runs - Runs whose outputs are compared.
 * @returns The aggregated result plus a per-run breakdown keyed by `runIndex`
 *          (the breakdown is omitted when the check is skipped).
 */
function evaluateDeterminism(exp: APIExpectation, runs: CISingleRunResult[]): CIExpectationResult {
  if (runs.length < 2) {
    return {
      expectationId: exp.id,
      type: 'determinism',
      passed: true,
      detail: 'Only one run — determinism check skipped.',
    }
  }

  // Normalise every output to a string so they can be compared directly.
  const serialized: string[] = runs.map((run) => {
    if (typeof run.output === 'string') return run.output
    return JSON.stringify(run.output ?? '')
  })
  const baseline = serialized[0]
  const perRun: Record<number, { passed: boolean; detail?: string }> = {}

  // First pass: strict equality against the first run.
  let identical = true
  runs.forEach((run, position) => {
    const matchesBaseline = serialized[position] === baseline
    perRun[run.runIndex] = {
      passed: matchesBaseline,
      detail: matchesBaseline ? undefined : 'Output differs from run 1.',
    }
    if (!matchesBaseline) identical = false
  })

  if (identical || exp.similarity_threshold == null) {
    return {
      expectationId: exp.id,
      type: 'determinism',
      passed: identical,
      detail: identical ? undefined : 'Outputs are not identical across runs.',
      perRun,
    }
  }

  // Second pass: outputs differ but a tolerance is configured, so re-score
  // every run after the first and overwrite its strict-equality verdict.
  const minimum = exp.similarity_threshold
  let withinTolerance = true
  for (let position = 1; position < serialized.length; position += 1) {
    const score = computeStringSimilarity(baseline, serialized[position])
    if (score < minimum) {
      withinTolerance = false
      perRun[runs[position].runIndex] = {
        passed: false,
        detail: `Similarity ${(score * 100).toFixed(1)}% < ${(minimum * 100).toFixed(1)}%`,
      }
    } else {
      perRun[runs[position].runIndex] = { passed: true }
    }
  }

  return {
    expectationId: exp.id,
    type: 'determinism',
    passed: withinTolerance,
    detail: withinTolerance ? undefined : 'Outputs are not sufficiently similar.',
    perRun,
  }
}
|
|
562
|
+
|
|
563
|
+
/**
 * Scores how alike two strings are, from 0 (nothing in common) to 1 (equal).
 *
 * The score is positional: it counts the indices at which both strings hold
 * the same character and divides by the length of the longer string. Text
 * that is merely shifted (for example by one inserted leading character)
 * therefore scores low.
 *
 * @param a - First string.
 * @param b - Second string.
 * @returns 1 for equal strings, 0 if exactly one is empty, otherwise the
 *          ratio of matching positions to the longer length.
 */
function computeStringSimilarity(a: string, b: string): number {
  if (a === b) return 1
  if (a.length === 0 || b.length === 0) return 0

  const comparedLength = Math.min(a.length, b.length)
  const fullLength = Math.max(a.length, b.length)

  let samePositions = 0
  for (let pos = 0; pos < comparedLength; pos += 1) {
    if (a[pos] === b[pos]) samePositions += 1
  }

  return samePositions / fullLength
}
|
|
575
|
+
|
|
576
|
+
/**
 * Asks an LLM to grade each run's output against `exp.judge_prompt`.
 *
 * For every run the output is stringified, passed through
 * `prepareOutputForJudge`, wrapped in `<output>` tags and sent to
 * `callProviderLLM` (provider defaults to `'openai'`). The first number found
 * in the reply is taken as the score; the run passes when that score is at
 * least `exp.judge_score_threshold` (default 7). A reply without a number, or
 * an error raised while calling or parsing, fails that run. Runs are judged
 * one after another.
 *
 * @param exp - Expectation carrying the judge prompt, provider, model and threshold.
 * @param runs - Runs whose outputs are judged.
 * @returns The aggregated result plus a per-run breakdown keyed by `runIndex`;
 *          fails immediately (without a breakdown) when no judge prompt is set.
 */
async function evaluateLLMJudge(exp: APIExpectation, runs: CISingleRunResult[]): Promise<CIExpectationResult> {
  if (!exp.judge_prompt) {
    return {
      expectationId: exp.id,
      type: 'llm-judge',
      passed: false,
      detail: 'LLM judge expectation requires judge_prompt.',
    }
  }

  const perRun: Record<number, { passed: boolean; detail?: string }> = {}
  const minimumScore = exp.judge_score_threshold ?? 7
  const provider = (exp.judge_provider as 'openai' | 'claude' | 'gemini' | 'grok' | 'kimi') || 'openai'
  let everyRunOk = true

  for (const run of runs) {
    const rawOutput = typeof run.output === 'string' ? run.output : JSON.stringify(run.output ?? '')
    const preparedOutput = prepareOutputForJudge(rawOutput, exp.judge_prompt)

    const evalPrompt =
      `${exp.judge_prompt}\n\n<output>\n${preparedOutput}\n</output>\n\n` +
      'Based on the evaluation criteria above, score this output on a scale of 0-10. Respond with only the number.'

    let verdict: { passed: boolean; detail?: string }
    try {
      const reply = await callProviderLLM(
        evalPrompt,
        { provider, model: exp.judge_model ?? undefined },
        'You are an expert test judge. Return only a number between 0 and 10.',
        4096,
        0,
      )

      // Take the first (optionally negative, optionally decimal) number in the reply.
      const firstNumber = reply.content.match(/-?\d+(?:\.\d+)?/)?.[0] ?? ''
      const score = parseFloat(firstNumber)

      verdict = Number.isNaN(score)
        ? { passed: false, detail: `Could not parse score from: "${reply.content}"` }
        : { passed: score >= minimumScore, detail: `Score: ${score}/${minimumScore}` }
    } catch (err) {
      // A provider or parsing failure fails this run but lets the others proceed.
      verdict = {
        passed: false,
        detail: `LLM judge error: ${err instanceof Error ? err.message : String(err)}`,
      }
    }

    perRun[run.runIndex] = verdict
    if (!verdict.passed) everyRunOk = false
  }

  return {
    expectationId: exp.id,
    type: 'llm-judge',
    passed: everyRunOk,
    detail: everyRunOk ? undefined : 'LLM judge check failed.',
    perRun,
  }
}
|
|
640
|
+
|
|
641
|
+
// ─── Pass/Fail Threshold ─────────────────────────────────────
|
|
642
|
+
|
|
643
|
+
/**
 * Decides the overall verdict for a test from its runs and expectations.
 *
 * The run-level requirement depends on `passThreshold`:
 * `'majority'` needs more than half of the runs to pass, `'any'` needs at
 * least one, and `'all'` (or any unrecognised value) needs every run to pass.
 * Independently of that, every expectation must have passed.
 *
 * @param passThreshold - `'all'`, `'majority'` or `'any'`.
 * @param singleRuns - Individual run results; only `passed` is read.
 * @param expectationResults - Expectation results; only `passed` is read.
 * @returns True when both the run requirement and all expectations are met.
 */
function determinePassFail(
  passThreshold: string,
  singleRuns: CISingleRunResult[],
  expectationResults: CIExpectationResult[],
): boolean {
  const total = singleRuns.length
  let passing = 0
  for (const run of singleRuns) {
    if (run.passed) passing += 1
  }

  let runsSatisfied: boolean
  if (passThreshold === 'majority') {
    runsSatisfied = passing > total / 2
  } else if (passThreshold === 'any') {
    runsSatisfied = passing > 0
  } else {
    // 'all', and the fallback for unknown thresholds.
    runsSatisfied = passing === total
  }

  // Expectations are never relaxed by the threshold.
  const noExpectationFailed = !expectationResults.some((result) => !result.passed)

  return runsSatisfied && noExpectationFailed
}
|