elasticdash-sdk 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +775 -0
- package/dist/browser-ui.d.ts +43 -0
- package/dist/browser-ui.d.ts.map +1 -0
- package/dist/browser-ui.js +246 -0
- package/dist/browser-ui.js.map +1 -0
- package/dist/capture/event.d.ts +33 -0
- package/dist/capture/event.d.ts.map +1 -0
- package/dist/capture/event.js +2 -0
- package/dist/capture/event.js.map +1 -0
- package/dist/capture/index.d.ts +4 -0
- package/dist/capture/index.d.ts.map +1 -0
- package/dist/capture/index.js +4 -0
- package/dist/capture/index.js.map +1 -0
- package/dist/capture/recorder.d.ts +24 -0
- package/dist/capture/recorder.d.ts.map +1 -0
- package/dist/capture/recorder.js +46 -0
- package/dist/capture/recorder.js.map +1 -0
- package/dist/capture/replay.d.ts +20 -0
- package/dist/capture/replay.d.ts.map +1 -0
- package/dist/capture/replay.js +47 -0
- package/dist/capture/replay.js.map +1 -0
- package/dist/ci/api-client.d.ts +38 -0
- package/dist/ci/api-client.d.ts.map +1 -0
- package/dist/ci/api-client.js +96 -0
- package/dist/ci/api-client.js.map +1 -0
- package/dist/ci/benchmark.d.ts +33 -0
- package/dist/ci/benchmark.d.ts.map +1 -0
- package/dist/ci/benchmark.js +213 -0
- package/dist/ci/benchmark.js.map +1 -0
- package/dist/ci/ed-runner.d.ts +48 -0
- package/dist/ci/ed-runner.d.ts.map +1 -0
- package/dist/ci/ed-runner.js +260 -0
- package/dist/ci/ed-runner.js.map +1 -0
- package/dist/ci/executor.d.ts +13 -0
- package/dist/ci/executor.d.ts.map +1 -0
- package/dist/ci/executor.js +542 -0
- package/dist/ci/executor.js.map +1 -0
- package/dist/ci/git-info.d.ts +17 -0
- package/dist/ci/git-info.d.ts.map +1 -0
- package/dist/ci/git-info.js +102 -0
- package/dist/ci/git-info.js.map +1 -0
- package/dist/ci/index.d.ts +6 -0
- package/dist/ci/index.d.ts.map +1 -0
- package/dist/ci/index.js +4 -0
- package/dist/ci/index.js.map +1 -0
- package/dist/ci/measurement.d.ts +9 -0
- package/dist/ci/measurement.d.ts.map +1 -0
- package/dist/ci/measurement.js +15 -0
- package/dist/ci/measurement.js.map +1 -0
- package/dist/ci/replay.d.ts +31 -0
- package/dist/ci/replay.d.ts.map +1 -0
- package/dist/ci/replay.js +96 -0
- package/dist/ci/replay.js.map +1 -0
- package/dist/ci/reporters/default.d.ts +8 -0
- package/dist/ci/reporters/default.d.ts.map +1 -0
- package/dist/ci/reporters/default.js +46 -0
- package/dist/ci/reporters/default.js.map +1 -0
- package/dist/ci/reporters/index.d.ts +8 -0
- package/dist/ci/reporters/index.d.ts.map +1 -0
- package/dist/ci/reporters/index.js +14 -0
- package/dist/ci/reporters/index.js.map +1 -0
- package/dist/ci/reporters/json.d.ts +8 -0
- package/dist/ci/reporters/json.d.ts.map +1 -0
- package/dist/ci/reporters/json.js +14 -0
- package/dist/ci/reporters/json.js.map +1 -0
- package/dist/ci/reporters/junit.d.ts +8 -0
- package/dist/ci/reporters/junit.d.ts.map +1 -0
- package/dist/ci/reporters/junit.js +48 -0
- package/dist/ci/reporters/junit.js.map +1 -0
- package/dist/ci/runner.d.ts +3 -0
- package/dist/ci/runner.d.ts.map +1 -0
- package/dist/ci/runner.js +187 -0
- package/dist/ci/runner.js.map +1 -0
- package/dist/ci/test-discovery.d.ts +5 -0
- package/dist/ci/test-discovery.d.ts.map +1 -0
- package/dist/ci/test-discovery.js +11 -0
- package/dist/ci/test-discovery.js.map +1 -0
- package/dist/ci/test-loader.d.ts +19 -0
- package/dist/ci/test-loader.d.ts.map +1 -0
- package/dist/ci/test-loader.js +149 -0
- package/dist/ci/test-loader.js.map +1 -0
- package/dist/ci/test-registry.d.ts +42 -0
- package/dist/ci/test-registry.d.ts.map +1 -0
- package/dist/ci/test-registry.js +18 -0
- package/dist/ci/test-registry.js.map +1 -0
- package/dist/ci/trace-schema.d.ts +30 -0
- package/dist/ci/trace-schema.d.ts.map +1 -0
- package/dist/ci/trace-schema.js +66 -0
- package/dist/ci/trace-schema.js.map +1 -0
- package/dist/ci/trace-writer.d.ts +16 -0
- package/dist/ci/trace-writer.d.ts.map +1 -0
- package/dist/ci/trace-writer.js +108 -0
- package/dist/ci/trace-writer.js.map +1 -0
- package/dist/ci/types.d.ts +108 -0
- package/dist/ci/types.d.ts.map +1 -0
- package/dist/ci/types.js +3 -0
- package/dist/ci/types.js.map +1 -0
- package/dist/ci/upload-client.d.ts +74 -0
- package/dist/ci/upload-client.d.ts.map +1 -0
- package/dist/ci/upload-client.js +195 -0
- package/dist/ci/upload-client.js.map +1 -0
- package/dist/cli.d.ts +3 -0
- package/dist/cli.d.ts.map +1 -0
- package/dist/cli.js +716 -0
- package/dist/cli.js.map +1 -0
- package/dist/core/agent-state.d.ts +47 -0
- package/dist/core/agent-state.d.ts.map +1 -0
- package/dist/core/agent-state.js +137 -0
- package/dist/core/agent-state.js.map +1 -0
- package/dist/core/judge-utils.d.ts +22 -0
- package/dist/core/judge-utils.d.ts.map +1 -0
- package/dist/core/judge-utils.js +211 -0
- package/dist/core/judge-utils.js.map +1 -0
- package/dist/core/registry.d.ts +28 -0
- package/dist/core/registry.d.ts.map +1 -0
- package/dist/core/registry.js +52 -0
- package/dist/core/registry.js.map +1 -0
- package/dist/dashboard-server.d.ts +65 -0
- package/dist/dashboard-server.d.ts.map +1 -0
- package/dist/dashboard-server.js +3940 -0
- package/dist/dashboard-server.js.map +1 -0
- package/dist/execution/tool-runner.d.ts +26 -0
- package/dist/execution/tool-runner.d.ts.map +1 -0
- package/dist/execution/tool-runner.js +316 -0
- package/dist/execution/tool-runner.js.map +1 -0
- package/dist/html/dashboard.html +2218 -0
- package/dist/http.d.ts +14 -0
- package/dist/http.d.ts.map +1 -0
- package/dist/http.js +13 -0
- package/dist/http.js.map +1 -0
- package/dist/index.cjs +8102 -0
- package/dist/index.d.ts +61 -0
- package/dist/index.d.ts.map +1 -0
- package/dist/index.js +67 -0
- package/dist/index.js.map +1 -0
- package/dist/interceptors/ai-interceptor.d.ts +26 -0
- package/dist/interceptors/ai-interceptor.d.ts.map +1 -0
- package/dist/interceptors/ai-interceptor.js +756 -0
- package/dist/interceptors/ai-interceptor.js.map +1 -0
- package/dist/interceptors/db-auto.d.ts +8 -0
- package/dist/interceptors/db-auto.d.ts.map +1 -0
- package/dist/interceptors/db-auto.js +217 -0
- package/dist/interceptors/db-auto.js.map +1 -0
- package/dist/interceptors/db.d.ts +23 -0
- package/dist/interceptors/db.d.ts.map +1 -0
- package/dist/interceptors/db.js +137 -0
- package/dist/interceptors/db.js.map +1 -0
- package/dist/interceptors/http.d.ts +28 -0
- package/dist/interceptors/http.d.ts.map +1 -0
- package/dist/interceptors/http.js +356 -0
- package/dist/interceptors/http.js.map +1 -0
- package/dist/interceptors/side-effects.d.ts +7 -0
- package/dist/interceptors/side-effects.d.ts.map +1 -0
- package/dist/interceptors/side-effects.js +72 -0
- package/dist/interceptors/side-effects.js.map +1 -0
- package/dist/interceptors/telemetry-push.d.ts +142 -0
- package/dist/interceptors/telemetry-push.d.ts.map +1 -0
- package/dist/interceptors/telemetry-push.js +463 -0
- package/dist/interceptors/telemetry-push.js.map +1 -0
- package/dist/interceptors/tool.d.ts +2 -0
- package/dist/interceptors/tool.d.ts.map +1 -0
- package/dist/interceptors/tool.js +274 -0
- package/dist/interceptors/tool.js.map +1 -0
- package/dist/interceptors/workflow-ai.d.ts +5 -0
- package/dist/interceptors/workflow-ai.d.ts.map +1 -0
- package/dist/interceptors/workflow-ai.js +382 -0
- package/dist/interceptors/workflow-ai.js.map +1 -0
- package/dist/internals/conditional-recorder.d.ts +21 -0
- package/dist/internals/conditional-recorder.d.ts.map +1 -0
- package/dist/internals/conditional-recorder.js +54 -0
- package/dist/internals/conditional-recorder.js.map +1 -0
- package/dist/internals/mock-resolver.d.ts +146 -0
- package/dist/internals/mock-resolver.d.ts.map +1 -0
- package/dist/internals/mock-resolver.js +427 -0
- package/dist/internals/mock-resolver.js.map +1 -0
- package/dist/matchers/index.d.ts +96 -0
- package/dist/matchers/index.d.ts.map +1 -0
- package/dist/matchers/index.js +668 -0
- package/dist/matchers/index.js.map +1 -0
- package/dist/observability.d.ts +82 -0
- package/dist/observability.d.ts.map +1 -0
- package/dist/observability.js +471 -0
- package/dist/observability.js.map +1 -0
- package/dist/portal-executor.d.ts +30 -0
- package/dist/portal-executor.d.ts.map +1 -0
- package/dist/portal-executor.js +324 -0
- package/dist/portal-executor.js.map +1 -0
- package/dist/portal-server.d.ts +3 -0
- package/dist/portal-server.d.ts.map +1 -0
- package/dist/portal-server.js +279 -0
- package/dist/portal-server.js.map +1 -0
- package/dist/proxy/llm-capture.d.ts +14 -0
- package/dist/proxy/llm-capture.d.ts.map +1 -0
- package/dist/proxy/llm-capture.js +264 -0
- package/dist/proxy/llm-capture.js.map +1 -0
- package/dist/reporter.d.ts +3 -0
- package/dist/reporter.d.ts.map +1 -0
- package/dist/reporter.js +72 -0
- package/dist/reporter.js.map +1 -0
- package/dist/runWorkflowSubprocess.d.ts +14 -0
- package/dist/runWorkflowSubprocess.d.ts.map +1 -0
- package/dist/runWorkflowSubprocess.js +66 -0
- package/dist/runWorkflowSubprocess.js.map +1 -0
- package/dist/runner.d.ts +16 -0
- package/dist/runner.d.ts.map +1 -0
- package/dist/runner.js +138 -0
- package/dist/runner.js.map +1 -0
- package/dist/socket-connector.d.ts +22 -0
- package/dist/socket-connector.d.ts.map +1 -0
- package/dist/socket-connector.js +104 -0
- package/dist/socket-connector.js.map +1 -0
- package/dist/telemetry-batcher.d.ts +56 -0
- package/dist/telemetry-batcher.d.ts.map +1 -0
- package/dist/telemetry-batcher.js +143 -0
- package/dist/telemetry-batcher.js.map +1 -0
- package/dist/test-setup.d.ts +12 -0
- package/dist/test-setup.d.ts.map +1 -0
- package/dist/test-setup.js +13 -0
- package/dist/test-setup.js.map +1 -0
- package/dist/tool-registry.d.ts +31 -0
- package/dist/tool-registry.d.ts.map +1 -0
- package/dist/tool-registry.js +73 -0
- package/dist/tool-registry.js.map +1 -0
- package/dist/tool-runner-worker.d.ts +2 -0
- package/dist/tool-runner-worker.d.ts.map +1 -0
- package/dist/tool-runner-worker.js +215 -0
- package/dist/tool-runner-worker.js.map +1 -0
- package/dist/trace-adapter/context.d.ts +72 -0
- package/dist/trace-adapter/context.d.ts.map +1 -0
- package/dist/trace-adapter/context.js +80 -0
- package/dist/trace-adapter/context.js.map +1 -0
- package/dist/tracing.d.ts +2 -0
- package/dist/tracing.d.ts.map +1 -0
- package/dist/tracing.js +59 -0
- package/dist/tracing.js.map +1 -0
- package/dist/trigger-executor.d.ts +12 -0
- package/dist/trigger-executor.d.ts.map +1 -0
- package/dist/trigger-executor.js +130 -0
- package/dist/trigger-executor.js.map +1 -0
- package/dist/types/portal.d.ts +76 -0
- package/dist/types/portal.d.ts.map +1 -0
- package/dist/types/portal.js +2 -0
- package/dist/types/portal.js.map +1 -0
- package/dist/utils/debug.d.ts +3 -0
- package/dist/utils/debug.d.ts.map +1 -0
- package/dist/utils/debug.js +8 -0
- package/dist/utils/debug.js.map +1 -0
- package/dist/utils/license-error.d.ts +23 -0
- package/dist/utils/license-error.d.ts.map +1 -0
- package/dist/utils/license-error.js +42 -0
- package/dist/utils/license-error.js.map +1 -0
- package/dist/utils/redact.d.ts +7 -0
- package/dist/utils/redact.d.ts.map +1 -0
- package/dist/utils/redact.js +26 -0
- package/dist/utils/redact.js.map +1 -0
- package/dist/workflow-runner-worker.d.ts +2 -0
- package/dist/workflow-runner-worker.d.ts.map +1 -0
- package/dist/workflow-runner-worker.js +329 -0
- package/dist/workflow-runner-worker.js.map +1 -0
- package/dist/workflow-runner.d.ts +14 -0
- package/dist/workflow-runner.d.ts.map +1 -0
- package/dist/workflow-runner.js +34 -0
- package/dist/workflow-runner.js.map +1 -0
- package/docs/agent-coding-instructions.md +138 -0
- package/docs/agent-integration-guide.md +564 -0
- package/docs/agents.md +140 -0
- package/docs/dashboard.md +394 -0
- package/docs/deno.md +69 -0
- package/docs/instrumentation.md +424 -0
- package/docs/langfuse-trace-structure.md +145 -0
- package/docs/matchers.md +173 -0
- package/docs/observability_contract.md +192 -0
- package/docs/observability_mode.md +195 -0
- package/docs/quickstart.md +621 -0
- package/docs/security-compliance.md +566 -0
- package/docs/test-writing-guidelines.md +444 -0
- package/docs/tools.md +165 -0
- package/docs/workflow-modes.md +253 -0
- package/package.json +76 -0
- package/src/browser-ui.ts +281 -0
- package/src/capture/event.ts +30 -0
- package/src/capture/index.ts +3 -0
- package/src/capture/recorder.ts +62 -0
- package/src/capture/replay.ts +55 -0
- package/src/ci/api-client.ts +136 -0
- package/src/ci/benchmark.ts +257 -0
- package/src/ci/ed-runner.ts +351 -0
- package/src/ci/executor.ts +671 -0
- package/src/ci/git-info.ts +127 -0
- package/src/ci/index.ts +5 -0
- package/src/ci/measurement.ts +25 -0
- package/src/ci/replay.ts +127 -0
- package/src/ci/reporters/default.ts +50 -0
- package/src/ci/reporters/index.ts +21 -0
- package/src/ci/reporters/json.ts +18 -0
- package/src/ci/reporters/junit.ts +61 -0
- package/src/ci/runner.ts +208 -0
- package/src/ci/test-discovery.ts +16 -0
- package/src/ci/test-loader.ts +187 -0
- package/src/ci/test-registry.ts +62 -0
- package/src/ci/trace-schema.ts +96 -0
- package/src/ci/trace-writer.ts +107 -0
- package/src/ci/types.ts +115 -0
- package/src/ci/upload-client.ts +300 -0
- package/src/cli.ts +811 -0
- package/src/core/agent-state.ts +162 -0
- package/src/core/judge-utils.ts +232 -0
- package/src/core/registry.ts +92 -0
- package/src/dashboard-server.ts +2047 -0
- package/src/execution/tool-runner.ts +352 -0
- package/src/html/dashboard.html +2218 -0
- package/src/http.ts +13 -0
- package/src/index.ts +138 -0
- package/src/interceptors/ai-interceptor.ts +798 -0
- package/src/interceptors/db-auto.ts +243 -0
- package/src/interceptors/db.ts +156 -0
- package/src/interceptors/http.ts +393 -0
- package/src/interceptors/side-effects.ts +83 -0
- package/src/interceptors/telemetry-push.ts +537 -0
- package/src/interceptors/tool.ts +287 -0
- package/src/interceptors/workflow-ai.ts +419 -0
- package/src/internals/conditional-recorder.ts +63 -0
- package/src/internals/mock-resolver.ts +492 -0
- package/src/matchers/index.ts +824 -0
- package/src/observability.ts +501 -0
- package/src/portal-executor.ts +355 -0
- package/src/portal-server.ts +304 -0
- package/src/proxy/llm-capture.ts +301 -0
- package/src/reporter.ts +81 -0
- package/src/runWorkflowSubprocess.ts +74 -0
- package/src/runner.ts +178 -0
- package/src/socket-connector.ts +117 -0
- package/src/telemetry-batcher.ts +191 -0
- package/src/test-setup.ts +16 -0
- package/src/tool-registry.ts +94 -0
- package/src/tool-runner-worker.ts +244 -0
- package/src/trace-adapter/context.ts +156 -0
- package/src/tracing.ts +62 -0
- package/src/trigger-executor.ts +171 -0
- package/src/types/agent.d.ts +63 -0
- package/src/types/expect.d.ts +81 -0
- package/src/types/modules.d.ts +2 -0
- package/src/types/portal.ts +69 -0
- package/src/utils/debug.ts +8 -0
- package/src/utils/license-error.ts +43 -0
- package/src/utils/redact.ts +25 -0
- package/src/workflow-runner-worker.ts +386 -0
- package/src/workflow-runner.ts +58 -0
|
@@ -0,0 +1,351 @@
|
|
|
1
|
+
import { randomUUID } from 'node:crypto'
|
|
2
|
+
import { loadTests } from './test-loader.js'
|
|
3
|
+
import { createReplayContext, installReplay, uninstallReplay, ReplayMissError } from './replay.js'
|
|
4
|
+
import { collectMeasurement } from './measurement.js'
|
|
5
|
+
import { SDK_VERSION } from './trace-schema.js'
|
|
6
|
+
import { compareBenchmarks } from './benchmark.js'
|
|
7
|
+
import { fetchEvaluatorConfig } from './api-client.js'
|
|
8
|
+
import type { EvaluatorConfig } from './api-client.js'
|
|
9
|
+
import type { TestMeasurement } from './measurement.js'
|
|
10
|
+
import type { BenchmarkResult } from './benchmark.js'
|
|
11
|
+
import type { ValidatedTest } from './test-loader.js'
|
|
12
|
+
|
|
13
|
+
// SDK_VERSION imported from trace-schema.ts (CJS-safe)
|
|
14
|
+
|
|
15
|
+
// ─── Types ──────────────────────────────────────────────────
|
|
16
|
+
|
|
17
|
+
/**
 * Options accepted by runEdTests.
 *
 * - cwd: directory handed to loadTests; runEdTests falls back to process.cwd().
 * - filter: simple glob (`*` = any run of characters, `?` = one character)
 *   matched against the whole test name.
 * - failFast: stop after the first test whose aggregated status is 'fail'.
 * - noUpload / reporter: not read by runEdTests itself — presumably consumed
 *   by the caller (TODO confirm).
 * - runs: see the field comment below.
 */
export interface EdTestRunOptions {
|
|
18
|
+
cwd?: string
|
|
19
|
+
filter?: string
|
|
20
|
+
failFast?: boolean
|
|
21
|
+
noUpload?: boolean
|
|
22
|
+
reporter?: 'default' | 'json' | 'junit'
|
|
23
|
+
/** Number of times to run each test. The test fails if any run fails. Values below 1 are raised to 1. Defaults to 1. */
|
|
24
|
+
runs?: number
|
|
25
|
+
}
|
|
26
|
+
|
|
27
|
+
/**
 * Outcome of one attempt of a test. runEdTests records one entry per attempt
 * and attaches them to EdTestResult.singleRuns.
 *
 * startedAt / finishedAt are ISO-8601 strings (Date#toISOString) captured
 * immediately before and after the attempt; durationMs is the elapsed time
 * reported by the attempt itself.
 */
export interface EdSingleRunResult {
|
|
28
|
+
status: 'pass' | 'fail'
|
|
29
|
+
failureReason?: string
|
|
30
|
+
measurement?: TestMeasurement
|
|
31
|
+
benchmarkResult?: BenchmarkResult
|
|
32
|
+
output?: unknown
|
|
33
|
+
durationMs: number
|
|
34
|
+
startedAt: string
|
|
35
|
+
finishedAt: string
|
|
36
|
+
}
|
|
37
|
+
|
|
38
|
+
/**
 * Result reported for one test (aggregated over all of its attempts).
 *
 * - testId / testName: both are set to the test's name by the runner.
 * - traceRef / target: the trace reference and target step the test points at.
 * - input: the input handed to the test's run function.
 * - output: the output recorded on the target step of the trace.
 * - measurement / benchmarkResult: present only when a measurement was
 *   obtained and compared against the test's benchmarks.
 */
export interface EdTestResult {
|
|
39
|
+
testId: string
|
|
40
|
+
testName: string
|
|
41
|
+
status: 'pass' | 'fail'
|
|
42
|
+
failureReason?: string
|
|
43
|
+
measurement?: TestMeasurement
|
|
44
|
+
benchmarkResult?: BenchmarkResult
|
|
45
|
+
traceRef?: string
|
|
46
|
+
target?: { type: string; step_id: string }
|
|
47
|
+
input?: unknown
|
|
48
|
+
output?: unknown
|
|
49
|
+
durationMs: number
|
|
50
|
+
/** Every individual attempt, one entry per run (a single entry when runs is 1; empty for validation errors) */
|
|
51
|
+
singleRuns?: EdSingleRunResult[]
|
|
52
|
+
}
|
|
53
|
+
|
|
54
|
+
/**
 * Summary returned by runEdTests for a whole invocation.
 *
 * - runId: random UUID generated at the start of the run.
 * - startedAt / finishedAt: ISO-8601 timestamps (Date#toISOString).
 * - results: one entry per executed test, preceded by one failed entry for
 *   every validation error reported by the test loader.
 * - sdkVersion: the SDK_VERSION constant from trace-schema.
 */
export interface EdTestRunResult {
|
|
55
|
+
runId: string
|
|
56
|
+
startedAt: string
|
|
57
|
+
finishedAt: string
|
|
58
|
+
results: EdTestResult[]
|
|
59
|
+
sdkVersion: string
|
|
60
|
+
}
|
|
61
|
+
|
|
62
|
+
// ─── Runner ─────────────────────────────────────────────────
|
|
63
|
+
|
|
64
|
+
/**
 * Loads the tests found under `cwd`, runs each selected test `runs` times and
 * returns the aggregated results.
 *
 * Behaviour:
 *  - Every validation error reported by the loader becomes a failed result.
 *  - `options.filter` is a simple glob matched against the test name.
 *  - `options.runs` is sanitised to a positive integer: non-finite values
 *    (NaN, Infinity) fall back to 1, fractions are floored, values below 1
 *    become 1. Without this a NaN run count skipped the attempt loop entirely
 *    and the test was reported as passing without ever being executed.
 *  - A test's aggregated status is 'fail' as soon as ANY attempt failed.
 *  - With `options.failFast`, no further tests are run after the first test
 *    whose aggregated status is 'fail'.
 *  - When some selected test uses `llm_judge` without an explicit
 *    provider/model, the evaluator config is fetched once (best effort) from
 *    the server named by ELASTICDASH_API_URL / ELASTICDASH_SERVER using
 *    ELASTICDASH_API_KEY, and reused for the whole run.
 *
 * @param options - Optional run settings; see EdTestRunOptions.
 * @returns The run id, start/finish timestamps, per-test results and SDK version.
 */
export async function runEdTests(options?: EdTestRunOptions): Promise<EdTestRunResult> {
  const cwd = options?.cwd ?? process.cwd()
  const runId = randomUUID()
  const startedAt = new Date().toISOString()
  const results: EdTestResult[] = []

  const { tests, errors } = await loadTests({ cwd })

  // Report validation errors as failed tests
  for (const err of errors) {
    results.push({
      testId: err.testName ?? 'unknown',
      testName: err.testName ?? 'unknown',
      status: 'fail',
      failureReason: `validation error: ${err.message}`,
      durationMs: 0,
      singleRuns: [],
    })
  }

  // Filter tests if pattern provided
  let testsToRun: ValidatedTest[] = tests
  if (options?.filter) {
    const pattern = options.filter
    testsToRun = tests.filter(t => matchGlob(t.name, pattern))
  }

  // Math.max(1, NaN) is NaN, which would make the attempt loop a no-op, and
  // Infinity would never terminate — so non-finite counts fall back to 1.
  const requestedRuns = options?.runs ?? 1
  const maxRuns = Number.isFinite(requestedRuns) ? Math.max(1, Math.floor(requestedRuns)) : 1

  // Fetch evaluator config from backend if any test uses llm_judge without
  // explicit provider/model. Cached for the entire run to avoid repeated calls.
  let evaluatorConfig: EvaluatorConfig | null = null
  const needsEvaluatorConfig = testsToRun.some(
    t => t.benchmarks.llm_judge && (!t.benchmarks.llm_judge.judge_provider || !t.benchmarks.llm_judge.judge_model)
  )
  if (needsEvaluatorConfig) {
    const serverUrl = process.env.ELASTICDASH_API_URL ?? process.env.ELASTICDASH_SERVER ?? ''
    const apiKey = process.env.ELASTICDASH_API_KEY ?? ''
    if (serverUrl && apiKey) {
      try {
        evaluatorConfig = await fetchEvaluatorConfig(serverUrl, apiKey)
        console.log(`[ed-test] Evaluator config: provider=${evaluatorConfig.provider}, model=${evaluatorConfig.model}, hasKey=${!!evaluatorConfig.apiKey}`)
      } catch (err) {
        // Best effort: the run continues without a fetched evaluator config.
        console.warn(`[ed-test] Could not fetch evaluator config: ${err instanceof Error ? err.message : String(err)}`)
      }
    }
  }

  for (const test of testsToRun) {
    const allRuns: EdSingleRunResult[] = []
    let bestResult: EdTestResult | undefined

    for (let attempt = 1; attempt <= maxRuns; attempt++) {
      const runStartedAt = new Date().toISOString()
      const result = await runSingleTest(test, evaluatorConfig)
      const runFinishedAt = new Date().toISOString()

      if (attempt > 1) {
        console.log(` [ed-test] ${test.name}: run ${attempt}/${maxRuns} — ${result.status}`)
      }

      // Collect every run for upload
      allRuns.push({
        status: result.status,
        failureReason: result.failureReason,
        measurement: result.measurement,
        benchmarkResult: result.benchmarkResult,
        output: result.output,
        durationMs: result.durationMs,
        startedAt: runStartedAt,
        finishedAt: runFinishedAt,
      })

      // Representative result: the first attempt, replaced by every later
      // passing attempt (so the most recent pass wins; if nothing passes the
      // first failure is kept).
      // NOTE(review): when the aggregate is 'fail' but some attempt passed,
      // measurement/benchmarkResult below come from a passing attempt —
      // confirm this is what reporters expect.
      if (!bestResult || result.status === 'pass') {
        bestResult = result
      }
    }

    // maxRuns is always >= 1, so at least one attempt ran; the guard only
    // exists so a test can never be reported without having been executed.
    if (!bestResult) {
      continue
    }

    // Aggregate: fail if ANY run failed; the reason comes from the first failed run
    const failedRun = allRuns.find(r => r.status === 'fail')
    const anyFailed = failedRun !== undefined

    results.push({
      ...bestResult,
      status: anyFailed ? 'fail' : 'pass',
      failureReason: anyFailed ? (failedRun?.failureReason || bestResult.failureReason) : undefined,
      singleRuns: allRuns,
    })

    if (options?.failFast && anyFailed) {
      break
    }
  }

  const finishedAt = new Date().toISOString()

  return {
    runId,
    startedAt,
    finishedAt,
    results,
    sdkVersion: SDK_VERSION,
  }
}
|
|
170
|
+
|
|
171
|
+
// ─── Single test execution ──────────────────────────────────
|
|
172
|
+
|
|
173
|
+
/**
 * Produces the concrete input for a test.
 *
 * A function is treated as a (possibly async) factory: it is invoked with no
 * arguments and its awaited result is returned. Any other value is returned
 * as-is.
 */
async function resolveCustomInput(input: unknown | (() => Promise<unknown> | unknown)): Promise<unknown> {
  if (typeof input !== 'function') {
    return input
  }
  const factory = input as () => Promise<unknown> | unknown
  return await factory()
}
|
|
176
|
+
|
|
177
|
+
/**
 * Runs one test against its recorded trace and grades the target step.
 *
 * Steps:
 *  1. Resolve the input: a custom `test.input` (value or factory function),
 *     otherwise the input recorded on the target step of the trace.
 *  2. Install a replay context for the trace and race `test.run(input)`
 *     against a timeout of `test.timeout_ms` (default 60000 ms). The timer is
 *     cleared as soon as the race settles so it cannot keep the process alive
 *     after the test has finished.
 *  3. Use the measurement captured by the replay, or fall back to the values
 *     stored on the target step, and compare it with the test's benchmarks.
 *
 * If run() throws (including timeouts and replay misses) but the trace holds a
 * measurement for the target step, the test is still graded from the trace.
 * A throwing input factory, a missing run function, a replay miss, a timeout
 * and any other run() error are reported as 'fail' results rather than thrown.
 * The replay context is always uninstalled before returning.
 *
 * @param test - The validated test to execute.
 * @param evaluatorConfig - Optional evaluator settings forwarded to compareBenchmarks.
 * @returns The result of this single attempt.
 */
async function runSingleTest(test: ValidatedTest, evaluatorConfig?: EvaluatorConfig | null): Promise<EdTestResult> {
  const startMs = Date.now()
  const targetStep = test.traceData.steps.find(s => s.step_id === test.target.step_id)
  const describeError = (err: unknown): string => (err instanceof Error ? err.message : String(err))

  // A throwing input factory must fail this test only, not abort the whole run.
  let resolvedInput: unknown
  let inputError: string | undefined
  try {
    resolvedInput = test.input !== undefined
      ? await resolveCustomInput(test.input)
      : targetStep?.input
  } catch (err) {
    inputError = `input error: ${describeError(err)}`
  }

  // Fields shared by every outcome of this attempt.
  const base = {
    testId: test.name,
    testName: test.name,
    traceRef: test.trace,
    target: { type: test.target.type, step_id: test.target.step_id },
    input: resolvedInput,
    output: targetStep?.output,
  }

  // Builds a failed result carrying the given reason.
  const failed = (failureReason: string): EdTestResult => ({
    ...base,
    status: 'fail',
    failureReason,
    durationMs: Date.now() - startMs,
  })

  // Compares a measurement against the benchmarks (async to support llm_judge).
  const graded = async (measurement: TestMeasurement): Promise<EdTestResult> => {
    const benchmarkResult = await compareBenchmarks(measurement, test.benchmarks, targetStep?.output, evaluatorConfig)
    return {
      ...base,
      status: benchmarkResult.passed ? 'pass' : 'fail',
      failureReason: benchmarkResult.failure_reason,
      measurement,
      benchmarkResult,
      durationMs: Date.now() - startMs,
    }
  }

  if (inputError !== undefined) {
    return failed(inputError)
  }

  // Check run function exists
  if (!test.run || typeof test.run !== 'function') {
    return failed('test has no run function')
  }

  const replayCtx = createReplayContext(test.traceData, test.target.step_id)
  installReplay(replayCtx)

  try {
    const timeoutMs = test.timeout_ms ?? 60000
    let timer: ReturnType<typeof setTimeout> | undefined

    try {
      await Promise.race([
        test.run(resolvedInput),
        new Promise<never>((_, reject) => {
          timer = setTimeout(() => reject(new TimeoutError(timeoutMs)), timeoutMs)
        }),
      ])
    } finally {
      // Without this the pending timer keeps the event loop alive for up to
      // timeoutMs after the test has already completed.
      clearTimeout(timer)
    }

    // Collect measurement from the target step.
    // If replay captured it (in-process wrapTool/wrapAI), use that.
    // Otherwise fall back to extracting the measurement directly from the
    // trace data. This handles HTTP-mode workflows where wrapTool/wrapAI
    // calls happen on a remote server and the replay ALS is not accessed.
    // The measurement values are identical either way — both come from the
    // recorded trace, not from live execution.
    let measurement = collectMeasurement(replayCtx)
    if (!measurement) {
      measurement = extractMeasurementFromTrace(test) ?? null
      if (measurement) {
        console.log(` [ed-test] ${test.name}: extracted measurement from trace (HTTP-mode fallback)`)
      }
    }
    if (!measurement) {
      return failed(`target step "${test.target.step_id}" was not replayed during execution`)
    }

    // `await` keeps a rejection inside this try and delays the outer
    // `finally` until grading has finished.
    return await graded(measurement)
  } catch (err) {
    // For HTTP-mode workflows, run() may fail (e.g. server not running) but the
    // measurement can still be extracted from the trace. The benchmarks compare
    // against recorded data, not live performance, so this is valid.
    const traceMeasurement = extractMeasurementFromTrace(test)
    if (traceMeasurement) {
      console.log(` [ed-test] ${test.name}: run() failed (${err instanceof Error ? err.message : String(err)}), using trace measurement fallback`)
      return await graded(traceMeasurement)
    }

    if (err instanceof ReplayMissError) {
      return failed(`replay miss: ${err.callType}::${err.callName}`)
    }
    if (err instanceof TimeoutError) {
      return failed(`test timed out after ${err.timeoutMs}ms`)
    }
    return failed(`execution error: ${describeError(err)}`)
  } finally {
    uninstallReplay()
  }
}
|
|
309
|
+
|
|
310
|
+
// ─── Trace-direct measurement extraction ────────────────────
|
|
311
|
+
|
|
312
|
+
/**
 * Reads the measurement of the test's target step straight out of the
 * recorded trace.
 *
 * Serves as the fallback for cases where the replay mechanism did not capture
 * the step (e.g. HTTP-mode workflows where wrapTool/wrapAI run on a remote
 * server).
 *
 * @returns `duration_ms` of the first step whose `step_id` equals the target
 *          step id, plus `tokens_input` / `tokens_output` / `tokens_total`
 *          when the step carries token counts; `undefined` when the trace
 *          has no such step.
 */
function extractMeasurementFromTrace(test: ValidatedTest): TestMeasurement | undefined {
  const wantedStepId = test.target.step_id

  for (const candidate of test.traceData.steps) {
    if (candidate.step_id !== wantedStepId) {
      continue
    }

    const measurement: TestMeasurement = { duration_ms: candidate.duration_ms }
    const tokenCounts = candidate.tokens
    if (tokenCounts) {
      measurement.tokens_input = tokenCounts.input
      measurement.tokens_output = tokenCounts.output
      measurement.tokens_total = tokenCounts.total
    }
    return measurement
  }

  return undefined
}
|
|
335
|
+
|
|
336
|
+
// ─── Helpers ────────────────────────────────────────────────
|
|
337
|
+
|
|
338
|
+
/**
 * Error raised when a test's run function does not settle within its time
 * budget. `timeoutMs` holds the budget that was exceeded, in milliseconds.
 */
class TimeoutError extends Error {
  public timeoutMs: number

  constructor(timeoutMs: number) {
    super(`Test timed out after ${timeoutMs}ms`)
    this.timeoutMs = timeoutMs
    this.name = 'TimeoutError'
  }
}
|
|
344
|
+
|
|
345
|
+
/**
 * Checks whether `name` matches a simple glob `pattern` in full.
 *
 * `*` stands for any run of characters (including none) and `?` for exactly
 * one character; every other character, regex metacharacters included, is
 * matched literally.
 */
function matchGlob(name: string, pattern: string): boolean {
  // Regex metacharacters that have to be escaped to be matched literally.
  const mustEscape = new Set('.+^${}()|[]\\')

  let body = ''
  for (const ch of pattern) {
    if (ch === '*') {
      body += '.*'
    } else if (ch === '?') {
      body += '.'
    } else if (mustEscape.has(ch)) {
      body += '\\' + ch
    } else {
      body += ch
    }
  }

  return new RegExp(`^${body}$`).test(name)
}
|