waypoi 0.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.github/instructions/ui.instructions.md +42 -0
- package/.github/workflows/ci.yml +35 -0
- package/.github/workflows/publish.yml +71 -0
- package/.github/workflows/release.yml +48 -0
- package/.playwright-mcp/console-2026-04-04T01-41-10-746Z.log +2 -0
- package/.playwright-mcp/console-2026-04-04T01-41-28-799Z.log +3 -0
- package/.playwright-mcp/console-2026-04-05T02-26-51-909Z.log +76 -0
- package/.playwright-mcp/page-2026-04-04T01-41-10-816Z.yml +1 -0
- package/.playwright-mcp/page-2026-04-04T01-41-29-141Z.yml +77 -0
- package/.playwright-mcp/page-2026-04-04T01-41-42-633Z.yml +190 -0
- package/.playwright-mcp/page-2026-04-04T01-42-03-929Z.yml +262 -0
- package/.playwright-mcp/page-2026-04-04T02-12-54-813Z.yml +6 -0
- package/.playwright-mcp/page-2026-04-04T02-14-58-600Z.yml +190 -0
- package/.playwright-mcp/page-2026-04-04T02-15-03-923Z.yml +190 -0
- package/.playwright-mcp/page-2026-04-04T02-15-07-426Z.yml +190 -0
- package/.playwright-mcp/page-2026-04-04T02-15-25-729Z.yml +262 -0
- package/.playwright-mcp/page-2026-04-04T02-16-22-984Z.yml +262 -0
- package/.playwright-mcp/page-2026-04-04T02-17-00-599Z.yml +190 -0
- package/.playwright-mcp/page-2026-04-04T02-17-50-874Z.yml +190 -0
- package/.playwright-mcp/page-2026-04-05T02-26-55-570Z.yml +6 -0
- package/AGENTS.md +48 -0
- package/CHANGELOG.md +131 -0
- package/README.md +552 -0
- package/assets/agent-mode.png +0 -0
- package/assets/categorize.png +0 -0
- package/assets/dashboard.png +0 -0
- package/assets/endpoint-proxy.png +0 -0
- package/assets/icon.png +0 -0
- package/assets/mcp-generate-image.png +0 -0
- package/assets/mcp-understand-image.png +0 -0
- package/assets/peek-token-flow.png +0 -0
- package/assets/playground.png +0 -0
- package/assets/sankey.png +0 -0
- package/cli/index.ts +2805 -0
- package/cli/legacyRewrite.ts +108 -0
- package/cli/modelRef.ts +24 -0
- package/dist/cli/index.js +2536 -0
- package/dist/cli/legacyRewrite.js +92 -0
- package/dist/cli/modelRef.js +20 -0
- package/dist/src/benchmark/artifacts.js +131 -0
- package/dist/src/benchmark/capabilityClassifier.js +81 -0
- package/dist/src/benchmark/capabilityStore.js +144 -0
- package/dist/src/benchmark/config.js +238 -0
- package/dist/src/benchmark/gates.js +118 -0
- package/dist/src/benchmark/jobs.js +252 -0
- package/dist/src/benchmark/runner.js +1847 -0
- package/dist/src/benchmark/schema.js +353 -0
- package/dist/src/benchmark/suites.js +314 -0
- package/dist/src/benchmark/tinyQaDataset.js +422 -0
- package/dist/src/benchmark/types.js +25 -0
- package/dist/src/config.js +47 -0
- package/dist/src/index.js +178 -0
- package/dist/src/mcp/client.js +215 -0
- package/dist/src/mcp/discovery.js +226 -0
- package/dist/src/mcp/policy.js +65 -0
- package/dist/src/mcp/registry.js +129 -0
- package/dist/src/mcp/service.js +460 -0
- package/dist/src/middleware/auth.js +179 -0
- package/dist/src/middleware/requestCapture.js +192 -0
- package/dist/src/middleware/requestStats.js +118 -0
- package/dist/src/pools/builder.js +132 -0
- package/dist/src/pools/repository.js +69 -0
- package/dist/src/pools/scheduler.js +360 -0
- package/dist/src/pools/types.js +2 -0
- package/dist/src/protocols/adapters/dashscope.js +267 -0
- package/dist/src/protocols/adapters/inferenceV2.js +346 -0
- package/dist/src/protocols/adapters/openai.js +27 -0
- package/dist/src/protocols/registry.js +99 -0
- package/dist/src/protocols/types.js +2 -0
- package/dist/src/providers/health.js +153 -0
- package/dist/src/providers/importer.js +289 -0
- package/dist/src/providers/modelRegistry.js +313 -0
- package/dist/src/providers/repository.js +361 -0
- package/dist/src/providers/types.js +2 -0
- package/dist/src/routes/admin.js +531 -0
- package/dist/src/routes/audio.js +295 -0
- package/dist/src/routes/chat.js +240 -0
- package/dist/src/routes/embeddings.js +157 -0
- package/dist/src/routes/images.js +288 -0
- package/dist/src/routes/mcp.js +256 -0
- package/dist/src/routes/mcpService.js +100 -0
- package/dist/src/routes/models.js +48 -0
- package/dist/src/routes/responses.js +711 -0
- package/dist/src/routes/sessions.js +450 -0
- package/dist/src/routes/stats.js +270 -0
- package/dist/src/routes/ui.js +97 -0
- package/dist/src/routes/videos.js +107 -0
- package/dist/src/routing/router.js +338 -0
- package/dist/src/services/imageGeneration.js +280 -0
- package/dist/src/services/imageUnderstanding.js +352 -0
- package/dist/src/services/videoGeneration.js +79 -0
- package/dist/src/storage/captureRepository.js +1591 -0
- package/dist/src/storage/files.js +157 -0
- package/dist/src/storage/imageCache.js +346 -0
- package/dist/src/storage/repositories.js +388 -0
- package/dist/src/storage/sessionRepository.js +370 -0
- package/dist/src/storage/statsRepository.js +204 -0
- package/dist/src/transport/httpClient.js +126 -0
- package/dist/src/types.js +2 -0
- package/dist/src/utils/messageMedia.js +285 -0
- package/dist/src/utils/modelCapabilities.js +108 -0
- package/dist/src/utils/modelDiscovery.js +170 -0
- package/dist/src/version.js +5 -0
- package/dist/src/workers/captureRetention.js +25 -0
- package/dist/src/workers/configWatcher.js +91 -0
- package/dist/src/workers/healthChecker.js +21 -0
- package/dist/src/workers/statsRotation.js +41 -0
- package/docs/LLM/output_schema.md +312 -0
- package/docs/benchmark.md +208 -0
- package/docs/mcp-guidelines.md +125 -0
- package/docs/mcp-service.md +178 -0
- package/docs/opencode.md +86 -0
- package/docs/providers.md +79 -0
- package/examples/benchmark.config.yaml +28 -0
- package/examples/providers/alibaba-dashscope.yaml +88 -0
- package/examples/providers/alibaba-llm.yaml +64 -0
- package/examples/providers/alibaba-registry.yaml +7 -0
- package/examples/providers/inference-v2-ray.yaml +29 -0
- package/examples/scenarios/assets/omni-call-sample.wav +0 -0
- package/examples/scenarios/custom.jsonl +5 -0
- package/examples/scenarios/custom.yaml +40 -0
- package/model-form-v2.png +0 -0
- package/package.json +66 -0
- package/provider-form-v2.png +0 -0
- package/provider-form.png +0 -0
- package/scripts/manual-test.sh +11 -0
- package/scripts/version-from-git.js +23 -0
- package/src/benchmark/artifacts.ts +149 -0
- package/src/benchmark/capabilityClassifier.ts +99 -0
- package/src/benchmark/capabilityStore.ts +174 -0
- package/src/benchmark/config.ts +337 -0
- package/src/benchmark/gates.ts +164 -0
- package/src/benchmark/jobs.ts +312 -0
- package/src/benchmark/runner.ts +2519 -0
- package/src/benchmark/schema.ts +443 -0
- package/src/benchmark/suites.ts +323 -0
- package/src/benchmark/tinyQaDataset.ts +428 -0
- package/src/benchmark/types.ts +442 -0
- package/src/config.ts +44 -0
- package/src/index.ts +195 -0
- package/src/mcp/client.ts +305 -0
- package/src/mcp/discovery.ts +266 -0
- package/src/mcp/policy.ts +105 -0
- package/src/mcp/registry.ts +164 -0
- package/src/mcp/service.ts +611 -0
- package/src/middleware/auth.ts +251 -0
- package/src/middleware/requestCapture.ts +245 -0
- package/src/middleware/requestStats.ts +163 -0
- package/src/pools/builder.ts +159 -0
- package/src/pools/repository.ts +71 -0
- package/src/pools/scheduler.ts +425 -0
- package/src/pools/types.ts +117 -0
- package/src/protocols/adapters/dashscope.ts +335 -0
- package/src/protocols/adapters/inferenceV2.ts +428 -0
- package/src/protocols/adapters/openai.ts +32 -0
- package/src/protocols/registry.ts +117 -0
- package/src/protocols/types.ts +81 -0
- package/src/providers/health.ts +207 -0
- package/src/providers/importer.ts +402 -0
- package/src/providers/modelRegistry.ts +415 -0
- package/src/providers/repository.ts +439 -0
- package/src/providers/types.ts +113 -0
- package/src/routes/admin.ts +666 -0
- package/src/routes/audio.ts +372 -0
- package/src/routes/chat.ts +301 -0
- package/src/routes/embeddings.ts +197 -0
- package/src/routes/images.ts +356 -0
- package/src/routes/mcp.ts +320 -0
- package/src/routes/mcpService.ts +114 -0
- package/src/routes/models.ts +50 -0
- package/src/routes/responses.ts +872 -0
- package/src/routes/sessions.ts +558 -0
- package/src/routes/stats.ts +312 -0
- package/src/routes/ui.ts +96 -0
- package/src/routes/videos.ts +132 -0
- package/src/routing/router.ts +501 -0
- package/src/services/imageGeneration.ts +396 -0
- package/src/services/imageUnderstanding.ts +449 -0
- package/src/services/videoGeneration.ts +127 -0
- package/src/storage/captureRepository.ts +1835 -0
- package/src/storage/files.ts +178 -0
- package/src/storage/imageCache.ts +405 -0
- package/src/storage/repositories.ts +494 -0
- package/src/storage/sessionRepository.ts +419 -0
- package/src/storage/statsRepository.ts +238 -0
- package/src/transport/httpClient.ts +145 -0
- package/src/types.ts +322 -0
- package/src/utils/messageMedia.ts +293 -0
- package/src/utils/modelCapabilities.ts +161 -0
- package/src/utils/modelDiscovery.ts +203 -0
- package/src/workers/captureRetention.ts +25 -0
- package/src/workers/configWatcher.ts +115 -0
- package/src/workers/healthChecker.ts +22 -0
- package/src/workers/statsRotation.ts +49 -0
- package/tests/benchmarkAdminRoutes.test.ts +82 -0
- package/tests/benchmarkBasics.test.ts +116 -0
- package/tests/captureAdminRoutes.test.ts +420 -0
- package/tests/captureRepository.test.ts +797 -0
- package/tests/cliLegacyRewrite.test.ts +45 -0
- package/tests/imageGeneration.service.test.ts +107 -0
- package/tests/imageUnderstanding.service.test.ts +123 -0
- package/tests/mcpPolicy.test.ts +105 -0
- package/tests/mcpService.test.ts +1245 -0
- package/tests/modelRef.test.ts +23 -0
- package/tests/modelsRoutes.test.ts +154 -0
- package/tests/sessionMediaCache.test.ts +167 -0
- package/tests/statsRoutes.test.ts +323 -0
- package/tsconfig.json +15 -0
- package/ui/index.html +16 -0
- package/ui/package-lock.json +8521 -0
- package/ui/package.json +52 -0
- package/ui/postcss.config.js +6 -0
- package/ui/public/assets/apple-touch-icon.png +0 -0
- package/ui/public/assets/favicon-16.png +0 -0
- package/ui/public/assets/favicon-32.png +0 -0
- package/ui/public/assets/icon-192.png +0 -0
- package/ui/public/assets/icon-512.png +0 -0
- package/ui/src/App.tsx +27 -0
- package/ui/src/api/client.ts +1503 -0
- package/ui/src/components/EndpointUsageGuide.tsx +361 -0
- package/ui/src/components/Layout.tsx +124 -0
- package/ui/src/components/MessageContent.tsx +365 -0
- package/ui/src/components/ToolCallMessage.tsx +179 -0
- package/ui/src/components/ToolPicker.tsx +442 -0
- package/ui/src/components/messageContentParser.test.ts +41 -0
- package/ui/src/components/messageContentParser.ts +73 -0
- package/ui/src/components/thinkingPreview.test.ts +27 -0
- package/ui/src/components/thinkingPreview.ts +15 -0
- package/ui/src/components/toMermaidSankey.test.ts +78 -0
- package/ui/src/components/toMermaidSankey.ts +56 -0
- package/ui/src/components/ui/button.tsx +58 -0
- package/ui/src/components/ui/input.tsx +21 -0
- package/ui/src/components/ui/textarea.tsx +21 -0
- package/ui/src/lib/utils.ts +6 -0
- package/ui/src/main.tsx +9 -0
- package/ui/src/pages/AgentPlayground.tsx +2010 -0
- package/ui/src/pages/Benchmark.tsx +988 -0
- package/ui/src/pages/Dashboard.tsx +581 -0
- package/ui/src/pages/Peek.tsx +962 -0
- package/ui/src/pages/Settings.tsx +2013 -0
- package/ui/src/pages/agentPlaygroundPayload.test.ts +109 -0
- package/ui/src/pages/agentPlaygroundPayload.ts +97 -0
- package/ui/src/pages/agentThinkingContent.test.ts +50 -0
- package/ui/src/pages/agentThinkingContent.ts +57 -0
- package/ui/src/pages/dashboardTokenUsage.test.ts +66 -0
- package/ui/src/pages/dashboardTokenUsage.ts +36 -0
- package/ui/src/pages/imageUpload.test.ts +39 -0
- package/ui/src/pages/imageUpload.ts +71 -0
- package/ui/src/pages/peekFilters.test.ts +29 -0
- package/ui/src/pages/peekFilters.ts +13 -0
- package/ui/src/pages/peekMedia.test.ts +58 -0
- package/ui/src/pages/peekMedia.ts +148 -0
- package/ui/src/pages/sessionAutoTitle.test.ts +128 -0
- package/ui/src/pages/sessionAutoTitle.ts +106 -0
- package/ui/src/stores/settings.ts +58 -0
- package/ui/src/styles/globals.css +223 -0
- package/ui/src/vite-env.d.ts +8 -0
- package/ui/tailwind.config.js +106 -0
- package/ui/tsconfig.json +32 -0
- package/ui/vite.config.ts +37 -0
|
@@ -0,0 +1,2519 @@
|
|
|
1
|
+
import { promises as fs } from "fs";
|
|
2
|
+
import path from "path";
|
|
3
|
+
import { randomUUID } from "crypto";
|
|
4
|
+
import YAML from "yaml";
|
|
5
|
+
import { routeRequest } from "../routing/router";
|
|
6
|
+
import { pickBestModelByCapabilities } from "../storage/repositories";
|
|
7
|
+
import { StoragePaths } from "../storage/files";
|
|
8
|
+
import {
|
|
9
|
+
discoverAllTools,
|
|
10
|
+
disconnectAllServers,
|
|
11
|
+
executeTool,
|
|
12
|
+
getCachedTools,
|
|
13
|
+
summarizeMcpError,
|
|
14
|
+
} from "../mcp/discovery";
|
|
15
|
+
import { writeBenchmarkArtifacts } from "./artifacts";
|
|
16
|
+
import { classifyCapabilityStatus } from "./capabilityClassifier";
|
|
17
|
+
import { computeConfigFingerprint, writeCapabilitySnapshots } from "./capabilityStore";
|
|
18
|
+
import { resolveBenchmarkConfig } from "./config";
|
|
19
|
+
import { evaluateGates } from "./gates";
|
|
20
|
+
import { validateScenarioCollection } from "./schema";
|
|
21
|
+
import { builtInSuite, listSuiteExamples } from "./suites";
|
|
22
|
+
import { listProviders } from "../providers/repository";
|
|
23
|
+
import { ProviderModelRecord } from "../providers/types";
|
|
24
|
+
import {
|
|
25
|
+
BENCHMARK_CAPABILITY_KEYS,
|
|
26
|
+
BENCHMARK_MODES,
|
|
27
|
+
BenchmarkCliOptions,
|
|
28
|
+
BenchmarkCapabilityKey,
|
|
29
|
+
BenchmarkCapabilityMatrix,
|
|
30
|
+
BenchmarkCapabilityStatus,
|
|
31
|
+
BenchmarkExchangeSummary,
|
|
32
|
+
BenchmarkMode,
|
|
33
|
+
BenchmarkModeRequirements,
|
|
34
|
+
BenchmarkReport,
|
|
35
|
+
BenchmarkRunOutput,
|
|
36
|
+
BenchmarkScenario,
|
|
37
|
+
BenchmarkScenarioDetail,
|
|
38
|
+
BenchmarkScenarioSummary,
|
|
39
|
+
BenchmarkToolTraceStep,
|
|
40
|
+
BenchmarkModelCapabilitySnapshot,
|
|
41
|
+
EffectiveBenchmarkConfig,
|
|
42
|
+
ScenarioResult,
|
|
43
|
+
ScenarioRunSample,
|
|
44
|
+
} from "./types";
|
|
45
|
+
|
|
46
|
+
/**
 * Discriminator carried in `BenchmarkProgressEvent.type`.
 *
 * Grouped here by lifecycle level: whole run, single scenario, and the
 * per-sample notifications raised while a scenario is executing.
 */
export type BenchmarkProgressEventType =
  // run lifecycle
  | "run_started"
  | "run_completed"
  // scenario lifecycle
  | "scenario_started"
  | "scenario_completed"
  // per-run notifications
  | "exchange"
  | "sample_completed"
  // non-fatal diagnostics
  | "warning";
|
|
54
|
+
|
|
55
|
+
/**
 * Payload attached to an `"exchange"` progress event: one request/response
 * pair observed while a scenario runs.
 *
 * NOTE(review): field meanings below are read off the field names; the code
 * that populates this object is outside this part of the file — confirm there.
 */
export interface BenchmarkExchangeEvent {
  // What was run and against which model.
  mode: BenchmarkMode;
  model: string;
  scenarioInput: string;

  // Transport-level facts about the call.
  requestPath: string;
  statusCode: number;
  contentType: string;

  // Routing details; optional, so they may be absent.
  endpointId?: string;
  endpointName?: string;
  upstreamModel?: string;

  // Short textual previews of both sides of the exchange.
  requestPreview: string;
  responsePreview: string;

  // Full payloads, each in a raw and a sanitized variant.
  requestRaw: unknown;
  requestSanitized: unknown;
  responseRaw: unknown;
  responseSanitized: unknown;

  // Tool invocation steps recorded for this exchange.
  toolTrace: BenchmarkToolTraceStep[];
}
|
|
73
|
+
|
|
74
|
+
/**
 * Event handed to `BenchmarkRunHooks.onEvent` while a benchmark executes.
 *
 * Only `type` and `timestamp` are always present; every other field is
 * optional and is filled in depending on the event type.
 */
export interface BenchmarkProgressEvent {
  type: BenchmarkProgressEventType;
  /** Set from `new Date().toISOString()` by `runBenchmark`. */
  timestamp: string;
  runId?: string;

  // Position of the scenario within the run (`runBenchmark` passes 1-based indices).
  scenarioId?: string;
  scenarioIndex?: number;
  totalScenarios?: number;

  // Position of the sample within the scenario.
  runIndex?: number;
  totalRuns?: number;
  phase?: "warmup" | "measured";

  // Type-specific payloads.
  scenario?: BenchmarkScenarioSummary;
  exchange?: BenchmarkExchangeEvent;
  sample?: ScenarioRunSample;
  result?: ScenarioResult;
  warning?: string;
  summary?: Pick<BenchmarkReport, "total" | "executed" | "succeeded" | "failed" | "successRate">;
}
|
|
91
|
+
|
|
92
|
+
/**
 * Optional integration points a caller can pass to `runBenchmark`.
 */
export interface BenchmarkRunHooks {
  /** Progress listener; declared as a method-style member, invoked with each event. */
  onEvent?(event: BenchmarkProgressEvent): void;
  /** Caller-supplied run identifier that `runBenchmark` copies onto the events it emits. */
  runId?: string;
}
|
|
96
|
+
|
|
97
|
+
/**
 * A single tool call entry as it appears in `ChatResponse` messages.
 * Every field is optional, so consumers must check for presence.
 */
interface ChatToolCall {
  id?: string;
  /** Function descriptor; `arguments` is typed as a string, not a parsed object. */
  function?: { name?: string; arguments?: string };
}
|
|
104
|
+
|
|
105
|
+
/**
 * Subset of a chat completion response body that this module reads.
 * All members are optional.
 */
interface ChatResponse {
  /** Token accounting; only the total is modelled here. */
  usage?: { total_tokens?: number };
  /** Candidate completions, each optionally carrying a message. */
  choices?: {
    message?: {
      content?: unknown;
      tool_calls?: ChatToolCall[];
    };
  }[];
}
|
|
116
|
+
|
|
117
|
+
/**
 * Result of a request whose response body is carried as a parsed `payload`,
 * bundled with the request payload and routing information.
 */
interface JsonResponseEnvelope {
  // Response side.
  statusCode: number;
  contentType: string;
  payload: unknown;

  // Request side.
  requestPayload: Record<string, unknown>;

  // Which endpoint handled the call.
  route: {
    endpointId: string;
    endpointName?: string;
    upstreamModel?: string;
  };

  // Optional pool counters.
  poolMetrics?: {
    candidateAttempts: number;
    failovers: number;
    rateLimitSwitches: number;
    distinctProviders: number;
    distinctModels: number;
  };
}
|
|
135
|
+
|
|
136
|
+
/**
 * Result of a request whose response body is kept as raw bytes in `buffer`.
 * Apart from `buffer` replacing `payload`, it declares the same members as
 * `JsonResponseEnvelope`.
 */
interface BinaryResponseEnvelope {
  // Response side.
  statusCode: number;
  contentType: string;
  buffer: Buffer;

  // Request side.
  requestPayload: Record<string, unknown>;

  // Which endpoint handled the call.
  route: {
    endpointId: string;
    endpointName?: string;
    upstreamModel?: string;
  };

  // Optional pool counters.
  poolMetrics?: {
    candidateAttempts: number;
    failovers: number;
    rateLimitSwitches: number;
    distinctProviders: number;
    distinctModels: number;
  };
}
|
|
154
|
+
|
|
155
|
+
/**
 * Everything `runScenarioWithSampling` reports for one scenario: the scenario
 * itself, its summary, the aggregated result, and the collected samples,
 * exchanges and warnings.
 */
interface ScenarioExecution {
  // Identity of what was run.
  scenario: BenchmarkScenario;
  example: BenchmarkScenarioSummary;

  // Outcome.
  result: ScenarioResult;

  // Collected detail; the skip paths return these as empty arrays plus a warning.
  samples: ScenarioRunSample[];
  exchanges: BenchmarkExchangeSummary[];
  warnings: string[];
}
|
|
163
|
+
|
|
164
|
+
/** Listener signature: receives one `BenchmarkExchangeEvent` and returns nothing. */
type ScenarioExchangeCallback = {
  (event: BenchmarkExchangeEvent): void;
};
|
|
165
|
+
|
|
166
|
+
/**
 * Returns the example summaries of a suite by delegating to `listSuiteExamples`.
 *
 * @param suite Suite name to look up; defaults to `"showcase"`.
 * @returns Whatever `listSuiteExamples` returns for that suite.
 */
export function listBenchmarkExamples(suite = "showcase"): BenchmarkScenarioSummary[] {
  const examples = listSuiteExamples(suite);
  return examples;
}
|
|
169
|
+
|
|
170
|
+
/**
 * Runs a benchmark end to end.
 *
 * Steps, in order: resolve the effective configuration, load the scenarios,
 * discover MCP tools when at least one scenario uses `agent` mode, execute
 * every scenario sequentially, build the capability matrix (optionally
 * persisting it), build the report, evaluate gates, write artifacts, and
 * disconnect the MCP servers.
 *
 * Progress is reported through `hooks.onEvent`. Events raised during the run
 * carry `hooks.runId`; the final `run_completed` event carries `report.id`.
 *
 * MCP servers are disconnected on failure as well as on success, so an error
 * in scenario execution, gate evaluation or artifact writing does not leave
 * connections open.
 *
 * @param paths Storage locations passed through to the helpers.
 * @param options CLI-level options used to resolve the effective config.
 * @param hooks Optional run id and progress listener.
 * @returns The final report plus the paths of the written artifacts.
 * @throws Error when no scenarios were loaded; errors raised by the helpers
 *   propagate unchanged.
 */
export async function runBenchmark(
  paths: StoragePaths,
  options: BenchmarkCliOptions,
  hooks?: BenchmarkRunHooks
): Promise<BenchmarkRunOutput> {
  const effective = await resolveBenchmarkConfig(paths, options);
  const loaded = await loadScenarios(paths, effective);
  const runId = hooks?.runId;

  if (loaded.scenarios.length === 0) {
    throw new Error("No benchmark scenarios found. Use --suite and/or --scenario.");
  }

  emitEvent(hooks, {
    type: "run_started",
    timestamp: new Date().toISOString(),
    runId,
    totalScenarios: loaded.scenarios.length,
  });

  const warnings = [...loaded.warnings];
  for (const warning of loaded.warnings) {
    emitEvent(hooks, {
      type: "warning",
      timestamp: new Date().toISOString(),
      runId,
      warning,
    });
  }

  // Tracks whether the regular disconnect was reached, so the `finally`
  // block only performs cleanup on the failure path and never runs it twice.
  let disconnectAttempted = false;
  try {
    const hasAgentScenarios = loaded.scenarios.some((scenario) => scenario.mode === "agent");
    if (hasAgentScenarios) {
      try {
        await discoverAllTools(paths);
      } catch (error) {
        // Discovery failure is deliberately non-fatal: it is surfaced as a warning.
        const warning = `MCP discovery failed for benchmark: ${summarizeMcpError(error)}`;
        warnings.push(warning);
        emitEvent(hooks, {
          type: "warning",
          timestamp: new Date().toISOString(),
          runId,
          warning,
        });
        if (process.env.WAYPOI_DEBUG_ERRORS === "1") {
          console.error(error);
        }
      }
    }

    const executions: ScenarioExecution[] = [];
    for (const [scenarioIndex, scenario] of loaded.scenarios.entries()) {
      emitEvent(hooks, {
        type: "scenario_started",
        timestamp: new Date().toISOString(),
        runId,
        scenarioId: scenario.id,
        scenarioIndex: scenarioIndex + 1,
        totalScenarios: loaded.scenarios.length,
        scenario: scenarioToSummary(scenario, effective.run.suite),
      });

      const execution = await runScenarioWithSampling(
        paths,
        scenario,
        effective,
        (sample, runIndex, phase, totalRuns) => {
          emitEvent(hooks, {
            type: "sample_completed",
            timestamp: new Date().toISOString(),
            runId,
            scenarioId: scenario.id,
            scenarioIndex: scenarioIndex + 1,
            totalScenarios: loaded.scenarios.length,
            runIndex,
            totalRuns,
            phase,
            sample,
          });
        },
        (exchange, runIndex, phase, totalRuns) => {
          emitEvent(hooks, {
            type: "exchange",
            timestamp: new Date().toISOString(),
            runId,
            scenarioId: scenario.id,
            scenarioIndex: scenarioIndex + 1,
            totalScenarios: loaded.scenarios.length,
            runIndex,
            totalRuns,
            phase,
            exchange,
          });
        }
      );
      warnings.push(...execution.warnings);
      for (const warning of execution.warnings) {
        emitEvent(hooks, {
          type: "warning",
          timestamp: new Date().toISOString(),
          runId,
          scenarioId: scenario.id,
          scenarioIndex: scenarioIndex + 1,
          totalScenarios: loaded.scenarios.length,
          warning,
        });
      }
      emitEvent(hooks, {
        type: "scenario_completed",
        timestamp: new Date().toISOString(),
        runId,
        scenarioId: scenario.id,
        scenarioIndex: scenarioIndex + 1,
        totalScenarios: loaded.scenarios.length,
        result: execution.result,
      });
      executions.push(execution);
    }

    const capabilityMatrix = buildCapabilityMatrix(effective, executions);
    if (effective.run.updateCapCache && capabilityMatrix && capabilityMatrix.models.length > 0) {
      await writeCapabilitySnapshots(paths, capabilityMatrix.models);
    }

    const reportBase = buildReport(
      effective,
      warnings,
      loaded.scenarioPath,
      executions,
      capabilityMatrix,
      runId
    );
    const gateResults = await evaluateGates(reportBase, effective);
    const report: BenchmarkReport = {
      ...reportBase,
      gateResults,
    };

    const artifacts = await writeBenchmarkArtifacts(paths, report, effective.run.outPath);

    // Success path: same position as before, and errors from the disconnect
    // still propagate to the caller.
    disconnectAttempted = true;
    await disconnectAllServers();

    emitEvent(hooks, {
      type: "run_completed",
      timestamp: new Date().toISOString(),
      runId: report.id,
      summary: {
        total: report.total,
        executed: report.executed,
        succeeded: report.succeeded,
        failed: report.failed,
        successRate: report.successRate,
      },
    });

    return {
      report,
      artifactPath: artifacts.jsonPath,
      textArtifactPath: artifacts.textPath,
    };
  } finally {
    if (!disconnectAttempted) {
      // Failure path: best-effort cleanup. A cleanup error must not replace
      // the error that is already propagating, so it is only logged on request.
      try {
        await disconnectAllServers();
      } catch (cleanupError) {
        if (process.env.WAYPOI_DEBUG_ERRORS === "1") {
          console.error(cleanupError);
        }
      }
    }
  }
}
|
|
330
|
+
|
|
331
|
+
/**
 * Collects the scenarios for a run from the configured suite and/or scenario file.
 *
 * Order of operations:
 *  1. Suite scenarios are loaded (`"capabilities"` is generated per model, any
 *     other suite comes from `builtInSuite`).
 *  2. If an example id is set, the scenarios collected so far are narrowed to
 *     that id; an empty result is an error.
 *  3. Scenarios from the scenario file are validated and appended. This happens
 *     after step 2, so the example filter does not apply to them.
 *  4. Scenario ids are checked for uniqueness.
 *
 * @returns The scenarios, validation warnings, and the resolved scenario file
 *   path (`undefined` when no file was configured).
 */
async function loadScenarios(paths: StoragePaths, effective: EffectiveBenchmarkConfig): Promise<{
  scenarios: BenchmarkScenario[];
  warnings: string[];
  scenarioPath?: string;
}> {
  const collectedWarnings: string[] = [];
  let collected: BenchmarkScenario[] = [];

  const suiteName = effective.run.suite;
  if (suiteName) {
    collected =
      suiteName === "capabilities"
        ? await buildCapabilitySuiteScenarios(paths, effective)
        : [...builtInSuite(suiteName)];
  }

  if (effective.run.exampleId) {
    collected = collected.filter((candidate) => candidate.id === effective.run.exampleId);
    if (collected.length === 0) {
      throw new Error(
        `Example '${effective.run.exampleId}' not found in suite '${effective.run.suite ?? "showcase"}'.`
      );
    }
  }

  let resolvedScenarioPath: string | undefined;
  if (effective.run.scenarioPath) {
    resolvedScenarioPath = path.resolve(effective.run.scenarioPath);
    const rawEntries = await loadScenarioFile(resolvedScenarioPath);
    const { scenarios: fileScenarios, warnings: fileWarnings } = validateScenarioCollection(
      rawEntries,
      resolvedScenarioPath
    );
    // Tag file-provided scenarios that did not declare their own source.
    for (const fileScenario of fileScenarios) {
      if (!fileScenario.exampleSource) {
        fileScenario.exampleSource = "file";
      }
    }
    collected.push(...fileScenarios);
    collectedWarnings.push(...fileWarnings);
  }

  ensureUniqueScenarioIds(collected);

  return {
    scenarios: collected,
    warnings: collectedWarnings,
    scenarioPath: resolvedScenarioPath,
  };
}
|
|
377
|
+
|
|
378
|
+
/**
 * Expands the built-in `"capabilities"` suite into concrete per-model scenarios.
 *
 * With a model override, the template is materialized for that single model
 * and no provider lookup happens. Otherwise every enabled model of every
 * enabled provider gets its own copy, keyed by `providerId/modelId`, with
 * duplicate references visited only once.
 */
async function buildCapabilitySuiteScenarios(
  paths: StoragePaths,
  effective: EffectiveBenchmarkConfig
): Promise<BenchmarkScenario[]> {
  const template = builtInSuite("capabilities");

  const overrideModel = effective.run.modelOverride;
  if (overrideModel) {
    return materializeCapabilityScenariosForModel(template, overrideModel);
  }

  const expanded: BenchmarkScenario[] = [];
  const visitedRefs = new Set<string>();
  const providers = await listProviders(paths);

  for (const provider of providers) {
    if (provider.enabled) {
      for (const model of provider.models) {
        // Only an explicit `false` disables a model; an unset flag counts as enabled.
        if (model.enabled !== false) {
          const modelRef = `${provider.id}/${model.modelId}`;
          if (!visitedRefs.has(modelRef)) {
            visitedRefs.add(modelRef);
            expanded.push(...materializeCapabilityScenariosForModel(template, modelRef, model));
          }
        }
      }
    }
  }

  return expanded;
}
|
|
409
|
+
|
|
410
|
+
/**
 * Produces model-specific copies of the capability template scenarios.
 *
 * The `cap.chat_vision_input` and `cap.images_edit` templates are always left
 * out. When `providerModel` is given, templates whose mode is not covered by
 * the model's declared capabilities are left out too. Each remaining template
 * is copied with its id suffixed by `::<model>`, its `model` set, and a
 * shallow copy of its `assertions`.
 *
 * @param template Template scenarios; not mutated.
 * @param model Model reference applied to every produced scenario.
 * @param providerModel Optional record used for capability filtering.
 */
function materializeCapabilityScenariosForModel(
  template: BenchmarkScenario[],
  model: string,
  providerModel?: ProviderModelRecord
): BenchmarkScenario[] {
  const materialized: BenchmarkScenario[] = [];

  for (const entry of template) {
    const alwaysExcluded = entry.id === "cap.chat_vision_input" || entry.id === "cap.images_edit";
    if (alwaysExcluded) {
      continue;
    }
    if (providerModel && !supportsScenarioByDeclaredCapabilities(entry, providerModel)) {
      continue;
    }
    materialized.push({
      ...entry,
      id: `${entry.id}::${model}`,
      model,
      assertions: { ...entry.assertions },
    });
  }

  return materialized;
}
|
|
435
|
+
|
|
436
|
+
/**
 * Decides from a model's declared input/output capabilities whether a scenario
 * of the given mode is worth running against it.
 *
 * Required capabilities per mode:
 *  - chat / agent: text in, text out
 *  - embeddings: text in, embedding out
 *  - image_generation: image out
 *  - audio_transcription / omni_call: audio in, text out
 *  - audio_speech: text in, audio out
 *
 * Any other mode is treated as supported.
 */
function supportsScenarioByDeclaredCapabilities(
  scenario: BenchmarkScenario,
  providerModel: ProviderModelRecord
): boolean {
  const accepted = new Set(providerModel.capabilities.input);
  const produced = new Set(providerModel.capabilities.output);

  switch (scenario.mode) {
    case "chat":
    case "agent":
      return accepted.has("text") && produced.has("text");
    case "embeddings":
      return accepted.has("text") && produced.has("embedding");
    case "image_generation":
      return produced.has("image");
    case "audio_transcription":
    case "omni_call":
      return accepted.has("audio") && produced.has("text");
    case "audio_speech":
      return accepted.has("text") && produced.has("audio");
    default:
      return true;
  }
}
|
|
462
|
+
|
|
463
|
+
/**
 * Reads a scenario file and returns its raw, not yet validated entries.
 *
 * The format is chosen by file extension (case-insensitive):
 *  - `.jsonl`: one JSON value per non-blank line
 *  - `.yaml` / `.yml`: parsed with `YAML.parse`
 *  - anything else: parsed as JSON
 *
 * YAML and JSON documents must be an array or an object with a `scenarios`
 * array (enforced by `extractScenarioArray`).
 *
 * @throws Error naming the file (and the line, for JSONL) when parsing fails;
 *   read errors from `fs.readFile` propagate unchanged.
 */
async function loadScenarioFile(filePath: string): Promise<unknown[]> {
  const contents = await fs.readFile(filePath, "utf8");
  const extension = path.extname(filePath).toLowerCase();

  if (extension === ".jsonl") {
    const records: unknown[] = [];
    for (const [index, rawLine] of contents.split("\n").entries()) {
      const text = rawLine.trim();
      if (text.length === 0) {
        continue;
      }
      try {
        records.push(JSON.parse(text) as unknown);
      } catch (error) {
        // Line numbers are 1-based and count blank lines too.
        throw new Error(
          `Failed to parse scenario JSONL ${filePath}:${index + 1}: ${(error as Error).message}`
        );
      }
    }
    return records;
  }

  let document: unknown;
  if (extension === ".yaml" || extension === ".yml") {
    try {
      document = YAML.parse(contents) as unknown;
    } catch (error) {
      throw new Error(`Failed to parse YAML scenario file ${filePath}: ${(error as Error).message}`);
    }
  } else {
    try {
      document = JSON.parse(contents) as unknown;
    } catch (error) {
      throw new Error(`Failed to parse JSON scenario file ${filePath}: ${(error as Error).message}`);
    }
  }

  return extractScenarioArray(document, filePath);
}
|
|
502
|
+
|
|
503
|
+
/**
 * Normalizes a parsed scenario document to a plain array of entries.
 *
 * Accepts either a top-level array or an object whose `scenarios` property is
 * an array; the array is returned as-is (not copied).
 *
 * @param parsed Parsed document of unknown shape.
 * @param source Label (typically the file path) used as the error message prefix.
 * @throws Error when the document has neither accepted shape.
 */
function extractScenarioArray(parsed: unknown, source: string): unknown[] {
  if (Array.isArray(parsed)) {
    return parsed;
  }

  const nested =
    parsed && typeof parsed === "object"
      ? (parsed as { scenarios?: unknown }).scenarios
      : undefined;
  if (Array.isArray(nested)) {
    return nested;
  }

  throw new Error(`${source}: scenario file must be an array or an object with 'scenarios' array.`);
}
|
|
518
|
+
|
|
519
|
+
/**
 * Verifies that no two scenarios share an id.
 *
 * @throws Error naming the first id that occurs a second time.
 */
function ensureUniqueScenarioIds(scenarios: BenchmarkScenario[]): void {
  const seenIds = new Set<string>();
  for (const { id } of scenarios) {
    // Adding an id that is already present leaves the set size unchanged.
    const sizeBefore = seenIds.size;
    seenIds.add(id);
    if (seenIds.size === sizeBefore) {
      throw new Error(`Scenario ID '${id}' is duplicated.`);
    }
  }
}
|
|
528
|
+
|
|
529
|
+
/**
 * Runs one scenario for every warmup and measured run of the active profile
 * and aggregates the measured runs into a single scenario result.
 *
 * The model is taken from the run-level override, then the scenario, then
 * `pickBestModelForScenario`. When no model can be determined, or when the
 * scenario requires tools and none are selected, the scenario is reported as
 * skipped with a warning instead of being executed. In "showcase" execution
 * mode a fixed profile of zero warmup runs and one measured run is used.
 *
 * @param paths - Storage paths forwarded to model selection and execution.
 * @param scenario - Scenario to execute.
 * @param effective - Effective benchmark configuration.
 * @param onSampleComplete - Optional callback invoked after every run (warmup
 *   and measured) with the sample, its 1-based run index, phase and total runs.
 * @param onExchange - Optional callback invoked for every exchange event of
 *   every run, with the same run index, phase and total runs.
 * @returns The scenario, its summary, the aggregated result, measured samples,
 *   measured exchanges and any warnings.
 */
async function runScenarioWithSampling(
  paths: StoragePaths,
  scenario: BenchmarkScenario,
  effective: EffectiveBenchmarkConfig,
  onSampleComplete?: (
    sample: ScenarioRunSample,
    runIndex: number,
    phase: "warmup" | "measured",
    totalRuns: number
  ) => void,
  onExchange?: (
    event: BenchmarkExchangeEvent,
    runIndex: number,
    phase: "warmup" | "measured",
    totalRuns: number
  ) => void
): Promise<ScenarioExecution> {
  const warnings: string[] = [];
  const example = scenarioToSummary(scenario, effective.run.suite);

  // Both "cannot run" outcomes share this shape: record a warning, return no samples.
  const skipped = (reason: string): ScenarioExecution => {
    warnings.push(`Scenario '${scenario.id}' skipped: ${reason}`);
    return {
      scenario,
      example,
      result: buildSkippedScenarioResult(scenario, reason),
      samples: [],
      exchanges: [],
      warnings,
    };
  };

  const model =
    effective.run.modelOverride ||
    scenario.model ||
    (await pickBestModelForScenario(paths, scenario));
  if (!model) {
    return skipped(`No model available for mode '${scenario.mode}'.`);
  }

  const showcaseProfile = { warmupRuns: 0, measuredRuns: 1, minScenarioPassRate: 1 };
  const runProfile =
    effective.run.executionMode === "showcase" ? showcaseProfile : effective.profileSettings;
  const totalRuns = runProfile.warmupRuns + runProfile.measuredRuns;
  const measuredSamples: ScenarioRunSample[] = [];
  const measuredExchanges: BenchmarkExchangeSummary[] = [];

  const selectedTools = getSelectedTools(scenario.tools);
  if (scenario.requiresAvailableTools && selectedTools.length === 0) {
    return skipped("No MCP tools are available for this tool-driven example.");
  }

  for (let index = 0; index < totalRuns; index++) {
    const phase = index < runProfile.warmupRuns ? "warmup" : "measured";
    const runIndex = index + 1;
    const runExchanges: BenchmarkExchangeSummary[] = [];

    const sample = await runSingleScenario(paths, scenario, model, effective, runIndex, (event) => {
      // Warmup exchanges are still reported to the listener but never retained.
      if (phase === "measured") {
        runExchanges.push(toExchangeSummary(event));
      }
      if (onExchange) {
        onExchange(event, runIndex, phase, totalRuns);
      }
    });

    if (onSampleComplete) {
      onSampleComplete(sample, runIndex, phase, totalRuns);
    }
    if (phase === "measured") {
      measuredSamples.push(sample);
      measuredExchanges.push(...runExchanges);
    }
  }

  const result = buildScenarioResult(
    scenario,
    model,
    measuredSamples,
    runProfile.minScenarioPassRate
  );
  return {
    scenario,
    example,
    result,
    exchanges: measuredExchanges,
    samples: measuredSamples,
    warnings,
  };
}
|
|
626
|
+
|
|
627
|
+
/**
 * Chooses a model for a scenario based on the capabilities its mode needs.
 *
 * The mode's required input/output modalities and preferred endpoint type
 * come from `getModeRequirements`; the actual selection is delegated to
 * `pickBestModelByCapabilities`.
 *
 * @param paths - Storage paths forwarded to the capability lookup.
 * @param scenario - Scenario whose `mode` determines the requirements.
 * @returns The selected model identifier, or `null` per the declared return type.
 */
async function pickBestModelForScenario(
  paths: StoragePaths,
  scenario: BenchmarkScenario
): Promise<string | null> {
  const { requiredInput, requiredOutput, preferredEndpointType } = getModeRequirements(
    scenario.mode
  );
  const capabilities = { requiredInput, requiredOutput };
  return pickBestModelByCapabilities(paths, capabilities, preferredEndpointType);
}
|
|
641
|
+
|
|
642
|
+
/**
 * Maps a benchmark mode to the modalities a model must support and the
 * endpoint type preferred for it.
 *
 * @param mode - Benchmark mode to look up.
 * @returns Required input modalities, required output modalities and the
 *   preferred endpoint type for `mode`.
 * @throws Error when `mode` is not one of the known modes. The type system
 *   rules this out, but a mode value that reaches here unvalidated at runtime
 *   would otherwise fall off the switch and yield `undefined`.
 */
function getModeRequirements(mode: BenchmarkMode): BenchmarkModeRequirements {
  switch (mode) {
    case "chat":
    case "agent":
    case "responses":
      return { requiredInput: ["text"], requiredOutput: ["text"], preferredEndpointType: "llm" };
    case "embeddings":
      return { requiredInput: ["text"], requiredOutput: ["embedding"], preferredEndpointType: "embedding" };
    case "image_generation":
      return { requiredInput: ["text"], requiredOutput: ["image"], preferredEndpointType: "diffusion" };
    case "audio_transcription":
      return { requiredInput: ["audio"], requiredOutput: ["text"], preferredEndpointType: "audio" };
    case "audio_speech":
      return { requiredInput: ["text"], requiredOutput: ["audio"], preferredEndpointType: "audio" };
    case "omni_call":
      return { requiredInput: ["text", "audio"], requiredOutput: ["text"], preferredEndpointType: "llm" };
    default:
      // Fail loudly instead of returning undefined in violation of the return type.
      throw new Error(`Unsupported benchmark mode '${String(mode)}'.`);
  }
}
|
|
660
|
+
|
|
661
|
+
/**
 * Executes a scenario once and always resolves to a sample, never rejecting.
 *
 * On success the sample produced by `runModeScenario` is returned with
 * `runIndex` attached. Any error thrown during execution is converted into a
 * failed sample (status code 0, zeroed counters) whose `verdict` and `error`
 * carry the failure message and whose latency covers the time until the failure.
 *
 * @param paths - Storage paths forwarded to the mode runner.
 * @param scenario - Scenario to execute.
 * @param model - Model identifier to run against.
 * @param effective - Effective benchmark configuration.
 * @param runIndex - 1-based index of this run, copied into the sample.
 * @param onExchange - Optional exchange listener forwarded to the mode runner.
 * @returns The run sample, successful or failed.
 */
async function runSingleScenario(
  paths: StoragePaths,
  scenario: BenchmarkScenario,
  model: string,
  effective: EffectiveBenchmarkConfig,
  runIndex: number,
  onExchange?: ScenarioExchangeCallback
): Promise<ScenarioRunSample> {
  const startTime = Date.now();

  try {
    const sample = await runModeScenario(paths, scenario, model, effective, startTime, onExchange);
    return { ...sample, runIndex };
  } catch (error) {
    const latencyMs = Date.now() - startTime;
    // Anything can be thrown; a non-Error value has no `.message`, which
    // previously left the failed sample without a verdict or error text.
    const message = error instanceof Error ? error.message : String(error);
    return {
      runIndex,
      success: false,
      latencyMs,
      statusCode: 0,
      tokens: 0,
      toolCalls: 0,
      throughputTokensPerSec: 0,
      finalOutput: "",
      outputPreview: "",
      verdict: message,
      usedToolNames: [],
      error: message,
      candidateAttempts: 0,
      failovers: 0,
      rateLimitSwitches: 0,
      distinctProviders: 0,
      distinctModels: 0,
    };
  }
}
|
|
697
|
+
|
|
698
|
+
/**
 * Dispatches a scenario to the runner that implements its `mode`.
 *
 * @param paths - Storage paths forwarded to the runner.
 * @param scenario - Scenario to execute; `scenario.mode` selects the runner.
 * @param model - Model identifier to run against.
 * @param effective - Effective benchmark configuration.
 * @param startTime - Epoch milliseconds at which the run started; forwarded to the runner.
 * @param onExchange - Optional exchange listener forwarded to the runner.
 * @returns The runner's sample (without `runIndex`).
 * @throws Error when `scenario.mode` is not a known mode. Previously such a
 *   value fell off the switch and resolved to `undefined`, so the caller built
 *   a sample with no fields other than `runIndex`.
 */
async function runModeScenario(
  paths: StoragePaths,
  scenario: BenchmarkScenario,
  model: string,
  effective: EffectiveBenchmarkConfig,
  startTime: number,
  onExchange?: ScenarioExchangeCallback
): Promise<Omit<ScenarioRunSample, "runIndex">> {
  switch (scenario.mode) {
    case "chat":
      return runChatScenario(paths, scenario, model, effective, startTime, onExchange);
    case "agent":
      return runAgentScenario(paths, scenario, model, effective, startTime, onExchange);
    case "responses":
      return runResponsesScenario(paths, scenario, model, effective, startTime, onExchange);
    case "embeddings":
      return runEmbeddingsScenario(paths, scenario, model, effective, startTime, onExchange);
    case "image_generation":
      return runImageScenario(paths, scenario, model, effective, startTime, onExchange);
    case "audio_transcription":
      return runAudioTranscriptionScenario(paths, scenario, model, effective, startTime, onExchange);
    case "audio_speech":
      return runAudioSpeechScenario(paths, scenario, model, effective, startTime, onExchange);
    case "omni_call":
      return runOmniCallScenario(paths, scenario, model, effective, startTime, onExchange);
    default:
      // Surface an unvalidated mode as an explicit failure.
      throw new Error(`Scenario ${scenario.id}: unsupported mode '${String(scenario.mode)}'.`);
  }
}
|
|
725
|
+
|
|
726
|
+
/**
 * Runs a "chat" scenario: one non-streaming request to `/v1/chat/completions`
 * with the scenario prompt as the only user message, followed by assertion
 * evaluation on the assistant output.
 *
 * @param paths - Storage paths forwarded to `requestJson`.
 * @param scenario - Scenario supplying the prompt, optional timeout and assertions.
 * @param model - Model identifier placed in the request.
 * @param effective - Effective configuration (default timeout, generation parameters).
 * @param startTime - Epoch milliseconds at which the run started; used for latency.
 * @param onExchange - Optional listener that receives the request/response exchange.
 * @returns The sample for this run; `success` is true when no assertion failed.
 */
async function runChatScenario(
  paths: StoragePaths,
  scenario: BenchmarkScenario,
  model: string,
  effective: EffectiveBenchmarkConfig,
  startTime: number,
  onExchange?: ScenarioExchangeCallback
): Promise<Omit<ScenarioRunSample, "runIndex">> {
  const requestPath = "/v1/chat/completions";
  const timeoutMs = scenario.timeoutMs ?? effective.defaults.requestTimeoutMs;
  const payload: Record<string, unknown> = {
    model,
    messages: [{ role: "user", content: scenario.prompt }],
    stream: false,
    ...buildGenerationParams(scenario, effective),
  };

  const envelope = await requestJson(
    paths,
    model,
    requestPath,
    payload,
    timeoutMs,
    getModeRequirements("chat")
  );

  if (onExchange) {
    onExchange(
      buildExchangeEvent({
        scenario,
        mode: "chat",
        model,
        requestPath,
        requestPayload: envelope.requestPayload,
        responsePayload: envelope.payload,
        statusCode: envelope.statusCode,
        contentType: envelope.contentType,
        endpointId: envelope.route.endpointId,
        endpointName: envelope.route.endpointName,
        upstreamModel: envelope.route.upstreamModel,
      })
    );
  }

  const response = (envelope.payload as ChatResponse) ?? {};
  const output = parseAssistantContent(response);
  const tokens = Number(response.usage?.total_tokens ?? 0);
  const latencyMs = Date.now() - startTime;

  // A chat run never executes tools, so tool counters are fixed at zero/empty.
  const failure = evaluateAssertions(scenario, {
    output,
    toolCalls: 0,
    toolNames: [],
    latencyMs,
    statusCode: envelope.statusCode,
  });

  const pool = envelope.poolMetrics;
  return {
    success: !failure,
    latencyMs,
    statusCode: envelope.statusCode,
    tokens,
    toolCalls: 0,
    throughputTokensPerSec: calculateThroughput(tokens, latencyMs),
    finalOutput: output,
    outputPreview: truncate(output, 180),
    verdict: failure ?? "All assertions passed.",
    usedToolNames: [],
    error: failure ?? undefined,
    candidateAttempts: pool?.candidateAttempts ?? 0,
    failovers: pool?.failovers ?? 0,
    rateLimitSwitches: pool?.rateLimitSwitches ?? 0,
    distinctProviders: pool?.distinctProviders ?? 0,
    distinctModels: pool?.distinctModels ?? 0,
  };
}
|
|
798
|
+
|
|
799
|
+
/**
 * Runs a "responses" scenario: one non-streaming request to `/v1/responses`
 * with the scenario prompt as `input`, followed by assertion evaluation on
 * the text extracted from the response output.
 *
 * @param paths - Storage paths forwarded to `requestJson`.
 * @param scenario - Scenario supplying the prompt, optional timeout and assertions.
 * @param model - Model identifier placed in the request.
 * @param effective - Effective configuration (default timeout, generation parameters).
 * @param startTime - Epoch milliseconds at which the run started; used for latency.
 * @param onExchange - Optional listener that receives the request/response exchange.
 * @returns The sample for this run; `success` is true when no assertion failed.
 */
async function runResponsesScenario(
  paths: StoragePaths,
  scenario: BenchmarkScenario,
  model: string,
  effective: EffectiveBenchmarkConfig,
  startTime: number,
  onExchange?: ScenarioExchangeCallback
): Promise<Omit<ScenarioRunSample, "runIndex">> {
  // Subset of the response body that this runner reads.
  type ResponsesPayload = {
    output?: Array<{
      type?: string;
      content?: Array<{ type?: string; text?: string }>;
      arguments?: string;
      name?: string;
    }>;
    usage?: { total_tokens?: number };
  };

  const requestPath = "/v1/responses";
  const timeoutMs = scenario.timeoutMs ?? effective.defaults.requestTimeoutMs;
  const payload: Record<string, unknown> = {
    model,
    input: scenario.prompt,
    stream: false,
    ...buildGenerationParams(scenario, effective),
  };

  const envelope = await requestJson(
    paths,
    model,
    requestPath,
    payload,
    timeoutMs,
    getModeRequirements("responses")
  );

  if (onExchange) {
    onExchange(
      buildExchangeEvent({
        scenario,
        mode: "responses",
        model,
        requestPath,
        requestPayload: envelope.requestPayload,
        responsePayload: envelope.payload,
        statusCode: envelope.statusCode,
        contentType: envelope.contentType,
        endpointId: envelope.route.endpointId,
        endpointName: envelope.route.endpointName,
        upstreamModel: envelope.route.upstreamModel,
      })
    );
  }

  const response = envelope.payload as ResponsesPayload;
  const output = extractResponsesOutputText(response);
  const tokens = Number(response?.usage?.total_tokens ?? 0);
  const latencyMs = Date.now() - startTime;

  // No tools are executed in this mode, so tool counters are fixed at zero/empty.
  const failure = evaluateAssertions(scenario, {
    output,
    toolCalls: 0,
    toolNames: [],
    latencyMs,
    statusCode: envelope.statusCode,
  });

  const pool = envelope.poolMetrics;
  return {
    success: !failure,
    latencyMs,
    statusCode: envelope.statusCode,
    tokens,
    toolCalls: 0,
    throughputTokensPerSec: calculateThroughput(tokens, latencyMs),
    finalOutput: output,
    outputPreview: truncate(output, 180),
    verdict: failure ?? "All assertions passed.",
    usedToolNames: [],
    error: failure ?? undefined,
    candidateAttempts: pool?.candidateAttempts ?? 0,
    failovers: pool?.failovers ?? 0,
    rateLimitSwitches: pool?.rateLimitSwitches ?? 0,
    distinctProviders: pool?.distinctProviders ?? 0,
    distinctModels: pool?.distinctModels ?? 0,
  };
}
|
|
879
|
+
|
|
880
|
+
/**
 * Runs an "agent" scenario: a chat-completions loop in which the model may
 * request tool calls that are executed locally and fed back as `tool` messages.
 *
 * Each iteration sends the accumulated conversation to `/v1/chat/completions`
 * (advertising the selected tools when there are any). The loop ends when the
 * assistant replies without tool calls, or when `maxIterations` is exhausted,
 * in which case the run fails with `max_iterations_reached`. Token usage and
 * pool metrics are accumulated across iterations; the distinct provider/model
 * counts keep the largest value seen in any single iteration.
 *
 * @param paths - Storage paths forwarded to `requestJson`.
 * @param scenario - Scenario supplying prompt, tools, limits and assertions.
 * @param model - Model identifier placed in every request.
 * @param effective - Effective configuration (timeouts, iteration cap, generation parameters).
 * @param startTime - Epoch milliseconds at which the run started; used for latency.
 * @param onExchange - Optional listener notified for every model exchange and every tool call.
 * @returns The sample for this run.
 * @throws Error when a tool call lacks a function name, carries arguments that
 *   are not valid JSON, or when tool execution fails or times out.
 */
async function runAgentScenario(
  paths: StoragePaths,
  scenario: BenchmarkScenario,
  model: string,
  effective: EffectiveBenchmarkConfig,
  startTime: number,
  onExchange?: ScenarioExchangeCallback
): Promise<Omit<ScenarioRunSample, "runIndex">> {
  const requestPath = "/v1/chat/completions";
  const selectedTools = getSelectedTools(scenario.tools);
  const messages: Array<Record<string, unknown>> = [{ role: "user", content: scenario.prompt }];
  const maxIterations = scenario.maxIterations ?? effective.defaults.maxIterations;
  const timeoutMs = scenario.timeoutMs ?? effective.defaults.requestTimeoutMs;
  const toolTimeoutMs = effective.defaults.toolTimeoutMs;

  const usedToolNames = new Set<string>();
  // Key order matches the order these fields appear in the returned sample.
  const pool = {
    candidateAttempts: 0,
    failovers: 0,
    rateLimitSwitches: 0,
    distinctProviders: 0,
    distinctModels: 0,
  };
  let toolCalls = 0;
  let totalTokens = 0;
  let finalOutput = "";
  let statusCode = 200;
  // Becomes true only when the assistant answers without requesting tools.
  let finishedWithoutToolCalls = false;

  for (let iteration = 0; iteration < maxIterations; iteration++) {
    const payload: Record<string, unknown> = {
      model,
      messages,
      stream: false,
      ...buildGenerationParams(scenario, effective),
    };

    if (selectedTools.length > 0) {
      payload.tools = selectedTools.map((tool) => ({
        type: "function",
        function: {
          name: tool.name,
          description: tool.description ?? "",
          parameters: tool.inputSchema,
        },
      }));
      payload.tool_choice = "auto";
    }

    const envelope = await requestJson(
      paths,
      model,
      requestPath,
      payload,
      timeoutMs,
      getModeRequirements("agent")
    );

    statusCode = envelope.statusCode;
    const response = (envelope.payload as ChatResponse) ?? {};
    const metrics = envelope.poolMetrics;
    totalTokens += Number(response.usage?.total_tokens ?? 0);
    pool.candidateAttempts += metrics?.candidateAttempts ?? 0;
    pool.failovers += metrics?.failovers ?? 0;
    pool.rateLimitSwitches += metrics?.rateLimitSwitches ?? 0;
    pool.distinctProviders = Math.max(pool.distinctProviders, metrics?.distinctProviders ?? 0);
    pool.distinctModels = Math.max(pool.distinctModels, metrics?.distinctModels ?? 0);

    const assistantMessage = response.choices?.[0]?.message;
    const assistantContent = parseMessageContent(assistantMessage?.content);
    // Keep the last non-empty assistant text as the final output.
    if (assistantContent) {
      finalOutput = assistantContent;
    }
    const toolCallList = Array.isArray(assistantMessage?.tool_calls)
      ? assistantMessage.tool_calls
      : [];

    if (onExchange) {
      onExchange(
        buildExchangeEvent({
          scenario,
          mode: "agent",
          model,
          requestPath,
          requestPayload: envelope.requestPayload,
          responsePayload: envelope.payload,
          statusCode: envelope.statusCode,
          contentType: envelope.contentType,
          endpointId: envelope.route.endpointId,
          endpointName: envelope.route.endpointName,
          upstreamModel: envelope.route.upstreamModel,
          toolTrace: buildToolTrace(toolCallList, []),
        })
      );
    }

    if (toolCallList.length === 0) {
      messages.push({ role: "assistant", content: assistantContent });
      finishedWithoutToolCalls = true;
      break;
    }

    messages.push({
      role: "assistant",
      content: assistantContent || null,
      tool_calls: toolCallList,
    });

    for (const call of toolCallList) {
      const name = call.function?.name;
      if (!name) {
        throw new Error(`Scenario ${scenario.id}: tool call is missing function.name.`);
      }

      // Absent or blank arguments are treated as an empty argument object.
      let args: Record<string, unknown> = {};
      const rawArguments = call.function?.arguments;
      if (rawArguments && rawArguments.trim().length > 0) {
        try {
          args = JSON.parse(rawArguments) as Record<string, unknown>;
        } catch {
          throw new Error(`Scenario ${scenario.id}: invalid tool arguments for ${name}.`);
        }
      }

      const result = await withTimeout(
        executeTool(name, args),
        toolTimeoutMs,
        `Tool execution timed out for ${name} after ${toolTimeoutMs}ms`
      );

      toolCalls += 1;
      usedToolNames.add(name);
      // Synthesize an id when the call has none, so the tool message and the
      // trace entry refer to the same identifier.
      const toolCallId = call.id ?? `tool-${iteration + 1}-${toolCalls}`;
      messages.push({
        role: "tool",
        tool_call_id: toolCallId,
        content: result.content,
      });

      if (onExchange) {
        onExchange(
          buildExchangeEvent({
            scenario,
            mode: "agent",
            model,
            requestPath: "/mcp/tools/call",
            requestPayload: {
              tool_name: name,
              arguments: args,
            },
            responsePayload: result.content,
            statusCode: 200,
            contentType: "application/json",
            toolTrace: buildToolTrace([call], [
              {
                name,
                toolCallId,
                content: result.content,
              },
            ]),
          })
        );
      }
    }
  }

  const latencyMs = Date.now() - startTime;
  const capError = finishedWithoutToolCalls ? null : "max_iterations_reached";
  const assertionError = evaluateAssertions(scenario, {
    output: finalOutput,
    toolCalls,
    toolNames: Array.from(usedToolNames),
    latencyMs,
    statusCode,
  });
  // Hitting the iteration cap takes precedence over any assertion failure.
  const error = capError ?? assertionError;

  return {
    success: !error,
    latencyMs,
    statusCode,
    tokens: totalTokens,
    toolCalls,
    throughputTokensPerSec: calculateThroughput(totalTokens, latencyMs),
    finalOutput,
    outputPreview: truncate(finalOutput, 180),
    verdict: error ?? "All assertions passed.",
    usedToolNames: Array.from(usedToolNames),
    error: error ?? undefined,
    ...pool,
  };
}
|
|
1063
|
+
|
|
1064
|
+
/**
 * Runs an "embeddings" scenario: one request to `/v1/embeddings` with the
 * scenario input, followed by assertion evaluation on the number of returned
 * items and the length of the first vector.
 *
 * The textual output of the sample is the summary string
 * `items=<count>,vectorLength=<length>`, not the vectors themselves.
 *
 * @param paths - Storage paths forwarded to `requestJson`.
 * @param scenario - Scenario supplying the input, optional timeout and assertions.
 * @param model - Model identifier placed in the request.
 * @param effective - Effective configuration (default request timeout).
 * @param startTime - Epoch milliseconds at which the run started; used for latency.
 * @param onExchange - Optional listener that receives the request/response exchange.
 * @returns The sample for this run; `success` is true when no assertion failed.
 */
async function runEmbeddingsScenario(
  paths: StoragePaths,
  scenario: BenchmarkScenario,
  model: string,
  effective: EffectiveBenchmarkConfig,
  startTime: number,
  onExchange?: ScenarioExchangeCallback
): Promise<Omit<ScenarioRunSample, "runIndex">> {
  const requestPath = "/v1/embeddings";
  const timeoutMs = scenario.timeoutMs ?? effective.defaults.requestTimeoutMs;
  const payload: Record<string, unknown> = { model, input: scenario.input };

  const envelope = await requestJson(
    paths,
    model,
    requestPath,
    payload,
    timeoutMs,
    getModeRequirements("embeddings")
  );

  if (onExchange) {
    onExchange(
      buildExchangeEvent({
        scenario,
        mode: "embeddings",
        model,
        requestPath,
        requestPayload: envelope.requestPayload,
        responsePayload: envelope.payload,
        statusCode: envelope.statusCode,
        contentType: envelope.contentType,
        endpointId: envelope.route.endpointId,
        endpointName: envelope.route.endpointName,
        upstreamModel: envelope.route.upstreamModel,
      })
    );
  }

  const response = envelope.payload as {
    data?: Array<{ embedding?: number[] }>;
    usage?: { total_tokens?: number };
  };

  // A missing or malformed `data` field counts as zero items.
  const data = Array.isArray(response?.data) ? response.data : [];
  const firstEmbedding = data[0]?.embedding;
  const firstVectorLength = Array.isArray(firstEmbedding) ? firstEmbedding.length : 0;
  const text = `items=${data.length},vectorLength=${firstVectorLength}`;
  const tokens = Number(response?.usage?.total_tokens ?? 0);
  const latencyMs = Date.now() - startTime;

  const failure = evaluateAssertions(scenario, {
    output: text,
    toolCalls: 0,
    toolNames: [],
    latencyMs,
    statusCode: envelope.statusCode,
    embeddingsItems: data.length,
    embeddingsVectorLength: firstVectorLength,
  });

  const pool = envelope.poolMetrics;
  return {
    success: !failure,
    latencyMs,
    statusCode: envelope.statusCode,
    tokens,
    toolCalls: 0,
    throughputTokensPerSec: calculateThroughput(tokens, latencyMs),
    finalOutput: text,
    outputPreview: truncate(text, 180),
    verdict: failure ?? "All assertions passed.",
    usedToolNames: [],
    error: failure ?? undefined,
    candidateAttempts: pool?.candidateAttempts ?? 0,
    failovers: pool?.failovers ?? 0,
    rateLimitSwitches: pool?.rateLimitSwitches ?? 0,
    distinctProviders: pool?.distinctProviders ?? 0,
    distinctModels: pool?.distinctModels ?? 0,
  };
}
|
|
1142
|
+
|
|
1143
|
+
/**
 * Runs an "image_generation" scenario: one request to `/v1/images/generations`
 * with the scenario prompt, `n` and `size`, followed by assertion evaluation
 * on the number of images returned.
 *
 * The textual output of the sample is the summary string `images=<count>`.
 * Token count and throughput are reported as zero for this mode.
 *
 * @param paths - Storage paths forwarded to `requestJson`.
 * @param scenario - Scenario supplying prompt, `n`, `size`, optional timeout and assertions.
 * @param model - Model identifier placed in the request.
 * @param effective - Effective configuration (default request timeout).
 * @param startTime - Epoch milliseconds at which the run started; used for latency.
 * @param onExchange - Optional listener that receives the request/response exchange.
 * @returns The sample for this run; `success` is true when no assertion failed.
 */
async function runImageScenario(
  paths: StoragePaths,
  scenario: BenchmarkScenario,
  model: string,
  effective: EffectiveBenchmarkConfig,
  startTime: number,
  onExchange?: ScenarioExchangeCallback
): Promise<Omit<ScenarioRunSample, "runIndex">> {
  const requestPath = "/v1/images/generations";
  const timeoutMs = scenario.timeoutMs ?? effective.defaults.requestTimeoutMs;
  const payload: Record<string, unknown> = {
    model,
    prompt: scenario.prompt,
    n: scenario.n,
    size: scenario.size,
  };

  const envelope = await requestJson(
    paths,
    model,
    requestPath,
    payload,
    timeoutMs,
    getModeRequirements("image_generation")
  );

  if (onExchange) {
    onExchange(
      buildExchangeEvent({
        scenario,
        mode: "image_generation",
        model,
        requestPath,
        requestPayload: envelope.requestPayload,
        responsePayload: envelope.payload,
        statusCode: envelope.statusCode,
        contentType: envelope.contentType,
        endpointId: envelope.route.endpointId,
        endpointName: envelope.route.endpointName,
        upstreamModel: envelope.route.upstreamModel,
      })
    );
  }

  const response = envelope.payload as { data?: Array<{ url?: string; b64_json?: string }> };
  // A missing or malformed `data` field counts as zero images.
  const images = Array.isArray(response?.data) ? response.data : [];
  const imageCount = images.length;
  const text = `images=${imageCount}`;
  const latencyMs = Date.now() - startTime;

  const failure = evaluateAssertions(scenario, {
    output: text,
    toolCalls: 0,
    toolNames: [],
    latencyMs,
    statusCode: envelope.statusCode,
    imagesCount: imageCount,
  });

  const pool = envelope.poolMetrics;
  return {
    success: !failure,
    latencyMs,
    statusCode: envelope.statusCode,
    tokens: 0,
    toolCalls: 0,
    throughputTokensPerSec: 0,
    finalOutput: text,
    outputPreview: truncate(text, 180),
    verdict: failure ?? "All assertions passed.",
    usedToolNames: [],
    error: failure ?? undefined,
    candidateAttempts: pool?.candidateAttempts ?? 0,
    failovers: pool?.failovers ?? 0,
    rateLimitSwitches: pool?.rateLimitSwitches ?? 0,
    distinctProviders: pool?.distinctProviders ?? 0,
    distinctModels: pool?.distinctModels ?? 0,
  };
}
|
|
1216
|
+
|
|
1217
|
+
/**
 * Runs an "audio_transcription" scenario: reads the scenario's audio file,
 * sends it base64-encoded in a JSON request to `/v1/audio/transcriptions`,
 * and evaluates assertions on the returned `text`.
 *
 * Token count and throughput are reported as zero for this mode.
 *
 * @param paths - Storage paths forwarded to `requestJson`.
 * @param scenario - Scenario supplying `audioFile`, optional timeout and assertions.
 * @param model - Model identifier placed in the request.
 * @param effective - Effective configuration (default request timeout).
 * @param startTime - Epoch milliseconds at which the run started; used for latency.
 * @param onExchange - Optional listener that receives the request/response exchange.
 * @returns The sample for this run; `success` is true when no assertion failed.
 */
async function runAudioTranscriptionScenario(
  paths: StoragePaths,
  scenario: BenchmarkScenario,
  model: string,
  effective: EffectiveBenchmarkConfig,
  startTime: number,
  onExchange?: ScenarioExchangeCallback
): Promise<Omit<ScenarioRunSample, "runIndex">> {
  const requestPath = "/v1/audio/transcriptions";
  const timeoutMs = scenario.timeoutMs ?? effective.defaults.requestTimeoutMs;

  // `audioFile` is resolved against the current working directory.
  const audioPath = path.resolve(scenario.audioFile as string);
  const audioBuffer = await fs.readFile(audioPath);
  const encodedAudio = audioBuffer.toString("base64");

  const payload: Record<string, unknown> = {
    model,
    file: encodedAudio,
    response_format: "json",
  };

  const envelope = await requestJson(
    paths,
    model,
    requestPath,
    payload,
    timeoutMs,
    getModeRequirements("audio_transcription")
  );

  if (onExchange) {
    onExchange(
      buildExchangeEvent({
        scenario,
        mode: "audio_transcription",
        model,
        requestPath,
        requestPayload: envelope.requestPayload,
        responsePayload: envelope.payload,
        statusCode: envelope.statusCode,
        contentType: envelope.contentType,
        endpointId: envelope.route.endpointId,
        endpointName: envelope.route.endpointName,
        upstreamModel: envelope.route.upstreamModel,
      })
    );
  }

  const response = envelope.payload as { text?: string };
  const text = response?.text ?? "";
  const latencyMs = Date.now() - startTime;

  const failure = evaluateAssertions(scenario, {
    output: text,
    toolCalls: 0,
    toolNames: [],
    latencyMs,
    statusCode: envelope.statusCode,
  });

  const pool = envelope.poolMetrics;
  return {
    success: !failure,
    latencyMs,
    statusCode: envelope.statusCode,
    tokens: 0,
    toolCalls: 0,
    throughputTokensPerSec: 0,
    finalOutput: text,
    outputPreview: truncate(text, 180),
    verdict: failure ?? "All assertions passed.",
    usedToolNames: [],
    error: failure ?? undefined,
    candidateAttempts: pool?.candidateAttempts ?? 0,
    failovers: pool?.failovers ?? 0,
    rateLimitSwitches: pool?.rateLimitSwitches ?? 0,
    distinctProviders: pool?.distinctProviders ?? 0,
    distinctModels: pool?.distinctModels ?? 0,
  };
}
|
|
1290
|
+
|
|
1291
|
+
/**
 * Runs an "audio_speech" scenario: one request to `/v1/audio/speech` with the
 * scenario's input text, voice and response format, followed by assertion
 * evaluation on the size and content type of the binary response.
 *
 * The textual output of the sample is the summary string `bytes=<length>`;
 * the exchange event likewise records only the byte count, not the audio.
 * Token count and throughput are reported as zero for this mode.
 *
 * @param paths - Storage paths forwarded to `requestBinary`.
 * @param scenario - Scenario supplying `inputText`, `voice`, `response_format`,
 *   optional timeout and assertions.
 * @param model - Model identifier placed in the request.
 * @param effective - Effective configuration (default request timeout).
 * @param startTime - Epoch milliseconds at which the run started; used for latency.
 * @param onExchange - Optional listener that receives the request/response exchange.
 * @returns The sample for this run; `success` is true when no assertion failed.
 */
async function runAudioSpeechScenario(
  paths: StoragePaths,
  scenario: BenchmarkScenario,
  model: string,
  effective: EffectiveBenchmarkConfig,
  startTime: number,
  onExchange?: ScenarioExchangeCallback
): Promise<Omit<ScenarioRunSample, "runIndex">> {
  const requestPath = "/v1/audio/speech";
  const timeoutMs = scenario.timeoutMs ?? effective.defaults.requestTimeoutMs;
  const payload: Record<string, unknown> = {
    model,
    input: scenario.inputText,
    voice: scenario.voice,
    response_format: scenario.response_format,
  };

  const envelope = await requestBinary(
    paths,
    model,
    requestPath,
    payload,
    timeoutMs,
    getModeRequirements("audio_speech")
  );
  const byteCount = envelope.buffer.length;

  if (onExchange) {
    onExchange(
      buildExchangeEvent({
        scenario,
        mode: "audio_speech",
        model,
        requestPath,
        requestPayload: envelope.requestPayload,
        responsePayload: {
          bytes: byteCount,
        },
        statusCode: envelope.statusCode,
        contentType: envelope.contentType,
        endpointId: envelope.route.endpointId,
        endpointName: envelope.route.endpointName,
        upstreamModel: envelope.route.upstreamModel,
      })
    );
  }

  const latencyMs = Date.now() - startTime;
  const output = `bytes=${byteCount}`;

  const failure = evaluateAssertions(scenario, {
    output,
    toolCalls: 0,
    toolNames: [],
    latencyMs,
    statusCode: envelope.statusCode,
    bytesLength: byteCount,
    contentType: envelope.contentType,
  });

  const pool = envelope.poolMetrics;
  return {
    success: !failure,
    latencyMs,
    statusCode: envelope.statusCode,
    tokens: 0,
    toolCalls: 0,
    throughputTokensPerSec: 0,
    finalOutput: output,
    outputPreview: truncate(output, 180),
    verdict: failure ?? "All assertions passed.",
    usedToolNames: [],
    error: failure ?? undefined,
    candidateAttempts: pool?.candidateAttempts ?? 0,
    failovers: pool?.failovers ?? 0,
    rateLimitSwitches: pool?.rateLimitSwitches ?? 0,
    distinctProviders: pool?.distinctProviders ?? 0,
    distinctModels: pool?.distinctModels ?? 0,
  };
}
|
|
1364
|
+
|
|
1365
|
+
/**
 * Executes a single "omni_call" benchmark run.
 *
 * Reads the scenario's audio file, sends it base64-encoded together with a text
 * prompt to `/v1/chat/completions` (with `stream: false`), reports the exchange
 * through `onExchange` when one is supplied, and evaluates the scenario
 * assertions against the reply.
 *
 * @param paths - Storage paths forwarded to `requestJson`.
 * @param scenario - Scenario being run; `audioFile` is resolved with `path.resolve`.
 * @param model - Model reference placed in the payload and passed to `requestJson`.
 * @param effective - Effective config supplying the fallback timeout and generation params.
 * @param startTime - Epoch milliseconds the run started at; latency is measured from it.
 * @param onExchange - Optional callback receiving the request/response exchange event.
 * @returns The run sample without its `runIndex`.
 */
async function runOmniCallScenario(
  paths: StoragePaths,
  scenario: BenchmarkScenario,
  model: string,
  effective: EffectiveBenchmarkConfig,
  startTime: number,
  onExchange?: ScenarioExchangeCallback
): Promise<Omit<ScenarioRunSample, "runIndex">> {
  const requestPath = "/v1/chat/completions";
  const timeoutMs = scenario.timeoutMs ?? effective.defaults.requestTimeoutMs;

  // Load the clip and inline it as base64 next to the text instruction.
  const audioPath = path.resolve(scenario.audioFile as string);
  const audioBuffer = await fs.readFile(audioPath);
  const audioPart = {
    type: "input_audio",
    input_audio: {
      data: audioBuffer.toString("base64"),
      format: audioFormatFromFile(audioPath),
    },
  };
  const textPart = {
    type: "text",
    text: scenario.prompt ?? "Summarize this audio briefly and answer as transcript text.",
  };

  const payload: Record<string, unknown> = {
    model,
    messages: [{ role: "user", content: [audioPart, textPart] }],
    stream: false,
    ...buildGenerationParams(scenario, effective),
  };

  const envelope = await requestJson(
    paths,
    model,
    requestPath,
    payload,
    timeoutMs,
    getModeRequirements("omni_call")
  );
  const { statusCode, route, poolMetrics } = envelope;

  // Surface the raw exchange before any interpretation of the reply.
  onExchange?.(
    buildExchangeEvent({
      scenario,
      mode: "omni_call",
      model,
      requestPath,
      requestPayload: envelope.requestPayload,
      responsePayload: envelope.payload,
      statusCode,
      contentType: envelope.contentType,
      endpointId: route.endpointId,
      endpointName: route.endpointName,
      upstreamModel: route.upstreamModel,
    })
  );

  const response = (envelope.payload as ChatResponse) ?? {};
  const output = parseAssistantContent(response);
  const tokens = Number(response.usage?.total_tokens ?? 0);
  const audioOutputPresent = responseHasAudioOutput(response);
  const latencyMs = Date.now() - startTime;

  const assertionError = evaluateAssertions(scenario, {
    output,
    toolCalls: 0,
    toolNames: [],
    latencyMs,
    statusCode,
  });
  const audioFlag = audioOutputPresent ? "yes" : "no";

  return {
    success: !assertionError,
    latencyMs,
    statusCode,
    tokens,
    toolCalls: 0,
    throughputTokensPerSec: calculateThroughput(tokens, latencyMs),
    finalOutput: output,
    outputPreview: truncate(`${output}\naudio_output=${audioFlag}`, 180),
    verdict: assertionError ?? "All assertions passed.",
    usedToolNames: [],
    error: assertionError ?? undefined,
    candidateAttempts: poolMetrics?.candidateAttempts ?? 0,
    failovers: poolMetrics?.failovers ?? 0,
    rateLimitSwitches: poolMetrics?.rateLimitSwitches ?? 0,
    distinctProviders: poolMetrics?.distinctProviders ?? 0,
    distinctModels: poolMetrics?.distinctModels ?? 0,
    audioOutputPresent,
  };
}
|
|
1462
|
+
|
|
1463
|
+
/**
 * Maps an audio file path to the format label sent with `input_audio` content.
 *
 * The extension is compared case-insensitively. `mp4` is reported as `m4a`,
 * and any extension not listed here (including none) falls back to `wav`.
 *
 * @param filePath - Path whose extension selects the format.
 * @returns One of `mp3`, `wav`, `ogg`, `m4a` or `webm`.
 */
function audioFormatFromFile(filePath: string): string {
  const extension = path.extname(filePath).slice(1).toLowerCase();
  switch (extension) {
    case "mp3":
    case "wav":
    case "ogg":
    case "webm":
      return extension;
    case "m4a":
    case "mp4":
      return "m4a";
    default:
      return "wav";
  }
}
|
|
1472
|
+
|
|
1473
|
+
/**
 * Reports whether the first choice of a chat response carries audio output.
 *
 * Audio counts as present when either `message.audio`, or the `audio` field of
 * a content part typed `"audio"` / `"output_audio"`, is an object with a string
 * `url` or a string `data`.
 *
 * @param response - Chat response to inspect.
 * @returns `true` when such an audio object is found, otherwise `false`.
 */
function responseHasAudioOutput(response: ChatResponse): boolean {
  // True when `value` is an object exposing a string `url` or a string `data`.
  const carriesAudio = (value: unknown): boolean => {
    if (!value || typeof value !== "object") {
      return false;
    }
    const { url, data } = value as { url?: unknown; data?: unknown };
    return typeof url === "string" || typeof data === "string";
  };

  const message: unknown = response.choices?.[0]?.message;
  if (!message || typeof message !== "object") {
    return false;
  }

  const { audio, content } = message as { audio?: unknown; content?: unknown };
  if (carriesAudio(audio)) {
    return true;
  }
  if (!Array.isArray(content)) {
    return false;
  }

  return content.some((part: unknown) => {
    if (!part || typeof part !== "object") {
      return false;
    }
    const { type, audio: partAudio } = part as { type?: unknown; audio?: unknown };
    return (type === "audio" || type === "output_audio") && carriesAudio(partAudio);
  });
}
|
|
1508
|
+
|
|
1509
|
+
/**
 * Routes a request through `routeRequest` and returns the reply decoded with `parseJson`.
 *
 * @param paths - Storage paths forwarded to `routeRequest`.
 * @param model - Model reference used for routing.
 * @param requestPath - API path of the request, e.g. `/v1/chat/completions`.
 * @param payload - Request body; a JSON round-trip copy of it is what gets sent.
 * @param timeoutMs - Milliseconds after which the abort signal fires.
 * @param requirements - Endpoint type and input/output requirements for routing.
 * @returns Status code, decoded payload, content type, the payload copy that was
 *   sent, the route that served the request and its pool metrics.
 */
async function requestJson(
  paths: StoragePaths,
  model: string,
  requestPath: string,
  payload: Record<string, unknown>,
  timeoutMs: number,
  requirements: BenchmarkModeRequirements
): Promise<JsonResponseEnvelope> {
  // The copy, not the caller's object, is routed and echoed back in the envelope.
  const requestPayload = JSON.parse(JSON.stringify(payload)) as Record<string, unknown>;
  const signal = AbortSignal.timeout(timeoutMs);
  const { preferredEndpointType, requiredInput, requiredOutput } = requirements;

  const outcome = await routeRequest(paths, model, requestPath, requestPayload, {}, signal, {
    endpointType: preferredEndpointType,
    requiredInput,
    requiredOutput,
  });

  const { response, endpoint, upstreamModel, pool } = outcome.attempt;
  const { buffer, contentType } = await readBody(response.body, response.headers);
  const payloadData = parseJson(buffer);

  return {
    statusCode: response.statusCode,
    payload: payloadData,
    contentType,
    requestPayload,
    route: {
      endpointId: endpoint.id,
      endpointName: endpoint.name,
      upstreamModel,
    },
    poolMetrics: pool,
  };
}
|
|
1547
|
+
|
|
1548
|
+
/**
 * Routes a request through `routeRequest` and returns the reply body as raw bytes.
 *
 * @param paths - Storage paths forwarded to `routeRequest`.
 * @param model - Model reference used for routing.
 * @param requestPath - API path of the request.
 * @param payload - Request body; a JSON round-trip copy of it is what gets sent.
 * @param timeoutMs - Milliseconds after which the abort signal fires.
 * @param requirements - Endpoint type and input/output requirements for routing.
 * @returns Status code, body buffer, content type, the payload copy that was
 *   sent, the route that served the request and its pool metrics.
 */
async function requestBinary(
  paths: StoragePaths,
  model: string,
  requestPath: string,
  payload: Record<string, unknown>,
  timeoutMs: number,
  requirements: BenchmarkModeRequirements
): Promise<BinaryResponseEnvelope> {
  // Work on a detached copy so the envelope reflects exactly what was routed.
  const requestPayload = JSON.parse(JSON.stringify(payload)) as Record<string, unknown>;
  const abortSignal = AbortSignal.timeout(timeoutMs);

  const { attempt } = await routeRequest(paths, model, requestPath, requestPayload, {}, abortSignal, {
    endpointType: requirements.preferredEndpointType,
    requiredInput: requirements.requiredInput,
    requiredOutput: requirements.requiredOutput,
  });

  const body = await readBody(attempt.response.body, attempt.response.headers);
  const route = {
    endpointId: attempt.endpoint.id,
    endpointName: attempt.endpoint.name,
    upstreamModel: attempt.upstreamModel,
  };

  return {
    statusCode: attempt.response.statusCode,
    buffer: body.buffer,
    contentType: body.contentType,
    requestPayload,
    route,
    poolMetrics: attempt.pool,
  };
}
|
|
1585
|
+
|
|
1586
|
+
/**
 * Drains a readable stream into one buffer and extracts the content type header.
 *
 * @param stream - Response body; string chunks are converted with `Buffer.from`.
 * @param headers - Response headers keyed by field name.
 * @returns The concatenated body and the content type. Multiple header values
 *   are joined with `", "`; an absent header yields `""`.
 */
async function readBody(
  stream: NodeJS.ReadableStream,
  headers: Record<string, string | string[]>
): Promise<{ buffer: Buffer; contentType: string }> {
  const chunks: Buffer[] = [];
  for await (const chunk of stream) {
    chunks.push(Buffer.isBuffer(chunk) ? chunk : Buffer.from(chunk));
  }
  const buffer = Buffer.concat(chunks);

  // Try the two common spellings first, in the original order of precedence.
  let contentTypeHeader: string | string[] | undefined =
    headers["content-type"] ?? headers["Content-Type"];

  // HTTP field names are case-insensitive, so fall back to a scan that also
  // matches spellings such as "CONTENT-TYPE" or "Content-type".
  if (contentTypeHeader == null) {
    for (const [name, value] of Object.entries(headers)) {
      if (value != null && name.toLowerCase() === "content-type") {
        contentTypeHeader = value;
        break;
      }
    }
  }

  const contentType = Array.isArray(contentTypeHeader)
    ? contentTypeHeader.join(", ")
    : (contentTypeHeader ?? "");

  return { buffer, contentType };
}
|
|
1602
|
+
|
|
1603
|
+
/**
 * Decodes a UTF-8 buffer as JSON without throwing.
 *
 * @param buffer - Raw body bytes.
 * @returns `{}` when the trimmed text is empty, the parsed value when it is
 *   valid JSON, and `{ raw: <trimmed text> }` when it is not.
 */
function parseJson(buffer: Buffer): unknown {
  const body = buffer.toString("utf8").trim();
  if (body.length === 0) {
    return {};
  }

  let parsed: unknown;
  try {
    parsed = JSON.parse(body);
  } catch {
    // Keep the unparseable text so callers can still inspect it.
    return { raw: body };
  }
  return parsed;
}
|
|
1614
|
+
|
|
1615
|
+
/**
 * Returns the cached tools, optionally restricted to a set of names.
 *
 * @param requestedNames - Tool names to keep; when omitted or empty, every
 *   cached tool is returned.
 * @returns The cached tools whose `name` is among `requestedNames`.
 */
function getSelectedTools(requestedNames?: string[]) {
  const available = getCachedTools();
  if (!requestedNames?.length) {
    return available;
  }
  const wanted = new Set(requestedNames);
  return available.filter(({ name }) => wanted.has(name));
}
|
|
1623
|
+
|
|
1624
|
+
/**
 * Extracts the text of the first choice's message from a chat response.
 *
 * @param response - Chat response to read.
 * @returns Whatever `parseMessageContent` produces for that message's `content`.
 */
function parseAssistantContent(response: ChatResponse): string {
  return parseMessageContent(response.choices?.[0]?.message?.content);
}
|
|
1628
|
+
|
|
1629
|
+
/**
 * Resolves the sampling parameters for a request.
 *
 * Each parameter is taken from the scenario first, then from the run options,
 * then from the configured defaults; the result is passed through `compactObject`.
 *
 * @param scenario - Scenario-level overrides.
 * @param effective - Effective config providing `run` and `defaults` values.
 * @returns The resolved parameters keyed by their request field names.
 */
function buildGenerationParams(
  scenario: BenchmarkScenario,
  effective: EffectiveBenchmarkConfig
): Record<string, unknown> {
  const { run, defaults } = effective;
  return compactObject({
    temperature: scenario.temperature ?? run.temperature ?? defaults.temperature,
    top_p: scenario.top_p ?? run.top_p ?? defaults.top_p,
    max_tokens: scenario.max_tokens ?? run.max_tokens ?? defaults.max_tokens,
    presence_penalty:
      scenario.presence_penalty ?? run.presence_penalty ?? defaults.presence_penalty,
    frequency_penalty:
      scenario.frequency_penalty ?? run.frequency_penalty ?? defaults.frequency_penalty,
    seed: scenario.seed ?? run.seed ?? defaults.seed,
    stop: scenario.stop ?? run.stop ?? defaults.stop,
  });
}
|
|
1664
|
+
|
|
1665
|
+
/**
 * Flattens the `output` items of a response into a single text.
 *
 * `message` items contribute the text of each `output_text` part, and
 * `function_call` items with a string name contribute `tool_call:<name>`.
 * Other items are ignored.
 *
 * @param response - Object holding the optional `output` array.
 * @returns The collected lines joined with newlines and trimmed.
 */
function extractResponsesOutputText(response: {
  output?: Array<{
    type?: string;
    content?: Array<{ type?: string; text?: string }>;
    name?: string;
    arguments?: string;
  }>;
}): string {
  const lines = (response.output ?? []).flatMap((item): string[] => {
    if (!item || typeof item !== "object") {
      return [];
    }
    if (item.type === "message") {
      return (item.content ?? []).flatMap((part) =>
        part?.type === "output_text" && typeof part.text === "string" ? [part.text] : []
      );
    }
    return item.type === "function_call" && typeof item.name === "string"
      ? [`tool_call:${item.name}`]
      : [];
  });
  return lines.join("\n").trim();
}
|
|
1692
|
+
|
|
1693
|
+
/**
 * Converts message content into plain text.
 *
 * @param content - A string, or an array whose entries are strings or objects
 *   with a string `text` field; anything else is treated as empty.
 * @returns The string itself, or the non-empty array texts joined with
 *   newlines, or `""` for any other input.
 */
function parseMessageContent(content: unknown): string {
  if (typeof content === "string") {
    return content;
  }
  if (!Array.isArray(content)) {
    return "";
  }

  const texts: string[] = [];
  for (const part of content as unknown[]) {
    let text = "";
    if (typeof part === "string") {
      text = part;
    } else if (part && typeof part === "object") {
      const candidate = (part as { text?: unknown }).text;
      if (typeof candidate === "string") {
        text = candidate;
      }
    }
    // Empty fragments are dropped so they do not produce blank lines.
    if (text.length > 0) {
      texts.push(text);
    }
  }
  return texts.join("\n");
}
|
|
1713
|
+
|
|
1714
|
+
/** Observed values of one scenario run that `evaluateAssertions` checks against. */
interface AssertionRuntime {
  /** Text output searched by the contains / notContains style assertions. */
  output: string;
  /** Number of tool calls made during the run. */
  toolCalls: number;
  /** Names of the tools that were used. */
  toolNames: string[];
  /** Measured latency in milliseconds. */
  latencyMs: number;
  /** HTTP status code of the response. */
  statusCode: number;
  /** Item count compared against `minItems`; treated as 0 when absent. */
  embeddingsItems?: number;
  /** Vector length compared against `minVectorLength`; treated as 0 when absent. */
  embeddingsVectorLength?: number;
  /** Image count compared against `minImages`; treated as 0 when absent. */
  imagesCount?: number;
  /** Body size compared against `minBytes`; treated as 0 when absent. */
  bytesLength?: number;
  /** Response content type; treated as `""` when absent. */
  contentType?: string;
}
|
|
1726
|
+
|
|
1727
|
+
/**
 * Checks one run against the scenario's assertions.
 *
 * Checks run in a fixed order and stop at the first failure: contains,
 * notContains, requiredToolNames, min/max tool calls, max latency, status code,
 * minItems, minVectorLength, minImages, containsText, notContainsText,
 * minBytes and contentType. The status code is always compared; the other
 * checks apply only when the corresponding assertion is set.
 *
 * @param scenario - Scenario whose `assertions` are evaluated.
 * @param runtime - Observed values of the run.
 * @returns The failure message of the first failed assertion, or `null` when all pass.
 */
function evaluateAssertions(scenario: BenchmarkScenario, runtime: AssertionRuntime): string | null {
  const assertions = scenario.assertions;
  const { output, toolCalls, toolNames, latencyMs, statusCode } = runtime;

  const missingOutput = (assertions.contains ?? []).find((needle) => !output.includes(needle));
  if (missingOutput !== undefined) {
    return `Assertion failed: output must include '${missingOutput}'.`;
  }

  const forbiddenOutput = (assertions.notContains ?? []).find((needle) => output.includes(needle));
  if (forbiddenOutput !== undefined) {
    return `Assertion failed: output must not include '${forbiddenOutput}'.`;
  }

  const unusedTool = (assertions.requiredToolNames ?? []).find((name) => !toolNames.includes(name));
  if (unusedTool !== undefined) {
    return `Assertion failed: expected tool '${unusedTool}' to be used.`;
  }

  const { minToolCalls, maxToolCalls, maxLatencyMs } = assertions;
  if (typeof minToolCalls === "number" && toolCalls < minToolCalls) {
    return `Assertion failed: expected at least ${minToolCalls} tool calls, got ${toolCalls}.`;
  }
  if (typeof maxToolCalls === "number" && toolCalls > maxToolCalls) {
    return `Assertion failed: expected at most ${maxToolCalls} tool calls, got ${toolCalls}.`;
  }
  if (typeof maxLatencyMs === "number" && latencyMs > maxLatencyMs) {
    return `Assertion failed: latency ${latencyMs}ms exceeded ${maxLatencyMs}ms.`;
  }

  // Unlike the optional checks around it, the status code is compared unconditionally.
  if (statusCode !== assertions.statusCode) {
    return `Assertion failed: expected status ${assertions.statusCode}, got ${statusCode}.`;
  }

  const { minItems, minVectorLength, minImages } = assertions;
  const items = runtime.embeddingsItems ?? 0;
  if (typeof minItems === "number" && items < minItems) {
    return `Assertion failed: expected at least ${minItems} embeddings, got ${items}.`;
  }
  const vectorLength = runtime.embeddingsVectorLength ?? 0;
  if (typeof minVectorLength === "number" && vectorLength < minVectorLength) {
    return `Assertion failed: expected vector length >= ${minVectorLength}, got ${vectorLength}.`;
  }
  const images = runtime.imagesCount ?? 0;
  if (typeof minImages === "number" && images < minImages) {
    return `Assertion failed: expected at least ${minImages} images, got ${images}.`;
  }

  const missingText = (assertions.containsText ?? []).find((text) => !output.includes(text));
  if (missingText !== undefined) {
    return `Assertion failed: transcription must include '${missingText}'.`;
  }

  const forbiddenText = (assertions.notContainsText ?? []).find((text) => output.includes(text));
  if (forbiddenText !== undefined) {
    return `Assertion failed: transcription must not include '${forbiddenText}'.`;
  }

  const { minBytes } = assertions;
  const bytes = runtime.bytesLength ?? 0;
  if (typeof minBytes === "number" && bytes < minBytes) {
    return `Assertion failed: expected at least ${minBytes} bytes, got ${bytes}.`;
  }

  if (assertions.contentType) {
    const contentType = runtime.contentType ?? "";
    // Case-insensitive substring match, so "audio/" accepts "Audio/MPEG".
    if (!contentType.toLowerCase().includes(assertions.contentType.toLowerCase())) {
      return `Assertion failed: expected content type to include '${assertions.contentType}', got '${contentType}'.`;
    }
  }

  return null;
}
|
|
1813
|
+
|
|
1814
|
+
/**
 * Builds the result entry for a scenario that was not executed.
 *
 * The entry has status `"skipped"`, counts as a success with pass rate 1, and
 * carries zeroed metrics; `reason` is used for both `skippedReason` and `verdict`.
 *
 * @param scenario - Scenario that was skipped.
 * @param reason - Explanation of why it was skipped.
 * @returns The skipped scenario result.
 */
function buildSkippedScenarioResult(
  scenario: BenchmarkScenario,
  reason: string
): ScenarioResult {
  const {
    id,
    mode,
    title,
    summary,
    userVisibleGoal,
    exampleSource,
    successCriteria,
    expectedHighlights,
  } = scenario;
  const inputPreview = scenario.inputPreview ?? describeScenarioInput(scenario);
  // No model was resolved for a skipped scenario unless the scenario names one itself.
  const model = scenario.model ?? "unresolved";

  return {
    id,
    mode,
    title,
    summary,
    userVisibleGoal,
    exampleSource,
    inputPreview,
    successCriteria,
    expectedHighlights,
    model,
    status: "skipped",
    success: true,
    skippedReason: reason,
    passRate: 1,
    passedRuns: 0,
    failedRuns: 0,
    avgLatencyMs: 0,
    p50LatencyMs: 0,
    p95LatencyMs: 0,
    p99LatencyMs: 0,
    totalTokens: 0,
    totalToolCalls: 0,
    avgThroughputTokensPerSec: 0,
    candidateAttempts: 0,
    failovers: 0,
    rateLimitSwitches: 0,
    distinctProviders: 0,
    distinctModels: 0,
    errorReasons: [],
    usedToolNames: [],
    verdict: reason,
    outputPreview: "",
    audioOutputRuns: 0,
  };
}
|
|
1854
|
+
|
|
1855
|
+
/**
 * Aggregates the run samples of one scenario into its result entry.
 *
 * Counters are summed over the samples, while `distinctProviders` and
 * `distinctModels` take the maximum seen in any sample. The scenario passes
 * when the fraction of successful runs reaches `minScenarioPassRate`.
 *
 * @param scenario - Scenario the samples belong to.
 * @param model - Model reference recorded on the result.
 * @param samples - Samples of every run of the scenario.
 * @param minScenarioPassRate - Minimum pass rate for status `"passed"`.
 * @returns The aggregated scenario result.
 */
function buildScenarioResult(
  scenario: BenchmarkScenario,
  model: string,
  samples: ScenarioRunSample[],
  minScenarioPassRate: number
): ScenarioResult {
  const runCount = samples.length;
  const latencies = samples.map(({ latencyMs }) => latencyMs).sort((a, b) => a - b);

  let passedRuns = 0;
  let totalTokens = 0;
  let totalToolCalls = 0;
  let candidateAttempts = 0;
  let failovers = 0;
  let rateLimitSwitches = 0;
  let distinctProviders = 0;
  let distinctModels = 0;
  let audioOutputRuns = 0;
  let throughputSum = 0;
  let outputPreview = "";
  let firstError: string | undefined;
  const failureReasonCounts = new Map<string, number>();

  for (const sample of samples) {
    if (sample.success) {
      passedRuns += 1;
    }
    totalTokens += sample.tokens;
    totalToolCalls += sample.toolCalls;
    candidateAttempts += sample.candidateAttempts ?? 0;
    failovers += sample.failovers ?? 0;
    rateLimitSwitches += sample.rateLimitSwitches ?? 0;
    distinctProviders = Math.max(distinctProviders, sample.distinctProviders ?? 0);
    distinctModels = Math.max(distinctModels, sample.distinctModels ?? 0);
    audioOutputRuns += sample.audioOutputPresent ? 1 : 0;
    throughputSum += sample.throughputTokensPerSec;

    // The preview shown is the one from the latest run that produced any.
    if (sample.outputPreview) {
      outputPreview = sample.outputPreview;
    }

    if (sample.error) {
      if (firstError === undefined) {
        firstError = sample.error;
      }
      failureReasonCounts.set(sample.error, (failureReasonCounts.get(sample.error) ?? 0) + 1);
    }
  }

  const failedRuns = runCount - passedRuns;
  const passRate = runCount > 0 ? passedRuns / runCount : 0;
  const latencySum = latencies.reduce((sum, value) => sum + value, 0);
  const avgLatencyMs = latencies.length > 0 ? Math.round(latencySum / latencies.length) : 0;
  const avgThroughputTokensPerSec = runCount > 0 ? throughputSum / runCount : 0;

  // Most frequent failure first, each labelled with its occurrence count.
  const errorReasons = [...failureReasonCounts.entries()]
    .sort((a, b) => b[1] - a[1])
    .map(([reason, count]) => `${reason} (${count})`);

  const status = passRate >= minScenarioPassRate ? "passed" : "failed";
  const verdict =
    status === "passed"
      ? "All assertions passed."
      : (firstError ?? errorReasons[0] ?? "Scenario failed.");

  return {
    id: scenario.id,
    mode: scenario.mode,
    title: scenario.title,
    summary: scenario.summary,
    userVisibleGoal: scenario.userVisibleGoal,
    exampleSource: scenario.exampleSource,
    inputPreview: scenario.inputPreview ?? describeScenarioInput(scenario),
    successCriteria: scenario.successCriteria,
    expectedHighlights: scenario.expectedHighlights,
    model,
    status,
    success: status === "passed",
    passRate: Number(passRate.toFixed(4)),
    passedRuns,
    failedRuns,
    avgLatencyMs,
    p50LatencyMs: percentile(latencies, 50),
    p95LatencyMs: percentile(latencies, 95),
    p99LatencyMs: percentile(latencies, 99),
    totalTokens,
    totalToolCalls,
    avgThroughputTokensPerSec: Number(avgThroughputTokensPerSec.toFixed(3)),
    candidateAttempts,
    failovers,
    rateLimitSwitches,
    distinctProviders,
    distinctModels,
    audioOutputRuns,
    usedToolNames: uniqueToolNames(samples),
    verdict,
    errorReasons,
    outputPreview,
  };
}
|
|
1937
|
+
|
|
1938
|
+
/**
 * Collects the distinct tool names used across a set of run samples.
 *
 * @param samples - Samples whose `usedToolNames` are merged.
 * @returns The distinct names in default `sort()` order.
 */
function uniqueToolNames(samples: ScenarioRunSample[]): string[] {
  const distinct = new Set(samples.flatMap((sample) => sample.usedToolNames));
  return [...distinct].sort();
}
|
|
1947
|
+
|
|
1948
|
+
/**
 * Produces a short description of what a scenario feeds to the model.
 *
 * Sources are tried in this order: `inputPreview`, `prompt`, `inputText`, a
 * string `input` (returned even when empty), an array `input` joined with
 * `" | "`, and finally `audioFile`.
 *
 * @param scenario - Scenario to describe.
 * @returns The first applicable description, or `""` when none applies.
 */
function describeScenarioInput(scenario: BenchmarkScenario): string {
  const { inputPreview, prompt, inputText, input, audioFile } = scenario;

  const explicitText = inputPreview || prompt || inputText;
  if (explicitText) {
    return explicitText;
  }
  if (typeof input === "string") {
    return input;
  }
  if (Array.isArray(input)) {
    return input.join(" | ");
  }
  return audioFile || "";
}
|
|
1969
|
+
|
|
1970
|
+
/**
 * Derives a per-model capability matrix from the executed scenarios.
 *
 * Only executions whose scenario declares a `capability` contribute. For each
 * model the best finding per capability is kept, as decided by
 * `shouldReplaceFinding`; capabilities without a finding are reported as
 * `"unknown"` with confidence 0.
 *
 * @param effective - Effective config; supplies `run.capTtlDays` (7 when unset),
 *   `run.suite` and `profile`.
 * @param executions - Scenario executions of this run.
 * @returns The matrix with models sorted by name, or `undefined` when no
 *   execution declared a capability.
 */
function buildCapabilityMatrix(
  effective: EffectiveBenchmarkConfig,
  executions: ScenarioExecution[]
): BenchmarkCapabilityMatrix | undefined {
  type ProbeFinding = {
    status: BenchmarkCapabilityStatus;
    confidence: number;
    evidence: string;
    observedAt: string;
    scenarioId?: string;
    statusCode?: number;
  };
  type ModelRecord = {
    providerId: string;
    modelId: string;
    findings: Partial<Record<BenchmarkCapabilityKey, ProbeFinding>>;
    lastVerifiedAt: string;
  };

  const ttlDays = effective.run.capTtlDays ?? 7;
  const ttlMs = ttlDays * 24 * 60 * 60 * 1000;
  const byModel = new Map<string, ModelRecord>();

  // Pass 1: fold every capability probe into its model's record.
  for (const execution of executions) {
    const capability = execution.scenario.capability;
    if (!capability) {
      continue;
    }

    const { result, samples } = execution;
    const { providerId, modelId } = splitModelRef(result.model);
    const modelKey = `${providerId}/${modelId}`;
    const record: ModelRecord = byModel.get(modelKey) ?? {
      providerId,
      modelId,
      findings: {},
      lastVerifiedAt: new Date().toISOString(),
    };

    const status = classifyFromExecution(execution);
    const confidence = confidenceFromExecution(status, result);
    const primaryReason = result.errorReasons[0] ?? result.outputPreview;
    // Prefer a status code actually observed; otherwise 0 for skipped runs, 200 for the rest.
    const observedStatusCode = samples.find((sample) => sample.statusCode > 0)?.statusCode;
    const statusCode = observedStatusCode ?? (result.status === "skipped" ? 0 : 200);

    const candidate = {
      status,
      confidence,
      evidence: truncate(primaryReason || "No explicit evidence", 220),
      observedAt: new Date().toISOString(),
      scenarioId: execution.scenario.id,
      statusCode: statusCode > 0 ? statusCode : undefined,
    };

    const previous = record.findings[capability];
    const replaces =
      !previous ||
      shouldReplaceFinding(previous.status, candidate.status, previous.confidence, candidate.confidence);
    if (replaces) {
      record.findings[capability] = candidate;
    }

    record.lastVerifiedAt = new Date().toISOString();
    byModel.set(modelKey, record);
  }

  // Pass 2: expand each record into a snapshot covering every known capability.
  const models: BenchmarkModelCapabilitySnapshot[] = [];
  byModel.forEach((record, key) => {
    const findings = Object.fromEntries(
      BENCHMARK_CAPABILITY_KEYS.map((capability) => {
        const probed = record.findings[capability];
        const entry = probed
          ? {
              capability,
              status: probed.status,
              confidence: probed.confidence,
              evidence: probed.evidence,
              scenarioId: probed.scenarioId,
              statusCode: probed.statusCode,
              observedAt: probed.observedAt,
            }
          : {
              capability,
              status: "unknown" as const,
              confidence: 0,
              evidence: "No probe evidence in this run.",
              observedAt: record.lastVerifiedAt,
            };
        return [capability, entry];
      })
    ) as BenchmarkModelCapabilitySnapshot["findings"];

    const confidenceValues = Object.values(findings).map((finding) => finding.confidence);
    const confidenceSum = confidenceValues.reduce((sum, value) => sum + value, 0);
    const avgConfidence = confidenceValues.length > 0 ? confidenceSum / confidenceValues.length : 0;

    const expiresAt = new Date(Date.parse(record.lastVerifiedAt) + ttlMs).toISOString();
    const configFingerprint = computeConfigFingerprint({
      suite: effective.run.suite,
      model: key,
      profile: effective.profile,
    });

    models.push({
      model: key,
      providerId: record.providerId,
      modelId: record.modelId,
      configFingerprint,
      confidence: Number(avgConfidence.toFixed(3)),
      lastVerifiedAt: record.lastVerifiedAt,
      expiresAt,
      freshness: Date.now() <= Date.parse(expiresAt) ? "fresh" : "stale",
      findings,
    });
  });

  models.sort((a, b) => a.model.localeCompare(b.model));

  if (models.length === 0) {
    return undefined;
  }

  return {
    generatedAt: new Date().toISOString(),
    ttlDays,
    models,
  };
}
|
|
2098
|
+
|
|
2099
|
+
/**
 * Maps a scenario execution to a capability status.
 *
 * Skipped executions are `"unknown"` and successful ones `"supported"`. For
 * failures, the first failing sample (or the first sample if none is marked
 * failed) is handed to `classifyCapabilityStatus`.
 *
 * @param execution - Execution to classify.
 * @returns The capability status for the execution.
 */
function classifyFromExecution(execution: ScenarioExecution): BenchmarkCapabilityStatus {
  const { result, samples } = execution;

  if (result.status === "skipped") {
    return "unknown";
  }
  if (result.success) {
    return "supported";
  }

  const failing = samples.find((item) => !item.success) ?? samples[0];
  return classifyCapabilityStatus({
    success: false,
    statusCode: failing?.statusCode,
    error: failing?.error ?? result.errorReasons[0],
  });
}
|
|
2115
|
+
|
|
2116
|
+
/**
 * Assigns a confidence score to a capability status.
 *
 * @param status - Classified capability status.
 * @param result - Scenario result; its `passRate` is used for `"supported"`.
 * @returns The pass rate floored at 0.5 for `"supported"`, 0.9 for
 *   `"unsupported"` and `"misconfigured"`, and 0.4 otherwise.
 */
function confidenceFromExecution(status: BenchmarkCapabilityStatus, result: ScenarioResult): number {
  switch (status) {
    case "supported":
      return Math.max(0.5, result.passRate);
    case "unsupported":
    case "misconfigured":
      return 0.9;
    default:
      return 0.4;
  }
}
|
|
2125
|
+
|
|
2126
|
+
/**
 * Decides whether a new capability finding should supersede the current one.
 *
 * Statuses are ranked supported > unsupported > misconfigured > unknown. A
 * higher-ranked status always wins; between equal ranks the new finding wins
 * when its confidence is at least the current one.
 *
 * @param currentStatus - Status of the stored finding.
 * @param nextStatus - Status of the candidate finding.
 * @param currentConfidence - Confidence of the stored finding.
 * @param nextConfidence - Confidence of the candidate finding.
 * @returns `true` when the candidate should replace the stored finding.
 */
function shouldReplaceFinding(
  currentStatus: BenchmarkCapabilityStatus,
  nextStatus: BenchmarkCapabilityStatus,
  currentConfidence: number,
  nextConfidence: number
): boolean {
  const rankByStatus: Record<BenchmarkCapabilityStatus, number> = {
    supported: 4,
    unsupported: 3,
    misconfigured: 2,
    unknown: 1,
  };
  const currentRank = rankByStatus[currentStatus];
  const nextRank = rankByStatus[nextStatus];

  return nextRank === currentRank
    ? nextConfidence >= currentConfidence
    : nextRank > currentRank;
}
|
|
2149
|
+
|
|
2150
|
+
/**
 * Splits a `provider/model` reference at its first slash.
 *
 * @param model - Model reference to split.
 * @returns The provider and the remainder after the first slash. When there is
 *   no slash, or nothing precedes it, the provider is `"unknown"` and the
 *   whole input is returned as the model id.
 */
function splitModelRef(model: string): { providerId: string; modelId: string } {
  const slashAt = model.indexOf("/");
  // -1: no separator at all; 0: the provider segment would be empty.
  if (slashAt <= 0) {
    return { providerId: "unknown", modelId: model };
  }
  return {
    providerId: model.slice(0, slashAt),
    modelId: model.slice(slashAt + 1),
  };
}
|
|
2160
|
+
|
|
2161
|
+
/**
 * Aggregates scenario executions into a benchmark report (everything except `gateResults`).
 *
 * Counts and totals:
 * - `total` is every result; `executed` excludes results whose status is `"skipped"`.
 * - `succeeded` / `failed` count the `"passed"` / `"failed"` statuses.
 * - token and tool-call totals only include executed (non-skipped) results.
 * - latency and throughput statistics are computed over the samples of all executions.
 *
 * @param effective - Effective configuration; its profile, run settings, config source,
 *   defaults, profile settings and gates are copied into the report.
 * @param warnings - Warnings to attach to the report.
 * @param scenarioPath - Scenario file path, if any.
 * @param executions - Per-scenario executions (result, samples, example, exchanges).
 * @param capabilityMatrix - Capability matrix to attach, if any.
 * @param reportId - Report id to use; a random UUID is generated when omitted.
 * @returns The report without gate results.
 */
function buildReport(
  effective: EffectiveBenchmarkConfig,
  warnings: string[],
  scenarioPath: string | undefined,
  executions: ScenarioExecution[],
  capabilityMatrix: BenchmarkCapabilityMatrix | undefined,
  reportId?: string
): Omit<BenchmarkReport, "gateResults"> {
  const results = executions.map(({ result }) => result);
  const allSamples = executions.flatMap(({ samples }) => samples);

  // One pass over the results: status counters plus totals for executed scenarios.
  let skipped = 0;
  let succeeded = 0;
  let failed = 0;
  let totalTokens = 0;
  let totalToolCalls = 0;
  for (const result of results) {
    if (result.status === "skipped") {
      skipped += 1;
      continue;
    }
    totalTokens += result.totalTokens;
    totalToolCalls += result.totalToolCalls;
    if (result.status === "passed") {
      succeeded += 1;
    } else if (result.status === "failed") {
      failed += 1;
    }
  }
  const total = results.length;
  const executed = total - skipped;

  // Ascending latencies; the percentile helper is given this sorted array.
  const latencies = allSamples.map(({ latencyMs }) => latencyMs).sort((a, b) => a - b);
  let latencySum = 0;
  for (const latency of latencies) {
    latencySum += latency;
  }
  const avgLatencyMs = latencies.length === 0 ? 0 : Math.round(latencySum / latencies.length);

  let throughputSum = 0;
  for (const sample of allSamples) {
    throughputSum += sample.throughputTokensPerSec;
  }
  const avgThroughputTokensPerSec =
    allSamples.length === 0 ? 0 : throughputSum / allSamples.length;

  const topFailureReasons = collectTopFailureReasons(allSamples);
  const modeSummary = summarizeByMode(results);
  const { run } = effective;

  return {
    id: reportId ?? randomUUID(),
    createdAt: new Date().toISOString(),
    profile: effective.profile,
    executionMode: run.executionMode ?? "diagnostic",
    suite: run.suite,
    exampleId: run.exampleId,
    scenarioPath,
    modelOverride: run.modelOverride,
    configSource: effective.configSource,
    total,
    executed,
    skipped,
    succeeded,
    failed,
    successRate: executed > 0 ? Number((succeeded / executed).toFixed(4)) : 0,
    totalTokens,
    totalToolCalls,
    avgLatencyMs,
    p50LatencyMs: percentile(latencies, 50),
    p95LatencyMs: percentile(latencies, 95),
    p99LatencyMs: percentile(latencies, 99),
    avgThroughputTokensPerSec: Number(avgThroughputTokensPerSec.toFixed(3)),
    modeSummary,
    effectiveConfig: {
      defaults: effective.defaults,
      profileSettings: effective.profileSettings,
      gates: effective.gates,
    },
    results,
    scenarioDetails: executions.map(({ result, example, exchanges }) => ({
      id: result.id,
      suite: run.suite,
      example,
      model: result.model,
      status: result.status,
      verdict: result.verdict,
      exchanges,
      finalResponsePreview: result.outputPreview,
      usedToolNames: result.usedToolNames,
    })),
    scenarioRuns: executions.map(({ result, samples }) => ({
      id: result.id,
      samples,
    })),
    warnings,
    topFailureReasons,
    capabilityMatrix,
  };
}
|
|
2245
|
+
|
|
2246
|
+
/**
 * Builds the exchange event recorded for one request/response pair of a scenario.
 *
 * Both payloads are stored twice: a sanitized copy (also used for the 220-character
 * previews) and a serialized raw copy. A missing `toolTrace` becomes an empty array.
 *
 * @param args - Scenario, mode, model, request/response details and optional
 *   endpoint/tool-trace metadata.
 * @returns The assembled exchange event.
 */
function buildExchangeEvent(args: {
  scenario: BenchmarkScenario;
  mode: BenchmarkMode;
  model: string;
  requestPath: string;
  requestPayload: unknown;
  responsePayload: unknown;
  statusCode: number;
  contentType: string;
  endpointId?: string;
  endpointName?: string;
  upstreamModel?: string;
  toolTrace?: BenchmarkToolTraceStep[];
}): BenchmarkExchangeEvent {
  const { scenario, requestPayload, responsePayload } = args;

  const requestSanitized = sanitizeForTrace(requestPayload);
  const responseSanitized = sanitizeForTrace(responsePayload);
  const scenarioInput = describeScenarioInput(scenario);
  // Previews are derived from the sanitized copies, never from the raw payloads.
  const requestPreview = truncate(previewForTrace(requestSanitized), 220);
  const responsePreview = truncate(previewForTrace(responseSanitized), 220);

  return {
    scenarioInput,
    requestPreview,
    responsePreview,
    mode: args.mode,
    model: args.model,
    requestPath: args.requestPath,
    statusCode: args.statusCode,
    contentType: args.contentType,
    endpointId: args.endpointId,
    endpointName: args.endpointName,
    upstreamModel: args.upstreamModel,
    toolTrace: args.toolTrace ?? [],
    requestRaw: safeSerialize(requestPayload),
    requestSanitized,
    responseRaw: safeSerialize(responsePayload),
    responseSanitized,
  };
}
|
|
2281
|
+
|
|
2282
|
+
/**
 * Projects an exchange event onto its summary form.
 *
 * The summary is stamped with the current time and carries the sanitized payloads,
 * previews, routing metadata and tool trace; the event's raw payloads and scenario
 * input are not copied.
 *
 * @param event - The full exchange event.
 * @returns The summary for that event.
 */
function toExchangeSummary(event: BenchmarkExchangeEvent): BenchmarkExchangeSummary {
  const {
    mode,
    model,
    requestPath,
    statusCode,
    contentType,
    requestSanitized,
    responseSanitized,
    requestPreview,
    responsePreview,
    endpointId,
    endpointName,
    upstreamModel,
    toolTrace,
  } = event;
  const timestamp = new Date().toISOString();

  return {
    timestamp,
    mode,
    model,
    requestPath,
    statusCode,
    contentType,
    requestSanitized,
    responseSanitized,
    requestPreview,
    responsePreview,
    endpointId,
    endpointName,
    upstreamModel,
    toolTrace,
  };
}
|
|
2300
|
+
|
|
2301
|
+
/**
 * Builds the summary for a scenario, filling in defaults for optional fields.
 *
 * Defaults: the suite falls back to `"custom"`, the title to the scenario id, and the
 * example source to `"builtin"` when a suite is given and `"file"` otherwise. The
 * summary, goal and success-criteria texts fall back to fixed placeholder strings.
 *
 * @param scenario - The scenario to summarize.
 * @param suite - Name of the suite the scenario belongs to, if any.
 * @returns The scenario summary.
 */
function scenarioToSummary(
  scenario: BenchmarkScenario,
  suite?: string
): BenchmarkScenarioSummary {
  const fallbackSource = suite ? "builtin" : "file";
  const title = scenario.title ?? scenario.id;
  const summary = scenario.summary ?? "Benchmark scenario";
  const userVisibleGoal =
    scenario.userVisibleGoal ?? "Inspect the exact request, response, and final verdict.";
  const exampleSource = scenario.exampleSource ?? fallbackSource;
  const inputPreview = describeScenarioInput(scenario);
  const successCriteria = scenario.successCriteria ?? "All configured assertions pass.";
  const expectedHighlights = scenario.expectedHighlights ?? [];

  return {
    id: scenario.id,
    suite: suite ?? "custom",
    mode: scenario.mode,
    title,
    summary,
    userVisibleGoal,
    exampleSource,
    inputPreview,
    successCriteria,
    expectedHighlights,
    // Only an explicit `true` counts; undefined or any other value becomes false.
    requiresAvailableTools: scenario.requiresAvailableTools === true,
    model: scenario.model,
  };
}
|
|
2321
|
+
|
|
2322
|
+
/**
 * Flattens tool calls and tool results into a single trace.
 *
 * All `tool_call` steps come first (in call order), followed by all `tool_result`
 * steps (in result order). Calls without a function name are dropped. Result content
 * is sanitized and rendered as preview text.
 *
 * @param toolCalls - Tool calls requested by the model.
 * @param toolResults - Results produced for those calls.
 * @returns The combined trace steps.
 */
function buildToolTrace(
  toolCalls: ChatToolCall[],
  toolResults: Array<{ name: string; toolCallId?: string; content: unknown }>
): BenchmarkToolTraceStep[] {
  const callSteps = toolCalls.flatMap((call): BenchmarkToolTraceStep[] => {
    const toolName = call.function?.name;
    return toolName
      ? [
          {
            kind: "tool_call",
            toolName,
            toolCallId: call.id,
            argumentsText: call.function?.arguments,
          },
        ]
      : [];
  });

  const resultSteps = toolResults.map(
    ({ name, toolCallId, content }): BenchmarkToolTraceStep => ({
      kind: "tool_result",
      toolName: name,
      toolCallId,
      contentText: previewForTrace(sanitizeForTrace(content)),
    })
  );

  return [...callSteps, ...resultSteps];
}
|
|
2349
|
+
|
|
2350
|
+
/**
 * Returns a JSON-safe deep copy of `value` via a stringify/parse round trip.
 *
 * - `undefined` is returned as `undefined`. Previously `JSON.stringify(undefined)`
 *   produced `undefined`, `JSON.parse` then threw, and an absent payload was recorded
 *   as `{ preview: "undefined" }`.
 * - Values JSON cannot represent (circular structures, BigInt, functions, symbols)
 *   fall back to `{ preview: String(value) }`.
 *
 * @param value - Arbitrary payload to copy.
 * @returns The JSON round-tripped copy, `undefined`, or a `{ preview }` fallback.
 */
function safeSerialize(value: unknown): unknown {
  if (value === undefined) {
    return undefined;
  }
  try {
    const encoded: string | undefined = JSON.stringify(value);
    // stringify yields undefined (not a string) for functions and symbols.
    if (encoded === undefined) {
      return { preview: String(value) };
    }
    return JSON.parse(encoded);
  } catch {
    return { preview: String(value) };
  }
}
|
|
2357
|
+
|
|
2358
|
+
/**
 * Produces a trace-safe copy of an arbitrary payload.
 *
 * Rules, applied recursively:
 * - nesting deeper than 6 levels is replaced by `"[truncated-depth]"`;
 * - `null`, `undefined`, numbers and booleans pass through unchanged;
 * - strings are trimmed; base64-looking strings longer than 64 characters and data
 *   URLs longer than 80 characters are replaced by a short placeholder; any other
 *   string longer than 500 characters is cut to 500 characters plus `…`;
 * - arrays with more than 50 items become `{ summary, sample }` holding the first 10
 *   sanitized items;
 * - object entries whose key matches api key / authorization / token / secret are
 *   masked as `"***"`; `embedding` arrays are summarized with their first 8 values;
 * - anything else is converted with `String()`.
 *
 * @param value - Payload to sanitize.
 * @param depth - Current recursion depth (callers normally omit it).
 * @returns The sanitized copy.
 */
function sanitizeForTrace(value: unknown, depth = 0): unknown {
  if (depth > 6) {
    return "[truncated-depth]";
  }
  if (value === null || value === undefined) {
    return value;
  }
  if (typeof value === "string") {
    const trimmed = value.trim();
    if (looksLikeBase64(trimmed) && trimmed.length > 64) {
      return `<base64 omitted len=${trimmed.length}>`;
    }
    if (trimmed.startsWith("data:") && trimmed.length > 80) {
      // The media type ends at the first ";" (parameters) or "," (start of the data).
      // Searching only for ";" made indexOf return -1 for URLs such as
      // "data:text/plain,…", and slice(5, -1) then copied the payload into the
      // placeholder that is supposed to omit it.
      const mime = /^data:([^;,]*)[;,]/.exec(trimmed)?.[1] || "unknown";
      return `<data-url ${mime} omitted len=${trimmed.length}>`;
    }
    if (trimmed.length > 500) {
      return `${trimmed.slice(0, 500)}…`;
    }
    return trimmed;
  }
  if (typeof value === "number" || typeof value === "boolean") {
    return value;
  }
  if (Array.isArray(value)) {
    if (value.length > 50) {
      return {
        summary: `array(${value.length})`,
        sample: value.slice(0, 10).map((item) => sanitizeForTrace(item, depth + 1)),
      };
    }
    return value.map((item) => sanitizeForTrace(item, depth + 1));
  }
  if (typeof value === "object") {
    const out: Record<string, unknown> = {};
    for (const [key, item] of Object.entries(value as Record<string, unknown>)) {
      // NOTE(review): the "token" alternative also matches counter keys such as
      // `max_tokens` or `totalTokens`, so those values are masked too — confirm intended.
      if (/(api[-_]?key|authorization|token|secret)/i.test(key)) {
        out[key] = "***";
        continue;
      }
      if (key === "embedding" && Array.isArray(item)) {
        out[key] = {
          summary: `vector(${item.length})`,
          sample: item.slice(0, 8),
        };
        continue;
      }
      out[key] = sanitizeForTrace(item, depth + 1);
    }
    return out;
  }
  return String(value);
}
|
|
2411
|
+
|
|
2412
|
+
/**
 * Heuristically detects a standard-alphabet base64 string.
 *
 * A value qualifies when it is at least 32 characters long, its length is a multiple
 * of 4, and it consists of base64 alphabet characters followed by at most two `=`
 * padding characters. Padding is only accepted at the end; the previous pattern
 * allowed `=` anywhere, so strings such as `ab=c…` were reported as base64.
 *
 * @param value - Candidate string (callers pass an already trimmed value).
 * @returns `true` when the string looks like base64 data.
 */
function looksLikeBase64(value: string): boolean {
  const hasBase64Length = value.length >= 32 && value.length % 4 === 0;
  return hasBase64Length && /^[A-Za-z0-9+/]+={0,2}$/.test(value);
}
|
|
2418
|
+
|
|
2419
|
+
/**
 * Renders a value as a single string for trace previews.
 *
 * Strings are returned unchanged; everything else is JSON-encoded. The result is
 * always a string: `JSON.stringify` returns `undefined` for `undefined`, functions
 * and symbols, and throws for circular structures and BigInt, so all of those fall
 * back to `String(value)`. Previously the `undefined` result leaked out of this
 * `string`-typed function and made length-based callers throw.
 *
 * @param value - Value to preview (typically an already sanitized payload).
 * @returns The preview text.
 */
function previewForTrace(value: unknown): string {
  if (typeof value === "string") {
    return value;
  }
  try {
    const encoded: string | undefined = JSON.stringify(value);
    return encoded ?? String(value);
  } catch {
    return String(value);
  }
}
|
|
2429
|
+
|
|
2430
|
+
/**
 * Forwards a progress event to the `onEvent` hook when one is registered.
 *
 * Does nothing when `hooks` is absent or has no `onEvent` handler.
 *
 * @param hooks - Optional run hooks.
 * @param event - Progress event to deliver.
 */
function emitEvent(hooks: BenchmarkRunHooks | undefined, event: BenchmarkProgressEvent): void {
  if (hooks == null) {
    return;
  }
  hooks.onEvent?.(event);
}
|
|
2433
|
+
|
|
2434
|
+
/**
 * Tallies scenario results per benchmark mode.
 *
 * Every mode in `BENCHMARK_MODES` starts with zeroed counters. Each result increments
 * its mode's `total`; skipped results only add to `skipped`, while every other result
 * adds to `executed` and to either `passed` (status `"passed"`) or `failed` (any other
 * status).
 *
 * @param results - Scenario results to tally.
 * @returns Per-mode counters.
 */
function summarizeByMode(
  results: ScenarioResult[]
): BenchmarkReport["modeSummary"] {
  const emptyRow = () => ({ total: 0, executed: 0, skipped: 0, passed: 0, failed: 0 });
  const summary = Object.fromEntries(
    BENCHMARK_MODES.map((mode) => [mode, emptyRow()])
  ) as BenchmarkReport["modeSummary"];

  for (const { mode, status } of results) {
    const row = summary[mode];
    row.total += 1;
    switch (status) {
      case "skipped":
        row.skipped += 1;
        break;
      case "passed":
        row.executed += 1;
        row.passed += 1;
        break;
      default:
        row.executed += 1;
        row.failed += 1;
        break;
    }
  }

  return summary;
}
|
|
2461
|
+
|
|
2462
|
+
/**
 * Returns the five most frequent sample error messages.
 *
 * Samples without a (non-empty) `error` are ignored. Reasons are ordered by descending
 * count; reasons with equal counts keep the order in which they were first seen.
 *
 * @param samples - Run samples to inspect.
 * @returns Up to five `{ reason, count }` entries.
 */
function collectTopFailureReasons(samples: ScenarioRunSample[]): Array<{ reason: string; count: number }> {
  const tally = new Map<string, number>();
  samples.forEach(({ error }) => {
    if (error) {
      tally.set(error, (tally.get(error) ?? 0) + 1);
    }
  });

  const ranked = Array.from(tally, ([reason, count]) => ({ reason, count }));
  ranked.sort((left, right) => right.count - left.count);
  return ranked.slice(0, 5);
}
|
|
2476
|
+
|
|
2477
|
+
/**
 * Returns a shallow copy of `source` without the entries whose value is `undefined`.
 *
 * Other falsy values (`null`, `0`, `""`, `false`) are kept.
 *
 * @param source - Object to compact.
 * @returns A new object holding only the defined entries.
 */
function compactObject(source: Record<string, unknown>): Record<string, unknown> {
  const kept: Array<[string, unknown]> = [];
  for (const key of Object.keys(source)) {
    const value = source[key];
    if (value !== undefined) {
      kept.push([key, value]);
    }
  }
  return Object.fromEntries(kept);
}
|
|
2482
|
+
|
|
2483
|
+
/**
 * Nearest-rank percentile of an ascending-sorted array.
 *
 * The rank index is clamped to the array bounds, so `p <= 0` yields the first element
 * and `p >= 100` the last. Previously only the lower bound was clamped and `p > 100`
 * indexed past the end, returning `undefined` from a `number`-typed function. An empty
 * array, or a `p` that is not a number, yields 0.
 *
 * @param sorted - Values sorted in ascending order.
 * @param p - Percentile in the 0–100 range.
 * @returns The value at that percentile.
 */
function percentile(sorted: number[], p: number): number {
  const count = sorted.length;
  if (count === 0) {
    return 0;
  }
  const rank = Math.ceil((p / 100) * count) - 1;
  const idx = Math.min(count - 1, Math.max(0, rank));
  // A NaN index (NaN `p`) finds no element; report 0 rather than undefined.
  return sorted[idx] ?? 0;
}
|
|
2490
|
+
|
|
2491
|
+
/**
 * Shortens `text` to at most `maxLength` characters, ending with `…` when it was cut.
 *
 * Text that already fits is returned unchanged. A non-positive `maxLength` yields an
 * empty string for text that does not fit; previously `slice(0, maxLength - 1)` received
 * a negative end index there and returned almost the whole text plus an ellipsis.
 *
 * @param text - Text to shorten.
 * @param maxLength - Maximum length of the result, ellipsis included.
 * @returns The original or shortened text.
 */
function truncate(text: string, maxLength: number): string {
  if (text.length <= maxLength) {
    return text;
  }
  if (maxLength <= 0) {
    return "";
  }
  return `${text.slice(0, maxLength - 1)}…`;
}
|
|
2497
|
+
|
|
2498
|
+
/**
 * Computes throughput in tokens per second from a token count and a latency.
 *
 * @param tokens - Number of tokens produced.
 * @param latencyMs - Elapsed time in milliseconds.
 * @returns `tokens * 1000 / latencyMs`, or 0 when either input is zero or negative.
 */
function calculateThroughput(tokens: number, latencyMs: number): number {
  const noTokens = tokens <= 0;
  const noDuration = latencyMs <= 0;
  return noTokens || noDuration ? 0 : (tokens * 1000) / latencyMs;
}
|
|
2504
|
+
|
|
2505
|
+
/**
 * Races a promise against a timer.
 *
 * Resolves or rejects with the outcome of `promise` if it settles first; otherwise
 * rejects with `new Error(message)` once `timeoutMs` has elapsed. The timer is cleared
 * in every case. The wrapped promise itself is not cancelled.
 *
 * @param promise - Operation to wait for.
 * @param timeoutMs - Time budget in milliseconds.
 * @param message - Error message used when the budget is exceeded.
 * @returns The value `promise` resolves with.
 * @throws Error with `message` when the timer fires first.
 */
async function withTimeout<T>(promise: Promise<T>, timeoutMs: number, message: string): Promise<T> {
  let timer: NodeJS.Timeout | null = null;
  const deadline = new Promise<T>((_resolve, reject) => {
    timer = setTimeout(() => {
      reject(new Error(message));
    }, timeoutMs);
  });

  try {
    return await Promise.race([promise, deadline]);
  } finally {
    // Always release the timer so a settled race does not keep the event loop alive.
    if (timer) {
      clearTimeout(timer);
    }
  }
}
|