waypoi 0.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.github/instructions/ui.instructions.md +42 -0
- package/.github/workflows/ci.yml +35 -0
- package/.github/workflows/publish.yml +71 -0
- package/.github/workflows/release.yml +48 -0
- package/.playwright-mcp/console-2026-04-04T01-41-10-746Z.log +2 -0
- package/.playwright-mcp/console-2026-04-04T01-41-28-799Z.log +3 -0
- package/.playwright-mcp/console-2026-04-05T02-26-51-909Z.log +76 -0
- package/.playwright-mcp/page-2026-04-04T01-41-10-816Z.yml +1 -0
- package/.playwright-mcp/page-2026-04-04T01-41-29-141Z.yml +77 -0
- package/.playwright-mcp/page-2026-04-04T01-41-42-633Z.yml +190 -0
- package/.playwright-mcp/page-2026-04-04T01-42-03-929Z.yml +262 -0
- package/.playwright-mcp/page-2026-04-04T02-12-54-813Z.yml +6 -0
- package/.playwright-mcp/page-2026-04-04T02-14-58-600Z.yml +190 -0
- package/.playwright-mcp/page-2026-04-04T02-15-03-923Z.yml +190 -0
- package/.playwright-mcp/page-2026-04-04T02-15-07-426Z.yml +190 -0
- package/.playwright-mcp/page-2026-04-04T02-15-25-729Z.yml +262 -0
- package/.playwright-mcp/page-2026-04-04T02-16-22-984Z.yml +262 -0
- package/.playwright-mcp/page-2026-04-04T02-17-00-599Z.yml +190 -0
- package/.playwright-mcp/page-2026-04-04T02-17-50-874Z.yml +190 -0
- package/.playwright-mcp/page-2026-04-05T02-26-55-570Z.yml +6 -0
- package/AGENTS.md +48 -0
- package/CHANGELOG.md +131 -0
- package/README.md +552 -0
- package/assets/agent-mode.png +0 -0
- package/assets/categorize.png +0 -0
- package/assets/dashboard.png +0 -0
- package/assets/endpoint-proxy.png +0 -0
- package/assets/icon.png +0 -0
- package/assets/mcp-generate-image.png +0 -0
- package/assets/mcp-understand-image.png +0 -0
- package/assets/peek-token-flow.png +0 -0
- package/assets/playground.png +0 -0
- package/assets/sankey.png +0 -0
- package/cli/index.ts +2805 -0
- package/cli/legacyRewrite.ts +108 -0
- package/cli/modelRef.ts +24 -0
- package/dist/cli/index.js +2536 -0
- package/dist/cli/legacyRewrite.js +92 -0
- package/dist/cli/modelRef.js +20 -0
- package/dist/src/benchmark/artifacts.js +131 -0
- package/dist/src/benchmark/capabilityClassifier.js +81 -0
- package/dist/src/benchmark/capabilityStore.js +144 -0
- package/dist/src/benchmark/config.js +238 -0
- package/dist/src/benchmark/gates.js +118 -0
- package/dist/src/benchmark/jobs.js +252 -0
- package/dist/src/benchmark/runner.js +1847 -0
- package/dist/src/benchmark/schema.js +353 -0
- package/dist/src/benchmark/suites.js +314 -0
- package/dist/src/benchmark/tinyQaDataset.js +422 -0
- package/dist/src/benchmark/types.js +25 -0
- package/dist/src/config.js +47 -0
- package/dist/src/index.js +178 -0
- package/dist/src/mcp/client.js +215 -0
- package/dist/src/mcp/discovery.js +226 -0
- package/dist/src/mcp/policy.js +65 -0
- package/dist/src/mcp/registry.js +129 -0
- package/dist/src/mcp/service.js +460 -0
- package/dist/src/middleware/auth.js +179 -0
- package/dist/src/middleware/requestCapture.js +192 -0
- package/dist/src/middleware/requestStats.js +118 -0
- package/dist/src/pools/builder.js +132 -0
- package/dist/src/pools/repository.js +69 -0
- package/dist/src/pools/scheduler.js +360 -0
- package/dist/src/pools/types.js +2 -0
- package/dist/src/protocols/adapters/dashscope.js +267 -0
- package/dist/src/protocols/adapters/inferenceV2.js +346 -0
- package/dist/src/protocols/adapters/openai.js +27 -0
- package/dist/src/protocols/registry.js +99 -0
- package/dist/src/protocols/types.js +2 -0
- package/dist/src/providers/health.js +153 -0
- package/dist/src/providers/importer.js +289 -0
- package/dist/src/providers/modelRegistry.js +313 -0
- package/dist/src/providers/repository.js +361 -0
- package/dist/src/providers/types.js +2 -0
- package/dist/src/routes/admin.js +531 -0
- package/dist/src/routes/audio.js +295 -0
- package/dist/src/routes/chat.js +240 -0
- package/dist/src/routes/embeddings.js +157 -0
- package/dist/src/routes/images.js +288 -0
- package/dist/src/routes/mcp.js +256 -0
- package/dist/src/routes/mcpService.js +100 -0
- package/dist/src/routes/models.js +48 -0
- package/dist/src/routes/responses.js +711 -0
- package/dist/src/routes/sessions.js +450 -0
- package/dist/src/routes/stats.js +270 -0
- package/dist/src/routes/ui.js +97 -0
- package/dist/src/routes/videos.js +107 -0
- package/dist/src/routing/router.js +338 -0
- package/dist/src/services/imageGeneration.js +280 -0
- package/dist/src/services/imageUnderstanding.js +352 -0
- package/dist/src/services/videoGeneration.js +79 -0
- package/dist/src/storage/captureRepository.js +1591 -0
- package/dist/src/storage/files.js +157 -0
- package/dist/src/storage/imageCache.js +346 -0
- package/dist/src/storage/repositories.js +388 -0
- package/dist/src/storage/sessionRepository.js +370 -0
- package/dist/src/storage/statsRepository.js +204 -0
- package/dist/src/transport/httpClient.js +126 -0
- package/dist/src/types.js +2 -0
- package/dist/src/utils/messageMedia.js +285 -0
- package/dist/src/utils/modelCapabilities.js +108 -0
- package/dist/src/utils/modelDiscovery.js +170 -0
- package/dist/src/version.js +5 -0
- package/dist/src/workers/captureRetention.js +25 -0
- package/dist/src/workers/configWatcher.js +91 -0
- package/dist/src/workers/healthChecker.js +21 -0
- package/dist/src/workers/statsRotation.js +41 -0
- package/docs/LLM/output_schema.md +312 -0
- package/docs/benchmark.md +208 -0
- package/docs/mcp-guidelines.md +125 -0
- package/docs/mcp-service.md +178 -0
- package/docs/opencode.md +86 -0
- package/docs/providers.md +79 -0
- package/examples/benchmark.config.yaml +28 -0
- package/examples/providers/alibaba-dashscope.yaml +88 -0
- package/examples/providers/alibaba-llm.yaml +64 -0
- package/examples/providers/alibaba-registry.yaml +7 -0
- package/examples/providers/inference-v2-ray.yaml +29 -0
- package/examples/scenarios/assets/omni-call-sample.wav +0 -0
- package/examples/scenarios/custom.jsonl +5 -0
- package/examples/scenarios/custom.yaml +40 -0
- package/model-form-v2.png +0 -0
- package/package.json +66 -0
- package/provider-form-v2.png +0 -0
- package/provider-form.png +0 -0
- package/scripts/manual-test.sh +11 -0
- package/scripts/version-from-git.js +23 -0
- package/src/benchmark/artifacts.ts +149 -0
- package/src/benchmark/capabilityClassifier.ts +99 -0
- package/src/benchmark/capabilityStore.ts +174 -0
- package/src/benchmark/config.ts +337 -0
- package/src/benchmark/gates.ts +164 -0
- package/src/benchmark/jobs.ts +312 -0
- package/src/benchmark/runner.ts +2519 -0
- package/src/benchmark/schema.ts +443 -0
- package/src/benchmark/suites.ts +323 -0
- package/src/benchmark/tinyQaDataset.ts +428 -0
- package/src/benchmark/types.ts +442 -0
- package/src/config.ts +44 -0
- package/src/index.ts +195 -0
- package/src/mcp/client.ts +305 -0
- package/src/mcp/discovery.ts +266 -0
- package/src/mcp/policy.ts +105 -0
- package/src/mcp/registry.ts +164 -0
- package/src/mcp/service.ts +611 -0
- package/src/middleware/auth.ts +251 -0
- package/src/middleware/requestCapture.ts +245 -0
- package/src/middleware/requestStats.ts +163 -0
- package/src/pools/builder.ts +159 -0
- package/src/pools/repository.ts +71 -0
- package/src/pools/scheduler.ts +425 -0
- package/src/pools/types.ts +117 -0
- package/src/protocols/adapters/dashscope.ts +335 -0
- package/src/protocols/adapters/inferenceV2.ts +428 -0
- package/src/protocols/adapters/openai.ts +32 -0
- package/src/protocols/registry.ts +117 -0
- package/src/protocols/types.ts +81 -0
- package/src/providers/health.ts +207 -0
- package/src/providers/importer.ts +402 -0
- package/src/providers/modelRegistry.ts +415 -0
- package/src/providers/repository.ts +439 -0
- package/src/providers/types.ts +113 -0
- package/src/routes/admin.ts +666 -0
- package/src/routes/audio.ts +372 -0
- package/src/routes/chat.ts +301 -0
- package/src/routes/embeddings.ts +197 -0
- package/src/routes/images.ts +356 -0
- package/src/routes/mcp.ts +320 -0
- package/src/routes/mcpService.ts +114 -0
- package/src/routes/models.ts +50 -0
- package/src/routes/responses.ts +872 -0
- package/src/routes/sessions.ts +558 -0
- package/src/routes/stats.ts +312 -0
- package/src/routes/ui.ts +96 -0
- package/src/routes/videos.ts +132 -0
- package/src/routing/router.ts +501 -0
- package/src/services/imageGeneration.ts +396 -0
- package/src/services/imageUnderstanding.ts +449 -0
- package/src/services/videoGeneration.ts +127 -0
- package/src/storage/captureRepository.ts +1835 -0
- package/src/storage/files.ts +178 -0
- package/src/storage/imageCache.ts +405 -0
- package/src/storage/repositories.ts +494 -0
- package/src/storage/sessionRepository.ts +419 -0
- package/src/storage/statsRepository.ts +238 -0
- package/src/transport/httpClient.ts +145 -0
- package/src/types.ts +322 -0
- package/src/utils/messageMedia.ts +293 -0
- package/src/utils/modelCapabilities.ts +161 -0
- package/src/utils/modelDiscovery.ts +203 -0
- package/src/workers/captureRetention.ts +25 -0
- package/src/workers/configWatcher.ts +115 -0
- package/src/workers/healthChecker.ts +22 -0
- package/src/workers/statsRotation.ts +49 -0
- package/tests/benchmarkAdminRoutes.test.ts +82 -0
- package/tests/benchmarkBasics.test.ts +116 -0
- package/tests/captureAdminRoutes.test.ts +420 -0
- package/tests/captureRepository.test.ts +797 -0
- package/tests/cliLegacyRewrite.test.ts +45 -0
- package/tests/imageGeneration.service.test.ts +107 -0
- package/tests/imageUnderstanding.service.test.ts +123 -0
- package/tests/mcpPolicy.test.ts +105 -0
- package/tests/mcpService.test.ts +1245 -0
- package/tests/modelRef.test.ts +23 -0
- package/tests/modelsRoutes.test.ts +154 -0
- package/tests/sessionMediaCache.test.ts +167 -0
- package/tests/statsRoutes.test.ts +323 -0
- package/tsconfig.json +15 -0
- package/ui/index.html +16 -0
- package/ui/package-lock.json +8521 -0
- package/ui/package.json +52 -0
- package/ui/postcss.config.js +6 -0
- package/ui/public/assets/apple-touch-icon.png +0 -0
- package/ui/public/assets/favicon-16.png +0 -0
- package/ui/public/assets/favicon-32.png +0 -0
- package/ui/public/assets/icon-192.png +0 -0
- package/ui/public/assets/icon-512.png +0 -0
- package/ui/src/App.tsx +27 -0
- package/ui/src/api/client.ts +1503 -0
- package/ui/src/components/EndpointUsageGuide.tsx +361 -0
- package/ui/src/components/Layout.tsx +124 -0
- package/ui/src/components/MessageContent.tsx +365 -0
- package/ui/src/components/ToolCallMessage.tsx +179 -0
- package/ui/src/components/ToolPicker.tsx +442 -0
- package/ui/src/components/messageContentParser.test.ts +41 -0
- package/ui/src/components/messageContentParser.ts +73 -0
- package/ui/src/components/thinkingPreview.test.ts +27 -0
- package/ui/src/components/thinkingPreview.ts +15 -0
- package/ui/src/components/toMermaidSankey.test.ts +78 -0
- package/ui/src/components/toMermaidSankey.ts +56 -0
- package/ui/src/components/ui/button.tsx +58 -0
- package/ui/src/components/ui/input.tsx +21 -0
- package/ui/src/components/ui/textarea.tsx +21 -0
- package/ui/src/lib/utils.ts +6 -0
- package/ui/src/main.tsx +9 -0
- package/ui/src/pages/AgentPlayground.tsx +2010 -0
- package/ui/src/pages/Benchmark.tsx +988 -0
- package/ui/src/pages/Dashboard.tsx +581 -0
- package/ui/src/pages/Peek.tsx +962 -0
- package/ui/src/pages/Settings.tsx +2013 -0
- package/ui/src/pages/agentPlaygroundPayload.test.ts +109 -0
- package/ui/src/pages/agentPlaygroundPayload.ts +97 -0
- package/ui/src/pages/agentThinkingContent.test.ts +50 -0
- package/ui/src/pages/agentThinkingContent.ts +57 -0
- package/ui/src/pages/dashboardTokenUsage.test.ts +66 -0
- package/ui/src/pages/dashboardTokenUsage.ts +36 -0
- package/ui/src/pages/imageUpload.test.ts +39 -0
- package/ui/src/pages/imageUpload.ts +71 -0
- package/ui/src/pages/peekFilters.test.ts +29 -0
- package/ui/src/pages/peekFilters.ts +13 -0
- package/ui/src/pages/peekMedia.test.ts +58 -0
- package/ui/src/pages/peekMedia.ts +148 -0
- package/ui/src/pages/sessionAutoTitle.test.ts +128 -0
- package/ui/src/pages/sessionAutoTitle.ts +106 -0
- package/ui/src/stores/settings.ts +58 -0
- package/ui/src/styles/globals.css +223 -0
- package/ui/src/vite-env.d.ts +8 -0
- package/ui/tailwind.config.js +106 -0
- package/ui/tsconfig.json +32 -0
- package/ui/vite.config.ts +37 -0
|
@@ -0,0 +1,1847 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
var __importDefault = (this && this.__importDefault) || function (mod) {
|
|
3
|
+
return (mod && mod.__esModule) ? mod : { "default": mod };
|
|
4
|
+
};
|
|
5
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
6
|
+
exports.listBenchmarkExamples = listBenchmarkExamples;
|
|
7
|
+
exports.runBenchmark = runBenchmark;
|
|
8
|
+
const fs_1 = require("fs");
|
|
9
|
+
const path_1 = __importDefault(require("path"));
|
|
10
|
+
const crypto_1 = require("crypto");
|
|
11
|
+
const yaml_1 = __importDefault(require("yaml"));
|
|
12
|
+
const router_1 = require("../routing/router");
|
|
13
|
+
const repositories_1 = require("../storage/repositories");
|
|
14
|
+
const discovery_1 = require("../mcp/discovery");
|
|
15
|
+
const artifacts_1 = require("./artifacts");
|
|
16
|
+
const capabilityClassifier_1 = require("./capabilityClassifier");
|
|
17
|
+
const capabilityStore_1 = require("./capabilityStore");
|
|
18
|
+
const config_1 = require("./config");
|
|
19
|
+
const gates_1 = require("./gates");
|
|
20
|
+
const schema_1 = require("./schema");
|
|
21
|
+
const suites_1 = require("./suites");
|
|
22
|
+
const repository_1 = require("../providers/repository");
|
|
23
|
+
const types_1 = require("./types");
|
|
24
|
+
/**
 * Lists the example scenarios of a benchmark suite.
 *
 * @param {string} [suite="showcase"] - Suite name forwarded to the suites module.
 * @returns {*} Whatever `listSuiteExamples` returns for that suite.
 */
function listBenchmarkExamples(suite = "showcase") {
    const { listSuiteExamples } = suites_1;
    return listSuiteExamples(suite);
}
|
|
27
|
+
/**
 * Runs a benchmark end to end and reports progress through the optional hooks.
 *
 * Steps: resolve the effective configuration, load the scenarios, run MCP tool
 * discovery when at least one scenario uses "agent" mode (a discovery failure is
 * recorded as a warning and does not abort the run), execute every scenario in
 * order, build the capability matrix and report, evaluate gates, write the
 * artifacts, and disconnect the MCP servers.
 *
 * @param paths - Passed through to configuration, discovery, execution and artifact helpers.
 * @param options - Passed to `resolveBenchmarkConfig`.
 * @param hooks - Optional; `hooks.runId` is attached to the emitted events.
 * @returns {Promise<{report: object, artifactPath: *, textArtifactPath: *}>}
 *   The final report (including gate results) and the written artifact paths.
 * @throws {Error} When no scenarios were loaded. Errors raised while executing,
 *   reporting or writing artifacts are rethrown after the MCP servers have been
 *   disconnected.
 */
async function runBenchmark(paths, options, hooks) {
    const effective = await (0, config_1.resolveBenchmarkConfig)(paths, options);
    const loaded = await loadScenarios(paths, effective);
    const runId = hooks?.runId;
    if (loaded.scenarios.length === 0) {
        throw new Error("No benchmark scenarios found. Use --suite and/or --scenario.");
    }
    const totalScenarios = loaded.scenarios.length;
    emitEvent(hooks, {
        type: "run_started",
        timestamp: new Date().toISOString(),
        runId,
        totalScenarios,
    });
    const warnings = [...loaded.warnings];
    for (const warning of loaded.warnings) {
        emitEvent(hooks, {
            type: "warning",
            timestamp: new Date().toISOString(),
            runId,
            warning,
        });
    }
    const hasAgentScenarios = loaded.scenarios.some((scenario) => scenario.mode === "agent");
    if (hasAgentScenarios) {
        try {
            await (0, discovery_1.discoverAllTools)(paths);
        }
        catch (error) {
            // Discovery problems are deliberately non-fatal: the run continues and the
            // failure is surfaced as a warning.
            const warning = `MCP discovery failed for benchmark: ${(0, discovery_1.summarizeMcpError)(error)}`;
            warnings.push(warning);
            emitEvent(hooks, {
                type: "warning",
                timestamp: new Date().toISOString(),
                runId,
                warning,
            });
            if (process.env.WAYPOI_DEBUG_ERRORS === "1") {
                console.error(error);
            }
        }
    }
    let report;
    let artifacts;
    try {
        const executions = [];
        for (const [scenarioIndex, scenario] of loaded.scenarios.entries()) {
            // Fields shared by every event emitted for this scenario.
            const position = {
                scenarioId: scenario.id,
                scenarioIndex: scenarioIndex + 1,
                totalScenarios,
            };
            emitEvent(hooks, {
                type: "scenario_started",
                timestamp: new Date().toISOString(),
                runId,
                ...position,
                scenario: scenarioToSummary(scenario, effective.run.suite),
            });
            const execution = await runScenarioWithSampling(paths, scenario, effective, (sample, runIndex, phase, totalRuns) => {
                emitEvent(hooks, {
                    type: "sample_completed",
                    timestamp: new Date().toISOString(),
                    runId,
                    ...position,
                    runIndex,
                    totalRuns,
                    phase,
                    sample,
                });
            }, (exchange, runIndex, phase, totalRuns) => {
                emitEvent(hooks, {
                    type: "exchange",
                    timestamp: new Date().toISOString(),
                    runId,
                    ...position,
                    runIndex,
                    totalRuns,
                    phase,
                    exchange,
                });
            });
            warnings.push(...execution.warnings);
            for (const warning of execution.warnings) {
                emitEvent(hooks, {
                    type: "warning",
                    timestamp: new Date().toISOString(),
                    runId,
                    ...position,
                    warning,
                });
            }
            emitEvent(hooks, {
                type: "scenario_completed",
                timestamp: new Date().toISOString(),
                runId,
                ...position,
                result: execution.result,
            });
            executions.push(execution);
        }
        const capabilityMatrix = buildCapabilityMatrix(effective, executions);
        if (effective.run.updateCapCache && capabilityMatrix && capabilityMatrix.models.length > 0) {
            await (0, capabilityStore_1.writeCapabilitySnapshots)(paths, capabilityMatrix.models);
        }
        const reportBase = buildReport(effective, warnings, loaded.scenarioPath, executions, capabilityMatrix, runId);
        const gateResults = await (0, gates_1.evaluateGates)(reportBase, effective);
        report = {
            ...reportBase,
            gateResults,
        };
        artifacts = await (0, artifacts_1.writeBenchmarkArtifacts)(paths, report, effective.run.outPath);
    }
    catch (error) {
        // Release MCP connections on the failure path as well. A disconnect failure
        // here must not replace the error that actually aborted the run.
        try {
            await (0, discovery_1.disconnectAllServers)();
        }
        catch (disconnectError) {
            if (process.env.WAYPOI_DEBUG_ERRORS === "1") {
                console.error(disconnectError);
            }
        }
        throw error;
    }
    // Success path: disconnect before announcing completion; errors propagate.
    await (0, discovery_1.disconnectAllServers)();
    emitEvent(hooks, {
        type: "run_completed",
        timestamp: new Date().toISOString(),
        runId: report.id,
        summary: {
            total: report.total,
            executed: report.executed,
            succeeded: report.succeeded,
            failed: report.failed,
            successRate: report.successRate,
        },
    });
    return {
        report,
        artifactPath: artifacts.jsonPath,
        textArtifactPath: artifacts.textPath,
    };
}
|
|
159
|
+
/**
 * Collects the scenarios for a run from the selected suite and/or a scenario file.
 *
 * Suite scenarios come first (the "capabilities" suite is materialized per model,
 * any other suite is taken from `builtInSuite`). When `run.exampleId` is set, the
 * suite scenarios are narrowed to that id. Scenarios from `run.scenarioPath` are
 * then validated and appended; those without an `exampleSource` are tagged "file".
 *
 * @param paths - Forwarded to the capability-suite builder.
 * @param effective - Effective benchmark configuration; only `effective.run` is read here.
 * @returns {Promise<{scenarios: Array, warnings: Array, scenarioPath: (string|undefined)}>}
 * @throws {Error} When the requested example id matches nothing, or when scenario ids collide.
 */
async function loadScenarios(paths, effective) {
    const { run } = effective;
    const warnings = [];
    let allScenarios = [];
    if (run.suite) {
        allScenarios = run.suite === "capabilities"
            ? await buildCapabilitySuiteScenarios(paths, effective)
            : [...(0, suites_1.builtInSuite)(run.suite)];
    }
    if (run.exampleId) {
        allScenarios = allScenarios.filter(({ id }) => id === run.exampleId);
        if (allScenarios.length === 0) {
            throw new Error(`Example '${run.exampleId}' not found in suite '${run.suite ?? "showcase"}'.`);
        }
    }
    const resolvedScenarioPath = run.scenarioPath
        ? path_1.default.resolve(run.scenarioPath)
        : undefined;
    if (resolvedScenarioPath !== undefined) {
        const rawEntries = await loadScenarioFile(resolvedScenarioPath);
        const validated = (0, schema_1.validateScenarioCollection)(rawEntries, resolvedScenarioPath);
        for (const scenario of validated.scenarios) {
            if (!scenario.exampleSource) {
                scenario.exampleSource = "file";
            }
        }
        allScenarios.push(...validated.scenarios);
        warnings.push(...validated.warnings);
    }
    ensureUniqueScenarioIds(allScenarios);
    return { scenarios: allScenarios, warnings, scenarioPath: resolvedScenarioPath };
}
|
|
195
|
+
/**
 * Expands the built-in "capabilities" suite into concrete per-model scenarios.
 *
 * With `run.modelOverride` set, the template is materialized for that single
 * model without capability filtering. Otherwise every enabled model of every
 * enabled provider gets its own copy of the template, once per distinct
 * `providerId/modelId` reference.
 *
 * @param paths - Forwarded to `listProviders`.
 * @param effective - Effective benchmark configuration (`run.modelOverride` is read).
 * @returns {Promise<Array>} The materialized scenarios.
 */
async function buildCapabilitySuiteScenarios(paths, effective) {
    const template = (0, suites_1.builtInSuite)("capabilities");
    const { modelOverride } = effective.run;
    if (modelOverride) {
        return materializeCapabilityScenariosForModel(template, modelOverride);
    }
    const providers = await (0, repository_1.listProviders)(paths);
    const visitedRefs = new Set();
    const materialized = [];
    for (const provider of providers) {
        if (provider.enabled) {
            for (const entry of provider.models) {
                // Only an explicit `enabled: false` switches a model off.
                if (entry.enabled !== false) {
                    const modelRef = `${provider.id}/${entry.modelId}`;
                    if (!visitedRefs.has(modelRef)) {
                        visitedRefs.add(modelRef);
                        materialized.push(...materializeCapabilityScenariosForModel(template, modelRef, entry));
                    }
                }
            }
        }
    }
    return materialized;
}
|
|
221
|
+
/**
 * Produces model-specific copies of the capability template scenarios.
 *
 * The "cap.chat_vision_input" and "cap.images_edit" templates are always
 * dropped. When `providerModel` is given, templates whose mode is not backed
 * by the model's declared capabilities are dropped as well. Each surviving
 * scenario is copied with its id suffixed by `::<model>`, its `model` set, and
 * a shallow copy of its `assertions`.
 *
 * @param {Array} template - Template scenarios.
 * @param {string} model - Model reference written into every copy.
 * @param {object} [providerModel] - Optional provider model used for capability filtering.
 * @returns {Array} New scenario objects; the template entries are not modified.
 */
function materializeCapabilityScenariosForModel(template, model, providerModel) {
    const alwaysExcluded = ["cap.chat_vision_input", "cap.images_edit"];
    const materialized = [];
    for (const scenario of template) {
        if (alwaysExcluded.includes(scenario.id)) {
            continue;
        }
        if (providerModel && !supportsScenarioByDeclaredCapabilities(scenario, providerModel)) {
            continue;
        }
        materialized.push({
            ...scenario,
            id: `${scenario.id}::${model}`,
            model,
            assertions: { ...scenario.assertions },
        });
    }
    return materialized;
}
|
|
242
|
+
/**
 * Decides whether a scenario is worth running against a model, judging only by
 * the capabilities the model declares.
 *
 * "responses" is treated like "chat" and "agent" (text in, text out), matching
 * how getModeRequirements groups these three modes. Modes not listed here are
 * accepted unconditionally.
 *
 * @param {{mode: string}} scenario - Scenario whose `mode` selects the rule.
 * @param {{capabilities: {input: Iterable<string>, output: Iterable<string>}}} providerModel
 *   Model with declared input/output capabilities.
 * @returns {boolean} True when the declared capabilities cover the scenario mode.
 */
function supportsScenarioByDeclaredCapabilities(scenario, providerModel) {
    const input = new Set(providerModel.capabilities.input);
    const output = new Set(providerModel.capabilities.output);
    switch (scenario.mode) {
        case "chat":
        case "agent":
        case "responses":
            return input.has("text") && output.has("text");
        case "embeddings":
            return input.has("text") && output.has("embedding");
        case "image_generation":
            return output.has("image");
        case "audio_transcription":
            return input.has("audio") && output.has("text");
        case "audio_speech":
            return input.has("text") && output.has("audio");
        case "omni_call":
            return input.has("audio") && output.has("text");
        default:
            return true;
    }
}
|
|
265
|
+
/**
 * Reads a scenario file and returns its raw (not yet validated) scenario entries.
 *
 * The format is chosen by file extension (case-insensitive):
 *  - `.jsonl`: one JSON value per non-blank line;
 *  - `.yaml` / `.yml`: parsed as YAML, then unwrapped by `extractScenarioArray`;
 *  - anything else: parsed as JSON, then unwrapped by `extractScenarioArray`.
 *
 * A leading UTF-8 byte-order mark is removed before parsing, because
 * `readFile(..., "utf8")` keeps it and `JSON.parse` rejects it.
 *
 * @param {string} filePath - Path of the scenario file.
 * @returns {Promise<Array>} The scenario entries found in the file.
 * @throws {Error} On read failure, or on a parse failure (the message names the
 *   file, and the line for JSONL; the parser error is attached as `cause`).
 */
async function loadScenarioFile(filePath) {
    const contents = await fs_1.promises.readFile(filePath, "utf8");
    const raw = contents.charCodeAt(0) === 0xfeff ? contents.slice(1) : contents;
    const ext = path_1.default.extname(filePath).toLowerCase();
    if (ext === ".jsonl") {
        return raw
            .split("\n")
            // Line numbers are taken before blank lines are dropped so that error
            // messages point at the real line in the file.
            .map((line, index) => ({ line: line.trim(), lineNumber: index + 1 }))
            .filter((entry) => entry.line.length > 0)
            .map((entry) => {
                try {
                    return JSON.parse(entry.line);
                }
                catch (error) {
                    throw new Error(`Failed to parse scenario JSONL ${filePath}:${entry.lineNumber}: ${error.message}`, { cause: error });
                }
            });
    }
    if (ext === ".yaml" || ext === ".yml") {
        let parsed;
        try {
            parsed = yaml_1.default.parse(raw);
        }
        catch (error) {
            throw new Error(`Failed to parse YAML scenario file ${filePath}: ${error.message}`, { cause: error });
        }
        return extractScenarioArray(parsed, filePath);
    }
    let parsed;
    try {
        parsed = JSON.parse(raw);
    }
    catch (error) {
        throw new Error(`Failed to parse JSON scenario file ${filePath}: ${error.message}`, { cause: error });
    }
    return extractScenarioArray(parsed, filePath);
}
|
|
301
|
+
/**
 * Unwraps the scenario list from a parsed scenario document.
 *
 * Accepts either a bare array or an object carrying a `scenarios` array.
 *
 * @param {*} parsed - Parsed file content.
 * @param {string} source - Label (the file path) used as the error-message prefix.
 * @returns {Array} The scenario array itself (not a copy).
 * @throws {Error} When neither shape matches.
 */
function extractScenarioArray(parsed, source) {
    let candidate;
    if (Array.isArray(parsed)) {
        candidate = parsed;
    }
    else if (parsed && typeof parsed === "object") {
        candidate = parsed.scenarios;
    }
    if (!Array.isArray(candidate)) {
        throw new Error(`${source}: scenario file must be an array or an object with 'scenarios' array.`);
    }
    return candidate;
}
|
|
312
|
+
/**
 * Verifies that no two scenarios share an id.
 *
 * @param {Iterable<{id: *}>} scenarios - Scenarios to check, in order.
 * @returns {void}
 * @throws {Error} Naming the first id that is seen a second time.
 */
function ensureUniqueScenarioIds(scenarios) {
    const seenIds = new Set();
    for (const { id } of scenarios) {
        const sizeBefore = seenIds.size;
        seenIds.add(id);
        // The set does not grow when the id was already present.
        if (seenIds.size === sizeBefore) {
            throw new Error(`Scenario ID '${id}' is duplicated.`);
        }
    }
}
|
|
321
|
+
/**
 * Executes one scenario for the configured number of warmup and measured runs.
 *
 * The model is taken from `run.modelOverride`, then `scenario.model`, then
 * `pickBestModelForScenario`. The scenario is skipped (with a warning and a
 * skipped result) when no model is available, or when it requires tools and
 * none are selected. In "showcase" execution mode a fixed profile of zero
 * warmup runs and one measured run is used instead of `effective.profileSettings`.
 * Only measured runs contribute samples and exchange summaries to the result.
 *
 * @param paths - Forwarded to model selection and scenario execution.
 * @param scenario - Scenario to execute.
 * @param effective - Effective benchmark configuration.
 * @param {Function} [onSampleComplete] - Called as (sample, runIndex, phase, totalRuns) after every run.
 * @param {Function} [onExchange] - Called as (event, runIndex, phase, totalRuns) for every exchange event.
 * @returns {Promise<{scenario, example, result, exchanges: Array, samples: Array, warnings: Array}>}
 */
async function runScenarioWithSampling(paths, scenario, effective, onSampleComplete, onExchange) {
    const warnings = [];
    const example = scenarioToSummary(scenario, effective.run.suite);
    // Shared shape of both "skipped" outcomes below.
    const skip = (reason) => {
        warnings.push(`Scenario '${scenario.id}' skipped: ${reason}`);
        return {
            scenario,
            example,
            result: buildSkippedScenarioResult(scenario, reason),
            samples: [],
            exchanges: [],
            warnings,
        };
    };
    let model = effective.run.modelOverride || scenario.model;
    if (!model) {
        model = await pickBestModelForScenario(paths, scenario);
    }
    if (!model) {
        return skip(`No model available for mode '${scenario.mode}'.`);
    }
    let runProfile;
    if (effective.run.executionMode === "showcase") {
        runProfile = { warmupRuns: 0, measuredRuns: 1, minScenarioPassRate: 1 };
    }
    else {
        runProfile = effective.profileSettings;
    }
    const totalRuns = runProfile.warmupRuns + runProfile.measuredRuns;
    const selectedTools = getSelectedTools(scenario.tools);
    if (scenario.requiresAvailableTools && selectedTools.length === 0) {
        return skip("No MCP tools are available for this tool-driven example.");
    }
    const samples = [];
    const exchanges = [];
    for (let index = 0; index < totalRuns; index += 1) {
        const runIndex = index + 1;
        const phase = index < runProfile.warmupRuns ? "warmup" : "measured";
        const exchangesThisRun = [];
        const handleExchange = (event) => {
            if (phase === "measured") {
                exchangesThisRun.push(toExchangeSummary(event));
            }
            onExchange?.(event, runIndex, phase, totalRuns);
        };
        const sample = await runSingleScenario(paths, scenario, model, effective, runIndex, handleExchange);
        onSampleComplete?.(sample, runIndex, phase, totalRuns);
        if (index >= runProfile.warmupRuns) {
            samples.push(sample);
            exchanges.push(...exchangesThisRun);
        }
    }
    return {
        scenario,
        example,
        result: buildScenarioResult(scenario, model, samples, runProfile.minScenarioPassRate),
        exchanges,
        samples,
        warnings,
    };
}
|
|
383
|
+
/**
 * Selects a model for a scenario from the requirements of its mode.
 *
 * @param paths - Forwarded to `pickBestModelByCapabilities`.
 * @param {{mode: string}} scenario - Scenario whose mode determines the requirements.
 * @returns {Promise<*>} Whatever `pickBestModelByCapabilities` resolves to.
 */
async function pickBestModelForScenario(paths, scenario) {
    const requirements = getModeRequirements(scenario.mode);
    const capabilityFilter = {
        requiredInput: requirements.requiredInput,
        requiredOutput: requirements.requiredOutput,
    };
    const { pickBestModelByCapabilities } = repositories_1;
    return pickBestModelByCapabilities(paths, capabilityFilter, requirements.preferredEndpointType);
}
|
|
390
|
+
/**
 * Maps a scenario mode to the capabilities a model must declare and the
 * endpoint type that is preferred for it.
 *
 * @param {string} mode - Scenario mode.
 * @returns {{requiredInput: string[], requiredOutput: string[], preferredEndpointType: string}|undefined}
 *   A freshly built requirements object, or `undefined` for an unrecognized mode.
 */
function getModeRequirements(mode) {
    const requirements = (requiredInput, requiredOutput, preferredEndpointType) => ({
        requiredInput,
        requiredOutput,
        preferredEndpointType,
    });
    if (mode === "chat" || mode === "agent" || mode === "responses") {
        return requirements(["text"], ["text"], "llm");
    }
    if (mode === "embeddings") {
        return requirements(["text"], ["embedding"], "embedding");
    }
    if (mode === "image_generation") {
        return requirements(["text"], ["image"], "diffusion");
    }
    if (mode === "audio_transcription") {
        return requirements(["audio"], ["text"], "audio");
    }
    if (mode === "audio_speech") {
        return requirements(["text"], ["audio"], "audio");
    }
    if (mode === "omni_call") {
        return requirements(["text", "audio"], ["text"], "llm");
    }
    return undefined;
}
|
|
408
|
+
/**
 * Performs one run of a scenario and always resolves to a sample.
 *
 * On success the sample produced by `runModeScenario` is returned with
 * `runIndex` added. Any failure is converted into a failed sample (status code
 * 0, zeroed counters) whose `verdict` and `error` carry the failure text.
 *
 * @param paths - Forwarded to `runModeScenario`.
 * @param scenario - Scenario to run.
 * @param model - Model to run it against.
 * @param effective - Effective benchmark configuration.
 * @param {number} runIndex - Index of this run, copied onto the sample.
 * @param {Function} [onExchange] - Forwarded to `runModeScenario`.
 * @returns {Promise<object>} The sample for this run; this function does not reject
 *   because the mode runner failed.
 */
async function runSingleScenario(paths, scenario, model, effective, runIndex, onExchange) {
    const startTime = Date.now();
    try {
        const sample = await runModeScenario(paths, scenario, model, effective, startTime, onExchange);
        return { ...sample, runIndex };
    }
    catch (error) {
        const latencyMs = Date.now() - startTime;
        // Anything can be thrown: use `message` when it is a string, otherwise fall
        // back to the stringified value (covers strings, null and undefined).
        const message = typeof error?.message === "string" ? error.message : String(error);
        return {
            runIndex,
            success: false,
            latencyMs,
            statusCode: 0,
            tokens: 0,
            toolCalls: 0,
            throughputTokensPerSec: 0,
            finalOutput: "",
            outputPreview: "",
            verdict: message,
            usedToolNames: [],
            error: message,
            candidateAttempts: 0,
            failovers: 0,
            rateLimitSwitches: 0,
            distinctProviders: 0,
            distinctModels: 0,
        };
    }
}
|
|
437
|
+
/**
 * Dispatches a scenario to the runner that implements its mode.
 *
 * All runners receive the same arguments in the same order.
 *
 * @param paths - Forwarded to the mode runner.
 * @param {{mode: string}} scenario - Scenario; `mode` selects the runner.
 * @param model - Model to run against.
 * @param effective - Effective benchmark configuration.
 * @param {number} startTime - Start timestamp of the run, forwarded to the runner.
 * @param {Function} [onExchange] - Forwarded to the runner.
 * @returns {Promise<object>} The sample produced by the selected runner.
 * @throws {Error} When the mode has no runner, instead of silently resolving to
 *   `undefined` and yielding a sample without any result fields.
 */
async function runModeScenario(paths, scenario, model, effective, startTime, onExchange) {
    switch (scenario.mode) {
        case "chat":
            return runChatScenario(paths, scenario, model, effective, startTime, onExchange);
        case "agent":
            return runAgentScenario(paths, scenario, model, effective, startTime, onExchange);
        case "responses":
            return runResponsesScenario(paths, scenario, model, effective, startTime, onExchange);
        case "embeddings":
            return runEmbeddingsScenario(paths, scenario, model, effective, startTime, onExchange);
        case "image_generation":
            return runImageScenario(paths, scenario, model, effective, startTime, onExchange);
        case "audio_transcription":
            return runAudioTranscriptionScenario(paths, scenario, model, effective, startTime, onExchange);
        case "audio_speech":
            return runAudioSpeechScenario(paths, scenario, model, effective, startTime, onExchange);
        case "omni_call":
            return runOmniCallScenario(paths, scenario, model, effective, startTime, onExchange);
        default:
            throw new Error(`Unsupported scenario mode '${scenario.mode}'.`);
    }
}
|
|
457
|
+
/**
 * Runs a single-turn chat scenario.
 *
 * Posts one non-streaming request with the scenario prompt to
 * `/v1/chat/completions`, reports the exchange through `onExchange` when a
 * callback is supplied, evaluates the scenario assertions against the
 * assistant text and returns the resulting sample.
 *
 * @param paths Passed through to `requestJson`.
 * @param scenario Scenario definition (reads `prompt` and `timeoutMs`).
 * @param model Model name placed in the payload and used for routing.
 * @param effective Effective configuration (reads `defaults.requestTimeoutMs`).
 * @param startTime Timestamp subtracted from `Date.now()` to get the latency.
 * @param onExchange Optional callback that receives the exchange event.
 * @returns {Promise<object>} Sample with status, tokens, output, verdict and pool metrics.
 */
async function runChatScenario(paths, scenario, model, effective, startTime, onExchange) {
    const requestPath = "/v1/chat/completions";
    const requestTimeout = scenario.timeoutMs ?? effective.defaults.requestTimeoutMs;
    const body = {
        model,
        messages: [{ role: "user", content: scenario.prompt }],
        stream: false,
        ...buildGenerationParams(scenario, effective),
    };
    const envelope = await requestJson(paths, model, requestPath, body, requestTimeout, getModeRequirements("chat"));
    onExchange?.(buildExchangeEvent({
        scenario,
        mode: "chat",
        model,
        requestPath,
        requestPayload: envelope.requestPayload,
        responsePayload: envelope.payload,
        statusCode: envelope.statusCode,
        contentType: envelope.contentType,
        endpointId: envelope.route.endpointId,
        endpointName: envelope.route.endpointName,
        upstreamModel: envelope.route.upstreamModel,
    }));
    const completion = envelope.payload ?? {};
    const text = parseAssistantContent(completion);
    const tokenCount = Number(completion.usage?.total_tokens ?? 0);
    const elapsedMs = Date.now() - startTime;
    const failure = evaluateAssertions(scenario, {
        output: text,
        toolCalls: 0,
        toolNames: [],
        latencyMs: elapsedMs,
        statusCode: envelope.statusCode,
    });
    const pool = envelope.poolMetrics;
    return {
        success: !failure,
        latencyMs: elapsedMs,
        statusCode: envelope.statusCode,
        tokens: tokenCount,
        toolCalls: 0,
        throughputTokensPerSec: calculateThroughput(tokenCount, elapsedMs),
        finalOutput: text,
        outputPreview: truncate(text, 180),
        verdict: failure ?? "All assertions passed.",
        usedToolNames: [],
        error: failure ?? undefined,
        candidateAttempts: pool?.candidateAttempts ?? 0,
        failovers: pool?.failovers ?? 0,
        rateLimitSwitches: pool?.rateLimitSwitches ?? 0,
        distinctProviders: pool?.distinctProviders ?? 0,
        distinctModels: pool?.distinctModels ?? 0,
    };
}
|
|
509
|
+
/**
 * Runs a scenario against the `/v1/responses` endpoint.
 *
 * Sends the scenario prompt as `input` in a non-streaming request, reports the
 * exchange through `onExchange` when supplied, flattens the response output
 * into text, evaluates the scenario assertions and returns the sample.
 *
 * @param paths Passed through to `requestJson`.
 * @param scenario Scenario definition (reads `prompt` and `timeoutMs`).
 * @param model Model name placed in the payload and used for routing.
 * @param effective Effective configuration (reads `defaults.requestTimeoutMs`).
 * @param startTime Timestamp subtracted from `Date.now()` to get the latency.
 * @param onExchange Optional callback that receives the exchange event.
 * @returns {Promise<object>} Sample with status, tokens, output, verdict and pool metrics.
 */
async function runResponsesScenario(paths, scenario, model, effective, startTime, onExchange) {
    const requestPath = "/v1/responses";
    const requestTimeout = scenario.timeoutMs ?? effective.defaults.requestTimeoutMs;
    const body = {
        model,
        input: scenario.prompt,
        stream: false,
        ...buildGenerationParams(scenario, effective),
    };
    const envelope = await requestJson(paths, model, requestPath, body, requestTimeout, getModeRequirements("responses"));
    onExchange?.(buildExchangeEvent({
        scenario,
        mode: "responses",
        model,
        requestPath,
        requestPayload: envelope.requestPayload,
        responsePayload: envelope.payload,
        statusCode: envelope.statusCode,
        contentType: envelope.contentType,
        endpointId: envelope.route.endpointId,
        endpointName: envelope.route.endpointName,
        upstreamModel: envelope.route.upstreamModel,
    }));
    const result = envelope.payload;
    const text = extractResponsesOutputText(result);
    const tokenCount = Number(result?.usage?.total_tokens ?? 0);
    const elapsedMs = Date.now() - startTime;
    const failure = evaluateAssertions(scenario, {
        output: text,
        toolCalls: 0,
        toolNames: [],
        latencyMs: elapsedMs,
        statusCode: envelope.statusCode,
    });
    const pool = envelope.poolMetrics;
    return {
        success: !failure,
        latencyMs: elapsedMs,
        statusCode: envelope.statusCode,
        tokens: tokenCount,
        toolCalls: 0,
        throughputTokensPerSec: calculateThroughput(tokenCount, elapsedMs),
        finalOutput: text,
        outputPreview: truncate(text, 180),
        verdict: failure ?? "All assertions passed.",
        usedToolNames: [],
        error: failure ?? undefined,
        candidateAttempts: pool?.candidateAttempts ?? 0,
        failovers: pool?.failovers ?? 0,
        rateLimitSwitches: pool?.rateLimitSwitches ?? 0,
        distinctProviders: pool?.distinctProviders ?? 0,
        distinctModels: pool?.distinctModels ?? 0,
    };
}
|
|
561
|
+
/**
 * Runs an agent-mode scenario: a chat-completion loop that executes tool calls.
 *
 * Each iteration posts the conversation so far to `/v1/chat/completions`
 * (advertising the selected tools when there are any), then executes every
 * tool call found in the reply and appends each result as a `tool` message.
 * The loop ends at the first reply without tool calls or after
 * `maxIterations` rounds; hitting the cap is reported as
 * `max_iterations_reached`. Tokens and pool counters are accumulated across
 * iterations (`distinctProviders` / `distinctModels` keep the maximum seen).
 *
 * @param paths Passed through to `requestJson`.
 * @param scenario Scenario definition (reads `id`, `prompt`, `tools`, `maxIterations`, `timeoutMs`).
 * @param model Model name placed in the payload and used for routing.
 * @param effective Effective configuration (reads `defaults.maxIterations`,
 *   `defaults.requestTimeoutMs` and `defaults.toolTimeoutMs`).
 * @param startTime Timestamp subtracted from `Date.now()` to get the latency.
 * @param onExchange Optional callback; receives one event per model request and one per tool call.
 * @returns {Promise<object>} Sample with status, tokens, tool usage, output, verdict and pool metrics.
 * @throws {Error} When a tool call has no function name or its arguments are
 *   not a JSON string (the JSON parse failure is attached as `cause`).
 */
async function runAgentScenario(paths, scenario, model, effective, startTime, onExchange) {
    const selectedTools = getSelectedTools(scenario.tools);
    const messages = [{ role: "user", content: scenario.prompt }];
    const maxIterations = scenario.maxIterations ?? effective.defaults.maxIterations;
    const timeoutMs = scenario.timeoutMs ?? effective.defaults.requestTimeoutMs;
    const toolTimeoutMs = effective.defaults.toolTimeoutMs;
    let toolCalls = 0;
    let totalTokens = 0;
    let finalOutput = "";
    const usedToolNames = new Set();
    let statusCode = 200;
    // Stays true unless the model answers without requesting a tool.
    let reachedIterationCap = true;
    let candidateAttempts = 0;
    let failovers = 0;
    let rateLimitSwitches = 0;
    let distinctProviders = 0;
    let distinctModels = 0;
    for (let iteration = 0; iteration < maxIterations; iteration++) {
        const payload = {
            model,
            messages,
            stream: false,
            ...buildGenerationParams(scenario, effective),
        };
        if (selectedTools.length > 0) {
            payload.tools = selectedTools.map((tool) => ({
                type: "function",
                function: {
                    name: tool.name,
                    description: tool.description ?? "",
                    parameters: tool.inputSchema,
                },
            }));
            payload.tool_choice = "auto";
        }
        const envelope = await requestJson(paths, model, "/v1/chat/completions", payload, timeoutMs, getModeRequirements("agent"));
        statusCode = envelope.statusCode;
        const response = envelope.payload ?? {};
        totalTokens += Number(response.usage?.total_tokens ?? 0);
        candidateAttempts += envelope.poolMetrics?.candidateAttempts ?? 0;
        failovers += envelope.poolMetrics?.failovers ?? 0;
        rateLimitSwitches += envelope.poolMetrics?.rateLimitSwitches ?? 0;
        distinctProviders = Math.max(distinctProviders, envelope.poolMetrics?.distinctProviders ?? 0);
        distinctModels = Math.max(distinctModels, envelope.poolMetrics?.distinctModels ?? 0);
        const assistantMessage = response.choices?.[0]?.message;
        const assistantContent = parseMessageContent(assistantMessage?.content);
        // Keep the last non-empty assistant text as the final output.
        finalOutput = assistantContent || finalOutput;
        const toolCallList = Array.isArray(assistantMessage?.tool_calls)
            ? assistantMessage.tool_calls
            : [];
        onExchange?.(buildExchangeEvent({
            scenario,
            mode: "agent",
            model,
            requestPath: "/v1/chat/completions",
            requestPayload: envelope.requestPayload,
            responsePayload: envelope.payload,
            statusCode: envelope.statusCode,
            contentType: envelope.contentType,
            endpointId: envelope.route.endpointId,
            endpointName: envelope.route.endpointName,
            upstreamModel: envelope.route.upstreamModel,
            toolTrace: buildToolTrace(toolCallList, []),
        }));
        if (toolCallList.length === 0) {
            messages.push({ role: "assistant", content: assistantContent });
            reachedIterationCap = false;
            break;
        }
        messages.push({
            role: "assistant",
            content: assistantContent || null,
            tool_calls: toolCallList,
        });
        for (const call of toolCallList) {
            const name = call.function?.name;
            if (!name) {
                throw new Error(`Scenario ${scenario.id}: tool call is missing function.name.`);
            }
            let args = {};
            const rawArguments = call.function?.arguments;
            if (rawArguments && typeof rawArguments !== "string") {
                // Previously this surfaced as "rawArguments.trim is not a
                // function"; report it as the scenario-level error instead.
                throw new Error(`Scenario ${scenario.id}: invalid tool arguments for ${name}.`);
            }
            if (rawArguments && rawArguments.trim().length > 0) {
                try {
                    args = JSON.parse(rawArguments);
                }
                catch (cause) {
                    // Preserve the parser's message for debugging.
                    throw new Error(`Scenario ${scenario.id}: invalid tool arguments for ${name}.`, { cause });
                }
            }
            const result = await withTimeout((0, discovery_1.executeTool)(name, args), toolTimeoutMs, `Tool execution timed out for ${name} after ${toolTimeoutMs}ms`);
            toolCalls += 1;
            usedToolNames.add(name);
            // Computed once so the conversation message and the trace always
            // carry the same id, including the synthesized fallback.
            const toolCallId = call.id ?? `tool-${iteration + 1}-${toolCalls}`;
            messages.push({
                role: "tool",
                tool_call_id: toolCallId,
                content: result.content,
            });
            onExchange?.(buildExchangeEvent({
                scenario,
                mode: "agent",
                model,
                requestPath: "/mcp/tools/call",
                requestPayload: {
                    tool_name: name,
                    arguments: args,
                },
                responsePayload: result.content,
                statusCode: 200,
                contentType: "application/json",
                toolTrace: buildToolTrace([call], [
                    {
                        name,
                        toolCallId,
                        content: result.content,
                    },
                ]),
            }));
        }
    }
    const latencyMs = Date.now() - startTime;
    const capError = reachedIterationCap ? "max_iterations_reached" : null;
    const assertionError = evaluateAssertions(scenario, {
        output: finalOutput,
        toolCalls,
        toolNames: Array.from(usedToolNames),
        latencyMs,
        statusCode,
    });
    // The iteration-cap failure takes precedence over assertion failures.
    const error = capError ?? assertionError;
    return {
        success: !error,
        latencyMs,
        statusCode,
        tokens: totalTokens,
        toolCalls,
        throughputTokensPerSec: calculateThroughput(totalTokens, latencyMs),
        finalOutput: finalOutput,
        outputPreview: truncate(finalOutput, 180),
        verdict: error ?? "All assertions passed.",
        usedToolNames: Array.from(usedToolNames),
        error: error ?? undefined,
        candidateAttempts,
        failovers,
        rateLimitSwitches,
        distinctProviders,
        distinctModels,
    };
}
|
|
709
|
+
/**
 * Runs an embeddings scenario against `/v1/embeddings`.
 *
 * Sends `scenario.input`, reports the exchange through `onExchange` when
 * supplied, and summarizes the reply as `items=<count>,vectorLength=<n>`,
 * where the vector length is taken from the first returned item. The item
 * count and vector length are also handed to the assertion evaluator.
 *
 * @param paths Passed through to `requestJson`.
 * @param scenario Scenario definition (reads `input` and `timeoutMs`).
 * @param model Model name placed in the payload and used for routing.
 * @param effective Effective configuration (reads `defaults.requestTimeoutMs`).
 * @param startTime Timestamp subtracted from `Date.now()` to get the latency.
 * @param onExchange Optional callback that receives the exchange event.
 * @returns {Promise<object>} Sample with status, tokens, summary text, verdict and pool metrics.
 */
async function runEmbeddingsScenario(paths, scenario, model, effective, startTime, onExchange) {
    const requestPath = "/v1/embeddings";
    const requestTimeout = scenario.timeoutMs ?? effective.defaults.requestTimeoutMs;
    const body = { model, input: scenario.input };
    const envelope = await requestJson(paths, model, requestPath, body, requestTimeout, getModeRequirements("embeddings"));
    onExchange?.(buildExchangeEvent({
        scenario,
        mode: "embeddings",
        model,
        requestPath,
        requestPayload: envelope.requestPayload,
        responsePayload: envelope.payload,
        statusCode: envelope.statusCode,
        contentType: envelope.contentType,
        endpointId: envelope.route.endpointId,
        endpointName: envelope.route.endpointName,
        upstreamModel: envelope.route.upstreamModel,
    }));
    const result = envelope.payload;
    const rows = Array.isArray(result?.data) ? result.data : [];
    const firstEmbedding = rows[0]?.embedding;
    const vectorLength = Array.isArray(firstEmbedding) ? firstEmbedding.length : 0;
    const summary = `items=${rows.length},vectorLength=${vectorLength}`;
    const tokenCount = Number(result?.usage?.total_tokens ?? 0);
    const elapsedMs = Date.now() - startTime;
    const failure = evaluateAssertions(scenario, {
        output: summary,
        toolCalls: 0,
        toolNames: [],
        latencyMs: elapsedMs,
        statusCode: envelope.statusCode,
        embeddingsItems: rows.length,
        embeddingsVectorLength: vectorLength,
    });
    const pool = envelope.poolMetrics;
    return {
        success: !failure,
        latencyMs: elapsedMs,
        statusCode: envelope.statusCode,
        tokens: tokenCount,
        toolCalls: 0,
        throughputTokensPerSec: calculateThroughput(tokenCount, elapsedMs),
        finalOutput: summary,
        outputPreview: truncate(summary, 180),
        verdict: failure ?? "All assertions passed.",
        usedToolNames: [],
        error: failure ?? undefined,
        candidateAttempts: pool?.candidateAttempts ?? 0,
        failovers: pool?.failovers ?? 0,
        rateLimitSwitches: pool?.rateLimitSwitches ?? 0,
        distinctProviders: pool?.distinctProviders ?? 0,
        distinctModels: pool?.distinctModels ?? 0,
    };
}
|
|
763
|
+
/**
 * Runs an image-generation scenario against `/v1/images/generations`.
 *
 * Sends the scenario's `prompt`, `n` and `size`, reports the exchange through
 * `onExchange` when supplied, and summarizes the reply as `images=<count>`.
 * The image count is also handed to the assertion evaluator. Token count and
 * throughput are always reported as 0 for this mode.
 *
 * @param paths Passed through to `requestJson`.
 * @param scenario Scenario definition (reads `prompt`, `n`, `size`, `timeoutMs`).
 * @param model Model name placed in the payload and used for routing.
 * @param effective Effective configuration (reads `defaults.requestTimeoutMs`).
 * @param startTime Timestamp subtracted from `Date.now()` to get the latency.
 * @param onExchange Optional callback that receives the exchange event.
 * @returns {Promise<object>} Sample with status, summary text, verdict and pool metrics.
 */
async function runImageScenario(paths, scenario, model, effective, startTime, onExchange) {
    const requestPath = "/v1/images/generations";
    const requestTimeout = scenario.timeoutMs ?? effective.defaults.requestTimeoutMs;
    const body = {
        model,
        prompt: scenario.prompt,
        n: scenario.n,
        size: scenario.size,
    };
    const envelope = await requestJson(paths, model, requestPath, body, requestTimeout, getModeRequirements("image_generation"));
    onExchange?.(buildExchangeEvent({
        scenario,
        mode: "image_generation",
        model,
        requestPath,
        requestPayload: envelope.requestPayload,
        responsePayload: envelope.payload,
        statusCode: envelope.statusCode,
        contentType: envelope.contentType,
        endpointId: envelope.route.endpointId,
        endpointName: envelope.route.endpointName,
        upstreamModel: envelope.route.upstreamModel,
    }));
    const result = envelope.payload;
    const generated = Array.isArray(result?.data) ? result.data : [];
    const summary = `images=${generated.length}`;
    const elapsedMs = Date.now() - startTime;
    const failure = evaluateAssertions(scenario, {
        output: summary,
        toolCalls: 0,
        toolNames: [],
        latencyMs: elapsedMs,
        statusCode: envelope.statusCode,
        imagesCount: generated.length,
    });
    const pool = envelope.poolMetrics;
    return {
        success: !failure,
        latencyMs: elapsedMs,
        statusCode: envelope.statusCode,
        tokens: 0,
        toolCalls: 0,
        throughputTokensPerSec: 0,
        finalOutput: summary,
        outputPreview: truncate(summary, 180),
        verdict: failure ?? "All assertions passed.",
        usedToolNames: [],
        error: failure ?? undefined,
        candidateAttempts: pool?.candidateAttempts ?? 0,
        failovers: pool?.failovers ?? 0,
        rateLimitSwitches: pool?.rateLimitSwitches ?? 0,
        distinctProviders: pool?.distinctProviders ?? 0,
        distinctModels: pool?.distinctModels ?? 0,
    };
}
|
|
816
|
+
/**
 * Runs an audio-transcription scenario against `/v1/audio/transcriptions`.
 *
 * Reads `scenario.audioFile` from disk, sends its contents base64-encoded in
 * the `file` field of a JSON payload with `response_format: "json"`, reports
 * the exchange through `onExchange` when supplied, and uses the reply's
 * `text` field (or an empty string) as the output. Token count and
 * throughput are always reported as 0 for this mode.
 *
 * @param paths Passed through to `requestJson`.
 * @param scenario Scenario definition (reads `audioFile` and `timeoutMs`).
 * @param model Model name placed in the payload and used for routing.
 * @param effective Effective configuration (reads `defaults.requestTimeoutMs`).
 * @param startTime Timestamp subtracted from `Date.now()` to get the latency.
 * @param onExchange Optional callback that receives the exchange event.
 * @returns {Promise<object>} Sample with status, transcript, verdict and pool metrics.
 */
async function runAudioTranscriptionScenario(paths, scenario, model, effective, startTime, onExchange) {
    const requestPath = "/v1/audio/transcriptions";
    const requestTimeout = scenario.timeoutMs ?? effective.defaults.requestTimeoutMs;
    const resolvedAudioPath = path_1.default.resolve(scenario.audioFile);
    const audioBytes = await fs_1.promises.readFile(resolvedAudioPath);
    const body = {
        model,
        file: audioBytes.toString("base64"),
        response_format: "json",
    };
    const envelope = await requestJson(paths, model, requestPath, body, requestTimeout, getModeRequirements("audio_transcription"));
    onExchange?.(buildExchangeEvent({
        scenario,
        mode: "audio_transcription",
        model,
        requestPath,
        requestPayload: envelope.requestPayload,
        responsePayload: envelope.payload,
        statusCode: envelope.statusCode,
        contentType: envelope.contentType,
        endpointId: envelope.route.endpointId,
        endpointName: envelope.route.endpointName,
        upstreamModel: envelope.route.upstreamModel,
    }));
    const transcript = envelope.payload?.text ?? "";
    const elapsedMs = Date.now() - startTime;
    const failure = evaluateAssertions(scenario, {
        output: transcript,
        toolCalls: 0,
        toolNames: [],
        latencyMs: elapsedMs,
        statusCode: envelope.statusCode,
    });
    const pool = envelope.poolMetrics;
    return {
        success: !failure,
        latencyMs: elapsedMs,
        statusCode: envelope.statusCode,
        tokens: 0,
        toolCalls: 0,
        throughputTokensPerSec: 0,
        finalOutput: transcript,
        outputPreview: truncate(transcript, 180),
        verdict: failure ?? "All assertions passed.",
        usedToolNames: [],
        error: failure ?? undefined,
        candidateAttempts: pool?.candidateAttempts ?? 0,
        failovers: pool?.failovers ?? 0,
        rateLimitSwitches: pool?.rateLimitSwitches ?? 0,
        distinctProviders: pool?.distinctProviders ?? 0,
        distinctModels: pool?.distinctModels ?? 0,
    };
}
|
|
868
|
+
/**
 * Runs a text-to-speech scenario against `/v1/audio/speech`.
 *
 * Sends `inputText`, `voice` and `response_format` from the scenario, reads
 * the reply as binary, reports the exchange through `onExchange` when
 * supplied (with the byte count in place of the body), and summarizes the
 * reply as `bytes=<length>`. The byte length and content type are also handed
 * to the assertion evaluator. Token count and throughput are always 0.
 *
 * @param paths Passed through to `requestBinary`.
 * @param scenario Scenario definition (reads `inputText`, `voice`, `response_format`, `timeoutMs`).
 * @param model Model name placed in the payload and used for routing.
 * @param effective Effective configuration (reads `defaults.requestTimeoutMs`).
 * @param startTime Timestamp subtracted from `Date.now()` to get the latency.
 * @param onExchange Optional callback that receives the exchange event.
 * @returns {Promise<object>} Sample with status, summary text, verdict and pool metrics.
 */
async function runAudioSpeechScenario(paths, scenario, model, effective, startTime, onExchange) {
    const requestPath = "/v1/audio/speech";
    const requestTimeout = scenario.timeoutMs ?? effective.defaults.requestTimeoutMs;
    const body = {
        model,
        input: scenario.inputText,
        voice: scenario.voice,
        response_format: scenario.response_format,
    };
    const envelope = await requestBinary(paths, model, requestPath, body, requestTimeout, getModeRequirements("audio_speech"));
    onExchange?.(buildExchangeEvent({
        scenario,
        mode: "audio_speech",
        model,
        requestPath,
        requestPayload: envelope.requestPayload,
        responsePayload: {
            bytes: envelope.buffer.length,
        },
        statusCode: envelope.statusCode,
        contentType: envelope.contentType,
        endpointId: envelope.route.endpointId,
        endpointName: envelope.route.endpointName,
        upstreamModel: envelope.route.upstreamModel,
    }));
    const elapsedMs = Date.now() - startTime;
    const byteCount = envelope.buffer.length;
    const summary = `bytes=${byteCount}`;
    const failure = evaluateAssertions(scenario, {
        output: summary,
        toolCalls: 0,
        toolNames: [],
        latencyMs: elapsedMs,
        statusCode: envelope.statusCode,
        bytesLength: byteCount,
        contentType: envelope.contentType,
    });
    const pool = envelope.poolMetrics;
    return {
        success: !failure,
        latencyMs: elapsedMs,
        statusCode: envelope.statusCode,
        tokens: 0,
        toolCalls: 0,
        throughputTokensPerSec: 0,
        finalOutput: summary,
        outputPreview: truncate(summary, 180),
        verdict: failure ?? "All assertions passed.",
        usedToolNames: [],
        error: failure ?? undefined,
        candidateAttempts: pool?.candidateAttempts ?? 0,
        failovers: pool?.failovers ?? 0,
        rateLimitSwitches: pool?.rateLimitSwitches ?? 0,
        distinctProviders: pool?.distinctProviders ?? 0,
        distinctModels: pool?.distinctModels ?? 0,
    };
}
|
|
922
|
+
/**
 * Runs an "omni call" scenario: a chat completion whose user message carries audio.
 *
 * Reads `scenario.audioFile`, embeds it base64-encoded as an `input_audio`
 * content part (format derived from the file extension) followed by a text
 * part holding the scenario prompt or a built-in default instruction, and
 * posts it to `/v1/chat/completions`. The exchange is reported through
 * `onExchange` when supplied. The returned sample additionally records
 * whether the reply contained audio output.
 *
 * @param paths Passed through to `requestJson`.
 * @param scenario Scenario definition (reads `audioFile`, `prompt`, `timeoutMs`).
 * @param model Model name placed in the payload and used for routing.
 * @param effective Effective configuration (reads `defaults.requestTimeoutMs`).
 * @param startTime Timestamp subtracted from `Date.now()` to get the latency.
 * @param onExchange Optional callback that receives the exchange event.
 * @returns {Promise<object>} Sample with status, tokens, output, verdict, pool
 *   metrics and `audioOutputPresent`.
 */
async function runOmniCallScenario(paths, scenario, model, effective, startTime, onExchange) {
    const requestPath = "/v1/chat/completions";
    const requestTimeout = scenario.timeoutMs ?? effective.defaults.requestTimeoutMs;
    const resolvedAudioPath = path_1.default.resolve(scenario.audioFile);
    const audioBytes = await fs_1.promises.readFile(resolvedAudioPath);
    const format = audioFormatFromFile(resolvedAudioPath);
    const audioPart = {
        type: "input_audio",
        input_audio: {
            data: audioBytes.toString("base64"),
            format,
        },
    };
    const textPart = {
        type: "text",
        text: scenario.prompt ??
            "Summarize this audio briefly and answer as transcript text.",
    };
    const body = {
        model,
        messages: [{ role: "user", content: [audioPart, textPart] }],
        stream: false,
        ...buildGenerationParams(scenario, effective),
    };
    const envelope = await requestJson(paths, model, requestPath, body, requestTimeout, getModeRequirements("omni_call"));
    onExchange?.(buildExchangeEvent({
        scenario,
        mode: "omni_call",
        model,
        requestPath,
        requestPayload: envelope.requestPayload,
        responsePayload: envelope.payload,
        statusCode: envelope.statusCode,
        contentType: envelope.contentType,
        endpointId: envelope.route.endpointId,
        endpointName: envelope.route.endpointName,
        upstreamModel: envelope.route.upstreamModel,
    }));
    const completion = envelope.payload ?? {};
    const text = parseAssistantContent(completion);
    const tokenCount = Number(completion.usage?.total_tokens ?? 0);
    const audioOutputPresent = responseHasAudioOutput(completion);
    const elapsedMs = Date.now() - startTime;
    const failure = evaluateAssertions(scenario, {
        output: text,
        toolCalls: 0,
        toolNames: [],
        latencyMs: elapsedMs,
        statusCode: envelope.statusCode,
    });
    const pool = envelope.poolMetrics;
    return {
        success: !failure,
        latencyMs: elapsedMs,
        statusCode: envelope.statusCode,
        tokens: tokenCount,
        toolCalls: 0,
        throughputTokensPerSec: calculateThroughput(tokenCount, elapsedMs),
        finalOutput: text,
        outputPreview: truncate(`${text}\naudio_output=${audioOutputPresent ? "yes" : "no"}`, 180),
        verdict: failure ?? "All assertions passed.",
        usedToolNames: [],
        error: failure ?? undefined,
        candidateAttempts: pool?.candidateAttempts ?? 0,
        failovers: pool?.failovers ?? 0,
        rateLimitSwitches: pool?.rateLimitSwitches ?? 0,
        distinctProviders: pool?.distinctProviders ?? 0,
        distinctModels: pool?.distinctModels ?? 0,
        audioOutputPresent,
    };
}
|
|
997
|
+
/**
 * Maps a file path's extension (case-insensitive) to an audio format label.
 *
 * `mp3`, `wav`, `ogg` and `webm` map to themselves; `m4a` and `mp4` both map
 * to `"m4a"`; anything else, including no extension, falls back to `"wav"`.
 *
 * @param {string} filePath Path whose extension is inspected.
 * @returns {string} One of `"mp3"`, `"wav"`, `"ogg"`, `"m4a"`, `"webm"`.
 */
function audioFormatFromFile(filePath) {
    const extension = path_1.default.extname(filePath).slice(1).toLowerCase();
    switch (extension) {
        case "mp3":
        case "ogg":
        case "webm":
            return extension;
        case "m4a":
        case "mp4":
            return "m4a";
        default:
            // Covers "wav" itself as well as every unrecognized extension.
            return "wav";
    }
}
|
|
1011
|
+
/**
 * Reports whether the first choice of a chat-completion response carries audio.
 *
 * Audio counts as present when either `message.audio`, or the `audio` field
 * of a content part typed `"audio"` or `"output_audio"`, is an object with a
 * string `url` or a string `data` property.
 *
 * @param {object} response Chat-completion response body.
 * @returns {boolean} `true` when audio output was found, otherwise `false`.
 */
function responseHasAudioOutput(response) {
    // An audio descriptor is usable when it exposes a string url or payload.
    const carriesAudio = (candidate) => Boolean(candidate) &&
        typeof candidate === "object" &&
        (typeof candidate.url === "string" || typeof candidate.data === "string");
    const message = response.choices?.[0]?.message;
    if (!message || typeof message !== "object") {
        return false;
    }
    if (carriesAudio(message.audio)) {
        return true;
    }
    const parts = message.content;
    if (!Array.isArray(parts)) {
        return false;
    }
    return parts.some((part) => Boolean(part) &&
        typeof part === "object" &&
        (part.type === "audio" || part.type === "output_audio") &&
        carriesAudio(part.audio));
}
|
|
1045
|
+
/**
 * Routes a request through `routeRequest` and parses the reply body as JSON.
 *
 * The payload is copied through a JSON round-trip before it is sent, and that
 * copy is returned as `requestPayload`. The routing call is given an abort
 * signal that fires after `timeoutMs`.
 *
 * @param paths Passed through to `routeRequest`.
 * @param model Model name used for routing.
 * @param {string} requestPath API path of the request.
 * @param {object} payload JSON-serializable request body.
 * @param {number} timeoutMs Timeout for the abort signal, in milliseconds.
 * @param requirements Supplies `preferredEndpointType`, `requiredInput`, `requiredOutput`.
 * @returns {Promise<object>} `{ statusCode, payload, contentType, requestPayload, route, poolMetrics }`.
 */
async function requestJson(paths, model, requestPath, payload, timeoutMs, requirements) {
    const requestPayload = JSON.parse(JSON.stringify(payload));
    const abortSignal = AbortSignal.timeout(timeoutMs);
    const routingOptions = {
        endpointType: requirements.preferredEndpointType,
        requiredInput: requirements.requiredInput,
        requiredOutput: requirements.requiredOutput,
    };
    const outcome = await (0, router_1.routeRequest)(paths, model, requestPath, requestPayload, {}, abortSignal, routingOptions);
    const upstream = outcome.attempt.response;
    const body = await readBody(upstream.body, upstream.headers);
    const parsedBody = parseJson(body.buffer);
    const attempt = outcome.attempt;
    return {
        statusCode: attempt.response.statusCode,
        payload: parsedBody,
        contentType: body.contentType,
        requestPayload,
        route: {
            endpointId: attempt.endpoint.id,
            endpointName: attempt.endpoint.name,
            upstreamModel: attempt.upstreamModel,
        },
        poolMetrics: attempt.pool,
    };
}
|
|
1067
|
+
/**
 * Routes a request through `routeRequest` and returns the raw reply bytes.
 *
 * The payload is copied through a JSON round-trip before it is sent, and that
 * copy is returned as `requestPayload`. The routing call is given an abort
 * signal that fires after `timeoutMs`.
 *
 * @param paths Passed through to `routeRequest`.
 * @param model Model name used for routing.
 * @param {string} requestPath API path of the request.
 * @param {object} payload JSON-serializable request body.
 * @param {number} timeoutMs Timeout for the abort signal, in milliseconds.
 * @param requirements Supplies `preferredEndpointType`, `requiredInput`, `requiredOutput`.
 * @returns {Promise<object>} `{ statusCode, buffer, contentType, requestPayload, route, poolMetrics }`.
 */
async function requestBinary(paths, model, requestPath, payload, timeoutMs, requirements) {
    const requestPayload = JSON.parse(JSON.stringify(payload));
    const abortSignal = AbortSignal.timeout(timeoutMs);
    const routingOptions = {
        endpointType: requirements.preferredEndpointType,
        requiredInput: requirements.requiredInput,
        requiredOutput: requirements.requiredOutput,
    };
    const outcome = await (0, router_1.routeRequest)(paths, model, requestPath, requestPayload, {}, abortSignal, routingOptions);
    const upstream = outcome.attempt.response;
    const body = await readBody(upstream.body, upstream.headers);
    const attempt = outcome.attempt;
    return {
        statusCode: attempt.response.statusCode,
        buffer: body.buffer,
        contentType: body.contentType,
        requestPayload,
        route: {
            endpointId: attempt.endpoint.id,
            endpointName: attempt.endpoint.name,
            upstreamModel: attempt.upstreamModel,
        },
        poolMetrics: attempt.pool,
    };
}
|
|
1088
|
+
/**
 * Drains an iterable of chunks into one Buffer and extracts the content type.
 *
 * Chunks that are not already Buffers are converted with `Buffer.from`. The
 * content type is looked up under `content-type`, then `Content-Type`; an
 * array value is joined with `", "` and a missing value becomes `""`.
 *
 * @param stream Async (or sync) iterable yielding body chunks.
 * @param {object} headers Response headers keyed by name.
 * @returns {Promise<{buffer: Buffer, contentType: string}>} Collected body and content type.
 */
async function readBody(stream, headers) {
    const pieces = [];
    for await (const piece of stream) {
        pieces.push(Buffer.isBuffer(piece) ? piece : Buffer.from(piece));
    }
    const buffer = Buffer.concat(pieces);
    const headerValue = headers["content-type"] ?? headers["Content-Type"];
    let contentType = headerValue ?? "";
    if (Array.isArray(headerValue)) {
        contentType = headerValue.join(", ");
    }
    return { buffer, contentType };
}
|
|
1100
|
+
/**
 * Decodes a buffer as UTF-8 and parses it leniently as JSON.
 *
 * @param {Buffer} buffer Raw response body.
 * @returns {*} `{}` when the trimmed text is empty, the parsed value when the
 *   text is valid JSON, otherwise `{ raw: <trimmed text> }`.
 */
function parseJson(buffer) {
    const body = buffer.toString("utf8").trim();
    if (body.length === 0) {
        return {};
    }
    let parsed;
    try {
        parsed = JSON.parse(body);
    }
    catch {
        // Not JSON: hand the text back so callers can still inspect it.
        parsed = { raw: body };
    }
    return parsed;
}
|
|
1112
|
+
/**
 * Returns the cached tools, optionally restricted to the requested names.
 *
 * @param {string[]|undefined} requestedNames Tool names to keep; when missing
 *   or empty, every cached tool is returned.
 * @returns {object[]} Cached tools whose `name` was requested, in cache order.
 *   Requested names with no matching cached tool are ignored.
 */
function getSelectedTools(requestedNames) {
    const available = (0, discovery_1.getCachedTools)();
    const wantsEverything = !requestedNames || requestedNames.length === 0;
    if (wantsEverything) {
        return available;
    }
    const wanted = new Set(requestedNames);
    return available.filter(({ name }) => wanted.has(name));
}
|
|
1120
|
+
/**
 * Extracts the assistant text from the first choice of a chat-completion response.
 *
 * @param {object} response Chat-completion response body.
 * @returns {string} The result of `parseMessageContent` applied to
 *   `choices[0].message.content`.
 */
function parseAssistantContent(response) {
    const firstChoice = response.choices?.[0];
    return parseMessageContent(firstChoice?.message?.content);
}
|
|
1124
|
+
/**
 * Resolves the sampling parameters for a request.
 *
 * Each parameter is taken from the scenario when it is neither `null` nor
 * `undefined`, otherwise from `effective.run`, otherwise from
 * `effective.defaults`. The resolved object is passed through `compactObject`.
 *
 * @param {object} scenario Scenario-level overrides.
 * @param {object} effective Configuration providing `run` and `defaults`.
 * @returns {object} Whatever `compactObject` returns for the resolved parameters.
 */
function buildGenerationParams(scenario, effective) {
    const parameterNames = [
        "temperature",
        "top_p",
        "max_tokens",
        "presence_penalty",
        "frequency_penalty",
        "seed",
        "stop",
    ];
    const resolved = {};
    for (const key of parameterNames) {
        resolved[key] = scenario[key] ?? effective.run[key] ?? effective.defaults[key];
    }
    return compactObject(resolved);
}
|
|
1149
|
+
/**
 * Flattens the `output` array of a `/v1/responses` reply into one string.
 *
 * `message` items contribute the text of each `output_text` content part;
 * `function_call` items with a string name contribute `tool_call:<name>`.
 * The pieces are joined with newlines and the result is trimmed.
 *
 * A missing or `null` response, a non-array `output`, and a non-array
 * `content` are all treated as empty rather than throwing, because the
 * response body can be any parsed JSON value (including `null`).
 *
 * @param {object|null|undefined} response Parsed response body.
 * @returns {string} The flattened text, or `""` when nothing was extracted.
 */
function extractResponsesOutputText(response) {
    const outputItems = Array.isArray(response?.output) ? response.output : [];
    const parts = [];
    for (const item of outputItems) {
        if (!item || typeof item !== "object") {
            continue;
        }
        if (item.type === "message") {
            const contentParts = Array.isArray(item.content) ? item.content : [];
            for (const part of contentParts) {
                if (part?.type === "output_text" && typeof part.text === "string") {
                    parts.push(part.text);
                }
            }
            continue;
        }
        if (item.type === "function_call" && typeof item.name === "string") {
            parts.push(`tool_call:${item.name}`);
        }
    }
    return parts.join("\n").trim();
}
|
|
1169
|
+
/**
 * Normalizes chat message content to plain text.
 *
 * A string is returned unchanged. For an array, string entries and the string
 * `text` of object entries are collected, empty strings are dropped, and the
 * rest is joined with newlines. Any other value yields `""`.
 *
 * @param {*} content The `content` field of a chat message.
 * @returns {string} The extracted text.
 */
function parseMessageContent(content) {
    if (typeof content === "string") {
        return content;
    }
    if (!Array.isArray(content)) {
        return "";
    }
    const texts = [];
    for (const part of content) {
        let text = "";
        if (typeof part === "string") {
            text = part;
        }
        else if (part && typeof part === "object" && typeof part.text === "string") {
            text = part.text;
        }
        if (text.length > 0) {
            texts.push(text);
        }
    }
    return texts.join("\n");
}
|
|
1189
|
+
/**
 * Checks a run's observed values against the scenario's assertions.
 *
 * Checks run in a fixed order and the first violated one decides the result.
 *
 * @param {object} scenario - Scenario whose `assertions` object is evaluated.
 * @param {object} runtime - Observed values (output, toolNames, toolCalls,
 *   latencyMs, statusCode and the optional per-mode counters read below).
 * @returns {string|null} Failure message of the first violated assertion, or
 *   `null` when none is violated.
 */
function evaluateAssertions(scenario, runtime) {
    const rules = scenario.assertions;
    const isNumber = (candidate) => typeof candidate === "number";
    // Returns a wrapper so a matching element that is itself falsy is still reported.
    const firstMatch = (candidates, predicate) => {
        for (const candidate of candidates ?? []) {
            if (predicate(candidate)) {
                return { value: candidate };
            }
        }
        return null;
    };

    const missingOutput = firstMatch(rules.contains, (needle) => !runtime.output.includes(needle));
    if (missingOutput) {
        return `Assertion failed: output must include '${missingOutput.value}'.`;
    }
    const forbiddenOutput = firstMatch(rules.notContains, (needle) => runtime.output.includes(needle));
    if (forbiddenOutput) {
        return `Assertion failed: output must not include '${forbiddenOutput.value}'.`;
    }
    const unusedTool = firstMatch(rules.requiredToolNames, (name) => !runtime.toolNames.includes(name));
    if (unusedTool) {
        return `Assertion failed: expected tool '${unusedTool.value}' to be used.`;
    }

    if (isNumber(rules.minToolCalls) && runtime.toolCalls < rules.minToolCalls) {
        return `Assertion failed: expected at least ${rules.minToolCalls} tool calls, got ${runtime.toolCalls}.`;
    }
    if (isNumber(rules.maxToolCalls) && runtime.toolCalls > rules.maxToolCalls) {
        return `Assertion failed: expected at most ${rules.maxToolCalls} tool calls, got ${runtime.toolCalls}.`;
    }
    if (isNumber(rules.maxLatencyMs) && runtime.latencyMs > rules.maxLatencyMs) {
        return `Assertion failed: latency ${runtime.latencyMs}ms exceeded ${rules.maxLatencyMs}ms.`;
    }
    // The status code is compared unconditionally, unlike the optional checks around it.
    if (runtime.statusCode !== rules.statusCode) {
        return `Assertion failed: expected status ${rules.statusCode}, got ${runtime.statusCode}.`;
    }

    if (isNumber(rules.minItems)) {
        const itemCount = runtime.embeddingsItems ?? 0;
        if (itemCount < rules.minItems) {
            return `Assertion failed: expected at least ${rules.minItems} embeddings, got ${itemCount}.`;
        }
    }
    if (isNumber(rules.minVectorLength)) {
        const dimensions = runtime.embeddingsVectorLength ?? 0;
        if (dimensions < rules.minVectorLength) {
            return `Assertion failed: expected vector length >= ${rules.minVectorLength}, got ${dimensions}.`;
        }
    }
    if (isNumber(rules.minImages)) {
        const imageCount = runtime.imagesCount ?? 0;
        if (imageCount < rules.minImages) {
            return `Assertion failed: expected at least ${rules.minImages} images, got ${imageCount}.`;
        }
    }

    const missingText = firstMatch(rules.containsText, (needle) => !runtime.output.includes(needle));
    if (missingText) {
        return `Assertion failed: transcription must include '${missingText.value}'.`;
    }
    const forbiddenText = firstMatch(rules.notContainsText, (needle) => runtime.output.includes(needle));
    if (forbiddenText) {
        return `Assertion failed: transcription must not include '${forbiddenText.value}'.`;
    }

    if (isNumber(rules.minBytes)) {
        const byteCount = runtime.bytesLength ?? 0;
        if (byteCount < rules.minBytes) {
            return `Assertion failed: expected at least ${rules.minBytes} bytes, got ${byteCount}.`;
        }
    }
    if (rules.contentType) {
        const actualType = runtime.contentType ?? "";
        // Case-insensitive substring match.
        const matches = actualType.toLowerCase().includes(rules.contentType.toLowerCase());
        if (!matches) {
            return `Assertion failed: expected content type to include '${rules.contentType}', got '${actualType}'.`;
        }
    }
    return null;
}
|
|
1260
|
+
/**
 * Builds the result record for a scenario that was not executed.
 *
 * Descriptive fields are copied from the scenario, all counters and latency
 * figures are zero, and the skip reason is stored as both `skippedReason` and
 * `verdict`. The record is marked `success: true` with `passRate: 1`.
 *
 * @param {object} scenario - Scenario definition being skipped.
 * @param {string} reason - Explanation for the skip.
 * @returns {object} Scenario result with `status: "skipped"`.
 */
function buildSkippedScenarioResult(scenario, reason) {
    const { id, mode, title, summary, userVisibleGoal, exampleSource, successCriteria, expectedHighlights } = scenario;
    const inputPreview = scenario.inputPreview ?? describeScenarioInput(scenario);
    const model = scenario.model ?? "unresolved";
    const description = {
        id,
        mode,
        title,
        summary,
        userVisibleGoal,
        exampleSource,
        inputPreview,
        successCriteria,
        expectedHighlights,
        model,
    };
    const outcome = {
        status: "skipped",
        success: true,
        skippedReason: reason,
        passRate: 1,
        passedRuns: 0,
        failedRuns: 0,
    };
    const zeroedMetrics = {
        avgLatencyMs: 0,
        p50LatencyMs: 0,
        p95LatencyMs: 0,
        p99LatencyMs: 0,
        totalTokens: 0,
        totalToolCalls: 0,
        avgThroughputTokensPerSec: 0,
        candidateAttempts: 0,
        failovers: 0,
        rateLimitSwitches: 0,
        distinctProviders: 0,
        distinctModels: 0,
    };
    return {
        ...description,
        ...outcome,
        ...zeroedMetrics,
        errorReasons: [],
        usedToolNames: [],
        verdict: reason,
        outputPreview: "",
        audioOutputRuns: 0,
    };
}
|
|
1297
|
+
/**
 * Aggregates the samples of one scenario into its result record.
 *
 * @param {object} scenario - Scenario definition (descriptive fields are copied).
 * @param {string} model - Model reference recorded on the result.
 * @param {Array<object>} samples - Per-run samples to aggregate.
 * @param {number} minScenarioPassRate - Pass-rate threshold; the scenario is
 *   "passed" when the observed pass rate is at least this value.
 * @returns {object} Aggregated scenario result.
 */
function buildScenarioResult(scenario, model, samples, minScenarioPassRate) {
    const runCount = samples.length;
    const latencies = samples.map((sample) => sample.latencyMs).sort((a, b) => a - b);

    let passedRuns = 0;
    let totalTokens = 0;
    let totalToolCalls = 0;
    let candidateAttempts = 0;
    let failovers = 0;
    let rateLimitSwitches = 0;
    let distinctProviders = 0;
    let distinctModels = 0;
    let audioOutputRuns = 0;
    let throughputSum = 0;
    let firstError;
    const errorTally = new Map();
    for (const sample of samples) {
        if (sample.success) {
            passedRuns += 1;
        }
        totalTokens = totalTokens + sample.tokens;
        totalToolCalls = totalToolCalls + sample.toolCalls;
        candidateAttempts += sample.candidateAttempts ?? 0;
        failovers += sample.failovers ?? 0;
        rateLimitSwitches += sample.rateLimitSwitches ?? 0;
        // Distinct counts take the largest value seen in any run, not a sum.
        distinctProviders = Math.max(distinctProviders, sample.distinctProviders ?? 0);
        distinctModels = Math.max(distinctModels, sample.distinctModels ?? 0);
        audioOutputRuns += sample.audioOutputPresent ? 1 : 0;
        throughputSum = throughputSum + sample.throughputTokensPerSec;
        if (sample.error) {
            if (firstError === undefined) {
                firstError = sample.error;
            }
            errorTally.set(sample.error, (errorTally.get(sample.error) ?? 0) + 1);
        }
    }

    const failedRuns = runCount - passedRuns;
    const passRate = runCount > 0 ? passedRuns / runCount : 0;
    let avgLatencyMs = 0;
    if (latencies.length > 0) {
        const latencySum = latencies.reduce((sum, value) => sum + value, 0);
        avgLatencyMs = Math.round(latencySum / latencies.length);
    }
    const avgThroughputTokensPerSec = runCount > 0 ? throughputSum / runCount : 0;

    // Most frequent error first; ties keep first-seen order.
    const errorReasons = Array.from(errorTally.entries())
        .sort((left, right) => right[1] - left[1])
        .map(([reason, count]) => `${reason} (${count})`);

    // Preview comes from the latest sample that produced one.
    let outputPreview = "";
    for (let index = runCount - 1; index >= 0; index -= 1) {
        const preview = samples[index].outputPreview;
        if (preview) {
            outputPreview = preview;
            break;
        }
    }

    const passed = passRate >= minScenarioPassRate;
    const status = passed ? "passed" : "failed";
    const verdict = passed
        ? "All assertions passed."
        : (firstError ?? errorReasons[0] ?? "Scenario failed.");
    return {
        id: scenario.id,
        mode: scenario.mode,
        title: scenario.title,
        summary: scenario.summary,
        userVisibleGoal: scenario.userVisibleGoal,
        exampleSource: scenario.exampleSource,
        inputPreview: scenario.inputPreview ?? describeScenarioInput(scenario),
        successCriteria: scenario.successCriteria,
        expectedHighlights: scenario.expectedHighlights,
        model,
        status,
        success: passed,
        passRate: Number(passRate.toFixed(4)),
        passedRuns,
        failedRuns,
        avgLatencyMs,
        p50LatencyMs: percentile(latencies, 50),
        p95LatencyMs: percentile(latencies, 95),
        p99LatencyMs: percentile(latencies, 99),
        totalTokens,
        totalToolCalls,
        avgThroughputTokensPerSec: Number(avgThroughputTokensPerSec.toFixed(3)),
        candidateAttempts,
        failovers,
        rateLimitSwitches,
        distinctProviders,
        distinctModels,
        audioOutputRuns,
        usedToolNames: uniqueToolNames(samples),
        verdict,
        errorReasons,
        outputPreview,
    };
}
|
|
1365
|
+
/**
 * Collects the distinct tool names used across a set of samples.
 *
 * Samples without a `usedToolNames` list (or nullish samples) contribute
 * nothing instead of raising a TypeError, matching how the other per-sample
 * aggregations in this file tolerate missing optional fields.
 *
 * @param {Array<object>} samples - Samples with an optional `usedToolNames` array.
 * @returns {string[]} Unique tool names in default (code-unit) sort order.
 */
function uniqueToolNames(samples) {
    const names = new Set();
    for (const sample of samples) {
        for (const toolName of sample?.usedToolNames ?? []) {
            names.add(toolName);
        }
    }
    return Array.from(names).sort();
}
|
|
1374
|
+
/**
 * Picks a display string describing a scenario's input.
 *
 * Preference order: `inputPreview`, `prompt`, `inputText`, a string `input`,
 * an array `input` joined with " | ", then `audioFile`. Falls back to "".
 *
 * @param {object} scenario - Scenario definition.
 * @returns {string} The first available description, or an empty string.
 */
function describeScenarioInput(scenario) {
    const explicitText = scenario.inputPreview || scenario.prompt || scenario.inputText;
    if (explicitText) {
        return explicitText;
    }
    const rawInput = scenario.input;
    if (typeof rawInput === "string") {
        return rawInput;
    }
    if (Array.isArray(rawInput)) {
        return rawInput.join(" | ");
    }
    return scenario.audioFile ? scenario.audioFile : "";
}
|
|
1395
|
+
/**
 * Derives a per-model capability matrix from benchmark executions.
 *
 * Executions whose scenario declares no `capability` are ignored. For every
 * model, the finding kept for each capability is the first one seen unless
 * `shouldReplaceFinding` prefers a later one. Capabilities with no finding
 * are reported as "unknown" with zero confidence.
 *
 * @param {object} effective - Effective config; reads `run.capTtlDays`
 *   (default 7), `run.suite` and `profile`.
 * @param {Array<object>} executions - Items with `scenario`, `result`, `samples`.
 * @returns {object|undefined} `{ generatedAt, ttlDays, models }`, or
 *   `undefined` when no execution carried a capability.
 */
function buildCapabilityMatrix(effective, executions) {
    const ttlDays = effective.run.capTtlDays ?? 7;
    const ttlMs = ttlDays * 24 * 60 * 60 * 1000;
    const nowIso = () => new Date().toISOString();

    const recordsByModel = new Map();
    for (const execution of executions) {
        const { scenario, result } = execution;
        const capability = scenario.capability;
        if (!capability) {
            continue;
        }
        const { providerId, modelId } = splitModelRef(result.model);
        const modelKey = `${providerId}/${modelId}`;
        let record = recordsByModel.get(modelKey);
        if (record === undefined || record === null) {
            record = { providerId, modelId, findings: {}, lastVerifiedAt: nowIso() };
        }
        const status = classifyFromExecution(execution);
        const confidence = confidenceFromExecution(status, result);
        const primaryReason = result.errorReasons[0] ?? result.outputPreview;
        // First sample that carries a positive HTTP status; otherwise 0 for skipped runs, 200 for the rest.
        const respondedSample = execution.samples.find((sample) => sample.statusCode > 0);
        const statusCode = respondedSample?.statusCode ?? (result.status === "skipped" ? 0 : 200);
        const candidate = {
            status,
            confidence,
            evidence: truncate(primaryReason || "No explicit evidence", 220),
            observedAt: nowIso(),
            scenarioId: scenario.id,
            statusCode: statusCode > 0 ? statusCode : undefined,
        };
        const previous = record.findings[capability];
        const replaces = !previous ||
            shouldReplaceFinding(previous.status, candidate.status, previous.confidence, candidate.confidence);
        if (replaces) {
            record.findings[capability] = candidate;
        }
        record.lastVerifiedAt = nowIso();
        recordsByModel.set(modelKey, record);
    }

    // Expands one model record into an entry for every known capability key.
    const describeCapability = (record, capability) => {
        const finding = record.findings[capability];
        if (!finding) {
            return {
                capability,
                status: "unknown",
                confidence: 0,
                evidence: "No probe evidence in this run.",
                observedAt: record.lastVerifiedAt,
            };
        }
        return {
            capability,
            status: finding.status,
            confidence: finding.confidence,
            evidence: finding.evidence,
            scenarioId: finding.scenarioId,
            statusCode: finding.statusCode,
            observedAt: finding.observedAt,
        };
    };

    const models = Array.from(recordsByModel.entries(), ([key, record]) => {
        const findings = Object.fromEntries(types_1.BENCHMARK_CAPABILITY_KEYS.map((capability) => [capability, describeCapability(record, capability)]));
        const confidenceValues = Object.values(findings).map((finding) => finding.confidence);
        let avgConfidence = 0;
        if (confidenceValues.length > 0) {
            avgConfidence = confidenceValues.reduce((sum, value) => sum + value, 0) / confidenceValues.length;
        }
        const expiresAt = new Date(Date.parse(record.lastVerifiedAt) + ttlMs).toISOString();
        return {
            model: key,
            providerId: record.providerId,
            modelId: record.modelId,
            configFingerprint: (0, capabilityStore_1.computeConfigFingerprint)({
                suite: effective.run.suite,
                model: key,
                profile: effective.profile,
            }),
            confidence: Number(avgConfidence.toFixed(3)),
            lastVerifiedAt: record.lastVerifiedAt,
            expiresAt,
            freshness: Date.now() <= Date.parse(expiresAt) ? "fresh" : "stale",
            findings,
        };
    });
    models.sort((left, right) => left.model.localeCompare(right.model));
    if (models.length === 0) {
        return undefined;
    }
    return {
        generatedAt: nowIso(),
        ttlDays,
        models,
    };
}
|
|
1492
|
+
/**
 * Maps one execution to a capability status.
 *
 * Skipped executions are "unknown" and successful ones "supported". Failures
 * are delegated to `classifyCapabilityStatus`, using the first failing sample
 * (or the first sample when none is flagged as failing).
 *
 * @param {object} execution - Item with `result` and `samples`.
 * @returns {string} The capability status.
 */
function classifyFromExecution(execution) {
    const { result } = execution;
    if (result.status === "skipped") {
        return "unknown";
    }
    if (result.success) {
        return "supported";
    }
    const { samples } = execution;
    const failing = samples.find((candidate) => !candidate.success) ?? samples[0];
    const evidence = {
        success: false,
        statusCode: failing?.statusCode,
        error: failing?.error ?? result.errorReasons[0],
    };
    return (0, capabilityClassifier_1.classifyCapabilityStatus)(evidence);
}
|
|
1506
|
+
/**
 * Assigns a confidence score to a capability status.
 *
 * "supported" uses the scenario pass rate with a floor of 0.5; "unsupported"
 * and "misconfigured" score 0.9; everything else scores 0.4.
 *
 * @param {string} status - Capability status.
 * @param {object} result - Scenario result; only `passRate` is read.
 * @returns {number} Confidence value.
 */
function confidenceFromExecution(status, result) {
    switch (status) {
        case "supported":
            return Math.max(0.5, result.passRate);
        case "unsupported":
        case "misconfigured":
            return 0.9;
        default:
            return 0.4;
    }
}
|
|
1515
|
+
/**
 * Decides whether a new finding should replace the stored one.
 *
 * Statuses are ranked supported > unsupported > misconfigured > unknown. A
 * different rank wins only when it is higher; equal ranks are settled by
 * confidence, with ties going to the new finding. Unrecognised statuses have
 * no rank (`undefined`), so they never compare as higher.
 *
 * @param {string} currentStatus - Status of the stored finding.
 * @param {string} nextStatus - Status of the candidate finding.
 * @param {number} currentConfidence - Confidence of the stored finding.
 * @param {number} nextConfidence - Confidence of the candidate finding.
 * @returns {boolean} `true` when the candidate should replace the stored one.
 */
function shouldReplaceFinding(currentStatus, nextStatus, currentConfidence, nextConfidence) {
    const priority = new Map([
        ["supported", 4],
        ["unsupported", 3],
        ["misconfigured", 2],
        ["unknown", 1],
    ]);
    const currentRank = priority.get(currentStatus);
    const nextRank = priority.get(nextStatus);
    if (nextRank === currentRank) {
        return nextConfidence >= currentConfidence;
    }
    return nextRank > currentRank;
}
|
|
1533
|
+
/**
 * Splits a `provider/model` reference at its first slash.
 *
 * When there is no slash, or nothing precedes the first one, the provider is
 * reported as "unknown" and the whole input is kept as the model id.
 *
 * @param {string} model - Model reference.
 * @returns {{providerId: string, modelId: string}} The two parts.
 */
function splitModelRef(model) {
    const slashAt = model.indexOf("/");
    // -1: no slash at all; 0: empty provider segment.
    if (slashAt <= 0) {
        return { providerId: "unknown", modelId: model };
    }
    const providerId = model.slice(0, slashAt);
    const modelId = model.slice(slashAt + 1);
    return { providerId, modelId };
}
|
|
1543
|
+
/**
 * Assembles the final benchmark report from all scenario executions.
 *
 * Token and tool-call totals cover executed (non-skipped) scenarios only;
 * latency and throughput statistics are computed over every sample.
 *
 * @param {object} effective - Effective configuration for the run.
 * @param {Array} warnings - Warnings to attach to the report.
 * @param {string} scenarioPath - Scenario path recorded on the report.
 * @param {Array<object>} executions - Items with `result`, `samples`,
 *   `exchanges` and `example`.
 * @param {object|undefined} capabilityMatrix - Capability matrix to attach.
 * @param {string} [reportId] - Report id; a random UUID is generated when nullish.
 * @returns {object} The report object.
 */
function buildReport(effective, warnings, scenarioPath, executions, capabilityMatrix, reportId) {
    const results = executions.map((execution) => execution.result);
    const allSamples = executions.flatMap((execution) => execution.samples);
    const latencies = allSamples.map((sample) => sample.latencyMs).sort((a, b) => a - b);

    let executed = 0;
    let skipped = 0;
    let succeeded = 0;
    let failed = 0;
    let totalTokens = 0;
    let totalToolCalls = 0;
    for (const result of results) {
        if (result.status === "skipped") {
            skipped += 1;
            continue;
        }
        executed += 1;
        totalTokens = totalTokens + result.totalTokens;
        totalToolCalls = totalToolCalls + result.totalToolCalls;
        if (result.status === "passed") {
            succeeded += 1;
        }
        else if (result.status === "failed") {
            failed += 1;
        }
    }

    let avgLatencyMs = 0;
    if (latencies.length > 0) {
        const latencySum = latencies.reduce((sum, value) => sum + value, 0);
        avgLatencyMs = Math.round(latencySum / latencies.length);
    }
    let avgThroughputTokensPerSec = 0;
    if (allSamples.length > 0) {
        const throughputSum = allSamples.reduce((sum, sample) => sum + sample.throughputTokensPerSec, 0);
        avgThroughputTokensPerSec = throughputSum / allSamples.length;
    }
    const topFailureReasons = collectTopFailureReasons(allSamples);
    const modeSummary = summarizeByMode(results);
    const { run } = effective;

    const scenarioDetails = executions.map(({ result, example, exchanges }) => ({
        id: result.id,
        suite: run.suite,
        example,
        model: result.model,
        status: result.status,
        verdict: result.verdict,
        exchanges,
        finalResponsePreview: result.outputPreview,
        usedToolNames: result.usedToolNames,
    }));
    const scenarioRuns = executions.map(({ result, samples }) => ({
        id: result.id,
        samples,
    }));

    return {
        id: reportId ?? (0, crypto_1.randomUUID)(),
        createdAt: new Date().toISOString(),
        profile: effective.profile,
        executionMode: run.executionMode ?? "diagnostic",
        suite: run.suite,
        exampleId: run.exampleId,
        scenarioPath,
        modelOverride: run.modelOverride,
        configSource: effective.configSource,
        total: results.length,
        executed,
        skipped,
        succeeded,
        failed,
        // Success rate is relative to executed scenarios; skipped ones are excluded.
        successRate: executed > 0 ? Number((succeeded / executed).toFixed(4)) : 0,
        totalTokens,
        totalToolCalls,
        avgLatencyMs,
        p50LatencyMs: percentile(latencies, 50),
        p95LatencyMs: percentile(latencies, 95),
        p99LatencyMs: percentile(latencies, 99),
        avgThroughputTokensPerSec: Number(avgThroughputTokensPerSec.toFixed(3)),
        modeSummary,
        effectiveConfig: {
            defaults: effective.defaults,
            profileSettings: effective.profileSettings,
            gates: effective.gates,
        },
        results,
        scenarioDetails,
        scenarioRuns,
        warnings,
        topFailureReasons,
        capabilityMatrix,
    };
}
|
|
1614
|
+
/**
 * Builds the trace event for one request/response exchange.
 *
 * Both payloads are stored twice: sanitized via `sanitizeForTrace` and as a
 * JSON round-tripped copy via `safeSerialize`. Previews are the sanitized
 * payloads rendered as text and cut to 220 characters.
 *
 * @param {object} args - Exchange details (scenario, payloads, mode, model,
 *   request path, status code, content type, endpoint info, tool trace).
 * @returns {object} The exchange event.
 */
function buildExchangeEvent(args) {
    const PREVIEW_LIMIT = 220;
    const { scenario, requestPayload, responsePayload } = args;
    const requestSanitized = sanitizeForTrace(requestPayload);
    const responseSanitized = sanitizeForTrace(responsePayload);
    const scenarioInput = describeScenarioInput(scenario);
    const requestPreview = truncate(previewForTrace(requestSanitized), PREVIEW_LIMIT);
    const responsePreview = truncate(previewForTrace(responseSanitized), PREVIEW_LIMIT);
    return {
        scenarioInput,
        requestPreview,
        responsePreview,
        mode: args.mode,
        model: args.model,
        requestPath: args.requestPath,
        statusCode: args.statusCode,
        contentType: args.contentType,
        endpointId: args.endpointId,
        endpointName: args.endpointName,
        upstreamModel: args.upstreamModel,
        toolTrace: args.toolTrace ?? [],
        requestRaw: safeSerialize(requestPayload),
        requestSanitized,
        responseRaw: safeSerialize(responsePayload),
        responseSanitized,
    };
}
|
|
1636
|
+
/**
 * Projects an exchange event onto its summary fields and stamps it with the
 * current time. Fields not listed below (such as the raw payloads) are left out.
 *
 * @param {object} event - Exchange event.
 * @returns {object} Summary with a fresh ISO `timestamp`.
 */
function toExchangeSummary(event) {
    const copiedFields = [
        "mode",
        "model",
        "requestPath",
        "statusCode",
        "contentType",
        "requestSanitized",
        "responseSanitized",
        "requestPreview",
        "responsePreview",
        "endpointId",
        "endpointName",
        "upstreamModel",
        "toolTrace",
    ];
    const summary = { timestamp: new Date().toISOString() };
    for (const field of copiedFields) {
        summary[field] = event[field];
    }
    return summary;
}
|
|
1654
|
+
/**
 * Converts a scenario definition into its listing summary, filling defaults
 * for any descriptive field the scenario leaves unset.
 *
 * @param {object} scenario - Scenario definition.
 * @param {string} [suite] - Suite name; when nullish the summary uses "custom"
 *   and the default example source is "file" instead of "builtin".
 * @returns {object} Scenario summary.
 */
function scenarioToSummary(scenario, suite) {
    const { id, mode, model } = scenario;
    const fallbackSource = suite ? "builtin" : "file";
    const title = scenario.title ?? id;
    const summary = scenario.summary ?? "Benchmark scenario";
    const userVisibleGoal = scenario.userVisibleGoal ?? "Inspect the exact request, response, and final verdict.";
    const exampleSource = scenario.exampleSource ?? fallbackSource;
    const successCriteria = scenario.successCriteria ?? "All configured assertions pass.";
    const expectedHighlights = scenario.expectedHighlights ?? [];
    return {
        id,
        suite: suite ?? "custom",
        mode,
        title,
        summary,
        userVisibleGoal,
        exampleSource,
        inputPreview: describeScenarioInput(scenario),
        successCriteria,
        expectedHighlights,
        requiresAvailableTools: scenario.requiresAvailableTools === true,
        model,
    };
}
|
|
1670
|
+
/**
 * Builds a flat trace of tool activity: every named tool call first, followed
 * by every tool result. Calls without a function name are omitted.
 *
 * @param {Iterable<object>} toolCalls - Calls with `id` and an optional
 *   `function` object carrying `name` and `arguments`.
 * @param {Iterable<object>} toolResults - Results with `name`, `toolCallId`
 *   and `content`.
 * @returns {Array<object>} Trace entries of kind "tool_call" or "tool_result".
 */
function buildToolTrace(toolCalls, toolResults) {
    const entries = [];
    for (const call of toolCalls) {
        const fn = call.function;
        const toolName = fn?.name;
        if (toolName) {
            entries.push({
                kind: "tool_call",
                toolName,
                toolCallId: call.id,
                argumentsText: fn?.arguments,
            });
        }
    }
    for (const outcome of toolResults) {
        const sanitizedContent = sanitizeForTrace(outcome.content);
        entries.push({
            kind: "tool_result",
            toolName: outcome.name,
            toolCallId: outcome.toolCallId,
            contentText: previewForTrace(sanitizedContent),
        });
    }
    return entries;
}
|
|
1694
|
+
/**
 * Returns a JSON round-tripped copy of a value.
 *
 * If the round trip throws (for example on a circular structure, or because
 * `JSON.stringify` produced `undefined`), the result is `{ preview }` holding
 * the value's string form instead.
 *
 * @param {unknown} value - Value to copy.
 * @returns {unknown} The JSON-compatible copy, or the `{ preview }` fallback.
 */
function safeSerialize(value) {
    let snapshot;
    try {
        const encoded = JSON.stringify(value);
        snapshot = JSON.parse(encoded);
    }
    catch {
        snapshot = { preview: String(value) };
    }
    return snapshot;
}
|
|
1702
|
+
/**
 * Produces a trace-friendly copy of an arbitrary payload.
 *
 * - Nesting deeper than 6 levels is replaced by "[truncated-depth]".
 * - Strings are trimmed; long base64-looking strings and long data URLs are
 *   replaced by a placeholder with their length; other strings are cut to
 *   500 characters.
 * - Arrays longer than 50 items become `{ summary, sample }` with the first
 *   10 items sanitized.
 * - Object keys matching api-key/authorization/token/secret are masked as
 *   "***"; an `embedding` array becomes `{ summary, sample }` with its first
 *   8 values.
 *
 * @param {unknown} value - Payload to sanitize.
 * @param {number} [depth=0] - Current recursion depth (internal).
 * @returns {unknown} Sanitized copy; `null`/`undefined` pass through.
 */
function sanitizeForTrace(value, depth = 0) {
    if (depth > 6) {
        return "[truncated-depth]";
    }
    if (value === null || value === undefined) {
        return value;
    }
    if (typeof value === "string") {
        const trimmed = value.trim();
        if (looksLikeBase64(trimmed) && trimmed.length > 64) {
            return `<base64 omitted len=${trimmed.length}>`;
        }
        if (trimmed.startsWith("data:") && trimmed.length > 80) {
            // The media type ends at the first ";" (parameters) or "," (data).
            // Searching for ";" alone returned -1 for URLs such as
            // "data:text/plain,...", and slice(5, -1) then copied almost the
            // entire payload into the placeholder.
            const mimeEnd = trimmed.search(/[;,]/);
            const mime = (mimeEnd === -1 ? "" : trimmed.slice(5, mimeEnd)) || "unknown";
            return `<data-url ${mime} omitted len=${trimmed.length}>`;
        }
        if (trimmed.length > 500) {
            return `${trimmed.slice(0, 500)}…`;
        }
        return trimmed;
    }
    if (typeof value === "number" || typeof value === "boolean") {
        return value;
    }
    if (Array.isArray(value)) {
        if (value.length > 50) {
            return {
                summary: `array(${value.length})`,
                sample: value.slice(0, 10).map((item) => sanitizeForTrace(item, depth + 1)),
            };
        }
        return value.map((item) => sanitizeForTrace(item, depth + 1));
    }
    if (typeof value === "object") {
        const out = {};
        for (const [key, item] of Object.entries(value)) {
            // NOTE(review): "token" also matches counters such as max_tokens or
            // total_tokens, which are masked too — confirm this is intended.
            if (/(api[-_]?key|authorization|token|secret)/i.test(key)) {
                out[key] = "***";
                continue;
            }
            if (key === "embedding" && Array.isArray(item)) {
                out[key] = {
                    summary: `vector(${item.length})`,
                    sample: item.slice(0, 8),
                };
                continue;
            }
            out[key] = sanitizeForTrace(item, depth + 1);
        }
        return out;
    }
    // Remaining types (bigint, symbol, function) are rendered as text.
    return String(value);
}
|
|
1755
|
+
/**
 * Heuristic check for base64 text: at least 32 characters, a length that is a
 * multiple of 4, and only characters from the base64 alphabet (including "=").
 *
 * @param {string} value - Candidate text.
 * @returns {boolean} `true` when the text passes all three checks.
 */
function looksLikeBase64(value) {
    const longEnough = value.length >= 32;
    const wholeQuads = value.length % 4 === 0;
    return longEnough && wholeQuads && /^[A-Za-z0-9+/=]+$/.test(value);
}
|
|
1761
|
+
/**
 * Renders a value as text for trace previews.
 *
 * Strings are returned unchanged; other values are JSON-encoded. The result is
 * always a string: when JSON encoding throws, or yields `undefined` (as it
 * does for `undefined`, functions and symbols), the value's string form is
 * used instead so callers can safely measure and truncate the preview.
 *
 * @param {unknown} value - Value to render.
 * @returns {string} Text preview.
 */
function previewForTrace(value) {
    if (typeof value === "string") {
        return value;
    }
    try {
        // JSON.stringify returns undefined (not a string) for unsupported roots.
        return JSON.stringify(value) ?? String(value);
    }
    catch {
        return String(value);
    }
}
|
|
1772
|
+
/**
 * Forwards an event to `hooks.onEvent` when both the hooks object and the
 * handler are present; otherwise does nothing.
 *
 * @param {object|null|undefined} hooks - Optional hooks object.
 * @param {unknown} event - Event passed to the handler.
 * @returns {void}
 */
function emitEvent(hooks, event) {
    if (hooks === null || hooks === undefined) {
        return;
    }
    const handler = hooks.onEvent;
    if (handler === null || handler === undefined) {
        return;
    }
    // Invoke as a method so the handler keeps `hooks` as its receiver.
    handler.call(hooks, event);
}
|
|
1775
|
+
/**
 * Counts scenario results per benchmark mode.
 *
 * Every mode in `BENCHMARK_MODES` gets a row, even with no results. Skipped
 * results count toward `total` and `skipped` only; all others count as
 * executed and then as passed or failed.
 *
 * @param {Array<object>} results - Scenario results with `mode` and `status`.
 * @returns {Object<string, {total: number, executed: number, skipped: number,
 *   passed: number, failed: number}>} Counters keyed by mode.
 */
function summarizeByMode(results) {
    const emptyRow = () => ({ total: 0, executed: 0, skipped: 0, passed: 0, failed: 0 });
    const summary = Object.fromEntries(types_1.BENCHMARK_MODES.map((mode) => [mode, emptyRow()]));
    for (const { mode, status } of results) {
        const row = summary[mode];
        row.total += 1;
        switch (status) {
            case "skipped":
                row.skipped += 1;
                break;
            case "passed":
                row.executed += 1;
                row.passed += 1;
                break;
            default:
                // Any other executed status is tallied as a failure.
                row.executed += 1;
                row.failed += 1;
                break;
        }
    }
    return summary;
}
|
|
1797
|
+
/**
 * Tallies sample errors and returns the five most frequent.
 *
 * Samples without a truthy `error` are ignored. Reasons with equal counts keep
 * the order in which they were first seen.
 *
 * @param {Array<object>} samples - Samples with an optional `error`.
 * @returns {Array<{reason: string, count: number}>} Up to five entries,
 *   highest count first.
 */
function collectTopFailureReasons(samples) {
    const MAX_REASONS = 5;
    const tally = new Map();
    for (const { error } of samples) {
        if (error) {
            const seenSoFar = tally.get(error) ?? 0;
            tally.set(error, seenSoFar + 1);
        }
    }
    const ranked = Array.from(tally, ([reason, count]) => ({ reason, count }));
    ranked.sort((left, right) => right.count - left.count);
    return ranked.slice(0, MAX_REASONS);
}
|
|
1810
|
+
/**
 * Returns a shallow copy of an object without its `undefined`-valued
 * properties. Other falsy values (null, 0, "", false) are kept.
 *
 * @param {object} source - Object to copy.
 * @returns {object} New object holding only the defined properties.
 */
function compactObject(source) {
    const definedPairs = [];
    for (const pair of Object.entries(source)) {
        if (pair[1] !== undefined) {
            definedPairs.push(pair);
        }
    }
    return Object.fromEntries(definedPairs);
}
|
|
1813
|
+
/**
 * Nearest-rank percentile of an ascending-sorted array.
 *
 * The rank is computed as `ceil(p * n / 100)`. Multiplying before dividing
 * keeps exact ranks exact: `(p / 100) * n` can land just above an integer
 * through floating-point rounding (7 / 100 * 100 is 7.000000000000001) and
 * select the next element. The index is clamped to the array bounds, so
 * `p <= 0` yields the first element and `p >= 100` the last.
 *
 * @param {number[]} sorted - Values sorted in ascending order.
 * @param {number} p - Percentile, normally in the range 0-100.
 * @returns {number} The selected value, or 0 for an empty array.
 */
function percentile(sorted, p) {
    if (sorted.length === 0) {
        return 0;
    }
    const rank = Math.ceil((p * sorted.length) / 100);
    const idx = Math.min(sorted.length - 1, Math.max(0, rank - 1));
    return sorted[idx];
}
|
|
1820
|
+
/**
 * Shortens text to at most `maxLength` characters, replacing the tail with an
 * ellipsis when it had to be cut.
 *
 * @param {string} text - Text to shorten.
 * @param {number} maxLength - Maximum length, ellipsis included.
 * @returns {string} The original text when it fits, otherwise the shortened form.
 */
function truncate(text, maxLength) {
    const fits = text.length <= maxLength;
    return fits ? text : `${text.slice(0, maxLength - 1)}…`;
}
|
|
1826
|
+
/**
 * Converts a token count and a latency in milliseconds into tokens per second.
 *
 * @param {number} tokens - Number of tokens.
 * @param {number} latencyMs - Elapsed time in milliseconds.
 * @returns {number} Tokens per second, or 0 when either input is zero or negative.
 */
function calculateThroughput(tokens, latencyMs) {
    const unusable = tokens <= 0 || latencyMs <= 0;
    return unusable ? 0 : (tokens * 1000) / latencyMs;
}
|
|
1832
|
+
/**
 * Awaits a promise, but rejects if it has not settled within a time limit.
 *
 * The timer is always cleared once the race settles. The wrapped promise is
 * not cancelled; it is only no longer awaited after a timeout.
 *
 * @param {Promise} promise - Promise to wait for.
 * @param {number} timeoutMs - Time limit in milliseconds.
 * @param {string} message - Message of the Error raised on timeout.
 * @returns {Promise<*>} Resolves or rejects with the promise's own outcome
 *   when it settles first; otherwise rejects with `new Error(message)`.
 */
async function withTimeout(promise, timeoutMs, message) {
    let timer = null;
    const deadline = new Promise((_unused, reject) => {
        timer = setTimeout(() => {
            reject(new Error(message));
        }, timeoutMs);
    });
    try {
        return await Promise.race([promise, deadline]);
    }
    finally {
        if (timer) {
            clearTimeout(timer);
        }
    }
}
|