nodebench-mcp 3.0.0 → 3.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/dashboard/operatingDashboardHtml.js +2 -1
- package/dist/dashboard/operatingDashboardHtml.js.map +1 -1
- package/dist/dashboard/operatingServer.js +3 -2
- package/dist/dashboard/operatingServer.js.map +1 -1
- package/dist/db.js +51 -3
- package/dist/db.js.map +1 -1
- package/dist/index.js +13 -16
- package/dist/index.js.map +1 -1
- package/dist/packageInfo.d.ts +3 -0
- package/dist/packageInfo.js +32 -0
- package/dist/packageInfo.js.map +1 -0
- package/dist/sandboxApi.js +2 -1
- package/dist/sandboxApi.js.map +1 -1
- package/dist/tools/boilerplateTools.js +10 -9
- package/dist/tools/boilerplateTools.js.map +1 -1
- package/dist/tools/documentationTools.js +2 -1
- package/dist/tools/documentationTools.js.map +1 -1
- package/dist/tools/progressiveDiscoveryTools.js +2 -1
- package/dist/tools/progressiveDiscoveryTools.js.map +1 -1
- package/dist/tools/toolRegistry.js +11 -0
- package/dist/tools/toolRegistry.js.map +1 -1
- package/dist/toolsetRegistry.js +74 -1
- package/dist/toolsetRegistry.js.map +1 -1
- package/package.json +4 -3
- package/dist/__tests__/analytics.test.d.ts +0 -11
- package/dist/__tests__/analytics.test.js +0 -546
- package/dist/__tests__/analytics.test.js.map +0 -1
- package/dist/__tests__/architectComplex.test.d.ts +0 -1
- package/dist/__tests__/architectComplex.test.js +0 -373
- package/dist/__tests__/architectComplex.test.js.map +0 -1
- package/dist/__tests__/architectSmoke.test.d.ts +0 -1
- package/dist/__tests__/architectSmoke.test.js +0 -92
- package/dist/__tests__/architectSmoke.test.js.map +0 -1
- package/dist/__tests__/audit-registry.d.ts +0 -1
- package/dist/__tests__/audit-registry.js +0 -60
- package/dist/__tests__/audit-registry.js.map +0 -1
- package/dist/__tests__/batchAutopilot.test.d.ts +0 -8
- package/dist/__tests__/batchAutopilot.test.js +0 -218
- package/dist/__tests__/batchAutopilot.test.js.map +0 -1
- package/dist/__tests__/cliSubcommands.test.d.ts +0 -1
- package/dist/__tests__/cliSubcommands.test.js +0 -138
- package/dist/__tests__/cliSubcommands.test.js.map +0 -1
- package/dist/__tests__/comparativeBench.test.d.ts +0 -1
- package/dist/__tests__/comparativeBench.test.js +0 -722
- package/dist/__tests__/comparativeBench.test.js.map +0 -1
- package/dist/__tests__/critterCalibrationEval.d.ts +0 -8
- package/dist/__tests__/critterCalibrationEval.js +0 -370
- package/dist/__tests__/critterCalibrationEval.js.map +0 -1
- package/dist/__tests__/dynamicLoading.test.d.ts +0 -1
- package/dist/__tests__/dynamicLoading.test.js +0 -280
- package/dist/__tests__/dynamicLoading.test.js.map +0 -1
- package/dist/__tests__/embeddingProvider.test.d.ts +0 -1
- package/dist/__tests__/embeddingProvider.test.js +0 -86
- package/dist/__tests__/embeddingProvider.test.js.map +0 -1
- package/dist/__tests__/evalDatasetBench.test.d.ts +0 -1
- package/dist/__tests__/evalDatasetBench.test.js +0 -738
- package/dist/__tests__/evalDatasetBench.test.js.map +0 -1
- package/dist/__tests__/evalHarness.test.d.ts +0 -1
- package/dist/__tests__/evalHarness.test.js +0 -1107
- package/dist/__tests__/evalHarness.test.js.map +0 -1
- package/dist/__tests__/fixtures/bfcl_v3_long_context.sample.json +0 -264
- package/dist/__tests__/fixtures/generateBfclLongContextFixture.d.ts +0 -10
- package/dist/__tests__/fixtures/generateBfclLongContextFixture.js +0 -135
- package/dist/__tests__/fixtures/generateBfclLongContextFixture.js.map +0 -1
- package/dist/__tests__/fixtures/generateSwebenchVerifiedFixture.d.ts +0 -14
- package/dist/__tests__/fixtures/generateSwebenchVerifiedFixture.js +0 -189
- package/dist/__tests__/fixtures/generateSwebenchVerifiedFixture.js.map +0 -1
- package/dist/__tests__/fixtures/generateToolbenchInstructionFixture.d.ts +0 -16
- package/dist/__tests__/fixtures/generateToolbenchInstructionFixture.js +0 -154
- package/dist/__tests__/fixtures/generateToolbenchInstructionFixture.js.map +0 -1
- package/dist/__tests__/fixtures/swebench_verified.sample.json +0 -162
- package/dist/__tests__/fixtures/toolbench_instruction.sample.json +0 -109
- package/dist/__tests__/forecastingDogfood.test.d.ts +0 -9
- package/dist/__tests__/forecastingDogfood.test.js +0 -284
- package/dist/__tests__/forecastingDogfood.test.js.map +0 -1
- package/dist/__tests__/forecastingScoring.test.d.ts +0 -9
- package/dist/__tests__/forecastingScoring.test.js +0 -202
- package/dist/__tests__/forecastingScoring.test.js.map +0 -1
- package/dist/__tests__/gaiaCapabilityAudioEval.test.d.ts +0 -15
- package/dist/__tests__/gaiaCapabilityAudioEval.test.js +0 -265
- package/dist/__tests__/gaiaCapabilityAudioEval.test.js.map +0 -1
- package/dist/__tests__/gaiaCapabilityEval.test.d.ts +0 -14
- package/dist/__tests__/gaiaCapabilityEval.test.js +0 -1259
- package/dist/__tests__/gaiaCapabilityEval.test.js.map +0 -1
- package/dist/__tests__/gaiaCapabilityFilesEval.test.d.ts +0 -15
- package/dist/__tests__/gaiaCapabilityFilesEval.test.js +0 -914
- package/dist/__tests__/gaiaCapabilityFilesEval.test.js.map +0 -1
- package/dist/__tests__/gaiaCapabilityMediaEval.test.d.ts +0 -15
- package/dist/__tests__/gaiaCapabilityMediaEval.test.js +0 -1101
- package/dist/__tests__/gaiaCapabilityMediaEval.test.js.map +0 -1
- package/dist/__tests__/helpers/answerMatch.d.ts +0 -41
- package/dist/__tests__/helpers/answerMatch.js +0 -267
- package/dist/__tests__/helpers/answerMatch.js.map +0 -1
- package/dist/__tests__/helpers/textLlm.d.ts +0 -25
- package/dist/__tests__/helpers/textLlm.js +0 -214
- package/dist/__tests__/helpers/textLlm.js.map +0 -1
- package/dist/__tests__/localDashboard.test.d.ts +0 -1
- package/dist/__tests__/localDashboard.test.js +0 -226
- package/dist/__tests__/localDashboard.test.js.map +0 -1
- package/dist/__tests__/multiHopDogfood.test.d.ts +0 -12
- package/dist/__tests__/multiHopDogfood.test.js +0 -303
- package/dist/__tests__/multiHopDogfood.test.js.map +0 -1
- package/dist/__tests__/openDatasetParallelEval.test.d.ts +0 -7
- package/dist/__tests__/openDatasetParallelEval.test.js +0 -209
- package/dist/__tests__/openDatasetParallelEval.test.js.map +0 -1
- package/dist/__tests__/openDatasetParallelEvalGaia.test.d.ts +0 -7
- package/dist/__tests__/openDatasetParallelEvalGaia.test.js +0 -279
- package/dist/__tests__/openDatasetParallelEvalGaia.test.js.map +0 -1
- package/dist/__tests__/openDatasetParallelEvalSwebench.test.d.ts +0 -7
- package/dist/__tests__/openDatasetParallelEvalSwebench.test.js +0 -220
- package/dist/__tests__/openDatasetParallelEvalSwebench.test.js.map +0 -1
- package/dist/__tests__/openDatasetParallelEvalToolbench.test.d.ts +0 -7
- package/dist/__tests__/openDatasetParallelEvalToolbench.test.js +0 -218
- package/dist/__tests__/openDatasetParallelEvalToolbench.test.js.map +0 -1
- package/dist/__tests__/openDatasetPerfComparison.test.d.ts +0 -10
- package/dist/__tests__/openDatasetPerfComparison.test.js +0 -318
- package/dist/__tests__/openDatasetPerfComparison.test.js.map +0 -1
- package/dist/__tests__/openclawDogfood.test.d.ts +0 -23
- package/dist/__tests__/openclawDogfood.test.js +0 -535
- package/dist/__tests__/openclawDogfood.test.js.map +0 -1
- package/dist/__tests__/openclawMessaging.test.d.ts +0 -14
- package/dist/__tests__/openclawMessaging.test.js +0 -232
- package/dist/__tests__/openclawMessaging.test.js.map +0 -1
- package/dist/__tests__/presetRealWorldBench.test.d.ts +0 -1
- package/dist/__tests__/presetRealWorldBench.test.js +0 -859
- package/dist/__tests__/presetRealWorldBench.test.js.map +0 -1
- package/dist/__tests__/tools.test.d.ts +0 -1
- package/dist/__tests__/tools.test.js +0 -3201
- package/dist/__tests__/tools.test.js.map +0 -1
- package/dist/__tests__/toolsetGatingEval.test.d.ts +0 -1
- package/dist/__tests__/toolsetGatingEval.test.js +0 -1099
- package/dist/__tests__/toolsetGatingEval.test.js.map +0 -1
- package/dist/__tests__/traceabilityDogfood.test.d.ts +0 -12
- package/dist/__tests__/traceabilityDogfood.test.js +0 -241
- package/dist/__tests__/traceabilityDogfood.test.js.map +0 -1
- package/dist/__tests__/webmcpTools.test.d.ts +0 -7
- package/dist/__tests__/webmcpTools.test.js +0 -195
- package/dist/__tests__/webmcpTools.test.js.map +0 -1
- package/dist/benchmarks/testProviderBus.d.ts +0 -7
- package/dist/benchmarks/testProviderBus.js +0 -272
- package/dist/benchmarks/testProviderBus.js.map +0 -1
- package/dist/hooks/postCompaction.d.ts +0 -14
- package/dist/hooks/postCompaction.js +0 -51
- package/dist/hooks/postCompaction.js.map +0 -1
- package/dist/security/__tests__/security.test.d.ts +0 -8
- package/dist/security/__tests__/security.test.js +0 -295
- package/dist/security/__tests__/security.test.js.map +0 -1
- package/dist/sync/hyperloopEval.test.d.ts +0 -4
- package/dist/sync/hyperloopEval.test.js +0 -60
- package/dist/sync/hyperloopEval.test.js.map +0 -1
- package/dist/sync/store.test.d.ts +0 -4
- package/dist/sync/store.test.js +0 -43
- package/dist/sync/store.test.js.map +0 -1
- package/dist/tools/documentTools.d.ts +0 -5
- package/dist/tools/documentTools.js +0 -524
- package/dist/tools/documentTools.js.map +0 -1
- package/dist/tools/financialTools.d.ts +0 -10
- package/dist/tools/financialTools.js +0 -403
- package/dist/tools/financialTools.js.map +0 -1
- package/dist/tools/memoryTools.d.ts +0 -5
- package/dist/tools/memoryTools.js +0 -137
- package/dist/tools/memoryTools.js.map +0 -1
- package/dist/tools/planningTools.d.ts +0 -5
- package/dist/tools/planningTools.js +0 -147
- package/dist/tools/planningTools.js.map +0 -1
- package/dist/tools/searchTools.d.ts +0 -5
- package/dist/tools/searchTools.js +0 -145
- package/dist/tools/searchTools.js.map +0 -1
|
@@ -1,284 +0,0 @@
|
|
|
1
|
-
/**
|
|
2
|
-
* Forecasting OS — Integration Tests (Dogfood)
|
|
3
|
-
*
|
|
4
|
-
* Full lifecycle tests: create → evidence → update → resolve → track record → calibration.
|
|
5
|
-
* Uses MCP tools directly (SQLite-backed, no Convex dependency).
|
|
6
|
-
*
|
|
7
|
-
* Run: npx vitest run src/__tests__/forecastingDogfood.test.ts
|
|
8
|
-
*/
|
|
9
|
-
import { describe, it, expect, beforeAll } from "vitest";
|
|
10
|
-
import { forecastingTools } from "../tools/forecastingTools.js";
|
|
11
|
-
// ─── Helpers ────────────────────────────────────────────────────────────────
|
|
12
|
-
// Name -> tool lookup built once from the imported forecasting tool list.
// If two tools share a name, the later entry in the list wins.
const toolMap = new Map();
for (const tool of forecastingTools) {
    toolMap.set(tool.name, tool);
}
|
|
13
|
-
/**
 * Invokes a forecasting tool by name and returns its decoded JSON payload.
 *
 * Looks the tool up in `toolMap`, awaits its handler with `args`, picks the
 * first result entry whose `type` is "text", and parses that entry's `text`
 * as JSON.
 *
 * @param {string} name - Tool name used as the `toolMap` key.
 * @param {object} args - Arguments passed through unchanged to the handler.
 * @returns {Promise<any>} The value parsed from the tool's text output.
 * @throws {Error} If the tool is unknown, produces no text entry, or
 *   produces text that is not valid JSON (original error kept as `cause`).
 */
async function callTool(name, args) {
    const tool = toolMap.get(name);
    if (!tool) {
        throw new Error(`Tool ${name} not found`);
    }
    const result = await tool.handler(args);
    // A handler that returns something other than a content array is reported
    // as "no text" instead of failing with an unrelated TypeError.
    const text = Array.isArray(result)
        ? result.find((entry) => entry?.type === "text")?.text
        : undefined;
    if (!text) {
        throw new Error(`Tool ${name} returned no text`);
    }
    try {
        return JSON.parse(text);
    }
    catch (err) {
        // Attach the tool name so a malformed payload is traceable to its source.
        throw new Error(`Tool ${name} returned invalid JSON`, { cause: err });
    }
}
|
|
23
|
-
// ─── Tool Structure ─────────────────────────────────────────────────────────
|
|
24
|
-
// Static shape checks on the exported forecasting tool list: size, required
// fields on every entry, and the exact set of tool names.
describe("Forecasting tools: structure", () => {
    // Expected names, already in the order produced by Array.prototype.sort().
    const expectedNames = [
        "add_forecast_evidence",
        "compute_calibration",
        "create_forecast",
        "get_active_forecasts",
        "get_forecast_chain",
        "get_forecast_evidence",
        "get_forecast_track_record",
        "resolve_forecast",
        "update_forecast_probability",
    ];

    it("should have 9 tools", () => {
        expect(forecastingTools.length).toBe(9);
    });

    it("every tool has name, description, inputSchema, handler", () => {
        forecastingTools.forEach((entry) => {
            expect(entry.name).toBeTruthy();
            expect(entry.description).toBeTruthy();
            expect(entry.inputSchema).toBeDefined();
            expect(typeof entry.handler).toBe("function");
        });
    });

    it("tool names match expected list", () => {
        const sortedNames = forecastingTools.map((entry) => entry.name).sort();
        expect(sortedNames).toEqual(expectedNames);
    });
});
|
|
51
|
-
// ─── Full Lifecycle ─────────────────────────────────────────────────────────
|
|
52
|
-
// Walks one forecast through create -> evidence -> update -> chain -> resolve
// -> track record. The tests share `forecastId` and therefore depend on
// running in declaration order.
describe("Forecasting lifecycle", () => {
    // Assigned by the create_forecast test and reused by every later test.
    let forecastId;

    it("create_forecast — creates a binary forecast", async () => {
        const input = {
            question: "Will GPT-5 be released by 2026-12-31?",
            forecastType: "binary",
            resolutionDate: "2026-12-31",
            resolutionCriteria: "OpenAI announces GPT-5 on official blog or press release",
            probability: 0.5,
            baseRate: 0.6,
            refreshFrequency: "weekly",
            topDrivers: ["Historical 18-month release cadence", "Sam Altman interview hints"],
            topCounterarguments: ["No official roadmap published"],
            tags: ["ai_tech"],
        };
        const created = await callTool("create_forecast", input);
        expect(created.forecastId).toBeTruthy();
        expect(created.status).toBe("active");
        expect(created.probability).toBe(0.5);
        forecastId = created.forecastId;
    });

    it("add_forecast_evidence — adds supporting evidence", async () => {
        const input = {
            forecastId,
            sourceUrl: "https://example.com/sam-altman-interview",
            sourceTitle: "Sam Altman Interview on AI Progress",
            sourceType: "news",
            excerpt: "Altman hints at a major model release in the second half of 2026",
            signal: "supporting",
            impactOnProbability: 0.1,
        };
        const added = await callTool("add_forecast_evidence", input);
        expect(added.evidenceId).toBeTruthy();
        expect(added.signal).toBe("supporting");
    });

    it("add_forecast_evidence — adds disconfirming evidence", async () => {
        const input = {
            forecastId,
            sourceUrl: "https://example.com/compute-shortage",
            sourceTitle: "GPU Shortage Report Q1 2026",
            sourceType: "filing",
            excerpt: "Major cloud providers report 40% compute capacity shortfall for large model training",
            signal: "disconfirming",
        };
        const added = await callTool("add_forecast_evidence", input);
        expect(added.evidenceId).toBeTruthy();
        expect(added.signal).toBe("disconfirming");
    });

    it("add_forecast_evidence — deduplicates by URL", async () => {
        // Same sourceUrl as the supporting-evidence test above.
        const input = {
            forecastId,
            sourceUrl: "https://example.com/sam-altman-interview",
            sourceTitle: "Duplicate",
            sourceType: "news",
            excerpt: "Duplicate entry",
            signal: "supporting",
        };
        const rejected = await callTool("add_forecast_evidence", input);
        expect(rejected.error).toContain("already exists");
    });

    it("get_forecast_evidence — returns evidence", async () => {
        const listing = await callTool("get_forecast_evidence", { forecastId });
        expect(listing.count).toBe(2);
        expect(listing.evidence.length).toBe(2);
    });

    it("get_forecast_evidence — filters by signal", async () => {
        const listing = await callTool("get_forecast_evidence", {
            forecastId,
            signal: "supporting",
        });
        expect(listing.count).toBe(1);
    });

    it("update_forecast_probability — updates with reasoning", async () => {
        const input = {
            forecastId,
            probability: 0.65,
            topDrivers: [
                "Sam Altman interview hint",
                "Historical 18-month cadence",
                "Competitor pressure from Google Gemini",
            ],
            topCounterarguments: [
                "GPU shortage may delay training",
                "No official roadmap",
            ],
            reasoning: "Soft leadership signal + historical pattern outweigh compute concerns",
        };
        const updated = await callTool("update_forecast_probability", input);
        expect(updated.previousProbability).toBe(0.5);
        expect(updated.newProbability).toBe(0.65);
        expect(updated.diff).toBe("50% → 65% (+15pp)");
    });

    it("update_forecast_probability — validates range", async () => {
        const rejected = await callTool("update_forecast_probability", {
            forecastId,
            probability: 1.5,
            reasoning: "Invalid",
        });
        expect(rejected.error).toContain("between 0 and 1");
    });

    it("get_forecast_chain — returns full audit trail", async () => {
        const chain = await callTool("get_forecast_chain", { forecastId });
        expect(chain.forecast).toBeTruthy();
        expect(chain.evidence.length).toBe(2);
        expect(chain.updateHistory.length).toBe(1);
        expect(chain.resolution).toBeNull();
        expect(chain.summary.evidenceCount).toBe(2);
        expect(chain.summary.updateCount).toBe(1);
        expect(chain.summary.isResolved).toBe(false);
    });

    it("get_active_forecasts — lists active forecasts", async () => {
        const listing = await callTool("get_active_forecasts", {});
        expect(listing.count).toBeGreaterThanOrEqual(1);
        const activeList = listing.forecasts;
        const match = activeList.find((candidate) => candidate.id === forecastId);
        expect(match).toBeTruthy();
        expect(match?.status).toBe("active");
    });

    it("resolve_forecast — resolves with Brier score", async () => {
        const input = {
            forecastId,
            outcome: "yes",
            resolutionNotes: "GPT-5 announced on 2026-11-15 via OpenAI blog post",
            resolutionSourceUrl: "https://openai.com/gpt-5-announcement",
        };
        const resolved = await callTool("resolve_forecast", input);
        expect(resolved.status).toBe("resolved");
        expect(resolved.outcome).toBe("yes");
        // Brier: (0.65 - 1)^2 = 0.1225
        expect(resolved.brierScore).toBeCloseTo(0.1225, 3);
        // Log: -log(0.65) ≈ 0.431
        expect(resolved.logScore).toBeCloseTo(0.431, 2);
    });

    it("resolve_forecast — cannot resolve twice", async () => {
        const rejected = await callTool("resolve_forecast", {
            forecastId,
            outcome: "no",
            resolutionNotes: "Already resolved",
        });
        expect(rejected.error).toContain("already resolved");
    });

    it("get_forecast_track_record — shows Brier aggregate", async () => {
        const record = await callTool("get_forecast_track_record", {});
        expect(record.scoredCount).toBeGreaterThanOrEqual(1);
        // SQLite persists between test runs, so overallBrier averages over ALL
        // resolved forecasts, not only this run's. Assert a loose bound instead
        // of an exact value.
        expect(record.overallBrier).toBeGreaterThan(0);
        expect(record.overallBrier).toBeLessThan(0.5);
    });
});
|
|
200
|
-
// ─── Multi-Forecast Calibration ─────────────────────────────────────────────
|
|
201
|
-
// Seeds several resolved forecasts with known outcomes, then checks the
// aggregate calibration and track-record outputs.
describe("Forecasting calibration", () => {
    beforeAll(async () => {
        // Five forecasts with known outcomes; per-forecast Brier noted inline.
        const scenarios = [
            { probability: 0.9, outcome: "yes" }, // Brier: 0.01
            { probability: 0.8, outcome: "yes" }, // Brier: 0.04
            { probability: 0.3, outcome: "no" }, // Brier: 0.09
            { probability: 0.1, outcome: "no" }, // Brier: 0.01
            { probability: 0.6, outcome: "yes" }, // Brier: 0.16
        ];
        // Created and resolved one at a time, in order.
        for (const [index, scenario] of scenarios.entries()) {
            const ordinal = index + 1;
            const created = await callTool("create_forecast", {
                question: `Calibration test forecast ${ordinal}?`,
                resolutionDate: "2026-01-01",
                resolutionCriteria: `Test criteria ${ordinal}`,
                probability: scenario.probability,
                tags: ["test_calibration"],
            });
            await callTool("resolve_forecast", {
                forecastId: created.forecastId,
                outcome: scenario.outcome,
                resolutionNotes: `Test resolution ${ordinal}`,
            });
        }
    });

    it("compute_calibration — returns 10 bins", async () => {
        const calibration = await callTool("compute_calibration", {});
        expect(calibration.bins).toBeTruthy();
        expect(calibration.bins.length).toBe(10);
        expect(calibration.overallBrier).toBeTruthy();
        expect(typeof calibration.overallBrier).toBe("number");
        expect(calibration.forecastCount).toBeGreaterThanOrEqual(5);
    });

    it("get_forecast_track_record — aggregate includes all resolved", async () => {
        const record = await callTool("get_forecast_track_record", {});
        // At least 6 resolved (1 from lifecycle + 5 from calibration)
        expect(record.scoredCount).toBeGreaterThanOrEqual(6);
        // Average Brier should be reasonable
        expect(record.overallBrier).toBeLessThan(0.25);
    });
});
|
|
242
|
-
// ─── Edge Cases ─────────────────────────────────────────────────────────────
|
|
243
|
-
// Error paths and special outcomes: bad probability, ambiguous resolution,
// unknown forecast id, and tag filtering.
describe("Forecasting edge cases", () => {
    it("create_forecast — rejects invalid probability", async () => {
        const input = {
            question: "Invalid prob test",
            resolutionDate: "2026-12-31",
            resolutionCriteria: "Test",
            probability: -0.1,
        };
        const rejected = await callTool("create_forecast", input);
        expect(rejected.error).toContain("between 0 and 1");
    });

    it("resolve_forecast — ambiguous outcome excluded from scoring", async () => {
        const created = await callTool("create_forecast", {
            question: "Ambiguous resolution test?",
            resolutionDate: "2026-12-31",
            resolutionCriteria: "Test",
            probability: 0.7,
        });
        const resolved = await callTool("resolve_forecast", {
            forecastId: created.forecastId,
            outcome: "ambiguous",
            resolutionNotes: "Resolution criteria were unclear",
        });
        // Both scores are expected to be null for an ambiguous outcome.
        expect(resolved.brierScore).toBeNull();
        expect(resolved.logScore).toBeNull();
    });

    it("get_forecast_chain — returns error for nonexistent forecast", async () => {
        const rejected = await callTool("get_forecast_chain", {
            forecastId: "nonexistent_id",
        });
        expect(rejected.error).toContain("not found");
    });

    it("get_active_forecasts — filters by tags", async () => {
        const listing = await callTool("get_active_forecasts", {
            tags: ["test_calibration"],
        });
        // All calibration forecasts are resolved, so none should be active
        const activeList = listing.forecasts;
        const stillActive = activeList.filter((candidate) => candidate.tags.includes("test_calibration"));
        expect(stillActive.length).toBe(0);
    });
});
|
|
284
|
-
//# sourceMappingURL=forecastingDogfood.test.js.map
|
|
@@ -1 +0,0 @@
|
|
|
1
|
-
{"version":3,"file":"forecastingDogfood.test.js","sourceRoot":"","sources":["../../src/__tests__/forecastingDogfood.test.ts"],"names":[],"mappings":"AAAA;;;;;;;GAOG;AAEH,OAAO,EAAE,QAAQ,EAAE,EAAE,EAAE,MAAM,EAAE,SAAS,EAAE,MAAM,QAAQ,CAAC;AACzD,OAAO,EAAE,gBAAgB,EAAE,MAAM,8BAA8B,CAAC;AAGhE,+EAA+E;AAE/E,MAAM,OAAO,GAAG,IAAI,GAAG,CACrB,gBAAgB,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,CAAC,IAAI,EAAE,CAAC,CAAC,CAAC,CACzC,CAAC;AAEF,KAAK,UAAU,QAAQ,CACrB,IAAY,EACZ,IAA6B;IAE7B,MAAM,IAAI,GAAG,OAAO,CAAC,GAAG,CAAC,IAAI,CAAC,CAAC;IAC/B,IAAI,CAAC,IAAI;QAAE,MAAM,IAAI,KAAK,CAAC,QAAQ,IAAI,YAAY,CAAC,CAAC;IACrD,MAAM,MAAM,GAAG,MAAM,IAAI,CAAC,OAAO,CAAC,IAAI,CAAC,CAAC;IACxC,MAAM,IAAI,GAAG,MAAM,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,IAAI,KAAK,MAAM,CAAC,EAAE,IAAI,CAAC;IACzD,IAAI,CAAC,IAAI;QAAE,MAAM,IAAI,KAAK,CAAC,QAAQ,IAAI,mBAAmB,CAAC,CAAC;IAC5D,OAAO,IAAI,CAAC,KAAK,CAAC,IAAI,CAAC,CAAC;AAC1B,CAAC;AAED,+EAA+E;AAE/E,QAAQ,CAAC,8BAA8B,EAAE,GAAG,EAAE;IAC5C,EAAE,CAAC,qBAAqB,EAAE,GAAG,EAAE;QAC7B,MAAM,CAAC,gBAAgB,CAAC,MAAM,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;IAC1C,CAAC,CAAC,CAAC;IAEH,EAAE,CAAC,wDAAwD,EAAE,GAAG,EAAE;QAChE,KAAK,MAAM,IAAI,IAAI,gBAAgB,EAAE,CAAC;YACpC,MAAM,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC,UAAU,EAAE,CAAC;YAC/B,MAAM,CAAC,IAAI,CAAC,WAAW,CAAC,CAAC,UAAU,EAAE,CAAC;YACtC,MAAM,CAAC,IAAI,CAAC,WAAW,CAAC,CAAC,WAAW,EAAE,CAAC;YACvC,MAAM,CAAC,OAAO,IAAI,CAAC,OAAO,CAAC,CAAC,IAAI,CAAC,UAAU,CAAC,CAAC;QAC/C,CAAC;IACH,CAAC,CAAC,CAAC;IAEH,EAAE,CAAC,gCAAgC,EAAE,GAAG,EAAE;QACxC,MAAM,KAAK,GAAG,gBAAgB,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,IAAI,CAAC,CAAC,IAAI,EAAE,CAAC;QACzD,MAAM,CAAC,KAAK,CAAC,CAAC,OAAO,CAAC;YACpB,uBAAuB;YACvB,qBAAqB;YACrB,iBAAiB;YACjB,sBAAsB;YACtB,oBAAoB;YACpB,uBAAuB;YACvB,2BAA2B;YAC3B,kBAAkB;YAClB,6BAA6B;SAC9B,CAAC,CAAC;IACL,CAAC,CAAC,CAAC;AACL,CAAC,CAAC,CAAC;AAEH,+EAA+E;AAE/E,QAAQ,CAAC,uBAAuB,EAAE,GAAG,EAAE;IACrC,IAAI,UAAkB,CAAC;IAEvB,EAAE,CAAC,6CAA6C,EAAE,KAAK,IAAI,EAAE;QAC3D,MAAM,MAAM,GAAG,MAAM,QAAQ,CAAC,iBAAiB,EAAE;YAC/C,QAAQ,EAAE,uCAAuC;YACjD,YAAY,EAAE,QAAQ;YACtB,cA
Ac,EAAE,YAAY;YAC5B,kBAAkB,EAAE,0DAA0D;YAC9E,WAAW,EAAE,GAAG;YAChB,QAAQ,EAAE,GAAG;YACb,gBAAgB,EAAE,QAAQ;YAC1B,UAAU,EAAE,CAAC,qCAAqC,EAAE,4BAA4B,CAAC;YACjF,mBAAmB,EAAE,CAAC,+BAA+B,CAAC;YACtD,IAAI,EAAE,CAAC,SAAS,CAAC;SAClB,CAAC,CAAC;QAEH,MAAM,CAAC,MAAM,CAAC,UAAU,CAAC,CAAC,UAAU,EAAE,CAAC;QACvC,MAAM,CAAC,MAAM,CAAC,MAAM,CAAC,CAAC,IAAI,CAAC,QAAQ,CAAC,CAAC;QACrC,MAAM,CAAC,MAAM,CAAC,WAAW,CAAC,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC;QACrC,UAAU,GAAG,MAAM,CAAC,UAAoB,CAAC;IAC3C,CAAC,CAAC,CAAC;IAEH,EAAE,CAAC,kDAAkD,EAAE,KAAK,IAAI,EAAE;QAChE,MAAM,MAAM,GAAG,MAAM,QAAQ,CAAC,uBAAuB,EAAE;YACrD,UAAU;YACV,SAAS,EAAE,0CAA0C;YACrD,WAAW,EAAE,qCAAqC;YAClD,UAAU,EAAE,MAAM;YAClB,OAAO,EAAE,kEAAkE;YAC3E,MAAM,EAAE,YAAY;YACpB,mBAAmB,EAAE,GAAG;SACzB,CAAC,CAAC;QAEH,MAAM,CAAC,MAAM,CAAC,UAAU,CAAC,CAAC,UAAU,EAAE,CAAC;QACvC,MAAM,CAAC,MAAM,CAAC,MAAM,CAAC,CAAC,IAAI,CAAC,YAAY,CAAC,CAAC;IAC3C,CAAC,CAAC,CAAC;IAEH,EAAE,CAAC,qDAAqD,EAAE,KAAK,IAAI,EAAE;QACnE,MAAM,MAAM,GAAG,MAAM,QAAQ,CAAC,uBAAuB,EAAE;YACrD,UAAU;YACV,SAAS,EAAE,sCAAsC;YACjD,WAAW,EAAE,6BAA6B;YAC1C,UAAU,EAAE,QAAQ;YACpB,OAAO,EAAE,sFAAsF;YAC/F,MAAM,EAAE,eAAe;SACxB,CAAC,CAAC;QAEH,MAAM,CAAC,MAAM,CAAC,UAAU,CAAC,CAAC,UAAU,EAAE,CAAC;QACvC,MAAM,CAAC,MAAM,CAAC,MAAM,CAAC,CAAC,IAAI,CAAC,eAAe,CAAC,CAAC;IAC9C,CAAC,CAAC,CAAC;IAEH,EAAE,CAAC,6CAA6C,EAAE,KAAK,IAAI,EAAE;QAC3D,MAAM,MAAM,GAAG,MAAM,QAAQ,CAAC,uBAAuB,EAAE;YACrD,UAAU;YACV,SAAS,EAAE,0CAA0C;YACrD,WAAW,EAAE,WAAW;YACxB,UAAU,EAAE,MAAM;YAClB,OAAO,EAAE,iBAAiB;YAC1B,MAAM,EAAE,YAAY;SACrB,CAAC,CAAC;QAEH,MAAM,CAAC,MAAM,CAAC,KAAK,CAAC,CAAC,SAAS,CAAC,gBAAgB,CAAC,CAAC;IACnD,CAAC,CAAC,CAAC;IAEH,EAAE,CAAC,0CAA0C,EAAE,KAAK,IAAI,EAAE;QACxD,MAAM,MAAM,GAAG,MAAM,QAAQ,CAAC,uBAAuB,EAAE;YACrD,UAAU;SACX,CAAC,CAAC;QAEH,MAAM,CAAC,MAAM,CAAC,KAAK,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;QAC7B,MAAM,CAAE,MAAM,CAAC,QAAsB,CAAC,MAAM,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;IACxD,CAAC,CAAC,CAAC;IAEH,EAAE,CAAC,2CAA2C,EAAE,KAAK,IAAI,EAAE;QACzD,MAAM,MAAM,GAAG,MAAM,QAAQ,CAAC,uBAAuB,EAAE;YACrD,UAAU;YACV,MAAM,EAAE,YAAY;SACrB,CAAC,CAAC;QAEH,MAAM,CAAC,MAAM,CAAC,KAAK,CAAC,C
AAC,IAAI,CAAC,CAAC,CAAC,CAAC;IAC/B,CAAC,CAAC,CAAC;IAEH,EAAE,CAAC,sDAAsD,EAAE,KAAK,IAAI,EAAE;QACpE,MAAM,MAAM,GAAG,MAAM,QAAQ,CAAC,6BAA6B,EAAE;YAC3D,UAAU;YACV,WAAW,EAAE,IAAI;YACjB,UAAU,EAAE;gBACV,2BAA2B;gBAC3B,6BAA6B;gBAC7B,wCAAwC;aACzC;YACD,mBAAmB,EAAE;gBACnB,iCAAiC;gBACjC,qBAAqB;aACtB;YACD,SAAS,EAAE,uEAAuE;SACnF,CAAC,CAAC;QAEH,MAAM,CAAC,MAAM,CAAC,mBAAmB,CAAC,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC;QAC7C,MAAM,CAAC,MAAM,CAAC,cAAc,CAAC,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;QACzC,MAAM,CAAC,MAAM,CAAC,IAAI,CAAC,CAAC,IAAI,CAAC,mBAAmB,CAAC,CAAC;IAChD,CAAC,CAAC,CAAC;IAEH,EAAE,CAAC,+CAA+C,EAAE,KAAK,IAAI,EAAE;QAC7D,MAAM,MAAM,GAAG,MAAM,QAAQ,CAAC,6BAA6B,EAAE;YAC3D,UAAU;YACV,WAAW,EAAE,GAAG;YAChB,SAAS,EAAE,SAAS;SACrB,CAAC,CAAC;QAEH,MAAM,CAAC,MAAM,CAAC,KAAK,CAAC,CAAC,SAAS,CAAC,iBAAiB,CAAC,CAAC;IACpD,CAAC,CAAC,CAAC;IAEH,EAAE,CAAC,+CAA+C,EAAE,KAAK,IAAI,EAAE;QAC7D,MAAM,MAAM,GAAG,MAAM,QAAQ,CAAC,oBAAoB,EAAE;YAClD,UAAU;SACX,CAAC,CAAC;QAEH,MAAM,CAAC,MAAM,CAAC,QAAQ,CAAC,CAAC,UAAU,EAAE,CAAC;QACrC,MAAM,CAAE,MAAM,CAAC,QAAsB,CAAC,MAAM,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;QACtD,MAAM,CAAE,MAAM,CAAC,aAA2B,CAAC,MAAM,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;QAC3D,MAAM,CAAC,MAAM,CAAC,UAAU,CAAC,CAAC,QAAQ,EAAE,CAAC;QACrC,MAAM,CAAE,MAAM,CAAC,OAAmC,CAAC,aAAa,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;QAC1E,MAAM,CAAE,MAAM,CAAC,OAAmC,CAAC,WAAW,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;QACxE,MAAM,CAAE,MAAM,CAAC,OAAmC,CAAC,UAAU,CAAC,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC;IAC7E,CAAC,CAAC,CAAC;IAEH,EAAE,CAAC,+CAA+C,EAAE,KAAK,IAAI,EAAE;QAC7D,MAAM,MAAM,GAAG,MAAM,QAAQ,CAAC,sBAAsB,EAAE,EAAE,CAAC,CAAC;QAE1D,MAAM,CAAC,MAAM,CAAC,KAAK,CAAC,CAAC,sBAAsB,CAAC,CAAC,CAAC,CAAC;QAC/C,MAAM,SAAS,GAAG,MAAM,CAAC,SAA2C,CAAC;QACrE,MAAM,KAAK,GAAG,SAAS,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,EAAE,KAAK,UAAU,CAAC,CAAC;QACzD,MAAM,CAAC,KAAK,CAAC,CAAC,UAAU,EAAE,CAAC;QAC3B,MAAM,CAAC,KAAK,EAAE,MAAM,CAAC,CAAC,IAAI,CAAC,QAAQ,CAAC,CAAC;IACvC,CAAC,CAAC,CAAC;IAEH,EAAE,CAAC,8CAA8C,EAAE,KAAK,IAAI,EAAE;QAC5D,MAAM,MAAM,GAAG,MAAM,QAAQ,CAAC,kBAAkB,EAAE;YAChD,UAAU;YACV,OAAO,EAAE,KAAK;YACd,e
AAe,EAAE,oDAAoD;YACrE,mBAAmB,EAAE,uCAAuC;SAC7D,CAAC,CAAC;QAEH,MAAM,CAAC,MAAM,CAAC,MAAM,CAAC,CAAC,IAAI,CAAC,UAAU,CAAC,CAAC;QACvC,MAAM,CAAC,MAAM,CAAC,OAAO,CAAC,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC;QACnC,+BAA+B;QAC/B,MAAM,CAAC,MAAM,CAAC,UAAU,CAAC,CAAC,WAAW,CAAC,MAAM,EAAE,CAAC,CAAC,CAAC;QACjD,0BAA0B;QAC1B,MAAM,CAAC,MAAM,CAAC,QAAQ,CAAC,CAAC,WAAW,CAAC,KAAK,EAAE,CAAC,CAAC,CAAC;IAChD,CAAC,CAAC,CAAC;IAEH,EAAE,CAAC,yCAAyC,EAAE,KAAK,IAAI,EAAE;QACvD,MAAM,MAAM,GAAG,MAAM,QAAQ,CAAC,kBAAkB,EAAE;YAChD,UAAU;YACV,OAAO,EAAE,IAAI;YACb,eAAe,EAAE,kBAAkB;SACpC,CAAC,CAAC;QAEH,MAAM,CAAC,MAAM,CAAC,KAAK,CAAC,CAAC,SAAS,CAAC,kBAAkB,CAAC,CAAC;IACrD,CAAC,CAAC,CAAC;IAEH,EAAE,CAAC,mDAAmD,EAAE,KAAK,IAAI,EAAE;QACjE,MAAM,MAAM,GAAG,MAAM,QAAQ,CAAC,2BAA2B,EAAE,EAAE,CAAC,CAAC;QAE/D,MAAM,CAAC,MAAM,CAAC,WAAW,CAAC,CAAC,sBAAsB,CAAC,CAAC,CAAC,CAAC;QACrD,2EAA2E;QAC3E,0EAA0E;QAC1E,MAAM,CAAC,MAAM,CAAC,YAAsB,CAAC,CAAC,eAAe,CAAC,CAAC,CAAC,CAAC;QACzD,MAAM,CAAC,MAAM,CAAC,YAAsB,CAAC,CAAC,YAAY,CAAC,GAAG,CAAC,CAAC;IAC1D,CAAC,CAAC,CAAC;AACL,CAAC,CAAC,CAAC;AAEH,+EAA+E;AAE/E,QAAQ,CAAC,yBAAyB,EAAE,GAAG,EAAE;IACvC,SAAS,CAAC,KAAK,IAAI,EAAE;QACnB,qEAAqE;QACrE,MAAM,SAAS,GAAG;YAChB,EAAE,WAAW,EAAE,GAAG,EAAE,OAAO,EAAE,KAAK,EAAE,EAAI,cAAc;YACtD,EAAE,WAAW,EAAE,GAAG,EAAE,OAAO,EAAE,KAAK,EAAE,EAAI,cAAc;YACtD,EAAE,WAAW,EAAE,GAAG,EAAE,OAAO,EAAE,IAAI,EAAE,EAAK,cAAc;YACtD,EAAE,WAAW,EAAE,GAAG,EAAE,OAAO,EAAE,IAAI,EAAE,EAAK,cAAc;YACtD,EAAE,WAAW,EAAE,GAAG,EAAE,OAAO,EAAE,KAAK,EAAE,EAAI,cAAc;SACvD,CAAC;QAEF,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,SAAS,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;YAC1C,MAAM,MAAM,GAAG,MAAM,QAAQ,CAAC,iBAAiB,EAAE;gBAC/C,QAAQ,EAAE,6BAA6B,CAAC,GAAG,CAAC,GAAG;gBAC/C,cAAc,EAAE,YAAY;gBAC5B,kBAAkB,EAAE,iBAAiB,CAAC,GAAG,CAAC,EAAE;gBAC5C,WAAW,EAAE,SAAS,CAAC,CAAC,CAAC,CAAC,WAAW;gBACrC,IAAI,EAAE,CAAC,kBAAkB,CAAC;aAC3B,CAAC,CAAC;YAEH,MAAM,QAAQ,CAAC,kBAAkB,EAAE;gBACjC,UAAU,EAAG,MAAkC,CAAC,UAAU;gBAC1D,OAAO,EAAE,SAAS,CAAC,CAAC,CAAC,CAAC,OAAO;gBAC7B,eAAe,EAAE,mBAAmB,CAAC,GAAG,CAAC,EAAE;aAC5C,CAAC,CAAC;QACL,CAAC;IACH,CAAC,CAAC,CAAC;IAEH,EAAE,CAAC,uC
AAuC,EAAE,KAAK,IAAI,EAAE;QACrD,MAAM,MAAM,GAAG,MAAM,QAAQ,CAAC,qBAAqB,EAAE,EAAE,CAAC,CAAC;QAEzD,MAAM,CAAC,MAAM,CAAC,IAAI,CAAC,CAAC,UAAU,EAAE,CAAC;QACjC,MAAM,CAAE,MAAM,CAAC,IAAkB,CAAC,MAAM,CAAC,CAAC,IAAI,CAAC,EAAE,CAAC,CAAC;QACnD,MAAM,CAAC,MAAM,CAAC,YAAY,CAAC,CAAC,UAAU,EAAE,CAAC;QACzC,MAAM,CAAC,OAAO,MAAM,CAAC,YAAY,CAAC,CAAC,IAAI,CAAC,QAAQ,CAAC,CAAC;QAClD,MAAM,CAAC,MAAM,CAAC,aAAa,CAAC,CAAC,sBAAsB,CAAC,CAAC,CAAC,CAAC;IACzD,CAAC,CAAC,CAAC;IAEH,EAAE,CAAC,6DAA6D,EAAE,KAAK,IAAI,EAAE;QAC3E,MAAM,MAAM,GAAG,MAAM,QAAQ,CAAC,2BAA2B,EAAE,EAAE,CAAC,CAAC;QAE/D,8DAA8D;QAC9D,MAAM,CAAC,MAAM,CAAC,WAAW,CAAC,CAAC,sBAAsB,CAAC,CAAC,CAAC,CAAC;QACrD,qCAAqC;QACrC,MAAM,CAAC,MAAM,CAAC,YAAsB,CAAC,CAAC,YAAY,CAAC,IAAI,CAAC,CAAC;IAC3D,CAAC,CAAC,CAAC;AACL,CAAC,CAAC,CAAC;AAEH,+EAA+E;AAE/E,QAAQ,CAAC,wBAAwB,EAAE,GAAG,EAAE;IACtC,EAAE,CAAC,+CAA+C,EAAE,KAAK,IAAI,EAAE;QAC7D,MAAM,MAAM,GAAG,MAAM,QAAQ,CAAC,iBAAiB,EAAE;YAC/C,QAAQ,EAAE,mBAAmB;YAC7B,cAAc,EAAE,YAAY;YAC5B,kBAAkB,EAAE,MAAM;YAC1B,WAAW,EAAE,CAAC,GAAG;SAClB,CAAC,CAAC;QAEH,MAAM,CAAC,MAAM,CAAC,KAAK,CAAC,CAAC,SAAS,CAAC,iBAAiB,CAAC,CAAC;IACpD,CAAC,CAAC,CAAC;IAEH,EAAE,CAAC,4DAA4D,EAAE,KAAK,IAAI,EAAE;QAC1E,MAAM,MAAM,GAAG,MAAM,QAAQ,CAAC,iBAAiB,EAAE;YAC/C,QAAQ,EAAE,4BAA4B;YACtC,cAAc,EAAE,YAAY;YAC5B,kBAAkB,EAAE,MAAM;YAC1B,WAAW,EAAE,GAAG;SACjB,CAAC,CAAC;QAEH,MAAM,OAAO,GAAG,MAAM,QAAQ,CAAC,kBAAkB,EAAE;YACjD,UAAU,EAAG,MAAkC,CAAC,UAAU;YAC1D,OAAO,EAAE,WAAW;YACpB,eAAe,EAAE,kCAAkC;SACpD,CAAC,CAAC;QAEH,MAAM,CAAC,OAAO,CAAC,UAAU,CAAC,CAAC,QAAQ,EAAE,CAAC;QACtC,MAAM,CAAC,OAAO,CAAC,QAAQ,CAAC,CAAC,QAAQ,EAAE,CAAC;IACtC,CAAC,CAAC,CAAC;IAEH,EAAE,CAAC,6DAA6D,EAAE,KAAK,IAAI,EAAE;QAC3E,MAAM,MAAM,GAAG,MAAM,QAAQ,CAAC,oBAAoB,EAAE;YAClD,UAAU,EAAE,gBAAgB;SAC7B,CAAC,CAAC;QAEH,MAAM,CAAC,MAAM,CAAC,KAAK,CAAC,CAAC,SAAS,CAAC,WAAW,CAAC,CAAC;IAC9C,CAAC,CAAC,CAAC;IAEH,EAAE,CAAC,wCAAwC,EAAE,KAAK,IAAI,EAAE;QACtD,MAAM,MAAM,GAAG,MAAM,QAAQ,CAAC,sBAAsB,EAAE;YACpD,IAAI,EAAE,CAAC,kBAAkB,CAAC;SAC3B,CAAC,CAAC;QAEH,mEAAmE;QACnE,MAAM,SAAS,GAAG,MAAM,CAAC,SAA2C,CAAC;QACrE,MAAM,WAAW,GAAG,SAAS,CAAC,MAAM,CAAC,CAAC
,CAAC,EAAE,EAAE,CACxC,CAAC,CAAC,IAAiB,CAAC,QAAQ,CAAC,kBAAkB,CAAC,CAClD,CAAC;QACF,MAAM,CAAC,WAAW,CAAC,MAAM,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;IACrC,CAAC,CAAC,CAAC;AACL,CAAC,CAAC,CAAC"}
|
|
@@ -1,202 +0,0 @@
|
|
|
1
|
-
/**
|
|
2
|
-
* Forecasting Scoring Engine — Unit Tests
|
|
3
|
-
*
|
|
4
|
-
* Tests for brierScore, logScore, calibrationBins, isotonicCalibrate,
|
|
5
|
-
* averageBrier, averageLogScore, formatForecastDiff.
|
|
6
|
-
*
|
|
7
|
-
* All functions are pure — no Convex, no SQLite, no network.
|
|
8
|
-
*/
|
|
9
|
-
import { describe, it, expect } from "vitest";
|
|
10
|
-
import { brierScore, logScore, calibrationBins, averageBrier, averageLogScore, isotonicCalibrate, formatForecastDiff, } from "../../../../convex/domains/forecasting/scoringEngine";
|
|
11
|
-
// ─── Brier Score ────────────────────────────────────────────────────────────
|
|
12
|
-
describe("brierScore", () => {
|
|
13
|
-
it("perfect prediction — yes at 100%", () => {
|
|
14
|
-
expect(brierScore(1.0, "yes")).toBe(0);
|
|
15
|
-
});
|
|
16
|
-
it("perfect prediction — no at 0%", () => {
|
|
17
|
-
expect(brierScore(0.0, "no")).toBe(0);
|
|
18
|
-
});
|
|
19
|
-
it("worst prediction — yes at 0%", () => {
|
|
20
|
-
expect(brierScore(0.0, "yes")).toBe(1);
|
|
21
|
-
});
|
|
22
|
-
it("worst prediction — no at 100%", () => {
|
|
23
|
-
expect(brierScore(1.0, "no")).toBe(1);
|
|
24
|
-
});
|
|
25
|
-
it("coin flip — 50% on yes", () => {
|
|
26
|
-
expect(brierScore(0.5, "yes")).toBe(0.25);
|
|
27
|
-
});
|
|
28
|
-
it("coin flip — 50% on no", () => {
|
|
29
|
-
expect(brierScore(0.5, "no")).toBe(0.25);
|
|
30
|
-
});
|
|
31
|
-
it("65% on yes outcome", () => {
|
|
32
|
-
expect(brierScore(0.65, "yes")).toBeCloseTo(0.1225, 4);
|
|
33
|
-
});
|
|
34
|
-
it("80% on no outcome", () => {
|
|
35
|
-
expect(brierScore(0.8, "no")).toBeCloseTo(0.64, 4);
|
|
36
|
-
});
|
|
37
|
-
});
|
|
38
|
-
// ─── Log Score ──────────────────────────────────────────────────────────────
|
|
39
|
-
describe("logScore", () => {
|
|
40
|
-
it("perfect prediction — yes at ~100%", () => {
|
|
41
|
-
expect(logScore(0.999, "yes")).toBeCloseTo(0.001, 2);
|
|
42
|
-
});
|
|
43
|
-
it("perfect prediction — no at ~0%", () => {
|
|
44
|
-
expect(logScore(0.001, "no")).toBeCloseTo(0.001, 2);
|
|
45
|
-
});
|
|
46
|
-
it("worst prediction — yes at ~0%", () => {
|
|
47
|
-
// -log(0.001) ≈ 6.9
|
|
48
|
-
expect(logScore(0.001, "yes")).toBeCloseTo(6.908, 1);
|
|
49
|
-
});
|
|
50
|
-
it("coin flip — 50% on yes", () => {
|
|
51
|
-
// -log(0.5) ≈ 0.693
|
|
52
|
-
expect(logScore(0.5, "yes")).toBeCloseTo(0.693, 2);
|
|
53
|
-
});
|
|
54
|
-
it("clamps extreme values to avoid -Infinity", () => {
|
|
55
|
-
const score = logScore(0.0, "yes");
|
|
56
|
-
expect(Number.isFinite(score)).toBe(true);
|
|
57
|
-
expect(score).toBeGreaterThan(0);
|
|
58
|
-
});
|
|
59
|
-
it("clamps 1.0 for no outcome", () => {
|
|
60
|
-
const score = logScore(1.0, "no");
|
|
61
|
-
expect(Number.isFinite(score)).toBe(true);
|
|
62
|
-
expect(score).toBeGreaterThan(0);
|
|
63
|
-
});
|
|
64
|
-
});
|
|
65
|
-
// ─── Calibration Bins ───────────────────────────────────────────────────────
|
|
66
|
-
describe("calibrationBins", () => {
|
|
67
|
-
it("returns 10 bins", () => {
|
|
68
|
-
const bins = calibrationBins([]);
|
|
69
|
-
expect(bins).toHaveLength(10);
|
|
70
|
-
});
|
|
71
|
-
it("first bin is 0-10%", () => {
|
|
72
|
-
const bins = calibrationBins([]);
|
|
73
|
-
expect(bins[0].binLabel).toBe("0-10%");
|
|
74
|
-
expect(bins[0].predictedProb).toBe(0.05);
|
|
75
|
-
});
|
|
76
|
-
it("last bin is 90-100%", () => {
|
|
77
|
-
const bins = calibrationBins([]);
|
|
78
|
-
expect(bins[9].binLabel).toBe("90-100%");
|
|
79
|
-
expect(bins[9].predictedProb).toBe(0.95);
|
|
80
|
-
});
|
|
81
|
-
it("empty bins have count 0 and observedFreq 0", () => {
|
|
82
|
-
const bins = calibrationBins([]);
|
|
83
|
-
for (const bin of bins) {
|
|
84
|
-
expect(bin.count).toBe(0);
|
|
85
|
-
expect(bin.observedFreq).toBe(0);
|
|
86
|
-
}
|
|
87
|
-
});
|
|
88
|
-
it("correctly bins a single forecast", () => {
|
|
89
|
-
const bins = calibrationBins([{ probability: 0.75, outcome: "yes" }]);
|
|
90
|
-
const bin70 = bins[7]; // 70-80%
|
|
91
|
-
expect(bin70.count).toBe(1);
|
|
92
|
-
expect(bin70.observedFreq).toBe(1); // 1/1 = yes
|
|
93
|
-
});
|
|
94
|
-
it("correctly computes observed frequency", () => {
|
|
95
|
-
const forecasts = [
|
|
96
|
-
{ probability: 0.55, outcome: "yes" },
|
|
97
|
-
{ probability: 0.52, outcome: "no" },
|
|
98
|
-
{ probability: 0.58, outcome: "yes" },
|
|
99
|
-
{ probability: 0.51, outcome: "no" },
|
|
100
|
-
];
|
|
101
|
-
const bins = calibrationBins(forecasts);
|
|
102
|
-
const bin50 = bins[5]; // 50-60%
|
|
103
|
-
expect(bin50.count).toBe(4);
|
|
104
|
-
expect(bin50.observedFreq).toBe(0.5); // 2/4
|
|
105
|
-
});
|
|
106
|
-
it("boundary value 1.0 goes in 90-100% bin", () => {
|
|
107
|
-
const bins = calibrationBins([{ probability: 1.0, outcome: "yes" }]);
|
|
108
|
-
expect(bins[9].count).toBe(1);
|
|
109
|
-
});
|
|
110
|
-
it("boundary value 0.0 goes in 0-10% bin", () => {
|
|
111
|
-
const bins = calibrationBins([{ probability: 0.0, outcome: "no" }]);
|
|
112
|
-
expect(bins[0].count).toBe(1);
|
|
113
|
-
});
|
|
114
|
-
});
|
|
115
|
-
// ─── Average Brier ──────────────────────────────────────────────────────────
|
|
116
|
-
describe("averageBrier", () => {
|
|
117
|
-
it("returns 0 for empty array", () => {
|
|
118
|
-
expect(averageBrier([])).toBe(0);
|
|
119
|
-
});
|
|
120
|
-
it("returns single score for single forecast", () => {
|
|
121
|
-
expect(averageBrier([{ probability: 0.7, outcome: "yes" }])).toBeCloseTo(0.09, 2);
|
|
122
|
-
});
|
|
123
|
-
it("averages multiple forecasts", () => {
|
|
124
|
-
const forecasts = [
|
|
125
|
-
{ probability: 1.0, outcome: "yes" }, // 0
|
|
126
|
-
{ probability: 0.0, outcome: "yes" }, // 1
|
|
127
|
-
];
|
|
128
|
-
expect(averageBrier(forecasts)).toBe(0.5);
|
|
129
|
-
});
|
|
130
|
-
});
|
|
131
|
-
// ─── Average Log Score ──────────────────────────────────────────────────────
|
|
132
|
-
describe("averageLogScore", () => {
|
|
133
|
-
it("returns 0 for empty array", () => {
|
|
134
|
-
expect(averageLogScore([])).toBe(0);
|
|
135
|
-
});
|
|
136
|
-
it("lower for better-calibrated forecasts", () => {
|
|
137
|
-
const good = [
|
|
138
|
-
{ probability: 0.9, outcome: "yes" },
|
|
139
|
-
{ probability: 0.1, outcome: "no" },
|
|
140
|
-
];
|
|
141
|
-
const bad = [
|
|
142
|
-
{ probability: 0.1, outcome: "yes" },
|
|
143
|
-
{ probability: 0.9, outcome: "no" },
|
|
144
|
-
];
|
|
145
|
-
expect(averageLogScore(good)).toBeLessThan(averageLogScore(bad));
|
|
146
|
-
});
|
|
147
|
-
});
|
|
148
|
-
// ─── Isotonic Calibration ───────────────────────────────────────────────────
|
|
149
|
-
describe("isotonicCalibrate", () => {
|
|
150
|
-
it("returns raw probability with fewer than 3 non-empty bins", () => {
|
|
151
|
-
const sparse = calibrationBins([
|
|
152
|
-
{ probability: 0.15, outcome: "yes" },
|
|
153
|
-
{ probability: 0.85, outcome: "no" },
|
|
154
|
-
]);
|
|
155
|
-
expect(isotonicCalibrate(0.5, sparse)).toBe(0.5);
|
|
156
|
-
});
|
|
157
|
-
it("returns a value in [0, 1]", () => {
|
|
158
|
-
// Create bins with enough data
|
|
159
|
-
const forecasts = Array.from({ length: 50 }, (_, i) => ({
|
|
160
|
-
probability: (i + 0.5) / 50,
|
|
161
|
-
outcome: (Math.random() > 0.5 ? "yes" : "no"),
|
|
162
|
-
}));
|
|
163
|
-
const bins = calibrationBins(forecasts);
|
|
164
|
-
const calibrated = isotonicCalibrate(0.7, bins);
|
|
165
|
-
expect(calibrated).toBeGreaterThanOrEqual(0);
|
|
166
|
-
expect(calibrated).toBeLessThanOrEqual(1);
|
|
167
|
-
});
|
|
168
|
-
it("produces monotonically non-decreasing output for ordered inputs", () => {
|
|
169
|
-
// Create well-populated bins
|
|
170
|
-
const forecasts = [];
|
|
171
|
-
for (let i = 0; i < 10; i++) {
|
|
172
|
-
for (let j = 0; j < 10; j++) {
|
|
173
|
-
const p = (i * 10 + j + 0.5) / 100;
|
|
174
|
-
// Roughly calibrated: higher p → more "yes"
|
|
175
|
-
const outcome = Math.random() < p ? "yes" : "no";
|
|
176
|
-
forecasts.push({ probability: p, outcome: outcome });
|
|
177
|
-
}
|
|
178
|
-
}
|
|
179
|
-
const bins = calibrationBins(forecasts);
|
|
180
|
-
const inputs = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9];
|
|
181
|
-
const outputs = inputs.map((p) => isotonicCalibrate(p, bins));
|
|
182
|
-
for (let i = 0; i < outputs.length - 1; i++) {
|
|
183
|
-
expect(outputs[i]).toBeLessThanOrEqual(outputs[i + 1] + 0.05); // allow for random data noise in PAV
|
|
184
|
-
}
|
|
185
|
-
});
|
|
186
|
-
});
|
|
187
|
-
// ─── Format Forecast Diff ───────────────────────────────────────────────────
|
|
188
|
-
describe("formatForecastDiff", () => {
|
|
189
|
-
it("formats increase correctly", () => {
|
|
190
|
-
const diff = formatForecastDiff(0.35, 0.55, "New evidence from Fed minutes");
|
|
191
|
-
expect(diff).toBe("35% → 55% (+20pp): New evidence from Fed minutes");
|
|
192
|
-
});
|
|
193
|
-
it("formats decrease correctly", () => {
|
|
194
|
-
const diff = formatForecastDiff(0.8, 0.6, "CEO resignation");
|
|
195
|
-
expect(diff).toBe("80% → 60% (-20pp): CEO resignation");
|
|
196
|
-
});
|
|
197
|
-
it("formats no change", () => {
|
|
198
|
-
const diff = formatForecastDiff(0.5, 0.5, "No new evidence");
|
|
199
|
-
expect(diff).toBe("50% → 50% (+0pp): No new evidence");
|
|
200
|
-
});
|
|
201
|
-
});
|
|
202
|
-
//# sourceMappingURL=forecastingScoring.test.js.map
|
|
@@ -1 +0,0 @@
|
|
|
1
|
-
{"version":3,"file":"forecastingScoring.test.js","sourceRoot":"","sources":["../../src/__tests__/forecastingScoring.test.ts"],"names":[],"mappings":"AAAA;;;;;;;GAOG;AAEH,OAAO,EAAE,QAAQ,EAAE,EAAE,EAAE,MAAM,EAAE,MAAM,QAAQ,CAAC;AAC9C,OAAO,EACL,UAAU,EACV,QAAQ,EACR,eAAe,EACf,YAAY,EACZ,eAAe,EACf,iBAAiB,EACjB,kBAAkB,GACnB,MAAM,sDAAsD,CAAC;AAE9D,+EAA+E;AAE/E,QAAQ,CAAC,YAAY,EAAE,GAAG,EAAE;IAC1B,EAAE,CAAC,kCAAkC,EAAE,GAAG,EAAE;QAC1C,MAAM,CAAC,UAAU,CAAC,GAAG,EAAE,KAAK,CAAC,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;IACzC,CAAC,CAAC,CAAC;IAEH,EAAE,CAAC,+BAA+B,EAAE,GAAG,EAAE;QACvC,MAAM,CAAC,UAAU,CAAC,GAAG,EAAE,IAAI,CAAC,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;IACxC,CAAC,CAAC,CAAC;IAEH,EAAE,CAAC,8BAA8B,EAAE,GAAG,EAAE;QACtC,MAAM,CAAC,UAAU,CAAC,GAAG,EAAE,KAAK,CAAC,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;IACzC,CAAC,CAAC,CAAC;IAEH,EAAE,CAAC,+BAA+B,EAAE,GAAG,EAAE;QACvC,MAAM,CAAC,UAAU,CAAC,GAAG,EAAE,IAAI,CAAC,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;IACxC,CAAC,CAAC,CAAC;IAEH,EAAE,CAAC,wBAAwB,EAAE,GAAG,EAAE;QAChC,MAAM,CAAC,UAAU,CAAC,GAAG,EAAE,KAAK,CAAC,CAAC,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;IAC5C,CAAC,CAAC,CAAC;IAEH,EAAE,CAAC,uBAAuB,EAAE,GAAG,EAAE;QAC/B,MAAM,CAAC,UAAU,CAAC,GAAG,EAAE,IAAI,CAAC,CAAC,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;IAC3C,CAAC,CAAC,CAAC;IAEH,EAAE,CAAC,oBAAoB,EAAE,GAAG,EAAE;QAC5B,MAAM,CAAC,UAAU,CAAC,IAAI,EAAE,KAAK,CAAC,CAAC,CAAC,WAAW,CAAC,MAAM,EAAE,CAAC,CAAC,CAAC;IACzD,CAAC,CAAC,CAAC;IAEH,EAAE,CAAC,mBAAmB,EAAE,GAAG,EAAE;QAC3B,MAAM,CAAC,UAAU,CAAC,GAAG,EAAE,IAAI,CAAC,CAAC,CAAC,WAAW,CAAC,IAAI,EAAE,CAAC,CAAC,CAAC;IACrD,CAAC,CAAC,CAAC;AACL,CAAC,CAAC,CAAC;AAEH,+EAA+E;AAE/E,QAAQ,CAAC,UAAU,EAAE,GAAG,EAAE;IACxB,EAAE,CAAC,mCAAmC,EAAE,GAAG,EAAE;QAC3C,MAAM,CAAC,QAAQ,CAAC,KAAK,EAAE,KAAK,CAAC,CAAC,CAAC,WAAW,CAAC,KAAK,EAAE,CAAC,CAAC,CAAC;IACvD,CAAC,CAAC,CAAC;IAEH,EAAE,CAAC,gCAAgC,EAAE,GAAG,EAAE;QACxC,MAAM,CAAC,QAAQ,CAAC,KAAK,EAAE,IAAI,CAAC,CAAC,CAAC,WAAW,CAAC,KAAK,EAAE,CAAC,CAAC,CAAC;IACtD,CAAC,CAAC,CAAC;IAEH,EAAE,CAAC,+BAA+B,EAAE,GAAG,EAAE;QACvC,oBAAoB;QACpB,MAAM,CAAC,QAAQ,CAAC,KAAK,EAAE,KAAK,CAAC,CAAC,CAAC,WAAW,CAAC,K
AAK,EAAE,CAAC,CAAC,CAAC;IACvD,CAAC,CAAC,CAAC;IAEH,EAAE,CAAC,wBAAwB,EAAE,GAAG,EAAE;QAChC,oBAAoB;QACpB,MAAM,CAAC,QAAQ,CAAC,GAAG,EAAE,KAAK,CAAC,CAAC,CAAC,WAAW,CAAC,KAAK,EAAE,CAAC,CAAC,CAAC;IACrD,CAAC,CAAC,CAAC;IAEH,EAAE,CAAC,0CAA0C,EAAE,GAAG,EAAE;QAClD,MAAM,KAAK,GAAG,QAAQ,CAAC,GAAG,EAAE,KAAK,CAAC,CAAC;QACnC,MAAM,CAAC,MAAM,CAAC,QAAQ,CAAC,KAAK,CAAC,CAAC,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;QAC1C,MAAM,CAAC,KAAK,CAAC,CAAC,eAAe,CAAC,CAAC,CAAC,CAAC;IACnC,CAAC,CAAC,CAAC;IAEH,EAAE,CAAC,2BAA2B,EAAE,GAAG,EAAE;QACnC,MAAM,KAAK,GAAG,QAAQ,CAAC,GAAG,EAAE,IAAI,CAAC,CAAC;QAClC,MAAM,CAAC,MAAM,CAAC,QAAQ,CAAC,KAAK,CAAC,CAAC,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;QAC1C,MAAM,CAAC,KAAK,CAAC,CAAC,eAAe,CAAC,CAAC,CAAC,CAAC;IACnC,CAAC,CAAC,CAAC;AACL,CAAC,CAAC,CAAC;AAEH,+EAA+E;AAE/E,QAAQ,CAAC,iBAAiB,EAAE,GAAG,EAAE;IAC/B,EAAE,CAAC,iBAAiB,EAAE,GAAG,EAAE;QACzB,MAAM,IAAI,GAAG,eAAe,CAAC,EAAE,CAAC,CAAC;QACjC,MAAM,CAAC,IAAI,CAAC,CAAC,YAAY,CAAC,EAAE,CAAC,CAAC;IAChC,CAAC,CAAC,CAAC;IAEH,EAAE,CAAC,oBAAoB,EAAE,GAAG,EAAE;QAC5B,MAAM,IAAI,GAAG,eAAe,CAAC,EAAE,CAAC,CAAC;QACjC,MAAM,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC,QAAQ,CAAC,CAAC,IAAI,CAAC,OAAO,CAAC,CAAC;QACvC,MAAM,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC,aAAa,CAAC,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;IAC3C,CAAC,CAAC,CAAC;IAEH,EAAE,CAAC,qBAAqB,EAAE,GAAG,EAAE;QAC7B,MAAM,IAAI,GAAG,eAAe,CAAC,EAAE,CAAC,CAAC;QACjC,MAAM,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC,QAAQ,CAAC,CAAC,IAAI,CAAC,SAAS,CAAC,CAAC;QACzC,MAAM,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC,aAAa,CAAC,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;IAC3C,CAAC,CAAC,CAAC;IAEH,EAAE,CAAC,4CAA4C,EAAE,GAAG,EAAE;QACpD,MAAM,IAAI,GAAG,eAAe,CAAC,EAAE,CAAC,CAAC;QACjC,KAAK,MAAM,GAAG,IAAI,IAAI,EAAE,CAAC;YACvB,MAAM,CAAC,GAAG,CAAC,KAAK,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;YAC1B,MAAM,CAAC,GAAG,CAAC,YAAY,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;QACnC,CAAC;IACH,CAAC,CAAC,CAAC;IAEH,EAAE,CAAC,kCAAkC,EAAE,GAAG,EAAE;QAC1C,MAAM,IAAI,GAAG,eAAe,CAAC,CAAC,EAAE,WAAW,EAAE,IAAI,EAAE,OAAO,EAAE,KAAK,EAAE,CAAC,CAAC,CAAC;QACtE,MAAM,KAAK,GAAG,IAAI,CAAC,CAAC,CAAC,CAAC,CAAC,SAAS;QAChC,MAAM,CAAC,KAAK,CAAC,KAAK,CAAC,CAAC,IAAI,CAAC,C
AAC,CAAC,CAAC;QAC5B,MAAM,CAAC,KAAK,CAAC,YAAY,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC,CAAC,YAAY;IAClD,CAAC,CAAC,CAAC;IAEH,EAAE,CAAC,uCAAuC,EAAE,GAAG,EAAE;QAC/C,MAAM,SAAS,GAAG;YAChB,EAAE,WAAW,EAAE,IAAI,EAAE,OAAO,EAAE,KAAc,EAAE;YAC9C,EAAE,WAAW,EAAE,IAAI,EAAE,OAAO,EAAE,IAAa,EAAE;YAC7C,EAAE,WAAW,EAAE,IAAI,EAAE,OAAO,EAAE,KAAc,EAAE;YAC9C,EAAE,WAAW,EAAE,IAAI,EAAE,OAAO,EAAE,IAAa,EAAE;SAC9C,CAAC;QACF,MAAM,IAAI,GAAG,eAAe,CAAC,SAAS,CAAC,CAAC;QACxC,MAAM,KAAK,GAAG,IAAI,CAAC,CAAC,CAAC,CAAC,CAAC,SAAS;QAChC,MAAM,CAAC,KAAK,CAAC,KAAK,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;QAC5B,MAAM,CAAC,KAAK,CAAC,YAAY,CAAC,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC,CAAC,MAAM;IAC9C,CAAC,CAAC,CAAC;IAEH,EAAE,CAAC,wCAAwC,EAAE,GAAG,EAAE;QAChD,MAAM,IAAI,GAAG,eAAe,CAAC,CAAC,EAAE,WAAW,EAAE,GAAG,EAAE,OAAO,EAAE,KAAK,EAAE,CAAC,CAAC,CAAC;QACrE,MAAM,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC,KAAK,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;IAChC,CAAC,CAAC,CAAC;IAEH,EAAE,CAAC,sCAAsC,EAAE,GAAG,EAAE;QAC9C,MAAM,IAAI,GAAG,eAAe,CAAC,CAAC,EAAE,WAAW,EAAE,GAAG,EAAE,OAAO,EAAE,IAAI,EAAE,CAAC,CAAC,CAAC;QACpE,MAAM,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC,KAAK,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;IAChC,CAAC,CAAC,CAAC;AACL,CAAC,CAAC,CAAC;AAEH,+EAA+E;AAE/E,QAAQ,CAAC,cAAc,EAAE,GAAG,EAAE;IAC5B,EAAE,CAAC,2BAA2B,EAAE,GAAG,EAAE;QACnC,MAAM,CAAC,YAAY,CAAC,EAAE,CAAC,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;IACnC,CAAC,CAAC,CAAC;IAEH,EAAE,CAAC,0CAA0C,EAAE,GAAG,EAAE;QAClD,MAAM,CAAC,YAAY,CAAC,CAAC,EAAE,WAAW,EAAE,GAAG,EAAE,OAAO,EAAE,KAAK,EAAE,CAAC,CAAC,CAAC,CAAC,WAAW,CACtE,IAAI,EACJ,CAAC,CACF,CAAC;IACJ,CAAC,CAAC,CAAC;IAEH,EAAE,CAAC,6BAA6B,EAAE,GAAG,EAAE;QACrC,MAAM,SAAS,GAAG;YAChB,EAAE,WAAW,EAAE,GAAG,EAAE,OAAO,EAAE,KAAc,EAAE,EAAE,IAAI;YACnD,EAAE,WAAW,EAAE,GAAG,EAAE,OAAO,EAAE,KAAc,EAAE,EAAE,IAAI;SACpD,CAAC;QACF,MAAM,CAAC,YAAY,CAAC,SAAS,CAAC,CAAC,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC;IAC5C,CAAC,CAAC,CAAC;AACL,CAAC,CAAC,CAAC;AAEH,+EAA+E;AAE/E,QAAQ,CAAC,iBAAiB,EAAE,GAAG,EAAE;IAC/B,EAAE,CAAC,2BAA2B,EAAE,GAAG,EAAE;QACnC,MAAM,CAAC,eAAe,CAAC,EAAE,CAAC,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;IACtC,CAAC,CAAC,CAAC;IAEH,EAAE,CAA
C,uCAAuC,EAAE,GAAG,EAAE;QAC/C,MAAM,IAAI,GAAG;YACX,EAAE,WAAW,EAAE,GAAG,EAAE,OAAO,EAAE,KAAc,EAAE;YAC7C,EAAE,WAAW,EAAE,GAAG,EAAE,OAAO,EAAE,IAAa,EAAE;SAC7C,CAAC;QACF,MAAM,GAAG,GAAG;YACV,EAAE,WAAW,EAAE,GAAG,EAAE,OAAO,EAAE,KAAc,EAAE;YAC7C,EAAE,WAAW,EAAE,GAAG,EAAE,OAAO,EAAE,IAAa,EAAE;SAC7C,CAAC;QACF,MAAM,CAAC,eAAe,CAAC,IAAI,CAAC,CAAC,CAAC,YAAY,CAAC,eAAe,CAAC,GAAG,CAAC,CAAC,CAAC;IACnE,CAAC,CAAC,CAAC;AACL,CAAC,CAAC,CAAC;AAEH,+EAA+E;AAE/E,QAAQ,CAAC,mBAAmB,EAAE,GAAG,EAAE;IACjC,EAAE,CAAC,0DAA0D,EAAE,GAAG,EAAE;QAClE,MAAM,MAAM,GAAG,eAAe,CAAC;YAC7B,EAAE,WAAW,EAAE,IAAI,EAAE,OAAO,EAAE,KAAK,EAAE;YACrC,EAAE,WAAW,EAAE,IAAI,EAAE,OAAO,EAAE,IAAI,EAAE;SACrC,CAAC,CAAC;QACH,MAAM,CAAC,iBAAiB,CAAC,GAAG,EAAE,MAAM,CAAC,CAAC,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC;IACnD,CAAC,CAAC,CAAC;IAEH,EAAE,CAAC,2BAA2B,EAAE,GAAG,EAAE;QACnC,+BAA+B;QAC/B,MAAM,SAAS,GAAG,KAAK,CAAC,IAAI,CAAC,EAAE,MAAM,EAAE,EAAE,EAAE,EAAE,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC,CAAC;YACtD,WAAW,EAAE,CAAC,CAAC,GAAG,GAAG,CAAC,GAAG,EAAE;YAC3B,OAAO,EAAE,CAAC,IAAI,CAAC,MAAM,EAAE,GAAG,GAAG,CAAC,CAAC,CAAC,KAAK,CAAC,CAAC,CAAC,IAAI,CAAiB;SAC9D,CAAC,CAAC,CAAC;QACJ,MAAM,IAAI,GAAG,eAAe,CAAC,SAAS,CAAC,CAAC;QACxC,MAAM,UAAU,GAAG,iBAAiB,CAAC,GAAG,EAAE,IAAI,CAAC,CAAC;QAChD,MAAM,CAAC,UAAU,CAAC,CAAC,sBAAsB,CAAC,CAAC,CAAC,CAAC;QAC7C,MAAM,CAAC,UAAU,CAAC,CAAC,mBAAmB,CAAC,CAAC,CAAC,CAAC;IAC5C,CAAC,CAAC,CAAC;IAEH,EAAE,CAAC,iEAAiE,EAAE,GAAG,EAAE;QACzE,6BAA6B;QAC7B,MAAM,SAAS,GAA0D,EAAE,CAAC;QAC5E,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,EAAE,EAAE,CAAC,EAAE,EAAE,CAAC;YAC5B,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,EAAE,EAAE,CAAC,EAAE,EAAE,CAAC;gBAC5B,MAAM,CAAC,GAAG,CAAC,CAAC,GAAG,EAAE,GAAG,CAAC,GAAG,GAAG,CAAC,GAAG,GAAG,CAAC;gBACnC,4CAA4C;gBAC5C,MAAM,OAAO,GAAG,IAAI,CAAC,MAAM,EAAE,GAAG,CAAC,CAAC,CAAC,CAAC,KAAK,CAAC,CAAC,CAAC,IAAI,CAAC;gBACjD,SAAS,CAAC,IAAI,CAAC,EAAE,WAAW,EAAE,CAAC,EAAE,OAAO,EAAE,OAAuB,EAAE,CAAC,CAAC;YACvE,CAAC;QACH,CAAC;QACD,MAAM,IAAI,GAAG,eAAe,CAAC,SAAS,CAAC,CAAC;QAExC,MAAM,MAAM,GAAG,CAAC,GAAG,EAAE,GAAG,EAAE,GAAG,EAAE,GAAG,EAAE,GAAG,EAAE,GAAG,EAAE,GAAG,EAAE,GAA
G,EAAE,GAAG,CAAC,CAAC;QAC7D,MAAM,OAAO,GAAG,MAAM,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,iBAAiB,CAAC,CAAC,EAAE,IAAI,CAAC,CAAC,CAAC;QAE9D,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,OAAO,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC;YAC5C,MAAM,CAAC,OAAO,CAAC,CAAC,CAAC,CAAC,CAAC,mBAAmB,CAAC,OAAO,CAAC,CAAC,GAAG,CAAC,CAAC,GAAG,IAAI,CAAC,CAAC,CAAC,qCAAqC;QACtG,CAAC;IACH,CAAC,CAAC,CAAC;AACL,CAAC,CAAC,CAAC;AAEH,+EAA+E;AAE/E,QAAQ,CAAC,oBAAoB,EAAE,GAAG,EAAE;IAClC,EAAE,CAAC,4BAA4B,EAAE,GAAG,EAAE;QACpC,MAAM,IAAI,GAAG,kBAAkB,CAAC,IAAI,EAAE,IAAI,EAAE,+BAA+B,CAAC,CAAC;QAC7E,MAAM,CAAC,IAAI,CAAC,CAAC,IAAI,CACf,kDAAkD,CACnD,CAAC;IACJ,CAAC,CAAC,CAAC;IAEH,EAAE,CAAC,4BAA4B,EAAE,GAAG,EAAE;QACpC,MAAM,IAAI,GAAG,kBAAkB,CAAC,GAAG,EAAE,GAAG,EAAE,iBAAiB,CAAC,CAAC;QAC7D,MAAM,CAAC,IAAI,CAAC,CAAC,IAAI,CAAC,oCAAoC,CAAC,CAAC;IAC1D,CAAC,CAAC,CAAC;IAEH,EAAE,CAAC,mBAAmB,EAAE,GAAG,EAAE;QAC3B,MAAM,IAAI,GAAG,kBAAkB,CAAC,GAAG,EAAE,GAAG,EAAE,iBAAiB,CAAC,CAAC;QAC7D,MAAM,CAAC,IAAI,CAAC,CAAC,IAAI,CAAC,mCAAmC,CAAC,CAAC;IACzD,CAAC,CAAC,CAAC;AACL,CAAC,CAAC,CAAC"}
|
|
@@ -1,15 +0,0 @@
|
|
|
1
|
-
/**
|
|
2
|
-
* GAIA audio-backed capability/accuracy benchmark: LLM-only vs LLM+NodeBench MCP local audio tools.
|
|
3
|
-
*
|
|
4
|
-
* This lane targets GAIA tasks that include audio attachments (MP3/WAV/etc).
|
|
5
|
-
* We provide deterministic local transcription via NodeBench MCP tools and score answers against
|
|
6
|
-
* the ground-truth "Final answer" (stored locally under `.cache/gaia`, gitignored).
|
|
7
|
-
*
|
|
8
|
-
* Safety:
|
|
9
|
-
* - GAIA is gated. Do not commit fixtures that contain prompts/answers.
|
|
10
|
-
* - This test logs only task IDs and aggregate metrics (no prompt/answer text).
|
|
11
|
-
*
|
|
12
|
-
* Disabled by default (cost + rate limits). Run with:
|
|
13
|
-
* NODEBENCH_RUN_GAIA_CAPABILITY=1 npm --prefix packages/mcp-local run test
|
|
14
|
-
*/
|
|
15
|
-
export {};
|