nodebench-mcp 2.69.0 → 3.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +95 -39
- package/dist/agents/alertRouter.d.ts +38 -0
- package/dist/agents/alertRouter.js +151 -0
- package/dist/agents/alertRouter.js.map +1 -0
- package/dist/agents/entityMemory.d.ts +40 -0
- package/dist/agents/entityMemory.js +64 -0
- package/dist/agents/entityMemory.js.map +1 -0
- package/dist/agents/subAgents.d.ts +35 -0
- package/dist/agents/subAgents.js +62 -0
- package/dist/agents/subAgents.js.map +1 -0
- package/dist/benchmarks/benchmarkRunner.js +14 -0
- package/dist/benchmarks/benchmarkRunner.js.map +1 -1
- package/dist/benchmarks/chainEval.js +107 -0
- package/dist/benchmarks/chainEval.js.map +1 -1
- package/dist/benchmarks/llmJudgeEval.js +85 -0
- package/dist/benchmarks/llmJudgeEval.js.map +1 -1
- package/dist/benchmarks/searchQualityEval.js +118 -5
- package/dist/benchmarks/searchQualityEval.js.map +1 -1
- package/dist/cli/search.d.ts +13 -0
- package/dist/cli/search.js +130 -0
- package/dist/cli/search.js.map +1 -0
- package/dist/db.d.ts +6 -2
- package/dist/db.js +470 -3
- package/dist/db.js.map +1 -1
- package/dist/index.js +349 -64
- package/dist/index.js.map +1 -1
- package/dist/profiler/behaviorStore.d.ts +97 -0
- package/dist/profiler/behaviorStore.js +276 -0
- package/dist/profiler/behaviorStore.js.map +1 -0
- package/dist/profiler/eventCollector.d.ts +119 -0
- package/dist/profiler/eventCollector.js +267 -0
- package/dist/profiler/eventCollector.js.map +1 -0
- package/dist/profiler/index.d.ts +15 -0
- package/dist/profiler/index.js +16 -0
- package/dist/profiler/index.js.map +1 -0
- package/dist/profiler/mcpProxy.d.ts +49 -0
- package/dist/profiler/mcpProxy.js +123 -0
- package/dist/profiler/mcpProxy.js.map +1 -0
- package/dist/profiler/modelRouter.d.ts +30 -0
- package/dist/profiler/modelRouter.js +99 -0
- package/dist/profiler/modelRouter.js.map +1 -0
- package/dist/profiler/otelReceiver.d.ts +17 -0
- package/dist/profiler/otelReceiver.js +62 -0
- package/dist/profiler/otelReceiver.js.map +1 -0
- package/dist/profiler/proofEngine.d.ts +41 -0
- package/dist/profiler/proofEngine.js +93 -0
- package/dist/profiler/proofEngine.js.map +1 -0
- package/dist/profiler/workflowTemplates.d.ts +41 -0
- package/dist/profiler/workflowTemplates.js +95 -0
- package/dist/profiler/workflowTemplates.js.map +1 -0
- package/dist/providers/localMemoryProvider.js +3 -2
- package/dist/providers/localMemoryProvider.js.map +1 -1
- package/dist/runtimeConfig.d.ts +11 -0
- package/dist/runtimeConfig.js +27 -0
- package/dist/runtimeConfig.js.map +1 -0
- package/dist/security/auditLog.js +8 -3
- package/dist/security/auditLog.js.map +1 -1
- package/dist/subconscious/blocks.d.ts +43 -0
- package/dist/subconscious/blocks.js +158 -0
- package/dist/subconscious/blocks.js.map +1 -0
- package/dist/subconscious/classifier.d.ts +22 -0
- package/dist/subconscious/classifier.js +118 -0
- package/dist/subconscious/classifier.js.map +1 -0
- package/dist/subconscious/graphEngine.d.ts +65 -0
- package/dist/subconscious/graphEngine.js +234 -0
- package/dist/subconscious/graphEngine.js.map +1 -0
- package/dist/subconscious/index.d.ts +19 -0
- package/dist/subconscious/index.js +20 -0
- package/dist/subconscious/index.js.map +1 -0
- package/dist/subconscious/tools.d.ts +5 -0
- package/dist/subconscious/tools.js +255 -0
- package/dist/subconscious/tools.js.map +1 -0
- package/dist/subconscious/whisperPolicy.d.ts +20 -0
- package/dist/subconscious/whisperPolicy.js +171 -0
- package/dist/subconscious/whisperPolicy.js.map +1 -0
- package/dist/sweep/engine.d.ts +27 -0
- package/dist/sweep/engine.js +244 -0
- package/dist/sweep/engine.js.map +1 -0
- package/dist/sweep/index.d.ts +9 -0
- package/dist/sweep/index.js +8 -0
- package/dist/sweep/index.js.map +1 -0
- package/dist/sweep/sources/github_trending.d.ts +6 -0
- package/dist/sweep/sources/github_trending.js +37 -0
- package/dist/sweep/sources/github_trending.js.map +1 -0
- package/dist/sweep/sources/hackernews.d.ts +7 -0
- package/dist/sweep/sources/hackernews.js +57 -0
- package/dist/sweep/sources/hackernews.js.map +1 -0
- package/dist/sweep/sources/openbb_finance.d.ts +9 -0
- package/dist/sweep/sources/openbb_finance.js +46 -0
- package/dist/sweep/sources/openbb_finance.js.map +1 -0
- package/dist/sweep/sources/producthunt.d.ts +6 -0
- package/dist/sweep/sources/producthunt.js +41 -0
- package/dist/sweep/sources/producthunt.js.map +1 -0
- package/dist/sweep/sources/web_signals.d.ts +7 -0
- package/dist/sweep/sources/web_signals.js +63 -0
- package/dist/sweep/sources/web_signals.js.map +1 -0
- package/dist/sweep/sources/yahoo_finance.d.ts +6 -0
- package/dist/sweep/sources/yahoo_finance.js +47 -0
- package/dist/sweep/sources/yahoo_finance.js.map +1 -0
- package/dist/sweep/types.d.ts +50 -0
- package/dist/sweep/types.js +9 -0
- package/dist/sweep/types.js.map +1 -0
- package/dist/sync/founderEpisodeStore.d.ts +98 -0
- package/dist/sync/founderEpisodeStore.js +230 -0
- package/dist/sync/founderEpisodeStore.js.map +1 -0
- package/dist/sync/hyperloopArchive.d.ts +51 -0
- package/dist/sync/hyperloopArchive.js +153 -0
- package/dist/sync/hyperloopArchive.js.map +1 -0
- package/dist/sync/hyperloopEval.d.ts +123 -0
- package/dist/sync/hyperloopEval.js +389 -0
- package/dist/sync/hyperloopEval.js.map +1 -0
- package/dist/sync/hyperloopEval.test.d.ts +4 -0
- package/dist/sync/hyperloopEval.test.js +60 -0
- package/dist/sync/hyperloopEval.test.js.map +1 -0
- package/dist/sync/protocol.d.ts +172 -0
- package/dist/sync/protocol.js +9 -0
- package/dist/sync/protocol.js.map +1 -0
- package/dist/sync/sessionMemory.d.ts +47 -0
- package/dist/sync/sessionMemory.js +138 -0
- package/dist/sync/sessionMemory.js.map +1 -0
- package/dist/sync/store.d.ts +384 -0
- package/dist/sync/store.js +1435 -0
- package/dist/sync/store.js.map +1 -0
- package/dist/sync/store.test.d.ts +4 -0
- package/dist/sync/store.test.js +43 -0
- package/dist/sync/store.test.js.map +1 -0
- package/dist/sync/syncBridgeClient.d.ts +30 -0
- package/dist/sync/syncBridgeClient.js +172 -0
- package/dist/sync/syncBridgeClient.js.map +1 -0
- package/dist/tools/autonomousDeliveryTools.d.ts +2 -0
- package/dist/tools/autonomousDeliveryTools.js +1104 -0
- package/dist/tools/autonomousDeliveryTools.js.map +1 -0
- package/dist/tools/claudeCodeIngestTools.d.ts +10 -0
- package/dist/tools/claudeCodeIngestTools.js +347 -0
- package/dist/tools/claudeCodeIngestTools.js.map +1 -0
- package/dist/tools/coreWorkflowTools.d.ts +2 -0
- package/dist/tools/coreWorkflowTools.js +488 -0
- package/dist/tools/coreWorkflowTools.js.map +1 -0
- package/dist/tools/deltaTools.d.ts +15 -0
- package/dist/tools/deltaTools.js +1522 -0
- package/dist/tools/deltaTools.js.map +1 -0
- package/dist/tools/entityLookupTools.d.ts +14 -0
- package/dist/tools/entityLookupTools.js +159 -0
- package/dist/tools/entityLookupTools.js.map +1 -0
- package/dist/tools/entityTemporalTools.d.ts +12 -0
- package/dist/tools/entityTemporalTools.js +330 -0
- package/dist/tools/entityTemporalTools.js.map +1 -0
- package/dist/tools/founderLocalPipeline.d.ts +215 -0
- package/dist/tools/founderLocalPipeline.js +1516 -2
- package/dist/tools/founderLocalPipeline.js.map +1 -1
- package/dist/tools/founderOperatingModel.d.ts +120 -0
- package/dist/tools/founderOperatingModel.js +469 -0
- package/dist/tools/founderOperatingModel.js.map +1 -0
- package/dist/tools/founderOperatingModelTools.d.ts +2 -0
- package/dist/tools/founderOperatingModelTools.js +169 -0
- package/dist/tools/founderOperatingModelTools.js.map +1 -0
- package/dist/tools/founderStrategicOpsTools.d.ts +2 -0
- package/dist/tools/founderStrategicOpsTools.js +1310 -0
- package/dist/tools/founderStrategicOpsTools.js.map +1 -0
- package/dist/tools/graphifyTools.d.ts +19 -0
- package/dist/tools/graphifyTools.js +375 -0
- package/dist/tools/graphifyTools.js.map +1 -0
- package/dist/tools/index.d.ts +3 -0
- package/dist/tools/index.js +4 -0
- package/dist/tools/index.js.map +1 -1
- package/dist/tools/monteCarloTools.d.ts +16 -0
- package/dist/tools/monteCarloTools.js +225 -0
- package/dist/tools/monteCarloTools.js.map +1 -0
- package/dist/tools/packetCompilerTools.d.ts +12 -0
- package/dist/tools/packetCompilerTools.js +322 -0
- package/dist/tools/packetCompilerTools.js.map +1 -0
- package/dist/tools/planSynthesisTools.d.ts +15 -0
- package/dist/tools/planSynthesisTools.js +455 -0
- package/dist/tools/planSynthesisTools.js.map +1 -0
- package/dist/tools/profilerTools.d.ts +20 -0
- package/dist/tools/profilerTools.js +364 -0
- package/dist/tools/profilerTools.js.map +1 -0
- package/dist/tools/savingsTools.d.ts +11 -0
- package/dist/tools/savingsTools.js +155 -0
- package/dist/tools/savingsTools.js.map +1 -0
- package/dist/tools/scenarioCompilerTools.d.ts +14 -0
- package/dist/tools/scenarioCompilerTools.js +290 -0
- package/dist/tools/scenarioCompilerTools.js.map +1 -0
- package/dist/tools/sharedContextTools.d.ts +2 -0
- package/dist/tools/sharedContextTools.js +423 -0
- package/dist/tools/sharedContextTools.js.map +1 -0
- package/dist/tools/sitemapTools.d.ts +15 -0
- package/dist/tools/sitemapTools.js +560 -0
- package/dist/tools/sitemapTools.js.map +1 -0
- package/dist/tools/sweepTools.d.ts +9 -0
- package/dist/tools/sweepTools.js +112 -0
- package/dist/tools/sweepTools.js.map +1 -0
- package/dist/tools/syncBridgeTools.d.ts +2 -0
- package/dist/tools/syncBridgeTools.js +258 -0
- package/dist/tools/syncBridgeTools.js.map +1 -0
- package/dist/tools/toolRegistry.js +1216 -49
- package/dist/tools/toolRegistry.js.map +1 -1
- package/dist/tools/workspaceTools.d.ts +19 -0
- package/dist/tools/workspaceTools.js +762 -0
- package/dist/tools/workspaceTools.js.map +1 -0
- package/dist/toolsetRegistry.js +88 -2
- package/dist/toolsetRegistry.js.map +1 -1
- package/package.json +36 -36
- package/rules/nodebench-agentic-reliability.md +32 -0
- package/rules/nodebench-analyst-diagnostic.md +25 -0
- package/rules/nodebench-auto-qa.md +31 -0
- package/rules/nodebench-completion-traceability.md +22 -0
- package/rules/nodebench-flywheel-continuous.md +25 -0
- package/rules/nodebench-pre-release-review.md +24 -0
- package/rules/nodebench-qa-dogfood.md +26 -0
- package/rules/nodebench-scenario-testing.md +30 -0
- package/rules/nodebench-self-direction.md +23 -0
- package/rules/nodebench-self-judge-loop.md +24 -0
- package/scripts/install.sh +215 -0
|
@@ -0,0 +1,389 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* hyperloopEval.ts — Per-task evaluation metrics + improvement@k tracking.
|
|
3
|
+
*
|
|
4
|
+
* Runs after every task to compute quality metrics.
|
|
5
|
+
* Compares against archive reference variant to compute improvement delta.
|
|
6
|
+
* Tracks improvement@k across generations for cross-domain transfer measurement.
|
|
7
|
+
*/
|
|
8
|
+
import { getDb, genId } from "../db.js";
|
|
9
|
+
import { lookupBestVariant, addArchiveEntry } from "./hyperloopArchive.js";
|
|
10
|
+
// ─── Schema ──────────────────────────────────────────────────────
|
|
11
|
+
/**
 * Ensures the evaluation storage exists.
 *
 * Runs the `CREATE TABLE IF NOT EXISTS` / `CREATE INDEX IF NOT EXISTS`
 * statements for `hyperloop_evaluations`, then asks `ensureEvalColumn` to add
 * each scorecard column that an already-existing table does not have yet.
 */
export function initEvalTables() {
    const schemaSql = `
CREATE TABLE IF NOT EXISTS hyperloop_evaluations (
eval_id TEXT PRIMARY KEY,
episode_id TEXT NOT NULL,
query TEXT NOT NULL,
lens TEXT NOT NULL,
entity TEXT,
classification TEXT NOT NULL,
evidence_coverage REAL NOT NULL,
contradiction_rate REAL NOT NULL,
grounding_rate REAL NOT NULL,
user_edit_distance REAL NOT NULL,
was_exported INTEGER NOT NULL DEFAULT 0,
was_delegated INTEGER NOT NULL DEFAULT 0,
latency_ms INTEGER NOT NULL,
cost_usd REAL NOT NULL DEFAULT 0,
tool_call_count INTEGER NOT NULL DEFAULT 0,
quality_score REAL NOT NULL,
rubric_version TEXT NOT NULL DEFAULT 'hyperloop_v2',
score_components TEXT NOT NULL DEFAULT '[]',
gates TEXT NOT NULL DEFAULT '[]',
policy_action TEXT NOT NULL DEFAULT 'archive_only',
llm_judge TEXT,
reference_variant_id TEXT,
improvement_delta REAL NOT NULL DEFAULT 0,
timestamp TEXT NOT NULL
);

CREATE INDEX IF NOT EXISTS idx_eval_episode ON hyperloop_evaluations(episode_id);
CREATE INDEX IF NOT EXISTS idx_eval_quality ON hyperloop_evaluations(quality_score DESC);
CREATE INDEX IF NOT EXISTS idx_eval_classification ON hyperloop_evaluations(classification);
`;
    getDb().exec(schemaSql);
    // Columns that are checked (and added when absent) after the table exists.
    const scorecardColumns = [
        ["rubric_version", "TEXT NOT NULL DEFAULT 'hyperloop_v2'"],
        ["score_components", "TEXT NOT NULL DEFAULT '[]'"],
        ["gates", "TEXT NOT NULL DEFAULT '[]'"],
        ["policy_action", "TEXT NOT NULL DEFAULT 'archive_only'"],
        ["llm_judge", "TEXT"],
    ];
    for (const [columnName, definition] of scorecardColumns) {
        ensureEvalColumn(columnName, definition);
    }
}
|
|
51
|
+
/**
 * Adds a column to `hyperloop_evaluations` unless `PRAGMA table_info`
 * already lists a column with that name.
 *
 * @param columnName Column to look for and, if missing, add.
 * @param definition SQL text placed after the column name in `ADD COLUMN`.
 */
function ensureEvalColumn(columnName, definition) {
    const db = getDb();
    const existingColumns = db.prepare("PRAGMA table_info(hyperloop_evaluations)").all();
    for (const info of existingColumns) {
        if (info.name === columnName) {
            // Already present: nothing to alter.
            return;
        }
    }
    db.exec(`ALTER TABLE hyperloop_evaluations ADD COLUMN ${columnName} ${definition}`);
}
|
|
58
|
+
// ─── Compute quality score ───────────────────────────────────────
|
|
59
|
+
/**
 * Restricts a number to the closed interval [0, 1].
 *
 * @param value Candidate number.
 * @returns 0 for anything `Number.isFinite` rejects (NaN, ±Infinity,
 *   non-numbers), otherwise the value limited to the 0..1 range.
 */
function clamp01(value) {
    const usable = Number.isFinite(value) ? value : 0;
    if (usable <= 0) {
        return 0;
    }
    return usable >= 1 ? 1 : usable;
}
|
|
64
|
+
/**
 * Clamps a value to [0, 1] via `clamp01` and rounds it to two decimals.
 *
 * @param value Raw score.
 * @returns The clamped score expressed in whole hundredths.
 */
function roundScore(value) {
    const hundredths = Math.round(clamp01(value) * 100);
    return hundredths / 100;
}
|
|
67
|
+
/**
 * Builds the scorecard for one run from its raw metrics.
 *
 * Six weighted components (weights 0.3, 0.25, 0.15, 0.15, 0.1, 0.05) are each
 * rounded to two decimals; the quality score is the rounded sum of those
 * rounded contributions. Five gates are then evaluated, two of them critical.
 * The policy action is "candidate" only when no critical gate failed and the
 * quality score is at least 0.62; otherwise it is "archive_only".
 *
 * @param metrics Object read for `evidenceCoverage`, `groundingRate`,
 *   `contradictionRate`, `userEditDistance`, `wasExported`, `wasDelegated`
 *   and `latencyMs`.
 * @returns `{ qualityScore, scoreComponents, gates, policyAction }`.
 */
export function buildHyperloopScorecard(metrics) {
    // One component row; `unrounded` is the normalized score before rounding.
    const toComponent = (key, label, weight, rawValue, unrounded, detail) => ({
        key,
        label,
        weight,
        rawValue,
        normalizedScore: roundScore(unrounded),
        weightedContribution: roundScore(unrounded * weight),
        detail,
    });
    // One gate row; the reason text follows the pass/fail outcome.
    const toGate = (key, label, passed, critical, passReason, failReason) => ({
        key,
        label,
        passed,
        critical,
        reason: passed ? passReason : failReason,
    });
    const contradictionSignal = Math.min(1, metrics.contradictionRate * 4);
    const editHeadroom = 1 - metrics.userEditDistance;
    const outcomeCredit = (metrics.wasExported ? 0.5 : 0) + (metrics.wasDelegated ? 0.5 : 0);
    // Latency tiers as [ceiling in ms, credit]; anything slower earns 0.1.
    const latencyTiers = [
        [5000, 1],
        [12000, 0.6],
        [20000, 0.35],
    ];
    const matchedTier = latencyTiers.find(([ceiling]) => metrics.latencyMs <= ceiling);
    const latencyCredit = matchedTier ? matchedTier[1] : 0.1;
    const scoreComponents = [
        toComponent("evidence_coverage", "Evidence coverage", 0.3, roundScore(metrics.evidenceCoverage), metrics.evidenceCoverage, "Verified or source-backed signals divided by total surfaced signals."),
        toComponent("claim_grounding", "Claim grounding", 0.25, roundScore(metrics.groundingRate), metrics.groundingRate, "Claims with explicit evidence text divided by total surfaced claims."),
        toComponent("contradiction_capture", "Contradiction capture", 0.15, roundScore(metrics.contradictionRate), contradictionSignal, "Non-zero only when the run actually surfaced contradictions or diligence flags."),
        toComponent("human_edit_burden", "Human edit burden", 0.15, roundScore(editHeadroom), editHeadroom, "Starts at 1.0 and falls as the human has to rewrite more of the result."),
        toComponent("outcome_readiness", "Outcome readiness", 0.1, roundScore(outcomeCredit), outcomeCredit, "Gives credit only when the result was strong enough to export or delegate."),
        // The latency raw value is reported as-is (milliseconds), not rounded.
        toComponent("latency_budget", "Latency budget", 0.05, metrics.latencyMs, latencyCredit, "Fast runs get a small bonus, but speed is deliberately not a dominant factor."),
    ];
    let contributionTotal = 0;
    for (const component of scoreComponents) {
        contributionTotal += component.weightedContribution;
    }
    const qualityScore = roundScore(contributionTotal);
    const gates = [
        toGate("minimum_evidence", "Minimum evidence", metrics.evidenceCoverage >= 0.25, true, "Evidence coverage cleared the minimum threshold.", "Too few surfaced signals were actually source-backed."),
        toGate("minimum_grounding", "Minimum grounding", metrics.groundingRate >= 0.25, true, "Grounding rate cleared the minimum threshold.", "Too many claims are unsupported or missing direct evidence text."),
        toGate("human_edit_load", "Human edit load", metrics.userEditDistance <= 0.6, false, "Human edit burden is still within acceptable review bounds.", "The human would need to rewrite too much of this output."),
        toGate("latency_window", "Latency window", metrics.latencyMs <= 15000, false, "Latency stayed within the target review window.", "This run was too slow for routine founder use."),
        toGate("archive_candidate_score", "Archive candidate score", qualityScore >= 0.62, false, "Composite score is high enough to consider archive candidacy.", "Composite score is still too weak for a reusable archive candidate."),
    ];
    const criticalGateFailed = gates.some((gate) => gate.critical && !gate.passed);
    const policyAction = criticalGateFailed || qualityScore < 0.62 ? "archive_only" : "candidate";
    return { qualityScore, scoreComponents, gates, policyAction };
}
|
|
185
|
+
// ─── Evaluate a completed task ───────────────────────────────────
|
|
186
|
+
/**
 * Scores one completed task, stores the evaluation row and, when the
 * scorecard's policy action is "candidate", adds an archive entry.
 *
 * Steps: derive the three rates (0 when the denominator is not positive),
 * build the scorecard, look up the reference variant for the archive type to
 * compute the improvement delta (0 when there is no reference), insert the
 * row into `hyperloop_evaluations`, optionally call `addArchiveEntry`, and
 * return the evaluation as a camelCase object.
 *
 * @param input Task facts; the fields read are `episodeId`, `query`, `lens`,
 *   `entity`, `classification`, `verifiedSignals`, `totalSignals`,
 *   `groundedClaims`, `totalClaims`, `contradictionsCaught`,
 *   `userEditDistance`, `wasExported`, `wasDelegated`, `latencyMs`,
 *   `costUsd`, `toolCallCount` and `llmJudge`.
 * @returns The stored evaluation, including scorecard, reference variant id
 *   and improvement delta.
 */
export function evaluateTask(input) {
    const db = getDb();
    initEvalTables();
    const safeRatio = (numerator, denominator) => (denominator > 0 ? numerator / denominator : 0);
    const evidenceCoverage = safeRatio(input.verifiedSignals, input.totalSignals);
    const groundingRate = safeRatio(input.groundedClaims, input.totalClaims);
    const contradictionRate = safeRatio(input.contradictionsCaught, input.totalClaims);
    const scorecard = buildHyperloopScorecard({
        evidenceCoverage,
        groundingRate,
        contradictionRate,
        userEditDistance: input.userEditDistance,
        wasExported: input.wasExported,
        wasDelegated: input.wasDelegated,
        latencyMs: input.latencyMs,
    });
    const { qualityScore, scoreComponents, gates, policyAction } = scorecard;
    // Look up the reference variant to compute the improvement delta.
    const archiveType = classificationToArchiveType(input.classification);
    const reference = lookupBestVariant(archiveType, input.lens, input.entity ?? undefined);
    let improvementDelta = 0;
    if (reference) {
        improvementDelta = Math.round((qualityScore - reference.qualityScore) * 100) / 100;
    }
    const evalId = genId("eval");
    const timestamp = new Date().toISOString();
    const llmJudge = normalizeLlmJudge(input.llmJudge);
    const rubricVersion = "hyperloop_v2";
    const referenceVariantId = reference?.id ?? null;
    const insertSql = `
INSERT INTO hyperloop_evaluations (eval_id, episode_id, query, lens, entity, classification, evidence_coverage, contradiction_rate, grounding_rate, user_edit_distance, was_exported, was_delegated, latency_ms, cost_usd, tool_call_count, quality_score, rubric_version, score_components, gates, policy_action, llm_judge, reference_variant_id, improvement_delta, timestamp)
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
`;
    // Values in the same order as the column list above.
    const rowValues = [
        evalId,
        input.episodeId,
        input.query,
        input.lens,
        input.entity,
        input.classification,
        evidenceCoverage,
        contradictionRate,
        groundingRate,
        input.userEditDistance,
        input.wasExported ? 1 : 0,
        input.wasDelegated ? 1 : 0,
        input.latencyMs,
        input.costUsd,
        input.toolCallCount,
        qualityScore,
        rubricVersion,
        JSON.stringify(scoreComponents),
        JSON.stringify(gates),
        policyAction,
        llmJudge ? JSON.stringify(llmJudge) : null,
        referenceVariantId,
        improvementDelta,
        timestamp,
    ];
    db.prepare(insertSql).run(...rowValues);
    // If quality is good enough, create an archive candidate.
    if (policyAction === "candidate") {
        const archiveContent = JSON.stringify({
            classification: input.classification,
            lens: input.lens,
            entity: input.entity,
            toolCallCount: input.toolCallCount,
            latencyMs: input.latencyMs,
            rubricVersion,
            scoreComponents,
            gates,
            llmJudge,
        });
        addArchiveEntry({
            type: archiveType,
            name: `${input.classification}:${input.entity ?? input.lens}`,
            description: `Structured candidate (${Math.round(qualityScore * 100)}%) from "${input.query.slice(0, 60)}"`,
            content: archiveContent,
            sourceEpisodeId: input.episodeId,
            sourceQuery: input.query,
            sourceLens: input.lens,
            sourceEntity: input.entity,
            evidenceCoverage,
            contradictionsCaught: input.contradictionsCaught,
            userEditDistance: input.userEditDistance,
            wasExported: input.wasExported,
            wasDelegated: input.wasDelegated,
            qualityScore,
            improvementDelta,
            createdAt: timestamp,
        });
    }
    return {
        evalId,
        episodeId: input.episodeId,
        query: input.query,
        lens: input.lens,
        entity: input.entity,
        classification: input.classification,
        evidenceCoverage,
        contradictionRate,
        groundingRate,
        userEditDistance: input.userEditDistance,
        wasExported: input.wasExported,
        wasDelegated: input.wasDelegated,
        latencyMs: input.latencyMs,
        costUsd: input.costUsd,
        toolCallCount: input.toolCallCount,
        qualityScore,
        rubricVersion,
        scoreComponents,
        gates,
        policyAction,
        llmJudge,
        referenceVariantId,
        improvementDelta,
        timestamp,
    };
}
|
|
272
|
+
/**
 * Compute improvement@k for a classification type.
 *
 * Loads every stored evaluation for `classification` in ascending timestamp
 * order, splits that history into consecutive generations and reports the
 * mean quality score and mean improvement delta of each generation, both
 * rounded to two decimals.
 *
 * Every row is assigned to exactly one generation. Boundaries are balanced
 * (`floor(gen * n / generations)`), so when the row count is not a multiple
 * of `k` the newest evaluations are still included instead of being cut off
 * after `k * floor(n / k)` rows. When the count divides evenly, or there are
 * no more rows than `k`, the split is the plain equal-size one.
 *
 * @param classification Classification value to filter evaluations by.
 * @param k Maximum number of generations (default 5). Fractional values are
 *   rounded up; NaN or values that round to less than 1 yield an empty array.
 * @returns One entry per generation:
 *   `{ classification, k, avgQuality, avgImprovement, sampleSize }`, where
 *   `k` is the 1-based generation index.
 */
export function computeImprovementAtK(classification, k = 5) {
    const db = getDb();
    initEvalTables();
    const rows = db.prepare("SELECT quality_score, improvement_delta FROM hyperloop_evaluations WHERE classification = ? ORDER BY timestamp ASC").all(classification);
    const requestedGenerations = Math.ceil(k);
    // `!(x >= 1)` also rejects NaN, which a plain `x < 1` would let through.
    if (rows.length === 0 || !(requestedGenerations >= 1)) {
        return [];
    }
    // Never create more generations than rows, so no generation is empty.
    const generations = Math.min(requestedGenerations, rows.length);
    const results = [];
    for (let gen = 0; gen < generations; gen++) {
        const start = Math.floor((gen * rows.length) / generations);
        const end = Math.floor(((gen + 1) * rows.length) / generations);
        const chunk = rows.slice(start, end);
        const avgQuality = chunk.reduce((s, r) => s + r.quality_score, 0) / chunk.length;
        const avgImprovement = chunk.reduce((s, r) => s + r.improvement_delta, 0) / chunk.length;
        results.push({
            classification,
            k: gen + 1,
            avgQuality: Math.round(avgQuality * 100) / 100,
            avgImprovement: Math.round(avgImprovement * 100) / 100,
            sampleSize: chunk.length,
        });
    }
    return results;
}
|
|
295
|
+
/**
 * Returns the most recent evaluations, newest first.
 *
 * @param limit Value bound to the query's `LIMIT` (default 12).
 * @returns Camel-cased summaries. `rubricVersion` and `policyAction` fall
 *   back to "hyperloop_v2" / "archive_only" when the stored value is
 *   null or undefined, and the JSON text columns are decoded with
 *   `parseJsonArray` / `parseJsonObject`.
 */
export function listRecentEvaluations(limit = 12) {
    const db = getDb();
    initEvalTables();
    const statement = db.prepare(`SELECT eval_id, query, classification, quality_score, improvement_delta, evidence_coverage, grounding_rate, latency_ms, tool_call_count, timestamp, rubric_version, policy_action, score_components, gates, llm_judge
FROM hyperloop_evaluations
ORDER BY timestamp DESC
LIMIT ?`);
    const summaries = [];
    for (const row of statement.all(limit)) {
        summaries.push({
            evalId: row.eval_id,
            query: row.query,
            classification: row.classification,
            qualityScore: row.quality_score,
            improvementDelta: row.improvement_delta,
            evidenceCoverage: row.evidence_coverage,
            groundingRate: row.grounding_rate,
            latencyMs: row.latency_ms,
            toolCallCount: row.tool_call_count,
            timestamp: row.timestamp,
            rubricVersion: row.rubric_version == null ? "hyperloop_v2" : row.rubric_version,
            policyAction: row.policy_action == null ? "archive_only" : row.policy_action,
            scoreComponents: parseJsonArray(row.score_components),
            gates: parseJsonArray(row.gates),
            llmJudge: parseJsonObject(row.llm_judge),
        });
    }
    return summaries;
}
|
|
320
|
+
/**
 * Lists classification values ordered by how many evaluations each has
 * (most first), with ties broken by the latest timestamp.
 *
 * @param limit Value bound to the query's `LIMIT` (default 6).
 * @returns The classification strings only.
 */
export function listTrackedClassifications(limit = 6) {
    const db = getDb();
    initEvalTables();
    const statement = db.prepare(`SELECT classification, COUNT(*) as cnt
FROM hyperloop_evaluations
GROUP BY classification
ORDER BY cnt DESC, MAX(timestamp) DESC
LIMIT ?`);
    const names = [];
    for (const group of statement.all(limit)) {
        names.push(group.classification);
    }
    return names;
}
|
|
330
|
+
// ─── Helpers ─────────────────────────────────────────────────────
|
|
331
|
+
/**
 * Maps a task classification to the archive type used for variant lookup
 * and archive entries.
 *
 * Only the table's own keys are consulted, so strings that happen to name
 * inherited object members ("constructor", "toString", "__proto__", ...)
 * get the fallback instead of a value from `Object.prototype`.
 *
 * @param classification Classification string from the evaluated task.
 * @returns The mapped archive type, or "routing_policy" for any value not
 *   in the table (including null/undefined).
 */
function classificationToArchiveType(classification) {
    const map = {
        company_search: "packet_template",
        competitor: "packet_template",
        multi_entity: "packet_template",
        weekly_reset: "workflow_path",
        pre_delegation: "delegation_shape",
        important_change: "signal_recipe",
        idea_validation: "packet_template",
        general: "routing_policy",
    };
    // Own-property check: a bare `map[key]` would also resolve prototype members.
    if (Object.prototype.hasOwnProperty.call(map, classification)) {
        return map[classification];
    }
    return "routing_policy";
}
|
|
344
|
+
/**
 * Decodes a JSON text column that is expected to hold an array.
 *
 * @param value JSON text, or a falsy value.
 * @returns The decoded array; an empty array when the value is falsy, is
 *   not valid JSON, or decodes to something other than an array.
 */
function parseJsonArray(value) {
    let decoded = null;
    if (value) {
        try {
            decoded = JSON.parse(value);
        }
        catch {
            // Malformed JSON is treated the same as "no array".
            decoded = null;
        }
    }
    return Array.isArray(decoded) ? decoded : [];
}
|
|
355
|
+
/**
 * Decodes a JSON text column that is expected to hold a single object.
 *
 * Arrays are rejected explicitly: `typeof [] === "object"`, so without the
 * `Array.isArray` guard a stored JSON array would be handed back as if it
 * were the object.
 *
 * @param value JSON text, or a falsy value.
 * @returns The decoded object; `undefined` when the value is falsy, is not
 *   valid JSON, or decodes to null, a primitive or an array.
 */
function parseJsonObject(value) {
    if (!value)
        return undefined;
    try {
        const parsed = JSON.parse(value);
        const isPlainContainer = parsed !== null && typeof parsed === "object" && !Array.isArray(parsed);
        return isPlainContainer ? parsed : undefined;
    }
    catch {
        // Malformed JSON is treated the same as "no object".
        return undefined;
    }
}
|
|
366
|
+
/**
 * Normalizes an optional LLM-judge payload into the stored shape.
 *
 * - `verdict` and `score` are trimmed; a finite number is converted to its
 *   string form instead of throwing on `.trim()`, and any other non-string
 *   value is treated as absent.
 * - `failingCriteria` / `fixSuggestions` keep only truthy entries and default
 *   to empty arrays when the input is not an array.
 * - A blank verdict (missing or whitespace-only) is reported as
 *   "UNSPECIFIED", so the result never carries an empty verdict.
 *
 * @param input Judge payload; may be null/undefined.
 * @returns `{ verdict, score, failingCriteria, fixSuggestions,
 *   reasoningSummary }`, or `undefined` when the payload is missing or has no
 *   verdict, no score and no list entries.
 */
function normalizeLlmJudge(input) {
    if (!input)
        return undefined;
    const asTrimmedText = (raw) => {
        if (typeof raw === "string")
            return raw.trim();
        if (typeof raw === "number" && Number.isFinite(raw))
            return String(raw);
        return undefined;
    };
    const verdict = asTrimmedText(input.verdict);
    const score = asTrimmedText(input.score);
    const failingCriteria = Array.isArray(input.failingCriteria) ? input.failingCriteria.filter(Boolean) : [];
    const fixSuggestions = Array.isArray(input.fixSuggestions) ? input.fixSuggestions.filter(Boolean) : [];
    if (!verdict && !score && failingCriteria.length === 0 && fixSuggestions.length === 0) {
        return undefined;
    }
    const reasoningParts = [
        verdict ? `Verdict: ${verdict}.` : null,
        failingCriteria.length > 0 ? `Failures: ${failingCriteria.join("; ")}.` : "Failures: none called out.",
        fixSuggestions.length > 0 ? `Fixes: ${fixSuggestions.join("; ")}.` : "Fixes: none suggested.",
    ].filter(Boolean);
    return {
        // `||` rather than `??`: a trimmed-to-empty verdict must fall back too.
        verdict: verdict || "UNSPECIFIED",
        score,
        failingCriteria,
        fixSuggestions,
        reasoningSummary: reasoningParts.join(" "),
    };
}
|
|
389
|
+
//# sourceMappingURL=hyperloopEval.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"hyperloopEval.js","sourceRoot":"","sources":["../../src/sync/hyperloopEval.ts"],"names":[],"mappings":"AAAA;;;;;;GAMG;AAEH,OAAO,EAAE,KAAK,EAAE,KAAK,EAAE,MAAM,UAAU,CAAC;AACxC,OAAO,EAAE,iBAAiB,EAAE,eAAe,EAAyB,MAAM,uBAAuB,CAAC;AAkElG,oEAAoE;AAEpE,MAAM,UAAU,cAAc;IAC5B,MAAM,EAAE,GAAG,KAAK,EAAE,CAAC;IACnB,EAAE,CAAC,IAAI,CAAC;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;GA+BP,CAAC,CAAC;IAEH,gBAAgB,CAAC,gBAAgB,EAAE,sCAAsC,CAAC,CAAC;IAC3E,gBAAgB,CAAC,kBAAkB,EAAE,4BAA4B,CAAC,CAAC;IACnE,gBAAgB,CAAC,OAAO,EAAE,4BAA4B,CAAC,CAAC;IACxD,gBAAgB,CAAC,eAAe,EAAE,sCAAsC,CAAC,CAAC;IAC1E,gBAAgB,CAAC,WAAW,EAAE,MAAM,CAAC,CAAC;AACxC,CAAC;AAED,SAAS,gBAAgB,CAAC,UAAkB,EAAE,UAAkB;IAC9D,MAAM,EAAE,GAAG,KAAK,EAAE,CAAC;IACnB,MAAM,OAAO,GAAG,EAAE,CAAC,OAAO,CAAC,0CAA0C,CAAC,CAAC,GAAG,EAA6B,CAAC;IACxG,IAAI,CAAC,OAAO,CAAC,IAAI,CAAC,CAAC,MAAM,EAAE,EAAE,CAAC,MAAM,CAAC,IAAI,KAAK,UAAU,CAAC,EAAE,CAAC;QAC1D,EAAE,CAAC,IAAI,CAAC,gDAAgD,UAAU,IAAI,UAAU,EAAE,CAAC,CAAC;IACtF,CAAC;AACH,CAAC;AAED,oEAAoE;AAEpE,SAAS,OAAO,CAAC,KAAa;IAC5B,IAAI,CAAC,MAAM,CAAC,QAAQ,CAAC,KAAK,CAAC;QAAE,OAAO,CAAC,CAAC;IACtC,OAAO,IAAI,CAAC,GAAG,CAAC,CAAC,EAAE,IAAI,CAAC,GAAG,CAAC,CAAC,EAAE,KAAK,CAAC,CAAC,CAAC;AACzC,CAAC;AAED,SAAS,UAAU,CAAC,KAAa;IAC/B,OAAO,IAAI,CAAC,KAAK,CAAC,OAAO,CAAC,KAAK,CAAC,GAAG,GAAG,CAAC,GAAG,GAAG,CAAC;AAChD,CAAC;AAED,MAAM,UAAU,uBAAuB,CAAC,OAQvC;IAMC,MAAM,eAAe,GAA8B;QACjD;YACE,GAAG,EAAE,mBAAmB;YACxB,KAAK,EAAE,mBAAmB;YAC1B,MAAM,EAAE,GAAG;YACX,QAAQ,EAAE,UAAU,CAAC,OAAO,CAAC,gBAAgB,CAAC;YAC9C,eAAe,EAAE,UAAU,CAAC,OAAO,CAAC,gBAAgB,CAAC;YACrD,oBAAoB,EAAE,UAAU,CAAC,OAAO,CAAC,gBAAgB,GAAG,GAAG,CAAC;YAChE,MAAM,EAAE,sEAAsE;SAC/E;QACD;YACE,GAAG,EAAE,iBAAiB;YACtB,KAAK,EAAE,iBAAiB;YACxB,MAAM,EAAE,IAAI;YACZ,QAAQ,EAAE,UAAU,CAAC,OAAO,CAAC,aAAa,CAAC;YAC3C,eAAe,EAAE,UAAU,CAAC,OAAO,CAAC,aAAa,CAAC;YAClD,oBAAoB,EAAE,UAAU,CAAC,OAAO,CAAC,aAAa,GAAG,IAAI,CAAC;YAC9D,MAAM,EAAE,sEAAsE;SAC/E;QACD;YACE,GAAG,EAAE,uBAAuB;YAC5B,KAAK,EAAE,uBAAuB;YAC9B,MAAM,EAAE,IAAI;YACZ,QAAQ,EAAE,UAAU,CAAC,OAAO,CAAC,iBAAiB,CAAC;YAC/C,eAAe,EAAE,UAAU,CAAC,IAAI,CAAC,GAAG,
CAAC,CAAC,EAAE,OAAO,CAAC,iBAAiB,GAAG,CAAC,CAAC,CAAC;YACvE,oBAAoB,EAAE,UAAU,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC,EAAE,OAAO,CAAC,iBAAiB,GAAG,CAAC,CAAC,GAAG,IAAI,CAAC;YACnF,MAAM,EAAE,iFAAiF;SAC1F;QACD;YACE,GAAG,EAAE,mBAAmB;YACxB,KAAK,EAAE,mBAAmB;YAC1B,MAAM,EAAE,IAAI;YACZ,QAAQ,EAAE,UAAU,CAAC,CAAC,GAAG,OAAO,CAAC,gBAAgB,CAAC;YAClD,eAAe,EAAE,UAAU,CAAC,CAAC,GAAG,OAAO,CAAC,gBAAgB,CAAC;YACzD,oBAAoB,EAAE,UAAU,CAAC,CAAC,CAAC,GAAG,OAAO,CAAC,gBAAgB,CAAC,GAAG,IAAI,CAAC;YACvE,MAAM,EAAE,yEAAyE;SAClF;QACD;YACE,GAAG,EAAE,mBAAmB;YACxB,KAAK,EAAE,mBAAmB;YAC1B,MAAM,EAAE,GAAG;YACX,QAAQ,EAAE,UAAU,CAAC,CAAC,OAAO,CAAC,WAAW,CAAC,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,CAAC,GAAG,CAAC,OAAO,CAAC,YAAY,CAAC,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC;YACxF,eAAe,EAAE,UAAU,CAAC,CAAC,OAAO,CAAC,WAAW,CAAC,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,CAAC,GAAG,CAAC,OAAO,CAAC,YAAY,CAAC,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC;YAC/F,oBAAoB,EAAE,UAAU,CAAC,CAAC,CAAC,OAAO,CAAC,WAAW,CAAC,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,CAAC,GAAG,CAAC,OAAO,CAAC,YAAY,CAAC,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,GAAG,GAAG,CAAC;YAC5G,MAAM,EAAE,4EAA4E;SACrF;QACD;YACE,GAAG,EAAE,gBAAgB;YACrB,KAAK,EAAE,gBAAgB;YACvB,MAAM,EAAE,IAAI;YACZ,QAAQ,EAAE,OAAO,CAAC,SAAS;YAC3B,eAAe,EAAE,OAAO,CAAC,SAAS,IAAI,IAAI,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,OAAO,CAAC,SAAS,IAAI,KAAK,CAAC,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,OAAO,CAAC,SAAS,IAAI,KAAK,CAAC,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,GAAG;YAC3H,oBAAoB,EAAE,UAAU,CAAC,CAAC,OAAO,CAAC,SAAS,IAAI,IAAI,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,OAAO,CAAC,SAAS,IAAI,KAAK,CAAC,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,OAAO,CAAC,SAAS,IAAI,KAAK,CAAC,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,GAAG,CAAC,GAAG,IAAI,CAAC;YACrJ,MAAM,EAAE,+EAA+E;SACxF;KACF,CAAC,GAAG,CAAC,CAAC,SAAS,EAAE,EAAE,CAAC,CAAC;QACpB,GAAG,SAAS;QACZ,eAAe,EAAE,UAAU,CAAC,SAAS,CAAC,eAAe,CAAC;QACtD,oBAAoB,EAAE,UAAU,CAAC,SAAS,CAAC,oBAAoB,CAAC;KACjE,CAAC,CAAC,CAAC;IAEJ,MAAM,YAAY,GAAG,UAAU,CAAC,eAAe,CAAC,MAAM,CAAC,CAAC,GAAG,EAAE,SAAS,EAAE,EAAE,CAAC,GAAG,GAAG,SAAS,CAAC,oBAAoB,EAAE,CAAC,CAAC,CAAC,CAAC;IAErH,MAAM,K
AAK,GAA0B;QACnC;YACE,GAAG,EAAE,kBAAkB;YACvB,KAAK,EAAE,kBAAkB;YACzB,MAAM,EAAE,OAAO,CAAC,gBAAgB,IAAI,IAAI;YACxC,QAAQ,EAAE,IAAI;YACd,MAAM,EAAE,OAAO,CAAC,gBAAgB,IAAI,IAAI;gBACtC,CAAC,CAAC,kDAAkD;gBACpD,CAAC,CAAC,uDAAuD;SAC5D;QACD;YACE,GAAG,EAAE,mBAAmB;YACxB,KAAK,EAAE,mBAAmB;YAC1B,MAAM,EAAE,OAAO,CAAC,aAAa,IAAI,IAAI;YACrC,QAAQ,EAAE,IAAI;YACd,MAAM,EAAE,OAAO,CAAC,aAAa,IAAI,IAAI;gBACnC,CAAC,CAAC,+CAA+C;gBACjD,CAAC,CAAC,kEAAkE;SACvE;QACD;YACE,GAAG,EAAE,iBAAiB;YACtB,KAAK,EAAE,iBAAiB;YACxB,MAAM,EAAE,OAAO,CAAC,gBAAgB,IAAI,GAAG;YACvC,QAAQ,EAAE,KAAK;YACf,MAAM,EAAE,OAAO,CAAC,gBAAgB,IAAI,GAAG;gBACrC,CAAC,CAAC,6DAA6D;gBAC/D,CAAC,CAAC,0DAA0D;SAC/D;QACD;YACE,GAAG,EAAE,gBAAgB;YACrB,KAAK,EAAE,gBAAgB;YACvB,MAAM,EAAE,OAAO,CAAC,SAAS,IAAI,KAAK;YAClC,QAAQ,EAAE,KAAK;YACf,MAAM,EAAE,OAAO,CAAC,SAAS,IAAI,KAAK;gBAChC,CAAC,CAAC,iDAAiD;gBACnD,CAAC,CAAC,gDAAgD;SACrD;QACD;YACE,GAAG,EAAE,yBAAyB;YAC9B,KAAK,EAAE,yBAAyB;YAChC,MAAM,EAAE,YAAY,IAAI,IAAI;YAC5B,QAAQ,EAAE,KAAK;YACf,MAAM,EAAE,YAAY,IAAI,IAAI;gBAC1B,CAAC,CAAC,+DAA+D;gBACjE,CAAC,CAAC,qEAAqE;SAC1E;KACF,CAAC;IAEF,MAAM,kBAAkB,GAAG,KAAK,CAAC,IAAI,CAAC,CAAC,IAAI,EAAE,EAAE,CAAC,IAAI,CAAC,QAAQ,IAAI,CAAC,IAAI,CAAC,MAAM,CAAC,CAAC;IAC/E,MAAM,YAAY,GAChB,CAAC,kBAAkB,IAAI,YAAY,IAAI,IAAI,CAAC,CAAC,CAAC,WAAW,CAAC,CAAC,CAAC,cAAc,CAAC;IAE7E,OAAO;QACL,YAAY;QACZ,eAAe;QACf,KAAK;QACL,YAAY;KACb,CAAC;AACJ,CAAC;AAED,oEAAoE;AAEpE,MAAM,UAAU,YAAY,CAAC,KAuB5B;IACC,MAAM,EAAE,GAAG,KAAK,EAAE,CAAC;IACnB,cAAc,EAAE,CAAC;IAEjB,MAAM,gBAAgB,GAAG,KAAK,CAAC,YAAY,GAAG,CAAC,CAAC,CAAC,CAAC,KAAK,CAAC,eAAe,GAAG,KAAK,CAAC,YAAY,CAAC,CAAC,CAAC,CAAC,CAAC;IACjG,MAAM,aAAa,GAAG,KAAK,CAAC,WAAW,GAAG,CAAC,CAAC,CAAC,CAAC,KAAK,CAAC,cAAc,GAAG,KAAK,CAAC,WAAW,CAAC,CAAC,CAAC,CAAC,CAAC;IAC3F,MAAM,iBAAiB,GAAG,KAAK,CAAC,WAAW,GAAG,CAAC,CAAC,CAAC,CAAC,KAAK,CAAC,oBAAoB,GAAG,KAAK,CAAC,WAAW,CAAC,CAAC,CAAC,CAAC,CAAC;IAErG,MAAM,EACJ,YAAY,EACZ,eAAe,EACf,KAAK,EACL,YAAY,GACb,GAAG,uBAAuB,CAAC;QAC1B,gBAAgB;QAChB,aAAa;QACb,iBAAiB;QACjB,gBAAgB,EAAE,KAAK,CAAC,gBAAgB;QACxC,WAAW,EAAE,KAAK,CAAC,WAAW;QAC9B,YAAY,EAAE,KAAK,CAAC,YAA
Y;QAChC,SAAS,EAAE,KAAK,CAAC,SAAS;KAC3B,CAAC,CAAC;IAEH,yDAAyD;IACzD,MAAM,WAAW,GAAG,2BAA2B,CAAC,KAAK,CAAC,cAAc,CAAC,CAAC;IACtE,MAAM,SAAS,GAAG,iBAAiB,CAAC,WAAW,EAAE,KAAK,CAAC,IAAI,EAAE,KAAK,CAAC,MAAM,IAAI,SAAS,CAAC,CAAC;IACxF,MAAM,gBAAgB,GAAG,SAAS;QAChC,CAAC,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC,YAAY,GAAG,SAAS,CAAC,YAAY,CAAC,GAAG,GAAG,CAAC,GAAG,GAAG;QACjE,CAAC,CAAC,CAAC,CAAC;IAEN,MAAM,MAAM,GAAG,KAAK,CAAC,MAAM,CAAC,CAAC;IAC7B,MAAM,SAAS,GAAG,IAAI,IAAI,EAAE,CAAC,WAAW,EAAE,CAAC;IAC3C,MAAM,QAAQ,GAAG,iBAAiB,CAAC,KAAK,CAAC,QAAQ,CAAC,CAAC;IAEnD,EAAE,CAAC,OAAO,CAAC;;;GAGV,CAAC,CAAC,GAAG,CACJ,MAAM,EAAE,KAAK,CAAC,SAAS,EAAE,KAAK,CAAC,KAAK,EAAE,KAAK,CAAC,IAAI,EAAE,KAAK,CAAC,MAAM,EAC9D,KAAK,CAAC,cAAc,EAAE,gBAAgB,EAAE,iBAAiB,EAAE,aAAa,EACxE,KAAK,CAAC,gBAAgB,EAAE,KAAK,CAAC,WAAW,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,EAAE,KAAK,CAAC,YAAY,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,EAC7E,KAAK,CAAC,SAAS,EAAE,KAAK,CAAC,OAAO,EAAE,KAAK,CAAC,aAAa,EACnD,YAAY,EAAE,cAAc,EAAE,IAAI,CAAC,SAAS,CAAC,eAAe,CAAC,EAAE,IAAI,CAAC,SAAS,CAAC,KAAK,CAAC,EAAE,YAAY,EAClG,QAAQ,CAAC,CAAC,CAAC,IAAI,CAAC,SAAS,CAAC,QAAQ,CAAC,CAAC,CAAC,CAAC,IAAI,EAAE,SAAS,EAAE,EAAE,IAAI,IAAI,EAAE,gBAAgB,EAAE,SAAS,CAC/F,CAAC;IAEF,sDAAsD;IACtD,IAAI,YAAY,KAAK,WAAW,EAAE,CAAC;QACjC,eAAe,CAAC;YACd,IAAI,EAAE,WAAW;YACjB,IAAI,EAAE,GAAG,KAAK,CAAC,cAAc,IAAI,KAAK,CAAC,MAAM,IAAI,KAAK,CAAC,IAAI,EAAE;YAC7D,WAAW,EAAE,yBAAyB,IAAI,CAAC,KAAK,CAAC,YAAY,GAAG,GAAG,CAAC,YAAY,KAAK,CAAC,KAAK,CAAC,KAAK,CAAC,CAAC,EAAE,EAAE,CAAC,GAAG;YAC3G,OAAO,EAAE,IAAI,CAAC,SAAS,CAAC;gBACtB,cAAc,EAAE,KAAK,CAAC,cAAc;gBACpC,IAAI,EAAE,KAAK,CAAC,IAAI;gBAChB,MAAM,EAAE,KAAK,CAAC,MAAM;gBACpB,aAAa,EAAE,KAAK,CAAC,aAAa;gBAClC,SAAS,EAAE,KAAK,CAAC,SAAS;gBAC1B,aAAa,EAAE,cAAc;gBAC7B,eAAe;gBACf,KAAK;gBACL,QAAQ;aACT,CAAC;YACF,eAAe,EAAE,KAAK,CAAC,SAAS;YAChC,WAAW,EAAE,KAAK,CAAC,KAAK;YACxB,UAAU,EAAE,KAAK,CAAC,IAAI;YACtB,YAAY,EAAE,KAAK,CAAC,MAAM;YAC1B,gBAAgB;YAChB,oBAAoB,EAAE,KAAK,CAAC,oBAAoB;YAChD,gBAAgB,EAAE,KAAK,CAAC,gBAAgB;YACxC,WAAW,EAAE,KAAK,CAAC,WAAW;YAC9B,YAAY,EAAE,KAAK,CAAC,YAAY;YAChC,YAAY;YACZ,gBA
AgB;YAChB,SAAS,EAAE,SAAS;SACrB,CAAC,CAAC;IACL,CAAC;IAED,OAAO;QACL,MAAM;QACN,SAAS,EAAE,KAAK,CAAC,SAAS;QAC1B,KAAK,EAAE,KAAK,CAAC,KAAK;QAClB,IAAI,EAAE,KAAK,CAAC,IAAI;QAChB,MAAM,EAAE,KAAK,CAAC,MAAM;QACpB,cAAc,EAAE,KAAK,CAAC,cAAc;QACpC,gBAAgB;QAChB,iBAAiB;QACjB,aAAa;QACb,gBAAgB,EAAE,KAAK,CAAC,gBAAgB;QACxC,WAAW,EAAE,KAAK,CAAC,WAAW;QAC9B,YAAY,EAAE,KAAK,CAAC,YAAY;QAChC,SAAS,EAAE,KAAK,CAAC,SAAS;QAC1B,OAAO,EAAE,KAAK,CAAC,OAAO;QACtB,aAAa,EAAE,KAAK,CAAC,aAAa;QAClC,YAAY;QACZ,aAAa,EAAE,cAAc;QAC7B,eAAe;QACf,KAAK;QACL,YAAY;QACZ,QAAQ;QACR,kBAAkB,EAAE,SAAS,EAAE,EAAE,IAAI,IAAI;QACzC,gBAAgB;QAChB,SAAS;KACV,CAAC;AACJ,CAAC;AA8BD,uDAAuD;AACvD,MAAM,UAAU,qBAAqB,CAAC,cAAsB,EAAE,CAAC,GAAG,CAAC;IACjE,MAAM,EAAE,GAAG,KAAK,EAAE,CAAC;IACnB,cAAc,EAAE,CAAC;IAEjB,MAAM,IAAI,GAAG,EAAE,CAAC,OAAO,CACrB,oHAAoH,CACrH,CAAC,GAAG,CAAC,cAAc,CAAU,CAAC;IAE/B,IAAI,IAAI,CAAC,MAAM,KAAK,CAAC;QAAE,OAAO,EAAE,CAAC;IAEjC,MAAM,OAAO,GAAqB,EAAE,CAAC;IACrC,MAAM,SAAS,GAAG,IAAI,CAAC,GAAG,CAAC,CAAC,EAAE,IAAI,CAAC,KAAK,CAAC,IAAI,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC,CAAC;IAE3D,KAAK,IAAI,GAAG,GAAG,CAAC,EAAE,GAAG,GAAG,CAAC,IAAI,GAAG,GAAG,SAAS,GAAG,IAAI,CAAC,MAAM,EAAE,GAAG,EAAE,EAAE,CAAC;QAClE,MAAM,KAAK,GAAG,IAAI,CAAC,KAAK,CAAC,GAAG,GAAG,SAAS,EAAE,CAAC,GAAG,GAAG,CAAC,CAAC,GAAG,SAAS,CAAC,CAAC;QACjE,MAAM,UAAU,GAAG,KAAK,CAAC,MAAM,CAAC,CAAC,CAAS,EAAE,CAAM,EAAE,EAAE,CAAC,CAAC,GAAG,CAAC,CAAC,aAAa,EAAE,CAAC,CAAC,GAAG,KAAK,CAAC,MAAM,CAAC;QAC9F,MAAM,cAAc,GAAG,KAAK,CAAC,MAAM,CAAC,CAAC,CAAS,EAAE,CAAM,EAAE,EAAE,CAAC,CAAC,GAAG,CAAC,CAAC,iBAAiB,EAAE,CAAC,CAAC,GAAG,KAAK,CAAC,MAAM,CAAC;QAEtG,OAAO,CAAC,IAAI,CAAC;YACX,cAAc;YACd,CAAC,EAAE,GAAG,GAAG,CAAC;YACV,UAAU,EAAE,IAAI,CAAC,KAAK,CAAC,UAAU,GAAG,GAAG,CAAC,GAAG,GAAG;YAC9C,cAAc,EAAE,IAAI,CAAC,KAAK,CAAC,cAAc,GAAG,GAAG,CAAC,GAAG,GAAG;YACtD,UAAU,EAAE,KAAK,CAAC,MAAM;SACzB,CAAC,CAAC;IACL,CAAC;IAED,OAAO,OAAO,CAAC;AACjB,CAAC;AAED,MAAM,UAAU,qBAAqB,CAAC,KAAK,GAAG,EAAE;IAC9C,MAAM,EAAE,GAAG,KAAK,EAAE,CAAC;IACnB,cAAc,EAAE,CAAC;IAEjB,MAAM,IAAI,GAAG,EAAE,CAAC,OAAO,CACrB;;;aAGS,CACV,CAAC,GAAG,CAAC,KAAK,CAAU,CAAC;IAEtB,OAA
O,IAAI,CAAC,GAAG,CAAC,CAAC,GAAG,EAAE,EAAE,CAAC,CAAC;QACxB,MAAM,EAAE,GAAG,CAAC,OAAO;QACnB,KAAK,EAAE,GAAG,CAAC,KAAK;QAChB,cAAc,EAAE,GAAG,CAAC,cAAc;QAClC,YAAY,EAAE,GAAG,CAAC,aAAa;QAC/B,gBAAgB,EAAE,GAAG,CAAC,iBAAiB;QACvC,gBAAgB,EAAE,GAAG,CAAC,iBAAiB;QACvC,aAAa,EAAE,GAAG,CAAC,cAAc;QACjC,SAAS,EAAE,GAAG,CAAC,UAAU;QACzB,aAAa,EAAE,GAAG,CAAC,eAAe;QAClC,SAAS,EAAE,GAAG,CAAC,SAAS;QACxB,aAAa,EAAE,GAAG,CAAC,cAAc,IAAI,cAAc;QACnD,YAAY,EAAE,GAAG,CAAC,aAAa,IAAI,cAAc;QACjD,eAAe,EAAE,cAAc,CAA0B,GAAG,CAAC,gBAAgB,CAAC;QAC9E,KAAK,EAAE,cAAc,CAAsB,GAAG,CAAC,KAAK,CAAC;QACrD,QAAQ,EAAE,eAAe,CAA2B,GAAG,CAAC,SAAS,CAAC;KACnE,CAAC,CAAC,CAAC;AACN,CAAC;AAED,MAAM,UAAU,0BAA0B,CAAC,KAAK,GAAG,CAAC;IAClD,MAAM,EAAE,GAAG,KAAK,EAAE,CAAC;IACnB,cAAc,EAAE,CAAC;IAEjB,MAAM,IAAI,GAAG,EAAE,CAAC,OAAO,CACrB;;;;aAIS,CACV,CAAC,GAAG,CAAC,KAAK,CAAsC,CAAC;IAElD,OAAO,IAAI,CAAC,GAAG,CAAC,CAAC,GAAG,EAAE,EAAE,CAAC,GAAG,CAAC,cAAc,CAAC,CAAC;AAC/C,CAAC;AAED,oEAAoE;AAEpE,SAAS,2BAA2B,CAAC,cAAsB;IACzD,MAAM,GAAG,GAAqC;QAC5C,cAAc,EAAE,iBAAiB;QACjC,UAAU,EAAE,iBAAiB;QAC7B,YAAY,EAAE,iBAAiB;QAC/B,YAAY,EAAE,eAAe;QAC7B,cAAc,EAAE,kBAAkB;QAClC,gBAAgB,EAAE,eAAe;QACjC,eAAe,EAAE,iBAAiB;QAClC,OAAO,EAAE,gBAAgB;KAC1B,CAAC;IACF,OAAO,GAAG,CAAC,cAAc,CAAC,IAAI,gBAAgB,CAAC;AACjD,CAAC;AAED,SAAS,cAAc,CAAI,KAAgC;IACzD,IAAI,CAAC,KAAK;QAAE,OAAO,EAAE,CAAC;IACtB,IAAI,CAAC;QACH,MAAM,MAAM,GAAG,IAAI,CAAC,KAAK,CAAC,KAAK,CAAC,CAAC;QACjC,OAAO,KAAK,CAAC,OAAO,CAAC,MAAM,CAAC,CAAC,CAAC,CAAC,MAAa,CAAC,CAAC,CAAC,EAAE,CAAC;IACpD,CAAC;IAAC,MAAM,CAAC;QACP,OAAO,EAAE,CAAC;IACZ,CAAC;AACH,CAAC;AAED,SAAS,eAAe,CAAI,KAAgC;IAC1D,IAAI,CAAC,KAAK;QAAE,OAAO,SAAS,CAAC;IAC7B,IAAI,CAAC;QACH,MAAM,MAAM,GAAG,IAAI,CAAC,KAAK,CAAC,KAAK,CAAC,CAAC;QACjC,OAAO,MAAM,IAAI,OAAO,MAAM,KAAK,QAAQ,CAAC,CAAC,CAAC,MAAW,CAAC,CAAC,CAAC,SAAS,CAAC;IACxE,CAAC;IAAC,MAAM,CAAC;QACP,OAAO,SAAS,CAAC;IACnB,CAAC;AACH,CAAC;AAED,SAAS,iBAAiB,CAAC,KAKnB;IACN,IAAI,CAAC,KAAK;QAAE,OAAO,SAAS,CAAC;IAC7B,MAAM,OAAO,GAAG,KAAK,CAAC,OAAO,EAAE,IAAI,EAAE,CAAC;IACtC,MAAM,KAAK,GAAG,KAAK,CAAC,KAAK,EAAE,IAAI,EAAE,CAAC;IAClC,MAAM,eAAe,GAAG,K
AAK,CAAC,OAAO,CAAC,KAAK,CAAC,eAAe,CAAC,CAAC,CAAC,CAAC,KAAK,CAAC,eAAe,CAAC,MAAM,CAAC,OAAO,CAAC,CAAC,CAAC,CAAC,EAAE,CAAC;IAC1G,MAAM,cAAc,GAAG,KAAK,CAAC,OAAO,CAAC,KAAK,CAAC,cAAc,CAAC,CAAC,CAAC,CAAC,KAAK,CAAC,cAAc,CAAC,MAAM,CAAC,OAAO,CAAC,CAAC,CAAC,CAAC,EAAE,CAAC;IACvG,MAAM,cAAc,GAAG;QACrB,OAAO,CAAC,CAAC,CAAC,YAAY,OAAO,GAAG,CAAC,CAAC,CAAC,IAAI;QACvC,eAAe,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC,CAAC,aAAa,eAAe,CAAC,IAAI,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC,CAAC,4BAA4B;QACtG,cAAc,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC,CAAC,UAAU,cAAc,CAAC,IAAI,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC,CAAC,wBAAwB;KAC9F,CAAC,MAAM,CAAC,OAAO,CAAC,CAAC;IAClB,IAAI,CAAC,OAAO,IAAI,CAAC,KAAK,IAAI,eAAe,CAAC,MAAM,KAAK,CAAC,IAAI,cAAc,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;QACtF,OAAO,SAAS,CAAC;IACnB,CAAC;IACD,OAAO;QACL,OAAO,EAAE,OAAO,IAAI,aAAa;QACjC,KAAK;QACL,eAAe;QACf,cAAc;QACd,gBAAgB,EAAE,cAAc,CAAC,IAAI,CAAC,GAAG,CAAC;KAC3C,CAAC;AACJ,CAAC"}
|
|
@@ -0,0 +1,60 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* @vitest-environment node
|
|
3
|
+
*/
|
|
4
|
+
import { mkdtempSync, rmSync } from "node:fs";
|
|
5
|
+
import { tmpdir } from "node:os";
|
|
6
|
+
import { join } from "node:path";
|
|
7
|
+
import { afterEach, beforeEach, describe, expect, it, vi } from "vitest";
|
|
8
|
+
describe("hyperloopEval", () => {
|
|
9
|
+
let tempDir = "";
|
|
10
|
+
beforeEach(() => {
|
|
11
|
+
tempDir = mkdtempSync(join(tmpdir(), "nodebench-hyperloop-eval-"));
|
|
12
|
+
process.env.NODEBENCH_DATA_DIR = tempDir;
|
|
13
|
+
vi.resetModules();
|
|
14
|
+
});
|
|
15
|
+
afterEach(() => {
|
|
16
|
+
if (tempDir) {
|
|
17
|
+
try {
|
|
18
|
+
rmSync(tempDir, { recursive: true, force: true });
|
|
19
|
+
}
|
|
20
|
+
catch {
|
|
21
|
+
// SQLite can keep handles briefly on Windows.
|
|
22
|
+
}
|
|
23
|
+
}
|
|
24
|
+
});
|
|
25
|
+
it("returns a structured scorecard with deterministic gates and llm judge context", async () => {
|
|
26
|
+
const { evaluateTask } = await import("./hyperloopEval.js");
|
|
27
|
+
const evaluation = evaluateTask({
|
|
28
|
+
episodeId: "episode_1",
|
|
29
|
+
query: "Anthropic",
|
|
30
|
+
lens: "founder",
|
|
31
|
+
entity: "Anthropic",
|
|
32
|
+
classification: "company_search",
|
|
33
|
+
totalSignals: 4,
|
|
34
|
+
verifiedSignals: 1,
|
|
35
|
+
totalClaims: 5,
|
|
36
|
+
groundedClaims: 2,
|
|
37
|
+
contradictionsCaught: 1,
|
|
38
|
+
userEditDistance: 0.25,
|
|
39
|
+
wasExported: false,
|
|
40
|
+
wasDelegated: false,
|
|
41
|
+
latencyMs: 6100,
|
|
42
|
+
costUsd: 0.07,
|
|
43
|
+
toolCallCount: 8,
|
|
44
|
+
llmJudge: {
|
|
45
|
+
verdict: "PASS",
|
|
46
|
+
score: "6/7",
|
|
47
|
+
failingCriteria: ["Removed repeated cognition"],
|
|
48
|
+
fixSuggestions: ["Tighten evidence grounding"],
|
|
49
|
+
},
|
|
50
|
+
});
|
|
51
|
+
expect(evaluation.rubricVersion).toBe("hyperloop_v2");
|
|
52
|
+
expect(evaluation.scoreComponents.length).toBeGreaterThanOrEqual(6);
|
|
53
|
+
expect(evaluation.gates.length).toBeGreaterThanOrEqual(5);
|
|
54
|
+
expect(evaluation.gates.some((gate) => gate.key === "minimum_evidence")).toBe(true);
|
|
55
|
+
expect(evaluation.policyAction).toBe("archive_only");
|
|
56
|
+
expect(evaluation.llmJudge?.verdict).toBe("PASS");
|
|
57
|
+
expect(evaluation.llmJudge?.reasoningSummary).toContain("Removed repeated cognition");
|
|
58
|
+
});
|
|
59
|
+
});
|
|
60
|
+
//# sourceMappingURL=hyperloopEval.test.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"hyperloopEval.test.js","sourceRoot":"","sources":["../../src/sync/hyperloopEval.test.ts"],"names":[],"mappings":"AAAA;;GAEG;AAEH,OAAO,EAAE,WAAW,EAAE,MAAM,EAAE,MAAM,SAAS,CAAC;AAC9C,OAAO,EAAE,MAAM,EAAE,MAAM,SAAS,CAAC;AACjC,OAAO,EAAE,IAAI,EAAE,MAAM,WAAW,CAAC;AACjC,OAAO,EAAE,SAAS,EAAE,UAAU,EAAE,QAAQ,EAAE,MAAM,EAAE,EAAE,EAAE,EAAE,EAAE,MAAM,QAAQ,CAAC;AAEzE,QAAQ,CAAC,eAAe,EAAE,GAAG,EAAE;IAC7B,IAAI,OAAO,GAAG,EAAE,CAAC;IAEjB,UAAU,CAAC,GAAG,EAAE;QACd,OAAO,GAAG,WAAW,CAAC,IAAI,CAAC,MAAM,EAAE,EAAE,2BAA2B,CAAC,CAAC,CAAC;QACnE,OAAO,CAAC,GAAG,CAAC,kBAAkB,GAAG,OAAO,CAAC;QACzC,EAAE,CAAC,YAAY,EAAE,CAAC;IACpB,CAAC,CAAC,CAAC;IAEH,SAAS,CAAC,GAAG,EAAE;QACb,IAAI,OAAO,EAAE,CAAC;YACZ,IAAI,CAAC;gBACH,MAAM,CAAC,OAAO,EAAE,EAAE,SAAS,EAAE,IAAI,EAAE,KAAK,EAAE,IAAI,EAAE,CAAC,CAAC;YACpD,CAAC;YAAC,MAAM,CAAC;gBACP,8CAA8C;YAChD,CAAC;QACH,CAAC;IACH,CAAC,CAAC,CAAC;IAEH,EAAE,CAAC,+EAA+E,EAAE,KAAK,IAAI,EAAE;QAC7F,MAAM,EAAE,YAAY,EAAE,GAAG,MAAM,MAAM,CAAC,oBAAoB,CAAC,CAAC;QAE5D,MAAM,UAAU,GAAG,YAAY,CAAC;YAC9B,SAAS,EAAE,WAAW;YACtB,KAAK,EAAE,WAAW;YAClB,IAAI,EAAE,SAAS;YACf,MAAM,EAAE,WAAW;YACnB,cAAc,EAAE,gBAAgB;YAChC,YAAY,EAAE,CAAC;YACf,eAAe,EAAE,CAAC;YAClB,WAAW,EAAE,CAAC;YACd,cAAc,EAAE,CAAC;YACjB,oBAAoB,EAAE,CAAC;YACvB,gBAAgB,EAAE,IAAI;YACtB,WAAW,EAAE,KAAK;YAClB,YAAY,EAAE,KAAK;YACnB,SAAS,EAAE,IAAI;YACf,OAAO,EAAE,IAAI;YACb,aAAa,EAAE,CAAC;YAChB,QAAQ,EAAE;gBACR,OAAO,EAAE,MAAM;gBACf,KAAK,EAAE,KAAK;gBACZ,eAAe,EAAE,CAAC,4BAA4B,CAAC;gBAC/C,cAAc,EAAE,CAAC,4BAA4B,CAAC;aAC/C;SACF,CAAC,CAAC;QAEH,MAAM,CAAC,UAAU,CAAC,aAAa,CAAC,CAAC,IAAI,CAAC,cAAc,CAAC,CAAC;QACtD,MAAM,CAAC,UAAU,CAAC,eAAe,CAAC,MAAM,CAAC,CAAC,sBAAsB,CAAC,CAAC,CAAC,CAAC;QACpE,MAAM,CAAC,UAAU,CAAC,KAAK,CAAC,MAAM,CAAC,CAAC,sBAAsB,CAAC,CAAC,CAAC,CAAC;QAC1D,MAAM,CAAC,UAAU,CAAC,KAAK,CAAC,IAAI,CAAC,CAAC,IAAI,EAAE,EAAE,CAAC,IAAI,CAAC,GAAG,KAAK,kBAAkB,CAAC,CAAC,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;QACpF,MAAM,CAAC,UAAU,CAAC,YAAY,CAAC,CAAC,IAAI,CAAC,cAAc,CAAC,CAAC;QACrD,MAAM,CAAC,UAAU,CAAC,QAAQ,EAAE,OAAO,CAAC,CAAC,IAAI,CAAC,MAAM,CAAC,CAAC;QAClD,MAAM,CAAC,UAAU,
CAAC,QAAQ,EAAE,gBAAgB,CAAC,CAAC,SAAS,CAAC,4BAA4B,CAAC,CAAC;IACxF,CAAC,CAAC,CAAC;AACL,CAAC,CAAC,CAAC"}
|
|
@@ -0,0 +1,172 @@
|
|
|
1
|
+
export type DurableObjectKind = "view" | "screen" | "action" | "tool" | "workflow" | "run" | "trace" | "artifact" | "outcome" | "approval" | "memory" | "device" | "account" | "peer" | "context_packet" | "task" | "message";
|
|
2
|
+
export type SyncScope = "metadata_only" | "receipts_and_traces" | "memory_and_artifacts" | "full_account_sync";
|
|
3
|
+
export type SyncOperationType = "upsert_object" | "link_object" | "record_receipt" | "record_artifact" | "record_outcome" | "record_memory" | "approval_event" | "register_peer" | "publish_context" | "ack_context" | "invalidate_context" | "send_peer_message" | "task_handoff";
|
|
4
|
+
export type SharedContextProduct = "nodebench" | "ta_studio";
|
|
5
|
+
export type SharedContextSurface = "web" | "browser" | "runner" | "evaluator" | "packet_engine" | "qa_runner" | "monitor" | "local_runtime" | "api";
|
|
6
|
+
export type SharedContextRole = "researcher" | "compiler" | "judge" | "explorer" | "replay" | "environment_builder" | "runner" | "observer" | "monitor" | "router";
|
|
7
|
+
export type SharedContextPacketType = "entity_packet" | "issue_packet" | "workflow_packet" | "trace_packet" | "judge_packet" | "environment_packet" | "failure_packet" | "state_snapshot_packet" | "verdict_packet" | "scenario_packet" | "change_packet";
|
|
8
|
+
export type SharedContextMessageClass = "request" | "response" | "context_offer" | "context_pull" | "task_handoff" | "status_update" | "verdict" | "escalation" | "invalidation";
|
|
9
|
+
export type SharedContextTaskStatus = "proposed" | "accepted" | "rejected" | "completed" | "escalated";
|
|
10
|
+
export interface SharedContextPeerSummary {
|
|
11
|
+
currentTask?: string;
|
|
12
|
+
focusEntity?: string;
|
|
13
|
+
focusWorkflow?: string;
|
|
14
|
+
currentState?: string;
|
|
15
|
+
confidence?: number;
|
|
16
|
+
lastUpdate?: string;
|
|
17
|
+
availableArtifacts?: string[];
|
|
18
|
+
permissionScope?: string[];
|
|
19
|
+
}
|
|
20
|
+
export interface SharedContextPeer {
|
|
21
|
+
peerId: string;
|
|
22
|
+
product: SharedContextProduct;
|
|
23
|
+
tenantId?: string | null;
|
|
24
|
+
workspaceId?: string | null;
|
|
25
|
+
surface: SharedContextSurface;
|
|
26
|
+
role: SharedContextRole;
|
|
27
|
+
capabilities: string[];
|
|
28
|
+
contextScopes: string[];
|
|
29
|
+
status: "active" | "idle" | "stale";
|
|
30
|
+
summary: SharedContextPeerSummary;
|
|
31
|
+
metadata?: Record<string, unknown>;
|
|
32
|
+
lastHeartbeatAt: string;
|
|
33
|
+
}
|
|
34
|
+
export interface SharedContextPacket {
|
|
35
|
+
contextId: string;
|
|
36
|
+
contextType: SharedContextPacketType;
|
|
37
|
+
producerPeerId: string;
|
|
38
|
+
tenantId?: string | null;
|
|
39
|
+
workspaceId?: string | null;
|
|
40
|
+
scope: string[];
|
|
41
|
+
subject: string;
|
|
42
|
+
summary: string;
|
|
43
|
+
claims: string[];
|
|
44
|
+
evidenceRefs: string[];
|
|
45
|
+
stateSnapshot?: Record<string, unknown>;
|
|
46
|
+
timeWindow?: {
|
|
47
|
+
from?: string;
|
|
48
|
+
to?: string;
|
|
49
|
+
};
|
|
50
|
+
freshness?: {
|
|
51
|
+
status?: "fresh" | "warming" | "stale";
|
|
52
|
+
expiresAt?: string;
|
|
53
|
+
trustTier?: "internal" | "verified" | "directional";
|
|
54
|
+
};
|
|
55
|
+
permissions?: {
|
|
56
|
+
visibility?: "internal" | "workspace" | "tenant";
|
|
57
|
+
allowedRoles?: string[];
|
|
58
|
+
};
|
|
59
|
+
confidence?: number;
|
|
60
|
+
lineage?: {
|
|
61
|
+
parentContextIds?: string[];
|
|
62
|
+
sourceRunId?: string;
|
|
63
|
+
sourceTraceId?: string;
|
|
64
|
+
supersedes?: string;
|
|
65
|
+
};
|
|
66
|
+
invalidates?: string[];
|
|
67
|
+
nextActions?: string[];
|
|
68
|
+
version: number;
|
|
69
|
+
status: "active" | "superseded" | "invalidated";
|
|
70
|
+
metadata?: Record<string, unknown>;
|
|
71
|
+
}
|
|
72
|
+
export interface SharedContextTask {
|
|
73
|
+
taskId: string;
|
|
74
|
+
taskType: string;
|
|
75
|
+
proposerPeerId: string;
|
|
76
|
+
assigneePeerId: string;
|
|
77
|
+
status: SharedContextTaskStatus;
|
|
78
|
+
taskSpec: Record<string, unknown>;
|
|
79
|
+
inputContextIds: string[];
|
|
80
|
+
outputContextId?: string | null;
|
|
81
|
+
reason?: string | null;
|
|
82
|
+
createdAt: string;
|
|
83
|
+
updatedAt: string;
|
|
84
|
+
}
|
|
85
|
+
export interface SyncQueueOperation {
|
|
86
|
+
id: string;
|
|
87
|
+
objectId: string | null;
|
|
88
|
+
objectKind: DurableObjectKind | string;
|
|
89
|
+
opType: SyncOperationType | string;
|
|
90
|
+
payload: Record<string, unknown>;
|
|
91
|
+
payloadHash: string;
|
|
92
|
+
createdAt: string;
|
|
93
|
+
}
|
|
94
|
+
export interface SyncBridgePairDevicePayload {
|
|
95
|
+
pairingCode?: string;
|
|
96
|
+
deviceToken?: string;
|
|
97
|
+
deviceId: string;
|
|
98
|
+
deviceName: string;
|
|
99
|
+
platform?: string;
|
|
100
|
+
appVersion?: string;
|
|
101
|
+
requestedScopes?: SyncScope[];
|
|
102
|
+
workspaceId?: string;
|
|
103
|
+
metadata?: Record<string, unknown>;
|
|
104
|
+
}
|
|
105
|
+
export interface SyncBridgePairedPayload {
|
|
106
|
+
deviceToken: string;
|
|
107
|
+
deviceId: string;
|
|
108
|
+
userId: string;
|
|
109
|
+
workspaceId?: string;
|
|
110
|
+
scopesGranted: SyncScope[];
|
|
111
|
+
pairedAt: string;
|
|
112
|
+
syncEnabled: boolean;
|
|
113
|
+
}
|
|
114
|
+
export interface SyncBridgeSyncBatchPayload {
|
|
115
|
+
deviceId: string;
|
|
116
|
+
operations: SyncQueueOperation[];
|
|
117
|
+
}
|
|
118
|
+
export interface SyncBridgeSyncAckPayload {
|
|
119
|
+
acceptedIds: string[];
|
|
120
|
+
rejected: Array<{
|
|
121
|
+
id: string;
|
|
122
|
+
reason: string;
|
|
123
|
+
}>;
|
|
124
|
+
serverWatermark: string;
|
|
125
|
+
}
|
|
126
|
+
export interface SyncBridgeApprovalEventPayload {
|
|
127
|
+
approvalId: string;
|
|
128
|
+
action: "approved" | "rejected" | "revoked";
|
|
129
|
+
actorUserId?: string;
|
|
130
|
+
detail?: string;
|
|
131
|
+
metadata?: Record<string, unknown>;
|
|
132
|
+
}
|
|
133
|
+
export interface SyncBridgeErrorPayload {
|
|
134
|
+
code: string;
|
|
135
|
+
message: string;
|
|
136
|
+
retryable?: boolean;
|
|
137
|
+
}
|
|
138
|
+
export interface PairingGrant {
|
|
139
|
+
pairingCode: string;
|
|
140
|
+
userId: string;
|
|
141
|
+
workspaceId?: string;
|
|
142
|
+
scopes: SyncScope[];
|
|
143
|
+
expiresAt: string;
|
|
144
|
+
metadata?: Record<string, unknown>;
|
|
145
|
+
}
|
|
146
|
+
export interface AccountSyncSnapshot {
|
|
147
|
+
userId: string;
|
|
148
|
+
workspaceId?: string;
|
|
149
|
+
connectedDevices: Array<{
|
|
150
|
+
deviceId: string;
|
|
151
|
+
deviceName: string;
|
|
152
|
+
platform?: string;
|
|
153
|
+
pairedAt: string;
|
|
154
|
+
lastSeenAt: string;
|
|
155
|
+
scopesGranted: SyncScope[];
|
|
156
|
+
}>;
|
|
157
|
+
recentOperations: Array<{
|
|
158
|
+
id: string;
|
|
159
|
+
deviceId: string;
|
|
160
|
+
objectId: string | null;
|
|
161
|
+
objectKind: string;
|
|
162
|
+
opType: string;
|
|
163
|
+
acceptedAt: string;
|
|
164
|
+
}>;
|
|
165
|
+
}
|
|
166
|
+
export interface SyncBridgeEnvelope {
|
|
167
|
+
type: "pair_device" | "paired" | "sync_batch" | "sync_ack" | "approval_event" | "error" | "ping" | "pong";
|
|
168
|
+
id: string;
|
|
169
|
+
timestamp: string;
|
|
170
|
+
payload: SyncBridgePairDevicePayload | SyncBridgePairedPayload | SyncBridgeSyncBatchPayload | SyncBridgeSyncAckPayload | SyncBridgeApprovalEventPayload | SyncBridgeErrorPayload | Record<string, never>;
|
|
171
|
+
}
|
|
172
|
+
export declare function createSyncEnvelope<T extends SyncBridgeEnvelope["payload"]>(type: SyncBridgeEnvelope["type"], payload: T): SyncBridgeEnvelope;
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"protocol.js","sourceRoot":"","sources":["../../src/sync/protocol.ts"],"names":[],"mappings":"AA4RA,MAAM,UAAU,kBAAkB,CAChC,IAAgC,EAChC,OAAU;IAEV,OAAO;QACL,IAAI;QACJ,EAAE,EAAE,GAAG,IAAI,IAAI,IAAI,CAAC,GAAG,EAAE,IAAI,IAAI,CAAC,MAAM,EAAE,CAAC,QAAQ,CAAC,EAAE,CAAC,CAAC,KAAK,CAAC,CAAC,EAAE,EAAE,CAAC,EAAE;QACtE,SAAS,EAAE,IAAI,IAAI,EAAE,CAAC,WAAW,EAAE;QACnC,OAAO;KACR,CAAC;AACJ,CAAC"}
|