@tangle-network/agent-eval 0.27.0 → 0.28.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +72 -0
- package/README.md +4 -5
- package/dist/{baseline-4R5deP0N.d.ts → baseline-BwdCXUS8.d.ts} +1 -1
- package/dist/builder-eval/index.d.ts +3 -3
- package/dist/builder-eval/index.js +1 -1
- package/dist/{chunk-WWYCWKUM.js → chunk-3CKU6VGU.js} +2 -2
- package/dist/{chunk-2A5XJB43.js → chunk-5AKPEK5L.js} +3 -3
- package/dist/chunk-5AKPEK5L.js.map +1 -0
- package/dist/{chunk-RAF443UI.js → chunk-DBIGN5MJ.js} +2 -2
- package/dist/{chunk-JLZQWFV3.js → chunk-K33INZHH.js} +2 -2
- package/dist/chunk-K33INZHH.js.map +1 -0
- package/dist/{chunk-NU65VQ7M.js → chunk-MAZ26DC7.js} +1 -1
- package/dist/chunk-MAZ26DC7.js.map +1 -0
- package/dist/{chunk-LSH4MMOZ.js → chunk-NCRFYPS3.js} +1 -1
- package/dist/chunk-NCRFYPS3.js.map +1 -0
- package/dist/{chunk-ZN274SWR.js → chunk-PALJO75S.js} +2 -2
- package/dist/{chunk-OWLAAMME.js → chunk-QHF6EQKK.js} +3 -2
- package/dist/chunk-QHF6EQKK.js.map +1 -0
- package/dist/chunk-R5UQJNKC.js +722 -0
- package/dist/chunk-R5UQJNKC.js.map +1 -0
- package/dist/{chunk-SESZDQPX.js → chunk-RUI6SIHY.js} +3 -3
- package/dist/chunk-RUI6SIHY.js.map +1 -0
- package/dist/{chunk-WHZMVFUV.js → chunk-SZSBQUIJ.js} +2 -2
- package/dist/chunk-SZSBQUIJ.js.map +1 -0
- package/dist/chunk-UW4NOOZI.js +1561 -0
- package/dist/chunk-UW4NOOZI.js.map +1 -0
- package/dist/{chunk-4F5DQN55.js → chunk-VSMTAMNK.js} +1 -1
- package/dist/chunk-VSMTAMNK.js.map +1 -0
- package/dist/{chunk-5LBB5B3Z.js → chunk-XFZCM5Z3.js} +1 -1
- package/dist/chunk-XFZCM5Z3.js.map +1 -0
- package/dist/cli.js +1 -1
- package/dist/{control-CBShYYA6.d.ts → control-rJhEDdpy.d.ts} +4 -4
- package/dist/{control-runtime-BuJHoLg0.d.ts → control-runtime-BRdQ0wrx.d.ts} +3 -2
- package/dist/control.d.ts +5 -5
- package/dist/control.js +2 -2
- package/dist/{emitter-DP_cSSiw.d.ts → emitter-BqjeOvJh.d.ts} +1 -1
- package/dist/{failure-cluster-C2EGSDiT.d.ts → failure-cluster-D1NZKqYu.d.ts} +2 -3
- package/dist/{feedback-trajectory-DfFdrraJ.d.ts → feedback-trajectory-j0nJFgC6.d.ts} +1 -1
- package/dist/governance/index.d.ts +2 -2
- package/dist/{index-D3iBCjdF.d.ts → index-Cgt3DKXr.d.ts} +2 -2
- package/dist/index.d.ts +1279 -468
- package/dist/index.js +1992 -1259
- package/dist/index.js.map +1 -1
- package/dist/{integrity-DK2EBVZC.d.ts → integrity-BAxLGJ9I.d.ts} +2 -2
- package/dist/knowledge/index.d.ts +3 -3
- package/dist/knowledge/index.js +2 -2
- package/dist/meta-eval/index.d.ts +1 -1
- package/dist/{multi-layer-verifier-LkP3LVKj.d.ts → multi-layer-verifier-BNi4-8lR.d.ts} +2 -2
- package/dist/openapi.json +1 -1
- package/dist/optimization.d.ts +8 -8
- package/dist/optimization.js +5 -5
- package/dist/pipelines/index.d.ts +6 -6
- package/dist/pipelines/index.js +2 -2
- package/dist/prm/index.d.ts +4 -4
- package/dist/{query-DODUYdPg.d.ts → query-BFDT0kX_.d.ts} +1 -1
- package/dist/{release-report-wfUySN5F.d.ts → release-report-PWhGlpfO.d.ts} +1 -1
- package/dist/replay-BX5Fm8en.d.ts +529 -0
- package/dist/reporting.d.ts +5 -5
- package/dist/reporting.js +5 -5
- package/dist/{researcher-bGkI7vCl.d.ts → researcher-ClDX3KZx.d.ts} +13 -14
- package/dist/rl.d.ts +29 -47
- package/dist/rl.js +5 -5
- package/dist/rl.js.map +1 -1
- package/dist/{rubric-D5tjHNJQ.d.ts → rubric-DgSqjqqj.d.ts} +2 -2
- package/dist/{sequential-Dgz1n51-.d.ts → sequential-5iSVfzl2.d.ts} +2 -2
- package/dist/{store-Db2Bv8Cf.d.ts → store-BP5be6s7.d.ts} +1 -1
- package/dist/{summary-report-DZVXOCK_.d.ts → summary-report-jrSGb2xZ.d.ts} +5 -5
- package/dist/{test-graded-scenario-B2kWEdh9.d.ts → test-graded-scenario-BJ54PDan.d.ts} +2 -2
- package/dist/traces.d.ts +9 -311
- package/dist/traces.js +16 -987
- package/dist/traces.js.map +1 -1
- package/dist/{trajectory-CnoBo-JY.d.ts → trajectory-BFmveYZt.d.ts} +1 -1
- package/dist/wire/index.d.ts +4 -4
- package/dist/wire/index.js +1 -1
- package/docs/research-report-methodology.md +4 -4
- package/docs/three-package-architecture.md +12 -24
- package/package.json +1 -1
- package/dist/chunk-2A5XJB43.js.map +0 -1
- package/dist/chunk-4F5DQN55.js.map +0 -1
- package/dist/chunk-5LBB5B3Z.js.map +0 -1
- package/dist/chunk-I4MBDTY5.js +0 -272
- package/dist/chunk-I4MBDTY5.js.map +0 -1
- package/dist/chunk-JLZQWFV3.js.map +0 -1
- package/dist/chunk-K2TPS5LB.js +0 -569
- package/dist/chunk-K2TPS5LB.js.map +0 -1
- package/dist/chunk-LSH4MMOZ.js.map +0 -1
- package/dist/chunk-NU65VQ7M.js.map +0 -1
- package/dist/chunk-OWLAAMME.js.map +0 -1
- package/dist/chunk-SESZDQPX.js.map +0 -1
- package/dist/chunk-WHZMVFUV.js.map +0 -1
- package/dist/replay-BL96gCEP.d.ts +0 -226
- /package/dist/{chunk-WWYCWKUM.js.map → chunk-3CKU6VGU.js.map} +0 -0
- /package/dist/{chunk-RAF443UI.js.map → chunk-DBIGN5MJ.js.map} +0 -0
- /package/dist/{chunk-ZN274SWR.js.map → chunk-PALJO75S.js.map} +0 -0
package/dist/index.js
CHANGED
|
@@ -11,7 +11,7 @@ import {
|
|
|
11
11
|
failureClusterView,
|
|
12
12
|
iqr,
|
|
13
13
|
welchsTTest
|
|
14
|
-
} from "./chunk-
|
|
14
|
+
} from "./chunk-K33INZHH.js";
|
|
15
15
|
import {
|
|
16
16
|
exportTrainingData,
|
|
17
17
|
toNdjson
|
|
@@ -28,7 +28,7 @@ import {
|
|
|
28
28
|
pytestTestParser,
|
|
29
29
|
runTestGradedScenario,
|
|
30
30
|
vitestTestParser
|
|
31
|
-
} from "./chunk-
|
|
31
|
+
} from "./chunk-QHF6EQKK.js";
|
|
32
32
|
import {
|
|
33
33
|
classifyEuAiRisk,
|
|
34
34
|
euAiActReport,
|
|
@@ -43,7 +43,7 @@ import {
|
|
|
43
43
|
knowledgeReadinessTracePayload,
|
|
44
44
|
scoreKnowledgeReadiness,
|
|
45
45
|
userQuestionsForKnowledgeGaps
|
|
46
|
-
} from "./chunk-
|
|
46
|
+
} from "./chunk-3CKU6VGU.js";
|
|
47
47
|
import {
|
|
48
48
|
controlFailureClassFromVerification,
|
|
49
49
|
controlRunToRunRecord,
|
|
@@ -54,7 +54,7 @@ import {
|
|
|
54
54
|
runProposeReview,
|
|
55
55
|
runProposeReviewAsControlLoop,
|
|
56
56
|
scoreFromEvals
|
|
57
|
-
} from "./chunk-
|
|
57
|
+
} from "./chunk-PALJO75S.js";
|
|
58
58
|
import {
|
|
59
59
|
allCriticalPassed,
|
|
60
60
|
objectiveEval,
|
|
@@ -62,7 +62,7 @@ import {
|
|
|
62
62
|
stopOnNoProgress,
|
|
63
63
|
stopOnRepeatedAction,
|
|
64
64
|
subjectiveEval
|
|
65
|
-
} from "./chunk-
|
|
65
|
+
} from "./chunk-NCRFYPS3.js";
|
|
66
66
|
import {
|
|
67
67
|
CallbackResearcher,
|
|
68
68
|
DEFAULT_MUTATION_PRIMITIVES,
|
|
@@ -96,7 +96,7 @@ import {
|
|
|
96
96
|
summarizePreferenceMemory,
|
|
97
97
|
trialTraceFromMultiShotTrial,
|
|
98
98
|
withAssignedFeedbackSplit
|
|
99
|
-
} from "./chunk-
|
|
99
|
+
} from "./chunk-SZSBQUIJ.js";
|
|
100
100
|
import {
|
|
101
101
|
RunRecordValidationError,
|
|
102
102
|
isRunRecord,
|
|
@@ -111,10 +111,10 @@ import {
|
|
|
111
111
|
judgeReplayGate,
|
|
112
112
|
releaseTraceEvidenceFromMultiShotTrials,
|
|
113
113
|
renderReleaseReport
|
|
114
|
-
} from "./chunk-
|
|
114
|
+
} from "./chunk-DBIGN5MJ.js";
|
|
115
115
|
import {
|
|
116
116
|
runEvalCampaign
|
|
117
|
-
} from "./chunk-
|
|
117
|
+
} from "./chunk-RUI6SIHY.js";
|
|
118
118
|
import {
|
|
119
119
|
LlmCallError,
|
|
120
120
|
LlmClient,
|
|
@@ -128,7 +128,7 @@ import {
|
|
|
128
128
|
import {
|
|
129
129
|
evaluateInterimReleaseConfidence,
|
|
130
130
|
pairedEvalueSequence
|
|
131
|
-
} from "./chunk-
|
|
131
|
+
} from "./chunk-MAZ26DC7.js";
|
|
132
132
|
import {
|
|
133
133
|
RESEARCH_REPORT_HARD_PAIR_FLOOR,
|
|
134
134
|
benjaminiHochberg,
|
|
@@ -141,18 +141,26 @@ import {
|
|
|
141
141
|
requiredSampleSize,
|
|
142
142
|
researchReport,
|
|
143
143
|
summaryTable
|
|
144
|
-
} from "./chunk-
|
|
144
|
+
} from "./chunk-5AKPEK5L.js";
|
|
145
145
|
import {
|
|
146
|
+
calibrateJudge,
|
|
147
|
+
calibrateJudgeContinuous,
|
|
146
148
|
cohensD,
|
|
147
149
|
confidenceInterval,
|
|
150
|
+
continuousAgreement,
|
|
151
|
+
corpusInterRaterAgreement,
|
|
152
|
+
corpusInterRaterAgreementFromJudgeScores,
|
|
148
153
|
interRaterReliability,
|
|
149
154
|
mannWhitneyU,
|
|
150
155
|
normalizeScores,
|
|
151
156
|
pairedTTest,
|
|
152
157
|
partialCredit,
|
|
158
|
+
positionalBias,
|
|
159
|
+
selfPreference,
|
|
160
|
+
verbosityBias,
|
|
153
161
|
weightedMean,
|
|
154
162
|
wilcoxonSignedRank
|
|
155
|
-
} from "./chunk-
|
|
163
|
+
} from "./chunk-R5UQJNKC.js";
|
|
156
164
|
import {
|
|
157
165
|
DEFAULT_REDACTION_RULES,
|
|
158
166
|
FileSystemTraceStore,
|
|
@@ -161,12 +169,15 @@ import {
|
|
|
161
169
|
REDACTION_VERSION,
|
|
162
170
|
ReplayCache,
|
|
163
171
|
ReplayCacheMissError,
|
|
172
|
+
TraceFileMissingError,
|
|
173
|
+
analyzeTraces,
|
|
174
|
+
buildTraceAnalystTools,
|
|
164
175
|
createReplayFetch,
|
|
165
176
|
exportRunAsOtlp,
|
|
166
177
|
iterateRawCalls,
|
|
167
178
|
redactString,
|
|
168
179
|
redactValue
|
|
169
|
-
} from "./chunk-
|
|
180
|
+
} from "./chunk-UW4NOOZI.js";
|
|
170
181
|
import {
|
|
171
182
|
aggregateLlm,
|
|
172
183
|
argHash,
|
|
@@ -208,7 +219,7 @@ import {
|
|
|
208
219
|
hashJson,
|
|
209
220
|
signManifest,
|
|
210
221
|
verifyManifest
|
|
211
|
-
} from "./chunk-
|
|
222
|
+
} from "./chunk-VSMTAMNK.js";
|
|
212
223
|
import {
|
|
213
224
|
AgentEvalError,
|
|
214
225
|
CaptureIntegrityError,
|
|
@@ -221,95 +232,1534 @@ import {
|
|
|
221
232
|
} from "./chunk-NG236HPC.js";
|
|
222
233
|
import "./chunk-PZ5AY32C.js";
|
|
223
234
|
|
|
224
|
-
// src/
|
|
225
|
-
|
|
226
|
-
|
|
227
|
-
|
|
235
|
+
// src/run-score.ts
|
|
236
|
+
/**
 * Default weight for every run-score dimension. Positive weights reward a
 * dimension, negative weights penalize it.
 */
var DEFAULT_RUN_SCORE_WEIGHTS = {
  success: 4,
  goalProgress: 2,
  repoGroundedness: 1.5,
  driftPenalty: -1.5,
  toolUseQuality: 1,
  patchQuality: 1.25,
  testReality: 1.5,
  finalGate: 3,
  reviewerBlockers: -2,
  costUsd: -0.2,
  wallSeconds: -0.1
};

/**
 * Collapse a run score into one scalar by taking a weighted sum of its
 * dimensions.
 *
 * Every dimension except cost and wall time is clamped to [0, 1] first.
 * `costUsd` is floored at 0; `wallSeconds` is converted to minutes and floored
 * at 0 before its weight is applied.
 *
 * @param {object} score - Per-dimension run score (missing or non-finite fields count as 0).
 * @param {object} [weights] - Partial overrides for DEFAULT_RUN_SCORE_WEIGHTS.
 *   Overrides that are not finite numbers (e.g. `undefined` left behind by a
 *   partial options object) are ignored so they cannot turn the result into NaN.
 * @returns {number} The weighted aggregate.
 */
function aggregateRunScore(score, weights = {}) {
  const w = { ...DEFAULT_RUN_SCORE_WEIGHTS };
  for (const [dimension, weight] of Object.entries(weights ?? {})) {
    // Only a real number may replace a default weight.
    if (Number.isFinite(weight)) w[dimension] = weight;
  }
  return w.success * clamp01(score.success) + w.goalProgress * clamp01(score.goalProgress) + w.repoGroundedness * clamp01(score.repoGroundedness) + w.driftPenalty * clamp01(score.driftPenalty) + w.toolUseQuality * clamp01(score.toolUseQuality) + w.patchQuality * clamp01(score.patchQuality) + w.testReality * clamp01(score.testReality) + w.finalGate * clamp01(score.finalGate) + w.reviewerBlockers * clamp01(score.reviewerBlockers) + w.costUsd * Math.max(0, finiteOrZero(score.costUsd)) + w.wallSeconds * Math.max(0, finiteOrZero(score.wallSeconds) / 60);
}

/**
 * Clamp a value to the closed interval [0, 1].
 *
 * @param {*} value - Candidate number.
 * @returns {number} 0 when `value` is not a finite number, otherwise the clamped value.
 */
function clamp01(value) {
  if (!Number.isFinite(value)) return 0;
  return Math.max(0, Math.min(1, value));
}

/**
 * Pass finite numbers through and replace everything else with 0.
 *
 * @param {*} value - Candidate number.
 * @returns {number} `value` when finite, otherwise 0.
 */
function finiteOrZero(value) {
  return Number.isFinite(value) ? value : 0;
}
|
|
260
|
+
|
|
261
|
+
// src/run-critic.ts
|
|
262
|
+
/**
 * Patterns that mark LLM output or event payloads as "drift" when no custom
 * list is supplied to RunCritic.
 */
var DEFAULT_DRIFT_PATTERNS = [
  /https?:\/\//i,
  /\btitle:\s/i,
  /\bsummary:\s/i,
  /\burl:\s/i,
  /\bnpm package usage\b/i,
  /\bnews\b/i
];

/**
 * Heuristic critic that derives a multi-dimensional score from a recorded run
 * (spans, events, artifacts and budget entries) and can rank it as a scalar.
 */
var RunCritic = class {
  weights;
  driftPatterns;

  /**
   * @param {object} [options]
   * @param {object} [options.weights] - Weight overrides forwarded to aggregateRunScore by `rank`.
   * @param {RegExp[]} [options.driftPatterns] - Replaces DEFAULT_DRIFT_PATTERNS when provided.
   */
  constructor(options = {}) {
    this.weights = options.weights;
    this.driftPatterns = options.driftPatterns ?? DEFAULT_DRIFT_PATTERNS;
  }

  /**
   * Load a run and its spans, events, artifacts and budget from `store`, then score it.
   *
   * @param {object} store - Trace store exposing getRun/spans/events/artifacts/budget.
   * @param {string} runId - Identifier of the run to score.
   * @returns {Promise<object>} The result of `scoreTrace`.
   * @throws {NotFoundError} When the store has no run with that id.
   */
  async score(store, runId) {
    const run = await store.getRun(runId);
    if (!run) throw new NotFoundError(`run ${runId} not found`);
    const [spans, events, artifacts, budget] = await Promise.all([
      store.spans({ runId }),
      store.events({ runId }),
      store.artifacts(runId),
      store.budget(runId)
    ]);
    return this.scoreTrace({ run, spans, events, artifacts, budget });
  }

  /**
   * Score an already-loaded trace.
   *
   * @param {object} trace - `{ run, spans, events, artifacts, budget }`.
   * @returns {object} One value per run-score dimension plus `notes`, a list of
   *   human-readable remarks about missing or negative evidence.
   */
  scoreTrace(trace) {
    const notes = [];
    const llmSpans = trace.spans.filter((s) => s.kind === "llm");
    const toolSpans = trace.spans.filter((s) => s.kind === "tool");
    const judgeSpans = trace.spans.filter((s) => s.kind === "judge");
    const sandboxSpans = trace.spans.filter((s) => s.kind === "sandbox");
    const finalGateSpans = judgeSpans.filter(
      (span) => span.dimension === "final_gate" || span.attributes?.finalGate === true
    );

    // A pass counts fully; a completed run without pass=true counts half.
    const success = trace.run.outcome?.pass === true ? 1 : trace.run.status === "completed" ? 0.5 : 0;
    if (!success) notes.push("run did not complete with pass=true");

    const judgeAverage = judgeSpans.length ? judgeSpans.reduce((sum, span) => sum + normalizeJudgeScore(span.score), 0) / judgeSpans.length : void 0;
    // Outcome scores above 1 are read as percentages.
    const outcomeScore = typeof trace.run.outcome?.score === "number" ? clamp01(
      trace.run.outcome.score > 1 ? trace.run.outcome.score / 100 : trace.run.outcome.score
    ) : void 0;
    const goalProgress = outcomeScore ?? judgeAverage ?? success;

    const successfulTools = toolSpans.filter((span) => span.status !== "error").length;
    const toolUseQuality = toolSpans.length === 0 ? 0 : successfulTools / toolSpans.length;
    if (toolSpans.length === 0) notes.push("no tool spans recorded");

    // Four pieces of artifact/edit evidence saturate patch quality.
    const patchEvidence = trace.artifacts.length + toolSpans.filter((span) => /write|edit|patch|apply/i.test(span.toolName)).length;
    const patchQuality = patchEvidence > 0 ? clamp01(patchEvidence / 4) : 0;
    if (!patchQuality) notes.push("no artifact or edit evidence recorded");

    const sandboxTests = sandboxSpans.filter(
      (span) => typeof span.testsTotal === "number" && span.testsTotal > 0
    );
    // Without sandbox test counts, a tool call that merely mentions a test or
    // build command earns a fixed 0.4.
    // NOTE(review): in the pattern below `\b` binds only to the first and last
    // alternative; confirm whether a grouped `\b(?:...)\b` was intended.
    const testReality = sandboxTests.length ? sandboxTests.reduce(
      (sum, span) => sum + (span.testsPassed ?? 0) / Math.max(1, span.testsTotal ?? 1),
      0
    ) / sandboxTests.length : toolSpans.some(
      (span) => /\btest|vitest|pytest|jest|build|tsc\b/i.test(JSON.stringify(span.args))
    ) ? 0.4 : 0;
    if (!testReality) notes.push("no real test/build evidence recorded");

    const blockerSpans = judgeSpans.filter((span) => isBlockingJudge(span));
    const finalGateBlockers = finalGateSpans.filter((span) => isBlockingJudge(span));
    // With no final-gate judgment recorded, fall back to the success value.
    const finalGate = finalGateSpans.length ? finalGateBlockers.length ? 0 : 1 : success;
    if (finalGateBlockers.length)
      notes.push(`final gate blocked by ${finalGateBlockers.length} reviewer(s)`);
    else if (!finalGateSpans.length) notes.push("no final gate judgment recorded");
    const reviewerBlockers = judgeSpans.length ? blockerSpans.length / judgeSpans.length : 0;
    if (reviewerBlockers) notes.push(`detected ${blockerSpans.length} blocking reviewer signal(s)`);

    const positiveGroundingSignals = patchEvidence + sandboxSpans.length + llmSpans.filter((span) => looksRepoGrounded(span.output ?? "")).length;
    const driftSignals = llmSpans.filter((span) => this.isDrift(span.output ?? "")).length + trace.events.filter((event) => this.isDrift(JSON.stringify(event.payload))).length;
    const totalSignals = positiveGroundingSignals + driftSignals;
    const repoGroundedness = totalSignals === 0 ? 0 : positiveGroundingSignals / totalSignals;
    const driftPenalty = totalSignals === 0 ? 0 : driftSignals / totalSignals;
    if (driftSignals > 0) notes.push(`detected ${driftSignals} drift signal(s)`);

    // Take the largest consumed "usd" budget entry when one exists. If the
    // ledger tracks no usd dimension at all, sum the per-span LLM costs rather
    // than reporting a cost of 0.
    const usdEntries = trace.budget.filter((entry) => entry.dimension === "usd");
    const costUsd = usdEntries.length ? usdEntries.reduce((max, entry) => Math.max(max, finiteOrZero(entry.consumed)), 0) : llmSpans.reduce((sum, span) => sum + (span.costUsd ?? 0), 0);
    // Dividing by 1e3 gives seconds if the timestamps are epoch milliseconds — TODO confirm unit.
    const wallSeconds = trace.run.endedAt && trace.run.startedAt ? Math.max(0, (trace.run.endedAt - trace.run.startedAt) / 1e3) : 0;

    return {
      success,
      goalProgress,
      repoGroundedness,
      driftPenalty,
      toolUseQuality,
      patchQuality,
      testReality,
      finalGate,
      reviewerBlockers,
      costUsd,
      wallSeconds,
      notes
    };
  }

  /**
   * Reduce a score to a scalar using this critic's weight overrides.
   *
   * @param {object} score - A score as returned by `scoreTrace`.
   * @returns {number} The weighted aggregate from aggregateRunScore.
   */
  rank(score) {
    return aggregateRunScore(score, this.weights);
  }

  /**
   * Report whether any drift pattern matches `text`.
   *
   * @param {string} [text] - Text to inspect; null/undefined is treated as empty.
   * @returns {boolean} True when at least one pattern matches.
   */
  isDrift(text) {
    const haystack = String(text ?? "");
    return this.driftPatterns.some((pattern) => {
      // Global/sticky patterns keep `lastIndex` between calls; reset it so a
      // caller-supplied /g pattern gives the same answer every time.
      pattern.lastIndex = 0;
      return pattern.test(haystack);
    });
  }
};
|
|
368
|
+
/**
 * Bring a raw judge score onto the unit interval.
 *
 * Scores greater than 1 are divided by 10 first; the result is then clamped
 * to [0, 1] by clamp01.
 *
 * @param {number} score - Raw judge score.
 * @returns {number} The score in [0, 1].
 */
function normalizeJudgeScore(score) {
  const unitScore = score > 1 ? score / 10 : score;
  return clamp01(unitScore);
}
|
|
260
|
-
function
|
|
261
|
-
|
|
262
|
-
|
|
263
|
-
|
|
264
|
-
|
|
265
|
-
|
|
266
|
-
|
|
267
|
-
|
|
268
|
-
|
|
269
|
-
|
|
270
|
-
|
|
271
|
-
|
|
272
|
-
|
|
273
|
-
|
|
274
|
-
|
|
275
|
-
|
|
276
|
-
|
|
277
|
-
|
|
278
|
-
|
|
279
|
-
|
|
280
|
-
|
|
371
|
+
/**
 * Heuristic: does `text` mention something repository-specific?
 *
 * Matches, case-insensitively, source/test directory prefixes, common project
 * files, TypeScript extensions, `git status`, package-manager commands and
 * test-runner names.
 *
 * @param {string} text - Text to inspect.
 * @returns {boolean} True when any marker is found.
 */
function looksRepoGrounded(text) {
  const repoMarker = /(?:src\/|tests?\/|package\.json|tsconfig|\.ts\b|\.tsx\b|git status|pnpm |npm |vitest|pytest|jest)/i;
  const grounded = repoMarker.test(text);
  return grounded;
}
|
|
376
|
+
/**
 * Decide whether a judge span represents a blocking review signal.
 *
 * A span blocks when its attributes say so (`blocking === true`,
 * `verdict === "BLOCKING"`, or a positive `blockingFindings` / `highFindings`
 * count) or when its score is very low.
 *
 * The low-score test follows the same scale convention as normalizeJudgeScore:
 * a score above 1 is read on a 0-10 scale (blocking at <= 2), anything else on
 * a 0-1 scale (blocking at <= 0.2). Comparing every score against 2 would mark
 * all unit-scale judges as blocking, including near-perfect ones.
 *
 * @param {object} span - Judge span with optional `attributes` and `score`.
 * @returns {boolean} True when the span should count as a blocker.
 */
function isBlockingJudge(span) {
  const attrs = span.attributes;
  if (attrs?.blocking === true || attrs?.verdict === "BLOCKING") return true;
  if (positiveNumber(attrs?.blockingFindings) || positiveNumber(attrs?.highFindings)) return true;
  const { score } = span;
  // A missing or non-numeric score carries no blocking signal.
  if (typeof score !== "number" || Number.isNaN(score)) return false;
  return score > 1 ? score <= 2 : score <= 0.2;
}

/**
 * @param {*} value - Candidate value.
 * @returns {boolean} True only for numbers strictly greater than 0.
 */
function positiveNumber(value) {
  return typeof value === "number" && value > 0;
}
|
|
382
|
+
|
|
383
|
+
// src/semantic-concept-judge.ts
|
|
384
|
+
// Per-concept weight used when concepts are weighted by complexity.
var DEFAULT_COMPLEXITY_WEIGHTS = { render: 1, integrate: 2, compute: 2.5 };

// Version tag stamped on every semantic-concept judge result.
var SEMANTIC_CONCEPT_JUDGE_VERSION = "semantic-concept-judge-v1-2026-04-24";

// Character caps applied to prompt inputs.
var DEFAULT_MAX_SOURCE = 45000;
var DEFAULT_MAX_HTML = 30000;
var DEFAULT_MAX_PER_FILE = 20000;

// Default `timeoutMs` passed to the LLM call.
var DEFAULT_TIMEOUT = 180000;

// Judge model used when the caller does not name one.
var DEFAULT_MODEL = "claude-sonnet-4-6";

// JSON schema the judge response must satisfy: an overall summary plus one
// graded entry per concept.
var SEMANTIC_SCHEMA = {
  type: "object",
  additionalProperties: false,
  required: ["summary", "concepts"],
  properties: {
    summary: {
      type: "string",
      minLength: 20,
      maxLength: 600
    },
    concepts: {
      type: "array",
      minItems: 1,
      items: {
        type: "object",
        additionalProperties: false,
        required: ["concept", "present", "score", "evidence", "severity"],
        properties: {
          concept: {
            type: "string",
            minLength: 1,
            maxLength: 120
          },
          present: {
            type: "boolean"
          },
          score: {
            type: "number",
            minimum: 0,
            maximum: 10
          },
          evidence: {
            type: "string",
            minLength: 5,
            maxLength: 400
          },
          severity: {
            type: "string",
            enum: ["critical", "major", "minor", "info"]
          }
        }
      }
    }
  }
};
|
|
419
|
+
/**
 * Cut `body` down to at most `cap` characters, appending a marker line that
 * states how many characters of `label` were dropped.
 *
 * @param {string} body - Text to shorten.
 * @param {number} cap - Maximum number of characters to keep.
 * @param {string} label - Name of the content, used in the marker.
 * @returns {string} `body` unchanged when it fits, otherwise the kept prefix plus the marker.
 */
function truncate(body, cap, label) {
  const overflow = body.length - cap;
  if (overflow <= 0) return body;
  const kept = body.slice(0, cap);
  return `${kept}\n\u2026 [truncated ${overflow} chars of ${label}]`;
}
|
|
424
|
+
/**
 * Assemble the user prompt for the semantic-concept judge.
 *
 * The prompt contains the grading rubric, the user request, optional artifact
 * metadata, the numbered list of expected concepts (with up to six keyword
 * hints each), the optional served HTML, and the concatenated source files.
 *
 * @param {object} input - Judge input: `userRequest`, `expectedConcepts`,
 *   `sourceFiles`, and optional `servedHtml`, `artifactLabel`, `artifactDescription`.
 * @param {object} opts - Resolved options supplying `maxPerFileChars`,
 *   `maxHtmlChars` and `maxSourceChars`.
 * @returns {string} The complete prompt text.
 */
function buildPrompt(input, opts) {
  // Files longer than the per-file cap are left out entirely, not shortened.
  const includedFiles = input.sourceFiles.filter((file) => file.content.length <= opts.maxPerFileChars);
  const sourceBlob = includedFiles.map((file) => `--- FILE: ${file.path} ---\n${file.content}`).join("\n\n");
  const html = input.servedHtml ?? "";

  // Optional sections render as an empty string when their data is absent.
  const metadataSection = input.artifactLabel ? `ARTIFACT METADATA:\nname: ${input.artifactLabel}\ndescription: ${input.artifactDescription ?? ""}\n\n` : "";
  const conceptList = input.expectedConcepts.map((spec, index) => {
    const hints = spec.keywords?.length ? ` \u2014 hints: [${spec.keywords.slice(0, 6).join(" | ")}]` : "";
    return ` ${index + 1}. "${spec.name}"${hints}`;
  }).join("\n");
  const htmlSection = html ? `SERVED HTML (what the preview returns when hit):\n${truncate(html, opts.maxHtmlChars, "HTML")}\n\n` : "";
  const sourceSection = truncate(sourceBlob, opts.maxSourceChars, "source");

  return `You are a strict code-review judge evaluating whether an agent's 0-to-1 build actually implements the features the user asked for.

You MUST distinguish:
(a) WORKING code that implements the concept (rendered UI, wired handler, real API call),
(b) KEYWORD-PRESENT stub (comments mentioning the concept, variable names, TODOs),
(c) ABSENT (concept nowhere).

A comment like "// TODO: add mint button" is NOT present \u2014 score 2-3. Only count a concept as present if there is real functional code: a rendered component, a call handler wired to state or a network call, a computed value actually used.

USER REQUEST (what the agent was asked to build):
${input.userRequest}

${metadataSection}EXPECTED CONCEPTS (each must be graded independently):
${conceptList}

${htmlSection}SOURCE FILES (the agent's workdir):
${sourceSection}

For EACH concept, return:
- concept: the concept name as given (match exactly)
- present: boolean \u2014 does a working implementation exist?
- score: 0-10 \u2014 10 = production-ready; 7 = functional but thin; 4 = partial/stubbed; 2 = keyword-only comment; 0 = absent
- evidence: cite "<file>:<line>" or "served-html:<selector>" pointing at the strongest supporting code. If the concept is absent or stubbed, explain what's missing.
- severity:
"info" when present: true AND score >= 7
"minor" when present: true AND 4 <= score < 7
"major" when present: false OR score < 4
"critical" when the concept is not only absent but a core user flow depends on it

Also produce a "summary" (one sentence, 20-600 chars): overall verdict on whether this is a shippable implementation of the user request vs a keyword-dense placeholder.

BE SKEPTICAL. Keyword matching already passed \u2014 your job is to catch what keyword matching misses. If the agent shipped a working build, say so. If it shipped a stub, say so. Don't grade on effort.

Return STRICT JSON. No prose outside the JSON.`;
}
|
|
472
|
+
/**
 * Ask an LLM judge whether each expected concept is really implemented and
 * fold the per-concept grades into one result.
 *
 * The function never rejects for judge failures: when the LLM call throws or
 * returns a malformed payload, it resolves to a result with `available: false`
 * and the error message in `error`.
 *
 * @param {object} input - Judge input; `expectedConcepts` is required, the rest
 *   is consumed by buildPrompt.
 * @param {object} [options] - Optional `model`, `timeoutMs`, `maxSourceChars`,
 *   `maxPerFileChars`, `maxHtmlChars`, `llm`, `weightConcepts`
 *   ("mean" by default, or "complexity") and `complexityWeights`.
 * @returns {Promise<object>} Result with `score` in [0, 1] (three decimals),
 *   `presentCount` (findings that are present with score >= 7), `totalCount`
 *   (number of expected concepts), `findings`, `summary`, `durationMs`,
 *   `costUsd` and `available`.
 */
async function runSemanticConceptJudge(input, options = {}) {
  const start = Date.now();
  const totalCount = input.expectedConcepts.length;
  if (totalCount === 0) {
    // Nothing to grade: report the judge as unavailable without calling the LLM.
    return {
      kind: "semantic-concept",
      version: SEMANTIC_CONCEPT_JUDGE_VERSION,
      score: 0,
      presentCount: 0,
      totalCount: 0,
      findings: [],
      summary: "no expected concepts declared",
      durationMs: 0,
      costUsd: null,
      available: false,
      error: "no expected concepts declared"
    };
  }
  const opts = {
    model: options.model ?? DEFAULT_MODEL,
    timeoutMs: options.timeoutMs ?? DEFAULT_TIMEOUT,
    maxSourceChars: options.maxSourceChars ?? DEFAULT_MAX_SOURCE,
    maxPerFileChars: options.maxPerFileChars ?? DEFAULT_MAX_PER_FILE,
    maxHtmlChars: options.maxHtmlChars ?? DEFAULT_MAX_HTML,
    llm: options.llm ?? {},
    weightConcepts: options.weightConcepts ?? "mean",
    complexityWeights: { ...DEFAULT_COMPLEXITY_WEIGHTS, ...options.complexityWeights ?? {} }
  };
  // In "mean" mode every concept weighs 1. Otherwise an explicit per-concept
  // weight wins, then the complexity table (in "complexity" mode), then 1.
  const weightForConcept = (spec) => {
    if (opts.weightConcepts === "mean") return 1;
    if (spec.weight != null) return spec.weight;
    if (opts.weightConcepts === "complexity") {
      return opts.complexityWeights[spec.complexity ?? "render"] ?? 1;
    }
    return 1;
  };
  const weightByName = new Map(
    input.expectedConcepts.map((c) => [c.name, weightForConcept(c)])
  );
  try {
    const { value, result } = await callLlmJson(
      {
        model: opts.model,
        messages: [
          {
            role: "system",
            content: "You are a strict code-review judge. Return strict JSON only. No prose outside the JSON. A keyword in a comment is NOT a working implementation."
          },
          { role: "user", content: buildPrompt(input, opts) }
        ],
        jsonSchema: { name: "semantic_concept_judge", schema: SEMANTIC_SCHEMA },
        temperature: 0,
        timeoutMs: opts.timeoutMs
      },
      opts.llm
    );
    if (!value?.concepts || !Array.isArray(value.concepts)) {
      throw new Error('judge returned malformed response \u2014 expected array under "concepts"');
    }
    const findings = value.concepts.map((c) => {
      const rawScore = Number(c.score ?? 0);
      return {
        concept: String(c.concept),
        present: Boolean(c.present),
        // A non-numeric score coerces to NaN, which would pass through the
        // clamp and poison the aggregate; grade it as 0 instead.
        score: Number.isNaN(rawScore) ? 0 : Math.max(0, Math.min(10, rawScore)),
        evidence: String(c.evidence ?? ""),
        severity: ["critical", "major", "minor", "info"].includes(c.severity) ? c.severity : "info"
      };
    });
    const presentCount = findings.filter((f) => f.present && f.score >= 7).length;
    let weightSum = 0;
    let weightedScoreSum = 0;
    for (const f of findings) {
      // Findings whose name matches no expected concept weigh 1.
      const w = weightByName.get(f.concept) ?? 1;
      weightSum += w;
      weightedScoreSum += w * f.score;
    }
    // Fall back to the plain mean when the weights do not sum to a positive number.
    const scoreAvg = weightSum > 0 ? weightedScoreSum / weightSum : findings.reduce((a, f) => a + f.score, 0) / Math.max(1, findings.length);
    return {
      kind: "semantic-concept",
      version: SEMANTIC_CONCEPT_JUDGE_VERSION,
      score: Number((scoreAvg / 10).toFixed(3)),
      presentCount,
      totalCount,
      findings,
      summary: String(value.summary ?? ""),
      durationMs: Date.now() - start,
      costUsd: result.costUsd ?? null,
      available: true
    };
  } catch (err) {
    // Deliberate best-effort: surface the failure in the result instead of throwing.
    return {
      kind: "semantic-concept",
      version: SEMANTIC_CONCEPT_JUDGE_VERSION,
      score: 0,
      presentCount: 0,
      totalCount,
      findings: [],
      summary: "",
      durationMs: Date.now() - start,
      costUsd: null,
      available: false,
      error: err instanceof Error ? err.message : String(err)
    };
  }
}
|
|
575
|
+
/**
 * Build a judge function with its options pre-bound.
 *
 * @param {object} [options] - Options forwarded to runSemanticConceptJudge on every call.
 * @returns {function(object): Promise<object>} Function taking only the judge input.
 */
function createSemanticConceptJudge(options = {}) {
  return function (input) {
    return runSemanticConceptJudge(input, options);
  };
}
|
|
578
|
+
|
|
579
|
+
// src/analyst/types.ts
|
|
580
|
+
import { createHash } from "crypto";
|
|
581
|
+
/**
 * Derive a deterministic finding id from the analyst, area, subject and claim.
 *
 * The id is `f_` followed by the first 20 hex characters of the SHA-256 of a
 * JSON object holding those four values. `id_basis`, when present, replaces
 * the claim as the hashed text.
 *
 * @param {object} input - `{ analyst_id, area, subject?, claim, id_basis? }`.
 * @returns {string} The finding id.
 */
function computeFindingId(input) {
  const basis = JSON.stringify({
    a: input.analyst_id,
    r: input.area,
    s: input.subject ?? "",
    c: normalizeClaim(input.id_basis ?? input.claim)
  });
  return `f_${createHash("sha256").update(basis).digest("hex").slice(0, 20)}`;
}

/**
 * Normalize claim text so cosmetic differences do not change the finding id:
 * lower-case it, collapse whitespace runs to single spaces, and drop
 * surrounding whitespace and trailing punctuation.
 *
 * Whitespace is trimmed before the punctuation is stripped; otherwise a claim
 * ending in ". " would keep its full stop and hash differently from the same
 * claim without the trailing space.
 *
 * @param {string} c - Raw claim text.
 * @returns {string} Normalized claim.
 */
function normalizeClaim(c) {
  return c.toLowerCase().replace(/\s+/g, " ").trim().replace(/[.!?;:,]+$/g, "").trimEnd();
}
|
|
593
|
+
/**
 * Wrap analyst output in the finding envelope.
 *
 * Adds `schema_version`, a `finding_id` computed from the analyst id, area,
 * subject and claim (or `id_basis`), and a `produced_at` timestamp that
 * defaults to the current time in ISO format. `id_basis` is used only for the
 * id and is not copied into the result.
 *
 * @param {object} init - Finding fields plus optional `id_basis` and `produced_at`.
 * @returns {object} The finding envelope; remaining `init` fields are spread last.
 */
function makeFinding(init) {
  const { id_basis: idBasis, produced_at: producedAt, ...fields } = init;
  const findingId = computeFindingId({
    analyst_id: fields.analyst_id,
    area: fields.area,
    subject: fields.subject,
    claim: fields.claim,
    id_basis: idBasis
  });
  const timestamp = producedAt ?? (/* @__PURE__ */ new Date()).toISOString();
  return {
    schema_version: "1.0.0",
    finding_id: findingId,
    produced_at: timestamp,
    ...fields
  };
}
|
|
608
|
+
|
|
609
|
+
// src/analyst/adapters.ts
|
|
610
|
+
// Revision suffix embedded in every adapter's `version` string.
var ADAPTER_REV = "1";

/**
 * Translate a verifier/judge severity into the analyst severity vocabulary.
 *
 * @param {string} s - Source severity: "critical", "major", "minor" or "info".
 * @returns {string} "critical", "high", "medium" or "info". Any unrecognized
 *   value maps to "info" so a lifted finding never carries an undefined
 *   severity; this matches the "info" fallback the semantic-concept judge
 *   applies to unknown severities.
 */
function liftSeverity(s) {
  switch (s) {
    case "critical":
      return "critical";
    case "major":
      return "high";
    case "minor":
      return "medium";
    case "info":
    default:
      return "info";
  }
}
|
|
623
|
+
/**
 * Create an analyst adapter that puts each configured question to the trace
 * analyst and converts the answers into findings.
 *
 * For every question, each bullet in `result.findings` becomes a "medium"
 * finding (the full answer is attached as rationale to the first one only).
 * When the analyst returns no bullets, the first 200 characters of the answer
 * become a single "info" finding. Processing stops early, returning what has
 * been collected, once `ctx.signal` is aborted.
 *
 * @param {object} opts - `questions` (required), plus optional `id`, `area`,
 *   `ai`, `model` and `extra` (spread into the analyzeTraces options).
 * @returns {object} Adapter with `id`, `description`, `inputKind`, `cost`,
 *   `version` and an async `analyze(store, ctx)` method.
 */
function createTraceAnalystAdapter(opts) {
  const id = opts.id ?? "trace-analyst";
  const area = opts.area ?? "agent-reasoning";

  async function analyze(store, ctx) {
    const collected = [];
    for (const question of opts.questions) {
      if (ctx.signal?.aborted) break;
      const result = await analyzeTraces(
        { question },
        { source: store, ai: opts.ai, model: opts.model, ...opts.extra }
      );
      // Prefer a caller-provided subject tag; otherwise use the question's first 60 characters.
      const subject = ctx.tags?.subject ?? question.slice(0, 60);
      const shared = { analyst_id: id, area, subject };

      if (result.findings.length === 0) {
        collected.push(
          makeFinding({
            ...shared,
            claim: result.answer.slice(0, 200),
            rationale: result.answer,
            severity: "info",
            confidence: 0.5,
            evidence_refs: [],
            metadata: {
              actor_prompt_version: result.actorPromptVersion,
              turns: result.turnCount
            }
          })
        );
      } else {
        result.findings.forEach((claim, index) => {
          collected.push(
            makeFinding({
              ...shared,
              claim,
              rationale: index === 0 ? result.answer : void 0,
              severity: "medium",
              confidence: 0.6,
              evidence_refs: [],
              metadata: { question, turns: result.turnCount, finding_index: index }
            })
          );
        });
      }
    }
    return collected;
  }

  return {
    id,
    description: "Runs the agent-eval trace analyst over an OTLP trace store and lifts its bulleted findings.",
    inputKind: "trace-store",
    cost: { kind: "llm", models: opts.model ? [opts.model] : void 0 },
    version: `trace-analyst-${ADAPTER_REV}`,
    analyze
  };
}
|
|
680
|
+
/**
 * Create an analyst adapter around a multi-layer verifier.
 *
 * Each layer's own findings are lifted through liftLayerFinding. A layer whose
 * status is "fail", "error" or "timeout" additionally yields one summary
 * finding ("medium" for a timeout, "high" otherwise). After all layers are
 * processed a completion record is sent to `ctx.log` when it exists.
 *
 * @param {object} opts - `verifier` (required), plus optional `id`, `area` and
 *   `options` (spread into the verifier's run argument).
 * @returns {object} Adapter with `id`, `description`, `inputKind`, `cost`,
 *   `version` and an async `analyze(env, ctx)` method.
 */
function createVerifierAdapter(opts) {
  const id = opts.id ?? "multi-layer-verifier";
  const area = opts.area ?? "verification";
  const failingStatuses = new Set(["fail", "error", "timeout"]);

  async function analyze(env, ctx) {
    const report = await opts.verifier.run({ env, ...opts.options });
    const collected = [];
    for (const layer of report.layers) {
      for (const finding of layer.findings) {
        collected.push(liftLayerFinding(id, area, layer.layer, finding));
      }
      if (!failingStatuses.has(layer.status)) continue;
      collected.push(
        makeFinding({
          analyst_id: id,
          area,
          subject: layer.layer,
          claim: `layer "${layer.layer}" ${layer.status}: ${layer.reason ?? "no reason given"}`,
          severity: layer.status === "timeout" ? "medium" : "high",
          confidence: 1,
          evidence_refs: [],
          metadata: {
            layer_status: layer.status,
            duration_ms: layer.durationMs,
            score: layer.score,
            diagnostics: layer.diagnostics
          }
        })
      );
    }
    ctx.log?.("verifier complete", {
      layers: report.layers.length,
      blended: report.blendedScore,
      all_pass: report.allPass
    });
    return collected;
  }

  return {
    id,
    description: "Runs a MultiLayerVerifier and lifts each layer's findings into the analyst envelope.",
    inputKind: "custom",
    cost: { kind: "deterministic" },
    version: `verifier-${ADAPTER_REV}`,
    analyze
  };
}
|
|
725
|
+
/**
 * Convert one verifier-layer finding into an analyst finding.
 *
 * @param {string} analyst_id - id of the analyst emitting the finding.
 * @param {string} area - area stamped on the finding.
 * @param {string} layer - fallback subject when the finding names no layer.
 * @param {object} f - layer finding; reads `layer`, `message`, `severity`,
 *   `evidence` and `detail`.
 * @returns {*} whatever `makeFinding` returns for the assembled fields.
 */
function liftLayerFinding(analyst_id, area, layer, f) {
  const subject = f.layer ?? layer;
  const severity = liftSeverity(f.severity);
  // Evidence is attached as a single inline artifact, and only when present.
  const evidence_refs = [];
  if (f.evidence) {
    evidence_refs.push({ kind: "artifact", uri: "inline:evidence", excerpt: f.evidence });
  }
  return makeFinding({
    analyst_id,
    area,
    subject,
    claim: f.message,
    severity,
    confidence: 0.85,
    evidence_refs,
    metadata: f.detail
  });
}
|
|
737
|
+
/**
 * Wrap a run critic as an analyst that reports weak score dimensions.
 *
 * `analyze(trace)` scores the trace with `critic.scoreTrace`, emits one
 * finding per numeric dimension below `threshold`, and one "drift" finding
 * when `driftPenalty` exceeds `1 - threshold`.
 *
 * @param {object} [opts]
 * @param {string} [opts.id="run-critic"] - analyst id.
 * @param {string} [opts.area="run-quality"] - area stamped on findings.
 * @param {{ scoreTrace: Function }} [opts.critic] - defaults to `new RunCritic()`.
 * @param {number} [opts.threshold=0.5] - cut-off below which a dimension is reported.
 * @returns {object} analyst descriptor with an async `analyze(trace)`.
 */
function createRunCriticAdapter(opts = {}) {
  const id = opts.id ?? "run-critic";
  const area = opts.area ?? "run-quality";
  const critic = opts.critic ?? new RunCritic();
  const threshold = opts.threshold ?? 0.5;

  // Checked in this order; the order determines the order of emitted findings.
  const checks = [
    { dimension: "success", severity: "critical", claim: "run did not complete successfully" },
    { dimension: "goalProgress", severity: "high", claim: "goal progress is low" },
    { dimension: "repoGroundedness", severity: "high", claim: "output is poorly grounded in the repository" },
    { dimension: "toolUseQuality", severity: "medium", claim: "tool use quality is low" },
    { dimension: "patchQuality", severity: "medium", claim: "no real patch/edit evidence" },
    { dimension: "testReality", severity: "high", claim: "no real test/build evidence" },
    { dimension: "finalGate", severity: "critical", claim: "final gate is blocking" }
  ];

  return {
    id,
    description: "Scores a single run across success / grounding / drift / tool-quality and surfaces below-threshold dimensions.",
    inputKind: "custom",
    cost: { kind: "deterministic" },
    version: `run-critic-${ADAPTER_REV}`,
    async analyze(trace) {
      const score = critic.scoreTrace(trace);
      const findings = [];

      for (const { dimension, severity, claim } of checks) {
        const value = score[dimension];
        // Non-numeric dimensions are ignored rather than treated as zero.
        if (typeof value !== "number" || !(value < threshold)) continue;
        findings.push(
          makeFinding({
            analyst_id: id,
            area,
            subject: dimension,
            claim,
            rationale: `${dimension}=${value.toFixed(2)} below threshold ${threshold}`,
            severity,
            confidence: 1,
            evidence_refs: [],
            metadata: { dimension, value, threshold, run_id: trace.run.runId }
          })
        );
      }

      const driftTooHigh = score.driftPenalty > 1 - threshold;
      if (driftTooHigh) {
        findings.push(
          makeFinding({
            analyst_id: id,
            area,
            subject: "drift",
            claim: "agent output drifted from repository signal",
            rationale: `driftPenalty=${score.driftPenalty.toFixed(2)}`,
            severity: "medium",
            confidence: 0.9,
            evidence_refs: [],
            metadata: { drift_penalty: score.driftPenalty, notes: score.notes }
          })
        );
      }
      return findings;
    }
  };
}
|
|
797
|
+
/**
 * Wrap a judge function as an analyst.
 *
 * `analyze(input)` awaits `opts.judge(opts.tcloud, input)` and turns every
 * score whose 0-10 normalised value is below `threshold` into a finding.
 *
 * @param {object} opts
 * @param {Function} opts.judge - called as `judge(tcloud, input)`; resolves to an array of scores.
 * @param {*} opts.tcloud - passed through to the judge untouched.
 * @param {string} [opts.id="judge"] - analyst id.
 * @param {string} [opts.area="judge"] - area stamped on findings.
 * @param {number} [opts.threshold=6] - scores strictly below this are reported.
 * @param {object} [opts.cost] - cost descriptor; defaults to `{ kind: "llm" }`.
 * @returns {object} analyst descriptor with an async `analyze(input)`.
 */
function createJudgeAdapter(opts) {
  const id = opts.id ?? "judge";
  const area = opts.area ?? "judge";
  const threshold = opts.threshold ?? 6;

  const isBelowThreshold = (entry) => normalize10(entry.score) < threshold;
  const toFinding = (entry) => liftJudgeScore(id, area, entry);

  return {
    id,
    description: "Wraps an agent-eval JudgeFn into an analyst; below-threshold dimensions surface as findings.",
    inputKind: "judge-input",
    cost: opts.cost ?? { kind: "llm" },
    version: `judge-${ADAPTER_REV}`,
    async analyze(input) {
      const scores = await opts.judge(opts.tcloud, input);
      const weak = scores.filter(isBelowThreshold);
      return weak.map(toFinding);
    }
  };
}
|
|
813
|
+
/**
 * Map a judge score onto a 0-10 scale.
 *
 * Values at or below 1 are treated as fractions and multiplied by 10;
 * anything larger is returned unchanged.
 *
 * @param {number} s - raw score.
 * @returns {number} score on the 0-10 scale.
 */
function normalize10(s) {
  if (s <= 1) {
    return s * 10;
  }
  return s;
}
|
|
816
|
+
/**
 * Convert one judge score into an analyst finding.
 *
 * Severity is derived from the 0-10 normalised score:
 * below 3 "critical", below 5 "high", below 7 "medium", otherwise "low".
 *
 * @param {string} analyst_id - id of the analyst emitting the finding.
 * @param {string} area - area stamped on the finding.
 * @param {object} s - judge score; reads `score`, `dimension`, `judgeName`,
 *   `reasoning` and `evidence`.
 * @returns {*} whatever `makeFinding` returns for the assembled fields.
 */
function liftJudgeScore(analyst_id, area, s) {
  const score10 = normalize10(s.score);

  let severity = "low";
  if (score10 < 3) {
    severity = "critical";
  } else if (score10 < 5) {
    severity = "high";
  } else if (score10 < 7) {
    severity = "medium";
  }

  const evidence_refs = s.evidence
    ? [{ kind: "artifact", uri: "inline:evidence", excerpt: s.evidence }]
    : [];

  return makeFinding({
    analyst_id,
    area,
    subject: s.dimension,
    claim: `${s.judgeName}/${s.dimension} scored ${score10.toFixed(1)}/10`,
    rationale: s.reasoning,
    severity,
    confidence: 0.8,
    evidence_refs,
    metadata: { judge_name: s.judgeName, dimension: s.dimension, score_10: score10 }
  });
}
|
|
831
|
+
/**
 * Wrap the semantic-concept judge as an analyst.
 *
 * `analyze(input)` runs `runSemanticConceptJudge(input, opts.options)`.
 * When the judge reports itself unavailable, a single "info" finding
 * carrying the error is returned. Otherwise every concept that is missing,
 * or present with a score below 7, becomes a finding.
 *
 * @param {object} [opts]
 * @param {string} [opts.id="semantic-concept-judge"] - analyst id.
 * @param {string} [opts.area="concept-coverage"] - area stamped on findings.
 * @param {object} [opts.options] - forwarded to the judge; `options.model`,
 *   when set, is also advertised in `cost.models`.
 * @returns {object} analyst descriptor with an async `analyze(input)`.
 */
function createSemanticConceptJudgeAdapter(opts = {}) {
  const id = opts.id ?? "semantic-concept-judge";
  const area = opts.area ?? "concept-coverage";
  const advertisedModels = opts.options?.model ? [opts.options.model] : void 0;

  // Finding returned in place of results when the judge could not run.
  const unavailableFinding = (error) =>
    makeFinding({
      analyst_id: id,
      area,
      claim: "semantic-concept judge unavailable",
      rationale: error,
      severity: "info",
      confidence: 1,
      evidence_refs: [],
      metadata: { reason: error }
    });

  const describe = (f) =>
    f.present ? `concept "${f.concept}" is weak (${f.score}/10)` : `concept "${f.concept}" is missing`;

  return {
    id,
    description: "Runs the semantic-concept judge and surfaces missing / weak concepts as findings.",
    inputKind: "custom",
    cost: { kind: "llm", models: advertisedModels },
    version: `${SEMANTIC_CONCEPT_JUDGE_VERSION}-adapter-${ADAPTER_REV}`,
    async analyze(input) {
      const result = await runSemanticConceptJudge(input, opts.options);
      if (!result.available) {
        return [unavailableFinding(result.error)];
      }

      const findings = [];
      for (const f of result.findings) {
        const healthy = f.present && f.score >= 7;
        if (healthy) continue;
        findings.push(
          makeFinding({
            analyst_id: id,
            area,
            subject: f.concept,
            claim: describe(f),
            rationale: f.evidence,
            severity: liftSeverity(f.severity),
            confidence: 0.85,
            evidence_refs: [{ kind: "artifact", uri: "inline:evidence", excerpt: f.evidence }],
            metadata: {
              concept: f.concept,
              present: f.present,
              score_10: f.score,
              cost_usd: result.costUsd ?? void 0
            }
          })
        );
      }
      return findings;
    }
  };
}
|
|
882
|
+
|
|
883
|
+
// src/analyst/chat-client.ts
|
|
884
|
+
/**
 * Build a chat client for the requested transport.
 *
 * - "router", "cli-bridge" and "direct-provider" wrap a new `LlmClient`
 *   (via `wrapLlmClient`). The first two fall back to a built-in base URL
 *   when `opts.baseUrl` is not set; "cli-bridge" authenticates with
 *   `opts.bearer` (empty string when absent) instead of `opts.apiKey`.
 * - "sandbox-sdk" delegates to `opts.chat`, "mock" to `opts.handler`; both
 *   receive the request with its model already resolved.
 *
 * @param {object} opts
 * @param {string} opts.transport - one of the transports listed above.
 * @param {string} [opts.defaultModel] - model used when a request names none.
 * @returns {{ transport: string, defaultModel: (string|undefined), chat: Function }}
 * @throws {Error} when `opts.transport` is not a known transport. Previously
 *   the function fell out of the switch and returned `undefined`, deferring
 *   the failure to the first `.chat` access.
 */
function createChatClient(opts) {
  switch (opts.transport) {
    case "router":
      return wrapLlmClient(
        opts.transport,
        opts.defaultModel,
        new LlmClient({
          baseUrl: opts.baseUrl ?? "https://router.tangle.tools/v1",
          apiKey: opts.apiKey
        })
      );
    case "cli-bridge":
      return wrapLlmClient(
        opts.transport,
        opts.defaultModel,
        new LlmClient({
          baseUrl: opts.baseUrl ?? "http://127.0.0.1:3344/v1",
          apiKey: opts.bearer ?? ""
        })
      );
    case "direct-provider":
      return wrapLlmClient(
        opts.transport,
        opts.defaultModel,
        new LlmClient({
          baseUrl: opts.baseUrl,
          apiKey: opts.apiKey
        })
      );
    case "sandbox-sdk":
      return {
        transport: "sandbox-sdk",
        defaultModel: opts.defaultModel,
        chat: async (req, callOpts) => opts.chat(resolveModel(req, opts.defaultModel), callOpts)
      };
    case "mock":
      return {
        transport: "mock",
        defaultModel: opts.defaultModel,
        chat: async (req, callOpts) => opts.handler(resolveModel(req, opts.defaultModel), callOpts)
      };
    default:
      // Fail loudly at construction time instead of handing back `undefined`.
      throw new Error(`createChatClient: unknown transport "${String(opts.transport)}"`);
  }
}
|
|
927
|
+
/**
 * Adapt an `LlmClient`-style object (anything with `call(request)`) to the
 * chat-client shape.
 *
 * `chat(req, callOpts)` resolves the model, forwards the request fields to
 * `inner.call`, and, when `callOpts.signal` is given, races the call against
 * the signal so the caller is released as soon as it aborts. The signal is
 * not passed to `inner.call`, so the underlying request itself keeps running.
 *
 * The abort listener is detached once the race settles. Without that, every
 * completed call would leave a listener (and its closure) on the signal,
 * which accumulates when one long-lived signal is shared across many calls.
 *
 * @param {string} transport - transport label exposed on the client.
 * @param {string|undefined} defaultModel - model used when `req.model` is unset.
 * @param {{ call: Function }} inner - client performing the actual request.
 * @returns {{ transport: string, defaultModel: (string|undefined), chat: Function }}
 */
function wrapLlmClient(transport, defaultModel, inner) {
  return {
    transport,
    defaultModel,
    chat: async (req, callOpts) => {
      const resolved = resolveModel(req, defaultModel);
      const call = inner.call({
        model: resolved.model,
        messages: req.messages,
        jsonMode: req.jsonMode,
        jsonSchema: req.jsonSchema,
        temperature: req.temperature,
        maxTokens: req.maxTokens,
        timeoutMs: req.timeoutMs
      });
      const signal = callOpts?.signal;
      if (!signal) return await call;

      let onAbort;
      const aborted = new Promise((_resolve, reject) => {
        onAbort = () => reject(toAbortError(signal));
        if (signal.aborted) {
          onAbort();
        } else {
          signal.addEventListener("abort", onAbort, { once: true });
        }
      });
      try {
        return await Promise.race([call, aborted]);
      } finally {
        // Harmless when the listener already fired or was never attached.
        signal.removeEventListener("abort", onAbort);
      }
    }
  };
}
|
|
947
|
+
/**
 * Turn an abort signal into a promise that can only reject.
 *
 * Rejects immediately when the signal is already aborted; otherwise rejects
 * when the signal's "abort" event fires. The promise never resolves.
 *
 * @param {AbortSignal} signal - signal to observe.
 * @returns {Promise<never>} rejects with the error built by `toAbortError`.
 */
function abortAsRejection(signal) {
  if (!signal.aborted) {
    return new Promise((_resolve, reject) => {
      const onAbort = () => {
        reject(toAbortError(signal));
      };
      signal.addEventListener("abort", onAbort, { once: true });
    });
  }
  return Promise.reject(toAbortError(signal));
}
|
|
953
|
+
/**
 * Pick the error to surface for an aborted signal.
 *
 * @param {{ reason?: unknown }} signal - aborted signal.
 * @returns {Error} `signal.reason` when it is already an `Error`; otherwise a
 *   new `Error` named "AbortError".
 */
function toAbortError(signal) {
  const { reason } = signal;
  if (!(reason instanceof Error)) {
    const fallback = new Error("ChatClient.chat: aborted");
    fallback.name = "AbortError";
    return fallback;
  }
  return reason;
}
|
|
960
|
+
/**
 * Make sure a chat request carries a model.
 *
 * @param {object} req - chat request; returned untouched when `req.model` is set.
 * @param {string} [defaultModel] - model filled in when the request has none.
 * @returns {object} `req` itself, or a shallow copy with `model` set to `defaultModel`.
 * @throws {Error} when neither the request nor the client provides a model.
 */
function resolveModel(req, defaultModel) {
  const requestHasModel = Boolean(req.model);
  if (requestHasModel) {
    return req;
  }
  if (defaultModel) {
    return { ...req, model: defaultModel };
  }
  throw new Error(
    "ChatClient.chat: no model on request and no defaultModel on the client. Either pass req.model or bind defaultModel at createChatClient()."
  );
}
|
|
969
|
+
|
|
970
|
+
// src/analyst/findings-store.ts
|
|
971
|
+
import { existsSync as existsSync2, readFileSync } from "fs";
|
|
972
|
+
|
|
973
|
+
// src/locked-jsonl-appender.ts
|
|
974
|
+
import { appendFileSync, existsSync, mkdirSync } from "fs";
|
|
975
|
+
import { dirname } from "path";
|
|
976
|
+
|
|
977
|
+
// src/concurrency.ts
|
|
978
|
+
/**
 * Minimal promise-based mutex with FIFO hand-off.
 *
 * `acquire()` resolves to a release function. When the lock is contended the
 * caller is queued, and `release()` passes ownership directly to the oldest
 * waiter without unlocking in between.
 */
var Mutex = class {
  locked = false;
  waiters = [];
  /**
   * Wait for the lock.
   * @returns {Promise<Function>} release function for this acquisition.
   *   Calling it more than once is a no-op: a repeated call used to run
   *   `release()` again, which could unlock the mutex while the next holder
   *   still owned it.
   */
  async acquire() {
    if (!this.locked) {
      this.locked = true;
      return this.#releaseOnce();
    }
    return new Promise((resolve) => {
      // Invoked by release() when ownership is handed to this waiter.
      this.waiters.push(() => {
        resolve(this.#releaseOnce());
      });
    });
  }
  /** Build a single-use release token bound to one acquisition. */
  #releaseOnce() {
    let released = false;
    return () => {
      if (released) return;
      released = true;
      this.release();
    };
  }
  /** Hand the lock to the oldest waiter, or unlock when nobody is waiting. */
  release() {
    const next = this.waiters.shift();
    if (next) {
      next();
    } else {
      this.locked = false;
    }
  }
  /**
   * Run `fn` while holding the lock; the lock is released even if `fn` throws.
   * @param {Function} fn - sync or async callback.
   * @returns {Promise<*>} the value `fn` returns or resolves to.
   */
  async runExclusive(fn) {
    const release = await this.acquire();
    try {
      return await fn();
    } finally {
      release();
    }
  }
  /** True iff someone holds the lock right now. Diagnostics only. */
  get isLocked() {
    return this.locked;
  }
  /** Pending waiter count. Diagnostics only. */
  get pending() {
    return this.waiters.length;
  }
};
|
|
1017
|
+
|
|
1018
|
+
// src/locked-jsonl-appender.ts
|
|
1019
|
+
// Mutex registry keyed by the exact path string handed to getMutex().
var mutexes = /* @__PURE__ */ new Map();
/**
 * Return the mutex registered for `path`, creating it on first use.
 *
 * @param {string} path - registry key; compared as-is, so two spellings of
 *   the same file get separate mutexes.
 * @returns {Mutex} the mutex registered under that key.
 */
function getMutex(path) {
  const existing = mutexes.get(path);
  if (existing) {
    return existing;
  }
  const created = new Mutex();
  mutexes.set(path, created);
  return created;
}
|
|
1028
|
+
/**
 * Appends JSON values to a file, one per line, while holding the mutex
 * registered for that path. The parent directory is created on construction
 * when it does not exist yet.
 */
var LockedJsonlAppender = class {
  path;
  mutex;
  /** @param {string} path - target JSONL file. */
  constructor(path) {
    this.path = path;
    this.mutex = getMutex(path);
    const parentDir = dirname(path);
    if (!existsSync(parentDir)) {
      mkdirSync(parentDir, { recursive: true });
    }
  }
  /**
   * Serialise `entry` and append it as a single newline-terminated line.
   * @param {*} entry - value passed to `JSON.stringify`.
   * @returns {Promise<void>}
   */
  async append(entry) {
    // Serialise outside the critical section; only the write is locked.
    const line = `${JSON.stringify(entry)}\n`;
    const writeLine = () => {
      appendFileSync(this.path, line);
    };
    await this.mutex.runExclusive(writeLine);
  }
};
|
|
1046
|
+
/**
 * Test helper: empty the module-level mutex registry.
 *
 * Appenders that were already constructed keep the mutex they captured;
 * only later `getMutex` lookups start from a clean registry.
 *
 * @returns {void}
 */
function resetLockedAppendersForTesting() {
  const registry = mutexes;
  registry.clear();
}
|
|
1049
|
+
|
|
1050
|
+
// src/analyst/findings-store.ts
|
|
1051
|
+
/**
 * JSONL-backed store of analyst findings: one finding per line, each tagged
 * with the `run_id` it was recorded under.
 */
var FindingsStore = class {
  path;
  appender;
  /** @param {string} path - JSONL file backing the store. */
  constructor(path) {
    this.path = path;
    this.appender = new LockedJsonlAppender(path);
  }
  /**
   * Persist findings one at a time, in order.
   * @param {string} runId - written as `run_id`, overriding any `run_id`
   *   already present on a finding.
   * @param {Iterable<object>} findings - findings to persist.
   * @returns {Promise<void>}
   */
  async append(runId, findings) {
    for (const finding of findings) {
      await this.appender.append({ ...finding, run_id: runId });
    }
  }
  /**
   * Load every persisted finding. A missing or empty file yields `[]`.
   * Blank lines are skipped, and any line that fails to parse as JSON is
   * discarded silently, wherever it appears in the file.
   * @returns {object[]} parsed rows in file order.
   */
  loadAll() {
    const rows = [];
    if (!existsSync2(this.path)) return rows;
    const text = readFileSync(this.path, "utf8");
    if (!text) return rows;
    text.split("\n").forEach((line) => {
      if (!line) return;
      let row;
      try {
        row = JSON.parse(line);
      } catch {
        // Best-effort read: an unparsable line is dropped, not reported.
        return;
      }
      rows.push(row);
    });
    return rows;
  }
  /**
   * Filter to a single run.
   * @param {string} runId - run to select.
   * @returns {object[]} rows whose `run_id` equals `runId`.
   */
  loadRun(runId) {
    const belongsToRun = (row) => row.run_id === runId;
    return this.loadAll().filter(belongsToRun);
  }
};
|
|
1084
|
+
/**
 * Default "did this finding change materially?" policy.
 *
 * Two versions of a finding differ materially when their severity differs,
 * their confidence moved by more than 0.05, or the number of evidence
 * references changed. A missing `confidence` counts as 0 and a missing
 * `evidence_refs` counts as zero references. The latter used to throw,
 * although rows read back from storage are not guaranteed to carry the field.
 *
 * @param {object} a - earlier version of the finding.
 * @param {object} b - later version of the finding.
 * @returns {boolean} true when the change is material.
 */
function defaultIsMaterial(a, b) {
  if (a.severity !== b.severity) return true;
  if (Math.abs((a.confidence ?? 0) - (b.confidence ?? 0)) > 0.05) return true;
  const refCountA = a.evidence_refs?.length ?? 0;
  const refCountB = b.evidence_refs?.length ?? 0;
  return refCountA !== refCountB;
}
|
|
1090
|
+
/**
 * Compare two sets of findings by `finding_id`.
 *
 * @param {object[]} previous - findings from the earlier run.
 * @param {object[]} current - findings from the later run.
 * @param {object} [policy]
 * @param {Function} [policy.isMaterial] - `(prev, cur) => boolean`; decides
 *   whether a finding present in both runs counts as changed. Defaults to
 *   `defaultIsMaterial`.
 * @returns {{ appeared: object[], disappeared: object[], persisted: object[], changed: Array<{previous: object, current: object}> }}
 *   `appeared` / `persisted` hold current findings, `disappeared` holds
 *   previous ones. When an id repeats within one input, the last entry wins.
 */
function diffFindings(previous, current, policy = {}) {
  const isMaterial = policy.isMaterial ?? defaultIsMaterial;
  const indexById = (findings) => new Map(findings.map((f) => [f.finding_id, f]));
  const before = indexById(previous);
  const after = indexById(current);

  const diff = { appeared: [], disappeared: [], persisted: [], changed: [] };

  after.forEach((cur, id) => {
    const prev = before.get(id);
    if (!prev) {
      diff.appeared.push(cur);
    } else if (isMaterial(prev, cur)) {
      diff.changed.push({ previous: prev, current: cur });
    } else {
      diff.persisted.push(cur);
    }
  });

  before.forEach((prev, id) => {
    if (!after.has(id)) {
      diff.disappeared.push(prev);
    }
  });

  return diff;
}
|
|
1115
|
+
|
|
1116
|
+
// src/analyst/finding-signature.ts
|
|
1117
|
+
import { z } from "zod";
|
|
1118
|
+
// Severity labels accepted on a raw finding, listed from most to least severe.
var ANALYST_SEVERITIES = ["critical", "high", "medium", "low", "info"];
/**
 * Zod schema for one raw finding as emitted by an analyst agent.
 * The schema is strict, so objects with unknown keys fail validation.
 */
var RawAnalystFindingSchema = z
  .object({
    severity: z.enum(ANALYST_SEVERITIES),
    // Required, non-empty, at most 2000 characters.
    claim: z.string().min(1).max(2000),
    subject: z.string().max(400).optional(),
    // Required, non-empty, at most 2000 characters.
    evidence_uri: z.string().min(1).max(2000),
    evidence_excerpt: z.string().max(2000).optional(),
    // Inclusive range 0..1.
    confidence: z.number().min(0).max(1),
    rationale: z.string().max(4000).optional(),
    recommended_action: z.string().max(2000).optional()
  })
  .strict();
|
|
1129
|
+
var RAW_FINDING_SCHEMA_PROMPT = `Each finding MUST be a JSON object with these fields:
|
|
1130
|
+
- severity: one of "critical" | "high" | "medium" | "low" | "info"
|
|
1131
|
+
- claim: one-sentence statement (max 2000 chars)
|
|
1132
|
+
- subject?: the leaf id, agent id, span id, tool name, or noun phrase the finding is about
|
|
1133
|
+
- evidence_uri: "span://<trace_id>/<span_id>" for trace evidence, "artifact://<relative-path>" for files, "metric://<name>" for named scalars \u2014 ALWAYS cite a real id surfaced by the tools
|
|
1134
|
+
- evidence_excerpt?: short quote (<=2000 chars) from the cited span/artifact
|
|
1135
|
+
- confidence: number 0..1 \u2014 0.9+ when backed by exact quotes, 0.6-0.8 for inferred patterns, <0.5 for speculative
|
|
1136
|
+
- rationale?: one or two sentences explaining the reasoning
|
|
1137
|
+
- recommended_action?: concrete change phrased as an imperative ("Add ...", "Replace ...", "Stop ...") \u2014 omit when the finding is purely descriptive
|
|
1138
|
+
|
|
1139
|
+
Emit an empty array when the question has no findings to report. Do not fabricate evidence.`;
|
|
1140
|
+
/**
 * Validate one raw finding against `RawAnalystFindingSchema`.
 *
 * @param {unknown} row - candidate finding.
 * @param {Function} [log] - optional `(message, data)` logger; receives the
 *   flattened validation issues when the row is rejected.
 * @returns {object|null} the parsed finding, or `null` when validation fails.
 */
function parseRawFinding(row, log) {
  const outcome = RawAnalystFindingSchema.safeParse(row);
  if (outcome.success) {
    return outcome.data;
  }
  const issues = outcome.error.issues.map((issue) => ({
    path: issue.path.join("."),
    code: issue.code,
    message: issue.message
  }));
  log?.("finding rejected: schema failure", { issues });
  return null;
}
|
|
1150
|
+
|
|
1151
|
+
// src/analyst/kind-factory.ts
|
|
1152
|
+
import { agent, AxJSRuntime } from "@ax-llm/ax";
|
|
1153
|
+
/**
 * Build a trace-store analyst from a declarative kind spec.
 *
 * `analyze(store, ctx)` builds the spec's tools for the store, runs an
 * `@ax-llm/ax` agent with signature `question:string -> findings:json[]`,
 * validates every returned row with `parseRawFinding`, optionally passes it
 * through `spec.postProcess`, and converts survivors with `toAnalystFinding`.
 *
 * `spec.postProcess(parsed, ctx)` may return:
 * - a finding object: used in place of the parsed row;
 * - `undefined`: the parsed row is kept as-is;
 * - `null` or another falsy value: the row is dropped. Previously a `null`
 *   return was coalesced back to the parsed row, so it could not drop anything.
 *
 * @param {object} spec - kind spec; reads `id`, `description`, `version`,
 *   `cost`, `actorDescription`, `buildTools`, and optionally `recursion`,
 *   `maxTurns`, `maxRuntimeChars`, `responderDescription`, `postProcess`.
 * @param {object} opts
 * @param {*} opts.ai - AI service handed to the agent's `forward`.
 * @param {string} [opts.model] - model override for actor and responder.
 * @param {string} [opts.versionSuffix] - appended to `spec.version` as `+suffix`.
 * @returns {object} analyst descriptor with an async `analyze(store, ctx)`.
 */
function createTraceAnalystKind(spec, opts) {
  const version = opts.versionSuffix ? `${spec.version}+${opts.versionSuffix}` : spec.version;
  return {
    id: spec.id,
    description: spec.description,
    inputKind: "trace-store",
    cost: spec.cost,
    version,
    async analyze(store, ctx) {
      const tools = spec.buildTools(store);
      const maxDepth = spec.recursion?.maxDepth ?? 0;
      const maxParallel = spec.recursion?.maxParallelSubagents ?? 2;
      const actorDescription = [
        spec.actorDescription.trim(),
        RAW_FINDING_SCHEMA_PROMPT,
        "Return the array in the `findings` output field. Use `final(...)` with the structured `{ findings }` payload when you are done."
      ].join("\n\n");
      const modelOverride = opts.model ? { model: opts.model } : {};
      const ax = agent(
        "question:string -> findings:json[]",
        {
          agentIdentity: {
            name: spec.id,
            description: spec.description
          },
          contextFields: ["question"],
          runtime: new AxJSRuntime({
            permissions: [],
            blockDynamicImport: true,
            allowedModules: [],
            freezeIntrinsics: true,
            blockShadowRealm: true,
            preventGlobalThisExtensions: false
          }),
          // Recursion (sub-agents) is only enabled when the spec asks for depth.
          mode: maxDepth > 0 ? "advanced" : "simple",
          recursionOptions: maxDepth > 0 ? { maxDepth } : void 0,
          maxTurns: spec.maxTurns ?? 12,
          maxRuntimeChars: spec.maxRuntimeChars ?? 6e3,
          maxBatchedLlmQueryConcurrency: maxParallel,
          promptLevel: "detailed",
          contextPolicy: { preset: "full", budget: "balanced" },
          functions: { local: tools },
          actorOptions: {
            description: actorDescription,
            ...modelOverride,
            showThoughts: false,
            thinkingTokenBudget: "none"
          },
          responderOptions: {
            description: spec.responderDescription ?? "Format the structured `findings` array exactly as the actor produced it. Do not add, drop, or summarize entries.",
            ...modelOverride,
            showThoughts: false
          },
          bubbleErrors: [TraceFileMissingError]
        }
      );
      ctx.log?.(`analyst.kind ${spec.id} forward`, {
        max_depth: maxDepth,
        tool_count: tools.length,
        tags: ctx.tags
      });
      const result = await ax.forward(opts.ai, { question: deriveQuestion(ctx, spec) });
      const out = [];
      const rawRows = Array.isArray(result.findings) ? result.findings : [];
      for (const row of rawRows) {
        const parsed = parseRawFinding(row, ctx.log);
        if (!parsed) continue;
        const processed = spec.postProcess ? spec.postProcess(parsed, ctx) : parsed;
        // `undefined` means "no change"; any other falsy value drops the row.
        const finding = processed === void 0 ? parsed : processed;
        if (!finding) continue;
        out.push(toAnalystFinding(spec, finding));
      }
      ctx.log?.(`analyst.kind ${spec.id} done`, {
        emitted: rawRows.length,
        accepted: out.length
      });
      return out;
    }
  };
}
|
|
1227
|
+
/**
 * Build the question string handed to the analyst agent.
 *
 * @param {object} ctx - analysis context; `ctx.tags.focus` is optional.
 * @param {object} spec - kind spec; only `spec.id` is read.
 * @returns {string} `"<id>: <focus>"` when a non-blank focus tag is set,
 *   otherwise just the spec id.
 */
function deriveQuestion(ctx, spec) {
  const focus = ctx.tags?.focus?.trim();
  return focus ? `${spec.id}: ${focus}` : spec.id;
}
|
|
1232
|
+
/**
 * Convert a validated raw finding into an analyst finding.
 *
 * The raw finding's single `evidence_uri` / `evidence_excerpt` pair becomes
 * one evidence reference whose kind is derived from the URI scheme.
 *
 * @param {object} spec - kind spec; reads `id`, `area` and `version`.
 * @param {object} raw - validated raw finding.
 * @returns {*} whatever `makeFinding` returns for the assembled fields.
 */
function toAnalystFinding(spec, raw) {
  const evidence = {
    kind: evidenceKindFromUri(raw.evidence_uri),
    uri: raw.evidence_uri,
    excerpt: raw.evidence_excerpt
  };
  const { subject, claim, rationale, severity, confidence, recommended_action } = raw;
  return makeFinding({
    analyst_id: spec.id,
    area: spec.area,
    subject,
    claim,
    rationale,
    severity,
    confidence,
    evidence_refs: [evidence],
    recommended_action,
    metadata: { kind_version: spec.version }
  });
}
|
|
1252
|
+
/**
 * Derive the evidence kind from an evidence URI's scheme.
 *
 * @param {string} uri - evidence URI such as `span://<trace>/<span>`.
 * @returns {"span"|"artifact"|"metric"|"event"|"finding"} the recognised
 *   scheme, or "artifact" when the URI matches none of them.
 */
function evidenceKindFromUri(uri) {
  const knownKinds = ["span", "artifact", "metric", "event", "finding"];
  const matched = knownKinds.find((kind) => uri.startsWith(`${kind}://`));
  return matched ?? "artifact";
}
|
|
1260
|
+
|
|
1261
|
+
// src/analyst/tool-groups.ts
|
|
1262
|
+
/**
 * Tool-name allow-lists per tool group. The "all" entry is intentionally
 * empty: `buildTraceToolsForGroup` special-cases it and returns every tool.
 */
var TOOL_NAMES_BY_GROUP = /* @__PURE__ */ (() => {
  // Dataset-level tools shared by the discovery-based groups.
  const discoveryTools = ["getDatasetOverview", "queryTraces", "countTraces"];
  return {
    all: new Set(),
    discovery: new Set(discoveryTools),
    discoveryAndRead: new Set([...discoveryTools, "viewTrace", "viewSpans"]),
    discoveryAndSearch: new Set([...discoveryTools, "searchTrace", "searchSpan"]),
    targeted: new Set(["getDatasetOverview", "queryTraces", "viewSpans", "searchSpan"])
  };
})();
|
|
1275
|
+
/**
 * Build the trace-analyst tools for a store, restricted to one tool group.
 *
 * @param {string} group - a key of `TOOL_NAMES_BY_GROUP`; "all" returns every tool.
 * @param {*} store - trace store handed to `buildTraceAnalystTools`.
 * @returns {object[]} tools whose `name` is allowed for the group.
 * @throws {Error} when `group` is not a known group. The check uses own
 *   properties only, so inherited names such as "toString" or "constructor"
 *   are rejected here instead of failing later with an unrelated TypeError.
 */
function buildTraceToolsForGroup(group, store) {
  // Validate before building so an invalid group does no work.
  if (!Object.hasOwn(TOOL_NAMES_BY_GROUP, group)) {
    throw new Error(`unknown trace tool group: ${group}`);
  }
  const all = buildTraceAnalystTools({ store });
  if (group === "all") return all;
  const allow = TOOL_NAMES_BY_GROUP[group];
  return all.filter((tool) => allow.has(tool.name));
}
|
|
1282
|
+
|
|
1283
|
+
// src/analyst/kinds/failure-mode.ts
|
|
1284
|
+
var ACTOR_PROMPT = `You are a failure-mode classifier for an OTLP trace dataset. Your job is to identify the **distinct ways agents failed** in this dataset, not to grade individual runs.
|
|
1285
|
+
|
|
1286
|
+
DISCOVERY \u2192 CLUSTER \u2192 CITE protocol:
|
|
1287
|
+
|
|
1288
|
+
1. Call \`traces.getDatasetOverview({})\` first. Use \`has_errors\`, \`models\`, \`agent_names\`, \`tools\`, and \`sample_trace_ids\` to size the failure surface.
|
|
1289
|
+
2. Use \`traces.queryTraces({ filters: { has_errors: true }, limit })\` to pull error-bearing traces. Combine with \`traces.countTraces\` to see what fraction of the dataset failed.
|
|
1290
|
+
3. For each candidate failure cluster, use \`traces.searchTrace\` with regex like \`STATUS_CODE_ERROR\`, \`MaxTurnsExceeded\`, \`assertion\`, \`unauthorized\`, \`timeout\`, \`429\`, \`5\\d\\d\`, the agent's specific error strings, or the names of its tools. Pull one or two representative traces per cluster, **not all** of them.
|
|
1291
|
+
4. **Cluster, do not enumerate.** Two failures with the same root cause should be ONE finding citing both traces, not two findings. The point of this analyst is to compress N runs into K modes.
|
|
1292
|
+
5. For each cluster you can defend with evidence, emit ONE finding with:
|
|
1293
|
+
- \`area\` = "failure-mode"
|
|
1294
|
+
- \`subject\` = a short label for the cluster ("tool-call-loop", "auth-revoked-mid-run", "agent-asked-clarification-too-late", ...)
|
|
1295
|
+
- \`claim\` = one sentence stating the mode
|
|
1296
|
+
- \`severity\` = "critical" when it blocks the run, "high" when the run finished degraded, "medium" when it slowed convergence
|
|
1297
|
+
- \`evidence_uri\` = \`span://<trace_id>/<span_id>\` of the most representative span
|
|
1298
|
+
- \`evidence_excerpt\` = the exact quote (e.g. error message, stuck tool call payload, contradictory turn output)
|
|
1299
|
+
- \`confidence\` = 0.85+ when multiple traces show the same shape; 0.6-0.8 for a single-trace inference; <0.5 for speculative.
|
|
1300
|
+
- \`recommended_action\` = imperative-phrased fix idea (kept short \u2014 the improvement-analyst will expand on these)
|
|
1301
|
+
|
|
1302
|
+
If the dataset has no failures, return an empty findings array \u2014 do NOT pad with low-confidence speculation.
|
|
1303
|
+
|
|
1304
|
+
**Delegate aggressively.** The recursion budget is there to be used:
|
|
1305
|
+
- After your first \`getDatasetOverview\` + \`queryTraces\` calls, you should have 3-6 candidate failure clusters in mind. Spawn one \`llmQuery\` per cluster in a single batch \u2014 they investigate in parallel.
|
|
1306
|
+
- A sub-investigator that finds its cluster is actually two distinct modes should split again at its own level. Recursion is meant to discover sub-modes, not to do trivial drilling that the parent could do in-line.
|
|
1307
|
+
- Pass narrow context to each subagent: { question: 'investigate the auth-revoked-mid-run cluster', context: { trace_ids: ['abc', 'def'], suspected_root_cause: 'token refresh skipped on idle sessions' } }. Subagents need enough context to skip re-discovery but not the whole conversation.
|
|
1308
|
+
- Each subagent returns its findings as JSON; the parent merges them. Do NOT have subagents call \`final()\` \u2014 they return their findings list to you, and you call \`final()\` once at the top.
|
|
1309
|
+
|
|
1310
|
+
OBSERVABILITY rules:
|
|
1311
|
+
- Each non-final turn must emit at least one \`console.log\` for evidence.
|
|
1312
|
+
- Reuse runtime variables across turns; don't recompute.
|
|
1313
|
+
- Call \`final({ findings: [...] })\` exactly once, after you've gathered evidence for every cluster you intend to report.`;
|
|
1314
|
+
/**
 * Kind spec for the failure-mode analyst: the actor prompt above, the full
 * trace tool set, and a recursion budget of depth 3 with up to 4 parallel
 * sub-agents.
 */
var FAILURE_MODE_KIND_SPEC = {
  id: "failure-mode",
  description: "Clusters trace-dataset failures into distinct failure modes with cited evidence and a short recommended action.",
  area: "failure-mode",
  version: "1.0.0",
  actorDescription: ACTOR_PROMPT,
  // Unfiltered tool set ("all" group).
  buildTools(store) {
    return buildTraceToolsForGroup("all", store);
  },
  recursion: {
    maxDepth: 3,
    maxParallelSubagents: 4
  },
  maxTurns: 24,
  cost: { kind: "llm" }
};
|
|
1325
|
+
|
|
1326
|
+
// src/analyst/kinds/knowledge-gap.ts
|
|
1327
|
+
var ACTOR_PROMPT2 = `You are a knowledge-gap analyst for an OTLP trace dataset. Your job is to identify the **specific pieces of information the agent lacked, or that were stale**, that caused poor decisions.
|
|
1328
|
+
|
|
1329
|
+
The agent under analysis maintains a curated knowledge base via \`@tangle-network/agent-knowledge\` \u2014 a wiki of \`KnowledgePage\`s with raw source anchors, claims, and relations. The primary expected store of agent-knowable facts IS that wiki. A "knowledge gap" is anything the agent had to discover or guess at run-time that the wiki should have held \u2014 or an outdated/contradictory fact the agent picked up from a non-wiki source.
|
|
1330
|
+
|
|
1331
|
+
DISCOVERY \u2192 ATTRIBUTE-TO-LAYER \u2192 CITE protocol:
|
|
1332
|
+
|
|
1333
|
+
1. \`traces.getDatasetOverview({})\` first. Note which agents, tools, and models appear.
|
|
1334
|
+
2. Pull traces where the agent shows gap signals. The strongest signals are:
|
|
1335
|
+
- Self-correction turns ("I assumed X but\u2026", "let me re-check", "actually,")
|
|
1336
|
+
- Clarifying-question turns where the agent asked the user something the runtime should have surfaced
|
|
1337
|
+
- Repeated retrieval / lookup calls for the same artifact with slightly varied queries
|
|
1338
|
+
- Tool errors that name a missing argument or unknown resource
|
|
1339
|
+
- Web-search calls returning pages dated before a known cutoff for content that changes (versioned APIs, schemas, policies)
|
|
1340
|
+
- Agent quoting a tool's docs / system prompt incorrectly because the actual text was insufficient
|
|
1341
|
+
- Fabricated identifiers that don't appear in dataset \`sample_trace_ids\`
|
|
1342
|
+
Use \`traces.searchTrace\` with patterns like \`I (don.?t|do not) know\`, \`assumed\`, \`unclear\`, \`could you (clarify|tell me|provide)\`, \`not found\`, \`undefined\`, \`unknown\`, \`null\`, dates older than the analysis window, or the agent's specific clarification phrases.
|
|
1343
|
+
3. For each gap, identify the **layer of the runtime that should have prevented it**. The locus is the value of \`subject\` on the finding. Use one of:
|
|
1344
|
+
- \`agent-knowledge:wiki:<page-slug>\` \u2014 the wiki page that should exist but doesn't, or exists but lacks the claim
|
|
1345
|
+
- \`agent-knowledge:wiki:<page-slug>#<heading>\` \u2014 wiki page exists but a specific section is missing
|
|
1346
|
+
- \`agent-knowledge:claim:<topic>\` \u2014 a specific claim/relation triple that should be in the wiki
|
|
1347
|
+
- \`agent-knowledge:raw:<source-id>\` \u2014 raw source captured but never lifted into a curated page
|
|
1348
|
+
- \`agent-knowledge:stale:<page-slug>\` \u2014 wiki page exists but contradicts ground-truth evidence in this trace (the wiki itself drifted)
|
|
1349
|
+
- \`websearch:outdated:<topic>\` \u2014 agent relied on a web result that was stale; wiki should have superseded it
|
|
1350
|
+
- \`tool-doc:<tool-name>:<aspect>\` \u2014 tool description missed a behavior aspect (return shape, failure modes, side effects)
|
|
1351
|
+
- \`system-prompt:<section>\` \u2014 system prompt should have stated the rule directly
|
|
1352
|
+
- \`memory:<key>\` \u2014 prior-run memory should have surfaced an earlier decision
|
|
1353
|
+
4. For each gap you can defend with evidence, emit ONE finding with:
|
|
1354
|
+
- \`area\` = "knowledge-gap"
|
|
1355
|
+
- \`subject\` = the locus string from the list above
|
|
1356
|
+
- \`claim\` = a sentence naming the missing or stale knowledge ("wiki has no page on invoice line-item shape, agent had to re-derive it from raw spans")
|
|
1357
|
+
- \`severity\` = "high" when the gap caused a failure or a clarifying question; "medium" when it caused unnecessary turns; "low" when it caused minor inefficiency
|
|
1358
|
+
- \`evidence_uri\` = \`span://<trace_id>/<span_id>\` of the moment the gap surfaced (the question, the self-correction, the retrieval miss, the stale web result)
|
|
1359
|
+
- \`evidence_excerpt\` = exact quote where the agent showed the gap
|
|
1360
|
+
- \`confidence\` = 0.85+ when the agent itself articulated the gap; 0.6-0.8 when inferred from behavior
|
|
1361
|
+
- \`recommended_action\` = phrased as a wiki edit when the locus is \`agent-knowledge:*\` ("Create wiki page \`invoice-line-items\` with claims: ..."), or as a prompt/tool-doc edit otherwise
|
|
1362
|
+
|
|
1363
|
+
**Delegate per layer.** After your first scan, you should have candidates spread across \`agent-knowledge:*\`, \`websearch:outdated\`, \`tool-doc:*\`, \`system-prompt:*\`, and \`memory:*\`. Spawn one \`llmQuery\` per layer in parallel \u2014 each subagent runs a focused detection (e.g. the \`agent-knowledge\` subagent looks for both missing-pages AND stale-pages; the \`websearch\` subagent looks specifically for date staleness signals; the \`tool-doc\` subagent looks for tool-call argument errors a fuller description would have prevented). Subagents return findings; you merge and emit one \`final({ findings })\` at the top.
|
|
1364
|
+
|
|
1365
|
+
Do NOT report a gap that the agent later recovered from cleanly within the same turn \u2014 that's resilience, not a gap. Cite the *non-recovery* version when both exist.
|
|
1366
|
+
|
|
1367
|
+
OBSERVABILITY rules:
|
|
1368
|
+
- Each non-final turn must emit at least one \`console.log\` for evidence.
|
|
1369
|
+
- Call \`final({ findings: [...] })\` exactly once at the top level.`;
|
|
1370
|
+
// Kind spec for the "knowledge-gap" trace analyst. It pairs the ACTOR_PROMPT2
// prompt with the "discoveryAndSearch" trace-tool group, which is built per
// store through buildTraceToolsForGroup.
// Declared limits: recursion maxDepth 2 with maxParallelSubagents 4, maxTurns 18.
// NOTE(review): how the runner enforces these limits is not visible in this
// chunk — confirm against the code that consumes kind specs.
var KNOWLEDGE_GAP_KIND_SPEC = {
|
|
1371
|
+
id: "knowledge-gap",
|
|
1372
|
+
description: "Identifies missing or stale pieces of knowledge \u2014 primarily against the agent-knowledge wiki \u2014 and attributes each to the runtime layer (wiki page, claim, raw source, websearch, tool-doc, system-prompt, memory) that should have held it.",
|
|
1373
|
+
area: "knowledge-gap",
|
|
1374
|
+
version: "1.0.0",
|
|
1375
|
+
actorDescription: ACTOR_PROMPT2,
|
|
1376
|
+
buildTools: (store) => buildTraceToolsForGroup("discoveryAndSearch", store),
|
|
1377
|
+
recursion: { maxDepth: 2, maxParallelSubagents: 4 },
|
|
1378
|
+
maxTurns: 18,
|
|
1379
|
+
cost: { kind: "llm" }
|
|
1380
|
+
};
|
|
1381
|
+
|
|
1382
|
+
// src/analyst/kinds/knowledge-poisoning.ts
|
|
1383
|
+
var ACTOR_PROMPT3 = `You are a knowledge-poisoning analyst for an OTLP trace dataset. Your job is to identify cases where the agent **confidently used wrong information** \u2014 not where it lacked information (that's the knowledge-gap analyst).
|
|
1384
|
+
|
|
1385
|
+
DISCOVERY \u2192 DUAL-VERIFY \u2192 CITE protocol:
|
|
1386
|
+
|
|
1387
|
+
1. \`traces.getDatasetOverview({})\` first. Identify the agents, models, and tools.
|
|
1388
|
+
2. Pull traces where the agent's confident action was later contradicted. Strongest signals:
|
|
1389
|
+
- Agent stated a fact in one span; a later span surfaced contradictory evidence; the agent then proceeded anyway or fabricated reconciliation.
|
|
1390
|
+
- Tool call with stale arguments (an id that no longer exists, an API shape that changed).
|
|
1391
|
+
- Agent cited an \`agent-knowledge\` wiki page or claim whose content contradicts the trace's own evidence \u2014 the wiki itself drifted.
|
|
1392
|
+
- Web-search result the agent cited that returned an outdated page; agent treated it as canonical.
|
|
1393
|
+
- System-prompt instruction the agent followed that ground-truth evidence in the trace contradicts (e.g. prompt says "use endpoint A"; tool reply says "endpoint A deprecated, use B").
|
|
1394
|
+
- Repeated wrong-shape parsing despite the tool's actual output proving the shape.
|
|
1395
|
+
3. Use \`traces.searchTrace\` with regex on phrases like \`actually\`, \`turns out\`, \`previously assumed\`, \`old version\`, \`deprecated\`, \`updated to\`, \`now uses\`, or specific entity names you suspect have changed.
|
|
1396
|
+
4. For each candidate poisoning, **DUAL-VERIFY**:
|
|
1397
|
+
- Confirm the agent actually acted on the false belief (cite the span where it did)
|
|
1398
|
+
- Confirm the belief is actually false in this trace's own evidence (cite the span that contradicts it)
|
|
1399
|
+
Only emit a finding when both halves are nailed down. If you can only nail one, drop it \u2014 single-evidence poisoning findings are too speculative to be useful.
|
|
1400
|
+
|
|
1401
|
+
**Delegate the dual-verify.** Use the recursion budget so each candidate poisoning gets one subagent investigating "did the agent act?" and one investigating "is the belief false?". After your first scan, fire off N parallel \`llmQuery\` pairs (one cluster per pair). Subagents return their findings; you accept only the ones where BOTH halves of the pair were confirmed.
|
|
1402
|
+
|
|
1403
|
+
For each confirmed poisoning, emit ONE finding with:
|
|
1404
|
+
- \`area\` = "knowledge-poisoning"
|
|
1405
|
+
- \`subject\` = the source of the false belief, one of: \`agent-knowledge:wiki:<page-slug>\` (wiki page contradicts current ground truth), \`agent-knowledge:claim:<topic>\` (a specific claim/relation went stale), \`agent-knowledge:raw:<source-id>\` (the raw source is outdated and the wiki inherited the drift), \`websearch:outdated:<url-or-topic>\`, \`tool-doc:<tool>\`, \`system-prompt:<section>\`, \`memory:<key>\`, \`prior-run-summary:<topic>\`
|
|
1406
|
+
- \`claim\` = one sentence: "agent believed X (from source S); evidence in trace shows X is false"
|
|
1407
|
+
- \`severity\` = "critical" when poisoning caused a wrong user-visible action; "high" when caught internally but wasted significant work; "medium" for inefficiency only
|
|
1408
|
+
- \`evidence_uri\` = \`span://<trace_id>/<span_id>\` of the action span (the moment the agent acted on the false belief)
|
|
1409
|
+
- \`evidence_excerpt\` = exact quote of the confident-but-wrong claim or action
|
|
1410
|
+
- \`confidence\` = 0.85+ when both halves are exact-quote backed; 0.6-0.8 when one half is inferred
|
|
1411
|
+
- \`recommended_action\` = where the source should be updated and how ("Update wiki page \`X\` claim \`Y\` to '...'", "Invalidate raw source \`Z\` and re-curate", "Replace system-prompt section X with 'tool foo now returns Y'")
|
|
1412
|
+
|
|
1413
|
+
Do NOT report a finding if the agent caught and corrected the false belief in the same turn \u2014 that's the system working. Reserve poisoning for cases where the false belief shaped downstream action.
|
|
1414
|
+
|
|
1415
|
+
OBSERVABILITY rules:
|
|
1416
|
+
- Each non-final turn must emit at least one \`console.log\` for evidence.
|
|
1417
|
+
- Call \`final({ findings: [...] })\` exactly once at the top level.`;
|
|
1418
|
+
// Kind spec for the "knowledge-poisoning" trace analyst. It pairs the
// ACTOR_PROMPT3 prompt with the "all" trace-tool group, which is built per
// store through buildTraceToolsForGroup.
// Declared limits: recursion maxDepth 2 with maxParallelSubagents 4, maxTurns 20.
// NOTE(review): how the runner enforces these limits is not visible in this
// chunk — confirm against the code that consumes kind specs.
var KNOWLEDGE_POISONING_KIND_SPEC = {
|
|
1419
|
+
id: "knowledge-poisoning",
|
|
1420
|
+
description: "Identifies confident-but-wrong actions caused by stale memory, contradicting RAG, deprecated tool docs, or outdated system-prompt instructions.",
|
|
1421
|
+
area: "knowledge-poisoning",
|
|
1422
|
+
version: "1.0.0",
|
|
1423
|
+
actorDescription: ACTOR_PROMPT3,
|
|
1424
|
+
buildTools: (store) => buildTraceToolsForGroup("all", store),
|
|
1425
|
+
recursion: { maxDepth: 2, maxParallelSubagents: 4 },
|
|
1426
|
+
maxTurns: 20,
|
|
1427
|
+
cost: { kind: "llm" }
|
|
1428
|
+
};
|
|
1429
|
+
|
|
1430
|
+
// src/analyst/kinds/improvement.ts
|
|
1431
|
+
var ACTOR_PROMPT4 = `You are a recursive-self-improvement analyst. Your job is to propose **concrete, locus-named edits** the agent's runtime should adopt to fix the failure modes, knowledge gaps, and poisonings present in this dataset.
|
|
1432
|
+
|
|
1433
|
+
Upstream analysts have already classified the problems. Your job is to convert each problem into a *change to make* and grade its expected leverage. Each finding is one proposed edit.
|
|
1434
|
+
|
|
1435
|
+
DISCOVERY \u2192 CANDIDATE-FIXES \u2192 COMPETE \u2192 CITE protocol:
|
|
1436
|
+
|
|
1437
|
+
1. \`traces.getDatasetOverview({})\` first. Note the agents, tools, and any system-prompt fingerprints (look for the prompt text echoed in early spans).
|
|
1438
|
+
2. For each high-severity failure pattern, generate 2-3 candidate fixes. Real candidate axes:
|
|
1439
|
+
- **System-prompt edit** \u2014 add an instruction, remove a misleading one, restructure precedence
|
|
1440
|
+
- **Tool description edit** \u2014 rewrite a tool's description so the agent picks it correctly / passes valid args
|
|
1441
|
+
- **New tool** \u2014 add a tool the agent kept emulating in code
|
|
1442
|
+
- **RAG ingestion** \u2014 add a document or correct a stale one
|
|
1443
|
+
- **Memory invalidation** \u2014 clear cached prior-run decisions that no longer apply
|
|
1444
|
+
- **Scaffolding** \u2014 add a precondition check, a retry policy, a turn budget, a verification step
|
|
1445
|
+
- **Output schema** \u2014 narrow the agent's output to forbid the failure shape
|
|
1446
|
+
3. **Compete candidate fixes via subagents.** For each failure cluster, spawn one \`llmQuery\` per candidate-fix axis you want to evaluate. Each subagent's job: simulate the fix on the cited traces and report (i) likely effect, (ii) side effects, (iii) implementation cost as small/medium/large. Pass the cluster's failing trace_ids and the candidate axis as context.
|
|
1447
|
+
4. After subagents return, **pick the winning candidate per cluster** based on (effect / cost) and emit ONE finding. Discard the losing candidates \u2014 the output is the recommendation, not the candidate set.
|
|
1448
|
+
5. **Cross-reference upstream findings.** If a finding cites a prior failure-mode or knowledge-gap finding, use \`evidence_uri = "finding://<prior-finding-id>"\` (the registry supports this kind). This builds the dependency graph that lets the dashboard show "fix #X resolves failure modes A, B, C."
|
|
1449
|
+
|
|
1450
|
+
For each winning recommendation, emit ONE finding with:
|
|
1451
|
+
- \`area\` = "improvement"
|
|
1452
|
+
- \`subject\` = the locus to edit: \`system-prompt:<section>\`, \`tool-doc:<tool-name>\`, \`new-tool:<proposed-name>\`, \`rag:<corpus>:<doc-id>\`, \`memory:<key>\`, \`scaffolding:<concern>\`, \`output-schema:<field>\`
|
|
1453
|
+
- \`claim\` = one sentence stating the edit ("Add a precondition check to refuse tool X calls without arg Y")
|
|
1454
|
+
- \`severity\` = leverage rating: "critical" when fix resolves a critical failure mode; "high" when it resolves a high; "medium" when it's a quality-of-life win; "info" when it's a cleanup with no behavioral effect
|
|
1455
|
+
- \`evidence_uri\` = the failure-mode finding id this fix targets (\`finding://<id>\`) when it exists; else the most representative span
|
|
1456
|
+
- \`evidence_excerpt\` = a fragment showing the problem the fix targets
|
|
1457
|
+
- \`confidence\` = 0.85+ when the fix is mechanical and the failure mode is well-evidenced; 0.6-0.8 when the fix requires judgment; <0.5 for speculative
|
|
1458
|
+
- \`rationale\` = why this candidate beat its alternatives (2 sentences max)
|
|
1459
|
+
- \`recommended_action\` = the **literal edit**, phrased as a diff or a quoted replacement: "Replace section X with: '...'" or "Add tool with description: '...'" or "Set retry policy to max_attempts=3 with exponential backoff"
|
|
1460
|
+
|
|
1461
|
+
If no upstream failure findings exist in this run, derive your own from the trace dataset using the failure-mode protocol inline (\`searchTrace\` for STATUS_CODE_ERROR / MaxTurnsExceeded / etc.). But prefer to consume upstream findings when present \u2014 the kinds are designed to chain.
|
|
1462
|
+
|
|
1463
|
+
Do NOT propose a fix you cannot defend with evidence. "Tighten the prompt" is not a finding; "Add 'When the user asks for X, always Y' to the system prompt section "request-classification"" is.
|
|
1464
|
+
|
|
1465
|
+
OBSERVABILITY rules:
|
|
1466
|
+
- Each non-final turn must emit at least one \`console.log\` for evidence.
|
|
1467
|
+
- Call \`final({ findings: [...] })\` exactly once at the top level.`;
|
|
1468
|
+
// Kind spec for the "improvement" trace analyst. It pairs the ACTOR_PROMPT4
// prompt with the "all" trace-tool group, which is built per store through
// buildTraceToolsForGroup.
// Declared limits: recursion maxDepth 3 with maxParallelSubagents 4,
// maxTurns 30, and maxRuntimeChars 12e3 (a field the other kind specs in this
// chunk do not set).
// NOTE(review): the meaning of maxRuntimeChars and how these limits are
// enforced is not visible in this chunk — confirm against the consuming code.
var IMPROVEMENT_KIND_SPEC = {
|
|
1469
|
+
id: "improvement",
|
|
1470
|
+
description: "Converts upstream failure / gap / poisoning findings into concrete locus-named edits (prompt, tool-doc, RAG, scaffolding) with leverage grades.",
|
|
1471
|
+
area: "improvement",
|
|
1472
|
+
version: "1.0.0",
|
|
1473
|
+
actorDescription: ACTOR_PROMPT4,
|
|
1474
|
+
buildTools: (store) => buildTraceToolsForGroup("all", store),
|
|
1475
|
+
recursion: { maxDepth: 3, maxParallelSubagents: 4 },
|
|
1476
|
+
maxTurns: 30,
|
|
1477
|
+
maxRuntimeChars: 12e3,
|
|
1478
|
+
cost: { kind: "llm" }
|
|
1479
|
+
};
|
|
1480
|
+
|
|
1481
|
+
// src/analyst/kinds/index.ts
|
|
1482
|
+
// Built-in trace-analyst kind specs, listed in the order failure-mode,
// knowledge-gap, knowledge-poisoning, improvement.
// NOTE(review): the improvement prompt says it consumes upstream findings, so
// this ordering presumably matters to whoever iterates the list — confirm.
var DEFAULT_TRACE_ANALYST_KINDS = [
|
|
1483
|
+
FAILURE_MODE_KIND_SPEC,
|
|
1484
|
+
KNOWLEDGE_GAP_KIND_SPEC,
|
|
1485
|
+
KNOWLEDGE_POISONING_KIND_SPEC,
|
|
1486
|
+
IMPROVEMENT_KIND_SPEC
|
|
1487
|
+
];
|
|
1488
|
+
|
|
1489
|
+
// src/analyst/registry.ts
|
|
1490
|
+
import { randomUUID } from "crypto";
|
|
1491
|
+
/**
 * Holds a set of analysts and runs them one after another against a shared
 * bundle of inputs, collecting findings, per-analyst summaries and cost.
 *
 * Analysts run sequentially in registration order (the Map preserves
 * insertion order). A failing `analyze()` is recorded as a "failed" summary
 * and does not stop the remaining analysts. Exceptions thrown by hooks are
 * NOT converted into analyst failures; they propagate out of `run()`.
 */
var AnalystRegistry = class {
  /** Registered analysts keyed by `analyst.id`. */
  analysts = /* @__PURE__ */ new Map();
  /** Registry-wide options: `log`, `hooks`, `chat`, `defaultBudget`. */
  options;
  constructor(options = {}) {
    this.options = options;
  }
  /**
   * Adds an analyst to the registry.
   * @param analyst Object with at least `id` and `version`.
   * @throws {Error} When `id` is missing, `id` is already registered, or
   *   `version` is missing.
   */
  register(analyst) {
    if (!analyst.id) throw new Error("AnalystRegistry.register: analyst.id is required");
    if (this.analysts.has(analyst.id)) {
      throw new Error(`AnalystRegistry.register: duplicate analyst id "${analyst.id}"`);
    }
    if (!analyst.version) {
      throw new Error(`AnalystRegistry.register: analyst "${analyst.id}" must declare a version`);
    }
    this.analysts.set(analyst.id, analyst);
  }
  /**
   * @returns An array of `{ id, description, version, cost }` descriptors,
   *   one per registered analyst, in registration order.
   */
  list() {
    return Array.from(this.analysts.values()).map((a) => ({
      id: a.id,
      description: a.description,
      version: a.version,
      cost: a.cost
    }));
  }
  /**
   * Runs the selected analysts sequentially and aggregates their output.
   *
   * @param runId Caller-supplied identifier echoed into logs, hooks and the result.
   * @param inputs Input bundle; `routeInput` picks the entry matching each
   *   analyst's `inputKind`.
   * @param runOpts Optional `only` / `skip` id filters, `budget`, `timeoutMs`,
   *   `tags` and `signal`. `timeoutMs` and `signal` are only forwarded to the
   *   analyst through `ctx`; this method does not enforce them itself.
   * @returns `{ run_id, correlation_id, started_at, ended_at, findings,
   *   per_analyst, total_cost_usd }`.
   */
  async run(runId, inputs, runOpts = {}) {
    const correlationId = `ar_${randomUUID().slice(0, 12)}`;
    const log = this.options.log ?? (() => {
    });
    const hooks = this.options.hooks ?? {};
    const startedAt = (/* @__PURE__ */ new Date()).toISOString();
    const started = Date.now();
    const deadlineMs = runOpts.timeoutMs ? started + runOpts.timeoutMs : void 0;
    const selected = this.selectAnalysts(runOpts);
    const budget = runOpts.budget ?? this.options.defaultBudget;
    const summaries = [];
    const allFindings = [];
    let totalCost = 0;
    let remainingUsd = budget?.totalUsd;
    for (const analyst of selected) {
      const t0 = Date.now();
      const input = this.routeInput(analyst, inputs);
      if (input.kind === "missing") {
        const summary = {
          analyst_id: analyst.id,
          status: "skipped",
          reason: `missing input of kind '${analyst.inputKind}'`,
          findings_count: 0,
          latency_ms: 0,
          cost_usd: 0
        };
        summaries.push(summary);
        log(`[analyst] skip ${analyst.id} \u2014 missing input`, { runId, kind: analyst.inputKind });
        await hooks.onAfterAnalyze?.({ analyst, summary, findings: [], runId });
        continue;
      }
      const perBudget = allocateBudget(budget, {
        analyst,
        remainingUsd,
        runningCount: selected.length
      });
      const ctx = {
        runId,
        correlationId,
        deadlineMs,
        budgetUsd: perBudget,
        chat: this.options.chat,
        tags: runOpts.tags,
        log: (msg, fields) => log(`[${analyst.id}] ${msg}`, { runId, correlationId, ...fields }),
        signal: runOpts.signal
      };
      await hooks.onBeforeAnalyze?.({ analyst, ctx, runId });
      let findings;
      let latency;
      let cost;
      // Only the analyst's own work is guarded. Keeping bookkeeping and hooks
      // outside the try means a throwing hook or logger can no longer turn a
      // successful analysis into a second, contradictory "failed" summary.
      try {
        findings = await analyst.analyze(input.value, ctx);
        latency = Date.now() - t0;
        cost = sumFindingCost(findings);
      } catch (err) {
        const failedLatency = Date.now() - t0;
        const e = err instanceof Error ? err : new Error(String(err));
        // onError may contribute substitute findings for the failed analyst.
        const hookFindings = (await hooks.onError?.({ analyst, error: e, runId })) ?? [];
        if (hookFindings.length) allFindings.push(...hookFindings);
        const summary = {
          analyst_id: analyst.id,
          status: "failed",
          findings_count: hookFindings.length,
          latency_ms: failedLatency,
          cost_usd: 0,
          error: { class: e.constructor.name, message: e.message }
        };
        summaries.push(summary);
        log(`[analyst] FAIL ${analyst.id}`, {
          runId,
          error_class: e.constructor.name,
          error: e.message
        });
        await hooks.onAfterAnalyze?.({ analyst, summary, findings: hookFindings, runId });
        continue;
      }
      totalCost += cost;
      if (typeof remainingUsd === "number") remainingUsd = Math.max(0, remainingUsd - cost);
      allFindings.push(...findings);
      const summary = {
        analyst_id: analyst.id,
        status: "ok",
        findings_count: findings.length,
        latency_ms: latency,
        cost_usd: cost
      };
      summaries.push(summary);
      log(`[analyst] ok ${analyst.id}`, {
        runId,
        findings: findings.length,
        latency_ms: latency,
        cost_usd: cost
      });
      await hooks.onAfterAnalyze?.({ analyst, summary, findings, runId });
    }
    const result = {
      run_id: runId,
      correlation_id: correlationId,
      started_at: startedAt,
      ended_at: (/* @__PURE__ */ new Date()).toISOString(),
      findings: allFindings,
      per_analyst: summaries,
      total_cost_usd: totalCost
    };
    await hooks.onComplete?.({ result });
    return result;
  }
  /**
   * Applies the `only` (allow-list) and then `skip` (deny-list) id filters.
   * An empty or absent list means "no filter".
   * @returns The analysts to run, in registration order.
   */
  selectAnalysts(opts) {
    let candidates = Array.from(this.analysts.values());
    if (opts.only?.length) {
      const only = new Set(opts.only);
      candidates = candidates.filter((a) => only.has(a.id));
    }
    if (opts.skip?.length) {
      const skip = new Set(opts.skip);
      candidates = candidates.filter((a) => !skip.has(a.id));
    }
    return candidates;
  }
  /**
   * Picks the input matching `analyst.inputKind` from the bundle.
   * @returns `{ kind: "present", value }` or `{ kind: "missing" }`. An
   *   unrecognised or absent `inputKind` is reported as missing so the
   *   analyst is skipped instead of crashing the whole run.
   */
  routeInput(analyst, inputs) {
    switch (analyst.inputKind) {
      case "trace-store":
        return inputs.traceStore ? { kind: "present", value: inputs.traceStore } : { kind: "missing" };
      case "artifact-dir":
        return inputs.artifactDir ? { kind: "present", value: inputs.artifactDir } : { kind: "missing" };
      case "run-record":
        return inputs.runRecord ? { kind: "present", value: inputs.runRecord } : { kind: "missing" };
      case "judge-input":
        return inputs.judgeInput ? { kind: "present", value: inputs.judgeInput } : { kind: "missing" };
      case "custom": {
        // Custom inputs are keyed by analyst id; any defined value counts,
        // including falsy ones such as 0 or "".
        const v = inputs.custom?.[analyst.id];
        return v !== void 0 ? { kind: "present", value: v } : { kind: "missing" };
      }
      default:
        return { kind: "missing" };
    }
  }
};
|
|
1647
|
+
/**
 * Computes the USD budget handed to a single analyst.
 *
 * Resolution order:
 *   1. No policy: `undefined` (unbudgeted).
 *   2. `policy.allocate` present: its return value is used as-is.
 *   3. No `policy.totalUsd`: `undefined`.
 *   4. Otherwise an equal share `totalUsd / runningCount`, multiplied by the
 *      analyst's entry in `policy.weights` (default weight 1) when weights
 *      are configured.
 *
 * The computed share is capped at `args.remainingUsd` when that is a finite
 * number, so a weight above 1 or earlier overspend cannot hand out more than
 * what is left of the total.
 *
 * @param policy Budget policy (`totalUsd`, optional `weights`, optional `allocate`).
 * @param args `{ analyst, remainingUsd, runningCount }`.
 * @returns The per-analyst budget in USD, or `undefined` when unbudgeted.
 */
function allocateBudget(policy, args) {
  if (!policy) return void 0;
  if (policy.allocate) {
    // A custom allocator has full control; its result is not clamped.
    return policy.allocate({
      analyst: args.analyst,
      totalUsd: policy.totalUsd,
      remainingUsd: args.remainingUsd,
      runningCount: args.runningCount
    });
  }
  if (policy.totalUsd == null) return void 0;
  // Guard against division by zero when no analysts are counted.
  const slots = Math.max(1, args.runningCount);
  const weight = policy.weights ? policy.weights[args.analyst.id] ?? 1 : 1;
  const share = policy.totalUsd * weight / slots;
  const remaining = args.remainingUsd;
  if (typeof remaining === "number" && Number.isFinite(remaining)) {
    return Math.min(share, Math.max(0, remaining));
  }
  return share;
}
|
|
1665
|
+
/**
 * Adds up `metadata.cost_usd` across a collection of findings.
 *
 * Findings without metadata, or whose cost is not a finite number
 * (missing, string, NaN, Infinity), contribute nothing.
 *
 * @param findings Iterable of finding objects.
 * @returns Total cost in USD; 0 for an empty collection.
 */
function sumFindingCost(findings) {
  const isUsableCost = (value) => typeof value === "number" && Number.isFinite(value);
  return [...findings].reduce((runningTotal, finding) => {
    const cost = finding.metadata?.cost_usd;
    return isUsableCost(cost) ? runningTotal + cost : runningTotal;
  }, 0);
}
|
|
1673
|
+
|
|
1674
|
+
// src/auto-pr.ts
|
|
1675
|
+
/**
 * Validates a pull-request proposal and forwards it to the client.
 *
 * @param client Object exposing `proposeChange(input)`.
 * @param input Proposal payload; checked by `validate` before any client call.
 * @returns Whatever `client.proposeChange(input)` resolves to.
 * @throws Rejects with the validation error when `input` is invalid; the
 *   client is not called in that case.
 */
async function proposeAutomatedPullRequest(client, input) {
  // Validation runs first so an invalid request never reaches the client.
  validate(input);
  const proposal = client.proposeChange(input);
  return proposal;
}
|
|
1679
|
+
/**
 * Checks a pull-request proposal before it is sent anywhere.
 *
 * Checks run in this order and the first failure wins:
 *   - `repo.owner` and `repo.name` are non-blank;
 *   - `branchName` is non-blank and contains no whitespace;
 *   - `branchName` differs from `baseBranch` (which defaults to "main");
 *   - `fileChanges` is non-empty;
 *   - every file path is non-blank, contains no "..", has no leading "/",
 *     and appears only once;
 *   - `title` is non-blank.
 *
 * @param input Proposal payload.
 * @throws {ValidationError} On the first violated rule.
 */
function validate(input) {
  const reject = (message) => {
    throw new ValidationError(message);
  };
  const ownerBlank = !input.repo.owner.trim();
  if (ownerBlank || !input.repo.name.trim()) {
    reject("proposeAutomatedPullRequest: repo.owner and repo.name required");
  }
  const branchBlank = !input.branchName.trim();
  if (branchBlank || /\s/.test(input.branchName)) {
    reject(
      "proposeAutomatedPullRequest: branchName must be non-empty and contain no whitespace"
    );
  }
  const targetBranch = input.baseBranch ?? "main";
  if (input.branchName === targetBranch) {
    reject("proposeAutomatedPullRequest: branchName must differ from baseBranch");
  }
  if (input.fileChanges.length === 0) {
    reject("proposeAutomatedPullRequest: fileChanges must not be empty");
  }
  const visited = /* @__PURE__ */ new Set();
  for (const { path: filePath } of input.fileChanges) {
    // Any ".." or a leading "/" is rejected so paths stay inside the repo.
    const malformed = !filePath.trim() || filePath.includes("..") || filePath.startsWith("/");
    if (malformed) {
      reject(
        `proposeAutomatedPullRequest: invalid file path "${filePath}" (no '..' or leading '/')`
      );
    }
    if (visited.has(filePath)) {
      reject(`proposeAutomatedPullRequest: duplicate file path "${filePath}"`);
    }
    visited.add(filePath);
  }
  if (!input.title.trim()) {
    reject("proposeAutomatedPullRequest: title must not be empty");
  }
}
|
|
1710
|
+
function httpGithubClient(opts) {
|
|
1711
|
+
const fetchImpl = opts.fetchImpl ?? fetch;
|
|
1712
|
+
const apiBase = (opts.apiBase ?? "https://api.github.com").replace(/\/+$/, "");
|
|
1713
|
+
const now = opts.now ?? (() => /* @__PURE__ */ new Date());
|
|
1714
|
+
async function api(method, path, body, accept404 = false) {
|
|
1715
|
+
const res = await fetchImpl(`${apiBase}${path}`, {
|
|
1716
|
+
method,
|
|
1717
|
+
headers: {
|
|
1718
|
+
accept: "application/vnd.github+json",
|
|
1719
|
+
"content-type": "application/json",
|
|
1720
|
+
authorization: `Bearer ${opts.token}`,
|
|
1721
|
+
"x-github-api-version": "2022-11-28"
|
|
1722
|
+
},
|
|
1723
|
+
body: body === void 0 ? void 0 : JSON.stringify(body)
|
|
1724
|
+
});
|
|
1725
|
+
if (accept404 && res.status === 404) return null;
|
|
1726
|
+
if (!res.ok) {
|
|
1727
|
+
const text = await res.text().catch(() => "");
|
|
1728
|
+
throw new ConfigError(
|
|
1729
|
+
`proposeAutomatedPullRequest: GitHub ${method} ${path} \u2192 ${res.status} ${text.slice(0, 400)}`
|
|
1730
|
+
);
|
|
1731
|
+
}
|
|
1732
|
+
return await res.json();
|
|
1733
|
+
}
|
|
1734
|
+
return {
|
|
1735
|
+
async proposeChange(input) {
|
|
1736
|
+
const baseBranch = input.baseBranch ?? "main";
|
|
1737
|
+
const repoPath = `/repos/${input.repo.owner}/${input.repo.name}`;
|
|
1738
|
+
if (input.dryRun) {
|
|
1739
|
+
return {
|
|
1740
|
+
prUrl: `https://github.com/${input.repo.owner}/${input.repo.name}/compare/${baseBranch}...${input.branchName}`,
|
|
1741
|
+
branchName: input.branchName,
|
|
1742
|
+
headSha: "dry-run",
|
|
1743
|
+
dryRun: true
|
|
1744
|
+
};
|
|
1745
|
+
}
|
|
1746
|
+
const baseRef = await api("GET", `${repoPath}/git/ref/heads/${baseBranch}`);
|
|
1747
|
+
if (!baseRef) {
|
|
1748
|
+
throw new ConfigError(`proposeAutomatedPullRequest: base branch "${baseBranch}" not found`);
|
|
1749
|
+
}
|
|
1750
|
+
const baseSha = baseRef.object.sha;
|
|
1751
|
+
const baseCommit = await api("GET", `${repoPath}/git/commits/${baseSha}`);
|
|
1752
|
+
if (!baseCommit) {
|
|
1753
|
+
throw new ConfigError(
|
|
1754
|
+
`proposeAutomatedPullRequest: base commit ${baseSha} not found (race condition?)`
|
|
1755
|
+
);
|
|
1756
|
+
}
|
|
1757
|
+
const treeEntries = [];
|
|
1758
|
+
for (const change of input.fileChanges) {
|
|
1759
|
+
const blob = await api("POST", `${repoPath}/git/blobs`, {
|
|
1760
|
+
content: change.contents,
|
|
1761
|
+
encoding: "utf-8"
|
|
1762
|
+
});
|
|
313
1763
|
if (!blob) throw new ConfigError("proposeAutomatedPullRequest: blob creation returned null");
|
|
314
1764
|
treeEntries.push({
|
|
315
1765
|
path: change.path,
|
|
@@ -2736,545 +4186,397 @@ var DualAgentBench = class {
|
|
|
2736
4186
|
const avgFinalScore = results.length ? results.reduce((acc, r) => acc + r.finalScore, 0) / results.length : 0;
|
|
2737
4187
|
return {
|
|
2738
4188
|
scenarios: results,
|
|
2739
|
-
aggregate: { convergenceRate, avgRoundsToConverge, avgFinalScore },
|
|
2740
|
-
config: { maxRounds, convergenceThreshold: threshold }
|
|
2741
|
-
};
|
|
2742
|
-
}
|
|
2743
|
-
};
|
|
2744
|
-
|
|
2745
|
-
// src/experiment-tracker.ts
|
|
2746
|
-
var InMemoryExperimentStore = class {
|
|
2747
|
-
experiments = /* @__PURE__ */ new Map();
|
|
2748
|
-
runs = /* @__PURE__ */ new Map();
|
|
2749
|
-
async saveExperiment(exp) {
|
|
2750
|
-
this.experiments.set(exp.id, { ...exp });
|
|
2751
|
-
}
|
|
2752
|
-
async getExperiment(id) {
|
|
2753
|
-
const e = this.experiments.get(id);
|
|
2754
|
-
return e ? { ...e } : null;
|
|
2755
|
-
}
|
|
2756
|
-
async listExperiments() {
|
|
2757
|
-
return [...this.experiments.values()].sort((a, b) => b.createdAt.localeCompare(a.createdAt));
|
|
2758
|
-
}
|
|
2759
|
-
async saveRun(run) {
|
|
2760
|
-
this.runs.set(run.id, structuredClone(run));
|
|
2761
|
-
}
|
|
2762
|
-
async getRun(id) {
|
|
2763
|
-
const r = this.runs.get(id);
|
|
2764
|
-
return r ? structuredClone(r) : null;
|
|
2765
|
-
}
|
|
2766
|
-
async listRuns(experimentId) {
|
|
2767
|
-
return [...this.runs.values()].filter((r) => r.experimentId === experimentId).sort((a, b) => b.startedAt.localeCompare(a.startedAt)).map((r) => structuredClone(r));
|
|
2768
|
-
}
|
|
2769
|
-
};
|
|
2770
|
-
var ExperimentTracker = class {
|
|
2771
|
-
constructor(store) {
|
|
2772
|
-
this.store = store;
|
|
2773
|
-
}
|
|
2774
|
-
store;
|
|
2775
|
-
async startExperiment(name, metadata) {
|
|
2776
|
-
const exp = {
|
|
2777
|
-
id: `exp_${rand(8)}`,
|
|
2778
|
-
name,
|
|
2779
|
-
createdAt: (/* @__PURE__ */ new Date()).toISOString(),
|
|
2780
|
-
metadata
|
|
2781
|
-
};
|
|
2782
|
-
await this.store.saveExperiment(exp);
|
|
2783
|
-
return exp;
|
|
2784
|
-
}
|
|
2785
|
-
async startRun(config) {
|
|
2786
|
-
const exp = await this.store.getExperiment(config.experimentId);
|
|
2787
|
-
if (!exp) throw new Error(`Experiment ${config.experimentId} not found`);
|
|
2788
|
-
const run = {
|
|
2789
|
-
id: `run_${rand(10)}`,
|
|
2790
|
-
experimentId: config.experimentId,
|
|
2791
|
-
name: config.name,
|
|
2792
|
-
config,
|
|
2793
|
-
startedAt: (/* @__PURE__ */ new Date()).toISOString(),
|
|
2794
|
-
status: "running"
|
|
2795
|
-
};
|
|
2796
|
-
await this.store.saveRun(run);
|
|
2797
|
-
return run;
|
|
2798
|
-
}
|
|
2799
|
-
async completeRun(runId, report) {
|
|
2800
|
-
const run = await this.store.getRun(runId);
|
|
2801
|
-
if (!run) throw new Error(`Run ${runId} not found`);
|
|
2802
|
-
run.status = "completed";
|
|
2803
|
-
run.completedAt = (/* @__PURE__ */ new Date()).toISOString();
|
|
2804
|
-
run.report = report;
|
|
2805
|
-
await this.store.saveRun(run);
|
|
2806
|
-
}
|
|
2807
|
-
async failRun(runId, error) {
|
|
2808
|
-
const run = await this.store.getRun(runId);
|
|
2809
|
-
if (!run) throw new Error(`Run ${runId} not found`);
|
|
2810
|
-
run.status = "failed";
|
|
2811
|
-
run.completedAt = (/* @__PURE__ */ new Date()).toISOString();
|
|
2812
|
-
run.error = error;
|
|
2813
|
-
await this.store.saveRun(run);
|
|
2814
|
-
}
|
|
2815
|
-
/**
|
|
2816
|
-
* Diff two completed runs. Returns per-scenario deltas, aggregate delta,
|
|
2817
|
-
* and config changes that may explain the movement.
|
|
2818
|
-
*/
|
|
2819
|
-
async diff(runIdA, runIdB) {
|
|
2820
|
-
const [a, b] = await Promise.all([this.store.getRun(runIdA), this.store.getRun(runIdB)]);
|
|
2821
|
-
if (!a || !b) throw new Error("Both runs must exist");
|
|
2822
|
-
if (!a.report || !b.report) throw new Error("Both runs must be completed with reports");
|
|
2823
|
-
const byScenarioA = new Map(a.report.results.map((r) => [r.scenarioId, r.overallScore]));
|
|
2824
|
-
const byScenarioB = new Map(b.report.results.map((r) => [r.scenarioId, r.overallScore]));
|
|
2825
|
-
const scenarioIds = /* @__PURE__ */ new Set([...byScenarioA.keys(), ...byScenarioB.keys()]);
|
|
2826
|
-
const scenarios = [];
|
|
2827
|
-
for (const id of scenarioIds) {
|
|
2828
|
-
const aScore = byScenarioA.get(id);
|
|
2829
|
-
const bScore = byScenarioB.get(id);
|
|
2830
|
-
if (aScore === void 0) {
|
|
2831
|
-
scenarios.push({
|
|
2832
|
-
scenarioId: id,
|
|
2833
|
-
before: null,
|
|
2834
|
-
after: bScore,
|
|
2835
|
-
delta: null,
|
|
2836
|
-
status: "added"
|
|
2837
|
-
});
|
|
2838
|
-
} else if (bScore === void 0) {
|
|
2839
|
-
scenarios.push({
|
|
2840
|
-
scenarioId: id,
|
|
2841
|
-
before: aScore,
|
|
2842
|
-
after: null,
|
|
2843
|
-
delta: null,
|
|
2844
|
-
status: "removed"
|
|
2845
|
-
});
|
|
2846
|
-
} else {
|
|
2847
|
-
scenarios.push({
|
|
2848
|
-
scenarioId: id,
|
|
2849
|
-
before: aScore,
|
|
2850
|
-
after: bScore,
|
|
2851
|
-
delta: bScore - aScore,
|
|
2852
|
-
status: bScore > aScore ? "improved" : bScore < aScore ? "regressed" : "unchanged"
|
|
2853
|
-
});
|
|
2854
|
-
}
|
|
2855
|
-
}
|
|
2856
|
-
scenarios.sort((x, y) => (y.delta ?? 0) - (x.delta ?? 0));
|
|
2857
|
-
const aggregateDelta = b.report.summary.overallAvg - a.report.summary.overallAvg;
|
|
2858
|
-
const configChanges = {};
|
|
2859
|
-
const keys = /* @__PURE__ */ new Set([...Object.keys(a.config), ...Object.keys(b.config)]);
|
|
2860
|
-
const aCfg = a.config;
|
|
2861
|
-
const bCfg = b.config;
|
|
2862
|
-
for (const k of keys) {
|
|
2863
|
-
if (JSON.stringify(aCfg[k]) !== JSON.stringify(bCfg[k])) {
|
|
2864
|
-
configChanges[k] = { before: aCfg[k], after: bCfg[k] };
|
|
2865
|
-
}
|
|
2866
|
-
}
|
|
2867
|
-
return {
|
|
2868
|
-
before: { runId: runIdA, name: a.name, startedAt: a.startedAt },
|
|
2869
|
-
after: { runId: runIdB, name: b.name, startedAt: b.startedAt },
|
|
2870
|
-
aggregateDelta,
|
|
2871
|
-
scenarios,
|
|
2872
|
-
configChanges
|
|
2873
|
-
};
|
|
2874
|
-
}
|
|
2875
|
-
/** Timeline of aggregate scores for an experiment. */
|
|
2876
|
-
async timeline(experimentId) {
|
|
2877
|
-
const runs = await this.store.listRuns(experimentId);
|
|
2878
|
-
return runs.slice().sort((a, b) => a.startedAt.localeCompare(b.startedAt)).map((r) => ({
|
|
2879
|
-
runId: r.id,
|
|
2880
|
-
startedAt: r.startedAt,
|
|
2881
|
-
overall: r.report?.summary.overallAvg ?? null
|
|
2882
|
-
}));
|
|
2883
|
-
}
|
|
2884
|
-
};
|
|
2885
|
-
function rand(bytes) {
|
|
2886
|
-
const arr = new Uint8Array(bytes);
|
|
2887
|
-
crypto.getRandomValues(arr);
|
|
2888
|
-
return Array.from(arr).map((b) => b.toString(16).padStart(2, "0")).join("");
|
|
2889
|
-
}
|
|
2890
|
-
|
|
2891
|
-
// src/experiment-tracker-d1.ts
|
|
2892
|
-
var SCHEMA_VERSION = 1;
|
|
2893
|
-
var D1ExperimentStore = class {
|
|
2894
|
-
db;
|
|
2895
|
-
experimentsTable;
|
|
2896
|
-
runsTable;
|
|
2897
|
-
metaTable;
|
|
2898
|
-
schemaReady = false;
|
|
2899
|
-
constructor(options) {
|
|
2900
|
-
this.db = options.db;
|
|
2901
|
-
const prefix = options.tablePrefix ?? "agent_eval_";
|
|
2902
|
-
this.experimentsTable = `${prefix}experiments`;
|
|
2903
|
-
this.runsTable = `${prefix}runs`;
|
|
2904
|
-
this.metaTable = `${prefix}meta`;
|
|
2905
|
-
}
|
|
2906
|
-
/**
|
|
2907
|
-
* Idempotent schema setup. Safe to call before every operation; the second
|
|
2908
|
-
* call short-circuits via `schemaReady`. Most consumers will call it once
|
|
2909
|
-
* during Worker bootstrap.
|
|
2910
|
-
*/
|
|
2911
|
-
async ensureSchema() {
|
|
2912
|
-
if (this.schemaReady) return;
|
|
2913
|
-
const ddl = `
|
|
2914
|
-
CREATE TABLE IF NOT EXISTS ${this.experimentsTable} (
|
|
2915
|
-
id TEXT PRIMARY KEY,
|
|
2916
|
-
name TEXT NOT NULL,
|
|
2917
|
-
created_at TEXT NOT NULL,
|
|
2918
|
-
metadata_json TEXT
|
|
2919
|
-
);
|
|
2920
|
-
CREATE TABLE IF NOT EXISTS ${this.runsTable} (
|
|
2921
|
-
id TEXT PRIMARY KEY,
|
|
2922
|
-
experiment_id TEXT NOT NULL,
|
|
2923
|
-
name TEXT,
|
|
2924
|
-
status TEXT NOT NULL,
|
|
2925
|
-
started_at TEXT NOT NULL,
|
|
2926
|
-
completed_at TEXT,
|
|
2927
|
-
config_json TEXT NOT NULL,
|
|
2928
|
-
report_json TEXT,
|
|
2929
|
-
error TEXT
|
|
2930
|
-
);
|
|
2931
|
-
CREATE INDEX IF NOT EXISTS idx_${this.runsTable}_experiment ON ${this.runsTable}(experiment_id);
|
|
2932
|
-
CREATE INDEX IF NOT EXISTS idx_${this.runsTable}_started ON ${this.runsTable}(started_at);
|
|
2933
|
-
CREATE TABLE IF NOT EXISTS ${this.metaTable} (
|
|
2934
|
-
key TEXT PRIMARY KEY,
|
|
2935
|
-
value TEXT NOT NULL
|
|
2936
|
-
);
|
|
2937
|
-
INSERT OR REPLACE INTO ${this.metaTable}(key, value) VALUES ('schema_version', '${SCHEMA_VERSION}');
|
|
2938
|
-
`;
|
|
2939
|
-
await this.db.exec(ddl.trim().replace(/\s+/g, " "));
|
|
2940
|
-
this.schemaReady = true;
|
|
4189
|
+
aggregate: { convergenceRate, avgRoundsToConverge, avgFinalScore },
|
|
4190
|
+
config: { maxRounds, convergenceThreshold: threshold }
|
|
4191
|
+
};
|
|
2941
4192
|
}
|
|
4193
|
+
};
|
|
4194
|
+
|
|
4195
|
+
// src/experiment-tracker.ts
|
|
4196
|
+
var InMemoryExperimentStore = class {
|
|
4197
|
+
experiments = /* @__PURE__ */ new Map();
|
|
4198
|
+
runs = /* @__PURE__ */ new Map();
|
|
2942
4199
|
async saveExperiment(exp) {
|
|
2943
|
-
|
|
2944
|
-
await this.db.prepare(
|
|
2945
|
-
`INSERT INTO ${this.experimentsTable}(id, name, created_at, metadata_json)
|
|
2946
|
-
VALUES (?1, ?2, ?3, ?4)
|
|
2947
|
-
ON CONFLICT(id) DO UPDATE SET
|
|
2948
|
-
name = excluded.name,
|
|
2949
|
-
created_at = excluded.created_at,
|
|
2950
|
-
metadata_json = excluded.metadata_json`
|
|
2951
|
-
).bind(exp.id, exp.name, exp.createdAt, exp.metadata ? JSON.stringify(exp.metadata) : null).run();
|
|
4200
|
+
this.experiments.set(exp.id, { ...exp });
|
|
2952
4201
|
}
|
|
2953
4202
|
async getExperiment(id) {
|
|
2954
|
-
|
|
2955
|
-
|
|
2956
|
-
`SELECT id, name, created_at, metadata_json
|
|
2957
|
-
FROM ${this.experimentsTable}
|
|
2958
|
-
WHERE id = ?1`
|
|
2959
|
-
).bind(id).first();
|
|
2960
|
-
return row ? rowToExperiment(row) : null;
|
|
4203
|
+
const e = this.experiments.get(id);
|
|
4204
|
+
return e ? { ...e } : null;
|
|
2961
4205
|
}
|
|
2962
4206
|
async listExperiments() {
|
|
2963
|
-
|
|
2964
|
-
const { results } = await this.db.prepare(
|
|
2965
|
-
`SELECT id, name, created_at, metadata_json
|
|
2966
|
-
FROM ${this.experimentsTable}
|
|
2967
|
-
ORDER BY created_at DESC`
|
|
2968
|
-
).all();
|
|
2969
|
-
return results.map(rowToExperiment);
|
|
4207
|
+
return [...this.experiments.values()].sort((a, b) => b.createdAt.localeCompare(a.createdAt));
|
|
2970
4208
|
}
|
|
2971
4209
|
async saveRun(run) {
|
|
2972
|
-
|
|
2973
|
-
await this.db.prepare(
|
|
2974
|
-
`INSERT INTO ${this.runsTable}(id, experiment_id, name, status, started_at, completed_at, config_json, report_json, error)
|
|
2975
|
-
VALUES (?1, ?2, ?3, ?4, ?5, ?6, ?7, ?8, ?9)
|
|
2976
|
-
ON CONFLICT(id) DO UPDATE SET
|
|
2977
|
-
experiment_id = excluded.experiment_id,
|
|
2978
|
-
name = excluded.name,
|
|
2979
|
-
status = excluded.status,
|
|
2980
|
-
started_at = excluded.started_at,
|
|
2981
|
-
completed_at = excluded.completed_at,
|
|
2982
|
-
config_json = excluded.config_json,
|
|
2983
|
-
report_json = excluded.report_json,
|
|
2984
|
-
error = excluded.error`
|
|
2985
|
-
).bind(
|
|
2986
|
-
run.id,
|
|
2987
|
-
run.experimentId,
|
|
2988
|
-
run.name ?? null,
|
|
2989
|
-
run.status,
|
|
2990
|
-
run.startedAt,
|
|
2991
|
-
run.completedAt ?? null,
|
|
2992
|
-
JSON.stringify(run.config),
|
|
2993
|
-
run.report ? JSON.stringify(run.report) : null,
|
|
2994
|
-
run.error ?? null
|
|
2995
|
-
).run();
|
|
4210
|
+
this.runs.set(run.id, structuredClone(run));
|
|
2996
4211
|
}
|
|
2997
4212
|
async getRun(id) {
|
|
2998
|
-
|
|
2999
|
-
|
|
3000
|
-
`SELECT id, experiment_id, name, status, started_at, completed_at, config_json, report_json, error
|
|
3001
|
-
FROM ${this.runsTable}
|
|
3002
|
-
WHERE id = ?1`
|
|
3003
|
-
).bind(id).first();
|
|
3004
|
-
return row ? rowToRun(row) : null;
|
|
4213
|
+
const r = this.runs.get(id);
|
|
4214
|
+
return r ? structuredClone(r) : null;
|
|
3005
4215
|
}
|
|
3006
4216
|
async listRuns(experimentId) {
|
|
3007
|
-
|
|
3008
|
-
const { results } = await this.db.prepare(
|
|
3009
|
-
`SELECT id, experiment_id, name, status, started_at, completed_at, config_json, report_json, error
|
|
3010
|
-
FROM ${this.runsTable}
|
|
3011
|
-
WHERE experiment_id = ?1
|
|
3012
|
-
ORDER BY started_at DESC`
|
|
3013
|
-
).bind(experimentId).all();
|
|
3014
|
-
return results.map(rowToRun);
|
|
4217
|
+
return [...this.runs.values()].filter((r) => r.experimentId === experimentId).sort((a, b) => b.startedAt.localeCompare(a.startedAt)).map((r) => structuredClone(r));
|
|
3015
4218
|
}
|
|
3016
4219
|
};
|
|
3017
|
-
|
|
3018
|
-
|
|
3019
|
-
|
|
3020
|
-
name: row.name,
|
|
3021
|
-
createdAt: row.created_at,
|
|
3022
|
-
...row.metadata_json ? { metadata: JSON.parse(row.metadata_json) } : {}
|
|
3023
|
-
};
|
|
3024
|
-
}
|
|
3025
|
-
function rowToRun(row) {
|
|
3026
|
-
return {
|
|
3027
|
-
id: row.id,
|
|
3028
|
-
experimentId: row.experiment_id,
|
|
3029
|
-
...row.name ? { name: row.name } : {},
|
|
3030
|
-
status: row.status,
|
|
3031
|
-
startedAt: row.started_at,
|
|
3032
|
-
...row.completed_at ? { completedAt: row.completed_at } : {},
|
|
3033
|
-
config: JSON.parse(row.config_json),
|
|
3034
|
-
...row.report_json ? { report: JSON.parse(row.report_json) } : {},
|
|
3035
|
-
...row.error ? { error: row.error } : {}
|
|
3036
|
-
};
|
|
3037
|
-
}
|
|
3038
|
-
|
|
3039
|
-
// src/experiment-tracker-fs.ts
|
|
3040
|
-
var FileSystemExperimentStore = class {
|
|
3041
|
-
dir;
|
|
3042
|
-
maxBytes;
|
|
3043
|
-
index;
|
|
3044
|
-
loaded = false;
|
|
3045
|
-
constructor(options) {
|
|
3046
|
-
this.dir = options.dir;
|
|
3047
|
-
this.maxBytes = options.maxBytes ?? 32 * 1024 * 1024;
|
|
3048
|
-
}
|
|
3049
|
-
async saveExperiment(exp) {
|
|
3050
|
-
const idx = await this.load();
|
|
3051
|
-
await idx.saveExperiment(exp);
|
|
3052
|
-
await this.append("experiments", exp);
|
|
3053
|
-
}
|
|
3054
|
-
async getExperiment(id) {
|
|
3055
|
-
const idx = await this.load();
|
|
3056
|
-
return idx.getExperiment(id);
|
|
3057
|
-
}
|
|
3058
|
-
async listExperiments() {
|
|
3059
|
-
const idx = await this.load();
|
|
3060
|
-
return idx.listExperiments();
|
|
4220
|
+
var ExperimentTracker = class {
|
|
4221
|
+
constructor(store) {
|
|
4222
|
+
this.store = store;
|
|
3061
4223
|
}
|
|
3062
|
-
|
|
3063
|
-
|
|
3064
|
-
|
|
3065
|
-
|
|
4224
|
+
store;
|
|
4225
|
+
async startExperiment(name, metadata) {
|
|
4226
|
+
const exp = {
|
|
4227
|
+
id: `exp_${rand(8)}`,
|
|
4228
|
+
name,
|
|
4229
|
+
createdAt: (/* @__PURE__ */ new Date()).toISOString(),
|
|
4230
|
+
metadata
|
|
4231
|
+
};
|
|
4232
|
+
await this.store.saveExperiment(exp);
|
|
4233
|
+
return exp;
|
|
3066
4234
|
}
|
|
3067
|
-
async
|
|
3068
|
-
const
|
|
3069
|
-
|
|
4235
|
+
async startRun(config) {
|
|
4236
|
+
const exp = await this.store.getExperiment(config.experimentId);
|
|
4237
|
+
if (!exp) throw new Error(`Experiment ${config.experimentId} not found`);
|
|
4238
|
+
const run = {
|
|
4239
|
+
id: `run_${rand(10)}`,
|
|
4240
|
+
experimentId: config.experimentId,
|
|
4241
|
+
name: config.name,
|
|
4242
|
+
config,
|
|
4243
|
+
startedAt: (/* @__PURE__ */ new Date()).toISOString(),
|
|
4244
|
+
status: "running"
|
|
4245
|
+
};
|
|
4246
|
+
await this.store.saveRun(run);
|
|
4247
|
+
return run;
|
|
3070
4248
|
}
|
|
3071
|
-
async
|
|
3072
|
-
const
|
|
3073
|
-
|
|
4249
|
+
async completeRun(runId, report) {
|
|
4250
|
+
const run = await this.store.getRun(runId);
|
|
4251
|
+
if (!run) throw new Error(`Run ${runId} not found`);
|
|
4252
|
+
run.status = "completed";
|
|
4253
|
+
run.completedAt = (/* @__PURE__ */ new Date()).toISOString();
|
|
4254
|
+
run.report = report;
|
|
4255
|
+
await this.store.saveRun(run);
|
|
3074
4256
|
}
|
|
3075
|
-
async
|
|
3076
|
-
const
|
|
3077
|
-
|
|
4257
|
+
async failRun(runId, error) {
|
|
4258
|
+
const run = await this.store.getRun(runId);
|
|
4259
|
+
if (!run) throw new Error(`Run ${runId} not found`);
|
|
4260
|
+
run.status = "failed";
|
|
4261
|
+
run.completedAt = (/* @__PURE__ */ new Date()).toISOString();
|
|
4262
|
+
run.error = error;
|
|
4263
|
+
await this.store.saveRun(run);
|
|
3078
4264
|
}
|
|
3079
|
-
|
|
3080
|
-
|
|
3081
|
-
|
|
3082
|
-
|
|
3083
|
-
|
|
3084
|
-
|
|
3085
|
-
|
|
3086
|
-
|
|
3087
|
-
|
|
3088
|
-
|
|
4265
|
+
/**
|
|
4266
|
+
* Diff two completed runs. Returns per-scenario deltas, aggregate delta,
|
|
4267
|
+
* and config changes that may explain the movement.
|
|
4268
|
+
*/
|
|
4269
|
+
async diff(runIdA, runIdB) {
|
|
4270
|
+
const [a, b] = await Promise.all([this.store.getRun(runIdA), this.store.getRun(runIdB)]);
|
|
4271
|
+
if (!a || !b) throw new Error("Both runs must exist");
|
|
4272
|
+
if (!a.report || !b.report) throw new Error("Both runs must be completed with reports");
|
|
4273
|
+
const byScenarioA = new Map(a.report.results.map((r) => [r.scenarioId, r.overallScore]));
|
|
4274
|
+
const byScenarioB = new Map(b.report.results.map((r) => [r.scenarioId, r.overallScore]));
|
|
4275
|
+
const scenarioIds = /* @__PURE__ */ new Set([...byScenarioA.keys(), ...byScenarioB.keys()]);
|
|
4276
|
+
const scenarios = [];
|
|
4277
|
+
for (const id of scenarioIds) {
|
|
4278
|
+
const aScore = byScenarioA.get(id);
|
|
4279
|
+
const bScore = byScenarioB.get(id);
|
|
4280
|
+
if (aScore === void 0) {
|
|
4281
|
+
scenarios.push({
|
|
4282
|
+
scenarioId: id,
|
|
4283
|
+
before: null,
|
|
4284
|
+
after: bScore,
|
|
4285
|
+
delta: null,
|
|
4286
|
+
status: "added"
|
|
4287
|
+
});
|
|
4288
|
+
} else if (bScore === void 0) {
|
|
4289
|
+
scenarios.push({
|
|
4290
|
+
scenarioId: id,
|
|
4291
|
+
before: aScore,
|
|
4292
|
+
after: null,
|
|
4293
|
+
delta: null,
|
|
4294
|
+
status: "removed"
|
|
4295
|
+
});
|
|
4296
|
+
} else {
|
|
4297
|
+
scenarios.push({
|
|
4298
|
+
scenarioId: id,
|
|
4299
|
+
before: aScore,
|
|
4300
|
+
after: bScore,
|
|
4301
|
+
delta: bScore - aScore,
|
|
4302
|
+
status: bScore > aScore ? "improved" : bScore < aScore ? "regressed" : "unchanged"
|
|
4303
|
+
});
|
|
3089
4304
|
}
|
|
3090
|
-
}
|
|
3091
|
-
|
|
3092
|
-
|
|
3093
|
-
|
|
3094
|
-
|
|
3095
|
-
|
|
3096
|
-
|
|
3097
|
-
const
|
|
3098
|
-
|
|
3099
|
-
|
|
3100
|
-
try {
|
|
3101
|
-
const entries = await fs2.readdir(this.dir);
|
|
3102
|
-
const sorted = entries.filter((f) => f.endsWith(".ndjson")).sort((a, b) => a.localeCompare(b));
|
|
3103
|
-
for (const file of sorted) {
|
|
3104
|
-
const full = path.join(this.dir, file);
|
|
3105
|
-
const content = await fs2.readFile(full, "utf8");
|
|
3106
|
-
const base = file.split(".")[0];
|
|
3107
|
-
for (const line of content.split("\n")) {
|
|
3108
|
-
if (!line.trim()) continue;
|
|
3109
|
-
let record;
|
|
3110
|
-
try {
|
|
3111
|
-
record = JSON.parse(line);
|
|
3112
|
-
} catch {
|
|
3113
|
-
continue;
|
|
3114
|
-
}
|
|
3115
|
-
if (base === "experiments") {
|
|
3116
|
-
await store.saveExperiment(record);
|
|
3117
|
-
} else if (base === "runs") {
|
|
3118
|
-
await store.saveRun(record);
|
|
3119
|
-
}
|
|
3120
|
-
}
|
|
4305
|
+
}
|
|
4306
|
+
scenarios.sort((x, y) => (y.delta ?? 0) - (x.delta ?? 0));
|
|
4307
|
+
const aggregateDelta = b.report.summary.overallAvg - a.report.summary.overallAvg;
|
|
4308
|
+
const configChanges = {};
|
|
4309
|
+
const keys = /* @__PURE__ */ new Set([...Object.keys(a.config), ...Object.keys(b.config)]);
|
|
4310
|
+
const aCfg = a.config;
|
|
4311
|
+
const bCfg = b.config;
|
|
4312
|
+
for (const k of keys) {
|
|
4313
|
+
if (JSON.stringify(aCfg[k]) !== JSON.stringify(bCfg[k])) {
|
|
4314
|
+
configChanges[k] = { before: aCfg[k], after: bCfg[k] };
|
|
3121
4315
|
}
|
|
3122
|
-
} catch {
|
|
3123
4316
|
}
|
|
3124
|
-
|
|
3125
|
-
|
|
3126
|
-
|
|
4317
|
+
return {
|
|
4318
|
+
before: { runId: runIdA, name: a.name, startedAt: a.startedAt },
|
|
4319
|
+
after: { runId: runIdB, name: b.name, startedAt: b.startedAt },
|
|
4320
|
+
aggregateDelta,
|
|
4321
|
+
scenarios,
|
|
4322
|
+
configChanges
|
|
4323
|
+
};
|
|
4324
|
+
}
|
|
4325
|
+
/** Timeline of aggregate scores for an experiment. */
|
|
4326
|
+
async timeline(experimentId) {
|
|
4327
|
+
const runs = await this.store.listRuns(experimentId);
|
|
4328
|
+
return runs.slice().sort((a, b) => a.startedAt.localeCompare(b.startedAt)).map((r) => ({
|
|
4329
|
+
runId: r.id,
|
|
4330
|
+
startedAt: r.startedAt,
|
|
4331
|
+
overall: r.report?.summary.overallAvg ?? null
|
|
4332
|
+
}));
|
|
3127
4333
|
}
|
|
3128
4334
|
};
|
|
3129
|
-
|
|
3130
|
-
|
|
3131
|
-
|
|
3132
|
-
|
|
3133
|
-
goalProgress: 2,
|
|
3134
|
-
repoGroundedness: 1.5,
|
|
3135
|
-
driftPenalty: -1.5,
|
|
3136
|
-
toolUseQuality: 1,
|
|
3137
|
-
patchQuality: 1.25,
|
|
3138
|
-
testReality: 1.5,
|
|
3139
|
-
finalGate: 3,
|
|
3140
|
-
reviewerBlockers: -2,
|
|
3141
|
-
costUsd: -0.2,
|
|
3142
|
-
wallSeconds: -0.1
|
|
3143
|
-
};
|
|
3144
|
-
function aggregateRunScore(score, weights = {}) {
|
|
3145
|
-
const w = { ...DEFAULT_RUN_SCORE_WEIGHTS, ...weights };
|
|
3146
|
-
return w.success * clamp01(score.success) + w.goalProgress * clamp01(score.goalProgress) + w.repoGroundedness * clamp01(score.repoGroundedness) + w.driftPenalty * clamp01(score.driftPenalty) + w.toolUseQuality * clamp01(score.toolUseQuality) + w.patchQuality * clamp01(score.patchQuality) + w.testReality * clamp01(score.testReality) + w.finalGate * clamp01(score.finalGate) + w.reviewerBlockers * clamp01(score.reviewerBlockers) + w.costUsd * Math.max(0, finiteOrZero(score.costUsd)) + w.wallSeconds * Math.max(0, finiteOrZero(score.wallSeconds) / 60);
|
|
3147
|
-
}
|
|
3148
|
-
function clamp01(value) {
|
|
3149
|
-
if (!Number.isFinite(value)) return 0;
|
|
3150
|
-
return Math.max(0, Math.min(1, value));
|
|
3151
|
-
}
|
|
3152
|
-
function finiteOrZero(value) {
|
|
3153
|
-
return Number.isFinite(value) ? value : 0;
|
|
4335
|
+
function rand(bytes) {
|
|
4336
|
+
const arr = new Uint8Array(bytes);
|
|
4337
|
+
crypto.getRandomValues(arr);
|
|
4338
|
+
return Array.from(arr).map((b) => b.toString(16).padStart(2, "0")).join("");
|
|
3154
4339
|
}
|
|
3155
4340
|
|
|
3156
|
-
// src/
|
|
3157
|
-
var
|
|
3158
|
-
|
|
3159
|
-
|
|
3160
|
-
|
|
3161
|
-
|
|
3162
|
-
|
|
3163
|
-
|
|
3164
|
-
|
|
3165
|
-
|
|
3166
|
-
|
|
3167
|
-
|
|
3168
|
-
|
|
3169
|
-
this.
|
|
3170
|
-
this.driftPatterns = options.driftPatterns ?? DEFAULT_DRIFT_PATTERNS;
|
|
4341
|
+
// src/experiment-tracker-d1.ts
|
|
4342
|
+
var SCHEMA_VERSION = 1;
|
|
4343
|
+
var D1ExperimentStore = class {
|
|
4344
|
+
db;
|
|
4345
|
+
experimentsTable;
|
|
4346
|
+
runsTable;
|
|
4347
|
+
metaTable;
|
|
4348
|
+
schemaReady = false;
|
|
4349
|
+
constructor(options) {
|
|
4350
|
+
this.db = options.db;
|
|
4351
|
+
const prefix = options.tablePrefix ?? "agent_eval_";
|
|
4352
|
+
this.experimentsTable = `${prefix}experiments`;
|
|
4353
|
+
this.runsTable = `${prefix}runs`;
|
|
4354
|
+
this.metaTable = `${prefix}meta`;
|
|
3171
4355
|
}
|
|
3172
|
-
|
|
3173
|
-
|
|
3174
|
-
|
|
3175
|
-
|
|
3176
|
-
|
|
3177
|
-
|
|
3178
|
-
|
|
3179
|
-
|
|
3180
|
-
|
|
3181
|
-
|
|
4356
|
+
/**
|
|
4357
|
+
* Idempotent schema setup. Safe to call before every operation; the second
|
|
4358
|
+
* call short-circuits via `schemaReady`. Most consumers will call it once
|
|
4359
|
+
* during Worker bootstrap.
|
|
4360
|
+
*/
|
|
4361
|
+
async ensureSchema() {
|
|
4362
|
+
if (this.schemaReady) return;
|
|
4363
|
+
const ddl = `
|
|
4364
|
+
CREATE TABLE IF NOT EXISTS ${this.experimentsTable} (
|
|
4365
|
+
id TEXT PRIMARY KEY,
|
|
4366
|
+
name TEXT NOT NULL,
|
|
4367
|
+
created_at TEXT NOT NULL,
|
|
4368
|
+
metadata_json TEXT
|
|
4369
|
+
);
|
|
4370
|
+
CREATE TABLE IF NOT EXISTS ${this.runsTable} (
|
|
4371
|
+
id TEXT PRIMARY KEY,
|
|
4372
|
+
experiment_id TEXT NOT NULL,
|
|
4373
|
+
name TEXT,
|
|
4374
|
+
status TEXT NOT NULL,
|
|
4375
|
+
started_at TEXT NOT NULL,
|
|
4376
|
+
completed_at TEXT,
|
|
4377
|
+
config_json TEXT NOT NULL,
|
|
4378
|
+
report_json TEXT,
|
|
4379
|
+
error TEXT
|
|
4380
|
+
);
|
|
4381
|
+
CREATE INDEX IF NOT EXISTS idx_${this.runsTable}_experiment ON ${this.runsTable}(experiment_id);
|
|
4382
|
+
CREATE INDEX IF NOT EXISTS idx_${this.runsTable}_started ON ${this.runsTable}(started_at);
|
|
4383
|
+
CREATE TABLE IF NOT EXISTS ${this.metaTable} (
|
|
4384
|
+
key TEXT PRIMARY KEY,
|
|
4385
|
+
value TEXT NOT NULL
|
|
4386
|
+
);
|
|
4387
|
+
INSERT OR REPLACE INTO ${this.metaTable}(key, value) VALUES ('schema_version', '${SCHEMA_VERSION}');
|
|
4388
|
+
`;
|
|
4389
|
+
await this.db.exec(ddl.trim().replace(/\s+/g, " "));
|
|
4390
|
+
this.schemaReady = true;
|
|
3182
4391
|
}
|
|
3183
|
-
|
|
3184
|
-
|
|
3185
|
-
|
|
3186
|
-
|
|
3187
|
-
|
|
3188
|
-
|
|
3189
|
-
|
|
3190
|
-
|
|
3191
|
-
|
|
3192
|
-
|
|
3193
|
-
|
|
3194
|
-
|
|
3195
|
-
|
|
3196
|
-
|
|
3197
|
-
|
|
3198
|
-
|
|
3199
|
-
|
|
3200
|
-
|
|
3201
|
-
|
|
3202
|
-
|
|
3203
|
-
|
|
3204
|
-
|
|
3205
|
-
|
|
3206
|
-
|
|
3207
|
-
|
|
3208
|
-
|
|
3209
|
-
|
|
3210
|
-
|
|
3211
|
-
|
|
3212
|
-
|
|
3213
|
-
|
|
3214
|
-
|
|
3215
|
-
|
|
3216
|
-
|
|
3217
|
-
|
|
3218
|
-
|
|
3219
|
-
|
|
3220
|
-
|
|
3221
|
-
|
|
3222
|
-
|
|
3223
|
-
|
|
3224
|
-
|
|
3225
|
-
|
|
3226
|
-
|
|
3227
|
-
|
|
3228
|
-
|
|
3229
|
-
|
|
3230
|
-
|
|
3231
|
-
|
|
3232
|
-
|
|
3233
|
-
|
|
3234
|
-
|
|
3235
|
-
|
|
3236
|
-
|
|
3237
|
-
...trace.budget.filter((entry) => entry.dimension === "usd").map((entry) => entry.consumed),
|
|
3238
|
-
0
|
|
3239
|
-
) : llmSpans2.reduce((sum2, span) => sum2 + (span.costUsd ?? 0), 0);
|
|
3240
|
-
const wallSeconds = trace.run.endedAt && trace.run.startedAt ? Math.max(0, (trace.run.endedAt - trace.run.startedAt) / 1e3) : 0;
|
|
3241
|
-
return {
|
|
3242
|
-
success,
|
|
3243
|
-
goalProgress,
|
|
3244
|
-
repoGroundedness,
|
|
3245
|
-
driftPenalty,
|
|
3246
|
-
toolUseQuality,
|
|
3247
|
-
patchQuality,
|
|
3248
|
-
testReality,
|
|
3249
|
-
finalGate,
|
|
3250
|
-
reviewerBlockers,
|
|
3251
|
-
costUsd,
|
|
3252
|
-
wallSeconds,
|
|
3253
|
-
notes
|
|
3254
|
-
};
|
|
4392
|
+
async saveExperiment(exp) {
|
|
4393
|
+
await this.ensureSchema();
|
|
4394
|
+
await this.db.prepare(
|
|
4395
|
+
`INSERT INTO ${this.experimentsTable}(id, name, created_at, metadata_json)
|
|
4396
|
+
VALUES (?1, ?2, ?3, ?4)
|
|
4397
|
+
ON CONFLICT(id) DO UPDATE SET
|
|
4398
|
+
name = excluded.name,
|
|
4399
|
+
created_at = excluded.created_at,
|
|
4400
|
+
metadata_json = excluded.metadata_json`
|
|
4401
|
+
).bind(exp.id, exp.name, exp.createdAt, exp.metadata ? JSON.stringify(exp.metadata) : null).run();
|
|
4402
|
+
}
|
|
4403
|
+
async getExperiment(id) {
|
|
4404
|
+
await this.ensureSchema();
|
|
4405
|
+
const row = await this.db.prepare(
|
|
4406
|
+
`SELECT id, name, created_at, metadata_json
|
|
4407
|
+
FROM ${this.experimentsTable}
|
|
4408
|
+
WHERE id = ?1`
|
|
4409
|
+
).bind(id).first();
|
|
4410
|
+
return row ? rowToExperiment(row) : null;
|
|
4411
|
+
}
|
|
4412
|
+
async listExperiments() {
|
|
4413
|
+
await this.ensureSchema();
|
|
4414
|
+
const { results } = await this.db.prepare(
|
|
4415
|
+
`SELECT id, name, created_at, metadata_json
|
|
4416
|
+
FROM ${this.experimentsTable}
|
|
4417
|
+
ORDER BY created_at DESC`
|
|
4418
|
+
).all();
|
|
4419
|
+
return results.map(rowToExperiment);
|
|
4420
|
+
}
|
|
4421
|
+
async saveRun(run) {
|
|
4422
|
+
await this.ensureSchema();
|
|
4423
|
+
await this.db.prepare(
|
|
4424
|
+
`INSERT INTO ${this.runsTable}(id, experiment_id, name, status, started_at, completed_at, config_json, report_json, error)
|
|
4425
|
+
VALUES (?1, ?2, ?3, ?4, ?5, ?6, ?7, ?8, ?9)
|
|
4426
|
+
ON CONFLICT(id) DO UPDATE SET
|
|
4427
|
+
experiment_id = excluded.experiment_id,
|
|
4428
|
+
name = excluded.name,
|
|
4429
|
+
status = excluded.status,
|
|
4430
|
+
started_at = excluded.started_at,
|
|
4431
|
+
completed_at = excluded.completed_at,
|
|
4432
|
+
config_json = excluded.config_json,
|
|
4433
|
+
report_json = excluded.report_json,
|
|
4434
|
+
error = excluded.error`
|
|
4435
|
+
).bind(
|
|
4436
|
+
run.id,
|
|
4437
|
+
run.experimentId,
|
|
4438
|
+
run.name ?? null,
|
|
4439
|
+
run.status,
|
|
4440
|
+
run.startedAt,
|
|
4441
|
+
run.completedAt ?? null,
|
|
4442
|
+
JSON.stringify(run.config),
|
|
4443
|
+
run.report ? JSON.stringify(run.report) : null,
|
|
4444
|
+
run.error ?? null
|
|
4445
|
+
).run();
|
|
3255
4446
|
}
|
|
3256
|
-
|
|
3257
|
-
|
|
4447
|
+
async getRun(id) {
|
|
4448
|
+
await this.ensureSchema();
|
|
4449
|
+
const row = await this.db.prepare(
|
|
4450
|
+
`SELECT id, experiment_id, name, status, started_at, completed_at, config_json, report_json, error
|
|
4451
|
+
FROM ${this.runsTable}
|
|
4452
|
+
WHERE id = ?1`
|
|
4453
|
+
).bind(id).first();
|
|
4454
|
+
return row ? rowToRun(row) : null;
|
|
3258
4455
|
}
|
|
3259
|
-
|
|
3260
|
-
|
|
4456
|
+
async listRuns(experimentId) {
|
|
4457
|
+
await this.ensureSchema();
|
|
4458
|
+
const { results } = await this.db.prepare(
|
|
4459
|
+
`SELECT id, experiment_id, name, status, started_at, completed_at, config_json, report_json, error
|
|
4460
|
+
FROM ${this.runsTable}
|
|
4461
|
+
WHERE experiment_id = ?1
|
|
4462
|
+
ORDER BY started_at DESC`
|
|
4463
|
+
).bind(experimentId).all();
|
|
4464
|
+
return results.map(rowToRun);
|
|
3261
4465
|
}
|
|
3262
4466
|
};
|
|
3263
|
-
function
|
|
3264
|
-
return
|
|
3265
|
-
|
|
3266
|
-
|
|
3267
|
-
|
|
3268
|
-
|
|
3269
|
-
|
|
3270
|
-
}
|
|
3271
|
-
function isBlockingJudge(span) {
|
|
3272
|
-
return span.attributes?.blocking === true || span.attributes?.verdict === "BLOCKING" || positiveNumber(span.attributes?.blockingFindings) || positiveNumber(span.attributes?.highFindings) || span.score <= 2;
|
|
4467
|
+
function rowToExperiment(row) {
|
|
4468
|
+
return {
|
|
4469
|
+
id: row.id,
|
|
4470
|
+
name: row.name,
|
|
4471
|
+
createdAt: row.created_at,
|
|
4472
|
+
...row.metadata_json ? { metadata: JSON.parse(row.metadata_json) } : {}
|
|
4473
|
+
};
|
|
3273
4474
|
}
|
|
3274
|
-
function
|
|
3275
|
-
return
|
|
4475
|
+
function rowToRun(row) {
|
|
4476
|
+
return {
|
|
4477
|
+
id: row.id,
|
|
4478
|
+
experimentId: row.experiment_id,
|
|
4479
|
+
...row.name ? { name: row.name } : {},
|
|
4480
|
+
status: row.status,
|
|
4481
|
+
startedAt: row.started_at,
|
|
4482
|
+
...row.completed_at ? { completedAt: row.completed_at } : {},
|
|
4483
|
+
config: JSON.parse(row.config_json),
|
|
4484
|
+
...row.report_json ? { report: JSON.parse(row.report_json) } : {},
|
|
4485
|
+
...row.error ? { error: row.error } : {}
|
|
4486
|
+
};
|
|
3276
4487
|
}
|
|
3277
4488
|
|
|
4489
|
+
// src/experiment-tracker-fs.ts
|
|
4490
|
+
var FileSystemExperimentStore = class {
|
|
4491
|
+
dir;
|
|
4492
|
+
maxBytes;
|
|
4493
|
+
index;
|
|
4494
|
+
loaded = false;
|
|
4495
|
+
constructor(options) {
|
|
4496
|
+
this.dir = options.dir;
|
|
4497
|
+
this.maxBytes = options.maxBytes ?? 32 * 1024 * 1024;
|
|
4498
|
+
}
|
|
4499
|
+
async saveExperiment(exp) {
|
|
4500
|
+
const idx = await this.load();
|
|
4501
|
+
await idx.saveExperiment(exp);
|
|
4502
|
+
await this.append("experiments", exp);
|
|
4503
|
+
}
|
|
4504
|
+
async getExperiment(id) {
|
|
4505
|
+
const idx = await this.load();
|
|
4506
|
+
return idx.getExperiment(id);
|
|
4507
|
+
}
|
|
4508
|
+
async listExperiments() {
|
|
4509
|
+
const idx = await this.load();
|
|
4510
|
+
return idx.listExperiments();
|
|
4511
|
+
}
|
|
4512
|
+
async saveRun(run) {
|
|
4513
|
+
const idx = await this.load();
|
|
4514
|
+
await idx.saveRun(run);
|
|
4515
|
+
await this.append("runs", run);
|
|
4516
|
+
}
|
|
4517
|
+
async getRun(id) {
|
|
4518
|
+
const idx = await this.load();
|
|
4519
|
+
return idx.getRun(id);
|
|
4520
|
+
}
|
|
4521
|
+
async listRuns(experimentId) {
|
|
4522
|
+
const idx = await this.load();
|
|
4523
|
+
return idx.listRuns(experimentId);
|
|
4524
|
+
}
|
|
4525
|
+
async ensureDir() {
|
|
4526
|
+
const fs2 = await import("fs/promises");
|
|
4527
|
+
await fs2.mkdir(this.dir, { recursive: true });
|
|
4528
|
+
}
|
|
4529
|
+
async append(name, record) {
|
|
4530
|
+
await this.ensureDir();
|
|
4531
|
+
const fs2 = await import("fs/promises");
|
|
4532
|
+
const path = await import("path");
|
|
4533
|
+
const active = path.join(this.dir, `${name}.ndjson`);
|
|
4534
|
+
try {
|
|
4535
|
+
const stat = await fs2.stat(active);
|
|
4536
|
+
if (stat.size >= this.maxBytes) {
|
|
4537
|
+
const rolled = path.join(this.dir, `${name}.${Date.now()}.ndjson`);
|
|
4538
|
+
await fs2.rename(active, rolled);
|
|
4539
|
+
}
|
|
4540
|
+
} catch {
|
|
4541
|
+
}
|
|
4542
|
+
await fs2.appendFile(active, `${JSON.stringify(record)}
|
|
4543
|
+
`, "utf8");
|
|
4544
|
+
}
|
|
4545
|
+
async load() {
|
|
4546
|
+
if (this.loaded && this.index) return this.index;
|
|
4547
|
+
const fs2 = await import("fs/promises");
|
|
4548
|
+
const path = await import("path");
|
|
4549
|
+
const store = new InMemoryExperimentStore();
|
|
4550
|
+
try {
|
|
4551
|
+
const entries = await fs2.readdir(this.dir);
|
|
4552
|
+
const sorted = entries.filter((f) => f.endsWith(".ndjson")).sort((a, b) => a.localeCompare(b));
|
|
4553
|
+
for (const file of sorted) {
|
|
4554
|
+
const full = path.join(this.dir, file);
|
|
4555
|
+
const content = await fs2.readFile(full, "utf8");
|
|
4556
|
+
const base = file.split(".")[0];
|
|
4557
|
+
for (const line of content.split("\n")) {
|
|
4558
|
+
if (!line.trim()) continue;
|
|
4559
|
+
let record;
|
|
4560
|
+
try {
|
|
4561
|
+
record = JSON.parse(line);
|
|
4562
|
+
} catch {
|
|
4563
|
+
continue;
|
|
4564
|
+
}
|
|
4565
|
+
if (base === "experiments") {
|
|
4566
|
+
await store.saveExperiment(record);
|
|
4567
|
+
} else if (base === "runs") {
|
|
4568
|
+
await store.saveRun(record);
|
|
4569
|
+
}
|
|
4570
|
+
}
|
|
4571
|
+
}
|
|
4572
|
+
} catch {
|
|
4573
|
+
}
|
|
4574
|
+
this.index = store;
|
|
4575
|
+
this.loaded = true;
|
|
4576
|
+
return store;
|
|
4577
|
+
}
|
|
4578
|
+
};
|
|
4579
|
+
|
|
3278
4580
|
// src/harness-optimizer.ts
|
|
3279
4581
|
var DEFAULT_HARNESS_OBJECTIVES = [
|
|
3280
4582
|
{ name: "aggregate", direction: "maximize", value: (r) => r.aggregateMean },
|
|
@@ -3944,7 +5246,7 @@ function assertNonNegative(n, name) {
|
|
|
3944
5246
|
}
|
|
3945
5247
|
|
|
3946
5248
|
// src/muffled-gate-scanner.ts
|
|
3947
|
-
import { existsSync, readdirSync, readFileSync, statSync } from "fs";
|
|
5249
|
+
import { existsSync as existsSync3, readdirSync, readFileSync as readFileSync2, statSync } from "fs";
|
|
3948
5250
|
import { join } from "path";
|
|
3949
5251
|
function codeOf(line) {
|
|
3950
5252
|
return line.replace(/\/\/.*$/, "").replace(/^\s*\*.*$/, "");
|
|
@@ -4058,7 +5360,7 @@ function autoDeriveImporters(repoRoot, roots, extensions, importsContain) {
|
|
|
4058
5360
|
const matches = [];
|
|
4059
5361
|
const walk = (rel) => {
|
|
4060
5362
|
const abs = join(repoRoot, rel);
|
|
4061
|
-
if (!
|
|
5363
|
+
if (!existsSync3(abs)) return;
|
|
4062
5364
|
for (const entry of readdirSync(abs)) {
|
|
4063
5365
|
const sub = join(rel, entry);
|
|
4064
5366
|
const subAbs = join(repoRoot, sub);
|
|
@@ -4077,7 +5379,7 @@ function autoDeriveImporters(repoRoot, roots, extensions, importsContain) {
|
|
|
4077
5379
|
continue;
|
|
4078
5380
|
let text;
|
|
4079
5381
|
try {
|
|
4080
|
-
text =
|
|
5382
|
+
text = readFileSync2(subAbs, "utf8");
|
|
4081
5383
|
} catch {
|
|
4082
5384
|
continue;
|
|
4083
5385
|
}
|
|
@@ -4093,8 +5395,8 @@ function scanForMuffledGates(opts) {
|
|
|
4093
5395
|
const scanned = /* @__PURE__ */ new Set();
|
|
4094
5396
|
for (const file of opts.scanFiles) {
|
|
4095
5397
|
const abs = join(opts.repoRoot, file);
|
|
4096
|
-
if (!
|
|
4097
|
-
const text =
|
|
5398
|
+
if (!existsSync3(abs)) continue;
|
|
5399
|
+
const text = readFileSync2(abs, "utf8");
|
|
4098
5400
|
for (const find of opts.finders) findings.push(...find(file, text));
|
|
4099
5401
|
scanned.add(file);
|
|
4100
5402
|
}
|
|
@@ -4108,8 +5410,8 @@ function scanForMuffledGates(opts) {
|
|
|
4108
5410
|
for (const file of importers) {
|
|
4109
5411
|
if (scanned.has(file)) continue;
|
|
4110
5412
|
const abs = join(opts.repoRoot, file);
|
|
4111
|
-
if (!
|
|
4112
|
-
const text =
|
|
5413
|
+
if (!existsSync3(abs)) continue;
|
|
5414
|
+
const text = readFileSync2(abs, "utf8");
|
|
4113
5415
|
for (const find of opts.autoDerive.universalFinders) findings.push(...find(file, text));
|
|
4114
5416
|
}
|
|
4115
5417
|
}
|
|
@@ -4956,326 +6258,6 @@ function seededShuffle(items, seed) {
|
|
|
4956
6258
|
return out;
|
|
4957
6259
|
}
|
|
4958
6260
|
|
|
4959
|
-
// src/judge-calibration.ts
|
|
4960
|
-
function calibrateJudge(golden, candidate) {
|
|
4961
|
-
const map = /* @__PURE__ */ new Map();
|
|
4962
|
-
for (const g of golden) map.set(g.itemId, { h: g.humanScore, j: NaN });
|
|
4963
|
-
for (const c of candidate) {
|
|
4964
|
-
const entry = map.get(c.itemId);
|
|
4965
|
-
if (entry) entry.j = c.score;
|
|
4966
|
-
}
|
|
4967
|
-
const common = [...map.values()].filter((v) => Number.isFinite(v.j));
|
|
4968
|
-
const n = common.length;
|
|
4969
|
-
if (n < 2) {
|
|
4970
|
-
return { n, pearson: NaN, kappa: NaN, mae: NaN, worstItems: [] };
|
|
4971
|
-
}
|
|
4972
|
-
const humans = common.map((c) => c.h);
|
|
4973
|
-
const judges = common.map((c) => c.j);
|
|
4974
|
-
const pearson = pearsonR(humans, judges);
|
|
4975
|
-
const kappa = weightedKappa(humans.map(Math.round), judges.map(Math.round));
|
|
4976
|
-
const absDiffs = common.map((c) => Math.abs(c.j - c.h));
|
|
4977
|
-
const mae = absDiffs.reduce((a, b) => a + b, 0) / n;
|
|
4978
|
-
const worst2 = [...map.entries()].filter(([, v]) => Number.isFinite(v.j)).map(([itemId, v]) => ({ itemId, judge: v.j, human: v.h, delta: Math.abs(v.j - v.h) })).sort((a, b) => b.delta - a.delta).slice(0, 5);
|
|
4979
|
-
return { n, pearson, kappa, mae, worstItems: worst2 };
|
|
4980
|
-
}
|
|
4981
|
-
function positionalBias(scores) {
|
|
4982
|
-
const pairs = /* @__PURE__ */ new Map();
|
|
4983
|
-
for (const s of scores) {
|
|
4984
|
-
const slot = pairs.get(s.itemId) ?? {};
|
|
4985
|
-
if (s.positionOfAInput === "first") slot.first = s.score;
|
|
4986
|
-
else if (s.positionOfAInput === "second") slot.second = s.score;
|
|
4987
|
-
pairs.set(s.itemId, slot);
|
|
4988
|
-
}
|
|
4989
|
-
const deltas = [];
|
|
4990
|
-
for (const { first, second } of pairs.values()) {
|
|
4991
|
-
if (first !== void 0 && second !== void 0) deltas.push(first - second);
|
|
4992
|
-
}
|
|
4993
|
-
if (deltas.length === 0) return { avgDelta: 0, n: 0 };
|
|
4994
|
-
return { avgDelta: deltas.reduce((a, b) => a + b, 0) / deltas.length, n: deltas.length };
|
|
4995
|
-
}
|
|
4996
|
-
function verbosityBias(samples) {
|
|
4997
|
-
const n = samples.length;
|
|
4998
|
-
if (n < 3) return { pearson: NaN, n };
|
|
4999
|
-
return {
|
|
5000
|
-
pearson: pearsonR(
|
|
5001
|
-
samples.map((s) => s.outputLen),
|
|
5002
|
-
samples.map((s) => s.score)
|
|
5003
|
-
),
|
|
5004
|
-
n
|
|
5005
|
-
};
|
|
5006
|
-
}
|
|
5007
|
-
function selfPreference(samples) {
|
|
5008
|
-
const inF = samples.filter((s) => s.inFamily).map((s) => s.score);
|
|
5009
|
-
const outF = samples.filter((s) => !s.inFamily).map((s) => s.score);
|
|
5010
|
-
if (inF.length === 0 || outF.length === 0)
|
|
5011
|
-
return { inFamilyMean: 0, outOfFamilyMean: 0, deltaMean: 0, n: 0 };
|
|
5012
|
-
const inMean = inF.reduce((a, b) => a + b, 0) / inF.length;
|
|
5013
|
-
const outMean = outF.reduce((a, b) => a + b, 0) / outF.length;
|
|
5014
|
-
return {
|
|
5015
|
-
inFamilyMean: inMean,
|
|
5016
|
-
outOfFamilyMean: outMean,
|
|
5017
|
-
deltaMean: inMean - outMean,
|
|
5018
|
-
n: samples.length
|
|
5019
|
-
};
|
|
5020
|
-
}
|
|
5021
|
-
function pearsonR(a, b) {
|
|
5022
|
-
if (a.length !== b.length || a.length < 2) return NaN;
|
|
5023
|
-
const mA = a.reduce((s, v) => s + v, 0) / a.length;
|
|
5024
|
-
const mB = b.reduce((s, v) => s + v, 0) / b.length;
|
|
5025
|
-
let num = 0, dA = 0, dB = 0;
|
|
5026
|
-
for (let i = 0; i < a.length; i++) {
|
|
5027
|
-
const da = a[i] - mA;
|
|
5028
|
-
const db = b[i] - mB;
|
|
5029
|
-
num += da * db;
|
|
5030
|
-
dA += da * da;
|
|
5031
|
-
dB += db * db;
|
|
5032
|
-
}
|
|
5033
|
-
if (dA === 0 || dB === 0) return dA === 0 && dB === 0 ? 1 : 0;
|
|
5034
|
-
return num / Math.sqrt(dA * dB);
|
|
5035
|
-
}
|
|
5036
|
-
/**
 * Quadratic-weighted kappa between two raters on an ordinal scale.
 *
 * The scale is inferred from the data: categories are the consecutive values
 * from the smallest to the largest rating seen in either array. Ratings are
 * used (after subtracting the minimum) as matrix indices, so they are
 * expected to be integers.
 *
 * @param {number[]} a - ratings from the first rater.
 * @param {number[]} b - ratings from the second rater, aligned with `a`.
 * @returns {number} 1 minus the ratio of observed to chance-expected weighted
 *   disagreement. NaN when the lengths differ or the inputs are empty; 1 when
 *   only one category occurs or the expected disagreement is zero.
 */
function weightedKappa(a, b) {
  if (a.length !== b.length || a.length === 0) return NaN;
  // Fold rather than Math.min(...a, ...b): spreading passes every rating as a
  // separate call argument and throws RangeError once the arrays get large.
  let min = Infinity;
  let max = -Infinity;
  for (const ratings of [a, b]) {
    for (const rating of ratings) {
      min = Math.min(min, rating);
      max = Math.max(max, rating);
    }
  }
  const K = max - min + 1;
  if (K < 2) return 1;
  const observed = Array.from({ length: K }, () => new Array(K).fill(0));
  const rowMarg = new Array(K).fill(0);
  const colMarg = new Array(K).fill(0);
  for (let i = 0; i < a.length; i++) {
    const ai = a[i] - min;
    const bi = b[i] - min;
    const row = observed[ai];
    row[bi] = (row[bi] ?? 0) + 1;
    rowMarg[ai]++;
    colMarg[bi]++;
  }
  let num = 0;
  let den = 0;
  for (let i = 0; i < K; i++) {
    for (let j = 0; j < K; j++) {
      // Quadratic penalty, normalised so the two extreme categories weigh 1.
      const w = (i - j) ** 2 / (K - 1) ** 2;
      // Count expected in cell (i, j) if the raters were independent.
      const expected = rowMarg[i] * colMarg[j] / a.length;
      num += w * observed[i][j];
      den += w * expected;
    }
  }
  if (den === 0) return 1;
  return 1 - num / den;
}
|
|
5066
|
-
/**
 * Inter-rater agreement statistics for a matrix of continuous scores, with
 * optional percentile-bootstrap confidence intervals.
 *
 * @param {number[][]} scores - one row per item, one column per rater. Rows
 *   with fewer than two entries, with any non-finite entry, or whose length
 *   differs from the first surviving row are dropped.
 * @param {object} [opts]
 * @param {number} [opts.bootstrap=1000] - number of bootstrap resamples; 0 or
 *   less disables the confidence intervals.
 * @param {string} [opts.weights="quadratic"] - disagreement scheme forwarded
 *   to `continuousWeightedKappa`.
 * @param {number} [opts.seed=12648430] - seed for the resampling PRNG.
 * @param {number} [opts.ciLevel=0.95] - confidence level of the intervals.
 * @returns {{weightedKappa: number, icc: number, pearson: number, spearman: number,
 *   ci: {icc: number[], weightedKappa: number[]}, n: number, raters: number}}
 *   Point estimates, `[lo, hi]` intervals (NaN when not computed), the number
 *   of usable rows and the rater count. All estimates are NaN when fewer than
 *   two usable rows or raters remain.
 */
function continuousAgreement(scores, opts = {}) {
  const resamples = opts.bootstrap ?? 1e3;
  const scheme = opts.weights ?? "quadratic";
  const rngSeed = opts.seed ?? 0xc0ffee;
  const level = opts.ciLevel ?? 0.95;

  const finiteRows = scores.filter(
    (row) => row.length >= 2 && row.every((v) => Number.isFinite(v))
  );
  const raters = finiteRows[0]?.length ?? 0;
  const clean = finiteRows.filter((row) => row.length === raters);
  const nClean = clean.length;
  const ci = { icc: [NaN, NaN], weightedKappa: [NaN, NaN] };

  if (nClean < 2 || raters < 2) {
    return {
      weightedKappa: NaN,
      icc: NaN,
      pearson: NaN,
      spearman: NaN,
      ci,
      n: nClean,
      raters
    };
  }

  const kappa = continuousWeightedKappa(clean, scheme);
  const icc = icc21(clean);
  const pearson = avgPairwise(clean, pearsonR);
  const spearman = avgPairwise(clean, spearmanR);

  if (resamples > 0) {
    const rng = mulberry32(rngSeed);
    const iccDraws = [];
    const kappaDraws = [];
    for (let round = 0; round < resamples; round++) {
      // Resample items (rows) with replacement; raters stay paired per row.
      const resample = Array.from(
        { length: nClean },
        () => clean[Math.floor(rng() * nClean)]
      );
      const iccDraw = icc21(resample);
      const kappaDraw = continuousWeightedKappa(resample, scheme);
      // Degenerate resamples can yield NaN; leave them out of the interval.
      if (Number.isFinite(iccDraw)) iccDraws.push(iccDraw);
      if (Number.isFinite(kappaDraw)) kappaDraws.push(kappaDraw);
    }
    const [lo, hi] = percentileBounds(level);
    for (const [draws, bounds] of [[iccDraws, ci.icc], [kappaDraws, ci.weightedKappa]]) {
      if (draws.length === 0) continue;
      draws.sort((x, y) => x - y);
      bounds[0] = quantile(draws, lo);
      bounds[1] = quantile(draws, hi);
    }
  }

  return {
    weightedKappa: kappa,
    icc,
    pearson,
    spearman,
    ci,
    n: nClean,
    raters
  };
}
|
|
5128
|
-
/**
 * Extend the `calibrateJudge` result with continuous agreement statistics
 * between human scores and a candidate judge's scores.
 *
 * Items are paired by `itemId`. Candidate scores for ids missing from
 * `golden` are ignored, and golden items without a finite candidate score are
 * left out of the agreement computation. If an id repeats, the last entry
 * wins.
 *
 * @param {Array<{itemId: *, humanScore: number}>} golden - human reference scores.
 * @param {Array<{itemId: *, score: number}>} candidate - judge scores.
 * @param {object} [opts] - forwarded unchanged to `continuousAgreement`.
 * @returns {object} Everything `calibrateJudge` returned, plus
 *   `weightedKappaContinuous`, `icc`, `spearman` and `ci`.
 */
function calibrateJudgeContinuous(golden, candidate, opts = {}) {
  const base = calibrateJudge(golden, candidate);

  const humanByItem = /* @__PURE__ */ new Map();
  for (const item of golden) humanByItem.set(item.itemId, item.humanScore);

  const judgeByItem = /* @__PURE__ */ new Map();
  for (const item of candidate) {
    if (humanByItem.has(item.itemId)) judgeByItem.set(item.itemId, item.score);
  }

  // One [human, judge] row per golden item that received a usable judge score.
  const rows = [];
  for (const [itemId, humanScore] of humanByItem) {
    const judgeScore = judgeByItem.get(itemId);
    if (Number.isFinite(judgeScore)) rows.push([humanScore, judgeScore]);
  }

  const { weightedKappa, icc, spearman, ci } = continuousAgreement(rows, opts);
  return {
    ...base,
    weightedKappaContinuous: weightedKappa,
    icc,
    spearman,
    ci
  };
}
|
|
5149
|
-
/**
 * Weighted kappa for continuous scores, averaged over every pair of raters.
 *
 * For each rater pair, observed disagreement is the mean penalty between
 * scores given to the same item, and expected disagreement is the mean
 * penalty over all n*n cross pairings of the two raters' scores. The pair's
 * kappa is 1 - observed / expected.
 *
 * @param {number[][]} rows - one row per item, one column per rater; the
 *   rater count is taken from the first row.
 * @param {string} scheme - "linear" for absolute difference; any other value
 *   selects squared difference.
 * @returns {number} Mean pairwise kappa. NaN when there are no rows or fewer
 *   than two raters. A pair whose expected disagreement is zero counts as 1
 *   when the observed disagreement is also zero, otherwise 0.
 */
function continuousWeightedKappa(rows, scheme) {
  if (rows.length === 0) return NaN;
  const raters = rows[0].length;
  if (raters < 2) return NaN;
  const penalty = scheme === "linear" ? (x, y) => Math.abs(x - y) : (x, y) => (x - y) ** 2;
  const n = rows.length;
  // Extract each rater's column once instead of once per rater pair.
  const columns = Array.from({ length: raters }, (_, r) => rows.map((row) => row[r]));
  let kappaSum = 0;
  let pairs = 0;
  for (let r1 = 0; r1 < raters; r1++) {
    for (let r2 = r1 + 1; r2 < raters; r2++) {
      const a = columns[r1];
      const b = columns[r2];
      let obs = 0;
      for (let i = 0; i < n; i++) obs += penalty(a[i], b[i]);
      obs /= n;
      // Chance-level disagreement: every score of one rater against every
      // score of the other (quadratic in the number of items).
      let exp = 0;
      for (let i = 0; i < n; i++) {
        for (let j = 0; j < n; j++) exp += penalty(a[i], b[j]);
      }
      exp /= n * n;
      if (exp === 0) {
        kappaSum += obs === 0 ? 1 : 0;
      } else {
        kappaSum += 1 - obs / exp;
      }
      pairs++;
    }
  }
  return pairs === 0 ? NaN : kappaSum / pairs;
}
|
|
5179
|
-
/**
 * Intraclass correlation ICC(2,1) computed from a two-way ANOVA
 * decomposition of an items-by-raters score matrix.
 *
 * @param {number[][]} rows - one row per item, one column per rater; the
 *   rater count is taken from the first row.
 * @returns {number} (MSR - MSE) / (MSR + (k-1)*MSE + k*(MSC - MSE)/n). NaN with
 *   fewer than two rows or two raters. When the denominator is exactly zero
 *   the result is 1 if both MSR and MSE are zero, otherwise 0.
 */
function icc21(rows) {
  const n = rows.length;
  if (n < 2) return NaN;
  const k = rows[0].length;
  if (k < 2) return NaN;

  // Per-item (row) and per-rater (column) means.
  const rowMeans = rows.map((row) => row.reduce((acc, v) => acc + v, 0) / k);
  const colMeans = Array.from({ length: k }, (_, j) => {
    let total = 0;
    for (const row of rows) total += row[j];
    return total / n;
  });

  let grand = 0;
  for (const m of rowMeans) grand += m;
  grand /= n;

  // Sums of squares: between items, between raters, and total.
  let rowSq = 0;
  for (const m of rowMeans) rowSq += (m - grand) ** 2;
  const ssR = rowSq * k;

  let colSq = 0;
  for (const m of colMeans) colSq += (m - grand) ** 2;
  const ssC = colSq * n;

  let ssT = 0;
  for (const row of rows) {
    for (let j = 0; j < k; j++) ssT += (row[j] - grand) ** 2;
  }
  const ssE = ssT - ssR - ssC;

  // Mean squares.
  const dfE = (n - 1) * (k - 1);
  const msR = ssR / (n - 1);
  const msC = ssC / (k - 1);
  const msE = dfE > 0 ? ssE / dfE : 0;

  const denom = msR + (k - 1) * msE + k * (msC - msE) / n;
  if (denom === 0) {
    return msR === 0 && msE === 0 ? 1 : 0;
  }
  return (msR - msE) / denom;
}
|
|
5217
|
-
/**
 * Average a two-series statistic over every unordered pair of rater columns.
 *
 * @param {number[][]} rows - one row per item, one column per rater; the
 *   rater count is taken from the first row.
 * @param {(a: number[], b: number[]) => number} fn - statistic applied to two
 *   columns; non-finite results are left out of the average.
 * @returns {number} Mean of the finite pair results, or NaN when there are
 *   fewer than two raters or no pair produced a finite value.
 */
function avgPairwise(rows, fn) {
  const k = rows[0]?.length ?? 0;
  if (k < 2) return NaN;
  const column = (index) => rows.map((row) => row[index]);
  const finiteResults = [];
  for (let first = 0; first < k; first++) {
    for (let second = first + 1; second < k; second++) {
      // Fresh arrays for every call, so `fn` never sees a shared column.
      const result = fn(column(first), column(second));
      if (Number.isFinite(result)) finiteResults.push(result);
    }
  }
  if (finiteResults.length === 0) return NaN;
  let total = 0;
  for (const value of finiteResults) total += value;
  return total / finiteResults.length;
}
|
|
5235
|
-
/**
 * Spearman rank correlation: the Pearson correlation of the tie-averaged
 * ranks of the two series.
 *
 * @param {number[]} a - first series.
 * @param {number[]} b - second series, aligned with `a`.
 * @returns {number} Rank correlation, or NaN when the lengths differ or
 *   fewer than two points are given.
 */
function spearmanR(a, b) {
  if (a.length !== b.length || a.length < 2) {
    return NaN;
  }
  const ranksA = rankWithTies(a);
  const ranksB = rankWithTies(b);
  return pearsonR(ranksA, ranksB);
}
|
|
5239
|
-
/**
 * Assign 1-based ascending ranks, giving tied values the average of the
 * positions they occupy.
 *
 * @param {number[]} xs - values to rank; not modified.
 * @returns {number[]} Ranks aligned with `xs` (e.g. [10, 20, 20, 30] gives
 *   [1, 2.5, 2.5, 4]).
 */
function rankWithTies(xs) {
  const n = xs.length;
  // Original positions, ordered by the value found at each position.
  const order = xs.map((value, index) => ({ value, index }));
  order.sort((left, right) => left.value - right.value);
  const ranks = new Array(n).fill(0);
  let start = 0;
  while (start < n) {
    // Extend [start, end] over the run of values equal to order[start].
    let end = start;
    while (end + 1 < n && order[end + 1].value === order[start].value) end++;
    const shared = (start + end) / 2 + 1;
    for (let pos = start; pos <= end; pos++) {
      ranks[order[pos].index] = shared;
    }
    start = end + 1;
  }
  return ranks;
}
|
|
5254
|
-
/**
 * Create a small seeded pseudo-random number generator with 32 bits of
 * state. Not suitable for security purposes.
 *
 * @param {number} seed - coerced to an unsigned 32-bit integer.
 * @returns {() => number} Generator returning values in [0, 1); the same
 *   seed always yields the same sequence.
 */
function mulberry32(seed) {
  let state = seed >>> 0;
  return () => {
    state = (state + 0x6d2b79f5) >>> 0;
    let mixed = Math.imul(state ^ (state >>> 15), state | 1);
    mixed ^= mixed + Math.imul(mixed ^ (mixed >>> 7), mixed | 61);
    // Reinterpret as unsigned and scale by 2^32 to land in [0, 1).
    return ((mixed ^ (mixed >>> 14)) >>> 0) / 0x100000000;
  };
}
|
|
5264
|
-
/**
 * Lower and upper quantile levels of a symmetric two-sided interval.
 *
 * @param {number} ciLevel - confidence level, e.g. 0.95.
 * @returns {number[]} `[tail, 1 - tail]` where `tail = (1 - ciLevel) / 2`.
 */
function percentileBounds(ciLevel) {
  const lower = (1 - ciLevel) / 2;
  const upper = 1 - lower;
  return [lower, upper];
}
|
|
5268
|
-
/**
 * Quantile of an already-sorted array, linearly interpolating between the
 * two nearest order statistics.
 *
 * @param {number[]} sorted - values in ascending order.
 * @param {number} q - quantile level; 0 maps to the first element and 1 to
 *   the last.
 * @returns {number} The interpolated value, or NaN for an empty array.
 */
function quantile(sorted, q) {
  switch (sorted.length) {
    case 0:
      return NaN;
    case 1:
      return sorted[0];
    default:
      break;
  }
  // Fractional index into the array.
  const position = q * (sorted.length - 1);
  const below = Math.floor(position);
  const above = Math.ceil(position);
  if (below === above) return sorted[below];
  const weight = position - below;
  return sorted[below] * (1 - weight) + sorted[above] * weight;
}
|
|
5278
|
-
|
|
5279
6261
|
// src/observability.ts
|
|
5280
6262
|
async function toLangfuseEnvelope(store, runId) {
|
|
5281
6263
|
const run = await store.getRun(runId);
|
|
@@ -6077,7 +7059,7 @@ async function proposeSynthesisTargets(dataset, traceStore, options = {}) {
|
|
|
6077
7059
|
runCountByScenario.set(r.scenarioId, (runCountByScenario.get(r.scenarioId) ?? 0) + 1);
|
|
6078
7060
|
}
|
|
6079
7061
|
const runCounts = [...runCountByScenario.values()];
|
|
6080
|
-
const p25 = runCounts.length > 0 ?
|
|
7062
|
+
const p25 = runCounts.length > 0 ? quantile(runCounts, 0.25) : 0;
|
|
6081
7063
|
for (const s of scenarios) {
|
|
6082
7064
|
const count = runCountByScenario.get(s.id) ?? 0;
|
|
6083
7065
|
if (count <= p25 && count < 3) {
|
|
@@ -6131,7 +7113,7 @@ async function proposeSynthesisTargets(dataset, traceStore, options = {}) {
|
|
|
6131
7113
|
}
|
|
6132
7114
|
return targets.sort((a, b) => b.priority - a.priority).slice(0, topK);
|
|
6133
7115
|
}
|
|
6134
|
-
function
|
|
7116
|
+
function quantile(xs, p) {
|
|
6135
7117
|
const sorted = [...xs].sort((a, b) => a - b);
|
|
6136
7118
|
const idx = p * (sorted.length - 1);
|
|
6137
7119
|
const lo = Math.floor(idx);
|
|
@@ -6326,7 +7308,7 @@ async function runSelfPlay(proposer, scorer, targets, options = {}) {
|
|
|
6326
7308
|
|
|
6327
7309
|
// src/command-runner.ts
|
|
6328
7310
|
import { spawnSync } from "child_process";
|
|
6329
|
-
import { existsSync as
|
|
7311
|
+
import { existsSync as existsSync4, readdirSync as readdirSync2, readFileSync as readFileSync3, statSync as statSync2 } from "fs";
|
|
6330
7312
|
import { join as join2 } from "path";
|
|
6331
7313
|
var localCommandRunner = {
|
|
6332
7314
|
name: "local",
|
|
@@ -6355,11 +7337,11 @@ var localCommandRunner = {
|
|
|
6355
7337
|
return r.status === 0 && (r.stdout ?? "").trim().length > 0;
|
|
6356
7338
|
},
|
|
6357
7339
|
async fileExists(path) {
|
|
6358
|
-
return
|
|
7340
|
+
return existsSync4(path);
|
|
6359
7341
|
},
|
|
6360
7342
|
async readFile(path) {
|
|
6361
7343
|
try {
|
|
6362
|
-
return
|
|
7344
|
+
return readFileSync3(path, "utf8");
|
|
6363
7345
|
} catch {
|
|
6364
7346
|
return null;
|
|
6365
7347
|
}
|
|
@@ -6713,11 +7695,11 @@ function flowLayer(input) {
|
|
|
6713
7695
|
|
|
6714
7696
|
// src/intent-match-judge.ts
|
|
6715
7697
|
var INTENT_MATCH_JUDGE_VERSION = "intent-match-judge-v1-2026-04-24";
|
|
6716
|
-
var
|
|
6717
|
-
var
|
|
6718
|
-
var
|
|
6719
|
-
var
|
|
6720
|
-
var
|
|
7698
|
+
var DEFAULT_MODEL2 = "claude-sonnet-4-6";
|
|
7699
|
+
var DEFAULT_TIMEOUT2 = 9e4;
|
|
7700
|
+
var DEFAULT_MAX_SOURCE2 = 25e3;
|
|
7701
|
+
var DEFAULT_MAX_PER_FILE2 = 12e3;
|
|
7702
|
+
var DEFAULT_MAX_HTML2 = 2e4;
|
|
6721
7703
|
var INTENT_SCHEMA = {
|
|
6722
7704
|
type: "object",
|
|
6723
7705
|
additionalProperties: false,
|
|
@@ -6727,12 +7709,12 @@ var INTENT_SCHEMA = {
|
|
|
6727
7709
|
evidence: { type: "string", minLength: 10, maxLength: 400 }
|
|
6728
7710
|
}
|
|
6729
7711
|
};
|
|
6730
|
-
function
|
|
7712
|
+
/**
 * Cap a text at `cap` characters, appending a notice with how much was cut.
 *
 * @param {string} body - text to cap.
 * @param {number} cap - maximum number of characters to keep.
 * @param {string} label - name of the content, used in the notice.
 * @returns {string} `body` unchanged when it fits; otherwise its first `cap`
 *   characters followed by a newline and a truncation notice.
 */
function truncate2(body, cap, label) {
  if (body.length <= cap) {
    return body;
  }
  const kept = body.slice(0, cap);
  const notice = `\u2026 [truncated ${body.length - cap} chars of ${label}]`;
  return `${kept}\n${notice}`;
}
|
|
6735
|
-
function
|
|
7717
|
+
function buildPrompt2(input, opts) {
|
|
6736
7718
|
const sourceBlob = input.sourceFiles.filter((f) => f.content.length <= opts.maxPerFileChars).map((f) => `--- FILE: ${f.path} ---
|
|
6737
7719
|
${f.content}`).join("\n\n");
|
|
6738
7720
|
const html = input.servedHtml ?? "";
|
|
@@ -6751,10 +7733,10 @@ ${input.artifactLabel ? `ARTIFACT METADATA:
|
|
|
6751
7733
|
description: ${input.artifactDescription ?? ""}
|
|
6752
7734
|
|
|
6753
7735
|
` : ""}${html ? `SERVED HTML (what the preview returns):
|
|
6754
|
-
${
|
|
7736
|
+
${truncate2(html, opts.maxHtmlChars, "HTML")}
|
|
6755
7737
|
|
|
6756
7738
|
` : ""}SOURCE FILES (the agent's workdir):
|
|
6757
|
-
${
|
|
7739
|
+
${truncate2(sourceBlob, opts.maxSourceChars, "source")}
|
|
6758
7740
|
|
|
6759
7741
|
Score 0\u20131:
|
|
6760
7742
|
1.0 \u2014 unmistakably the right app. Even with bugs, gaps, or missing
|
|
@@ -6782,11 +7764,11 @@ Return STRICT JSON. No prose outside.`;
|
|
|
6782
7764
|
async function runIntentMatchJudge(input, options = {}) {
|
|
6783
7765
|
const start = Date.now();
|
|
6784
7766
|
const opts = {
|
|
6785
|
-
model: options.model ??
|
|
6786
|
-
timeoutMs: options.timeoutMs ??
|
|
6787
|
-
maxSourceChars: options.maxSourceChars ??
|
|
6788
|
-
maxPerFileChars: options.maxPerFileChars ??
|
|
6789
|
-
maxHtmlChars: options.maxHtmlChars ??
|
|
7767
|
+
model: options.model ?? DEFAULT_MODEL2,
|
|
7768
|
+
timeoutMs: options.timeoutMs ?? DEFAULT_TIMEOUT2,
|
|
7769
|
+
maxSourceChars: options.maxSourceChars ?? DEFAULT_MAX_SOURCE2,
|
|
7770
|
+
maxPerFileChars: options.maxPerFileChars ?? DEFAULT_MAX_PER_FILE2,
|
|
7771
|
+
maxHtmlChars: options.maxHtmlChars ?? DEFAULT_MAX_HTML2,
|
|
6790
7772
|
llm: options.llm ?? {}
|
|
6791
7773
|
};
|
|
6792
7774
|
if (input.sourceFiles.length === 0 && !input.servedHtml) {
|
|
@@ -6810,7 +7792,7 @@ async function runIntentMatchJudge(input, options = {}) {
|
|
|
6810
7792
|
role: "system",
|
|
6811
7793
|
content: "You are a holistic code reviewer answering one question: did the agent build the right app for the user. Return strict JSON. No prose outside."
|
|
6812
7794
|
},
|
|
6813
|
-
{ role: "user", content:
|
|
7795
|
+
{ role: "user", content: buildPrompt2(input, opts) }
|
|
6814
7796
|
],
|
|
6815
7797
|
jsonSchema: { name: "intent_match_judge", schema: INTENT_SCHEMA },
|
|
6816
7798
|
temperature: 0,
|
|
@@ -7270,72 +8252,29 @@ function multiToolchainLayer(config) {
|
|
|
7270
8252
|
{
|
|
7271
8253
|
severity: "major",
|
|
7272
8254
|
layer: config.name,
|
|
7273
|
-
message: err instanceof Error ? err.message : String(err),
|
|
7274
|
-
detail: { adapter: adapterName }
|
|
7275
|
-
}
|
|
7276
|
-
],
|
|
7277
|
-
reason: err instanceof Error ? err.message : String(err)
|
|
7278
|
-
}
|
|
7279
|
-
};
|
|
7280
|
-
}
|
|
7281
|
-
};
|
|
7282
|
-
const results = [];
|
|
7283
|
-
for (let i = 0; i < config.adapters.length; i += maxParallel) {
|
|
7284
|
-
const chunk = config.adapters.slice(i, i + maxParallel);
|
|
7285
|
-
const chunkResults = await Promise.all(chunk.map(runOne));
|
|
7286
|
-
results.push(...chunkResults);
|
|
7287
|
-
}
|
|
7288
|
-
return mergeLayerResults(config.name, results);
|
|
7289
|
-
}
|
|
7290
|
-
};
|
|
7291
|
-
}
|
|
7292
|
-
|
|
7293
|
-
// src/reference-replay.ts
|
|
7294
|
-
import { appendFileSync, existsSync as existsSync3, mkdirSync, readFileSync as readFileSync3 } from "fs";
|
|
7295
|
-
import { dirname } from "path";
|
|
7296
|
-
|
|
7297
|
-
// src/concurrency.ts
|
|
7298
|
-
var Mutex = class {
|
|
7299
|
-
locked = false;
|
|
7300
|
-
waiters = [];
|
|
7301
|
-
async acquire() {
|
|
7302
|
-
if (!this.locked) {
|
|
7303
|
-
this.locked = true;
|
|
7304
|
-
return () => this.release();
|
|
7305
|
-
}
|
|
7306
|
-
return new Promise((resolve) => {
|
|
7307
|
-
this.waiters.push(() => {
|
|
7308
|
-
resolve(() => this.release());
|
|
7309
|
-
});
|
|
7310
|
-
});
|
|
7311
|
-
}
|
|
7312
|
-
release() {
|
|
7313
|
-
const next = this.waiters.shift();
|
|
7314
|
-
if (next) {
|
|
7315
|
-
next();
|
|
7316
|
-
} else {
|
|
7317
|
-
this.locked = false;
|
|
7318
|
-
}
|
|
7319
|
-
}
|
|
7320
|
-
async runExclusive(fn) {
|
|
7321
|
-
const release = await this.acquire();
|
|
7322
|
-
try {
|
|
7323
|
-
return await fn();
|
|
7324
|
-
} finally {
|
|
7325
|
-
release();
|
|
8255
|
+
message: err instanceof Error ? err.message : String(err),
|
|
8256
|
+
detail: { adapter: adapterName }
|
|
8257
|
+
}
|
|
8258
|
+
],
|
|
8259
|
+
reason: err instanceof Error ? err.message : String(err)
|
|
8260
|
+
}
|
|
8261
|
+
};
|
|
8262
|
+
}
|
|
8263
|
+
};
|
|
8264
|
+
const results = [];
|
|
8265
|
+
for (let i = 0; i < config.adapters.length; i += maxParallel) {
|
|
8266
|
+
const chunk = config.adapters.slice(i, i + maxParallel);
|
|
8267
|
+
const chunkResults = await Promise.all(chunk.map(runOne));
|
|
8268
|
+
results.push(...chunkResults);
|
|
8269
|
+
}
|
|
8270
|
+
return mergeLayerResults(config.name, results);
|
|
7326
8271
|
}
|
|
7327
|
-
}
|
|
7328
|
-
|
|
7329
|
-
get isLocked() {
|
|
7330
|
-
return this.locked;
|
|
7331
|
-
}
|
|
7332
|
-
/** Pending waiter count. Diagnostics only. */
|
|
7333
|
-
get pending() {
|
|
7334
|
-
return this.waiters.length;
|
|
7335
|
-
}
|
|
7336
|
-
};
|
|
8272
|
+
};
|
|
8273
|
+
}
|
|
7337
8274
|
|
|
7338
8275
|
// src/reference-replay.ts
|
|
8276
|
+
import { appendFileSync as appendFileSync2, existsSync as existsSync5, mkdirSync as mkdirSync2, readFileSync as readFileSync4 } from "fs";
|
|
8277
|
+
import { dirname as dirname2 } from "path";
|
|
7339
8278
|
var DEFAULT_MATCH_THRESHOLD = 0.55;
|
|
7340
8279
|
var ALL_SPLITS = ["train", "dev", "test", "holdout"];
|
|
7341
8280
|
async function runReferenceReplay(cases, options) {
|
|
@@ -7453,14 +8392,14 @@ function jsonlReferenceReplayStore(path) {
|
|
|
7453
8392
|
return {
|
|
7454
8393
|
async save(run) {
|
|
7455
8394
|
await lock.runExclusive(() => {
|
|
7456
|
-
|
|
7457
|
-
|
|
8395
|
+
mkdirSync2(dirname2(path), { recursive: true });
|
|
8396
|
+
appendFileSync2(path, `${JSON.stringify(run)}
|
|
7458
8397
|
`);
|
|
7459
8398
|
});
|
|
7460
8399
|
},
|
|
7461
8400
|
async list() {
|
|
7462
8401
|
return lock.runExclusive(() => {
|
|
7463
|
-
if (!
|
|
8402
|
+
if (!existsSync5(path)) return [];
|
|
7464
8403
|
return readJsonl(path);
|
|
7465
8404
|
});
|
|
7466
8405
|
}
|
|
@@ -7803,7 +8742,7 @@ function throwIfAborted(signal) {
|
|
|
7803
8742
|
throw new Error(signal.reason ? String(signal.reason) : "reference replay aborted");
|
|
7804
8743
|
}
|
|
7805
8744
|
function readJsonl(path) {
|
|
7806
|
-
const raw =
|
|
8745
|
+
const raw = readFileSync4(path, "utf8");
|
|
7807
8746
|
const out = [];
|
|
7808
8747
|
for (const line of raw.split("\n")) {
|
|
7809
8748
|
const trimmed = line.trim();
|
|
@@ -7958,202 +8897,6 @@ function createDefaultReviewer(options) {
|
|
|
7958
8897
|
};
|
|
7959
8898
|
}
|
|
7960
8899
|
|
|
7961
|
-
// src/semantic-concept-judge.ts
|
|
7962
|
-
// Default per-concept weights, keyed by a concept's `complexity`; used when
// scoring with weightConcepts: "complexity".
var DEFAULT_COMPLEXITY_WEIGHTS = {
  render: 1,
  integrate: 2,
  compute: 2.5
};

// Version tag stamped on every semantic-concept judge result.
var SEMANTIC_CONCEPT_JUDGE_VERSION = "semantic-concept-judge-v1-2026-04-24";

// Fallbacks for options the caller leaves unset: prompt size caps in
// characters, the request timeout in milliseconds, and the judge model.
var DEFAULT_MAX_SOURCE2 = 45_000;
var DEFAULT_MAX_HTML2 = 30_000;
var DEFAULT_MAX_PER_FILE2 = 20_000;
var DEFAULT_TIMEOUT2 = 180_000;
var DEFAULT_MODEL2 = "claude-sonnet-4-6";

// JSON schema the judge's reply must follow: an overall summary plus one
// graded finding per concept.
var SEMANTIC_SCHEMA = {
  type: "object",
  additionalProperties: false,
  required: ["summary", "concepts"],
  properties: {
    summary: { type: "string", minLength: 20, maxLength: 600 },
    concepts: {
      type: "array",
      minItems: 1,
      items: {
        type: "object",
        additionalProperties: false,
        required: ["concept", "present", "score", "evidence", "severity"],
        properties: {
          concept: { type: "string", minLength: 1, maxLength: 120 },
          present: { type: "boolean" },
          score: { type: "number", minimum: 0, maximum: 10 },
          evidence: { type: "string", minLength: 5, maxLength: 400 },
          severity: { type: "string", enum: ["critical", "major", "minor", "info"] }
        }
      }
    }
  }
};
|
|
7997
|
-
/**
 * Cap a text at `cap` characters, appending a notice with how much was cut.
 *
 * @param {string} body - text to cap.
 * @param {number} cap - maximum number of characters to keep.
 * @param {string} label - name of the content, used in the notice.
 * @returns {string} `body` unchanged when it fits; otherwise its first `cap`
 *   characters followed by a newline and a truncation notice.
 */
function truncate2(body, cap, label) {
  if (body.length <= cap) {
    return body;
  }
  const kept = body.slice(0, cap);
  const notice = `\u2026 [truncated ${body.length - cap} chars of ${label}]`;
  return `${kept}\n${notice}`;
}
|
|
8002
|
-
/**
 * Assemble the user prompt for the semantic-concept judge.
 *
 * Source files longer than `opts.maxPerFileChars` are left out entirely; the
 * remaining files are concatenated and capped at `opts.maxSourceChars`, and
 * the served HTML (if any) is capped at `opts.maxHtmlChars`.
 *
 * @param {object} input - reads `userRequest`, `sourceFiles` (`path`,
 *   `content`), `expectedConcepts` (`name`, optional `keywords`) and the
 *   optional `servedHtml`, `artifactLabel`, `artifactDescription`.
 * @param {object} opts - reads `maxPerFileChars`, `maxSourceChars`,
 *   `maxHtmlChars`.
 * @returns {string} The complete prompt text.
 */
function buildPrompt2(input, opts) {
  const includedFiles = input.sourceFiles.filter((f) => f.content.length <= opts.maxPerFileChars);
  const sourceBlob = includedFiles.map((f) => `--- FILE: ${f.path} ---\n${f.content}`).join("\n\n");
  const html = input.servedHtml ?? "";

  // Optional sections render as "" when their data is missing.
  const metadataSection = input.artifactLabel ? `ARTIFACT METADATA:
name: ${input.artifactLabel}
description: ${input.artifactDescription ?? ""}

` : "";

  // One numbered line per concept, with at most six keyword hints.
  const conceptList = input.expectedConcepts.map(
    (c, i) => ` ${i + 1}. "${c.name}"${c.keywords?.length ? ` \u2014 hints: [${c.keywords.slice(0, 6).join(" | ")}]` : ""}`
  ).join("\n");

  const htmlSection = html ? `SERVED HTML (what the preview returns when hit):
${truncate2(html, opts.maxHtmlChars, "HTML")}

` : "";

  const sourceSection = truncate2(sourceBlob, opts.maxSourceChars, "source");

  return `You are a strict code-review judge evaluating whether an agent's 0-to-1 build actually implements the features the user asked for.

You MUST distinguish:
(a) WORKING code that implements the concept (rendered UI, wired handler, real API call),
(b) KEYWORD-PRESENT stub (comments mentioning the concept, variable names, TODOs),
(c) ABSENT (concept nowhere).

A comment like "// TODO: add mint button" is NOT present \u2014 score 2-3. Only count a concept as present if there is real functional code: a rendered component, a call handler wired to state or a network call, a computed value actually used.

USER REQUEST (what the agent was asked to build):
${input.userRequest}

${metadataSection}EXPECTED CONCEPTS (each must be graded independently):
${conceptList}

${htmlSection}SOURCE FILES (the agent's workdir):
${sourceSection}

For EACH concept, return:
- concept: the concept name as given (match exactly)
- present: boolean \u2014 does a working implementation exist?
- score: 0-10 \u2014 10 = production-ready; 7 = functional but thin; 4 = partial/stubbed; 2 = keyword-only comment; 0 = absent
- evidence: cite "<file>:<line>" or "served-html:<selector>" pointing at the strongest supporting code. If the concept is absent or stubbed, explain what's missing.
- severity:
"info" when present: true AND score >= 7
"minor" when present: true AND 4 <= score < 7
"major" when present: false OR score < 4
"critical" when the concept is not only absent but a core user flow depends on it

Also produce a "summary" (one sentence, 20-600 chars): overall verdict on whether this is a shippable implementation of the user request vs a keyword-dense placeholder.

BE SKEPTICAL. Keyword matching already passed \u2014 your job is to catch what keyword matching misses. If the agent shipped a working build, say so. If it shipped a stub, say so. Don't grade on effort.

Return STRICT JSON. No prose outside the JSON.`;
}
|
|
8050
|
-
/**
 * Ask an LLM judge whether each expected concept is actually implemented in
 * the agent's build, and fold the per-concept grades into a single score.
 *
 * Never throws for judge failures: any error raised while calling or parsing
 * the judge is reported as an `available: false` result with `error` set.
 *
 * @param {object} input - judge input; reads `expectedConcepts` (each with
 *   `name` and optional `weight` / `complexity`) here and is forwarded to
 *   `buildPrompt2`.
 * @param {object} [options]
 * @param {string} [options.model] - judge model.
 * @param {number} [options.timeoutMs] - request timeout.
 * @param {number} [options.maxSourceChars] - cap on concatenated source text.
 * @param {number} [options.maxPerFileChars] - files longer than this are skipped.
 * @param {number} [options.maxHtmlChars] - cap on served HTML.
 * @param {object} [options.llm] - passed through to `callLlmJson`.
 * @param {string} [options.weightConcepts="mean"] - "mean" weighs every
 *   concept equally; otherwise a concept's explicit `weight` is used, and
 *   with "complexity" concepts without one fall back to `complexityWeights`.
 * @param {object} [options.complexityWeights] - overrides merged over the
 *   default complexity weights.
 * @returns {Promise<object>} Result with `score` in [0, 1] (weighted mean of
 *   the 0-10 grades divided by 10, rounded to 3 decimals), `presentCount`
 *   (concepts present with a grade of at least 7), `totalCount`, `findings`,
 *   `summary`, `durationMs`, `costUsd` and `available`.
 */
async function runSemanticConceptJudge(input, options = {}) {
  const start = Date.now();
  const totalCount = input.expectedConcepts.length;
  if (totalCount === 0) {
    return {
      kind: "semantic-concept",
      version: SEMANTIC_CONCEPT_JUDGE_VERSION,
      score: 0,
      presentCount: 0,
      totalCount: 0,
      findings: [],
      summary: "no expected concepts declared",
      durationMs: 0,
      costUsd: null,
      available: false,
      error: "no expected concepts declared"
    };
  }
  const opts = {
    model: options.model ?? DEFAULT_MODEL2,
    timeoutMs: options.timeoutMs ?? DEFAULT_TIMEOUT2,
    maxSourceChars: options.maxSourceChars ?? DEFAULT_MAX_SOURCE2,
    maxPerFileChars: options.maxPerFileChars ?? DEFAULT_MAX_PER_FILE2,
    maxHtmlChars: options.maxHtmlChars ?? DEFAULT_MAX_HTML2,
    llm: options.llm ?? {},
    weightConcepts: options.weightConcepts ?? "mean",
    complexityWeights: { ...DEFAULT_COMPLEXITY_WEIGHTS, ...options.complexityWeights ?? {} }
  };
  const weightForConcept = (spec) => {
    if (opts.weightConcepts === "mean") return 1;
    // An explicit per-concept weight beats the complexity table.
    if (spec.weight != null) return spec.weight;
    if (opts.weightConcepts === "complexity") {
      return opts.complexityWeights[spec.complexity ?? "render"] ?? 1;
    }
    return 1;
  };
  const weightByName = new Map(
    input.expectedConcepts.map((c) => [c.name, weightForConcept(c)])
  );
  // Clamp a grade into 0-10. A grade that does not parse as a number counts
  // as 0: left as NaN it would survive the clamp and turn the aggregate
  // score into NaN.
  const clampScore = (raw) => {
    const n = Number(raw ?? 0);
    return Number.isNaN(n) ? 0 : Math.max(0, Math.min(10, n));
  };
  try {
    const { value, result } = await callLlmJson(
      {
        model: opts.model,
        messages: [
          {
            role: "system",
            content: "You are a strict code-review judge. Return strict JSON only. No prose outside the JSON. A keyword in a comment is NOT a working implementation."
          },
          { role: "user", content: buildPrompt2(input, opts) }
        ],
        jsonSchema: { name: "semantic_concept_judge", schema: SEMANTIC_SCHEMA },
        temperature: 0,
        timeoutMs: opts.timeoutMs
      },
      opts.llm
    );
    if (!value?.concepts || !Array.isArray(value.concepts)) {
      throw new Error('judge returned malformed response \u2014 expected array under "concepts"');
    }
    // Normalise every field; the reply is model output and may not follow
    // the schema.
    const findings = value.concepts.map((c) => ({
      concept: String(c.concept),
      present: Boolean(c.present),
      score: clampScore(c.score),
      evidence: String(c.evidence ?? ""),
      severity: ["critical", "major", "minor", "info"].includes(c.severity) ? c.severity : "info"
    }));
    const presentCount = findings.filter((f) => f.present && f.score >= 7).length;
    let weightSum = 0;
    let weightedScoreSum = 0;
    for (const f of findings) {
      // Findings whose name matches no expected concept get weight 1.
      const w = weightByName.get(f.concept) ?? 1;
      weightSum += w;
      weightedScoreSum += w * f.score;
    }
    // Fall back to the plain mean when the weights do not sum to a positive value.
    const scoreAvg = weightSum > 0 ? weightedScoreSum / weightSum : findings.reduce((a, f) => a + f.score, 0) / Math.max(1, findings.length);
    return {
      kind: "semantic-concept",
      version: SEMANTIC_CONCEPT_JUDGE_VERSION,
      score: Number((scoreAvg / 10).toFixed(3)),
      presentCount,
      totalCount,
      findings,
      summary: String(value.summary ?? ""),
      durationMs: Date.now() - start,
      costUsd: result.costUsd ?? null,
      available: true
    };
  } catch (err) {
    return {
      kind: "semantic-concept",
      version: SEMANTIC_CONCEPT_JUDGE_VERSION,
      score: 0,
      presentCount: 0,
      totalCount,
      findings: [],
      summary: "",
      durationMs: Date.now() - start,
      costUsd: null,
      available: false,
      error: err instanceof Error ? err.message : String(err)
    };
  }
}
|
|
8153
|
-
/**
 * Bind a set of options to the semantic-concept judge.
 *
 * @param {object} [options] - options applied to every call.
 * @returns {(input: object) => Promise<object>} Function that runs
 *   `runSemanticConceptJudge` on its input with the bound options.
 */
function createSemanticConceptJudge(options = {}) {
  const judge = (input) => {
    return runSemanticConceptJudge(input, options);
  };
  return judge;
}
|
|
8156
|
-
|
|
8157
8900
|
// src/canary.ts
|
|
8158
8901
|
function runCanaries(runs, opts = {}) {
|
|
8159
8902
|
const alerts = [
|
|
@@ -8352,8 +9095,8 @@ function chiSquareCritical(df, alpha) {
|
|
|
8352
9095
|
if (TABLE[df]) return TABLE[df][idx];
|
|
8353
9096
|
if (df > 30) {
|
|
8354
9097
|
const zMap = { 0: 1.282, 1: 1.645, 2: 1.96, 3: 2.326 };
|
|
8355
|
-
const
|
|
8356
|
-
const term = 1 - 2 / (9 * df) +
|
|
9098
|
+
const z2 = zMap[idx] ?? 1.96;
|
|
9099
|
+
const term = 1 - 2 / (9 * df) + z2 * Math.sqrt(2 / (9 * df));
|
|
8357
9100
|
return df * term ** 3;
|
|
8358
9101
|
}
|
|
8359
9102
|
const keys = Object.keys(TABLE).map((k) => Number(k)).sort((a, b) => a - b);
|
|
@@ -8567,44 +9310,8 @@ async function discoverPersonas(dir, opts = {}) {
|
|
|
8567
9310
|
}
|
|
8568
9311
|
|
|
8569
9312
|
// src/evolution-telemetry.ts
|
|
8570
|
-
import { appendFileSync as appendFileSync3, existsSync as
|
|
9313
|
+
import { appendFileSync as appendFileSync3, existsSync as existsSync6, mkdirSync as mkdirSync3, readFileSync as readFileSync5, writeFileSync } from "fs";
|
|
8571
9314
|
import { dirname as dirname3 } from "path";
|
|
8572
|
-
|
|
8573
|
-
// src/locked-jsonl-appender.ts
|
|
8574
|
-
import { appendFileSync as appendFileSync2, existsSync as existsSync4, mkdirSync as mkdirSync2 } from "fs";
|
|
8575
|
-
import { dirname as dirname2 } from "path";
|
|
8576
|
-
// One mutex per file path, shared by every appender opened on that path.
var mutexes = /* @__PURE__ */ new Map();

/**
 * Return the mutex registered for `path`, creating and registering one on
 * first use.
 *
 * @param {string} path - key identifying the file.
 * @returns {Mutex} The same instance for every call with the same key.
 */
function getMutex(path) {
  const existing = mutexes.get(path);
  if (existing) {
    return existing;
  }
  const created = new Mutex();
  mutexes.set(path, created);
  return created;
}
|
|
8585
|
-
/**
 * Appends JSON lines to a file, running each write under the mutex that
 * `getMutex` returns for that path.
 */
var LockedJsonlAppender = class {
  path;
  mutex;

  /**
   * @param {string} path - file to append to; its parent directory is
   *   created (recursively) if it does not exist yet.
   */
  constructor(path) {
    this.path = path;
    this.mutex = getMutex(path);
    const parentDir = dirname2(path);
    if (!existsSync4(parentDir)) {
      mkdirSync2(parentDir, { recursive: true });
    }
  }

  /**
   * Serialise `entry` as one JSON line and append it to the file.
   *
   * @param {*} entry - value passed to `JSON.stringify`.
   * @returns {Promise<void>} Settles once the write has run.
   */
  async append(entry) {
    // Serialise before taking the lock so only the write itself is exclusive.
    const serialized = JSON.stringify(entry);
    const line = `${serialized}\n`;
    await this.mutex.runExclusive(() => {
      appendFileSync2(this.path, line);
    });
  }
};
|
|
8603
|
-
/**
 * Test helper: empty the per-path mutex registry so later lookups create
 * fresh mutexes. Mutexes already handed out are not touched.
 *
 * @returns {void}
 */
function resetLockedAppendersForTesting() {
  mutexes.clear();
}
|
|
8606
|
-
|
|
8607
|
-
// src/evolution-telemetry.ts
|
|
8608
9315
|
var MutationTelemetry = class {
|
|
8609
9316
|
appender;
|
|
8610
9317
|
constructor(path) {
|
|
@@ -8634,16 +9341,16 @@ var LineageRecorder = class {
|
|
|
8634
9341
|
this.snapshotPath = `${path}.snapshot`;
|
|
8635
9342
|
this.kindOf = kindOf ?? defaultKindOf;
|
|
8636
9343
|
mkdirSync3(dirname3(path), { recursive: true });
|
|
8637
|
-
if (
|
|
9344
|
+
if (existsSync6(this.snapshotPath)) {
|
|
8638
9345
|
try {
|
|
8639
|
-
const parsed = JSON.parse(
|
|
9346
|
+
const parsed = JSON.parse(readFileSync5(this.snapshotPath, "utf-8"));
|
|
8640
9347
|
for (const n of parsed) this.nodes.set(n.id, n);
|
|
8641
9348
|
} catch {
|
|
8642
9349
|
}
|
|
8643
9350
|
}
|
|
8644
|
-
if (
|
|
9351
|
+
if (existsSync6(path)) {
|
|
8645
9352
|
try {
|
|
8646
|
-
for (const line of
|
|
9353
|
+
for (const line of readFileSync5(path, "utf-8").split("\n")) {
|
|
8647
9354
|
if (!line.trim()) continue;
|
|
8648
9355
|
try {
|
|
8649
9356
|
const entry = JSON.parse(line);
|
|
@@ -8655,9 +9362,9 @@ var LineageRecorder = class {
|
|
|
8655
9362
|
} catch {
|
|
8656
9363
|
}
|
|
8657
9364
|
}
|
|
8658
|
-
if (
|
|
9365
|
+
if (existsSync6(path) && this.nodes.size === 0) {
|
|
8659
9366
|
try {
|
|
8660
|
-
const raw =
|
|
9367
|
+
const raw = readFileSync5(path, "utf-8").trim();
|
|
8661
9368
|
if (raw.startsWith("[")) {
|
|
8662
9369
|
const parsed = JSON.parse(raw);
|
|
8663
9370
|
for (const n of parsed) this.nodes.set(n.id, n);
|
|
@@ -8671,8 +9378,8 @@ var LineageRecorder = class {
|
|
|
8671
9378
|
const prev = this.nodes.get(node.id);
|
|
8672
9379
|
this.nodes.set(node.id, { ...prev, ...node });
|
|
8673
9380
|
try {
|
|
8674
|
-
if (
|
|
8675
|
-
const head =
|
|
9381
|
+
if (existsSync6(this.path)) {
|
|
9382
|
+
const head = readFileSync5(this.path, { encoding: "utf-8", flag: "r" }).slice(0, 1);
|
|
8676
9383
|
if (head === "[") {
|
|
8677
9384
|
writeFileSync(this.path, "");
|
|
8678
9385
|
}
|
|
@@ -8738,9 +9445,9 @@ var CostLedger = class {
|
|
|
8738
9445
|
mutex = new Mutex();
|
|
8739
9446
|
constructor(path) {
|
|
8740
9447
|
this.path = path;
|
|
8741
|
-
if (
|
|
9448
|
+
if (existsSync6(path)) {
|
|
8742
9449
|
try {
|
|
8743
|
-
const loaded = JSON.parse(
|
|
9450
|
+
const loaded = JSON.parse(readFileSync5(path, "utf-8"));
|
|
8744
9451
|
for (const k of Object.keys(this.totals)) {
|
|
8745
9452
|
if (k === "byGeneration") {
|
|
8746
9453
|
if (loaded.byGeneration && typeof loaded.byGeneration === "object") {
|
|
@@ -8909,7 +9616,7 @@ function precision(goldens, candidates, options = {}) {
|
|
|
8909
9616
|
}
|
|
8910
9617
|
|
|
8911
9618
|
// src/jsonl-trial-cache.ts
|
|
8912
|
-
import { appendFileSync as appendFileSync4, existsSync as
|
|
9619
|
+
import { appendFileSync as appendFileSync4, existsSync as existsSync7, mkdirSync as mkdirSync4, readFileSync as readFileSync6 } from "fs";
|
|
8913
9620
|
import { dirname as dirname4 } from "path";
|
|
8914
9621
|
var JsonlTrialCache = class {
|
|
8915
9622
|
map = /* @__PURE__ */ new Map();
|
|
@@ -8917,8 +9624,8 @@ var JsonlTrialCache = class {
|
|
|
8917
9624
|
appender;
|
|
8918
9625
|
constructor(path) {
|
|
8919
9626
|
this.path = path;
|
|
8920
|
-
if (
|
|
8921
|
-
for (const line of
|
|
9627
|
+
if (existsSync7(path)) {
|
|
9628
|
+
for (const line of readFileSync6(path, "utf-8").split("\n")) {
|
|
8922
9629
|
if (!line.trim()) continue;
|
|
8923
9630
|
try {
|
|
8924
9631
|
const entry = JSON.parse(line);
|
|
@@ -9306,8 +10013,10 @@ function aggregateTrialsByMode(trials, opts) {
|
|
|
9306
10013
|
};
|
|
9307
10014
|
}
|
|
9308
10015
|
export {
|
|
10016
|
+
ANALYST_SEVERITIES,
|
|
9309
10017
|
AgentDriver,
|
|
9310
10018
|
AgentEvalError,
|
|
10019
|
+
AnalystRegistry,
|
|
9311
10020
|
AxGepaSteeringOptimizer,
|
|
9312
10021
|
BENCHMARK_SPLIT_SEED,
|
|
9313
10022
|
BenchmarkRunner,
|
|
@@ -9331,19 +10040,23 @@ export {
|
|
|
9331
10040
|
DEFAULT_RED_TEAM_CORPUS,
|
|
9332
10041
|
DEFAULT_RUN_SCORE_WEIGHTS,
|
|
9333
10042
|
DEFAULT_SEVERITY_WEIGHTS,
|
|
10043
|
+
DEFAULT_TRACE_ANALYST_KINDS,
|
|
9334
10044
|
Dataset,
|
|
9335
10045
|
DockerSandboxDriver,
|
|
9336
10046
|
DualAgentBench,
|
|
9337
10047
|
ERROR_COUNT_PATTERNS,
|
|
9338
10048
|
ExperimentTracker,
|
|
9339
10049
|
FAILURE_CLASSES,
|
|
10050
|
+
FAILURE_MODE_KIND_SPEC,
|
|
9340
10051
|
FileSystemExperimentStore,
|
|
9341
10052
|
FileSystemFeedbackTrajectoryStore,
|
|
9342
10053
|
FileSystemRawProviderSink,
|
|
9343
10054
|
FileSystemTraceStore,
|
|
10055
|
+
FindingsStore,
|
|
9344
10056
|
HeldOutGate,
|
|
9345
10057
|
HoldoutAuditor,
|
|
9346
10058
|
HoldoutLockedError,
|
|
10059
|
+
IMPROVEMENT_KIND_SPEC,
|
|
9347
10060
|
INTENT_MATCH_JUDGE_VERSION,
|
|
9348
10061
|
InMemoryExperimentStore,
|
|
9349
10062
|
InMemoryFeedbackTrajectoryStore,
|
|
@@ -9354,6 +10067,8 @@ export {
|
|
|
9354
10067
|
JsonlTrialCache,
|
|
9355
10068
|
JudgeError,
|
|
9356
10069
|
JudgeRunner,
|
|
10070
|
+
KNOWLEDGE_GAP_KIND_SPEC,
|
|
10071
|
+
KNOWLEDGE_POISONING_KIND_SPEC,
|
|
9357
10072
|
LineageRecorder,
|
|
9358
10073
|
LlmCallError,
|
|
9359
10074
|
LlmClient,
|
|
@@ -9371,8 +10086,10 @@ export {
|
|
|
9371
10086
|
PairwiseSteeringOptimizer,
|
|
9372
10087
|
ProductClient,
|
|
9373
10088
|
PromptRegistry,
|
|
10089
|
+
RAW_FINDING_SCHEMA_PROMPT,
|
|
9374
10090
|
REDACTION_VERSION,
|
|
9375
10091
|
RESEARCH_REPORT_HARD_PAIR_FLOOR,
|
|
10092
|
+
RawAnalystFindingSchema,
|
|
9376
10093
|
ReplayCache,
|
|
9377
10094
|
ReplayCacheMissError,
|
|
9378
10095
|
ReplayError,
|
|
@@ -9414,6 +10131,7 @@ export {
|
|
|
9414
10131
|
bootstrapCi,
|
|
9415
10132
|
buildReflectionPrompt,
|
|
9416
10133
|
buildReviewerPrompt,
|
|
10134
|
+
buildTraceToolsForGroup,
|
|
9417
10135
|
buildTrajectory,
|
|
9418
10136
|
byteLengthRange,
|
|
9419
10137
|
calibrateJudge,
|
|
@@ -9439,6 +10157,7 @@ export {
|
|
|
9439
10157
|
compilerJudge,
|
|
9440
10158
|
composeParsers,
|
|
9441
10159
|
composeValidators,
|
|
10160
|
+
computeFindingId,
|
|
9442
10161
|
computeToolUseMetrics,
|
|
9443
10162
|
confidenceInterval,
|
|
9444
10163
|
containsAll,
|
|
@@ -9446,27 +10165,38 @@ export {
|
|
|
9446
10165
|
controlFailureClassFromVerification,
|
|
9447
10166
|
controlRunToFeedbackTrajectory,
|
|
9448
10167
|
controlRunToRunRecord,
|
|
10168
|
+
corpusInterRaterAgreement,
|
|
10169
|
+
corpusInterRaterAgreementFromJudgeScores,
|
|
9449
10170
|
createAntiSlopJudge,
|
|
10171
|
+
createChatClient,
|
|
9450
10172
|
createCompositeMutator,
|
|
9451
10173
|
createCustomJudge,
|
|
9452
10174
|
createDefaultReviewer,
|
|
9453
10175
|
createDomainExpertJudge,
|
|
9454
10176
|
createFeedbackTrajectory,
|
|
9455
10177
|
createIntentMatchJudge,
|
|
10178
|
+
createJudgeAdapter,
|
|
9456
10179
|
createLlmReviewer,
|
|
9457
10180
|
createReplayFetch,
|
|
10181
|
+
createRunCriticAdapter,
|
|
9458
10182
|
createSandboxCodeMutator,
|
|
9459
10183
|
createSandboxPool,
|
|
9460
10184
|
createSemanticConceptJudge,
|
|
10185
|
+
createSemanticConceptJudgeAdapter,
|
|
10186
|
+
createTraceAnalystAdapter,
|
|
10187
|
+
createTraceAnalystKind,
|
|
10188
|
+
createVerifierAdapter,
|
|
9461
10189
|
crossTraceDiff,
|
|
9462
10190
|
crowdingDistance,
|
|
9463
10191
|
decideReferenceReplayPromotion,
|
|
9464
10192
|
decideReferenceReplayRunPromotion,
|
|
10193
|
+
defaultIsMaterial,
|
|
9465
10194
|
defaultJudges,
|
|
9466
10195
|
defaultMultiShotObjectives,
|
|
9467
10196
|
defaultProviderRedactor,
|
|
9468
10197
|
defaultReferenceReplayMatcher,
|
|
9469
10198
|
deployGateLayer,
|
|
10199
|
+
diffFindings,
|
|
9470
10200
|
discoverPersonas,
|
|
9471
10201
|
distillPlaybook,
|
|
9472
10202
|
dominates,
|
|
@@ -9535,12 +10265,14 @@ export {
|
|
|
9535
10265
|
judgeSpans,
|
|
9536
10266
|
keyPreserved,
|
|
9537
10267
|
knowledgeReadinessTracePayload,
|
|
10268
|
+
liftSeverity,
|
|
9538
10269
|
linterJudge,
|
|
9539
10270
|
llmSpanFromProvider,
|
|
9540
10271
|
llmSpans,
|
|
9541
10272
|
loadScorerFromGrader,
|
|
9542
10273
|
localCommandRunner,
|
|
9543
10274
|
lowercaseMutator,
|
|
10275
|
+
makeFinding,
|
|
9544
10276
|
mannWhitneyU,
|
|
9545
10277
|
matchGoldens,
|
|
9546
10278
|
mergeLayerResults,
|
|
@@ -9560,6 +10292,7 @@ export {
|
|
|
9560
10292
|
paretoFrontier,
|
|
9561
10293
|
paretoFrontierWithCrowding,
|
|
9562
10294
|
parseFeedbackTrajectoriesJsonl,
|
|
10295
|
+
parseRawFinding,
|
|
9563
10296
|
parseReflectionResponse,
|
|
9564
10297
|
parseRunRecordSafe,
|
|
9565
10298
|
partialCredit,
|