@tangle-network/agent-eval 0.71.0 → 0.72.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +63 -0
- package/dist/adapters/http.d.ts +1 -1
- package/dist/adapters/langchain.d.ts +1 -1
- package/dist/adapters/otel.d.ts +3 -2
- package/dist/agent-profile-DYRboYWu.d.ts +364 -0
- package/dist/analyst/index.d.ts +221 -0
- package/dist/analyst/index.js +371 -0
- package/dist/analyst/index.js.map +1 -0
- package/dist/analyst-t7zZS3TV.d.ts +88 -0
- package/dist/campaign/index.d.ts +485 -9
- package/dist/campaign/index.js +618 -30
- package/dist/campaign/index.js.map +1 -1
- package/dist/chunk-7W4SM7FD.js +1075 -0
- package/dist/chunk-7W4SM7FD.js.map +1 -0
- package/dist/{chunk-AIWHLG7J.js → chunk-GJJNJVIR.js} +11 -11
- package/dist/chunk-JHA3ZGSO.js +1496 -0
- package/dist/chunk-JHA3ZGSO.js.map +1 -0
- package/dist/{chunk-VMAYE3LM.js → chunk-JYE3WOTE.js} +57 -9
- package/dist/{chunk-VMAYE3LM.js.map → chunk-JYE3WOTE.js.map} +1 -1
- package/dist/chunk-LB2UOI5F.js +412 -0
- package/dist/chunk-LB2UOI5F.js.map +1 -0
- package/dist/{chunk-ODGETRTM.js → chunk-VUINJM5M.js} +234 -1415
- package/dist/chunk-VUINJM5M.js.map +1 -0
- package/dist/chunk-WYIHD6EB.js +1044 -0
- package/dist/chunk-WYIHD6EB.js.map +1 -0
- package/dist/{chunk-6QZUCFKM.js → chunk-XPILG2CA.js} +120 -3
- package/dist/chunk-XPILG2CA.js.map +1 -0
- package/dist/{chunk-6XQIEUQ2.js → chunk-ZPSKPT3V.js} +5 -3
- package/dist/{chunk-6XQIEUQ2.js.map → chunk-ZPSKPT3V.js.map} +1 -1
- package/dist/contract/index.d.ts +17 -13
- package/dist/contract/index.js +14 -8
- package/dist/contract/index.js.map +1 -1
- package/dist/{control-DxvZeV5X.d.ts → control-BgA6BYTm.d.ts} +1 -1
- package/dist/control.d.ts +2 -2
- package/dist/{feedback-trajectory-8hKC5EOb.d.ts → feedback-trajectory-B3rErRsh.d.ts} +1 -1
- package/dist/harness-optimizer-EnEnQPsr.d.ts +106 -0
- package/dist/hosted/index.d.ts +223 -2
- package/dist/index.d.ts +49 -1323
- package/dist/index.js +339 -2627
- package/dist/index.js.map +1 -1
- package/dist/{index-BGBrVS24.d.ts → insight-report-Df3lxYXM.d.ts} +1 -221
- package/dist/kind-factory-DW9XWPvM.d.ts +172 -0
- package/dist/multi-layer-verifier-DlWCXuxL.d.ts +141 -0
- package/dist/openapi.json +1 -1
- package/dist/pareto-E-pembql.d.ts +81 -0
- package/dist/{provenance-C69gLUXH.d.ts → provenance-B-TFszPW.d.ts} +131 -4
- package/dist/redact-B40YG2M_.d.ts +45 -0
- package/dist/registry-DuVYiTvw.d.ts +128 -0
- package/dist/{researcher-WJvIpX3L.d.ts → researcher-C_KJyIGg.d.ts} +1 -141
- package/dist/rl.d.ts +4 -3
- package/dist/rl.js +4 -4
- package/dist/{run-campaign-BVY3RGAZ.js → run-campaign-OVEZF24D.js} +2 -2
- package/dist/run-critic-BAIjX99r.d.ts +56 -0
- package/dist/{run-improvement-loop-Bzamo6GB.d.ts → run-improvement-loop-BqYH2vCR.d.ts} +25 -1
- package/dist/semantic-concept-judge-CV9Wlx4t.d.ts +650 -0
- package/dist/{store-jzKpMl16.d.ts → store-GmBE2pZZ.d.ts} +1 -1
- package/dist/traces.d.ts +371 -308
- package/dist/traces.js +43 -18
- package/dist/{types-CnmZ2bkP.d.ts → types-Bba0vl1V.d.ts} +1 -1
- package/dist/{registry-BGKyX6bw.d.ts → types-CRD68aH7.d.ts} +3 -128
- package/dist/wire/index.d.ts +1 -1
- package/dist/workflow/index.d.ts +494 -0
- package/dist/workflow/index.js +2177 -0
- package/dist/workflow/index.js.map +1 -0
- package/docs/design/self-improvement-roadmap.md +106 -0
- package/package.json +36 -12
- package/dist/agent-profile-DzcPHR1Z.d.ts +0 -114
- package/dist/chunk-6QZUCFKM.js.map +0 -1
- package/dist/chunk-ODGETRTM.js.map +0 -1
- package/dist/chunk-PQV2TKC3.js +0 -27
- package/dist/chunk-PQV2TKC3.js.map +0 -1
- /package/dist/{chunk-AIWHLG7J.js.map → chunk-GJJNJVIR.js.map} +0 -0
- /package/dist/{run-campaign-BVY3RGAZ.js.map → run-campaign-OVEZF24D.js.map} +0 -0
package/dist/index.js
CHANGED
|
@@ -1,6 +1,17 @@
|
|
|
1
1
|
import {
|
|
2
|
-
|
|
3
|
-
|
|
2
|
+
MODEL_PRICING,
|
|
3
|
+
MetricsCollector,
|
|
4
|
+
TokenCounter,
|
|
5
|
+
agentProfileHash,
|
|
6
|
+
createLlmCorrectnessChecker,
|
|
7
|
+
estimateCost,
|
|
8
|
+
estimateTokens,
|
|
9
|
+
extractProducedState,
|
|
10
|
+
isModelPriced,
|
|
11
|
+
parseCorrectnessResponse,
|
|
12
|
+
resolveModelPricing,
|
|
13
|
+
verifyCompletion
|
|
14
|
+
} from "./chunk-LB2UOI5F.js";
|
|
4
15
|
import {
|
|
5
16
|
HoldoutAuditor,
|
|
6
17
|
canaryLeakView,
|
|
@@ -31,12 +42,12 @@ import {
|
|
|
31
42
|
scoreRedTeamOutput,
|
|
32
43
|
surfaceContentHash,
|
|
33
44
|
toolNamesForRun
|
|
34
|
-
} from "./chunk-VMAYE3LM.js";
|
|
45
|
+
} from "./chunk-JYE3WOTE.js";
|
|
35
46
|
import {
|
|
36
47
|
BackendIntegrityError,
|
|
37
48
|
assertRealBackend,
|
|
38
49
|
summarizeBackendIntegrity
|
|
39
|
-
} from "./chunk-6XQIEUQ2.js";
|
|
50
|
+
} from "./chunk-ZPSKPT3V.js";
|
|
40
51
|
import {
|
|
41
52
|
BENCHMARK_SPLIT_SEED,
|
|
42
53
|
benchmarks_exports,
|
|
@@ -82,6 +93,40 @@ import {
|
|
|
82
93
|
scoreKnowledgeReadiness,
|
|
83
94
|
userQuestionsForKnowledgeGaps
|
|
84
95
|
} from "./chunk-3CKU6VGU.js";
|
|
96
|
+
import {
|
|
97
|
+
DEFAULT_COMPLEXITY_WEIGHTS,
|
|
98
|
+
DEFAULT_RUN_SCORE_WEIGHTS,
|
|
99
|
+
FindingsStore,
|
|
100
|
+
LockedJsonlAppender,
|
|
101
|
+
Mutex,
|
|
102
|
+
RunCritic,
|
|
103
|
+
SEMANTIC_CONCEPT_JUDGE_VERSION,
|
|
104
|
+
SKILL_USAGE_ANALYST,
|
|
105
|
+
SkillUsageAnalyst,
|
|
106
|
+
aggregateRunScore,
|
|
107
|
+
buildDefaultAnalystRegistry,
|
|
108
|
+
clamp01,
|
|
109
|
+
computeTraceMetrics,
|
|
110
|
+
createAnalystAi,
|
|
111
|
+
createChatClient,
|
|
112
|
+
createSemanticConceptJudge,
|
|
113
|
+
defaultIsMaterial,
|
|
114
|
+
diffFindings,
|
|
115
|
+
resetLockedAppendersForTesting,
|
|
116
|
+
runSemanticConceptJudge
|
|
117
|
+
} from "./chunk-7W4SM7FD.js";
|
|
118
|
+
import {
|
|
119
|
+
AnalystRegistry,
|
|
120
|
+
DEFAULT_TRACE_ANALYST_KINDS,
|
|
121
|
+
FAILURE_MODE_KIND_SPEC,
|
|
122
|
+
IMPROVEMENT_KIND_SPEC,
|
|
123
|
+
KNOWLEDGE_GAP_KIND_SPEC,
|
|
124
|
+
KNOWLEDGE_POISONING_KIND_SPEC,
|
|
125
|
+
computeFindingId,
|
|
126
|
+
createTraceAnalystKind,
|
|
127
|
+
makeFinding,
|
|
128
|
+
renderPriorFindings
|
|
129
|
+
} from "./chunk-WYIHD6EB.js";
|
|
85
130
|
import {
|
|
86
131
|
controlFailureClassFromVerification,
|
|
87
132
|
controlRunToRunRecord,
|
|
@@ -110,26 +155,19 @@ import {
|
|
|
110
155
|
} from "./chunk-B26KI423.js";
|
|
111
156
|
import {
|
|
112
157
|
runEvalCampaign
|
|
113
|
-
} from "./chunk-AIWHLG7J.js";
|
|
158
|
+
} from "./chunk-GJJNJVIR.js";
|
|
114
159
|
import {
|
|
115
|
-
|
|
116
|
-
|
|
117
|
-
|
|
118
|
-
|
|
119
|
-
|
|
120
|
-
|
|
121
|
-
|
|
122
|
-
|
|
123
|
-
|
|
124
|
-
|
|
125
|
-
|
|
126
|
-
requireAgentProfileCell,
|
|
127
|
-
roundTripRunRecord,
|
|
128
|
-
toAgentProfileJson,
|
|
129
|
-
validateAgentProfileCell,
|
|
130
|
-
validateRunRecord,
|
|
131
|
-
verifyAgentProfileCell
|
|
132
|
-
} from "./chunk-F3SRAAZO.js";
|
|
160
|
+
LlmCallError,
|
|
161
|
+
LlmClient,
|
|
162
|
+
LlmRouteAssertionError,
|
|
163
|
+
assertLlmRoute,
|
|
164
|
+
backoffMs,
|
|
165
|
+
callLlm,
|
|
166
|
+
callLlmJson,
|
|
167
|
+
isTransientLlmError,
|
|
168
|
+
probeLlm,
|
|
169
|
+
stripFencedJson
|
|
170
|
+
} from "./chunk-IHDHUN2X.js";
|
|
133
171
|
import {
|
|
134
172
|
evaluateInterimReleaseConfidence,
|
|
135
173
|
pairedEvalueSequence
|
|
@@ -165,2113 +203,139 @@ import {
|
|
|
165
203
|
selfPreference,
|
|
166
204
|
verbosityBias,
|
|
167
205
|
weightedComposite,
|
|
168
|
-
weightedMean,
|
|
169
|
-
wilcoxonSignedRank
|
|
170
|
-
} from "./chunk-ITBRCT73.js";
|
|
171
|
-
import {
|
|
172
|
-
|
|
173
|
-
|
|
174
|
-
|
|
175
|
-
|
|
176
|
-
|
|
177
|
-
|
|
178
|
-
|
|
179
|
-
|
|
180
|
-
|
|
181
|
-
|
|
182
|
-
|
|
183
|
-
|
|
184
|
-
|
|
185
|
-
|
|
186
|
-
|
|
187
|
-
|
|
188
|
-
|
|
189
|
-
|
|
190
|
-
|
|
191
|
-
|
|
192
|
-
|
|
193
|
-
|
|
194
|
-
|
|
195
|
-
|
|
196
|
-
|
|
197
|
-
|
|
198
|
-
|
|
199
|
-
|
|
200
|
-
|
|
201
|
-
|
|
202
|
-
|
|
203
|
-
|
|
204
|
-
|
|
205
|
-
|
|
206
|
-
|
|
207
|
-
|
|
208
|
-
|
|
209
|
-
|
|
210
|
-
|
|
211
|
-
|
|
212
|
-
|
|
213
|
-
} from "./chunk-
|
|
214
|
-
import {
|
|
215
|
-
|
|
216
|
-
|
|
217
|
-
|
|
218
|
-
|
|
219
|
-
|
|
220
|
-
|
|
221
|
-
|
|
222
|
-
|
|
223
|
-
|
|
224
|
-
|
|
225
|
-
|
|
226
|
-
|
|
227
|
-
|
|
228
|
-
|
|
229
|
-
|
|
230
|
-
|
|
231
|
-
|
|
232
|
-
|
|
233
|
-
|
|
234
|
-
|
|
235
|
-
|
|
236
|
-
|
|
237
|
-
|
|
238
|
-
|
|
239
|
-
|
|
240
|
-
|
|
241
|
-
|
|
242
|
-
|
|
243
|
-
|
|
244
|
-
|
|
245
|
-
|
|
246
|
-
|
|
247
|
-
|
|
248
|
-
|
|
249
|
-
|
|
250
|
-
|
|
251
|
-
|
|
252
|
-
|
|
253
|
-
|
|
254
|
-
|
|
255
|
-
|
|
256
|
-
|
|
257
|
-
|
|
258
|
-
|
|
259
|
-
|
|
260
|
-
|
|
261
|
-
|
|
262
|
-
|
|
263
|
-
|
|
264
|
-
|
|
265
|
-
|
|
266
|
-
|
|
267
|
-
|
|
268
|
-
|
|
269
|
-
|
|
270
|
-
|
|
271
|
-
|
|
272
|
-
|
|
273
|
-
|
|
274
|
-
|
|
275
|
-
|
|
276
|
-
|
|
277
|
-
|
|
278
|
-
|
|
279
|
-
|
|
280
|
-
} from "./chunk-
|
|
281
|
-
|
|
282
|
-
|
|
283
|
-
|
|
284
|
-
|
|
285
|
-
|
|
286
|
-
|
|
287
|
-
|
|
288
|
-
|
|
289
|
-
|
|
290
|
-
|
|
291
|
-
|
|
292
|
-
|
|
293
|
-
|
|
294
|
-
|
|
295
|
-
|
|
296
|
-
|
|
297
|
-
|
|
298
|
-
|
|
299
|
-
|
|
300
|
-
|
|
301
|
-
if (!Number.isFinite(value)) return 0;
|
|
302
|
-
return Math.max(0, Math.min(1, value));
|
|
303
|
-
}
|
|
304
|
-
function finiteOrZero(value) {
|
|
305
|
-
return Number.isFinite(value) ? value : 0;
|
|
306
|
-
}
|
|
307
|
-
|
|
308
|
-
// src/run-critic.ts
|
|
309
|
-
var DEFAULT_DRIFT_PATTERNS = [
|
|
310
|
-
/https?:\/\//i,
|
|
311
|
-
/\btitle:\s/i,
|
|
312
|
-
/\bsummary:\s/i,
|
|
313
|
-
/\burl:\s/i,
|
|
314
|
-
/\bnpm package usage\b/i,
|
|
315
|
-
/\bnews\b/i
|
|
316
|
-
];
|
|
317
|
-
var RunCritic = class {
|
|
318
|
-
weights;
|
|
319
|
-
driftPatterns;
|
|
320
|
-
constructor(options = {}) {
|
|
321
|
-
this.weights = options.weights;
|
|
322
|
-
this.driftPatterns = options.driftPatterns ?? DEFAULT_DRIFT_PATTERNS;
|
|
323
|
-
}
|
|
324
|
-
async score(store, runId) {
|
|
325
|
-
const run = await store.getRun(runId);
|
|
326
|
-
if (!run) throw new NotFoundError(`run ${runId} not found`);
|
|
327
|
-
const [spans, events, artifacts, budget] = await Promise.all([
|
|
328
|
-
store.spans({ runId }),
|
|
329
|
-
store.events({ runId }),
|
|
330
|
-
store.artifacts(runId),
|
|
331
|
-
store.budget(runId)
|
|
332
|
-
]);
|
|
333
|
-
return this.scoreTrace({ run, spans, events, artifacts, budget });
|
|
334
|
-
}
|
|
335
|
-
scoreTrace(trace) {
|
|
336
|
-
const notes = [];
|
|
337
|
-
const llmSpans2 = trace.spans.filter(
|
|
338
|
-
(s) => s.kind === "llm"
|
|
339
|
-
);
|
|
340
|
-
const toolSpans2 = trace.spans.filter(
|
|
341
|
-
(s) => s.kind === "tool"
|
|
342
|
-
);
|
|
343
|
-
const judgeSpans2 = trace.spans.filter(
|
|
344
|
-
(s) => s.kind === "judge"
|
|
345
|
-
);
|
|
346
|
-
const sandboxSpans = trace.spans.filter(
|
|
347
|
-
(s) => s.kind === "sandbox"
|
|
348
|
-
);
|
|
349
|
-
const finalGateSpans = judgeSpans2.filter(
|
|
350
|
-
(span) => span.dimension === "final_gate" || span.attributes?.finalGate === true
|
|
351
|
-
);
|
|
352
|
-
const success = trace.run.outcome?.pass === true ? 1 : trace.run.status === "completed" ? 0.5 : 0;
|
|
353
|
-
if (!success) notes.push("run did not complete with pass=true");
|
|
354
|
-
const judgeAverage = judgeSpans2.length ? judgeSpans2.reduce((sum3, span) => sum3 + normalizeJudgeScore(span.score), 0) / judgeSpans2.length : void 0;
|
|
355
|
-
const outcomeScore = typeof trace.run.outcome?.score === "number" ? clamp01(
|
|
356
|
-
trace.run.outcome.score > 1 ? trace.run.outcome.score / 100 : trace.run.outcome.score
|
|
357
|
-
) : void 0;
|
|
358
|
-
const goalProgress = outcomeScore ?? judgeAverage ?? success;
|
|
359
|
-
const successfulTools = toolSpans2.filter((span) => span.status !== "error").length;
|
|
360
|
-
const toolUseQuality = toolSpans2.length === 0 ? 0 : successfulTools / toolSpans2.length;
|
|
361
|
-
if (toolSpans2.length === 0) notes.push("no tool spans recorded");
|
|
362
|
-
const patchEvidence = trace.artifacts.length + toolSpans2.filter((span) => /write|edit|patch|apply/i.test(span.toolName)).length;
|
|
363
|
-
const patchQuality = patchEvidence > 0 ? clamp01(patchEvidence / 4) : 0;
|
|
364
|
-
if (!patchQuality) notes.push("no artifact or edit evidence recorded");
|
|
365
|
-
const sandboxTests = sandboxSpans.filter(
|
|
366
|
-
(span) => typeof span.testsTotal === "number" && span.testsTotal > 0
|
|
367
|
-
);
|
|
368
|
-
const testReality = sandboxTests.length ? sandboxTests.reduce(
|
|
369
|
-
(sum3, span) => sum3 + (span.testsPassed ?? 0) / Math.max(1, span.testsTotal ?? 1),
|
|
370
|
-
0
|
|
371
|
-
) / sandboxTests.length : toolSpans2.some(
|
|
372
|
-
(span) => /\btest|vitest|pytest|jest|build|tsc\b/i.test(JSON.stringify(span.args))
|
|
373
|
-
) ? 0.4 : 0;
|
|
374
|
-
if (!testReality) notes.push("no real test/build evidence recorded");
|
|
375
|
-
const blockerSpans = judgeSpans2.filter((span) => isBlockingJudge(span));
|
|
376
|
-
const finalGateBlockers = finalGateSpans.filter((span) => isBlockingJudge(span));
|
|
377
|
-
const finalGate = finalGateSpans.length ? finalGateBlockers.length ? 0 : 1 : success;
|
|
378
|
-
if (finalGateBlockers.length)
|
|
379
|
-
notes.push(`final gate blocked by ${finalGateBlockers.length} reviewer(s)`);
|
|
380
|
-
else if (!finalGateSpans.length) notes.push("no final gate judgment recorded");
|
|
381
|
-
const reviewerBlockers = judgeSpans2.length ? blockerSpans.length / judgeSpans2.length : 0;
|
|
382
|
-
if (reviewerBlockers) notes.push(`detected ${blockerSpans.length} blocking reviewer signal(s)`);
|
|
383
|
-
const positiveGroundingSignals = patchEvidence + sandboxSpans.length + llmSpans2.filter((span) => looksRepoGrounded(span.output ?? "")).length;
|
|
384
|
-
const driftSignals = llmSpans2.filter((span) => this.isDrift(span.output ?? "")).length + trace.events.filter((event) => this.isDrift(JSON.stringify(event.payload))).length;
|
|
385
|
-
const repoGroundedness = positiveGroundingSignals + driftSignals === 0 ? 0 : positiveGroundingSignals / (positiveGroundingSignals + driftSignals);
|
|
386
|
-
const driftPenalty = positiveGroundingSignals + driftSignals === 0 ? 0 : driftSignals / (positiveGroundingSignals + driftSignals);
|
|
387
|
-
if (driftSignals > 0) notes.push(`detected ${driftSignals} drift signal(s)`);
|
|
388
|
-
const costUsd = trace.budget.length ? Math.max(
|
|
389
|
-
...trace.budget.filter((entry) => entry.dimension === "usd").map((entry) => entry.consumed),
|
|
390
|
-
0
|
|
391
|
-
) : llmSpans2.reduce((sum3, span) => sum3 + (span.costUsd ?? 0), 0);
|
|
392
|
-
const wallSeconds = trace.run.endedAt && trace.run.startedAt ? Math.max(0, (trace.run.endedAt - trace.run.startedAt) / 1e3) : 0;
|
|
393
|
-
return {
|
|
394
|
-
success,
|
|
395
|
-
goalProgress,
|
|
396
|
-
repoGroundedness,
|
|
397
|
-
driftPenalty,
|
|
398
|
-
toolUseQuality,
|
|
399
|
-
patchQuality,
|
|
400
|
-
testReality,
|
|
401
|
-
finalGate,
|
|
402
|
-
reviewerBlockers,
|
|
403
|
-
costUsd,
|
|
404
|
-
wallSeconds,
|
|
405
|
-
notes
|
|
406
|
-
};
|
|
407
|
-
}
|
|
408
|
-
rank(score) {
|
|
409
|
-
return aggregateRunScore(score, this.weights);
|
|
410
|
-
}
|
|
411
|
-
isDrift(text) {
|
|
412
|
-
return this.driftPatterns.some((pattern) => pattern.test(text));
|
|
413
|
-
}
|
|
414
|
-
};
|
|
415
|
-
function normalizeJudgeScore(score) {
|
|
416
|
-
return score > 1 ? clamp01(score / 10) : clamp01(score);
|
|
417
|
-
}
|
|
418
|
-
function looksRepoGrounded(text) {
|
|
419
|
-
return /(?:src\/|tests?\/|package\.json|tsconfig|\.ts\b|\.tsx\b|git status|pnpm |npm |vitest|pytest|jest)/i.test(
|
|
420
|
-
text
|
|
421
|
-
);
|
|
422
|
-
}
|
|
423
|
-
function isBlockingJudge(span) {
|
|
424
|
-
return span.attributes?.blocking === true || span.attributes?.verdict === "BLOCKING" || positiveNumber(span.attributes?.blockingFindings) || positiveNumber(span.attributes?.highFindings) || span.score <= 2;
|
|
425
|
-
}
|
|
426
|
-
function positiveNumber(value) {
|
|
427
|
-
return typeof value === "number" && value > 0;
|
|
428
|
-
}
|
|
429
|
-
|
|
430
|
-
// src/semantic-concept-judge.ts
|
|
431
|
-
var DEFAULT_COMPLEXITY_WEIGHTS = {
|
|
432
|
-
render: 1,
|
|
433
|
-
integrate: 2,
|
|
434
|
-
compute: 2.5
|
|
435
|
-
};
|
|
436
|
-
var SEMANTIC_CONCEPT_JUDGE_VERSION = "semantic-concept-judge-v1-2026-04-24";
|
|
437
|
-
var DEFAULT_MAX_SOURCE = 45e3;
|
|
438
|
-
var DEFAULT_MAX_HTML = 3e4;
|
|
439
|
-
var DEFAULT_MAX_PER_FILE = 2e4;
|
|
440
|
-
var DEFAULT_TIMEOUT = 18e4;
|
|
441
|
-
var DEFAULT_MODEL = "claude-sonnet-4-6";
|
|
442
|
-
var SEMANTIC_SCHEMA = {
|
|
443
|
-
type: "object",
|
|
444
|
-
additionalProperties: false,
|
|
445
|
-
required: ["summary", "concepts"],
|
|
446
|
-
properties: {
|
|
447
|
-
summary: { type: "string", minLength: 20, maxLength: 600 },
|
|
448
|
-
concepts: {
|
|
449
|
-
type: "array",
|
|
450
|
-
minItems: 1,
|
|
451
|
-
items: {
|
|
452
|
-
type: "object",
|
|
453
|
-
additionalProperties: false,
|
|
454
|
-
required: ["concept", "present", "score", "evidence", "severity"],
|
|
455
|
-
properties: {
|
|
456
|
-
concept: { type: "string", minLength: 1, maxLength: 120 },
|
|
457
|
-
present: { type: "boolean" },
|
|
458
|
-
score: { type: "number", minimum: 0, maximum: 10 },
|
|
459
|
-
evidence: { type: "string", minLength: 5, maxLength: 400 },
|
|
460
|
-
severity: { type: "string", enum: ["critical", "major", "minor", "info"] }
|
|
461
|
-
}
|
|
462
|
-
}
|
|
463
|
-
}
|
|
464
|
-
}
|
|
465
|
-
};
|
|
466
|
-
function truncate(body, cap, label) {
|
|
467
|
-
if (body.length <= cap) return body;
|
|
468
|
-
return `${body.slice(0, cap)}
|
|
469
|
-
\u2026 [truncated ${body.length - cap} chars of ${label}]`;
|
|
470
|
-
}
|
|
471
|
-
function buildPrompt(input, opts) {
|
|
472
|
-
const sourceBlob = input.sourceFiles.filter((f) => f.content.length <= opts.maxPerFileChars).map((f) => `--- FILE: ${f.path} ---
|
|
473
|
-
${f.content}`).join("\n\n");
|
|
474
|
-
const html = input.servedHtml ?? "";
|
|
475
|
-
return `You are a strict code-review judge evaluating whether an agent's 0-to-1 build actually implements the features the user asked for.
|
|
476
|
-
|
|
477
|
-
You MUST distinguish:
|
|
478
|
-
(a) WORKING code that implements the concept (rendered UI, wired handler, real API call),
|
|
479
|
-
(b) KEYWORD-PRESENT stub (comments mentioning the concept, variable names, TODOs),
|
|
480
|
-
(c) ABSENT (concept nowhere).
|
|
481
|
-
|
|
482
|
-
A comment like "// TODO: add mint button" is NOT present \u2014 score 2-3. Only count a concept as present if there is real functional code: a rendered component, a call handler wired to state or a network call, a computed value actually used.
|
|
483
|
-
|
|
484
|
-
USER REQUEST (what the agent was asked to build):
|
|
485
|
-
${input.userRequest}
|
|
486
|
-
|
|
487
|
-
${input.artifactLabel ? `ARTIFACT METADATA:
|
|
488
|
-
name: ${input.artifactLabel}
|
|
489
|
-
description: ${input.artifactDescription ?? ""}
|
|
490
|
-
|
|
491
|
-
` : ""}EXPECTED CONCEPTS (each must be graded independently):
|
|
492
|
-
${input.expectedConcepts.map(
|
|
493
|
-
(c, i) => ` ${i + 1}. "${c.name}"${c.keywords?.length ? ` \u2014 hints: [${c.keywords.slice(0, 6).join(" | ")}]` : ""}`
|
|
494
|
-
).join("\n")}
|
|
495
|
-
|
|
496
|
-
${html ? `SERVED HTML (what the preview returns when hit):
|
|
497
|
-
${truncate(html, opts.maxHtmlChars, "HTML")}
|
|
498
|
-
|
|
499
|
-
` : ""}SOURCE FILES (the agent's workdir):
|
|
500
|
-
${truncate(sourceBlob, opts.maxSourceChars, "source")}
|
|
501
|
-
|
|
502
|
-
For EACH concept, return:
|
|
503
|
-
- concept: the concept name as given (match exactly)
|
|
504
|
-
- present: boolean \u2014 does a working implementation exist?
|
|
505
|
-
- score: 0-10 \u2014 10 = production-ready; 7 = functional but thin; 4 = partial/stubbed; 2 = keyword-only comment; 0 = absent
|
|
506
|
-
- evidence: cite "<file>:<line>" or "served-html:<selector>" pointing at the strongest supporting code. If the concept is absent or stubbed, explain what's missing.
|
|
507
|
-
- severity:
|
|
508
|
-
"info" when present: true AND score >= 7
|
|
509
|
-
"minor" when present: true AND 4 <= score < 7
|
|
510
|
-
"major" when present: false OR score < 4
|
|
511
|
-
"critical" when the concept is not only absent but a core user flow depends on it
|
|
512
|
-
|
|
513
|
-
Also produce a "summary" (one sentence, 20-600 chars): overall verdict on whether this is a shippable implementation of the user request vs a keyword-dense placeholder.
|
|
514
|
-
|
|
515
|
-
BE SKEPTICAL. Keyword matching already passed \u2014 your job is to catch what keyword matching misses. If the agent shipped a working build, say so. If it shipped a stub, say so. Don't grade on effort.
|
|
516
|
-
|
|
517
|
-
Return STRICT JSON. No prose outside the JSON.`;
|
|
518
|
-
}
|
|
519
|
-
async function runSemanticConceptJudge(input, options = {}) {
|
|
520
|
-
const start = Date.now();
|
|
521
|
-
const totalCount = input.expectedConcepts.length;
|
|
522
|
-
if (totalCount === 0) {
|
|
523
|
-
return {
|
|
524
|
-
kind: "semantic-concept",
|
|
525
|
-
version: SEMANTIC_CONCEPT_JUDGE_VERSION,
|
|
526
|
-
score: 0,
|
|
527
|
-
presentCount: 0,
|
|
528
|
-
totalCount: 0,
|
|
529
|
-
findings: [],
|
|
530
|
-
summary: "no expected concepts declared",
|
|
531
|
-
durationMs: 0,
|
|
532
|
-
costUsd: null,
|
|
533
|
-
available: false,
|
|
534
|
-
error: "no expected concepts declared"
|
|
535
|
-
};
|
|
536
|
-
}
|
|
537
|
-
const opts = {
|
|
538
|
-
model: options.model ?? DEFAULT_MODEL,
|
|
539
|
-
timeoutMs: options.timeoutMs ?? DEFAULT_TIMEOUT,
|
|
540
|
-
maxSourceChars: options.maxSourceChars ?? DEFAULT_MAX_SOURCE,
|
|
541
|
-
maxPerFileChars: options.maxPerFileChars ?? DEFAULT_MAX_PER_FILE,
|
|
542
|
-
maxHtmlChars: options.maxHtmlChars ?? DEFAULT_MAX_HTML,
|
|
543
|
-
llm: options.llm ?? {},
|
|
544
|
-
weightConcepts: options.weightConcepts ?? "mean",
|
|
545
|
-
complexityWeights: { ...DEFAULT_COMPLEXITY_WEIGHTS, ...options.complexityWeights ?? {} }
|
|
546
|
-
};
|
|
547
|
-
const weightForConcept = (spec) => {
|
|
548
|
-
if (opts.weightConcepts === "mean") return 1;
|
|
549
|
-
if (spec.weight != null) return spec.weight;
|
|
550
|
-
if (opts.weightConcepts === "complexity") {
|
|
551
|
-
return opts.complexityWeights[spec.complexity ?? "render"] ?? 1;
|
|
552
|
-
}
|
|
553
|
-
return 1;
|
|
554
|
-
};
|
|
555
|
-
const weightByName = new Map(
|
|
556
|
-
input.expectedConcepts.map((c) => [c.name, weightForConcept(c)])
|
|
557
|
-
);
|
|
558
|
-
try {
|
|
559
|
-
const { value, result } = await callLlmJson(
|
|
560
|
-
{
|
|
561
|
-
model: opts.model,
|
|
562
|
-
messages: [
|
|
563
|
-
{
|
|
564
|
-
role: "system",
|
|
565
|
-
content: "You are a strict code-review judge. Return strict JSON only. No prose outside the JSON. A keyword in a comment is NOT a working implementation."
|
|
566
|
-
},
|
|
567
|
-
{ role: "user", content: buildPrompt(input, opts) }
|
|
568
|
-
],
|
|
569
|
-
jsonSchema: { name: "semantic_concept_judge", schema: SEMANTIC_SCHEMA },
|
|
570
|
-
temperature: 0,
|
|
571
|
-
timeoutMs: opts.timeoutMs
|
|
572
|
-
},
|
|
573
|
-
opts.llm
|
|
574
|
-
);
|
|
575
|
-
if (!value?.concepts || !Array.isArray(value.concepts)) {
|
|
576
|
-
throw new Error('judge returned malformed response \u2014 expected array under "concepts"');
|
|
577
|
-
}
|
|
578
|
-
const findings = value.concepts.map((c) => ({
|
|
579
|
-
concept: String(c.concept),
|
|
580
|
-
present: Boolean(c.present),
|
|
581
|
-
score: Math.max(0, Math.min(10, Number(c.score ?? 0))),
|
|
582
|
-
evidence: String(c.evidence ?? ""),
|
|
583
|
-
severity: ["critical", "major", "minor", "info"].includes(c.severity) ? c.severity : "info"
|
|
584
|
-
}));
|
|
585
|
-
const presentCount = findings.filter((f) => f.present && f.score >= 7).length;
|
|
586
|
-
let weightSum = 0;
|
|
587
|
-
let weightedScoreSum = 0;
|
|
588
|
-
for (const f of findings) {
|
|
589
|
-
const w = weightByName.get(f.concept) ?? 1;
|
|
590
|
-
weightSum += w;
|
|
591
|
-
weightedScoreSum += w * f.score;
|
|
592
|
-
}
|
|
593
|
-
const scoreAvg = weightSum > 0 ? weightedScoreSum / weightSum : findings.reduce((a, f) => a + f.score, 0) / Math.max(1, findings.length);
|
|
594
|
-
return {
|
|
595
|
-
kind: "semantic-concept",
|
|
596
|
-
version: SEMANTIC_CONCEPT_JUDGE_VERSION,
|
|
597
|
-
score: Number((scoreAvg / 10).toFixed(3)),
|
|
598
|
-
presentCount,
|
|
599
|
-
totalCount,
|
|
600
|
-
findings,
|
|
601
|
-
summary: String(value.summary ?? ""),
|
|
602
|
-
durationMs: Date.now() - start,
|
|
603
|
-
costUsd: result.costUsd ?? null,
|
|
604
|
-
available: true
|
|
605
|
-
};
|
|
606
|
-
} catch (err) {
|
|
607
|
-
return {
|
|
608
|
-
kind: "semantic-concept",
|
|
609
|
-
version: SEMANTIC_CONCEPT_JUDGE_VERSION,
|
|
610
|
-
score: 0,
|
|
611
|
-
presentCount: 0,
|
|
612
|
-
totalCount,
|
|
613
|
-
findings: [],
|
|
614
|
-
summary: "",
|
|
615
|
-
durationMs: Date.now() - start,
|
|
616
|
-
costUsd: null,
|
|
617
|
-
available: false,
|
|
618
|
-
error: err instanceof Error ? err.message : String(err)
|
|
619
|
-
};
|
|
620
|
-
}
|
|
621
|
-
}
|
|
622
|
-
function createSemanticConceptJudge(options = {}) {
|
|
623
|
-
return (input) => runSemanticConceptJudge(input, options);
|
|
624
|
-
}
|
|
625
|
-
|
|
626
|
-
// src/analyst/types.ts
|
|
627
|
-
import { createHash } from "crypto";
|
|
628
|
-
function computeFindingId(input) {
|
|
629
|
-
const basis = JSON.stringify({
|
|
630
|
-
a: input.analyst_id,
|
|
631
|
-
r: input.area,
|
|
632
|
-
s: input.subject ?? "",
|
|
633
|
-
c: normalizeClaim(input.id_basis ?? input.claim)
|
|
634
|
-
});
|
|
635
|
-
return `f_${createHash("sha256").update(basis).digest("hex").slice(0, 20)}`;
|
|
636
|
-
}
|
|
637
|
-
function normalizeClaim(c) {
|
|
638
|
-
return c.toLowerCase().replace(/\s+/g, " ").replace(/[.!?;:,]+$/g, "").trim();
|
|
639
|
-
}
|
|
640
|
-
function makeFinding(init) {
|
|
641
|
-
const { id_basis, produced_at, ...rest } = init;
|
|
642
|
-
return {
|
|
643
|
-
schema_version: "1.0.0",
|
|
644
|
-
finding_id: computeFindingId({
|
|
645
|
-
analyst_id: rest.analyst_id,
|
|
646
|
-
area: rest.area,
|
|
647
|
-
subject: rest.subject,
|
|
648
|
-
claim: rest.claim,
|
|
649
|
-
id_basis
|
|
650
|
-
}),
|
|
651
|
-
produced_at: produced_at ?? (/* @__PURE__ */ new Date()).toISOString(),
|
|
652
|
-
...rest
|
|
653
|
-
};
|
|
654
|
-
}
|
|
655
|
-
|
|
656
|
-
// src/analyst/adapters.ts
|
|
657
|
-
var ADAPTER_REV = "1";
|
|
658
|
-
function liftSeverity(s) {
|
|
659
|
-
switch (s) {
|
|
660
|
-
case "critical":
|
|
661
|
-
return "critical";
|
|
662
|
-
case "major":
|
|
663
|
-
return "high";
|
|
664
|
-
case "minor":
|
|
665
|
-
return "medium";
|
|
666
|
-
case "info":
|
|
667
|
-
return "info";
|
|
668
|
-
}
|
|
669
|
-
}
|
|
670
|
-
function createTraceAnalystAdapter(opts) {
|
|
671
|
-
const id = opts.id ?? "trace-analyst";
|
|
672
|
-
const area = opts.area ?? "agent-reasoning";
|
|
673
|
-
return {
|
|
674
|
-
id,
|
|
675
|
-
description: "Runs the agent-eval trace analyst over an OTLP trace store and lifts its bulleted findings.",
|
|
676
|
-
inputKind: "trace-store",
|
|
677
|
-
cost: { kind: "llm", models: opts.model ? [opts.model] : void 0 },
|
|
678
|
-
version: `trace-analyst-${ADAPTER_REV}`,
|
|
679
|
-
async analyze(store, ctx) {
|
|
680
|
-
const out = [];
|
|
681
|
-
for (const question of opts.questions) {
|
|
682
|
-
if (ctx.signal?.aborted) break;
|
|
683
|
-
const result = await analyzeTraces(
|
|
684
|
-
{ question },
|
|
685
|
-
{ source: store, ai: opts.ai, model: opts.model, ...opts.extra }
|
|
686
|
-
);
|
|
687
|
-
const subject = ctx.tags?.subject ?? question.slice(0, 60);
|
|
688
|
-
if (result.findings.length === 0) {
|
|
689
|
-
out.push(
|
|
690
|
-
makeFinding({
|
|
691
|
-
analyst_id: id,
|
|
692
|
-
area,
|
|
693
|
-
subject,
|
|
694
|
-
claim: result.answer.slice(0, 200),
|
|
695
|
-
rationale: result.answer,
|
|
696
|
-
severity: "info",
|
|
697
|
-
confidence: 0.5,
|
|
698
|
-
evidence_refs: [],
|
|
699
|
-
metadata: {
|
|
700
|
-
actor_prompt_version: result.actorPromptVersion,
|
|
701
|
-
turns: result.turnCount
|
|
702
|
-
}
|
|
703
|
-
})
|
|
704
|
-
);
|
|
705
|
-
continue;
|
|
706
|
-
}
|
|
707
|
-
result.findings.forEach((claim, i) => {
|
|
708
|
-
out.push(
|
|
709
|
-
makeFinding({
|
|
710
|
-
analyst_id: id,
|
|
711
|
-
area,
|
|
712
|
-
subject,
|
|
713
|
-
claim,
|
|
714
|
-
rationale: i === 0 ? result.answer : void 0,
|
|
715
|
-
severity: "medium",
|
|
716
|
-
confidence: 0.6,
|
|
717
|
-
evidence_refs: [],
|
|
718
|
-
metadata: { question, turns: result.turnCount, finding_index: i }
|
|
719
|
-
})
|
|
720
|
-
);
|
|
721
|
-
});
|
|
722
|
-
}
|
|
723
|
-
return out;
|
|
724
|
-
}
|
|
725
|
-
};
|
|
726
|
-
}
|
|
727
|
-
function createVerifierAdapter(opts) {
|
|
728
|
-
const id = opts.id ?? "multi-layer-verifier";
|
|
729
|
-
const area = opts.area ?? "verification";
|
|
730
|
-
return {
|
|
731
|
-
id,
|
|
732
|
-
description: "Runs a MultiLayerVerifier and lifts each layer's findings into the analyst envelope.",
|
|
733
|
-
inputKind: "custom",
|
|
734
|
-
cost: { kind: "deterministic" },
|
|
735
|
-
version: `verifier-${ADAPTER_REV}`,
|
|
736
|
-
async analyze(env, ctx) {
|
|
737
|
-
const report = await opts.verifier.run({ env, ...opts.options });
|
|
738
|
-
const out = [];
|
|
739
|
-
for (const layer of report.layers) {
|
|
740
|
-
for (const finding2 of layer.findings) {
|
|
741
|
-
out.push(liftLayerFinding(id, area, layer.layer, finding2));
|
|
742
|
-
}
|
|
743
|
-
if (layer.status === "fail" || layer.status === "error" || layer.status === "timeout") {
|
|
744
|
-
out.push(
|
|
745
|
-
makeFinding({
|
|
746
|
-
analyst_id: id,
|
|
747
|
-
area,
|
|
748
|
-
subject: layer.layer,
|
|
749
|
-
claim: `layer "${layer.layer}" ${layer.status}: ${layer.reason ?? "no reason given"}`,
|
|
750
|
-
severity: layer.status === "error" ? "high" : layer.status === "timeout" ? "medium" : "high",
|
|
751
|
-
confidence: 1,
|
|
752
|
-
evidence_refs: [],
|
|
753
|
-
metadata: {
|
|
754
|
-
layer_status: layer.status,
|
|
755
|
-
duration_ms: layer.durationMs,
|
|
756
|
-
score: layer.score,
|
|
757
|
-
diagnostics: layer.diagnostics
|
|
758
|
-
}
|
|
759
|
-
})
|
|
760
|
-
);
|
|
761
|
-
}
|
|
762
|
-
}
|
|
763
|
-
ctx.log?.("verifier complete", {
|
|
764
|
-
layers: report.layers.length,
|
|
765
|
-
blended: report.blendedScore,
|
|
766
|
-
all_pass: report.allPass
|
|
767
|
-
});
|
|
768
|
-
return out;
|
|
769
|
-
}
|
|
770
|
-
};
|
|
771
|
-
}
|
|
772
|
-
function liftLayerFinding(analyst_id, area, layer, f) {
|
|
773
|
-
return makeFinding({
|
|
774
|
-
analyst_id,
|
|
775
|
-
area,
|
|
776
|
-
subject: f.layer ?? layer,
|
|
777
|
-
claim: f.message,
|
|
778
|
-
severity: liftSeverity(f.severity),
|
|
779
|
-
confidence: 0.85,
|
|
780
|
-
evidence_refs: f.evidence ? [{ kind: "artifact", uri: "inline:evidence", excerpt: f.evidence }] : [],
|
|
781
|
-
metadata: f.detail
|
|
782
|
-
});
|
|
783
|
-
}
|
|
784
|
-
/**
 * Build an analyst that scores one run with a critic and reports every
 * numeric dimension that falls below `threshold`, plus a drift finding.
 *
 * @param opts - optional overrides: `id`, `area`, `critic` (must expose
 *   `scoreTrace(trace)`), and `threshold` (defaults to 0.5).
 * @returns an analyst object with `id`, `description`, `inputKind`, `cost`,
 *   `version` and an async `analyze(trace)` method.
 */
function createRunCriticAdapter(opts = {}) {
  const analystId = opts.id ?? "run-critic";
  const area = opts.area ?? "run-quality";
  const critic = opts.critic ?? new RunCritic();
  const threshold = opts.threshold ?? 0.5;
  return {
    id: analystId,
    description: "Scores a single run across success / grounding / drift / tool-quality and surfaces below-threshold dimensions.",
    inputKind: "custom",
    cost: { kind: "deterministic" },
    version: `run-critic-${ADAPTER_REV}`,
    async analyze(trace) {
      const score = critic.scoreTrace(trace);
      // One rule per score dimension: which field to read, how severe a low
      // value is, and the claim text to emit.
      const rules = [
        { dimension: "success", severity: "critical", claim: "run did not complete successfully" },
        { dimension: "goalProgress", severity: "high", claim: "goal progress is low" },
        { dimension: "repoGroundedness", severity: "high", claim: "output is poorly grounded in the repository" },
        { dimension: "toolUseQuality", severity: "medium", claim: "tool use quality is low" },
        { dimension: "patchQuality", severity: "medium", claim: "no real patch/edit evidence" },
        { dimension: "testReality", severity: "high", claim: "no real test/build evidence" },
        { dimension: "finalGate", severity: "critical", claim: "final gate is blocking" }
      ];
      // Non-numeric dimension values are skipped entirely.
      const isLow = ({ dimension }) => typeof score[dimension] === "number" && score[dimension] < threshold;
      const findings = rules.filter(isLow).map(({ dimension, severity, claim }) => {
        const value = score[dimension];
        return makeFinding({
          analyst_id: analystId,
          area,
          subject: dimension,
          claim,
          rationale: `${dimension}=${value.toFixed(2)} below threshold ${threshold}`,
          severity,
          confidence: 1,
          evidence_refs: [],
          metadata: { dimension, value, threshold, run_id: trace.run.runId }
        });
      });
      // Drift is a penalty (higher is worse), so it is compared against the
      // complement of the threshold.
      if (score.driftPenalty > 1 - threshold) {
        findings.push(
          makeFinding({
            analyst_id: analystId,
            area,
            subject: "drift",
            claim: "agent output drifted from repository signal",
            rationale: `driftPenalty=${score.driftPenalty.toFixed(2)}`,
            severity: "medium",
            confidence: 0.9,
            evidence_refs: [],
            metadata: { drift_penalty: score.driftPenalty, notes: score.notes }
          })
        );
      }
      return findings;
    }
  };
}
|
|
844
|
-
/**
 * Wrap a judge function as an analyst: every score whose 0-10 normalized
 * value is below `threshold` becomes a finding.
 *
 * @param opts - requires `judge(tcloud, input)` returning (a promise of) an
 *   array of scores; optional `tcloud`, `id`, `area`, `threshold` (default 6)
 *   and `cost`.
 * @returns an analyst object with an async `analyze(input)` method.
 */
function createJudgeAdapter(opts) {
  const analystId = opts.id ?? "judge";
  const area = opts.area ?? "judge";
  const threshold = opts.threshold ?? 6;
  const analyze = async (input) => {
    const scores = await opts.judge(opts.tcloud, input);
    const belowThreshold = scores.filter((entry) => normalize10(entry.score) < threshold);
    return belowThreshold.map((entry) => liftJudgeScore(analystId, area, entry));
  };
  return {
    id: analystId,
    description: "Wraps an agent-eval JudgeFn into an analyst; below-threshold dimensions surface as findings.",
    inputKind: "judge-input",
    cost: opts.cost ?? { kind: "llm" },
    version: `judge-${ADAPTER_REV}`,
    analyze
  };
}
|
|
860
|
-
/**
 * Put a score on a 0-10 scale: values at or below 1 are multiplied by 10,
 * anything larger is returned untouched.
 *
 * NOTE(review): a value of exactly 1 is scaled to 10, so a genuine 1/10 score
 * cannot be represented — confirm callers never pass 0-10 scores that low.
 */
function normalize10(s) {
  if (s <= 1) {
    return s * 10;
  }
  return s;
}
|
|
863
|
-
/**
 * Turn one judge score into an analyst finding. Severity is derived from the
 * 0-10 normalized score: <3 critical, <5 high, <7 medium, otherwise low.
 *
 * @param analyst_id - id of the analyst the finding is attributed to.
 * @param area - area label copied onto the finding.
 * @param s - judge score; `score`, `judgeName`, `dimension`, `reasoning` and
 *   `evidence` are read.
 * @returns the value produced by `makeFinding`.
 */
function liftJudgeScore(analyst_id, area, s) {
  const score10 = normalize10(s.score);
  let severity = "low";
  if (score10 < 3) {
    severity = "critical";
  } else if (score10 < 5) {
    severity = "high";
  } else if (score10 < 7) {
    severity = "medium";
  }
  const evidenceRefs = s.evidence ? [{ kind: "artifact", uri: "inline:evidence", excerpt: s.evidence }] : [];
  return makeFinding({
    analyst_id,
    area,
    subject: s.dimension,
    claim: `${s.judgeName}/${s.dimension} scored ${score10.toFixed(1)}/10`,
    rationale: s.reasoning,
    severity,
    confidence: 0.8,
    evidence_refs: evidenceRefs,
    metadata: { judge_name: s.judgeName, dimension: s.dimension, score_10: score10 }
  });
}
|
|
878
|
-
/**
 * Build an analyst around `runSemanticConceptJudge`. Concepts that are absent,
 * or present with a score below 7, are reported as findings; when the judge
 * reports itself unavailable a single `info` finding is returned instead.
 *
 * @param opts - optional `id`, `area`, and `options` (forwarded to the judge;
 *   `options.model` is also advertised in `cost.models`).
 * @returns an analyst object with an async `analyze(input)` method.
 */
function createSemanticConceptJudgeAdapter(opts = {}) {
  const analystId = opts.id ?? "semantic-concept-judge";
  const area = opts.area ?? "concept-coverage";
  const unavailableFinding = (reason) =>
    makeFinding({
      analyst_id: analystId,
      area,
      claim: "semantic-concept judge unavailable",
      rationale: reason,
      severity: "info",
      confidence: 1,
      evidence_refs: [],
      metadata: { reason }
    });
  return {
    id: analystId,
    description: "Runs the semantic-concept judge and surfaces missing / weak concepts as findings.",
    inputKind: "custom",
    cost: { kind: "llm", models: opts.options?.model ? [opts.options.model] : void 0 },
    version: `${SEMANTIC_CONCEPT_JUDGE_VERSION}-adapter-${ADAPTER_REV}`,
    async analyze(input) {
      const result = await runSemanticConceptJudge(input, opts.options);
      if (!result.available) {
        return [unavailableFinding(result.error)];
      }
      // A concept is healthy only when it is present AND scored at least 7.
      const needsAttention = (f) => !(f.present && f.score >= 7);
      return result.findings.filter(needsAttention).map((f) =>
        makeFinding({
          analyst_id: analystId,
          area,
          subject: f.concept,
          claim: f.present ? `concept "${f.concept}" is weak (${f.score}/10)` : `concept "${f.concept}" is missing`,
          rationale: f.evidence,
          severity: liftSeverity(f.severity),
          confidence: 0.85,
          evidence_refs: [{ kind: "artifact", uri: "inline:evidence", excerpt: f.evidence }],
          metadata: {
            concept: f.concept,
            present: f.present,
            score_10: f.score,
            cost_usd: result.costUsd ?? void 0
          }
        })
      );
    }
  };
}
|
|
929
|
-
|
|
930
|
-
// src/analyst/chat-client.ts
|
|
931
|
-
/**
 * Create a chat client for the transport named in `opts.transport`.
 *
 * - "router" / "cli-bridge" / "direct-provider": an `LlmClient` wrapped by
 *   `wrapLlmClient`. "router" and "cli-bridge" fall back to a built-in
 *   `baseUrl`; "cli-bridge" authenticates with `opts.bearer` (empty string
 *   when absent), the other two with `opts.apiKey`.
 * - "sandbox-sdk": delegates to the caller-supplied `opts.chat`.
 * - "mock": delegates to the caller-supplied `opts.handler`.
 *
 * @param opts - transport options; `defaultModel` is used when a request
 *   carries no model of its own.
 * @returns an object `{ transport, defaultModel, chat(req, callOpts) }`.
 * @throws {Error} when `opts.transport` is not one of the transports above.
 *   Previously this fell off the end of the switch and returned `undefined`,
 *   which only surfaced later as "cannot read properties of undefined".
 */
function createChatClient(opts) {
  const { transport, defaultModel } = opts;
  // Shared construction for the three LlmClient-backed transports.
  const viaLlmClient = (baseUrl, apiKey) =>
    wrapLlmClient(transport, defaultModel, new LlmClient({ baseUrl, apiKey }));
  switch (transport) {
    case "router":
      return viaLlmClient(opts.baseUrl ?? "https://router.tangle.tools/v1", opts.apiKey);
    case "cli-bridge":
      return viaLlmClient(opts.baseUrl ?? "http://127.0.0.1:3344/v1", opts.bearer ?? "");
    case "direct-provider":
      return viaLlmClient(opts.baseUrl, opts.apiKey);
    case "sandbox-sdk":
      return {
        transport: "sandbox-sdk",
        defaultModel,
        chat: async (req, callOpts) => opts.chat(resolveModel(req, defaultModel), callOpts)
      };
    case "mock":
      return {
        transport: "mock",
        defaultModel,
        chat: async (req, callOpts) => opts.handler(resolveModel(req, defaultModel), callOpts)
      };
    default:
      throw new Error(`createChatClient: unknown transport ${JSON.stringify(transport) ?? String(transport)}`);
  }
}
|
|
974
|
-
/**
 * Adapt an object exposing `call(params)` to the chat-client shape.
 *
 * The request's model is resolved against `defaultModel`, the remaining
 * request fields are forwarded to `inner.call`, and — when `callOpts.signal`
 * is supplied — the returned promise rejects as soon as the signal aborts.
 *
 * Differences from a plain race against an abort promise:
 * - an already-aborted signal rejects before `inner.call` is issued, so no
 *   request is started for a caller that has already given up;
 * - the abort listener is removed once the call settles, so a long-lived
 *   signal shared across many calls does not accumulate listeners.
 *
 * NOTE(review): the signal is not forwarded to `inner.call`, so aborting only
 * stops waiting; whether the underlying request can be cancelled depends on
 * the `LlmClient` API, which is not visible here.
 *
 * @param transport - transport label echoed on the returned client.
 * @param defaultModel - model used when the request has none.
 * @param inner - object with a `call(params)` method returning a promise.
 * @returns `{ transport, defaultModel, chat(req, callOpts) }`.
 */
function wrapLlmClient(transport, defaultModel, inner) {
  return {
    transport,
    defaultModel,
    chat: async (req, callOpts) => {
      const resolved = resolveModel(req, defaultModel);
      const signal = callOpts?.signal;
      if (signal?.aborted) {
        throw toAbortError(signal);
      }
      const call = inner.call({
        model: resolved.model,
        messages: req.messages,
        jsonMode: req.jsonMode,
        jsonSchema: req.jsonSchema,
        temperature: req.temperature,
        maxTokens: req.maxTokens,
        timeoutMs: req.timeoutMs
      });
      if (!signal) return await call;
      let onAbort;
      const aborted = new Promise((_resolve, reject) => {
        onAbort = () => reject(toAbortError(signal));
        signal.addEventListener("abort", onAbort, { once: true });
      });
      try {
        return await Promise.race([call, aborted]);
      } finally {
        // Detach whether the call won, lost, or threw.
        signal.removeEventListener("abort", onAbort);
      }
    }
  };
}
|
|
994
|
-
/**
 * Represent an abort signal as a promise that never fulfils and rejects with
 * `toAbortError(signal)` once the signal is (or already was) aborted.
 *
 * @param signal - an AbortSignal-like object.
 * @returns a promise that only ever rejects.
 */
function abortAsRejection(signal) {
  const failure = () => toAbortError(signal);
  if (!signal.aborted) {
    return new Promise((_resolve, reject) => {
      const onAbort = () => reject(failure());
      signal.addEventListener("abort", onAbort, { once: true });
    });
  }
  return Promise.reject(failure());
}
|
|
1000
|
-
/**
 * Pick the error to reject with for an aborted signal: the signal's own
 * `reason` when it is an `Error`, otherwise a fresh `Error` named "AbortError".
 *
 * @param signal - an AbortSignal-like object; only `reason` is read.
 * @returns an `Error` instance.
 */
function toAbortError(signal) {
  const { reason } = signal;
  if (!(reason instanceof Error)) {
    const fallback = new Error("ChatClient.chat: aborted");
    fallback.name = "AbortError";
    return fallback;
  }
  return reason;
}
|
|
1007
|
-
/**
 * Make sure a chat request carries a model.
 *
 * @param req - chat request; returned as-is (same object) when `req.model` is truthy.
 * @param defaultModel - model to fill in when the request has none.
 * @returns `req` itself, or a shallow copy of it with `model` set to `defaultModel`.
 * @throws {Error} when neither the request nor the client provides a model.
 */
function resolveModel(req, defaultModel) {
  if (req.model) {
    return req;
  }
  if (defaultModel) {
    return { ...req, model: defaultModel };
  }
  throw new Error(
    "ChatClient.chat: no model on request and no defaultModel on the client. Either pass req.model or bind defaultModel at createChatClient()."
  );
}
|
|
1016
|
-
|
|
1017
|
-
// src/analyst/finding-signature.ts
|
|
1018
|
-
import { z as z2 } from "zod";
|
|
1019
|
-
|
|
1020
|
-
// src/analyst/finding-subject.ts
|
|
1021
|
-
import { z } from "zod";
|
|
1022
|
-
/**
 * Every `kind` value that `parseFindingSubject` can return, grouped here by
 * family. The order is the original declaration order.
 */
var FINDING_SUBJECT_KINDS = [
  // agent-knowledge:* loci
  "knowledge.wiki", "knowledge.claim", "knowledge.raw", "knowledge.stale",
  // runtime surfaces
  "system-prompt", "tool-doc", "new-tool", "rag", "memory", "scaffolding", "output-schema",
  // stale signals
  "websearch.outdated", "prior-run-summary",
  // bare kebab-case label
  "cluster"
];
|
|
1038
|
-
/**
 * Parse a finding-subject string into its structured form.
 *
 * The input is trimmed, then matched against the grammar rules in a fixed
 * order; the first rule that matches and yields a non-empty payload wins.
 * A string with no `prefix:` form is accepted as a cluster label when it is
 * lowercase kebab-case and at most 80 characters long.
 *
 * @param raw - candidate subject. Anything that is not a string (including
 *   null/undefined) yields `null` rather than throwing.
 * @returns `{ kind, ...fields }` on success, `null` when nothing matches.
 */
function parseFindingSubject(raw) {
  // Guard first: `.trim()` on a number/object would throw a TypeError.
  if (typeof raw !== "string") return null;
  const trimmed = raw.trim();
  if (trimmed.length === 0) return null;

  // Rule builder for `prefix:<free text>` subjects with one trimmed field.
  const freeForm = (kind, field) => (m) => {
    const value = m[1].trim();
    return value.length > 0 ? { kind, [field]: value } : null;
  };

  // Order matters: `tool-doc:<tool>:<aspect>` must be tried before the bare
  // `tool-doc:<tool>` form.
  const rules = [
    [
      /^agent-knowledge:wiki:([a-z0-9][a-z0-9-]*)(?:#([a-z0-9][a-z0-9-]*))?$/,
      (m) => ({ kind: "knowledge.wiki", slug: m[1], ...(m[2] ? { heading: m[2] } : {}) })
    ],
    [/^agent-knowledge:claim:(.+)$/, freeForm("knowledge.claim", "topic")],
    [/^agent-knowledge:raw:(.+)$/, freeForm("knowledge.raw", "sourceId")],
    [/^agent-knowledge:stale:([a-z0-9][a-z0-9-]*)$/, (m) => ({ kind: "knowledge.stale", slug: m[1] })],
    [/^system-prompt:(.+)$/, freeForm("system-prompt", "section")],
    [
      /^tool-doc:([a-z0-9][a-z0-9_-]*):(.+)$/,
      (m) => {
        const aspect = m[2].trim();
        return aspect.length > 0 ? { kind: "tool-doc", tool: m[1], aspect } : null;
      }
    ],
    [/^tool-doc:([a-z0-9][a-z0-9_-]*)$/, (m) => ({ kind: "tool-doc", tool: m[1] })],
    [/^new-tool:([a-z0-9][a-z0-9_-]*)$/, (m) => ({ kind: "new-tool", name: m[1] })],
    [
      /^rag:([a-z0-9][a-z0-9_-]*):(.+)$/,
      (m) => {
        const docId = m[2].trim();
        return docId.length > 0 ? { kind: "rag", corpus: m[1], docId } : null;
      }
    ],
    [/^memory:(.+)$/, freeForm("memory", "key")],
    [/^scaffolding:(.+)$/, freeForm("scaffolding", "concern")],
    [/^output-schema:(.+)$/, freeForm("output-schema", "field")],
    [/^websearch:outdated:(.+)$/, freeForm("websearch.outdated", "topic")],
    [/^prior-run-summary:(.+)$/, freeForm("prior-run-summary", "topic")]
  ];

  for (const [pattern, build] of rules) {
    const match = pattern.exec(trimmed);
    if (!match) continue;
    const subject = build(match);
    if (subject) return subject;
  }

  if (trimmed.length <= 80 && /^[a-z0-9][a-z0-9-]*$/.test(trimmed)) {
    return { kind: "cluster", label: trimmed };
  }
  return null;
}
|
|
1084
|
-
/**
 * Serialize a structured finding subject back to its string form.
 *
 * @param s - subject object; `s.kind` selects the format and the matching
 *   fields (`slug`, `topic`, `tool`, ...) are interpolated.
 * @returns the subject string, or `undefined` when `s.kind` is not a known kind.
 */
function renderFindingSubject(s) {
  const renderers = new Map([
    [
      "knowledge.wiki",
      () => {
        const page = `agent-knowledge:wiki:${s.slug}`;
        return s.heading ? `${page}#${s.heading}` : page;
      }
    ],
    ["knowledge.claim", () => `agent-knowledge:claim:${s.topic}`],
    ["knowledge.raw", () => `agent-knowledge:raw:${s.sourceId}`],
    ["knowledge.stale", () => `agent-knowledge:stale:${s.slug}`],
    ["system-prompt", () => `system-prompt:${s.section}`],
    [
      "tool-doc",
      () => {
        const tool = `tool-doc:${s.tool}`;
        return s.aspect ? `${tool}:${s.aspect}` : tool;
      }
    ],
    ["new-tool", () => `new-tool:${s.name}`],
    ["rag", () => `rag:${s.corpus}:${s.docId}`],
    ["memory", () => `memory:${s.key}`],
    ["scaffolding", () => `scaffolding:${s.concern}`],
    ["output-schema", () => `output-schema:${s.field}`],
    ["websearch.outdated", () => `websearch:outdated:${s.topic}`],
    ["prior-run-summary", () => `prior-run-summary:${s.topic}`],
    ["cluster", () => s.label]
  ]);
  const render = renderers.get(s.kind);
  return render ? render() : undefined;
}
|
|
1116
|
-
/**
 * Prompt text describing the finding-subject grammar. Written as paragraphs
 * (arrays of lines); paragraphs are separated by one blank line in the
 * rendered string.
 */
var FINDING_SUBJECT_GRAMMAR_PROMPT = [
  ["Subjects MUST match this grammar \u2014 anything else is rejected at parse time and your work is wasted:"],
  [
    " Knowledge loci (write to the agent-knowledge base):",
    " agent-knowledge:wiki:<slug>[#<heading>] create / update a wiki page",
    " agent-knowledge:claim:<topic> draft a claim / relation triple",
    " agent-knowledge:raw:<source-id> lift a raw source into a curated page",
    " agent-knowledge:stale:<slug> mark a page superseded"
  ],
  [
    " Runtime mutable surfaces (write to prompts / tools / scaffolding):",
    " system-prompt:<section> add / replace a system-prompt section",
    " tool-doc:<tool>[:<aspect>] rewrite a tool description",
    " new-tool:<name> propose a new tool surface",
    " rag:<corpus>:<doc-id> ingest / correct a RAG document",
    " memory:<key> invalidate / set a memory entry",
    " scaffolding:<concern> change a precondition / retry / verifier",
    " output-schema:<field> constrain the agent output shape"
  ],
  [
    " Stale signals (knowledge-poisoning only):",
    " websearch:outdated:<topic> stale web result",
    " prior-run-summary:<topic> stale prior-run summary"
  ],
  [
    " Cluster label (failure-mode only):",
    ' <kebab-case-label> short cluster id, e.g. "tool-call-loop"'
  ],
  ["Slugs / tool ids: [a-z0-9-]+ (lowercase kebab). Topics / keys / sections: free-form, trimmed."]
].map((paragraph) => paragraph.join("\n")).join("\n\n");
|
|
1143
|
-
/**
 * For each analyst kind id, the subject kinds its findings may use. Looked up
 * by `spec.id` when findings are filtered; kinds without an entry are not
 * subject-filtered.
 */
var KIND_EXPECTED_SUBJECTS = /* @__PURE__ */ (() => {
  // The two knowledge-* kinds share everything except `knowledge.stale`.
  const knowledgeLoci = ["knowledge.wiki", "knowledge.claim", "knowledge.raw"];
  const knowledgeSurfaces = ["tool-doc", "system-prompt", "memory", "websearch.outdated", "prior-run-summary"];
  return {
    "failure-mode": ["cluster"],
    "knowledge-gap": [...knowledgeLoci, "knowledge.stale", ...knowledgeSurfaces],
    "knowledge-poisoning": [...knowledgeLoci, ...knowledgeSurfaces],
    improvement: [
      "system-prompt",
      "tool-doc",
      "new-tool",
      "rag",
      "memory",
      "scaffolding",
      "output-schema",
      "knowledge.wiki",
      "knowledge.claim"
    ]
  };
})();
|
|
1178
|
-
/**
 * Zod schema for a subject string: valid exactly when `parseFindingSubject`
 * returns a non-null result for it.
 */
var FindingSubjectStringSchema = z.string().refine(
  function matchesSubjectGrammar(candidate) {
    return parseFindingSubject(candidate) !== null;
  },
  { message: "subject does not match the finding-subject grammar" }
);
|
|
1181
|
-
|
|
1182
|
-
// src/analyst/finding-signature.ts
|
|
1183
|
-
/** Severity labels a raw finding may carry. */
var ANALYST_SEVERITIES = ["critical", "high", "medium", "low", "info"];

/**
 * Strict zod schema for one raw finding row: unknown keys are rejected.
 *
 * `subject` is optional; when present it must be at most 400 characters and
 * parse under the finding-subject grammar, otherwise the whole row fails
 * validation.
 */
var RawAnalystFindingSchema = /* @__PURE__ */ (() => {
  const requiredText = (maxLen) => z2.string().min(1).max(maxLen);
  const optionalText = (maxLen) => z2.string().max(maxLen).optional();
  const subject = z2
    .string()
    .max(400)
    .refine((candidate) => parseFindingSubject(candidate) !== null, {
      message: "subject does not match the finding-subject grammar"
    })
    .optional();
  return z2
    .object({
      severity: z2.enum(ANALYST_SEVERITIES),
      claim: requiredText(2e3),
      subject,
      evidence_uri: requiredText(2e3),
      evidence_excerpt: optionalText(2e3),
      confidence: z2.number().min(0).max(1),
      rationale: optionalText(4e3),
      recommended_action: optionalText(2e3)
    })
    .strict();
})();
|
|
1207
|
-
/**
 * Prompt text describing the raw-finding JSON shape to the model.
 *
 * The `subject` bullet states the grammar requirement explicitly. The schema
 * rejects any row whose subject does not parse under the finding-subject
 * grammar, so describing the field as a free-form "noun phrase" invited
 * output that was then discarded wholesale.
 */
var RAW_FINDING_SCHEMA_PROMPT = `Each finding MUST be a JSON object with these fields:
- severity: one of "critical" | "high" | "medium" | "low" | "info"
- claim: one-sentence statement (max 2000 chars)
- subject?: the locus the finding is about, written in the finding-subject grammar (e.g. "system-prompt:<section>", "tool-doc:<tool>", "agent-knowledge:wiki:<slug>", or a bare kebab-case cluster label; max 400 chars) \u2014 a subject that does not match the grammar gets the whole finding rejected, so omit the field when no grammar form fits
- evidence_uri: "span://<trace_id>/<span_id>" for trace evidence, "artifact://<relative-path>" for files, "metric://<name>" for named scalars \u2014 ALWAYS cite a real id surfaced by the tools
- evidence_excerpt?: short quote (<=2000 chars) from the cited span/artifact
- confidence: number 0..1 \u2014 0.9+ when backed by exact quotes, 0.6-0.8 for inferred patterns, <0.5 for speculative
- rationale?: one or two sentences explaining the reasoning
- recommended_action?: concrete change phrased as an imperative ("Add ...", "Replace ...", "Stop ...") \u2014 omit when the finding is purely descriptive

Emit an empty array when the question has no findings to report. Do not fabricate evidence.`;
|
|
1218
|
-
/**
 * Validate one raw row against `RawAnalystFindingSchema`.
 *
 * @param row - candidate finding.
 * @param log - optional `(message, data)` logger; called with the schema
 *   issues when validation fails.
 * @returns the parsed data on success, `null` on failure.
 */
function parseRawFinding(row, log) {
  const outcome = RawAnalystFindingSchema.safeParse(row);
  if (outcome.success) {
    return outcome.data;
  }
  const describeIssue = ({ path, code, message }) => ({ path: path.join("."), code, message });
  log?.("finding rejected: schema failure", {
    issues: outcome.error.issues.map(describeIssue)
  });
  return null;
}
|
|
1232
|
-
|
|
1233
|
-
// src/analyst/findings-store.ts
|
|
1234
|
-
import { existsSync as existsSync2, readFileSync } from "fs";
|
|
1235
|
-
|
|
1236
|
-
// src/locked-jsonl-appender.ts
|
|
1237
|
-
import { appendFileSync, existsSync, mkdirSync } from "fs";
|
|
1238
|
-
import { dirname } from "path";
|
|
1239
|
-
|
|
1240
|
-
// src/concurrency.ts
|
|
1241
|
-
/**
 * Promise-based mutual-exclusion lock. Waiters are queued in `waiters` and
 * woken one at a time, in arrival order, by `release()`.
 */
var Mutex = class {
  locked = false;
  waiters = [];
  /**
   * Take the lock, waiting if it is held.
   * @returns a promise for a function that releases the lock.
   */
  async acquire() {
    const unlock = () => this.release();
    if (this.locked) {
      // Held by someone else: park until release() hands the lock over.
      return new Promise((grant) => {
        this.waiters.push(() => {
          grant(unlock);
        });
      });
    }
    this.locked = true;
    return unlock;
  }
  /**
   * Hand the lock to the oldest waiter; with no waiter the lock becomes free.
   * `locked` stays true across a hand-over.
   */
  release() {
    if (this.waiters.length === 0) {
      this.locked = false;
      return;
    }
    const wake = this.waiters.shift();
    wake();
  }
  /**
   * Run `fn` while holding the lock; the lock is released even if `fn` throws.
   * @returns the awaited result of `fn`.
   */
  async runExclusive(fn) {
    const unlock = await this.acquire();
    try {
      return await fn();
    } finally {
      unlock();
    }
  }
  /** True iff someone holds the lock right now. Diagnostics only. */
  get isLocked() {
    return this.locked;
  }
  /** Pending waiter count. Diagnostics only. */
  get pending() {
    return this.waiters.length;
  }
};
|
|
1280
|
-
|
|
1281
|
-
// src/locked-jsonl-appender.ts
|
|
1282
|
-
/** Cache of one `Mutex` per path string. */
var mutexes = /* @__PURE__ */ new Map();
/**
 * Return the mutex registered for `path`, creating it on first use.
 * Paths are compared as given (no normalization is applied here).
 */
function getMutex(path) {
  const existing = mutexes.get(path);
  if (existing) {
    return existing;
  }
  const created = new Mutex();
  mutexes.set(path, created);
  return created;
}
|
|
1291
|
-
/**
 * Appends JSON-serialized entries, one per line, to a file. Each write runs
 * inside the mutex that `getMutex` returns for the same path string.
 */
var LockedJsonlAppender = class {
  path;
  mutex;
  /**
   * @param path - target file; its parent directory is created (recursively)
   *   when it does not exist yet.
   */
  constructor(path) {
    this.path = path;
    this.mutex = getMutex(path);
    const parent = dirname(path);
    if (!existsSync(parent)) {
      mkdirSync(parent, { recursive: true });
    }
  }
  /** Serialize `entry` with `JSON.stringify` and append it as a single line. */
  async append(entry) {
    const serialized = `${JSON.stringify(entry)}\n`;
    await this.mutex.runExclusive(() => {
      appendFileSync(this.path, serialized);
    });
  }
};
|
|
1309
|
-
/**
 * Test helper: empty the per-path mutex cache so the next `getMutex` call for
 * any path creates a fresh mutex.
 */
function resetLockedAppendersForTesting() {
  mutexes.clear();
}
|
|
1312
|
-
|
|
1313
|
-
// src/analyst/findings-store.ts
|
|
1314
|
-
/**
 * JSONL-backed store of findings. Every persisted row is a finding with the
 * `run_id` it was recorded under.
 */
var FindingsStore = class {
  /** @param path - JSONL file the store reads from and appends to. */
  constructor(path) {
    this.path = path;
    this.appender = new LockedJsonlAppender(path);
  }
  path;
  appender;
  /**
   * Persist `findings` under `runId`, one row per finding, in order.
   * Each row is a shallow copy of the finding with `run_id` set.
   */
  async append(runId, findings) {
    for (const f of findings) {
      const row = { ...f, run_id: runId };
      await this.appender.append(row);
    }
  }
  /**
   * Load every persisted finding.
   *
   * Lines that are empty, fail to parse as JSON, or parse to something other
   * than a plain object (`null`, a number, a string, an array) are skipped
   * wherever they occur in the file, so callers can rely on every returned
   * row being an object. Returns `[]` when the file does not exist.
   */
  loadAll() {
    if (!existsSync2(this.path)) return [];
    const raw = readFileSync(this.path, "utf8");
    if (!raw) return [];
    const rows = [];
    for (const line of raw.split("\n")) {
      if (!line) continue;
      let parsed;
      try {
        parsed = JSON.parse(line);
      } catch {
        // Deliberate best-effort: a torn or corrupt line must not make the
        // rest of the history unreadable.
        continue;
      }
      if (parsed !== null && typeof parsed === "object" && !Array.isArray(parsed)) {
        rows.push(parsed);
      }
    }
    return rows;
  }
  /** Filter to a single run. */
  loadRun(runId) {
    return this.loadAll().filter((r) => r.run_id === runId);
  }
};
|
|
1347
|
-
/**
 * Default "did this finding change materially?" predicate: true when the
 * severity differs, the confidence moved by more than 0.05 (a missing
 * confidence counts as 0), or the number of evidence refs differs.
 * Checks run in that order and stop at the first difference.
 */
function defaultIsMaterial(a, b) {
  return (
    a.severity !== b.severity ||
    Math.abs((a.confidence ?? 0) - (b.confidence ?? 0)) > 0.05 ||
    a.evidence_refs.length !== b.evidence_refs.length
  );
}
|
|
1353
|
-
/**
 * Compare two finding lists by `finding_id`.
 *
 * @param previous - findings from the earlier run.
 * @param current - findings from the later run.
 * @param policy - optional `{ isMaterial(prev, cur) }`; defaults to `defaultIsMaterial`.
 * @returns `{ appeared, disappeared, persisted, changed }`, where `changed`
 *   holds `{ previous, current }` pairs. When a list repeats an id, the last
 *   entry with that id is the one compared.
 */
function diffFindings(previous, current, policy = {}) {
  const isMaterial = policy.isMaterial ?? defaultIsMaterial;
  const indexById = (findings) => new Map(findings.map((f) => [f.finding_id, f]));
  const before = indexById(previous);
  const after = indexById(current);
  const diff = { appeared: [], disappeared: [], persisted: [], changed: [] };
  after.forEach((cur, id) => {
    const prev = before.get(id);
    if (!prev) {
      diff.appeared.push(cur);
    } else if (isMaterial(prev, cur)) {
      diff.changed.push({ previous: prev, current: cur });
    } else {
      diff.persisted.push(cur);
    }
  });
  before.forEach((prev, id) => {
    if (!after.has(id)) {
      diff.disappeared.push(prev);
    }
  });
  return diff;
}
|
|
1378
|
-
|
|
1379
|
-
// src/analyst/kind-factory.ts
|
|
1380
|
-
import { AxJSRuntime, agent } from "@ax-llm/ax";
|
|
1381
|
-
/**
 * Build a trace-store analyst from a declarative kind `spec`.
 *
 * `analyze(store, ctx)` constructs an agent with the spec's tools and prompt,
 * forwards a question derived from `ctx`, then validates every returned row:
 * 1. the row must pass `parseRawFinding`;
 * 2. when the kind has an entry in `KIND_EXPECTED_SUBJECTS` and the row has a
 *    subject, the subject's kind must be in that list;
 * 3. the optional `spec.postProcess(finding, ctx)` hook may replace or drop it.
 *
 * postProcess contract as implemented here: `undefined` keeps the finding
 * unchanged, `null` (or any other falsy value) drops it, anything else
 * replaces it. Previously a `null` return was coalesced back to the original
 * finding by `??`, which made the subsequent drop check unreachable for the
 * value a hook would most naturally use to reject a row.
 *
 * @param spec - kind definition: `id`, `description`, `version`, `cost`,
 *   `area`, `actorDescription`, `buildTools(store)`, plus optional
 *   `recursion`, `maxTurns`, `maxRuntimeChars`, `responderDescription`,
 *   `postProcess`.
 * @param opts - `ai` (passed to `forward`), optional `model` and `versionSuffix`.
 * @returns an analyst object with an async `analyze(store, ctx)` method.
 */
function createTraceAnalystKind(spec, opts) {
  const version = opts.versionSuffix ? `${spec.version}+${opts.versionSuffix}` : spec.version;
  return {
    id: spec.id,
    description: spec.description,
    inputKind: "trace-store",
    cost: spec.cost,
    version,
    async analyze(store, ctx) {
      const tools = spec.buildTools(store);
      const maxDepth = spec.recursion?.maxDepth ?? 0;
      const maxParallel = spec.recursion?.maxParallelSubagents ?? 2;
      const priorContext = renderPriorFindings(ctx.priorFindings);
      const actorDescription = spec.actorDescription.trim() + priorContext + "\n\n" + RAW_FINDING_SCHEMA_PROMPT + "\n\nReturn the array in the `findings` output field. Use `final(...)` with the structured `{ findings }` payload when you are done.";
      const ax = agent(
        "question:string -> findings:json[]",
        {
          agentIdentity: {
            name: spec.id,
            description: spec.description
          },
          contextFields: ["question"],
          runtime: new AxJSRuntime({
            permissions: [],
            blockDynamicImport: true,
            allowedModules: [],
            freezeIntrinsics: true,
            blockShadowRealm: true,
            preventGlobalThisExtensions: false
          }),
          // Recursion is only enabled when the spec asks for a positive depth.
          mode: maxDepth > 0 ? "advanced" : "simple",
          recursionOptions: maxDepth > 0 ? { maxDepth } : void 0,
          maxTurns: spec.maxTurns ?? 12,
          maxRuntimeChars: spec.maxRuntimeChars ?? 6e3,
          maxBatchedLlmQueryConcurrency: maxParallel,
          promptLevel: "detailed",
          contextPolicy: { preset: "full", budget: "balanced" },
          functions: { local: tools },
          actorOptions: {
            description: actorDescription,
            ...opts.model ? { model: opts.model } : {},
            showThoughts: false,
            thinkingTokenBudget: "none"
          },
          responderOptions: {
            description: spec.responderDescription ?? "Format the structured `findings` array exactly as the actor produced it. Do not add, drop, or summarize entries.",
            ...opts.model ? { model: opts.model } : {},
            showThoughts: false
          },
          bubbleErrors: [TraceFileMissingError]
        }
      );
      ctx.log?.(`analyst.kind ${spec.id} forward`, {
        max_depth: maxDepth,
        tool_count: tools.length,
        tags: ctx.tags
      });
      const result = await ax.forward(opts.ai, { question: deriveQuestion(ctx, spec) });
      const expectedSubjects = KIND_EXPECTED_SUBJECTS[spec.id];
      const out = [];
      // Anything other than an array is treated as "no findings".
      const rawRows = Array.isArray(result.findings) ? result.findings : [];
      let rejectedWrongKind = 0;
      for (const row of rawRows) {
        const parsed = parseRawFinding(row, ctx.log);
        if (!parsed) continue;
        if (expectedSubjects && parsed.subject !== void 0) {
          const parsedSubject = parseFindingSubject(parsed.subject);
          if (parsedSubject === null) {
            ctx.log?.("finding rejected: subject failed to parse", {
              kind: spec.id,
              subject: parsed.subject
            });
            rejectedWrongKind += 1;
            continue;
          }
          if (!expectedSubjects.includes(parsedSubject.kind)) {
            ctx.log?.("finding rejected: subject variant not allowed for this kind", {
              kind: spec.id,
              subject_kind: parsedSubject.kind,
              subject: parsed.subject,
              allowed: expectedSubjects
            });
            rejectedWrongKind += 1;
            continue;
          }
        }
        const processed = spec.postProcess?.(parsed, ctx);
        // An explicit `null` from the hook means "drop"; only a missing hook
        // or an `undefined` return falls back to the parsed finding.
        if (processed === null) continue;
        const postProcessed = processed ?? parsed;
        if (!postProcessed) continue;
        out.push(toAnalystFinding(spec, postProcessed));
      }
      ctx.log?.(`analyst.kind ${spec.id} done`, {
        emitted: rawRows.length,
        accepted: out.length,
        rejected_wrong_subject: rejectedWrongKind
      });
      return out;
    }
  };
}
|
|
1480
|
-
/**
 * Build the question forwarded to the agent: the kind id, followed by
 * `: <focus>` when `ctx.tags.focus` is a non-blank string.
 */
function deriveQuestion(ctx, spec) {
  const focus = ctx.tags?.focus?.trim();
  return focus ? `${spec.id}: ${focus}` : spec.id;
}
|
|
1485
|
-
/**
 * Convert a validated raw finding into an analyst finding attributed to `spec`.
 * The single evidence ref gets its kind from the URI scheme; the kind's
 * version is recorded under `metadata.kind_version`.
 *
 * @param spec - kind definition; `id`, `area` and `version` are read.
 * @param raw - validated raw finding.
 * @returns the value produced by `makeFinding`.
 */
function toAnalystFinding(spec, raw) {
  const { subject, claim, rationale, severity, confidence, recommended_action } = raw;
  const evidence = {
    kind: evidenceKindFromUri(raw.evidence_uri),
    uri: raw.evidence_uri,
    excerpt: raw.evidence_excerpt
  };
  return makeFinding({
    analyst_id: spec.id,
    area: spec.area,
    subject,
    claim,
    rationale,
    severity,
    confidence,
    evidence_refs: [evidence],
    recommended_action,
    metadata: { kind_version: spec.version }
  });
}
|
|
1505
|
-
/**
 * Map an evidence URI to an evidence kind by its scheme. Recognised schemes
 * are span, artifact, metric, event and finding; anything else is treated as
 * "artifact".
 */
function evidenceKindFromUri(uri) {
  const schemes = ["span", "artifact", "metric", "event", "finding"];
  const matched = schemes.find((scheme) => uri.startsWith(`${scheme}://`));
  return matched ?? "artifact";
}
|
|
1513
|
-
/**
 * Render prior findings as a prompt section to append to an actor description.
 *
 * The section starts with a blank-line separator so it cannot run into the
 * text it is appended to. The earlier implementation listed two leading empty
 * strings for exactly that purpose but then removed them with
 * `.filter(Boolean)`, so the "PRIOR FINDINGS" header ended up glued to the
 * last character of the preceding description.
 *
 * At most 40 findings are listed; the remainder is summarised in a trailing
 * overflow line.
 *
 * NOTE(review): the instructions tell the model to pass the reused id as
 * `id_basis`, but the raw-finding schema visible in this file is strict and
 * declares no such field — confirm where `id_basis` is meant to be accepted.
 *
 * @param prior - prior findings (`finding_id`, `severity`, `subject?`, `claim`), or nullish.
 * @returns `""` when there is nothing to render, otherwise the section text.
 */
function renderPriorFindings(prior) {
  if (!prior || prior.length === 0) return "";
  const MAX_ROWS = 40;
  const rows = prior.slice(0, MAX_ROWS).map((f) => {
    const subject = f.subject ? ` [${f.subject}]` : "";
    return ` - id=${f.finding_id} ${f.severity}${subject} ${truncateForContext(f.claim, 160)}`;
  });
  const lines = [
    // Blank-line separator from whatever precedes this section.
    "",
    "",
    "PRIOR FINDINGS (from a previous run on related data):",
    "When the work you do now matches a row below, REUSE the `finding_id` (pass it as `id_basis`) so the cross-run diff stays stable.",
    "A finding that reappears with no remediation evidence SHOULD raise its `confidence` and may justify a higher `severity`.",
    ...rows
  ];
  const hidden = prior.length - MAX_ROWS;
  if (hidden > 0) {
    lines.push(`\n... +${hidden} more prior findings (older history truncated)`);
  }
  return lines.join("\n");
}
|
|
1532
|
-
/**
 * Shorten `s` to at most `max` characters. A string that already fits is
 * returned unchanged; otherwise the first `max - 1` characters are kept,
 * trailing whitespace is removed, and an ellipsis character is appended.
 */
function truncateForContext(s, max) {
  const fits = s.length <= max;
  if (fits) {
    return s;
  }
  const head = s.slice(0, max - 1).trimEnd();
  return `${head}\u2026`;
}
|
|
1536
|
-
|
|
1537
|
-
// src/analyst/tool-groups.ts
|
|
1538
|
-
// Allow-lists of trace tool names per tool group. The "all" entry is an empty
// set: buildTraceToolsForGroup returns every tool for "all" without
// consulting its allow-list.
var TOOL_NAMES_BY_GROUP = /* @__PURE__ */ (() => {
  // Discovery tools shared by the three "discovery*" groups.
  const discoveryTools = ["getDatasetOverview", "queryTraces", "countTraces"];
  return {
    all: new Set(),
    discovery: new Set(discoveryTools),
    discoveryAndRead: new Set([...discoveryTools, "viewTrace", "viewSpans"]),
    discoveryAndSearch: new Set([...discoveryTools, "searchTrace", "searchSpan"]),
    targeted: new Set(["getDatasetOverview", "queryTraces", "viewSpans", "searchSpan"])
  };
})();
|
|
1557
|
-
/**
 * Build the trace-analyst tools permitted for a named tool group.
 *
 * The group is validated before any tool is built, and only own keys of
 * TOOL_NAMES_BY_GROUP are accepted. A plain property lookup would also accept
 * inherited keys such as "constructor" or "toString" and then fail with an
 * unrelated TypeError when `.has` is called on the inherited value.
 *
 * @param {string} group - Tool group name; "all" returns every tool.
 * @param {*} store - Trace store handed to buildTraceAnalystTools.
 * @returns {Array<{name: string}>} Tools whose name is allowed for the group.
 * @throws {Error} When `group` is not a known tool group.
 */
function buildTraceToolsForGroup(group, store) {
  if (group !== "all" && !Object.hasOwn(TOOL_NAMES_BY_GROUP, group)) {
    throw new Error(`unknown trace tool group: ${group}`);
  }
  const all = buildTraceAnalystTools({ store });
  if (group === "all") return all;
  const allow = TOOL_NAMES_BY_GROUP[group];
  return all.filter((tool) => allow.has(tool.name));
}
|
|
1564
|
-
|
|
1565
|
-
// src/analyst/kinds/failure-mode.ts
|
|
1566
|
-
var ACTOR_PROMPT = `You are a failure-mode classifier for an OTLP trace dataset. Your job is to identify the **distinct ways agents failed** in this dataset, not to grade individual runs.
|
|
1567
|
-
|
|
1568
|
-
DISCOVERY \u2192 CLUSTER \u2192 CITE protocol:
|
|
1569
|
-
|
|
1570
|
-
1. Call \`traces.getDatasetOverview({})\` first. Use \`has_errors\`, \`models\`, \`agent_names\`, \`tools\`, and \`sample_trace_ids\` to size the failure surface.
|
|
1571
|
-
2. Use \`traces.queryTraces({ filters: { has_errors: true }, limit })\` to pull error-bearing traces. Combine with \`traces.countTraces\` to see what fraction of the dataset failed.
|
|
1572
|
-
3. For each candidate failure cluster, use \`traces.searchTrace\` with regex like \`STATUS_CODE_ERROR\`, \`MaxTurnsExceeded\`, \`assertion\`, \`unauthorized\`, \`timeout\`, \`429\`, \`5\\d\\d\`, the agent's specific error strings, or the names of its tools. Pull one or two representative traces per cluster, **not all** of them.
|
|
1573
|
-
4. **Cluster, do not enumerate.** Two failures with the same root cause should be ONE finding citing both traces, not two findings. The point of this analyst is to compress N runs into K modes.
|
|
1574
|
-
5. For each cluster you can defend with evidence, emit ONE finding with:
|
|
1575
|
-
- \`area\` = "failure-mode"
|
|
1576
|
-
- \`subject\` = a short label for the cluster ("tool-call-loop", "auth-revoked-mid-run", "agent-asked-clarification-too-late", ...)
|
|
1577
|
-
- \`claim\` = one sentence stating the mode
|
|
1578
|
-
- \`severity\` = "critical" when it blocks the run, "high" when the run finished degraded, "medium" when it slowed convergence
|
|
1579
|
-
- \`evidence_uri\` = \`span://<trace_id>/<span_id>\` of the most representative span
|
|
1580
|
-
- \`evidence_excerpt\` = the exact quote (e.g. error message, stuck tool call payload, contradictory turn output)
|
|
1581
|
-
- \`confidence\` = 0.85+ when multiple traces show the same shape; 0.6-0.8 for a single-trace inference; <0.5 for speculative.
|
|
1582
|
-
- \`recommended_action\` = imperative-phrased fix idea (kept short \u2014 the improvement-analyst will expand on these)
|
|
1583
|
-
|
|
1584
|
-
If the dataset has no failures, return an empty findings array \u2014 do NOT pad with low-confidence speculation.
|
|
1585
|
-
|
|
1586
|
-
**Delegate aggressively.** The recursion budget is there to be used:
|
|
1587
|
-
- After your first \`getDatasetOverview\` + \`queryTraces\` calls, you should have 3-6 candidate failure clusters in mind. Spawn one \`llmQuery\` per cluster in a single batch \u2014 they investigate in parallel.
|
|
1588
|
-
- A sub-investigator that finds its cluster is actually two distinct modes should split again at its own level. Recursion is meant to discover sub-modes, not to do trivial drilling that the parent could do in-line.
|
|
1589
|
-
- Pass narrow context to each subagent: { question: 'investigate the auth-revoked-mid-run cluster', context: { trace_ids: ['abc', 'def'], suspected_root_cause: 'token refresh skipped on idle sessions' } }. Subagents need enough context to skip re-discovery but not the whole conversation.
|
|
1590
|
-
- Each subagent returns its findings as JSON; the parent merges them. Do NOT have subagents call \`final()\` \u2014 they return their findings list to you, and you call \`final()\` once at the top.
|
|
1591
|
-
|
|
1592
|
-
OBSERVABILITY rules:
|
|
1593
|
-
- Each non-final turn must emit at least one \`console.log\` for evidence.
|
|
1594
|
-
- Reuse runtime variables across turns; don't recompute.
|
|
1595
|
-
- Call \`final({ findings: [...] })\` exactly once, after you've gathered evidence for every cluster you intend to report.`;
|
|
1596
|
-
// Kind spec for the "failure-mode" trace analyst: pairs the failure-mode actor
// prompt with the full trace tool set and its recursion / turn limits.
var FAILURE_MODE_KIND_SPEC = {
  id: "failure-mode",
  description: "Clusters trace-dataset failures into distinct failure modes with cited evidence and a short recommended action.",
  area: "failure-mode",
  version: "1.0.0",
  actorDescription: ACTOR_PROMPT,
  // Every trace tool is exposed to this kind.
  buildTools(store) {
    return buildTraceToolsForGroup("all", store);
  },
  recursion: { maxDepth: 3, maxParallelSubagents: 4 },
  maxTurns: 24,
  cost: { kind: "llm" }
};
|
|
1607
|
-
|
|
1608
|
-
// src/analyst/kinds/improvement.ts
|
|
1609
|
-
var ACTOR_PROMPT2 = `You are a recursive-self-improvement analyst. Your job is to propose **concrete, locus-named edits** the agent's runtime should adopt to fix the failure modes, knowledge gaps, and poisonings present in this dataset.
|
|
1610
|
-
|
|
1611
|
-
Upstream analysts have already classified the problems. Your job is to convert each problem into a *change to make* and grade its expected leverage. Each finding is one proposed edit.
|
|
1612
|
-
|
|
1613
|
-
DISCOVERY \u2192 CANDIDATE-FIXES \u2192 COMPETE \u2192 CITE protocol:
|
|
1614
|
-
|
|
1615
|
-
1. \`traces.getDatasetOverview({})\` first. Note the agents, tools, and any system-prompt fingerprints (look for the prompt text echoed in early spans).
|
|
1616
|
-
2. For each high-severity failure pattern, generate 2-3 candidate fixes. Real candidate axes:
|
|
1617
|
-
- **System-prompt edit** \u2014 add an instruction, remove a misleading one, restructure precedence
|
|
1618
|
-
- **Tool description edit** \u2014 rewrite a tool's description so the agent picks it correctly / passes valid args
|
|
1619
|
-
- **New tool** \u2014 add a tool the agent kept emulating in code
|
|
1620
|
-
- **RAG ingestion** \u2014 add a document or correct a stale one
|
|
1621
|
-
- **Memory invalidation** \u2014 clear cached prior-run decisions that no longer apply
|
|
1622
|
-
- **Scaffolding** \u2014 add a precondition check, a retry policy, a turn budget, a verification step
|
|
1623
|
-
- **Output schema** \u2014 narrow the agent's output to forbid the failure shape
|
|
1624
|
-
3. **Compete candidate fixes via subagents.** For each failure cluster, spawn one \`llmQuery\` per candidate-fix axis you want to evaluate. Each subagent's job: simulate the fix on the cited traces and report (i) likely effect, (ii) side effects, (iii) implementation cost as small/medium/large. Pass the cluster's failing trace_ids and the candidate axis as context.
|
|
1625
|
-
4. After subagents return, **pick the winning candidate per cluster** based on (effect / cost) and emit ONE finding. Discard the losing candidates \u2014 the output is the recommendation, not the candidate set.
|
|
1626
|
-
5. **Cross-reference upstream findings.** If a finding cites a prior failure-mode or knowledge-gap finding, use \`evidence_uri = "finding://<prior-finding-id>"\` (the registry supports this kind). This builds the dependency graph that lets the dashboard show "fix #X resolves failure modes A, B, C."
|
|
1627
|
-
|
|
1628
|
-
For each winning recommendation, emit ONE finding with:
|
|
1629
|
-
- \`area\` = "improvement"
|
|
1630
|
-
- \`subject\` = the locus to edit: \`system-prompt:<section>\`, \`tool-doc:<tool-name>\`, \`new-tool:<proposed-name>\`, \`rag:<corpus>:<doc-id>\`, \`memory:<key>\`, \`scaffolding:<concern>\`, \`output-schema:<field>\`
|
|
1631
|
-
- \`claim\` = one sentence stating the edit ("Add a precondition check to refuse tool X calls without arg Y")
|
|
1632
|
-
- \`severity\` = leverage rating: "critical" when fix resolves a critical failure mode; "high" when it resolves a high; "medium" when it's a quality-of-life win; "info" when it's a cleanup with no behavioral effect
|
|
1633
|
-
- \`evidence_uri\` = the failure-mode finding id this fix targets (\`finding://<id>\`) when it exists; else the most representative span
|
|
1634
|
-
- \`evidence_excerpt\` = a fragment showing the problem the fix targets
|
|
1635
|
-
- \`confidence\` = 0.85+ when the fix is mechanical and the failure mode is well-evidenced; 0.6-0.8 when the fix requires judgment; <0.5 for speculative
|
|
1636
|
-
- \`rationale\` = why this candidate beat its alternatives (2 sentences max)
|
|
1637
|
-
- \`recommended_action\` = the **literal edit**, phrased as a diff or a quoted replacement: "Replace section X with: '...'" or "Add tool with description: '...'" or "Set retry policy to max_attempts=3 with exponential backoff"
|
|
1638
|
-
|
|
1639
|
-
If no upstream failure findings exist in this run, derive your own from the trace dataset using the failure-mode protocol inline (\`searchTrace\` for STATUS_CODE_ERROR / MaxTurnsExceeded / etc.). But prefer to consume upstream findings when present \u2014 the kinds are designed to chain.
|
|
1640
|
-
|
|
1641
|
-
Do NOT propose a fix you cannot defend with evidence. "Tighten the prompt" is not a finding; "Add 'When the user asks for X, always Y' to the system prompt section "request-classification"" is.
|
|
1642
|
-
|
|
1643
|
-
OBSERVABILITY rules:
|
|
1644
|
-
- Each non-final turn must emit at least one \`console.log\` for evidence.
|
|
1645
|
-
- Call \`final({ findings: [...] })\` exactly once at the top level.`;
|
|
1646
|
-
// Kind spec for the "improvement" trace analyst: pairs the improvement actor
// prompt with the full trace tool set. It is the only default kind in this
// section that also sets `maxRuntimeChars`.
var IMPROVEMENT_KIND_SPEC = {
  id: "improvement",
  description: "Converts upstream failure / gap / poisoning findings into concrete locus-named edits (prompt, tool-doc, RAG, scaffolding) with leverage grades.",
  area: "improvement",
  version: "1.0.0",
  actorDescription: ACTOR_PROMPT2,
  // Every trace tool is exposed to this kind.
  buildTools(store) {
    return buildTraceToolsForGroup("all", store);
  },
  recursion: { maxDepth: 3, maxParallelSubagents: 4 },
  maxTurns: 30,
  maxRuntimeChars: 12000,
  cost: { kind: "llm" }
};
|
|
1658
|
-
|
|
1659
|
-
// src/analyst/kinds/knowledge-gap.ts
|
|
1660
|
-
var ACTOR_PROMPT3 = `You are a knowledge-gap analyst for an OTLP trace dataset. Your job is to identify the **specific pieces of information the agent lacked, or that were stale**, that caused poor decisions.
|
|
1661
|
-
|
|
1662
|
-
The agent under analysis maintains a curated knowledge base via \`@tangle-network/agent-knowledge\` \u2014 a wiki of \`KnowledgePage\`s with raw source anchors, claims, and relations. The primary expected store of agent-knowable facts IS that wiki. A "knowledge gap" is anything the agent had to discover or guess at run-time that the wiki should have held \u2014 or an outdated/contradictory fact the agent picked up from a non-wiki source.
|
|
1663
|
-
|
|
1664
|
-
DISCOVERY \u2192 ATTRIBUTE-TO-LAYER \u2192 CITE protocol:
|
|
1665
|
-
|
|
1666
|
-
1. \`traces.getDatasetOverview({})\` first. Note which agents, tools, and models appear.
|
|
1667
|
-
2. Pull traces where the agent shows gap signals. The strongest signals are:
|
|
1668
|
-
- Self-correction turns ("I assumed X but\u2026", "let me re-check", "actually,")
|
|
1669
|
-
- Clarifying-question turns where the agent asked the user something the runtime should have surfaced
|
|
1670
|
-
- Repeated retrieval / lookup calls for the same artifact with slightly varied queries
|
|
1671
|
-
- Tool errors that name a missing argument or unknown resource
|
|
1672
|
-
- Web-search calls returning pages dated before a known cutoff for content that changes (versioned APIs, schemas, policies)
|
|
1673
|
-
- Agent quoting a tool's docs / system prompt incorrectly because the actual text was insufficient
|
|
1674
|
-
- Fabricated identifiers that don't appear in dataset \`sample_trace_ids\`
|
|
1675
|
-
Use \`traces.searchTrace\` with patterns like \`I (don.?t|do not) know\`, \`assumed\`, \`unclear\`, \`could you (clarify|tell me|provide)\`, \`not found\`, \`undefined\`, \`unknown\`, \`null\`, dates older than the analysis window, or the agent's specific clarification phrases.
|
|
1676
|
-
3. For each gap, identify the **layer of the runtime that should have prevented it**. The locus is the value of \`subject\` on the finding. Use one of:
|
|
1677
|
-
- \`agent-knowledge:wiki:<page-slug>\` \u2014 the wiki page that should exist but doesn't, or exists but lacks the claim
|
|
1678
|
-
- \`agent-knowledge:wiki:<page-slug>#<heading>\` \u2014 wiki page exists but a specific section is missing
|
|
1679
|
-
- \`agent-knowledge:claim:<topic>\` \u2014 a specific claim/relation triple that should be in the wiki
|
|
1680
|
-
- \`agent-knowledge:raw:<source-id>\` \u2014 raw source captured but never lifted into a curated page
|
|
1681
|
-
- \`agent-knowledge:stale:<page-slug>\` \u2014 wiki page exists but contradicts ground-truth evidence in this trace (the wiki itself drifted)
|
|
1682
|
-
- \`websearch:outdated:<topic>\` \u2014 agent relied on a web result that was stale; wiki should have superseded it
|
|
1683
|
-
- \`tool-doc:<tool-name>:<aspect>\` \u2014 tool description missed a behavior aspect (return shape, failure modes, side effects)
|
|
1684
|
-
- \`system-prompt:<section>\` \u2014 system prompt should have stated the rule directly
|
|
1685
|
-
- \`memory:<key>\` \u2014 prior-run memory should have surfaced an earlier decision
|
|
1686
|
-
4. For each gap you can defend with evidence, emit ONE finding with:
|
|
1687
|
-
- \`area\` = "knowledge-gap"
|
|
1688
|
-
- \`subject\` = the locus string from the list above
|
|
1689
|
-
- \`claim\` = a sentence naming the missing or stale knowledge ("wiki has no page on invoice line-item shape, agent had to re-derive it from raw spans")
|
|
1690
|
-
- \`severity\` = "high" when the gap caused a failure or a clarifying question; "medium" when it caused unnecessary turns; "low" when it caused minor inefficiency
|
|
1691
|
-
- \`evidence_uri\` = \`span://<trace_id>/<span_id>\` of the moment the gap surfaced (the question, the self-correction, the retrieval miss, the stale web result)
|
|
1692
|
-
- \`evidence_excerpt\` = exact quote where the agent showed the gap
|
|
1693
|
-
- \`confidence\` = 0.85+ when the agent itself articulated the gap; 0.6-0.8 when inferred from behavior
|
|
1694
|
-
- \`recommended_action\` = phrased as a wiki edit when the locus is \`agent-knowledge:*\` ("Create wiki page \`invoice-line-items\` with claims: ..."), or as a prompt/tool-doc edit otherwise
|
|
1695
|
-
|
|
1696
|
-
**Delegate per layer.** After your first scan, you should have candidates spread across \`agent-knowledge:*\`, \`websearch:outdated\`, \`tool-doc:*\`, \`system-prompt:*\`, and \`memory:*\`. Spawn one \`llmQuery\` per layer in parallel \u2014 each subagent runs a focused detection (e.g. the \`agent-knowledge\` subagent looks for both missing-pages AND stale-pages; the \`websearch\` subagent looks specifically for date staleness signals; the \`tool-doc\` subagent looks for tool-call argument errors a fuller description would have prevented). Subagents return findings; you merge and emit one \`final({ findings })\` at the top.
|
|
1697
|
-
|
|
1698
|
-
Do NOT report a gap that the agent later recovered from cleanly within the same turn \u2014 that's resilience, not a gap. Cite the *non-recovery* version when both exist.
|
|
1699
|
-
|
|
1700
|
-
OBSERVABILITY rules:
|
|
1701
|
-
- Each non-final turn must emit at least one \`console.log\` for evidence.
|
|
1702
|
-
- Call \`final({ findings: [...] })\` exactly once at the top level.`;
|
|
1703
|
-
// Kind spec for the "knowledge-gap" trace analyst: pairs the knowledge-gap
// actor prompt with the discovery + search tool group and its recursion /
// turn limits.
var KNOWLEDGE_GAP_KIND_SPEC = {
  id: "knowledge-gap",
  description: "Identifies missing or stale pieces of knowledge \u2014 primarily against the agent-knowledge wiki \u2014 and attributes each to the runtime layer (wiki page, claim, raw source, websearch, tool-doc, system-prompt, memory) that should have held it.",
  area: "knowledge-gap",
  version: "1.0.0",
  actorDescription: ACTOR_PROMPT3,
  // Restricted to the "discoveryAndSearch" tool group rather than every tool.
  buildTools(store) {
    return buildTraceToolsForGroup("discoveryAndSearch", store);
  },
  recursion: { maxDepth: 2, maxParallelSubagents: 4 },
  maxTurns: 18,
  cost: { kind: "llm" }
};
|
|
1714
|
-
|
|
1715
|
-
// src/analyst/kinds/knowledge-poisoning.ts
|
|
1716
|
-
var ACTOR_PROMPT4 = `You are a knowledge-poisoning analyst for an OTLP trace dataset. Your job is to identify cases where the agent **confidently used wrong information** \u2014 not where it lacked information (that's the knowledge-gap analyst).
|
|
1717
|
-
|
|
1718
|
-
DISCOVERY \u2192 DUAL-VERIFY \u2192 CITE protocol:
|
|
1719
|
-
|
|
1720
|
-
1. \`traces.getDatasetOverview({})\` first. Identify the agents, models, and tools.
|
|
1721
|
-
2. Pull traces where the agent's confident action was later contradicted. Strongest signals:
|
|
1722
|
-
- Agent stated a fact in one span; a later span surfaced contradictory evidence; the agent then proceeded anyway or fabricated reconciliation.
|
|
1723
|
-
- Tool call with stale arguments (an id that no longer exists, an API shape that changed).
|
|
1724
|
-
- Agent cited an \`agent-knowledge\` wiki page or claim whose content contradicts the trace's own evidence \u2014 the wiki itself drifted.
|
|
1725
|
-
- Web-search result the agent cited that returned an outdated page; agent treated it as canonical.
|
|
1726
|
-
- System-prompt instruction the agent followed that ground-truth evidence in the trace contradicts (e.g. prompt says "use endpoint A"; tool reply says "endpoint A deprecated, use B").
|
|
1727
|
-
- Repeated wrong-shape parsing despite the tool's actual output proving the shape.
|
|
1728
|
-
3. Use \`traces.searchTrace\` with regex on phrases like \`actually\`, \`turns out\`, \`previously assumed\`, \`old version\`, \`deprecated\`, \`updated to\`, \`now uses\`, or specific entity names you suspect have changed.
|
|
1729
|
-
4. For each candidate poisoning, **DUAL-VERIFY**:
|
|
1730
|
-
- Confirm the agent actually acted on the false belief (cite the span where it did)
|
|
1731
|
-
- Confirm the belief is actually false in this trace's own evidence (cite the span that contradicts it)
|
|
1732
|
-
Only emit a finding when both halves are nailed down. If you can only nail one, drop it \u2014 single-evidence poisoning findings are too speculative to be useful.
|
|
1733
|
-
|
|
1734
|
-
**Delegate the dual-verify.** Use the recursion budget so each candidate poisoning gets one subagent investigating "did the agent act?" and one investigating "is the belief false?". After your first scan, fire off N parallel \`llmQuery\` pairs (one cluster per pair). Subagents return their findings; you accept only the ones where BOTH halves of the pair were confirmed.
|
|
1735
|
-
|
|
1736
|
-
For each confirmed poisoning, emit ONE finding with:
|
|
1737
|
-
- \`area\` = "knowledge-poisoning"
|
|
1738
|
-
- \`subject\` = the source of the false belief, one of: \`agent-knowledge:wiki:<page-slug>\` (wiki page contradicts current ground truth), \`agent-knowledge:claim:<topic>\` (a specific claim/relation went stale), \`agent-knowledge:raw:<source-id>\` (the raw source is outdated and the wiki inherited the drift), \`websearch:outdated:<url-or-topic>\`, \`tool-doc:<tool>\`, \`system-prompt:<section>\`, \`memory:<key>\`, \`prior-run-summary:<topic>\`
|
|
1739
|
-
- \`claim\` = one sentence: "agent believed X (from source S); evidence in trace shows X is false"
|
|
1740
|
-
- \`severity\` = "critical" when poisoning caused a wrong user-visible action; "high" when caught internally but wasted significant work; "medium" for inefficiency only
|
|
1741
|
-
- \`evidence_uri\` = \`span://<trace_id>/<span_id>\` of the action span (the moment the agent acted on the false belief)
|
|
1742
|
-
- \`evidence_excerpt\` = exact quote of the confident-but-wrong claim or action
|
|
1743
|
-
- \`confidence\` = 0.85+ when both halves are exact-quote backed; 0.6-0.8 when one half is inferred
|
|
1744
|
-
- \`recommended_action\` = where the source should be updated and how ("Update wiki page \`X\` claim \`Y\` to '...'", "Invalidate raw source \`Z\` and re-curate", "Replace system-prompt section X with 'tool foo now returns Y'")
|
|
1745
|
-
|
|
1746
|
-
Do NOT report a finding if the agent caught and corrected the false belief in the same turn \u2014 that's the system working. Reserve poisoning for cases where the false belief shaped downstream action.
|
|
1747
|
-
|
|
1748
|
-
OBSERVABILITY rules:
|
|
1749
|
-
- Each non-final turn must emit at least one \`console.log\` for evidence.
|
|
1750
|
-
- Call \`final({ findings: [...] })\` exactly once at the top level.`;
|
|
1751
|
-
// Kind spec for the "knowledge-poisoning" trace analyst: pairs the
// knowledge-poisoning actor prompt with the full trace tool set and its
// recursion / turn limits.
var KNOWLEDGE_POISONING_KIND_SPEC = {
  id: "knowledge-poisoning",
  description: "Identifies confident-but-wrong actions caused by stale memory, contradicting RAG, deprecated tool docs, or outdated system-prompt instructions.",
  area: "knowledge-poisoning",
  version: "1.0.0",
  actorDescription: ACTOR_PROMPT4,
  // Every trace tool is exposed to this kind.
  buildTools(store) {
    return buildTraceToolsForGroup("all", store);
  },
  recursion: { maxDepth: 2, maxParallelSubagents: 4 },
  maxTurns: 20,
  cost: { kind: "llm" }
};
|
|
1762
|
-
|
|
1763
|
-
// src/analyst/kinds/index.ts
|
|
1764
|
-
// Default trace-analyst kind specs. The improvement kind is listed last; its
// prompt expects findings from the other kinds to exist already.
// NOTE(review): whether consumers run these in array order is not visible here — confirm.
var DEFAULT_TRACE_ANALYST_KINDS = [
  FAILURE_MODE_KIND_SPEC, // failure clustering
  KNOWLEDGE_GAP_KIND_SPEC, // missing / stale knowledge
  KNOWLEDGE_POISONING_KIND_SPEC, // confident-but-wrong knowledge
  IMPROVEMENT_KIND_SPEC // proposed edits
];
|
|
1770
|
-
|
|
1771
|
-
// src/analyst/kinds/skill-usage.ts
|
|
1772
|
-
import { existsSync as existsSync3, readdirSync, readFileSync as readFileSync2, statSync } from "fs";
|
|
1773
|
-
import { join } from "path";
|
|
1774
|
-
// A SKILL.md longer than this many lines, with no references/ directory, is
// reported as bloated by emitSkillUsageFindings.
var BLOAT_LINE_THRESHOLD = 300;
// Matches Tangle-internal names in a skill body (global + case-insensitive so
// `String.prototype.match` returns every occurrence).
// `@tangle-network/` and `~/company` start with a non-word character, so they
// must NOT sit behind a leading `\b`: that boundary only exists when a word
// character precedes the `@` / `~`, which made the usual forms (after a
// space, quote or line start) undetectable. They keep only the trailing `\b`.
var TANGLE_PRIVATE_RE = /\b(cli-bridge|tangletools|ops-board|drew-gtr-pro|tangle\.tools|gtm-agent|kimi|tcloud)\b|(?:@tangle-network\/|~\/company)\b/gi;
// Detects an explicit "Trigger:" / "Triggers:" / "Triggers -" style clause.
var TRIGGER_RE = /triggers?\s*[:-]/i;
|
|
1777
|
-
/**
 * List the skills directly under `root`.
 *
 * An entry counts as a skill when it is a directory or a symbolic link and
 * contains a `SKILL.md` file.
 *
 * @param {string} root - Directory holding one sub-directory per skill.
 * @returns {Array<{name: string, path: string}>} Skill name (the entry name)
 *   and the path of its SKILL.md; empty when `root` does not exist.
 */
function listSkillDirs(root) {
  if (!existsSync3(root)) return [];
  return readdirSync(root, { withFileTypes: true })
    .filter((entry) => entry.isDirectory() || entry.isSymbolicLink())
    .map((entry) => ({ name: entry.name, path: join(root, entry.name, "SKILL.md") }))
    .filter((candidate) => existsSync3(candidate.path));
}
|
|
1787
|
-
/**
 * Recursively collect `.jsonl` file paths under `dir`.
 *
 * Directories are visited with an explicit stack. A directory that cannot be
 * listed is skipped. Symbolic links are not followed as directories, because
 * only entries reporting `isDirectory()` are descended into.
 *
 * @param {string} dir - Directory to walk.
 * @param {number} cap - Stop after this many files; values <= 0 mean no limit.
 * @returns {string[]} Paths of the `.jsonl` files found (empty when `dir` is missing).
 */
function walkJsonl(dir, cap) {
  const found = [];
  if (!existsSync3(dir)) return found;
  const pending = [dir];
  while (pending.length) {
    const current = pending.pop();
    let listing;
    try {
      listing = readdirSync(current, { withFileTypes: true });
    } catch {
      // Unreadable directory: skip it and keep walking the rest.
      continue;
    }
    for (const item of listing) {
      const itemPath = join(current, item.name);
      if (item.isDirectory()) {
        pending.push(itemPath);
        continue;
      }
      if (!item.name.endsWith(".jsonl")) continue;
      found.push(itemPath);
      if (cap > 0 && found.length >= cap) return found;
    }
  }
  return found;
}
|
|
1810
|
-
/**
 * Extract the `description:` value from a document's leading `---` front-matter block.
 *
 * Both LF and CRLF line endings are accepted. Previously the delimiters were
 * matched with a bare `\n`, so a CRLF-encoded file never produced a
 * description. Only the first line of the value is returned (`.` does not
 * cross line terminators).
 *
 * @param {string} body - Full text of the document.
 * @returns {string} The description text, or "" when there is no front-matter
 *   block or it has no `description:` entry.
 */
function frontmatterDescription(body) {
  const fm = /^---\r?\n([\s\S]*?)\r?\n---/.exec(body);
  const block = fm?.[1] ?? "";
  const m = /description:\s*(.+)/i.exec(block);
  return m?.[1] ?? "";
}
|
|
1816
|
-
/**
 * Count on-disk artifacts attributed to a skill across several roots.
 *
 * For every root the candidates are `<root>/.evolve/<name>` plus
 * `<root>/<alias>` for each alias. A candidate directory contributes its
 * number of entries; any other existing path contributes 1.
 *
 * @param {string[]} roots - Roots to search.
 * @param {string} name - Skill name, used as the `.evolve` sub-path.
 * @param {string[]} aliases - Extra root-relative paths to count.
 * @returns {number} Total artifact count.
 */
function countArtifacts(roots, name, aliases) {
  // Contribution of one candidate path; best effort, so fs errors count as 0.
  const weigh = (candidate) => {
    if (!existsSync3(candidate)) return 0;
    try {
      return statSync(candidate).isDirectory() ? readdirSync(candidate).length : 1;
    } catch {
      return 0;
    }
  };
  let total = 0;
  for (const root of roots) {
    total += weigh(join(root, ".evolve", name));
    for (const alias of aliases) {
      total += weigh(join(root, alias));
    }
  }
  return total;
}
|
|
1831
|
-
/**
 * Build the deterministic skill-usage report from skill directories and transcripts.
 *
 * For every skill found under `config.skillRoots` the report records size,
 * direct / slash invocation counts found in `.jsonl` transcripts, inbound
 * references from other skills' SKILL.md bodies, on-disk artifact counts and
 * a few structural flags.
 *
 * Changes relative to the previous implementation:
 *  - skill names are regex-escaped before being compiled into the inbound
 *    reference pattern (a name such as `c++` used to throw, `a.b` matched
 *    unrelated text);
 *  - a `/name` reference must not be followed by a word character or `-`, so
 *    `/foo-bar` no longer counts as a reference to `foo`;
 *  - SKILL.md bodies are keyed by path and inbound targets are de-duplicated,
 *    so two roots containing the same skill name neither share one body nor
 *    double-count inbound references;
 *  - the skill directory is derived with `join(path, "..")` instead of a
 *    `/`-only regex, so it also works with other path separators.
 *
 * @param {object} config
 * @param {Array<{root: string, kind: string}>} config.skillRoots - Skill roots and their kind tag.
 * @param {string[]} config.transcriptDirs - Directories searched for `.jsonl` transcripts.
 * @param {number} [config.maxTranscriptsPerDir] - Per-directory transcript cap (0 / unset = no cap).
 * @param {string[]} [config.artifactRoots] - Roots passed to countArtifacts.
 * @param {Object<string, string[]>} [config.artifactAliases] - Extra artifact paths per skill name.
 * @returns {{generatedFromTraces: number, records: object[]}} Transcript count and one record per skill.
 */
function buildSkillUsageReport(config) {
  const skills = config.skillRoots.flatMap(
    ({ root, kind }) => listSkillDirs(root).map((s) => ({ ...s, kind }))
  );
  // Unique names: the same skill name may exist under more than one root.
  const names = [...new Set(skills.map((s) => s.name))];
  const direct = new Map(names.map((n) => [n, 0]));
  const slash = new Map(names.map((n) => [n, 0]));
  // Increment a counter only for names that belong to a known skill.
  const bump = (counter, key) => {
    const prev = counter.get(key);
    if (prev !== void 0) counter.set(key, prev + 1);
  };
  const skillRe = /"skill"\s*:\s*"([a-z0-9_:-]+)"/g;
  const cmdRe = /<command-name>\/?([a-z0-9_:-]+)<\/command-name>/g;
  let transcripts = 0;
  for (const dir of config.transcriptDirs) {
    for (const file of walkJsonl(dir, config.maxTranscriptsPerDir ?? 0)) {
      transcripts += 1;
      let data;
      try {
        data = readFileSync2(file, "utf8");
      } catch {
        // Unreadable transcript: still counted as discovered, contributes no hits.
        continue;
      }
      for (const m of data.matchAll(skillRe)) {
        const g = m[1];
        if (!g) continue;
        // "namespace:skill" is attributed to the trailing skill segment.
        bump(direct, g.split(":").pop() ?? g);
      }
      for (const m of data.matchAll(cmdRe)) {
        const g = m[1];
        if (g === void 0) continue;
        bump(slash, g);
      }
    }
  }
  // Keyed by path, not name, so same-named skills keep their own body.
  const bodies = /* @__PURE__ */ new Map();
  for (const s of skills) {
    try {
      bodies.set(s.path, readFileSync2(s.path, "utf8"));
    } catch {
      bodies.set(s.path, "");
    }
  }
  const inbound = new Map(names.map((n) => [n, 0]));
  for (const target of names) {
    const escaped = target.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
    const ref = new RegExp(`/${escaped}(?![\\w-])|\\[\\[${escaped}\\]\\]`);
    for (const s of skills) {
      if (s.name === target) continue;
      if (ref.test(bodies.get(s.path) ?? "")) inbound.set(target, inbound.get(target) + 1);
    }
  }
  const records = skills.map((s) => {
    const body = bodies.get(s.path) ?? "";
    // Directory containing SKILL.md, independent of the path separator.
    const dir = join(s.path, "..");
    return {
      name: s.name,
      kind: s.kind,
      path: s.path,
      lines: body ? body.split("\n").length : 0,
      directInvocations: direct.get(s.name) ?? 0,
      slashInvocations: slash.get(s.name) ?? 0,
      inboundRefs: inbound.get(s.name) ?? 0,
      artifactCount: countArtifacts(
        config.artifactRoots ?? [],
        s.name,
        config.artifactAliases?.[s.name] ?? []
      ),
      tanglePrivateRefs: (body.match(TANGLE_PRIVATE_RE) ?? []).length,
      hasReferencesDir: existsSync3(join(dir, "references")),
      hasEvalsDir: existsSync3(join(dir, "evals")),
      logsRuns: body.includes("skill-runs.jsonl"),
      hasTriggerPhrases: TRIGGER_RE.test(frontmatterDescription(body) || body.slice(0, 600))
    };
  });
  return { generatedFromTraces: transcripts, records };
}
|
|
1906
|
-
// Analyst id stamped on every finding produced by the skill-usage analyst.
var ANALYST_ID = "skill-usage";
/**
 * Assemble one skill-usage finding record.
 *
 * The finding id is computed from the analyst id, area, subject and claim, and
 * the evidence is a single "artifact" reference pointing at `evidenceUri`.
 *
 * @param {string} area - Finding area.
 * @param {string} subject - Subject of the finding (the skill name for this analyst).
 * @param {string} claim - One-sentence statement of the finding.
 * @param {string} severity - Severity label.
 * @param {number} confidence - Confidence score.
 * @param {string} producedAt - Timestamp stored as `produced_at`.
 * @param {string} recommended - Text stored as `recommended_action`.
 * @param {string} evidenceUri - URI of the supporting artifact.
 * @param {string} [rationale] - Optional rationale; left undefined when omitted.
 * @returns {object} The finding record.
 */
function finding(area, subject, claim, severity, confidence, producedAt, recommended, evidenceUri, rationale) {
  const findingId = computeFindingId({ analyst_id: ANALYST_ID, area, subject, claim });
  const evidenceRefs = [{ kind: "artifact", uri: evidenceUri }];
  return {
    schema_version: "1.0.0",
    finding_id: findingId,
    analyst_id: ANALYST_ID,
    produced_at: producedAt,
    severity,
    area,
    claim,
    rationale,
    evidence_refs: evidenceRefs,
    recommended_action: recommended,
    confidence,
    subject
  };
}
|
|
1923
|
-
/**
 * Turn a skill-usage report into findings.
 *
 * Each record is checked, in this order, for: zero usage or usage visible
 * only through inbound refs / artifacts, weak discoverability, Tangle-private
 * references in a public skill, an oversized SKILL.md without references/,
 * missing evals/, and missing run logging. Findings keep record order.
 *
 * @param {{records: object[]}} report - Report produced by buildSkillUsageReport.
 * @param {string} producedAt - Timestamp stamped on every finding.
 * @returns {object[]} Findings for all records.
 */
function emitSkillUsageFindings(report, producedAt) {
  return report.records.flatMap((r) => {
    const emitted = [];
    // Every finding for this record shares its subject (name) and evidence (path).
    const add = (area, severity, confidence, claim, recommended, rationale) => {
      emitted.push(
        finding(area, r.name, claim, severity, confidence, producedAt, recommended, r.path, rationale)
      );
    };
    const directTotal = r.directInvocations + r.slashInvocations;
    const indirectTotal = r.inboundRefs + r.artifactCount;
    const trueUsage = directTotal + r.inboundRefs + r.artifactCount;

    if (trueUsage === 0) {
      add(
        "skill-usage",
        "high",
        0.6,
        `Skill '${r.name}' has zero usage across all signals (direct, slash, inbound-refs, artifacts)`,
        "Confirm the skill covers a real recurring job; if not, deprecate. Zero true usage is the only deterministic deprecation candidate.",
        "No Skill-tool call, no slash invocation, no sibling dispatches to it, and no on-disk artifacts."
      );
    } else if (directTotal === 0 && indirectTotal > 0) {
      add(
        "skill-usage",
        "info",
        0.8,
        `Skill '${r.name}' shows 0 direct invocations but is used via orchestration/artifacts (inbound=${r.inboundRefs}, artifacts=${r.artifactCount})`,
        "Do NOT treat as unused \u2014 usage is real but logged under parent skills or on disk. Strengthen direct-invocation discovery only if direct use is desired.",
        "The Skill-tool counter undercounts orchestrated/chained leaf skills."
      );
    }

    const rarelyDirect = directTotal <= 2;
    if (rarelyDirect && !r.hasTriggerPhrases) {
      add(
        "discoverability",
        "medium",
        0.7,
        `Skill '${r.name}' is rarely invoked directly and its description has no explicit trigger phrases`,
        "Add a `Triggers:` clause with verbatim user phrases to the frontmatter description so the model auto-invokes it."
      );
    }

    if (r.kind === "public" && r.tanglePrivateRefs > 0) {
      add(
        "safety",
        "high",
        0.75,
        `Public skill '${r.name}' carries ${r.tanglePrivateRefs} Tangle-private reference(s)`,
        "Sanitize incidental internal refs (cli-bridge/kimi/tcloud/~company/private repos) or relocate to a private repo. Verify @tangle-network/* refs are to PUBLISHED packages before treating as a leak."
      );
    }

    if (r.lines > BLOAT_LINE_THRESHOLD && !r.hasReferencesDir) {
      add(
        "maintainability",
        "medium",
        0.8,
        `Skill '${r.name}' is ${r.lines} lines with no references/ split (progressive disclosure)`,
        `Split detail into references/ loaded on demand; keep SKILL.md a short overview. ${r.lines} lines load into every session's context budget.`
      );
    }

    if (!r.hasEvalsDir) {
      add(
        "data-quality",
        "low",
        0.6,
        `Skill '${r.name}' ships no evals/`,
        "Add evals/evals.json with >=3 scenarios proving the skill beats baseline; gives regression coverage."
      );
    }

    if (!r.logsRuns) {
      add(
        "observability",
        "low",
        0.55,
        `Skill '${r.name}' never appends to .evolve/skill-runs.jsonl`,
        "Append one run line to .evolve/skill-runs.jsonl on completion, or declare it a non-logging leaf, so the self-improvement loop can see it ran."
      );
    }

    return emitted;
  });
}
|
|
2030
|
-
/**
 * Deterministic analyst over skill-usage records. All finding logic lives in
 * `emitSkillUsageFindings`; this class only supplies the analyst metadata and
 * resolves the timestamp stamped onto the findings.
 */
var SkillUsageAnalyst = class {
  id = ANALYST_ID;
  description = "Deterministic multi-signal skill-usage analysis: flags dead skills, measurement-invisible (orchestrated) usage, discovery gaps, public-repo leaks, bloat, missing evals, and missing run-logging.";
  inputKind = "custom";
  cost = { kind: "deterministic", est_usd_per_run: 0 };
  version = "1.0.0";
  /**
   * @param input Object with `records` and `generatedFromTraces`, passed through to `emitSkillUsageFindings`.
   * @param ctx Run context; `ctx.tags.producedAt` (when set) overrides the current time, `ctx.log` is optional.
   * @returns Whatever `emitSkillUsageFindings` returns for this input.
   */
  async analyze(input, ctx) {
    let stamp = ctx.tags?.producedAt;
    if (stamp == null) {
      stamp = (/* @__PURE__ */ new Date()).toISOString();
    }
    if (ctx.log != null) {
      // The message is only built when a logger is present.
      ctx.log(
        `skill-usage: ${input.records.length} skills over ${input.generatedFromTraces} transcripts`
      );
    }
    return emitSkillUsageFindings(input, stamp);
  }
};
|
|
2044
|
-
// Module-level SkillUsageAnalyst instance, constructed once when this module is evaluated.
var SKILL_USAGE_ANALYST = new SkillUsageAnalyst();
|
|
2045
|
-
|
|
2046
|
-
// src/analyst/registry.ts
|
|
2047
|
-
import { randomUUID } from "crypto";
|
|
2048
|
-
/**
 * Holds analysts keyed by id and runs the selected ones sequentially, in
 * registration order, collecting findings plus a per-analyst summary.
 */
var AnalystRegistry = class {
  /** Registered analysts keyed by `analyst.id`; the Map preserves registration order. */
  analysts = /* @__PURE__ */ new Map();
  /** Constructor options; read for `log`, `hooks`, `chat` and `defaultBudget`. */
  options;
  constructor(options = {}) {
    this.options = options;
  }
  /**
   * Add an analyst to the registry.
   * @throws {Error} when `id` is missing, the id is already registered, or `version` is missing.
   */
  register(analyst) {
    if (!analyst.id) throw new Error("AnalystRegistry.register: analyst.id is required");
    if (this.analysts.has(analyst.id)) {
      throw new Error(`AnalystRegistry.register: duplicate analyst id "${analyst.id}"`);
    }
    if (!analyst.version) {
      throw new Error(`AnalystRegistry.register: analyst "${analyst.id}" must declare a version`);
    }
    this.analysts.set(analyst.id, analyst);
  }
  /** @returns One `{ id, description, version, cost }` entry per registered analyst. */
  list() {
    return Array.from(this.analysts.values()).map((a) => ({
      id: a.id,
      description: a.description,
      version: a.version,
      cost: a.cost
    }));
  }
  /**
   * Drain `runStream()` and return the payload of its terminal
   * `run-completed` event.
   * @throws {Error} if the stream ends without a `run-completed` event.
   */
  async run(runId, inputs, runOpts = {}) {
    for await (const ev of this.runStream(runId, inputs, runOpts)) {
      if (ev.type === "run-completed") return ev.result;
    }
    throw new Error("AnalystRegistry.run: stream completed without run-completed event");
  }
  /**
   * Streaming counterpart to `run()`. Emits `AnalystRunEvent` values
   * in real time — `run-started`, then per-analyst `skipped` /
   * `started` / `completed`, then a terminal `run-completed` whose
   * payload is the full `AnalystRunResult`. UIs use this to render
   * progress; persistence consumers use `run()` and read the result.
   *
   * Hooks (`onBeforeAnalyze` / `onAfterAnalyze` / `onError` /
   * `onComplete`) fire as before — streaming is additive, not a hook
   * replacement.
   *
   * An analyst whose input is absent (or whose `inputKind` is not
   * recognised) is reported as `skipped`; an analyst whose `analyze()`
   * throws is reported as `failed` and the run continues.
   */
  async *runStream(runId, inputs, runOpts = {}) {
    const correlationId = `ar_${randomUUID().slice(0, 12)}`;
    const log = this.options.log ?? (() => {
    });
    const hooks = this.options.hooks ?? {};
    const startedAt = (/* @__PURE__ */ new Date()).toISOString();
    const started = Date.now();
    // The deadline is only handed to analysts through ctx; this loop does not enforce it.
    const deadlineMs = runOpts.timeoutMs ? started + runOpts.timeoutMs : void 0;
    const selected = this.selectAnalysts(runOpts);
    const budget = runOpts.budget ?? this.options.defaultBudget;
    yield {
      type: "run-started",
      run_id: runId,
      correlation_id: correlationId,
      started_at: startedAt,
      analyst_ids: selected.map((a) => a.id)
    };
    const summaries = [];
    const allFindings = [];
    let totalCost = 0;
    let remainingUsd = budget?.totalUsd;
    for (const analyst of selected) {
      const t0 = Date.now();
      const input = this.routeInput(analyst, inputs);
      if (input.kind === "missing") {
        const summary = {
          analyst_id: analyst.id,
          status: "skipped",
          reason: `missing input of kind '${analyst.inputKind}'`,
          findings_count: 0,
          latency_ms: 0,
          cost_usd: 0
        };
        summaries.push(summary);
        log(`[analyst] skip ${analyst.id} \u2014 missing input`, { runId, kind: analyst.inputKind });
        await hooks.onAfterAnalyze?.({ analyst, summary, findings: [], runId });
        yield { type: "analyst-skipped", summary };
        continue;
      }
      const perBudget = allocateBudget(budget, {
        analyst,
        remainingUsd,
        runningCount: selected.length
      });
      const ctx = {
        runId,
        correlationId,
        deadlineMs,
        budgetUsd: perBudget,
        chat: this.options.chat,
        tags: runOpts.tags,
        log: (msg, fields) => log(`[${analyst.id}] ${msg}`, { runId, correlationId, ...fields }),
        signal: runOpts.signal,
        priorFindings: selectPriorFindings(runOpts.priorFindings, analyst.id)
      };
      await hooks.onBeforeAnalyze?.({ analyst, ctx, runId });
      yield {
        type: "analyst-started",
        analyst_id: analyst.id,
        started_at: new Date(t0).toISOString()
      };
      try {
        const findings = await analyst.analyze(input.value, ctx);
        const latency = Date.now() - t0;
        const cost = sumFindingCost(findings);
        totalCost += cost;
        // Only track a remaining balance when the budget declared a numeric total.
        if (typeof remainingUsd === "number") remainingUsd = Math.max(0, remainingUsd - cost);
        allFindings.push(...findings);
        const summary = {
          analyst_id: analyst.id,
          status: "ok",
          findings_count: findings.length,
          latency_ms: latency,
          cost_usd: cost
        };
        summaries.push(summary);
        log(`[analyst] ok ${analyst.id}`, {
          runId,
          findings: findings.length,
          latency_ms: latency,
          cost_usd: cost
        });
        await hooks.onAfterAnalyze?.({ analyst, summary, findings, runId });
        yield { type: "analyst-completed", summary, findings };
      } catch (err) {
        const latency = Date.now() - t0;
        const e = err instanceof Error ? err : new Error(String(err));
        // The onError hook may contribute findings describing the failure.
        const hookFindings = await hooks.onError?.({ analyst, error: e, runId }) ?? [];
        if (hookFindings.length) allFindings.push(...hookFindings);
        const summary = {
          analyst_id: analyst.id,
          status: "failed",
          findings_count: hookFindings.length,
          latency_ms: latency,
          cost_usd: 0,
          error: { class: e.constructor.name, message: e.message }
        };
        summaries.push(summary);
        log(`[analyst] FAIL ${analyst.id}`, {
          runId,
          error_class: e.constructor.name,
          error: e.message
        });
        await hooks.onAfterAnalyze?.({ analyst, summary, findings: hookFindings, runId });
        yield { type: "analyst-completed", summary, findings: hookFindings };
      }
    }
    const result = {
      run_id: runId,
      correlation_id: correlationId,
      started_at: startedAt,
      ended_at: (/* @__PURE__ */ new Date()).toISOString(),
      findings: allFindings,
      per_analyst: summaries,
      total_cost_usd: totalCost
    };
    await hooks.onComplete?.({ result });
    yield { type: "run-completed", result };
  }
  /** Apply the optional `only` allow-list, then the optional `skip` deny-list, to the registered analysts. */
  selectAnalysts(opts) {
    let candidates = Array.from(this.analysts.values());
    if (opts.only?.length) {
      const only = new Set(opts.only);
      candidates = candidates.filter((a) => only.has(a.id));
    }
    if (opts.skip?.length) {
      const skip = new Set(opts.skip);
      candidates = candidates.filter((a) => !skip.has(a.id));
    }
    return candidates;
  }
  /**
   * Pick the input slot matching `analyst.inputKind`.
   * @returns `{ kind: "present", value }` when the slot is populated, otherwise `{ kind: "missing" }`.
   */
  routeInput(analyst, inputs) {
    switch (analyst.inputKind) {
      case "trace-store":
        return inputs.traceStore ? { kind: "present", value: inputs.traceStore } : { kind: "missing" };
      case "artifact-dir":
        return inputs.artifactDir ? { kind: "present", value: inputs.artifactDir } : { kind: "missing" };
      case "run-record":
        return inputs.runRecord ? { kind: "present", value: inputs.runRecord } : { kind: "missing" };
      case "judge-input":
        return inputs.judgeInput ? { kind: "present", value: inputs.judgeInput } : { kind: "missing" };
      case "custom": {
        // Custom inputs are keyed by analyst id; any defined value (including 0 or "") counts as present.
        const v = inputs.custom?.[analyst.id];
        return v !== void 0 ? { kind: "present", value: v } : { kind: "missing" };
      }
      default:
        // An unrecognised inputKind previously fell through and returned undefined,
        // which crashed runStream outside its per-analyst try/catch. Treat it as
        // missing input so the analyst is skipped and the rest of the run proceeds.
        return { kind: "missing" };
    }
  }
};
|
|
2237
|
-
/**
 * Work out the USD budget handed to a single analyst.
 *
 * Resolution order: no policy → undefined; a custom `policy.allocate`
 * callback decides; no `totalUsd` → undefined; otherwise the total is split
 * by the number of running analysts, scaled by the analyst's weight when a
 * `weights` table exists (missing entries weigh 1).
 *
 * @param policy Budget policy, or a falsy value for "no budget".
 * @param args `{ analyst, remainingUsd, runningCount }` for the analyst being started.
 * @returns The per-analyst budget, or undefined when none applies.
 */
function allocateBudget(policy, args) {
  if (!policy) return undefined;
  if (policy.allocate) {
    const request = {
      analyst: args.analyst,
      totalUsd: policy.totalUsd,
      remainingUsd: args.remainingUsd,
      runningCount: args.runningCount
    };
    return policy.allocate(request);
  }
  if (policy.totalUsd == null) return undefined;
  // The divisor is the analyst count (at least 1), not the sum of the weights.
  const divisor = Math.max(1, args.runningCount);
  if (!policy.weights) {
    return policy.totalUsd / divisor;
  }
  const weight = policy.weights[args.analyst.id] ?? 1;
  return policy.totalUsd * weight / divisor;
}
|
|
2255
|
-
/**
 * Add up `metadata.cost_usd` across findings.
 * Entries without metadata, or whose cost is not a finite number, contribute nothing.
 * @param findings Iterable of findings.
 * @returns The summed cost (0 for an empty list).
 */
function sumFindingCost(findings) {
  let total = 0;
  for (const finding of findings) {
    const cost = finding.metadata?.cost_usd;
    if (typeof cost !== "number" || !Number.isFinite(cost)) continue;
    total += cost;
  }
  return total;
}
|
|
2263
|
-
/**
 * Select the prior findings relevant to one analyst.
 *
 * `source` is either a flat array (filtered by `analyst_id`) or a record
 * keyed by analyst id, where the `"*"` key applies to every analyst and is
 * appended after the analyst's own entries.
 *
 * @returns A non-empty array, or undefined when nothing applies.
 */
function selectPriorFindings(source, analystId) {
  if (!source) return undefined;
  const relevant = Array.isArray(source)
    ? source.filter((finding) => finding.analyst_id === analystId)
    : [...(source[analystId] ?? []), ...(source["*"] ?? [])];
  return relevant.length > 0 ? relevant : undefined;
}
|
|
206
|
+
weightedMean,
|
|
207
|
+
wilcoxonSignedRank
|
|
208
|
+
} from "./chunk-ITBRCT73.js";
|
|
209
|
+
import {
|
|
210
|
+
FileSystemTraceStore,
|
|
211
|
+
InMemoryTraceStore,
|
|
212
|
+
OTEL_AGENT_EVAL_SCOPE,
|
|
213
|
+
ReplayCache,
|
|
214
|
+
ReplayCacheMissError,
|
|
215
|
+
buildTraceInsightContext,
|
|
216
|
+
buildTraceInsightPrompt,
|
|
217
|
+
captureFetchToRawSink,
|
|
218
|
+
createOtelExporter,
|
|
219
|
+
createOtelTracingStore,
|
|
220
|
+
createReplayFetch,
|
|
221
|
+
defaultTraceInsightPanel,
|
|
222
|
+
describeTraceInsightScope,
|
|
223
|
+
domainEvidencePattern,
|
|
224
|
+
exportRunAsOtlp,
|
|
225
|
+
flattenOtlpExportToNdjson,
|
|
226
|
+
inferDomainKeywords,
|
|
227
|
+
iterateRawCalls,
|
|
228
|
+
otelRunCompleteHook,
|
|
229
|
+
otlpToRunRecords,
|
|
230
|
+
otlpToTraceRunRecords,
|
|
231
|
+
planTraceInsightQuestions,
|
|
232
|
+
scoreTraceInsightReadiness,
|
|
233
|
+
tokenizeDomainWords,
|
|
234
|
+
traceAnalystOnRunComplete
|
|
235
|
+
} from "./chunk-JHA3ZGSO.js";
|
|
236
|
+
import {
|
|
237
|
+
DEFAULT_REDACTION_RULES,
|
|
238
|
+
REDACTION_VERSION,
|
|
239
|
+
redactString,
|
|
240
|
+
redactValue
|
|
241
|
+
} from "./chunk-GGE4NNQT.js";
|
|
242
|
+
import {
|
|
243
|
+
aggregateLlm,
|
|
244
|
+
argHash,
|
|
245
|
+
groupBy,
|
|
246
|
+
judgeSpans,
|
|
247
|
+
llmSpans,
|
|
248
|
+
runFailureClass,
|
|
249
|
+
runsForScenario,
|
|
250
|
+
toolSpans
|
|
251
|
+
} from "./chunk-47X6LRCE.js";
|
|
252
|
+
import {
|
|
253
|
+
FAILURE_CLASSES,
|
|
254
|
+
TRACE_SCHEMA_VERSION,
|
|
255
|
+
isJudgeSpan,
|
|
256
|
+
isLlmSpan,
|
|
257
|
+
isRetrievalSpan,
|
|
258
|
+
isSandboxSpan,
|
|
259
|
+
isToolSpan
|
|
260
|
+
} from "./chunk-5BKGXME7.js";
|
|
261
|
+
import {
|
|
262
|
+
DEFAULT_TRACE_ANALYST_BUDGETS,
|
|
263
|
+
OtlpFileTraceStore,
|
|
264
|
+
SpanNotFoundError,
|
|
265
|
+
TRACE_ANALYST_ACTOR_DESCRIPTION,
|
|
266
|
+
TRACE_ANALYST_ACTOR_DESCRIPTION_VERSION,
|
|
267
|
+
TRACE_ANALYST_SUBAGENT_DESCRIPTION,
|
|
268
|
+
TRACE_ANALYST_TRUNCATION_MARKER_PREFIX,
|
|
269
|
+
TraceFileMissingError,
|
|
270
|
+
TraceNotFoundError,
|
|
271
|
+
analyzeTraces,
|
|
272
|
+
asNumber,
|
|
273
|
+
asString,
|
|
274
|
+
buildTraceAnalystTools,
|
|
275
|
+
extractOtlpAttributes,
|
|
276
|
+
firstNumberAttr,
|
|
277
|
+
firstStringAttr,
|
|
278
|
+
inferOtlpKind,
|
|
279
|
+
projectOtlpFlatLine,
|
|
280
|
+
readOtlpStatus,
|
|
281
|
+
stringField,
|
|
282
|
+
traceAnalystFunctionGroup
|
|
283
|
+
} from "./chunk-VUINJM5M.js";
|
|
284
|
+
import {
|
|
285
|
+
RunIntegrityError,
|
|
286
|
+
assertRunCaptured,
|
|
287
|
+
throwIfRunIncomplete
|
|
288
|
+
} from "./chunk-SBCB6VZY.js";
|
|
289
|
+
import {
|
|
290
|
+
FileSystemRawProviderSink,
|
|
291
|
+
InMemoryRawProviderSink,
|
|
292
|
+
NoopRawProviderSink,
|
|
293
|
+
defaultProviderRedactor,
|
|
294
|
+
providerFromBaseUrl
|
|
295
|
+
} from "./chunk-PC4UYEBM.js";
|
|
296
|
+
import {
|
|
297
|
+
AGENT_PROFILE_KINDS,
|
|
298
|
+
AgentProfileCellValidationError,
|
|
299
|
+
RunRecordValidationError,
|
|
300
|
+
agentProfileCellHashMaterial,
|
|
301
|
+
agentProfileCellKey,
|
|
302
|
+
assertRunAgentProfileCell,
|
|
303
|
+
buildAgentProfileCell,
|
|
304
|
+
buildSandboxAgentProfileCell,
|
|
305
|
+
groupRunsByAgentProfileCell,
|
|
306
|
+
isRunRecord,
|
|
307
|
+
parseRunRecordSafe,
|
|
308
|
+
requireAgentProfileCell,
|
|
309
|
+
roundTripRunRecord,
|
|
310
|
+
toAgentProfileJson,
|
|
311
|
+
validateAgentProfileCell,
|
|
312
|
+
validateRunRecord,
|
|
313
|
+
verifyAgentProfileCell
|
|
314
|
+
} from "./chunk-F3SRAAZO.js";
|
|
315
|
+
import {
|
|
316
|
+
TraceEmitter,
|
|
317
|
+
llmSpanFromProvider
|
|
318
|
+
} from "./chunk-TVVP3ZZQ.js";
|
|
319
|
+
import {
|
|
320
|
+
canonicalize,
|
|
321
|
+
evaluateHypothesis,
|
|
322
|
+
hashJson,
|
|
323
|
+
signManifest,
|
|
324
|
+
verifyManifest
|
|
325
|
+
} from "./chunk-VSMTAMNK.js";
|
|
326
|
+
import {
|
|
327
|
+
AgentEvalError,
|
|
328
|
+
CaptureIntegrityError,
|
|
329
|
+
ConfigError,
|
|
330
|
+
JudgeError,
|
|
331
|
+
NotFoundError,
|
|
332
|
+
ReplayError,
|
|
333
|
+
ValidationError,
|
|
334
|
+
VerificationError
|
|
335
|
+
} from "./chunk-3BFEG2F6.js";
|
|
336
|
+
import {
|
|
337
|
+
__export
|
|
338
|
+
} from "./chunk-PZ5AY32C.js";
|
|
2275
339
|
|
|
2276
340
|
// src/auto-pr.ts
|
|
2277
341
|
async function proposeAutomatedPullRequest(client, input) {
|
|
@@ -2477,12 +541,12 @@ function ghCliClient(opts = {}) {
|
|
|
2477
541
|
await exec("git", ["branch", "-D", input.branchName], { cwd });
|
|
2478
542
|
await run("git", ["checkout", "-b", input.branchName]);
|
|
2479
543
|
const { mkdir, writeFile } = await import("fs/promises");
|
|
2480
|
-
const { dirname:
|
|
544
|
+
const { dirname: dirname3, join: join4, resolve } = await import("path");
|
|
2481
545
|
for (const change of input.fileChanges) {
|
|
2482
546
|
const abs = resolve(cwd, change.path);
|
|
2483
|
-
await mkdir(
|
|
547
|
+
await mkdir(dirname3(abs), { recursive: true });
|
|
2484
548
|
await writeFile(abs, change.contents, "utf8");
|
|
2485
|
-
await run("git", ["add",
|
|
549
|
+
await run("git", ["add", join4(change.path)]);
|
|
2486
550
|
}
|
|
2487
551
|
const env = {};
|
|
2488
552
|
if (input.authorName) env.GIT_AUTHOR_NAME = input.authorName;
|
|
@@ -3093,158 +1157,6 @@ var ConvergenceTracker = class {
|
|
|
3093
1157
|
}
|
|
3094
1158
|
};
|
|
3095
1159
|
|
|
3096
|
-
// src/metrics.ts
|
|
3097
|
-
/**
 * Exact-match price table. Values are multiplied by tokens / 1000 in
 * `estimateCost`, i.e. they are per-1K-token rates.
 */
var MODEL_PRICING = {
  "gpt-4o": { input: 25e-4, output: 0.01 },
  "gpt-4o-mini": { input: 15e-5, output: 6e-4 },
  "gpt-4-turbo": { input: 0.01, output: 0.03 },
  "claude-sonnet-4-20250514": { input: 3e-3, output: 0.015 },
  "claude-opus-4-20250514": { input: 0.015, output: 0.075 },
  "claude-3-haiku-20240307": { input: 25e-5, output: 125e-5 }
};
/**
 * Fallback price table matched by regex against the normalized model id.
 * Order matters: the first matching pattern wins, so narrower patterns
 * (e.g. `gpt-4o-mini`) are listed before broader ones (`gpt-4o|gpt-4`).
 */
var FAMILY_PRICING = [
  [/claude.*opus/, { input: 0.015, output: 0.075 }],
  [/claude.*haiku/, { input: 8e-4, output: 4e-3 }],
  [/claude.*sonnet|claude-code|claude-sonnet/, { input: 3e-3, output: 0.015 }],
  [/gpt-4o-mini/, { input: 15e-5, output: 6e-4 }],
  [/gpt-5|gpt-4\.1|o[134]\b/, { input: 125e-5, output: 0.01 }],
  [/gpt-4o|gpt-4/, { input: 25e-4, output: 0.01 }],
  [/deepseek/, { input: 3e-4, output: 11e-4 }],
  [/glm|zhipu|zai/, { input: 6e-4, output: 22e-4 }],
  [/kimi|moonshot/, { input: 6e-4, output: 25e-4 }],
  [/qwen/, { input: 4e-4, output: 12e-4 }],
  [/gemini.*flash/, { input: 1e-4, output: 4e-4 }],
  [/gemini/, { input: 125e-5, output: 5e-3 }],
  [/llama/, { input: 2e-4, output: 6e-4 }]
];
/** Drop everything from the first "@" onward, then trim and lower-case the id. */
function normalizeModelId(model) {
  return (model.split("@")[0] ?? model).trim().toLowerCase();
}
/**
 * Look up pricing for a model id: exact id first, then the normalized id,
 * then the first matching family pattern.
 *
 * Table lookups use own-property checks so that ids which happen to name an
 * inherited `Object.prototype` member ("constructor", "toString", ...) are
 * reported as unpriced instead of resolving to a function.
 *
 * @param {string} model Model id as reported by the provider.
 * @returns {{input: number, output: number} | null} Pricing entry, or null when unknown.
 */
function resolveModelPricing(model) {
  if (Object.hasOwn(MODEL_PRICING, model)) return MODEL_PRICING[model];
  const id = normalizeModelId(model);
  if (Object.hasOwn(MODEL_PRICING, id)) return MODEL_PRICING[id];
  for (const [pattern, price] of FAMILY_PRICING) {
    if (pattern.test(id)) return price;
  }
  return null;
}
/** @returns {boolean} True when `resolveModelPricing` finds a price for the model. */
function isModelPriced(model) {
  return resolveModelPricing(model) !== null;
}
|
|
3135
|
-
// Model ids estimateCost has already warned about, so each unpriced id is reported once.
var warnedUnpricedModels = /* @__PURE__ */ new Set();
|
|
3136
|
-
/**
 * Rough token estimate: one token per four UTF-16 code units, rounded up.
 * @param {string} text
 * @returns {number}
 */
function estimateTokens(text) {
  const charsPerToken = 4;
  const chars = text.length;
  return Math.ceil(chars / charsPerToken);
}
|
|
3139
|
-
/**
 * Estimate the USD cost of a call from its token counts.
 * Pricing values are applied per 1000 tokens. Unpriced models cost 0 and
 * trigger a single console warning per model id.
 *
 * @param {number} inputTokens
 * @param {number} outputTokens
 * @param {string} model
 * @returns {number}
 */
function estimateCost(inputTokens, outputTokens, model) {
  const pricing = resolveModelPricing(model);
  if (pricing) {
    return inputTokens / 1e3 * pricing.input + outputTokens / 1e3 * pricing.output;
  }
  const alreadyWarned = warnedUnpricedModels.has(model);
  if (!alreadyWarned) {
    warnedUnpricedModels.add(model);
    console.warn(
      `estimateCost: no pricing for model "${model}" \u2014 returning 0; add it to MODEL_PRICING/FAMILY_PRICING (cost/Pareto axes will be blank until then)`
    );
  }
  return 0;
}
|
|
3152
|
-
/**
 * Running totals of input tokens, output tokens and estimated cost for one model.
 */
var TokenCounter = class {
  totalInput = 0;
  totalOutput = 0;
  totalCost = 0;
  model;
  /** @param {string} [model] Model id used for pricing; defaults to "gpt-4o". */
  constructor(model = "gpt-4o") {
    this.model = model;
  }
  /** Record tokens for a turn, returns per-turn cost */
  record(inputTokens, outputTokens) {
    this.totalInput = this.totalInput + inputTokens;
    this.totalOutput = this.totalOutput + outputTokens;
    const turnCost = estimateCost(inputTokens, outputTokens, this.model);
    this.totalCost = this.totalCost + turnCost;
    return turnCost;
  }
  /** Estimate and record from raw text */
  recordFromText(inputText, outputText) {
    const estimated = {
      inputTokens: estimateTokens(inputText),
      outputTokens: estimateTokens(outputText)
    };
    const cost = this.record(estimated.inputTokens, estimated.outputTokens);
    return { ...estimated, cost };
  }
  /** @returns {number} Input tokens recorded so far. */
  getTotalInput() {
    return this.totalInput;
  }
  /** @returns {number} Output tokens recorded so far. */
  getTotalOutput() {
    return this.totalOutput;
  }
  /** @returns {number} Estimated cost recorded so far. */
  getTotalCost() {
    return this.totalCost;
  }
};
|
|
3185
|
-
/**
 * Collects one metrics record per turn, combining caller-supplied numbers
 * with a snapshot of workspace state fetched through `client`.
 */
var MetricsCollector = class {
  client;
  workspaceId;
  metrics = [];
  /**
   * @param client Object exposing async `getTasks`, `getEvents`, `getApprovals` and `getVaultTree`.
   * @param workspaceId Identifier passed to every client call.
   */
  constructor(client, workspaceId) {
    this.client = client;
    this.workspaceId = workspaceId;
  }
  /** Collect metrics after a turn completes */
  async collect(turn, responseLatencyMs, responseChars, codeBlocksProduced, blocksExtracted, completionCriteriaMet, completionCriteriaTotal, qualityScore, inputTokens = 0, outputTokens = 0, estimatedCostUsd = 0) {
    const snapshot = await this.getState();
    // Guard against division by zero when the turn defines no completion criteria.
    let completionPercent = 0;
    if (completionCriteriaTotal > 0) {
      completionPercent = completionCriteriaMet / completionCriteriaTotal * 100;
    }
    const entry = {
      turn,
      timestamp: (/* @__PURE__ */ new Date()).toISOString(),
      tasks: snapshot.tasks,
      events: snapshot.events,
      proposals: snapshot.proposals,
      vaultFiles: snapshot.vaultFiles.length,
      responseLatencyMs,
      responseChars,
      codeBlocksProduced,
      blocksExtracted,
      qualityScore,
      inputTokens,
      outputTokens,
      estimatedCostUsd,
      // NOTE(review): this mirrors the per-turn estimate rather than a running sum — confirm intent.
      totalCostUsd: estimatedCostUsd,
      completionPercent
    };
    this.metrics.push(entry);
    return entry;
  }
  /** Get current product state */
  async getState() {
    const workspace = this.workspaceId;
    const [tasks, events, approvals, vaultFiles] = await Promise.all([
      this.client.getTasks(workspace),
      this.client.getEvents(workspace),
      this.client.getApprovals(workspace),
      this.client.getVaultTree(workspace)
    ]);
    const withStatus = (status) => approvals.filter((approval) => approval.status === status).length;
    const proposals = {
      pending: withStatus("pending"),
      approved: withStatus("approved"),
      rejected: withStatus("rejected")
    };
    return {
      tasks: tasks.length,
      events: events.length,
      proposals,
      vaultFiles,
      codeBlocks: 0,
      generations: 0
    };
  }
  /** Get all collected metrics */
  getMetrics() {
    return this.metrics.slice();
  }
  /** Get convergence curve (completion% over turns) */
  getConvergenceCurve() {
    return this.metrics.map(({ completionPercent }) => completionPercent);
  }
};
|
|
3247
|
-
|
|
3248
1160
|
// src/driver.ts
|
|
3249
1161
|
var RIGOR_STANCE = {
|
|
3250
1162
|
cooperative: "Your stance: a pragmatic early adopter. You accept reasonable answers and only push back on clear gaps or outright errors.",
|
|
@@ -3517,10 +1429,10 @@ var FileSystemFeedbackTrajectoryStore = class {
|
|
|
3517
1429
|
}
|
|
3518
1430
|
async append(record) {
|
|
3519
1431
|
const { appendFile, mkdir } = await import("fs/promises");
|
|
3520
|
-
const { join:
|
|
1432
|
+
const { join: join4 } = await import("path");
|
|
3521
1433
|
await mkdir(this.dir, { recursive: true });
|
|
3522
1434
|
await appendFile(
|
|
3523
|
-
|
|
1435
|
+
join4(this.dir, "feedback-trajectories.ndjson"),
|
|
3524
1436
|
`${JSON.stringify(record)}
|
|
3525
1437
|
`,
|
|
3526
1438
|
"utf8"
|
|
@@ -3529,8 +1441,8 @@ var FileSystemFeedbackTrajectoryStore = class {
|
|
|
3529
1441
|
async load() {
|
|
3530
1442
|
if (this.loaded) return;
|
|
3531
1443
|
const { readFile } = await import("fs/promises");
|
|
3532
|
-
const { join:
|
|
3533
|
-
const file =
|
|
1444
|
+
const { join: join4 } = await import("path");
|
|
1445
|
+
const file = join4(this.dir, "feedback-trajectories.ndjson");
|
|
3534
1446
|
try {
|
|
3535
1447
|
const raw = await readFile(file, "utf8");
|
|
3536
1448
|
for (const line of raw.split("\n")) {
|
|
@@ -3843,21 +1755,21 @@ var SingleBackendError = class extends AgentEvalError {
|
|
|
3843
1755
|
/**
 * Remove every trailing "/" from a URL string so base URLs compare equal
 * regardless of how many slashes they end with.
 * @param {string} url
 * @returns {string}
 */
function stripSlash(url) {
  let end = url.length;
  while (end > 0 && url[end - 1] === "/") {
    end -= 1;
  }
  return url.slice(0, end);
}
|
|
3846
|
-
function assertSingleBackend(
|
|
1758
|
+
function assertSingleBackend(agent, judge, opts = {}) {
|
|
3847
1759
|
const divergences = [];
|
|
3848
|
-
if (
|
|
3849
|
-
divergences.push({ field: "kind", agent:
|
|
1760
|
+
if (agent.kind !== judge.kind) {
|
|
1761
|
+
divergences.push({ field: "kind", agent: agent.kind, judge: judge.kind });
|
|
3850
1762
|
}
|
|
3851
|
-
if (stripSlash(
|
|
3852
|
-
divergences.push({ field: "baseUrl", agent:
|
|
1763
|
+
if (stripSlash(agent.baseUrl) !== stripSlash(judge.baseUrl)) {
|
|
1764
|
+
divergences.push({ field: "baseUrl", agent: agent.baseUrl, judge: judge.baseUrl });
|
|
3853
1765
|
}
|
|
3854
|
-
if (
|
|
3855
|
-
divergences.push({ field: "model", agent:
|
|
1766
|
+
if (agent.model !== judge.model) {
|
|
1767
|
+
divergences.push({ field: "model", agent: agent.model, judge: judge.model });
|
|
3856
1768
|
}
|
|
3857
|
-
if (
|
|
3858
|
-
divergences.push({ field: "provider", agent:
|
|
1769
|
+
if (agent.provider !== judge.provider) {
|
|
1770
|
+
divergences.push({ field: "provider", agent: agent.provider, judge: judge.provider });
|
|
3859
1771
|
}
|
|
3860
|
-
const agentHasKey = Boolean(
|
|
1772
|
+
const agentHasKey = Boolean(agent.apiKey);
|
|
3861
1773
|
const judgeHasKey = Boolean(judge.apiKey);
|
|
3862
1774
|
if (agentHasKey !== judgeHasKey) {
|
|
3863
1775
|
divergences.push({
|
|
@@ -4910,194 +2822,6 @@ function pathExists(obj, path) {
|
|
|
4910
2822
|
return true;
|
|
4911
2823
|
}
|
|
4912
2824
|
|
|
4913
|
-
// src/completion-verifier.ts
|
|
4914
|
-
// Words dropped before matching.
var STOPWORDS = /* @__PURE__ */ new Set(
  ["the", "a", "an", "of", "for", "and", "or", "to", "in", "on", "with", "by"]
);
// Minimum token recall for a produced item to count as a match.
var MATCH_THRESHOLD = 0.5;
// Minimum trimmed content length for content to be considered substantive.
var MIN_CONTENT_CHARS = 50;
/**
 * Split text into a set of lower-cased alphanumeric tokens, dropping
 * single-character tokens and stopwords.
 * @param {string} s
 * @returns {Set<string>}
 */
function tokens(s) {
  const words = s.toLowerCase().match(/[a-z0-9]+/g) ?? [];
  const kept = new Set();
  for (const word of words) {
    if (word.length > 1 && !STOPWORDS.has(word)) kept.add(word);
  }
  return kept;
}
/**
 * Fraction of the requirement's tokens that also appear in the candidate text.
 * @returns {number} A value in [0, 1]; 0 when the requirement yields no tokens.
 */
function tokenRecall(requirementText, candidateText) {
  const wanted = tokens(requirementText);
  if (wanted.size === 0) return 0;
  const available = tokens(candidateText);
  const hits = [...wanted].filter((token) => available.has(token)).length;
  return hits / wanted.size;
}
|
|
4943
|
-
/**
 * Build match candidates between one requirement and the produced artifacts.
 * Artifacts whose trimmed content is shorter than MIN_CONTENT_CHARS are
 * ignored. An artifact whose `kind` equals the requirement's category
 * (case-insensitively) scores at least 1.
 *
 * @returns Candidates `{ reqIndex, itemKey, score, evidence, content }` scoring at or above MATCH_THRESHOLD.
 */
function artifactCandidates(req, reqIndex, artifacts) {
  const needle = `${req.title} ${req.category ?? ""}`;
  return artifacts.flatMap((artifact, position) => {
    const body = artifact.content ?? "";
    if (body.trim().length < MIN_CONTENT_CHARS) return [];
    const recall = tokenRecall(needle, `${artifact.path ?? ""} ${artifact.kind}`);
    const sameCategory = Boolean(req.category) && Boolean(artifact.kind) && req.category.toLowerCase() === artifact.kind.toLowerCase();
    const score = sameCategory ? Math.max(recall, 1) : recall;
    if (score < MATCH_THRESHOLD) return [];
    return [
      {
        reqIndex,
        itemKey: `artifact:${position}`,
        score,
        evidence: `artifact '${artifact.path ?? artifact.kind}' matched (token recall ${score.toFixed(2)})`,
        content: artifact.content ?? null
      }
    ];
  });
}
|
|
4963
|
-
/**
 * Build match candidates between one requirement and approved proposals.
 * Only proposals with status "approved" are considered; the proposal title
 * is what gets matched. Content is carried along only when it is at least
 * MIN_CONTENT_CHARS long after trimming, otherwise it is null.
 *
 * @returns Candidates `{ reqIndex, itemKey, score, evidence, content }` scoring at or above MATCH_THRESHOLD.
 */
function proposalCandidates(req, reqIndex, proposals) {
  const needle = `${req.title} ${req.category ?? ""}`;
  const found = [];
  for (const proposal of proposals) {
    if (proposal.status === "approved") {
      const score = tokenRecall(needle, proposal.title);
      if (score >= MATCH_THRESHOLD) {
        const text = proposal.content ?? "";
        const substantive = text.trim().length >= MIN_CONTENT_CHARS;
        found.push({
          reqIndex,
          itemKey: `proposal:${proposal.id}`,
          score,
          evidence: `approved proposal '${proposal.title}' matched (token recall ${score.toFixed(2)})`,
          content: substantive ? text : null
        });
      }
    }
  }
  return found;
}
|
|
4981
|
-
/**
 * Build match candidates between one requirement and the names of tool calls made.
 * Only the requirement title is matched, and tool-call candidates never carry content.
 *
 * @returns Candidates `{ reqIndex, itemKey, score, evidence, content: null }` scoring at or above MATCH_THRESHOLD.
 */
function toolCallCandidates(req, reqIndex, toolCalls) {
  return toolCalls.flatMap((toolName, position) => {
    const score = tokenRecall(req.title, toolName);
    if (score < MATCH_THRESHOLD) return [];
    return [
      {
        reqIndex,
        itemKey: `tool:${position}`,
        score,
        evidence: `tool call '${toolName}' matched (token recall ${score.toFixed(2)})`,
        content: null
      }
    ];
  });
}
|
|
4996
|
-
/**
 * Check which gold requirements the produced state satisfies.
 *
 * Candidates are gathered per requirement from the sources its `satisfiedBy`
 * allows ("artifact", "proposal", "tool-call", or "any" by default), then
 * assigned greedily by descending score so that each requirement gets at most
 * one item and each item serves at most one requirement. A matched item that
 * carries content is passed to `checkCorrectness`; a requirement is satisfied
 * when it is matched and was not judged incorrect.
 *
 * @param gold `{ taskId, requirements }`.
 * @param state `{ artifacts, proposals, toolCalls }`.
 * @param checkCorrectness Async `(requirement, content) => { correct, reason }`; called sequentially.
 * @returns `{ taskId, requirements, completionRate, fullyComplete }`.
 * @throws {Error} when the gold spec has no requirements.
 */
async function verifyCompletion(gold, state, checkCorrectness) {
  if (gold.requirements.length === 0) {
    throw new Error(
      `verifyCompletion: task '${gold.taskId}' has no requirements \u2014 malformed gold spec`
    );
  }
  const pool = [];
  gold.requirements.forEach((req, idx) => {
    const mode = req.satisfiedBy ?? "any";
    const accepts = (source) => mode === source || mode === "any";
    if (accepts("artifact")) pool.push(...artifactCandidates(req, idx, state.artifacts));
    if (accepts("proposal")) pool.push(...proposalCandidates(req, idx, state.proposals));
    if (accepts("tool-call")) pool.push(...toolCallCandidates(req, idx, state.toolCalls));
  });
  // Best-scoring candidates claim their requirement and item first.
  pool.sort((left, right) => right.score - left.score);
  const chosen = /* @__PURE__ */ new Map();
  const usedItems = /* @__PURE__ */ new Set();
  for (const candidate of pool) {
    const free = !chosen.has(candidate.reqIndex) && !usedItems.has(candidate.itemKey);
    if (free) {
      chosen.set(candidate.reqIndex, candidate);
      usedItems.add(candidate.itemKey);
    }
  }
  const requirements = [];
  let satisfiedCount = 0;
  for (let idx = 0; idx < gold.requirements.length; idx++) {
    const req = gold.requirements[idx];
    const match = chosen.get(idx);
    const evidence = [];
    let correct = null;
    if (!match) {
      const mode = req.satisfiedBy ?? "any";
      const label = mode === "any" ? "artifact/proposal/tool-call" : mode;
      evidence.push(`no produced ${label} matched this requirement`);
    } else {
      evidence.push(match.evidence);
      if (match.content === null) {
        evidence.push("correctness: not assessed \u2014 matched item carries no content");
      } else {
        const verdict = await checkCorrectness(req, match.content);
        correct = verdict.correct;
        evidence.push(`correctness: ${verdict.correct ? "pass" : "fail"} \u2014 ${verdict.reason}`);
      }
    }
    const structurallyPresent = match !== undefined;
    // A match without an assessed verdict (correct === null) still counts as satisfied.
    const satisfied = structurallyPresent && correct !== false;
    if (satisfied) satisfiedCount += 1;
    requirements.push({
      reqId: req.reqId,
      title: req.title,
      structurallyPresent,
      correct,
      satisfied,
      evidence
    });
  }
  return {
    taskId: gold.taskId,
    requirements,
    completionRate: satisfiedCount / requirements.length,
    fullyComplete: satisfiedCount === requirements.length
  };
}
|
|
5062
|
-
function parseCorrectnessResponse(raw) {
|
|
5063
|
-
const match = raw.match(/\{[\s\S]*\}/);
|
|
5064
|
-
if (!match) {
|
|
5065
|
-
throw new Error(`correctness checker: no JSON object in model response: ${raw.slice(0, 200)}`);
|
|
5066
|
-
}
|
|
5067
|
-
const parsed = JSON.parse(match[0]);
|
|
5068
|
-
if (typeof parsed.correct !== "boolean") {
|
|
5069
|
-
throw new Error(`correctness checker: 'correct' is not a boolean in: ${match[0].slice(0, 200)}`);
|
|
5070
|
-
}
|
|
5071
|
-
return { correct: parsed.correct, reason: typeof parsed.reason === "string" ? parsed.reason : "" };
|
|
5072
|
-
}
|
|
5073
|
-
function createLlmCorrectnessChecker(tc, opts = {}) {
|
|
5074
|
-
const model = opts.model ?? "claude-sonnet-4-6";
|
|
5075
|
-
const maxContentChars = opts.maxContentChars ?? 8e3;
|
|
5076
|
-
return async (requirement, content) => {
|
|
5077
|
-
const resp = await tc.chat({
|
|
5078
|
-
model,
|
|
5079
|
-
messages: [
|
|
5080
|
-
{
|
|
5081
|
-
role: "system",
|
|
5082
|
-
content: 'You verify whether a produced work artifact actually fulfils a stated requirement. Judge fulfilment only \u2014 is the deliverable substantively present and on-point \u2014 not polish. A plan to do it later, a vague gesture, or a description of what should be done does NOT fulfil a requirement; the artifact must BE the deliverable. Respond with a single JSON object: {"correct": boolean, "reason": string (<= 30 words)}.'
|
|
5083
|
-
},
|
|
5084
|
-
{
|
|
5085
|
-
role: "user",
|
|
5086
|
-
content: `Requirement: ${requirement.title}
|
|
5087
|
-
${requirement.category ? `Category: ${requirement.category}
|
|
5088
|
-
` : ""}
|
|
5089
|
-
Produced artifact:
|
|
5090
|
-
${content.slice(0, maxContentChars)}`
|
|
5091
|
-
}
|
|
5092
|
-
],
|
|
5093
|
-
temperature: 0,
|
|
5094
|
-
maxTokens: 200
|
|
5095
|
-
});
|
|
5096
|
-
const raw = resp.choices?.[0]?.message?.content ?? "";
|
|
5097
|
-
return parseCorrectnessResponse(raw);
|
|
5098
|
-
};
|
|
5099
|
-
}
|
|
5100
|
-
|
|
5101
2825
|
// src/dual-agent-bench.ts
|
|
5102
2826
|
var DualAgentBench = class {
|
|
5103
2827
|
async run(config) {
|
|
@@ -5752,40 +3476,6 @@ function canonicalInstruction(value) {
|
|
|
5752
3476
|
return normalized.length === 0 ? normalized : normalized[0].toUpperCase() + normalized.slice(1);
|
|
5753
3477
|
}
|
|
5754
3478
|
|
|
5755
|
-
// src/produced-state.ts
|
|
5756
|
-
function artifactKind(mimeType) {
|
|
5757
|
-
if (!mimeType) return "file";
|
|
5758
|
-
if (mimeType.includes("json")) return "json";
|
|
5759
|
-
if (mimeType.startsWith("text/")) return "text";
|
|
5760
|
-
return "file";
|
|
5761
|
-
}
|
|
5762
|
-
function extractProducedState(events) {
|
|
5763
|
-
const artifacts = [];
|
|
5764
|
-
const proposals = [];
|
|
5765
|
-
const toolCalls = [];
|
|
5766
|
-
const seenTools = /* @__PURE__ */ new Set();
|
|
5767
|
-
for (const ev of events) {
|
|
5768
|
-
if (ev.type === "tool_call") {
|
|
5769
|
-
const name = ev.toolName;
|
|
5770
|
-
if (name && !seenTools.has(name)) {
|
|
5771
|
-
seenTools.add(name);
|
|
5772
|
-
toolCalls.push(name);
|
|
5773
|
-
}
|
|
5774
|
-
} else if (ev.type === "artifact") {
|
|
5775
|
-
const a = ev;
|
|
5776
|
-
artifacts.push({
|
|
5777
|
-
kind: artifactKind(a.mimeType),
|
|
5778
|
-
path: a.name ?? a.uri ?? a.artifactId,
|
|
5779
|
-
content: a.content ?? ""
|
|
5780
|
-
});
|
|
5781
|
-
} else if (ev.type === "proposal_created") {
|
|
5782
|
-
const p = ev;
|
|
5783
|
-
proposals.push({ id: p.proposalId, title: p.title, status: p.status ?? "pending" });
|
|
5784
|
-
}
|
|
5785
|
-
}
|
|
5786
|
-
return { artifacts, proposals, toolCalls };
|
|
5787
|
-
}
|
|
5788
|
-
|
|
5789
3479
|
// src/prompt-registry.ts
|
|
5790
3480
|
var PromptRegistry = class {
|
|
5791
3481
|
entries = /* @__PURE__ */ new Map();
|
|
@@ -5885,12 +3575,17 @@ function renderSteeringText(bundle) {
|
|
|
5885
3575
|
([a], [b]) => a.localeCompare(b)
|
|
5886
3576
|
);
|
|
5887
3577
|
for (const [name, prompt] of reviewers) lines.push(`reviewer:${name}:${prompt}`);
|
|
3578
|
+
const roles = Object.entries(bundle.rolePrompts ?? {}).sort(([a], [b]) => a.localeCompare(b));
|
|
3579
|
+
for (const [name, role] of roles) {
|
|
3580
|
+
lines.push(`role:${name}:system:${role.system ?? ""}:append:${role.append ?? ""}`);
|
|
3581
|
+
}
|
|
5888
3582
|
const skills = [...bundle.skills ?? []].sort();
|
|
5889
3583
|
if (skills.length) lines.push(`skills:${skills.join(",")}`);
|
|
5890
3584
|
return lines.join("\n");
|
|
5891
3585
|
}
|
|
5892
3586
|
|
|
5893
3587
|
// src/steering-optimizer.ts
|
|
3588
|
+
import { AxGEPA, ai, ax } from "@ax-llm/ax";
|
|
5894
3589
|
var PairwiseSteeringOptimizer = class {
|
|
5895
3590
|
optimize(rows, config = {}) {
|
|
5896
3591
|
const ranked = rankRows(rows, config.weights);
|
|
@@ -5910,36 +3605,25 @@ var AxGepaSteeringOptimizer = class {
|
|
|
5910
3605
|
config;
|
|
5911
3606
|
async optimize(rows) {
|
|
5912
3607
|
const fallback = new PairwiseSteeringOptimizer().optimize(rows, this.config);
|
|
5913
|
-
const
|
|
3608
|
+
const minScenarioWinners = this.config.minScenarioWinners ?? 6;
|
|
5914
3609
|
const variantIds = [...new Set(rows.map((row) => row.variantId))];
|
|
5915
3610
|
const byScenario = collapseScenarioWinners(rows, this.config.weights);
|
|
5916
|
-
if (variantIds.length < 2 || byScenario.length <
|
|
5917
|
-
return {
|
|
5918
|
-
...fallback,
|
|
5919
|
-
backend: "ax-gepa",
|
|
5920
|
-
skipped: true,
|
|
5921
|
-
rationale: `AxGEPA skipped: need >=2 variants and >=${minRows} scenario winners, got ${variantIds.length} variant(s) and ${byScenario.length} scenario winner(s).`
|
|
5922
|
-
};
|
|
5923
|
-
}
|
|
5924
|
-
let axLib;
|
|
5925
|
-
try {
|
|
5926
|
-
axLib = await import("@ax-llm/ax");
|
|
5927
|
-
} catch {
|
|
3611
|
+
if (variantIds.length < 2 || byScenario.length < minScenarioWinners) {
|
|
5928
3612
|
return {
|
|
5929
3613
|
...fallback,
|
|
5930
3614
|
backend: "ax-gepa",
|
|
5931
3615
|
skipped: true,
|
|
5932
|
-
rationale:
|
|
3616
|
+
rationale: `AxGEPA skipped: need >=2 variants and >=${minScenarioWinners} scenario winners, got ${variantIds.length} variant(s) and ${byScenario.length} scenario winner(s).`
|
|
5933
3617
|
};
|
|
5934
3618
|
}
|
|
5935
|
-
const { ai, ax, AxGEPA } = axLib;
|
|
5936
3619
|
const signature = `task:string, split:string, seedPreview:string -> variantId:class "${variantIds.join(", ")}", rationale:string`;
|
|
5937
3620
|
const selector = ax(signature, {
|
|
5938
3621
|
description: "Choose the best steering bundle variant for an autopilot task."
|
|
5939
3622
|
});
|
|
5940
|
-
const
|
|
5941
|
-
const
|
|
5942
|
-
const
|
|
3623
|
+
const shuffled = seededShuffle(byScenario, signature);
|
|
3624
|
+
const splitIndex = Math.max(1, Math.floor(shuffled.length * 0.8));
|
|
3625
|
+
const train = shuffled.slice(0, splitIndex);
|
|
3626
|
+
const validation = shuffled.slice(splitIndex);
|
|
5943
3627
|
if (!validation.length) {
|
|
5944
3628
|
return {
|
|
5945
3629
|
...fallback,
|
|
@@ -5948,10 +3632,10 @@ var AxGepaSteeringOptimizer = class {
|
|
|
5948
3632
|
rationale: "AxGEPA skipped: no validation examples after split."
|
|
5949
3633
|
};
|
|
5950
3634
|
}
|
|
3635
|
+
const studentAI = createAxService(this.config.provider, this.config.apiKey, this.config.model);
|
|
5951
3636
|
const optimizer = new AxGEPA({
|
|
5952
|
-
studentAI
|
|
3637
|
+
studentAI,
|
|
5953
3638
|
teacherAI: createAxService(
|
|
5954
|
-
ai,
|
|
5955
3639
|
this.config.provider,
|
|
5956
3640
|
this.config.apiKey,
|
|
5957
3641
|
this.config.teacherModel ?? this.config.model
|
|
@@ -5965,7 +3649,7 @@ var AxGepaSteeringOptimizer = class {
|
|
|
5965
3649
|
const compiled = await optimizer.compile(
|
|
5966
3650
|
selector,
|
|
5967
3651
|
train,
|
|
5968
|
-
(
|
|
3652
|
+
(input) => input.prediction?.variantId === input.example?.variantId ? 1 : 0,
|
|
5969
3653
|
{
|
|
5970
3654
|
validationExamples: validation,
|
|
5971
3655
|
maxMetricCalls: 64
|
|
@@ -5974,6 +3658,13 @@ var AxGepaSteeringOptimizer = class {
|
|
|
5974
3658
|
if (compiled.optimizedProgram !== void 0) {
|
|
5975
3659
|
selector.applyOptimization(compiled.optimizedProgram);
|
|
5976
3660
|
}
|
|
3661
|
+
const selectVariant = async (row) => {
|
|
3662
|
+
const prediction = await selector.forward(studentAI, row);
|
|
3663
|
+
return {
|
|
3664
|
+
variantId: String(prediction.variantId),
|
|
3665
|
+
rationale: String(prediction.rationale ?? "")
|
|
3666
|
+
};
|
|
3667
|
+
};
|
|
5977
3668
|
return {
|
|
5978
3669
|
...fallback,
|
|
5979
3670
|
backend: "ax-gepa",
|
|
@@ -5983,7 +3674,8 @@ var AxGepaSteeringOptimizer = class {
|
|
|
5983
3674
|
signature,
|
|
5984
3675
|
labels: variantIds,
|
|
5985
3676
|
rationale: compiled.bestScore !== void 0 ? `bestScore=${compiled.bestScore}` : void 0
|
|
5986
|
-
}
|
|
3677
|
+
},
|
|
3678
|
+
selectVariant
|
|
5987
3679
|
};
|
|
5988
3680
|
}
|
|
5989
3681
|
};
|
|
@@ -6017,13 +3709,39 @@ function collapseScenarioWinners(rows, weights) {
|
|
|
6017
3709
|
};
|
|
6018
3710
|
});
|
|
6019
3711
|
}
|
|
6020
|
-
function createAxService(
|
|
6021
|
-
return
|
|
3712
|
+
function createAxService(provider, apiKey, model) {
|
|
3713
|
+
return ai({
|
|
6022
3714
|
name: provider,
|
|
6023
3715
|
apiKey,
|
|
6024
3716
|
config: { model }
|
|
6025
3717
|
});
|
|
6026
3718
|
}
|
|
3719
|
+
function seededShuffle(items, seed) {
|
|
3720
|
+
const rng = mulberry32(hashString(seed));
|
|
3721
|
+
const out = [...items];
|
|
3722
|
+
for (let i = out.length - 1; i > 0; i--) {
|
|
3723
|
+
const j = Math.floor(rng() * (i + 1));
|
|
3724
|
+
[out[i], out[j]] = [out[j], out[i]];
|
|
3725
|
+
}
|
|
3726
|
+
return out;
|
|
3727
|
+
}
|
|
3728
|
+
function hashString(value) {
|
|
3729
|
+
let h = 2166136261;
|
|
3730
|
+
for (let i = 0; i < value.length; i++) {
|
|
3731
|
+
h ^= value.charCodeAt(i);
|
|
3732
|
+
h = Math.imul(h, 16777619);
|
|
3733
|
+
}
|
|
3734
|
+
return h >>> 0;
|
|
3735
|
+
}
|
|
3736
|
+
function mulberry32(seed) {
|
|
3737
|
+
let a = seed >>> 0;
|
|
3738
|
+
return () => {
|
|
3739
|
+
a = a + 1831565813 | 0;
|
|
3740
|
+
let t = Math.imul(a ^ a >>> 15, 1 | a);
|
|
3741
|
+
t = t + Math.imul(t ^ t >>> 7, 61 | t) ^ t;
|
|
3742
|
+
return ((t ^ t >>> 14) >>> 0) / 4294967296;
|
|
3743
|
+
};
|
|
3744
|
+
}
|
|
6027
3745
|
|
|
6028
3746
|
// src/workspace-inspector.ts
|
|
6029
3747
|
var InMemoryWorkspaceInspector = class {
|
|
@@ -6258,8 +3976,8 @@ function assertNonNegative(n, name) {
|
|
|
6258
3976
|
}
|
|
6259
3977
|
|
|
6260
3978
|
// src/muffled-gate-scanner.ts
|
|
6261
|
-
import { existsSync
|
|
6262
|
-
import { join
|
|
3979
|
+
import { existsSync, readdirSync, readFileSync, statSync } from "fs";
|
|
3980
|
+
import { join } from "path";
|
|
6263
3981
|
function codeOf(line) {
|
|
6264
3982
|
return line.replace(/\/\/.*$/, "").replace(/^\s*\*.*$/, "");
|
|
6265
3983
|
}
|
|
@@ -6371,14 +4089,14 @@ var UNIVERSAL_FINDERS = [findConstructorCwdDropped];
|
|
|
6371
4089
|
function autoDeriveImporters(repoRoot, roots, extensions, importsContain) {
|
|
6372
4090
|
const matches = [];
|
|
6373
4091
|
const walk = (rel) => {
|
|
6374
|
-
const abs =
|
|
6375
|
-
if (!
|
|
6376
|
-
for (const entry of
|
|
6377
|
-
const sub =
|
|
6378
|
-
const subAbs =
|
|
4092
|
+
const abs = join(repoRoot, rel);
|
|
4093
|
+
if (!existsSync(abs)) return;
|
|
4094
|
+
for (const entry of readdirSync(abs)) {
|
|
4095
|
+
const sub = join(rel, entry);
|
|
4096
|
+
const subAbs = join(repoRoot, sub);
|
|
6379
4097
|
let st;
|
|
6380
4098
|
try {
|
|
6381
|
-
st =
|
|
4099
|
+
st = statSync(subAbs);
|
|
6382
4100
|
} catch {
|
|
6383
4101
|
continue;
|
|
6384
4102
|
}
|
|
@@ -6391,7 +4109,7 @@ function autoDeriveImporters(repoRoot, roots, extensions, importsContain) {
|
|
|
6391
4109
|
continue;
|
|
6392
4110
|
let text;
|
|
6393
4111
|
try {
|
|
6394
|
-
text =
|
|
4112
|
+
text = readFileSync(subAbs, "utf8");
|
|
6395
4113
|
} catch {
|
|
6396
4114
|
continue;
|
|
6397
4115
|
}
|
|
@@ -6406,9 +4124,9 @@ function scanForMuffledGates(opts) {
|
|
|
6406
4124
|
const findings = [];
|
|
6407
4125
|
const scanned = /* @__PURE__ */ new Set();
|
|
6408
4126
|
for (const file of opts.scanFiles) {
|
|
6409
|
-
const abs =
|
|
6410
|
-
if (!
|
|
6411
|
-
const text =
|
|
4127
|
+
const abs = join(opts.repoRoot, file);
|
|
4128
|
+
if (!existsSync(abs)) continue;
|
|
4129
|
+
const text = readFileSync(abs, "utf8");
|
|
6412
4130
|
for (const find of opts.finders) findings.push(...find(file, text));
|
|
6413
4131
|
scanned.add(file);
|
|
6414
4132
|
}
|
|
@@ -6421,9 +4139,9 @@ function scanForMuffledGates(opts) {
|
|
|
6421
4139
|
);
|
|
6422
4140
|
for (const file of importers) {
|
|
6423
4141
|
if (scanned.has(file)) continue;
|
|
6424
|
-
const abs =
|
|
6425
|
-
if (!
|
|
6426
|
-
const text =
|
|
4142
|
+
const abs = join(opts.repoRoot, file);
|
|
4143
|
+
if (!existsSync(abs)) continue;
|
|
4144
|
+
const text = readFileSync(abs, "utf8");
|
|
6427
4145
|
for (const find of opts.autoDerive.universalFinders) findings.push(...find(file, text));
|
|
6428
4146
|
}
|
|
6429
4147
|
}
|
|
@@ -6573,8 +4291,8 @@ function isObject(v) {
|
|
|
6573
4291
|
}
|
|
6574
4292
|
|
|
6575
4293
|
// src/scorecard.ts
|
|
6576
|
-
import { appendFileSync
|
|
6577
|
-
import { dirname
|
|
4294
|
+
import { appendFileSync, existsSync as existsSync2, mkdirSync, readFileSync as readFileSync2 } from "fs";
|
|
4295
|
+
import { dirname } from "path";
|
|
6578
4296
|
function median(xs) {
|
|
6579
4297
|
if (xs.length === 0) return 0;
|
|
6580
4298
|
const sorted = [...xs].sort((a, b) => a - b);
|
|
@@ -6639,8 +4357,8 @@ function recordRuns(runs, opts) {
|
|
|
6639
4357
|
}
|
|
6640
4358
|
function appendScorecard(logPath, lines) {
|
|
6641
4359
|
if (lines.length === 0) return;
|
|
6642
|
-
|
|
6643
|
-
|
|
4360
|
+
mkdirSync(dirname(logPath), { recursive: true });
|
|
4361
|
+
appendFileSync(logPath, `${lines.map((line) => JSON.stringify(line)).join("\n")}
|
|
6644
4362
|
`);
|
|
6645
4363
|
}
|
|
6646
4364
|
function recordRunsToScorecard(logPath, runs, opts) {
|
|
@@ -6649,10 +4367,10 @@ function recordRunsToScorecard(logPath, runs, opts) {
|
|
|
6649
4367
|
return lines;
|
|
6650
4368
|
}
|
|
6651
4369
|
function loadScorecard(logPath) {
|
|
6652
|
-
if (!
|
|
4370
|
+
if (!existsSync2(logPath)) return { cells: [], profiles: {} };
|
|
6653
4371
|
const cells = /* @__PURE__ */ new Map();
|
|
6654
4372
|
const profiles = {};
|
|
6655
|
-
for (const raw of
|
|
4373
|
+
for (const raw of readFileSync2(logPath, "utf8").split("\n")) {
|
|
6656
4374
|
const line = raw.trim();
|
|
6657
4375
|
if (!line) continue;
|
|
6658
4376
|
let parsed;
|
|
@@ -6917,9 +4635,9 @@ function statusAdvanced(key, progression) {
|
|
|
6917
4635
|
description: `"${key}" progressed along ${progression.join("\u2192")}`,
|
|
6918
4636
|
score: ({ before, after }) => {
|
|
6919
4637
|
const bi = progression.indexOf(String(before[key]));
|
|
6920
|
-
const
|
|
6921
|
-
if (bi === -1 ||
|
|
6922
|
-
return
|
|
4638
|
+
const ai2 = progression.indexOf(String(after[key]));
|
|
4639
|
+
if (bi === -1 || ai2 === -1) return 0;
|
|
4640
|
+
return ai2 >= bi ? 1 : 0;
|
|
6923
4641
|
}
|
|
6924
4642
|
};
|
|
6925
4643
|
}
|
|
@@ -7529,7 +5247,7 @@ async function commitBisect(options) {
|
|
|
7529
5247
|
}
|
|
7530
5248
|
async function promptBisect(options) {
|
|
7531
5249
|
const split = options.paragraphSplitter ?? ((p) => p.split(/\n\s*\n/));
|
|
7532
|
-
const
|
|
5250
|
+
const join4 = (paragraphs) => paragraphs.join("\n\n");
|
|
7533
5251
|
const goodParas = split(options.good);
|
|
7534
5252
|
const badParas = split(options.bad);
|
|
7535
5253
|
if (goodParas.length !== badParas.length) {
|
|
@@ -7549,7 +5267,7 @@ async function promptBisect(options) {
|
|
|
7549
5267
|
const result = await bisect({
|
|
7550
5268
|
good: goodMask,
|
|
7551
5269
|
bad: badMask,
|
|
7552
|
-
runEval: (mask) => options.runEval(
|
|
5270
|
+
runEval: (mask) => options.runEval(join4(paragraphsFor(mask))),
|
|
7553
5271
|
maxIterations: options.maxIterations ?? n + 5,
|
|
7554
5272
|
halfway: (g, b) => {
|
|
7555
5273
|
for (let i = 0; i < g.length; i++) {
|
|
@@ -7580,12 +5298,12 @@ async function promptBisect(options) {
|
|
|
7580
5298
|
}
|
|
7581
5299
|
}
|
|
7582
5300
|
const materializedPath = result.path.map((s) => ({
|
|
7583
|
-
state:
|
|
5301
|
+
state: join4(paragraphsFor(s.state)),
|
|
7584
5302
|
score: s.score,
|
|
7585
5303
|
pass: s.pass
|
|
7586
5304
|
}));
|
|
7587
5305
|
return {
|
|
7588
|
-
culprit:
|
|
5306
|
+
culprit: join4(paragraphsFor(culprit)),
|
|
7589
5307
|
path: materializedPath,
|
|
7590
5308
|
converged: result.converged,
|
|
7591
5309
|
inputInconsistent: result.inputInconsistent,
|
|
@@ -8079,8 +5797,8 @@ async function runSelfPlay(proposer, scorer, targets, options = {}) {
|
|
|
8079
5797
|
|
|
8080
5798
|
// src/command-runner.ts
|
|
8081
5799
|
import { spawnSync } from "child_process";
|
|
8082
|
-
import { existsSync as
|
|
8083
|
-
import { join as
|
|
5800
|
+
import { existsSync as existsSync3, readdirSync as readdirSync2, readFileSync as readFileSync3, statSync as statSync2 } from "fs";
|
|
5801
|
+
import { join as join2 } from "path";
|
|
8084
5802
|
var localCommandRunner = {
|
|
8085
5803
|
name: "local",
|
|
8086
5804
|
async run(input) {
|
|
@@ -8108,11 +5826,11 @@ var localCommandRunner = {
|
|
|
8108
5826
|
return r.status === 0 && (r.stdout ?? "").trim().length > 0;
|
|
8109
5827
|
},
|
|
8110
5828
|
async fileExists(path) {
|
|
8111
|
-
return
|
|
5829
|
+
return existsSync3(path);
|
|
8112
5830
|
},
|
|
8113
5831
|
async readFile(path) {
|
|
8114
5832
|
try {
|
|
8115
|
-
return
|
|
5833
|
+
return readFileSync3(path, "utf8");
|
|
8116
5834
|
} catch {
|
|
8117
5835
|
return null;
|
|
8118
5836
|
}
|
|
@@ -8120,14 +5838,14 @@ var localCommandRunner = {
|
|
|
8120
5838
|
async readDir(path) {
|
|
8121
5839
|
let entries;
|
|
8122
5840
|
try {
|
|
8123
|
-
entries =
|
|
5841
|
+
entries = readdirSync2(path);
|
|
8124
5842
|
} catch {
|
|
8125
5843
|
return [];
|
|
8126
5844
|
}
|
|
8127
5845
|
const out = [];
|
|
8128
5846
|
for (const name of entries) {
|
|
8129
5847
|
try {
|
|
8130
|
-
const st =
|
|
5848
|
+
const st = statSync2(join2(path, name));
|
|
8131
5849
|
out.push({
|
|
8132
5850
|
name,
|
|
8133
5851
|
isDirectory: st.isDirectory(),
|
|
@@ -8466,11 +6184,11 @@ function flowLayer(input) {
|
|
|
8466
6184
|
|
|
8467
6185
|
// src/intent-match-judge.ts
|
|
8468
6186
|
var INTENT_MATCH_JUDGE_VERSION = "intent-match-judge-v1-2026-04-24";
|
|
8469
|
-
var
|
|
8470
|
-
var
|
|
8471
|
-
var
|
|
8472
|
-
var
|
|
8473
|
-
var
|
|
6187
|
+
var DEFAULT_MODEL = "claude-sonnet-4-6";
|
|
6188
|
+
var DEFAULT_TIMEOUT = 9e4;
|
|
6189
|
+
var DEFAULT_MAX_SOURCE = 25e3;
|
|
6190
|
+
var DEFAULT_MAX_PER_FILE = 12e3;
|
|
6191
|
+
var DEFAULT_MAX_HTML = 2e4;
|
|
8474
6192
|
var INTENT_SCHEMA = {
|
|
8475
6193
|
type: "object",
|
|
8476
6194
|
additionalProperties: false,
|
|
@@ -8480,12 +6198,12 @@ var INTENT_SCHEMA = {
|
|
|
8480
6198
|
evidence: { type: "string", minLength: 10, maxLength: 400 }
|
|
8481
6199
|
}
|
|
8482
6200
|
};
|
|
8483
|
-
function
|
|
6201
|
+
function truncate(body, cap, label) {
|
|
8484
6202
|
if (body.length <= cap) return body;
|
|
8485
6203
|
return `${body.slice(0, cap)}
|
|
8486
6204
|
\u2026 [truncated ${body.length - cap} chars of ${label}]`;
|
|
8487
6205
|
}
|
|
8488
|
-
function
|
|
6206
|
+
function buildPrompt(input, opts) {
|
|
8489
6207
|
const sourceBlob = input.sourceFiles.filter((f) => f.content.length <= opts.maxPerFileChars).map((f) => `--- FILE: ${f.path} ---
|
|
8490
6208
|
${f.content}`).join("\n\n");
|
|
8491
6209
|
const html = input.servedHtml ?? "";
|
|
@@ -8504,10 +6222,10 @@ ${input.artifactLabel ? `ARTIFACT METADATA:
|
|
|
8504
6222
|
description: ${input.artifactDescription ?? ""}
|
|
8505
6223
|
|
|
8506
6224
|
` : ""}${html ? `SERVED HTML (what the preview returns):
|
|
8507
|
-
${
|
|
6225
|
+
${truncate(html, opts.maxHtmlChars, "HTML")}
|
|
8508
6226
|
|
|
8509
6227
|
` : ""}SOURCE FILES (the agent's workdir):
|
|
8510
|
-
${
|
|
6228
|
+
${truncate(sourceBlob, opts.maxSourceChars, "source")}
|
|
8511
6229
|
|
|
8512
6230
|
Score 0\u20131:
|
|
8513
6231
|
1.0 \u2014 unmistakably the right app. Even with bugs, gaps, or missing
|
|
@@ -8535,11 +6253,11 @@ Return STRICT JSON. No prose outside.`;
|
|
|
8535
6253
|
async function runIntentMatchJudge(input, options = {}) {
|
|
8536
6254
|
const start = Date.now();
|
|
8537
6255
|
const opts = {
|
|
8538
|
-
model: options.model ??
|
|
8539
|
-
timeoutMs: options.timeoutMs ??
|
|
8540
|
-
maxSourceChars: options.maxSourceChars ??
|
|
8541
|
-
maxPerFileChars: options.maxPerFileChars ??
|
|
8542
|
-
maxHtmlChars: options.maxHtmlChars ??
|
|
6256
|
+
model: options.model ?? DEFAULT_MODEL,
|
|
6257
|
+
timeoutMs: options.timeoutMs ?? DEFAULT_TIMEOUT,
|
|
6258
|
+
maxSourceChars: options.maxSourceChars ?? DEFAULT_MAX_SOURCE,
|
|
6259
|
+
maxPerFileChars: options.maxPerFileChars ?? DEFAULT_MAX_PER_FILE,
|
|
6260
|
+
maxHtmlChars: options.maxHtmlChars ?? DEFAULT_MAX_HTML,
|
|
8543
6261
|
llm: options.llm ?? {}
|
|
8544
6262
|
};
|
|
8545
6263
|
if (input.sourceFiles.length === 0 && !input.servedHtml) {
|
|
@@ -8563,7 +6281,7 @@ async function runIntentMatchJudge(input, options = {}) {
|
|
|
8563
6281
|
role: "system",
|
|
8564
6282
|
content: "You are a holistic code reviewer answering one question: did the agent build the right app for the user. Return strict JSON. No prose outside."
|
|
8565
6283
|
},
|
|
8566
|
-
{ role: "user", content:
|
|
6284
|
+
{ role: "user", content: buildPrompt(input, opts) }
|
|
8567
6285
|
],
|
|
8568
6286
|
jsonSchema: { name: "intent_match_judge", schema: INTENT_SCHEMA },
|
|
8569
6287
|
temperature: 0,
|
|
@@ -9045,8 +6763,8 @@ function multiToolchainLayer(config) {
|
|
|
9045
6763
|
}
|
|
9046
6764
|
|
|
9047
6765
|
// src/reference-replay.ts
|
|
9048
|
-
import { appendFileSync as
|
|
9049
|
-
import { dirname as
|
|
6766
|
+
import { appendFileSync as appendFileSync2, existsSync as existsSync4, mkdirSync as mkdirSync2, readFileSync as readFileSync4 } from "fs";
|
|
6767
|
+
import { dirname as dirname2 } from "path";
|
|
9050
6768
|
var DEFAULT_MATCH_THRESHOLD = 0.55;
|
|
9051
6769
|
var ALL_SPLITS = ["train", "dev", "test", "holdout"];
|
|
9052
6770
|
async function runReferenceReplay(cases, options) {
|
|
@@ -9164,14 +6882,14 @@ function jsonlReferenceReplayStore(path) {
|
|
|
9164
6882
|
return {
|
|
9165
6883
|
async save(run) {
|
|
9166
6884
|
await lock.runExclusive(() => {
|
|
9167
|
-
|
|
9168
|
-
|
|
6885
|
+
mkdirSync2(dirname2(path), { recursive: true });
|
|
6886
|
+
appendFileSync2(path, `${JSON.stringify(run)}
|
|
9169
6887
|
`);
|
|
9170
6888
|
});
|
|
9171
6889
|
},
|
|
9172
6890
|
async list() {
|
|
9173
6891
|
return lock.runExclusive(() => {
|
|
9174
|
-
if (!
|
|
6892
|
+
if (!existsSync4(path)) return [];
|
|
9175
6893
|
return readJsonl(path);
|
|
9176
6894
|
});
|
|
9177
6895
|
}
|
|
@@ -9464,8 +7182,8 @@ function ratio(numerator, denominator) {
|
|
|
9464
7182
|
return denominator > 0 ? numerator / denominator : 0;
|
|
9465
7183
|
}
|
|
9466
7184
|
function tokenJaccard(a, b) {
|
|
9467
|
-
const left = new Set(
|
|
9468
|
-
const right = new Set(
|
|
7185
|
+
const left = new Set(tokens(a));
|
|
7186
|
+
const right = new Set(tokens(b));
|
|
9469
7187
|
if (left.size === 0 || right.size === 0) return 0;
|
|
9470
7188
|
let intersection = 0;
|
|
9471
7189
|
for (const token of left) {
|
|
@@ -9483,7 +7201,7 @@ function tagOverlap(a, b) {
|
|
|
9483
7201
|
}
|
|
9484
7202
|
return intersection / Math.max(left.size, right.size);
|
|
9485
7203
|
}
|
|
9486
|
-
function
|
|
7204
|
+
function tokens(text) {
|
|
9487
7205
|
return normalize(text).split(/\s+/).filter((token) => token.length >= 3 && !STOP_WORDS.has(token));
|
|
9488
7206
|
}
|
|
9489
7207
|
function normalize(text) {
|
|
@@ -9514,7 +7232,7 @@ function throwIfAborted(signal) {
|
|
|
9514
7232
|
throw new Error(signal.reason ? String(signal.reason) : "reference replay aborted");
|
|
9515
7233
|
}
|
|
9516
7234
|
function readJsonl(path) {
|
|
9517
|
-
const raw =
|
|
7235
|
+
const raw = readFileSync4(path, "utf8");
|
|
9518
7236
|
const out = [];
|
|
9519
7237
|
for (const line of raw.split("\n")) {
|
|
9520
7238
|
const trimmed = line.trim();
|
|
@@ -9671,7 +7389,7 @@ function createDefaultReviewer(options) {
|
|
|
9671
7389
|
|
|
9672
7390
|
// src/discover-personas.ts
|
|
9673
7391
|
import { promises as fs } from "fs";
|
|
9674
|
-
import { basename, extname, join as
|
|
7392
|
+
import { basename, extname, join as join3 } from "path";
|
|
9675
7393
|
var DEFAULT_PATTERN = /^\d{2}-.+\.(yaml|yml|json|md)$/;
|
|
9676
7394
|
async function discoverPersonas(dir, opts = {}) {
|
|
9677
7395
|
const pattern = opts.pattern ?? DEFAULT_PATTERN;
|
|
@@ -9689,7 +7407,7 @@ async function discoverPersonas(dir, opts = {}) {
|
|
|
9689
7407
|
}
|
|
9690
7408
|
const out = [];
|
|
9691
7409
|
for (const entry of entries) {
|
|
9692
|
-
const full =
|
|
7410
|
+
const full = join3(d, entry.name);
|
|
9693
7411
|
if (entry.isDir) {
|
|
9694
7412
|
if (opts.recursive) out.push(...await walk(full));
|
|
9695
7413
|
continue;
|
|
@@ -10516,9 +8234,9 @@ function jaccard(a, b) {
|
|
|
10516
8234
|
}
|
|
10517
8235
|
|
|
10518
8236
|
// src/campaign/distillation/gold-scenarios.ts
|
|
10519
|
-
import { readFileSync as
|
|
8237
|
+
import { readFileSync as readFileSync5 } from "fs";
|
|
10520
8238
|
function loadGoldScenarios(jsonlPath) {
|
|
10521
|
-
const text =
|
|
8239
|
+
const text = readFileSync5(jsonlPath, "utf8");
|
|
10522
8240
|
return parseGoldJsonl(text, jsonlPath);
|
|
10523
8241
|
}
|
|
10524
8242
|
function parseGoldJsonl(text, sourceLabel = "<inline>") {
|
|
@@ -10863,7 +8581,6 @@ function sectionHash(section) {
|
|
|
10863
8581
|
}
|
|
10864
8582
|
export {
|
|
10865
8583
|
AGENT_PROFILE_KINDS,
|
|
10866
|
-
ANALYST_SEVERITIES,
|
|
10867
8584
|
AgentDriver,
|
|
10868
8585
|
AgentEvalError,
|
|
10869
8586
|
AgentProfileCellValidationError,
|
|
@@ -10902,13 +8619,10 @@ export {
|
|
|
10902
8619
|
ExperimentTracker,
|
|
10903
8620
|
FAILURE_CLASSES,
|
|
10904
8621
|
FAILURE_MODE_KIND_SPEC,
|
|
10905
|
-
FINDING_SUBJECT_GRAMMAR_PROMPT,
|
|
10906
|
-
FINDING_SUBJECT_KINDS,
|
|
10907
8622
|
FileSystemExperimentStore,
|
|
10908
8623
|
FileSystemFeedbackTrajectoryStore,
|
|
10909
8624
|
FileSystemRawProviderSink,
|
|
10910
8625
|
FileSystemTraceStore,
|
|
10911
|
-
FindingSubjectStringSchema,
|
|
10912
8626
|
FindingsStore,
|
|
10913
8627
|
HeldOutGate,
|
|
10914
8628
|
HoldoutAuditor,
|
|
@@ -10922,7 +8636,6 @@ export {
|
|
|
10922
8636
|
InMemoryWorkspaceInspector,
|
|
10923
8637
|
JudgeError,
|
|
10924
8638
|
JudgeRunner,
|
|
10925
|
-
KIND_EXPECTED_SUBJECTS,
|
|
10926
8639
|
KNOWLEDGE_GAP_KIND_SPEC,
|
|
10927
8640
|
KNOWLEDGE_POISONING_KIND_SPEC,
|
|
10928
8641
|
LlmCallError,
|
|
@@ -10941,10 +8654,8 @@ export {
|
|
|
10941
8654
|
PairwiseSteeringOptimizer,
|
|
10942
8655
|
ProductClient,
|
|
10943
8656
|
PromptRegistry,
|
|
10944
|
-
RAW_FINDING_SCHEMA_PROMPT,
|
|
10945
8657
|
REDACTION_VERSION,
|
|
10946
8658
|
RESEARCH_REPORT_HARD_PAIR_FLOOR,
|
|
10947
|
-
RawAnalystFindingSchema,
|
|
10948
8659
|
ReplayCache,
|
|
10949
8660
|
ReplayCacheMissError,
|
|
10950
8661
|
ReplayError,
|
|
@@ -10985,6 +8696,8 @@ export {
|
|
|
10985
8696
|
analyzeTraces,
|
|
10986
8697
|
appendScorecard,
|
|
10987
8698
|
argHash,
|
|
8699
|
+
asNumber,
|
|
8700
|
+
asString,
|
|
10988
8701
|
assertCrossFamily,
|
|
10989
8702
|
assertLlmRoute,
|
|
10990
8703
|
assertRealBackend,
|
|
@@ -11004,15 +8717,14 @@ export {
|
|
|
11004
8717
|
bootstrapCi,
|
|
11005
8718
|
buildAgentProfileCell,
|
|
11006
8719
|
buildAgreementJudge,
|
|
8720
|
+
buildDefaultAnalystRegistry,
|
|
11007
8721
|
buildDriverSystemPrompt,
|
|
11008
8722
|
buildReflectionPrompt,
|
|
11009
8723
|
buildReviewerPrompt,
|
|
11010
8724
|
buildSandboxAgentProfileCell,
|
|
11011
|
-
buildSkillUsageReport,
|
|
11012
8725
|
buildTraceAnalystTools,
|
|
11013
8726
|
buildTraceInsightContext,
|
|
11014
8727
|
buildTraceInsightPrompt,
|
|
11015
|
-
buildTraceToolsForGroup,
|
|
11016
8728
|
buildTrajectory,
|
|
11017
8729
|
byteLengthRange,
|
|
11018
8730
|
calibrateJudge,
|
|
@@ -11043,6 +8755,7 @@ export {
|
|
|
11043
8755
|
composeValidators,
|
|
11044
8756
|
computeFindingId,
|
|
11045
8757
|
computeToolUseMetrics,
|
|
8758
|
+
computeTraceMetrics,
|
|
11046
8759
|
confidenceInterval,
|
|
11047
8760
|
containsAll,
|
|
11048
8761
|
continuousAgreement,
|
|
@@ -11051,26 +8764,21 @@ export {
|
|
|
11051
8764
|
controlRunToRunRecord,
|
|
11052
8765
|
corpusInterRaterAgreement,
|
|
11053
8766
|
corpusInterRaterAgreementFromJudgeScores,
|
|
8767
|
+
createAnalystAi,
|
|
11054
8768
|
createAntiSlopJudge,
|
|
11055
|
-
createChatClient,
|
|
11056
8769
|
createCustomJudge,
|
|
11057
8770
|
createDefaultReviewer,
|
|
11058
8771
|
createDomainExpertJudge,
|
|
11059
8772
|
createFeedbackTrajectory,
|
|
11060
8773
|
createIntentMatchJudge,
|
|
11061
|
-
createJudgeAdapter,
|
|
11062
8774
|
createLlmCorrectnessChecker,
|
|
11063
8775
|
createLlmReviewer,
|
|
11064
8776
|
createOtelExporter,
|
|
11065
8777
|
createOtelTracingStore,
|
|
11066
8778
|
createReplayFetch,
|
|
11067
|
-
createRunCriticAdapter,
|
|
11068
8779
|
createSandboxPool,
|
|
11069
8780
|
createSemanticConceptJudge,
|
|
11070
|
-
createSemanticConceptJudgeAdapter,
|
|
11071
|
-
createTraceAnalystAdapter,
|
|
11072
8781
|
createTraceAnalystKind,
|
|
11073
|
-
createVerifierAdapter,
|
|
11074
8782
|
crossTraceDiff,
|
|
11075
8783
|
crowdingDistance,
|
|
11076
8784
|
decideNextUserTurn,
|
|
@@ -11091,7 +8799,6 @@ export {
|
|
|
11091
8799
|
distillPlaybook,
|
|
11092
8800
|
domainEvidencePattern,
|
|
11093
8801
|
dominates,
|
|
11094
|
-
emitSkillUsageFindings,
|
|
11095
8802
|
estimateCost,
|
|
11096
8803
|
estimateTokens,
|
|
11097
8804
|
euAiActReport,
|
|
@@ -11107,6 +8814,7 @@ export {
|
|
|
11107
8814
|
exportRunAsOtlp,
|
|
11108
8815
|
extractAssetUrls,
|
|
11109
8816
|
extractErrorCount,
|
|
8817
|
+
extractOtlpAttributes,
|
|
11110
8818
|
extractProducedState,
|
|
11111
8819
|
feedbackTrajectoriesToDatasetScenarios,
|
|
11112
8820
|
feedbackTrajectoriesToOptimizerRows,
|
|
@@ -11120,6 +8828,8 @@ export {
|
|
|
11120
8828
|
findFallbackToPass,
|
|
11121
8829
|
findLiteralTruePass,
|
|
11122
8830
|
findSkipCountsAsPass,
|
|
8831
|
+
firstNumberAttr,
|
|
8832
|
+
firstStringAttr,
|
|
11123
8833
|
flattenOtlpExportToNdjson,
|
|
11124
8834
|
flowLayer,
|
|
11125
8835
|
formatBenchmarkReport,
|
|
@@ -11140,6 +8850,7 @@ export {
|
|
|
11140
8850
|
inMemoryReferenceReplayStore,
|
|
11141
8851
|
inMemoryReviewStore,
|
|
11142
8852
|
inferDomainKeywords,
|
|
8853
|
+
inferOtlpKind,
|
|
11143
8854
|
interRaterReliability,
|
|
11144
8855
|
interpretCliffs,
|
|
11145
8856
|
iqr,
|
|
@@ -11163,7 +8874,6 @@ export {
|
|
|
11163
8874
|
judgeSpans,
|
|
11164
8875
|
keyPreserved,
|
|
11165
8876
|
knowledgeReadinessTracePayload,
|
|
11166
|
-
liftSeverity,
|
|
11167
8877
|
linterJudge,
|
|
11168
8878
|
llmSpanFromProvider,
|
|
11169
8879
|
llmSpans,
|
|
@@ -11183,6 +8893,8 @@ export {
|
|
|
11183
8893
|
notBlocked,
|
|
11184
8894
|
objectiveEval,
|
|
11185
8895
|
otelRunCompleteHook,
|
|
8896
|
+
otlpToRunRecords,
|
|
8897
|
+
otlpToTraceRunRecords,
|
|
11186
8898
|
pairedBootstrap,
|
|
11187
8899
|
pairedEvalueSequence,
|
|
11188
8900
|
pairedMde,
|
|
@@ -11194,9 +8906,7 @@ export {
|
|
|
11194
8906
|
paretoFrontierWithCrowding,
|
|
11195
8907
|
parseCorrectnessResponse,
|
|
11196
8908
|
parseFeedbackTrajectoriesJsonl,
|
|
11197
|
-
parseFindingSubject,
|
|
11198
8909
|
parseGoldJsonl,
|
|
11199
|
-
parseRawFinding,
|
|
11200
8910
|
parseReflectionResponse,
|
|
11201
8911
|
parseRunRecordSafe,
|
|
11202
8912
|
partialCredit,
|
|
@@ -11208,11 +8918,13 @@ export {
|
|
|
11208
8918
|
printDriverSummary,
|
|
11209
8919
|
probeLlm,
|
|
11210
8920
|
profile_exports as profile,
|
|
8921
|
+
projectOtlpFlatLine,
|
|
11211
8922
|
promptBisect,
|
|
11212
8923
|
proposeAutomatedPullRequest,
|
|
11213
8924
|
proposeSynthesisTargets,
|
|
11214
8925
|
providerFromBaseUrl,
|
|
11215
8926
|
pytestTestParser,
|
|
8927
|
+
readOtlpStatus,
|
|
11216
8928
|
recordRuns,
|
|
11217
8929
|
recordRunsToScorecard,
|
|
11218
8930
|
redTeamDataset,
|
|
@@ -11223,7 +8935,6 @@ export {
|
|
|
11223
8935
|
referenceReplayScenarioToRunScore,
|
|
11224
8936
|
regexMatch,
|
|
11225
8937
|
regexMatches,
|
|
11226
|
-
renderFindingSubject,
|
|
11227
8938
|
renderMarkdown,
|
|
11228
8939
|
renderMarkdownReport,
|
|
11229
8940
|
renderPlaybookMarkdown,
|
|
@@ -11287,6 +8998,7 @@ export {
|
|
|
11287
8998
|
statusAdvanced,
|
|
11288
8999
|
stopOnNoProgress,
|
|
11289
9000
|
stopOnRepeatedAction,
|
|
9001
|
+
stringField,
|
|
11290
9002
|
stripFencedJson,
|
|
11291
9003
|
subjectiveEval,
|
|
11292
9004
|
summarize,
|