@tangle-network/agent-eval 0.72.0 → 0.72.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (69)
  1. package/CHANGELOG.md +39 -0
  2. package/dist/adapters/http.d.ts +1 -1
  3. package/dist/adapters/langchain.d.ts +1 -1
  4. package/dist/adapters/otel.d.ts +3 -2
  5. package/dist/agent-profile-DYRboYWu.d.ts +364 -0
  6. package/dist/analyst/index.d.ts +221 -0
  7. package/dist/analyst/index.js +371 -0
  8. package/dist/analyst/index.js.map +1 -0
  9. package/dist/analyst-t7zZS3TV.d.ts +88 -0
  10. package/dist/campaign/index.d.ts +518 -9
  11. package/dist/campaign/index.js +672 -22
  12. package/dist/campaign/index.js.map +1 -1
  13. package/dist/chunk-7W4SM7FD.js +1075 -0
  14. package/dist/chunk-7W4SM7FD.js.map +1 -0
  15. package/dist/{chunk-AIWHLG7J.js → chunk-GJJNJVIR.js} +11 -11
  16. package/dist/chunk-JHA3ZGSO.js +1496 -0
  17. package/dist/chunk-JHA3ZGSO.js.map +1 -0
  18. package/dist/{chunk-4QJN7RDX.js → chunk-JYE3WOTE.js} +55 -7
  19. package/dist/{chunk-4QJN7RDX.js.map → chunk-JYE3WOTE.js.map} +1 -1
  20. package/dist/chunk-LB2UOI5F.js +412 -0
  21. package/dist/chunk-LB2UOI5F.js.map +1 -0
  22. package/dist/{chunk-ODGETRTM.js → chunk-VUINJM5M.js} +234 -1415
  23. package/dist/chunk-VUINJM5M.js.map +1 -0
  24. package/dist/chunk-WYIHD6EB.js +1044 -0
  25. package/dist/chunk-WYIHD6EB.js.map +1 -0
  26. package/dist/{chunk-UD6EF73X.js → chunk-XPILG2CA.js} +119 -2
  27. package/dist/chunk-XPILG2CA.js.map +1 -0
  28. package/dist/contract/index.d.ts +17 -13
  29. package/dist/contract/index.js +13 -7
  30. package/dist/contract/index.js.map +1 -1
  31. package/dist/{control-DxvZeV5X.d.ts → control-BgA6BYTm.d.ts} +1 -1
  32. package/dist/control.d.ts +2 -2
  33. package/dist/{feedback-trajectory-8hKC5EOb.d.ts → feedback-trajectory-B3rErRsh.d.ts} +1 -1
  34. package/dist/harness-optimizer-EnEnQPsr.d.ts +106 -0
  35. package/dist/hosted/index.d.ts +223 -2
  36. package/dist/index.d.ts +49 -1323
  37. package/dist/index.js +353 -2496
  38. package/dist/index.js.map +1 -1
  39. package/dist/{index-BGBrVS24.d.ts → insight-report-Df3lxYXM.d.ts} +1 -221
  40. package/dist/kind-factory-DW9XWPvM.d.ts +172 -0
  41. package/dist/multi-layer-verifier-DlWCXuxL.d.ts +141 -0
  42. package/dist/openapi.json +1 -1
  43. package/dist/pareto-E-pembql.d.ts +81 -0
  44. package/dist/{provenance-C69gLUXH.d.ts → provenance-B-TFszPW.d.ts} +131 -4
  45. package/dist/redact-B40YG2M_.d.ts +45 -0
  46. package/dist/registry-DuVYiTvw.d.ts +128 -0
  47. package/dist/{researcher-WJvIpX3L.d.ts → researcher-C_KJyIGg.d.ts} +1 -141
  48. package/dist/rl.d.ts +4 -3
  49. package/dist/rl.js +4 -4
  50. package/dist/run-critic-BAIjX99r.d.ts +56 -0
  51. package/dist/{run-improvement-loop-Bzamo6GB.d.ts → run-improvement-loop-BqYH2vCR.d.ts} +25 -1
  52. package/dist/semantic-concept-judge-CV9Wlx4t.d.ts +650 -0
  53. package/dist/{store-jzKpMl16.d.ts → store-GmBE2pZZ.d.ts} +1 -1
  54. package/dist/traces.d.ts +371 -308
  55. package/dist/traces.js +43 -18
  56. package/dist/{types-CnmZ2bkP.d.ts → types-Bba0vl1V.d.ts} +1 -1
  57. package/dist/{registry-BGKyX6bw.d.ts → types-CRD68aH7.d.ts} +3 -128
  58. package/dist/wire/index.d.ts +1 -1
  59. package/dist/workflow/index.d.ts +494 -0
  60. package/dist/workflow/index.js +2177 -0
  61. package/dist/workflow/index.js.map +1 -0
  62. package/docs/design/self-improvement-roadmap.md +106 -0
  63. package/package.json +36 -12
  64. package/dist/agent-profile-DzcPHR1Z.d.ts +0 -114
  65. package/dist/chunk-ODGETRTM.js.map +0 -1
  66. package/dist/chunk-SL55X4VN.js +0 -186
  67. package/dist/chunk-SL55X4VN.js.map +0 -1
  68. package/dist/chunk-UD6EF73X.js.map +0 -1
  69. package/dist/{chunk-AIWHLG7J.js.map → chunk-GJJNJVIR.js.map} +0 -0
package/dist/index.js CHANGED
@@ -3,11 +3,15 @@ import {
3
3
  MetricsCollector,
4
4
  TokenCounter,
5
5
  agentProfileHash,
6
+ createLlmCorrectnessChecker,
6
7
  estimateCost,
7
8
  estimateTokens,
9
+ extractProducedState,
8
10
  isModelPriced,
9
- resolveModelPricing
10
- } from "./chunk-SL55X4VN.js";
11
+ parseCorrectnessResponse,
12
+ resolveModelPricing,
13
+ verifyCompletion
14
+ } from "./chunk-LB2UOI5F.js";
11
15
  import {
12
16
  HoldoutAuditor,
13
17
  canaryLeakView,
@@ -38,7 +42,7 @@ import {
38
42
  scoreRedTeamOutput,
39
43
  surfaceContentHash,
40
44
  toolNamesForRun
41
- } from "./chunk-4QJN7RDX.js";
45
+ } from "./chunk-JYE3WOTE.js";
42
46
  import {
43
47
  BackendIntegrityError,
44
48
  assertRealBackend,
@@ -89,6 +93,40 @@ import {
89
93
  scoreKnowledgeReadiness,
90
94
  userQuestionsForKnowledgeGaps
91
95
  } from "./chunk-3CKU6VGU.js";
96
+ import {
97
+ DEFAULT_COMPLEXITY_WEIGHTS,
98
+ DEFAULT_RUN_SCORE_WEIGHTS,
99
+ FindingsStore,
100
+ LockedJsonlAppender,
101
+ Mutex,
102
+ RunCritic,
103
+ SEMANTIC_CONCEPT_JUDGE_VERSION,
104
+ SKILL_USAGE_ANALYST,
105
+ SkillUsageAnalyst,
106
+ aggregateRunScore,
107
+ buildDefaultAnalystRegistry,
108
+ clamp01,
109
+ computeTraceMetrics,
110
+ createAnalystAi,
111
+ createChatClient,
112
+ createSemanticConceptJudge,
113
+ defaultIsMaterial,
114
+ diffFindings,
115
+ resetLockedAppendersForTesting,
116
+ runSemanticConceptJudge
117
+ } from "./chunk-7W4SM7FD.js";
118
+ import {
119
+ AnalystRegistry,
120
+ DEFAULT_TRACE_ANALYST_KINDS,
121
+ FAILURE_MODE_KIND_SPEC,
122
+ IMPROVEMENT_KIND_SPEC,
123
+ KNOWLEDGE_GAP_KIND_SPEC,
124
+ KNOWLEDGE_POISONING_KIND_SPEC,
125
+ computeFindingId,
126
+ createTraceAnalystKind,
127
+ makeFinding,
128
+ renderPriorFindings
129
+ } from "./chunk-WYIHD6EB.js";
92
130
  import {
93
131
  controlFailureClassFromVerification,
94
132
  controlRunToRunRecord,
@@ -117,142 +155,7 @@ import {
117
155
  } from "./chunk-B26KI423.js";
118
156
  import {
119
157
  runEvalCampaign
120
- } from "./chunk-AIWHLG7J.js";
121
- import {
122
- AGENT_PROFILE_KINDS,
123
- AgentProfileCellValidationError,
124
- RunRecordValidationError,
125
- agentProfileCellHashMaterial,
126
- agentProfileCellKey,
127
- assertRunAgentProfileCell,
128
- buildAgentProfileCell,
129
- buildSandboxAgentProfileCell,
130
- groupRunsByAgentProfileCell,
131
- isRunRecord,
132
- parseRunRecordSafe,
133
- requireAgentProfileCell,
134
- roundTripRunRecord,
135
- toAgentProfileJson,
136
- validateAgentProfileCell,
137
- validateRunRecord,
138
- verifyAgentProfileCell
139
- } from "./chunk-F3SRAAZO.js";
140
- import {
141
- evaluateInterimReleaseConfidence,
142
- pairedEvalueSequence
143
- } from "./chunk-MAZ26DC7.js";
144
- import {
145
- RESEARCH_REPORT_HARD_PAIR_FLOOR,
146
- gainHistogram,
147
- paretoChart,
148
- researchReport,
149
- summaryTable
150
- } from "./chunk-KX6F6NCG.js";
151
- import {
152
- benjaminiHochberg,
153
- bonferroni,
154
- calibrateJudge,
155
- calibrateJudgeContinuous,
156
- cliffsDelta,
157
- cohensD,
158
- confidenceInterval,
159
- continuousAgreement,
160
- corpusInterRaterAgreement,
161
- corpusInterRaterAgreementFromJudgeScores,
162
- interRaterReliability,
163
- interpretCliffs,
164
- mannWhitneyU,
165
- normalizeScores,
166
- pairedBootstrap,
167
- pairedMde,
168
- pairedTTest,
169
- partialCredit,
170
- positionalBias,
171
- requiredSampleSize,
172
- selfPreference,
173
- verbosityBias,
174
- weightedComposite,
175
- weightedMean,
176
- wilcoxonSignedRank
177
- } from "./chunk-ITBRCT73.js";
178
- import {
179
- DEFAULT_TRACE_ANALYST_BUDGETS,
180
- FileSystemTraceStore,
181
- InMemoryTraceStore,
182
- OTEL_AGENT_EVAL_SCOPE,
183
- OtlpFileTraceStore,
184
- ReplayCache,
185
- ReplayCacheMissError,
186
- SpanNotFoundError,
187
- TRACE_ANALYST_ACTOR_DESCRIPTION,
188
- TRACE_ANALYST_ACTOR_DESCRIPTION_VERSION,
189
- TRACE_ANALYST_SUBAGENT_DESCRIPTION,
190
- TRACE_ANALYST_TRUNCATION_MARKER_PREFIX,
191
- TraceFileMissingError,
192
- TraceNotFoundError,
193
- analyzeTraces,
194
- buildTraceAnalystTools,
195
- buildTraceInsightContext,
196
- buildTraceInsightPrompt,
197
- captureFetchToRawSink,
198
- createOtelExporter,
199
- createOtelTracingStore,
200
- createReplayFetch,
201
- defaultTraceInsightPanel,
202
- describeTraceInsightScope,
203
- domainEvidencePattern,
204
- exportRunAsOtlp,
205
- flattenOtlpExportToNdjson,
206
- inferDomainKeywords,
207
- iterateRawCalls,
208
- otelRunCompleteHook,
209
- planTraceInsightQuestions,
210
- scoreTraceInsightReadiness,
211
- tokenizeDomainWords,
212
- traceAnalystFunctionGroup,
213
- traceAnalystOnRunComplete
214
- } from "./chunk-ODGETRTM.js";
215
- import {
216
- DEFAULT_REDACTION_RULES,
217
- REDACTION_VERSION,
218
- redactString,
219
- redactValue
220
- } from "./chunk-GGE4NNQT.js";
221
- import {
222
- aggregateLlm,
223
- argHash,
224
- groupBy,
225
- judgeSpans,
226
- llmSpans,
227
- runFailureClass,
228
- runsForScenario,
229
- toolSpans
230
- } from "./chunk-47X6LRCE.js";
231
- import {
232
- FAILURE_CLASSES,
233
- TRACE_SCHEMA_VERSION,
234
- isJudgeSpan,
235
- isLlmSpan,
236
- isRetrievalSpan,
237
- isSandboxSpan,
238
- isToolSpan
239
- } from "./chunk-5BKGXME7.js";
240
- import {
241
- RunIntegrityError,
242
- assertRunCaptured,
243
- throwIfRunIncomplete
244
- } from "./chunk-SBCB6VZY.js";
245
- import {
246
- TraceEmitter,
247
- llmSpanFromProvider
248
- } from "./chunk-TVVP3ZZQ.js";
249
- import {
250
- canonicalize,
251
- evaluateHypothesis,
252
- hashJson,
253
- signManifest,
254
- verifyManifest
255
- } from "./chunk-VSMTAMNK.js";
158
+ } from "./chunk-GJJNJVIR.js";
256
159
  import {
257
160
  LlmCallError,
258
161
  LlmClient,
@@ -266,2019 +169,173 @@ import {
266
169
  stripFencedJson
267
170
  } from "./chunk-IHDHUN2X.js";
268
171
  import {
269
- FileSystemRawProviderSink,
270
- InMemoryRawProviderSink,
271
- NoopRawProviderSink,
272
- defaultProviderRedactor,
273
- providerFromBaseUrl
274
- } from "./chunk-PC4UYEBM.js";
275
- import {
276
- AgentEvalError,
277
- CaptureIntegrityError,
278
- ConfigError,
279
- JudgeError,
280
- NotFoundError,
281
- ReplayError,
282
- ValidationError,
283
- VerificationError
284
- } from "./chunk-3BFEG2F6.js";
172
+ evaluateInterimReleaseConfidence,
173
+ pairedEvalueSequence
174
+ } from "./chunk-MAZ26DC7.js";
285
175
  import {
286
- __export
287
- } from "./chunk-PZ5AY32C.js";
288
-
289
- // src/run-score.ts
290
- var DEFAULT_RUN_SCORE_WEIGHTS = {
291
- success: 4,
292
- goalProgress: 2,
293
- repoGroundedness: 1.5,
294
- driftPenalty: -1.5,
295
- toolUseQuality: 1,
296
- patchQuality: 1.25,
297
- testReality: 1.5,
298
- finalGate: 3,
299
- reviewerBlockers: -2,
300
- costUsd: -0.2,
301
- wallSeconds: -0.1
302
- };
303
- function aggregateRunScore(score, weights = {}) {
304
- const w = { ...DEFAULT_RUN_SCORE_WEIGHTS, ...weights };
305
- return w.success * clamp01(score.success) + w.goalProgress * clamp01(score.goalProgress) + w.repoGroundedness * clamp01(score.repoGroundedness) + w.driftPenalty * clamp01(score.driftPenalty) + w.toolUseQuality * clamp01(score.toolUseQuality) + w.patchQuality * clamp01(score.patchQuality) + w.testReality * clamp01(score.testReality) + w.finalGate * clamp01(score.finalGate) + w.reviewerBlockers * clamp01(score.reviewerBlockers) + w.costUsd * Math.max(0, finiteOrZero(score.costUsd)) + w.wallSeconds * Math.max(0, finiteOrZero(score.wallSeconds) / 60);
306
- }
307
- function clamp01(value) {
308
- if (!Number.isFinite(value)) return 0;
309
- return Math.max(0, Math.min(1, value));
310
- }
311
- function finiteOrZero(value) {
312
- return Number.isFinite(value) ? value : 0;
313
- }
314
-
315
- // src/run-critic.ts
316
- var DEFAULT_DRIFT_PATTERNS = [
317
- /https?:\/\//i,
318
- /\btitle:\s/i,
319
- /\bsummary:\s/i,
320
- /\burl:\s/i,
321
- /\bnpm package usage\b/i,
322
- /\bnews\b/i
323
- ];
324
- var RunCritic = class {
325
- weights;
326
- driftPatterns;
327
- constructor(options = {}) {
328
- this.weights = options.weights;
329
- this.driftPatterns = options.driftPatterns ?? DEFAULT_DRIFT_PATTERNS;
330
- }
331
- async score(store, runId) {
332
- const run = await store.getRun(runId);
333
- if (!run) throw new NotFoundError(`run ${runId} not found`);
334
- const [spans, events, artifacts, budget] = await Promise.all([
335
- store.spans({ runId }),
336
- store.events({ runId }),
337
- store.artifacts(runId),
338
- store.budget(runId)
339
- ]);
340
- return this.scoreTrace({ run, spans, events, artifacts, budget });
341
- }
342
- scoreTrace(trace) {
343
- const notes = [];
344
- const llmSpans2 = trace.spans.filter(
345
- (s) => s.kind === "llm"
346
- );
347
- const toolSpans2 = trace.spans.filter(
348
- (s) => s.kind === "tool"
349
- );
350
- const judgeSpans2 = trace.spans.filter(
351
- (s) => s.kind === "judge"
352
- );
353
- const sandboxSpans = trace.spans.filter(
354
- (s) => s.kind === "sandbox"
355
- );
356
- const finalGateSpans = judgeSpans2.filter(
357
- (span) => span.dimension === "final_gate" || span.attributes?.finalGate === true
358
- );
359
- const success = trace.run.outcome?.pass === true ? 1 : trace.run.status === "completed" ? 0.5 : 0;
360
- if (!success) notes.push("run did not complete with pass=true");
361
- const judgeAverage = judgeSpans2.length ? judgeSpans2.reduce((sum3, span) => sum3 + normalizeJudgeScore(span.score), 0) / judgeSpans2.length : void 0;
362
- const outcomeScore = typeof trace.run.outcome?.score === "number" ? clamp01(
363
- trace.run.outcome.score > 1 ? trace.run.outcome.score / 100 : trace.run.outcome.score
364
- ) : void 0;
365
- const goalProgress = outcomeScore ?? judgeAverage ?? success;
366
- const successfulTools = toolSpans2.filter((span) => span.status !== "error").length;
367
- const toolUseQuality = toolSpans2.length === 0 ? 0 : successfulTools / toolSpans2.length;
368
- if (toolSpans2.length === 0) notes.push("no tool spans recorded");
369
- const patchEvidence = trace.artifacts.length + toolSpans2.filter((span) => /write|edit|patch|apply/i.test(span.toolName)).length;
370
- const patchQuality = patchEvidence > 0 ? clamp01(patchEvidence / 4) : 0;
371
- if (!patchQuality) notes.push("no artifact or edit evidence recorded");
372
- const sandboxTests = sandboxSpans.filter(
373
- (span) => typeof span.testsTotal === "number" && span.testsTotal > 0
374
- );
375
- const testReality = sandboxTests.length ? sandboxTests.reduce(
376
- (sum3, span) => sum3 + (span.testsPassed ?? 0) / Math.max(1, span.testsTotal ?? 1),
377
- 0
378
- ) / sandboxTests.length : toolSpans2.some(
379
- (span) => /\btest|vitest|pytest|jest|build|tsc\b/i.test(JSON.stringify(span.args))
380
- ) ? 0.4 : 0;
381
- if (!testReality) notes.push("no real test/build evidence recorded");
382
- const blockerSpans = judgeSpans2.filter((span) => isBlockingJudge(span));
383
- const finalGateBlockers = finalGateSpans.filter((span) => isBlockingJudge(span));
384
- const finalGate = finalGateSpans.length ? finalGateBlockers.length ? 0 : 1 : success;
385
- if (finalGateBlockers.length)
386
- notes.push(`final gate blocked by ${finalGateBlockers.length} reviewer(s)`);
387
- else if (!finalGateSpans.length) notes.push("no final gate judgment recorded");
388
- const reviewerBlockers = judgeSpans2.length ? blockerSpans.length / judgeSpans2.length : 0;
389
- if (reviewerBlockers) notes.push(`detected ${blockerSpans.length} blocking reviewer signal(s)`);
390
- const positiveGroundingSignals = patchEvidence + sandboxSpans.length + llmSpans2.filter((span) => looksRepoGrounded(span.output ?? "")).length;
391
- const driftSignals = llmSpans2.filter((span) => this.isDrift(span.output ?? "")).length + trace.events.filter((event) => this.isDrift(JSON.stringify(event.payload))).length;
392
- const repoGroundedness = positiveGroundingSignals + driftSignals === 0 ? 0 : positiveGroundingSignals / (positiveGroundingSignals + driftSignals);
393
- const driftPenalty = positiveGroundingSignals + driftSignals === 0 ? 0 : driftSignals / (positiveGroundingSignals + driftSignals);
394
- if (driftSignals > 0) notes.push(`detected ${driftSignals} drift signal(s)`);
395
- const costUsd = trace.budget.length ? Math.max(
396
- ...trace.budget.filter((entry) => entry.dimension === "usd").map((entry) => entry.consumed),
397
- 0
398
- ) : llmSpans2.reduce((sum3, span) => sum3 + (span.costUsd ?? 0), 0);
399
- const wallSeconds = trace.run.endedAt && trace.run.startedAt ? Math.max(0, (trace.run.endedAt - trace.run.startedAt) / 1e3) : 0;
400
- return {
401
- success,
402
- goalProgress,
403
- repoGroundedness,
404
- driftPenalty,
405
- toolUseQuality,
406
- patchQuality,
407
- testReality,
408
- finalGate,
409
- reviewerBlockers,
410
- costUsd,
411
- wallSeconds,
412
- notes
413
- };
414
- }
415
- rank(score) {
416
- return aggregateRunScore(score, this.weights);
417
- }
418
- isDrift(text) {
419
- return this.driftPatterns.some((pattern) => pattern.test(text));
420
- }
421
- };
422
- function normalizeJudgeScore(score) {
423
- return score > 1 ? clamp01(score / 10) : clamp01(score);
424
- }
425
- function looksRepoGrounded(text) {
426
- return /(?:src\/|tests?\/|package\.json|tsconfig|\.ts\b|\.tsx\b|git status|pnpm |npm |vitest|pytest|jest)/i.test(
427
- text
428
- );
429
- }
430
- function isBlockingJudge(span) {
431
- return span.attributes?.blocking === true || span.attributes?.verdict === "BLOCKING" || positiveNumber(span.attributes?.blockingFindings) || positiveNumber(span.attributes?.highFindings) || span.score <= 2;
432
- }
433
- function positiveNumber(value) {
434
- return typeof value === "number" && value > 0;
435
- }
436
-
437
- // src/semantic-concept-judge.ts
438
- var DEFAULT_COMPLEXITY_WEIGHTS = {
439
- render: 1,
440
- integrate: 2,
441
- compute: 2.5
442
- };
443
- var SEMANTIC_CONCEPT_JUDGE_VERSION = "semantic-concept-judge-v1-2026-04-24";
444
- var DEFAULT_MAX_SOURCE = 45e3;
445
- var DEFAULT_MAX_HTML = 3e4;
446
- var DEFAULT_MAX_PER_FILE = 2e4;
447
- var DEFAULT_TIMEOUT = 18e4;
448
- var DEFAULT_MODEL = "claude-sonnet-4-6";
449
- var SEMANTIC_SCHEMA = {
450
- type: "object",
451
- additionalProperties: false,
452
- required: ["summary", "concepts"],
453
- properties: {
454
- summary: { type: "string", minLength: 20, maxLength: 600 },
455
- concepts: {
456
- type: "array",
457
- minItems: 1,
458
- items: {
459
- type: "object",
460
- additionalProperties: false,
461
- required: ["concept", "present", "score", "evidence", "severity"],
462
- properties: {
463
- concept: { type: "string", minLength: 1, maxLength: 120 },
464
- present: { type: "boolean" },
465
- score: { type: "number", minimum: 0, maximum: 10 },
466
- evidence: { type: "string", minLength: 5, maxLength: 400 },
467
- severity: { type: "string", enum: ["critical", "major", "minor", "info"] }
468
- }
469
- }
470
- }
471
- }
472
- };
473
- function truncate(body, cap, label) {
474
- if (body.length <= cap) return body;
475
- return `${body.slice(0, cap)}
476
- \u2026 [truncated ${body.length - cap} chars of ${label}]`;
477
- }
478
- function buildPrompt(input, opts) {
479
- const sourceBlob = input.sourceFiles.filter((f) => f.content.length <= opts.maxPerFileChars).map((f) => `--- FILE: ${f.path} ---
480
- ${f.content}`).join("\n\n");
481
- const html = input.servedHtml ?? "";
482
- return `You are a strict code-review judge evaluating whether an agent's 0-to-1 build actually implements the features the user asked for.
483
-
484
- You MUST distinguish:
485
- (a) WORKING code that implements the concept (rendered UI, wired handler, real API call),
486
- (b) KEYWORD-PRESENT stub (comments mentioning the concept, variable names, TODOs),
487
- (c) ABSENT (concept nowhere).
488
-
489
- A comment like "// TODO: add mint button" is NOT present \u2014 score 2-3. Only count a concept as present if there is real functional code: a rendered component, a call handler wired to state or a network call, a computed value actually used.
490
-
491
- USER REQUEST (what the agent was asked to build):
492
- ${input.userRequest}
493
-
494
- ${input.artifactLabel ? `ARTIFACT METADATA:
495
- name: ${input.artifactLabel}
496
- description: ${input.artifactDescription ?? ""}
497
-
498
- ` : ""}EXPECTED CONCEPTS (each must be graded independently):
499
- ${input.expectedConcepts.map(
500
- (c, i) => ` ${i + 1}. "${c.name}"${c.keywords?.length ? ` \u2014 hints: [${c.keywords.slice(0, 6).join(" | ")}]` : ""}`
501
- ).join("\n")}
502
-
503
- ${html ? `SERVED HTML (what the preview returns when hit):
504
- ${truncate(html, opts.maxHtmlChars, "HTML")}
505
-
506
- ` : ""}SOURCE FILES (the agent's workdir):
507
- ${truncate(sourceBlob, opts.maxSourceChars, "source")}
508
-
509
- For EACH concept, return:
510
- - concept: the concept name as given (match exactly)
511
- - present: boolean \u2014 does a working implementation exist?
512
- - score: 0-10 \u2014 10 = production-ready; 7 = functional but thin; 4 = partial/stubbed; 2 = keyword-only comment; 0 = absent
513
- - evidence: cite "<file>:<line>" or "served-html:<selector>" pointing at the strongest supporting code. If the concept is absent or stubbed, explain what's missing.
514
- - severity:
515
- "info" when present: true AND score >= 7
516
- "minor" when present: true AND 4 <= score < 7
517
- "major" when present: false OR score < 4
518
- "critical" when the concept is not only absent but a core user flow depends on it
519
-
520
- Also produce a "summary" (one sentence, 20-600 chars): overall verdict on whether this is a shippable implementation of the user request vs a keyword-dense placeholder.
521
-
522
- BE SKEPTICAL. Keyword matching already passed \u2014 your job is to catch what keyword matching misses. If the agent shipped a working build, say so. If it shipped a stub, say so. Don't grade on effort.
523
-
524
- Return STRICT JSON. No prose outside the JSON.`;
525
- }
526
- async function runSemanticConceptJudge(input, options = {}) {
527
- const start = Date.now();
528
- const totalCount = input.expectedConcepts.length;
529
- if (totalCount === 0) {
530
- return {
531
- kind: "semantic-concept",
532
- version: SEMANTIC_CONCEPT_JUDGE_VERSION,
533
- score: 0,
534
- presentCount: 0,
535
- totalCount: 0,
536
- findings: [],
537
- summary: "no expected concepts declared",
538
- durationMs: 0,
539
- costUsd: null,
540
- available: false,
541
- error: "no expected concepts declared"
542
- };
543
- }
544
- const opts = {
545
- model: options.model ?? DEFAULT_MODEL,
546
- timeoutMs: options.timeoutMs ?? DEFAULT_TIMEOUT,
547
- maxSourceChars: options.maxSourceChars ?? DEFAULT_MAX_SOURCE,
548
- maxPerFileChars: options.maxPerFileChars ?? DEFAULT_MAX_PER_FILE,
549
- maxHtmlChars: options.maxHtmlChars ?? DEFAULT_MAX_HTML,
550
- llm: options.llm ?? {},
551
- weightConcepts: options.weightConcepts ?? "mean",
552
- complexityWeights: { ...DEFAULT_COMPLEXITY_WEIGHTS, ...options.complexityWeights ?? {} }
553
- };
554
- const weightForConcept = (spec) => {
555
- if (opts.weightConcepts === "mean") return 1;
556
- if (spec.weight != null) return spec.weight;
557
- if (opts.weightConcepts === "complexity") {
558
- return opts.complexityWeights[spec.complexity ?? "render"] ?? 1;
559
- }
560
- return 1;
561
- };
562
- const weightByName = new Map(
563
- input.expectedConcepts.map((c) => [c.name, weightForConcept(c)])
564
- );
565
- try {
566
- const { value, result } = await callLlmJson(
567
- {
568
- model: opts.model,
569
- messages: [
570
- {
571
- role: "system",
572
- content: "You are a strict code-review judge. Return strict JSON only. No prose outside the JSON. A keyword in a comment is NOT a working implementation."
573
- },
574
- { role: "user", content: buildPrompt(input, opts) }
575
- ],
576
- jsonSchema: { name: "semantic_concept_judge", schema: SEMANTIC_SCHEMA },
577
- temperature: 0,
578
- timeoutMs: opts.timeoutMs
579
- },
580
- opts.llm
581
- );
582
- if (!value?.concepts || !Array.isArray(value.concepts)) {
583
- throw new Error('judge returned malformed response \u2014 expected array under "concepts"');
584
- }
585
- const findings = value.concepts.map((c) => ({
586
- concept: String(c.concept),
587
- present: Boolean(c.present),
588
- score: Math.max(0, Math.min(10, Number(c.score ?? 0))),
589
- evidence: String(c.evidence ?? ""),
590
- severity: ["critical", "major", "minor", "info"].includes(c.severity) ? c.severity : "info"
591
- }));
592
- const presentCount = findings.filter((f) => f.present && f.score >= 7).length;
593
- let weightSum = 0;
594
- let weightedScoreSum = 0;
595
- for (const f of findings) {
596
- const w = weightByName.get(f.concept) ?? 1;
597
- weightSum += w;
598
- weightedScoreSum += w * f.score;
599
- }
600
- const scoreAvg = weightSum > 0 ? weightedScoreSum / weightSum : findings.reduce((a, f) => a + f.score, 0) / Math.max(1, findings.length);
601
- return {
602
- kind: "semantic-concept",
603
- version: SEMANTIC_CONCEPT_JUDGE_VERSION,
604
- score: Number((scoreAvg / 10).toFixed(3)),
605
- presentCount,
606
- totalCount,
607
- findings,
608
- summary: String(value.summary ?? ""),
609
- durationMs: Date.now() - start,
610
- costUsd: result.costUsd ?? null,
611
- available: true
612
- };
613
- } catch (err) {
614
- return {
615
- kind: "semantic-concept",
616
- version: SEMANTIC_CONCEPT_JUDGE_VERSION,
617
- score: 0,
618
- presentCount: 0,
619
- totalCount,
620
- findings: [],
621
- summary: "",
622
- durationMs: Date.now() - start,
623
- costUsd: null,
624
- available: false,
625
- error: err instanceof Error ? err.message : String(err)
626
- };
627
- }
628
- }
629
- function createSemanticConceptJudge(options = {}) {
630
- return (input) => runSemanticConceptJudge(input, options);
631
- }
632
-
633
- // src/analyst/types.ts
634
- import { createHash } from "crypto";
635
- function computeFindingId(input) {
636
- const basis = JSON.stringify({
637
- a: input.analyst_id,
638
- r: input.area,
639
- s: input.subject ?? "",
640
- c: normalizeClaim(input.id_basis ?? input.claim)
641
- });
642
- return `f_${createHash("sha256").update(basis).digest("hex").slice(0, 20)}`;
643
- }
644
- function normalizeClaim(c) {
645
- return c.toLowerCase().replace(/\s+/g, " ").replace(/[.!?;:,]+$/g, "").trim();
646
- }
647
- function makeFinding(init) {
648
- const { id_basis, produced_at, ...rest } = init;
649
- return {
650
- schema_version: "1.0.0",
651
- finding_id: computeFindingId({
652
- analyst_id: rest.analyst_id,
653
- area: rest.area,
654
- subject: rest.subject,
655
- claim: rest.claim,
656
- id_basis
657
- }),
658
- produced_at: produced_at ?? (/* @__PURE__ */ new Date()).toISOString(),
659
- ...rest
660
- };
661
- }
662
-
663
- // src/analyst/adapters.ts
664
- var ADAPTER_REV = "1";
665
- function liftSeverity(s) {
666
- switch (s) {
667
- case "critical":
668
- return "critical";
669
- case "major":
670
- return "high";
671
- case "minor":
672
- return "medium";
673
- case "info":
674
- return "info";
675
- }
676
- }
677
- function createTraceAnalystAdapter(opts) {
678
- const id = opts.id ?? "trace-analyst";
679
- const area = opts.area ?? "agent-reasoning";
680
- return {
681
- id,
682
- description: "Runs the agent-eval trace analyst over an OTLP trace store and lifts its bulleted findings.",
683
- inputKind: "trace-store",
684
- cost: { kind: "llm", models: opts.model ? [opts.model] : void 0 },
685
- version: `trace-analyst-${ADAPTER_REV}`,
686
- async analyze(store, ctx) {
687
- const out = [];
688
- for (const question of opts.questions) {
689
- if (ctx.signal?.aborted) break;
690
- const result = await analyzeTraces(
691
- { question },
692
- { source: store, ai: opts.ai, model: opts.model, ...opts.extra }
693
- );
694
- const subject = ctx.tags?.subject ?? question.slice(0, 60);
695
- if (result.findings.length === 0) {
696
- out.push(
697
- makeFinding({
698
- analyst_id: id,
699
- area,
700
- subject,
701
- claim: result.answer.slice(0, 200),
702
- rationale: result.answer,
703
- severity: "info",
704
- confidence: 0.5,
705
- evidence_refs: [],
706
- metadata: {
707
- actor_prompt_version: result.actorPromptVersion,
708
- turns: result.turnCount
709
- }
710
- })
711
- );
712
- continue;
713
- }
714
- result.findings.forEach((claim, i) => {
715
- out.push(
716
- makeFinding({
717
- analyst_id: id,
718
- area,
719
- subject,
720
- claim,
721
- rationale: i === 0 ? result.answer : void 0,
722
- severity: "medium",
723
- confidence: 0.6,
724
- evidence_refs: [],
725
- metadata: { question, turns: result.turnCount, finding_index: i }
726
- })
727
- );
728
- });
729
- }
730
- return out;
731
- }
732
- };
733
- }
734
- function createVerifierAdapter(opts) {
735
- const id = opts.id ?? "multi-layer-verifier";
736
- const area = opts.area ?? "verification";
737
- return {
738
- id,
739
- description: "Runs a MultiLayerVerifier and lifts each layer's findings into the analyst envelope.",
740
- inputKind: "custom",
741
- cost: { kind: "deterministic" },
742
- version: `verifier-${ADAPTER_REV}`,
743
- async analyze(env, ctx) {
744
- const report = await opts.verifier.run({ env, ...opts.options });
745
- const out = [];
746
- for (const layer of report.layers) {
747
- for (const finding2 of layer.findings) {
748
- out.push(liftLayerFinding(id, area, layer.layer, finding2));
749
- }
750
- if (layer.status === "fail" || layer.status === "error" || layer.status === "timeout") {
751
- out.push(
752
- makeFinding({
753
- analyst_id: id,
754
- area,
755
- subject: layer.layer,
756
- claim: `layer "${layer.layer}" ${layer.status}: ${layer.reason ?? "no reason given"}`,
757
- severity: layer.status === "error" ? "high" : layer.status === "timeout" ? "medium" : "high",
758
- confidence: 1,
759
- evidence_refs: [],
760
- metadata: {
761
- layer_status: layer.status,
762
- duration_ms: layer.durationMs,
763
- score: layer.score,
764
- diagnostics: layer.diagnostics
765
- }
766
- })
767
- );
768
- }
769
- }
770
- ctx.log?.("verifier complete", {
771
- layers: report.layers.length,
772
- blended: report.blendedScore,
773
- all_pass: report.allPass
774
- });
775
- return out;
776
- }
777
- };
778
- }
779
- function liftLayerFinding(analyst_id, area, layer, f) {
780
- return makeFinding({
781
- analyst_id,
782
- area,
783
- subject: f.layer ?? layer,
784
- claim: f.message,
785
- severity: liftSeverity(f.severity),
786
- confidence: 0.85,
787
- evidence_refs: f.evidence ? [{ kind: "artifact", uri: "inline:evidence", excerpt: f.evidence }] : [],
788
- metadata: f.detail
789
- });
790
- }
791
/**
 * Build an analyst adapter around a run critic.
 *
 * Every numeric score dimension that falls below `threshold` becomes one
 * finding, and an additional "drift" finding is emitted when
 * `score.driftPenalty` exceeds `1 - threshold`.
 *
 * @param {object} [opts]
 * @param {string} [opts.id="run-critic"] - Analyst id stamped on findings.
 * @param {string} [opts.area="run-quality"] - Area stamped on findings.
 * @param {object} [opts.critic] - Object exposing `scoreTrace(trace)`; defaults to `new RunCritic()`.
 * @param {number} [opts.threshold=0.5] - Cut-off applied to each dimension.
 * @returns {object} Adapter with `id`, `description`, `inputKind`, `cost`, `version`, `analyze`.
 */
function createRunCriticAdapter(opts = {}) {
  const id = opts.id ?? "run-critic";
  const area = opts.area ?? "run-quality";
  const critic = opts.critic ?? new RunCritic();
  const threshold = opts.threshold ?? 0.5;

  // One rule per scored dimension: which score field to read, the severity
  // to report, and the claim text used when the value is below threshold.
  const dimensionRules = [
    { dimension: "success", severity: "critical", claim: "run did not complete successfully" },
    { dimension: "goalProgress", severity: "high", claim: "goal progress is low" },
    { dimension: "repoGroundedness", severity: "high", claim: "output is poorly grounded in the repository" },
    { dimension: "toolUseQuality", severity: "medium", claim: "tool use quality is low" },
    { dimension: "patchQuality", severity: "medium", claim: "no real patch/edit evidence" },
    { dimension: "testReality", severity: "high", claim: "no real test/build evidence" },
    { dimension: "finalGate", severity: "critical", claim: "final gate is blocking" }
  ];

  const dimensionFinding = ({ dimension, severity, claim }, value, trace) =>
    makeFinding({
      analyst_id: id,
      area,
      subject: dimension,
      claim,
      rationale: `${dimension}=${value.toFixed(2)} below threshold ${threshold}`,
      severity,
      confidence: 1,
      evidence_refs: [],
      metadata: { dimension, value, threshold, run_id: trace.run.runId }
    });

  const driftFinding = (score) =>
    makeFinding({
      analyst_id: id,
      area,
      subject: "drift",
      claim: "agent output drifted from repository signal",
      rationale: `driftPenalty=${score.driftPenalty.toFixed(2)}`,
      severity: "medium",
      confidence: 0.9,
      evidence_refs: [],
      metadata: { drift_penalty: score.driftPenalty, notes: score.notes }
    });

  return {
    id,
    description: "Scores a single run across success / grounding / drift / tool-quality and surfaces below-threshold dimensions.",
    inputKind: "custom",
    cost: { kind: "deterministic" },
    version: `run-critic-${ADAPTER_REV}`,
    async analyze(trace) {
      const score = critic.scoreTrace(trace);
      const findings = [];
      for (const rule of dimensionRules) {
        const value = score[rule.dimension];
        // Non-numeric (and NaN) values never count as "below threshold".
        if (typeof value !== "number" || !(value < threshold)) continue;
        findings.push(dimensionFinding(rule, value, trace));
      }
      if (score.driftPenalty > 1 - threshold) {
        findings.push(driftFinding(score));
      }
      return findings;
    }
  };
}
851
/**
 * Wrap a judge function as an analyst adapter.
 *
 * `analyze` awaits `opts.judge(opts.tcloud, input)` and turns every score
 * whose 0-10 normalized value is below `threshold` into a finding.
 *
 * @param {object} opts
 * @param {Function} opts.judge - Called as `judge(tcloud, input)`; resolves to an array of scores.
 * @param {*} [opts.tcloud] - Passed through to `judge` untouched.
 * @param {string} [opts.id="judge"]
 * @param {string} [opts.area="judge"]
 * @param {number} [opts.threshold=6] - Cut-off on the 0-10 scale.
 * @param {object} [opts.cost] - Defaults to `{ kind: "llm" }`.
 * @returns {object} Adapter with `id`, `description`, `inputKind`, `cost`, `version`, `analyze`.
 */
function createJudgeAdapter(opts) {
  const id = opts.id ?? "judge";
  const area = opts.area ?? "judge";
  const threshold = opts.threshold ?? 6;
  const isBelowThreshold = (entry) => normalize10(entry.score) < threshold;
  const toFinding = (entry) => liftJudgeScore(id, area, entry);
  return {
    id,
    description: "Wraps an agent-eval JudgeFn into an analyst; below-threshold dimensions surface as findings.",
    inputKind: "judge-input",
    cost: opts.cost ?? { kind: "llm" },
    version: `judge-${ADAPTER_REV}`,
    async analyze(input) {
      const judged = await opts.judge(opts.tcloud, input);
      const failing = judged.filter(isBelowThreshold);
      return failing.map(toFinding);
    }
  };
}
867
/**
 * Put a score on the 0-10 scale: values `<= 1` are multiplied by ten,
 * anything else is returned unchanged.
 *
 * @param {number} s
 * @returns {number}
 */
function normalize10(s) {
  if (s <= 1) {
    return s * 10;
  }
  return s;
}
870
/**
 * Convert one judge score into an analyst finding.
 *
 * Severity is bucketed on the normalized 0-10 score:
 * `< 3` critical, `< 5` high, `< 7` medium, otherwise low.
 *
 * @param {string} analyst_id
 * @param {string} area
 * @param {object} s - Judge score; reads `score`, `dimension`, `judgeName`, `reasoning`, `evidence`.
 * @returns {*} Whatever `makeFinding` returns for the assembled payload.
 */
function liftJudgeScore(analyst_id, area, s) {
  const score10 = normalize10(s.score);

  let severity = "low";
  if (score10 < 3) {
    severity = "critical";
  } else if (score10 < 5) {
    severity = "high";
  } else if (score10 < 7) {
    severity = "medium";
  }

  const evidenceRefs = s.evidence
    ? [{ kind: "artifact", uri: "inline:evidence", excerpt: s.evidence }]
    : [];

  return makeFinding({
    analyst_id,
    area,
    subject: s.dimension,
    claim: `${s.judgeName}/${s.dimension} scored ${score10.toFixed(1)}/10`,
    rationale: s.reasoning,
    severity,
    confidence: 0.8,
    evidence_refs: evidenceRefs,
    metadata: { judge_name: s.judgeName, dimension: s.dimension, score_10: score10 }
  });
}
885
/**
 * Build an analyst adapter around `runSemanticConceptJudge`.
 *
 * When the judge reports itself unavailable, a single "info" finding
 * carrying the error is returned. Otherwise each concept that is missing,
 * or present with a score below 7, becomes one finding.
 *
 * @param {object} [opts]
 * @param {string} [opts.id="semantic-concept-judge"]
 * @param {string} [opts.area="concept-coverage"]
 * @param {object} [opts.options] - Forwarded to `runSemanticConceptJudge`; `options.model` is echoed in `cost.models`.
 * @returns {object} Adapter with `id`, `description`, `inputKind`, `cost`, `version`, `analyze`.
 */
function createSemanticConceptJudgeAdapter(opts = {}) {
  const id = opts.id ?? "semantic-concept-judge";
  const area = opts.area ?? "concept-coverage";

  const unavailableFinding = (result) =>
    makeFinding({
      analyst_id: id,
      area,
      claim: "semantic-concept judge unavailable",
      rationale: result.error,
      severity: "info",
      confidence: 1,
      evidence_refs: [],
      metadata: { reason: result.error }
    });

  const conceptFinding = (f, result) =>
    makeFinding({
      analyst_id: id,
      area,
      subject: f.concept,
      claim: f.present ? `concept "${f.concept}" is weak (${f.score}/10)` : `concept "${f.concept}" is missing`,
      rationale: f.evidence,
      severity: liftSeverity(f.severity),
      confidence: 0.85,
      evidence_refs: [{ kind: "artifact", uri: "inline:evidence", excerpt: f.evidence }],
      metadata: {
        concept: f.concept,
        present: f.present,
        score_10: f.score,
        cost_usd: result.costUsd ?? void 0
      }
    });

  return {
    id,
    description: "Runs the semantic-concept judge and surfaces missing / weak concepts as findings.",
    inputKind: "custom",
    cost: { kind: "llm", models: opts.options?.model ? [opts.options.model] : void 0 },
    version: `${SEMANTIC_CONCEPT_JUDGE_VERSION}-adapter-${ADAPTER_REV}`,
    async analyze(input) {
      const result = await runSemanticConceptJudge(input, opts.options);
      if (!result.available) {
        return [unavailableFinding(result)];
      }
      const findings = [];
      for (const f of result.findings) {
        // Present concepts scoring 7 or more are healthy and not reported.
        const healthy = f.present && f.score >= 7;
        if (!healthy) {
          findings.push(conceptFinding(f, result));
        }
      }
      return findings;
    }
  };
}
936
-
937
- // src/analyst/chat-client.ts
938
/**
 * Create a chat client for the transport named in `opts.transport`.
 *
 * - "router", "cli-bridge", "direct-provider": backed by an `LlmClient`
 *   wrapped with `wrapLlmClient`; they differ only in base URL and credential.
 * - "sandbox-sdk": delegates to `opts.chat`.
 * - "mock": delegates to `opts.handler`.
 *
 * @param {object} opts
 * @param {string} opts.transport - One of the transports listed above.
 * @param {string} [opts.defaultModel] - Model used when a request carries none.
 * @param {string} [opts.baseUrl] - Endpoint override for the LlmClient-backed transports.
 * @param {string} [opts.apiKey] - Credential for "router" / "direct-provider".
 * @param {string} [opts.bearer] - Credential for "cli-bridge" (defaults to "").
 * @param {Function} [opts.chat] - Implementation for "sandbox-sdk".
 * @param {Function} [opts.handler] - Implementation for "mock".
 * @returns {{transport: string, defaultModel: (string|undefined), chat: Function}}
 * @throws {Error} When `opts.transport` is not a known transport. Previously
 *   the function fell out of the switch and returned `undefined`, which only
 *   surfaced later as an unrelated "cannot read properties of undefined".
 */
function createChatClient(opts) {
  // The three LlmClient-backed transports share one construction path.
  const viaLlmClient = (baseUrl, apiKey) =>
    wrapLlmClient(opts.transport, opts.defaultModel, new LlmClient({ baseUrl, apiKey }));

  switch (opts.transport) {
    case "router":
      return viaLlmClient(opts.baseUrl ?? "https://router.tangle.tools/v1", opts.apiKey);
    case "cli-bridge":
      return viaLlmClient(opts.baseUrl ?? "http://127.0.0.1:3344/v1", opts.bearer ?? "");
    case "direct-provider":
      return viaLlmClient(opts.baseUrl, opts.apiKey);
    case "sandbox-sdk":
      return {
        transport: "sandbox-sdk",
        defaultModel: opts.defaultModel,
        chat: async (req, callOpts) => opts.chat(resolveModel(req, opts.defaultModel), callOpts)
      };
    case "mock":
      return {
        transport: "mock",
        defaultModel: opts.defaultModel,
        chat: async (req, callOpts) => opts.handler(resolveModel(req, opts.defaultModel), callOpts)
      };
    default:
      // Fail at construction time instead of handing back `undefined`.
      throw new Error(`createChatClient: unknown transport "${String(opts.transport)}"`);
  }
}
981
/**
 * Adapt an `LlmClient`-style object (anything exposing `call(request)`) to
 * the chat-client shape `{ transport, defaultModel, chat }`.
 *
 * `chat(req, callOpts)` resolves the model via `resolveModel`, forwards the
 * request fields to `inner.call`, and — when `callOpts.signal` is given —
 * rejects as soon as the signal aborts.
 *
 * Differences from the previous implementation:
 * - The "abort" listener is removed once the race settles. Before, every
 *   call that finished normally left a listener attached to the signal,
 *   which accumulates when one long-lived signal is shared across calls.
 * - A signal that is already aborted rejects before `inner.call` is
 *   issued, instead of starting a request whose result is thrown away.
 *
 * @param {string} transport - Transport label exposed on the client.
 * @param {string} [defaultModel] - Model used when `req.model` is absent.
 * @param {{call: Function}} inner - Underlying client.
 * @returns {{transport: string, defaultModel: (string|undefined), chat: Function}}
 */
function wrapLlmClient(transport, defaultModel, inner) {
  return {
    transport,
    defaultModel,
    chat: async (req, callOpts) => {
      const resolved = resolveModel(req, defaultModel);
      const signal = callOpts?.signal;
      if (signal?.aborted) {
        throw toAbortError(signal);
      }
      const call = inner.call({
        model: resolved.model,
        messages: req.messages,
        jsonMode: req.jsonMode,
        jsonSchema: req.jsonSchema,
        temperature: req.temperature,
        maxTokens: req.maxTokens,
        timeoutMs: req.timeoutMs
      });
      if (!signal) return await call;

      let onAbort;
      const aborted = new Promise((_resolve, reject) => {
        onAbort = () => reject(toAbortError(signal));
        signal.addEventListener("abort", onAbort, { once: true });
      });
      try {
        return await Promise.race([call, aborted]);
      } finally {
        // Detach whether the call won, lost, or threw.
        signal.removeEventListener("abort", onAbort);
      }
    }
  };
}
1001
/**
 * Turn an abort signal into a promise that only ever rejects.
 *
 * Rejects immediately when the signal is already aborted; otherwise rejects
 * on the first "abort" event. The promise never fulfills.
 *
 * @param {AbortSignal} signal
 * @returns {Promise<never>}
 */
function abortAsRejection(signal) {
  return signal.aborted
    ? Promise.reject(toAbortError(signal))
    : new Promise((_resolve, reject) => {
        const onAbort = () => reject(toAbortError(signal));
        signal.addEventListener("abort", onAbort, { once: true });
      });
}
1007
/**
 * Produce the error to reject with for an aborted signal.
 *
 * @param {{reason: *}} signal
 * @returns {Error} `signal.reason` itself when it is an `Error`; otherwise a
 *   new `Error` named "AbortError".
 */
function toAbortError(signal) {
  const { reason } = signal;
  if (reason instanceof Error) {
    return reason;
  }
  return Object.assign(new Error("ChatClient.chat: aborted"), { name: "AbortError" });
}
1014
/**
 * Ensure a chat request carries a model.
 *
 * @param {object} req - Chat request; returned as-is when `req.model` is truthy.
 * @param {string} [defaultModel] - Fallback model bound on the client.
 * @returns {object} `req` itself, or a shallow copy with `model` set to `defaultModel`.
 * @throws {Error} When neither the request nor the client provides a model.
 */
function resolveModel(req, defaultModel) {
  if (req.model) {
    return req;
  }
  if (defaultModel) {
    return { ...req, model: defaultModel };
  }
  throw new Error(
    "ChatClient.chat: no model on request and no defaultModel on the client. Either pass req.model or bind defaultModel at createChatClient()."
  );
}
1023
-
1024
- // src/analyst/finding-signature.ts
1025
- import { z as z2 } from "zod";
1026
-
1027
- // src/analyst/finding-subject.ts
1028
- import { z } from "zod";
1029
// Every `kind` value that `parseFindingSubject` can produce and that
// `renderFindingSubject` can turn back into a subject string.
var FINDING_SUBJECT_KINDS = [
  "knowledge.wiki",
  "knowledge.claim",
  "knowledge.raw",
  "knowledge.stale",
  "system-prompt",
  "tool-doc",
  "new-tool",
  "rag",
  "memory",
  "scaffolding",
  "output-schema",
  "websearch.outdated",
  "prior-run-summary",
  "cluster"
];
1045
/**
 * Parse a finding-subject string into its structured form.
 *
 * The input is trimmed, then matched against the subject grammar in a fixed
 * order; the first rule that matches and yields a subject wins. A bare
 * lowercase kebab-case token of at most 80 characters is the last fallback
 * and parses as a "cluster" label.
 *
 * @param {string|null|undefined} raw
 * @returns {object|null} `{ kind, ...fields }`, or `null` for nullish, blank,
 *   or non-matching input.
 */
function parseFindingSubject(raw) {
  if (raw == null) return null;
  const text = raw.trim();
  if (text.length === 0) return null;

  // Builder for `<prefix>:<free-form>` rules: the captured tail is trimmed
  // and must be non-empty, otherwise matching continues with the next rule.
  const freeForm = (kind, field) => (m) => {
    const value = m[1].trim();
    return value.length > 0 ? { kind, [field]: value } : null;
  };
  // Builder for `<prefix>:<id>:<free-form>` rules (tool-doc aspect, rag doc id).
  const idThenFreeForm = (kind, idField, tailField) => (m) => {
    const value = m[2].trim();
    return value.length > 0 ? { kind, [idField]: m[1], [tailField]: value } : null;
  };

  // Order matters: `tool-doc:<tool>:<aspect>` is tried before `tool-doc:<tool>`.
  const rules = [
    [
      /^agent-knowledge:wiki:([a-z0-9][a-z0-9-]*)(?:#([a-z0-9][a-z0-9-]*))?$/,
      (m) => ({ kind: "knowledge.wiki", slug: m[1], ...(m[2] ? { heading: m[2] } : {}) })
    ],
    [/^agent-knowledge:claim:(.+)$/, freeForm("knowledge.claim", "topic")],
    [/^agent-knowledge:raw:(.+)$/, freeForm("knowledge.raw", "sourceId")],
    [/^agent-knowledge:stale:([a-z0-9][a-z0-9-]*)$/, (m) => ({ kind: "knowledge.stale", slug: m[1] })],
    [/^system-prompt:(.+)$/, freeForm("system-prompt", "section")],
    [/^tool-doc:([a-z0-9][a-z0-9_-]*):(.+)$/, idThenFreeForm("tool-doc", "tool", "aspect")],
    [/^tool-doc:([a-z0-9][a-z0-9_-]*)$/, (m) => ({ kind: "tool-doc", tool: m[1] })],
    [/^new-tool:([a-z0-9][a-z0-9_-]*)$/, (m) => ({ kind: "new-tool", name: m[1] })],
    [/^rag:([a-z0-9][a-z0-9_-]*):(.+)$/, idThenFreeForm("rag", "corpus", "docId")],
    [/^memory:(.+)$/, freeForm("memory", "key")],
    [/^scaffolding:(.+)$/, freeForm("scaffolding", "concern")],
    [/^output-schema:(.+)$/, freeForm("output-schema", "field")],
    [/^websearch:outdated:(.+)$/, freeForm("websearch.outdated", "topic")],
    [/^prior-run-summary:(.+)$/, freeForm("prior-run-summary", "topic")]
  ];

  for (const [pattern, build] of rules) {
    const m = text.match(pattern);
    if (!m) continue;
    const subject = build(m);
    if (subject) return subject;
  }

  if (text.length <= 80 && /^[a-z0-9][a-z0-9-]*$/.test(text)) {
    return { kind: "cluster", label: text };
  }
  return null;
}
1091
/**
 * Render a structured finding subject back into its string form
 * (the inverse direction of `parseFindingSubject`).
 *
 * @param {object} s - Subject with a `kind` plus that kind's fields.
 * @returns {string|undefined} The subject string; `undefined` when `s.kind`
 *   is not one of the known kinds.
 */
function renderFindingSubject(s) {
  const renderers = new Map([
    [
      "knowledge.wiki",
      (x) => (x.heading ? `agent-knowledge:wiki:${x.slug}#${x.heading}` : `agent-knowledge:wiki:${x.slug}`)
    ],
    ["knowledge.claim", (x) => `agent-knowledge:claim:${x.topic}`],
    ["knowledge.raw", (x) => `agent-knowledge:raw:${x.sourceId}`],
    ["knowledge.stale", (x) => `agent-knowledge:stale:${x.slug}`],
    ["system-prompt", (x) => `system-prompt:${x.section}`],
    ["tool-doc", (x) => (x.aspect ? `tool-doc:${x.tool}:${x.aspect}` : `tool-doc:${x.tool}`)],
    ["new-tool", (x) => `new-tool:${x.name}`],
    ["rag", (x) => `rag:${x.corpus}:${x.docId}`],
    ["memory", (x) => `memory:${x.key}`],
    ["scaffolding", (x) => `scaffolding:${x.concern}`],
    ["output-schema", (x) => `output-schema:${x.field}`],
    ["websearch.outdated", (x) => `websearch:outdated:${x.topic}`],
    ["prior-run-summary", (x) => `prior-run-summary:${x.topic}`],
    ["cluster", (x) => x.label]
  ]);
  const render = renderers.get(s.kind);
  return render ? render(s) : undefined;
}
1123
// Prompt text describing the subject grammar to a model; the lines are
// joined with "\n". The forms listed mirror the patterns matched by
// `parseFindingSubject`.
var FINDING_SUBJECT_GRAMMAR_PROMPT = [
  "Subjects MUST match this grammar \u2014 anything else is rejected at parse time and your work is wasted:",
  "",
  "  Knowledge loci (write to the agent-knowledge base):",
  "    agent-knowledge:wiki:<slug>[#<heading>]   create / update a wiki page",
  "    agent-knowledge:claim:<topic>             draft a claim / relation triple",
  "    agent-knowledge:raw:<source-id>           lift a raw source into a curated page",
  "    agent-knowledge:stale:<slug>              mark a page superseded",
  "",
  "  Runtime mutable surfaces (write to prompts / tools / scaffolding):",
  "    system-prompt:<section>                   add / replace a system-prompt section",
  "    tool-doc:<tool>[:<aspect>]                rewrite a tool description",
  "    new-tool:<name>                           propose a new tool surface",
  "    rag:<corpus>:<doc-id>                     ingest / correct a RAG document",
  "    memory:<key>                              invalidate / set a memory entry",
  "    scaffolding:<concern>                     change a precondition / retry / verifier",
  "    output-schema:<field>                     constrain the agent output shape",
  "",
  "  Stale signals (knowledge-poisoning only):",
  "    websearch:outdated:<topic>                stale web result",
  "    prior-run-summary:<topic>                 stale prior-run summary",
  "",
  "  Cluster label (failure-mode only):",
  '    <kebab-case-label>                        short cluster id, e.g. "tool-call-loop"',
  "",
  "Slugs / tool ids: [a-z0-9-]+ (lowercase kebab). Topics / keys / sections: free-form, trimmed."
].join("\n");
1150
// Allowed subject kinds per analyst kind id. `createTraceAnalystKind` looks
// up `spec.id` here and rejects findings whose parsed subject kind is not
// listed; analyst kinds with no entry are not restricted.
var KIND_EXPECTED_SUBJECTS = {
  "failure-mode": ["cluster"],
  "knowledge-gap": [
    "knowledge.wiki",
    "knowledge.claim",
    "knowledge.raw",
    "knowledge.stale",
    "tool-doc",
    "system-prompt",
    "memory",
    "websearch.outdated",
    "prior-run-summary"
  ],
  "knowledge-poisoning": [
    "knowledge.wiki",
    "knowledge.claim",
    "knowledge.raw",
    "tool-doc",
    "system-prompt",
    "memory",
    "websearch.outdated",
    "prior-run-summary"
  ],
  improvement: [
    "system-prompt",
    "tool-doc",
    "new-tool",
    "rag",
    "memory",
    "scaffolding",
    "output-schema",
    "knowledge.wiki",
    "knowledge.claim"
  ]
};
1185
// Zod string schema that accepts only strings `parseFindingSubject` can parse.
var FindingSubjectStringSchema = z.string().refine((s) => parseFindingSubject(s) !== null, {
  message: "subject does not match the finding-subject grammar"
});
1188
-
1189
- // src/analyst/finding-signature.ts
1190
// Severity labels accepted on a raw finding.
var ANALYST_SEVERITIES = ["critical", "high", "medium", "low", "info"];
// Strict Zod schema for one raw finding row: because of `.strict()`, a row
// carrying any key not declared below fails validation.
var RawAnalystFindingSchema = z2.object({
  severity: z2.enum(ANALYST_SEVERITIES),
  claim: z2.string().min(1).max(2e3),
  /**
   * Subject locus the finding is about. Validated at parse time
   * against the documented grammar (`finding-subject.ts`). Findings
   * with a malformed subject are rejected — they would have been
   * silently skipped by every downstream adapter, so failing loud at
   * parse time turns a hidden no-op into a kind-prompt audit signal.
   *
   * Optional because purely descriptive findings (no actionable
   * locus) are legitimate; they just don't route through the
   * KnowledgeAdapter / ImprovementAdapter.
   */
  subject: z2.string().max(400).refine((s) => parseFindingSubject(s) !== null, {
    message: "subject does not match the finding-subject grammar"
  }).optional(),
  evidence_uri: z2.string().min(1).max(2e3),
  evidence_excerpt: z2.string().max(2e3).optional(),
  confidence: z2.number().min(0).max(1),
  rationale: z2.string().max(4e3).optional(),
  recommended_action: z2.string().max(2e3).optional()
}).strict();
1214
// Prompt text describing the raw-finding JSON shape to the model;
// `createTraceAnalystKind` appends it to the actor description.
// NOTE(review): the `subject?` line invites free noun phrases, while
// `RawAnalystFindingSchema` only accepts subjects matching the
// finding-subject grammar — confirm whether the grammar prompt should be
// included here as well.
var RAW_FINDING_SCHEMA_PROMPT = `Each finding MUST be a JSON object with these fields:
- severity: one of "critical" | "high" | "medium" | "low" | "info"
- claim: one-sentence statement (max 2000 chars)
- subject?: the leaf id, agent id, span id, tool name, or noun phrase the finding is about
- evidence_uri: "span://<trace_id>/<span_id>" for trace evidence, "artifact://<relative-path>" for files, "metric://<name>" for named scalars \u2014 ALWAYS cite a real id surfaced by the tools
- evidence_excerpt?: short quote (<=2000 chars) from the cited span/artifact
- confidence: number 0..1 \u2014 0.9+ when backed by exact quotes, 0.6-0.8 for inferred patterns, <0.5 for speculative
- rationale?: one or two sentences explaining the reasoning
- recommended_action?: concrete change phrased as an imperative ("Add ...", "Replace ...", "Stop ...") \u2014 omit when the finding is purely descriptive

Emit an empty array when the question has no findings to report. Do not fabricate evidence.`;
1225
/**
 * Validate one raw finding row against `RawAnalystFindingSchema`.
 *
 * @param {*} row - Untrusted row, as emitted by the model.
 * @param {Function} [log] - Optional logger called as `log(message, payload)` on rejection.
 * @returns {object|null} The parsed finding, or `null` when validation fails.
 */
function parseRawFinding(row, log) {
  const outcome = RawAnalystFindingSchema.safeParse(row);
  if (outcome.success) {
    return outcome.data;
  }
  // The issue summary is only built when a logger is actually present.
  log?.("finding rejected: schema failure", {
    issues: outcome.error.issues.map(({ path, code, message }) => ({
      path: path.join("."),
      code,
      message
    }))
  });
  return null;
}
1239
-
1240
- // src/analyst/findings-store.ts
1241
- import { existsSync as existsSync2, readFileSync } from "fs";
1242
-
1243
- // src/locked-jsonl-appender.ts
1244
- import { appendFileSync, existsSync, mkdirSync } from "fs";
1245
- import { dirname } from "path";
1246
-
1247
- // src/concurrency.ts
1248
/**
 * Minimal in-process async mutex with FIFO hand-off.
 *
 * `acquire()` resolves to a release function. When waiters are queued,
 * releasing passes the lock directly to the oldest waiter (`locked` stays
 * true); otherwise the lock becomes free.
 *
 * The release function returned by `acquire()` is idempotent: calling it
 * more than once has no further effect. Before this change a second call
 * would run `release()` again and hand the lock to another waiter while the
 * previous one still held it, breaking mutual exclusion.
 */
var Mutex = class {
  locked = false;
  waiters = [];
  /**
   * Wait for the lock.
   * @returns {Promise<Function>} Resolves to a once-only release function.
   */
  async acquire() {
    if (!this.locked) {
      this.locked = true;
      return this.#releaseOnce();
    }
    return new Promise((resolve) => {
      this.waiters.push(() => {
        resolve(this.#releaseOnce());
      });
    });
  }
  /**
   * Hand the lock to the oldest waiter, or mark it free when nobody waits.
   * Unlike the function returned by `acquire()`, this is not guarded
   * against repeated calls.
   */
  release() {
    const next = this.waiters.shift();
    if (next) {
      next();
    } else {
      this.locked = false;
    }
  }
  /**
   * Run `fn` while holding the lock; the lock is released even when `fn`
   * throws or rejects.
   * @param {Function} fn - Sync or async callback.
   * @returns {Promise<*>} The (awaited) result of `fn`.
   */
  async runExclusive(fn) {
    const release = await this.acquire();
    try {
      return await fn();
    } finally {
      release();
    }
  }
  /** True iff someone holds the lock right now. Diagnostics only. */
  get isLocked() {
    return this.locked;
  }
  /** Pending waiter count. Diagnostics only. */
  get pending() {
    return this.waiters.length;
  }
  /** Build a release function that forwards to `release()` at most once. */
  #releaseOnce() {
    let released = false;
    return () => {
      if (released) return;
      released = true;
      this.release();
    };
  }
};
1287
-
1288
- // src/locked-jsonl-appender.ts
1289
// One shared Mutex per path string, created on first request.
var mutexes = /* @__PURE__ */ new Map();
/**
 * Return the mutex registered for `path`, creating and registering it on
 * first use so every caller asking for the same path shares one instance.
 *
 * @param {string} path
 * @returns {Mutex}
 */
function getMutex(path) {
  const existing = mutexes.get(path);
  if (existing) {
    return existing;
  }
  const created = new Mutex();
  mutexes.set(path, created);
  return created;
}
1298
/**
 * Appends JSON lines to a file, running each write under the mutex that
 * `getMutex` returns for that path.
 */
var LockedJsonlAppender = class {
  /**
   * @param {string} path - Target file; its parent directory is created
   *   (recursively) when it does not exist yet.
   */
  constructor(path) {
    this.path = path;
    this.mutex = getMutex(path);
    const parentDir = dirname(path);
    if (!existsSync(parentDir)) {
      mkdirSync(parentDir, { recursive: true });
    }
  }
  path;
  mutex;
  /**
   * Serialize `entry` with `JSON.stringify` and append it as one line.
   * @param {*} entry
   * @returns {Promise<void>}
   */
  async append(entry) {
    const serialized = JSON.stringify(entry);
    const line = `${serialized}\n`;
    await this.mutex.runExclusive(() => {
      appendFileSync(this.path, line);
    });
  }
};
1316
/**
 * Empty the per-path mutex registry. As the name says, intended for tests:
 * appenders constructed earlier keep the mutex they already hold, while
 * appenders constructed afterwards receive fresh ones.
 */
function resetLockedAppendersForTesting() {
  mutexes.clear();
}
1319
-
1320
- // src/analyst/findings-store.ts
1321
/**
 * JSONL-backed store of findings: one JSON object per line, each stamped
 * with the `run_id` it was appended under.
 */
var FindingsStore = class {
  /** @param {string} path - JSONL file backing the store. */
  constructor(path) {
    this.path = path;
    this.appender = new LockedJsonlAppender(path);
  }
  path;
  appender;
  /**
   * Append findings one at a time, in order, each with `run_id` set.
   * @param {string} runId
   * @param {Iterable<object>} findings
   * @returns {Promise<void>}
   */
  async append(runId, findings) {
    for (const finding of findings) {
      await this.appender.append({ ...finding, run_id: runId });
    }
  }
  /** Load every persisted finding. Discards malformed trailing lines silently. */
  loadAll() {
    if (!existsSync2(this.path)) return [];
    const text = readFileSync(this.path, "utf8");
    if (!text) return [];
    const rows = [];
    text.split("\n").forEach((line) => {
      if (!line) return;
      try {
        rows.push(JSON.parse(line));
      } catch {
        // Unparseable line: skipped on purpose.
      }
    });
    return rows;
  }
  /** Filter to a single run. */
  loadRun(runId) {
    const everything = this.loadAll();
    return everything.filter((row) => row.run_id === runId);
  }
};
1354
/**
 * Default "did this finding materially change?" predicate.
 *
 * A change is material when the severity differs, the confidence moved by
 * more than 0.05 (a missing confidence counts as 0), or the number of
 * evidence refs differs. Checks run in that order and stop at the first hit.
 *
 * @param {object} a - Previous finding.
 * @param {object} b - Current finding.
 * @returns {boolean}
 */
function defaultIsMaterial(a, b) {
  return (
    a.severity !== b.severity ||
    Math.abs((a.confidence ?? 0) - (b.confidence ?? 0)) > 0.05 ||
    a.evidence_refs.length !== b.evidence_refs.length
  );
}
1360
/**
 * Diff two sets of findings by `finding_id`.
 *
 * @param {object[]} previous - Findings from the earlier run.
 * @param {object[]} current - Findings from the later run.
 * @param {object} [policy]
 * @param {Function} [policy.isMaterial] - `(prev, cur) => boolean`; defaults to `defaultIsMaterial`.
 * @returns {{appeared: object[], disappeared: object[], persisted: object[], changed: object[]}}
 *   `appeared`: only in `current`; `disappeared`: only in `previous`;
 *   `persisted`: in both with no material change (current version);
 *   `changed`: in both with a material change, as `{ previous, current }` pairs.
 */
function diffFindings(previous, current, policy = {}) {
  const isMaterial = policy.isMaterial ?? defaultIsMaterial;
  const indexById = (list) => new Map(list.map((finding) => [finding.finding_id, finding]));
  const before = indexById(previous);
  const after = indexById(current);

  const diff = { appeared: [], disappeared: [], persisted: [], changed: [] };

  after.forEach((cur, id) => {
    const prev = before.get(id);
    if (!prev) {
      diff.appeared.push(cur);
    } else if (isMaterial(prev, cur)) {
      diff.changed.push({ previous: prev, current: cur });
    } else {
      diff.persisted.push(cur);
    }
  });

  before.forEach((prev, id) => {
    if (!after.has(id)) {
      diff.disappeared.push(prev);
    }
  });

  return diff;
}
1385
-
1386
- // src/analyst/kind-factory.ts
1387
- import { AxJSRuntime, agent } from "@ax-llm/ax";
1388
/**
 * Build a trace-store analyst from a kind spec.
 *
 * `analyze(store, ctx)` constructs an `@ax-llm/ax` agent from the spec, runs
 * it once with a question derived from `ctx.tags.focus`, then validates each
 * emitted row:
 *   1. schema validation via `parseRawFinding`;
 *   2. when `KIND_EXPECTED_SUBJECTS` has an entry for `spec.id` and the row
 *      has a subject, the parsed subject kind must be in that list;
 *   3. the optional `spec.postProcess(parsed, ctx)` hook.
 *
 * Post-process contract: returning `null` drops the finding; returning
 * `undefined` keeps the parsed finding unchanged; any other falsy value also
 * drops it; anything else replaces the finding. Previously the hook result
 * was combined with `?? parsed`, which turned `null` back into the parsed
 * finding and made the drop check right after it unreachable for `null`.
 *
 * @param {object} spec - Kind spec; reads `id`, `description`, `area`, `version`,
 *   `cost`, `actorDescription`, `responderDescription`, `buildTools`,
 *   `recursion`, `maxTurns`, `maxRuntimeChars`, `postProcess`.
 * @param {object} opts
 * @param {*} opts.ai - Passed to `ax.forward` as its first argument.
 * @param {string} [opts.model] - Model set on both actor and responder options.
 * @param {string} [opts.versionSuffix] - Appended to `spec.version` as `+<suffix>`.
 * @returns {object} Analyst with `id`, `description`, `inputKind`, `cost`, `version`, `analyze`.
 */
function createTraceAnalystKind(spec, opts) {
  const version = opts.versionSuffix ? `${spec.version}+${opts.versionSuffix}` : spec.version;
  return {
    id: spec.id,
    description: spec.description,
    inputKind: "trace-store",
    cost: spec.cost,
    version,
    async analyze(store, ctx) {
      const tools = spec.buildTools(store);
      const maxDepth = spec.recursion?.maxDepth ?? 0;
      const maxParallel = spec.recursion?.maxParallelSubagents ?? 2;
      const priorContext = renderPriorFindings(ctx.priorFindings);
      const actorDescription = spec.actorDescription.trim() + priorContext + "\n\n" + RAW_FINDING_SCHEMA_PROMPT + "\n\nReturn the array in the `findings` output field. Use `final(...)` with the structured `{ findings }` payload when you are done.";
      const ax = agent(
        "question:string -> findings:json[]",
        {
          agentIdentity: {
            name: spec.id,
            description: spec.description
          },
          contextFields: ["question"],
          runtime: new AxJSRuntime({
            permissions: [],
            blockDynamicImport: true,
            allowedModules: [],
            freezeIntrinsics: true,
            blockShadowRealm: true,
            preventGlobalThisExtensions: false
          }),
          // Recursion options are only supplied when the spec asks for depth > 0.
          mode: maxDepth > 0 ? "advanced" : "simple",
          recursionOptions: maxDepth > 0 ? { maxDepth } : void 0,
          maxTurns: spec.maxTurns ?? 12,
          maxRuntimeChars: spec.maxRuntimeChars ?? 6e3,
          maxBatchedLlmQueryConcurrency: maxParallel,
          promptLevel: "detailed",
          contextPolicy: { preset: "full", budget: "balanced" },
          functions: { local: tools },
          actorOptions: {
            description: actorDescription,
            ...opts.model ? { model: opts.model } : {},
            showThoughts: false,
            thinkingTokenBudget: "none"
          },
          responderOptions: {
            description: spec.responderDescription ?? "Format the structured `findings` array exactly as the actor produced it. Do not add, drop, or summarize entries.",
            ...opts.model ? { model: opts.model } : {},
            showThoughts: false
          },
          bubbleErrors: [TraceFileMissingError]
        }
      );
      ctx.log?.(`analyst.kind ${spec.id} forward`, {
        max_depth: maxDepth,
        tool_count: tools.length,
        tags: ctx.tags
      });
      const result = await ax.forward(opts.ai, { question: deriveQuestion(ctx, spec) });
      const expectedSubjects = KIND_EXPECTED_SUBJECTS[spec.id];
      const out = [];
      // A non-array `findings` output is treated as "no findings".
      const rawRows = Array.isArray(result.findings) ? result.findings : [];
      let rejectedWrongKind = 0;
      for (const row of rawRows) {
        const parsed = parseRawFinding(row, ctx.log);
        if (!parsed) continue;
        if (expectedSubjects && parsed.subject !== void 0) {
          const parsedSubject = parseFindingSubject(parsed.subject);
          if (parsedSubject === null) {
            ctx.log?.("finding rejected: subject failed to parse", {
              kind: spec.id,
              subject: parsed.subject
            });
            rejectedWrongKind += 1;
            continue;
          }
          if (!expectedSubjects.includes(parsedSubject.kind)) {
            ctx.log?.("finding rejected: subject variant not allowed for this kind", {
              kind: spec.id,
              subject_kind: parsedSubject.kind,
              subject: parsed.subject,
              allowed: expectedSubjects
            });
            rejectedWrongKind += 1;
            continue;
          }
        }
        let postProcessed = parsed;
        if (spec.postProcess) {
          const hookResult = spec.postProcess(parsed, ctx);
          // `null` is an explicit "drop this finding".
          if (hookResult === null) continue;
          // `undefined` means the hook had nothing to change.
          if (hookResult !== void 0) postProcessed = hookResult;
        }
        if (!postProcessed) continue;
        out.push(toAnalystFinding(spec, postProcessed));
      }
      ctx.log?.(`analyst.kind ${spec.id} done`, {
        emitted: rawRows.length,
        accepted: out.length,
        rejected_wrong_subject: rejectedWrongKind
      });
      return out;
    }
  };
}
1487
/**
 * Build the question string handed to the agent.
 *
 * @param {object} ctx - Analyze context; `ctx.tags.focus` is optional.
 * @param {object} spec - Kind spec; only `spec.id` is read.
 * @returns {string} `"<id>: <focus>"` when a non-blank focus tag is present,
 *   otherwise just `<id>`.
 */
function deriveQuestion(ctx, spec) {
  const focus = ctx.tags?.focus?.trim();
  return focus ? `${spec.id}: ${focus}` : spec.id;
}
1492
/**
 * Convert a validated raw finding into the analyst finding shape.
 *
 * The single evidence ref takes its `kind` from the URI scheme via
 * `evidenceKindFromUri`. `metadata.kind_version` records `spec.version`.
 *
 * @param {object} spec - Kind spec; reads `id`, `area`, `version`.
 * @param {object} raw - Raw finding that passed schema validation.
 * @returns {*} Whatever `makeFinding` returns for the assembled payload.
 */
function toAnalystFinding(spec, raw) {
  const evidenceRef = {
    kind: evidenceKindFromUri(raw.evidence_uri),
    uri: raw.evidence_uri,
    excerpt: raw.evidence_excerpt
  };
  const payload = {
    analyst_id: spec.id,
    area: spec.area,
    subject: raw.subject,
    claim: raw.claim,
    rationale: raw.rationale,
    severity: raw.severity,
    confidence: raw.confidence,
    evidence_refs: [evidenceRef],
    recommended_action: raw.recommended_action,
    metadata: { kind_version: spec.version }
  };
  return makeFinding(payload);
}
1512
/**
 * Map an evidence URI to its evidence kind by scheme.
 *
 * @param {string} uri
 * @returns {string} "span", "artifact", "metric", "event" or "finding" for the
 *   matching `<kind>://` prefix; "artifact" for anything else.
 */
function evidenceKindFromUri(uri) {
  const knownKinds = ["span", "artifact", "metric", "event", "finding"];
  const matched = knownKinds.find((kind) => uri.startsWith(`${kind}://`));
  return matched ?? "artifact";
}
1520
/**
 * Render prior findings as a prompt section to append to the actor
 * description.
 *
 * At most 40 findings are listed; when more exist, a trailing line reports
 * how many were left out. Each claim is shortened to 160 characters.
 *
 * The section starts with two newlines so it is separated from the text it
 * is appended to. The previous implementation listed those two empty
 * strings but then ran the whole array through `.filter(Boolean)`, which
 * removed them, so "PRIOR FINDINGS ..." ended up glued onto the last line of
 * the preceding description. Only the optional overflow line is now
 * conditional.
 *
 * @param {object[]|null|undefined} prior - Findings from an earlier run;
 *   reads `finding_id`, `severity`, `subject`, `claim`.
 * @returns {string} The section text, or "" when there are no prior findings.
 */
function renderPriorFindings(prior) {
  if (!prior || prior.length === 0) return "";
  const MAX_ROWS = 40;
  const rows = prior.slice(0, MAX_ROWS).map((f) => {
    const subject = f.subject ? ` [${f.subject}]` : "";
    return `  - id=${f.finding_id} ${f.severity}${subject} ${truncateForContext(f.claim, 160)}`;
  });
  const lines = [
    "",
    "",
    "PRIOR FINDINGS (from a previous run on related data):",
    "When the work you do now matches a row below, REUSE the `finding_id` (pass it as `id_basis`) so the cross-run diff stays stable.",
    "A finding that reappears with no remediation evidence SHOULD raise its `confidence` and may justify a higher `severity`.",
    ...rows
  ];
  if (prior.length > MAX_ROWS) {
    // The leading newline leaves a blank line between the rows and this note.
    lines.push(`
  ... +${prior.length - MAX_ROWS} more prior findings (older history truncated)`);
  }
  return lines.join("\n");
}
/**
 * Shorten `s` to at most `max` characters, ending with an ellipsis when cut.
 *
 * @param {string} s
 * @param {number} max
 * @returns {string} `s` unchanged when it fits; otherwise the first
 *   `max - 1` characters, right-trimmed, followed by "\u2026".
 */
function truncateForContext(s, max) {
  if (s.length <= max) return s;
  return `${s.slice(0, max - 1).trimEnd()}\u2026`;
}
1543
-
1544
- // src/analyst/tool-groups.ts
1545
// Tool-name allow-lists per tool group, used by `buildTraceToolsForGroup`
// to filter the full tool set by `tool.name`. The `all` entry is an empty
// set: that function returns the unfiltered tool list for "all" before
// consulting this table.
var TOOL_NAMES_BY_GROUP = {
  all: /* @__PURE__ */ new Set(),
  discovery: /* @__PURE__ */ new Set(["getDatasetOverview", "queryTraces", "countTraces"]),
  discoveryAndRead: /* @__PURE__ */ new Set([
    "getDatasetOverview",
    "queryTraces",
    "countTraces",
    "viewTrace",
    "viewSpans"
  ]),
  discoveryAndSearch: /* @__PURE__ */ new Set([
    "getDatasetOverview",
    "queryTraces",
    "countTraces",
    "searchTrace",
    "searchSpan"
  ]),
  targeted: /* @__PURE__ */ new Set(["getDatasetOverview", "queryTraces", "viewSpans", "searchSpan"])
};
1564
/**
 * Build the trace-analyst tools for `store`, restricted to a named group.
 *
 * @param {string} group - "all" or a key of `TOOL_NAMES_BY_GROUP`.
 * @param {*} store - Trace store handed to `buildTraceAnalystTools`.
 * @returns {object[]} Every tool for "all"; otherwise only the tools whose
 *   `name` is in the group's allow-list.
 * @throws {Error} When `group` has no entry in `TOOL_NAMES_BY_GROUP`.
 */
function buildTraceToolsForGroup(group, store) {
  const everyTool = buildTraceAnalystTools({ store });
  if (group === "all") {
    return everyTool;
  }
  const allowedNames = TOOL_NAMES_BY_GROUP[group];
  if (!allowedNames) {
    throw new Error(`unknown trace tool group: ${group}`);
  }
  const isAllowed = (tool) => allowedNames.has(tool.name);
  return everyTool.filter(isAllowed);
}
1571
-
1572
- // src/analyst/kinds/failure-mode.ts
1573
// Actor prompt for the "failure-mode" analyst kind; referenced as
// `actorDescription` by FAILURE_MODE_KIND_SPEC. Backticks and backslashes
// are escaped because the text lives in a template literal.
var ACTOR_PROMPT = `You are a failure-mode classifier for an OTLP trace dataset. Your job is to identify the **distinct ways agents failed** in this dataset, not to grade individual runs.

DISCOVERY \u2192 CLUSTER \u2192 CITE protocol:

1. Call \`traces.getDatasetOverview({})\` first. Use \`has_errors\`, \`models\`, \`agent_names\`, \`tools\`, and \`sample_trace_ids\` to size the failure surface.
2. Use \`traces.queryTraces({ filters: { has_errors: true }, limit })\` to pull error-bearing traces. Combine with \`traces.countTraces\` to see what fraction of the dataset failed.
3. For each candidate failure cluster, use \`traces.searchTrace\` with regex like \`STATUS_CODE_ERROR\`, \`MaxTurnsExceeded\`, \`assertion\`, \`unauthorized\`, \`timeout\`, \`429\`, \`5\\d\\d\`, the agent's specific error strings, or the names of its tools. Pull one or two representative traces per cluster, **not all** of them.
4. **Cluster, do not enumerate.** Two failures with the same root cause should be ONE finding citing both traces, not two findings. The point of this analyst is to compress N runs into K modes.
5. For each cluster you can defend with evidence, emit ONE finding with:
   - \`area\` = "failure-mode"
   - \`subject\` = a short label for the cluster ("tool-call-loop", "auth-revoked-mid-run", "agent-asked-clarification-too-late", ...)
   - \`claim\` = one sentence stating the mode
   - \`severity\` = "critical" when it blocks the run, "high" when the run finished degraded, "medium" when it slowed convergence
   - \`evidence_uri\` = \`span://<trace_id>/<span_id>\` of the most representative span
   - \`evidence_excerpt\` = the exact quote (e.g. error message, stuck tool call payload, contradictory turn output)
   - \`confidence\` = 0.85+ when multiple traces show the same shape; 0.6-0.8 for a single-trace inference; <0.5 for speculative.
   - \`recommended_action\` = imperative-phrased fix idea (kept short \u2014 the improvement-analyst will expand on these)

If the dataset has no failures, return an empty findings array \u2014 do NOT pad with low-confidence speculation.

**Delegate aggressively.** The recursion budget is there to be used:
- After your first \`getDatasetOverview\` + \`queryTraces\` calls, you should have 3-6 candidate failure clusters in mind. Spawn one \`llmQuery\` per cluster in a single batch \u2014 they investigate in parallel.
- A sub-investigator that finds its cluster is actually two distinct modes should split again at its own level. Recursion is meant to discover sub-modes, not to do trivial drilling that the parent could do in-line.
- Pass narrow context to each subagent: { question: 'investigate the auth-revoked-mid-run cluster', context: { trace_ids: ['abc', 'def'], suspected_root_cause: 'token refresh skipped on idle sessions' } }. Subagents need enough context to skip re-discovery but not the whole conversation.
- Each subagent returns its findings as JSON; the parent merges them. Do NOT have subagents call \`final()\` \u2014 they return their findings list to you, and you call \`final()\` once at the top.

OBSERVABILITY rules:
- Each non-final turn must emit at least one \`console.log\` for evidence.
- Reuse runtime variables across turns; don't recompute.
- Call \`final({ findings: [...] })\` exactly once, after you've gathered evidence for every cluster you intend to report.`;
1603
// Kind spec for the "failure-mode" analyst, in the shape read by
// `createTraceAnalystKind`: it gets the full tool set (group "all"),
// recursion up to depth 3 with 4 parallel sub-queries, and a 24-turn budget.
var FAILURE_MODE_KIND_SPEC = {
  id: "failure-mode",
  description: "Clusters trace-dataset failures into distinct failure modes with cited evidence and a short recommended action.",
  area: "failure-mode",
  version: "1.0.0",
  actorDescription: ACTOR_PROMPT,
  buildTools: (store) => buildTraceToolsForGroup("all", store),
  recursion: { maxDepth: 3, maxParallelSubagents: 4 },
  maxTurns: 24,
  cost: { kind: "llm" }
};
1614
-
1615
- // src/analyst/kinds/improvement.ts
1616
- var ACTOR_PROMPT2 = `You are a recursive-self-improvement analyst. Your job is to propose **concrete, locus-named edits** the agent's runtime should adopt to fix the failure modes, knowledge gaps, and poisonings present in this dataset.
1617
-
1618
- Upstream analysts have already classified the problems. Your job is to convert each problem into a *change to make* and grade its expected leverage. Each finding is one proposed edit.
1619
-
1620
- DISCOVERY \u2192 CANDIDATE-FIXES \u2192 COMPETE \u2192 CITE protocol:
1621
-
1622
- 1. \`traces.getDatasetOverview({})\` first. Note the agents, tools, and any system-prompt fingerprints (look for the prompt text echoed in early spans).
1623
- 2. For each high-severity failure pattern, generate 2-3 candidate fixes. Real candidate axes:
1624
- - **System-prompt edit** \u2014 add an instruction, remove a misleading one, restructure precedence
1625
- - **Tool description edit** \u2014 rewrite a tool's description so the agent picks it correctly / passes valid args
1626
- - **New tool** \u2014 add a tool the agent kept emulating in code
1627
- - **RAG ingestion** \u2014 add a document or correct a stale one
1628
- - **Memory invalidation** \u2014 clear cached prior-run decisions that no longer apply
1629
- - **Scaffolding** \u2014 add a precondition check, a retry policy, a turn budget, a verification step
1630
- - **Output schema** \u2014 narrow the agent's output to forbid the failure shape
1631
- 3. **Compete candidate fixes via subagents.** For each failure cluster, spawn one \`llmQuery\` per candidate-fix axis you want to evaluate. Each subagent's job: simulate the fix on the cited traces and report (i) likely effect, (ii) side effects, (iii) implementation cost as small/medium/large. Pass the cluster's failing trace_ids and the candidate axis as context.
1632
- 4. After subagents return, **pick the winning candidate per cluster** based on (effect / cost) and emit ONE finding. Discard the losing candidates \u2014 the output is the recommendation, not the candidate set.
1633
- 5. **Cross-reference upstream findings.** If a finding cites a prior failure-mode or knowledge-gap finding, use \`evidence_uri = "finding://<prior-finding-id>"\` (the registry supports this kind). This builds the dependency graph that lets the dashboard show "fix #X resolves failure modes A, B, C."
1634
-
1635
- For each winning recommendation, emit ONE finding with:
1636
- - \`area\` = "improvement"
1637
- - \`subject\` = the locus to edit: \`system-prompt:<section>\`, \`tool-doc:<tool-name>\`, \`new-tool:<proposed-name>\`, \`rag:<corpus>:<doc-id>\`, \`memory:<key>\`, \`scaffolding:<concern>\`, \`output-schema:<field>\`
1638
- - \`claim\` = one sentence stating the edit ("Add a precondition check to refuse tool X calls without arg Y")
1639
- - \`severity\` = leverage rating: "critical" when fix resolves a critical failure mode; "high" when it resolves a high; "medium" when it's a quality-of-life win; "info" when it's a cleanup with no behavioral effect
1640
- - \`evidence_uri\` = the failure-mode finding id this fix targets (\`finding://<id>\`) when it exists; else the most representative span
1641
- - \`evidence_excerpt\` = a fragment showing the problem the fix targets
1642
- - \`confidence\` = 0.85+ when the fix is mechanical and the failure mode is well-evidenced; 0.6-0.8 when the fix requires judgment; <0.5 for speculative
1643
- - \`rationale\` = why this candidate beat its alternatives (2 sentences max)
1644
- - \`recommended_action\` = the **literal edit**, phrased as a diff or a quoted replacement: "Replace section X with: '...'" or "Add tool with description: '...'" or "Set retry policy to max_attempts=3 with exponential backoff"
1645
-
1646
- If no upstream failure findings exist in this run, derive your own from the trace dataset using the failure-mode protocol inline (\`searchTrace\` for STATUS_CODE_ERROR / MaxTurnsExceeded / etc.). But prefer to consume upstream findings when present \u2014 the kinds are designed to chain.
1647
-
1648
- Do NOT propose a fix you cannot defend with evidence. "Tighten the prompt" is not a finding; "Add 'When the user asks for X, always Y' to the system prompt section "request-classification"" is.
1649
-
1650
- OBSERVABILITY rules:
1651
- - Each non-final turn must emit at least one \`console.log\` for evidence.
1652
- - Call \`final({ findings: [...] })\` exactly once at the top level.`;
1653
/**
 * Kind spec for the "improvement" analyst: pairs ACTOR_PROMPT2 with the
 * "all" trace-tool group; recursion depth 3 with 4 parallel subagents,
 * 30 turns, and a 12000-character runtime cap.
 */
var IMPROVEMENT_KIND_SPEC = {
  id: "improvement",
  description: "Converts upstream failure / gap / poisoning findings into concrete locus-named edits (prompt, tool-doc, RAG, scaffolding) with leverage grades.",
  area: "improvement",
  version: "1.0.0",
  actorDescription: ACTOR_PROMPT2,
  buildTools: (traceStore) => {
    return buildTraceToolsForGroup("all", traceStore);
  },
  recursion: { maxDepth: 3, maxParallelSubagents: 4 },
  maxTurns: 30,
  maxRuntimeChars: 12e3,
  cost: { kind: "llm" }
};
1665
-
1666
- // src/analyst/kinds/knowledge-gap.ts
1667
- var ACTOR_PROMPT3 = `You are a knowledge-gap analyst for an OTLP trace dataset. Your job is to identify the **specific pieces of information the agent lacked, or that were stale**, that caused poor decisions.
1668
-
1669
- The agent under analysis maintains a curated knowledge base via \`@tangle-network/agent-knowledge\` \u2014 a wiki of \`KnowledgePage\`s with raw source anchors, claims, and relations. The primary expected store of agent-knowable facts IS that wiki. A "knowledge gap" is anything the agent had to discover or guess at run-time that the wiki should have held \u2014 or an outdated/contradictory fact the agent picked up from a non-wiki source.
1670
-
1671
- DISCOVERY \u2192 ATTRIBUTE-TO-LAYER \u2192 CITE protocol:
1672
-
1673
- 1. \`traces.getDatasetOverview({})\` first. Note which agents, tools, and models appear.
1674
- 2. Pull traces where the agent shows gap signals. The strongest signals are:
1675
- - Self-correction turns ("I assumed X but\u2026", "let me re-check", "actually,")
1676
- - Clarifying-question turns where the agent asked the user something the runtime should have surfaced
1677
- - Repeated retrieval / lookup calls for the same artifact with slightly varied queries
1678
- - Tool errors that name a missing argument or unknown resource
1679
- - Web-search calls returning pages dated before a known cutoff for content that changes (versioned APIs, schemas, policies)
1680
- - Agent quoting a tool's docs / system prompt incorrectly because the actual text was insufficient
1681
- - Fabricated identifiers that don't appear in dataset \`sample_trace_ids\`
1682
- Use \`traces.searchTrace\` with patterns like \`I (don.?t|do not) know\`, \`assumed\`, \`unclear\`, \`could you (clarify|tell me|provide)\`, \`not found\`, \`undefined\`, \`unknown\`, \`null\`, dates older than the analysis window, or the agent's specific clarification phrases.
1683
- 3. For each gap, identify the **layer of the runtime that should have prevented it**. The locus is the value of \`subject\` on the finding. Use one of:
1684
- - \`agent-knowledge:wiki:<page-slug>\` \u2014 the wiki page that should exist but doesn't, or exists but lacks the claim
1685
- - \`agent-knowledge:wiki:<page-slug>#<heading>\` \u2014 wiki page exists but a specific section is missing
1686
- - \`agent-knowledge:claim:<topic>\` \u2014 a specific claim/relation triple that should be in the wiki
1687
- - \`agent-knowledge:raw:<source-id>\` \u2014 raw source captured but never lifted into a curated page
1688
- - \`agent-knowledge:stale:<page-slug>\` \u2014 wiki page exists but contradicts ground-truth evidence in this trace (the wiki itself drifted)
1689
- - \`websearch:outdated:<topic>\` \u2014 agent relied on a web result that was stale; wiki should have superseded it
1690
- - \`tool-doc:<tool-name>:<aspect>\` \u2014 tool description missed a behavior aspect (return shape, failure modes, side effects)
1691
- - \`system-prompt:<section>\` \u2014 system prompt should have stated the rule directly
1692
- - \`memory:<key>\` \u2014 prior-run memory should have surfaced an earlier decision
1693
- 4. For each gap you can defend with evidence, emit ONE finding with:
1694
- - \`area\` = "knowledge-gap"
1695
- - \`subject\` = the locus string from the list above
1696
- - \`claim\` = a sentence naming the missing or stale knowledge ("wiki has no page on invoice line-item shape, agent had to re-derive it from raw spans")
1697
- - \`severity\` = "high" when the gap caused a failure or a clarifying question; "medium" when it caused unnecessary turns; "low" when it caused minor inefficiency
1698
- - \`evidence_uri\` = \`span://<trace_id>/<span_id>\` of the moment the gap surfaced (the question, the self-correction, the retrieval miss, the stale web result)
1699
- - \`evidence_excerpt\` = exact quote where the agent showed the gap
1700
- - \`confidence\` = 0.85+ when the agent itself articulated the gap; 0.6-0.8 when inferred from behavior
1701
- - \`recommended_action\` = phrased as a wiki edit when the locus is \`agent-knowledge:*\` ("Create wiki page \`invoice-line-items\` with claims: ..."), or as a prompt/tool-doc edit otherwise
1702
-
1703
- **Delegate per layer.** After your first scan, you should have candidates spread across \`agent-knowledge:*\`, \`websearch:outdated\`, \`tool-doc:*\`, \`system-prompt:*\`, and \`memory:*\`. Spawn one \`llmQuery\` per layer in parallel \u2014 each subagent runs a focused detection (e.g. the \`agent-knowledge\` subagent looks for both missing-pages AND stale-pages; the \`websearch\` subagent looks specifically for date staleness signals; the \`tool-doc\` subagent looks for tool-call argument errors a fuller description would have prevented). Subagents return findings; you merge and emit one \`final({ findings })\` at the top.
1704
-
1705
- Do NOT report a gap that the agent later recovered from cleanly within the same turn \u2014 that's resilience, not a gap. Cite the *non-recovery* version when both exist.
1706
-
1707
- OBSERVABILITY rules:
1708
- - Each non-final turn must emit at least one \`console.log\` for evidence.
1709
- - Call \`final({ findings: [...] })\` exactly once at the top level.`;
1710
/**
 * Kind spec for the "knowledge-gap" analyst: pairs ACTOR_PROMPT3 with the
 * "discoveryAndSearch" trace-tool group; recursion depth 2 with 4 parallel
 * subagents and 18 turns.
 */
var KNOWLEDGE_GAP_KIND_SPEC = {
  id: "knowledge-gap",
  description: "Identifies missing or stale pieces of knowledge \u2014 primarily against the agent-knowledge wiki \u2014 and attributes each to the runtime layer (wiki page, claim, raw source, websearch, tool-doc, system-prompt, memory) that should have held it.",
  area: "knowledge-gap",
  version: "1.0.0",
  actorDescription: ACTOR_PROMPT3,
  buildTools: (traceStore) => {
    return buildTraceToolsForGroup("discoveryAndSearch", traceStore);
  },
  recursion: { maxDepth: 2, maxParallelSubagents: 4 },
  maxTurns: 18,
  cost: { kind: "llm" }
};
1721
-
1722
- // src/analyst/kinds/knowledge-poisoning.ts
1723
- var ACTOR_PROMPT4 = `You are a knowledge-poisoning analyst for an OTLP trace dataset. Your job is to identify cases where the agent **confidently used wrong information** \u2014 not where it lacked information (that's the knowledge-gap analyst).
1724
-
1725
- DISCOVERY \u2192 DUAL-VERIFY \u2192 CITE protocol:
1726
-
1727
- 1. \`traces.getDatasetOverview({})\` first. Identify the agents, models, and tools.
1728
- 2. Pull traces where the agent's confident action was later contradicted. Strongest signals:
1729
- - Agent stated a fact in one span; a later span surfaced contradictory evidence; the agent then proceeded anyway or fabricated reconciliation.
1730
- - Tool call with stale arguments (an id that no longer exists, an API shape that changed).
1731
- - Agent cited an \`agent-knowledge\` wiki page or claim whose content contradicts the trace's own evidence \u2014 the wiki itself drifted.
1732
- - Web-search result the agent cited that returned an outdated page; agent treated it as canonical.
1733
- - System-prompt instruction the agent followed that ground-truth evidence in the trace contradicts (e.g. prompt says "use endpoint A"; tool reply says "endpoint A deprecated, use B").
1734
- - Repeated wrong-shape parsing despite the tool's actual output proving the shape.
1735
- 3. Use \`traces.searchTrace\` with regex on phrases like \`actually\`, \`turns out\`, \`previously assumed\`, \`old version\`, \`deprecated\`, \`updated to\`, \`now uses\`, or specific entity names you suspect have changed.
1736
- 4. For each candidate poisoning, **DUAL-VERIFY**:
1737
- - Confirm the agent actually acted on the false belief (cite the span where it did)
1738
- - Confirm the belief is actually false in this trace's own evidence (cite the span that contradicts it)
1739
- Only emit a finding when both halves are nailed down. If you can only nail one, drop it \u2014 single-evidence poisoning findings are too speculative to be useful.
1740
-
1741
- **Delegate the dual-verify.** Use the recursion budget so each candidate poisoning gets one subagent investigating "did the agent act?" and one investigating "is the belief false?". After your first scan, fire off N parallel \`llmQuery\` pairs (one cluster per pair). Subagents return their findings; you accept only the ones where BOTH halves of the pair were confirmed.
1742
-
1743
- For each confirmed poisoning, emit ONE finding with:
1744
- - \`area\` = "knowledge-poisoning"
1745
- - \`subject\` = the source of the false belief, one of: \`agent-knowledge:wiki:<page-slug>\` (wiki page contradicts current ground truth), \`agent-knowledge:claim:<topic>\` (a specific claim/relation went stale), \`agent-knowledge:raw:<source-id>\` (the raw source is outdated and the wiki inherited the drift), \`websearch:outdated:<url-or-topic>\`, \`tool-doc:<tool>\`, \`system-prompt:<section>\`, \`memory:<key>\`, \`prior-run-summary:<topic>\`
1746
- - \`claim\` = one sentence: "agent believed X (from source S); evidence in trace shows X is false"
1747
- - \`severity\` = "critical" when poisoning caused a wrong user-visible action; "high" when caught internally but wasted significant work; "medium" for inefficiency only
1748
- - \`evidence_uri\` = \`span://<trace_id>/<span_id>\` of the action span (the moment the agent acted on the false belief)
1749
- - \`evidence_excerpt\` = exact quote of the confident-but-wrong claim or action
1750
- - \`confidence\` = 0.85+ when both halves are exact-quote backed; 0.6-0.8 when one half is inferred
1751
- - \`recommended_action\` = where the source should be updated and how ("Update wiki page \`X\` claim \`Y\` to '...'", "Invalidate raw source \`Z\` and re-curate", "Replace system-prompt section X with 'tool foo now returns Y'")
1752
-
1753
- Do NOT report a finding if the agent caught and corrected the false belief in the same turn \u2014 that's the system working. Reserve poisoning for cases where the false belief shaped downstream action.
1754
-
1755
- OBSERVABILITY rules:
1756
- - Each non-final turn must emit at least one \`console.log\` for evidence.
1757
- - Call \`final({ findings: [...] })\` exactly once at the top level.`;
1758
/**
 * Kind spec for the "knowledge-poisoning" analyst: pairs ACTOR_PROMPT4 with
 * the "all" trace-tool group; recursion depth 2 with 4 parallel subagents
 * and 20 turns.
 */
var KNOWLEDGE_POISONING_KIND_SPEC = {
  id: "knowledge-poisoning",
  description: "Identifies confident-but-wrong actions caused by stale memory, contradicting RAG, deprecated tool docs, or outdated system-prompt instructions.",
  area: "knowledge-poisoning",
  version: "1.0.0",
  actorDescription: ACTOR_PROMPT4,
  buildTools: (traceStore) => {
    return buildTraceToolsForGroup("all", traceStore);
  },
  recursion: { maxDepth: 2, maxParallelSubagents: 4 },
  maxTurns: 20,
  cost: { kind: "llm" }
};
1769
-
1770
- // src/analyst/kinds/index.ts
1771
// Built-in trace-analyst kind specs, listed in this fixed order:
// failure-mode, knowledge-gap, knowledge-poisoning, improvement.
var DEFAULT_TRACE_ANALYST_KINDS = [FAILURE_MODE_KIND_SPEC, KNOWLEDGE_GAP_KIND_SPEC, KNOWLEDGE_POISONING_KIND_SPEC, IMPROVEMENT_KIND_SPEC];
1777
-
1778
- // src/analyst/kinds/skill-usage.ts
1779
- import { existsSync as existsSync3, readdirSync, readFileSync as readFileSync2, statSync } from "fs";
1780
- import { join } from "path";
1781
// Line-count threshold used when flagging an oversized SKILL.md.
var BLOAT_LINE_THRESHOLD = 300;
// Matches Tangle-internal references inside a skill body (case-insensitive, global).
// The word-like tokens are wrapped in \b...\b. The tokens that START with a
// non-word character ("@tangle-network/" and "~/company") must NOT be preceded
// by \b: a word boundary in front of "@" or "~" only exists when the previous
// character is a word character, so the usual forms (after a space, a quote,
// or at line start) would never match.
var TANGLE_PRIVATE_RE = /\b(?:cli-bridge|tangletools|ops-board|drew-gtr-pro|tangle\.tools|gtm-agent|kimi|tcloud)\b|@tangle-network\/|~\/company\b/gi;
// Detects an explicit "trigger:" / "triggers -" style clause.
var TRIGGER_RE = /triggers?\s*[:-]/i;
1784
/**
 * Lists the skills directly under `root`.
 *
 * A child counts as a skill when it is a directory (or a symbolic link) and
 * contains a `SKILL.md` file.
 *
 * @param {string} root - Directory to scan; a missing root yields `[]`.
 * @returns {{ name: string, path: string }[]} Child name and the path to its SKILL.md.
 */
function listSkillDirs(root) {
  if (!existsSync3(root)) return [];
  return readdirSync(root, { withFileTypes: true })
    .filter((dirent) => dirent.isDirectory() || dirent.isSymbolicLink())
    .map((dirent) => ({ name: dirent.name, path: join(root, dirent.name, "SKILL.md") }))
    .filter((candidate) => existsSync3(candidate.path));
}
1794
/**
 * Collects `.jsonl` file paths under `dir`, descending into subdirectories.
 *
 * @param {string} dir - Root directory; a missing directory yields `[]`.
 * @param {number} cap - When greater than 0, stop as soon as this many files
 *   have been collected; otherwise collect everything.
 * @returns {string[]} Paths of the discovered `.jsonl` files.
 */
function walkJsonl(dir, cap) {
  const found = [];
  if (!existsSync3(dir)) return found;
  const pending = [dir];
  while (pending.length > 0) {
    const current = pending.pop();
    let children;
    try {
      children = readdirSync(current, { withFileTypes: true });
    } catch {
      // Directory could not be listed: skip it and keep walking.
      continue;
    }
    for (const child of children) {
      const childPath = join(current, child.name);
      if (child.isDirectory()) {
        pending.push(childPath);
        continue;
      }
      if (!child.name.endsWith(".jsonl")) continue;
      found.push(childPath);
      if (cap > 0 && found.length >= cap) return found;
    }
  }
  return found;
}
1817
/**
 * Extracts the `description:` value from a leading `---` frontmatter block.
 *
 * Accepts LF or CRLF line endings and an optional leading UTF-8 BOM, so files
 * saved on Windows are parsed the same way as LF files.
 *
 * @param {string} body - Full file text.
 * @returns {string} The rest of the line after `description:` (matched
 *   case-insensitively), or `""` when there is no frontmatter or no
 *   description entry.
 */
function frontmatterDescription(body) {
  const fm = /^\uFEFF?---\r?\n([\s\S]*?)\r?\n---/.exec(body);
  const block = fm?.[1] ?? "";
  const m = /description:\s*(.+)/i.exec(block);
  return m?.[1] ?? "";
}
1823
/**
 * Counts on-disk artifacts attributed to one skill.
 *
 * For every root, the locations checked are `<root>/.evolve/<name>` plus
 * `<root>/<alias>` for each alias. An existing directory contributes its
 * number of entries; any other existing path contributes 1.
 *
 * @param {string[]} roots - Artifact roots to inspect.
 * @param {string} name - Skill name.
 * @param {string[]} aliases - Extra root-relative paths to check.
 * @returns {number} Total artifact count.
 */
function countArtifacts(roots, name, aliases) {
  let total = 0;
  for (const root of roots) {
    const locations = [join(root, ".evolve", name)];
    for (const alias of aliases) locations.push(join(root, alias));
    for (const location of locations) {
      if (!existsSync3(location)) continue;
      try {
        total += statSync(location).isDirectory() ? readdirSync(location).length : 1;
      } catch {
        // Best effort: a location that cannot be inspected contributes nothing.
      }
    }
  }
  return total;
}
1838
/**
 * Builds the per-skill usage report consumed by the skill-usage analyst.
 *
 * Signals gathered per skill:
 *  - direct invocations: `"skill": "<name>"` occurrences in transcript files
 *    (only the part after the last ":" of the captured value is used);
 *  - slash invocations: `<command-name>/<name></command-name>` occurrences;
 *  - inbound references: other skills whose SKILL.md mentions `/<name>` or
 *    `[[<name>]]`;
 *  - artifact count, private-reference count, and structural flags read
 *    from the skill's own directory and SKILL.md text.
 *
 * @param {object} config
 * @param {{ root: string, kind: string }[]} config.skillRoots - Skill roots and the kind to tag their skills with.
 * @param {string[]} config.transcriptDirs - Directories scanned for `.jsonl` transcripts.
 * @param {number} [config.maxTranscriptsPerDir] - Per-directory file cap (0 / unset = no cap).
 * @param {string[]} [config.artifactRoots] - Roots passed to `countArtifacts`.
 * @param {Record<string, string[]>} [config.artifactAliases] - Per-skill alias paths.
 * @returns {{ generatedFromTraces: number, records: object[] }} Number of transcript
 *   files visited and one record per discovered skill.
 */
function buildSkillUsageReport(config) {
  const skills = config.skillRoots.flatMap(
    ({ root, kind }) => listSkillDirs(root).map((s) => ({ ...s, kind }))
  );
  const names = skills.map((s) => s.name);
  const direct = new Map(names.map((n) => [n, 0]));
  const slash = new Map(names.map((n) => [n, 0]));
  const skillRe = /"skill"\s*:\s*"([a-z0-9_:-]+)"/g;
  const cmdRe = /<command-name>\/?([a-z0-9_:-]+)<\/command-name>/g;
  let transcripts = 0;
  for (const dir of config.transcriptDirs) {
    for (const file of walkJsonl(dir, config.maxTranscriptsPerDir ?? 0)) {
      transcripts += 1;
      let data;
      try {
        data = readFileSync2(file, "utf8");
      } catch {
        // Unreadable transcript: it stays counted but contributes no invocations.
        continue;
      }
      for (const m of data.matchAll(skillRe)) {
        const g = m[1];
        if (!g) continue;
        // Namespaced values ("plugin:skill") are credited to the last segment.
        const n = g.split(":").pop() ?? g;
        const prev = direct.get(n);
        if (prev !== void 0) direct.set(n, prev + 1);
      }
      for (const m of data.matchAll(cmdRe)) {
        const g = m[1];
        if (g === void 0) continue;
        const prev = slash.get(g);
        if (prev !== void 0) slash.set(g, prev + 1);
      }
    }
  }
  const bodies = new Map();
  for (const s of skills) {
    try {
      bodies.set(s.name, readFileSync2(s.path, "utf8"));
    } catch {
      bodies.set(s.name, "");
    }
  }
  // Skill names come from directory names and may contain regex
  // metacharacters ("c++", "foo.bar"); escape them so the pattern neither
  // throws nor over-matches.
  const escapeRegExp = (text) => text.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
  const inbound = new Map(names.map((n) => [n, 0]));
  for (const target of names) {
    const escaped = escapeRegExp(target);
    const ref = new RegExp(`/${escaped}\\b|\\[\\[${escaped}\\]\\]`);
    for (const s of skills) {
      if (s.name === target) continue;
      if (ref.test(bodies.get(s.name) ?? "")) inbound.set(target, inbound.get(target) + 1);
    }
  }
  const records = skills.map((s) => {
    const body = bodies.get(s.name) ?? "";
    // Parent directory of SKILL.md, resolved through path.join so it works
    // with either path separator.
    const dir = join(s.path, "..");
    return {
      name: s.name,
      kind: s.kind,
      path: s.path,
      lines: body ? body.split("\n").length : 0,
      directInvocations: direct.get(s.name) ?? 0,
      slashInvocations: slash.get(s.name) ?? 0,
      inboundRefs: inbound.get(s.name) ?? 0,
      artifactCount: countArtifacts(
        config.artifactRoots ?? [],
        s.name,
        config.artifactAliases?.[s.name] ?? []
      ),
      tanglePrivateRefs: (body.match(TANGLE_PRIVATE_RE) ?? []).length,
      hasReferencesDir: existsSync3(join(dir, "references")),
      hasEvalsDir: existsSync3(join(dir, "evals")),
      logsRuns: body.includes("skill-runs.jsonl"),
      hasTriggerPhrases: TRIGGER_RE.test(frontmatterDescription(body) || body.slice(0, 600))
    };
  });
  return { generatedFromTraces: transcripts, records };
}
1913
// Analyst id stamped on every finding produced by the skill-usage analyst.
var ANALYST_ID = "skill-usage";
/**
 * Assembles one finding record (schema version "1.0.0") for the skill-usage analyst.
 *
 * The finding id is computed by `computeFindingId` from the analyst id, area,
 * subject and claim. The evidence is a single "artifact" reference to `evidenceUri`.
 *
 * @param {string} area
 * @param {string} subject
 * @param {string} claim
 * @param {string} severity
 * @param {number} confidence
 * @param {string} producedAt - Timestamp stored as `produced_at`.
 * @param {string} recommended - Stored as `recommended_action`.
 * @param {string} evidenceUri
 * @param {string} [rationale]
 * @returns {object} The finding record.
 */
function finding(area, subject, claim, severity, confidence, producedAt, recommended, evidenceUri, rationale) {
  const finding_id = computeFindingId({ analyst_id: ANALYST_ID, area, subject, claim });
  const evidence_refs = [{ kind: "artifact", uri: evidenceUri }];
  return {
    schema_version: "1.0.0",
    finding_id,
    analyst_id: ANALYST_ID,
    produced_at: producedAt,
    severity,
    area,
    claim,
    rationale,
    evidence_refs,
    recommended_action: recommended,
    confidence,
    subject
  };
}
1930
/**
 * Turns a skill-usage report into findings using fixed, deterministic rules.
 *
 * Per record, in this order:
 *  1. usage: zero usage on every signal ("high"), or zero direct use but
 *     inbound references / artifacts present ("info");
 *  2. discoverability: at most 2 direct+slash invocations and no trigger phrases;
 *  3. safety: a "public" skill carrying private references;
 *  4. maintainability: more than BLOAT_LINE_THRESHOLD lines and no references/ dir;
 *  5. data-quality: no evals/ dir;
 *  6. observability: body never mentions skill-runs.jsonl.
 *
 * @param {{ records: object[] }} report - Output of `buildSkillUsageReport`.
 * @param {string} producedAt - Timestamp stamped on every finding.
 * @returns {object[]} Findings in record order, then rule order.
 */
function emitSkillUsageFindings(report, producedAt) {
  const findings = [];
  for (const record of report.records) {
    const { name, path: skillPath } = record;
    // Every rule reports against the same subject / timestamp / evidence path.
    const emit = (area, claim, severity, confidence, action, rationale) => {
      findings.push(
        finding(area, name, claim, severity, confidence, producedAt, action, skillPath, rationale)
      );
    };
    const directTotal = record.directInvocations + record.slashInvocations;
    const trueUsage = directTotal + record.inboundRefs + record.artifactCount;

    if (trueUsage === 0) {
      emit(
        "skill-usage",
        `Skill '${name}' has zero usage across all signals (direct, slash, inbound-refs, artifacts)`,
        "high",
        0.6,
        "Confirm the skill covers a real recurring job; if not, deprecate. Zero true usage is the only deterministic deprecation candidate.",
        "No Skill-tool call, no slash invocation, no sibling dispatches to it, and no on-disk artifacts."
      );
    } else if (directTotal === 0 && record.inboundRefs + record.artifactCount > 0) {
      emit(
        "skill-usage",
        `Skill '${name}' shows 0 direct invocations but is used via orchestration/artifacts (inbound=${record.inboundRefs}, artifacts=${record.artifactCount})`,
        "info",
        0.8,
        "Do NOT treat as unused \u2014 usage is real but logged under parent skills or on disk. Strengthen direct-invocation discovery only if direct use is desired.",
        "The Skill-tool counter undercounts orchestrated/chained leaf skills."
      );
    }

    const rarelyInvoked = directTotal <= 2;
    if (rarelyInvoked && !record.hasTriggerPhrases) {
      emit(
        "discoverability",
        `Skill '${name}' is rarely invoked directly and its description has no explicit trigger phrases`,
        "medium",
        0.7,
        "Add a `Triggers:` clause with verbatim user phrases to the frontmatter description so the model auto-invokes it."
      );
    }

    const isPublic = record.kind === "public";
    if (isPublic && record.tanglePrivateRefs > 0) {
      emit(
        "safety",
        `Public skill '${name}' carries ${record.tanglePrivateRefs} Tangle-private reference(s)`,
        "high",
        0.75,
        "Sanitize incidental internal refs (cli-bridge/kimi/tcloud/~company/private repos) or relocate to a private repo. Verify @tangle-network/* refs are to PUBLISHED packages before treating as a leak."
      );
    }

    const oversized = record.lines > BLOAT_LINE_THRESHOLD;
    if (oversized && !record.hasReferencesDir) {
      emit(
        "maintainability",
        `Skill '${name}' is ${record.lines} lines with no references/ split (progressive disclosure)`,
        "medium",
        0.8,
        `Split detail into references/ loaded on demand; keep SKILL.md a short overview. ${record.lines} lines load into every session's context budget.`
      );
    }

    if (!record.hasEvalsDir) {
      emit(
        "data-quality",
        `Skill '${name}' ships no evals/`,
        "low",
        0.6,
        "Add evals/evals.json with >=3 scenarios proving the skill beats baseline; gives regression coverage."
      );
    }

    if (!record.logsRuns) {
      emit(
        "observability",
        `Skill '${name}' never appends to .evolve/skill-runs.jsonl`,
        "low",
        0.55,
        "Append one run line to .evolve/skill-runs.jsonl on completion, or declare it a non-logging leaf, so the self-improvement loop can see it ran."
      );
    }
  }
  return findings;
}
2037
/**
 * Deterministic analyst wrapper around `emitSkillUsageFindings`.
 *
 * Declares `inputKind = "custom"` and a zero-cost deterministic cost profile.
 */
var SkillUsageAnalyst = class {
  id = ANALYST_ID;
  description = "Deterministic multi-signal skill-usage analysis: flags dead skills, measurement-invisible (orchestrated) usage, discovery gaps, public-repo leaks, bloat, missing evals, and missing run-logging.";
  inputKind = "custom";
  cost = { kind: "deterministic", est_usd_per_run: 0 };
  version = "1.0.0";
  /**
   * @param {{ records: object[], generatedFromTraces: number }} input - Skill-usage report.
   * @param {object} ctx - Run context; `ctx.tags.producedAt` (when set) is used as
   *   the findings' timestamp, otherwise the current time in ISO format.
   * @returns {Promise<object[]>} Findings for the report.
   */
  async analyze(input, ctx) {
    const taggedTimestamp = ctx.tags?.producedAt;
    const producedAt = taggedTimestamp ?? new Date().toISOString();
    const skillCount = input.records.length;
    const transcriptCount = input.generatedFromTraces;
    ctx.log?.(`skill-usage: ${skillCount} skills over ${transcriptCount} transcripts`);
    return emitSkillUsageFindings(input, producedAt);
  }
};
// Shared instance of the skill-usage analyst.
var SKILL_USAGE_ANALYST = new SkillUsageAnalyst();
2052
-
2053
- // src/analyst/registry.ts
2054
- import { randomUUID } from "crypto";
2055
/**
 * Registry that holds analysts by id and runs them sequentially against a
 * shared set of inputs, collecting findings, per-analyst summaries and cost.
 */
var AnalystRegistry = class {
  analysts = new Map();
  options;
  /**
   * @param {object} [options] - Optional `log`, `hooks`, `chat` and `defaultBudget`.
   */
  constructor(options = {}) {
    this.options = options;
  }
  /**
   * Adds an analyst.
   * @throws {Error} When the id is missing or already registered, or no version is declared.
   */
  register(analyst) {
    if (!analyst.id) throw new Error("AnalystRegistry.register: analyst.id is required");
    if (this.analysts.has(analyst.id)) {
      throw new Error(`AnalystRegistry.register: duplicate analyst id "${analyst.id}"`);
    }
    if (!analyst.version) {
      throw new Error(`AnalystRegistry.register: analyst "${analyst.id}" must declare a version`);
    }
    this.analysts.set(analyst.id, analyst);
  }
  /** Returns id / description / version / cost for every registered analyst. */
  list() {
    return Array.from(this.analysts.values()).map((a) => ({
      id: a.id,
      description: a.description,
      version: a.version,
      cost: a.cost
    }));
  }
  /**
   * Drains `runStream()` and resolves with the payload of its `run-completed` event.
   * @throws {Error} When the stream ends without a `run-completed` event.
   */
  async run(runId, inputs, runOpts = {}) {
    for await (const ev of this.runStream(runId, inputs, runOpts)) {
      if (ev.type === "run-completed") return ev.result;
    }
    throw new Error("AnalystRegistry.run: stream completed without run-completed event");
  }
  /**
   * Streaming counterpart to `run()`. Emits `AnalystRunEvent` values
   * in real time — `run-started`, then per-analyst `skipped` /
   * `started` / `completed`, then a terminal `run-completed` whose
   * payload is the full `AnalystRunResult`. UIs use this to render
   * progress; persistence consumers use `run()` and read the result.
   *
   * Hooks (`onBeforeAnalyze` / `onAfterAnalyze` / `onError` /
   * `onComplete`) fire as before — streaming is additive, not a hook
   * replacement.
   *
   * Only `analyst.analyze()` and the cost summation are guarded: an error
   * there is recorded as a "failed" summary. An error thrown by a hook
   * propagates to the consumer on every path (skipped, ok and failed), so
   * each analyst produces exactly one summary and one terminal event.
   */
  async *runStream(runId, inputs, runOpts = {}) {
    const correlationId = `ar_${randomUUID().slice(0, 12)}`;
    const log = this.options.log ?? (() => {
    });
    const hooks = this.options.hooks ?? {};
    const startedAt = new Date().toISOString();
    const started = Date.now();
    const deadlineMs = runOpts.timeoutMs ? started + runOpts.timeoutMs : void 0;
    const selected = this.selectAnalysts(runOpts);
    const budget = runOpts.budget ?? this.options.defaultBudget;
    yield {
      type: "run-started",
      run_id: runId,
      correlation_id: correlationId,
      started_at: startedAt,
      analyst_ids: selected.map((a) => a.id)
    };
    const summaries = [];
    const allFindings = [];
    let totalCost = 0;
    let remainingUsd = budget?.totalUsd;
    for (const analyst of selected) {
      const t0 = Date.now();
      const input = this.routeInput(analyst, inputs);
      if (input.kind === "missing") {
        const summary = {
          analyst_id: analyst.id,
          status: "skipped",
          reason: `missing input of kind '${analyst.inputKind}'`,
          findings_count: 0,
          latency_ms: 0,
          cost_usd: 0
        };
        summaries.push(summary);
        log(`[analyst] skip ${analyst.id} \u2014 missing input`, { runId, kind: analyst.inputKind });
        await hooks.onAfterAnalyze?.({ analyst, summary, findings: [], runId });
        yield { type: "analyst-skipped", summary };
        continue;
      }
      const perBudget = allocateBudget(budget, {
        analyst,
        remainingUsd,
        runningCount: selected.length
      });
      const ctx = {
        runId,
        correlationId,
        deadlineMs,
        budgetUsd: perBudget,
        chat: this.options.chat,
        tags: runOpts.tags,
        log: (msg, fields) => log(`[${analyst.id}] ${msg}`, { runId, correlationId, ...fields }),
        signal: runOpts.signal,
        priorFindings: selectPriorFindings(runOpts.priorFindings, analyst.id)
      };
      await hooks.onBeforeAnalyze?.({ analyst, ctx, runId });
      yield {
        type: "analyst-started",
        analyst_id: analyst.id,
        started_at: new Date(t0).toISOString()
      };
      // Guard only the analyst's own work. Bookkeeping, hooks and the yield
      // happen afterwards, so a hook error can never be mistaken for an
      // analyst failure (which would record a second summary for this analyst).
      let findings;
      let cost = 0;
      let failure;
      try {
        findings = await analyst.analyze(input.value, ctx);
        cost = sumFindingCost(findings);
      } catch (err) {
        failure = err instanceof Error ? err : new Error(String(err));
      }
      const latency = Date.now() - t0;
      if (failure === void 0) {
        totalCost += cost;
        if (typeof remainingUsd === "number") remainingUsd = Math.max(0, remainingUsd - cost);
        allFindings.push(...findings);
        const summary = {
          analyst_id: analyst.id,
          status: "ok",
          findings_count: findings.length,
          latency_ms: latency,
          cost_usd: cost
        };
        summaries.push(summary);
        log(`[analyst] ok ${analyst.id}`, {
          runId,
          findings: findings.length,
          latency_ms: latency,
          cost_usd: cost
        });
        await hooks.onAfterAnalyze?.({ analyst, summary, findings, runId });
        yield { type: "analyst-completed", summary, findings };
        continue;
      }
      // The onError hook may contribute findings describing the failure.
      const hookFindings = await hooks.onError?.({ analyst, error: failure, runId }) ?? [];
      if (hookFindings.length) allFindings.push(...hookFindings);
      const summary = {
        analyst_id: analyst.id,
        status: "failed",
        findings_count: hookFindings.length,
        latency_ms: latency,
        cost_usd: 0,
        error: { class: failure.constructor.name, message: failure.message }
      };
      summaries.push(summary);
      log(`[analyst] FAIL ${analyst.id}`, {
        runId,
        error_class: failure.constructor.name,
        error: failure.message
      });
      await hooks.onAfterAnalyze?.({ analyst, summary, findings: hookFindings, runId });
      yield { type: "analyst-completed", summary, findings: hookFindings };
    }
    const result = {
      run_id: runId,
      correlation_id: correlationId,
      started_at: startedAt,
      ended_at: new Date().toISOString(),
      findings: allFindings,
      per_analyst: summaries,
      total_cost_usd: totalCost
    };
    await hooks.onComplete?.({ result });
    yield { type: "run-completed", result };
  }
  /** Applies the `only` allow-list and then the `skip` deny-list from the run options. */
  selectAnalysts(opts) {
    let candidates = Array.from(this.analysts.values());
    if (opts.only?.length) {
      const only = new Set(opts.only);
      candidates = candidates.filter((a) => only.has(a.id));
    }
    if (opts.skip?.length) {
      const skip = new Set(opts.skip);
      candidates = candidates.filter((a) => !skip.has(a.id));
    }
    return candidates;
  }
  /**
   * Picks the input matching `analyst.inputKind`.
   * @returns {{ kind: "present", value: unknown } | { kind: "missing" }}
   *   "missing" when the input is absent or the input kind is not recognized,
   *   so the caller skips the analyst instead of crashing the run.
   */
  routeInput(analyst, inputs) {
    switch (analyst.inputKind) {
      case "trace-store":
        return inputs.traceStore ? { kind: "present", value: inputs.traceStore } : { kind: "missing" };
      case "artifact-dir":
        return inputs.artifactDir ? { kind: "present", value: inputs.artifactDir } : { kind: "missing" };
      case "run-record":
        return inputs.runRecord ? { kind: "present", value: inputs.runRecord } : { kind: "missing" };
      case "judge-input":
        return inputs.judgeInput ? { kind: "present", value: inputs.judgeInput } : { kind: "missing" };
      case "custom": {
        const v = inputs.custom?.[analyst.id];
        return v !== void 0 ? { kind: "present", value: v } : { kind: "missing" };
      }
      default:
        return { kind: "missing" };
    }
  }
};
2244
- function allocateBudget(policy, args) {
2245
- if (!policy) return void 0;
2246
- if (policy.allocate) {
2247
- return policy.allocate({
2248
- analyst: args.analyst,
2249
- totalUsd: policy.totalUsd,
2250
- remainingUsd: args.remainingUsd,
2251
- runningCount: args.runningCount
2252
- });
2253
- }
2254
- if (policy.totalUsd == null) return void 0;
2255
- if (policy.weights) {
2256
- const w = policy.weights[args.analyst.id] ?? 1;
2257
- const totalWeight = Math.max(1, args.runningCount);
2258
- return policy.totalUsd * w / totalWeight;
2259
- }
2260
- return policy.totalUsd / Math.max(1, args.runningCount);
2261
- }
2262
- function sumFindingCost(findings) {
2263
- let sum3 = 0;
2264
- for (const f of findings) {
2265
- const c = f.metadata?.cost_usd;
2266
- if (typeof c === "number" && Number.isFinite(c)) sum3 += c;
2267
- }
2268
- return sum3;
2269
- }
2270
- function selectPriorFindings(source, analystId) {
2271
- if (!source) return void 0;
2272
- if (Array.isArray(source)) {
2273
- const own2 = source.filter((f) => f.analyst_id === analystId);
2274
- return own2.length > 0 ? own2 : void 0;
2275
- }
2276
- const record = source;
2277
- const own = record[analystId] ?? [];
2278
- const wildcard = record["*"] ?? [];
2279
- const merged = [...own, ...wildcard];
2280
- return merged.length > 0 ? merged : void 0;
2281
- }
176
+ RESEARCH_REPORT_HARD_PAIR_FLOOR,
177
+ gainHistogram,
178
+ paretoChart,
179
+ researchReport,
180
+ summaryTable
181
+ } from "./chunk-KX6F6NCG.js";
182
+ import {
183
+ benjaminiHochberg,
184
+ bonferroni,
185
+ calibrateJudge,
186
+ calibrateJudgeContinuous,
187
+ cliffsDelta,
188
+ cohensD,
189
+ confidenceInterval,
190
+ continuousAgreement,
191
+ corpusInterRaterAgreement,
192
+ corpusInterRaterAgreementFromJudgeScores,
193
+ interRaterReliability,
194
+ interpretCliffs,
195
+ mannWhitneyU,
196
+ normalizeScores,
197
+ pairedBootstrap,
198
+ pairedMde,
199
+ pairedTTest,
200
+ partialCredit,
201
+ positionalBias,
202
+ requiredSampleSize,
203
+ selfPreference,
204
+ verbosityBias,
205
+ weightedComposite,
206
+ weightedMean,
207
+ wilcoxonSignedRank
208
+ } from "./chunk-ITBRCT73.js";
209
+ import {
210
+ FileSystemTraceStore,
211
+ InMemoryTraceStore,
212
+ OTEL_AGENT_EVAL_SCOPE,
213
+ ReplayCache,
214
+ ReplayCacheMissError,
215
+ buildTraceInsightContext,
216
+ buildTraceInsightPrompt,
217
+ captureFetchToRawSink,
218
+ createOtelExporter,
219
+ createOtelTracingStore,
220
+ createReplayFetch,
221
+ defaultTraceInsightPanel,
222
+ describeTraceInsightScope,
223
+ domainEvidencePattern,
224
+ exportRunAsOtlp,
225
+ flattenOtlpExportToNdjson,
226
+ inferDomainKeywords,
227
+ iterateRawCalls,
228
+ otelRunCompleteHook,
229
+ otlpToRunRecords,
230
+ otlpToTraceRunRecords,
231
+ planTraceInsightQuestions,
232
+ scoreTraceInsightReadiness,
233
+ tokenizeDomainWords,
234
+ traceAnalystOnRunComplete
235
+ } from "./chunk-JHA3ZGSO.js";
236
+ import {
237
+ DEFAULT_REDACTION_RULES,
238
+ REDACTION_VERSION,
239
+ redactString,
240
+ redactValue
241
+ } from "./chunk-GGE4NNQT.js";
242
+ import {
243
+ aggregateLlm,
244
+ argHash,
245
+ groupBy,
246
+ judgeSpans,
247
+ llmSpans,
248
+ runFailureClass,
249
+ runsForScenario,
250
+ toolSpans
251
+ } from "./chunk-47X6LRCE.js";
252
+ import {
253
+ FAILURE_CLASSES,
254
+ TRACE_SCHEMA_VERSION,
255
+ isJudgeSpan,
256
+ isLlmSpan,
257
+ isRetrievalSpan,
258
+ isSandboxSpan,
259
+ isToolSpan
260
+ } from "./chunk-5BKGXME7.js";
261
+ import {
262
+ DEFAULT_TRACE_ANALYST_BUDGETS,
263
+ OtlpFileTraceStore,
264
+ SpanNotFoundError,
265
+ TRACE_ANALYST_ACTOR_DESCRIPTION,
266
+ TRACE_ANALYST_ACTOR_DESCRIPTION_VERSION,
267
+ TRACE_ANALYST_SUBAGENT_DESCRIPTION,
268
+ TRACE_ANALYST_TRUNCATION_MARKER_PREFIX,
269
+ TraceFileMissingError,
270
+ TraceNotFoundError,
271
+ analyzeTraces,
272
+ asNumber,
273
+ asString,
274
+ buildTraceAnalystTools,
275
+ extractOtlpAttributes,
276
+ firstNumberAttr,
277
+ firstStringAttr,
278
+ inferOtlpKind,
279
+ projectOtlpFlatLine,
280
+ readOtlpStatus,
281
+ stringField,
282
+ traceAnalystFunctionGroup
283
+ } from "./chunk-VUINJM5M.js";
284
+ import {
285
+ RunIntegrityError,
286
+ assertRunCaptured,
287
+ throwIfRunIncomplete
288
+ } from "./chunk-SBCB6VZY.js";
289
+ import {
290
+ FileSystemRawProviderSink,
291
+ InMemoryRawProviderSink,
292
+ NoopRawProviderSink,
293
+ defaultProviderRedactor,
294
+ providerFromBaseUrl
295
+ } from "./chunk-PC4UYEBM.js";
296
+ import {
297
+ AGENT_PROFILE_KINDS,
298
+ AgentProfileCellValidationError,
299
+ RunRecordValidationError,
300
+ agentProfileCellHashMaterial,
301
+ agentProfileCellKey,
302
+ assertRunAgentProfileCell,
303
+ buildAgentProfileCell,
304
+ buildSandboxAgentProfileCell,
305
+ groupRunsByAgentProfileCell,
306
+ isRunRecord,
307
+ parseRunRecordSafe,
308
+ requireAgentProfileCell,
309
+ roundTripRunRecord,
310
+ toAgentProfileJson,
311
+ validateAgentProfileCell,
312
+ validateRunRecord,
313
+ verifyAgentProfileCell
314
+ } from "./chunk-F3SRAAZO.js";
315
+ import {
316
+ TraceEmitter,
317
+ llmSpanFromProvider
318
+ } from "./chunk-TVVP3ZZQ.js";
319
+ import {
320
+ canonicalize,
321
+ evaluateHypothesis,
322
+ hashJson,
323
+ signManifest,
324
+ verifyManifest
325
+ } from "./chunk-VSMTAMNK.js";
326
+ import {
327
+ AgentEvalError,
328
+ CaptureIntegrityError,
329
+ ConfigError,
330
+ JudgeError,
331
+ NotFoundError,
332
+ ReplayError,
333
+ ValidationError,
334
+ VerificationError
335
+ } from "./chunk-3BFEG2F6.js";
336
+ import {
337
+ __export
338
+ } from "./chunk-PZ5AY32C.js";
2282
339
 
2283
340
  // src/auto-pr.ts
2284
341
  async function proposeAutomatedPullRequest(client, input) {
@@ -2484,12 +541,12 @@ function ghCliClient(opts = {}) {
2484
541
  await exec("git", ["branch", "-D", input.branchName], { cwd });
2485
542
  await run("git", ["checkout", "-b", input.branchName]);
2486
543
  const { mkdir, writeFile } = await import("fs/promises");
2487
- const { dirname: dirname4, join: join5, resolve } = await import("path");
544
+ const { dirname: dirname3, join: join4, resolve } = await import("path");
2488
545
  for (const change of input.fileChanges) {
2489
546
  const abs = resolve(cwd, change.path);
2490
- await mkdir(dirname4(abs), { recursive: true });
547
+ await mkdir(dirname3(abs), { recursive: true });
2491
548
  await writeFile(abs, change.contents, "utf8");
2492
- await run("git", ["add", join5(change.path)]);
549
+ await run("git", ["add", join4(change.path)]);
2493
550
  }
2494
551
  const env = {};
2495
552
  if (input.authorName) env.GIT_AUTHOR_NAME = input.authorName;
@@ -3372,10 +1429,10 @@ var FileSystemFeedbackTrajectoryStore = class {
3372
1429
  }
3373
1430
  async append(record) {
3374
1431
  const { appendFile, mkdir } = await import("fs/promises");
3375
- const { join: join5 } = await import("path");
1432
+ const { join: join4 } = await import("path");
3376
1433
  await mkdir(this.dir, { recursive: true });
3377
1434
  await appendFile(
3378
- join5(this.dir, "feedback-trajectories.ndjson"),
1435
+ join4(this.dir, "feedback-trajectories.ndjson"),
3379
1436
  `${JSON.stringify(record)}
3380
1437
  `,
3381
1438
  "utf8"
@@ -3384,8 +1441,8 @@ var FileSystemFeedbackTrajectoryStore = class {
3384
1441
  async load() {
3385
1442
  if (this.loaded) return;
3386
1443
  const { readFile } = await import("fs/promises");
3387
- const { join: join5 } = await import("path");
3388
- const file = join5(this.dir, "feedback-trajectories.ndjson");
1444
+ const { join: join4 } = await import("path");
1445
+ const file = join4(this.dir, "feedback-trajectories.ndjson");
3389
1446
  try {
3390
1447
  const raw = await readFile(file, "utf8");
3391
1448
  for (const line of raw.split("\n")) {
@@ -3698,21 +1755,21 @@ var SingleBackendError = class extends AgentEvalError {
3698
1755
  function stripSlash(url) {
3699
1756
  return url.replace(/\/+$/, "");
3700
1757
  }
3701
- function assertSingleBackend(agent2, judge, opts = {}) {
1758
+ function assertSingleBackend(agent, judge, opts = {}) {
3702
1759
  const divergences = [];
3703
- if (agent2.kind !== judge.kind) {
3704
- divergences.push({ field: "kind", agent: agent2.kind, judge: judge.kind });
1760
+ if (agent.kind !== judge.kind) {
1761
+ divergences.push({ field: "kind", agent: agent.kind, judge: judge.kind });
3705
1762
  }
3706
- if (stripSlash(agent2.baseUrl) !== stripSlash(judge.baseUrl)) {
3707
- divergences.push({ field: "baseUrl", agent: agent2.baseUrl, judge: judge.baseUrl });
1763
+ if (stripSlash(agent.baseUrl) !== stripSlash(judge.baseUrl)) {
1764
+ divergences.push({ field: "baseUrl", agent: agent.baseUrl, judge: judge.baseUrl });
3708
1765
  }
3709
- if (agent2.model !== judge.model) {
3710
- divergences.push({ field: "model", agent: agent2.model, judge: judge.model });
1766
+ if (agent.model !== judge.model) {
1767
+ divergences.push({ field: "model", agent: agent.model, judge: judge.model });
3711
1768
  }
3712
- if (agent2.provider !== judge.provider) {
3713
- divergences.push({ field: "provider", agent: agent2.provider, judge: judge.provider });
1769
+ if (agent.provider !== judge.provider) {
1770
+ divergences.push({ field: "provider", agent: agent.provider, judge: judge.provider });
3714
1771
  }
3715
- const agentHasKey = Boolean(agent2.apiKey);
1772
+ const agentHasKey = Boolean(agent.apiKey);
3716
1773
  const judgeHasKey = Boolean(judge.apiKey);
3717
1774
  if (agentHasKey !== judgeHasKey) {
3718
1775
  divergences.push({
@@ -4765,194 +2822,6 @@ function pathExists(obj, path) {
4765
2822
  return true;
4766
2823
  }
4767
2824
 
4768
- // src/completion-verifier.ts
4769
- var STOPWORDS = /* @__PURE__ */ new Set([
4770
- "the",
4771
- "a",
4772
- "an",
4773
- "of",
4774
- "for",
4775
- "and",
4776
- "or",
4777
- "to",
4778
- "in",
4779
- "on",
4780
- "with",
4781
- "by"
4782
- ]);
4783
- var MATCH_THRESHOLD = 0.5;
4784
- var MIN_CONTENT_CHARS = 50;
4785
- function tokens(s) {
4786
- return new Set(
4787
- s.toLowerCase().split(/[^a-z0-9]+/).filter((t) => t.length > 1 && !STOPWORDS.has(t))
4788
- );
4789
- }
4790
- function tokenRecall(requirementText, candidateText) {
4791
- const req = tokens(requirementText);
4792
- if (req.size === 0) return 0;
4793
- const cand = tokens(candidateText);
4794
- let hit = 0;
4795
- for (const t of req) if (cand.has(t)) hit++;
4796
- return hit / req.size;
4797
- }
4798
- function artifactCandidates(req, reqIndex, artifacts) {
4799
- const reqText = `${req.title} ${req.category ?? ""}`;
4800
- const out = [];
4801
- artifacts.forEach((a, i) => {
4802
- if ((a.content ?? "").trim().length < MIN_CONTENT_CHARS) return;
4803
- let score = tokenRecall(reqText, `${a.path ?? ""} ${a.kind}`);
4804
- if (req.category && a.kind && req.category.toLowerCase() === a.kind.toLowerCase()) {
4805
- score = Math.max(score, 1);
4806
- }
4807
- if (score < MATCH_THRESHOLD) return;
4808
- out.push({
4809
- reqIndex,
4810
- itemKey: `artifact:${i}`,
4811
- score,
4812
- evidence: `artifact '${a.path ?? a.kind}' matched (token recall ${score.toFixed(2)})`,
4813
- content: a.content ?? null
4814
- });
4815
- });
4816
- return out;
4817
- }
4818
- function proposalCandidates(req, reqIndex, proposals) {
4819
- const reqText = `${req.title} ${req.category ?? ""}`;
4820
- const out = [];
4821
- for (const p of proposals) {
4822
- if (p.status !== "approved") continue;
4823
- const score = tokenRecall(reqText, p.title);
4824
- if (score < MATCH_THRESHOLD) continue;
4825
- const body = p.content ?? "";
4826
- out.push({
4827
- reqIndex,
4828
- itemKey: `proposal:${p.id}`,
4829
- score,
4830
- evidence: `approved proposal '${p.title}' matched (token recall ${score.toFixed(2)})`,
4831
- content: body.trim().length >= MIN_CONTENT_CHARS ? body : null
4832
- });
4833
- }
4834
- return out;
4835
- }
4836
- function toolCallCandidates(req, reqIndex, toolCalls) {
4837
- const out = [];
4838
- toolCalls.forEach((name, i) => {
4839
- const score = tokenRecall(req.title, name);
4840
- if (score < MATCH_THRESHOLD) return;
4841
- out.push({
4842
- reqIndex,
4843
- itemKey: `tool:${i}`,
4844
- score,
4845
- evidence: `tool call '${name}' matched (token recall ${score.toFixed(2)})`,
4846
- content: null
4847
- });
4848
- });
4849
- return out;
4850
- }
4851
- async function verifyCompletion(gold, state, checkCorrectness) {
4852
- if (gold.requirements.length === 0) {
4853
- throw new Error(
4854
- `verifyCompletion: task '${gold.taskId}' has no requirements \u2014 malformed gold spec`
4855
- );
4856
- }
4857
- const candidates = [];
4858
- gold.requirements.forEach((req, i) => {
4859
- const by = req.satisfiedBy ?? "any";
4860
- if (by === "artifact" || by === "any") {
4861
- candidates.push(...artifactCandidates(req, i, state.artifacts));
4862
- }
4863
- if (by === "proposal" || by === "any") {
4864
- candidates.push(...proposalCandidates(req, i, state.proposals));
4865
- }
4866
- if (by === "tool-call" || by === "any") {
4867
- candidates.push(...toolCallCandidates(req, i, state.toolCalls));
4868
- }
4869
- });
4870
- candidates.sort((a, b) => b.score - a.score);
4871
- const assigned = /* @__PURE__ */ new Map();
4872
- const itemTaken = /* @__PURE__ */ new Set();
4873
- for (const c of candidates) {
4874
- if (assigned.has(c.reqIndex) || itemTaken.has(c.itemKey)) continue;
4875
- assigned.set(c.reqIndex, c);
4876
- itemTaken.add(c.itemKey);
4877
- }
4878
- const requirements = [];
4879
- for (let i = 0; i < gold.requirements.length; i++) {
4880
- const req = gold.requirements[i];
4881
- const match = assigned.get(i);
4882
- const evidence = [];
4883
- let correct = null;
4884
- if (match) {
4885
- evidence.push(match.evidence);
4886
- if (match.content !== null) {
4887
- const r = await checkCorrectness(req, match.content);
4888
- correct = r.correct;
4889
- evidence.push(`correctness: ${r.correct ? "pass" : "fail"} \u2014 ${r.reason}`);
4890
- } else {
4891
- evidence.push("correctness: not assessed \u2014 matched item carries no content");
4892
- }
4893
- } else {
4894
- const by = req.satisfiedBy ?? "any";
4895
- const kind = by === "any" ? "artifact/proposal/tool-call" : by;
4896
- evidence.push(`no produced ${kind} matched this requirement`);
4897
- }
4898
- const structurallyPresent = match !== void 0;
4899
- const satisfied = structurallyPresent && correct !== false;
4900
- requirements.push({
4901
- reqId: req.reqId,
4902
- title: req.title,
4903
- structurallyPresent,
4904
- correct,
4905
- satisfied,
4906
- evidence
4907
- });
4908
- }
4909
- const satisfiedCount = requirements.filter((r) => r.satisfied).length;
4910
- return {
4911
- taskId: gold.taskId,
4912
- requirements,
4913
- completionRate: satisfiedCount / requirements.length,
4914
- fullyComplete: satisfiedCount === requirements.length
4915
- };
4916
- }
4917
- function parseCorrectnessResponse(raw) {
4918
- const match = raw.match(/\{[\s\S]*\}/);
4919
- if (!match) {
4920
- throw new Error(`correctness checker: no JSON object in model response: ${raw.slice(0, 200)}`);
4921
- }
4922
- const parsed = JSON.parse(match[0]);
4923
- if (typeof parsed.correct !== "boolean") {
4924
- throw new Error(`correctness checker: 'correct' is not a boolean in: ${match[0].slice(0, 200)}`);
4925
- }
4926
- return { correct: parsed.correct, reason: typeof parsed.reason === "string" ? parsed.reason : "" };
4927
- }
4928
- function createLlmCorrectnessChecker(tc, opts = {}) {
4929
- const model = opts.model ?? "claude-sonnet-4-6";
4930
- const maxContentChars = opts.maxContentChars ?? 8e3;
4931
- return async (requirement, content) => {
4932
- const resp = await tc.chat({
4933
- model,
4934
- messages: [
4935
- {
4936
- role: "system",
4937
- content: 'You verify whether a produced work artifact actually fulfils a stated requirement. Judge fulfilment only \u2014 is the deliverable substantively present and on-point \u2014 not polish. A plan to do it later, a vague gesture, or a description of what should be done does NOT fulfil a requirement; the artifact must BE the deliverable. Respond with a single JSON object: {"correct": boolean, "reason": string (<= 30 words)}.'
4938
- },
4939
- {
4940
- role: "user",
4941
- content: `Requirement: ${requirement.title}
4942
- ${requirement.category ? `Category: ${requirement.category}
4943
- ` : ""}
4944
- Produced artifact:
4945
- ${content.slice(0, maxContentChars)}`
4946
- }
4947
- ],
4948
- temperature: 0,
4949
- maxTokens: 200
4950
- });
4951
- const raw = resp.choices?.[0]?.message?.content ?? "";
4952
- return parseCorrectnessResponse(raw);
4953
- };
4954
- }
4955
-
4956
2825
  // src/dual-agent-bench.ts
4957
2826
  var DualAgentBench = class {
4958
2827
  async run(config) {
@@ -5607,40 +3476,6 @@ function canonicalInstruction(value) {
5607
3476
  return normalized.length === 0 ? normalized : normalized[0].toUpperCase() + normalized.slice(1);
5608
3477
  }
5609
3478
 
5610
- // src/produced-state.ts
5611
- function artifactKind(mimeType) {
5612
- if (!mimeType) return "file";
5613
- if (mimeType.includes("json")) return "json";
5614
- if (mimeType.startsWith("text/")) return "text";
5615
- return "file";
5616
- }
5617
- function extractProducedState(events) {
5618
- const artifacts = [];
5619
- const proposals = [];
5620
- const toolCalls = [];
5621
- const seenTools = /* @__PURE__ */ new Set();
5622
- for (const ev of events) {
5623
- if (ev.type === "tool_call") {
5624
- const name = ev.toolName;
5625
- if (name && !seenTools.has(name)) {
5626
- seenTools.add(name);
5627
- toolCalls.push(name);
5628
- }
5629
- } else if (ev.type === "artifact") {
5630
- const a = ev;
5631
- artifacts.push({
5632
- kind: artifactKind(a.mimeType),
5633
- path: a.name ?? a.uri ?? a.artifactId,
5634
- content: a.content ?? ""
5635
- });
5636
- } else if (ev.type === "proposal_created") {
5637
- const p = ev;
5638
- proposals.push({ id: p.proposalId, title: p.title, status: p.status ?? "pending" });
5639
- }
5640
- }
5641
- return { artifacts, proposals, toolCalls };
5642
- }
5643
-
5644
3479
  // src/prompt-registry.ts
5645
3480
  var PromptRegistry = class {
5646
3481
  entries = /* @__PURE__ */ new Map();
@@ -5740,12 +3575,17 @@ function renderSteeringText(bundle) {
5740
3575
  ([a], [b]) => a.localeCompare(b)
5741
3576
  );
5742
3577
  for (const [name, prompt] of reviewers) lines.push(`reviewer:${name}:${prompt}`);
3578
+ const roles = Object.entries(bundle.rolePrompts ?? {}).sort(([a], [b]) => a.localeCompare(b));
3579
+ for (const [name, role] of roles) {
3580
+ lines.push(`role:${name}:system:${role.system ?? ""}:append:${role.append ?? ""}`);
3581
+ }
5743
3582
  const skills = [...bundle.skills ?? []].sort();
5744
3583
  if (skills.length) lines.push(`skills:${skills.join(",")}`);
5745
3584
  return lines.join("\n");
5746
3585
  }
5747
3586
 
5748
3587
  // src/steering-optimizer.ts
3588
+ import { AxGEPA, ai, ax } from "@ax-llm/ax";
5749
3589
  var PairwiseSteeringOptimizer = class {
5750
3590
  optimize(rows, config = {}) {
5751
3591
  const ranked = rankRows(rows, config.weights);
@@ -5765,36 +3605,25 @@ var AxGepaSteeringOptimizer = class {
5765
3605
  config;
5766
3606
  async optimize(rows) {
5767
3607
  const fallback = new PairwiseSteeringOptimizer().optimize(rows, this.config);
5768
- const minRows = this.config.minRows ?? 6;
3608
+ const minScenarioWinners = this.config.minScenarioWinners ?? 6;
5769
3609
  const variantIds = [...new Set(rows.map((row) => row.variantId))];
5770
3610
  const byScenario = collapseScenarioWinners(rows, this.config.weights);
5771
- if (variantIds.length < 2 || byScenario.length < minRows) {
5772
- return {
5773
- ...fallback,
5774
- backend: "ax-gepa",
5775
- skipped: true,
5776
- rationale: `AxGEPA skipped: need >=2 variants and >=${minRows} scenario winners, got ${variantIds.length} variant(s) and ${byScenario.length} scenario winner(s).`
5777
- };
5778
- }
5779
- let axLib;
5780
- try {
5781
- axLib = await import("@ax-llm/ax");
5782
- } catch {
3611
+ if (variantIds.length < 2 || byScenario.length < minScenarioWinners) {
5783
3612
  return {
5784
3613
  ...fallback,
5785
3614
  backend: "ax-gepa",
5786
3615
  skipped: true,
5787
- rationale: "AxGEPA unavailable: install @ax-llm/ax to enable selector optimization."
3616
+ rationale: `AxGEPA skipped: need >=2 variants and >=${minScenarioWinners} scenario winners, got ${variantIds.length} variant(s) and ${byScenario.length} scenario winner(s).`
5788
3617
  };
5789
3618
  }
5790
- const { ai, ax, AxGEPA } = axLib;
5791
3619
  const signature = `task:string, split:string, seedPreview:string -> variantId:class "${variantIds.join(", ")}", rationale:string`;
5792
3620
  const selector = ax(signature, {
5793
3621
  description: "Choose the best steering bundle variant for an autopilot task."
5794
3622
  });
5795
- const splitIndex = Math.max(1, Math.floor(byScenario.length * 0.8));
5796
- const train = byScenario.slice(0, splitIndex);
5797
- const validation = byScenario.slice(splitIndex);
3623
+ const shuffled = seededShuffle(byScenario, signature);
3624
+ const splitIndex = Math.max(1, Math.floor(shuffled.length * 0.8));
3625
+ const train = shuffled.slice(0, splitIndex);
3626
+ const validation = shuffled.slice(splitIndex);
5798
3627
  if (!validation.length) {
5799
3628
  return {
5800
3629
  ...fallback,
@@ -5803,10 +3632,10 @@ var AxGepaSteeringOptimizer = class {
5803
3632
  rationale: "AxGEPA skipped: no validation examples after split."
5804
3633
  };
5805
3634
  }
3635
+ const studentAI = createAxService(this.config.provider, this.config.apiKey, this.config.model);
5806
3636
  const optimizer = new AxGEPA({
5807
- studentAI: createAxService(ai, this.config.provider, this.config.apiKey, this.config.model),
3637
+ studentAI,
5808
3638
  teacherAI: createAxService(
5809
- ai,
5810
3639
  this.config.provider,
5811
3640
  this.config.apiKey,
5812
3641
  this.config.teacherModel ?? this.config.model
@@ -5820,7 +3649,7 @@ var AxGepaSteeringOptimizer = class {
5820
3649
  const compiled = await optimizer.compile(
5821
3650
  selector,
5822
3651
  train,
5823
- ({ prediction, example }) => prediction?.variantId === example?.variantId ? 1 : 0,
3652
+ (input) => input.prediction?.variantId === input.example?.variantId ? 1 : 0,
5824
3653
  {
5825
3654
  validationExamples: validation,
5826
3655
  maxMetricCalls: 64
@@ -5829,6 +3658,13 @@ var AxGepaSteeringOptimizer = class {
5829
3658
  if (compiled.optimizedProgram !== void 0) {
5830
3659
  selector.applyOptimization(compiled.optimizedProgram);
5831
3660
  }
3661
+ const selectVariant = async (row) => {
3662
+ const prediction = await selector.forward(studentAI, row);
3663
+ return {
3664
+ variantId: String(prediction.variantId),
3665
+ rationale: String(prediction.rationale ?? "")
3666
+ };
3667
+ };
5832
3668
  return {
5833
3669
  ...fallback,
5834
3670
  backend: "ax-gepa",
@@ -5838,7 +3674,8 @@ var AxGepaSteeringOptimizer = class {
5838
3674
  signature,
5839
3675
  labels: variantIds,
5840
3676
  rationale: compiled.bestScore !== void 0 ? `bestScore=${compiled.bestScore}` : void 0
5841
- }
3677
+ },
3678
+ selectVariant
5842
3679
  };
5843
3680
  }
5844
3681
  };
@@ -5872,13 +3709,39 @@ function collapseScenarioWinners(rows, weights) {
5872
3709
  };
5873
3710
  });
5874
3711
  }
5875
- function createAxService(aiFactory, provider, apiKey, model) {
5876
- return aiFactory({
3712
+ function createAxService(provider, apiKey, model) {
3713
+ return ai({
5877
3714
  name: provider,
5878
3715
  apiKey,
5879
3716
  config: { model }
5880
3717
  });
5881
3718
  }
3719
+ function seededShuffle(items, seed) {
3720
+ const rng = mulberry32(hashString(seed));
3721
+ const out = [...items];
3722
+ for (let i = out.length - 1; i > 0; i--) {
3723
+ const j = Math.floor(rng() * (i + 1));
3724
+ [out[i], out[j]] = [out[j], out[i]];
3725
+ }
3726
+ return out;
3727
+ }
3728
+ function hashString(value) {
3729
+ let h = 2166136261;
3730
+ for (let i = 0; i < value.length; i++) {
3731
+ h ^= value.charCodeAt(i);
3732
+ h = Math.imul(h, 16777619);
3733
+ }
3734
+ return h >>> 0;
3735
+ }
3736
+ function mulberry32(seed) {
3737
+ let a = seed >>> 0;
3738
+ return () => {
3739
+ a = a + 1831565813 | 0;
3740
+ let t = Math.imul(a ^ a >>> 15, 1 | a);
3741
+ t = t + Math.imul(t ^ t >>> 7, 61 | t) ^ t;
3742
+ return ((t ^ t >>> 14) >>> 0) / 4294967296;
3743
+ };
3744
+ }
5882
3745
 
5883
3746
  // src/workspace-inspector.ts
5884
3747
  var InMemoryWorkspaceInspector = class {
@@ -6113,8 +3976,8 @@ function assertNonNegative(n, name) {
6113
3976
  }
6114
3977
 
6115
3978
  // src/muffled-gate-scanner.ts
6116
- import { existsSync as existsSync4, readdirSync as readdirSync2, readFileSync as readFileSync3, statSync as statSync2 } from "fs";
6117
- import { join as join2 } from "path";
3979
+ import { existsSync, readdirSync, readFileSync, statSync } from "fs";
3980
+ import { join } from "path";
6118
3981
  function codeOf(line) {
6119
3982
  return line.replace(/\/\/.*$/, "").replace(/^\s*\*.*$/, "");
6120
3983
  }
@@ -6226,14 +4089,14 @@ var UNIVERSAL_FINDERS = [findConstructorCwdDropped];
6226
4089
  function autoDeriveImporters(repoRoot, roots, extensions, importsContain) {
6227
4090
  const matches = [];
6228
4091
  const walk = (rel) => {
6229
- const abs = join2(repoRoot, rel);
6230
- if (!existsSync4(abs)) return;
6231
- for (const entry of readdirSync2(abs)) {
6232
- const sub = join2(rel, entry);
6233
- const subAbs = join2(repoRoot, sub);
4092
+ const abs = join(repoRoot, rel);
4093
+ if (!existsSync(abs)) return;
4094
+ for (const entry of readdirSync(abs)) {
4095
+ const sub = join(rel, entry);
4096
+ const subAbs = join(repoRoot, sub);
6234
4097
  let st;
6235
4098
  try {
6236
- st = statSync2(subAbs);
4099
+ st = statSync(subAbs);
6237
4100
  } catch {
6238
4101
  continue;
6239
4102
  }
@@ -6246,7 +4109,7 @@ function autoDeriveImporters(repoRoot, roots, extensions, importsContain) {
6246
4109
  continue;
6247
4110
  let text;
6248
4111
  try {
6249
- text = readFileSync3(subAbs, "utf8");
4112
+ text = readFileSync(subAbs, "utf8");
6250
4113
  } catch {
6251
4114
  continue;
6252
4115
  }
@@ -6261,9 +4124,9 @@ function scanForMuffledGates(opts) {
6261
4124
  const findings = [];
6262
4125
  const scanned = /* @__PURE__ */ new Set();
6263
4126
  for (const file of opts.scanFiles) {
6264
- const abs = join2(opts.repoRoot, file);
6265
- if (!existsSync4(abs)) continue;
6266
- const text = readFileSync3(abs, "utf8");
4127
+ const abs = join(opts.repoRoot, file);
4128
+ if (!existsSync(abs)) continue;
4129
+ const text = readFileSync(abs, "utf8");
6267
4130
  for (const find of opts.finders) findings.push(...find(file, text));
6268
4131
  scanned.add(file);
6269
4132
  }
@@ -6276,9 +4139,9 @@ function scanForMuffledGates(opts) {
6276
4139
  );
6277
4140
  for (const file of importers) {
6278
4141
  if (scanned.has(file)) continue;
6279
- const abs = join2(opts.repoRoot, file);
6280
- if (!existsSync4(abs)) continue;
6281
- const text = readFileSync3(abs, "utf8");
4142
+ const abs = join(opts.repoRoot, file);
4143
+ if (!existsSync(abs)) continue;
4144
+ const text = readFileSync(abs, "utf8");
6282
4145
  for (const find of opts.autoDerive.universalFinders) findings.push(...find(file, text));
6283
4146
  }
6284
4147
  }
@@ -6428,8 +4291,8 @@ function isObject(v) {
6428
4291
  }
6429
4292
 
6430
4293
  // src/scorecard.ts
6431
- import { appendFileSync as appendFileSync2, existsSync as existsSync5, mkdirSync as mkdirSync2, readFileSync as readFileSync4 } from "fs";
6432
- import { dirname as dirname2 } from "path";
4294
+ import { appendFileSync, existsSync as existsSync2, mkdirSync, readFileSync as readFileSync2 } from "fs";
4295
+ import { dirname } from "path";
6433
4296
  function median(xs) {
6434
4297
  if (xs.length === 0) return 0;
6435
4298
  const sorted = [...xs].sort((a, b) => a - b);
@@ -6494,8 +4357,8 @@ function recordRuns(runs, opts) {
6494
4357
  }
6495
4358
  function appendScorecard(logPath, lines) {
6496
4359
  if (lines.length === 0) return;
6497
- mkdirSync2(dirname2(logPath), { recursive: true });
6498
- appendFileSync2(logPath, `${lines.map((line) => JSON.stringify(line)).join("\n")}
4360
+ mkdirSync(dirname(logPath), { recursive: true });
4361
+ appendFileSync(logPath, `${lines.map((line) => JSON.stringify(line)).join("\n")}
6499
4362
  `);
6500
4363
  }
6501
4364
  function recordRunsToScorecard(logPath, runs, opts) {
@@ -6504,10 +4367,10 @@ function recordRunsToScorecard(logPath, runs, opts) {
6504
4367
  return lines;
6505
4368
  }
6506
4369
  function loadScorecard(logPath) {
6507
- if (!existsSync5(logPath)) return { cells: [], profiles: {} };
4370
+ if (!existsSync2(logPath)) return { cells: [], profiles: {} };
6508
4371
  const cells = /* @__PURE__ */ new Map();
6509
4372
  const profiles = {};
6510
- for (const raw of readFileSync4(logPath, "utf8").split("\n")) {
4373
+ for (const raw of readFileSync2(logPath, "utf8").split("\n")) {
6511
4374
  const line = raw.trim();
6512
4375
  if (!line) continue;
6513
4376
  let parsed;
@@ -6772,9 +4635,9 @@ function statusAdvanced(key, progression) {
6772
4635
  description: `"${key}" progressed along ${progression.join("\u2192")}`,
6773
4636
  score: ({ before, after }) => {
6774
4637
  const bi = progression.indexOf(String(before[key]));
6775
- const ai = progression.indexOf(String(after[key]));
6776
- if (bi === -1 || ai === -1) return 0;
6777
- return ai >= bi ? 1 : 0;
4638
+ const ai2 = progression.indexOf(String(after[key]));
4639
+ if (bi === -1 || ai2 === -1) return 0;
4640
+ return ai2 >= bi ? 1 : 0;
6778
4641
  }
6779
4642
  };
6780
4643
  }
@@ -7384,7 +5247,7 @@ async function commitBisect(options) {
7384
5247
  }
7385
5248
  async function promptBisect(options) {
7386
5249
  const split = options.paragraphSplitter ?? ((p) => p.split(/\n\s*\n/));
7387
- const join5 = (paragraphs) => paragraphs.join("\n\n");
5250
+ const join4 = (paragraphs) => paragraphs.join("\n\n");
7388
5251
  const goodParas = split(options.good);
7389
5252
  const badParas = split(options.bad);
7390
5253
  if (goodParas.length !== badParas.length) {
@@ -7404,7 +5267,7 @@ async function promptBisect(options) {
7404
5267
  const result = await bisect({
7405
5268
  good: goodMask,
7406
5269
  bad: badMask,
7407
- runEval: (mask) => options.runEval(join5(paragraphsFor(mask))),
5270
+ runEval: (mask) => options.runEval(join4(paragraphsFor(mask))),
7408
5271
  maxIterations: options.maxIterations ?? n + 5,
7409
5272
  halfway: (g, b) => {
7410
5273
  for (let i = 0; i < g.length; i++) {
@@ -7435,12 +5298,12 @@ async function promptBisect(options) {
7435
5298
  }
7436
5299
  }
7437
5300
  const materializedPath = result.path.map((s) => ({
7438
- state: join5(paragraphsFor(s.state)),
5301
+ state: join4(paragraphsFor(s.state)),
7439
5302
  score: s.score,
7440
5303
  pass: s.pass
7441
5304
  }));
7442
5305
  return {
7443
- culprit: join5(paragraphsFor(culprit)),
5306
+ culprit: join4(paragraphsFor(culprit)),
7444
5307
  path: materializedPath,
7445
5308
  converged: result.converged,
7446
5309
  inputInconsistent: result.inputInconsistent,
@@ -7934,8 +5797,8 @@ async function runSelfPlay(proposer, scorer, targets, options = {}) {
7934
5797
 
7935
5798
  // src/command-runner.ts
7936
5799
  import { spawnSync } from "child_process";
7937
- import { existsSync as existsSync6, readdirSync as readdirSync3, readFileSync as readFileSync5, statSync as statSync3 } from "fs";
7938
- import { join as join3 } from "path";
5800
+ import { existsSync as existsSync3, readdirSync as readdirSync2, readFileSync as readFileSync3, statSync as statSync2 } from "fs";
5801
+ import { join as join2 } from "path";
7939
5802
  var localCommandRunner = {
7940
5803
  name: "local",
7941
5804
  async run(input) {
@@ -7963,11 +5826,11 @@ var localCommandRunner = {
7963
5826
  return r.status === 0 && (r.stdout ?? "").trim().length > 0;
7964
5827
  },
7965
5828
  async fileExists(path) {
7966
- return existsSync6(path);
5829
+ return existsSync3(path);
7967
5830
  },
7968
5831
  async readFile(path) {
7969
5832
  try {
7970
- return readFileSync5(path, "utf8");
5833
+ return readFileSync3(path, "utf8");
7971
5834
  } catch {
7972
5835
  return null;
7973
5836
  }
@@ -7975,14 +5838,14 @@ var localCommandRunner = {
7975
5838
  async readDir(path) {
7976
5839
  let entries;
7977
5840
  try {
7978
- entries = readdirSync3(path);
5841
+ entries = readdirSync2(path);
7979
5842
  } catch {
7980
5843
  return [];
7981
5844
  }
7982
5845
  const out = [];
7983
5846
  for (const name of entries) {
7984
5847
  try {
7985
- const st = statSync3(join3(path, name));
5848
+ const st = statSync2(join2(path, name));
7986
5849
  out.push({
7987
5850
  name,
7988
5851
  isDirectory: st.isDirectory(),
@@ -8321,11 +6184,11 @@ function flowLayer(input) {
8321
6184
 
8322
6185
  // src/intent-match-judge.ts
8323
6186
  var INTENT_MATCH_JUDGE_VERSION = "intent-match-judge-v1-2026-04-24";
8324
- var DEFAULT_MODEL2 = "claude-sonnet-4-6";
8325
- var DEFAULT_TIMEOUT2 = 9e4;
8326
- var DEFAULT_MAX_SOURCE2 = 25e3;
8327
- var DEFAULT_MAX_PER_FILE2 = 12e3;
8328
- var DEFAULT_MAX_HTML2 = 2e4;
6187
+ var DEFAULT_MODEL = "claude-sonnet-4-6";
6188
+ var DEFAULT_TIMEOUT = 9e4;
6189
+ var DEFAULT_MAX_SOURCE = 25e3;
6190
+ var DEFAULT_MAX_PER_FILE = 12e3;
6191
+ var DEFAULT_MAX_HTML = 2e4;
8329
6192
  var INTENT_SCHEMA = {
8330
6193
  type: "object",
8331
6194
  additionalProperties: false,
@@ -8335,12 +6198,12 @@ var INTENT_SCHEMA = {
8335
6198
  evidence: { type: "string", minLength: 10, maxLength: 400 }
8336
6199
  }
8337
6200
  };
8338
- function truncate2(body, cap, label) {
6201
+ function truncate(body, cap, label) {
8339
6202
  if (body.length <= cap) return body;
8340
6203
  return `${body.slice(0, cap)}
8341
6204
  \u2026 [truncated ${body.length - cap} chars of ${label}]`;
8342
6205
  }
8343
- function buildPrompt2(input, opts) {
6206
+ function buildPrompt(input, opts) {
8344
6207
  const sourceBlob = input.sourceFiles.filter((f) => f.content.length <= opts.maxPerFileChars).map((f) => `--- FILE: ${f.path} ---
8345
6208
  ${f.content}`).join("\n\n");
8346
6209
  const html = input.servedHtml ?? "";
@@ -8359,10 +6222,10 @@ ${input.artifactLabel ? `ARTIFACT METADATA:
8359
6222
  description: ${input.artifactDescription ?? ""}
8360
6223
 
8361
6224
  ` : ""}${html ? `SERVED HTML (what the preview returns):
8362
- ${truncate2(html, opts.maxHtmlChars, "HTML")}
6225
+ ${truncate(html, opts.maxHtmlChars, "HTML")}
8363
6226
 
8364
6227
  ` : ""}SOURCE FILES (the agent's workdir):
8365
- ${truncate2(sourceBlob, opts.maxSourceChars, "source")}
6228
+ ${truncate(sourceBlob, opts.maxSourceChars, "source")}
8366
6229
 
8367
6230
  Score 0\u20131:
8368
6231
  1.0 \u2014 unmistakably the right app. Even with bugs, gaps, or missing
@@ -8390,11 +6253,11 @@ Return STRICT JSON. No prose outside.`;
8390
6253
  async function runIntentMatchJudge(input, options = {}) {
8391
6254
  const start = Date.now();
8392
6255
  const opts = {
8393
- model: options.model ?? DEFAULT_MODEL2,
8394
- timeoutMs: options.timeoutMs ?? DEFAULT_TIMEOUT2,
8395
- maxSourceChars: options.maxSourceChars ?? DEFAULT_MAX_SOURCE2,
8396
- maxPerFileChars: options.maxPerFileChars ?? DEFAULT_MAX_PER_FILE2,
8397
- maxHtmlChars: options.maxHtmlChars ?? DEFAULT_MAX_HTML2,
6256
+ model: options.model ?? DEFAULT_MODEL,
6257
+ timeoutMs: options.timeoutMs ?? DEFAULT_TIMEOUT,
6258
+ maxSourceChars: options.maxSourceChars ?? DEFAULT_MAX_SOURCE,
6259
+ maxPerFileChars: options.maxPerFileChars ?? DEFAULT_MAX_PER_FILE,
6260
+ maxHtmlChars: options.maxHtmlChars ?? DEFAULT_MAX_HTML,
8398
6261
  llm: options.llm ?? {}
8399
6262
  };
8400
6263
  if (input.sourceFiles.length === 0 && !input.servedHtml) {
@@ -8418,7 +6281,7 @@ async function runIntentMatchJudge(input, options = {}) {
8418
6281
  role: "system",
8419
6282
  content: "You are a holistic code reviewer answering one question: did the agent build the right app for the user. Return strict JSON. No prose outside."
8420
6283
  },
8421
- { role: "user", content: buildPrompt2(input, opts) }
6284
+ { role: "user", content: buildPrompt(input, opts) }
8422
6285
  ],
8423
6286
  jsonSchema: { name: "intent_match_judge", schema: INTENT_SCHEMA },
8424
6287
  temperature: 0,
@@ -8900,8 +6763,8 @@ function multiToolchainLayer(config) {
8900
6763
  }
8901
6764
 
8902
6765
  // src/reference-replay.ts
8903
- import { appendFileSync as appendFileSync3, existsSync as existsSync7, mkdirSync as mkdirSync3, readFileSync as readFileSync6 } from "fs";
8904
- import { dirname as dirname3 } from "path";
6766
+ import { appendFileSync as appendFileSync2, existsSync as existsSync4, mkdirSync as mkdirSync2, readFileSync as readFileSync4 } from "fs";
6767
+ import { dirname as dirname2 } from "path";
8905
6768
  var DEFAULT_MATCH_THRESHOLD = 0.55;
8906
6769
  var ALL_SPLITS = ["train", "dev", "test", "holdout"];
8907
6770
  async function runReferenceReplay(cases, options) {
@@ -9019,14 +6882,14 @@ function jsonlReferenceReplayStore(path) {
9019
6882
  return {
9020
6883
  async save(run) {
9021
6884
  await lock.runExclusive(() => {
9022
- mkdirSync3(dirname3(path), { recursive: true });
9023
- appendFileSync3(path, `${JSON.stringify(run)}
6885
+ mkdirSync2(dirname2(path), { recursive: true });
6886
+ appendFileSync2(path, `${JSON.stringify(run)}
9024
6887
  `);
9025
6888
  });
9026
6889
  },
9027
6890
  async list() {
9028
6891
  return lock.runExclusive(() => {
9029
- if (!existsSync7(path)) return [];
6892
+ if (!existsSync4(path)) return [];
9030
6893
  return readJsonl(path);
9031
6894
  });
9032
6895
  }
@@ -9319,8 +7182,8 @@ function ratio(numerator, denominator) {
9319
7182
  return denominator > 0 ? numerator / denominator : 0;
9320
7183
  }
9321
7184
  function tokenJaccard(a, b) {
9322
- const left = new Set(tokens2(a));
9323
- const right = new Set(tokens2(b));
7185
+ const left = new Set(tokens(a));
7186
+ const right = new Set(tokens(b));
9324
7187
  if (left.size === 0 || right.size === 0) return 0;
9325
7188
  let intersection = 0;
9326
7189
  for (const token of left) {
@@ -9338,7 +7201,7 @@ function tagOverlap(a, b) {
9338
7201
  }
9339
7202
  return intersection / Math.max(left.size, right.size);
9340
7203
  }
9341
- function tokens2(text) {
7204
+ function tokens(text) {
9342
7205
  return normalize(text).split(/\s+/).filter((token) => token.length >= 3 && !STOP_WORDS.has(token));
9343
7206
  }
9344
7207
  function normalize(text) {
@@ -9369,7 +7232,7 @@ function throwIfAborted(signal) {
9369
7232
  throw new Error(signal.reason ? String(signal.reason) : "reference replay aborted");
9370
7233
  }
9371
7234
  function readJsonl(path) {
9372
- const raw = readFileSync6(path, "utf8");
7235
+ const raw = readFileSync4(path, "utf8");
9373
7236
  const out = [];
9374
7237
  for (const line of raw.split("\n")) {
9375
7238
  const trimmed = line.trim();
@@ -9526,7 +7389,7 @@ function createDefaultReviewer(options) {
9526
7389
 
9527
7390
  // src/discover-personas.ts
9528
7391
  import { promises as fs } from "fs";
9529
- import { basename, extname, join as join4 } from "path";
7392
+ import { basename, extname, join as join3 } from "path";
9530
7393
  var DEFAULT_PATTERN = /^\d{2}-.+\.(yaml|yml|json|md)$/;
9531
7394
  async function discoverPersonas(dir, opts = {}) {
9532
7395
  const pattern = opts.pattern ?? DEFAULT_PATTERN;
@@ -9544,7 +7407,7 @@ async function discoverPersonas(dir, opts = {}) {
9544
7407
  }
9545
7408
  const out = [];
9546
7409
  for (const entry of entries) {
9547
- const full = join4(d, entry.name);
7410
+ const full = join3(d, entry.name);
9548
7411
  if (entry.isDir) {
9549
7412
  if (opts.recursive) out.push(...await walk(full));
9550
7413
  continue;
@@ -10371,9 +8234,9 @@ function jaccard(a, b) {
10371
8234
  }
10372
8235
 
10373
8236
  // src/campaign/distillation/gold-scenarios.ts
10374
- import { readFileSync as readFileSync7 } from "fs";
8237
+ import { readFileSync as readFileSync5 } from "fs";
10375
8238
  function loadGoldScenarios(jsonlPath) {
10376
- const text = readFileSync7(jsonlPath, "utf8");
8239
+ const text = readFileSync5(jsonlPath, "utf8");
10377
8240
  return parseGoldJsonl(text, jsonlPath);
10378
8241
  }
10379
8242
  function parseGoldJsonl(text, sourceLabel = "<inline>") {
@@ -10718,7 +8581,6 @@ function sectionHash(section) {
10718
8581
  }
10719
8582
  export {
10720
8583
  AGENT_PROFILE_KINDS,
10721
- ANALYST_SEVERITIES,
10722
8584
  AgentDriver,
10723
8585
  AgentEvalError,
10724
8586
  AgentProfileCellValidationError,
@@ -10757,13 +8619,10 @@ export {
10757
8619
  ExperimentTracker,
10758
8620
  FAILURE_CLASSES,
10759
8621
  FAILURE_MODE_KIND_SPEC,
10760
- FINDING_SUBJECT_GRAMMAR_PROMPT,
10761
- FINDING_SUBJECT_KINDS,
10762
8622
  FileSystemExperimentStore,
10763
8623
  FileSystemFeedbackTrajectoryStore,
10764
8624
  FileSystemRawProviderSink,
10765
8625
  FileSystemTraceStore,
10766
- FindingSubjectStringSchema,
10767
8626
  FindingsStore,
10768
8627
  HeldOutGate,
10769
8628
  HoldoutAuditor,
@@ -10777,7 +8636,6 @@ export {
10777
8636
  InMemoryWorkspaceInspector,
10778
8637
  JudgeError,
10779
8638
  JudgeRunner,
10780
- KIND_EXPECTED_SUBJECTS,
10781
8639
  KNOWLEDGE_GAP_KIND_SPEC,
10782
8640
  KNOWLEDGE_POISONING_KIND_SPEC,
10783
8641
  LlmCallError,
@@ -10796,10 +8654,8 @@ export {
10796
8654
  PairwiseSteeringOptimizer,
10797
8655
  ProductClient,
10798
8656
  PromptRegistry,
10799
- RAW_FINDING_SCHEMA_PROMPT,
10800
8657
  REDACTION_VERSION,
10801
8658
  RESEARCH_REPORT_HARD_PAIR_FLOOR,
10802
- RawAnalystFindingSchema,
10803
8659
  ReplayCache,
10804
8660
  ReplayCacheMissError,
10805
8661
  ReplayError,
@@ -10840,6 +8696,8 @@ export {
10840
8696
  analyzeTraces,
10841
8697
  appendScorecard,
10842
8698
  argHash,
8699
+ asNumber,
8700
+ asString,
10843
8701
  assertCrossFamily,
10844
8702
  assertLlmRoute,
10845
8703
  assertRealBackend,
@@ -10859,15 +8717,14 @@ export {
10859
8717
  bootstrapCi,
10860
8718
  buildAgentProfileCell,
10861
8719
  buildAgreementJudge,
8720
+ buildDefaultAnalystRegistry,
10862
8721
  buildDriverSystemPrompt,
10863
8722
  buildReflectionPrompt,
10864
8723
  buildReviewerPrompt,
10865
8724
  buildSandboxAgentProfileCell,
10866
- buildSkillUsageReport,
10867
8725
  buildTraceAnalystTools,
10868
8726
  buildTraceInsightContext,
10869
8727
  buildTraceInsightPrompt,
10870
- buildTraceToolsForGroup,
10871
8728
  buildTrajectory,
10872
8729
  byteLengthRange,
10873
8730
  calibrateJudge,
@@ -10898,6 +8755,7 @@ export {
10898
8755
  composeValidators,
10899
8756
  computeFindingId,
10900
8757
  computeToolUseMetrics,
8758
+ computeTraceMetrics,
10901
8759
  confidenceInterval,
10902
8760
  containsAll,
10903
8761
  continuousAgreement,
@@ -10906,26 +8764,21 @@ export {
10906
8764
  controlRunToRunRecord,
10907
8765
  corpusInterRaterAgreement,
10908
8766
  corpusInterRaterAgreementFromJudgeScores,
8767
+ createAnalystAi,
10909
8768
  createAntiSlopJudge,
10910
- createChatClient,
10911
8769
  createCustomJudge,
10912
8770
  createDefaultReviewer,
10913
8771
  createDomainExpertJudge,
10914
8772
  createFeedbackTrajectory,
10915
8773
  createIntentMatchJudge,
10916
- createJudgeAdapter,
10917
8774
  createLlmCorrectnessChecker,
10918
8775
  createLlmReviewer,
10919
8776
  createOtelExporter,
10920
8777
  createOtelTracingStore,
10921
8778
  createReplayFetch,
10922
- createRunCriticAdapter,
10923
8779
  createSandboxPool,
10924
8780
  createSemanticConceptJudge,
10925
- createSemanticConceptJudgeAdapter,
10926
- createTraceAnalystAdapter,
10927
8781
  createTraceAnalystKind,
10928
- createVerifierAdapter,
10929
8782
  crossTraceDiff,
10930
8783
  crowdingDistance,
10931
8784
  decideNextUserTurn,
@@ -10946,7 +8799,6 @@ export {
10946
8799
  distillPlaybook,
10947
8800
  domainEvidencePattern,
10948
8801
  dominates,
10949
- emitSkillUsageFindings,
10950
8802
  estimateCost,
10951
8803
  estimateTokens,
10952
8804
  euAiActReport,
@@ -10962,6 +8814,7 @@ export {
10962
8814
  exportRunAsOtlp,
10963
8815
  extractAssetUrls,
10964
8816
  extractErrorCount,
8817
+ extractOtlpAttributes,
10965
8818
  extractProducedState,
10966
8819
  feedbackTrajectoriesToDatasetScenarios,
10967
8820
  feedbackTrajectoriesToOptimizerRows,
@@ -10975,6 +8828,8 @@ export {
10975
8828
  findFallbackToPass,
10976
8829
  findLiteralTruePass,
10977
8830
  findSkipCountsAsPass,
8831
+ firstNumberAttr,
8832
+ firstStringAttr,
10978
8833
  flattenOtlpExportToNdjson,
10979
8834
  flowLayer,
10980
8835
  formatBenchmarkReport,
@@ -10995,6 +8850,7 @@ export {
10995
8850
  inMemoryReferenceReplayStore,
10996
8851
  inMemoryReviewStore,
10997
8852
  inferDomainKeywords,
8853
+ inferOtlpKind,
10998
8854
  interRaterReliability,
10999
8855
  interpretCliffs,
11000
8856
  iqr,
@@ -11018,7 +8874,6 @@ export {
11018
8874
  judgeSpans,
11019
8875
  keyPreserved,
11020
8876
  knowledgeReadinessTracePayload,
11021
- liftSeverity,
11022
8877
  linterJudge,
11023
8878
  llmSpanFromProvider,
11024
8879
  llmSpans,
@@ -11038,6 +8893,8 @@ export {
11038
8893
  notBlocked,
11039
8894
  objectiveEval,
11040
8895
  otelRunCompleteHook,
8896
+ otlpToRunRecords,
8897
+ otlpToTraceRunRecords,
11041
8898
  pairedBootstrap,
11042
8899
  pairedEvalueSequence,
11043
8900
  pairedMde,
@@ -11049,9 +8906,7 @@ export {
11049
8906
  paretoFrontierWithCrowding,
11050
8907
  parseCorrectnessResponse,
11051
8908
  parseFeedbackTrajectoriesJsonl,
11052
- parseFindingSubject,
11053
8909
  parseGoldJsonl,
11054
- parseRawFinding,
11055
8910
  parseReflectionResponse,
11056
8911
  parseRunRecordSafe,
11057
8912
  partialCredit,
@@ -11063,11 +8918,13 @@ export {
11063
8918
  printDriverSummary,
11064
8919
  probeLlm,
11065
8920
  profile_exports as profile,
8921
+ projectOtlpFlatLine,
11066
8922
  promptBisect,
11067
8923
  proposeAutomatedPullRequest,
11068
8924
  proposeSynthesisTargets,
11069
8925
  providerFromBaseUrl,
11070
8926
  pytestTestParser,
8927
+ readOtlpStatus,
11071
8928
  recordRuns,
11072
8929
  recordRunsToScorecard,
11073
8930
  redTeamDataset,
@@ -11078,7 +8935,6 @@ export {
11078
8935
  referenceReplayScenarioToRunScore,
11079
8936
  regexMatch,
11080
8937
  regexMatches,
11081
- renderFindingSubject,
11082
8938
  renderMarkdown,
11083
8939
  renderMarkdownReport,
11084
8940
  renderPlaybookMarkdown,
@@ -11142,6 +8998,7 @@ export {
11142
8998
  statusAdvanced,
11143
8999
  stopOnNoProgress,
11144
9000
  stopOnRepeatedAction,
9001
+ stringField,
11145
9002
  stripFencedJson,
11146
9003
  subjectiveEval,
11147
9004
  summarize,