@tangle-network/agent-eval 0.20.10 → 0.20.12

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (52)
  1. package/README.md +129 -126
  2. package/dist/benchmarks/index.d.ts +2 -1
  3. package/dist/{chunk-JAOLXRIA.js → chunk-75MCTH7P.js} +8 -2
  4. package/dist/chunk-75MCTH7P.js.map +1 -0
  5. package/dist/chunk-HKYRWNHV.js +1354 -0
  6. package/dist/chunk-HKYRWNHV.js.map +1 -0
  7. package/dist/{chunk-LSR4IAYN.js → chunk-HNJLMAJ2.js} +2 -2
  8. package/dist/chunk-IKFVX537.js +717 -0
  9. package/dist/chunk-IKFVX537.js.map +1 -0
  10. package/dist/chunk-KWUAAIHR.js +1764 -0
  11. package/dist/chunk-KWUAAIHR.js.map +1 -0
  12. package/dist/chunk-MCMV7DUL.js +1310 -0
  13. package/dist/chunk-MCMV7DUL.js.map +1 -0
  14. package/dist/chunk-ODFINDLQ.js +413 -0
  15. package/dist/chunk-ODFINDLQ.js.map +1 -0
  16. package/dist/chunk-PKCVBYTQ.js +200 -0
  17. package/dist/chunk-PKCVBYTQ.js.map +1 -0
  18. package/dist/chunk-YUFXO3TU.js +148 -0
  19. package/dist/chunk-YUFXO3TU.js.map +1 -0
  20. package/dist/cli.js +2 -2
  21. package/dist/control-C8NKbF3w.d.ts +258 -0
  22. package/dist/control.d.ts +5 -0
  23. package/dist/control.js +30 -0
  24. package/dist/control.js.map +1 -0
  25. package/dist/dataset-B9qvlm_o.d.ts +112 -0
  26. package/dist/emitter-BYO2nSDA.d.ts +387 -0
  27. package/dist/feedback-trajectory-BGQ_ANCN.d.ts +345 -0
  28. package/dist/{index-1PZOtZFr.d.ts → index-c5saLbKD.d.ts} +2 -133
  29. package/dist/index.d.ts +115 -2870
  30. package/dist/index.js +1049 -6156
  31. package/dist/index.js.map +1 -1
  32. package/dist/multi-shot-optimization-Bvtz294B.d.ts +598 -0
  33. package/dist/openapi.json +1 -1
  34. package/dist/optimization.d.ts +145 -0
  35. package/dist/optimization.js +60 -0
  36. package/dist/optimization.js.map +1 -0
  37. package/dist/reporting.d.ts +426 -0
  38. package/dist/reporting.js +32 -0
  39. package/dist/reporting.js.map +1 -0
  40. package/dist/run-record-CX_jcAyr.d.ts +134 -0
  41. package/dist/traces.d.ts +658 -0
  42. package/dist/traces.js +100 -0
  43. package/dist/traces.js.map +1 -0
  44. package/dist/wire/index.js +2 -2
  45. package/docs/concepts.md +16 -11
  46. package/docs/feature-guide.md +10 -17
  47. package/docs/integration-launch-gates.md +77 -0
  48. package/docs/product-eval-adoption.md +221 -0
  49. package/docs/trace-analysis.md +75 -0
  50. package/package.json +21 -1
  51. package/dist/chunk-JAOLXRIA.js.map +0 -1
  52. package/dist/{chunk-LSR4IAYN.js.map → chunk-HNJLMAJ2.js.map} +0 -0
package/dist/index.d.ts CHANGED
@@ -1,7 +1,22 @@
1
1
  import { TCloud } from '@tangle-network/tcloud';
2
- import { R as RunRecord, a as RunSplitTag } from './index-1PZOtZFr.js';
3
- export { B as BENCHMARK_SPLIT_SEED, b as BenchmarkAdapter, c as BenchmarkDatasetItem, d as BenchmarkEvaluation, e as RunJudgeMetadata, f as RunOutcome, g as RunRecordValidationError, h as RunTokenUsage, i as benchmarkDeterministicSplit, j as benchmarks, k as isRunRecord, p as parseRunRecordSafe, r as roundTripRunRecord, v as validateRunRecord } from './index-1PZOtZFr.js';
4
- import { AxAIService, AxFunction } from '@ax-llm/ax';
2
+ import { ReleaseConfidenceThresholds, ReleaseConfidenceScorecard } from './reporting.js';
3
+ export { BootstrapOptions, BootstrapResult, GainDistributionBin, GainDistributionFigureSpec, GainDistributionOptions, JudgeReplayGateArgs, PairedBootstrapOptions, PairedBootstrapResult, ParetoFigureSpec, ParetoPoint, ReleaseConfidenceAxis, ReleaseConfidenceAxisName, ReleaseConfidenceInput, ReleaseConfidenceIssue, ReleaseConfidenceMetrics, ReleaseConfidenceStatus, ReleaseTraceEvidence, RenderReleaseReportOptions, SummaryTable, SummaryTableOptions, SummaryTableRow, Verdict, assertReleaseConfidence, bhAdjust, bootstrapCi, evaluateReleaseConfidence, gainHistogram, judgeReplayGate, pairedBootstrap, pairedWilcoxon, paretoChart, releaseTraceEvidenceFromMultiShotTrials, renderReleaseReport, summaryTable } from './reporting.js';
4
+ import { F as FeedbackLabel, a as FeedbackTrajectoryStore, b as FeedbackTrajectory, C as ControlSeverity, c as ControlEvalResult } from './feedback-trajectory-BGQ_ANCN.js';
5
+ export { d as ControlActionFailureMode, e as ControlActionOutcome, f as ControlBudget, g as ControlContext, h as ControlDecision, i as ControlRunResult, j as ControlRuntimeConfig, k as ControlRuntimeError, l as ControlStep, m as ControlStopPolicies, n as FeedbackArtifactType, o as FeedbackAttempt, p as FeedbackLabelKind, q as FeedbackLabelSource, r as FeedbackOptimizerRow, s as FeedbackOutcome, t as FeedbackReplayAdapter, u as FeedbackReplayResult, v as FeedbackSeverity, w as FeedbackSplitPolicy, x as FeedbackTask, y as FeedbackTrajectoryFilter, z as FileSystemFeedbackTrajectoryStore, I as InMemoryFeedbackTrajectoryStore, P as PreferenceMemoryEntry, A as ProposedSideEffect, S as StopDecision, B as allCriticalPassed, D as assignFeedbackSplit, E as controlRunToFeedbackTrajectory, G as createFeedbackTrajectory, H as feedbackTrajectoriesToDatasetScenarios, J as feedbackTrajectoriesToOptimizerRows, K as feedbackTrajectoryToDatasetScenario, L as feedbackTrajectoryToOptimizerRow, M as objectiveEval, N as parseFeedbackTrajectoriesJsonl, O as renderPreferenceMemoryMarkdown, Q as replayFeedbackTrajectories, R as replayFeedbackTrajectory, T as runAgentControlLoop, U as serializeFeedbackTrajectoriesJsonl, V as stopOnNoProgress, W as stopOnRepeatedAction, X as subjectiveEval, Y as summarizePreferenceMemory, Z as withAssignedFeedbackSplit } from './feedback-trajectory-BGQ_ANCN.js';
6
+ export { A as ActionExecutionPolicy, a as ActionPolicyDecision, C as ControlRunToRunRecordOptions, L as LlmJsonCall, b as LlmReviewerConfig, P as ProposeFn, c as ProposeInput, d as ProposeOutput, e as ProposeReviewConfig, f as ProposeReviewControlAction, g as ProposeReviewControlConfig, h as ProposeReviewControlResult, i as ProposeReviewControlState, j as ProposeReviewReport, k as ProposeReviewShot, R as Review, l as ReviewFn, m as ReviewInput, n as ReviewMemoryEntry, o as ReviewMemoryStore, p as RunEvidenceMetadata, V as Verification, q as VerifyFn, r as controlFailureClassFromVerification, s as controlRunToRunRecord, t as createLlmReviewer, u as evaluateActionPolicy, v as inMemoryReviewStore, w as jsonlReviewStore, x as runProposeReview, y as runProposeReviewAsControlLoop, z as scoreFromEvals } from './control-C8NKbF3w.js';
7
+ import { T as TraceEmitter, R as Run$1, S as Span, b as TraceEvent, A as Artifact$1, B as BudgetLedgerEntry, a as TraceStore, F as FailureClass, c as BudgetSpec, d as ToolSpan, e as RunFilter, L as LlmSpan, J as JudgeSpan } from './emitter-BYO2nSDA.js';
8
+ export { E as EventFilter, f as EventKind, g as FAILURE_CLASSES, h as FileSystemTraceStore, i as FileSystemTraceStoreOptions, G as GenericSpan, I as InMemoryTraceStore, M as Message, j as RetrievalSpan, k as RunLayer, l as RunStatus, m as SandboxSpan, n as SpanBase, o as SpanFilter, p as SpanHandle, q as SpanKind, r as SpanStatus, s as TRACE_SCHEMA_VERSION, t as TraceEmitterOptions, u as isJudgeSpan, v as isLlmSpan, w as isRetrievalSpan, x as isSandboxSpan, y as isToolSpan, z as llmSpanFromProvider } from './emitter-BYO2nSDA.js';
9
+ import { A as ActionableSideInfo, O as Objective, P as ParetoResult, T as TrialCache, a as TrialResult, E as EvolvableVariant, M as MutateAdapter, V as VariantAggregate } from './multi-shot-optimization-Bvtz294B.js';
10
+ export { b as AsiSeverity, D as DEFAULT_MUTATION_PRIMITIVES, c as Direction, G as GateDecision, d as GateEvidence, e as GenerationReport, H as HeldOutGate, f as HeldOutGateConfig, g as HeldOutGateRejectionCode, I as InMemoryTrialCache, h as MultiShotGateConfig, i as MultiShotGateResult, j as MultiShotMutateAdapter, k as MultiShotOptimizationConfig, l as MultiShotOptimizationResult, m as MultiShotRun, n as MultiShotRunInput, o as MultiShotRunner, p as MultiShotScore, q as MultiShotScorer, r as MultiShotSplit, s as MultiShotTrace, t as MultiShotTrialResult, u as MultiShotVariant, v as PromptEvolutionConfig, w as PromptEvolutionEvent, x as PromptEvolutionResult, R as ReflectionContext, y as ReflectionProposal, S as ScenarioAggregate, z as ScoreAdapter, B as TrialTrace, C as buildReflectionPrompt, F as crowdingDistance, J as defaultMultiShotObjectives, K as dominates, L as paretoFrontier, N as paretoFrontierWithCrowding, Q as parseReflectionResponse, U as runMultiShotOptimization, W as runPromptEvolution, X as scalarScore, Y as trialTraceFromMultiShotTrial } from './multi-shot-optimization-Bvtz294B.js';
11
+ import { llmSpans } from './traces.js';
12
+ export { AnalyzeTracesInput, AnalyzeTracesOptions, AnalyzeTracesResult, AnalyzeTracesTurnSnapshot, DEFAULT_REDACTION_RULES, DEFAULT_TRACE_ANALYST_BUDGETS, DatasetOverview, OTEL_AGENT_EVAL_SCOPE, OtlpExport, OtlpFileTraceStore, OtlpFileTraceStoreOptions, OtlpResourceSpans, OtlpSpan, QueryTracesPage, REDACTION_VERSION, RedactionReport, RedactionRule, SearchSpanResult, SearchTraceResult, SpanMatchRecord, SpanNotFoundError, TRACE_ANALYST_ACTOR_DESCRIPTION, TRACE_ANALYST_ACTOR_DESCRIPTION_VERSION, TRACE_ANALYST_SUBAGENT_DESCRIPTION, TRACE_ANALYST_TRUNCATION_MARKER_PREFIX, TraceAnalysisStore, TraceAnalystByteBudgets, TraceAnalystFilters, TraceAnalystSpan, TraceAnalystSpanKind, TraceAnalystSpanStatus, TraceAnalystTraceSummary, TraceFileMissingError, TraceInsightContext, TraceInsightFinding, TraceInsightPanelRole, TraceInsightPromptInput, TraceInsightQualityGate, TraceInsightQuestion, TraceInsightReadiness, TraceInsightSuite, TraceInsightTask, TraceNotFoundError, ViewSpansResult, ViewTraceOversized, ViewTraceResult, aggregateLlm, analyzeTraces, argHash, buildTraceAnalystTools, buildTraceInsightContext, buildTraceInsightPrompt, defaultTraceInsightPanel, describeTraceInsightScope, domainEvidencePattern, exportRunAsOtlp, groupBy, inferDomainKeywords, judgeSpans, planTraceInsightQuestions, redactString, redactValue, runFailureClass, runsForScenario, scoreTraceInsightReadiness, tokenizeDomainWords, toolSpans, traceAnalystFunctionGroup } from './traces.js';
13
+ import { a as DatasetScenario, b as Dataset, c as DatasetManifest } from './dataset-B9qvlm_o.js';
14
+ export { d as DatasetDifficulty, e as DatasetProvenance, D as DatasetSplit, H as HoldoutLockedError, S as SliceOptions, h as hashScenarios } from './dataset-B9qvlm_o.js';
15
+ import { a as RunRecord } from './run-record-CX_jcAyr.js';
16
+ export { b as RunJudgeMetadata, c as RunOutcome, d as RunRecordValidationError, R as RunSplitTag, e as RunTokenUsage, i as isRunRecord, p as parseRunRecordSafe, r as roundTripRunRecord, v as validateRunRecord } from './run-record-CX_jcAyr.js';
17
+ export { CallbackResearcher, CallbackResearcherOptions, ExperimentPlan, ExperimentResult, FailureMode, NoopResearcher, Researcher, SteeringChange } from './optimization.js';
18
+ export { B as BENCHMARK_SPLIT_SEED, a as BenchmarkAdapter, b as BenchmarkDatasetItem, c as BenchmarkEvaluation, d as benchmarkDeterministicSplit, i as benchmarks } from './index-c5saLbKD.js';
19
+ import '@ax-llm/ax';
5
20
 
6
21
  interface Scenario {
7
22
  id: string;
@@ -323,1525 +338,6 @@ declare class ProductClient {
323
338
  */
324
339
  declare function runE2EWorkflow(client: ProductClient, name: string, workflow: (client: ProductClient) => Promise<CheckResult[]>): Promise<TestResult>;
325
340
 
326
- /**
327
- * Dataset — versioned, sliceable, content-hashed scenario collection.
328
- *
329
- * Scenarios stop being ephemeral arrays and become first-class
330
- * artifacts. Every Dataset carries:
331
- * - content hash (sha256 over canonicalized scenario array)
332
- * - provenance (contributor, createdAt, sourceUrl)
333
- * - split labels (train | dev | test | holdout)
334
- * - difficulty tiers (easy | medium | hard | extreme)
335
- * - tags (free-form, per-scenario)
336
- *
337
- * `Dataset.slice({ difficulty, split, holdout, seed })` returns a
338
- * deterministic, reproducible subset. Holdout slices are locked: you
339
- * can read them but `mutate` throws, which prevents "oh I'll just
340
- * tweak that one scenario" contamination drift.
341
- */
342
- type DatasetSplit = 'train' | 'dev' | 'test' | 'holdout';
343
- type DatasetDifficulty = 'easy' | 'medium' | 'hard' | 'extreme';
344
- interface DatasetScenario {
345
- id: string;
346
- /** Arbitrary payload; the framework doesn't interpret it. */
347
- payload: unknown;
348
- split?: DatasetSplit;
349
- difficulty?: DatasetDifficulty;
350
- /** Canary token that MUST NOT round-trip through a correct agent output. */
351
- canary?: string;
352
- /**
353
- * Behavioral-canary forbidden pattern. A string OR a serialized regex
354
- * (`/.../flags`) that the agent under test MUST NOT emit. Used by
355
- * {@link import('./canary').checkBehavioralCanary | checkBehavioralCanary},
356
- * which inverts the contamination-style semantic: presence in the
357
- * agent output is a LEAK / failure, not a positive signal.
358
- *
359
- * Falls back to {@link canary} when omitted.
360
- */
361
- forbiddenPattern?: string;
362
- tags?: Record<string, string>;
363
- }
364
- interface DatasetProvenance {
365
- contributor?: string;
366
- createdAt: string;
367
- sourceUrl?: string;
368
- license?: string;
369
- description?: string;
370
- /** Monotonic human-readable version (e.g. "2026.04.20"). */
371
- version: string;
372
- }
373
- interface DatasetManifest {
374
- name: string;
375
- provenance: DatasetProvenance;
376
- /** sha256 hex over canonicalized scenarios. */
377
- contentHash: string;
378
- scenarioCount: number;
379
- splitCounts: Record<DatasetSplit, number>;
380
- }
381
- interface SliceOptions {
382
- split?: DatasetSplit;
383
- difficulty?: DatasetDifficulty;
384
- /** Number of scenarios (random sample, seeded). Omit to take all that match. */
385
- limit?: number;
386
- seed?: number;
387
- /** Predicate narrowing. Applied after split/difficulty filters. */
388
- filter?: (scenario: DatasetScenario) => boolean;
389
- /** If true, include scenarios marked as holdout. Default false. */
390
- includeHoldout?: boolean;
391
- }
392
- /** Locked holdouts — throws on mutate. Callers that need a mutable dataset fork it. */
393
- declare class HoldoutLockedError extends Error {
394
- constructor(datasetName: string);
395
- }
396
- declare class Dataset {
397
- readonly name: string;
398
- readonly provenance: DatasetProvenance;
399
- private scenarios;
400
- private locked;
401
- constructor(init: {
402
- name: string;
403
- provenance: DatasetProvenance;
404
- scenarios: DatasetScenario[];
405
- locked?: boolean;
406
- });
407
- /** All scenarios. Readonly — callers must go through `slice` or `clone`. */
408
- all(): readonly DatasetScenario[];
409
- get size(): number;
410
- /**
411
- * Deterministic sliced subset. Seed is REQUIRED when `limit` is set so
412
- * the same arguments always produce the same slice across machines.
413
- */
414
- slice(options?: SliceOptions): DatasetScenario[];
415
- /**
416
- * Assemble the manifest (name + provenance + content hash + counts).
417
- * Content hash is deterministic over canonicalized scenarios.
418
- */
419
- manifest(): Promise<DatasetManifest>;
420
- /** Fresh unlocked copy — for post-release forks when mutation is needed. */
421
- clone(overrides?: Partial<{
422
- name: string;
423
- version: string;
424
- }>): Dataset;
425
- lock(): void;
426
- add(scenario: DatasetScenario): void;
427
- remove(scenarioId: string): void;
428
- /**
429
- * Stable JSON-Lines serialization — deterministic byte-for-byte.
430
- * Write to disk for contamination-verifiable archives.
431
- */
432
- toJsonl(): string;
433
- static fromJsonl(jsonl: string, manifest: Omit<DatasetManifest, 'contentHash' | 'scenarioCount' | 'splitCounts'>): Dataset;
434
- }
435
- declare function hashScenarios(scenarios: DatasetScenario[]): Promise<string>;
436
-
437
- /**
438
- * HeldOutGate — first-class held-out paired-delta promotion gate.
439
- *
440
- * Encodes the "honesty override" pattern that lived inline in
441
- * `~/webb/redteam/scripts/agent-eval-autoresearch.ts:138–171`.
442
- * The optimizer's best-guess is one thing; what we should actually
443
- * ship is another. The gate is the line between them.
444
- *
445
- * A candidate is promoted iff ALL three pass:
446
- *
447
- * 1. **Productive runs**: the candidate has at least
448
- * `minProductiveRuns` paired observations on items where BOTH
449
- * candidate and baseline produced a real (non-silent) score.
450
- * 2. **Paired delta**: the lower bound of the bootstrap CI on the
451
- * median per-item delta (candidate − baseline) on the HOLDOUT
452
- * split is strictly greater than `pairedDeltaThreshold`.
453
- * 3. **Overfit gap**: the candidate's gap between search-split
454
- * score and holdout-split score exceeds the baseline's gap by
455
- * no more than `overfitGapThreshold`.
456
- * "Better on search, worse on holdout" is the canonical
457
- * overfit pattern; this catches it.
458
- *
459
- * The decision carries a machine-readable `rejectionCode` plus an
460
- * `evidence` block with every number the gate looked at, so the
461
- * downstream researcher / paper / dashboard can re-derive the
462
- * verdict without re-running.
463
- *
464
- * See also:
465
- * - `src/paired-stats.ts` for `pairedBootstrap` + `pairedWilcoxon`
466
- * - `src/run-record.ts` for the input row schema
467
- * - `src/reference-replay.ts` for the older, reference-replay-
468
- * specific promotion path (still useful for replay-style evals).
469
- */
470
-
471
- type HeldOutGateRejectionCode = 'few_runs' | 'negative_delta' | 'overfit_gap';
472
- interface HeldOutGateConfig {
473
- /** Minimum number of paired (candidate, baseline) holdout observations
474
- * required before the gate will even consider promoting. Default 3. */
475
- minProductiveRuns?: number;
476
- /** The bootstrap-CI lower bound on the median paired holdout delta
477
- * must exceed this to promote. Default 0. */
478
- pairedDeltaThreshold?: number;
479
- /** Maximum allowed worsening of (search − holdout) gap relative to
480
- * baseline. Default 0.15 (i.e. candidate may overfit by up to 15
481
- * absolute score points more than baseline before rejection). */
482
- overfitGapThreshold?: number;
483
- /** Stable label of the baseline candidate. Required — paper-grade
484
- * evaluation never compares two unlabelled candidates. */
485
- baselineKey: string;
486
- /** Confidence level for the bootstrap CI. Default 0.95. */
487
- confidence?: number;
488
- /** Bootstrap resamples. Default 2000. */
489
- bootstrapResamples?: number;
490
- /** Optional deterministic seed for the bootstrap. Default undefined
491
- * (Math.random). */
492
- seed?: number;
493
- }
494
- interface GateEvidence {
495
- /** Number of paired (candidate, baseline) holdout observations used. */
496
- productiveRuns: number;
497
- /** Median of (candidate − baseline) paired holdout deltas. */
498
- medianPairedDelta: number;
499
- /** Bootstrap CI on the median paired holdout delta. */
500
- pairedCI: {
501
- low: number;
502
- high: number;
503
- };
504
- /** Wilcoxon signed-rank p-value on the paired holdout deltas. */
505
- pairedPValue: number;
506
- /** Mean candidate score on the search split (NaN if none). */
507
- searchScore: number;
508
- /** Mean candidate score on the holdout split (NaN if none). */
509
- holdoutScore: number;
510
- /** Candidate (search − holdout) gap. */
511
- overfitGap: number;
512
- /** Baseline (search − holdout) gap. */
513
- baselineOverfitGap: number;
514
- }
515
- interface GateDecision {
516
- /** Final promote/no-promote verdict. */
517
- promote: boolean;
518
- /** The candidate that was evaluated. */
519
- candidateId: string;
520
- /** The baseline it was compared against. */
521
- baselineId: string;
522
- /** Every number the gate looked at, for audit + paper export. */
523
- evidence: GateEvidence;
524
- /** Human-readable reason. */
525
- reason: string;
526
- /** Machine-readable rejection code, or null on promote. */
527
- rejectionCode: HeldOutGateRejectionCode | null;
528
- }
529
- /**
530
- * Held-out paired-delta promotion gate. Construct once with config,
531
- * call `evaluate(candidateRuns, baselineRuns)` per (candidate,
532
- * baseline) pair. Stateless across calls.
533
- */
534
- declare class HeldOutGate {
535
- private readonly minProductiveRuns;
536
- private readonly pairedDeltaThreshold;
537
- private readonly overfitGapThreshold;
538
- private readonly baselineKey;
539
- private readonly confidence;
540
- private readonly resamples;
541
- private readonly seed?;
542
- constructor(config: HeldOutGateConfig);
543
- /** Decide whether `candidate` should replace `baseline`. Pairing
544
- * is by (experimentId, seed) — identical experiment + seed pairs
545
- * the candidate run with the matching baseline run. Pairs without
546
- * a holdout score on both sides are dropped. */
547
- evaluate(candidate: RunRecord[], baseline: RunRecord[]): GateDecision;
548
- }
549
-
550
- /**
551
- * Pareto frontier — multi-objective optimization over candidate runs.
552
- *
553
- * Lifted from ADC pareto.ts and blueprint-agent frontier.ts. When you're
554
- * trading off (cost, latency, quality) or (passRate, tokenBudget,
555
- * ttfb), you rarely have a single "winner" — you have a set of
556
- * non-dominated candidates. This module exposes:
557
- *
558
- * - `paretoFrontier`: filter a set of candidates to the non-dominated ones
559
- * - `dominates`: does A dominate B across all objectives?
560
- *
561
- * Each objective is declared with a direction: 'maximize' (higher=better)
562
- * or 'minimize' (lower=better). Candidates are any object; pass an
563
- * `objective(candidate)` accessor.
564
- */
565
- type Direction = 'maximize' | 'minimize';
566
- interface Objective<T> {
567
- /** Stable label used in reports. */
568
- name: string;
569
- direction: Direction;
570
- value: (candidate: T) => number;
571
- }
572
- interface ParetoResult<T> {
573
- frontier: T[];
574
- dominated: T[];
575
- /** Index map: each entry's `dominator` (a frontier member) dominates every candidate in its `dominated` list. */
576
- dominanceMap: Array<{
577
- dominator: T;
578
- dominated: T[];
579
- }>;
580
- }
581
- /** Does candidate A weakly dominate B — strictly better on at least one objective and no worse on any? */
582
- declare function dominates<T>(a: T, b: T, objectives: Objective<T>[]): boolean;
583
- /**
584
- * Compute the non-dominated frontier. Candidates with NaN/Infinity on any
585
- * objective are excluded (can't rank them). A candidate enters the frontier
586
- * iff no other candidate dominates it.
587
- */
588
- declare function paretoFrontier<T>(candidates: T[], objectives: Objective<T>[]): ParetoResult<T>;
589
- /**
590
- * Weighted-sum scalarisation. Use as a tie-break / single-winner selector
591
- * when callers don't want to consume a frontier. Each objective contributes
592
- * its normalised value (0..1 via min-max across the candidate pool) times
593
- * its weight; missing weights default to 1/N.
594
- *
595
- * Direction is honoured automatically — `minimize` axes have their values
596
- * inverted before scaling so "higher scalar = better" always holds.
597
- */
598
- declare function scalarScore<T>(candidates: T[], objectives: Objective<T>[], options?: {
599
- weights?: Partial<Record<string, number>>;
600
- }): Array<{
601
- candidate: T;
602
- score: number;
603
- }>;
604
- /**
605
- * NSGA-II crowding distance — secondary sort for ties on the frontier.
606
- *
607
- * When the Pareto front collapses to a single point (or many candidates tie
608
- * on dominance), naive selection picks arbitrarily and the population
609
- * degenerates over generations. NSGA-II preserves diversity by preferring
610
- * candidates with more empty space around them on the frontier.
611
- *
612
- * Returns an array of `{ candidate, distance }` in the SAME order as the
613
- * input. Higher distance = more isolated = should be preferred when
614
- * preserving diversity.
615
- */
616
- declare function crowdingDistance<T>(candidates: T[], objectives: Objective<T>[]): Array<{
617
- candidate: T;
618
- distance: number;
619
- }>;
620
- /**
621
- * Pareto frontier with tie-break by crowding distance — the canonical
622
- * NSGA-II selection step. Returns the frontier sorted by descending crowding
623
- * distance so callers can `.slice(0, k)` to pick K diverse winners.
624
- */
625
- declare function paretoFrontierWithCrowding<T>(candidates: T[], objectives: Objective<T>[]): Array<{
626
- candidate: T;
627
- distance: number;
628
- }>;
629
-
630
- /**
631
- * PromptEvolutionLoop — population-based reflective-mutation evolution.
632
- *
633
- * Above the existing `AxGepaSteeringOptimizer` (which RANKS variants),
634
- * this loop GENERATES variants. Each generation:
635
- * 1. Score the population across (variant × scenario × rep).
636
- * 2. Pick survivors from the Pareto frontier (with crowding-distance tie-break).
637
- * 3. Ask the mutator for replacements until population size is restored.
638
- * 4. Repeat for N generations OR until convergence.
639
- *
640
- * Domain-agnostic. Consumers supply:
641
- * - A seed population of `EvolvableVariant`s.
642
- * - A `ScoreAdapter` that runs (variant, scenario, rep) → `TrialResult`.
643
- * - A `MutateAdapter` that produces children given trace evidence.
644
- * - Pareto `Objective<TrialAggregate>[]` defining the multi-objective vector.
645
- *
646
- * The loop owns: population management, parallel scheduling (concurrency-
647
- * limited), Pareto selection with crowding distance, generation reporting.
648
- *
649
- * It does NOT own: rendering trials to a model, executing prompts, choosing
650
- * mutation primitives, persisting to disk. Those are the consumer's call.
651
- */
652
-
653
- interface EvolvableVariant<P = unknown> {
654
- /** Stable id for the variant — surfaces in reports and trial results. */
655
- id: string;
656
- /** Variant payload — interpretation is the consumer's responsibility. */
657
- payload: P;
658
- /** Generation index (0 = seed, then 1, 2, ...). */
659
- generation: number;
660
- /** Parent variant id when produced via mutation; absent for seeds. */
661
- parentId?: string;
662
- /** Human label for reports. */
663
- label: string;
664
- /** What the mutator was trying to fix. */
665
- rationale?: string;
666
- }
667
- interface TrialResult {
668
- variantId: string;
669
- scenarioId: string;
670
- rep: number;
671
- ok: boolean;
672
- /** Primary scalar score the consumer cares about (e.g., recall, accuracy). */
673
- score: number;
674
- /** Token cost (or any cost-like dimension). */
675
- cost?: number;
676
- /** Wall time in ms. */
677
- durationMs?: number;
678
- /** Free-form metric bag for objective accessors. */
679
- metrics?: Record<string, number>;
680
- error?: string;
681
- }
682
- /** Aggregated trial summary for one (variant, scenario) pair across reps. */
683
- interface ScenarioAggregate {
684
- variantId: string;
685
- scenarioId: string;
686
- meanScore: number;
687
- meanCost: number;
688
- meanDurationMs: number;
689
- okRate: number;
690
- trials: number;
691
- /** Mean of every numeric metric across reps. */
692
- metrics: Record<string, number>;
693
- }
694
- /** Aggregated trial summary for one variant across all scenarios. */
695
- interface VariantAggregate {
696
- variantId: string;
697
- meanScore: number;
698
- meanCost: number;
699
- meanDurationMs: number;
700
- okRate: number;
701
- scenarios: ScenarioAggregate[];
702
- /** Mean of every numeric metric, averaged across scenarios. */
703
- metrics: Record<string, number>;
704
- }
705
- interface ScoreAdapter<P = unknown> {
706
- score(args: {
707
- variant: EvolvableVariant<P>;
708
- scenarioId: string;
709
- rep: number;
710
- }): Promise<TrialResult>;
711
- }
712
- interface MutateAdapter<P = unknown> {
713
- mutate(args: {
714
- parent: EvolvableVariant<P>;
715
- parentAggregate: VariantAggregate;
716
- topTrials: TrialResult[];
717
- bottomTrials: TrialResult[];
718
- childCount: number;
719
- generation: number;
720
- }): Promise<EvolvableVariant<P>[]>;
721
- }
722
- interface PromptEvolutionConfig<P = unknown> {
723
- runId: string;
724
- /** What component is being mutated — surfaces in reports + reflection prompts. */
725
- target: string;
726
- seedVariants: EvolvableVariant<P>[];
727
- scenarioIds: string[];
728
- reps: number;
729
- generations: number;
730
- populationSize: number;
731
- /** Maximum concurrent score() calls. */
732
- scoreConcurrency: number;
733
- scoreAdapter: ScoreAdapter<P>;
734
- mutateAdapter: MutateAdapter<P>;
735
- /** Pareto objectives over `VariantAggregate`. Ordered by importance. */
736
- objectives: Objective<VariantAggregate>[];
737
- /** Optional weights for the scalar tie-break selector (by objective name). */
738
- scalarWeights?: Record<string, number>;
739
- /** Stop early if a generation produces no Pareto improvement. Default true. */
740
- earlyStopOnNoImprovement?: boolean;
741
- onProgress?: (event: PromptEvolutionEvent) => void;
742
- /**
743
- * Optional cache for memoising scored (variantId, scenarioId, rep)
744
- * tuples. When a cache instance is passed, repeated trials
745
- * skip re-scoring. Cache keys are stable across runs.
746
- */
747
- cache?: TrialCache;
748
- }
749
- interface TrialCache {
750
- get(key: string): TrialResult | undefined;
751
- set(key: string, value: TrialResult): void;
752
- }
753
- declare class InMemoryTrialCache implements TrialCache {
754
- private store;
755
- get(key: string): TrialResult | undefined;
756
- set(key: string, value: TrialResult): void;
757
- size(): number;
758
- clear(): void;
759
- }
760
- type PromptEvolutionEvent = {
761
- type: 'generation-start';
762
- generation: number;
763
- populationSize: number;
764
- } | {
765
- type: 'trial-complete';
766
- generation: number;
767
- variantId: string;
768
- scenarioId: string;
769
- rep: number;
770
- ok: boolean;
771
- score: number;
772
- cached: boolean;
773
- } | {
774
- type: 'generation-complete';
775
- report: GenerationReport<unknown>;
776
- } | {
777
- type: 'converged';
778
- generation: number;
779
- reason: string;
780
- };
781
- interface GenerationReport<P = unknown> {
782
- runId: string;
783
- target: string;
784
- generation: number;
785
- variants: EvolvableVariant<P>[];
786
- aggregates: VariantAggregate[];
787
- /** Frontier candidates, sorted by descending crowding distance. */
788
- paretoFrontIds: string[];
789
- /** Scalar-best variant id — used for the single "winner" if callers want one. */
790
- winnerId: string;
791
- /** Trials that fed this generation (kept for downstream reporting). */
792
- trials: TrialResult[];
793
- }
794
- interface PromptEvolutionResult<P = unknown> {
795
- runId: string;
796
- target: string;
797
- generations: GenerationReport<P>[];
798
- /** Best variant by scalar score in the final generation. */
799
- bestVariant: EvolvableVariant<P>;
800
- /** Best aggregate (matches bestVariant). */
801
- bestAggregate: VariantAggregate;
802
- }
803
- declare function runPromptEvolution<P>(config: PromptEvolutionConfig<P>): Promise<PromptEvolutionResult<P>>;
804
-
805
- /**
806
- * Reflective mutation — primitives for trace-conditioned prompt rewriting.
807
- *
808
- * Used by `prompt-evolution.ts` (and any consumer running iterative
809
- * improvement). Given a parent prompt + concrete trace evidence (top trials,
810
- * bottom trials, missed expectations), produce an LLM-ready prompt that
811
- * proposes targeted mutations — not blind rephrasings.
812
- *
813
- * Why this lives outside `prompt-evolution.ts`: any consumer that wants to
814
- * run reflective rewriting WITHOUT the population/Pareto machinery can
815
- * import these primitives directly.
816
- *
817
- * Quality bar (vs. naive "mutate this prompt"):
818
- * - Show parent ↔ children diff, not just one variant
819
- * - Quote specific missed goldens with their match phrases
820
- * - Surface the model's actual emitted output side-by-side with what was expected
821
- * - Quote concrete mutation primitives so the model has a vocabulary
822
- */
823
- interface TrialTrace {
824
- /** Stable id for the trial — surfaces in the prompt for grounding. */
825
- id: string;
826
- /** Score the trial received on its primary metric. */
827
- score: number;
828
- /** Candidate inputs the agent was given (e.g., the fixture or scenario). */
829
- inputName?: string;
830
- /**
831
- * Goldens / expectations this trial was tested against, with whether each
832
- * was matched. The reflection prompt quotes the missed ones specifically.
833
- */
834
- expectations?: Array<{
835
- id: string;
836
- phrase: string;
837
- matched: boolean;
838
- }>;
839
- /** Free-form text — what the agent actually emitted (e.g., findings, plan). */
840
- emitted?: string;
841
- /** Optional structured metrics (recall, precision, cost, latency). */
842
- metrics?: Record<string, number>;
843
- }
844
- interface ReflectionContext {
845
- /** What is being mutated — appears in the system prompt for orientation. */
846
- target: string;
847
- /** Current variant's payload — JSON-serialised for the prompt. */
848
- parentPayload: unknown;
849
- /** Best-performing trials this generation. */
850
- topTrials: TrialTrace[];
851
- /** Worst-performing trials this generation — the missed-golden source. */
852
- bottomTrials: TrialTrace[];
853
- /** How many children the mutator should propose. */
854
- childCount: number;
855
- /** Optional: domain-specific mutation primitives the model can pick from. */
856
- mutationPrimitives?: string[];
857
- }
858
- declare const DEFAULT_MUTATION_PRIMITIVES: string[];
859
- /**
860
- * Build the LLM-ready reflection prompt. Output is plain text — pass it as
861
- * the user message. The system message should be small and stable (e.g.
862
- * "Output ONLY a JSON object matching the schema below.").
863
- */
864
- declare function buildReflectionPrompt(ctx: ReflectionContext): string;
865
- interface ReflectionProposal {
866
- label: string;
867
- rationale: string;
868
- payload: unknown;
869
- }
870
- declare function parseReflectionResponse(raw: string, maxProposals?: number): ReflectionProposal[];
871
-
872
- /**
873
- * Multi-shot optimization adapter.
874
- *
875
- * This is the canonical bridge between variable-length agent trajectories
876
- * and `runPromptEvolution`. Apps provide four things:
877
- *
878
- * - variants: prompt/config/tool-policy candidates
879
- * - runner: executes one full task trajectory for a variant
880
- * - scorer: turns that trajectory into score + actionable side information
881
- * - mutator: proposes new variants from top/bottom scored trials
882
- *
883
- * The adapter owns the boring but easy-to-get-wrong glue: stable seeds,
884
- * score/cost objectives, error-to-trial conversion, ASI metric projection,
885
- * and optional paired holdout gating via `HeldOutGate`.
886
- */
887
-
888
- type MultiShotSplit = 'search' | 'dev' | 'holdout';
889
- type AsiSeverity = 'info' | 'warning' | 'error' | 'critical';
890
- type MultiShotVariant<P = unknown> = EvolvableVariant<P>;
891
- interface ActionableSideInfo {
892
- /** Stable expectation/check id when available. */
893
- expectationId?: string;
894
- /** Human-readable diagnosis of what happened. */
895
- message: string;
896
- severity?: AsiSeverity;
897
- /** Concrete trace excerpt, file path, tool call, screenshot id, etc. */
898
- evidence?: string;
899
- /** Prompt/tool/context surface likely responsible. */
900
- responsibleSurface?: string;
901
- /** Suggested fix in natural language. */
902
- suggestion?: string;
903
- /** Whether this expectation was satisfied. Defaults to false for ASI rows. */
904
- matched?: boolean;
905
- metadata?: Record<string, unknown>;
906
- }
907
- interface MultiShotTrace {
908
- scenarioId: string;
909
- /** Full turn/tool trace. Shape is intentionally app-owned. */
910
- turns?: unknown[];
911
- toolCalls?: unknown[];
912
- artifacts?: unknown[];
913
- /** Compact final output or summary used by reflection prompts. */
914
- transcript?: string;
915
- output?: unknown;
916
- metadata?: Record<string, unknown>;
917
- }
918
- interface MultiShotRun {
919
- trace: MultiShotTrace;
920
- costUsd?: number;
921
- durationMs?: number;
922
- tokenUsage?: {
923
- input?: number;
924
- output?: number;
925
- cached?: number;
926
- };
927
- metadata?: Record<string, unknown>;
928
- }
929
- interface MultiShotRunInput<P = unknown> {
930
- variant: EvolvableVariant<P>;
931
- scenarioId: string;
932
- rep: number;
933
- split: MultiShotSplit;
934
- /** Stable paired seed for baseline/candidate comparisons. */
935
- seed: number;
936
- }
937
- interface MultiShotRunner<P = unknown> {
938
- run(input: MultiShotRunInput<P>): Promise<MultiShotRun> | MultiShotRun;
939
- }
940
- interface MultiShotScore {
941
- /** Primary score in [0,1]. The adapter clamps for safety. */
942
- score: number;
943
- /** Pass/fail for top/bottom trial selection. Defaults to true. */
944
- ok?: boolean;
945
- costUsd?: number;
946
- durationMs?: number;
947
- metrics?: Record<string, number>;
948
- asi?: ActionableSideInfo[];
949
- /** Optional rich output shown to reflection mutators. */
950
- emitted?: string;
951
- metadata?: Record<string, unknown>;
952
- }
953
- interface MultiShotScorer<P = unknown> {
954
- score(input: MultiShotRunInput<P> & {
955
- run: MultiShotRun;
956
- }): Promise<MultiShotScore> | MultiShotScore;
957
- }
958
- interface MultiShotTrialResult extends TrialResult {
959
- split: MultiShotSplit;
960
- seed: number;
961
- trace?: MultiShotTrace;
962
- asi?: ActionableSideInfo[];
963
- emitted?: string;
964
- metadata?: Record<string, unknown>;
965
- }
966
- interface MultiShotMutateAdapter<P = unknown> {
967
- mutate(args: {
968
- parent: EvolvableVariant<P>;
969
- parentAggregate: VariantAggregate;
970
- topTrials: MultiShotTrialResult[];
971
- bottomTrials: MultiShotTrialResult[];
972
- childCount: number;
973
- generation: number;
974
- }): Promise<EvolvableVariant<P>[]>;
975
- }
976
- interface MultiShotGateConfig<P = unknown> {
977
- /** Search rows are optional, but enable HeldOutGate's overfit-gap check. */
978
- searchScenarioIds?: string[];
979
- holdoutScenarioIds: string[];
980
- reps?: number;
981
- gate: HeldOutGateConfig;
982
- /** Convert scored trajectory runs into paper-grade RunRecords. */
983
- toRunRecord(input: {
984
- variant: EvolvableVariant<P>;
985
- scenarioId: string;
986
- rep: number;
987
- split: RunSplitTag;
988
- seed: number;
989
- trial: MultiShotTrialResult;
990
- }): RunRecord;
991
- }
992
- interface MultiShotOptimizationConfig<P = unknown> {
993
- runId: string;
994
- target: string;
995
- seedVariants: EvolvableVariant<P>[];
996
- searchScenarioIds: string[];
997
- reps: number;
998
- generations: number;
999
- populationSize: number;
1000
- scoreConcurrency?: number;
1001
- runner: MultiShotRunner<P>;
1002
- scorer: MultiShotScorer<P>;
1003
- mutateAdapter: MultiShotMutateAdapter<P>;
1004
- objectives?: Objective<VariantAggregate>[];
1005
- scalarWeights?: Record<string, number>;
1006
- cache?: TrialCache;
1007
- earlyStopOnNoImprovement?: boolean;
1008
- seedBase?: number;
1009
- onProgress?: (event: PromptEvolutionEvent) => void;
1010
- gate?: MultiShotGateConfig<P>;
1011
- }
1012
- interface MultiShotGateResult {
1013
- decision: GateDecision;
1014
- candidateRuns: RunRecord[];
1015
- baselineRuns: RunRecord[];
1016
- }
1017
- interface MultiShotOptimizationResult<P = unknown> {
1018
- evolution: PromptEvolutionResult<P>;
1019
- /** Best candidate on the optimizer-visible search split. */
1020
- searchBestVariant: EvolvableVariant<P>;
1021
- searchBestAggregate: VariantAggregate;
1022
- /** Variant callers should actually ship after optional holdout gating. */
1023
- promotedVariant: EvolvableVariant<P>;
1024
- promotedAggregate: VariantAggregate;
1025
- /** Null when no gate was configured or the search-best candidate was the baseline. */
1026
- gate: MultiShotGateResult | null;
1027
- }
1028
- declare function runMultiShotOptimization<P>(config: MultiShotOptimizationConfig<P>): Promise<MultiShotOptimizationResult<P>>;
1029
- declare function defaultMultiShotObjectives(): Objective<VariantAggregate>[];
1030
- declare function trialTraceFromMultiShotTrial(trial: MultiShotTrialResult): TrialTrace;
1031
-
1032
- /**
1033
- * Release confidence gate.
1034
- *
1035
- * This is the production-facing composition layer over the lower-level
1036
- * primitives:
1037
- * - Dataset manifests prove corpus/version coverage.
1038
- * - RunRecord rows prove reproducible search/holdout outcomes.
1039
- * - Multi-shot trace evidence carries turn counts and ASI diagnostics.
1040
- * - HeldOutGate decisions remain the paired promotion authority.
1041
- *
1042
- * The gate is intentionally pure and conservative. Missing declared evidence
1043
- * fails closed instead of being treated as a neutral zero.
1044
- */
1045
-
1046
- type ReleaseConfidenceStatus = 'pass' | 'warn' | 'fail';
1047
- type ReleaseConfidenceAxisName = 'corpus' | 'quality' | 'generalization' | 'diagnostics' | 'efficiency';
1048
- interface ReleaseTraceEvidence {
1049
- scenarioId: string;
1050
- candidateId?: string;
1051
- split?: RunSplitTag;
1052
- score?: number;
1053
- ok?: boolean;
1054
- turnCount?: number;
1055
- costUsd?: number;
1056
- durationMs?: number;
1057
- failureMode?: string;
1058
- asi?: ActionableSideInfo[];
1059
- metadata?: Record<string, unknown>;
1060
- }
1061
- interface ReleaseConfidenceThresholds {
1062
- /** Require a Dataset manifest or explicit scenarios. Default true. */
1063
- requireCorpus?: boolean;
1064
- minScenarioCount?: number;
1065
- minSearchRuns?: number;
1066
- minHoldoutRuns?: number;
1067
- /** Require at least one holdout scenario/run. Default true. */
1068
- requireHoldout?: boolean;
1069
- minPassRate?: number;
1070
- minMeanScore?: number;
1071
- /** Search mean may exceed holdout mean by at most this much. */
1072
- maxOverfitGap?: number;
1073
- maxMeanCostUsd?: number;
1074
- maxP95WallMs?: number;
1075
- /** Low-score/failed rows must carry ASI. Default true. */
1076
- requireAsiForFailures?: boolean;
1077
- /** Score below this is considered a failure for ASI coverage. Default 0.5. */
1078
- failureScoreThreshold?: number;
1079
- }
1080
- interface ReleaseConfidenceInput {
1081
- target: string;
1082
- candidateId?: string;
1083
- baselineId?: string;
1084
- dataset?: DatasetManifest;
1085
- scenarios?: readonly DatasetScenario[];
1086
- runs?: readonly RunRecord[];
1087
- traces?: readonly ReleaseTraceEvidence[];
1088
- gateDecision?: GateDecision | null;
1089
- thresholds?: ReleaseConfidenceThresholds;
1090
- }
1091
- interface ReleaseConfidenceAxis {
1092
- name: ReleaseConfidenceAxisName;
1093
- status: ReleaseConfidenceStatus;
1094
- score: number;
1095
- detail: string;
1096
- }
1097
- interface ReleaseConfidenceIssue {
1098
- axis: ReleaseConfidenceAxisName;
1099
- severity: 'critical' | 'warning';
1100
- code: string;
1101
- detail: string;
1102
- }
1103
- interface ReleaseConfidenceMetrics {
1104
- scenarioCount: number;
1105
- searchRuns: number;
1106
- holdoutRuns: number;
1107
- passRate: number;
1108
- meanScore: number;
1109
- searchMeanScore: number;
1110
- holdoutMeanScore: number;
1111
- overfitGap: number;
1112
- meanCostUsd: number;
1113
- p95WallMs: number;
1114
- failedRows: number;
1115
- failuresWithAsi: number;
1116
- singleShotTraces: number;
1117
- multiShotTraces: number;
1118
- splitCounts: Record<DatasetSplit, number>;
1119
- domainCounts: Record<string, number>;
1120
- failureModeCounts: Record<string, number>;
1121
- responsibleSurfaceCounts: Record<string, number>;
1122
- }
1123
- interface ReleaseConfidenceScorecard {
1124
- target: string;
1125
- candidateId: string | null;
1126
- baselineId: string | null;
1127
- status: ReleaseConfidenceStatus;
1128
- promote: boolean;
1129
- axes: ReleaseConfidenceAxis[];
1130
- issues: ReleaseConfidenceIssue[];
1131
- metrics: ReleaseConfidenceMetrics;
1132
- dataset: DatasetManifest | null;
1133
- gateDecision: GateDecision | null;
1134
- summary: string;
1135
- }
1136
- declare function releaseTraceEvidenceFromMultiShotTrials(trials: readonly MultiShotTrialResult[]): ReleaseTraceEvidence[];
1137
- declare function evaluateReleaseConfidence(input: ReleaseConfidenceInput): ReleaseConfidenceScorecard;
1138
- declare function assertReleaseConfidence(input: ReleaseConfidenceInput): ReleaseConfidenceScorecard;
1139
-
1140
- /**
1141
- * TraceSchema v1 — the canonical data model for agent-eval.
1142
- *
1143
- * Every score, every failure class, every pipeline in the framework is
1144
- * a view over this data. Shape it once, live with it.
1145
- *
1146
- * Wire-compatible with OpenTelemetry span semantics (see trace/otel.ts)
1147
- * but extended with agent-specific span kinds (llm, tool, retrieval,
1148
- * judge, sandbox) and first-class BudgetLedger / Artifact / JudgeVerdict
1149
- * entities that OTEL leaves as free-form attributes.
1150
- */
1151
- declare const TRACE_SCHEMA_VERSION = "1.0.0";
1152
- type RunStatus = 'running' | 'completed' | 'failed' | 'aborted';
1153
- interface BudgetSpec {
1154
- tokens?: number;
1155
- wallMs?: number;
1156
- calls?: number;
1157
- usd?: number;
1158
- }
1159
- interface RunOutcome {
1160
- score?: number;
1161
- pass?: boolean;
1162
- failureClass?: FailureClass;
1163
- notes?: string;
1164
- }
1165
- /**
1166
- * Layer — optional classification in a nested build workflow.
1167
- * `builder`: the meta-agent editing a project (e.g. agent-builder Forge chat).
1168
- * `app-build`: sandbox harness that compiled + tested the generated scaffold.
1169
- * `app-runtime`: a run of the generated agent against a domain scenario.
1170
- * `meta`: any meta-eval (judge replay, correlation analysis).
1171
- */
1172
- type RunLayer = 'builder' | 'app-build' | 'app-runtime' | 'meta' | 'custom';
1173
- interface Run$1 {
1174
- runId: string;
1175
- scenarioId: string;
1176
- variantId?: string;
1177
- datasetVersion?: string;
1178
- /** Git SHA of agent code at run time. */
1179
- codeSha?: string;
1180
- /** Hash of the prompt template + any system prompt. */
1181
- promptSha?: string;
1182
- /** Model id + date + system-prompt hash, concatenated. */
1183
- modelFingerprint?: string;
1184
- seed?: number;
1185
- /** Arbitrary environment markers (shell, docker version, tz). */
1186
- envFingerprint?: Record<string, string>;
1187
- /** Version of the redaction rules applied to this run. */
1188
- redactionVersion?: string;
1189
- /** Parent run in a nested build workflow. A builder run's children are
1190
- * app-build runs; those children are app-runtime runs. */
1191
- parentRunId?: string;
1192
- /** Stable project identifier — groups runs across chats + sessions. */
1193
- projectId?: string;
1194
- /** Chat/conversation identifier within a project. */
1195
- chatId?: string;
1196
- /** Layer classification — hint for aggregation; not enforced. */
1197
- layer?: RunLayer;
1198
- startedAt: number;
1199
- endedAt?: number;
1200
- status: RunStatus;
1201
- outcome?: RunOutcome;
1202
- budget?: BudgetSpec;
1203
- /** Free-form labels for downstream grouping. */
1204
- tags?: Record<string, string>;
1205
- }
1206
- type SpanKind = 'agent' | 'llm' | 'tool' | 'retrieval' | 'judge' | 'sandbox' | 'custom';
1207
- type SpanStatus = 'ok' | 'error';
1208
- interface SpanBase {
1209
- spanId: string;
1210
- parentSpanId?: string;
1211
- runId: string;
1212
- kind: SpanKind;
1213
- name: string;
1214
- startedAt: number;
1215
- endedAt?: number;
1216
- status?: SpanStatus;
1217
- error?: string;
1218
- /** Anything not covered by typed fields. Kept deliberately free-form. */
1219
- attributes?: Record<string, unknown>;
1220
- }
1221
- interface Message {
1222
- role: 'system' | 'user' | 'assistant' | 'tool';
1223
- content: string;
1224
- tokens?: number;
1225
- /** Multi-modal content descriptors; blobs themselves live in Artifacts. */
1226
- images?: Array<{
1227
- artifactId?: string;
1228
- url?: string;
1229
- mime?: string;
1230
- }>;
1231
- }
1232
- interface LlmSpan extends SpanBase {
1233
- kind: 'llm';
1234
- model: string;
1235
- messages: Message[];
1236
- output?: string;
1237
- inputTokens?: number;
1238
- outputTokens?: number;
1239
- cachedTokens?: number;
1240
- reasoningTokens?: number;
1241
- costUsd?: number;
1242
- finishReason?: string;
1243
- }
1244
- interface ToolSpan extends SpanBase {
1245
- kind: 'tool';
1246
- toolName: string;
1247
- args: unknown;
1248
- result?: unknown;
1249
- latencyMs?: number;
1250
- }
1251
- interface RetrievalSpan extends SpanBase {
1252
- kind: 'retrieval';
1253
- query: string;
1254
- hits: Array<{
1255
- docId: string;
1256
- score: number;
1257
- content?: string;
1258
- }>;
1259
- }
1260
- interface JudgeSpan extends SpanBase {
1261
- kind: 'judge';
1262
- judgeId: string;
1263
- /** Span this judgment applies to. */
1264
- targetSpanId: string;
1265
- dimension: string;
1266
- /** Numeric score (free-range; interpretation up to the judge). */
1267
- score: number;
1268
- rationale?: string;
1269
- evidence?: string;
1270
- }
1271
- interface SandboxSpan extends SpanBase {
1272
- kind: 'sandbox';
1273
- image?: string;
1274
- command?: string;
1275
- exitCode?: number;
1276
- testsTotal?: number;
1277
- testsPassed?: number;
1278
- stdoutHash?: string;
1279
- stderrHash?: string;
1280
- /** Duration in ms; the harness fills this explicitly (endedAt - startedAt may miss setup). */
1281
- wallMs?: number;
1282
- }
1283
- interface GenericSpan extends SpanBase {
1284
- kind: 'agent' | 'custom';
1285
- }
1286
- type Span = LlmSpan | ToolSpan | RetrievalSpan | JudgeSpan | SandboxSpan | GenericSpan;
1287
- type EventKind = 'log' | 'error' | 'budget_decrement' | 'budget_breach' | 'state_mutation' | 'policy_violation' | 'redaction_applied' | 'custom';
1288
- interface TraceEvent {
1289
- eventId: string;
1290
- runId: string;
1291
- spanId?: string;
1292
- kind: EventKind;
1293
- timestamp: number;
1294
- payload: Record<string, unknown>;
1295
- }
1296
- interface BudgetLedgerEntry {
1297
- runId: string;
1298
- dimension: keyof BudgetSpec;
1299
- limit: number;
1300
- consumed: number;
1301
- remaining: number;
1302
- timestamp: number;
1303
- breached: boolean;
1304
- /** Span that triggered this entry, if any. */
1305
- spanId?: string;
1306
- }
1307
- interface Artifact$1 {
1308
- artifactId: string;
1309
- runId: string;
1310
- spanId?: string;
1311
- contentType: string;
1312
- sizeBytes: number;
1313
- /** sha256 in hex. */
1314
- hash: string;
1315
- /** External storage URL (R2, S3, filesystem path). */
1316
- storageUrl?: string;
1317
- /** Inline content for small blobs — keep under ~64KB. */
1318
- inlineContent?: string;
1319
- }
1320
- type FailureClass = 'success' | 'reasoning_error' | 'tool_selection_error' | 'tool_argument_error' | 'tool_recovery_failure' | 'hallucination' | 'instruction_following' | 'safety_refusal_miss' | 'policy_violation' | 'budget_exceeded' | 'format_drift' | 'permission_escalation' | 'pii_leak' | 'cost_overrun' | 'timeout' | 'sandbox_failure' | 'missing_user_data' | 'missing_domain_data' | 'missing_codebase_context' | 'missing_runtime_context' | 'missing_credentials' | 'stale_external_data' | 'bad_retrieval' | 'insufficient_evidence' | 'contradictory_evidence' | 'ambiguous_user_intent' | 'knowledge_readiness_blocked' | 'unknown';
1321
- declare const FAILURE_CLASSES: readonly FailureClass[];
1322
- declare function isLlmSpan(s: Span): s is LlmSpan;
1323
- declare function isToolSpan(s: Span): s is ToolSpan;
1324
- declare function isRetrievalSpan(s: Span): s is RetrievalSpan;
1325
- declare function isJudgeSpan(s: Span): s is JudgeSpan;
1326
- declare function isSandboxSpan(s: Span): s is SandboxSpan;
1327
-
1328
- interface RunFilter {
1329
- scenarioId?: string;
1330
- variantId?: string;
1331
- status?: RunStatus;
1332
- since?: number;
1333
- until?: number;
1334
- tag?: {
1335
- key: string;
1336
- value: string;
1337
- };
1338
- parentRunId?: string;
1339
- projectId?: string;
1340
- chatId?: string;
1341
- layer?: RunLayer;
1342
- }
1343
- interface SpanFilter {
1344
- runId?: string;
1345
- parentSpanId?: string;
1346
- kind?: SpanKind;
1347
- name?: string;
1348
- toolName?: string;
1349
- judgeId?: string;
1350
- since?: number;
1351
- until?: number;
1352
- }
1353
- interface EventFilter {
1354
- runId?: string;
1355
- spanId?: string;
1356
- kind?: EventKind;
1357
- since?: number;
1358
- until?: number;
1359
- }
1360
- interface TraceStore {
1361
- appendRun(run: Run$1): Promise<void>;
1362
- updateRun(runId: string, patch: Partial<Run$1>): Promise<void>;
1363
- appendSpan(span: Span): Promise<void>;
1364
- updateSpan(spanId: string, patch: Partial<Span>): Promise<void>;
1365
- appendEvent(event: TraceEvent): Promise<void>;
1366
- appendArtifact(artifact: Artifact$1): Promise<void>;
1367
- appendBudgetEntry(entry: BudgetLedgerEntry): Promise<void>;
1368
- getRun(runId: string): Promise<Run$1 | undefined>;
1369
- listRuns(filter?: RunFilter): Promise<Run$1[]>;
1370
- spans(filter?: SpanFilter): Promise<Span[]>;
1371
- events(filter?: EventFilter): Promise<TraceEvent[]>;
1372
- budget(runId: string): Promise<BudgetLedgerEntry[]>;
1373
- artifacts(runId: string): Promise<Artifact$1[]>;
1374
- }
1375
- declare class InMemoryTraceStore implements TraceStore {
1376
- private runs;
1377
- private allSpans;
1378
- private allEvents;
1379
- private allArtifacts;
1380
- private allBudget;
1381
- appendRun(run: Run$1): Promise<void>;
1382
- updateRun(runId: string, patch: Partial<Run$1>): Promise<void>;
1383
- appendSpan(span: Span): Promise<void>;
1384
- updateSpan(spanId: string, patch: Partial<Span>): Promise<void>;
1385
- appendEvent(event: TraceEvent): Promise<void>;
1386
- appendArtifact(artifact: Artifact$1): Promise<void>;
1387
- appendBudgetEntry(entry: BudgetLedgerEntry): Promise<void>;
1388
- getRun(runId: string): Promise<Run$1 | undefined>;
1389
- listRuns(filter?: RunFilter): Promise<Run$1[]>;
1390
- spans(filter?: SpanFilter): Promise<Span[]>;
1391
- events(filter?: EventFilter): Promise<TraceEvent[]>;
1392
- budget(runId: string): Promise<BudgetLedgerEntry[]>;
1393
- artifacts(runId: string): Promise<Artifact$1[]>;
1394
- }
1395
- interface FileSystemTraceStoreOptions {
1396
- dir: string;
1397
- /** Roll over NDJSON files when they exceed this size in bytes. Default 32 MB. */
1398
- maxBytes?: number;
1399
- }
1400
- declare class FileSystemTraceStore implements TraceStore {
1401
- private dir;
1402
- private maxBytes;
1403
- /** Lazy in-memory index for queries — populated on first read. */
1404
- private index?;
1405
- private loaded;
1406
- constructor(options: FileSystemTraceStoreOptions);
1407
- private ensureDir;
1408
- private append;
1409
- private insertInto;
1410
- private load;
1411
- appendRun(run: Run$1): Promise<void>;
1412
- updateRun(runId: string, patch: Partial<Run$1>): Promise<void>;
1413
- appendSpan(span: Span): Promise<void>;
1414
- updateSpan(spanId: string, patch: Partial<Span>): Promise<void>;
1415
- appendEvent(event: TraceEvent): Promise<void>;
1416
- appendArtifact(artifact: Artifact$1): Promise<void>;
1417
- appendBudgetEntry(entry: BudgetLedgerEntry): Promise<void>;
1418
- getRun(runId: string): Promise<Run$1 | undefined>;
1419
- listRuns(filter?: RunFilter): Promise<Run$1[]>;
1420
- spans(filter?: SpanFilter): Promise<Span[]>;
1421
- events(filter?: EventFilter): Promise<TraceEvent[]>;
1422
- budget(runId: string): Promise<BudgetLedgerEntry[]>;
1423
- artifacts(runId: string): Promise<Artifact$1[]>;
1424
- }
1425
-
1426
- /**
1427
- * TraceEmitter — hierarchical span builder that auto-parents using an
1428
- * internal stack. One emitter per Run; emitters do NOT share state.
1429
- *
1430
- * Convenience methods (`llm`, `tool`, `retrieval`, `judge`, `sandbox`)
1431
- * return a `SpanHandle` with `.end()` / `.fail()` so callers don't
1432
- * have to thread spanIds manually. For async workflows that can't use
1433
- * the stack (e.g. fan-out parallel calls), pass `parentSpanId`
1434
- * explicitly.
1435
- */
1436
-
1437
- interface SpanHandle<S extends Span = Span> {
1438
- span: S;
1439
- end(patch?: Partial<S>): Promise<void>;
1440
- fail(error: string | Error, patch?: Partial<S>): Promise<void>;
1441
- }
1442
- interface TraceEmitterOptions {
1443
- runId?: string;
1444
- /** Inject a clock for deterministic tests. */
1445
- now?: () => number;
1446
- /** Inject an id generator for deterministic tests. */
1447
- id?: () => string;
1448
- }
1449
- declare class TraceEmitter {
1450
- private store;
1451
- private stack;
1452
- private _runId;
1453
- private now;
1454
- private id;
1455
- constructor(store: TraceStore, options?: TraceEmitterOptions);
1456
- get runId(): string;
1457
- startRun(run: Omit<Run$1, 'runId' | 'startedAt' | 'status'>): Promise<Run$1>;
1458
- endRun(outcome?: RunOutcome): Promise<void>;
1459
- abortRun(reason: string): Promise<void>;
1460
- span<S extends Span = Span>(init: {
1461
- kind: SpanKind;
1462
- name: string;
1463
- parentSpanId?: string;
1464
- attributes?: Record<string, unknown>;
1465
- } & Partial<Omit<S, 'spanId' | 'runId' | 'startedAt' | 'kind' | 'name'>>): Promise<SpanHandle<S>>;
1466
- private handle;
1467
- private pop;
1468
- llm(init: Omit<LlmSpan, 'spanId' | 'runId' | 'kind' | 'startedAt'>): Promise<SpanHandle<LlmSpan>>;
1469
- tool(init: Omit<ToolSpan, 'spanId' | 'runId' | 'kind' | 'startedAt'>): Promise<SpanHandle<ToolSpan>>;
1470
- retrieval(init: Omit<RetrievalSpan, 'spanId' | 'runId' | 'kind' | 'startedAt'>): Promise<SpanHandle<RetrievalSpan>>;
1471
- recordJudge(verdict: Omit<JudgeSpan, 'spanId' | 'runId' | 'kind' | 'startedAt' | 'endedAt'>): Promise<JudgeSpan>;
1472
- sandbox(init: Omit<SandboxSpan, 'spanId' | 'runId' | 'kind' | 'startedAt'>): Promise<SpanHandle<SandboxSpan>>;
1473
- emit(event: {
1474
- kind: EventKind;
1475
- spanId?: string;
1476
- payload?: Record<string, unknown>;
1477
- }): Promise<TraceEvent>;
1478
- recordBudget(entry: Omit<BudgetLedgerEntry, 'runId' | 'timestamp'> & {
1479
- timestamp?: number;
1480
- }): Promise<BudgetLedgerEntry>;
1481
- recordArtifact(artifact: Omit<Artifact$1, 'artifactId' | 'runId'>): Promise<Artifact$1>;
1482
- /**
1483
- * Runs `fn` inside a span; auto-ends on success, auto-fails on throw.
1484
- * Returns the fn's return value. Use this for the 95% case.
1485
- */
1486
- within<T>(init: Parameters<TraceEmitter['span']>[0], fn: (handle: SpanHandle) => Promise<T>): Promise<T>;
1487
- }
1488
- /** Helper to build an LLM span handle args object from a provider-shaped response. */
1489
- declare function llmSpanFromProvider(args: {
1490
- name?: string;
1491
- model: string;
1492
- messages: Message[];
1493
- output: string;
1494
- usage?: {
1495
- inputTokens?: number;
1496
- outputTokens?: number;
1497
- cachedTokens?: number;
1498
- reasoningTokens?: number;
1499
- };
1500
- costUsd?: number;
1501
- finishReason?: string;
1502
- }): Omit<LlmSpan, 'spanId' | 'runId' | 'kind' | 'startedAt'>;
1503
-
1504
- /**
1505
- * Policy-based agent control runtime.
1506
- *
1507
- * This is the minimal reusable loop behind driver-agent patterns:
1508
- *
1509
- * observe state -> validate -> decide next action -> act -> observe -> ...
1510
- *
1511
- * It deliberately does not model named "topologies". Direct execution,
1512
- * critic/revise, driver intervention, specialist calls, and human escalation
1513
- * are all just actions chosen by the control policy.
1514
- */
1515
-
1516
- type ControlSeverity = 'info' | 'warning' | 'error' | 'critical';
1517
- type ControlActionFailureMode = 'continue' | 'stop';
1518
- interface ControlEvalResult {
1519
- /** Stable validator or judge id. */
1520
- id: string;
1521
- /** Whether this check passed. */
1522
- passed: boolean;
1523
- /** Optional normalized score. 1 = best, 0 = worst. */
1524
- score?: number;
1525
- /** Objective validators should usually be "error" or "critical" when failed. */
1526
- severity?: ControlSeverity;
1527
- /** Human-readable result. */
1528
- detail?: string;
1529
- /** Small evidence string or pointer. Avoid large payloads. */
1530
- evidence?: string;
1531
- /** True when the result came from deterministic state, not LLM judgment. */
1532
- objective?: boolean;
1533
- /** Structured details for downstream control policies and reports. */
1534
- metadata?: Record<string, unknown>;
1535
- }
1536
- interface ControlBudget {
1537
- maxSteps: number;
1538
- maxWallMs?: number;
1539
- maxCostUsd?: number;
1540
- }
1541
- interface ControlStopPolicies<TState, TAction> {
1542
- /**
1543
- * Stop after N consecutive steps with no state fingerprint change and
1544
- * less than `minScoreDelta` score movement. Disabled when omitted.
1545
- */
1546
- maxNoProgressSteps?: number;
1547
- /**
1548
- * Stop after the same action fingerprint is selected N consecutive
1549
- * times. Disabled when omitted.
1550
- */
1551
- maxRepeatedActions?: number;
1552
- /** Minimum score movement that counts as progress. Default 0.001. */
1553
- minScoreDelta?: number;
1554
- /** Override the default JSON/string fingerprint for state comparisons. */
1555
- stateFingerprint?: (state: TState) => string;
1556
- /** Override the default JSON/string fingerprint for repeated-action checks. */
1557
- actionFingerprint?: (action: TAction) => string;
1558
- }
1559
- interface ControlContext<TState, TAction, TActionResult, TEval extends ControlEvalResult = ControlEvalResult> {
1560
- intent: string;
1561
- state: TState;
1562
- evals: TEval[];
1563
- history: ControlStep<TState, TAction, TActionResult, TEval>[];
1564
- budget: ControlBudget;
1565
- stepIndex: number;
1566
- wallMs: number;
1567
- spentCostUsd: number;
1568
- remainingCostUsd?: number;
1569
- abortSignal: AbortSignal;
1570
- emitter?: TraceEmitter;
1571
- }
1572
- type ControlDecision<TAction> = {
1573
- type: 'continue';
1574
- action: TAction;
1575
- reason?: string;
1576
- } | {
1577
- type: 'stop';
1578
- reason: string;
1579
- pass?: boolean;
1580
- score?: number;
1581
- };
1582
- interface StopDecision {
1583
- stop: boolean;
1584
- pass: boolean;
1585
- reason: string;
1586
- score?: number;
1587
- failureClass?: FailureClass;
1588
- }
1589
- interface ControlActionOutcome<TActionResult> {
1590
- ok: boolean;
1591
- result?: TActionResult;
1592
- error?: string;
1593
- costUsd?: number;
1594
- durationMs: number;
1595
- }
1596
- interface ControlRuntimeError {
1597
- phase: 'observe' | 'validate' | 'decide' | 'act' | 'stop-policy' | 'on-step' | 'trace';
1598
- stepIndex: number;
1599
- message: string;
1600
- }
1601
- interface ControlStep<TState, TAction, TActionResult, TEval extends ControlEvalResult = ControlEvalResult> {
1602
- index: number;
1603
- decision: ControlDecision<TAction>;
1604
- beforeState: TState;
1605
- afterState: TState;
1606
- evalsBefore: TEval[];
1607
- evalsAfter: TEval[];
1608
- actionOutcome?: ControlActionOutcome<TActionResult>;
1609
- startedAt: string;
1610
- endedAt: string;
1611
- }
1612
- interface ControlRunResult<TState, TAction, TActionResult, TEval extends ControlEvalResult = ControlEvalResult> {
1613
- intent: string;
1614
- pass: boolean;
1615
- completed: boolean;
1616
- reason: string;
1617
- score?: number;
1618
- steps: ControlStep<TState, TAction, TActionResult, TEval>[];
1619
- finalState: TState | undefined;
1620
- finalEvals: TEval[];
1621
- wallMs: number;
1622
- spentCostUsd: number;
1623
- runId: string | null;
1624
- failureClass?: FailureClass;
1625
- runtimeErrors: ControlRuntimeError[];
1626
- stoppedBy: 'policy' | 'stop-policy' | 'budget' | 'abort' | 'runtime-error';
1627
- }
1628
- interface ControlRuntimeConfig<TState, TAction, TActionResult, TEval extends ControlEvalResult = ControlEvalResult> {
1629
- intent: string;
1630
- budget?: Partial<ControlBudget>;
1631
- signal?: AbortSignal;
1632
- /** Defaults to `continue`: action failures are recorded, then the policy gets another chance. */
1633
- actionFailure?: ControlActionFailureMode;
1634
- /**
1635
- * Extract cost from an action result. Used for `maxCostUsd` budget
1636
- * enforcement and trace budget ledger emission.
1637
- */
1638
- getActionCostUsd?: (ctx: {
1639
- action: TAction;
1640
- result: TActionResult;
1641
- state: TState;
1642
- evals: TEval[];
1643
- history: ControlStep<TState, TAction, TActionResult, TEval>[];
1644
- }) => number | undefined;
1645
- /** Read typed task/product state. Prefer structured state over transcript-only context. */
1646
- observe: (ctx: {
1647
- history: ControlStep<TState, TAction, TActionResult, TEval>[];
1648
- abortSignal: AbortSignal;
1649
- }) => Promise<TState> | TState;
1650
- /** Objective validators first, subjective judges only where objective state is insufficient. */
1651
- validate: (ctx: {
1652
- intent: string;
1653
- state: TState;
1654
- history: ControlStep<TState, TAction, TActionResult, TEval>[];
1655
- abortSignal: AbortSignal;
1656
- }) => Promise<TEval[]> | TEval[];
1657
- /** Choose the next control action. Can call a worker, ask user, run critic, inspect state, or stop. */
1658
- decide: (ctx: ControlContext<TState, TAction, TActionResult, TEval>) => Promise<ControlDecision<TAction>> | ControlDecision<TAction>;
1659
- /** Execute the action selected by the policy. */
1660
- act: (action: TAction, ctx: ControlContext<TState, TAction, TActionResult, TEval>) => Promise<TActionResult> | TActionResult;
1661
- /** Final stopping policy. Called before decide and after each action. */
1662
- shouldStop?: (ctx: ControlContext<TState, TAction, TActionResult, TEval>) => Promise<StopDecision> | StopDecision;
1663
- /** Optional hook for tracing or live progress updates. */
1664
- onStep?: (step: ControlStep<TState, TAction, TActionResult, TEval>) => Promise<void> | void;
1665
- /** Optional generic stuck-loop policies. Custom `shouldStop` still runs first. */
1666
- stopPolicies?: ControlStopPolicies<TState, TAction>;
1667
- /** Optional trace sink. Emits one run plus one span per control step. */
1668
- store?: TraceStore;
1669
- scenarioId?: string;
1670
- projectId?: string;
1671
- variantId?: string;
1672
- }
1673
- declare function runAgentControlLoop<TState, TAction, TActionResult, TEval extends ControlEvalResult = ControlEvalResult>(config: ControlRuntimeConfig<TState, TAction, TActionResult, TEval>): Promise<ControlRunResult<TState, TAction, TActionResult, TEval>>;
1674
- declare function stopOnNoProgress<TState, TAction>(maxNoProgressSteps: number, options?: Omit<ControlStopPolicies<TState, TAction>, 'maxNoProgressSteps'>): ControlStopPolicies<TState, TAction>;
1675
- declare function stopOnRepeatedAction<TState, TAction>(maxRepeatedActions: number, options?: Omit<ControlStopPolicies<TState, TAction>, 'maxRepeatedActions'>): ControlStopPolicies<TState, TAction>;
1676
- declare function objectiveEval(input: Omit<ControlEvalResult, 'objective'>): ControlEvalResult;
1677
- declare function subjectiveEval(input: Omit<ControlEvalResult, 'objective'>): ControlEvalResult;
1678
- declare function allCriticalPassed(evals: ControlEvalResult[]): boolean;
1679
-
1680
- type FeedbackArtifactType = 'text' | 'code' | 'plan' | 'research' | 'action' | 'ui' | 'decision' | 'data' | 'other';
1681
- type FeedbackLabelSource = 'user' | 'judge' | 'environment' | 'metric' | 'policy' | 'system';
1682
- type FeedbackLabelKind = 'approve' | 'reject' | 'select' | 'edit' | 'rank' | 'rate' | 'comment' | 'metric_outcome' | 'policy_block' | 'revision_request';
1683
- type FeedbackSeverity = 'info' | 'warning' | 'error' | 'critical';
1684
- interface FeedbackTask {
1685
- intent: string;
1686
- context?: unknown;
1687
- }
1688
- interface ProposedSideEffect {
1689
- type: string;
1690
- risk?: 'low' | 'medium' | 'high';
1691
- costUsd?: number;
1692
- externalSideEffect?: boolean;
1693
- requiresApproval?: boolean;
1694
- metadata?: Record<string, unknown>;
1695
- }
1696
- interface FeedbackLabel {
1697
- id?: string;
1698
- source: FeedbackLabelSource;
1699
- kind: FeedbackLabelKind;
1700
- value: unknown;
1701
- reason?: string;
1702
- severity?: FeedbackSeverity;
1703
- createdAt: string;
1704
- metadata?: Record<string, unknown>;
1705
- }
1706
- interface FeedbackAttempt {
1707
- id: string;
1708
- stepIndex: number;
1709
- artifactType: FeedbackArtifactType;
1710
- artifact: unknown;
1711
- options?: unknown[];
1712
- proposedAction?: ProposedSideEffect;
1713
- evals?: ControlEvalResult[];
1714
- feedback?: FeedbackLabel[];
1715
- createdAt: string;
1716
- metadata?: Record<string, unknown>;
1717
- }
1718
- interface FeedbackOutcome {
1719
- success?: boolean;
1720
- score?: number;
1721
- metrics?: Record<string, number>;
1722
- costUsd?: number;
1723
- detail?: string;
1724
- observedAt?: string;
1725
- metadata?: Record<string, unknown>;
1726
- }
1727
- interface FeedbackTrajectory {
1728
- id: string;
1729
- projectId?: string;
1730
- scenarioId?: string;
1731
- task: FeedbackTask;
1732
- attempts: FeedbackAttempt[];
1733
- labels: FeedbackLabel[];
1734
- outcome?: FeedbackOutcome;
1735
- split?: DatasetSplit;
1736
- tags?: Record<string, string>;
1737
- createdAt: string;
1738
- updatedAt?: string;
1739
- metadata?: Record<string, unknown>;
1740
- }
1741
- interface FeedbackTrajectoryStore {
1742
- save(trajectory: FeedbackTrajectory): Promise<void>;
1743
- get(id: string): Promise<FeedbackTrajectory | null>;
1744
- list(filter?: FeedbackTrajectoryFilter): Promise<FeedbackTrajectory[]>;
1745
- appendAttempt(id: string, attempt: FeedbackAttempt): Promise<FeedbackTrajectory>;
1746
- appendLabel(id: string, label: FeedbackLabel, attemptId?: string): Promise<FeedbackTrajectory>;
1747
- }
1748
- interface FeedbackTrajectoryFilter {
1749
- projectId?: string;
1750
- scenarioId?: string;
1751
- split?: DatasetSplit;
1752
- tag?: [string, string];
1753
- }
1754
- interface FeedbackSplitPolicy {
1755
- trainPct?: number;
1756
- devPct?: number;
1757
- testPct?: number;
1758
- holdoutPct?: number;
1759
- }
1760
- interface PreferenceMemoryEntry {
1761
- instruction: string;
1762
- rationale: string;
1763
- weight: number;
1764
- sourceTrajectoryId: string;
1765
- sourceLabelId?: string;
1766
- category?: string;
1767
- }
1768
- interface FeedbackOptimizerRow {
1769
- scenarioId: string;
1770
- trajectoryId: string;
1771
- labelKinds: FeedbackLabelKind[];
1772
- score?: number;
1773
- metadata?: Record<string, unknown>;
1774
- }
1775
- interface FeedbackReplayResult {
1776
- trajectoryId: string;
1777
- pass: boolean;
1778
- score?: number;
1779
- labels: FeedbackLabel[];
1780
- outcome?: FeedbackOutcome;
1781
- metadata?: Record<string, unknown>;
1782
- }
1783
- interface FeedbackReplayAdapter {
1784
- replay(trajectory: FeedbackTrajectory): Promise<Omit<FeedbackReplayResult, 'trajectoryId'>> | Omit<FeedbackReplayResult, 'trajectoryId'>;
1785
- }
1786
- declare class InMemoryFeedbackTrajectoryStore implements FeedbackTrajectoryStore {
1787
- private readonly trajectories;
1788
- save(trajectory: FeedbackTrajectory): Promise<void>;
1789
- get(id: string): Promise<FeedbackTrajectory | null>;
1790
- list(filter?: FeedbackTrajectoryFilter): Promise<FeedbackTrajectory[]>;
1791
- appendAttempt(id: string, attempt: FeedbackAttempt): Promise<FeedbackTrajectory>;
1792
- appendLabel(id: string, label: FeedbackLabel, attemptId?: string): Promise<FeedbackTrajectory>;
1793
- }
1794
- declare class FileSystemFeedbackTrajectoryStore implements FeedbackTrajectoryStore {
1795
- private readonly dir;
1796
- private readonly memory;
1797
- private loaded;
1798
- constructor(options: {
1799
- dir: string;
1800
- });
1801
- save(trajectory: FeedbackTrajectory): Promise<void>;
1802
- get(id: string): Promise<FeedbackTrajectory | null>;
1803
- list(filter?: FeedbackTrajectoryFilter): Promise<FeedbackTrajectory[]>;
1804
- appendAttempt(id: string, attempt: FeedbackAttempt): Promise<FeedbackTrajectory>;
1805
- appendLabel(id: string, label: FeedbackLabel, attemptId?: string): Promise<FeedbackTrajectory>;
1806
- private append;
1807
- private load;
1808
- }
1809
- declare function createFeedbackTrajectory(input: {
1810
- id?: string;
1811
- projectId?: string;
1812
- scenarioId?: string;
1813
- task: FeedbackTask;
1814
- attempts?: FeedbackAttempt[];
1815
- labels?: FeedbackLabel[];
1816
- outcome?: FeedbackOutcome;
1817
- split?: DatasetSplit;
1818
- tags?: Record<string, string>;
1819
- createdAt?: string;
1820
- metadata?: Record<string, unknown>;
1821
- }): FeedbackTrajectory;
1822
- declare function assignFeedbackSplit(trajectory: Pick<FeedbackTrajectory, 'id' | 'projectId' | 'scenarioId' | 'task'>, policy?: FeedbackSplitPolicy): DatasetSplit;
1823
- declare function withAssignedFeedbackSplit(trajectory: FeedbackTrajectory, policy?: FeedbackSplitPolicy): FeedbackTrajectory;
1824
- declare function feedbackTrajectoryToDatasetScenario(trajectory: FeedbackTrajectory): DatasetScenario;
1825
- declare function feedbackTrajectoriesToDatasetScenarios(trajectories: FeedbackTrajectory[]): DatasetScenario[];
1826
- declare function feedbackTrajectoryToOptimizerRow(trajectory: FeedbackTrajectory): FeedbackOptimizerRow;
1827
- declare function feedbackTrajectoriesToOptimizerRows(trajectories: FeedbackTrajectory[]): FeedbackOptimizerRow[];
1828
- declare function replayFeedbackTrajectory(trajectory: FeedbackTrajectory, adapter: FeedbackReplayAdapter): Promise<FeedbackReplayResult>;
1829
- declare function replayFeedbackTrajectories(trajectories: FeedbackTrajectory[], adapter: FeedbackReplayAdapter): Promise<FeedbackReplayResult[]>;
1830
- declare function summarizePreferenceMemory(trajectories: FeedbackTrajectory[], options?: {
1831
- maxEntries?: number;
1832
- }): PreferenceMemoryEntry[];
1833
- declare function renderPreferenceMemoryMarkdown(entries: PreferenceMemoryEntry[]): string;
1834
- declare function serializeFeedbackTrajectoriesJsonl(trajectories: FeedbackTrajectory[]): string;
1835
- declare function parseFeedbackTrajectoriesJsonl(jsonl: string): FeedbackTrajectory[];
1836
- declare function controlRunToFeedbackTrajectory<TState, TAction, TActionResult>(run: ControlRunResult<TState, TAction, TActionResult>, options?: {
1837
- projectId?: string;
1838
- scenarioId?: string;
1839
- artifactType?: FeedbackArtifactType;
1840
- artifactFromStep?: (step: ControlStep<TState, TAction, TActionResult>) => unknown;
1841
- proposedActionFromStep?: (step: ControlStep<TState, TAction, TActionResult>) => ProposedSideEffect | undefined;
1842
- createdAt?: string;
1843
- }): FeedbackTrajectory;
1844
-
1845
341
  interface LiveProofArtifact {
1846
342
  kind: string;
1847
343
  id?: string;
@@ -2181,32 +677,42 @@ declare function scoreKnowledgeReadiness(options: ScoreKnowledgeReadinessOptions
2181
677
  declare function blockingKnowledgeEval(report: KnowledgeReadinessReport, options?: {
2182
678
  id?: string;
2183
679
  minimumScore?: number;
680
+ emitter?: TraceEmitter;
2184
681
  }): ControlEvalResult;
682
+ declare function knowledgeReadinessTracePayload(report: KnowledgeReadinessReport, options?: {
683
+ passed?: boolean;
684
+ minimumScore?: number;
685
+ }): Record<string, unknown>;
2185
686
  declare function userQuestionsForKnowledgeGaps(gaps: KnowledgeRequirement[]): UserQuestion[];
2186
687
  declare function acquisitionPlansForKnowledgeGaps(gaps: KnowledgeRequirement[]): DataAcquisitionPlan[];
2187
688
 
2188
- interface ActionExecutionPolicy {
2189
- allowedTypes?: string[];
2190
- blockedTypes?: string[];
2191
- alwaysRequireApprovalTypes?: string[];
2192
- autoApproveTypes?: string[];
2193
- requireApprovalForExternalSideEffects?: boolean;
2194
- requireApprovalAboveCostUsd?: number;
2195
- maxActionCostUsd?: number;
2196
- remainingBudgetUsd?: number;
2197
- expectedOutcomeRequired?: boolean;
2198
- killCriteriaRequired?: boolean;
2199
- }
2200
- interface ActionPolicyDecision {
2201
- allowed: boolean;
2202
- blocked: boolean;
2203
- requiresApproval: boolean;
2204
- reasons: string[];
2205
- label?: FeedbackLabel;
2206
- }
2207
- declare function evaluateActionPolicy(action: ProposedSideEffect, policy?: ActionExecutionPolicy, options?: {
2208
- createdAt?: string;
2209
- }): ActionPolicyDecision;
689
+ type IntegrationGateSurface = 'integration-manifest' | 'integration-connection' | 'integration-scope' | 'integration-approval' | 'integration-auth' | 'integration-provider' | 'integration-policy';
690
+ interface IntegrationManifestGateInput {
691
+ connectorId: string;
692
+ actionId?: string;
693
+ valid: boolean;
694
+ missingConnections?: string[];
695
+ missingScopes?: string[];
696
+ requiredScopes?: string[];
697
+ approvalRequired?: boolean;
698
+ status?: 'ready' | 'blocked' | 'approval_required';
699
+ reason?: string;
700
+ metadata?: Record<string, unknown>;
701
+ }
702
+ interface IntegrationInvokeFailureInput {
703
+ connectorId: string;
704
+ actionId: string;
705
+ code: 'auth_expired' | 'scope_denied' | 'approval_required' | 'unsafe_write_denied' | 'provider_failure' | 'manifest_invalid';
706
+ message: string;
707
+ status?: number;
708
+ retryable?: boolean;
709
+ metadata?: Record<string, unknown>;
710
+ }
711
+ declare function integrationManifestValidatedPayload(input: IntegrationManifestGateInput): Record<string, unknown>;
712
+ declare function integrationManifestResolvedPayload(input: IntegrationManifestGateInput): Record<string, unknown>;
713
+ declare function integrationInvokeFailedPayload(input: IntegrationInvokeFailureInput): Record<string, unknown>;
714
+ declare function integrationGateEvals(input: IntegrationManifestGateInput): ControlEvalResult[];
715
+ declare function integrationAsi(input: IntegrationManifestGateInput | IntegrationInvokeFailureInput): ActionableSideInfo;
2210
716
 
2211
717
  /**
2212
718
  * Normalize scores so all dimensions follow "higher = better".
@@ -2769,167 +1275,39 @@ interface SteeringDelta {
2769
1275
  rolePrompts?: Record<string, SteeringRolePrompt>;
2770
1276
  metadata?: Record<string, unknown>;
2771
1277
  }
2772
- declare function mergeSteeringBundle(base: SteeringBundle, delta: SteeringDelta): SteeringBundle;
2773
- declare function renderSteeringText(bundle: SteeringBundle): string;
2774
-
2775
- interface RunScore {
2776
- success: number;
2777
- goalProgress: number;
2778
- repoGroundedness: number;
2779
- driftPenalty: number;
2780
- toolUseQuality: number;
2781
- patchQuality: number;
2782
- testReality: number;
2783
- finalGate: number;
2784
- reviewerBlockers: number;
2785
- costUsd: number;
2786
- wallSeconds: number;
2787
- notes?: string[];
2788
- }
2789
- interface RunScoreWeights {
2790
- success: number;
2791
- goalProgress: number;
2792
- repoGroundedness: number;
2793
- driftPenalty: number;
2794
- toolUseQuality: number;
2795
- patchQuality: number;
2796
- testReality: number;
2797
- finalGate: number;
2798
- reviewerBlockers: number;
2799
- costUsd: number;
2800
- wallSeconds: number;
2801
- }
2802
- declare const DEFAULT_RUN_SCORE_WEIGHTS: RunScoreWeights;
2803
- declare function aggregateRunScore(score: RunScore, weights?: Partial<RunScoreWeights>): number;
2804
- declare function clamp01(value: number): number;
2805
-
2806
- /**
2807
- * Typed query helpers over TraceStore.
2808
- *
2809
- * Not a full SQL engine — a minimal, composable set of operators that
2810
- * cover the canned-pipeline use cases. For ad-hoc analytics, persist to
2811
- * NDJSON and point DuckDB at it; the schema is stable so external SQL
2812
- * tooling works out of the box.
2813
- */
2814
-
2815
- declare function runsForScenario(store: TraceStore, scenarioId: string): Promise<Run$1[]>;
2816
- declare function llmSpans(store: TraceStore, runId?: string): Promise<LlmSpan[]>;
2817
- declare function toolSpans(store: TraceStore, runId?: string, toolName?: string): Promise<ToolSpan[]>;
2818
- declare function judgeSpans(store: TraceStore, runId?: string): Promise<JudgeSpan[]>;
2819
- /** Group spans by any key selector. */
2820
- declare function groupBy<T, K extends string | number>(items: T[], key: (t: T) => K): Map<K, T[]>;
2821
- /** Hash tool arguments to an orderless-key-stable string for de-duplication. */
2822
- declare function argHash(args: unknown): string;
2823
- /** Sum an LLM-span array into aggregate token + cost. */
2824
- declare function aggregateLlm(spans: LlmSpan[]): {
2825
- inputTokens: number;
2826
- outputTokens: number;
2827
- cachedTokens: number;
2828
- costUsd: number;
2829
- };
2830
- /** Pick the outcome's failure class when present, else derive 'success' from run status. */
2831
- declare function runFailureClass(run: Run$1): FailureClass;
2832
-
2833
- /**
2834
- * Redaction — remove PII / secrets from trace payloads before persist.
2835
- *
2836
- * Pre-persistence rules mean raw traces in storage are already scrubbed.
2837
- * Unredacted variants (for debugging / post-mortems) live in a separate
2838
- * storage layer with stricter access controls; this module only covers
2839
- * the default scrub-then-persist path.
2840
- *
2841
- * Rules compose: pass an array of `RedactionRule`, each is applied in
2842
- * order. Strings that match get replaced with a tagged sentinel so the
2843
- * eval framework can count how many redactions happened per run
2844
- * (surfaced via `redaction_applied` events).
2845
- */
2846
- interface RedactionRule {
2847
- id: string;
2848
- pattern: RegExp;
2849
- /** Replacement — e.g. '[PII:email]'. Defaults to `[redacted:{id}]`. */
2850
- replacement?: string;
2851
- }
2852
- interface RedactionReport {
2853
- redactionCount: number;
2854
- byRule: Record<string, number>;
2855
- }
2856
- /** OWASP / common-sense defaults — extend per-domain. */
2857
- declare const DEFAULT_REDACTION_RULES: RedactionRule[];
2858
- declare const REDACTION_VERSION = "1.0.0";
2859
- /**
2860
- * Redact a single string. Returns the new string and a per-rule count of
2861
- * how many substitutions fired.
2862
- */
2863
- declare function redactString(input: string, rules?: RedactionRule[]): {
2864
- output: string;
2865
- report: RedactionReport;
2866
- };
2867
- /**
2868
- * Walk a JSON-ish value applying `redactString` to every string leaf.
2869
- * Arrays and plain objects are recursed; other types pass through
2870
- * untouched. Circular references throw — traces should be tree-shaped.
2871
- */
2872
- declare function redactValue(value: unknown, rules?: RedactionRule[], report?: RedactionReport): {
2873
- value: unknown;
2874
- report: RedactionReport;
2875
- };
2876
-
2877
- /**
2878
- * OpenTelemetry JSON export — maps TraceSchema v1 to OTLP/JSON so
2879
- * traces render natively in Jaeger / Honeycomb / Langfuse / Grafana.
2880
- *
2881
- * Wire format only. We do NOT depend on the @opentelemetry SDK — that
2882
- * would drag in polyfills incompatible with Workers/Edge. Consumers
2883
- * push the JSON to their collector of choice via HTTP.
2884
- *
2885
- * Reference: OTLP 1.3.2 (ResourceSpans / ScopeSpans / Span).
2886
- */
2887
-
2888
- declare const OTEL_AGENT_EVAL_SCOPE: {
2889
- name: string;
2890
- version: string;
2891
- };
2892
- interface OtlpSpan {
2893
- traceId: string;
2894
- spanId: string;
2895
- parentSpanId?: string;
2896
- name: string;
2897
- kind: number;
2898
- startTimeUnixNano: string;
2899
- endTimeUnixNano: string;
2900
- attributes: Array<{
2901
- key: string;
2902
- value: {
2903
- stringValue?: string;
2904
- intValue?: string;
2905
- doubleValue?: number;
2906
- boolValue?: boolean;
2907
- };
2908
- }>;
2909
- events?: Array<{
2910
- timeUnixNano: string;
2911
- name: string;
2912
- attributes?: OtlpSpan['attributes'];
2913
- }>;
2914
- status?: {
2915
- code: number;
2916
- message?: string;
2917
- };
2918
- }
2919
- interface OtlpResourceSpans {
2920
- resource: {
2921
- attributes: OtlpSpan['attributes'];
2922
- };
2923
- scopeSpans: Array<{
2924
- scope: typeof OTEL_AGENT_EVAL_SCOPE;
2925
- spans: OtlpSpan[];
2926
- }>;
1278
+ declare function mergeSteeringBundle(base: SteeringBundle, delta: SteeringDelta): SteeringBundle;
1279
+ declare function renderSteeringText(bundle: SteeringBundle): string;
1280
+
1281
+ interface RunScore {
1282
+ success: number;
1283
+ goalProgress: number;
1284
+ repoGroundedness: number;
1285
+ driftPenalty: number;
1286
+ toolUseQuality: number;
1287
+ patchQuality: number;
1288
+ testReality: number;
1289
+ finalGate: number;
1290
+ reviewerBlockers: number;
1291
+ costUsd: number;
1292
+ wallSeconds: number;
1293
+ notes?: string[];
2927
1294
  }
2928
- interface OtlpExport {
2929
- resourceSpans: OtlpResourceSpans[];
1295
+ interface RunScoreWeights {
1296
+ success: number;
1297
+ goalProgress: number;
1298
+ repoGroundedness: number;
1299
+ driftPenalty: number;
1300
+ toolUseQuality: number;
1301
+ patchQuality: number;
1302
+ testReality: number;
1303
+ finalGate: number;
1304
+ reviewerBlockers: number;
1305
+ costUsd: number;
1306
+ wallSeconds: number;
2930
1307
  }
2931
- /** Export a single run's spans + events in OTLP/JSON. */
2932
- declare function exportRunAsOtlp(store: TraceStore, runId: string, resourceAttrs?: Record<string, string | number | boolean>): Promise<OtlpExport>;
1308
+ declare const DEFAULT_RUN_SCORE_WEIGHTS: RunScoreWeights;
1309
+ declare function aggregateRunScore(score: RunScore, weights?: Partial<RunScoreWeights>): number;
1310
+ declare function clamp01(value: number): number;
2933
1311
 
2934
1312
  interface RunTrace {
2935
1313
  run: Run$1;
@@ -3359,206 +1737,6 @@ declare class DualAgentBench {
3359
1737
  run(config: DualAgentBenchConfig): Promise<DualAgentReport>;
3360
1738
  }
3361
1739
 
3362
- /**
3363
- * Propose / Verify / Review — the core multi-shot primitive.
3364
- *
3365
- * shot N: propose(state, priorReview) → new state
3366
- * verify(state) → pass/fail, optional layers
3367
- * review(state, verification, memory) → observations + next-shot
3368
- * instruction + shouldContinue
3369
- * memory.append(entry)
3370
- *
3371
- * Roles are strictly separated:
3372
- *
3373
- * - The WORKER is whatever the caller wraps in `propose`. It is
3374
- * stateful — caller owns its resume/session mechanism.
3375
- * - The VERIFIER grades the state. It produces the ground truth.
3376
- * The reviewer cannot overturn or downgrade a verification layer.
3377
- * - The REVIEWER is stateless per call. Its continuity is the
3378
- * `ReviewMemoryStore` — durable JSONL by default, or any store
3379
- * implementing the interface. It reads memory + trace summary +
3380
- * verification and directs the NEXT proposer shot.
3381
- *
3382
- * This shape is load-bearing. The reviewer never grades; the verifier
3383
- * never directs. Two processes, two prompts, two concerns — which is
3384
- * what keeps the loop from confirmation-biasing itself into "all
3385
- * passed" when it didn't.
3386
- *
3387
- * Short-circuits and soft-fails are both first-class:
3388
- * - verify.pass === true → reviewer LLM call is skipped, memory
3389
- * records a success entry, loop exits.
3390
- * - review throws → the shot still counts; the loop uses the
3391
- * last-known instruction (or `fallbackInstruction`) for the next
3392
- * propose call. A transient reviewer failure must NEVER abort a
3393
- * valid arc.
3394
- *
3395
- * Composable: `propose` itself can be another `runProposeReview` call.
3396
- * That's the dogfooding path — a harness built on this primitive is in
3397
- * turn evaluable by it.
3398
- */
3399
-
3400
- interface Verification {
3401
- pass: boolean;
3402
- score?: number;
3403
- failingLayers?: string[];
3404
- details?: unknown;
3405
- }
3406
- interface Review {
3407
- observations: string;
3408
- diagnosis: string;
3409
- nextShotInstruction: string;
3410
- shouldContinue: boolean;
3411
- confidence: number;
3412
- }
3413
- interface ReviewMemoryEntry extends Review {
3414
- shot: number;
3415
- timestamp: number;
3416
- verification: {
3417
- pass: boolean;
3418
- score?: number;
3419
- failingLayers?: string[];
3420
- };
3421
- }
3422
- interface ProposeInput<State> {
3423
- shot: number;
3424
- goal: string;
3425
- state: State;
3426
- priorReview: Review | null;
3427
- abortSignal: AbortSignal;
3428
- emitter?: TraceEmitter;
3429
- }
3430
- interface ProposeOutput<State, Summary = unknown> {
3431
- state: State;
3432
- traceSummary?: Summary;
3433
- }
3434
- interface ReviewInput<State, Summary = unknown> {
3435
- shot: number;
3436
- goal: string;
3437
- state: State;
3438
- verification: Verification;
3439
- traceSummary: Summary | undefined;
3440
- memory: ReviewMemoryEntry[];
3441
- }
3442
- type ProposeFn<State, Summary = unknown> = (input: ProposeInput<State>) => Promise<ProposeOutput<State, Summary>>;
3443
- type VerifyFn<State> = (state: State) => Promise<Verification>;
3444
- type ReviewFn<State, Summary = unknown> = (input: ReviewInput<State, Summary>) => Promise<Review>;
3445
- interface ReviewMemoryStore {
3446
- load(): Promise<ReviewMemoryEntry[]>;
3447
- append(entry: ReviewMemoryEntry): Promise<void>;
3448
- }
3449
- interface ProposeReviewConfig<State, Summary = unknown> {
3450
- goal: string;
3451
- initialState: State;
3452
- propose: ProposeFn<State, Summary>;
3453
- verify: VerifyFn<State>;
3454
- review: ReviewFn<State, Summary>;
3455
- /** Hard shot cap. Default 10. */
3456
- maxShots?: number;
3457
- /** Wall-clock cap in ms. Default 10 min. */
3458
- maxWallMs?: number;
3459
- /**
3460
- * If the reviewer returns confidence ≤ floor on `confidenceFloorWindow`
3461
- * consecutive shots, terminate early. Default floor 0.3, window 2.
3462
- * Set window to 0 or floor to <0 to disable.
3463
- */
3464
- confidenceFloor?: number;
3465
- confidenceFloorWindow?: number;
3466
- /** Defaults to an in-memory store if omitted. */
3467
- memory?: ReviewMemoryStore;
3468
- /** If provided, emit a Run + per-shot spans. */
3469
- store?: TraceStore;
3470
- scenarioId?: string;
3471
- projectId?: string;
3472
- variantId?: string;
3473
- /**
3474
- * Used when the reviewer soft-fails on shot 1 (no prior instruction to
3475
- * fall back to). Default is a generic "inspect failures and fix".
3476
- */
3477
- fallbackInstruction?: string;
3478
- }
3479
- interface ProposeReviewShot<State, Summary = unknown> {
3480
- shot: number;
3481
- state: State;
3482
- verification: Verification;
3483
- traceSummary: Summary | undefined;
3484
- review: Review;
3485
- reviewAvailable: boolean;
3486
- reviewError?: string;
3487
- durationMs: number;
3488
- }
3489
- interface ProposeReviewReport<State, Summary = unknown> {
3490
- runId: string | null;
3491
- completed: boolean;
3492
- shots: ProposeReviewShot<State, Summary>[];
3493
- finalState: State;
3494
- finalVerification: Verification;
3495
- failureClass?: FailureClass;
3496
- wallMs: number;
3497
- score: number;
3498
- }
3499
- declare function inMemoryReviewStore(initial?: ReviewMemoryEntry[]): ReviewMemoryStore;
3500
- declare function jsonlReviewStore(path: string): ReviewMemoryStore;
3501
- declare function runProposeReview<State, Summary = unknown>(config: ProposeReviewConfig<State, Summary>): Promise<ProposeReviewReport<State, Summary>>;
3502
- interface LlmJsonCall {
3503
- (req: {
3504
- system: string;
3505
- user: string;
3506
- }): Promise<unknown>;
3507
- }
3508
- interface LlmReviewerConfig<State, Summary = unknown> {
3509
- callJson: LlmJsonCall;
3510
- renderState?: (state: State) => string;
3511
- renderTraceSummary?: (summary: Summary | undefined) => string;
3512
- /** Appended to the default system prompt. */
3513
- systemPromptAddendum?: string;
3514
- }
3515
- declare function createLlmReviewer<State, Summary = unknown>(cfg: LlmReviewerConfig<State, Summary>): ReviewFn<State, Summary>;
3516
-
3517
- interface ProposeReviewControlState<State, Summary = unknown> {
3518
- shot: number;
3519
- state: State;
3520
- priorReview: Review | null;
3521
- verification: Verification;
3522
- traceSummary?: Summary;
3523
- memory: ReviewMemoryEntry[];
3524
- completed: boolean;
3525
- reviewAvailable: boolean;
3526
- reviewError?: string;
3527
- }
3528
- interface ProposeReviewControlAction {
3529
- type: 'propose-review-shot';
3530
- shot: number;
3531
- }
3532
- interface ProposeReviewControlResult<State, Summary = unknown> {
3533
- state: State;
3534
- verification: Verification;
3535
- traceSummary?: Summary;
3536
- review: Review | null;
3537
- reviewAvailable: boolean;
3538
- reviewError?: string;
3539
- }
3540
- interface ProposeReviewControlConfig<State, Summary = unknown> {
3541
- goal: string;
3542
- initialState: State;
3543
- propose: ProposeFn<State, Summary>;
3544
- verify: VerifyFn<State>;
3545
- review: ReviewFn<State, Summary>;
3546
- maxShots?: number;
3547
- maxWallMs?: number;
3548
- memory?: ReviewMemoryStore;
3549
- store?: TraceStore;
3550
- scenarioId?: string;
3551
- projectId?: string;
3552
- variantId?: string;
3553
- fallbackInstruction?: string;
3554
- confidenceFloor?: number;
3555
- confidenceFloorWindow?: number;
3556
- failureClassFromVerification?: (verification: Verification) => FailureClass | undefined;
3557
- actionFailure?: ControlRuntimeConfig<ProposeReviewControlState<State, Summary>, ProposeReviewControlAction, ProposeReviewControlResult<State, Summary>>['actionFailure'];
3558
- }
3559
- declare function runProposeReviewAsControlLoop<State, Summary = unknown>(config: ProposeReviewControlConfig<State, Summary>): Promise<ControlRunResult<ProposeReviewControlState<State, Summary>, ProposeReviewControlAction, ProposeReviewControlResult<State, Summary>>>;
3560
- declare function controlFailureClassFromVerification(verification: Verification): FailureClass | undefined;
3561
-
3562
1740
  /**
3563
1741
  * TestGradedScenario — a scenario whose score comes from a test suite.
3564
1742
  *
@@ -5781,6 +3959,39 @@ interface HypothesisResult {
5781
3959
  rejectionReasons: Array<'wrong_direction' | 'effect_too_small' | 'not_significant' | 'undersampled'>;
5782
3960
  notes?: string;
5783
3961
  }
3962
+ /**
3963
+ * Deterministic JSON canonicalization — sort object keys recursively.
3964
+ *
3965
+ * Two semantically-equal objects produce byte-identical canonicalized output;
3966
+ * this is what makes a content-hash stable across encoders, key insertion
3967
+ * orders, and runtime versions. Exported for any consumer that needs the same
3968
+ * canonicalization guarantee outside the manifest-signing path (e.g., signing
3969
+ * an artifact bundle, hashing a dataset version, etc.).
3970
+ */
3971
+ declare function canonicalize(v: unknown): unknown;
3972
+ /**
3973
+ * SHA-256 hex (full 64 chars) over the canonicalized JSON encoding of `obj`.
3974
+ *
3975
+ * The same primitive `signManifest` and `verifyManifest` are built on, exposed
3976
+ * directly so consumers signing arbitrary structured content (artifact bundles,
3977
+ * production packets, dataset manifests, etc.) don't have to re-derive
3978
+ * canonicalize+sha256 from scratch.
3979
+ *
3980
+ * Stable across:
3981
+ * - object key insertion order (canonicalization sorts keys recursively)
3982
+ * - encoder choice (UTF-8 via TextEncoder, fixed)
3983
+ * - runtime (uses the Web Crypto subtle digest, present in Node ≥18 and browsers)
3984
+ *
3985
+ * Naming note: `hashJson` rather than `hashContent` because `hashContent` is
3986
+ * already taken in `prompt-registry.ts` for the truncated 12-char prompt-id
3987
+ * helper, which has different semantics (string input, short return). Both
3988
+ * coexist; `hashJson` is the right name when you mean "canonicalize then hash."
3989
+ *
3990
+ * @example
3991
+ * const hash = await hashJson({ id: '1', kind: 'spec' })
3992
+ * // 'a3f1...' (64 hex chars)
3993
+ */
3994
+ declare function hashJson<T>(obj: T): Promise<string>;
5784
3995
  /**
5785
3996
  * Sign a manifest with a SHA-256 content hash.
5786
3997
  *
@@ -7451,354 +5662,6 @@ declare function compareReferenceReplay(baseline: ReferenceReplayScore, candidat
7451
5662
  declare function decideReferenceReplayPromotion(baseline: ReferenceReplayScore, candidate: ReferenceReplayScore, policy?: ReferenceReplayPromotionPolicy): ReferenceReplayPromotionDecision;
7452
5663
  declare function defaultReferenceReplayMatcher(reference: ReferenceReplayItem, candidate: ReferenceReplayCandidate): ReferenceMatchResult;
7453
5664
 
7454
- /**
7455
- * Paper-grade paired statistics for held-out promotion gates.
7456
- *
7457
- * The promotion gate (`HeldOutGate`) needs three things:
7458
- *
7459
- * 1. A bootstrap confidence interval on the per-item paired delta
7460
- * (`pairedBootstrap`). Median delta is the headline number; the
7461
- * CI lower bound is what the gate checks against `pairedDeltaThreshold`.
7462
- * 2. A non-parametric significance test on the paired deltas
7463
- * (`pairedWilcoxon` — re-export of `wilcoxonSignedRank` under the
7464
- * paper-style name).
7465
- * 3. False-discovery-rate correction across simultaneously-tested
7466
- * candidate variants (`bhAdjust` — re-export of `benjaminiHochberg`).
7467
- *
7468
- * Why a separate file: every existing primitive lives in `statistics.ts`
7469
- * (general) or `power-analysis.ts` (correction). Paired-bootstrap is
7470
- * paired-only, paper-grade, and load-bearing for the promotion gate.
7471
- * Putting it next to `statistics.ts` would require editing that file;
7472
- * the brief forbids that. New file, new exports, no surface change.
7473
- */
7474
- interface PairedBootstrapResult {
7475
- /** Number of paired observations (inputs of unequal length are rejected, not dropped). */
7476
- n: number;
7477
- /** Median of paired deltas (after − before). */
7478
- median: number;
7479
- /** Mean of paired deltas. */
7480
- mean: number;
7481
- /** Lower bound of the bootstrap CI on the median delta. */
7482
- low: number;
7483
- /** Upper bound of the bootstrap CI on the median delta. */
7484
- high: number;
7485
- /** Confidence level used (e.g. 0.95). */
7486
- confidence: number;
7487
- /** Number of bootstrap resamples used. */
7488
- resamples: number;
7489
- }
7490
- interface PairedBootstrapOptions {
7491
- /** Confidence level. Default 0.95. */
7492
- confidence?: number;
7493
- /** Bootstrap resample count. Default 2000. */
7494
- resamples?: number;
7495
- /** Statistic to bootstrap. Default 'median'. */
7496
- statistic?: 'median' | 'mean';
7497
- /** Deterministic seed. If omitted, uses Math.random(). */
7498
- seed?: number;
7499
- }
7500
- /**
7501
- * Paired bootstrap on (after - before) deltas. Returns a CI on the
7502
- * chosen statistic (median by default). Pairs are resampled with
7503
- * replacement. The lower bound is what the promotion gate checks: if
7504
- * `low > pairedDeltaThreshold`, the gain is real at the chosen
7505
- * confidence level.
7506
- *
7507
- * Throws on unequal sample sizes — caller must align pairs upstream.
7508
- */
7509
- declare function pairedBootstrap(before: number[], after: number[], opts?: PairedBootstrapOptions): PairedBootstrapResult;
7510
- /**
7511
- * Paper-style alias for `wilcoxonSignedRank`. The signed-rank test on
7512
- * paired deltas is the standard non-parametric significance test for
7513
- * "candidate beats baseline on matched items." Use alongside the
7514
- * bootstrap CI: bootstrap gives effect size, Wilcoxon gives p.
7515
- */
7516
- declare function pairedWilcoxon(before: number[], after: number[]): {
7517
- w: number;
7518
- p: number;
7519
- };
7520
- /**
7521
- * Paper-style alias for `benjaminiHochberg`. Use to correct p-values
7522
- * across multiple candidate-vs-baseline comparisons run in the same
7523
- * promotion sweep. Returns BH-adjusted q-values and significance at
7524
- * the requested FDR (default 0.05).
7525
- */
7526
- declare function bhAdjust(pValues: number[], fdr?: number): {
7527
- qValues: number[];
7528
- significant: boolean[];
7529
- };
7530
-
7531
- /**
7532
- * Researcher interface — stable hook for an external autonomous-research
7533
- * agent to drive the meta-loop.
7534
- *
7535
- * Implementations live downstream (typically in a private repo that
7536
- * runs the actual LLM). This package ships only the contract + a
7537
- * `NoopResearcher` so consumers can wire the surface without being
7538
- * forced to implement every method up front.
7539
- *
7540
- * The four methods mirror the four stages of the paper "Two Loops,
7541
- * Three Roles":
7542
- *
7543
- * inspectFailures — given the observed runs, what failure modes
7544
- * are present? (data → diagnosis)
7545
- * proposeChange — given diagnosed failure modes, what
7546
- * structural changes should we try?
7547
- * (diagnosis → plan delta)
7548
- * applyChange — fold the proposed deltas into a concrete
7549
- * experiment plan against an existing baseline.
7550
- * (plan delta → executable plan)
7551
- * evaluateChange — run the plan, return runs + the gate verdict.
7552
- * (executable plan → verdict)
7553
- *
7554
- * Composition is the discipline: a Researcher implementation MUST
7555
- * keep these four steps separate and inspectable. Conflating
7556
- * "diagnose + propose + run" into a single LLM call defeats the
7557
- * point of the framework — you can't audit which step lied.
7558
- *
7559
- * THIS INTERFACE IS STABLE. Breaking changes require a new module
7560
- * (e.g. `Researcher2`) so existing implementations keep working.
7561
- */
7562
-
7563
- /** A diagnosed failure mode with the run-IDs that exhibit it. */
7564
- interface FailureMode {
7565
- /** Short machine-readable code. Must be stable across runs of the
7566
- * same researcher to enable longitudinal tracking. */
7567
- code: string;
7568
- /** Human-readable description for the paper / dashboard. */
7569
- description: string;
7570
- evidence: {
7571
- /** Run IDs (from `RunRecord.runId`) where this failure mode was
7572
- * observed. */
7573
- runIds: string[];
7574
- /** Number of run samples that informed the diagnosis. */
7575
- samples: number;
7576
- };
7577
- }
7578
- /** A single steering change the researcher wants to try. */
7579
- interface SteeringChange {
7580
- kind: 'reviewer_prompt' | 'skill_add' | 'skill_remove' | 'threshold' | 'budget';
7581
- /** Implementation-specific payload. Researcher implementations
7582
- * define the schema — keep this `unknown` here to avoid coupling
7583
- * the public interface to any one researcher's internal model. */
7584
- payload: unknown;
7585
- /** Why the researcher proposed this change. Goes into the audit
7586
- * trail next to the failure-mode evidence. */
7587
- rationale: string;
7588
- /** Optional self-reported expected delta on the headline metric. */
7589
- expectedDelta?: number;
7590
- }
7591
- /** A single experiment plan, mapped onto the search/holdout splits. */
7592
- interface ExperimentPlan {
7593
- baselineCandidateId: string;
7594
- proposedCandidateId: string;
7595
- changes: SteeringChange[];
7596
- /** USD ceiling for the entire experiment. The runner must stop
7597
- * before exceeding this and report a partial result. */
7598
- evaluationBudgetUsd: number;
7599
- /** Item IDs (your dataset keys) for the search vs holdout splits. */
7600
- splits: {
7601
- search: string[];
7602
- holdout: string[];
7603
- };
7604
- }
7605
- /** Result of running a plan: every run, plus the gate verdict. */
7606
- interface ExperimentResult {
7607
- plan: ExperimentPlan;
7608
- runs: RunRecord[];
7609
- gateDecision: GateDecision;
7610
- }
7611
- /**
7612
- * The researcher loop. Stable, four-step, inspectable.
7613
- *
7614
- * ┌──────────┐ inspectFailures ┌──────────┐ proposeChange ┌──────────┐
7615
- * │ runs │ ─────────────────▶│ failures │ ──────────────▶│ changes │
7616
- * └──────────┘ └──────────┘ └────┬─────┘
7617
- * │
7618
- * ▼
7619
- * ┌────────────────┐ applyChange ┌────────┐
7620
- * │ ExperimentPlan │ ◀────────────│ base │
7621
- * └────────┬───────┘ └────────┘
7622
- * │
7623
- * evaluateChange ▼
7624
- * ┌────────────────┐
7625
- * │ ExperimentResult│
7626
- * └────────────────┘
7627
- */
7628
- interface Researcher {
7629
- inspectFailures(runs: RunRecord[]): Promise<FailureMode[]>;
7630
- proposeChange(failures: FailureMode[]): Promise<SteeringChange[]>;
7631
- applyChange(changes: SteeringChange[], baseline: ExperimentPlan): Promise<ExperimentPlan>;
7632
- evaluateChange(plan: ExperimentPlan): Promise<ExperimentResult>;
7633
- }
7634
- interface CallbackResearcherOptions {
7635
- inspectFailures: Researcher['inspectFailures'];
7636
- proposeChange: Researcher['proposeChange'];
7637
- applyChange: Researcher['applyChange'];
7638
- evaluateChange: Researcher['evaluateChange'];
7639
- }
7640
- /**
7641
- * Minimal concrete researcher for tests, scripts, and small integrations.
7642
- * Larger autonomous researchers can still implement `Researcher` directly.
7643
- */
7644
- declare class CallbackResearcher implements Researcher {
7645
- private readonly callbacks;
7646
- constructor(callbacks: CallbackResearcherOptions);
7647
- inspectFailures(runs: RunRecord[]): Promise<FailureMode[]>;
7648
- proposeChange(failures: FailureMode[]): Promise<SteeringChange[]>;
7649
- applyChange(changes: SteeringChange[], baseline: ExperimentPlan): Promise<ExperimentPlan>;
7650
- evaluateChange(plan: ExperimentPlan): Promise<ExperimentResult>;
7651
- }
7652
- /**
7653
- * No-op researcher — fails loud on every method. Use as a placeholder
7654
- * in code paths that wire the interface but don't have an implementation
7655
- * yet. Importantly, this does NOT silently succeed: a no-op researcher
7656
- * that returned empty arrays would muffle the loop's signal that
7657
- * nobody implemented the brain.
7658
- */
7659
- declare class NoopResearcher implements Researcher {
7660
- private readonly hint;
7661
- constructor(hint?: string);
7662
- inspectFailures(_runs: RunRecord[]): Promise<FailureMode[]>;
7663
- proposeChange(_failures: FailureMode[]): Promise<SteeringChange[]>;
7664
- applyChange(_changes: SteeringChange[], _baseline: ExperimentPlan): Promise<ExperimentPlan>;
7665
- evaluateChange(_plan: ExperimentPlan): Promise<ExperimentResult>;
7666
- }
7667
-
7668
- /**
7669
- * Reporting helpers — production summaries and paper-quality figures — sit alongside `reporter.ts` rather
7670
- * than replacing it.
7671
- *
7672
- * Three artefacts:
7673
- *
7674
- * - `summaryTable` Markdown table of per-candidate means,
7675
- * 95% bootstrap CIs, BH-adjusted Wilcoxon
7676
- * p-values, and Cohen's d versus a
7677
- * comparator candidate.
7678
- * - `paretoChart` Abstract spec for a cost vs quality
7679
- * scatter, with gate decisions overlaid.
7680
- * Returns numbers + labels — caller
7681
- * chooses the plotting library.
7682
- * - `gainHistogram`
7683
- * Per-item paired holdout deltas as a
7684
- * histogram spec (bins + counts + median +
7685
- * CI). Same "data, not images" contract.
7686
- *
7687
- * The figure types are PlotSpecs — JSON-friendly, library-agnostic.
7688
- * They aren't React components and they aren't PNGs; they are
7689
- * what you'd hand to vega-lite, plotly, matplotlib, or your own
7690
- * Canvas renderer to draw the actual figure.
7691
- */
7692
-
7693
- interface SummaryTableOptions {
7694
- /** Comparator candidate id. Wilcoxon + Cohen's d are computed
7695
- * versus this candidate. Required for paired stats columns. */
7696
- comparator?: string;
7697
- /** Which split to read scores from. Default 'holdout'. */
7698
- split?: 'search' | 'holdout';
7699
- /** Confidence level for the bootstrap CI on the mean. Default 0.95. */
7700
- confidence?: number;
7701
- /** FDR for BH adjustment of the comparison p-values. Default 0.05. */
7702
- fdr?: number;
7703
- }
7704
- interface SummaryTableRow {
7705
- candidateId: string;
7706
- n: number;
7707
- mean: number;
7708
- ciLow: number;
7709
- ciHigh: number;
7710
- /** BH-adjusted q-value vs comparator. NaN if no comparator. */
7711
- qValue: number;
7712
- /** Cohen's d vs comparator. NaN if no comparator. */
7713
- cohensD: number;
7714
- }
7715
- interface SummaryTable {
7716
- rows: SummaryTableRow[];
7717
- comparator: string | null;
7718
- split: 'search' | 'holdout';
7719
- /** Pre-rendered markdown — drop into a paper or PR. */
7720
- markdown: string;
7721
- }
7722
- /**
7723
- * Table 1 helper. Buckets runs by `candidateId`, computes mean +
7724
- * bootstrap CI on the chosen split, and (when a comparator is given)
7725
- * BH-adjusted Wilcoxon p + Cohen's d versus that comparator.
7726
- */
7727
- declare function summaryTable(runs: RunRecord[], opts?: SummaryTableOptions): SummaryTable;
7728
- interface ParetoPoint {
7729
- candidateId: string;
7730
- /** Mean USD cost per run on the chosen split. */
7731
- cost: number;
7732
- /** Mean score on the chosen split. */
7733
- quality: number;
7734
- /** Number of runs that informed this point. */
7735
- n: number;
7736
- /** Whether this candidate is on the Pareto frontier — high
7737
- * quality, low cost, no dominator. */
7738
- onFrontier: boolean;
7739
- /** Optional gate verdict for this candidate, if a `GateDecision`
7740
- * for it was passed in. */
7741
- gate?: 'promote' | 'reject_few_runs' | 'reject_negative_delta' | 'reject_overfit_gap' | null;
7742
- }
7743
- interface ParetoFigureSpec {
7744
- kind: 'pareto-cost-quality';
7745
- split: 'search' | 'holdout';
7746
- points: ParetoPoint[];
7747
- axes: {
7748
- x: 'costUsd';
7749
- y: 'score';
7750
- };
7751
- }
7752
- /**
7753
- * Cost vs quality scatter spec. `gateDecisions` is keyed by
7754
- * candidate id; if present, every point picks up the gate verdict
7755
- * for overlay.
7756
- */
7757
- declare function paretoChart(runs: RunRecord[], opts?: {
7758
- split?: 'search' | 'holdout';
7759
- gateDecisions?: Record<string, GateDecision>;
7760
- }): ParetoFigureSpec;
7761
- interface GainDistributionBin {
7762
- /** Inclusive lower edge. */
7763
- lo: number;
7764
- /** Exclusive upper edge (or inclusive if it's the last bin). */
7765
- hi: number;
7766
- /** Number of pairs whose delta lands in this bin. */
7767
- count: number;
7768
- }
7769
- interface GainDistributionFigureSpec {
7770
- kind: 'gain-distribution';
7771
- candidateId: string;
7772
- comparator: string;
7773
- split: 'search' | 'holdout';
7774
- /** Number of pairs used. */
7775
- n: number;
7776
- bins: GainDistributionBin[];
7777
- median: number;
7778
- ci: {
7779
- low: number;
7780
- high: number;
7781
- };
7782
- }
7783
- interface GainDistributionOptions {
7784
- /** Number of histogram bins. Default 11 (so the centre is exact at 0). */
7785
- bins?: number;
7786
- /** Which split to use. Default 'holdout'. */
7787
- split?: 'search' | 'holdout';
7788
- /** Confidence level for the CI. Default 0.95. */
7789
- confidence?: number;
7790
- /** Bootstrap resamples. Default 2000. */
7791
- resamples?: number;
7792
- /** Deterministic seed. */
7793
- seed?: number;
7794
- }
7795
- /**
7796
- * Held-out improvement distribution: per-pair delta (candidate −
7797
- * comparator), histogrammed. Includes the bootstrap CI on the median
7798
- * delta — same primitive the promotion gate uses.
7799
- */
7800
- declare function gainHistogram(runs: RunRecord[], candidateId: string, comparator: string, opts?: GainDistributionOptions): GainDistributionFigureSpec;
7801
-
7802
5665
  /**
7803
5666
  * Liveness canaries — cheap statistical checks that catch the failure
7804
5667
  * modes a green test suite never sees.
@@ -8461,622 +6324,4 @@ interface OrthogonalityResult {
8461
6324
  }
8462
6325
  declare function passOrthogonality<T>(input: OrthogonalityInput<T>): OrthogonalityResult;
8463
6326
 
8464
- /**
8465
- * Bootstrap-CI promotion gate.
8466
- *
8467
- * In any iterative-improvement loop (GEPA, prompt evolution, dataset
8468
- * curation), the question is "did this generation actually improve, or are
8469
- * we celebrating noise?". With small N and noisy outcomes, point-estimate
8470
- * deltas lie. Bootstrap confidence intervals tell the operator whether the
8471
- * delta is real before code or prompts get promoted.
8472
- *
8473
- * This module is pure functions — no I/O, no model calls. Easy to unit-test
8474
- * and to compose into any verdict gate.
8475
- *
8476
- * Default gate:
8477
- * - Bootstrap mean baseline vs candidate (1k resamples).
8478
- * - Compute the delta distribution; pass if the lower CI bound > 0.
8479
- * - Tunable confidence (default 95%) and resample count.
8480
- *
8481
- * Verdict semantics intentionally match the existing `experiments.jsonl`
8482
- * vocabulary:
8483
- * - ADVANCE: candidate's CI lower bound > baseline mean (real win)
8484
- * - KEEP: overlap, but candidate point estimate >= baseline (neutral)
8485
- * - REVERT: candidate's CI upper bound < baseline mean (real regression)
8486
- * - INCONCLUSIVE: not enough samples or CI straddles zero with no signal
8487
- */
8488
- type Verdict = 'ADVANCE' | 'KEEP' | 'REVERT' | 'INCONCLUSIVE';
8489
- interface BootstrapResult {
8490
- baselineMean: number;
8491
- candidateMean: number;
8492
- /** candidateMean - baselineMean, point estimate. */
8493
- delta: number;
8494
- /** Lower bound of the (1 - alpha) CI on the delta. */
8495
- ciLower: number;
8496
- /** Upper bound of the (1 - alpha) CI on the delta. */
8497
- ciUpper: number;
8498
- /** Number of bootstrap resamples used. */
8499
- iterations: number;
8500
- alpha: number;
8501
- verdict: Verdict;
8502
- }
8503
- interface BootstrapOptions {
8504
- /** Significance level alpha (default 0.05 → 95% CI). */
8505
- alpha?: number;
8506
- /** Number of resamples (default 1000). */
8507
- iterations?: number;
8508
- /**
8509
- * Minimum total samples (baseline + candidate) below which we always
8510
- * return INCONCLUSIVE — bootstrap with too few samples is meaningless.
8511
- * Default 6 (combined).
8512
- */
8513
- minTotalSamples?: number;
8514
- /** RNG seed for reproducibility. Default: Math.random. */
8515
- seed?: number;
8516
- }
8517
- /**
8518
- * Compute the bootstrap CI on (candidateMean - baselineMean) and a verdict.
8519
- *
8520
- * Uses simple percentile bootstrap on the difference of resampled means.
8521
- * That's the standard non-parametric primitive — no distributional
8522
- * assumptions, robust to skew, easy to reason about.
8523
- */
8524
- declare function bootstrapCi(baseline: number[], candidate: number[], options?: BootstrapOptions): BootstrapResult;
8525
- /**
8526
- * Judge-replay promotion gate.
8527
- *
8528
- * The cheap inner-loop judge that drives an evolution run is by definition
8529
- * fast and noisy. When you're about to promote a winning variant to the
8530
- * canonical default, you want a STRONGER judge (a more expensive model, a
8531
- * human grader, a separately-trained reward model) to confirm the win
8532
- * generalises beyond the inner loop.
8533
- *
8534
- * This helper takes raw winner + baseline outputs, scores both through the
8535
- * stronger judge, and applies `bootstrapCi`. ADVANCE means the stronger
8536
- * judge agrees the winner is real with the configured confidence. Doesn't
8537
- * matter what shape your "output" is — pass a string, an object, anything
8538
- * the judge can read.
8539
- */
8540
- interface JudgeReplayGateArgs<TOutput> {
8541
- baselineOutputs: TOutput[];
8542
- candidateOutputs: TOutput[];
8543
- /** Stronger judge — async to allow LLM calls. Return a 0..N scalar score. */
8544
- judge: (output: TOutput) => Promise<number> | number;
8545
- alpha?: number;
8546
- iterations?: number;
8547
- /** RNG seed for reproducibility. */
8548
- seed?: number;
8549
- /** Maximum concurrent judge calls. Default 4. */
8550
- judgeConcurrency?: number;
8551
- }
8552
- declare function judgeReplayGate<TOutput>(args: JudgeReplayGateArgs<TOutput>): Promise<BootstrapResult & {
8553
- baselineSamples: number;
8554
- candidateSamples: number;
8555
- }>;
8556
-
8557
- /**
8558
- * Shared types for the trace-analyst module.
8559
- *
8560
- * Wire format. The store interface speaks `OtlpSpanLike` rows — one JSONL
8561
- * line per span, OTLP-shaped. We do NOT depend on a specific tracing
8562
- * vendor at the type level. Adapter
8563
- * layers map upstream shapes onto this interface.
8564
- *
8565
- * Design constraint. Every read operation that can return arbitrary
8566
- * payload must carry a byte budget so the agent's tool result stays
8567
- * bounded regardless of input trace size. Oversized responses
8568
- * substitute a deterministic summary instead of bytes — see
8569
- * `ViewTraceOversized`.
8570
- */
8571
- /** OTLP span kind (subset we actually use). */
8572
- type TraceAnalystSpanKind = 'AGENT' | 'LLM' | 'TOOL' | 'CHAIN' | 'GUARDRAIL' | 'SPAN' | 'UNKNOWN';
8573
- type TraceAnalystSpanStatus = 'OK' | 'ERROR' | 'UNSET';
8574
- /** Subset of OTLP span fields the analyst exposes to the agent. The
8575
- * store's job is to project upstream's full span shape down to this
8576
- * view — the analyst never sees vendor extensions directly. */
8577
- interface TraceAnalystSpan {
8578
- trace_id: string;
8579
- span_id: string;
8580
- parent_span_id: string | null;
8581
- name: string;
8582
- kind: TraceAnalystSpanKind;
8583
- start_time: string;
8584
- end_time: string;
8585
- duration_ms: number;
8586
- status: TraceAnalystSpanStatus;
8587
- status_message?: string;
8588
- service_name: string | null;
8589
- agent_name: string | null;
8590
- model_name: string | null;
8591
- tool_name: string | null;
8592
- /** Raw JSON-serialisable attribute map. May contain large strings;
8593
- * callers must respect the per-attribute byte cap. */
8594
- attributes: Record<string, unknown>;
8595
- }
8596
- interface TraceAnalystTraceSummary {
8597
- trace_id: string;
8598
- service_name: string | null;
8599
- agent_name: string | null;
8600
- span_count: number;
8601
- has_errors: boolean;
8602
- start_time: string;
8603
- end_time: string;
8604
- duration_ms: number;
8605
- raw_jsonl_bytes: number;
8606
- models: string[];
8607
- tools: string[];
8608
- }
8609
- interface TraceAnalystFilters {
8610
- /** Restrict to traces that contain at least one error span. */
8611
- has_errors?: boolean;
8612
- /** Match if any span's `service.name` is in this list. */
8613
- service_names?: string[];
8614
- /** Match if any span's `agent.name` is in this list. */
8615
- agent_names?: string[];
8616
- /** Match if any LLM span's `llm.model_name` is in this list. */
8617
- model_names?: string[];
8618
- /** Match if any tool span's `tool.name` is in this list. */
8619
- tool_names?: string[];
8620
- /** ISO-8601 lower bound on the trace's earliest start time. */
8621
- start_time_after?: string;
8622
- /** ISO-8601 upper bound on the trace's earliest start time. */
8623
- start_time_before?: string;
8624
- /** Single regex applied to raw JSONL bytes for the trace. Opt-in;
8625
- * expensive on large datasets. Use the indexed filters above first. */
8626
- regex_pattern?: string;
8627
- }
8628
- interface DatasetOverview {
8629
- total_traces: number;
8630
- raw_jsonl_bytes: number;
8631
- services: string[];
8632
- agents: string[];
8633
- models: string[];
8634
- tool_names: string[];
8635
- /** Up to 20 real trace ids the agent may pass to view/search tools. */
8636
- sample_trace_ids: string[];
8637
- errors: {
8638
- trace_count: number;
8639
- span_count: number;
8640
- };
8641
- time_range: {
8642
- earliest: string;
8643
- latest: string;
8644
- } | null;
8645
- }
8646
- interface QueryTracesPage {
8647
- traces: TraceAnalystTraceSummary[];
8648
- total: number;
8649
- has_more: boolean;
8650
- }
8651
- /** Full-trace view. When the response would exceed the per-call byte
8652
- * budget, `oversized` is populated INSTEAD of `spans` so the agent
8653
- * knows to switch to `searchTrace` / `viewSpans`. */
8654
- interface ViewTraceResult {
8655
- trace_id: string;
8656
- spans?: TraceAnalystSpan[];
8657
- oversized?: ViewTraceOversized;
8658
- }
8659
- interface ViewTraceOversized {
8660
- span_count: number;
8661
- /** Names with their counts, sorted desc. Capped at 20 entries. */
8662
- top_span_names: Array<[string, number]>;
8663
- /** Largest single span body (bytes after attribute-cap projection). */
8664
- span_response_bytes_max: number;
8665
- error_span_count: number;
8666
- }
8667
- interface ViewSpansResult {
8668
- trace_id: string;
8669
- spans: TraceAnalystSpan[];
8670
- /** Requested span ids that were not found in the trace. */
8671
- missing_span_ids: string[];
8672
- /** Number of attribute fields truncated to fit the per-attribute cap. */
8673
- truncated_attribute_count: number;
8674
- }
8675
- interface SpanMatchRecord {
8676
- trace_id: string;
8677
- span_id: string;
8678
- span_name: string;
8679
- span_kind: TraceAnalystSpanKind;
8680
- /** JSON pointer-style path to the matched value, e.g.
8681
- * `attributes."llm.input_messages"[2].content`. */
8682
- attribute_path: string;
8683
- matched_text: string;
8684
- context_before: string;
8685
- context_after: string;
8686
- match_offset: number;
8687
- }
8688
- interface SearchTraceResult {
8689
- trace_id: string;
8690
- hits: SpanMatchRecord[];
8691
- total_matches: number;
8692
- has_more: boolean;
8693
- }
8694
- interface SearchSpanResult {
8695
- trace_id: string;
8696
- span_id: string;
8697
- hits: SpanMatchRecord[];
8698
- total_matches: number;
8699
- has_more: boolean;
8700
- }
8701
- /** Tunable byte budgets for bounded RLM tool output. */
8702
- interface TraceAnalystByteBudgets {
8703
- /** Max bytes any single tool response may emit. Hard ceiling enforced
8704
- * by the store; oversized → summary. Default 150_000. */
8705
- perCallByteCeiling: number;
8706
- /** Per-attribute string truncation cap on `viewTrace` (discovery scan).
8707
- * Default 4096. */
8708
- perAttributeViewBudget: number;
8709
- /** Per-attribute string truncation cap on `viewSpans` (surgical reads).
8710
- * Default 16384. */
8711
- perAttributeSpanBudget: number;
8712
- /** Per-attribute cap on a single match record's `matched_text` and
8713
- * context window. Default 1024. */
8714
- perMatchTextBudget: number;
8715
- }
8716
- declare const DEFAULT_TRACE_ANALYST_BUDGETS: TraceAnalystByteBudgets;
8717
- /** Marker substituted in place of truncated string payloads. Callers
8718
- * parsing tool output can detect it deterministically. */
8719
- declare const TRACE_ANALYST_TRUNCATION_MARKER_PREFIX = "[trace-analyst truncated:";
8720
-
8721
- /**
8722
- * `TraceAnalysisStore` — read-side interface the trace-analyst calls
8723
- * through. Seven operations, all bounded:
8724
- *
8725
- * - `getOverview(filters?)` — dataset rollup + sample trace ids.
8726
- * - `queryTraces(filters?, limit, offset)` — paginated summaries.
8727
- * - `countTraces(filters?)` — cheap count without materialisation.
8728
- * - `viewTrace(trace_id, perAttrCap)` — full span list, oversized → summary.
8729
- * - `viewSpans(trace_id, span_ids, perAttrCap)` — surgical span fetch.
8730
- * - `searchTrace(trace_id, regex, max_matches)` — bounded regex hits.
8731
- * - `searchSpan(trace_id, span_id, regex, max_matches)` — single-span search.
8732
- *
8733
- * Multiple implementations ship in the core (`OtlpFileTraceStore`).
8734
- * Downstream callers can supply their own — e.g. a DuckDB-backed
8735
- * adapter or an in-memory adapter for tests — by implementing this
8736
- * interface.
8737
- *
8738
- * Filters compose with AND semantics. Empty/undefined fields impose
8739
- * no constraint. `regex_pattern` is the only opt-in raw-bytes scan —
8740
- * implementations may skip it via `count`/`overview` when not set.
8741
- */
8742
-
8743
- interface TraceAnalysisStore {
8744
- getOverview(filters?: TraceAnalystFilters): Promise<DatasetOverview>;
8745
- queryTraces(opts: {
8746
- filters?: TraceAnalystFilters;
8747
- limit: number;
8748
- offset?: number;
8749
- }): Promise<QueryTracesPage>;
8750
- countTraces(filters?: TraceAnalystFilters): Promise<number>;
8751
- viewTrace(opts: {
8752
- trace_id: string;
8753
- /** Override per-attribute byte cap. Defaults to discovery budget. */
8754
- per_attribute_byte_cap?: number;
8755
- }): Promise<ViewTraceResult>;
8756
- viewSpans(opts: {
8757
- trace_id: string;
8758
- span_ids: readonly string[];
8759
- /** Override per-attribute byte cap. Defaults to surgical budget. */
8760
- per_attribute_byte_cap?: number;
8761
- }): Promise<ViewSpansResult>;
8762
- searchTrace(opts: {
8763
- trace_id: string;
8764
- regex_pattern: string;
8765
- /** Hard cap on matches returned. Default 50. */
8766
- max_matches?: number;
8767
- }): Promise<SearchTraceResult>;
8768
- searchSpan(opts: {
8769
- trace_id: string;
8770
- span_id: string;
8771
- regex_pattern: string;
8772
- max_matches?: number;
8773
- }): Promise<SearchSpanResult>;
8774
- }
8775
-
8776
- interface AnalyzeTracesInput {
8777
- /** The user-facing question. Domain framing belongs here, not in the
8778
- * actor description. */
8779
- question: string;
8780
- }
8781
- interface AnalyzeTracesResult {
8782
- /** The responder's prose answer. */
8783
- answer: string;
8784
- /** Bulleted findings extracted from the responder's structured output. */
8785
- findings: string[];
8786
- /** Per-actor-turn snapshots captured via `actorTurnCallback`. */
8787
- turns: AnalyzeTracesTurnSnapshot[];
8788
- /** Total turns the actor took. */
8789
- turnCount: number;
8790
- /** Token usage by role. */
8791
- usage: TraceAnalystUsage;
8792
- /** Full system + assistant + tool message log by role. */
8793
- chatLog: TraceAnalystChatLog;
8794
- /** Prompt version that produced this run. */
8795
- actorPromptVersion: string;
8796
- }
8797
- interface TraceAnalystUsage {
8798
- actor: TraceAnalystUsageEntry[];
8799
- responder: TraceAnalystUsageEntry[];
8800
- }
8801
- interface TraceAnalystUsageEntry {
8802
- [key: string]: unknown;
8803
- }
8804
- interface TraceAnalystChatLog {
8805
- actor: TraceAnalystChatMessage[];
8806
- responder: TraceAnalystChatMessage[];
8807
- }
8808
- interface TraceAnalystChatMessage {
8809
- [key: string]: unknown;
8810
- }
8811
- interface AnalyzeTracesTurnSnapshot {
8812
- turn: number;
8813
- isError: boolean;
8814
- /** The JS code the actor produced for this turn. */
8815
- code: string;
8816
- /** The formatted action-log entry the actor sees on the next turn. */
8817
- output: string;
8818
- /** Provider thought (when `actorOptions.showThoughts` is true and the
8819
- * provider returns it). */
8820
- thought?: string;
8821
- }
8822
- interface AnalyzeTracesOptions {
8823
- /** Trace data source. Pass either an OTLP-JSONL path or a custom store. */
8824
- source: string | TraceAnalysisStore;
8825
- /** Caller-provided AxAIService. */
8826
- ai: AxAIService;
8827
- /** Model id forwarded to actor + responder. */
8828
- model?: string;
8829
- /** Recursion depth. 0 = no sub-agent dispatch. Default 1. */
8830
- maxDepth?: number;
8831
- /** Maximum actor turns. Default 12. */
8832
- maxTurns?: number;
8833
- /** Maximum parallel sub-agent calls in batched llmQuery. Default 2. */
8834
- maxParallelSubagents?: number;
8835
- /** Override the actor description. */
8836
- actorDescription?: string;
8837
- /** Override the subagent description. */
8838
- subagentDescription?: string;
8839
- /** Per-turn observability hook. */
8840
- onTurn?: (turn: AnalyzeTracesTurnSnapshot) => void | Promise<void>;
8841
- /** Override max runtime characters per turn. Default 6000. */
8842
- maxRuntimeChars?: number;
8843
- /** When set, every turn's snapshot is appended to this JSONL file
8844
- * immediately. If the analyst crashes mid-loop (provider 503,
8845
- * network error, validator reject) the partial reasoning is still
8846
- * on disk. Replay the file with the responder afterward to recover
8847
- * evidence. */
8848
- progressLogPath?: string;
8849
- }
8850
- /**
8851
- * Run the trace analyst.
8852
- *
8853
- * Throws:
8854
- * - `TraceFileMissingError` if `source` is a path and doesn't exist.
8855
- * - `AxAgentClarificationError` if the analyst asks for clarification.
8856
- * - Provider errors (auth, rate limits) propagate from the AI service.
8857
- */
8858
- declare function analyzeTraces(input: AnalyzeTracesInput, options: AnalyzeTracesOptions): Promise<AnalyzeTracesResult>;
8859
-
8860
- /**
8861
- * `OtlpFileTraceStore` — read-only OTLP-JSONL trace store for the
8862
- * trace-analyst.
8863
- *
8864
- * Wire shape. Each line of the input file is one OTLP-shaped span. The
8865
- * store understands flattened OTLP JSONL plus the OpenInference vocab.
8866
- * We project upstream's full
8867
- * span shape down to `TraceAnalystSpan` lazily — full materialisation
8868
- * only happens for the spans the agent actually requests.
8869
- *
8870
- * Indexing. On first read the store builds an in-memory index keyed
8871
- * by `trace_id` carrying:
8872
- * - byte offsets + lengths for each span line (for surgical reads
8873
- * without re-parsing the whole file)
8874
- * - a `TraceAnalystTraceSummary` rollup
8875
- * - sets of services / agents / models / tools / has_errors
8876
- * - byte size of the trace's JSONL slab
8877
- *
8878
- * Memory bound. The index keeps span metadata only — names, kinds,
8879
- * offsets, status. Attribute payloads stay on disk until requested.
8880
- * For a 50MB JSONL with 50k spans, the index is ~5MB.
8881
- *
8882
- * Concurrency. The store builds the index once on first read and
8883
- * caches it. Subsequent reads reuse the index. The file is opened on
8884
- * each read; we never hold a long-lived FD.
8885
- */
8886
-
8887
- interface OtlpFileTraceStoreOptions {
8888
- /** Path to the OTLP-JSONL file. */
8889
- path: string;
8890
- /** Override the discovery (`viewTrace`) per-attribute byte cap. */
8891
- perAttributeViewBudget?: number;
8892
- /** Override the surgical (`viewSpans`) per-attribute byte cap. */
8893
- perAttributeSpanBudget?: number;
8894
- /** Override the per-call ceiling that triggers oversized summaries. */
8895
- perCallByteCeiling?: number;
8896
- /** Override the per-match text budget. */
8897
- perMatchTextBudget?: number;
8898
- }
8899
- declare class OtlpFileTraceStore implements TraceAnalysisStore {
8900
- private readonly path;
8901
- private readonly perAttributeViewBudget;
8902
- private readonly perAttributeSpanBudget;
8903
- private readonly perCallByteCeiling;
8904
- private readonly perMatchTextBudget;
8905
- private indexPromise?;
8906
- /** Cached UTF-8 buffer of the file. We pin it once because every
8907
- * read needs slice access and re-reading on each call balloons the
8908
- * syscall count. */
8909
- private bufferPromise?;
8910
- constructor(opts: OtlpFileTraceStoreOptions);
8911
- getOverview(filters?: TraceAnalystFilters): Promise<DatasetOverview>;
8912
- queryTraces(opts: {
8913
- filters?: TraceAnalystFilters;
8914
- limit: number;
8915
- offset?: number;
8916
- }): Promise<QueryTracesPage>;
8917
- countTraces(filters?: TraceAnalystFilters): Promise<number>;
8918
- viewTrace(opts: {
8919
- trace_id: string;
8920
- per_attribute_byte_cap?: number;
8921
- }): Promise<ViewTraceResult>;
8922
- viewSpans(opts: {
8923
- trace_id: string;
8924
- span_ids: readonly string[];
8925
- per_attribute_byte_cap?: number;
8926
- }): Promise<ViewSpansResult>;
8927
- searchTrace(opts: {
8928
- trace_id: string;
8929
- regex_pattern: string;
8930
- max_matches?: number;
8931
- }): Promise<SearchTraceResult>;
8932
- searchSpan(opts: {
8933
- trace_id: string;
8934
- span_id: string;
8935
- regex_pattern: string;
8936
- max_matches?: number;
8937
- }): Promise<SearchSpanResult>;
8938
- /** Force the index to materialise. Useful to amortise startup cost
8939
- * before the first agent call. */
8940
- ensureIndexed(): Promise<void>;
8941
- private buffer;
8942
- private index;
8943
- private buildIndex;
8944
- private matchedTraces;
8945
- private toSummary;
8946
- private projectSpan;
8947
- private buildOversizedSummary;
8948
- private scanSpanForMatches;
8949
- }
8950
- declare class TraceFileMissingError extends Error {
8951
- constructor(path: string);
8952
- }
8953
- declare class TraceNotFoundError extends Error {
8954
- readonly trace_id: string;
8955
- constructor(trace_id: string);
8956
- }
8957
- declare class SpanNotFoundError extends Error {
8958
- readonly trace_id: string;
8959
- readonly span_id: string;
8960
- constructor(trace_id: string, span_id: string);
8961
- }
8962
-
8963
- /**
8964
- * Trace-analyst tool surface — six namespaced AxFunctions the analyst
8965
- * agent calls from generated JS code via `traces.<name>(...)`.
8966
- *
8967
- * Discovery → narrow → deep-read protocol. Tool names + ordering
8968
- * support RLM discovery:
8969
- *
8970
- * 1. `getDatasetOverview` (cheap) — first call, sizes the dataset
8971
- * 2. `queryTraces` — paginated summaries with `raw_jsonl_bytes`
8972
- * 3. `countTraces` — cheap pre-flight before regex
8973
- * 4. `viewTrace` — full span list, oversized → summary
8974
- * 5. `viewSpans` — surgical 16KB-cap reads
8975
- * 6. `searchTrace` / `searchSpan` — bounded regex hits
8976
- *
8977
- * Failure mode. Tool handlers throw on bad input (invalid trace ids,
8978
- * out-of-range pagination, malformed regex). Ax converts thrown errors
8979
- * into actor-visible `[ERROR]` strings so the analyst can adjust on
8980
- * the next turn instead of looping.
8981
- */
8982
-
8983
- interface BuildTraceAnalystToolsOpts {
8984
- store: TraceAnalysisStore;
8985
- /** Override the default sample-trace-id slot count (20). Mostly for tests. */
8986
- sampleTraceLimit?: number;
8987
- }
8988
- /**
8989
- * Build the trace-analyst function set. Pass the result into
8990
- * `agent(...).functions.local`.
8991
- */
8992
- declare function buildTraceAnalystTools(opts: BuildTraceAnalystToolsOpts): AxFunction[];
8993
- /**
8994
- * Convenience: same shape as `buildTraceAnalystTools` but returns the
8995
- * grouped form expected when registering trace tools alongside other
8996
- * agent function modules. */
8997
- declare function traceAnalystFunctionGroup(opts: BuildTraceAnalystToolsOpts): {
8998
- namespace: string;
8999
- title: string;
9000
- selectionCriteria: string;
9001
- description: string;
9002
- functions: AxFunction[];
9003
- };
9004
-
9005
- /** Ax RLM prompt for bounded trace discovery and evidence-backed analysis. */
9006
- declare const TRACE_ANALYST_ACTOR_DESCRIPTION = "You answer questions about an OTLP-shaped JSONL trace dataset using the trace tools provided in the `traces` namespace.\n\nDISCOVERY \u2192 NARROW \u2192 DEEP-READ protocol \u2014 follow exactly:\n\n1. ALWAYS call `traces.getDatasetOverview({})` FIRST without a regex_pattern. The result tells you total_traces, raw_jsonl_bytes, services, agents, models, and sample_trace_ids (real ids \u2014 never fabricate one).\n\n2. Use raw_jsonl_bytes to gauge how expensive raw scans will be. `filters.regex_pattern` is the one scan-heavy filter on getDatasetOverview / queryTraces / countTraces \u2014 narrow with indexed fields (has_errors, model_names, service_names, agent_names, time bounds) BEFORE adding a regex on a large dataset.\n\n3. To list more traces than the sample, call `traces.queryTraces({ filters?, limit, offset? })`. Each summary carries raw_jsonl_bytes \u2014 use it to choose between viewTrace and searchTrace BEFORE calling either.\n\n4. Per-trace inspection:\n - SMALL trace (raw_jsonl_bytes well under 150_000): call `traces.viewTrace({ trace_id })`. Returns all spans. Per-attribute payloads are head-capped at ~4KB; large `input.value` / `output.value` / `llm.input_messages` will show a `[trace-analyst truncated: N bytes]` marker.\n - LARGE trace (raw_jsonl_bytes near or above 150_000, or you saw an `oversized` response): use `traces.searchTrace({ trace_id, regex_pattern })` to get bounded SpanMatchRecords (span metadata + matched text + surrounding context). Then call `traces.viewSpans({ trace_id, span_ids: [...] })` for surgical reads (~16KB cap, 4\u00D7 higher than discovery), or `traces.searchSpan({ trace_id, span_id, regex_pattern })` for one large span. Stays bounded regardless of trace size.\n - Useful regex patterns: `STATUS_CODE_ERROR` (failures), tool names like `grep` or `view_trace`, error strings like `MaxTurnsExceeded`, model names, attribute keys.\n\n5. 
ONLY call viewTrace / viewSpans / searchTrace / searchSpan with trace/span ids you have already seen in sample_trace_ids, a queryTraces page, or a previous search result. Never invent ids.\n\n5a. **Result-shape contract** \u2014 searchTrace and searchSpan return `{ trace_id, hits, total_matches, has_more }`. Iterate `result.hits` (NOT result.matches). Each hit has `{ span_id, span_name, span_kind, attribute_path, matched_text, context_before, context_after, match_offset }`. viewTrace returns `{ trace_id, spans }` (or `oversized`). viewSpans returns `{ trace_id, spans, missing_span_ids, truncated_attribute_count }`. Never assume a field name \u2014 log the result shape first if unsure.\n\n6. If viewTrace returns an `oversized` summary instead of `spans`, DO NOT retry the same call. Read the summary's top_span_names, span_count, span_response_bytes_max, error_span_count to plan a follow-up: switch to searchTrace (or searchSpan for one large span), then viewSpans on a smaller, surgical span_ids set.\n\n7. If searchTrace or searchSpan returns has_more=true, REFINE the regex to be more specific rather than blindly raising max_matches.\n\n8. If a tool errors (invalid regex, range error), STOP and reconsider \u2014 don't retry with a guessed id or argument. Use the discovery tools above to recover.\n\n9. If a ~4KB-truncated payload from viewTrace / searchTrace matters for your answer, first try viewSpans on that span id (~16KB cap). If a 16KB-truncated payload from viewSpans still matters, narrow further with searchSpan against a more specific regex rather than asking for the full payload again.\n\n10. If maxDepth > 0 and the question splits into independent semantic branches, delegate well-defined subtasks to subagents using `await llmQuery(...)`. Pass narrow context and a focused query. 
Examples:\n\n const reviews = await llmQuery([\n { query: 'Drill into trace abc123 \u2014 what tool calls preceded the failure?', context: { trace_id: 'abc123' } },\n { query: 'Drill into trace def456 \u2014 same failure mode?', context: { trace_id: 'def456' } },\n ]);\n\nOBSERVABILITY rules:\n- Each non-final actor turn must emit at least one `console.log(...)` for evidence. Up to 3 logs per turn is fine when correlating multiple data sources (e.g. one log for findings list, one for source-file content, one for derived analysis).\n- Do NOT combine `console.log` with `final(...)` or `askClarification(...)` in the same turn \u2014 finish gathering data first, then call final on its own turn.\n- Reuse runtime variables across turns; don't recompute.\n- When done, call `await final(answer)` with the fully-formed report. The responder rewrites the answer into output fields; if you only pass a vague summary string the responder has nothing concrete to format.\n\nCRITICAL \u2014 `final()` payload contract for evidence-grounded analysis tasks:\n- Pass a STRUCTURED object as the second arg with the actual data the responder needs to format the answer. Do NOT pass abstract instructions; pass evidence.\n- Example for per-item verdict tasks:\n ```js\n await final(\"Format the per-item verdict report from the evidence below.\", {\n findings: [\n { id: 'sub-1-finding-1', claim: '...', verdict: 'TRUE-POSITIVE', evidence: 'lines 42-45 of contracts/X.sol show ...' },\n ...all items\n ],\n systemic_summary: '3 sentences I wrote based on the evidence above'\n });\n ```\n- Calling `final(\"answer\", {})` with no evidence is a failure mode \u2014 the responder will hallucinate or echo back the field names. Always include the gathered data.\n- Premature final after a single viewSpans call is INSUFFICIENT for per-finding analysis tasks. Read the requested attributes (e.g. `spans[i].attributes['redteam.finding.title']`), and for each one perform the requested cross-reference (e.g. 
read the source SPAN's `attributes['source.content']`).\n\nOUTPUT contract \u2014 your final answer must include:\n- A clear prose conclusion answering the user's question.\n- Trace ids and span ids cited as evidence for each claim.\n- Failure modes named in the user's domain language, with frequency and concrete examples.\n\nDo NOT invent trace ids, span ids, error messages, or model names. Every fact must be traceable to a tool result.";
9007
- declare const TRACE_ANALYST_ACTOR_DESCRIPTION_VERSION = "trace-analyst-actor-v5-2026-05-06";
9008
- /** Subagent prompt for focused trace-inspection subtasks. */
9009
- declare const TRACE_ANALYST_SUBAGENT_DESCRIPTION = "You are a trace-analyst subagent. Your parent has delegated a focused trace-inspection question. Use the same DISCOVERY \u2192 NARROW \u2192 DEEP-READ protocol but stay tightly scoped: do exactly what was asked, return a concise compact answer, do NOT spawn further subagents unless the parent's question is genuinely multi-branch.\n\nCite trace ids and span ids for every claim. Do NOT invent ids.";
9010
-
9011
- interface TraceInsightTask {
9012
- id: string;
9013
- name: string;
9014
- prompt?: string;
9015
- difficulty?: string;
9016
- tags?: string[];
9017
- outcome?: string;
9018
- score?: number;
9019
- gaps?: string[];
9020
- }
9021
- interface TraceInsightSuite {
9022
- name: string;
9023
- collectionId?: string;
9024
- tasks: TraceInsightTask[];
9025
- }
9026
- interface TraceInsightFinding {
9027
- kind: string;
9028
- severity?: string;
9029
- taskIds: string[];
9030
- evidence?: string;
9031
- proposedFixClass?: string;
9032
- }
9033
- interface TraceInsightQuestion {
9034
- id: string;
9035
- question: string;
9036
- why: string;
9037
- }
9038
- interface TraceInsightPanelRole {
9039
- id: string;
9040
- name: string;
9041
- responsibility: string;
9042
- }
9043
- interface TraceInsightPromptInput {
9044
- suite: TraceInsightSuite;
9045
- findings?: TraceInsightFinding[];
9046
- agent?: Record<string, unknown>;
9047
- totals?: Record<string, unknown>;
9048
- maxRepresentativeTraces?: number;
9049
- }
9050
- interface TraceInsightContext {
9051
- suite: TraceInsightSuite;
9052
- scope: string;
9053
- keywords: string[];
9054
- questions: TraceInsightQuestion[];
9055
- panel: TraceInsightPanelRole[];
9056
- findings: TraceInsightFinding[];
9057
- agent: Record<string, unknown> | null;
9058
- totals: Record<string, unknown> | null;
9059
- }
9060
- interface TraceInsightQualityGate {
9061
- id: string;
9062
- label: string;
9063
- passed: boolean;
9064
- severity: 'critical' | 'high' | 'medium' | 'low';
9065
- detail: string;
9066
- }
9067
- interface TraceInsightReadiness {
9068
- score: number;
9069
- grade: 'external-ready' | 'internal-review' | 'raw-analysis';
9070
- gates: TraceInsightQualityGate[];
9071
- }
9072
- declare function tokenizeDomainWords(value: string): string[];
9073
- declare function inferDomainKeywords(suite: TraceInsightSuite): string[];
9074
- declare function domainEvidencePattern(keywords: string[]): RegExp;
9075
- declare function describeTraceInsightScope(suite: TraceInsightSuite): string;
9076
- declare function planTraceInsightQuestions(input: TraceInsightPromptInput): TraceInsightQuestion[];
9077
- declare function buildTraceInsightContext(input: TraceInsightPromptInput): TraceInsightContext;
9078
- declare function scoreTraceInsightReadiness(context: TraceInsightContext): TraceInsightReadiness;
9079
- declare function defaultTraceInsightPanel(): TraceInsightPanelRole[];
9080
- declare function buildTraceInsightPrompt(input: TraceInsightPromptInput): string;
9081
-
9082
- export { type ActionExecutionPolicy, type ActionPolicyDecision, type ActionableSideInfo, type ActiveLearningOptions, type AdapterRun, AgentDriver, type AgentDriverConfig, type AlignmentOp, type AnalyzeTracesInput, type AnalyzeTracesOptions, type AnalyzeTracesResult, type AnalyzeTracesTurnSnapshot, type AntiSlopConfig, type AntiSlopIssue, type AntiSlopReport, type Artifact$1 as Artifact, type ArtifactCheck, type Artifact as ArtifactCheckArtifact, type ArtifactResult, type ArtifactValidator, type AsiSeverity, AxGepaSteeringOptimizer, type AxSteeringOptimizerConfig, type BaselineOptions, type BaselineReport, BehaviorAssertion, type BenchmarkReport, BenchmarkRunner, type BenchmarkRunnerConfig, type BestOfNResult, type BisectOptions, type BisectResult, type BisectStep, type BootstrapOptions, type BootstrapResult, BudgetBreachError, type BudgetBreachFinding, type BudgetBreachReport, BudgetGuard, type BudgetLedgerEntry, type BudgetSpec, BuilderSession, type BuilderSessionInit, type CalibrationBin, type CalibrationOptions, type CalibrationReport, type CalibrationResult, CallExpectation, CallbackResearcher, type CallbackResearcherOptions, type CanaryAlert, type CanaryKind, type CanaryLeak, type CanaryOptions, type CanaryReport, type CanarySeverity, type CandidateScenario, type CandidateScore, type CausalAttributionReport, type ChatSummary, type CheckResult, type CodeMutationOutcome, type CodeMutationRunner, type CollectedArtifacts, type CommandRunner, type CompletionCriterion, type CompositePolicy, type ConceptComplexity, type ConceptFinding, type ConceptSpec, type ConceptWeightStrategy, type ContinuityCheck, type ContinuityCheckResult, type ContinuityReport, type ContinuitySnapshotPair, type ContractMetric, type ContractReport, type ControlActionFailureMode, type ControlActionOutcome, type ControlBudget, type ControlContext, type ControlDecision, type ControlEvalResult, type ControlRunResult, type ControlRuntimeConfig, type ControlRuntimeError, type ControlSeverity, type 
ControlStep, type ControlStopPolicies, ConvergenceTracker, type CorrelationReport, type CorrelationResult, type CorrelationStudyOptions, type CorrelationStudyResult, type CostEntry, CostLedger, type CostLedgerGeneration, type CostLedgerSnapshot, type CostSummary, CostTracker, type CounterfactualContext, type CounterfactualMutation, type CounterfactualResult, type CounterfactualRunner, type CreateCompositeMutatorOpts, type CreateDefaultReviewerOptions, type CreateSandboxCodeMutatorOpts, type CreateSandboxPoolOpts, type CrossTraceDiff, type CrossTraceDiffOptions, D1ExperimentStore, type D1ExperimentStoreOptions, type D1Like, type D1PreparedStatementLike, DEFAULT_AGENT_SLOS, DEFAULT_COMPLEXITY_WEIGHTS, DEFAULT_RULES as DEFAULT_FAILURE_RULES, DEFAULT_FINDERS, DEFAULT_HARNESS_OBJECTIVES, DEFAULT_MUTATION_PRIMITIVES, DEFAULT_MUTATORS, DEFAULT_REDACTION_RULES, DEFAULT_RED_TEAM_CORPUS, DEFAULT_RUN_SCORE_WEIGHTS, DEFAULT_SEVERITY_WEIGHTS, DEFAULT_TRACE_ANALYST_BUDGETS, type DataAcquisitionPlan, Dataset, type DatasetDifficulty, type DatasetManifest, type DatasetOverview, type DatasetProvenance, type DatasetScenario, type DatasetSplit, type DeployFamily, type DeployGateLayerInput, type DeployRunResult, type DeployRunner, type DeploymentOutcome, type DirEntry, type Direction, type DivergenceOptions, type DivergenceReport, DockerSandboxDriver, type DriverResult, type DriverState, DualAgentBench, type DualAgentBenchConfig, type DualAgentReport, type DualAgentRound, type DualAgentScenario, type DualAgentScenarioResult, ERROR_COUNT_PATTERNS, type ErrorCountPattern, type EuRiskClass, type EvalMetricSpec, type EvalResult, type EventFilter, type EventKind, type EvolutionRound, type EvolvableVariant, type ExecutorConfig, type Expectation, type Experiment, type ExperimentPlan, type ExperimentResult, type Run as ExperimentRun, type ExperimentStore, ExperimentTracker, type ExportedRewardModel, type ExtractOptions, type ExtractResult, FAILURE_CLASSES, type FactorContribution, type 
FactorialCell, type FailureClass, type FailureClassification, type FailureCluster, type FailureClusterReport, type FailureContext, type FailureMode, type FailureRule, type FeedbackArtifactType, type FeedbackAttempt, type FeedbackLabel, type FeedbackLabelKind, type FeedbackLabelSource, type FeedbackOptimizerRow, type FeedbackOutcome, type FeedbackPattern, type FeedbackReplayAdapter, type FeedbackReplayResult, type FeedbackSeverity, type FeedbackSplitPolicy, type FeedbackTask, type FeedbackTrajectory, type FeedbackTrajectoryFilter, type FeedbackTrajectoryStore, FileSystemExperimentStore, type FileSystemExperimentStoreOptions, FileSystemFeedbackTrajectoryStore, FileSystemOutcomeStore, type FileSystemOutcomeStoreOptions, FileSystemTraceStore, type FileSystemTraceStoreOptions, type Finding, type FlowAction, type FlowLayerEnv, type FlowLayerFactoryInput, type FlowRunner, type FlowRunnerStepResult, type FlowSpec, type FlowStep, type GainDistributionBin, type GainDistributionFigureSpec, type GainDistributionOptions, type GateDecision, type GateEvidence, type GenerationReport, type GenericSpan, type GoldenItem, type GoldenSeverity, type GoldenSpec, type GovernanceContext, type GovernanceFinding, type GovernanceReport, type GradedStep, type HarnessAdapter, type HarnessConfig, type HarnessExperimentConfig, type HarnessExperimentResult, type HarnessIntervention, type HarnessRunRequest, type HarnessRunResult, type HarnessScenario, type HarnessSelection, type HarnessVariant, type HarnessVariantReport, HeldOutGate, type HeldOutGateConfig, type HeldOutGateRejectionCode, HoldoutAuditor, HoldoutLockedError, type HostedJudgeConfig, type HostedJudgeDimension, type HostedJudgeRequest, type HostedJudgeResponse, type HostedRunCriticConfig, type HostedRunScoreRequest, type HostedRunScoreResponse, type HypothesisManifest, type HypothesisResult, INTENT_MATCH_JUDGE_VERSION, type ImageData, InMemoryExperimentStore, InMemoryFeedbackTrajectoryStore, InMemoryOutcomeStore, InMemoryTraceStore, 
InMemoryTrialCache, InMemoryWorkspaceInspector, type InferenceScorer, type InspectorContext, type IntentMatchInput, type IntentMatchOptions, type IntentMatchResult, type InteractionContribution, JsonlTrialCache, type JudgeAgreementReport, type JudgeConfig, type JudgeFleetOptions, type JudgeFn, type JudgeInput, type JudgePair, type JudgeReplayGateArgs, type JudgeReplayResult, type JudgeRubric, JudgeRunner, type JudgeScore, type JudgeSpan, type KeywordConceptSpec, type KeywordCoverageFinding, type KeywordCoverageOptions, type KeywordCoverageResult, type KnowledgeAcquisitionMode, type KnowledgeBundle, type KnowledgeFallbackPolicy, type KnowledgeFreshness, type KnowledgeImportance, type KnowledgeReadinessReport, type KnowledgeRecommendedAction, type KnowledgeRequirement, type KnowledgeRequirementCategory, type KnowledgeResponsibleSurface, type KnowledgeSensitivity, type LangfuseEnvelope, type LangfuseGeneration, type LangfuseScore, type Layer, type LayerCorrelation, type LayerResult, type LayerStatus, type LineageKind, type LineageKindResolver, type LineageNode, LineageRecorder, type LiveProofArtifact, type LiveProofConfig, type LiveProofContext, type LiveProofResult, LlmCallError, type LlmCallRequest, type LlmCallResult, LlmClient, type LlmClientOptions, type LlmJsonCall, type LlmMessage, type LlmReviewerConfig, type LlmSpan, type LlmUsage, LockedJsonlAppender, MODEL_PRICING, type MatchResult, type MatcherResult, type MeasurementPolicy, type MergeOptions, type Message, type MetricSamples, type MetricVerdict, MetricsCollector, type MuffledFinder, type MuffledFinding, MultiLayerVerifier, type MultiShotGateConfig, type MultiShotGateResult, type MultiShotMutateAdapter, type MultiShotOptimizationConfig, type MultiShotOptimizationResult, type MultiShotRun, type MultiShotRunInput, type MultiShotRunner, type MultiShotScore, type MultiShotScorer, type MultiShotSplit, type MultiShotTrace, type MultiShotTrialResult, type MultiShotVariant, type MultiToolchainLayerConfig, type 
MutateAdapter, type MutationAttempt, type MutationChannel, MutationTelemetry, type Mutator, Mutex, NoopResearcher, OTEL_AGENT_EVAL_SCOPE, type Objective, type Oracle, type OracleObservation, type OracleReport, type OracleResult, type OrthogonalityInput, type OrthogonalityResult, type OtlpExport, OtlpFileTraceStore, type OtlpFileTraceStoreOptions, type OtlpResourceSpans, type OtlpSpan, type OutcomeFilter, type OutcomePair, type OutcomeStore, type PairedBootstrapOptions, type PairedBootstrapResult, PairwiseSteeringOptimizer, type ParaphraseRobustnessScenarioInput, type ParaphraseRobustnessScenarioResult, type ParetoFigureSpec, type ParetoPoint, type ParetoResult, type PersonaConfig, type Playbook, type PlaybookEntry, type PoolSlot, type PositionalBiasResult, type PreferenceMemoryEntry, type PrmGradedTrace, PrmGrader, type PrmTrainingSample, ProductClient, type ProductClientConfig, type ProjectKind, ProjectRegistry, type ProjectSummary, type ProjectTimelineEntry, type PromptEvolutionConfig, type PromptEvolutionEvent, type PromptEvolutionResult, type PromptHandle, PromptRegistry, type TrialResult as PromptTrialResult, type ProposeFn, type ProposeInput, type ProposeOutput, type ProposeReviewConfig, type ProposeReviewControlAction, type ProposeReviewControlConfig, type ProposeReviewControlResult, type ProposeReviewControlState, type ProposeReviewReport, type ProposeReviewShot, type ProposedSideEffect, type QueryTracesPage, REDACTION_VERSION, type RedTeamCase, type RedTeamCategory, type RedTeamFinding, type RedTeamPayload, type RedTeamReport, type RedactionReport, type RedactionRule, type ReferenceMatchResult, type ReferenceReplayAdapter, type ReferenceReplayAdapterFn, type ReferenceReplayAdapterLike, type ReferenceReplayAggregate, type ReferenceReplayCandidate, type ReferenceReplayCase, type ReferenceReplayCaseRun, type ReferenceReplayExecutionScenario, type ReferenceReplayItem, type ReferenceReplayMatch, type ReferenceReplayMatchStrategy, type ReferenceReplayMatcher, 
type ReferenceReplayPromotionDecision, type ReferenceReplayPromotionPolicy, type ReferenceReplayRun, type ReferenceReplayRunContext, type ReferenceReplayRunOptions, type ReferenceReplayRunStore, type ReferenceReplayScenario, type ReferenceReplayScenarioScore, type ReferenceReplayScore, type ReferenceReplayScoreOptions, type ReferenceReplaySplit, type ReferenceReplaySplitComparison, type ReferenceReplaySteeringRowsOptions, type ReflectionContext, type ReflectionProposal, type RegressionOptions, type RegressionSpec, type ReleaseConfidenceAxis, type ReleaseConfidenceAxisName, type ReleaseConfidenceInput, type ReleaseConfidenceIssue, type ReleaseConfidenceMetrics, type ReleaseConfidenceScorecard, type ReleaseConfidenceStatus, type ReleaseConfidenceThresholds, type ReleaseTraceEvidence, type Researcher, type RetrievalSpan, type Review, type ReviewFn, type ReviewInput, type ReviewMemoryEntry, type ReviewMemoryStore, type ReviewerMemoryEntry, type ReviewerOutput, type ReviewerPromptInput, type ReviewerSoftFailDefaults, type ReviewerVerificationSummary, type RobustnessResult, type RouteMap, type RubricDimension, type Run$1 as Run, type RunAppScenarioOptions, type RunCommandInput, type RunCommandResult, type RunConfig, RunCritic, type RunCriticOptions, type RunDiff, type RunFilter, type RunLayer, RunRecord, type RunScore, type RunScoreWeights, RunSplitTag, type RunStatus, type RunTrace, SEMANTIC_CONCEPT_JUDGE_VERSION, type SandboxDriver, SandboxHarness, type SandboxHarnessResult, type SandboxJudgeKind, type SandboxJudgeResult, type SandboxJudgeSpec, type SandboxPool, type SandboxResult, type SandboxSpan, type ScanOptions, type Scenario, type ScenarioAggregate, type ScenarioCost, type ScenarioFile, ScenarioRegistry, type ScenarioResult, type ScoreAdapter, type ScoreKnowledgeReadinessOptions, type ScoredTarget, type SearchSpanResult, type SearchTraceResult, type SelfPlayOptions, type SelfPlayProposer, type SelfPlayScorer, type SelfPreferenceResult, type 
SemanticConceptJudgeInput, type SemanticConceptJudgeOptions, type SemanticConceptJudgeResult, type SeriesConvergenceOptions, type SeriesConvergenceResult, type Severity, type ShipOptions, type SignedManifest, type SignedManifestAlgo, type SliceOptions, type Slo, type SloCheckResult, type SloComparator, type SloReport, type SloSeverity, type SlopCategory, type SlotFactory, type Span, type SpanBase, type SpanFilter, type SpanHandle, type SpanKind, type SpanMatchRecord, SpanNotFoundError, type SpanStatus, type SteeringBundle, type SteeringChange, type SteeringDelta, type SteeringOptimizationResult, type SteeringOptimizationRow, type SteeringOptimizationSelector, type SteeringOptimizerBackend, type SteeringOptimizerConfig, type SteeringRolePrompt, type StepAttribution, type StepContext, type StepRubric, type StopDecision, type StuckLoopFinding, type StuckLoopOptions, type StuckLoopReport, SubprocessSandboxDriver, type SubprocessSandboxDriverOptions, type SummaryTable, type SummaryTableOptions, type SummaryTableRow, type SynthesisReason, type SynthesisTarget, TRACE_ANALYST_ACTOR_DESCRIPTION, TRACE_ANALYST_ACTOR_DESCRIPTION_VERSION, TRACE_ANALYST_SUBAGENT_DESCRIPTION, TRACE_ANALYST_TRUNCATION_MARKER_PREFIX, TRACE_SCHEMA_VERSION, type TestGradedRunOptions, type TestGradedRunResult, type TestGradedScenario, type TestOutputParser, type TestResult, type ThreeLayerProjectReport, type ThresholdContract, TokenCounter, type TokenSpec, type ToolSpan, type ToolStats, type ToolUseMetrics, type ToolUseOptions, type ToolWasteFinding, type ToolWasteOptions, type ToolWasteReport, type TraceAnalysisStore, type TraceAnalystByteBudgets, type TraceAnalystFilters, type TraceAnalystSpan, type TraceAnalystSpanKind, type TraceAnalystSpanStatus, type TraceAnalystTraceSummary, TraceEmitter, type TraceEmitterOptions, type TraceEvent, TraceFileMissingError, type TraceInsightContext, type TraceInsightFinding, type TraceInsightPanelRole, type TraceInsightPromptInput, type TraceInsightQualityGate, 
type TraceInsightQuestion, type TraceInsightReadiness, type TraceInsightSuite, type TraceInsightTask, TraceNotFoundError, type TraceStore, type Trajectory, type TrajectoryStep, type TrialAttempt, type TrialCache, TrialTelemetry, type TrialTrace, type Turn, type TurnMetrics, type TurnResult, UNIVERSAL_FINDERS, type UseCaseSignals, type UserQuestion, type ValidationContext, type ValidationIssue, type ValidationResult, type VariantAggregate, type VerbosityBiasResult, type Verdict, type Verification, type VerificationReport, type VerifyContext, type VerifyFn, type VerifyOptions, type ViewSpansResult, type ViewTraceOversized, type ViewTraceResult, type VisualDiffOptions, type VisualDiffResult, type ViteDeployRunnerInput, type WorkflowTopology, type WorkspaceAssertion, type WorkspaceAssertionResult, type WorkspaceInspector, type WorkspaceSnapshot, type WranglerDeployRunnerInput, acquisitionPlansForKnowledgeGaps, adversarialJudge, aggregateLlm, aggregateRunScore, allCriticalPassed, analyzeAntiSlop, analyzeSeries, analyzeTraces, argHash, assertReleaseConfidence, assignFeedbackSplit, attributeCounterfactuals, benjaminiHochberg, bhAdjust, bisect, blockingKnowledgeEval, bonferroni, bootstrapCi, budgetBreachView, buildReflectionPrompt, buildReviewerPrompt, buildTraceAnalystTools, buildTraceInsightContext, buildTraceInsightPrompt, buildTrajectory, byteLengthRange, calibrateJudge, calibrationCurve, callLlm, callLlmJson, canaryLeakView, causalAttribution, checkBehavioralCanary, checkCanaries, checkSlos, clamp01, classifyEuAiRisk, classifyFailure, codeExecutionJudge, cohensD, coherenceJudge, collectionPreserved, commitBisect, compareReferenceReplay, compareToBaseline, compilerJudge, composeParsers, composeValidators, computeToolUseMetrics, confidenceInterval, containsAll, controlFailureClassFromVerification, controlRunToFeedbackTrajectory, correlateLayers, correlationStudy, createAntiSlopJudge, createCompositeMutator, createCustomJudge, createDefaultReviewer, 
createDomainExpertJudge, createFeedbackTrajectory, createIntentMatchJudge, createLlmReviewer, createSandboxCodeMutator, createSandboxPool, createSemanticConceptJudge, crossTraceDiff, crowdingDistance, decideReferenceReplayPromotion, decideReferenceReplayRunPromotion, defaultJudges, defaultMultiShotObjectives, defaultReferenceReplayMatcher, defaultTraceInsightPanel, deployGateLayer, describeTraceInsightScope, distillPlaybook, domainEvidencePattern, dominates, estimateCost, estimateTokens, euAiActReport, evaluateActionPolicy, evaluateContract, evaluateHypothesis, evaluateOracles, evaluateReleaseConfidence, executeScenario, expectAgent, exportRewardModel, exportRunAsOtlp, exportTrainingData, extractAssetUrls, extractErrorCount, failureClusterView, feedbackTrajectoriesToDatasetScenarios, feedbackTrajectoriesToOptimizerRows, feedbackTrajectoryToDatasetScenario, feedbackTrajectoryToOptimizerRow, fileContains, fileExists, findAutoMatchNoExpectation, findConstructorCwdDropped, findFallbackToPass, findLiteralTruePass, findSkipCountsAsPass, firstDivergenceView, flowLayer, formatBenchmarkReport, formatDriverReport, formatFindings, gainHistogram, precision as goldenPrecision, gradeSemanticStatus, groupBy, hashContent, hashScenarios, htmlContainsElement, inMemoryReferenceReplayStore, inMemoryReviewStore, inferDomainKeywords, interRaterReliability, iqr, isJudgeSpan, isLlmSpan, isPrmVerdict, isRetrievalSpan, isSandboxSpan, isToolSpan, jestTestParser, jsonHasKeys, jsonShape, jsonlReferenceReplayStore, jsonlReviewStore, judgeAgreementView, judgeReplayGate, judgeSpans, keyPreserved, linterJudge, llmSpanFromProvider, llmSpans, loadScorerFromGrader, localCommandRunner, lowercaseMutator, mannWhitneyU, matchGoldens, mergeLayerResults, mergeSteeringBundle, multiToolchainLayer, nistAiRmfReport, nonRefusalRubric, normalizeScores, notBlocked, objectiveEval, outputLengthRubric, pairedBootstrap, pairedTTest, pairedWilcoxon, paraphraseRobustness, paraphraseRobustnessScenarios, paretoChart, 
paretoFrontier, paretoFrontierWithCrowding, parseFeedbackTrajectoriesJsonl, parseReflectionResponse, partialCredit, passOrthogonality, pixelDeltaRatio, planTraceInsightQuestions, politenessPrefixMutator, positionalBias, printDriverSummary, prmBestOfN, prmEnsembleBestOfN, probeLlm, promptBisect, proposeSynthesisTargets, pytestTestParser, redTeamDataset, redTeamReport, redactString, redactValue, referenceReplayRunsToSteeringRows, referenceReplayScenarioToRunScore, regexMatch, regexMatches, regressionView, releaseTraceEvidenceFromMultiShotTrials, renderMarkdown, renderMarkdownReport, renderPlaybookMarkdown, renderPreferenceMemoryMarkdown, renderSteeringText, replayFeedbackTrajectories, replayFeedbackTrajectory, replayScorerOverCorpus, replayTraceThroughJudge, requiredSampleSize, resetLockedAppendersForTesting, resumeBuilderSession, rowCount, rowWhere, runAgentControlLoop, runAssertions, runBehavioralCanaries, runCanaries, runCounterfactual, runE2EWorkflow, runExpectations, runFailureClass, runHarnessExperiment, runIntentMatchJudge, runJudgeFleet, runKeywordCoverageJudge, runKeywordCoverageJudgeUrl, runLiveProof, runMultiShotOptimization, runPromptEvolution, runProposeReview, runProposeReviewAsControlLoop, runReferenceReplay, runSelfPlay, runSemanticConceptJudge, runTestGradedScenario, runsForScenario, scalarScore, scanForMuffledGates, scoreAllProjects, scoreContinuity, scoreKnowledgeReadiness, scoreProject, scoreRedTeamOutput, scoreReferenceReplay, scoreTraceInsightReadiness, securityJudge, selectHarnessVariant, selfPreference, sentenceReorderMutator, serializeFeedbackTrajectoriesJsonl, signManifest, soc2Report, statusAdvanced, stopOnNoProgress, stopOnRepeatedAction, stripFencedJson, stuckLoopView, subjectiveEval, summarize, summarizeHarnessResults, summarizePreferenceMemory, summaryTable, testJudge, textInSnapshot, toLangfuseEnvelope, toNdjson, toPrometheusText, tokenizeDomainWords, toolIntentAlignmentRubric, toolNamesForRun, toolNonRedundantRubric, toolSpans, 
toolSuccessRubric, toolWasteView, traceAnalystFunctionGroup, trialTraceFromMultiShotTrial, typoMutator, urlContains, userQuestionsForKnowledgeGaps, verbosityBias, verifyManifest, visualDiff, viteDeployRunner, vitestTestParser, weightedMean, weightedRecall, welchsTTest, whitespaceCollapseMutator, wilcoxonSignedRank, withAssignedFeedbackSplit, wranglerDeployRunner };
6327
+ export { ActionableSideInfo, type ActiveLearningOptions, type AdapterRun, AgentDriver, type AgentDriverConfig, type AlignmentOp, type AntiSlopConfig, type AntiSlopIssue, type AntiSlopReport, Artifact$1 as Artifact, type ArtifactCheck, type Artifact as ArtifactCheckArtifact, type ArtifactResult, type ArtifactValidator, AxGepaSteeringOptimizer, type AxSteeringOptimizerConfig, type BaselineOptions, type BaselineReport, BehaviorAssertion, type BenchmarkReport, BenchmarkRunner, type BenchmarkRunnerConfig, type BestOfNResult, type BisectOptions, type BisectResult, type BisectStep, BudgetBreachError, type BudgetBreachFinding, type BudgetBreachReport, BudgetGuard, BudgetLedgerEntry, BudgetSpec, BuilderSession, type BuilderSessionInit, type CalibrationBin, type CalibrationOptions, type CalibrationReport, type CalibrationResult, CallExpectation, type CanaryAlert, type CanaryKind, type CanaryLeak, type CanaryOptions, type CanaryReport, type CanarySeverity, type CandidateScenario, type CandidateScore, type CausalAttributionReport, type ChatSummary, type CheckResult, type CodeMutationOutcome, type CodeMutationRunner, type CollectedArtifacts, type CommandRunner, type CompletionCriterion, type CompositePolicy, type ConceptComplexity, type ConceptFinding, type ConceptSpec, type ConceptWeightStrategy, type ContinuityCheck, type ContinuityCheckResult, type ContinuityReport, type ContinuitySnapshotPair, type ContractMetric, type ContractReport, ControlEvalResult, ControlSeverity, ConvergenceTracker, type CorrelationReport, type CorrelationResult, type CorrelationStudyOptions, type CorrelationStudyResult, type CostEntry, CostLedger, type CostLedgerGeneration, type CostLedgerSnapshot, type CostSummary, CostTracker, type CounterfactualContext, type CounterfactualMutation, type CounterfactualResult, type CounterfactualRunner, type CreateCompositeMutatorOpts, type CreateDefaultReviewerOptions, type CreateSandboxCodeMutatorOpts, type CreateSandboxPoolOpts, type CrossTraceDiff, type 
CrossTraceDiffOptions, D1ExperimentStore, type D1ExperimentStoreOptions, type D1Like, type D1PreparedStatementLike, DEFAULT_AGENT_SLOS, DEFAULT_COMPLEXITY_WEIGHTS, DEFAULT_RULES as DEFAULT_FAILURE_RULES, DEFAULT_FINDERS, DEFAULT_HARNESS_OBJECTIVES, DEFAULT_MUTATORS, DEFAULT_RED_TEAM_CORPUS, DEFAULT_RUN_SCORE_WEIGHTS, DEFAULT_SEVERITY_WEIGHTS, type DataAcquisitionPlan, Dataset, DatasetManifest, DatasetScenario, type DeployFamily, type DeployGateLayerInput, type DeployRunResult, type DeployRunner, type DeploymentOutcome, type DirEntry, type DivergenceOptions, type DivergenceReport, DockerSandboxDriver, type DriverResult, type DriverState, DualAgentBench, type DualAgentBenchConfig, type DualAgentReport, type DualAgentRound, type DualAgentScenario, type DualAgentScenarioResult, ERROR_COUNT_PATTERNS, type ErrorCountPattern, type EuRiskClass, type EvalMetricSpec, type EvalResult, type EvolutionRound, EvolvableVariant, type ExecutorConfig, type Expectation, type Experiment, type Run as ExperimentRun, type ExperimentStore, ExperimentTracker, type ExportedRewardModel, type ExtractOptions, type ExtractResult, type FactorContribution, type FactorialCell, FailureClass, type FailureClassification, type FailureCluster, type FailureClusterReport, type FailureContext, type FailureRule, FeedbackLabel, type FeedbackPattern, FeedbackTrajectory, FeedbackTrajectoryStore, FileSystemExperimentStore, type FileSystemExperimentStoreOptions, FileSystemOutcomeStore, type FileSystemOutcomeStoreOptions, type Finding, type FlowAction, type FlowLayerEnv, type FlowLayerFactoryInput, type FlowRunner, type FlowRunnerStepResult, type FlowSpec, type FlowStep, type GoldenItem, type GoldenSeverity, type GoldenSpec, type GovernanceContext, type GovernanceFinding, type GovernanceReport, type GradedStep, type HarnessAdapter, type HarnessConfig, type HarnessExperimentConfig, type HarnessExperimentResult, type HarnessIntervention, type HarnessRunRequest, type HarnessRunResult, type HarnessScenario, type 
HarnessSelection, type HarnessVariant, type HarnessVariantReport, HoldoutAuditor, type HostedJudgeConfig, type HostedJudgeDimension, type HostedJudgeRequest, type HostedJudgeResponse, type HostedRunCriticConfig, type HostedRunScoreRequest, type HostedRunScoreResponse, type HypothesisManifest, type HypothesisResult, INTENT_MATCH_JUDGE_VERSION, type ImageData, InMemoryExperimentStore, InMemoryOutcomeStore, InMemoryWorkspaceInspector, type InferenceScorer, type InspectorContext, type IntegrationGateSurface, type IntegrationInvokeFailureInput, type IntegrationManifestGateInput, type IntentMatchInput, type IntentMatchOptions, type IntentMatchResult, type InteractionContribution, JsonlTrialCache, type JudgeAgreementReport, type JudgeConfig, type JudgeFleetOptions, type JudgeFn, type JudgeInput, type JudgePair, type JudgeReplayResult, type JudgeRubric, JudgeRunner, type JudgeScore, JudgeSpan, type KeywordConceptSpec, type KeywordCoverageFinding, type KeywordCoverageOptions, type KeywordCoverageResult, type KnowledgeAcquisitionMode, type KnowledgeBundle, type KnowledgeFallbackPolicy, type KnowledgeFreshness, type KnowledgeImportance, type KnowledgeReadinessReport, type KnowledgeRecommendedAction, type KnowledgeRequirement, type KnowledgeRequirementCategory, type KnowledgeResponsibleSurface, type KnowledgeSensitivity, type LangfuseEnvelope, type LangfuseGeneration, type LangfuseScore, type Layer, type LayerCorrelation, type LayerResult, type LayerStatus, type LineageKind, type LineageKindResolver, type LineageNode, LineageRecorder, type LiveProofArtifact, type LiveProofConfig, type LiveProofContext, type LiveProofResult, LlmCallError, type LlmCallRequest, type LlmCallResult, LlmClient, type LlmClientOptions, type LlmMessage, LlmSpan, type LlmUsage, LockedJsonlAppender, MODEL_PRICING, type MatchResult, type MatcherResult, type MeasurementPolicy, type MergeOptions, type MetricSamples, type MetricVerdict, MetricsCollector, type MuffledFinder, type MuffledFinding, 
MultiLayerVerifier, type MultiToolchainLayerConfig, MutateAdapter, type MutationAttempt, type MutationChannel, MutationTelemetry, type Mutator, Mutex, Objective, type Oracle, type OracleObservation, type OracleReport, type OracleResult, type OrthogonalityInput, type OrthogonalityResult, type OutcomeFilter, type OutcomePair, type OutcomeStore, PairwiseSteeringOptimizer, type ParaphraseRobustnessScenarioInput, type ParaphraseRobustnessScenarioResult, ParetoResult, type PersonaConfig, type Playbook, type PlaybookEntry, type PoolSlot, type PositionalBiasResult, type PrmGradedTrace, PrmGrader, type PrmTrainingSample, ProductClient, type ProductClientConfig, type ProjectKind, ProjectRegistry, type ProjectSummary, type ProjectTimelineEntry, type PromptHandle, PromptRegistry, TrialResult as PromptTrialResult, type RedTeamCase, type RedTeamCategory, type RedTeamFinding, type RedTeamPayload, type RedTeamReport, type ReferenceMatchResult, type ReferenceReplayAdapter, type ReferenceReplayAdapterFn, type ReferenceReplayAdapterLike, type ReferenceReplayAggregate, type ReferenceReplayCandidate, type ReferenceReplayCase, type ReferenceReplayCaseRun, type ReferenceReplayExecutionScenario, type ReferenceReplayItem, type ReferenceReplayMatch, type ReferenceReplayMatchStrategy, type ReferenceReplayMatcher, type ReferenceReplayPromotionDecision, type ReferenceReplayPromotionPolicy, type ReferenceReplayRun, type ReferenceReplayRunContext, type ReferenceReplayRunOptions, type ReferenceReplayRunStore, type ReferenceReplayScenario, type ReferenceReplayScenarioScore, type ReferenceReplayScore, type ReferenceReplayScoreOptions, type ReferenceReplaySplit, type ReferenceReplaySplitComparison, type ReferenceReplaySteeringRowsOptions, type RegressionOptions, type RegressionSpec, ReleaseConfidenceScorecard, ReleaseConfidenceThresholds, type ReviewerMemoryEntry, type ReviewerOutput, type ReviewerPromptInput, type ReviewerSoftFailDefaults, type ReviewerVerificationSummary, type RobustnessResult, 
type RouteMap, type RubricDimension, Run$1 as Run, type RunAppScenarioOptions, type RunCommandInput, type RunCommandResult, type RunConfig, RunCritic, type RunCriticOptions, type RunDiff, RunFilter, RunRecord, type RunScore, type RunScoreWeights, type RunTrace, SEMANTIC_CONCEPT_JUDGE_VERSION, type SandboxDriver, SandboxHarness, type SandboxHarnessResult, type SandboxJudgeKind, type SandboxJudgeResult, type SandboxJudgeSpec, type SandboxPool, type SandboxResult, type ScanOptions, type Scenario, type ScenarioCost, type ScenarioFile, ScenarioRegistry, type ScenarioResult, type ScoreKnowledgeReadinessOptions, type ScoredTarget, type SelfPlayOptions, type SelfPlayProposer, type SelfPlayScorer, type SelfPreferenceResult, type SemanticConceptJudgeInput, type SemanticConceptJudgeOptions, type SemanticConceptJudgeResult, type SeriesConvergenceOptions, type SeriesConvergenceResult, type Severity, type ShipOptions, type SignedManifest, type SignedManifestAlgo, type Slo, type SloCheckResult, type SloComparator, type SloReport, type SloSeverity, type SlopCategory, type SlotFactory, Span, type SteeringBundle, type SteeringDelta, type SteeringOptimizationResult, type SteeringOptimizationRow, type SteeringOptimizationSelector, type SteeringOptimizerBackend, type SteeringOptimizerConfig, type SteeringRolePrompt, type StepAttribution, type StepContext, type StepRubric, type StuckLoopFinding, type StuckLoopOptions, type StuckLoopReport, SubprocessSandboxDriver, type SubprocessSandboxDriverOptions, type SynthesisReason, type SynthesisTarget, type TestGradedRunOptions, type TestGradedRunResult, type TestGradedScenario, type TestOutputParser, type TestResult, type ThreeLayerProjectReport, type ThresholdContract, TokenCounter, type TokenSpec, ToolSpan, type ToolStats, type ToolUseMetrics, type ToolUseOptions, type ToolWasteFinding, type ToolWasteOptions, type ToolWasteReport, TraceEmitter, TraceEvent, TraceStore, type Trajectory, type TrajectoryStep, type TrialAttempt, TrialCache, 
TrialTelemetry, type Turn, type TurnMetrics, type TurnResult, UNIVERSAL_FINDERS, type UseCaseSignals, type UserQuestion, type ValidationContext, type ValidationIssue, type ValidationResult, VariantAggregate, type VerbosityBiasResult, type VerificationReport, type VerifyContext, type VerifyOptions, type VisualDiffOptions, type VisualDiffResult, type ViteDeployRunnerInput, type WorkflowTopology, type WorkspaceAssertion, type WorkspaceAssertionResult, type WorkspaceInspector, type WorkspaceSnapshot, type WranglerDeployRunnerInput, acquisitionPlansForKnowledgeGaps, adversarialJudge, aggregateRunScore, analyzeAntiSlop, analyzeSeries, attributeCounterfactuals, benjaminiHochberg, bisect, blockingKnowledgeEval, bonferroni, budgetBreachView, buildReviewerPrompt, buildTrajectory, byteLengthRange, calibrateJudge, calibrationCurve, callLlm, callLlmJson, canaryLeakView, canonicalize, causalAttribution, checkBehavioralCanary, checkCanaries, checkSlos, clamp01, classifyEuAiRisk, classifyFailure, codeExecutionJudge, cohensD, coherenceJudge, collectionPreserved, commitBisect, compareReferenceReplay, compareToBaseline, compilerJudge, composeParsers, composeValidators, computeToolUseMetrics, confidenceInterval, containsAll, correlateLayers, correlationStudy, createAntiSlopJudge, createCompositeMutator, createCustomJudge, createDefaultReviewer, createDomainExpertJudge, createIntentMatchJudge, createSandboxCodeMutator, createSandboxPool, createSemanticConceptJudge, crossTraceDiff, decideReferenceReplayPromotion, decideReferenceReplayRunPromotion, defaultJudges, defaultReferenceReplayMatcher, deployGateLayer, distillPlaybook, estimateCost, estimateTokens, euAiActReport, evaluateContract, evaluateHypothesis, evaluateOracles, executeScenario, expectAgent, exportRewardModel, exportTrainingData, extractAssetUrls, extractErrorCount, failureClusterView, fileContains, fileExists, findAutoMatchNoExpectation, findConstructorCwdDropped, findFallbackToPass, findLiteralTruePass, 
findSkipCountsAsPass, firstDivergenceView, flowLayer, formatBenchmarkReport, formatDriverReport, formatFindings, precision as goldenPrecision, gradeSemanticStatus, hashContent, hashJson, htmlContainsElement, inMemoryReferenceReplayStore, integrationAsi, integrationGateEvals, integrationInvokeFailedPayload, integrationManifestResolvedPayload, integrationManifestValidatedPayload, interRaterReliability, iqr, isPrmVerdict, jestTestParser, jsonHasKeys, jsonShape, jsonlReferenceReplayStore, judgeAgreementView, keyPreserved, knowledgeReadinessTracePayload, linterJudge, llmSpans, loadScorerFromGrader, localCommandRunner, lowercaseMutator, mannWhitneyU, matchGoldens, mergeLayerResults, mergeSteeringBundle, multiToolchainLayer, nistAiRmfReport, nonRefusalRubric, normalizeScores, notBlocked, outputLengthRubric, pairedTTest, paraphraseRobustness, paraphraseRobustnessScenarios, partialCredit, passOrthogonality, pixelDeltaRatio, politenessPrefixMutator, positionalBias, printDriverSummary, prmBestOfN, prmEnsembleBestOfN, probeLlm, promptBisect, proposeSynthesisTargets, pytestTestParser, redTeamDataset, redTeamReport, referenceReplayRunsToSteeringRows, referenceReplayScenarioToRunScore, regexMatch, regexMatches, regressionView, renderMarkdown, renderMarkdownReport, renderPlaybookMarkdown, renderSteeringText, replayScorerOverCorpus, replayTraceThroughJudge, requiredSampleSize, resetLockedAppendersForTesting, resumeBuilderSession, rowCount, rowWhere, runAssertions, runBehavioralCanaries, runCanaries, runCounterfactual, runE2EWorkflow, runExpectations, runHarnessExperiment, runIntentMatchJudge, runJudgeFleet, runKeywordCoverageJudge, runKeywordCoverageJudgeUrl, runLiveProof, runReferenceReplay, runSelfPlay, runSemanticConceptJudge, runTestGradedScenario, scanForMuffledGates, scoreAllProjects, scoreContinuity, scoreKnowledgeReadiness, scoreProject, scoreRedTeamOutput, scoreReferenceReplay, securityJudge, selectHarnessVariant, selfPreference, sentenceReorderMutator, signManifest, 
soc2Report, statusAdvanced, stripFencedJson, stuckLoopView, summarize, summarizeHarnessResults, testJudge, textInSnapshot, toLangfuseEnvelope, toNdjson, toPrometheusText, toolIntentAlignmentRubric, toolNamesForRun, toolNonRedundantRubric, toolSuccessRubric, toolWasteView, typoMutator, urlContains, userQuestionsForKnowledgeGaps, verbosityBias, verifyManifest, visualDiff, viteDeployRunner, vitestTestParser, weightedMean, weightedRecall, welchsTTest, whitespaceCollapseMutator, wilcoxonSignedRank, wranglerDeployRunner };