@tangle-network/agent-eval 0.17.2 → 0.17.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -21,7 +21,7 @@ console.log(ship.result.passed, ship.result.score)
21
21
  - You ship a content generator and need quality signal beyond "the LLM said it's good".
22
22
  - You want a release gate that fails on regressions you can name, not vibes.
23
23
 
24
- If that's you, start with [`docs/concepts.md`](./docs/concepts.md) — 5-minute mental model — then come back here.
24
+ If that's you, start with [`docs/concepts.md`](./docs/concepts.md) — 5-minute mental model — then use [`docs/feature-guide.md`](./docs/feature-guide.md) to choose the right primitive.
25
25
 
26
26
  ## Quickstart
27
27
 
@@ -65,6 +65,7 @@ The recipe for a code-generator eval is in [`SKILL.md` §Minimal working path](.
65
65
  ## Two ways to read this repo
66
66
 
67
67
  - **You're a human onboarding** — read [`docs/concepts.md`](./docs/concepts.md) for the mental model, then [`docs/wire-protocol.md`](./docs/wire-protocol.md) if you'll call from another language, or `SKILL.md` if you'll embed in TS.
68
+ - **You're deciding what to integrate** — read [`docs/feature-guide.md`](./docs/feature-guide.md) for a plain-language explanation, use cases, feature map, and guardrails.
68
69
  - **You're an LLM agent writing integration code** — read `SKILL.md`. Every directive there encodes a shipped bug; skipping one reintroduces the bug class.
69
70
 
70
71
  ## What's in the box
@@ -78,7 +79,8 @@ The recipe for a code-generator eval is in [`SKILL.md` §Minimal working path](.
78
79
  | `clients/python/` | First-party Python client (`tangle-agent-eval` on PyPI). Version-locked to npm. | clients/python/README.md |
79
80
  | `BenchmarkRunner`, `executeScenario`, `ConvergenceTracker` | Multi-turn scenario execution + cross-run tracking. | SKILL.md |
80
81
  | `runAgentControlLoop` | Policy-based runtime for agentic tasks: observe typed state, validate, decide, act, repeat with budgets, tracing, and stuck-loop guards. | [control-runtime.md](./docs/control-runtime.md) |
81
- | `FeedbackTrajectory`, `InMemoryFeedbackTrajectoryStore`, `FileSystemFeedbackTrajectoryStore` | Product-native learning loops: capture approvals, rejections, choices, revisions, metrics, and policy blocks as train/dev/test/holdout examples. | [feedback-trajectories.md](./docs/feedback-trajectories.md) |
82
+ | `FeedbackTrajectory`, `InMemoryFeedbackTrajectoryStore`, `FileSystemFeedbackTrajectoryStore` | Human/environment feedback loops: capture approvals, rejections, choices, revisions, metrics, and policy blocks as train/dev/test/holdout examples. | [feedback-trajectories.md](./docs/feedback-trajectories.md) |
83
+ | `evaluateActionPolicy` | Generic action preflight for approval, budget, expected-outcome, and kill-criteria checks. | [feature-guide.md](./docs/feature-guide.md) |
82
84
  | `ExperimentTracker`, `PromptOptimizer`, `bisector` | A/B prompts, optimize steering, bisect regressions. | SKILL.md |
83
85
  | `runPromptEvolution`, `createCompositeMutator`, `createSandboxPool`, `createSandboxCodeMutator`, `MutationTelemetry`, `LineageRecorder`, `CostLedger`, `JsonlTrialCache` | Prompt + code evolution loops with bounded sandbox pools, durable JSONL telemetry, plateau-detecting composite mutators, crash-resumable trial cache. | §Evolution loop |
84
86
  | `reflective-mutation` (`buildReflectionPrompt`, `parseReflectionResponse`, `DEFAULT_MUTATION_PRIMITIVES`) | Trace-conditioned LLM mutator that reasons over top/bottom trials instead of blind rewrites. | inline JSDoc |
@@ -170,9 +172,9 @@ The `MutationTelemetry`, `LineageRecorder`, and `CostLedger` pass into the `code
170
172
 
171
173
  For the full primitive surface and rationale, read each module's JSDoc — `prompt-evolution.ts`, `composite-mutator.ts`, `sandbox-pool.ts`, `code-mutator.ts`, `reflective-mutation.ts`, `evolution-telemetry.ts`.
172
174
 
173
- ## Product feedback loop
175
+ ## Feedback trajectory loop
174
176
 
175
- When normal product usage should generate training/eval signal, use feedback
177
+ When normal agent usage should generate training/eval signal, use feedback
176
178
  trajectories. They turn approvals, rejections, option choices, edits, metrics,
177
179
  and policy blocks into reusable examples.
178
180
 
@@ -185,22 +187,21 @@ import {
185
187
  } from '@tangle-network/agent-eval'
186
188
 
187
189
  const trajectory = createFeedbackTrajectory({
188
- projectId: 'gtm-agent',
189
- scenarioId: 'ad-positioning-choice',
190
- task: { intent: 'Choose a paid-social positioning angle.' },
190
+ projectId: 'research-agent',
191
+ scenarioId: 'brief-review',
192
+ task: { intent: 'Revise a research brief until it is specific and sourced.' },
191
193
  attempts: [{
192
194
  id: 'draft-1',
193
195
  stepIndex: 0,
194
- artifactType: 'decision',
195
- artifact: { option: 'enterprise procurement language' },
196
- options: ['enterprise procurement', 'technical founder pain'],
196
+ artifactType: 'research',
197
+ artifact: { summary: 'Initial brief with weak sourcing.' },
197
198
  createdAt: new Date().toISOString(),
198
199
  }],
199
200
  labels: [{
200
201
  source: 'user',
201
- kind: 'reject',
202
- value: 'enterprise procurement',
203
- reason: 'too enterprise; our buyer is a technical founder',
202
+ kind: 'revision_request',
203
+ value: 'needs stronger evidence',
204
+ reason: 'add primary sources and remove unsupported claims',
204
205
  severity: 'error',
205
206
  createdAt: new Date().toISOString(),
206
207
  }],
@@ -211,9 +212,9 @@ const scenarios = feedbackTrajectoriesToDatasetScenarios([trajectory])
211
212
  const optimizerRows = feedbackTrajectoriesToOptimizerRows([trajectory])
212
213
  ```
213
214
 
214
- This is the bridge between product UX and optimization: normal user feedback
215
- becomes immediate memory, replayable eval scenarios, and prompt/signature/code
216
- optimizer input. See [`docs/feedback-trajectories.md`](./docs/feedback-trajectories.md).
215
+ This is the bridge between feedback and optimization: review signals become
216
+ immediate memory, replayable eval scenarios, and prompt/signature/code optimizer
217
+ input. See [`docs/feedback-trajectories.md`](./docs/feedback-trajectories.md).
217
218
 
218
219
  ## v0.16 highlights — production-rigor primitives
219
220
 
package/dist/index.d.ts CHANGED
@@ -1437,6 +1437,17 @@ interface FeedbackOptimizerRow extends OptimizationExample {
1437
1437
  labelKinds: FeedbackLabelKind[];
1438
1438
  score?: number;
1439
1439
  }
1440
+ interface FeedbackReplayResult {
1441
+ trajectoryId: string;
1442
+ pass: boolean;
1443
+ score?: number;
1444
+ labels: FeedbackLabel[];
1445
+ outcome?: FeedbackOutcome;
1446
+ metadata?: Record<string, unknown>;
1447
+ }
1448
+ interface FeedbackReplayAdapter {
1449
+ replay(trajectory: FeedbackTrajectory): Promise<Omit<FeedbackReplayResult, 'trajectoryId'>> | Omit<FeedbackReplayResult, 'trajectoryId'>;
1450
+ }
1440
1451
  declare class InMemoryFeedbackTrajectoryStore implements FeedbackTrajectoryStore {
1441
1452
  private readonly trajectories;
1442
1453
  save(trajectory: FeedbackTrajectory): Promise<void>;
@@ -1479,6 +1490,8 @@ declare function feedbackTrajectoryToDatasetScenario(trajectory: FeedbackTraject
1479
1490
  declare function feedbackTrajectoriesToDatasetScenarios(trajectories: FeedbackTrajectory[]): DatasetScenario[];
1480
1491
  declare function feedbackTrajectoryToOptimizerRow(trajectory: FeedbackTrajectory): FeedbackOptimizerRow;
1481
1492
  declare function feedbackTrajectoriesToOptimizerRows(trajectories: FeedbackTrajectory[]): FeedbackOptimizerRow[];
1493
+ declare function replayFeedbackTrajectory(trajectory: FeedbackTrajectory, adapter: FeedbackReplayAdapter): Promise<FeedbackReplayResult>;
1494
+ declare function replayFeedbackTrajectories(trajectories: FeedbackTrajectory[], adapter: FeedbackReplayAdapter): Promise<FeedbackReplayResult[]>;
1482
1495
  declare function summarizePreferenceMemory(trajectories: FeedbackTrajectory[], options?: {
1483
1496
  maxEntries?: number;
1484
1497
  }): PreferenceMemoryEntry[];
@@ -1494,6 +1507,29 @@ declare function controlRunToFeedbackTrajectory<TState, TAction, TActionResult>(
1494
1507
  createdAt?: string;
1495
1508
  }): FeedbackTrajectory;
1496
1509
 
1510
+ interface ActionExecutionPolicy {
1511
+ allowedTypes?: string[];
1512
+ blockedTypes?: string[];
1513
+ alwaysRequireApprovalTypes?: string[];
1514
+ autoApproveTypes?: string[];
1515
+ requireApprovalForExternalSideEffects?: boolean;
1516
+ requireApprovalAboveCostUsd?: number;
1517
+ maxActionCostUsd?: number;
1518
+ remainingBudgetUsd?: number;
1519
+ expectedOutcomeRequired?: boolean;
1520
+ killCriteriaRequired?: boolean;
1521
+ }
1522
+ interface ActionPolicyDecision {
1523
+ allowed: boolean;
1524
+ blocked: boolean;
1525
+ requiresApproval: boolean;
1526
+ reasons: string[];
1527
+ label?: FeedbackLabel;
1528
+ }
1529
+ declare function evaluateActionPolicy(action: ProposedSideEffect, policy?: ActionExecutionPolicy, options?: {
1530
+ createdAt?: string;
1531
+ }): ActionPolicyDecision;
1532
+
1497
1533
  /**
1498
1534
  * Normalize scores so all dimensions follow "higher = better".
1499
1535
  * Inverted dimensions (hallucination, false_confidence, worst_failure)
@@ -1595,7 +1631,7 @@ declare class ConvergenceTracker {
1595
1631
  * Uses the Web Crypto API (works in Workers, Node 22+, browsers).
1596
1632
  */
1597
1633
  interface PromptHandle {
1598
- /** Stable human-readable id, e.g. 'legal.system' */
1634
+ /** Stable human-readable id, e.g. 'browser.system' */
1599
1635
  id: string;
1600
1636
  /** Caller-chosen version string, e.g. 'v3' or '2026-04-20' */
1601
1637
  version: string;
@@ -1687,7 +1723,7 @@ declare function analyzeAntiSlop(outputs: string[], config: Omit<Required<AntiSl
1687
1723
  * Artifact validators.
1688
1724
  *
1689
1725
  * Generic "score a produced artifact" primitive. Tax uses it for PDF form
1690
- * correctness, legal for contract clauses, film for script breakdowns, GTM
1726
+ * correctness, research for sourced briefs, browser for task assertions, content
1691
1727
  * for social posts. One interface, many validators; all plug into
1692
1728
  * `BenchmarkRunner` the same way.
1693
1729
  *
@@ -1975,7 +2011,7 @@ declare class FileSystemExperimentStore implements ExperimentStore {
1975
2011
  * `Run.status` field one-to-one.
1976
2012
  *
1977
2013
  * Why this lives next to `InMemoryExperimentStore`:
1978
- * - bad-app, legal-agent, gtm-agent, film-agent all run as Workers
2014
+ * - browser, coding, and computer-use agents can all run as Workers
1979
2015
  * - Workers cannot use `node:fs`, so `FileSystemExperimentStore` doesn't apply
1980
2016
  * - Hand-rolling D1 SQL in every consumer is exactly the duplication this
1981
2017
  * module exists to prevent
@@ -2008,7 +2044,7 @@ interface D1ExperimentStoreOptions {
2008
2044
  db: D1Like;
2009
2045
  /**
2010
2046
  * Optional table-name prefix so multiple ExperimentStores can share a DB
2011
- * without colliding (e.g. `tax_eval_experiments` vs `legal_eval_experiments`).
2047
+ * without colliding (e.g. `browser_eval_experiments` vs `coding_eval_experiments`).
2012
2048
  * Default: `agent_eval_`.
2013
2049
  */
2014
2050
  tablePrefix?: string;
@@ -2592,7 +2628,7 @@ type HostedRunCriticConfig = Pick<RunCriticOptions, 'weights'> & {
2592
2628
  /**
2593
2629
  * Dual-agent convergence bench.
2594
2630
  *
2595
- * Pattern lifted from tax-agent + legal-agent: two agents take turns until
2631
+ * Pattern lifted from dual-worker review loops: two agents take turns until
2596
2632
  * they converge on a consensus artifact. One proposes, the other critiques;
2597
2633
  * the proposer revises; repeat until a score threshold is hit or max rounds.
2598
2634
  *
@@ -3408,7 +3444,7 @@ declare function evaluateOracles(obs: OracleObservation, oracles: Oracle[]): Ora
3408
3444
  /**
3409
3445
  * Cost tracker — token + USD accounting per scenario and per run.
3410
3446
  *
3411
- * Lifted from tax/legal metrics.ts + tangle-router UsageEvent. Every
3447
+ * Adapted from generic usage-event accounting. Every
3412
3448
  * optimizer needs to know "is the quality gain worth the cost delta?",
3413
3449
  * and every dashboard needs dollars-per-completed-task. MODEL_PRICING
3414
3450
  * from metrics.ts stays authoritative for estimate math; this module
@@ -3619,7 +3655,7 @@ declare function analyzeSeries(values: number[], options?: SeriesConvergenceOpti
3619
3655
  * State continuity scoring — measures how well a resumed/handed-off agent
3620
3656
  * preserves prior work.
3621
3657
  *
3622
- * Lifted from tax-agent's run-resume-eval.ts. When session 2 continues
3658
+ * When session 2 continues
3623
3659
  * session 1's work, the key question is: did it preserve key artifacts,
3624
3660
  * or start over and lose context? Each `ContinuityCheck` inspects one
3625
3661
  * aspect (file preserved, key count grew, status advanced) and yields
@@ -8347,4 +8383,4 @@ interface ReflectionProposal {
8347
8383
  }
8348
8384
  declare function parseReflectionResponse(raw: string, maxProposals?: number): ReflectionProposal[];
8349
8385
 
8350
- export { type ActiveLearningOptions, type AdapterRun, AgentDriver, type AgentDriverConfig, type AlignmentOp, type AntiSlopConfig, type AntiSlopIssue, type AntiSlopReport, type Artifact$1 as Artifact, type ArtifactCheck, type Artifact as ArtifactCheckArtifact, type ArtifactResult, type ArtifactValidator, AxGepaSteeringOptimizer, type AxSteeringOptimizerConfig, BENCHMARK_SPLIT_SEED, type BaselineOptions, type BaselineReport, BehaviorAssertion, type BenchmarkAdapter, type BenchmarkDatasetItem, type BenchmarkEvaluation, type BenchmarkReport, BenchmarkRunner, type BenchmarkRunnerConfig, type BestOfNResult, type BisectOptions, type BisectResult, type BisectStep, type BootstrapOptions, type BootstrapResult, BudgetBreachError, type BudgetBreachFinding, type BudgetBreachReport, BudgetGuard, type BudgetLedgerEntry, type BudgetSpec, BuilderSession, type BuilderSessionInit, type CalibrationBin, type CalibrationOptions, type CalibrationReport, type CalibrationResult, CallExpectation, type CanaryAlert, type CanaryKind, type CanaryLeak, type CanaryOptions, type CanaryReport, type CanarySeverity, type CandidateScenario, type CandidateScore, type CausalAttributionReport, type ChatSummary, type CheckResult, type CodeMutationOutcome, type CodeMutationRunner, type CollectedArtifacts, type CommandRunner, type CompletionCriterion, type CompositePolicy, type ConceptComplexity, type ConceptFinding, type ConceptSpec, type ConceptWeightStrategy, type ContinuityCheck, type ContinuityCheckResult, type ContinuityReport, type ContinuitySnapshotPair, type ContractMetric, type ContractReport, type ControlActionFailureMode, type ControlActionOutcome, type ControlBudget, type ControlContext, type ControlDecision, type ControlEvalResult, type ControlRunResult, type ControlRuntimeConfig, type ControlRuntimeError, type ControlSeverity, type ControlStep, type ControlStopPolicies, ConvergenceTracker, type CorrelationReport, type CorrelationResult, type CorrelationStudyOptions, type 
CorrelationStudyResult, type CostEntry, CostLedger, type CostLedgerGeneration, type CostLedgerSnapshot, type CostSummary, CostTracker, type CounterfactualContext, type CounterfactualMutation, type CounterfactualResult, type CounterfactualRunner, type CreateCompositeMutatorOpts, type CreateDefaultReviewerOptions, type CreateSandboxCodeMutatorOpts, type CreateSandboxPoolOpts, type CrossTraceDiff, type CrossTraceDiffOptions, D1ExperimentStore, type D1ExperimentStoreOptions, type D1Like, type D1PreparedStatementLike, DEFAULT_AGENT_SLOS, DEFAULT_COMPLEXITY_WEIGHTS, DEFAULT_RULES as DEFAULT_FAILURE_RULES, DEFAULT_FINDERS, DEFAULT_HARNESS_OBJECTIVES, DEFAULT_MUTATION_PRIMITIVES, DEFAULT_MUTATORS, DEFAULT_REDACTION_RULES, DEFAULT_RED_TEAM_CORPUS, DEFAULT_RUN_SCORE_WEIGHTS, DEFAULT_SEVERITY_WEIGHTS, Dataset, type DatasetDifficulty, type DatasetManifest, type DatasetProvenance, type DatasetScenario, type DatasetSplit, type DeployFamily, type DeployGateLayerInput, type DeployRunResult, type DeployRunner, type DeploymentOutcome, type DirEntry, type Direction, type DivergenceOptions, type DivergenceReport, DockerSandboxDriver, type DriverResult, type DriverState, DualAgentBench, type DualAgentBenchConfig, type DualAgentReport, type DualAgentRound, type DualAgentScenario, type DualAgentScenarioResult, ERROR_COUNT_PATTERNS, type ErrorCountPattern, type EuRiskClass, type EvalMetricSpec, type EvalResult, type EventFilter, type EventKind, type EvolutionRound, type PromptVariant as EvolvableVariant, type ExecutorConfig, type Expectation, type Experiment, type ExperimentPlan, type ExperimentResult, type Run as ExperimentRun, type ExperimentStore, ExperimentTracker, type ExportedRewardModel, type ExtractOptions, type ExtractResult, FAILURE_CLASSES, type FactorContribution, type FactorialCell, type FailureClass, type FailureClassification, type FailureCluster, type FailureClusterReport, type FailureContext, type FailureMode, type FailureRule, type FeedbackArtifactType, type 
FeedbackAttempt, type FeedbackLabel, type FeedbackLabelKind, type FeedbackLabelSource, type FeedbackOptimizerRow, type FeedbackOutcome, type FeedbackPattern, type FeedbackSeverity, type FeedbackSplitPolicy, type FeedbackTask, type FeedbackTrajectory, type FeedbackTrajectoryFilter, type FeedbackTrajectoryStore, FileSystemExperimentStore, type FileSystemExperimentStoreOptions, FileSystemFeedbackTrajectoryStore, FileSystemOutcomeStore, type FileSystemOutcomeStoreOptions, FileSystemTraceStore, type FileSystemTraceStoreOptions, type Finding, type FlowAction, type FlowLayerEnv, type FlowLayerFactoryInput, type FlowRunner, type FlowRunnerStepResult, type FlowSpec, type FlowStep, type GainDistributionBin, type GainDistributionFigureSpec, type GainDistributionOptions, type GateDecision, type GateEvidence, type GenerationReport, type GenericSpan, type GoldenItem, type GoldenSeverity, type GoldenSpec, type GovernanceContext, type GovernanceFinding, type GovernanceReport, type GradedStep, type HarnessAdapter, type HarnessConfig, type HarnessExperimentConfig, type HarnessExperimentResult, type HarnessIntervention, type HarnessRunRequest, type HarnessRunResult, type HarnessScenario, type HarnessSelection, type HarnessVariant, type HarnessVariantReport, HeldOutGate, type HeldOutGateConfig, type HeldOutGateRejectionCode, HoldoutAuditor, HoldoutLockedError, type HostedJudgeConfig, type HostedJudgeDimension, type HostedJudgeRequest, type HostedJudgeResponse, type HostedRunCriticConfig, type HostedRunScoreRequest, type HostedRunScoreResponse, type HypothesisManifest, type HypothesisResult, INTENT_MATCH_JUDGE_VERSION, type ImageData, InMemoryExperimentStore, InMemoryFeedbackTrajectoryStore, InMemoryOutcomeStore, InMemoryTraceStore, InMemoryTrialCache, InMemoryWorkspaceInspector, type InferenceScorer, type InspectorContext, type IntentMatchInput, type IntentMatchOptions, type IntentMatchResult, type InteractionContribution, JsonlTrialCache, type JudgeAgreementReport, type JudgeConfig, 
type JudgeFleetOptions, type JudgeFn, type JudgeInput, type JudgePair, type JudgeReplayGateArgs, type JudgeReplayResult, type JudgeRubric, JudgeRunner, type JudgeScore, type JudgeSpan, type KeywordConceptSpec, type KeywordCoverageFinding, type KeywordCoverageOptions, type KeywordCoverageResult, type LangfuseEnvelope, type LangfuseGeneration, type LangfuseScore, type Layer, type LayerCorrelation, type LayerResult, type LayerStatus, type LineageKind, type LineageKindResolver, type LineageNode, LineageRecorder, LlmCallError, type LlmCallRequest, type LlmCallResult, LlmClient, type LlmClientOptions, type LlmJsonCall, type LlmMessage, type LlmReviewerConfig, type LlmSpan, type LlmUsage, LockedJsonlAppender, MODEL_PRICING, type MatchResult, type MatcherResult, type MeasurementPolicy, type MergeOptions, type Message, type MetricSamples, type MetricVerdict, MetricsCollector, type MuffledFinder, type MuffledFinding, MultiLayerVerifier, type MultiToolchainLayerConfig, type MutateAdapter, type MutationAttempt, type MutationChannel, MutationTelemetry, type Mutator, Mutex, NoopResearcher, OTEL_AGENT_EVAL_SCOPE, type Objective, type OptimizationConfig, type OptimizationExample, OptimizationLoop, type OptimizationLoopConfig, type OptimizationLoopResult, type OptimizationResult, type Oracle, type OracleObservation, type OracleReport, type OracleResult, type OrthogonalityInput, type OrthogonalityResult, type OtlpExport, type OtlpResourceSpans, type OtlpSpan, type OutcomeFilter, type OutcomePair, type OutcomeStore, type PairedBootstrapOptions, type PairedBootstrapResult, type PairwiseComparison, PairwiseSteeringOptimizer, type ParetoFigureSpec, type ParetoPoint, type ParetoResult, type PersonaConfig, type Playbook, type PlaybookEntry, type PoolSlot, type PositionalBiasResult, type PreferenceMemoryEntry, type PrmGradedTrace, PrmGrader, type PrmTrainingSample, ProductClient, type ProductClientConfig, type ProjectKind, ProjectRegistry, type ProjectSummary, type ProjectTimelineEntry, 
type PromptEvolutionConfig, type PromptEvolutionEvent, type PromptEvolutionResult, type PromptHandle, PromptOptimizer, PromptRegistry, type TrialResult as PromptTrialResult, type PromptVariant$1 as PromptVariant, type ProposeFn, type ProposeInput, type ProposeOutput, type ProposeReviewConfig, type ProposeReviewControlAction, type ProposeReviewControlConfig, type ProposeReviewControlResult, type ProposeReviewControlState, type ProposeReviewReport, type ProposeReviewShot, type ProposedSideEffect, REDACTION_VERSION, type RedTeamCase, type RedTeamCategory, type RedTeamFinding, type RedTeamPayload, type RedTeamReport, type RedactionReport, type RedactionRule, type ReferenceMatchResult, type ReferenceReplayAdapter, type ReferenceReplayAdapterFn, type ReferenceReplayAdapterLike, type ReferenceReplayAggregate, type ReferenceReplayCandidate, type ReferenceReplayCase, type ReferenceReplayCaseRun, type ReferenceReplayExecutionScenario, type ReferenceReplayItem, type ReferenceReplayMatch, type ReferenceReplayMatchStrategy, type ReferenceReplayMatcher, type ReferenceReplayPromotionDecision, type ReferenceReplayPromotionPolicy, type ReferenceReplayRun, type ReferenceReplayRunContext, type ReferenceReplayRunOptions, type ReferenceReplayRunStore, type ReferenceReplayScenario, type ReferenceReplayScenarioScore, type ReferenceReplayScore, type ReferenceReplayScoreOptions, type ReferenceReplaySplit, type ReferenceReplaySplitComparison, type ReferenceReplaySteeringRowsOptions, type ReflectionContext, type ReflectionProposal, type RegressionOptions, type RegressionSpec, type Researcher, type RetrievalSpan, type Review, type ReviewFn, type ReviewInput, type ReviewMemoryEntry, type ReviewMemoryStore, type ReviewerMemoryEntry, type ReviewerOutput, type ReviewerPromptInput, type ReviewerSoftFailDefaults, type ReviewerVerificationSummary, type RobustnessResult, type RouteMap, type RubricDimension, type Run$1 as Run, type RunAppScenarioOptions, type RunCommandInput, type RunCommandResult, 
type RunConfig, RunCritic, type RunCriticOptions, type RunDiff, type RunFilter, type RunJudgeMetadata, type RunLayer, type RunOutcome, type RunRecord, RunRecordValidationError, type RunScore, type RunScoreWeights, type RunSplitTag, type RunStatus, type RunTokenUsage, type RunTrace, SEMANTIC_CONCEPT_JUDGE_VERSION, type SandboxDriver, SandboxHarness, type SandboxHarnessResult, type SandboxJudgeKind, type SandboxJudgeResult, type SandboxJudgeSpec, type SandboxPool, type SandboxResult, type SandboxSpan, type ScanOptions, type Scenario, type ScenarioAggregate, type ScenarioCost, type ScenarioFile, ScenarioRegistry, type ScenarioResult, type ScoreAdapter, type ScoredTarget, type SelfPlayOptions, type SelfPlayProposer, type SelfPlayScorer, type SelfPreferenceResult, type SemanticConceptJudgeInput, type SemanticConceptJudgeOptions, type SemanticConceptJudgeResult, type SeriesConvergenceOptions, type SeriesConvergenceResult, type Severity, type ShipOptions, type SignedManifest, type SliceOptions, type Slo, type SloCheckResult, type SloComparator, type SloReport, type SloSeverity, type SlopCategory, type SlotFactory, type Span, type SpanBase, type SpanFilter, type SpanHandle, type SpanKind, type SpanStatus, type SteeringBundle, type SteeringChange, type SteeringDelta, type SteeringEvaluation, type SteeringOptimizationResult, type SteeringOptimizationRow, type SteeringOptimizationSelector, type SteeringOptimizerBackend, type SteeringOptimizerConfig, type SteeringRolePrompt, type SteeringVariantReport, type StepAttribution, type StepContext, type StepRubric, type StopDecision, type StuckLoopFinding, type StuckLoopOptions, type StuckLoopReport, SubprocessSandboxDriver, type SubprocessSandboxDriverOptions, type SummaryTable, type SummaryTableOptions, type SummaryTableRow, type SynthesisReason, type SynthesisTarget, TRACE_SCHEMA_VERSION, type TestGradedRunOptions, type TestGradedRunResult, type TestGradedScenario, type TestOutputParser, type TestResult, type 
ThreeLayerProjectReport, type ThresholdContract, TokenCounter, type TokenSpec, type ToolSpan, type ToolStats, type ToolUseMetrics, type ToolUseOptions, type ToolWasteFinding, type ToolWasteOptions, type ToolWasteReport, TraceEmitter, type TraceEmitterOptions, type TraceEvent, type TraceStore, type Trajectory, type TrajectoryStep, type TrialAttempt, type TrialCache, TrialTelemetry, type TrialTrace, type Turn, type TurnMetrics, type TurnResult, UNIVERSAL_FINDERS, type UseCaseSignals, type ValidationContext, type ValidationIssue, type ValidationResult, type VariantAggregate, type VariantScore, type VerbosityBiasResult, type Verdict, type Verification, type VerificationReport, type VerifyContext, type VerifyFn, type VerifyOptions, type VisualDiffOptions, type VisualDiffResult, type ViteDeployRunnerInput, type WorkflowTopology, type WorkspaceAssertion, type WorkspaceAssertionResult, type WorkspaceInspector, type WorkspaceSnapshot, type WranglerDeployRunnerInput, adversarialJudge, aggregateLlm, aggregateRunScore, allCriticalPassed, analyzeAntiSlop, analyzeSeries, argHash, assignFeedbackSplit, attributeCounterfactuals, deterministicSplit as benchmarkDeterministicSplit, index as benchmarks, benjaminiHochberg, bhAdjust, bisect, bonferroni, bootstrapCi, budgetBreachView, buildReflectionPrompt, buildReviewerPrompt, buildTrajectory, byteLengthRange, calibrateJudge, calibrationCurve, callLlm, callLlmJson, canaryLeakView, causalAttribution, checkCanaries, checkSlos, clamp01, classifyEuAiRisk, classifyFailure, codeExecutionJudge, cohensD, coherenceJudge, collectionPreserved, commitBisect, compareReferenceReplay, compareToBaseline, compilerJudge, composeParsers, composeValidators, computeToolUseMetrics, confidenceInterval, containsAll, controlFailureClassFromVerification, controlRunToFeedbackTrajectory, correlateLayers, correlationStudy, createAntiSlopJudge, createCompositeMutator, createCustomJudge, createDefaultReviewer, createDomainExpertJudge, createFeedbackTrajectory, 
createIntentMatchJudge, createLlmReviewer, createSandboxCodeMutator, createSandboxPool, createSemanticConceptJudge, crossTraceDiff, crowdingDistance, decideReferenceReplayPromotion, decideReferenceReplayRunPromotion, defaultJudges, defaultReferenceReplayMatcher, deployGateLayer, distillPlaybook, dominates, estimateCost, estimateTokens, euAiActReport, evaluateContract, evaluateHypothesis, evaluateOracles, executeScenario, expectAgent, exportRewardModel, exportRunAsOtlp, exportTrainingData, extractAssetUrls, extractErrorCount, failureClusterView, feedbackTrajectoriesToDatasetScenarios, feedbackTrajectoriesToOptimizerRows, feedbackTrajectoryToDatasetScenario, feedbackTrajectoryToOptimizerRow, fileContains, fileExists, findAutoMatchNoExpectation, findConstructorCwdDropped, findFallbackToPass, findLiteralTruePass, findSkipCountsAsPass, firstDivergenceView, flowLayer, formatBenchmarkReport, formatDriverReport, formatFindings, gainHistogram, precision as goldenPrecision, gradeSemanticStatus, groupBy, hashContent, hashScenarios, htmlContainsElement, inMemoryReferenceReplayStore, inMemoryReviewStore, interRaterReliability, iqr, isJudgeSpan, isLlmSpan, isPrmVerdict, isRetrievalSpan, isRunRecord, isSandboxSpan, isToolSpan, jestTestParser, jsonHasKeys, jsonShape, jsonlReferenceReplayStore, jsonlReviewStore, judgeAgreementView, judgeReplayGate, judgeSpans, keyPreserved, linterJudge, llmSpanFromProvider, llmSpans, loadScorerFromGrader, localCommandRunner, lowercaseMutator, mannWhitneyU, matchGoldens, mergeLayerResults, mergeSteeringBundle, multiToolchainLayer, nistAiRmfReport, nonRefusalRubric, normalizeScores, notBlocked, objectiveEval, outputLengthRubric, pairedBootstrap, pairedTTest, pairedWilcoxon, paraphraseRobustness, paretoChart, paretoFrontier, paretoFrontierWithCrowding, parseFeedbackTrajectoriesJsonl, parseReflectionResponse, parseRunRecordSafe, partialCredit, passOrthogonality, pixelDeltaRatio, politenessPrefixMutator, positionalBias, printDriverSummary, prmBestOfN, 
prmEnsembleBestOfN, probeLlm, promptBisect, proposeSynthesisTargets, pytestTestParser, redTeamDataset, redTeamReport, redactString, redactValue, referenceReplayRunsToSteeringRows, referenceReplayScenarioToRunScore, regexMatch, regexMatches, regressionView, renderMarkdown, renderMarkdownReport, renderPlaybookMarkdown, renderPreferenceMemoryMarkdown, renderSteeringText, replayScorerOverCorpus, replayTraceThroughJudge, requiredSampleSize, resetLockedAppendersForTesting, resumeBuilderSession, roundTripRunRecord, rowCount, rowWhere, runAgentControlLoop, runAssertions, runCanaries, runCounterfactual, runE2EWorkflow, runExpectations, runFailureClass, runHarnessExperiment, runIntentMatchJudge, runJudgeFleet, runKeywordCoverageJudge, runKeywordCoverageJudgeUrl, runPromptEvolution, runProposeReview, runProposeReviewAsControlLoop, runReferenceReplay, runSelfPlay, runSemanticConceptJudge, runTestGradedScenario, runsForScenario, scalarScore, scanForMuffledGates, scoreAllProjects, scoreContinuity, scoreProject, scoreRedTeamOutput, scoreReferenceReplay, securityJudge, selectHarnessVariant, selfPreference, sentenceReorderMutator, serializeFeedbackTrajectoriesJsonl, signManifest, soc2Report, statusAdvanced, stopOnNoProgress, stopOnRepeatedAction, stripFencedJson, stuckLoopView, subjectiveEval, summarize, summarizeHarnessResults, summarizePreferenceMemory, summaryTable, testJudge, textInSnapshot, toLangfuseEnvelope, toNdjson, toPrometheusText, toolIntentAlignmentRubric, toolNamesForRun, toolNonRedundantRubric, toolSpans, toolSuccessRubric, toolWasteView, typoMutator, urlContains, validateRunRecord, verbosityBias, verifyManifest, visualDiff, viteDeployRunner, vitestTestParser, weightedMean, weightedRecall, welchsTTest, whitespaceCollapseMutator, wilcoxonSignedRank, withAssignedFeedbackSplit, wranglerDeployRunner };
8386
+ export { type ActionExecutionPolicy, type ActionPolicyDecision, type ActiveLearningOptions, type AdapterRun, AgentDriver, type AgentDriverConfig, type AlignmentOp, type AntiSlopConfig, type AntiSlopIssue, type AntiSlopReport, type Artifact$1 as Artifact, type ArtifactCheck, type Artifact as ArtifactCheckArtifact, type ArtifactResult, type ArtifactValidator, AxGepaSteeringOptimizer, type AxSteeringOptimizerConfig, BENCHMARK_SPLIT_SEED, type BaselineOptions, type BaselineReport, BehaviorAssertion, type BenchmarkAdapter, type BenchmarkDatasetItem, type BenchmarkEvaluation, type BenchmarkReport, BenchmarkRunner, type BenchmarkRunnerConfig, type BestOfNResult, type BisectOptions, type BisectResult, type BisectStep, type BootstrapOptions, type BootstrapResult, BudgetBreachError, type BudgetBreachFinding, type BudgetBreachReport, BudgetGuard, type BudgetLedgerEntry, type BudgetSpec, BuilderSession, type BuilderSessionInit, type CalibrationBin, type CalibrationOptions, type CalibrationReport, type CalibrationResult, CallExpectation, type CanaryAlert, type CanaryKind, type CanaryLeak, type CanaryOptions, type CanaryReport, type CanarySeverity, type CandidateScenario, type CandidateScore, type CausalAttributionReport, type ChatSummary, type CheckResult, type CodeMutationOutcome, type CodeMutationRunner, type CollectedArtifacts, type CommandRunner, type CompletionCriterion, type CompositePolicy, type ConceptComplexity, type ConceptFinding, type ConceptSpec, type ConceptWeightStrategy, type ContinuityCheck, type ContinuityCheckResult, type ContinuityReport, type ContinuitySnapshotPair, type ContractMetric, type ContractReport, type ControlActionFailureMode, type ControlActionOutcome, type ControlBudget, type ControlContext, type ControlDecision, type ControlEvalResult, type ControlRunResult, type ControlRuntimeConfig, type ControlRuntimeError, type ControlSeverity, type ControlStep, type ControlStopPolicies, ConvergenceTracker, type CorrelationReport, type CorrelationResult, 
type CorrelationStudyOptions, type CorrelationStudyResult, type CostEntry, CostLedger, type CostLedgerGeneration, type CostLedgerSnapshot, type CostSummary, CostTracker, type CounterfactualContext, type CounterfactualMutation, type CounterfactualResult, type CounterfactualRunner, type CreateCompositeMutatorOpts, type CreateDefaultReviewerOptions, type CreateSandboxCodeMutatorOpts, type CreateSandboxPoolOpts, type CrossTraceDiff, type CrossTraceDiffOptions, D1ExperimentStore, type D1ExperimentStoreOptions, type D1Like, type D1PreparedStatementLike, DEFAULT_AGENT_SLOS, DEFAULT_COMPLEXITY_WEIGHTS, DEFAULT_RULES as DEFAULT_FAILURE_RULES, DEFAULT_FINDERS, DEFAULT_HARNESS_OBJECTIVES, DEFAULT_MUTATION_PRIMITIVES, DEFAULT_MUTATORS, DEFAULT_REDACTION_RULES, DEFAULT_RED_TEAM_CORPUS, DEFAULT_RUN_SCORE_WEIGHTS, DEFAULT_SEVERITY_WEIGHTS, Dataset, type DatasetDifficulty, type DatasetManifest, type DatasetProvenance, type DatasetScenario, type DatasetSplit, type DeployFamily, type DeployGateLayerInput, type DeployRunResult, type DeployRunner, type DeploymentOutcome, type DirEntry, type Direction, type DivergenceOptions, type DivergenceReport, DockerSandboxDriver, type DriverResult, type DriverState, DualAgentBench, type DualAgentBenchConfig, type DualAgentReport, type DualAgentRound, type DualAgentScenario, type DualAgentScenarioResult, ERROR_COUNT_PATTERNS, type ErrorCountPattern, type EuRiskClass, type EvalMetricSpec, type EvalResult, type EventFilter, type EventKind, type EvolutionRound, type PromptVariant as EvolvableVariant, type ExecutorConfig, type Expectation, type Experiment, type ExperimentPlan, type ExperimentResult, type Run as ExperimentRun, type ExperimentStore, ExperimentTracker, type ExportedRewardModel, type ExtractOptions, type ExtractResult, FAILURE_CLASSES, type FactorContribution, type FactorialCell, type FailureClass, type FailureClassification, type FailureCluster, type FailureClusterReport, type FailureContext, type FailureMode, type FailureRule, type 
FeedbackArtifactType, type FeedbackAttempt, type FeedbackLabel, type FeedbackLabelKind, type FeedbackLabelSource, type FeedbackOptimizerRow, type FeedbackOutcome, type FeedbackPattern, type FeedbackReplayAdapter, type FeedbackReplayResult, type FeedbackSeverity, type FeedbackSplitPolicy, type FeedbackTask, type FeedbackTrajectory, type FeedbackTrajectoryFilter, type FeedbackTrajectoryStore, FileSystemExperimentStore, type FileSystemExperimentStoreOptions, FileSystemFeedbackTrajectoryStore, FileSystemOutcomeStore, type FileSystemOutcomeStoreOptions, FileSystemTraceStore, type FileSystemTraceStoreOptions, type Finding, type FlowAction, type FlowLayerEnv, type FlowLayerFactoryInput, type FlowRunner, type FlowRunnerStepResult, type FlowSpec, type FlowStep, type GainDistributionBin, type GainDistributionFigureSpec, type GainDistributionOptions, type GateDecision, type GateEvidence, type GenerationReport, type GenericSpan, type GoldenItem, type GoldenSeverity, type GoldenSpec, type GovernanceContext, type GovernanceFinding, type GovernanceReport, type GradedStep, type HarnessAdapter, type HarnessConfig, type HarnessExperimentConfig, type HarnessExperimentResult, type HarnessIntervention, type HarnessRunRequest, type HarnessRunResult, type HarnessScenario, type HarnessSelection, type HarnessVariant, type HarnessVariantReport, HeldOutGate, type HeldOutGateConfig, type HeldOutGateRejectionCode, HoldoutAuditor, HoldoutLockedError, type HostedJudgeConfig, type HostedJudgeDimension, type HostedJudgeRequest, type HostedJudgeResponse, type HostedRunCriticConfig, type HostedRunScoreRequest, type HostedRunScoreResponse, type HypothesisManifest, type HypothesisResult, INTENT_MATCH_JUDGE_VERSION, type ImageData, InMemoryExperimentStore, InMemoryFeedbackTrajectoryStore, InMemoryOutcomeStore, InMemoryTraceStore, InMemoryTrialCache, InMemoryWorkspaceInspector, type InferenceScorer, type InspectorContext, type IntentMatchInput, type IntentMatchOptions, type IntentMatchResult, type 
InteractionContribution, JsonlTrialCache, type JudgeAgreementReport, type JudgeConfig, type JudgeFleetOptions, type JudgeFn, type JudgeInput, type JudgePair, type JudgeReplayGateArgs, type JudgeReplayResult, type JudgeRubric, JudgeRunner, type JudgeScore, type JudgeSpan, type KeywordConceptSpec, type KeywordCoverageFinding, type KeywordCoverageOptions, type KeywordCoverageResult, type LangfuseEnvelope, type LangfuseGeneration, type LangfuseScore, type Layer, type LayerCorrelation, type LayerResult, type LayerStatus, type LineageKind, type LineageKindResolver, type LineageNode, LineageRecorder, LlmCallError, type LlmCallRequest, type LlmCallResult, LlmClient, type LlmClientOptions, type LlmJsonCall, type LlmMessage, type LlmReviewerConfig, type LlmSpan, type LlmUsage, LockedJsonlAppender, MODEL_PRICING, type MatchResult, type MatcherResult, type MeasurementPolicy, type MergeOptions, type Message, type MetricSamples, type MetricVerdict, MetricsCollector, type MuffledFinder, type MuffledFinding, MultiLayerVerifier, type MultiToolchainLayerConfig, type MutateAdapter, type MutationAttempt, type MutationChannel, MutationTelemetry, type Mutator, Mutex, NoopResearcher, OTEL_AGENT_EVAL_SCOPE, type Objective, type OptimizationConfig, type OptimizationExample, OptimizationLoop, type OptimizationLoopConfig, type OptimizationLoopResult, type OptimizationResult, type Oracle, type OracleObservation, type OracleReport, type OracleResult, type OrthogonalityInput, type OrthogonalityResult, type OtlpExport, type OtlpResourceSpans, type OtlpSpan, type OutcomeFilter, type OutcomePair, type OutcomeStore, type PairedBootstrapOptions, type PairedBootstrapResult, type PairwiseComparison, PairwiseSteeringOptimizer, type ParetoFigureSpec, type ParetoPoint, type ParetoResult, type PersonaConfig, type Playbook, type PlaybookEntry, type PoolSlot, type PositionalBiasResult, type PreferenceMemoryEntry, type PrmGradedTrace, PrmGrader, type PrmTrainingSample, ProductClient, type 
ProductClientConfig, type ProjectKind, ProjectRegistry, type ProjectSummary, type ProjectTimelineEntry, type PromptEvolutionConfig, type PromptEvolutionEvent, type PromptEvolutionResult, type PromptHandle, PromptOptimizer, PromptRegistry, type TrialResult as PromptTrialResult, type PromptVariant$1 as PromptVariant, type ProposeFn, type ProposeInput, type ProposeOutput, type ProposeReviewConfig, type ProposeReviewControlAction, type ProposeReviewControlConfig, type ProposeReviewControlResult, type ProposeReviewControlState, type ProposeReviewReport, type ProposeReviewShot, type ProposedSideEffect, REDACTION_VERSION, type RedTeamCase, type RedTeamCategory, type RedTeamFinding, type RedTeamPayload, type RedTeamReport, type RedactionReport, type RedactionRule, type ReferenceMatchResult, type ReferenceReplayAdapter, type ReferenceReplayAdapterFn, type ReferenceReplayAdapterLike, type ReferenceReplayAggregate, type ReferenceReplayCandidate, type ReferenceReplayCase, type ReferenceReplayCaseRun, type ReferenceReplayExecutionScenario, type ReferenceReplayItem, type ReferenceReplayMatch, type ReferenceReplayMatchStrategy, type ReferenceReplayMatcher, type ReferenceReplayPromotionDecision, type ReferenceReplayPromotionPolicy, type ReferenceReplayRun, type ReferenceReplayRunContext, type ReferenceReplayRunOptions, type ReferenceReplayRunStore, type ReferenceReplayScenario, type ReferenceReplayScenarioScore, type ReferenceReplayScore, type ReferenceReplayScoreOptions, type ReferenceReplaySplit, type ReferenceReplaySplitComparison, type ReferenceReplaySteeringRowsOptions, type ReflectionContext, type ReflectionProposal, type RegressionOptions, type RegressionSpec, type Researcher, type RetrievalSpan, type Review, type ReviewFn, type ReviewInput, type ReviewMemoryEntry, type ReviewMemoryStore, type ReviewerMemoryEntry, type ReviewerOutput, type ReviewerPromptInput, type ReviewerSoftFailDefaults, type ReviewerVerificationSummary, type RobustnessResult, type RouteMap, type 
RubricDimension, type Run$1 as Run, type RunAppScenarioOptions, type RunCommandInput, type RunCommandResult, type RunConfig, RunCritic, type RunCriticOptions, type RunDiff, type RunFilter, type RunJudgeMetadata, type RunLayer, type RunOutcome, type RunRecord, RunRecordValidationError, type RunScore, type RunScoreWeights, type RunSplitTag, type RunStatus, type RunTokenUsage, type RunTrace, SEMANTIC_CONCEPT_JUDGE_VERSION, type SandboxDriver, SandboxHarness, type SandboxHarnessResult, type SandboxJudgeKind, type SandboxJudgeResult, type SandboxJudgeSpec, type SandboxPool, type SandboxResult, type SandboxSpan, type ScanOptions, type Scenario, type ScenarioAggregate, type ScenarioCost, type ScenarioFile, ScenarioRegistry, type ScenarioResult, type ScoreAdapter, type ScoredTarget, type SelfPlayOptions, type SelfPlayProposer, type SelfPlayScorer, type SelfPreferenceResult, type SemanticConceptJudgeInput, type SemanticConceptJudgeOptions, type SemanticConceptJudgeResult, type SeriesConvergenceOptions, type SeriesConvergenceResult, type Severity, type ShipOptions, type SignedManifest, type SliceOptions, type Slo, type SloCheckResult, type SloComparator, type SloReport, type SloSeverity, type SlopCategory, type SlotFactory, type Span, type SpanBase, type SpanFilter, type SpanHandle, type SpanKind, type SpanStatus, type SteeringBundle, type SteeringChange, type SteeringDelta, type SteeringEvaluation, type SteeringOptimizationResult, type SteeringOptimizationRow, type SteeringOptimizationSelector, type SteeringOptimizerBackend, type SteeringOptimizerConfig, type SteeringRolePrompt, type SteeringVariantReport, type StepAttribution, type StepContext, type StepRubric, type StopDecision, type StuckLoopFinding, type StuckLoopOptions, type StuckLoopReport, SubprocessSandboxDriver, type SubprocessSandboxDriverOptions, type SummaryTable, type SummaryTableOptions, type SummaryTableRow, type SynthesisReason, type SynthesisTarget, TRACE_SCHEMA_VERSION, type TestGradedRunOptions, type 
TestGradedRunResult, type TestGradedScenario, type TestOutputParser, type TestResult, type ThreeLayerProjectReport, type ThresholdContract, TokenCounter, type TokenSpec, type ToolSpan, type ToolStats, type ToolUseMetrics, type ToolUseOptions, type ToolWasteFinding, type ToolWasteOptions, type ToolWasteReport, TraceEmitter, type TraceEmitterOptions, type TraceEvent, type TraceStore, type Trajectory, type TrajectoryStep, type TrialAttempt, type TrialCache, TrialTelemetry, type TrialTrace, type Turn, type TurnMetrics, type TurnResult, UNIVERSAL_FINDERS, type UseCaseSignals, type ValidationContext, type ValidationIssue, type ValidationResult, type VariantAggregate, type VariantScore, type VerbosityBiasResult, type Verdict, type Verification, type VerificationReport, type VerifyContext, type VerifyFn, type VerifyOptions, type VisualDiffOptions, type VisualDiffResult, type ViteDeployRunnerInput, type WorkflowTopology, type WorkspaceAssertion, type WorkspaceAssertionResult, type WorkspaceInspector, type WorkspaceSnapshot, type WranglerDeployRunnerInput, adversarialJudge, aggregateLlm, aggregateRunScore, allCriticalPassed, analyzeAntiSlop, analyzeSeries, argHash, assignFeedbackSplit, attributeCounterfactuals, deterministicSplit as benchmarkDeterministicSplit, index as benchmarks, benjaminiHochberg, bhAdjust, bisect, bonferroni, bootstrapCi, budgetBreachView, buildReflectionPrompt, buildReviewerPrompt, buildTrajectory, byteLengthRange, calibrateJudge, calibrationCurve, callLlm, callLlmJson, canaryLeakView, causalAttribution, checkCanaries, checkSlos, clamp01, classifyEuAiRisk, classifyFailure, codeExecutionJudge, cohensD, coherenceJudge, collectionPreserved, commitBisect, compareReferenceReplay, compareToBaseline, compilerJudge, composeParsers, composeValidators, computeToolUseMetrics, confidenceInterval, containsAll, controlFailureClassFromVerification, controlRunToFeedbackTrajectory, correlateLayers, correlationStudy, createAntiSlopJudge, createCompositeMutator, 
createCustomJudge, createDefaultReviewer, createDomainExpertJudge, createFeedbackTrajectory, createIntentMatchJudge, createLlmReviewer, createSandboxCodeMutator, createSandboxPool, createSemanticConceptJudge, crossTraceDiff, crowdingDistance, decideReferenceReplayPromotion, decideReferenceReplayRunPromotion, defaultJudges, defaultReferenceReplayMatcher, deployGateLayer, distillPlaybook, dominates, estimateCost, estimateTokens, euAiActReport, evaluateActionPolicy, evaluateContract, evaluateHypothesis, evaluateOracles, executeScenario, expectAgent, exportRewardModel, exportRunAsOtlp, exportTrainingData, extractAssetUrls, extractErrorCount, failureClusterView, feedbackTrajectoriesToDatasetScenarios, feedbackTrajectoriesToOptimizerRows, feedbackTrajectoryToDatasetScenario, feedbackTrajectoryToOptimizerRow, fileContains, fileExists, findAutoMatchNoExpectation, findConstructorCwdDropped, findFallbackToPass, findLiteralTruePass, findSkipCountsAsPass, firstDivergenceView, flowLayer, formatBenchmarkReport, formatDriverReport, formatFindings, gainHistogram, precision as goldenPrecision, gradeSemanticStatus, groupBy, hashContent, hashScenarios, htmlContainsElement, inMemoryReferenceReplayStore, inMemoryReviewStore, interRaterReliability, iqr, isJudgeSpan, isLlmSpan, isPrmVerdict, isRetrievalSpan, isRunRecord, isSandboxSpan, isToolSpan, jestTestParser, jsonHasKeys, jsonShape, jsonlReferenceReplayStore, jsonlReviewStore, judgeAgreementView, judgeReplayGate, judgeSpans, keyPreserved, linterJudge, llmSpanFromProvider, llmSpans, loadScorerFromGrader, localCommandRunner, lowercaseMutator, mannWhitneyU, matchGoldens, mergeLayerResults, mergeSteeringBundle, multiToolchainLayer, nistAiRmfReport, nonRefusalRubric, normalizeScores, notBlocked, objectiveEval, outputLengthRubric, pairedBootstrap, pairedTTest, pairedWilcoxon, paraphraseRobustness, paretoChart, paretoFrontier, paretoFrontierWithCrowding, parseFeedbackTrajectoriesJsonl, parseReflectionResponse, parseRunRecordSafe, 
partialCredit, passOrthogonality, pixelDeltaRatio, politenessPrefixMutator, positionalBias, printDriverSummary, prmBestOfN, prmEnsembleBestOfN, probeLlm, promptBisect, proposeSynthesisTargets, pytestTestParser, redTeamDataset, redTeamReport, redactString, redactValue, referenceReplayRunsToSteeringRows, referenceReplayScenarioToRunScore, regexMatch, regexMatches, regressionView, renderMarkdown, renderMarkdownReport, renderPlaybookMarkdown, renderPreferenceMemoryMarkdown, renderSteeringText, replayFeedbackTrajectories, replayFeedbackTrajectory, replayScorerOverCorpus, replayTraceThroughJudge, requiredSampleSize, resetLockedAppendersForTesting, resumeBuilderSession, roundTripRunRecord, rowCount, rowWhere, runAgentControlLoop, runAssertions, runCanaries, runCounterfactual, runE2EWorkflow, runExpectations, runFailureClass, runHarnessExperiment, runIntentMatchJudge, runJudgeFleet, runKeywordCoverageJudge, runKeywordCoverageJudgeUrl, runPromptEvolution, runProposeReview, runProposeReviewAsControlLoop, runReferenceReplay, runSelfPlay, runSemanticConceptJudge, runTestGradedScenario, runsForScenario, scalarScore, scanForMuffledGates, scoreAllProjects, scoreContinuity, scoreProject, scoreRedTeamOutput, scoreReferenceReplay, securityJudge, selectHarnessVariant, selfPreference, sentenceReorderMutator, serializeFeedbackTrajectoriesJsonl, signManifest, soc2Report, statusAdvanced, stopOnNoProgress, stopOnRepeatedAction, stripFencedJson, stuckLoopView, subjectiveEval, summarize, summarizeHarnessResults, summarizePreferenceMemory, summaryTable, testJudge, textInSnapshot, toLangfuseEnvelope, toNdjson, toPrometheusText, toolIntentAlignmentRubric, toolNamesForRun, toolNonRedundantRubric, toolSpans, toolSuccessRubric, toolWasteView, typoMutator, urlContains, validateRunRecord, verbosityBias, verifyManifest, visualDiff, viteDeployRunner, vitestTestParser, weightedMean, weightedRecall, welchsTTest, whitespaceCollapseMutator, wilcoxonSignedRank, withAssignedFeedbackSplit, 
wranglerDeployRunner };
package/dist/index.js CHANGED
@@ -2252,8 +2252,6 @@ async function finish(emitter, result) {
2252
2252
  }
2253
2253
 
2254
2254
  // src/feedback-trajectory.ts
2255
- import { appendFile, mkdir, readFile } from "fs/promises";
2256
- import { join } from "path";
2257
2255
  var DEFAULT_SPLIT_POLICY = {
2258
2256
  trainPct: 70,
2259
2257
  devPct: 15,
@@ -2330,12 +2328,16 @@ var FileSystemFeedbackTrajectoryStore = class {
2330
2328
  return next;
2331
2329
  }
2332
2330
  async append(record) {
2331
+ const { appendFile, mkdir } = await import("fs/promises");
2332
+ const { join: join3 } = await import("path");
2333
2333
  await mkdir(this.dir, { recursive: true });
2334
- await appendFile(join(this.dir, "feedback-trajectories.ndjson"), JSON.stringify(record) + "\n", "utf8");
2334
+ await appendFile(join3(this.dir, "feedback-trajectories.ndjson"), JSON.stringify(record) + "\n", "utf8");
2335
2335
  }
2336
2336
  async load() {
2337
2337
  if (this.loaded) return;
2338
- const file = join(this.dir, "feedback-trajectories.ndjson");
2338
+ const { readFile } = await import("fs/promises");
2339
+ const { join: join3 } = await import("path");
2340
+ const file = join3(this.dir, "feedback-trajectories.ndjson");
2339
2341
  try {
2340
2342
  const raw = await readFile(file, "utf8");
2341
2343
  for (const line of raw.split("\n")) {
@@ -2422,6 +2424,44 @@ function feedbackTrajectoryToOptimizerRow(trajectory) {
2422
2424
  function feedbackTrajectoriesToOptimizerRows(trajectories) {
2423
2425
  return trajectories.map(feedbackTrajectoryToOptimizerRow);
2424
2426
  }
2427
/**
 * Replay a single feedback trajectory through an adapter.
 *
 * On success the adapter's result is returned with `trajectoryId` added in
 * front of it. The adapter's fields are spread after `trajectoryId`, so a
 * `trajectoryId` supplied by the adapter takes precedence.
 *
 * Anything thrown while replaying is not propagated: it is converted into a
 * failing result (`pass: false`, score 0) carrying one system "reject" label
 * and `metadata.replayError = true`.
 *
 * @param {object} trajectory - Trajectory to replay; its `id` is copied to the result.
 * @param {object} adapter2 - Object exposing `replay(trajectory)`; may be sync or async.
 * @returns {Promise<object>} The replay result, or the synthesized failure result.
 */
async function replayFeedbackTrajectory(trajectory, adapter2) {
  // Builds the failure-shaped result for anything thrown during the replay.
  const describeFailure = (thrown) => {
    // One timestamp is shared by the label and the outcome.
    const timestamp = (/* @__PURE__ */ new Date()).toISOString();
    const detail = thrown instanceof Error ? thrown.message : String(thrown);
    const rejection = {
      source: "system",
      kind: "reject",
      value: false,
      reason: detail,
      severity: "error",
      createdAt: timestamp
    };
    const outcome = {
      success: false,
      score: 0,
      detail,
      observedAt: timestamp
    };
    return {
      trajectoryId: trajectory.id,
      pass: false,
      labels: [rejection],
      outcome,
      metadata: { replayError: true }
    };
  };

  try {
    const replayed = await adapter2.replay(trajectory);
    // Kept inside the try so a throw while reading/spreading is reported
    // the same way as a throw from the adapter itself.
    return { trajectoryId: trajectory.id, ...replayed };
  } catch (thrown) {
    return describeFailure(thrown);
  }
}
2458
/**
 * Replay several trajectories through the same adapter, one after another.
 *
 * Each replay is awaited before the next one starts, and results are returned
 * in iteration order.
 *
 * @param {Iterable<object>} trajectories - Trajectories to replay.
 * @param {object} adapter2 - Adapter forwarded to `replayFeedbackTrajectory`.
 * @returns {Promise<object[]>} One replay result per trajectory.
 */
async function replayFeedbackTrajectories(trajectories, adapter2) {
  const replayed = [];
  for (const item of trajectories) {
    // Deliberately sequential: the next replay only starts once this one settles.
    const outcome = await replayFeedbackTrajectory(item, adapter2);
    replayed.push(outcome);
  }
  return replayed;
}
2425
2465
  function summarizePreferenceMemory(trajectories, options = {}) {
2426
2466
  const maxEntries = options.maxEntries ?? 20;
2427
2467
  const entries = [];
@@ -2585,6 +2625,69 @@ function canonicalize(value) {
2585
2625
  return out;
2586
2626
  }
2587
2627
 
2628
+ // src/action-policy.ts
2629
/**
 * Evaluate a proposed action against an execution policy.
 *
 * Every rule is checked (there is no short-circuit), so `reasons` lists each
 * rule that fired. Rules either block the action or flag it as needing
 * approval:
 *   - blocks: type not in `allowedTypes` (when that list is non-empty), type in
 *     `blockedTypes`, cost above `maxActionCostUsd`, cost above
 *     `remainingBudgetUsd`, missing `metadata.expectedOutcome` /
 *     `metadata.killCriteria` when the policy requires them;
 *   - approval: `action.requiresApproval`, type in `alwaysRequireApprovalTypes`,
 *     external side effect when `requireApprovalForExternalSideEffects` is set,
 *     cost above `requireApprovalAboveCostUsd`.
 *
 * A missing (`undefined`/`null`) `action.costUsd` is treated as 0 for every
 * cost rule, both in the comparison and in the reason text.
 *
 * @param {object} action - Proposed action; reads `type`, `requiresApproval`,
 *   `externalSideEffect`, `costUsd` and `metadata`.
 * @param {object} [policy={}] - Policy limits; every field is optional.
 * @param {object} [options={}] - `createdAt` overrides the label timestamp
 *   (defaults to the current time as an ISO string).
 * @returns {{allowed: boolean, blocked: boolean, requiresApproval: boolean,
 *   reasons: string[], label: (object|undefined)}} `requiresApproval` is only
 *   true when the action is not blocked; `label` is `undefined` when the action
 *   is neither blocked nor in need of approval.
 */
function evaluateActionPolicy(action, policy = {}, options = {}) {
  const reasons = [];
  let blocked = false;
  let requiresApproval = Boolean(action.requiresApproval);
  // Normalize once so comparisons and messages agree; interpolating the raw
  // field produced "cost undefined exceeds ..." for cost-less actions checked
  // against a negative threshold (e.g. an overdrawn remaining budget).
  const costUsd = action.costUsd ?? 0;

  if (policy.allowedTypes?.length && !policy.allowedTypes.includes(action.type)) {
    blocked = true;
    reasons.push(`action type "${action.type}" is not allowed`);
  }
  if (policy.blockedTypes?.includes(action.type)) {
    blocked = true;
    reasons.push(`action type "${action.type}" is blocked`);
  }
  if (policy.alwaysRequireApprovalTypes?.includes(action.type)) {
    requiresApproval = true;
    reasons.push(`action type "${action.type}" requires approval`);
  }
  if (policy.requireApprovalForExternalSideEffects && action.externalSideEffect) {
    requiresApproval = true;
    reasons.push("external side effect requires approval");
  }
  if (policy.requireApprovalAboveCostUsd !== void 0 && costUsd > policy.requireApprovalAboveCostUsd) {
    requiresApproval = true;
    reasons.push(`cost ${costUsd} exceeds approval threshold ${policy.requireApprovalAboveCostUsd}`);
  }
  if (policy.maxActionCostUsd !== void 0 && costUsd > policy.maxActionCostUsd) {
    blocked = true;
    reasons.push(`cost ${costUsd} exceeds max action cost ${policy.maxActionCostUsd}`);
  }
  if (policy.remainingBudgetUsd !== void 0 && costUsd > policy.remainingBudgetUsd) {
    blocked = true;
    reasons.push(`cost ${costUsd} exceeds remaining budget ${policy.remainingBudgetUsd}`);
  }
  if (policy.expectedOutcomeRequired && !action.metadata?.expectedOutcome) {
    blocked = true;
    reasons.push("expected outcome is required");
  }
  if (policy.killCriteriaRequired && !action.metadata?.killCriteria) {
    blocked = true;
    reasons.push("kill criteria are required");
  }
  // Auto-approval never overrides an approval rule; it is only noted as a reason.
  if (policy.autoApproveTypes?.includes(action.type) && requiresApproval) {
    reasons.push(`action type "${action.type}" is auto-approved only when no approval policy applies`);
  }
  // Always return at least one reason, even when no policy rule fired.
  if (!reasons.length) reasons.push(requiresApproval ? "approval required" : "action allowed");

  // The label records the raw flags, so `value.requiresApproval` can be true
  // for a blocked action even though the returned `requiresApproval` is false.
  const label = blocked || requiresApproval ? {
    source: "policy",
    kind: blocked ? "policy_block" : "comment",
    value: { actionType: action.type, blocked, requiresApproval },
    reason: reasons.join("; "),
    severity: blocked ? "critical" : "warning",
    createdAt: options.createdAt ?? (/* @__PURE__ */ new Date()).toISOString(),
    metadata: { action, policy }
  } : void 0;

  return {
    allowed: !blocked,
    blocked,
    requiresApproval: !blocked && requiresApproval,
    reasons,
    label
  };
}
2690
+
2588
2691
  // src/prompt-registry.ts
2589
2692
  var PromptRegistry = class {
2590
2693
  entries = /* @__PURE__ */ new Map();
@@ -6382,7 +6485,7 @@ function assertNonNegative(n, name) {
6382
6485
 
6383
6486
  // src/muffled-gate-scanner.ts
6384
6487
  import { readFileSync as readFileSync2, existsSync as existsSync2, readdirSync, statSync } from "fs";
6385
- import { join as join2 } from "path";
6488
+ import { join } from "path";
6386
6489
  function codeOf(line) {
6387
6490
  return line.replace(/\/\/.*$/, "").replace(/^\s*\*.*$/, "");
6388
6491
  }
@@ -6486,11 +6589,11 @@ var UNIVERSAL_FINDERS = [
6486
6589
  function autoDeriveImporters(repoRoot, roots, extensions, importsContain) {
6487
6590
  const matches2 = [];
6488
6591
  const walk = (rel) => {
6489
- const abs = join2(repoRoot, rel);
6592
+ const abs = join(repoRoot, rel);
6490
6593
  if (!existsSync2(abs)) return;
6491
6594
  for (const entry of readdirSync(abs)) {
6492
- const sub = join2(rel, entry);
6493
- const subAbs = join2(repoRoot, sub);
6595
+ const sub = join(rel, entry);
6596
+ const subAbs = join(repoRoot, sub);
6494
6597
  let st;
6495
6598
  try {
6496
6599
  st = statSync(subAbs);
@@ -6519,7 +6622,7 @@ function scanForMuffledGates(opts) {
6519
6622
  const findings = [];
6520
6623
  const scanned = /* @__PURE__ */ new Set();
6521
6624
  for (const file of opts.scanFiles) {
6522
- const abs = join2(opts.repoRoot, file);
6625
+ const abs = join(opts.repoRoot, file);
6523
6626
  if (!existsSync2(abs)) continue;
6524
6627
  const text = readFileSync2(abs, "utf8");
6525
6628
  for (const find of opts.finders) findings.push(...find(file, text));
@@ -6534,7 +6637,7 @@ function scanForMuffledGates(opts) {
6534
6637
  );
6535
6638
  for (const file of importers) {
6536
6639
  if (scanned.has(file)) continue;
6537
- const abs = join2(opts.repoRoot, file);
6640
+ const abs = join(opts.repoRoot, file);
6538
6641
  if (!existsSync2(abs)) continue;
6539
6642
  const text = readFileSync2(abs, "utf8");
6540
6643
  for (const find of opts.autoDerive.universalFinders) findings.push(...find(file, text));
@@ -8522,7 +8625,7 @@ async function commitBisect(options) {
8522
8625
  }
8523
8626
  async function promptBisect(options) {
8524
8627
  const split = options.paragraphSplitter ?? ((p) => p.split(/\n\s*\n/));
8525
- const join4 = (paragraphs) => paragraphs.join("\n\n");
8628
+ const join3 = (paragraphs) => paragraphs.join("\n\n");
8526
8629
  const goodParas = split(options.good);
8527
8630
  const badParas = split(options.bad);
8528
8631
  if (goodParas.length !== badParas.length) {
@@ -8540,7 +8643,7 @@ async function promptBisect(options) {
8540
8643
  const result = await bisect({
8541
8644
  good: goodMask,
8542
8645
  bad: badMask,
8543
- runEval: (mask) => options.runEval(join4(paragraphsFor(mask))),
8646
+ runEval: (mask) => options.runEval(join3(paragraphsFor(mask))),
8544
8647
  maxIterations: options.maxIterations ?? n + 5,
8545
8648
  halfway: (g, b) => {
8546
8649
  for (let i = 0; i < g.length; i++) {
@@ -8571,12 +8674,12 @@ async function promptBisect(options) {
8571
8674
  }
8572
8675
  }
8573
8676
  const materializedPath = result.path.map((s) => ({
8574
- state: join4(paragraphsFor(s.state)),
8677
+ state: join3(paragraphsFor(s.state)),
8575
8678
  score: s.score,
8576
8679
  pass: s.pass
8577
8680
  }));
8578
8681
  return {
8579
- culprit: join4(paragraphsFor(culprit)),
8682
+ culprit: join3(paragraphsFor(culprit)),
8580
8683
  path: materializedPath,
8581
8684
  converged: result.converged,
8582
8685
  inputInconsistent: result.inputInconsistent,
@@ -9631,7 +9734,7 @@ function mergeSignals(a, b) {
9631
9734
  // src/command-runner.ts
9632
9735
  import { spawnSync } from "child_process";
9633
9736
  import { existsSync as existsSync3, readFileSync as readFileSync3, readdirSync as readdirSync2, statSync as statSync2 } from "fs";
9634
- import { join as join3 } from "path";
9737
+ import { join as join2 } from "path";
9635
9738
  var localCommandRunner = {
9636
9739
  name: "local",
9637
9740
  async run(input) {
@@ -9678,7 +9781,7 @@ var localCommandRunner = {
9678
9781
  const out = [];
9679
9782
  for (const name of entries) {
9680
9783
  try {
9681
- const st = statSync2(join3(path, name));
9784
+ const st = statSync2(join2(path, name));
9682
9785
  out.push({
9683
9786
  name,
9684
9787
  isDirectory: st.isDirectory(),
@@ -13715,6 +13818,7 @@ export {
13715
13818
  estimateCost,
13716
13819
  estimateTokens,
13717
13820
  euAiActReport,
13821
+ evaluateActionPolicy,
13718
13822
  evaluateContract,
13719
13823
  evaluateHypothesis,
13720
13824
  evaluateOracles,
@@ -13822,6 +13926,8 @@ export {
13822
13926
  renderPlaybookMarkdown,
13823
13927
  renderPreferenceMemoryMarkdown,
13824
13928
  renderSteeringText,
13929
+ replayFeedbackTrajectories,
13930
+ replayFeedbackTrajectory,
13825
13931
  replayScorerOverCorpus,
13826
13932
  replayTraceThroughJudge,
13827
13933
  requiredSampleSize,