@tangle-network/agent-eval 0.17.2 → 0.17.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +17 -16
- package/dist/index.d.ts +44 -8
- package/dist/index.js +122 -16
- package/dist/index.js.map +1 -1
- package/package.json +1 -1
package/README.md
CHANGED
|
@@ -21,7 +21,7 @@ console.log(ship.result.passed, ship.result.score)
|
|
|
21
21
|
- You ship a content generator and need quality signal beyond "the LLM said it's good".
|
|
22
22
|
- You want a release gate that fails on regressions you can name, not vibes.
|
|
23
23
|
|
|
24
|
-
If that's you, start with [`docs/concepts.md`](./docs/concepts.md) — 5-minute mental model — then
|
|
24
|
+
If that's you, start with [`docs/concepts.md`](./docs/concepts.md) — a 5-minute mental model — then use [`docs/feature-guide.md`](./docs/feature-guide.md) to choose the right primitive.
|
|
25
25
|
|
|
26
26
|
## Quickstart
|
|
27
27
|
|
|
@@ -65,6 +65,7 @@ The recipe for a code-generator eval is in [`SKILL.md` §Minimal working path](.
|
|
|
65
65
|
## Two ways to read this repo
|
|
66
66
|
|
|
67
67
|
- **You're a human onboarding** — read [`docs/concepts.md`](./docs/concepts.md) for the mental model, then [`docs/wire-protocol.md`](./docs/wire-protocol.md) if you'll call from another language, or `SKILL.md` if you'll embed in TS.
|
|
68
|
+
- **You're deciding what to integrate** — read [`docs/feature-guide.md`](./docs/feature-guide.md) for a plain-language explanation, use cases, feature map, and guardrails.
|
|
68
69
|
- **You're an LLM agent writing integration code** — read `SKILL.md`. Every directive there encodes a shipped bug; skipping one reintroduces the bug class.
|
|
69
70
|
|
|
70
71
|
## What's in the box
|
|
@@ -78,7 +79,8 @@ The recipe for a code-generator eval is in [`SKILL.md` §Minimal working path](.
|
|
|
78
79
|
| `clients/python/` | First-party Python client (`tangle-agent-eval` on PyPI). Version-locked to npm. | clients/python/README.md |
|
|
79
80
|
| `BenchmarkRunner`, `executeScenario`, `ConvergenceTracker` | Multi-turn scenario execution + cross-run tracking. | SKILL.md |
|
|
80
81
|
| `runAgentControlLoop` | Policy-based runtime for agentic tasks: observe typed state, validate, decide, act, repeat with budgets, tracing, and stuck-loop guards. | [control-runtime.md](./docs/control-runtime.md) |
|
|
81
|
-
| `FeedbackTrajectory`, `InMemoryFeedbackTrajectoryStore`, `FileSystemFeedbackTrajectoryStore` |
|
|
82
|
+
| `FeedbackTrajectory`, `InMemoryFeedbackTrajectoryStore`, `FileSystemFeedbackTrajectoryStore` | Human/environment feedback loops: capture approvals, rejections, choices, revisions, metrics, and policy blocks as train/dev/test/holdout examples. | [feedback-trajectories.md](./docs/feedback-trajectories.md) |
|
|
83
|
+
| `evaluateActionPolicy` | Generic action preflight for approval, budget, expected-outcome, and kill-criteria checks. | [feature-guide.md](./docs/feature-guide.md) |
|
|
82
84
|
| `ExperimentTracker`, `PromptOptimizer`, `bisector` | A/B prompts, optimize steering, bisect regressions. | SKILL.md |
|
|
83
85
|
| `runPromptEvolution`, `createCompositeMutator`, `createSandboxPool`, `createSandboxCodeMutator`, `MutationTelemetry`, `LineageRecorder`, `CostLedger`, `JsonlTrialCache` | Prompt + code evolution loops with bounded sandbox pools, durable JSONL telemetry, plateau-detecting composite mutators, crash-resumable trial cache. | §Evolution loop |
|
|
84
86
|
| `reflective-mutation` (`buildReflectionPrompt`, `parseReflectionResponse`, `DEFAULT_MUTATION_PRIMITIVES`) | Trace-conditioned LLM mutator that reasons over top/bottom trials instead of blind rewrites. | inline JSDoc |
|
|
@@ -170,9 +172,9 @@ The `MutationTelemetry`, `LineageRecorder`, and `CostLedger` pass into the `code
|
|
|
170
172
|
|
|
171
173
|
For the full primitive surface and rationale, read each module's JSDoc — `prompt-evolution.ts`, `composite-mutator.ts`, `sandbox-pool.ts`, `code-mutator.ts`, `reflective-mutation.ts`, `evolution-telemetry.ts`.
|
|
172
174
|
|
|
173
|
-
##
|
|
175
|
+
## Feedback trajectory loop
|
|
174
176
|
|
|
175
|
-
When normal
|
|
177
|
+
When normal agent usage should generate training/eval signal, use feedback
|
|
176
178
|
trajectories. They turn approvals, rejections, option choices, edits, metrics,
|
|
177
179
|
and policy blocks into reusable examples.
|
|
178
180
|
|
|
@@ -185,22 +187,21 @@ import {
|
|
|
185
187
|
} from '@tangle-network/agent-eval'
|
|
186
188
|
|
|
187
189
|
const trajectory = createFeedbackTrajectory({
|
|
188
|
-
projectId: '
|
|
189
|
-
scenarioId: '
|
|
190
|
-
task: { intent: '
|
|
190
|
+
projectId: 'research-agent',
|
|
191
|
+
scenarioId: 'brief-review',
|
|
192
|
+
task: { intent: 'Revise a research brief until it is specific and sourced.' },
|
|
191
193
|
attempts: [{
|
|
192
194
|
id: 'draft-1',
|
|
193
195
|
stepIndex: 0,
|
|
194
|
-
artifactType: '
|
|
195
|
-
artifact: {
|
|
196
|
-
options: ['enterprise procurement', 'technical founder pain'],
|
|
196
|
+
artifactType: 'research',
|
|
197
|
+
artifact: { summary: 'Initial brief with weak sourcing.' },
|
|
197
198
|
createdAt: new Date().toISOString(),
|
|
198
199
|
}],
|
|
199
200
|
labels: [{
|
|
200
201
|
source: 'user',
|
|
201
|
-
kind: '
|
|
202
|
-
value: '
|
|
203
|
-
reason: '
|
|
202
|
+
kind: 'revision_request',
|
|
203
|
+
value: 'needs stronger evidence',
|
|
204
|
+
reason: 'add primary sources and remove unsupported claims',
|
|
204
205
|
severity: 'error',
|
|
205
206
|
createdAt: new Date().toISOString(),
|
|
206
207
|
}],
|
|
@@ -211,9 +212,9 @@ const scenarios = feedbackTrajectoriesToDatasetScenarios([trajectory])
|
|
|
211
212
|
const optimizerRows = feedbackTrajectoriesToOptimizerRows([trajectory])
|
|
212
213
|
```
|
|
213
214
|
|
|
214
|
-
This is the bridge between
|
|
215
|
-
|
|
216
|
-
|
|
215
|
+
This is the bridge between feedback and optimization: review signals become
|
|
216
|
+
immediate memory, replayable eval scenarios, and prompt/signature/code optimizer
|
|
217
|
+
input. See [`docs/feedback-trajectories.md`](./docs/feedback-trajectories.md).
|
|
217
218
|
|
|
218
219
|
## v0.16 highlights — production-rigor primitives
|
|
219
220
|
|
package/dist/index.d.ts
CHANGED
|
@@ -1437,6 +1437,17 @@ interface FeedbackOptimizerRow extends OptimizationExample {
|
|
|
1437
1437
|
labelKinds: FeedbackLabelKind[];
|
|
1438
1438
|
score?: number;
|
|
1439
1439
|
}
|
|
1440
|
+
interface FeedbackReplayResult {
|
|
1441
|
+
trajectoryId: string;
|
|
1442
|
+
pass: boolean;
|
|
1443
|
+
score?: number;
|
|
1444
|
+
labels: FeedbackLabel[];
|
|
1445
|
+
outcome?: FeedbackOutcome;
|
|
1446
|
+
metadata?: Record<string, unknown>;
|
|
1447
|
+
}
|
|
1448
|
+
interface FeedbackReplayAdapter {
|
|
1449
|
+
replay(trajectory: FeedbackTrajectory): Promise<Omit<FeedbackReplayResult, 'trajectoryId'>> | Omit<FeedbackReplayResult, 'trajectoryId'>;
|
|
1450
|
+
}
|
|
1440
1451
|
declare class InMemoryFeedbackTrajectoryStore implements FeedbackTrajectoryStore {
|
|
1441
1452
|
private readonly trajectories;
|
|
1442
1453
|
save(trajectory: FeedbackTrajectory): Promise<void>;
|
|
@@ -1479,6 +1490,8 @@ declare function feedbackTrajectoryToDatasetScenario(trajectory: FeedbackTraject
|
|
|
1479
1490
|
declare function feedbackTrajectoriesToDatasetScenarios(trajectories: FeedbackTrajectory[]): DatasetScenario[];
|
|
1480
1491
|
declare function feedbackTrajectoryToOptimizerRow(trajectory: FeedbackTrajectory): FeedbackOptimizerRow;
|
|
1481
1492
|
declare function feedbackTrajectoriesToOptimizerRows(trajectories: FeedbackTrajectory[]): FeedbackOptimizerRow[];
|
|
1493
|
+
declare function replayFeedbackTrajectory(trajectory: FeedbackTrajectory, adapter: FeedbackReplayAdapter): Promise<FeedbackReplayResult>;
|
|
1494
|
+
declare function replayFeedbackTrajectories(trajectories: FeedbackTrajectory[], adapter: FeedbackReplayAdapter): Promise<FeedbackReplayResult[]>;
|
|
1482
1495
|
declare function summarizePreferenceMemory(trajectories: FeedbackTrajectory[], options?: {
|
|
1483
1496
|
maxEntries?: number;
|
|
1484
1497
|
}): PreferenceMemoryEntry[];
|
|
@@ -1494,6 +1507,29 @@ declare function controlRunToFeedbackTrajectory<TState, TAction, TActionResult>(
|
|
|
1494
1507
|
createdAt?: string;
|
|
1495
1508
|
}): FeedbackTrajectory;
|
|
1496
1509
|
|
|
1510
|
+
interface ActionExecutionPolicy {
|
|
1511
|
+
allowedTypes?: string[];
|
|
1512
|
+
blockedTypes?: string[];
|
|
1513
|
+
alwaysRequireApprovalTypes?: string[];
|
|
1514
|
+
autoApproveTypes?: string[];
|
|
1515
|
+
requireApprovalForExternalSideEffects?: boolean;
|
|
1516
|
+
requireApprovalAboveCostUsd?: number;
|
|
1517
|
+
maxActionCostUsd?: number;
|
|
1518
|
+
remainingBudgetUsd?: number;
|
|
1519
|
+
expectedOutcomeRequired?: boolean;
|
|
1520
|
+
killCriteriaRequired?: boolean;
|
|
1521
|
+
}
|
|
1522
|
+
interface ActionPolicyDecision {
|
|
1523
|
+
allowed: boolean;
|
|
1524
|
+
blocked: boolean;
|
|
1525
|
+
requiresApproval: boolean;
|
|
1526
|
+
reasons: string[];
|
|
1527
|
+
label?: FeedbackLabel;
|
|
1528
|
+
}
|
|
1529
|
+
declare function evaluateActionPolicy(action: ProposedSideEffect, policy?: ActionExecutionPolicy, options?: {
|
|
1530
|
+
createdAt?: string;
|
|
1531
|
+
}): ActionPolicyDecision;
|
|
1532
|
+
|
|
1497
1533
|
/**
|
|
1498
1534
|
* Normalize scores so all dimensions follow "higher = better".
|
|
1499
1535
|
* Inverted dimensions (hallucination, false_confidence, worst_failure)
|
|
@@ -1595,7 +1631,7 @@ declare class ConvergenceTracker {
|
|
|
1595
1631
|
* Uses the Web Crypto API (works in Workers, Node 22+, browsers).
|
|
1596
1632
|
*/
|
|
1597
1633
|
interface PromptHandle {
|
|
1598
|
-
/** Stable human-readable id, e.g. '
|
|
1634
|
+
/** Stable human-readable id, e.g. 'browser.system' */
|
|
1599
1635
|
id: string;
|
|
1600
1636
|
/** Caller-chosen version string, e.g. 'v3' or '2026-04-20' */
|
|
1601
1637
|
version: string;
|
|
@@ -1687,7 +1723,7 @@ declare function analyzeAntiSlop(outputs: string[], config: Omit<Required<AntiSl
|
|
|
1687
1723
|
* Artifact validators.
|
|
1688
1724
|
*
|
|
1689
1725
|
* Generic "score a produced artifact" primitive. Tax uses it for PDF form
|
|
1690
|
-
* correctness,
|
|
1726
|
+
* correctness, research for sourced briefs, browser for task assertions, coding
|
|
1691
1727
|
* for social posts. One interface, many validators; all plug into
|
|
1692
1728
|
* `BenchmarkRunner` the same way.
|
|
1693
1729
|
*
|
|
@@ -1975,7 +2011,7 @@ declare class FileSystemExperimentStore implements ExperimentStore {
|
|
|
1975
2011
|
* `Run.status` field one-to-one.
|
|
1976
2012
|
*
|
|
1977
2013
|
* Why this lives next to `InMemoryExperimentStore`:
|
|
1978
|
-
* -
|
|
2014
|
+
* - browser, coding, and computer-use agents can all run as Workers
|
|
1979
2015
|
* - Workers cannot use `node:fs`, so `FileSystemExperimentStore` doesn't apply
|
|
1980
2016
|
* - Hand-rolling D1 SQL in every consumer is exactly the duplication this
|
|
1981
2017
|
* module exists to prevent
|
|
@@ -2008,7 +2044,7 @@ interface D1ExperimentStoreOptions {
|
|
|
2008
2044
|
db: D1Like;
|
|
2009
2045
|
/**
|
|
2010
2046
|
* Optional table-name prefix so multiple ExperimentStores can share a DB
|
|
2011
|
-
* without colliding (e.g. `
|
|
2047
|
+
* without colliding (e.g. `browser_eval_experiments` vs `coding_eval_experiments`).
|
|
2012
2048
|
* Default: `agent_eval_`.
|
|
2013
2049
|
*/
|
|
2014
2050
|
tablePrefix?: string;
|
|
@@ -2592,7 +2628,7 @@ type HostedRunCriticConfig = Pick<RunCriticOptions, 'weights'> & {
|
|
|
2592
2628
|
/**
|
|
2593
2629
|
* Dual-agent convergence bench.
|
|
2594
2630
|
*
|
|
2595
|
-
* Pattern lifted from
|
|
2631
|
+
* Pattern lifted from dual-worker review loops: two agents take turns until
|
|
2596
2632
|
* they converge on a consensus artifact. One proposes, the other critiques;
|
|
2597
2633
|
* the proposer revises; repeat until a score threshold is hit or max rounds.
|
|
2598
2634
|
*
|
|
@@ -3408,7 +3444,7 @@ declare function evaluateOracles(obs: OracleObservation, oracles: Oracle[]): Ora
|
|
|
3408
3444
|
/**
|
|
3409
3445
|
* Cost tracker — token + USD accounting per scenario and per run.
|
|
3410
3446
|
*
|
|
3411
|
-
*
|
|
3447
|
+
* Adapted from generic usage-event accounting. Every
|
|
3412
3448
|
* optimizer needs to know "is the quality gain worth the cost delta?",
|
|
3413
3449
|
* and every dashboard needs dollars-per-completed-task. MODEL_PRICING
|
|
3414
3450
|
* from metrics.ts stays authoritative for estimate math; this module
|
|
@@ -3619,7 +3655,7 @@ declare function analyzeSeries(values: number[], options?: SeriesConvergenceOpti
|
|
|
3619
3655
|
* State continuity scoring — measures how well a resumed/handed-off agent
|
|
3620
3656
|
* preserves prior work.
|
|
3621
3657
|
*
|
|
3622
|
-
*
|
|
3658
|
+
* When session 2 continues
|
|
3623
3659
|
* session 1's work, the key question is: did it preserve key artifacts,
|
|
3624
3660
|
* or start over and lose context? Each `ContinuityCheck` inspects one
|
|
3625
3661
|
* aspect (file preserved, key count grew, status advanced) and yields
|
|
@@ -8347,4 +8383,4 @@ interface ReflectionProposal {
|
|
|
8347
8383
|
}
|
|
8348
8384
|
declare function parseReflectionResponse(raw: string, maxProposals?: number): ReflectionProposal[];
|
|
8349
8385
|
|
|
8350
|
-
export { type ActiveLearningOptions, type AdapterRun, AgentDriver, type AgentDriverConfig, type AlignmentOp, type AntiSlopConfig, type AntiSlopIssue, type AntiSlopReport, type Artifact$1 as Artifact, type ArtifactCheck, type Artifact as ArtifactCheckArtifact, type ArtifactResult, type ArtifactValidator, AxGepaSteeringOptimizer, type AxSteeringOptimizerConfig, BENCHMARK_SPLIT_SEED, type BaselineOptions, type BaselineReport, BehaviorAssertion, type BenchmarkAdapter, type BenchmarkDatasetItem, type BenchmarkEvaluation, type BenchmarkReport, BenchmarkRunner, type BenchmarkRunnerConfig, type BestOfNResult, type BisectOptions, type BisectResult, type BisectStep, type BootstrapOptions, type BootstrapResult, BudgetBreachError, type BudgetBreachFinding, type BudgetBreachReport, BudgetGuard, type BudgetLedgerEntry, type BudgetSpec, BuilderSession, type BuilderSessionInit, type CalibrationBin, type CalibrationOptions, type CalibrationReport, type CalibrationResult, CallExpectation, type CanaryAlert, type CanaryKind, type CanaryLeak, type CanaryOptions, type CanaryReport, type CanarySeverity, type CandidateScenario, type CandidateScore, type CausalAttributionReport, type ChatSummary, type CheckResult, type CodeMutationOutcome, type CodeMutationRunner, type CollectedArtifacts, type CommandRunner, type CompletionCriterion, type CompositePolicy, type ConceptComplexity, type ConceptFinding, type ConceptSpec, type ConceptWeightStrategy, type ContinuityCheck, type ContinuityCheckResult, type ContinuityReport, type ContinuitySnapshotPair, type ContractMetric, type ContractReport, type ControlActionFailureMode, type ControlActionOutcome, type ControlBudget, type ControlContext, type ControlDecision, type ControlEvalResult, type ControlRunResult, type ControlRuntimeConfig, type ControlRuntimeError, type ControlSeverity, type ControlStep, type ControlStopPolicies, ConvergenceTracker, type CorrelationReport, type CorrelationResult, type CorrelationStudyOptions, type 
CorrelationStudyResult, type CostEntry, CostLedger, type CostLedgerGeneration, type CostLedgerSnapshot, type CostSummary, CostTracker, type CounterfactualContext, type CounterfactualMutation, type CounterfactualResult, type CounterfactualRunner, type CreateCompositeMutatorOpts, type CreateDefaultReviewerOptions, type CreateSandboxCodeMutatorOpts, type CreateSandboxPoolOpts, type CrossTraceDiff, type CrossTraceDiffOptions, D1ExperimentStore, type D1ExperimentStoreOptions, type D1Like, type D1PreparedStatementLike, DEFAULT_AGENT_SLOS, DEFAULT_COMPLEXITY_WEIGHTS, DEFAULT_RULES as DEFAULT_FAILURE_RULES, DEFAULT_FINDERS, DEFAULT_HARNESS_OBJECTIVES, DEFAULT_MUTATION_PRIMITIVES, DEFAULT_MUTATORS, DEFAULT_REDACTION_RULES, DEFAULT_RED_TEAM_CORPUS, DEFAULT_RUN_SCORE_WEIGHTS, DEFAULT_SEVERITY_WEIGHTS, Dataset, type DatasetDifficulty, type DatasetManifest, type DatasetProvenance, type DatasetScenario, type DatasetSplit, type DeployFamily, type DeployGateLayerInput, type DeployRunResult, type DeployRunner, type DeploymentOutcome, type DirEntry, type Direction, type DivergenceOptions, type DivergenceReport, DockerSandboxDriver, type DriverResult, type DriverState, DualAgentBench, type DualAgentBenchConfig, type DualAgentReport, type DualAgentRound, type DualAgentScenario, type DualAgentScenarioResult, ERROR_COUNT_PATTERNS, type ErrorCountPattern, type EuRiskClass, type EvalMetricSpec, type EvalResult, type EventFilter, type EventKind, type EvolutionRound, type PromptVariant as EvolvableVariant, type ExecutorConfig, type Expectation, type Experiment, type ExperimentPlan, type ExperimentResult, type Run as ExperimentRun, type ExperimentStore, ExperimentTracker, type ExportedRewardModel, type ExtractOptions, type ExtractResult, FAILURE_CLASSES, type FactorContribution, type FactorialCell, type FailureClass, type FailureClassification, type FailureCluster, type FailureClusterReport, type FailureContext, type FailureMode, type FailureRule, type FeedbackArtifactType, type 
FeedbackAttempt, type FeedbackLabel, type FeedbackLabelKind, type FeedbackLabelSource, type FeedbackOptimizerRow, type FeedbackOutcome, type FeedbackPattern, type FeedbackSeverity, type FeedbackSplitPolicy, type FeedbackTask, type FeedbackTrajectory, type FeedbackTrajectoryFilter, type FeedbackTrajectoryStore, FileSystemExperimentStore, type FileSystemExperimentStoreOptions, FileSystemFeedbackTrajectoryStore, FileSystemOutcomeStore, type FileSystemOutcomeStoreOptions, FileSystemTraceStore, type FileSystemTraceStoreOptions, type Finding, type FlowAction, type FlowLayerEnv, type FlowLayerFactoryInput, type FlowRunner, type FlowRunnerStepResult, type FlowSpec, type FlowStep, type GainDistributionBin, type GainDistributionFigureSpec, type GainDistributionOptions, type GateDecision, type GateEvidence, type GenerationReport, type GenericSpan, type GoldenItem, type GoldenSeverity, type GoldenSpec, type GovernanceContext, type GovernanceFinding, type GovernanceReport, type GradedStep, type HarnessAdapter, type HarnessConfig, type HarnessExperimentConfig, type HarnessExperimentResult, type HarnessIntervention, type HarnessRunRequest, type HarnessRunResult, type HarnessScenario, type HarnessSelection, type HarnessVariant, type HarnessVariantReport, HeldOutGate, type HeldOutGateConfig, type HeldOutGateRejectionCode, HoldoutAuditor, HoldoutLockedError, type HostedJudgeConfig, type HostedJudgeDimension, type HostedJudgeRequest, type HostedJudgeResponse, type HostedRunCriticConfig, type HostedRunScoreRequest, type HostedRunScoreResponse, type HypothesisManifest, type HypothesisResult, INTENT_MATCH_JUDGE_VERSION, type ImageData, InMemoryExperimentStore, InMemoryFeedbackTrajectoryStore, InMemoryOutcomeStore, InMemoryTraceStore, InMemoryTrialCache, InMemoryWorkspaceInspector, type InferenceScorer, type InspectorContext, type IntentMatchInput, type IntentMatchOptions, type IntentMatchResult, type InteractionContribution, JsonlTrialCache, type JudgeAgreementReport, type JudgeConfig, 
type JudgeFleetOptions, type JudgeFn, type JudgeInput, type JudgePair, type JudgeReplayGateArgs, type JudgeReplayResult, type JudgeRubric, JudgeRunner, type JudgeScore, type JudgeSpan, type KeywordConceptSpec, type KeywordCoverageFinding, type KeywordCoverageOptions, type KeywordCoverageResult, type LangfuseEnvelope, type LangfuseGeneration, type LangfuseScore, type Layer, type LayerCorrelation, type LayerResult, type LayerStatus, type LineageKind, type LineageKindResolver, type LineageNode, LineageRecorder, LlmCallError, type LlmCallRequest, type LlmCallResult, LlmClient, type LlmClientOptions, type LlmJsonCall, type LlmMessage, type LlmReviewerConfig, type LlmSpan, type LlmUsage, LockedJsonlAppender, MODEL_PRICING, type MatchResult, type MatcherResult, type MeasurementPolicy, type MergeOptions, type Message, type MetricSamples, type MetricVerdict, MetricsCollector, type MuffledFinder, type MuffledFinding, MultiLayerVerifier, type MultiToolchainLayerConfig, type MutateAdapter, type MutationAttempt, type MutationChannel, MutationTelemetry, type Mutator, Mutex, NoopResearcher, OTEL_AGENT_EVAL_SCOPE, type Objective, type OptimizationConfig, type OptimizationExample, OptimizationLoop, type OptimizationLoopConfig, type OptimizationLoopResult, type OptimizationResult, type Oracle, type OracleObservation, type OracleReport, type OracleResult, type OrthogonalityInput, type OrthogonalityResult, type OtlpExport, type OtlpResourceSpans, type OtlpSpan, type OutcomeFilter, type OutcomePair, type OutcomeStore, type PairedBootstrapOptions, type PairedBootstrapResult, type PairwiseComparison, PairwiseSteeringOptimizer, type ParetoFigureSpec, type ParetoPoint, type ParetoResult, type PersonaConfig, type Playbook, type PlaybookEntry, type PoolSlot, type PositionalBiasResult, type PreferenceMemoryEntry, type PrmGradedTrace, PrmGrader, type PrmTrainingSample, ProductClient, type ProductClientConfig, type ProjectKind, ProjectRegistry, type ProjectSummary, type ProjectTimelineEntry, 
type PromptEvolutionConfig, type PromptEvolutionEvent, type PromptEvolutionResult, type PromptHandle, PromptOptimizer, PromptRegistry, type TrialResult as PromptTrialResult, type PromptVariant$1 as PromptVariant, type ProposeFn, type ProposeInput, type ProposeOutput, type ProposeReviewConfig, type ProposeReviewControlAction, type ProposeReviewControlConfig, type ProposeReviewControlResult, type ProposeReviewControlState, type ProposeReviewReport, type ProposeReviewShot, type ProposedSideEffect, REDACTION_VERSION, type RedTeamCase, type RedTeamCategory, type RedTeamFinding, type RedTeamPayload, type RedTeamReport, type RedactionReport, type RedactionRule, type ReferenceMatchResult, type ReferenceReplayAdapter, type ReferenceReplayAdapterFn, type ReferenceReplayAdapterLike, type ReferenceReplayAggregate, type ReferenceReplayCandidate, type ReferenceReplayCase, type ReferenceReplayCaseRun, type ReferenceReplayExecutionScenario, type ReferenceReplayItem, type ReferenceReplayMatch, type ReferenceReplayMatchStrategy, type ReferenceReplayMatcher, type ReferenceReplayPromotionDecision, type ReferenceReplayPromotionPolicy, type ReferenceReplayRun, type ReferenceReplayRunContext, type ReferenceReplayRunOptions, type ReferenceReplayRunStore, type ReferenceReplayScenario, type ReferenceReplayScenarioScore, type ReferenceReplayScore, type ReferenceReplayScoreOptions, type ReferenceReplaySplit, type ReferenceReplaySplitComparison, type ReferenceReplaySteeringRowsOptions, type ReflectionContext, type ReflectionProposal, type RegressionOptions, type RegressionSpec, type Researcher, type RetrievalSpan, type Review, type ReviewFn, type ReviewInput, type ReviewMemoryEntry, type ReviewMemoryStore, type ReviewerMemoryEntry, type ReviewerOutput, type ReviewerPromptInput, type ReviewerSoftFailDefaults, type ReviewerVerificationSummary, type RobustnessResult, type RouteMap, type RubricDimension, type Run$1 as Run, type RunAppScenarioOptions, type RunCommandInput, type RunCommandResult, 
type RunConfig, RunCritic, type RunCriticOptions, type RunDiff, type RunFilter, type RunJudgeMetadata, type RunLayer, type RunOutcome, type RunRecord, RunRecordValidationError, type RunScore, type RunScoreWeights, type RunSplitTag, type RunStatus, type RunTokenUsage, type RunTrace, SEMANTIC_CONCEPT_JUDGE_VERSION, type SandboxDriver, SandboxHarness, type SandboxHarnessResult, type SandboxJudgeKind, type SandboxJudgeResult, type SandboxJudgeSpec, type SandboxPool, type SandboxResult, type SandboxSpan, type ScanOptions, type Scenario, type ScenarioAggregate, type ScenarioCost, type ScenarioFile, ScenarioRegistry, type ScenarioResult, type ScoreAdapter, type ScoredTarget, type SelfPlayOptions, type SelfPlayProposer, type SelfPlayScorer, type SelfPreferenceResult, type SemanticConceptJudgeInput, type SemanticConceptJudgeOptions, type SemanticConceptJudgeResult, type SeriesConvergenceOptions, type SeriesConvergenceResult, type Severity, type ShipOptions, type SignedManifest, type SliceOptions, type Slo, type SloCheckResult, type SloComparator, type SloReport, type SloSeverity, type SlopCategory, type SlotFactory, type Span, type SpanBase, type SpanFilter, type SpanHandle, type SpanKind, type SpanStatus, type SteeringBundle, type SteeringChange, type SteeringDelta, type SteeringEvaluation, type SteeringOptimizationResult, type SteeringOptimizationRow, type SteeringOptimizationSelector, type SteeringOptimizerBackend, type SteeringOptimizerConfig, type SteeringRolePrompt, type SteeringVariantReport, type StepAttribution, type StepContext, type StepRubric, type StopDecision, type StuckLoopFinding, type StuckLoopOptions, type StuckLoopReport, SubprocessSandboxDriver, type SubprocessSandboxDriverOptions, type SummaryTable, type SummaryTableOptions, type SummaryTableRow, type SynthesisReason, type SynthesisTarget, TRACE_SCHEMA_VERSION, type TestGradedRunOptions, type TestGradedRunResult, type TestGradedScenario, type TestOutputParser, type TestResult, type 
ThreeLayerProjectReport, type ThresholdContract, TokenCounter, type TokenSpec, type ToolSpan, type ToolStats, type ToolUseMetrics, type ToolUseOptions, type ToolWasteFinding, type ToolWasteOptions, type ToolWasteReport, TraceEmitter, type TraceEmitterOptions, type TraceEvent, type TraceStore, type Trajectory, type TrajectoryStep, type TrialAttempt, type TrialCache, TrialTelemetry, type TrialTrace, type Turn, type TurnMetrics, type TurnResult, UNIVERSAL_FINDERS, type UseCaseSignals, type ValidationContext, type ValidationIssue, type ValidationResult, type VariantAggregate, type VariantScore, type VerbosityBiasResult, type Verdict, type Verification, type VerificationReport, type VerifyContext, type VerifyFn, type VerifyOptions, type VisualDiffOptions, type VisualDiffResult, type ViteDeployRunnerInput, type WorkflowTopology, type WorkspaceAssertion, type WorkspaceAssertionResult, type WorkspaceInspector, type WorkspaceSnapshot, type WranglerDeployRunnerInput, adversarialJudge, aggregateLlm, aggregateRunScore, allCriticalPassed, analyzeAntiSlop, analyzeSeries, argHash, assignFeedbackSplit, attributeCounterfactuals, deterministicSplit as benchmarkDeterministicSplit, index as benchmarks, benjaminiHochberg, bhAdjust, bisect, bonferroni, bootstrapCi, budgetBreachView, buildReflectionPrompt, buildReviewerPrompt, buildTrajectory, byteLengthRange, calibrateJudge, calibrationCurve, callLlm, callLlmJson, canaryLeakView, causalAttribution, checkCanaries, checkSlos, clamp01, classifyEuAiRisk, classifyFailure, codeExecutionJudge, cohensD, coherenceJudge, collectionPreserved, commitBisect, compareReferenceReplay, compareToBaseline, compilerJudge, composeParsers, composeValidators, computeToolUseMetrics, confidenceInterval, containsAll, controlFailureClassFromVerification, controlRunToFeedbackTrajectory, correlateLayers, correlationStudy, createAntiSlopJudge, createCompositeMutator, createCustomJudge, createDefaultReviewer, createDomainExpertJudge, createFeedbackTrajectory, 
createIntentMatchJudge, createLlmReviewer, createSandboxCodeMutator, createSandboxPool, createSemanticConceptJudge, crossTraceDiff, crowdingDistance, decideReferenceReplayPromotion, decideReferenceReplayRunPromotion, defaultJudges, defaultReferenceReplayMatcher, deployGateLayer, distillPlaybook, dominates, estimateCost, estimateTokens, euAiActReport, evaluateContract, evaluateHypothesis, evaluateOracles, executeScenario, expectAgent, exportRewardModel, exportRunAsOtlp, exportTrainingData, extractAssetUrls, extractErrorCount, failureClusterView, feedbackTrajectoriesToDatasetScenarios, feedbackTrajectoriesToOptimizerRows, feedbackTrajectoryToDatasetScenario, feedbackTrajectoryToOptimizerRow, fileContains, fileExists, findAutoMatchNoExpectation, findConstructorCwdDropped, findFallbackToPass, findLiteralTruePass, findSkipCountsAsPass, firstDivergenceView, flowLayer, formatBenchmarkReport, formatDriverReport, formatFindings, gainHistogram, precision as goldenPrecision, gradeSemanticStatus, groupBy, hashContent, hashScenarios, htmlContainsElement, inMemoryReferenceReplayStore, inMemoryReviewStore, interRaterReliability, iqr, isJudgeSpan, isLlmSpan, isPrmVerdict, isRetrievalSpan, isRunRecord, isSandboxSpan, isToolSpan, jestTestParser, jsonHasKeys, jsonShape, jsonlReferenceReplayStore, jsonlReviewStore, judgeAgreementView, judgeReplayGate, judgeSpans, keyPreserved, linterJudge, llmSpanFromProvider, llmSpans, loadScorerFromGrader, localCommandRunner, lowercaseMutator, mannWhitneyU, matchGoldens, mergeLayerResults, mergeSteeringBundle, multiToolchainLayer, nistAiRmfReport, nonRefusalRubric, normalizeScores, notBlocked, objectiveEval, outputLengthRubric, pairedBootstrap, pairedTTest, pairedWilcoxon, paraphraseRobustness, paretoChart, paretoFrontier, paretoFrontierWithCrowding, parseFeedbackTrajectoriesJsonl, parseReflectionResponse, parseRunRecordSafe, partialCredit, passOrthogonality, pixelDeltaRatio, politenessPrefixMutator, positionalBias, printDriverSummary, prmBestOfN, 
prmEnsembleBestOfN, probeLlm, promptBisect, proposeSynthesisTargets, pytestTestParser, redTeamDataset, redTeamReport, redactString, redactValue, referenceReplayRunsToSteeringRows, referenceReplayScenarioToRunScore, regexMatch, regexMatches, regressionView, renderMarkdown, renderMarkdownReport, renderPlaybookMarkdown, renderPreferenceMemoryMarkdown, renderSteeringText, replayScorerOverCorpus, replayTraceThroughJudge, requiredSampleSize, resetLockedAppendersForTesting, resumeBuilderSession, roundTripRunRecord, rowCount, rowWhere, runAgentControlLoop, runAssertions, runCanaries, runCounterfactual, runE2EWorkflow, runExpectations, runFailureClass, runHarnessExperiment, runIntentMatchJudge, runJudgeFleet, runKeywordCoverageJudge, runKeywordCoverageJudgeUrl, runPromptEvolution, runProposeReview, runProposeReviewAsControlLoop, runReferenceReplay, runSelfPlay, runSemanticConceptJudge, runTestGradedScenario, runsForScenario, scalarScore, scanForMuffledGates, scoreAllProjects, scoreContinuity, scoreProject, scoreRedTeamOutput, scoreReferenceReplay, securityJudge, selectHarnessVariant, selfPreference, sentenceReorderMutator, serializeFeedbackTrajectoriesJsonl, signManifest, soc2Report, statusAdvanced, stopOnNoProgress, stopOnRepeatedAction, stripFencedJson, stuckLoopView, subjectiveEval, summarize, summarizeHarnessResults, summarizePreferenceMemory, summaryTable, testJudge, textInSnapshot, toLangfuseEnvelope, toNdjson, toPrometheusText, toolIntentAlignmentRubric, toolNamesForRun, toolNonRedundantRubric, toolSpans, toolSuccessRubric, toolWasteView, typoMutator, urlContains, validateRunRecord, verbosityBias, verifyManifest, visualDiff, viteDeployRunner, vitestTestParser, weightedMean, weightedRecall, welchsTTest, whitespaceCollapseMutator, wilcoxonSignedRank, withAssignedFeedbackSplit, wranglerDeployRunner };
|
|
8386
|
+
export { type ActionExecutionPolicy, type ActionPolicyDecision, type ActiveLearningOptions, type AdapterRun, AgentDriver, type AgentDriverConfig, type AlignmentOp, type AntiSlopConfig, type AntiSlopIssue, type AntiSlopReport, type Artifact$1 as Artifact, type ArtifactCheck, type Artifact as ArtifactCheckArtifact, type ArtifactResult, type ArtifactValidator, AxGepaSteeringOptimizer, type AxSteeringOptimizerConfig, BENCHMARK_SPLIT_SEED, type BaselineOptions, type BaselineReport, BehaviorAssertion, type BenchmarkAdapter, type BenchmarkDatasetItem, type BenchmarkEvaluation, type BenchmarkReport, BenchmarkRunner, type BenchmarkRunnerConfig, type BestOfNResult, type BisectOptions, type BisectResult, type BisectStep, type BootstrapOptions, type BootstrapResult, BudgetBreachError, type BudgetBreachFinding, type BudgetBreachReport, BudgetGuard, type BudgetLedgerEntry, type BudgetSpec, BuilderSession, type BuilderSessionInit, type CalibrationBin, type CalibrationOptions, type CalibrationReport, type CalibrationResult, CallExpectation, type CanaryAlert, type CanaryKind, type CanaryLeak, type CanaryOptions, type CanaryReport, type CanarySeverity, type CandidateScenario, type CandidateScore, type CausalAttributionReport, type ChatSummary, type CheckResult, type CodeMutationOutcome, type CodeMutationRunner, type CollectedArtifacts, type CommandRunner, type CompletionCriterion, type CompositePolicy, type ConceptComplexity, type ConceptFinding, type ConceptSpec, type ConceptWeightStrategy, type ContinuityCheck, type ContinuityCheckResult, type ContinuityReport, type ContinuitySnapshotPair, type ContractMetric, type ContractReport, type ControlActionFailureMode, type ControlActionOutcome, type ControlBudget, type ControlContext, type ControlDecision, type ControlEvalResult, type ControlRunResult, type ControlRuntimeConfig, type ControlRuntimeError, type ControlSeverity, type ControlStep, type ControlStopPolicies, ConvergenceTracker, type CorrelationReport, type CorrelationResult, 
type CorrelationStudyOptions, type CorrelationStudyResult, type CostEntry, CostLedger, type CostLedgerGeneration, type CostLedgerSnapshot, type CostSummary, CostTracker, type CounterfactualContext, type CounterfactualMutation, type CounterfactualResult, type CounterfactualRunner, type CreateCompositeMutatorOpts, type CreateDefaultReviewerOptions, type CreateSandboxCodeMutatorOpts, type CreateSandboxPoolOpts, type CrossTraceDiff, type CrossTraceDiffOptions, D1ExperimentStore, type D1ExperimentStoreOptions, type D1Like, type D1PreparedStatementLike, DEFAULT_AGENT_SLOS, DEFAULT_COMPLEXITY_WEIGHTS, DEFAULT_RULES as DEFAULT_FAILURE_RULES, DEFAULT_FINDERS, DEFAULT_HARNESS_OBJECTIVES, DEFAULT_MUTATION_PRIMITIVES, DEFAULT_MUTATORS, DEFAULT_REDACTION_RULES, DEFAULT_RED_TEAM_CORPUS, DEFAULT_RUN_SCORE_WEIGHTS, DEFAULT_SEVERITY_WEIGHTS, Dataset, type DatasetDifficulty, type DatasetManifest, type DatasetProvenance, type DatasetScenario, type DatasetSplit, type DeployFamily, type DeployGateLayerInput, type DeployRunResult, type DeployRunner, type DeploymentOutcome, type DirEntry, type Direction, type DivergenceOptions, type DivergenceReport, DockerSandboxDriver, type DriverResult, type DriverState, DualAgentBench, type DualAgentBenchConfig, type DualAgentReport, type DualAgentRound, type DualAgentScenario, type DualAgentScenarioResult, ERROR_COUNT_PATTERNS, type ErrorCountPattern, type EuRiskClass, type EvalMetricSpec, type EvalResult, type EventFilter, type EventKind, type EvolutionRound, type PromptVariant as EvolvableVariant, type ExecutorConfig, type Expectation, type Experiment, type ExperimentPlan, type ExperimentResult, type Run as ExperimentRun, type ExperimentStore, ExperimentTracker, type ExportedRewardModel, type ExtractOptions, type ExtractResult, FAILURE_CLASSES, type FactorContribution, type FactorialCell, type FailureClass, type FailureClassification, type FailureCluster, type FailureClusterReport, type FailureContext, type FailureMode, type FailureRule, type 
FeedbackArtifactType, type FeedbackAttempt, type FeedbackLabel, type FeedbackLabelKind, type FeedbackLabelSource, type FeedbackOptimizerRow, type FeedbackOutcome, type FeedbackPattern, type FeedbackReplayAdapter, type FeedbackReplayResult, type FeedbackSeverity, type FeedbackSplitPolicy, type FeedbackTask, type FeedbackTrajectory, type FeedbackTrajectoryFilter, type FeedbackTrajectoryStore, FileSystemExperimentStore, type FileSystemExperimentStoreOptions, FileSystemFeedbackTrajectoryStore, FileSystemOutcomeStore, type FileSystemOutcomeStoreOptions, FileSystemTraceStore, type FileSystemTraceStoreOptions, type Finding, type FlowAction, type FlowLayerEnv, type FlowLayerFactoryInput, type FlowRunner, type FlowRunnerStepResult, type FlowSpec, type FlowStep, type GainDistributionBin, type GainDistributionFigureSpec, type GainDistributionOptions, type GateDecision, type GateEvidence, type GenerationReport, type GenericSpan, type GoldenItem, type GoldenSeverity, type GoldenSpec, type GovernanceContext, type GovernanceFinding, type GovernanceReport, type GradedStep, type HarnessAdapter, type HarnessConfig, type HarnessExperimentConfig, type HarnessExperimentResult, type HarnessIntervention, type HarnessRunRequest, type HarnessRunResult, type HarnessScenario, type HarnessSelection, type HarnessVariant, type HarnessVariantReport, HeldOutGate, type HeldOutGateConfig, type HeldOutGateRejectionCode, HoldoutAuditor, HoldoutLockedError, type HostedJudgeConfig, type HostedJudgeDimension, type HostedJudgeRequest, type HostedJudgeResponse, type HostedRunCriticConfig, type HostedRunScoreRequest, type HostedRunScoreResponse, type HypothesisManifest, type HypothesisResult, INTENT_MATCH_JUDGE_VERSION, type ImageData, InMemoryExperimentStore, InMemoryFeedbackTrajectoryStore, InMemoryOutcomeStore, InMemoryTraceStore, InMemoryTrialCache, InMemoryWorkspaceInspector, type InferenceScorer, type InspectorContext, type IntentMatchInput, type IntentMatchOptions, type IntentMatchResult, type 
InteractionContribution, JsonlTrialCache, type JudgeAgreementReport, type JudgeConfig, type JudgeFleetOptions, type JudgeFn, type JudgeInput, type JudgePair, type JudgeReplayGateArgs, type JudgeReplayResult, type JudgeRubric, JudgeRunner, type JudgeScore, type JudgeSpan, type KeywordConceptSpec, type KeywordCoverageFinding, type KeywordCoverageOptions, type KeywordCoverageResult, type LangfuseEnvelope, type LangfuseGeneration, type LangfuseScore, type Layer, type LayerCorrelation, type LayerResult, type LayerStatus, type LineageKind, type LineageKindResolver, type LineageNode, LineageRecorder, LlmCallError, type LlmCallRequest, type LlmCallResult, LlmClient, type LlmClientOptions, type LlmJsonCall, type LlmMessage, type LlmReviewerConfig, type LlmSpan, type LlmUsage, LockedJsonlAppender, MODEL_PRICING, type MatchResult, type MatcherResult, type MeasurementPolicy, type MergeOptions, type Message, type MetricSamples, type MetricVerdict, MetricsCollector, type MuffledFinder, type MuffledFinding, MultiLayerVerifier, type MultiToolchainLayerConfig, type MutateAdapter, type MutationAttempt, type MutationChannel, MutationTelemetry, type Mutator, Mutex, NoopResearcher, OTEL_AGENT_EVAL_SCOPE, type Objective, type OptimizationConfig, type OptimizationExample, OptimizationLoop, type OptimizationLoopConfig, type OptimizationLoopResult, type OptimizationResult, type Oracle, type OracleObservation, type OracleReport, type OracleResult, type OrthogonalityInput, type OrthogonalityResult, type OtlpExport, type OtlpResourceSpans, type OtlpSpan, type OutcomeFilter, type OutcomePair, type OutcomeStore, type PairedBootstrapOptions, type PairedBootstrapResult, type PairwiseComparison, PairwiseSteeringOptimizer, type ParetoFigureSpec, type ParetoPoint, type ParetoResult, type PersonaConfig, type Playbook, type PlaybookEntry, type PoolSlot, type PositionalBiasResult, type PreferenceMemoryEntry, type PrmGradedTrace, PrmGrader, type PrmTrainingSample, ProductClient, type 
ProductClientConfig, type ProjectKind, ProjectRegistry, type ProjectSummary, type ProjectTimelineEntry, type PromptEvolutionConfig, type PromptEvolutionEvent, type PromptEvolutionResult, type PromptHandle, PromptOptimizer, PromptRegistry, type TrialResult as PromptTrialResult, type PromptVariant$1 as PromptVariant, type ProposeFn, type ProposeInput, type ProposeOutput, type ProposeReviewConfig, type ProposeReviewControlAction, type ProposeReviewControlConfig, type ProposeReviewControlResult, type ProposeReviewControlState, type ProposeReviewReport, type ProposeReviewShot, type ProposedSideEffect, REDACTION_VERSION, type RedTeamCase, type RedTeamCategory, type RedTeamFinding, type RedTeamPayload, type RedTeamReport, type RedactionReport, type RedactionRule, type ReferenceMatchResult, type ReferenceReplayAdapter, type ReferenceReplayAdapterFn, type ReferenceReplayAdapterLike, type ReferenceReplayAggregate, type ReferenceReplayCandidate, type ReferenceReplayCase, type ReferenceReplayCaseRun, type ReferenceReplayExecutionScenario, type ReferenceReplayItem, type ReferenceReplayMatch, type ReferenceReplayMatchStrategy, type ReferenceReplayMatcher, type ReferenceReplayPromotionDecision, type ReferenceReplayPromotionPolicy, type ReferenceReplayRun, type ReferenceReplayRunContext, type ReferenceReplayRunOptions, type ReferenceReplayRunStore, type ReferenceReplayScenario, type ReferenceReplayScenarioScore, type ReferenceReplayScore, type ReferenceReplayScoreOptions, type ReferenceReplaySplit, type ReferenceReplaySplitComparison, type ReferenceReplaySteeringRowsOptions, type ReflectionContext, type ReflectionProposal, type RegressionOptions, type RegressionSpec, type Researcher, type RetrievalSpan, type Review, type ReviewFn, type ReviewInput, type ReviewMemoryEntry, type ReviewMemoryStore, type ReviewerMemoryEntry, type ReviewerOutput, type ReviewerPromptInput, type ReviewerSoftFailDefaults, type ReviewerVerificationSummary, type RobustnessResult, type RouteMap, type 
RubricDimension, type Run$1 as Run, type RunAppScenarioOptions, type RunCommandInput, type RunCommandResult, type RunConfig, RunCritic, type RunCriticOptions, type RunDiff, type RunFilter, type RunJudgeMetadata, type RunLayer, type RunOutcome, type RunRecord, RunRecordValidationError, type RunScore, type RunScoreWeights, type RunSplitTag, type RunStatus, type RunTokenUsage, type RunTrace, SEMANTIC_CONCEPT_JUDGE_VERSION, type SandboxDriver, SandboxHarness, type SandboxHarnessResult, type SandboxJudgeKind, type SandboxJudgeResult, type SandboxJudgeSpec, type SandboxPool, type SandboxResult, type SandboxSpan, type ScanOptions, type Scenario, type ScenarioAggregate, type ScenarioCost, type ScenarioFile, ScenarioRegistry, type ScenarioResult, type ScoreAdapter, type ScoredTarget, type SelfPlayOptions, type SelfPlayProposer, type SelfPlayScorer, type SelfPreferenceResult, type SemanticConceptJudgeInput, type SemanticConceptJudgeOptions, type SemanticConceptJudgeResult, type SeriesConvergenceOptions, type SeriesConvergenceResult, type Severity, type ShipOptions, type SignedManifest, type SliceOptions, type Slo, type SloCheckResult, type SloComparator, type SloReport, type SloSeverity, type SlopCategory, type SlotFactory, type Span, type SpanBase, type SpanFilter, type SpanHandle, type SpanKind, type SpanStatus, type SteeringBundle, type SteeringChange, type SteeringDelta, type SteeringEvaluation, type SteeringOptimizationResult, type SteeringOptimizationRow, type SteeringOptimizationSelector, type SteeringOptimizerBackend, type SteeringOptimizerConfig, type SteeringRolePrompt, type SteeringVariantReport, type StepAttribution, type StepContext, type StepRubric, type StopDecision, type StuckLoopFinding, type StuckLoopOptions, type StuckLoopReport, SubprocessSandboxDriver, type SubprocessSandboxDriverOptions, type SummaryTable, type SummaryTableOptions, type SummaryTableRow, type SynthesisReason, type SynthesisTarget, TRACE_SCHEMA_VERSION, type TestGradedRunOptions, type 
TestGradedRunResult, type TestGradedScenario, type TestOutputParser, type TestResult, type ThreeLayerProjectReport, type ThresholdContract, TokenCounter, type TokenSpec, type ToolSpan, type ToolStats, type ToolUseMetrics, type ToolUseOptions, type ToolWasteFinding, type ToolWasteOptions, type ToolWasteReport, TraceEmitter, type TraceEmitterOptions, type TraceEvent, type TraceStore, type Trajectory, type TrajectoryStep, type TrialAttempt, type TrialCache, TrialTelemetry, type TrialTrace, type Turn, type TurnMetrics, type TurnResult, UNIVERSAL_FINDERS, type UseCaseSignals, type ValidationContext, type ValidationIssue, type ValidationResult, type VariantAggregate, type VariantScore, type VerbosityBiasResult, type Verdict, type Verification, type VerificationReport, type VerifyContext, type VerifyFn, type VerifyOptions, type VisualDiffOptions, type VisualDiffResult, type ViteDeployRunnerInput, type WorkflowTopology, type WorkspaceAssertion, type WorkspaceAssertionResult, type WorkspaceInspector, type WorkspaceSnapshot, type WranglerDeployRunnerInput, adversarialJudge, aggregateLlm, aggregateRunScore, allCriticalPassed, analyzeAntiSlop, analyzeSeries, argHash, assignFeedbackSplit, attributeCounterfactuals, deterministicSplit as benchmarkDeterministicSplit, index as benchmarks, benjaminiHochberg, bhAdjust, bisect, bonferroni, bootstrapCi, budgetBreachView, buildReflectionPrompt, buildReviewerPrompt, buildTrajectory, byteLengthRange, calibrateJudge, calibrationCurve, callLlm, callLlmJson, canaryLeakView, causalAttribution, checkCanaries, checkSlos, clamp01, classifyEuAiRisk, classifyFailure, codeExecutionJudge, cohensD, coherenceJudge, collectionPreserved, commitBisect, compareReferenceReplay, compareToBaseline, compilerJudge, composeParsers, composeValidators, computeToolUseMetrics, confidenceInterval, containsAll, controlFailureClassFromVerification, controlRunToFeedbackTrajectory, correlateLayers, correlationStudy, createAntiSlopJudge, createCompositeMutator, 
createCustomJudge, createDefaultReviewer, createDomainExpertJudge, createFeedbackTrajectory, createIntentMatchJudge, createLlmReviewer, createSandboxCodeMutator, createSandboxPool, createSemanticConceptJudge, crossTraceDiff, crowdingDistance, decideReferenceReplayPromotion, decideReferenceReplayRunPromotion, defaultJudges, defaultReferenceReplayMatcher, deployGateLayer, distillPlaybook, dominates, estimateCost, estimateTokens, euAiActReport, evaluateActionPolicy, evaluateContract, evaluateHypothesis, evaluateOracles, executeScenario, expectAgent, exportRewardModel, exportRunAsOtlp, exportTrainingData, extractAssetUrls, extractErrorCount, failureClusterView, feedbackTrajectoriesToDatasetScenarios, feedbackTrajectoriesToOptimizerRows, feedbackTrajectoryToDatasetScenario, feedbackTrajectoryToOptimizerRow, fileContains, fileExists, findAutoMatchNoExpectation, findConstructorCwdDropped, findFallbackToPass, findLiteralTruePass, findSkipCountsAsPass, firstDivergenceView, flowLayer, formatBenchmarkReport, formatDriverReport, formatFindings, gainHistogram, precision as goldenPrecision, gradeSemanticStatus, groupBy, hashContent, hashScenarios, htmlContainsElement, inMemoryReferenceReplayStore, inMemoryReviewStore, interRaterReliability, iqr, isJudgeSpan, isLlmSpan, isPrmVerdict, isRetrievalSpan, isRunRecord, isSandboxSpan, isToolSpan, jestTestParser, jsonHasKeys, jsonShape, jsonlReferenceReplayStore, jsonlReviewStore, judgeAgreementView, judgeReplayGate, judgeSpans, keyPreserved, linterJudge, llmSpanFromProvider, llmSpans, loadScorerFromGrader, localCommandRunner, lowercaseMutator, mannWhitneyU, matchGoldens, mergeLayerResults, mergeSteeringBundle, multiToolchainLayer, nistAiRmfReport, nonRefusalRubric, normalizeScores, notBlocked, objectiveEval, outputLengthRubric, pairedBootstrap, pairedTTest, pairedWilcoxon, paraphraseRobustness, paretoChart, paretoFrontier, paretoFrontierWithCrowding, parseFeedbackTrajectoriesJsonl, parseReflectionResponse, parseRunRecordSafe, 
partialCredit, passOrthogonality, pixelDeltaRatio, politenessPrefixMutator, positionalBias, printDriverSummary, prmBestOfN, prmEnsembleBestOfN, probeLlm, promptBisect, proposeSynthesisTargets, pytestTestParser, redTeamDataset, redTeamReport, redactString, redactValue, referenceReplayRunsToSteeringRows, referenceReplayScenarioToRunScore, regexMatch, regexMatches, regressionView, renderMarkdown, renderMarkdownReport, renderPlaybookMarkdown, renderPreferenceMemoryMarkdown, renderSteeringText, replayFeedbackTrajectories, replayFeedbackTrajectory, replayScorerOverCorpus, replayTraceThroughJudge, requiredSampleSize, resetLockedAppendersForTesting, resumeBuilderSession, roundTripRunRecord, rowCount, rowWhere, runAgentControlLoop, runAssertions, runCanaries, runCounterfactual, runE2EWorkflow, runExpectations, runFailureClass, runHarnessExperiment, runIntentMatchJudge, runJudgeFleet, runKeywordCoverageJudge, runKeywordCoverageJudgeUrl, runPromptEvolution, runProposeReview, runProposeReviewAsControlLoop, runReferenceReplay, runSelfPlay, runSemanticConceptJudge, runTestGradedScenario, runsForScenario, scalarScore, scanForMuffledGates, scoreAllProjects, scoreContinuity, scoreProject, scoreRedTeamOutput, scoreReferenceReplay, securityJudge, selectHarnessVariant, selfPreference, sentenceReorderMutator, serializeFeedbackTrajectoriesJsonl, signManifest, soc2Report, statusAdvanced, stopOnNoProgress, stopOnRepeatedAction, stripFencedJson, stuckLoopView, subjectiveEval, summarize, summarizeHarnessResults, summarizePreferenceMemory, summaryTable, testJudge, textInSnapshot, toLangfuseEnvelope, toNdjson, toPrometheusText, toolIntentAlignmentRubric, toolNamesForRun, toolNonRedundantRubric, toolSpans, toolSuccessRubric, toolWasteView, typoMutator, urlContains, validateRunRecord, verbosityBias, verifyManifest, visualDiff, viteDeployRunner, vitestTestParser, weightedMean, weightedRecall, welchsTTest, whitespaceCollapseMutator, wilcoxonSignedRank, withAssignedFeedbackSplit, 
wranglerDeployRunner };
|
package/dist/index.js
CHANGED
|
@@ -2252,8 +2252,6 @@ async function finish(emitter, result) {
|
|
|
2252
2252
|
}
|
|
2253
2253
|
|
|
2254
2254
|
// src/feedback-trajectory.ts
|
|
2255
|
-
import { appendFile, mkdir, readFile } from "fs/promises";
|
|
2256
|
-
import { join } from "path";
|
|
2257
2255
|
var DEFAULT_SPLIT_POLICY = {
|
|
2258
2256
|
trainPct: 70,
|
|
2259
2257
|
devPct: 15,
|
|
@@ -2330,12 +2328,16 @@ var FileSystemFeedbackTrajectoryStore = class {
|
|
|
2330
2328
|
return next;
|
|
2331
2329
|
}
|
|
2332
2330
|
async append(record) {
|
|
2331
|
+
const { appendFile, mkdir } = await import("fs/promises");
|
|
2332
|
+
const { join: join3 } = await import("path");
|
|
2333
2333
|
await mkdir(this.dir, { recursive: true });
|
|
2334
|
-
await appendFile(
|
|
2334
|
+
await appendFile(join3(this.dir, "feedback-trajectories.ndjson"), JSON.stringify(record) + "\n", "utf8");
|
|
2335
2335
|
}
|
|
2336
2336
|
async load() {
|
|
2337
2337
|
if (this.loaded) return;
|
|
2338
|
-
const
|
|
2338
|
+
const { readFile } = await import("fs/promises");
|
|
2339
|
+
const { join: join3 } = await import("path");
|
|
2340
|
+
const file = join3(this.dir, "feedback-trajectories.ndjson");
|
|
2339
2341
|
try {
|
|
2340
2342
|
const raw = await readFile(file, "utf8");
|
|
2341
2343
|
for (const line of raw.split("\n")) {
|
|
@@ -2422,6 +2424,44 @@ function feedbackTrajectoryToOptimizerRow(trajectory) {
|
|
|
2422
2424
|
function feedbackTrajectoriesToOptimizerRows(trajectories) {
|
|
2423
2425
|
return trajectories.map(feedbackTrajectoryToOptimizerRow);
|
|
2424
2426
|
}
|
|
2427
|
+
/**
 * Replay a single feedback trajectory through an adapter.
 *
 * On success the adapter's result is returned with `trajectoryId` added
 * (fields on the adapter result are spread last, so they take precedence).
 * If the replay throws, the error is not propagated: a failing result is
 * returned instead, carrying a system "reject" label and a zero-score
 * outcome that both record the error message and share one timestamp.
 *
 * @param trajectory Trajectory to replay; its `id` is copied to the result.
 * @param adapter2 Object exposing `replay(trajectory)`.
 * @returns Promise of the replay result for this trajectory.
 */
async function replayFeedbackTrajectory(trajectory, adapter2) {
  try {
    const replayed = await adapter2.replay(trajectory);
    return { trajectoryId: trajectory.id, ...replayed };
  } catch (err) {
    const failedAt = new Date().toISOString();
    let message;
    if (err instanceof Error) {
      message = err.message;
    } else {
      message = String(err);
    }
    const rejection = {
      source: "system",
      kind: "reject",
      value: false,
      reason: message,
      severity: "error",
      createdAt: failedAt
    };
    const failedOutcome = {
      success: false,
      score: 0,
      detail: message,
      observedAt: failedAt
    };
    return {
      trajectoryId: trajectory.id,
      pass: false,
      labels: [rejection],
      outcome: failedOutcome,
      metadata: { replayError: true }
    };
  }
}
|
|
2458
|
+
/**
 * Replay several trajectories through the same adapter, one at a time.
 *
 * Each replay is awaited before the next one starts, and results are
 * returned in input order.
 *
 * @param trajectories Iterable of trajectories to replay.
 * @param adapter2 Adapter forwarded to `replayFeedbackTrajectory`.
 * @returns Promise of the per-trajectory replay results.
 */
async function replayFeedbackTrajectories(trajectories, adapter2) {
  const collected = [];
  for (const item of trajectories) {
    const replayed = await replayFeedbackTrajectory(item, adapter2);
    collected.push(replayed);
  }
  return collected;
}
|
|
2425
2465
|
function summarizePreferenceMemory(trajectories, options = {}) {
|
|
2426
2466
|
const maxEntries = options.maxEntries ?? 20;
|
|
2427
2467
|
const entries = [];
|
|
@@ -2585,6 +2625,69 @@ function canonicalize(value) {
|
|
|
2585
2625
|
return out;
|
|
2586
2626
|
}
|
|
2587
2627
|
|
|
2628
|
+
// src/action-policy.ts
|
|
2629
|
+
/**
 * Decide whether a proposed action may run under an execution policy.
 *
 * Checks are independent and cumulative: every rule that fires adds a
 * human-readable entry to `reasons`. Type allow/block lists, cost ceilings,
 * and the "expected outcome" / "kill criteria" requirements block the
 * action; approval rules only flag it as needing approval. A blocked action
 * is never reported as `requiresApproval`.
 *
 * Cost gates fail closed: if a cost limit is configured and `action.costUsd`
 * is present but coerces to NaN, the limit counts as exceeded. Otherwise
 * `cost > limit` would be false for NaN and the action would pass every cost
 * gate unchecked. A missing cost (`null`/`undefined`) is still treated as 0.
 *
 * @param action Proposed action. Reads `type`, `requiresApproval`,
 *   `externalSideEffect`, `costUsd`, `metadata.expectedOutcome`, and
 *   `metadata.killCriteria`.
 * @param policy Policy to apply. All fields are optional; an empty policy
 *   allows everything.
 * @param options `createdAt` overrides the label timestamp (defaults to now).
 * @returns `{ allowed, blocked, requiresApproval, reasons, label }`, where
 *   `label` is a policy feedback label when the action is blocked or needs
 *   approval, and `undefined` otherwise.
 */
function evaluateActionPolicy(action, policy = {}, options = {}) {
  const reasons = [];
  let blocked = false;
  let requiresApproval = Boolean(action.requiresApproval);

  const cost = action.costUsd ?? 0;
  // True when the limit is configured and the cost either exceeds it or
  // cannot be compared against it at all (NaN after numeric coercion).
  const overLimit = (limit) => limit !== void 0 && (cost > limit || Number.isNaN(Number(cost)));
  const costReason = (limitName, limit) => cost > limit
    ? `cost ${action.costUsd} exceeds ${limitName} ${limit}`
    : `cost ${action.costUsd} is not a valid number, so ${limitName} ${limit} cannot be verified`;

  if (policy.allowedTypes?.length && !policy.allowedTypes.includes(action.type)) {
    blocked = true;
    reasons.push(`action type "${action.type}" is not allowed`);
  }
  if (policy.blockedTypes?.includes(action.type)) {
    blocked = true;
    reasons.push(`action type "${action.type}" is blocked`);
  }
  if (policy.alwaysRequireApprovalTypes?.includes(action.type)) {
    requiresApproval = true;
    reasons.push(`action type "${action.type}" requires approval`);
  }
  if (policy.requireApprovalForExternalSideEffects && action.externalSideEffect) {
    requiresApproval = true;
    reasons.push("external side effect requires approval");
  }
  if (overLimit(policy.requireApprovalAboveCostUsd)) {
    requiresApproval = true;
    reasons.push(costReason("approval threshold", policy.requireApprovalAboveCostUsd));
  }
  if (overLimit(policy.maxActionCostUsd)) {
    blocked = true;
    reasons.push(costReason("max action cost", policy.maxActionCostUsd));
  }
  if (overLimit(policy.remainingBudgetUsd)) {
    blocked = true;
    reasons.push(costReason("remaining budget", policy.remainingBudgetUsd));
  }
  if (policy.expectedOutcomeRequired && !action.metadata?.expectedOutcome) {
    blocked = true;
    reasons.push("expected outcome is required");
  }
  if (policy.killCriteriaRequired && !action.metadata?.killCriteria) {
    blocked = true;
    reasons.push("kill criteria are required");
  }
  // Auto-approval never clears an approval requirement raised above; it
  // only records why the auto-approve listing did not apply.
  if (policy.autoApproveTypes?.includes(action.type) && requiresApproval) {
    reasons.push(`action type "${action.type}" is auto-approved only when no approval policy applies`);
  }
  if (!reasons.length) reasons.push(requiresApproval ? "approval required" : "action allowed");

  const label = blocked || requiresApproval ? {
    source: "policy",
    kind: blocked ? "policy_block" : "comment",
    value: { actionType: action.type, blocked, requiresApproval },
    reason: reasons.join("; "),
    severity: blocked ? "critical" : "warning",
    createdAt: options.createdAt ?? (/* @__PURE__ */ new Date()).toISOString(),
    metadata: { action, policy }
  } : void 0;

  return {
    allowed: !blocked,
    blocked,
    // Approval is moot once the action is blocked outright.
    requiresApproval: !blocked && requiresApproval,
    reasons,
    label
  };
}
|
|
2690
|
+
|
|
2588
2691
|
// src/prompt-registry.ts
|
|
2589
2692
|
var PromptRegistry = class {
|
|
2590
2693
|
entries = /* @__PURE__ */ new Map();
|
|
@@ -6382,7 +6485,7 @@ function assertNonNegative(n, name) {
|
|
|
6382
6485
|
|
|
6383
6486
|
// src/muffled-gate-scanner.ts
|
|
6384
6487
|
import { readFileSync as readFileSync2, existsSync as existsSync2, readdirSync, statSync } from "fs";
|
|
6385
|
-
import { join
|
|
6488
|
+
import { join } from "path";
|
|
6386
6489
|
function codeOf(line) {
|
|
6387
6490
|
return line.replace(/\/\/.*$/, "").replace(/^\s*\*.*$/, "");
|
|
6388
6491
|
}
|
|
@@ -6486,11 +6589,11 @@ var UNIVERSAL_FINDERS = [
|
|
|
6486
6589
|
function autoDeriveImporters(repoRoot, roots, extensions, importsContain) {
|
|
6487
6590
|
const matches2 = [];
|
|
6488
6591
|
const walk = (rel) => {
|
|
6489
|
-
const abs =
|
|
6592
|
+
const abs = join(repoRoot, rel);
|
|
6490
6593
|
if (!existsSync2(abs)) return;
|
|
6491
6594
|
for (const entry of readdirSync(abs)) {
|
|
6492
|
-
const sub =
|
|
6493
|
-
const subAbs =
|
|
6595
|
+
const sub = join(rel, entry);
|
|
6596
|
+
const subAbs = join(repoRoot, sub);
|
|
6494
6597
|
let st;
|
|
6495
6598
|
try {
|
|
6496
6599
|
st = statSync(subAbs);
|
|
@@ -6519,7 +6622,7 @@ function scanForMuffledGates(opts) {
|
|
|
6519
6622
|
const findings = [];
|
|
6520
6623
|
const scanned = /* @__PURE__ */ new Set();
|
|
6521
6624
|
for (const file of opts.scanFiles) {
|
|
6522
|
-
const abs =
|
|
6625
|
+
const abs = join(opts.repoRoot, file);
|
|
6523
6626
|
if (!existsSync2(abs)) continue;
|
|
6524
6627
|
const text = readFileSync2(abs, "utf8");
|
|
6525
6628
|
for (const find of opts.finders) findings.push(...find(file, text));
|
|
@@ -6534,7 +6637,7 @@ function scanForMuffledGates(opts) {
|
|
|
6534
6637
|
);
|
|
6535
6638
|
for (const file of importers) {
|
|
6536
6639
|
if (scanned.has(file)) continue;
|
|
6537
|
-
const abs =
|
|
6640
|
+
const abs = join(opts.repoRoot, file);
|
|
6538
6641
|
if (!existsSync2(abs)) continue;
|
|
6539
6642
|
const text = readFileSync2(abs, "utf8");
|
|
6540
6643
|
for (const find of opts.autoDerive.universalFinders) findings.push(...find(file, text));
|
|
@@ -8522,7 +8625,7 @@ async function commitBisect(options) {
|
|
|
8522
8625
|
}
|
|
8523
8626
|
async function promptBisect(options) {
|
|
8524
8627
|
const split = options.paragraphSplitter ?? ((p) => p.split(/\n\s*\n/));
|
|
8525
|
-
const
|
|
8628
|
+
const join3 = (paragraphs) => paragraphs.join("\n\n");
|
|
8526
8629
|
const goodParas = split(options.good);
|
|
8527
8630
|
const badParas = split(options.bad);
|
|
8528
8631
|
if (goodParas.length !== badParas.length) {
|
|
@@ -8540,7 +8643,7 @@ async function promptBisect(options) {
|
|
|
8540
8643
|
const result = await bisect({
|
|
8541
8644
|
good: goodMask,
|
|
8542
8645
|
bad: badMask,
|
|
8543
|
-
runEval: (mask) => options.runEval(
|
|
8646
|
+
runEval: (mask) => options.runEval(join3(paragraphsFor(mask))),
|
|
8544
8647
|
maxIterations: options.maxIterations ?? n + 5,
|
|
8545
8648
|
halfway: (g, b) => {
|
|
8546
8649
|
for (let i = 0; i < g.length; i++) {
|
|
@@ -8571,12 +8674,12 @@ async function promptBisect(options) {
|
|
|
8571
8674
|
}
|
|
8572
8675
|
}
|
|
8573
8676
|
const materializedPath = result.path.map((s) => ({
|
|
8574
|
-
state:
|
|
8677
|
+
state: join3(paragraphsFor(s.state)),
|
|
8575
8678
|
score: s.score,
|
|
8576
8679
|
pass: s.pass
|
|
8577
8680
|
}));
|
|
8578
8681
|
return {
|
|
8579
|
-
culprit:
|
|
8682
|
+
culprit: join3(paragraphsFor(culprit)),
|
|
8580
8683
|
path: materializedPath,
|
|
8581
8684
|
converged: result.converged,
|
|
8582
8685
|
inputInconsistent: result.inputInconsistent,
|
|
@@ -9631,7 +9734,7 @@ function mergeSignals(a, b) {
|
|
|
9631
9734
|
// src/command-runner.ts
|
|
9632
9735
|
import { spawnSync } from "child_process";
|
|
9633
9736
|
import { existsSync as existsSync3, readFileSync as readFileSync3, readdirSync as readdirSync2, statSync as statSync2 } from "fs";
|
|
9634
|
-
import { join as
|
|
9737
|
+
import { join as join2 } from "path";
|
|
9635
9738
|
var localCommandRunner = {
|
|
9636
9739
|
name: "local",
|
|
9637
9740
|
async run(input) {
|
|
@@ -9678,7 +9781,7 @@ var localCommandRunner = {
|
|
|
9678
9781
|
const out = [];
|
|
9679
9782
|
for (const name of entries) {
|
|
9680
9783
|
try {
|
|
9681
|
-
const st = statSync2(
|
|
9784
|
+
const st = statSync2(join2(path, name));
|
|
9682
9785
|
out.push({
|
|
9683
9786
|
name,
|
|
9684
9787
|
isDirectory: st.isDirectory(),
|
|
@@ -13715,6 +13818,7 @@ export {
|
|
|
13715
13818
|
estimateCost,
|
|
13716
13819
|
estimateTokens,
|
|
13717
13820
|
euAiActReport,
|
|
13821
|
+
evaluateActionPolicy,
|
|
13718
13822
|
evaluateContract,
|
|
13719
13823
|
evaluateHypothesis,
|
|
13720
13824
|
evaluateOracles,
|
|
@@ -13822,6 +13926,8 @@ export {
|
|
|
13822
13926
|
renderPlaybookMarkdown,
|
|
13823
13927
|
renderPreferenceMemoryMarkdown,
|
|
13824
13928
|
renderSteeringText,
|
|
13929
|
+
replayFeedbackTrajectories,
|
|
13930
|
+
replayFeedbackTrajectory,
|
|
13825
13931
|
replayScorerOverCorpus,
|
|
13826
13932
|
replayTraceThroughJudge,
|
|
13827
13933
|
requiredSampleSize,
|