@tangle-network/agent-eval 0.5.0 → 0.6.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.d.ts CHANGED
@@ -3205,4 +3205,931 @@ declare class ProjectRegistry {
3205
3205
  projectChats(projectId: string): Promise<ChatSummary[]>;
3206
3206
  }
3207
3207
 
3208
- export { AgentDriver, type AgentDriverConfig, type AntiSlopConfig, type AntiSlopIssue, type AntiSlopReport, type Artifact, type ArtifactCheck, type Artifact$1 as ArtifactCheckArtifact, type ArtifactResult, type ArtifactValidator, type BaselineOptions, type BaselineReport, BehaviorAssertion, type BenchmarkReport, BenchmarkRunner, type BenchmarkRunnerConfig, BudgetBreachError, type BudgetBreachFinding, type BudgetBreachReport, BudgetGuard, type BudgetLedgerEntry, type BudgetSpec, BuilderSession, type BuilderSessionInit, type CalibrationResult, CallExpectation, type CanaryLeak, type CandidateScore, type ChatSummary, type CheckResult, type CollectedArtifacts, type CompletionCriterion, type ContinuityCheck, type ContinuityCheckResult, type ContinuityReport, type ContinuitySnapshotPair, type ContractMetric, type ContractReport, ConvergenceTracker, type CorrelationReport, type CostEntry, type CostSummary, CostTracker, DEFAULT_AGENT_SLOS, DEFAULT_RULES as DEFAULT_FAILURE_RULES, DEFAULT_MUTATORS, DEFAULT_REDACTION_RULES, DEFAULT_RED_TEAM_CORPUS, Dataset, type DatasetDifficulty, type DatasetManifest, type DatasetProvenance, type DatasetScenario, type DatasetSplit, type Direction, type DivergenceOptions, type DivergenceReport, DockerSandboxDriver, type DriverResult, type DriverState, DualAgentBench, type DualAgentBenchConfig, type DualAgentReport, type DualAgentRound, type DualAgentScenario, type DualAgentScenarioResult, type EvalResult, type EventFilter, type EventKind, type ExecutorConfig, type Expectation, type Experiment, type Run$1 as ExperimentRun, type ExperimentStore, ExperimentTracker, FAILURE_CLASSES, type FailureClass, type FailureClassification, type FailureCluster, type FailureClusterReport, type FailureContext, type FailureRule, type FeedbackPattern, FileSystemTraceStore, type FileSystemTraceStoreOptions, type GenericSpan, type GoldenItem, type HarnessConfig, HoldoutAuditor, HoldoutLockedError, type ImageData, InMemoryExperimentStore, InMemoryTraceStore, 
InMemoryWorkspaceInspector, type InspectorContext, type JudgeAgreementReport, type JudgeConfig, type JudgeFn, type JudgeInput, type JudgePair, type JudgeReplayResult, type JudgeRubric, type JudgeScore, type JudgeSpan, type LangfuseEnvelope, type LangfuseGeneration, type LangfuseScore, type LayerCorrelation, type LlmSpan, MODEL_PRICING, type MatcherResult, type Message, type MetricSamples, type MetricVerdict, MetricsCollector, type Mutator, OTEL_AGENT_EVAL_SCOPE, type Objective, type OptimizationConfig, type OptimizationResult, type Oracle, type OracleObservation, type OracleReport, type OracleResult, type OtlpExport, type OtlpResourceSpans, type OtlpSpan, type PairwiseComparison, type ParetoResult, type PersonaConfig, type PositionalBiasResult, ProductClient, type ProductClientConfig, ProjectRegistry, type ProjectSummary, type ProjectTimelineEntry, type PromptHandle, PromptOptimizer, PromptRegistry, type PromptVariant, REDACTION_VERSION, type RedTeamCase, type RedTeamCategory, type RedTeamFinding, type RedTeamPayload, type RedTeamReport, type RedactionReport, type RedactionRule, type RegressionOptions, type RegressionSpec, type RetrievalSpan, type RobustnessResult, type RouteMap, type RubricDimension, type Run, type RunAppScenarioOptions, type RunConfig, type RunDiff, type RunFilter, type RunLayer, type RunOutcome, type RunStatus, type SandboxDriver, SandboxHarness, type SandboxHarnessResult, type SandboxResult, type SandboxSpan, type Scenario, type ScenarioCost, type ScenarioFile, ScenarioRegistry, type ScenarioResult, type SelfPreferenceResult, type SeriesConvergenceOptions, type SeriesConvergenceResult, type ShipOptions, type SliceOptions, type Slo, type SloCheckResult, type SloComparator, type SloReport, type SloSeverity, type SlopCategory, type Span, type SpanBase, type SpanFilter, type SpanHandle, type SpanKind, type SpanStatus, type StuckLoopFinding, type StuckLoopOptions, type StuckLoopReport, SubprocessSandboxDriver, TRACE_SCHEMA_VERSION, type 
TestGradedRunOptions, type TestGradedRunResult, type TestGradedScenario, type TestOutputParser, type TestResult, type ThreeLayerProjectReport, type ThresholdContract, TokenCounter, type TokenSpec, type ToolSpan, type ToolStats, type ToolUseMetrics, type ToolUseOptions, type ToolWasteFinding, type ToolWasteOptions, type ToolWasteReport, TraceEmitter, type TraceEmitterOptions, type TraceEvent, type TraceStore, type Trajectory, type TrajectoryStep, type Turn, type TurnMetrics, type TurnResult, type ValidationContext, type ValidationIssue, type ValidationResult, type VariantScore, type VerbosityBiasResult, type VisualDiffOptions, type VisualDiffResult, type WorkspaceAssertion, type WorkspaceAssertionResult, type WorkspaceInspector, type WorkspaceSnapshot, adversarialJudge, aggregateLlm, analyzeAntiSlop, analyzeSeries, argHash, benjaminiHochberg, bonferroni, budgetBreachView, buildTrajectory, byteLengthRange, calibrateJudge, canaryLeakView, checkCanaries, checkSlos, classifyFailure, codeExecutionJudge, cohensD, coherenceJudge, collectionPreserved, compareToBaseline, composeParsers, composeValidators, computeToolUseMetrics, confidenceInterval, containsAll, correlateLayers, createAntiSlopJudge, createCustomJudge, createDomainExpertJudge, defaultJudges, dominates, estimateCost, estimateTokens, evaluateContract, evaluateOracles, executeScenario, expectAgent, exportRunAsOtlp, failureClusterView, fileContains, fileExists, firstDivergenceView, formatBenchmarkReport, formatDriverReport, groupBy, hashContent, hashScenarios, interRaterReliability, iqr, isJudgeSpan, isLlmSpan, isRetrievalSpan, isSandboxSpan, isToolSpan, jestTestParser, jsonHasKeys, jsonShape, judgeAgreementView, judgeSpans, keyPreserved, llmSpanFromProvider, llmSpans, lowercaseMutator, mannWhitneyU, normalizeScores, notBlocked, pairedTTest, paraphraseRobustness, paretoFrontier, partialCredit, pixelDeltaRatio, politenessPrefixMutator, positionalBias, printDriverSummary, pytestTestParser, redTeamDataset, 
redTeamReport, redactString, redactValue, regexMatch, regexMatches, regressionView, renderMarkdownReport, replayTraceThroughJudge, requiredSampleSize, resumeBuilderSession, rowCount, rowWhere, runAssertions, runE2EWorkflow, runExpectations, runFailureClass, runTestGradedScenario, runsForScenario, scoreAllProjects, scoreContinuity, scoreProject, scoreRedTeamOutput, selfPreference, sentenceReorderMutator, statusAdvanced, stuckLoopView, textInSnapshot, toLangfuseEnvelope, toPrometheusText, toolNamesForRun, toolSpans, toolWasteView, typoMutator, urlContains, verbosityBias, visualDiff, vitestTestParser, weightedMean, welchsTTest, whitespaceCollapseMutator, wilcoxonSignedRank };
3208
+ /**
3209
+ * OutcomeStore — deployment outcomes attached to Run IDs.
3210
+ *
3211
+ * Outcomes arrive asynchronously from production telemetry after the
3212
+ * eval run completed: user ratings, retention flags, conversion events,
3213
+ * revenue, support-ticket rate, anything a product team can measure.
3214
+ * The store is a peer to TraceStore — separate lifecycle, same runId
3215
+ * foreign key.
3216
+ *
3217
+ * The whole point of this module is to make the meta-eval correlation
3218
+ * question computable: `correlate(evalMetric, outcomeMetric) → r, ρ, n, CI`.
3219
+ */
3220
+ interface DeploymentOutcome {
3221
+ runId: string;
3222
+ capturedAt: number;
3223
+ /** Numeric outcomes keyed by name — retention_7d, csat, revenue_usd, etc. */
3224
+ metrics: Record<string, number>;
3225
+ /** Dimensions for stratified analysis — cohort, region, user_segment. */
3226
+ labels?: Record<string, string>;
3227
+ /** Free-form provenance (source system, pipeline version). */
3228
+ source?: string;
3229
+ }
3230
+ interface OutcomeFilter {
3231
+ runIds?: string[];
3232
+ since?: number;
3233
+ until?: number;
3234
+ label?: {
3235
+ key: string;
3236
+ value: string;
3237
+ };
3238
+ source?: string;
3239
+ }
3240
+ interface OutcomeStore {
3241
+ append(outcome: DeploymentOutcome): Promise<void>;
3242
+ /** All outcomes attached to this run (a single run can have many — multiple
3243
+ * capture windows over deployment time). */
3244
+ forRun(runId: string): Promise<DeploymentOutcome[]>;
3245
+ list(filter?: OutcomeFilter): Promise<DeploymentOutcome[]>;
3246
+ }
3247
+ declare class InMemoryOutcomeStore implements OutcomeStore {
3248
+ private items;
3249
+ append(outcome: DeploymentOutcome): Promise<void>;
3250
+ forRun(runId: string): Promise<DeploymentOutcome[]>;
3251
+ list(filter?: OutcomeFilter): Promise<DeploymentOutcome[]>;
3252
+ }
3253
+ interface FileSystemOutcomeStoreOptions {
3254
+ dir: string;
3255
+ maxBytes?: number;
3256
+ }
3257
+ declare class FileSystemOutcomeStore implements OutcomeStore {
3258
+ private dir;
3259
+ private maxBytes;
3260
+ private memo?;
3261
+ private loaded;
3262
+ constructor(options: FileSystemOutcomeStoreOptions);
3263
+ private ensureDir;
3264
+ append(outcome: DeploymentOutcome): Promise<void>;
3265
+ private load;
3266
+ forRun(runId: string): Promise<DeploymentOutcome[]>;
3267
+ list(filter?: OutcomeFilter): Promise<DeploymentOutcome[]>;
3268
+ }
3269
+
3270
+ /**
3271
+ * Correlation study — "does our eval score predict real-world outcomes?"
3272
+ *
3273
+ * This is the load-bearing signal. Takes a TraceStore + OutcomeStore,
3274
+ * joins on runId, computes Pearson + Spearman + bootstrap CI for every
3275
+ * (evalMetric, outcomeMetric) pair the caller declares.
3276
+ *
3277
+ * Without this number the framework is ornamental. With it and r > 0.6
3278
+ * the framework is a moat — no other agent-eval tool publishes one.
3279
+ */
3280
+
3281
+ interface EvalMetricSpec {
3282
+ id: string;
3283
+ /** Extract a scalar from a run (defaults cover score/pass/durationMs/costUsd/tokens). */
3284
+ extract?: (run: Run, store: TraceStore) => Promise<number | null>;
3285
+ }
3286
+ interface OutcomePair {
3287
+ evalMetric: string;
3288
+ outcomeMetric: string;
3289
+ }
3290
+ interface CorrelationResult {
3291
+ evalMetric: string;
3292
+ outcomeMetric: string;
3293
+ n: number;
3294
+ pearson: number;
3295
+ spearman: number;
3296
+ /** 95% bootstrap CI for Pearson. */
3297
+ pearsonCi95: {
3298
+ lower: number;
3299
+ upper: number;
3300
+ };
3301
+ /** Rough verdict: 'strong' ≥ 0.7, 'moderate' ≥ 0.4, else 'weak'. */
3302
+ verdict: 'strong' | 'moderate' | 'weak';
3303
+ }
3304
+ interface CorrelationStudyResult {
3305
+ pairs: CorrelationResult[];
3306
+ joinedSamples: number;
3307
+ skippedRuns: number;
3308
+ }
3309
+ interface CorrelationStudyOptions {
3310
+ /** Only join outcomes captured within this window after run.startedAt. */
3311
+ maxCaptureLagMs?: number;
3312
+ /** Restrict to a subset of outcomes (cohort, region, source). */
3313
+ outcomeFilter?: OutcomeFilter;
3314
+ /** How to reduce multiple outcomes for the same run to a single value. Default 'latest'. */
3315
+ reduction?: 'latest' | 'mean' | 'max';
3316
+ /** Bootstrap iterations for the CI. Default 500. */
3317
+ bootstrapIterations?: number;
3318
+ }
3319
+ declare function correlationStudy(traceStore: TraceStore, outcomeStore: OutcomeStore, evalMetrics: EvalMetricSpec[], outcomeMetricNames: string[], options?: CorrelationStudyOptions): Promise<CorrelationStudyResult>;
3320
+
3321
+ /**
3322
+ * Calibration curve — binned "if eval says X, what does reality show?"
3323
+ *
3324
+ * Companion to correlationStudy. Raw correlation is a single number;
3325
+ * the calibration curve shows *where* the eval is well-calibrated vs
3326
+ * overconfident / underconfident. Buckets the eval metric, computes
3327
+ * mean outcome per bucket, reports expected-calibration-error (ECE).
3328
+ */
3329
+
3330
+ interface CalibrationBin {
3331
+ lower: number;
3332
+ upper: number;
3333
+ n: number;
3334
+ evalMean: number;
3335
+ outcomeMean: number;
3336
+ /** |outcomeMean − evalMean|; contributes to ECE weighted by n/total. */
3337
+ gap: number;
3338
+ }
3339
+ interface CalibrationReport {
3340
+ evalMetric: string;
3341
+ outcomeMetric: string;
3342
+ n: number;
3343
+ bins: CalibrationBin[];
3344
+ /** Expected Calibration Error — Σ (n_i/N) × |outcomeMean_i − evalMean_i|. */
3345
+ ece: number;
3346
+ /** Max bin gap — upper bound on miscalibration. */
3347
+ maxGap: number;
3348
+ }
3349
+ interface CalibrationOptions {
3350
+ bins?: number;
3351
+ /** Equal-width (fixed bin edges) or equal-frequency (quantile bins). */
3352
+ binning?: 'equal-width' | 'equal-frequency';
3353
+ /** Clip eval values to [lo, hi] before binning. */
3354
+ range?: {
3355
+ lo: number;
3356
+ hi: number;
3357
+ };
3358
+ }
3359
+ declare function calibrationCurve(traceStore: TraceStore, outcomeStore: OutcomeStore, evalMetric: EvalMetricSpec, outcomeMetric: string, options?: CalibrationOptions): Promise<CalibrationReport | null>;
3360
+
3361
+ /**
3362
+ * Process Reward Modeling — per-step rubric grading.
3363
+ *
3364
+ * A StepRubric inspects one span and returns a score + rationale.
3365
+ * PrmGrader applies an array of rubrics to every LLM span in a
3366
+ * trajectory (consumers can broaden to tool/retrieval spans via the
3367
+ * `kinds` filter on each rubric).
3368
+ *
3369
+ * Why this matters: outcome-only eval (did the final artifact work?)
3370
+ * gives sparse reward — most agent turns are unattributable. PRMs
3371
+ * densify the signal so optimizers and RL fine-tuning can assign
3372
+ * credit per turn.
3373
+ */
3374
+
3375
+ interface StepContext {
3376
+ trajectory: Trajectory;
3377
+ step: TrajectoryStep;
3378
+ /** Steps preceding `step` in trajectory order. */
3379
+ prior: TrajectoryStep[];
3380
+ /** Steps following `step`. */
3381
+ next: TrajectoryStep[];
3382
+ }
3383
+ interface StepRubric {
3384
+ id: string;
3385
+ /** Only grade spans of these kinds (default: all). */
3386
+ kinds?: Array<Span['kind']>;
3387
+ /** Weight in the aggregate score. Default 1. */
3388
+ weight?: number;
3389
+ /** Returns score in 0..1 + optional rationale/evidence. Return `null` to
3390
+ * skip grading (rubric doesn't apply to this step). */
3391
+ grade: (ctx: StepContext) => Promise<{
3392
+ score: number;
3393
+ rationale?: string;
3394
+ evidence?: string;
3395
+ } | null>;
3396
+ }
3397
+ interface GradedStep {
3398
+ spanId: string;
3399
+ rubricId: string;
3400
+ score: number;
3401
+ weight: number;
3402
+ rationale?: string;
3403
+ evidence?: string;
3404
+ }
3405
+ interface PrmGradedTrace {
3406
+ runId: string;
3407
+ steps: GradedStep[];
3408
+ /** Weighted mean of all graded steps; 0..1. */
3409
+ aggregateScore: number;
3410
+ /** Number of spans graded — useful for sanity-checking coverage. */
3411
+ gradedCount: number;
3412
+ /** Number of spans in the trajectory that no rubric matched. */
3413
+ ungradedCount: number;
3414
+ }
3415
+ declare class PrmGrader {
3416
+ private rubrics;
3417
+ constructor(rubrics: StepRubric[]);
3418
+ /**
3419
+ * Grade every eligible span in a run. Emits a JudgeVerdict span for each
3420
+ * (rubric × span) verdict so the result is visible to downstream pipelines
3421
+ * (judgeAgreementView, etc.) — PRM is just "a judge that runs per span."
3422
+ */
3423
+ grade(store: TraceStore, runId: string): Promise<PrmGradedTrace>;
3424
+ }
3425
+ /** Helper: identifies JudgeVerdict spans that PRM emitted so downstream pipelines
3426
+ * can distinguish PRM verdicts from human or top-level LLM judges. */
3427
+ declare function isPrmVerdict(verdict: JudgeSpan): boolean;
3428
+
3429
+ /**
3430
+ * Built-in reference rubrics. Consumers combine these with domain
3431
+ * rubrics. All are deterministic, rule-based — cheap to run + easy
3432
+ * to unit-test. LLM-based rubrics are trivially authored by
3433
+ * following the StepRubric contract.
3434
+ */
3435
+
3436
+ /** Penalize very short or very long assistant outputs. */
3437
+ declare function outputLengthRubric(args?: {
3438
+ minChars?: number;
3439
+ maxChars?: number;
3440
+ weight?: number;
3441
+ }): StepRubric;
3442
+ /** Reward tool calls that succeeded (status='ok') with an informative result. */
3443
+ declare function toolSuccessRubric(args?: {
3444
+ weight?: number;
3445
+ }): StepRubric;
3446
+ /** Penalize tool calls that duplicate a prior call with identical args. */
3447
+ declare function toolNonRedundantRubric(args?: {
3448
+ weight?: number;
3449
+ }): StepRubric;
3450
+ /** Penalize LLM outputs that contain common refusal markers when a refusal
3451
+ * is NOT expected (caller inverts weight for scenarios where refusal IS expected). */
3452
+ declare function nonRefusalRubric(args?: {
3453
+ markers?: RegExp[];
3454
+ weight?: number;
3455
+ }): StepRubric;
3456
+ /** Reward outputs that invoke the next-step tool the trajectory actually uses
3457
+ * (i.e. the LLM span announced "I will call X" and the following tool span IS X). */
3458
+ declare function toolIntentAlignmentRubric(args?: {
3459
+ weight?: number;
3460
+ }): StepRubric;
3461
+
3462
+ /**
3463
+ * Export PRM-graded traces as training data for downstream reward-model
3464
+ * fine-tuning. Canonical format is NDJSON of
3465
+ * `{ trajectory_text, step_index, rubric, score }` so a small model can
3466
+ * learn to predict step rewards from step context.
3467
+ *
3468
+ * The framework doesn't train the model — we emit the data; callers
3469
+ * plug it into their preferred trainer (TRL, Unsloth, custom).
3470
+ */
3471
+
3472
+ interface PrmTrainingSample {
3473
+ runId: string;
3474
+ spanId: string;
3475
+ rubricId: string;
3476
+ score: number;
3477
+ /** Serialized step context — step + surrounding conversation. */
3478
+ context: {
3479
+ priorTurns: Array<{
3480
+ role: string;
3481
+ content: string;
3482
+ }>;
3483
+ step: {
3484
+ kind: Span['kind'];
3485
+ text: string;
3486
+ };
3487
+ };
3488
+ /** Optional evidence + rationale for auditability. */
3489
+ rationale?: string;
3490
+ evidence?: string;
3491
+ }
3492
+ declare function exportTrainingData(store: TraceStore, graded: PrmGradedTrace[], options?: {
3493
+ contextWindow?: number;
3494
+ }): Promise<PrmTrainingSample[]>;
3495
+ /** NDJSON serialization — write to file or stream directly to a trainer. */
3496
+ declare function toNdjson(samples: PrmTrainingSample[]): string;
3497
+
3498
+ /**
3499
+ * Inference-time PRM scoring — pick the best of N candidate trajectories
3500
+ * using a trained reward model (or a rule-based PRM as a proxy).
3501
+ *
3502
+ * The canonical Best-of-N pattern: generate N completions, score each
3503
+ * with a PRM, pick the winner. Here the scoring loop is framework-agnostic
3504
+ * — supply a TraceStore + PrmGrader + N run IDs → get ranking + winner.
3505
+ */
3506
+
3507
+ interface BestOfNResult {
3508
+ winner: PrmGradedTrace;
3509
+ ranked: PrmGradedTrace[];
3510
+ /** Standard deviation of aggregate scores — small = candidates were homogeneous. */
3511
+ stdDev: number;
3512
+ }
3513
+ declare function prmBestOfN(store: TraceStore, grader: PrmGrader, runIds: string[]): Promise<BestOfNResult>;
3514
+ /**
3515
+ * Weighted vote across multiple graders — use when you want a PRM ensemble
3516
+ * (e.g. rule-based + LLM-based + trained model). Each grader produces its
3517
+ * own ranking; we aggregate via rank-sum (Borda count) so no single grader
3518
+ * dominates via a different score scale.
3519
+ */
3520
+ declare function prmEnsembleBestOfN(store: TraceStore, graders: PrmGrader[], runIds: string[]): Promise<BestOfNResult>;
3521
+
3522
+ /**
3523
+ * Bisector — auto-locate the change that introduced an eval regression.
3524
+ *
3525
+ * Two shapes:
3526
+ * - `commitBisect` — walk an ordered SHA list, binary-search for the
3527
+ * first commit that fails.
3528
+ * - `promptBisect` — given a good and bad prompt, progressively port
3529
+ * paragraphs from good→bad to localize the breaking change.
3530
+ *
3531
+ * Generic `bisect<T>` lets callers drive any ordered state space
3532
+ * (dataset versions, config files, CLI flag combinations).
3533
+ */
3534
+ interface BisectOptions<T> {
3535
+ /** State known to pass. */
3536
+ good: T;
3537
+ /** State known to fail. */
3538
+ bad: T;
3539
+ /** Equality test on state values — default Object.is. */
3540
+ equals?: (a: T, b: T) => boolean;
3541
+ /** Pick the halfway state between good + bad. Return null when no further
3542
+ * split is possible (e.g. adjacent commits). */
3543
+ halfway: (good: T, bad: T) => T | null;
3544
+ /** Produce a verdict for a state. */
3545
+ runEval: (state: T) => Promise<{
3546
+ score: number;
3547
+ pass: boolean;
3548
+ }>;
3549
+ /** Hard cap on iterations (default 40 — covers ~1T ordered states). */
3550
+ maxIterations?: number;
3551
+ }
3552
+ interface BisectStep<T> {
3553
+ state: T;
3554
+ score: number;
3555
+ pass: boolean;
3556
+ }
3557
+ interface BisectResult<T> {
3558
+ /** The first bad state — typically `bad` in the final (good, bad) adjacent pair. */
3559
+ culprit: T;
3560
+ /** Ordered trace of all states evaluated. */
3561
+ path: BisectStep<T>[];
3562
+ /** True when we narrowed to an adjacent (good, bad) pair. */
3563
+ converged: boolean;
3564
+ /** True when `good` itself failed or `bad` itself passed — the caller's
3565
+ * premise was broken. */
3566
+ inputInconsistent: boolean;
3567
+ }
3568
+ declare function bisect<T>(options: BisectOptions<T>): Promise<BisectResult<T>>;
3569
+ /**
3570
+ * Commit bisect — `commits` is an ordered SHA list, oldest to newest.
3571
+ * `good` and `bad` must both be present in the list.
3572
+ */
3573
+ declare function commitBisect(options: {
3574
+ commits: string[];
3575
+ good: string;
3576
+ bad: string;
3577
+ runEval: (sha: string) => Promise<{
3578
+ score: number;
3579
+ pass: boolean;
3580
+ }>;
3581
+ maxIterations?: number;
3582
+ }): Promise<BisectResult<string>>;
3583
+ /**
3584
+ * Prompt bisect — splits the good and bad prompts into paragraphs, then
3585
+ * progressively replaces paragraphs in `good` with their counterparts
3586
+ * from `bad` to localize the offending change. Only works when the two
3587
+ * prompts have the same paragraph count (a common editorial workflow
3588
+ * constraint — one paragraph = one change unit).
3589
+ */
3590
+ declare function promptBisect(options: {
3591
+ good: string;
3592
+ bad: string;
3593
+ runEval: (prompt: string) => Promise<{
3594
+ score: number;
3595
+ pass: boolean;
3596
+ }>;
3597
+ maxIterations?: number;
3598
+ paragraphSplitter?: (prompt: string) => string[];
3599
+ }): Promise<BisectResult<string> & {
3600
+ offendingParagraphIndex?: number;
3601
+ }>;
3602
+
3603
+ /**
3604
+ * Counterfactual replay — "what would have happened if we'd changed
3605
+ * exactly one thing at turn N?"
3606
+ *
3607
+ * The framework does NOT drive the agent — it sets up the replay
3608
+ * context (prior spans, prior state, mutation spec) and records the
3609
+ * resulting divergence. Consumers supply an `executeFrom(ctx)` callback
3610
+ * that runs their agent starting from turn N with the mutation applied.
3611
+ *
3612
+ * Counterfactual runs are recorded as a new Run with `layer='meta'` and
3613
+ * `parentRunId = originalRunId`, so downstream diff + correlation
3614
+ * pipelines see them natively.
3615
+ */
3616
+
3617
+ type CounterfactualMutation = {
3618
+ kind: 'swap-model';
3619
+ at: number;
3620
+ newModel: string;
3621
+ } | {
3622
+ kind: 'swap-tool-result';
3623
+ at: number;
3624
+ newResult: unknown;
3625
+ } | {
3626
+ kind: 'truncate-after';
3627
+ at: number;
3628
+ } | {
3629
+ kind: 'inject-system-message';
3630
+ at: number;
3631
+ content: string;
3632
+ } | {
3633
+ kind: 'custom';
3634
+ at: number;
3635
+ describe: string;
3636
+ apply: (step: TrajectoryStep) => TrajectoryStep;
3637
+ };
3638
+ interface CounterfactualContext {
3639
+ originalRunId: string;
3640
+ originalTrajectory: Trajectory;
3641
+ /** Steps up to (but not including) the mutation point — the prefix the
3642
+ * replayed agent inherits as its prior conversation/tool history. */
3643
+ prefix: TrajectoryStep[];
3644
+ mutation: CounterfactualMutation;
3645
+ /** Pre-applied mutation on the step at `mutation.at`. Consumers use this
3646
+ * as the FIRST step the replayed agent emits (they decide whether to
3647
+ * re-emit it or continue from there). */
3648
+ mutatedStep: TrajectoryStep;
3649
+ }
3650
+ interface CounterfactualResult {
3651
+ counterfactualRunId: string;
3652
+ originalRunId: string;
3653
+ mutation: CounterfactualMutation;
3654
+ /** Structured delta summary — caller can extend via scoring. */
3655
+ delta: {
3656
+ originalOutcomeScore: number | null;
3657
+ counterfactualOutcomeScore: number | null;
3658
+ deltaScore: number | null;
3659
+ };
3660
+ }
3661
+ interface CounterfactualRunner {
3662
+ /**
3663
+ * Execute the agent from `ctx.prefix` with the mutation applied.
3664
+ * MUST emit spans into the provided emitter so they become part of
3665
+ * the counterfactual run. MUST call emitter.endRun() with a verdict.
3666
+ */
3667
+ executeFrom: (ctx: CounterfactualContext, emitter: TraceEmitter) => Promise<void>;
3668
+ }
3669
+ declare function runCounterfactual(store: TraceStore, originalRunId: string, mutation: CounterfactualMutation, runner: CounterfactualRunner): Promise<CounterfactualResult>;
3670
+ /**
3671
+ * Aggregate a batch of counterfactuals into a simple attribution table:
3672
+ * which mutation kinds move outcomes most? (Useful when you run a grid
3673
+ * over the same trajectory — swap-model at every llm span, swap-tool
3674
+ * at every tool span — and want a ranked summary.)
3675
+ */
3676
+ declare function attributeCounterfactuals(results: CounterfactualResult[]): Array<{
3677
+ mutationKind: CounterfactualMutation['kind'];
3678
+ n: number;
3679
+ meanAbsDelta: number;
3680
+ meanSignedDelta: number;
3681
+ }>;
3682
+
3683
+ /**
3684
+ * Full cross-trace diff — align two trajectories step-by-step, report
3685
+ * per-step score deltas, attribute a variant's total outcome lead to
3686
+ * specific turns.
3687
+ *
3688
+ * 0.5 shipped `firstDivergenceView` (finds the first differing step).
3689
+ * This does the heavier work: full alignment via LCS, per-step
3690
+ * contribution to score delta using PRM verdicts when available,
3691
+ * fallback to structural heuristics (latency, token count, tool
3692
+ * outcome) otherwise.
3693
+ */
3694
+
3695
+ type AlignmentOp = {
3696
+ op: 'match';
3697
+ a: TrajectoryStep;
3698
+ b: TrajectoryStep;
3699
+ } | {
3700
+ op: 'insert';
3701
+ b: TrajectoryStep;
3702
+ } | {
3703
+ op: 'delete';
3704
+ a: TrajectoryStep;
3705
+ } | {
3706
+ op: 'replace';
3707
+ a: TrajectoryStep;
3708
+ b: TrajectoryStep;
3709
+ };
3710
+ interface StepAttribution {
3711
+ op: AlignmentOp;
3712
+ /** Difference in PRM score (or null when not scored by a matching judge). */
3713
+ prmDelta: number | null;
3714
+ /** Difference in latency (endedAt - startedAt). */
3715
+ latencyDeltaMs: number | null;
3716
+ /** Difference in token count (LLM spans). */
3717
+ tokenDelta: number | null;
3718
+ /** Reason this step is / isn't considered a contributor to the outcome delta. */
3719
+ note: string;
3720
+ }
3721
+ interface CrossTraceDiff {
3722
+ runA: string;
3723
+ runB: string;
3724
+ alignment: AlignmentOp[];
3725
+ attributions: StepAttribution[];
3726
+ /** Total score delta (B - A). */
3727
+ totalScoreDelta: number | null;
3728
+ /** Sum of PRM deltas across matched/replaced steps. Close to
3729
+ * `totalScoreDelta` when PRM covers the trajectory; gap indicates
3730
+ * unmodeled variance. */
3731
+ prmDeltaSum: number;
3732
+ }
3733
+ interface CrossTraceDiffOptions {
3734
+ stepEquals?: (a: TrajectoryStep, b: TrajectoryStep) => boolean;
3735
+ }
3736
+ declare function crossTraceDiff(store: TraceStore, runA: string, runB: string, options?: CrossTraceDiffOptions): Promise<CrossTraceDiff>;
3737
+
3738
+ /**
3739
+ * Pre-registered hypotheses — declare what you're testing BEFORE the
3740
+ * run, check it AFTER. Prevents p-hacking, optional stopping, and the
3741
+ * "we ran until it looked good" failure mode.
3742
+ *
3743
+ * Manifest is a plain JSON-friendly object. Sign it with a content hash
3744
+ * + timestamp; the registered record becomes immutable. Post-run,
3745
+ * evaluate the manifest against observed results — the library refuses
3746
+ * to let you re-interpret a different metric as the declared one.
3747
+ */
3748
+ interface HypothesisManifest {
3749
+ id: string;
3750
+ /** Human prose — goes into the audit trail. */
3751
+ hypothesis: string;
3752
+ /** Metric the hypothesis claims to move. */
3753
+ metric: string;
3754
+ /** 'increase' = candidate should score higher than baseline; 'decrease' = lower. */
3755
+ direction: 'increase' | 'decrease';
3756
+ /** Minimum effect size to count (same units as the metric). */
3757
+ minEffect: number;
3758
+ /** Alpha threshold. */
3759
+ alpha: number;
3760
+ /** Target statistical power at which sample size was pre-computed. */
3761
+ power: number;
3762
+ /** Declared N per arm before running. */
3763
+ preRegisteredN: number;
3764
+ /** ISO8601 timestamp the manifest was registered. */
3765
+ registeredAt: string;
3766
+ /** Optional identifiers to tie into the trace corpus. */
3767
+ baselineLabel?: string;
3768
+ candidateLabel?: string;
3769
+ }
3770
+ interface SignedManifest extends HypothesisManifest {
3771
+ /** sha256 hex of canonicalized manifest (everything except contentHash). */
3772
+ contentHash: string;
3773
+ }
3774
+ interface HypothesisResult {
3775
+ manifest: SignedManifest;
3776
+ observedN: number;
3777
+ observedEffect: number;
3778
+ observedPValue: number;
3779
+ /** True iff the observed effect hits the pre-declared direction with
3780
+ * magnitude ≥ minEffect AND p < alpha. */
3781
+ confirmed: boolean;
3782
+ /** Enumerated reasons the hypothesis was rejected (each a machine-tag). */
3783
+ rejectionReasons: Array<'wrong_direction' | 'effect_too_small' | 'not_significant' | 'undersampled'>;
3784
+ notes?: string;
3785
+ }
3786
+ declare function signManifest(m: HypothesisManifest): Promise<SignedManifest>;
3787
+ /** Verify that a signed manifest has not been tampered with. */
3788
+ declare function verifyManifest(m: SignedManifest): Promise<boolean>;
3789
+ /**
3790
+ * Evaluate a pre-registered hypothesis against observed results.
3791
+ * Mechanical — no re-interpretation permitted.
3792
+ */
3793
+ declare function evaluateHypothesis(manifest: SignedManifest, observed: {
3794
+ n: number;
3795
+ effect: number;
3796
+ pValue: number;
3797
+ }): Promise<HypothesisResult>;
3798
+
3799
+ /**
3800
+ * Self-play scenario evolution — agents generate adversarial scenarios
3801
+ * against each other; survivors become part of the eval corpus.
3802
+ *
3803
+ * Framework-agnostic about how scenarios are generated. Caller supplies:
3804
+ * - `propose`: asks a "proposer" agent for candidate scenarios
3805
+ * - `scoreAgainst`: runs a target agent against a scenario and returns
3806
+ * its score
3807
+ *
3808
+ * A scenario *survives* if it reveals a meaningful score difference
3809
+ * between two target agents (or between a target agent and itself on
3810
+ * different runs). Survivors are promoted to a Dataset; the caller
3811
+ * decides what to do with them (hold-out, training, regression set).
3812
+ *
3813
+ * Guard rails: minimum absolute score delta to consider a scenario
3814
+ * informative; floor on absolute target score so degenerate break-all
3815
+ * scenarios (noise, gibberish) don't flood the corpus.
3816
+ */
3817
+
3818
+ interface CandidateScenario {
3819
+ id: string;
3820
+ payload: unknown;
3821
+ /** Free-form tags (domain, generation, parent). */
3822
+ tags?: Record<string, string>;
3823
+ }
3824
+ interface ScoredTarget {
3825
+ targetId: string;
3826
+ score: number;
3827
+ }
3828
+ interface EvolutionRound {
3829
+ round: number;
3830
+ proposed: CandidateScenario[];
3831
+ survived: CandidateScenario[];
3832
+ rejected: Array<{
3833
+ candidate: CandidateScenario;
3834
+ reason: string;
3835
+ }>;
3836
+ scoredBreakdown: Array<{
3837
+ candidate: CandidateScenario;
3838
+ scores: ScoredTarget[];
3839
+ spread: number;
3840
+ }>;
3841
+ }
3842
+ interface SelfPlayOptions {
3843
+ /** Minimum score spread across targets for a scenario to survive. Default 0.1. */
3844
+ minSpread?: number;
3845
+ /** Minimum floor score across targets — keeps degenerate break-all scenarios
3846
+ * out. Default 0.1 (if every target scores below this, discard). */
3847
+ minAbsoluteFloor?: number;
3848
+ /** Hard cap on survivors per round. Default 50. */
3849
+ maxSurvivors?: number;
3850
+ /** Rounds to run. Default 1. Each round's survivors can be fed back into
3851
+ * `propose` to compound. */
3852
+ rounds?: number;
3853
+ /** Seed for scenario id generation if proposer doesn't provide one. */
3854
+ seed?: number;
3855
+ }
3856
+ interface SelfPlayProposer {
3857
+ propose(round: number, priorSurvivors: CandidateScenario[]): Promise<CandidateScenario[]>;
3858
+ }
3859
+ interface SelfPlayScorer {
3860
+ /** Score one candidate against every target; returns an array parallel to `targets` (one ScoredTarget per target id). */
3861
+ scoreCandidate(candidate: CandidateScenario, targets: string[]): Promise<ScoredTarget[]>;
3862
+ }
3863
+ declare function runSelfPlay(proposer: SelfPlayProposer, scorer: SelfPlayScorer, targets: string[], options?: SelfPlayOptions): Promise<{
3864
+ rounds: EvolutionRound[];
3865
+ dataset: Dataset;
3866
+ }>;
3867
+
3868
+ /**
3869
+ * Causal attribution via factorial experiments.
3870
+ *
3871
+ * Run every combination of {model × prompt × scenario × seed}, then
3872
+ * decompose observed score variance into main effects + interactions.
3873
+ * Moves from correlational "variant B is better" to causal "the model
3874
+ * swap accounts for 42% of the lead; the prompt change accounts for 28%;
3875
+ * interaction is 30%."
3876
+ *
3877
+ * Minimal implementation: 2-way factorial (two factors at a time) with
3878
+ * main-effect + interaction decomposition via variance of cell means.
3879
+ * Consumers run the factorial design themselves (we don't schedule
3880
+ * runs); this module consumes the (factorLevels, observedScores)
3881
+ * table and does the attribution math.
3882
+ */
3883
+ interface FactorialCell {
3884
+ /** Map factor name → level id. e.g. { model: 'claude', prompt: 'v2' } */
3885
+ levels: Record<string, string>;
3886
+ /** Observed score for this cell (mean over replications if n > 1). */
3887
+ score: number;
3888
+ /** Number of replications averaged to produce `score`. */
3889
+ n: number;
3890
+ }
3891
+ interface FactorContribution {
3892
+ factor: string;
3893
+ /** Variance attributed to this factor's main effect, as a fraction of total. */
3894
+ shareOfVariance: number;
3895
+ /** Range of cell means across levels of this factor. */
3896
+ range: number;
3897
+ }
3898
+ interface InteractionContribution {
3899
+ factors: [string, string];
3900
+ shareOfVariance: number;
3901
+ }
3902
+ interface CausalAttributionReport {
3903
+ totalVariance: number;
3904
+ mainEffects: FactorContribution[];
3905
+ interactions: InteractionContribution[];
3906
+ /** Residual share: fraction of variance left unexplained by main effects + modeled interactions. */
3907
+ residualShare: number;
3908
+ /** Sanity check: sum of the reported shares; expected to equal 1 up to floating-point error. */
3909
+ sharesSum: number;
3910
+ }
3911
+ declare function causalAttribution(cells: FactorialCell[]): CausalAttributionReport;
3912
+
3913
+ /**
3914
+ * Active learning — agent-as-scenario-author.
3915
+ *
3916
+ * Analyzes an existing Dataset + trace corpus for coverage gaps and
3917
+ * weak spots, returns a prioritized list of *synthesis targets*:
3918
+ * (gap description, existing-neighbor examples, suggested direction).
3919
+ *
3920
+ * Does NOT call an LLM itself — the proposer agent is caller-supplied.
3921
+ * This module's job is to identify WHERE new scenarios would compound
3922
+ * the most information, not to author them.
3923
+ *
3924
+ * Gaps we detect:
3925
+ * - dimensions with high score variance (unstable, need more data)
3926
+ * - dimensions with low coverage count (undersampled)
3927
+ * - failure classes with clusters (systematic weakness)
3928
+ * - difficulty bins with no coverage
3929
+ */
3930
+
3931
+ type SynthesisReason = 'high-variance' | 'undersampled' | 'failure-cluster' | 'difficulty-gap';
3932
+ interface SynthesisTarget {
3933
+ reason: SynthesisReason;
3934
+ description: string;
3935
+ /** Existing scenarios that are closest to the gap; caller feeds these to
3936
+ * their LLM proposer as few-shot examples. */
3937
+ neighbors: DatasetScenario[];
3938
+ /** Suggested direction — e.g. "harder variants", "edge cases of X", "failure class Y". */
3939
+ direction: string;
3940
+ /** Priority score — higher = more information-dense gap. 0..1. */
3941
+ priority: number;
3942
+ }
3943
+ interface ActiveLearningOptions {
3944
+ /** Minimum scenarios per difficulty band to count as "covered". */
3945
+ minPerBand?: number;
3946
+ /** Variance threshold above which a scenario's dimension is "unstable". */
3947
+ varianceThreshold?: number;
3948
+ /** Max synthesis targets returned. */
3949
+ topK?: number;
3950
+ }
3951
+ declare function proposeSynthesisTargets(dataset: Dataset, traceStore: TraceStore, options?: ActiveLearningOptions): Promise<SynthesisTarget[]>;
3952
+
3953
+ /**
3954
+ * Reward-model export — the productizable wrapper around PRM training
3955
+ * data. Takes a TraceStore + PrmGrader, produces an embeddable
3956
+ * inference scorer that customers plug into their own agent stack.
3957
+ *
3958
+ * Two export forms:
3959
+ * - `exportRewardModel(store, grader, runIds)` — serializes the (step-context,
3960
+ * score) corpus to a framework-agnostic payload. Customer fine-tunes
3961
+ * their own model; we ship the scaffolding.
3962
+ * - `loadScorerFromGrader(grader)` — a zero-deps "reward model"
3963
+ * that literally replays the trained rubric at inference time. Works
3964
+ * as a reference baseline + deterministic fallback.
3965
+ */
3966
+
3967
+ interface ExportedRewardModel {
3968
+ /** Version of the export format. Bump when payload shape changes. */
3969
+ version: '1.0';
3970
+ /** Metadata about the training corpus. */
3971
+ metadata: {
3972
+ nTraces: number;
3973
+ nSamples: number;
3974
+ rubrics: string[];
3975
+ exportedAt: string;
3976
+ /** Mean reward across training corpus — use as sanity check at load. */
3977
+ meanReward: number;
3978
+ };
3979
+ /** NDJSON training payload suitable for most fine-tuning frameworks. */
3980
+ trainingNdjson: string;
3981
+ }
3982
+ declare function exportRewardModel(store: TraceStore, grader: PrmGrader, runIds: string[]): Promise<ExportedRewardModel>;
3983
+ /**
3984
+ * Zero-deps inference scorer — applies a grader to a trajectory and returns
3985
+ * its aggregate score. This is the "reward model" customers embed when
3986
+ * they don't want to (or can't) fine-tune one. Deterministic + portable.
3987
+ */
3988
+ interface InferenceScorer {
3989
+ /** Score a completed trajectory; resolves to a single number. Higher is better. */
3990
+ score(trajectory: Trajectory, store: TraceStore): Promise<number>;
3991
+ metadata: {
3992
+ rubrics: string[];
3993
+ deterministic: true; // literal `true`: the type only admits scorers flagged as deterministic
3994
+ };
3995
+ }
3996
+ declare function loadScorerFromGrader(grader: PrmGrader): InferenceScorer;
3997
+ /**
3998
+ * Replay a trace corpus through a scorer — produces the canonical
3999
+ * "what would this reward model have said about every run?" table.
4000
+ * Callers use this to validate a trained model against the training
4001
+ * corpus (expect high agreement; drift indicates overfitting).
4002
+ * Resolves to rows of `{ runId, score, outcomeScore }`.
4003
+ declare function replayScorerOverCorpus(store: TraceStore, scorer: InferenceScorer, runIds: string[]): Promise<Array<{
4004
+ runId: string;
4005
+ score: number;
4006
+ outcomeScore: number | null; // NOTE(review): presumably null when no outcome was recorded for the run — confirm
4007
+ }>>;
4008
+
4009
+ /**
4010
+ * Governance reporting — shared types.
4011
+ *
4012
+ * The framework collects a `GovernanceContext` (traces + outcomes +
4013
+ * dataset manifests + red-team results + judge calibration) and each
4014
+ * specific template (NIST AI RMF, SOC2, EU AI Act) renders a
4015
+ * structured report from it.
4016
+ *
4017
+ * Reports are machine-readable JSON first; human-readable Markdown is a
4018
+ * pure transform on top. External auditors consume the Markdown; CI
4019
+ * consumes the JSON.
4020
+ */
4021
+
4022
+ interface GovernanceContext {
4023
+ /** Legal / org identity for the report. */
4024
+ organization: string;
4025
+ /** System / agent identifier. */
4026
+ systemName: string;
4027
+ /** ISO8601 period the report covers. */
4028
+ periodStart: string;
4029
+ periodEnd: string;
4030
+ /** Versioned dataset manifests used during the period. */
4031
+ datasets: DatasetManifest[];
4032
+ traceStore: TraceStore;
4033
+ outcomeStore?: OutcomeStore;
4034
+ /** Cached red-team results for the period, if available. */
4035
+ redTeam?: RedTeamReport;
4036
+ /** Judge-vs-human calibration results, if measured. */
4037
+ judgeCalibration?: CalibrationResult[];
4038
+ /** Responsible owner for the system — role + name + email. */
4039
+ owner: {
4040
+ role: string;
4041
+ name: string;
4042
+ email: string;
4043
+ };
4044
+ }
4045
+ interface GovernanceFinding {
4046
+ id: string;
4047
+ severity: 'info' | 'low' | 'medium' | 'high' | 'critical';
4048
+ /** Control reference the finding maps to (e.g. "NIST-AI-RMF:MEASURE-2.1"). */
4049
+ control: string;
4050
+ summary: string;
4051
+ evidence?: string;
4052
+ remediation?: string;
4053
+ }
4054
+ interface GovernanceReport { // structured report rendered for one compliance framework
4055
+ framework: 'NIST-AI-RMF' | 'SOC2' | 'EU-AI-ACT';
4056
+ version: string;
4057
+ context: Pick<GovernanceContext, 'organization' | 'systemName' | 'periodStart' | 'periodEnd' | 'owner'>;
4058
+ summary: {
4059
+ findings: number;
4060
+ byeverity: Record<GovernanceFinding['severity'], number>; // NOTE(review): name looks like a typo for `bySeverity`; renaming is a breaking change — confirm against the runtime object before fixing
4061
+ overall: 'compliant' | 'compliant-with-findings' | 'non-compliant';
4062
+ };
4063
+ findings: GovernanceFinding[];
4064
+ /** Framework-specific structured payload (mapped controls, risk class, etc.). */
4065
+ payload: Record<string, unknown>;
4066
+ generatedAt: string;
4067
+ }
4068
+ declare function renderMarkdown(report: GovernanceReport): string;
4069
+ declare function summarize(findings: GovernanceFinding[]): GovernanceReport['summary'];
4070
+
4071
+ /**
4072
+ * NIST AI RMF 1.0 — Govern / Map / Measure / Manage mapping.
4073
+ *
4074
+ * Each subcategory derives its status from concrete framework state:
4075
+ * MEASURE 2.x: do we have a calibration regime? contamination controls?
4076
+ * MEASURE 2.7: are red-team results available?
4077
+ * MANAGE 1.x: are outcome metrics captured? correlation measured?
4078
+ * GOVERN 1.x: dataset + prompt provenance recorded?
4079
+ *
4080
+ * We ship the mapping and the derivation rules; consumers supply the
4081
+ * GovernanceContext.
4082
+ */
4083
+
4084
+ declare function nistAiRmfReport(ctx: GovernanceContext): Promise<GovernanceReport>;
4085
+
4086
+ /**
4087
+ * SOC 2 — Common Criteria 7 (system operations + change management)
4088
+ * audit trail derived from the trace corpus.
4089
+ *
4090
+ * This is NOT a formal SOC2 report — that requires an external
4091
+ * auditor. What we ship is the machine-readable *evidence* package
4092
+ * that an auditor consumes: run counts, deploy events, access log
4093
+ * summary, anomaly tracking, response-time SLOs.
4094
+ */
4095
+
4096
+ declare function soc2Report(ctx: GovernanceContext): Promise<GovernanceReport>;
4097
+
4098
+ /**
4099
+ * EU AI Act — risk-class classification + compliance checklist.
4100
+ *
4101
+ * Classification is declarative: caller supplies the domain/use-case
4102
+ * signals (biometric? critical infrastructure? education? employment?
4103
+ * access to services?) and we map to the Act's risk tiers:
4104
+ * - "unacceptable" (prohibited)
4105
+ * - "high" (Annex III — strict obligations)
4106
+ * - "limited" (transparency obligations)
4107
+ * - "minimal" (voluntary codes of conduct)
4108
+ *
4109
+ * Then the compliance checklist enumerates Article 9 (risk mgmt),
4110
+ * 10 (data + data governance), 11 (technical documentation), 13
4111
+ * (transparency), 14 (human oversight), 15 (accuracy + robustness)
4112
+ * requirements and flags gaps.
4113
+ */
4114
+
4115
+ type EuRiskClass = 'unacceptable' | 'high' | 'limited' | 'minimal';
4116
+ interface UseCaseSignals {
4117
+ /** Used for biometric identification in public spaces? (Art. 5 — unacceptable). */
4118
+ biometricPublic?: boolean;
4119
+ /** Social scoring by public authorities? (Art. 5). */
4120
+ socialScoring?: boolean;
4121
+ /** Subliminal manipulation? (Art. 5). */
4122
+ subliminal?: boolean;
4123
+ /** Annex III sector: critical infrastructure / education / employment /
4124
+ * access to essential services / law enforcement / migration /
4125
+ * administration of justice / democratic processes? */
4126
+ annexIII?: boolean;
4127
+ /** Interacts directly with natural persons (chatbot, agent)? — limited risk. */
4128
+ chatbot?: boolean;
4129
+ /** Generates synthetic media (image/audio/video/text deepfakes)? — limited risk. */
4130
+ generatesSyntheticMedia?: boolean;
4131
+ }
4132
+ declare function classifyEuAiRisk(signals: UseCaseSignals): EuRiskClass;
4133
+ declare function euAiActReport(ctx: GovernanceContext, signals: UseCaseSignals): Promise<GovernanceReport>;
4134
+
4135
+ export { type ActiveLearningOptions, AgentDriver, type AgentDriverConfig, type AlignmentOp, type AntiSlopConfig, type AntiSlopIssue, type AntiSlopReport, type Artifact, type ArtifactCheck, type Artifact$1 as ArtifactCheckArtifact, type ArtifactResult, type ArtifactValidator, type BaselineOptions, type BaselineReport, BehaviorAssertion, type BenchmarkReport, BenchmarkRunner, type BenchmarkRunnerConfig, type BestOfNResult, type BisectOptions, type BisectResult, type BisectStep, BudgetBreachError, type BudgetBreachFinding, type BudgetBreachReport, BudgetGuard, type BudgetLedgerEntry, type BudgetSpec, BuilderSession, type BuilderSessionInit, type CalibrationBin, type CalibrationOptions, type CalibrationReport, type CalibrationResult, CallExpectation, type CanaryLeak, type CandidateScenario, type CandidateScore, type CausalAttributionReport, type ChatSummary, type CheckResult, type CollectedArtifacts, type CompletionCriterion, type ContinuityCheck, type ContinuityCheckResult, type ContinuityReport, type ContinuitySnapshotPair, type ContractMetric, type ContractReport, ConvergenceTracker, type CorrelationReport, type CorrelationResult, type CorrelationStudyOptions, type CorrelationStudyResult, type CostEntry, type CostSummary, CostTracker, type CounterfactualContext, type CounterfactualMutation, type CounterfactualResult, type CounterfactualRunner, type CrossTraceDiff, type CrossTraceDiffOptions, DEFAULT_AGENT_SLOS, DEFAULT_RULES as DEFAULT_FAILURE_RULES, DEFAULT_MUTATORS, DEFAULT_REDACTION_RULES, DEFAULT_RED_TEAM_CORPUS, Dataset, type DatasetDifficulty, type DatasetManifest, type DatasetProvenance, type DatasetScenario, type DatasetSplit, type DeploymentOutcome, type Direction, type DivergenceOptions, type DivergenceReport, DockerSandboxDriver, type DriverResult, type DriverState, DualAgentBench, type DualAgentBenchConfig, type DualAgentReport, type DualAgentRound, type DualAgentScenario, type DualAgentScenarioResult, type EuRiskClass, type EvalMetricSpec, type 
EvalResult, type EventFilter, type EventKind, type EvolutionRound, type ExecutorConfig, type Expectation, type Experiment, type Run$1 as ExperimentRun, type ExperimentStore, ExperimentTracker, type ExportedRewardModel, FAILURE_CLASSES, type FactorContribution, type FactorialCell, type FailureClass, type FailureClassification, type FailureCluster, type FailureClusterReport, type FailureContext, type FailureRule, type FeedbackPattern, FileSystemOutcomeStore, type FileSystemOutcomeStoreOptions, FileSystemTraceStore, type FileSystemTraceStoreOptions, type GenericSpan, type GoldenItem, type GovernanceContext, type GovernanceFinding, type GovernanceReport, type GradedStep, type HarnessConfig, HoldoutAuditor, HoldoutLockedError, type HypothesisManifest, type HypothesisResult, type ImageData, InMemoryExperimentStore, InMemoryOutcomeStore, InMemoryTraceStore, InMemoryWorkspaceInspector, type InferenceScorer, type InspectorContext, type InteractionContribution, type JudgeAgreementReport, type JudgeConfig, type JudgeFn, type JudgeInput, type JudgePair, type JudgeReplayResult, type JudgeRubric, type JudgeScore, type JudgeSpan, type LangfuseEnvelope, type LangfuseGeneration, type LangfuseScore, type LayerCorrelation, type LlmSpan, MODEL_PRICING, type MatcherResult, type Message, type MetricSamples, type MetricVerdict, MetricsCollector, type Mutator, OTEL_AGENT_EVAL_SCOPE, type Objective, type OptimizationConfig, type OptimizationResult, type Oracle, type OracleObservation, type OracleReport, type OracleResult, type OtlpExport, type OtlpResourceSpans, type OtlpSpan, type OutcomeFilter, type OutcomePair, type OutcomeStore, type PairwiseComparison, type ParetoResult, type PersonaConfig, type PositionalBiasResult, type PrmGradedTrace, PrmGrader, type PrmTrainingSample, ProductClient, type ProductClientConfig, ProjectRegistry, type ProjectSummary, type ProjectTimelineEntry, type PromptHandle, PromptOptimizer, PromptRegistry, type PromptVariant, REDACTION_VERSION, type RedTeamCase, 
type RedTeamCategory, type RedTeamFinding, type RedTeamPayload, type RedTeamReport, type RedactionReport, type RedactionRule, type RegressionOptions, type RegressionSpec, type RetrievalSpan, type RobustnessResult, type RouteMap, type RubricDimension, type Run, type RunAppScenarioOptions, type RunConfig, type RunDiff, type RunFilter, type RunLayer, type RunOutcome, type RunStatus, type SandboxDriver, SandboxHarness, type SandboxHarnessResult, type SandboxResult, type SandboxSpan, type Scenario, type ScenarioCost, type ScenarioFile, ScenarioRegistry, type ScenarioResult, type ScoredTarget, type SelfPlayOptions, type SelfPlayProposer, type SelfPlayScorer, type SelfPreferenceResult, type SeriesConvergenceOptions, type SeriesConvergenceResult, type ShipOptions, type SignedManifest, type SliceOptions, type Slo, type SloCheckResult, type SloComparator, type SloReport, type SloSeverity, type SlopCategory, type Span, type SpanBase, type SpanFilter, type SpanHandle, type SpanKind, type SpanStatus, type StepAttribution, type StepContext, type StepRubric, type StuckLoopFinding, type StuckLoopOptions, type StuckLoopReport, SubprocessSandboxDriver, type SynthesisReason, type SynthesisTarget, TRACE_SCHEMA_VERSION, type TestGradedRunOptions, type TestGradedRunResult, type TestGradedScenario, type TestOutputParser, type TestResult, type ThreeLayerProjectReport, type ThresholdContract, TokenCounter, type TokenSpec, type ToolSpan, type ToolStats, type ToolUseMetrics, type ToolUseOptions, type ToolWasteFinding, type ToolWasteOptions, type ToolWasteReport, TraceEmitter, type TraceEmitterOptions, type TraceEvent, type TraceStore, type Trajectory, type TrajectoryStep, type Turn, type TurnMetrics, type TurnResult, type UseCaseSignals, type ValidationContext, type ValidationIssue, type ValidationResult, type VariantScore, type VerbosityBiasResult, type VisualDiffOptions, type VisualDiffResult, type WorkspaceAssertion, type WorkspaceAssertionResult, type WorkspaceInspector, type 
WorkspaceSnapshot, adversarialJudge, aggregateLlm, analyzeAntiSlop, analyzeSeries, argHash, attributeCounterfactuals, benjaminiHochberg, bisect, bonferroni, budgetBreachView, buildTrajectory, byteLengthRange, calibrateJudge, calibrationCurve, canaryLeakView, causalAttribution, checkCanaries, checkSlos, classifyEuAiRisk, classifyFailure, codeExecutionJudge, cohensD, coherenceJudge, collectionPreserved, commitBisect, compareToBaseline, composeParsers, composeValidators, computeToolUseMetrics, confidenceInterval, containsAll, correlateLayers, correlationStudy, createAntiSlopJudge, createCustomJudge, createDomainExpertJudge, crossTraceDiff, defaultJudges, dominates, estimateCost, estimateTokens, euAiActReport, evaluateContract, evaluateHypothesis, evaluateOracles, executeScenario, expectAgent, exportRewardModel, exportRunAsOtlp, exportTrainingData, failureClusterView, fileContains, fileExists, firstDivergenceView, formatBenchmarkReport, formatDriverReport, groupBy, hashContent, hashScenarios, interRaterReliability, iqr, isJudgeSpan, isLlmSpan, isPrmVerdict, isRetrievalSpan, isSandboxSpan, isToolSpan, jestTestParser, jsonHasKeys, jsonShape, judgeAgreementView, judgeSpans, keyPreserved, llmSpanFromProvider, llmSpans, loadScorerFromGrader, lowercaseMutator, mannWhitneyU, nistAiRmfReport, nonRefusalRubric, normalizeScores, notBlocked, outputLengthRubric, pairedTTest, paraphraseRobustness, paretoFrontier, partialCredit, pixelDeltaRatio, politenessPrefixMutator, positionalBias, printDriverSummary, prmBestOfN, prmEnsembleBestOfN, promptBisect, proposeSynthesisTargets, pytestTestParser, redTeamDataset, redTeamReport, redactString, redactValue, regexMatch, regexMatches, regressionView, renderMarkdown, renderMarkdownReport, replayScorerOverCorpus, replayTraceThroughJudge, requiredSampleSize, resumeBuilderSession, rowCount, rowWhere, runAssertions, runCounterfactual, runE2EWorkflow, runExpectations, runFailureClass, runSelfPlay, runTestGradedScenario, runsForScenario, 
scoreAllProjects, scoreContinuity, scoreProject, scoreRedTeamOutput, selfPreference, sentenceReorderMutator, signManifest, soc2Report, statusAdvanced, stuckLoopView, summarize, textInSnapshot, toLangfuseEnvelope, toNdjson, toPrometheusText, toolIntentAlignmentRubric, toolNamesForRun, toolNonRedundantRubric, toolSpans, toolSuccessRubric, toolWasteView, typoMutator, urlContains, verbosityBias, verifyManifest, visualDiff, vitestTestParser, weightedMean, welchsTTest, whitespaceCollapseMutator, wilcoxonSignedRank };