npm - @tangle-network/agent-eval - Versions diffs - 0.77.0 → 0.80.0 - Mend

@tangle-network/agent-eval 0.77.0 → 0.80.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (90) hide show

package/README.md +50 -19
package/dist/adapters/http.d.ts +2 -2
package/dist/adapters/langchain.d.ts +2 -2
package/dist/adapters/otel.d.ts +4 -4
package/dist/{agent-profile-DYRboYWu.d.ts → agent-profile-aSEaJ9Pl.d.ts} +1 -1
package/dist/analyst/index.d.ts +42 -8
package/dist/analyst/index.js +32 -2
package/dist/analyst/index.js.map +1 -1
package/dist/authenticity/index.d.ts +54 -1
package/dist/authenticity/index.js +88 -1
package/dist/authenticity/index.js.map +1 -1
package/dist/belief-state/index.d.ts +188 -0
package/dist/belief-state/index.js +486 -0
package/dist/belief-state/index.js.map +1 -0
package/dist/benchmarks/index.d.ts +2 -2
package/dist/calibration-Cpr3WaX3.d.ts +101 -0
package/dist/campaign/index.d.ts +11 -11
package/dist/campaign/index.js +4 -4
package/dist/chunk-4DIJWVUT.js +131 -0
package/dist/chunk-4DIJWVUT.js.map +1 -0
package/dist/{chunk-7W4SM7FD.js → chunk-5LVWPNS5.js} +91 -91
package/dist/chunk-5LVWPNS5.js.map +1 -0
package/dist/{chunk-WYIHD6EB.js → chunk-CF67I6QY.js} +1 -1
package/dist/chunk-CF67I6QY.js.map +1 -0
package/dist/{chunk-XPILG2CA.js → chunk-GXHLRXDI.js} +2 -2
package/dist/{chunk-F3SRAAZO.js → chunk-KWRRMR3J.js} +15 -1
package/dist/chunk-KWRRMR3J.js.map +1 -0
package/dist/chunk-NPCTHQIO.js +91 -0
package/dist/chunk-NPCTHQIO.js.map +1 -0
package/dist/{chunk-JYE3WOTE.js → chunk-RPLZ4OIB.js} +10 -1
package/dist/chunk-RPLZ4OIB.js.map +1 -0
package/dist/{chunk-6EKXFFGQ.js → chunk-RTWFUK6A.js} +2 -2
package/dist/{chunk-XGNCBAVZ.js → chunk-XQL22JDG.js} +2 -2
package/dist/{chunk-GJJNJVIR.js → chunk-XXNIODOM.js} +2 -2
package/dist/contract/index.d.ts +128 -15
package/dist/contract/index.js +118 -2
package/dist/contract/index.js.map +1 -1
package/dist/{control-BgA6BYTm.d.ts → control-CehLtoET.d.ts} +1 -1
package/dist/control.d.ts +2 -2
package/dist/control.js +2 -2
package/dist/governance/index.d.ts +1 -1
package/dist/hosted/index.d.ts +4 -4
package/dist/{index-DsnOpCO6.d.ts → index-B1RKber3.d.ts} +1 -1
package/dist/index.d.ts +127 -26
package/dist/index.js +32 -7
package/dist/index.js.map +1 -1
package/dist/{insight-report-Df3lxYXM.d.ts → insight-report-dlpEzQDi.d.ts} +1 -1
package/dist/{kind-factory-DW9XWPvM.d.ts → kind-factory-DqV2t1Xk.d.ts} +1 -1
package/dist/meta-eval/index.d.ts +6 -99
package/dist/meta-eval/index.js +7 -76
package/dist/meta-eval/index.js.map +1 -1
package/dist/off-policy-DiwuKKg7.d.ts +132 -0
package/dist/openapi.json +1 -1
package/dist/{outcome-store-D6KWmYvj.d.ts → outcome-store-rnXLEqSn.d.ts} +1 -1
package/dist/{provenance-B-TFszPW.d.ts → provenance-jG-Gngg8.d.ts} +3 -3
package/dist/{registry-DuVYiTvw.d.ts → registry-BK0Zee01.d.ts} +1 -1
package/dist/{release-report-CN8hJlhk.d.ts → release-report-CXXZlR8g.d.ts} +2 -2
package/dist/reporting.d.ts +5 -5
package/dist/{researcher-C_KJyIGg.d.ts → researcher-rInLj9De.d.ts} +2 -2
package/dist/rl.d.ts +10 -140
package/dist/rl.js +8 -122
package/dist/rl.js.map +1 -1
package/dist/{rubric-predictive-validity-D_4BSXGV.d.ts → rubric-predictive-validity-CLPuwiUw.d.ts} +2 -2
package/dist/{run-improvement-loop-BqYH2vCR.d.ts → run-improvement-loop-BAl_aVOZ.d.ts} +2 -4
package/dist/{run-record-BgTFzO2r.d.ts → run-record-sItO5ftF.d.ts} +11 -0
package/dist/{semantic-concept-judge-CV9Wlx4t.d.ts → semantic-concept-judge-qXEUV2w7.d.ts} +3 -3
package/dist/{summary-report-ByiOUrHj.d.ts → summary-report-BTaXq1TS.d.ts} +1 -1
package/dist/traces.d.ts +1 -1
package/dist/traces.js +2 -2
package/dist/{types-Bba0vl1V.d.ts → types-4mm2msnR.d.ts} +12 -4
package/dist/{types-CRD68aH7.d.ts → types-DRvV0zRo.d.ts} +10 -1
package/dist/workflow/index.d.ts +4 -4
package/dist/workflow/index.js +1 -1
package/docs/auto-research-loop-end-to-end.md +1 -1
package/docs/feature-guide.md +4 -4
package/docs/multi-shot-optimization.md +61 -115
package/docs/product-eval-adoption.md +1 -1
package/docs/research/belief-state-agent-eval-roadmap.md +558 -0
package/docs/research/research-roadmap.md +1 -0
package/docs/three-package-architecture.md +1 -1
package/docs/trace-analysis.md +19 -0
package/package.json +7 -2
package/dist/chunk-7W4SM7FD.js.map +0 -1
package/dist/chunk-F3SRAAZO.js.map +0 -1
package/dist/chunk-JYE3WOTE.js.map +0 -1
package/dist/chunk-WYIHD6EB.js.map +0 -1
package/dist/{chunk-XPILG2CA.js.map → chunk-GXHLRXDI.js.map} +0 -0
package/dist/{chunk-6EKXFFGQ.js.map → chunk-RTWFUK6A.js.map} +0 -0
package/dist/{chunk-XGNCBAVZ.js.map → chunk-XQL22JDG.js.map} +0 -0
package/dist/{chunk-GJJNJVIR.js.map → chunk-XXNIODOM.js.map} +0 -0

package/dist/{control-BgA6BYTm.d.ts → control-CehLtoET.d.ts} RENAMED Viewed

@@ -3,7 +3,7 @@ import { C as ControlEvalResult, a as ControlRunResult, h as ControlRuntimeConfi
 import { T as TraceEmitter } from './emitter-DEZwY14K.js';
 import { F as FailureClass } from './schema-m0gsnbt3.js';
 import { T as TraceStore } from './store-CKUAgsJz.js';
-import { b as RunSplitTag, a as RunTokenUsage, R as RunRecord } from './run-record-BgTFzO2r.js';
+import { b as RunSplitTag, a as RunTokenUsage, R as RunRecord } from './run-record-sItO5ftF.js';
 interface ActionExecutionPolicy {
     allowedTypes?: string[];

package/dist/control.d.ts CHANGED Viewed

@@ -1,4 +1,4 @@
-export { A as ActionExecutionPolicy, a as ActionPolicyDecision, C as ControlRunToRunRecordOptions, e as ProposeReviewConfig, f as ProposeReviewControlAction, g as ProposeReviewControlConfig, h as ProposeReviewControlResult, i as ProposeReviewControlState, j as ProposeReviewReport, p as RunEvidenceMetadata, s as controlRunToRunRecord, u as evaluateActionPolicy, x as runProposeReview, y as runProposeReviewAsControlLoop, z as scoreFromEvals } from './control-BgA6BYTm.js';
+export { A as ActionExecutionPolicy, a as ActionPolicyDecision, C as ControlRunToRunRecordOptions, e as ProposeReviewConfig, f as ProposeReviewControlAction, g as ProposeReviewControlConfig, h as ProposeReviewControlResult, i as ProposeReviewControlState, j as ProposeReviewReport, p as RunEvidenceMetadata, s as controlRunToRunRecord, u as evaluateActionPolicy, x as runProposeReview, y as runProposeReviewAsControlLoop, z as scoreFromEvals } from './control-CehLtoET.js';
 export { c as ControlActionFailureMode, d as ControlActionOutcome, e as ControlBudget, f as ControlContext, g as ControlDecision, C as ControlEvalResult, a as ControlRunResult, h as ControlRuntimeConfig, i as ControlRuntimeError, j as ControlSeverity, b as ControlStep, k as ControlStopPolicies, S as StopDecision, l as allCriticalPassed, o as objectiveEval, r as runAgentControlLoop, s as stopOnNoProgress, m as stopOnRepeatedAction, n as subjectiveEval } from './control-runtime-DuFBYg7A.js';
 import './feedback-trajectory-B3rErRsh.js';
 import './dataset-B2kL-fSM.js';
@@ -6,4 +6,4 @@ import './errors-Dwqw-T_m.js';
 import './emitter-DEZwY14K.js';
 import './schema-m0gsnbt3.js';
 import './store-CKUAgsJz.js';
-import './run-record-BgTFzO2r.js';
+import './run-record-sItO5ftF.js';

package/dist/control.js CHANGED Viewed

@@ -4,7 +4,7 @@ import {
   runProposeReview,
   runProposeReviewAsControlLoop,
   scoreFromEvals
-} from "./chunk-6EKXFFGQ.js";
+} from "./chunk-RTWFUK6A.js";
 import {
   allCriticalPassed,
   objectiveEval,
@@ -13,7 +13,7 @@ import {
   stopOnRepeatedAction,
   subjectiveEval
 } from "./chunk-NCRFYPS3.js";
-import "./chunk-F3SRAAZO.js";
+import "./chunk-KWRRMR3J.js";
 import "./chunk-TVVP3ZZQ.js";
 import "./chunk-VSMTAMNK.js";
 import "./chunk-3BFEG2F6.js";

package/dist/governance/index.d.ts CHANGED Viewed

@@ -1,6 +1,6 @@
 import { c as DatasetManifest } from '../dataset-B2kL-fSM.js';
 import { b as CalibrationResult } from '../judge-calibration-DilmB3Ml.js';
-import { O as OutcomeStore } from '../outcome-store-D6KWmYvj.js';
+import { b as OutcomeStore } from '../outcome-store-rnXLEqSn.js';
 import { d as RedTeamReport } from '../red-team-DW9Ca_tj.js';
 import { T as TraceStore } from '../store-CKUAgsJz.js';
 import '../errors-Dwqw-T_m.js';

package/dist/hosted/index.d.ts CHANGED Viewed

@@ -1,9 +1,9 @@
-import { M as MutableSurface, j as GateDecision } from '../types-Bba0vl1V.js';
-import { I as InsightReport } from '../insight-report-Df3lxYXM.js';
-import '../run-record-BgTFzO2r.js';
+import { M as MutableSurface, c as GateDecision } from '../types-4mm2msnR.js';
+import { I as InsightReport } from '../insight-report-dlpEzQDi.js';
+import '../run-record-sItO5ftF.js';
 import '../errors-Dwqw-T_m.js';
 import '../schema-m0gsnbt3.js';
-import '../summary-report-ByiOUrHj.js';
+import '../summary-report-BTaXq1TS.js';
 import '../failure-cluster-CL7IVgkJ.js';
 import '../store-CKUAgsJz.js';
 import '../judge-calibration-DilmB3Ml.js';

package/dist/{index-DsnOpCO6.d.ts → index-B1RKber3.d.ts} RENAMED Viewed

@@ -1,4 +1,4 @@
-import { b as RunSplitTag } from './run-record-BgTFzO2r.js';
+import { b as RunSplitTag } from './run-record-sItO5ftF.js';
 /**
  * Shared types for the reference benchmark wrappers under

package/dist/index.d.ts CHANGED Viewed

@@ -1,11 +1,11 @@
-export { A as ActionExecutionPolicy, a as ActionPolicyDecision, C as ControlRunToRunRecordOptions, L as LlmJsonCall, b as LlmReviewerConfig, P as ProposeFn, c as ProposeInput, d as ProposeOutput, e as ProposeReviewConfig, f as ProposeReviewControlAction, g as ProposeReviewControlConfig, h as ProposeReviewControlResult, i as ProposeReviewControlState, j as ProposeReviewReport, k as ProposeReviewShot, R as Review, l as ReviewFn, m as ReviewInput, n as ReviewMemoryEntry, o as ReviewMemoryStore, p as RunEvidenceMetadata, V as Verification, q as VerifyFn, r as controlFailureClassFromVerification, s as controlRunToRunRecord, t as createLlmReviewer, u as evaluateActionPolicy, v as inMemoryReviewStore, w as jsonlReviewStore, x as runProposeReview, y as runProposeReviewAsControlLoop, z as scoreFromEvals } from './control-BgA6BYTm.js';
-import { R as RunRecord } from './run-record-BgTFzO2r.js';
-export { e as AGENT_PROFILE_KINDS, A as AgentProfileCell, d as AgentProfileCellInput, f as AgentProfileCellSchemaVersion, g as AgentProfileCellValidationError, h as AgentProfileDimensionValue, i as AgentProfileHarness, j as AgentProfileJson, k as AgentProfileKind, l as AgentProfileSource, m as AgentProfileSourceInput, J as JudgeScoresRecord, c as RunJudgeMetadata, n as RunOutcome, o as RunRecordValidationError, b as RunSplitTag, a as RunTokenUsage, S as SandboxAgentProfileLike, p as agentProfileCellHashMaterial, q as agentProfileCellKey, r as assertRunAgentProfileCell, s as buildAgentProfileCell, t as buildSandboxAgentProfileCell, u as groupRunsByAgentProfileCell, v as isRunRecord, w as parseRunRecordSafe, x as requireAgentProfileCell, y as roundTripRunRecord, z as toAgentProfileJson, B as validateAgentProfileCell, C as validateRunRecord, D as verifyAgentProfileCell } from './run-record-BgTFzO2r.js';
-export { B as BehavioralMetrics, z as ConceptComplexity, A as ConceptFinding, E as ConceptSpec, G as ConceptWeightStrategy, C as CreateAnalystAiConfig, H as DEFAULT_COMPLEXITY_WEIGHTS, D as DEFAULT_TRACE_ANALYST_KINDS, b as DefaultAnalystRegistryOptions, c as DiffPolicy, F as FAILURE_MODE_KIND_SPEC, f as FindingSubject, g as FindingSubjectKind, i as FindingsDiff, j as FindingsStore, I as IMPROVEMENT_KIND_SPEC, k as KNOWLEDGE_GAP_KIND_SPEC, l as KNOWLEDGE_POISONING_KIND_SPEC, P as PersistedFinding, J as SEMANTIC_CONCEPT_JUDGE_VERSION, m as SKILL_USAGE_ANALYST, a as SemanticConceptJudgeInput, S as SemanticConceptJudgeOptions, L as SemanticConceptJudgeResult, n as SkillUsageAnalyst, M as SuboptimalCode, N as SuboptimalSignal, r as buildDefaultAnalystRegistry, O as computeTraceMetrics, t as createAnalystAi, Q as createSemanticConceptJudge, u as defaultIsMaterial, v as diffFindings, R as runSemanticConceptJudge } from './semantic-concept-judge-CV9Wlx4t.js';
-export { C as CreateTraceAnalystKindOpts, a as RawAnalystFinding, T as TraceAnalystGolden, c as TraceAnalystKindSpec, d as createTraceAnalystKind, r as renderPriorFindings } from './kind-factory-DW9XWPvM.js';
-export { A as AnalystHooks, a as AnalystRegistry, b as AnalystRegistryOptions, B as BudgetPolicy, R as RegistryRunOpts } from './registry-DuVYiTvw.js';
-import { l as ChatRequest, p as CreateChatClientOpts } from './types-CRD68aH7.js';
-export { A as Analyst, a as AnalystContext, g as AnalystCost, c as AnalystFinding, i as AnalystInputKind, j as AnalystRequirements, f as AnalystRunEvent, e as AnalystRunInputs, d as AnalystRunResult, b as AnalystRunSummary, h as AnalystSeverity, E as EvidenceRef, q as computeFindingId, s as makeFinding } from './types-CRD68aH7.js';
+export { A as ActionExecutionPolicy, a as ActionPolicyDecision, C as ControlRunToRunRecordOptions, L as LlmJsonCall, b as LlmReviewerConfig, P as ProposeFn, c as ProposeInput, d as ProposeOutput, e as ProposeReviewConfig, f as ProposeReviewControlAction, g as ProposeReviewControlConfig, h as ProposeReviewControlResult, i as ProposeReviewControlState, j as ProposeReviewReport, k as ProposeReviewShot, R as Review, l as ReviewFn, m as ReviewInput, n as ReviewMemoryEntry, o as ReviewMemoryStore, p as RunEvidenceMetadata, V as Verification, q as VerifyFn, r as controlFailureClassFromVerification, s as controlRunToRunRecord, t as createLlmReviewer, u as evaluateActionPolicy, v as inMemoryReviewStore, w as jsonlReviewStore, x as runProposeReview, y as runProposeReviewAsControlLoop, z as scoreFromEvals } from './control-CehLtoET.js';
+import { R as RunRecord } from './run-record-sItO5ftF.js';
+export { e as AGENT_PROFILE_KINDS, A as AgentProfileCell, d as AgentProfileCellInput, f as AgentProfileCellSchemaVersion, g as AgentProfileCellValidationError, h as AgentProfileDimensionValue, i as AgentProfileHarness, j as AgentProfileJson, k as AgentProfileKind, l as AgentProfileSource, m as AgentProfileSourceInput, J as JudgeScoresRecord, c as RunJudgeMetadata, n as RunOutcome, o as RunRecordValidationError, b as RunSplitTag, a as RunTokenUsage, S as SandboxAgentProfileLike, p as agentProfileCellHashMaterial, q as agentProfileCellKey, r as assertRunAgentProfileCell, s as buildAgentProfileCell, t as buildSandboxAgentProfileCell, u as groupRunsByAgentProfileCell, v as isRunRecord, w as parseRunRecordSafe, x as requireAgentProfileCell, y as roundTripRunRecord, z as toAgentProfileJson, B as validateAgentProfileCell, C as validateRunRecord, D as verifyAgentProfileCell } from './run-record-sItO5ftF.js';
+export { B as BehavioralMetrics, z as ConceptComplexity, A as ConceptFinding, E as ConceptSpec, G as ConceptWeightStrategy, C as CreateAnalystAiConfig, H as DEFAULT_COMPLEXITY_WEIGHTS, D as DEFAULT_TRACE_ANALYST_KINDS, b as DefaultAnalystRegistryOptions, c as DiffPolicy, F as FAILURE_MODE_KIND_SPEC, f as FindingSubject, g as FindingSubjectKind, i as FindingsDiff, j as FindingsStore, I as IMPROVEMENT_KIND_SPEC, k as KNOWLEDGE_GAP_KIND_SPEC, l as KNOWLEDGE_POISONING_KIND_SPEC, P as PersistedFinding, J as SEMANTIC_CONCEPT_JUDGE_VERSION, m as SKILL_USAGE_ANALYST, a as SemanticConceptJudgeInput, S as SemanticConceptJudgeOptions, L as SemanticConceptJudgeResult, n as SkillUsageAnalyst, M as SuboptimalCode, N as SuboptimalSignal, r as buildDefaultAnalystRegistry, O as computeTraceMetrics, t as createAnalystAi, Q as createSemanticConceptJudge, u as defaultIsMaterial, v as diffFindings, R as runSemanticConceptJudge } from './semantic-concept-judge-qXEUV2w7.js';
+import { l as ChatRequest, p as CreateChatClientOpts } from './types-DRvV0zRo.js';
+export { A as Analyst, a as AnalystContext, g as AnalystCost, c as AnalystFinding, i as AnalystInputKind, j as AnalystRequirements, f as AnalystRunEvent, e as AnalystRunInputs, d as AnalystRunResult, b as AnalystRunSummary, h as AnalystSeverity, k as ChatCallOpts, C as ChatClient, m as ChatResponse, n as ChatTransport, o as CliBridgeTransportOpts, D as DirectProviderTransportOpts, E as EvidenceRef, M as MockTransportOpts, R as RouterTransportOpts, S as SandboxSdkTransportOpts, q as computeFindingId, r as createChatClient, s as makeFinding } from './types-DRvV0zRo.js';
+export { C as CreateTraceAnalystKindOpts, a as RawAnalystFinding, T as TraceAnalystGolden, c as TraceAnalystKindSpec, d as createTraceAnalystKind, r as renderPriorFindings } from './kind-factory-DqV2t1Xk.js';
+export { A as AnalystHooks, a as AnalystRegistry, b as AnalystRegistryOptions, B as BudgetPolicy, R as RegistryRunOpts } from './registry-BK0Zee01.js';
 import { TCloud } from '@tangle-network/tcloud';
 import { B as BenchmarkRunnerConfig, S as Scenario, c as BenchmarkReport, P as ProductClientConfig, C as CheckResult, T as TestResult, d as PersonaConfig, D as DriverResult, e as DriverState, b as JudgeFn, f as CollectedArtifacts, g as ScenarioResult, h as TurnMetrics, i as ScenarioFile, j as CompletionCriterion } from './types-Croy5h7V.js';
 export { A as ArtifactCheck, k as ArtifactResult, E as EvalResult, F as FeedbackPattern, l as JudgeConfig, a as JudgeInput, m as JudgeRubric, J as JudgeScore, n as PersonaRigor, R as RouteMap, o as RubricDimension, p as Turn, q as TurnResult } from './types-Croy5h7V.js';
@@ -14,11 +14,11 @@ import { A as AgentEvalError } from './errors-Dwqw-T_m.js';
 export { a as AgentEvalErrorCode, C as CaptureIntegrityError, b as ConfigError, J as JudgeError, N as NotFoundError, R as ReplayError, V as ValidationError, c as VerificationError } from './errors-Dwqw-T_m.js';
 import { b as FeedbackLabel, F as FeedbackTrajectoryStore, a as FeedbackTrajectory } from './feedback-trajectory-B3rErRsh.js';
 export { c as FeedbackArtifactType, d as FeedbackAttempt, e as FeedbackLabelKind, f as FeedbackLabelSource, g as FeedbackOptimizerRow, h as FeedbackOutcome, i as FeedbackReplayAdapter, j as FeedbackReplayResult, k as FeedbackSeverity, l as FeedbackSplitPolicy, m as FeedbackTask, n as FeedbackTrajectoryFilter, o as FileSystemFeedbackTrajectoryStore, I as InMemoryFeedbackTrajectoryStore, P as PreferenceMemoryEntry, p as ProposedSideEffect, q as assignFeedbackSplit, r as controlRunToFeedbackTrajectory, s as createFeedbackTrajectory, t as feedbackTrajectoriesToDatasetScenarios, u as feedbackTrajectoriesToOptimizerRows, v as feedbackTrajectoryToDatasetScenario, w as feedbackTrajectoryToOptimizerRow, x as parseFeedbackTrajectoriesJsonl, y as renderPreferenceMemoryMarkdown, z as replayFeedbackTrajectories, A as replayFeedbackTrajectory, B as serializeFeedbackTrajectoriesJsonl, C as summarizePreferenceMemory, D as withAssignedFeedbackSplit } from './feedback-trajectory-B3rErRsh.js';
-import { A as AgentProfile$1 } from './agent-profile-DYRboYWu.js';
-export { c as ArtifactCheckArtifact, d as ArtifactEventLike, e as ArtifactValidator, f as BackendIntegrityError, B as BackendIntegrityReport, C as CompletionRequirement, a as CompletionVerdict, b as CorrectnessChecker, L as LlmCorrectnessCheckerOpts, g as ProducedProposal, P as ProducedState, h as ProposalEventLike, i as RequirementCheck, R as RuntimeEventLike, S as SatisfiedBy, T as TaskGold, j as ToolCallEventLike, V as ValidationContext, k as ValidationIssue, l as ValidationResult, m as agentProfileHash, n as assertRealBackend, o as byteLengthRange, p as composeValidators, q as containsAll, r as createLlmCorrectnessChecker, s as extractProducedState, t as jsonHasKeys, u as parseCorrectnessResponse, v as regexMatch, w as summarizeBackendIntegrity, x as verifyCompletion } from './agent-profile-DYRboYWu.js';
+import { A as AgentProfile$1 } from './agent-profile-aSEaJ9Pl.js';
+export { c as ArtifactCheckArtifact, d as ArtifactEventLike, e as ArtifactValidator, f as BackendIntegrityError, B as BackendIntegrityReport, C as CompletionRequirement, a as CompletionVerdict, b as CorrectnessChecker, L as LlmCorrectnessCheckerOpts, g as ProducedProposal, P as ProducedState, h as ProposalEventLike, i as RequirementCheck, R as RuntimeEventLike, S as SatisfiedBy, T as TaskGold, j as ToolCallEventLike, V as ValidationContext, k as ValidationIssue, l as ValidationResult, m as agentProfileHash, n as assertRealBackend, o as byteLengthRange, p as composeValidators, q as containsAll, r as createLlmCorrectnessChecker, s as extractProducedState, t as jsonHasKeys, u as parseCorrectnessResponse, v as regexMatch, w as summarizeBackendIntegrity, x as verifyCompletion } from './agent-profile-aSEaJ9Pl.js';
 export { DataAcquisitionPlan, KnowledgeAcquisitionMode, KnowledgeBundle, KnowledgeFallbackPolicy, KnowledgeFreshness, KnowledgeImportance, KnowledgeReadinessReport, KnowledgeRecommendedAction, KnowledgeRequirement, KnowledgeRequirementCategory, KnowledgeResponsibleSurface, KnowledgeSensitivity, ScoreKnowledgeReadinessOptions, UserQuestion, acquisitionPlansForKnowledgeGaps, blockingKnowledgeEval, knowledgeReadinessTracePayload, scoreKnowledgeReadiness, userQuestionsForKnowledgeGaps } from './knowledge/index.js';
-import { h as ReleaseConfidenceThresholds, f as ReleaseConfidenceScorecard } from './release-report-CN8hJlhk.js';
-export { A as ActionableSideInfo, o as AsiSeverity, B as BootstrapOptions, a as BootstrapResult, J as JudgeReplayGateArgs, R as ReleaseConfidenceAxis, b as ReleaseConfidenceAxisName, c as ReleaseConfidenceInput, d as ReleaseConfidenceIssue, e as ReleaseConfidenceMetrics, g as ReleaseConfidenceStatus, i as ReleaseTraceEvidence, j as RenderReleaseReportOptions, V as Verdict, k as assertReleaseConfidence, l as bootstrapCi, m as evaluateReleaseConfidence, n as judgeReplayGate, r as renderReleaseReport } from './release-report-CN8hJlhk.js';
+import { h as ReleaseConfidenceThresholds, f as ReleaseConfidenceScorecard } from './release-report-CXXZlR8g.js';
+export { A as ActionableSideInfo, o as AsiSeverity, B as BootstrapOptions, a as BootstrapResult, J as JudgeReplayGateArgs, R as ReleaseConfidenceAxis, b as ReleaseConfidenceAxisName, c as ReleaseConfidenceInput, d as ReleaseConfidenceIssue, e as ReleaseConfidenceMetrics, g as ReleaseConfidenceStatus, i as ReleaseTraceEvidence, j as RenderReleaseReportOptions, V as Verdict, k as assertReleaseConfidence, l as bootstrapCi, m as evaluateReleaseConfidence, n as judgeReplayGate, r as renderReleaseReport } from './release-report-CXXZlR8g.js';
 export { C as CliffsMagnitude, c as CorpusAgreementOptions, d as CorpusAgreementPerDimension, e as CorpusAgreementReport, f as CorpusScoreRecord, P as PairedBootstrapOptions, a as PairedBootstrapResult, W as WeightedCompositeInput, g as WeightedCompositeResult, b as benjaminiHochberg, h as bonferroni, i as cliffsDelta, j as cohensD, k as confidenceInterval, l as corpusInterRaterAgreement, m as corpusInterRaterAgreementFromJudgeScores, n as interRaterReliability, o as interpretCliffs, q as mannWhitneyU, r as normalizeScores, p as pairedBootstrap, s as pairedMde, t as pairedTTest, u as partialCredit, v as requiredSampleSize, x as weightedComposite, y as weightedMean, w as wilcoxonSignedRank } from './statistics-B7yCbi9i.js';
 import { a as AnalyzeTracesInput, A as AnalyzeTracesOptions, b as AnalyzeTracesResult } from './analyst-t7zZS3TV.js';
 export { c as AnalyzeTracesTurnSnapshot, d as analyzeTraces } from './analyst-t7zZS3TV.js';
@@ -58,23 +58,22 @@ import { b as Layer, S as Severity, L as LayerResult, c as VerifyContext } from
 export { F as Finding, d as LayerStatus, M as MultiLayerVerifier, a as VerificationReport, V as VerifyOptions, g as gradeSemanticStatus } from './multi-layer-verifier-DlWCXuxL.js';
 import { L as LlmClientOptions } from './llm-client-DbjLfz-K.js';
 export { d as LlmCallError, b as LlmCallRequest, c as LlmCallResult, e as LlmClient, f as LlmMessage, g as LlmRouteAssertionError, a as LlmRouteRequirements, h as LlmUsage, i as assertLlmRoute, j as backoffMs, k as callLlm, l as callLlmJson, m as isTransientLlmError, p as probeLlm, s as stripFencedJson } from './llm-client-DbjLfz-K.js';
-export { B as BENCHMARK_SPLIT_SEED, a as BenchmarkAdapter, b as BenchmarkDatasetItem, c as BenchmarkEvaluation, d as benchmarkDeterministicSplit, i as benchmarks } from './index-DsnOpCO6.js';
-export { C as CallbackResearcher, d as CallbackResearcherOptions, e as CampaignFactoryParams, f as CampaignIntegrityPolicy, g as CampaignRunContext, h as CampaignRunOutcome, i as CampaignRunner, j as CampaignScenario, k as CampaignVariant, c as EvalCampaignOptions, b as EvalCampaignResult, E as ExperimentPlan, a as ExperimentResult, l as FailedRun, F as FailureMode, N as NoopResearcher, R as Researcher, S as SteeringChange, r as runEvalCampaign } from './researcher-C_KJyIGg.js';
-export { G as GainDistributionBin, a as GainDistributionFigureSpec, b as GainDistributionOptions, m as GateDecision, n as GateEvidence, H as HeldOutGate, o as HeldOutGateConfig, q as HeldOutGateRejectionCode, P as ParetoFigureSpec, c as ParetoPoint, R as RESEARCH_REPORT_HARD_PAIR_FLOOR, d as ResearchReport, e as ResearchReportCandidate, f as ResearchReportDecision, g as ResearchReportMethodology, h as ResearchReportOptions, i as ResearchReportRecommendation, S as SummaryTable, j as SummaryTableOptions, k as SummaryTableRow, l as gainHistogram, p as paretoChart, r as researchReport, s as summaryTable } from './summary-report-ByiOUrHj.js';
+export { B as BENCHMARK_SPLIT_SEED, a as BenchmarkAdapter, b as BenchmarkDatasetItem, c as BenchmarkEvaluation, d as benchmarkDeterministicSplit, i as benchmarks } from './index-B1RKber3.js';
+export { C as CallbackResearcher, d as CallbackResearcherOptions, e as CampaignFactoryParams, f as CampaignIntegrityPolicy, g as CampaignRunContext, h as CampaignRunOutcome, i as CampaignRunner, j as CampaignScenario, k as CampaignVariant, c as EvalCampaignOptions, b as EvalCampaignResult, E as ExperimentPlan, a as ExperimentResult, l as FailedRun, F as FailureMode, N as NoopResearcher, R as Researcher, S as SteeringChange, r as runEvalCampaign } from './researcher-rInLj9De.js';
+export { G as GainDistributionBin, a as GainDistributionFigureSpec, b as GainDistributionOptions, m as GateDecision, n as GateEvidence, H as HeldOutGate, o as HeldOutGateConfig, q as HeldOutGateRejectionCode, P as ParetoFigureSpec, c as ParetoPoint, R as RESEARCH_REPORT_HARD_PAIR_FLOOR, d as ResearchReport, e as ResearchReportCandidate, f as ResearchReportDecision, g as ResearchReportMethodology, h as ResearchReportOptions, i as ResearchReportRecommendation, S as SummaryTable, j as SummaryTableOptions, k as SummaryTableRow, l as gainHistogram, p as paretoChart, r as researchReport, s as summaryTable } from './summary-report-BTaXq1TS.js';
 export { I as InterimReleaseConfidence, a as InterimReleaseConfidenceInput, P as PairedEvalueOptions, b as PairedEvalueSequence, c as PairedEvalueStep, S as SequentialDecision, e as evaluateInterimReleaseConfidence, p as pairedEvalueSequence } from './sequential-5iSVfzl2.js';
-import { S as Scenario$1, a as JudgeConfig, G as Gate } from './types-Bba0vl1V.js';
-import { d as GepaDriverConstraints, R as RunImprovementLoopResult } from './run-improvement-loop-BqYH2vCR.js';
+import { S as Scenario$1, a as JudgeConfig, G as Gate } from './types-4mm2msnR.js';
+import { d as GepaDriverConstraints, R as RunImprovementLoopResult } from './run-improvement-loop-BAl_aVOZ.js';
 import '@ax-llm/ax';
 import 'zod';
-import './outcome-store-D6KWmYvj.js';
+import './outcome-store-rnXLEqSn.js';
 /**
- * Automated pull request opener for the production loop.
+ * Automated pull request opener for the improvement loop.
  *
- * `runProductionLoop` produces a `promotedPrompt` string and a release
- * scorecard. To close the eval → prod → eval cycle the framework needs
- * to land that prompt as a reviewable code change. This module does
- * exactly that:
+ * When `runImprovementLoop` ships a winner (`autoOnPromote: 'pr'`) it produces
+ * a promoted surface diff. To close the eval → prod → eval cycle the framework
+ * lands that change as a reviewable code change. This module does exactly that:
  *
  *   1. Stage a branch off `baseBranch`.
  *   2. Write each `fileChange` into the worktree.
@@ -1904,6 +1903,110 @@ declare function collectionPreserved<T, K extends keyof T & string>(key: K, minR
 /** Common check: a status field advanced in an expected order. */
 declare function statusAdvanced<T extends Record<string, unknown>>(key: keyof T & string, progression: readonly string[]): ContinuityCheck<T>;
+/**
+ * UI audit finding — substrate primitive for "what is wrong with the UI?"
+ *
+ * Used by:
+ *   - `@tangle-network/agent-runtime` (ui-auditor profile + delegate) —
+ *     produced as the canonical output of an audit iteration, persisted to
+ *     disk as GitHub-issue Markdown, surfaced over MCP.
+ *   - Downstream ship gates / dashboards / analyst consumers — load and
+ *     transform findings without depending on the runtime.
+ *
+ * Repo layering: agent-eval is the substrate (no upward deps). Consumers
+ * read this type from here; the reverse is forbidden. See CLAUDE.md
+ * "Repo layering" for the rule. A UI finding makes sense WITHOUT a running
+ * agent loop (you can load a saved finding, ship-gate against a set of
+ * them, render them in a dashboard), which puts it firmly in substrate.
+ *
+ * The shape is intentionally minimal — runtime-shaped state (capture
+ * timestamps, OTel trace IDs, sandbox placement) lives on auxiliary
+ * runtime types in `agent-runtime`, not on the finding itself.
+ */
+/**
+ * Canonical audit lenses. Each lens scopes a finding to a single class of
+ * problem so a single audit pass can iterate them without pile-on findings
+ * under a generic label.
+ *
+ * Naming is fixed for cross-package wire compatibility. Treat additions as
+ * a substrate-level decision — analysts, gates, and writers all branch on
+ * the lens.
+ */
+type UiLens = 'consistency' | 'hierarchy' | 'layout' | 'ux-flow' | 'duplication' | 'accessibility' | 'responsive' | 'states' | 'content' | 'interaction' | 'performance-perceived' | 'other';
+/** Frozen tuple of lenses for validation + iteration. */
+declare const UI_LENSES: readonly UiLens[];
+/**
+ * Severity scale — intentionally narrow.
+ *
+ *   - `critical` — blocks a core task or is an accessibility blocker.
+ *   - `high`     — confusing, broken-looking, or noticeable friction.
+ *   - `med`      — visible polish issue, would be caught in code review.
+ *   - `low`      — nitpick worth fixing eventually.
+ */
+type UiFindingSeverity = 'low' | 'med' | 'high' | 'critical';
+/** Frozen severity tuple, ordered worst → least bad for sort/report. */
+declare const UI_FINDING_SEVERITIES: readonly UiFindingSeverity[];
+/**
+ * Pointer to a screenshot referenced by the finding. The path is
+ * intentionally a relative string (relative to the audit workspace root)
+ * so findings remain portable across machines and into GitHub issues.
+ */
+interface UiFindingScreenshot {
+    /** Workspace-relative path to the screenshot file (e.g. `screenshots/home--1280x800--...png`). */
+    path: string;
+    /** Optional viewport the screenshot was taken at, e.g. `1280x800`. */
+    viewport?: string;
+    /** Optional short label that disambiguates multiple captures of the same surface (e.g. `t0`, `step-1`). */
+    label?: string;
+}
+/**
+ * A single UI audit finding — the unit of work a contributor can act on.
+ *
+ * Every field except the documented optionals is required. The shape is
+ * deliberately constraining: a finding without a screenshot, a lens, a
+ * concrete title, and a suggested fix is not actionable, and the auditor
+ * validator hard-fails on those gaps.
+ */
+interface UiFinding {
+    /**
+     * Stable identifier within a single audit workspace. Monotonically
+     * increasing integer (1, 2, …) assigned by the writer when persisting.
+     * Optional in transit (before persistence) — undefined on freshly minted
+     * findings emitted from a loop iteration.
+     */
+    id?: number;
+    /** Concrete title — names the offending element AND what's wrong. */
+    title: string;
+    /** Lens this finding belongs to. */
+    lens: UiLens;
+    /** Severity. */
+    severity: UiFindingSeverity;
+    /** Logical route the finding was observed on (e.g. `home`, `checkout-step-2`). */
+    route: string;
+    /** Fully qualified URL the finding was observed at. */
+    url?: string;
+    /** Viewport string the offending capture was taken at (e.g. `1280x800`). */
+    viewport?: string;
+    /** CSS selector pinning the offending element, when one can be identified. */
+    selector?: string;
+    /** 1–3 sentences describing what the screenshot shows that is wrong. */
+    observation: string;
+    /** Who is affected and how. Concrete user impact. */
+    impact: string;
+    /** A specific change a contributor could apply without asking back. */
+    suggestedFix: string;
+    /** Optional explicit reproduction steps. Writer synthesizes from route/url/selector when omitted. */
+    reproSteps?: string;
+    /** Free-form tags. */
+    tags?: readonly string[];
+    /** Screenshot references — required to be non-empty for actionable findings. */
+    screenshots: readonly UiFindingScreenshot[];
+    /** Cross-references to similar findings already on file, by id. */
+    similarTo?: readonly number[];
+    /** ISO-8601 creation timestamp set by the writer when persisted. */
+    createdAt?: string;
+}
 /**
  * Behavior DSL — pytest-style assertions over a run's trajectory.
  *
@@ -4231,8 +4334,6 @@ declare function createSandboxPool<T>(opts: CreateSandboxPoolOpts<T>): SandboxPo
  * Pipeline-level OTEL integration — auto-attaches an OTEL exporter when
  * OTEL_EXPORTER_OTLP_ENDPOINT is set. Pipelines call `withOtelPipeline()`
  * to get a configured exporter + shutdown handle without manual wiring.
- *
- * Used by: runEvalCampaign, runProductionLoop, runAgentMatrix.
  */
 interface OtelPipelineHandle {
@@ -4718,4 +4819,4 @@ declare namespace index {
   export { type index_AgentProfile as AgentProfile, type index_AgentProfileSection as AgentProfileSection, index_BASELINE_ROLES as BASELINE_ROLES, type index_BaselineRoleKey as BaselineRoleKey, type index_ProfileSkill as ProfileSkill, index_applyDomainPatch as applyDomainPatch, index_baselineProfile as baselineProfile, index_baselineProfileFromRole as baselineProfileFromRole, index_engineerRole as engineerRole, index_generalistRole as generalistRole, index_prodProfile as prodProfile, index_profileToSurface as profileToSurface, index_renderProfile as renderProfile, index_researcherRole as researcherRole, index_sectionHash as sectionHash };
 }
-export { type ActiveLearningOptions, type AdapterRun, AgentDriver, type AgentDriverConfig, AgentEvalError, AgentProfile$1 as AgentProfile, type AgreementResult, type AlignmentOp, AnalyzeTracesInput, AnalyzeTracesOptions, AnalyzeTracesResult, type AntiSlopConfig, type AntiSlopIssue, type AntiSlopReport, type AssertCrossFamilyOptions, type AssertSingleBackendOptions, type AutoPrClient, AxGepaSteeringOptimizer, type AxSteeringOptimizerConfig, type BackendDescriptor, BaselineReport, BehaviorAssertion, BenchmarkReport, BenchmarkRunner, BenchmarkRunnerConfig, type BisectOptions, type BisectResult, type BisectStep, BudgetBreachError, BudgetGuard, BudgetLedgerEntry, BudgetSpec, type BuildAgreementJudgeOptions, CallExpectation, type CanaryAlert, type CanaryKind, type CanaryLeak, type CanaryOptions, type CanaryReport, type CanarySeverity, type CandidateScenario, type CausalAttributionReport, type CellVerdict, CheckResult, CollectedArtifacts, type CommandRunner, type CompareLabels, CompletionCriterion, type ContinuityCheck, type ContinuityCheckResult, type ContinuityReport, type ContinuitySnapshotPair, type ContractMetric, type ContractReport, ConvergenceTracker, type CostEntry, type CostSummary, CostTracker, type CounterfactualContext, type CounterfactualMutation, type CounterfactualResult, type CounterfactualRunner, type CreateDefaultReviewerOptions, type CreateSandboxPoolOpts, CrossFamilyError, type CrossTraceDiff, type CrossTraceDiffOptions, D1ExperimentStore, type D1ExperimentStoreOptions, type D1Like, type D1PreparedStatementLike, DEFAULT_AGENT_SLOS, DEFAULT_FINDERS, DEFAULT_MUTATION_PRIMITIVES, DEFAULT_MUTATORS, DEFAULT_PR_REVIEW_SCORE_WEIGHTS, DEFAULT_SEVERITY_WEIGHTS, Dataset, DatasetScenario, type DecideNextUserTurnOpts, type DeployFamily, type DeployGateLayerInput, type DeployRunResult, type DeployRunner, type DiffScorecardOptions, type DirEntry, type DiscoverPersonasOptions, type DiscoveredPersona, DriverResult, DriverState, DualAgentBench, type 
DualAgentBenchConfig, type DualAgentReport, type DualAgentRound, type DualAgentScenario, type DualAgentScenarioResult, ERROR_COUNT_PATTERNS, type ErrorCountPattern, type EvolutionRound, type ExecutorConfig, type Expectation, type Experiment, type Run as ExperimentRun, type ExperimentStore, ExperimentTracker, type ExportedRewardModel, type ExtractOptions, type ExtractResult, type FactorContribution, type FactorialCell, FeedbackLabel, FeedbackTrajectory, FeedbackTrajectoryStore, type FieldAgreementSpec, type FileChange, FileSystemExperimentStore, type FileSystemExperimentStoreOptions, type FlowAction, type FlowLayerEnv, type FlowLayerFactoryInput, type FlowRunner, type FlowRunnerStepResult, type FlowSpec, type FlowStep, type GhCliClientOptions, type GoldScenario, type GoldSplit, type GoldenSeverity, type GoldenSpec, HarnessConfig, HoldoutAuditor, type HostedJudgeConfig, type HostedJudgeDimension, type HostedJudgeRequest, type HostedJudgeResponse, type HostedRunCriticConfig, type HostedRunScoreRequest, type HostedRunScoreResponse, type HttpGithubClientOptions, type HypothesisManifest, type HypothesisResult, INTENT_MATCH_JUDGE_VERSION, type ImageData, InMemoryExperimentStore, InMemoryWorkspaceInspector, type InferenceScorer, type InspectorContext, type IntentMatchInput, type IntentMatchOptions, type IntentMatchResult, type InteractionContribution, type JudgeFamily, type JudgeFleetOptions, JudgeFn, type JudgeReplayResult, type JudgeRetryOutcome, type JudgeRetryPolicy, JudgeRunner, type KeywordConceptSpec, type KeywordCoverageFinding, type KeywordCoverageOptions, type KeywordCoverageResult, type LangfuseEnvelope, type LangfuseGeneration, type LangfuseScore, Layer, LayerResult, type LiveProofArtifact, type LiveProofConfig, type LiveProofContext, type LiveProofResult, LlmClientOptions, LlmSpan, LockedJsonlAppender, MODEL_PRICING, type MatchResult, type MatcherResult, type MergeOptions, MetricsCollector, type MuffledFinder, type MuffledFinding, type 
MultiToolchainLayerConfig, type Mutator, Mutex, type Oracle, type OracleObservation, type OracleReport, type OracleResult, type OrthogonalityInput, type OrthogonalityResult, OtelExportConfig, OtelExporter, type OtelPipelineHandle, type OtelPipelineOptions, PairwiseSteeringOptimizer, type ParaphraseRobustnessScenarioInput, type ParaphraseRobustnessScenarioResult, type ParseStudentLabel, PersonaConfig, type Playbook, type PlaybookEntry, type PoolSlot, type PrReviewAuditCase, type PrReviewBenchmarkSummary, type PrReviewComment, type PrReviewMatchedFinding, type PrReviewOutcome, type PrReviewReferenceFinding, type PrReviewScore, type PrReviewScoreWeights, type PrReviewSeverity, type PrReviewSource, ProductClient, ProductClientConfig, type PromptHandle, PromptRegistry, type ProposeAutomatedPullRequestInput, type ProposeAutomatedPullRequestResult, type RecordRunsOptions, type ReferenceMatchResult, type ReferenceReplayAdapter, type ReferenceReplayAdapterFn, type ReferenceReplayAdapterLike, type ReferenceReplayAggregate, type ReferenceReplayCandidate, type ReferenceReplayCase, type ReferenceReplayCaseRun, type ReferenceReplayExecutionScenario, type ReferenceReplayItem, type ReferenceReplayMatch, type ReferenceReplayMatchStrategy, type ReferenceReplayMatcher, type ReferenceReplayPromotionDecision, type ReferenceReplayPromotionPolicy, type ReferenceReplayRun, type ReferenceReplayRunContext, type ReferenceReplayRunOptions, type ReferenceReplayRunStore, type ReferenceReplayScenario, type ReferenceReplayScenarioScore, type ReferenceReplayScore, type ReferenceReplayScoreOptions, type ReferenceReplaySplit, type ReferenceReplaySplitComparison, type ReferenceReplaySteeringRowsOptions, type ReflectionContext, type ReflectionProposal, ReleaseConfidenceScorecard, ReleaseConfidenceThresholds, type RenderStudentPrompt, type RepoRef, type ReviewerMemoryEntry, type ReviewerOutput, type ReviewerPromptInput, type ReviewerSoftFailDefaults, type ReviewerVerificationSummary, type 
RobustnessResult, Run$1 as Run, type RunCommandInput, type RunCommandResult, type RunConfig, RunCriticOptions, type RunDiff, type RunDistillationOptions, type RunDistillationResult, RunFilter, RunRecord, RunScore, RunScoreWeights, RunTrace, SandboxDriver, SandboxHarnessResult, type SandboxJudgeKind, type SandboxJudgeResult, type SandboxJudgeSpec, type SandboxPool, type ScanOptions, Scenario, type ScenarioCost, ScenarioFile, ScenarioRegistry, ScenarioResult, type Scorecard, type ScorecardCell, type ScorecardCellDiff, type ScorecardDiff, type ScorecardEntry, type ScorecardLogLine, type ScoredTarget, type SelfPlayOptions, type SelfPlayProposer, type SelfPlayScorer, type SeriesConvergenceOptions, type SeriesConvergenceResult, Severity, type SignedManifest, type SignedManifestAlgo, type SingleBackendDivergence, SingleBackendError, type SingleBackendField, type SingleBackendReport, type Slo, type SloCheckResult, type SloComparator, type SloReport, type SloSeverity, type SlopCategory, type SlotFactory, type SplitGoldOptions, SteeringBundle, type SteeringOptimizationResult, type SteeringOptimizationRow, type SteeringOptimizationSelector, type SteeringOptimizerBackend, type SteeringOptimizerConfig, type StepAttribution, type SynthesisReason, type SynthesisTarget, TestResult, type ThresholdContract, TokenCounter, type TokenSpec, TraceEmitter, TraceStore, type TracedAnalystOptions, type TracedJudgeOptions, Trajectory, TrajectoryStep, type TrialTrace, TurnMetrics, UNIVERSAL_FINDERS, VerifyContext, type VisualDiffOptions, type VisualDiffResult, type ViteDeployRunnerInput, type WorkspaceAssertion, type WorkspaceAssertionResult, type WorkspaceInspector, type WorkspaceSnapshot, type WranglerDeployRunnerInput, adversarialJudge, aggregatePrReviewScore, analyzeAntiSlop, analyzeSeries, appendScorecard, assertCrossFamily, assertSingleBackend, attributeCounterfactuals, bisect, buildAgreementJudge, buildDriverSystemPrompt, buildReflectionPrompt, buildReviewerPrompt, canaryLeakView, 
canonicalize, causalAttribution, checkBehavioralCanary, checkCanaries, checkSlos, codeExecutionJudge, coherenceJudge, collectionPreserved, commentsForSource, commitBisect, compareReferenceReplay, compilerJudge, createAntiSlopJudge, createCustomJudge, createDefaultReviewer, createDomainExpertJudge, createIntentMatchJudge, createSandboxPool, crossTraceDiff, decideNextUserTurn, decideReferenceReplayPromotion, decideReferenceReplayRunPromotion, defaultJudges, defaultParseStudentLabel, defaultReferenceReplayMatcher, defaultRenderStudentPrompt, deployGateLayer, diffScorecard, discoverPersonas, distillPlaybook, estimateCost, estimateTokens, evaluateContract, evaluateHypothesis, evaluateOracles, executeScenario, expectAgent, exportRewardModel, extractAssetUrls, extractErrorCount, fieldAgreement, fileContains, fileExists, findAutoMatchNoExpectation, findConstructorCwdDropped, findFallbackToPass, findLiteralTruePass, findSkipCountsAsPass, flowLayer, formatBenchmarkReport, formatDriverReport, formatFindings, formatScorecardDiff, ghCliClient, precision as goldenPrecision, hashContent, hashJson, htmlContainsElement, httpGithubClient, inMemoryReferenceReplayStore, isModelPriced, isOtelConfigured, jsonShape, jsonlReferenceReplayStore, judgeFamily, keyPreserved, linterJudge, loadGoldScenarios, loadScorecard, loadScorerFromGrader, localCommandRunner, lowercaseMutator, matchGoldens, mergeLayerResults, multiToolchainLayer, notBlocked, paraphraseRobustness, paraphraseRobustnessScenarios, parseGoldJsonl, parseReflectionResponse, passOrthogonality, pixelDeltaRatio, politenessPrefixMutator, printDriverSummary, index as profile, promptBisect, proposeAutomatedPullRequest, proposeSynthesisTargets, recordRuns, recordRunsToScorecard, referenceReplayRunsToSteeringRows, referenceReplayScenarioToRunScore, regexMatches, renderMarkdownReport, renderPlaybookMarkdown, replayScorerOverCorpus, replayTraceThroughJudge, resetLockedAppendersForTesting, resolveModelPricing, rowCount, rowWhere, 
runAssertions, runBehavioralCanaries, runCanaries, runCounterfactual, runDistillation, runE2EWorkflow, runExpectations, runIntentMatchJudge, runJudgeFleet, runKeywordCoverageJudge, runKeywordCoverageJudgeUrl, runLiveProof, runReferenceReplay, runSelfPlay, scanForMuffledGates, scoreContinuity, scorePrReviewComments, scorePrReviewSource, scoreReferenceReplay, securityJudge, sentenceReorderMutator, signManifest, splitGold, statusAdvanced, summarizePrReviewBenchmark, testJudge, textInSnapshot, toLangfuseEnvelope, toPrometheusText, traceJudge, traceJudgeEnsemble, tracedAnalyzeTraces, typoMutator, urlContains, verifyManifest, visualDiff, viteDeployRunner, weightedRecall, whitespaceCollapseMutator, withJudgeRetry, withOtelPipeline, wranglerDeployRunner };
+export { type ActiveLearningOptions, type AdapterRun, AgentDriver, type AgentDriverConfig, AgentEvalError, AgentProfile$1 as AgentProfile, type AgreementResult, type AlignmentOp, AnalyzeTracesInput, AnalyzeTracesOptions, AnalyzeTracesResult, type AntiSlopConfig, type AntiSlopIssue, type AntiSlopReport, type AssertCrossFamilyOptions, type AssertSingleBackendOptions, type AutoPrClient, AxGepaSteeringOptimizer, type AxSteeringOptimizerConfig, type BackendDescriptor, BaselineReport, BehaviorAssertion, BenchmarkReport, BenchmarkRunner, BenchmarkRunnerConfig, type BisectOptions, type BisectResult, type BisectStep, BudgetBreachError, BudgetGuard, BudgetLedgerEntry, BudgetSpec, type BuildAgreementJudgeOptions, CallExpectation, type CanaryAlert, type CanaryKind, type CanaryLeak, type CanaryOptions, type CanaryReport, type CanarySeverity, type CandidateScenario, type CausalAttributionReport, type CellVerdict, ChatRequest, CheckResult, CollectedArtifacts, type CommandRunner, type CompareLabels, CompletionCriterion, type ContinuityCheck, type ContinuityCheckResult, type ContinuityReport, type ContinuitySnapshotPair, type ContractMetric, type ContractReport, ConvergenceTracker, type CostEntry, type CostSummary, CostTracker, type CounterfactualContext, type CounterfactualMutation, type CounterfactualResult, type CounterfactualRunner, CreateChatClientOpts, type CreateDefaultReviewerOptions, type CreateSandboxPoolOpts, CrossFamilyError, type CrossTraceDiff, type CrossTraceDiffOptions, D1ExperimentStore, type D1ExperimentStoreOptions, type D1Like, type D1PreparedStatementLike, DEFAULT_AGENT_SLOS, DEFAULT_FINDERS, DEFAULT_MUTATION_PRIMITIVES, DEFAULT_MUTATORS, DEFAULT_PR_REVIEW_SCORE_WEIGHTS, DEFAULT_SEVERITY_WEIGHTS, Dataset, DatasetScenario, type DecideNextUserTurnOpts, type DeployFamily, type DeployGateLayerInput, type DeployRunResult, type DeployRunner, type DiffScorecardOptions, type DirEntry, type DiscoverPersonasOptions, type DiscoveredPersona, DriverResult, DriverState, 
DualAgentBench, type DualAgentBenchConfig, type DualAgentReport, type DualAgentRound, type DualAgentScenario, type DualAgentScenarioResult, ERROR_COUNT_PATTERNS, type ErrorCountPattern, type EvolutionRound, type ExecutorConfig, type Expectation, type Experiment, type Run as ExperimentRun, type ExperimentStore, ExperimentTracker, type ExportedRewardModel, type ExtractOptions, type ExtractResult, type FactorContribution, type FactorialCell, FeedbackLabel, FeedbackTrajectory, FeedbackTrajectoryStore, type FieldAgreementSpec, type FileChange, FileSystemExperimentStore, type FileSystemExperimentStoreOptions, type FlowAction, type FlowLayerEnv, type FlowLayerFactoryInput, type FlowRunner, type FlowRunnerStepResult, type FlowSpec, type FlowStep, type GhCliClientOptions, type GoldScenario, type GoldSplit, type GoldenSeverity, type GoldenSpec, HarnessConfig, HoldoutAuditor, type HostedJudgeConfig, type HostedJudgeDimension, type HostedJudgeRequest, type HostedJudgeResponse, type HostedRunCriticConfig, type HostedRunScoreRequest, type HostedRunScoreResponse, type HttpGithubClientOptions, type HypothesisManifest, type HypothesisResult, INTENT_MATCH_JUDGE_VERSION, type ImageData, InMemoryExperimentStore, InMemoryWorkspaceInspector, type InferenceScorer, type InspectorContext, type IntentMatchInput, type IntentMatchOptions, type IntentMatchResult, type InteractionContribution, type JudgeFamily, type JudgeFleetOptions, JudgeFn, type JudgeReplayResult, type JudgeRetryOutcome, type JudgeRetryPolicy, JudgeRunner, type KeywordConceptSpec, type KeywordCoverageFinding, type KeywordCoverageOptions, type KeywordCoverageResult, type LangfuseEnvelope, type LangfuseGeneration, type LangfuseScore, Layer, LayerResult, type LiveProofArtifact, type LiveProofConfig, type LiveProofContext, type LiveProofResult, LlmClientOptions, LlmSpan, LockedJsonlAppender, MODEL_PRICING, type MatchResult, type MatcherResult, type MergeOptions, MetricsCollector, type MuffledFinder, type MuffledFinding, type 
MultiToolchainLayerConfig, type Mutator, Mutex, type Oracle, type OracleObservation, type OracleReport, type OracleResult, type OrthogonalityInput, type OrthogonalityResult, OtelExportConfig, OtelExporter, type OtelPipelineHandle, type OtelPipelineOptions, PairwiseSteeringOptimizer, type ParaphraseRobustnessScenarioInput, type ParaphraseRobustnessScenarioResult, type ParseStudentLabel, PersonaConfig, type Playbook, type PlaybookEntry, type PoolSlot, type PrReviewAuditCase, type PrReviewBenchmarkSummary, type PrReviewComment, type PrReviewMatchedFinding, type PrReviewOutcome, type PrReviewReferenceFinding, type PrReviewScore, type PrReviewScoreWeights, type PrReviewSeverity, type PrReviewSource, ProductClient, ProductClientConfig, type PromptHandle, PromptRegistry, type ProposeAutomatedPullRequestInput, type ProposeAutomatedPullRequestResult, type RecordRunsOptions, type ReferenceMatchResult, type ReferenceReplayAdapter, type ReferenceReplayAdapterFn, type ReferenceReplayAdapterLike, type ReferenceReplayAggregate, type ReferenceReplayCandidate, type ReferenceReplayCase, type ReferenceReplayCaseRun, type ReferenceReplayExecutionScenario, type ReferenceReplayItem, type ReferenceReplayMatch, type ReferenceReplayMatchStrategy, type ReferenceReplayMatcher, type ReferenceReplayPromotionDecision, type ReferenceReplayPromotionPolicy, type ReferenceReplayRun, type ReferenceReplayRunContext, type ReferenceReplayRunOptions, type ReferenceReplayRunStore, type ReferenceReplayScenario, type ReferenceReplayScenarioScore, type ReferenceReplayScore, type ReferenceReplayScoreOptions, type ReferenceReplaySplit, type ReferenceReplaySplitComparison, type ReferenceReplaySteeringRowsOptions, type ReflectionContext, type ReflectionProposal, ReleaseConfidenceScorecard, ReleaseConfidenceThresholds, type RenderStudentPrompt, type RepoRef, type ReviewerMemoryEntry, type ReviewerOutput, type ReviewerPromptInput, type ReviewerSoftFailDefaults, type ReviewerVerificationSummary, type 
RobustnessResult, Run$1 as Run, type RunCommandInput, type RunCommandResult, type RunConfig, RunCriticOptions, type RunDiff, type RunDistillationOptions, type RunDistillationResult, RunFilter, RunRecord, RunScore, RunScoreWeights, RunTrace, SandboxDriver, SandboxHarnessResult, type SandboxJudgeKind, type SandboxJudgeResult, type SandboxJudgeSpec, type SandboxPool, type ScanOptions, Scenario, type ScenarioCost, ScenarioFile, ScenarioRegistry, ScenarioResult, type Scorecard, type ScorecardCell, type ScorecardCellDiff, type ScorecardDiff, type ScorecardEntry, type ScorecardLogLine, type ScoredTarget, type SelfPlayOptions, type SelfPlayProposer, type SelfPlayScorer, type SeriesConvergenceOptions, type SeriesConvergenceResult, Severity, type SignedManifest, type SignedManifestAlgo, type SingleBackendDivergence, SingleBackendError, type SingleBackendField, type SingleBackendReport, type Slo, type SloCheckResult, type SloComparator, type SloReport, type SloSeverity, type SlopCategory, type SlotFactory, type SplitGoldOptions, SteeringBundle, type SteeringOptimizationResult, type SteeringOptimizationRow, type SteeringOptimizationSelector, type SteeringOptimizerBackend, type SteeringOptimizerConfig, type StepAttribution, type SynthesisReason, type SynthesisTarget, TestResult, type ThresholdContract, TokenCounter, type TokenSpec, TraceEmitter, TraceStore, type TracedAnalystOptions, type TracedJudgeOptions, Trajectory, TrajectoryStep, type TrialTrace, TurnMetrics, UI_FINDING_SEVERITIES, UI_LENSES, UNIVERSAL_FINDERS, type UiFinding, type UiFindingScreenshot, type UiFindingSeverity, type UiLens, VerifyContext, type VisualDiffOptions, type VisualDiffResult, type ViteDeployRunnerInput, type WorkspaceAssertion, type WorkspaceAssertionResult, type WorkspaceInspector, type WorkspaceSnapshot, type WranglerDeployRunnerInput, adversarialJudge, aggregatePrReviewScore, analyzeAntiSlop, analyzeSeries, appendScorecard, assertCrossFamily, assertSingleBackend, attributeCounterfactuals, 
bisect, buildAgreementJudge, buildDriverSystemPrompt, buildReflectionPrompt, buildReviewerPrompt, canaryLeakView, canonicalize, causalAttribution, checkBehavioralCanary, checkCanaries, checkSlos, codeExecutionJudge, coherenceJudge, collectionPreserved, commentsForSource, commitBisect, compareReferenceReplay, compilerJudge, createAntiSlopJudge, createCustomJudge, createDefaultReviewer, createDomainExpertJudge, createIntentMatchJudge, createSandboxPool, crossTraceDiff, decideNextUserTurn, decideReferenceReplayPromotion, decideReferenceReplayRunPromotion, defaultJudges, defaultParseStudentLabel, defaultReferenceReplayMatcher, defaultRenderStudentPrompt, deployGateLayer, diffScorecard, discoverPersonas, distillPlaybook, estimateCost, estimateTokens, evaluateContract, evaluateHypothesis, evaluateOracles, executeScenario, expectAgent, exportRewardModel, extractAssetUrls, extractErrorCount, fieldAgreement, fileContains, fileExists, findAutoMatchNoExpectation, findConstructorCwdDropped, findFallbackToPass, findLiteralTruePass, findSkipCountsAsPass, flowLayer, formatBenchmarkReport, formatDriverReport, formatFindings, formatScorecardDiff, ghCliClient, precision as goldenPrecision, hashContent, hashJson, htmlContainsElement, httpGithubClient, inMemoryReferenceReplayStore, isModelPriced, isOtelConfigured, jsonShape, jsonlReferenceReplayStore, judgeFamily, keyPreserved, linterJudge, loadGoldScenarios, loadScorecard, loadScorerFromGrader, localCommandRunner, lowercaseMutator, matchGoldens, mergeLayerResults, multiToolchainLayer, notBlocked, paraphraseRobustness, paraphraseRobustnessScenarios, parseGoldJsonl, parseReflectionResponse, passOrthogonality, pixelDeltaRatio, politenessPrefixMutator, printDriverSummary, index as profile, promptBisect, proposeAutomatedPullRequest, proposeSynthesisTargets, recordRuns, recordRunsToScorecard, referenceReplayRunsToSteeringRows, referenceReplayScenarioToRunScore, regexMatches, renderMarkdownReport, renderPlaybookMarkdown, 
replayScorerOverCorpus, replayTraceThroughJudge, resetLockedAppendersForTesting, resolveModelPricing, rowCount, rowWhere, runAssertions, runBehavioralCanaries, runCanaries, runCounterfactual, runDistillation, runE2EWorkflow, runExpectations, runIntentMatchJudge, runJudgeFleet, runKeywordCoverageJudge, runKeywordCoverageJudgeUrl, runLiveProof, runReferenceReplay, runSelfPlay, scanForMuffledGates, scoreContinuity, scorePrReviewComments, scorePrReviewSource, scoreReferenceReplay, securityJudge, sentenceReorderMutator, signManifest, splitGold, statusAdvanced, summarizePrReviewBenchmark, testJudge, textInSnapshot, toLangfuseEnvelope, toPrometheusText, traceJudge, traceJudgeEnsemble, tracedAnalyzeTraces, typoMutator, urlContains, verifyManifest, visualDiff, viteDeployRunner, weightedRecall, whitespaceCollapseMutator, withJudgeRetry, withOtelPipeline, wranglerDeployRunner };

package/dist/index.js CHANGED Viewed

@@ -42,7 +42,7 @@ import {
   scoreRedTeamOutput,
   surfaceContentHash,
   toolNamesForRun
-} from "./chunk-JYE3WOTE.js";
+} from "./chunk-RPLZ4OIB.js";
 import {
   BackendIntegrityError,
   assertRealBackend,
@@ -114,7 +114,7 @@ import {
   diffFindings,
   resetLockedAppendersForTesting,
   runSemanticConceptJudge
-} from "./chunk-7W4SM7FD.js";
+} from "./chunk-5LVWPNS5.js";
 import {
   AnalystRegistry,
   DEFAULT_TRACE_ANALYST_KINDS,
@@ -126,7 +126,7 @@ import {
   createTraceAnalystKind,
   makeFinding,
   renderPriorFindings
-} from "./chunk-WYIHD6EB.js";
+} from "./chunk-CF67I6QY.js";
 import {
   controlFailureClassFromVerification,
   controlRunToRunRecord,
@@ -137,7 +137,7 @@ import {
   runProposeReview,
   runProposeReviewAsControlLoop,
   scoreFromEvals
-} from "./chunk-6EKXFFGQ.js";
+} from "./chunk-RTWFUK6A.js";
 import {
   allCriticalPassed,
   objectiveEval,
@@ -155,7 +155,7 @@ import {
 } from "./chunk-B26KI423.js";
 import {
   runEvalCampaign
-} from "./chunk-GJJNJVIR.js";
+} from "./chunk-XXNIODOM.js";
 import {
   LlmCallError,
   LlmClient,
@@ -233,7 +233,7 @@ import {
   scoreTraceInsightReadiness,
   tokenizeDomainWords,
   traceAnalystOnRunComplete
-} from "./chunk-XGNCBAVZ.js";
+} from "./chunk-XQL22JDG.js";
 import {
   DEFAULT_REDACTION_RULES,
   REDACTION_VERSION,
@@ -312,7 +312,7 @@ import {
   validateAgentProfileCell,
   validateRunRecord,
   verifyAgentProfileCell
-} from "./chunk-F3SRAAZO.js";
+} from "./chunk-KWRRMR3J.js";
 import {
   TraceEmitter,
   llmSpanFromProvider
@@ -4643,6 +4643,28 @@ function statusAdvanced(key, progression) {
   };
 }
+// src/ui-finding.ts
+var UI_LENSES = [ // exported list of the 12 UI-finding lens names; presumably the runtime counterpart of the `UiLens` type — confirm in src/ui-finding.ts
+  "consistency",
+  "hierarchy",
+  "layout",
+  "ux-flow",
+  "duplication",
+  "accessibility",
+  "responsive",
+  "states",
+  "content",
+  "interaction",
+  "performance-perceived",
+  "other"
+];
+var UI_FINDING_SEVERITIES = [ // exported list of UI-finding severity labels, listed from "critical" down to "low"
+  "critical",
+  "high",
+  "med", // NOTE(review): the abbreviated "med" (not "medium") is the literal runtime value
+  "low"
+];
 // src/behavior-dsl.ts
 var BehaviorAssertion = class {
   constructor(store, runId) {
@@ -8680,6 +8702,8 @@ export {
   TraceEmitter,
   TraceFileMissingError,
   TraceNotFoundError,
+  UI_FINDING_SEVERITIES,
+  UI_LENSES,
   UNIVERSAL_FINDERS,
   ValidationError,
   VerificationError,
@@ -8768,6 +8792,7 @@ export {
   corpusInterRaterAgreementFromJudgeScores,
   createAnalystAi,
   createAntiSlopJudge,
+  createChatClient,
   createCustomJudge,
   createDefaultReviewer,
   createDomainExpertJudge,