@tangle-network/agent-eval 0.76.0 → 0.79.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (72)
  1. package/dist/adapters/http.d.ts +2 -2
  2. package/dist/adapters/langchain.d.ts +2 -2
  3. package/dist/adapters/otel.d.ts +4 -4
  4. package/dist/{agent-profile-DYRboYWu.d.ts → agent-profile-aSEaJ9Pl.d.ts} +1 -1
  5. package/dist/analyst/index.d.ts +42 -8
  6. package/dist/analyst/index.js +32 -2
  7. package/dist/analyst/index.js.map +1 -1
  8. package/dist/authenticity/index.d.ts +161 -0
  9. package/dist/authenticity/index.js +215 -0
  10. package/dist/authenticity/index.js.map +1 -0
  11. package/dist/benchmarks/index.d.ts +2 -2
  12. package/dist/campaign/index.d.ts +11 -11
  13. package/dist/campaign/index.js +4 -4
  14. package/dist/{chunk-7W4SM7FD.js → chunk-5LVWPNS5.js} +91 -91
  15. package/dist/chunk-5LVWPNS5.js.map +1 -0
  16. package/dist/{chunk-WYIHD6EB.js → chunk-CF67I6QY.js} +1 -1
  17. package/dist/chunk-CF67I6QY.js.map +1 -0
  18. package/dist/{chunk-XPILG2CA.js → chunk-GXHLRXDI.js} +2 -2
  19. package/dist/{chunk-F3SRAAZO.js → chunk-KWRRMR3J.js} +15 -1
  20. package/dist/chunk-KWRRMR3J.js.map +1 -0
  21. package/dist/{chunk-JYE3WOTE.js → chunk-RPLZ4OIB.js} +10 -1
  22. package/dist/chunk-RPLZ4OIB.js.map +1 -0
  23. package/dist/{chunk-6EKXFFGQ.js → chunk-RTWFUK6A.js} +2 -2
  24. package/dist/{chunk-XGNCBAVZ.js → chunk-XQL22JDG.js} +2 -2
  25. package/dist/{chunk-GJJNJVIR.js → chunk-XXNIODOM.js} +2 -2
  26. package/dist/contract/index.d.ts +12 -12
  27. package/dist/contract/index.js +2 -2
  28. package/dist/{control-BgA6BYTm.d.ts → control-CehLtoET.d.ts} +1 -1
  29. package/dist/control.d.ts +2 -2
  30. package/dist/control.js +2 -2
  31. package/dist/hosted/index.d.ts +4 -4
  32. package/dist/{index-DsnOpCO6.d.ts → index-B1RKber3.d.ts} +1 -1
  33. package/dist/index.d.ts +126 -25
  34. package/dist/index.js +32 -7
  35. package/dist/index.js.map +1 -1
  36. package/dist/{insight-report-Df3lxYXM.d.ts → insight-report-dlpEzQDi.d.ts} +1 -1
  37. package/dist/{kind-factory-DW9XWPvM.d.ts → kind-factory-DqV2t1Xk.d.ts} +1 -1
  38. package/dist/meta-eval/index.d.ts +2 -2
  39. package/dist/openapi.json +1 -1
  40. package/dist/{provenance-B-TFszPW.d.ts → provenance-CEAJI9rm.d.ts} +3 -3
  41. package/dist/{registry-DuVYiTvw.d.ts → registry-BmEuU94S.d.ts} +2 -2
  42. package/dist/{release-report-CN8hJlhk.d.ts → release-report-CXXZlR8g.d.ts} +2 -2
  43. package/dist/reporting.d.ts +4 -4
  44. package/dist/{researcher-C_KJyIGg.d.ts → researcher-rInLj9De.d.ts} +2 -2
  45. package/dist/rl.d.ts +6 -6
  46. package/dist/rl.js +2 -2
  47. package/dist/{rubric-predictive-validity-D_4BSXGV.d.ts → rubric-predictive-validity-CWyWWLBg.d.ts} +1 -1
  48. package/dist/{run-improvement-loop-BqYH2vCR.d.ts → run-improvement-loop-Bgu4C59E.d.ts} +2 -4
  49. package/dist/{run-record-BgTFzO2r.d.ts → run-record-sItO5ftF.d.ts} +11 -0
  50. package/dist/{semantic-concept-judge-CV9Wlx4t.d.ts → semantic-concept-judge-Du4ZVyef.d.ts} +3 -3
  51. package/dist/{summary-report-ByiOUrHj.d.ts → summary-report-BTaXq1TS.d.ts} +1 -1
  52. package/dist/traces.d.ts +1 -1
  53. package/dist/traces.js +2 -2
  54. package/dist/{types-CRD68aH7.d.ts → types-DRvV0zRo.d.ts} +10 -1
  55. package/dist/{types-Bba0vl1V.d.ts → types-QHG0KnkF.d.ts} +11 -3
  56. package/dist/workflow/index.d.ts +4 -4
  57. package/dist/workflow/index.js +1 -1
  58. package/docs/auto-research-loop-end-to-end.md +1 -1
  59. package/docs/feature-guide.md +4 -4
  60. package/docs/multi-shot-optimization.md +61 -115
  61. package/docs/product-eval-adoption.md +1 -1
  62. package/docs/three-package-architecture.md +1 -1
  63. package/docs/trace-analysis.md +19 -0
  64. package/package.json +6 -1
  65. package/dist/chunk-7W4SM7FD.js.map +0 -1
  66. package/dist/chunk-F3SRAAZO.js.map +0 -1
  67. package/dist/chunk-JYE3WOTE.js.map +0 -1
  68. package/dist/chunk-WYIHD6EB.js.map +0 -1
  69. package/dist/{chunk-XPILG2CA.js.map → chunk-GXHLRXDI.js.map} +0 -0
  70. package/dist/{chunk-6EKXFFGQ.js.map → chunk-RTWFUK6A.js.map} +0 -0
  71. package/dist/{chunk-XGNCBAVZ.js.map → chunk-XQL22JDG.js.map} +0 -0
  72. package/dist/{chunk-GJJNJVIR.js.map → chunk-XXNIODOM.js.map} +0 -0
@@ -3,7 +3,7 @@ import { C as ControlEvalResult, a as ControlRunResult, h as ControlRuntimeConfi
3
3
  import { T as TraceEmitter } from './emitter-DEZwY14K.js';
4
4
  import { F as FailureClass } from './schema-m0gsnbt3.js';
5
5
  import { T as TraceStore } from './store-CKUAgsJz.js';
6
- import { b as RunSplitTag, a as RunTokenUsage, R as RunRecord } from './run-record-BgTFzO2r.js';
6
+ import { b as RunSplitTag, a as RunTokenUsage, R as RunRecord } from './run-record-sItO5ftF.js';
7
7
 
8
8
  interface ActionExecutionPolicy {
9
9
  allowedTypes?: string[];
package/dist/control.d.ts CHANGED
@@ -1,4 +1,4 @@
1
- export { A as ActionExecutionPolicy, a as ActionPolicyDecision, C as ControlRunToRunRecordOptions, e as ProposeReviewConfig, f as ProposeReviewControlAction, g as ProposeReviewControlConfig, h as ProposeReviewControlResult, i as ProposeReviewControlState, j as ProposeReviewReport, p as RunEvidenceMetadata, s as controlRunToRunRecord, u as evaluateActionPolicy, x as runProposeReview, y as runProposeReviewAsControlLoop, z as scoreFromEvals } from './control-BgA6BYTm.js';
1
+ export { A as ActionExecutionPolicy, a as ActionPolicyDecision, C as ControlRunToRunRecordOptions, e as ProposeReviewConfig, f as ProposeReviewControlAction, g as ProposeReviewControlConfig, h as ProposeReviewControlResult, i as ProposeReviewControlState, j as ProposeReviewReport, p as RunEvidenceMetadata, s as controlRunToRunRecord, u as evaluateActionPolicy, x as runProposeReview, y as runProposeReviewAsControlLoop, z as scoreFromEvals } from './control-CehLtoET.js';
2
2
  export { c as ControlActionFailureMode, d as ControlActionOutcome, e as ControlBudget, f as ControlContext, g as ControlDecision, C as ControlEvalResult, a as ControlRunResult, h as ControlRuntimeConfig, i as ControlRuntimeError, j as ControlSeverity, b as ControlStep, k as ControlStopPolicies, S as StopDecision, l as allCriticalPassed, o as objectiveEval, r as runAgentControlLoop, s as stopOnNoProgress, m as stopOnRepeatedAction, n as subjectiveEval } from './control-runtime-DuFBYg7A.js';
3
3
  import './feedback-trajectory-B3rErRsh.js';
4
4
  import './dataset-B2kL-fSM.js';
@@ -6,4 +6,4 @@ import './errors-Dwqw-T_m.js';
6
6
  import './emitter-DEZwY14K.js';
7
7
  import './schema-m0gsnbt3.js';
8
8
  import './store-CKUAgsJz.js';
9
- import './run-record-BgTFzO2r.js';
9
+ import './run-record-sItO5ftF.js';
package/dist/control.js CHANGED
@@ -4,7 +4,7 @@ import {
4
4
  runProposeReview,
5
5
  runProposeReviewAsControlLoop,
6
6
  scoreFromEvals
7
- } from "./chunk-6EKXFFGQ.js";
7
+ } from "./chunk-RTWFUK6A.js";
8
8
  import {
9
9
  allCriticalPassed,
10
10
  objectiveEval,
@@ -13,7 +13,7 @@ import {
13
13
  stopOnRepeatedAction,
14
14
  subjectiveEval
15
15
  } from "./chunk-NCRFYPS3.js";
16
- import "./chunk-F3SRAAZO.js";
16
+ import "./chunk-KWRRMR3J.js";
17
17
  import "./chunk-TVVP3ZZQ.js";
18
18
  import "./chunk-VSMTAMNK.js";
19
19
  import "./chunk-3BFEG2F6.js";
@@ -1,9 +1,9 @@
1
- import { M as MutableSurface, j as GateDecision } from '../types-Bba0vl1V.js';
2
- import { I as InsightReport } from '../insight-report-Df3lxYXM.js';
3
- import '../run-record-BgTFzO2r.js';
1
+ import { M as MutableSurface, j as GateDecision } from '../types-QHG0KnkF.js';
2
+ import { I as InsightReport } from '../insight-report-dlpEzQDi.js';
3
+ import '../run-record-sItO5ftF.js';
4
4
  import '../errors-Dwqw-T_m.js';
5
5
  import '../schema-m0gsnbt3.js';
6
- import '../summary-report-ByiOUrHj.js';
6
+ import '../summary-report-BTaXq1TS.js';
7
7
  import '../failure-cluster-CL7IVgkJ.js';
8
8
  import '../store-CKUAgsJz.js';
9
9
  import '../judge-calibration-DilmB3Ml.js';
@@ -1,4 +1,4 @@
1
- import { b as RunSplitTag } from './run-record-BgTFzO2r.js';
1
+ import { b as RunSplitTag } from './run-record-sItO5ftF.js';
2
2
 
3
3
  /**
4
4
  * Shared types for the reference benchmark wrappers under
package/dist/index.d.ts CHANGED
@@ -1,11 +1,11 @@
1
- export { A as ActionExecutionPolicy, a as ActionPolicyDecision, C as ControlRunToRunRecordOptions, L as LlmJsonCall, b as LlmReviewerConfig, P as ProposeFn, c as ProposeInput, d as ProposeOutput, e as ProposeReviewConfig, f as ProposeReviewControlAction, g as ProposeReviewControlConfig, h as ProposeReviewControlResult, i as ProposeReviewControlState, j as ProposeReviewReport, k as ProposeReviewShot, R as Review, l as ReviewFn, m as ReviewInput, n as ReviewMemoryEntry, o as ReviewMemoryStore, p as RunEvidenceMetadata, V as Verification, q as VerifyFn, r as controlFailureClassFromVerification, s as controlRunToRunRecord, t as createLlmReviewer, u as evaluateActionPolicy, v as inMemoryReviewStore, w as jsonlReviewStore, x as runProposeReview, y as runProposeReviewAsControlLoop, z as scoreFromEvals } from './control-BgA6BYTm.js';
2
- import { R as RunRecord } from './run-record-BgTFzO2r.js';
3
- export { e as AGENT_PROFILE_KINDS, A as AgentProfileCell, d as AgentProfileCellInput, f as AgentProfileCellSchemaVersion, g as AgentProfileCellValidationError, h as AgentProfileDimensionValue, i as AgentProfileHarness, j as AgentProfileJson, k as AgentProfileKind, l as AgentProfileSource, m as AgentProfileSourceInput, J as JudgeScoresRecord, c as RunJudgeMetadata, n as RunOutcome, o as RunRecordValidationError, b as RunSplitTag, a as RunTokenUsage, S as SandboxAgentProfileLike, p as agentProfileCellHashMaterial, q as agentProfileCellKey, r as assertRunAgentProfileCell, s as buildAgentProfileCell, t as buildSandboxAgentProfileCell, u as groupRunsByAgentProfileCell, v as isRunRecord, w as parseRunRecordSafe, x as requireAgentProfileCell, y as roundTripRunRecord, z as toAgentProfileJson, B as validateAgentProfileCell, C as validateRunRecord, D as verifyAgentProfileCell } from './run-record-BgTFzO2r.js';
4
- export { B as BehavioralMetrics, z as ConceptComplexity, A as ConceptFinding, E as ConceptSpec, G as ConceptWeightStrategy, C as CreateAnalystAiConfig, H as DEFAULT_COMPLEXITY_WEIGHTS, D as DEFAULT_TRACE_ANALYST_KINDS, b as DefaultAnalystRegistryOptions, c as DiffPolicy, F as FAILURE_MODE_KIND_SPEC, f as FindingSubject, g as FindingSubjectKind, i as FindingsDiff, j as FindingsStore, I as IMPROVEMENT_KIND_SPEC, k as KNOWLEDGE_GAP_KIND_SPEC, l as KNOWLEDGE_POISONING_KIND_SPEC, P as PersistedFinding, J as SEMANTIC_CONCEPT_JUDGE_VERSION, m as SKILL_USAGE_ANALYST, a as SemanticConceptJudgeInput, S as SemanticConceptJudgeOptions, L as SemanticConceptJudgeResult, n as SkillUsageAnalyst, M as SuboptimalCode, N as SuboptimalSignal, r as buildDefaultAnalystRegistry, O as computeTraceMetrics, t as createAnalystAi, Q as createSemanticConceptJudge, u as defaultIsMaterial, v as diffFindings, R as runSemanticConceptJudge } from './semantic-concept-judge-CV9Wlx4t.js';
5
- export { C as CreateTraceAnalystKindOpts, a as RawAnalystFinding, T as TraceAnalystGolden, c as TraceAnalystKindSpec, d as createTraceAnalystKind, r as renderPriorFindings } from './kind-factory-DW9XWPvM.js';
6
- export { A as AnalystHooks, a as AnalystRegistry, b as AnalystRegistryOptions, B as BudgetPolicy, R as RegistryRunOpts } from './registry-DuVYiTvw.js';
7
- import { l as ChatRequest, p as CreateChatClientOpts } from './types-CRD68aH7.js';
8
- export { A as Analyst, a as AnalystContext, g as AnalystCost, c as AnalystFinding, i as AnalystInputKind, j as AnalystRequirements, f as AnalystRunEvent, e as AnalystRunInputs, d as AnalystRunResult, b as AnalystRunSummary, h as AnalystSeverity, E as EvidenceRef, q as computeFindingId, s as makeFinding } from './types-CRD68aH7.js';
1
+ export { A as ActionExecutionPolicy, a as ActionPolicyDecision, C as ControlRunToRunRecordOptions, L as LlmJsonCall, b as LlmReviewerConfig, P as ProposeFn, c as ProposeInput, d as ProposeOutput, e as ProposeReviewConfig, f as ProposeReviewControlAction, g as ProposeReviewControlConfig, h as ProposeReviewControlResult, i as ProposeReviewControlState, j as ProposeReviewReport, k as ProposeReviewShot, R as Review, l as ReviewFn, m as ReviewInput, n as ReviewMemoryEntry, o as ReviewMemoryStore, p as RunEvidenceMetadata, V as Verification, q as VerifyFn, r as controlFailureClassFromVerification, s as controlRunToRunRecord, t as createLlmReviewer, u as evaluateActionPolicy, v as inMemoryReviewStore, w as jsonlReviewStore, x as runProposeReview, y as runProposeReviewAsControlLoop, z as scoreFromEvals } from './control-CehLtoET.js';
2
+ import { R as RunRecord } from './run-record-sItO5ftF.js';
3
+ export { e as AGENT_PROFILE_KINDS, A as AgentProfileCell, d as AgentProfileCellInput, f as AgentProfileCellSchemaVersion, g as AgentProfileCellValidationError, h as AgentProfileDimensionValue, i as AgentProfileHarness, j as AgentProfileJson, k as AgentProfileKind, l as AgentProfileSource, m as AgentProfileSourceInput, J as JudgeScoresRecord, c as RunJudgeMetadata, n as RunOutcome, o as RunRecordValidationError, b as RunSplitTag, a as RunTokenUsage, S as SandboxAgentProfileLike, p as agentProfileCellHashMaterial, q as agentProfileCellKey, r as assertRunAgentProfileCell, s as buildAgentProfileCell, t as buildSandboxAgentProfileCell, u as groupRunsByAgentProfileCell, v as isRunRecord, w as parseRunRecordSafe, x as requireAgentProfileCell, y as roundTripRunRecord, z as toAgentProfileJson, B as validateAgentProfileCell, C as validateRunRecord, D as verifyAgentProfileCell } from './run-record-sItO5ftF.js';
4
+ export { B as BehavioralMetrics, z as ConceptComplexity, A as ConceptFinding, E as ConceptSpec, G as ConceptWeightStrategy, C as CreateAnalystAiConfig, H as DEFAULT_COMPLEXITY_WEIGHTS, D as DEFAULT_TRACE_ANALYST_KINDS, b as DefaultAnalystRegistryOptions, c as DiffPolicy, F as FAILURE_MODE_KIND_SPEC, f as FindingSubject, g as FindingSubjectKind, i as FindingsDiff, j as FindingsStore, I as IMPROVEMENT_KIND_SPEC, k as KNOWLEDGE_GAP_KIND_SPEC, l as KNOWLEDGE_POISONING_KIND_SPEC, P as PersistedFinding, J as SEMANTIC_CONCEPT_JUDGE_VERSION, m as SKILL_USAGE_ANALYST, a as SemanticConceptJudgeInput, S as SemanticConceptJudgeOptions, L as SemanticConceptJudgeResult, n as SkillUsageAnalyst, M as SuboptimalCode, N as SuboptimalSignal, r as buildDefaultAnalystRegistry, O as computeTraceMetrics, t as createAnalystAi, Q as createSemanticConceptJudge, u as defaultIsMaterial, v as diffFindings, R as runSemanticConceptJudge } from './semantic-concept-judge-Du4ZVyef.js';
5
+ import { l as ChatRequest, p as CreateChatClientOpts } from './types-DRvV0zRo.js';
6
+ export { A as Analyst, a as AnalystContext, g as AnalystCost, c as AnalystFinding, i as AnalystInputKind, j as AnalystRequirements, f as AnalystRunEvent, e as AnalystRunInputs, d as AnalystRunResult, b as AnalystRunSummary, h as AnalystSeverity, k as ChatCallOpts, C as ChatClient, m as ChatResponse, n as ChatTransport, o as CliBridgeTransportOpts, D as DirectProviderTransportOpts, E as EvidenceRef, M as MockTransportOpts, R as RouterTransportOpts, S as SandboxSdkTransportOpts, q as computeFindingId, r as createChatClient, s as makeFinding } from './types-DRvV0zRo.js';
7
+ export { C as CreateTraceAnalystKindOpts, a as RawAnalystFinding, T as TraceAnalystGolden, c as TraceAnalystKindSpec, d as createTraceAnalystKind, r as renderPriorFindings } from './kind-factory-DqV2t1Xk.js';
8
+ export { a as AnalystHooks, A as AnalystRegistry, b as AnalystRegistryOptions, B as BudgetPolicy, R as RegistryRunOpts } from './registry-BmEuU94S.js';
9
9
  import { TCloud } from '@tangle-network/tcloud';
10
10
  import { B as BenchmarkRunnerConfig, S as Scenario, c as BenchmarkReport, P as ProductClientConfig, C as CheckResult, T as TestResult, d as PersonaConfig, D as DriverResult, e as DriverState, b as JudgeFn, f as CollectedArtifacts, g as ScenarioResult, h as TurnMetrics, i as ScenarioFile, j as CompletionCriterion } from './types-Croy5h7V.js';
11
11
  export { A as ArtifactCheck, k as ArtifactResult, E as EvalResult, F as FeedbackPattern, l as JudgeConfig, a as JudgeInput, m as JudgeRubric, J as JudgeScore, n as PersonaRigor, R as RouteMap, o as RubricDimension, p as Turn, q as TurnResult } from './types-Croy5h7V.js';
@@ -14,11 +14,11 @@ import { A as AgentEvalError } from './errors-Dwqw-T_m.js';
14
14
  export { a as AgentEvalErrorCode, C as CaptureIntegrityError, b as ConfigError, J as JudgeError, N as NotFoundError, R as ReplayError, V as ValidationError, c as VerificationError } from './errors-Dwqw-T_m.js';
15
15
  import { b as FeedbackLabel, F as FeedbackTrajectoryStore, a as FeedbackTrajectory } from './feedback-trajectory-B3rErRsh.js';
16
16
  export { c as FeedbackArtifactType, d as FeedbackAttempt, e as FeedbackLabelKind, f as FeedbackLabelSource, g as FeedbackOptimizerRow, h as FeedbackOutcome, i as FeedbackReplayAdapter, j as FeedbackReplayResult, k as FeedbackSeverity, l as FeedbackSplitPolicy, m as FeedbackTask, n as FeedbackTrajectoryFilter, o as FileSystemFeedbackTrajectoryStore, I as InMemoryFeedbackTrajectoryStore, P as PreferenceMemoryEntry, p as ProposedSideEffect, q as assignFeedbackSplit, r as controlRunToFeedbackTrajectory, s as createFeedbackTrajectory, t as feedbackTrajectoriesToDatasetScenarios, u as feedbackTrajectoriesToOptimizerRows, v as feedbackTrajectoryToDatasetScenario, w as feedbackTrajectoryToOptimizerRow, x as parseFeedbackTrajectoriesJsonl, y as renderPreferenceMemoryMarkdown, z as replayFeedbackTrajectories, A as replayFeedbackTrajectory, B as serializeFeedbackTrajectoriesJsonl, C as summarizePreferenceMemory, D as withAssignedFeedbackSplit } from './feedback-trajectory-B3rErRsh.js';
17
- import { A as AgentProfile$1 } from './agent-profile-DYRboYWu.js';
18
- export { c as ArtifactCheckArtifact, d as ArtifactEventLike, e as ArtifactValidator, f as BackendIntegrityError, B as BackendIntegrityReport, C as CompletionRequirement, a as CompletionVerdict, b as CorrectnessChecker, L as LlmCorrectnessCheckerOpts, g as ProducedProposal, P as ProducedState, h as ProposalEventLike, i as RequirementCheck, R as RuntimeEventLike, S as SatisfiedBy, T as TaskGold, j as ToolCallEventLike, V as ValidationContext, k as ValidationIssue, l as ValidationResult, m as agentProfileHash, n as assertRealBackend, o as byteLengthRange, p as composeValidators, q as containsAll, r as createLlmCorrectnessChecker, s as extractProducedState, t as jsonHasKeys, u as parseCorrectnessResponse, v as regexMatch, w as summarizeBackendIntegrity, x as verifyCompletion } from './agent-profile-DYRboYWu.js';
17
+ import { A as AgentProfile$1 } from './agent-profile-aSEaJ9Pl.js';
18
+ export { c as ArtifactCheckArtifact, d as ArtifactEventLike, e as ArtifactValidator, f as BackendIntegrityError, B as BackendIntegrityReport, C as CompletionRequirement, a as CompletionVerdict, b as CorrectnessChecker, L as LlmCorrectnessCheckerOpts, g as ProducedProposal, P as ProducedState, h as ProposalEventLike, i as RequirementCheck, R as RuntimeEventLike, S as SatisfiedBy, T as TaskGold, j as ToolCallEventLike, V as ValidationContext, k as ValidationIssue, l as ValidationResult, m as agentProfileHash, n as assertRealBackend, o as byteLengthRange, p as composeValidators, q as containsAll, r as createLlmCorrectnessChecker, s as extractProducedState, t as jsonHasKeys, u as parseCorrectnessResponse, v as regexMatch, w as summarizeBackendIntegrity, x as verifyCompletion } from './agent-profile-aSEaJ9Pl.js';
19
19
  export { DataAcquisitionPlan, KnowledgeAcquisitionMode, KnowledgeBundle, KnowledgeFallbackPolicy, KnowledgeFreshness, KnowledgeImportance, KnowledgeReadinessReport, KnowledgeRecommendedAction, KnowledgeRequirement, KnowledgeRequirementCategory, KnowledgeResponsibleSurface, KnowledgeSensitivity, ScoreKnowledgeReadinessOptions, UserQuestion, acquisitionPlansForKnowledgeGaps, blockingKnowledgeEval, knowledgeReadinessTracePayload, scoreKnowledgeReadiness, userQuestionsForKnowledgeGaps } from './knowledge/index.js';
20
- import { h as ReleaseConfidenceThresholds, f as ReleaseConfidenceScorecard } from './release-report-CN8hJlhk.js';
21
- export { A as ActionableSideInfo, o as AsiSeverity, B as BootstrapOptions, a as BootstrapResult, J as JudgeReplayGateArgs, R as ReleaseConfidenceAxis, b as ReleaseConfidenceAxisName, c as ReleaseConfidenceInput, d as ReleaseConfidenceIssue, e as ReleaseConfidenceMetrics, g as ReleaseConfidenceStatus, i as ReleaseTraceEvidence, j as RenderReleaseReportOptions, V as Verdict, k as assertReleaseConfidence, l as bootstrapCi, m as evaluateReleaseConfidence, n as judgeReplayGate, r as renderReleaseReport } from './release-report-CN8hJlhk.js';
20
+ import { h as ReleaseConfidenceThresholds, f as ReleaseConfidenceScorecard } from './release-report-CXXZlR8g.js';
21
+ export { A as ActionableSideInfo, o as AsiSeverity, B as BootstrapOptions, a as BootstrapResult, J as JudgeReplayGateArgs, R as ReleaseConfidenceAxis, b as ReleaseConfidenceAxisName, c as ReleaseConfidenceInput, d as ReleaseConfidenceIssue, e as ReleaseConfidenceMetrics, g as ReleaseConfidenceStatus, i as ReleaseTraceEvidence, j as RenderReleaseReportOptions, V as Verdict, k as assertReleaseConfidence, l as bootstrapCi, m as evaluateReleaseConfidence, n as judgeReplayGate, r as renderReleaseReport } from './release-report-CXXZlR8g.js';
22
22
  export { C as CliffsMagnitude, c as CorpusAgreementOptions, d as CorpusAgreementPerDimension, e as CorpusAgreementReport, f as CorpusScoreRecord, P as PairedBootstrapOptions, a as PairedBootstrapResult, W as WeightedCompositeInput, g as WeightedCompositeResult, b as benjaminiHochberg, h as bonferroni, i as cliffsDelta, j as cohensD, k as confidenceInterval, l as corpusInterRaterAgreement, m as corpusInterRaterAgreementFromJudgeScores, n as interRaterReliability, o as interpretCliffs, q as mannWhitneyU, r as normalizeScores, p as pairedBootstrap, s as pairedMde, t as pairedTTest, u as partialCredit, v as requiredSampleSize, x as weightedComposite, y as weightedMean, w as wilcoxonSignedRank } from './statistics-B7yCbi9i.js';
23
23
  import { a as AnalyzeTracesInput, A as AnalyzeTracesOptions, b as AnalyzeTracesResult } from './analyst-t7zZS3TV.js';
24
24
  export { c as AnalyzeTracesTurnSnapshot, d as analyzeTraces } from './analyst-t7zZS3TV.js';
@@ -58,23 +58,22 @@ import { b as Layer, S as Severity, L as LayerResult, c as VerifyContext } from
58
58
  export { F as Finding, d as LayerStatus, M as MultiLayerVerifier, a as VerificationReport, V as VerifyOptions, g as gradeSemanticStatus } from './multi-layer-verifier-DlWCXuxL.js';
59
59
  import { L as LlmClientOptions } from './llm-client-DbjLfz-K.js';
60
60
  export { d as LlmCallError, b as LlmCallRequest, c as LlmCallResult, e as LlmClient, f as LlmMessage, g as LlmRouteAssertionError, a as LlmRouteRequirements, h as LlmUsage, i as assertLlmRoute, j as backoffMs, k as callLlm, l as callLlmJson, m as isTransientLlmError, p as probeLlm, s as stripFencedJson } from './llm-client-DbjLfz-K.js';
61
- export { B as BENCHMARK_SPLIT_SEED, a as BenchmarkAdapter, b as BenchmarkDatasetItem, c as BenchmarkEvaluation, d as benchmarkDeterministicSplit, i as benchmarks } from './index-DsnOpCO6.js';
62
- export { C as CallbackResearcher, d as CallbackResearcherOptions, e as CampaignFactoryParams, f as CampaignIntegrityPolicy, g as CampaignRunContext, h as CampaignRunOutcome, i as CampaignRunner, j as CampaignScenario, k as CampaignVariant, c as EvalCampaignOptions, b as EvalCampaignResult, E as ExperimentPlan, a as ExperimentResult, l as FailedRun, F as FailureMode, N as NoopResearcher, R as Researcher, S as SteeringChange, r as runEvalCampaign } from './researcher-C_KJyIGg.js';
63
- export { G as GainDistributionBin, a as GainDistributionFigureSpec, b as GainDistributionOptions, m as GateDecision, n as GateEvidence, H as HeldOutGate, o as HeldOutGateConfig, q as HeldOutGateRejectionCode, P as ParetoFigureSpec, c as ParetoPoint, R as RESEARCH_REPORT_HARD_PAIR_FLOOR, d as ResearchReport, e as ResearchReportCandidate, f as ResearchReportDecision, g as ResearchReportMethodology, h as ResearchReportOptions, i as ResearchReportRecommendation, S as SummaryTable, j as SummaryTableOptions, k as SummaryTableRow, l as gainHistogram, p as paretoChart, r as researchReport, s as summaryTable } from './summary-report-ByiOUrHj.js';
61
+ export { B as BENCHMARK_SPLIT_SEED, a as BenchmarkAdapter, b as BenchmarkDatasetItem, c as BenchmarkEvaluation, d as benchmarkDeterministicSplit, i as benchmarks } from './index-B1RKber3.js';
62
+ export { C as CallbackResearcher, d as CallbackResearcherOptions, e as CampaignFactoryParams, f as CampaignIntegrityPolicy, g as CampaignRunContext, h as CampaignRunOutcome, i as CampaignRunner, j as CampaignScenario, k as CampaignVariant, c as EvalCampaignOptions, b as EvalCampaignResult, E as ExperimentPlan, a as ExperimentResult, l as FailedRun, F as FailureMode, N as NoopResearcher, R as Researcher, S as SteeringChange, r as runEvalCampaign } from './researcher-rInLj9De.js';
63
+ export { G as GainDistributionBin, a as GainDistributionFigureSpec, b as GainDistributionOptions, m as GateDecision, n as GateEvidence, H as HeldOutGate, o as HeldOutGateConfig, q as HeldOutGateRejectionCode, P as ParetoFigureSpec, c as ParetoPoint, R as RESEARCH_REPORT_HARD_PAIR_FLOOR, d as ResearchReport, e as ResearchReportCandidate, f as ResearchReportDecision, g as ResearchReportMethodology, h as ResearchReportOptions, i as ResearchReportRecommendation, S as SummaryTable, j as SummaryTableOptions, k as SummaryTableRow, l as gainHistogram, p as paretoChart, r as researchReport, s as summaryTable } from './summary-report-BTaXq1TS.js';
64
64
  export { I as InterimReleaseConfidence, a as InterimReleaseConfidenceInput, P as PairedEvalueOptions, b as PairedEvalueSequence, c as PairedEvalueStep, S as SequentialDecision, e as evaluateInterimReleaseConfidence, p as pairedEvalueSequence } from './sequential-5iSVfzl2.js';
65
- import { S as Scenario$1, a as JudgeConfig, G as Gate } from './types-Bba0vl1V.js';
66
- import { d as GepaDriverConstraints, R as RunImprovementLoopResult } from './run-improvement-loop-BqYH2vCR.js';
65
+ import { S as Scenario$1, a as JudgeConfig, G as Gate } from './types-QHG0KnkF.js';
66
+ import { d as GepaDriverConstraints, R as RunImprovementLoopResult } from './run-improvement-loop-Bgu4C59E.js';
67
67
  import '@ax-llm/ax';
68
68
  import 'zod';
69
69
  import './outcome-store-D6KWmYvj.js';
70
70
 
71
71
  /**
72
- * Automated pull request opener for the production loop.
72
+ * Automated pull request opener for the improvement loop.
73
73
  *
74
- * `runProductionLoop` produces a `promotedPrompt` string and a release
75
- * scorecard. To close the eval → prod → eval cycle the framework needs
76
- * to land that prompt as a reviewable code change. This module does
77
- * exactly that:
74
+ * When `runImprovementLoop` ships a winner (`autoOnPromote: 'pr'`) it produces
75
+ * a promoted surface diff. To close the eval → prod → eval cycle the framework
76
+ * lands that change as a reviewable code change. This module does exactly that:
78
77
  *
79
78
  * 1. Stage a branch off `baseBranch`.
80
79
  * 2. Write each `fileChange` into the worktree.
@@ -1904,6 +1903,110 @@ declare function collectionPreserved<T, K extends keyof T & string>(key: K, minR
1904
1903
  /** Common check: a status field advanced in an expected order. */
1905
1904
  declare function statusAdvanced<T extends Record<string, unknown>>(key: keyof T & string, progression: readonly string[]): ContinuityCheck<T>;
1906
1905
 
1906
+ /**
1907
+ * UI audit finding — substrate primitive for "what is wrong with the UI?"
1908
+ *
1909
+ * Used by:
1910
+ * - `@tangle-network/agent-runtime` (ui-auditor profile + delegate) —
1911
+ * produced as the canonical output of an audit iteration, persisted to
1912
+ * disk as GitHub-issue Markdown, surfaced over MCP.
1913
+ * - Downstream ship gates / dashboards / analyst consumers — load and
1914
+ * transform findings without depending on the runtime.
1915
+ *
1916
+ * Repo layering: agent-eval is the substrate (no upward deps). Consumers
1917
+ * read this type from here; the reverse is forbidden. See CLAUDE.md
1918
+ * "Repo layering" for the rule. A UI finding makes sense WITHOUT a running
1919
+ * agent loop (you can load a saved finding, ship-gate against a set of
1920
+ * them, render them in a dashboard), which puts it firmly in substrate.
1921
+ *
1922
+ * The shape is intentionally minimal — runtime-shaped state (capture
1923
+ * timestamps, OTel trace IDs, sandbox placement) lives on auxiliary
1924
+ * runtime types in `agent-runtime`, not on the finding itself.
1925
+ */
1926
+ /**
1927
+ * Canonical audit lenses. Each lens scopes a finding to a single class of
1928
+ * problem so a single audit pass can iterate them without pile-on findings
1929
+ * under a generic label.
1930
+ *
1931
+ * Naming is fixed for cross-package wire compatibility. Treat additions as
1932
+ * a substrate-level decision — analysts, gates, and writers all branch on
1933
+ * the lens.
1934
+ */
1935
+ type UiLens = 'consistency' | 'hierarchy' | 'layout' | 'ux-flow' | 'duplication' | 'accessibility' | 'responsive' | 'states' | 'content' | 'interaction' | 'performance-perceived' | 'other';
1936
+ /** Frozen tuple of lenses for validation + iteration. */
1937
+ declare const UI_LENSES: readonly UiLens[];
1938
+ /**
1939
+ * Severity scale — intentionally narrow.
1940
+ *
1941
+ * - `critical` — blocks a core task or is an accessibility blocker.
1942
+ * - `high` — confusing, broken-looking, or noticeable friction.
1943
+ * - `med` — visible polish issue, would be caught in code review.
1944
+ * - `low` — nitpick worth fixing eventually.
1945
+ */
1946
+ type UiFindingSeverity = 'low' | 'med' | 'high' | 'critical';
1947
+ /** Frozen severity tuple, ordered worst → least bad for sort/report. */
1948
+ declare const UI_FINDING_SEVERITIES: readonly UiFindingSeverity[];
1949
+ /**
1950
+ * Pointer to a screenshot referenced by the finding. The path is
1951
+ * intentionally a relative string (relative to the audit workspace root)
1952
+ * so findings remain portable across machines and into GitHub issues.
1953
+ */
1954
+ interface UiFindingScreenshot {
1955
+ /** Workspace-relative path to the screenshot file (e.g. `screenshots/home--1280x800--...png`). */
1956
+ path: string;
1957
+ /** Optional viewport the screenshot was taken at, e.g. `1280x800`. */
1958
+ viewport?: string;
1959
+ /** Optional short label that disambiguates multiple captures of the same surface (e.g. `t0`, `step-1`). */
1960
+ label?: string;
1961
+ }
1962
+ /**
1963
+ * A single UI audit finding — the unit of work a contributor can act on.
1964
+ *
1965
+ * Every field except the documented optionals is required. The shape is
1966
+ * deliberately constraining: a finding without a screenshot, a lens, a
1967
+ * concrete title, and a suggested fix is not actionable, and the auditor
1968
+ * validator hard-fails on those gaps.
1969
+ */
1970
+ interface UiFinding {
1971
+ /**
1972
+ * Stable identifier within a single audit workspace. Monotonically
1973
+ * increasing integer (1, 2, …) assigned by the writer when persisting.
1974
+ * Optional in transit (before persistence) — undefined on freshly minted
1975
+ * findings emitted from a loop iteration.
1976
+ */
1977
+ id?: number;
1978
+ /** Concrete title — names the offending element AND what's wrong. */
1979
+ title: string;
1980
+ /** Lens this finding belongs to. */
1981
+ lens: UiLens;
1982
+ /** Severity. */
1983
+ severity: UiFindingSeverity;
1984
+ /** Logical route the finding was observed on (e.g. `home`, `checkout-step-2`). */
1985
+ route: string;
1986
+ /** Fully qualified URL the finding was observed at. */
1987
+ url?: string;
1988
+ /** Viewport string the offending capture was taken at (e.g. `1280x800`). */
1989
+ viewport?: string;
1990
+ /** CSS selector pinning the offending element, when one can be identified. */
1991
+ selector?: string;
1992
+ /** 1–3 sentences describing what the screenshot shows that is wrong. */
1993
+ observation: string;
1994
+ /** Who is affected and how. Concrete user impact. */
1995
+ impact: string;
1996
+ /** A specific change a contributor could apply without asking back. */
1997
+ suggestedFix: string;
1998
+ /** Optional explicit reproduction steps. Writer synthesizes from route/url/selector when omitted. */
1999
+ reproSteps?: string;
2000
+ /** Free-form tags. */
2001
+ tags?: readonly string[];
2002
+ /** Screenshot references — required to be non-empty for actionable findings. */
2003
+ screenshots: readonly UiFindingScreenshot[];
2004
+ /** Cross-references to similar findings already on file, by id. */
2005
+ similarTo?: readonly number[];
2006
+ /** ISO-8601 creation timestamp set by the writer when persisted. */
2007
+ createdAt?: string;
2008
+ }
2009
+
1907
2010
  /**
1908
2011
  * Behavior DSL — pytest-style assertions over a run's trajectory.
1909
2012
  *
@@ -4231,8 +4334,6 @@ declare function createSandboxPool<T>(opts: CreateSandboxPoolOpts<T>): SandboxPo
4231
4334
  * Pipeline-level OTEL integration — auto-attaches an OTEL exporter when
4232
4335
  * OTEL_EXPORTER_OTLP_ENDPOINT is set. Pipelines call `withOtelPipeline()`
4233
4336
  * to get a configured exporter + shutdown handle without manual wiring.
4234
- *
4235
- * Used by: runEvalCampaign, runProductionLoop, runAgentMatrix.
4236
4337
  */
4237
4338
 
4238
4339
  interface OtelPipelineHandle {
@@ -4718,4 +4819,4 @@ declare namespace index {
4718
4819
  export { type index_AgentProfile as AgentProfile, type index_AgentProfileSection as AgentProfileSection, index_BASELINE_ROLES as BASELINE_ROLES, type index_BaselineRoleKey as BaselineRoleKey, type index_ProfileSkill as ProfileSkill, index_applyDomainPatch as applyDomainPatch, index_baselineProfile as baselineProfile, index_baselineProfileFromRole as baselineProfileFromRole, index_engineerRole as engineerRole, index_generalistRole as generalistRole, index_prodProfile as prodProfile, index_profileToSurface as profileToSurface, index_renderProfile as renderProfile, index_researcherRole as researcherRole, index_sectionHash as sectionHash };
4719
4820
  }
4720
4821
 
4721
- export { type ActiveLearningOptions, type AdapterRun, AgentDriver, type AgentDriverConfig, AgentEvalError, AgentProfile$1 as AgentProfile, type AgreementResult, type AlignmentOp, AnalyzeTracesInput, AnalyzeTracesOptions, AnalyzeTracesResult, type AntiSlopConfig, type AntiSlopIssue, type AntiSlopReport, type AssertCrossFamilyOptions, type AssertSingleBackendOptions, type AutoPrClient, AxGepaSteeringOptimizer, type AxSteeringOptimizerConfig, type BackendDescriptor, BaselineReport, BehaviorAssertion, BenchmarkReport, BenchmarkRunner, BenchmarkRunnerConfig, type BisectOptions, type BisectResult, type BisectStep, BudgetBreachError, BudgetGuard, BudgetLedgerEntry, BudgetSpec, type BuildAgreementJudgeOptions, CallExpectation, type CanaryAlert, type CanaryKind, type CanaryLeak, type CanaryOptions, type CanaryReport, type CanarySeverity, type CandidateScenario, type CausalAttributionReport, type CellVerdict, CheckResult, CollectedArtifacts, type CommandRunner, type CompareLabels, CompletionCriterion, type ContinuityCheck, type ContinuityCheckResult, type ContinuityReport, type ContinuitySnapshotPair, type ContractMetric, type ContractReport, ConvergenceTracker, type CostEntry, type CostSummary, CostTracker, type CounterfactualContext, type CounterfactualMutation, type CounterfactualResult, type CounterfactualRunner, type CreateDefaultReviewerOptions, type CreateSandboxPoolOpts, CrossFamilyError, type CrossTraceDiff, type CrossTraceDiffOptions, D1ExperimentStore, type D1ExperimentStoreOptions, type D1Like, type D1PreparedStatementLike, DEFAULT_AGENT_SLOS, DEFAULT_FINDERS, DEFAULT_MUTATION_PRIMITIVES, DEFAULT_MUTATORS, DEFAULT_PR_REVIEW_SCORE_WEIGHTS, DEFAULT_SEVERITY_WEIGHTS, Dataset, DatasetScenario, type DecideNextUserTurnOpts, type DeployFamily, type DeployGateLayerInput, type DeployRunResult, type DeployRunner, type DiffScorecardOptions, type DirEntry, type DiscoverPersonasOptions, type DiscoveredPersona, DriverResult, DriverState, DualAgentBench, type 
DualAgentBenchConfig, type DualAgentReport, type DualAgentRound, type DualAgentScenario, type DualAgentScenarioResult, ERROR_COUNT_PATTERNS, type ErrorCountPattern, type EvolutionRound, type ExecutorConfig, type Expectation, type Experiment, type Run as ExperimentRun, type ExperimentStore, ExperimentTracker, type ExportedRewardModel, type ExtractOptions, type ExtractResult, type FactorContribution, type FactorialCell, FeedbackLabel, FeedbackTrajectory, FeedbackTrajectoryStore, type FieldAgreementSpec, type FileChange, FileSystemExperimentStore, type FileSystemExperimentStoreOptions, type FlowAction, type FlowLayerEnv, type FlowLayerFactoryInput, type FlowRunner, type FlowRunnerStepResult, type FlowSpec, type FlowStep, type GhCliClientOptions, type GoldScenario, type GoldSplit, type GoldenSeverity, type GoldenSpec, HarnessConfig, HoldoutAuditor, type HostedJudgeConfig, type HostedJudgeDimension, type HostedJudgeRequest, type HostedJudgeResponse, type HostedRunCriticConfig, type HostedRunScoreRequest, type HostedRunScoreResponse, type HttpGithubClientOptions, type HypothesisManifest, type HypothesisResult, INTENT_MATCH_JUDGE_VERSION, type ImageData, InMemoryExperimentStore, InMemoryWorkspaceInspector, type InferenceScorer, type InspectorContext, type IntentMatchInput, type IntentMatchOptions, type IntentMatchResult, type InteractionContribution, type JudgeFamily, type JudgeFleetOptions, JudgeFn, type JudgeReplayResult, type JudgeRetryOutcome, type JudgeRetryPolicy, JudgeRunner, type KeywordConceptSpec, type KeywordCoverageFinding, type KeywordCoverageOptions, type KeywordCoverageResult, type LangfuseEnvelope, type LangfuseGeneration, type LangfuseScore, Layer, LayerResult, type LiveProofArtifact, type LiveProofConfig, type LiveProofContext, type LiveProofResult, LlmClientOptions, LlmSpan, LockedJsonlAppender, MODEL_PRICING, type MatchResult, type MatcherResult, type MergeOptions, MetricsCollector, type MuffledFinder, type MuffledFinding, type 
MultiToolchainLayerConfig, type Mutator, Mutex, type Oracle, type OracleObservation, type OracleReport, type OracleResult, type OrthogonalityInput, type OrthogonalityResult, OtelExportConfig, OtelExporter, type OtelPipelineHandle, type OtelPipelineOptions, PairwiseSteeringOptimizer, type ParaphraseRobustnessScenarioInput, type ParaphraseRobustnessScenarioResult, type ParseStudentLabel, PersonaConfig, type Playbook, type PlaybookEntry, type PoolSlot, type PrReviewAuditCase, type PrReviewBenchmarkSummary, type PrReviewComment, type PrReviewMatchedFinding, type PrReviewOutcome, type PrReviewReferenceFinding, type PrReviewScore, type PrReviewScoreWeights, type PrReviewSeverity, type PrReviewSource, ProductClient, ProductClientConfig, type PromptHandle, PromptRegistry, type ProposeAutomatedPullRequestInput, type ProposeAutomatedPullRequestResult, type RecordRunsOptions, type ReferenceMatchResult, type ReferenceReplayAdapter, type ReferenceReplayAdapterFn, type ReferenceReplayAdapterLike, type ReferenceReplayAggregate, type ReferenceReplayCandidate, type ReferenceReplayCase, type ReferenceReplayCaseRun, type ReferenceReplayExecutionScenario, type ReferenceReplayItem, type ReferenceReplayMatch, type ReferenceReplayMatchStrategy, type ReferenceReplayMatcher, type ReferenceReplayPromotionDecision, type ReferenceReplayPromotionPolicy, type ReferenceReplayRun, type ReferenceReplayRunContext, type ReferenceReplayRunOptions, type ReferenceReplayRunStore, type ReferenceReplayScenario, type ReferenceReplayScenarioScore, type ReferenceReplayScore, type ReferenceReplayScoreOptions, type ReferenceReplaySplit, type ReferenceReplaySplitComparison, type ReferenceReplaySteeringRowsOptions, type ReflectionContext, type ReflectionProposal, ReleaseConfidenceScorecard, ReleaseConfidenceThresholds, type RenderStudentPrompt, type RepoRef, type ReviewerMemoryEntry, type ReviewerOutput, type ReviewerPromptInput, type ReviewerSoftFailDefaults, type ReviewerVerificationSummary, type 
RobustnessResult, Run$1 as Run, type RunCommandInput, type RunCommandResult, type RunConfig, RunCriticOptions, type RunDiff, type RunDistillationOptions, type RunDistillationResult, RunFilter, RunRecord, RunScore, RunScoreWeights, RunTrace, SandboxDriver, SandboxHarnessResult, type SandboxJudgeKind, type SandboxJudgeResult, type SandboxJudgeSpec, type SandboxPool, type ScanOptions, Scenario, type ScenarioCost, ScenarioFile, ScenarioRegistry, ScenarioResult, type Scorecard, type ScorecardCell, type ScorecardCellDiff, type ScorecardDiff, type ScorecardEntry, type ScorecardLogLine, type ScoredTarget, type SelfPlayOptions, type SelfPlayProposer, type SelfPlayScorer, type SeriesConvergenceOptions, type SeriesConvergenceResult, Severity, type SignedManifest, type SignedManifestAlgo, type SingleBackendDivergence, SingleBackendError, type SingleBackendField, type SingleBackendReport, type Slo, type SloCheckResult, type SloComparator, type SloReport, type SloSeverity, type SlopCategory, type SlotFactory, type SplitGoldOptions, SteeringBundle, type SteeringOptimizationResult, type SteeringOptimizationRow, type SteeringOptimizationSelector, type SteeringOptimizerBackend, type SteeringOptimizerConfig, type StepAttribution, type SynthesisReason, type SynthesisTarget, TestResult, type ThresholdContract, TokenCounter, type TokenSpec, TraceEmitter, TraceStore, type TracedAnalystOptions, type TracedJudgeOptions, Trajectory, TrajectoryStep, type TrialTrace, TurnMetrics, UNIVERSAL_FINDERS, VerifyContext, type VisualDiffOptions, type VisualDiffResult, type ViteDeployRunnerInput, type WorkspaceAssertion, type WorkspaceAssertionResult, type WorkspaceInspector, type WorkspaceSnapshot, type WranglerDeployRunnerInput, adversarialJudge, aggregatePrReviewScore, analyzeAntiSlop, analyzeSeries, appendScorecard, assertCrossFamily, assertSingleBackend, attributeCounterfactuals, bisect, buildAgreementJudge, buildDriverSystemPrompt, buildReflectionPrompt, buildReviewerPrompt, canaryLeakView, 
canonicalize, causalAttribution, checkBehavioralCanary, checkCanaries, checkSlos, codeExecutionJudge, coherenceJudge, collectionPreserved, commentsForSource, commitBisect, compareReferenceReplay, compilerJudge, createAntiSlopJudge, createCustomJudge, createDefaultReviewer, createDomainExpertJudge, createIntentMatchJudge, createSandboxPool, crossTraceDiff, decideNextUserTurn, decideReferenceReplayPromotion, decideReferenceReplayRunPromotion, defaultJudges, defaultParseStudentLabel, defaultReferenceReplayMatcher, defaultRenderStudentPrompt, deployGateLayer, diffScorecard, discoverPersonas, distillPlaybook, estimateCost, estimateTokens, evaluateContract, evaluateHypothesis, evaluateOracles, executeScenario, expectAgent, exportRewardModel, extractAssetUrls, extractErrorCount, fieldAgreement, fileContains, fileExists, findAutoMatchNoExpectation, findConstructorCwdDropped, findFallbackToPass, findLiteralTruePass, findSkipCountsAsPass, flowLayer, formatBenchmarkReport, formatDriverReport, formatFindings, formatScorecardDiff, ghCliClient, precision as goldenPrecision, hashContent, hashJson, htmlContainsElement, httpGithubClient, inMemoryReferenceReplayStore, isModelPriced, isOtelConfigured, jsonShape, jsonlReferenceReplayStore, judgeFamily, keyPreserved, linterJudge, loadGoldScenarios, loadScorecard, loadScorerFromGrader, localCommandRunner, lowercaseMutator, matchGoldens, mergeLayerResults, multiToolchainLayer, notBlocked, paraphraseRobustness, paraphraseRobustnessScenarios, parseGoldJsonl, parseReflectionResponse, passOrthogonality, pixelDeltaRatio, politenessPrefixMutator, printDriverSummary, index as profile, promptBisect, proposeAutomatedPullRequest, proposeSynthesisTargets, recordRuns, recordRunsToScorecard, referenceReplayRunsToSteeringRows, referenceReplayScenarioToRunScore, regexMatches, renderMarkdownReport, renderPlaybookMarkdown, replayScorerOverCorpus, replayTraceThroughJudge, resetLockedAppendersForTesting, resolveModelPricing, rowCount, rowWhere, 
runAssertions, runBehavioralCanaries, runCanaries, runCounterfactual, runDistillation, runE2EWorkflow, runExpectations, runIntentMatchJudge, runJudgeFleet, runKeywordCoverageJudge, runKeywordCoverageJudgeUrl, runLiveProof, runReferenceReplay, runSelfPlay, scanForMuffledGates, scoreContinuity, scorePrReviewComments, scorePrReviewSource, scoreReferenceReplay, securityJudge, sentenceReorderMutator, signManifest, splitGold, statusAdvanced, summarizePrReviewBenchmark, testJudge, textInSnapshot, toLangfuseEnvelope, toPrometheusText, traceJudge, traceJudgeEnsemble, tracedAnalyzeTraces, typoMutator, urlContains, verifyManifest, visualDiff, viteDeployRunner, weightedRecall, whitespaceCollapseMutator, withJudgeRetry, withOtelPipeline, wranglerDeployRunner };
4822
+ export { type ActiveLearningOptions, type AdapterRun, AgentDriver, type AgentDriverConfig, AgentEvalError, AgentProfile$1 as AgentProfile, type AgreementResult, type AlignmentOp, AnalyzeTracesInput, AnalyzeTracesOptions, AnalyzeTracesResult, type AntiSlopConfig, type AntiSlopIssue, type AntiSlopReport, type AssertCrossFamilyOptions, type AssertSingleBackendOptions, type AutoPrClient, AxGepaSteeringOptimizer, type AxSteeringOptimizerConfig, type BackendDescriptor, BaselineReport, BehaviorAssertion, BenchmarkReport, BenchmarkRunner, BenchmarkRunnerConfig, type BisectOptions, type BisectResult, type BisectStep, BudgetBreachError, BudgetGuard, BudgetLedgerEntry, BudgetSpec, type BuildAgreementJudgeOptions, CallExpectation, type CanaryAlert, type CanaryKind, type CanaryLeak, type CanaryOptions, type CanaryReport, type CanarySeverity, type CandidateScenario, type CausalAttributionReport, type CellVerdict, ChatRequest, CheckResult, CollectedArtifacts, type CommandRunner, type CompareLabels, CompletionCriterion, type ContinuityCheck, type ContinuityCheckResult, type ContinuityReport, type ContinuitySnapshotPair, type ContractMetric, type ContractReport, ConvergenceTracker, type CostEntry, type CostSummary, CostTracker, type CounterfactualContext, type CounterfactualMutation, type CounterfactualResult, type CounterfactualRunner, CreateChatClientOpts, type CreateDefaultReviewerOptions, type CreateSandboxPoolOpts, CrossFamilyError, type CrossTraceDiff, type CrossTraceDiffOptions, D1ExperimentStore, type D1ExperimentStoreOptions, type D1Like, type D1PreparedStatementLike, DEFAULT_AGENT_SLOS, DEFAULT_FINDERS, DEFAULT_MUTATION_PRIMITIVES, DEFAULT_MUTATORS, DEFAULT_PR_REVIEW_SCORE_WEIGHTS, DEFAULT_SEVERITY_WEIGHTS, Dataset, DatasetScenario, type DecideNextUserTurnOpts, type DeployFamily, type DeployGateLayerInput, type DeployRunResult, type DeployRunner, type DiffScorecardOptions, type DirEntry, type DiscoverPersonasOptions, type DiscoveredPersona, DriverResult, DriverState, 
DualAgentBench, type DualAgentBenchConfig, type DualAgentReport, type DualAgentRound, type DualAgentScenario, type DualAgentScenarioResult, ERROR_COUNT_PATTERNS, type ErrorCountPattern, type EvolutionRound, type ExecutorConfig, type Expectation, type Experiment, type Run as ExperimentRun, type ExperimentStore, ExperimentTracker, type ExportedRewardModel, type ExtractOptions, type ExtractResult, type FactorContribution, type FactorialCell, FeedbackLabel, FeedbackTrajectory, FeedbackTrajectoryStore, type FieldAgreementSpec, type FileChange, FileSystemExperimentStore, type FileSystemExperimentStoreOptions, type FlowAction, type FlowLayerEnv, type FlowLayerFactoryInput, type FlowRunner, type FlowRunnerStepResult, type FlowSpec, type FlowStep, type GhCliClientOptions, type GoldScenario, type GoldSplit, type GoldenSeverity, type GoldenSpec, HarnessConfig, HoldoutAuditor, type HostedJudgeConfig, type HostedJudgeDimension, type HostedJudgeRequest, type HostedJudgeResponse, type HostedRunCriticConfig, type HostedRunScoreRequest, type HostedRunScoreResponse, type HttpGithubClientOptions, type HypothesisManifest, type HypothesisResult, INTENT_MATCH_JUDGE_VERSION, type ImageData, InMemoryExperimentStore, InMemoryWorkspaceInspector, type InferenceScorer, type InspectorContext, type IntentMatchInput, type IntentMatchOptions, type IntentMatchResult, type InteractionContribution, type JudgeFamily, type JudgeFleetOptions, JudgeFn, type JudgeReplayResult, type JudgeRetryOutcome, type JudgeRetryPolicy, JudgeRunner, type KeywordConceptSpec, type KeywordCoverageFinding, type KeywordCoverageOptions, type KeywordCoverageResult, type LangfuseEnvelope, type LangfuseGeneration, type LangfuseScore, Layer, LayerResult, type LiveProofArtifact, type LiveProofConfig, type LiveProofContext, type LiveProofResult, LlmClientOptions, LlmSpan, LockedJsonlAppender, MODEL_PRICING, type MatchResult, type MatcherResult, type MergeOptions, MetricsCollector, type MuffledFinder, type MuffledFinding, type 
MultiToolchainLayerConfig, type Mutator, Mutex, type Oracle, type OracleObservation, type OracleReport, type OracleResult, type OrthogonalityInput, type OrthogonalityResult, OtelExportConfig, OtelExporter, type OtelPipelineHandle, type OtelPipelineOptions, PairwiseSteeringOptimizer, type ParaphraseRobustnessScenarioInput, type ParaphraseRobustnessScenarioResult, type ParseStudentLabel, PersonaConfig, type Playbook, type PlaybookEntry, type PoolSlot, type PrReviewAuditCase, type PrReviewBenchmarkSummary, type PrReviewComment, type PrReviewMatchedFinding, type PrReviewOutcome, type PrReviewReferenceFinding, type PrReviewScore, type PrReviewScoreWeights, type PrReviewSeverity, type PrReviewSource, ProductClient, ProductClientConfig, type PromptHandle, PromptRegistry, type ProposeAutomatedPullRequestInput, type ProposeAutomatedPullRequestResult, type RecordRunsOptions, type ReferenceMatchResult, type ReferenceReplayAdapter, type ReferenceReplayAdapterFn, type ReferenceReplayAdapterLike, type ReferenceReplayAggregate, type ReferenceReplayCandidate, type ReferenceReplayCase, type ReferenceReplayCaseRun, type ReferenceReplayExecutionScenario, type ReferenceReplayItem, type ReferenceReplayMatch, type ReferenceReplayMatchStrategy, type ReferenceReplayMatcher, type ReferenceReplayPromotionDecision, type ReferenceReplayPromotionPolicy, type ReferenceReplayRun, type ReferenceReplayRunContext, type ReferenceReplayRunOptions, type ReferenceReplayRunStore, type ReferenceReplayScenario, type ReferenceReplayScenarioScore, type ReferenceReplayScore, type ReferenceReplayScoreOptions, type ReferenceReplaySplit, type ReferenceReplaySplitComparison, type ReferenceReplaySteeringRowsOptions, type ReflectionContext, type ReflectionProposal, ReleaseConfidenceScorecard, ReleaseConfidenceThresholds, type RenderStudentPrompt, type RepoRef, type ReviewerMemoryEntry, type ReviewerOutput, type ReviewerPromptInput, type ReviewerSoftFailDefaults, type ReviewerVerificationSummary, type 
RobustnessResult, Run$1 as Run, type RunCommandInput, type RunCommandResult, type RunConfig, RunCriticOptions, type RunDiff, type RunDistillationOptions, type RunDistillationResult, RunFilter, RunRecord, RunScore, RunScoreWeights, RunTrace, SandboxDriver, SandboxHarnessResult, type SandboxJudgeKind, type SandboxJudgeResult, type SandboxJudgeSpec, type SandboxPool, type ScanOptions, Scenario, type ScenarioCost, ScenarioFile, ScenarioRegistry, ScenarioResult, type Scorecard, type ScorecardCell, type ScorecardCellDiff, type ScorecardDiff, type ScorecardEntry, type ScorecardLogLine, type ScoredTarget, type SelfPlayOptions, type SelfPlayProposer, type SelfPlayScorer, type SeriesConvergenceOptions, type SeriesConvergenceResult, Severity, type SignedManifest, type SignedManifestAlgo, type SingleBackendDivergence, SingleBackendError, type SingleBackendField, type SingleBackendReport, type Slo, type SloCheckResult, type SloComparator, type SloReport, type SloSeverity, type SlopCategory, type SlotFactory, type SplitGoldOptions, SteeringBundle, type SteeringOptimizationResult, type SteeringOptimizationRow, type SteeringOptimizationSelector, type SteeringOptimizerBackend, type SteeringOptimizerConfig, type StepAttribution, type SynthesisReason, type SynthesisTarget, TestResult, type ThresholdContract, TokenCounter, type TokenSpec, TraceEmitter, TraceStore, type TracedAnalystOptions, type TracedJudgeOptions, Trajectory, TrajectoryStep, type TrialTrace, TurnMetrics, UI_FINDING_SEVERITIES, UI_LENSES, UNIVERSAL_FINDERS, type UiFinding, type UiFindingScreenshot, type UiFindingSeverity, type UiLens, VerifyContext, type VisualDiffOptions, type VisualDiffResult, type ViteDeployRunnerInput, type WorkspaceAssertion, type WorkspaceAssertionResult, type WorkspaceInspector, type WorkspaceSnapshot, type WranglerDeployRunnerInput, adversarialJudge, aggregatePrReviewScore, analyzeAntiSlop, analyzeSeries, appendScorecard, assertCrossFamily, assertSingleBackend, attributeCounterfactuals, 
bisect, buildAgreementJudge, buildDriverSystemPrompt, buildReflectionPrompt, buildReviewerPrompt, canaryLeakView, canonicalize, causalAttribution, checkBehavioralCanary, checkCanaries, checkSlos, codeExecutionJudge, coherenceJudge, collectionPreserved, commentsForSource, commitBisect, compareReferenceReplay, compilerJudge, createAntiSlopJudge, createCustomJudge, createDefaultReviewer, createDomainExpertJudge, createIntentMatchJudge, createSandboxPool, crossTraceDiff, decideNextUserTurn, decideReferenceReplayPromotion, decideReferenceReplayRunPromotion, defaultJudges, defaultParseStudentLabel, defaultReferenceReplayMatcher, defaultRenderStudentPrompt, deployGateLayer, diffScorecard, discoverPersonas, distillPlaybook, estimateCost, estimateTokens, evaluateContract, evaluateHypothesis, evaluateOracles, executeScenario, expectAgent, exportRewardModel, extractAssetUrls, extractErrorCount, fieldAgreement, fileContains, fileExists, findAutoMatchNoExpectation, findConstructorCwdDropped, findFallbackToPass, findLiteralTruePass, findSkipCountsAsPass, flowLayer, formatBenchmarkReport, formatDriverReport, formatFindings, formatScorecardDiff, ghCliClient, precision as goldenPrecision, hashContent, hashJson, htmlContainsElement, httpGithubClient, inMemoryReferenceReplayStore, isModelPriced, isOtelConfigured, jsonShape, jsonlReferenceReplayStore, judgeFamily, keyPreserved, linterJudge, loadGoldScenarios, loadScorecard, loadScorerFromGrader, localCommandRunner, lowercaseMutator, matchGoldens, mergeLayerResults, multiToolchainLayer, notBlocked, paraphraseRobustness, paraphraseRobustnessScenarios, parseGoldJsonl, parseReflectionResponse, passOrthogonality, pixelDeltaRatio, politenessPrefixMutator, printDriverSummary, index as profile, promptBisect, proposeAutomatedPullRequest, proposeSynthesisTargets, recordRuns, recordRunsToScorecard, referenceReplayRunsToSteeringRows, referenceReplayScenarioToRunScore, regexMatches, renderMarkdownReport, renderPlaybookMarkdown, 
replayScorerOverCorpus, replayTraceThroughJudge, resetLockedAppendersForTesting, resolveModelPricing, rowCount, rowWhere, runAssertions, runBehavioralCanaries, runCanaries, runCounterfactual, runDistillation, runE2EWorkflow, runExpectations, runIntentMatchJudge, runJudgeFleet, runKeywordCoverageJudge, runKeywordCoverageJudgeUrl, runLiveProof, runReferenceReplay, runSelfPlay, scanForMuffledGates, scoreContinuity, scorePrReviewComments, scorePrReviewSource, scoreReferenceReplay, securityJudge, sentenceReorderMutator, signManifest, splitGold, statusAdvanced, summarizePrReviewBenchmark, testJudge, textInSnapshot, toLangfuseEnvelope, toPrometheusText, traceJudge, traceJudgeEnsemble, tracedAnalyzeTraces, typoMutator, urlContains, verifyManifest, visualDiff, viteDeployRunner, weightedRecall, whitespaceCollapseMutator, withJudgeRetry, withOtelPipeline, wranglerDeployRunner };
package/dist/index.js CHANGED
@@ -42,7 +42,7 @@ import {
42
42
  scoreRedTeamOutput,
43
43
  surfaceContentHash,
44
44
  toolNamesForRun
45
- } from "./chunk-JYE3WOTE.js";
45
+ } from "./chunk-RPLZ4OIB.js";
46
46
  import {
47
47
  BackendIntegrityError,
48
48
  assertRealBackend,
@@ -114,7 +114,7 @@ import {
114
114
  diffFindings,
115
115
  resetLockedAppendersForTesting,
116
116
  runSemanticConceptJudge
117
- } from "./chunk-7W4SM7FD.js";
117
+ } from "./chunk-5LVWPNS5.js";
118
118
  import {
119
119
  AnalystRegistry,
120
120
  DEFAULT_TRACE_ANALYST_KINDS,
@@ -126,7 +126,7 @@ import {
126
126
  createTraceAnalystKind,
127
127
  makeFinding,
128
128
  renderPriorFindings
129
- } from "./chunk-WYIHD6EB.js";
129
+ } from "./chunk-CF67I6QY.js";
130
130
  import {
131
131
  controlFailureClassFromVerification,
132
132
  controlRunToRunRecord,
@@ -137,7 +137,7 @@ import {
137
137
  runProposeReview,
138
138
  runProposeReviewAsControlLoop,
139
139
  scoreFromEvals
140
- } from "./chunk-6EKXFFGQ.js";
140
+ } from "./chunk-RTWFUK6A.js";
141
141
  import {
142
142
  allCriticalPassed,
143
143
  objectiveEval,
@@ -155,7 +155,7 @@ import {
155
155
  } from "./chunk-B26KI423.js";
156
156
  import {
157
157
  runEvalCampaign
158
- } from "./chunk-GJJNJVIR.js";
158
+ } from "./chunk-XXNIODOM.js";
159
159
  import {
160
160
  LlmCallError,
161
161
  LlmClient,
@@ -233,7 +233,7 @@ import {
233
233
  scoreTraceInsightReadiness,
234
234
  tokenizeDomainWords,
235
235
  traceAnalystOnRunComplete
236
- } from "./chunk-XGNCBAVZ.js";
236
+ } from "./chunk-XQL22JDG.js";
237
237
  import {
238
238
  DEFAULT_REDACTION_RULES,
239
239
  REDACTION_VERSION,
@@ -312,7 +312,7 @@ import {
312
312
  validateAgentProfileCell,
313
313
  validateRunRecord,
314
314
  verifyAgentProfileCell
315
- } from "./chunk-F3SRAAZO.js";
315
+ } from "./chunk-KWRRMR3J.js";
316
316
  import {
317
317
  TraceEmitter,
318
318
  llmSpanFromProvider
@@ -4643,6 +4643,28 @@ function statusAdvanced(key, progression) {
4643
4643
  };
4644
4644
  }
4645
4645
 
4646
+ // src/ui-finding.ts
4647
+ var UI_LENSES = [
4648
+ "consistency",
4649
+ "hierarchy",
4650
+ "layout",
4651
+ "ux-flow",
4652
+ "duplication",
4653
+ "accessibility",
4654
+ "responsive",
4655
+ "states",
4656
+ "content",
4657
+ "interaction",
4658
+ "performance-perceived",
4659
+ "other"
4660
+ ];
4661
+ var UI_FINDING_SEVERITIES = [
4662
+ "critical",
4663
+ "high",
4664
+ "med",
4665
+ "low"
4666
+ ];
4667
+
4646
4668
  // src/behavior-dsl.ts
4647
4669
  var BehaviorAssertion = class {
4648
4670
  constructor(store, runId) {
@@ -8680,6 +8702,8 @@ export {
8680
8702
  TraceEmitter,
8681
8703
  TraceFileMissingError,
8682
8704
  TraceNotFoundError,
8705
+ UI_FINDING_SEVERITIES,
8706
+ UI_LENSES,
8683
8707
  UNIVERSAL_FINDERS,
8684
8708
  ValidationError,
8685
8709
  VerificationError,
@@ -8768,6 +8792,7 @@ export {
8768
8792
  corpusInterRaterAgreementFromJudgeScores,
8769
8793
  createAnalystAi,
8770
8794
  createAntiSlopJudge,
8795
+ createChatClient,
8771
8796
  createCustomJudge,
8772
8797
  createDefaultReviewer,
8773
8798
  createDomainExpertJudge,