@tangle-network/agent-eval 0.37.0 → 0.40.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (67)
  1. package/dist/campaign/index.d.ts +695 -0
  2. package/dist/campaign/index.js +741 -0
  3. package/dist/campaign/index.js.map +1 -0
  4. package/dist/chunk-5U2DOJU4.js +565 -0
  5. package/dist/chunk-5U2DOJU4.js.map +1 -0
  6. package/dist/{chunk-KE7TDJUO.js → chunk-AU2JLNSZ.js} +2 -2
  7. package/dist/{chunk-TSPOEDM3.js → chunk-BWZEGTES.js} +2 -5
  8. package/dist/chunk-BWZEGTES.js.map +1 -0
  9. package/dist/{chunk-3HYQXPC2.js → chunk-DMW5VENN.js} +3 -3
  10. package/dist/{chunk-TQL7BAOY.js → chunk-EGIPWXHL.js} +2 -2
  11. package/dist/chunk-GGE4NNQT.js +65 -0
  12. package/dist/chunk-GGE4NNQT.js.map +1 -0
  13. package/dist/{chunk-7PR3WPWE.js → chunk-L7XMNXLO.js} +2 -2
  14. package/dist/{chunk-RL6TERL2.js → chunk-LCIDRYGP.js} +3 -3
  15. package/dist/{chunk-L5UNCDAJ.js → chunk-MAOZCN36.js} +2 -64
  16. package/dist/chunk-MAOZCN36.js.map +1 -0
  17. package/dist/{chunk-LGAPK7NA.js → chunk-NKLGKF2Q.js} +2 -2
  18. package/dist/chunk-QWV226SL.js +276 -0
  19. package/dist/chunk-QWV226SL.js.map +1 -0
  20. package/dist/chunk-TMXPFWC7.js +305 -0
  21. package/dist/chunk-TMXPFWC7.js.map +1 -0
  22. package/dist/{chunk-KHZRNY3F.js → chunk-WP7SY7AI.js} +5 -4
  23. package/dist/chunk-WP7SY7AI.js.map +1 -0
  24. package/dist/chunk-YV7J7X5N.js +313 -0
  25. package/dist/chunk-YV7J7X5N.js.map +1 -0
  26. package/dist/{control-DVrmvM_k.d.ts → control-CmLJk3IG.d.ts} +1 -1
  27. package/dist/control.d.ts +3 -3
  28. package/dist/control.js +2 -2
  29. package/dist/{dataset-ueRVTUoY.d.ts → dataset-BlwAtYYf.d.ts} +1 -1
  30. package/dist/{feedback-trajectory-iATEAHmc.d.ts → feedback-trajectory-Dvy-bt7x.d.ts} +1 -1
  31. package/dist/governance/index.d.ts +133 -5
  32. package/dist/index.d.ts +35 -34
  33. package/dist/index.js +97 -630
  34. package/dist/index.js.map +1 -1
  35. package/dist/matrix/index.d.ts +2 -109
  36. package/dist/matrix/index.js +5 -270
  37. package/dist/matrix/index.js.map +1 -1
  38. package/dist/multishot/index.d.ts +276 -0
  39. package/dist/multishot/index.js +516 -0
  40. package/dist/multishot/index.js.map +1 -0
  41. package/dist/openapi.json +1 -1
  42. package/dist/optimization.d.ts +2 -2
  43. package/dist/optimization.js +5 -5
  44. package/dist/pipelines/index.js +2 -2
  45. package/dist/red-team-30II1T4o.d.ts +63 -0
  46. package/dist/{release-report-D2ykiLSe.d.ts → release-report-Di84bXD7.d.ts} +5 -2
  47. package/dist/reporting.d.ts +2 -2
  48. package/dist/reporting.js +3 -3
  49. package/dist/rl.js +15 -315
  50. package/dist/rl.js.map +1 -1
  51. package/dist/run-campaign-JYJXYHHL.js +10 -0
  52. package/dist/run-campaign-JYJXYHHL.js.map +1 -0
  53. package/dist/traces.js +7 -5
  54. package/dist/types-DHqkLwEU.d.ts +110 -0
  55. package/dist/wire/index.d.ts +2 -2
  56. package/docs/design/loop-taxonomy.md +233 -0
  57. package/package.json +38 -24
  58. package/dist/chunk-KHZRNY3F.js.map +0 -1
  59. package/dist/chunk-L5UNCDAJ.js.map +0 -1
  60. package/dist/chunk-TSPOEDM3.js.map +0 -1
  61. package/dist/index-CN2agEaO.d.ts +0 -191
  62. /package/dist/{chunk-KE7TDJUO.js.map → chunk-AU2JLNSZ.js.map} +0 -0
  63. /package/dist/{chunk-3HYQXPC2.js.map → chunk-DMW5VENN.js.map} +0 -0
  64. /package/dist/{chunk-TQL7BAOY.js.map → chunk-EGIPWXHL.js.map} +0 -0
  65. /package/dist/{chunk-7PR3WPWE.js.map → chunk-L7XMNXLO.js.map} +0 -0
  66. /package/dist/{chunk-RL6TERL2.js.map → chunk-LCIDRYGP.js.map} +0 -0
  67. /package/dist/{chunk-LGAPK7NA.js.map → chunk-NKLGKF2Q.js.map} +0 -0
@@ -112,4 +112,4 @@ declare class Dataset {
112
112
  }
113
113
  declare function hashScenarios(scenarios: DatasetScenario[]): Promise<string>;
114
114
 
115
- export { type DatasetSplit as D, HoldoutLockedError as H, type SliceOptions as S, type DatasetScenario as a, type DatasetManifest as b, Dataset as c, type DatasetDifficulty as d, type DatasetProvenance as e, hashScenarios as h };
115
+ export { type DatasetSplit as D, HoldoutLockedError as H, type SliceOptions as S, type DatasetScenario as a, Dataset as b, type DatasetManifest as c, type DatasetDifficulty as d, type DatasetProvenance as e, hashScenarios as h };
@@ -1,5 +1,5 @@
1
1
  import { C as ControlEvalResult, a as ControlRunResult, b as ControlStep } from './control-runtime-BZ_lVLYW.js';
2
- import { D as DatasetSplit, a as DatasetScenario } from './dataset-ueRVTUoY.js';
2
+ import { D as DatasetSplit, a as DatasetScenario } from './dataset-BlwAtYYf.js';
3
3
 
4
4
  type FeedbackArtifactType = 'text' | 'code' | 'plan' | 'research' | 'action' | 'ui' | 'decision' | 'data' | 'other';
5
5
  type FeedbackLabelSource = 'user' | 'judge' | 'environment' | 'metric' | 'policy' | 'system';
@@ -1,6 +1,134 @@
1
- export { E as EuRiskClass, G as GovernanceContext, a as GovernanceFinding, b as GovernanceReport, U as UseCaseSignals, g as classifyEuAiRisk, h as euAiActReport, n as nistAiRmfReport, j as renderMarkdown, k as soc2Report, l as summarize } from '../index-CN2agEaO.js';
2
- import '../dataset-ueRVTUoY.js';
1
+ import { c as DatasetManifest } from '../dataset-BlwAtYYf.js';
2
+ import { b as CalibrationResult } from '../judge-calibration-DilmB3Ml.js';
3
+ import { O as OutcomeStore } from '../outcome-store-D6KWmYvj.js';
4
+ import { d as RedTeamReport } from '../red-team-30II1T4o.js';
5
+ import { T as TraceStore } from '../store-Db2Bv8Cf.js';
3
6
  import '../errors-mje_cKOs.js';
4
- import '../judge-calibration-DilmB3Ml.js';
5
- import '../outcome-store-D6KWmYvj.js';
6
- import '../store-Db2Bv8Cf.js';
7
+
8
+ /**
9
+ * Governance reporting — shared types.
10
+ *
11
+ * The framework collects a `GovernanceContext` (traces + outcomes +
12
+ * dataset manifests + red-team results + judge calibration) and each
13
+ * specific template (NIST AI RMF, SOC2, EU AI Act) renders a
14
+ * structured report from it.
15
+ *
16
+ * Reports are machine-readable JSON first; human-readable Markdown is a
17
+ * pure transform on top. External auditors consume the Markdown; CI
18
+ * consumes the JSON.
19
+ */
20
+
21
+ interface GovernanceContext {
22
+ /** Legal / org identity for the report. */
23
+ organization: string;
24
+ /** System / agent identifier. */
25
+ systemName: string;
26
+ /** ISO8601 period the report covers. */
27
+ periodStart: string;
28
+ periodEnd: string;
29
+ /** Versioned dataset manifests used during the period. */
30
+ datasets: DatasetManifest[];
31
+ traceStore: TraceStore;
32
+ outcomeStore?: OutcomeStore;
33
+ /** Cached red-team results for the period, if available. */
34
+ redTeam?: RedTeamReport;
35
+ /** Judge-vs-human calibration results, if measured. */
36
+ judgeCalibration?: CalibrationResult[];
37
+ /** Responsible owner for the system — role + name + email. */
38
+ owner: {
39
+ role: string;
40
+ name: string;
41
+ email: string;
42
+ };
43
+ }
44
+ interface GovernanceFinding {
45
+ id: string;
46
+ severity: 'info' | 'low' | 'medium' | 'high' | 'critical';
47
+ /** Control reference the finding maps to (e.g. "NIST-AI-RMF:MEASURE-2.1"). */
48
+ control: string;
49
+ summary: string;
50
+ evidence?: string;
51
+ remediation?: string;
52
+ }
53
+ interface GovernanceReport {
54
+ framework: 'NIST-AI-RMF' | 'SOC2' | 'EU-AI-ACT';
55
+ version: string;
56
+ context: Pick<GovernanceContext, 'organization' | 'systemName' | 'periodStart' | 'periodEnd' | 'owner'>;
57
+ summary: {
58
+ findings: number;
59
+ bySeverity: Record<GovernanceFinding['severity'], number>;
60
+ overall: 'compliant' | 'compliant-with-findings' | 'non-compliant';
61
+ };
62
+ findings: GovernanceFinding[];
63
+ /** Framework-specific structured payload (mapped controls, risk class, etc.). */
64
+ payload: Record<string, unknown>;
65
+ generatedAt: string;
66
+ }
67
+ declare function renderMarkdown(report: GovernanceReport): string;
68
+ declare function summarize(findings: GovernanceFinding[]): GovernanceReport['summary'];
69
+
70
+ /**
71
+ * EU AI Act — risk-class classification + compliance checklist.
72
+ *
73
+ * Classification is declarative: caller supplies the domain/use-case
74
+ * signals (biometric? critical infrastructure? education? employment?
75
+ * access to services?) and we map to the Act's risk tiers:
76
+ * - "unacceptable" (prohibited)
77
+ * - "high" (Annex III — strict obligations)
78
+ * - "limited" (transparency obligations)
79
+ * - "minimal" (voluntary codes of conduct)
80
+ *
81
+ * Then the compliance checklist enumerates Article 9 (risk mgmt),
82
+ * 10 (data + data governance), 11 (technical documentation), 13
83
+ * (transparency), 14 (human oversight), 15 (accuracy + robustness)
84
+ * requirements and flags gaps.
85
+ */
86
+
87
+ type EuRiskClass = 'unacceptable' | 'high' | 'limited' | 'minimal';
88
+ interface UseCaseSignals {
89
+ /** Used for biometric identification in public spaces? (Art. 5 — unacceptable). */
90
+ biometricPublic?: boolean;
91
+ /** Social scoring by public authorities? (Art. 5). */
92
+ socialScoring?: boolean;
93
+ /** Subliminal manipulation? (Art. 5). */
94
+ subliminal?: boolean;
95
+ /** Annex III sector: critical infrastructure / education / employment /
96
+ * access to essential services / law enforcement / migration /
97
+ * administration of justice / democratic processes? */
98
+ annexIII?: boolean;
99
+ /** Interacts directly with natural persons (chatbot, agent)? — limited risk. */
100
+ chatbot?: boolean;
101
+ /** Generates synthetic media (image/audio/video/text deepfakes)? — limited risk. */
102
+ generatesSyntheticMedia?: boolean;
103
+ }
104
+ declare function classifyEuAiRisk(signals: UseCaseSignals): EuRiskClass;
105
+ declare function euAiActReport(ctx: GovernanceContext, signals: UseCaseSignals): Promise<GovernanceReport>;
106
+
107
+ /**
108
+ * NIST AI RMF 1.0 — Govern / Map / Measure / Manage mapping.
109
+ *
110
+ * Each subcategory derives its status from concrete framework state:
111
+ * MEASURE 2.x: do we have a calibration regime? contamination controls?
112
+ * MEASURE 2.7: are red-team results available?
113
+ * MANAGE 1.x: are outcome metrics captured? correlation measured?
114
+ * GOVERN 1.x: dataset + prompt provenance recorded?
115
+ *
116
+ * We ship the mapping and the derivation rules; consumers supply the
117
+ * GovernanceContext.
118
+ */
119
+
120
+ declare function nistAiRmfReport(ctx: GovernanceContext): Promise<GovernanceReport>;
121
+
122
+ /**
123
+ * SOC 2 — Common Criteria 7 (system operations + change management)
124
+ * audit trail derived from the trace corpus.
125
+ *
126
+ * This is NOT a formal SOC2 report — that requires an external
127
+ * auditor. What we ship is the machine-readable *evidence* package
128
+ * that an auditor consumes: run counts, deploy events, access log
129
+ * summary, anomaly tracking, response-time SLOs.
130
+ */
131
+
132
+ declare function soc2Report(ctx: GovernanceContext): Promise<GovernanceReport>;
133
+
134
+ export { type EuRiskClass, type GovernanceContext, type GovernanceFinding, type GovernanceReport, type UseCaseSignals, classifyEuAiRisk, euAiActReport, nistAiRmfReport, renderMarkdown, soc2Report, summarize };
package/dist/index.d.ts CHANGED
@@ -1,4 +1,4 @@
1
- export { A as ActionExecutionPolicy, a as ActionPolicyDecision, C as ControlRunToRunRecordOptions, L as LlmJsonCall, b as LlmReviewerConfig, P as ProposeFn, c as ProposeInput, d as ProposeOutput, e as ProposeReviewConfig, f as ProposeReviewControlAction, g as ProposeReviewControlConfig, h as ProposeReviewControlResult, i as ProposeReviewControlState, j as ProposeReviewReport, k as ProposeReviewShot, R as Review, l as ReviewFn, m as ReviewInput, n as ReviewMemoryEntry, o as ReviewMemoryStore, p as RunEvidenceMetadata, V as Verification, q as VerifyFn, r as controlFailureClassFromVerification, s as controlRunToRunRecord, t as createLlmReviewer, u as evaluateActionPolicy, v as inMemoryReviewStore, w as jsonlReviewStore, x as runProposeReview, y as runProposeReviewAsControlLoop, z as scoreFromEvals } from './control-DVrmvM_k.js';
1
+ export { A as ActionExecutionPolicy, a as ActionPolicyDecision, C as ControlRunToRunRecordOptions, L as LlmJsonCall, b as LlmReviewerConfig, P as ProposeFn, c as ProposeInput, d as ProposeOutput, e as ProposeReviewConfig, f as ProposeReviewControlAction, g as ProposeReviewControlConfig, h as ProposeReviewControlResult, i as ProposeReviewControlState, j as ProposeReviewReport, k as ProposeReviewShot, R as Review, l as ReviewFn, m as ReviewInput, n as ReviewMemoryEntry, o as ReviewMemoryStore, p as RunEvidenceMetadata, V as Verification, q as VerifyFn, r as controlFailureClassFromVerification, s as controlRunToRunRecord, t as createLlmReviewer, u as evaluateActionPolicy, v as inMemoryReviewStore, w as jsonlReviewStore, x as runProposeReview, y as runProposeReviewAsControlLoop, z as scoreFromEvals } from './control-CmLJk3IG.js';
2
2
  import { R as RunRecord, a as RunSplitTag } from './run-record-BGY6bHRh.js';
3
3
  export { e as AGENT_PROFILE_KINDS, A as AgentProfileCell, d as AgentProfileCellInput, f as AgentProfileCellSchemaVersion, g as AgentProfileCellValidationError, h as AgentProfileDimensionValue, i as AgentProfileHarness, j as AgentProfileJson, k as AgentProfileKind, l as AgentProfileSource, m as AgentProfileSourceInput, J as JudgeScoresRecord, c as RunJudgeMetadata, n as RunOutcome, o as RunRecordValidationError, b as RunTokenUsage, S as SandboxAgentProfileLike, p as agentProfileCellHashMaterial, q as agentProfileCellKey, r as assertRunAgentProfileCell, s as buildAgentProfileCell, t as buildSandboxAgentProfileCell, u as groupRunsByAgentProfileCell, v as isRunRecord, w as parseRunRecordSafe, x as requireAgentProfileCell, y as roundTripRunRecord, z as toAgentProfileJson, B as validateAgentProfileCell, C as validateRunRecord, D as verifyAgentProfileCell } from './run-record-BGY6bHRh.js';
4
4
  import { AxAIService, AxFunction } from '@ax-llm/ax';
@@ -10,16 +10,16 @@ import { L as LlmClientOptions, m as LlmCallRequest, n as LlmCallResult } from '
10
10
  export { C as CallbackResearcher, a as CallbackResearcherOptions, b as CampaignFactoryParams, c as CampaignIntegrityPolicy, d as CampaignRunContext, e as CampaignRunOutcome, f as CampaignRunner, g as CampaignScenario, h as CampaignVariant, E as EvalCampaignOptions, i as EvalCampaignResult, j as ExperimentPlan, k as ExperimentResult, F as FailedRun, l as FailureMode, o as LlmCallError, p as LlmClient, q as LlmMessage, s as LlmRouteAssertionError, t as LlmRouteRequirements, u as LlmUsage, N as NoopResearcher, R as Researcher, S as SteeringChange, v as assertLlmRoute, w as backoffMs, x as callLlm, y as callLlmJson, z as isTransientLlmError, A as probeLlm, r as runEvalCampaign, B as stripFencedJson } from './researcher-DeZ_EArp.js';
11
11
  import { TraceAnalysisStore, AnalyzeTracesOptions, OtelExporter, OtelExportConfig, AnalyzeTracesInput, AnalyzeTracesResult } from './traces.js';
12
12
  export { AnalyzeTracesTurnSnapshot, DEFAULT_REDACTION_RULES, DEFAULT_TRACE_ANALYST_BUDGETS, DatasetOverview, ExportableSpan, OTEL_AGENT_EVAL_SCOPE, OtlpExport, OtlpFileTraceStore, OtlpFileTraceStoreOptions, OtlpResourceSpans, OtlpSpan, QueryTracesPage, REDACTION_VERSION, RedactionReport, RedactionRule, ReplayCache, ReplayCacheEntry, ReplayCacheMissError, ReplayCacheStats, ReplayFetchOptions, SearchSpanResult, SearchTraceResult, SpanMatchRecord, SpanNotFoundError, TRACE_ANALYST_ACTOR_DESCRIPTION, TRACE_ANALYST_ACTOR_DESCRIPTION_VERSION, TRACE_ANALYST_SUBAGENT_DESCRIPTION, TRACE_ANALYST_TRUNCATION_MARKER_PREFIX, TraceAnalystByteBudgets, TraceAnalystFilters, TraceAnalystHookOptions, TraceAnalystSpan, TraceAnalystSpanKind, TraceAnalystSpanStatus, TraceAnalystTraceSummary, TraceFileMissingError, TraceInsightContext, TraceInsightFinding, TraceInsightPanelRole, TraceInsightPromptInput, TraceInsightQualityGate, TraceInsightQuestion, TraceInsightReadiness, TraceInsightSuite, TraceInsightTask, TraceNotFoundError, ViewSpansResult, ViewTraceOversized, ViewTraceResult, analyzeTraces, buildTraceAnalystTools, buildTraceInsightContext, buildTraceInsightPrompt, createOtelExporter, createOtelTracingStore, createReplayFetch, defaultTraceInsightPanel, describeTraceInsightScope, domainEvidencePattern, exportRunAsOtlp, inferDomainKeywords, iterateRawCalls, otelRunCompleteHook, planTraceInsightQuestions, redactString, redactValue, scoreTraceInsightReadiness, tokenizeDomainWords, traceAnalystFunctionGroup, traceAnalystOnRunComplete } from './traces.js';
13
- import { t as JudgeInput, u as JudgeFn, v as BenchmarkRunnerConfig, S as Scenario, x as BenchmarkReport, y as ProductClientConfig, C as CheckResult, T as TestResult, z as PersonaConfig, D as DriverResult, A as DriverState, E as CollectedArtifacts, F as ScenarioResult, i as ReleaseConfidenceThresholds, g as ReleaseConfidenceScorecard, G as TurnMetrics, H as ScenarioFile, I as CompletionCriterion } from './release-report-D2ykiLSe.js';
14
- export { K as ArtifactCheck, L as ArtifactResult, B as BootstrapOptions, a as BootstrapResult, M as CorpusAgreementOptions, N as CorpusAgreementPerDimension, O as CorpusAgreementReport, Q as CorpusScoreRecord, U as EvalResult, W as FeedbackPattern, X as JudgeConfig, J as JudgeReplayGateArgs, Y as JudgeRubric, Z as JudgeScore, P as PairedBootstrapOptions, b as PairedBootstrapResult, _ as PersonaRigor, R as ReleaseConfidenceAxis, c as ReleaseConfidenceAxisName, d as ReleaseConfidenceInput, e as ReleaseConfidenceIssue, f as ReleaseConfidenceMetrics, h as ReleaseConfidenceStatus, j as ReleaseTraceEvidence, k as RenderReleaseReportOptions, $ as RouteMap, a0 as RubricDimension, a1 as Turn, a2 as TurnResult, V as Verdict, l as assertReleaseConfidence, m as benjaminiHochberg, a3 as bonferroni, n as bootstrapCi, a4 as cohensD, a5 as confidenceInterval, a6 as corpusInterRaterAgreement, a7 as corpusInterRaterAgreementFromJudgeScores, o as evaluateReleaseConfidence, a8 as interRaterReliability, p as judgeReplayGate, a9 as mannWhitneyU, aa as normalizeScores, q as pairedBootstrap, ab as pairedMde, ac as pairedTTest, ad as partialCredit, r as releaseTraceEvidenceFromMultiShotTrials, s as renderReleaseReport, ae as requiredSampleSize, af as weightedMean, w as wilcoxonSignedRank } from './release-report-D2ykiLSe.js';
13
+ import { t as JudgeInput, u as JudgeFn, v as BenchmarkRunnerConfig, S as Scenario, x as BenchmarkReport, y as ProductClientConfig, C as CheckResult, T as TestResult, z as PersonaConfig, D as DriverResult, A as DriverState, E as CollectedArtifacts, F as ScenarioResult, i as ReleaseConfidenceThresholds, g as ReleaseConfidenceScorecard, G as TurnMetrics, H as ScenarioFile, I as CompletionCriterion } from './release-report-Di84bXD7.js';
14
+ export { K as ArtifactCheck, L as ArtifactResult, B as BootstrapOptions, a as BootstrapResult, M as CorpusAgreementOptions, N as CorpusAgreementPerDimension, O as CorpusAgreementReport, Q as CorpusScoreRecord, U as EvalResult, W as FeedbackPattern, X as JudgeConfig, J as JudgeReplayGateArgs, Y as JudgeRubric, Z as JudgeScore, P as PairedBootstrapOptions, b as PairedBootstrapResult, _ as PersonaRigor, R as ReleaseConfidenceAxis, c as ReleaseConfidenceAxisName, d as ReleaseConfidenceInput, e as ReleaseConfidenceIssue, f as ReleaseConfidenceMetrics, h as ReleaseConfidenceStatus, j as ReleaseTraceEvidence, k as RenderReleaseReportOptions, $ as RouteMap, a0 as RubricDimension, a1 as Turn, a2 as TurnResult, V as Verdict, l as assertReleaseConfidence, m as benjaminiHochberg, a3 as bonferroni, n as bootstrapCi, a4 as cohensD, a5 as confidenceInterval, a6 as corpusInterRaterAgreement, a7 as corpusInterRaterAgreementFromJudgeScores, o as evaluateReleaseConfidence, a8 as interRaterReliability, p as judgeReplayGate, a9 as mannWhitneyU, aa as normalizeScores, q as pairedBootstrap, ab as pairedMde, ac as pairedTTest, ad as partialCredit, r as releaseTraceEvidenceFromMultiShotTrials, s as renderReleaseReport, ae as requiredSampleSize, af as weightedMean, w as wilcoxonSignedRank } from './release-report-Di84bXD7.js';
15
15
  import { TCloud } from '@tangle-network/tcloud';
16
16
  import { z } from 'zod';
17
17
  import { C as ControlEvalResult } from './control-runtime-BZ_lVLYW.js';
18
18
  export { c as ControlActionFailureMode, d as ControlActionOutcome, e as ControlBudget, f as ControlContext, g as ControlDecision, a as ControlRunResult, h as ControlRuntimeConfig, i as ControlRuntimeError, j as ControlSeverity, b as ControlStep, k as ControlStopPolicies, S as StopDecision, l as allCriticalPassed, o as objectiveEval, r as runAgentControlLoop, s as stopOnNoProgress, m as stopOnRepeatedAction, n as subjectiveEval } from './control-runtime-BZ_lVLYW.js';
19
19
  import { A as AgentEvalError } from './errors-mje_cKOs.js';
20
20
  export { a as AgentEvalErrorCode, C as CaptureIntegrityError, b as ConfigError, J as JudgeError, N as NotFoundError, R as ReplayError, V as ValidationError, c as VerificationError } from './errors-mje_cKOs.js';
21
- import { b as FeedbackLabel, n as FeedbackTrajectoryStore, l as FeedbackTrajectory } from './feedback-trajectory-iATEAHmc.js';
22
- export { F as FeedbackArtifactType, a as FeedbackAttempt, c as FeedbackLabelKind, d as FeedbackLabelSource, e as FeedbackOptimizerRow, f as FeedbackOutcome, g as FeedbackReplayAdapter, h as FeedbackReplayResult, i as FeedbackSeverity, j as FeedbackSplitPolicy, k as FeedbackTask, m as FeedbackTrajectoryFilter, o as FileSystemFeedbackTrajectoryStore, I as InMemoryFeedbackTrajectoryStore, P as PreferenceMemoryEntry, p as ProposedSideEffect, q as assignFeedbackSplit, r as controlRunToFeedbackTrajectory, s as createFeedbackTrajectory, t as feedbackTrajectoriesToDatasetScenarios, u as feedbackTrajectoriesToOptimizerRows, v as feedbackTrajectoryToDatasetScenario, w as feedbackTrajectoryToOptimizerRow, x as parseFeedbackTrajectoriesJsonl, y as renderPreferenceMemoryMarkdown, z as replayFeedbackTrajectories, A as replayFeedbackTrajectory, B as serializeFeedbackTrajectoriesJsonl, C as summarizePreferenceMemory, D as withAssignedFeedbackSplit } from './feedback-trajectory-iATEAHmc.js';
21
+ import { b as FeedbackLabel, n as FeedbackTrajectoryStore, l as FeedbackTrajectory } from './feedback-trajectory-Dvy-bt7x.js';
22
+ export { F as FeedbackArtifactType, a as FeedbackAttempt, c as FeedbackLabelKind, d as FeedbackLabelSource, e as FeedbackOptimizerRow, f as FeedbackOutcome, g as FeedbackReplayAdapter, h as FeedbackReplayResult, i as FeedbackSeverity, j as FeedbackSplitPolicy, k as FeedbackTask, m as FeedbackTrajectoryFilter, o as FileSystemFeedbackTrajectoryStore, I as InMemoryFeedbackTrajectoryStore, P as PreferenceMemoryEntry, p as ProposedSideEffect, q as assignFeedbackSplit, r as controlRunToFeedbackTrajectory, s as createFeedbackTrajectory, t as feedbackTrajectoriesToDatasetScenarios, u as feedbackTrajectoriesToOptimizerRows, v as feedbackTrajectoryToDatasetScenario, w as feedbackTrajectoryToOptimizerRow, x as parseFeedbackTrajectoriesJsonl, y as renderPreferenceMemoryMarkdown, z as replayFeedbackTrajectories, A as replayFeedbackTrajectory, B as serializeFeedbackTrajectoriesJsonl, C as summarizePreferenceMemory, D as withAssignedFeedbackSplit } from './feedback-trajectory-Dvy-bt7x.js';
23
23
  import { A as ActionableSideInfo, h as MultiShotRunner, j as MultiShotScorer, c as MultiShotMutateAdapter, a4 as HeldOutGateConfig, E as EvolvableVariant, m as MultiShotTrialResult, e as MultiShotOptimizationResult, a3 as GateDecision, a5 as Objective, a6 as ParetoResult, V as VariantAggregate, t as TrialResult, o as MutateAdapter, T as TrialCache } from './summary-report-DuZXOk7K.js';
24
24
  export { a as AsiSeverity, D as DEFAULT_MUTATION_PRIMITIVES, a7 as Direction, C as GainDistributionBin, F as GainDistributionFigureSpec, H as GainDistributionOptions, a8 as GateEvidence, G as GenerationReport, a9 as HeldOutGate, aa as HeldOutGateRejectionCode, I as InMemoryTrialCache, M as MultiShotGateConfig, b as MultiShotGateResult, d as MultiShotOptimizationConfig, f as MultiShotRun, g as MultiShotRunInput, i as MultiShotScore, k as MultiShotSplit, l as MultiShotTrace, n as MultiShotVariant, J as ParetoFigureSpec, K as ParetoPoint, P as PromptEvolutionConfig, p as PromptEvolutionEvent, q as PromptEvolutionResult, L as RESEARCH_REPORT_HARD_PAIR_FLOOR, R as ReflectionContext, r as ReflectionProposal, N as ResearchReport, O as ResearchReportCandidate, Q as ResearchReportDecision, U as ResearchReportMethodology, W as ResearchReportOptions, X as ResearchReportRecommendation, S as ScenarioAggregate, s as ScoreAdapter, Y as SummaryTable, Z as SummaryTableOptions, _ as SummaryTableRow, u as TrialTrace, v as buildReflectionPrompt, ab as crowdingDistance, w as defaultMultiShotObjectives, ac as dominates, $ as gainHistogram, a0 as paretoChart, ad as paretoFrontier, ae as paretoFrontierWithCrowding, x as parseReflectionResponse, a1 as researchReport, y as runMultiShotOptimization, z as runPromptEvolution, af as scalarScore, a2 as summaryTable, B as trialTraceFromMultiShotTrial } from './summary-report-DuZXOk7K.js';
25
25
  export { DataAcquisitionPlan, KnowledgeAcquisitionMode, KnowledgeBundle, KnowledgeFallbackPolicy, KnowledgeFreshness, KnowledgeImportance, KnowledgeReadinessReport, KnowledgeRecommendedAction, KnowledgeRequirement, KnowledgeRequirementCategory, KnowledgeResponsibleSurface, KnowledgeSensitivity, ScoreKnowledgeReadinessOptions, UserQuestion, acquisitionPlansForKnowledgeGaps, blockingKnowledgeEval, knowledgeReadinessTracePayload, scoreKnowledgeReadiness, userQuestionsForKnowledgeGaps } from './knowledge/index.js';
@@ -35,11 +35,12 @@ import { a as BaselineReport } from './baseline-4R5deP0N.js';
35
35
  export { B as BaselineOptions, M as MetricSamples, b as MetricVerdict, T as ToolStats, d as ToolUseMetrics, e as ToolUseOptions, f as compareToBaseline, c as computeToolUseMetrics, i as iqr, w as welchsTTest } from './baseline-4R5deP0N.js';
36
36
  import { T as Trajectory, a as TrajectoryStep } from './trajectory-CnoBo-JY.js';
37
37
  export { b as buildTrajectory } from './trajectory-CnoBo-JY.js';
38
- import { a as DatasetScenario, c as Dataset } from './dataset-ueRVTUoY.js';
39
- export { d as DatasetDifficulty, b as DatasetManifest, e as DatasetProvenance, D as DatasetSplit, H as HoldoutLockedError, S as SliceOptions, h as hashScenarios } from './dataset-ueRVTUoY.js';
38
+ import { a as DatasetScenario, b as Dataset } from './dataset-BlwAtYYf.js';
39
+ export { d as DatasetDifficulty, c as DatasetManifest, e as DatasetProvenance, D as DatasetSplit, H as HoldoutLockedError, S as SliceOptions, h as hashScenarios } from './dataset-BlwAtYYf.js';
40
40
  export { b as CalibrationResult, c as CandidateScore, a as ContinuousAgreement, C as ContinuousAgreementOptions, d as ContinuousCalibrationResult, G as GoldenItem, P as PositionalBiasResult, S as SelfPreferenceResult, V as VerbosityBiasResult, e as calibrateJudge, f as calibrateJudgeContinuous, g as continuousAgreement, p as positionalBias, s as selfPreference, v as verbosityBias } from './judge-calibration-DilmB3Ml.js';
41
- export { D as DEFAULT_RED_TEAM_CORPUS, E as EuRiskClass, G as GovernanceContext, a as GovernanceFinding, b as GovernanceReport, R as RedTeamCase, c as RedTeamCategory, d as RedTeamFinding, e as RedTeamPayload, f as RedTeamReport, U as UseCaseSignals, g as classifyEuAiRisk, h as euAiActReport, n as nistAiRmfReport, r as redTeamDataset, i as redTeamReport, j as renderMarkdown, s as scoreRedTeamOutput, k as soc2Report, l as summarize, t as toolNamesForRun } from './index-CN2agEaO.js';
41
+ export { D as DEFAULT_RED_TEAM_CORPUS, R as RedTeamCase, a as RedTeamCategory, b as RedTeamFinding, c as RedTeamPayload, d as RedTeamReport, r as redTeamDataset, e as redTeamReport, s as scoreRedTeamOutput, t as toolNamesForRun } from './red-team-30II1T4o.js';
42
42
  import { a as PrmGrader } from './rubric-D5tjHNJQ.js';
43
+ export { EuRiskClass, GovernanceContext, GovernanceFinding, GovernanceReport, UseCaseSignals, classifyEuAiRisk, euAiActReport, nistAiRmfReport, renderMarkdown, soc2Report, summarize } from './governance/index.js';
43
44
  export { B as BENCHMARK_SPLIT_SEED, a as BenchmarkAdapter, b as BenchmarkDatasetItem, c as BenchmarkEvaluation, d as benchmarkDeterministicSplit, i as benchmarks } from './index-0pu_fBwZ.js';
44
45
  export { I as InterimReleaseConfidence, a as InterimReleaseConfidenceInput, P as PairedEvalueOptions, b as PairedEvalueSequence, c as PairedEvalueStep, S as SequentialDecision, e as evaluateInterimReleaseConfidence, p as pairedEvalueSequence } from './sequential-5iSVfzl2.js';
45
46
  import './outcome-store-D6KWmYvj.js';
@@ -1159,7 +1160,7 @@ interface AnalystHooks {
1159
1160
  analyst: Analyst;
1160
1161
  error: Error;
1161
1162
  runId: string;
1162
- }): AnalystFinding[] | void | Promise<AnalystFinding[] | void>;
1163
+ }): AnalystFinding[] | undefined | Promise<AnalystFinding[] | undefined>;
1163
1164
  /** Once after registry.run() completes. Use for final aggregation, persistence. */
1164
1165
  onComplete?(args: {
1165
1166
  result: AnalystRunResult;
@@ -6311,6 +6312,31 @@ declare function withOtelPipeline(opts?: OtelPipelineOptions): OtelPipelineHandl
6311
6312
  */
6312
6313
  declare function isOtelConfigured(): boolean;
6313
6314
 
6315
+ /**
6316
+ * Traced analyst wrapper — instruments `analyzeTraces` with spans so the
6317
+ * analyst's internal LLM calls (actor + responder turns) appear in the
6318
+ * trace tree. Also wraps each actor turn callback with a span.
6319
+ *
6320
+ * Since the analyst uses @ax-llm/ax internally (an agent framework with
6321
+ * its own turn loop), we cannot wrap individual `tc.chat()` calls without
6322
+ * forking ax. Instead, we wrap at the boundary:
6323
+ * 1. A parent span for the entire analyst run.
6324
+ * 2. Per-turn child spans from the `onTurn` callback (captures code,
6325
+ * output size, error status).
6326
+ * 3. Summary attributes on the parent (total turns, usage, findings).
6327
+ */
6328
+
6329
+ interface TracedAnalystOptions {
6330
+ /** TraceEmitter for span emission. */
6331
+ emitter: TraceEmitter;
6332
+ /** Parent span id. If omitted, uses emitter stack. */
6333
+ parentSpanId?: string;
6334
+ }
6335
+ /**
6336
+ * Run `analyzeTraces` wrapped in a parent span with per-turn child spans.
6337
+ */
6338
+ declare function tracedAnalyzeTraces(input: AnalyzeTracesInput, options: AnalyzeTracesOptions, traceOpts: TracedAnalystOptions): Promise<AnalyzeTracesResult>;
6339
+
6314
6340
  /**
6315
6341
  * Traced judge wrappers — instruments every LLM call inside the judge
6316
6342
  * ensemble with child spans so OTEL sinks see per-judge latency, model,
@@ -6337,31 +6363,6 @@ declare function traceJudge(judge: JudgeFn, judgeName: string, opts: TracedJudge
6337
6363
  */
6338
6364
  declare function traceJudgeEnsemble(judges: JudgeFn[], judgeNames: string[], opts: TracedJudgeOptions): JudgeFn;
6339
6365
 
6340
- /**
6341
- * Traced analyst wrapper — instruments `analyzeTraces` with spans so the
6342
- * analyst's internal LLM calls (actor + responder turns) appear in the
6343
- * trace tree. Also wraps each actor turn callback with a span.
6344
- *
6345
- * Since the analyst uses @ax-llm/ax internally (an agent framework with
6346
- * its own turn loop), we cannot wrap individual `tc.chat()` calls without
6347
- * forking ax. Instead, we wrap at the boundary:
6348
- * 1. A parent span for the entire analyst run.
6349
- * 2. Per-turn child spans from the `onTurn` callback (captures code,
6350
- * output size, error status).
6351
- * 3. Summary attributes on the parent (total turns, usage, findings).
6352
- */
6353
-
6354
- interface TracedAnalystOptions {
6355
- /** TraceEmitter for span emission. */
6356
- emitter: TraceEmitter;
6357
- /** Parent span id. If omitted, uses emitter stack. */
6358
- parentSpanId?: string;
6359
- }
6360
- /**
6361
- * Run `analyzeTraces` wrapped in a parent span with per-turn child spans.
6362
- */
6363
- declare function tracedAnalyzeTraces(input: AnalyzeTracesInput, options: AnalyzeTracesOptions, traceOpts: TracedAnalystOptions): Promise<AnalyzeTracesResult>;
6364
-
6365
6366
  /**
6366
6367
  * Traced mutator wrapper — instruments reflective-mutation LLM calls.
6367
6368
  *