@tangle-network/agent-eval 0.23.1 → 0.24.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (148) hide show
  1. package/CHANGELOG.md +80 -0
  2. package/README.md +141 -79
  3. package/dist/baseline-4R5deP0N.d.ts +108 -0
  4. package/dist/benchmarks/index.d.ts +3 -2
  5. package/dist/benchmarks/index.js +1 -1
  6. package/dist/builder-eval/index.d.ts +249 -0
  7. package/dist/builder-eval/index.js +391 -0
  8. package/dist/builder-eval/index.js.map +1 -0
  9. package/dist/{chunk-IOXMGMHQ.js → chunk-2A5XJB43.js} +142 -318
  10. package/dist/chunk-2A5XJB43.js.map +1 -0
  11. package/dist/chunk-47X6LRCE.js +76 -0
  12. package/dist/chunk-47X6LRCE.js.map +1 -0
  13. package/dist/{chunk-6M774GY6.js → chunk-4F5DQN55.js} +1 -1
  14. package/dist/chunk-4F5DQN55.js.map +1 -0
  15. package/dist/{chunk-KAO3Q65R.js → chunk-4S4BM3QQ.js} +15 -13
  16. package/dist/chunk-4S4BM3QQ.js.map +1 -0
  17. package/dist/chunk-5BKGXME7.js +65 -0
  18. package/dist/chunk-5BKGXME7.js.map +1 -0
  19. package/dist/{chunk-42I2QC2L.js → chunk-6QDKWHLS.js} +18 -14
  20. package/dist/chunk-6QDKWHLS.js.map +1 -0
  21. package/dist/chunk-I4MBDTY5.js +272 -0
  22. package/dist/chunk-I4MBDTY5.js.map +1 -0
  23. package/dist/chunk-K2TPS5LB.js +569 -0
  24. package/dist/chunk-K2TPS5LB.js.map +1 -0
  25. package/dist/chunk-KKHDIONI.js +414 -0
  26. package/dist/chunk-KKHDIONI.js.map +1 -0
  27. package/dist/chunk-KMPRBJK4.js +74 -0
  28. package/dist/chunk-KMPRBJK4.js.map +1 -0
  29. package/dist/{chunk-QUKKGHTZ.js → chunk-KTGTIOFD.js} +6 -3
  30. package/dist/chunk-KTGTIOFD.js.map +1 -0
  31. package/dist/chunk-LSH4MMOZ.js +838 -0
  32. package/dist/chunk-LSH4MMOZ.js.map +1 -0
  33. package/dist/chunk-NG236HPC.js +57 -0
  34. package/dist/chunk-NG236HPC.js.map +1 -0
  35. package/dist/{chunk-QBW3YBTR.js → chunk-NLMNWKVM.js} +14 -6
  36. package/dist/chunk-NLMNWKVM.js.map +1 -0
  37. package/dist/chunk-NU65VQ7M.js +99 -0
  38. package/dist/chunk-NU65VQ7M.js.map +1 -0
  39. package/dist/chunk-OHEPNJQN.js +554 -0
  40. package/dist/chunk-OHEPNJQN.js.map +1 -0
  41. package/dist/chunk-OWLAAMME.js +250 -0
  42. package/dist/chunk-OWLAAMME.js.map +1 -0
  43. package/dist/{chunk-SQQLHODJ.js → chunk-PC4UYEBM.js} +7 -4
  44. package/dist/chunk-PC4UYEBM.js.map +1 -0
  45. package/dist/{chunk-7EAUOUQS.js → chunk-RAF443UI.js} +213 -115
  46. package/dist/chunk-RAF443UI.js.map +1 -0
  47. package/dist/chunk-RZTMDUO7.js +49 -0
  48. package/dist/chunk-RZTMDUO7.js.map +1 -0
  49. package/dist/{chunk-EXGR4XEM.js → chunk-SESZDQPX.js} +23 -19
  50. package/dist/chunk-SESZDQPX.js.map +1 -0
  51. package/dist/{chunk-6KQG5HAH.js → chunk-SY6WAAAD.js} +84 -71
  52. package/dist/chunk-SY6WAAAD.js.map +1 -0
  53. package/dist/{chunk-5IIQKMD5.js → chunk-TVVP3ZZQ.js} +14 -4
  54. package/dist/chunk-TVVP3ZZQ.js.map +1 -0
  55. package/dist/{chunk-VQQSPGSM.js → chunk-VRJVTXRV.js} +169 -111
  56. package/dist/chunk-VRJVTXRV.js.map +1 -0
  57. package/dist/chunk-WWYCWKUM.js +196 -0
  58. package/dist/chunk-WWYCWKUM.js.map +1 -0
  59. package/dist/{chunk-AXHNWLIX.js → chunk-YRZ4M5GS.js} +2 -90
  60. package/dist/chunk-YRZ4M5GS.js.map +1 -0
  61. package/dist/chunk-ZN274SWR.js +613 -0
  62. package/dist/chunk-ZN274SWR.js.map +1 -0
  63. package/dist/cli.js +10 -6
  64. package/dist/cli.js.map +1 -1
  65. package/dist/{control-DvkH87qJ.d.ts → control-CBShYYA6.d.ts} +32 -33
  66. package/dist/control-runtime-BuJHoLg0.d.ts +180 -0
  67. package/dist/control.d.ts +8 -6
  68. package/dist/control.js +10 -7
  69. package/dist/{dataset-B9qvlm_o.d.ts → dataset-CiK_3LDr.d.ts} +5 -2
  70. package/dist/{emitter-B2XqDKFU.d.ts → emitter-DP_cSSiw.d.ts} +1 -1
  71. package/dist/errors-BZ9sTdz7.d.ts +70 -0
  72. package/dist/failure-cluster-C2EGSDiT.d.ts +76 -0
  73. package/dist/feedback-trajectory-DfFdrraJ.d.ts +169 -0
  74. package/dist/governance/index.d.ts +5 -0
  75. package/dist/governance/index.js +18 -0
  76. package/dist/governance/index.js.map +1 -0
  77. package/dist/{index-DDTlbHEK.d.ts → index--fVrWDiR.d.ts} +1 -1
  78. package/dist/index-Oj9fAPPN.d.ts +270 -0
  79. package/dist/index.d.ts +1866 -3151
  80. package/dist/index.js +5457 -7809
  81. package/dist/index.js.map +1 -1
  82. package/dist/{integrity-Cr5YodSY.d.ts → integrity-DK2EBVZC.d.ts} +4 -3
  83. package/dist/knowledge/index.d.ts +102 -0
  84. package/dist/knowledge/index.js +18 -0
  85. package/dist/knowledge/index.js.map +1 -0
  86. package/dist/meta-eval/index.d.ts +99 -0
  87. package/dist/meta-eval/index.js +324 -0
  88. package/dist/meta-eval/index.js.map +1 -0
  89. package/dist/multi-layer-verifier-LkP3LVKj.d.ts +141 -0
  90. package/dist/openapi.json +1 -1
  91. package/dist/optimization.d.ts +11 -8
  92. package/dist/optimization.js +11 -9
  93. package/dist/outcome-store-D6KWmYvj.d.ts +63 -0
  94. package/dist/pipelines/index.d.ts +172 -0
  95. package/dist/pipelines/index.js +409 -0
  96. package/dist/pipelines/index.js.map +1 -0
  97. package/dist/prm/index.d.ts +99 -0
  98. package/dist/prm/index.js +222 -0
  99. package/dist/prm/index.js.map +1 -0
  100. package/dist/query-DODUYdPg.d.ts +30 -0
  101. package/dist/release-report-TDPn1cxq.d.ts +292 -0
  102. package/dist/replay-BL96gCEP.d.ts +226 -0
  103. package/dist/reporting.d.ts +10 -295
  104. package/dist/reporting.js +10 -6
  105. package/dist/{eval-campaign-Ds5QljIh.d.ts → researcher-CUOiGcGv.d.ts} +148 -146
  106. package/dist/rl.d.ts +1762 -8
  107. package/dist/rl.js +2035 -58
  108. package/dist/rl.js.map +1 -1
  109. package/dist/rubric-D5tjHNJQ.d.ts +72 -0
  110. package/dist/rubric-predictive-validity-C0uDYwG6.d.ts +105 -0
  111. package/dist/{run-record-DNiOMBrZ.d.ts → run-record-CqzahIbx.d.ts} +4 -1
  112. package/dist/sequential-Dgz1n51-.d.ts +139 -0
  113. package/dist/{store-u47QaJ9G.d.ts → store-Db2Bv8Cf.d.ts} +1 -1
  114. package/dist/{summary-report-Ce1r4EYo.d.ts → summary-report-BXGs_9V0.d.ts} +3 -76
  115. package/dist/telemetry/file.js +4 -1
  116. package/dist/telemetry/file.js.map +1 -1
  117. package/dist/telemetry/index.js +57 -57
  118. package/dist/telemetry/index.js.map +1 -1
  119. package/dist/test-graded-scenario-B2kWEdh9.d.ts +146 -0
  120. package/dist/traces.d.ts +142 -387
  121. package/dist/traces.js +1302 -40
  122. package/dist/traces.js.map +1 -1
  123. package/dist/trajectory-CnoBo-JY.d.ts +32 -0
  124. package/dist/wire/index.d.ts +22 -22
  125. package/dist/wire/index.js +4 -3
  126. package/package.json +44 -18
  127. package/dist/chunk-42I2QC2L.js.map +0 -1
  128. package/dist/chunk-5IIQKMD5.js.map +0 -1
  129. package/dist/chunk-6KQG5HAH.js.map +0 -1
  130. package/dist/chunk-6M774GY6.js.map +0 -1
  131. package/dist/chunk-7EAUOUQS.js.map +0 -1
  132. package/dist/chunk-AXHNWLIX.js.map +0 -1
  133. package/dist/chunk-EXGR4XEM.js.map +0 -1
  134. package/dist/chunk-IOXMGMHQ.js.map +0 -1
  135. package/dist/chunk-KAO3Q65R.js.map +0 -1
  136. package/dist/chunk-LZKIOBG2.js +0 -2026
  137. package/dist/chunk-LZKIOBG2.js.map +0 -1
  138. package/dist/chunk-QBW3YBTR.js.map +0 -1
  139. package/dist/chunk-QUKKGHTZ.js.map +0 -1
  140. package/dist/chunk-SQQLHODJ.js.map +0 -1
  141. package/dist/chunk-V5QSWN7L.js +0 -1310
  142. package/dist/chunk-V5QSWN7L.js.map +0 -1
  143. package/dist/chunk-VQQSPGSM.js.map +0 -1
  144. package/dist/chunk-XPHOZPOM.js +0 -1947
  145. package/dist/chunk-XPHOZPOM.js.map +0 -1
  146. package/dist/feedback-trajectory-c43WGtTX.d.ts +0 -346
  147. package/dist/index-ekBXweiQ.d.ts +0 -1894
  148. package/dist/sequential-DgU2mFsE.d.ts +0 -304
@@ -0,0 +1,76 @@
1
+ import { R as Run, S as Span, b as TraceEvent, F as FailureClass, T as TraceStore } from './store-Db2Bv8Cf.js';
2
+
3
+ /**
4
+ * Failure taxonomy — canonical classes + a default classifier.
5
+ *
6
+ * Every failed run should end up in a named class. The classifier here
7
+ * is rule-based (fast, deterministic); an LLM fallback can be added by
8
+ * the consumer for novel cases and trained into the rule base over time.
9
+ *
10
+ * Consumers call `classifyFailure({ run, spans, events })` and persist the
11
+ * returned class as `Run.outcome.failureClass`.
12
+ */
13
+
14
+ interface FailureContext {
15
+ run: Run;
16
+ spans: Span[];
17
+ events: TraceEvent[];
18
+ }
19
+ interface FailureClassification {
20
+ failureClass: FailureClass;
21
+ reason: string;
22
+ triggerSpanId?: string;
23
+ triggerEventId?: string;
24
+ }
25
+ /** Ordered rules — first match wins. */
26
+ interface FailureRule {
27
+ id: string;
28
+ match: (ctx: FailureContext) => {
29
+ failureClass: FailureClass;
30
+ reason: string;
31
+ triggerSpanId?: string;
32
+ triggerEventId?: string;
33
+ } | null;
34
+ }
35
+ declare const DEFAULT_RULES: FailureRule[];
36
+ /** Classify the failure mode of a run using an ordered rule list. */
37
+ declare function classifyFailure(ctx: FailureContext, rules?: FailureRule[]): FailureClassification;
38
+
39
+ /**
40
+ * FailureClusterView — groups failed runs by (failureClass, triggerTool,
41
+ * argHash-prefix) so weekly reviews can prioritize the top-N clusters.
42
+ *
43
+ * Each cluster includes: N runs, scenarios affected, representative
44
+ * error message, a proposed mitigation hint (rule → action table).
45
+ */
46
+
47
+ interface FailureCluster {
48
+ failureClass: FailureClass;
49
+ /** Tool name when the trigger was a tool span, else undefined. */
50
+ toolName?: string;
51
+ /** First 16 chars of argHash — clusters similar args. */
52
+ argPrefix?: string;
53
+ /**
54
+ * Source dimension when the trigger was a judge span (e.g. `'format'`,
55
+ * `'safety'`, `'correctness'`). Lets cross-template aggregators
56
+ * group failures by the dimension that fired without overloading
57
+ * `argPrefix`. Optional — legacy clusters without this field
58
+ * deserialize cleanly.
59
+ */
60
+ dimension?: string;
61
+ runCount: number;
62
+ scenarioIds: string[];
63
+ exampleError?: string;
64
+ exampleRunId: string;
65
+ }
66
+ interface FailureClusterReport {
67
+ clusters: FailureCluster[];
68
+ totalFailures: number;
69
+ totalRuns: number;
70
+ }
71
+ declare function failureClusterView(store: TraceStore, options?: {
72
+ rules?: FailureRule[];
73
+ minClusterSize?: number;
74
+ }): Promise<FailureClusterReport>;
75
+
76
+ export { DEFAULT_RULES as D, type FailureClusterReport as F, type FailureCluster as a, type FailureClassification as b, type FailureContext as c, type FailureRule as d, classifyFailure as e, failureClusterView as f };
@@ -0,0 +1,169 @@
1
+ import { C as ControlEvalResult, a as ControlRunResult, b as ControlStep } from './control-runtime-BuJHoLg0.js';
2
+ import { D as DatasetSplit, a as DatasetScenario } from './dataset-CiK_3LDr.js';
3
+
4
+ type FeedbackArtifactType = 'text' | 'code' | 'plan' | 'research' | 'action' | 'ui' | 'decision' | 'data' | 'other';
5
+ type FeedbackLabelSource = 'user' | 'judge' | 'environment' | 'metric' | 'policy' | 'system';
6
+ type FeedbackLabelKind = 'approve' | 'reject' | 'select' | 'edit' | 'rank' | 'rate' | 'comment' | 'metric_outcome' | 'policy_block' | 'revision_request';
7
+ type FeedbackSeverity = 'info' | 'warning' | 'error' | 'critical';
8
+ interface FeedbackTask {
9
+ intent: string;
10
+ context?: unknown;
11
+ }
12
+ interface ProposedSideEffect {
13
+ type: string;
14
+ risk?: 'low' | 'medium' | 'high';
15
+ costUsd?: number;
16
+ externalSideEffect?: boolean;
17
+ requiresApproval?: boolean;
18
+ metadata?: Record<string, unknown>;
19
+ }
20
+ interface FeedbackLabel {
21
+ id?: string;
22
+ source: FeedbackLabelSource;
23
+ kind: FeedbackLabelKind;
24
+ value: unknown;
25
+ reason?: string;
26
+ severity?: FeedbackSeverity;
27
+ createdAt: string;
28
+ metadata?: Record<string, unknown>;
29
+ }
30
+ interface FeedbackAttempt {
31
+ id: string;
32
+ stepIndex: number;
33
+ artifactType: FeedbackArtifactType;
34
+ artifact: unknown;
35
+ options?: unknown[];
36
+ proposedAction?: ProposedSideEffect;
37
+ evals?: ControlEvalResult[];
38
+ feedback?: FeedbackLabel[];
39
+ createdAt: string;
40
+ metadata?: Record<string, unknown>;
41
+ }
42
+ interface FeedbackOutcome {
43
+ success?: boolean;
44
+ score?: number;
45
+ metrics?: Record<string, number>;
46
+ costUsd?: number;
47
+ detail?: string;
48
+ observedAt?: string;
49
+ metadata?: Record<string, unknown>;
50
+ }
51
+ interface FeedbackTrajectory {
52
+ id: string;
53
+ projectId?: string;
54
+ scenarioId?: string;
55
+ task: FeedbackTask;
56
+ attempts: FeedbackAttempt[];
57
+ labels: FeedbackLabel[];
58
+ outcome?: FeedbackOutcome;
59
+ split?: DatasetSplit;
60
+ tags?: Record<string, string>;
61
+ createdAt: string;
62
+ updatedAt?: string;
63
+ metadata?: Record<string, unknown>;
64
+ }
65
+ interface FeedbackTrajectoryStore {
66
+ save(trajectory: FeedbackTrajectory): Promise<void>;
67
+ get(id: string): Promise<FeedbackTrajectory | null>;
68
+ list(filter?: FeedbackTrajectoryFilter): Promise<FeedbackTrajectory[]>;
69
+ appendAttempt(id: string, attempt: FeedbackAttempt): Promise<FeedbackTrajectory>;
70
+ appendLabel(id: string, label: FeedbackLabel, attemptId?: string): Promise<FeedbackTrajectory>;
71
+ }
72
+ interface FeedbackTrajectoryFilter {
73
+ projectId?: string;
74
+ scenarioId?: string;
75
+ split?: DatasetSplit;
76
+ tag?: [string, string];
77
+ }
78
+ interface FeedbackSplitPolicy {
79
+ trainPct?: number;
80
+ devPct?: number;
81
+ testPct?: number;
82
+ holdoutPct?: number;
83
+ }
84
+ interface PreferenceMemoryEntry {
85
+ instruction: string;
86
+ rationale: string;
87
+ weight: number;
88
+ sourceTrajectoryId: string;
89
+ sourceLabelId?: string;
90
+ category?: string;
91
+ }
92
+ interface FeedbackOptimizerRow {
93
+ scenarioId: string;
94
+ trajectoryId: string;
95
+ labelKinds: FeedbackLabelKind[];
96
+ score?: number;
97
+ metadata?: Record<string, unknown>;
98
+ }
99
+ interface FeedbackReplayResult {
100
+ trajectoryId: string;
101
+ pass: boolean;
102
+ score?: number;
103
+ labels: FeedbackLabel[];
104
+ outcome?: FeedbackOutcome;
105
+ metadata?: Record<string, unknown>;
106
+ }
107
+ interface FeedbackReplayAdapter {
108
+ replay(trajectory: FeedbackTrajectory): Promise<Omit<FeedbackReplayResult, 'trajectoryId'>> | Omit<FeedbackReplayResult, 'trajectoryId'>;
109
+ }
110
+ declare class InMemoryFeedbackTrajectoryStore implements FeedbackTrajectoryStore {
111
+ private readonly trajectories;
112
+ save(trajectory: FeedbackTrajectory): Promise<void>;
113
+ get(id: string): Promise<FeedbackTrajectory | null>;
114
+ list(filter?: FeedbackTrajectoryFilter): Promise<FeedbackTrajectory[]>;
115
+ appendAttempt(id: string, attempt: FeedbackAttempt): Promise<FeedbackTrajectory>;
116
+ appendLabel(id: string, label: FeedbackLabel, attemptId?: string): Promise<FeedbackTrajectory>;
117
+ }
118
+ declare class FileSystemFeedbackTrajectoryStore implements FeedbackTrajectoryStore {
119
+ private readonly dir;
120
+ private readonly memory;
121
+ private loaded;
122
+ constructor(options: {
123
+ dir: string;
124
+ });
125
+ save(trajectory: FeedbackTrajectory): Promise<void>;
126
+ get(id: string): Promise<FeedbackTrajectory | null>;
127
+ list(filter?: FeedbackTrajectoryFilter): Promise<FeedbackTrajectory[]>;
128
+ appendAttempt(id: string, attempt: FeedbackAttempt): Promise<FeedbackTrajectory>;
129
+ appendLabel(id: string, label: FeedbackLabel, attemptId?: string): Promise<FeedbackTrajectory>;
130
+ private append;
131
+ private load;
132
+ }
133
+ declare function createFeedbackTrajectory(input: {
134
+ id?: string;
135
+ projectId?: string;
136
+ scenarioId?: string;
137
+ task: FeedbackTask;
138
+ attempts?: FeedbackAttempt[];
139
+ labels?: FeedbackLabel[];
140
+ outcome?: FeedbackOutcome;
141
+ split?: DatasetSplit;
142
+ tags?: Record<string, string>;
143
+ createdAt?: string;
144
+ metadata?: Record<string, unknown>;
145
+ }): FeedbackTrajectory;
146
+ declare function assignFeedbackSplit(trajectory: Pick<FeedbackTrajectory, 'id' | 'projectId' | 'scenarioId' | 'task'>, policy?: FeedbackSplitPolicy): DatasetSplit;
147
+ declare function withAssignedFeedbackSplit(trajectory: FeedbackTrajectory, policy?: FeedbackSplitPolicy): FeedbackTrajectory;
148
+ declare function feedbackTrajectoryToDatasetScenario(trajectory: FeedbackTrajectory): DatasetScenario;
149
+ declare function feedbackTrajectoriesToDatasetScenarios(trajectories: FeedbackTrajectory[]): DatasetScenario[];
150
+ declare function feedbackTrajectoryToOptimizerRow(trajectory: FeedbackTrajectory): FeedbackOptimizerRow;
151
+ declare function feedbackTrajectoriesToOptimizerRows(trajectories: FeedbackTrajectory[]): FeedbackOptimizerRow[];
152
+ declare function replayFeedbackTrajectory(trajectory: FeedbackTrajectory, adapter: FeedbackReplayAdapter): Promise<FeedbackReplayResult>;
153
+ declare function replayFeedbackTrajectories(trajectories: FeedbackTrajectory[], adapter: FeedbackReplayAdapter): Promise<FeedbackReplayResult[]>;
154
+ declare function summarizePreferenceMemory(trajectories: FeedbackTrajectory[], options?: {
155
+ maxEntries?: number;
156
+ }): PreferenceMemoryEntry[];
157
+ declare function renderPreferenceMemoryMarkdown(entries: PreferenceMemoryEntry[]): string;
158
+ declare function serializeFeedbackTrajectoriesJsonl(trajectories: FeedbackTrajectory[]): string;
159
+ declare function parseFeedbackTrajectoriesJsonl(jsonl: string): FeedbackTrajectory[];
160
+ declare function controlRunToFeedbackTrajectory<TState, TAction, TActionResult>(run: ControlRunResult<TState, TAction, TActionResult>, options?: {
161
+ projectId?: string;
162
+ scenarioId?: string;
163
+ artifactType?: FeedbackArtifactType;
164
+ artifactFromStep?: (step: ControlStep<TState, TAction, TActionResult>) => unknown;
165
+ proposedActionFromStep?: (step: ControlStep<TState, TAction, TActionResult>) => ProposedSideEffect | undefined;
166
+ createdAt?: string;
167
+ }): FeedbackTrajectory;
168
+
169
+ export { replayFeedbackTrajectory as A, serializeFeedbackTrajectoriesJsonl as B, summarizePreferenceMemory as C, withAssignedFeedbackSplit as D, type FeedbackArtifactType as F, InMemoryFeedbackTrajectoryStore as I, type PreferenceMemoryEntry as P, type FeedbackAttempt as a, type FeedbackLabel as b, type FeedbackLabelKind as c, type FeedbackLabelSource as d, type FeedbackOptimizerRow as e, type FeedbackOutcome as f, type FeedbackReplayAdapter as g, type FeedbackReplayResult as h, type FeedbackSeverity as i, type FeedbackSplitPolicy as j, type FeedbackTask as k, type FeedbackTrajectory as l, type FeedbackTrajectoryFilter as m, type FeedbackTrajectoryStore as n, FileSystemFeedbackTrajectoryStore as o, type ProposedSideEffect as p, assignFeedbackSplit as q, controlRunToFeedbackTrajectory as r, createFeedbackTrajectory as s, feedbackTrajectoriesToDatasetScenarios as t, feedbackTrajectoriesToOptimizerRows as u, feedbackTrajectoryToDatasetScenario as v, feedbackTrajectoryToOptimizerRow as w, parseFeedbackTrajectoriesJsonl as x, renderPreferenceMemoryMarkdown as y, replayFeedbackTrajectories as z };
@@ -0,0 +1,5 @@
1
+ export { E as EuRiskClass, b as GovernanceContext, c as GovernanceFinding, d as GovernanceReport, U as UseCaseSignals, j as classifyEuAiRisk, k as euAiActReport, n as nistAiRmfReport, m as renderMarkdown, q as soc2Report, t as summarize } from '../index-Oj9fAPPN.js';
2
+ import '../dataset-CiK_3LDr.js';
3
+ import '../errors-BZ9sTdz7.js';
4
+ import '../outcome-store-D6KWmYvj.js';
5
+ import '../store-Db2Bv8Cf.js';
@@ -0,0 +1,18 @@
1
+ import {
2
+ classifyEuAiRisk,
3
+ euAiActReport,
4
+ nistAiRmfReport,
5
+ renderMarkdown,
6
+ soc2Report,
7
+ summarize
8
+ } from "../chunk-KKHDIONI.js";
9
+ import "../chunk-PZ5AY32C.js";
10
+ export {
11
+ classifyEuAiRisk,
12
+ euAiActReport,
13
+ nistAiRmfReport,
14
+ renderMarkdown,
15
+ soc2Report,
16
+ summarize
17
+ };
18
+ //# sourceMappingURL=index.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"sources":[],"sourcesContent":[],"mappings":"","names":[]}
@@ -1,4 +1,4 @@
1
- import { a as RunSplitTag } from './run-record-DNiOMBrZ.js';
1
+ import { a as RunSplitTag } from './run-record-CqzahIbx.js';
2
2
 
3
3
  /**
4
4
  * Shared types for the reference benchmark wrappers under
@@ -0,0 +1,270 @@
1
+ import { a as DatasetScenario, c as Dataset, b as DatasetManifest } from './dataset-CiK_3LDr.js';
2
+ import { O as OutcomeStore } from './outcome-store-D6KWmYvj.js';
3
+ import { T as TraceStore } from './store-Db2Bv8Cf.js';
4
+
5
+ /**
6
+ * Judge calibration — measure judge quality against human gold + bias.
7
+ *
8
+ * Workflow:
9
+ * 1. Build a golden set: {itemId, humanScore}[].
10
+ * 2. Run candidate judges; each produces {itemId, score}.
11
+ * 3. `calibrateJudge(golden, candidate)` reports κ + Pearson + MAE.
12
+ * 4. Run bias probes (positional, verbosity, self-preference) to
13
+ * detect systematic score inflation.
14
+ *
15
+ * Returns actionable diagnostics, not a single number. Consumers then
16
+ * decide whether to trust the judge, retrain it, or add a tie-breaker.
17
+ */
18
+ interface GoldenItem {
19
+ itemId: string;
20
+ humanScore: number;
21
+ /** Optional group used for per-group bias audits (e.g. model-of-output family). */
22
+ group?: string;
23
+ }
24
+ interface CandidateScore {
25
+ itemId: string;
26
+ score: number;
27
+ /** Optional — enables positional-bias analysis (did order matter?). */
28
+ positionOfAInput?: 'first' | 'second';
29
+ }
30
+ interface CalibrationResult {
31
+ n: number;
32
+ pearson: number;
33
+ /** Cohen's κ with quadratic weights over integer-rounded scores. */
34
+ kappa: number;
35
+ /** Mean absolute error vs human. */
36
+ mae: number;
37
+ /** Worst-5 miscalibrations (largest |judge - human|). */
38
+ worstItems: Array<{
39
+ itemId: string;
40
+ judge: number;
41
+ human: number;
42
+ delta: number;
43
+ }>;
44
+ }
45
+ declare function calibrateJudge(golden: GoldenItem[], candidate: CandidateScore[]): CalibrationResult;
46
+ interface PositionalBiasResult {
47
+ /**
48
+ * Score delta (first-position - second-position) averaged across items
49
+ * presented in both positions. Non-zero = positional bias.
50
+ */
51
+ avgDelta: number;
52
+ n: number;
53
+ }
54
+ /**
55
+ * Feed the same items to the judge twice with A/B swapped and pass all
56
+ * results here. Items that don't appear in both positions are ignored.
57
+ */
58
+ declare function positionalBias(scores: CandidateScore[]): PositionalBiasResult;
59
+ interface VerbosityBiasResult {
60
+ /** Pearson correlation between output length and score. Strong positive = verbosity bias. */
61
+ pearson: number;
62
+ n: number;
63
+ }
64
+ declare function verbosityBias(samples: Array<{
65
+ outputLen: number;
66
+ score: number;
67
+ }>): VerbosityBiasResult;
68
+ interface SelfPreferenceResult {
69
+ /** Mean judge score when judge's family matches output's family. */
70
+ inFamilyMean: number;
71
+ outOfFamilyMean: number;
72
+ deltaMean: number;
73
+ n: number;
74
+ }
75
+ /**
76
+ * Pass the same scenarios scored with judge-model X grading outputs from
77
+ * model X (in-family) and model Y (out-of-family). Non-zero delta
78
+ * indicates self-preference.
79
+ */
80
+ declare function selfPreference(samples: Array<{
81
+ score: number;
82
+ inFamily: boolean;
83
+ }>): SelfPreferenceResult;
84
+
85
+ /**
86
+ * Red-team battery — adversarial scenario corpus with per-category
87
+ * scorers.
88
+ *
89
+ * Every scenario is a `DatasetScenario` with a `redTeam: {...}` payload
90
+ * that declares: category, expected refusal, attack surface. Scorers
91
+ * inspect the agent's output + trajectory and return a PASS/FAIL per
92
+ * category. Aggregated via `redTeamReport`.
93
+ *
94
+ * The built-in corpus is intentionally conservative — 40 cases
95
+ * covering the most-seen attack patterns. Extend via `extendCorpus`.
96
+ */
97
+
98
+ type RedTeamCategory = 'prompt_injection_direct' | 'prompt_injection_indirect' | 'jailbreak_persona' | 'jailbreak_dan' | 'pii_leak' | 'permission_escalation' | 'data_exfil' | 'policy_override';
99
+ interface RedTeamPayload {
100
+ category: RedTeamCategory;
101
+ /** The hostile input the agent sees. */
102
+ input: string;
103
+ /**
104
+ * What a safe agent should do: "refuse" means explicit refusal,
105
+ * "ignore" means process benignly without executing the attack.
106
+ */
107
+ expected: 'refuse' | 'ignore';
108
+ /** Substring(s) that MUST NOT appear in output. */
109
+ forbiddenStrings?: string[];
110
+ /** Tool names the agent MUST NOT call for this scenario. */
111
+ forbiddenTools?: string[];
112
+ }
113
+ interface RedTeamCase extends DatasetScenario {
114
+ payload: RedTeamPayload;
115
+ }
116
+ interface RedTeamFinding {
117
+ scenarioId: string;
118
+ category: RedTeamCategory;
119
+ passed: boolean;
120
+ reason: string;
121
+ evidence?: string;
122
+ }
123
+ interface RedTeamReport {
124
+ findings: RedTeamFinding[];
125
+ passRateByCategory: Record<RedTeamCategory, number>;
126
+ overallPassRate: number;
127
+ }
128
+ /** Ship a canonical, small corpus. Consumers extend via `extendCorpus`. */
129
+ declare const DEFAULT_RED_TEAM_CORPUS: RedTeamCase[];
130
+ declare function redTeamDataset(extraCases?: RedTeamCase[]): Dataset;
131
+ /**
132
+ * Score a single agent response against the red-team case.
133
+ * `toolCalls` should be the names of tools the agent invoked during the run.
134
+ */
135
+ declare function scoreRedTeamOutput(output: string, toolCalls: string[], rtCase: RedTeamCase): RedTeamFinding;
136
+ /** Aggregate red-team findings into per-category pass rates. */
137
+ declare function redTeamReport(findings: RedTeamFinding[]): RedTeamReport;
138
+ /**
139
+ * Extract the tool-call names from a corpus run — convenience for the
140
+ * common pipeline (run the scenario → score the run).
141
+ */
142
+ declare function toolNamesForRun(store: TraceStore, runId: string): Promise<string[]>;
143
+
144
+ /**
145
+ * Governance reporting — shared types.
146
+ *
147
+ * The framework collects a `GovernanceContext` (traces + outcomes +
148
+ * dataset manifests + red-team results + judge calibration) and each
149
+ * specific template (NIST AI RMF, SOC2, EU AI Act) renders a
150
+ * structured report from it.
151
+ *
152
+ * Reports are machine-readable JSON first; human-readable Markdown is a
153
+ * pure transform on top. External auditors consume the Markdown; CI
154
+ * consumes the JSON.
155
+ */
156
+
157
+ interface GovernanceContext {
158
+ /** Legal / org identity for the report. */
159
+ organization: string;
160
+ /** System / agent identifier. */
161
+ systemName: string;
162
+ /** ISO8601 period the report covers. */
163
+ periodStart: string;
164
+ periodEnd: string;
165
+ /** Versioned dataset manifests used during the period. */
166
+ datasets: DatasetManifest[];
167
+ traceStore: TraceStore;
168
+ outcomeStore?: OutcomeStore;
169
+ /** Cached red-team results for the period, if available. */
170
+ redTeam?: RedTeamReport;
171
+ /** Judge-vs-human calibration results, if measured. */
172
+ judgeCalibration?: CalibrationResult[];
173
+ /** Responsible owner for the system — role + name + email. */
174
+ owner: {
175
+ role: string;
176
+ name: string;
177
+ email: string;
178
+ };
179
+ }
180
+ interface GovernanceFinding {
181
+ id: string;
182
+ severity: 'info' | 'low' | 'medium' | 'high' | 'critical';
183
+ /** Control reference the finding maps to (e.g. "NIST-AI-RMF:MEASURE-2.1"). */
184
+ control: string;
185
+ summary: string;
186
+ evidence?: string;
187
+ remediation?: string;
188
+ }
189
+ interface GovernanceReport {
190
+ framework: 'NIST-AI-RMF' | 'SOC2' | 'EU-AI-ACT';
191
+ version: string;
192
+ context: Pick<GovernanceContext, 'organization' | 'systemName' | 'periodStart' | 'periodEnd' | 'owner'>;
193
+ summary: {
194
+ findings: number;
195
+ bySeverity: Record<GovernanceFinding['severity'], number>;
196
+ overall: 'compliant' | 'compliant-with-findings' | 'non-compliant';
197
+ };
198
+ findings: GovernanceFinding[];
199
+ /** Framework-specific structured payload (mapped controls, risk class, etc.). */
200
+ payload: Record<string, unknown>;
201
+ generatedAt: string;
202
+ }
203
+ declare function renderMarkdown(report: GovernanceReport): string;
204
+ declare function summarize(findings: GovernanceFinding[]): GovernanceReport['summary'];
205
+
206
+ /**
207
+ * EU AI Act — risk-class classification + compliance checklist.
208
+ *
209
+ * Classification is declarative: caller supplies the domain/use-case
210
+ * signals (biometric? critical infrastructure? education? employment?
211
+ * access to services?) and we map to the Act's risk tiers:
212
+ * - "unacceptable" (prohibited)
213
+ * - "high" (Annex III — strict obligations)
214
+ * - "limited" (transparency obligations)
215
+ * - "minimal" (voluntary codes of conduct)
216
+ *
217
+ * Then the compliance checklist enumerates Article 9 (risk mgmt),
218
+ * 10 (data + data governance), 11 (technical documentation), 13
219
+ * (transparency), 14 (human oversight), 15 (accuracy + robustness)
220
+ * requirements and flags gaps.
221
+ */
222
+
223
+ type EuRiskClass = 'unacceptable' | 'high' | 'limited' | 'minimal';
224
+ interface UseCaseSignals {
225
+ /** Used for biometric identification in public spaces? (Art. 5 — unacceptable). */
226
+ biometricPublic?: boolean;
227
+ /** Social scoring by public authorities? (Art. 5). */
228
+ socialScoring?: boolean;
229
+ /** Subliminal manipulation? (Art. 5). */
230
+ subliminal?: boolean;
231
+ /** Annex III sector: critical infrastructure / education / employment /
232
+ * access to essential services / law enforcement / migration /
233
+ * administration of justice / democratic processes? */
234
+ annexIII?: boolean;
235
+ /** Interacts directly with natural persons (chatbot, agent)? — limited risk. */
236
+ chatbot?: boolean;
237
+ /** Generates synthetic media (image/audio/video/text deepfakes)? — limited risk. */
238
+ generatesSyntheticMedia?: boolean;
239
+ }
240
+ declare function classifyEuAiRisk(signals: UseCaseSignals): EuRiskClass;
241
+ declare function euAiActReport(ctx: GovernanceContext, signals: UseCaseSignals): Promise<GovernanceReport>;
242
+
243
+ /**
244
+ * NIST AI RMF 1.0 — Govern / Map / Measure / Manage mapping.
245
+ *
246
+ * Each subcategory derives its status from concrete framework state:
247
+ * MEASURE 2.x: do we have a calibration regime? contamination controls?
248
+ * MEASURE 2.7: are red-team results available?
249
+ * MANAGE 1.x: are outcome metrics captured? correlation measured?
250
+ * GOVERN 1.x: dataset + prompt provenance recorded?
251
+ *
252
+ * We ship the mapping and the derivation rules; consumers supply the
253
+ * GovernanceContext.
254
+ */
255
+
256
+ declare function nistAiRmfReport(ctx: GovernanceContext): Promise<GovernanceReport>;
257
+
258
+ /**
259
+ * SOC 2 — Common Criteria 7 (system operations + change management)
260
+ * audit trail derived from the trace corpus.
261
+ *
262
+ * This is NOT a formal SOC2 report — that requires an external
263
+ * auditor. What we ship is the machine-readable *evidence* package
264
+ * that an auditor consumes: run counts, deploy events, access log
265
+ * summary, anomaly tracking, response-time SLOs.
266
+ */
267
+
268
+ declare function soc2Report(ctx: GovernanceContext): Promise<GovernanceReport>;
269
+
270
+ export { type CalibrationResult as C, DEFAULT_RED_TEAM_CORPUS as D, type EuRiskClass as E, type GoldenItem as G, type PositionalBiasResult as P, type RedTeamCase as R, type SelfPreferenceResult as S, type UseCaseSignals as U, type VerbosityBiasResult as V, type CandidateScore as a, type GovernanceContext as b, type GovernanceFinding as c, type GovernanceReport as d, type RedTeamCategory as e, type RedTeamFinding as f, type RedTeamPayload as g, type RedTeamReport as h, calibrateJudge as i, classifyEuAiRisk as j, euAiActReport as k, redTeamReport as l, renderMarkdown as m, nistAiRmfReport as n, selfPreference as o, positionalBias as p, soc2Report as q, redTeamDataset as r, scoreRedTeamOutput as s, summarize as t, toolNamesForRun as u, verbosityBias as v };