@tangle-network/agent-eval 0.38.0 → 0.40.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (61) hide show
  1. package/dist/campaign/index.d.ts +695 -0
  2. package/dist/campaign/index.js +741 -0
  3. package/dist/campaign/index.js.map +1 -0
  4. package/dist/chunk-5U2DOJU4.js +565 -0
  5. package/dist/chunk-5U2DOJU4.js.map +1 -0
  6. package/dist/{chunk-KE7TDJUO.js → chunk-AU2JLNSZ.js} +2 -2
  7. package/dist/{chunk-TSPOEDM3.js → chunk-BWZEGTES.js} +2 -5
  8. package/dist/chunk-BWZEGTES.js.map +1 -0
  9. package/dist/{chunk-3HYQXPC2.js → chunk-DMW5VENN.js} +3 -3
  10. package/dist/{chunk-TQL7BAOY.js → chunk-EGIPWXHL.js} +2 -2
  11. package/dist/chunk-GGE4NNQT.js +65 -0
  12. package/dist/chunk-GGE4NNQT.js.map +1 -0
  13. package/dist/{chunk-7PR3WPWE.js → chunk-L7XMNXLO.js} +2 -2
  14. package/dist/{chunk-RL6TERL2.js → chunk-LCIDRYGP.js} +3 -3
  15. package/dist/{chunk-L5UNCDAJ.js → chunk-MAOZCN36.js} +2 -64
  16. package/dist/chunk-MAOZCN36.js.map +1 -0
  17. package/dist/{chunk-LGAPK7NA.js → chunk-NKLGKF2Q.js} +2 -2
  18. package/dist/chunk-TMXPFWC7.js +305 -0
  19. package/dist/chunk-TMXPFWC7.js.map +1 -0
  20. package/dist/{chunk-KHZRNY3F.js → chunk-WP7SY7AI.js} +5 -4
  21. package/dist/chunk-WP7SY7AI.js.map +1 -0
  22. package/dist/chunk-YV7J7X5N.js +313 -0
  23. package/dist/chunk-YV7J7X5N.js.map +1 -0
  24. package/dist/{control-DVrmvM_k.d.ts → control-CmLJk3IG.d.ts} +1 -1
  25. package/dist/control.d.ts +3 -3
  26. package/dist/control.js +2 -2
  27. package/dist/{dataset-ueRVTUoY.d.ts → dataset-BlwAtYYf.d.ts} +1 -1
  28. package/dist/{feedback-trajectory-iATEAHmc.d.ts → feedback-trajectory-Dvy-bt7x.d.ts} +1 -1
  29. package/dist/governance/index.d.ts +133 -5
  30. package/dist/index.d.ts +35 -34
  31. package/dist/index.js +97 -630
  32. package/dist/index.js.map +1 -1
  33. package/dist/multishot/index.d.ts +21 -21
  34. package/dist/multishot/index.js +64 -15
  35. package/dist/multishot/index.js.map +1 -1
  36. package/dist/openapi.json +1 -1
  37. package/dist/optimization.d.ts +2 -2
  38. package/dist/optimization.js +5 -5
  39. package/dist/pipelines/index.js +2 -2
  40. package/dist/red-team-30II1T4o.d.ts +63 -0
  41. package/dist/{release-report-D2ykiLSe.d.ts → release-report-Di84bXD7.d.ts} +5 -2
  42. package/dist/reporting.d.ts +2 -2
  43. package/dist/reporting.js +3 -3
  44. package/dist/rl.js +15 -315
  45. package/dist/rl.js.map +1 -1
  46. package/dist/run-campaign-JYJXYHHL.js +10 -0
  47. package/dist/run-campaign-JYJXYHHL.js.map +1 -0
  48. package/dist/traces.js +7 -5
  49. package/dist/wire/index.d.ts +2 -2
  50. package/docs/design/loop-taxonomy.md +233 -0
  51. package/package.json +33 -24
  52. package/dist/chunk-KHZRNY3F.js.map +0 -1
  53. package/dist/chunk-L5UNCDAJ.js.map +0 -1
  54. package/dist/chunk-TSPOEDM3.js.map +0 -1
  55. package/dist/index-CN2agEaO.d.ts +0 -191
  56. /package/dist/{chunk-KE7TDJUO.js.map → chunk-AU2JLNSZ.js.map} +0 -0
  57. /package/dist/{chunk-3HYQXPC2.js.map → chunk-DMW5VENN.js.map} +0 -0
  58. /package/dist/{chunk-TQL7BAOY.js.map → chunk-EGIPWXHL.js.map} +0 -0
  59. /package/dist/{chunk-7PR3WPWE.js.map → chunk-L7XMNXLO.js.map} +0 -0
  60. /package/dist/{chunk-RL6TERL2.js.map → chunk-LCIDRYGP.js.map} +0 -0
  61. /package/dist/{chunk-LGAPK7NA.js.map → chunk-NKLGKF2Q.js.map} +0 -0
@@ -1,191 +0,0 @@
1
- import { a as DatasetScenario, c as Dataset, b as DatasetManifest } from './dataset-ueRVTUoY.js';
2
- import { b as CalibrationResult } from './judge-calibration-DilmB3Ml.js';
3
- import { O as OutcomeStore } from './outcome-store-D6KWmYvj.js';
4
- import { T as TraceStore } from './store-Db2Bv8Cf.js';
5
-
6
- /**
7
- * Red-team battery — adversarial scenario corpus with per-category
8
- * scorers.
9
- *
10
- * Every scenario is a `DatasetScenario` with a `redTeam: {...}` payload
11
- * that declares: category, expected refusal, attack surface. Scorers
12
- * inspect the agent's output + trajectory and return a PASS/FAIL per
13
- * category. Aggregated via `redTeamReport`.
14
- *
15
- * The built-in corpus is intentionally conservative — 40 cases
16
- * covering the most-seen attack patterns. Extend via `extendCorpus`.
17
- */
18
-
19
- type RedTeamCategory = 'prompt_injection_direct' | 'prompt_injection_indirect' | 'jailbreak_persona' | 'jailbreak_dan' | 'pii_leak' | 'permission_escalation' | 'data_exfil' | 'policy_override';
20
- interface RedTeamPayload {
21
- category: RedTeamCategory;
22
- /** The hostile input the agent sees. */
23
- input: string;
24
- /**
25
- * What a safe agent should do: "refuse" means explicit refusal,
26
- * "ignore" means process benignly without executing the attack.
27
- */
28
- expected: 'refuse' | 'ignore';
29
- /** Substring(s) that MUST NOT appear in output. */
30
- forbiddenStrings?: string[];
31
- /** Tool names the agent MUST NOT call for this scenario. */
32
- forbiddenTools?: string[];
33
- }
34
- interface RedTeamCase extends DatasetScenario {
35
- payload: RedTeamPayload;
36
- }
37
- interface RedTeamFinding {
38
- scenarioId: string;
39
- category: RedTeamCategory;
40
- passed: boolean;
41
- reason: string;
42
- evidence?: string;
43
- }
44
- interface RedTeamReport {
45
- findings: RedTeamFinding[];
46
- passRateByCategory: Record<RedTeamCategory, number>;
47
- overallPassRate: number;
48
- }
49
- /** Ship a canonical, small corpus. Consumers extend via `extendCorpus`. */
50
- declare const DEFAULT_RED_TEAM_CORPUS: RedTeamCase[];
51
- declare function redTeamDataset(extraCases?: RedTeamCase[]): Dataset;
52
- /**
53
- * Score a single agent response against the red-team case.
54
- * `toolCalls` should be the names of tools the agent invoked during the run.
55
- */
56
- declare function scoreRedTeamOutput(output: string, toolCalls: string[], rtCase: RedTeamCase): RedTeamFinding;
57
- /** Aggregate red-team findings into per-category pass rates. */
58
- declare function redTeamReport(findings: RedTeamFinding[]): RedTeamReport;
59
- /**
60
- * Extract the tool-call names from a corpus run — convenience for the
61
- * common pipeline (run the scenario → score the run).
62
- */
63
- declare function toolNamesForRun(store: TraceStore, runId: string): Promise<string[]>;
64
-
65
- /**
66
- * Governance reporting — shared types.
67
- *
68
- * The framework collects a `GovernanceContext` (traces + outcomes +
69
- * dataset manifests + red-team results + judge calibration) and each
70
- * specific template (NIST AI RMF, SOC2, EU AI Act) renders a
71
- * structured report from it.
72
- *
73
- * Reports are machine-readable JSON first; human-readable Markdown is a
74
- * pure transform on top. External auditors consume the Markdown; CI
75
- * consumes the JSON.
76
- */
77
-
78
- interface GovernanceContext {
79
- /** Legal / org identity for the report. */
80
- organization: string;
81
- /** System / agent identifier. */
82
- systemName: string;
83
- /** ISO8601 period the report covers. */
84
- periodStart: string;
85
- periodEnd: string;
86
- /** Versioned dataset manifests used during the period. */
87
- datasets: DatasetManifest[];
88
- traceStore: TraceStore;
89
- outcomeStore?: OutcomeStore;
90
- /** Cached red-team results for the period, if available. */
91
- redTeam?: RedTeamReport;
92
- /** Judge-vs-human calibration results, if measured. */
93
- judgeCalibration?: CalibrationResult[];
94
- /** Responsible owner for the system — role + name + email. */
95
- owner: {
96
- role: string;
97
- name: string;
98
- email: string;
99
- };
100
- }
101
- interface GovernanceFinding {
102
- id: string;
103
- severity: 'info' | 'low' | 'medium' | 'high' | 'critical';
104
- /** Control reference the finding maps to (e.g. "NIST-AI-RMF:MEASURE-2.1"). */
105
- control: string;
106
- summary: string;
107
- evidence?: string;
108
- remediation?: string;
109
- }
110
- interface GovernanceReport {
111
- framework: 'NIST-AI-RMF' | 'SOC2' | 'EU-AI-ACT';
112
- version: string;
113
- context: Pick<GovernanceContext, 'organization' | 'systemName' | 'periodStart' | 'periodEnd' | 'owner'>;
114
- summary: {
115
- findings: number;
116
- bySeverity: Record<GovernanceFinding['severity'], number>;
117
- overall: 'compliant' | 'compliant-with-findings' | 'non-compliant';
118
- };
119
- findings: GovernanceFinding[];
120
- /** Framework-specific structured payload (mapped controls, risk class, etc.). */
121
- payload: Record<string, unknown>;
122
- generatedAt: string;
123
- }
124
- declare function renderMarkdown(report: GovernanceReport): string;
125
- declare function summarize(findings: GovernanceFinding[]): GovernanceReport['summary'];
126
-
127
- /**
128
- * EU AI Act — risk-class classification + compliance checklist.
129
- *
130
- * Classification is declarative: caller supplies the domain/use-case
131
- * signals (biometric? critical infrastructure? education? employment?
132
- * access to services?) and we map to the Act's risk tiers:
133
- * - "unacceptable" (prohibited)
134
- * - "high" (Annex III — strict obligations)
135
- * - "limited" (transparency obligations)
136
- * - "minimal" (voluntary codes of conduct)
137
- *
138
- * Then the compliance checklist enumerates Article 9 (risk mgmt),
139
- * 10 (data + data governance), 11 (technical documentation), 13
140
- * (transparency), 14 (human oversight), 15 (accuracy + robustness)
141
- * requirements and flags gaps.
142
- */
143
-
144
- type EuRiskClass = 'unacceptable' | 'high' | 'limited' | 'minimal';
145
- interface UseCaseSignals {
146
- /** Used for biometric identification in public spaces? (Art. 5 — unacceptable). */
147
- biometricPublic?: boolean;
148
- /** Social scoring by public authorities? (Art. 5). */
149
- socialScoring?: boolean;
150
- /** Subliminal manipulation? (Art. 5). */
151
- subliminal?: boolean;
152
- /** Annex III sector: critical infrastructure / education / employment /
153
- * access to essential services / law enforcement / migration /
154
- * administration of justice / democratic processes? */
155
- annexIII?: boolean;
156
- /** Interacts directly with natural persons (chatbot, agent)? — limited risk. */
157
- chatbot?: boolean;
158
- /** Generates synthetic media (image/audio/video/text deepfakes)? — limited risk. */
159
- generatesSyntheticMedia?: boolean;
160
- }
161
- declare function classifyEuAiRisk(signals: UseCaseSignals): EuRiskClass;
162
- declare function euAiActReport(ctx: GovernanceContext, signals: UseCaseSignals): Promise<GovernanceReport>;
163
-
164
- /**
165
- * NIST AI RMF 1.0 — Govern / Map / Measure / Manage mapping.
166
- *
167
- * Each subcategory derives its status from concrete framework state:
168
- * MEASURE 2.x: do we have a calibration regime? contamination controls?
169
- * MEASURE 2.7: are red-team results available?
170
- * MANAGE 1.x: are outcome metrics captured? correlation measured?
171
- * GOVERN 1.x: dataset + prompt provenance recorded?
172
- *
173
- * We ship the mapping and the derivation rules; consumers supply the
174
- * GovernanceContext.
175
- */
176
-
177
- declare function nistAiRmfReport(ctx: GovernanceContext): Promise<GovernanceReport>;
178
-
179
- /**
180
- * SOC 2 — Common Criteria 7 (system operations + change management)
181
- * audit trail derived from the trace corpus.
182
- *
183
- * This is NOT a formal SOC2 report — that requires an external
184
- * auditor. What we ship is the machine-readable *evidence* package
185
- * that an auditor consumes: run counts, deploy events, access log
186
- * summary, anomaly tracking, response-time SLOs.
187
- */
188
-
189
- declare function soc2Report(ctx: GovernanceContext): Promise<GovernanceReport>;
190
-
191
- export { DEFAULT_RED_TEAM_CORPUS as D, type EuRiskClass as E, type GovernanceContext as G, type RedTeamCase as R, type UseCaseSignals as U, type GovernanceFinding as a, type GovernanceReport as b, type RedTeamCategory as c, type RedTeamFinding as d, type RedTeamPayload as e, type RedTeamReport as f, classifyEuAiRisk as g, euAiActReport as h, redTeamReport as i, renderMarkdown as j, soc2Report as k, summarize as l, nistAiRmfReport as n, redTeamDataset as r, scoreRedTeamOutput as s, toolNamesForRun as t };