ai-shield-core 0.1.0 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/audit/logger.d.ts.map +1 -1
- package/dist/audit/logger.js +13 -14
- package/dist/audit/types.js +1 -2
- package/dist/cache/lru.js +1 -5
- package/dist/canary/memory.d.ts +75 -0
- package/dist/canary/memory.d.ts.map +1 -0
- package/dist/canary/memory.js +194 -0
- package/dist/context/wrap-context.d.ts +169 -0
- package/dist/context/wrap-context.d.ts.map +1 -0
- package/dist/context/wrap-context.js +278 -0
- package/dist/cost/anomaly.js +1 -4
- package/dist/cost/pricing.d.ts.map +1 -1
- package/dist/cost/pricing.js +26 -19
- package/dist/cost/tracker.d.ts +19 -1
- package/dist/cost/tracker.d.ts.map +1 -1
- package/dist/cost/tracker.js +27 -10
- package/dist/index.d.ts +34 -3
- package/dist/index.d.ts.map +1 -1
- package/dist/index.js +55 -37
- package/dist/judge/async-judge.d.ts +85 -0
- package/dist/judge/async-judge.d.ts.map +1 -0
- package/dist/judge/async-judge.js +146 -0
- package/dist/policy/circuit-breaker.d.ts +70 -0
- package/dist/policy/circuit-breaker.d.ts.map +1 -0
- package/dist/policy/circuit-breaker.js +376 -0
- package/dist/policy/engine.js +1 -5
- package/dist/policy/tools.js +4 -8
- package/dist/scanner/canary.js +4 -8
- package/dist/scanner/chain.js +1 -5
- package/dist/scanner/heuristic.d.ts +27 -0
- package/dist/scanner/heuristic.d.ts.map +1 -1
- package/dist/scanner/heuristic.js +118 -7
- package/dist/scanner/ingestion.d.ts +147 -0
- package/dist/scanner/ingestion.d.ts.map +1 -0
- package/dist/scanner/ingestion.js +520 -0
- package/dist/scanner/output.d.ts +73 -0
- package/dist/scanner/output.d.ts.map +1 -0
- package/dist/scanner/output.js +297 -0
- package/dist/scanner/pii.d.ts.map +1 -1
- package/dist/scanner/pii.js +24 -12
- package/dist/shield.d.ts.map +1 -1
- package/dist/shield.js +34 -26
- package/dist/types.d.ts +156 -2
- package/dist/types.d.ts.map +1 -1
- package/dist/types.js +1 -2
- package/package.json +4 -3
- package/src/audit/logger.ts +6 -1
- package/src/canary/memory.ts +259 -0
- package/src/context/wrap-context.ts +475 -0
- package/src/cost/pricing.ts +21 -9
- package/src/cost/tracker.ts +35 -1
- package/src/index.ts +113 -2
- package/src/judge/async-judge.ts +254 -0
- package/src/policy/circuit-breaker.ts +449 -0
- package/src/scanner/heuristic.ts +125 -2
- package/src/scanner/ingestion.ts +624 -0
- package/src/scanner/output.ts +386 -0
- package/src/scanner/pii.ts +21 -7
- package/src/shield.ts +15 -2
- package/src/types.ts +194 -2
- package/tsconfig.json +2 -1
- package/dist/audit/logger.js.map +0 -1
- package/dist/audit/types.js.map +0 -1
- package/dist/cache/lru.js.map +0 -1
- package/dist/cost/anomaly.js.map +0 -1
- package/dist/cost/pricing.js.map +0 -1
- package/dist/cost/tracker.js.map +0 -1
- package/dist/index.js.map +0 -1
- package/dist/policy/engine.js.map +0 -1
- package/dist/policy/tools.js.map +0 -1
- package/dist/scanner/canary.js.map +0 -1
- package/dist/scanner/chain.js.map +0 -1
- package/dist/scanner/heuristic.js.map +0 -1
- package/dist/scanner/pii.js.map +0 -1
- package/dist/shield.js.map +0 -1
- package/dist/types.js.map +0 -1
|
@@ -0,0 +1,475 @@
|
|
|
1
|
+
import type {
|
|
2
|
+
ContextSegment,
|
|
3
|
+
IngestionSource,
|
|
4
|
+
TrustTier,
|
|
5
|
+
WrappedContext,
|
|
6
|
+
ScanContext,
|
|
7
|
+
ScanDecision,
|
|
8
|
+
Violation,
|
|
9
|
+
} from "../types.js";
|
|
10
|
+
import { createHash } from "node:crypto";
|
|
11
|
+
import { IngestionScanner } from "../scanner/ingestion.js";
|
|
12
|
+
|
|
13
|
+
// ============================================================
|
|
14
|
+
// wrapContext — Trust-Tier Context Streams
|
|
15
|
+
//
|
|
16
|
+
// The deepest finding of the 2026 prompt-injection literature
|
|
17
|
+
// (Parallax, IPI surveys, OWASP LLM01:2025) is that the LLM cannot
|
|
18
|
+
// reliably distinguish *instruction* from *data* once both share the
|
|
19
|
+
// same attention substrate. The only architecturally robust mitigation
|
|
20
|
+
// is privilege separation: tag every segment with its provenance + trust
|
|
21
|
+
// tier, scan untrusted segments aggressively, and let downstream code
|
|
22
|
+
// decide whether instruction-shaped content from a `web`/`rag`/`tool-desc`
|
|
23
|
+
// segment is allowed to influence behaviour.
|
|
24
|
+
//
|
|
25
|
+
// `wrapContext()` is the ergonomic entry point.
|
|
26
|
+
// ============================================================
|
|
27
|
+
|
|
28
|
+
/**
 * Input shape for `wrapContext()`. Each named field is conventional;
 * pass only what applies. Items in the array-valued groups may be given
 * either as a bare string or as `{ content, label? }`.
 */
export interface WrapContextInput {
  /** Developer-controlled prompt. Always `trust: "system"`. */
  system?: string;
  /** Direct user message(s). `trust: "untrusted"`, `source: "user"`. */
  user?: string | string[];
  /** Retrieved documents. `trust: "untrusted"`, `source: "rag"`. */
  retrieved?: Array<{ content: string; label?: string } | string>;
  /**
   * MCP / function tool descriptions about to be exposed to the model.
   * `trust: "untrusted"`, `source: "tool-desc"`.
   */
  tools?: Array<{ content: string; label?: string } | string>;
  /** Stored memory facts. `trust: "untrusted"`, `source: "memory"`. */
  memory?: Array<{ content: string; label?: string } | string>;
  /** Scraped / fetched web content. `trust: "untrusted"`, `source: "web"`. */
  web?: Array<{ content: string; label?: string } | string>;
  /**
   * Output from another agent (multi-agent pipelines).
   * `trust: "untrusted"`, `source: "agent-output"`.
   */
  agentOutput?: Array<{ content: string; label?: string } | string>;
  /**
   * Promote specific named segments to `"trusted"` (e.g. an internal
   * knowledge base whose contents you control end-to-end).
   *
   * Matching is case-insensitive and ANCHORED, not a substring search: a
   * segment is promoted when its `label` equals an entry, or starts with
   * an entry followed by `/` (e.g. entry `internal-kb` matches
   * `internal-kb` and `internal-kb/faq.md`, but not
   * `untrusted-internal-kb`). Items passed as bare strings have no label
   * and are therefore never promoted.
   */
  trustedLabels?: string[];
}
|
|
54
|
+
|
|
55
|
+
/**
 * Build a `WrappedContext` from typed inputs.
 *
 * Trust assignment:
 *  - `system` -> system
 *  - `user` -> untrusted (a user is not trusted in this threat model — they
 *    can also inject; the `untrusted` label means "scan aggressively")
 *  - `retrieved`/`tools`/`memory`/`web`/`agentOutput` -> untrusted, unless
 *    the item's `label` matches one of `trustedLabels` -> trusted
 *
 * Label matching is case-insensitive and anchored: a label matches when it
 * equals a trusted entry, or starts with that entry followed by `/`.
 * Empty entries in `trustedLabels` are ignored, and an entry written with a
 * trailing slash (`"kb/"`) is treated as the prefix `kb/`.
 *
 * Items whose content is not a non-empty string are skipped.
 *
 * Trust does NOT mean "skip scanning". It only governs how
 * `assemblePrompt()` and the per-segment policy decide whether to
 * include the segment in the final assembled prompt.
 *
 * @param input Typed context pieces; see `WrapContextInput`.
 * @returns A `WrappedContext` whose `segments` are in the order
 *   system, user, retrieved, tools, memory, web, agentOutput.
 */
export function wrapContext(input: WrapContextInput): WrappedContext {
  const segments: ContextSegment[] = [];

  // Pre-compute, per trusted entry, the exact form and the path prefix.
  //  - Empty entries are dropped: an empty entry would otherwise turn the
  //    prefix test into `startsWith("/")` and promote ANY label that begins
  //    with a slash (e.g. an attacker-chosen file path).
  //  - Trailing slashes are stripped before appending "/" so that an entry
  //    written as "kb/" yields the prefix "kb/" rather than "kb//".
  const trustedLabels = (input.trustedLabels ?? [])
    .map((entry) => entry.toLowerCase())
    .filter((entry) => entry.length > 0)
    .map((entry) => {
      const base = entry.replace(/\/+$/, "");
      return {
        exact: entry,
        prefix: base.length > 0 ? `${base}/` : undefined,
      };
    });

  // Critic H1 — substring match would let an attacker-supplied label
  // like "untrusted-doc-INTERNAL-kb-poisoned" claim trust because it
  // CONTAINS the trusted prefix. Match exact or path-anchored only.
  const isTrustedLabel = (label?: string): boolean => {
    if (!label) return false;
    const lc = label.toLowerCase();
    return trustedLabels.some(
      ({ exact, prefix }) =>
        lc === exact || (prefix !== undefined && lc.startsWith(prefix)),
    );
  };

  const push = (
    content: string,
    source: IngestionSource,
    trust: TrustTier,
    label?: string,
  ): void => {
    // Runtime guard: callers from plain JS may hand us non-strings.
    if (typeof content !== "string" || content.length === 0) return;
    segments.push({
      source,
      trust,
      content,
      label,
      contentHash: hashContent(content),
    });
  };

  // System: always trust=system. The `source` field is unused for
  // system segments because `trust === "system"` is the authoritative
  // signal — Analyst A2 round 1 review. We keep `source: "user"` here
  // only because `ContextSegment.source` is non-optional; any code that
  // branches on `seg.source` MUST first check `seg.trust !== "system"`.
  if (input.system) {
    push(input.system, "user", "system", "system-prompt");
  }

  // User messages.
  if (input.user) {
    const userInputs = Array.isArray(input.user) ? input.user : [input.user];
    for (const u of userInputs) {
      push(u, "user", "untrusted", "user");
    }
  }

  // Helper for the array-of-{content,label} groups.
  const pushGroup = (
    items: Array<{ content: string; label?: string } | string> | undefined,
    source: IngestionSource,
  ): void => {
    if (!items) return;
    for (const item of items) {
      // Skip null/undefined entries instead of throwing on `item.content`.
      if (item == null) continue;
      const content = typeof item === "string" ? item : item.content;
      const label = typeof item === "string" ? undefined : item.label;
      const trust: TrustTier = isTrustedLabel(label) ? "trusted" : "untrusted";
      push(content, source, trust, label);
    }
  };

  pushGroup(input.retrieved, "rag");
  pushGroup(input.tools, "tool-desc");
  pushGroup(input.memory, "memory");
  pushGroup(input.web, "web");
  pushGroup(input.agentOutput, "agent-output");

  return { segments };
}
|
|
137
|
+
|
|
138
|
+
/**
 * Run the ingestion scanner over each segment of a wrapped context.
 *
 * Every non-system segment is scanned (one at a time, in order) with its
 * own `source` and `trust` passed as the scan context. System segments
 * are recorded as `allow` with no violations without being scanned.
 *
 * The per-segment results and the most severe decision are written onto
 * `ctx` (`scanResults`, `decision`); the same object is returned so calls
 * can be chained.
 *
 * @param ctx Context to scan; mutated in place.
 * @param options `strictness` forwarded to the scanner (default `"high"`).
 * @returns The same `ctx` instance.
 */
export async function scanWrappedContext(
  ctx: WrappedContext,
  options: { strictness?: "low" | "medium" | "high" } = {},
): Promise<WrappedContext> {
  const ingestionScanner = new IngestionScanner({
    strictness: options.strictness ?? "high",
  });
  const perSegment: NonNullable<WrappedContext["scanResults"]> = [];
  let aggregate: ScanDecision = "allow";

  for (const [segmentIndex, segment] of ctx.segments.entries()) {
    if (segment.trust === "system") {
      // Developer-authored prompts are instructions by definition; running
      // the heuristic over them would only generate false positives.
      perSegment.push({ segmentIndex, decision: "allow", violations: [] });
      continue;
    }

    const scanContext: ScanContext = {
      source: segment.source,
      trustTier: segment.trust,
    };
    const { decision, violations } = await ingestionScanner.scan(
      segment.content,
      scanContext,
    );
    perSegment.push({ segmentIndex, decision, violations });

    // Keep the most severe decision seen so far.
    if (priority(decision) > priority(aggregate)) {
      aggregate = decision;
    }
  }

  ctx.scanResults = perSegment;
  ctx.decision = aggregate;
  return ctx;
}
|
|
182
|
+
|
|
183
|
+
/** Options accepted by `assemblePrompt()`. */
export interface AssembleOptions {
  /**
   * When `true`, segments whose scan decision is `"block"` are omitted
   * from the output. When unset/`false` they are kept, wrapped in the
   * `blocked` fence so an auditor can see what was tried.
   */
  strictMode?: boolean;
  /** Custom fence labels. Defaults are sensible. */
  fences?: {
    untrusted?: { open: string; close: string };
    blocked?: { open: string; close: string };
  };
}

/** Text written in place of any fence marker found inside fenced content. */
const FENCE_MARKER_PLACEHOLDER = "[FENCE_MARKER_REMOVED]";

/**
 * Make a value safe to place inside the double-quoted `source="…"` /
 * `label="…"` attributes of a fence header: line breaks become a space and
 * `"`, `<`, `>` are entity-escaped so the value cannot terminate the header.
 */
function escapeFenceAttribute(value: string): string {
  return value
    .replace(/[\r\n]+/g, " ")
    .replace(/"/g, "&quot;")
    .replace(/</g, "&lt;")
    .replace(/>/g, "&gt;");
}

/**
 * Build a function that replaces every (case-insensitive) occurrence of the
 * given fence markers in a text with `FENCE_MARKER_PLACEHOLDER`.
 */
function buildFenceNeutralizer(markers: string[]): (text: string) => string {
  const alternatives = markers
    .filter((marker) => marker.length > 0)
    // Longest first so a marker that is a prefix of another cannot shadow it.
    .sort((a, b) => b.length - a.length)
    .map((marker) => marker.replace(/[.*+?^${}()|[\]\\]/g, "\\$&"));
  if (alternatives.length === 0) return (text) => text;
  const pattern = new RegExp(alternatives.join("|"), "gi");
  return (text) => text.replace(pattern, FENCE_MARKER_PLACEHOLDER);
}

/**
 * Assemble a prompt string respecting tier boundaries.
 *
 * Order: `system` → `trusted` retrieved/memory/tool-desc → `user`
 * → all remaining `untrusted` segments wrapped in fenced markers.
 * Original relative order is preserved within each group, and parts are
 * joined with a blank line.
 *
 * Why `trusted` before `user`? Putting developer-marked trusted
 * context above the user message reduces the chance an untrusted user
 * prompt re-frames the trusted reference material below it.
 *
 * Untrusted segments are wrapped in an explicit fence (by default
 * `<UNTRUSTED_CONTENT source="…" label="…">` … `</UNTRUSTED_CONTENT>`) so a
 * downstream model has a chance to attend to provenance. This is not a
 * guarantee (no in-band marker is) but it is the single highest-leverage
 * mitigation we can apply at the toolkit layer per Anthropic +
 * OpenAI Model Spec guidance.
 *
 * Fence integrity: before a segment is fenced, any occurrence of the
 * configured open/close markers inside its content is replaced with
 * `[FENCE_MARKER_REMOVED]`, and `source`/`label` are attribute-escaped.
 * Without this, content containing the closing marker (or a label
 * containing `">`) could terminate the fence early and place
 * attacker text outside it. System, trusted and unblocked user segments are
 * emitted verbatim.
 *
 * Pass `strictMode: true` to OMIT blocked segments entirely. Default
 * keeps them but fences them with the `blocked` fence (by default
 * `<BLOCKED_CONTENT …>`) so an auditor can see what was tried.
 *
 * @param ctx Wrapped context; `scanResults`, if present, are looked up by
 *   `segmentIndex`. Segments with no scan result are treated as not blocked.
 * @param options Strict mode and custom fences.
 * @returns The assembled prompt text.
 */
export function assemblePrompt(
  ctx: WrappedContext,
  options: AssembleOptions = {},
): string {
  const fences = {
    untrusted: options.fences?.untrusted ?? {
      open: "<UNTRUSTED_CONTENT source=",
      close: "</UNTRUSTED_CONTENT>",
    },
    blocked: options.fences?.blocked ?? {
      open: "<BLOCKED_CONTENT source=",
      close: "</BLOCKED_CONTENT>",
    },
  };

  const neutralize = buildFenceNeutralizer([
    fences.untrusted.open,
    fences.untrusted.close,
    fences.blocked.open,
    fences.blocked.close,
  ]);

  const renderFenced = (
    fence: { open: string; close: string },
    segment: ContextSegment,
  ): string => {
    const source = escapeFenceAttribute(String(segment.source));
    const label = escapeFenceAttribute(segment.label ?? "");
    return `${fence.open}"${source}" label="${label}">\n${neutralize(segment.content)}\n${fence.close}`;
  };

  // Index scan results once so the lookup in the assembly loop is O(1).
  const resultByIndex = new Map<
    number,
    NonNullable<WrappedContext["scanResults"]>[number]
  >();
  for (const r of ctx.scanResults ?? []) {
    resultByIndex.set(r.segmentIndex, r);
  }

  // Snapshot segments together with their ORIGINAL index. Carrying the
  // index (instead of looking it up by object identity later) avoids both
  // the O(n²) `indexOf` and a wrong lookup when the same segment object
  // appears more than once in `ctx.segments`.
  const indexed = ctx.segments.map((segment, index) => ({ segment, index }));

  const ordered = [
    // 1. system
    ...indexed.filter(({ segment }) => segment.trust === "system"),
    // 2. trusted (retrieved/memory/tool-desc the dev marked as trusted)
    ...indexed.filter(({ segment }) => segment.trust === "trusted"),
    // 3. user (untrusted, source="user")
    ...indexed.filter(
      ({ segment }) =>
        segment.source === "user" && segment.trust === "untrusted",
    ),
    // 4. all remaining untrusted, original order preserved within group.
    ...indexed.filter(
      ({ segment }) =>
        segment.trust === "untrusted" && segment.source !== "user",
    ),
  ];

  const parts: string[] = [];
  for (const { segment, index } of ordered) {
    const blocked = resultByIndex.get(index)?.decision === "block";

    if (blocked) {
      // Strict mode drops blocked segments entirely.
      if (options.strictMode) continue;
      parts.push(renderFenced(fences.blocked, segment));
      continue;
    }

    if (segment.trust === "untrusted" && segment.source !== "user") {
      parts.push(renderFenced(fences.untrusted, segment));
    } else {
      // System and trusted text is emitted as-is. User input also keeps its
      // natural shape — fencing every user message creates more noise
      // than signal.
      parts.push(segment.content);
    }
  }

  return parts.join("\n\n");
}
|
|
289
|
+
|
|
290
|
+
/** Return the SHA-256 digest of `content` as a lowercase hex string. */
function hashContent(content: string): string {
  const hasher = createHash("sha256");
  hasher.update(content);
  return hasher.digest("hex");
}
|
|
293
|
+
|
|
294
|
+
/**
 * Rank a scan decision by severity so decisions can be compared:
 * `"block"` is 2, `"warn"` is 1, anything else is 0.
 */
function priority(d: ScanDecision): number {
  switch (d) {
    case "block":
      return 2;
    case "warn":
      return 1;
    default:
      return 0;
  }
}
|
|
297
|
+
|
|
298
|
+
/**
 * Collect the violations of every scanned segment into one flat list,
 * in scan-result order. Returns an empty list when `ctx` has no
 * `scanResults`.
 */
export function flattenViolations(ctx: WrappedContext): Violation[] {
  const results = ctx.scanResults;
  return results ? results.map(({ violations }) => violations).flat() : [];
}
|
|
305
|
+
|
|
306
|
+
// ============================================================
|
|
307
|
+
// propagateTrust — Multi-Agent Trust Propagation
|
|
308
|
+
//
|
|
309
|
+
// In a multi-agent pipeline one agent's output becomes the next agent's
|
|
310
|
+
// input. A successful injection in agent A propagates: A summarizes a
|
|
311
|
+
// poisoned document, B reads A's summary and decides, C executes. The
|
|
312
|
+
// 2026 literature calls this multi-agent contagion — and the standard
|
|
313
|
+
// in-context defenses share an attention substrate with the payload, so
|
|
314
|
+
// the only robust handling is to track trust ACROSS the chain and refuse
|
|
315
|
+
// to let a downstream agent treat upstream output as trusted once any
|
|
316
|
+
// link is contaminated.
|
|
317
|
+
//
|
|
318
|
+
// `propagateTrust()` scans one hop (A → B) as `agent-output`, degrades
|
|
319
|
+
// the effective trust tier on any warn/block, and keeps contamination
|
|
320
|
+
// "sticky": pass the returned `hops` back as `priorChain` for the next
|
|
321
|
+
// link so a poisoning at A still marks the C-hop as contaminated even if
|
|
322
|
+
// C's own payload looks clean.
|
|
323
|
+
// ============================================================
|
|
324
|
+
|
|
325
|
+
/** One link in a multi-agent trust chain, as recorded by `propagateTrust()`. */
export interface AgentHop {
  /** Id of the agent whose output was handed over at this hop. */
  agentId: string;
  /** Trust tier that was assumed for that output when it was scanned. */
  trust: TrustTier;
  /** Outcome of scanning this hop's payload. */
  decision: ScanDecision;
  /** Violations recorded for this hop. */
  violations: Array<Violation>;
}
|
|
335
|
+
|
|
336
|
+
/** Optional settings for `propagateTrust()`. */
export interface PropagateTrustOptions {
  /**
   * Trust tier to assume for the producing agent's output. When omitted,
   * `untrusted` is used — agent output is attacker-influenceable by
   * construction. Use `trusted` only for an agent whose output you
   * control end-to-end.
   */
  fromTrust?: TrustTier;
  /**
   * The `hops` returned by the previous `propagateTrust()` call. Supplying
   * it is what keeps contamination sticky across A→B→C; leave it out for
   * the first link of a chain.
   */
  priorChain?: Array<AgentHop>;
  /** Strictness passed to the ingestion scanner. Defaults to `high`. */
  strictness?: "low" | "medium" | "high";
}
|
|
351
|
+
|
|
352
|
+
/** Outcome of a `propagateTrust()` call, covering the whole chain so far. */
export interface TrustPropagationResult {
  /** `true` only when no hop in the chain (prior hops included) warned or blocked. */
  safe: boolean;
  /** Most severe decision seen anywhere in the chain; once `block`, it stays `block`. */
  decision: ScanDecision;
  /**
   * Tier the RECEIVING agent should apply to the payload. It is `trusted`
   * only for a clean chain whose producing agent was declared `trusted`;
   * otherwise `untrusted`.
   */
  effectiveTrust: TrustTier;
  /** Prior hops followed by this one. Pass as `priorChain` on the next call. */
  hops: Array<AgentHop>;
  /** All violations from every hop, oldest hop first. */
  violations: Array<Violation>;
}
|
|
367
|
+
|
|
368
|
+
/**
 * Scan a single agent-to-agent hand-off and fold it into the trust chain.
 *
 * The payload is scanned as `agent-output` at the producing agent's
 * declared trust tier. Each scanner violation is annotated with the
 * `from→to` pair, and one extra `trust_propagation` violation is added
 * when either this hop is flagged or an earlier hop in `priorChain` was.
 * The returned decision is the most severe across the whole chain.
 *
 * @param payload     The producing agent's output (= consuming agent's input).
 * @param fromAgentId Agent that produced `payload`.
 * @param toAgentId   Agent about to consume `payload`.
 * @param options     Declared trust, prior chain and scanner strictness.
 * @returns Chain-wide decision, effective trust, hops and violations.
 *
 * @example
 * ```ts
 * import { propagateTrust } from "ai-shield-core";
 *
 * // A → B
 * let chain = await propagateTrust(aOutput, "researcher", "planner");
 * // B → C, contamination at A stays sticky through to C
 * chain = await propagateTrust(bOutput, "planner", "executor", {
 *   priorChain: chain.hops,
 * });
 * if (chain.effectiveTrust !== "trusted" && !chain.safe) {
 *   // an upstream agent was poisoned — do not let the executor act on it
 *   haltPipeline(chain.violations);
 * }
 * ```
 */
export async function propagateTrust(
  payload: string,
  fromAgentId: string,
  toAgentId: string,
  options: PropagateTrustOptions = {},
): Promise<TrustPropagationResult> {
  const declaredTrust = options.fromTrust ?? "untrusted";
  const earlierHops = options.priorChain ?? [];
  const handOff = `${fromAgentId}→${toAgentId}`;

  const ingestionScanner = new IngestionScanner({
    strictness: options.strictness ?? "high",
  });
  const scanContext: ScanContext = {
    source: "agent-output",
    trustTier: declaredTrust,
    agentId: fromAgentId,
  };
  const scan = await ingestionScanner.scan(payload, scanContext);

  // Tag every scanner finding with the hand-off it was found on.
  const hopViolations: Violation[] = scan.violations.map((found) => ({
    ...found,
    detail: `${found.detail ?? ""} (${handOff})`.trim(),
  }));

  // Most severe decision among the hops that came before this one.
  let upstreamWorst: ScanDecision = "allow";
  for (const earlier of earlierHops) {
    if (priority(earlier.decision) > priority(upstreamWorst)) {
      upstreamWorst = earlier.decision;
    }
  }
  const upstreamContaminated = upstreamWorst !== "allow";

  const hopDecision = scan.decision;
  const flaggedHere = hopDecision !== "allow";

  // Record contamination as its own multi-agent violation, separate from
  // the scanner's per-payload findings. A flag on this hop takes
  // precedence over the "inherited from upstream" wording.
  if (flaggedHere || upstreamContaminated) {
    hopViolations.push({
      type: "trust_propagation",
      scanner: "trust-chain",
      score: hopDecision === "block" ? 1.0 : 0.5,
      threshold: 0.5,
      message: flaggedHere
        ? `Contagion risk in hand-off ${fromAgentId}→${toAgentId}`
        : `Payload reaching ${toAgentId} originates from a contaminated chain`,
      detail: flaggedHere
        ? `Agent output flagged at this hop`
        : `Upstream contamination is sticky across hops`,
    });
  }

  const thisHop: AgentHop = {
    agentId: fromAgentId,
    trust: declaredTrust,
    decision: hopDecision,
    violations: hopViolations,
  };
  const hops = [...earlierHops, thisHop];

  // Sticky chain decision: upstream wins only when strictly more severe.
  const chainDecision: ScanDecision =
    priority(upstreamWorst) > priority(hopDecision)
      ? upstreamWorst
      : hopDecision;

  // Only a clean chain fed by a `trusted` producer remains trusted.
  const chainIsClean = chainDecision === "allow";
  const effectiveTrust: TrustTier =
    chainIsClean && declaredTrust === "trusted" ? "trusted" : "untrusted";

  return {
    safe: chainIsClean,
    decision: chainDecision,
    effectiveTrust,
    hops,
    violations: hops.flatMap((h) => h.violations),
  };
}
|
package/src/cost/pricing.ts
CHANGED
|
@@ -1,8 +1,15 @@
|
|
|
1
1
|
import type { ModelPricing } from "../types.js";
|
|
2
2
|
|
|
3
3
|
// ============================================================
|
|
4
|
-
// Model Pricing Table — Updated
|
|
5
|
-
// Prices in USD per 1M tokens
|
|
4
|
+
// Model Pricing Table — Updated June 2026
|
|
5
|
+
// Prices in USD per 1M tokens.
|
|
6
|
+
// Includes `cachedInputPer1M` for providers that support prompt caching
|
|
7
|
+
// (Anthropic cache reads land at ~10% of standard input rate).
|
|
8
|
+
//
|
|
9
|
+
// Note: with the Opus 4.7 generation Anthropic dropped the Opus input/output
|
|
10
|
+
// rate from $15/$75 to $5/$25 and serves the 1M context window at standard
|
|
11
|
+
// pricing (no long-context premium). Earlier tables that still list Opus at
|
|
12
|
+
// $15/$75 over-estimate Opus cost by ~3x.
|
|
6
13
|
// ============================================================
|
|
7
14
|
|
|
8
15
|
export const MODEL_PRICING: Record<string, ModelPricing> = {
|
|
@@ -17,16 +24,21 @@ export const MODEL_PRICING: Record<string, ModelPricing> = {
|
|
|
17
24
|
"o3-mini": { inputPer1M: 1.10, outputPer1M: 4.40 },
|
|
18
25
|
"o4-mini": { inputPer1M: 1.10, outputPer1M: 4.40 },
|
|
19
26
|
|
|
20
|
-
// Anthropic
|
|
21
|
-
"claude-
|
|
22
|
-
"claude-
|
|
23
|
-
"claude-
|
|
27
|
+
// Anthropic — June 2026 line-up (Fable 5, Opus 4.8/4.7/4.6, Sonnet 4.6, Haiku 4.5)
|
|
28
|
+
"claude-fable-5": { inputPer1M: 10.0, outputPer1M: 50.0, cachedInputPer1M: 1.0 },
|
|
29
|
+
"claude-opus-4-8": { inputPer1M: 5.0, outputPer1M: 25.0, cachedInputPer1M: 0.50 },
|
|
30
|
+
"claude-opus-4-7": { inputPer1M: 5.0, outputPer1M: 25.0, cachedInputPer1M: 0.50 },
|
|
31
|
+
"claude-opus-4-6": { inputPer1M: 5.0, outputPer1M: 25.0, cachedInputPer1M: 0.50 },
|
|
32
|
+
"claude-sonnet-4-6": { inputPer1M: 3.0, outputPer1M: 15.0, cachedInputPer1M: 0.30 },
|
|
33
|
+
"claude-sonnet-4-5": { inputPer1M: 3.0, outputPer1M: 15.0, cachedInputPer1M: 0.30 },
|
|
34
|
+
"claude-haiku-4-5": { inputPer1M: 1.0, outputPer1M: 5.0, cachedInputPer1M: 0.10 },
|
|
24
35
|
|
|
25
36
|
// Aliases
|
|
26
37
|
"gpt-5.2-turbo": { inputPer1M: 2.50, outputPer1M: 10.0 },
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
|
|
38
|
+
fable: { inputPer1M: 10.0, outputPer1M: 50.0, cachedInputPer1M: 1.0 },
|
|
39
|
+
opus: { inputPer1M: 5.0, outputPer1M: 25.0, cachedInputPer1M: 0.50 },
|
|
40
|
+
sonnet: { inputPer1M: 3.0, outputPer1M: 15.0, cachedInputPer1M: 0.30 },
|
|
41
|
+
haiku: { inputPer1M: 1.0, outputPer1M: 5.0, cachedInputPer1M: 0.10 },
|
|
30
42
|
};
|
|
31
43
|
|
|
32
44
|
/** Get pricing for a model, fallback to gpt-4o-mini rates */
|
package/src/cost/tracker.ts
CHANGED
|
@@ -49,17 +49,32 @@ class MemoryStore implements RedisLike {
|
|
|
49
49
|
}
|
|
50
50
|
}
|
|
51
51
|
|
|
52
|
+
/** Optional settings for `CostTracker`. */
export interface CostTrackerOptions {
  /**
   * Maximum number of `CostRecord`s kept in memory; the oldest are
   * dropped once the cap is exceeded.
   *
   * Use 0 to keep no records at all — suitable for long-running
   * processes that only need the budget counters, not per-request
   * records.
   *
   * When this option is omitted, the `AI_SHIELD_MAX_RECORDS` environment
   * variable is used if it holds a finite, non-negative number;
   * otherwise the cap defaults to 10_000.
   */
  maxRecords?: number;
}
|
|
62
|
+
|
|
52
63
|
export class CostTracker {
|
|
53
64
|
private store: RedisLike;
|
|
54
65
|
private budgets: Map<string, BudgetConfig>;
|
|
55
66
|
private records: CostRecord[] = [];
|
|
67
|
+
private maxRecords: number;
|
|
56
68
|
|
|
57
69
|
/**
 * @param budgets Budget configuration keyed by entity id.
 * @param redis   Backing store for counters; an in-memory `MemoryStore`
 *                is created when omitted.
 * @param options See `CostTrackerOptions`.
 *
 * Record cap resolution, first usable value wins:
 *  1. `options.maxRecords` (NaN is ignored; negatives are clamped to 0;
 *     fractions are floored; `Infinity` is kept as "no cap"),
 *  2. `AI_SHIELD_MAX_RECORDS` when set to a finite, non-negative number,
 *  3. 10_000.
 */
constructor(
  budgets: Record<string, BudgetConfig> = {},
  redis?: RedisLike,
  options: CostTrackerOptions = {},
) {
  this.store = redis ?? new MemoryStore();
  this.budgets = new Map(Object.entries(budgets));

  const DEFAULT_MAX_RECORDS = 10_000;

  // The eviction arithmetic assumes a whole, non-negative cap: a fractional
  // cap makes `splice(0, overflow)` remove nothing and NaN disables the
  // length comparison, so both would let the buffer grow without bound.
  const requested = options.maxRecords;
  const fromOptions =
    requested == null || Number.isNaN(requested)
      ? undefined
      : Math.max(0, Math.floor(requested));

  // An unset OR blank variable means "not configured". `Number("")` is 0,
  // which would otherwise silently disable record retention.
  const rawEnv = process.env.AI_SHIELD_MAX_RECORDS;
  const parsedEnv =
    rawEnv !== undefined && rawEnv.trim() !== "" ? Number(rawEnv) : NaN;
  const fromEnv =
    Number.isFinite(parsedEnv) && parsedEnv >= 0
      ? Math.floor(parsedEnv)
      : undefined;

  this.maxRecords = fromOptions ?? fromEnv ?? DEFAULT_MAX_RECORDS;
}
|
|
64
79
|
|
|
65
80
|
/** Check if a request is within budget BEFORE sending to LLM */
|
|
@@ -133,10 +148,29 @@ export class CostTracker {
|
|
|
133
148
|
await this.store.expire(globalKey, this.periodSeconds(globalBudget.period) * 2);
|
|
134
149
|
}
|
|
135
150
|
|
|
136
|
-
this.
|
|
151
|
+
this.appendRecord(record);
|
|
137
152
|
return record;
|
|
138
153
|
}
|
|
139
154
|
|
|
155
|
+
/**
 * Append a record, evicting the oldest entries so that at most
 * `maxRecords` are retained (prevents unbounded memory growth).
 *
 * A cap of 0 (or below) retains nothing. A fractional cap is floored
 * before use.
 */
private appendRecord(record: CostRecord): void {
  // Work with a whole-number cap: with a fractional cap the overflow would
  // be a fraction below 1, `splice` would truncate it to 0 and nothing
  // would ever be evicted.
  const cap = Math.floor(this.maxRecords);
  if (cap <= 0) return;

  this.records.push(record);

  const overflow = this.records.length - cap;
  if (overflow > 0) {
    // Drop the oldest entries from the front. Note: removing from the front
    // of an array may shift the remaining elements, so this is not
    // guaranteed to be O(1); in steady state `overflow` is 1 per append.
    this.records.splice(0, overflow);
  }
}
|
|
168
|
+
|
|
169
|
+
/**
 * Empty the in-memory record buffer (e.g. once records have been exported).
 * The existing array is emptied in place rather than replaced.
 */
clearRecords(): void {
  this.records.splice(0, this.records.length);
}
|
|
173
|
+
|
|
140
174
|
/** Get current spend for an entity */
|
|
141
175
|
async getCurrentSpend(entityId: string): Promise<number> {
|
|
142
176
|
const budget = this.budgets.get(entityId);
|