@artemiskit/sdk 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +134 -0
- package/README.md +173 -0
- package/adapters/openai/dist/index.js +5625 -0
- package/dist/index.js +42577 -0
- package/dist/matchers/index.js +224 -0
- package/dist/matchers/jest.js +257 -0
- package/dist/matchers/vitest.js +257 -0
- package/package.json +78 -0
- package/src/__tests__/artemiskit.test.ts +425 -0
- package/src/__tests__/matchers.test.ts +450 -0
- package/src/artemiskit.ts +791 -0
- package/src/guardian/action-validator.ts +585 -0
- package/src/guardian/circuit-breaker.ts +655 -0
- package/src/guardian/guardian.ts +497 -0
- package/src/guardian/guardrails.ts +536 -0
- package/src/guardian/index.ts +142 -0
- package/src/guardian/intent-classifier.ts +378 -0
- package/src/guardian/interceptor.ts +381 -0
- package/src/guardian/policy.ts +446 -0
- package/src/guardian/types.ts +436 -0
- package/src/index.ts +164 -0
- package/src/matchers/core.ts +315 -0
- package/src/matchers/index.ts +26 -0
- package/src/matchers/jest.ts +112 -0
- package/src/matchers/vitest.ts +84 -0
- package/src/types.ts +259 -0
- package/tsconfig.json +11 -0
|
@@ -0,0 +1,536 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Guardrails Module
|
|
3
|
+
*
|
|
4
|
+
* Provides input/output validation including:
|
|
5
|
+
* - Injection detection (prompt injection, jailbreaks)
|
|
6
|
+
* - PII detection and redaction
|
|
7
|
+
* - Content filtering
|
|
8
|
+
* - Custom validation rules
|
|
9
|
+
*/
|
|
10
|
+
|
|
11
|
+
import { nanoid } from 'nanoid';
|
|
12
|
+
import type {
|
|
13
|
+
ContentCategory,
|
|
14
|
+
ContentFilterResult,
|
|
15
|
+
ContentFlag,
|
|
16
|
+
GuardrailResult,
|
|
17
|
+
InjectionDetection,
|
|
18
|
+
InjectionType,
|
|
19
|
+
PIIDetection,
|
|
20
|
+
PIILocation,
|
|
21
|
+
PIIType,
|
|
22
|
+
Violation,
|
|
23
|
+
ViolationSeverity,
|
|
24
|
+
} from './types';
|
|
25
|
+
|
|
26
|
+
// =============================================================================
|
|
27
|
+
// Injection Detection
|
|
28
|
+
// =============================================================================
|
|
29
|
+
|
|
30
|
+
/**
 * Regex rule groups used to recognise injection attempts.
 *
 * Each group pairs an injection type with the severity assigned to it and
 * the case-insensitive expressions that signal it. Group order is
 * significant for consumers that stop at the first matching expression.
 */
const INJECTION_PATTERNS: {
  type: InjectionType;
  patterns: RegExp[];
  severity: ViolationSeverity;
}[] = [
  // Attempts to replace or discard the instructions given so far.
  {
    severity: 'critical',
    type: 'prompt_injection',
    patterns: [
      /ignore\s+(all\s+)?(previous|prior|above)\s+(instructions|prompts|rules)/i,
      /disregard\s+(all\s+)?(previous|prior|above)\s+(instructions|prompts)/i,
      /forget\s+(everything|all)\s+(you|i)\s+(told|said)/i,
      /new\s+instructions?\s*:/i,
      /system\s*:\s*you\s+are/i,
      /\[SYSTEM\]/i,
      /\[INST\]/i,
      /<<SYS>>/i,
      /<\|im_start\|>/i,
      /\bACTUAL\s+SYSTEM\s+PROMPT\b/i,
    ],
  },
  // Well-known jailbreak phrasings.
  {
    severity: 'critical',
    type: 'jailbreak',
    patterns: [
      /do\s+anything\s+now/i,
      /\bDAN\b.*\bmode\b/i,
      /bypass\s+(your\s+)?(safety|content|ethical)\s+(filters?|restrictions?)/i,
      /pretend\s+you\s+(have\s+)?no\s+(rules|restrictions|limitations)/i,
      /act\s+as\s+if\s+you\s+(have\s+)?no\s+(rules|restrictions)/i,
      /developer\s+mode\s+(enabled|activated)/i,
      /\bunlocked\b.*\bmode\b/i,
      /\bjailbreak(ed)?\b/i,
    ],
  },
  // Attempts to assign the model a new role or persona.
  {
    severity: 'high',
    type: 'role_hijack',
    patterns: [
      /you\s+are\s+(now\s+)?(?!a\s+helpful)[a-z]+\s+assistant/i,
      /from\s+now\s+on,?\s+you\s+(are|will\s+be)/i,
      /your\s+new\s+(role|persona|identity)\s+is/i,
      /act\s+as\s+(if\s+you\s+are\s+)?a\s+different/i,
      /roleplay\s+as/i,
      /pretend\s+(to\s+be|you\s+are)/i,
    ],
  },
  // Explicit requests to override existing directives.
  {
    severity: 'high',
    type: 'instruction_override',
    patterns: [
      /override\s+(your\s+)?(instructions?|programming|directives?)/i,
      /new\s+(primary\s+)?(directive|instruction|command)/i,
      /highest\s+priority\s+(instruction|directive)/i,
      /supersede\s+(all\s+)?(other\s+)?instructions?/i,
    ],
  },
  // Requests to reveal the prompt or instructions.
  {
    severity: 'critical',
    type: 'data_extraction',
    patterns: [
      /reveal\s+(your\s+)?(system\s+)?prompt/i,
      /show\s+(me\s+)?(your\s+)?(system\s+)?instructions/i,
      /what\s+(are|were)\s+(your\s+)?(initial|original)\s+(instructions|prompt)/i,
      /repeat\s+(everything|all)\s+(above|before)/i,
      /output\s+(your\s+)?(full\s+)?(system\s+)?prompt/i,
    ],
  },
  // Requests to print, leak or extract the system/hidden prompt.
  {
    severity: 'high',
    type: 'system_prompt_leak',
    patterns: [
      /print\s+(the\s+)?(system|hidden)\s+(prompt|message)/i,
      /leak\s+(the\s+)?system\s+prompt/i,
      /extract\s+(the\s+)?(system|hidden)\s+(instructions?|prompt)/i,
    ],
  },
  // Markup that imitates a system-message delimiter.
  {
    severity: 'medium',
    type: 'delimiter_attack',
    patterns: [
      /```\s*system/i,
      /---\s*system\s*---/i,
      /\[\[SYSTEM\]\]/i,
      /\{\{SYSTEM\}\}/i,
      /<\/?system>/i,
    ],
  },
  // Payloads announced as base64 / hex / unicode-escaped data.
  {
    severity: 'medium',
    type: 'encoding_attack',
    patterns: [
      /base64\s*:\s*[A-Za-z0-9+/=]{20,}/i,
      /decode\s+(this\s+)?base64/i,
      /hex\s*:\s*[0-9a-fA-F]{20,}/i,
      /unicode\s*:\s*\\u[0-9a-fA-F]{4}/i,
    ],
  },
];
|
|
132
|
+
|
|
133
|
+
/**
 * Scan text for the first known injection pattern.
 *
 * Groups in `INJECTION_PATTERNS` are tried in order and the first matching
 * expression wins; later groups are not evaluated once a match is found.
 *
 * @param text - Content to inspect.
 * @returns On a match: `detected: true`, the group's type, a confidence
 *   derived from the group's severity (critical 0.95, high 0.85, otherwise
 *   0.7), the matching regex source, and the match location. Otherwise
 *   `{ detected: false, confidence: 0 }`.
 */
export function detectInjection(text: string): InjectionDetection {
  for (const group of INJECTION_PATTERNS) {
    for (const candidate of group.patterns) {
      const hit = text.match(candidate);
      if (!hit) {
        continue;
      }

      // Confidence is a fixed value per severity tier.
      let confidence = 0.7;
      if (group.severity === 'critical') {
        confidence = 0.95;
      } else if (group.severity === 'high') {
        confidence = 0.85;
      }

      const location =
        hit.index !== undefined ? { start: hit.index, end: hit.index + hit[0].length } : undefined;

      return {
        detected: true,
        type: group.type,
        confidence,
        pattern: candidate.source,
        location,
      };
    }
  }

  return { detected: false, confidence: 0 };
}
|
|
160
|
+
|
|
161
|
+
/**
 * Build a guardrail function that rejects content containing an injection
 * attempt.
 *
 * The returned function runs `detectInjection` on the content. A detection
 * yields a failed result carrying a single blocking violation; the violation
 * severity is always `'critical'`, independent of which pattern group matched.
 *
 * @returns An async guardrail; the optional `context` argument is accepted by
 *   the signature but not used.
 */
export function createInjectionGuardrail(): (
  content: string,
  context?: Record<string, unknown>
) => Promise<GuardrailResult> {
  return async (content: string) => {
    const finding = detectInjection(content);

    // Nothing suspicious: pass with no violations.
    if (!finding.detected) {
      return { passed: true, violations: [] };
    }

    const violation: Violation = {
      id: nanoid(),
      type: 'injection_detection',
      severity: 'critical',
      message: `Detected ${finding.type?.replace(/_/g, ' ')} attempt`,
      details: {
        type: finding.type,
        confidence: finding.confidence,
        pattern: finding.pattern,
      },
      timestamp: new Date(),
      action: 'block',
      blocked: true,
    };

    return { passed: false, violations: [violation] };
  };
}
|
|
196
|
+
|
|
197
|
+
// =============================================================================
|
|
198
|
+
// PII Detection
|
|
199
|
+
// =============================================================================
|
|
200
|
+
|
|
201
|
+
/**
 * PII detection patterns.
 *
 * Every expression is global so all occurrences can be enumerated; `mask` is
 * the replacement text used when a match is redacted.
 */
const PII_PATTERNS: Array<{
  type: PIIType;
  pattern: RegExp;
  mask: string;
}> = [
  {
    type: 'email',
    pattern: /[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}/g,
    mask: '[EMAIL]',
  },
  {
    type: 'phone',
    pattern: /(\+?1[-.\s]?)?\(?\d{3}\)?[-.\s]?\d{3}[-.\s]?\d{4}/g,
    mask: '[PHONE]',
  },
  {
    type: 'ssn',
    pattern: /\b\d{3}[-\s]?\d{2}[-\s]?\d{4}\b/g,
    mask: '[SSN]',
  },
  {
    type: 'credit_card',
    pattern: /\b(?:\d{4}[-\s]?){3}\d{4}\b/g,
    mask: '[CREDIT_CARD]',
  },
  {
    type: 'ip_address',
    pattern: /\b(?:\d{1,3}\.){3}\d{1,3}\b/g,
    mask: '[IP]',
  },
  {
    type: 'api_key',
    pattern: /\b(sk|pk|api|key|token|secret)[-_]?[a-zA-Z0-9]{20,}\b/gi,
    mask: '[API_KEY]',
  },
  {
    type: 'jwt_token',
    pattern: /eyJ[a-zA-Z0-9_-]*\.eyJ[a-zA-Z0-9_-]*\.[a-zA-Z0-9_-]*/g,
    mask: '[JWT]',
  },
  {
    type: 'password',
    pattern: /password\s*[:=]\s*['"]?[^\s'"]+['"]?/gi,
    mask: 'password:[REDACTED]',
  },
];

/**
 * Detect PII in text and produce a redacted copy.
 *
 * Every pattern is matched against the ORIGINAL text, so `locations` offsets
 * always index into `text`. Matches from different patterns may overlap (a
 * 16-digit card number also contains a 10-digit phone-like run); all of them
 * are reported in `locations`, in pattern order.
 *
 * `redactedContent` is built from those same locations rather than by running
 * each pattern over progressively masked text. Overlapping matches are merged
 * into one span, masked with the mask of the earliest (and, on ties, longest)
 * match, so no character belonging to any match survives redaction.
 *
 * @param text - Content to scan.
 * @returns `found` (any match), `types` (distinct matched types in first-seen
 *   order), `locations` (every match), and `redactedContent`.
 */
export function detectPII(text: string): PIIDetection {
  const locations: PIILocation[] = [];

  for (const { type, pattern, mask } of PII_PATTERNS) {
    // Clone the shared global regex so its lastIndex state is never reused.
    const regex = new RegExp(pattern.source, pattern.flags);

    for (let match = regex.exec(text); match !== null; match = regex.exec(text)) {
      locations.push({
        type,
        start: match.index,
        end: match.index + match[0].length,
        value: match[0],
        masked: mask,
      });

      // Defensive: an empty match would never advance lastIndex on its own.
      if (match[0].length === 0) {
        regex.lastIndex += 1;
      }
    }
  }

  // Earliest span first; for equal starts the longer span wins the mask.
  const ordered = [...locations].sort((a, b) => a.start - b.start || b.end - a.end);

  let redactedContent = '';
  let cursor = 0; // index in `text` up to which output has been produced
  for (const span of ordered) {
    if (span.start < cursor) {
      // Overlaps the span just masked: swallow any tail that sticks out.
      if (span.end > cursor) {
        cursor = span.end;
      }
      continue;
    }
    redactedContent += text.slice(cursor, span.start) + span.masked;
    cursor = span.end;
  }
  redactedContent += text.slice(cursor);

  return {
    found: locations.length > 0,
    types: [...new Set(locations.map((l) => l.type))],
    locations,
    redactedContent,
  };
}
|
|
285
|
+
|
|
286
|
+
/**
 * Create a PII detection guardrail.
 *
 * The returned function runs `detectPII` on the content and reports one
 * violation per detected location whose type is not in `allowedTypes`.
 * SSN and credit-card findings are `'critical'`; every other type is `'high'`.
 *
 * @param options.redact - When true (default), the result carries
 *   `transformedContent` with the flagged PII masked.
 * @param options.block - When true, violations are blocking and the result
 *   fails. Default false.
 * @param options.allowedTypes - PII types that are tolerated: they produce no
 *   violation and are left untouched in `transformedContent`.
 * @returns An async guardrail; the optional `context` argument is accepted by
 *   the signature but not used.
 */
export function createPIIGuardrail(
  options: {
    redact?: boolean;
    block?: boolean;
    allowedTypes?: PIIType[];
  } = {}
): (content: string, context?: Record<string, unknown>) => Promise<GuardrailResult> {
  const { redact = true, block = false, allowedTypes = [] } = options;

  // Mask only the given spans of `text`, merging spans that overlap.
  const maskSpans = (text: string, spans: PIILocation[]): string => {
    const ordered = [...spans].sort((a, b) => a.start - b.start || b.end - a.end);
    let output = '';
    let cursor = 0;
    for (const span of ordered) {
      if (span.start < cursor) {
        // Overlaps the span just masked: swallow any tail that sticks out.
        if (span.end > cursor) {
          cursor = span.end;
        }
        continue;
      }
      output += text.slice(cursor, span.start) + span.masked;
      cursor = span.end;
    }
    return output + text.slice(cursor);
  };

  return async (content: string) => {
    const detection = detectPII(content);

    // Locations whose type the caller has not explicitly allowed.
    const flagged = detection.locations.filter((loc) => !allowedTypes.includes(loc.type));

    if (flagged.length === 0) {
      return { passed: true, violations: [] };
    }

    const violations = flagged.map(
      (loc): Violation => ({
        id: nanoid(),
        type: 'pii_detection',
        severity: loc.type === 'ssn' || loc.type === 'credit_card' ? 'critical' : 'high',
        message: `Detected ${loc.type.replace(/_/g, ' ')} in content`,
        details: {
          piiType: loc.type,
          location: { start: loc.start, end: loc.end },
        },
        timestamp: new Date(),
        action: block ? 'block' : redact ? 'transform' : 'warn',
        blocked: block,
      })
    );

    let transformedContent: string | undefined;
    if (redact) {
      // detectPII's redactedContent masks every type. It is only correct when
      // nothing was filtered out; otherwise allowed PII must stay readable.
      transformedContent =
        flagged.length === detection.locations.length
          ? detection.redactedContent
          : maskSpans(content, flagged);
    }

    return {
      passed: !block,
      violations,
      transformedContent,
    };
  };
}
|
|
331
|
+
|
|
332
|
+
// =============================================================================
|
|
333
|
+
// Content Filtering
|
|
334
|
+
// =============================================================================
|
|
335
|
+
|
|
336
|
+
/**
 * Regex rule groups for the content filter, one group per harmful-content
 * category, each with the severity reported when one of its expressions
 * matches.
 */
const CONTENT_PATTERNS: {
  category: ContentCategory;
  patterns: RegExp[];
  severity: ViolationSeverity;
}[] = [
  {
    severity: 'high',
    category: 'violence',
    patterns: [
      /\b(kill|murder|assassinate|execute)\s+(people|someone|them|him|her)\b/i,
      /\bhow\s+to\s+(make|build)\s+(a\s+)?(bomb|weapon|explosive)\b/i,
      /\b(torture|mutilate|dismember)\b/i,
    ],
  },
  {
    severity: 'critical',
    category: 'hate_speech',
    patterns: [
      /\b(hate|kill|exterminate)\s+(all\s+)?(jews|muslims|christians|blacks|whites|asians)\b/i,
      /\b(racial|ethnic)\s+(slur|epithet)/i,
    ],
  },
  {
    severity: 'critical',
    category: 'self_harm',
    patterns: [
      /\bhow\s+to\s+(commit\s+)?suicide\b/i,
      /\bbest\s+way\s+to\s+(kill|hurt)\s+(yourself|myself)\b/i,
      /\bself[-\s]harm\s+(methods?|techniques?)\b/i,
    ],
  },
  {
    severity: 'high',
    category: 'dangerous',
    patterns: [
      /\bhow\s+to\s+make\s+(meth|drugs|poison)\b/i,
      /\b(synthesize|manufacture)\s+(illegal\s+)?(drugs?|narcotics?)\b/i,
    ],
  },
  {
    severity: 'high',
    category: 'illegal',
    patterns: [
      /\bhow\s+to\s+(hack|steal|launder|forge)\b/i,
      /\b(money\s+laundering|tax\s+evasion)\s+(guide|tutorial)\b/i,
    ],
  },
  {
    severity: 'medium',
    category: 'harassment',
    patterns: [
      /\b(threaten|harass|stalk|doxx)\s+(someone|them|him|her)\b/i,
      /\bfind\s+(someone'?s?|their)\s+(home\s+)?address\b/i,
    ],
  },
  {
    severity: 'medium',
    category: 'misinformation',
    patterns: [
      /\bfake\s+news\s+(about|regarding)\b/i,
      /\b(spread|create)\s+(false|fake)\s+(information|news)\b/i,
    ],
  },
];
|
|
403
|
+
|
|
404
|
+
/**
 * Check text against every content category in `CONTENT_PATTERNS`.
 *
 * For each category only the first matching expression is recorded, so a
 * category contributes at most one flag. Every flag carries a fixed
 * confidence of 0.9 and the matched text as its snippet.
 *
 * @param text - Content to inspect.
 * @returns `passed` (true when nothing was flagged), the collected `flags`,
 *   and the distinct flagged `categories`.
 */
export function filterContent(text: string): ContentFilterResult {
  const flags: ContentFlag[] = [];
  const categories: ContentCategory[] = [];

  for (const rule of CONTENT_PATTERNS) {
    // Find the first expression of this category that matches, if any.
    let firstHit: RegExpMatchArray | null = null;
    for (const expression of rule.patterns) {
      firstHit = text.match(expression);
      if (firstHit) {
        break;
      }
    }

    if (!firstHit) {
      continue;
    }

    flags.push({
      category: rule.category,
      severity: rule.severity,
      confidence: 0.9,
      snippet: firstHit[0],
    });

    if (!categories.includes(rule.category)) {
      categories.push(rule.category);
    }
  }

  return { passed: flags.length === 0, flags, categories };
}
|
|
435
|
+
|
|
436
|
+
/**
 * Build a guardrail function around `filterContent`.
 *
 * Every flag returned by the filter becomes a violation. Its action is
 * `'block'` when the category is in `blockedCategories`, `'warn'` when it is
 * in `warnCategories`, and `'allow'` otherwise. The result passes unless at
 * least one violation is blocking.
 *
 * @param options.blockedCategories - Categories that block. Defaults to
 *   violence, hate_speech, self_harm, dangerous and illegal.
 * @param options.warnCategories - Categories that only warn. Defaults to
 *   harassment and misinformation.
 * @returns An async guardrail; the optional `context` argument is accepted by
 *   the signature but not used.
 */
export function createContentFilterGuardrail(
  options: {
    blockedCategories?: ContentCategory[];
    warnCategories?: ContentCategory[];
  } = {}
): (content: string, context?: Record<string, unknown>) => Promise<GuardrailResult> {
  const {
    blockedCategories: blockList = ['violence', 'hate_speech', 'self_harm', 'dangerous', 'illegal'],
    warnCategories: warnList = ['harassment', 'misinformation'],
  } = options;

  return async (content: string) => {
    const { flags } = filterContent(content);

    const violations: Violation[] = flags.map((flag): Violation => {
      const isBlocked = blockList.includes(flag.category);

      // Blocking takes precedence over warning; anything else is allowed.
      let action: 'block' | 'warn' | 'allow' = 'allow';
      if (isBlocked) {
        action = 'block';
      } else if (warnList.includes(flag.category)) {
        action = 'warn';
      }

      return {
        id: nanoid(),
        type: 'content_filter',
        severity: flag.severity,
        message: `Content flagged for ${flag.category.replace(/_/g, ' ')}`,
        details: {
          category: flag.category,
          confidence: flag.confidence,
          snippet: flag.snippet,
        },
        timestamp: new Date(),
        action,
        blocked: isBlocked,
      };
    });

    const anyBlocked = violations.some((v) => v.blocked);
    return { passed: !anyBlocked, violations };
  };
}
|
|
479
|
+
|
|
480
|
+
// =============================================================================
|
|
481
|
+
// Composite Guardrail Factory
|
|
482
|
+
// =============================================================================
|
|
483
|
+
|
|
484
|
+
/**
 * Options accepted by `createGuardrails`.
 *
 * The three built-in guardrails are opt-out: each one is included unless its
 * flag is explicitly set to `false`.
 */
export interface GuardrailsConfig {
  /** Include the injection detection guardrail (omit or `true` to enable). */
  injectionDetection?: boolean;

  /** Include the PII detection guardrail (omit or `true` to enable). */
  piiDetection?: boolean;

  /** Options forwarded to the PII guardrail. */
  piiOptions?: {
    redact?: boolean;
    block?: boolean;
    allowedTypes?: PIIType[];
  };

  /** Include the content filter guardrail (omit or `true` to enable). */
  contentFilter?: boolean;

  /** Options forwarded to the content filter guardrail. */
  contentFilterOptions?: {
    blockedCategories?: ContentCategory[];
    warnCategories?: ContentCategory[];
  };

  /** Additional caller-supplied guardrails, appended after the built-in ones. */
  custom?: ((content: string, context?: Record<string, unknown>) => Promise<GuardrailResult>)[];
}
|
|
508
|
+
|
|
509
|
+
/**
 * Assemble the list of guardrail functions described by a configuration.
 *
 * Order of the returned list: injection detection, PII detection, content
 * filter, then any custom guardrails. A built-in guardrail is left out only
 * when its flag is explicitly `false`.
 *
 * @param config - Guardrail selection and options; defaults to `{}`, which
 *   enables all three built-in guardrails with their default options.
 * @returns The guardrail functions, in the order listed above.
 */
export function createGuardrails(
  config: GuardrailsConfig = {}
): Array<(content: string, context?: Record<string, unknown>) => Promise<GuardrailResult>> {
  const pipeline: Array<
    (content: string, context?: Record<string, unknown>) => Promise<GuardrailResult>
  > = [];

  const injectionEnabled = config.injectionDetection !== false;
  const piiEnabled = config.piiDetection !== false;
  const contentFilterEnabled = config.contentFilter !== false;

  if (injectionEnabled) {
    pipeline.push(createInjectionGuardrail());
  }
  if (piiEnabled) {
    pipeline.push(createPIIGuardrail(config.piiOptions));
  }
  if (contentFilterEnabled) {
    pipeline.push(createContentFilterGuardrail(config.contentFilterOptions));
  }

  // Caller-supplied guardrails always run after the built-in ones.
  if (config.custom) {
    for (const extra of config.custom) {
      pipeline.push(extra);
    }
  }

  return pipeline;
}
|
|
@@ -0,0 +1,142 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Guardian Module - Runtime protection for AI/LLM applications
|
|
3
|
+
*
|
|
4
|
+
* Provides comprehensive guardrails to prevent AI agents from performing
|
|
5
|
+
* harmful or unauthorized actions.
|
|
6
|
+
*
|
|
7
|
+
* @example
|
|
8
|
+
* ```typescript
|
|
9
|
+
* import { Guardian, createGuardian } from '@artemiskit/sdk/guardian';
|
|
10
|
+
*
|
|
11
|
+
* const guardian = createGuardian({
|
|
12
|
+
* mode: 'guardian',
|
|
13
|
+
* blockOnFailure: true,
|
|
14
|
+
* });
|
|
15
|
+
*
|
|
16
|
+
* // Wrap your LLM client
|
|
17
|
+
* const protectedClient = guardian.protect(myLLMClient);
|
|
18
|
+
*
|
|
19
|
+
* // Validate tool calls
|
|
20
|
+
* const result = await guardian.validateAction('delete_file', { path: '/etc/passwd' });
|
|
21
|
+
* if (!result.valid) {
|
|
22
|
+
* console.log('Blocked:', result.violations);
|
|
23
|
+
* }
|
|
24
|
+
* ```
|
|
25
|
+
*/
|
|
26
|
+
|
|
27
|
+
// Main Guardian class
|
|
28
|
+
export { Guardian, createGuardian, type GuardianConfig } from './guardian';
|
|
29
|
+
|
|
30
|
+
// Interceptor
|
|
31
|
+
export {
|
|
32
|
+
GuardianInterceptor,
|
|
33
|
+
GuardianBlockedError,
|
|
34
|
+
createInterceptor,
|
|
35
|
+
type InterceptorConfig,
|
|
36
|
+
type InterceptorStats,
|
|
37
|
+
type GuardrailFn,
|
|
38
|
+
} from './interceptor';
|
|
39
|
+
|
|
40
|
+
// Action Validator
|
|
41
|
+
export {
|
|
42
|
+
ActionValidator,
|
|
43
|
+
createDefaultActionValidator,
|
|
44
|
+
type ActionValidatorConfig,
|
|
45
|
+
type ActionValidationResult,
|
|
46
|
+
} from './action-validator';
|
|
47
|
+
|
|
48
|
+
// Intent Classifier
|
|
49
|
+
export {
|
|
50
|
+
IntentClassifier,
|
|
51
|
+
createIntentClassifier,
|
|
52
|
+
type IntentClassifierConfig,
|
|
53
|
+
type IntentCategory,
|
|
54
|
+
} from './intent-classifier';
|
|
55
|
+
|
|
56
|
+
// Guardrails
|
|
57
|
+
export {
|
|
58
|
+
detectInjection,
|
|
59
|
+
createInjectionGuardrail,
|
|
60
|
+
detectPII,
|
|
61
|
+
createPIIGuardrail,
|
|
62
|
+
filterContent,
|
|
63
|
+
createContentFilterGuardrail,
|
|
64
|
+
createGuardrails,
|
|
65
|
+
type GuardrailsConfig,
|
|
66
|
+
} from './guardrails';
|
|
67
|
+
|
|
68
|
+
// Policy
|
|
69
|
+
export {
|
|
70
|
+
loadPolicy,
|
|
71
|
+
parsePolicy,
|
|
72
|
+
validatePolicy,
|
|
73
|
+
createDefaultPolicy,
|
|
74
|
+
mergePolicies,
|
|
75
|
+
getRulesByType,
|
|
76
|
+
isRuleEnabled,
|
|
77
|
+
generatePolicyTemplate,
|
|
78
|
+
PolicyLoadError,
|
|
79
|
+
PolicyValidationError,
|
|
80
|
+
} from './policy';
|
|
81
|
+
|
|
82
|
+
// Circuit Breaker and Metrics
|
|
83
|
+
export {
|
|
84
|
+
CircuitBreaker,
|
|
85
|
+
MetricsCollector,
|
|
86
|
+
RateLimiter,
|
|
87
|
+
type CircuitBreakerEvent,
|
|
88
|
+
type CircuitBreakerEventHandler,
|
|
89
|
+
type RateLimiterConfig,
|
|
90
|
+
} from './circuit-breaker';
|
|
91
|
+
|
|
92
|
+
// Types
|
|
93
|
+
export type {
|
|
94
|
+
// Core types
|
|
95
|
+
GuardianMode,
|
|
96
|
+
ViolationSeverity,
|
|
97
|
+
ViolationAction,
|
|
98
|
+
GuardrailType,
|
|
99
|
+
Violation,
|
|
100
|
+
GuardrailResult,
|
|
101
|
+
// Policy types
|
|
102
|
+
PolicyRule,
|
|
103
|
+
PolicyCondition,
|
|
104
|
+
GuardianPolicy,
|
|
105
|
+
CircuitBreakerConfig,
|
|
106
|
+
RateLimitConfig,
|
|
107
|
+
CostLimitConfig,
|
|
108
|
+
// Action types
|
|
109
|
+
ActionDefinition,
|
|
110
|
+
ActionParameter,
|
|
111
|
+
ParameterValidation,
|
|
112
|
+
// Intent types
|
|
113
|
+
IntentClassification,
|
|
114
|
+
// Detection types
|
|
115
|
+
PIIDetection,
|
|
116
|
+
PIIType,
|
|
117
|
+
PIILocation,
|
|
118
|
+
InjectionDetection,
|
|
119
|
+
InjectionType,
|
|
120
|
+
ContentFilterResult,
|
|
121
|
+
ContentFlag,
|
|
122
|
+
ContentCategory,
|
|
123
|
+
HallucinationCheckResult,
|
|
124
|
+
Citation,
|
|
125
|
+
UnsupportedClaim,
|
|
126
|
+
// Metrics types
|
|
127
|
+
GuardianMetrics,
|
|
128
|
+
CircuitBreakerState,
|
|
129
|
+
CostTracking,
|
|
130
|
+
// Event types
|
|
131
|
+
GuardianEventType,
|
|
132
|
+
GuardianEvent,
|
|
133
|
+
GuardianEventHandler,
|
|
134
|
+
// Interceptor types
|
|
135
|
+
InterceptedRequest,
|
|
136
|
+
InterceptedResponse,
|
|
137
|
+
InterceptedToolCall,
|
|
138
|
+
InterceptedAgentStep,
|
|
139
|
+
// Framework types
|
|
140
|
+
FrameworkType,
|
|
141
|
+
FrameworkIntegrationConfig,
|
|
142
|
+
} from './types';
|