@artemiskit/sdk 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,536 @@
1
+ /**
2
+ * Guardrails Module
3
+ *
4
+ * Provides input/output validation including:
5
+ * - Injection detection (prompt injection, jailbreaks)
6
+ * - PII detection and redaction
7
+ * - Content filtering
8
+ * - Custom validation rules
9
+ */
10
+
11
+ import { nanoid } from 'nanoid';
12
+ import type {
13
+ ContentCategory,
14
+ ContentFilterResult,
15
+ ContentFlag,
16
+ GuardrailResult,
17
+ InjectionDetection,
18
+ InjectionType,
19
+ PIIDetection,
20
+ PIILocation,
21
+ PIIType,
22
+ Violation,
23
+ ViolationSeverity,
24
+ } from './types';
25
+
26
+ // =============================================================================
27
+ // Injection Detection
28
+ // =============================================================================
29
+
30
/**
 * Rule table consulted by `detectInjection`.
 *
 * Each entry groups the regular expressions for one injection type together
 * with the severity used to derive the reported confidence. Entries (and the
 * patterns inside each entry) are tried in declaration order, so an earlier
 * entry wins when several would match.
 */
const INJECTION_PATTERNS: Array<{
  type: InjectionType;
  patterns: RegExp[];
  severity: ViolationSeverity;
}> = [
  {
    type: 'prompt_injection',
    severity: 'critical',
    patterns: [
      /ignore\s+(all\s+)?(previous|prior|above)\s+(instructions|prompts|rules)/i,
      /disregard\s+(all\s+)?(previous|prior|above)\s+(instructions|prompts)/i,
      /forget\s+(everything|all)\s+(you|i)\s+(told|said)/i,
      /new\s+instructions?\s*:/i,
      /system\s*:\s*you\s+are/i,
      /\[SYSTEM\]/i,
      /\[INST\]/i,
      /<<SYS>>/i,
      /<\|im_start\|>/i,
      /\bACTUAL\s+SYSTEM\s+PROMPT\b/i,
    ],
  },
  {
    type: 'jailbreak',
    severity: 'critical',
    patterns: [
      /do\s+anything\s+now/i,
      /\bDAN\b.*\bmode\b/i,
      /bypass\s+(your\s+)?(safety|content|ethical)\s+(filters?|restrictions?)/i,
      /pretend\s+you\s+(have\s+)?no\s+(rules|restrictions|limitations)/i,
      /act\s+as\s+if\s+you\s+(have\s+)?no\s+(rules|restrictions)/i,
      /developer\s+mode\s+(enabled|activated)/i,
      /\bunlocked\b.*\bmode\b/i,
      /\bjailbreak(ed)?\b/i,
    ],
  },
  {
    type: 'role_hijack',
    severity: 'high',
    patterns: [
      /you\s+are\s+(now\s+)?(?!a\s+helpful)[a-z]+\s+assistant/i,
      /from\s+now\s+on,?\s+you\s+(are|will\s+be)/i,
      /your\s+new\s+(role|persona|identity)\s+is/i,
      /act\s+as\s+(if\s+you\s+are\s+)?a\s+different/i,
      /roleplay\s+as/i,
      /pretend\s+(to\s+be|you\s+are)/i,
    ],
  },
  {
    type: 'instruction_override',
    severity: 'high',
    patterns: [
      /override\s+(your\s+)?(instructions?|programming|directives?)/i,
      /new\s+(primary\s+)?(directive|instruction|command)/i,
      /highest\s+priority\s+(instruction|directive)/i,
      /supersede\s+(all\s+)?(other\s+)?instructions?/i,
    ],
  },
  {
    type: 'data_extraction',
    severity: 'critical',
    patterns: [
      /reveal\s+(your\s+)?(system\s+)?prompt/i,
      /show\s+(me\s+)?(your\s+)?(system\s+)?instructions/i,
      /what\s+(are|were)\s+(your\s+)?(initial|original)\s+(instructions|prompt)/i,
      /repeat\s+(everything|all)\s+(above|before)/i,
      /output\s+(your\s+)?(full\s+)?(system\s+)?prompt/i,
    ],
  },
  {
    type: 'system_prompt_leak',
    severity: 'high',
    patterns: [
      /print\s+(the\s+)?(system|hidden)\s+(prompt|message)/i,
      /leak\s+(the\s+)?system\s+prompt/i,
      /extract\s+(the\s+)?(system|hidden)\s+(instructions?|prompt)/i,
    ],
  },
  {
    type: 'delimiter_attack',
    severity: 'medium',
    patterns: [
      /```\s*system/i,
      /---\s*system\s*---/i,
      /\[\[SYSTEM\]\]/i,
      /\{\{SYSTEM\}\}/i,
      /<\/?system>/i,
    ],
  },
  {
    type: 'encoding_attack',
    severity: 'medium',
    patterns: [
      /base64\s*:\s*[A-Za-z0-9+/=]{20,}/i,
      /decode\s+(this\s+)?base64/i,
      /hex\s*:\s*[0-9a-fA-F]{20,}/i,
      /unicode\s*:\s*\\u[0-9a-fA-F]{4}/i,
    ],
  },
];

/**
 * Scan `text` for the first known injection pattern.
 *
 * @param text - Content to inspect.
 * @returns `{ detected: false, confidence: 0 }` when nothing matches. Otherwise
 *   the type of the first matching rule, the source of the matching regex, the
 *   character range of the match, and a confidence derived from the rule's
 *   severity (0.95 for critical, 0.85 for high, 0.7 for anything else).
 */
export function detectInjection(text: string): InjectionDetection {
  for (const rule of INJECTION_PATTERNS) {
    for (const candidate of rule.patterns) {
      // None of the table's regexes is global, so exec() always searches from index 0.
      const hit = candidate.exec(text);
      if (hit === null) {
        continue;
      }

      let confidence = 0.7;
      if (rule.severity === 'critical') {
        confidence = 0.95;
      } else if (rule.severity === 'high') {
        confidence = 0.85;
      }

      return {
        detected: true,
        type: rule.type,
        confidence,
        pattern: candidate.source,
        location: { start: hit.index, end: hit.index + hit[0].length },
      };
    }
  }

  return { detected: false, confidence: 0 };
}
160
+
161
/**
 * Build a guardrail that rejects content in which `detectInjection` finds a
 * match.
 *
 * The returned function accepts an optional `context` argument to satisfy the
 * guardrail signature but does not read it. A detection always yields exactly
 * one violation with severity `'critical'` and action `'block'`, regardless of
 * which rule matched; the matched type, confidence and pattern are attached
 * under `details`.
 *
 * @returns An async guardrail resolving to `{ passed: true, violations: [] }`
 *   for clean content, or `{ passed: false, violations: [violation] }`.
 */
export function createInjectionGuardrail(): (
  content: string,
  context?: Record<string, unknown>
) => Promise<GuardrailResult> {
  return async (content: string) => {
    const { detected, type, confidence, pattern } = detectInjection(content);

    if (!detected) {
      return { passed: true, violations: [] };
    }

    const violation: Violation = {
      id: nanoid(),
      type: 'injection_detection',
      severity: 'critical',
      // Underscores in the type name are shown as spaces, e.g. "prompt injection".
      message: `Detected ${type?.replace(/_/g, ' ')} attempt`,
      details: { type, confidence, pattern },
      timestamp: new Date(),
      action: 'block',
      blocked: true,
    };

    return { passed: false, violations: [violation] };
  };
}
196
+
197
+ // =============================================================================
198
+ // PII Detection
199
+ // =============================================================================
200
+
201
/**
 * PII detection rules.
 *
 * Each rule pairs a global regular expression with the placeholder that
 * replaces a match during redaction. Rules are evaluated in declaration order,
 * which is also the order in which their matches appear in
 * `PIIDetection.locations`.
 */
const PII_PATTERNS: Array<{
  type: PIIType;
  pattern: RegExp;
  mask: string;
}> = [
  {
    type: 'email',
    pattern: /[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}/g,
    mask: '[EMAIL]',
  },
  {
    type: 'phone',
    pattern: /(\+?1[-.\s]?)?\(?\d{3}\)?[-.\s]?\d{3}[-.\s]?\d{4}/g,
    mask: '[PHONE]',
  },
  {
    type: 'ssn',
    pattern: /\b\d{3}[-\s]?\d{2}[-\s]?\d{4}\b/g,
    mask: '[SSN]',
  },
  {
    type: 'credit_card',
    pattern: /\b(?:\d{4}[-\s]?){3}\d{4}\b/g,
    mask: '[CREDIT_CARD]',
  },
  {
    type: 'ip_address',
    pattern: /\b(?:\d{1,3}\.){3}\d{1,3}\b/g,
    mask: '[IP]',
  },
  {
    type: 'api_key',
    pattern: /\b(sk|pk|api|key|token|secret)[-_]?[a-zA-Z0-9]{20,}\b/gi,
    mask: '[API_KEY]',
  },
  {
    type: 'jwt_token',
    pattern: /eyJ[a-zA-Z0-9_-]*\.eyJ[a-zA-Z0-9_-]*\.[a-zA-Z0-9_-]*/g,
    mask: '[JWT]',
  },
  {
    type: 'password',
    pattern: /password\s*[:=]\s*['"]?[^\s'"]+['"]?/gi,
    mask: 'password:[REDACTED]',
  },
];

/**
 * Replace the given spans of `text` with their masks.
 *
 * Offsets in `spans` refer to `text` itself. Spans that overlap are merged into
 * one region covering all of them, and that region is replaced once, using the
 * mask of its widest span. This guarantees that every character belonging to
 * any span is removed, even when two rules match overlapping text (for example
 * the phone rule matching the first ten digits of a sixteen-digit card number).
 *
 * @param text - The original, unredacted text.
 * @param spans - Locations to mask; the array is not modified.
 * @returns `text` with each merged region replaced by a single mask.
 */
function redactSpans(text: string, spans: readonly PIILocation[]): string {
  // Earliest start first; on equal starts the wider span leads its region.
  const ordered = [...spans].sort((a, b) => a.start - b.start || b.end - a.end);

  const regions: Array<{ start: number; end: number; mask: string; width: number }> = [];
  for (const span of ordered) {
    const width = span.end - span.start;
    const current = regions[regions.length - 1];
    if (current !== undefined && span.start < current.end) {
      // Overlap: grow the region and keep the mask of the widest contributor.
      if (width > current.width) {
        current.mask = span.masked;
        current.width = width;
      }
      current.end = Math.max(current.end, span.end);
      continue;
    }
    regions.push({ start: span.start, end: span.end, mask: span.masked, width });
  }

  let redacted = '';
  let cursor = 0;
  for (const region of regions) {
    redacted += text.slice(cursor, region.start) + region.mask;
    cursor = region.end;
  }
  return redacted + text.slice(cursor);
}

/**
 * Detect PII in text.
 *
 * Every rule in `PII_PATTERNS` is run against the original `text`; each match
 * is reported with its type, character range, matched value and mask. Matches
 * of different rules may overlap and are all reported.
 *
 * `redactedContent` is built from those same locations (see `redactSpans`), so
 * it never contains any character that is part of a reported match. Previously
 * the rules were applied one after another to already-redacted text, which
 * could leave the tail of a longer match (such as a card number) in place.
 *
 * @param text - Content to inspect.
 * @returns Whether anything was found, the distinct types found, all match
 *   locations, and the redacted text.
 */
export function detectPII(text: string): PIIDetection {
  const locations: PIILocation[] = [];

  for (const { type, pattern, mask } of PII_PATTERNS) {
    // Work on a private copy so the shared rule's lastIndex is never mutated.
    const regex = new RegExp(pattern.source, pattern.flags);
    let match = regex.exec(text);

    while (match !== null) {
      locations.push({
        type,
        start: match.index,
        end: match.index + match[0].length,
        value: match[0],
        masked: mask,
      });
      // A global regex does not advance past an empty match; step manually so
      // a rule that can match the empty string cannot loop forever.
      if (match[0].length === 0) {
        regex.lastIndex += 1;
      }
      match = regex.exec(text);
    }
  }

  return {
    found: locations.length > 0,
    types: [...new Set(locations.map((l) => l.type))],
    locations,
    redactedContent: redactSpans(text, locations),
  };
}

/**
 * Create a PII detection guardrail.
 *
 * @param options.redact - When true (default) and something is flagged, the
 *   result carries `transformedContent` with the flagged PII masked.
 * @param options.block - When true the guardrail fails (`passed: false`) and
 *   every violation is marked blocked. Defaults to false.
 * @param options.allowedTypes - PII types that are neither reported nor
 *   masked. Defaults to none.
 * @returns An async guardrail. The optional `context` argument of the
 *   guardrail signature is not read. Violations for `ssn` and `credit_card`
 *   are `'critical'`, all others `'high'`.
 */
export function createPIIGuardrail(
  options: {
    redact?: boolean;
    block?: boolean;
    allowedTypes?: PIIType[];
  } = {}
): (content: string, context?: Record<string, unknown>) => Promise<GuardrailResult> {
  const { redact = true, block = false, allowedTypes = [] } = options;

  return async (content: string) => {
    const detection = detectPII(content);

    // Only locations whose type is not explicitly allowed count as violations.
    const flagged = detection.locations.filter((loc) => !allowedTypes.includes(loc.type));

    if (flagged.length === 0) {
      return { passed: true, violations: [] };
    }

    const violations = flagged.map(
      (loc): Violation => ({
        id: nanoid(),
        type: 'pii_detection',
        severity: loc.type === 'ssn' || loc.type === 'credit_card' ? 'critical' : 'high',
        message: `Detected ${loc.type.replace(/_/g, ' ')} in content`,
        details: {
          piiType: loc.type,
          location: { start: loc.start, end: loc.end },
        },
        timestamp: new Date(),
        action: block ? 'block' : redact ? 'transform' : 'warn',
        blocked: block,
      })
    );

    return {
      passed: !block,
      violations,
      // Mask only what was flagged, so allowed PII types stay intact in the output.
      transformedContent: redact ? redactSpans(content, flagged) : undefined,
    };
  };
}
331
+
332
+ // =============================================================================
333
+ // Content Filtering
334
+ // =============================================================================
335
+
336
/**
 * Harmful-content rules, one entry per category.
 *
 * `filterContent` reports at most one flag per entry: the first pattern in the
 * entry's list that matches.
 */
const CONTENT_PATTERNS: Array<{
  category: ContentCategory;
  patterns: RegExp[];
  severity: ViolationSeverity;
}> = [
  {
    category: 'violence',
    severity: 'high',
    patterns: [
      /\b(kill|murder|assassinate|execute)\s+(people|someone|them|him|her)\b/i,
      /\bhow\s+to\s+(make|build)\s+(a\s+)?(bomb|weapon|explosive)\b/i,
      /\b(torture|mutilate|dismember)\b/i,
    ],
  },
  {
    category: 'hate_speech',
    severity: 'critical',
    patterns: [
      /\b(hate|kill|exterminate)\s+(all\s+)?(jews|muslims|christians|blacks|whites|asians)\b/i,
      /\b(racial|ethnic)\s+(slur|epithet)/i,
    ],
  },
  {
    category: 'self_harm',
    severity: 'critical',
    patterns: [
      /\bhow\s+to\s+(commit\s+)?suicide\b/i,
      /\bbest\s+way\s+to\s+(kill|hurt)\s+(yourself|myself)\b/i,
      /\bself[-\s]harm\s+(methods?|techniques?)\b/i,
    ],
  },
  {
    category: 'dangerous',
    severity: 'high',
    patterns: [
      /\bhow\s+to\s+make\s+(meth|drugs|poison)\b/i,
      /\b(synthesize|manufacture)\s+(illegal\s+)?(drugs?|narcotics?)\b/i,
    ],
  },
  {
    category: 'illegal',
    severity: 'high',
    patterns: [
      /\bhow\s+to\s+(hack|steal|launder|forge)\b/i,
      /\b(money\s+laundering|tax\s+evasion)\s+(guide|tutorial)\b/i,
    ],
  },
  {
    category: 'harassment',
    severity: 'medium',
    patterns: [
      /\b(threaten|harass|stalk|doxx)\s+(someone|them|him|her)\b/i,
      /\bfind\s+(someone'?s?|their)\s+(home\s+)?address\b/i,
    ],
  },
  {
    category: 'misinformation',
    severity: 'medium',
    patterns: [
      /\bfake\s+news\s+(about|regarding)\b/i,
      /\b(spread|create)\s+(false|fake)\s+(information|news)\b/i,
    ],
  },
];

/**
 * Check `text` against every content category.
 *
 * @param text - Content to inspect.
 * @returns `passed` is true only when no category matched. `flags` holds one
 *   entry per matching category (category, severity, a fixed confidence of 0.9
 *   and the matched snippet); `categories` lists the matched categories in
 *   table order.
 */
export function filterContent(text: string): ContentFilterResult {
  const flags: ContentFlag[] = [];
  const categories: ContentCategory[] = [];

  for (const rule of CONTENT_PATTERNS) {
    // Find the first pattern of this category that matches; later ones are not tried.
    let snippet: string | undefined;
    for (const candidate of rule.patterns) {
      const hit = candidate.exec(text);
      if (hit !== null) {
        snippet = hit[0];
        break;
      }
    }

    if (snippet === undefined) {
      continue;
    }

    flags.push({
      category: rule.category,
      severity: rule.severity,
      confidence: 0.9,
      snippet,
    });
    if (categories.indexOf(rule.category) === -1) {
      categories.push(rule.category);
    }
  }

  return { passed: flags.length === 0, flags, categories };
}
435
+
436
/**
 * Create a content filter guardrail.
 *
 * Every flag produced by `filterContent` becomes a violation. A flag whose
 * category is in `blockedCategories` gets action `'block'` and is marked
 * blocked; otherwise one in `warnCategories` gets `'warn'`; anything else gets
 * `'allow'`. The guardrail passes as long as no violation is blocked.
 *
 * @param options.blockedCategories - Defaults to violence, hate_speech,
 *   self_harm, dangerous and illegal.
 * @param options.warnCategories - Defaults to harassment and misinformation.
 * @returns An async guardrail; the optional `context` argument of the
 *   guardrail signature is not read.
 */
export function createContentFilterGuardrail(
  options: {
    blockedCategories?: ContentCategory[];
    warnCategories?: ContentCategory[];
  } = {}
): (content: string, context?: Record<string, unknown>) => Promise<GuardrailResult> {
  const {
    blockedCategories = ['violence', 'hate_speech', 'self_harm', 'dangerous', 'illegal'],
    warnCategories = ['harassment', 'misinformation'],
  } = options;

  return async (content: string) => {
    const { flags } = filterContent(content);

    const violations: Violation[] = [];
    for (const flag of flags) {
      const blocked = blockedCategories.includes(flag.category);
      const warned = warnCategories.includes(flag.category);

      // Blocking takes precedence when a category appears in both lists.
      let action: Violation['action'] = 'allow';
      if (blocked) {
        action = 'block';
      } else if (warned) {
        action = 'warn';
      }

      violations.push({
        id: nanoid(),
        type: 'content_filter',
        severity: flag.severity,
        message: `Content flagged for ${flag.category.replace(/_/g, ' ')}`,
        details: {
          category: flag.category,
          confidence: flag.confidence,
          snippet: flag.snippet,
        },
        timestamp: new Date(),
        action,
        blocked,
      });
    }

    return {
      passed: violations.every((v) => !v.blocked),
      violations,
    };
  };
}
479
+
480
+ // =============================================================================
481
+ // Composite Guardrail Factory
482
+ // =============================================================================
483
+
484
/**
 * Options accepted by `createGuardrails`.
 *
 * The three built-in guardrails are switched off only by setting their flag to
 * `false`; leaving a flag unset keeps that guardrail enabled.
 */
export interface GuardrailsConfig {
  /** Set to `false` to leave out the injection detection guardrail. */
  injectionDetection?: boolean;

  /** Set to `false` to leave out the PII detection guardrail. */
  piiDetection?: boolean;

  /** Passed through to `createPIIGuardrail`. */
  piiOptions?: {
    redact?: boolean;
    block?: boolean;
    allowedTypes?: PIIType[];
  };

  /** Set to `false` to leave out the content filter guardrail. */
  contentFilter?: boolean;

  /** Passed through to `createContentFilterGuardrail`. */
  contentFilterOptions?: {
    blockedCategories?: ContentCategory[];
    warnCategories?: ContentCategory[];
  };

  /** Additional guardrail functions, appended after the built-in ones. */
  custom?: ((content: string, context?: Record<string, unknown>) => Promise<GuardrailResult>)[];
}
508
+
509
/**
 * Assemble the list of guardrail functions described by `config`.
 *
 * The result is ordered: injection detection, PII detection, content filter,
 * then any `custom` guardrails in the order given. Each built-in guardrail is
 * included unless its flag is explicitly `false`, so calling this with no
 * argument returns all three built-ins.
 *
 * @param config - Which guardrails to include and their options.
 * @returns A new array of guardrail functions.
 */
export function createGuardrails(
  config: GuardrailsConfig = {}
): Array<(content: string, context?: Record<string, unknown>) => Promise<GuardrailResult>> {
  const {
    injectionDetection,
    piiDetection,
    piiOptions,
    contentFilter,
    contentFilterOptions,
    custom,
  } = config;

  return [
    ...(injectionDetection !== false ? [createInjectionGuardrail()] : []),
    ...(piiDetection !== false ? [createPIIGuardrail(piiOptions)] : []),
    ...(contentFilter !== false ? [createContentFilterGuardrail(contentFilterOptions)] : []),
    ...(custom ? custom : []),
  ];
}
@@ -0,0 +1,142 @@
1
+ /**
2
+ * Guardian Module - Runtime protection for AI/LLM applications
3
+ *
4
+ * Provides comprehensive guardrails to prevent AI agents from performing
5
+ * harmful or unauthorized actions.
6
+ *
7
+ * @example
8
+ * ```typescript
9
+ * import { Guardian, createGuardian } from '@artemiskit/sdk/guardian';
10
+ *
11
+ * const guardian = createGuardian({
12
+ * mode: 'guardian',
13
+ * blockOnFailure: true,
14
+ * });
15
+ *
16
+ * // Wrap your LLM client
17
+ * const protectedClient = guardian.protect(myLLMClient);
18
+ *
19
+ * // Validate tool calls
20
+ * const result = await guardian.validateAction('delete_file', { path: '/etc/passwd' });
21
+ * if (!result.valid) {
22
+ * console.log('Blocked:', result.violations);
23
+ * }
24
+ * ```
25
+ */
26
+
27
+ // Main Guardian class
28
+ export { Guardian, createGuardian, type GuardianConfig } from './guardian';
29
+
30
+ // Interceptor
31
+ export {
32
+ GuardianInterceptor,
33
+ GuardianBlockedError,
34
+ createInterceptor,
35
+ type InterceptorConfig,
36
+ type InterceptorStats,
37
+ type GuardrailFn,
38
+ } from './interceptor';
39
+
40
+ // Action Validator
41
+ export {
42
+ ActionValidator,
43
+ createDefaultActionValidator,
44
+ type ActionValidatorConfig,
45
+ type ActionValidationResult,
46
+ } from './action-validator';
47
+
48
+ // Intent Classifier
49
+ export {
50
+ IntentClassifier,
51
+ createIntentClassifier,
52
+ type IntentClassifierConfig,
53
+ type IntentCategory,
54
+ } from './intent-classifier';
55
+
56
+ // Guardrails
57
+ export {
58
+ detectInjection,
59
+ createInjectionGuardrail,
60
+ detectPII,
61
+ createPIIGuardrail,
62
+ filterContent,
63
+ createContentFilterGuardrail,
64
+ createGuardrails,
65
+ type GuardrailsConfig,
66
+ } from './guardrails';
67
+
68
+ // Policy
69
+ export {
70
+ loadPolicy,
71
+ parsePolicy,
72
+ validatePolicy,
73
+ createDefaultPolicy,
74
+ mergePolicies,
75
+ getRulesByType,
76
+ isRuleEnabled,
77
+ generatePolicyTemplate,
78
+ PolicyLoadError,
79
+ PolicyValidationError,
80
+ } from './policy';
81
+
82
+ // Circuit Breaker and Metrics
83
+ export {
84
+ CircuitBreaker,
85
+ MetricsCollector,
86
+ RateLimiter,
87
+ type CircuitBreakerEvent,
88
+ type CircuitBreakerEventHandler,
89
+ type RateLimiterConfig,
90
+ } from './circuit-breaker';
91
+
92
+ // Types
93
+ export type {
94
+ // Core types
95
+ GuardianMode,
96
+ ViolationSeverity,
97
+ ViolationAction,
98
+ GuardrailType,
99
+ Violation,
100
+ GuardrailResult,
101
+ // Policy types
102
+ PolicyRule,
103
+ PolicyCondition,
104
+ GuardianPolicy,
105
+ CircuitBreakerConfig,
106
+ RateLimitConfig,
107
+ CostLimitConfig,
108
+ // Action types
109
+ ActionDefinition,
110
+ ActionParameter,
111
+ ParameterValidation,
112
+ // Intent types
113
+ IntentClassification,
114
+ // Detection types
115
+ PIIDetection,
116
+ PIIType,
117
+ PIILocation,
118
+ InjectionDetection,
119
+ InjectionType,
120
+ ContentFilterResult,
121
+ ContentFlag,
122
+ ContentCategory,
123
+ HallucinationCheckResult,
124
+ Citation,
125
+ UnsupportedClaim,
126
+ // Metrics types
127
+ GuardianMetrics,
128
+ CircuitBreakerState,
129
+ CostTracking,
130
+ // Event types
131
+ GuardianEventType,
132
+ GuardianEvent,
133
+ GuardianEventHandler,
134
+ // Interceptor types
135
+ InterceptedRequest,
136
+ InterceptedResponse,
137
+ InterceptedToolCall,
138
+ InterceptedAgentStep,
139
+ // Framework types
140
+ FrameworkType,
141
+ FrameworkIntegrationConfig,
142
+ } from './types';