@artemiskit/sdk 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,378 @@
1
+ /**
2
+ * Intent Classifier
3
+ *
4
+ * Analyzes AI/agent intent to determine what it's trying to accomplish,
5
+ * not just the literal action. Uses pattern matching and optionally LLM-based
6
+ * classification to detect potentially risky intents.
7
+ */
8
+
9
+ import type { ModelClient } from '@artemiskit/core';
10
+ import { nanoid } from 'nanoid';
11
+ import type { GuardrailResult, IntentClassification, Violation, ViolationSeverity } from './types';
12
+
13
/**
 * Describes one intent the classifier can recognise and how risky it is.
 */
export interface IntentCategory {
  /** Identifier reported as the `intent` of a classification. */
  name: string;
  /** Human-readable summary, reported as the `category` of a classification. */
  description: string;
  /** Severity attached to classifications produced for this category. */
  riskLevel: ViolationSeverity;
  /** Regular expressions tested against the analysed text. */
  patterns?: RegExp[];
  /** Phrases searched for in the analysed text, ignoring case. */
  keywords?: string[];
  /** Sample phrases for this intent (not read by the matching code in this file). */
  examples?: string[];
  /** Response when the intent is detected; `validate` falls back to `'warn'` when omitted. */
  action?: 'allow' | 'warn' | 'block';
}
25
+
26
/**
 * Options accepted by `IntentClassifier`.
 */
export interface IntentClassifierConfig {
  /** Extra categories, appended after the built-in defaults. */
  categories?: IntentCategory[];
  /** Enables the LLM pass; it only runs when `llmClient` is also provided. */
  useLLM?: boolean;
  /** Client used to run the LLM pass. */
  llmClient?: ModelClient;
  /** Classifications below this confidence are not reported as violations (default `0.7`). */
  confidenceThreshold?: number;
  /** Whether intents that are not known to the classifier should be blocked (default `false`). */
  blockUnknown?: boolean;
  /** Whether `high` and `critical` risk intents are blocked regardless of category action (default `true`). */
  blockHighRisk?: boolean;
}
43
+
44
/**
 * Default intent categories for common risky patterns.
 *
 * Patterns of the shape `verb … noun` allow, but do not require, words between
 * the verb and the noun (`\s+(?:.*\s+)?`), so both "execute command" and
 * "execute the following command" match. Keywords are plain phrases; how they
 * are matched is decided by the classifier, not here.
 */
const DEFAULT_CATEGORIES: IntentCategory[] = [
  {
    name: 'data_exfiltration',
    description: 'Attempting to extract or transmit sensitive data',
    riskLevel: 'critical',
    action: 'block',
    patterns: [
      /send\s+(all|every|the)\s+(data|information|records|files)/i,
      /export\s+(?:.*\s+)?(external|outside|remote)/i,
      /upload\s+(?:.*\s+)?(server|cloud|external)/i,
      /transfer\s+(?:.*\s+)?(data|files|records)/i,
    ],
    keywords: ['exfiltrate', 'steal', 'extract sensitive', 'leak data', 'send externally'],
  },
  {
    name: 'privilege_escalation',
    description: 'Attempting to gain elevated permissions',
    riskLevel: 'critical',
    action: 'block',
    patterns: [
      /become\s+(admin|root|superuser)/i,
      /grant\s+(?:.*\s+)?(admin|root|all)\s+permissions/i,
      /elevate\s+(?:.*\s+)?privileges/i,
      /bypass\s+(?:.*\s+)?(auth|permission|security)/i,
    ],
    keywords: ['sudo', 'admin access', 'root access', 'elevated privileges', 'bypass security'],
  },
  {
    name: 'system_modification',
    description: 'Attempting to modify system configuration',
    riskLevel: 'high',
    action: 'warn',
    patterns: [
      /modify\s+(?:.*\s+)?(system|config|settings)/i,
      /change\s+(?:.*\s+)?(permissions|ownership)/i,
      /install\s+(?:.*\s+)?(software|package|dependency)/i,
      /update\s+(?:.*\s+)?(system|os|kernel)/i,
    ],
    keywords: ['system config', 'install package', 'modify settings', 'change permissions'],
  },
  {
    name: 'destructive_action',
    description: 'Attempting destructive operations',
    riskLevel: 'critical',
    action: 'block',
    patterns: [
      /delete\s+(all|every|\*)/i,
      /drop\s+(table|database|collection)/i,
      /remove\s+(?:.*\s+)?(permanently|forever)/i,
      /destroy\s+(?:.*\s+)?(data|files|records)/i,
      /wipe\s+(?:.*\s+)?(clean|everything)/i,
    ],
    keywords: ['rm -rf', 'drop database', 'delete all', 'wipe clean', 'destroy'],
  },
  {
    name: 'sensitive_access',
    description: 'Attempting to access sensitive information',
    riskLevel: 'high',
    action: 'warn',
    patterns: [
      /access\s+(?:.*\s+)?(password|secret|key|credential)/i,
      /read\s+(?:.*\s+)?(\.env|config|secret)/i,
      /show\s+(?:.*\s+)?(password|api.?key|token)/i,
      /list\s+(?:.*\s+)?(credentials|secrets|keys)/i,
    ],
    keywords: ['api key', 'password', 'secret', 'credential', 'private key', 'access token'],
  },
  {
    name: 'reconnaissance',
    description: 'Gathering information about systems or infrastructure',
    riskLevel: 'medium',
    action: 'warn',
    patterns: [
      /scan\s+(?:.*\s+)?(network|ports|hosts)/i,
      /enumerate\s+(?:.*\s+)?(users|services|endpoints)/i,
      /discover\s+(?:.*\s+)?(systems|services|hosts)/i,
      /list\s+(?:.*\s+)?(all|every)\s+(user|service|endpoint)/i,
    ],
    keywords: ['port scan', 'network scan', 'enumerate', 'fingerprint', 'probe'],
  },
  {
    name: 'code_execution',
    description: 'Attempting to execute arbitrary code',
    riskLevel: 'critical',
    action: 'block',
    patterns: [
      /execute\s+(?:.*\s+)?(command|script|code)/i,
      /run\s+(?:.*\s+)?(shell|bash|command)/i,
      /eval\s*\(/i,
      /exec\s*\(/i,
    ],
    keywords: ['execute code', 'run command', 'shell command', 'eval', 'exec'],
  },
  {
    name: 'social_engineering',
    description: 'Attempting social engineering or manipulation',
    riskLevel: 'high',
    action: 'block',
    patterns: [
      /pretend\s+(to be|you are)/i,
      /impersonate\s+/i,
      /ignore\s+(?:.*\s+)?(instructions|rules|guidelines)/i,
      /forget\s+(?:.*\s+)?(previous|earlier)\s+(instructions|rules)/i,
    ],
    keywords: ['pretend to be', 'ignore instructions', 'forget rules', 'act as', 'jailbreak'],
  },
  {
    name: 'financial_transaction',
    description: 'Attempting financial operations',
    riskLevel: 'high',
    action: 'warn',
    patterns: [
      /transfer\s+(?:.*\s+)?(money|funds|payment)/i,
      /send\s+(?:.*\s+)?(payment|money)/i,
      /make\s+(?:.*\s+)?(purchase|payment|transaction)/i,
      /withdraw\s+(?:.*\s+)?(funds|money)/i,
    ],
    keywords: ['transfer funds', 'send payment', 'make purchase', 'withdraw money'],
  },
  {
    name: 'communication',
    description: 'Attempting to send communications',
    riskLevel: 'medium',
    action: 'warn',
    patterns: [
      /send\s+(?:.*\s+)?(email|message|notification)/i,
      /post\s+(?:.*\s+)?(message|comment|update)/i,
      /publish\s+(?:.*\s+)?(content|article|post)/i,
    ],
    keywords: ['send email', 'post message', 'publish content', 'notify'],
  },
];
179
+
180
/**
 * Intent Classifier
 *
 * Analyzes text to determine the underlying intent and assess risk.
 *
 * Each configured category is scored from its regular-expression and keyword
 * hits, and an optional LLM pass can contribute further classifications.
 * `validate` turns every classification at or above the confidence threshold
 * into a guardrail violation.
 */
export class IntentClassifier {
  /** Confidence contributed by a single matching pattern. */
  private static readonly PATTERN_WEIGHT = 0.8;
  /** Confidence contributed by a single matching keyword. */
  private static readonly KEYWORD_WEIGHT = 0.5;
  /** Threshold used when the configuration does not provide one. */
  private static readonly DEFAULT_CONFIDENCE_THRESHOLD = 0.7;
  /** Risk levels the LLM prompt asks for; anything else is replaced by `'medium'`. */
  private static readonly KNOWN_RISK_LEVELS: ReadonlySet<string> = new Set([
    'low',
    'medium',
    'high',
    'critical',
  ]);

  private config: IntentClassifierConfig;
  private categories: IntentCategory[];

  /**
   * @param config Optional settings. Custom categories are appended after the
   *   built-in defaults. Options that are omitted or explicitly `undefined`
   *   fall back to their defaults.
   */
  constructor(config: IntentClassifierConfig = {}) {
    this.config = {
      ...config,
      // Applied after the spread so an explicit `undefined` cannot erase a default.
      confidenceThreshold:
        config.confidenceThreshold ?? IntentClassifier.DEFAULT_CONFIDENCE_THRESHOLD,
      blockUnknown: config.blockUnknown ?? false,
      blockHighRisk: config.blockHighRisk ?? true,
    };
    this.categories = [...DEFAULT_CATEGORIES, ...(config.categories ?? [])];
  }

  /**
   * Classify the intent of a given text.
   *
   * Each pattern hit counts as evidence of strength 0.8 and each keyword hit
   * as evidence of strength 0.5. Evidence is combined as
   * `c = c + w * (1 - c)`, so one pattern yields 0.8, two keywords 0.75 and a
   * pattern plus a keyword 0.9. The score does not shrink as a category gains
   * more patterns or keywords.
   *
   * Keywords are matched as case-insensitive substrings.
   *
   * @param text Text to analyse.
   * @returns Classifications for every category with at least one hit, plus
   *   any LLM classifications, sorted by confidence descending.
   */
  async classify(text: string): Promise<IntentClassification[]> {
    const classifications: IntentClassification[] = [];
    const lowerText = text.toLowerCase();

    for (const category of this.categories) {
      let confidence = 0;
      let matches = 0;

      for (const pattern of category.patterns ?? []) {
        // Global/sticky regexes remember where the last match ended; reset so
        // repeated calls give the same answer for the same text.
        pattern.lastIndex = 0;
        if (pattern.test(text)) {
          matches++;
          confidence = IntentClassifier.combine(confidence, IntentClassifier.PATTERN_WEIGHT);
        }
      }

      for (const keyword of category.keywords ?? []) {
        if (lowerText.includes(keyword.toLowerCase())) {
          matches++;
          confidence = IntentClassifier.combine(confidence, IntentClassifier.KEYWORD_WEIGHT);
        }
      }

      if (matches > 0) {
        classifications.push({
          intent: category.name,
          confidence: Math.min(1, confidence),
          category: category.description,
          riskLevel: category.riskLevel,
        });
      }
    }

    // LLM-based classification if enabled
    if (this.config.useLLM && this.config.llmClient) {
      const llmClassifications = await this.classifyWithLLM(text);
      classifications.push(...llmClassifications);
    }

    // Sort by confidence descending
    classifications.sort((a, b) => b.confidence - a.confidence);

    return classifications;
  }

  /**
   * Create a guardrail function from this classifier.
   *
   * @returns A function that validates `content` with this classifier. The
   *   optional `context` argument is accepted but not used.
   */
  asGuardrail(): (content: string, context?: Record<string, unknown>) => Promise<GuardrailResult> {
    return (content: string) => this.validate(content);
  }

  /**
   * Validate content and return a guardrail result.
   *
   * A classification becomes a violation when its confidence reaches the
   * configured threshold and its category action is not `'allow'`. The
   * violation is blocking when any of the following holds:
   * - the category action is `'block'`;
   * - `blockHighRisk` is enabled and the risk level is `high` or `critical`;
   * - `blockUnknown` is enabled and the intent has no registered category
   *   (interpretation of "unknown intent" adopted here; such intents can only
   *   come from the LLM pass).
   *
   * @param text Text to validate.
   * @returns `passed` is `true` only when no violation was recorded; all
   *   classifications are exposed under `metadata.classifications`.
   */
  async validate(text: string): Promise<GuardrailResult> {
    const classifications = await this.classify(text);
    const threshold =
      this.config.confidenceThreshold ?? IntentClassifier.DEFAULT_CONFIDENCE_THRESHOLD;
    const violations: Violation[] = [];

    for (const classification of classifications) {
      if (classification.confidence < threshold) {
        continue;
      }

      const category = this.categories.find((c) => c.name === classification.intent);
      const action = category?.action ?? 'warn';
      if (action === 'allow') {
        continue;
      }

      const isHighRisk =
        classification.riskLevel === 'high' || classification.riskLevel === 'critical';
      const isUnknownIntent = category === undefined;
      const shouldBlock =
        action === 'block' ||
        (this.config.blockHighRisk === true && isHighRisk) ||
        (this.config.blockUnknown === true && isUnknownIntent);

      violations.push({
        id: nanoid(),
        type: 'intent_classification',
        severity: classification.riskLevel ?? 'medium',
        message: `Detected risky intent: ${classification.intent}`,
        details: {
          intent: classification.intent,
          confidence: classification.confidence,
          category: classification.category,
        },
        timestamp: new Date(),
        action: shouldBlock ? 'block' : 'warn',
        blocked: shouldBlock,
      });
    }

    return {
      passed: violations.length === 0,
      violations,
      metadata: { classifications },
    };
  }

  /**
   * Add a custom intent category. The category is appended to the list.
   */
  addCategory(category: IntentCategory): void {
    this.categories.push(category);
  }

  /**
   * Remove every intent category with the given name.
   */
  removeCategory(name: string): void {
    this.categories = this.categories.filter((c) => c.name !== name);
  }

  /**
   * Get all categories.
   *
   * @returns A shallow copy of the category list; the category objects
   *   themselves are shared with the classifier.
   */
  getCategories(): IntentCategory[] {
    return [...this.categories];
  }

  /**
   * Classify using LLM (for more nuanced understanding).
   *
   * The response is treated as untrusted: only array entries that are objects
   * with a non-empty string `intent` and a finite numeric `confidence` are
   * kept. Any failure (request error, malformed JSON) yields an empty list so
   * callers fall back to pattern-based results only.
   */
  private async classifyWithLLM(text: string): Promise<IntentClassification[]> {
    const client = this.config.llmClient;
    if (!client) {
      return [];
    }

    // JSON-escape the text so embedded quotes or newlines cannot close the
    // quoted region of the prompt. For plain text this is identical to
    // wrapping it in double quotes. This limits, but does not eliminate,
    // prompt injection.
    const quotedText = JSON.stringify(text);

    const prompt = `Analyze the following text and identify any potentially risky intents.
For each intent found, provide:
- intent: a short name for the intent
- confidence: a number between 0 and 1
- category: a brief description
- riskLevel: one of "low", "medium", "high", "critical"

Text to analyze:
${quotedText}

Respond with a JSON array of intent objects. If no risky intents are found, respond with an empty array [].`;

    try {
      const result = await client.generate({
        prompt,
        temperature: 0,
        maxTokens: 500,
      });

      // The model may wrap the array in prose; take the outermost [...] span.
      const jsonMatch = result.text.match(/\[[\s\S]*\]/);
      if (!jsonMatch) {
        return [];
      }

      const parsed: unknown = JSON.parse(jsonMatch[0]);
      if (!Array.isArray(parsed)) {
        return [];
      }

      const accepted: IntentClassification[] = [];
      for (const entry of parsed) {
        const classification = IntentClassifier.toClassification(entry);
        if (classification !== undefined) {
          accepted.push(classification);
        }
      }
      return accepted;
    } catch {
      // Deliberate best effort: fall back to pattern-based only
      return [];
    }
  }

  /** Combines existing confidence with one more piece of evidence of strength `weight`. */
  private static combine(current: number, weight: number): number {
    return current + weight * (1 - current);
  }

  /**
   * Converts one entry of the LLM response into a classification, or returns
   * `undefined` when the entry is unusable. Confidence is clamped to [0, 1];
   * a missing `category` falls back to the intent name and an unrecognised
   * `riskLevel` to `'medium'`.
   */
  private static toClassification(entry: unknown): IntentClassification | undefined {
    if (typeof entry !== 'object' || entry === null) {
      return undefined;
    }

    const { intent, confidence, category, riskLevel } = entry as Record<string, unknown>;
    if (typeof intent !== 'string' || intent.trim() === '') {
      return undefined;
    }
    if (typeof confidence !== 'number' || !Number.isFinite(confidence)) {
      return undefined;
    }

    const safeRiskLevel =
      typeof riskLevel === 'string' && IntentClassifier.KNOWN_RISK_LEVELS.has(riskLevel)
        ? riskLevel
        : 'medium';

    return {
      intent,
      confidence: Math.min(1, Math.max(0, confidence)),
      category: typeof category === 'string' ? category : intent,
      // Checked against the known levels above, so the narrowing is sound.
      riskLevel: safeRiskLevel as IntentClassification['riskLevel'],
    };
  }
}
372
+
373
/**
 * Factory for an `IntentClassifier`.
 *
 * @param config Options forwarded unchanged to the `IntentClassifier`
 *   constructor; defaults to an empty configuration.
 * @returns A newly constructed classifier.
 */
export function createIntentClassifier(config: IntentClassifierConfig = {}): IntentClassifier {
  const classifier = new IntentClassifier(config);
  return classifier;
}