palaryn 0.3.7 → 0.4.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (98)
  1. package/README.md +2 -1
  2. package/dist/src/auth/routes.d.ts.map +1 -1
  3. package/dist/src/auth/routes.js +5 -1
  4. package/dist/src/auth/routes.js.map +1 -1
  5. package/dist/src/config/defaults.d.ts.map +1 -1
  6. package/dist/src/config/defaults.js +7 -2
  7. package/dist/src/config/defaults.js.map +1 -1
  8. package/dist/src/dlp/composite-scanner.d.ts.map +1 -1
  9. package/dist/src/dlp/composite-scanner.js +26 -1
  10. package/dist/src/dlp/composite-scanner.js.map +1 -1
  11. package/dist/src/dlp/heuristic-scorer.d.ts +31 -0
  12. package/dist/src/dlp/heuristic-scorer.d.ts.map +1 -0
  13. package/dist/src/dlp/heuristic-scorer.js +314 -0
  14. package/dist/src/dlp/heuristic-scorer.js.map +1 -0
  15. package/dist/src/dlp/llm-classifier.d.ts +38 -0
  16. package/dist/src/dlp/llm-classifier.d.ts.map +1 -0
  17. package/dist/src/dlp/llm-classifier.js +152 -0
  18. package/dist/src/dlp/llm-classifier.js.map +1 -0
  19. package/dist/src/dlp/patterns.d.ts.map +1 -1
  20. package/dist/src/dlp/patterns.js +1 -0
  21. package/dist/src/dlp/patterns.js.map +1 -1
  22. package/dist/src/dlp/prompt-injection-backend.d.ts.map +1 -1
  23. package/dist/src/dlp/prompt-injection-backend.js +17 -0
  24. package/dist/src/dlp/prompt-injection-backend.js.map +1 -1
  25. package/dist/src/dlp/prompt-injection-patterns.d.ts.map +1 -1
  26. package/dist/src/dlp/prompt-injection-patterns.js +36 -0
  27. package/dist/src/dlp/prompt-injection-patterns.js.map +1 -1
  28. package/dist/src/dlp/regex-backend.d.ts.map +1 -1
  29. package/dist/src/dlp/regex-backend.js +2 -38
  30. package/dist/src/dlp/regex-backend.js.map +1 -1
  31. package/dist/src/dlp/scanner.d.ts.map +1 -1
  32. package/dist/src/dlp/scanner.js +38 -6
  33. package/dist/src/dlp/scanner.js.map +1 -1
  34. package/dist/src/dlp/text-normalizer.d.ts +10 -1
  35. package/dist/src/dlp/text-normalizer.d.ts.map +1 -1
  36. package/dist/src/dlp/text-normalizer.js +124 -2
  37. package/dist/src/dlp/text-normalizer.js.map +1 -1
  38. package/dist/src/mcp/http-transport.d.ts +2 -0
  39. package/dist/src/mcp/http-transport.d.ts.map +1 -1
  40. package/dist/src/mcp/http-transport.js +25 -6
  41. package/dist/src/mcp/http-transport.js.map +1 -1
  42. package/dist/src/policy/engine.d.ts.map +1 -1
  43. package/dist/src/policy/engine.js +109 -0
  44. package/dist/src/policy/engine.js.map +1 -1
  45. package/dist/src/saas/routes.d.ts.map +1 -1
  46. package/dist/src/saas/routes.js +19 -5
  47. package/dist/src/saas/routes.js.map +1 -1
  48. package/dist/src/server/app.d.ts.map +1 -1
  49. package/dist/src/server/app.js +7 -0
  50. package/dist/src/server/app.js.map +1 -1
  51. package/dist/src/server/gateway.d.ts +1 -0
  52. package/dist/src/server/gateway.d.ts.map +1 -1
  53. package/dist/src/server/gateway.js +160 -1
  54. package/dist/src/server/gateway.js.map +1 -1
  55. package/dist/src/types/config.d.ts +14 -1
  56. package/dist/src/types/config.d.ts.map +1 -1
  57. package/dist/tests/security/pentest-payloads.d.ts +46 -0
  58. package/dist/tests/security/pentest-payloads.d.ts.map +1 -0
  59. package/dist/tests/security/pentest-payloads.js +475 -0
  60. package/dist/tests/security/pentest-payloads.js.map +1 -0
  61. package/dist/tests/unit/adversarial-pipeline.test.d.ts +15 -0
  62. package/dist/tests/unit/adversarial-pipeline.test.d.ts.map +1 -0
  63. package/dist/tests/unit/adversarial-pipeline.test.js +1557 -0
  64. package/dist/tests/unit/adversarial-pipeline.test.js.map +1 -0
  65. package/dist/tests/unit/dlp-scanner.test.js +5 -5
  66. package/dist/tests/unit/gateway-branches.test.js +137 -0
  67. package/dist/tests/unit/gateway-branches.test.js.map +1 -1
  68. package/dist/tests/unit/heuristic-scorer.test.d.ts +2 -0
  69. package/dist/tests/unit/heuristic-scorer.test.d.ts.map +1 -0
  70. package/dist/tests/unit/heuristic-scorer.test.js +248 -0
  71. package/dist/tests/unit/heuristic-scorer.test.js.map +1 -0
  72. package/dist/tests/unit/llm-classifier.test.d.ts +2 -0
  73. package/dist/tests/unit/llm-classifier.test.d.ts.map +1 -0
  74. package/dist/tests/unit/llm-classifier.test.js +349 -0
  75. package/dist/tests/unit/llm-classifier.test.js.map +1 -0
  76. package/dist/tests/unit/prompt-injection-backend.test.js +122 -0
  77. package/dist/tests/unit/prompt-injection-backend.test.js.map +1 -1
  78. package/dist/tests/unit/text-normalizer.test.js +52 -1
  79. package/dist/tests/unit/text-normalizer.test.js.map +1 -1
  80. package/package.json +1 -1
  81. package/policy-packs/default.yaml +88 -0
  82. package/src/auth/routes.ts +6 -1
  83. package/src/config/defaults.ts +7 -2
  84. package/src/dlp/composite-scanner.ts +27 -1
  85. package/src/dlp/heuristic-scorer.ts +342 -0
  86. package/src/dlp/llm-classifier.ts +191 -0
  87. package/src/dlp/patterns.ts +1 -0
  88. package/src/dlp/prompt-injection-backend.ts +19 -1
  89. package/src/dlp/prompt-injection-patterns.ts +38 -0
  90. package/src/dlp/regex-backend.ts +2 -45
  91. package/src/dlp/scanner.ts +36 -6
  92. package/src/dlp/text-normalizer.ts +130 -2
  93. package/src/mcp/http-transport.ts +29 -6
  94. package/src/policy/engine.ts +102 -0
  95. package/src/saas/routes.ts +22 -5
  96. package/src/server/app.ts +7 -0
  97. package/src/server/gateway.ts +196 -1
  98. package/src/types/config.ts +15 -1
@@ -9,6 +9,8 @@ import { DLPScanner } from '../dlp/scanner';
9
9
  import { CompositeDLPScanner } from '../dlp/composite-scanner';
10
10
  import { DLPBackend } from '../dlp/interfaces';
11
11
  import { PromptInjectionBackend } from '../dlp/prompt-injection-backend';
12
+ import { HeuristicScorerBackend } from '../dlp/heuristic-scorer';
13
+ import { scorePromptInjection } from '../dlp/heuristic-scorer';
12
14
  import { TruffleHogBackend } from '../dlp/trufflehog-backend';
13
15
  import { BudgetManager, CostRecord } from '../budget/manager';
14
16
  import { UsageExtractor } from '../budget/usage-extractor';
@@ -29,6 +31,7 @@ import { UsageData } from '../types/tool-result';
29
31
  import { GatewayMetrics } from '../metrics';
30
32
  import { GatewayTracer } from '../tracing';
31
33
  import { AnomalyDetector } from '../anomaly';
34
+ import { LlmPromptInjectionClassifier } from '../dlp/llm-classifier';
32
35
  import { log as devLog, logger } from './logger';
33
36
 
34
37
  export interface PreExecuteResult {
@@ -118,6 +121,7 @@ export class Gateway {
118
121
  private rateLimitConfigStore?: RateLimitConfigStore;
119
122
  private budgetConfigStore?: BudgetConfigStore;
120
123
  private usageExtractor: UsageExtractor;
124
+ private llmClassifier?: LlmPromptInjectionClassifier;
121
125
  private inFlightCleanupInterval?: ReturnType<typeof setInterval>;
122
126
  /**
123
127
  * Tracks tool_call_ids currently being processed to prevent TOCTOU races.
@@ -137,6 +141,7 @@ export class Gateway {
137
141
  dlpBackends.push(new PromptInjectionBackend({
138
142
  scan_output: config.dlp.scan_output,
139
143
  }));
144
+ dlpBackends.push(new HeuristicScorerBackend());
140
145
  }
141
146
  if (config.dlp.trufflehog?.enabled) {
142
147
  dlpBackends.push(new TruffleHogBackend({
@@ -165,6 +170,14 @@ export class Gateway {
165
170
  this.opaEngine = new OPAEngine(config.policy.opa);
166
171
  }
167
172
 
173
+ // Set up LLM-based prompt injection classifier if enabled
174
+ if (config.dlp.llm_classifier?.enabled) {
175
+ this.llmClassifier = new LlmPromptInjectionClassifier(config.dlp.llm_classifier);
176
+ console.log(`[Gateway] LLM classifier enabled (model: ${config.dlp.llm_classifier.model || 'default'})`);
177
+ } else {
178
+ console.log(`[Gateway] LLM classifier disabled (PALARYN_LLM_API_KEY ${process.env.PALARYN_LLM_API_KEY ? 'set' : 'NOT set'})`);
179
+ }
180
+
168
181
  this.usageExtractor = new UsageExtractor(config.budget.token_pricing);
169
182
 
170
183
  // Set up executor registry with HTTP as default + catch-all fallback
@@ -310,7 +323,9 @@ export class Gateway {
310
323
  // Prompt injection blocking check (before policy, so it always runs)
311
324
  const piAction = this.config.dlp.prompt_injection_action || 'log';
312
325
  if (piAction === 'block' && argsDlp.detected.length > 0) {
313
- const piDetections = argsDlp.detected.filter((d: string) => d.startsWith('prompt_injection_'));
326
+ const piDetections = argsDlp.detected.filter((d: string) =>
327
+ d.startsWith('prompt_injection_') || d.startsWith('heuristic_prompt_injection')
328
+ );
314
329
  if (piDetections.length > 0) {
315
330
  const threshold = this.config.dlp.prompt_injection_block_threshold || 'high';
316
331
  const severityRank: Record<string, number> = { low: 0, medium: 1, high: 2 };
@@ -355,6 +370,115 @@ export class Gateway {
355
370
  }
356
371
  }
357
372
 
373
+ // Heuristic scoring — force LLM classifier when structural signals are elevated
374
+ let forceLlmClassification = false;
375
+ const inputText = JSON.stringify(toolCall);
376
+ const heuristicResult = scorePromptInjection(inputText);
377
+ if (heuristicResult.score >= 0.4) {
378
+ forceLlmClassification = true;
379
+ devLog.pipelineStep('🔍', 'HEURISTIC_SCORER',
380
+ `score=${heuristicResult.score.toFixed(2)} signals=[${heuristicResult.signals.join(',')}]`);
381
+
382
+ // Fallback: if heuristic score >= 0.5 and no LLM classifier is available, block directly.
383
+ // Without an LLM classifier the forceLlmClassification flag has no effect,
384
+ // so we treat a score of 0.5+ (3+ structural signals) as sufficient evidence to block.
385
+ if (heuristicResult.score >= 0.5 && !this.llmClassifier && piAction === 'block') {
386
+ const heuristicDetection = `heuristic_prompt_injection (score=${heuristicResult.score.toFixed(2)}, signals=${heuristicResult.signals.join(',')})`;
387
+ argsDlp.detected.push(heuristicDetection);
388
+ argsDlp.severity = this.maxSeverity(argsDlp.severity, 'high');
389
+ this.auditLogger.logDLPScanned(toolCall, [heuristicDetection], 'high', 0);
390
+ this.metrics?.recordDLPDetection('heuristic_prompt_injection', 'high');
391
+
392
+ devLog.pipelineStep('🛡️', 'HEURISTIC_BLOCK', `score=${heuristicResult.score.toFixed(2)} — no LLM classifier available, blocking`);
393
+ const durationSec = (Date.now() - startTime) / 1000;
394
+ this.metrics?.recordRequest('blocked', toolCall.tool.name, toolCall.tool.capability, durationSec);
395
+ const result = this.buildResult(toolCall, 'blocked', {
396
+ decision: 'deny',
397
+ rule_id: 'heuristic_prompt_injection_block',
398
+ rule_name: 'Heuristic prompt injection detected',
399
+ reasons: [`Heuristic prompt injection detected: ${heuristicResult.signals.join(', ')} (score=${heuristicResult.score.toFixed(2)})`],
400
+ }, startTime, undefined,
401
+ `Blocked by heuristic prompt injection scorer (score=${heuristicResult.score.toFixed(2)}, signals: ${heuristicResult.signals.join(', ')})`,
402
+ undefined, argsDlp);
403
+ devLog.pipelineEnd('blocked', Date.now() - startTime);
404
+ return { allowed: false, result, stepTimings, startTime };
405
+ }
406
+ }
407
+
408
+ // LLM-based prompt injection classification on INPUT (async, runs after sync DLP scan)
409
+ if ((this.llmClassifier && this.config.dlp.llm_classifier?.scan_input !== false) || (forceLlmClassification && this.llmClassifier)) {
410
+ const llmInputStart = Date.now();
411
+ const llmInputResult = await asyncChildSpan(otel, 'gateway.llm_classifier_input', () =>
412
+ this.llmClassifier!.classify(inputText)
413
+ );
414
+ stepTimings.llm_classifier_input = Date.now() - llmInputStart;
415
+ const llmInputClassifications = llmInputResult.classifications;
416
+
417
+ // Degraded signal: LLM classifier failed while heuristic flagged suspicious content
418
+ if (llmInputResult.error && forceLlmClassification) {
419
+ const degradedName = 'llm_classifier_unavailable';
420
+ argsDlp.detected.push(degradedName);
421
+ argsDlp.severity = this.maxSeverity(argsDlp.severity, 'medium');
422
+ this.auditLogger.logDLPScanned(toolCall, [degradedName], 'medium', 0);
423
+ this.metrics?.recordDLPDetection(degradedName, 'medium');
424
+ devLog.pipelineStep('⚠️', 'LLM_CLASSIFIER_UNAVAILABLE',
425
+ `LLM classifier failed with heuristic score=${heuristicResult.score.toFixed(2)} — emitting degraded signal`);
426
+ }
427
+
428
+ if (llmInputClassifications.length > 0) {
429
+ const llmDetections = LlmPromptInjectionClassifier.toDLPDetections(llmInputClassifications, inputText);
430
+ const llmDetectionNames = llmDetections.map(d => d.pattern_name);
431
+ argsDlp.detected.push(...llmDetectionNames);
432
+
433
+ for (const d of llmDetections) {
434
+ argsDlp.severity = this.maxSeverity(argsDlp.severity, d.severity);
435
+ }
436
+
437
+ this.auditLogger.logDLPScanned(toolCall, llmDetectionNames, argsDlp.severity, 0);
438
+ for (const name of llmDetectionNames) {
439
+ this.metrics?.recordDLPDetection(name, argsDlp.severity);
440
+ }
441
+
442
+ // Re-check prompt injection blocking with LLM classifier detections.
443
+ // LLM classifier uses 'high' threshold to reduce false positives on benign content —
444
+ // it's more aggressive than regex/heuristic and needs higher confidence to block.
445
+ if (piAction === 'block') {
446
+ const llmPiDetections = argsDlp.detected.filter((d: string) => d.startsWith('llm_classifier_'));
447
+ if (llmPiDetections.length > 0) {
448
+ // Use max of configured threshold and 'high' — LLM-only blocks require high severity
449
+ const configThreshold = this.config.dlp.prompt_injection_block_threshold || 'high';
450
+ const severityRank: Record<string, number> = { low: 0, medium: 1, high: 2 };
451
+ const hasHeuristicDetection = argsDlp.detected.some((d: string) => d.startsWith('heuristic_prompt_injection'));
452
+ // If heuristic also flagged it, trust the LLM at configured threshold;
453
+ // otherwise require 'high' to avoid false positives on benign content.
454
+ const threshold = hasHeuristicDetection ? configThreshold : 'high';
455
+ const thresholdRank = severityRank[threshold] ?? 2;
456
+ const maxLlmSeverity = llmInputClassifications.reduce((max, c) => {
457
+ const rank = severityRank[c.severity] ?? 0;
458
+ return rank > max ? rank : max;
459
+ }, 0);
460
+
461
+ if (maxLlmSeverity >= thresholdRank) {
462
+ devLog.pipelineStep('🛡️', 'LLM_CLASSIFIER_INPUT_BLOCK',
463
+ `Blocked: ${llmPiDetections.join(', ')} (severity: ${argsDlp.severity}, threshold: ${threshold})`);
464
+ const durationSec = (Date.now() - startTime) / 1000;
465
+ this.metrics?.recordRequest('blocked', toolCall.tool.name, toolCall.tool.capability, durationSec);
466
+ const result = this.buildResult(toolCall, 'blocked', {
467
+ decision: 'deny',
468
+ rule_id: 'llm_classifier_input_block',
469
+ rule_name: 'LLM classifier detected prompt injection in input',
470
+ reasons: [`LLM classifier detected: ${llmPiDetections.join(', ')}`],
471
+ }, startTime, undefined,
472
+ `Blocked by LLM classifier: ${llmPiDetections.join(', ')} (severity: ${argsDlp.severity})`,
473
+ undefined, argsDlp);
474
+ devLog.pipelineEnd('blocked', Date.now() - startTime);
475
+ return { allowed: false, result, stepTimings, startTime };
476
+ }
477
+ }
478
+ }
479
+ }
480
+ }
481
+
358
482
  // Policy evaluation — DLP context is passed so DLP-conditioned rules
359
483
  // compete with all other rules in a single priority-ordered pass.
360
484
  stepStart = Date.now();
@@ -533,6 +657,77 @@ export class Gateway {
533
657
  }
534
658
  stepTimings.dlp_out = Date.now() - stepStart;
535
659
 
660
+ // LLM-based prompt injection classification (async, runs after sync DLP scan)
661
+ if (this.llmClassifier && output.body && this.config.dlp.llm_classifier?.scan_output !== false) {
662
+ const llmStart = Date.now();
663
+ const text = typeof output.body === 'string' ? output.body : JSON.stringify(output.body);
664
+ const llmOutputResult = await asyncChildSpan(otel, 'gateway.llm_classifier', () =>
665
+ this.llmClassifier!.classify(text)
666
+ );
667
+ stepTimings.llm_classifier = Date.now() - llmStart;
668
+ const llmClassifications = llmOutputResult.classifications;
669
+
670
+ if (llmClassifications.length > 0) {
671
+ const llmDetections = LlmPromptInjectionClassifier.toDLPDetections(llmClassifications, text);
672
+ const llmDetectionNames = llmDetections.map(d => d.pattern_name);
673
+ outputDlp.detected.push(...llmDetectionNames);
674
+
675
+ // Recalculate severity: take the max of existing and LLM detections
676
+ for (const d of llmDetections) {
677
+ outputDlp.severity = this.maxSeverity(outputDlp.severity, d.severity);
678
+ }
679
+
680
+ this.auditLogger.logDLPScanned(toolCall, llmDetectionNames, outputDlp.severity, 0);
681
+ for (const name of llmDetectionNames) {
682
+ this.metrics?.recordDLPDetection(name, outputDlp.severity);
683
+ }
684
+ }
685
+ }
686
+
687
+ // Prompt injection blocking check on output (regex + LLM classifier detections)
688
+ const piOutputAction = this.config.dlp.prompt_injection_action || 'log';
689
+ if (piOutputAction === 'block' && outputDlp.detected.length > 0) {
690
+ const piDetections = outputDlp.detected.filter((d: string) =>
691
+ d.startsWith('prompt_injection_') || d.startsWith('llm_classifier_')
692
+ );
693
+ if (piDetections.length > 0) {
694
+ const threshold = this.config.dlp.prompt_injection_block_threshold || 'medium';
695
+ const severityRank: Record<string, number> = { low: 0, medium: 1, high: 2 };
696
+ const thresholdRank = severityRank[threshold] ?? 1;
697
+ const maxSeverityRank = severityRank[outputDlp.severity] ?? 0;
698
+
699
+ if (maxSeverityRank >= thresholdRank) {
700
+ // Release budget reservation since we're blocking
701
+ if (reservationKey) {
702
+ this.budgetManager.commitReservation(reservationKey, 0);
703
+ }
704
+
705
+ devLog.pipelineStep('🛡️', 'PROMPT_INJECTION_OUTPUT_BLOCK',
706
+ `Blocked output: ${piDetections.join(', ')} (severity: ${outputDlp.severity}, threshold: ${threshold})`);
707
+ const durationSec = (Date.now() - startTime) / 1000;
708
+ this.metrics?.recordRequest('blocked', toolCall.tool.name, toolCall.tool.capability, durationSec);
709
+ const defaultPolicy = policyResult || { decision: 'allow' as const, rule_id: 'passthrough', rule_name: 'Passthrough', reasons: [] };
710
+ const argsDlpSafe = argsDlp || { detected: [], redactions: [], severity: 'low' as DLPSeverity };
711
+ const mergedDlp = {
712
+ detected: [...new Set([...argsDlpSafe.detected, ...outputDlp.detected])],
713
+ redactions: [...argsDlpSafe.redactions, ...outputDlp.redactions],
714
+ severity: this.maxSeverity(argsDlpSafe.severity, outputDlp.severity),
715
+ };
716
+ const result = this.buildResult(toolCall, 'blocked', {
717
+ decision: 'deny',
718
+ rule_id: 'prompt_injection_output_block',
719
+ rule_name: 'Prompt injection detected in output',
720
+ reasons: [`Prompt injection detected in output: ${piDetections.join(', ')}`],
721
+ }, startTime, undefined,
722
+ `Blocked by prompt injection detection in output: ${piDetections.join(', ')} (severity: ${outputDlp.severity})`,
723
+ undefined, mergedDlp);
724
+ this.auditLogger.logToolResultReturned(toolCall, 'blocked', Date.now() - startTime, { prompt_injection_blocked: true, detections: piDetections });
725
+ devLog.pipelineEnd('blocked', Date.now() - startTime);
726
+ return result;
727
+ }
728
+ }
729
+ }
730
+
536
731
  // Extract usage data from response
537
732
  const headerUsage = this.usageExtractor.extractFromHeaders(output.headers);
538
733
  const bodyUsage = this.usageExtractor.extractFromBody(output.body);
@@ -140,6 +140,16 @@ export interface OPAConfig {
140
140
  package_name?: string;
141
141
  }
142
142
 
143
+ export interface LlmClassifierConfig {
144
+ enabled: boolean;
145
+ model?: string;
146
+ confidence_threshold?: number;
147
+ /** Run LLM classifier on input/args (default: true) */
148
+ scan_input?: boolean;
149
+ /** Run LLM classifier on output (default: true) */
150
+ scan_output?: boolean;
151
+ }
152
+
143
153
  export interface DLPConfig {
144
154
  enabled: boolean;
145
155
  scan_args: boolean;
@@ -149,7 +159,7 @@ export interface DLPConfig {
149
159
  prompt_injection_detection?: boolean;
150
160
  /** Action when prompt injection is detected: 'log' (default), 'flag', or 'block' */
151
161
  prompt_injection_action?: 'log' | 'flag' | 'block';
152
- /** Minimum severity to trigger blocking (when action is 'block'). Default: 'high' */
162
+ /** Minimum severity to trigger blocking (when action is 'block'). Default: 'medium' */
153
163
  prompt_injection_block_threshold?: 'medium' | 'high';
154
164
  /** Response mode when injection is blocked: 'deny' (default), 'sanitize', or 'require_approval' */
155
165
  prompt_injection_response?: 'deny' | 'sanitize' | 'require_approval';
@@ -159,6 +169,10 @@ export interface DLPConfig {
159
169
  binary_path?: string;
160
170
  timeout_ms?: number;
161
171
  };
172
+ /** Maximum scan depth for nested structures (default: 64) */
173
+ max_scan_depth?: number;
174
+ /** LLM-based prompt injection classifier (async, semantic analysis) */
175
+ llm_classifier?: LlmClassifierConfig;
162
176
  }
163
177
 
164
178
  export interface AuditConfig {