npm - ai-shield-core - Versions diffs - 0.1.0 - Mend

ai-shield-core 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (78) hide show

package/dist/audit/logger.d.ts +40 -0
package/dist/audit/logger.d.ts.map +1 -0
package/dist/audit/logger.js +100 -0
package/dist/audit/logger.js.map +1 -0
package/dist/audit/types.d.ts +12 -0
package/dist/audit/types.d.ts.map +1 -0
package/dist/audit/types.js +3 -0
package/dist/audit/types.js.map +1 -0
package/dist/cache/lru.d.ts +27 -0
package/dist/cache/lru.d.ts.map +1 -0
package/dist/cache/lru.js +74 -0
package/dist/cache/lru.js.map +1 -0
package/dist/cost/anomaly.d.ts +10 -0
package/dist/cost/anomaly.d.ts.map +1 -0
package/dist/cost/anomaly.js +42 -0
package/dist/cost/anomaly.js.map +1 -0
package/dist/cost/pricing.d.ts +7 -0
package/dist/cost/pricing.d.ts.map +1 -0
package/dist/cost/pricing.js +51 -0
package/dist/cost/pricing.js.map +1 -0
package/dist/cost/tracker.d.ts +24 -0
package/dist/cost/tracker.d.ts.map +1 -0
package/dist/cost/tracker.js +136 -0
package/dist/cost/tracker.js.map +1 -0
package/dist/index.d.ts +18 -0
package/dist/index.d.ts.map +1 -0
package/dist/index.js +59 -0
package/dist/index.js.map +1 -0
package/dist/policy/engine.d.ts +36 -0
package/dist/policy/engine.d.ts.map +1 -0
package/dist/policy/engine.js +127 -0
package/dist/policy/engine.js.map +1 -0
package/dist/policy/tools.d.ts +25 -0
package/dist/policy/tools.d.ts.map +1 -0
package/dist/policy/tools.js +158 -0
package/dist/policy/tools.js.map +1 -0
package/dist/scanner/canary.d.ts +9 -0
package/dist/scanner/canary.d.ts.map +1 -0
package/dist/scanner/canary.js +19 -0
package/dist/scanner/canary.js.map +1 -0
package/dist/scanner/chain.d.ts +17 -0
package/dist/scanner/chain.d.ts.map +1 -0
package/dist/scanner/chain.js +69 -0
package/dist/scanner/chain.js.map +1 -0
package/dist/scanner/heuristic.d.ts +28 -0
package/dist/scanner/heuristic.d.ts.map +1 -0
package/dist/scanner/heuristic.js +375 -0
package/dist/scanner/heuristic.js.map +1 -0
package/dist/scanner/pii.d.ts +17 -0
package/dist/scanner/pii.d.ts.map +1 -0
package/dist/scanner/pii.js +255 -0
package/dist/scanner/pii.js.map +1 -0
package/dist/shield.d.ts +31 -0
package/dist/shield.d.ts.map +1 -0
package/dist/shield.js +184 -0
package/dist/shield.js.map +1 -0
package/dist/types.d.ts +182 -0
package/dist/types.d.ts.map +1 -0
package/dist/types.js +6 -0
package/dist/types.js.map +1 -0
package/package.json +27 -0
package/src/audit/logger.ts +135 -0
package/src/audit/schema.sql +51 -0
package/src/audit/types.ts +16 -0
package/src/cache/lru.ts +93 -0
package/src/cost/anomaly.ts +57 -0
package/src/cost/pricing.ts +58 -0
package/src/cost/tracker.ts +182 -0
package/src/index.ts +91 -0
package/src/policy/engine.ts +163 -0
package/src/policy/tools.ts +189 -0
package/src/scanner/canary.ts +30 -0
package/src/scanner/chain.ts +88 -0
package/src/scanner/heuristic.ts +427 -0
package/src/scanner/pii.ts +313 -0
package/src/shield.ts +228 -0
package/src/types.ts +242 -0
package/tsconfig.json +8 -0

package/src/scanner/heuristic.ts ADDED Viewed

@@ -0,0 +1,427 @@
+import type { Scanner, ScannerResult, ScanContext, Violation } from "../types.js";
+// ============================================================
+// Heuristic Prompt Injection Scanner
+// Score-based: multiple matches = higher confidence
+// ============================================================
+/** A single detection rule: a regex plus the score it contributes when it matches. */
+interface PatternRule {
+  /** Rule identifier; echoed in the violation detail and returned by getPatternIds(). */
+  id: string;
+  /** Injection family the rule belongs to; echoed in the violation detail. */
+  category: InjectionCategory;
+  /** Regex tested against the scanned input. */
+  pattern: RegExp;
+  /** Amount added to the input's total score when the pattern matches. */
+  weight: number;
+  /** Short human-readable label; used as the violation message. */
+  description: string;
+}
+/**
+ * Families of prompt-injection technique used to tag each rule.
+ * The value is interpolated into the `detail` string of a reported violation.
+ */
+type InjectionCategory =
+  | "instruction_override"
+  | "role_manipulation"
+  | "system_prompt_extraction"
+  | "encoding_evasion"
+  | "delimiter_injection"
+  | "context_manipulation"
+  | "output_manipulation"
+  | "tool_abuse";
+// Built-in detection rules, grouped by category. HeuristicScanner tests every
+// rule against the input and sums the weights of those that match, so several
+// weak signals can add up to a block. Rules are evaluated in array order, which
+// is also the order of the reported violations.
+const PATTERNS: PatternRule[] = [
+  // --- Instruction Override (weights 0.15-0.25) ---
+  {
+    id: "INJ-001",
+    category: "instruction_override",
+    pattern: /ignore\s+(all\s+)?(previous|prior|above|earlier|preceding)\s+(instructions?|prompts?|rules?|guidelines?|context)/i,
+    weight: 0.25,
+    description: "Ignore previous instructions",
+  },
+  {
+    id: "INJ-002",
+    category: "instruction_override",
+    pattern: /disregard\s+(all\s+|your\s+)?(previous|prior|above|earlier)\s+(instructions?|context|rules?|guidelines?)/i,
+    weight: 0.25,
+    description: "Disregard instructions",
+  },
+  {
+    id: "INJ-003",
+    category: "instruction_override",
+    pattern: /forget\s+(everything|all|your)\s+(about\s+)?(instructions?|rules?|training|guidelines?|constraints?)/i,
+    weight: 0.25,
+    description: "Forget instructions",
+  },
+  {
+    id: "INJ-004",
+    category: "instruction_override",
+    pattern: /override\s+(your|the|all)\s+(instructions?|rules?|safety|guidelines?|constraints?|policies)/i,
+    weight: 0.25,
+    description: "Override instructions",
+  },
+  {
+    id: "INJ-005",
+    category: "instruction_override",
+    pattern: /do\s+not\s+follow\s+(your|the|any)\s+(previous|original|initial)\s+(instructions?|rules?|guidelines?)/i,
+    weight: 0.25,
+    description: "Do not follow instructions",
+  },
+  // The next three are weaker signals (lower weights): the phrases also occur in benign text.
+  {
+    id: "INJ-006",
+    category: "instruction_override",
+    pattern: /new\s+(instructions?|rules?|guidelines?)[\s:]+/i,
+    weight: 0.15,
+    description: "New instructions prefix",
+  },
+  {
+    id: "INJ-007",
+    category: "instruction_override",
+    pattern: /from\s+now\s+on[,\s]+(you\s+)?(will|must|should|are|shall)/i,
+    weight: 0.20,
+    description: "From now on directive",
+  },
+  {
+    id: "INJ-008",
+    category: "instruction_override",
+    pattern: /instead[,\s]+(you\s+)?(will|must|should|shall)\s+/i,
+    weight: 0.15,
+    description: "Instead directive",
+  },
+  // --- Role Manipulation (weights 0.20-0.35) ---
+  {
+    id: "ROLE-001",
+    category: "role_manipulation",
+    pattern: /you\s+are\s+now\s+(a|an|the|my)\s+/i,
+    weight: 0.25,
+    description: "You are now [role]",
+  },
+  {
+    id: "ROLE-002",
+    category: "role_manipulation",
+    pattern: /pretend\s+(you('re|\s+are)\s+|to\s+be\s+)/i,
+    weight: 0.25,
+    description: "Pretend to be",
+  },
+  {
+    id: "ROLE-003",
+    category: "role_manipulation",
+    pattern: /act\s+as\s+(a|an|if\s+you\s+were)\s+/i,
+    weight: 0.20,
+    description: "Act as [role]",
+  },
+  {
+    id: "ROLE-004",
+    category: "role_manipulation",
+    // Allows up to 30 arbitrary characters between "switch to" and "mode".
+    pattern: /switch\s+to\s+.{0,30}\s+mode/i,
+    weight: 0.25,
+    description: "Switch to [X] mode",
+  },
+  {
+    id: "ROLE-005",
+    category: "role_manipulation",
+    pattern: /enter\s+(developer|debug|admin|god|sudo|root|jailbreak|DAN)\s+mode/i,
+    weight: 0.35,
+    description: "Enter privileged mode",
+  },
+  {
+    id: "ROLE-006",
+    category: "role_manipulation",
+    pattern: /you\s+have\s+no\s+(restrictions?|limitations?|rules?|constraints?|filters?)/i,
+    weight: 0.30,
+    description: "No restrictions claim",
+  },
+  {
+    id: "ROLE-007",
+    category: "role_manipulation",
+    // `.` does not match newlines, so "DAN" and the follow-up phrase must be on
+    // the same line; the `i` flag means lowercase "dan" matches as well.
+    pattern: /\bDAN\b.*\b(do\s+anything|no\s+(restrictions?|limits?|filters?))/i,
+    weight: 0.35,
+    description: "DAN jailbreak pattern",
+  },
+  // --- System Prompt Extraction (weight: 0.30 each) ---
+  {
+    id: "EXTRACT-001",
+    category: "system_prompt_extraction",
+    pattern: /what\s+(is|are)\s+your\s+(system\s+)?(prompt|instructions?|rules?|guidelines?|directives?)/i,
+    weight: 0.30,
+    description: "What is your system prompt",
+  },
+  {
+    id: "EXTRACT-002",
+    category: "system_prompt_extraction",
+    pattern: /show\s+(me\s+)?your\s+(system\s+)?(prompt|instructions?|config(uration)?|rules?)/i,
+    weight: 0.30,
+    description: "Show your prompt",
+  },
+  {
+    id: "EXTRACT-003",
+    category: "system_prompt_extraction",
+    pattern: /repeat\s+(the|your)\s+(system\s+|initial\s+|original\s+)?(prompt|instructions?|message|rules?)\s*(word\s+for\s+word|verbatim|exactly)?/i,
+    weight: 0.30,
+    description: "Repeat your prompt",
+  },
+  {
+    id: "EXTRACT-004",
+    category: "system_prompt_extraction",
+    pattern: /output\s+(the|your)\s+(initial|system|original|first)\s+(prompt|instructions?|message)/i,
+    weight: 0.30,
+    description: "Output initial prompt",
+  },
+  {
+    id: "EXTRACT-005",
+    category: "system_prompt_extraction",
+    pattern: /print\s+(your|the)\s+(system\s+)?(prompt|instructions?|configuration)/i,
+    weight: 0.30,
+    description: "Print system prompt",
+  },
+  {
+    id: "EXTRACT-006",
+    category: "system_prompt_extraction",
+    pattern: /tell\s+me\s+(your|the)\s+(exact\s+)?(system\s+)?(prompt|instructions?|wording)/i,
+    weight: 0.30,
+    description: "Tell me your prompt",
+  },
+  {
+    id: "EXTRACT-007",
+    category: "system_prompt_extraction",
+    pattern: /copy\s+(and\s+)?(paste|output)\s+(your|the)\s+(system\s+)?(prompt|instructions?)/i,
+    weight: 0.30,
+    description: "Copy paste your prompt",
+  },
+  // --- Encoding Evasion (weights 0.10-0.30) ---
+  {
+    id: "ENCODE-001",
+    category: "encoding_evasion",
+    pattern: /(?:decode|translate|convert|interpret)\s+(?:this|the\s+following)\s+(?:from\s+)?(?:base64|rot13|hex|binary|morse|unicode|ascii|url.?encoded)/i,
+    weight: 0.20,
+    description: "Decode from encoding",
+  },
+  {
+    id: "ENCODE-002",
+    category: "encoding_evasion",
+    pattern: /(?:execute|follow|obey|run)\s+(?:the\s+)?(?:decoded|hidden|encoded|secret)\s+(?:instructions?|commands?|text)/i,
+    weight: 0.30,
+    description: "Execute decoded instructions",
+  },
+  {
+    id: "ENCODE-003",
+    category: "encoding_evasion",
+    // Matches any run of 50+ characters from the Base64 alphabet, so it fires on
+    // any sufficiently long unbroken alphanumeric token, not only real Base64;
+    // the low weight reflects that.
+    pattern: /[A-Za-z0-9+/]{50,}={0,2}/,
+    weight: 0.10,
+    description: "Long Base64 string detected",
+  },
+  // --- Delimiter Injection (weights 0.30-0.35) ---
+  {
+    id: "DELIM-001",
+    category: "delimiter_injection",
+    pattern: /```\s*system\b/i,
+    weight: 0.30,
+    description: "Code block system injection",
+  },
+  {
+    id: "DELIM-002",
+    category: "delimiter_injection",
+    pattern: /\[SYSTEM\]/i,
+    weight: 0.30,
+    description: "[SYSTEM] tag injection",
+  },
+  {
+    id: "DELIM-003",
+    category: "delimiter_injection",
+    pattern: /<\/?system>/i,
+    weight: 0.30,
+    description: "<system> tag injection",
+  },
+  {
+    id: "DELIM-004",
+    category: "delimiter_injection",
+    pattern: /###\s*(?:SYSTEM|INSTRUCTION|ADMIN|OVERRIDE)/i,
+    weight: 0.30,
+    description: "### SYSTEM header injection",
+  },
+  {
+    id: "DELIM-005",
+    category: "delimiter_injection",
+    pattern: /<\|(?:im_start|im_end|system|endoftext)\|>/i,
+    weight: 0.35,
+    description: "ChatML special token injection",
+  },
+  {
+    id: "DELIM-006",
+    category: "delimiter_injection",
+    pattern: /\[INST\]|\[\/INST\]|<<SYS>>|<\/s>/i,
+    weight: 0.35,
+    description: "Llama special token injection",
+  },
+  // --- Context Manipulation (weights 0.10-0.20) ---
+  {
+    id: "CTX-001",
+    category: "context_manipulation",
+    pattern: /(?:imagine|suppose|assume|consider)\s+(?:that\s+)?(?:you\s+(?:are|have|were|can)|there\s+(?:are|is))\s+(?:no\s+)?(?:restrictions?|limitations?|rules?|safety|filters?)/i,
+    weight: 0.20,
+    description: "Hypothetical no-restrictions",
+  },
+  {
+    id: "CTX-002",
+    category: "context_manipulation",
+    pattern: /(?:in\s+a\s+)?(?:hypothetical|fictional|imaginary|theoretical)\s+(?:scenario|world|situation|context)/i,
+    weight: 0.10,
+    description: "Hypothetical framing",
+  },
+  {
+    id: "CTX-003",
+    category: "context_manipulation",
+    pattern: /(?:for\s+)?(?:educational|research|academic|testing|security\s+testing)\s+purposes?\s+only/i,
+    weight: 0.10,
+    description: "Educational purpose claim",
+  },
+  {
+    id: "CTX-004",
+    category: "context_manipulation",
+    pattern: /this\s+is\s+(?:a\s+)?(?:test|simulation|drill|exercise|safe\s+environment)/i,
+    weight: 0.10,
+    description: "Test environment claim",
+  },
+  // --- Output Manipulation (weights 0.05-0.25) ---
+  {
+    id: "OUT-001",
+    category: "output_manipulation",
+    pattern: /(?:respond|reply|answer|output)\s+(?:only\s+)?(?:with|in)\s+(?:json|xml|html|markdown|code|raw)/i,
+    weight: 0.05,
+    description: "Format forcing (low risk)",
+  },
+  {
+    id: "OUT-002",
+    category: "output_manipulation",
+    pattern: /do\s+not\s+(?:mention|include|add|say|output)\s+(?:any\s+)?(?:warnings?|disclaimers?|caveats?|notes?|safety)/i,
+    weight: 0.20,
+    description: "Suppress safety disclaimers",
+  },
+  {
+    id: "OUT-003",
+    category: "output_manipulation",
+    pattern: /(?:never|don't|do\s+not)\s+(?:refuse|decline|reject|deny|say\s+(?:no|you\s+can't))/i,
+    weight: 0.25,
+    description: "Never refuse requests",
+  },
+  // --- Tool Abuse (weights 0.30-0.35) ---
+  {
+    id: "TOOL-001",
+    category: "tool_abuse",
+    // The trailing [\s_] requires the destructive verb to be followed by
+    // whitespace or an underscore (e.g. "run delete_user").
+    pattern: /(?:call|execute|run|invoke|use)\s+(?:the\s+)?(?:delete|remove|drop|destroy|wipe|kill|shutdown)[\s_]/i,
+    weight: 0.30,
+    description: "Destructive tool invocation",
+  },
+  {
+    id: "TOOL-002",
+    category: "tool_abuse",
+    pattern: /(?:send|forward|exfiltrate|leak|transmit)\s+(?:all\s+)?(?:the\s+)?(?:data|information|credentials?|secrets?|keys?|tokens?|api.?keys?)\s+(?:to|via|and)/i,
+    weight: 0.35,
+    description: "Data exfiltration attempt",
+  },
+  {
+    id: "TOOL-003",
+    category: "tool_abuse",
+    pattern: /(?:access|read|fetch|get|retrieve)\s+(?:the\s+)?(?:env|environment|\.env|secrets?|api.?keys?|credentials?|passwords?|tokens?)/i,
+    weight: 0.30,
+    description: "Credential access attempt",
+  },
+];
+// Thresholds per strictness level.
+// An input whose total score reaches the threshold is blocked; one that reaches
+// 60% of it is warned. A lower number therefore means a stricter scanner.
+// "medium" is the level used when no strictness is configured.
+const THRESHOLDS: Record<string, number> = {
+  low: 0.5,
+  medium: 0.3,
+  high: 0.15,
+};
+/** Options accepted by the HeuristicScanner constructor. All fields are optional. */
+export interface HeuristicConfig {
+  /** Named sensitivity preset that selects a threshold; "medium" when omitted. */
+  strictness?: "low" | "medium" | "high";
+  /** Explicit block threshold; when provided it takes precedence over `strictness`. */
+  threshold?: number;
+  /** Extra rules appended after the built-in rule set. */
+  customPatterns?: PatternRule[];
+}
+/**
+ * Regex-driven prompt-injection scanner.
+ *
+ * Every rule whose pattern matches the input adds its weight to a running
+ * score and is reported as a violation. Structural signals (newline count,
+ * markdown headers, role-like markers, input length) add to the score too.
+ * The score is capped at 1.0 and compared with the configured threshold to
+ * choose between "block", "warn" and "allow".
+ */
+export class HeuristicScanner implements Scanner {
+  readonly name = "heuristic";
+  private patterns: PatternRule[];
+  private threshold: number;
+  /**
+   * @param config Optional settings. An explicit `threshold` takes precedence
+   *   over `strictness`; with neither, the "medium" threshold is used.
+   *   `customPatterns` are evaluated after the built-in rules.
+   */
+  constructor(config: HeuristicConfig = {}) {
+    this.patterns = [...PATTERNS, ...(config.customPatterns ?? [])];
+    const presetThreshold = THRESHOLDS[config.strictness ?? "medium"] ?? 0.3;
+    const explicitThreshold = config.threshold;
+    // A NaN threshold would make every `>=` comparison below false and turn the
+    // scanner into a silent always-"allow"; fall back to the preset instead.
+    this.threshold =
+      explicitThreshold == null || Number.isNaN(explicitThreshold)
+        ? presetThreshold
+        : explicitThreshold;
+  }
+  /**
+   * Score `input` against all rules and structural signals.
+   *
+   * @param input Text to inspect.
+   * @param _context Scan context; not used by this scanner.
+   * @returns The decision, one violation per matching rule (in rule order),
+   *   and the elapsed time in milliseconds.
+   */
+  async scan(input: string, _context: ScanContext): Promise<ScannerResult> {
+    const start = performance.now();
+    const violations: Violation[] = [];
+    let totalScore = 0;
+    for (const rule of this.patterns) {
+      // RegExp objects with the `g` or `y` flag remember `lastIndex` between
+      // `test()` calls. Rule objects live for the scanner's lifetime, so without
+      // a reset a custom rule using those flags could match on one scan and
+      // miss the very same input on the next. Reset before and after testing.
+      rule.pattern.lastIndex = 0;
+      const matched = rule.pattern.test(input);
+      rule.pattern.lastIndex = 0;
+      if (!matched) continue;
+      totalScore += rule.weight;
+      violations.push({
+        type: "prompt_injection",
+        scanner: this.name,
+        score: rule.weight,
+        threshold: this.threshold,
+        message: rule.description,
+        detail: `Rule ${rule.id} (${rule.category})`,
+      });
+    }
+    // Structural signals (cumulative).
+    // NOTE(review): these raise the score without adding a violation, so a
+    // decision can be "warn"/"block" with an empty violations list.
+    totalScore += this.checkStructuralSignals(input);
+    // Cap at 1.0
+    totalScore = Math.min(totalScore, 1.0);
+    // Block at the threshold, warn from 60% of it, otherwise allow.
+    const decision =
+      totalScore >= this.threshold
+        ? "block"
+        : totalScore >= this.threshold * 0.6
+          ? "warn"
+          : "allow";
+    const durationMs = performance.now() - start;
+    return { decision, violations, durationMs };
+  }
+  /**
+   * Score layout-level hints that do not depend on specific phrases.
+   *
+   * @param input Text to inspect.
+   * @returns Sum of the triggered signal scores (0 to 0.25).
+   */
+  private checkStructuralSignals(input: string): number {
+    let score = 0;
+    // Many newlines (structured prompt injection)
+    const newlines = (input.match(/\n/g) ?? []).length;
+    if (newlines > 15) score += 0.05;
+    // Excessive use of markdown headers (structure injection)
+    const headers = (input.match(/^#{1,3}\s/gm) ?? []).length;
+    if (headers > 3) score += 0.05;
+    // Multiple role-like markers (a role word followed by whitespace or a colon)
+    const roleMarkers = (
+      input.match(
+        /\b(system|user|assistant|human|ai|bot|admin)[\s:]/gi,
+      ) ?? []
+    ).length;
+    if (roleMarkers > 2) score += 0.10;
+    // Very long input (potential padding attack)
+    if (input.length > 5000) score += 0.05;
+    return score;
+  }
+  /** Get all registered pattern IDs (built-in first, then custom) for testing */
+  getPatternIds(): string[] {
+    return this.patterns.map((p) => p.id);
+  }
+  /** Get pattern count (built-in plus custom) */
+  get patternCount(): number {
+    return this.patterns.length;
+  }
+}