npm - ai-shield-classifier-onnx - Versions diffs - 0.2.0 - Mend

ai-shield-classifier-onnx 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (5) hide show

package/README.md ADDED Viewed

@@ -0,0 +1,72 @@
+# ai-shield-classifier-onnx
+Optional ONNX-runtime ML classifier for [ai-shield](https://github.com/studiomeyer-io/ai-shield).
+Pairs with `ai-shield-core` to add a DeBERTa-style prompt-injection classifier
+alongside the heuristic patterns.
+## Why a separate package?
+`ai-shield-core` is zero-dependency by design. ONNX inference requires
+`onnxruntime-node`, which ships native binaries. Install this package only
+when you actively want ML-augmented detection on top of the regex layer.
+## Install
+```bash
+npm install ai-shield-core ai-shield-classifier-onnx onnxruntime-node
+```
+## Usage
+```ts
+import { ScannerChain, HeuristicScanner } from "ai-shield-core";
+import { loadOnnxClassifier } from "ai-shield-classifier-onnx";
+// Bring your own tokenizer. Example: protectai/deberta-v3-base-prompt-injection
+const tokenizer = await yourTokenizerFor("protectai/deberta-v3-base-prompt-injection");
+const ml = await loadOnnxClassifier({
+  modelPath: "./models/deberta-injection.onnx",
+  tokenizer,
+  threshold: 0.85, // tune per model
+});
+const chain = new ScannerChain({ earlyExit: true });
+chain.add(new HeuristicScanner({ strictness: "high" })); // cheap regex first
+chain.add(ml);                                            // ML fallback
+const result = await chain.run("Ignore previous instructions...");
+console.log(result.decision); // "block"
+```
+Or manually with an already-constructed `InferenceSession`:
+```ts
+import * as ort from "onnxruntime-node";
+import { OnnxInjectionScanner } from "ai-shield-classifier-onnx";
+const session = await ort.InferenceSession.create("./models/deberta.onnx");
+const scanner = new OnnxInjectionScanner({ session, tokenizer, threshold: 0.85 });
+```
+## Recommended models
+- [`protectai/deberta-v3-base-prompt-injection`](https://huggingface.co/protectai/deberta-v3-base-prompt-injection) (Apache-2.0)
+- [`protectai/deberta-v3-base-prompt-injection-v2`](https://huggingface.co/protectai/deberta-v3-base-prompt-injection-v2)
+- Any HF model exported to ONNX via `optimum-cli export onnx`
+## Notes
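+- The scanner **fails open** on inference errors — `scan()` returns `allow`
+  and records the failure as a `content_policy` violation instead of
+  throwing, so traffic is not blocked. This avoids taking down the whole
+  chain when the runtime hits a hardware-specific edge case. Errors while
+  loading the model (for example a missing model file) are not covered:
+  `loadOnnxClassifier()` rejects in that case.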
+- Use after the heuristic scanner. Most known attacks short-circuit on
+  cheap regex; the ML pass catches paraphrases and obfuscations that slip
+  through.
+- The probability threshold is **calibrated per model**. Start at 0.85 and
+  tune against your false-positive budget.
+## License
+MIT — see [LICENSE](../../LICENSE).

package/package.json ADDED Viewed

@@ -0,0 +1,53 @@
+{
+  "name": "ai-shield-classifier-onnx",
+  "version": "0.2.0",
+  "license": "MIT",
+  "description": "Optional ONNX ML classifier for ai-shield — DeBERTa-style prompt-injection detection alongside heuristic patterns",
+  "author": "StudioMeyer <hello@studiomeyer.io>",
+  "repository": {
+    "type": "git",
+    "url": "https://github.com/studiomeyer-io/ai-shield",
+    "directory": "packages/classifier-onnx"
+  },
+  "homepage": "https://github.com/studiomeyer-io/ai-shield/tree/main/packages/classifier-onnx",
+  "bugs": {
+    "url": "https://github.com/studiomeyer-io/ai-shield/issues"
+  },
+  "keywords": [
+    "llm",
+    "security",
+    "prompt-injection",
+    "onnx",
+    "deberta",
+    "ml",
+    "classifier",
+    "ai-shield"
+  ],
+  "type": "module",
+  "main": "./dist/index.js",
+  "types": "./dist/index.d.ts",
+  "exports": {
+    ".": {
+      "types": "./dist/index.d.ts",
+      "import": "./dist/index.js"
+    }
+  },
+  "scripts": {
+    "build": "tsc -b",
+    "typecheck": "tsc -b"
+  },
+  "peerDependencies": {
+    "ai-shield-core": "0.2.0",
+    "onnxruntime-node": ">=1.17.0"
+  },
+  "peerDependenciesMeta": {
+    "onnxruntime-node": { "optional": true }
+  },
+  "devDependencies": {
+    "ai-shield-core": "0.2.0",
+    "typescript": "^5.7.0"
+  },
+  "engines": {
+    "node": ">=20.0.0"
+  }
+}

package/src/classifier.ts ADDED Viewed

@@ -0,0 +1,326 @@
+import type {
+  Scanner,
+  ScannerResult,
+  ScanContext,
+  Violation,
+} from "ai-shield-core";
+// ============================================================
+// OnnxInjectionScanner — ML-backed prompt-injection detection
+//
+// Implements the `Scanner` interface from ai-shield-core. Designed to
+// be added to a `ScannerChain` after the heuristic scanner so that
+// known patterns short-circuit on cheap regex AND novel paraphrases
+// still get a second-pass ML check.
+//
+// The runtime is abstracted via `OnnxInferenceRuntime` so this file
+// has zero hard dependency on `onnxruntime-node` at type-check time.
+// At runtime you pass in either:
+//   1. An already-constructed `ort.InferenceSession`
+//      (`new OnnxInjectionScanner({ session, tokenizer })`)
+//   2. A path to a `.onnx` model file + a tokenizer instance
+//      (`await loadOnnxClassifier({ modelPath, tokenizer })`)
+//
+// Both paths keep the dep injection clean and unit-testable.
+// ============================================================
+/**
+ * Minimal subset of `onnxruntime-node`'s `InferenceSession` we use.
+ * Declaring it locally means this package type-checks even when
+ * `onnxruntime-node` is not installed (which is the entire point —
+ * it's an optional peer dep).
+ */
+export interface OnnxInferenceRuntime {
+  /**
+   * Runs one inference pass.
+   *
+   * @param feeds Input tensors keyed by input-node name.
+   * @returns Output tensors keyed by output-node name.
+   */
+  run(
+    feeds: Record<string, OnnxTensorLike>,
+  ): Promise<Record<string, OnnxTensorLike>>;
+  /** Names of the model's input nodes, when the runtime exposes them. */
+  readonly inputNames?: readonly string[];
+  /**
+   * Names of the model's output nodes, when the runtime exposes them.
+   * The first entry is the scanner's default output node when no
+   * `outputName` is configured.
+   */
+  readonly outputNames?: readonly string[];
+}
+/** Tensor descriptor compatible with `onnxruntime-common`'s Tensor. */
+export interface OnnxTensorLike {
+  /**
+   * Flat element buffer. The scanner sends `BigInt64Array` buffers for its
+   * inputs and converts every output element with `Number(...)`.
+   */
+  readonly data: ArrayLike<number> | BigInt64Array;
+  /** Tensor shape; the scanner's inputs use `[1, sequenceLength]`. */
+  readonly dims: readonly number[];
+  /** Element type name; the scanner tags its inputs as `"int64"`. */
+  readonly type?: string;
+}
+/**
+ * Tokenizer abstraction. Same trick as the runtime — we don't bind to
+ * any specific HF tokenizer package so the user can wire up
+ * `@huggingface/transformers`, `tokenizers`, or a hand-written one.
+ */
+export interface Tokenizer {
+  /**
+   * Converts text into model inputs. The scanner cuts each returned array
+   * to its configured maximum length and feeds it as an int64 tensor;
+   * `token_type_ids` is only fed when present.
+   */
+  encode(text: string): {
+    input_ids: number[];
+    attention_mask: number[];
+    token_type_ids?: number[];
+  };
+  /**
+   * Optional max sequence length the tokenizer was trained for. Used as the
+   * scanner's default `maxLength` when the config does not set one.
+   */
+  modelMaxLength?: number;
+}
+/** Options accepted by the `OnnxInjectionScanner` constructor. */
+export interface OnnxClassifierConfig {
+  /** A pre-constructed inference session. */
+  session: OnnxInferenceRuntime;
+  /** Tokenizer matching the model. */
+  tokenizer: Tokenizer;
+  /**
+   * Probability threshold at or above which the input is flagged as
+   * injection. Default 0.85 (calibrated for protectai/deberta-v3-base-
+   * prompt-injection — adjust per model).
+   */
+  threshold?: number;
+  /**
+   * Name of the output node that carries the logits.
+   * Default: the first entry of the session's `outputNames` when the
+   * runtime exposes them, otherwise the first key in the runtime's
+   * result map.
+   */
+  outputName?: string;
+  /**
+   * Index of the "injection" class in the model output.
+   * Default: 1 (binary classifier convention: 0 = SAFE, 1 = INJECTION).
+   */
+  injectionClassIndex?: number;
+  /**
+   * Maximum sequence length to feed. Default: the tokenizer's
+   * `modelMaxLength` when set, otherwise 512.
+   * Larger inputs are truncated head-only (start kept) which matches
+   * the standard DeBERTa fine-tune recipe.
+   */
+  maxLength?: number;
+}
+/**
+ * `Scanner` that scores input with an ONNX sequence-classification model.
+ *
+ * The input is tokenized, cut to `maxLength`, run through the injected
+ * session, and the softmax probability of the configured injection class
+ * is compared against `threshold`:
+ *   - `p >= threshold`        → "block"
+ *   - `p >= threshold * 0.6`  → "warn"
+ *   - otherwise               → "allow"
+ *
+ * `scan()` never rejects: any inference failure is reported as an "allow"
+ * decision carrying a `content_policy` violation that describes the error.
+ */
+export class OnnxInjectionScanner implements Scanner {
+  readonly name = "onnx-classifier";
+  /** Resolved configuration; every option except `outputName` has a value. */
+  private readonly cfg: Required<
+    Omit<OnnxClassifierConfig, "outputName">
+  > & { outputName?: string };
+  /**
+   * @param config Session, tokenizer and optional tuning knobs.
+   * @throws TypeError when `session` or `tokenizer` is missing.
+   */
+  constructor(config: OnnxClassifierConfig) {
+    if (!config.session) {
+      throw new TypeError("OnnxInjectionScanner: 'session' is required");
+    }
+    if (!config.tokenizer) {
+      throw new TypeError("OnnxInjectionScanner: 'tokenizer' is required");
+    }
+    this.cfg = {
+      session: config.session,
+      tokenizer: config.tokenizer,
+      threshold: config.threshold ?? 0.85,
+      outputName: config.outputName,
+      injectionClassIndex: config.injectionClassIndex ?? 1,
+      maxLength:
+        config.maxLength ?? config.tokenizer.modelMaxLength ?? 512,
+    };
+  }
+  /**
+   * Classifies `input` and maps the probability onto a scanner decision.
+   *
+   * @param input Text to classify.
+   * @param _context Unused by this scanner.
+   * @returns The decision, at most one violation, and the elapsed time.
+   */
+  async scan(input: string, _context: ScanContext): Promise<ScannerResult> {
+    const start = performance.now();
+    try {
+      const probability = await this.predict(input);
+      // NaN/Infinity logits produce a non-finite probability that fails every
+      // `>=` comparison below. Treat it as an inference failure so it shows
+      // up in the violations instead of passing as a clean "allow".
+      if (!Number.isFinite(probability)) {
+        throw new Error(
+          "OnnxInjectionScanner: model produced a non-finite probability",
+        );
+      }
+      const { threshold } = this.cfg;
+      let decision: ScannerResult["decision"] = "allow";
+      let message: string | undefined;
+      if (probability >= threshold) {
+        decision = "block";
+        message = "ML classifier flagged prompt-injection";
+      } else if (probability >= threshold * 0.6) {
+        // Borderline band: 60% of the block threshold and up.
+        decision = "warn";
+        message = "ML classifier flagged borderline content";
+      }
+      const violations: Violation[] = [];
+      if (message !== undefined) {
+        violations.push({
+          type: "prompt_injection",
+          scanner: this.name,
+          score: probability,
+          threshold,
+          message,
+          detail: `p(injection)=${probability.toFixed(4)} threshold=${threshold}`,
+        });
+      }
+      return {
+        decision,
+        violations,
+        durationMs: performance.now() - start,
+      };
+    } catch (err) {
+      // ML errors must not take down the entire chain — degrade gracefully
+      // to "allow" with a synthetic violation so the audit log shows
+      // something went wrong without blocking traffic.
+      //
+      // The raw error message can contain file paths (model location),
+      // native library symbols, or deployment-internal strings. Strip
+      // absolute paths before they hit the audit log. In dev mode we keep
+      // more detail to aid debugging.
+      const thrownMessage = (err as { message?: unknown } | null | undefined)
+        ?.message;
+      let rawMessage = "unknown error";
+      if (typeof thrownMessage === "string") {
+        rawMessage = thrownMessage;
+      } else if (typeof err === "string") {
+        // `throw "text"` carries the description in the value itself.
+        rawMessage = err;
+      }
+      const isDev =
+        process.env.NODE_ENV === "development" ||
+        process.env.AI_SHIELD_DEBUG === "1";
+      const safeDetail = isDev
+        ? rawMessage
+        : sanitizeOnnxErrorMessage(rawMessage);
+      return {
+        decision: "allow",
+        violations: [
+          {
+            type: "content_policy",
+            scanner: this.name,
+            score: 0,
+            threshold: this.cfg.threshold,
+            message: "ML classifier failed — degraded to allow",
+            detail: safeDetail,
+          },
+        ],
+        durationMs: performance.now() - start,
+      };
+    }
+  }
+  /**
+   * Direct probability access — useful for tests + custom flows.
+   *
+   * @param input Text to classify.
+   * @returns Softmax probability of the configured injection class.
+   * @throws Error when no output node can be resolved, the resolved output
+   *   is missing from the result, or `injectionClassIndex` is out of range.
+   *   Errors raised by the tokenizer or the session propagate unchanged.
+   */
+  async predict(input: string): Promise<number> {
+    const tokens = this.cfg.tokenizer.encode(input);
+    const trunc = truncate(tokens, this.cfg.maxLength);
+    const dims = [1, trunc.input_ids.length];
+    const toInt64Tensor = (values: number[]): OnnxTensorLike => ({
+      data: BigInt64Array.from(values.map((n) => BigInt(n))),
+      dims,
+      type: "int64",
+    });
+    const feeds: Record<string, OnnxTensorLike> = {
+      input_ids: toInt64Tensor(trunc.input_ids),
+      attention_mask: toInt64Tensor(trunc.attention_mask),
+    };
+    // Tokenizers often emit token_type_ids even for models exported without
+    // that input, and a runtime may reject feeds it does not know. When the
+    // session lists its inputs, only send token_type_ids if it is declared.
+    const declaredInputs = this.cfg.session.inputNames;
+    const acceptsTokenTypes =
+      declaredInputs === undefined ||
+      declaredInputs.includes("token_type_ids");
+    if (trunc.token_type_ids && acceptsTokenTypes) {
+      feeds.token_type_ids = toInt64Tensor(trunc.token_type_ids);
+    }
+    const result = await this.cfg.session.run(feeds);
+    const outputName =
+      this.cfg.outputName ??
+      this.cfg.session.outputNames?.[0] ??
+      Object.keys(result)[0];
+    if (!outputName) {
+      throw new Error("OnnxInjectionScanner: no output node available");
+    }
+    const tensor = result[outputName];
+    if (!tensor) {
+      throw new Error(
+        `OnnxInjectionScanner: output '${outputName}' not in result`,
+      );
+    }
+    // Model emits logits of shape [1, num_classes]. Softmax + pick class.
+    const logits = Array.from(tensor.data as ArrayLike<number>).map((n) =>
+      Number(n),
+    );
+    const probs = softmax(logits);
+    const idx = this.cfg.injectionClassIndex;
+    if (idx < 0 || idx >= probs.length) {
+      throw new Error(
+        `OnnxInjectionScanner: injectionClassIndex ${idx} out of range (len=${probs.length})`,
+      );
+    }
+    return probs[idx] ?? 0;
+  }
+}
+/**
+ * Convenience loader. Imports `onnxruntime-node` *at runtime* so
+ * consumers who never call this function don't pay the install cost.
+ *
+ * Tokenizer loading is left to the caller because tokenizer
+ * implementations vary widely between models — we don't want to
+ * pin a specific HF tokenizer package.
+ *
+ * @param opts `modelPath` is handed to `InferenceSession.create`; the
+ *   remaining options are forwarded to `OnnxInjectionScanner`.
+ * @returns A scanner bound to the freshly created session.
+ * @throws Error when `onnxruntime-node` cannot be imported or does not
+ *   expose `InferenceSession.create`. Rejections from session creation
+ *   propagate unchanged.
+ */
+export async function loadOnnxClassifier(opts: {
+  modelPath: string;
+  tokenizer: Tokenizer;
+  threshold?: number;
+  outputName?: string;
+  injectionClassIndex?: number;
+  maxLength?: number;
+}): Promise<OnnxInjectionScanner> {
+  type SessionFactory = {
+    create?: (path: string) => Promise<OnnxInferenceRuntime>;
+  };
+  type OrtModule = {
+    InferenceSession?: SessionFactory;
+    default?: { InferenceSession?: SessionFactory };
+  };
+  // Dynamic import keeps the dep optional — TypeScript can't see it
+  // at compile time, which is the whole point.
+  let ort: OrtModule;
+  try {
+    ort = (await import("onnxruntime-node" as string)) as OrtModule;
+  } catch (err) {
+    const cause = err instanceof Error ? err.message : String(err);
+    throw new Error(
+      "ai-shield-classifier-onnx: 'onnxruntime-node' is required " +
+        "to call loadOnnxClassifier(). Install it as a peer dependency.\n" +
+        `Underlying error: ${cause}`,
+    );
+  }
+  // A CommonJS package imported from ESM always exposes `module.exports` as
+  // `default`; named exports are only present when Node can detect them.
+  const sessionFactory =
+    ort.InferenceSession ?? ort.default?.InferenceSession;
+  if (typeof sessionFactory?.create !== "function") {
+    throw new Error(
+      "ai-shield-classifier-onnx: 'onnxruntime-node' did not expose InferenceSession.create",
+    );
+  }
+  // Call through the owner object so `create` keeps its `this` binding.
+  const session = await sessionFactory.create(opts.modelPath);
+  return new OnnxInjectionScanner({
+    session,
+    tokenizer: opts.tokenizer,
+    threshold: opts.threshold,
+    outputName: opts.outputName,
+    injectionClassIndex: opts.injectionClassIndex,
+    maxLength: opts.maxLength,
+  });
+}
+// --- helpers ---
+/**
+ * Strip absolute paths and other deployment-internal strings from an
+ * ONNX-runtime error message before it lands in the audit log. Keeps
+ * the short error class / cause hint that helps diagnose the failure.
+ *
+ * @param message Raw error message; anything that is not a non-empty
+ *   string yields the generic `"classifier_runtime_error"`.
+ * @returns The message cut to 500 characters with `file://` URLs,
+ *   Windows drive paths, POSIX absolute paths and long hex addresses
+ *   replaced by placeholders, then trimmed.
+ */
+function sanitizeOnnxErrorMessage(message: string): string {
+  if (typeof message !== "string" || message.length === 0) {
+    return "classifier_runtime_error";
+  }
+  // Truncate before sanitizing — bounded work on adversarial input.
+  const truncated = message.length > 500 ? message.slice(0, 500) : message;
+  return truncated
+    // file:// URLs — handled first so the path rules below cannot split them
+    .replace(/file:\/\/\S+/g, "[file-url]")
+    // Windows drive paths, with backslashes (C:\dir) or forward slashes (C:/dir)
+    .replace(/(?:\b[A-Za-z]:\/|[A-Za-z]:\\)[\\\w./@-]+/g, "[path]")
+    // POSIX absolute paths at the start of the message or after whitespace,
+    // an opening bracket, a quote, or one of `= : ,` — Node's own fs errors
+    // quote the path (`open '/abs/path'`). The delimiter is kept.
+    .replace(/(^|[\s("'`=:,\[<])\/[\w./@-]+/g, "$1[path]")
+    // Memory addresses (0x...)
+    .replace(/0x[0-9a-fA-F]{6,}/g, "[addr]")
+    .trim();
+}
+/**
+ * Numerically stable softmax: the largest logit is subtracted before
+ * exponentiating so large values do not overflow `Math.exp`.
+ *
+ * @param logits Raw scores, one per class.
+ * @returns Normalized weights of the same length; an empty input yields
+ *   an empty array.
+ */
+function softmax(logits: number[]): number[] {
+  if (logits.length === 0) {
+    return [];
+  }
+  const peak = Math.max(...logits);
+  const weights: number[] = [];
+  let total = 0;
+  for (const logit of logits) {
+    const weight = Math.exp(logit - peak);
+    weights.push(weight);
+    total += weight;
+  }
+  if (total === 0) {
+    return weights.map(() => 0);
+  }
+  return weights.map((weight) => weight / total);
+}
+/**
+ * Keeps only the first `maxLength` positions of an encoding.
+ *
+ * @param tokens Tokenizer output.
+ * @param maxLength Maximum number of positions to keep.
+ * @returns `tokens` itself when it already fits; otherwise a new object
+ *   whose arrays are cut to `maxLength` (`token_type_ids` stays
+ *   `undefined` when the tokenizer did not provide it).
+ */
+function truncate(
+  tokens: ReturnType<Tokenizer["encode"]>,
+  maxLength: number,
+): ReturnType<Tokenizer["encode"]> {
+  const { input_ids, attention_mask, token_type_ids } = tokens;
+  const fits = input_ids.length <= maxLength;
+  if (fits) {
+    return tokens;
+  }
+  const head = (values: number[]): number[] => values.slice(0, maxLength);
+  return {
+    input_ids: head(input_ids),
+    attention_mask: head(attention_mask),
+    token_type_ids: token_type_ids ? head(token_type_ids) : undefined,
+  };
+}

package/src/index.ts ADDED Viewed

@@ -0,0 +1,30 @@
+// ============================================================
+// ai-shield-classifier-onnx — Optional ML classifier package
+//
+// Pairs with ai-shield-core. Adds an ONNX-runtime-backed
+// prompt-injection classifier (DeBERTa-style by default) that can
+// be added to a ScannerChain alongside the built-in heuristics.
+//
+// Why a separate package?
+//   The core package keeps a zero-dependency promise (Node stdlib only).
+//   ONNX inference requires `onnxruntime-node`, which ships native
+//   binaries and is too heavy to force on every consumer. Install this
+//   package only when you actively want ML-augmented detection.
+//
+// Recommended models:
+//   - protectai/deberta-v3-base-prompt-injection (Apache-2.0)
+//   - protectai/deberta-v3-base-prompt-injection-v2
+//   - hf models exported to ONNX via `optimum-cli`
+//
+// The classifier is intentionally pluggable — you bring the model file
+// and a tokenizer implementation; the wrapper handles inference,
+// thresholding, and integration with the Scanner interface.
+// ============================================================
+// Public surface. `OnnxTensorLike` is re-exported because it appears in the
+// signature of `OnnxInferenceRuntime.run`, so consumers implementing a
+// custom runtime need to be able to name it.
+export {
+  OnnxInjectionScanner,
+  loadOnnxClassifier,
+  type OnnxClassifierConfig,
+  type OnnxInferenceRuntime,
+  type OnnxTensorLike,
+  type Tokenizer,
+} from "./classifier.js";

package/tsconfig.json ADDED Viewed

@@ -0,0 +1,12 @@
+{
+  "extends": "../../tsconfig.json",
+  "compilerOptions": {
+    "outDir": "./dist",
+    "rootDir": "./src",
+    "composite": true
+  },
+  "include": ["src/**/*"],
+  "references": [
+    { "path": "../core" }
+  ]
+}