npm - @crewhaus/prompt-injection-detector - Versions diffs - 0.1.4 → 0.1.5 - Mend

@crewhaus/prompt-injection-detector 0.1.4 → 0.1.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (5) hide show

package/dist/index.d.ts ADDED Viewed

@@ -0,0 +1,129 @@
+/**
+ * Catalog R8 `prompt-injection-detector` — heuristic + optional LLM
+ * classifier for tool output. Used by runtime-core after every tool
+ * call (when the tool's `classifyOutput` flag is not explicitly false)
+ * to decide whether the output should be passed to the model verbatim,
+ * passed with a system warning, or redacted.
+ *
+ * Three layers, fail-closed when ambiguous:
+ *
+ *   Layer 1 — regex rules over a corpus drawn from the OWASP Top 10 for
+ *   LLM Applications plus a hand-crafted set of 50+ vectors. Each rule has
+ *   a severity tag and contributes to a cumulative score. The corpus is
+ *   exported so tests and downstream tools can audit it.
+ *
+ *   Layer 2 — structural heuristics. Trailing imperative blocks,
+ *   role-marker injection (e.g. "system:" / "<|im_start|>"), BOM
+ *   tampering, and base64 wrapping a malicious string are all detected
+ *   without overlap with Layer 1. These produce hits with severity
+ *   weighted by structural risk.
+ *
+ *   Layer 3 — optional LLM classifier. Activated when
+ *   `CREWHAUS_PI_CLASSIFIER_MODEL` is set; the runtime supplies a
+ *   `classify` callback that delegates to a model. Without the env
+ *   var the layer is a no-op.
+ *
+ * The aggregate score thresholds:
+ *   < 0.40 → "clean"
+ *   [0.40, 0.80) → "suspicious"
+ *   ≥ 0.80 → "malicious"
+ *
+ * Layer R8. Pairs with `tool-result-store` (the previewContent input)
+ * and `runtime-core` (the post-tool callsite that consumes the
+ * classification).
+ */
+export type PromptInjectionClassification = "clean" | "suspicious" | "malicious"; // Verdict label; the header comment lists the aggregate-score thresholds for each.
+export type PromptInjectionSeverity = "low" | "medium" | "high"; // Severity tag carried by every rule and every hit.
+export type PromptInjectionHit = { // One detection reported by a single layer for a span of the analyzed text.
+    /** Stable rule id; safe to surface in logs and the redaction notice. */
+    readonly rule: string;
+    /** Half-open [start, end) offset into the analyzed text. NOTE(review): documented as a byte offset; if it is derived from RegExp match indices it would be in UTF-16 code units — confirm. */
+    readonly span: readonly [number, number];
+    readonly severity: PromptInjectionSeverity; // Severity attached to this hit.
+    /** Layer that produced the hit: "regex" (layer 1), "structural" (layer 2) or "llm" (layer 3). */
+    readonly layer: "regex" | "structural" | "llm";
+};
+export type PromptInjectionResult = { // Outcome of classifying one text: verdict, aggregate score and the reported hits.
+    readonly classification: PromptInjectionClassification; // Final verdict label for the text.
+    /** Aggregate score in the range [0, 1]. Higher = more likely to be an injection. */
+    readonly score: number;
+    readonly hits: ReadonlyArray<PromptInjectionHit>; // Individual detections; read-only for consumers.
+};
+export type PromptInjectionRule = { // Shape of one entry in the exported `REGEX_RULES` corpus.
+    readonly id: string; // Rule identifier; rule ids are documented as stable.
+    readonly pattern: RegExp; // Pattern evaluated against the text.
+    readonly severity: PromptInjectionSeverity; // Severity tag of the rule.
+    readonly description?: string; // Optional free-text description.
+};
+/**
+ * Hand-curated layer-1 regex corpus, exported so tests and downstream tools can audit it.
+ * Rule ids are stable so callers (auditors, tests, the redaction notice) can rely on them.
+ */
+export declare const REGEX_RULES: ReadonlyArray<PromptInjectionRule>;
+/**
+ * Defensive corpus-floor guard. If `rules` is ever trimmed below the documented
+ * minimum (presumably `MIN_CORPUS_RULES`, typed as 50 in `__internals` — confirm),
+ * fail loudly at module-load instead of silently weakening detection. Extracted (and
+ * re-exported via `__internals`) so the failure path is testable without mutating the production corpus.
+ */
+declare function assertCorpusFloor(rules: ReadonlyArray<PromptInjectionRule>): void;
+declare function regexHits(text: string, rules?: ReadonlyArray<PromptInjectionRule>): PromptInjectionHit[]; // Returns the hits found in `text`; `rules` is optional (presumably defaults to `REGEX_RULES` — confirm). Not exported directly, only via `__internals`.
+export type LlmClassifyFn = (text: string) => Promise<{ // Async layer-3 callback: takes the text and resolves to a verdict object or `undefined`.
+    verdict: PromptInjectionClassification; // The model's verdict label.
+    rationale?: string; // Optional free-text explanation.
+} | undefined>; // NOTE(review): the meaning of an `undefined` result (no opinion / classifier unavailable?) is not stated here — confirm.
+export type ClassifyOptions = {
+    /**
+     * Optional layer-3 classifier. When set, the LLM classifier runs and its
+     * verdict can lift "clean" → "suspicious" or upgrade an existing
+     * "suspicious" verdict to "malicious". A `clean` verdict from the model is
+     * advisory only — a high-severity regex hit is never downgraded.
+     *
+     * Activated when the runtime sets `CREWHAUS_PI_CLASSIFIER_MODEL` and
+     * supplies the actual classify callback here.
+     */
+    readonly llmClassifier?: LlmClassifyFn;
+    /**
+     * Override the suspicious / malicious score thresholds (documented defaults:
+     * 0.40 and 0.80). Mostly used by tests; production should leave the defaults.
+     */
+    readonly thresholds?: {
+        readonly suspicious?: number; // Score at or above which the verdict is "suspicious".
+        readonly malicious?: number; // Score at or above which the verdict is "malicious".
+    };
+};
+declare function tryDecodeBase64(blob: string): string | undefined; // Returns the decoded string, or `undefined` (presumably when `blob` cannot be decoded as base64 — confirm).
+declare function tryDecodeHex(blob: string): string | undefined; // Returns the decoded string, or `undefined` (presumably when `blob` cannot be decoded as hex — confirm).
+declare function tryDecodePercent(text: string): string | undefined; // Returns the decoded string, or `undefined` (presumably when `text` cannot be percent-decoded — confirm).
+/**
+ * Classify a tool output, resolving to its verdict, aggregate score and hits. Pure with
+ * respect to the input string when `opts.llmClassifier` is not supplied.
+ */
+export declare function classifyText(text: string, opts?: ClassifyOptions): Promise<PromptInjectionResult>;
+/**
+ * Build a redaction notice (a plain string) that is safe to substitute for the
+ * original tool output. The notice names the rules that fired, taken from `hits`,
+ * so that auditors can verify the decision later.
+ */
+export declare function buildRedactionNotice(hits: ReadonlyArray<PromptInjectionHit>): string;
+/**
+ * Returns true when the env-driven LLM classifier should run — per the module header, when `CREWHAUS_PI_CLASSIFIER_MODEL` is set. `env` is optional (presumably defaults to `process.env` — confirm).
+ */
+export declare function llmClassifierEnabled(env?: NodeJS.ProcessEnv): boolean;
+/**
+ * Internal seams exposed ONLY for unit tests. Not part of the public API and
+ * not subject to semver. They let the test suite drive the module's
+ * defensive branches (corpus-floor guard, global-flag `lastIndex` reset, and
+ * the decoder `try/catch` fallbacks) with crafted inputs that the public
+ * `classifyText` entrypoint can never construct on its own. Do not import
+ * from application code.
+ */
+export declare const __internals: {
+    readonly assertCorpusFloor: typeof assertCorpusFloor;
+    readonly regexHits: typeof regexHits;
+    readonly tryDecodeBase64: typeof tryDecodeBase64;
+    readonly tryDecodeHex: typeof tryDecodeHex;
+    readonly tryDecodePercent: typeof tryDecodePercent;
+    readonly MIN_CORPUS_RULES: 50; // Literal type 50; presumably the floor enforced by `assertCorpusFloor` — confirm.
+};
+export {};