@crewhaus/prompt-injection-detector 0.1.4 → 0.1.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,129 @@
1
+ /**
2
+ * Catalog R8 `prompt-injection-detector` — heuristic + optional LLM
3
+ * classifier for tool output. Used by runtime-core after every tool
4
+ * call (when the tool's `classifyOutput` flag is not explicitly false)
5
+ * to decide whether the output should be passed to the model verbatim,
6
+ * passed with a system warning, or redacted.
7
+ *
8
+ * Three layers, fail-closed when ambiguous:
9
+ *
10
+ * Layer 1 — regex rules over a corpus drawn from OWASP LLM Top-10
11
+ * plus a 50+-vector hand-crafted set. Each rule has a severity tag
12
+ * and contributes to a cumulative score. The corpus is exported so
13
+ * tests and downstream tools can audit it.
14
+ *
15
+ * Layer 2 — structural heuristics. Trailing imperative blocks,
16
+ * role-marker injection (e.g. "system:" / "<|im_start|>"), BOM
17
+ * tampering, and base64 wrapping a malicious string are all detected
18
+ * without overlap with Layer 1. These produce hits with severity
19
+ * weighted by structural risk.
20
+ *
21
+ * Layer 3 — optional LLM classifier. Activated when
22
+ * `CREWHAUS_PI_CLASSIFIER_MODEL` is set; the runtime supplies a
23
+ * `classify` callback that delegates to a model. Without the env
24
+ * var the layer is a no-op.
25
+ *
26
+ * The aggregate score thresholds:
27
+ * < 0.40 → "clean"
28
+ * [0.40, 0.80) → "suspicious"
29
+ * ≥ 0.80 → "malicious"
30
+ *
31
+ * Layer R8. Pairs with `tool-result-store` (the previewContent input)
32
+ * and `runtime-core` (the post-tool callsite that consumes the
33
+ * classification).
34
+ */
35
+ export type PromptInjectionClassification = "clean" | "suspicious" | "malicious"; // Overall verdict label; the file header maps aggregate-score ranges onto these three values.
36
+ export type PromptInjectionSeverity = "low" | "medium" | "high"; // Severity tag carried by each rule (PromptInjectionRule) and each hit (PromptInjectionHit).
37
+ export type PromptInjectionHit = { // One detection from a single layer; collected in PromptInjectionResult.hits.
38
+ /** Stable rule id; safe to surface in logs and the redaction notice. */
39
+ readonly rule: string;
40
+ /** Half-open [start, end) offset pair into the analyzed text. NOTE(review): documented as byte offsets — confirm bytes vs. string indices. */
41
+ readonly span: readonly [number, number];
42
+ readonly severity: PromptInjectionSeverity; // Severity tag attached to this hit.
43
+ /** Layer that produced the hit; one of the three layers described in the file header. */
44
+ readonly layer: "regex" | "structural" | "llm";
45
+ };
46
+ export type PromptInjectionResult = { // Value that classifyText resolves to: verdict, aggregate score, and the recorded hits.
47
+ readonly classification: PromptInjectionClassification; // Final verdict label.
48
+ /** Aggregate score in [0, 1]. Higher = more likely injection. */
49
+ readonly score: number;
50
+ readonly hits: ReadonlyArray<PromptInjectionHit>; // Individual detections; typed read-only.
51
+ };
52
+ export type PromptInjectionRule = { // Shape of one entry in the exported REGEX_RULES corpus.
53
+ readonly id: string; // Rule id; the REGEX_RULES doc comment states ids are stable.
54
+ readonly pattern: RegExp; // Regular expression for this rule.
55
+ readonly severity: PromptInjectionSeverity; // Severity tag for this rule.
56
+ readonly description?: string; // Optional human-readable description.
57
+ };
58
+ /**
59
+ * Hand-curated Layer 1 regex corpus, exported read-only. Rule ids are stable
60
+ * so callers (auditors, tests, the redaction notice) can rely on them.
61
+ */
62
+ export declare const REGEX_RULES: ReadonlyArray<PromptInjectionRule>;
63
+ /**
64
+ * Defensive corpus-floor guard. If the rule list `rules` is ever trimmed below
65
+ * the documented minimum (presumably `MIN_CORPUS_RULES` — confirm), fail loudly
66
+ * at module-load instead of silently weakening detection. Extracted (and re-exported
67
+ * via `__internals`) so the failure path is testable without mutating the production corpus.
68
+ */
69
+ declare function assertCorpusFloor(rules: ReadonlyArray<PromptInjectionRule>): void;
70
+ declare function regexHits(text: string, rules?: ReadonlyArray<PromptInjectionRule>): PromptInjectionHit[]; // Internal: returns hits for `text`; NOTE(review): omitted `rules` presumably defaults to REGEX_RULES — confirm.
71
+ export type LlmClassifyFn = (text: string) => Promise<{ // Async Layer 3 callback: takes the text, resolves to a verdict object or undefined.
72
+ verdict: PromptInjectionClassification; // The verdict label returned by the callback.
73
+ rationale?: string; // Optional free-text explanation.
74
+ } | undefined>; // NOTE(review): undefined presumably means no verdict was produced — confirm.
75
+ export type ClassifyOptions = { // Optional settings accepted by classifyText.
76
+ /**
77
+ * When set, layer 3 LLM classifier runs and its verdict can lift
78
+ * "clean" → "suspicious" or upgrade an existing suspicious verdict
79
+ * to "malicious". A `clean` verdict from the model is advisory only —
80
+ * we never downgrade a high-severity regex hit.
81
+ *
82
+ * Intended to be provided by the runtime when it has
83
+ * `CREWHAUS_PI_CLASSIFIER_MODEL` set; the callback does the actual model call.
84
+ */
85
+ readonly llmClassifier?: LlmClassifyFn;
86
+ /**
87
+ * Override the suspicious / malicious score thresholds (the file header
88
+ * lists 0.40 and 0.80 as the boundaries). Mostly for tests; production should leave defaults.
89
+ */
90
+ readonly thresholds?: {
91
+ readonly suspicious?: number;
92
+ readonly malicious?: number;
93
+ };
94
+ };
95
+ declare function tryDecodeBase64(blob: string): string | undefined; // Internal decoder seam: returns a string or undefined; NOTE(review): undefined presumably signals that `blob` could not be decoded as base64 — confirm.
96
+ declare function tryDecodeHex(blob: string): string | undefined; // Internal decoder seam: returns a string or undefined; NOTE(review): undefined presumably signals that `blob` could not be decoded as hex — confirm.
97
+ declare function tryDecodePercent(text: string): string | undefined; // Internal decoder seam: returns a string or undefined; NOTE(review): undefined presumably signals that `text` could not be percent-decoded — confirm.
98
+ /**
99
+ * Classify a tool output string, optionally tuned via `opts`; resolves to the result.
100
+ * Pure with respect to the input string when the LLM classifier is not supplied.
101
+ */
102
+ export declare function classifyText(text: string, opts?: ClassifyOptions): Promise<PromptInjectionResult>;
103
+ /**
104
+ * Build a redaction notice (returned as a string) safe to substitute for the
105
+ * original tool output. The notice names the rules that fired in `hits` so that
106
+ * auditors can verify the decision later.
107
+ */
108
+ export declare function buildRedactionNotice(hits: ReadonlyArray<PromptInjectionHit>): string;
109
+ /**
110
+ * Returns true when the env-driven LLM classifier should run. NOTE(review): presumably checks `CREWHAUS_PI_CLASSIFIER_MODEL` in `env` — confirm, including what is used when `env` is omitted.
111
+ */
112
+ export declare function llmClassifierEnabled(env?: NodeJS.ProcessEnv): boolean;
113
+ /**
114
+ * Internal seams exposed ONLY for unit tests. Not part of the public API and
115
+ * not subject to semver — these let the test suite drive the module's
116
+ * defensive branches (corpus-floor guard, global-flag `lastIndex` reset, and
117
+ * the decoder `try/catch` fallbacks) with crafted inputs that the public
118
+ * `classifyText` entrypoint can never construct on its own. Do not import
119
+ * from application code.
120
+ */
121
+ export declare const __internals: {
122
+ readonly assertCorpusFloor: typeof assertCorpusFloor;
123
+ readonly regexHits: typeof regexHits;
124
+ readonly tryDecodeBase64: typeof tryDecodeBase64;
125
+ readonly tryDecodeHex: typeof tryDecodeHex;
126
+ readonly tryDecodePercent: typeof tryDecodePercent;
127
+ readonly MIN_CORPUS_RULES: 50; // Typed as the literal 50; NOTE(review): presumably the floor enforced by assertCorpusFloor — confirm.
128
+ };
129
+ export {}; // Empty export statement; adds no bindings and keeps the file treated as a module.