@framers/agentos-ext-ml-classifiers 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (62) hide show
  1. package/LICENSE +23 -0
  2. package/dist/ClassifierOrchestrator.d.ts +126 -0
  3. package/dist/ClassifierOrchestrator.d.ts.map +1 -0
  4. package/dist/ClassifierOrchestrator.js +239 -0
  5. package/dist/ClassifierOrchestrator.js.map +1 -0
  6. package/dist/IContentClassifier.d.ts +117 -0
  7. package/dist/IContentClassifier.d.ts.map +1 -0
  8. package/dist/IContentClassifier.js +22 -0
  9. package/dist/IContentClassifier.js.map +1 -0
  10. package/dist/MLClassifierGuardrail.d.ts +163 -0
  11. package/dist/MLClassifierGuardrail.d.ts.map +1 -0
  12. package/dist/MLClassifierGuardrail.js +335 -0
  13. package/dist/MLClassifierGuardrail.js.map +1 -0
  14. package/dist/SlidingWindowBuffer.d.ts +213 -0
  15. package/dist/SlidingWindowBuffer.d.ts.map +1 -0
  16. package/dist/SlidingWindowBuffer.js +246 -0
  17. package/dist/SlidingWindowBuffer.js.map +1 -0
  18. package/dist/classifiers/InjectionClassifier.d.ts +126 -0
  19. package/dist/classifiers/InjectionClassifier.d.ts.map +1 -0
  20. package/dist/classifiers/InjectionClassifier.js +210 -0
  21. package/dist/classifiers/InjectionClassifier.js.map +1 -0
  22. package/dist/classifiers/JailbreakClassifier.d.ts +124 -0
  23. package/dist/classifiers/JailbreakClassifier.d.ts.map +1 -0
  24. package/dist/classifiers/JailbreakClassifier.js +208 -0
  25. package/dist/classifiers/JailbreakClassifier.js.map +1 -0
  26. package/dist/classifiers/ToxicityClassifier.d.ts +125 -0
  27. package/dist/classifiers/ToxicityClassifier.d.ts.map +1 -0
  28. package/dist/classifiers/ToxicityClassifier.js +212 -0
  29. package/dist/classifiers/ToxicityClassifier.js.map +1 -0
  30. package/dist/classifiers/WorkerClassifierProxy.d.ts +158 -0
  31. package/dist/classifiers/WorkerClassifierProxy.d.ts.map +1 -0
  32. package/dist/classifiers/WorkerClassifierProxy.js +268 -0
  33. package/dist/classifiers/WorkerClassifierProxy.js.map +1 -0
  34. package/dist/index.d.ts +110 -0
  35. package/dist/index.d.ts.map +1 -0
  36. package/dist/index.js +342 -0
  37. package/dist/index.js.map +1 -0
  38. package/dist/tools/ClassifyContentTool.d.ts +105 -0
  39. package/dist/tools/ClassifyContentTool.d.ts.map +1 -0
  40. package/dist/tools/ClassifyContentTool.js +149 -0
  41. package/dist/tools/ClassifyContentTool.js.map +1 -0
  42. package/dist/types.d.ts +319 -0
  43. package/dist/types.d.ts.map +1 -0
  44. package/dist/types.js +62 -0
  45. package/dist/types.js.map +1 -0
  46. package/dist/worker/classifier-worker.d.ts +49 -0
  47. package/dist/worker/classifier-worker.d.ts.map +1 -0
  48. package/dist/worker/classifier-worker.js +180 -0
  49. package/dist/worker/classifier-worker.js.map +1 -0
  50. package/package.json +45 -0
  51. package/src/ClassifierOrchestrator.ts +290 -0
  52. package/src/IContentClassifier.ts +124 -0
  53. package/src/MLClassifierGuardrail.ts +419 -0
  54. package/src/SlidingWindowBuffer.ts +384 -0
  55. package/src/classifiers/InjectionClassifier.ts +261 -0
  56. package/src/classifiers/JailbreakClassifier.ts +259 -0
  57. package/src/classifiers/ToxicityClassifier.ts +263 -0
  58. package/src/classifiers/WorkerClassifierProxy.ts +366 -0
  59. package/src/index.ts +383 -0
  60. package/src/tools/ClassifyContentTool.ts +201 -0
  61. package/src/types.ts +391 -0
  62. package/src/worker/classifier-worker.ts +267 -0
@@ -0,0 +1,210 @@
1
+ /**
2
+ * @fileoverview Prompt-injection content classifier using the
3
+ * `protectai/deberta-v3-small-prompt-injection-v2` model.
4
+ *
5
+ * Prompt injection is the attack pattern where adversarial instructions are
6
+ * embedded inside user-supplied text to override or hijack the agent's system
7
+ * prompt. This classifier provides a dedicated binary signal (INJECTION /
8
+ * SAFE) that the guardrail orchestrator can act on independently of the
9
+ * toxicity or jailbreak classifiers.
10
+ *
11
+ * Model details
12
+ * -------------
13
+ * `protectai/deberta-v3-small-prompt-injection-v2` is a fine-tuned DeBERTa
14
+ * model from ProtectAI, specifically trained to distinguish benign user
15
+ * messages from prompt-injection payloads. It outputs two labels:
16
+ * - `INJECTION` — high-confidence injection attempt
17
+ * - `SAFE` — normal user input
18
+ *
19
+ * Graceful degradation
20
+ * --------------------
21
+ * If the model fails to load the classifier sets `unavailable = true` and
22
+ * returns a pass result `{ bestClass: 'benign', confidence: 0, allScores: [] }`
23
+ * on every subsequent call.
24
+ *
25
+ * @module agentos/extensions/packs/ml-classifiers/classifiers/InjectionClassifier
26
+ */
27
+ import { ML_CLASSIFIER_SERVICE_IDS } from '../types';
28
+ // ---------------------------------------------------------------------------
29
+ // InjectionClassifier
30
+ // ---------------------------------------------------------------------------
31
/**
 * Binary prompt-injection classifier backed by
 * `protectai/deberta-v3-small-prompt-injection-v2`.
 *
 * The model is documented as emitting two labels:
 * - `INJECTION` — the text contains an injection attempt
 * - `SAFE` — the text is clean
 *
 * The label with the highest score becomes `bestClass` / `confidence`; every
 * score returned by the pipeline is echoed in `allScores`.
 *
 * Failure behaviour: when the pipeline cannot be created the classifier
 * disables itself, logs a single warning, and from then on reports the
 * "pass" result (`bestClass: 'benign'`, `confidence: 0`). Errors raised by
 * inference itself are not caught and propagate to the caller.
 *
 * @implements {IContentClassifier}
 *
 * @example
 * ```typescript
 * const classifier = new InjectionClassifier(serviceRegistry);
 * const result = await classifier.classify('Ignore previous instructions and …');
 * // result.bestClass === 'INJECTION', result.confidence ≈ 0.97
 * ```
 */
export class InjectionClassifier {
    /** Shared service registry that owns the cached pipeline instance. */
    services;
    /** Optional per-classifier configuration (only `modelId` is read here). */
    config;
    // -------------------------------------------------------------------------
    // IContentClassifier identity fields
    // -------------------------------------------------------------------------
    /** Unique service identifier for this classifier. */
    id = 'prompt-injection';
    /** Human-readable name for dashboards and log output. */
    displayName = 'Prompt Injection Classifier';
    /** Short description of what this classifier detects. */
    description = 'Detects prompt-injection attempts where adversarial instructions are ' +
        'embedded in user input to override or hijack the agent system prompt.';
    /**
     * Default Hugging Face model ID, used when `config.modelId` is nullish.
     */
    modelId = 'protectai/deberta-v3-small-prompt-injection-v2';
    // -------------------------------------------------------------------------
    // Internal state
    // -------------------------------------------------------------------------
    /**
     * `true` once `classify()` has obtained the pipeline from the registry;
     * reset to `false` by `dispose()`.
     */
    _isLoaded = false;
    /**
     * Latched to `true` when creating the pipeline fails. While set, every
     * `classify()` call short-circuits to the pass result instead of retrying
     * the load. Nothing in this class clears the flag.
     */
    unavailable = false;
    // -------------------------------------------------------------------------
    // Constructor
    // -------------------------------------------------------------------------
    /**
     * @param services - Shared service registry; must provide
     *   `getOrCreate(id, factory, options)` and `release(id)`.
     * @param config - Optional configuration. A non-nullish `config.modelId`
     *   overrides the default `modelId` when the pipeline is created.
     */
    constructor(services, config) {
        this.services = services;
        this.config = config;
    }
    // -------------------------------------------------------------------------
    // IContentClassifier.isLoaded (getter)
    // -------------------------------------------------------------------------
    /**
     * Whether the pipeline has been obtained from the registry and not yet
     * released through `dispose()`.
     */
    get isLoaded() {
        return this._isLoaded;
    }
    // -------------------------------------------------------------------------
    // classify
    // -------------------------------------------------------------------------
    /**
     * Run prompt-injection inference on `text`.
     *
     * The pipeline is fetched from (or lazily created in) the shared registry
     * on every call; the registry is expected to cache it.
     *
     * @param text - The text to evaluate.
     * @returns A promise resolving to the classification result. When the
     *   pipeline cannot be created the pass result is returned instead of
     *   throwing. Errors thrown by inference itself are propagated.
     */
    async classify(text) {
        // A previous load failure is permanent for this instance: fail open.
        if (this.unavailable) {
            return this.passResult();
        }
        let pipeline;
        try {
            pipeline = await this.services.getOrCreate(ML_CLASSIFIER_SERVICE_IDS.INJECTION_PIPELINE, async () => {
                // Dynamic import so environments without @huggingface/transformers
                // can still load the rest of the package.
                const { pipeline: createPipeline } = await import('@huggingface/transformers');
                // NOTE(review): `quantized` is the option name used by older
                // Transformers.js releases; newer ones appear to select precision
                // via `dtype` — confirm against the installed version.
                return createPipeline('text-classification', this.config?.modelId ?? this.modelId, { quantized: true });
            }, {
                /** Release pipeline resources when the registry entry is evicted. */
                dispose: async (p) => p?.dispose?.(),
                /** Tags used for diagnostics and capability discovery. */
                tags: ['ml', 'classifier', 'prompt-injection', 'onnx'],
            });
            this._isLoaded = true;
        }
        catch (err) {
            // Failing open on a security classifier must not be silent: surface
            // the cause once (later calls short-circuit on `unavailable`).
            this.unavailable = true;
            console.warn(`[InjectionClassifier] Failed to load model "${this.config?.modelId ?? this.modelId}"; ` +
                'classifier disabled, all input will be reported as benign.', err);
            return this.passResult();
        }
        // Ask for every label's score. Current Transformers.js releases read
        // `top_k`; older ones read `topk`. Unknown keys are ignored, so sending
        // both keeps `allScores` complete on either version.
        const raw = await pipeline(text, { top_k: null, topk: null });
        return this.mapResult(raw);
    }
    // -------------------------------------------------------------------------
    // dispose (optional IContentClassifier lifecycle hook)
    // -------------------------------------------------------------------------
    /**
     * Release the pipeline instance from the shared service registry and mark
     * the classifier as not loaded.
     *
     * NOTE(review): repeated calls are only safe if the registry's `release()`
     * tolerates an already-released id — confirm.
     */
    async dispose() {
        await this.services.release(ML_CLASSIFIER_SERVICE_IDS.INJECTION_PIPELINE);
        this._isLoaded = false;
    }
    // -------------------------------------------------------------------------
    // Private helpers
    // -------------------------------------------------------------------------
    /**
     * Build the "pass" result used when no scores are available.
     *
     * @returns `{ bestClass: 'benign', confidence: 0, allScores: [] }` — a
     *   zero-confidence benign verdict (presumably treated as "allow" by the
     *   orchestrator; that code is not visible here).
     */
    passResult() {
        return { bestClass: 'benign', confidence: 0, allScores: [] };
    }
    /**
     * Map the raw pipeline output to a classification result.
     *
     * Accepts a flat list of `{ label, score }` items, a batched list holding
     * one such list per input (the first is used), or a single item. Empty or
     * missing output yields the pass result.
     *
     * @param raw - Value returned by the pipeline call.
     * @returns The highest-scoring label as `bestClass` / `confidence` (the
     *   earliest item wins ties) plus every item mapped into `allScores`.
     */
    mapResult(raw) {
        let scores;
        if (Array.isArray(raw)) {
            // Batched calls wrap each input's scores in an outer array.
            scores = Array.isArray(raw[0]) ? raw[0] : raw;
        }
        else {
            scores = raw ? [raw] : [];
        }
        if (scores.length === 0) {
            return this.passResult();
        }
        // Strict `>` keeps the first item on ties.
        const best = scores.reduce((top, item) => (item.score > top.score ? item : top));
        return {
            bestClass: best.label,
            confidence: best.score,
            allScores: scores.map(({ label, score }) => ({ classLabel: label, score })),
        };
    }
}
210
+ //# sourceMappingURL=InjectionClassifier.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"InjectionClassifier.js","sourceRoot":"","sources":["../../src/classifiers/InjectionClassifier.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;;;;;;;;;GAyBG;AAMH,OAAO,EAAE,yBAAyB,EAAE,MAAM,UAAU,CAAC;AAiBrD,8EAA8E;AAC9E,sBAAsB;AACtB,8EAA8E;AAE9E;;;;;;;;;;;;;;;;;;;;GAoBG;AACH,MAAM,OAAO,mBAAmB;IAmDX;IACA;IAnDnB,4EAA4E;IAC5E,qCAAqC;IACrC,4EAA4E;IAE5E,qDAAqD;IAC5C,EAAE,GAAG,kBAAkB,CAAC;IAEjC,yDAAyD;IAChD,WAAW,GAAG,6BAA6B,CAAC;IAErD,yDAAyD;IAChD,WAAW,GAClB,uEAAuE;QACvE,uEAAuE,CAAC;IAE1E;;;OAGG;IACM,OAAO,GAAG,gDAAgD,CAAC;IAEpE,4EAA4E;IAC5E,iBAAiB;IACjB,4EAA4E;IAE5E;;;OAGG;IACK,SAAS,GAAG,KAAK,CAAC;IAE1B;;;;OAIG;IACK,WAAW,GAAG,KAAK,CAAC;IAE5B,4EAA4E;IAC5E,cAAc;IACd,4EAA4E;IAE5E;;;;;;OAMG;IACH,YACmB,QAAgC,EAChC,MAAyB;QADzB,aAAQ,GAAR,QAAQ,CAAwB;QAChC,WAAM,GAAN,MAAM,CAAmB;IACzC,CAAC;IAEJ,4EAA4E;IAC5E,uCAAuC;IACvC,4EAA4E;IAE5E;;;OAGG;IACH,IAAI,QAAQ;QACV,OAAO,IAAI,CAAC,SAAS,CAAC;IACxB,CAAC;IAED,4EAA4E;IAC5E,WAAW;IACX,4EAA4E;IAE5E;;;;;;;;;;OAUG;IACH,KAAK,CAAC,QAAQ,CAAC,IAAY;QACzB,6EAA6E;QAC7E,IAAI,IAAI,CAAC,WAAW,EAAE,CAAC;YACrB,OAAO,IAAI,CAAC,UAAU,EAAE,CAAC;QAC3B,CAAC;QAED,uEAAuE;QACvE,gEAAgE;QAChE,IAAI,QAAqE,CAAC;QAC1E,IAAI,CAAC;YACH,QAAQ,GAAG,MAAM,IAAI,CAAC,QAAQ,CAAC,WAAW,CACxC,yBAAyB,CAAC,kBAAkB,EAC5C,KAAK,IAAI,EAAE;gBACT,mEAAmE;gBACnE,sCAAsC;gBACtC,MAAM,EAAE,QAAQ,EAAE,cAAc,EAAE,GAAG,MAAM,MAAM,CAC/C,2BAA2B,CAC5B,CAAC;gBACF,OAAO,cAAc,CACnB,qBAAqB;gBACrB,qEAAqE;gBACrE,IAAI,CAAC,MAAM,EAAE,OAAO,IAAI,IAAI,CAAC,OAAO,EACpC,EAAE,SAAS,EAAE,IAAI,EAAE,CACpB,CAAC;YACJ,CAAC,EACD;gBACE,sEAAsE;gBACtE,OAAO,EAAE,KAAK,EAAE,CAAM,EAAE,EAAE,CAAC,CAAC,EAAE,OAAO,EAAE,EAAE;gBACzC,0DAA0D;gBAC1D,IAAI,EAAE,CAAC,IAAI,EAAE,YAAY,EAAE,kBAAkB,EAAE,MAAM,CAAC;aACvD,CACF,CAAC;YAEF,mEAAmE;YACnE,IAAI,CAAC,SAAS,GAAG,IAAI,CAAC;QACxB,CAAC;QAAC,MAAM,CAAC;YACP,yEAAyE;YACzE,IAAI,CAAC,WAAW,GAAG,IAAI,CAAC;YACxB,OAAO,IAAI,CAAC,UAAU,EAAE,CAAC;QAC3B,CAAC;QAED,+CAA+C;QAC/C,MAAM,GAAG,GAAG,MAAM,QAAQ,CAAC,IAAI,EAAE,EAAE,IAAI,EAAE,IAAI,EAAE,CAAC,CAAC;QACjD,OAAO,IAAI,CAAC,SAAS,CAAC,GAAG,CAAC,CAAC;IAC7B,CAAC;
IAED,4EAA4E;IAC5E,uDAAuD;IACvD,4EAA4E;IAE5E;;;;OAIG;IACH,KAAK,CAAC,OAAO;QACX,MAAM,IAAI,CAAC,QAAQ,CAAC,OAAO,CAAC,yBAAyB,CAAC,kBAAkB,CAAC,CAAC;QAC1E,IAAI,CAAC,SAAS,GAAG,KAAK,CAAC;IACzB,CAAC;IAED,4EAA4E;IAC5E,kBAAkB;IAClB,4EAA4E;IAE5E;;;;;OAKG;IACK,UAAU;QAChB,OAAO,EAAE,SAAS,EAAE,QAAQ,EAAE,UAAU,EAAE,CAAC,EAAE,SAAS,EAAE,EAAE,EAAE,CAAC;IAC/D,CAAC;IAED;;;;;;;;OAQG;IACK,SAAS,CAAC,GAAe;QAC/B,IAAI,CAAC,GAAG,IAAI,GAAG,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;YAC7B,OAAO,IAAI,CAAC,UAAU,EAAE,CAAC;QAC3B,CAAC;QAED,6EAA6E;QAC7E,IAAI,IAAI,GAAG,GAAG,CAAC,CAAC,CAAC,CAAC;QAClB,KAAK,MAAM,IAAI,IAAI,GAAG,EAAE,CAAC;YACvB,IAAI,IAAI,CAAC,KAAK,GAAG,IAAI,CAAC,KAAK,EAAE,CAAC;gBAC5B,IAAI,GAAG,IAAI,CAAC;YACd,CAAC;QACH,CAAC;QAED,OAAO;YACL,SAAS,EAAE,IAAI,CAAC,KAAK;YACrB,UAAU,EAAE,IAAI,CAAC,KAAK;YACtB,SAAS,EAAE,GAAG,CAAC,GAAG,CAAC,CAAC,IAAI,EAAE,EAAE,CAAC,CAAC;gBAC5B,UAAU,EAAE,IAAI,CAAC,KAAK;gBACtB,KAAK,EAAE,IAAI,CAAC,KAAK;aAClB,CAAC,CAAC;SACJ,CAAC;IACJ,CAAC;CACF"}
@@ -0,0 +1,124 @@
1
+ /**
2
+ * @fileoverview Jailbreak content classifier using Meta's `PromptGuard-86M`
3
+ * model.
4
+ *
5
+ * Jailbreak attempts are adversarial prompts specifically crafted to bypass
6
+ * an LLM's safety guidelines — e.g. "DAN mode", role-play exploits, or
7
+ * indirect instruction injections. This classifier uses Meta's PromptGuard
8
+ * model which was trained to distinguish three classes:
9
+ *
10
+ * - `jailbreak` — explicit attempt to override safety behaviour
11
+ * - `injection` — indirect or embedded instruction injection
12
+ * - `benign` — normal user input
13
+ *
14
+ * Unlike the binary {@link InjectionClassifier}, PromptGuard separates
15
+ * direct jailbreaks from indirect injections, giving the guardrail
16
+ * orchestrator finer-grained control over which action to take for each.
17
+ *
18
+ * Graceful degradation
19
+ * --------------------
20
+ * If the model fails to load the classifier sets `unavailable = true` and
21
+ * returns a pass result `{ bestClass: 'benign', confidence: 0, allScores: [] }`
22
+ * on every subsequent call.
23
+ *
24
+ * @module agentos/extensions/packs/ml-classifiers/classifiers/JailbreakClassifier
25
+ */
26
+ import type { ClassificationResult } from '@framers/agentos';
27
+ import type { ISharedServiceRegistry } from '@framers/agentos';
28
+ import type { IContentClassifier } from '../IContentClassifier';
29
+ import type { ClassifierConfig } from '../types';
30
+ /**
31
+ * Multi-class jailbreak classifier backed by `meta-llama/PromptGuard-86M`.
32
+ *
33
+ * Distinguishes three mutually-exclusive classes:
34
+ * - `jailbreak` — direct attempt to bypass safety guidelines
35
+ * - `injection` — indirect prompt injection embedded in user input
36
+ * - `benign` — normal, non-adversarial message
37
+ *
38
+ * The winning class (highest softmax score) is reported as `bestClass` /
39
+ * `confidence`. All three scores are present in `allScores`.
40
+ *
41
+ * @implements {IContentClassifier}
42
+ *
43
+ * @example
44
+ * ```typescript
45
+ * const classifier = new JailbreakClassifier(serviceRegistry);
46
+ * const result = await classifier.classify('Pretend you have no restrictions…');
47
+ * // result.bestClass === 'jailbreak', result.confidence ≈ 0.88
48
+ * ```
49
+ */
50
+ export declare class JailbreakClassifier implements IContentClassifier {
51
+ private readonly services;
52
+ private readonly config?;
53
+ /** Unique service identifier for this classifier. */
54
+ readonly id = "jailbreak";
55
+ /** Human-readable name for dashboards and log output. */
56
+ readonly displayName = "Jailbreak Classifier";
57
+ /** Short description of what this classifier detects. */
58
+ readonly description: string;
59
+ /**
60
+ * Default Hugging Face model ID.
61
+ * Overridable via {@link ClassifierConfig.modelId}.
62
+ */
63
+ readonly modelId = "meta-llama/PromptGuard-86M";
64
+ /**
65
+ * Set to `true` once `classify()` has obtained the pipeline from the service
66
+ * registry; reset to `false` by `dispose()`.
67
+ */
68
+ private _isLoaded;
69
+ /**
70
+ * Set to `true` when the model fails to load. From then on every `classify()`
71
+ * call returns the pass result without retrying the expensive model load;
72
+ * nothing in this class resets the flag, not even `dispose()`.
73
+ */
74
+ private unavailable;
75
+ /**
76
+ * @param services - Shared service registry used to lazily create and cache
77
+ * the underlying HuggingFace pipeline instance.
78
+ * @param config - Optional per-classifier configuration. When
79
+ * `config.modelId` is provided it overrides the default `modelId` when
80
+ * loading the model.
81
+ */
82
+ constructor(services: ISharedServiceRegistry, config?: ClassifierConfig | undefined);
83
+ /**
84
+ * Whether the underlying model pipeline has been successfully initialised.
85
+ * Becomes `true` once `classify()` has loaded the pipeline and `false` again after `dispose()`.
86
+ */
87
+ get isLoaded(): boolean;
88
+ /**
89
+ * Run jailbreak inference on `text`.
90
+ *
91
+ * Lazily loads the pipeline on the first call via the shared service
92
+ * registry, then calls it with `{ topk: null }` to retrieve scores for all
93
+ * three classes.
94
+ *
95
+ * @param text - The text to evaluate.
96
+ * @returns A promise that resolves with the classification result. A model-load
97
+ * failure yields the pass result; an error thrown during inference propagates.
98
+ */
99
+ classify(text: string): Promise<ClassificationResult>;
100
+ /**
101
+ * Release the pipeline instance from the shared service registry.
102
+ *
103
+ * NOTE(review): documented as idempotent; that depends on the registry's `release()` — confirm.
104
+ */
105
+ dispose(): Promise<void>;
106
+ /**
107
+ * Returns a "pass" result used when the model is unavailable.
108
+ *
109
+ * A pass result reports `bestClass: 'benign'` with zero confidence so the
110
+ * guardrail orchestrator will always choose {@link GuardrailAction.ALLOW}.
111
+ */
112
+ private passResult;
113
+ /**
114
+ * Map the raw pipeline output to a {@link ClassificationResult}.
115
+ *
116
+ * For multi-class classification the label with the highest softmax score
117
+ * becomes `bestClass` / `confidence`. All three labels are included in
118
+ * `allScores`.
119
+ *
120
+ * @param raw - Array returned by the pipeline when called with `topk: null`.
121
+ */
122
+ private mapResult;
123
+ }
124
+ //# sourceMappingURL=JailbreakClassifier.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"JailbreakClassifier.d.ts","sourceRoot":"","sources":["../../src/classifiers/JailbreakClassifier.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;;;;;;;;GAwBG;AAEH,OAAO,KAAK,EAAE,oBAAoB,EAAE,MAAM,kBAAkB,CAAC;AAC7D,OAAO,KAAK,EAAE,sBAAsB,EAAE,MAAM,kBAAkB,CAAC;AAC/D,OAAO,KAAK,EAAE,kBAAkB,EAAE,MAAM,uBAAuB,CAAC;AAChE,OAAO,KAAK,EAAE,gBAAgB,EAAE,MAAM,UAAU,CAAC;AAsBjD;;;;;;;;;;;;;;;;;;;GAmBG;AACH,qBAAa,mBAAoB,YAAW,kBAAkB;IAmD1D,OAAO,CAAC,QAAQ,CAAC,QAAQ;IACzB,OAAO,CAAC,QAAQ,CAAC,MAAM,CAAC;IA/C1B,qDAAqD;IACrD,QAAQ,CAAC,EAAE,eAAe;IAE1B,yDAAyD;IACzD,QAAQ,CAAC,WAAW,0BAA0B;IAE9C,yDAAyD;IACzD,QAAQ,CAAC,WAAW,SAEoC;IAExD;;;OAGG;IACH,QAAQ,CAAC,OAAO,gCAAgC;IAMhD;;;OAGG;IACH,OAAO,CAAC,SAAS,CAAS;IAE1B;;;;OAIG;IACH,OAAO,CAAC,WAAW,CAAS;IAM5B;;;;;;OAMG;gBAEgB,QAAQ,EAAE,sBAAsB,EAChC,MAAM,CAAC,EAAE,gBAAgB,YAAA;IAO5C;;;OAGG;IACH,IAAI,QAAQ,IAAI,OAAO,CAEtB;IAMD;;;;;;;;;;OAUG;IACG,QAAQ,CAAC,IAAI,EAAE,MAAM,GAAG,OAAO,CAAC,oBAAoB,CAAC;IAkD3D;;;;OAIG;IACG,OAAO,IAAI,OAAO,CAAC,IAAI,CAAC;IAS9B;;;;;OAKG;IACH,OAAO,CAAC,UAAU;IAIlB;;;;;;;;OAQG;IACH,OAAO,CAAC,SAAS;CAsBlB"}
@@ -0,0 +1,208 @@
1
+ /**
2
+ * @fileoverview Jailbreak content classifier using Meta's `PromptGuard-86M`
3
+ * model.
4
+ *
5
+ * Jailbreak attempts are adversarial prompts specifically crafted to bypass
6
+ * an LLM's safety guidelines — e.g. "DAN mode", role-play exploits, or
7
+ * indirect instruction injections. This classifier uses Meta's PromptGuard
8
+ * model which was trained to distinguish three classes:
9
+ *
10
+ * - `jailbreak` — explicit attempt to override safety behaviour
11
+ * - `injection` — indirect or embedded instruction injection
12
+ * - `benign` — normal user input
13
+ *
14
+ * Unlike the binary {@link InjectionClassifier}, PromptGuard separates
15
+ * direct jailbreaks from indirect injections, giving the guardrail
16
+ * orchestrator finer-grained control over which action to take for each.
17
+ *
18
+ * Graceful degradation
19
+ * --------------------
20
+ * If the model fails to load the classifier sets `unavailable = true` and
21
+ * returns a pass result `{ bestClass: 'benign', confidence: 0, allScores: [] }`
22
+ * on every subsequent call.
23
+ *
24
+ * @module agentos/extensions/packs/ml-classifiers/classifiers/JailbreakClassifier
25
+ */
26
+ import { ML_CLASSIFIER_SERVICE_IDS } from '../types';
27
+ // ---------------------------------------------------------------------------
28
+ // JailbreakClassifier
29
+ // ---------------------------------------------------------------------------
30
/**
 * Multi-class jailbreak classifier backed by `meta-llama/PromptGuard-86M`.
 *
 * The model is documented as distinguishing three mutually-exclusive classes:
 * - `jailbreak` — direct attempt to bypass safety guidelines
 * - `injection` — indirect prompt injection embedded in user input
 * - `benign` — normal, non-adversarial message
 *
 * The highest-scoring class is reported as `bestClass` / `confidence`; every
 * score returned by the pipeline is echoed in `allScores`.
 *
 * Failure behaviour: when the pipeline cannot be created the classifier
 * disables itself, logs a single warning, and from then on reports the
 * "pass" result (`bestClass: 'benign'`, `confidence: 0`). Errors raised by
 * inference itself are not caught and propagate to the caller.
 *
 * @implements {IContentClassifier}
 *
 * @example
 * ```typescript
 * const classifier = new JailbreakClassifier(serviceRegistry);
 * const result = await classifier.classify('Pretend you have no restrictions…');
 * // result.bestClass === 'jailbreak', result.confidence ≈ 0.88
 * ```
 */
export class JailbreakClassifier {
    /** Shared service registry that owns the cached pipeline instance. */
    services;
    /** Optional per-classifier configuration (only `modelId` is read here). */
    config;
    // -------------------------------------------------------------------------
    // IContentClassifier identity fields
    // -------------------------------------------------------------------------
    /** Unique service identifier for this classifier. */
    id = 'jailbreak';
    /** Human-readable name for dashboards and log output. */
    displayName = 'Jailbreak Classifier';
    /** Short description of what this classifier detects. */
    description = 'Detects jailbreak and indirect injection attacks using Meta PromptGuard. ' +
        'Classifies text as jailbreak, injection, or benign.';
    /**
     * Default Hugging Face model ID, used when `config.modelId` is nullish.
     */
    modelId = 'meta-llama/PromptGuard-86M';
    // -------------------------------------------------------------------------
    // Internal state
    // -------------------------------------------------------------------------
    /**
     * `true` once `classify()` has obtained the pipeline from the registry;
     * reset to `false` by `dispose()`.
     */
    _isLoaded = false;
    /**
     * Latched to `true` when creating the pipeline fails. While set, every
     * `classify()` call short-circuits to the pass result instead of retrying
     * the load. Nothing in this class clears the flag.
     */
    unavailable = false;
    // -------------------------------------------------------------------------
    // Constructor
    // -------------------------------------------------------------------------
    /**
     * @param services - Shared service registry; must provide
     *   `getOrCreate(id, factory, options)` and `release(id)`.
     * @param config - Optional configuration. A non-nullish `config.modelId`
     *   overrides the default `modelId` when the pipeline is created.
     */
    constructor(services, config) {
        this.services = services;
        this.config = config;
    }
    // -------------------------------------------------------------------------
    // IContentClassifier.isLoaded (getter)
    // -------------------------------------------------------------------------
    /**
     * Whether the pipeline has been obtained from the registry and not yet
     * released through `dispose()`.
     */
    get isLoaded() {
        return this._isLoaded;
    }
    // -------------------------------------------------------------------------
    // classify
    // -------------------------------------------------------------------------
    /**
     * Run jailbreak inference on `text`.
     *
     * The pipeline is fetched from (or lazily created in) the shared registry
     * on every call; the registry is expected to cache it.
     *
     * @param text - The text to evaluate.
     * @returns A promise resolving to the classification result. When the
     *   pipeline cannot be created the pass result is returned instead of
     *   throwing. Errors thrown by inference itself are propagated.
     */
    async classify(text) {
        // A previous load failure is permanent for this instance: fail open.
        if (this.unavailable) {
            return this.passResult();
        }
        let pipeline;
        try {
            pipeline = await this.services.getOrCreate(ML_CLASSIFIER_SERVICE_IDS.JAILBREAK_PIPELINE, async () => {
                // Dynamic import so environments without @huggingface/transformers
                // can still load the rest of the package.
                const { pipeline: createPipeline } = await import('@huggingface/transformers');
                // NOTE(review): `quantized` is the option name used by older
                // Transformers.js releases; newer ones appear to select precision
                // via `dtype` — confirm against the installed version.
                return createPipeline('text-classification', this.config?.modelId ?? this.modelId, { quantized: true });
            }, {
                /** Release pipeline resources when the registry entry is evicted. */
                dispose: async (p) => p?.dispose?.(),
                /** Tags used for diagnostics and capability discovery. */
                tags: ['ml', 'classifier', 'jailbreak', 'onnx'],
            });
            this._isLoaded = true;
        }
        catch (err) {
            // Failing open on a security classifier must not be silent: surface
            // the cause once (later calls short-circuit on `unavailable`).
            this.unavailable = true;
            console.warn(`[JailbreakClassifier] Failed to load model "${this.config?.modelId ?? this.modelId}"; ` +
                'classifier disabled, all input will be reported as benign.', err);
            return this.passResult();
        }
        // Ask for every class score. Current Transformers.js releases read
        // `top_k`; older ones read `topk`. Unknown keys are ignored, so sending
        // both keeps `allScores` complete on either version.
        const raw = await pipeline(text, { top_k: null, topk: null });
        return this.mapResult(raw);
    }
    // -------------------------------------------------------------------------
    // dispose (optional IContentClassifier lifecycle hook)
    // -------------------------------------------------------------------------
    /**
     * Release the pipeline instance from the shared service registry and mark
     * the classifier as not loaded.
     *
     * NOTE(review): repeated calls are only safe if the registry's `release()`
     * tolerates an already-released id — confirm.
     */
    async dispose() {
        await this.services.release(ML_CLASSIFIER_SERVICE_IDS.JAILBREAK_PIPELINE);
        this._isLoaded = false;
    }
    // -------------------------------------------------------------------------
    // Private helpers
    // -------------------------------------------------------------------------
    /**
     * Build the "pass" result used when no scores are available.
     *
     * @returns `{ bestClass: 'benign', confidence: 0, allScores: [] }` — a
     *   zero-confidence benign verdict (presumably treated as "allow" by the
     *   orchestrator; that code is not visible here).
     */
    passResult() {
        return { bestClass: 'benign', confidence: 0, allScores: [] };
    }
    /**
     * Map the raw pipeline output to a classification result.
     *
     * Accepts a flat list of `{ label, score }` items, a batched list holding
     * one such list per input (the first is used), or a single item. Empty or
     * missing output yields the pass result.
     *
     * @param raw - Value returned by the pipeline call.
     * @returns The highest-scoring label as `bestClass` / `confidence` (the
     *   earliest item wins ties) plus every item mapped into `allScores`.
     */
    mapResult(raw) {
        let scores;
        if (Array.isArray(raw)) {
            // Batched calls wrap each input's scores in an outer array.
            scores = Array.isArray(raw[0]) ? raw[0] : raw;
        }
        else {
            scores = raw ? [raw] : [];
        }
        if (scores.length === 0) {
            return this.passResult();
        }
        // Winner-takes-all; strict `>` keeps the first item on ties.
        const best = scores.reduce((top, item) => (item.score > top.score ? item : top));
        return {
            bestClass: best.label,
            confidence: best.score,
            allScores: scores.map(({ label, score }) => ({ classLabel: label, score })),
        };
    }
}
208
+ //# sourceMappingURL=JailbreakClassifier.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"JailbreakClassifier.js","sourceRoot":"","sources":["../../src/classifiers/JailbreakClassifier.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;;;;;;;;GAwBG;AAMH,OAAO,EAAE,yBAAyB,EAAE,MAAM,UAAU,CAAC;AAiBrD,8EAA8E;AAC9E,sBAAsB;AACtB,8EAA8E;AAE9E;;;;;;;;;;;;;;;;;;;GAmBG;AACH,MAAM,OAAO,mBAAmB;IAmDX;IACA;IAnDnB,4EAA4E;IAC5E,qCAAqC;IACrC,4EAA4E;IAE5E,qDAAqD;IAC5C,EAAE,GAAG,WAAW,CAAC;IAE1B,yDAAyD;IAChD,WAAW,GAAG,sBAAsB,CAAC;IAE9C,yDAAyD;IAChD,WAAW,GAClB,2EAA2E;QAC3E,qDAAqD,CAAC;IAExD;;;OAGG;IACM,OAAO,GAAG,4BAA4B,CAAC;IAEhD,4EAA4E;IAC5E,iBAAiB;IACjB,4EAA4E;IAE5E;;;OAGG;IACK,SAAS,GAAG,KAAK,CAAC;IAE1B;;;;OAIG;IACK,WAAW,GAAG,KAAK,CAAC;IAE5B,4EAA4E;IAC5E,cAAc;IACd,4EAA4E;IAE5E;;;;;;OAMG;IACH,YACmB,QAAgC,EAChC,MAAyB;QADzB,aAAQ,GAAR,QAAQ,CAAwB;QAChC,WAAM,GAAN,MAAM,CAAmB;IACzC,CAAC;IAEJ,4EAA4E;IAC5E,uCAAuC;IACvC,4EAA4E;IAE5E;;;OAGG;IACH,IAAI,QAAQ;QACV,OAAO,IAAI,CAAC,SAAS,CAAC;IACxB,CAAC;IAED,4EAA4E;IAC5E,WAAW;IACX,4EAA4E;IAE5E;;;;;;;;;;OAUG;IACH,KAAK,CAAC,QAAQ,CAAC,IAAY;QACzB,6EAA6E;QAC7E,IAAI,IAAI,CAAC,WAAW,EAAE,CAAC;YACrB,OAAO,IAAI,CAAC,UAAU,EAAE,CAAC;QAC3B,CAAC;QAED,qEAAqE;QACrE,gEAAgE;QAChE,IAAI,QAAqE,CAAC;QAC1E,IAAI,CAAC;YACH,QAAQ,GAAG,MAAM,IAAI,CAAC,QAAQ,CAAC,WAAW,CACxC,yBAAyB,CAAC,kBAAkB,EAC5C,KAAK,IAAI,EAAE;gBACT,kEAAkE;gBAClE,8DAA8D;gBAC9D,MAAM,EAAE,QAAQ,EAAE,cAAc,EAAE,GAAG,MAAM,MAAM,CAC/C,2BAA2B,CAC5B,CAAC;gBACF,OAAO,cAAc,CACnB,qBAAqB;gBACrB,qEAAqE;gBACrE,IAAI,CAAC,MAAM,EAAE,OAAO,IAAI,IAAI,CAAC,OAAO,EACpC,EAAE,SAAS,EAAE,IAAI,EAAE,CACpB,CAAC;YACJ,CAAC,EACD;gBACE,sEAAsE;gBACtE,OAAO,EAAE,KAAK,EAAE,CAAM,EAAE,EAAE,CAAC,CAAC,EAAE,OAAO,EAAE,EAAE;gBACzC,0DAA0D;gBAC1D,IAAI,EAAE,CAAC,IAAI,EAAE,YAAY,EAAE,WAAW,EAAE,MAAM,CAAC;aAChD,CACF,CAAC;YAEF,mEAAmE;YACnE,IAAI,CAAC,SAAS,GAAG,IAAI,CAAC;QACxB,CAAC;QAAC,MAAM,CAAC;YACP,yEAAyE;YACzE,IAAI,CAAC,WAAW,GAAG,IAAI,CAAC;YACxB,OAAO,IAAI,CAAC,UAAU,EAAE,CAAC;QAC3B,CAAC;QAED,0DAA0D;QAC1D,MAAM,GAAG,GAAG,MAAM,QAAQ,CAAC,IAAI,EAAE,EAAE,IAAI,EAAE,IAAI,EAAE,CAAC,CAAC;QACjD,OAAO,IAAI,CAAC,SAAS,CAAC,GAAG,CAAC,CAAC;IAC7B,CAAC;IAED,4
EAA4E;IAC5E,uDAAuD;IACvD,4EAA4E;IAE5E;;;;OAIG;IACH,KAAK,CAAC,OAAO;QACX,MAAM,IAAI,CAAC,QAAQ,CAAC,OAAO,CAAC,yBAAyB,CAAC,kBAAkB,CAAC,CAAC;QAC1E,IAAI,CAAC,SAAS,GAAG,KAAK,CAAC;IACzB,CAAC;IAED,4EAA4E;IAC5E,kBAAkB;IAClB,4EAA4E;IAE5E;;;;;OAKG;IACK,UAAU;QAChB,OAAO,EAAE,SAAS,EAAE,QAAQ,EAAE,UAAU,EAAE,CAAC,EAAE,SAAS,EAAE,EAAE,EAAE,CAAC;IAC/D,CAAC;IAED;;;;;;;;OAQG;IACK,SAAS,CAAC,GAAe;QAC/B,IAAI,CAAC,GAAG,IAAI,GAAG,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;YAC7B,OAAO,IAAI,CAAC,UAAU,EAAE,CAAC;QAC3B,CAAC;QAED,kEAAkE;QAClE,IAAI,IAAI,GAAG,GAAG,CAAC,CAAC,CAAC,CAAC;QAClB,KAAK,MAAM,IAAI,IAAI,GAAG,EAAE,CAAC;YACvB,IAAI,IAAI,CAAC,KAAK,GAAG,IAAI,CAAC,KAAK,EAAE,CAAC;gBAC5B,IAAI,GAAG,IAAI,CAAC;YACd,CAAC;QACH,CAAC;QAED,OAAO;YACL,SAAS,EAAE,IAAI,CAAC,KAAK;YACrB,UAAU,EAAE,IAAI,CAAC,KAAK;YACtB,SAAS,EAAE,GAAG,CAAC,GAAG,CAAC,CAAC,IAAI,EAAE,EAAE,CAAC,CAAC;gBAC5B,UAAU,EAAE,IAAI,CAAC,KAAK;gBACtB,KAAK,EAAE,IAAI,CAAC,KAAK;aAClB,CAAC,CAAC;SACJ,CAAC;IACJ,CAAC;CACF"}