@framers/agentos-ext-ml-classifiers 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (62) hide show
  1. package/LICENSE +23 -0
  2. package/dist/ClassifierOrchestrator.d.ts +126 -0
  3. package/dist/ClassifierOrchestrator.d.ts.map +1 -0
  4. package/dist/ClassifierOrchestrator.js +239 -0
  5. package/dist/ClassifierOrchestrator.js.map +1 -0
  6. package/dist/IContentClassifier.d.ts +117 -0
  7. package/dist/IContentClassifier.d.ts.map +1 -0
  8. package/dist/IContentClassifier.js +22 -0
  9. package/dist/IContentClassifier.js.map +1 -0
  10. package/dist/MLClassifierGuardrail.d.ts +163 -0
  11. package/dist/MLClassifierGuardrail.d.ts.map +1 -0
  12. package/dist/MLClassifierGuardrail.js +335 -0
  13. package/dist/MLClassifierGuardrail.js.map +1 -0
  14. package/dist/SlidingWindowBuffer.d.ts +213 -0
  15. package/dist/SlidingWindowBuffer.d.ts.map +1 -0
  16. package/dist/SlidingWindowBuffer.js +246 -0
  17. package/dist/SlidingWindowBuffer.js.map +1 -0
  18. package/dist/classifiers/InjectionClassifier.d.ts +126 -0
  19. package/dist/classifiers/InjectionClassifier.d.ts.map +1 -0
  20. package/dist/classifiers/InjectionClassifier.js +210 -0
  21. package/dist/classifiers/InjectionClassifier.js.map +1 -0
  22. package/dist/classifiers/JailbreakClassifier.d.ts +124 -0
  23. package/dist/classifiers/JailbreakClassifier.d.ts.map +1 -0
  24. package/dist/classifiers/JailbreakClassifier.js +208 -0
  25. package/dist/classifiers/JailbreakClassifier.js.map +1 -0
  26. package/dist/classifiers/ToxicityClassifier.d.ts +125 -0
  27. package/dist/classifiers/ToxicityClassifier.d.ts.map +1 -0
  28. package/dist/classifiers/ToxicityClassifier.js +212 -0
  29. package/dist/classifiers/ToxicityClassifier.js.map +1 -0
  30. package/dist/classifiers/WorkerClassifierProxy.d.ts +158 -0
  31. package/dist/classifiers/WorkerClassifierProxy.d.ts.map +1 -0
  32. package/dist/classifiers/WorkerClassifierProxy.js +268 -0
  33. package/dist/classifiers/WorkerClassifierProxy.js.map +1 -0
  34. package/dist/index.d.ts +110 -0
  35. package/dist/index.d.ts.map +1 -0
  36. package/dist/index.js +342 -0
  37. package/dist/index.js.map +1 -0
  38. package/dist/tools/ClassifyContentTool.d.ts +105 -0
  39. package/dist/tools/ClassifyContentTool.d.ts.map +1 -0
  40. package/dist/tools/ClassifyContentTool.js +149 -0
  41. package/dist/tools/ClassifyContentTool.js.map +1 -0
  42. package/dist/types.d.ts +319 -0
  43. package/dist/types.d.ts.map +1 -0
  44. package/dist/types.js +62 -0
  45. package/dist/types.js.map +1 -0
  46. package/dist/worker/classifier-worker.d.ts +49 -0
  47. package/dist/worker/classifier-worker.d.ts.map +1 -0
  48. package/dist/worker/classifier-worker.js +180 -0
  49. package/dist/worker/classifier-worker.js.map +1 -0
  50. package/package.json +45 -0
  51. package/src/ClassifierOrchestrator.ts +290 -0
  52. package/src/IContentClassifier.ts +124 -0
  53. package/src/MLClassifierGuardrail.ts +419 -0
  54. package/src/SlidingWindowBuffer.ts +384 -0
  55. package/src/classifiers/InjectionClassifier.ts +261 -0
  56. package/src/classifiers/JailbreakClassifier.ts +259 -0
  57. package/src/classifiers/ToxicityClassifier.ts +263 -0
  58. package/src/classifiers/WorkerClassifierProxy.ts +366 -0
  59. package/src/index.ts +383 -0
  60. package/src/tools/ClassifyContentTool.ts +201 -0
  61. package/src/types.ts +391 -0
  62. package/src/worker/classifier-worker.ts +267 -0
@@ -0,0 +1,384 @@
1
+ /**
2
+ * @fileoverview Sliding-window text buffer for streaming ML classifier evaluation.
3
+ *
4
+ * When an LLM streams its response token-by-token, we cannot wait for the
5
+ * complete response before running safety classifiers — that would be too late
6
+ * to block or sanitise harmful content. At the same time, classifiers are
7
+ * expensive: running one on every individual token is wasteful and introduces
8
+ * unacceptable latency.
9
+ *
10
+ * `SlidingWindowBuffer` solves this by accumulating tokens from one or more
11
+ * concurrent streams and emitting a {@link ChunkReady} event only when enough
12
+ * tokens have accumulated to fill a `chunkSize`-token window. Each window
13
+ * also includes a `contextSize`-token "ring" from the previous chunk, so the
14
+ * classifier can reason about content that spans window boundaries.
15
+ *
16
+ * Architecture
17
+ * ------------
18
+ * - **Per-stream state**: Stored in a `Map<streamId, WindowState>`. Each
19
+ * stream is fully independent and can be used across multiple concurrent
20
+ * responses.
21
+ * - **Token estimation**: Uses the 4-chars-per-token heuristic for speed;
22
+ * callers that need exact counts should pre-tokenise text before pushing.
23
+ * - **Evaluation budget**: Once a stream reaches `maxEvaluations` chunks,
24
+ * `push()` returns `null` for all subsequent pushes, preventing unbounded
25
+ * classifier invocations on very long responses.
26
+ * - **Stale-stream pruning**: Streams that have not received data within
27
+ * `streamTimeoutMs` milliseconds are lazily evicted from the map to prevent
28
+ * memory leaks in long-running servers.
29
+ *
30
+ * @module agentos/extensions/packs/ml-classifiers/SlidingWindowBuffer
31
+ */
32
+
33
+ // ---------------------------------------------------------------------------
34
+ // Public configuration & result shapes
35
+ // ---------------------------------------------------------------------------
36
+
37
+ /**
38
+ * Configuration for a {@link SlidingWindowBuffer} instance.
39
+ *
40
+ * All fields are optional; unset fields fall back to the defaults shown below.
41
+ */
42
+ export interface SlidingWindowConfig {
43
+ /**
44
+ * Target window size in *estimated* tokens. When the accumulated buffer
45
+ * reaches or exceeds this many tokens, a {@link ChunkReady} is emitted and
46
+ * the buffer is slid forward.
47
+ *
48
+ * @default 200
49
+ */
50
+ chunkSize: number;
51
+
52
+ /**
53
+ * Number of tokens from the tail of the previous window to carry into the
54
+ * `text` field of the next {@link ChunkReady}. This overlap prevents
55
+ * boundary effects where a phrase split across two windows is misclassified.
56
+ *
57
+ * @default 50
58
+ */
59
+ contextSize: number;
60
+
61
+ /**
62
+ * Maximum number of {@link ChunkReady} events to emit per stream. After
63
+ * this budget is exhausted, `push()` returns `null` for the remainder of the
64
+ * stream. Use `flush()` to retrieve any buffered text that has not been
65
+ * emitted yet.
66
+ *
67
+ * @default 100
68
+ */
69
+ maxEvaluations: number;
70
+
71
+ /**
72
+ * Milliseconds of inactivity after which a stream is considered stale and
73
+ * eligible for eviction by {@link SlidingWindowBuffer.pruneStale}.
74
+ *
75
+ * @default 30000
76
+ */
77
+ streamTimeoutMs: number;
78
+ }
79
+
80
/**
 * Emitted by {@link SlidingWindowBuffer.push} (and by `flush`) when
 * sufficient tokens have accumulated to fill one evaluation window.
 */
export interface ChunkReady {
  /**
   * The full text to classify. Equals `contextRing + newText`, where
   * `contextRing` is the carried-forward tail from the previous window
   * (empty string for the first chunk of a stream). Always non-empty.
   */
  text: string;

  /**
   * Only the *new* text pushed since the last chunk was emitted (i.e. without
   * the context prefix). Useful for determining which part of the response
   * was newly evaluated.
   */
  newText: string;

  /**
   * 1-indexed sequence number for this chunk within the stream.
   * The first chunk emitted for a stream has `evaluationNumber === 1`.
   */
  evaluationNumber: number;
}
105
+
106
+ // ---------------------------------------------------------------------------
107
+ // Private per-stream state
108
+ // ---------------------------------------------------------------------------
109
+
110
/**
 * Internal state tracked for each active stream.
 *
 * @internal
 */
interface WindowState {
  /**
   * Accumulated text that has not yet been emitted in a chunk.
   * Emptied after each emitted chunk; the tail is first copied into
   * `contextRing` so the next window keeps cross-boundary context.
   */
  buffer: string;

  /**
   * Running count of *estimated* tokens in `buffer`.
   * Derived from `Math.ceil(buffer.length / 4)` (see `estimateTokens`).
   */
  tokenCount: number;

  /**
   * The context tail from the previous chunk. Prepended to `buffer` when
   * assembling the `text` field of {@link ChunkReady}.
   */
  contextRing: string;

  /**
   * Number of chunks already emitted for this stream.
   * Enforces the {@link SlidingWindowConfig.maxEvaluations} budget and is
   * the source of the 1-indexed `evaluationNumber`.
   */
  evaluationCount: number;

  /**
   * Unix timestamp (ms) of the last `push()` call for this stream.
   * Used by {@link SlidingWindowBuffer.pruneStale} to evict idle streams.
   */
  lastSeenAt: number;
}
147
+
148
+ // ---------------------------------------------------------------------------
149
+ // SlidingWindowBuffer implementation
150
+ // ---------------------------------------------------------------------------
151
+
152
+ /**
153
+ * A stateful, multi-stream text accumulator that emits fixed-size windows
154
+ * for ML classifier evaluation with configurable context carry-forward.
155
+ *
156
+ * @example
157
+ * ```typescript
158
+ * const buf = new SlidingWindowBuffer({ chunkSize: 200, contextSize: 50 });
159
+ *
160
+ * // Simulate streaming tokens
161
+ * for (const token of streamedTokens) {
162
+ * const chunk = buf.push('stream-1', token);
163
+ * if (chunk) {
164
+ * const result = await toxicityClassifier.classify(chunk.text);
165
+ * if (result.confidence > 0.9) terminateStream();
166
+ * }
167
+ * }
168
+ *
169
+ * // Evaluate remaining tokens
170
+ * const finalChunk = buf.flush('stream-1');
171
+ * if (finalChunk) {
172
+ * await toxicityClassifier.classify(finalChunk.text);
173
+ * }
174
+ * ```
175
+ */
176
+ export class SlidingWindowBuffer {
177
+ /** Resolved configuration (defaults applied). */
178
+ private readonly config: SlidingWindowConfig;
179
+
180
+ /**
181
+ * Per-stream state map. Keyed by the `streamId` passed to `push()`.
182
+ * Entries are created lazily on first push and removed on flush or prune.
183
+ */
184
+ private readonly streams: Map<string, WindowState> = new Map();
185
+
186
+ /**
187
+ * Construct a new buffer with the supplied configuration.
188
+ *
189
+ * @param config - Partial configuration; unset fields fall back to defaults:
190
+ * `chunkSize=200`, `contextSize=50`, `maxEvaluations=100`,
191
+ * `streamTimeoutMs=30000`.
192
+ */
193
+ constructor(config?: Partial<SlidingWindowConfig>) {
194
+ this.config = {
195
+ chunkSize: config?.chunkSize ?? 200,
196
+ contextSize: config?.contextSize ?? 50,
197
+ maxEvaluations: config?.maxEvaluations ?? 100,
198
+ streamTimeoutMs: config?.streamTimeoutMs ?? 30_000,
199
+ };
200
+ }
201
+
202
+ // -------------------------------------------------------------------------
203
+ // Public API
204
+ // -------------------------------------------------------------------------
205
+
206
+ /**
207
+ * Push new text into the buffer for the specified stream.
208
+ *
209
+ * Internally the text is appended to the stream's accumulation buffer.
210
+ * If the buffer's estimated token count reaches `chunkSize`, a
211
+ * {@link ChunkReady} is assembled and returned; the buffer is then reset
212
+ * (with the tail preserved as the context ring for the next window).
213
+ *
214
+ * Returns `null` when:
215
+ * - The buffer has not yet accumulated `chunkSize` tokens.
216
+ * - The stream has already emitted `maxEvaluations` chunks.
217
+ *
218
+ * When the map contains more than 10 streams, stale streams are pruned
219
+ * lazily after the push is processed.
220
+ *
221
+ * @param streamId - Opaque identifier for the stream (e.g. a request UUID).
222
+ * @param text - The new text fragment to accumulate.
223
+ * @returns A {@link ChunkReady} when an evaluation window is complete, or
224
+ * `null` if more data is needed (or the budget is exhausted).
225
+ */
226
+ push(streamId: string, text: string): ChunkReady | null {
227
+ if (!text) {
228
+ return null;
229
+ }
230
+
231
+ // Initialise state for a new stream.
232
+ if (!this.streams.has(streamId)) {
233
+ this.streams.set(streamId, {
234
+ buffer: '',
235
+ tokenCount: 0,
236
+ contextRing: '',
237
+ evaluationCount: 0,
238
+ lastSeenAt: Date.now(),
239
+ });
240
+ }
241
+
242
+ const state = this.streams.get(streamId)!;
243
+ state.lastSeenAt = Date.now();
244
+
245
+ // Respect the evaluation budget — stop emitting chunks once exhausted.
246
+ if (state.evaluationCount >= this.config.maxEvaluations) {
247
+ return null;
248
+ }
249
+
250
+ // Accumulate incoming text.
251
+ state.buffer += text;
252
+ state.tokenCount = this.estimateTokens(state.buffer);
253
+
254
+ // Lazy pruning: clean up stale streams whenever the map grows large.
255
+ // Done unconditionally (not just on chunk emit) so stale entries are
256
+ // reclaimed even when streams are slow to accumulate a full window.
257
+ if (this.streams.size > 10) {
258
+ this.pruneStale();
259
+ }
260
+
261
+ // Not enough tokens yet — wait for more.
262
+ if (state.tokenCount < this.config.chunkSize) {
263
+ return null;
264
+ }
265
+
266
+ // We have a full window. Assemble the chunk.
267
+ const chunk = this.assembleChunk(state);
268
+
269
+ // Slide the context ring forward: keep the last `contextSize` tokens'
270
+ // worth of characters from the buffer that was just emitted.
271
+ const contextCharBudget = this.config.contextSize * 4;
272
+ state.contextRing = state.buffer.slice(-contextCharBudget);
273
+
274
+ // Reset the buffer and token count for the next window.
275
+ state.buffer = '';
276
+ state.tokenCount = 0;
277
+ state.evaluationCount += 1;
278
+
279
+ return chunk;
280
+ }
281
+
282
+ /**
283
+ * Flush any remaining buffered text for the stream as a final chunk.
284
+ *
285
+ * Call this after the stream ends (e.g. when the LLM emits its final
286
+ * token) to ensure the classifier evaluates the tail of the response.
287
+ *
288
+ * The stream's state entry is removed from the map after flushing.
289
+ *
290
+ * @param streamId - Identifier of the stream to flush.
291
+ * @returns A {@link ChunkReady} for the remaining buffer, or `null` if the
292
+ * buffer is empty or the stream does not exist.
293
+ */
294
+ flush(streamId: string): ChunkReady | null {
295
+ const state = this.streams.get(streamId);
296
+
297
+ // Nothing to flush if the stream is unknown or the buffer is empty.
298
+ if (!state || state.buffer.length === 0) {
299
+ // Always clean up the map entry, even for empty buffers.
300
+ this.streams.delete(streamId);
301
+ return null;
302
+ }
303
+
304
+ const chunk = this.assembleChunk(state);
305
+ this.streams.delete(streamId);
306
+ return chunk;
307
+ }
308
+
309
+ /**
310
+ * Remove streams that have not received data within `streamTimeoutMs`.
311
+ *
312
+ * Called lazily by `push()` when the stream map grows beyond 10 entries.
313
+ * May also be called proactively by a maintenance timer.
314
+ */
315
+ pruneStale(): void {
316
+ const now = Date.now();
317
+ for (const [id, state] of this.streams) {
318
+ if (now - state.lastSeenAt > this.config.streamTimeoutMs) {
319
+ this.streams.delete(id);
320
+ }
321
+ }
322
+ }
323
+
324
+ /**
325
+ * Remove all stream state from the buffer.
326
+ *
327
+ * Useful for graceful shutdown or unit-test teardown to ensure no cross-test
328
+ * state leaks.
329
+ */
330
+ clear(): void {
331
+ this.streams.clear();
332
+ }
333
+
334
+ /**
335
+ * The number of streams currently tracked (including stale ones not yet
336
+ * pruned).
337
+ *
338
+ * Exposed primarily for testing and diagnostics.
339
+ */
340
+ get size(): number {
341
+ return this.streams.size;
342
+ }
343
+
344
+ // -------------------------------------------------------------------------
345
+ // Private helpers
346
+ // -------------------------------------------------------------------------
347
+
348
+ /**
349
+ * Assemble a {@link ChunkReady} from the current stream state.
350
+ *
351
+ * The `text` field is the concatenation of `contextRing` and the current
352
+ * `buffer`, giving the classifier cross-boundary context. The `newText`
353
+ * field is just the raw `buffer` so callers can distinguish old from new.
354
+ *
355
+ * @param state - The mutable state for the stream being assembled.
356
+ * @returns A fully-populated {@link ChunkReady}.
357
+ */
358
+ private assembleChunk(state: WindowState): ChunkReady {
359
+ const newText = state.buffer;
360
+ const text = state.contextRing + newText;
361
+ return {
362
+ text,
363
+ newText,
364
+ // evaluationCount is 0-indexed before increment, so +1 gives 1-indexed number.
365
+ evaluationNumber: state.evaluationCount + 1,
366
+ };
367
+ }
368
+
369
+ /**
370
+ * Estimate the number of LLM tokens in a string using the 4-chars-per-token
371
+ * heuristic.
372
+ *
373
+ * This deliberately mirrors {@link estimateTokens} from `core/utils/text-utils`
374
+ * without importing it, keeping this module self-contained and safe to load
375
+ * in Web Worker contexts where module resolution may differ.
376
+ *
377
+ * @param text - The string to estimate.
378
+ * @returns Non-negative integer token count estimate.
379
+ */
380
+ private estimateTokens(text: string): number {
381
+ if (!text) return 0;
382
+ return Math.ceil(text.length / 4);
383
+ }
384
+ }
@@ -0,0 +1,261 @@
1
+ /**
2
+ * @fileoverview Prompt-injection content classifier using the
3
+ * `protectai/deberta-v3-small-prompt-injection-v2` model.
4
+ *
5
+ * Prompt injection is the attack pattern where adversarial instructions are
6
+ * embedded inside user-supplied text to override or hijack the agent's system
7
+ * prompt. This classifier provides a dedicated binary signal (INJECTION /
8
+ * SAFE) that the guardrail orchestrator can act on independently of the
9
+ * toxicity or jailbreak classifiers.
10
+ *
11
+ * Model details
12
+ * -------------
13
+ * `protectai/deberta-v3-small-prompt-injection-v2` is a fine-tuned DeBERTa
14
+ * model from ProtectAI, specifically trained to distinguish benign user
15
+ * messages from prompt-injection payloads. It outputs two labels:
16
+ * - `INJECTION` — high-confidence injection attempt
17
+ * - `SAFE` — normal user input
18
+ *
19
+ * Graceful degradation
20
+ * --------------------
21
+ * If the model fails to load the classifier sets `unavailable = true` and
22
+ * returns a pass result `{ bestClass: 'benign', confidence: 0, allScores: [] }`
23
+ * on every subsequent call.
24
+ *
25
+ * @module agentos/extensions/packs/ml-classifiers/classifiers/InjectionClassifier
26
+ */
27
+
28
+ import type { ClassificationResult } from '@framers/agentos';
29
+ import type { ISharedServiceRegistry } from '@framers/agentos';
30
+ import type { IContentClassifier } from '../IContentClassifier';
31
+ import type { ClassifierConfig } from '../types';
32
+ import { ML_CLASSIFIER_SERVICE_IDS } from '../types';
33
+
34
+ // ---------------------------------------------------------------------------
35
+ // Internal raw pipeline output type
36
+ // ---------------------------------------------------------------------------
37
+
38
/**
 * A single label/score pair as returned by the HuggingFace text-classification
 * pipeline when called with `{ topk: null }` (i.e. scores for *all* labels,
 * not just the top one).
 */
interface RawLabel {
  /** Label name, e.g. `'INJECTION'` or `'SAFE'`. */
  label: string;
  /** Confidence score in the range [0, 1]. */
  score: number;
}
48
+
49
+ // ---------------------------------------------------------------------------
50
+ // InjectionClassifier
51
+ // ---------------------------------------------------------------------------
52
+
53
+ /**
54
+ * Binary prompt-injection classifier backed by
55
+ * `protectai/deberta-v3-small-prompt-injection-v2`.
56
+ *
57
+ * Returns one of two labels:
58
+ * - `INJECTION` — the text contains an injection attempt
59
+ * - `SAFE` — the text is clean
60
+ *
61
+ * The label with the higher confidence becomes `bestClass` / `confidence`.
62
+ * Both labels are present in `allScores` so callers can read the SAFE score
63
+ * as well.
64
+ *
65
+ * @implements {IContentClassifier}
66
+ *
67
+ * @example
68
+ * ```typescript
69
+ * const classifier = new InjectionClassifier(serviceRegistry);
70
+ * const result = await classifier.classify('Ignore previous instructions and …');
71
+ * // result.bestClass === 'INJECTION', result.confidence ≈ 0.97
72
+ * ```
73
+ */
74
+ export class InjectionClassifier implements IContentClassifier {
75
+ // -------------------------------------------------------------------------
76
+ // IContentClassifier identity fields
77
+ // -------------------------------------------------------------------------
78
+
79
+ /** Unique service identifier for this classifier. */
80
+ readonly id = 'prompt-injection';
81
+
82
+ /** Human-readable name for dashboards and log output. */
83
+ readonly displayName = 'Prompt Injection Classifier';
84
+
85
+ /** Short description of what this classifier detects. */
86
+ readonly description =
87
+ 'Detects prompt-injection attempts where adversarial instructions are ' +
88
+ 'embedded in user input to override or hijack the agent system prompt.';
89
+
90
+ /**
91
+ * Default Hugging Face model ID.
92
+ * Overridable via {@link ClassifierConfig.modelId}.
93
+ */
94
+ readonly modelId = 'protectai/deberta-v3-small-prompt-injection-v2';
95
+
96
+ // -------------------------------------------------------------------------
97
+ // Internal state
98
+ // -------------------------------------------------------------------------
99
+
100
+ /**
101
+ * Whether the model weights are fully loaded and the classifier is ready
102
+ * to accept `classify()` calls.
103
+ */
104
+ private _isLoaded = false;
105
+
106
+ /**
107
+ * Set to `true` when the model fails to load. Once `unavailable`, every
108
+ * subsequent `classify()` call immediately returns the pass result rather
109
+ * than retrying the expensive model load.
110
+ */
111
+ private unavailable = false;
112
+
113
+ // -------------------------------------------------------------------------
114
+ // Constructor
115
+ // -------------------------------------------------------------------------
116
+
117
+ /**
118
+ * @param services - Shared service registry used to lazily create and cache
119
+ * the underlying HuggingFace pipeline instance.
120
+ * @param config - Optional per-classifier configuration. When
121
+ * `config.modelId` is provided it overrides the default `modelId` when
122
+ * loading the model.
123
+ */
124
+ constructor(
125
+ private readonly services: ISharedServiceRegistry,
126
+ private readonly config?: ClassifierConfig,
127
+ ) {}
128
+
129
+ // -------------------------------------------------------------------------
130
+ // IContentClassifier.isLoaded (getter)
131
+ // -------------------------------------------------------------------------
132
+
133
+ /**
134
+ * Whether the underlying model pipeline has been successfully initialised.
135
+ * The flag is set to `true` after the first successful `classify()` call.
136
+ */
137
+ get isLoaded(): boolean {
138
+ return this._isLoaded;
139
+ }
140
+
141
+ // -------------------------------------------------------------------------
142
+ // classify
143
+ // -------------------------------------------------------------------------
144
+
145
+ /**
146
+ * Run prompt-injection inference on `text`.
147
+ *
148
+ * Lazily loads the pipeline on the first call via the shared service
149
+ * registry, then calls it with `{ topk: null }` to retrieve scores for both
150
+ * labels.
151
+ *
152
+ * @param text - The text to evaluate.
153
+ * @returns A promise that resolves with the classification result. If the
154
+ * model is unavailable the pass result is returned instead of throwing.
155
+ */
156
+ async classify(text: string): Promise<ClassificationResult> {
157
+ // Return the pass result immediately if the model previously failed to load.
158
+ if (this.unavailable) {
159
+ return this.passResult();
160
+ }
161
+
162
+ // Lazily obtain (or create) the HuggingFace pipeline instance from the
163
+ // shared service registry so the model is only downloaded once.
164
+ let pipeline: (text: string, opts: { topk: null }) => Promise<RawLabel[]>;
165
+ try {
166
+ pipeline = await this.services.getOrCreate(
167
+ ML_CLASSIFIER_SERVICE_IDS.INJECTION_PIPELINE,
168
+ async () => {
169
+ // Dynamic import so environments without @huggingface/transformers
170
+ // can still load the rest of AgentOS.
171
+ const { pipeline: createPipeline } = await import(
172
+ '@huggingface/transformers'
173
+ );
174
+ return createPipeline(
175
+ 'text-classification',
176
+ // Honour a caller-supplied model override; fall back to the default.
177
+ this.config?.modelId ?? this.modelId,
178
+ { quantized: true },
179
+ );
180
+ },
181
+ {
182
+ /** Release ONNX/WASM resources when the registry entry is evicted. */
183
+ dispose: async (p: any) => p?.dispose?.(),
184
+ /** Tags used for diagnostics and capability discovery. */
185
+ tags: ['ml', 'classifier', 'prompt-injection', 'onnx'],
186
+ },
187
+ );
188
+
189
+ // Mark the classifier as ready now that the pipeline is available.
190
+ this._isLoaded = true;
191
+ } catch {
192
+ // Model failed to load — mark as unavailable and return the pass result.
193
+ this.unavailable = true;
194
+ return this.passResult();
195
+ }
196
+
197
+ // Run inference and request both label scores.
198
+ const raw = await pipeline(text, { topk: null });
199
+ return this.mapResult(raw);
200
+ }
201
+
202
+ // -------------------------------------------------------------------------
203
+ // dispose (optional IContentClassifier lifecycle hook)
204
+ // -------------------------------------------------------------------------
205
+
206
+ /**
207
+ * Release the pipeline instance from the shared service registry.
208
+ *
209
+ * Idempotent — safe to call multiple times.
210
+ */
211
+ async dispose(): Promise<void> {
212
+ await this.services.release(ML_CLASSIFIER_SERVICE_IDS.INJECTION_PIPELINE);
213
+ this._isLoaded = false;
214
+ }
215
+
216
+ // -------------------------------------------------------------------------
217
+ // Private helpers
218
+ // -------------------------------------------------------------------------
219
+
220
+ /**
221
+ * Returns a "pass" result used when the model is unavailable.
222
+ *
223
+ * A pass result reports `bestClass: 'benign'` with zero confidence so the
224
+ * guardrail orchestrator will always choose {@link GuardrailAction.ALLOW}.
225
+ */
226
+ private passResult(): ClassificationResult {
227
+ return { bestClass: 'benign', confidence: 0, allScores: [] };
228
+ }
229
+
230
+ /**
231
+ * Map the raw pipeline output to a {@link ClassificationResult}.
232
+ *
233
+ * For binary classification the label with the higher confidence score
234
+ * becomes `bestClass` / `confidence`. Both labels are included in
235
+ * `allScores`.
236
+ *
237
+ * @param raw - Array returned by the pipeline when called with `topk: null`.
238
+ */
239
+ private mapResult(raw: RawLabel[]): ClassificationResult {
240
+ if (!raw || raw.length === 0) {
241
+ return this.passResult();
242
+ }
243
+
244
+ // Find the label with the highest score (should be one of INJECTION / SAFE).
245
+ let best = raw[0];
246
+ for (const item of raw) {
247
+ if (item.score > best.score) {
248
+ best = item;
249
+ }
250
+ }
251
+
252
+ return {
253
+ bestClass: best.label,
254
+ confidence: best.score,
255
+ allScores: raw.map((item) => ({
256
+ classLabel: item.label,
257
+ score: item.score,
258
+ })),
259
+ };
260
+ }
261
+ }