@framers/agentos-ext-ml-classifiers 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +23 -0
- package/dist/ClassifierOrchestrator.d.ts +126 -0
- package/dist/ClassifierOrchestrator.d.ts.map +1 -0
- package/dist/ClassifierOrchestrator.js +239 -0
- package/dist/ClassifierOrchestrator.js.map +1 -0
- package/dist/IContentClassifier.d.ts +117 -0
- package/dist/IContentClassifier.d.ts.map +1 -0
- package/dist/IContentClassifier.js +22 -0
- package/dist/IContentClassifier.js.map +1 -0
- package/dist/MLClassifierGuardrail.d.ts +163 -0
- package/dist/MLClassifierGuardrail.d.ts.map +1 -0
- package/dist/MLClassifierGuardrail.js +335 -0
- package/dist/MLClassifierGuardrail.js.map +1 -0
- package/dist/SlidingWindowBuffer.d.ts +213 -0
- package/dist/SlidingWindowBuffer.d.ts.map +1 -0
- package/dist/SlidingWindowBuffer.js +246 -0
- package/dist/SlidingWindowBuffer.js.map +1 -0
- package/dist/classifiers/InjectionClassifier.d.ts +126 -0
- package/dist/classifiers/InjectionClassifier.d.ts.map +1 -0
- package/dist/classifiers/InjectionClassifier.js +210 -0
- package/dist/classifiers/InjectionClassifier.js.map +1 -0
- package/dist/classifiers/JailbreakClassifier.d.ts +124 -0
- package/dist/classifiers/JailbreakClassifier.d.ts.map +1 -0
- package/dist/classifiers/JailbreakClassifier.js +208 -0
- package/dist/classifiers/JailbreakClassifier.js.map +1 -0
- package/dist/classifiers/ToxicityClassifier.d.ts +125 -0
- package/dist/classifiers/ToxicityClassifier.d.ts.map +1 -0
- package/dist/classifiers/ToxicityClassifier.js +212 -0
- package/dist/classifiers/ToxicityClassifier.js.map +1 -0
- package/dist/classifiers/WorkerClassifierProxy.d.ts +158 -0
- package/dist/classifiers/WorkerClassifierProxy.d.ts.map +1 -0
- package/dist/classifiers/WorkerClassifierProxy.js +268 -0
- package/dist/classifiers/WorkerClassifierProxy.js.map +1 -0
- package/dist/index.d.ts +110 -0
- package/dist/index.d.ts.map +1 -0
- package/dist/index.js +342 -0
- package/dist/index.js.map +1 -0
- package/dist/tools/ClassifyContentTool.d.ts +105 -0
- package/dist/tools/ClassifyContentTool.d.ts.map +1 -0
- package/dist/tools/ClassifyContentTool.js +149 -0
- package/dist/tools/ClassifyContentTool.js.map +1 -0
- package/dist/types.d.ts +319 -0
- package/dist/types.d.ts.map +1 -0
- package/dist/types.js +62 -0
- package/dist/types.js.map +1 -0
- package/dist/worker/classifier-worker.d.ts +49 -0
- package/dist/worker/classifier-worker.d.ts.map +1 -0
- package/dist/worker/classifier-worker.js +180 -0
- package/dist/worker/classifier-worker.js.map +1 -0
- package/package.json +45 -0
- package/src/ClassifierOrchestrator.ts +290 -0
- package/src/IContentClassifier.ts +124 -0
- package/src/MLClassifierGuardrail.ts +419 -0
- package/src/SlidingWindowBuffer.ts +384 -0
- package/src/classifiers/InjectionClassifier.ts +261 -0
- package/src/classifiers/JailbreakClassifier.ts +259 -0
- package/src/classifiers/ToxicityClassifier.ts +263 -0
- package/src/classifiers/WorkerClassifierProxy.ts +366 -0
- package/src/index.ts +383 -0
- package/src/tools/ClassifyContentTool.ts +201 -0
- package/src/types.ts +391 -0
- package/src/worker/classifier-worker.ts +267 -0
|
@@ -0,0 +1,384 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* @fileoverview Sliding-window text buffer for streaming ML classifier evaluation.
|
|
3
|
+
*
|
|
4
|
+
* When an LLM streams its response token-by-token, we cannot wait for the
|
|
5
|
+
* complete response before running safety classifiers — that would be too late
|
|
6
|
+
* to block or sanitise harmful content. At the same time, classifiers are
|
|
7
|
+
* expensive: running one on every individual token is wasteful and introduces
|
|
8
|
+
* unacceptable latency.
|
|
9
|
+
*
|
|
10
|
+
* `SlidingWindowBuffer` solves this by accumulating tokens from one or more
|
|
11
|
+
* concurrent streams and emitting a {@link ChunkReady} event only when enough
|
|
12
|
+
* tokens have accumulated to fill a `chunkSize`-token window. Each window
|
|
13
|
+
* also includes a `contextSize`-token "ring" from the previous chunk, so the
|
|
14
|
+
* classifier can reason about content that spans window boundaries.
|
|
15
|
+
*
|
|
16
|
+
* Architecture
|
|
17
|
+
* ------------
|
|
18
|
+
* - **Per-stream state**: Stored in a `Map<streamId, WindowState>`. Each
|
|
19
|
+
* stream is fully independent and can be used across multiple concurrent
|
|
20
|
+
* responses.
|
|
21
|
+
* - **Token estimation**: Uses the 4-chars-per-token heuristic for speed;
|
|
22
|
+
* callers that need exact counts should pre-tokenise text before pushing.
|
|
23
|
+
* - **Evaluation budget**: Once a stream reaches `maxEvaluations` chunks,
|
|
24
|
+
* `push()` returns `null` for all subsequent pushes, preventing unbounded
|
|
25
|
+
* classifier invocations on very long responses.
|
|
26
|
+
* - **Stale-stream pruning**: Streams that have not received data within
|
|
27
|
+
* `streamTimeoutMs` milliseconds are lazily evicted from the map to prevent
|
|
28
|
+
* memory leaks in long-running servers.
|
|
29
|
+
*
|
|
30
|
+
* @module agentos/extensions/packs/ml-classifiers/SlidingWindowBuffer
|
|
31
|
+
*/
|
|
32
|
+
|
|
33
|
+
// ---------------------------------------------------------------------------
|
|
34
|
+
// Public configuration & result shapes
|
|
35
|
+
// ---------------------------------------------------------------------------
|
|
36
|
+
|
|
37
|
+
/**
 * Configuration for a {@link SlidingWindowBuffer} instance.
 *
 * All fields are optional at construction time (the constructor accepts a
 * `Partial<SlidingWindowConfig>`); unset fields fall back to the defaults
 * shown below.
 */
export interface SlidingWindowConfig {
  /**
   * Target window size in *estimated* tokens (4-chars-per-token heuristic).
   * When the accumulated buffer reaches or exceeds this many tokens, a
   * {@link ChunkReady} is emitted and the buffer is slid forward.
   *
   * @default 200
   */
  chunkSize: number;

  /**
   * Number of tokens from the tail of the previous window to carry into the
   * `text` field of the next {@link ChunkReady}. This overlap prevents
   * boundary effects where a phrase split across two windows is
   * misclassified.
   *
   * @default 50
   */
  contextSize: number;

  /**
   * Maximum number of {@link ChunkReady} events to emit per stream. After
   * this budget is exhausted, `push()` returns `null` for the remainder of
   * the stream. Use `flush()` to retrieve any buffered text that has not
   * been emitted yet.
   *
   * @default 100
   */
  maxEvaluations: number;

  /**
   * Milliseconds of inactivity after which a stream is considered stale and
   * eligible for eviction by {@link SlidingWindowBuffer.pruneStale}.
   *
   * @default 30000
   */
  streamTimeoutMs: number;
}
|
|
79
|
+
|
|
80
|
+
/**
 * Emitted by {@link SlidingWindowBuffer.push} (and `flush`) when sufficient
 * tokens have accumulated to fill one evaluation window.
 */
export interface ChunkReady {
  /**
   * The full text to classify. Equals `contextRing + newBuffer`, where
   * `contextRing` is the carried-forward tail from the previous window.
   * Always non-empty.
   */
  text: string;

  /**
   * Only the *new* text pushed since the last chunk was emitted (i.e.
   * without the context prefix). Useful for determining which part of the
   * response was newly evaluated.
   */
  newText: string;

  /**
   * 1-indexed sequence number for this chunk within the stream.
   * The first chunk emitted for a stream has `evaluationNumber === 1`.
   */
  evaluationNumber: number;
}
|
|
105
|
+
|
|
106
|
+
// ---------------------------------------------------------------------------
|
|
107
|
+
// Private per-stream state
|
|
108
|
+
// ---------------------------------------------------------------------------
|
|
109
|
+
|
|
110
|
+
/**
 * Internal state tracked for each active stream.
 *
 * @internal
 */
interface WindowState {
  /**
   * Accumulated text that has not yet been emitted in a chunk.
   * After each chunk the tail is moved to `contextRing` and the buffer
   * starts fresh.
   */
  buffer: string;

  /**
   * Running count of *estimated* tokens in `buffer`.
   * Derived from `Math.ceil(buffer.length / 4)`.
   */
  tokenCount: number;

  /**
   * The context tail from the previous chunk. Prepended to `buffer` when
   * assembling the `text` field of {@link ChunkReady}.
   */
  contextRing: string;

  /**
   * Number of chunks already emitted for this stream.
   * Used to enforce the {@link SlidingWindowConfig.maxEvaluations} budget.
   */
  evaluationCount: number;

  /**
   * Unix timestamp (ms) of the last `push()` call for this stream.
   * Used by {@link SlidingWindowBuffer.pruneStale} to evict idle streams.
   */
  lastSeenAt: number;
}
|
|
147
|
+
|
|
148
|
+
// ---------------------------------------------------------------------------
|
|
149
|
+
// SlidingWindowBuffer implementation
|
|
150
|
+
// ---------------------------------------------------------------------------
|
|
151
|
+
|
|
152
|
+
/**
|
|
153
|
+
* A stateful, multi-stream text accumulator that emits fixed-size windows
|
|
154
|
+
* for ML classifier evaluation with configurable context carry-forward.
|
|
155
|
+
*
|
|
156
|
+
* @example
|
|
157
|
+
* ```typescript
|
|
158
|
+
* const buf = new SlidingWindowBuffer({ chunkSize: 200, contextSize: 50 });
|
|
159
|
+
*
|
|
160
|
+
* // Simulate streaming tokens
|
|
161
|
+
* for (const token of streamedTokens) {
|
|
162
|
+
* const chunk = buf.push('stream-1', token);
|
|
163
|
+
* if (chunk) {
|
|
164
|
+
* const result = await toxicityClassifier.classify(chunk.text);
|
|
165
|
+
* if (result.confidence > 0.9) terminateStream();
|
|
166
|
+
* }
|
|
167
|
+
* }
|
|
168
|
+
*
|
|
169
|
+
* // Evaluate remaining tokens
|
|
170
|
+
* const finalChunk = buf.flush('stream-1');
|
|
171
|
+
* if (finalChunk) {
|
|
172
|
+
* await toxicityClassifier.classify(finalChunk.text);
|
|
173
|
+
* }
|
|
174
|
+
* ```
|
|
175
|
+
*/
|
|
176
|
+
export class SlidingWindowBuffer {
|
|
177
|
+
/** Resolved configuration (defaults applied). */
|
|
178
|
+
private readonly config: SlidingWindowConfig;
|
|
179
|
+
|
|
180
|
+
/**
|
|
181
|
+
* Per-stream state map. Keyed by the `streamId` passed to `push()`.
|
|
182
|
+
* Entries are created lazily on first push and removed on flush or prune.
|
|
183
|
+
*/
|
|
184
|
+
private readonly streams: Map<string, WindowState> = new Map();
|
|
185
|
+
|
|
186
|
+
/**
|
|
187
|
+
* Construct a new buffer with the supplied configuration.
|
|
188
|
+
*
|
|
189
|
+
* @param config - Partial configuration; unset fields fall back to defaults:
|
|
190
|
+
* `chunkSize=200`, `contextSize=50`, `maxEvaluations=100`,
|
|
191
|
+
* `streamTimeoutMs=30000`.
|
|
192
|
+
*/
|
|
193
|
+
constructor(config?: Partial<SlidingWindowConfig>) {
|
|
194
|
+
this.config = {
|
|
195
|
+
chunkSize: config?.chunkSize ?? 200,
|
|
196
|
+
contextSize: config?.contextSize ?? 50,
|
|
197
|
+
maxEvaluations: config?.maxEvaluations ?? 100,
|
|
198
|
+
streamTimeoutMs: config?.streamTimeoutMs ?? 30_000,
|
|
199
|
+
};
|
|
200
|
+
}
|
|
201
|
+
|
|
202
|
+
// -------------------------------------------------------------------------
|
|
203
|
+
// Public API
|
|
204
|
+
// -------------------------------------------------------------------------
|
|
205
|
+
|
|
206
|
+
/**
|
|
207
|
+
* Push new text into the buffer for the specified stream.
|
|
208
|
+
*
|
|
209
|
+
* Internally the text is appended to the stream's accumulation buffer.
|
|
210
|
+
* If the buffer's estimated token count reaches `chunkSize`, a
|
|
211
|
+
* {@link ChunkReady} is assembled and returned; the buffer is then reset
|
|
212
|
+
* (with the tail preserved as the context ring for the next window).
|
|
213
|
+
*
|
|
214
|
+
* Returns `null` when:
|
|
215
|
+
* - The buffer has not yet accumulated `chunkSize` tokens.
|
|
216
|
+
* - The stream has already emitted `maxEvaluations` chunks.
|
|
217
|
+
*
|
|
218
|
+
* When the map contains more than 10 streams, stale streams are pruned
|
|
219
|
+
* lazily after the push is processed.
|
|
220
|
+
*
|
|
221
|
+
* @param streamId - Opaque identifier for the stream (e.g. a request UUID).
|
|
222
|
+
* @param text - The new text fragment to accumulate.
|
|
223
|
+
* @returns A {@link ChunkReady} when an evaluation window is complete, or
|
|
224
|
+
* `null` if more data is needed (or the budget is exhausted).
|
|
225
|
+
*/
|
|
226
|
+
push(streamId: string, text: string): ChunkReady | null {
|
|
227
|
+
if (!text) {
|
|
228
|
+
return null;
|
|
229
|
+
}
|
|
230
|
+
|
|
231
|
+
// Initialise state for a new stream.
|
|
232
|
+
if (!this.streams.has(streamId)) {
|
|
233
|
+
this.streams.set(streamId, {
|
|
234
|
+
buffer: '',
|
|
235
|
+
tokenCount: 0,
|
|
236
|
+
contextRing: '',
|
|
237
|
+
evaluationCount: 0,
|
|
238
|
+
lastSeenAt: Date.now(),
|
|
239
|
+
});
|
|
240
|
+
}
|
|
241
|
+
|
|
242
|
+
const state = this.streams.get(streamId)!;
|
|
243
|
+
state.lastSeenAt = Date.now();
|
|
244
|
+
|
|
245
|
+
// Respect the evaluation budget — stop emitting chunks once exhausted.
|
|
246
|
+
if (state.evaluationCount >= this.config.maxEvaluations) {
|
|
247
|
+
return null;
|
|
248
|
+
}
|
|
249
|
+
|
|
250
|
+
// Accumulate incoming text.
|
|
251
|
+
state.buffer += text;
|
|
252
|
+
state.tokenCount = this.estimateTokens(state.buffer);
|
|
253
|
+
|
|
254
|
+
// Lazy pruning: clean up stale streams whenever the map grows large.
|
|
255
|
+
// Done unconditionally (not just on chunk emit) so stale entries are
|
|
256
|
+
// reclaimed even when streams are slow to accumulate a full window.
|
|
257
|
+
if (this.streams.size > 10) {
|
|
258
|
+
this.pruneStale();
|
|
259
|
+
}
|
|
260
|
+
|
|
261
|
+
// Not enough tokens yet — wait for more.
|
|
262
|
+
if (state.tokenCount < this.config.chunkSize) {
|
|
263
|
+
return null;
|
|
264
|
+
}
|
|
265
|
+
|
|
266
|
+
// We have a full window. Assemble the chunk.
|
|
267
|
+
const chunk = this.assembleChunk(state);
|
|
268
|
+
|
|
269
|
+
// Slide the context ring forward: keep the last `contextSize` tokens'
|
|
270
|
+
// worth of characters from the buffer that was just emitted.
|
|
271
|
+
const contextCharBudget = this.config.contextSize * 4;
|
|
272
|
+
state.contextRing = state.buffer.slice(-contextCharBudget);
|
|
273
|
+
|
|
274
|
+
// Reset the buffer and token count for the next window.
|
|
275
|
+
state.buffer = '';
|
|
276
|
+
state.tokenCount = 0;
|
|
277
|
+
state.evaluationCount += 1;
|
|
278
|
+
|
|
279
|
+
return chunk;
|
|
280
|
+
}
|
|
281
|
+
|
|
282
|
+
/**
|
|
283
|
+
* Flush any remaining buffered text for the stream as a final chunk.
|
|
284
|
+
*
|
|
285
|
+
* Call this after the stream ends (e.g. when the LLM emits its final
|
|
286
|
+
* token) to ensure the classifier evaluates the tail of the response.
|
|
287
|
+
*
|
|
288
|
+
* The stream's state entry is removed from the map after flushing.
|
|
289
|
+
*
|
|
290
|
+
* @param streamId - Identifier of the stream to flush.
|
|
291
|
+
* @returns A {@link ChunkReady} for the remaining buffer, or `null` if the
|
|
292
|
+
* buffer is empty or the stream does not exist.
|
|
293
|
+
*/
|
|
294
|
+
flush(streamId: string): ChunkReady | null {
|
|
295
|
+
const state = this.streams.get(streamId);
|
|
296
|
+
|
|
297
|
+
// Nothing to flush if the stream is unknown or the buffer is empty.
|
|
298
|
+
if (!state || state.buffer.length === 0) {
|
|
299
|
+
// Always clean up the map entry, even for empty buffers.
|
|
300
|
+
this.streams.delete(streamId);
|
|
301
|
+
return null;
|
|
302
|
+
}
|
|
303
|
+
|
|
304
|
+
const chunk = this.assembleChunk(state);
|
|
305
|
+
this.streams.delete(streamId);
|
|
306
|
+
return chunk;
|
|
307
|
+
}
|
|
308
|
+
|
|
309
|
+
/**
|
|
310
|
+
* Remove streams that have not received data within `streamTimeoutMs`.
|
|
311
|
+
*
|
|
312
|
+
* Called lazily by `push()` when the stream map grows beyond 10 entries.
|
|
313
|
+
* May also be called proactively by a maintenance timer.
|
|
314
|
+
*/
|
|
315
|
+
pruneStale(): void {
|
|
316
|
+
const now = Date.now();
|
|
317
|
+
for (const [id, state] of this.streams) {
|
|
318
|
+
if (now - state.lastSeenAt > this.config.streamTimeoutMs) {
|
|
319
|
+
this.streams.delete(id);
|
|
320
|
+
}
|
|
321
|
+
}
|
|
322
|
+
}
|
|
323
|
+
|
|
324
|
+
/**
|
|
325
|
+
* Remove all stream state from the buffer.
|
|
326
|
+
*
|
|
327
|
+
* Useful for graceful shutdown or unit-test teardown to ensure no cross-test
|
|
328
|
+
* state leaks.
|
|
329
|
+
*/
|
|
330
|
+
clear(): void {
|
|
331
|
+
this.streams.clear();
|
|
332
|
+
}
|
|
333
|
+
|
|
334
|
+
/**
|
|
335
|
+
* The number of streams currently tracked (including stale ones not yet
|
|
336
|
+
* pruned).
|
|
337
|
+
*
|
|
338
|
+
* Exposed primarily for testing and diagnostics.
|
|
339
|
+
*/
|
|
340
|
+
get size(): number {
|
|
341
|
+
return this.streams.size;
|
|
342
|
+
}
|
|
343
|
+
|
|
344
|
+
// -------------------------------------------------------------------------
|
|
345
|
+
// Private helpers
|
|
346
|
+
// -------------------------------------------------------------------------
|
|
347
|
+
|
|
348
|
+
/**
|
|
349
|
+
* Assemble a {@link ChunkReady} from the current stream state.
|
|
350
|
+
*
|
|
351
|
+
* The `text` field is the concatenation of `contextRing` and the current
|
|
352
|
+
* `buffer`, giving the classifier cross-boundary context. The `newText`
|
|
353
|
+
* field is just the raw `buffer` so callers can distinguish old from new.
|
|
354
|
+
*
|
|
355
|
+
* @param state - The mutable state for the stream being assembled.
|
|
356
|
+
* @returns A fully-populated {@link ChunkReady}.
|
|
357
|
+
*/
|
|
358
|
+
private assembleChunk(state: WindowState): ChunkReady {
|
|
359
|
+
const newText = state.buffer;
|
|
360
|
+
const text = state.contextRing + newText;
|
|
361
|
+
return {
|
|
362
|
+
text,
|
|
363
|
+
newText,
|
|
364
|
+
// evaluationCount is 0-indexed before increment, so +1 gives 1-indexed number.
|
|
365
|
+
evaluationNumber: state.evaluationCount + 1,
|
|
366
|
+
};
|
|
367
|
+
}
|
|
368
|
+
|
|
369
|
+
/**
|
|
370
|
+
* Estimate the number of LLM tokens in a string using the 4-chars-per-token
|
|
371
|
+
* heuristic.
|
|
372
|
+
*
|
|
373
|
+
* This deliberately mirrors {@link estimateTokens} from `core/utils/text-utils`
|
|
374
|
+
* without importing it, keeping this module self-contained and safe to load
|
|
375
|
+
* in Web Worker contexts where module resolution may differ.
|
|
376
|
+
*
|
|
377
|
+
* @param text - The string to estimate.
|
|
378
|
+
* @returns Non-negative integer token count estimate.
|
|
379
|
+
*/
|
|
380
|
+
private estimateTokens(text: string): number {
|
|
381
|
+
if (!text) return 0;
|
|
382
|
+
return Math.ceil(text.length / 4);
|
|
383
|
+
}
|
|
384
|
+
}
|
|
@@ -0,0 +1,261 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* @fileoverview Prompt-injection content classifier using the
|
|
3
|
+
* `protectai/deberta-v3-small-prompt-injection-v2` model.
|
|
4
|
+
*
|
|
5
|
+
* Prompt injection is the attack pattern where adversarial instructions are
|
|
6
|
+
* embedded inside user-supplied text to override or hijack the agent's system
|
|
7
|
+
* prompt. This classifier provides a dedicated binary signal (INJECTION /
|
|
8
|
+
* SAFE) that the guardrail orchestrator can act on independently of the
|
|
9
|
+
* toxicity or jailbreak classifiers.
|
|
10
|
+
*
|
|
11
|
+
* Model details
|
|
12
|
+
* -------------
|
|
13
|
+
* `protectai/deberta-v3-small-prompt-injection-v2` is a fine-tuned DeBERTa
|
|
14
|
+
* model from ProtectAI, specifically trained to distinguish benign user
|
|
15
|
+
* messages from prompt-injection payloads. It outputs two labels:
|
|
16
|
+
* - `INJECTION` — high-confidence injection attempt
|
|
17
|
+
* - `SAFE` — normal user input
|
|
18
|
+
*
|
|
19
|
+
* Graceful degradation
|
|
20
|
+
* --------------------
|
|
21
|
+
* If the model fails to load the classifier sets `unavailable = true` and
|
|
22
|
+
* returns a pass result `{ bestClass: 'benign', confidence: 0, allScores: [] }`
|
|
23
|
+
* on every subsequent call.
|
|
24
|
+
*
|
|
25
|
+
* @module agentos/extensions/packs/ml-classifiers/classifiers/InjectionClassifier
|
|
26
|
+
*/
|
|
27
|
+
|
|
28
|
+
import type { ClassificationResult } from '@framers/agentos';
|
|
29
|
+
import type { ISharedServiceRegistry } from '@framers/agentos';
|
|
30
|
+
import type { IContentClassifier } from '../IContentClassifier';
|
|
31
|
+
import type { ClassifierConfig } from '../types';
|
|
32
|
+
import { ML_CLASSIFIER_SERVICE_IDS } from '../types';
|
|
33
|
+
|
|
34
|
+
// ---------------------------------------------------------------------------
|
|
35
|
+
// Internal raw pipeline output type
|
|
36
|
+
// ---------------------------------------------------------------------------
|
|
37
|
+
|
|
38
|
+
/**
 * A single label/score pair as returned by the HuggingFace
 * text-classification pipeline when called with `{ topk: null }`.
 *
 * @internal
 */
interface RawLabel {
  /** Label name, e.g. `'INJECTION'` or `'SAFE'`. */
  label: string;
  /** Confidence score in the range [0, 1]. */
  score: number;
}
|
|
48
|
+
|
|
49
|
+
// ---------------------------------------------------------------------------
|
|
50
|
+
// InjectionClassifier
|
|
51
|
+
// ---------------------------------------------------------------------------
|
|
52
|
+
|
|
53
|
+
/**
|
|
54
|
+
* Binary prompt-injection classifier backed by
|
|
55
|
+
* `protectai/deberta-v3-small-prompt-injection-v2`.
|
|
56
|
+
*
|
|
57
|
+
* Returns one of two labels:
|
|
58
|
+
* - `INJECTION` — the text contains an injection attempt
|
|
59
|
+
* - `SAFE` — the text is clean
|
|
60
|
+
*
|
|
61
|
+
* The label with the higher confidence becomes `bestClass` / `confidence`.
|
|
62
|
+
* Both labels are present in `allScores` so callers can read the SAFE score
|
|
63
|
+
* as well.
|
|
64
|
+
*
|
|
65
|
+
* @implements {IContentClassifier}
|
|
66
|
+
*
|
|
67
|
+
* @example
|
|
68
|
+
* ```typescript
|
|
69
|
+
* const classifier = new InjectionClassifier(serviceRegistry);
|
|
70
|
+
* const result = await classifier.classify('Ignore previous instructions and …');
|
|
71
|
+
* // result.bestClass === 'INJECTION', result.confidence ≈ 0.97
|
|
72
|
+
* ```
|
|
73
|
+
*/
|
|
74
|
+
export class InjectionClassifier implements IContentClassifier {
|
|
75
|
+
// -------------------------------------------------------------------------
|
|
76
|
+
// IContentClassifier identity fields
|
|
77
|
+
// -------------------------------------------------------------------------
|
|
78
|
+
|
|
79
|
+
/** Unique service identifier for this classifier. */
|
|
80
|
+
readonly id = 'prompt-injection';
|
|
81
|
+
|
|
82
|
+
/** Human-readable name for dashboards and log output. */
|
|
83
|
+
readonly displayName = 'Prompt Injection Classifier';
|
|
84
|
+
|
|
85
|
+
/** Short description of what this classifier detects. */
|
|
86
|
+
readonly description =
|
|
87
|
+
'Detects prompt-injection attempts where adversarial instructions are ' +
|
|
88
|
+
'embedded in user input to override or hijack the agent system prompt.';
|
|
89
|
+
|
|
90
|
+
/**
|
|
91
|
+
* Default Hugging Face model ID.
|
|
92
|
+
* Overridable via {@link ClassifierConfig.modelId}.
|
|
93
|
+
*/
|
|
94
|
+
readonly modelId = 'protectai/deberta-v3-small-prompt-injection-v2';
|
|
95
|
+
|
|
96
|
+
// -------------------------------------------------------------------------
|
|
97
|
+
// Internal state
|
|
98
|
+
// -------------------------------------------------------------------------
|
|
99
|
+
|
|
100
|
+
/**
|
|
101
|
+
* Whether the model weights are fully loaded and the classifier is ready
|
|
102
|
+
* to accept `classify()` calls.
|
|
103
|
+
*/
|
|
104
|
+
private _isLoaded = false;
|
|
105
|
+
|
|
106
|
+
/**
|
|
107
|
+
* Set to `true` when the model fails to load. Once `unavailable`, every
|
|
108
|
+
* subsequent `classify()` call immediately returns the pass result rather
|
|
109
|
+
* than retrying the expensive model load.
|
|
110
|
+
*/
|
|
111
|
+
private unavailable = false;
|
|
112
|
+
|
|
113
|
+
// -------------------------------------------------------------------------
|
|
114
|
+
// Constructor
|
|
115
|
+
// -------------------------------------------------------------------------
|
|
116
|
+
|
|
117
|
+
/**
|
|
118
|
+
* @param services - Shared service registry used to lazily create and cache
|
|
119
|
+
* the underlying HuggingFace pipeline instance.
|
|
120
|
+
* @param config - Optional per-classifier configuration. When
|
|
121
|
+
* `config.modelId` is provided it overrides the default `modelId` when
|
|
122
|
+
* loading the model.
|
|
123
|
+
*/
|
|
124
|
+
constructor(
|
|
125
|
+
private readonly services: ISharedServiceRegistry,
|
|
126
|
+
private readonly config?: ClassifierConfig,
|
|
127
|
+
) {}
|
|
128
|
+
|
|
129
|
+
// -------------------------------------------------------------------------
|
|
130
|
+
// IContentClassifier.isLoaded (getter)
|
|
131
|
+
// -------------------------------------------------------------------------
|
|
132
|
+
|
|
133
|
+
/**
|
|
134
|
+
* Whether the underlying model pipeline has been successfully initialised.
|
|
135
|
+
* The flag is set to `true` after the first successful `classify()` call.
|
|
136
|
+
*/
|
|
137
|
+
get isLoaded(): boolean {
|
|
138
|
+
return this._isLoaded;
|
|
139
|
+
}
|
|
140
|
+
|
|
141
|
+
// -------------------------------------------------------------------------
|
|
142
|
+
// classify
|
|
143
|
+
// -------------------------------------------------------------------------
|
|
144
|
+
|
|
145
|
+
/**
|
|
146
|
+
* Run prompt-injection inference on `text`.
|
|
147
|
+
*
|
|
148
|
+
* Lazily loads the pipeline on the first call via the shared service
|
|
149
|
+
* registry, then calls it with `{ topk: null }` to retrieve scores for both
|
|
150
|
+
* labels.
|
|
151
|
+
*
|
|
152
|
+
* @param text - The text to evaluate.
|
|
153
|
+
* @returns A promise that resolves with the classification result. If the
|
|
154
|
+
* model is unavailable the pass result is returned instead of throwing.
|
|
155
|
+
*/
|
|
156
|
+
async classify(text: string): Promise<ClassificationResult> {
|
|
157
|
+
// Return the pass result immediately if the model previously failed to load.
|
|
158
|
+
if (this.unavailable) {
|
|
159
|
+
return this.passResult();
|
|
160
|
+
}
|
|
161
|
+
|
|
162
|
+
// Lazily obtain (or create) the HuggingFace pipeline instance from the
|
|
163
|
+
// shared service registry so the model is only downloaded once.
|
|
164
|
+
let pipeline: (text: string, opts: { topk: null }) => Promise<RawLabel[]>;
|
|
165
|
+
try {
|
|
166
|
+
pipeline = await this.services.getOrCreate(
|
|
167
|
+
ML_CLASSIFIER_SERVICE_IDS.INJECTION_PIPELINE,
|
|
168
|
+
async () => {
|
|
169
|
+
// Dynamic import so environments without @huggingface/transformers
|
|
170
|
+
// can still load the rest of AgentOS.
|
|
171
|
+
const { pipeline: createPipeline } = await import(
|
|
172
|
+
'@huggingface/transformers'
|
|
173
|
+
);
|
|
174
|
+
return createPipeline(
|
|
175
|
+
'text-classification',
|
|
176
|
+
// Honour a caller-supplied model override; fall back to the default.
|
|
177
|
+
this.config?.modelId ?? this.modelId,
|
|
178
|
+
{ quantized: true },
|
|
179
|
+
);
|
|
180
|
+
},
|
|
181
|
+
{
|
|
182
|
+
/** Release ONNX/WASM resources when the registry entry is evicted. */
|
|
183
|
+
dispose: async (p: any) => p?.dispose?.(),
|
|
184
|
+
/** Tags used for diagnostics and capability discovery. */
|
|
185
|
+
tags: ['ml', 'classifier', 'prompt-injection', 'onnx'],
|
|
186
|
+
},
|
|
187
|
+
);
|
|
188
|
+
|
|
189
|
+
// Mark the classifier as ready now that the pipeline is available.
|
|
190
|
+
this._isLoaded = true;
|
|
191
|
+
} catch {
|
|
192
|
+
// Model failed to load — mark as unavailable and return the pass result.
|
|
193
|
+
this.unavailable = true;
|
|
194
|
+
return this.passResult();
|
|
195
|
+
}
|
|
196
|
+
|
|
197
|
+
// Run inference and request both label scores.
|
|
198
|
+
const raw = await pipeline(text, { topk: null });
|
|
199
|
+
return this.mapResult(raw);
|
|
200
|
+
}
|
|
201
|
+
|
|
202
|
+
// -------------------------------------------------------------------------
|
|
203
|
+
// dispose (optional IContentClassifier lifecycle hook)
|
|
204
|
+
// -------------------------------------------------------------------------
|
|
205
|
+
|
|
206
|
+
/**
|
|
207
|
+
* Release the pipeline instance from the shared service registry.
|
|
208
|
+
*
|
|
209
|
+
* Idempotent — safe to call multiple times.
|
|
210
|
+
*/
|
|
211
|
+
async dispose(): Promise<void> {
|
|
212
|
+
await this.services.release(ML_CLASSIFIER_SERVICE_IDS.INJECTION_PIPELINE);
|
|
213
|
+
this._isLoaded = false;
|
|
214
|
+
}
|
|
215
|
+
|
|
216
|
+
// -------------------------------------------------------------------------
|
|
217
|
+
// Private helpers
|
|
218
|
+
// -------------------------------------------------------------------------
|
|
219
|
+
|
|
220
|
+
/**
|
|
221
|
+
* Returns a "pass" result used when the model is unavailable.
|
|
222
|
+
*
|
|
223
|
+
* A pass result reports `bestClass: 'benign'` with zero confidence so the
|
|
224
|
+
* guardrail orchestrator will always choose {@link GuardrailAction.ALLOW}.
|
|
225
|
+
*/
|
|
226
|
+
private passResult(): ClassificationResult {
|
|
227
|
+
return { bestClass: 'benign', confidence: 0, allScores: [] };
|
|
228
|
+
}
|
|
229
|
+
|
|
230
|
+
/**
|
|
231
|
+
* Map the raw pipeline output to a {@link ClassificationResult}.
|
|
232
|
+
*
|
|
233
|
+
* For binary classification the label with the higher confidence score
|
|
234
|
+
* becomes `bestClass` / `confidence`. Both labels are included in
|
|
235
|
+
* `allScores`.
|
|
236
|
+
*
|
|
237
|
+
* @param raw - Array returned by the pipeline when called with `topk: null`.
|
|
238
|
+
*/
|
|
239
|
+
private mapResult(raw: RawLabel[]): ClassificationResult {
|
|
240
|
+
if (!raw || raw.length === 0) {
|
|
241
|
+
return this.passResult();
|
|
242
|
+
}
|
|
243
|
+
|
|
244
|
+
// Find the label with the highest score (should be one of INJECTION / SAFE).
|
|
245
|
+
let best = raw[0];
|
|
246
|
+
for (const item of raw) {
|
|
247
|
+
if (item.score > best.score) {
|
|
248
|
+
best = item;
|
|
249
|
+
}
|
|
250
|
+
}
|
|
251
|
+
|
|
252
|
+
return {
|
|
253
|
+
bestClass: best.label,
|
|
254
|
+
confidence: best.score,
|
|
255
|
+
allScores: raw.map((item) => ({
|
|
256
|
+
classLabel: item.label,
|
|
257
|
+
score: item.score,
|
|
258
|
+
})),
|
|
259
|
+
};
|
|
260
|
+
}
|
|
261
|
+
}
|