@framers/agentos-ext-ml-classifiers 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (62) hide show
  1. package/LICENSE +23 -0
  2. package/dist/ClassifierOrchestrator.d.ts +126 -0
  3. package/dist/ClassifierOrchestrator.d.ts.map +1 -0
  4. package/dist/ClassifierOrchestrator.js +239 -0
  5. package/dist/ClassifierOrchestrator.js.map +1 -0
  6. package/dist/IContentClassifier.d.ts +117 -0
  7. package/dist/IContentClassifier.d.ts.map +1 -0
  8. package/dist/IContentClassifier.js +22 -0
  9. package/dist/IContentClassifier.js.map +1 -0
  10. package/dist/MLClassifierGuardrail.d.ts +163 -0
  11. package/dist/MLClassifierGuardrail.d.ts.map +1 -0
  12. package/dist/MLClassifierGuardrail.js +335 -0
  13. package/dist/MLClassifierGuardrail.js.map +1 -0
  14. package/dist/SlidingWindowBuffer.d.ts +213 -0
  15. package/dist/SlidingWindowBuffer.d.ts.map +1 -0
  16. package/dist/SlidingWindowBuffer.js +246 -0
  17. package/dist/SlidingWindowBuffer.js.map +1 -0
  18. package/dist/classifiers/InjectionClassifier.d.ts +126 -0
  19. package/dist/classifiers/InjectionClassifier.d.ts.map +1 -0
  20. package/dist/classifiers/InjectionClassifier.js +210 -0
  21. package/dist/classifiers/InjectionClassifier.js.map +1 -0
  22. package/dist/classifiers/JailbreakClassifier.d.ts +124 -0
  23. package/dist/classifiers/JailbreakClassifier.d.ts.map +1 -0
  24. package/dist/classifiers/JailbreakClassifier.js +208 -0
  25. package/dist/classifiers/JailbreakClassifier.js.map +1 -0
  26. package/dist/classifiers/ToxicityClassifier.d.ts +125 -0
  27. package/dist/classifiers/ToxicityClassifier.d.ts.map +1 -0
  28. package/dist/classifiers/ToxicityClassifier.js +212 -0
  29. package/dist/classifiers/ToxicityClassifier.js.map +1 -0
  30. package/dist/classifiers/WorkerClassifierProxy.d.ts +158 -0
  31. package/dist/classifiers/WorkerClassifierProxy.d.ts.map +1 -0
  32. package/dist/classifiers/WorkerClassifierProxy.js +268 -0
  33. package/dist/classifiers/WorkerClassifierProxy.js.map +1 -0
  34. package/dist/index.d.ts +110 -0
  35. package/dist/index.d.ts.map +1 -0
  36. package/dist/index.js +342 -0
  37. package/dist/index.js.map +1 -0
  38. package/dist/tools/ClassifyContentTool.d.ts +105 -0
  39. package/dist/tools/ClassifyContentTool.d.ts.map +1 -0
  40. package/dist/tools/ClassifyContentTool.js +149 -0
  41. package/dist/tools/ClassifyContentTool.js.map +1 -0
  42. package/dist/types.d.ts +319 -0
  43. package/dist/types.d.ts.map +1 -0
  44. package/dist/types.js +62 -0
  45. package/dist/types.js.map +1 -0
  46. package/dist/worker/classifier-worker.d.ts +49 -0
  47. package/dist/worker/classifier-worker.d.ts.map +1 -0
  48. package/dist/worker/classifier-worker.js +180 -0
  49. package/dist/worker/classifier-worker.js.map +1 -0
  50. package/package.json +45 -0
  51. package/src/ClassifierOrchestrator.ts +290 -0
  52. package/src/IContentClassifier.ts +124 -0
  53. package/src/MLClassifierGuardrail.ts +419 -0
  54. package/src/SlidingWindowBuffer.ts +384 -0
  55. package/src/classifiers/InjectionClassifier.ts +261 -0
  56. package/src/classifiers/JailbreakClassifier.ts +259 -0
  57. package/src/classifiers/ToxicityClassifier.ts +263 -0
  58. package/src/classifiers/WorkerClassifierProxy.ts +366 -0
  59. package/src/index.ts +383 -0
  60. package/src/tools/ClassifyContentTool.ts +201 -0
  61. package/src/types.ts +391 -0
  62. package/src/worker/classifier-worker.ts +267 -0
@@ -0,0 +1,419 @@
1
+ /**
2
+ * @fileoverview IGuardrailService implementation backed by ML classifiers.
3
+ *
4
+ * `MLClassifierGuardrail` bridges the AgentOS guardrail pipeline to the ML
5
+ * classifier subsystem. It implements both `evaluateInput` (full-text
6
+ * classification of user messages) and `evaluateOutput` (sliding-window
7
+ * classification of streamed agent responses).
8
+ *
9
+ * Three streaming evaluation modes are supported:
10
+ *
11
+ * | Mode | Behaviour |
12
+ * |---------------|----------------------------------------------------------------|
13
+ * | `blocking` | Every chunk that fills the sliding window is classified |
14
+ * | | **synchronously** — the stream waits for the result. |
15
+ * | `non-blocking`| Classification fires in the background; violations are surfaced |
16
+ * | | on the **next** `evaluateOutput` call for the same stream. |
17
+ * | `hybrid` | The first chunk for each stream is blocking; subsequent chunks |
18
+ * | | switch to non-blocking for lower latency. |
19
+ *
20
+ * The default mode is `blocking` when `streamingMode` is enabled.
21
+ *
22
+ * @module agentos/extensions/packs/ml-classifiers/MLClassifierGuardrail
23
+ */
24
+
25
+ import type {
26
+ GuardrailConfig,
27
+ GuardrailEvaluationResult,
28
+ GuardrailInputPayload,
29
+ GuardrailOutputPayload,
30
+ IGuardrailService,
31
+ } from '@framers/agentos';
32
+ import { GuardrailAction } from '@framers/agentos';
33
+ import { AgentOSResponseChunkType } from '@framers/agentos';
34
+ import type { ISharedServiceRegistry } from '@framers/agentos';
35
+ import type { MLClassifierPackOptions, ChunkEvaluation } from './types';
36
+ import { DEFAULT_THRESHOLDS } from './types';
37
+ import { SlidingWindowBuffer } from './SlidingWindowBuffer';
38
+ import { ClassifierOrchestrator } from './ClassifierOrchestrator';
39
+ import type { IContentClassifier } from './IContentClassifier';
40
+
41
+ // ---------------------------------------------------------------------------
42
+ // Streaming mode union
43
+ // ---------------------------------------------------------------------------
44
+
45
+ /**
46
+ * Strategy used by `evaluateOutput` when classifying streamed output chunks.
47
+ *
48
+ * - `blocking` — await classification on every filled window.
49
+ * - `non-blocking` — start classification without awaiting it; the result is
+ *   checked on a later `evaluateOutput` call for the same stream.
50
+ * - `hybrid` — first chunk per stream is blocking, rest non-blocking.
51
+ */
52
+ type StreamingMode = 'blocking' | 'non-blocking' | 'hybrid';
53
+
54
+ // ---------------------------------------------------------------------------
55
+ // MLClassifierGuardrail
56
+ // ---------------------------------------------------------------------------
57
+
58
/**
 * Outcome of one background (non-awaited) window classification.
 *
 * Failures are captured instead of being left as unhandled rejections so
 * they can be reported on the next `evaluateOutput` call for the stream.
 */
type BackgroundOutcome =
  | { readonly ok: true; readonly evaluation: ChunkEvaluation }
  | { readonly ok: false; readonly error: unknown };

/**
 * Bookkeeping for the background classifications of a single stream.
 */
interface BackgroundStreamState {
  /** Classifications that have been started but have not settled yet. */
  readonly inFlight: Set<Promise<void>>;
  /** Settled outcomes not yet reported to the pipeline, in settle order. */
  readonly settled: BackgroundOutcome[];
}

/**
 * Guardrail implementation that runs ML classifiers against both user input
 * and streamed agent output.
 *
 * Output text is accumulated per stream in a {@link SlidingWindowBuffer};
 * whenever the buffer reports a ready window, that window is classified
 * according to the configured streaming mode (`blocking`, `non-blocking`
 * or `hybrid`).
 *
 * @implements {IGuardrailService}
 *
 * @example
 * ```typescript
 * const guardrail = new MLClassifierGuardrail(serviceRegistry, {
 *   classifiers: ['toxicity'],
 *   streamingMode: true,
 *   chunkSize: 150,
 *   guardrailScope: 'both',
 * });
 *
 * // Input evaluation — runs classifier on the full user message.
 * const inputResult = await guardrail.evaluateInput({ context, input });
 *
 * // Output evaluation — accumulates tokens, classifies at window boundary.
 * const outputResult = await guardrail.evaluateOutput({ context, chunk });
 * ```
 */
export class MLClassifierGuardrail implements IGuardrailService {
  // -------------------------------------------------------------------------
  // IGuardrailService config
  // -------------------------------------------------------------------------

  /**
   * Guardrail configuration exposed to the AgentOS pipeline.
   *
   * `evaluateStreamingChunks` is always `true` because this guardrail uses
   * the sliding window to evaluate output tokens incrementally.
   */
  readonly config: GuardrailConfig;

  // -------------------------------------------------------------------------
  // Internal state
  // -------------------------------------------------------------------------

  /** The classifier orchestrator used for every classification request. */
  private readonly orchestrator: ClassifierOrchestrator;

  /** Sliding window buffer for accumulating streaming tokens. */
  private readonly buffer: SlidingWindowBuffer;

  /** Guardrail scope — which direction(s) this guardrail evaluates. */
  private readonly scope: 'input' | 'output' | 'both';

  /** Streaming evaluation strategy for output chunks. */
  private readonly streamingMode: StreamingMode;

  /**
   * Per-stream bookkeeping for background classifications started in
   * `non-blocking` and `hybrid` modes. Entries are removed when the final
   * chunk of the stream is processed.
   */
  private readonly backgroundStates: Map<string, BackgroundStreamState> = new Map();

  /**
   * Streams whose first full window has already been classified. Used by
   * `hybrid` mode to switch from blocking to non-blocking evaluation.
   */
  private readonly firstWindowEvaluated: Set<string> = new Set();

  // -------------------------------------------------------------------------
  // Constructor
  // -------------------------------------------------------------------------

  /**
   * Create a new ML classifier guardrail.
   *
   * @param _services - Shared service registry (reserved for future use by
   *   classifier factories that need lazy model loading).
   * @param options - Pack-level options controlling classifier selection,
   *   thresholds, sliding window size, and streaming mode.
   * @param classifiers - Pre-built classifier instances handed to the
   *   orchestrator. Defaults to an empty list.
   */
  constructor(
    _services: ISharedServiceRegistry,
    options: MLClassifierPackOptions,
    classifiers: IContentClassifier[] = [],
  ) {
    // Resolve thresholds: caller overrides win over the defaults.
    const thresholds = {
      ...DEFAULT_THRESHOLDS,
      ...options.thresholds,
    };

    this.orchestrator = new ClassifierOrchestrator(classifiers, thresholds);

    this.buffer = new SlidingWindowBuffer({
      chunkSize: options.chunkSize,
      contextSize: options.contextSize,
      maxEvaluations: options.maxEvaluations,
    });

    this.scope = options.guardrailScope ?? 'both';

    // `streamingMode` is normally a boolean gate, but a StreamingMode string
    // is honoured when supplied. Previously both branches of the ternary
    // yielded 'blocking', which made the other two modes unreachable.
    this.streamingMode = MLClassifierGuardrail.resolveStreamingMode(options.streamingMode);

    this.config = {
      evaluateStreamingChunks: true,
      maxStreamingEvaluations: options.maxEvaluations ?? 100,
    };
  }

  // -------------------------------------------------------------------------
  // evaluateInput
  // -------------------------------------------------------------------------

  /**
   * Evaluate a user's input message before it enters the orchestration pipeline.
   *
   * Runs the full text through the orchestrator and returns a
   * {@link GuardrailEvaluationResult} when the recommended action is anything
   * other than ALLOW, or `null` otherwise.
   *
   * Skipped entirely when `scope === 'output'` or when there is no text.
   *
   * @param payload - The input payload containing user text and context.
   * @returns Evaluation result or `null` if no action is needed.
   */
  async evaluateInput(payload: GuardrailInputPayload): Promise<GuardrailEvaluationResult | null> {
    if (this.scope === 'output') {
      return null;
    }

    const text = payload.input.textInput;
    if (!text) {
      return null;
    }

    const evaluation = await this.orchestrator.classifyAll(text);
    return this.evaluationToResult(evaluation);
  }

  // -------------------------------------------------------------------------
  // evaluateOutput
  // -------------------------------------------------------------------------

  /**
   * Evaluate a streamed output chunk from the agent before it is delivered
   * to the client.
   *
   * Text deltas are accumulated in the sliding window buffer; when the buffer
   * reports a ready window it is classified according to the configured
   * streaming mode. A final chunk flushes the buffer, waits for any
   * background classifications still running for the stream, and reports
   * the first violation found.
   *
   * Skipped entirely when `scope === 'input'`.
   *
   * @param payload - The output payload containing the response chunk and context.
   * @returns Evaluation result or `null` if no action is needed yet.
   * @throws Whatever the orchestrator rejected with, including failures of
   *   background classifications started by earlier calls.
   */
  async evaluateOutput(payload: GuardrailOutputPayload): Promise<GuardrailEvaluationResult | null> {
    if (this.scope === 'input') {
      return null;
    }

    const chunk = payload.chunk;
    const streamId = chunk.streamId;

    if (chunk.isFinal) {
      return this.finishStream(streamId);
    }

    // Only process TEXT_DELTA chunks — ignore tool calls, progress, etc.
    if (chunk.type !== AgentOSResponseChunkType.TEXT_DELTA) {
      return null;
    }

    // Read the delta defensively instead of casting the chunk to `any`.
    const textDelta = (chunk as { textDelta?: unknown }).textDelta;
    if (typeof textDelta !== 'string' || textDelta === '') {
      return null;
    }

    switch (this.streamingMode) {
      case 'non-blocking':
        return this.handleNonBlocking(streamId, textDelta);

      case 'hybrid':
        return this.handleHybrid(streamId, textDelta);

      case 'blocking':
      default:
        return this.handleBlocking(streamId, textDelta);
    }
  }

  // -------------------------------------------------------------------------
  // Streaming mode handlers
  // -------------------------------------------------------------------------

  /**
   * **Blocking mode**: push text into the buffer and, when a full window is
   * ready, await the classifier result before returning.
   *
   * @param streamId - Identifier of the active stream.
   * @param textDelta - New text fragment from the current chunk.
   * @returns Evaluation result for the ready window, or `null` when no
   *   window is ready or the window is allowed.
   */
  private async handleBlocking(
    streamId: string,
    textDelta: string,
  ): Promise<GuardrailEvaluationResult | null> {
    const ready = this.buffer.push(streamId, textDelta);
    if (!ready) {
      return null;
    }

    const evaluation = await this.orchestrator.classifyAll(ready.text);
    return this.evaluationToResult(evaluation);
  }

  /**
   * **Non-blocking mode**: push text into the buffer and, when a window is
   * ready, start its classification without awaiting it. Outcomes of
   * classifications that settled since the previous call are then reported.
   *
   * The text is pushed *before* settled outcomes are inspected so that the
   * current delta is never dropped from the window when an earlier
   * violation is returned.
   *
   * @param streamId - Identifier of the active stream.
   * @param textDelta - New text fragment from the current chunk.
   * @returns The oldest unreported violation for this stream, or `null`.
   * @throws The error of a background classification that failed.
   */
  private async handleNonBlocking(
    streamId: string,
    textDelta: string,
  ): Promise<GuardrailEvaluationResult | null> {
    const ready = this.buffer.push(streamId, textDelta);
    if (ready) {
      this.startBackgroundClassification(streamId, ready.text);
    }

    const state = this.backgroundStates.get(streamId);
    return state ? this.takeSettledViolation(state) : null;
  }

  /**
   * **Hybrid mode**: the first full window of each stream is classified in
   * blocking mode; every later window uses non-blocking evaluation.
   *
   * The switch happens once a window has actually been classified, not
   * after the first call: a single text delta rarely fills a window, so
   * switching on the first call would skip the blocking evaluation entirely.
   *
   * @param streamId - Identifier of the active stream.
   * @param textDelta - New text fragment from the current chunk.
   * @returns Evaluation result or `null`.
   */
  private async handleHybrid(
    streamId: string,
    textDelta: string,
  ): Promise<GuardrailEvaluationResult | null> {
    if (this.firstWindowEvaluated.has(streamId)) {
      return this.handleNonBlocking(streamId, textDelta);
    }

    const ready = this.buffer.push(streamId, textDelta);
    if (!ready) {
      return null;
    }

    // Mark before awaiting so a concurrent call for the same stream does not
    // treat a later window as the first one.
    this.firstWindowEvaluated.add(streamId);

    const evaluation = await this.orchestrator.classifyAll(ready.text);
    return this.evaluationToResult(evaluation);
  }

  // -------------------------------------------------------------------------
  // Private helpers
  // -------------------------------------------------------------------------

  /**
   * Map the `streamingMode` option onto a {@link StreamingMode}.
   *
   * @param requested - Raw option value (boolean, mode string, or unset).
   * @returns The requested mode when it is a valid mode string, otherwise
   *   `'blocking'`.
   */
  private static resolveStreamingMode(requested: unknown): StreamingMode {
    if (requested === 'blocking' || requested === 'non-blocking' || requested === 'hybrid') {
      return requested;
    }
    return 'blocking';
  }

  /**
   * Handle the final chunk of a stream: flush the buffer, drop all tracking
   * state, wait for background classifications that are still running, and
   * classify whatever text was left in the buffer.
   *
   * @param streamId - Identifier of the stream that just ended.
   * @returns The first violation found, or `null` when everything is clean.
   * @throws The error of a background classification that failed.
   */
  private async finishStream(streamId: string): Promise<GuardrailEvaluationResult | null> {
    const flushed = this.buffer.flush(streamId);

    const background = this.backgroundStates.get(streamId);
    this.backgroundStates.delete(streamId);
    this.firstWindowEvaluated.delete(streamId);

    if (background) {
      // Tracked promises never reject (failures are recorded as outcomes),
      // so this only waits for completion.
      await Promise.all(Array.from(background.inFlight));

      const late = this.takeSettledViolation(background);
      if (late) {
        return late;
      }
    }

    if (!flushed) {
      return null;
    }

    const evaluation = await this.orchestrator.classifyAll(flushed.text);
    return this.evaluationToResult(evaluation);
  }

  /**
   * Start classifying `text` without awaiting the result and record the
   * outcome (success or failure) on the stream's background state.
   *
   * @param streamId - Identifier of the active stream.
   * @param text - Window text to classify.
   */
  private startBackgroundClassification(streamId: string, text: string): void {
    let state = this.backgroundStates.get(streamId);
    if (!state) {
      state = { inFlight: new Set(), settled: [] };
      this.backgroundStates.set(streamId, state);
    }
    const target = state;

    const record = (outcome: BackgroundOutcome): void => {
      target.settled.push(outcome);
      target.inFlight.delete(tracked);
    };

    // Both handlers are attached immediately, so a failing classification
    // can never surface as an unhandled rejection.
    const tracked: Promise<void> = this.orchestrator.classifyAll(text).then(
      (evaluation: ChunkEvaluation) => record({ ok: true, evaluation }),
      (error: unknown) => record({ ok: false, error }),
    );
    target.inFlight.add(tracked);
  }

  /**
   * Consume settled background outcomes until one requires action.
   *
   * Clean (ALLOW) outcomes are discarded. Outcomes after the returned
   * violation stay queued for the next call.
   *
   * @param state - Background state of the stream being inspected.
   * @returns The first non-ALLOW result, or `null` when none is queued.
   * @throws The recorded error of a failed background classification.
   */
  private takeSettledViolation(state: BackgroundStreamState): GuardrailEvaluationResult | null {
    let outcome = state.settled.shift();
    while (outcome !== undefined) {
      if (!outcome.ok) {
        throw outcome.error;
      }
      const result = this.evaluationToResult(outcome.evaluation);
      if (result) {
        return result;
      }
      outcome = state.settled.shift();
    }
    return null;
  }

  /**
   * Convert a {@link ChunkEvaluation} into a {@link GuardrailEvaluationResult}
   * suitable for the AgentOS guardrail pipeline.
   *
   * Returns `null` when the recommended action is ALLOW (no intervention
   * needed). For all other actions, the evaluation details are attached as
   * metadata for audit/logging.
   *
   * @param evaluation - Aggregated classifier evaluation.
   * @returns A guardrail result or `null` for clean content.
   */
  private evaluationToResult(evaluation: ChunkEvaluation): GuardrailEvaluationResult | null {
    if (evaluation.recommendedAction === GuardrailAction.ALLOW) {
      return null;
    }

    return {
      action: evaluation.recommendedAction,
      reason: `ML classifier "${evaluation.triggeredBy}" flagged content`,
      reasonCode: `ML_CLASSIFIER_${evaluation.recommendedAction.toUpperCase()}`,
      metadata: {
        triggeredBy: evaluation.triggeredBy,
        totalLatencyMs: evaluation.totalLatencyMs,
        classifierResults: evaluation.results.map((r) => ({
          classifierId: r.classifierId,
          bestClass: r.bestClass,
          confidence: r.confidence,
          latencyMs: r.latencyMs,
        })),
      },
    };
  }
}