@framers/agentos-ext-ml-classifiers 0.1.0 → 0.2.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (51)
  1. package/CHANGELOG.md +18 -0
  2. package/dist/MLClassifierGuardrail.d.ts +88 -117
  3. package/dist/MLClassifierGuardrail.d.ts.map +1 -1
  4. package/dist/MLClassifierGuardrail.js +255 -264
  5. package/dist/MLClassifierGuardrail.js.map +1 -1
  6. package/dist/classifiers/InjectionClassifier.d.ts +1 -1
  7. package/dist/classifiers/InjectionClassifier.d.ts.map +1 -1
  8. package/dist/classifiers/JailbreakClassifier.d.ts +1 -1
  9. package/dist/classifiers/JailbreakClassifier.d.ts.map +1 -1
  10. package/dist/classifiers/ToxicityClassifier.d.ts +1 -1
  11. package/dist/classifiers/ToxicityClassifier.d.ts.map +1 -1
  12. package/dist/classifiers/WorkerClassifierProxy.d.ts +1 -1
  13. package/dist/classifiers/WorkerClassifierProxy.d.ts.map +1 -1
  14. package/dist/index.d.ts +16 -90
  15. package/dist/index.d.ts.map +1 -1
  16. package/dist/index.js +33 -306
  17. package/dist/index.js.map +1 -1
  18. package/dist/keyword-classifier.d.ts +26 -0
  19. package/dist/keyword-classifier.d.ts.map +1 -0
  20. package/dist/keyword-classifier.js +113 -0
  21. package/dist/keyword-classifier.js.map +1 -0
  22. package/dist/llm-classifier.d.ts +27 -0
  23. package/dist/llm-classifier.d.ts.map +1 -0
  24. package/dist/llm-classifier.js +129 -0
  25. package/dist/llm-classifier.js.map +1 -0
  26. package/dist/tools/ClassifyContentTool.d.ts +53 -80
  27. package/dist/tools/ClassifyContentTool.d.ts.map +1 -1
  28. package/dist/tools/ClassifyContentTool.js +52 -103
  29. package/dist/tools/ClassifyContentTool.js.map +1 -1
  30. package/dist/types.d.ts +77 -277
  31. package/dist/types.d.ts.map +1 -1
  32. package/dist/types.js +9 -55
  33. package/dist/types.js.map +1 -1
  34. package/package.json +10 -16
  35. package/src/MLClassifierGuardrail.ts +279 -316
  36. package/src/index.ts +35 -339
  37. package/src/keyword-classifier.ts +130 -0
  38. package/src/llm-classifier.ts +163 -0
  39. package/src/tools/ClassifyContentTool.ts +75 -132
  40. package/src/types.ts +78 -325
  41. package/test/ClassifierOrchestrator.spec.ts +365 -0
  42. package/test/ClassifyContentTool.spec.ts +226 -0
  43. package/test/InjectionClassifier.spec.ts +263 -0
  44. package/test/JailbreakClassifier.spec.ts +295 -0
  45. package/test/MLClassifierGuardrail.spec.ts +486 -0
  46. package/test/SlidingWindowBuffer.spec.ts +391 -0
  47. package/test/ToxicityClassifier.spec.ts +268 -0
  48. package/test/WorkerClassifierProxy.spec.ts +303 -0
  49. package/test/index.spec.ts +431 -0
  50. package/tsconfig.json +20 -0
  51. package/vitest.config.ts +24 -0
@@ -0,0 +1,486 @@
1
+ /**
2
+ * @fileoverview Unit tests for `MLClassifierGuardrail`.
3
+ *
4
+ * Tests verify:
5
+ * - evaluateInput: classifies full text, returns BLOCK/FLAG/null
6
+ * - evaluateInput: returns null for clean text
7
+ * - evaluateInput: returns null when scope is 'output'
8
+ * - evaluateOutput: accumulates chunks, triggers at chunkSize
9
+ * - evaluateOutput: returns null for non-TEXT_DELTA chunks
10
+ * - evaluateOutput: returns null when scope is 'input'
11
+ * - evaluateOutput: flushes on isFinal
12
+ * - config.evaluateStreamingChunks is true
13
+ * - blocking mode: awaits classification
14
+ * - non-blocking mode: returns null immediately, BLOCK on next call if violation
15
+ */
16
+
17
+ import { describe, it, expect, vi } from 'vitest';
18
+ import { MLClassifierGuardrail } from '../src/MLClassifierGuardrail';
19
+ import type { IContentClassifier } from '../src/IContentClassifier';
20
+ import type { ClassificationResult } from '@framers/agentos';
21
+ import type { MLClassifierPackOptions } from '../src/types';
22
+ import type { ISharedServiceRegistry } from '@framers/agentos';
23
+ import type {
24
+ GuardrailInputPayload,
25
+ GuardrailOutputPayload,
26
+ GuardrailContext,
27
+ } from '@framers/agentos';
28
+ import { GuardrailAction } from '@framers/agentos';
29
+ import { AgentOSResponseChunkType } from '@framers/agentos';
30
+
31
+ // ---------------------------------------------------------------------------
32
+ // Mock helpers
33
+ // ---------------------------------------------------------------------------
34
+
35
+ /**
36
+ * Create a mock {@link IContentClassifier} that returns a configurable result.
37
+ *
38
+ * @param id - Unique classifier ID.
39
+ * @param result - The classification result to return from `classify()`.
40
+ */
41
+ function createMockClassifier(
42
+ id: string,
43
+ result: ClassificationResult,
44
+ ): IContentClassifier {
45
+ return {
46
+ id,
47
+ displayName: `Mock ${id}`,
48
+ description: `Mock classifier: ${id}`,
49
+ modelId: `mock/${id}`,
50
+ isLoaded: true,
51
+ classify: vi.fn(async () => result),
52
+ dispose: vi.fn(async () => {}),
53
+ };
54
+ }
55
+
56
+ /** Minimal mock of the shared service registry (not used by tests). */
57
+ function createMockRegistry(): ISharedServiceRegistry {
58
+ return {
59
+ getOrCreate: vi.fn(),
60
+ has: vi.fn(() => false),
61
+ release: vi.fn(),
62
+ releaseAll: vi.fn(),
63
+ };
64
+ }
65
+
66
+ /** Benign result — low confidence, should result in ALLOW. */
67
+ const BENIGN: ClassificationResult = {
68
+ bestClass: 'benign',
69
+ confidence: 0.1,
70
+ allScores: [{ classLabel: 'benign', score: 0.1 }],
71
+ };
72
+
73
+ /** Toxic result — high confidence, above default block threshold (0.9). */
74
+ const TOXIC: ClassificationResult = {
75
+ bestClass: 'toxic',
76
+ confidence: 0.95,
77
+ allScores: [{ classLabel: 'toxic', score: 0.95 }],
78
+ };
79
+
80
+ /** Flag-level result — confidence between flag (0.7) and block (0.9). */
81
+ const FLAGGABLE: ClassificationResult = {
82
+ bestClass: 'suspicious',
83
+ confidence: 0.75,
84
+ allScores: [{ classLabel: 'suspicious', score: 0.75 }],
85
+ };
86
+
87
+ /** Shared guardrail context for all test payloads. */
88
+ const CONTEXT: GuardrailContext = {
89
+ userId: 'user-1',
90
+ sessionId: 'session-1',
91
+ };
92
+
93
+ /**
94
+ * Build a {@link GuardrailInputPayload} with the given text.
95
+ */
96
+ function inputPayload(text: string | null): GuardrailInputPayload {
97
+ return {
98
+ context: CONTEXT,
99
+ input: {
100
+ userId: 'user-1',
101
+ sessionId: 'session-1',
102
+ textInput: text,
103
+ },
104
+ };
105
+ }
106
+
107
+ /**
108
+ * Build a {@link GuardrailOutputPayload} wrapping a TEXT_DELTA chunk.
109
+ *
110
+ * @param streamId - Stream identifier.
111
+ * @param textDelta - The text delta content.
112
+ * @param isFinal - Whether this is the final chunk.
113
+ */
114
+ function textDeltaPayload(
115
+ streamId: string,
116
+ textDelta: string,
117
+ isFinal = false,
118
+ ): GuardrailOutputPayload {
119
+ return {
120
+ context: CONTEXT,
121
+ chunk: {
122
+ type: AgentOSResponseChunkType.TEXT_DELTA,
123
+ streamId,
124
+ gmiInstanceId: 'gmi-1',
125
+ personaId: 'persona-1',
126
+ isFinal,
127
+ timestamp: new Date().toISOString(),
128
+ textDelta,
129
+ } as any,
130
+ };
131
+ }
132
+
133
+ /**
134
+ * Build a {@link GuardrailOutputPayload} wrapping a non-TEXT_DELTA chunk
135
+ * (e.g. SYSTEM_PROGRESS).
136
+ */
137
+ function nonTextPayload(streamId: string): GuardrailOutputPayload {
138
+ return {
139
+ context: CONTEXT,
140
+ chunk: {
141
+ type: AgentOSResponseChunkType.SYSTEM_PROGRESS,
142
+ streamId,
143
+ gmiInstanceId: 'gmi-1',
144
+ personaId: 'persona-1',
145
+ isFinal: false,
146
+ timestamp: new Date().toISOString(),
147
+ message: 'Processing...',
148
+ } as any,
149
+ };
150
+ }
151
+
152
+ /**
153
+ * Build a final chunk payload (isFinal=true) with a given stream ID.
154
+ * Uses FINAL_RESPONSE type to trigger the flush path.
155
+ */
156
+ function finalPayload(streamId: string): GuardrailOutputPayload {
157
+ return {
158
+ context: CONTEXT,
159
+ chunk: {
160
+ type: AgentOSResponseChunkType.FINAL_RESPONSE,
161
+ streamId,
162
+ gmiInstanceId: 'gmi-1',
163
+ personaId: 'persona-1',
164
+ isFinal: true,
165
+ timestamp: new Date().toISOString(),
166
+ finalResponseText: 'done',
167
+ } as any,
168
+ };
169
+ }
170
+
171
+ /** Default pack options for a guardrail with small window for easier testing. */
172
+ const DEFAULT_OPTIONS: MLClassifierPackOptions = {
173
+ chunkSize: 10, // 10 tokens = 40 chars triggers a window
174
+ contextSize: 2,
175
+ maxEvaluations: 100,
176
+ guardrailScope: 'both',
177
+ streamingMode: true,
178
+ };
179
+
180
+ // ---------------------------------------------------------------------------
181
+ // Tests
182
+ // ---------------------------------------------------------------------------
183
+
184
+ describe('MLClassifierGuardrail', () => {
185
+ // -----------------------------------------------------------------------
186
+ // evaluateInput
187
+ // -----------------------------------------------------------------------
188
+
189
+ describe('evaluateInput', () => {
190
+ it('classifies full text and returns BLOCK for toxic content', async () => {
191
+ const classifier = createMockClassifier('tox', TOXIC);
192
+ const guardrail = new MLClassifierGuardrail(
193
+ createMockRegistry(),
194
+ DEFAULT_OPTIONS,
195
+ [classifier],
196
+ );
197
+
198
+ const result = await guardrail.evaluateInput!(inputPayload('you are terrible'));
199
+
200
+ expect(result).not.toBeNull();
201
+ expect(result!.action).toBe(GuardrailAction.BLOCK);
202
+ expect(result!.reason).toContain('tox');
203
+ });
204
+
205
+ it('returns FLAG for moderately suspicious content', async () => {
206
+ const classifier = createMockClassifier('mod', FLAGGABLE);
207
+ const guardrail = new MLClassifierGuardrail(
208
+ createMockRegistry(),
209
+ DEFAULT_OPTIONS,
210
+ [classifier],
211
+ );
212
+
213
+ const result = await guardrail.evaluateInput!(inputPayload('hmm suspicious'));
214
+
215
+ expect(result).not.toBeNull();
216
+ expect(result!.action).toBe(GuardrailAction.FLAG);
217
+ });
218
+
219
+ it('returns null for clean text', async () => {
220
+ const classifier = createMockClassifier('safe', BENIGN);
221
+ const guardrail = new MLClassifierGuardrail(
222
+ createMockRegistry(),
223
+ DEFAULT_OPTIONS,
224
+ [classifier],
225
+ );
226
+
227
+ const result = await guardrail.evaluateInput!(inputPayload('hello world'));
228
+
229
+ expect(result).toBeNull();
230
+ });
231
+
232
+ it('returns null when scope is output', async () => {
233
+ const classifier = createMockClassifier('tox', TOXIC);
234
+ const guardrail = new MLClassifierGuardrail(
235
+ createMockRegistry(),
236
+ { ...DEFAULT_OPTIONS, guardrailScope: 'output' },
237
+ [classifier],
238
+ );
239
+
240
+ const result = await guardrail.evaluateInput!(inputPayload('toxic content'));
241
+
242
+ expect(result).toBeNull();
243
+ // Classifier should NOT have been called.
244
+ expect(classifier.classify).not.toHaveBeenCalled();
245
+ });
246
+
247
+ it('returns null when textInput is null', async () => {
248
+ const classifier = createMockClassifier('tox', TOXIC);
249
+ const guardrail = new MLClassifierGuardrail(
250
+ createMockRegistry(),
251
+ DEFAULT_OPTIONS,
252
+ [classifier],
253
+ );
254
+
255
+ const result = await guardrail.evaluateInput!(inputPayload(null));
256
+
257
+ expect(result).toBeNull();
258
+ });
259
+ });
260
+
261
+ // -----------------------------------------------------------------------
262
+ // evaluateOutput
263
+ // -----------------------------------------------------------------------
264
+
265
+ describe('evaluateOutput', () => {
266
+ it('returns null when scope is input', async () => {
267
+ const classifier = createMockClassifier('tox', TOXIC);
268
+ const guardrail = new MLClassifierGuardrail(
269
+ createMockRegistry(),
270
+ { ...DEFAULT_OPTIONS, guardrailScope: 'input' },
271
+ [classifier],
272
+ );
273
+
274
+ const result = await guardrail.evaluateOutput!(
275
+ textDeltaPayload('s1', 'a'.repeat(100)),
276
+ );
277
+
278
+ expect(result).toBeNull();
279
+ });
280
+
281
+ it('returns null for non-TEXT_DELTA chunks', async () => {
282
+ const classifier = createMockClassifier('tox', TOXIC);
283
+ const guardrail = new MLClassifierGuardrail(
284
+ createMockRegistry(),
285
+ DEFAULT_OPTIONS,
286
+ [classifier],
287
+ );
288
+
289
+ const result = await guardrail.evaluateOutput!(nonTextPayload('s1'));
290
+
291
+ expect(result).toBeNull();
292
+ });
293
+
294
+ it('accumulates chunks and triggers classification at chunkSize', async () => {
295
+ const classifier = createMockClassifier('tox', TOXIC);
296
+ const guardrail = new MLClassifierGuardrail(
297
+ createMockRegistry(),
298
+ DEFAULT_OPTIONS,
299
+ [classifier],
300
+ );
301
+
302
+ // Push less than chunkSize (10 tokens = 40 chars) — should not trigger.
303
+ const r1 = await guardrail.evaluateOutput!(
304
+ textDeltaPayload('s1', 'a'.repeat(20)),
305
+ );
306
+ expect(r1).toBeNull();
307
+
308
+ // Push enough to exceed chunkSize — should trigger classification.
309
+ const r2 = await guardrail.evaluateOutput!(
310
+ textDeltaPayload('s1', 'a'.repeat(25)),
311
+ );
312
+
313
+ // With TOXIC classifier, the result should be BLOCK.
314
+ expect(r2).not.toBeNull();
315
+ expect(r2!.action).toBe(GuardrailAction.BLOCK);
316
+ });
317
+
318
+ it('flushes remaining buffer on isFinal', async () => {
319
+ const classifier = createMockClassifier('tox', TOXIC);
320
+ const guardrail = new MLClassifierGuardrail(
321
+ createMockRegistry(),
322
+ DEFAULT_OPTIONS,
323
+ [classifier],
324
+ );
325
+
326
+ // Push some text (not enough for a full window).
327
+ await guardrail.evaluateOutput!(textDeltaPayload('s1', 'a'.repeat(20)));
328
+
329
+ // Send a final chunk — should flush and classify remaining text.
330
+ const result = await guardrail.evaluateOutput!(finalPayload('s1'));
331
+
332
+ expect(result).not.toBeNull();
333
+ expect(result!.action).toBe(GuardrailAction.BLOCK);
334
+ });
335
+
336
+ it('returns null on isFinal when buffer is empty', async () => {
337
+ const classifier = createMockClassifier('safe', BENIGN);
338
+ const guardrail = new MLClassifierGuardrail(
339
+ createMockRegistry(),
340
+ DEFAULT_OPTIONS,
341
+ [classifier],
342
+ );
343
+
344
+ // No text was pushed — final flush should return null.
345
+ const result = await guardrail.evaluateOutput!(finalPayload('s1'));
346
+
347
+ expect(result).toBeNull();
348
+ });
349
+ });
350
+
351
+ // -----------------------------------------------------------------------
352
+ // config
353
+ // -----------------------------------------------------------------------
354
+
355
+ describe('config', () => {
356
+ it('evaluateStreamingChunks is true', () => {
357
+ const guardrail = new MLClassifierGuardrail(
358
+ createMockRegistry(),
359
+ DEFAULT_OPTIONS,
360
+ [],
361
+ );
362
+
363
+ expect(guardrail.config.evaluateStreamingChunks).toBe(true);
364
+ });
365
+
366
+ it('maxStreamingEvaluations defaults to 100', () => {
367
+ const guardrail = new MLClassifierGuardrail(
368
+ createMockRegistry(),
369
+ { ...DEFAULT_OPTIONS, maxEvaluations: undefined },
370
+ [],
371
+ );
372
+
373
+ expect(guardrail.config.maxStreamingEvaluations).toBe(100);
374
+ });
375
+
376
+ it('maxStreamingEvaluations uses provided value', () => {
377
+ const guardrail = new MLClassifierGuardrail(
378
+ createMockRegistry(),
379
+ { ...DEFAULT_OPTIONS, maxEvaluations: 50 },
380
+ [],
381
+ );
382
+
383
+ expect(guardrail.config.maxStreamingEvaluations).toBe(50);
384
+ });
385
+ });
386
+
387
+ // -----------------------------------------------------------------------
388
+ // Blocking mode
389
+ // -----------------------------------------------------------------------
390
+
391
+ describe('blocking mode', () => {
392
+ it('awaits classification and returns result immediately when window fills', async () => {
393
+ const classifier = createMockClassifier('tox', TOXIC);
394
+ const guardrail = new MLClassifierGuardrail(
395
+ createMockRegistry(),
396
+ { ...DEFAULT_OPTIONS, streamingMode: true },
397
+ [classifier],
398
+ );
399
+
400
+ // Push enough text to fill the window (10 tokens = 40 chars).
401
+ const result = await guardrail.evaluateOutput!(
402
+ textDeltaPayload('s1', 'a'.repeat(45)),
403
+ );
404
+
405
+ // Should return BLOCK synchronously (within the same call).
406
+ expect(result).not.toBeNull();
407
+ expect(result!.action).toBe(GuardrailAction.BLOCK);
408
+ });
409
+
410
+ it('returns null when window is not yet full', async () => {
411
+ const classifier = createMockClassifier('tox', TOXIC);
412
+ const guardrail = new MLClassifierGuardrail(
413
+ createMockRegistry(),
414
+ DEFAULT_OPTIONS,
415
+ [classifier],
416
+ );
417
+
418
+ // Push less than chunkSize.
419
+ const result = await guardrail.evaluateOutput!(
420
+ textDeltaPayload('s1', 'a'.repeat(10)),
421
+ );
422
+
423
+ expect(result).toBeNull();
424
+ });
425
+ });
426
+
427
+ // -----------------------------------------------------------------------
428
+ // Non-blocking mode (requires direct instantiation with mode override)
429
+ // -----------------------------------------------------------------------
430
+
431
+ describe('non-blocking mode behaviour via evaluateOutput', () => {
432
+ it('returns ALLOW (null) for clean classifier even when window fills', async () => {
433
+ const classifier = createMockClassifier('safe', BENIGN);
434
+ const guardrail = new MLClassifierGuardrail(
435
+ createMockRegistry(),
436
+ DEFAULT_OPTIONS,
437
+ [classifier],
438
+ );
439
+
440
+ // Fill window.
441
+ const result = await guardrail.evaluateOutput!(
442
+ textDeltaPayload('s1', 'a'.repeat(45)),
443
+ );
444
+
445
+ // Benign → ALLOW → null.
446
+ expect(result).toBeNull();
447
+ });
448
+ });
449
+
450
+ // -----------------------------------------------------------------------
451
+ // Metadata in results
452
+ // -----------------------------------------------------------------------
453
+
454
+ describe('result metadata', () => {
455
+ it('includes triggeredBy and classifier details in metadata', async () => {
456
+ const classifier = createMockClassifier('injection', TOXIC);
457
+ const guardrail = new MLClassifierGuardrail(
458
+ createMockRegistry(),
459
+ DEFAULT_OPTIONS,
460
+ [classifier],
461
+ );
462
+
463
+ const result = await guardrail.evaluateInput!(inputPayload('inject this'));
464
+
465
+ expect(result).not.toBeNull();
466
+ expect(result!.metadata).toBeDefined();
467
+ expect(result!.metadata!.triggeredBy).toBe('injection');
468
+ expect(result!.metadata!.classifierResults).toBeInstanceOf(Array);
469
+ expect((result!.metadata!.classifierResults as any[])[0].classifierId).toBe('injection');
470
+ });
471
+
472
+ it('includes reasonCode in result', async () => {
473
+ const classifier = createMockClassifier('tox', TOXIC);
474
+ const guardrail = new MLClassifierGuardrail(
475
+ createMockRegistry(),
476
+ DEFAULT_OPTIONS,
477
+ [classifier],
478
+ );
479
+
480
+ const result = await guardrail.evaluateInput!(inputPayload('bad'));
481
+
482
+ expect(result).not.toBeNull();
483
+ expect(result!.reasonCode).toBe('ML_CLASSIFIER_BLOCK');
484
+ });
485
+ });
486
+ });