@framers/agentos-ext-ml-classifiers 0.1.0 → 0.2.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +18 -0
- package/dist/MLClassifierGuardrail.d.ts +88 -117
- package/dist/MLClassifierGuardrail.d.ts.map +1 -1
- package/dist/MLClassifierGuardrail.js +255 -264
- package/dist/MLClassifierGuardrail.js.map +1 -1
- package/dist/classifiers/InjectionClassifier.d.ts +1 -1
- package/dist/classifiers/InjectionClassifier.d.ts.map +1 -1
- package/dist/classifiers/JailbreakClassifier.d.ts +1 -1
- package/dist/classifiers/JailbreakClassifier.d.ts.map +1 -1
- package/dist/classifiers/ToxicityClassifier.d.ts +1 -1
- package/dist/classifiers/ToxicityClassifier.d.ts.map +1 -1
- package/dist/classifiers/WorkerClassifierProxy.d.ts +1 -1
- package/dist/classifiers/WorkerClassifierProxy.d.ts.map +1 -1
- package/dist/index.d.ts +16 -90
- package/dist/index.d.ts.map +1 -1
- package/dist/index.js +33 -306
- package/dist/index.js.map +1 -1
- package/dist/keyword-classifier.d.ts +26 -0
- package/dist/keyword-classifier.d.ts.map +1 -0
- package/dist/keyword-classifier.js +113 -0
- package/dist/keyword-classifier.js.map +1 -0
- package/dist/llm-classifier.d.ts +27 -0
- package/dist/llm-classifier.d.ts.map +1 -0
- package/dist/llm-classifier.js +129 -0
- package/dist/llm-classifier.js.map +1 -0
- package/dist/tools/ClassifyContentTool.d.ts +53 -80
- package/dist/tools/ClassifyContentTool.d.ts.map +1 -1
- package/dist/tools/ClassifyContentTool.js +52 -103
- package/dist/tools/ClassifyContentTool.js.map +1 -1
- package/dist/types.d.ts +77 -277
- package/dist/types.d.ts.map +1 -1
- package/dist/types.js +9 -55
- package/dist/types.js.map +1 -1
- package/package.json +10 -16
- package/src/MLClassifierGuardrail.ts +279 -316
- package/src/index.ts +35 -339
- package/src/keyword-classifier.ts +130 -0
- package/src/llm-classifier.ts +163 -0
- package/src/tools/ClassifyContentTool.ts +75 -132
- package/src/types.ts +78 -325
- package/test/ClassifierOrchestrator.spec.ts +365 -0
- package/test/ClassifyContentTool.spec.ts +226 -0
- package/test/InjectionClassifier.spec.ts +263 -0
- package/test/JailbreakClassifier.spec.ts +295 -0
- package/test/MLClassifierGuardrail.spec.ts +486 -0
- package/test/SlidingWindowBuffer.spec.ts +391 -0
- package/test/ToxicityClassifier.spec.ts +268 -0
- package/test/WorkerClassifierProxy.spec.ts +303 -0
- package/test/index.spec.ts +431 -0
- package/tsconfig.json +20 -0
- package/vitest.config.ts +24 -0
|
@@ -0,0 +1,365 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* @fileoverview Unit tests for `ClassifierOrchestrator`.
|
|
3
|
+
*
|
|
4
|
+
* Tests verify:
|
|
5
|
+
* - Classifiers run in parallel (total time < sum of individual latencies)
|
|
6
|
+
* - Worst-wins aggregation: any BLOCK → overall BLOCK
|
|
7
|
+
* - FLAG > ALLOW in aggregation
|
|
8
|
+
* - All pass → ALLOW with triggeredBy null
|
|
9
|
+
* - triggeredBy identifies the classifier that caused escalation
|
|
10
|
+
* - Single classifier failure does not block others (contributes ALLOW)
|
|
11
|
+
* - Per-classifier threshold overrides work correctly
|
|
12
|
+
* - dispose() calls dispose on all classifiers
|
|
13
|
+
*/
|
|
14
|
+
|
|
15
|
+
import { describe, it, expect, vi } from 'vitest';
|
|
16
|
+
import { ClassifierOrchestrator } from '../src/ClassifierOrchestrator';
|
|
17
|
+
import type { IContentClassifier } from '../src/IContentClassifier';
|
|
18
|
+
import type { ClassificationResult } from '@framers/agentos';
|
|
19
|
+
import type { ClassifierThresholds } from '../src/types';
|
|
20
|
+
import { DEFAULT_THRESHOLDS } from '../src/types';
|
|
21
|
+
import { GuardrailAction } from '@framers/agentos';
|
|
22
|
+
|
|
23
|
+
// ---------------------------------------------------------------------------
|
|
24
|
+
// Mock classifier factory
|
|
25
|
+
// ---------------------------------------------------------------------------
|
|
26
|
+
|
|
27
|
+
/**
|
|
28
|
+
* Create a mock classifier that returns a configurable result after an
|
|
29
|
+
* optional simulated delay. The `dispose` method is a vitest spy so
|
|
30
|
+
* callers can assert it was invoked.
|
|
31
|
+
*
|
|
32
|
+
* @param id - Unique classifier ID.
|
|
33
|
+
* @param result - The classification result to return.
|
|
34
|
+
* @param delayMs - Simulated inference latency (ms).
|
|
35
|
+
* @param shouldFail - If true, `classify()` rejects with an error.
|
|
36
|
+
*/
|
|
37
|
+
function createMockClassifier(
|
|
38
|
+
id: string,
|
|
39
|
+
result: ClassificationResult,
|
|
40
|
+
delayMs = 0,
|
|
41
|
+
shouldFail = false,
|
|
42
|
+
): IContentClassifier & { dispose: ReturnType<typeof vi.fn> } {
|
|
43
|
+
return {
|
|
44
|
+
id,
|
|
45
|
+
displayName: `Mock ${id}`,
|
|
46
|
+
description: `Mock classifier: ${id}`,
|
|
47
|
+
modelId: `mock/${id}`,
|
|
48
|
+
isLoaded: true,
|
|
49
|
+
classify: async (_text: string): Promise<ClassificationResult> => {
|
|
50
|
+
if (delayMs > 0) {
|
|
51
|
+
await new Promise((r) => setTimeout(r, delayMs));
|
|
52
|
+
}
|
|
53
|
+
if (shouldFail) {
|
|
54
|
+
throw new Error(`${id} inference failed`);
|
|
55
|
+
}
|
|
56
|
+
return result;
|
|
57
|
+
},
|
|
58
|
+
dispose: vi.fn(async () => {}),
|
|
59
|
+
};
|
|
60
|
+
}
|
|
61
|
+
|
|
62
|
+
// ---------------------------------------------------------------------------
|
|
63
|
+
// Helpers — pre-built classification results
|
|
64
|
+
// ---------------------------------------------------------------------------
|
|
65
|
+
|
|
66
|
+
/** A benign result with very low confidence (0.1) — below every default threshold. */
const BENIGN_RESULT: ClassificationResult = {
  bestClass: 'benign',
  confidence: 0.1,
  allScores: [{ classLabel: 'benign', score: 0.1 }],
};

/** A toxic result with confidence (0.95) above the default block threshold (0.9). */
const TOXIC_BLOCK_RESULT: ClassificationResult = {
  bestClass: 'toxic',
  confidence: 0.95,
  allScores: [{ classLabel: 'toxic', score: 0.95 }],
};

/** A flaggable result — confidence (0.75) between flag (0.7) and block (0.9). */
const FLAG_RESULT: ClassificationResult = {
  bestClass: 'suspicious',
  confidence: 0.75,
  allScores: [{ classLabel: 'suspicious', score: 0.75 }],
};

/** A warn-level result — confidence (0.5) between warn (0.4) and flag (0.7). */
const WARN_RESULT: ClassificationResult = {
  bestClass: 'mildly_suspicious',
  confidence: 0.5,
  allScores: [{ classLabel: 'mildly_suspicious', score: 0.5 }],
};
|
|
93
|
+
|
|
94
|
+
// ---------------------------------------------------------------------------
|
|
95
|
+
// Tests
|
|
96
|
+
// ---------------------------------------------------------------------------
|
|
97
|
+
|
|
98
|
+
describe('ClassifierOrchestrator', () => {
  // -----------------------------------------------------------------------
  // Parallel execution
  // -----------------------------------------------------------------------

  it('runs classifiers in parallel (total time < sum of individual latencies)', async () => {
    // Each classifier takes 50ms. If run sequentially total would be ~150ms.
    const classifiers = [
      createMockClassifier('a', BENIGN_RESULT, 50),
      createMockClassifier('b', BENIGN_RESULT, 50),
      createMockClassifier('c', BENIGN_RESULT, 50),
    ];

    const orchestrator = new ClassifierOrchestrator(classifiers, DEFAULT_THRESHOLDS);

    const start = performance.now();
    await orchestrator.classifyAll('hello');
    const elapsed = performance.now() - start;

    // Parallel execution should complete in roughly 50ms + overhead,
    // well under the sequential total of 150ms.
    // NOTE(review): a wall-clock bound of 130ms may flake on a heavily loaded
    // CI runner — consider asserting overlap via spy timestamps instead.
    expect(elapsed).toBeLessThan(130);
  });

  // -----------------------------------------------------------------------
  // Worst-wins aggregation
  // -----------------------------------------------------------------------

  it('worst-wins: any BLOCK → result is BLOCK', async () => {
    const classifiers = [
      createMockClassifier('clean', BENIGN_RESULT),
      createMockClassifier('toxic', TOXIC_BLOCK_RESULT),
    ];

    const orchestrator = new ClassifierOrchestrator(classifiers, DEFAULT_THRESHOLDS);
    const result = await orchestrator.classifyAll('bad text');

    expect(result.recommendedAction).toBe(GuardrailAction.BLOCK);
    expect(result.triggeredBy).toBe('toxic');
  });

  it('FLAG > ALLOW in aggregation', async () => {
    const classifiers = [
      createMockClassifier('clean', BENIGN_RESULT),
      createMockClassifier('flagger', FLAG_RESULT),
    ];

    const orchestrator = new ClassifierOrchestrator(classifiers, DEFAULT_THRESHOLDS);
    const result = await orchestrator.classifyAll('some text');

    expect(result.recommendedAction).toBe(GuardrailAction.FLAG);
    expect(result.triggeredBy).toBe('flagger');
  });

  it('BLOCK wins over FLAG in aggregation', async () => {
    const classifiers = [
      createMockClassifier('flagger', FLAG_RESULT),
      createMockClassifier('blocker', TOXIC_BLOCK_RESULT),
    ];

    const orchestrator = new ClassifierOrchestrator(classifiers, DEFAULT_THRESHOLDS);
    const result = await orchestrator.classifyAll('bad text');

    expect(result.recommendedAction).toBe(GuardrailAction.BLOCK);
    expect(result.triggeredBy).toBe('blocker');
  });

  // -----------------------------------------------------------------------
  // All pass → ALLOW
  // -----------------------------------------------------------------------

  it('all pass → ALLOW with triggeredBy null', async () => {
    const classifiers = [
      createMockClassifier('a', BENIGN_RESULT),
      createMockClassifier('b', BENIGN_RESULT),
    ];

    const orchestrator = new ClassifierOrchestrator(classifiers, DEFAULT_THRESHOLDS);
    const result = await orchestrator.classifyAll('hello world');

    expect(result.recommendedAction).toBe(GuardrailAction.ALLOW);
    expect(result.triggeredBy).toBeNull();
    expect(result.results).toHaveLength(2);
  });

  // -----------------------------------------------------------------------
  // triggeredBy identification
  // -----------------------------------------------------------------------

  it('triggeredBy identifies which classifier triggered the action', async () => {
    const classifiers = [
      createMockClassifier('safe', BENIGN_RESULT),
      createMockClassifier('injection-detector', TOXIC_BLOCK_RESULT),
      createMockClassifier('also-safe', BENIGN_RESULT),
    ];

    const orchestrator = new ClassifierOrchestrator(classifiers, DEFAULT_THRESHOLDS);
    const result = await orchestrator.classifyAll('inject this');

    expect(result.triggeredBy).toBe('injection-detector');
  });

  // -----------------------------------------------------------------------
  // Classifier failure handling
  // -----------------------------------------------------------------------

  it('single classifier failure does not block others (contributes ALLOW)', async () => {
    const classifiers = [
      createMockClassifier('broken', BENIGN_RESULT, 0, /* shouldFail */ true),
      createMockClassifier('working', BENIGN_RESULT),
    ];

    // Silence the expected console warning so test output stays clean.
    const warnSpy = vi.spyOn(console, 'warn').mockImplementation(() => {});

    const orchestrator = new ClassifierOrchestrator(classifiers, DEFAULT_THRESHOLDS);
    const result = await orchestrator.classifyAll('test');

    // Only the working classifier's result should appear.
    expect(result.results).toHaveLength(1);
    expect(result.results[0].classifierId).toBe('working');
    expect(result.recommendedAction).toBe(GuardrailAction.ALLOW);

    // A warning should have been logged for the broken classifier.
    expect(warnSpy).toHaveBeenCalledWith(
      expect.stringContaining('Classifier "broken" failed'),
    );

    warnSpy.mockRestore();
  });

  it('failure of one classifier does not suppress BLOCK from another', async () => {
    const classifiers = [
      createMockClassifier('broken', BENIGN_RESULT, 0, true),
      createMockClassifier('blocker', TOXIC_BLOCK_RESULT),
    ];

    // Suppress the failure warning emitted for 'broken'.
    vi.spyOn(console, 'warn').mockImplementation(() => {});

    const orchestrator = new ClassifierOrchestrator(classifiers, DEFAULT_THRESHOLDS);
    const result = await orchestrator.classifyAll('toxic input');

    expect(result.recommendedAction).toBe(GuardrailAction.BLOCK);
    expect(result.triggeredBy).toBe('blocker');

    vi.restoreAllMocks();
  });

  // -----------------------------------------------------------------------
  // Per-classifier threshold overrides
  // -----------------------------------------------------------------------

  it('per-classifier threshold overrides work', async () => {
    // Create a classifier whose confidence (0.75) would normally be FLAG
    // with default thresholds, but we lower the block threshold to 0.6
    // for this specific classifier.
    const classifiers = [createMockClassifier('custom', FLAG_RESULT)];

    const perClassifierThresholds: Record<string, Partial<ClassifierThresholds>> = {
      custom: { blockThreshold: 0.6 },
    };

    const orchestrator = new ClassifierOrchestrator(
      classifiers,
      DEFAULT_THRESHOLDS,
      perClassifierThresholds,
    );

    const result = await orchestrator.classifyAll('test');

    // With block threshold at 0.6 and confidence at 0.75, this should BLOCK.
    expect(result.recommendedAction).toBe(GuardrailAction.BLOCK);
    expect(result.triggeredBy).toBe('custom');
  });

  it('per-classifier overrides do not affect other classifiers', async () => {
    const classifiers = [
      createMockClassifier('overridden', WARN_RESULT),
      createMockClassifier('default', WARN_RESULT),
    ];

    const perClassifierThresholds: Record<string, Partial<ClassifierThresholds>> = {
      // Lower the flag threshold for 'overridden' so its 0.5 confidence
      // escalates to FLAG-level.
      overridden: { flagThreshold: 0.45 },
    };

    const orchestrator = new ClassifierOrchestrator(
      classifiers,
      DEFAULT_THRESHOLDS,
      perClassifierThresholds,
    );

    const result = await orchestrator.classifyAll('test');

    // 'overridden' at 0.5 with flagThreshold=0.45 → FLAG.
    // 'default' at 0.5 with flagThreshold=0.7 → SANITIZE (warn).
    // Worst wins: FLAG > SANITIZE → FLAG.
    expect(result.recommendedAction).toBe(GuardrailAction.FLAG);
    expect(result.triggeredBy).toBe('overridden');
  });

  // -----------------------------------------------------------------------
  // Result metadata
  // -----------------------------------------------------------------------

  it('includes totalLatencyMs as wall time', async () => {
    const classifiers = [createMockClassifier('a', BENIGN_RESULT, 20)];
    const orchestrator = new ClassifierOrchestrator(classifiers, DEFAULT_THRESHOLDS);
    const result = await orchestrator.classifyAll('text');

    expect(result.totalLatencyMs).toBeGreaterThanOrEqual(0);
  });

  it('annotates each result with classifierId and latencyMs', async () => {
    const classifiers = [
      createMockClassifier('alpha', BENIGN_RESULT),
      createMockClassifier('beta', FLAG_RESULT),
    ];

    const orchestrator = new ClassifierOrchestrator(classifiers, DEFAULT_THRESHOLDS);
    const result = await orchestrator.classifyAll('text');

    // Results are expected in input order — presumably the orchestrator
    // preserves classifier ordering; confirm against implementation.
    expect(result.results).toHaveLength(2);
    expect(result.results[0].classifierId).toBe('alpha');
    expect(result.results[1].classifierId).toBe('beta');

    for (const r of result.results) {
      expect(r.latencyMs).toBeGreaterThanOrEqual(0);
      expect(r.bestClass).toBeDefined();
      expect(r.confidence).toBeDefined();
    }
  });

  // -----------------------------------------------------------------------
  // dispose
  // -----------------------------------------------------------------------

  it('dispose calls dispose on all classifiers', async () => {
    const classifiers = [
      createMockClassifier('a', BENIGN_RESULT),
      createMockClassifier('b', BENIGN_RESULT),
      createMockClassifier('c', BENIGN_RESULT),
    ];

    const orchestrator = new ClassifierOrchestrator(classifiers, DEFAULT_THRESHOLDS);
    await orchestrator.dispose();

    for (const c of classifiers) {
      expect(c.dispose).toHaveBeenCalledOnce();
    }
  });

  it('dispose handles classifiers without dispose method', async () => {
    const classifier: IContentClassifier = {
      id: 'no-dispose',
      displayName: 'No Dispose',
      description: 'Test',
      modelId: 'test',
      isLoaded: true,
      classify: async () => BENIGN_RESULT,
      // No dispose method.
    };

    const orchestrator = new ClassifierOrchestrator([classifier], DEFAULT_THRESHOLDS);

    // Should not throw.
    await expect(orchestrator.dispose()).resolves.toBeUndefined();
  });
});
|
|
@@ -0,0 +1,226 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* @fileoverview Unit tests for `ClassifyContentTool`.
|
|
3
|
+
*
|
|
4
|
+
* Tests verify:
|
|
5
|
+
* - Has correct ITool properties (id, name, displayName, etc.)
|
|
6
|
+
* - inputSchema has text (required) and classifiers (optional)
|
|
7
|
+
* - execute returns ChunkEvaluation with results for toxic text
|
|
8
|
+
* - Returns ALLOW for benign text
|
|
9
|
+
* - Returns error for empty text
|
|
10
|
+
*/
|
|
11
|
+
|
|
12
|
+
import { describe, it, expect, vi } from 'vitest';
|
|
13
|
+
import { ClassifyContentTool } from '../src/tools/ClassifyContentTool';
|
|
14
|
+
import { ClassifierOrchestrator } from '../src/ClassifierOrchestrator';
|
|
15
|
+
import type { IContentClassifier } from '../src/IContentClassifier';
|
|
16
|
+
import type { ClassificationResult } from '@framers/agentos';
|
|
17
|
+
import { DEFAULT_THRESHOLDS } from '../src/types';
|
|
18
|
+
import { GuardrailAction } from '@framers/agentos';
|
|
19
|
+
import type { ToolExecutionContext } from '@framers/agentos';
|
|
20
|
+
|
|
21
|
+
// ---------------------------------------------------------------------------
|
|
22
|
+
// Mock helpers
|
|
23
|
+
// ---------------------------------------------------------------------------
|
|
24
|
+
|
|
25
|
+
/**
|
|
26
|
+
* Create a mock classifier returning a configurable result.
|
|
27
|
+
*/
|
|
28
|
+
function createMockClassifier(
|
|
29
|
+
id: string,
|
|
30
|
+
result: ClassificationResult,
|
|
31
|
+
): IContentClassifier {
|
|
32
|
+
return {
|
|
33
|
+
id,
|
|
34
|
+
displayName: `Mock ${id}`,
|
|
35
|
+
description: `Mock classifier: ${id}`,
|
|
36
|
+
modelId: `mock/${id}`,
|
|
37
|
+
isLoaded: true,
|
|
38
|
+
classify: vi.fn(async () => result),
|
|
39
|
+
dispose: vi.fn(async () => {}),
|
|
40
|
+
};
|
|
41
|
+
}
|
|
42
|
+
|
|
43
|
+
/** Benign result — low confidence (0.1), below every default threshold. */
const BENIGN: ClassificationResult = {
  bestClass: 'benign',
  confidence: 0.1,
  allScores: [{ classLabel: 'benign', score: 0.1 }],
};

/** Toxic result — confidence (0.95) above the default block threshold. */
const TOXIC: ClassificationResult = {
  bestClass: 'toxic',
  confidence: 0.95,
  allScores: [{ classLabel: 'toxic', score: 0.95 }],
};

/** Minimal execution context for tool invocation. */
// NOTE(review): `as any` bypasses the UserContext shape — assumes the tool
// only reads `userId` from userContext; confirm and replace with a typed stub.
const EXEC_CONTEXT: ToolExecutionContext = {
  gmiId: 'gmi-1',
  personaId: 'persona-1',
  userContext: { userId: 'user-1' } as any,
};
|
|
63
|
+
|
|
64
|
+
// ---------------------------------------------------------------------------
|
|
65
|
+
// Tests
|
|
66
|
+
// ---------------------------------------------------------------------------
|
|
67
|
+
|
|
68
|
+
describe('ClassifyContentTool', () => {
  // -----------------------------------------------------------------------
  // ITool metadata
  // -----------------------------------------------------------------------

  describe('ITool properties', () => {
    it('has correct id and name', () => {
      const orchestrator = new ClassifierOrchestrator([], DEFAULT_THRESHOLDS);
      const tool = new ClassifyContentTool(orchestrator);

      expect(tool.id).toBe('classify_content');
      expect(tool.name).toBe('classify_content');
    });

    it('has correct displayName and description', () => {
      const orchestrator = new ClassifierOrchestrator([], DEFAULT_THRESHOLDS);
      const tool = new ClassifyContentTool(orchestrator);

      expect(tool.displayName).toBe('Content Safety Classifier');
      // The description must mention every detection category so an LLM
      // choosing tools can discover the capability.
      expect(tool.description).toContain('toxicity');
      expect(tool.description).toContain('prompt injection');
      expect(tool.description).toContain('jailbreak');
    });

    it('has category=security and version=1.0.0', () => {
      const orchestrator = new ClassifierOrchestrator([], DEFAULT_THRESHOLDS);
      const tool = new ClassifyContentTool(orchestrator);

      expect(tool.category).toBe('security');
      expect(tool.version).toBe('1.0.0');
    });

    it('has hasSideEffects=false', () => {
      const orchestrator = new ClassifierOrchestrator([], DEFAULT_THRESHOLDS);
      const tool = new ClassifyContentTool(orchestrator);

      // Classification is read-only, so the tool must advertise no side effects.
      expect(tool.hasSideEffects).toBe(false);
    });
  });

  // -----------------------------------------------------------------------
  // inputSchema
  // -----------------------------------------------------------------------

  describe('inputSchema', () => {
    it('has text as a required property', () => {
      const orchestrator = new ClassifierOrchestrator([], DEFAULT_THRESHOLDS);
      const tool = new ClassifyContentTool(orchestrator);

      expect(tool.inputSchema.type).toBe('object');
      expect(tool.inputSchema.properties.text).toBeDefined();
      expect(tool.inputSchema.properties.text.type).toBe('string');
      expect(tool.inputSchema.required).toContain('text');
    });

    it('has classifiers as an optional array property', () => {
      const orchestrator = new ClassifierOrchestrator([], DEFAULT_THRESHOLDS);
      const tool = new ClassifyContentTool(orchestrator);

      const classifiersProp = tool.inputSchema.properties.classifiers;
      expect(classifiersProp).toBeDefined();
      expect(classifiersProp.type).toBe('array');
      expect(classifiersProp.items.type).toBe('string');

      // Should NOT be in the required list.
      expect(tool.inputSchema.required).not.toContain('classifiers');
    });
  });

  // -----------------------------------------------------------------------
  // execute
  // -----------------------------------------------------------------------

  describe('execute', () => {
    it('returns ChunkEvaluation with results for toxic text', async () => {
      const classifier = createMockClassifier('tox', TOXIC);
      const orchestrator = new ClassifierOrchestrator([classifier], DEFAULT_THRESHOLDS);
      const tool = new ClassifyContentTool(orchestrator);

      const result = await tool.execute({ text: 'you are terrible' }, EXEC_CONTEXT);

      expect(result.success).toBe(true);
      expect(result.output).toBeDefined();
      expect(result.output!.recommendedAction).toBe(GuardrailAction.BLOCK);
      expect(result.output!.results).toHaveLength(1);
      expect(result.output!.results[0].classifierId).toBe('tox');
      expect(result.output!.triggeredBy).toBe('tox');
    });

    it('returns ALLOW for benign text', async () => {
      const classifier = createMockClassifier('safe', BENIGN);
      const orchestrator = new ClassifierOrchestrator([classifier], DEFAULT_THRESHOLDS);
      const tool = new ClassifyContentTool(orchestrator);

      const result = await tool.execute({ text: 'hello world' }, EXEC_CONTEXT);

      expect(result.success).toBe(true);
      expect(result.output).toBeDefined();
      expect(result.output!.recommendedAction).toBe(GuardrailAction.ALLOW);
      expect(result.output!.triggeredBy).toBeNull();
    });

    it('returns error for empty text', async () => {
      const orchestrator = new ClassifierOrchestrator([], DEFAULT_THRESHOLDS);
      const tool = new ClassifyContentTool(orchestrator);

      const result = await tool.execute({ text: '' }, EXEC_CONTEXT);

      expect(result.success).toBe(false);
      expect(result.error).toContain('required');
    });

    it('returns error for whitespace-only text', async () => {
      const orchestrator = new ClassifierOrchestrator([], DEFAULT_THRESHOLDS);
      const tool = new ClassifyContentTool(orchestrator);

      const result = await tool.execute({ text: ' ' }, EXEC_CONTEXT);

      expect(result.success).toBe(false);
      expect(result.error).toContain('required');
    });

    it('handles orchestrator errors gracefully', async () => {
      // Create a classifier that always throws.
      const brokenClassifier: IContentClassifier = {
        id: 'broken',
        displayName: 'Broken',
        description: 'Always fails',
        modelId: 'broken',
        isLoaded: true,
        classify: async () => { throw new Error('model crash'); },
      };

      // Even though the classifier throws, the orchestrator catches it via
      // allSettled, so the tool should still succeed with ALLOW.
      const orchestrator = new ClassifierOrchestrator([brokenClassifier], DEFAULT_THRESHOLDS);
      vi.spyOn(console, 'warn').mockImplementation(() => {});

      const tool = new ClassifyContentTool(orchestrator);
      const result = await tool.execute({ text: 'test' }, EXEC_CONTEXT);

      expect(result.success).toBe(true);
      expect(result.output!.recommendedAction).toBe(GuardrailAction.ALLOW);

      vi.restoreAllMocks();
    });

    it('includes totalLatencyMs in output', async () => {
      const classifier = createMockClassifier('safe', BENIGN);
      const orchestrator = new ClassifierOrchestrator([classifier], DEFAULT_THRESHOLDS);
      const tool = new ClassifyContentTool(orchestrator);

      const result = await tool.execute({ text: 'test' }, EXEC_CONTEXT);

      expect(result.success).toBe(true);
      expect(result.output!.totalLatencyMs).toBeGreaterThanOrEqual(0);
    });
  });
});
|