@framers/agentos-ext-ml-classifiers 0.1.0 → 0.2.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +18 -0
- package/dist/MLClassifierGuardrail.d.ts +88 -117
- package/dist/MLClassifierGuardrail.d.ts.map +1 -1
- package/dist/MLClassifierGuardrail.js +255 -264
- package/dist/MLClassifierGuardrail.js.map +1 -1
- package/dist/classifiers/InjectionClassifier.d.ts +1 -1
- package/dist/classifiers/InjectionClassifier.d.ts.map +1 -1
- package/dist/classifiers/JailbreakClassifier.d.ts +1 -1
- package/dist/classifiers/JailbreakClassifier.d.ts.map +1 -1
- package/dist/classifiers/ToxicityClassifier.d.ts +1 -1
- package/dist/classifiers/ToxicityClassifier.d.ts.map +1 -1
- package/dist/classifiers/WorkerClassifierProxy.d.ts +1 -1
- package/dist/classifiers/WorkerClassifierProxy.d.ts.map +1 -1
- package/dist/index.d.ts +16 -90
- package/dist/index.d.ts.map +1 -1
- package/dist/index.js +33 -306
- package/dist/index.js.map +1 -1
- package/dist/keyword-classifier.d.ts +26 -0
- package/dist/keyword-classifier.d.ts.map +1 -0
- package/dist/keyword-classifier.js +113 -0
- package/dist/keyword-classifier.js.map +1 -0
- package/dist/llm-classifier.d.ts +27 -0
- package/dist/llm-classifier.d.ts.map +1 -0
- package/dist/llm-classifier.js +129 -0
- package/dist/llm-classifier.js.map +1 -0
- package/dist/tools/ClassifyContentTool.d.ts +53 -80
- package/dist/tools/ClassifyContentTool.d.ts.map +1 -1
- package/dist/tools/ClassifyContentTool.js +52 -103
- package/dist/tools/ClassifyContentTool.js.map +1 -1
- package/dist/types.d.ts +77 -277
- package/dist/types.d.ts.map +1 -1
- package/dist/types.js +9 -55
- package/dist/types.js.map +1 -1
- package/package.json +10 -16
- package/src/MLClassifierGuardrail.ts +279 -316
- package/src/index.ts +35 -339
- package/src/keyword-classifier.ts +130 -0
- package/src/llm-classifier.ts +163 -0
- package/src/tools/ClassifyContentTool.ts +75 -132
- package/src/types.ts +78 -325
- package/test/ClassifierOrchestrator.spec.ts +365 -0
- package/test/ClassifyContentTool.spec.ts +226 -0
- package/test/InjectionClassifier.spec.ts +263 -0
- package/test/JailbreakClassifier.spec.ts +295 -0
- package/test/MLClassifierGuardrail.spec.ts +486 -0
- package/test/SlidingWindowBuffer.spec.ts +391 -0
- package/test/ToxicityClassifier.spec.ts +268 -0
- package/test/WorkerClassifierProxy.spec.ts +303 -0
- package/test/index.spec.ts +431 -0
- package/tsconfig.json +20 -0
- package/vitest.config.ts +24 -0
|
@@ -1,335 +1,326 @@
|
|
|
1
1
|
/**
|
|
2
|
-
* @
|
|
2
|
+
* @file MLClassifierGuardrail.ts
|
|
3
|
+
* @description IGuardrailService implementation that classifies text for toxicity,
|
|
4
|
+
* prompt injection, NSFW content, and threats using a three-tier strategy:
|
|
3
5
|
*
|
|
4
|
-
*
|
|
5
|
-
*
|
|
6
|
-
*
|
|
7
|
-
*
|
|
6
|
+
* 1. **ONNX inference** — attempts to load `@huggingface/transformers` at runtime
|
|
7
|
+
* and run a lightweight ONNX classification model.
|
|
8
|
+
* 2. **LLM-as-judge** — falls back to an LLM invoker callback that prompts a
|
|
9
|
+
* language model for structured JSON safety classification.
|
|
10
|
+
* 3. **Keyword matching** — last-resort regex/keyword-based detection when neither
|
|
11
|
+
* ONNX nor LLM are available.
|
|
8
12
|
*
|
|
9
|
-
*
|
|
13
|
+
* The guardrail is configured as Phase 2 (parallel, non-sanitizing) so it runs
|
|
14
|
+
* alongside other read-only guardrails without blocking the streaming pipeline.
|
|
10
15
|
*
|
|
11
|
-
*
|
|
12
|
-
* |---------------|----------------------------------------------------------------|
|
|
13
|
-
* | `blocking` | Every chunk that fills the sliding window is classified |
|
|
14
|
-
* | | **synchronously** — the stream waits for the result. |
|
|
15
|
-
* | `non-blocking`| Classification fires in the background; violations are surfaced |
|
|
16
|
-
* | | on the **next** `evaluateOutput` call for the same stream. |
|
|
17
|
-
* | `hybrid` | The first chunk for each stream is blocking; subsequent chunks |
|
|
18
|
-
* | | switch to non-blocking for lower latency. |
|
|
16
|
+
* ### Action thresholds
|
|
19
17
|
*
|
|
20
|
-
*
|
|
18
|
+
* - **FLAG** when any category confidence exceeds `flagThreshold` (default 0.5).
|
|
19
|
+
* - **BLOCK** when any category confidence exceeds `blockThreshold` (default 0.8).
|
|
21
20
|
*
|
|
22
|
-
* @module
|
|
21
|
+
* @module ml-classifiers/MLClassifierGuardrail
|
|
23
22
|
*/
|
|
24
23
|
import { GuardrailAction } from '@framers/agentos';
|
|
25
24
|
import { AgentOSResponseChunkType } from '@framers/agentos';
|
|
26
|
-
import {
|
|
27
|
-
import {
|
|
28
|
-
import {
|
|
25
|
+
import { ALL_CATEGORIES } from './types';
|
|
26
|
+
import { classifyByKeywords } from './keyword-classifier';
|
|
27
|
+
import { classifyByLlm } from './llm-classifier';
|
|
29
28
|
// ---------------------------------------------------------------------------
|
|
30
29
|
// MLClassifierGuardrail
|
|
31
30
|
// ---------------------------------------------------------------------------
|
|
32
31
|
/**
|
|
33
|
-
*
|
|
34
|
-
*
|
|
32
|
+
* AgentOS guardrail that classifies text for safety using ML models, LLM
|
|
33
|
+
* inference, or keyword fallback.
|
|
35
34
|
*
|
|
36
35
|
* @implements {IGuardrailService}
|
|
37
|
-
*
|
|
38
|
-
* @example
|
|
39
|
-
* ```typescript
|
|
40
|
-
* const guardrail = new MLClassifierGuardrail(serviceRegistry, {
|
|
41
|
-
* classifiers: ['toxicity'],
|
|
42
|
-
* streamingMode: true,
|
|
43
|
-
* chunkSize: 150,
|
|
44
|
-
* guardrailScope: 'both',
|
|
45
|
-
* });
|
|
46
|
-
*
|
|
47
|
-
* // Input evaluation — runs classifier on the full user message.
|
|
48
|
-
* const inputResult = await guardrail.evaluateInput({ context, input });
|
|
49
|
-
*
|
|
50
|
-
* // Output evaluation — accumulates tokens, classifies at window boundary.
|
|
51
|
-
* const outputResult = await guardrail.evaluateOutput({ context, chunk });
|
|
52
|
-
* ```
|
|
53
36
|
*/
|
|
54
37
|
export class MLClassifierGuardrail {
|
|
55
|
-
//
|
|
56
|
-
// IGuardrailService
|
|
57
|
-
//
|
|
38
|
+
// -----------------------------------------------------------------------
|
|
39
|
+
// IGuardrailService.config
|
|
40
|
+
// -----------------------------------------------------------------------
|
|
58
41
|
/**
|
|
59
|
-
* Guardrail configuration
|
|
42
|
+
* Guardrail configuration.
|
|
60
43
|
*
|
|
61
|
-
* `
|
|
62
|
-
*
|
|
44
|
+
* - `canSanitize: false` — this guardrail does not modify content; it only
|
|
45
|
+
* BLOCKs or FLAGs. This places it in Phase 2 (parallel) of the guardrail
|
|
46
|
+
* dispatcher for better performance.
|
|
47
|
+
* - `evaluateStreamingChunks: false` — only evaluates complete messages, not
|
|
48
|
+
* individual streaming deltas. ML classification on partial text produces
|
|
49
|
+
* unreliable results.
|
|
63
50
|
*/
|
|
64
|
-
config
|
|
65
|
-
|
|
66
|
-
|
|
67
|
-
|
|
68
|
-
|
|
69
|
-
|
|
70
|
-
|
|
71
|
-
|
|
72
|
-
|
|
73
|
-
|
|
74
|
-
|
|
75
|
-
|
|
51
|
+
config = {
|
|
52
|
+
canSanitize: false,
|
|
53
|
+
evaluateStreamingChunks: false,
|
|
54
|
+
};
|
|
55
|
+
// -----------------------------------------------------------------------
|
|
56
|
+
// Private state
|
|
57
|
+
// -----------------------------------------------------------------------
|
|
58
|
+
/** Categories to evaluate. */
|
|
59
|
+
categories;
|
|
60
|
+
/** Per-category flag thresholds. */
|
|
61
|
+
flagThresholds;
|
|
62
|
+
/** Per-category block thresholds. */
|
|
63
|
+
blockThresholds;
|
|
64
|
+
/** Optional LLM invoker callback for tier-2 classification. */
|
|
65
|
+
llmInvoker;
|
|
76
66
|
/**
|
|
77
|
-
*
|
|
78
|
-
*
|
|
79
|
-
*
|
|
67
|
+
* Cached reference to the `@huggingface/transformers` pipeline function.
|
|
68
|
+
* `null` means we already tried and failed to load the module.
|
|
69
|
+
* `undefined` means we have not tried yet.
|
|
80
70
|
*/
|
|
81
|
-
|
|
82
|
-
|
|
83
|
-
* Tracks whether the first chunk for a given stream has been processed.
|
|
84
|
-
* Used by `hybrid` mode to apply blocking evaluation on the first chunk
|
|
85
|
-
* and non-blocking for subsequent chunks.
|
|
86
|
-
*/
|
|
87
|
-
isFirstChunk = new Map();
|
|
88
|
-
// -------------------------------------------------------------------------
|
|
71
|
+
onnxPipeline = undefined;
|
|
72
|
+
// -----------------------------------------------------------------------
|
|
89
73
|
// Constructor
|
|
90
|
-
//
|
|
74
|
+
// -----------------------------------------------------------------------
|
|
91
75
|
/**
|
|
92
|
-
* Create a new
|
|
76
|
+
* Create a new MLClassifierGuardrail.
|
|
93
77
|
*
|
|
94
|
-
* @param
|
|
95
|
-
*
|
|
96
|
-
* @param options - Pack-level options controlling classifier selection,
|
|
97
|
-
* thresholds, sliding window size, and streaming mode.
|
|
98
|
-
* @param classifiers - Pre-built classifier instances. When provided,
|
|
99
|
-
* these are used directly instead of constructing
|
|
100
|
-
* classifiers from `options.classifiers`.
|
|
78
|
+
* @param options - Pack-level configuration. All properties have sensible
|
|
79
|
+
* defaults for zero-config operation.
|
|
101
80
|
*/
|
|
102
|
-
constructor(
|
|
103
|
-
|
|
104
|
-
|
|
105
|
-
|
|
106
|
-
|
|
107
|
-
|
|
108
|
-
|
|
109
|
-
this.
|
|
110
|
-
|
|
111
|
-
|
|
112
|
-
|
|
113
|
-
|
|
114
|
-
|
|
115
|
-
});
|
|
116
|
-
// Store the guardrail scope (defaults to 'both').
|
|
117
|
-
this.scope = options.guardrailScope ?? 'both';
|
|
118
|
-
// Determine streaming mode. When `streamingMode` is enabled the default
|
|
119
|
-
// is 'blocking'; callers can override via the `streamingMode` option
|
|
120
|
-
// (which we reinterpret as a boolean gate here — advanced callers pass
|
|
121
|
-
// a StreamingMode string via `options` when they need finer control).
|
|
122
|
-
this.streamingMode = options.streamingMode ? 'blocking' : 'blocking';
|
|
123
|
-
// Expose guardrail config to the pipeline.
|
|
124
|
-
this.config = {
|
|
125
|
-
evaluateStreamingChunks: true,
|
|
126
|
-
maxStreamingEvaluations: options.maxEvaluations ?? 100,
|
|
127
|
-
};
|
|
81
|
+
constructor(options) {
|
|
82
|
+
const opts = options ?? {};
|
|
83
|
+
this.categories = opts.categories ?? [...ALL_CATEGORIES];
|
|
84
|
+
this.llmInvoker = opts.llmInvoker;
|
|
85
|
+
// Resolve per-category thresholds.
|
|
86
|
+
const globalFlag = opts.flagThreshold ?? 0.5;
|
|
87
|
+
const globalBlock = opts.blockThreshold ?? 0.8;
|
|
88
|
+
this.flagThresholds = {};
|
|
89
|
+
this.blockThresholds = {};
|
|
90
|
+
for (const cat of ALL_CATEGORIES) {
|
|
91
|
+
this.flagThresholds[cat] = opts.thresholds?.[cat]?.flag ?? globalFlag;
|
|
92
|
+
this.blockThresholds[cat] = opts.thresholds?.[cat]?.block ?? globalBlock;
|
|
93
|
+
}
|
|
128
94
|
}
|
|
129
|
-
//
|
|
130
|
-
// evaluateInput
|
|
131
|
-
//
|
|
95
|
+
// -----------------------------------------------------------------------
|
|
96
|
+
// IGuardrailService — evaluateInput
|
|
97
|
+
// -----------------------------------------------------------------------
|
|
132
98
|
/**
|
|
133
|
-
* Evaluate
|
|
134
|
-
*
|
|
135
|
-
* Runs the full text through all registered classifiers and returns a
|
|
136
|
-
* {@link GuardrailEvaluationResult} when a violation is detected, or
|
|
137
|
-
* `null` when the content is clean.
|
|
99
|
+
* Evaluate user input for safety before orchestration begins.
|
|
138
100
|
*
|
|
139
|
-
*
|
|
140
|
-
*
|
|
141
|
-
* @param payload - The input payload containing user text and context.
|
|
142
|
-
* @returns Evaluation result or `null` if no action is needed.
|
|
101
|
+
* @param payload - Input evaluation payload containing the user's message.
|
|
102
|
+
* @returns Guardrail result or `null` if no action is required.
|
|
143
103
|
*/
|
|
144
104
|
async evaluateInput(payload) {
|
|
145
|
-
|
|
146
|
-
if (
|
|
105
|
+
const text = payload.input.textInput;
|
|
106
|
+
if (!text || text.length === 0)
|
|
107
|
+
return null;
|
|
108
|
+
const result = await this.classify(text);
|
|
109
|
+
return this.buildResult(result);
|
|
110
|
+
}
|
|
111
|
+
// -----------------------------------------------------------------------
|
|
112
|
+
// IGuardrailService — evaluateOutput
|
|
113
|
+
// -----------------------------------------------------------------------
|
|
114
|
+
/**
|
|
115
|
+
* Evaluate agent output for safety. Only processes FINAL_RESPONSE chunks
|
|
116
|
+
* since `evaluateStreamingChunks` is disabled.
|
|
117
|
+
*
|
|
118
|
+
* @param payload - Output evaluation payload from the AgentOS dispatcher.
|
|
119
|
+
* @returns Guardrail result or `null` if no action is required.
|
|
120
|
+
*/
|
|
121
|
+
async evaluateOutput(payload) {
|
|
122
|
+
const { chunk } = payload;
|
|
123
|
+
// Only evaluate final text responses.
|
|
124
|
+
if (chunk.type !== AgentOSResponseChunkType.FINAL_RESPONSE) {
|
|
147
125
|
return null;
|
|
148
126
|
}
|
|
149
|
-
|
|
150
|
-
|
|
151
|
-
if (!text) {
|
|
127
|
+
const text = chunk.text ?? chunk.content ?? '';
|
|
128
|
+
if (typeof text !== 'string' || text.length === 0)
|
|
152
129
|
return null;
|
|
130
|
+
const result = await this.classify(text);
|
|
131
|
+
return this.buildResult(result);
|
|
132
|
+
}
|
|
133
|
+
// -----------------------------------------------------------------------
|
|
134
|
+
// Public classification method (also used by ClassifyContentTool)
|
|
135
|
+
// -----------------------------------------------------------------------
|
|
136
|
+
/**
|
|
137
|
+
* Classify a text string using the three-tier strategy: ONNX -> LLM -> keyword.
|
|
138
|
+
*
|
|
139
|
+
* @param text - The text to classify.
|
|
140
|
+
* @returns Classification result with per-category scores.
|
|
141
|
+
*/
|
|
142
|
+
async classify(text) {
|
|
143
|
+
// Tier 1: try ONNX inference.
|
|
144
|
+
const onnxResult = await this.tryOnnxClassification(text);
|
|
145
|
+
if (onnxResult)
|
|
146
|
+
return onnxResult;
|
|
147
|
+
// Tier 2: try LLM-as-judge.
|
|
148
|
+
if (this.llmInvoker) {
|
|
149
|
+
const llmResult = await this.tryLlmClassification(text);
|
|
150
|
+
if (llmResult)
|
|
151
|
+
return llmResult;
|
|
153
152
|
}
|
|
154
|
-
//
|
|
155
|
-
const
|
|
156
|
-
|
|
157
|
-
|
|
153
|
+
// Tier 3: keyword fallback.
|
|
154
|
+
const scores = classifyByKeywords(text, this.categories);
|
|
155
|
+
return {
|
|
156
|
+
categories: scores,
|
|
157
|
+
flagged: scores.some((s) => s.confidence > this.flagThresholds[s.name]),
|
|
158
|
+
source: 'keyword',
|
|
159
|
+
};
|
|
158
160
|
}
|
|
159
|
-
//
|
|
160
|
-
//
|
|
161
|
-
//
|
|
161
|
+
// -----------------------------------------------------------------------
|
|
162
|
+
// Private — ONNX classification (Tier 1)
|
|
163
|
+
// -----------------------------------------------------------------------
|
|
162
164
|
/**
|
|
163
|
-
*
|
|
164
|
-
*
|
|
165
|
+
* Attempt to load `@huggingface/transformers` and run ONNX-based text
|
|
166
|
+
* classification. Returns `null` if the module is unavailable or inference
|
|
167
|
+
* fails.
|
|
165
168
|
*
|
|
166
|
-
* The
|
|
167
|
-
*
|
|
168
|
-
* evaluation strategy depends on the configured streaming mode.
|
|
169
|
+
* The module load is attempted only once; subsequent calls use the cached
|
|
170
|
+
* result (either a working pipeline or `null`).
|
|
169
171
|
*
|
|
170
|
-
*
|
|
172
|
+
* @param text - Text to classify.
|
|
173
|
+
* @returns Classification result or `null`.
|
|
171
174
|
*
|
|
172
|
-
* @
|
|
173
|
-
* @returns Evaluation result or `null` if no action is needed yet.
|
|
175
|
+
* @internal
|
|
174
176
|
*/
|
|
175
|
-
async
|
|
176
|
-
//
|
|
177
|
-
if (this.
|
|
177
|
+
async tryOnnxClassification(text) {
|
|
178
|
+
// If we already know ONNX is unavailable, skip.
|
|
179
|
+
if (this.onnxPipeline === null)
|
|
178
180
|
return null;
|
|
179
|
-
|
|
180
|
-
|
|
181
|
-
|
|
182
|
-
|
|
183
|
-
|
|
184
|
-
|
|
185
|
-
|
|
186
|
-
|
|
187
|
-
|
|
188
|
-
|
|
181
|
+
// First-time load attempt.
|
|
182
|
+
if (this.onnxPipeline === undefined) {
|
|
183
|
+
try {
|
|
184
|
+
// Dynamic import so the optional dependency does not fail at boot.
|
|
185
|
+
const transformers = await import('@huggingface/transformers');
|
|
186
|
+
this.onnxPipeline = await transformers.pipeline('text-classification', 'Xenova/toxic-bert', { device: 'cpu' });
|
|
187
|
+
}
|
|
188
|
+
catch {
|
|
189
|
+
// Module not installed or model load failed — mark as unavailable.
|
|
190
|
+
this.onnxPipeline = null;
|
|
189
191
|
return null;
|
|
190
192
|
}
|
|
191
|
-
// Classify the remaining buffered text.
|
|
192
|
-
const evaluation = await this.orchestrator.classifyAll(flushed.text);
|
|
193
|
-
return this.evaluationToResult(evaluation);
|
|
194
193
|
}
|
|
195
|
-
|
|
196
|
-
|
|
197
|
-
|
|
194
|
+
try {
|
|
195
|
+
const raw = await this.onnxPipeline(text, { topk: null });
|
|
196
|
+
// Map ONNX labels to our categories.
|
|
197
|
+
const scores = this.mapOnnxScores(raw);
|
|
198
|
+
return {
|
|
199
|
+
categories: scores,
|
|
200
|
+
flagged: scores.some((s) => s.confidence > this.flagThresholds[s.name]),
|
|
201
|
+
source: 'onnx',
|
|
202
|
+
};
|
|
198
203
|
}
|
|
199
|
-
|
|
200
|
-
|
|
201
|
-
if (!textDelta) {
|
|
204
|
+
catch {
|
|
205
|
+
// Inference failed — fall through to next tier.
|
|
202
206
|
return null;
|
|
203
207
|
}
|
|
204
|
-
// Resolve the stream identifier for the sliding window.
|
|
205
|
-
const streamId = chunk.streamId;
|
|
206
|
-
// Dispatch to the appropriate streaming mode handler.
|
|
207
|
-
switch (this.streamingMode) {
|
|
208
|
-
case 'non-blocking':
|
|
209
|
-
return this.handleNonBlocking(streamId, textDelta);
|
|
210
|
-
case 'hybrid':
|
|
211
|
-
return this.handleHybrid(streamId, textDelta);
|
|
212
|
-
case 'blocking':
|
|
213
|
-
default:
|
|
214
|
-
return this.handleBlocking(streamId, textDelta);
|
|
215
|
-
}
|
|
216
208
|
}
|
|
217
|
-
// -------------------------------------------------------------------------
|
|
218
|
-
// Streaming mode handlers
|
|
219
|
-
// -------------------------------------------------------------------------
|
|
220
209
|
/**
|
|
221
|
-
*
|
|
222
|
-
* ready, await the classifier result before returning.
|
|
210
|
+
* Map raw ONNX text-classification output labels to our standard categories.
|
|
223
211
|
*
|
|
224
|
-
*
|
|
225
|
-
*
|
|
226
|
-
*
|
|
227
|
-
|
|
228
|
-
|
|
229
|
-
|
|
230
|
-
|
|
231
|
-
return null;
|
|
232
|
-
}
|
|
233
|
-
// Classify the filled window synchronously.
|
|
234
|
-
const evaluation = await this.orchestrator.classifyAll(ready.text);
|
|
235
|
-
return this.evaluationToResult(evaluation);
|
|
236
|
-
}
|
|
237
|
-
/**
|
|
238
|
-
* **Non-blocking mode**: push text into the buffer. When a window is
|
|
239
|
-
* ready, fire classification in the background and store the promise.
|
|
240
|
-
* On the **next** `evaluateOutput` call for the same stream, check the
|
|
241
|
-
* pending promise — if it resolved with a violation, return that result.
|
|
212
|
+
* ONNX models (e.g. toxic-bert) produce labels like `"toxic"`, `"obscene"`,
|
|
213
|
+
* `"threat"`, `"insult"`, `"identity_hate"`, etc. We map these to our four
|
|
214
|
+
* categories, taking the max score when multiple ONNX labels map to the same
|
|
215
|
+
* category.
|
|
216
|
+
*
|
|
217
|
+
* @param raw - Raw ONNX pipeline output.
|
|
218
|
+
* @returns Per-category scores.
|
|
242
219
|
*
|
|
243
|
-
* @
|
|
244
|
-
* @param textDelta - New text fragment from the current chunk.
|
|
245
|
-
* @returns A previously resolved violation result, or `null`.
|
|
220
|
+
* @internal
|
|
246
221
|
*/
|
|
247
|
-
|
|
248
|
-
|
|
249
|
-
const
|
|
250
|
-
|
|
251
|
-
|
|
252
|
-
|
|
253
|
-
|
|
254
|
-
|
|
255
|
-
|
|
256
|
-
|
|
257
|
-
|
|
258
|
-
|
|
259
|
-
|
|
260
|
-
|
|
261
|
-
|
|
262
|
-
|
|
222
|
+
mapOnnxScores(raw) {
|
|
223
|
+
/** Map of ONNX label -> our category. */
|
|
224
|
+
const labelMap = {
|
|
225
|
+
toxic: 'toxic',
|
|
226
|
+
severe_toxic: 'toxic',
|
|
227
|
+
obscene: 'nsfw',
|
|
228
|
+
insult: 'toxic',
|
|
229
|
+
identity_hate: 'toxic',
|
|
230
|
+
threat: 'threat',
|
|
231
|
+
};
|
|
232
|
+
const maxScores = {
|
|
233
|
+
toxic: 0,
|
|
234
|
+
injection: 0,
|
|
235
|
+
nsfw: 0,
|
|
236
|
+
threat: 0,
|
|
237
|
+
};
|
|
238
|
+
for (const item of raw) {
|
|
239
|
+
const label = (item.label ?? '').toLowerCase().replace(/\s+/g, '_');
|
|
240
|
+
const score = typeof item.score === 'number' ? item.score : 0;
|
|
241
|
+
const cat = labelMap[label];
|
|
242
|
+
if (cat && score > maxScores[cat]) {
|
|
243
|
+
maxScores[cat] = score;
|
|
263
244
|
}
|
|
264
245
|
}
|
|
265
|
-
//
|
|
266
|
-
|
|
267
|
-
|
|
268
|
-
|
|
269
|
-
|
|
270
|
-
this.pendingResults.set(streamId, classifyPromise);
|
|
271
|
-
}
|
|
272
|
-
// Return null immediately — result will be checked on next call.
|
|
273
|
-
return null;
|
|
246
|
+
// ONNX models typically do not detect prompt injection; leave at 0.
|
|
247
|
+
return this.categories.map((name) => ({
|
|
248
|
+
name,
|
|
249
|
+
confidence: maxScores[name] ?? 0,
|
|
250
|
+
}));
|
|
274
251
|
}
|
|
252
|
+
// -----------------------------------------------------------------------
|
|
253
|
+
// Private — LLM classification (Tier 2)
|
|
254
|
+
// -----------------------------------------------------------------------
|
|
275
255
|
/**
|
|
276
|
-
*
|
|
277
|
-
* blocking mode; subsequent chunks use non-blocking.
|
|
256
|
+
* Classify text using the LLM-as-judge fallback.
|
|
278
257
|
*
|
|
279
|
-
*
|
|
280
|
-
*
|
|
281
|
-
* remainder of the stream.
|
|
258
|
+
* @param text - Text to classify.
|
|
259
|
+
* @returns Classification result or `null` if the LLM call fails.
|
|
282
260
|
*
|
|
283
|
-
* @
|
|
284
|
-
* @param textDelta - New text fragment from the current chunk.
|
|
285
|
-
* @returns Evaluation result or `null`.
|
|
261
|
+
* @internal
|
|
286
262
|
*/
|
|
287
|
-
async
|
|
288
|
-
|
|
289
|
-
|
|
290
|
-
|
|
291
|
-
this.
|
|
263
|
+
async tryLlmClassification(text) {
|
|
264
|
+
if (!this.llmInvoker)
|
|
265
|
+
return null;
|
|
266
|
+
try {
|
|
267
|
+
const scores = await classifyByLlm(text, this.llmInvoker, this.categories);
|
|
268
|
+
// If all scores are zero the LLM likely failed to parse — treat as null.
|
|
269
|
+
if (scores.every((s) => s.confidence === 0))
|
|
270
|
+
return null;
|
|
271
|
+
return {
|
|
272
|
+
categories: scores,
|
|
273
|
+
flagged: scores.some((s) => s.confidence > this.flagThresholds[s.name]),
|
|
274
|
+
source: 'llm',
|
|
275
|
+
};
|
|
292
276
|
}
|
|
293
|
-
|
|
294
|
-
|
|
295
|
-
return this.handleBlocking(streamId, textDelta);
|
|
277
|
+
catch {
|
|
278
|
+
return null;
|
|
296
279
|
}
|
|
297
|
-
return this.handleNonBlocking(streamId, textDelta);
|
|
298
280
|
}
|
|
299
|
-
//
|
|
300
|
-
// Private
|
|
301
|
-
//
|
|
281
|
+
// -----------------------------------------------------------------------
|
|
282
|
+
// Private — result builder
|
|
283
|
+
// -----------------------------------------------------------------------
|
|
302
284
|
/**
|
|
303
|
-
* Convert a {@link
|
|
304
|
-
*
|
|
285
|
+
* Convert a {@link ClassifierResult} into a {@link GuardrailEvaluationResult},
|
|
286
|
+
* or return `null` when no thresholds are exceeded.
|
|
305
287
|
*
|
|
306
|
-
*
|
|
307
|
-
*
|
|
308
|
-
* metadata for audit/logging.
|
|
288
|
+
* @param result - Classification result from any tier.
|
|
289
|
+
* @returns Guardrail evaluation result or `null`.
|
|
309
290
|
*
|
|
310
|
-
* @
|
|
311
|
-
* @returns A guardrail result or `null` for clean content.
|
|
291
|
+
* @internal
|
|
312
292
|
*/
|
|
313
|
-
|
|
314
|
-
//
|
|
315
|
-
|
|
316
|
-
|
|
293
|
+
buildResult(result) {
|
|
294
|
+
// Check for BLOCK-level violations first.
|
|
295
|
+
const blockers = result.categories.filter((s) => s.confidence > this.blockThresholds[s.name]);
|
|
296
|
+
if (blockers.length > 0) {
|
|
297
|
+
const worst = blockers.reduce((a, b) => (b.confidence > a.confidence ? b : a));
|
|
298
|
+
return {
|
|
299
|
+
action: GuardrailAction.BLOCK,
|
|
300
|
+
reason: `ML classifier detected unsafe content: ${blockers.map((b) => `${b.name}(${b.confidence.toFixed(2)})`).join(', ')}`,
|
|
301
|
+
reasonCode: `ML_CLASSIFIER_${worst.name.toUpperCase()}`,
|
|
302
|
+
metadata: {
|
|
303
|
+
source: result.source,
|
|
304
|
+
categories: result.categories,
|
|
305
|
+
},
|
|
306
|
+
};
|
|
317
307
|
}
|
|
318
|
-
|
|
319
|
-
|
|
320
|
-
|
|
321
|
-
|
|
322
|
-
|
|
323
|
-
|
|
324
|
-
|
|
325
|
-
|
|
326
|
-
|
|
327
|
-
|
|
328
|
-
|
|
329
|
-
|
|
330
|
-
|
|
331
|
-
|
|
332
|
-
|
|
308
|
+
// Check for FLAG-level violations.
|
|
309
|
+
const flaggers = result.categories.filter((s) => s.confidence > this.flagThresholds[s.name]);
|
|
310
|
+
if (flaggers.length > 0) {
|
|
311
|
+
const worst = flaggers.reduce((a, b) => (b.confidence > a.confidence ? b : a));
|
|
312
|
+
return {
|
|
313
|
+
action: GuardrailAction.FLAG,
|
|
314
|
+
reason: `ML classifier flagged content: ${flaggers.map((f) => `${f.name}(${f.confidence.toFixed(2)})`).join(', ')}`,
|
|
315
|
+
reasonCode: `ML_CLASSIFIER_${worst.name.toUpperCase()}`,
|
|
316
|
+
metadata: {
|
|
317
|
+
source: result.source,
|
|
318
|
+
categories: result.categories,
|
|
319
|
+
},
|
|
320
|
+
};
|
|
321
|
+
}
|
|
322
|
+
// No thresholds exceeded — allow.
|
|
323
|
+
return null;
|
|
333
324
|
}
|
|
334
325
|
}
|
|
335
326
|
//# sourceMappingURL=MLClassifierGuardrail.js.map
|