@framers/agentos-ext-ml-classifiers 0.1.0 → 0.2.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +18 -0
- package/dist/MLClassifierGuardrail.d.ts +88 -117
- package/dist/MLClassifierGuardrail.d.ts.map +1 -1
- package/dist/MLClassifierGuardrail.js +255 -264
- package/dist/MLClassifierGuardrail.js.map +1 -1
- package/dist/classifiers/InjectionClassifier.d.ts +1 -1
- package/dist/classifiers/InjectionClassifier.d.ts.map +1 -1
- package/dist/classifiers/JailbreakClassifier.d.ts +1 -1
- package/dist/classifiers/JailbreakClassifier.d.ts.map +1 -1
- package/dist/classifiers/ToxicityClassifier.d.ts +1 -1
- package/dist/classifiers/ToxicityClassifier.d.ts.map +1 -1
- package/dist/classifiers/WorkerClassifierProxy.d.ts +1 -1
- package/dist/classifiers/WorkerClassifierProxy.d.ts.map +1 -1
- package/dist/index.d.ts +16 -90
- package/dist/index.d.ts.map +1 -1
- package/dist/index.js +33 -306
- package/dist/index.js.map +1 -1
- package/dist/keyword-classifier.d.ts +26 -0
- package/dist/keyword-classifier.d.ts.map +1 -0
- package/dist/keyword-classifier.js +113 -0
- package/dist/keyword-classifier.js.map +1 -0
- package/dist/llm-classifier.d.ts +27 -0
- package/dist/llm-classifier.d.ts.map +1 -0
- package/dist/llm-classifier.js +129 -0
- package/dist/llm-classifier.js.map +1 -0
- package/dist/tools/ClassifyContentTool.d.ts +53 -80
- package/dist/tools/ClassifyContentTool.d.ts.map +1 -1
- package/dist/tools/ClassifyContentTool.js +52 -103
- package/dist/tools/ClassifyContentTool.js.map +1 -1
- package/dist/types.d.ts +77 -277
- package/dist/types.d.ts.map +1 -1
- package/dist/types.js +9 -55
- package/dist/types.js.map +1 -1
- package/package.json +10 -16
- package/src/MLClassifierGuardrail.ts +279 -316
- package/src/index.ts +35 -339
- package/src/keyword-classifier.ts +130 -0
- package/src/llm-classifier.ts +163 -0
- package/src/tools/ClassifyContentTool.ts +75 -132
- package/src/types.ts +78 -325
- package/test/ClassifierOrchestrator.spec.ts +365 -0
- package/test/ClassifyContentTool.spec.ts +226 -0
- package/test/InjectionClassifier.spec.ts +263 -0
- package/test/JailbreakClassifier.spec.ts +295 -0
- package/test/MLClassifierGuardrail.spec.ts +486 -0
- package/test/SlidingWindowBuffer.spec.ts +391 -0
- package/test/ToxicityClassifier.spec.ts +268 -0
- package/test/WorkerClassifierProxy.spec.ts +303 -0
- package/test/index.spec.ts +431 -0
- package/tsconfig.json +20 -0
- package/vitest.config.ts +24 -0
package/src/types.ts
CHANGED
|
@@ -1,391 +1,144 @@
|
|
|
1
1
|
/**
|
|
2
|
-
* @
|
|
2
|
+
* @file types.ts
|
|
3
|
+
* @description Core type definitions for the ML Classifiers extension pack.
|
|
3
4
|
*
|
|
4
|
-
*
|
|
5
|
-
*
|
|
6
|
-
*
|
|
7
|
-
* (toxicity, prompt-injection, jailbreak) and emit structured results that
|
|
8
|
-
* feed into the AgentOS guardrail decision tree.
|
|
5
|
+
* Defines the shared interfaces used across the ML classification system:
|
|
6
|
+
* classifier categories, confidence results, option shapes, and the LLM
|
|
7
|
+
* invoker callback signature.
|
|
9
8
|
*
|
|
10
|
-
*
|
|
11
|
-
* ----------------
|
|
12
|
-
* ```
|
|
13
|
-
* IUtilityAI ──── ClassificationResult, ClassificationScore
|
|
14
|
-
* IGuardrailService ── GuardrailAction
|
|
15
|
-
* │
|
|
16
|
-
* ▼
|
|
17
|
-
* types.ts (this file)
|
|
18
|
-
* │
|
|
19
|
-
* ▼
|
|
20
|
-
* IContentClassifier.ts / SlidingWindowBuffer.ts / …
|
|
21
|
-
* ```
|
|
22
|
-
*
|
|
23
|
-
* @module agentos/extensions/packs/ml-classifiers/types
|
|
9
|
+
* @module ml-classifiers/types
|
|
24
10
|
*/
|
|
25
11
|
|
|
26
|
-
import type { ClassificationResult, ClassificationScore } from '@framers/agentos';
|
|
27
|
-
import type { GuardrailAction } from '@framers/agentos';
|
|
28
|
-
|
|
29
|
-
// Re-export types used by dependents so they can import from a single source.
|
|
30
|
-
export type { ClassificationResult, ClassificationScore };
|
|
31
|
-
|
|
32
12
|
// ---------------------------------------------------------------------------
|
|
33
|
-
//
|
|
13
|
+
// Category type
|
|
34
14
|
// ---------------------------------------------------------------------------
|
|
35
15
|
|
|
36
16
|
/**
|
|
37
|
-
*
|
|
38
|
-
* guardrail actions.
|
|
17
|
+
* Safety categories evaluated by the ML classifier.
|
|
39
18
|
*
|
|
40
|
-
*
|
|
41
|
-
*
|
|
42
|
-
*
|
|
43
|
-
*
|
|
44
|
-
* 4. otherwise → {@link GuardrailAction.ALLOW}
|
|
19
|
+
* - `'toxic'` — Hateful, abusive, or threatening language.
|
|
20
|
+
* - `'injection'` — Prompt injection or jailbreak attempts.
|
|
21
|
+
* - `'nsfw'` — Sexually explicit or adult content.
|
|
22
|
+
* - `'threat'` — Direct threats of violence or self-harm.
|
|
45
23
|
*/
|
|
46
|
-
export interface ClassifierThresholds {
|
|
47
|
-
/**
|
|
48
|
-
* Minimum score at which content is **blocked** (interaction terminated).
|
|
49
|
-
* Must be in the range [0, 1]. Typical default: `0.9`.
|
|
50
|
-
*/
|
|
51
|
-
blockThreshold: number;
|
|
52
|
-
|
|
53
|
-
/**
|
|
54
|
-
* Minimum score at which content is **flagged** for review while still
|
|
55
|
-
* being allowed through. Must be in the range [0, 1]. Typical default: `0.7`.
|
|
56
|
-
*/
|
|
57
|
-
flagThreshold: number;
|
|
58
|
-
|
|
59
|
-
/**
|
|
60
|
-
* Minimum score at which a **warn** action is taken (e.g. the chunk is
|
|
61
|
-
* sanitised or a warning is appended to the response). Must be in the range
|
|
62
|
-
* [0, 1]. Typical default: `0.4`.
|
|
63
|
-
*/
|
|
64
|
-
warnThreshold: number;
|
|
65
|
-
}
|
|
24
|
+
export type ClassifierCategory = 'toxic' | 'injection' | 'nsfw' | 'threat';
|
|
66
25
|
|
|
67
26
|
/**
|
|
68
|
-
*
|
|
69
|
-
*
|
|
70
|
-
* These values reflect a conservative-but-pragmatic policy:
|
|
71
|
-
* - block at 90 % confidence → very high bar, minimises false positives
|
|
72
|
-
* - flag at 70 % → surfaced for human review, not blocked
|
|
73
|
-
* - warn at 40 % → low-confidence signal, handled with a light touch
|
|
27
|
+
* All supported classifier categories as a constant array, used for
|
|
28
|
+
* iteration and default configuration.
|
|
74
29
|
*/
|
|
75
|
-
export const DEFAULT_THRESHOLDS = {
|
|
76
|
-
blockThreshold: 0.9,
|
|
77
|
-
flagThreshold: 0.7,
|
|
78
|
-
warnThreshold: 0.4,
|
|
79
|
-
} as const;
|
|
80
|
-
|
|
81
|
-
// ---------------------------------------------------------------------------
|
|
82
|
-
// Per-classifier configuration
|
|
83
|
-
// ---------------------------------------------------------------------------
|
|
84
|
-
|
|
85
|
-
/**
|
|
86
|
-
* Configuration for a single ML classifier pipeline.
|
|
87
|
-
*
|
|
88
|
-
* Allows individual classifiers to override the pack-level defaults for the
|
|
89
|
-
* model variant and decision thresholds, and to customise which guardrail
|
|
90
|
-
* action is taken for each classification label.
|
|
91
|
-
*/
|
|
92
|
-
export interface ClassifierConfig {
|
|
93
|
-
/**
|
|
94
|
-
* Hugging Face model identifier (e.g. `"Xenova/toxic-bert"`) or a local
|
|
95
|
-
* model path to load instead of the pack default.
|
|
96
|
-
* @optional Falls back to the pack-level `MLClassifierPackOptions.modelCacheDir` default.
|
|
97
|
-
*/
|
|
98
|
-
modelId?: string;
|
|
99
|
-
|
|
100
|
-
/**
|
|
101
|
-
* Per-classifier threshold overrides.
|
|
102
|
-
* @optional Falls back to {@link DEFAULT_THRESHOLDS}.
|
|
103
|
-
*/
|
|
104
|
-
thresholds?: Partial<ClassifierThresholds>;
|
|
105
|
-
|
|
106
|
-
/**
|
|
107
|
-
* Maps classification labels to the guardrail action that should be taken
|
|
108
|
-
* when that label is the winning class.
|
|
109
|
-
*
|
|
110
|
-
* @example
|
|
111
|
-
* ```typescript
|
|
112
|
-
* // Always block on TOXIC label regardless of threshold.
|
|
113
|
-
* labelActions: { TOXIC: GuardrailAction.BLOCK }
|
|
114
|
-
* ```
|
|
115
|
-
*/
|
|
116
|
-
labelActions?: Record<string, GuardrailAction>;
|
|
117
|
-
}
|
|
118
|
-
|
|
119
|
-
// ---------------------------------------------------------------------------
|
|
120
|
-
// Browser / web-worker options
|
|
121
|
-
// ---------------------------------------------------------------------------
|
|
122
|
-
|
|
123
|
-
/**
|
|
124
|
-
* Configuration for browser-side model execution.
|
|
125
|
-
*
|
|
126
|
-
* When the ML classifier pack is loaded in a browser context (e.g. a chat
|
|
127
|
-
* widget), models run inside a Web Worker to avoid blocking the main thread.
|
|
128
|
-
* This interface controls worker lifecycle and cache management.
|
|
129
|
-
*/
|
|
130
|
-
export interface BrowserConfig {
|
|
131
|
-
/**
|
|
132
|
-
* Run model inference in a Web Worker.
|
|
133
|
-
* @default true
|
|
134
|
-
*/
|
|
135
|
-
useWebWorker?: boolean;
|
|
136
|
-
|
|
137
|
-
/**
|
|
138
|
-
* Caching strategy for downloaded model weights.
|
|
139
|
-
* - `'memory'` — keep weights in memory only (lost on page unload)
|
|
140
|
-
* - `'indexeddb'` — persist weights to IndexedDB (survives reloads)
|
|
141
|
-
* - `'none'` — no caching; re-download on every page load
|
|
142
|
-
* @default 'indexeddb'
|
|
143
|
-
*/
|
|
144
|
-
cacheStrategy?: 'memory' | 'indexeddb' | 'none';
|
|
145
|
-
|
|
146
|
-
/**
|
|
147
|
-
* Maximum number of model shards to keep in the in-memory cache when
|
|
148
|
-
* `cacheStrategy === 'memory'`. Oldest entries are evicted LRU-style.
|
|
149
|
-
* @default 3
|
|
150
|
-
*/
|
|
151
|
-
maxCacheSize?: number;
|
|
152
|
-
|
|
153
|
-
/**
|
|
154
|
-
* Callback invoked with download progress as model weights are fetched.
|
|
155
|
-
* Useful for showing a progress bar in the UI.
|
|
156
|
-
*
|
|
157
|
-
* @param progress - Current progress state.
|
|
158
|
-
*/
|
|
159
|
-
onProgress?: (progress: ModelDownloadProgress) => void;
|
|
160
|
-
}
|
|
30
|
+
export const ALL_CATEGORIES: ClassifierCategory[] = ['toxic', 'injection', 'nsfw', 'threat'];
|
|
161
31
|
|
|
162
32
|
// ---------------------------------------------------------------------------
|
|
163
|
-
//
|
|
33
|
+
// Result interfaces
|
|
164
34
|
// ---------------------------------------------------------------------------
|
|
165
35
|
|
|
166
36
|
/**
|
|
167
|
-
*
|
|
37
|
+
* Confidence score for a single safety category.
|
|
168
38
|
*
|
|
169
|
-
*
|
|
170
|
-
*
|
|
171
|
-
* onProgress({ modelId: 'Xenova/toxic-bert', loaded: 50_000, total: 200_000, percent: 25 })
|
|
172
|
-
* ```
|
|
39
|
+
* Scores are normalised to the range `[0, 1]`, where `0` means "no signal"
|
|
40
|
+
* and `1` means "extremely confident match".
|
|
173
41
|
*/
|
|
174
|
-
export interface ModelDownloadProgress {
|
|
175
|
-
/**
|
|
176
|
-
|
|
177
|
-
|
|
178
|
-
/** Number of bytes downloaded so far. */
|
|
179
|
-
loaded: number;
|
|
42
|
+
export interface CategoryScore {
|
|
43
|
+
/** The safety category this score applies to. */
|
|
44
|
+
name: ClassifierCategory;
|
|
180
45
|
|
|
181
|
-
/**
|
|
182
|
-
|
|
183
|
-
|
|
184
|
-
/** Download progress as a percentage in the range [0, 100]. */
|
|
185
|
-
percent: number;
|
|
46
|
+
/** Normalised confidence score in the range [0, 1]. */
|
|
47
|
+
confidence: number;
|
|
186
48
|
}
|
|
187
49
|
|
|
188
|
-
// ---------------------------------------------------------------------------
|
|
189
|
-
// Pack-level options
|
|
190
|
-
// ---------------------------------------------------------------------------
|
|
191
|
-
|
|
192
50
|
/**
|
|
193
|
-
*
|
|
194
|
-
*
|
|
195
|
-
* Passed to `createMLClassifierPack()` (or the NestJS module factory) to
|
|
196
|
-
* control which classifiers are active, how models are loaded, and how the
|
|
197
|
-
* sliding-window streaming evaluation behaves.
|
|
51
|
+
* Complete result from a classification pass over a text input.
|
|
198
52
|
*
|
|
199
|
-
*
|
|
200
|
-
*
|
|
201
|
-
* const packOptions: MLClassifierPackOptions = {
|
|
202
|
-
* classifiers: ['toxicity', 'jailbreak'],
|
|
203
|
-
* quantized: true,
|
|
204
|
-
* runtime: 'node',
|
|
205
|
-
* thresholds: { blockThreshold: 0.95, flagThreshold: 0.75, warnThreshold: 0.5 },
|
|
206
|
-
* streamingMode: true,
|
|
207
|
-
* chunkSize: 150,
|
|
208
|
-
* contextSize: 50,
|
|
209
|
-
* };
|
|
210
|
-
* ```
|
|
53
|
+
* Includes per-category scores and an overall `flagged` boolean that is
|
|
54
|
+
* `true` when any category exceeds the configured flag threshold (default 0.5).
|
|
211
55
|
*/
|
|
212
|
-
export interface MLClassifierPackOptions {
|
|
56
|
+
export interface ClassifierResult {
|
|
213
57
|
/**
|
|
214
|
-
*
|
|
215
|
-
*
|
|
216
|
-
*
|
|
217
|
-
* @example `['toxicity', 'injection']`
|
|
218
|
-
*/
|
|
219
|
-
classifiers?: Array<'toxicity' | 'injection' | 'jailbreak'>;
|
|
220
|
-
|
|
221
|
-
/**
|
|
222
|
-
* Fully-qualified `IContentClassifier` instances to add alongside the
|
|
223
|
-
* built-in classifiers (e.g. domain-specific harm classifiers).
|
|
58
|
+
* Per-category confidence scores, one entry for each category that was
|
|
59
|
+
* evaluated.
|
|
224
60
|
*/
|
|
225
|
-
|
|
61
|
+
categories: CategoryScore[];
|
|
226
62
|
|
|
227
63
|
/**
|
|
228
|
-
*
|
|
229
|
-
*
|
|
64
|
+
* `true` when at least one category score exceeds the flag threshold.
|
|
65
|
+
* Convenience field — equivalent to `categories.some(c => c.confidence > flagThreshold)`.
|
|
230
66
|
*/
|
|
231
|
-
|
|
67
|
+
flagged: boolean;
|
|
232
68
|
|
|
233
69
|
/**
|
|
234
|
-
*
|
|
235
|
-
*
|
|
236
|
-
* accuracy cost.
|
|
237
|
-
* @default false
|
|
70
|
+
* Which classification backend produced this result.
|
|
71
|
+
* Useful for logging and debugging which tier was active.
|
|
238
72
|
*/
|
|
239
|
-
|
|
240
|
-
|
|
241
|
-
/**
|
|
242
|
-
* Execution runtime for model inference.
|
|
243
|
-
* - `'node'` — Runs via `@xenova/transformers` in the Node.js process.
|
|
244
|
-
* - `'browser'` — Runs via `@xenova/transformers` in a Web Worker.
|
|
245
|
-
* - `'wasm'` — Explicit WebAssembly fallback (Node.js or browser).
|
|
246
|
-
* @default 'node'
|
|
247
|
-
*/
|
|
248
|
-
runtime?: 'node' | 'browser' | 'wasm';
|
|
249
|
-
|
|
250
|
-
/**
|
|
251
|
-
* Browser-specific options. Only applicable when `runtime === 'browser'`.
|
|
252
|
-
*/
|
|
253
|
-
browser?: BrowserConfig;
|
|
254
|
-
|
|
255
|
-
/**
|
|
256
|
-
* Number of tokens per evaluation window when streaming mode is enabled.
|
|
257
|
-
* Smaller values detect issues earlier but increase evaluation frequency.
|
|
258
|
-
* @default 200
|
|
259
|
-
*/
|
|
260
|
-
chunkSize?: number;
|
|
261
|
-
|
|
262
|
-
/**
|
|
263
|
-
* Number of tokens from the previous chunk to carry forward as context into
|
|
264
|
-
* the next window, preventing boundary effects.
|
|
265
|
-
* @default 50
|
|
266
|
-
*/
|
|
267
|
-
contextSize?: number;
|
|
268
|
-
|
|
269
|
-
/**
|
|
270
|
-
* Maximum number of classifier evaluations per stream. The sliding window
|
|
271
|
-
* stops advancing after this many evaluations, allowing the stream to
|
|
272
|
-
* complete without further overhead.
|
|
273
|
-
* @default 100
|
|
274
|
-
*/
|
|
275
|
-
maxEvaluations?: number;
|
|
276
|
-
|
|
277
|
-
/**
|
|
278
|
-
* Enable sliding-window evaluation for streamed (token-by-token) output.
|
|
279
|
-
* When `false`, classifiers only run on the completed final response.
|
|
280
|
-
* @default false
|
|
281
|
-
*/
|
|
282
|
-
streamingMode?: boolean;
|
|
283
|
-
|
|
284
|
-
/**
|
|
285
|
-
* Pack-level threshold defaults applied to every classifier unless
|
|
286
|
-
* overridden by a per-classifier {@link ClassifierConfig}.
|
|
287
|
-
*/
|
|
288
|
-
thresholds?: Partial<ClassifierThresholds>;
|
|
289
|
-
|
|
290
|
-
/**
|
|
291
|
-
* Scope of guardrail enforcement.
|
|
292
|
-
* - `'input'` — Evaluate user messages before orchestration.
|
|
293
|
-
* - `'output'` — Evaluate agent responses before delivery.
|
|
294
|
-
* - `'both'` — Evaluate at both stages.
|
|
295
|
-
* @default 'both'
|
|
296
|
-
*/
|
|
297
|
-
guardrailScope?: 'input' | 'output' | 'both';
|
|
73
|
+
source: 'onnx' | 'llm' | 'keyword';
|
|
298
74
|
}
|
|
299
75
|
|
|
300
76
|
// ---------------------------------------------------------------------------
|
|
301
|
-
//
|
|
77
|
+
// LLM invoker callback
|
|
302
78
|
// ---------------------------------------------------------------------------
|
|
303
79
|
|
|
304
80
|
/**
|
|
305
|
-
*
|
|
306
|
-
*
|
|
81
|
+
* Callback signature for invoking an LLM to perform classification when
|
|
82
|
+
* ONNX models are unavailable.
|
|
307
83
|
*
|
|
308
|
-
*
|
|
309
|
-
*
|
|
310
|
-
*
|
|
84
|
+
* The callback receives a system prompt and a user message and returns
|
|
85
|
+
* the raw LLM text response. The caller is responsible for parsing the
|
|
86
|
+
* JSON output.
|
|
311
87
|
*
|
|
312
|
-
* @
|
|
313
|
-
*
|
|
314
|
-
*
|
|
315
|
-
* ```
|
|
88
|
+
* @param systemPrompt - Instruction prompt describing the classification task.
|
|
89
|
+
* @param userMessage - The text to classify.
|
|
90
|
+
* @returns The raw string response from the LLM.
|
|
316
91
|
*/
|
|
317
|
-
export const ML_CLASSIFIER_SERVICE_IDS = {
|
|
318
|
-
/** Classifier that detects toxic, hateful, or abusive language. */
|
|
319
|
-
TOXICITY_PIPELINE: 'agentos:ml-classifiers:toxicity-pipeline',
|
|
320
|
-
|
|
321
|
-
/** Classifier that detects prompt-injection attempts. */
|
|
322
|
-
INJECTION_PIPELINE: 'agentos:ml-classifiers:injection-pipeline',
|
|
323
|
-
|
|
324
|
-
/** Classifier that detects jailbreak / system-override attempts. */
|
|
325
|
-
JAILBREAK_PIPELINE: 'agentos:ml-classifiers:jailbreak-pipeline',
|
|
326
|
-
} as const;
|
|
327
|
-
|
|
328
|
-
/** Union type of all ML classifier service ID strings. */
|
|
329
|
-
export type MLClassifierServiceId =
|
|
330
|
-
(typeof ML_CLASSIFIER_SERVICE_IDS)[keyof typeof ML_CLASSIFIER_SERVICE_IDS];
|
|
92
|
+
export type LlmInvoker = (systemPrompt: string, userMessage: string) => Promise<string>;
|
|
331
93
|
|
|
332
94
|
// ---------------------------------------------------------------------------
|
|
333
|
-
//
|
|
95
|
+
// Pack options
|
|
334
96
|
// ---------------------------------------------------------------------------
|
|
335
97
|
|
|
336
98
|
/**
|
|
337
|
-
*
|
|
99
|
+
* Configuration options for the ML Classifiers extension pack.
|
|
338
100
|
*
|
|
339
|
-
*
|
|
340
|
-
*
|
|
341
|
-
* audit trails and SLO dashboards.
|
|
101
|
+
* All properties are optional. Sensible defaults allow zero-config operation
|
|
102
|
+
* using the keyword fallback classifier.
|
|
342
103
|
*/
|
|
343
|
-
export interface
|
|
344
|
-
/**
|
|
345
|
-
* The {@link IContentClassifier.id} of the classifier that produced this
|
|
346
|
-
* result (e.g. `ML_CLASSIFIER_SERVICE_IDS.TOXICITY_PIPELINE`).
|
|
347
|
-
*/
|
|
348
|
-
classifierId: string;
|
|
349
|
-
|
|
104
|
+
export interface MLClassifierOptions {
|
|
350
105
|
/**
|
|
351
|
-
*
|
|
352
|
-
*
|
|
106
|
+
* Which safety categories to evaluate.
|
|
107
|
+
* @default ALL_CATEGORIES
|
|
353
108
|
*/
|
|
354
|
-
|
|
355
|
-
}
|
|
109
|
+
categories?: ClassifierCategory[];
|
|
356
110
|
|
|
357
|
-
/**
|
|
358
|
-
* Aggregated evaluation outcome for a single sliding-window chunk.
|
|
359
|
-
*
|
|
360
|
-
* Produced by running all active classifiers against one text window and
|
|
361
|
-
* collating their results into a single action recommendation.
|
|
362
|
-
*
|
|
363
|
-
* The `recommendedAction` is the most restrictive action across all
|
|
364
|
-
* classifiers (BLOCK > FLAG > SANITIZE > ALLOW).
|
|
365
|
-
*/
|
|
366
|
-
export interface ChunkEvaluation {
|
|
367
111
|
/**
|
|
368
|
-
*
|
|
369
|
-
*
|
|
112
|
+
* Per-category confidence thresholds that override the global defaults.
|
|
113
|
+
*
|
|
114
|
+
* Keys are category names; values are threshold overrides with optional
|
|
115
|
+
* `flag` and `block` levels.
|
|
116
|
+
*
|
|
117
|
+
* @example `{ toxic: { flag: 0.4, block: 0.7 } }`
|
|
370
118
|
*/
|
|
371
|
-
|
|
119
|
+
thresholds?: Partial<Record<ClassifierCategory, { flag?: number; block?: number }>>;
|
|
372
120
|
|
|
373
121
|
/**
|
|
374
|
-
*
|
|
375
|
-
*
|
|
376
|
-
*
|
|
122
|
+
* Global flag threshold applied to all categories that do not have a
|
|
123
|
+
* per-category override.
|
|
124
|
+
* @default 0.5
|
|
377
125
|
*/
|
|
378
|
-
|
|
126
|
+
flagThreshold?: number;
|
|
379
127
|
|
|
380
128
|
/**
|
|
381
|
-
*
|
|
382
|
-
*
|
|
129
|
+
* Global block threshold applied to all categories that do not have a
|
|
130
|
+
* per-category override.
|
|
131
|
+
* @default 0.8
|
|
383
132
|
*/
|
|
384
|
-
|
|
133
|
+
blockThreshold?: number;
|
|
385
134
|
|
|
386
135
|
/**
|
|
387
|
-
*
|
|
388
|
-
*
|
|
136
|
+
* Optional LLM invoker callback. When provided and ONNX models are
|
|
137
|
+
* unavailable, the classifier will fall back to LLM-as-judge classification
|
|
138
|
+
* using this callback.
|
|
139
|
+
*
|
|
140
|
+
* When omitted AND ONNX models are unavailable, the classifier falls back
|
|
141
|
+
* to keyword-based detection.
|
|
389
142
|
*/
|
|
390
|
-
|
|
143
|
+
llmInvoker?: LlmInvoker;
|
|
391
144
|
}
|