@framers/agentos-ext-ml-classifiers 0.1.0 → 0.2.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +18 -0
- package/dist/MLClassifierGuardrail.d.ts +88 -117
- package/dist/MLClassifierGuardrail.d.ts.map +1 -1
- package/dist/MLClassifierGuardrail.js +255 -264
- package/dist/MLClassifierGuardrail.js.map +1 -1
- package/dist/classifiers/InjectionClassifier.d.ts +1 -1
- package/dist/classifiers/InjectionClassifier.d.ts.map +1 -1
- package/dist/classifiers/JailbreakClassifier.d.ts +1 -1
- package/dist/classifiers/JailbreakClassifier.d.ts.map +1 -1
- package/dist/classifiers/ToxicityClassifier.d.ts +1 -1
- package/dist/classifiers/ToxicityClassifier.d.ts.map +1 -1
- package/dist/classifiers/WorkerClassifierProxy.d.ts +1 -1
- package/dist/classifiers/WorkerClassifierProxy.d.ts.map +1 -1
- package/dist/index.d.ts +16 -90
- package/dist/index.d.ts.map +1 -1
- package/dist/index.js +33 -306
- package/dist/index.js.map +1 -1
- package/dist/keyword-classifier.d.ts +26 -0
- package/dist/keyword-classifier.d.ts.map +1 -0
- package/dist/keyword-classifier.js +113 -0
- package/dist/keyword-classifier.js.map +1 -0
- package/dist/llm-classifier.d.ts +27 -0
- package/dist/llm-classifier.d.ts.map +1 -0
- package/dist/llm-classifier.js +129 -0
- package/dist/llm-classifier.js.map +1 -0
- package/dist/tools/ClassifyContentTool.d.ts +53 -80
- package/dist/tools/ClassifyContentTool.d.ts.map +1 -1
- package/dist/tools/ClassifyContentTool.js +52 -103
- package/dist/tools/ClassifyContentTool.js.map +1 -1
- package/dist/types.d.ts +77 -277
- package/dist/types.d.ts.map +1 -1
- package/dist/types.js +9 -55
- package/dist/types.js.map +1 -1
- package/package.json +10 -16
- package/src/MLClassifierGuardrail.ts +279 -316
- package/src/index.ts +35 -339
- package/src/keyword-classifier.ts +130 -0
- package/src/llm-classifier.ts +163 -0
- package/src/tools/ClassifyContentTool.ts +75 -132
- package/src/types.ts +78 -325
- package/test/ClassifierOrchestrator.spec.ts +365 -0
- package/test/ClassifyContentTool.spec.ts +226 -0
- package/test/InjectionClassifier.spec.ts +263 -0
- package/test/JailbreakClassifier.spec.ts +295 -0
- package/test/MLClassifierGuardrail.spec.ts +486 -0
- package/test/SlidingWindowBuffer.spec.ts +391 -0
- package/test/ToxicityClassifier.spec.ts +268 -0
- package/test/WorkerClassifierProxy.spec.ts +303 -0
- package/test/index.spec.ts +431 -0
- package/tsconfig.json +20 -0
- package/vitest.config.ts +24 -0
package/src/index.ts
CHANGED
|
@@ -1,81 +1,24 @@
|
|
|
1
1
|
/**
|
|
2
|
-
* @
|
|
2
|
+
* @file index.ts
|
|
3
|
+
* @description Pack factory for the ML Classifiers extension pack.
|
|
3
4
|
*
|
|
4
|
-
* Exports
|
|
5
|
-
*
|
|
6
|
-
*
|
|
7
|
-
* manager.
|
|
5
|
+
* Exports a `createExtensionPack()` factory that assembles the ML classifier
|
|
6
|
+
* guardrail and the `classify_content` tool into a single {@link ExtensionPack}
|
|
7
|
+
* ready for registration with the AgentOS extension manager.
|
|
8
8
|
*
|
|
9
|
-
*
|
|
10
|
-
* the AgentOS manifest factory convention, delegating to
|
|
11
|
-
* `createMLClassifierPack()` with options extracted from the
|
|
12
|
-
* {@link ExtensionPackContext}.
|
|
13
|
-
*
|
|
14
|
-
* ### Default behaviour (zero-config)
|
|
15
|
-
* When called without arguments, all three built-in classifiers (toxicity,
|
|
16
|
-
* prompt-injection, jailbreak) are active using their default model IDs and
|
|
17
|
-
* the default threshold set:
|
|
18
|
-
* - block at 0.90 confidence
|
|
19
|
-
* - flag at 0.70 confidence
|
|
20
|
-
* - warn (sanitize) at 0.40 confidence
|
|
21
|
-
*
|
|
22
|
-
* ### Activation lifecycle
|
|
23
|
-
* Components are built eagerly at pack creation time for direct programmatic
|
|
24
|
-
* use. When the extension manager activates the pack, `onActivate` rebuilds
|
|
25
|
-
* all components with the manager's shared service registry so heavyweight
|
|
26
|
-
* resources (ONNX/WASM model pipelines) are shared across the agent.
|
|
27
|
-
*
|
|
28
|
-
* ### Disabling classifiers
|
|
29
|
-
* Individual classifiers can be disabled by omitting them from the
|
|
30
|
-
* `options.classifiers` array. An empty array or `undefined` activates all
|
|
31
|
-
* three built-in classifiers.
|
|
32
|
-
*
|
|
33
|
-
* @example
|
|
34
|
-
* ```typescript
|
|
35
|
-
* import { createMLClassifierPack } from './ml-classifiers';
|
|
36
|
-
*
|
|
37
|
-
* // All built-in classifiers at default thresholds:
|
|
38
|
-
* const pack = createMLClassifierPack();
|
|
39
|
-
*
|
|
40
|
-
* // Toxicity only with custom block threshold:
|
|
41
|
-
* const strictPack = createMLClassifierPack({
|
|
42
|
-
* classifiers: ['toxicity'],
|
|
43
|
-
* thresholds: { blockThreshold: 0.85 },
|
|
44
|
-
* streamingMode: true,
|
|
45
|
-
* guardrailScope: 'both',
|
|
46
|
-
* });
|
|
47
|
-
* ```
|
|
48
|
-
*
|
|
49
|
-
* @module agentos/extensions/packs/ml-classifiers
|
|
9
|
+
* @module ml-classifiers
|
|
50
10
|
*/
|
|
51
11
|
|
|
52
|
-
import type { ISharedServiceRegistry } from '@framers/agentos';
|
|
53
|
-
import { SharedServiceRegistry } from '@framers/agentos';
|
|
54
12
|
import type { ExtensionPack, ExtensionPackContext } from '@framers/agentos';
|
|
55
|
-
import type { ExtensionDescriptor, ExtensionLifecycleContext } from '@framers/agentos';
|
|
56
13
|
import { EXTENSION_KIND_GUARDRAIL, EXTENSION_KIND_TOOL } from '@framers/agentos';
|
|
57
|
-
import type {
|
|
58
|
-
import { DEFAULT_THRESHOLDS } from './types';
|
|
14
|
+
import type { MLClassifierOptions } from './types';
|
|
59
15
|
import { MLClassifierGuardrail } from './MLClassifierGuardrail';
|
|
60
|
-
import { ClassifierOrchestrator } from './ClassifierOrchestrator';
|
|
61
|
-
import { SlidingWindowBuffer } from './SlidingWindowBuffer';
|
|
62
16
|
import { ClassifyContentTool } from './tools/ClassifyContentTool';
|
|
63
|
-
import { ToxicityClassifier } from './classifiers/ToxicityClassifier';
|
|
64
|
-
import { InjectionClassifier } from './classifiers/InjectionClassifier';
|
|
65
|
-
import { JailbreakClassifier } from './classifiers/JailbreakClassifier';
|
|
66
|
-
import type { IContentClassifier } from './IContentClassifier';
|
|
67
17
|
|
|
68
18
|
// ---------------------------------------------------------------------------
|
|
69
|
-
// Re-exports
|
|
19
|
+
// Re-exports
|
|
70
20
|
// ---------------------------------------------------------------------------
|
|
71
21
|
|
|
72
|
-
/**
|
|
73
|
-
* Re-export all types from the ML classifier type definitions so consumers
|
|
74
|
-
* can import everything from a single entry point:
|
|
75
|
-
* ```ts
|
|
76
|
-
* import { createMLClassifierPack, DEFAULT_THRESHOLDS } from './ml-classifiers';
|
|
77
|
-
* ```
|
|
78
|
-
*/
|
|
79
22
|
export * from './types';
|
|
80
23
|
|
|
81
24
|
// ---------------------------------------------------------------------------
|
|
@@ -83,267 +26,34 @@ export * from './types';
|
|
|
83
26
|
// ---------------------------------------------------------------------------
|
|
84
27
|
|
|
85
28
|
/**
|
|
86
|
-
* Create an
|
|
87
|
-
*
|
|
88
|
-
* - The {@link ClassifyContentTool} `classify_content` tool (on-demand analysis).
|
|
89
|
-
*
|
|
90
|
-
* The built-in classifiers that are instantiated depend on `options.classifiers`:
|
|
91
|
-
* - `'toxicity'` → {@link ToxicityClassifier} (`unitary/toxic-bert`)
|
|
92
|
-
* - `'injection'` → {@link InjectionClassifier} (`protectai/deberta-v3-small-prompt-injection-v2`)
|
|
93
|
-
* - `'jailbreak'` → {@link JailbreakClassifier} (`meta-llama/PromptGuard-86M`)
|
|
94
|
-
*
|
|
95
|
-
* When `options.classifiers` is `undefined` or empty, **all three** are active.
|
|
96
|
-
*
|
|
97
|
-
* Additional classifiers supplied via `options.customClassifiers` are appended
|
|
98
|
-
* to the active list and run in parallel alongside the built-in ones.
|
|
29
|
+
* Create an ExtensionPack that bundles the ML classifier guardrail with
|
|
30
|
+
* the `classify_content` tool.
|
|
99
31
|
*
|
|
100
32
|
* @param options - Optional pack-level configuration. All properties have
|
|
101
|
-
* sensible defaults; see {@link
|
|
102
|
-
* @returns A fully-configured {@link ExtensionPack}
|
|
103
|
-
* descriptor and one tool descriptor.
|
|
33
|
+
* sensible defaults; see {@link MLClassifierOptions}.
|
|
34
|
+
* @returns A fully-configured {@link ExtensionPack}.
|
|
104
35
|
*/
|
|
105
|
-
export function
|
|
106
|
-
|
|
107
|
-
|
|
108
|
-
* safely use `opts.foo` without null-guarding the whole `options` reference.
|
|
109
|
-
*/
|
|
110
|
-
const opts: MLClassifierPackOptions = options ?? {};
|
|
111
|
-
|
|
112
|
-
// -------------------------------------------------------------------------
|
|
113
|
-
// Mutable state — upgraded by onActivate with the extension manager's
|
|
114
|
-
// shared service registry.
|
|
115
|
-
// -------------------------------------------------------------------------
|
|
116
|
-
|
|
117
|
-
const state = {
|
|
118
|
-
/**
|
|
119
|
-
* Service registry — starts as a standalone instance so the pack can be
|
|
120
|
-
* used directly (without activation) in unit tests and scripts.
|
|
121
|
-
* Replaced with the shared registry when `onActivate` is called by the
|
|
122
|
-
* extension manager.
|
|
123
|
-
*/
|
|
124
|
-
services: new SharedServiceRegistry() as ISharedServiceRegistry,
|
|
125
|
-
};
|
|
126
|
-
|
|
127
|
-
// -------------------------------------------------------------------------
|
|
128
|
-
// Component instances — rebuilt by buildComponents()
|
|
129
|
-
// -------------------------------------------------------------------------
|
|
130
|
-
|
|
131
|
-
/**
|
|
132
|
-
* The guardrail that evaluates user input and/or agent output streams
|
|
133
|
-
* against all active ML classifiers.
|
|
134
|
-
*/
|
|
135
|
-
let guardrail: MLClassifierGuardrail;
|
|
136
|
-
|
|
137
|
-
/**
|
|
138
|
-
* The on-demand classification tool exposed to agents and workflows.
|
|
139
|
-
*/
|
|
140
|
-
let tool: ClassifyContentTool;
|
|
141
|
-
|
|
142
|
-
/**
|
|
143
|
-
* The orchestrator that runs all active classifiers in parallel and folds
|
|
144
|
-
* their results into a single {@link ChunkEvaluation} via worst-wins
|
|
145
|
-
* aggregation.
|
|
146
|
-
*/
|
|
147
|
-
let orchestrator: ClassifierOrchestrator;
|
|
148
|
-
|
|
149
|
-
/**
|
|
150
|
-
* The sliding-window buffer used internally by the guardrail to evaluate
|
|
151
|
-
* streamed output tokens incrementally.
|
|
152
|
-
*/
|
|
153
|
-
let buffer: SlidingWindowBuffer;
|
|
154
|
-
|
|
155
|
-
// -------------------------------------------------------------------------
|
|
156
|
-
// buildComponents
|
|
157
|
-
// -------------------------------------------------------------------------
|
|
158
|
-
|
|
159
|
-
/**
|
|
160
|
-
* (Re)construct all pack components using the current `state.services`.
|
|
161
|
-
*
|
|
162
|
-
* Called once at pack creation for direct programmatic use, and again
|
|
163
|
-
* during `onActivate` to upgrade to the extension manager's shared
|
|
164
|
-
* service registry (so ONNX/WASM pipelines are shared across the agent).
|
|
165
|
-
*
|
|
166
|
-
* ### Classifier selection
|
|
167
|
-
* The active classifiers are determined by `opts.classifiers`:
|
|
168
|
-
* - `undefined` or empty → all three built-in classifiers are created.
|
|
169
|
-
* - Non-empty array → only the named classifiers are created.
|
|
170
|
-
*
|
|
171
|
-
* Any `opts.customClassifiers` are always appended to the list.
|
|
172
|
-
*/
|
|
173
|
-
function buildComponents(): void {
|
|
174
|
-
// ------------------------------------------------------------------
|
|
175
|
-
// 1. Determine which built-in classifiers to instantiate.
|
|
176
|
-
// ------------------------------------------------------------------
|
|
177
|
-
|
|
178
|
-
/**
|
|
179
|
-
* Determine whether a given built-in classifier name is enabled.
|
|
180
|
-
*
|
|
181
|
-
* When `opts.classifiers` is undefined or an empty array every built-in
|
|
182
|
-
* classifier is considered enabled (zero-config default).
|
|
183
|
-
*
|
|
184
|
-
* @param name - One of `'toxicity'`, `'injection'`, or `'jailbreak'`.
|
|
185
|
-
* @returns `true` when the classifier should be included.
|
|
186
|
-
*/
|
|
187
|
-
function isBuiltInEnabled(name: 'toxicity' | 'injection' | 'jailbreak'): boolean {
|
|
188
|
-
// No explicit list — enable all built-in classifiers.
|
|
189
|
-
if (!opts.classifiers || opts.classifiers.length === 0) {
|
|
190
|
-
return true;
|
|
191
|
-
}
|
|
192
|
-
return opts.classifiers.includes(name);
|
|
193
|
-
}
|
|
194
|
-
|
|
195
|
-
/** Array that will be populated with every active IContentClassifier. */
|
|
196
|
-
const activeClassifiers: IContentClassifier[] = [];
|
|
197
|
-
|
|
198
|
-
// Toxicity classifier — detects hateful, abusive, and toxic language.
|
|
199
|
-
if (isBuiltInEnabled('toxicity')) {
|
|
200
|
-
activeClassifiers.push(new ToxicityClassifier(state.services));
|
|
201
|
-
}
|
|
202
|
-
|
|
203
|
-
// Injection classifier — detects prompt-injection payloads.
|
|
204
|
-
if (isBuiltInEnabled('injection')) {
|
|
205
|
-
activeClassifiers.push(new InjectionClassifier(state.services));
|
|
206
|
-
}
|
|
207
|
-
|
|
208
|
-
// Jailbreak classifier — detects system-prompt override attempts.
|
|
209
|
-
if (isBuiltInEnabled('jailbreak')) {
|
|
210
|
-
activeClassifiers.push(new JailbreakClassifier(state.services));
|
|
211
|
-
}
|
|
212
|
-
|
|
213
|
-
// Append any caller-supplied custom classifiers.
|
|
214
|
-
if (opts.customClassifiers && opts.customClassifiers.length > 0) {
|
|
215
|
-
activeClassifiers.push(...opts.customClassifiers);
|
|
216
|
-
}
|
|
217
|
-
|
|
218
|
-
// ------------------------------------------------------------------
|
|
219
|
-
// 2. Resolve pack-level thresholds (merge caller overrides on top of
|
|
220
|
-
// the library defaults).
|
|
221
|
-
// ------------------------------------------------------------------
|
|
222
|
-
|
|
223
|
-
const thresholds = {
|
|
224
|
-
...DEFAULT_THRESHOLDS,
|
|
225
|
-
...opts.thresholds,
|
|
226
|
-
};
|
|
227
|
-
|
|
228
|
-
// ------------------------------------------------------------------
|
|
229
|
-
// 3. Build the orchestrator with the resolved classifier list and
|
|
230
|
-
// thresholds.
|
|
231
|
-
// ------------------------------------------------------------------
|
|
232
|
-
orchestrator = new ClassifierOrchestrator(activeClassifiers, thresholds);
|
|
233
|
-
|
|
234
|
-
// ------------------------------------------------------------------
|
|
235
|
-
// 4. Build the sliding-window buffer for streaming evaluation.
|
|
236
|
-
// ------------------------------------------------------------------
|
|
237
|
-
buffer = new SlidingWindowBuffer({
|
|
238
|
-
chunkSize: opts.chunkSize,
|
|
239
|
-
contextSize: opts.contextSize,
|
|
240
|
-
maxEvaluations: opts.maxEvaluations,
|
|
241
|
-
});
|
|
242
|
-
|
|
243
|
-
// ------------------------------------------------------------------
|
|
244
|
-
// 5. Build the guardrail, passing the shared registry and options.
|
|
245
|
-
// The guardrail creates its own orchestrator internally from the
|
|
246
|
-
// `classifiers` option — we pass the pre-built classifier instances
|
|
247
|
-
// via the third constructor argument.
|
|
248
|
-
// ------------------------------------------------------------------
|
|
249
|
-
guardrail = new MLClassifierGuardrail(state.services, opts, activeClassifiers);
|
|
250
|
-
|
|
251
|
-
// ------------------------------------------------------------------
|
|
252
|
-
// 6. Build the on-demand classification tool backed by the orchestrator.
|
|
253
|
-
// ------------------------------------------------------------------
|
|
254
|
-
tool = new ClassifyContentTool(orchestrator);
|
|
255
|
-
}
|
|
256
|
-
|
|
257
|
-
// Initial build — makes the pack usable immediately without activation.
|
|
258
|
-
buildComponents();
|
|
259
|
-
|
|
260
|
-
// -------------------------------------------------------------------------
|
|
261
|
-
// ExtensionPack shape
|
|
262
|
-
// -------------------------------------------------------------------------
|
|
36
|
+
export function createMLClassifierGuardrail(options?: MLClassifierOptions): ExtensionPack {
|
|
37
|
+
const guardrail = new MLClassifierGuardrail(options);
|
|
38
|
+
const tool = new ClassifyContentTool(guardrail);
|
|
263
39
|
|
|
264
40
|
return {
|
|
265
|
-
/** Canonical pack name used in manifests and logs. */
|
|
266
41
|
name: 'ml-classifiers',
|
|
267
|
-
|
|
268
|
-
/** Semantic version of this pack implementation. */
|
|
269
42
|
version: '1.0.0',
|
|
270
|
-
|
|
271
|
-
|
|
272
|
-
|
|
273
|
-
|
|
274
|
-
|
|
275
|
-
|
|
276
|
-
|
|
277
|
-
|
|
278
|
-
|
|
279
|
-
|
|
280
|
-
|
|
281
|
-
|
|
282
|
-
|
|
283
|
-
|
|
284
|
-
* (priority 10) so PII is stripped before ML classification.
|
|
285
|
-
*/
|
|
286
|
-
id: 'ml-classifier-guardrail',
|
|
287
|
-
kind: EXTENSION_KIND_GUARDRAIL,
|
|
288
|
-
priority: 5,
|
|
289
|
-
payload: guardrail,
|
|
290
|
-
},
|
|
291
|
-
{
|
|
292
|
-
/**
|
|
293
|
-
* On-demand classification tool descriptor.
|
|
294
|
-
*
|
|
295
|
-
* Priority 0 uses the default ordering — tools are typically
|
|
296
|
-
* ordered by name rather than priority.
|
|
297
|
-
*/
|
|
298
|
-
id: 'classify_content',
|
|
299
|
-
kind: EXTENSION_KIND_TOOL,
|
|
300
|
-
priority: 0,
|
|
301
|
-
payload: tool,
|
|
302
|
-
},
|
|
303
|
-
];
|
|
304
|
-
},
|
|
305
|
-
|
|
306
|
-
/**
|
|
307
|
-
* Lifecycle hook called by the extension manager when the pack is
|
|
308
|
-
* activated.
|
|
309
|
-
*
|
|
310
|
-
* Upgrades the internal service registry to the extension manager's
|
|
311
|
-
* shared instance (so ONNX/WASM model weights are shared across all
|
|
312
|
-
* extensions) then rebuilds all components to use the new registry.
|
|
313
|
-
*
|
|
314
|
-
* @param context - Activation context provided by the extension manager.
|
|
315
|
-
*/
|
|
316
|
-
onActivate: (context: ExtensionLifecycleContext): void => {
|
|
317
|
-
// Upgrade to the shared registry when the manager provides one.
|
|
318
|
-
if (context.services) {
|
|
319
|
-
state.services = context.services;
|
|
320
|
-
}
|
|
321
|
-
|
|
322
|
-
// Rebuild all components with the upgraded registry.
|
|
323
|
-
buildComponents();
|
|
324
|
-
},
|
|
325
|
-
|
|
326
|
-
/**
|
|
327
|
-
* Lifecycle hook called when the pack is deactivated or the agent shuts
|
|
328
|
-
* down.
|
|
329
|
-
*
|
|
330
|
-
* Disposes the classifier orchestrator (which releases ONNX/WASM
|
|
331
|
-
* resources for every registered classifier) and clears the sliding
|
|
332
|
-
* window buffer to release per-stream state.
|
|
333
|
-
*/
|
|
334
|
-
onDeactivate: async (): Promise<void> => {
|
|
335
|
-
// Dispose all classifiers managed by the orchestrator.
|
|
336
|
-
// orchestrator may be undefined if buildComponents() was never called
|
|
337
|
-
// successfully (defensive guard).
|
|
338
|
-
if (orchestrator) {
|
|
339
|
-
await orchestrator.dispose();
|
|
340
|
-
}
|
|
341
|
-
|
|
342
|
-
// Clear any in-progress stream buffers.
|
|
343
|
-
if (buffer) {
|
|
344
|
-
buffer.clear();
|
|
345
|
-
}
|
|
346
|
-
},
|
|
43
|
+
descriptors: [
|
|
44
|
+
{
|
|
45
|
+
id: 'ml-classifier-guardrail',
|
|
46
|
+
kind: EXTENSION_KIND_GUARDRAIL,
|
|
47
|
+
priority: 5,
|
|
48
|
+
payload: guardrail,
|
|
49
|
+
},
|
|
50
|
+
{
|
|
51
|
+
id: 'classify_content',
|
|
52
|
+
kind: EXTENSION_KIND_TOOL,
|
|
53
|
+
priority: 0,
|
|
54
|
+
payload: tool,
|
|
55
|
+
},
|
|
56
|
+
],
|
|
347
57
|
};
|
|
348
58
|
}
|
|
349
59
|
|
|
@@ -356,28 +66,14 @@ export function createMLClassifierPack(options?: MLClassifierPackOptions): Exten
|
|
|
356
66
|
*
|
|
357
67
|
* Conforms to the convention expected by the extension loader when resolving
|
|
358
68
|
* packs from manifests. Extracts `options` from the {@link ExtensionPackContext}
|
|
359
|
-
* and delegates to {@link
|
|
69
|
+
* and delegates to {@link createMLClassifierGuardrail}.
|
|
360
70
|
*
|
|
361
|
-
* @param context - Manifest context containing optional pack options
|
|
362
|
-
* resolver, and shared service registry.
|
|
71
|
+
* @param context - Manifest context containing optional pack options.
|
|
363
72
|
* @returns A fully-configured {@link ExtensionPack}.
|
|
364
|
-
*
|
|
365
|
-
* @example Manifest entry:
|
|
366
|
-
* ```json
|
|
367
|
-
* {
|
|
368
|
-
* "packs": [
|
|
369
|
-
* {
|
|
370
|
-
* "module": "./ml-classifiers",
|
|
371
|
-
* "options": {
|
|
372
|
-
* "classifiers": ["toxicity", "jailbreak"],
|
|
373
|
-
* "thresholds": { "blockThreshold": 0.95 },
|
|
374
|
-
* "streamingMode": true
|
|
375
|
-
* }
|
|
376
|
-
* }
|
|
377
|
-
* ]
|
|
378
|
-
* }
|
|
379
|
-
* ```
|
|
380
73
|
*/
|
|
381
74
|
export function createExtensionPack(context: ExtensionPackContext): ExtensionPack {
|
|
382
|
-
return
|
|
75
|
+
return createMLClassifierGuardrail(context.options as MLClassifierOptions);
|
|
383
76
|
}
|
|
77
|
+
|
|
78
|
+
/** @deprecated Use createMLClassifierGuardrail instead */
|
|
79
|
+
export const createMLClassifierPack = createMLClassifierGuardrail;
|
|
@@ -0,0 +1,130 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* @file keyword-classifier.ts
|
|
3
|
+
* @description Lightweight keyword and regex-based safety classifier used as the
|
|
4
|
+
* last-resort fallback when neither ONNX models nor an LLM invoker are available.
|
|
5
|
+
*
|
|
6
|
+
* Returns normalised confidence scores per category based on keyword density and
|
|
7
|
+
* pattern matches. This is intentionally conservative — it will produce false
|
|
8
|
+
* positives in edge cases, but ensures the guardrail is never completely blind.
|
|
9
|
+
*
|
|
10
|
+
* @module ml-classifiers/keyword-classifier
|
|
11
|
+
*/
|
|
12
|
+
|
|
13
|
+
import type { ClassifierCategory, CategoryScore } from './types';
|
|
14
|
+
import { ALL_CATEGORIES } from './types';
|
|
15
|
+
|
|
16
|
+
// ---------------------------------------------------------------------------
|
|
17
|
+
// Pattern dictionaries
|
|
18
|
+
// ---------------------------------------------------------------------------
|
|
19
|
+
|
|
20
|
+
/**
 * Toxic language patterns — profanity, insults, self-harm taunts, and slurs.
 *
 * Most entries are anchored with `\b` on both sides so they do not fire on
 * substrings of innocent words. The final two entries are intentionally left
 * open on the right-hand side: they are prefix matches, so inflected or
 * suffixed forms (e.g. plurals) still count as a hit.
 *
 * All groups are non-capturing because callers only ask whether a pattern
 * matches, never what it captured.
 */
const TOXIC_PATTERNS: RegExp[] = [
  // General profanity.
  /\b(?:fuck|shit|ass(?:hole)?|bitch|bastard|damn|crap)\b/i,
  // Self-harm taunts.
  /\b(?:kill\s+(?:yourself|urself|you)|kys)\b/i,
  // Insults.
  /\b(?:retard(?:ed)?|idiot|moron|stupid\s+(?:bitch|ass))\b/i,
  // Hostile phrases.
  /\b(?:hate\s+(?:you|u)|die\s+(?:in|alone))\b/i,
  // Explicit mention of slurs (prefix match: also hits "slurs").
  /\b(?:racial|ethnic)\s+slur/i,
  // Slur stems, including common digit substitutions (prefix match).
  /\b(?:n[i1]gg|f[a4]g(?:got)?|tr[a4]nn)/i,
];
|
|
34
|
+
|
|
35
|
+
/**
 * Prompt injection / jailbreak patterns — attempts to override system
 * instructions, extract system prompts, or bypass safety guardrails.
 *
 * Every pattern is case-insensitive. Two entries are deliberately tolerant of
 * common phrasing variants:
 * - "ignore … instructions" accepts an optional determiner ("the", "your")
 *   between "ignore [all]" and "previous/above/prior".
 * - "you are now …" also accepts the contraction "you're", matching the
 *   handling already used by the "pretend you're …" pattern below.
 */
const INJECTION_PATTERNS: RegExp[] = [
  /\bignore\s+(?:all\s+)?(?:(?:the|your)\s+)?(?:previous|above|prior)\s+instructions?\b/i,
  /\byou(?:'re|\s+are)\s+now\s+(?:DAN|evil|unrestricted|jailbroken)\b/i,
  /\bsystem\s*prompt\s*[:=]/i,
  /\bdo\s+anything\s+now\b/i,
  /\bdisregard\s+(?:your|all)\s+(?:rules|guidelines|instructions)\b/i,
  /\bpretend\s+(?:you(?:'re|\s+are)\s+)?(?:not\s+an?\s+AI|unrestricted|evil)\b/i,
  /\bact\s+as\s+(?:if|though)\s+(?:you\s+have\s+)?no\s+(?:restrictions|rules|limits)\b/i,
  /\boverride\s+(?:safety|content)\s+(?:filters?|policies|guidelines)\b/i,
  // Prefix matches: also hit "jailbreaking", "prompt injections", etc.
  /\bjailbreak/i,
  /\bprompt\s+(?:leak|injection|extract)/i,
];
|
|
51
|
+
|
|
52
|
+
/**
 * NSFW patterns — sexually explicit content markers.
 *
 * The media pattern accepts singular and plural nouns ("photo"/"photos",
 * "pic"/"pics"/"picture"/"pictures", …); the closing `\b` still prevents hits
 * on longer unrelated words such as "photobomb".
 */
const NSFW_PATTERNS: RegExp[] = [
  /\b(?:porn(?:ography)?|hentai|xxx|nsfw)\b/i,
  /\b(?:nude|naked|topless)\s+(?:photo|pic(?:ture)?|image|video)s?\b/i,
  /\bsexual(?:ly)?\s+explicit\b/i,
  // Prefix match: also hits "erotica", "masturbation", etc.
  /\b(?:erotic|orgasm|masturbat)/i,
  /\bsext(?:ing)?\b/i,
];
|
|
62
|
+
|
|
63
|
+
/**
 * Threat patterns — direct threats of violence, self-harm instructions,
 * or dangerous activity incitement.
 *
 * The "make … bomb/explosive/weapon" pattern accepts either indefinite
 * article ("a" or "an") or none, so "make an explosive" is detected as well
 * as "make a bomb". The optional leading groups ("I'll", "how to") do not
 * change whether a pattern matches.
 */
const THREAT_PATTERNS: RegExp[] = [
  /\b(?:i(?:'ll|\s+will)\s+)?kill\s+(?:you|him|her|them)\b/i,
  /\b(?:how\s+to\s+)?make\s+(?:an?)?\s*(?:bomb|explosive|weapon)\b/i,
  /\b(?:i(?:'ll|\s+will)\s+)?hurt\s+(?:you|myself|someone)\b/i,
  // Prefix match: also hits "methods", "instructions", "guides".
  /\bsuicid(?:e|al)\s+(?:method|instruction|guide|how)/i,
  /\b(?:swat(?:ting)?|dox(?:x?ing)?)\s+(?:someone|him|her|you)\b/i,
  /\bshoot\s+up\s+(?:a\s+)?(?:school|church|mosque|synagogue|building)\b/i,
];
|
|
75
|
+
|
|
76
|
+
/**
 * Lookup table from each classifier category to the regex list that scores
 * it, so every category can be processed by the same loop.
 */
const CATEGORY_PATTERNS: Record<ClassifierCategory, RegExp[]> = {
  toxic: TOXIC_PATTERNS,
  injection: INJECTION_PATTERNS,
  nsfw: NSFW_PATTERNS,
  threat: THREAT_PATTERNS,
};
|
|
85
|
+
|
|
86
|
+
// ---------------------------------------------------------------------------
|
|
87
|
+
// Public API
|
|
88
|
+
// ---------------------------------------------------------------------------
|
|
89
|
+
|
|
90
|
+
/**
 * Classify a text string using keyword and regex pattern matching.
 *
 * For each requested category, the number of *distinct* patterns that match
 * `text` is counted and converted to a confidence score:
 *
 * - no pattern matches → `0`
 * - `n` patterns match → `min(1.0, 0.4 + (n - 1) * 0.15)`
 *
 * i.e. a single match yields 0.4 and each additional distinct pattern adds
 * 0.15, capped at 1.0. Several hits of the same pattern count once.
 *
 * @param text - The text to classify.
 * @param categories - Which categories to evaluate. Defaults to
 *   `ALL_CATEGORIES`. The array is only read, never mutated.
 * @returns One score per requested category, in the same order as
 *   `categories`.
 */
export function classifyByKeywords(
  text: string,
  categories: readonly ClassifierCategory[] = ALL_CATEGORIES
): CategoryScore[] {
  return categories.map((cat): CategoryScore => {
    // Defensive fallback: a category without a pattern list (e.g. a value that
    // slipped past the type system) scores 0 instead of throwing.
    const patterns = CATEGORY_PATTERNS[cat] ?? [];

    // Count how many distinct patterns match.
    const matchCount = patterns.filter((re) => re.test(text)).length;

    // Scale: first match = 0.4, each additional += 0.15, capped at 1.0.
    const confidence = matchCount === 0 ? 0 : Math.min(1.0, 0.4 + (matchCount - 1) * 0.15);

    return { name: cat, confidence };
  });
}
|