@framers/agentos-ext-ml-classifiers 0.1.0 → 0.2.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (51) hide show
  1. package/CHANGELOG.md +18 -0
  2. package/dist/MLClassifierGuardrail.d.ts +88 -117
  3. package/dist/MLClassifierGuardrail.d.ts.map +1 -1
  4. package/dist/MLClassifierGuardrail.js +255 -264
  5. package/dist/MLClassifierGuardrail.js.map +1 -1
  6. package/dist/classifiers/InjectionClassifier.d.ts +1 -1
  7. package/dist/classifiers/InjectionClassifier.d.ts.map +1 -1
  8. package/dist/classifiers/JailbreakClassifier.d.ts +1 -1
  9. package/dist/classifiers/JailbreakClassifier.d.ts.map +1 -1
  10. package/dist/classifiers/ToxicityClassifier.d.ts +1 -1
  11. package/dist/classifiers/ToxicityClassifier.d.ts.map +1 -1
  12. package/dist/classifiers/WorkerClassifierProxy.d.ts +1 -1
  13. package/dist/classifiers/WorkerClassifierProxy.d.ts.map +1 -1
  14. package/dist/index.d.ts +16 -90
  15. package/dist/index.d.ts.map +1 -1
  16. package/dist/index.js +33 -306
  17. package/dist/index.js.map +1 -1
  18. package/dist/keyword-classifier.d.ts +26 -0
  19. package/dist/keyword-classifier.d.ts.map +1 -0
  20. package/dist/keyword-classifier.js +113 -0
  21. package/dist/keyword-classifier.js.map +1 -0
  22. package/dist/llm-classifier.d.ts +27 -0
  23. package/dist/llm-classifier.d.ts.map +1 -0
  24. package/dist/llm-classifier.js +129 -0
  25. package/dist/llm-classifier.js.map +1 -0
  26. package/dist/tools/ClassifyContentTool.d.ts +53 -80
  27. package/dist/tools/ClassifyContentTool.d.ts.map +1 -1
  28. package/dist/tools/ClassifyContentTool.js +52 -103
  29. package/dist/tools/ClassifyContentTool.js.map +1 -1
  30. package/dist/types.d.ts +77 -277
  31. package/dist/types.d.ts.map +1 -1
  32. package/dist/types.js +9 -55
  33. package/dist/types.js.map +1 -1
  34. package/package.json +10 -16
  35. package/src/MLClassifierGuardrail.ts +279 -316
  36. package/src/index.ts +35 -339
  37. package/src/keyword-classifier.ts +130 -0
  38. package/src/llm-classifier.ts +163 -0
  39. package/src/tools/ClassifyContentTool.ts +75 -132
  40. package/src/types.ts +78 -325
  41. package/test/ClassifierOrchestrator.spec.ts +365 -0
  42. package/test/ClassifyContentTool.spec.ts +226 -0
  43. package/test/InjectionClassifier.spec.ts +263 -0
  44. package/test/JailbreakClassifier.spec.ts +295 -0
  45. package/test/MLClassifierGuardrail.spec.ts +486 -0
  46. package/test/SlidingWindowBuffer.spec.ts +391 -0
  47. package/test/ToxicityClassifier.spec.ts +268 -0
  48. package/test/WorkerClassifierProxy.spec.ts +303 -0
  49. package/test/index.spec.ts +431 -0
  50. package/tsconfig.json +20 -0
  51. package/vitest.config.ts +24 -0
package/src/types.ts CHANGED
@@ -1,391 +1,144 @@
1
1
  /**
2
- * @fileoverview Core type definitions for the ML Classifier Guardrail Extension Pack.
2
+ * @file types.ts
3
+ * @description Core type definitions for the ML Classifiers extension pack.
3
4
  *
4
- * This file defines all configuration shapes, runtime result types, and
5
- * service-identifier constants used by the ML classifier pipeline. All
6
- * classifiers in this pack evaluate text content against learned models
7
- * (toxicity, prompt-injection, jailbreak) and emit structured results that
8
- * feed into the AgentOS guardrail decision tree.
5
+ * Defines the shared interfaces used across the ML classification system:
6
+ * classifier categories, confidence results, option shapes, and the LLM
7
+ * invoker callback signature.
9
8
  *
10
- * Import hierarchy
11
- * ----------------
12
- * ```
13
- * IUtilityAI ──── ClassificationResult, ClassificationScore
14
- * IGuardrailService ── GuardrailAction
15
- * │
16
- * ▼
17
- * types.ts (this file)
18
- * │
19
- * ▼
20
- * IContentClassifier.ts / SlidingWindowBuffer.ts / …
21
- * ```
22
- *
23
- * @module agentos/extensions/packs/ml-classifiers/types
9
+ * @module ml-classifiers/types
24
10
  */
25
11
 
26
- import type { ClassificationResult, ClassificationScore } from '@framers/agentos';
27
- import type { GuardrailAction } from '@framers/agentos';
28
-
29
- // Re-export types used by dependents so they can import from a single source.
30
- export type { ClassificationResult, ClassificationScore };
31
-
32
12
  // ---------------------------------------------------------------------------
33
- // Threshold configuration
13
+ // Category type
34
14
  // ---------------------------------------------------------------------------
35
15
 
36
16
  /**
37
- * Numeric thresholds that map raw classifier confidence scores (0–1) to
38
- * guardrail actions.
17
+ * Safety categories evaluated by the ML classifier.
39
18
  *
40
- * The thresholds are applied in descending priority:
41
- * 1. `score >= blockThreshold` {@link GuardrailAction.BLOCK}
42
- * 2. `score >= flagThreshold` → {@link GuardrailAction.FLAG}
43
- * 3. `score >= warnThreshold` → {@link GuardrailAction.SANITIZE}
44
- * 4. otherwise → {@link GuardrailAction.ALLOW}
19
+ * - `'toxic'` — Hateful, abusive, or threatening language.
20
+ * - `'injection'` — Prompt injection or jailbreak attempts.
21
+ * - `'nsfw'` — Sexually explicit or adult content.
22
+ * - `'threat'` — Direct threats of violence or self-harm.
45
23
  */
46
- export interface ClassifierThresholds {
47
- /**
48
- * Minimum score at which content is **blocked** (interaction terminated).
49
- * Must be in the range [0, 1]. Typical default: `0.9`.
50
- */
51
- blockThreshold: number;
52
-
53
- /**
54
- * Minimum score at which content is **flagged** for review while still
55
- * being allowed through. Must be in the range [0, 1]. Typical default: `0.7`.
56
- */
57
- flagThreshold: number;
58
-
59
- /**
60
- * Minimum score at which a **warn** action is taken (e.g. the chunk is
61
- * sanitised or a warning is appended to the response). Must be in the range
62
- * [0, 1]. Typical default: `0.4`.
63
- */
64
- warnThreshold: number;
65
- }
24
+ export type ClassifierCategory = 'toxic' | 'injection' | 'nsfw' | 'threat';
66
25
 
67
26
  /**
68
- * Sensible defaults for {@link ClassifierThresholds}.
69
- *
70
- * These values reflect a conservative-but-pragmatic policy:
71
- * - block at 90 % confidence → very high bar, minimises false positives
72
- * - flag at 70 % → surfaced for human review, not blocked
73
- * - warn at 40 % → low-confidence signal, handled with a light touch
27
+ * All supported classifier categories as a constant array, used for
28
+ * iteration and default configuration.
74
29
  */
75
- export const DEFAULT_THRESHOLDS: ClassifierThresholds = {
76
- blockThreshold: 0.9,
77
- flagThreshold: 0.7,
78
- warnThreshold: 0.4,
79
- } as const;
80
-
81
- // ---------------------------------------------------------------------------
82
- // Per-classifier configuration
83
- // ---------------------------------------------------------------------------
84
-
85
- /**
86
- * Configuration for a single ML classifier pipeline.
87
- *
88
- * Allows individual classifiers to override the pack-level defaults for the
89
- * model variant and decision thresholds, and to customise which guardrail
90
- * action is taken for each classification label.
91
- */
92
- export interface ClassifierConfig {
93
- /**
94
- * Hugging Face model identifier (e.g. `"Xenova/toxic-bert"`) or a local
95
- * model path to load instead of the pack default.
96
- * @optional Falls back to the pack-level `MLClassifierPackOptions.modelCacheDir` default.
97
- */
98
- modelId?: string;
99
-
100
- /**
101
- * Per-classifier threshold overrides.
102
- * @optional Falls back to {@link DEFAULT_THRESHOLDS}.
103
- */
104
- thresholds?: Partial<ClassifierThresholds>;
105
-
106
- /**
107
- * Maps classification labels to the guardrail action that should be taken
108
- * when that label is the winning class.
109
- *
110
- * @example
111
- * ```typescript
112
- * // Always block on TOXIC label regardless of threshold.
113
- * labelActions: { TOXIC: GuardrailAction.BLOCK }
114
- * ```
115
- */
116
- labelActions?: Record<string, GuardrailAction>;
117
- }
118
-
119
- // ---------------------------------------------------------------------------
120
- // Browser / web-worker options
121
- // ---------------------------------------------------------------------------
122
-
123
- /**
124
- * Configuration for browser-side model execution.
125
- *
126
- * When the ML classifier pack is loaded in a browser context (e.g. a chat
127
- * widget), models run inside a Web Worker to avoid blocking the main thread.
128
- * This interface controls worker lifecycle and cache management.
129
- */
130
- export interface BrowserConfig {
131
- /**
132
- * Run model inference in a Web Worker.
133
- * @default true
134
- */
135
- useWebWorker?: boolean;
136
-
137
- /**
138
- * Caching strategy for downloaded model weights.
139
- * - `'memory'` — keep weights in memory only (lost on page unload)
140
- * - `'indexeddb'` — persist weights to IndexedDB (survives reloads)
141
- * - `'none'` — no caching; re-download on every page load
142
- * @default 'indexeddb'
143
- */
144
- cacheStrategy?: 'memory' | 'indexeddb' | 'none';
145
-
146
- /**
147
- * Maximum number of model shards to keep in the in-memory cache when
148
- * `cacheStrategy === 'memory'`. Oldest entries are evicted LRU-style.
149
- * @default 3
150
- */
151
- maxCacheSize?: number;
152
-
153
- /**
154
- * Callback invoked with download progress as model weights are fetched.
155
- * Useful for showing a progress bar in the UI.
156
- *
157
- * @param progress - Current progress state.
158
- */
159
- onProgress?: (progress: ModelDownloadProgress) => void;
160
- }
30
+ export const ALL_CATEGORIES: ClassifierCategory[] = ['toxic', 'injection', 'nsfw', 'threat'];
161
31
 
162
32
  // ---------------------------------------------------------------------------
163
- // Model download progress
33
+ // Result interfaces
164
34
  // ---------------------------------------------------------------------------
165
35
 
166
36
  /**
167
- * Progress report emitted during model weight downloads.
37
+ * Confidence score for a single safety category.
168
38
  *
169
- * @example
170
- * ```typescript
171
- * onProgress({ modelId: 'Xenova/toxic-bert', loaded: 50_000, total: 200_000, percent: 25 })
172
- * ```
39
+ * Scores are normalised to the range `[0, 1]`, where `0` means "no signal"
40
+ * and `1` means "extremely confident match".
173
41
  */
174
- export interface ModelDownloadProgress {
175
- /** Identifier of the model being downloaded (Hugging Face ID or path). */
176
- modelId: string;
177
-
178
- /** Number of bytes downloaded so far. */
179
- loaded: number;
42
+ export interface CategoryScore {
43
+ /** The safety category this score applies to. */
44
+ name: ClassifierCategory;
180
45
 
181
- /** Total number of bytes to download (`0` if unknown). */
182
- total: number;
183
-
184
- /** Download progress as a percentage in the range [0, 100]. */
185
- percent: number;
46
+ /** Normalised confidence score in the range [0, 1]. */
47
+ confidence: number;
186
48
  }
187
49
 
188
- // ---------------------------------------------------------------------------
189
- // Pack-level options
190
- // ---------------------------------------------------------------------------
191
-
192
50
  /**
193
- * Top-level configuration for the ML Classifier Extension Pack.
194
- *
195
- * Passed to `createMLClassifierPack()` (or the NestJS module factory) to
196
- * control which classifiers are active, how models are loaded, and how the
197
- * sliding-window streaming evaluation behaves.
51
+ * Complete result from a classification pass over a text input.
198
52
  *
199
- * @example
200
- * ```typescript
201
- * const packOptions: MLClassifierPackOptions = {
202
- * classifiers: ['toxicity', 'jailbreak'],
203
- * quantized: true,
204
- * runtime: 'node',
205
- * thresholds: { blockThreshold: 0.95, flagThreshold: 0.75, warnThreshold: 0.5 },
206
- * streamingMode: true,
207
- * chunkSize: 150,
208
- * contextSize: 50,
209
- * };
210
- * ```
53
+ * Includes per-category scores and an overall `flagged` boolean that is
54
+ * `true` when any category exceeds the configured flag threshold (default 0.5).
211
55
  */
212
- export interface MLClassifierPackOptions {
56
+ export interface ClassifierResult {
213
57
  /**
214
- * Subset of built-in classifiers to activate.
215
- * Omit or pass an empty array to activate all built-in classifiers.
216
- *
217
- * @example `['toxicity', 'injection']`
218
- */
219
- classifiers?: Array<'toxicity' | 'injection' | 'jailbreak'>;
220
-
221
- /**
222
- * Fully-qualified `IContentClassifier` instances to add alongside the
223
- * built-in classifiers (e.g. domain-specific harm classifiers).
58
+ * Per-category confidence scores, one entry for each category that was
59
+ * evaluated.
224
60
  */
225
- customClassifiers?: import('./IContentClassifier').IContentClassifier[];
61
+ categories: CategoryScore[];
226
62
 
227
63
  /**
228
- * Local filesystem path where downloaded model weights are cached.
229
- * Defaults to `~/.cache/agentos/ml-classifiers`.
64
+ * `true` when at least one category score exceeds the flag threshold.
65
+ * Convenience field — equivalent to `categories.some(c => c.confidence > flagThreshold)`.
230
66
  */
231
- modelCacheDir?: string;
67
+ flagged: boolean;
232
68
 
233
69
  /**
234
- * Use 8-bit quantised model variants when available.
235
- * Reduces VRAM/RAM footprint and increases inference speed at a small
236
- * accuracy cost.
237
- * @default false
70
+ * Which classification backend produced this result.
71
+ * Useful for logging and debugging which tier was active.
238
72
  */
239
- quantized?: boolean;
240
-
241
- /**
242
- * Execution runtime for model inference.
243
- * - `'node'` — Runs via `@xenova/transformers` in the Node.js process.
244
- * - `'browser'` — Runs via `@xenova/transformers` in a Web Worker.
245
- * - `'wasm'` — Explicit WebAssembly fallback (Node.js or browser).
246
- * @default 'node'
247
- */
248
- runtime?: 'node' | 'browser' | 'wasm';
249
-
250
- /**
251
- * Browser-specific options. Only applicable when `runtime === 'browser'`.
252
- */
253
- browser?: BrowserConfig;
254
-
255
- /**
256
- * Number of tokens per evaluation window when streaming mode is enabled.
257
- * Smaller values detect issues earlier but increase evaluation frequency.
258
- * @default 200
259
- */
260
- chunkSize?: number;
261
-
262
- /**
263
- * Number of tokens from the previous chunk to carry forward as context into
264
- * the next window, preventing boundary effects.
265
- * @default 50
266
- */
267
- contextSize?: number;
268
-
269
- /**
270
- * Maximum number of classifier evaluations per stream. The sliding window
271
- * stops advancing after this many evaluations, allowing the stream to
272
- * complete without further overhead.
273
- * @default 100
274
- */
275
- maxEvaluations?: number;
276
-
277
- /**
278
- * Enable sliding-window evaluation for streamed (token-by-token) output.
279
- * When `false`, classifiers only run on the completed final response.
280
- * @default false
281
- */
282
- streamingMode?: boolean;
283
-
284
- /**
285
- * Pack-level threshold defaults applied to every classifier unless
286
- * overridden by a per-classifier {@link ClassifierConfig}.
287
- */
288
- thresholds?: Partial<ClassifierThresholds>;
289
-
290
- /**
291
- * Scope of guardrail enforcement.
292
- * - `'input'` — Evaluate user messages before orchestration.
293
- * - `'output'` — Evaluate agent responses before delivery.
294
- * - `'both'` — Evaluate at both stages.
295
- * @default 'both'
296
- */
297
- guardrailScope?: 'input' | 'output' | 'both';
73
+ source: 'onnx' | 'llm' | 'keyword';
298
74
  }
299
75
 
300
76
  // ---------------------------------------------------------------------------
301
- // Service identifiers
77
+ // LLM invoker callback
302
78
  // ---------------------------------------------------------------------------
303
79
 
304
80
  /**
305
- * Well-known service identifier strings for the three built-in ML classifier
306
- * pipelines.
81
+ * Callback signature for invoking an LLM to perform classification when
82
+ * ONNX models are unavailable.
307
83
  *
308
- * These IDs follow the `agentos:<domain>:<name>` naming convention used
309
- * throughout the AgentOS extension ecosystem. Use them to retrieve specific
310
- * classifier services from the shared service registry.
84
+ * The callback receives a system prompt and a user message and returns
85
+ * the raw LLM text response. The caller is responsible for parsing the
86
+ * JSON output.
311
87
  *
312
- * @example
313
- * ```typescript
314
- * const toxicity = serviceRegistry.get(ML_CLASSIFIER_SERVICE_IDS.TOXICITY_PIPELINE);
315
- * ```
88
+ * @param systemPrompt - Instruction prompt describing the classification task.
89
+ * @param userMessage - The text to classify.
90
+ * @returns The raw string response from the LLM.
316
91
  */
317
- export const ML_CLASSIFIER_SERVICE_IDS = {
318
- /** Classifier that detects toxic, hateful, or abusive language. */
319
- TOXICITY_PIPELINE: 'agentos:ml-classifiers:toxicity-pipeline',
320
-
321
- /** Classifier that detects prompt-injection attempts. */
322
- INJECTION_PIPELINE: 'agentos:ml-classifiers:injection-pipeline',
323
-
324
- /** Classifier that detects jailbreak / system-override attempts. */
325
- JAILBREAK_PIPELINE: 'agentos:ml-classifiers:jailbreak-pipeline',
326
- } as const;
327
-
328
- /** Union type of all ML classifier service ID strings. */
329
- export type MLClassifierServiceId =
330
- (typeof ML_CLASSIFIER_SERVICE_IDS)[keyof typeof ML_CLASSIFIER_SERVICE_IDS];
92
+ export type LlmInvoker = (systemPrompt: string, userMessage: string) => Promise<string>;
331
93
 
332
94
  // ---------------------------------------------------------------------------
333
- // Annotated & aggregated result types
95
+ // Pack options
334
96
  // ---------------------------------------------------------------------------
335
97
 
336
98
  /**
337
- * A {@link ClassificationResult} augmented with provenance metadata.
99
+ * Configuration options for the ML Classifiers extension pack.
338
100
  *
339
- * Produced when a classifier evaluates a chunk of text. Carries the
340
- * classifier's identity and the wall-clock latency so callers can build
341
- * audit trails and SLO dashboards.
101
+ * All properties are optional. Sensible defaults allow zero-config operation
102
+ * using the keyword fallback classifier.
342
103
  */
343
- export interface AnnotatedClassificationResult extends ClassificationResult {
344
- /**
345
- * The {@link IContentClassifier.id} of the classifier that produced this
346
- * result (e.g. `ML_CLASSIFIER_SERVICE_IDS.TOXICITY_PIPELINE`).
347
- */
348
- classifierId: string;
349
-
104
+ export interface MLClassifierOptions {
350
105
  /**
351
- * Wall-clock time in milliseconds from when `classify()` was called to when
352
- * it resolved.
106
+ * Which safety categories to evaluate.
107
+ * @default ALL_CATEGORIES
353
108
  */
354
- latencyMs: number;
355
- }
109
+ categories?: ClassifierCategory[];
356
110
 
357
- /**
358
- * Aggregated evaluation outcome for a single sliding-window chunk.
359
- *
360
- * Produced by running all active classifiers against one text window and
361
- * collating their results into a single action recommendation.
362
- *
363
- * The `recommendedAction` is the most restrictive action across all
364
- * classifiers (BLOCK > FLAG > SANITIZE > ALLOW).
365
- */
366
- export interface ChunkEvaluation {
367
111
  /**
368
- * Individual results from every classifier that evaluated this chunk,
369
- * in the order the classifiers were invoked.
112
+ * Per-category confidence thresholds that override the global defaults.
113
+ *
114
+ * Keys are category names; values are threshold overrides with optional
115
+ * `flag` and `block` levels.
116
+ *
117
+ * @example `{ toxic: { flag: 0.4, block: 0.7 } }`
370
118
  */
371
- results: AnnotatedClassificationResult[];
119
+ thresholds?: Partial<Record<ClassifierCategory, { flag?: number; block?: number }>>;
372
120
 
373
121
  /**
374
- * The most restrictive guardrail action recommended across all results.
375
- * The pipeline should act on this value rather than iterating `results`
376
- * manually.
122
+ * Global flag threshold applied to all categories that do not have a
123
+ * per-category override.
124
+ * @default 0.5
377
125
  */
378
- recommendedAction: GuardrailAction;
126
+ flagThreshold?: number;
379
127
 
380
128
  /**
381
- * ID of the classifier that triggered the `recommendedAction`, or `null`
382
- * if the action is {@link GuardrailAction.ALLOW} (no classifier triggered).
129
+ * Global block threshold applied to all categories that do not have a
130
+ * per-category override.
131
+ * @default 0.8
383
132
  */
384
- triggeredBy: string | null;
133
+ blockThreshold?: number;
385
134
 
386
135
  /**
387
- * Sum of all classifier `latencyMs` values useful for profiling the
388
- * total evaluation overhead per chunk.
136
+ * Optional LLM invoker callback. When provided and ONNX models are
137
+ * unavailable, the classifier will fall back to LLM-as-judge classification
138
+ * using this callback.
139
+ *
140
+ * When omitted AND ONNX models are unavailable, the classifier falls back
141
+ * to keyword-based detection.
389
142
  */
390
- totalLatencyMs: number;
143
+ llmInvoker?: LlmInvoker;
391
144
  }