@framers/agentos-ext-ml-classifiers 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (62) hide show
  1. package/LICENSE +23 -0
  2. package/dist/ClassifierOrchestrator.d.ts +126 -0
  3. package/dist/ClassifierOrchestrator.d.ts.map +1 -0
  4. package/dist/ClassifierOrchestrator.js +239 -0
  5. package/dist/ClassifierOrchestrator.js.map +1 -0
  6. package/dist/IContentClassifier.d.ts +117 -0
  7. package/dist/IContentClassifier.d.ts.map +1 -0
  8. package/dist/IContentClassifier.js +22 -0
  9. package/dist/IContentClassifier.js.map +1 -0
  10. package/dist/MLClassifierGuardrail.d.ts +163 -0
  11. package/dist/MLClassifierGuardrail.d.ts.map +1 -0
  12. package/dist/MLClassifierGuardrail.js +335 -0
  13. package/dist/MLClassifierGuardrail.js.map +1 -0
  14. package/dist/SlidingWindowBuffer.d.ts +213 -0
  15. package/dist/SlidingWindowBuffer.d.ts.map +1 -0
  16. package/dist/SlidingWindowBuffer.js +246 -0
  17. package/dist/SlidingWindowBuffer.js.map +1 -0
  18. package/dist/classifiers/InjectionClassifier.d.ts +126 -0
  19. package/dist/classifiers/InjectionClassifier.d.ts.map +1 -0
  20. package/dist/classifiers/InjectionClassifier.js +210 -0
  21. package/dist/classifiers/InjectionClassifier.js.map +1 -0
  22. package/dist/classifiers/JailbreakClassifier.d.ts +124 -0
  23. package/dist/classifiers/JailbreakClassifier.d.ts.map +1 -0
  24. package/dist/classifiers/JailbreakClassifier.js +208 -0
  25. package/dist/classifiers/JailbreakClassifier.js.map +1 -0
  26. package/dist/classifiers/ToxicityClassifier.d.ts +125 -0
  27. package/dist/classifiers/ToxicityClassifier.d.ts.map +1 -0
  28. package/dist/classifiers/ToxicityClassifier.js +212 -0
  29. package/dist/classifiers/ToxicityClassifier.js.map +1 -0
  30. package/dist/classifiers/WorkerClassifierProxy.d.ts +158 -0
  31. package/dist/classifiers/WorkerClassifierProxy.d.ts.map +1 -0
  32. package/dist/classifiers/WorkerClassifierProxy.js +268 -0
  33. package/dist/classifiers/WorkerClassifierProxy.js.map +1 -0
  34. package/dist/index.d.ts +110 -0
  35. package/dist/index.d.ts.map +1 -0
  36. package/dist/index.js +342 -0
  37. package/dist/index.js.map +1 -0
  38. package/dist/tools/ClassifyContentTool.d.ts +105 -0
  39. package/dist/tools/ClassifyContentTool.d.ts.map +1 -0
  40. package/dist/tools/ClassifyContentTool.js +149 -0
  41. package/dist/tools/ClassifyContentTool.js.map +1 -0
  42. package/dist/types.d.ts +319 -0
  43. package/dist/types.d.ts.map +1 -0
  44. package/dist/types.js +62 -0
  45. package/dist/types.js.map +1 -0
  46. package/dist/worker/classifier-worker.d.ts +49 -0
  47. package/dist/worker/classifier-worker.d.ts.map +1 -0
  48. package/dist/worker/classifier-worker.js +180 -0
  49. package/dist/worker/classifier-worker.js.map +1 -0
  50. package/package.json +45 -0
  51. package/src/ClassifierOrchestrator.ts +290 -0
  52. package/src/IContentClassifier.ts +124 -0
  53. package/src/MLClassifierGuardrail.ts +419 -0
  54. package/src/SlidingWindowBuffer.ts +384 -0
  55. package/src/classifiers/InjectionClassifier.ts +261 -0
  56. package/src/classifiers/JailbreakClassifier.ts +259 -0
  57. package/src/classifiers/ToxicityClassifier.ts +263 -0
  58. package/src/classifiers/WorkerClassifierProxy.ts +366 -0
  59. package/src/index.ts +383 -0
  60. package/src/tools/ClassifyContentTool.ts +201 -0
  61. package/src/types.ts +391 -0
  62. package/src/worker/classifier-worker.ts +267 -0
@@ -0,0 +1,290 @@
1
+ /**
2
+ * @fileoverview Orchestrator for parallel ML classifier execution with worst-wins aggregation.
3
+ *
4
+ * The `ClassifierOrchestrator` runs all registered {@link IContentClassifier}
5
+ * instances in parallel against a single text input and aggregates their
6
+ * results into a single {@link ChunkEvaluation}. The aggregation policy is
7
+ * **worst-wins**: if any classifier recommends BLOCK the overall result is
8
+ * BLOCK, even if every other classifier returned ALLOW.
9
+ *
10
+ * Priority order (descending):
11
+ * ```
12
+ * BLOCK > FLAG > SANITIZE > ALLOW
13
+ * ```
14
+ *
15
+ * Each classifier may have its own threshold overrides (via
16
+ * `perClassifierThresholds`). Actions are derived from numeric thresholds
17
+ * only; `ClassifierConfig.labelActions` is not consulted by this class.
18
+ *
19
+ * @module agentos/extensions/packs/ml-classifiers/ClassifierOrchestrator
20
+ */
21
+
22
+ import type { IContentClassifier } from './IContentClassifier';
23
+ import type {
24
+ AnnotatedClassificationResult,
25
+ ChunkEvaluation,
26
+ ClassifierThresholds,
27
+ ClassifierConfig,
28
+ } from './types';
29
+ import { DEFAULT_THRESHOLDS } from './types';
30
+ import { GuardrailAction } from '@framers/agentos';
31
+
32
+ // ---------------------------------------------------------------------------
33
+ // Action severity ranking — used by worst-wins aggregation
34
+ // ---------------------------------------------------------------------------
35
+
36
/**
 * Severity rank for every {@link GuardrailAction}. A larger number means a
 * more restrictive action, so worst-wins aggregation can compare two actions
 * numerically instead of relying on the ordering of the action values
 * themselves.
 */
const ACTION_SEVERITY: Record<GuardrailAction, number> = {
  // Least restrictive: content passes through untouched.
  [GuardrailAction.ALLOW]: 0,
  [GuardrailAction.SANITIZE]: 1,
  [GuardrailAction.FLAG]: 2,
  // Most restrictive: always wins the aggregation.
  [GuardrailAction.BLOCK]: 3,
};
47
+
48
+ // ---------------------------------------------------------------------------
49
+ // ClassifierOrchestrator
50
+ // ---------------------------------------------------------------------------
51
+
52
/**
 * Runs every registered ML classifier against a piece of text concurrently
 * and reduces the individual verdicts to a single {@link ChunkEvaluation}
 * using worst-wins aggregation (BLOCK > FLAG > SANITIZE > ALLOW).
 *
 * @example
 * ```typescript
 * const orchestrator = new ClassifierOrchestrator(
 *   [toxicityClassifier, injectionClassifier],
 *   DEFAULT_THRESHOLDS,
 * );
 *
 * const evaluation = await orchestrator.classifyAll('some user message');
 * if (evaluation.recommendedAction === GuardrailAction.BLOCK) {
 *   // Terminate the interaction.
 * }
 * ```
 */
export class ClassifierOrchestrator {
  // -------------------------------------------------------------------------
  // Private state
  // -------------------------------------------------------------------------

  /**
   * Classifiers invoked on every `classifyAll()` call. The array reference is
   * the one handed to the constructor; each call works on its own snapshot.
   */
  private readonly classifiers: IContentClassifier[];

  /** Thresholds applied to any classifier without a per-classifier override. */
  private readonly defaultThresholds: ClassifierThresholds;

  /**
   * Optional per-classifier threshold overrides, keyed by classifier ID.
   * Fields present here take precedence over {@link defaultThresholds} for
   * that classifier only.
   */
  private readonly perClassifierThresholds: Record<string, Partial<ClassifierThresholds>>;

  // -------------------------------------------------------------------------
  // Constructor
  // -------------------------------------------------------------------------

  /**
   * Create a new orchestrator.
   *
   * @param classifiers - Classifier instances to run in parallel.
   * @param defaultThresholds - Thresholds applied to every classifier unless
   *   overridden by `perClassifierThresholds`.
   * @param perClassifierThresholds - Optional map from classifier ID to
   *   partial threshold overrides. Missing fields fall back to
   *   `defaultThresholds`.
   */
  constructor(
    classifiers: IContentClassifier[],
    defaultThresholds: ClassifierThresholds = DEFAULT_THRESHOLDS,
    perClassifierThresholds: Record<string, Partial<ClassifierThresholds>> = {},
  ) {
    this.classifiers = classifiers;
    this.defaultThresholds = defaultThresholds;
    this.perClassifierThresholds = perClassifierThresholds;
  }

  // -------------------------------------------------------------------------
  // Public API
  // -------------------------------------------------------------------------

  /**
   * Classify `text` with every registered classifier in parallel and return
   * the aggregated {@link ChunkEvaluation}.
   *
   * Execution details:
   * 1. All classifiers run concurrently via `Promise.allSettled`.
   * 2. Fulfilled results are annotated with `classifierId` and `latencyMs`
   *    and collected in `results`.
   * 3. Rejected promises log a warning and are otherwise skipped. They do
   *    not appear in `results` and cannot raise the recommended action, so a
   *    failing classifier behaves like an implicit ALLOW (fail-open).
   * 4. Each fulfilled result is mapped to a {@link GuardrailAction} using the
   *    classifier's resolved thresholds.
   * 5. `recommendedAction` is the most restrictive action seen. On a tie, the
   *    classifier that reached that severity first (in registration order)
   *    is reported in `triggeredBy`; it is `null` when everything is ALLOW.
   *
   * @param text - The text to evaluate. It is passed to the classifiers
   *   unchanged; no emptiness check is performed here.
   * @returns A promise resolving to the aggregated evaluation. This method
   *   does not reject because of an individual classifier failure.
   */
  async classifyAll(text: string): Promise<ChunkEvaluation> {
    // Wall-clock start, so `totalLatencyMs` reflects elapsed real time rather
    // than the sum of the individual classifier latencies.
    const wallStart = performance.now();

    // Snapshot the classifier list for this call. `settled[i]` is matched to
    // `classifiers[i]` after the await; without a snapshot, a mutation of the
    // caller-owned array while inference is in flight would attribute results
    // (and thresholds / `triggeredBy`) to the wrong classifier.
    const classifiers = [...this.classifiers];

    const settled = await Promise.allSettled(
      classifiers.map((classifier) => this.timedClassify(classifier, text)),
    );

    const results: AnnotatedClassificationResult[] = [];
    let worstAction = GuardrailAction.ALLOW;
    let triggeredBy: string | null = null;

    for (let i = 0; i < settled.length; i++) {
      const outcome = settled[i];
      const classifier = classifiers[i];

      if (outcome.status === 'rejected') {
        // Deliberate fail-open: one broken classifier must not block all
        // content. The failure is logged so it does not go unnoticed.
        console.warn(
          `[ClassifierOrchestrator] Classifier "${classifier.id}" failed: ${outcome.reason}`,
        );
        continue;
      }

      const annotated = outcome.value;
      results.push(annotated);

      const thresholds = this.resolveThresholds(classifier.id);
      const action = this.scoreToAction(annotated, thresholds);

      // Worst-wins: strictly-greater comparison keeps the first classifier
      // that reached a given severity as `triggeredBy`.
      if (ACTION_SEVERITY[action] > ACTION_SEVERITY[worstAction]) {
        worstAction = action;
        triggeredBy = classifier.id;
      }
    }

    const wallEnd = performance.now();

    return {
      results,
      recommendedAction: worstAction,
      triggeredBy,
      totalLatencyMs: Math.round(wallEnd - wallStart),
    };
  }

  /**
   * Dispose every registered classifier.
   *
   * Calls each classifier's `dispose()` method when it exists. A failure in
   * one classifier is logged as a warning and does not prevent the others
   * from being disposed; this method itself never rejects because of a
   * classifier's `dispose()` failure.
   */
  async dispose(): Promise<void> {
    const classifiers = [...this.classifiers];

    const settled = await Promise.allSettled(
      classifiers.map(async (classifier) => {
        if (classifier.dispose) {
          await classifier.dispose();
        }
      }),
    );

    // Best-effort cleanup: report failures instead of discarding them silently.
    for (let i = 0; i < settled.length; i++) {
      const outcome = settled[i];
      if (outcome.status === 'rejected') {
        console.warn(
          `[ClassifierOrchestrator] Failed to dispose classifier "${classifiers[i].id}": ${outcome.reason}`,
        );
      }
    }
  }

  // -------------------------------------------------------------------------
  // Private helpers
  // -------------------------------------------------------------------------

  /**
   * Invoke a single classifier and measure how long the call took.
   *
   * @param classifier - The classifier to invoke.
   * @param text - The text to classify.
   * @returns The classifier's result spread into a new object, with
   *   `classifierId` and a rounded `latencyMs` added.
   * @throws Whatever `classifier.classify()` rejects with; `classifyAll()`
   *   absorbs this through `Promise.allSettled`.
   */
  private async timedClassify(
    classifier: IContentClassifier,
    text: string,
  ): Promise<AnnotatedClassificationResult> {
    const start = performance.now();
    const result = await classifier.classify(text);
    const latencyMs = Math.round(performance.now() - start);

    return {
      ...result,
      classifierId: classifier.id,
      latencyMs,
    };
  }

  /**
   * Map a classifier's confidence score to a {@link GuardrailAction} by
   * numeric threshold comparison, checked in descending severity order:
   *
   * 1. `confidence >= blockThreshold` -> BLOCK
   * 2. `confidence >= flagThreshold`  -> FLAG
   * 3. `confidence >= warnThreshold`  -> SANITIZE
   * 4. otherwise                      -> ALLOW
   *
   * Only thresholds are considered; label-specific action mappings
   * (`labelActions`) are not consulted by this method. When `confidence` is
   * an array, its first element is used (or 0 for an empty array). A `NaN`
   * confidence fails every comparison and therefore yields ALLOW.
   *
   * @param result - The annotated classification result.
   * @param thresholds - Resolved thresholds for this classifier.
   * @returns The guardrail action implied by the confidence score.
   */
  private scoreToAction(
    result: AnnotatedClassificationResult,
    thresholds: ClassifierThresholds,
  ): GuardrailAction {
    // Normalise `number | number[]` to a single number.
    const confidence = Array.isArray(result.confidence)
      ? result.confidence[0] ?? 0
      : result.confidence;

    if (confidence >= thresholds.blockThreshold) {
      return GuardrailAction.BLOCK;
    }
    if (confidence >= thresholds.flagThreshold) {
      return GuardrailAction.FLAG;
    }
    if (confidence >= thresholds.warnThreshold) {
      return GuardrailAction.SANITIZE;
    }

    return GuardrailAction.ALLOW;
  }

  /**
   * Resolve the effective thresholds for a classifier by layering its
   * overrides (if any) on top of the defaults.
   *
   * `??` is used rather than `||` so that an explicit override of `0` is
   * honoured instead of being replaced by the default.
   *
   * @param classifierId - ID of the classifier to resolve thresholds for.
   * @returns The default thresholds object itself when no overrides exist,
   *   otherwise a new object with every field populated.
   */
  private resolveThresholds(classifierId: string): ClassifierThresholds {
    const overrides = this.perClassifierThresholds[classifierId];
    if (!overrides) {
      return this.defaultThresholds;
    }

    return {
      blockThreshold: overrides.blockThreshold ?? this.defaultThresholds.blockThreshold,
      flagThreshold: overrides.flagThreshold ?? this.defaultThresholds.flagThreshold,
      warnThreshold: overrides.warnThreshold ?? this.defaultThresholds.warnThreshold,
    };
  }
}
@@ -0,0 +1,124 @@
1
+ /**
2
+ * @fileoverview Interface contract for ML-backed content classifiers.
3
+ *
4
+ * An `IContentClassifier` represents a single model pipeline that accepts
5
+ * arbitrary text and returns a {@link ClassificationResult} containing the
6
+ * winning label and confidence scores for all candidate classes.
7
+ *
8
+ * Built-in implementations (toxicity, injection, jailbreak) each implement
9
+ * this interface. Third-party classifiers may be registered via the
10
+ * `customClassifiers` option of {@link MLClassifierPackOptions}.
11
+ *
12
+ * Lifecycle
13
+ * ---------
14
+ * 1. The pack initialises each classifier (model loading, warm-up).
15
+ * 2. The guardrail pipeline calls `classify()` for every text chunk.
16
+ * 3. On pack teardown, `dispose()` is called (if present) to release GPU/
17
+ * WASM memory.
18
+ *
19
+ * @module agentos/extensions/packs/ml-classifiers/IContentClassifier
20
+ */
21
+
22
+ import type { ClassificationResult } from '@framers/agentos';
23
+
24
/**
 * Contract implemented by every ML content classifier.
 *
 * Each implementation wraps one model pipeline behind a small
 * classify/dispose surface, so the orchestrator can treat all classifiers
 * the same way regardless of the runtime that backs them.
 *
 * @example Minimal custom classifier
 * ```typescript
 * class SarcasmClassifier implements IContentClassifier {
 *   readonly id = 'custom:sarcasm-detector';
 *   readonly displayName = 'Sarcasm Detector';
 *   readonly description = 'Detects sarcastic or ironic statements.';
 *   readonly modelId = 'my-org/sarcasm-bert';
 *   isLoaded = false;
 *
 *   async classify(text: string): Promise<ClassificationResult> {
 *     // … run inference …
 *     return { bestClass: 'NOT_SARCASTIC', confidence: 0.8, allScores: [] };
 *   }
 *
 *   async dispose(): Promise<void> {
 *     // Free resources.
 *   }
 * }
 * ```
 */
export interface IContentClassifier {
  /**
   * Unique identifier of this classifier.
   *
   * Documented convention: `agentos:<domain>:<name>`.
   * NOTE(review): the interface-level example uses a `custom:` prefix, which
   * does not match this convention — confirm which form is intended for
   * third-party classifiers.
   *
   * @example `'agentos:ml-classifiers:toxicity-pipeline'`
   */
  readonly id: string;

  /**
   * Human-readable name for logs and dashboards.
   *
   * @example `'Toxicity Pipeline'`
   */
  readonly displayName: string;

  /**
   * Short description of what this classifier detects.
   *
   * @example `'Detects toxic, hateful, or abusive language in text.'`
   */
  readonly description: string;

  /**
   * Identifier of the underlying model, typically a Hugging Face model ID or
   * a local filesystem path.
   *
   * @example `'Xenova/toxic-bert'`
   */
  readonly modelId: string;

  /**
   * `true` once the model is loaded and the classifier can serve
   * `classify()` calls. Mutable (not `readonly`) so it can be flipped after
   * loading completes; callers may check it before calling `classify()`.
   */
  isLoaded: boolean;

  /**
   * Classify `text` and report the winning label with its confidence.
   *
   * Implementations translate raw model output into a
   * `ClassificationResult`. They should not apply thresholds or decide on
   * guardrail actions; that is left to the caller.
   *
   * @param text - Text to classify: a streamed chunk or a whole message.
   *   Expected to be non-empty.
   * @returns A promise resolving to the result, including `bestClass`, its
   *   `confidence`, and `allScores` for the labels the model evaluated.
   * @throws {Error} If the model is not loaded or inference fails
   *   unrecoverably.
   */
  classify(text: string): Promise<ClassificationResult>;

  /**
   * Release every resource held by this classifier.
   *
   * Implementations should be idempotent: calling `dispose()` more than once
   * must not throw. Classifiers that hold no persistent resources may omit
   * this method entirely.
   */
  dispose?(): Promise<void>;
}