@framers/agentos-ext-ml-classifiers 0.1.0 → 0.2.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (51) hide show
  1. package/CHANGELOG.md +18 -0
  2. package/dist/MLClassifierGuardrail.d.ts +88 -117
  3. package/dist/MLClassifierGuardrail.d.ts.map +1 -1
  4. package/dist/MLClassifierGuardrail.js +255 -264
  5. package/dist/MLClassifierGuardrail.js.map +1 -1
  6. package/dist/classifiers/InjectionClassifier.d.ts +1 -1
  7. package/dist/classifiers/InjectionClassifier.d.ts.map +1 -1
  8. package/dist/classifiers/JailbreakClassifier.d.ts +1 -1
  9. package/dist/classifiers/JailbreakClassifier.d.ts.map +1 -1
  10. package/dist/classifiers/ToxicityClassifier.d.ts +1 -1
  11. package/dist/classifiers/ToxicityClassifier.d.ts.map +1 -1
  12. package/dist/classifiers/WorkerClassifierProxy.d.ts +1 -1
  13. package/dist/classifiers/WorkerClassifierProxy.d.ts.map +1 -1
  14. package/dist/index.d.ts +16 -90
  15. package/dist/index.d.ts.map +1 -1
  16. package/dist/index.js +33 -306
  17. package/dist/index.js.map +1 -1
  18. package/dist/keyword-classifier.d.ts +26 -0
  19. package/dist/keyword-classifier.d.ts.map +1 -0
  20. package/dist/keyword-classifier.js +113 -0
  21. package/dist/keyword-classifier.js.map +1 -0
  22. package/dist/llm-classifier.d.ts +27 -0
  23. package/dist/llm-classifier.d.ts.map +1 -0
  24. package/dist/llm-classifier.js +129 -0
  25. package/dist/llm-classifier.js.map +1 -0
  26. package/dist/tools/ClassifyContentTool.d.ts +53 -80
  27. package/dist/tools/ClassifyContentTool.d.ts.map +1 -1
  28. package/dist/tools/ClassifyContentTool.js +52 -103
  29. package/dist/tools/ClassifyContentTool.js.map +1 -1
  30. package/dist/types.d.ts +77 -277
  31. package/dist/types.d.ts.map +1 -1
  32. package/dist/types.js +9 -55
  33. package/dist/types.js.map +1 -1
  34. package/package.json +10 -16
  35. package/src/MLClassifierGuardrail.ts +279 -316
  36. package/src/index.ts +35 -339
  37. package/src/keyword-classifier.ts +130 -0
  38. package/src/llm-classifier.ts +163 -0
  39. package/src/tools/ClassifyContentTool.ts +75 -132
  40. package/src/types.ts +78 -325
  41. package/test/ClassifierOrchestrator.spec.ts +365 -0
  42. package/test/ClassifyContentTool.spec.ts +226 -0
  43. package/test/InjectionClassifier.spec.ts +263 -0
  44. package/test/JailbreakClassifier.spec.ts +295 -0
  45. package/test/MLClassifierGuardrail.spec.ts +486 -0
  46. package/test/SlidingWindowBuffer.spec.ts +391 -0
  47. package/test/ToxicityClassifier.spec.ts +268 -0
  48. package/test/WorkerClassifierProxy.spec.ts +303 -0
  49. package/test/index.spec.ts +431 -0
  50. package/tsconfig.json +20 -0
  51. package/vitest.config.ts +24 -0
package/CHANGELOG.md ADDED
@@ -0,0 +1,18 @@
1
+ # @framers/agentos-ext-ml-classifiers
2
+
3
+ ## 0.2.1
4
+
5
+ ### Patch Changes
6
+
7
+ - [`15065c9`](https://github.com/framersai/agentos-extensions/commit/15065c949ea5d25f4408ffab2079ad3e600ddded) Thanks [@jddunn](https://github.com/jddunn)! - Fix npm publish: add missing repository.url field for sigstore provenance verification
8
+
9
+ ## 0.2.0
10
+
11
+ ### Minor Changes
12
+
13
+ - [`c35afe8`](https://github.com/framersai/agentos-extensions/commit/c35afe8c16fdf51df6ce2d0bb83de6cd702e3a8b) Thanks [@jddunn](https://github.com/jddunn)! - Implement all 5 guardrail extension packs with full detection logic:
14
+ - PII Redaction: 4-tier detection (regex + keyword + NER + LLM)
15
+ - Code Safety: OWASP regex patterns for SQL injection, XSS, command injection
16
+ - ML Classifiers: toxicity/injection/NSFW via ONNX or LLM fallback
17
+ - Topicality: embedding-based topic enforcement with LLM fallback
18
+ - Grounding Guard: NLI-based hallucination detection against RAG sources
@@ -1,163 +1,134 @@
1
1
  /**
2
- * @fileoverview IGuardrailService implementation backed by ML classifiers.
2
+ * @file MLClassifierGuardrail.ts
3
+ * @description IGuardrailService implementation that classifies text for toxicity,
4
+ * prompt injection, NSFW content, and threats using a three-tier strategy:
3
5
  *
4
- * `MLClassifierGuardrail` bridges the AgentOS guardrail pipeline to the ML
5
- * classifier subsystem. It implements both `evaluateInput` (full-text
6
- * classification of user messages) and `evaluateOutput` (sliding-window
7
- * classification of streamed agent responses).
6
+ * 1. **ONNX inference** — attempts to load `@huggingface/transformers` at runtime
7
+ * and run a lightweight ONNX classification model.
8
+ * 2. **LLM-as-judge** — falls back to an LLM invoker callback that prompts a
9
+ * language model for structured JSON safety classification.
10
+ * 3. **Keyword matching** — last-resort regex/keyword-based detection when neither
11
+ * ONNX nor LLM is available.
8
12
  *
9
- * Three streaming evaluation modes are supported:
13
+ * The guardrail is configured as Phase 2 (parallel, non-sanitizing) so it runs
14
+ * alongside other read-only guardrails without blocking the streaming pipeline.
10
15
  *
11
- * | Mode | Behaviour |
12
- * |---------------|----------------------------------------------------------------|
13
- * | `blocking` | Every chunk that fills the sliding window is classified |
14
- * | | **synchronously** — the stream waits for the result. |
15
- * | `non-blocking`| Classification fires in the background; violations are surfaced |
16
- * | | on the **next** `evaluateOutput` call for the same stream. |
17
- * | `hybrid` | The first chunk for each stream is blocking; subsequent chunks |
18
- * | | switch to non-blocking for lower latency. |
16
+ * ### Action thresholds
19
17
  *
20
- * The default mode is `blocking` when `streamingMode` is enabled.
18
+ * - **FLAG** when any category confidence exceeds `flagThreshold` (default 0.5).
19
+ * - **BLOCK** when any category confidence exceeds `blockThreshold` (default 0.8).
21
20
  *
22
- * @module agentos/extensions/packs/ml-classifiers/MLClassifierGuardrail
21
+ * @module ml-classifiers/MLClassifierGuardrail
23
22
  */
24
- import type { GuardrailConfig, GuardrailEvaluationResult, GuardrailInputPayload, GuardrailOutputPayload, IGuardrailService } from '@framers/agentos';
25
- import type { ISharedServiceRegistry } from '@framers/agentos';
26
- import type { MLClassifierPackOptions } from './types';
27
- import type { IContentClassifier } from './IContentClassifier';
23
+ import type { IGuardrailService, GuardrailConfig, GuardrailInputPayload, GuardrailOutputPayload, GuardrailEvaluationResult } from '@framers/agentos';
24
+ import type { MLClassifierOptions, ClassifierResult } from './types';
28
25
  /**
29
- * Guardrail implementation that runs ML classifiers against both user input
30
- * and streamed agent output.
26
+ * AgentOS guardrail that classifies text for safety using ML models, LLM
27
+ * inference, or keyword fallback.
31
28
  *
32
29
  * @implements {IGuardrailService}
33
- *
34
- * @example
35
- * ```typescript
36
- * const guardrail = new MLClassifierGuardrail(serviceRegistry, {
37
- * classifiers: ['toxicity'],
38
- * streamingMode: true,
39
- * chunkSize: 150,
40
- * guardrailScope: 'both',
41
- * });
42
- *
43
- * // Input evaluation — runs classifier on the full user message.
44
- * const inputResult = await guardrail.evaluateInput({ context, input });
45
- *
46
- * // Output evaluation — accumulates tokens, classifies at window boundary.
47
- * const outputResult = await guardrail.evaluateOutput({ context, chunk });
48
- * ```
49
30
  */
50
31
  export declare class MLClassifierGuardrail implements IGuardrailService {
51
32
  /**
52
- * Guardrail configuration exposed to the AgentOS pipeline.
33
+ * Guardrail configuration.
53
34
  *
54
- * `evaluateStreamingChunks` is always `true` because this guardrail uses
55
- * the sliding window to evaluate output tokens incrementally.
35
+ * - `canSanitize: false` — this guardrail does not modify content; it only
36
+ * BLOCKs or FLAGs. This places it in Phase 2 (parallel) of the guardrail
37
+ * dispatcher for better performance.
38
+ * - `evaluateStreamingChunks: false` — only evaluates complete messages, not
39
+ * individual streaming deltas. ML classification on partial text produces
40
+ * unreliable results.
56
41
  */
57
42
  readonly config: GuardrailConfig;
58
- /** The classifier orchestrator that runs all classifiers in parallel. */
59
- private readonly orchestrator;
60
- /** Sliding window buffer for accumulating streaming tokens. */
61
- private readonly buffer;
62
- /** Guardrail scope — which direction(s) this guardrail evaluates. */
63
- private readonly scope;
64
- /** Streaming evaluation strategy for output chunks. */
65
- private readonly streamingMode;
43
+ /** Categories to evaluate. */
44
+ private readonly categories;
45
+ /** Per-category flag thresholds. */
46
+ private readonly flagThresholds;
47
+ /** Per-category block thresholds. */
48
+ private readonly blockThresholds;
49
+ /** Optional LLM invoker callback for tier-2 classification. */
50
+ private readonly llmInvoker;
66
51
  /**
67
- * Map of stream IDs to pending (background) classification promises.
68
- * Used in `non-blocking` and `hybrid` modes to defer result checking
69
- * to the next `evaluateOutput` call.
52
+ * Cached reference to the `@huggingface/transformers` pipeline function.
53
+ * `null` means we already tried and failed to load the module.
54
+ * `undefined` means we have not tried yet.
70
55
  */
71
- private readonly pendingResults;
56
+ private onnxPipeline;
72
57
  /**
73
- * Tracks whether the first chunk for a given stream has been processed.
74
- * Used by `hybrid` mode to apply blocking evaluation on the first chunk
75
- * and non-blocking for subsequent chunks.
58
+ * Create a new MLClassifierGuardrail.
59
+ *
60
+ * @param options - Pack-level configuration. All properties have sensible
61
+ * defaults for zero-config operation.
76
62
  */
77
- private readonly isFirstChunk;
63
+ constructor(options?: MLClassifierOptions);
78
64
  /**
79
- * Create a new ML classifier guardrail.
65
+ * Evaluate user input for safety before orchestration begins.
80
66
  *
81
- * @param _services - Shared service registry (reserved for future use by
82
- * classifier factories that need lazy model loading).
83
- * @param options - Pack-level options controlling classifier selection,
84
- * thresholds, sliding window size, and streaming mode.
85
- * @param classifiers - Pre-built classifier instances. When provided,
86
- * these are used directly instead of constructing
87
- * classifiers from `options.classifiers`.
67
+ * @param payload - Input evaluation payload containing the user's message.
68
+ * @returns Guardrail result or `null` if no action is required.
88
69
  */
89
- constructor(_services: ISharedServiceRegistry, options: MLClassifierPackOptions, classifiers?: IContentClassifier[]);
70
+ evaluateInput(payload: GuardrailInputPayload): Promise<GuardrailEvaluationResult | null>;
90
71
  /**
91
- * Evaluate a user's input message before it enters the orchestration pipeline.
92
- *
93
- * Runs the full text through all registered classifiers and returns a
94
- * {@link GuardrailEvaluationResult} when a violation is detected, or
95
- * `null` when the content is clean.
72
+ * Evaluate agent output for safety. Only processes FINAL_RESPONSE chunks
73
+ * since `evaluateStreamingChunks` is disabled.
96
74
  *
97
- * Skipped entirely when `scope === 'output'`.
75
+ * @param payload - Output evaluation payload from the AgentOS dispatcher.
76
+ * @returns Guardrail result or `null` if no action is required.
77
+ */
78
+ evaluateOutput(payload: GuardrailOutputPayload): Promise<GuardrailEvaluationResult | null>;
79
+ /**
80
+ * Classify a text string using the three-tier strategy: ONNX -> LLM -> keyword.
98
81
  *
99
- * @param payload - The input payload containing user text and context.
100
- * @returns Evaluation result or `null` if no action is needed.
82
+ * @param text - The text to classify.
83
+ * @returns Classification result with per-category scores.
101
84
  */
102
- evaluateInput(payload: GuardrailInputPayload): Promise<GuardrailEvaluationResult | null>;
85
+ classify(text: string): Promise<ClassifierResult>;
103
86
  /**
104
- * Evaluate a streamed output chunk from the agent before it is delivered
105
- * to the client.
87
+ * Attempt to load `@huggingface/transformers` and run ONNX-based text
88
+ * classification. Returns `null` if the module is unavailable or inference
89
+ * fails.
106
90
  *
107
- * The method accumulates text tokens in the sliding window buffer and
108
- * triggers classifier evaluation when a full window is available. The
109
- * evaluation strategy depends on the configured streaming mode.
91
+ * The module load is attempted only once; subsequent calls use the cached
92
+ * result (either a working pipeline or `null`).
110
93
  *
111
- * Skipped entirely when `scope === 'input'`.
94
+ * @param text - Text to classify.
95
+ * @returns Classification result or `null`.
112
96
  *
113
- * @param payload - The output payload containing the response chunk and context.
114
- * @returns Evaluation result or `null` if no action is needed yet.
97
+ * @internal
115
98
  */
116
- evaluateOutput(payload: GuardrailOutputPayload): Promise<GuardrailEvaluationResult | null>;
99
+ private tryOnnxClassification;
117
100
  /**
118
- * **Blocking mode**: push text into the buffer and, when a full window is
119
- * ready, await the classifier result before returning.
101
+ * Map raw ONNX text-classification output labels to our standard categories.
120
102
  *
121
- * @param streamId - Identifier of the active stream.
122
- * @param textDelta - New text fragment from the current chunk.
123
- * @returns Evaluation result (possibly BLOCK/FLAG) or `null`.
124
- */
125
- private handleBlocking;
126
- /**
127
- * **Non-blocking mode**: push text into the buffer. When a window is
128
- * ready, fire classification in the background and store the promise.
129
- * On the **next** `evaluateOutput` call for the same stream, check the
130
- * pending promise — if it resolved with a violation, return that result.
103
+ * ONNX models (e.g. toxic-bert) produce labels like `"toxic"`, `"obscene"`,
104
+ * `"threat"`, `"insult"`, `"identity_hate"`, etc. We map these to our four
105
+ * categories, taking the max score when multiple ONNX labels map to the same
106
+ * category.
107
+ *
108
+ * @param raw - Raw ONNX pipeline output.
109
+ * @returns Per-category scores.
131
110
  *
132
- * @param streamId - Identifier of the active stream.
133
- * @param textDelta - New text fragment from the current chunk.
134
- * @returns A previously resolved violation result, or `null`.
111
+ * @internal
135
112
  */
136
- private handleNonBlocking;
113
+ private mapOnnxScores;
137
114
  /**
138
- * **Hybrid mode**: the first chunk for each stream is evaluated in
139
- * blocking mode; subsequent chunks use non-blocking.
115
+ * Classify text using the LLM-as-judge fallback.
140
116
  *
141
- * This provides immediate feedback on the first window (where early
142
- * jailbreak attempts are most likely) while minimising latency for the
143
- * remainder of the stream.
117
+ * @param text - Text to classify.
118
+ * @returns Classification result or `null` if the LLM call fails.
144
119
  *
145
- * @param streamId - Identifier of the active stream.
146
- * @param textDelta - New text fragment from the current chunk.
147
- * @returns Evaluation result or `null`.
120
+ * @internal
148
121
  */
149
- private handleHybrid;
122
+ private tryLlmClassification;
150
123
  /**
151
- * Convert a {@link ChunkEvaluation} into a {@link GuardrailEvaluationResult}
152
- * suitable for the AgentOS guardrail pipeline.
124
+ * Convert a {@link ClassifierResult} into a {@link GuardrailEvaluationResult},
125
+ * or return `null` when no thresholds are exceeded.
153
126
  *
154
- * Returns `null` when the recommended action is ALLOW (no intervention
155
- * needed). For all other actions, the evaluation details are attached as
156
- * metadata for audit/logging.
127
+ * @param result - Classification result from any tier.
128
+ * @returns Guardrail evaluation result or `null`.
157
129
  *
158
- * @param evaluation - Aggregated classifier evaluation.
159
- * @returns A guardrail result or `null` for clean content.
130
+ * @internal
160
131
  */
161
- private evaluationToResult;
132
+ private buildResult;
162
133
  }
163
134
  //# sourceMappingURL=MLClassifierGuardrail.d.ts.map
@@ -1 +1 @@
1
- {"version":3,"file":"MLClassifierGuardrail.d.ts","sourceRoot":"","sources":["../src/MLClassifierGuardrail.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;;;;;;GAsBG;AAEH,OAAO,KAAK,EACV,eAAe,EACf,yBAAyB,EACzB,qBAAqB,EACrB,sBAAsB,EACtB,iBAAiB,EAClB,MAAM,kBAAkB,CAAC;AAG1B,OAAO,KAAK,EAAE,sBAAsB,EAAE,MAAM,kBAAkB,CAAC;AAC/D,OAAO,KAAK,EAAE,uBAAuB,EAAmB,MAAM,SAAS,CAAC;AAIxE,OAAO,KAAK,EAAE,kBAAkB,EAAE,MAAM,sBAAsB,CAAC;AAmB/D;;;;;;;;;;;;;;;;;;;;;GAqBG;AACH,qBAAa,qBAAsB,YAAW,iBAAiB;IAK7D;;;;;OAKG;IACH,QAAQ,CAAC,MAAM,EAAE,eAAe,CAAC;IAMjC,yEAAyE;IACzE,OAAO,CAAC,QAAQ,CAAC,YAAY,CAAyB;IAEtD,+DAA+D;IAC/D,OAAO,CAAC,QAAQ,CAAC,MAAM,CAAsB;IAE7C,qEAAqE;IACrE,OAAO,CAAC,QAAQ,CAAC,KAAK,CAA8B;IAEpD,uDAAuD;IACvD,OAAO,CAAC,QAAQ,CAAC,aAAa,CAAgB;IAE9C;;;;OAIG;IACH,OAAO,CAAC,QAAQ,CAAC,cAAc,CAAoD;IAEnF;;;;OAIG;IACH,OAAO,CAAC,QAAQ,CAAC,YAAY,CAAmC;IAMhE;;;;;;;;;;OAUG;gBAED,SAAS,EAAE,sBAAsB,EACjC,OAAO,EAAE,uBAAuB,EAChC,WAAW,GAAE,kBAAkB,EAAO;IAsCxC;;;;;;;;;;;OAWG;IACG,aAAa,CAAC,OAAO,EAAE,qBAAqB,GAAG,OAAO,CAAC,yBAAyB,GAAG,IAAI,CAAC;IAuB9F;;;;;;;;;;;;OAYG;IACG,cAAc,CAAC,OAAO,EAAE,sBAAsB,GAAG,OAAO,CAAC,yBAAyB,GAAG,IAAI,CAAC;IA0DhG;;;;;;;OAOG;YACW,cAAc;IAc5B;;;;;;;;;OASG;YACW,iBAAiB;IAoC/B;;;;;;;;;;;OAWG;YACW,YAAY;IAqB1B;;;;;;;;;;OAUG;IACH,OAAO,CAAC,kBAAkB;CAsB3B"}
1
+ {"version":3,"file":"MLClassifierGuardrail.d.ts","sourceRoot":"","sources":["../src/MLClassifierGuardrail.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;;;;;GAqBG;AAEH,OAAO,KAAK,EACV,iBAAiB,EACjB,eAAe,EACf,qBAAqB,EACrB,sBAAsB,EACtB,yBAAyB,EAC1B,MAAM,kBAAkB,CAAC;AAG1B,OAAO,KAAK,EACV,mBAAmB,EAEnB,gBAAgB,EAEjB,MAAM,SAAS,CAAC;AASjB;;;;;GAKG;AACH,qBAAa,qBAAsB,YAAW,iBAAiB;IAK7D;;;;;;;;;OASG;IACH,QAAQ,CAAC,MAAM,EAAE,eAAe,CAG9B;IAMF,8BAA8B;IAC9B,OAAO,CAAC,QAAQ,CAAC,UAAU,CAAuB;IAElD,oCAAoC;IACpC,OAAO,CAAC,QAAQ,CAAC,cAAc,CAAqC;IAEpE,qCAAqC;IACrC,OAAO,CAAC,QAAQ,CAAC,eAAe,CAAqC;IAErE,+DAA+D;IAC/D,OAAO,CAAC,QAAQ,CAAC,UAAU,CAAoC;IAE/D;;;;OAIG;IACH,OAAO,CAAC,YAAY,CAAqC;IAMzD;;;;;OAKG;gBACS,OAAO,CAAC,EAAE,mBAAmB;IAuBzC;;;;;OAKG;IACG,aAAa,CAAC,OAAO,EAAE,qBAAqB,GAAG,OAAO,CAAC,yBAAyB,GAAG,IAAI,CAAC;IAY9F;;;;;;OAMG;IACG,cAAc,CAAC,OAAO,EAAE,sBAAsB,GAAG,OAAO,CAAC,yBAAyB,GAAG,IAAI,CAAC;IAmBhG;;;;;OAKG;IACG,QAAQ,CAAC,IAAI,EAAE,MAAM,GAAG,OAAO,CAAC,gBAAgB,CAAC;IAwBvD;;;;;;;;;;;;OAYG;YACW,qBAAqB;IAqCnC;;;;;;;;;;;;OAYG;IACH,OAAO,CAAC,aAAa;IAsCrB;;;;;;;OAOG;YACW,oBAAoB;IAuBlC;;;;;;;;OAQG;IACH,OAAO,CAAC,WAAW;CAsCpB"}