@framers/agentos-ext-ml-classifiers 0.1.0 → 0.2.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +18 -0
- package/dist/MLClassifierGuardrail.d.ts +88 -117
- package/dist/MLClassifierGuardrail.d.ts.map +1 -1
- package/dist/MLClassifierGuardrail.js +255 -264
- package/dist/MLClassifierGuardrail.js.map +1 -1
- package/dist/classifiers/InjectionClassifier.d.ts +1 -1
- package/dist/classifiers/InjectionClassifier.d.ts.map +1 -1
- package/dist/classifiers/JailbreakClassifier.d.ts +1 -1
- package/dist/classifiers/JailbreakClassifier.d.ts.map +1 -1
- package/dist/classifiers/ToxicityClassifier.d.ts +1 -1
- package/dist/classifiers/ToxicityClassifier.d.ts.map +1 -1
- package/dist/classifiers/WorkerClassifierProxy.d.ts +1 -1
- package/dist/classifiers/WorkerClassifierProxy.d.ts.map +1 -1
- package/dist/index.d.ts +16 -90
- package/dist/index.d.ts.map +1 -1
- package/dist/index.js +33 -306
- package/dist/index.js.map +1 -1
- package/dist/keyword-classifier.d.ts +26 -0
- package/dist/keyword-classifier.d.ts.map +1 -0
- package/dist/keyword-classifier.js +113 -0
- package/dist/keyword-classifier.js.map +1 -0
- package/dist/llm-classifier.d.ts +27 -0
- package/dist/llm-classifier.d.ts.map +1 -0
- package/dist/llm-classifier.js +129 -0
- package/dist/llm-classifier.js.map +1 -0
- package/dist/tools/ClassifyContentTool.d.ts +53 -80
- package/dist/tools/ClassifyContentTool.d.ts.map +1 -1
- package/dist/tools/ClassifyContentTool.js +52 -103
- package/dist/tools/ClassifyContentTool.js.map +1 -1
- package/dist/types.d.ts +77 -277
- package/dist/types.d.ts.map +1 -1
- package/dist/types.js +9 -55
- package/dist/types.js.map +1 -1
- package/package.json +10 -16
- package/src/MLClassifierGuardrail.ts +279 -316
- package/src/index.ts +35 -339
- package/src/keyword-classifier.ts +130 -0
- package/src/llm-classifier.ts +163 -0
- package/src/tools/ClassifyContentTool.ts +75 -132
- package/src/types.ts +78 -325
- package/test/ClassifierOrchestrator.spec.ts +365 -0
- package/test/ClassifyContentTool.spec.ts +226 -0
- package/test/InjectionClassifier.spec.ts +263 -0
- package/test/JailbreakClassifier.spec.ts +295 -0
- package/test/MLClassifierGuardrail.spec.ts +486 -0
- package/test/SlidingWindowBuffer.spec.ts +391 -0
- package/test/ToxicityClassifier.spec.ts +268 -0
- package/test/WorkerClassifierProxy.spec.ts +303 -0
- package/test/index.spec.ts +431 -0
- package/tsconfig.json +20 -0
- package/vitest.config.ts +24 -0
package/CHANGELOG.md
ADDED
|
@@ -0,0 +1,18 @@
|
|
|
1
|
+
# @framers/agentos-ext-ml-classifiers
|
|
2
|
+
|
|
3
|
+
## 0.2.1
|
|
4
|
+
|
|
5
|
+
### Patch Changes
|
|
6
|
+
|
|
7
|
+
- [`15065c9`](https://github.com/framersai/agentos-extensions/commit/15065c949ea5d25f4408ffab2079ad3e600ddded) Thanks [@jddunn](https://github.com/jddunn)! - Fix npm publish: add missing repository.url field for sigstore provenance verification
|
|
8
|
+
|
|
9
|
+
## 0.2.0
|
|
10
|
+
|
|
11
|
+
### Minor Changes
|
|
12
|
+
|
|
13
|
+
- [`c35afe8`](https://github.com/framersai/agentos-extensions/commit/c35afe8c16fdf51df6ce2d0bb83de6cd702e3a8b) Thanks [@jddunn](https://github.com/jddunn)! - Implement all 5 guardrail extension packs with full detection logic:
|
|
14
|
+
- PII Redaction: 4-tier detection (regex + keyword + NER + LLM)
|
|
15
|
+
- Code Safety: OWASP regex patterns for SQL injection, XSS, command injection
|
|
16
|
+
- ML Classifiers: toxicity/injection/NSFW via ONNX or LLM fallback
|
|
17
|
+
- Topicality: embedding-based topic enforcement with LLM fallback
|
|
18
|
+
- Grounding Guard: NLI-based hallucination detection against RAG sources
|
|
@@ -1,163 +1,134 @@
|
|
|
1
1
|
/**
|
|
2
|
-
* @
|
|
2
|
+
* @file MLClassifierGuardrail.ts
|
|
3
|
+
* @description IGuardrailService implementation that classifies text for toxicity,
|
|
4
|
+
* prompt injection, NSFW content, and threats using a three-tier strategy:
|
|
3
5
|
*
|
|
4
|
-
*
|
|
5
|
-
*
|
|
6
|
-
*
|
|
7
|
-
*
|
|
6
|
+
* 1. **ONNX inference** — attempts to load `@huggingface/transformers` at runtime
|
|
7
|
+
* and run a lightweight ONNX classification model.
|
|
8
|
+
* 2. **LLM-as-judge** — falls back to an LLM invoker callback that prompts a
|
|
9
|
+
* language model for structured JSON safety classification.
|
|
10
|
+
* 3. **Keyword matching** — last-resort regex/keyword-based detection when neither
|
|
11
|
+
*      ONNX nor LLM is available.
|
|
8
12
|
*
|
|
9
|
-
*
|
|
13
|
+
* The guardrail is configured as Phase 2 (parallel, non-sanitizing) so it runs
|
|
14
|
+
* alongside other read-only guardrails without blocking the streaming pipeline.
|
|
10
15
|
*
|
|
11
|
-
*
|
|
12
|
-
* |---------------|----------------------------------------------------------------|
|
|
13
|
-
* | `blocking` | Every chunk that fills the sliding window is classified |
|
|
14
|
-
* | | **synchronously** — the stream waits for the result. |
|
|
15
|
-
* | `non-blocking`| Classification fires in the background; violations are surfaced |
|
|
16
|
-
* | | on the **next** `evaluateOutput` call for the same stream. |
|
|
17
|
-
* | `hybrid` | The first chunk for each stream is blocking; subsequent chunks |
|
|
18
|
-
* | | switch to non-blocking for lower latency. |
|
|
16
|
+
* ### Action thresholds
|
|
19
17
|
*
|
|
20
|
-
*
|
|
18
|
+
* - **FLAG** when any category confidence exceeds `flagThreshold` (default 0.5).
|
|
19
|
+
* - **BLOCK** when any category confidence exceeds `blockThreshold` (default 0.8).
|
|
21
20
|
*
|
|
22
|
-
* @module
|
|
21
|
+
* @module ml-classifiers/MLClassifierGuardrail
|
|
23
22
|
*/
|
|
24
|
-
import type {
|
|
25
|
-
import type {
|
|
26
|
-
import type { MLClassifierPackOptions } from './types';
|
|
27
|
-
import type { IContentClassifier } from './IContentClassifier';
|
|
23
|
+
import type { IGuardrailService, GuardrailConfig, GuardrailInputPayload, GuardrailOutputPayload, GuardrailEvaluationResult } from '@framers/agentos';
|
|
24
|
+
import type { MLClassifierOptions, ClassifierResult } from './types';
|
|
28
25
|
/**
|
|
29
|
-
*
|
|
30
|
-
*
|
|
26
|
+
* AgentOS guardrail that classifies text for safety using ML models, LLM
|
|
27
|
+
* inference, or keyword fallback.
|
|
31
28
|
*
|
|
32
29
|
* @implements {IGuardrailService}
|
|
33
|
-
*
|
|
34
|
-
* @example
|
|
35
|
-
* ```typescript
|
|
36
|
-
* const guardrail = new MLClassifierGuardrail(serviceRegistry, {
|
|
37
|
-
* classifiers: ['toxicity'],
|
|
38
|
-
* streamingMode: true,
|
|
39
|
-
* chunkSize: 150,
|
|
40
|
-
* guardrailScope: 'both',
|
|
41
|
-
* });
|
|
42
|
-
*
|
|
43
|
-
* // Input evaluation — runs classifier on the full user message.
|
|
44
|
-
* const inputResult = await guardrail.evaluateInput({ context, input });
|
|
45
|
-
*
|
|
46
|
-
* // Output evaluation — accumulates tokens, classifies at window boundary.
|
|
47
|
-
* const outputResult = await guardrail.evaluateOutput({ context, chunk });
|
|
48
|
-
* ```
|
|
49
30
|
*/
|
|
50
31
|
export declare class MLClassifierGuardrail implements IGuardrailService {
|
|
51
32
|
/**
|
|
52
|
-
* Guardrail configuration
|
|
33
|
+
* Guardrail configuration.
|
|
53
34
|
*
|
|
54
|
-
* `
|
|
55
|
-
*
|
|
35
|
+
* - `canSanitize: false` — this guardrail does not modify content; it only
|
|
36
|
+
* BLOCKs or FLAGs. This places it in Phase 2 (parallel) of the guardrail
|
|
37
|
+
* dispatcher for better performance.
|
|
38
|
+
* - `evaluateStreamingChunks: false` — only evaluates complete messages, not
|
|
39
|
+
* individual streaming deltas. ML classification on partial text produces
|
|
40
|
+
* unreliable results.
|
|
56
41
|
*/
|
|
57
42
|
readonly config: GuardrailConfig;
|
|
58
|
-
/**
|
|
59
|
-
private readonly
|
|
60
|
-
/**
|
|
61
|
-
private readonly
|
|
62
|
-
/**
|
|
63
|
-
private readonly
|
|
64
|
-
/**
|
|
65
|
-
private readonly
|
|
43
|
+
/** Categories to evaluate. */
|
|
44
|
+
private readonly categories;
|
|
45
|
+
/** Per-category flag thresholds. */
|
|
46
|
+
private readonly flagThresholds;
|
|
47
|
+
/** Per-category block thresholds. */
|
|
48
|
+
private readonly blockThresholds;
|
|
49
|
+
/** Optional LLM invoker callback for tier-2 classification. */
|
|
50
|
+
private readonly llmInvoker;
|
|
66
51
|
/**
|
|
67
|
-
*
|
|
68
|
-
*
|
|
69
|
-
*
|
|
52
|
+
* Cached reference to the `@huggingface/transformers` pipeline function.
|
|
53
|
+
* `null` means we already tried and failed to load the module.
|
|
54
|
+
* `undefined` means we have not tried yet.
|
|
70
55
|
*/
|
|
71
|
-
private
|
|
56
|
+
private onnxPipeline;
|
|
72
57
|
/**
|
|
73
|
-
*
|
|
74
|
-
*
|
|
75
|
-
*
|
|
58
|
+
* Create a new MLClassifierGuardrail.
|
|
59
|
+
*
|
|
60
|
+
* @param options - Pack-level configuration. All properties have sensible
|
|
61
|
+
* defaults for zero-config operation.
|
|
76
62
|
*/
|
|
77
|
-
|
|
63
|
+
constructor(options?: MLClassifierOptions);
|
|
78
64
|
/**
|
|
79
|
-
*
|
|
65
|
+
* Evaluate user input for safety before orchestration begins.
|
|
80
66
|
*
|
|
81
|
-
* @param
|
|
82
|
-
*
|
|
83
|
-
* @param options - Pack-level options controlling classifier selection,
|
|
84
|
-
* thresholds, sliding window size, and streaming mode.
|
|
85
|
-
* @param classifiers - Pre-built classifier instances. When provided,
|
|
86
|
-
* these are used directly instead of constructing
|
|
87
|
-
* classifiers from `options.classifiers`.
|
|
67
|
+
* @param payload - Input evaluation payload containing the user's message.
|
|
68
|
+
* @returns Guardrail result or `null` if no action is required.
|
|
88
69
|
*/
|
|
89
|
-
|
|
70
|
+
evaluateInput(payload: GuardrailInputPayload): Promise<GuardrailEvaluationResult | null>;
|
|
90
71
|
/**
|
|
91
|
-
* Evaluate
|
|
92
|
-
*
|
|
93
|
-
* Runs the full text through all registered classifiers and returns a
|
|
94
|
-
* {@link GuardrailEvaluationResult} when a violation is detected, or
|
|
95
|
-
* `null` when the content is clean.
|
|
72
|
+
* Evaluate agent output for safety. Only processes FINAL_RESPONSE chunks
|
|
73
|
+
* since `evaluateStreamingChunks` is disabled.
|
|
96
74
|
*
|
|
97
|
-
*
|
|
75
|
+
* @param payload - Output evaluation payload from the AgentOS dispatcher.
|
|
76
|
+
* @returns Guardrail result or `null` if no action is required.
|
|
77
|
+
*/
|
|
78
|
+
evaluateOutput(payload: GuardrailOutputPayload): Promise<GuardrailEvaluationResult | null>;
|
|
79
|
+
/**
|
|
80
|
+
* Classify a text string using the three-tier strategy: ONNX -> LLM -> keyword.
|
|
98
81
|
*
|
|
99
|
-
* @param
|
|
100
|
-
* @returns
|
|
82
|
+
* @param text - The text to classify.
|
|
83
|
+
* @returns Classification result with per-category scores.
|
|
101
84
|
*/
|
|
102
|
-
|
|
85
|
+
classify(text: string): Promise<ClassifierResult>;
|
|
103
86
|
/**
|
|
104
|
-
*
|
|
105
|
-
*
|
|
87
|
+
* Attempt to load `@huggingface/transformers` and run ONNX-based text
|
|
88
|
+
* classification. Returns `null` if the module is unavailable or inference
|
|
89
|
+
* fails.
|
|
106
90
|
*
|
|
107
|
-
* The
|
|
108
|
-
*
|
|
109
|
-
* evaluation strategy depends on the configured streaming mode.
|
|
91
|
+
* The module load is attempted only once; subsequent calls use the cached
|
|
92
|
+
* result (either a working pipeline or `null`).
|
|
110
93
|
*
|
|
111
|
-
*
|
|
94
|
+
* @param text - Text to classify.
|
|
95
|
+
* @returns Classification result or `null`.
|
|
112
96
|
*
|
|
113
|
-
* @
|
|
114
|
-
* @returns Evaluation result or `null` if no action is needed yet.
|
|
97
|
+
* @internal
|
|
115
98
|
*/
|
|
116
|
-
|
|
99
|
+
private tryOnnxClassification;
|
|
117
100
|
/**
|
|
118
|
-
*
|
|
119
|
-
* ready, await the classifier result before returning.
|
|
101
|
+
* Map raw ONNX text-classification output labels to our standard categories.
|
|
120
102
|
*
|
|
121
|
-
*
|
|
122
|
-
*
|
|
123
|
-
*
|
|
124
|
-
|
|
125
|
-
|
|
126
|
-
|
|
127
|
-
*
|
|
128
|
-
* ready, fire classification in the background and store the promise.
|
|
129
|
-
* On the **next** `evaluateOutput` call for the same stream, check the
|
|
130
|
-
* pending promise — if it resolved with a violation, return that result.
|
|
103
|
+
* ONNX models (e.g. toxic-bert) produce labels like `"toxic"`, `"obscene"`,
|
|
104
|
+
* `"threat"`, `"insult"`, `"identity_hate"`, etc. We map these to our four
|
|
105
|
+
* categories, taking the max score when multiple ONNX labels map to the same
|
|
106
|
+
* category.
|
|
107
|
+
*
|
|
108
|
+
* @param raw - Raw ONNX pipeline output.
|
|
109
|
+
* @returns Per-category scores.
|
|
131
110
|
*
|
|
132
|
-
* @
|
|
133
|
-
* @param textDelta - New text fragment from the current chunk.
|
|
134
|
-
* @returns A previously resolved violation result, or `null`.
|
|
111
|
+
* @internal
|
|
135
112
|
*/
|
|
136
|
-
private
|
|
113
|
+
private mapOnnxScores;
|
|
137
114
|
/**
|
|
138
|
-
*
|
|
139
|
-
* blocking mode; subsequent chunks use non-blocking.
|
|
115
|
+
* Classify text using the LLM-as-judge fallback.
|
|
140
116
|
*
|
|
141
|
-
*
|
|
142
|
-
*
|
|
143
|
-
* remainder of the stream.
|
|
117
|
+
* @param text - Text to classify.
|
|
118
|
+
* @returns Classification result or `null` if the LLM call fails.
|
|
144
119
|
*
|
|
145
|
-
* @
|
|
146
|
-
* @param textDelta - New text fragment from the current chunk.
|
|
147
|
-
* @returns Evaluation result or `null`.
|
|
120
|
+
* @internal
|
|
148
121
|
*/
|
|
149
|
-
private
|
|
122
|
+
private tryLlmClassification;
|
|
150
123
|
/**
|
|
151
|
-
* Convert a {@link
|
|
152
|
-
*
|
|
124
|
+
* Convert a {@link ClassifierResult} into a {@link GuardrailEvaluationResult},
|
|
125
|
+
* or return `null` when no thresholds are exceeded.
|
|
153
126
|
*
|
|
154
|
-
*
|
|
155
|
-
*
|
|
156
|
-
* metadata for audit/logging.
|
|
127
|
+
* @param result - Classification result from any tier.
|
|
128
|
+
* @returns Guardrail evaluation result or `null`.
|
|
157
129
|
*
|
|
158
|
-
* @
|
|
159
|
-
* @returns A guardrail result or `null` for clean content.
|
|
130
|
+
* @internal
|
|
160
131
|
*/
|
|
161
|
-
private
|
|
132
|
+
private buildResult;
|
|
162
133
|
}
|
|
163
134
|
//# sourceMappingURL=MLClassifierGuardrail.d.ts.map
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"MLClassifierGuardrail.d.ts","sourceRoot":"","sources":["../src/MLClassifierGuardrail.ts"],"names":[],"mappings":"AAAA
|
|
1
|
+
{"version":3,"file":"MLClassifierGuardrail.d.ts","sourceRoot":"","sources":["../src/MLClassifierGuardrail.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;;;;;GAqBG;AAEH,OAAO,KAAK,EACV,iBAAiB,EACjB,eAAe,EACf,qBAAqB,EACrB,sBAAsB,EACtB,yBAAyB,EAC1B,MAAM,kBAAkB,CAAC;AAG1B,OAAO,KAAK,EACV,mBAAmB,EAEnB,gBAAgB,EAEjB,MAAM,SAAS,CAAC;AASjB;;;;;GAKG;AACH,qBAAa,qBAAsB,YAAW,iBAAiB;IAK7D;;;;;;;;;OASG;IACH,QAAQ,CAAC,MAAM,EAAE,eAAe,CAG9B;IAMF,8BAA8B;IAC9B,OAAO,CAAC,QAAQ,CAAC,UAAU,CAAuB;IAElD,oCAAoC;IACpC,OAAO,CAAC,QAAQ,CAAC,cAAc,CAAqC;IAEpE,qCAAqC;IACrC,OAAO,CAAC,QAAQ,CAAC,eAAe,CAAqC;IAErE,+DAA+D;IAC/D,OAAO,CAAC,QAAQ,CAAC,UAAU,CAAoC;IAE/D;;;;OAIG;IACH,OAAO,CAAC,YAAY,CAAqC;IAMzD;;;;;OAKG;gBACS,OAAO,CAAC,EAAE,mBAAmB;IAuBzC;;;;;OAKG;IACG,aAAa,CAAC,OAAO,EAAE,qBAAqB,GAAG,OAAO,CAAC,yBAAyB,GAAG,IAAI,CAAC;IAY9F;;;;;;OAMG;IACG,cAAc,CAAC,OAAO,EAAE,sBAAsB,GAAG,OAAO,CAAC,yBAAyB,GAAG,IAAI,CAAC;IAmBhG;;;;;OAKG;IACG,QAAQ,CAAC,IAAI,EAAE,MAAM,GAAG,OAAO,CAAC,gBAAgB,CAAC;IAwBvD;;;;;;;;;;;;OAYG;YACW,qBAAqB;IAqCnC;;;;;;;;;;;;OAYG;IACH,OAAO,CAAC,aAAa;IAsCrB;;;;;;;OAOG;YACW,oBAAoB;IAuBlC;;;;;;;;OAQG;IACH,OAAO,CAAC,WAAW;CAsCpB"}
|