@framers/agentos-ext-ml-classifiers 0.1.0 → 0.2.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (51) hide show
  1. package/CHANGELOG.md +18 -0
  2. package/dist/MLClassifierGuardrail.d.ts +88 -117
  3. package/dist/MLClassifierGuardrail.d.ts.map +1 -1
  4. package/dist/MLClassifierGuardrail.js +255 -264
  5. package/dist/MLClassifierGuardrail.js.map +1 -1
  6. package/dist/classifiers/InjectionClassifier.d.ts +1 -1
  7. package/dist/classifiers/InjectionClassifier.d.ts.map +1 -1
  8. package/dist/classifiers/JailbreakClassifier.d.ts +1 -1
  9. package/dist/classifiers/JailbreakClassifier.d.ts.map +1 -1
  10. package/dist/classifiers/ToxicityClassifier.d.ts +1 -1
  11. package/dist/classifiers/ToxicityClassifier.d.ts.map +1 -1
  12. package/dist/classifiers/WorkerClassifierProxy.d.ts +1 -1
  13. package/dist/classifiers/WorkerClassifierProxy.d.ts.map +1 -1
  14. package/dist/index.d.ts +16 -90
  15. package/dist/index.d.ts.map +1 -1
  16. package/dist/index.js +33 -306
  17. package/dist/index.js.map +1 -1
  18. package/dist/keyword-classifier.d.ts +26 -0
  19. package/dist/keyword-classifier.d.ts.map +1 -0
  20. package/dist/keyword-classifier.js +113 -0
  21. package/dist/keyword-classifier.js.map +1 -0
  22. package/dist/llm-classifier.d.ts +27 -0
  23. package/dist/llm-classifier.d.ts.map +1 -0
  24. package/dist/llm-classifier.js +129 -0
  25. package/dist/llm-classifier.js.map +1 -0
  26. package/dist/tools/ClassifyContentTool.d.ts +53 -80
  27. package/dist/tools/ClassifyContentTool.d.ts.map +1 -1
  28. package/dist/tools/ClassifyContentTool.js +52 -103
  29. package/dist/tools/ClassifyContentTool.js.map +1 -1
  30. package/dist/types.d.ts +77 -277
  31. package/dist/types.d.ts.map +1 -1
  32. package/dist/types.js +9 -55
  33. package/dist/types.js.map +1 -1
  34. package/package.json +10 -16
  35. package/src/MLClassifierGuardrail.ts +279 -316
  36. package/src/index.ts +35 -339
  37. package/src/keyword-classifier.ts +130 -0
  38. package/src/llm-classifier.ts +163 -0
  39. package/src/tools/ClassifyContentTool.ts +75 -132
  40. package/src/types.ts +78 -325
  41. package/test/ClassifierOrchestrator.spec.ts +365 -0
  42. package/test/ClassifyContentTool.spec.ts +226 -0
  43. package/test/InjectionClassifier.spec.ts +263 -0
  44. package/test/JailbreakClassifier.spec.ts +295 -0
  45. package/test/MLClassifierGuardrail.spec.ts +486 -0
  46. package/test/SlidingWindowBuffer.spec.ts +391 -0
  47. package/test/ToxicityClassifier.spec.ts +268 -0
  48. package/test/WorkerClassifierProxy.spec.ts +303 -0
  49. package/test/index.spec.ts +431 -0
  50. package/tsconfig.json +20 -0
  51. package/vitest.config.ts +24 -0
@@ -1,147 +1,96 @@
1
1
  /**
2
- * @fileoverview On-demand content classification tool for AgentOS.
2
+ * @file ClassifyContentTool.ts
3
+ * @description An AgentOS tool that exposes the ML classifier as a callable tool,
4
+ * enabling agents to perform on-demand safety classification of arbitrary text.
3
5
  *
4
- * `ClassifyContentTool` exposes the ML classifier pipeline as an invocable
5
- * {@link ITool}, enabling agents and workflows to explicitly classify text
6
- * for safety signals (toxicity, prompt injection, jailbreak) on demand,
7
- * rather than relying solely on the implicit guardrail pipeline.
8
- *
9
- * Use cases:
10
- * - An agent that needs to evaluate user-generated content before storing
11
- * it in a knowledge base.
12
- * - A moderation workflow that classifies a batch of flagged messages.
13
- * - A debugging tool for inspecting classifier behaviour on specific inputs.
14
- *
15
- * The tool delegates to a {@link ClassifierOrchestrator} instance and returns
16
- * the full {@link ChunkEvaluation} (including per-classifier scores and the
17
- * aggregated recommended action).
18
- *
19
- * @module agentos/extensions/packs/ml-classifiers/tools/ClassifyContentTool
6
+ * @module ml-classifiers/tools/ClassifyContentTool
20
7
  */
21
8
  // ---------------------------------------------------------------------------
22
9
  // ClassifyContentTool
23
10
  // ---------------------------------------------------------------------------
24
11
  /**
25
- * ITool implementation that runs ML content classifiers on demand.
26
- *
27
- * The tool is read-only (`hasSideEffects: false`) — it inspects text and
28
- * returns structured classification results without modifying any state.
12
+ * AgentOS tool that classifies text for toxicity, injection, NSFW, and threat
13
+ * content using the same three-tier strategy as the guardrail.
29
14
  *
30
- * @implements {ITool<ClassifyInput, ChunkEvaluation>}
31
- *
32
- * @example
33
- * ```typescript
34
- * const tool = new ClassifyContentTool(orchestrator);
35
- * const result = await tool.execute(
36
- * { text: 'some potentially harmful text' },
37
- * executionContext,
38
- * );
39
- *
40
- * if (result.success) {
41
- * console.log(result.output.recommendedAction); // 'allow' | 'flag' | 'block' | …
42
- * }
43
- * ```
15
+ * @implements {ITool<ClassifyContentInput, ClassifyContentOutput>}
44
16
  */
45
17
  export class ClassifyContentTool {
46
- // -------------------------------------------------------------------------
47
- // ITool identity & metadata
48
- // -------------------------------------------------------------------------
49
- /** Unique tool identifier used for registration and lookup. */
18
+ // -----------------------------------------------------------------------
19
+ // ITool metadata
20
+ // -----------------------------------------------------------------------
21
+ /** Stable tool identifier. */
50
22
  id = 'classify_content';
51
- /** Functional name exposed to LLMs for tool-call invocation. */
23
+ /** Tool name presented to the LLM. */
52
24
  name = 'classify_content';
53
- /** Human-readable display name for dashboards and UI. */
54
- displayName = 'Content Safety Classifier';
55
- /** Natural-language description of the tool's purpose and behaviour. */
56
- description = 'Classify text for toxicity, prompt injection, and jailbreak attempts ' +
57
- 'using ML models. Returns per-classifier scores and an aggregated ' +
58
- 'recommended guardrail action.';
59
- /** Logical grouping for tool discovery and filtering. */
25
+ /** Human-readable display name. */
26
+ displayName = 'ML Content Classifier';
27
+ /** Description used by the LLM to decide when to invoke the tool. */
28
+ description = 'Classify text for safety across four categories: toxic, injection, nsfw, and threat. ' +
29
+ 'Returns per-category confidence scores and a flagged boolean. Use this tool to ' +
30
+ 'pre-screen user-generated content or agent output before further processing.';
31
+ /** Tool category for capability discovery grouping. */
60
32
  category = 'security';
61
- /** SemVer version of this tool implementation. */
33
+ /** Semantic version. */
62
34
  version = '1.0.0';
63
- /** This tool only reads text — it performs no mutations. */
35
+ /** Read-only analysis — no side effects. */
64
36
  hasSideEffects = false;
65
- // -------------------------------------------------------------------------
66
- // JSON Schema for input validation
67
- // -------------------------------------------------------------------------
68
- /**
69
- * JSON Schema describing the expected input arguments.
70
- *
71
- * - `text` (required): The string to classify.
72
- * - `classifiers` (optional): Array of classifier IDs to restrict evaluation.
73
- */
37
+ /** JSON Schema for tool input validation. */
74
38
  inputSchema = {
75
39
  type: 'object',
76
40
  properties: {
77
41
  text: {
78
42
  type: 'string',
79
- description: 'Text to classify for safety signals.',
80
- },
81
- classifiers: {
82
- type: 'array',
83
- items: { type: 'string' },
84
- description: 'Optional: only run these classifier IDs. When omitted all registered classifiers are used.',
43
+ description: 'The text to classify for safety.',
85
44
  },
86
45
  },
87
46
  required: ['text'],
88
47
  };
89
- // -------------------------------------------------------------------------
90
- // Internal state
91
- // -------------------------------------------------------------------------
92
- /** The orchestrator that drives the underlying ML classifiers. */
93
- orchestrator;
94
- // -------------------------------------------------------------------------
48
+ // -----------------------------------------------------------------------
49
+ // Private fields
50
+ // -----------------------------------------------------------------------
51
+ /** The guardrail instance used for classification. */
52
+ guardrail;
53
+ // -----------------------------------------------------------------------
95
54
  // Constructor
96
- // -------------------------------------------------------------------------
55
+ // -----------------------------------------------------------------------
97
56
  /**
98
57
  * Create a new ClassifyContentTool.
99
58
  *
100
- * @param orchestrator - The classifier orchestrator that will handle
101
- * parallel classification and result aggregation.
59
+ * @param guardrail - The {@link MLClassifierGuardrail} instance to delegate
60
+ * classification to. Shared and stateless (except for the
61
+ * cached ONNX pipeline).
102
62
  */
103
- constructor(orchestrator) {
104
- this.orchestrator = orchestrator;
63
+ constructor(guardrail) {
64
+ this.guardrail = guardrail;
105
65
  }
106
- // -------------------------------------------------------------------------
107
- // execute
108
- // -------------------------------------------------------------------------
66
+ // -----------------------------------------------------------------------
67
+ // ITool.execute
68
+ // -----------------------------------------------------------------------
109
69
  /**
110
- * Run all (or a subset of) ML classifiers against the provided text and
111
- * return the aggregated evaluation.
70
+ * Execute the classification against the provided text.
112
71
  *
113
- * @param args - Tool input containing the text to classify and an
114
- * optional list of classifier IDs to restrict execution.
115
- * @param _context - Execution context (unused — classification is
116
- * stateless and user-agnostic).
117
- * @returns A successful result containing the {@link ChunkEvaluation},
118
- * or a failure result if the text is missing or classification
119
- * throws an unexpected error.
72
+ * @param args - Validated input arguments containing `text`.
73
+ * @param context - Tool execution context (unused by this read-only tool).
74
+ * @returns Tool execution result wrapping the classification output.
120
75
  */
121
- async execute(args, _context) {
122
- // Validate that text is provided and non-empty.
123
- if (!args.text || args.text.trim().length === 0) {
124
- return {
125
- success: false,
126
- error: 'The "text" argument is required and must not be empty.',
127
- };
128
- }
76
+ async execute(args,
77
+ // eslint-disable-next-line @typescript-eslint/no-unused-vars
78
+ context) {
129
79
  try {
130
- // Delegate to the orchestrator for parallel classification.
131
- // NOTE: The `args.classifiers` filter is not yet implemented in the
132
- // orchestrator — it would require a filtering layer. For now, all
133
- // registered classifiers are invoked regardless.
134
- const evaluation = await this.orchestrator.classifyAll(args.text);
80
+ const result = await this.guardrail.classify(args.text);
135
81
  return {
136
82
  success: true,
137
- output: evaluation,
83
+ output: {
84
+ categories: result.categories,
85
+ flagged: result.flagged,
86
+ },
138
87
  };
139
88
  }
140
89
  catch (err) {
141
- const message = err instanceof Error ? err.message : String(err);
90
+ const message = err instanceof Error ? err.message : 'Unknown error during classification';
142
91
  return {
143
92
  success: false,
144
- error: `Classification failed: ${message}`,
93
+ error: `Content classification failed: ${message}`,
145
94
  };
146
95
  }
147
96
  }
@@ -1 +1 @@
1
- {"version":3,"file":"ClassifyContentTool.js","sourceRoot":"","sources":["../../src/tools/ClassifyContentTool.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;;;GAmBG;AAgCH,8EAA8E;AAC9E,sBAAsB;AACtB,8EAA8E;AAE9E;;;;;;;;;;;;;;;;;;;;GAoBG;AACH,MAAM,OAAO,mBAAmB;IAC9B,4EAA4E;IAC5E,4BAA4B;IAC5B,4EAA4E;IAE5E,+DAA+D;IACtD,EAAE,GAAG,kBAAkB,CAAC;IAEjC,gEAAgE;IACvD,IAAI,GAAG,kBAAkB,CAAC;IAEnC,yDAAyD;IAChD,WAAW,GAAG,2BAA2B,CAAC;IAEnD,wEAAwE;IAC/D,WAAW,GAClB,uEAAuE;QACvE,mEAAmE;QACnE,+BAA+B,CAAC;IAElC,yDAAyD;IAChD,QAAQ,GAAG,UAAU,CAAC;IAE/B,kDAAkD;IACzC,OAAO,GAAG,OAAO,CAAC;IAE3B,4DAA4D;IACnD,cAAc,GAAG,KAAK,CAAC;IAEhC,4EAA4E;IAC5E,mCAAmC;IACnC,4EAA4E;IAE5E;;;;;OAKG;IACM,WAAW,GAAqB;QACvC,IAAI,EAAE,QAAQ;QACd,UAAU,EAAE;YACV,IAAI,EAAE;gBACJ,IAAI,EAAE,QAAQ;gBACd,WAAW,EAAE,sCAAsC;aACpD;YACD,WAAW,EAAE;gBACX,IAAI,EAAE,OAAO;gBACb,KAAK,EAAE,EAAE,IAAI,EAAE,QAAQ,EAAE;gBACzB,WAAW,EACT,4FAA4F;aAC/F;SACF;QACD,QAAQ,EAAE,CAAC,MAAM,CAAC;KACnB,CAAC;IAEF,4EAA4E;IAC5E,iBAAiB;IACjB,4EAA4E;IAE5E,kEAAkE;IACjD,YAAY,CAAyB;IAEtD,4EAA4E;IAC5E,cAAc;IACd,4EAA4E;IAE5E;;;;;OAKG;IACH,YAAY,YAAoC;QAC9C,IAAI,CAAC,YAAY,GAAG,YAAY,CAAC;IACnC,CAAC;IAED,4EAA4E;IAC5E,UAAU;IACV,4EAA4E;IAE5E;;;;;;;;;;;OAWG;IACH,KAAK,CAAC,OAAO,CACX,IAAmB,EACnB,QAA8B;QAE9B,gDAAgD;QAChD,IAAI,CAAC,IAAI,CAAC,IAAI,IAAI,IAAI,CAAC,IAAI,CAAC,IAAI,EAAE,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;YAChD,OAAO;gBACL,OAAO,EAAE,KAAK;gBACd,KAAK,EAAE,wDAAwD;aAChE,CAAC;QACJ,CAAC;QAED,IAAI,CAAC;YACH,4DAA4D;YAC5D,oEAAoE;YACpE,mEAAmE;YACnE,iDAAiD;YACjD,MAAM,UAAU,GAAG,MAAM,IAAI,CAAC,YAAY,CAAC,WAAW,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;YAElE,OAAO;gBACL,OAAO,EAAE,IAAI;gBACb,MAAM,EAAE,UAAU;aACnB,CAAC;QACJ,CAAC;QAAC,OAAO,GAAY,EAAE,CAAC;YACtB,MAAM,OAAO,GAAG,GAAG,YAAY,KAAK,CAAC,CAAC,CAAC,GAAG,CAAC,OAAO,CAAC,CAAC,CAAC,MAAM,CAAC,GAAG,CAAC,CAAC;YACjE,OAAO;gBACL,OAAO,EAAE,KAAK;gBACd,KAAK,EAAE,0BAA0B,OAAO,EAAE;aAC3C,CAAC;QACJ,CAAC;IACH,CAAC;CACF"}
1
+ {"version":3,"file":"ClassifyContentTool.js","sourceRoot":"","sources":["../../src/tools/ClassifyContentTool.ts"],"names":[],"mappings":"AAAA;;;;;;GAMG;AA6BH,8EAA8E;AAC9E,sBAAsB;AACtB,8EAA8E;AAE9E;;;;;GAKG;AACH,MAAM,OAAO,mBAAmB;IAC9B,0EAA0E;IAC1E,iBAAiB;IACjB,0EAA0E;IAE1E,8BAA8B;IACrB,EAAE,GAAG,kBAAkB,CAAC;IAEjC,sCAAsC;IAC7B,IAAI,GAAG,kBAAkB,CAAC;IAEnC,mCAAmC;IAC1B,WAAW,GAAG,uBAAuB,CAAC;IAE/C,qEAAqE;IAC5D,WAAW,GAClB,uFAAuF;QACvF,iFAAiF;QACjF,8EAA8E,CAAC;IAEjF,uDAAuD;IAC9C,QAAQ,GAAG,UAAU,CAAC;IAE/B,wBAAwB;IACf,OAAO,GAAG,OAAO,CAAC;IAE3B,4CAA4C;IACnC,cAAc,GAAG,KAAK,CAAC;IAEhC,6CAA6C;IACpC,WAAW,GAAG;QACrB,IAAI,EAAE,QAAiB;QACvB,UAAU,EAAE;YACV,IAAI,EAAE;gBACJ,IAAI,EAAE,QAAiB;gBACvB,WAAW,EAAE,kCAAkC;aAChD;SACF;QACD,QAAQ,EAAE,CAAC,MAAM,CAAC;KACnB,CAAC;IAEF,0EAA0E;IAC1E,iBAAiB;IACjB,0EAA0E;IAE1E,sDAAsD;IACrC,SAAS,CAAwB;IAElD,0EAA0E;IAC1E,cAAc;IACd,0EAA0E;IAE1E;;;;;;OAMG;IACH,YAAY,SAAgC;QAC1C,IAAI,CAAC,SAAS,GAAG,SAAS,CAAC;IAC7B,CAAC;IAED,0EAA0E;IAC1E,gBAAgB;IAChB,0EAA0E;IAE1E;;;;;;OAMG;IACH,KAAK,CAAC,OAAO,CACX,IAA0B;IAC1B,6DAA6D;IAC7D,OAA6B;QAE7B,IAAI,CAAC;YACH,MAAM,MAAM,GAAG,MAAM,IAAI,CAAC,SAAS,CAAC,QAAQ,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;YAExD,OAAO;gBACL,OAAO,EAAE,IAAI;gBACb,MAAM,EAAE;oBACN,UAAU,EAAE,MAAM,CAAC,UAAU;oBAC7B,OAAO,EAAE,MAAM,CAAC,OAAO;iBACxB;aACF,CAAC;QACJ,CAAC;QAAC,OAAO,GAAY,EAAE,CAAC;YACtB,MAAM,OAAO,GAAG,GAAG,YAAY,KAAK,CAAC,CAAC,CAAC,GAAG,CAAC,OAAO,CAAC,CAAC,CAAC,qCAAqC,CAAC;YAE3F,OAAO;gBACL,OAAO,EAAE,KAAK;gBACd,KAAK,EAAE,kCAAkC,OAAO,EAAE;aACnD,CAAC;QACJ,CAAC;IACH,CAAC;CACF"}
package/dist/types.d.ts CHANGED
@@ -1,319 +1,119 @@
1
1
  /**
2
- * @fileoverview Core type definitions for the ML Classifier Guardrail Extension Pack.
2
+ * @file types.ts
3
+ * @description Core type definitions for the ML Classifiers extension pack.
3
4
  *
4
- * This file defines all configuration shapes, runtime result types, and
5
- * service-identifier constants used by the ML classifier pipeline. All
6
- * classifiers in this pack evaluate text content against learned models
7
- * (toxicity, prompt-injection, jailbreak) and emit structured results that
8
- * feed into the AgentOS guardrail decision tree.
5
+ * Defines the shared interfaces used across the ML classification system:
6
+ * classifier categories, confidence results, option shapes, and the LLM
7
+ * invoker callback signature.
9
8
  *
10
- * Import hierarchy
11
- * ----------------
12
- * ```
13
- * IUtilityAI ──── ClassificationResult, ClassificationScore
14
- * IGuardrailService ── GuardrailAction
15
- * │
16
- * ▼
17
- * types.ts (this file)
18
- * │
19
- * ▼
20
- * IContentClassifier.ts / SlidingWindowBuffer.ts / …
21
- * ```
22
- *
23
- * @module agentos/extensions/packs/ml-classifiers/types
9
+ * @module ml-classifiers/types
24
10
  */
25
- import type { ClassificationResult, ClassificationScore } from '@framers/agentos';
26
- import type { GuardrailAction } from '@framers/agentos';
27
- export type { ClassificationResult, ClassificationScore };
28
11
  /**
29
- * Numeric thresholds that map raw classifier confidence scores (0–1) to
30
- * guardrail actions.
12
+ * Safety categories evaluated by the ML classifier.
31
13
  *
32
- * The thresholds are applied in descending priority:
33
- * 1. `score >= blockThreshold` → {@link GuardrailAction.BLOCK}
34
- * 2. `score >= flagThreshold` → {@link GuardrailAction.FLAG}
35
- * 3. `score >= warnThreshold` → {@link GuardrailAction.SANITIZE}
36
- * 4. otherwise → {@link GuardrailAction.ALLOW}
14
+ * - `'toxic'` — Hateful, abusive, or threatening language.
15
+ * - `'injection'` — Prompt injection or jailbreak attempts.
16
+ * - `'nsfw'` — Sexually explicit or adult content.
17
+ * - `'threat'` — Direct threats of violence or self-harm.
37
18
  */
38
- export interface ClassifierThresholds {
39
- /**
40
- * Minimum score at which content is **blocked** (interaction terminated).
41
- * Must be in the range [0, 1]. Typical default: `0.9`.
42
- */
43
- blockThreshold: number;
44
- /**
45
- * Minimum score at which content is **flagged** for review while still
46
- * being allowed through. Must be in the range [0, 1]. Typical default: `0.7`.
47
- */
48
- flagThreshold: number;
49
- /**
50
- * Minimum score at which a **warn** action is taken (e.g. the chunk is
51
- * sanitised or a warning is appended to the response). Must be in the range
52
- * [0, 1]. Typical default: `0.4`.
53
- */
54
- warnThreshold: number;
55
- }
19
+ export type ClassifierCategory = 'toxic' | 'injection' | 'nsfw' | 'threat';
56
20
  /**
57
- * Sensible defaults for {@link ClassifierThresholds}.
58
- *
59
- * These values reflect a conservative-but-pragmatic policy:
60
- * - block at 90 % confidence → very high bar, minimises false positives
61
- * - flag at 70 % → surfaced for human review, not blocked
62
- * - warn at 40 % → low-confidence signal, handled with a light touch
21
+ * All supported classifier categories as a constant array, used for
22
+ * iteration and default configuration.
63
23
  */
64
- export declare const DEFAULT_THRESHOLDS: ClassifierThresholds;
24
+ export declare const ALL_CATEGORIES: ClassifierCategory[];
65
25
  /**
66
- * Configuration for a single ML classifier pipeline.
26
+ * Confidence score for a single safety category.
67
27
  *
68
- * Allows individual classifiers to override the pack-level defaults for the
69
- * model variant and decision thresholds, and to customise which guardrail
70
- * action is taken for each classification label.
28
+ * Scores are normalised to the range `[0, 1]`, where `0` means "no signal"
29
+ * and `1` means "extremely confident match".
71
30
  */
72
- export interface ClassifierConfig {
73
- /**
74
- * Hugging Face model identifier (e.g. `"Xenova/toxic-bert"`) or a local
75
- * model path to load instead of the pack default.
76
- * @optional Falls back to the pack-level `MLClassifierPackOptions.modelCacheDir` default.
77
- */
78
- modelId?: string;
79
- /**
80
- * Per-classifier threshold overrides.
81
- * @optional Falls back to {@link DEFAULT_THRESHOLDS}.
82
- */
83
- thresholds?: Partial<ClassifierThresholds>;
84
- /**
85
- * Maps classification labels to the guardrail action that should be taken
86
- * when that label is the winning class.
87
- *
88
- * @example
89
- * ```typescript
90
- * // Always block on TOXIC label regardless of threshold.
91
- * labelActions: { TOXIC: GuardrailAction.BLOCK }
92
- * ```
93
- */
94
- labelActions?: Record<string, GuardrailAction>;
31
+ export interface CategoryScore {
32
+ /** The safety category this score applies to. */
33
+ name: ClassifierCategory;
34
+ /** Normalised confidence score in the range [0, 1]. */
35
+ confidence: number;
95
36
  }
96
37
  /**
97
- * Configuration for browser-side model execution.
38
+ * Complete result from a classification pass over a text input.
98
39
  *
99
- * When the ML classifier pack is loaded in a browser context (e.g. a chat
100
- * widget), models run inside a Web Worker to avoid blocking the main thread.
101
- * This interface controls worker lifecycle and cache management.
40
+ * Includes per-category scores and an overall `flagged` boolean that is
41
+ * `true` when any category exceeds the configured flag threshold (default 0.5).
102
42
  */
103
- export interface BrowserConfig {
43
+ export interface ClassifierResult {
104
44
  /**
105
- * Run model inference in a Web Worker.
106
- * @default true
45
+ * Per-category confidence scores, one entry for each category that was
46
+ * evaluated.
107
47
  */
108
- useWebWorker?: boolean;
48
+ categories: CategoryScore[];
109
49
  /**
110
- * Caching strategy for downloaded model weights.
111
- * - `'memory'` — keep weights in memory only (lost on page unload)
112
- * - `'indexeddb'` — persist weights to IndexedDB (survives reloads)
113
- * - `'none'` — no caching; re-download on every page load
114
- * @default 'indexeddb'
50
+ * `true` when at least one category score exceeds the flag threshold.
51
+ * Convenience field equivalent to `categories.some(c => c.confidence > flagThreshold)`.
115
52
  */
116
- cacheStrategy?: 'memory' | 'indexeddb' | 'none';
53
+ flagged: boolean;
117
54
  /**
118
- * Maximum number of model shards to keep in the in-memory cache when
119
- * `cacheStrategy === 'memory'`. Oldest entries are evicted LRU-style.
120
- * @default 3
55
+ * Which classification backend produced this result.
56
+ * Useful for logging and debugging which tier was active.
121
57
  */
122
- maxCacheSize?: number;
123
- /**
124
- * Callback invoked with download progress as model weights are fetched.
125
- * Useful for showing a progress bar in the UI.
126
- *
127
- * @param progress - Current progress state.
128
- */
129
- onProgress?: (progress: ModelDownloadProgress) => void;
130
- }
131
- /**
132
- * Progress report emitted during model weight downloads.
133
- *
134
- * @example
135
- * ```typescript
136
- * onProgress({ modelId: 'Xenova/toxic-bert', loaded: 50_000, total: 200_000, percent: 25 })
137
- * ```
138
- */
139
- export interface ModelDownloadProgress {
140
- /** Identifier of the model being downloaded (Hugging Face ID or path). */
141
- modelId: string;
142
- /** Number of bytes downloaded so far. */
143
- loaded: number;
144
- /** Total number of bytes to download (`0` if unknown). */
145
- total: number;
146
- /** Download progress as a percentage in the range [0, 100]. */
147
- percent: number;
58
+ source: 'onnx' | 'llm' | 'keyword';
148
59
  }
149
60
  /**
150
- * Top-level configuration for the ML Classifier Extension Pack.
61
+ * Callback signature for invoking an LLM to perform classification when
62
+ * ONNX models are unavailable.
151
63
  *
152
- * Passed to `createMLClassifierPack()` (or the NestJS module factory) to
153
- * control which classifiers are active, how models are loaded, and how the
154
- * sliding-window streaming evaluation behaves.
64
+ * The callback receives a system prompt and a user message and returns
65
+ * the raw LLM text response. The caller is responsible for parsing the
66
+ * JSON output.
155
67
  *
156
- * @example
157
- * ```typescript
158
- * const packOptions: MLClassifierPackOptions = {
159
- * classifiers: ['toxicity', 'jailbreak'],
160
- * quantized: true,
161
- * runtime: 'node',
162
- * thresholds: { blockThreshold: 0.95, flagThreshold: 0.75, warnThreshold: 0.5 },
163
- * streamingMode: true,
164
- * chunkSize: 150,
165
- * contextSize: 50,
166
- * };
167
- * ```
68
+ * @param systemPrompt - Instruction prompt describing the classification task.
69
+ * @param userMessage - The text to classify.
70
+ * @returns The raw string response from the LLM.
168
71
  */
169
- export interface MLClassifierPackOptions {
170
- /**
171
- * Subset of built-in classifiers to activate.
172
- * Omit or pass an empty array to activate all built-in classifiers.
173
- *
174
- * @example `['toxicity', 'injection']`
175
- */
176
- classifiers?: Array<'toxicity' | 'injection' | 'jailbreak'>;
177
- /**
178
- * Fully-qualified `IContentClassifier` instances to add alongside the
179
- * built-in classifiers (e.g. domain-specific harm classifiers).
180
- */
181
- customClassifiers?: import('./IContentClassifier').IContentClassifier[];
182
- /**
183
- * Local filesystem path where downloaded model weights are cached.
184
- * Defaults to `~/.cache/agentos/ml-classifiers`.
185
- */
186
- modelCacheDir?: string;
187
- /**
188
- * Use 8-bit quantised model variants when available.
189
- * Reduces VRAM/RAM footprint and increases inference speed at a small
190
- * accuracy cost.
191
- * @default false
192
- */
193
- quantized?: boolean;
194
- /**
195
- * Execution runtime for model inference.
196
- * - `'node'` — Runs via `@xenova/transformers` in the Node.js process.
197
- * - `'browser'` — Runs via `@xenova/transformers` in a Web Worker.
198
- * - `'wasm'` — Explicit WebAssembly fallback (Node.js or browser).
199
- * @default 'node'
200
- */
201
- runtime?: 'node' | 'browser' | 'wasm';
202
- /**
203
- * Browser-specific options. Only applicable when `runtime === 'browser'`.
204
- */
205
- browser?: BrowserConfig;
206
- /**
207
- * Number of tokens per evaluation window when streaming mode is enabled.
208
- * Smaller values detect issues earlier but increase evaluation frequency.
209
- * @default 200
210
- */
211
- chunkSize?: number;
212
- /**
213
- * Number of tokens from the previous chunk to carry forward as context into
214
- * the next window, preventing boundary effects.
215
- * @default 50
216
- */
217
- contextSize?: number;
218
- /**
219
- * Maximum number of classifier evaluations per stream. The sliding window
220
- * stops advancing after this many evaluations, allowing the stream to
221
- * complete without further overhead.
222
- * @default 100
223
- */
224
- maxEvaluations?: number;
225
- /**
226
- * Enable sliding-window evaluation for streamed (token-by-token) output.
227
- * When `false`, classifiers only run on the completed final response.
228
- * @default false
229
- */
230
- streamingMode?: boolean;
231
- /**
232
- * Pack-level threshold defaults applied to every classifier unless
233
- * overridden by a per-classifier {@link ClassifierConfig}.
234
- */
235
- thresholds?: Partial<ClassifierThresholds>;
236
- /**
237
- * Scope of guardrail enforcement.
238
- * - `'input'` — Evaluate user messages before orchestration.
239
- * - `'output'` — Evaluate agent responses before delivery.
240
- * - `'both'` — Evaluate at both stages.
241
- * @default 'both'
242
- */
243
- guardrailScope?: 'input' | 'output' | 'both';
244
- }
245
- /**
246
- * Well-known service identifier strings for the three built-in ML classifier
247
- * pipelines.
248
- *
249
- * These IDs follow the `agentos:<domain>:<name>` naming convention used
250
- * throughout the AgentOS extension ecosystem. Use them to retrieve specific
251
- * classifier services from the shared service registry.
252
- *
253
- * @example
254
- * ```typescript
255
- * const toxicity = serviceRegistry.get(ML_CLASSIFIER_SERVICE_IDS.TOXICITY_PIPELINE);
256
- * ```
257
- */
258
- export declare const ML_CLASSIFIER_SERVICE_IDS: {
259
- /** Classifier that detects toxic, hateful, or abusive language. */
260
- readonly TOXICITY_PIPELINE: "agentos:ml-classifiers:toxicity-pipeline";
261
- /** Classifier that detects prompt-injection attempts. */
262
- readonly INJECTION_PIPELINE: "agentos:ml-classifiers:injection-pipeline";
263
- /** Classifier that detects jailbreak / system-override attempts. */
264
- readonly JAILBREAK_PIPELINE: "agentos:ml-classifiers:jailbreak-pipeline";
265
- };
266
- /** Union type of all ML classifier service ID strings. */
267
- export type MLClassifierServiceId = (typeof ML_CLASSIFIER_SERVICE_IDS)[keyof typeof ML_CLASSIFIER_SERVICE_IDS];
72
+ export type LlmInvoker = (systemPrompt: string, userMessage: string) => Promise<string>;
268
73
  /**
269
- * A {@link ClassificationResult} augmented with provenance metadata.
74
+ * Configuration options for the ML Classifiers extension pack.
270
75
  *
271
- * Produced when a classifier evaluates a chunk of text. Carries the
272
- * classifier's identity and the wall-clock latency so callers can build
273
- * audit trails and SLO dashboards.
76
+ * All properties are optional. Sensible defaults allow zero-config operation
77
+ * using the keyword fallback classifier.
274
78
  */
275
- export interface AnnotatedClassificationResult extends ClassificationResult {
79
+ export interface MLClassifierOptions {
276
80
  /**
277
- * The {@link IContentClassifier.id} of the classifier that produced this
278
- * result (e.g. `ML_CLASSIFIER_SERVICE_IDS.TOXICITY_PIPELINE`).
81
+ * Which safety categories to evaluate.
82
+ * @default ALL_CATEGORIES
279
83
  */
280
- classifierId: string;
84
+ categories?: ClassifierCategory[];
281
85
  /**
282
- * Wall-clock time in milliseconds from when `classify()` was called to when
283
- * it resolved.
284
- */
285
- latencyMs: number;
286
- }
287
- /**
288
- * Aggregated evaluation outcome for a single sliding-window chunk.
289
- *
290
- * Produced by running all active classifiers against one text window and
291
- * collating their results into a single action recommendation.
292
- *
293
- * The `recommendedAction` is the most restrictive action across all
294
- * classifiers (BLOCK > FLAG > SANITIZE > ALLOW).
295
- */
296
- export interface ChunkEvaluation {
297
- /**
298
- * Individual results from every classifier that evaluated this chunk,
299
- * in the order the classifiers were invoked.
86
+ * Per-category confidence thresholds that override the global defaults.
87
+ *
88
+ * Keys are category names; values are threshold overrides with optional
89
+ * `flag` and `block` levels.
90
+ *
91
+ * @example `{ toxic: { flag: 0.4, block: 0.7 } }`
300
92
  */
301
- results: AnnotatedClassificationResult[];
93
+ thresholds?: Partial<Record<ClassifierCategory, {
94
+ flag?: number;
95
+ block?: number;
96
+ }>>;
302
97
  /**
303
- * The most restrictive guardrail action recommended across all results.
304
- * The pipeline should act on this value rather than iterating `results`
305
- * manually.
98
+ * Global flag threshold applied to all categories that do not have a
99
+ * per-category override.
100
+ * @default 0.5
306
101
  */
307
- recommendedAction: GuardrailAction;
102
+ flagThreshold?: number;
308
103
  /**
309
- * ID of the classifier that triggered the `recommendedAction`, or `null`
310
- * if the action is {@link GuardrailAction.ALLOW} (no classifier triggered).
104
+ * Global block threshold applied to all categories that do not have a
105
+ * per-category override.
106
+ * @default 0.8
311
107
  */
312
- triggeredBy: string | null;
108
+ blockThreshold?: number;
313
109
  /**
314
- * Sum of all classifier `latencyMs` values — useful for profiling the
315
- * total evaluation overhead per chunk.
110
+ * Optional LLM invoker callback. When provided and ONNX models are
111
+ * unavailable, the classifier will fall back to LLM-as-judge classification
112
+ * using this callback.
113
+ *
114
+ * When omitted AND ONNX models are unavailable, the classifier falls back
115
+ * to keyword-based detection.
316
116
  */
317
- totalLatencyMs: number;
117
+ llmInvoker?: LlmInvoker;
318
118
  }
319
119
  //# sourceMappingURL=types.d.ts.map
@@ -1 +1 @@
1
- {"version":3,"file":"types.d.ts","sourceRoot":"","sources":["../src/types.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;;;;;;;GAuBG;AAEH,OAAO,KAAK,EAAE,oBAAoB,EAAE,mBAAmB,EAAE,MAAM,kBAAkB,CAAC;AAClF,OAAO,KAAK,EAAE,eAAe,EAAE,MAAM,kBAAkB,CAAC;AAGxD,YAAY,EAAE,oBAAoB,EAAE,mBAAmB,EAAE,CAAC;AAM1D;;;;;;;;;GASG;AACH,MAAM,WAAW,oBAAoB;IACnC;;;OAGG;IACH,cAAc,EAAE,MAAM,CAAC;IAEvB;;;OAGG;IACH,aAAa,EAAE,MAAM,CAAC;IAEtB;;;;OAIG;IACH,aAAa,EAAE,MAAM,CAAC;CACvB;AAED;;;;;;;GAOG;AACH,eAAO,MAAM,kBAAkB,EAAE,oBAIvB,CAAC;AAMX;;;;;;GAMG;AACH,MAAM,WAAW,gBAAgB;IAC/B;;;;OAIG;IACH,OAAO,CAAC,EAAE,MAAM,CAAC;IAEjB;;;OAGG;IACH,UAAU,CAAC,EAAE,OAAO,CAAC,oBAAoB,CAAC,CAAC;IAE3C;;;;;;;;;OASG;IACH,YAAY,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,eAAe,CAAC,CAAC;CAChD;AAMD;;;;;;GAMG;AACH,MAAM,WAAW,aAAa;IAC5B;;;OAGG;IACH,YAAY,CAAC,EAAE,OAAO,CAAC;IAEvB;;;;;;OAMG;IACH,aAAa,CAAC,EAAE,QAAQ,GAAG,WAAW,GAAG,MAAM,CAAC;IAEhD;;;;OAIG;IACH,YAAY,CAAC,EAAE,MAAM,CAAC;IAEtB;;;;;OAKG;IACH,UAAU,CAAC,EAAE,CAAC,QAAQ,EAAE,qBAAqB,KAAK,IAAI,CAAC;CACxD;AAMD;;;;;;;GAOG;AACH,MAAM,WAAW,qBAAqB;IACpC,0EAA0E;IAC1E,OAAO,EAAE,MAAM,CAAC;IAEhB,yCAAyC;IACzC,MAAM,EAAE,MAAM,CAAC;IAEf,0DAA0D;IAC1D,KAAK,EAAE,MAAM,CAAC;IAEd,+DAA+D;IAC/D,OAAO,EAAE,MAAM,CAAC;CACjB;AAMD;;;;;;;;;;;;;;;;;;;GAmBG;AACH,MAAM,WAAW,uBAAuB;IACtC;;;;;OAKG;IACH,WAAW,CAAC,EAAE,KAAK,CAAC,UAAU,GAAG,WAAW,GAAG,WAAW,CAAC,CAAC;IAE5D;;;OAGG;IACH,iBAAiB,CAAC,EAAE,OAAO,sBAAsB,EAAE,kBAAkB,EAAE,CAAC;IAExE;;;OAGG;IACH,aAAa,CAAC,EAAE,MAAM,CAAC;IAEvB;;;;;OAKG;IACH,SAAS,CAAC,EAAE,OAAO,CAAC;IAEpB;;;;;;OAMG;IACH,OAAO,CAAC,EAAE,MAAM,GAAG,SAAS,GAAG,MAAM,CAAC;IAEtC;;OAEG;IACH,OAAO,CAAC,EAAE,aAAa,CAAC;IAExB;;;;OAIG;IACH,SAAS,CAAC,EAAE,MAAM,CAAC;IAEnB;;;;OAIG;IACH,WAAW,CAAC,EAAE,MAAM,CAAC;IAErB;;;;;OAKG;IACH,cAAc,CAAC,EAAE,MAAM,CAAC;IAExB;;;;OAIG;IACH,aAAa,CAAC,EAAE,OAAO,CAAC;IAExB;;;OAGG;IACH,UAAU,CAAC,EAAE,OAAO,CAAC,oBAAoB,CAAC,CAAC;IAE3C;;;;;;OAMG;IACH,cAAc,CAAC,EAAE,OAAO,GAAG,QAAQ,GAAG,MAAM,CAAC;CAC9C;AAMD;;;;;;;;;;;;GAYG;AACH,eAAO,MAAM,yBAAyB;IACpC,mEAAmE;;IAGnE,yDAAyD;;IAGzD,oEAAoE;;CAE5D,CAAC;AAEX,0
DAA0D;AAC1D,MAAM,MAAM,qBAAqB,GAC/B,CAAC,OAAO,yBAAyB,CAAC,CAAC,MAAM,OAAO,yBAAyB,CAAC,CAAC;AAM7E;;;;;;GAMG;AACH,MAAM,WAAW,6BAA8B,SAAQ,oBAAoB;IACzE;;;OAGG;IACH,YAAY,EAAE,MAAM,CAAC;IAErB;;;OAGG;IACH,SAAS,EAAE,MAAM,CAAC;CACnB;AAED;;;;;;;;GAQG;AACH,MAAM,WAAW,eAAe;IAC9B;;;OAGG;IACH,OAAO,EAAE,6BAA6B,EAAE,CAAC;IAEzC;;;;OAIG;IACH,iBAAiB,EAAE,eAAe,CAAC;IAEnC;;;OAGG;IACH,WAAW,EAAE,MAAM,GAAG,IAAI,CAAC;IAE3B;;;OAGG;IACH,cAAc,EAAE,MAAM,CAAC;CACxB"}
1
+ {"version":3,"file":"types.d.ts","sourceRoot":"","sources":["../src/types.ts"],"names":[],"mappings":"AAAA;;;;;;;;;GASG;AAMH;;;;;;;GAOG;AACH,MAAM,MAAM,kBAAkB,GAAG,OAAO,GAAG,WAAW,GAAG,MAAM,GAAG,QAAQ,CAAC;AAE3E;;;GAGG;AACH,eAAO,MAAM,cAAc,EAAE,kBAAkB,EAA6C,CAAC;AAM7F;;;;;GAKG;AACH,MAAM,WAAW,aAAa;IAC5B,iDAAiD;IACjD,IAAI,EAAE,kBAAkB,CAAC;IAEzB,uDAAuD;IACvD,UAAU,EAAE,MAAM,CAAC;CACpB;AAED;;;;;GAKG;AACH,MAAM,WAAW,gBAAgB;IAC/B;;;OAGG;IACH,UAAU,EAAE,aAAa,EAAE,CAAC;IAE5B;;;OAGG;IACH,OAAO,EAAE,OAAO,CAAC;IAEjB;;;OAGG;IACH,MAAM,EAAE,MAAM,GAAG,KAAK,GAAG,SAAS,CAAC;CACpC;AAMD;;;;;;;;;;;GAWG;AACH,MAAM,MAAM,UAAU,GAAG,CAAC,YAAY,EAAE,MAAM,EAAE,WAAW,EAAE,MAAM,KAAK,OAAO,CAAC,MAAM,CAAC,CAAC;AAMxF;;;;;GAKG;AACH,MAAM,WAAW,mBAAmB;IAClC;;;OAGG;IACH,UAAU,CAAC,EAAE,kBAAkB,EAAE,CAAC;IAElC;;;;;;;OAOG;IACH,UAAU,CAAC,EAAE,OAAO,CAAC,MAAM,CAAC,kBAAkB,EAAE;QAAE,IAAI,CAAC,EAAE,MAAM,CAAC;QAAC,KAAK,CAAC,EAAE,MAAM,CAAA;KAAE,CAAC,CAAC,CAAC;IAEpF;;;;OAIG;IACH,aAAa,CAAC,EAAE,MAAM,CAAC;IAEvB;;;;OAIG;IACH,cAAc,CAAC,EAAE,MAAM,CAAC;IAExB;;;;;;;OAOG;IACH,UAAU,CAAC,EAAE,UAAU,CAAC;CACzB"}