npm - @framers/agentos-ext-ml-classifiers - Versions diffs - 0.1.0 - Mend

@framers/agentos-ext-ml-classifiers 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (62) hide show

package/LICENSE +23 -0
package/dist/ClassifierOrchestrator.d.ts +126 -0
package/dist/ClassifierOrchestrator.d.ts.map +1 -0
package/dist/ClassifierOrchestrator.js +239 -0
package/dist/ClassifierOrchestrator.js.map +1 -0
package/dist/IContentClassifier.d.ts +117 -0
package/dist/IContentClassifier.d.ts.map +1 -0
package/dist/IContentClassifier.js +22 -0
package/dist/IContentClassifier.js.map +1 -0
package/dist/MLClassifierGuardrail.d.ts +163 -0
package/dist/MLClassifierGuardrail.d.ts.map +1 -0
package/dist/MLClassifierGuardrail.js +335 -0
package/dist/MLClassifierGuardrail.js.map +1 -0
package/dist/SlidingWindowBuffer.d.ts +213 -0
package/dist/SlidingWindowBuffer.d.ts.map +1 -0
package/dist/SlidingWindowBuffer.js +246 -0
package/dist/SlidingWindowBuffer.js.map +1 -0
package/dist/classifiers/InjectionClassifier.d.ts +126 -0
package/dist/classifiers/InjectionClassifier.d.ts.map +1 -0
package/dist/classifiers/InjectionClassifier.js +210 -0
package/dist/classifiers/InjectionClassifier.js.map +1 -0
package/dist/classifiers/JailbreakClassifier.d.ts +124 -0
package/dist/classifiers/JailbreakClassifier.d.ts.map +1 -0
package/dist/classifiers/JailbreakClassifier.js +208 -0
package/dist/classifiers/JailbreakClassifier.js.map +1 -0
package/dist/classifiers/ToxicityClassifier.d.ts +125 -0
package/dist/classifiers/ToxicityClassifier.d.ts.map +1 -0
package/dist/classifiers/ToxicityClassifier.js +212 -0
package/dist/classifiers/ToxicityClassifier.js.map +1 -0
package/dist/classifiers/WorkerClassifierProxy.d.ts +158 -0
package/dist/classifiers/WorkerClassifierProxy.d.ts.map +1 -0
package/dist/classifiers/WorkerClassifierProxy.js +268 -0
package/dist/classifiers/WorkerClassifierProxy.js.map +1 -0
package/dist/index.d.ts +110 -0
package/dist/index.d.ts.map +1 -0
package/dist/index.js +342 -0
package/dist/index.js.map +1 -0
package/dist/tools/ClassifyContentTool.d.ts +105 -0
package/dist/tools/ClassifyContentTool.d.ts.map +1 -0
package/dist/tools/ClassifyContentTool.js +149 -0
package/dist/tools/ClassifyContentTool.js.map +1 -0
package/dist/types.d.ts +319 -0
package/dist/types.d.ts.map +1 -0
package/dist/types.js +62 -0
package/dist/types.js.map +1 -0
package/dist/worker/classifier-worker.d.ts +49 -0
package/dist/worker/classifier-worker.d.ts.map +1 -0
package/dist/worker/classifier-worker.js +180 -0
package/dist/worker/classifier-worker.js.map +1 -0
package/package.json +45 -0
package/src/ClassifierOrchestrator.ts +290 -0
package/src/IContentClassifier.ts +124 -0
package/src/MLClassifierGuardrail.ts +419 -0
package/src/SlidingWindowBuffer.ts +384 -0
package/src/classifiers/InjectionClassifier.ts +261 -0
package/src/classifiers/JailbreakClassifier.ts +259 -0
package/src/classifiers/ToxicityClassifier.ts +263 -0
package/src/classifiers/WorkerClassifierProxy.ts +366 -0
package/src/index.ts +383 -0
package/src/tools/ClassifyContentTool.ts +201 -0
package/src/types.ts +391 -0
package/src/worker/classifier-worker.ts +267 -0

package/src/index.ts ADDED Viewed

@@ -0,0 +1,383 @@
+/**
+ * @fileoverview Pack factory for the ML Classifier Guardrail Extension Pack.
+ *
+ * Exports the main `createMLClassifierPack()` factory that assembles the
+ * ML classifier guardrail and the `classify_content` tool into a single
+ * {@link ExtensionPack} ready for registration with the AgentOS extension
+ * manager.
+ *
+ * Also exports a `createExtensionPack()` bridge function that conforms to
+ * the AgentOS manifest factory convention, delegating to
+ * `createMLClassifierPack()` with options extracted from the
+ * {@link ExtensionPackContext}.
+ *
+ * ### Default behaviour (zero-config)
+ * When called without arguments, all three built-in classifiers (toxicity,
+ * prompt-injection, jailbreak) are active using their default model IDs and
+ * the default threshold set:
+ *  - block at 0.90 confidence
+ *  - flag at 0.70 confidence
+ *  - warn (sanitize) at 0.40 confidence
+ *
+ * ### Activation lifecycle
+ * Components are built eagerly at pack creation time for direct programmatic
+ * use.  When the extension manager activates the pack, `onActivate` rebuilds
+ * all components with the manager's shared service registry so heavyweight
+ * resources (ONNX/WASM model pipelines) are shared across the agent.
+ *
+ * ### Disabling classifiers
+ * Individual classifiers can be disabled by omitting them from the
+ * `options.classifiers` array.  An empty array or `undefined` activates all
+ * three built-in classifiers.
+ *
+ * @example
+ * ```typescript
+ * import { createMLClassifierPack } from './ml-classifiers';
+ *
+ * // All built-in classifiers at default thresholds:
+ * const pack = createMLClassifierPack();
+ *
+ * // Toxicity only with custom block threshold:
+ * const strictPack = createMLClassifierPack({
+ *   classifiers: ['toxicity'],
+ *   thresholds: { blockThreshold: 0.85 },
+ *   streamingMode: true,
+ *   guardrailScope: 'both',
+ * });
+ * ```
+ *
+ * @module agentos/extensions/packs/ml-classifiers
+ */
+import type { ISharedServiceRegistry } from '@framers/agentos';
+import { SharedServiceRegistry } from '@framers/agentos';
+import type { ExtensionPack, ExtensionPackContext } from '@framers/agentos';
+import type { ExtensionDescriptor, ExtensionLifecycleContext } from '@framers/agentos';
+import { EXTENSION_KIND_GUARDRAIL, EXTENSION_KIND_TOOL } from '@framers/agentos';
+import type { MLClassifierPackOptions } from './types';
+import { DEFAULT_THRESHOLDS } from './types';
+import { MLClassifierGuardrail } from './MLClassifierGuardrail';
+import { ClassifierOrchestrator } from './ClassifierOrchestrator';
+import { SlidingWindowBuffer } from './SlidingWindowBuffer';
+import { ClassifyContentTool } from './tools/ClassifyContentTool';
+import { ToxicityClassifier } from './classifiers/ToxicityClassifier';
+import { InjectionClassifier } from './classifiers/InjectionClassifier';
+import { JailbreakClassifier } from './classifiers/JailbreakClassifier';
+import type { IContentClassifier } from './IContentClassifier';
+// ---------------------------------------------------------------------------
+// Re-exports — allow single-import for consumers
+// ---------------------------------------------------------------------------
+/**
+ * Re-export everything exported by `./types` — the type definitions as well
+ * as runtime values such as `DEFAULT_THRESHOLDS` — so consumers can import
+ * from a single entry point:
+ * ```ts
+ * import { createMLClassifierPack, DEFAULT_THRESHOLDS } from './ml-classifiers';
+ * ```
+ */
+export * from './types';
+// ---------------------------------------------------------------------------
+// Pack factory
+// ---------------------------------------------------------------------------
+/**
+ * Create an {@link ExtensionPack} that bundles:
+ *  - The {@link MLClassifierGuardrail} guardrail (evaluates input & output).
+ *  - The {@link ClassifyContentTool} `classify_content` tool (on-demand analysis).
+ *
+ * The built-in classifiers that are instantiated depend on `options.classifiers`:
+ *  - `'toxicity'`  → {@link ToxicityClassifier}   (`unitary/toxic-bert`)
+ *  - `'injection'` → {@link InjectionClassifier}  (`protectai/deberta-v3-small-prompt-injection-v2`)
+ *  - `'jailbreak'` → {@link JailbreakClassifier}  (`meta-llama/PromptGuard-86M`)
+ *
+ * When `options.classifiers` is `undefined` or empty, **all three** are active.
+ * A non-empty list that names none of the three built-ins (for example a
+ * misspelled name) activates no built-in classifier at all; only
+ * `options.customClassifiers` would then run.
+ *
+ * Additional classifiers supplied via `options.customClassifiers` are appended
+ * to the active list after the built-in ones.
+ *
+ * Components are built once immediately, so the returned pack is usable
+ * without activation, and rebuilt every time `onActivate` runs.
+ *
+ * @param options - Optional pack-level configuration.  `undefined` and `null`
+ *                  are both treated as an empty options object; see
+ *                  {@link MLClassifierPackOptions}.
+ * @returns An {@link ExtensionPack} whose `descriptors` getter yields one
+ *          guardrail descriptor and one tool descriptor.
+ */
+export function createMLClassifierPack(options?: MLClassifierPackOptions): ExtensionPack {
+  /**
+   * Resolved options — default to an empty object so every later check can
+   * read `opts.foo` without null-guarding the whole `options` reference.
+   */
+  const opts: MLClassifierPackOptions = options ?? {};
+  // -------------------------------------------------------------------------
+  // Mutable state — replaced by onActivate when the extension manager
+  // supplies its own service registry.
+  // -------------------------------------------------------------------------
+  const state = {
+    /**
+     * Service registry handed to every built-in classifier and to the
+     * guardrail.  Starts as a standalone instance so the pack works without
+     * activation (unit tests, scripts) and is swapped for
+     * `context.services` in `onActivate` when that is provided.
+     */
+    services: new SharedServiceRegistry() as ISharedServiceRegistry,
+  };
+  // -------------------------------------------------------------------------
+  // Component instances — (re)assigned by buildComponents()
+  // -------------------------------------------------------------------------
+  /** Guardrail instance exposed as the `ml-classifier-guardrail` payload. */
+  let guardrail: MLClassifierGuardrail;
+  /** Tool instance exposed as the `classify_content` payload. */
+  let tool: ClassifyContentTool;
+  /**
+   * Orchestrator backing the `classify_content` tool.  It is built from the
+   * same classifier instances that are handed to the guardrail, and it is
+   * the object disposed in `onDeactivate`.
+   */
+  let orchestrator: ClassifierOrchestrator;
+  /**
+   * Sliding-window buffer configured from `chunkSize`, `contextSize` and
+   * `maxEvaluations`.
+   *
+   * NOTE(review): this instance is not passed to the guardrail or the tool
+   * anywhere in this function — it is only created here and cleared in
+   * `onDeactivate`.  Confirm whether the guardrail was meant to receive it.
+   */
+  let buffer: SlidingWindowBuffer;
+  // -------------------------------------------------------------------------
+  // Classifier selection helper
+  // -------------------------------------------------------------------------
+  /**
+   * Report whether a built-in classifier is enabled by `opts.classifiers`.
+   *
+   * Hoisted out of `buildComponents` because it depends only on the
+   * immutable `opts`, so there is no need to recreate it on every rebuild.
+   *
+   * @param name - One of `'toxicity'`, `'injection'`, or `'jailbreak'`.
+   * @returns `true` when no explicit list was given (zero-config default)
+   *          or when the list contains `name`.
+   */
+  function isBuiltInEnabled(name: 'toxicity' | 'injection' | 'jailbreak'): boolean {
+    const requested = opts.classifiers;
+    // No explicit list — enable all built-in classifiers.
+    if (!requested || requested.length === 0) {
+      return true;
+    }
+    return requested.includes(name);
+  }
+  // -------------------------------------------------------------------------
+  // buildComponents
+  // -------------------------------------------------------------------------
+  /**
+   * (Re)construct all pack components using the current `state.services`.
+   *
+   * Called once at pack creation and again on every `onActivate`.  Each call
+   * creates fresh built-in classifier instances; instances supplied through
+   * `opts.customClassifiers` are reused as-is across rebuilds.
+   */
+  function buildComponents(): void {
+    // ------------------------------------------------------------------
+    // 1. Instantiate the enabled built-in classifiers, then append the
+    //    caller-supplied custom classifiers.
+    // ------------------------------------------------------------------
+    /** Every classifier that will be active for this build. */
+    const activeClassifiers: IContentClassifier[] = [];
+    // Toxicity classifier — detects hateful, abusive, and toxic language.
+    if (isBuiltInEnabled('toxicity')) {
+      activeClassifiers.push(new ToxicityClassifier(state.services));
+    }
+    // Injection classifier — detects prompt-injection payloads.
+    if (isBuiltInEnabled('injection')) {
+      activeClassifiers.push(new InjectionClassifier(state.services));
+    }
+    // Jailbreak classifier — detects system-prompt override attempts.
+    if (isBuiltInEnabled('jailbreak')) {
+      activeClassifiers.push(new JailbreakClassifier(state.services));
+    }
+    // Custom classifiers always run, regardless of `opts.classifiers`.
+    if (opts.customClassifiers && opts.customClassifiers.length > 0) {
+      activeClassifiers.push(...opts.customClassifiers);
+    }
+    // ------------------------------------------------------------------
+    // 2. Resolve thresholds: caller overrides win over library defaults.
+    //    These merged thresholds are given to the tool's orchestrator
+    //    below; the guardrail receives the raw `opts` instead.
+    // ------------------------------------------------------------------
+    const thresholds = {
+      ...DEFAULT_THRESHOLDS,
+      ...opts.thresholds,
+    };
+    // ------------------------------------------------------------------
+    // 3. Build the orchestrator used by the on-demand tool.
+    // ------------------------------------------------------------------
+    orchestrator = new ClassifierOrchestrator(activeClassifiers, thresholds);
+    // ------------------------------------------------------------------
+    // 4. Build the sliding-window buffer (see the note on `buffer`).
+    // ------------------------------------------------------------------
+    buffer = new SlidingWindowBuffer({
+      chunkSize: opts.chunkSize,
+      contextSize: opts.contextSize,
+      maxEvaluations: opts.maxEvaluations,
+    });
+    // ------------------------------------------------------------------
+    // 5. Build the guardrail from the registry, the raw options, and the
+    //    pre-built classifier instances (third constructor argument).
+    //    Per the original author, the guardrail creates its own
+    //    orchestrator internally.
+    // ------------------------------------------------------------------
+    guardrail = new MLClassifierGuardrail(state.services, opts, activeClassifiers);
+    // ------------------------------------------------------------------
+    // 6. Build the on-demand classification tool backed by the orchestrator.
+    // ------------------------------------------------------------------
+    tool = new ClassifyContentTool(orchestrator);
+  }
+  // Initial build — makes the pack usable immediately without activation.
+  buildComponents();
+  // -------------------------------------------------------------------------
+  // ExtensionPack shape
+  // -------------------------------------------------------------------------
+  return {
+    /** Canonical pack name used in manifests and logs. */
+    name: 'ml-classifiers',
+    /** Semantic version of this pack implementation. */
+    version: '1.0.0',
+    /**
+     * Descriptor getter — builds a new array on every access from the
+     * current `guardrail` and `tool` bindings, so descriptors read after
+     * `onActivate` reference the rebuilt instances rather than the ones
+     * from the initial build.
+     */
+    get descriptors(): ExtensionDescriptor[] {
+      return [
+        {
+          /**
+           * Guardrail descriptor.
+           *
+           * Priority 5 places this guardrail after the PII redaction guardrail
+           * (priority 10) so PII is stripped before ML classification.
+           */
+          id: 'ml-classifier-guardrail',
+          kind: EXTENSION_KIND_GUARDRAIL,
+          priority: 5,
+          payload: guardrail,
+        },
+        {
+          /**
+           * On-demand classification tool descriptor.
+           *
+           * Priority 0 uses the default ordering — tools are typically
+           * ordered by name rather than priority.
+           */
+          id: 'classify_content',
+          kind: EXTENSION_KIND_TOOL,
+          priority: 0,
+          payload: tool,
+        },
+      ];
+    },
+    /**
+     * Lifecycle hook called by the extension manager when the pack is
+     * activated.
+     *
+     * Adopts `context.services` as the service registry when it is
+     * provided, then rebuilds every component (the rebuild happens even
+     * when no registry is provided).
+     *
+     * NOTE(review): the components from the previous build are replaced
+     * without being disposed.  Confirm that an un-disposed orchestrator
+     * and its classifiers hold no resources that need explicit release.
+     *
+     * @param context - Activation context provided by the extension manager.
+     */
+    onActivate: (context: ExtensionLifecycleContext): void => {
+      // Upgrade to the shared registry when the manager provides one.
+      if (context.services) {
+        state.services = context.services;
+      }
+      // Rebuild all components with the (possibly upgraded) registry.
+      buildComponents();
+    },
+    /**
+     * Lifecycle hook called when the pack is deactivated or the agent shuts
+     * down.
+     *
+     * Disposes the orchestrator and clears the sliding-window buffer.  The
+     * buffer is cleared in a `finally` clause so that a rejected `dispose()`
+     * cannot skip it; the rejection itself still propagates to the caller.
+     */
+    onDeactivate: async (): Promise<void> => {
+      try {
+        // Defensive guard: orchestrator is unset only if buildComponents()
+        // never completed successfully.
+        if (orchestrator) {
+          await orchestrator.dispose();
+        }
+      } finally {
+        // Always release buffered stream state, even when dispose() failed.
+        if (buffer) {
+          buffer.clear();
+        }
+      }
+    },
+  };
+}
+// ---------------------------------------------------------------------------
+// Manifest factory bridge
+// ---------------------------------------------------------------------------
+/**
+ * Manifest-facing entry point for this pack.
+ *
+ * Follows the factory convention used by the extension loader: it receives
+ * an {@link ExtensionPackContext}, reads only `context.options` from it,
+ * treats that value as {@link MLClassifierPackOptions}, and hands it to
+ * {@link createMLClassifierPack}.  The cast is unchecked — the options are
+ * not validated here.
+ *
+ * @param context - Manifest context; only its `options` property is used.
+ * @returns The {@link ExtensionPack} produced by {@link createMLClassifierPack}.
+ *
+ * @example Manifest entry:
+ * ```json
+ * {
+ *   "packs": [
+ *     {
+ *       "module": "./ml-classifiers",
+ *       "options": {
+ *         "classifiers": ["toxicity", "jailbreak"],
+ *         "thresholds": { "blockThreshold": 0.95 },
+ *         "streamingMode": true
+ *       }
+ *     }
+ *   ]
+ * }
+ * ```
+ */
+export function createExtensionPack(context: ExtensionPackContext): ExtensionPack {
+  // Manifest options arrive untyped; view them as the pack's option shape.
+  const packOptions = context.options as MLClassifierPackOptions;
+  return createMLClassifierPack(packOptions);
+}

package/src/tools/ClassifyContentTool.ts ADDED Viewed

@@ -0,0 +1,201 @@
+/**
+ * @fileoverview On-demand content classification tool for AgentOS.
+ *
+ * `ClassifyContentTool` exposes the ML classifier pipeline as an invocable
+ * {@link ITool}, enabling agents and workflows to explicitly classify text
+ * for safety signals (toxicity, prompt injection, jailbreak) on demand,
+ * rather than relying solely on the implicit guardrail pipeline.
+ *
+ * Use cases:
+ * - An agent that needs to evaluate user-generated content before storing
+ *   it in a knowledge base.
+ * - A moderation workflow that classifies a batch of flagged messages.
+ * - A debugging tool for inspecting classifier behaviour on specific inputs.
+ *
+ * The tool delegates to a {@link ClassifierOrchestrator} instance and returns
+ * the full {@link ChunkEvaluation} (including per-classifier scores and the
+ * aggregated recommended action).
+ *
+ * @module agentos/extensions/packs/ml-classifiers/tools/ClassifyContentTool
+ */
+import type {
+  ITool,
+  JSONSchemaObject,
+  ToolExecutionContext,
+  ToolExecutionResult,
+} from '@framers/agentos';
+import type { ChunkEvaluation } from '../types';
+import type { ClassifierOrchestrator } from '../ClassifierOrchestrator';
+// ---------------------------------------------------------------------------
+// Input shape
+// ---------------------------------------------------------------------------
+/**
+ * Input arguments for the `classify_content` tool.
+ */
+export interface ClassifyInput {
+  /**
+   * The text to classify for safety signals.
+   * Must not be empty: `ClassifyContentTool.execute` returns a failure
+   * result when the text is missing, empty, or whitespace-only.
+   */
+  text: string;
+  /**
+   * Optional subset of classifier IDs to run.
+   *
+   * NOTE: `ClassifyContentTool.execute` does not apply this filter yet — it
+   * passes only `text` to the orchestrator's `classifyAll`, so every
+   * registered classifier is invoked whether or not this field is set.
+   */
+  classifiers?: string[];
+}
+// ---------------------------------------------------------------------------
+// ClassifyContentTool
+// ---------------------------------------------------------------------------
+/**
+ * ITool implementation that runs ML content classifiers on demand.
+ *
+ * The tool declares itself read-only (`hasSideEffects: false`): `execute`
+ * forwards the text to the orchestrator and returns its evaluation.
+ *
+ * @implements {ITool<ClassifyInput, ChunkEvaluation>}
+ *
+ * @example
+ * ```typescript
+ * const tool = new ClassifyContentTool(orchestrator);
+ * const result = await tool.execute(
+ *   { text: 'some potentially harmful text' },
+ *   executionContext,
+ * );
+ *
+ * if (result.success) {
+ *   console.log(result.output.recommendedAction); // 'allow' | 'flag' | 'block' | …
+ * }
+ * ```
+ */
+export class ClassifyContentTool implements ITool<ClassifyInput, ChunkEvaluation> {
+  // -------------------------------------------------------------------------
+  // ITool identity & metadata
+  // -------------------------------------------------------------------------
+  /** Unique tool identifier used for registration and lookup. */
+  readonly id = 'classify_content';
+  /** Functional name exposed to LLMs for tool-call invocation. */
+  readonly name = 'classify_content';
+  /** Human-readable display name for dashboards and UI. */
+  readonly displayName = 'Content Safety Classifier';
+  /** Natural-language description of the tool's purpose and behaviour. */
+  readonly description =
+    'Classify text for toxicity, prompt injection, and jailbreak attempts ' +
+    'using ML models. Returns per-classifier scores and an aggregated ' +
+    'recommended guardrail action.';
+  /** Logical grouping for tool discovery and filtering. */
+  readonly category = 'security';
+  /** SemVer version of this tool implementation. */
+  readonly version = '1.0.0';
+  /** This tool only reads text — it performs no mutations. */
+  readonly hasSideEffects = false;
+  // -------------------------------------------------------------------------
+  // JSON Schema for input validation
+  // -------------------------------------------------------------------------
+  /**
+   * JSON Schema describing the expected input arguments.
+   *
+   * - `text` (required): The string to classify.
+   * - `classifiers` (optional): Array of classifier IDs.  Accepted by the
+   *   schema, but `execute` does not apply it yet (see `execute`).
+   */
+  readonly inputSchema: JSONSchemaObject = {
+    type: 'object',
+    properties: {
+      text: {
+        type: 'string',
+        description: 'Text to classify for safety signals.',
+      },
+      classifiers: {
+        type: 'array',
+        items: { type: 'string' },
+        description:
+          'Optional: only run these classifier IDs. When omitted all registered classifiers are used.',
+      },
+    },
+    required: ['text'],
+  };
+  // -------------------------------------------------------------------------
+  // Internal state
+  // -------------------------------------------------------------------------
+  /** The orchestrator that drives the underlying ML classifiers. */
+  private readonly orchestrator: ClassifierOrchestrator;
+  // -------------------------------------------------------------------------
+  // Constructor
+  // -------------------------------------------------------------------------
+  /**
+   * Create a new ClassifyContentTool.
+   *
+   * @param orchestrator - The classifier orchestrator whose `classifyAll`
+   *                       method is called by `execute`.
+   */
+  constructor(orchestrator: ClassifierOrchestrator) {
+    this.orchestrator = orchestrator;
+  }
+  // -------------------------------------------------------------------------
+  // execute
+  // -------------------------------------------------------------------------
+  /**
+   * Classify the provided text with every classifier registered on the
+   * orchestrator and return the aggregated evaluation.
+   *
+   * `args.classifiers` is currently ignored: the orchestrator exposes no
+   * filtering layer yet, so all registered classifiers run regardless.
+   *
+   * This method does not throw for bad input or classifier errors; both are
+   * reported through the returned result object.
+   *
+   * @param args    - Tool input containing the text to classify.
+   * @param _context - Execution context (unused).
+   * @returns A successful result containing the {@link ChunkEvaluation},
+   *          or a failure result when `text` is missing, not a string,
+   *          empty or whitespace-only, or when classification throws.
+   */
+  async execute(
+    args: ClassifyInput,
+    _context: ToolExecutionContext,
+  ): Promise<ToolExecutionResult<ChunkEvaluation>> {
+    // The static type is not enforced at runtime: arguments produced by a
+    // tool call may be missing or carry a non-string `text`.  Checking the
+    // runtime type first keeps `.trim()` from throwing outside the try
+    // block, so malformed input yields a failure result instead.
+    const text: unknown = args?.text;
+    if (typeof text !== 'string' || text.trim().length === 0) {
+      return {
+        success: false,
+        error: 'The "text" argument is required and must not be empty.',
+      };
+    }
+    try {
+      // The original (untrimmed) text is what gets classified.
+      const evaluation = await this.orchestrator.classifyAll(text);
+      return {
+        success: true,
+        output: evaluation,
+      };
+    } catch (err: unknown) {
+      // Normalise any thrown value into a readable message.
+      const message = err instanceof Error ? err.message : String(err);
+      return {
+        success: false,
+        error: `Classification failed: ${message}`,
+      };
+    }
+  }
+}