npm - @framers/agentos-ext-ml-classifiers - Versions diffs - 0.1.0 - Mend

@framers/agentos-ext-ml-classifiers 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (62) hide show

package/LICENSE +23 -0
package/dist/ClassifierOrchestrator.d.ts +126 -0
package/dist/ClassifierOrchestrator.d.ts.map +1 -0
package/dist/ClassifierOrchestrator.js +239 -0
package/dist/ClassifierOrchestrator.js.map +1 -0
package/dist/IContentClassifier.d.ts +117 -0
package/dist/IContentClassifier.d.ts.map +1 -0
package/dist/IContentClassifier.js +22 -0
package/dist/IContentClassifier.js.map +1 -0
package/dist/MLClassifierGuardrail.d.ts +163 -0
package/dist/MLClassifierGuardrail.d.ts.map +1 -0
package/dist/MLClassifierGuardrail.js +335 -0
package/dist/MLClassifierGuardrail.js.map +1 -0
package/dist/SlidingWindowBuffer.d.ts +213 -0
package/dist/SlidingWindowBuffer.d.ts.map +1 -0
package/dist/SlidingWindowBuffer.js +246 -0
package/dist/SlidingWindowBuffer.js.map +1 -0
package/dist/classifiers/InjectionClassifier.d.ts +126 -0
package/dist/classifiers/InjectionClassifier.d.ts.map +1 -0
package/dist/classifiers/InjectionClassifier.js +210 -0
package/dist/classifiers/InjectionClassifier.js.map +1 -0
package/dist/classifiers/JailbreakClassifier.d.ts +124 -0
package/dist/classifiers/JailbreakClassifier.d.ts.map +1 -0
package/dist/classifiers/JailbreakClassifier.js +208 -0
package/dist/classifiers/JailbreakClassifier.js.map +1 -0
package/dist/classifiers/ToxicityClassifier.d.ts +125 -0
package/dist/classifiers/ToxicityClassifier.d.ts.map +1 -0
package/dist/classifiers/ToxicityClassifier.js +212 -0
package/dist/classifiers/ToxicityClassifier.js.map +1 -0
package/dist/classifiers/WorkerClassifierProxy.d.ts +158 -0
package/dist/classifiers/WorkerClassifierProxy.d.ts.map +1 -0
package/dist/classifiers/WorkerClassifierProxy.js +268 -0
package/dist/classifiers/WorkerClassifierProxy.js.map +1 -0
package/dist/index.d.ts +110 -0
package/dist/index.d.ts.map +1 -0
package/dist/index.js +342 -0
package/dist/index.js.map +1 -0
package/dist/tools/ClassifyContentTool.d.ts +105 -0
package/dist/tools/ClassifyContentTool.d.ts.map +1 -0
package/dist/tools/ClassifyContentTool.js +149 -0
package/dist/tools/ClassifyContentTool.js.map +1 -0
package/dist/types.d.ts +319 -0
package/dist/types.d.ts.map +1 -0
package/dist/types.js +62 -0
package/dist/types.js.map +1 -0
package/dist/worker/classifier-worker.d.ts +49 -0
package/dist/worker/classifier-worker.d.ts.map +1 -0
package/dist/worker/classifier-worker.js +180 -0
package/dist/worker/classifier-worker.js.map +1 -0
package/package.json +45 -0
package/src/ClassifierOrchestrator.ts +290 -0
package/src/IContentClassifier.ts +124 -0
package/src/MLClassifierGuardrail.ts +419 -0
package/src/SlidingWindowBuffer.ts +384 -0
package/src/classifiers/InjectionClassifier.ts +261 -0
package/src/classifiers/JailbreakClassifier.ts +259 -0
package/src/classifiers/ToxicityClassifier.ts +263 -0
package/src/classifiers/WorkerClassifierProxy.ts +366 -0
package/src/index.ts +383 -0
package/src/tools/ClassifyContentTool.ts +201 -0
package/src/types.ts +391 -0
package/src/worker/classifier-worker.ts +267 -0

package/src/ClassifierOrchestrator.ts ADDED Viewed

@@ -0,0 +1,290 @@
+/**
+ * @fileoverview Orchestrator for parallel ML classifier execution with worst-wins aggregation.
+ *
+ * The `ClassifierOrchestrator` runs all registered {@link IContentClassifier}
+ * instances in parallel against a single text input and aggregates their
+ * results into a single {@link ChunkEvaluation}.  The aggregation policy is
+ * **worst-wins**: if any classifier recommends BLOCK the overall result is
+ * BLOCK, even if every other classifier returned ALLOW.
+ *
+ * Priority order (descending):
+ * ```
+ * BLOCK > FLAG > SANITIZE > ALLOW
+ * ```
+ *
+ * Each classifier may have its own threshold overrides (via
+ * `perClassifierThresholds`).  Note that label-to-action mapping via
+ * `ClassifierConfig.labelActions` is not applied by the orchestrator in this
+ * file; actions are derived from confidence thresholds only.
+ *
+ * @module agentos/extensions/packs/ml-classifiers/ClassifierOrchestrator
+ */
+import type { IContentClassifier } from './IContentClassifier';
+import type {
+  AnnotatedClassificationResult,
+  ChunkEvaluation,
+  ClassifierThresholds,
+  ClassifierConfig,
+} from './types';
+import { DEFAULT_THRESHOLDS } from './types';
+import { GuardrailAction } from '@framers/agentos';
+// ---------------------------------------------------------------------------
+// Action severity ranking — used by worst-wins aggregation
+// ---------------------------------------------------------------------------
+/**
+ * Severity rank of every {@link GuardrailAction}: the larger the number, the
+ * more restrictive the action.  Worst-wins aggregation compares these ranks
+ * instead of the action values themselves, so the outcome never depends on
+ * how the enum members happen to be spelled or ordered.
+ *
+ * Listed from most to least restrictive to mirror the documented priority
+ * order `BLOCK > FLAG > SANITIZE > ALLOW`.
+ */
+const ACTION_SEVERITY: Record<GuardrailAction, number> = {
+  [GuardrailAction.BLOCK]: 3,
+  [GuardrailAction.FLAG]: 2,
+  [GuardrailAction.SANITIZE]: 1,
+  [GuardrailAction.ALLOW]: 0,
+};
+// ---------------------------------------------------------------------------
+// ClassifierOrchestrator
+// ---------------------------------------------------------------------------
+/**
+ * Runs every registered ML classifier concurrently against one text input and
+ * reduces the per-classifier outcomes to a single {@link ChunkEvaluation}
+ * using worst-wins aggregation (`BLOCK > FLAG > SANITIZE > ALLOW`).
+ *
+ * @example
+ * ```typescript
+ * const orchestrator = new ClassifierOrchestrator(
+ *   [toxicityClassifier, injectionClassifier],
+ *   DEFAULT_THRESHOLDS,
+ * );
+ *
+ * const evaluation = await orchestrator.classifyAll('some user message');
+ * if (evaluation.recommendedAction === GuardrailAction.BLOCK) {
+ *   // Terminate the interaction.
+ * }
+ * ```
+ */
+export class ClassifierOrchestrator {
+  // -------------------------------------------------------------------------
+  // Private state
+  // -------------------------------------------------------------------------
+  /**
+   * Classifiers invoked on every `classifyAll()` call.  The array is the one
+   * supplied to the constructor (held by reference, not copied).
+   */
+  private readonly classifiers: IContentClassifier[];
+  /** Thresholds used for any classifier without an entry in `perClassifierThresholds`. */
+  private readonly defaultThresholds: ClassifierThresholds;
+  /**
+   * Optional per-classifier threshold overrides, keyed by classifier ID.
+   * Fields present in an entry replace the matching field of
+   * {@link defaultThresholds} for that classifier only.
+   */
+  private readonly perClassifierThresholds: Record<string, Partial<ClassifierThresholds>>;
+  // -------------------------------------------------------------------------
+  // Constructor
+  // -------------------------------------------------------------------------
+  /**
+   * Create a new orchestrator.
+   *
+   * @param classifiers             - Classifier instances to run in parallel.
+   * @param defaultThresholds       - Thresholds applied to every classifier unless
+   *                                  overridden by `perClassifierThresholds`.
+   *                                  Defaults to `DEFAULT_THRESHOLDS`.
+   * @param perClassifierThresholds - Optional map from classifier ID to partial
+   *                                  threshold overrides.  Missing fields fall
+   *                                  back to `defaultThresholds`.
+   */
+  constructor(
+    classifiers: IContentClassifier[],
+    defaultThresholds: ClassifierThresholds = DEFAULT_THRESHOLDS,
+    perClassifierThresholds: Record<string, Partial<ClassifierThresholds>> = {},
+  ) {
+    this.classifiers = classifiers;
+    this.defaultThresholds = defaultThresholds;
+    this.perClassifierThresholds = perClassifierThresholds;
+  }
+  // -------------------------------------------------------------------------
+  // Public API
+  // -------------------------------------------------------------------------
+  /**
+   * Classify `text` with every registered classifier in parallel and return
+   * the aggregated {@link ChunkEvaluation}.
+   *
+   * Execution details:
+   * 1. The classifier list is snapshotted, then all classifiers run
+   *    concurrently via `Promise.allSettled`.
+   * 2. Fulfilled results are collected as {@link AnnotatedClassificationResult}
+   *    (raw result plus `classifierId` and `latencyMs`).
+   * 3. Rejected promises are logged with `console.warn` and otherwise ignored,
+   *    i.e. they contribute an implicit ALLOW and no entry in `results`.
+   * 4. Each fulfilled result is mapped to a {@link GuardrailAction} using the
+   *    thresholds resolved for that classifier.
+   * 5. `recommendedAction` is the most restrictive action seen; on a tie the
+   *    first classifier (in registration order) that reached it is reported
+   *    as `triggeredBy`.  `triggeredBy` stays `null` when the result is ALLOW.
+   *
+   * @param text - The text to evaluate.  Callers are expected to pass
+   *   non-empty text; this method does not validate it.
+   * @returns A promise resolving to the aggregated evaluation result.
+   */
+  async classifyAll(text: string): Promise<ChunkEvaluation> {
+    // Wall-clock start, so `totalLatencyMs` measures elapsed time rather than
+    // the sum of the individual classifier latencies.
+    const wallStart = performance.now();
+    // Snapshot the list before awaiting.  `this.classifiers` is the caller's
+    // array; if it were mutated while inference is in flight, index `i` of the
+    // settled outcomes would no longer line up with index `i` of the live
+    // array, and the wrong thresholds / `triggeredBy` would be used.
+    const active = [...this.classifiers];
+    const settled = await Promise.allSettled(
+      active.map((classifier) => this.timedClassify(classifier, text)),
+    );
+    const results: AnnotatedClassificationResult[] = [];
+    let worstAction = GuardrailAction.ALLOW;
+    let triggeredBy: string | null = null;
+    for (let i = 0; i < settled.length; i++) {
+      const outcome = settled[i];
+      const classifier = active[i];
+      // Both arrays have the same length by construction; the guard only
+      // exists so the indexed reads are sound under strict index checking.
+      if (outcome === undefined || classifier === undefined) {
+        continue;
+      }
+      if (outcome.status === 'rejected') {
+        // Classifier failed — log and contribute an implicit ALLOW so one
+        // broken classifier cannot block all content.
+        console.warn(
+          `[ClassifierOrchestrator] Classifier "${classifier.id}" failed: ${outcome.reason}`,
+        );
+        continue;
+      }
+      const annotated = outcome.value;
+      results.push(annotated);
+      const action = this.scoreToAction(annotated, this.resolveThresholds(classifier.id));
+      // Worst-wins: strictly-greater keeps the earliest classifier on ties.
+      if (ACTION_SEVERITY[action] > ACTION_SEVERITY[worstAction]) {
+        worstAction = action;
+        triggeredBy = classifier.id;
+      }
+    }
+    const wallEnd = performance.now();
+    return {
+      results,
+      recommendedAction: worstAction,
+      triggeredBy,
+      totalLatencyMs: Math.round(wallEnd - wallStart),
+    };
+  }
+  /**
+   * Dispose every registered classifier.
+   *
+   * Calls each classifier's optional `dispose()` method concurrently.
+   * `Promise.allSettled` is used so a rejection from one classifier neither
+   * propagates to the caller nor prevents the others from being disposed.
+   */
+  async dispose(): Promise<void> {
+    await Promise.allSettled(
+      this.classifiers.map(async (c) => {
+        if (c.dispose) {
+          await c.dispose();
+        }
+      }),
+    );
+  }
+  // -------------------------------------------------------------------------
+  // Private helpers
+  // -------------------------------------------------------------------------
+  /**
+   * Invoke a single classifier and measure how long the call took.
+   *
+   * @param classifier - The classifier to invoke.
+   * @param text       - The text to classify.
+   * @returns The classifier's result with `classifierId` and `latencyMs`
+   *   (rounded to whole milliseconds) added.  Rejects if `classify()` rejects.
+   */
+  private async timedClassify(
+    classifier: IContentClassifier,
+    text: string,
+  ): Promise<AnnotatedClassificationResult> {
+    const start = performance.now();
+    const result = await classifier.classify(text);
+    const latencyMs = Math.round(performance.now() - start);
+    return {
+      ...result,
+      classifierId: classifier.id,
+      latencyMs,
+    };
+  }
+  /**
+   * Map a classifier's confidence score to a {@link GuardrailAction} by
+   * numeric threshold comparison, checked in descending severity order:
+   *
+   * 1. `confidence >= blockThreshold` -> BLOCK
+   * 2. `confidence >= flagThreshold`  -> FLAG
+   * 3. `confidence >= warnThreshold`  -> SANITIZE
+   * 4. otherwise                      -> ALLOW
+   *
+   * Only thresholds are consulted here; label-specific overrides such as
+   * `labelActions` are not read by this method.
+   *
+   * @param result     - The annotated classification result.
+   * @param thresholds - Resolved thresholds for this classifier.
+   * @returns The appropriate guardrail action.
+   */
+  private scoreToAction(
+    result: AnnotatedClassificationResult,
+    thresholds: ClassifierThresholds,
+  ): GuardrailAction {
+    // `confidence` may be a number or an array of numbers; when it is an
+    // array only the first element is considered (0 if the array is empty).
+    const confidence = Array.isArray(result.confidence)
+      ? result.confidence[0] ?? 0
+      : result.confidence;
+    // A NaN confidence fails every `>=` comparison and therefore yields ALLOW.
+    if (confidence >= thresholds.blockThreshold) {
+      return GuardrailAction.BLOCK;
+    }
+    if (confidence >= thresholds.flagThreshold) {
+      return GuardrailAction.FLAG;
+    }
+    if (confidence >= thresholds.warnThreshold) {
+      return GuardrailAction.SANITIZE;
+    }
+    return GuardrailAction.ALLOW;
+  }
+  /**
+   * Resolve the effective thresholds for a classifier by layering its
+   * per-classifier overrides (if any) on top of the defaults.
+   *
+   * @param classifierId - ID of the classifier to resolve thresholds for.
+   * @returns The default thresholds object itself when no override entry
+   *   exists; otherwise a new object in which each field comes from the
+   *   override unless it is `null`/`undefined` there.
+   */
+  private resolveThresholds(classifierId: string): ClassifierThresholds {
+    const overrides = this.perClassifierThresholds[classifierId];
+    if (!overrides) {
+      return this.defaultThresholds;
+    }
+    // `??` (not spread) so an explicitly-undefined override field still falls
+    // back to the default instead of erasing it.
+    return {
+      blockThreshold: overrides.blockThreshold ?? this.defaultThresholds.blockThreshold,
+      flagThreshold: overrides.flagThreshold ?? this.defaultThresholds.flagThreshold,
+      warnThreshold: overrides.warnThreshold ?? this.defaultThresholds.warnThreshold,
+    };
+  }
+}

package/src/IContentClassifier.ts ADDED Viewed

@@ -0,0 +1,124 @@
+/**
+ * @fileoverview Interface contract for ML-backed content classifiers.
+ *
+ * An `IContentClassifier` represents a single model pipeline that accepts
+ * arbitrary text and returns a {@link ClassificationResult} containing the
+ * winning label and confidence scores for all candidate classes.
+ *
+ * Built-in implementations (toxicity, injection, jailbreak) each implement
+ * this interface.  Third-party classifiers may be registered via the
+ * `customClassifiers` option of {@link MLClassifierPackOptions}.
+ *
+ * Lifecycle
+ * ---------
+ * 1. The pack initialises each classifier (model loading, warm-up).
+ * 2. The guardrail pipeline calls `classify()` for every text chunk.
+ * 3. On pack teardown, `dispose()` is called (if present) to release GPU/
+ *    WASM memory.
+ *
+ * @module agentos/extensions/packs/ml-classifiers/IContentClassifier
+ */
+import type { ClassificationResult } from '@framers/agentos';
+/**
+ * Contract for a single ML content classifier.
+ *
+ * Implementations back one model pipeline and expose a narrow classify/dispose
+ * API so the guardrail orchestrator can drive them uniformly regardless of the
+ * underlying runtime (Node.js ONNX, browser WASM, remote inference endpoint).
+ *
+ * @example Minimal custom classifier
+ * ```typescript
+ * class SarcasmClassifier implements IContentClassifier {
+ *   readonly id = 'custom:sarcasm-detector';
+ *   readonly displayName = 'Sarcasm Detector';
+ *   readonly description = 'Detects sarcastic or ironic statements.';
+ *   readonly modelId = 'my-org/sarcasm-bert';
+ *   isLoaded = false;
+ *
+ *   async classify(text: string): Promise<ClassificationResult> {
+ *     // … run inference …
+ *     return { bestClass: 'NOT_SARCASTIC', confidence: 0.8, allScores: [] };
+ *   }
+ *
+ *   async dispose(): Promise<void> {
+ *     // Free resources.
+ *   }
+ * }
+ * ```
+ */
+export interface IContentClassifier {
+  /**
+   * Unique service identifier for this classifier.
+   *
+   * Must follow the `agentos:<domain>:<name>` convention so it can be
+   * registered with the AgentOS shared service registry.
+   *
+   * NOTE(review): the `SarcasmClassifier` example in this interface's docs
+   * uses `'custom:sarcasm-detector'`, which does not match this pattern —
+   * confirm whether third-party classifiers are exempt from the convention.
+   *
+   * `ClassifierOrchestrator` also uses this value as the lookup key for
+   * per-classifier threshold overrides and copies it onto each result as
+   * `classifierId`.
+   *
+   * @example `'agentos:ml-classifiers:toxicity-pipeline'`
+   */
+  readonly id: string;
+  /**
+   * Human-readable name displayed in logs and dashboards.
+   *
+   * @example `'Toxicity Pipeline'`
+   */
+  readonly displayName: string;
+  /**
+   * Short prose description of what this classifier detects.
+   *
+   * @example `'Detects toxic, hateful, or abusive language in text.'`
+   */
+  readonly description: string;
+  /**
+   * Identifier of the underlying model being used, typically a Hugging Face
+   * model ID or a local filesystem path.
+   *
+   * @example `'Xenova/toxic-bert'`
+   */
+  readonly modelId: string;
+  /**
+   * Whether the model weights have been fully loaded into memory and the
+   * classifier is ready to accept `classify()` calls.
+   *
+   * The pack initialiser sets this to `true` after the warm-up inference
+   * succeeds.  Callers can check this flag before calling `classify()` to
+   * avoid queueing calls during a slow model download.
+   *
+   * Note that `ClassifierOrchestrator.classifyAll()` does not inspect this
+   * flag; it calls `classify()` unconditionally and relies on the classifier
+   * to reject when it is not ready.
+   */
+  isLoaded: boolean;
+  /**
+   * Classify the provided text and return confidence scores for all candidate
+   * labels.
+   *
+   * The classifier is responsible for mapping raw model output to the
+   * {@link ClassificationResult} shape.  It should NOT apply thresholds or
+   * guardrail actions — that is the responsibility of the pack orchestrator.
+   *
+   * When driven by `ClassifierOrchestrator`, a rejected promise is logged
+   * with `console.warn` and treated as an implicit ALLOW rather than aborting
+   * the evaluation, so implementations should reject (not return a fake
+   * result) when inference cannot be performed.
+   *
+   * @param text - The text to classify.  May be a short chunk from a streaming
+   *   response or a complete message.  Must not be empty.
+   * @returns A promise that resolves with the classification result, including
+   *   the winning label (`bestClass`), its `confidence`, and `allScores` for
+   *   every label the model evaluated.
+   * @throws {Error} If the model is not loaded (`isLoaded === false`) or if
+   *   inference fails for an unrecoverable reason.
+   */
+  classify(text: string): Promise<ClassificationResult>;
+  /**
+   * Release all resources held by this classifier (model weights, WASM
+   * module, GPU buffers, worker threads, etc.).
+   *
+   * Called by the pack orchestrator during AgentOS shutdown or when the pack
+   * is unloaded.  Implementations should be idempotent — calling `dispose()`
+   * multiple times must not throw.
+   *
+   * `ClassifierOrchestrator.dispose()` awaits this through
+   * `Promise.allSettled`, so a rejection here is swallowed and does not stop
+   * the remaining classifiers from being disposed.
+   *
+   * @optional Classifiers that hold no persistent resources may omit this
+   *   method.
+   */
+  dispose?(): Promise<void>;
+}