npm - @framers/agentos-ext-ml-classifiers - Versions diffs - 0.1.0 - Mend

@framers/agentos-ext-ml-classifiers 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (62) hide show

package/LICENSE +23 -0
package/dist/ClassifierOrchestrator.d.ts +126 -0
package/dist/ClassifierOrchestrator.d.ts.map +1 -0
package/dist/ClassifierOrchestrator.js +239 -0
package/dist/ClassifierOrchestrator.js.map +1 -0
package/dist/IContentClassifier.d.ts +117 -0
package/dist/IContentClassifier.d.ts.map +1 -0
package/dist/IContentClassifier.js +22 -0
package/dist/IContentClassifier.js.map +1 -0
package/dist/MLClassifierGuardrail.d.ts +163 -0
package/dist/MLClassifierGuardrail.d.ts.map +1 -0
package/dist/MLClassifierGuardrail.js +335 -0
package/dist/MLClassifierGuardrail.js.map +1 -0
package/dist/SlidingWindowBuffer.d.ts +213 -0
package/dist/SlidingWindowBuffer.d.ts.map +1 -0
package/dist/SlidingWindowBuffer.js +246 -0
package/dist/SlidingWindowBuffer.js.map +1 -0
package/dist/classifiers/InjectionClassifier.d.ts +126 -0
package/dist/classifiers/InjectionClassifier.d.ts.map +1 -0
package/dist/classifiers/InjectionClassifier.js +210 -0
package/dist/classifiers/InjectionClassifier.js.map +1 -0
package/dist/classifiers/JailbreakClassifier.d.ts +124 -0
package/dist/classifiers/JailbreakClassifier.d.ts.map +1 -0
package/dist/classifiers/JailbreakClassifier.js +208 -0
package/dist/classifiers/JailbreakClassifier.js.map +1 -0
package/dist/classifiers/ToxicityClassifier.d.ts +125 -0
package/dist/classifiers/ToxicityClassifier.d.ts.map +1 -0
package/dist/classifiers/ToxicityClassifier.js +212 -0
package/dist/classifiers/ToxicityClassifier.js.map +1 -0
package/dist/classifiers/WorkerClassifierProxy.d.ts +158 -0
package/dist/classifiers/WorkerClassifierProxy.d.ts.map +1 -0
package/dist/classifiers/WorkerClassifierProxy.js +268 -0
package/dist/classifiers/WorkerClassifierProxy.js.map +1 -0
package/dist/index.d.ts +110 -0
package/dist/index.d.ts.map +1 -0
package/dist/index.js +342 -0
package/dist/index.js.map +1 -0
package/dist/tools/ClassifyContentTool.d.ts +105 -0
package/dist/tools/ClassifyContentTool.d.ts.map +1 -0
package/dist/tools/ClassifyContentTool.js +149 -0
package/dist/tools/ClassifyContentTool.js.map +1 -0
package/dist/types.d.ts +319 -0
package/dist/types.d.ts.map +1 -0
package/dist/types.js +62 -0
package/dist/types.js.map +1 -0
package/dist/worker/classifier-worker.d.ts +49 -0
package/dist/worker/classifier-worker.d.ts.map +1 -0
package/dist/worker/classifier-worker.js +180 -0
package/dist/worker/classifier-worker.js.map +1 -0
package/package.json +45 -0
package/src/ClassifierOrchestrator.ts +290 -0
package/src/IContentClassifier.ts +124 -0
package/src/MLClassifierGuardrail.ts +419 -0
package/src/SlidingWindowBuffer.ts +384 -0
package/src/classifiers/InjectionClassifier.ts +261 -0
package/src/classifiers/JailbreakClassifier.ts +259 -0
package/src/classifiers/ToxicityClassifier.ts +263 -0
package/src/classifiers/WorkerClassifierProxy.ts +366 -0
package/src/index.ts +383 -0
package/src/tools/ClassifyContentTool.ts +201 -0
package/src/types.ts +391 -0
package/src/worker/classifier-worker.ts +267 -0

package/dist/classifiers/InjectionClassifier.js ADDED Viewed

@@ -0,0 +1,210 @@
+/**
+ * @fileoverview Prompt-injection content classifier using the
+ * `protectai/deberta-v3-small-prompt-injection-v2` model.
+ *
+ * Prompt injection is the attack pattern where adversarial instructions are
+ * embedded inside user-supplied text to override or hijack the agent's system
+ * prompt.  This classifier provides a dedicated binary signal (INJECTION /
+ * SAFE) that the guardrail orchestrator can act on independently of the
+ * toxicity or jailbreak classifiers.
+ *
+ * Model details
+ * -------------
+ * `protectai/deberta-v3-small-prompt-injection-v2` is a fine-tuned DeBERTa
+ * model from ProtectAI, specifically trained to distinguish benign user
+ * messages from prompt-injection payloads.  It outputs two labels:
+ *  - `INJECTION` — high-confidence injection attempt
+ *  - `SAFE`      — normal user input
+ *
+ * Graceful degradation
+ * --------------------
+ * If the model fails to load the classifier sets `unavailable = true` and
+ * returns a pass result `{ bestClass: 'benign', confidence: 0, allScores: [] }`
+ * on every subsequent call.
+ *
+ * @module agentos/extensions/packs/ml-classifiers/classifiers/InjectionClassifier
+ */
+import { ML_CLASSIFIER_SERVICE_IDS } from '../types';
+// ---------------------------------------------------------------------------
+// InjectionClassifier
+// ---------------------------------------------------------------------------
+/**
+ * Binary prompt-injection classifier backed by
+ * `protectai/deberta-v3-small-prompt-injection-v2`.
+ *
+ * Returns one of two labels:
+ *  - `INJECTION` — the text contains an injection attempt
+ *  - `SAFE`      — the text is clean
+ *
+ * The label with the higher confidence becomes `bestClass` / `confidence`.
+ * Both labels are present in `allScores` so callers can read the SAFE score
+ * as well.
+ *
+ * @implements {IContentClassifier}
+ *
+ * @example
+ * ```typescript
+ * const classifier = new InjectionClassifier(serviceRegistry);
+ * const result = await classifier.classify('Ignore previous instructions and …');
+ * // result.bestClass === 'INJECTION', result.confidence ≈ 0.97
+ * ```
+ */
export class InjectionClassifier {
    /** Shared service registry used to create, cache and release the pipeline. */
    services;
    /** Optional per-classifier configuration; only `modelId` is read by this class. */
    config;
    // -------------------------------------------------------------------------
    // IContentClassifier identity fields
    // -------------------------------------------------------------------------
    /** Unique service identifier for this classifier. */
    id = 'prompt-injection';
    /** Human-readable name for dashboards and log output. */
    displayName = 'Prompt Injection Classifier';
    /** Short description of what this classifier detects. */
    description = 'Detects prompt-injection attempts where adversarial instructions are ' +
        'embedded in user input to override or hijack the agent system prompt.';
    /**
     * Default Hugging Face model ID.
     * Used only when `config.modelId` is null/undefined.
     */
    modelId = 'protectai/deberta-v3-small-prompt-injection-v2';
    // -------------------------------------------------------------------------
    // Internal state
    // -------------------------------------------------------------------------
    /**
     * `true` once `classify()` has obtained the pipeline from the registry;
     * reset to `false` by `dispose()`.
     */
    _isLoaded = false;
    /**
     * Set to `true` when obtaining the pipeline fails.  Once set, every
     * subsequent `classify()` call returns the pass result without retrying
     * the load.  Nothing in this class resets it.
     */
    unavailable = false;
    // -------------------------------------------------------------------------
    // Constructor
    // -------------------------------------------------------------------------
    /**
     * @param services - Shared service registry; must expose
     *   `getOrCreate(id, factory, options)` and `release(id)`.
     * @param config - Optional per-classifier configuration.  When
     *   `config.modelId` is provided it overrides the default `modelId` when
     *   loading the model.
     */
    constructor(services, config) {
        this.services = services;
        this.config = config;
    }
    // -------------------------------------------------------------------------
    // IContentClassifier.isLoaded (getter)
    // -------------------------------------------------------------------------
    /**
     * Whether the pipeline has been obtained from the registry.  Becomes
     * `true` during the first `classify()` call whose load succeeds and
     * `false` again after `dispose()`.
     */
    get isLoaded() {
        return this._isLoaded;
    }
    // -------------------------------------------------------------------------
    // classify
    // -------------------------------------------------------------------------
    /**
     * Run prompt-injection inference on `text`.
     *
     * Obtains the pipeline through the shared service registry (which is handed
     * a factory that dynamically imports `@huggingface/transformers`), then
     * asks the pipeline for the scores of every label.
     *
     * @param text - The text to evaluate.
     * @returns A promise that resolves with the classification result.  If the
     *   pipeline cannot be obtained the pass result is returned instead of
     *   throwing (fail-open).  Errors raised by inference itself are NOT
     *   caught here and reject the returned promise.
     */
    async classify(text) {
        // A previous load failure is sticky: never retry the expensive load.
        if (this.unavailable) {
            return this.passResult();
        }
        let pipeline;
        try {
            pipeline = await this.services.getOrCreate(ML_CLASSIFIER_SERVICE_IDS.INJECTION_PIPELINE, async () => {
                // Dynamic import so environments without @huggingface/transformers
                // can still load the rest of the package.
                const { pipeline: createPipeline } = await import('@huggingface/transformers');
                // NOTE(review): `quantized` looks like the legacy option name; newer
                // transformers.js releases appear to select precision via `dtype` —
                // confirm against the installed version before changing.
                return createPipeline('text-classification',
                // Honour a caller-supplied model override; fall back to the default.
                this.config?.modelId ?? this.modelId, { quantized: true });
            }, {
                /** Invoked by the registry to release the pipeline, if it exposes `dispose`. */
                dispose: async (p) => p?.dispose?.(),
                /** Tags attached to the registry entry. */
                tags: ['ml', 'classifier', 'prompt-injection', 'onnx'],
            });
            this._isLoaded = true;
        }
        catch (err) {
            // Fail open, but never silently: a disabled guardrail must leave a trace.
            this.unavailable = true;
            console.warn(`[${this.id}] classifier model failed to load; ` +
                'classification is disabled and every call will return a pass result.', err);
            return this.passResult();
        }
        // `top_k` is the option name read by `@huggingface/transformers`; the
        // legacy `topk` spelling is ignored there, which silently limited the
        // output to the single best label.  Both keys are sent so either runtime
        // returns the full score list.
        const raw = await pipeline(text, { top_k: null, topk: null });
        return this.mapResult(raw);
    }
    // -------------------------------------------------------------------------
    // dispose (optional IContentClassifier lifecycle hook)
    // -------------------------------------------------------------------------
    /**
     * Release the pipeline entry from the shared service registry and clear
     * the loaded flag.
     *
     * NOTE(review): repeat calls are only safe if the registry's `release()`
     * tolerates an already-released id — confirm against the registry.
     */
    async dispose() {
        await this.services.release(ML_CLASSIFIER_SERVICE_IDS.INJECTION_PIPELINE);
        this._isLoaded = false;
    }
    // -------------------------------------------------------------------------
    // Private helpers
    // -------------------------------------------------------------------------
    /**
     * Build the "pass" result used when the model is unavailable or the
     * pipeline returned nothing: `bestClass: 'benign'`, zero confidence and no
     * per-label scores.
     *
     * NOTE(review): this is intended to make the guardrail allow the content;
     * the orchestrator's handling is not visible from this file.
     */
    passResult() {
        return { bestClass: 'benign', confidence: 0, allScores: [] };
    }
    /**
     * Map the raw pipeline output to a classification result.
     *
     * The entry with the highest `score` becomes `bestClass` / `confidence`
     * (the first one wins on ties); every entry is copied into `allScores`.
     *
     * @param raw - Pipeline output.  Normally a flat array of
     *   `{ label, score }`; a single such object or a one-element batch
     *   (`[[...]]`) is also accepted.  Empty or missing output yields the
     *   pass result.
     */
    mapResult(raw) {
        if (!raw) {
            return this.passResult();
        }
        // Normalise the shapes a pipeline may hand back into one flat list.
        let items = Array.isArray(raw) ? raw : [raw];
        if (Array.isArray(items[0])) {
            items = items[0];
        }
        if (items.length === 0) {
            return this.passResult();
        }
        // Strict `>` keeps the earliest entry when scores tie.
        const best = items.reduce((top, item) => (item.score > top.score ? item : top), items[0]);
        return {
            bestClass: best.label,
            confidence: best.score,
            allScores: items.map((item) => ({
                classLabel: item.label,
                score: item.score,
            })),
        };
    }
}
+//# sourceMappingURL=InjectionClassifier.js.map

package/dist/classifiers/InjectionClassifier.js.map ADDED Viewed

@@ -0,0 +1 @@

+ {"version":3,"file":"InjectionClassifier.js","sourceRoot":"","sources":["../../src/classifiers/InjectionClassifier.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;;;;;;;;;GAyBG;AAMH,OAAO,EAAE,yBAAyB,EAAE,MAAM,UAAU,CAAC;AAiBrD,8EAA8E;AAC9E,sBAAsB;AACtB,8EAA8E;AAE9E;;;;;;;;;;;;;;;;;;;;GAoBG;AACH,MAAM,OAAO,mBAAmB;IAmDX;IACA;IAnDnB,4EAA4E;IAC5E,qCAAqC;IACrC,4EAA4E;IAE5E,qDAAqD;IAC5C,EAAE,GAAG,kBAAkB,CAAC;IAEjC,yDAAyD;IAChD,WAAW,GAAG,6BAA6B,CAAC;IAErD,yDAAyD;IAChD,WAAW,GAClB,uEAAuE;QACvE,uEAAuE,CAAC;IAE1E;;;OAGG;IACM,OAAO,GAAG,gDAAgD,CAAC;IAEpE,4EAA4E;IAC5E,iBAAiB;IACjB,4EAA4E;IAE5E;;;OAGG;IACK,SAAS,GAAG,KAAK,CAAC;IAE1B;;;;OAIG;IACK,WAAW,GAAG,KAAK,CAAC;IAE5B,4EAA4E;IAC5E,cAAc;IACd,4EAA4E;IAE5E;;;;;;OAMG;IACH,YACmB,QAAgC,EAChC,MAAyB;QADzB,aAAQ,GAAR,QAAQ,CAAwB;QAChC,WAAM,GAAN,MAAM,CAAmB;IACzC,CAAC;IAEJ,4EAA4E;IAC5E,uCAAuC;IACvC,4EAA4E;IAE5E;;;OAGG;IACH,IAAI,QAAQ;QACV,OAAO,IAAI,CAAC,SAAS,CAAC;IACxB,CAAC;IAED,4EAA4E;IAC5E,WAAW;IACX,4EAA4E;IAE5E;;;;;;;;;;OAUG;IACH,KAAK,CAAC,QAAQ,CAAC,IAAY;QACzB,6EAA6E;QAC7E,IAAI,IAAI,CAAC,WAAW,EAAE,CAAC;YACrB,OAAO,IAAI,CAAC,UAAU,EAAE,CAAC;QAC3B,CAAC;QAED,uEAAuE;QACvE,gEAAgE;QAChE,IAAI,QAAqE,CAAC;QAC1E,IAAI,CAAC;YACH,QAAQ,GAAG,MAAM,IAAI,CAAC,QAAQ,CAAC,WAAW,CACxC,yBAAyB,CAAC,kBAAkB,EAC5C,KAAK,IAAI,EAAE;gBACT,mEAAmE;gBACnE,sCAAsC;gBACtC,MAAM,EAAE,QAAQ,EAAE,cAAc,EAAE,GAAG,MAAM,MAAM,CAC/C,2BAA2B,CAC5B,CAAC;gBACF,OAAO,cAAc,CACnB,qBAAqB;gBACrB,qEAAqE;gBACrE,IAAI,CAAC,MAAM,EAAE,OAAO,IAAI,IAAI,CAAC,OAAO,EACpC,EAAE,SAAS,EAAE,IAAI,EAAE,CACpB,CAAC;YACJ,CAAC,EACD;gBACE,sEAAsE;gBACtE,OAAO,EAAE,KAAK,EAAE,CAAM,EAAE,EAAE,CAAC,CAAC,EAAE,OAAO,EAAE,EAAE;gBACzC,0DAA0D;gBAC1D,IAAI,EAAE,CAAC,IAAI,EAAE,YAAY,EAAE,kBAAkB,EAAE,MAAM,CAAC;aACvD,CACF,CAAC;YAEF,mEAAmE;YACnE,IAAI,CAAC,SAAS,GAAG,IAAI,CAAC;QACxB,CAAC;QAAC,MAAM,CAAC;YACP,yEAAyE;YACzE,IAAI,CAAC,WAAW,GAAG,IAAI,CAAC;YACxB,OAAO,IAAI,CAAC,UAAU,EAAE,CAAC;QAC3B,CAAC;QAED,+CAA+C;QAC/C,MAAM,GAAG,GAAG,MAAM,QAAQ,CAAC,IAAI,EAAE,EAAE,IAAI,EAAE,IAAI,EAAE,CAAC,CAAC;QACjD,OAAO,IAAI,CAAC,SAAS,CAAC,GAAG,CAAC,CAAC;IAC7B,CAAC;
IAED,4EAA4E;IAC5E,uDAAuD;IACvD,4EAA4E;IAE5E;;;;OAIG;IACH,KAAK,CAAC,OAAO;QACX,MAAM,IAAI,CAAC,QAAQ,CAAC,OAAO,CAAC,yBAAyB,CAAC,kBAAkB,CAAC,CAAC;QAC1E,IAAI,CAAC,SAAS,GAAG,KAAK,CAAC;IACzB,CAAC;IAED,4EAA4E;IAC5E,kBAAkB;IAClB,4EAA4E;IAE5E;;;;;OAKG;IACK,UAAU;QAChB,OAAO,EAAE,SAAS,EAAE,QAAQ,EAAE,UAAU,EAAE,CAAC,EAAE,SAAS,EAAE,EAAE,EAAE,CAAC;IAC/D,CAAC;IAED;;;;;;;;OAQG;IACK,SAAS,CAAC,GAAe;QAC/B,IAAI,CAAC,GAAG,IAAI,GAAG,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;YAC7B,OAAO,IAAI,CAAC,UAAU,EAAE,CAAC;QAC3B,CAAC;QAED,6EAA6E;QAC7E,IAAI,IAAI,GAAG,GAAG,CAAC,CAAC,CAAC,CAAC;QAClB,KAAK,MAAM,IAAI,IAAI,GAAG,EAAE,CAAC;YACvB,IAAI,IAAI,CAAC,KAAK,GAAG,IAAI,CAAC,KAAK,EAAE,CAAC;gBAC5B,IAAI,GAAG,IAAI,CAAC;YACd,CAAC;QACH,CAAC;QAED,OAAO;YACL,SAAS,EAAE,IAAI,CAAC,KAAK;YACrB,UAAU,EAAE,IAAI,CAAC,KAAK;YACtB,SAAS,EAAE,GAAG,CAAC,GAAG,CAAC,CAAC,IAAI,EAAE,EAAE,CAAC,CAAC;gBAC5B,UAAU,EAAE,IAAI,CAAC,KAAK;gBACtB,KAAK,EAAE,IAAI,CAAC,KAAK;aAClB,CAAC,CAAC;SACJ,CAAC;IACJ,CAAC;CACF"}

package/dist/classifiers/JailbreakClassifier.d.ts ADDED Viewed

@@ -0,0 +1,124 @@
+/**
+ * @fileoverview Jailbreak content classifier using Meta's `PromptGuard-86M`
+ * model.
+ *
+ * Jailbreak attempts are adversarial prompts specifically crafted to bypass
+ * an LLM's safety guidelines — e.g. "DAN mode", role-play exploits, or
+ * indirect instruction injections.  This classifier uses Meta's PromptGuard
+ * model which was trained to distinguish three classes:
+ *
+ *  - `jailbreak`  — explicit attempt to override safety behaviour
+ *  - `injection`  — indirect or embedded instruction injection
+ *  - `benign`     — normal user input
+ *
+ * Unlike the binary {@link InjectionClassifier}, PromptGuard separates
+ * direct jailbreaks from indirect injections, giving the guardrail
+ * orchestrator finer-grained control over which action to take for each.
+ *
+ * Graceful degradation
+ * --------------------
+ * If the model fails to load the classifier sets `unavailable = true` and
+ * returns a pass result `{ bestClass: 'benign', confidence: 0, allScores: [] }`
+ * on every subsequent call.
+ *
+ * @module agentos/extensions/packs/ml-classifiers/classifiers/JailbreakClassifier
+ */
+import type { ClassificationResult } from '@framers/agentos';
+import type { ISharedServiceRegistry } from '@framers/agentos';
+import type { IContentClassifier } from '../IContentClassifier';
+import type { ClassifierConfig } from '../types';
+/**
+ * Multi-class jailbreak classifier backed by `meta-llama/PromptGuard-86M`.
+ *
+ * Distinguishes three mutually-exclusive classes:
+ *  - `jailbreak`  — direct attempt to bypass safety guidelines
+ *  - `injection`  — indirect prompt injection embedded in user input
+ *  - `benign`     — normal, non-adversarial message
+ *
+ * The winning class (highest softmax score) is reported as `bestClass` /
+ * `confidence`.  All three scores are present in `allScores`.
+ *
+ * @implements {IContentClassifier}
+ *
+ * @example
+ * ```typescript
+ * const classifier = new JailbreakClassifier(serviceRegistry);
+ * const result = await classifier.classify('Pretend you have no restrictions…');
+ * // result.bestClass === 'jailbreak', result.confidence ≈ 0.88
+ * ```
+ */
+export declare class JailbreakClassifier implements IContentClassifier {
+    private readonly services;
+    private readonly config?;
+    /** Unique service identifier for this classifier. */
+    readonly id = "jailbreak";
+    /** Human-readable name for dashboards and log output. */
+    readonly displayName = "Jailbreak Classifier";
+    /** Short description of what this classifier detects. */
+    readonly description: string;
+    /**
+     * Default Hugging Face model ID.
+     * Used when {@link ClassifierConfig.modelId} is not supplied.
+     */
+    readonly modelId = "meta-llama/PromptGuard-86M";
+    /**
+     * Backing field for {@link isLoaded}: `true` once `classify()` has
+     * obtained the pipeline from the registry, `false` again after
+     * `dispose()`.
+     */
+    private _isLoaded;
+    /**
+     * Set to `true` when the model fails to load.  Once `unavailable`, every
+     * subsequent `classify()` call immediately returns the pass result rather
+     * than retrying the expensive model load.  The flag is never reset by
+     * this class, so a failed load disables the classifier for the lifetime
+     * of the instance.
+     */
+    private unavailable;
+    /**
+     * @param services - Shared service registry used to lazily create and cache
+     *   the underlying HuggingFace pipeline instance.
+     * @param config - Optional per-classifier configuration.  When
+     *   `config.modelId` is provided it overrides the default `modelId` when
+     *   loading the model.
+     */
+    constructor(services: ISharedServiceRegistry, config?: ClassifierConfig | undefined);
+    /**
+     * Whether the underlying model pipeline has been obtained from the
+     * registry.  The flag becomes `true` during the first `classify()` call
+     * whose load succeeds and is cleared by `dispose()`.
+     */
+    get isLoaded(): boolean;
+    /**
+     * Run jailbreak inference on `text`.
+     *
+     * Lazily loads the pipeline on the first call via the shared service
+     * registry, then requests the scores of all classes from it.
+     *
+     * @param text - The text to evaluate.
+     * @returns A promise that resolves with the classification result.  If the
+     *   model cannot be loaded the pass result is returned instead of
+     *   throwing (fail-open).  Errors raised while running inference on an
+     *   already-loaded pipeline are not caught and reject the promise.
+     */
+    classify(text: string): Promise<ClassificationResult>;
+    /**
+     * Release the pipeline instance from the shared service registry and
+     * clear the loaded flag.
+     *
+     * NOTE(review): described as idempotent, but that depends on the
+     * registry's `release()` tolerating repeat calls — confirm.
+     */
+    dispose(): Promise<void>;
+    /**
+     * Returns a "pass" result used when the model is unavailable or the
+     * pipeline returned no scores.
+     *
+     * A pass result reports `bestClass: 'benign'` with zero confidence and an
+     * empty `allScores`.  NOTE(review): this is intended to make the
+     * guardrail orchestrator choose {@link GuardrailAction.ALLOW}; the
+     * orchestrator's handling is not visible from this declaration.
+     */
+    private passResult;
+    /**
+     * Map the raw pipeline output to a {@link ClassificationResult}.
+     *
+     * The label with the highest score becomes `bestClass` / `confidence`
+     * (the first entry wins on ties).  Every label the pipeline returned is
+     * included in `allScores`.
+     *
+     * @param raw - Array of `{ label, score }` entries returned by the
+     *   pipeline.  An empty or missing array yields the pass result.
+     */
+    private mapResult;
+}
+//# sourceMappingURL=JailbreakClassifier.d.ts.map

package/dist/classifiers/JailbreakClassifier.d.ts.map ADDED Viewed

@@ -0,0 +1 @@

+ {"version":3,"file":"JailbreakClassifier.d.ts","sourceRoot":"","sources":["../../src/classifiers/JailbreakClassifier.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;;;;;;;;GAwBG;AAEH,OAAO,KAAK,EAAE,oBAAoB,EAAE,MAAM,kBAAkB,CAAC;AAC7D,OAAO,KAAK,EAAE,sBAAsB,EAAE,MAAM,kBAAkB,CAAC;AAC/D,OAAO,KAAK,EAAE,kBAAkB,EAAE,MAAM,uBAAuB,CAAC;AAChE,OAAO,KAAK,EAAE,gBAAgB,EAAE,MAAM,UAAU,CAAC;AAsBjD;;;;;;;;;;;;;;;;;;;GAmBG;AACH,qBAAa,mBAAoB,YAAW,kBAAkB;IAmD1D,OAAO,CAAC,QAAQ,CAAC,QAAQ;IACzB,OAAO,CAAC,QAAQ,CAAC,MAAM,CAAC;IA/C1B,qDAAqD;IACrD,QAAQ,CAAC,EAAE,eAAe;IAE1B,yDAAyD;IACzD,QAAQ,CAAC,WAAW,0BAA0B;IAE9C,yDAAyD;IACzD,QAAQ,CAAC,WAAW,SAEoC;IAExD;;;OAGG;IACH,QAAQ,CAAC,OAAO,gCAAgC;IAMhD;;;OAGG;IACH,OAAO,CAAC,SAAS,CAAS;IAE1B;;;;OAIG;IACH,OAAO,CAAC,WAAW,CAAS;IAM5B;;;;;;OAMG;gBAEgB,QAAQ,EAAE,sBAAsB,EAChC,MAAM,CAAC,EAAE,gBAAgB,YAAA;IAO5C;;;OAGG;IACH,IAAI,QAAQ,IAAI,OAAO,CAEtB;IAMD;;;;;;;;;;OAUG;IACG,QAAQ,CAAC,IAAI,EAAE,MAAM,GAAG,OAAO,CAAC,oBAAoB,CAAC;IAkD3D;;;;OAIG;IACG,OAAO,IAAI,OAAO,CAAC,IAAI,CAAC;IAS9B;;;;;OAKG;IACH,OAAO,CAAC,UAAU;IAIlB;;;;;;;;OAQG;IACH,OAAO,CAAC,SAAS;CAsBlB"}

package/dist/classifiers/JailbreakClassifier.js ADDED Viewed

@@ -0,0 +1,208 @@
+/**
+ * @fileoverview Jailbreak content classifier using Meta's `PromptGuard-86M`
+ * model.
+ *
+ * Jailbreak attempts are adversarial prompts specifically crafted to bypass
+ * an LLM's safety guidelines — e.g. "DAN mode", role-play exploits, or
+ * indirect instruction injections.  This classifier uses Meta's PromptGuard
+ * model which was trained to distinguish three classes:
+ *
+ *  - `jailbreak`  — explicit attempt to override safety behaviour
+ *  - `injection`  — indirect or embedded instruction injection
+ *  - `benign`     — normal user input
+ *
+ * Unlike the binary {@link InjectionClassifier}, PromptGuard separates
+ * direct jailbreaks from indirect injections, giving the guardrail
+ * orchestrator finer-grained control over which action to take for each.
+ *
+ * Graceful degradation
+ * --------------------
+ * If the model fails to load the classifier sets `unavailable = true` and
+ * returns a pass result `{ bestClass: 'benign', confidence: 0, allScores: [] }`
+ * on every subsequent call.
+ *
+ * @module agentos/extensions/packs/ml-classifiers/classifiers/JailbreakClassifier
+ */
+import { ML_CLASSIFIER_SERVICE_IDS } from '../types';
+// ---------------------------------------------------------------------------
+// JailbreakClassifier
+// ---------------------------------------------------------------------------
+/**
+ * Multi-class jailbreak classifier backed by `meta-llama/PromptGuard-86M`.
+ *
+ * Distinguishes three mutually-exclusive classes:
+ *  - `jailbreak`  — direct attempt to bypass safety guidelines
+ *  - `injection`  — indirect prompt injection embedded in user input
+ *  - `benign`     — normal, non-adversarial message
+ *
+ * The winning class (highest softmax score) is reported as `bestClass` /
+ * `confidence`.  All three scores are present in `allScores`.
+ *
+ * @implements {IContentClassifier}
+ *
+ * @example
+ * ```typescript
+ * const classifier = new JailbreakClassifier(serviceRegistry);
+ * const result = await classifier.classify('Pretend you have no restrictions…');
+ * // result.bestClass === 'jailbreak', result.confidence ≈ 0.88
+ * ```
+ */
export class JailbreakClassifier {
    /** Shared service registry used to create, cache and release the pipeline. */
    services;
    /** Optional per-classifier configuration; only `modelId` is read by this class. */
    config;
    // -------------------------------------------------------------------------
    // IContentClassifier identity fields
    // -------------------------------------------------------------------------
    /** Unique service identifier for this classifier. */
    id = 'jailbreak';
    /** Human-readable name for dashboards and log output. */
    displayName = 'Jailbreak Classifier';
    /** Short description of what this classifier detects. */
    description = 'Detects jailbreak and indirect injection attacks using Meta PromptGuard. ' +
        'Classifies text as jailbreak, injection, or benign.';
    /**
     * Default Hugging Face model ID.
     * Used only when `config.modelId` is null/undefined.
     */
    modelId = 'meta-llama/PromptGuard-86M';
    // -------------------------------------------------------------------------
    // Internal state
    // -------------------------------------------------------------------------
    /**
     * `true` once `classify()` has obtained the pipeline from the registry;
     * reset to `false` by `dispose()`.
     */
    _isLoaded = false;
    /**
     * Set to `true` when obtaining the pipeline fails.  Once set, every
     * subsequent `classify()` call returns the pass result without retrying
     * the load.  Nothing in this class resets it.
     */
    unavailable = false;
    // -------------------------------------------------------------------------
    // Constructor
    // -------------------------------------------------------------------------
    /**
     * @param services - Shared service registry; must expose
     *   `getOrCreate(id, factory, options)` and `release(id)`.
     * @param config - Optional per-classifier configuration.  When
     *   `config.modelId` is provided it overrides the default `modelId` when
     *   loading the model.
     */
    constructor(services, config) {
        this.services = services;
        this.config = config;
    }
    // -------------------------------------------------------------------------
    // IContentClassifier.isLoaded (getter)
    // -------------------------------------------------------------------------
    /**
     * Whether the pipeline has been obtained from the registry.  Becomes
     * `true` during the first `classify()` call whose load succeeds and
     * `false` again after `dispose()`.
     */
    get isLoaded() {
        return this._isLoaded;
    }
    // -------------------------------------------------------------------------
    // classify
    // -------------------------------------------------------------------------
    /**
     * Run jailbreak inference on `text`.
     *
     * Obtains the pipeline through the shared service registry (which is handed
     * a factory that dynamically imports `@huggingface/transformers`), then
     * asks the pipeline for the scores of every class.
     *
     * @param text - The text to evaluate.
     * @returns A promise that resolves with the classification result.  If the
     *   pipeline cannot be obtained the pass result is returned instead of
     *   throwing (fail-open).  Errors raised by inference itself are NOT
     *   caught here and reject the returned promise.
     */
    async classify(text) {
        // A previous load failure is sticky: never retry the expensive load.
        if (this.unavailable) {
            return this.passResult();
        }
        let pipeline;
        try {
            pipeline = await this.services.getOrCreate(ML_CLASSIFIER_SERVICE_IDS.JAILBREAK_PIPELINE, async () => {
                // Dynamic import so environments without @huggingface/transformers
                // can still load the rest of the package.
                const { pipeline: createPipeline } = await import('@huggingface/transformers');
                // NOTE(review): `quantized` looks like the legacy option name; newer
                // transformers.js releases appear to select precision via `dtype` —
                // confirm against the installed version before changing.
                return createPipeline('text-classification',
                // Honour a caller-supplied model override; fall back to the default.
                this.config?.modelId ?? this.modelId, { quantized: true });
            }, {
                /** Invoked by the registry to release the pipeline, if it exposes `dispose`. */
                dispose: async (p) => p?.dispose?.(),
                /** Tags attached to the registry entry. */
                tags: ['ml', 'classifier', 'jailbreak', 'onnx'],
            });
            this._isLoaded = true;
        }
        catch (err) {
            // Fail open, but never silently: a disabled guardrail must leave a trace.
            this.unavailable = true;
            console.warn(`[${this.id}] classifier model failed to load; ` +
                'classification is disabled and every call will return a pass result.', err);
            return this.passResult();
        }
        // `top_k` is the option name read by `@huggingface/transformers`; the
        // legacy `topk` spelling is ignored there, which silently limited the
        // output to the single best class.  Both keys are sent so either runtime
        // returns the full score list.
        const raw = await pipeline(text, { top_k: null, topk: null });
        return this.mapResult(raw);
    }
    // -------------------------------------------------------------------------
    // dispose (optional IContentClassifier lifecycle hook)
    // -------------------------------------------------------------------------
    /**
     * Release the pipeline entry from the shared service registry and clear
     * the loaded flag.
     *
     * NOTE(review): repeat calls are only safe if the registry's `release()`
     * tolerates an already-released id — confirm against the registry.
     */
    async dispose() {
        await this.services.release(ML_CLASSIFIER_SERVICE_IDS.JAILBREAK_PIPELINE);
        this._isLoaded = false;
    }
    // -------------------------------------------------------------------------
    // Private helpers
    // -------------------------------------------------------------------------
    /**
     * Build the "pass" result used when the model is unavailable or the
     * pipeline returned nothing: `bestClass: 'benign'`, zero confidence and no
     * per-class scores.
     *
     * NOTE(review): this is intended to make the guardrail allow the content;
     * the orchestrator's handling is not visible from this file.
     */
    passResult() {
        return { bestClass: 'benign', confidence: 0, allScores: [] };
    }
    /**
     * Map the raw pipeline output to a classification result.
     *
     * The entry with the highest `score` becomes `bestClass` / `confidence`
     * (the first one wins on ties); every entry is copied into `allScores`.
     *
     * @param raw - Pipeline output.  Normally a flat array of
     *   `{ label, score }`; a single such object or a one-element batch
     *   (`[[...]]`) is also accepted.  Empty or missing output yields the
     *   pass result.
     */
    mapResult(raw) {
        if (!raw) {
            return this.passResult();
        }
        // Normalise the shapes a pipeline may hand back into one flat list.
        let items = Array.isArray(raw) ? raw : [raw];
        if (Array.isArray(items[0])) {
            items = items[0];
        }
        if (items.length === 0) {
            return this.passResult();
        }
        // Strict `>` keeps the earliest entry when scores tie.
        const best = items.reduce((top, item) => (item.score > top.score ? item : top), items[0]);
        return {
            bestClass: best.label,
            confidence: best.score,
            allScores: items.map((item) => ({
                classLabel: item.label,
                score: item.score,
            })),
        };
    }
}
+//# sourceMappingURL=JailbreakClassifier.js.map

package/dist/classifiers/JailbreakClassifier.js.map ADDED Viewed

@@ -0,0 +1 @@

+ {"version":3,"file":"JailbreakClassifier.js","sourceRoot":"","sources":["../../src/classifiers/JailbreakClassifier.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;;;;;;;;GAwBG;AAMH,OAAO,EAAE,yBAAyB,EAAE,MAAM,UAAU,CAAC;AAiBrD,8EAA8E;AAC9E,sBAAsB;AACtB,8EAA8E;AAE9E;;;;;;;;;;;;;;;;;;;GAmBG;AACH,MAAM,OAAO,mBAAmB;IAmDX;IACA;IAnDnB,4EAA4E;IAC5E,qCAAqC;IACrC,4EAA4E;IAE5E,qDAAqD;IAC5C,EAAE,GAAG,WAAW,CAAC;IAE1B,yDAAyD;IAChD,WAAW,GAAG,sBAAsB,CAAC;IAE9C,yDAAyD;IAChD,WAAW,GAClB,2EAA2E;QAC3E,qDAAqD,CAAC;IAExD;;;OAGG;IACM,OAAO,GAAG,4BAA4B,CAAC;IAEhD,4EAA4E;IAC5E,iBAAiB;IACjB,4EAA4E;IAE5E;;;OAGG;IACK,SAAS,GAAG,KAAK,CAAC;IAE1B;;;;OAIG;IACK,WAAW,GAAG,KAAK,CAAC;IAE5B,4EAA4E;IAC5E,cAAc;IACd,4EAA4E;IAE5E;;;;;;OAMG;IACH,YACmB,QAAgC,EAChC,MAAyB;QADzB,aAAQ,GAAR,QAAQ,CAAwB;QAChC,WAAM,GAAN,MAAM,CAAmB;IACzC,CAAC;IAEJ,4EAA4E;IAC5E,uCAAuC;IACvC,4EAA4E;IAE5E;;;OAGG;IACH,IAAI,QAAQ;QACV,OAAO,IAAI,CAAC,SAAS,CAAC;IACxB,CAAC;IAED,4EAA4E;IAC5E,WAAW;IACX,4EAA4E;IAE5E;;;;;;;;;;OAUG;IACH,KAAK,CAAC,QAAQ,CAAC,IAAY;QACzB,6EAA6E;QAC7E,IAAI,IAAI,CAAC,WAAW,EAAE,CAAC;YACrB,OAAO,IAAI,CAAC,UAAU,EAAE,CAAC;QAC3B,CAAC;QAED,qEAAqE;QACrE,gEAAgE;QAChE,IAAI,QAAqE,CAAC;QAC1E,IAAI,CAAC;YACH,QAAQ,GAAG,MAAM,IAAI,CAAC,QAAQ,CAAC,WAAW,CACxC,yBAAyB,CAAC,kBAAkB,EAC5C,KAAK,IAAI,EAAE;gBACT,kEAAkE;gBAClE,8DAA8D;gBAC9D,MAAM,EAAE,QAAQ,EAAE,cAAc,EAAE,GAAG,MAAM,MAAM,CAC/C,2BAA2B,CAC5B,CAAC;gBACF,OAAO,cAAc,CACnB,qBAAqB;gBACrB,qEAAqE;gBACrE,IAAI,CAAC,MAAM,EAAE,OAAO,IAAI,IAAI,CAAC,OAAO,EACpC,EAAE,SAAS,EAAE,IAAI,EAAE,CACpB,CAAC;YACJ,CAAC,EACD;gBACE,sEAAsE;gBACtE,OAAO,EAAE,KAAK,EAAE,CAAM,EAAE,EAAE,CAAC,CAAC,EAAE,OAAO,EAAE,EAAE;gBACzC,0DAA0D;gBAC1D,IAAI,EAAE,CAAC,IAAI,EAAE,YAAY,EAAE,WAAW,EAAE,MAAM,CAAC;aAChD,CACF,CAAC;YAEF,mEAAmE;YACnE,IAAI,CAAC,SAAS,GAAG,IAAI,CAAC;QACxB,CAAC;QAAC,MAAM,CAAC;YACP,yEAAyE;YACzE,IAAI,CAAC,WAAW,GAAG,IAAI,CAAC;YACxB,OAAO,IAAI,CAAC,UAAU,EAAE,CAAC;QAC3B,CAAC;QAED,0DAA0D;QAC1D,MAAM,GAAG,GAAG,MAAM,QAAQ,CAAC,IAAI,EAAE,EAAE,IAAI,EAAE,IAAI,EAAE,CAAC,CAAC;QACjD,OAAO,IAAI,CAAC,SAAS,CAAC,GAAG,CAAC,CAAC;IAC7B,CAAC;IAED,4
EAA4E;IAC5E,uDAAuD;IACvD,4EAA4E;IAE5E;;;;OAIG;IACH,KAAK,CAAC,OAAO;QACX,MAAM,IAAI,CAAC,QAAQ,CAAC,OAAO,CAAC,yBAAyB,CAAC,kBAAkB,CAAC,CAAC;QAC1E,IAAI,CAAC,SAAS,GAAG,KAAK,CAAC;IACzB,CAAC;IAED,4EAA4E;IAC5E,kBAAkB;IAClB,4EAA4E;IAE5E;;;;;OAKG;IACK,UAAU;QAChB,OAAO,EAAE,SAAS,EAAE,QAAQ,EAAE,UAAU,EAAE,CAAC,EAAE,SAAS,EAAE,EAAE,EAAE,CAAC;IAC/D,CAAC;IAED;;;;;;;;OAQG;IACK,SAAS,CAAC,GAAe;QAC/B,IAAI,CAAC,GAAG,IAAI,GAAG,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;YAC7B,OAAO,IAAI,CAAC,UAAU,EAAE,CAAC;QAC3B,CAAC;QAED,kEAAkE;QAClE,IAAI,IAAI,GAAG,GAAG,CAAC,CAAC,CAAC,CAAC;QAClB,KAAK,MAAM,IAAI,IAAI,GAAG,EAAE,CAAC;YACvB,IAAI,IAAI,CAAC,KAAK,GAAG,IAAI,CAAC,KAAK,EAAE,CAAC;gBAC5B,IAAI,GAAG,IAAI,CAAC;YACd,CAAC;QACH,CAAC;QAED,OAAO;YACL,SAAS,EAAE,IAAI,CAAC,KAAK;YACrB,UAAU,EAAE,IAAI,CAAC,KAAK;YACtB,SAAS,EAAE,GAAG,CAAC,GAAG,CAAC,CAAC,IAAI,EAAE,EAAE,CAAC,CAAC;gBAC5B,UAAU,EAAE,IAAI,CAAC,KAAK;gBACtB,KAAK,EAAE,IAAI,CAAC,KAAK;aAClB,CAAC,CAAC;SACJ,CAAC;IACJ,CAAC;CACF"}