npm - @framers/agentos-ext-ml-classifiers - Versions diffs - 0.2.1 → 0.3.1 - Mend

@framers/agentos-ext-ml-classifiers 0.2.1 → 0.3.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (69) hide show

package/.github/workflows/ci.yml +20 -0
package/.github/workflows/release.yml +37 -0
package/.releaserc.json +9 -0
package/LICENSE +96 -21
package/README.md +72 -0
package/dist/MLClassifierGuardrail.d.ts.map +1 -1
package/dist/MLClassifierGuardrail.js +14 -6
package/dist/MLClassifierGuardrail.js.map +1 -1
package/dist/index.js +3 -3
package/dist/keyword-classifier.js +1 -1
package/dist/llm-classifier.js +1 -1
package/package.json +5 -13
package/scripts/fix-esm-imports.mjs +181 -0
package/src/MLClassifierGuardrail.ts +38 -5
package/test/llm-tier.spec.ts +267 -0
package/test/ml-classifiers.spec.ts +57 -0
package/test/onnx-tier.spec.ts +255 -0
package/test/tier-fallthrough.spec.ts +185 -0
package/vitest.config.ts +18 -7
package/CHANGELOG.md +0 -18
package/dist/ClassifierOrchestrator.d.ts +0 -126
package/dist/ClassifierOrchestrator.d.ts.map +0 -1
package/dist/ClassifierOrchestrator.js +0 -239
package/dist/ClassifierOrchestrator.js.map +0 -1
package/dist/IContentClassifier.d.ts +0 -117
package/dist/IContentClassifier.d.ts.map +0 -1
package/dist/IContentClassifier.js +0 -22
package/dist/IContentClassifier.js.map +0 -1
package/dist/SlidingWindowBuffer.d.ts +0 -213
package/dist/SlidingWindowBuffer.d.ts.map +0 -1
package/dist/SlidingWindowBuffer.js +0 -246
package/dist/SlidingWindowBuffer.js.map +0 -1
package/dist/classifiers/InjectionClassifier.d.ts +0 -126
package/dist/classifiers/InjectionClassifier.d.ts.map +0 -1
package/dist/classifiers/InjectionClassifier.js +0 -210
package/dist/classifiers/InjectionClassifier.js.map +0 -1
package/dist/classifiers/JailbreakClassifier.d.ts +0 -124
package/dist/classifiers/JailbreakClassifier.d.ts.map +0 -1
package/dist/classifiers/JailbreakClassifier.js +0 -208
package/dist/classifiers/JailbreakClassifier.js.map +0 -1
package/dist/classifiers/ToxicityClassifier.d.ts +0 -125
package/dist/classifiers/ToxicityClassifier.d.ts.map +0 -1
package/dist/classifiers/ToxicityClassifier.js +0 -212
package/dist/classifiers/ToxicityClassifier.js.map +0 -1
package/dist/classifiers/WorkerClassifierProxy.d.ts +0 -158
package/dist/classifiers/WorkerClassifierProxy.d.ts.map +0 -1
package/dist/classifiers/WorkerClassifierProxy.js +0 -268
package/dist/classifiers/WorkerClassifierProxy.js.map +0 -1
package/dist/worker/classifier-worker.d.ts +0 -49
package/dist/worker/classifier-worker.d.ts.map +0 -1
package/dist/worker/classifier-worker.js +0 -180
package/dist/worker/classifier-worker.js.map +0 -1
package/src/ClassifierOrchestrator.ts +0 -290
package/src/IContentClassifier.ts +0 -124
package/src/SlidingWindowBuffer.ts +0 -384
package/src/classifiers/InjectionClassifier.ts +0 -261
package/src/classifiers/JailbreakClassifier.ts +0 -259
package/src/classifiers/ToxicityClassifier.ts +0 -263
package/src/classifiers/WorkerClassifierProxy.ts +0 -366
package/src/worker/classifier-worker.ts +0 -267
package/test/ClassifierOrchestrator.spec.ts +0 -365
package/test/ClassifyContentTool.spec.ts +0 -226
package/test/InjectionClassifier.spec.ts +0 -263
package/test/JailbreakClassifier.spec.ts +0 -295
package/test/MLClassifierGuardrail.spec.ts +0 -486
package/test/SlidingWindowBuffer.spec.ts +0 -391
package/test/ToxicityClassifier.spec.ts +0 -268
package/test/WorkerClassifierProxy.spec.ts +0 -303
package/test/index.spec.ts +0 -431

package/src/MLClassifierGuardrail.ts CHANGED Viewed

@@ -27,6 +27,7 @@ import type {
   GuardrailInputPayload,
   GuardrailOutputPayload,
   GuardrailEvaluationResult,
+  AgentOSFinalResponseChunk,
 } from '@framers/agentos';
 import { GuardrailAction } from '@framers/agentos';
 import { AgentOSResponseChunkType } from '@framers/agentos';
@@ -40,6 +41,27 @@ import { ALL_CATEGORIES } from './types';
 import { classifyByKeywords } from './keyword-classifier';
 import { classifyByLlm } from './llm-classifier';
+// ---------------------------------------------------------------------------
+// HuggingFace / ONNX pipeline types
+// ---------------------------------------------------------------------------
+/**
+ * A single label + score pair returned by a HuggingFace text-classification
+ * pipeline.  The `label` is the model's class name (e.g. `"toxic"`,
+ * `"obscene"`) and `score` is the softmax probability in [0, 1].
+ */
+interface OnnxClassificationLabel {
+  label: string;
+  score: number;
+}
+/**
+ * Callable returned by `pipeline('text-classification', ...)` from the
+ * `@huggingface/transformers` package.  Returns all label scores for the
+ * given text.
+ */
+type OnnxTextClassificationPipeline = (text: string) => Promise<OnnxClassificationLabel[]>;
 // ---------------------------------------------------------------------------
 // MLClassifierGuardrail
 // ---------------------------------------------------------------------------
@@ -91,7 +113,7 @@ export class MLClassifierGuardrail implements IGuardrailService {
    * `null` means we already tried and failed to load the module.
    * `undefined` means we have not tried yet.
    */
-  private onnxPipeline: any | null | undefined = undefined;
+  private onnxPipeline: OnnxTextClassificationPipeline | null | undefined = undefined;
   // -----------------------------------------------------------------------
   // Constructor
@@ -159,7 +181,8 @@ export class MLClassifierGuardrail implements IGuardrailService {
       return null;
     }
-    const text = (chunk as any).text ?? (chunk as any).content ?? '';
+    const finalChunk = chunk as AgentOSFinalResponseChunk;
+    const text = finalChunk.finalResponseText ?? '';
     if (typeof text !== 'string' || text.length === 0) return null;
     const result = await this.classify(text);
@@ -222,11 +245,21 @@ export class MLClassifierGuardrail implements IGuardrailService {
       try {
         // Dynamic import so the optional dependency does not fail at boot.
         const transformers = await import('@huggingface/transformers');
-        this.onnxPipeline = await transformers.pipeline(
+        const pipelineInstance = await transformers.pipeline(
           'text-classification',
           'Xenova/toxic-bert',
           { device: 'cpu' }
         );
+        // The HuggingFace pipeline is callable as a function. We always
+        // request all labels (top_k higher than any model's label count
+        // causes HF to return every label, matching `top_k: null`).
+        // Cast through unknown because the Pipeline union type is too
+        // wide for the inferred call signature here.
+        const callable = pipelineInstance as unknown as (
+          text: string,
+          opts: { top_k: number },
+        ) => Promise<OnnxClassificationLabel[]>;
+        this.onnxPipeline = (text: string) => callable(text, { top_k: 9999 });
       } catch {
         // Module not installed or model load failed — mark as unavailable.
         this.onnxPipeline = null;
@@ -235,7 +268,7 @@ export class MLClassifierGuardrail implements IGuardrailService {
     }
     try {
-      const raw = await this.onnxPipeline(text, { topk: null });
+      const raw = await this.onnxPipeline(text);
       // Map ONNX labels to our categories.
       const scores = this.mapOnnxScores(raw);
@@ -263,7 +296,7 @@ export class MLClassifierGuardrail implements IGuardrailService {
    *
    * @internal
    */
-  private mapOnnxScores(raw: any[]): CategoryScore[] {
+  private mapOnnxScores(raw: OnnxClassificationLabel[]): CategoryScore[] {
     /** Map of ONNX label -> our category. */
     const labelMap: Record<string, ClassifierCategory> = {
       toxic: 'toxic',

package/test/llm-tier.spec.ts ADDED Viewed

@@ -0,0 +1,267 @@
+/**
+ * @file llm-tier.spec.ts
+ * @description Tests for the LLM-as-judge (Tier 2) classification path.
+ *
+ * Exercises `classifyByLlm()` directly — verifying that the structured
+ * classification prompt is sent to the invoker, JSON and markdown-wrapped JSON
+ * are parsed correctly, and failures produce zero-confidence scores.
+ */
+import { describe, it, expect, vi, beforeEach } from 'vitest';
+// Force ONNX unavailable so classify() falls through to LLM tier
+vi.mock('@huggingface/transformers', () => {
+  throw new Error('ONNX not available');
+});
+import { classifyByLlm } from '../src/llm-classifier';
+import type { LlmInvoker, ClassifierCategory } from '../src/types';
+import { MLClassifierGuardrail } from '../src/MLClassifierGuardrail';
+// ---------------------------------------------------------------------------
+// Tests — classifyByLlm directly
+// ---------------------------------------------------------------------------
+describe('LLM tier — classifyByLlm()', () => {
+  // -----------------------------------------------------------------------
+  // Prompt verification
+  // -----------------------------------------------------------------------
+  describe('prompt sent to invoker', () => {
+    it('sends a system prompt and the user text to the invoker', async () => {
+      const invoker: LlmInvoker = vi.fn().mockResolvedValue(
+        JSON.stringify({
+          toxic: false,
+          injection: false,
+          nsfw: false,
+          threat: false,
+          confidence: 0.9,
+        })
+      );
+      await classifyByLlm('Hello world', invoker);
+      expect(invoker).toHaveBeenCalledTimes(1);
+      const [systemPrompt, userMessage] = (invoker as any).mock.calls[0];
+      expect(typeof systemPrompt).toBe('string');
+      expect(systemPrompt).toContain('content safety classifier');
+      expect(userMessage).toBe('Hello world');
+    });
+  });
+  // -----------------------------------------------------------------------
+  // JSON parsing
+  // -----------------------------------------------------------------------
+  describe('JSON response parsing', () => {
+    it('parses a clean JSON response into category scores', async () => {
+      const invoker: LlmInvoker = vi.fn().mockResolvedValue(
+        JSON.stringify({
+          toxic: true,
+          injection: false,
+          nsfw: false,
+          threat: true,
+          confidence: 0.85,
+        })
+      );
+      const scores = await classifyByLlm('some bad text', invoker);
+      expect(scores).toHaveLength(4);
+      const toxic = scores.find((s) => s.name === 'toxic');
+      expect(toxic?.confidence).toBe(0.85);
+      const injection = scores.find((s) => s.name === 'injection');
+      expect(injection?.confidence).toBe(0);
+      const nsfw = scores.find((s) => s.name === 'nsfw');
+      expect(nsfw?.confidence).toBe(0);
+      const threat = scores.find((s) => s.name === 'threat');
+      expect(threat?.confidence).toBe(0.85);
+    });
+    it('uses default confidence (0.7) when confidence is omitted', async () => {
+      const invoker: LlmInvoker = vi
+        .fn()
+        .mockResolvedValue(
+          JSON.stringify({ toxic: true, injection: false, nsfw: false, threat: false })
+        );
+      const scores = await classifyByLlm('abusive text', invoker);
+      const toxic = scores.find((s) => s.name === 'toxic');
+      expect(toxic?.confidence).toBe(0.7);
+    });
+    it('clamps confidence to [0, 1]', async () => {
+      const invoker: LlmInvoker = vi.fn().mockResolvedValue(
+        JSON.stringify({
+          toxic: true,
+          injection: false,
+          nsfw: false,
+          threat: false,
+          confidence: 5.0,
+        })
+      );
+      const scores = await classifyByLlm('test', invoker);
+      const toxic = scores.find((s) => s.name === 'toxic');
+      expect(toxic?.confidence).toBeLessThanOrEqual(1.0);
+    });
+  });
+  // -----------------------------------------------------------------------
+  // Markdown-wrapped JSON
+  // -----------------------------------------------------------------------
+  describe('markdown-wrapped JSON handling', () => {
+    it('strips ```json fences before parsing', async () => {
+      const invoker: LlmInvoker = vi
+        .fn()
+        .mockResolvedValue(
+          '```json\n{"toxic": true, "injection": false, "nsfw": false, "threat": false, "confidence": 0.9}\n```'
+        );
+      const scores = await classifyByLlm('wrapped response', invoker);
+      const toxic = scores.find((s) => s.name === 'toxic');
+      expect(toxic?.confidence).toBe(0.9);
+    });
+    it('strips bare ``` fences (no language tag)', async () => {
+      const invoker: LlmInvoker = vi
+        .fn()
+        .mockResolvedValue(
+          '```\n{"toxic": false, "injection": true, "nsfw": false, "threat": false, "confidence": 0.75}\n```'
+        );
+      const scores = await classifyByLlm('injection attempt', invoker);
+      const injection = scores.find((s) => s.name === 'injection');
+      expect(injection?.confidence).toBe(0.75);
+    });
+    it('handles trailing commas in LLM output', async () => {
+      const invoker: LlmInvoker = vi
+        .fn()
+        .mockResolvedValue(
+          '{"toxic": true, "injection": false, "nsfw": false, "threat": false, "confidence": 0.8,}'
+        );
+      const scores = await classifyByLlm('trailing comma', invoker);
+      const toxic = scores.find((s) => s.name === 'toxic');
+      expect(toxic?.confidence).toBe(0.8);
+    });
+  });
+  // -----------------------------------------------------------------------
+  // Failure modes
+  // -----------------------------------------------------------------------
+  describe('failure handling', () => {
+    it('returns zero scores when invoker throws', async () => {
+      const invoker: LlmInvoker = vi.fn().mockRejectedValue(new Error('LLM unavailable'));
+      const scores = await classifyByLlm('test', invoker);
+      expect(scores).toHaveLength(4);
+      for (const score of scores) {
+        expect(score.confidence).toBe(0);
+      }
+    });
+    it('returns zero scores when invoker returns unparseable text', async () => {
+      const invoker: LlmInvoker = vi.fn().mockResolvedValue('I cannot classify this content.');
+      const scores = await classifyByLlm('test', invoker);
+      for (const score of scores) {
+        expect(score.confidence).toBe(0);
+      }
+    });
+    it('returns zero scores when invoker returns an array instead of object', async () => {
+      const invoker: LlmInvoker = vi.fn().mockResolvedValue('[1, 2, 3]');
+      const scores = await classifyByLlm('test', invoker);
+      for (const score of scores) {
+        expect(score.confidence).toBe(0);
+      }
+    });
+  });
+  // -----------------------------------------------------------------------
+  // Category filtering
+  // -----------------------------------------------------------------------
+  describe('category filtering', () => {
+    it('returns scores only for requested categories', async () => {
+      const invoker: LlmInvoker = vi.fn().mockResolvedValue(
+        JSON.stringify({
+          toxic: true,
+          injection: true,
+          nsfw: false,
+          threat: false,
+          confidence: 0.9,
+        })
+      );
+      const subset: ClassifierCategory[] = ['toxic', 'injection'];
+      const scores = await classifyByLlm('targeted', invoker, subset);
+      expect(scores).toHaveLength(2);
+      expect(scores.map((s) => s.name)).toEqual(['toxic', 'injection']);
+    });
+  });
+});
+// ---------------------------------------------------------------------------
+// Tests — LLM tier via MLClassifierGuardrail.classify()
+// ---------------------------------------------------------------------------
+describe('LLM tier — via guardrail classify()', () => {
+  beforeEach(() => {
+    vi.clearAllMocks();
+  });
+  it('falls through to LLM when ONNX is unavailable', async () => {
+    const invoker: LlmInvoker = vi.fn().mockResolvedValue(
+      JSON.stringify({
+        toxic: true,
+        injection: false,
+        nsfw: false,
+        threat: false,
+        confidence: 0.9,
+      })
+    );
+    const guardrail = new MLClassifierGuardrail({ llmInvoker: invoker });
+    const result = await guardrail.classify('test');
+    expect(result.source).toBe('llm');
+    expect(invoker).toHaveBeenCalledTimes(1);
+  });
+  it('result.flagged is true when LLM detects a category above threshold', async () => {
+    const invoker: LlmInvoker = vi.fn().mockResolvedValue(
+      JSON.stringify({
+        toxic: true,
+        injection: false,
+        nsfw: false,
+        threat: false,
+        confidence: 0.85,
+      })
+    );
+    const guardrail = new MLClassifierGuardrail({ llmInvoker: invoker });
+    const result = await guardrail.classify('abusive text');
+    expect(result.flagged).toBe(true);
+    expect(result.source).toBe('llm');
+    const toxic = result.categories.find((c) => c.name === 'toxic');
+    expect(toxic?.confidence).toBe(0.85);
+  });
+});

package/test/ml-classifiers.spec.ts ADDED Viewed

@@ -0,0 +1,57 @@
+// @ts-nocheck
+import { describe, it, expect } from 'vitest';
+import { createExtensionPack } from '../src/index';
+describe('ML Classifiers Extension Pack', () => {
+  // -------------------------------------------------------------------------
+  // Pack structure
+  // -------------------------------------------------------------------------
+  it('createExtensionPack returns correct structure', () => {
+    const pack = createExtensionPack({ options: {} } as any);
+    expect(pack.name).toBe('ml-classifiers');
+    expect(pack.version).toBe('1.0.0');
+    expect(pack.descriptors).toHaveLength(2);
+    const kinds = pack.descriptors.map((d) => d.kind);
+    expect(kinds).toContain('guardrail');
+    expect(kinds).toContain('tool');
+    const ids = pack.descriptors.map((d) => d.id);
+    expect(ids).toContain('ml-classifier-guardrail');
+    expect(ids).toContain('classify_content');
+  });
+  // -------------------------------------------------------------------------
+  // Guardrail — keyword fallback detection
+  // -------------------------------------------------------------------------
+  describe('guardrail evaluateInput', () => {
+    function getGuardrail() {
+      const pack = createExtensionPack({ options: {} } as any);
+      const desc = pack.descriptors.find((d) => d.kind === 'guardrail');
+      return desc!.payload as any;
+    }
+    it('detects highly toxic input', async () => {
+      const guardrail = getGuardrail();
+      // Use strongly toxic text that both ONNX and keyword fallback would flag
+      const result = await guardrail.evaluateInput({
+        input: { textInput: 'You are a stupid idiot, kill yourself you moron' },
+      });
+      expect(result).not.toBeNull();
+      expect(['flag', 'block']).toContain(result!.action);
+    });
+    it('allows clean input through', async () => {
+      const guardrail = getGuardrail();
+      const result = await guardrail.evaluateInput({
+        input: { textInput: 'What is the weather like today?' },
+      });
+      expect(result).toBeNull();
+    });
+  });
+});

package/test/onnx-tier.spec.ts ADDED Viewed

@@ -0,0 +1,255 @@
+/**
+ * @file onnx-tier.spec.ts
+ * @description Tests for the ONNX (Tier 1) classification path in MLClassifierGuardrail.
+ *
+ * Mocks `@huggingface/transformers` to return controlled toxic-bert label/score
+ * pairs, verifying that ONNX results are mapped to internal categories, threshold
+ * logic works, and the result carries `source: 'onnx'`.
+ */
+import { describe, it, expect, vi, beforeEach } from 'vitest';
+// ---------------------------------------------------------------------------
+// Mock @huggingface/transformers
+// ---------------------------------------------------------------------------
+/**
+ * Callable mock that stands in for the ONNX text-classification pipeline.
+ * Tests configure its return value per-case via `mockResolvedValue`.
+ */
+const mockPipelineCall = vi.fn();
+vi.mock('@huggingface/transformers', () => ({
+  pipeline: vi.fn().mockResolvedValue({
+    _call: mockPipelineCall,
+  }),
+}));
+// ---------------------------------------------------------------------------
+// SUT
+// ---------------------------------------------------------------------------
+import { MLClassifierGuardrail } from '../src/MLClassifierGuardrail';
+// ---------------------------------------------------------------------------
+// Helpers
+// ---------------------------------------------------------------------------
+/** Builds a fresh guardrail instance for each test (resets cached pipeline). */
+function createGuardrail(options?: any): MLClassifierGuardrail {
+  return new MLClassifierGuardrail(options);
+}
+// ---------------------------------------------------------------------------
+// Tests
+// ---------------------------------------------------------------------------
+describe('ONNX tier classification', () => {
+  beforeEach(() => {
+    vi.clearAllMocks();
+  });
+  // -------------------------------------------------------------------------
+  // Label mapping
+  // -------------------------------------------------------------------------
+  describe('label-to-category mapping', () => {
+    it('maps toxic-bert labels to internal categories', async () => {
+      mockPipelineCall.mockResolvedValue([
+        { label: 'toxic', score: 0.92 },
+        { label: 'severe_toxic', score: 0.45 },
+        { label: 'obscene', score: 0.78 },
+        { label: 'insult', score: 0.65 },
+        { label: 'identity_hate', score: 0.3 },
+        { label: 'threat', score: 0.15 },
+      ]);
+      const guardrail = createGuardrail();
+      const result = await guardrail.classify('test text');
+      expect(result.source).toBe('onnx');
+      // toxic = max(toxic:0.92, severe_toxic:0.45, insult:0.65, identity_hate:0.30)
+      const toxic = result.categories.find((c) => c.name === 'toxic');
+      expect(toxic?.confidence).toBe(0.92);
+      // nsfw = max(obscene:0.78)
+      const nsfw = result.categories.find((c) => c.name === 'nsfw');
+      expect(nsfw?.confidence).toBe(0.78);
+      // threat = max(threat:0.15)
+      const threat = result.categories.find((c) => c.name === 'threat');
+      expect(threat?.confidence).toBe(0.15);
+      // injection is not produced by toxic-bert, stays at 0
+      const injection = result.categories.find((c) => c.name === 'injection');
+      expect(injection?.confidence).toBe(0);
+    });
+    it('takes max score when multiple ONNX labels map to the same category', async () => {
+      mockPipelineCall.mockResolvedValue([
+        { label: 'toxic', score: 0.3 },
+        { label: 'severe_toxic', score: 0.85 },
+        { label: 'insult', score: 0.6 },
+        { label: 'identity_hate', score: 0.7 },
+        { label: 'obscene', score: 0.1 },
+        { label: 'threat', score: 0.05 },
+      ]);
+      const guardrail = createGuardrail();
+      const result = await guardrail.classify('some text');
+      // toxic category = max(0.30, 0.85, 0.60, 0.70) = 0.85
+      const toxic = result.categories.find((c) => c.name === 'toxic');
+      expect(toxic?.confidence).toBe(0.85);
+    });
+    it('handles labels with mixed case and whitespace', async () => {
+      mockPipelineCall.mockResolvedValue([
+        { label: 'Toxic', score: 0.7 },
+        { label: 'OBSCENE', score: 0.6 },
+        { label: 'identity hate', score: 0.5 },
+        { label: 'THREAT', score: 0.3 },
+        { label: 'severe toxic', score: 0.2 },
+        { label: 'INSULT', score: 0.1 },
+      ]);
+      const guardrail = createGuardrail();
+      const result = await guardrail.classify('some text');
+      // identity_hate is "identity hate" with space -> lowered + underscore = identity_hate -> toxic
+      // toxic = max(toxic:0.7, identity_hate:0.5, severe_toxic:0.2, insult:0.1) = 0.7
+      const toxic = result.categories.find((c) => c.name === 'toxic');
+      expect(toxic?.confidence).toBe(0.7);
+      // obscene -> nsfw = 0.6
+      const nsfw = result.categories.find((c) => c.name === 'nsfw');
+      expect(nsfw?.confidence).toBe(0.6);
+    });
+  });
+  // -------------------------------------------------------------------------
+  // Threshold behaviour
+  // -------------------------------------------------------------------------
+  describe('threshold behaviour', () => {
+    it('flags content above default flag threshold (0.5)', async () => {
+      mockPipelineCall.mockResolvedValue([
+        { label: 'toxic', score: 0.65 },
+        { label: 'severe_toxic', score: 0.0 },
+        { label: 'obscene', score: 0.0 },
+        { label: 'insult', score: 0.0 },
+        { label: 'identity_hate', score: 0.0 },
+        { label: 'threat', score: 0.0 },
+      ]);
+      const guardrail = createGuardrail();
+      const result = await guardrail.classify('mildly toxic text');
+      expect(result.flagged).toBe(true);
+      expect(result.source).toBe('onnx');
+    });
+    it('does not flag content below all thresholds', async () => {
+      mockPipelineCall.mockResolvedValue([
+        { label: 'toxic', score: 0.1 },
+        { label: 'severe_toxic', score: 0.05 },
+        { label: 'obscene', score: 0.02 },
+        { label: 'insult', score: 0.08 },
+        { label: 'identity_hate', score: 0.01 },
+        { label: 'threat', score: 0.03 },
+      ]);
+      const guardrail = createGuardrail();
+      const result = await guardrail.classify('perfectly clean text');
+      expect(result.flagged).toBe(false);
+      expect(result.source).toBe('onnx');
+    });
+    it('respects per-category threshold overrides', async () => {
+      mockPipelineCall.mockResolvedValue([
+        { label: 'toxic', score: 0.35 },
+        { label: 'severe_toxic', score: 0.0 },
+        { label: 'obscene', score: 0.0 },
+        { label: 'insult', score: 0.0 },
+        { label: 'identity_hate', score: 0.0 },
+        { label: 'threat', score: 0.0 },
+      ]);
+      // Lower the toxic flag threshold so 0.35 exceeds it
+      const guardrail = createGuardrail({
+        thresholds: { toxic: { flag: 0.3 } },
+      });
+      const result = await guardrail.classify('borderline text');
+      expect(result.flagged).toBe(true);
+    });
+    it('does not flag when score equals the threshold exactly', async () => {
+      mockPipelineCall.mockResolvedValue([
+        { label: 'toxic', score: 0.5 },
+        { label: 'severe_toxic', score: 0.0 },
+        { label: 'obscene', score: 0.0 },
+        { label: 'insult', score: 0.0 },
+        { label: 'identity_hate', score: 0.0 },
+        { label: 'threat', score: 0.0 },
+      ]);
+      const guardrail = createGuardrail();
+      const result = await guardrail.classify('edge case text');
+      // Flag threshold is 0.5, score is exactly 0.5 -> ">" check, not ">="
+      expect(result.flagged).toBe(false);
+    });
+  });
+  // -------------------------------------------------------------------------
+  // Result source
+  // -------------------------------------------------------------------------
+  describe('result source', () => {
+    it('always returns source: onnx when pipeline succeeds', async () => {
+      mockPipelineCall.mockResolvedValue([
+        { label: 'toxic', score: 0.0 },
+        { label: 'severe_toxic', score: 0.0 },
+        { label: 'obscene', score: 0.0 },
+        { label: 'insult', score: 0.0 },
+        { label: 'identity_hate', score: 0.0 },
+        { label: 'threat', score: 0.0 },
+      ]);
+      const guardrail = createGuardrail();
+      const result = await guardrail.classify('hello');
+      expect(result.source).toBe('onnx');
+    });
+  });
+  // -------------------------------------------------------------------------
+  // All four categories present
+  // -------------------------------------------------------------------------
+  describe('category completeness', () => {
+    it('returns scores for all four categories', async () => {
+      mockPipelineCall.mockResolvedValue([
+        { label: 'toxic', score: 0.1 },
+        { label: 'severe_toxic', score: 0.0 },
+        { label: 'obscene', score: 0.2 },
+        { label: 'insult', score: 0.0 },
+        { label: 'identity_hate', score: 0.0 },
+        { label: 'threat', score: 0.3 },
+      ]);
+      const guardrail = createGuardrail();
+      const result = await guardrail.classify('test');
+      const names = result.categories.map((c) => c.name);
+      expect(names).toContain('toxic');
+      expect(names).toContain('injection');
+      expect(names).toContain('nsfw');
+      expect(names).toContain('threat');
+      expect(result.categories).toHaveLength(4);
+    });
+  });
+});