npm - @crewhaus/egress-classifier - Versions diffs - 0.1.0 → 0.1.2 - Mend

@crewhaus/egress-classifier 0.1.0 → 0.1.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (4) hide show

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "@crewhaus/egress-classifier",
-  "version": "0.1.0",
+  "version": "0.1.2",
   "type": "module",
   "description": "Pillar-3 sink-side chokepoint — classify content leaving via external sinks (fetch / web / mcp / channel / federation / evm-tx) against the data-lineage carried in run-context",
   "main": "src/index.ts",
@@ -12,14 +12,14 @@
     "test": "bun test src"
   },
   "dependencies": {
-    "@crewhaus/errors": "0.0.0",
-    "@crewhaus/run-context": "0.0.0"
+    "@crewhaus/errors": "0.1.2",
+    "@crewhaus/run-context": "0.1.2"
   },
   "license": "Apache-2.0",
   "author": {
     "name": "Max Meier",
-    "email": "max@studiomax.io",
-    "url": "https://studiomax.io"
+    "email": "max@crewhaus.ai",
+    "url": "https://crewhaus.ai"
   },
   "repository": {
     "type": "git",
@@ -31,12 +31,7 @@
     "url": "https://github.com/crewhaus/factory/issues"
   },
   "publishConfig": {
-    "access": "restricted"
+    "access": "public"
   },
-  "files": [
-    "src",
-    "README.md",
-    "LICENSE",
-    "NOTICE"
-  ]
+  "files": ["src", "README.md", "LICENSE", "NOTICE"]
 }

package/src/coverage.test.ts ADDED Viewed

@@ -0,0 +1,486 @@
+/**
+ * Supplemental coverage + hardening tests for `egress-classifier`.
+ *
+ * Companion to `index.test.ts`: the FR-006 acceptance suite there exercises
+ * the matcher seam and the headline pass/warn/block flows; this file drives
+ * the remaining branches (LRU eviction + recency, every policy-matrix cell,
+ * the cache-key framing regression, and the summarize/diagnostics helpers)
+ * to 100% and pins the security-relevant invariants.
+ */
+import { afterEach, describe, expect, test } from "bun:test";
+import { CrewhausError } from "@crewhaus/errors";
+import { type TrustOrigin, createRunContext, tagContent } from "@crewhaus/run-context";
+import {
+  EgressClassifierError,
+  type EgressMatcher,
+  type EgressResult,
+  MIN_MATCH_LENGTH,
+  type SinkScope,
+  SubstringEgressMatcher,
+  _cacheSize,
+  _clearEgressCache,
+  classifyEgress,
+  substringMatcher,
+  summarizeEgress,
+} from "./index";
+afterEach(() => {
+  _clearEgressCache();
+});
+/**
+ * Test double: builds an `EgressMatcher` whose `match` ignores its input and
+ * always reports the supplied origins, so a test's verdict depends only on
+ * the policy fold and never on payload/lineage contents.
+ *
+ * @param name - Matcher name placed on the returned object.
+ * @param originsFound - Origins that every `match` call reports.
+ * @param matchCount - Reported hit count; defaults to `originsFound.length`.
+ * @returns A matcher with a fixed, input-independent result.
+ */
+function fixedMatcher(
+  name: string,
+  originsFound: TrustOrigin[],
+  matchCount = originsFound.length,
+): EgressMatcher {
+  return { name, match: () => ({ originsFound, matchCount }) };
+}
+describe("post-match (await) return paths with forced cache miss", () => {
+  test("bypassCache + matcher returns no hits → fresh pass after the await", async () => {
+    // Guarantees the cache-miss branch runs (bypassCache), the matcher is
+    // awaited, and the post-await `originsFound.length === 0` early return is
+    // taken — distinct from the no-lineage pre-await pass.
+    const ctx = createRunContext();
+    ctx.dataLineage = new Map<string, TrustOrigin>([["anything-present", "subagent"]]);
+    const r = await classifyEgress("outbound bytes", ctx, {
+      sinkId: "fetch",
+      sinkScope: "external-configured",
+      matcher: fixedMatcher("no-hits", [], 0),
+      bypassCache: true,
+    });
+    expect(r.verdict).toBe("pass");
+    expect(r.fromCache).toBe(false);
+    expect(r.originsFound).toEqual([]);
+    expect(r.matchCount).toBe(0);
+    expect(_cacheSize()).toBe(0); // bypassCache never wrote
+  });
+  test("bypassCache + matcher returns hits → fresh non-pass after the await", async () => {
+    const ctx = createRunContext();
+    ctx.dataLineage = new Map<string, TrustOrigin>([["anything-present", "subagent"]]);
+    const r = await classifyEgress("outbound bytes", ctx, {
+      sinkId: "fetch",
+      sinkScope: "external-dynamic",
+      matcher: fixedMatcher("has-hits", ["subagent"], 1),
+      bypassCache: true,
+    });
+    expect(r.verdict).toBe("block");
+    expect(r.fromCache).toBe(false);
+    expect(_cacheSize()).toBe(0);
+  });
+  test("real substring path: cache miss, await, then non-empty return", async () => {
+    const ctx = createRunContext();
+    tagContent(ctx, "subagent payload that is verbatim present", "subagent");
+    const r = await classifyEgress("POST subagent payload that is verbatim present now", ctx, {
+      sinkId: "fetch",
+      sinkScope: "external-configured",
+      bypassCache: true,
+    });
+    expect(r.verdict).toBe("warn");
+    expect(r.fromCache).toBe(false);
+  });
+  test("real substring path: cache miss, await, then empty return (no overlap)", async () => {
+    const ctx = createRunContext();
+    tagContent(ctx, "tagged content that will not appear", "subagent");
+    const r = await classifyEgress("a completely disjoint outbound string", ctx, {
+      sinkId: "fetch",
+      sinkScope: "external-configured",
+      bypassCache: true,
+    });
+    expect(r.verdict).toBe("pass");
+    expect(r.fromCache).toBe(false);
+    expect(r.originsFound).toEqual([]);
+  });
+});
+describe("policy matrix — every TrustOrigin × SinkScope cell", () => {
+  // Every non-user origin: warn on configured, block on dynamic.
+  const nonUser: TrustOrigin[] = [
+    "mcp",
+    "subagent",
+    "channel",
+    "federation",
+    "skill",
+    "compaction",
+    "tool",
+    "chain",
+  ];
+  for (const origin of nonUser) {
+    test(`${origin}: configured → warn, dynamic → block`, async () => {
+      const ctx = createRunContext();
+      ctx.dataLineage = new Map<string, TrustOrigin>([["anything-tagged", origin]]);
+      const m = fixedMatcher(`fixed-${origin}`, [origin], 1);
+      const configured = await classifyEgress("payload", ctx, {
+        sinkId: "fetch",
+        sinkScope: "external-configured",
+        matcher: m,
+        bypassCache: true,
+      });
+      const dynamic = await classifyEgress("payload", ctx, {
+        sinkId: "dyn",
+        sinkScope: "external-dynamic",
+        matcher: m,
+        bypassCache: true,
+      });
+      expect(configured.verdict).toBe("warn");
+      expect(dynamic.verdict).toBe("block");
+    });
+  }
+  test("user: pass on both configured and dynamic", async () => {
+    const ctx = createRunContext();
+    ctx.dataLineage = new Map<string, TrustOrigin>([["anything-tagged", "user"]]);
+    const m = fixedMatcher("fixed-user", ["user"], 1);
+    for (const sinkScope of ["external-configured", "external-dynamic"] as SinkScope[]) {
+      const r = await classifyEgress("payload", ctx, {
+        sinkId: "s",
+        sinkScope,
+        matcher: m,
+        bypassCache: true,
+      });
+      expect(r.verdict).toBe("pass");
+      expect(r.originsFound).toEqual(["user"]);
+    }
+  });
+});
+describe("foldVerdict precedence (via classifyEgress)", () => {
+  test("warn wins over pass when no block present", async () => {
+    // user (pass) + tool@configured (warn) → warn, exercising the
+    // `some(warn)` branch after `some(block)` short-circuits to false.
+    const ctx = createRunContext();
+    ctx.dataLineage = new Map<string, TrustOrigin>([
+      ["one", "user"],
+      ["two", "tool"],
+    ]);
+    const m = fixedMatcher("multi", ["user", "tool"], 2);
+    const r = await classifyEgress("payload", ctx, {
+      sinkId: "fetch",
+      sinkScope: "external-configured",
+      matcher: m,
+    });
+    expect(r.verdict).toBe("warn");
+  });
+  test("block wins over warn", async () => {
+    const ctx = createRunContext();
+    ctx.dataLineage = new Map<string, TrustOrigin>([
+      ["one", "user"],
+      ["two", "mcp"],
+    ]);
+    // user → pass, mcp@dynamic → block; folded = block.
+    const m = fixedMatcher("multi2", ["user", "mcp"], 2);
+    const r = await classifyEgress("payload", ctx, {
+      sinkId: "dyn",
+      sinkScope: "external-dynamic",
+      matcher: m,
+    });
+    expect(r.verdict).toBe("block");
+  });
+  test("all-pass origins fold to pass", async () => {
+    // Two user hits → foldVerdict reaches the trailing `return "pass"`.
+    const ctx = createRunContext();
+    ctx.dataLineage = new Map<string, TrustOrigin>([
+      ["one", "user"],
+      ["two", "user"],
+    ]);
+    const m = fixedMatcher("two-user", ["user", "user"], 2);
+    const r = await classifyEgress("payload", ctx, {
+      sinkId: "fetch",
+      sinkScope: "external-dynamic",
+      matcher: m,
+    });
+    expect(r.verdict).toBe("pass");
+  });
+});
+describe("override semantics", () => {
+  test("override can loosen a default-block to pass on a dynamic sink", async () => {
+    const ctx = createRunContext();
+    tagContent(ctx, "mcp-sourced content from a server", "mcp");
+    const r = await classifyEgress("body: mcp-sourced content from a server", ctx, {
+      sinkId: "dyn-mcp",
+      sinkScope: "external-dynamic", // default mcp@dynamic = block
+      override: { mcp: "pass" },
+    });
+    expect(r.verdict).toBe("pass");
+    expect(r.originsFound).toEqual(["mcp"]);
+  });
+  test("override only applies to listed origins; others keep defaults", async () => {
+    const ctx = createRunContext();
+    ctx.dataLineage = new Map<string, TrustOrigin>([
+      ["one", "subagent"],
+      ["two", "channel"],
+    ]);
+    const m = fixedMatcher("two-origin", ["subagent", "channel"], 2);
+    // Loosen subagent to pass, leave channel at its dynamic default (block).
+    const r = await classifyEgress("payload", ctx, {
+      sinkId: "dyn",
+      sinkScope: "external-dynamic",
+      override: { subagent: "pass" },
+      matcher: m,
+    });
+    expect(r.verdict).toBe("block"); // channel still blocks
+  });
+  test("cached verdict is re-folded under a different override on the second call", async () => {
+    // First call caches the raw hit (subagent) with no override. Second call
+    // serves from cache but recomputes the verdict under a tightening
+    // override — exercising the cache-hit `.map(originVerdict)` arrow.
+    const ctx = createRunContext();
+    const tagged = "subagent content for cache reeval test";
+    tagContent(ctx, tagged, "subagent");
+    const first = await classifyEgress(`x ${tagged}`, ctx, {
+      sinkId: "fetch",
+      sinkScope: "external-configured", // warn
+    });
+    expect(first.fromCache).toBe(false);
+    expect(first.verdict).toBe("warn");
+    const second = await classifyEgress(`x ${tagged}`, ctx, {
+      sinkId: "fetch",
+      sinkScope: "external-configured",
+      override: { subagent: "block" }, // tighten
+    });
+    expect(second.fromCache).toBe(true);
+    expect(second.verdict).toBe("block");
+    // The cached raw hit is preserved even though the folded verdict changed.
+    expect(second.originsFound).toEqual(["subagent"]);
+    expect(second.matchCount).toBe(1);
+  });
+});
+describe("cache-key framing (regression: delimiter-collision exfil bypass)", () => {
+  test("shifted sinkId/payload boundary does NOT cross-serve a cached verdict", async () => {
+    // CONSTRUCT A TRUE COLLISION for the old bare-`|` key scheme. With
+    // matcher+scope held constant, these two calls byte-concatenate the same
+    // `sinkId|payload` stream:
+    //   A: sinkId = "tool|", payload = P          → "…|tool||P"
+    //   B: sinkId = "tool",  payload = "|" + P    → "…|tool||P"
+    // Under the vulnerable key, B would hash-collide with A and be served A's
+    // cached entry (fromCache:true, cache size stays 1) — a cache-poisoning /
+    // egress-scan bypass when sinkId carries attacker influence (e.g. a
+    // dynamically discovered MCP tool name). Length-framed keys make the two
+    // self-delimiting and therefore distinct.
+    const ctx = createRunContext();
+    // Lineage must be non-empty so the classifier reaches the cache/match path
+    // (an empty lineage short-circuits to pass before any key is computed).
+    ctx.dataLineage = new Map<string, TrustOrigin>([["present-tag-entry", "subagent"]]);
+    const P = "shared-suffix outbound payload bytes";
+    // Use a matcher whose result is independent of payload so the two calls'
+    // verdicts would coincide — isolating `fromCache`/size as the sole tell.
+    const m = fixedMatcher("framing", ["subagent"], 1);
+    const a = await classifyEgress(P, ctx, {
+      sinkId: "tool|",
+      sinkScope: "external-configured",
+      matcher: m,
+    });
+    expect(a.fromCache).toBe(false);
+    expect(_cacheSize()).toBe(1);
+    const b = await classifyEgress(`|${P}`, ctx, {
+      sinkId: "tool",
+      sinkScope: "external-configured",
+      matcher: m,
+    });
+    // The discriminator: on the fixed key B is a fresh miss (its own slot);
+    // on the vulnerable key B would have been served A's entry.
+    expect(b.fromCache).toBe(false);
+    expect(_cacheSize()).toBe(2);
+  });
+  test("identical (matcher, scope, sinkId, payload) still hits cache", async () => {
+    const ctx = createRunContext();
+    tagContent(ctx, "subagent content stable for cache", "subagent");
+    const p = "POST subagent content stable for cache";
+    const first = await classifyEgress(p, ctx, {
+      sinkId: "fetch",
+      sinkScope: "external-configured",
+    });
+    const second = await classifyEgress(p, ctx, {
+      sinkId: "fetch",
+      sinkScope: "external-configured",
+    });
+    expect(first.fromCache).toBe(false);
+    expect(second.fromCache).toBe(true);
+    expect(_cacheSize()).toBe(1);
+  });
+  test("a literal '|' inside sinkId does not collide with a different split", async () => {
+    // Key-injectivity check at the classifier boundary. Under a bare-`|` join
+    // both calls below would serialise to the same `a|b|<tagged>` stream:
+    //   first:  sinkId = "a|b", payload = tagged
+    //   second: sinkId = "a",   payload = "b|" + tagged
+    // so they must land in two distinct cache slots. Both payloads contain the
+    // tagged string, so the default substring matcher reports the same hit for
+    // each call and `fromCache` / cache size are the only discriminators.
+    const ctx = createRunContext();
+    const tagged = "subagent content for framing test ok";
+    tagContent(ctx, tagged, "subagent");
+    const first = await classifyEgress(tagged, ctx, {
+      sinkId: "a|b",
+      sinkScope: "external-configured",
+    });
+    expect(first.fromCache).toBe(false);
+    expect(_cacheSize()).toBe(1);
+    const second = await classifyEgress(`b|${tagged}`, ctx, {
+      sinkId: "a",
+      sinkScope: "external-configured",
+    });
+    // A colliding key would serve the first call's entry here.
+    expect(second.fromCache).toBe(false);
+    expect(_cacheSize()).toBe(2);
+  });
+});
+describe("LRU cache behaviour", () => {
+  test("distinct payloads accumulate distinct entries", async () => {
+    const ctx = createRunContext();
+    tagContent(ctx, "subagent content for lru accumulation", "subagent");
+    for (let i = 0; i < 5; i++) {
+      await classifyEgress(`payload number ${i} subagent content for lru accumulation`, ctx, {
+        sinkId: "fetch",
+        sinkScope: "external-configured",
+      });
+    }
+    expect(_cacheSize()).toBe(5);
+  });
+  test("re-accessing an entry refreshes its recency (get path)", async () => {
+    // Drives the cache-hit read path by classifying the same payload twice
+    // (presumably the LruCache.get branch that bumps recency — the cache
+    // implementation is not visible from this test file). Only the hit itself
+    // and the unchanged cache size are asserted: no further entries are
+    // inserted afterwards, so survival under eviction pressure is NOT
+    // verified by this test.
+    const ctx = createRunContext();
+    tagContent(ctx, "subagent recency probe content here", "subagent");
+    const p0 = "first subagent recency probe content here";
+    const r1 = await classifyEgress(p0, ctx, { sinkId: "fetch", sinkScope: "external-configured" });
+    expect(r1.fromCache).toBe(false);
+    // Touch p0 again → served from cache, and no second slot is created.
+    const r2 = await classifyEgress(p0, ctx, { sinkId: "fetch", sinkScope: "external-configured" });
+    expect(r2.fromCache).toBe(true);
+    expect(_cacheSize()).toBe(1);
+  });
+  test("bypassCache never populates the cache (no store on miss)", async () => {
+    const ctx = createRunContext();
+    tagContent(ctx, "subagent content under bypass mode here", "subagent");
+    await classifyEgress("x subagent content under bypass mode here", ctx, {
+      sinkId: "fetch",
+      sinkScope: "external-configured",
+      bypassCache: true,
+    });
+    expect(_cacheSize()).toBe(0);
+  });
+});
+describe("summarizeEgress", () => {
+  const base: Omit<EgressResult, "verdict" | "originsFound" | "matchCount"> = {
+    fromCache: false,
+    sinkId: "fetch",
+    sinkScope: "external-configured",
+  };
+  test("clean summary when no origins matched", () => {
+    const s = summarizeEgress({ ...base, verdict: "pass", originsFound: [], matchCount: 0 });
+    expect(s).toBe("clean (sink=fetch scope=external-configured)");
+  });
+  test("warn summary lists origins and count", () => {
+    const s = summarizeEgress({
+      ...base,
+      verdict: "warn",
+      originsFound: ["subagent"],
+      matchCount: 1,
+      sinkScope: "external-configured",
+    });
+    expect(s).toBe("warn: 1 match(es) from [subagent] (sink=fetch scope=external-configured)");
+  });
+  test("block summary with multiple origins joins with commas", () => {
+    const s = summarizeEgress({
+      verdict: "block",
+      originsFound: ["mcp", "federation"],
+      matchCount: 4,
+      fromCache: true,
+      sinkId: "dyn:peer",
+      sinkScope: "external-dynamic",
+    });
+    expect(s).toBe(
+      "block: 4 match(es) from [mcp,federation] (sink=dyn:peer scope=external-dynamic)",
+    );
+  });
+});
+describe("SubstringEgressMatcher direct", () => {
+  test("empty lineage yields no hits", () => {
+    const r = new SubstringEgressMatcher().match({
+      payload: "anything at all goes here",
+      lineage: new Map(),
+      minMatchLength: MIN_MATCH_LENGTH,
+    });
+    expect(r.originsFound).toEqual([]);
+    expect(r.matchCount).toBe(0);
+  });
+  test("dedupes origins but counts distinct matched strings", () => {
+    const lineage = new Map<string, TrustOrigin>([
+      ["first tagged string over floor", "subagent"],
+      ["second tagged string over floor", "subagent"],
+    ]);
+    const r = new SubstringEgressMatcher().match({
+      payload: "first tagged string over floor and second tagged string over floor",
+      lineage,
+      minMatchLength: MIN_MATCH_LENGTH,
+    });
+    expect(r.originsFound).toEqual(["subagent"]); // deduped
+    expect(r.matchCount).toBe(2); // two distinct strings
+  });
+  test("singleton and class share the same name", () => {
+    // Assert both halves of the claim: the exported singleton AND a freshly
+    // constructed instance must report the same matcher name.
+    expect(substringMatcher.name).toBe("substring");
+    expect(new SubstringEgressMatcher().name).toBe("substring");
+  });
+});
+describe("diagnostics helpers", () => {
+  test("_clearEgressCache empties the cache", async () => {
+    const ctx = createRunContext();
+    tagContent(ctx, "subagent content to populate cache now", "subagent");
+    await classifyEgress("x subagent content to populate cache now", ctx, {
+      sinkId: "fetch",
+      sinkScope: "external-configured",
+    });
+    expect(_cacheSize()).toBeGreaterThan(0);
+    _clearEgressCache();
+    expect(_cacheSize()).toBe(0);
+  });
+});
+describe("EgressClassifierError", () => {
+  test("carries the config error code, fixed name, message, and cause chain", () => {
+    const cause = new Error("root");
+    const err = new EgressClassifierError("boom", cause);
+    // The `name` field initializer + constructor (the class's two functions)
+    // are exercised directly here, independent of the internal throw site.
+    expect(err.name).toBe("EgressClassifierError");
+    expect(err.message).toBe("boom");
+    expect(err.code).toBe("config");
+    expect(err.cause).toBe(cause);
+    expect(err).toBeInstanceOf(EgressClassifierError);
+    expect(err).toBeInstanceOf(CrewhausError);
+    expect(err).toBeInstanceOf(Error);
+  });
+  test("cause is optional", () => {
+    const err = new EgressClassifierError("no cause");
+    expect(err.cause).toBeUndefined();
+    expect(err.name).toBe("EgressClassifierError");
+  });
+  test("classifyEgress throws an EgressClassifierError for a non-string payload", async () => {
+    const ctx = createRunContext();
+    await expect(
+      // biome-ignore lint/suspicious/noExplicitAny: exercising the runtime type guard
+      classifyEgress({ not: "a string" } as any, ctx, {
+        sinkId: "fetch",
+        sinkScope: "external-configured",
+      }),
+    ).rejects.toBeInstanceOf(EgressClassifierError);
+  });
+});

package/src/index.test.ts CHANGED Viewed

@@ -1,10 +1,15 @@
 import { afterEach, describe, expect, test } from "bun:test";
 import { type TrustOrigin, createRunContext, tagContent } from "@crewhaus/run-context";
 import {
+  type EgressMatchInput,
+  type EgressMatchResult,
+  type EgressMatcher,
   MIN_MATCH_LENGTH,
+  SubstringEgressMatcher,
   _cacheSize,
   _clearEgressCache,
   classifyEgress,
+  substringMatcher,
   summarizeEgress,
 } from "./index";
@@ -63,7 +68,7 @@ describe("classifyEgress", () => {
   test("ignores tagged content shorter than the match floor", async () => {
     const ctx = createRunContext();
-    tagContent(ctx, "abc", "subagent"); // way under 16-char floor
+    tagContent(ctx, "abc", "subagent"); // way under the 8-char match floor
     const result = await classifyEgress("https://example.com/?q=abc", ctx, {
       sinkId: "fetch",
       sinkScope: "external-configured",
@@ -74,10 +79,11 @@ describe("classifyEgress", () => {
   test("respects a custom minMatchLength for fixtures", async () => {
     const ctx = createRunContext();
-    // tagContent itself enforces a 16-char floor to keep lineage clean, so
-    // for short-fixture tests we pre-populate dataLineage directly. In
-    // production, the classifier's floor and tagContent's floor are both
-    // 16; minMatchLength override is intended for tests + recipes.
+    // tagContent enforces its own floors (16 for blob/lines, 8 for vetted
+    // credential tokens) to keep lineage clean, so for short-fixture tests we
+    // pre-populate dataLineage directly. In production the classifier's
+    // MIN_MATCH_LENGTH=8 backstop matches the token floor; the
+    // minMatchLength override is intended for tests + recipes.
     ctx.dataLineage = new Map<string, TrustOrigin>([["shortish", "subagent"]]);
     const result = await classifyEgress("payload shortish embedded", ctx, {
       sinkId: "fetch",
@@ -157,11 +163,52 @@ describe("classifyEgress", () => {
       classifyEgress(123 as any, ctx, { sinkId: "fetch", sinkScope: "external-configured" }),
     ).rejects.toThrow(/expected a string/);
   });
+  // SECURITY (audit R2): the cache key includes a digest of the LINEAGE
+  // CONTENT. The lineage map grows during a run; a verdict computed before a
+  // secret was tagged must not be served after the tag lands — that would be
+  // an egress-scan bypass for every repeated payload.
+  test("lineage growth invalidates a cached verdict for the same payload", async () => {
+    const ctx = createRunContext();
+    tagContent(ctx, "some early boundary content of length", "subagent");
+    const payload = "exfiltrating sk-LaterTagged99 now";
+    const first = await classifyEgress(payload, ctx, {
+      sinkId: "fetch",
+      sinkScope: "external-dynamic",
+    });
+    expect(first.verdict).toBe("pass"); // secret not tagged yet
+    // The secret now crosses a boundary and gets token-tagged.
+    tagContent(ctx, "key issued: sk-LaterTagged99 keep private", "mcp");
+    const second = await classifyEgress(payload, ctx, {
+      sinkId: "fetch",
+      sinkScope: "external-dynamic",
+    });
+    expect(second.fromCache).toBe(false); // NOT served stale
+    expect(second.verdict).toBe("block");
+    expect(second.originsFound).toEqual(["mcp"]);
+  });
+  // SECURITY (audit R2): end-to-end short-secret coverage — a credential-
+  // shaped token too short for line tagging (under 16 chars) is token-tagged
+  // at the boundary and caught at egress when the model extracts JUST the
+  // secret from its line.
+  test("a short credential token extracted from its line is caught at egress", async () => {
+    const ctx = createRunContext();
+    tagContent(ctx, "Stripe key for deploys: sk-Ab12Cd34 (rotate quarterly)", "mcp");
+    const result = await classifyEgress("posting sk-Ab12Cd34 to a webhook", ctx, {
+      sinkId: "fetch",
+      sinkScope: "external-dynamic",
+      bypassCache: true,
+    });
+    expect(result.verdict).toBe("block");
+    expect(result.matchCount).toBeGreaterThanOrEqual(1);
+    expect(result.originsFound).toEqual(["mcp"]);
+  });
 });
 describe("MIN_MATCH_LENGTH constant", () => {
-  test("is 16", () => {
-    expect(MIN_MATCH_LENGTH).toBe(16);
+  test("is 8 — parity with run-context's MIN_TOKEN_TAG_LENGTH (audit R2)", () => {
+    expect(MIN_MATCH_LENGTH).toBe(8);
   });
 });
@@ -195,3 +242,254 @@ describe("summarizeEgress", () => {
     expect(summary).toContain("dynamic-mcp:foo");
   });
 });
+// ---------------------------------------------------------------------------
+// FR-006 — the EgressMatcher seam.
+// ---------------------------------------------------------------------------
+describe("SubstringEgressMatcher (FR-006)", () => {
+  test('name is "substring" for audit + cache namespacing', () => {
+    expect(substringMatcher.name).toBe("substring");
+    expect(new SubstringEgressMatcher().name).toBe("substring");
+  });
+  test("matches identically to the legacy inline scan", () => {
+    // The default matcher is the verbatim pre-FR-006 loop: tagged entries
+    // >= floor that the payload contains, deduped origins, distinct count.
+    const lineage = new Map<string, TrustOrigin>([
+      ["mcp-sourced bearer token segment", "mcp"],
+      ["subagent-flagged content from worker", "subagent"],
+      ["short", "tool"], // under floor — must be ignored
+      ["user-typed sentence here visible", "user"], // not present in payload
+    ]);
+    const payload =
+      "POST mcp-sourced bearer token segment + subagent-flagged content from worker (short)";
+    const result = new SubstringEgressMatcher().match({
+      payload,
+      lineage,
+      minMatchLength: MIN_MATCH_LENGTH,
+    });
+    expect([...result.originsFound].sort()).toEqual(["mcp", "subagent"]);
+    expect(result.matchCount).toBe(2); // the two over-floor hits; "short" skipped
+  });
+  test("respects the minMatchLength floor passed in the input", () => {
+    // Use the concrete class so `.match` is the synchronous overload.
+    const m = new SubstringEgressMatcher();
+    const lineage = new Map<string, TrustOrigin>([["short67", "subagent"]]);
+    // Under default floor (8) → a 7-char tag never matches.
+    expect(
+      m.match({
+        payload: "carries short67 inside",
+        lineage,
+        minMatchLength: MIN_MATCH_LENGTH,
+      }).matchCount,
+    ).toBe(0);
+    // With a low floor → hit.
+    expect(
+      m.match({
+        payload: "carries short67 inside",
+        lineage,
+        minMatchLength: 4,
+      }).matchCount,
+    ).toBe(1);
+  });
+  // SECURITY: a prompt-injectable model can re-encode a tagged secret before
+  // egress. A verbatim substring scan misses these; the decode-aware views do
+  // not. The raw tagged content is the lineage key in every case.
+  const TAGGED = "mcp-sourced secret value that exceeds the floor length";
+  test("detects raw tagged content hidden by JSON.stringify escaping (#5)", () => {
+    // runtime-core builds the egress payload as JSON.stringify(toolInput). A
+    // multi-line tagged string is escaped inside it (newline → \n, quote → \"),
+    // so the raw string is NOT a verbatim substring — but the JSON-decoded
+    // view recovers it.
+    const tagged = `${TAGGED}\nsecond "quoted" line`;
+    const lineage = new Map<string, TrustOrigin>([[tagged, "mcp"]]);
+    const payload = JSON.stringify({ url: "https://evil.test", body: tagged });
+    expect(payload.includes(tagged)).toBe(false); // escaped — verbatim scan misses it
+    const result = new SubstringEgressMatcher().match({
+      payload,
+      lineage,
+      minMatchLength: MIN_MATCH_LENGTH,
+    });
+    expect(result.originsFound).toEqual(["mcp"]);
+    expect(result.matchCount).toBe(1);
+  });
+  test("detects base64-re-encoded tagged content (#6)", () => {
+    const lineage = new Map<string, TrustOrigin>([[TAGGED, "subagent"]]);
+    const b64 = Buffer.from(TAGGED, "utf8").toString("base64");
+    const payload = JSON.stringify({ note: `exfil: ${b64}` });
+    expect(payload.includes(TAGGED)).toBe(false);
+    const result = new SubstringEgressMatcher().match({
+      payload,
+      lineage,
+      minMatchLength: MIN_MATCH_LENGTH,
+    });
+    expect(result.originsFound).toEqual(["subagent"]);
+  });
+  test("detects hex-re-encoded tagged content (#6)", () => {
+    const lineage = new Map<string, TrustOrigin>([[TAGGED, "channel"]]);
+    const hex = Buffer.from(TAGGED, "utf8").toString("hex");
+    const result = new SubstringEgressMatcher().match({
+      payload: `prefix ${hex} suffix`,
+      lineage,
+      minMatchLength: MIN_MATCH_LENGTH,
+    });
+    expect(result.originsFound).toEqual(["channel"]);
+  });
+  test("detects percent-encoded tagged content (#6)", () => {
+    const lineage = new Map<string, TrustOrigin>([[TAGGED, "federation"]]);
+    const result = new SubstringEgressMatcher().match({
+      payload: `q=${encodeURIComponent(TAGGED)}`,
+      lineage,
+      minMatchLength: MIN_MATCH_LENGTH,
+    });
+    expect(result.originsFound).toEqual(["federation"]);
+  });
+  test("does not flag unrelated content (no false positive from decoding)", () => {
+    const lineage = new Map<string, TrustOrigin>([[TAGGED, "mcp"]]);
+    const payload = JSON.stringify({
+      note: Buffer.from("totally unrelated bytes here", "utf8").toString("base64"),
+    });
+    const result = new SubstringEgressMatcher().match({
+      payload,
+      lineage,
+      minMatchLength: MIN_MATCH_LENGTH,
+    });
+    expect(result.matchCount).toBe(0);
+  });
+});
+describe("classifyEgress with an injected matcher (FR-006)", () => {
+  test("uses the injected matcher's hits and folds policy over them", async () => {
+    const ctx = createRunContext();
+    // Populate lineage with content the SUBSTRING matcher would NOT find in
+    // the payload, proving the verdict came from the injected matcher.
+    ctx.dataLineage = new Map<string, TrustOrigin>([
+      ["paraphrased-and-reencoded original text", "subagent"],
+    ]);
+    const fakeMatcher: EgressMatcher = {
+      name: "fake-fixed",
+      match: (_input: EgressMatchInput): EgressMatchResult => ({
+        originsFound: ["subagent"],
+        matchCount: 1,
+      }),
+    };
+    const result = await classifyEgress("totally unrelated outbound bytes", ctx, {
+      sinkId: "fetch",
+      sinkScope: "external-configured", // subagent on configured → warn
+      matcher: fakeMatcher,
+    });
+    // The substring matcher would have returned pass (no verbatim overlap);
+    // the injected matcher's hit drives the warn verdict. This proves the
+    // policy fold is matcher-independent (acceptance #3).
+    expect(result.verdict).toBe("warn");
+    expect(result.originsFound).toEqual(["subagent"]);
+    expect(result.matchCount).toBe(1);
+  });
+  test("custom-matcher hits still respect per-origin/per-sink policy", async () => {
+    const ctx = createRunContext();
+    ctx.dataLineage = new Map<string, TrustOrigin>([["anything", "subagent"]]);
+    const subagentHit: EgressMatcher = {
+      name: "subagent-hit",
+      match: () => ({ originsFound: ["subagent"], matchCount: 1 }),
+    };
+    // Same matcher, same hit — warn on configured, block on dynamic. The
+    // outcome difference comes purely from sinkScope policy, not the matcher.
+    const configured = await classifyEgress("payload", ctx, {
+      sinkId: "fetch",
+      sinkScope: "external-configured",
+      matcher: subagentHit,
+      bypassCache: true,
+    });
+    const dynamic = await classifyEgress("payload", ctx, {
+      sinkId: "dyn",
+      sinkScope: "external-dynamic",
+      matcher: subagentHit,
+      bypassCache: true,
+    });
+    expect(configured.verdict).toBe("warn");
+    expect(dynamic.verdict).toBe("block");
+  });
+  test("an injected matcher may be async", async () => {
+    const ctx = createRunContext();
+    ctx.dataLineage = new Map<string, TrustOrigin>([["anything", "mcp"]]);
+    const asyncMatcher: EgressMatcher = {
+      name: "async-hit",
+      match: async () => {
+        await Promise.resolve();
+        return { originsFound: ["mcp"], matchCount: 2 };
+      },
+    };
+    const result = await classifyEgress("payload", ctx, {
+      sinkId: "fetch",
+      sinkScope: "external-dynamic", // mcp on dynamic → block
+      matcher: asyncMatcher,
+    });
+    expect(result.verdict).toBe("block");
+    expect(result.matchCount).toBe(2);
+  });
+  test("cache key namespaces by matcher name (no cross-serve)", async () => {
+    const ctx = createRunContext();
+    ctx.dataLineage = new Map<string, TrustOrigin>([["anything", "subagent"]]);
+    _clearEgressCache();
+    // Matcher A finds a hit → warn, and caches under name "A".
+    const matcherA: EgressMatcher = {
+      name: "matcher-A",
+      match: () => ({ originsFound: ["subagent"], matchCount: 1 }),
+    };
+    // Matcher B finds nothing → pass, under name "B". Same payload/sink.
+    const matcherB: EgressMatcher = {
+      name: "matcher-B",
+      match: () => ({ originsFound: [], matchCount: 0 }),
+    };
+    const a = await classifyEgress("same payload", ctx, {
+      sinkId: "fetch",
+      sinkScope: "external-configured",
+      matcher: matcherA,
+    });
+    const b = await classifyEgress("same payload", ctx, {
+      sinkId: "fetch",
+      sinkScope: "external-configured",
+      matcher: matcherB,
+    });
+    expect(a.verdict).toBe("warn");
+    expect(a.fromCache).toBe(false);
+    // If the cache did NOT namespace by matcher name, B would have served
+    // A's cached warn-hit. It must compute its own (pass) verdict instead.
+    expect(b.verdict).toBe("pass");
+    expect(b.fromCache).toBe(false);
+    expect(_cacheSize()).toBe(2); // two distinct keys, not one
+  });
+  test("re-running the same matcher does serve from cache", async () => {
+    const ctx = createRunContext();
+    ctx.dataLineage = new Map<string, TrustOrigin>([["anything", "subagent"]]);
+    _clearEgressCache();
+    const m: EgressMatcher = {
+      name: "stable",
+      match: () => ({ originsFound: ["subagent"], matchCount: 1 }),
+    };
+    const first = await classifyEgress("p", ctx, {
+      sinkId: "fetch",
+      sinkScope: "external-configured",
+      matcher: m,
+    });
+    const second = await classifyEgress("p", ctx, {
+      sinkId: "fetch",
+      sinkScope: "external-configured",
+      matcher: m,
+    });
+    expect(first.fromCache).toBe(false);
+    expect(second.fromCache).toBe(true);
+    expect(second.verdict).toBe("warn");
+  });
+});

package/src/index.ts CHANGED Viewed

@@ -43,7 +43,7 @@
  * a perf optimisation.
  *
  * Catalog layer: R8 (extension of §18 safety primitives, symmetric to
- * `boundary-classifier`). Recipe: demos/walkthroughs/51-egress-fabric.md.
+ * `boundary-classifier`). Recipe: demos/walkthroughs/55-egress-fabric.md.
  */
 import { createHash } from "node:crypto";
 import { CrewhausError } from "@crewhaus/errors";
@@ -113,13 +113,221 @@ const ORIGIN_DEFAULT_POLICY: SeverityMatrix = {
 };
 /**
- * Minimum length for a tagged-content match to count. Short common
- * strings (whitespace, single words, IDs ≤8 chars) produce too many
- * false positives. 16 chars is the floor that empirically lets through
- * benign overlap (`"the"`, `"https"`, short identifiers) while still
- * catching meaningful exfil (URLs, tokens, sentences).
+ * Minimum length for a tagged-content match to count. This is a BACKSTOP
+ * against pathological lineage entries, not the primary false-positive
+ * control: insertion discipline lives in run-context's `tagContent`, which
+ * only admits whole blobs / lines >= 16 chars and credential-shaped tokens
+ * >= 8 chars (audit follow-up R2 — see `MIN_TOKEN_TAG_LENGTH` and
+ * `isCredentialShaped` there). 8 matches the token floor so vetted short
+ * secrets (sk-..., hex runs, key=value secrets) can actually match at
+ * egress; anything shorter is indistinguishable from prose. Keep in sync
+ * with run-context's `MIN_TOKEN_TAG_LENGTH`.
+ *
+ * This value is only the default floor: `classifyEgress` uses a
+ * caller-supplied `minMatchLength` option in preference to it
+ * (`opts.minMatchLength ?? MIN_MATCH_LENGTH`).
  */
-export const MIN_MATCH_LENGTH = 16;
+export const MIN_MATCH_LENGTH = 8;
+/**
+ * FR-006 — the matching step factored behind a strategy interface. The
+ * matcher decides *which* tagged lineage entries the outbound payload
+ * "contains"; it never decides pass/warn/block. The verdict fold (origin
+ * policy + `block > warn > pass` precedence) stays in `classifyEgress`, so
+ * the three audit outcomes and their precedence are structurally
+ * matcher-independent.
+ *
+ * The default `SubstringEgressMatcher` is behavior-preserving: it is the
+ * verbatim substring scan that lived inline before the seam existed,
+ * including the `MIN_MATCH_LENGTH` floor. An optional embedding-backed
+ * matcher ships separately as `@crewhaus/egress-matcher-semantic`; the
+ * default egress path never imports it (no new hard dependency).
+ *
+ * NOTE: the FR sketch wrote `match(payload, lineage, opts)` with
+ * `DataLineage` / `EgressOpts` types. Those names do not exist in the
+ * codebase (lineage is `Map<string, TrustOrigin>` on `RunContext`; there
+ * is no `DataLineage` type). This implementation uses a single
+ * `EgressMatchInput` bag — idiomatic with this codebase's option-bag style
+ * — and keeps the matcher returning only raw hits, which strictly
+ * strengthens the matcher-independence guarantee.
+ */
+export type EgressMatchInput = {
+  /** The serialized outbound payload to inspect. */
+  readonly payload: string;
+  /** The run-context data-lineage map: tagged content → its trust origin. */
+  readonly lineage: ReadonlyMap<string, TrustOrigin>;
+  /**
+   * Floor below which a tagged entry is too short to count as a match.
+   * `classifyEgress` fills this with `opts.minMatchLength ?? MIN_MATCH_LENGTH`.
+   */
+  readonly minMatchLength: number;
+};
+/**
+ * Raw lineage hits — origins whose tagged content the matcher considers
+ * present in the payload, plus a count of distinct matched tagged strings.
+ * Deliberately verdict-free: `classifyEgress` folds policy over
+ * `originsFound`, the matcher does not.
+ */
+export type EgressMatchResult = {
+  /** Trust origins of the lineage entries the matcher considers present in the payload. */
+  readonly originsFound: ReadonlyArray<TrustOrigin>;
+  /** Number of distinct tagged lineage strings that matched. */
+  readonly matchCount: number;
+};
+/**
+ * A pluggable egress-matching strategy. `name` namespaces audit/trace
+ * records and the verdict cache key (so a semantic-matcher verdict never
+ * serves a substring-matcher hit from cache). `match` may be sync or
+ * async; `classifyEgress` awaits it either way.
+ */
+export interface EgressMatcher {
+  /** Matcher identifier; `classifyEgress` hashes it into the verdict cache key. */
+  readonly name: string;
+  /** Report raw lineage hits for `input`, either directly or as a promise. */
+  match(input: EgressMatchInput): EgressMatchResult | Promise<EgressMatchResult>;
+}
+/**
+ * Decode-aware normalization so a tagged secret that was *re-encoded* before
+ * egress is still detected. A pure verbatim `payload.includes(tagged)` scan is
+ * trivially bypassed by a prompt-injectable model two ways:
+ *
+ *   1. JSON-escaping — runtime-core builds the egress payload as
+ *      `JSON.stringify(toolInput)`, which escapes the newlines/quotes/
+ *      backslashes that tagged MCP/sub-agent/channel content almost always
+ *      contains, so the raw tagged string is no longer a substring.
+ *   2. base64/hex/percent re-encoding — the model emits `base64(secret)`
+ *      instead of the plaintext.
+ *
+ * `buildScanViews` returns the payload plus normalized views (JSON-decoded
+ * string values, and base64/hex/percent-decoded blobs found in either form),
+ * and the matcher tests each tagged entry against ALL of them. The decoders
+ * mirror `@crewhaus/prompt-injection-detector` (replicated, not imported, to
+ * keep egress-classifier dependency-free; keep the copies in sync) and are
+ * bounded (match count + recursion depth) so this is not itself a DoS vector.
+ */
+/**
+ * Heuristic check that a decoded blob is text rather than binary noise.
+ * Returns `false` for the empty string; otherwise `true` when strictly more
+ * than 85% of its UTF-16 code units are tab, LF, CR, or printable ASCII
+ * (codes 32 through 126).
+ */
+function isMostlyPrintable(s: string): boolean {
+  const total = s.length;
+  if (total === 0) return false;
+  const isPrintableCode = (code: number): boolean =>
+    code === 9 || code === 10 || code === 13 || (code >= 32 && code < 127);
+  let hits = 0;
+  for (let idx = 0; idx < total; idx += 1) {
+    if (isPrintableCode(s.charCodeAt(idx))) hits += 1;
+  }
+  return hits / total > 0.85;
+}
+/**
+ * Attempt to read `blob` as base64-encoded text.
+ * Blobs shorter than 16 chars, or whose length leaves remainder 1 when
+ * divided by 4, are rejected up front. Otherwise the bytes are decoded as
+ * UTF-8 and returned only when `isMostlyPrintable` accepts them; every other
+ * outcome yields `undefined`.
+ */
+function tryDecodeBase64(blob: string): string | undefined {
+  const tooShort = blob.length < 16;
+  const impossibleLength = blob.length % 4 === 1;
+  if (tooShort || impossibleLength) return undefined;
+  try {
+    const text = Buffer.from(blob, "base64").toString("utf8");
+    if (!isMostlyPrintable(text)) return undefined;
+    return text;
+  } catch {
+    return undefined;
+  }
+}
+/**
+ * Attempt to read `blob` as hex-encoded text.
+ * Blobs shorter than 16 chars or of odd length are rejected; otherwise the
+ * bytes are decoded as UTF-8 and returned only when `isMostlyPrintable`
+ * accepts the result. Returns `undefined` in every other case.
+ */
+function tryDecodeHex(blob: string): string | undefined {
+  const hasEvenLength = blob.length % 2 === 0;
+  if (!hasEvenLength || blob.length < 16) return undefined;
+  try {
+    const text = Buffer.from(blob, "hex").toString("utf8");
+    if (isMostlyPrintable(text)) return text;
+    return undefined;
+  } catch {
+    return undefined;
+  }
+}
+/**
+ * Percent-decode `text`, tolerating malformed escapes.
+ *
+ * Each maximal run of `%XX` escapes is decoded on its own, so a single stray
+ * `%` or an invalid UTF-8 byte elsewhere in the payload cannot switch off
+ * decoding for the whole string (calling `decodeURIComponent` on the full
+ * text throws on the first malformed sequence, which would let a payload
+ * hide an encoded secret behind one deliberately broken escape). A run that
+ * `decodeURIComponent` rejects falls back to a lenient byte-level UTF-8
+ * decode. For input that is entirely well-formed the result is the same as
+ * `decodeURIComponent(text)`.
+ *
+ * @returns the decoded text, or `undefined` when decoding changes nothing.
+ */
+function tryDecodePercent(text: string): string | undefined {
+  const decoded = text.replace(/(?:%[0-9A-Fa-f]{2})+/g, (run) => {
+    try {
+      return decodeURIComponent(run);
+    } catch {
+      // Invalid UTF-8 inside this run: decode the raw bytes leniently instead
+      // of abandoning the run (invalid bytes become U+FFFD).
+      return Buffer.from(run.replace(/%/g, ""), "hex").toString("utf8");
+    }
+  });
+  return decoded !== text ? decoded : undefined;
+}
+/**
+ * Recursively decode base64/hex/percent blobs found in `text`.
+ *
+ * Per level, at most the first 8 base64-looking runs and the first 8
+ * hex-looking runs are tried, and percent-decoding is applied to the whole
+ * text when it contains a `%XX` escape. Every successful decoding is itself
+ * re-scanned with `depth - 1`, and the combined result is truncated to 16
+ * entries.
+ *
+ * @param text  the string to search for encoded blobs.
+ * @param depth remaining recursion budget; `<= 0` yields no variants.
+ * @returns decoded strings in discovery order (possibly empty).
+ */
+function decodedVariants(text: string, depth = 2): string[] {
+  if (depth <= 0 || text.length === 0) return [];
+  const out: string[] = [];
+  const push = (s: string | undefined): void => {
+    if (s !== undefined && s.length > 0) out.push(s, ...decodedVariants(s, depth - 1));
+  };
+  // Take matches straight off the lazy `matchAll` iterator and stop at the
+  // limit. Spreading the iterator first would materialise every match in a
+  // large payload only to discard all but the first few, which undermines
+  // the DoS bound this function exists to provide.
+  const firstMatches = (pattern: RegExp, limit: number): string[] => {
+    const found: string[] = [];
+    for (const m of text.matchAll(pattern)) {
+      found.push(m[0]);
+      if (found.length >= limit) break;
+    }
+    return found;
+  };
+  for (const blob of firstMatches(/[A-Za-z0-9+/]{16,}={0,2}/g, 8)) {
+    push(tryDecodeBase64(blob));
+  }
+  for (const blob of firstMatches(/(?:[0-9A-Fa-f]{2}){8,}/g, 8)) {
+    push(tryDecodeHex(blob));
+  }
+  if (/%[0-9A-Fa-f]{2}/.test(text)) push(tryDecodePercent(text));
+  return out.slice(0, 16);
+}
+/**
+ * Collect every string carried by a parsed JSON value into `out`: first all
+ * string leaves in document order, then all object keys.
+ *
+ * Keys are included because a payload can carry content as a property name
+ * just as easily as a value, and JSON escaping hides it from a raw substring
+ * scan in exactly the same way. They are appended AFTER every value so the
+ * sequence of values is unchanged by their presence.
+ *
+ * @param value the parsed JSON value (or sub-value) to walk.
+ * @param out   receives the collected strings.
+ * @param keys  internal accumulator for object keys during recursion;
+ *              external callers omit it.
+ */
+function collectJsonStrings(value: unknown, out: string[], keys?: string[]): void {
+  const isRoot = keys === undefined;
+  const keySink = keys ?? [];
+  if (typeof value === "string") {
+    out.push(value);
+  } else if (Array.isArray(value)) {
+    for (const v of value) collectJsonStrings(v, out, keySink);
+  } else if (value !== null && typeof value === "object") {
+    for (const [k, v] of Object.entries(value)) {
+      keySink.push(k);
+      collectJsonStrings(v, out, keySink);
+    }
+  }
+  if (isRoot) {
+    // Plain loop rather than push(...keySink): spreading a very large array
+    // into call arguments can exceed the engine's argument limit.
+    for (const k of keySink) out.push(k);
+  }
+}
+/**
+ * Assemble every string a tagged entry is scanned against, in this order:
+ * the raw payload; the newline-joined strings `collectJsonStrings` extracts
+ * when the payload parses as JSON (recovers content the `JSON.stringify`
+ * egress encoding escaped); then the `decodedVariants` of the raw payload
+ * followed by those of the JSON view.
+ */
+function buildScanViews(payload: string): string[] {
+  // Recover the JSON string content when the payload is itself JSON.
+  let jsonView: string | undefined;
+  try {
+    const leaves: string[] = [];
+    collectJsonStrings(JSON.parse(payload), leaves);
+    if (leaves.length > 0) jsonView = leaves.join("\n");
+  } catch {
+    // Not JSON — only the raw payload + its decodings are scanned.
+  }
+  const sources = jsonView === undefined ? [payload] : [payload, jsonView];
+  // Start from the sources themselves, then append each source's decodings.
+  const views = [...sources];
+  for (const source of sources) {
+    views.push(...decodedVariants(source));
+  }
+  return views;
+}
+/**
+ * The default egress matcher. A tagged entry counts when it is at least
+ * `minMatchLength` chars and appears in the payload OR in any of its
+ * normalized views (see `buildScanViews`) — so JSON-escaping and
+ * base64/hex/percent re-encoding can no longer slip a tagged secret past the
+ * sink-side fabric. The raw payload is always scanned first, so every match
+ * the old verbatim scan caught is still caught. `originsFound` is deduped;
+ * `matchCount` counts distinct matched tagged strings.
+ */
+export class SubstringEgressMatcher implements EgressMatcher {
+  // `name` is set in the constructor instead of via a field initializer:
+  // bun's coverage treats a class-field initializer as a separate function
+  // and (as of bun 1.3.x) cannot mark it covered, which leaves a gap in the
+  // function-coverage count that tests cannot close. Assigning in the
+  // constructor is equivalent at runtime and is counted normally.
+  readonly name: string;
+  constructor() {
+    this.name = "substring";
+  }
+  /**
+   * Scan every lineage entry of at least `minMatchLength` chars against all
+   * scan views of the payload. Returns the deduplicated origins of the
+   * entries found plus the number of entries that matched.
+   */
+  match(input: EgressMatchInput): EgressMatchResult {
+    const views = buildScanViews(input.payload);
+    const origins = new Set<TrustOrigin>();
+    let matchCount = 0;
+    for (const [tagged, origin] of input.lineage.entries()) {
+      const longEnough = tagged.length >= input.minMatchLength;
+      if (!longEnough) continue;
+      const present = views.some((view) => view.includes(tagged));
+      if (!present) continue;
+      origins.add(origin);
+      matchCount += 1;
+    }
+    return { originsFound: Array.from(origins), matchCount };
+  }
+}
+/** Shared default-matcher singleton — the built-in egress detection. */
+export const substringMatcher: EgressMatcher = new SubstringEgressMatcher();
 export type EgressPolicyOverride = Partial<Record<TrustOrigin, EgressVerdict>>;
@@ -150,6 +358,15 @@ export type ClassifyEgressOptions = {
    * supply this.
    */
   readonly minMatchLength?: number;
+  /**
+   * FR-006 — pluggable matching strategy. Defaults to `substringMatcher`
+   * (behavior-preserving). Supply an alternate matcher (e.g. the optional
+   * `@crewhaus/egress-matcher-semantic`) to swap *how* lineage matches are
+   * detected; the per-origin/per-sink policy and the three audit outcomes
+   * are unaffected. The cache key namespaces by `matcher.name`, so
+   * switching matchers mid-run never cross-serves a stale verdict.
+   */
+  readonly matcher?: EgressMatcher;
 };
 /**
@@ -179,9 +396,6 @@ class LruCache<V> {
       this.map.delete(oldest);
     }
   }
-  has(key: string): boolean {
-    return this.map.has(key);
-  }
   size(): number {
     return this.map.size;
   }
@@ -198,15 +412,51 @@ type CachedVerdict = {
 const cache = new LruCache<CachedVerdict>(DEFAULT_CACHE_CAP);
-function cacheKey(payload: string, sinkScope: SinkScope, sinkId: string): string {
-  const h = createHash("sha256")
-    .update(sinkScope)
-    .update("|")
-    .update(sinkId)
-    .update("|")
-    .update(payload, "utf8")
-    .digest("hex");
-  return h;
+function cacheKey(
+  payload: string,
+  sinkScope: SinkScope,
+  sinkId: string,
+  matcherName: string,
+  lineageDigest: string,
+): string {
+  // Every field is hashed as `<utf8 byte length>:<field>` so that field
+  // boundaries are unambiguous whatever the fields contain. A plain `"|"`
+  // separator is not injective once a field may itself contain `"|"`:
+  // (sinkId="tool|", payload="x") and (sinkId="tool", payload="|x") would
+  // hash to the same key and cross-serve a cached verdict for a *different*
+  // payload — a cache-poisoning / egress-scan-bypass vector when sinkId
+  // carries attacker influence (e.g. a dynamically discovered MCP tool
+  // name). The length prefix makes each field self-delimiting.
+  const hasher = createHash("sha256");
+  const fields = [matcherName, sinkScope, sinkId, payload, lineageDigest];
+  fields.forEach((field) => {
+    hasher.update(`${Buffer.byteLength(field, "utf8")}:`);
+    hasher.update(field, "utf8");
+  });
+  return hasher.digest("hex");
+}
+/**
+ * Stable digest of the lineage map's CONTENT (keys + origins, sorted by
+ * key), used as a cache-key component. Without it the cache serves stale
+ * verdicts: the lineage map GROWS during a run (every boundary crossing tags
+ * more content), so the same (payload, sink) pair legitimately classifies
+ * differently once a secret contained in the payload gets tagged. A verdict
+ * cached before that tag would otherwise be served forever — an egress-scan
+ * bypass. Sorting makes the digest insensitive to recency-refresh reordering
+ * (delete + re-insert on re-tag), which changes Map iteration order without
+ * changing content.
+ *
+ * Both the tagged string and its origin are hashed as
+ * `<utf8 byte length>:<value>`, so no two different lineage maps can feed
+ * the hash the same byte stream.
+ *
+ * @param lineage tagged content → trust origin.
+ * @returns hex SHA-256 digest of the framed, key-sorted entries.
+ */
+function lineageDigestOf(lineage: ReadonlyMap<string, TrustOrigin>): string {
+  const h = createHash("sha256");
+  // Map keys are unique, so ordering the entries by key alone is total; `<`
+  // on strings compares UTF-16 code units, like the default array sort.
+  const entries = [...lineage.entries()].sort(([a], [b]) => (a < b ? -1 : a > b ? 1 : 0));
+  for (const [tagged, origin] of entries) {
+    for (const part of [tagged, origin]) {
+      h.update(`${Buffer.byteLength(part, "utf8")}:`);
+      h.update(part, "utf8");
+    }
+  }
+  return h.digest("hex");
 }
 /**
@@ -265,7 +515,22 @@ export async function classifyEgress(
     };
   }
-  const key = cacheKey(payload, opts.sinkScope, opts.sinkId);
+  const floor = opts.minMatchLength ?? MIN_MATCH_LENGTH;
+  const matcher = opts.matcher ?? substringMatcher;
+  // Namespace the cache by matcher name so a verdict produced by one
+  // matcher (e.g. semantic) is never served to a call using another
+  // (e.g. substring) over the same (sinkScope, sinkId, payload) — and by a
+  // digest of the lineage content so a verdict computed against an OLDER,
+  // smaller lineage is never served after new tags land (see
+  // `lineageDigestOf`).
+  const key = cacheKey(
+    payload,
+    opts.sinkScope,
+    opts.sinkId,
+    matcher.name,
+    lineageDigestOf(lineage),
+  );
   if (opts.bypassCache !== true) {
     const hit = cache.get(key);
     if (hit !== undefined) {
@@ -283,19 +548,13 @@ export async function classifyEgress(
     }
   }
-  const floor = opts.minMatchLength ?? MIN_MATCH_LENGTH;
-  const seen = new Set<TrustOrigin>();
-  let matchCount = 0;
-  for (const [tagged, origin] of lineage.entries()) {
-    if (tagged.length < floor) continue;
-    if (payload.includes(tagged)) {
-      seen.add(origin);
-      matchCount += 1;
-    }
-  }
-  const originsFound: ReadonlyArray<TrustOrigin> = [...seen];
+  // The matcher decides *which* lineage entries the payload contains; the
+  // policy fold below is matcher-independent. `match` may be sync or async.
+  const { originsFound, matchCount } = await matcher.match({
+    payload,
+    lineage,
+    minMatchLength: floor,
+  });
   const cached: CachedVerdict = { verdict: "pass", originsFound, matchCount };
   if (opts.bypassCache !== true) {
     cache.set(key, cached);