npm - @sanity/ailf - Versions diffs - 3.9.0 → 4.0.1 - Mend

@sanity/ailf 3.9.0 → 4.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (37)

package/config/models.ts +32 -4
package/dist/_vendor/ailf-core/config-helpers.d.ts +8 -2
package/dist/_vendor/ailf-core/config-helpers.js +54 -1
package/dist/_vendor/ailf-core/services/slim-report-summary.js +13 -4
package/dist/_vendor/ailf-core/types/index.d.ts +10 -0
package/dist/_vendor/ailf-shared/index.d.ts +16 -10
package/dist/_vendor/ailf-shared/index.js +13 -10
package/dist/adapters/task-sources/repo-schemas.d.ts +3 -3
package/dist/agent-observer/agentic-provider.js +28 -23
package/dist/agent-observer/classifier.js +7 -2
package/dist/agent-observer/proxy.d.ts +88 -3
package/dist/agent-observer/proxy.js +174 -16
package/dist/agent-observer/types.d.ts +23 -5
package/dist/artifact-capture/accumulating-artifact-writer.d.ts +13 -0
package/dist/artifact-capture/accumulating-artifact-writer.js +19 -0
package/dist/cli-program.js +1 -1
package/dist/commands/baseline.d.ts +3 -1
package/dist/commands/baseline.js +29 -9
package/dist/commands/cache.d.ts +5 -1
package/dist/commands/cache.js +31 -15
package/dist/commands/check-staleness.js +12 -4
package/dist/commands/compare.js +11 -4
package/dist/commands/explain-handler.js +2 -2
package/dist/config/models.ts +32 -4
package/dist/orchestration/steps/run-eval-step.js +39 -29
package/dist/pipeline/baseline.d.ts +14 -3
package/dist/pipeline/baseline.js +7 -13
package/dist/pipeline/cache-hit-restore.d.ts +24 -0
package/dist/pipeline/cache-hit-restore.js +32 -0
package/dist/pipeline/calculate-scores.js +40 -1
package/dist/pipeline/compiler/provider-assembler.d.ts +23 -0
package/dist/pipeline/compiler/provider-assembler.js +37 -2
package/dist/pipeline/eval-fingerprint.d.ts +33 -35
package/dist/pipeline/eval-fingerprint.js +124 -106
package/dist/pipeline/mirror-repo-tasks.d.ts +1 -1
package/dist/report-store.js +3 -0
package/package.json +2 -2

package/config/models.ts CHANGED Viewed

@@ -35,16 +35,23 @@ export default defineModels({
     // ── OpenAI ─────────────────────────────────────────────────
     {
+      // gpt-5.2 routes through chat completions (and through the in-house
+      // agentic provider for naive/optimized variants). `verbosity` is a
+      // Responses-API-only field — it would be silently dropped here, so
+      // it isn't configured. See W0131.
       id: "openai:chat:gpt-5.2",
       label: "GPT 5.2",
       config: {
         max_completion_tokens: 8192,
-        verbosity: "medium",
       },
       modes: ["literacy", "knowledge-probe"],
       // All literacy variants included by default
     },
     {
+      // GPT 5.4 evaluated only on the baseline literacy variant. Promptfoo's
+      // native handling of `openai:responses:` honors reasoning / verbosity /
+      // summary; the in-house agentic provider does not (W0131). MCP-server
+      // and knowledge-probe routes go through Promptfoo native too.
       id: "openai:responses:gpt-5.4",
       label: "GPT 5.4",
       config: {
@@ -55,7 +62,9 @@ export default defineModels({
       },
       timeoutMs: 600_000, // 10 min — reasoning model needs more headroom
       modes: ["literacy", "mcp-server", "knowledge-probe"],
-      // All literacy variants included by default
+      variants: {
+        literacy: ["baseline"],
+      },
     },
     // ── Disabled models (uncomment to enable) ──────────────────
@@ -93,12 +102,31 @@ export default defineModels({
   defaults: {
     temperature: 0.2,
     max_tokens: 4096,
-    maxToolRounds: 5, // for agentic modes
+    // Global default round budget for agentic modes. Per-mode overrides
+    // below give naive more headroom (W0134) since it spends rounds on
+    // retries when fetches fail. Per-model `config.maxToolRounds` still
+    // wins over both values.
+    maxToolRounds: 5,
+    modeMaxToolRounds: {
+      "agentic-naive": 8,
+      "agentic-optimized": 5,
+    },
     observerOptions: {
-      maxPreviewBytes: 2048,
+      // Per-class preview caps (W0133): default 4 KB, but search responses
+      // get 16 KB and llms.txt gets 128 KB so trace audits can resolve
+      // which result the model actually saw.
+      maxPreviewBytes: 4096,
+      previewLimits: {
+        default: 4096,
+        llmsTxt: 131072,
+        search: 16384,
+      },
       captureResponsePreview: true,
       includePatterns: ["sanity.io", "sanity.dev", "cdn.sanity.io"],
       sensitiveHeaders: ["authorization", "cookie", "x-api-key"],
+      // statusOnlyForUnmatched defaults to true (W0132) — model-side
+      // traffic to api.openai.com / api.anthropic.com / googleapis.com
+      // surfaces in run artifacts as slim status-only entries.
     },
   },
 })

package/dist/_vendor/ailf-core/config-helpers.d.ts CHANGED Viewed

@@ -57,8 +57,14 @@ export declare function defineTask(task: GeneralizedTaskDefinition): Generalized
  * Validates:
  * - Every `modes` entry is a canonical eval mode name
  * - Every `variants` key is a mode the model is enrolled in
- *
- * @throws {Error} On invalid mode names or mismatched variant keys
+ * - `openai:responses:` model ids are not used for agentic literacy variants
+ *   (the in-house agentic loop dispatches to chat completions only)
+ * - Responses-API-only fields (`reasoning`, `summary`, `verbosity`) are not
+ *   set on a model that routes through the agentic provider — they would be
+ *   silently dropped.
+ *
+ * @throws {Error} On invalid mode names, mismatched variant keys, or
+ *                 misconfigured OpenAI Responses-API fields.
  */
 export declare function defineModels(models: ModelsConfig): ModelsConfig;
 /**

package/dist/_vendor/ailf-core/config-helpers.js CHANGED Viewed

@@ -54,6 +54,33 @@ export function defineTask(task) {
 // ---------------------------------------------------------------------------
 // Model registry helpers
 // ---------------------------------------------------------------------------
+/**
+ * OpenAI Responses-API-only fields. The agentic provider's OpenAI loop
+ * routes everything through `/v1/chat/completions` and would silently drop
+ * these. We surface the misconfiguration at config-load time instead.
+ *
+ * @see docs/work-items/W0131-honor-openai-responses-provider.json
+ */
+const RESPONSES_ONLY_FIELDS = ["reasoning", "summary", "verbosity"];
+/**
+ * Whether a model would be assembled into agentic-naive or agentic-optimized
+ * literacy variants. These are the variants that route through the in-house
+ * agentic provider (which speaks chat completions only); baseline routes
+ * through Promptfoo's native handling, which honors `openai:responses:` ids.
+ *
+ * Note: variant names mirror the literacy mode base in
+ * `packages/eval/src/pipeline/compiler/mode-bases/literacy.ts`.
+ */
+function participatesInAgenticLiteracy(model) {
+    const enrolledInLiteracy = !model.modes || model.modes.includes("literacy");
+    if (!enrolledInLiteracy)
+        return false;
+    const literacyVariants = model.variants?.literacy;
+    if (!literacyVariants)
+        return true;
+    return (literacyVariants.includes("agentic-naive") ||
+        literacyVariants.includes("agentic-optimized"));
+}
 /**
  * Define the model registry (models to evaluate and grader model).
  *
@@ -62,8 +89,14 @@ export function defineTask(task) {
  * Validates:
  * - Every `modes` entry is a canonical eval mode name
  * - Every `variants` key is a mode the model is enrolled in
+ * - `openai:responses:` model ids are not used for agentic literacy variants
+ *   (the in-house agentic loop dispatches to chat completions only)
+ * - Responses-API-only fields (`reasoning`, `summary`, `verbosity`) are not
+ *   set on a model that routes through the agentic provider — they would be
+ *   silently dropped.
  *
- * @throws {Error} On invalid mode names or mismatched variant keys
+ * @throws {Error} On invalid mode names, mismatched variant keys, or
+ *                 misconfigured OpenAI Responses-API fields.
  */
 export function defineModels(models) {
     const validModes = new Set(CANONICAL_EVAL_MODES);
@@ -87,6 +120,26 @@ export function defineModels(models) {
                 }
             }
         }
+        const usesAgentic = participatesInAgenticLiteracy(model);
+        if (usesAgentic && model.id.startsWith("openai:responses:")) {
+            throw new Error(`Model "${model.label ?? model.id}": the in-house agentic provider ` +
+                `does not implement the OpenAI Responses API endpoint — requests would ` +
+                `be silently downgraded to chat completions. Either restrict variants to ` +
+                `["baseline"] (Promptfoo's native handling honors openai:responses:) or ` +
+                `change the id to "openai:chat:..." for agentic evaluation. ` +
+                `See W0131 for context.`);
+        }
+        if (usesAgentic && model.config) {
+            const droppedFields = RESPONSES_ONLY_FIELDS.filter((f) => f in model.config);
+            if (droppedFields.length > 0) {
+                throw new Error(`Model "${model.label ?? model.id}": configured fields ` +
+                    `${droppedFields.map((f) => `"${f}"`).join(", ")} are only honored ` +
+                    `by the OpenAI Responses API. The agentic provider's chat-completions ` +
+                    `path would silently drop them. Either remove these fields or restrict ` +
+                    `variants to ["baseline"] so the model is evaluated only through ` +
+                    `Promptfoo's native Responses-API handler. See W0131 for context.`);
+            }
+        }
     }
     return models;
 }

package/dist/_vendor/ailf-core/services/slim-report-summary.js CHANGED Viewed

@@ -138,12 +138,21 @@ function toTitleCase(id) {
 // ---------------------------------------------------------------------------
 const RECOMMENDATION_TOP_N = 3;
 function slimRecommendations(full) {
+    // Cache-hit pass-through: when the pipeline restores a previously
+    // published report on a remote cache hit, `score-summary.json` carries
+    // recommendations in their already-slim shape (no `.gaps` field).
+    // Re-slimming would crash on `for (gap of undefined)`; the slim shape
+    // has no full-fidelity data to recover, so we return it verbatim.
+    if (!Array.isArray(full.gaps)) {
+        return full;
+    }
+    const fullReport = full;
     const counts = {};
-    for (const gap of full.gaps) {
+    for (const gap of fullReport.gaps) {
         counts[gap.area] = (counts[gap.area] ?? 0) + 1;
     }
     // Sort by priority descending, break ties by estimatedLift.
-    const sorted = [...full.gaps].sort((a, b) => (b.priority ?? 0) - (a.priority ?? 0) ||
+    const sorted = [...fullReport.gaps].sort((a, b) => (b.priority ?? 0) - (a.priority ?? 0) ||
         (b.estimatedLift ?? 0) - (a.estimatedLift ?? 0));
     const top3 = sorted
         .slice(0, RECOMMENDATION_TOP_N)
@@ -156,8 +165,8 @@ function slimRecommendations(full) {
     return {
         counts,
         top3,
-        totalGaps: full.gaps.length,
-        totalPotentialLift: full.totalPotentialLift,
+        totalGaps: fullReport.gaps.length,
+        totalPotentialLift: fullReport.totalPotentialLift,
     };
 }
 /**

package/dist/_vendor/ailf-core/types/index.d.ts CHANGED Viewed

@@ -1364,6 +1364,15 @@ export interface ArtifactRefEntry {
  *   - `truncated` on the bulk row indicates the single-object body was capped.
  *   - `preview` on the bulk row carries a descriptor-typed summary for list
  *     views; wiring lands in W0051.
+ *
+ * D0040/W0135 extension:
+ *   - `sourceRunId` declares that this ref's bytes physically live under a
+ *     different run's storage prefix than the manifest containing it.
+ *     `path` is already self-contained and authoritative for resolution;
+ *     `sourceRunId` is purely a lineage marker for retention, GC,
+ *     observability, and BigQuery joins. Set by the cache-hit branch in
+ *     `RunEvalStep` when a new run reuses a prior report's artifacts;
+ *     unset on cold-path producers.
  */
 export interface ArtifactRef {
     store: "gcs" | "local";
@@ -1381,6 +1390,7 @@ export interface ArtifactRef {
     entries?: ArtifactRefEntry[];
     truncated?: boolean;
     preview?: unknown;
+    sourceRunId?: RunId;
 }
 /**
  * Catalog of artifact refs produced by a single pipeline run.

package/dist/_vendor/ailf-shared/index.d.ts CHANGED Viewed

@@ -8,14 +8,20 @@
  * Design rule: this package has ZERO runtime dependencies and ZERO imports
  * from @sanity/ailf-core, @sanity/ailf, or
  * @sanity/ailf-studio. It is the leaf of the dependency graph.
+ *
+ * Re-exports are explicit (named) rather than `export *` so that the studio
+ * tsup DTS bundle can statically resolve each symbol's canonical owner —
+ * `export *` chains across many modules trip rollup-plugin-dts's "Ambiguous
+ * external namespace resolution" warning even when no symbol actually
+ * collides. See W0124.
  */
-export * from "./canary-drift.js";
-export * from "./document-ref.js";
-export * from "./feature-flags.js";
-export * from "./score-grades.js";
-export * from "./noise-threshold.js";
-export * from "./eval-modes.js";
-export * from "./owner-teams.js";
-export * from "./run-classification.js";
-export * from "./run-trigger.js";
-export * from "./run-context.js";
+export { computeCanaryDrift, type CanaryDriftReport, type CanaryReportSlim, type DriftEntry, type DriftThresholds, type DriftVerdict, } from "./canary-drift.js";
+export { type DocumentRef } from "./document-ref.js";
+export { FEATURE_FLAGS, type FeatureFlag, type FeatureFlagKey, } from "./feature-flags.js";
+export { GRADE_BOUNDARIES, scoreGrade, type ScoreGrade, } from "./score-grades.js";
+export { NOISE_THRESHOLD } from "./noise-threshold.js";
+export { CANONICAL_EVAL_MODES, LEGACY_EVAL_MODE_ALIASES, LITERACY_VARIANTS, RAW_EVAL_MODES, type EvalMode, type LiteracyVariant, type RawEvalMode, } from "./eval-modes.js";
+export { isKnownOwnerTeam, KNOWN_OWNER_TEAMS, normalizeOwnerTeam, } from "./owner-teams.js";
+export { isRunClassification, RUN_CLASSIFICATIONS, RUN_EXECUTOR_SURFACES, type RunClassification, type RunExecutor, type RunExecutorSurface, type RunExecutorSystem, type RunExecutorUser, type RunHost, type RunLineage, type RunOwner, type RunTool, } from "./run-classification.js";
+export { type RunTrigger } from "./run-trigger.js";
+export { type RunContext } from "./run-context.js";

package/dist/_vendor/ailf-shared/index.js CHANGED Viewed

@@ -8,14 +8,17 @@
  * Design rule: this package has ZERO runtime dependencies and ZERO imports
  * from @sanity/ailf-core, @sanity/ailf, or
  * @sanity/ailf-studio. It is the leaf of the dependency graph.
+ *
+ * Re-exports are explicit (named) rather than `export *` so that the studio
+ * tsup DTS bundle can statically resolve each symbol's canonical owner —
+ * `export *` chains across many modules trip rollup-plugin-dts's "Ambiguous
+ * external namespace resolution" warning even when no symbol actually
+ * collides. See W0124.
  */
-export * from "./canary-drift.js";
-export * from "./document-ref.js";
-export * from "./feature-flags.js";
-export * from "./score-grades.js";
-export * from "./noise-threshold.js";
-export * from "./eval-modes.js";
-export * from "./owner-teams.js";
-export * from "./run-classification.js";
-export * from "./run-trigger.js";
-export * from "./run-context.js";
+export { computeCanaryDrift, } from "./canary-drift.js";
+export { FEATURE_FLAGS, } from "./feature-flags.js";
+export { GRADE_BOUNDARIES, scoreGrade, } from "./score-grades.js";
+export { NOISE_THRESHOLD } from "./noise-threshold.js";
+export { CANONICAL_EVAL_MODES, LEGACY_EVAL_MODE_ALIASES, LITERACY_VARIANTS, RAW_EVAL_MODES, } from "./eval-modes.js";
+export { isKnownOwnerTeam, KNOWN_OWNER_TEAMS, normalizeOwnerTeam, } from "./owner-teams.js";
+export { isRunClassification, RUN_CLASSIFICATIONS, RUN_EXECUTOR_SURFACES, } from "./run-classification.js";

package/dist/adapters/task-sources/repo-schemas.d.ts CHANGED Viewed

@@ -147,8 +147,8 @@ export declare const CanonicalTaskSchema: z.ZodDiscriminatedUnion<[z.ZodObject<{
     baseline: z.ZodOptional<z.ZodObject<{
         enabled: z.ZodOptional<z.ZodBoolean>;
         rubric: z.ZodOptional<z.ZodEnum<{
-            abbreviated: "abbreviated";
             full: "full";
+            abbreviated: "abbreviated";
             none: "none";
         }>>;
     }, z.core.$strip>>;
@@ -773,8 +773,8 @@ export declare const ContentLakeAuthorableTaskSchema: z.ZodObject<{
     baseline: z.ZodOptional<z.ZodObject<{
         enabled: z.ZodOptional<z.ZodBoolean>;
         rubric: z.ZodOptional<z.ZodEnum<{
-            abbreviated: "abbreviated";
             full: "full";
+            abbreviated: "abbreviated";
             none: "none";
         }>>;
     }, z.core.$strip>>;
@@ -893,8 +893,8 @@ export declare const CanonicalTaskFileSchema: z.ZodArray<z.ZodDiscriminatedUnion
     baseline: z.ZodOptional<z.ZodObject<{
         enabled: z.ZodOptional<z.ZodBoolean>;
         rubric: z.ZodOptional<z.ZodEnum<{
-            abbreviated: "abbreviated";
             full: "full";
+            abbreviated: "abbreviated";
             none: "none";
         }>>;
     }, z.core.$strip>>;

package/dist/agent-observer/agentic-provider.js CHANGED Viewed

@@ -479,20 +479,18 @@ export default class AgenticProvider {
                 // Jina search unavailable
             }
         }
-        // Final fallback: construct likely Sanity doc URLs from the query
+        // Final fallback: search returned nothing usable. Point the agent at
+        // llms.txt (a real, fetchable doc index) instead of fabricating a URL
+        // from the query slug — fabricated URLs 404 and mislead the agent into
+        // thinking the doc system is unreachable. See W0129.
         if (results.length === 0) {
-            const sanitized = query
-                .toLowerCase()
-                .replace(/sanity\.?(io)?/gi, "")
-                .trim();
-            const slugGuess = sanitized
-                .replace(/\s+/g, "-")
-                .replace(/[^a-z0-9-]/g, "");
             results = [
                 {
-                    snippet: `Try the documentation page for: ${sanitized}`,
-                    title: `Documentation: ${query}`,
-                    url: `${this.docBaseUrl}/${slugGuess}`,
+                    snippet: `No direct search results. The documentation index is available at ` +
+                        `${this.llmsTxtUrl} — fetch it to discover real doc URLs, ` +
+                        `then fetch_page specific topics.`,
+                    title: `No results — try fetching ${this.llmsTxtUrl} for the doc index`,
+                    url: this.llmsTxtUrl,
                 },
             ];
         }
@@ -806,12 +804,14 @@ export default class AgenticProvider {
         const maxToolRounds = this.config.maxToolRounds || 5;
         const apiKey = this.config.apiKey || process.env.OPENAI_API_KEY;
         // Newer OpenAI models (gpt-5.x, o-series) use max_completion_tokens
-        // instead of max_tokens. Detect from config or model name.
-        const useMaxCompletionTokens = this.config.max_output_tokens != null ||
-            this.config.max_completion_tokens != null ||
-            model.startsWith("gpt-5") ||
+        // instead of max_tokens, and reject custom temperature values. Detect
+        // from config or model name. See W0131.
+        const isReasoningModel = model.startsWith("gpt-5") ||
             model.startsWith("o3") ||
             model.startsWith("o4");
+        const useMaxCompletionTokens = this.config.max_output_tokens != null ||
+            this.config.max_completion_tokens != null ||
+            isReasoningModel;
         const maxTokensValue = this.config.max_output_tokens ??
             this.config.max_completion_tokens ??
             this.config.max_tokens ??
@@ -840,15 +840,20 @@ export default class AgenticProvider {
         const startTime = Date.now();
         for (let round = 0; round <= maxToolRounds; round++) {
             const isLastRound = round === maxToolRounds;
+            const requestBody = {
+                ...tokenLimitParam,
+                messages,
+                model,
+                tool_choice: isLastRound ? "none" : "auto",
+                tools,
+            };
+            // gpt-5.x and o-series reject custom temperature; chat-completions
+            // models continue to receive the configured value. See W0131.
+            if (!isReasoningModel) {
+                requestBody.temperature = temperature;
+            }
             const response = await fetchFn("https://api.openai.com/v1/chat/completions", {
-                body: JSON.stringify({
-                    ...tokenLimitParam,
-                    messages,
-                    model,
-                    temperature,
-                    tool_choice: isLastRound ? "none" : "auto",
-                    tools,
-                }),
+                body: JSON.stringify(requestBody),
                 headers: {
                     Authorization: `Bearer ${apiKey}`,
                     "Content-Type": "application/json",

package/dist/agent-observer/classifier.js CHANGED Viewed

@@ -65,6 +65,11 @@ export function classifyRequests(requests) {
         // Skip failed requests (no response)
         if (req.statusCode === 0)
             continue;
+        // Status-only entries (W0132) carry no body, so we can't infer search
+        // queries or doc-page metadata reliably. They still count as API calls
+        // (Sanity API) or external requests (everything else) so the run shape
+        // shows that the call happened, but we skip the body-dependent buckets.
+        const isStatusOnly = req.capture === "status-only";
         // Order matters: API calls first (they may have ?query= params that look like searches),
         // then searches, then doc pages, then external
         if (isSanityApiRequest(req)) {
@@ -75,14 +80,14 @@ export function classifyRequests(requests) {
                 url: req.url,
             });
         }
-        else if (isSearchRequest(req)) {
+        else if (!isStatusOnly && isSearchRequest(req)) {
             result.searchQueries.push({
                 query: extractSearchQuery(req),
                 timestamp: req.timestamp,
                 url: req.url,
             });
         }
-        else if (isDocPageRequest(req)) {
+        else if (!isStatusOnly && isDocPageRequest(req)) {
             const slug = extractDocSlug(req.url);
             result.docPageVisits.push({
                 contentSize: req.responseSize,

package/dist/agent-observer/proxy.d.ts CHANGED Viewed

@@ -21,6 +21,25 @@
  *
  *   const log = recorder.stop()
  *   // → AgentBehaviorLog with all requests classified
+ *
+ * W0133 — per-class preview byte caps
+ *
+ * `responsePreview` is capped at `previewLimits.default` (4 KB) for most
+ * responses, with per-class overrides for two payloads whose contents are
+ * the ground truth for trace audits:
+ *
+ *   - `previewLimits.search` (16 KB) — Jina-wrapped DuckDuckGo, Google CSE,
+ *     bing.com/search, duckduckgo.com, google.com/search responses. Captures
+ *     the full result list (typical 8–10 KB) so trace audits can resolve
+ *     which result the model fetched next.
+ *   - `previewLimits.llmsTxt` (128 KB) — `/llms.txt` responses. The Sanity
+ *     index is ~110 KB. Capturing the full body lets trace audits
+ *     distinguish "model fetched a path that wasn't in the index" from
+ *     "model fetched a path that was in the index but the page is missing".
+ *
+ * The slim Content Lake report (W0051) does not inline previews — they
+ * live in the GCS `traces` NDJSON artifact only, so bumping these caps
+ * has no effect on the 10 MB Sanity document budget.
  */
 import type { ObservedRequest, AgentBehaviorLog } from "./types.js";
 export interface RecorderOptions {
@@ -31,13 +50,50 @@ export interface RecorderOptions {
     /** Filter: skip requests matching these URL patterns. Default: skip none.
      *  Accepts RegExp or string (strings are auto-converted to case-insensitive RegExp). */
     excludePatterns?: (RegExp | string)[];
-    /** Filter: only record requests matching these URL patterns. Default: record all.
-     *  Accepts RegExp or string (strings are auto-converted to case-insensitive RegExp). */
+    /** Filter: only fully record requests matching these URL patterns. Default: record all fully.
+     *  When `statusOnlyForUnmatched` is true (default), unmatched URLs still emit a slim
+     *  status-only observation. Accepts RegExp or string (strings are auto-converted to
+     *  case-insensitive RegExp). */
     includePatterns?: (RegExp | string)[];
     /** Maximum request body bytes to capture. Default: 4096 */
     maxBodyBytes?: number;
-    /** Maximum response body bytes to capture in preview. Default: 2048 */
+    /**
+     * Default response preview byte cap. Default: 4096.
+     *
+     * Per-class overrides in `previewLimits` may extend this for specific
+     * URL patterns. If `previewLimits` is set, `previewLimits.default` wins
+     * over `maxPreviewBytes`.
+     */
     maxPreviewBytes?: number;
+    /**
+     * Per-class response preview byte caps (W0133). Lets the recorder
+     * capture larger previews for response classes whose contents are the
+     * ground truth for trace audits, without inflating preview size for
+     * generic responses.
+     *
+     * - `default` — used when no other class matches. Falls back to
+     *   `maxPreviewBytes` when omitted (defaults to 4 KB).
+     * - `search` — Jina-wrapped DuckDuckGo, Google CSE, bing/duckduckgo,
+     *   google.com/search responses. Default: 16 KB.
+     * - `llmsTxt` — `/llms.txt` responses. Default: 128 KB.
+     */
+    previewLimits?: {
+        default?: number;
+        llmsTxt?: number;
+        search?: number;
+    };
+    /**
+     * When a URL fails `includePatterns` but passes `excludePatterns`, emit a
+     * slim observation (url/method/statusCode/latencyMs/timestamp/seq, with
+     * `capture: "status-only"`) instead of dropping it entirely. Default: true.
+     *
+     * Setting to `false` restores strict-allowlist behavior — unmatched URLs
+     * are dropped, leaving no record of the call. The default exists so
+     * model-side traffic to api.openai.com / api.anthropic.com /
+     * googleapis.com is visible in run artifacts without recording prompts,
+     * completions, or API keys. See W0132.
+     */
+    statusOnlyForUnmatched?: boolean;
 }
 export declare class RequestRecorder {
     private observations;
@@ -69,8 +125,37 @@ export declare class RequestRecorder {
      *
      * Use this when you can't wrap `fetch` directly but can observe traffic
      * (e.g., via browser DevTools Protocol, mitmproxy logs, etc.).
+     *
+     * Filter behavior (W0132):
+     * - `excludePatterns` always drops the observation entirely.
+     * - `includePatterns` mismatch produces a slim `capture: "status-only"`
+     *   record when `statusOnlyForUnmatched` is true (default), or drops it
+     *   when false.
+     * - The discriminator on the input is honored: callers that already
+     *   know they're emitting a slim record (e.g., the fetch wrapper) can
+     *   set `capture: "status-only"` themselves.
      */
     record(observation: Omit<ObservedRequest, "seq">): void;
+    /**
+     * Resolve the preview byte cap for a given URL using per-class overrides
+     * (W0133). Order of preference:
+     *   1. `previewLimits.llmsTxt` for `/llms.txt` URLs.
+     *   2. `previewLimits.search` for known search providers.
+     *   3. `previewLimits.default`.
+     */
+    private resolvePreviewBytes;
+    /**
+     * Decide how to record a URL given the current filter configuration.
+     *
+     * - `"drop"` — `excludePatterns` matched, or `includePatterns` failed
+     *   and `statusOnlyForUnmatched` is false.
+     * - `"status-only"` — `includePatterns` failed but
+     *   `statusOnlyForUnmatched` is true (default). Skip body/headers.
+     * - `"full"` — record everything.
+     *
+     * See W0132.
+     */
+    private classifyCaptureMode;
     /**
      * Reset the recorder for reuse without creating a new instance.
      */