@sanity/ailf 3.8.1 → 4.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (48) hide show
  1. package/config/canary-tasks.ts +64 -0
  2. package/config/models.ts +32 -4
  3. package/config/test-budgets.ts +24 -0
  4. package/dist/_vendor/ailf-core/config-helpers.d.ts +26 -1
  5. package/dist/_vendor/ailf-core/config-helpers.js +81 -1
  6. package/dist/_vendor/ailf-core/index.d.ts +1 -1
  7. package/dist/_vendor/ailf-core/index.js +1 -1
  8. package/dist/_vendor/ailf-core/schemas/canary-tasks.d.ts +52 -0
  9. package/dist/_vendor/ailf-core/schemas/canary-tasks.js +46 -0
  10. package/dist/_vendor/ailf-core/schemas/index.d.ts +2 -0
  11. package/dist/_vendor/ailf-core/schemas/index.js +2 -0
  12. package/dist/_vendor/ailf-core/schemas/test-budgets.d.ts +19 -0
  13. package/dist/_vendor/ailf-core/schemas/test-budgets.js +34 -0
  14. package/dist/_vendor/ailf-shared/canary-drift.d.ts +84 -0
  15. package/dist/_vendor/ailf-shared/canary-drift.js +86 -0
  16. package/dist/_vendor/ailf-shared/index.d.ts +16 -9
  17. package/dist/_vendor/ailf-shared/index.js +13 -9
  18. package/dist/adapters/task-sources/repo-schemas.d.ts +3 -3
  19. package/dist/agent-observer/agentic-provider.js +28 -23
  20. package/dist/agent-observer/classifier.js +7 -2
  21. package/dist/agent-observer/proxy.d.ts +88 -3
  22. package/dist/agent-observer/proxy.js +174 -16
  23. package/dist/agent-observer/types.d.ts +23 -5
  24. package/dist/cli-program.js +1 -1
  25. package/dist/commands/baseline.d.ts +3 -1
  26. package/dist/commands/baseline.js +29 -9
  27. package/dist/commands/cache.d.ts +5 -1
  28. package/dist/commands/cache.js +31 -15
  29. package/dist/commands/compare.js +11 -4
  30. package/dist/commands/explain-handler.js +2 -2
  31. package/dist/config/canary-tasks.ts +64 -0
  32. package/dist/config/models.ts +32 -4
  33. package/dist/config/test-budgets.ts +24 -0
  34. package/dist/pipeline/baseline.d.ts +14 -3
  35. package/dist/pipeline/baseline.js +7 -13
  36. package/dist/pipeline/calculate-scores.d.ts +17 -2
  37. package/dist/pipeline/calculate-scores.js +139 -1
  38. package/dist/pipeline/compiler/mode-handlers/knowledge-probe/assertions.d.ts +5 -0
  39. package/dist/pipeline/compiler/mode-handlers/knowledge-probe/assertions.js +25 -2
  40. package/dist/pipeline/compiler/mode-handlers/knowledge-probe/index.js +5 -1
  41. package/dist/pipeline/compiler/mode-handlers/knowledge-probe/types.d.ts +4 -0
  42. package/dist/pipeline/compiler/promptfoo-compiler.js +23 -0
  43. package/dist/pipeline/compiler/provider-assembler.d.ts +23 -0
  44. package/dist/pipeline/compiler/provider-assembler.js +37 -2
  45. package/dist/pipeline/mirror-repo-tasks.d.ts +1 -1
  46. package/dist/tasks/knowledge-probe/groq-projections.task.ts +29 -11
  47. package/package.json +2 -1
  48. package/tasks/knowledge-probe/groq-projections.task.ts +29 -11
@@ -0,0 +1,86 @@
1
+ /**
2
+ * canary/drift.ts — Pure drift-statistic computation for the Tier 3
3
+ * framework-tests-framework loop.
4
+ *
5
+ * Consumes the projection shape returned by Studio's `latestReportsQuery`
6
+ * (we accept a slim subset so the function stays a pure-domain dependency
7
+ * with no Studio-package import). Computes per-area Δscore between the
8
+ * most-recent canary run and the trailing-N median, plus an overall
9
+ * Δscore for the run as a whole. Output classifies each delta as `ok`,
10
+ * `warn`, or `regression` against caller-provided thresholds.
11
+ *
12
+ * The function is total — it never throws. Edge cases (empty trailing
13
+ * window, missing scores) surface as `verdict: "no-baseline"` so the
14
+ * caller can decide whether to treat the missing baseline as a fail.
15
+ *
16
+ * @see docs/design-docs/testing-strategy.md — "Tier 3 — Live LLMs"
17
+ * @see packages/studio/src/queries.ts — `latestReportsQuery`
18
+ */
19
/**
 * Compute per-area + overall drift for a sequence of canary runs.
 *
 * `reports` must be ordered **newest-first** (matching `latestReportsQuery`'s
 * `order(completedAt desc)`). The most-recent run is `reports[0]`; the
 * trailing window is `reports.slice(1, 1 + trailingN)`.
 *
 * @param reports - Canary reports, newest first. Each entry is read for
 *   `reportId`, `completedAt`, `overall`, and `scores[].feature` /
 *   `scores[].totalScore`.
 * @param thresholds - Required object. Reads `trailingN`, `warnDelta`,
 *   `failDelta`, and the optional `minBaselineRuns` (defaults to 1).
 * @returns `null` when there is no current report (`reports` is missing,
 *   empty, or its first entry is nullish); otherwise the drift report for
 *   `reports[0]`.
 *
 * Missing or malformed report data (absent `scores` arrays, non-numeric
 * scores, nullish trailing entries) never throws: it surfaces as a
 * `no-baseline` verdict or is left out of the baseline.
 */
export function computeCanaryDrift(reports, thresholds) {
    if (!Array.isArray(reports) || reports.length === 0)
        return null;
    const current = reports[0];
    if (current == null)
        return null;
    const minBaseline = thresholds.minBaselineRuns ?? 1;
    // Nullish trailing entries carry no scores; dropping them here keeps the
    // property reads below safe.
    const trailing = reports
        .slice(1, 1 + thresholds.trailingN)
        .filter((report) => report != null);
    const overall = scoreDrift("overall", current.overall, trailing.map((report) => report.overall), thresholds, minBaseline);
    // A report without a `scores` array contributes no per-area entries
    // instead of throwing.
    const currentScores = Array.isArray(current.scores)
        ? current.scores.filter((score) => score != null)
        : [];
    const byArea = currentScores.map((score) => {
        const trailingArea = [];
        for (const report of trailing) {
            const match = Array.isArray(report.scores)
                ? report.scores.find((s) => s?.feature === score.feature)
                : undefined;
            if (match)
                trailingArea.push(match.totalScore);
        }
        return scoreDrift(score.feature, score.totalScore, trailingArea, thresholds, minBaseline);
    });
    const hasRegression = overall.verdict === "regression" ||
        byArea.some((entry) => entry.verdict === "regression");
    const hasMovement = hasRegression ||
        overall.verdict === "warn" ||
        byArea.some((entry) => entry.verdict === "warn");
    return {
        reportId: current.reportId,
        completedAt: current.completedAt,
        overall,
        byArea,
        hasRegression,
        hasMovement,
    };
}
/**
 * Classify one score against the median of its trailing window.
 *
 * Returns a `no-baseline` entry (null median and delta) when the current
 * score is not a finite number, or when fewer than `minBaseline` — and never
 * fewer than one — finite trailing scores are available. Otherwise `delta`
 * is `current - trailingMedian`, and a drop of at least `failDelta` /
 * `warnDelta` yields `regression` / `warn`.
 */
function scoreDrift(feature, current, trailing, thresholds, minBaseline) {
    // Only finite numbers can take part in a median; null/undefined/NaN
    // scores would otherwise be coerced by the sort and the arithmetic.
    const baseline = trailing.filter((value) => Number.isFinite(value));
    if (!Number.isFinite(current) ||
        baseline.length === 0 ||
        baseline.length < minBaseline) {
        return {
            feature,
            current,
            trailingMedian: null,
            delta: null,
            verdict: "no-baseline",
        };
    }
    const trailingMedian = median(baseline);
    const delta = current - trailingMedian;
    // A positive `drop` means the current score fell below the baseline.
    const drop = -delta;
    let verdict = "ok";
    if (drop >= thresholds.failDelta)
        verdict = "regression";
    else if (drop >= thresholds.warnDelta)
        verdict = "warn";
    return { feature, current, trailingMedian, delta, verdict };
}
/**
 * Median of a list of numbers; the input array is not mutated. Returns NaN
 * for an empty list.
 */
function median(values) {
    if (values.length === 0)
        return Number.NaN;
    const sorted = [...values].sort((a, b) => a - b);
    const mid = Math.floor(sorted.length / 2);
    if (sorted.length % 2 === 0) {
        return (sorted[mid - 1] + sorted[mid]) / 2;
    }
    return sorted[mid];
}
@@ -8,13 +8,20 @@
8
8
  * Design rule: this package has ZERO runtime dependencies and ZERO imports
9
9
  * from @sanity/ailf-core, @sanity/ailf, or
10
10
  * @sanity/ailf-studio. It is the leaf of the dependency graph.
11
+ *
12
+ * Re-exports are explicit (named) rather than `export *` so that the studio
13
+ * tsup DTS bundle can statically resolve each symbol's canonical owner —
14
+ * `export *` chains across many modules trip rollup-plugin-dts's "Ambiguous
15
+ * external namespace resolution" warning even when no symbol actually
16
+ * collides. See W0124.
11
17
  */
12
- export * from "./document-ref.js";
13
- export * from "./feature-flags.js";
14
- export * from "./score-grades.js";
15
- export * from "./noise-threshold.js";
16
- export * from "./eval-modes.js";
17
- export * from "./owner-teams.js";
18
- export * from "./run-classification.js";
19
- export * from "./run-trigger.js";
20
- export * from "./run-context.js";
18
+ export { computeCanaryDrift, type CanaryDriftReport, type CanaryReportSlim, type DriftEntry, type DriftThresholds, type DriftVerdict, } from "./canary-drift.js";
19
+ export { type DocumentRef } from "./document-ref.js";
20
+ export { FEATURE_FLAGS, type FeatureFlag, type FeatureFlagKey, } from "./feature-flags.js";
21
+ export { GRADE_BOUNDARIES, scoreGrade, type ScoreGrade, } from "./score-grades.js";
22
+ export { NOISE_THRESHOLD } from "./noise-threshold.js";
23
+ export { CANONICAL_EVAL_MODES, LEGACY_EVAL_MODE_ALIASES, LITERACY_VARIANTS, RAW_EVAL_MODES, type EvalMode, type LiteracyVariant, type RawEvalMode, } from "./eval-modes.js";
24
+ export { isKnownOwnerTeam, KNOWN_OWNER_TEAMS, normalizeOwnerTeam, } from "./owner-teams.js";
25
+ export { isRunClassification, RUN_CLASSIFICATIONS, RUN_EXECUTOR_SURFACES, type RunClassification, type RunExecutor, type RunExecutorSurface, type RunExecutorSystem, type RunExecutorUser, type RunHost, type RunLineage, type RunOwner, type RunTool, } from "./run-classification.js";
26
+ export { type RunTrigger } from "./run-trigger.js";
27
+ export { type RunContext } from "./run-context.js";
@@ -8,13 +8,17 @@
8
8
  * Design rule: this package has ZERO runtime dependencies and ZERO imports
9
9
  * from @sanity/ailf-core, @sanity/ailf, or
10
10
  * @sanity/ailf-studio. It is the leaf of the dependency graph.
11
+ *
12
+ * Re-exports are explicit (named) rather than `export *` so that the studio
13
+ * tsup DTS bundle can statically resolve each symbol's canonical owner —
14
+ * `export *` chains across many modules trip rollup-plugin-dts's "Ambiguous
15
+ * external namespace resolution" warning even when no symbol actually
16
+ * collides. See W0124.
11
17
  */
12
- export * from "./document-ref.js";
13
- export * from "./feature-flags.js";
14
- export * from "./score-grades.js";
15
- export * from "./noise-threshold.js";
16
- export * from "./eval-modes.js";
17
- export * from "./owner-teams.js";
18
- export * from "./run-classification.js";
19
- export * from "./run-trigger.js";
20
- export * from "./run-context.js";
18
+ export { computeCanaryDrift, } from "./canary-drift.js";
19
+ export { FEATURE_FLAGS, } from "./feature-flags.js";
20
+ export { GRADE_BOUNDARIES, scoreGrade, } from "./score-grades.js";
21
+ export { NOISE_THRESHOLD } from "./noise-threshold.js";
22
+ export { CANONICAL_EVAL_MODES, LEGACY_EVAL_MODE_ALIASES, LITERACY_VARIANTS, RAW_EVAL_MODES, } from "./eval-modes.js";
23
+ export { isKnownOwnerTeam, KNOWN_OWNER_TEAMS, normalizeOwnerTeam, } from "./owner-teams.js";
24
+ export { isRunClassification, RUN_CLASSIFICATIONS, RUN_EXECUTOR_SURFACES, } from "./run-classification.js";
@@ -147,8 +147,8 @@ export declare const CanonicalTaskSchema: z.ZodDiscriminatedUnion<[z.ZodObject<{
147
147
  baseline: z.ZodOptional<z.ZodObject<{
148
148
  enabled: z.ZodOptional<z.ZodBoolean>;
149
149
  rubric: z.ZodOptional<z.ZodEnum<{
150
- abbreviated: "abbreviated";
151
150
  full: "full";
151
+ abbreviated: "abbreviated";
152
152
  none: "none";
153
153
  }>>;
154
154
  }, z.core.$strip>>;
@@ -773,8 +773,8 @@ export declare const ContentLakeAuthorableTaskSchema: z.ZodObject<{
773
773
  baseline: z.ZodOptional<z.ZodObject<{
774
774
  enabled: z.ZodOptional<z.ZodBoolean>;
775
775
  rubric: z.ZodOptional<z.ZodEnum<{
776
- abbreviated: "abbreviated";
777
776
  full: "full";
777
+ abbreviated: "abbreviated";
778
778
  none: "none";
779
779
  }>>;
780
780
  }, z.core.$strip>>;
@@ -893,8 +893,8 @@ export declare const CanonicalTaskFileSchema: z.ZodArray<z.ZodDiscriminatedUnion
893
893
  baseline: z.ZodOptional<z.ZodObject<{
894
894
  enabled: z.ZodOptional<z.ZodBoolean>;
895
895
  rubric: z.ZodOptional<z.ZodEnum<{
896
- abbreviated: "abbreviated";
897
896
  full: "full";
897
+ abbreviated: "abbreviated";
898
898
  none: "none";
899
899
  }>>;
900
900
  }, z.core.$strip>>;
@@ -479,20 +479,18 @@ export default class AgenticProvider {
479
479
  // Jina search unavailable
480
480
  }
481
481
  }
482
- // Final fallback: construct likely Sanity doc URLs from the query
482
+ // Final fallback: search returned nothing usable. Point the agent at
483
+ // llms.txt (a real, fetchable doc index) instead of fabricating a URL
484
+ // from the query slug — fabricated URLs 404 and mislead the agent into
485
+ // thinking the doc system is unreachable. See W0129.
483
486
  if (results.length === 0) {
484
- const sanitized = query
485
- .toLowerCase()
486
- .replace(/sanity\.?(io)?/gi, "")
487
- .trim();
488
- const slugGuess = sanitized
489
- .replace(/\s+/g, "-")
490
- .replace(/[^a-z0-9-]/g, "");
491
487
  results = [
492
488
  {
493
- snippet: `Try the documentation page for: ${sanitized}`,
494
- title: `Documentation: ${query}`,
495
- url: `${this.docBaseUrl}/${slugGuess}`,
489
+ snippet: `No direct search results. The documentation index is available at ` +
490
+ `${this.llmsTxtUrl} — fetch it to discover real doc URLs, ` +
491
+ `then fetch_page specific topics.`,
492
+ title: `No results — try fetching ${this.llmsTxtUrl} for the doc index`,
493
+ url: this.llmsTxtUrl,
496
494
  },
497
495
  ];
498
496
  }
@@ -806,12 +804,14 @@ export default class AgenticProvider {
806
804
  const maxToolRounds = this.config.maxToolRounds || 5;
807
805
  const apiKey = this.config.apiKey || process.env.OPENAI_API_KEY;
808
806
  // Newer OpenAI models (gpt-5.x, o-series) use max_completion_tokens
809
- // instead of max_tokens. Detect from config or model name.
810
- const useMaxCompletionTokens = this.config.max_output_tokens != null ||
811
- this.config.max_completion_tokens != null ||
812
- model.startsWith("gpt-5") ||
807
+ // instead of max_tokens, and reject custom temperature values. Detect
808
+ // from config or model name. See W0131.
809
+ const isReasoningModel = model.startsWith("gpt-5") ||
813
810
  model.startsWith("o3") ||
814
811
  model.startsWith("o4");
812
+ const useMaxCompletionTokens = this.config.max_output_tokens != null ||
813
+ this.config.max_completion_tokens != null ||
814
+ isReasoningModel;
815
815
  const maxTokensValue = this.config.max_output_tokens ??
816
816
  this.config.max_completion_tokens ??
817
817
  this.config.max_tokens ??
@@ -840,15 +840,20 @@ export default class AgenticProvider {
840
840
  const startTime = Date.now();
841
841
  for (let round = 0; round <= maxToolRounds; round++) {
842
842
  const isLastRound = round === maxToolRounds;
843
+ const requestBody = {
844
+ ...tokenLimitParam,
845
+ messages,
846
+ model,
847
+ tool_choice: isLastRound ? "none" : "auto",
848
+ tools,
849
+ };
850
+ // gpt-5.x and o-series reject custom temperature; chat-completions
851
+ // models continue to receive the configured value. See W0131.
852
+ if (!isReasoningModel) {
853
+ requestBody.temperature = temperature;
854
+ }
843
855
  const response = await fetchFn("https://api.openai.com/v1/chat/completions", {
844
- body: JSON.stringify({
845
- ...tokenLimitParam,
846
- messages,
847
- model,
848
- temperature,
849
- tool_choice: isLastRound ? "none" : "auto",
850
- tools,
851
- }),
856
+ body: JSON.stringify(requestBody),
852
857
  headers: {
853
858
  Authorization: `Bearer ${apiKey}`,
854
859
  "Content-Type": "application/json",
@@ -65,6 +65,11 @@ export function classifyRequests(requests) {
65
65
  // Skip failed requests (no response)
66
66
  if (req.statusCode === 0)
67
67
  continue;
68
+ // Status-only entries (W0132) carry no body, so we can't infer search
69
+ // queries or doc-page metadata reliably. They still count as API calls
70
+ // (Sanity API) or external requests (everything else) so the run shape
71
+ // shows that the call happened, but we skip the body-dependent buckets.
72
+ const isStatusOnly = req.capture === "status-only";
68
73
  // Order matters: API calls first (they may have ?query= params that look like searches),
69
74
  // then searches, then doc pages, then external
70
75
  if (isSanityApiRequest(req)) {
@@ -75,14 +80,14 @@ export function classifyRequests(requests) {
75
80
  url: req.url,
76
81
  });
77
82
  }
78
- else if (isSearchRequest(req)) {
83
+ else if (!isStatusOnly && isSearchRequest(req)) {
79
84
  result.searchQueries.push({
80
85
  query: extractSearchQuery(req),
81
86
  timestamp: req.timestamp,
82
87
  url: req.url,
83
88
  });
84
89
  }
85
- else if (isDocPageRequest(req)) {
90
+ else if (!isStatusOnly && isDocPageRequest(req)) {
86
91
  const slug = extractDocSlug(req.url);
87
92
  result.docPageVisits.push({
88
93
  contentSize: req.responseSize,
@@ -21,6 +21,25 @@
21
21
  *
22
22
  * const log = recorder.stop()
23
23
  * // → AgentBehaviorLog with all requests classified
24
+ *
25
+ * W0133 — per-class preview byte caps
26
+ *
27
+ * `responsePreview` is capped at `previewLimits.default` (4 KB) for most
28
+ * responses, with per-class overrides for two payloads whose contents are
29
+ * the ground truth for trace audits:
30
+ *
31
+ * - `previewLimits.search` (16 KB) — Jina-wrapped DuckDuckGo, Google CSE,
32
+ * bing.com/search, duckduckgo.com, google.com/search responses. Captures
33
+ * the full result list (typical 8–10 KB) so trace audits can resolve
34
+ * which result the model fetched next.
35
+ * - `previewLimits.llmsTxt` (128 KB) — `/llms.txt` responses. The Sanity
36
+ * index is ~110 KB. Capturing the full body lets trace audits
37
+ * distinguish "model fetched a path that wasn't in the index" from
38
+ * "model fetched a path that was in the index but the page is missing".
39
+ *
40
+ * The slim Content Lake report (W0051) does not inline previews — they
41
+ * live in the GCS `traces` NDJSON artifact only, so bumping these caps
42
+ * has no effect on the 10 MB Sanity document budget.
24
43
  */
25
44
  import type { ObservedRequest, AgentBehaviorLog } from "./types.js";
26
45
  export interface RecorderOptions {
@@ -31,13 +50,50 @@ export interface RecorderOptions {
31
50
  /** Filter: skip requests matching these URL patterns. Default: skip none.
32
51
  * Accepts RegExp or string (strings are auto-converted to case-insensitive RegExp). */
33
52
  excludePatterns?: (RegExp | string)[];
34
- /** Filter: only record requests matching these URL patterns. Default: record all.
35
- * Accepts RegExp or string (strings are auto-converted to case-insensitive RegExp). */
53
+ /** Filter: only fully record requests matching these URL patterns. Default: record all fully.
54
+ * When `statusOnlyForUnmatched` is true (default), unmatched URLs still emit a slim
55
+ * status-only observation. Accepts RegExp or string (strings are auto-converted to
56
+ * case-insensitive RegExp). */
36
57
  includePatterns?: (RegExp | string)[];
37
58
  /** Maximum request body bytes to capture. Default: 4096 */
38
59
  maxBodyBytes?: number;
39
- /** Maximum response body bytes to capture in preview. Default: 2048 */
60
+ /**
61
+ * Default response preview byte cap. Default: 4096.
62
+ *
63
+ * Per-class overrides in `previewLimits` may extend this for specific
64
+ * URL patterns. If `previewLimits` is set, `previewLimits.default` wins
65
+ * over `maxPreviewBytes`.
66
+ */
40
67
  maxPreviewBytes?: number;
68
+ /**
69
+ * Per-class response preview byte caps (W0133). Lets the recorder
70
+ * capture larger previews for response classes whose contents are the
71
+ * ground truth for trace audits, without inflating preview size for
72
+ * generic responses.
73
+ *
74
+ * - `default` — used when no other class matches. Falls back to
75
+ * `maxPreviewBytes` when omitted (defaults to 4 KB).
76
+ * - `search` — Jina-wrapped DuckDuckGo, Google CSE, bing/duckduckgo,
77
+ * google.com/search responses. Default: 16 KB.
78
+ * - `llmsTxt` — `/llms.txt` responses. Default: 128 KB.
79
+ */
80
+ previewLimits?: {
81
+ default?: number;
82
+ llmsTxt?: number;
83
+ search?: number;
84
+ };
85
+ /**
86
+ * When `includePatterns` is non-empty and a URL matches none of them (and no `excludePatterns` entry), emit a
87
+ * slim observation (url/method/statusCode/latencyMs/timestamp/seq, with
88
+ * `capture: "status-only"`) instead of dropping it entirely. Default: true.
89
+ *
90
+ * Setting to `false` restores strict-allowlist behavior — unmatched URLs
91
+ * are dropped, leaving no record of the call. The default exists so
92
+ * model-side traffic to api.openai.com / api.anthropic.com /
93
+ * googleapis.com is visible in run artifacts without recording prompts,
94
+ * completions, or API keys. See W0132.
95
+ */
96
+ statusOnlyForUnmatched?: boolean;
41
97
  }
42
98
  export declare class RequestRecorder {
43
99
  private observations;
@@ -69,8 +125,37 @@ export declare class RequestRecorder {
69
125
  *
70
126
  * Use this when you can't wrap `fetch` directly but can observe traffic
71
127
  * (e.g., via browser DevTools Protocol, mitmproxy logs, etc.).
128
+ *
129
+ * Filter behavior (W0132):
130
+ * - `excludePatterns` always drops the observation entirely.
131
+ * - `includePatterns` mismatch produces a slim `capture: "status-only"`
132
+ * record when `statusOnlyForUnmatched` is true (default), or drops it
133
+ * when false.
134
+ * - The discriminator on the input is honored: callers that already
135
+ * know they're emitting a slim record (e.g., the fetch wrapper) can
136
+ * set `capture: "status-only"` themselves.
72
137
  */
73
138
  record(observation: Omit<ObservedRequest, "seq">): void;
139
+ /**
140
+ * Resolve the preview byte cap for a given URL using per-class overrides
141
+ * (W0133). Order of preference:
142
+ * 1. `previewLimits.llmsTxt` for `/llms.txt` URLs.
143
+ * 2. `previewLimits.search` for known search providers.
144
+ * 3. `previewLimits.default`.
145
+ */
146
+ private resolvePreviewBytes;
147
+ /**
148
+ * Decide how to record a URL given the current filter configuration.
149
+ *
150
+ * - `"drop"` — `excludePatterns` matched, or `includePatterns` failed
151
+ * and `statusOnlyForUnmatched` is false.
152
+ * - `"status-only"` — `includePatterns` failed but
153
+ * `statusOnlyForUnmatched` is true (default). Skip body/headers.
154
+ * - `"full"` — record everything.
155
+ *
156
+ * See W0132.
157
+ */
158
+ private classifyCaptureMode;
74
159
  /**
75
160
  * Reset the recorder for reuse without creating a new instance.
76
161
  */
@@ -21,8 +21,49 @@
21
21
  *
22
22
  * const log = recorder.stop()
23
23
  * // → AgentBehaviorLog with all requests classified
24
+ *
25
+ * W0133 — per-class preview byte caps
26
+ *
27
+ * `responsePreview` is capped at `previewLimits.default` (4 KB) for most
28
+ * responses, with per-class overrides for two payloads whose contents are
29
+ * the ground truth for trace audits:
30
+ *
31
+ * - `previewLimits.search` (16 KB) — Jina-wrapped DuckDuckGo, Google CSE,
32
+ * bing.com/search, duckduckgo.com, google.com/search responses. Captures
33
+ * the full result list (typical 8–10 KB) so trace audits can resolve
34
+ * which result the model fetched next.
35
+ * - `previewLimits.llmsTxt` (128 KB) — `/llms.txt` responses. The Sanity
36
+ * index is ~110 KB. Capturing the full body lets trace audits
37
+ * distinguish "model fetched a path that wasn't in the index" from
38
+ * "model fetched a path that was in the index but the page is missing".
39
+ *
40
+ * The slim Content Lake report (W0051) does not inline previews — they
41
+ * live in the GCS `traces` NDJSON artifact only, so bumping these caps
42
+ * has no effect on the 10 MB Sanity document budget.
24
43
  */
25
44
  import { classifyRequests } from "./classifier.js";
45
/** Per-class preview-byte defaults (W0133). */
const DEFAULT_PREVIEW_LIMITS = {
    default: 4 * 1024,
    llmsTxt: 128 * 1024, // 128 KB — covers Sanity's ~110 KB llms.txt
    search: 16 * 1024, // 16 KB — Jina/Google CSE/duckduckgo result lists
};
/**
 * URL patterns for the `search` response class (W0133). These cover the
 * search providers the agentic loop actually hits; new providers can be
 * added here without changing the recorder API surface.
 *
 * The bare-host DuckDuckGo patterns end with a host-boundary lookahead so a
 * look-alike host such as `duckduckgo.com.example` is not classed as search.
 * The Custom Search pattern accepts both the `www.googleapis.com` and
 * `customsearch.googleapis.com` endpoint hosts.
 */
const SEARCH_URL_PATTERNS = [
    /r\.jina\.ai\/https?:\/\/(www\.)?duckduckgo\.com(?=[\/?#:]|$)/i,
    /r\.jina\.ai\/https?:\/\/(www\.)?google\.com\/search/i,
    /r\.jina\.ai\/https?:\/\/(www\.)?bing\.com\/search/i,
    /^https?:\/\/(www\.|customsearch\.)?googleapis\.com\/customsearch/i,
    /^https?:\/\/(www\.)?google\.com\/search/i,
    /^https?:\/\/(www\.)?bing\.com\/search/i,
    /^https?:\/\/(www\.)?duckduckgo\.com(?=[\/?#:]|$)/i,
];
/**
 * URL pattern for the `llmsTxt` response class (W0133). Matches `/llms.txt`
 * when it ends the URL or is followed by a query string, a fragment, or a
 * further path segment.
 */
const LLMS_TXT_PATTERN = /\/llms\.txt(?:[?#\/]|$)/i;
26
67
  const DEFAULT_OPTIONS = {
27
68
  captureHeaders: [
28
69
  "accept",
@@ -40,7 +81,9 @@ const DEFAULT_OPTIONS = {
40
81
  ],
41
82
  includePatterns: [],
42
83
  maxBodyBytes: 4096,
43
- maxPreviewBytes: 2048,
84
+ maxPreviewBytes: DEFAULT_PREVIEW_LIMITS.default,
85
+ previewLimits: { ...DEFAULT_PREVIEW_LIMITS },
86
+ statusOnlyForUnmatched: true,
44
87
  };
45
88
  // ---------------------------------------------------------------------------
46
89
  // RequestRecorder
@@ -63,6 +106,19 @@ export class RequestRecorder {
63
106
  if (merged.excludePatterns) {
64
107
  merged.excludePatterns = merged.excludePatterns.map(toRegExp);
65
108
  }
109
+ // Resolve per-class preview caps. `previewLimits.default` wins over
110
+ // `maxPreviewBytes`; missing entries fall through to module defaults
111
+ // (W0133).
112
+ const userLimits = options?.previewLimits ?? {};
113
+ const resolvedDefault = userLimits.default ??
114
+ options?.maxPreviewBytes ??
115
+ DEFAULT_PREVIEW_LIMITS.default;
116
+ merged.previewLimits = {
117
+ default: resolvedDefault,
118
+ llmsTxt: userLimits.llmsTxt ?? DEFAULT_PREVIEW_LIMITS.llmsTxt,
119
+ search: userLimits.search ?? DEFAULT_PREVIEW_LIMITS.search,
120
+ };
121
+ merged.maxPreviewBytes = resolvedDefault;
66
122
  this.options = merged;
67
123
  }
68
124
  /**
@@ -83,6 +139,7 @@ export class RequestRecorder {
83
139
  ? input.method
84
140
  : "GET") ??
85
141
  "GET";
142
+ const captureMode = this.classifyCaptureMode(url);
86
143
  let response;
87
144
  let error = null;
88
145
  try {
@@ -90,31 +147,64 @@ export class RequestRecorder {
90
147
  }
91
148
  catch (err) {
92
149
  error = err;
93
- // Record the failed request
150
+ if (captureMode === "drop")
151
+ throw error;
152
+ // Record the failed request — status-only captures skip body/headers
153
+ // entirely (W0132).
154
+ this.record(captureMode === "full"
155
+ ? {
156
+ body: await this.extractBody(init?.body),
157
+ capture: "full",
158
+ contentType: undefined,
159
+ headers: this.extractHeaders(init?.headers),
160
+ latencyMs: Date.now() - reqStart,
161
+ method: method.toUpperCase(),
162
+ responsePreview: `Error: ${error.message}`,
163
+ responseSize: 0,
164
+ statusCode: 0,
165
+ timestamp: new Date(reqStart).toISOString(),
166
+ url,
167
+ }
168
+ : {
169
+ capture: "status-only",
170
+ headers: {},
171
+ latencyMs: Date.now() - reqStart,
172
+ method: method.toUpperCase(),
173
+ responseSize: 0,
174
+ statusCode: 0,
175
+ timestamp: new Date(reqStart).toISOString(),
176
+ url,
177
+ });
178
+ throw error;
179
+ }
180
+ const latencyMs = Date.now() - reqStart;
181
+ if (captureMode === "drop")
182
+ return response;
183
+ if (captureMode === "status-only") {
184
+ // No body read, no header capture, no preview — only the metadata
185
+ // needed to know the call happened (W0132).
94
186
  this.record({
95
- body: await this.extractBody(init?.body),
96
- contentType: undefined,
97
- headers: this.extractHeaders(init?.headers),
98
- latencyMs: Date.now() - reqStart,
187
+ capture: "status-only",
188
+ headers: {},
189
+ latencyMs,
99
190
  method: method.toUpperCase(),
100
- responsePreview: `Error: ${error.message}`,
101
191
  responseSize: 0,
102
- statusCode: 0,
192
+ statusCode: response.status,
103
193
  timestamp: new Date(reqStart).toISOString(),
104
194
  url,
105
195
  });
106
- throw error;
196
+ return response;
107
197
  }
108
- const latencyMs = Date.now() - reqStart;
109
198
  // Clone the response so we can read the body without consuming it
110
199
  const clone = response.clone();
111
200
  let responseSize = 0;
112
201
  let responsePreview;
113
202
  if (this.options.captureResponsePreview) {
203
+ const previewBytes = this.resolvePreviewBytes(url);
114
204
  try {
115
205
  const text = await clone.text();
116
206
  responseSize = new TextEncoder().encode(text).length;
117
- responsePreview = text.slice(0, this.options.maxPreviewBytes);
207
+ responsePreview = text.slice(0, previewBytes);
118
208
  }
119
209
  catch {
120
210
  // Body might not be text — that's fine
@@ -123,6 +213,7 @@ export class RequestRecorder {
123
213
  }
124
214
  this.record({
125
215
  body: await this.extractBody(init?.body),
216
+ capture: "full",
126
217
  contentType: response.headers.get("content-type") ?? undefined,
127
218
  headers: this.extractHeaders(init?.headers),
128
219
  latencyMs,
@@ -152,26 +243,93 @@ export class RequestRecorder {
152
243
  *
153
244
  * Use this when you can't wrap `fetch` directly but can observe traffic
154
245
  * (e.g., via browser DevTools Protocol, mitmproxy logs, etc.).
246
+ *
247
+ * Filter behavior (W0132):
248
+ * - `excludePatterns` always drops the observation entirely.
249
+ * - `includePatterns` mismatch produces a slim `capture: "status-only"`
250
+ * record when `statusOnlyForUnmatched` is true (default), or drops it
251
+ * when false.
252
+ * - The discriminator on the input is honored: callers that already
253
+ * know they're emitting a slim record (e.g., the fetch wrapper) can
254
+ * set `capture: "status-only"` themselves.
155
255
  */
156
256
  record(observation) {
157
257
  if (!this.running)
158
258
  return;
159
259
  const url = observation.url;
160
- // Apply filters
260
+ if (this.options.excludePatterns.some((p) => p.test(url)))
261
+ return;
262
+ let capture = observation.capture ?? "full";
161
263
  if (this.options.includePatterns.length > 0) {
162
- if (!this.options.includePatterns.some((p) => p.test(url)))
163
- return;
264
+ const matchesIncludes = this.options.includePatterns.some((p) => p.test(url));
265
+ if (!matchesIncludes) {
266
+ if (!this.options.statusOnlyForUnmatched)
267
+ return;
268
+ capture = "status-only";
269
+ }
164
270
  }
165
- if (this.options.excludePatterns.some((p) => p.test(url)))
271
+ if (capture === "status-only") {
272
+ // Slim shape — strip body/headers/contentType/responsePreview so a
273
+ // caller that passed full data still produces a sanitized record.
274
+ this.observations.push({
275
+ capture: "status-only",
276
+ headers: {},
277
+ latencyMs: observation.latencyMs,
278
+ method: observation.method,
279
+ responseSize: 0,
280
+ seq: this.seq++,
281
+ statusCode: observation.statusCode,
282
+ timestamp: observation.timestamp,
283
+ url,
284
+ });
166
285
  return;
286
+ }
287
+ const previewBytes = this.resolvePreviewBytes(url);
167
288
  this.observations.push({
168
289
  ...observation,
290
+ capture: "full",
169
291
  // Truncate body if needed
170
292
  body: observation.body?.slice(0, this.options.maxBodyBytes),
171
- responsePreview: observation.responsePreview?.slice(0, this.options.maxPreviewBytes),
293
+ responsePreview: observation.responsePreview?.slice(0, previewBytes),
172
294
  seq: this.seq++,
173
295
  });
174
296
  }
297
+ /**
298
+ * Resolve the preview byte cap for a given URL using per-class overrides
299
+ * (W0133). Order of preference:
300
+ * 1. `previewLimits.llmsTxt` for `/llms.txt` URLs.
301
+ * 2. `previewLimits.search` for known search providers.
302
+ * 3. `previewLimits.default`.
303
+ */
304
resolvePreviewBytes(url) {
    // Per-class caps (W0133): `/llms.txt` is checked first, then the known
    // search providers; every other URL gets the default cap.
    const { previewLimits } = this.options;
    if (LLMS_TXT_PATTERN.test(url)) {
        return previewLimits.llmsTxt;
    }
    const isSearchUrl = SEARCH_URL_PATTERNS.some((pattern) => pattern.test(url));
    return isSearchUrl ? previewLimits.search : previewLimits.default;
}
312
+ /**
313
+ * Decide how to record a URL given the current filter configuration.
314
+ *
315
+ * - `"drop"` — `excludePatterns` matched, or `includePatterns` failed
316
+ * and `statusOnlyForUnmatched` is false.
317
+ * - `"status-only"` — `includePatterns` failed but
318
+ * `statusOnlyForUnmatched` is true (default). Skip body/headers.
319
+ * - `"full"` — record everything.
320
+ *
321
+ * See W0132.
322
+ */
323
classifyCaptureMode(url) {
    const { excludePatterns, includePatterns, statusOnlyForUnmatched } = this.options;
    const matchesAny = (patterns) => patterns.some((pattern) => pattern.test(url));
    // An exclude match always wins and drops the request (W0132).
    if (matchesAny(excludePatterns)) {
        return "drop";
    }
    // No allowlist configured, or the URL is on it: record everything.
    if (includePatterns.length === 0 || matchesAny(includePatterns)) {
        return "full";
    }
    // Off the allowlist: keep a slim record unless the caller opted out.
    return statusOnlyForUnmatched ? "status-only" : "drop";
}
175
333
  /**
176
334
  * Reset the recorder for reuse without creating a new instance.
177
335
  */