@sanity/ailf 3.8.1 → 4.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/config/canary-tasks.ts +64 -0
- package/config/models.ts +32 -4
- package/config/test-budgets.ts +24 -0
- package/dist/_vendor/ailf-core/config-helpers.d.ts +26 -1
- package/dist/_vendor/ailf-core/config-helpers.js +81 -1
- package/dist/_vendor/ailf-core/index.d.ts +1 -1
- package/dist/_vendor/ailf-core/index.js +1 -1
- package/dist/_vendor/ailf-core/schemas/canary-tasks.d.ts +52 -0
- package/dist/_vendor/ailf-core/schemas/canary-tasks.js +46 -0
- package/dist/_vendor/ailf-core/schemas/index.d.ts +2 -0
- package/dist/_vendor/ailf-core/schemas/index.js +2 -0
- package/dist/_vendor/ailf-core/schemas/test-budgets.d.ts +19 -0
- package/dist/_vendor/ailf-core/schemas/test-budgets.js +34 -0
- package/dist/_vendor/ailf-shared/canary-drift.d.ts +84 -0
- package/dist/_vendor/ailf-shared/canary-drift.js +86 -0
- package/dist/_vendor/ailf-shared/index.d.ts +16 -9
- package/dist/_vendor/ailf-shared/index.js +13 -9
- package/dist/adapters/task-sources/repo-schemas.d.ts +3 -3
- package/dist/agent-observer/agentic-provider.js +28 -23
- package/dist/agent-observer/classifier.js +7 -2
- package/dist/agent-observer/proxy.d.ts +88 -3
- package/dist/agent-observer/proxy.js +174 -16
- package/dist/agent-observer/types.d.ts +23 -5
- package/dist/cli-program.js +1 -1
- package/dist/commands/baseline.d.ts +3 -1
- package/dist/commands/baseline.js +29 -9
- package/dist/commands/cache.d.ts +5 -1
- package/dist/commands/cache.js +31 -15
- package/dist/commands/compare.js +11 -4
- package/dist/commands/explain-handler.js +2 -2
- package/dist/config/canary-tasks.ts +64 -0
- package/dist/config/models.ts +32 -4
- package/dist/config/test-budgets.ts +24 -0
- package/dist/pipeline/baseline.d.ts +14 -3
- package/dist/pipeline/baseline.js +7 -13
- package/dist/pipeline/calculate-scores.d.ts +17 -2
- package/dist/pipeline/calculate-scores.js +139 -1
- package/dist/pipeline/compiler/mode-handlers/knowledge-probe/assertions.d.ts +5 -0
- package/dist/pipeline/compiler/mode-handlers/knowledge-probe/assertions.js +25 -2
- package/dist/pipeline/compiler/mode-handlers/knowledge-probe/index.js +5 -1
- package/dist/pipeline/compiler/mode-handlers/knowledge-probe/types.d.ts +4 -0
- package/dist/pipeline/compiler/promptfoo-compiler.js +23 -0
- package/dist/pipeline/compiler/provider-assembler.d.ts +23 -0
- package/dist/pipeline/compiler/provider-assembler.js +37 -2
- package/dist/pipeline/mirror-repo-tasks.d.ts +1 -1
- package/dist/tasks/knowledge-probe/groq-projections.task.ts +29 -11
- package/package.json +2 -1
- package/tasks/knowledge-probe/groq-projections.task.ts +29 -11
|
@@ -0,0 +1,86 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* canary/drift.ts — Pure drift-statistic computation for the Tier 3
|
|
3
|
+
* framework-tests-framework loop.
|
|
4
|
+
*
|
|
5
|
+
* Consumes the projection shape returned by Studio's `latestReportsQuery`
|
|
6
|
+
* (we accept a slim subset so the function stays a pure-domain dependency
|
|
7
|
+
* with no Studio-package import). Computes per-area Δscore between the
|
|
8
|
+
* most-recent canary run and the trailing-N median, plus an overall
|
|
9
|
+
* Δscore for the run as a whole. Output classifies each delta as `ok`,
|
|
10
|
+
* `warn`, or `regression` against caller-provided thresholds.
|
|
11
|
+
*
|
|
12
|
+
* The function is total — it never throws. Edge cases (empty trailing
|
|
13
|
+
* window, missing scores) surface as `verdict: "no-baseline"` so the
|
|
14
|
+
* caller can decide whether to treat the missing baseline as a fail.
|
|
15
|
+
*
|
|
16
|
+
* @see docs/design-docs/testing-strategy.md — "Tier 3 — Live LLMs"
|
|
17
|
+
* @see packages/studio/src/queries.ts — `latestReportsQuery`
|
|
18
|
+
*/
|
|
19
|
+
/**
 * Compute per-area + overall drift for a sequence of canary runs.
 *
 * `reports` must be ordered **newest-first** (matching `latestReportsQuery`'s
 * `order(completedAt desc)`). The most-recent run is `reports[0]`; the
 * trailing window is `reports.slice(1, 1 + trailingN)`.
 *
 * @param reports - Array of report objects. Each is read for `reportId`,
 *   `completedAt`, `overall`, and an optional `scores` array of
 *   `{ feature, totalScore }` entries. A report without `scores` simply
 *   contributes no per-area data instead of raising a TypeError.
 * @param thresholds - Read for `trailingN` (window size) and the optional
 *   `minBaselineRuns` (default 1); the object is also forwarded unchanged to
 *   `scoreDrift`, which applies the warn/fail thresholds.
 * @returns `null` when `reports` is empty; otherwise an object with
 *   `reportId`, `completedAt`, `overall`, `byArea`, `hasRegression` and
 *   `hasMovement`.
 * @throws never for an array `reports` and an object `thresholds` — missing
 *   `scores` arrays are treated as empty rather than dereferenced.
 */
export function computeCanaryDrift(reports, thresholds) {
    if (reports.length === 0)
        return null;
    // A median needs at least one trailing run. Clamp so a configured value
    // of 0, a negative number, or NaN cannot let an empty window through and
    // produce a NaN delta that is then classified as "ok".
    const requestedBaseline = thresholds.minBaselineRuns ?? 1;
    const minBaseline = requestedBaseline >= 1 ? requestedBaseline : 1;
    const [current] = reports;
    const trailing = reports.slice(1, 1 + thresholds.trailingN);
    const overall = scoreDrift("overall", current.overall, trailing.map((r) => r.overall), thresholds, minBaseline);
    const byArea = (current.scores ?? []).map((score) => {
        // Collect this feature's score from every trailing run that has it;
        // runs that lack the feature (or lack `scores` entirely) are skipped.
        const trailingArea = [];
        for (const t of trailing) {
            const match = (t.scores ?? []).find((s) => s.feature === score.feature);
            if (match)
                trailingArea.push(match.totalScore);
        }
        return scoreDrift(score.feature, score.totalScore, trailingArea, thresholds, minBaseline);
    });
    const entries = [overall, ...byArea];
    const hasRegression = entries.some((e) => e.verdict === "regression");
    // "Movement" means anything worth a look: a regression or a warning.
    const hasMovement = hasRegression || entries.some((e) => e.verdict === "warn");
    return {
        reportId: current.reportId,
        completedAt: current.completedAt,
        overall,
        byArea,
        hasRegression,
        hasMovement,
    };
}
|
|
59
|
+
/**
 * Classify one score (overall or a single feature) against its trailing
 * median.
 *
 * @param feature - Label copied onto the result.
 * @param current - Score of the most recent run.
 * @param trailing - Scores from the trailing window, in any order.
 * @param thresholds - Read for `warnDelta` and `failDelta`; a drop of at
 *   least `failDelta` is a `"regression"`, at least `warnDelta` a `"warn"`.
 * @param minBaseline - Minimum number of usable trailing scores required
 *   before a median is computed. Values below 1 are treated as 1.
 * @returns `{ feature, current, trailingMedian, delta, verdict }`. When the
 *   current score is not a finite number, or too few finite trailing scores
 *   exist, `trailingMedian` and `delta` are `null` and `verdict` is
 *   `"no-baseline"`.
 */
function scoreDrift(feature, current, trailing, thresholds, minBaseline) {
    // Missing scores arrive as null/undefined. Left in place they coerce in
    // arithmetic (null -> 0, undefined -> NaN) and corrupt the median, so only
    // finite numbers count towards the baseline.
    const usable = trailing.filter((value) => Number.isFinite(value));
    // A median of zero values is undefined; never accept an empty baseline.
    const required = minBaseline >= 1 ? minBaseline : 1;
    if (!Number.isFinite(current) || usable.length < required) {
        return {
            feature,
            current,
            trailingMedian: null,
            delta: null,
            verdict: "no-baseline",
        };
    }
    const trailingMedian = median(usable);
    const delta = current - trailingMedian;
    // Thresholds are expressed as positive drops, so flip the sign once.
    const drop = -delta;
    let verdict = "ok";
    if (drop >= thresholds.failDelta)
        verdict = "regression";
    else if (drop >= thresholds.warnDelta)
        verdict = "warn";
    return { feature, current, trailingMedian, delta, verdict };
}
|
|
79
|
+
/**
 * Median of the finite numbers in `values`.
 *
 * Non-finite entries (null, undefined, NaN, ±Infinity, non-numbers) are
 * ignored so a missing score cannot skew the result through coercion.
 * The input array is not mutated.
 *
 * @param values - Array of candidate numbers, in any order.
 * @returns The middle value (odd count), the mean of the two middle values
 *   (even count), or `NaN` when no finite number is present.
 */
function median(values) {
    // `filter` yields a fresh array, so the in-place sort leaves `values` intact.
    const sorted = values
        .filter((value) => Number.isFinite(value))
        .sort((a, b) => a - b);
    if (sorted.length === 0)
        return Number.NaN;
    const mid = Math.floor(sorted.length / 2);
    return sorted.length % 2 === 0
        ? (sorted[mid - 1] + sorted[mid]) / 2
        : sorted[mid];
}
|
|
@@ -8,13 +8,20 @@
|
|
|
8
8
|
* Design rule: this package has ZERO runtime dependencies and ZERO imports
|
|
9
9
|
* from @sanity/ailf-core, @sanity/ailf, or
|
|
10
10
|
* @sanity/ailf-studio. It is the leaf of the dependency graph.
|
|
11
|
+
*
|
|
12
|
+
* Re-exports are explicit (named) rather than `export *` so that the studio
|
|
13
|
+
* tsup DTS bundle can statically resolve each symbol's canonical owner —
|
|
14
|
+
* `export *` chains across many modules trip rollup-plugin-dts's "Ambiguous
|
|
15
|
+
* external namespace resolution" warning even when no symbol actually
|
|
16
|
+
* collides. See W0124.
|
|
11
17
|
*/
|
|
12
|
-
export
|
|
13
|
-
export
|
|
14
|
-
export
|
|
15
|
-
export
|
|
16
|
-
export
|
|
17
|
-
export
|
|
18
|
-
export
|
|
19
|
-
export
|
|
20
|
-
export
|
|
18
|
+
export { computeCanaryDrift, type CanaryDriftReport, type CanaryReportSlim, type DriftEntry, type DriftThresholds, type DriftVerdict, } from "./canary-drift.js";
|
|
19
|
+
export { type DocumentRef } from "./document-ref.js";
|
|
20
|
+
export { FEATURE_FLAGS, type FeatureFlag, type FeatureFlagKey, } from "./feature-flags.js";
|
|
21
|
+
export { GRADE_BOUNDARIES, scoreGrade, type ScoreGrade, } from "./score-grades.js";
|
|
22
|
+
export { NOISE_THRESHOLD } from "./noise-threshold.js";
|
|
23
|
+
export { CANONICAL_EVAL_MODES, LEGACY_EVAL_MODE_ALIASES, LITERACY_VARIANTS, RAW_EVAL_MODES, type EvalMode, type LiteracyVariant, type RawEvalMode, } from "./eval-modes.js";
|
|
24
|
+
export { isKnownOwnerTeam, KNOWN_OWNER_TEAMS, normalizeOwnerTeam, } from "./owner-teams.js";
|
|
25
|
+
export { isRunClassification, RUN_CLASSIFICATIONS, RUN_EXECUTOR_SURFACES, type RunClassification, type RunExecutor, type RunExecutorSurface, type RunExecutorSystem, type RunExecutorUser, type RunHost, type RunLineage, type RunOwner, type RunTool, } from "./run-classification.js";
|
|
26
|
+
export { type RunTrigger } from "./run-trigger.js";
|
|
27
|
+
export { type RunContext } from "./run-context.js";
|
|
@@ -8,13 +8,17 @@
|
|
|
8
8
|
* Design rule: this package has ZERO runtime dependencies and ZERO imports
|
|
9
9
|
* from @sanity/ailf-core, @sanity/ailf, or
|
|
10
10
|
* @sanity/ailf-studio. It is the leaf of the dependency graph.
|
|
11
|
+
*
|
|
12
|
+
* Re-exports are explicit (named) rather than `export *` so that the studio
|
|
13
|
+
* tsup DTS bundle can statically resolve each symbol's canonical owner —
|
|
14
|
+
* `export *` chains across many modules trip rollup-plugin-dts's "Ambiguous
|
|
15
|
+
* external namespace resolution" warning even when no symbol actually
|
|
16
|
+
* collides. See W0124.
|
|
11
17
|
*/
|
|
12
|
-
export
|
|
13
|
-
export
|
|
14
|
-
export
|
|
15
|
-
export
|
|
16
|
-
export
|
|
17
|
-
export
|
|
18
|
-
export
|
|
19
|
-
export * from "./run-trigger.js";
|
|
20
|
-
export * from "./run-context.js";
|
|
18
|
+
export { computeCanaryDrift, } from "./canary-drift.js";
|
|
19
|
+
export { FEATURE_FLAGS, } from "./feature-flags.js";
|
|
20
|
+
export { GRADE_BOUNDARIES, scoreGrade, } from "./score-grades.js";
|
|
21
|
+
export { NOISE_THRESHOLD } from "./noise-threshold.js";
|
|
22
|
+
export { CANONICAL_EVAL_MODES, LEGACY_EVAL_MODE_ALIASES, LITERACY_VARIANTS, RAW_EVAL_MODES, } from "./eval-modes.js";
|
|
23
|
+
export { isKnownOwnerTeam, KNOWN_OWNER_TEAMS, normalizeOwnerTeam, } from "./owner-teams.js";
|
|
24
|
+
export { isRunClassification, RUN_CLASSIFICATIONS, RUN_EXECUTOR_SURFACES, } from "./run-classification.js";
|
|
@@ -147,8 +147,8 @@ export declare const CanonicalTaskSchema: z.ZodDiscriminatedUnion<[z.ZodObject<{
|
|
|
147
147
|
baseline: z.ZodOptional<z.ZodObject<{
|
|
148
148
|
enabled: z.ZodOptional<z.ZodBoolean>;
|
|
149
149
|
rubric: z.ZodOptional<z.ZodEnum<{
|
|
150
|
-
abbreviated: "abbreviated";
|
|
151
150
|
full: "full";
|
|
151
|
+
abbreviated: "abbreviated";
|
|
152
152
|
none: "none";
|
|
153
153
|
}>>;
|
|
154
154
|
}, z.core.$strip>>;
|
|
@@ -773,8 +773,8 @@ export declare const ContentLakeAuthorableTaskSchema: z.ZodObject<{
|
|
|
773
773
|
baseline: z.ZodOptional<z.ZodObject<{
|
|
774
774
|
enabled: z.ZodOptional<z.ZodBoolean>;
|
|
775
775
|
rubric: z.ZodOptional<z.ZodEnum<{
|
|
776
|
-
abbreviated: "abbreviated";
|
|
777
776
|
full: "full";
|
|
777
|
+
abbreviated: "abbreviated";
|
|
778
778
|
none: "none";
|
|
779
779
|
}>>;
|
|
780
780
|
}, z.core.$strip>>;
|
|
@@ -893,8 +893,8 @@ export declare const CanonicalTaskFileSchema: z.ZodArray<z.ZodDiscriminatedUnion
|
|
|
893
893
|
baseline: z.ZodOptional<z.ZodObject<{
|
|
894
894
|
enabled: z.ZodOptional<z.ZodBoolean>;
|
|
895
895
|
rubric: z.ZodOptional<z.ZodEnum<{
|
|
896
|
-
abbreviated: "abbreviated";
|
|
897
896
|
full: "full";
|
|
897
|
+
abbreviated: "abbreviated";
|
|
898
898
|
none: "none";
|
|
899
899
|
}>>;
|
|
900
900
|
}, z.core.$strip>>;
|
|
@@ -479,20 +479,18 @@ export default class AgenticProvider {
|
|
|
479
479
|
// Jina search unavailable
|
|
480
480
|
}
|
|
481
481
|
}
|
|
482
|
-
// Final fallback:
|
|
482
|
+
// Final fallback: search returned nothing usable. Point the agent at
|
|
483
|
+
// llms.txt (a real, fetchable doc index) instead of fabricating a URL
|
|
484
|
+
// from the query slug — fabricated URLs 404 and mislead the agent into
|
|
485
|
+
// thinking the doc system is unreachable. See W0129.
|
|
483
486
|
if (results.length === 0) {
|
|
484
|
-
const sanitized = query
|
|
485
|
-
.toLowerCase()
|
|
486
|
-
.replace(/sanity\.?(io)?/gi, "")
|
|
487
|
-
.trim();
|
|
488
|
-
const slugGuess = sanitized
|
|
489
|
-
.replace(/\s+/g, "-")
|
|
490
|
-
.replace(/[^a-z0-9-]/g, "");
|
|
491
487
|
results = [
|
|
492
488
|
{
|
|
493
|
-
snippet: `
|
|
494
|
-
|
|
495
|
-
|
|
489
|
+
snippet: `No direct search results. The documentation index is available at ` +
|
|
490
|
+
`${this.llmsTxtUrl} — fetch it to discover real doc URLs, ` +
|
|
491
|
+
`then fetch_page specific topics.`,
|
|
492
|
+
title: `No results — try fetching ${this.llmsTxtUrl} for the doc index`,
|
|
493
|
+
url: this.llmsTxtUrl,
|
|
496
494
|
},
|
|
497
495
|
];
|
|
498
496
|
}
|
|
@@ -806,12 +804,14 @@ export default class AgenticProvider {
|
|
|
806
804
|
const maxToolRounds = this.config.maxToolRounds || 5;
|
|
807
805
|
const apiKey = this.config.apiKey || process.env.OPENAI_API_KEY;
|
|
808
806
|
// Newer OpenAI models (gpt-5.x, o-series) use max_completion_tokens
|
|
809
|
-
// instead of max_tokens
|
|
810
|
-
|
|
811
|
-
|
|
812
|
-
model.startsWith("gpt-5") ||
|
|
807
|
+
// instead of max_tokens, and reject custom temperature values. Detect
|
|
808
|
+
// from config or model name. See W0131.
|
|
809
|
+
const isReasoningModel = model.startsWith("gpt-5") ||
|
|
813
810
|
model.startsWith("o3") ||
|
|
814
811
|
model.startsWith("o4");
|
|
812
|
+
const useMaxCompletionTokens = this.config.max_output_tokens != null ||
|
|
813
|
+
this.config.max_completion_tokens != null ||
|
|
814
|
+
isReasoningModel;
|
|
815
815
|
const maxTokensValue = this.config.max_output_tokens ??
|
|
816
816
|
this.config.max_completion_tokens ??
|
|
817
817
|
this.config.max_tokens ??
|
|
@@ -840,15 +840,20 @@ export default class AgenticProvider {
|
|
|
840
840
|
const startTime = Date.now();
|
|
841
841
|
for (let round = 0; round <= maxToolRounds; round++) {
|
|
842
842
|
const isLastRound = round === maxToolRounds;
|
|
843
|
+
const requestBody = {
|
|
844
|
+
...tokenLimitParam,
|
|
845
|
+
messages,
|
|
846
|
+
model,
|
|
847
|
+
tool_choice: isLastRound ? "none" : "auto",
|
|
848
|
+
tools,
|
|
849
|
+
};
|
|
850
|
+
// gpt-5.x and o-series reject custom temperature; chat-completions
|
|
851
|
+
// models continue to receive the configured value. See W0131.
|
|
852
|
+
if (!isReasoningModel) {
|
|
853
|
+
requestBody.temperature = temperature;
|
|
854
|
+
}
|
|
843
855
|
const response = await fetchFn("https://api.openai.com/v1/chat/completions", {
|
|
844
|
-
body: JSON.stringify(
|
|
845
|
-
...tokenLimitParam,
|
|
846
|
-
messages,
|
|
847
|
-
model,
|
|
848
|
-
temperature,
|
|
849
|
-
tool_choice: isLastRound ? "none" : "auto",
|
|
850
|
-
tools,
|
|
851
|
-
}),
|
|
856
|
+
body: JSON.stringify(requestBody),
|
|
852
857
|
headers: {
|
|
853
858
|
Authorization: `Bearer ${apiKey}`,
|
|
854
859
|
"Content-Type": "application/json",
|
|
@@ -65,6 +65,11 @@ export function classifyRequests(requests) {
|
|
|
65
65
|
// Skip failed requests (no response)
|
|
66
66
|
if (req.statusCode === 0)
|
|
67
67
|
continue;
|
|
68
|
+
// Status-only entries (W0132) carry no body, so we can't infer search
|
|
69
|
+
// queries or doc-page metadata reliably. They still count as API calls
|
|
70
|
+
// (Sanity API) or external requests (everything else) so the run shape
|
|
71
|
+
// shows that the call happened, but we skip the body-dependent buckets.
|
|
72
|
+
const isStatusOnly = req.capture === "status-only";
|
|
68
73
|
// Order matters: API calls first (they may have ?query= params that look like searches),
|
|
69
74
|
// then searches, then doc pages, then external
|
|
70
75
|
if (isSanityApiRequest(req)) {
|
|
@@ -75,14 +80,14 @@ export function classifyRequests(requests) {
|
|
|
75
80
|
url: req.url,
|
|
76
81
|
});
|
|
77
82
|
}
|
|
78
|
-
else if (isSearchRequest(req)) {
|
|
83
|
+
else if (!isStatusOnly && isSearchRequest(req)) {
|
|
79
84
|
result.searchQueries.push({
|
|
80
85
|
query: extractSearchQuery(req),
|
|
81
86
|
timestamp: req.timestamp,
|
|
82
87
|
url: req.url,
|
|
83
88
|
});
|
|
84
89
|
}
|
|
85
|
-
else if (isDocPageRequest(req)) {
|
|
90
|
+
else if (!isStatusOnly && isDocPageRequest(req)) {
|
|
86
91
|
const slug = extractDocSlug(req.url);
|
|
87
92
|
result.docPageVisits.push({
|
|
88
93
|
contentSize: req.responseSize,
|
|
@@ -21,6 +21,25 @@
|
|
|
21
21
|
*
|
|
22
22
|
* const log = recorder.stop()
|
|
23
23
|
* // → AgentBehaviorLog with all requests classified
|
|
24
|
+
*
|
|
25
|
+
* W0133 — per-class preview byte caps
|
|
26
|
+
*
|
|
27
|
+
* `responsePreview` is capped at `previewLimits.default` (4 KB) for most
|
|
28
|
+
* responses, with per-class overrides for two payloads whose contents are
|
|
29
|
+
* the ground truth for trace audits:
|
|
30
|
+
*
|
|
31
|
+
* - `previewLimits.search` (16 KB) — Jina-wrapped DuckDuckGo, Google CSE,
|
|
32
|
+
* bing.com/search, duckduckgo.com, google.com/search responses. Captures
|
|
33
|
+
* the full result list (typical 8–10 KB) so trace audits can resolve
|
|
34
|
+
* which result the model fetched next.
|
|
35
|
+
* - `previewLimits.llmsTxt` (128 KB) — `/llms.txt` responses. The Sanity
|
|
36
|
+
* index is ~110 KB. Capturing the full body lets trace audits
|
|
37
|
+
* distinguish "model fetched a path that wasn't in the index" from
|
|
38
|
+
* "model fetched a path that was in the index but the page is missing".
|
|
39
|
+
*
|
|
40
|
+
* The slim Content Lake report (W0051) does not inline previews — they
|
|
41
|
+
* live in the GCS `traces` NDJSON artifact only, so bumping these caps
|
|
42
|
+
* has no effect on the 10 MB Sanity document budget.
|
|
24
43
|
*/
|
|
25
44
|
import type { ObservedRequest, AgentBehaviorLog } from "./types.js";
|
|
26
45
|
export interface RecorderOptions {
|
|
@@ -31,13 +50,50 @@ export interface RecorderOptions {
|
|
|
31
50
|
/** Filter: skip requests matching these URL patterns. Default: skip none.
|
|
32
51
|
* Accepts RegExp or string (strings are auto-converted to case-insensitive RegExp). */
|
|
33
52
|
excludePatterns?: (RegExp | string)[];
|
|
34
|
-
/** Filter: only record requests matching these URL patterns. Default: record all.
|
|
35
|
-
*
|
|
53
|
+
/** Filter: only fully record requests matching these URL patterns. Default: record all fully.
|
|
54
|
+
* When `statusOnlyForUnmatched` is true (default), unmatched URLs still emit a slim
|
|
55
|
+
* status-only observation. Accepts RegExp or string (strings are auto-converted to
|
|
56
|
+
* case-insensitive RegExp). */
|
|
36
57
|
includePatterns?: (RegExp | string)[];
|
|
37
58
|
/** Maximum request body bytes to capture. Default: 4096 */
|
|
38
59
|
maxBodyBytes?: number;
|
|
39
|
-
/**
|
|
60
|
+
/**
|
|
61
|
+
* Default response preview byte cap. Default: 4096.
|
|
62
|
+
*
|
|
63
|
+
* Per-class overrides in `previewLimits` may extend this for specific
|
|
64
|
+
* URL patterns. If `previewLimits` is set, `previewLimits.default` wins
|
|
65
|
+
* over `maxPreviewBytes`.
|
|
66
|
+
*/
|
|
40
67
|
maxPreviewBytes?: number;
|
|
68
|
+
/**
|
|
69
|
+
* Per-class response preview byte caps (W0133). Lets the recorder
|
|
70
|
+
* capture larger previews for response classes whose contents are the
|
|
71
|
+
* ground truth for trace audits, without inflating preview size for
|
|
72
|
+
* generic responses.
|
|
73
|
+
*
|
|
74
|
+
* - `default` — used when no other class matches. Falls back to
|
|
75
|
+
* `maxPreviewBytes` when omitted (defaults to 4 KB).
|
|
76
|
+
* - `search` — Jina-wrapped DuckDuckGo, Google CSE, bing/duckduckgo,
|
|
77
|
+
* google.com/search responses. Default: 16 KB.
|
|
78
|
+
* - `llmsTxt` — `/llms.txt` responses. Default: 128 KB.
|
|
79
|
+
*/
|
|
80
|
+
previewLimits?: {
|
|
81
|
+
default?: number;
|
|
82
|
+
llmsTxt?: number;
|
|
83
|
+
search?: number;
|
|
84
|
+
};
|
|
85
|
+
/**
|
|
86
|
+
* When a URL fails `includePatterns` but passes `excludePatterns`, emit a
|
|
87
|
+
* slim observation (url/method/statusCode/latencyMs/timestamp/seq, with
|
|
88
|
+
* `capture: "status-only"`) instead of dropping it entirely. Default: true.
|
|
89
|
+
*
|
|
90
|
+
* Setting to `false` restores strict-allowlist behavior — unmatched URLs
|
|
91
|
+
* are dropped, leaving no record of the call. The default exists so
|
|
92
|
+
* model-side traffic to api.openai.com / api.anthropic.com /
|
|
93
|
+
* googleapis.com is visible in run artifacts without recording prompts,
|
|
94
|
+
* completions, or API keys. See W0132.
|
|
95
|
+
*/
|
|
96
|
+
statusOnlyForUnmatched?: boolean;
|
|
41
97
|
}
|
|
42
98
|
export declare class RequestRecorder {
|
|
43
99
|
private observations;
|
|
@@ -69,8 +125,37 @@ export declare class RequestRecorder {
|
|
|
69
125
|
*
|
|
70
126
|
* Use this when you can't wrap `fetch` directly but can observe traffic
|
|
71
127
|
* (e.g., via browser DevTools Protocol, mitmproxy logs, etc.).
|
|
128
|
+
*
|
|
129
|
+
* Filter behavior (W0132):
|
|
130
|
+
* - `excludePatterns` always drops the observation entirely.
|
|
131
|
+
* - `includePatterns` mismatch produces a slim `capture: "status-only"`
|
|
132
|
+
* record when `statusOnlyForUnmatched` is true (default), or drops it
|
|
133
|
+
* when false.
|
|
134
|
+
* - The discriminator on the input is honored: callers that already
|
|
135
|
+
* know they're emitting a slim record (e.g., the fetch wrapper) can
|
|
136
|
+
* set `capture: "status-only"` themselves.
|
|
72
137
|
*/
|
|
73
138
|
record(observation: Omit<ObservedRequest, "seq">): void;
|
|
139
|
+
/**
|
|
140
|
+
* Resolve the preview byte cap for a given URL using per-class overrides
|
|
141
|
+
* (W0133). Order of preference:
|
|
142
|
+
* 1. `previewLimits.llmsTxt` for `/llms.txt` URLs.
|
|
143
|
+
* 2. `previewLimits.search` for known search providers.
|
|
144
|
+
* 3. `previewLimits.default`.
|
|
145
|
+
*/
|
|
146
|
+
private resolvePreviewBytes;
|
|
147
|
+
/**
|
|
148
|
+
* Decide how to record a URL given the current filter configuration.
|
|
149
|
+
*
|
|
150
|
+
* - `"drop"` — `excludePatterns` matched, or `includePatterns` failed
|
|
151
|
+
* and `statusOnlyForUnmatched` is false.
|
|
152
|
+
* - `"status-only"` — `includePatterns` failed but
|
|
153
|
+
* `statusOnlyForUnmatched` is true (default). Skip body/headers.
|
|
154
|
+
* - `"full"` — record everything.
|
|
155
|
+
*
|
|
156
|
+
* See W0132.
|
|
157
|
+
*/
|
|
158
|
+
private classifyCaptureMode;
|
|
74
159
|
/**
|
|
75
160
|
* Reset the recorder for reuse without creating a new instance.
|
|
76
161
|
*/
|
|
@@ -21,8 +21,49 @@
|
|
|
21
21
|
*
|
|
22
22
|
* const log = recorder.stop()
|
|
23
23
|
* // → AgentBehaviorLog with all requests classified
|
|
24
|
+
*
|
|
25
|
+
* W0133 — per-class preview byte caps
|
|
26
|
+
*
|
|
27
|
+
* `responsePreview` is capped at `previewLimits.default` (4 KB) for most
|
|
28
|
+
* responses, with per-class overrides for two payloads whose contents are
|
|
29
|
+
* the ground truth for trace audits:
|
|
30
|
+
*
|
|
31
|
+
* - `previewLimits.search` (16 KB) — Jina-wrapped DuckDuckGo, Google CSE,
|
|
32
|
+
* bing.com/search, duckduckgo.com, google.com/search responses. Captures
|
|
33
|
+
* the full result list (typical 8–10 KB) so trace audits can resolve
|
|
34
|
+
* which result the model fetched next.
|
|
35
|
+
* - `previewLimits.llmsTxt` (128 KB) — `/llms.txt` responses. The Sanity
|
|
36
|
+
* index is ~110 KB. Capturing the full body lets trace audits
|
|
37
|
+
* distinguish "model fetched a path that wasn't in the index" from
|
|
38
|
+
* "model fetched a path that was in the index but the page is missing".
|
|
39
|
+
*
|
|
40
|
+
* The slim Content Lake report (W0051) does not inline previews — they
|
|
41
|
+
* live in the GCS `traces` NDJSON artifact only, so bumping these caps
|
|
42
|
+
* has no effect on the 10 MB Sanity document budget.
|
|
24
43
|
*/
|
|
25
44
|
import { classifyRequests } from "./classifier.js";
|
|
45
|
+
/** Per-class preview-byte defaults (W0133). */
|
|
46
|
+
const DEFAULT_PREVIEW_LIMITS = {
|
|
47
|
+
default: 4096,
|
|
48
|
+
llmsTxt: 131072, // ~128 KB — covers Sanity's ~110 KB llms.txt
|
|
49
|
+
search: 16384, // ~16 KB — Jina/Google CSE/duckduckgo result lists
|
|
50
|
+
};
|
|
51
|
+
/**
|
|
52
|
+
* URL patterns for the `search` response class (W0133). These cover the
|
|
53
|
+
* search providers the agentic loop actually hits; new providers can be
|
|
54
|
+
* added here without changing the recorder API surface.
|
|
55
|
+
*/
|
|
56
|
+
const SEARCH_URL_PATTERNS = [
|
|
57
|
+
/r\.jina\.ai\/https?:\/\/(www\.)?duckduckgo\.com/i,
|
|
58
|
+
/r\.jina\.ai\/https?:\/\/(www\.)?google\.com\/search/i,
|
|
59
|
+
/r\.jina\.ai\/https?:\/\/(www\.)?bing\.com\/search/i,
|
|
60
|
+
/^https?:\/\/(www\.)?googleapis\.com\/customsearch/i,
|
|
61
|
+
/^https?:\/\/(www\.)?google\.com\/search/i,
|
|
62
|
+
/^https?:\/\/(www\.)?bing\.com\/search/i,
|
|
63
|
+
/^https?:\/\/(www\.)?duckduckgo\.com/i,
|
|
64
|
+
];
|
|
65
|
+
/** URL pattern for the `llmsTxt` response class (W0133). */
|
|
66
|
+
const LLMS_TXT_PATTERN = /\/llms\.txt(\?|$|\/)/i;
|
|
26
67
|
const DEFAULT_OPTIONS = {
|
|
27
68
|
captureHeaders: [
|
|
28
69
|
"accept",
|
|
@@ -40,7 +81,9 @@ const DEFAULT_OPTIONS = {
|
|
|
40
81
|
],
|
|
41
82
|
includePatterns: [],
|
|
42
83
|
maxBodyBytes: 4096,
|
|
43
|
-
maxPreviewBytes:
|
|
84
|
+
maxPreviewBytes: DEFAULT_PREVIEW_LIMITS.default,
|
|
85
|
+
previewLimits: { ...DEFAULT_PREVIEW_LIMITS },
|
|
86
|
+
statusOnlyForUnmatched: true,
|
|
44
87
|
};
|
|
45
88
|
// ---------------------------------------------------------------------------
|
|
46
89
|
// RequestRecorder
|
|
@@ -63,6 +106,19 @@ export class RequestRecorder {
|
|
|
63
106
|
if (merged.excludePatterns) {
|
|
64
107
|
merged.excludePatterns = merged.excludePatterns.map(toRegExp);
|
|
65
108
|
}
|
|
109
|
+
// Resolve per-class preview caps. `previewLimits.default` wins over
|
|
110
|
+
// `maxPreviewBytes`; missing entries fall through to module defaults
|
|
111
|
+
// (W0133).
|
|
112
|
+
const userLimits = options?.previewLimits ?? {};
|
|
113
|
+
const resolvedDefault = userLimits.default ??
|
|
114
|
+
options?.maxPreviewBytes ??
|
|
115
|
+
DEFAULT_PREVIEW_LIMITS.default;
|
|
116
|
+
merged.previewLimits = {
|
|
117
|
+
default: resolvedDefault,
|
|
118
|
+
llmsTxt: userLimits.llmsTxt ?? DEFAULT_PREVIEW_LIMITS.llmsTxt,
|
|
119
|
+
search: userLimits.search ?? DEFAULT_PREVIEW_LIMITS.search,
|
|
120
|
+
};
|
|
121
|
+
merged.maxPreviewBytes = resolvedDefault;
|
|
66
122
|
this.options = merged;
|
|
67
123
|
}
|
|
68
124
|
/**
|
|
@@ -83,6 +139,7 @@ export class RequestRecorder {
|
|
|
83
139
|
? input.method
|
|
84
140
|
: "GET") ??
|
|
85
141
|
"GET";
|
|
142
|
+
const captureMode = this.classifyCaptureMode(url);
|
|
86
143
|
let response;
|
|
87
144
|
let error = null;
|
|
88
145
|
try {
|
|
@@ -90,31 +147,64 @@ export class RequestRecorder {
|
|
|
90
147
|
}
|
|
91
148
|
catch (err) {
|
|
92
149
|
error = err;
|
|
93
|
-
|
|
150
|
+
if (captureMode === "drop")
|
|
151
|
+
throw error;
|
|
152
|
+
// Record the failed request — status-only captures skip body/headers
|
|
153
|
+
// entirely (W0132).
|
|
154
|
+
this.record(captureMode === "full"
|
|
155
|
+
? {
|
|
156
|
+
body: await this.extractBody(init?.body),
|
|
157
|
+
capture: "full",
|
|
158
|
+
contentType: undefined,
|
|
159
|
+
headers: this.extractHeaders(init?.headers),
|
|
160
|
+
latencyMs: Date.now() - reqStart,
|
|
161
|
+
method: method.toUpperCase(),
|
|
162
|
+
responsePreview: `Error: ${error.message}`,
|
|
163
|
+
responseSize: 0,
|
|
164
|
+
statusCode: 0,
|
|
165
|
+
timestamp: new Date(reqStart).toISOString(),
|
|
166
|
+
url,
|
|
167
|
+
}
|
|
168
|
+
: {
|
|
169
|
+
capture: "status-only",
|
|
170
|
+
headers: {},
|
|
171
|
+
latencyMs: Date.now() - reqStart,
|
|
172
|
+
method: method.toUpperCase(),
|
|
173
|
+
responseSize: 0,
|
|
174
|
+
statusCode: 0,
|
|
175
|
+
timestamp: new Date(reqStart).toISOString(),
|
|
176
|
+
url,
|
|
177
|
+
});
|
|
178
|
+
throw error;
|
|
179
|
+
}
|
|
180
|
+
const latencyMs = Date.now() - reqStart;
|
|
181
|
+
if (captureMode === "drop")
|
|
182
|
+
return response;
|
|
183
|
+
if (captureMode === "status-only") {
|
|
184
|
+
// No body read, no header capture, no preview — only the metadata
|
|
185
|
+
// needed to know the call happened (W0132).
|
|
94
186
|
this.record({
|
|
95
|
-
|
|
96
|
-
|
|
97
|
-
|
|
98
|
-
latencyMs: Date.now() - reqStart,
|
|
187
|
+
capture: "status-only",
|
|
188
|
+
headers: {},
|
|
189
|
+
latencyMs,
|
|
99
190
|
method: method.toUpperCase(),
|
|
100
|
-
responsePreview: `Error: ${error.message}`,
|
|
101
191
|
responseSize: 0,
|
|
102
|
-
statusCode:
|
|
192
|
+
statusCode: response.status,
|
|
103
193
|
timestamp: new Date(reqStart).toISOString(),
|
|
104
194
|
url,
|
|
105
195
|
});
|
|
106
|
-
|
|
196
|
+
return response;
|
|
107
197
|
}
|
|
108
|
-
const latencyMs = Date.now() - reqStart;
|
|
109
198
|
// Clone the response so we can read the body without consuming it
|
|
110
199
|
const clone = response.clone();
|
|
111
200
|
let responseSize = 0;
|
|
112
201
|
let responsePreview;
|
|
113
202
|
if (this.options.captureResponsePreview) {
|
|
203
|
+
const previewBytes = this.resolvePreviewBytes(url);
|
|
114
204
|
try {
|
|
115
205
|
const text = await clone.text();
|
|
116
206
|
responseSize = new TextEncoder().encode(text).length;
|
|
117
|
-
responsePreview = text.slice(0,
|
|
207
|
+
responsePreview = text.slice(0, previewBytes);
|
|
118
208
|
}
|
|
119
209
|
catch {
|
|
120
210
|
// Body might not be text — that's fine
|
|
@@ -123,6 +213,7 @@ export class RequestRecorder {
|
|
|
123
213
|
}
|
|
124
214
|
this.record({
|
|
125
215
|
body: await this.extractBody(init?.body),
|
|
216
|
+
capture: "full",
|
|
126
217
|
contentType: response.headers.get("content-type") ?? undefined,
|
|
127
218
|
headers: this.extractHeaders(init?.headers),
|
|
128
219
|
latencyMs,
|
|
@@ -152,26 +243,93 @@ export class RequestRecorder {
|
|
|
152
243
|
*
|
|
153
244
|
* Use this when you can't wrap `fetch` directly but can observe traffic
|
|
154
245
|
* (e.g., via browser DevTools Protocol, mitmproxy logs, etc.).
|
|
246
|
+
*
|
|
247
|
+
* Filter behavior (W0132):
|
|
248
|
+
* - `excludePatterns` always drops the observation entirely.
|
|
249
|
+
* - `includePatterns` mismatch produces a slim `capture: "status-only"`
|
|
250
|
+
* record when `statusOnlyForUnmatched` is true (default), or drops it
|
|
251
|
+
* when false.
|
|
252
|
+
* - The discriminator on the input is honored: callers that already
|
|
253
|
+
* know they're emitting a slim record (e.g., the fetch wrapper) can
|
|
254
|
+
* set `capture: "status-only"` themselves.
|
|
155
255
|
*/
|
|
156
256
|
record(observation) {
|
|
157
257
|
if (!this.running)
|
|
158
258
|
return;
|
|
159
259
|
const url = observation.url;
|
|
160
|
-
|
|
260
|
+
if (this.options.excludePatterns.some((p) => p.test(url)))
|
|
261
|
+
return;
|
|
262
|
+
let capture = observation.capture ?? "full";
|
|
161
263
|
if (this.options.includePatterns.length > 0) {
|
|
162
|
-
|
|
163
|
-
|
|
264
|
+
const matchesIncludes = this.options.includePatterns.some((p) => p.test(url));
|
|
265
|
+
if (!matchesIncludes) {
|
|
266
|
+
if (!this.options.statusOnlyForUnmatched)
|
|
267
|
+
return;
|
|
268
|
+
capture = "status-only";
|
|
269
|
+
}
|
|
164
270
|
}
|
|
165
|
-
if (
|
|
271
|
+
if (capture === "status-only") {
|
|
272
|
+
// Slim shape — strip body/headers/contentType/responsePreview so a
|
|
273
|
+
// caller that passed full data still produces a sanitized record.
|
|
274
|
+
this.observations.push({
|
|
275
|
+
capture: "status-only",
|
|
276
|
+
headers: {},
|
|
277
|
+
latencyMs: observation.latencyMs,
|
|
278
|
+
method: observation.method,
|
|
279
|
+
responseSize: 0,
|
|
280
|
+
seq: this.seq++,
|
|
281
|
+
statusCode: observation.statusCode,
|
|
282
|
+
timestamp: observation.timestamp,
|
|
283
|
+
url,
|
|
284
|
+
});
|
|
166
285
|
return;
|
|
286
|
+
}
|
|
287
|
+
const previewBytes = this.resolvePreviewBytes(url);
|
|
167
288
|
this.observations.push({
|
|
168
289
|
...observation,
|
|
290
|
+
capture: "full",
|
|
169
291
|
// Truncate body if needed
|
|
170
292
|
body: observation.body?.slice(0, this.options.maxBodyBytes),
|
|
171
|
-
responsePreview: observation.responsePreview?.slice(0,
|
|
293
|
+
responsePreview: observation.responsePreview?.slice(0, previewBytes),
|
|
172
294
|
seq: this.seq++,
|
|
173
295
|
});
|
|
174
296
|
}
|
|
297
|
+
/**
|
|
298
|
+
* Resolve the preview byte cap for a given URL using per-class overrides
|
|
299
|
+
* (W0133). Order of preference:
|
|
300
|
+
* 1. `previewLimits.llmsTxt` for `/llms.txt` URLs.
|
|
301
|
+
* 2. `previewLimits.search` for known search providers.
|
|
302
|
+
* 3. `previewLimits.default`.
|
|
303
|
+
*/
|
|
304
|
+
resolvePreviewBytes(url) {
|
|
305
|
+
if (LLMS_TXT_PATTERN.test(url))
|
|
306
|
+
return this.options.previewLimits.llmsTxt;
|
|
307
|
+
if (SEARCH_URL_PATTERNS.some((p) => p.test(url))) {
|
|
308
|
+
return this.options.previewLimits.search;
|
|
309
|
+
}
|
|
310
|
+
return this.options.previewLimits.default;
|
|
311
|
+
}
|
|
312
|
+
/**
|
|
313
|
+
* Decide how to record a URL given the current filter configuration.
|
|
314
|
+
*
|
|
315
|
+
* - `"drop"` — `excludePatterns` matched, or `includePatterns` failed
|
|
316
|
+
* and `statusOnlyForUnmatched` is false.
|
|
317
|
+
* - `"status-only"` — `includePatterns` failed but
|
|
318
|
+
* `statusOnlyForUnmatched` is true (default). Skip body/headers.
|
|
319
|
+
* - `"full"` — record everything.
|
|
320
|
+
*
|
|
321
|
+
* See W0132.
|
|
322
|
+
*/
|
|
323
|
+
classifyCaptureMode(url) {
|
|
324
|
+
if (this.options.excludePatterns.some((p) => p.test(url)))
|
|
325
|
+
return "drop";
|
|
326
|
+
if (this.options.includePatterns.length === 0)
|
|
327
|
+
return "full";
|
|
328
|
+
const matchesIncludes = this.options.includePatterns.some((p) => p.test(url));
|
|
329
|
+
if (matchesIncludes)
|
|
330
|
+
return "full";
|
|
331
|
+
return this.options.statusOnlyForUnmatched ? "status-only" : "drop";
|
|
332
|
+
}
|
|
175
333
|
/**
|
|
176
334
|
* Reset the recorder for reuse without creating a new instance.
|
|
177
335
|
*/
|