@sanity/ailf 3.9.0 → 4.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/config/models.ts +32 -4
- package/dist/_vendor/ailf-core/config-helpers.d.ts +8 -2
- package/dist/_vendor/ailf-core/config-helpers.js +54 -1
- package/dist/_vendor/ailf-core/services/slim-report-summary.js +13 -4
- package/dist/_vendor/ailf-core/types/index.d.ts +10 -0
- package/dist/_vendor/ailf-shared/index.d.ts +16 -10
- package/dist/_vendor/ailf-shared/index.js +13 -10
- package/dist/adapters/task-sources/repo-schemas.d.ts +3 -3
- package/dist/agent-observer/agentic-provider.js +28 -23
- package/dist/agent-observer/classifier.js +7 -2
- package/dist/agent-observer/proxy.d.ts +88 -3
- package/dist/agent-observer/proxy.js +174 -16
- package/dist/agent-observer/types.d.ts +23 -5
- package/dist/artifact-capture/accumulating-artifact-writer.d.ts +13 -0
- package/dist/artifact-capture/accumulating-artifact-writer.js +19 -0
- package/dist/cli-program.js +1 -1
- package/dist/commands/baseline.d.ts +3 -1
- package/dist/commands/baseline.js +29 -9
- package/dist/commands/cache.d.ts +5 -1
- package/dist/commands/cache.js +31 -15
- package/dist/commands/check-staleness.js +12 -4
- package/dist/commands/compare.js +11 -4
- package/dist/commands/explain-handler.js +2 -2
- package/dist/config/models.ts +32 -4
- package/dist/orchestration/steps/run-eval-step.js +39 -29
- package/dist/pipeline/baseline.d.ts +14 -3
- package/dist/pipeline/baseline.js +7 -13
- package/dist/pipeline/cache-hit-restore.d.ts +24 -0
- package/dist/pipeline/cache-hit-restore.js +32 -0
- package/dist/pipeline/calculate-scores.js +40 -1
- package/dist/pipeline/compiler/provider-assembler.d.ts +23 -0
- package/dist/pipeline/compiler/provider-assembler.js +37 -2
- package/dist/pipeline/eval-fingerprint.d.ts +33 -35
- package/dist/pipeline/eval-fingerprint.js +124 -106
- package/dist/pipeline/mirror-repo-tasks.d.ts +1 -1
- package/dist/report-store.js +3 -0
- package/package.json +2 -2
package/config/models.ts
CHANGED
|
@@ -35,16 +35,23 @@ export default defineModels({
|
|
|
35
35
|
|
|
36
36
|
// ── OpenAI ─────────────────────────────────────────────────
|
|
37
37
|
{
|
|
38
|
+
// gpt-5.2 routes through chat completions (and through the in-house
|
|
39
|
+
// agentic provider for naive/optimized variants). `verbosity` is a
|
|
40
|
+
// Responses-API-only field — it would be silently dropped here, so
|
|
41
|
+
// it isn't configured. See W0131.
|
|
38
42
|
id: "openai:chat:gpt-5.2",
|
|
39
43
|
label: "GPT 5.2",
|
|
40
44
|
config: {
|
|
41
45
|
max_completion_tokens: 8192,
|
|
42
|
-
verbosity: "medium",
|
|
43
46
|
},
|
|
44
47
|
modes: ["literacy", "knowledge-probe"],
|
|
45
48
|
// All literacy variants included by default
|
|
46
49
|
},
|
|
47
50
|
{
|
|
51
|
+
// GPT 5.4 evaluated only on the baseline literacy variant. Promptfoo's
|
|
52
|
+
// native handling of `openai:responses:` honors reasoning / verbosity /
|
|
53
|
+
// summary; the in-house agentic provider does not (W0131). MCP-server
|
|
54
|
+
// and knowledge-probe routes go through Promptfoo native too.
|
|
48
55
|
id: "openai:responses:gpt-5.4",
|
|
49
56
|
label: "GPT 5.4",
|
|
50
57
|
config: {
|
|
@@ -55,7 +62,9 @@ export default defineModels({
|
|
|
55
62
|
},
|
|
56
63
|
timeoutMs: 600_000, // 10 min — reasoning model needs more headroom
|
|
57
64
|
modes: ["literacy", "mcp-server", "knowledge-probe"],
|
|
58
|
-
|
|
65
|
+
variants: {
|
|
66
|
+
literacy: ["baseline"],
|
|
67
|
+
},
|
|
59
68
|
},
|
|
60
69
|
|
|
61
70
|
// ── Disabled models (uncomment to enable) ──────────────────
|
|
@@ -93,12 +102,31 @@ export default defineModels({
|
|
|
93
102
|
defaults: {
|
|
94
103
|
temperature: 0.2,
|
|
95
104
|
max_tokens: 4096,
|
|
96
|
-
|
|
105
|
+
// Global default round budget for agentic modes. Per-mode overrides
|
|
106
|
+
// below give naive more headroom (W0134) since it spends rounds on
|
|
107
|
+
// retries when fetches fail. Per-model `config.maxToolRounds` still
|
|
108
|
+
// wins over both values.
|
|
109
|
+
maxToolRounds: 5,
|
|
110
|
+
modeMaxToolRounds: {
|
|
111
|
+
"agentic-naive": 8,
|
|
112
|
+
"agentic-optimized": 5,
|
|
113
|
+
},
|
|
97
114
|
observerOptions: {
|
|
98
|
-
|
|
115
|
+
// Per-class preview caps (W0133): default 4 KB, but search responses
|
|
116
|
+
// get 16 KB and llms.txt gets 128 KB so trace audits can resolve
|
|
117
|
+
// which result the model actually saw.
|
|
118
|
+
maxPreviewBytes: 4096,
|
|
119
|
+
previewLimits: {
|
|
120
|
+
default: 4096,
|
|
121
|
+
llmsTxt: 131072,
|
|
122
|
+
search: 16384,
|
|
123
|
+
},
|
|
99
124
|
captureResponsePreview: true,
|
|
100
125
|
includePatterns: ["sanity.io", "sanity.dev", "cdn.sanity.io"],
|
|
101
126
|
sensitiveHeaders: ["authorization", "cookie", "x-api-key"],
|
|
127
|
+
// statusOnlyForUnmatched defaults to true (W0132) — model-side
|
|
128
|
+
// traffic to api.openai.com / api.anthropic.com / googleapis.com
|
|
129
|
+
// surfaces in run artifacts as slim status-only entries.
|
|
102
130
|
},
|
|
103
131
|
},
|
|
104
132
|
})
|
|
package/dist/_vendor/ailf-core/config-helpers.d.ts
CHANGED
|
@@ -57,8 +57,14 @@ export declare function defineTask(task: GeneralizedTaskDefinition): Generalized
|
|
|
57
57
|
* Validates:
|
|
58
58
|
* - Every `modes` entry is a canonical eval mode name
|
|
59
59
|
* - Every `variants` key is a mode the model is enrolled in
|
|
60
|
-
*
|
|
61
|
-
*
|
|
60
|
+
* - `openai:responses:` model ids are not used for agentic literacy variants
|
|
61
|
+
* (the in-house agentic loop dispatches to chat completions only)
|
|
62
|
+
* - Responses-API-only fields (`reasoning`, `summary`, `verbosity`) are not
|
|
63
|
+
* set on a model that routes through the agentic provider — they would be
|
|
64
|
+
* silently dropped.
|
|
65
|
+
*
|
|
66
|
+
* @throws {Error} On invalid mode names, mismatched variant keys, or
|
|
67
|
+
* misconfigured OpenAI Responses-API fields.
|
|
62
68
|
*/
|
|
63
69
|
export declare function defineModels(models: ModelsConfig): ModelsConfig;
|
|
64
70
|
/**
|
|
package/dist/_vendor/ailf-core/config-helpers.js
CHANGED
|
@@ -54,6 +54,33 @@ export function defineTask(task) {
|
|
|
54
54
|
// ---------------------------------------------------------------------------
|
|
55
55
|
// Model registry helpers
|
|
56
56
|
// ---------------------------------------------------------------------------
|
|
57
|
+
/**
|
|
58
|
+
* OpenAI Responses-API-only fields. The agentic provider's OpenAI loop
|
|
59
|
+
* routes everything through `/v1/chat/completions` and would silently drop
|
|
60
|
+
* these. We surface the misconfiguration at config-load time instead.
|
|
61
|
+
*
|
|
62
|
+
* @see docs/work-items/W0131-honor-openai-responses-provider.json
|
|
63
|
+
*/
|
|
64
|
+
const RESPONSES_ONLY_FIELDS = ["reasoning", "summary", "verbosity"];
|
|
65
|
+
/**
|
|
66
|
+
* Whether a model would be assembled into agentic-naive or agentic-optimized
|
|
67
|
+
* literacy variants. These are the variants that route through the in-house
|
|
68
|
+
* agentic provider (which speaks chat completions only); baseline routes
|
|
69
|
+
* through Promptfoo's native handling, which honors `openai:responses:` ids.
|
|
70
|
+
*
|
|
71
|
+
* Note: variant names mirror the literacy mode base in
|
|
72
|
+
* `packages/eval/src/pipeline/compiler/mode-bases/literacy.ts`.
|
|
73
|
+
*/
|
|
74
|
+
function participatesInAgenticLiteracy(model) {
|
|
75
|
+
const enrolledInLiteracy = !model.modes || model.modes.includes("literacy");
|
|
76
|
+
if (!enrolledInLiteracy)
|
|
77
|
+
return false;
|
|
78
|
+
const literacyVariants = model.variants?.literacy;
|
|
79
|
+
if (!literacyVariants)
|
|
80
|
+
return true;
|
|
81
|
+
return (literacyVariants.includes("agentic-naive") ||
|
|
82
|
+
literacyVariants.includes("agentic-optimized"));
|
|
83
|
+
}
|
|
57
84
|
/**
|
|
58
85
|
* Define the model registry (models to evaluate and grader model).
|
|
59
86
|
*
|
|
@@ -62,8 +89,14 @@ export function defineTask(task) {
|
|
|
62
89
|
* Validates:
|
|
63
90
|
* - Every `modes` entry is a canonical eval mode name
|
|
64
91
|
* - Every `variants` key is a mode the model is enrolled in
|
|
92
|
+
* - `openai:responses:` model ids are not used for agentic literacy variants
|
|
93
|
+
* (the in-house agentic loop dispatches to chat completions only)
|
|
94
|
+
* - Responses-API-only fields (`reasoning`, `summary`, `verbosity`) are not
|
|
95
|
+
* set on a model that routes through the agentic provider — they would be
|
|
96
|
+
* silently dropped.
|
|
65
97
|
*
|
|
66
|
-
* @throws {Error} On invalid mode names
|
|
98
|
+
* @throws {Error} On invalid mode names, mismatched variant keys, or
|
|
99
|
+
* misconfigured OpenAI Responses-API fields.
|
|
67
100
|
*/
|
|
68
101
|
export function defineModels(models) {
|
|
69
102
|
const validModes = new Set(CANONICAL_EVAL_MODES);
|
|
@@ -87,6 +120,26 @@ export function defineModels(models) {
|
|
|
87
120
|
}
|
|
88
121
|
}
|
|
89
122
|
}
|
|
123
|
+
const usesAgentic = participatesInAgenticLiteracy(model);
|
|
124
|
+
if (usesAgentic && model.id.startsWith("openai:responses:")) {
|
|
125
|
+
throw new Error(`Model "${model.label ?? model.id}": the in-house agentic provider ` +
|
|
126
|
+
`does not implement the OpenAI Responses API endpoint — requests would ` +
|
|
127
|
+
`be silently downgraded to chat completions. Either restrict variants to ` +
|
|
128
|
+
`["baseline"] (Promptfoo's native handling honors openai:responses:) or ` +
|
|
129
|
+
`change the id to "openai:chat:..." for agentic evaluation. ` +
|
|
130
|
+
`See W0131 for context.`);
|
|
131
|
+
}
|
|
132
|
+
if (usesAgentic && model.config) {
|
|
133
|
+
const droppedFields = RESPONSES_ONLY_FIELDS.filter((f) => f in model.config);
|
|
134
|
+
if (droppedFields.length > 0) {
|
|
135
|
+
throw new Error(`Model "${model.label ?? model.id}": configured fields ` +
|
|
136
|
+
`${droppedFields.map((f) => `"${f}"`).join(", ")} are only honored ` +
|
|
137
|
+
`by the OpenAI Responses API. The agentic provider's chat-completions ` +
|
|
138
|
+
`path would silently drop them. Either remove these fields or restrict ` +
|
|
139
|
+
`variants to ["baseline"] so the model is evaluated only through ` +
|
|
140
|
+
`Promptfoo's native Responses-API handler. See W0131 for context.`);
|
|
141
|
+
}
|
|
142
|
+
}
|
|
90
143
|
}
|
|
91
144
|
return models;
|
|
92
145
|
}
|
|
package/dist/_vendor/ailf-core/services/slim-report-summary.js
CHANGED
|
@@ -138,12 +138,21 @@ function toTitleCase(id) {
|
|
|
138
138
|
// ---------------------------------------------------------------------------
|
|
139
139
|
const RECOMMENDATION_TOP_N = 3;
|
|
140
140
|
function slimRecommendations(full) {
|
|
141
|
+
// Cache-hit pass-through: when the pipeline restores a previously
|
|
142
|
+
// published report on a remote cache hit, `score-summary.json` carries
|
|
143
|
+
// recommendations in their already-slim shape (no `.gaps` field).
|
|
144
|
+
// Re-slimming would crash on `for (gap of undefined)`; the slim shape
|
|
145
|
+
// has no full-fidelity data to recover, so we return it verbatim.
|
|
146
|
+
if (!Array.isArray(full.gaps)) {
|
|
147
|
+
return full;
|
|
148
|
+
}
|
|
149
|
+
const fullReport = full;
|
|
141
150
|
const counts = {};
|
|
142
|
-
for (const gap of full.gaps) {
|
|
151
|
+
for (const gap of fullReport.gaps) {
|
|
143
152
|
counts[gap.area] = (counts[gap.area] ?? 0) + 1;
|
|
144
153
|
}
|
|
145
154
|
// Sort by priority descending, break ties by estimatedLift.
|
|
146
|
-
const sorted = [...full.gaps].sort((a, b) => (b.priority ?? 0) - (a.priority ?? 0) ||
|
|
155
|
+
const sorted = [...fullReport.gaps].sort((a, b) => (b.priority ?? 0) - (a.priority ?? 0) ||
|
|
147
156
|
(b.estimatedLift ?? 0) - (a.estimatedLift ?? 0));
|
|
148
157
|
const top3 = sorted
|
|
149
158
|
.slice(0, RECOMMENDATION_TOP_N)
|
|
@@ -156,8 +165,8 @@ function slimRecommendations(full) {
|
|
|
156
165
|
return {
|
|
157
166
|
counts,
|
|
158
167
|
top3,
|
|
159
|
-
totalGaps: full.gaps.length,
|
|
160
|
-
totalPotentialLift: full.totalPotentialLift,
|
|
168
|
+
totalGaps: fullReport.gaps.length,
|
|
169
|
+
totalPotentialLift: fullReport.totalPotentialLift,
|
|
161
170
|
};
|
|
162
171
|
}
|
|
163
172
|
/**
|
|
package/dist/_vendor/ailf-core/types/index.d.ts
CHANGED
|
@@ -1364,6 +1364,15 @@ export interface ArtifactRefEntry {
|
|
|
1364
1364
|
* - `truncated` on the bulk row indicates the single-object body was capped.
|
|
1365
1365
|
* - `preview` on the bulk row carries a descriptor-typed summary for list
|
|
1366
1366
|
* views; wiring lands in W0051.
|
|
1367
|
+
*
|
|
1368
|
+
* D0040/W0135 extension:
|
|
1369
|
+
* - `sourceRunId` declares that this ref's bytes physically live under a
|
|
1370
|
+
* different run's storage prefix than the manifest containing it.
|
|
1371
|
+
* `path` is already self-contained and authoritative for resolution;
|
|
1372
|
+
* `sourceRunId` is purely a lineage marker for retention, GC,
|
|
1373
|
+
* observability, and BigQuery joins. Set by the cache-hit branch in
|
|
1374
|
+
* `RunEvalStep` when a new run reuses a prior report's artifacts;
|
|
1375
|
+
* unset on cold-path producers.
|
|
1367
1376
|
*/
|
|
1368
1377
|
export interface ArtifactRef {
|
|
1369
1378
|
store: "gcs" | "local";
|
|
@@ -1381,6 +1390,7 @@ export interface ArtifactRef {
|
|
|
1381
1390
|
entries?: ArtifactRefEntry[];
|
|
1382
1391
|
truncated?: boolean;
|
|
1383
1392
|
preview?: unknown;
|
|
1393
|
+
sourceRunId?: RunId;
|
|
1384
1394
|
}
|
|
1385
1395
|
/**
|
|
1386
1396
|
* Catalog of artifact refs produced by a single pipeline run.
|
|
package/dist/_vendor/ailf-shared/index.d.ts
CHANGED
|
@@ -8,14 +8,20 @@
|
|
|
8
8
|
* Design rule: this package has ZERO runtime dependencies and ZERO imports
|
|
9
9
|
* from @sanity/ailf-core, @sanity/ailf, or
|
|
10
10
|
* @sanity/ailf-studio. It is the leaf of the dependency graph.
|
|
11
|
+
*
|
|
12
|
+
* Re-exports are explicit (named) rather than `export *` so that the studio
|
|
13
|
+
* tsup DTS bundle can statically resolve each symbol's canonical owner —
|
|
14
|
+
* `export *` chains across many modules trip rollup-plugin-dts's "Ambiguous
|
|
15
|
+
* external namespace resolution" warning even when no symbol actually
|
|
16
|
+
* collides. See W0124.
|
|
11
17
|
*/
|
|
12
|
-
export * from "./canary-drift.js";
|
|
13
|
-
export * from "./document-ref.js";
|
|
14
|
-
export * from "./feature-flags.js";
|
|
15
|
-
export * from "./score-grades.js";
|
|
16
|
-
export * from "./noise-threshold.js";
|
|
17
|
-
export * from "./eval-modes.js";
|
|
18
|
-
export * from "./owner-teams.js";
|
|
19
|
-
export * from "./run-classification.js";
|
|
20
|
-
export * from "./run-trigger.js";
|
|
21
|
-
export * from "./run-context.js";
|
|
18
|
+
export { computeCanaryDrift, type CanaryDriftReport, type CanaryReportSlim, type DriftEntry, type DriftThresholds, type DriftVerdict, } from "./canary-drift.js";
|
|
19
|
+
export { type DocumentRef } from "./document-ref.js";
|
|
20
|
+
export { FEATURE_FLAGS, type FeatureFlag, type FeatureFlagKey, } from "./feature-flags.js";
|
|
21
|
+
export { GRADE_BOUNDARIES, scoreGrade, type ScoreGrade, } from "./score-grades.js";
|
|
22
|
+
export { NOISE_THRESHOLD } from "./noise-threshold.js";
|
|
23
|
+
export { CANONICAL_EVAL_MODES, LEGACY_EVAL_MODE_ALIASES, LITERACY_VARIANTS, RAW_EVAL_MODES, type EvalMode, type LiteracyVariant, type RawEvalMode, } from "./eval-modes.js";
|
|
24
|
+
export { isKnownOwnerTeam, KNOWN_OWNER_TEAMS, normalizeOwnerTeam, } from "./owner-teams.js";
|
|
25
|
+
export { isRunClassification, RUN_CLASSIFICATIONS, RUN_EXECUTOR_SURFACES, type RunClassification, type RunExecutor, type RunExecutorSurface, type RunExecutorSystem, type RunExecutorUser, type RunHost, type RunLineage, type RunOwner, type RunTool, } from "./run-classification.js";
|
|
26
|
+
export { type RunTrigger } from "./run-trigger.js";
|
|
27
|
+
export { type RunContext } from "./run-context.js";
|
|
package/dist/_vendor/ailf-shared/index.js
CHANGED
|
@@ -8,14 +8,17 @@
|
|
|
8
8
|
* Design rule: this package has ZERO runtime dependencies and ZERO imports
|
|
9
9
|
* from @sanity/ailf-core, @sanity/ailf, or
|
|
10
10
|
* @sanity/ailf-studio. It is the leaf of the dependency graph.
|
|
11
|
+
*
|
|
12
|
+
* Re-exports are explicit (named) rather than `export *` so that the studio
|
|
13
|
+
* tsup DTS bundle can statically resolve each symbol's canonical owner —
|
|
14
|
+
* `export *` chains across many modules trip rollup-plugin-dts's "Ambiguous
|
|
15
|
+
* external namespace resolution" warning even when no symbol actually
|
|
16
|
+
* collides. See W0124.
|
|
11
17
|
*/
|
|
12
|
-
export * from "./canary-drift.js";
|
|
13
|
-
export * from "./document-ref.js";
|
|
14
|
-
export * from "./feature-flags.js";
|
|
15
|
-
export * from "./score-grades.js";
|
|
16
|
-
export * from "./noise-threshold.js";
|
|
17
|
-
export * from "./eval-modes.js";
|
|
18
|
-
export * from "./owner-teams.js";
|
|
19
|
-
export * from "./run-classification.js";
|
|
20
|
-
export * from "./run-trigger.js";
|
|
21
|
-
export * from "./run-context.js";
|
|
18
|
+
export { computeCanaryDrift, } from "./canary-drift.js";
|
|
19
|
+
export { FEATURE_FLAGS, } from "./feature-flags.js";
|
|
20
|
+
export { GRADE_BOUNDARIES, scoreGrade, } from "./score-grades.js";
|
|
21
|
+
export { NOISE_THRESHOLD } from "./noise-threshold.js";
|
|
22
|
+
export { CANONICAL_EVAL_MODES, LEGACY_EVAL_MODE_ALIASES, LITERACY_VARIANTS, RAW_EVAL_MODES, } from "./eval-modes.js";
|
|
23
|
+
export { isKnownOwnerTeam, KNOWN_OWNER_TEAMS, normalizeOwnerTeam, } from "./owner-teams.js";
|
|
24
|
+
export { isRunClassification, RUN_CLASSIFICATIONS, RUN_EXECUTOR_SURFACES, } from "./run-classification.js";
|
|
package/dist/adapters/task-sources/repo-schemas.d.ts
CHANGED
|
@@ -147,8 +147,8 @@ export declare const CanonicalTaskSchema: z.ZodDiscriminatedUnion<[z.ZodObject<{
|
|
|
147
147
|
baseline: z.ZodOptional<z.ZodObject<{
|
|
148
148
|
enabled: z.ZodOptional<z.ZodBoolean>;
|
|
149
149
|
rubric: z.ZodOptional<z.ZodEnum<{
|
|
150
|
-
abbreviated: "abbreviated";
|
|
151
150
|
full: "full";
|
|
151
|
+
abbreviated: "abbreviated";
|
|
152
152
|
none: "none";
|
|
153
153
|
}>>;
|
|
154
154
|
}, z.core.$strip>>;
|
|
@@ -773,8 +773,8 @@ export declare const ContentLakeAuthorableTaskSchema: z.ZodObject<{
|
|
|
773
773
|
baseline: z.ZodOptional<z.ZodObject<{
|
|
774
774
|
enabled: z.ZodOptional<z.ZodBoolean>;
|
|
775
775
|
rubric: z.ZodOptional<z.ZodEnum<{
|
|
776
|
-
abbreviated: "abbreviated";
|
|
777
776
|
full: "full";
|
|
777
|
+
abbreviated: "abbreviated";
|
|
778
778
|
none: "none";
|
|
779
779
|
}>>;
|
|
780
780
|
}, z.core.$strip>>;
|
|
@@ -893,8 +893,8 @@ export declare const CanonicalTaskFileSchema: z.ZodArray<z.ZodDiscriminatedUnion
|
|
|
893
893
|
baseline: z.ZodOptional<z.ZodObject<{
|
|
894
894
|
enabled: z.ZodOptional<z.ZodBoolean>;
|
|
895
895
|
rubric: z.ZodOptional<z.ZodEnum<{
|
|
896
|
-
abbreviated: "abbreviated";
|
|
897
896
|
full: "full";
|
|
897
|
+
abbreviated: "abbreviated";
|
|
898
898
|
none: "none";
|
|
899
899
|
}>>;
|
|
900
900
|
}, z.core.$strip>>;
|
|
package/dist/agent-observer/agentic-provider.js
CHANGED
|
@@ -479,20 +479,18 @@ export default class AgenticProvider {
|
|
|
479
479
|
// Jina search unavailable
|
|
480
480
|
}
|
|
481
481
|
}
|
|
482
|
-
// Final fallback:
|
|
482
|
+
// Final fallback: search returned nothing usable. Point the agent at
|
|
483
|
+
// llms.txt (a real, fetchable doc index) instead of fabricating a URL
|
|
484
|
+
// from the query slug — fabricated URLs 404 and mislead the agent into
|
|
485
|
+
// thinking the doc system is unreachable. See W0129.
|
|
483
486
|
if (results.length === 0) {
|
|
484
|
-
const sanitized = query
|
|
485
|
-
.toLowerCase()
|
|
486
|
-
.replace(/sanity\.?(io)?/gi, "")
|
|
487
|
-
.trim();
|
|
488
|
-
const slugGuess = sanitized
|
|
489
|
-
.replace(/\s+/g, "-")
|
|
490
|
-
.replace(/[^a-z0-9-]/g, "");
|
|
491
487
|
results = [
|
|
492
488
|
{
|
|
493
|
-
snippet: `
|
|
494
|
-
|
|
495
|
-
|
|
489
|
+
snippet: `No direct search results. The documentation index is available at ` +
|
|
490
|
+
`${this.llmsTxtUrl} — fetch it to discover real doc URLs, ` +
|
|
491
|
+
`then fetch_page specific topics.`,
|
|
492
|
+
title: `No results — try fetching ${this.llmsTxtUrl} for the doc index`,
|
|
493
|
+
url: this.llmsTxtUrl,
|
|
496
494
|
},
|
|
497
495
|
];
|
|
498
496
|
}
|
|
@@ -806,12 +804,14 @@ export default class AgenticProvider {
|
|
|
806
804
|
const maxToolRounds = this.config.maxToolRounds || 5;
|
|
807
805
|
const apiKey = this.config.apiKey || process.env.OPENAI_API_KEY;
|
|
808
806
|
// Newer OpenAI models (gpt-5.x, o-series) use max_completion_tokens
|
|
809
|
-
// instead of max_tokens
|
|
810
|
-
|
|
811
|
-
|
|
812
|
-
model.startsWith("gpt-5") ||
|
|
807
|
+
// instead of max_tokens, and reject custom temperature values. Detect
|
|
808
|
+
// from config or model name. See W0131.
|
|
809
|
+
const isReasoningModel = model.startsWith("gpt-5") ||
|
|
813
810
|
model.startsWith("o3") ||
|
|
814
811
|
model.startsWith("o4");
|
|
812
|
+
const useMaxCompletionTokens = this.config.max_output_tokens != null ||
|
|
813
|
+
this.config.max_completion_tokens != null ||
|
|
814
|
+
isReasoningModel;
|
|
815
815
|
const maxTokensValue = this.config.max_output_tokens ??
|
|
816
816
|
this.config.max_completion_tokens ??
|
|
817
817
|
this.config.max_tokens ??
|
|
@@ -840,15 +840,20 @@ export default class AgenticProvider {
|
|
|
840
840
|
const startTime = Date.now();
|
|
841
841
|
for (let round = 0; round <= maxToolRounds; round++) {
|
|
842
842
|
const isLastRound = round === maxToolRounds;
|
|
843
|
+
const requestBody = {
|
|
844
|
+
...tokenLimitParam,
|
|
845
|
+
messages,
|
|
846
|
+
model,
|
|
847
|
+
tool_choice: isLastRound ? "none" : "auto",
|
|
848
|
+
tools,
|
|
849
|
+
};
|
|
850
|
+
// gpt-5.x and o-series reject custom temperature; chat-completions
|
|
851
|
+
// models continue to receive the configured value. See W0131.
|
|
852
|
+
if (!isReasoningModel) {
|
|
853
|
+
requestBody.temperature = temperature;
|
|
854
|
+
}
|
|
843
855
|
const response = await fetchFn("https://api.openai.com/v1/chat/completions", {
|
|
844
|
-
body: JSON.stringify({
|
|
845
|
-
...tokenLimitParam,
|
|
846
|
-
messages,
|
|
847
|
-
model,
|
|
848
|
-
temperature,
|
|
849
|
-
tool_choice: isLastRound ? "none" : "auto",
|
|
850
|
-
tools,
|
|
851
|
-
}),
|
|
856
|
+
body: JSON.stringify(requestBody),
|
|
852
857
|
headers: {
|
|
853
858
|
Authorization: `Bearer ${apiKey}`,
|
|
854
859
|
"Content-Type": "application/json",
|
|
package/dist/agent-observer/classifier.js
CHANGED
|
@@ -65,6 +65,11 @@ export function classifyRequests(requests) {
|
|
|
65
65
|
// Skip failed requests (no response)
|
|
66
66
|
if (req.statusCode === 0)
|
|
67
67
|
continue;
|
|
68
|
+
// Status-only entries (W0132) carry no body, so we can't infer search
|
|
69
|
+
// queries or doc-page metadata reliably. They still count as API calls
|
|
70
|
+
// (Sanity API) or external requests (everything else) so the run shape
|
|
71
|
+
// shows that the call happened, but we skip the body-dependent buckets.
|
|
72
|
+
const isStatusOnly = req.capture === "status-only";
|
|
68
73
|
// Order matters: API calls first (they may have ?query= params that look like searches),
|
|
69
74
|
// then searches, then doc pages, then external
|
|
70
75
|
if (isSanityApiRequest(req)) {
|
|
@@ -75,14 +80,14 @@ export function classifyRequests(requests) {
|
|
|
75
80
|
url: req.url,
|
|
76
81
|
});
|
|
77
82
|
}
|
|
78
|
-
else if (isSearchRequest(req)) {
|
|
83
|
+
else if (!isStatusOnly && isSearchRequest(req)) {
|
|
79
84
|
result.searchQueries.push({
|
|
80
85
|
query: extractSearchQuery(req),
|
|
81
86
|
timestamp: req.timestamp,
|
|
82
87
|
url: req.url,
|
|
83
88
|
});
|
|
84
89
|
}
|
|
85
|
-
else if (isDocPageRequest(req)) {
|
|
90
|
+
else if (!isStatusOnly && isDocPageRequest(req)) {
|
|
86
91
|
const slug = extractDocSlug(req.url);
|
|
87
92
|
result.docPageVisits.push({
|
|
88
93
|
contentSize: req.responseSize,
|
|
package/dist/agent-observer/proxy.d.ts
CHANGED
|
@@ -21,6 +21,25 @@
|
|
|
21
21
|
*
|
|
22
22
|
* const log = recorder.stop()
|
|
23
23
|
* // → AgentBehaviorLog with all requests classified
|
|
24
|
+
*
|
|
25
|
+
* W0133 — per-class preview byte caps
|
|
26
|
+
*
|
|
27
|
+
* `responsePreview` is capped at `previewLimits.default` (4 KB) for most
|
|
28
|
+
* responses, with per-class overrides for two payloads whose contents are
|
|
29
|
+
* the ground truth for trace audits:
|
|
30
|
+
*
|
|
31
|
+
* - `previewLimits.search` (16 KB) — Jina-wrapped DuckDuckGo, Google CSE,
|
|
32
|
+
* bing.com/search, duckduckgo.com, google.com/search responses. Captures
|
|
33
|
+
* the full result list (typical 8–10 KB) so trace audits can resolve
|
|
34
|
+
* which result the model fetched next.
|
|
35
|
+
* - `previewLimits.llmsTxt` (128 KB) — `/llms.txt` responses. The Sanity
|
|
36
|
+
* index is ~110 KB. Capturing the full body lets trace audits
|
|
37
|
+
* distinguish "model fetched a path that wasn't in the index" from
|
|
38
|
+
* "model fetched a path that was in the index but the page is missing".
|
|
39
|
+
*
|
|
40
|
+
* The slim Content Lake report (W0051) does not inline previews — they
|
|
41
|
+
* live in the GCS `traces` NDJSON artifact only, so bumping these caps
|
|
42
|
+
* has no effect on the 10 MB Sanity document budget.
|
|
24
43
|
*/
|
|
25
44
|
import type { ObservedRequest, AgentBehaviorLog } from "./types.js";
|
|
26
45
|
export interface RecorderOptions {
|
|
@@ -31,13 +50,50 @@ export interface RecorderOptions {
|
|
|
31
50
|
/** Filter: skip requests matching these URL patterns. Default: skip none.
|
|
32
51
|
* Accepts RegExp or string (strings are auto-converted to case-insensitive RegExp). */
|
|
33
52
|
excludePatterns?: (RegExp | string)[];
|
|
34
|
-
/** Filter: only record requests matching these URL patterns. Default: record all.
|
|
35
|
-
*
|
|
53
|
+
/** Filter: only fully record requests matching these URL patterns. Default: record all fully.
|
|
54
|
+
* When `statusOnlyForUnmatched` is true (default), unmatched URLs still emit a slim
|
|
55
|
+
* status-only observation. Accepts RegExp or string (strings are auto-converted to
|
|
56
|
+
* case-insensitive RegExp). */
|
|
36
57
|
includePatterns?: (RegExp | string)[];
|
|
37
58
|
/** Maximum request body bytes to capture. Default: 4096 */
|
|
38
59
|
maxBodyBytes?: number;
|
|
39
|
-
/**
|
|
60
|
+
/**
|
|
61
|
+
* Default response preview byte cap. Default: 4096.
|
|
62
|
+
*
|
|
63
|
+
* Per-class overrides in `previewLimits` may extend this for specific
|
|
64
|
+
* URL patterns. If `previewLimits.default` is set, it wins
|
|
65
|
+
* over `maxPreviewBytes`.
|
|
66
|
+
*/
|
|
40
67
|
maxPreviewBytes?: number;
|
|
68
|
+
/**
|
|
69
|
+
* Per-class response preview byte caps (W0133). Lets the recorder
|
|
70
|
+
* capture larger previews for response classes whose contents are the
|
|
71
|
+
* ground truth for trace audits, without inflating preview size for
|
|
72
|
+
* generic responses.
|
|
73
|
+
*
|
|
74
|
+
* - `default` — used when no other class matches. Falls back to
|
|
75
|
+
* `maxPreviewBytes` when omitted (defaults to 4 KB).
|
|
76
|
+
* - `search` — Jina-wrapped DuckDuckGo, Google CSE, bing/duckduckgo,
|
|
77
|
+
* google.com/search responses. Default: 16 KB.
|
|
78
|
+
* - `llmsTxt` — `/llms.txt` responses. Default: 128 KB.
|
|
79
|
+
*/
|
|
80
|
+
previewLimits?: {
|
|
81
|
+
default?: number;
|
|
82
|
+
llmsTxt?: number;
|
|
83
|
+
search?: number;
|
|
84
|
+
};
|
|
85
|
+
/**
|
|
86
|
+
* When a URL matches no `includePatterns` entry and is not excluded by `excludePatterns`, emit a
|
|
87
|
+
* slim observation (url/method/statusCode/latencyMs/timestamp/seq, with
|
|
88
|
+
* `capture: "status-only"`) instead of dropping it entirely. Default: true.
|
|
89
|
+
*
|
|
90
|
+
* Setting to `false` restores strict-allowlist behavior — unmatched URLs
|
|
91
|
+
* are dropped, leaving no record of the call. The default exists so
|
|
92
|
+
* model-side traffic to api.openai.com / api.anthropic.com /
|
|
93
|
+
* googleapis.com is visible in run artifacts without recording prompts,
|
|
94
|
+
* completions, or API keys. See W0132.
|
|
95
|
+
*/
|
|
96
|
+
statusOnlyForUnmatched?: boolean;
|
|
41
97
|
}
|
|
42
98
|
export declare class RequestRecorder {
|
|
43
99
|
private observations;
|
|
@@ -69,8 +125,37 @@ export declare class RequestRecorder {
|
|
|
69
125
|
*
|
|
70
126
|
* Use this when you can't wrap `fetch` directly but can observe traffic
|
|
71
127
|
* (e.g., via browser DevTools Protocol, mitmproxy logs, etc.).
|
|
128
|
+
*
|
|
129
|
+
* Filter behavior (W0132):
|
|
130
|
+
* - `excludePatterns` always drops the observation entirely.
|
|
131
|
+
* - `includePatterns` mismatch produces a slim `capture: "status-only"`
|
|
132
|
+
* record when `statusOnlyForUnmatched` is true (default), or drops it
|
|
133
|
+
* when false.
|
|
134
|
+
* - The discriminator on the input is honored: callers that already
|
|
135
|
+
* know they're emitting a slim record (e.g., the fetch wrapper) can
|
|
136
|
+
* set `capture: "status-only"` themselves.
|
|
72
137
|
*/
|
|
73
138
|
record(observation: Omit<ObservedRequest, "seq">): void;
|
|
139
|
+
/**
|
|
140
|
+
* Resolve the preview byte cap for a given URL using per-class overrides
|
|
141
|
+
* (W0133). Order of preference:
|
|
142
|
+
* 1. `previewLimits.llmsTxt` for `/llms.txt` URLs.
|
|
143
|
+
* 2. `previewLimits.search` for known search providers.
|
|
144
|
+
* 3. `previewLimits.default`.
|
|
145
|
+
*/
|
|
146
|
+
private resolvePreviewBytes;
|
|
147
|
+
/**
|
|
148
|
+
* Decide how to record a URL given the current filter configuration.
|
|
149
|
+
*
|
|
150
|
+
* - `"drop"` — `excludePatterns` matched, or `includePatterns` failed
|
|
151
|
+
* and `statusOnlyForUnmatched` is false.
|
|
152
|
+
* - `"status-only"` — `includePatterns` failed but
|
|
153
|
+
* `statusOnlyForUnmatched` is true (default). Skip body/headers.
|
|
154
|
+
* - `"full"` — record everything.
|
|
155
|
+
*
|
|
156
|
+
* See W0132.
|
|
157
|
+
*/
|
|
158
|
+
private classifyCaptureMode;
|
|
74
159
|
/**
|
|
75
160
|
* Reset the recorder for reuse without creating a new instance.
|
|
76
161
|
*/
|