@sanity/ailf 7.0.1 → 7.1.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/config/rubrics.ts +12 -13
- package/dist/_vendor/ailf-core/ports/context.d.ts +45 -3
- package/dist/_vendor/ailf-core/ports/doc-fetcher.d.ts +10 -0
- package/dist/_vendor/ailf-core/ports/index.d.ts +1 -1
- package/dist/_vendor/ailf-core/schemas/branded-string.d.ts +9 -1
- package/dist/_vendor/ailf-core/schemas/branded-string.js +16 -6
- package/dist/_vendor/ailf-core/schemas/pipeline-request.d.ts +2 -0
- package/dist/_vendor/ailf-core/schemas/pipeline-request.js +7 -0
- package/dist/_vendor/ailf-core/schemas/report.d.ts +12 -0
- package/dist/_vendor/ailf-core/schemas/report.js +2 -0
- package/dist/_vendor/ailf-core/schemas/team.d.ts +22 -0
- package/dist/_vendor/ailf-core/schemas/team.js +63 -0
- package/dist/_vendor/ailf-core/types/grader-judgment.d.ts +51 -0
- package/dist/_vendor/ailf-core/types/index.d.ts +8 -1
- package/dist/_vendor/ailf-core/types/pipeline-request.d.ts +17 -0
- package/dist/_vendor/ailf-core/types/team.d.ts +65 -0
- package/dist/_vendor/ailf-core/types/team.js +1 -0
- package/dist/_vendor/ailf-shared/document-ref.d.ts +29 -1
- package/dist/_vendor/ailf-shared/document-ref.js +23 -1
- package/dist/_vendor/ailf-shared/eval-modes.d.ts +2 -0
- package/dist/_vendor/ailf-shared/eval-modes.js +5 -0
- package/dist/_vendor/ailf-shared/event-types.d.ts +15 -0
- package/dist/_vendor/ailf-shared/event-types.js +23 -0
- package/dist/_vendor/ailf-shared/generated/help-content.js +2 -2
- package/dist/_vendor/ailf-shared/index.d.ts +5 -3
- package/dist/_vendor/ailf-shared/index.js +5 -2
- package/dist/_vendor/ailf-shared/member-roles.d.ts +16 -0
- package/dist/_vendor/ailf-shared/member-roles.js +16 -0
- package/dist/_vendor/ailf-shared/owner-teams.d.ts +19 -0
- package/dist/_vendor/ailf-shared/owner-teams.js +26 -6
- package/dist/_vendor/ailf-shared/run-context.d.ts +8 -1
- package/dist/adapters/doc-fetchers/sanity-doc-fetcher.js +15 -1
- package/dist/adapters/grader-outputs/promptfoo-grader-output.d.ts +65 -1
- package/dist/adapters/grader-outputs/promptfoo-grader-output.js +35 -0
- package/dist/adapters/task-sources/changed-docs-filter.d.ts +12 -0
- package/dist/adapters/task-sources/changed-docs-filter.js +30 -0
- package/dist/adapters/task-sources/content-lake-task-source.js +14 -8
- package/dist/adapters/task-sources/repo-task-source.js +2 -1
- package/dist/commands/pipeline-action.d.ts +4 -3
- package/dist/commands/pipeline-action.js +7 -5
- package/dist/commands/run.js +2 -2
- package/dist/config/rubrics.ts +12 -13
- package/dist/job-store.d.ts +18 -0
- package/dist/job-store.js +34 -0
- package/dist/orchestration/build-app-context.js +8 -1
- package/dist/orchestration/pipeline-orchestrator.js +46 -1
- package/dist/orchestration/steps/compare-step.d.ts +7 -0
- package/dist/orchestration/steps/compare-step.js +59 -23
- package/dist/orchestration/steps/fetch-docs-step.js +3 -0
- package/dist/orchestration/steps/finalize-run-step.js +2 -0
- package/dist/orchestration/steps/gap-analysis-step.js +9 -8
- package/dist/orchestration/steps/generate-configs-step.d.ts +32 -1
- package/dist/orchestration/steps/generate-configs-step.js +47 -13
- package/dist/orchestration/steps/grader-consistency-step.js +11 -0
- package/dist/orchestration/steps/publish-report-step.d.ts +12 -1
- package/dist/orchestration/steps/publish-report-step.js +36 -8
- package/dist/pipeline/cache-hit-restore.d.ts +14 -1
- package/dist/pipeline/cache-hit-restore.js +17 -0
- package/dist/pipeline/calculate-scores.d.ts +13 -1
- package/dist/pipeline/calculate-scores.js +123 -29
- package/dist/pipeline/compiler/mode-handlers/literacy/assertions.d.ts +7 -2
- package/dist/pipeline/compiler/mode-handlers/literacy/assertions.js +13 -4
- package/dist/pipeline/compiler/mode-handlers/literacy/compiler.js +1 -1
- package/dist/pipeline/compiler/provider-assembler.d.ts +15 -1
- package/dist/pipeline/compiler/provider-assembler.js +16 -3
- package/dist/pipeline/failure-modes.d.ts +20 -10
- package/dist/pipeline/failure-modes.js +84 -15
- package/dist/pipeline/map-request-to-config.js +2 -0
- package/dist/pipeline/normalize-mode.d.ts +1 -1
- package/dist/pipeline/normalize-mode.js +2 -0
- package/dist/pipeline/run-context.d.ts +16 -1
- package/dist/pipeline/run-context.js +12 -1
- package/dist/pipeline/validate.d.ts +8 -4
- package/dist/pipeline/validate.js +8 -18
- package/dist/report-store.d.ts +14 -1
- package/dist/report-store.js +32 -0
- package/dist/sanity/client.js +2 -2
- package/dist/sanity/queries.d.ts +1 -1
- package/dist/sanity/queries.js +1 -0
- package/dist/sources.js +40 -2
- package/package.json +1 -1
package/config/rubrics.ts
CHANGED
|
@@ -15,10 +15,6 @@ import { defineRubrics } from "@sanity/ailf-core"
|
|
|
15
15
|
// template entry below. Source of truth lives in packages/eval/src/grader/;
|
|
16
16
|
// the helper picks the right list by dimension family.
|
|
17
17
|
import { failureModesForDimension } from "../src/grader/index.js"
|
|
18
|
-
// Single source of truth for the wire-format version stamped into the
|
|
19
|
-
// grader-prompt footer (VER-01 D-02). Interpolated below so the
|
|
20
|
-
// announced version cannot drift from the schema's expected value.
|
|
21
|
-
import { graderJudgmentsVersion } from "../src/adapters/grader-outputs/index.js"
|
|
22
18
|
|
|
23
19
|
export default defineRubrics({
|
|
24
20
|
templates: {
|
|
@@ -242,20 +238,23 @@ export default defineRubrics({
|
|
|
242
238
|
"agent-harness": { gold: "agent-harness" },
|
|
243
239
|
},
|
|
244
240
|
|
|
245
|
-
//
|
|
246
|
-
//
|
|
247
|
-
//
|
|
248
|
-
//
|
|
241
|
+
// W0273 — the footer documents the wire-format subset of GraderJudgment
|
|
242
|
+
// that the grader LLM actually controls. The pipeline parses this against
|
|
243
|
+
// GraderEmittedJudgmentSchema and then synthesizes the four pipeline-owned
|
|
244
|
+
// fields (judgmentId, metadata.{graderModel,graderJudgmentsVersion},
|
|
245
|
+
// hallucinationCheckedAgainst) to build the storage GraderJudgment.
|
|
246
|
+
//
|
|
247
|
+
// See docs/audits/2026-05-22-empty-gap-analysis-regression.md for the
|
|
248
|
+
// rationale (Phase 3 GRAD-05 made these fields required + .strict(),
|
|
249
|
+
// and asking the LLM for pipeline-owned values caused 100% parse
|
|
250
|
+
// failures starting 2026-05-11).
|
|
249
251
|
footer: `Return ONLY a JSON object with this exact shape:
|
|
250
252
|
{
|
|
251
|
-
"judgmentId": "<string>",
|
|
252
253
|
"score": <number 0-100>,
|
|
253
254
|
"reason": "<explanation, ≤500 chars>",
|
|
255
|
+
"failureMode": "<one of the declared modes for this dimension or 'unclassified'>",
|
|
254
256
|
"subJudgments": [{ "criterionId": "<id>", "met": <bool>, "evidence": "<≤280 chars>", "confidence": { "level": "high|medium|low", "signalsPresent": <int>, "derivation": "<string>" } }],
|
|
255
257
|
"docCitations": [{ "documentId": "<id>", "slug": "<optional slug>", "role": "supports|contradicts|missing|irrelevant", "hallucinated": <bool> }],
|
|
256
|
-
"
|
|
257
|
-
"confidence": { "level": "high|medium|low", "signalsPresent": <int>, "derivation": "<string>" },
|
|
258
|
-
"hallucinationCheckedAgainst": ["<doc id>"],
|
|
259
|
-
"metadata": { "graderModel": "<string>", "graderJudgmentsVersion": "${graderJudgmentsVersion}" }
|
|
258
|
+
"confidence": { "level": "high|medium|low", "signalsPresent": <int>, "derivation": "<string>" }
|
|
260
259
|
}`,
|
|
261
260
|
})
|
|
@@ -11,7 +11,7 @@
|
|
|
11
11
|
* Fields marked optional are transitional — they will become required
|
|
12
12
|
* as downstream consumers are converted to use them.
|
|
13
13
|
*/
|
|
14
|
-
import type { RunClassification, RunExecutorSurface } from "../../ailf-shared/index.d.ts";
|
|
14
|
+
import type { LiteracyVariant, RunClassification, RunExecutorSurface } from "../../ailf-shared/index.d.ts";
|
|
15
15
|
import type { RunId } from "../types/branded-ids.js";
|
|
16
16
|
import type { DebugOptions, EvalMode, PluginRegistry } from "../types/index.js";
|
|
17
17
|
import type { ArtifactWriter } from "./artifact-writer.js";
|
|
@@ -42,12 +42,20 @@ export interface ResolvedConfig {
|
|
|
42
42
|
* `mode: "literacy", variant: "baseline"`. This keeps the pipeline
|
|
43
43
|
* mode-agnostic while preserving literacy's multi-variant behavior.
|
|
44
44
|
*
|
|
45
|
-
* Values: "baseline" | "agentic" | "observed" | "full" | undefined
|
|
46
45
|
* Undefined means "use the default variant for the mode" (baseline for literacy).
|
|
47
46
|
*/
|
|
48
|
-
variant?:
|
|
47
|
+
variant?: LiteracyVariant;
|
|
49
48
|
/** Debug options */
|
|
50
49
|
debug?: DebugOptions;
|
|
50
|
+
/**
|
|
51
|
+
* Filter the evaluated cohort to a subset of the configured model IDs.
|
|
52
|
+
*
|
|
53
|
+
* Each entry must match the `id` of a model declared in
|
|
54
|
+
* `config/models.ts`. Unknown IDs are dropped at the runner with a
|
|
55
|
+
* structured warning AND surfaced on the job's `error` field so callers
|
|
56
|
+
* can detect typos — silent strips are not acceptable.
|
|
57
|
+
*/
|
|
58
|
+
models?: string[];
|
|
51
59
|
/** Feature area filter */
|
|
52
60
|
areas?: string[];
|
|
53
61
|
/** Task ID filter */
|
|
@@ -68,6 +76,12 @@ export interface ResolvedConfig {
|
|
|
68
76
|
compareThreshold?: number;
|
|
69
77
|
/** Comparison baseline path */
|
|
70
78
|
compareBaseline?: string;
|
|
79
|
+
/**
|
|
80
|
+
* Comparison baseline expressed as a previously-published
|
|
81
|
+
* `ailf.report` document id. Takes precedence over `compareBaseline`
|
|
82
|
+
* when both are set.
|
|
83
|
+
*/
|
|
84
|
+
compareBaselineReportId?: string;
|
|
71
85
|
/** Whether gap analysis is enabled */
|
|
72
86
|
gapAnalysisEnabled: boolean;
|
|
73
87
|
/** Whether publishing is enabled */
|
|
@@ -323,6 +337,26 @@ export interface AppContext {
|
|
|
323
337
|
/** Task definition source (YAML, Content Lake, repo) */
|
|
324
338
|
readonly taskSource: TaskSource;
|
|
325
339
|
}
|
|
340
|
+
/**
|
|
341
|
+
* Discriminated result for `ReportStorePort.loadBaselineFromReport`.
|
|
342
|
+
*
|
|
343
|
+
* Lets the compare step distinguish a genuine 404 (the pinned report
|
|
344
|
+
* doesn't exist — skip with a clear reason) from a transport failure
|
|
345
|
+
* (Sanity 5xx, network blew up — fail the step so the user knows the
|
|
346
|
+
* pinned baseline didn't actually compare). The `baseline` payload is
|
|
347
|
+
* typed as `unknown` to keep the port surface decoupled from the eval
|
|
348
|
+
* package's `ComparableSummary` type — concrete implementations return
|
|
349
|
+
* a more specific shape, which is sound.
|
|
350
|
+
*/
|
|
351
|
+
export type LoadBaselineResult = {
|
|
352
|
+
kind: "ok";
|
|
353
|
+
baseline: unknown;
|
|
354
|
+
} | {
|
|
355
|
+
kind: "not_found";
|
|
356
|
+
} | {
|
|
357
|
+
kind: "error";
|
|
358
|
+
message: string;
|
|
359
|
+
};
|
|
326
360
|
/**
|
|
327
361
|
* Minimal report store interface used by AppContext.
|
|
328
362
|
*
|
|
@@ -341,6 +375,14 @@ export interface ReportStorePort {
|
|
|
341
375
|
write(report: unknown): Promise<unknown>;
|
|
342
376
|
/** Read a report by its ID (used by the post-run diagnosis hook). */
|
|
343
377
|
read(id: string): Promise<null | unknown>;
|
|
378
|
+
/**
|
|
379
|
+
* Load a previously-published report's score summary as a baseline
|
|
380
|
+
* for the `compare` step. Returns a discriminated result so callers
|
|
381
|
+
* can distinguish a genuine 404 (skip with a clear reason) from a
|
|
382
|
+
* transport failure (fail the step — the user pinned a baseline and
|
|
383
|
+
* deserves to know it didn't actually compare).
|
|
384
|
+
*/
|
|
385
|
+
loadBaselineFromReport(reportId: string): Promise<LoadBaselineResult>;
|
|
344
386
|
/** Patch synthesis telemetry onto a published report (Phase 6 / DIAG-06). */
|
|
345
387
|
patchSynthesis(id: string, telemetry: unknown): Promise<void>;
|
|
346
388
|
/**
|
|
@@ -53,6 +53,16 @@ export interface DocumentManifestEntry {
|
|
|
53
53
|
_id: string;
|
|
54
54
|
_rev: string;
|
|
55
55
|
slug: string;
|
|
56
|
+
/** Parent section slug (`primarySection->slug.current`), when resolvable. */
|
|
57
|
+
sectionSlug?: string;
|
|
58
|
+
/**
|
|
59
|
+
* Full URL path under `/docs/` (e.g. `content-lake/groq-introduction`)
|
|
60
|
+
* composed via `buildContextDocPath` from `sectionSlug + "/" + slug`.
|
|
61
|
+
* Optional — historical manifests written before W0287 only carry
|
|
62
|
+
* `slug`; downstream `DocumentRef` builders fall back to slug-only
|
|
63
|
+
* display when this is absent.
|
|
64
|
+
*/
|
|
65
|
+
path?: string;
|
|
56
66
|
title: string;
|
|
57
67
|
}
|
|
58
68
|
/** Impact of a content release on canonical documents */
|
|
@@ -8,7 +8,7 @@ export type { ArtifactEntry, ArtifactWriter } from "./artifact-writer.js";
|
|
|
8
8
|
export { NoOpArtifactWriter } from "./artifact-writer.js";
|
|
9
9
|
export type { CacheEntryMetadata, CacheKey, CacheLookupResult, CacheRecordInput, CacheStore, } from "./cache-store.js";
|
|
10
10
|
export type { ConfigSource } from "./config-source.js";
|
|
11
|
-
export type { AppContext, ReportSinkPort, ReportStorePort, ResolvedConfig, } from "./context.js";
|
|
11
|
+
export type { AppContext, LoadBaselineResult, ReportSinkPort, ReportStorePort, ResolvedConfig, } from "./context.js";
|
|
12
12
|
export type { DocContext, DocFetcher, DocSourceConfig, DocumentManifestEntry, DocumentOverlaySummary, FetchMetadata, FetchResult, ReleaseImpact, SymbolIndexManifestEntry, UrlFetchEntry, UrlFetchSummary, } from "./doc-fetcher.js";
|
|
13
13
|
export type { EvalRunConfig, EvalRunner } from "./eval-runner.js";
|
|
14
14
|
export type { LLMCallContext, LLMClient, LLMCompleteArgs, LLMCompleteStructuredArgs, LLMCompletion, LLMStructuredCompletion, LLMUsage, ModelId, ModelProvider, ParsedModelId, } from "./llm-client.js";
|
|
@@ -36,5 +36,13 @@ import type { Brand } from "../types/branded-ids.js";
|
|
|
36
36
|
* exit from the project's no-`as`-on-`unknown` rule. Adapters MUST
|
|
37
37
|
* NOT replicate the cast at their own call sites — call this helper
|
|
38
38
|
* instead so the rule violation stays centralized.
|
|
39
|
+
*
|
|
40
|
+
* Pass `regex` to enforce a stricter shape than non-empty. The
|
|
41
|
+
* runtime validator becomes `z.string().regex(regex)` instead of
|
|
42
|
+
* `z.string().min(1)`; the brand-cast at the call boundary is
|
|
43
|
+
* unchanged. Callers passing `regex` are responsible for ensuring
|
|
44
|
+
* it rejects the empty string (typically anchor with `^` and
|
|
45
|
+
* require at least one character via `+` or a non-`*` quantifier);
|
|
46
|
+
* the `.min(1)` floor is dropped when `regex` is supplied.
|
|
39
47
|
*/
|
|
40
|
-
export declare function brandedString<TBrand extends string>(): z.ZodType<Brand<string, TBrand>>;
|
|
48
|
+
export declare function brandedString<TBrand extends string>(regex?: RegExp): z.ZodType<Brand<string, TBrand>>;
|
|
@@ -35,11 +35,21 @@ import { z } from "zod";
|
|
|
35
35
|
* exit from the project's no-`as`-on-`unknown` rule. Adapters MUST
|
|
36
36
|
* NOT replicate the cast at their own call sites — call this helper
|
|
37
37
|
* instead so the rule violation stays centralized.
|
|
38
|
+
*
|
|
39
|
+
* Pass `regex` to enforce a stricter shape than non-empty. The
|
|
40
|
+
* runtime validator becomes `z.string().regex(regex)` instead of
|
|
41
|
+
* `z.string().min(1)`; the brand-cast at the call boundary is
|
|
42
|
+
* unchanged. Callers passing `regex` are responsible for ensuring
|
|
43
|
+
* it rejects the empty string (typically anchor with `^` and
|
|
44
|
+
* require at least one character via `+` or a non-`*` quantifier);
|
|
45
|
+
* the `.min(1)` floor is dropped when `regex` is supplied.
|
|
38
46
|
*/
|
|
39
|
-
export function brandedString() {
|
|
40
|
-
// The runtime is a plain non-empty
|
|
41
|
-
// compile-time-only nominal tag (see `Brand<>` in
|
|
42
|
-
// Zod 4's `.brand()` uses a different symbol
|
|
43
|
-
// composition does not yield the project's
|
|
44
|
-
|
|
47
|
+
export function brandedString(regex) {
|
|
48
|
+
// The runtime is a plain string (non-empty or regex-validated);
|
|
49
|
+
// the brand is a compile-time-only nominal tag (see `Brand<>` in
|
|
50
|
+
// branded-ids.ts). Zod 4's `.brand()` uses a different symbol
|
|
51
|
+
// shape, so a direct composition does not yield the project's
|
|
52
|
+
// `Brand<…>` type.
|
|
53
|
+
const base = regex === undefined ? z.string().min(1) : z.string().regex(regex);
|
|
54
|
+
return base;
|
|
45
55
|
}
|
|
@@ -33,6 +33,7 @@ export declare const PipelineRequestSchema: z.ZodObject<{
|
|
|
33
33
|
changedDocs: z.ZodOptional<z.ZodArray<z.ZodString>>;
|
|
34
34
|
compare: z.ZodOptional<z.ZodBoolean>;
|
|
35
35
|
compareBaseline: z.ZodOptional<z.ZodString>;
|
|
36
|
+
compareBaselineReportId: z.ZodOptional<z.ZodString>;
|
|
36
37
|
compareThreshold: z.ZodOptional<z.ZodNumber>;
|
|
37
38
|
concurrency: z.ZodOptional<z.ZodNumber>;
|
|
38
39
|
dataset: z.ZodOptional<z.ZodString>;
|
|
@@ -63,6 +64,7 @@ export declare const PipelineRequestSchema: z.ZodObject<{
|
|
|
63
64
|
observed: "observed";
|
|
64
65
|
full: "full";
|
|
65
66
|
}>>;
|
|
67
|
+
models: z.ZodOptional<z.ZodArray<z.ZodString>>;
|
|
66
68
|
noAutoScope: z.ZodOptional<z.ZodBoolean>;
|
|
67
69
|
noCache: z.ZodOptional<z.ZodBoolean>;
|
|
68
70
|
noRemoteCache: z.ZodOptional<z.ZodBoolean>;
|
|
@@ -101,6 +101,7 @@ export const PipelineRequestSchema = z.object({
|
|
|
101
101
|
changedDocs: z.array(z.string()).optional(),
|
|
102
102
|
compare: z.boolean().optional(),
|
|
103
103
|
compareBaseline: z.string().optional(),
|
|
104
|
+
compareBaselineReportId: z.string().min(1).optional(),
|
|
104
105
|
compareThreshold: z.number().min(0).optional(),
|
|
105
106
|
concurrency: z.number().int().positive().optional(),
|
|
106
107
|
dataset: z.string().optional(),
|
|
@@ -123,6 +124,12 @@ export const PipelineRequestSchema = z.object({
|
|
|
123
124
|
* Legacy names must pass through normalizeMode() before entering typed pipeline code.
|
|
124
125
|
*/
|
|
125
126
|
mode: z.enum(RAW_EVAL_MODES).optional(),
|
|
127
|
+
/**
|
|
128
|
+
* Filter the evaluation cohort to a subset of the configured model IDs
|
|
129
|
+
* (W0281). Unknown IDs are dropped at the runner with a structured
|
|
130
|
+
* warning + job-error patch.
|
|
131
|
+
*/
|
|
132
|
+
models: z.array(z.string().min(1)).optional(),
|
|
126
133
|
noAutoScope: z.boolean().optional(),
|
|
127
134
|
noCache: z.boolean().optional(),
|
|
128
135
|
noRemoteCache: z.boolean().optional(),
|
|
@@ -113,6 +113,12 @@ export declare const ReportProvenanceSchema: z.ZodObject<{
|
|
|
113
113
|
documentId: z.ZodOptional<z.ZodString>;
|
|
114
114
|
source: z.ZodString;
|
|
115
115
|
}, z.core.$strict>], "type">;
|
|
116
|
+
variant: z.ZodOptional<z.ZodEnum<{
|
|
117
|
+
agentic: "agentic";
|
|
118
|
+
baseline: "baseline";
|
|
119
|
+
observed: "observed";
|
|
120
|
+
full: "full";
|
|
121
|
+
}>>;
|
|
116
122
|
autoScope: z.ZodOptional<z.ZodObject<{
|
|
117
123
|
enabled: z.ZodBoolean;
|
|
118
124
|
affectedTaskIds: z.ZodArray<z.ZodString>;
|
|
@@ -222,6 +228,12 @@ export declare const ReportSchema: z.ZodObject<{
|
|
|
222
228
|
documentId: z.ZodOptional<z.ZodString>;
|
|
223
229
|
source: z.ZodString;
|
|
224
230
|
}, z.core.$strict>], "type">;
|
|
231
|
+
variant: z.ZodOptional<z.ZodEnum<{
|
|
232
|
+
agentic: "agentic";
|
|
233
|
+
baseline: "baseline";
|
|
234
|
+
observed: "observed";
|
|
235
|
+
full: "full";
|
|
236
|
+
}>>;
|
|
225
237
|
autoScope: z.ZodOptional<z.ZodObject<{
|
|
226
238
|
enabled: z.ZodBoolean;
|
|
227
239
|
affectedTaskIds: z.ZodArray<z.ZodString>;
|
|
@@ -24,6 +24,7 @@
|
|
|
24
24
|
* @see docs/work-items/W0191-report-store-schema-gate.json
|
|
25
25
|
*/
|
|
26
26
|
import { z } from "zod";
|
|
27
|
+
import { LITERACY_VARIANTS } from "../../ailf-shared/index.js";
|
|
27
28
|
// ---------------------------------------------------------------------------
|
|
28
29
|
// RunContext building blocks (mirrors packages/shared/src/run-context.ts)
|
|
29
30
|
// ---------------------------------------------------------------------------
|
|
@@ -195,6 +196,7 @@ export const ReportProvenanceSchema = z
|
|
|
195
196
|
taskIds: z.array(z.string()).optional(),
|
|
196
197
|
tool: RunToolSchema.optional(),
|
|
197
198
|
trigger: RunTriggerSchema,
|
|
199
|
+
variant: z.enum(LITERACY_VARIANTS).optional(),
|
|
198
200
|
// ReportProvenance additions
|
|
199
201
|
autoScope: ReportAutoScopeSchema.optional(),
|
|
200
202
|
contextHash: z.string().optional(),
|
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
import { z } from "zod";
|
|
2
|
+
import type { NotificationChannel } from "../types/team.js";
|
|
3
|
+
export declare const TeamSchema: z.ZodObject<{
|
|
4
|
+
id: z.ZodType<import("../index.js").Brand<string, "TeamId">, unknown, z.core.$ZodTypeInternals<import("../index.js").Brand<string, "TeamId">, unknown>>;
|
|
5
|
+
slug: z.ZodType<import("../index.js").Brand<string, "TeamSlug">, unknown, z.core.$ZodTypeInternals<import("../index.js").Brand<string, "TeamSlug">, unknown>>;
|
|
6
|
+
displayName: z.ZodString;
|
|
7
|
+
description: z.ZodOptional<z.ZodString>;
|
|
8
|
+
status: z.ZodEnum<{
|
|
9
|
+
active: "active";
|
|
10
|
+
archived: "archived";
|
|
11
|
+
}>;
|
|
12
|
+
members: z.ZodArray<z.ZodObject<{
|
|
13
|
+
email: z.ZodOptional<z.ZodString>;
|
|
14
|
+
sanityUserId: z.ZodOptional<z.ZodString>;
|
|
15
|
+
githubUsername: z.ZodOptional<z.ZodString>;
|
|
16
|
+
displayName: z.ZodOptional<z.ZodString>;
|
|
17
|
+
role: z.ZodOptional<z.ZodString>;
|
|
18
|
+
lastVerifiedAt: z.ZodOptional<z.ZodString>;
|
|
19
|
+
}, z.core.$strip>>;
|
|
20
|
+
repos: z.ZodOptional<z.ZodArray<z.ZodString>>;
|
|
21
|
+
notifications: z.ZodOptional<z.ZodArray<z.ZodType<NotificationChannel, unknown, z.core.$ZodTypeInternals<NotificationChannel, unknown>>>>;
|
|
22
|
+
}, z.core.$strip>;
|
|
@@ -0,0 +1,63 @@
|
|
|
1
|
+
import { z } from "zod";
|
|
2
|
+
import { brandedString } from "./branded-string.js";
|
|
3
|
+
const SLUG_REGEX = /^[a-z0-9][a-z0-9-]*$/;
|
|
4
|
+
const TEAM_ID_REGEX = /^ailf\.team\.[a-z0-9][a-z0-9-]*$/;
|
|
5
|
+
const TeamMemberSchema = z
|
|
6
|
+
.object({
|
|
7
|
+
email: z.string().email().optional(),
|
|
8
|
+
sanityUserId: z.string().optional(),
|
|
9
|
+
githubUsername: z.string().optional(),
|
|
10
|
+
displayName: z.string().optional(),
|
|
11
|
+
role: z.string().optional(),
|
|
12
|
+
lastVerifiedAt: z.string().datetime().optional(),
|
|
13
|
+
})
|
|
14
|
+
.refine((m) => Boolean(m.email || m.sanityUserId || m.githubUsername), {
|
|
15
|
+
message: "TeamMember requires at least one of email, sanityUserId, githubUsername",
|
|
16
|
+
});
|
|
17
|
+
const ChannelScopeSchema = z.discriminatedUnion("type", [
|
|
18
|
+
z.object({ type: z.literal("owned") }),
|
|
19
|
+
z.object({ type: z.literal("all") }),
|
|
20
|
+
z.object({ type: z.literal("areas"), areas: z.array(z.string()) }),
|
|
21
|
+
z.object({ type: z.literal("repos"), repos: z.array(z.string()) }),
|
|
22
|
+
z.object({ type: z.literal("tags"), tags: z.array(z.string()) }),
|
|
23
|
+
]);
|
|
24
|
+
const SlackChannelSchema = z.object({
|
|
25
|
+
_key: z.string(),
|
|
26
|
+
type: z.literal("slack"),
|
|
27
|
+
channelId: z.string().min(1),
|
|
28
|
+
channelName: z.string().optional(),
|
|
29
|
+
purpose: z.string().optional(),
|
|
30
|
+
events: z.array(z.string()).optional(),
|
|
31
|
+
scope: ChannelScopeSchema.optional(),
|
|
32
|
+
});
|
|
33
|
+
const EmailChannelSchema = z.object({
|
|
34
|
+
_key: z.string(),
|
|
35
|
+
type: z.literal("email"),
|
|
36
|
+
addresses: z.array(z.string().email()).min(1),
|
|
37
|
+
purpose: z.string().optional(),
|
|
38
|
+
events: z.array(z.string()).optional(),
|
|
39
|
+
scope: ChannelScopeSchema.optional(),
|
|
40
|
+
});
|
|
41
|
+
const WebhookChannelSchema = z.object({
|
|
42
|
+
_key: z.string(),
|
|
43
|
+
type: z.literal("webhook"),
|
|
44
|
+
logicalName: z.string().min(1),
|
|
45
|
+
purpose: z.string().optional(),
|
|
46
|
+
events: z.array(z.string()).optional(),
|
|
47
|
+
scope: ChannelScopeSchema.optional(),
|
|
48
|
+
});
|
|
49
|
+
const NotificationChannelSchema = z.discriminatedUnion("type", [
|
|
50
|
+
SlackChannelSchema,
|
|
51
|
+
EmailChannelSchema,
|
|
52
|
+
WebhookChannelSchema,
|
|
53
|
+
]);
|
|
54
|
+
export const TeamSchema = z.object({
|
|
55
|
+
id: brandedString(TEAM_ID_REGEX),
|
|
56
|
+
slug: brandedString(SLUG_REGEX),
|
|
57
|
+
displayName: z.string().min(1),
|
|
58
|
+
description: z.string().optional(),
|
|
59
|
+
status: z.enum(["active", "archived"]),
|
|
60
|
+
members: z.array(TeamMemberSchema).min(1),
|
|
61
|
+
repos: z.array(z.string()).optional(),
|
|
62
|
+
notifications: z.array(NotificationChannelSchema).optional(),
|
|
63
|
+
});
|
|
@@ -123,3 +123,54 @@ export interface GraderJudgment {
|
|
|
123
123
|
graderJudgmentsVersion: string;
|
|
124
124
|
};
|
|
125
125
|
}
|
|
126
|
+
/**
|
|
127
|
+
* Wire-format subset of {@link GraderJudgment} — the fields a grader LLM
|
|
128
|
+
* is responsible for emitting in its JSON response. The pipeline parses
|
|
129
|
+
* untrusted grader output against this shape, then synthesizes the
|
|
130
|
+
* remaining storage fields (`taskId`, `modelId`, `dimension`, `judgmentId`,
|
|
131
|
+
* `metadata.{graderModel, graderJudgmentsVersion}`, and
|
|
132
|
+
* `hallucinationCheckedAgainst`) from server-side context.
|
|
133
|
+
*
|
|
134
|
+
* The split exists because four of `GraderJudgment`'s required fields are
|
|
135
|
+
* pipeline-owned semantics the LLM cannot produce correctly:
|
|
136
|
+
*
|
|
137
|
+
* - `judgmentId` — D0052 branded id with `(taskId, modelId, dimension,
|
|
138
|
+
* runId)` uniqueness invariant. Minted by `generateJudgmentId`.
|
|
139
|
+
* - `metadata.graderJudgmentsVersion` — static constant co-located with
|
|
140
|
+
* the schema (`promptfoo-grader-output.ts:48`).
|
|
141
|
+
* - `metadata.graderModel` — the grader's deployment alias (pipeline
|
|
142
|
+
* knows from provider config; the LLM doesn't reliably know its own).
|
|
143
|
+
* - `hallucinationCheckedAgainst` — the resolvable-set union of
|
|
144
|
+
* `task.context.docs` and `run.documentManifest`, composed by
|
|
145
|
+
* `populateHallucinationFields` (gap-analysis-step.ts).
|
|
146
|
+
*
|
|
147
|
+
* Asking the LLM for any of these produces drift; `.strict()` on
|
|
148
|
+
* `GraderJudgmentSchema` amplifies that drift into 100% parse failures
|
|
149
|
+
* (the 2026-05-11 empty-gapReport regression — see W0273 and
|
|
150
|
+
* `docs/audits/2026-05-22-empty-gap-analysis-regression.md`).
|
|
151
|
+
*
|
|
152
|
+
* `taskId`, `modelId`, and `dimension` are also pipeline-supplied (from
|
|
153
|
+
* `result.description`, `result.providerId`, and the rubric-classifier
|
|
154
|
+
* output in `calculate-scores.ts:475-479`) — kept out of the wire shape
|
|
155
|
+
* for the same reason.
|
|
156
|
+
*/
|
|
157
|
+
export interface GraderEmittedJudgment {
|
|
158
|
+
/** Numeric score in [0, 100] (normalized). */
|
|
159
|
+
score: number;
|
|
160
|
+
/** The grader's natural-language reasoning. */
|
|
161
|
+
reason: string;
|
|
162
|
+
/** Per-dimension failure mode (must match the legal-mode list in the rubric). */
|
|
163
|
+
failureMode: string;
|
|
164
|
+
/** Per-criterion sub-judgments. */
|
|
165
|
+
subJudgments: CriterionSubJudgment[];
|
|
166
|
+
/** Doc citations with role + hallucinated flag. */
|
|
167
|
+
docCitations: DocCitation[];
|
|
168
|
+
/** Grader self-confidence per D0049. */
|
|
169
|
+
confidence: Confidence;
|
|
170
|
+
/**
|
|
171
|
+
* True when the candidate response was empty/whitespace/refused. The
|
|
172
|
+
* pipeline also independently detects this from
|
|
173
|
+
* `result.response.output` — both signals are OR'd.
|
|
174
|
+
*/
|
|
175
|
+
outputFailure?: boolean;
|
|
176
|
+
}
|
|
@@ -39,8 +39,9 @@ export type { AgentHarnessTaskDefinition, ContentLakeAuthorableMode, ContentLake
|
|
|
39
39
|
export type { ActionSuggestion, AreaSummaryBody, CardMeta, CardType, Diagnosis, DiagnosisCard, DocAttributionSpotlightBody, FailureModeSummaryBody, JudgmentRef, LowConfidenceAttributionBody, NoIssuesBody, RegressionVsBaselineBody, TopRecommendationsBody, VersionedInputs, WeakestAreaBody, } from "./diagnosis.js";
|
|
40
40
|
export type { SynthesisCostTelemetry, SynthesisPerCardTelemetry, } from "./synthesis-telemetry.js";
|
|
41
41
|
export type { AttributionMeta, DocAttribution, JudgmentAttribution, } from "./attribution.js";
|
|
42
|
-
export type { CriterionSubJudgment, DocCitation, DocCitationRole, GraderJudgment, } from "./grader-judgment.js";
|
|
42
|
+
export type { CriterionSubJudgment, DocCitation, DocCitationRole, GraderEmittedJudgment, GraderJudgment, } from "./grader-judgment.js";
|
|
43
43
|
export type { LegacyGraderJudgment } from "./legacy-grader-judgment.js";
|
|
44
|
+
export type { BaseChannel, ChannelScope, EmailChannel, EventType, KnownEventType, KnownMemberRole, MemberRole, NotificationChannel, NotificationChannelType, SlackChannel, Team, TeamId, TeamMember, TeamRef, TeamSlug, TeamStatus, WebhookChannel, } from "./team.js";
|
|
44
45
|
type DocumentRef = _DocumentRef;
|
|
45
46
|
/** Aggregated retrieval metrics for a feature area */
|
|
46
47
|
export interface AreaRetrievalMetrics {
|
|
@@ -259,6 +260,12 @@ export interface FilterOptions {
|
|
|
259
260
|
tags?: string[];
|
|
260
261
|
/** Specific task IDs to include (e.g., ["groq-blog-queries"]) */
|
|
261
262
|
taskIds?: string[];
|
|
263
|
+
/**
|
|
264
|
+
* Doc slugs that changed in the calling context. When set, only tasks
|
|
265
|
+
* whose `context.docs[*].slug` intersects this list are returned.
|
|
266
|
+
* Empty array is a no-op (treated as undefined).
|
|
267
|
+
*/
|
|
268
|
+
changedDocs?: readonly string[];
|
|
262
269
|
}
|
|
263
270
|
/** Full gap analysis report */
|
|
264
271
|
export interface GapAnalysisReport {
|
|
@@ -79,6 +79,13 @@ export interface PipelineRequest {
|
|
|
79
79
|
classification?: RunClassification;
|
|
80
80
|
compare?: boolean;
|
|
81
81
|
compareBaseline?: string;
|
|
82
|
+
/**
|
|
83
|
+
* Compare against a baseline extracted from a previously-published
|
|
84
|
+
* `ailf.report` document. Takes precedence over `compareBaseline`
|
|
85
|
+
* (local FS path). Dashboard-friendly: a report id is something the
|
|
86
|
+
* user can pick from a list.
|
|
87
|
+
*/
|
|
88
|
+
compareBaselineReportId?: string;
|
|
82
89
|
compareThreshold?: number;
|
|
83
90
|
concurrency?: number;
|
|
84
91
|
dataset?: string;
|
|
@@ -93,6 +100,16 @@ export interface PipelineRequest {
|
|
|
93
100
|
jobId?: string;
|
|
94
101
|
labels?: string[];
|
|
95
102
|
mode?: RawEvalMode;
|
|
103
|
+
/**
|
|
104
|
+
* Filter the evaluation cohort to a subset of the configured model IDs.
|
|
105
|
+
*
|
|
106
|
+
* Each entry must match the `id` of a model declared in
|
|
107
|
+
* `packages/eval/config/models.ts`. IDs that don't match are dropped
|
|
108
|
+
* with a structured warning AND surfaced on the job's `error` field so
|
|
109
|
+
* callers can detect typos — silent strips are not acceptable
|
|
110
|
+
* (W0281 acceptance criterion 5).
|
|
111
|
+
*/
|
|
112
|
+
models?: string[];
|
|
96
113
|
noAutoScope?: boolean;
|
|
97
114
|
noCache?: boolean;
|
|
98
115
|
noRemoteCache?: boolean;
|
|
@@ -0,0 +1,65 @@
|
|
|
1
|
+
import type { Brand } from "./branded-ids.js";
|
|
2
|
+
/** String `Brand` tagged `"TeamId"`. */
export type TeamId = Brand<string, "TeamId">;

/** String `Brand` tagged `"TeamSlug"`. */
export type TeamSlug = Brand<string, "TeamSlug">;

/** Status values a team can carry. */
export type TeamStatus =
    | "active"
    | "archived";

/** Member roles that are spelled out as literals. */
export type KnownMemberRole =
    | "lead"
    | "member"
    | "oncall";

/**
 * Any member role: a known literal or an arbitrary other string.
 * The `string & {}` arm keeps the union open without collapsing the
 * known literals into plain `string`.
 */
export type MemberRole = KnownMemberRole | (string & {});

/** Event types that are spelled out as literals. */
export type KnownEventType =
    | "eval.failed"
    | "eval.completed"
    | "eval.threshold-breached"
    | "eval.score-regressed"
    | "task.created"
    | "task.archived"
    | "area.unowned-tasks";

/** Any event type: a known literal or an arbitrary other string. */
export type EventType = KnownEventType | (string & {});

/** Discriminator values for the notification channel variants. */
export type NotificationChannelType =
    | "slack"
    | "email"
    | "webhook";

/**
 * Scope attached to a notification channel, discriminated on `type`.
 * The `"areas"`, `"repos"` and `"tags"` variants each carry a string list
 * under the same-named property; `"owned"` and `"all"` carry no payload.
 */
export type ChannelScope =
    | { type: "owned" }
    | { type: "all" }
    | { type: "areas"; areas: string[] }
    | { type: "repos"; repos: string[] }
    | { type: "tags"; tags: string[] };

/** A team member record; every field is optional. */
export interface TeamMember {
    email?: string;
    sanityUserId?: string;
    githubUsername?: string;
    displayName?: string;
    role?: MemberRole;
    lastVerifiedAt?: string;
}

/** Fields shared by every notification channel variant. */
export interface BaseChannel {
    _key: string;
    type: NotificationChannelType;
    purpose?: string;
    events?: EventType[];
    scope?: ChannelScope;
}

/** Channel variant with `type: "slack"`; requires a `channelId`. */
export interface SlackChannel extends BaseChannel {
    type: "slack";
    channelId: string;
    channelName?: string;
}

/** Channel variant with `type: "email"`; requires an `addresses` list. */
export interface EmailChannel extends BaseChannel {
    type: "email";
    addresses: string[];
}

/** Channel variant with `type: "webhook"`; requires a `logicalName`. */
export interface WebhookChannel extends BaseChannel {
    type: "webhook";
    logicalName: string;
}

/** Union of the channel variants, discriminated on `type`. */
export type NotificationChannel =
    | SlackChannel
    | EmailChannel
    | WebhookChannel;

/** A team: identity, status, members, and optional repos/notification channels. */
export interface Team {
    id: TeamId;
    slug: TeamSlug;
    displayName: string;
    description?: string;
    status: TeamStatus;
    members: TeamMember[];
    repos?: string[];
    notifications?: NotificationChannel[];
}

/** Reference object holding a single `_ref` string. */
export type TeamRef = {
    _ref: string;
};
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
export {};
|
|
@@ -22,8 +22,36 @@ export interface DocumentRef {
|
|
|
22
22
|
* Named `revision` (not `_rev`) for the same Sanity reserved-name reason.
|
|
23
23
|
*/
|
|
24
24
|
revision?: string;
|
|
25
|
-
/** URL-path identifier (e.g., "groq-introduction") */
|
|
25
|
+
/** URL-path identifier (e.g., "groq-introduction") — leaf segment only. */
|
|
26
26
|
slug: string;
|
|
27
|
+
/**
|
|
28
|
+
* Full URL path under `/docs/` (e.g., `content-lake/groq-introduction`).
|
|
29
|
+
* Composed from the article's `primarySection->slug.current` and
|
|
30
|
+
* `slug.current` via {@link buildContextDocPath}. Optional — historical
|
|
31
|
+
* reports written before W0287 carry only `slug`; consumers must fall
|
|
32
|
+
* back to `slug` for display when `path` is absent.
|
|
33
|
+
*/
|
|
34
|
+
path?: string;
|
|
27
35
|
/** Human-readable document title */
|
|
28
36
|
title: string;
|
|
29
37
|
}
|
|
38
|
+
/**
|
|
39
|
+
* Compose the canonical `/docs/`-relative path for a context-doc reference.
|
|
40
|
+
*
|
|
41
|
+
* Single source of truth across producers (eval doc fetcher, repo-task
|
|
42
|
+
* mirroring) and consumers (dashboard projections). Resolution order:
|
|
43
|
+
*
|
|
44
|
+
* 1. An explicit `path` (e.g. authored on a YAML/repo-mirrored task) wins.
|
|
45
|
+
* 2. Otherwise compose `sectionSlug + "/" + slug` when both are present.
|
|
46
|
+
* 3. Otherwise `null` — neither producers nor consumers can build a working
|
|
47
|
+
* docs URL from this input, so consumers should disable the link rather than emit a 404.
|
|
48
|
+
*
|
|
49
|
+
* The leaf `slug` alone is never returned as the path because
|
|
50
|
+
* `article.slug.current` is leaf-only on sanity.io/docs; the hierarchy
|
|
51
|
+
* lives on `primarySection->slug.current`.
|
|
52
|
+
*/
|
|
53
|
+
export declare function buildContextDocPath(input: {
|
|
54
|
+
path?: string | null;
|
|
55
|
+
sectionSlug?: string | null;
|
|
56
|
+
slug?: string | null;
|
|
57
|
+
}): string | null;
|
|
@@ -1 +1,23 @@
|
|
|
1
|
-
|
|
1
|
+
/**
 * Normalize one path input: trim surrounding whitespace and strip leading
 * and trailing slashes. Non-string or empty results are reported as `""`
 * so callers can treat them as "absent" with a simple truthiness check.
 */
function normalizeDocPathPart(value) {
    if (typeof value !== "string") {
        return "";
    }
    return value.trim().replace(/^\/+|\/+$/g, "");
}
/**
 * Compose the canonical `/docs/`-relative path for a context-doc reference.
 *
 * Resolution order:
 *
 *   1. A usable explicit `path` wins.
 *   2. Otherwise compose `sectionSlug + "/" + slug` when both are usable.
 *   3. Otherwise `null`, so consumers can disable the link instead of
 *      emitting a URL that cannot resolve.
 *
 * "Usable" means non-empty after trimming whitespace and edge slashes.
 * Normalizing here keeps the result relative (no leading `/`) and free of
 * doubled separators such as `section//slug`, even when an input carries
 * stray slashes. Well-formed inputs are returned unchanged.
 *
 * The leaf `slug` alone is never returned as the path.
 *
 * @param input - Object with optional `path`, `sectionSlug` and `slug`
 *   strings; `null` and `undefined` are treated as absent.
 * @returns The normalized relative path, or `null` when none can be built.
 */
export function buildContextDocPath(input) {
    const explicitPath = normalizeDocPathPart(input.path);
    if (explicitPath) {
        return explicitPath;
    }
    const section = normalizeDocPathPart(input.sectionSlug);
    const leaf = normalizeDocPathPart(input.slug);
    if (section && leaf) {
        return `${section}/${leaf}`;
    }
    return null;
}
|
|
@@ -52,6 +52,8 @@ export declare const LEGACY_EVAL_MODE_ALIASES: readonly ["baseline", "agentic",
|
|
|
52
52
|
export declare const LITERACY_VARIANTS: readonly ["baseline", "agentic", "observed", "full"];
|
|
53
53
|
/** Union of all literacy variant string values. */
|
|
54
54
|
export type LiteracyVariant = (typeof LITERACY_VARIANTS)[number];
|
|
55
|
+
/** Type guard for `LiteracyVariant` — true when `value` is one of the closed set. */
|
|
56
|
+
export declare function isLiteracyVariant(value: unknown): value is LiteracyVariant;
|
|
55
57
|
/**
|
|
56
58
|
* All accepted mode names for Zod enum construction.
|
|
57
59
|
* Canonical modes first, then legacy aliases.
|
|
@@ -40,6 +40,11 @@ export const LITERACY_VARIANTS = [
|
|
|
40
40
|
"observed",
|
|
41
41
|
"full",
|
|
42
42
|
];
|
|
43
|
+
/**
 * Type guard for `LiteracyVariant`: reports whether `value` is a string
 * that appears in `LITERACY_VARIANTS`.
 */
export function isLiteracyVariant(value) {
    if (typeof value !== "string") {
        return false;
    }
    return LITERACY_VARIANTS.includes(value);
}
|
|
43
48
|
/**
|
|
44
49
|
* All accepted mode names for Zod enum construction.
|
|
45
50
|
* Canonical modes first, then legacy aliases.
|