@sanity/ailf 4.6.0 → 5.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/canonical/grader-references/agent-harness-tools.yaml +42 -0
- package/canonical/grader-references/knowledge-probe-recall.yaml +36 -0
- package/canonical/grader-references/mcp-server-spec.yaml +51 -0
- package/canonical/grader-references/portable-text.yaml +48 -0
- package/config/rubrics.ts +38 -2
- package/dist/_vendor/ailf-core/artifact-registry.d.ts +60 -2
- package/dist/_vendor/ailf-core/artifact-registry.js +288 -7
- package/dist/_vendor/ailf-core/examples/index.d.ts +125 -26
- package/dist/_vendor/ailf-core/examples/index.js +146 -47
- package/dist/_vendor/ailf-core/ports/context.d.ts +8 -0
- package/dist/_vendor/ailf-core/ports/mode-handler.d.ts +15 -0
- package/dist/_vendor/ailf-core/schemas/branded-string.d.ts +40 -0
- package/dist/_vendor/ailf-core/schemas/branded-string.js +45 -0
- package/dist/_vendor/ailf-core/schemas/confidence-schema.d.ts +36 -0
- package/dist/_vendor/ailf-core/schemas/confidence-schema.js +32 -0
- package/dist/_vendor/ailf-core/schemas/eval-config.d.ts +1 -0
- package/dist/_vendor/ailf-core/schemas/eval-config.js +8 -4
- package/dist/_vendor/ailf-core/schemas/index.d.ts +2 -0
- package/dist/_vendor/ailf-core/schemas/index.js +9 -0
- package/dist/_vendor/ailf-core/schemas/pipeline-request.d.ts +1 -0
- package/dist/_vendor/ailf-core/schemas/pipeline-request.js +1 -0
- package/dist/_vendor/ailf-core/schemas/pipeline.d.ts +34 -8
- package/dist/_vendor/ailf-core/schemas/pipeline.js +23 -1
- package/dist/_vendor/ailf-core/services/diagnosis/registry.d.ts +40 -0
- package/dist/_vendor/ailf-core/services/diagnosis/registry.js +25 -0
- package/dist/_vendor/ailf-core/services/diagnosis-runner.d.ts +19 -0
- package/dist/_vendor/ailf-core/services/diagnosis-runner.js +19 -0
- package/dist/_vendor/ailf-core/services/index.d.ts +2 -0
- package/dist/_vendor/ailf-core/services/index.js +5 -0
- package/dist/_vendor/ailf-core/services/report-to-markdown.js +3 -2
- package/dist/_vendor/ailf-core/types/attribution.d.ts +82 -0
- package/dist/_vendor/ailf-core/types/attribution.js +18 -0
- package/dist/_vendor/ailf-core/types/branded-ids.d.ts +26 -1
- package/dist/_vendor/ailf-core/types/branded-ids.js +80 -4
- package/dist/_vendor/ailf-core/types/confidence.d.ts +1 -1
- package/dist/_vendor/ailf-core/types/confidence.js +7 -0
- package/dist/_vendor/ailf-core/types/diagnosis.d.ts +169 -0
- package/dist/_vendor/ailf-core/types/diagnosis.js +17 -0
- package/dist/_vendor/ailf-core/types/generalized-task.d.ts +16 -1
- package/dist/_vendor/ailf-core/types/grader-judgment.d.ts +125 -0
- package/dist/_vendor/ailf-core/types/grader-judgment.js +30 -0
- package/dist/_vendor/ailf-core/types/index.d.ts +80 -29
- package/dist/_vendor/ailf-core/types/index.js +15 -1
- package/dist/_vendor/ailf-core/types/legacy-grader-judgment.d.ts +55 -0
- package/dist/_vendor/ailf-core/types/legacy-grader-judgment.js +30 -0
- package/dist/_vendor/ailf-core/types/pipeline-request.d.ts +1 -0
- package/dist/_vendor/ailf-core/types/repo-config.d.ts +8 -0
- package/dist/_vendor/ailf-shared/document-ref.d.ts +1 -1
- package/dist/adapters/api-client/build-request.d.ts +1 -0
- package/dist/adapters/api-client/build-request.js +3 -0
- package/dist/adapters/attribution/attribution-meta-writer.d.ts +35 -0
- package/dist/adapters/attribution/attribution-meta-writer.js +34 -0
- package/dist/adapters/attribution/index.d.ts +9 -0
- package/dist/adapters/attribution/index.js +8 -0
- package/dist/adapters/attribution/per-entry-attribution-writer.d.ts +56 -0
- package/dist/adapters/attribution/per-entry-attribution-writer.js +49 -0
- package/dist/adapters/config-sources/file-config-adapter.js +1 -0
- package/dist/adapters/grader-outputs/index.d.ts +10 -0
- package/dist/adapters/grader-outputs/index.js +8 -0
- package/dist/adapters/grader-outputs/legacy/index.d.ts +11 -0
- package/dist/adapters/grader-outputs/legacy/index.js +10 -0
- package/dist/adapters/grader-outputs/legacy/promptfoo-grader-output-legacy.d.ts +49 -0
- package/dist/adapters/grader-outputs/legacy/promptfoo-grader-output-legacy.js +48 -0
- package/dist/adapters/grader-outputs/promptfoo-grader-output.d.ts +102 -0
- package/dist/adapters/grader-outputs/promptfoo-grader-output.js +93 -0
- package/dist/adapters/index.d.ts +3 -0
- package/dist/adapters/index.js +4 -0
- package/dist/adapters/task-sources/content-lake-task-source.d.ts +5 -1
- package/dist/adapters/task-sources/content-lake-task-source.js +28 -2
- package/dist/adapters/task-sources/repo-schemas.d.ts +79 -11
- package/dist/adapters/task-sources/repo-schemas.js +19 -2
- package/dist/commands/calculate-scores.js +1 -1
- package/dist/commands/explain-handler.js +1 -1
- package/dist/commands/lookup-doc.d.ts +1 -1
- package/dist/commands/lookup-doc.js +3 -3
- package/dist/commands/pipeline-action.d.ts +6 -0
- package/dist/commands/pipeline-action.js +2 -0
- package/dist/commands/remote-pipeline.js +1 -0
- package/dist/composition-root.d.ts +36 -0
- package/dist/composition-root.js +48 -0
- package/dist/config/rubrics.ts +38 -2
- package/dist/grader/agent-harness.d.ts +14 -0
- package/dist/grader/agent-harness.js +17 -0
- package/dist/grader/common.d.ts +17 -0
- package/dist/grader/common.js +21 -0
- package/dist/grader/index.d.ts +38 -0
- package/dist/grader/index.js +75 -0
- package/dist/grader/knowledge-probe.d.ts +14 -0
- package/dist/grader/knowledge-probe.js +18 -0
- package/dist/grader/literacy.d.ts +13 -0
- package/dist/grader/literacy.js +17 -0
- package/dist/grader/mcp.d.ts +14 -0
- package/dist/grader/mcp.js +18 -0
- package/dist/orchestration/build-app-context.js +1 -0
- package/dist/orchestration/build-step-sequence.js +5 -0
- package/dist/orchestration/steps/calculate-scores-step.js +23 -1
- package/dist/orchestration/steps/compute-attribution-step.d.ts +44 -0
- package/dist/orchestration/steps/compute-attribution-step.js +279 -0
- package/dist/orchestration/steps/gap-analysis-step.js +35 -7
- package/dist/orchestration/steps/index.d.ts +1 -0
- package/dist/orchestration/steps/index.js +1 -0
- package/dist/pipeline/attribution.d.ts +15 -0
- package/dist/pipeline/attribution.js +18 -9
- package/dist/pipeline/borderline-consensus-runner.d.ts +63 -0
- package/dist/pipeline/borderline-consensus-runner.js +124 -0
- package/dist/pipeline/borderline-detector.d.ts +24 -0
- package/dist/pipeline/borderline-detector.js +26 -0
- package/dist/pipeline/calculate-scores.d.ts +114 -3
- package/dist/pipeline/calculate-scores.js +426 -24
- package/dist/pipeline/compiler/literacy-bridge.d.ts +1 -1
- package/dist/pipeline/compiler/literacy-bridge.js +35 -17
- package/dist/pipeline/compiler/rubric-resolution.d.ts +15 -0
- package/dist/pipeline/compiler/rubric-resolution.js +9 -1
- package/dist/pipeline/compute-attribution.d.ts +80 -0
- package/dist/pipeline/compute-attribution.js +196 -0
- package/dist/pipeline/failure-modes.d.ts +52 -17
- package/dist/pipeline/failure-modes.js +178 -117
- package/dist/pipeline/map-request-to-config.js +1 -0
- package/package.json +6 -4
|
@@ -0,0 +1,93 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* promptfoo-grader-output.ts — Zod schema for the structured grader output
|
|
3
|
+
* (GRAD-02) emitted by the promptfoo grader process and consumed by the
|
|
4
|
+
* eval pipeline.
|
|
5
|
+
*
|
|
6
|
+
* The schema asserts `satisfies z.ZodType<GraderJudgment>` against the
|
|
7
|
+
* canonical domain type in `packages/core/src/types/grader-judgment.ts`
|
|
8
|
+
* (D0045 / W0187) — drift between schema and type is a build error.
|
|
9
|
+
* The domain type was authored independently in Plan 01-01; this file
|
|
10
|
+
* authors ONLY the schema and never derives the domain type from the
|
|
11
|
+
* schema itself (no schema-derived self-reference allowed by D0045).
|
|
12
|
+
*
|
|
13
|
+
* `graderJudgmentsVersion` is co-located with the schema (VER-01 D-02 —
|
|
14
|
+
* source-of-truth file owns its version constant). Bumped by hand when
|
|
15
|
+
* the grader rubric, prompt template, or judgment shape changes.
|
|
16
|
+
*
|
|
17
|
+
* Phase 3 will replace the inline `JSON.parse` at
|
|
18
|
+
* `pipeline/calculate-scores.ts:380-392` (Pitfall #4) so all grader
|
|
19
|
+
* output flows through this schema.
|
|
20
|
+
*
|
|
21
|
+
* @see docs/decisions/D0045-type-architecture-and-contract-enforcement.md
|
|
22
|
+
* @see docs/decisions/D0049-shared-confidence-contract.md
|
|
23
|
+
* @see docs/decisions/D0052-judgment-ref-granularity.md
|
|
24
|
+
* @see docs/design-docs/actionability-ladder/03-structured-grader-judgments.md
|
|
25
|
+
*/
|
|
26
|
+
import { z } from "zod";
|
|
27
|
+
import { brandedString, ConfidenceSchema } from "../../_vendor/ailf-core/schemas/index.js";
|
|
28
|
+
/**
|
|
29
|
+
* VER-01 D-02 — co-located version constant. Bumped by hand when the
|
|
30
|
+
* grader rubric, prompt template, or judgment shape changes in a way
|
|
31
|
+
* that should invalidate cached Diagnoses.
|
|
32
|
+
*
|
|
33
|
+
* Phase 3 GRAD-05 bumped this from `"0.1.0"` to `"1.0.0"` (semver
|
|
34
|
+
* major) — the additive GRAD-02 surface is now required + the schema
|
|
35
|
+
* is `.strict()`. AILF has no installed external base; the legacy
|
|
36
|
+
* parser at `./legacy/promptfoo-grader-output-legacy.ts` is the named
|
|
37
|
+
* consumer for already-stored historical reports.
|
|
38
|
+
*/
|
|
39
|
+
export const graderJudgmentsVersion = "1.0.0";
|
|
40
|
+
const DocCitationRoleSchema = z.enum([
|
|
41
|
+
"supports",
|
|
42
|
+
"contradicts",
|
|
43
|
+
"missing",
|
|
44
|
+
"irrelevant",
|
|
45
|
+
]);
|
|
46
|
+
const DocCitationSchema = z.object({
|
|
47
|
+
documentId: z.string().min(1),
|
|
48
|
+
slug: z.string().optional(),
|
|
49
|
+
role: DocCitationRoleSchema,
|
|
50
|
+
hallucinated: z.boolean().optional(),
|
|
51
|
+
});
|
|
52
|
+
const CriterionSubJudgmentSchema = z.object({
|
|
53
|
+
criterionId: z.string().min(1),
|
|
54
|
+
met: z.boolean(),
|
|
55
|
+
evidence: z.string().max(280),
|
|
56
|
+
confidence: ConfidenceSchema,
|
|
57
|
+
});
|
|
58
|
+
/**
|
|
59
|
+
* Canonical schema for {@link GraderJudgment}. Required fields mirror
|
|
60
|
+
* the existing pipeline core (Doc 03 §"existing, unchanged"):
|
|
61
|
+
* `taskId`, `modelId`, `dimension`, `reason`, `score`. Phase 3 GRAD-05
|
|
62
|
+
* has tightened the additive surface to required and added `.strict()`
|
|
63
|
+
* — the schema rejects unknown fields (defense-in-depth against future
|
|
64
|
+
* prompt-injection attempts that try to smuggle keys through the
|
|
65
|
+
* grader emission).
|
|
66
|
+
*
|
|
67
|
+
* Branded `JudgmentId` is represented at runtime by a non-empty string;
|
|
68
|
+
* the schema routes the brand through `brandedString<"JudgmentId">()`
|
|
69
|
+
* — the project's single audited cast site for branded-string
|
|
70
|
+
* schemas (project typescript rule: no `as` on `unknown`).
|
|
71
|
+
*/
|
|
72
|
+
export const GraderJudgmentSchema = z
|
|
73
|
+
.object({
|
|
74
|
+
// ── Existing pipeline core (required — Doc 03 §"existing, unchanged") ─
|
|
75
|
+
taskId: z.string().min(1),
|
|
76
|
+
modelId: z.string().min(1),
|
|
77
|
+
dimension: z.string().min(1),
|
|
78
|
+
reason: z.string(),
|
|
79
|
+
score: z.number(),
|
|
80
|
+
outputFailure: z.boolean().optional(),
|
|
81
|
+
// ── GRAD-02 additive — required from Phase 3 GRAD-05 ───────────────
|
|
82
|
+
judgmentId: brandedString(),
|
|
83
|
+
subJudgments: z.array(CriterionSubJudgmentSchema),
|
|
84
|
+
docCitations: z.array(DocCitationSchema),
|
|
85
|
+
failureMode: z.string(),
|
|
86
|
+
confidence: ConfidenceSchema,
|
|
87
|
+
hallucinationCheckedAgainst: z.array(z.string()),
|
|
88
|
+
metadata: z.object({
|
|
89
|
+
graderModel: z.string().min(1),
|
|
90
|
+
graderJudgmentsVersion: z.string().min(1),
|
|
91
|
+
}),
|
|
92
|
+
})
|
|
93
|
+
.strict();
|
package/dist/adapters/index.d.ts
CHANGED
|
@@ -10,3 +10,6 @@ export { PromptfooEvalAdapter } from "./eval-runners/index.js";
|
|
|
10
10
|
export { ConsoleLogger, type ConsoleLoggerOptions, JsonLogger, QuietLogger, } from "./loggers/index.js";
|
|
11
11
|
export { CliConfigAdapter, FileConfigAdapter } from "./config-sources/index.js";
|
|
12
12
|
export { DtsPackageSurface, InMemoryPackageSurface, type DtsPackageSurfaceOptions, type PackageRootResolver, parseDtsExports, type ParsedDtsExports, } from "./package-surface/index.js";
|
|
13
|
+
export { GraderJudgmentSchema, graderJudgmentsVersion, } from "./grader-outputs/index.js";
|
|
14
|
+
export { AttributionMetaSchema, JudgmentAttributionSchema, } from "./attribution/index.js";
|
|
15
|
+
export type { AttributionMeta, DocAttribution, GraderJudgment, JudgmentAttribution, } from "../_vendor/ailf-core/index.d.ts";
|
package/dist/adapters/index.js
CHANGED
|
@@ -10,3 +10,7 @@ export { PromptfooEvalAdapter } from "./eval-runners/index.js";
|
|
|
10
10
|
export { ConsoleLogger, JsonLogger, QuietLogger, } from "./loggers/index.js";
|
|
11
11
|
export { CliConfigAdapter, FileConfigAdapter } from "./config-sources/index.js";
|
|
12
12
|
export { DtsPackageSurface, InMemoryPackageSurface, parseDtsExports, } from "./package-surface/index.js";
|
|
13
|
+
// Phase 1 Plan 02 — actionability-ladder adapter schemas (GRAD-02, ATTR-01).
|
|
14
|
+
// Named re-exports only (W0124 / D0045).
|
|
15
|
+
export { GraderJudgmentSchema, graderJudgmentsVersion, } from "./grader-outputs/index.js";
|
|
16
|
+
export { AttributionMetaSchema, JudgmentAttributionSchema, } from "./attribution/index.js";
|
|
@@ -55,9 +55,13 @@ interface ContentLakeCanonicalDoc {
|
|
|
55
55
|
sectionSlug?: string;
|
|
56
56
|
slug?: string;
|
|
57
57
|
}
|
|
58
|
+
interface ContentLakeCriterion {
|
|
59
|
+
id?: string;
|
|
60
|
+
text?: string;
|
|
61
|
+
}
|
|
58
62
|
/** Assertion shape from the Content Lake (mirrors the Studio schema). */
|
|
59
63
|
interface ContentLakeAssertion {
|
|
60
|
-
criteria?:
|
|
64
|
+
criteria?: ContentLakeCriterion[];
|
|
61
65
|
template?: string;
|
|
62
66
|
threshold?: number;
|
|
63
67
|
type?: string;
|
|
@@ -73,7 +73,13 @@ const TASKS_QUERY = /* groq */ `
|
|
|
73
73
|
perspective,
|
|
74
74
|
reason
|
|
75
75
|
},
|
|
76
|
-
"assertions": coalesce(assertions, assert)
|
|
76
|
+
"assertions": coalesce(assertions, assert)[] {
|
|
77
|
+
type, template, weight, value, threshold,
|
|
78
|
+
"criteria": criteria[] {
|
|
79
|
+
"id": coalesce(id.current, _key),
|
|
80
|
+
"text": coalesce(text, @)
|
|
81
|
+
}
|
|
82
|
+
},
|
|
77
83
|
rawAssert,
|
|
78
84
|
baseline,
|
|
79
85
|
tags,
|
|
@@ -256,8 +262,28 @@ function mapAssertions(raw) {
|
|
|
256
262
|
.filter((a) => !!a.type)
|
|
257
263
|
.map((a) => {
|
|
258
264
|
if (a.type === "llm-rubric" && a.template && a.criteria) {
|
|
265
|
+
// Tighten the runtime contract: the GROQ projection's
|
|
266
|
+
// `coalesce(text, @)` falls through to the entire criterion
|
|
267
|
+
// element when `text` is missing, so a partial legacy criterion
|
|
268
|
+
// like `{_key: "abc"}` arrives here as `{ id: "abc", text: {...} }`
|
|
269
|
+
// — `text` set to the whole `@` object. Explicit type checks
|
|
270
|
+
// drop those with a diagnostic, instead of letting the non-string
|
|
271
|
+
// `text` propagate until the outer ContentLakeAuthorableTaskSchema
|
|
272
|
+
// parse fails deep inside the assertions array (noisy diagnostic).
|
|
259
273
|
return {
|
|
260
|
-
criteria: a.criteria
|
|
274
|
+
criteria: a.criteria
|
|
275
|
+
.filter((c) => {
|
|
276
|
+
if (!c)
|
|
277
|
+
return false;
|
|
278
|
+
const idOk = typeof c.id === "string" && c.id.length > 0;
|
|
279
|
+
const textOk = typeof c.text === "string" && c.text.length > 0;
|
|
280
|
+
if (!idOk || !textOk) {
|
|
281
|
+
console.warn(`[ContentLakeTaskSource] dropping malformed criterion: ${JSON.stringify(c).slice(0, 100)}`);
|
|
282
|
+
return false;
|
|
283
|
+
}
|
|
284
|
+
return true;
|
|
285
|
+
})
|
|
286
|
+
.map((c) => ({ id: c.id, text: c.text })),
|
|
261
287
|
template: a.template,
|
|
262
288
|
type: "llm-rubric",
|
|
263
289
|
...(a.weight !== undefined ? { weight: a.weight } : {}),
|
|
@@ -32,6 +32,40 @@ export type CuratedAssertionType = (typeof CURATED_ASSERTION_TYPES)[number];
|
|
|
32
32
|
*/
|
|
33
33
|
export declare const RUBRIC_TEMPLATE_NAMES: readonly ["task-completion", "code-correctness", "doc-coverage", "mcp-input-validation", "mcp-output-correctness", "mcp-error-handling", "mcp-security", "factual-correctness", "completeness", "currency", "process-quality", "agent-output", "agent-tool-usage"];
|
|
34
34
|
export type RubricTemplateName = (typeof RUBRIC_TEMPLATE_NAMES)[number];
|
|
35
|
+
/**
|
|
36
|
+
* A single criterion within an llm-rubric assertion. Stable id-text pair.
|
|
37
|
+
*/
|
|
38
|
+
export declare const CriterionRefSchema: z.ZodObject<{
|
|
39
|
+
id: z.ZodString;
|
|
40
|
+
text: z.ZodString;
|
|
41
|
+
}, z.core.$strip>;
|
|
42
|
+
/**
|
|
43
|
+
* A templated LLM-rubric assertion — uses one of the predefined rubric
|
|
44
|
+
* templates with author-supplied criteria.
|
|
45
|
+
*/
|
|
46
|
+
export declare const TemplatedAssertionSchema: z.ZodObject<{
|
|
47
|
+
type: z.ZodLiteral<"llm-rubric">;
|
|
48
|
+
template: z.ZodEnum<{
|
|
49
|
+
"task-completion": "task-completion";
|
|
50
|
+
"code-correctness": "code-correctness";
|
|
51
|
+
"doc-coverage": "doc-coverage";
|
|
52
|
+
"mcp-input-validation": "mcp-input-validation";
|
|
53
|
+
"mcp-output-correctness": "mcp-output-correctness";
|
|
54
|
+
"mcp-error-handling": "mcp-error-handling";
|
|
55
|
+
"mcp-security": "mcp-security";
|
|
56
|
+
"factual-correctness": "factual-correctness";
|
|
57
|
+
completeness: "completeness";
|
|
58
|
+
currency: "currency";
|
|
59
|
+
"process-quality": "process-quality";
|
|
60
|
+
"agent-output": "agent-output";
|
|
61
|
+
"agent-tool-usage": "agent-tool-usage";
|
|
62
|
+
}>;
|
|
63
|
+
criteria: z.ZodArray<z.ZodObject<{
|
|
64
|
+
id: z.ZodString;
|
|
65
|
+
text: z.ZodString;
|
|
66
|
+
}, z.core.$strip>>;
|
|
67
|
+
weight: z.ZodOptional<z.ZodNumber>;
|
|
68
|
+
}, z.core.$strip>;
|
|
35
69
|
/**
|
|
36
70
|
* Zod schema for a single task definition — a mode-discriminated union
|
|
37
71
|
* mirroring `GeneralizedTaskDefinition`.
|
|
@@ -73,7 +107,10 @@ export declare const CanonicalTaskSchema: z.ZodDiscriminatedUnion<[z.ZodObject<{
|
|
|
73
107
|
"agent-output": "agent-output";
|
|
74
108
|
"agent-tool-usage": "agent-tool-usage";
|
|
75
109
|
}>;
|
|
76
|
-
criteria: z.ZodArray<z.
|
|
110
|
+
criteria: z.ZodArray<z.ZodObject<{
|
|
111
|
+
id: z.ZodString;
|
|
112
|
+
text: z.ZodString;
|
|
113
|
+
}, z.core.$strip>>;
|
|
77
114
|
weight: z.ZodOptional<z.ZodNumber>;
|
|
78
115
|
}, z.core.$strip>, z.ZodObject<{
|
|
79
116
|
type: z.ZodEnum<{
|
|
@@ -187,7 +224,10 @@ export declare const CanonicalTaskSchema: z.ZodDiscriminatedUnion<[z.ZodObject<{
|
|
|
187
224
|
"agent-output": "agent-output";
|
|
188
225
|
"agent-tool-usage": "agent-tool-usage";
|
|
189
226
|
}>;
|
|
190
|
-
criteria: z.ZodArray<z.
|
|
227
|
+
criteria: z.ZodArray<z.ZodObject<{
|
|
228
|
+
id: z.ZodString;
|
|
229
|
+
text: z.ZodString;
|
|
230
|
+
}, z.core.$strip>>;
|
|
191
231
|
weight: z.ZodOptional<z.ZodNumber>;
|
|
192
232
|
}, z.core.$strip>, z.ZodObject<{
|
|
193
233
|
type: z.ZodEnum<{
|
|
@@ -341,7 +381,10 @@ export declare const CanonicalTaskSchema: z.ZodDiscriminatedUnion<[z.ZodObject<{
|
|
|
341
381
|
"agent-output": "agent-output";
|
|
342
382
|
"agent-tool-usage": "agent-tool-usage";
|
|
343
383
|
}>;
|
|
344
|
-
criteria: z.ZodArray<z.
|
|
384
|
+
criteria: z.ZodArray<z.ZodObject<{
|
|
385
|
+
id: z.ZodString;
|
|
386
|
+
text: z.ZodString;
|
|
387
|
+
}, z.core.$strip>>;
|
|
345
388
|
weight: z.ZodOptional<z.ZodNumber>;
|
|
346
389
|
}, z.core.$strip>, z.ZodObject<{
|
|
347
390
|
type: z.ZodEnum<{
|
|
@@ -472,7 +515,10 @@ export declare const CanonicalTaskSchema: z.ZodDiscriminatedUnion<[z.ZodObject<{
|
|
|
472
515
|
"agent-output": "agent-output";
|
|
473
516
|
"agent-tool-usage": "agent-tool-usage";
|
|
474
517
|
}>;
|
|
475
|
-
criteria: z.ZodArray<z.
|
|
518
|
+
criteria: z.ZodArray<z.ZodObject<{
|
|
519
|
+
id: z.ZodString;
|
|
520
|
+
text: z.ZodString;
|
|
521
|
+
}, z.core.$strip>>;
|
|
476
522
|
weight: z.ZodOptional<z.ZodNumber>;
|
|
477
523
|
}, z.core.$strip>, z.ZodObject<{
|
|
478
524
|
type: z.ZodEnum<{
|
|
@@ -591,7 +637,10 @@ export declare const CanonicalTaskSchema: z.ZodDiscriminatedUnion<[z.ZodObject<{
|
|
|
591
637
|
"agent-output": "agent-output";
|
|
592
638
|
"agent-tool-usage": "agent-tool-usage";
|
|
593
639
|
}>;
|
|
594
|
-
criteria: z.ZodArray<z.
|
|
640
|
+
criteria: z.ZodArray<z.ZodObject<{
|
|
641
|
+
id: z.ZodString;
|
|
642
|
+
text: z.ZodString;
|
|
643
|
+
}, z.core.$strip>>;
|
|
595
644
|
weight: z.ZodOptional<z.ZodNumber>;
|
|
596
645
|
}, z.core.$strip>, z.ZodObject<{
|
|
597
646
|
type: z.ZodEnum<{
|
|
@@ -699,7 +748,10 @@ export declare const ContentLakeAuthorableTaskSchema: z.ZodObject<{
|
|
|
699
748
|
"agent-output": "agent-output";
|
|
700
749
|
"agent-tool-usage": "agent-tool-usage";
|
|
701
750
|
}>;
|
|
702
|
-
criteria: z.ZodArray<z.
|
|
751
|
+
criteria: z.ZodArray<z.ZodObject<{
|
|
752
|
+
id: z.ZodString;
|
|
753
|
+
text: z.ZodString;
|
|
754
|
+
}, z.core.$strip>>;
|
|
703
755
|
weight: z.ZodOptional<z.ZodNumber>;
|
|
704
756
|
}, z.core.$strip>, z.ZodObject<{
|
|
705
757
|
type: z.ZodEnum<{
|
|
@@ -819,7 +871,10 @@ export declare const CanonicalTaskFileSchema: z.ZodArray<z.ZodDiscriminatedUnion
|
|
|
819
871
|
"agent-output": "agent-output";
|
|
820
872
|
"agent-tool-usage": "agent-tool-usage";
|
|
821
873
|
}>;
|
|
822
|
-
criteria: z.ZodArray<z.
|
|
874
|
+
criteria: z.ZodArray<z.ZodObject<{
|
|
875
|
+
id: z.ZodString;
|
|
876
|
+
text: z.ZodString;
|
|
877
|
+
}, z.core.$strip>>;
|
|
823
878
|
weight: z.ZodOptional<z.ZodNumber>;
|
|
824
879
|
}, z.core.$strip>, z.ZodObject<{
|
|
825
880
|
type: z.ZodEnum<{
|
|
@@ -933,7 +988,10 @@ export declare const CanonicalTaskFileSchema: z.ZodArray<z.ZodDiscriminatedUnion
|
|
|
933
988
|
"agent-output": "agent-output";
|
|
934
989
|
"agent-tool-usage": "agent-tool-usage";
|
|
935
990
|
}>;
|
|
936
|
-
criteria: z.ZodArray<z.
|
|
991
|
+
criteria: z.ZodArray<z.ZodObject<{
|
|
992
|
+
id: z.ZodString;
|
|
993
|
+
text: z.ZodString;
|
|
994
|
+
}, z.core.$strip>>;
|
|
937
995
|
weight: z.ZodOptional<z.ZodNumber>;
|
|
938
996
|
}, z.core.$strip>, z.ZodObject<{
|
|
939
997
|
type: z.ZodEnum<{
|
|
@@ -1087,7 +1145,10 @@ export declare const CanonicalTaskFileSchema: z.ZodArray<z.ZodDiscriminatedUnion
|
|
|
1087
1145
|
"agent-output": "agent-output";
|
|
1088
1146
|
"agent-tool-usage": "agent-tool-usage";
|
|
1089
1147
|
}>;
|
|
1090
|
-
criteria: z.ZodArray<z.
|
|
1148
|
+
criteria: z.ZodArray<z.ZodObject<{
|
|
1149
|
+
id: z.ZodString;
|
|
1150
|
+
text: z.ZodString;
|
|
1151
|
+
}, z.core.$strip>>;
|
|
1091
1152
|
weight: z.ZodOptional<z.ZodNumber>;
|
|
1092
1153
|
}, z.core.$strip>, z.ZodObject<{
|
|
1093
1154
|
type: z.ZodEnum<{
|
|
@@ -1218,7 +1279,10 @@ export declare const CanonicalTaskFileSchema: z.ZodArray<z.ZodDiscriminatedUnion
|
|
|
1218
1279
|
"agent-output": "agent-output";
|
|
1219
1280
|
"agent-tool-usage": "agent-tool-usage";
|
|
1220
1281
|
}>;
|
|
1221
|
-
criteria: z.ZodArray<z.
|
|
1282
|
+
criteria: z.ZodArray<z.ZodObject<{
|
|
1283
|
+
id: z.ZodString;
|
|
1284
|
+
text: z.ZodString;
|
|
1285
|
+
}, z.core.$strip>>;
|
|
1222
1286
|
weight: z.ZodOptional<z.ZodNumber>;
|
|
1223
1287
|
}, z.core.$strip>, z.ZodObject<{
|
|
1224
1288
|
type: z.ZodEnum<{
|
|
@@ -1337,7 +1401,10 @@ export declare const CanonicalTaskFileSchema: z.ZodArray<z.ZodDiscriminatedUnion
|
|
|
1337
1401
|
"agent-output": "agent-output";
|
|
1338
1402
|
"agent-tool-usage": "agent-tool-usage";
|
|
1339
1403
|
}>;
|
|
1340
|
-
criteria: z.ZodArray<z.
|
|
1404
|
+
criteria: z.ZodArray<z.ZodObject<{
|
|
1405
|
+
id: z.ZodString;
|
|
1406
|
+
text: z.ZodString;
|
|
1407
|
+
}, z.core.$strip>>;
|
|
1341
1408
|
weight: z.ZodOptional<z.ZodNumber>;
|
|
1342
1409
|
}, z.core.$strip>, z.ZodObject<{
|
|
1343
1410
|
type: z.ZodEnum<{
|
|
@@ -1468,6 +1535,7 @@ export declare const RepoConfigSchema: z.ZodObject<{
|
|
|
1468
1535
|
execution: z.ZodOptional<z.ZodObject<{
|
|
1469
1536
|
concurrency: z.ZodOptional<z.ZodNumber>;
|
|
1470
1537
|
graderReplications: z.ZodOptional<z.ZodNumber>;
|
|
1538
|
+
borderlineReplications: z.ZodOptional<z.ZodNumber>;
|
|
1471
1539
|
gapAnalysis: z.ZodOptional<z.ZodBoolean>;
|
|
1472
1540
|
apiUrl: z.ZodOptional<z.ZodString>;
|
|
1473
1541
|
}, z.core.$strip>>;
|
|
@@ -111,14 +111,26 @@ const CanonicalDocRefSchema = z.union([
|
|
|
111
111
|
// ---------------------------------------------------------------------------
|
|
112
112
|
// Assertion schemas
|
|
113
113
|
// ---------------------------------------------------------------------------
|
|
114
|
+
/**
|
|
115
|
+
* A single criterion within an llm-rubric assertion. Stable id-text pair.
|
|
116
|
+
*/
|
|
117
|
+
export const CriterionRefSchema = z.object({
|
|
118
|
+
id: z
|
|
119
|
+
.string()
|
|
120
|
+
.min(1)
|
|
121
|
+
.regex(/^[a-z0-9][a-z0-9-]*$/, {
|
|
122
|
+
message: "criterion id must be lowercase alphanumeric with hyphens",
|
|
123
|
+
}),
|
|
124
|
+
text: z.string().min(1),
|
|
125
|
+
});
|
|
114
126
|
/**
|
|
115
127
|
* A templated LLM-rubric assertion — uses one of the predefined rubric
|
|
116
128
|
* templates with author-supplied criteria.
|
|
117
129
|
*/
|
|
118
|
-
const TemplatedAssertionSchema = z.object({
|
|
130
|
+
export const TemplatedAssertionSchema = z.object({
|
|
119
131
|
type: z.literal("llm-rubric"),
|
|
120
132
|
template: z.enum(RUBRIC_TEMPLATE_NAMES),
|
|
121
|
-
criteria: z.array(
|
|
133
|
+
criteria: z.array(CriterionRefSchema).min(1),
|
|
122
134
|
weight: z.number().optional(),
|
|
123
135
|
});
|
|
124
136
|
/**
|
|
@@ -562,6 +574,11 @@ const ExecutionConfigSchema = z
|
|
|
562
574
|
.object({
|
|
563
575
|
concurrency: z.number().int().positive().optional(),
|
|
564
576
|
graderReplications: z.number().int().positive().optional(),
|
|
577
|
+
/**
|
|
578
|
+
* Plan 03-04 GRAD-04 — replications per borderline judgment.
|
|
579
|
+
* Default 3 (composition-root). Positive integer.
|
|
580
|
+
*/
|
|
581
|
+
borderlineReplications: z.number().int().positive().optional(),
|
|
565
582
|
gapAnalysis: z.boolean().optional(),
|
|
566
583
|
apiUrl: z.string().url().optional(),
|
|
567
584
|
})
|
|
@@ -38,7 +38,7 @@ export function createCalculateScoresCommand() {
|
|
|
38
38
|
remote: false,
|
|
39
39
|
apiUrl: "https://ailf-api.sanity.build",
|
|
40
40
|
});
|
|
41
|
-
const result = calculateAndWriteScores({
|
|
41
|
+
const result = await calculateAndWriteScores({
|
|
42
42
|
resultsPath,
|
|
43
43
|
rootDir: ctx.config.rootDir,
|
|
44
44
|
source: opts.source,
|
|
@@ -298,7 +298,7 @@ const EXPLAIN_REGISTRY = {
|
|
|
298
298
|
],
|
|
299
299
|
},
|
|
300
300
|
"lookup-doc": {
|
|
301
|
-
description: "Search Sanity for documentation articles by keyword (find slugs for
|
|
301
|
+
description: "Search Sanity for documentation articles by keyword (find slugs for contextDocs)",
|
|
302
302
|
steps: [
|
|
303
303
|
{
|
|
304
304
|
cacheStatus: "miss",
|
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
/**
|
|
2
2
|
* lookup-doc command — search Sanity for documentation articles by keyword.
|
|
3
3
|
*
|
|
4
|
-
* Helps external contributors find the correct `slug` for
|
|
4
|
+
* Helps external contributors find the correct `slug` for contextDocs
|
|
5
5
|
* references without needing to browse the CMS or guess from URLs.
|
|
6
6
|
*
|
|
7
7
|
* Usage:
|
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
/**
|
|
2
2
|
* lookup-doc command — search Sanity for documentation articles by keyword.
|
|
3
3
|
*
|
|
4
|
-
* Helps external contributors find the correct `slug` for
|
|
4
|
+
* Helps external contributors find the correct `slug` for contextDocs
|
|
5
5
|
* references without needing to browse the CMS or guess from URLs.
|
|
6
6
|
*
|
|
7
7
|
* Usage:
|
|
@@ -14,7 +14,7 @@
|
|
|
14
14
|
import { Command } from "commander";
|
|
15
15
|
export function createLookupDocCommand() {
|
|
16
16
|
return new Command("lookup-doc")
|
|
17
|
-
.description("Search Sanity docs by keyword — find slugs for
|
|
17
|
+
.description("Search Sanity docs by keyword — find slugs for contextDocs references")
|
|
18
18
|
.argument("<keyword>", "Search keyword (matches title and slug)")
|
|
19
19
|
.option("-l, --limit <n>", "Maximum results to show", parseInt, 10)
|
|
20
20
|
.option("-s, --source <name>", "Documentation source (from sources.yaml)")
|
|
@@ -73,7 +73,7 @@ export function createLookupDocCommand() {
|
|
|
73
73
|
console.log(` ${"".padEnd(maxSlugLen + 6)} │ Section: ${section}\n`);
|
|
74
74
|
}
|
|
75
75
|
console.log(" Usage in .ailf/tasks/*.yaml:\n");
|
|
76
|
-
console.log("
|
|
76
|
+
console.log(" contextDocs:");
|
|
77
77
|
console.log(` - slug: ${results[0].slug}`);
|
|
78
78
|
console.log(` reason: "${results[0].title}"`);
|
|
79
79
|
if (results[0].sectionSlug) {
|
|
@@ -27,6 +27,12 @@ export interface ResolvedOptions {
|
|
|
27
27
|
dryRun: boolean;
|
|
28
28
|
gapAnalysisEnabled: boolean;
|
|
29
29
|
graderReplications?: number;
|
|
30
|
+
/**
|
|
31
|
+
* Replications per borderline judgment for the GRAD-04 intra-grader
|
|
32
|
+
* consensus pass. Sourced from `.ailf/config.yaml`'s
|
|
33
|
+
* `execution.borderlineReplications`.
|
|
34
|
+
*/
|
|
35
|
+
borderlineReplications?: number;
|
|
30
36
|
/** Grader context policy from `.ailf/config.yaml` `grader.context` */
|
|
31
37
|
graderContext?: "rubric-only" | "with-docs";
|
|
32
38
|
headerArgs: string[];
|
|
@@ -248,6 +248,7 @@ export function computeResolvedOptions(opts) {
|
|
|
248
248
|
// env var (where one exists) > .ailf/config.yaml > built-in default
|
|
249
249
|
const concurrency = repoConfig?.execution?.concurrency;
|
|
250
250
|
const graderReplications = repoConfig?.execution?.graderReplications;
|
|
251
|
+
const borderlineReplications = repoConfig?.execution?.borderlineReplications;
|
|
251
252
|
const gapAnalysisEnabled = repoConfig?.execution?.gapAnalysis ?? true;
|
|
252
253
|
// Grader context policy. Cascade: env var > .ailf/config.yaml > unset
|
|
253
254
|
// (defaults to rubric-only at the EvalConfig boundary). The env var is the
|
|
@@ -291,6 +292,7 @@ export function computeResolvedOptions(opts) {
|
|
|
291
292
|
dryRun: opts.dryRun,
|
|
292
293
|
gapAnalysisEnabled,
|
|
293
294
|
graderReplications,
|
|
295
|
+
borderlineReplications,
|
|
294
296
|
graderContext,
|
|
295
297
|
headerArgs,
|
|
296
298
|
impactSummary,
|
|
@@ -142,6 +142,7 @@ function toConfigSlice(opts) {
|
|
|
142
142
|
perspectiveOverride: opts.perspectiveOverride,
|
|
143
143
|
graderContext: opts.graderContext,
|
|
144
144
|
graderReplications: opts.graderReplications,
|
|
145
|
+
borderlineReplications: opts.borderlineReplications,
|
|
145
146
|
gapAnalysisEnabled: opts.gapAnalysisEnabled,
|
|
146
147
|
noRemoteCache: opts.noRemoteCache,
|
|
147
148
|
// D0037 / W0069 caller envelope overrides — flags override env vars
|
|
@@ -16,6 +16,7 @@
|
|
|
16
16
|
* @see docs/archive/exec-plans/ports-and-adapters/phase-7-composition-root.md
|
|
17
17
|
*/
|
|
18
18
|
import { type AppContext, type ArtifactWriter, type ArtifactWriterProgressOptions, type AssertionRegistration, type LLMClient, type Logger, type ResolvedConfig } from "./_vendor/ailf-core/index.d.ts";
|
|
19
|
+
import { type BorderlineConsensusOptions, type BorderlineConsensusResult } from "./pipeline/borderline-consensus-runner.js";
|
|
19
20
|
import { CompositeTaskSource, ContentLakeTaskSource, RepoTaskSource } from "./adapters/task-sources/index.js";
|
|
20
21
|
/**
|
|
21
22
|
* Create a fully wired AppContext from resolved configuration.
|
|
@@ -83,3 +84,38 @@ export declare function createTaskSource(config: ResolvedConfig): CompositeTaskS
|
|
|
83
84
|
* explicit mode whitelists.
|
|
84
85
|
*/
|
|
85
86
|
export declare const FRAMEWORK_ASSERTIONS: AssertionRegistration[];
|
|
87
|
+
/**
|
|
88
|
+
* Severity boundaries from `packages/eval/config/thresholds.ts`
|
|
89
|
+
* (severity.critical/warning/info `composite-below` at L50/54/58 — 30, 50,
|
|
90
|
+
* 60). The borderline detector flags a judgment when its score is within
|
|
91
|
+
* ±5 of any of these. Composition-root reads them ONCE and threads the
|
|
92
|
+
* typed `readonly number[]` into `runBorderlineConsensus` rather than
|
|
93
|
+
* re-deriving them at each call site (Pitfall 5 — single source of truth
|
|
94
|
+
* for the scale).
|
|
95
|
+
*/
|
|
96
|
+
export declare const BORDERLINE_SEVERITY_THRESHOLDS: readonly number[];
|
|
97
|
+
/**
|
|
98
|
+
* Default replications per borderline judgment when the caller's
|
|
99
|
+
* `RepoConfig.execution.borderlineReplications` is unset (locked answer
|
|
100
|
+
* #4 in plan 03-04). Three replications + the original score = four
|
|
101
|
+
* scores per consistency record, which is the minimum that produces a
|
|
102
|
+
* non-degenerate stdDev / median split.
|
|
103
|
+
*/
|
|
104
|
+
export declare const DEFAULT_BORDERLINE_REPLICATIONS = 3;
|
|
105
|
+
/**
|
|
106
|
+
* Factory for the borderline-consensus runner. Returns a function that
|
|
107
|
+
* applies the severity-threshold and replication defaults from
|
|
108
|
+
* composition-root, leaving the live grader entry point (the `regrade`
|
|
109
|
+
* callback) and the candidate `judgments` array as runtime inputs.
|
|
110
|
+
*
|
|
111
|
+
* The pipeline-side caller (currently `pipeline/calculate-scores.ts`'s
|
|
112
|
+
* post-extraction junction) supplies the `regrade` callback that maps a
|
|
113
|
+
* `GraderJudgment` to a fresh score via the response/rubric text from
|
|
114
|
+
* the original Promptfoo result. See the runner's header for the
|
|
115
|
+
* rationale on injecting the regrader rather than calling `gradeOnce`
|
|
116
|
+
* inline (Pitfall 6 — preserve the runner's purity wrt the existing
|
|
117
|
+
* grader-comparison split).
|
|
118
|
+
*/
|
|
119
|
+
export declare function createBorderlineConsensusRunner(opts: {
|
|
120
|
+
borderlineReplications?: number;
|
|
121
|
+
}): (args: Pick<BorderlineConsensusOptions, "judgments" | "logger" | "regrade">) => Promise<BorderlineConsensusResult>;
|
package/dist/composition-root.js
CHANGED
|
@@ -27,6 +27,7 @@ import { resolveUploadConcurrency, setDefaultUploadConcurrency, } from "./artifa
|
|
|
27
27
|
import { UploadMetrics } from "./artifact-capture/upload-metrics.js";
|
|
28
28
|
import { ContentLakeCacheAdapter } from "./adapters/cache/content-lake-cache.js";
|
|
29
29
|
import { AnthropicLLMClient, OpenAILLMClient } from "./adapters/llm/index.js";
|
|
30
|
+
import { runBorderlineConsensus, } from "./pipeline/borderline-consensus-runner.js";
|
|
30
31
|
import { loadExternalPresets } from "./pipeline/compiler/preset-loader.js";
|
|
31
32
|
import { FilesystemCache } from "./adapters/cache/filesystem-cache.js";
|
|
32
33
|
import { PromptfooEvalAdapter } from "./adapters/eval-runners/promptfoo-eval-adapter.js";
|
|
@@ -493,3 +494,50 @@ function createReportStore(config) {
|
|
|
493
494
|
undefined,
|
|
494
495
|
});
|
|
495
496
|
}
|
|
497
|
+
// ---------------------------------------------------------------------------
|
|
498
|
+
// Borderline-consensus wiring (Plan 03-04 / GRAD-04)
|
|
499
|
+
// ---------------------------------------------------------------------------
|
|
500
|
+
/**
 * Composite-score severity boundaries consumed by the borderline detector.
 *
 * The values mirror the `composite-below` cut-offs for
 * severity.critical / warning / info in
 * `packages/eval/config/thresholds.ts` (30, 50, 60). A judgment is flagged
 * as borderline when its score lies within ±5 of any of these boundaries.
 *
 * The list is defined once here and threaded into `runBorderlineConsensus`
 * instead of being re-derived at each call site (Pitfall 5 — single source
 * of truth for the scale).
 *
 * The array is frozen so the runtime value honours the `readonly number[]`
 * contract of its type declaration: an importer can no longer push to,
 * sort, or otherwise rewrite the shared scale in place.
 *
 * @type {readonly number[]}
 */
export const BORDERLINE_SEVERITY_THRESHOLDS = Object.freeze([30, 50, 60]);
|
|
512
|
+
/**
 * Number of replications run per borderline judgment when the caller's
 * `RepoConfig.execution.borderlineReplications` is unset (locked answer
 * #4 in plan 03-04). Three replications plus the original score give four
 * scores per consistency record, which is the minimum that produces a
 * non-degenerate stdDev / median split.
 */
export const DEFAULT_BORDERLINE_REPLICATIONS = 3;

/**
 * Factory for the borderline-consensus runner.
 *
 * Binds the severity thresholds and the replication count once, at the
 * composition root, and returns a function that only needs the runtime
 * inputs: the candidate `judgments`, the `regrade` callback (the live
 * grader entry point) and an optional `logger`.
 *
 * The pipeline-side caller (currently `pipeline/calculate-scores.ts`'s
 * post-extraction junction) supplies the `regrade` callback that maps a
 * `GraderJudgment` to a fresh score via the response/rubric text from the
 * original Promptfoo result. The regrader is injected rather than calling
 * `gradeOnce` inline (Pitfall 6 — preserve the runner's purity with
 * respect to the existing grader-comparison split); see the runner's
 * header for the full rationale.
 *
 * @param {{ borderlineReplications?: number }} [opts] Optional overrides.
 *   May be omitted entirely; a nullish `borderlineReplications` falls back
 *   to `DEFAULT_BORDERLINE_REPLICATIONS`. Any other value (including 0) is
 *   forwarded unchanged.
 * @returns {(args: { judgments: unknown, regrade: Function, logger?: unknown }) => Promise<unknown>}
 *   Runner that forwards to `runBorderlineConsensus` and returns its promise.
 */
export function createBorderlineConsensusRunner(opts = {}) {
    // `??` rather than `||` so an explicit 0 from config is not replaced.
    const replications = opts.borderlineReplications ?? DEFAULT_BORDERLINE_REPLICATIONS;
    return ({ judgments, logger, regrade }) => runBorderlineConsensus({
        judgments,
        // Forward `logger` only when one was supplied, so the key is absent
        // (not present-but-undefined) on the options object.
        ...(logger ? { logger } : {}),
        regrade,
        replications,
        thresholds: BORDERLINE_SEVERITY_THRESHOLDS,
    });
}
|