@sanity/ailf 4.6.0 → 5.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (119)
  1. package/canonical/grader-references/agent-harness-tools.yaml +42 -0
  2. package/canonical/grader-references/knowledge-probe-recall.yaml +36 -0
  3. package/canonical/grader-references/mcp-server-spec.yaml +51 -0
  4. package/canonical/grader-references/portable-text.yaml +48 -0
  5. package/config/rubrics.ts +38 -2
  6. package/dist/_vendor/ailf-core/artifact-registry.d.ts +60 -2
  7. package/dist/_vendor/ailf-core/artifact-registry.js +288 -7
  8. package/dist/_vendor/ailf-core/examples/index.d.ts +125 -26
  9. package/dist/_vendor/ailf-core/examples/index.js +146 -47
  10. package/dist/_vendor/ailf-core/ports/context.d.ts +8 -0
  11. package/dist/_vendor/ailf-core/ports/mode-handler.d.ts +15 -0
  12. package/dist/_vendor/ailf-core/schemas/branded-string.d.ts +40 -0
  13. package/dist/_vendor/ailf-core/schemas/branded-string.js +45 -0
  14. package/dist/_vendor/ailf-core/schemas/confidence-schema.d.ts +36 -0
  15. package/dist/_vendor/ailf-core/schemas/confidence-schema.js +32 -0
  16. package/dist/_vendor/ailf-core/schemas/eval-config.d.ts +1 -0
  17. package/dist/_vendor/ailf-core/schemas/eval-config.js +8 -4
  18. package/dist/_vendor/ailf-core/schemas/index.d.ts +2 -0
  19. package/dist/_vendor/ailf-core/schemas/index.js +9 -0
  20. package/dist/_vendor/ailf-core/schemas/pipeline-request.d.ts +1 -0
  21. package/dist/_vendor/ailf-core/schemas/pipeline-request.js +1 -0
  22. package/dist/_vendor/ailf-core/schemas/pipeline.d.ts +34 -8
  23. package/dist/_vendor/ailf-core/schemas/pipeline.js +23 -1
  24. package/dist/_vendor/ailf-core/services/diagnosis/registry.d.ts +40 -0
  25. package/dist/_vendor/ailf-core/services/diagnosis/registry.js +25 -0
  26. package/dist/_vendor/ailf-core/services/diagnosis-runner.d.ts +19 -0
  27. package/dist/_vendor/ailf-core/services/diagnosis-runner.js +19 -0
  28. package/dist/_vendor/ailf-core/services/index.d.ts +2 -0
  29. package/dist/_vendor/ailf-core/services/index.js +5 -0
  30. package/dist/_vendor/ailf-core/services/report-to-markdown.js +3 -2
  31. package/dist/_vendor/ailf-core/types/attribution.d.ts +82 -0
  32. package/dist/_vendor/ailf-core/types/attribution.js +18 -0
  33. package/dist/_vendor/ailf-core/types/branded-ids.d.ts +26 -1
  34. package/dist/_vendor/ailf-core/types/branded-ids.js +80 -4
  35. package/dist/_vendor/ailf-core/types/confidence.d.ts +1 -1
  36. package/dist/_vendor/ailf-core/types/confidence.js +7 -0
  37. package/dist/_vendor/ailf-core/types/diagnosis.d.ts +169 -0
  38. package/dist/_vendor/ailf-core/types/diagnosis.js +17 -0
  39. package/dist/_vendor/ailf-core/types/generalized-task.d.ts +16 -1
  40. package/dist/_vendor/ailf-core/types/grader-judgment.d.ts +125 -0
  41. package/dist/_vendor/ailf-core/types/grader-judgment.js +30 -0
  42. package/dist/_vendor/ailf-core/types/index.d.ts +80 -29
  43. package/dist/_vendor/ailf-core/types/index.js +15 -1
  44. package/dist/_vendor/ailf-core/types/legacy-grader-judgment.d.ts +55 -0
  45. package/dist/_vendor/ailf-core/types/legacy-grader-judgment.js +30 -0
  46. package/dist/_vendor/ailf-core/types/pipeline-request.d.ts +1 -0
  47. package/dist/_vendor/ailf-core/types/repo-config.d.ts +8 -0
  48. package/dist/_vendor/ailf-shared/document-ref.d.ts +1 -1
  49. package/dist/adapters/api-client/build-request.d.ts +1 -0
  50. package/dist/adapters/api-client/build-request.js +3 -0
  51. package/dist/adapters/attribution/attribution-meta-writer.d.ts +35 -0
  52. package/dist/adapters/attribution/attribution-meta-writer.js +34 -0
  53. package/dist/adapters/attribution/index.d.ts +9 -0
  54. package/dist/adapters/attribution/index.js +8 -0
  55. package/dist/adapters/attribution/per-entry-attribution-writer.d.ts +56 -0
  56. package/dist/adapters/attribution/per-entry-attribution-writer.js +49 -0
  57. package/dist/adapters/config-sources/file-config-adapter.js +1 -0
  58. package/dist/adapters/grader-outputs/index.d.ts +10 -0
  59. package/dist/adapters/grader-outputs/index.js +8 -0
  60. package/dist/adapters/grader-outputs/legacy/index.d.ts +11 -0
  61. package/dist/adapters/grader-outputs/legacy/index.js +10 -0
  62. package/dist/adapters/grader-outputs/legacy/promptfoo-grader-output-legacy.d.ts +49 -0
  63. package/dist/adapters/grader-outputs/legacy/promptfoo-grader-output-legacy.js +48 -0
  64. package/dist/adapters/grader-outputs/promptfoo-grader-output.d.ts +102 -0
  65. package/dist/adapters/grader-outputs/promptfoo-grader-output.js +93 -0
  66. package/dist/adapters/index.d.ts +3 -0
  67. package/dist/adapters/index.js +4 -0
  68. package/dist/adapters/task-sources/content-lake-task-source.d.ts +5 -1
  69. package/dist/adapters/task-sources/content-lake-task-source.js +28 -2
  70. package/dist/adapters/task-sources/repo-schemas.d.ts +79 -11
  71. package/dist/adapters/task-sources/repo-schemas.js +19 -2
  72. package/dist/commands/calculate-scores.js +1 -1
  73. package/dist/commands/explain-handler.js +1 -1
  74. package/dist/commands/lookup-doc.d.ts +1 -1
  75. package/dist/commands/lookup-doc.js +3 -3
  76. package/dist/commands/pipeline-action.d.ts +6 -0
  77. package/dist/commands/pipeline-action.js +2 -0
  78. package/dist/commands/remote-pipeline.js +1 -0
  79. package/dist/composition-root.d.ts +36 -0
  80. package/dist/composition-root.js +48 -0
  81. package/dist/config/rubrics.ts +38 -2
  82. package/dist/grader/agent-harness.d.ts +14 -0
  83. package/dist/grader/agent-harness.js +17 -0
  84. package/dist/grader/common.d.ts +17 -0
  85. package/dist/grader/common.js +21 -0
  86. package/dist/grader/index.d.ts +38 -0
  87. package/dist/grader/index.js +75 -0
  88. package/dist/grader/knowledge-probe.d.ts +14 -0
  89. package/dist/grader/knowledge-probe.js +18 -0
  90. package/dist/grader/literacy.d.ts +13 -0
  91. package/dist/grader/literacy.js +17 -0
  92. package/dist/grader/mcp.d.ts +14 -0
  93. package/dist/grader/mcp.js +18 -0
  94. package/dist/orchestration/build-app-context.js +1 -0
  95. package/dist/orchestration/build-step-sequence.js +5 -0
  96. package/dist/orchestration/steps/calculate-scores-step.js +23 -1
  97. package/dist/orchestration/steps/compute-attribution-step.d.ts +44 -0
  98. package/dist/orchestration/steps/compute-attribution-step.js +279 -0
  99. package/dist/orchestration/steps/gap-analysis-step.js +35 -7
  100. package/dist/orchestration/steps/index.d.ts +1 -0
  101. package/dist/orchestration/steps/index.js +1 -0
  102. package/dist/pipeline/attribution.d.ts +15 -0
  103. package/dist/pipeline/attribution.js +18 -9
  104. package/dist/pipeline/borderline-consensus-runner.d.ts +63 -0
  105. package/dist/pipeline/borderline-consensus-runner.js +124 -0
  106. package/dist/pipeline/borderline-detector.d.ts +24 -0
  107. package/dist/pipeline/borderline-detector.js +26 -0
  108. package/dist/pipeline/calculate-scores.d.ts +114 -3
  109. package/dist/pipeline/calculate-scores.js +426 -24
  110. package/dist/pipeline/compiler/literacy-bridge.d.ts +1 -1
  111. package/dist/pipeline/compiler/literacy-bridge.js +35 -17
  112. package/dist/pipeline/compiler/rubric-resolution.d.ts +15 -0
  113. package/dist/pipeline/compiler/rubric-resolution.js +9 -1
  114. package/dist/pipeline/compute-attribution.d.ts +80 -0
  115. package/dist/pipeline/compute-attribution.js +196 -0
  116. package/dist/pipeline/failure-modes.d.ts +52 -17
  117. package/dist/pipeline/failure-modes.js +178 -117
  118. package/dist/pipeline/map-request-to-config.js +1 -0
  119. package/package.json +6 -4
@@ -0,0 +1,93 @@
1
+ /**
2
+ * promptfoo-grader-output.ts — Zod schema for the structured grader output
3
+ * (GRAD-02) emitted by the promptfoo grader process and consumed by the
4
+ * eval pipeline.
5
+ *
6
+ * The schema asserts `satisfies z.ZodType<GraderJudgment>` against the
7
+ * canonical domain type in `packages/core/src/types/grader-judgment.ts`
8
+ * (D0045 / W0187) — drift between schema and type is a build error.
9
+ * The domain type was authored independently in Plan 01-01; this file
10
+ * authors ONLY the schema and never derives the domain type from the
11
+ * schema itself (no schema-derived self-reference allowed by D0045).
12
+ *
13
+ * `graderJudgmentsVersion` is co-located with the schema (VER-01 D-02 —
14
+ * source-of-truth file owns its version constant). Bumped by hand when
15
+ * the grader rubric, prompt template, or judgment shape changes.
16
+ *
17
+ * Phase 3 will replace the inline `JSON.parse` at
18
+ * `pipeline/calculate-scores.ts:380-392` (Pitfall #4) so all grader
19
+ * output flows through this schema.
20
+ *
21
+ * @see docs/decisions/D0045-type-architecture-and-contract-enforcement.md
22
+ * @see docs/decisions/D0049-shared-confidence-contract.md
23
+ * @see docs/decisions/D0052-judgment-ref-granularity.md
24
+ * @see docs/design-docs/actionability-ladder/03-structured-grader-judgments.md
25
+ */
26
+ import { z } from "zod";
27
+ import { brandedString, ConfidenceSchema } from "../../_vendor/ailf-core/schemas/index.js";
28
+ /**
29
+ * VER-01 D-02 — co-located version constant. Bumped by hand when the
30
+ * grader rubric, prompt template, or judgment shape changes in a way
31
+ * that should invalidate cached Diagnoses.
32
+ *
33
+ * Phase 3 GRAD-05 bumped this from `"0.1.0"` to `"1.0.0"` (semver
34
+ * major) — the additive GRAD-02 surface is now required + the schema
35
+ * is `.strict()`. AILF has no installed external base; the legacy
36
+ * parser at `./legacy/promptfoo-grader-output-legacy.ts` is the named
37
+ * consumer for already-stored historical reports.
38
+ */
39
+ export const graderJudgmentsVersion = "1.0.0";
40
+ const DocCitationRoleSchema = z.enum([
41
+ "supports",
42
+ "contradicts",
43
+ "missing",
44
+ "irrelevant",
45
+ ]);
46
+ const DocCitationSchema = z.object({
47
+ documentId: z.string().min(1),
48
+ slug: z.string().optional(),
49
+ role: DocCitationRoleSchema,
50
+ hallucinated: z.boolean().optional(),
51
+ });
52
+ const CriterionSubJudgmentSchema = z.object({
53
+ criterionId: z.string().min(1),
54
+ met: z.boolean(),
55
+ evidence: z.string().max(280),
56
+ confidence: ConfidenceSchema,
57
+ });
58
+ /**
59
+ * Canonical schema for {@link GraderJudgment}. Required fields mirror
60
+ * the existing pipeline core (Doc 03 §"existing, unchanged"):
61
+ * `taskId`, `modelId`, `dimension`, `reason`, `score`. Phase 3 GRAD-05
62
+ * has tightened the additive surface to required and added `.strict()`
63
+ * — the schema rejects unknown fields (defense-in-depth against future
64
+ * prompt-injection attempts that try to smuggle keys through the
65
+ * grader emission).
66
+ *
67
+ * Branded `JudgmentId` is represented at runtime by a non-empty string;
68
+ * the schema routes the brand through `brandedString<"JudgmentId">()`
69
+ * — the project's single audited cast site for branded-string
70
+ * schemas (project typescript rule: no `as` on `unknown`).
71
+ */
72
+ export const GraderJudgmentSchema = z
73
+ .object({
74
+ // ── Existing pipeline core (required — Doc 03 §"existing, unchanged") ─
75
+ taskId: z.string().min(1),
76
+ modelId: z.string().min(1),
77
+ dimension: z.string().min(1),
78
+ reason: z.string(),
79
+ score: z.number(),
80
+ outputFailure: z.boolean().optional(),
81
+ // ── GRAD-02 additive — required from Phase 3 GRAD-05 ───────────────
82
+ judgmentId: brandedString(),
83
+ subJudgments: z.array(CriterionSubJudgmentSchema),
84
+ docCitations: z.array(DocCitationSchema),
85
+ failureMode: z.string(),
86
+ confidence: ConfidenceSchema,
87
+ hallucinationCheckedAgainst: z.array(z.string()),
88
+ metadata: z.object({
89
+ graderModel: z.string().min(1),
90
+ graderJudgmentsVersion: z.string().min(1),
91
+ }),
92
+ })
93
+ .strict();
@@ -10,3 +10,6 @@ export { PromptfooEvalAdapter } from "./eval-runners/index.js";
10
10
  export { ConsoleLogger, type ConsoleLoggerOptions, JsonLogger, QuietLogger, } from "./loggers/index.js";
11
11
  export { CliConfigAdapter, FileConfigAdapter } from "./config-sources/index.js";
12
12
  export { DtsPackageSurface, InMemoryPackageSurface, type DtsPackageSurfaceOptions, type PackageRootResolver, parseDtsExports, type ParsedDtsExports, } from "./package-surface/index.js";
13
+ export { GraderJudgmentSchema, graderJudgmentsVersion, } from "./grader-outputs/index.js";
14
+ export { AttributionMetaSchema, JudgmentAttributionSchema, } from "./attribution/index.js";
15
+ export type { AttributionMeta, DocAttribution, GraderJudgment, JudgmentAttribution, } from "../_vendor/ailf-core/index.d.ts";
@@ -10,3 +10,7 @@ export { PromptfooEvalAdapter } from "./eval-runners/index.js";
10
10
  export { ConsoleLogger, JsonLogger, QuietLogger, } from "./loggers/index.js";
11
11
  export { CliConfigAdapter, FileConfigAdapter } from "./config-sources/index.js";
12
12
  export { DtsPackageSurface, InMemoryPackageSurface, parseDtsExports, } from "./package-surface/index.js";
13
+ // Phase 1 Plan 02 — actionability-ladder adapter schemas (GRAD-02, ATTR-01).
14
+ // Named re-exports only (W0124 / D0045).
15
+ export { GraderJudgmentSchema, graderJudgmentsVersion, } from "./grader-outputs/index.js";
16
+ export { AttributionMetaSchema, JudgmentAttributionSchema, } from "./attribution/index.js";
@@ -55,9 +55,13 @@ interface ContentLakeCanonicalDoc {
55
55
  sectionSlug?: string;
56
56
  slug?: string;
57
57
  }
58
+ interface ContentLakeCriterion {
59
+ id?: string;
60
+ text?: string;
61
+ }
58
62
  /** Assertion shape from the Content Lake (mirrors the Studio schema). */
59
63
  interface ContentLakeAssertion {
60
- criteria?: string[];
64
+ criteria?: ContentLakeCriterion[];
61
65
  template?: string;
62
66
  threshold?: number;
63
67
  type?: string;
@@ -73,7 +73,13 @@ const TASKS_QUERY = /* groq */ `
73
73
  perspective,
74
74
  reason
75
75
  },
76
- "assertions": coalesce(assertions, assert),
76
+ "assertions": coalesce(assertions, assert)[] {
77
+ type, template, weight, value, threshold,
78
+ "criteria": criteria[] {
79
+ "id": coalesce(id.current, _key),
80
+ "text": coalesce(text, @)
81
+ }
82
+ },
77
83
  rawAssert,
78
84
  baseline,
79
85
  tags,
@@ -256,8 +262,28 @@ function mapAssertions(raw) {
256
262
  .filter((a) => !!a.type)
257
263
  .map((a) => {
258
264
  if (a.type === "llm-rubric" && a.template && a.criteria) {
265
+ // Tighten the runtime contract: the GROQ projection's
266
+ // `coalesce(text, @)` falls through to the entire criterion
267
+ // element when `text` is missing, so a partial legacy criterion
268
+ // like `{_key: "abc"}` arrives here as `{ id: "abc", text: {...} }`
269
+ // — `text` set to the whole `@` object. Explicit type checks
270
+ // drop those with a diagnostic, instead of letting the non-string
271
+ // `text` propagate until the outer ContentLakeAuthorableTaskSchema
272
+ // parse fails deep inside the assertions array (noisy diagnostic).
259
273
  return {
260
- criteria: a.criteria,
274
+ criteria: a.criteria
275
+ .filter((c) => {
276
+ if (!c)
277
+ return false;
278
+ const idOk = typeof c.id === "string" && c.id.length > 0;
279
+ const textOk = typeof c.text === "string" && c.text.length > 0;
280
+ if (!idOk || !textOk) {
281
+ console.warn(`[ContentLakeTaskSource] dropping malformed criterion: ${JSON.stringify(c).slice(0, 100)}`);
282
+ return false;
283
+ }
284
+ return true;
285
+ })
286
+ .map((c) => ({ id: c.id, text: c.text })),
261
287
  template: a.template,
262
288
  type: "llm-rubric",
263
289
  ...(a.weight !== undefined ? { weight: a.weight } : {}),
@@ -32,6 +32,40 @@ export type CuratedAssertionType = (typeof CURATED_ASSERTION_TYPES)[number];
32
32
  */
33
33
  export declare const RUBRIC_TEMPLATE_NAMES: readonly ["task-completion", "code-correctness", "doc-coverage", "mcp-input-validation", "mcp-output-correctness", "mcp-error-handling", "mcp-security", "factual-correctness", "completeness", "currency", "process-quality", "agent-output", "agent-tool-usage"];
34
34
  export type RubricTemplateName = (typeof RUBRIC_TEMPLATE_NAMES)[number];
35
+ /**
36
+ * A single criterion within an llm-rubric assertion. Stable id-text pair.
37
+ */
38
+ export declare const CriterionRefSchema: z.ZodObject<{
39
+ id: z.ZodString;
40
+ text: z.ZodString;
41
+ }, z.core.$strip>;
42
+ /**
43
+ * A templated LLM-rubric assertion — uses one of the predefined rubric
44
+ * templates with author-supplied criteria.
45
+ */
46
+ export declare const TemplatedAssertionSchema: z.ZodObject<{
47
+ type: z.ZodLiteral<"llm-rubric">;
48
+ template: z.ZodEnum<{
49
+ "task-completion": "task-completion";
50
+ "code-correctness": "code-correctness";
51
+ "doc-coverage": "doc-coverage";
52
+ "mcp-input-validation": "mcp-input-validation";
53
+ "mcp-output-correctness": "mcp-output-correctness";
54
+ "mcp-error-handling": "mcp-error-handling";
55
+ "mcp-security": "mcp-security";
56
+ "factual-correctness": "factual-correctness";
57
+ completeness: "completeness";
58
+ currency: "currency";
59
+ "process-quality": "process-quality";
60
+ "agent-output": "agent-output";
61
+ "agent-tool-usage": "agent-tool-usage";
62
+ }>;
63
+ criteria: z.ZodArray<z.ZodObject<{
64
+ id: z.ZodString;
65
+ text: z.ZodString;
66
+ }, z.core.$strip>>;
67
+ weight: z.ZodOptional<z.ZodNumber>;
68
+ }, z.core.$strip>;
35
69
  /**
36
70
  * Zod schema for a single task definition — a mode-discriminated union
37
71
  * mirroring `GeneralizedTaskDefinition`.
@@ -73,7 +107,10 @@ export declare const CanonicalTaskSchema: z.ZodDiscriminatedUnion<[z.ZodObject<{
73
107
  "agent-output": "agent-output";
74
108
  "agent-tool-usage": "agent-tool-usage";
75
109
  }>;
76
- criteria: z.ZodArray<z.ZodString>;
110
+ criteria: z.ZodArray<z.ZodObject<{
111
+ id: z.ZodString;
112
+ text: z.ZodString;
113
+ }, z.core.$strip>>;
77
114
  weight: z.ZodOptional<z.ZodNumber>;
78
115
  }, z.core.$strip>, z.ZodObject<{
79
116
  type: z.ZodEnum<{
@@ -187,7 +224,10 @@ export declare const CanonicalTaskSchema: z.ZodDiscriminatedUnion<[z.ZodObject<{
187
224
  "agent-output": "agent-output";
188
225
  "agent-tool-usage": "agent-tool-usage";
189
226
  }>;
190
- criteria: z.ZodArray<z.ZodString>;
227
+ criteria: z.ZodArray<z.ZodObject<{
228
+ id: z.ZodString;
229
+ text: z.ZodString;
230
+ }, z.core.$strip>>;
191
231
  weight: z.ZodOptional<z.ZodNumber>;
192
232
  }, z.core.$strip>, z.ZodObject<{
193
233
  type: z.ZodEnum<{
@@ -341,7 +381,10 @@ export declare const CanonicalTaskSchema: z.ZodDiscriminatedUnion<[z.ZodObject<{
341
381
  "agent-output": "agent-output";
342
382
  "agent-tool-usage": "agent-tool-usage";
343
383
  }>;
344
- criteria: z.ZodArray<z.ZodString>;
384
+ criteria: z.ZodArray<z.ZodObject<{
385
+ id: z.ZodString;
386
+ text: z.ZodString;
387
+ }, z.core.$strip>>;
345
388
  weight: z.ZodOptional<z.ZodNumber>;
346
389
  }, z.core.$strip>, z.ZodObject<{
347
390
  type: z.ZodEnum<{
@@ -472,7 +515,10 @@ export declare const CanonicalTaskSchema: z.ZodDiscriminatedUnion<[z.ZodObject<{
472
515
  "agent-output": "agent-output";
473
516
  "agent-tool-usage": "agent-tool-usage";
474
517
  }>;
475
- criteria: z.ZodArray<z.ZodString>;
518
+ criteria: z.ZodArray<z.ZodObject<{
519
+ id: z.ZodString;
520
+ text: z.ZodString;
521
+ }, z.core.$strip>>;
476
522
  weight: z.ZodOptional<z.ZodNumber>;
477
523
  }, z.core.$strip>, z.ZodObject<{
478
524
  type: z.ZodEnum<{
@@ -591,7 +637,10 @@ export declare const CanonicalTaskSchema: z.ZodDiscriminatedUnion<[z.ZodObject<{
591
637
  "agent-output": "agent-output";
592
638
  "agent-tool-usage": "agent-tool-usage";
593
639
  }>;
594
- criteria: z.ZodArray<z.ZodString>;
640
+ criteria: z.ZodArray<z.ZodObject<{
641
+ id: z.ZodString;
642
+ text: z.ZodString;
643
+ }, z.core.$strip>>;
595
644
  weight: z.ZodOptional<z.ZodNumber>;
596
645
  }, z.core.$strip>, z.ZodObject<{
597
646
  type: z.ZodEnum<{
@@ -699,7 +748,10 @@ export declare const ContentLakeAuthorableTaskSchema: z.ZodObject<{
699
748
  "agent-output": "agent-output";
700
749
  "agent-tool-usage": "agent-tool-usage";
701
750
  }>;
702
- criteria: z.ZodArray<z.ZodString>;
751
+ criteria: z.ZodArray<z.ZodObject<{
752
+ id: z.ZodString;
753
+ text: z.ZodString;
754
+ }, z.core.$strip>>;
703
755
  weight: z.ZodOptional<z.ZodNumber>;
704
756
  }, z.core.$strip>, z.ZodObject<{
705
757
  type: z.ZodEnum<{
@@ -819,7 +871,10 @@ export declare const CanonicalTaskFileSchema: z.ZodArray<z.ZodDiscriminatedUnion
819
871
  "agent-output": "agent-output";
820
872
  "agent-tool-usage": "agent-tool-usage";
821
873
  }>;
822
- criteria: z.ZodArray<z.ZodString>;
874
+ criteria: z.ZodArray<z.ZodObject<{
875
+ id: z.ZodString;
876
+ text: z.ZodString;
877
+ }, z.core.$strip>>;
823
878
  weight: z.ZodOptional<z.ZodNumber>;
824
879
  }, z.core.$strip>, z.ZodObject<{
825
880
  type: z.ZodEnum<{
@@ -933,7 +988,10 @@ export declare const CanonicalTaskFileSchema: z.ZodArray<z.ZodDiscriminatedUnion
933
988
  "agent-output": "agent-output";
934
989
  "agent-tool-usage": "agent-tool-usage";
935
990
  }>;
936
- criteria: z.ZodArray<z.ZodString>;
991
+ criteria: z.ZodArray<z.ZodObject<{
992
+ id: z.ZodString;
993
+ text: z.ZodString;
994
+ }, z.core.$strip>>;
937
995
  weight: z.ZodOptional<z.ZodNumber>;
938
996
  }, z.core.$strip>, z.ZodObject<{
939
997
  type: z.ZodEnum<{
@@ -1087,7 +1145,10 @@ export declare const CanonicalTaskFileSchema: z.ZodArray<z.ZodDiscriminatedUnion
1087
1145
  "agent-output": "agent-output";
1088
1146
  "agent-tool-usage": "agent-tool-usage";
1089
1147
  }>;
1090
- criteria: z.ZodArray<z.ZodString>;
1148
+ criteria: z.ZodArray<z.ZodObject<{
1149
+ id: z.ZodString;
1150
+ text: z.ZodString;
1151
+ }, z.core.$strip>>;
1091
1152
  weight: z.ZodOptional<z.ZodNumber>;
1092
1153
  }, z.core.$strip>, z.ZodObject<{
1093
1154
  type: z.ZodEnum<{
@@ -1218,7 +1279,10 @@ export declare const CanonicalTaskFileSchema: z.ZodArray<z.ZodDiscriminatedUnion
1218
1279
  "agent-output": "agent-output";
1219
1280
  "agent-tool-usage": "agent-tool-usage";
1220
1281
  }>;
1221
- criteria: z.ZodArray<z.ZodString>;
1282
+ criteria: z.ZodArray<z.ZodObject<{
1283
+ id: z.ZodString;
1284
+ text: z.ZodString;
1285
+ }, z.core.$strip>>;
1222
1286
  weight: z.ZodOptional<z.ZodNumber>;
1223
1287
  }, z.core.$strip>, z.ZodObject<{
1224
1288
  type: z.ZodEnum<{
@@ -1337,7 +1401,10 @@ export declare const CanonicalTaskFileSchema: z.ZodArray<z.ZodDiscriminatedUnion
1337
1401
  "agent-output": "agent-output";
1338
1402
  "agent-tool-usage": "agent-tool-usage";
1339
1403
  }>;
1340
- criteria: z.ZodArray<z.ZodString>;
1404
+ criteria: z.ZodArray<z.ZodObject<{
1405
+ id: z.ZodString;
1406
+ text: z.ZodString;
1407
+ }, z.core.$strip>>;
1341
1408
  weight: z.ZodOptional<z.ZodNumber>;
1342
1409
  }, z.core.$strip>, z.ZodObject<{
1343
1410
  type: z.ZodEnum<{
@@ -1468,6 +1535,7 @@ export declare const RepoConfigSchema: z.ZodObject<{
1468
1535
  execution: z.ZodOptional<z.ZodObject<{
1469
1536
  concurrency: z.ZodOptional<z.ZodNumber>;
1470
1537
  graderReplications: z.ZodOptional<z.ZodNumber>;
1538
+ borderlineReplications: z.ZodOptional<z.ZodNumber>;
1471
1539
  gapAnalysis: z.ZodOptional<z.ZodBoolean>;
1472
1540
  apiUrl: z.ZodOptional<z.ZodString>;
1473
1541
  }, z.core.$strip>>;
@@ -111,14 +111,26 @@ const CanonicalDocRefSchema = z.union([
111
111
  // ---------------------------------------------------------------------------
112
112
  // Assertion schemas
113
113
  // ---------------------------------------------------------------------------
114
+ /**
115
+ * A single criterion within an llm-rubric assertion. Stable id-text pair.
116
+ */
117
+ export const CriterionRefSchema = z.object({
118
+ id: z
119
+ .string()
120
+ .min(1)
121
+ .regex(/^[a-z0-9][a-z0-9-]*$/, {
122
+ message: "criterion id must be lowercase alphanumeric with hyphens",
123
+ }),
124
+ text: z.string().min(1),
125
+ });
114
126
  /**
115
127
  * A templated LLM-rubric assertion — uses one of the predefined rubric
116
128
  * templates with author-supplied criteria.
117
129
  */
118
- const TemplatedAssertionSchema = z.object({
130
+ export const TemplatedAssertionSchema = z.object({
119
131
  type: z.literal("llm-rubric"),
120
132
  template: z.enum(RUBRIC_TEMPLATE_NAMES),
121
- criteria: z.array(z.string().min(1)).min(1),
133
+ criteria: z.array(CriterionRefSchema).min(1),
122
134
  weight: z.number().optional(),
123
135
  });
124
136
  /**
@@ -562,6 +574,11 @@ const ExecutionConfigSchema = z
562
574
  .object({
563
575
  concurrency: z.number().int().positive().optional(),
564
576
  graderReplications: z.number().int().positive().optional(),
577
+ /**
578
+ * Plan 03-04 GRAD-04 — replications per borderline judgment.
579
+ * Default 3 (composition-root). Positive integer.
580
+ */
581
+ borderlineReplications: z.number().int().positive().optional(),
565
582
  gapAnalysis: z.boolean().optional(),
566
583
  apiUrl: z.string().url().optional(),
567
584
  })
@@ -38,7 +38,7 @@ export function createCalculateScoresCommand() {
38
38
  remote: false,
39
39
  apiUrl: "https://ailf-api.sanity.build",
40
40
  });
41
- const result = calculateAndWriteScores({
41
+ const result = await calculateAndWriteScores({
42
42
  resultsPath,
43
43
  rootDir: ctx.config.rootDir,
44
44
  source: opts.source,
@@ -298,7 +298,7 @@ const EXPLAIN_REGISTRY = {
298
298
  ],
299
299
  },
300
300
  "lookup-doc": {
301
- description: "Search Sanity for documentation articles by keyword (find slugs for canonicalDocs)",
301
+ description: "Search Sanity for documentation articles by keyword (find slugs for contextDocs)",
302
302
  steps: [
303
303
  {
304
304
  cacheStatus: "miss",
@@ -1,7 +1,7 @@
1
1
  /**
2
2
  * lookup-doc command — search Sanity for documentation articles by keyword.
3
3
  *
4
- * Helps external contributors find the correct `slug` for canonicalDocs
4
+ * Helps external contributors find the correct `slug` for contextDocs
5
5
  * references without needing to browse the CMS or guess from URLs.
6
6
  *
7
7
  * Usage:
@@ -1,7 +1,7 @@
1
1
  /**
2
2
  * lookup-doc command — search Sanity for documentation articles by keyword.
3
3
  *
4
- * Helps external contributors find the correct `slug` for canonicalDocs
4
+ * Helps external contributors find the correct `slug` for contextDocs
5
5
  * references without needing to browse the CMS or guess from URLs.
6
6
  *
7
7
  * Usage:
@@ -14,7 +14,7 @@
14
14
  import { Command } from "commander";
15
15
  export function createLookupDocCommand() {
16
16
  return new Command("lookup-doc")
17
- .description("Search Sanity docs by keyword — find slugs for canonicalDocs references")
17
+ .description("Search Sanity docs by keyword — find slugs for contextDocs references")
18
18
  .argument("<keyword>", "Search keyword (matches title and slug)")
19
19
  .option("-l, --limit <n>", "Maximum results to show", parseInt, 10)
20
20
  .option("-s, --source <name>", "Documentation source (from sources.yaml)")
@@ -73,7 +73,7 @@ export function createLookupDocCommand() {
73
73
  console.log(` ${"".padEnd(maxSlugLen + 6)} │ Section: ${section}\n`);
74
74
  }
75
75
  console.log(" Usage in .ailf/tasks/*.yaml:\n");
76
- console.log(" canonicalDocs:");
76
+ console.log(" contextDocs:");
77
77
  console.log(` - slug: ${results[0].slug}`);
78
78
  console.log(` reason: "${results[0].title}"`);
79
79
  if (results[0].sectionSlug) {
@@ -27,6 +27,12 @@ export interface ResolvedOptions {
27
27
  dryRun: boolean;
28
28
  gapAnalysisEnabled: boolean;
29
29
  graderReplications?: number;
30
+ /**
31
+ * Replications per borderline judgment for the GRAD-04 intra-grader
32
+ * consensus pass. Sourced from `.ailf/config.yaml`'s
33
+ * `execution.borderlineReplications`.
34
+ */
35
+ borderlineReplications?: number;
30
36
  /** Grader context policy from `.ailf/config.yaml` `grader.context` */
31
37
  graderContext?: "rubric-only" | "with-docs";
32
38
  headerArgs: string[];
@@ -248,6 +248,7 @@ export function computeResolvedOptions(opts) {
248
248
  // env var (where one exists) > .ailf/config.yaml > built-in default
249
249
  const concurrency = repoConfig?.execution?.concurrency;
250
250
  const graderReplications = repoConfig?.execution?.graderReplications;
251
+ const borderlineReplications = repoConfig?.execution?.borderlineReplications;
251
252
  const gapAnalysisEnabled = repoConfig?.execution?.gapAnalysis ?? true;
252
253
  // Grader context policy. Cascade: env var > .ailf/config.yaml > unset
253
254
  // (defaults to rubric-only at the EvalConfig boundary). The env var is the
@@ -291,6 +292,7 @@ export function computeResolvedOptions(opts) {
291
292
  dryRun: opts.dryRun,
292
293
  gapAnalysisEnabled,
293
294
  graderReplications,
295
+ borderlineReplications,
294
296
  graderContext,
295
297
  headerArgs,
296
298
  impactSummary,
@@ -142,6 +142,7 @@ function toConfigSlice(opts) {
142
142
  perspectiveOverride: opts.perspectiveOverride,
143
143
  graderContext: opts.graderContext,
144
144
  graderReplications: opts.graderReplications,
145
+ borderlineReplications: opts.borderlineReplications,
145
146
  gapAnalysisEnabled: opts.gapAnalysisEnabled,
146
147
  noRemoteCache: opts.noRemoteCache,
147
148
  // D0037 / W0069 caller envelope overrides — flags override env vars
@@ -16,6 +16,7 @@
16
16
  * @see docs/archive/exec-plans/ports-and-adapters/phase-7-composition-root.md
17
17
  */
18
18
  import { type AppContext, type ArtifactWriter, type ArtifactWriterProgressOptions, type AssertionRegistration, type LLMClient, type Logger, type ResolvedConfig } from "./_vendor/ailf-core/index.d.ts";
19
+ import { type BorderlineConsensusOptions, type BorderlineConsensusResult } from "./pipeline/borderline-consensus-runner.js";
19
20
  import { CompositeTaskSource, ContentLakeTaskSource, RepoTaskSource } from "./adapters/task-sources/index.js";
20
21
  /**
21
22
  * Create a fully wired AppContext from resolved configuration.
@@ -83,3 +84,38 @@ export declare function createTaskSource(config: ResolvedConfig): CompositeTaskS
83
84
  * explicit mode whitelists.
84
85
  */
85
86
  export declare const FRAMEWORK_ASSERTIONS: AssertionRegistration[];
87
+ /**
88
+ * Severity boundaries from `packages/eval/config/thresholds.ts`
89
+ * (severity.critical/warning/info `composite-below` at L50/54/58 — 30, 50,
90
+ * 60). The borderline detector flags a judgment when its score is within
91
+ * ±5 of any of these. Composition-root reads them ONCE and threads the
92
+ * typed `readonly number[]` into `runBorderlineConsensus` rather than
93
+ * re-deriving them at each call site (Pitfall 5 — single source of truth
94
+ * for the scale).
95
+ */
96
+ export declare const BORDERLINE_SEVERITY_THRESHOLDS: readonly number[];
97
+ /**
98
+ * Default replications per borderline judgment when the caller's
99
+ * `RepoConfig.execution.borderlineReplications` is unset (locked answer
100
+ * #4 in plan 03-04). Three replications + the original score = four
101
+ * scores per consistency record, which is the minimum that produces a
102
+ * non-degenerate stdDev / median split.
103
+ */
104
+ export declare const DEFAULT_BORDERLINE_REPLICATIONS = 3;
105
+ /**
106
+ * Factory for the borderline-consensus runner. Returns a function that
107
+ * applies the severity-threshold and replication defaults from
108
+ * composition-root, leaving the live grader entry point (the `regrade`
109
+ * callback) and the candidate `judgments` array as runtime inputs.
110
+ *
111
+ * The pipeline-side caller (currently `pipeline/calculate-scores.ts`'s
112
+ * post-extraction junction) supplies the `regrade` callback that maps a
113
+ * `GraderJudgment` to a fresh score via the response/rubric text from
114
+ * the original Promptfoo result. See the runner's header for the
115
+ * rationale on injecting the regrader rather than calling `gradeOnce`
116
+ * inline (Pitfall 6 — preserve the runner's purity wrt the existing
117
+ * grader-comparison split).
118
+ */
119
+ export declare function createBorderlineConsensusRunner(opts: {
120
+ borderlineReplications?: number;
121
+ }): (args: Pick<BorderlineConsensusOptions, "judgments" | "logger" | "regrade">) => Promise<BorderlineConsensusResult>;
@@ -27,6 +27,7 @@ import { resolveUploadConcurrency, setDefaultUploadConcurrency, } from "./artifa
27
27
  import { UploadMetrics } from "./artifact-capture/upload-metrics.js";
28
28
  import { ContentLakeCacheAdapter } from "./adapters/cache/content-lake-cache.js";
29
29
  import { AnthropicLLMClient, OpenAILLMClient } from "./adapters/llm/index.js";
30
+ import { runBorderlineConsensus, } from "./pipeline/borderline-consensus-runner.js";
30
31
  import { loadExternalPresets } from "./pipeline/compiler/preset-loader.js";
31
32
  import { FilesystemCache } from "./adapters/cache/filesystem-cache.js";
32
33
  import { PromptfooEvalAdapter } from "./adapters/eval-runners/promptfoo-eval-adapter.js";
@@ -493,3 +494,50 @@ function createReportStore(config) {
493
494
  undefined,
494
495
  });
495
496
  }
497
+ // ---------------------------------------------------------------------------
498
+ // Borderline-consensus wiring (Plan 03-04 / GRAD-04)
499
+ // ---------------------------------------------------------------------------
500
+ /**
501
+ * Severity boundaries from `packages/eval/config/thresholds.ts`
502
+ * (severity.critical/warning/info `composite-below` at L50/54/58 — 30, 50,
503
+ * 60). The borderline detector flags a judgment when its score is within
504
+ * ±5 of any of these. Composition-root reads them ONCE and threads the
505
+ * typed `readonly number[]` into `runBorderlineConsensus` rather than
506
+ * re-deriving them at each call site (Pitfall 5 — single source of truth
507
+ * for the scale).
508
+ */
509
+ export const BORDERLINE_SEVERITY_THRESHOLDS = [
510
+ 30, 50, 60,
511
+ ];
512
+ /**
513
+ * Default replications per borderline judgment when the caller's
514
+ * `RepoConfig.execution.borderlineReplications` is unset (locked answer
515
+ * #4 in plan 03-04). Three replications + the original score = four
516
+ * scores per consistency record, which is the minimum that produces a
517
+ * non-degenerate stdDev / median split.
518
+ */
519
+ export const DEFAULT_BORDERLINE_REPLICATIONS = 3;
520
+ /**
521
+ * Factory for the borderline-consensus runner. Returns a function that
522
+ * applies the severity-threshold and replication defaults from
523
+ * composition-root, leaving the live grader entry point (the `regrade`
524
+ * callback) and the candidate `judgments` array as runtime inputs.
525
+ *
526
+ * The pipeline-side caller (currently `pipeline/calculate-scores.ts`'s
527
+ * post-extraction junction) supplies the `regrade` callback that maps a
528
+ * `GraderJudgment` to a fresh score via the response/rubric text from
529
+ * the original Promptfoo result. See the runner's header for the
530
+ * rationale on injecting the regrader rather than calling `gradeOnce`
531
+ * inline (Pitfall 6 — preserve the runner's purity wrt the existing
532
+ * grader-comparison split).
533
+ */
534
+ export function createBorderlineConsensusRunner(opts) {
535
+ const replications = opts.borderlineReplications ?? DEFAULT_BORDERLINE_REPLICATIONS;
536
+ return (args) => runBorderlineConsensus({
537
+ judgments: args.judgments,
538
+ ...(args.logger ? { logger: args.logger } : {}),
539
+ regrade: args.regrade,
540
+ replications,
541
+ thresholds: BORDERLINE_SEVERITY_THRESHOLDS,
542
+ });
543
+ }