@sanity/ailf 4.5.0 → 5.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/canonical/grader-references/agent-harness-tools.yaml +42 -0
- package/canonical/grader-references/knowledge-probe-recall.yaml +36 -0
- package/canonical/grader-references/mcp-server-spec.yaml +51 -0
- package/canonical/grader-references/portable-text.yaml +48 -0
- package/config/rubrics.ts +38 -2
- package/dist/_vendor/ailf-core/artifact-registry.d.ts +197 -2
- package/dist/_vendor/ailf-core/artifact-registry.js +419 -5
- package/dist/_vendor/ailf-core/examples/index.d.ts +125 -26
- package/dist/_vendor/ailf-core/examples/index.js +146 -47
- package/dist/_vendor/ailf-core/ports/context.d.ts +26 -0
- package/dist/_vendor/ailf-core/ports/index.d.ts +2 -0
- package/dist/_vendor/ailf-core/ports/index.js +1 -0
- package/dist/_vendor/ailf-core/ports/llm-client.d.ts +112 -0
- package/dist/_vendor/ailf-core/ports/llm-client.js +68 -0
- package/dist/_vendor/ailf-core/ports/mode-handler.d.ts +15 -0
- package/dist/_vendor/ailf-core/schemas/branded-string.d.ts +40 -0
- package/dist/_vendor/ailf-core/schemas/branded-string.js +45 -0
- package/dist/_vendor/ailf-core/schemas/confidence-schema.d.ts +36 -0
- package/dist/_vendor/ailf-core/schemas/confidence-schema.js +32 -0
- package/dist/_vendor/ailf-core/schemas/eval-config.d.ts +1 -0
- package/dist/_vendor/ailf-core/schemas/eval-config.js +8 -4
- package/dist/_vendor/ailf-core/schemas/index.d.ts +2 -0
- package/dist/_vendor/ailf-core/schemas/index.js +9 -0
- package/dist/_vendor/ailf-core/schemas/pipeline-request.d.ts +1 -0
- package/dist/_vendor/ailf-core/schemas/pipeline-request.js +1 -0
- package/dist/_vendor/ailf-core/schemas/pipeline.d.ts +34 -8
- package/dist/_vendor/ailf-core/schemas/pipeline.js +23 -1
- package/dist/_vendor/ailf-core/services/diagnosis/registry.d.ts +40 -0
- package/dist/_vendor/ailf-core/services/diagnosis/registry.js +25 -0
- package/dist/_vendor/ailf-core/services/diagnosis-runner.d.ts +19 -0
- package/dist/_vendor/ailf-core/services/diagnosis-runner.js +19 -0
- package/dist/_vendor/ailf-core/services/index.d.ts +2 -0
- package/dist/_vendor/ailf-core/services/index.js +5 -0
- package/dist/_vendor/ailf-core/services/report-to-markdown.js +3 -2
- package/dist/_vendor/ailf-core/types/attribution.d.ts +82 -0
- package/dist/_vendor/ailf-core/types/attribution.js +18 -0
- package/dist/_vendor/ailf-core/types/branded-ids.d.ts +26 -1
- package/dist/_vendor/ailf-core/types/branded-ids.js +80 -4
- package/dist/_vendor/ailf-core/types/confidence.d.ts +68 -0
- package/dist/_vendor/ailf-core/types/confidence.js +56 -0
- package/dist/_vendor/ailf-core/types/diagnosis.d.ts +169 -0
- package/dist/_vendor/ailf-core/types/diagnosis.js +17 -0
- package/dist/_vendor/ailf-core/types/generalized-task.d.ts +16 -1
- package/dist/_vendor/ailf-core/types/grader-judgment.d.ts +125 -0
- package/dist/_vendor/ailf-core/types/grader-judgment.js +30 -0
- package/dist/_vendor/ailf-core/types/index.d.ts +82 -29
- package/dist/_vendor/ailf-core/types/index.js +16 -1
- package/dist/_vendor/ailf-core/types/legacy-grader-judgment.d.ts +55 -0
- package/dist/_vendor/ailf-core/types/legacy-grader-judgment.js +30 -0
- package/dist/_vendor/ailf-core/types/pipeline-request.d.ts +1 -0
- package/dist/_vendor/ailf-core/types/repo-config.d.ts +8 -0
- package/dist/_vendor/ailf-shared/document-ref.d.ts +1 -1
- package/dist/adapters/api-client/build-request.d.ts +1 -0
- package/dist/adapters/api-client/build-request.js +3 -0
- package/dist/adapters/attribution/attribution-meta-writer.d.ts +35 -0
- package/dist/adapters/attribution/attribution-meta-writer.js +34 -0
- package/dist/adapters/attribution/index.d.ts +9 -0
- package/dist/adapters/attribution/index.js +8 -0
- package/dist/adapters/attribution/per-entry-attribution-writer.d.ts +56 -0
- package/dist/adapters/attribution/per-entry-attribution-writer.js +49 -0
- package/dist/adapters/config-sources/file-config-adapter.js +1 -0
- package/dist/adapters/grader-outputs/index.d.ts +10 -0
- package/dist/adapters/grader-outputs/index.js +8 -0
- package/dist/adapters/grader-outputs/legacy/index.d.ts +11 -0
- package/dist/adapters/grader-outputs/legacy/index.js +10 -0
- package/dist/adapters/grader-outputs/legacy/promptfoo-grader-output-legacy.d.ts +49 -0
- package/dist/adapters/grader-outputs/legacy/promptfoo-grader-output-legacy.js +48 -0
- package/dist/adapters/grader-outputs/promptfoo-grader-output.d.ts +102 -0
- package/dist/adapters/grader-outputs/promptfoo-grader-output.js +93 -0
- package/dist/adapters/index.d.ts +3 -0
- package/dist/adapters/index.js +4 -0
- package/dist/adapters/llm/anthropic-llm-client.d.ts +48 -0
- package/dist/adapters/llm/anthropic-llm-client.js +205 -0
- package/dist/adapters/llm/fake-llm-client.d.ts +49 -0
- package/dist/adapters/llm/fake-llm-client.js +63 -0
- package/dist/adapters/llm/index.d.ts +9 -0
- package/dist/adapters/llm/index.js +4 -0
- package/dist/adapters/llm/openai-llm-client.d.ts +44 -0
- package/dist/adapters/llm/openai-llm-client.js +168 -0
- package/dist/adapters/llm/pricing.d.ts +12 -0
- package/dist/adapters/llm/pricing.js +8 -0
- package/dist/adapters/llm/retry.d.ts +56 -0
- package/dist/adapters/llm/retry.js +66 -0
- package/dist/adapters/task-sources/content-lake-task-source.d.ts +5 -1
- package/dist/adapters/task-sources/content-lake-task-source.js +28 -2
- package/dist/adapters/task-sources/repo-schemas.d.ts +90 -22
- package/dist/adapters/task-sources/repo-schemas.js +19 -2
- package/dist/artifact-capture/api-gateway-artifact-writer.js +2 -1
- package/dist/artifact-capture/batching-api-gateway-artifact-writer.js +2 -1
- package/dist/artifact-capture/gcs-artifact-writer.js +3 -1
- package/dist/artifact-capture/local-fs-artifact-writer.js +3 -1
- package/dist/commands/calculate-scores.js +1 -1
- package/dist/commands/explain-handler.js +1 -1
- package/dist/commands/lookup-doc.d.ts +1 -1
- package/dist/commands/lookup-doc.js +3 -3
- package/dist/commands/pipeline-action.d.ts +6 -0
- package/dist/commands/pipeline-action.js +2 -0
- package/dist/commands/remote-pipeline.js +1 -0
- package/dist/composition-root.d.ts +59 -1
- package/dist/composition-root.js +95 -0
- package/dist/config/rubrics.ts +38 -2
- package/dist/grader/agent-harness.d.ts +14 -0
- package/dist/grader/agent-harness.js +17 -0
- package/dist/grader/common.d.ts +17 -0
- package/dist/grader/common.js +21 -0
- package/dist/grader/index.d.ts +38 -0
- package/dist/grader/index.js +75 -0
- package/dist/grader/knowledge-probe.d.ts +14 -0
- package/dist/grader/knowledge-probe.js +18 -0
- package/dist/grader/literacy.d.ts +13 -0
- package/dist/grader/literacy.js +17 -0
- package/dist/grader/mcp.d.ts +14 -0
- package/dist/grader/mcp.js +18 -0
- package/dist/orchestration/build-app-context.js +1 -0
- package/dist/orchestration/build-step-sequence.js +5 -0
- package/dist/orchestration/steps/calculate-scores-step.js +23 -1
- package/dist/orchestration/steps/compute-attribution-step.d.ts +44 -0
- package/dist/orchestration/steps/compute-attribution-step.js +279 -0
- package/dist/orchestration/steps/gap-analysis-step.js +35 -7
- package/dist/orchestration/steps/index.d.ts +1 -0
- package/dist/orchestration/steps/index.js +1 -0
- package/dist/pipeline/attribution.d.ts +15 -0
- package/dist/pipeline/attribution.js +18 -9
- package/dist/pipeline/borderline-consensus-runner.d.ts +63 -0
- package/dist/pipeline/borderline-consensus-runner.js +124 -0
- package/dist/pipeline/borderline-detector.d.ts +24 -0
- package/dist/pipeline/borderline-detector.js +26 -0
- package/dist/pipeline/calculate-scores.d.ts +114 -3
- package/dist/pipeline/calculate-scores.js +426 -24
- package/dist/pipeline/compiler/literacy-bridge.d.ts +1 -1
- package/dist/pipeline/compiler/literacy-bridge.js +35 -17
- package/dist/pipeline/compiler/rubric-resolution.d.ts +15 -0
- package/dist/pipeline/compiler/rubric-resolution.js +9 -1
- package/dist/pipeline/compute-attribution.d.ts +80 -0
- package/dist/pipeline/compute-attribution.js +196 -0
- package/dist/pipeline/failure-modes.d.ts +52 -17
- package/dist/pipeline/failure-modes.js +178 -117
- package/dist/pipeline/map-request-to-config.js +1 -0
- package/package.json +6 -4
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Diagnosis runner — engine entry point (D0048).
|
|
3
|
+
*
|
|
4
|
+
* Phase 1 lands the version constant only; the runner factory + cache
|
|
5
|
+
* lookup land in Phase 5.
|
|
6
|
+
*
|
|
7
|
+
* @see docs/decisions/D0048-engine-homes-for-cli-api-parity.md
|
|
8
|
+
* @see .planning/phases/01-foundation-contracts-cross-cutting-schemas/01-CONTEXT.md (D-02)
|
|
9
|
+
*/
|
|
10
|
+
/**
|
|
11
|
+
* Bumped when the runner's selection logic, prompt orchestration, or
|
|
12
|
+
* card-set composition changes in a way that should invalidate cached
|
|
13
|
+
* Diagnoses (VER-01 / D-02). Co-located here so the cache-invalidation
|
|
14
|
+
* contract test reads the canonical value.
|
|
15
|
+
*
|
|
16
|
+
* `export const` (never `export let`) — module-scope mutables leak
|
|
17
|
+
* across vitest workers (cross-cutting hazard #2).
|
|
18
|
+
*/
|
|
19
|
+
export declare const diagnosisVersion = "0.1.0";
|
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Diagnosis runner — engine entry point (D0048).
|
|
3
|
+
*
|
|
4
|
+
* Phase 1 lands the version constant only; the runner factory + cache
|
|
5
|
+
* lookup land in Phase 5.
|
|
6
|
+
*
|
|
7
|
+
* @see docs/decisions/D0048-engine-homes-for-cli-api-parity.md
|
|
8
|
+
* @see .planning/phases/01-foundation-contracts-cross-cutting-schemas/01-CONTEXT.md (D-02)
|
|
9
|
+
*/
|
|
10
|
+
/**
|
|
11
|
+
* Bumped when the runner's selection logic, prompt orchestration, or
|
|
12
|
+
* card-set composition changes in a way that should invalidate cached
|
|
13
|
+
* Diagnoses (VER-01 / D-02). Co-located here so the cache-invalidation
|
|
14
|
+
* contract test reads the canonical value.
|
|
15
|
+
*
|
|
16
|
+
* `export const` (never `export let`) — module-scope mutables leak
|
|
17
|
+
* across vitest workers (cross-cutting hazard #2).
|
|
18
|
+
*/
|
|
19
|
+
export const diagnosisVersion = "0.1.0";
|
|
@@ -13,3 +13,5 @@ export { aggregateAreas, aggregateDimensions, computeEnsembleScore, computeTaskS
|
|
|
13
13
|
export { extractModelName, extractProvider, mergeConfig, modelMatchesMode, resolveModelVariants, } from "./config-helpers.js";
|
|
14
14
|
export { buildSlimReportSummary } from "./slim-report-summary.js";
|
|
15
15
|
export { reportToMarkdown, type RenderableReport, } from "./report-to-markdown.js";
|
|
16
|
+
export { diagnosisVersion } from "./diagnosis-runner.js";
|
|
17
|
+
export { cardRegistry, type CardDefinition } from "./diagnosis/registry.js";
|
|
@@ -13,3 +13,8 @@ export { aggregateAreas, aggregateDimensions, computeEnsembleScore, computeTaskS
|
|
|
13
13
|
export { extractModelName, extractProvider, mergeConfig, modelMatchesMode, resolveModelVariants, } from "./config-helpers.js";
|
|
14
14
|
export { buildSlimReportSummary } from "./slim-report-summary.js";
|
|
15
15
|
export { reportToMarkdown, } from "./report-to-markdown.js";
|
|
16
|
+
// ---------------------------------------------------------------------------
|
|
17
|
+
// Actionability ladder Phase 1 — diagnosis runner + card registry
|
|
18
|
+
// ---------------------------------------------------------------------------
|
|
19
|
+
export { diagnosisVersion } from "./diagnosis-runner.js";
|
|
20
|
+
export { cardRegistry } from "./diagnosis/registry.js";
|
|
@@ -493,8 +493,9 @@ function renderLowScoringJudgments(md, judgments) {
|
|
|
493
493
|
.join("\n");
|
|
494
494
|
md.line(reasonLines);
|
|
495
495
|
md.blank();
|
|
496
|
-
|
|
497
|
-
|
|
496
|
+
const jDocs = j.contextDocs ?? j.canonicalDocs;
|
|
497
|
+
if (jDocs && jDocs.length > 0) {
|
|
498
|
+
const docList = jDocs.map((d) => `\`${d.slug}\``).join(", ");
|
|
498
499
|
md.line(`*Expected docs: ${docList}*`);
|
|
499
500
|
md.blank();
|
|
500
501
|
}
|
|
@@ -0,0 +1,82 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Attribution core domain types — canonical shapes for the per-document
|
|
3
|
+
* attribution ensemble (Doc 04).
|
|
4
|
+
*
|
|
5
|
+
* Phase 1 lands the type carriers; Phase 4 lands the compute step. The
|
|
6
|
+
* Zod schemas in `packages/eval/src/adapters/attribution/` assert
|
|
7
|
+
* `satisfies z.ZodType<...>` against these types.
|
|
8
|
+
*
|
|
9
|
+
* Doc identity is referenced by `documentId` (D0052), not by `slug` —
|
|
10
|
+
* `slug` is retained as a human-readable annotation only. The
|
|
11
|
+
* resolvable-set check is carried as a separate
|
|
12
|
+
* `hallucinationCheckedAgainst: string[]` field (Pitfall #11).
|
|
13
|
+
*
|
|
14
|
+
* @see docs/decisions/D0049-shared-confidence-contract.md
|
|
15
|
+
* @see docs/decisions/D0052-judgment-ref-granularity.md
|
|
16
|
+
* @see docs/design-docs/actionability-ladder/04-per-document-attribution-ensemble.md
|
|
17
|
+
*/
|
|
18
|
+
import type { Confidence } from "./confidence.js";
|
|
19
|
+
/**
|
|
20
|
+
* Per-document attribution score for one judgment. The `signals` sub-record
|
|
21
|
+
* carries each ensemble member's contribution; the top-level `score` is
|
|
22
|
+
* the post-weighting composite.
|
|
23
|
+
*
|
|
24
|
+
* `documentId` is the canonical D0052 reference; `slug` is a
|
|
25
|
+
* human-readable annotation only and must not be relied on for identity.
|
|
26
|
+
*/
|
|
27
|
+
export interface DocAttribution {
|
|
28
|
+
/** Canonical D0052 document ref (id, not slug). */
|
|
29
|
+
documentId: string;
|
|
30
|
+
/** Optional human-readable annotation. Never the identity. */
|
|
31
|
+
slug?: string;
|
|
32
|
+
/** Composite attribution score in [0, 1]. */
|
|
33
|
+
score: number;
|
|
34
|
+
/** Per-ensemble-member contributions before weighting. */
|
|
35
|
+
signals: {
|
|
36
|
+
citation?: number;
|
|
37
|
+
canonical?: number;
|
|
38
|
+
retrieved?: number;
|
|
39
|
+
};
|
|
40
|
+
/** Shared D0049 confidence triple. */
|
|
41
|
+
confidence: Confidence;
|
|
42
|
+
}
|
|
43
|
+
/**
|
|
44
|
+
* Per-judgment attribution carrier. Emitted by Phase 4's
|
|
45
|
+
* `ComputeAttributionStep`; persisted at
|
|
46
|
+
* `runs/{runId}/attribution/{entryKey}.json`.
|
|
47
|
+
*
|
|
48
|
+
* `hallucinationCheckedAgainst` is the resolvable-set used at compute
|
|
49
|
+
* time — required (not optional) so consumers can audit citation
|
|
50
|
+
* grounding without re-deriving the set. Per Pitfall #11 the canonical
|
|
51
|
+
* task field is `contextDocs`; do not invent `expectedDocs` /
|
|
52
|
+
* `usedDocs` synonyms.
|
|
53
|
+
*/
|
|
54
|
+
export interface JudgmentAttribution {
|
|
55
|
+
/** D0052 granular ref to the underlying grader judgment. */
|
|
56
|
+
judgmentRef: string;
|
|
57
|
+
taskId: string;
|
|
58
|
+
modelId: string;
|
|
59
|
+
dimension: string;
|
|
60
|
+
attributions: DocAttribution[];
|
|
61
|
+
/** Resolvable-set used at compute time (Pitfall #11). */
|
|
62
|
+
hallucinationCheckedAgainst: string[];
|
|
63
|
+
}
|
|
64
|
+
/**
|
|
65
|
+
* Run-scoped attribution metadata. Persisted alongside the per-entry
|
|
66
|
+
* attribution objects so consumers can interpret signal-weighting and
|
|
67
|
+
* embedding choices without re-loading the calibration set.
|
|
68
|
+
*
|
|
69
|
+
* `embeddingModel` is REQUIRED (Pitfall #6) — silently downgrading to a
|
|
70
|
+
* default has caused regressions in adjacent codebases.
|
|
71
|
+
*/
|
|
72
|
+
export interface AttributionMeta {
|
|
73
|
+
ensembleVersion: string;
|
|
74
|
+
/** Embedding model identifier — REQUIRED (Pitfall #6). */
|
|
75
|
+
embeddingModel: string;
|
|
76
|
+
calibrationSetVersion?: string;
|
|
77
|
+
weights: {
|
|
78
|
+
citation: number;
|
|
79
|
+
canonical: number;
|
|
80
|
+
retrieved: number;
|
|
81
|
+
};
|
|
82
|
+
}
|
|
@@ -0,0 +1,18 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Attribution core domain types — canonical shapes for the per-document
|
|
3
|
+
* attribution ensemble (Doc 04).
|
|
4
|
+
*
|
|
5
|
+
* Phase 1 lands the type carriers; Phase 4 lands the compute step. The
|
|
6
|
+
* Zod schemas in `packages/eval/src/adapters/attribution/` assert
|
|
7
|
+
* `satisfies z.ZodType<...>` against these types.
|
|
8
|
+
*
|
|
9
|
+
* Doc identity is referenced by `documentId` (D0052), not by `slug` —
|
|
10
|
+
* `slug` is retained as a human-readable annotation only. The
|
|
11
|
+
* resolvable-set check is carried as a separate
|
|
12
|
+
* `hallucinationCheckedAgainst: string[]` field (Pitfall #11).
|
|
13
|
+
*
|
|
14
|
+
* @see docs/decisions/D0049-shared-confidence-contract.md
|
|
15
|
+
* @see docs/decisions/D0052-judgment-ref-granularity.md
|
|
16
|
+
* @see docs/design-docs/actionability-ladder/04-per-document-attribution-ensemble.md
|
|
17
|
+
*/
|
|
18
|
+
export {};
|
|
@@ -29,6 +29,8 @@ declare const __brand: unique symbol;
|
|
|
29
29
|
export type Brand<T, B extends string> = T & {
|
|
30
30
|
readonly [__brand]: B;
|
|
31
31
|
};
|
|
32
|
+
/** Unique identifier for a grader judgment (D0052 granular). */
|
|
33
|
+
export type JudgmentId = Brand<string, "JudgmentId">;
|
|
32
34
|
/** Unique identifier for an evaluation task */
|
|
33
35
|
export type TaskId = Brand<string, "TaskId">;
|
|
34
36
|
/** URL-safe slug for a task (derived from title) */
|
|
@@ -74,7 +76,7 @@ export type ArtifactId = Brand<string, "ArtifactId">;
|
|
|
74
76
|
* per-mode (e.g. `failureModes`, one entry per classified failure category —
|
|
75
77
|
* D0033 M7, W0051 Slice 2).
|
|
76
78
|
*/
|
|
77
|
-
export type AssociationAxis = "run" | "mode" | "task" | "model" | "grader" | "trial" | "category";
|
|
79
|
+
export type AssociationAxis = "run" | "mode" | "task" | "model" | "grader" | "trial" | "category" | "report";
|
|
78
80
|
/**
|
|
79
81
|
* The sanitized, filename-safe identifier for a single per-entry artifact
|
|
80
82
|
* object. Produced by `ArtifactDescriptor.formatEntryKey` and parsed by
|
|
@@ -178,4 +180,27 @@ export declare function providerId(raw: string): Result<ProviderId, IdValidation
|
|
|
178
180
|
* Valid format: alphanumeric + hyphens, 1–128 characters.
|
|
179
181
|
*/
|
|
180
182
|
export declare function fixtureId(raw: string): Result<FixtureId, IdValidationError>;
|
|
183
|
+
/**
|
|
184
|
+
* Parse a raw string into a `JudgmentId`.
|
|
185
|
+
*
|
|
186
|
+
* See `JUDGMENT_ID_RE` for the accepted formats.
|
|
187
|
+
*/
|
|
188
|
+
export declare function judgmentId(raw: string): Result<JudgmentId, IdValidationError>;
|
|
189
|
+
/**
|
|
190
|
+
* Generate a deterministic `JudgmentId` for a synthesized fall-back
|
|
191
|
+
* judgment. Salting with `runId` (when supplied) makes the id unique
|
|
192
|
+
* per-run so consumers' `(taskId, modelId, dimension)` dedup key
|
|
193
|
+
* doesn't collide across re-runs of the same task — every run writes
|
|
194
|
+
* fresh ids that still encode the natural composite key.
|
|
195
|
+
*
|
|
196
|
+
* When `runId` is absent the salt collapses to `nosalt`, preserving the
|
|
197
|
+
* legacy "deterministic across runs" shape for callers that explicitly
|
|
198
|
+
* want it (e.g. unit tests that assert the exact id string).
|
|
199
|
+
*/
|
|
200
|
+
export declare function generateJudgmentId(input: {
|
|
201
|
+
taskId: string;
|
|
202
|
+
modelId: string;
|
|
203
|
+
dimension: string;
|
|
204
|
+
runId?: RunId | string;
|
|
205
|
+
}): JudgmentId;
|
|
181
206
|
export {};
|
|
@@ -84,11 +84,24 @@ export function generateRunId() {
|
|
|
84
84
|
.toISOString()
|
|
85
85
|
.replace(/[-:]/g, "")
|
|
86
86
|
.replace(/\.\d{3}Z$/, "Z");
|
|
87
|
-
|
|
88
|
-
|
|
89
|
-
|
|
90
|
-
|
|
87
|
+
// Rejection-sample bytes against the largest multiple of 36 ≤ 256
|
|
88
|
+
// (252) before applying `% 36`. Naive `b % 36` over [0, 255] biases
|
|
89
|
+
// digits 0..3 (probability 8/256) over 4..35 (probability 7/256) by
|
|
90
|
+
// ~14% per character. Drawing fresh bytes whenever the buffer runs
|
|
91
|
+
// dry keeps the loop terminating with overwhelming probability
|
|
92
|
+
// (each byte is kept with probability 252/256 ≈ 98.4%).
|
|
93
|
+
const suffixChars = [];
|
|
94
|
+
while (suffixChars.length < 8) {
|
|
95
|
+
const buf = crypto.getRandomValues(new Uint8Array(8));
|
|
96
|
+
for (const b of buf) {
|
|
97
|
+
if (b >= 252)
|
|
98
|
+
continue; // reject biased range
|
|
99
|
+
suffixChars.push((b % 36).toString(36));
|
|
100
|
+
if (suffixChars.length === 8)
|
|
101
|
+
break;
|
|
102
|
+
}
|
|
91
103
|
}
|
|
104
|
+
const suffix = suffixChars.join("");
|
|
92
105
|
return `run_${ts}_${suffix}`;
|
|
93
106
|
}
|
|
94
107
|
/**
|
|
@@ -166,3 +179,66 @@ export function fixtureId(raw) {
|
|
|
166
179
|
}
|
|
167
180
|
return ok(raw);
|
|
168
181
|
}
|
|
182
|
+
/**
|
|
183
|
+
* Canonical shape for a `JudgmentId`.
|
|
184
|
+
*
|
|
185
|
+
* Two accepted forms:
|
|
186
|
+
* - `judgment_<runId-suffix>_<sanitized-task>__<sanitized-model>__<dimension>`
|
|
187
|
+
* — minted by `generateJudgmentId` for synthesized fall-back judgments
|
|
188
|
+
* so dedup is per-run (a re-run of the same task produces a distinct id).
|
|
189
|
+
* - `j_<alphanumeric>` — short form used by test fixtures and any caller
|
|
190
|
+
* that wants a stable, opaque id without the structured composite.
|
|
191
|
+
*
|
|
192
|
+
* Task and model segments may carry alphanumerics, hyphens, underscores, dots, and colons; the dimension segment is limited to alphanumerics and hyphens (the
|
|
193
|
+
* provider id surface is colon-separated). The full string is bounded to
|
|
194
|
+
* 256 characters to keep the id index-friendly downstream.
|
|
195
|
+
*/
|
|
196
|
+
const JUDGMENT_ID_RE = /^(?:judgment_[0-9a-z]{1,16}_[a-z0-9][a-z0-9.:_-]*__[a-z0-9][a-z0-9.:_-]*__[a-z0-9][a-z0-9-]*|j_[A-Za-z0-9_-]{4,})$/;
|
|
197
|
+
/**
|
|
198
|
+
* Parse a raw string into a `JudgmentId`.
|
|
199
|
+
*
|
|
200
|
+
* See `JUDGMENT_ID_RE` for the accepted formats.
|
|
201
|
+
*/
|
|
202
|
+
export function judgmentId(raw) {
|
|
203
|
+
if (raw.length === 0 || raw.length > 256 || !JUDGMENT_ID_RE.test(raw)) {
|
|
204
|
+
return err({
|
|
205
|
+
code: "INVALID_JUDGMENT_ID",
|
|
206
|
+
raw,
|
|
207
|
+
message: `Invalid JudgmentId "${raw}": must match judgment_<runSalt>_<task>__<model>__<dimension> or j_<alnum>`,
|
|
208
|
+
});
|
|
209
|
+
}
|
|
210
|
+
return ok(raw);
|
|
211
|
+
}
|
|
212
|
+
/**
 * Normalize one raw id component so it fits the `JudgmentId` grammar
 * (`JUDGMENT_ID_RE`).
 *
 * The value is lowercased, every run of characters matched by
 * `disallowed` collapses to a single hyphen, leading characters outside
 * `[a-z0-9]` are dropped (each segment of the canonical shape must start
 * with an alphanumeric), and trailing hyphens are trimmed.
 *
 * @param value Raw component (task id, model id, dimension).
 * @param disallowed Global regex matching the runs of characters to
 *   replace. Defaults to the task/model alphabet, which keeps dots,
 *   colons and underscores so `openai:gpt-5.2` survives intact.
 * @returns The sanitized segment; empty when nothing usable survives.
 */
function sanitizeJudgmentSegment(value, disallowed = /[^a-z0-9.:_-]+/g) {
    return value
        .toLowerCase()
        .replace(disallowed, "-")
        // The id grammar rejects segments starting with `-`, `.`, `:` or `_`.
        .replace(/^[^a-z0-9]+/, "")
        .replace(/-+$/, "");
}
/**
 * Generate a deterministic `JudgmentId` for a synthesized fall-back
 * judgment. Salting with `runId` (when supplied) makes the id unique
 * per-run so consumers' `(taskId, modelId, dimension)` dedup key
 * doesn't collide across re-runs of the same task — every run writes
 * fresh ids that still encode the natural composite key.
 *
 * When `runId` is absent the salt collapses to `nosalt`, preserving the
 * legacy "deterministic across runs" shape for callers that explicitly
 * want it (e.g. unit tests that assert the exact id string).
 *
 * Every segment is normalized to the alphabet `JUDGMENT_ID_RE` accepts
 * for its position, so the returned id passes `judgmentId()`:
 *   - the salt keeps only `[a-z0-9]`;
 *   - task and model keep `[a-z0-9.:_-]`;
 *   - dimension keeps `[a-z0-9-]` (e.g. `task_completion` becomes
 *     `task-completion`);
 *   - a segment that sanitizes to nothing becomes `unknown` instead of
 *     leaving an empty slot in the id.
 *
 * NOTE: the 256-character bound checked by `judgmentId()` is not
 * enforced here; extremely long task/model ids can still exceed it.
 *
 * @param input Composite key: `taskId`, `modelId`, `dimension`, and an
 *   optional `runId` used as the per-run salt.
 * @returns The id string `judgment_<salt>_<task>__<model>__<dimension>`.
 */
export function generateJudgmentId(input) {
    // Take the trailing 8 chars of the runId (the random base36 suffix on
    // the canonical shape) so the salt stays compact while still cycling
    // every run. Non-alphanumerics are removed rather than hyphenated
    // because the salt slot only admits `[0-9a-z]`. Falls back to a
    // constant marker when runId isn't passed or nothing survives.
    const runSalt = input.runId
        ? String(input.runId)
            .slice(-8)
            .toLowerCase()
            .replace(/[^a-z0-9]+/g, "") || "nosalt"
        : "nosalt";
    const task = sanitizeJudgmentSegment(input.taskId) || "unknown";
    const model = sanitizeJudgmentSegment(input.modelId) || "unknown";
    // The dimension slot is stricter than task/model: no dots, colons or
    // underscores.
    const dimension = sanitizeJudgmentSegment(input.dimension, /[^a-z0-9-]+/g) || "unknown";
    return `judgment_${runSalt}_${task}__${model}__${dimension}`;
}
|
|
@@ -0,0 +1,68 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Shared confidence contract for actionability-ladder emitters (D0049).
|
|
3
|
+
*
|
|
4
|
+
* Every confidence-emitting site in the actionability-ladder design set
|
|
5
|
+
* (per-document attribution ensemble, structured grader judgments,
|
|
6
|
+
* diagnosis cards, regression detection) emits the same abstract triple
|
|
7
|
+
* so consumers can reason about confidence uniformly across emitters.
|
|
8
|
+
*
|
|
9
|
+
* Bucket thresholds and the formula behind `level` are emitter-specific;
|
|
10
|
+
* the externally comparable behavior is the `level` enum. Consumers that
|
|
11
|
+
* need the underlying mechanic read `derivation` and can branch.
|
|
12
|
+
*/
|
|
13
|
+
/**
|
|
14
|
+
* Conventional `derivation` identifiers for the seed set of emitters
|
|
15
|
+
* named in D0049. Re-exported as a typed tuple so consumers and tests can
|
|
16
|
+
* reference one source of truth instead of redeclaring the literals.
|
|
17
|
+
*
|
|
18
|
+
* Adding a new emitter does not require editing this list — `derivation`
|
|
19
|
+
* is an open tag (see `ConfidenceDerivation`). The list is the
|
|
20
|
+
* recommended starting set, not the universe.
|
|
21
|
+
*/
|
|
22
|
+
export declare const CONVENTIONAL_DERIVATIONS: readonly ["ensemble-stdev", "ceiling-cross-check", "regression-gate", "card-type-specific", "synthesized-pre-cross-check"];
|
|
23
|
+
/**
|
|
24
|
+
* Tag identifying the formula used to derive `Confidence.level`.
|
|
25
|
+
*
|
|
26
|
+
* Members of `CONVENTIONAL_DERIVATIONS` are surfaced as literal variants
|
|
27
|
+
* so IDEs autocomplete the recommended set, while the trailing
|
|
28
|
+
* `(string & {})` keeps the type open — emitters that need a new
|
|
29
|
+
* identifier (per-card-type tags, future mechanics) can mint their own
|
|
30
|
+
* without editing `@sanity/ailf-core`. D0049 picked the open shape so
|
|
31
|
+
* feature work isn't coupled to core's release cycle.
|
|
32
|
+
*/
|
|
33
|
+
export type ConfidenceDerivation = (typeof CONVENTIONAL_DERIVATIONS)[number] | (string & {});
|
|
34
|
+
/**
|
|
35
|
+
* The shared confidence triple. Every emitter populates all three fields.
|
|
36
|
+
*
|
|
37
|
+
* - `level` is bucketed (not numeric) — chosen over a 0..1 score so every
|
|
38
|
+
* consumer doesn't have to pick its own UI buckets. Emitters may keep a
|
|
39
|
+
* numeric internal representation and bucket at the edge.
|
|
40
|
+
* - `signalsPresent` lets consumers distinguish "1 of 1 signal said high"
|
|
41
|
+
* from "5 of 6 signals said high" without re-deriving the underlying
|
|
42
|
+
* mechanic.
|
|
43
|
+
* - `derivation` is a short identifier for the formula used to derive
|
|
44
|
+
* `level`, so consumers can interpret the mechanic without
|
|
45
|
+
* re-implementing it. Conventional values: `"ensemble-stdev"`,
|
|
46
|
+
* `"ceiling-cross-check"`, `"regression-gate"`, `"card-type-specific"`.
|
|
47
|
+
* Emitters may emit any non-empty string; new conventional identifiers
|
|
48
|
+
* land as new emitters arrive.
|
|
49
|
+
*/
|
|
50
|
+
export type Confidence = {
|
|
51
|
+
/** Bucketed level. Comparable across emitters at this granularity. */
|
|
52
|
+
level: "high" | "medium" | "low";
|
|
53
|
+
/** Number of signals contributing to the score. Lets consumers
|
|
54
|
+
* distinguish "1 of 1 signal said high" from "5 of 6 signals said high." */
|
|
55
|
+
signalsPresent: number;
|
|
56
|
+
/** Short identifier for the formula used to derive `level`. Lets
|
|
57
|
+
* consumers interpret the mechanic without re-implementing it.
|
|
58
|
+
* Conventional values: "ensemble-stdev", "ceiling-cross-check",
|
|
59
|
+
* "regression-gate", "card-type-specific". */
|
|
60
|
+
derivation: ConfidenceDerivation;
|
|
61
|
+
};
|
|
62
|
+
/**
|
|
63
|
+
* Structural type guard for `Confidence`. Verifies the runtime shape
|
|
64
|
+
* matches the contract — useful at trust boundaries that can't depend on
|
|
65
|
+
* a Zod schema (the schema lives at the consuming site since each emitter
|
|
66
|
+
* picks its own `level` thresholds, but the shape is shared).
|
|
67
|
+
*/
|
|
68
|
+
export declare function isConfidence(value: unknown): value is Confidence;
|
|
@@ -0,0 +1,56 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Shared confidence contract for actionability-ladder emitters (D0049).
|
|
3
|
+
*
|
|
4
|
+
* Every confidence-emitting site in the actionability-ladder design set
|
|
5
|
+
* (per-document attribution ensemble, structured grader judgments,
|
|
6
|
+
* diagnosis cards, regression detection) emits the same abstract triple
|
|
7
|
+
* so consumers can reason about confidence uniformly across emitters.
|
|
8
|
+
*
|
|
9
|
+
* Bucket thresholds and the formula behind `level` are emitter-specific;
|
|
10
|
+
* the externally comparable behavior is the `level` enum. Consumers that
|
|
11
|
+
* need the underlying mechanic read `derivation` and can branch.
|
|
12
|
+
*/
|
|
13
|
+
/**
|
|
14
|
+
* Conventional `derivation` identifiers for the seed set of emitters
|
|
15
|
+
* named in D0049. Re-exported as a typed tuple so consumers and tests can
|
|
16
|
+
* reference one source of truth instead of redeclaring the literals.
|
|
17
|
+
*
|
|
18
|
+
* Adding a new emitter does not require editing this list — `derivation`
|
|
19
|
+
* is an open tag (see `ConfidenceDerivation`). The list is the
|
|
20
|
+
* recommended starting set, not the universe.
|
|
21
|
+
*/
|
|
22
|
+
export const CONVENTIONAL_DERIVATIONS = [
|
|
23
|
+
"ensemble-stdev",
|
|
24
|
+
"ceiling-cross-check",
|
|
25
|
+
"regression-gate",
|
|
26
|
+
"card-type-specific",
|
|
27
|
+
// Sentinel placeholder used by the eval pipeline's
|
|
28
|
+
// `synthesizeUnparsedJudgment` fall-back. The validator
|
|
29
|
+
// (`validateGraderJudgmentsCalibration`) overwrites it with
|
|
30
|
+
// "ceiling-cross-check" before judgments leave the live pipeline,
|
|
31
|
+
// so it should not appear on stored reports — the literal is in
|
|
32
|
+
// this list so a leaked sentinel is greppable.
|
|
33
|
+
"synthesized-pre-cross-check",
|
|
34
|
+
];
|
|
35
|
+
/**
|
|
36
|
+
* Structural type guard for `Confidence`. Verifies the runtime shape
|
|
37
|
+
* matches the contract — useful at trust boundaries that can't depend on
|
|
38
|
+
* a Zod schema (the schema lives at the consuming site since each emitter
|
|
39
|
+
* picks its own `level` thresholds, but the shape is shared).
|
|
40
|
+
*/
|
|
41
|
+
export function isConfidence(value) {
|
|
42
|
+
if (typeof value !== "object" || value === null)
|
|
43
|
+
return false;
|
|
44
|
+
const v = value;
|
|
45
|
+
if (v.level !== "high" && v.level !== "medium" && v.level !== "low") {
|
|
46
|
+
return false;
|
|
47
|
+
}
|
|
48
|
+
if (typeof v.signalsPresent !== "number" ||
|
|
49
|
+
!Number.isFinite(v.signalsPresent)) {
|
|
50
|
+
return false;
|
|
51
|
+
}
|
|
52
|
+
if (typeof v.derivation !== "string" || v.derivation.length === 0) {
|
|
53
|
+
return false;
|
|
54
|
+
}
|
|
55
|
+
return true;
|
|
56
|
+
}
|
|
@@ -0,0 +1,169 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Diagnosis core domain types — canonical shapes for the post-run
|
|
3
|
+
* synthesis layer (Doc 05).
|
|
4
|
+
*
|
|
5
|
+
* `Diagnosis.inputs` is the four-version cache envelope (VER-01); any
|
|
6
|
+
* segment bump invalidates a cached Diagnosis. `DiagnosisCard` is an
|
|
7
|
+
* outer-`status` discriminated union with a nested `cardType`
|
|
8
|
+
* discriminator inside the `ready` variant.
|
|
9
|
+
*
|
|
10
|
+
* Phase 1 lands placeholder body shapes; Phase 5 enriches each per
|
|
11
|
+
* Doc 05 specs.
|
|
12
|
+
*
|
|
13
|
+
* @see docs/decisions/D0049-shared-confidence-contract.md
|
|
14
|
+
* @see docs/decisions/D0052-judgment-ref-granularity.md
|
|
15
|
+
* @see docs/decisions/D0050-artifact-registry-post-hoc-versioned-extension.md
|
|
16
|
+
*/
|
|
17
|
+
import type { RunId } from "./branded-ids.js";
|
|
18
|
+
import type { ReportId } from "./index.js";
|
|
19
|
+
/**
 * Cache envelope recording the four input versions behind a cached
 * `Diagnosis`. A bump in any single segment invalidates the cached value
 * (a cross-package contract test asserts this).
 *
 * Every segment is a plain string; semver is a human convention where
 * the value is authored manually. There is deliberately no branding, no
 * tuples and no content-hash typing, which keeps the envelope trivially
 * serializable and greppable.
 */
export interface VersionedInputs {
    /** Version segment for the grader-judgments input. */
    graderJudgmentsVersion: string;
    /** Version segment for the ensemble input. */
    ensembleVersion: string;
    /** Version segment for the diagnosis layer. */
    diagnosisVersion: string;
    /**
     * Version segment for the card layer — presumably the compound
     * version that `CardMeta.cardVersion` is contrasted with; confirm
     * against the producer.
     */
    cardVersion: string;
}
|
|
34
|
+
/**
 * The eight ready-card archetypes, one string literal each.
 *
 * Phase 5 cards register against these literals, and the Phase 7
 * slim-shape boundary reads them to render the Studio diagnosis
 * renderer.
 */
export type CardType =
    | "area-summary"
    | "failure-mode-summary"
    | "no-issues"
    | "top-recommendations"
    | "weakest-area"
    | "low-confidence-attribution"
    | "doc-attribution-spotlight"
    | "regression-vs-baseline";
|
|
40
|
+
/**
 * Telemetry envelope attached to an individual card.
 *
 * Drives DIAG-06 cost telemetry in Phase 6.
 */
export interface CardMeta {
    /**
     * Version of this specific card (e.g. `"area-summary@0.1.0"`) — the
     * per-card version, not the compound one.
     */
    cardVersion: string;
    /** ISO 8601 UTC timestamp. */
    generatedAt: string;
    /**
     * Optional usage figures for producing the card — presumably token
     * counts for the prompt (`input`) and completion (`output`).
     */
    tokenUsage?: {
        input: number;
        output: number;
    };
    /** Optional latency figure; the name indicates milliseconds. */
    latencyMs?: number;
}
|
|
55
|
+
/**
 * One actionable suggestion surfaced by a recommendations card.
 *
 * Phase 1 locks only the minimum required surface; the full Phase 5
 * shape may add further fields (per Doc 05 specs).
 */
export interface ActionSuggestion {
    /** Priority bucket for the suggestion. */
    priority: "high" | "medium" | "low";
    /** Title of the suggestion. */
    title: string;
    /** Body text of the suggestion. */
    body: string;
}
|
|
65
|
+
/**
 * Phase 1 placeholder bodies, one per ready-card archetype.
 *
 * Each shape is intentionally minimal. Phase 5 card files enrich them
 * per Doc 05 specs and assert
 * `satisfies z.ZodType<Extract<DiagnosisCard, { status: "ready"; cardType: "X" }>["body"]>`
 * against these declarations.
 */

/** Body of the `"area-summary"` card. */
export interface AreaSummaryBody {
    summary: string;
}

/** Body of the `"failure-mode-summary"` card. */
export interface FailureModeSummaryBody {
    summary: string;
}

/** Body of the `"no-issues"` card. */
export interface NoIssuesBody {
    summary: string;
}

/** Body of the `"top-recommendations"` card: a summary plus its suggestions. */
export interface TopRecommendationsBody {
    summary: string;
    suggestions: ActionSuggestion[];
}

/** Body of the `"weakest-area"` card. */
export interface WeakestAreaBody {
    summary: string;
}

/** Body of the `"low-confidence-attribution"` card. */
export interface LowConfidenceAttributionBody {
    summary: string;
}

/** Body of the `"doc-attribution-spotlight"` card. */
export interface DocAttributionSpotlightBody {
    summary: string;
}

/** Body of the `"regression-vs-baseline"` card. */
export interface RegressionVsBaselineBody {
    summary: string;
}
|
|
96
|
+
/**
 * Card union discriminated on the outer `status` field.
 *
 * - `"ready"`: eight variants, one per `cardType`, each carrying the
 *   body declared for that card type.
 * - `"degraded"`: parse failed or the card was downgraded by the runner.
 * - `"missing"`: the card was not produced for this run.
 *
 * There is no `not-yet-generated` variant: old-report fallback is a
 * Phase 7 concern at the slim-shape boundary, handled at fetch-time
 * rather than in `DiagnosisCard` itself.
 */
export type DiagnosisCard =
    // ---- ready variants (nested `cardType` selects the body) ----
    | {
        status: "ready";
        cardType: "area-summary";
        body: AreaSummaryBody;
        meta: CardMeta;
    }
    | {
        status: "ready";
        cardType: "failure-mode-summary";
        body: FailureModeSummaryBody;
        meta: CardMeta;
    }
    | {
        status: "ready";
        cardType: "no-issues";
        body: NoIssuesBody;
        meta: CardMeta;
    }
    | {
        status: "ready";
        cardType: "top-recommendations";
        body: TopRecommendationsBody;
        meta: CardMeta;
    }
    | {
        status: "ready";
        cardType: "weakest-area";
        body: WeakestAreaBody;
        meta: CardMeta;
    }
    | {
        status: "ready";
        cardType: "low-confidence-attribution";
        body: LowConfidenceAttributionBody;
        meta: CardMeta;
    }
    | {
        status: "ready";
        cardType: "doc-attribution-spotlight";
        body: DocAttributionSpotlightBody;
        meta: CardMeta;
    }
    | {
        status: "ready";
        cardType: "regression-vs-baseline";
        body: RegressionVsBaselineBody;
        meta: CardMeta;
    }
    // ---- degraded: no body; `parseFailed` flags the parse-failure case ----
    | {
        status: "degraded";
        cardType: CardType;
        reason: string;
        parseFailed: boolean;
        meta: CardMeta;
    }
    // ---- missing: no body and no telemetry envelope ----
    | {
        status: "missing";
        cardType: CardType;
        reason: string;
    };
|
|
157
|
+
/**
 * Aggregate produced by the post-run synthesis layer.
 *
 * Consumers are Phase 5 (runner + cards), Phase 6 (CLI) and Phase 7
 * (Studio). Phase 1 contributes only this declarative shape; runtime
 * construction lands in Phase 5.
 */
export interface Diagnosis {
    /** Branded identifier of the run. */
    runId: RunId;
    /** Identifier of the report. */
    reportId: ReportId;
    /** Version envelope of the inputs behind this diagnosis. */
    inputs: VersionedInputs;
    /** Cards that make up this diagnosis. */
    cards: DiagnosisCard[];
    /** ISO 8601 UTC timestamp. */
    generatedAt: string;
}
|
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Diagnosis core domain types — canonical shapes for the post-run
|
|
3
|
+
* synthesis layer (Doc 05).
|
|
4
|
+
*
|
|
5
|
+
* `Diagnosis.inputs` is the four-version cache envelope (VER-01); any
|
|
6
|
+
* segment bump invalidates a cached Diagnosis. `DiagnosisCard` is an
|
|
7
|
+
* outer-`status` discriminated union with a nested `cardType`
|
|
8
|
+
* discriminator inside the `ready` variant.
|
|
9
|
+
*
|
|
10
|
+
* Phase 1 lands placeholder body shapes; Phase 5 enriches each per
|
|
11
|
+
* Doc 05 specs.
|
|
12
|
+
*
|
|
13
|
+
* @see docs/decisions/D0049-shared-confidence-contract.md
|
|
14
|
+
* @see docs/decisions/D0052-judgment-ref-granularity.md
|
|
15
|
+
* @see docs/decisions/D0050-artifact-registry-post-hoc-versioned-extension.md
|
|
16
|
+
*/
|
|
17
|
+
// This file exports no runtime bindings; the empty export statement only
// marks it as an ES module.
export {};
|
|
@@ -53,11 +53,26 @@ export interface PerspectiveDocRef {
|
|
|
53
53
|
perspective: string;
|
|
54
54
|
reason?: string;
|
|
55
55
|
}
|
|
56
|
+
/**
 * One criterion inside a templated llm-rubric assertion.
 *
 * `id` is a stable, slug-formatted identifier. In Studio it is
 * auto-derived from `text` (via slug `options.source: "text"`); for
 * pre-migration documents it is backfilled from Sanity's `_key`. The
 * identifier survives edits to the criterion text, and downstream
 * judgments and diagnosis cards reference criteria by `id` per D0052
 * (judgment-ref granularity).
 */
export interface CriterionRef {
    /** Stable per-criterion identifier — slug-format (`[a-z0-9][a-z0-9-]*`). */
    id: string;
    /** Author-facing criterion text (the original bullet). */
    text: string;
}
|
|
56
71
|
/**
 * A templated assertion referencing a rubric template.
 *
 * `criteria` is a list of `CriterionRef` entries, so each criterion
 * carries its own `id` alongside its text.
 */
export interface GeneralizedTemplatedAssertion {
    /** Discriminator: always `"llm-rubric"` for this assertion shape. */
    type: "llm-rubric";
    /** The rubric template this assertion references. */
    template: string;
    /** Criteria for this assertion. */
    criteria: CriterionRef[];
    /** Optional weight. */
    weight?: number;
}
|
|
63
78
|
/** A value-based assertion (contains, javascript, cost, latency, etc.) */
|