@sanity/ailf 7.0.1 → 7.1.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/config/rubrics.ts +12 -13
- package/dist/_vendor/ailf-core/ports/context.d.ts +45 -3
- package/dist/_vendor/ailf-core/ports/doc-fetcher.d.ts +10 -0
- package/dist/_vendor/ailf-core/ports/index.d.ts +1 -1
- package/dist/_vendor/ailf-core/schemas/branded-string.d.ts +9 -1
- package/dist/_vendor/ailf-core/schemas/branded-string.js +16 -6
- package/dist/_vendor/ailf-core/schemas/pipeline-request.d.ts +2 -0
- package/dist/_vendor/ailf-core/schemas/pipeline-request.js +7 -0
- package/dist/_vendor/ailf-core/schemas/report.d.ts +12 -0
- package/dist/_vendor/ailf-core/schemas/report.js +2 -0
- package/dist/_vendor/ailf-core/schemas/team.d.ts +22 -0
- package/dist/_vendor/ailf-core/schemas/team.js +63 -0
- package/dist/_vendor/ailf-core/types/grader-judgment.d.ts +51 -0
- package/dist/_vendor/ailf-core/types/index.d.ts +8 -1
- package/dist/_vendor/ailf-core/types/pipeline-request.d.ts +17 -0
- package/dist/_vendor/ailf-core/types/team.d.ts +65 -0
- package/dist/_vendor/ailf-core/types/team.js +1 -0
- package/dist/_vendor/ailf-shared/document-ref.d.ts +29 -1
- package/dist/_vendor/ailf-shared/document-ref.js +23 -1
- package/dist/_vendor/ailf-shared/eval-modes.d.ts +2 -0
- package/dist/_vendor/ailf-shared/eval-modes.js +5 -0
- package/dist/_vendor/ailf-shared/event-types.d.ts +15 -0
- package/dist/_vendor/ailf-shared/event-types.js +23 -0
- package/dist/_vendor/ailf-shared/generated/help-content.js +2 -2
- package/dist/_vendor/ailf-shared/index.d.ts +5 -3
- package/dist/_vendor/ailf-shared/index.js +5 -2
- package/dist/_vendor/ailf-shared/member-roles.d.ts +16 -0
- package/dist/_vendor/ailf-shared/member-roles.js +16 -0
- package/dist/_vendor/ailf-shared/owner-teams.d.ts +19 -0
- package/dist/_vendor/ailf-shared/owner-teams.js +26 -6
- package/dist/_vendor/ailf-shared/run-context.d.ts +8 -1
- package/dist/adapters/doc-fetchers/sanity-doc-fetcher.js +15 -1
- package/dist/adapters/grader-outputs/promptfoo-grader-output.d.ts +65 -1
- package/dist/adapters/grader-outputs/promptfoo-grader-output.js +35 -0
- package/dist/adapters/task-sources/changed-docs-filter.d.ts +12 -0
- package/dist/adapters/task-sources/changed-docs-filter.js +30 -0
- package/dist/adapters/task-sources/content-lake-task-source.js +14 -8
- package/dist/adapters/task-sources/repo-task-source.js +2 -1
- package/dist/commands/pipeline-action.d.ts +4 -3
- package/dist/commands/pipeline-action.js +7 -5
- package/dist/commands/run.js +2 -2
- package/dist/config/rubrics.ts +12 -13
- package/dist/job-store.d.ts +18 -0
- package/dist/job-store.js +34 -0
- package/dist/orchestration/build-app-context.js +8 -1
- package/dist/orchestration/pipeline-orchestrator.js +46 -1
- package/dist/orchestration/steps/compare-step.d.ts +7 -0
- package/dist/orchestration/steps/compare-step.js +59 -23
- package/dist/orchestration/steps/fetch-docs-step.js +3 -0
- package/dist/orchestration/steps/finalize-run-step.js +2 -0
- package/dist/orchestration/steps/gap-analysis-step.js +9 -8
- package/dist/orchestration/steps/generate-configs-step.d.ts +32 -1
- package/dist/orchestration/steps/generate-configs-step.js +47 -13
- package/dist/orchestration/steps/grader-consistency-step.js +11 -0
- package/dist/orchestration/steps/publish-report-step.d.ts +12 -1
- package/dist/orchestration/steps/publish-report-step.js +36 -8
- package/dist/pipeline/cache-hit-restore.d.ts +14 -1
- package/dist/pipeline/cache-hit-restore.js +17 -0
- package/dist/pipeline/calculate-scores.d.ts +13 -1
- package/dist/pipeline/calculate-scores.js +123 -29
- package/dist/pipeline/compiler/mode-handlers/literacy/assertions.d.ts +7 -2
- package/dist/pipeline/compiler/mode-handlers/literacy/assertions.js +13 -4
- package/dist/pipeline/compiler/mode-handlers/literacy/compiler.js +1 -1
- package/dist/pipeline/compiler/provider-assembler.d.ts +15 -1
- package/dist/pipeline/compiler/provider-assembler.js +16 -3
- package/dist/pipeline/failure-modes.d.ts +20 -10
- package/dist/pipeline/failure-modes.js +84 -15
- package/dist/pipeline/map-request-to-config.js +2 -0
- package/dist/pipeline/normalize-mode.d.ts +1 -1
- package/dist/pipeline/normalize-mode.js +2 -0
- package/dist/pipeline/run-context.d.ts +16 -1
- package/dist/pipeline/run-context.js +12 -1
- package/dist/pipeline/validate.d.ts +8 -4
- package/dist/pipeline/validate.js +8 -18
- package/dist/report-store.d.ts +14 -1
- package/dist/report-store.js +32 -0
- package/dist/sanity/client.js +2 -2
- package/dist/sanity/queries.d.ts +1 -1
- package/dist/sanity/queries.js +1 -0
- package/dist/sources.js +40 -2
- package/package.json +1 -1
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Known notification event types and soft-enum helpers.
|
|
3
|
+
*
|
|
4
|
+
* Event types are free-form strings by design — teams can wire new events
|
|
5
|
+
* without a code change. This module seeds Studio comboboxes with canonical
|
|
6
|
+
* values and provides a narrowing predicate, without closing the enum.
|
|
7
|
+
*
|
|
8
|
+
* Parallel to the same type aliases in `@sanity/ailf-core`'s team module:
|
|
9
|
+
* `shared` is the leaf of the dependency graph, so the studio schema can
|
|
10
|
+
* import the runtime tuple without pulling in core.
|
|
11
|
+
*/
|
|
12
|
+
export declare const KNOWN_EVENT_TYPES: readonly ["eval.failed", "eval.completed", "eval.threshold-breached", "eval.score-regressed", "task.created", "task.archived", "area.unowned-tasks"];
|
|
13
|
+
export type KnownEventType = (typeof KNOWN_EVENT_TYPES)[number];
|
|
14
|
+
export type EventType = KnownEventType | (string & {});
|
|
15
|
+
export declare function isKnownEventType(value: string): value is KnownEventType;
|
|
@@ -0,0 +1,23 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Known notification event types and soft-enum helpers.
|
|
3
|
+
*
|
|
4
|
+
* Event types are free-form strings by design — teams can wire new events
|
|
5
|
+
* without a code change. This module seeds Studio comboboxes with canonical
|
|
6
|
+
* values and provides a narrowing predicate, without closing the enum.
|
|
7
|
+
*
|
|
8
|
+
* Parallel to the same type aliases in `@sanity/ailf-core`'s team module:
|
|
9
|
+
* `shared` is the leaf of the dependency graph, so the studio schema can
|
|
10
|
+
* import the runtime tuple without pulling in core.
|
|
11
|
+
*/
|
|
12
|
+
export const KNOWN_EVENT_TYPES = [
|
|
13
|
+
"eval.failed",
|
|
14
|
+
"eval.completed",
|
|
15
|
+
"eval.threshold-breached",
|
|
16
|
+
"eval.score-regressed",
|
|
17
|
+
"task.created",
|
|
18
|
+
"task.archived",
|
|
19
|
+
"area.unowned-tasks",
|
|
20
|
+
];
|
|
21
|
+
export function isKnownEventType(value) {
|
|
22
|
+
return KNOWN_EVENT_TYPES.includes(value);
|
|
23
|
+
}
|
|
@@ -88,7 +88,7 @@ export const HELP_TOPICS = [
|
|
|
88
88
|
{
|
|
89
89
|
"id": "scoring-model",
|
|
90
90
|
"title": "Understanding Scores",
|
|
91
|
-
"body": "## The three dimensions\n\nEvery evaluation task is scored on three dimensions, each graded 0–100:\n\n- **Task Completion (50% weight)** — Can the AI implement the requested feature?\n Does the output actually do what was asked?\n- **Code Correctness (25% weight)** — Is the generated code idiomatic, correct,\n and following best practices?\n- **Doc Coverage (25% weight)** — Did the documentation provide the information\n needed to implement the feature?\n\n## How the overall score is calculated\n\nThe three dimensions combine into a single **AI Literacy Score** per task using\nnamed scoring profiles from `config/rubrics.
|
|
91
|
+
"body": "## The three dimensions\n\nEvery evaluation task is scored on three dimensions, each graded 0–100:\n\n- **Task Completion (50% weight)** — Can the AI implement the requested feature?\n Does the output actually do what was asked?\n- **Code Correctness (25% weight)** — Is the generated code idiomatic, correct,\n and following best practices?\n- **Doc Coverage (25% weight)** — Did the documentation provide the information\n needed to implement the feature?\n\n## How the overall score is calculated\n\nThe three dimensions combine into a single **AI Literacy Score** per task using\nnamed scoring profiles from `packages/eval/config/rubrics.ts`:\n\n```\nGold (with docs): Total = Task × 0.50 + Code × 0.25 + Docs × 0.25\nBaseline (no docs): Total = Task × 0.60 + Code × 0.40\n```\n\nThe gold profile includes all three dimensions. The baseline profile excludes\nDoc Coverage because it is undefined when no documentation is provided. This\nensures Doc Lift (ceiling − floor) is a clean structural measurement of\ndocumentation value.\n\nThe weighted composite produces a score from 0–100. Scores are then averaged\nacross all tasks in a feature area to produce a **per-area score**, and across\nall areas to produce the **overall score**.\n\n## What the numbers mean\n\n| Score range | Interpretation |\n| ------------ | ----------------------------------------------------------------- |\n| **80–100** | Docs are working well — AI agents produce correct implementations |\n| **70–79** | Needs attention — there may be gaps in specific dimensions |\n| **Below 70** | Weak — AI agents consistently struggle with this area |\n\n## Ceiling decomposition (baseline mode)\n\nWhen running in baseline mode, each task is evaluated twice — with and without\ndocumentation. 
This produces:\n\n- **Floor score** — Score without docs (what the model knows from training data\n alone)\n- **Ceiling score** — Score with gold-standard docs injected directly into the\n prompt\n- **Doc Lift** — Ceiling minus floor. Positive means docs help; negative means\n docs hurt.\n- **Doc Quality Gap** — 100 minus ceiling. Room for documentation improvement.\n\n## Three-layer decomposition (full mode)\n\nFull mode adds a third measurement — what happens when AI agents find docs on\ntheir own:\n\n- **Floor** — No docs (parametric knowledge only)\n- **Ceiling** — Gold-standard docs injected (best the docs can do)\n- **Actual** — Agent-retrieved docs (real-world performance)\n- **Retrieval Gap** — Ceiling minus actual (quality lost to findability)\n- **Infrastructure Efficiency** — Actual ÷ ceiling (what fraction of doc quality\n reaches agents)\n\n## Cost tracking\n\nEach evaluation also tracks token costs:\n\n- **Provider cost** — Token usage for generating implementations\n- **Grader cost** — Token usage for the grading model's assessments\n- **Total cost** — Both combined, reported in the score summary",
|
|
92
92
|
"source": "docs/help/scoring-model.md",
|
|
93
93
|
"related": [
|
|
94
94
|
"three-layer",
|
|
@@ -99,7 +99,7 @@ export const HELP_TOPICS = [
|
|
|
99
99
|
{
|
|
100
100
|
"id": "weaknesses-recommendations",
|
|
101
101
|
"title": "Weaknesses & Recommendations",
|
|
102
|
-
"body": "## Understanding weaknesses\n\nThe Issues sub-tab in Diagnostics lists every area or dimension that scored\nbelow threshold. Each weakness entry shows:\n\n- **The feature area** — Which product feature is affected (e.g., GROQ,\n Functions, Webhooks).\n- **The bottleneck dimension** — Which scoring dimension is dragging the area\n down: task completion, code correctness, or doc coverage.\n- **The score** — How far below threshold the dimension scored.\n\n## Gap analysis recommendations\n\nWhen an evaluation runs with gap analysis enabled, the dashboard shows\n**prioritized recommendations** — specific actions ranked by estimated impact.\n\nEach recommendation includes:\n\n- **Failure mode** — The type of doc problem identified:\n - `missing-docs` — The functionality isn't documented at all.\n - `incorrect-docs` — The docs contain factual errors.\n - `outdated-docs` — The docs describe an old API version or pattern.\n - `poor-structure` — The docs exist but are hard to find or understand.\n- **Estimated lift** — How many score points fixing this gap would add. Based on\n raising the bottleneck dimension to the median of non-bottleneck dimensions.\n Conservative estimate — actual improvement may be higher.\n- **Confidence** — How sure the analysis is about this diagnosis (high, medium,\n or low).\n- **Affected tasks** — Which specific evaluation tasks exposed this gap.\n\n## Low-scoring judgments\n\nBelow the recommendations, you'll find the **grader's explanations** for tests\nthat scored below 70. These are the raw assessments from the grading model\nexplaining exactly what went wrong — missing API calls, incorrect patterns,\nhallucinated features, etc.\n\nEach judgment shows the task, the dimension, the score, and the grader's natural\nlanguage reason. These are the most granular diagnostic signal available and\noften point directly to the doc section that needs fixing.",
|
|
102
|
+
"body": "## Understanding weaknesses\n\nThe Issues sub-tab in Diagnostics lists every area or dimension that scored\nbelow threshold. Each weakness entry shows:\n\n- **The feature area** — Which product feature is affected (e.g., GROQ,\n Functions, Webhooks).\n- **The bottleneck dimension** — Which scoring dimension is dragging the area\n down: task completion, code correctness, or doc coverage.\n- **The score** — How far below threshold the dimension scored.\n\n## Gap analysis recommendations\n\nWhen an evaluation runs with gap analysis enabled, the dashboard shows\n**prioritized recommendations** — specific actions ranked by estimated impact.\n\nEach recommendation includes:\n\n- **Failure mode** — The type of doc problem identified:\n - `missing-docs` — The functionality isn't documented at all.\n - `incorrect-docs` — The docs contain factual errors.\n - `outdated-docs` — The docs describe an old API version or pattern.\n - `poor-structure` — The docs exist but are hard to find or understand.\n- **Estimated lift** — How many score points fixing this gap would add. Based on\n raising the bottleneck dimension to the median of non-bottleneck dimensions.\n Conservative estimate — actual improvement may be higher.\n- **Confidence** — How sure the analysis is about this diagnosis (high, medium,\n or low).\n- **Affected tasks** — Which specific evaluation tasks exposed this gap.\n\n## Diagnosis cards\n\nEvery published report now carries a **diagnosis artifact** — a set of cards\nproduced by the post-pipeline hook (`ailf interpret`). The Studio diagnosis\npanel renders these cards directly; the dashboard's Recommendations and\nFailure-modes panels migrate to the same source in a follow-up.\n\nThe hook runs by default for every pipeline invocation. 
To opt out for a single\nrun, pass `--no-summary`; to opt out in CI, set `AILF_INTERPRET_ON_RUN=0` in the\nworkflow env block; to opt out project-wide, set `summary.onRun: never` in\n`.ailf/config.yaml`.\n\n## Low-scoring judgments\n\nBelow the recommendations, you'll find the **grader's explanations** for tests\nthat scored below 70. These are the raw assessments from the grading model\nexplaining exactly what went wrong — missing API calls, incorrect patterns,\nhallucinated features, etc.\n\nEach judgment shows the task, the dimension, the score, and the grader's natural\nlanguage reason. These are the most granular diagnostic signal available and\noften point directly to the doc section that needs fixing.",
|
|
103
103
|
"source": "docs/help/weaknesses-recommendations.md",
|
|
104
104
|
"related": [
|
|
105
105
|
"interpreting-diagnostics",
|
|
@@ -18,17 +18,19 @@
|
|
|
18
18
|
* surface against future regressions.
|
|
19
19
|
*/
|
|
20
20
|
export { computeCanaryDrift, type CanaryDriftReport, type CanaryReportSlim, type DriftEntry, type DriftThresholds, type DriftVerdict, } from "./canary-drift.js";
|
|
21
|
-
export { type DocumentRef } from "./document-ref.js";
|
|
21
|
+
export { buildContextDocPath, type DocumentRef } from "./document-ref.js";
|
|
22
22
|
export { makeEditorialReference, type EditorialReference, type MakeEditorialReferenceArgs, } from "./editorial-reference.js";
|
|
23
|
+
export { isKnownEventType, KNOWN_EVENT_TYPES, type EventType, type KnownEventType, } from "./event-types.js";
|
|
23
24
|
export { FEATURE_FLAGS, type FeatureFlag, type FeatureFlagKey, } from "./feature-flags.js";
|
|
24
25
|
export { DEFAULT_GCS_ARTIFACT_BUCKET } from "./gcs-defaults.js";
|
|
25
26
|
export { GLOSSARY, type GlossaryEntry, type GlossarySlug } from "./glossary.js";
|
|
26
27
|
export { HELP_TOPICS } from "./help-content.js";
|
|
27
28
|
export { type HelpTopic } from "./help-topics.js";
|
|
29
|
+
export { isKnownMemberRole, KNOWN_MEMBER_ROLES, type KnownMemberRole, type MemberRole, } from "./member-roles.js";
|
|
28
30
|
export { GRADE_BOUNDARIES, scoreGrade, type ScoreGrade, } from "./score-grades.js";
|
|
29
31
|
export { NOISE_THRESHOLD } from "./noise-threshold.js";
|
|
30
|
-
export { CANONICAL_EVAL_MODES, LEGACY_EVAL_MODE_ALIASES, LITERACY_VARIANTS, RAW_EVAL_MODES, type EvalMode, type LiteracyVariant, type RawEvalMode, } from "./eval-modes.js";
|
|
31
|
-
export { isKnownOwnerTeam, KNOWN_OWNER_TEAMS, normalizeOwnerTeam, } from "./owner-teams.js";
|
|
32
|
+
export { CANONICAL_EVAL_MODES, isLiteracyVariant, LEGACY_EVAL_MODE_ALIASES, LITERACY_VARIANTS, RAW_EVAL_MODES, type EvalMode, type LiteracyVariant, type RawEvalMode, } from "./eval-modes.js";
|
|
33
|
+
export { isKnownOwnerTeam, KNOWN_OWNER_TEAMS, normalizeOwnerTeam, resolveTeamRef, type SlugLike, } from "./owner-teams.js";
|
|
32
34
|
export { isRunClassification, RUN_CLASSIFICATIONS, RUN_EXECUTOR_SURFACES, type RunClassification, type RunExecutor, type RunExecutorSurface, type RunExecutorSystem, type RunExecutorUser, type RunHost, type RunLineage, type RunOwner, type RunTool, } from "./run-classification.js";
|
|
33
35
|
export { type RunTrigger } from "./run-trigger.js";
|
|
34
36
|
export { type RunContext } from "./run-context.js";
|
|
@@ -18,13 +18,16 @@
|
|
|
18
18
|
* surface against future regressions.
|
|
19
19
|
*/
|
|
20
20
|
export { computeCanaryDrift, } from "./canary-drift.js";
|
|
21
|
+
export { buildContextDocPath } from "./document-ref.js";
|
|
21
22
|
export { makeEditorialReference, } from "./editorial-reference.js";
|
|
23
|
+
export { isKnownEventType, KNOWN_EVENT_TYPES, } from "./event-types.js";
|
|
22
24
|
export { FEATURE_FLAGS, } from "./feature-flags.js";
|
|
23
25
|
export { DEFAULT_GCS_ARTIFACT_BUCKET } from "./gcs-defaults.js";
|
|
24
26
|
export { GLOSSARY } from "./glossary.js";
|
|
25
27
|
export { HELP_TOPICS } from "./help-content.js";
|
|
28
|
+
export { isKnownMemberRole, KNOWN_MEMBER_ROLES, } from "./member-roles.js";
|
|
26
29
|
export { GRADE_BOUNDARIES, scoreGrade, } from "./score-grades.js";
|
|
27
30
|
export { NOISE_THRESHOLD } from "./noise-threshold.js";
|
|
28
|
-
export { CANONICAL_EVAL_MODES, LEGACY_EVAL_MODE_ALIASES, LITERACY_VARIANTS, RAW_EVAL_MODES, } from "./eval-modes.js";
|
|
29
|
-
export { isKnownOwnerTeam, KNOWN_OWNER_TEAMS, normalizeOwnerTeam, } from "./owner-teams.js";
|
|
31
|
+
export { CANONICAL_EVAL_MODES, isLiteracyVariant, LEGACY_EVAL_MODE_ALIASES, LITERACY_VARIANTS, RAW_EVAL_MODES, } from "./eval-modes.js";
|
|
32
|
+
export { isKnownOwnerTeam, KNOWN_OWNER_TEAMS, normalizeOwnerTeam, resolveTeamRef, } from "./owner-teams.js";
|
|
30
33
|
export { isRunClassification, RUN_CLASSIFICATIONS, RUN_EXECUTOR_SURFACES, } from "./run-classification.js";
|
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Known team-member roles and soft-enum helpers.
|
|
3
|
+
*
|
|
4
|
+
* Roles are free-form strings — teams can introduce custom roles (e.g.
|
|
5
|
+
* `"reviewer"`, `"sme"`) without a code change. This module seeds Studio
|
|
6
|
+
* comboboxes with canonical values and exposes a narrowing predicate
|
|
7
|
+
* without closing the enum.
|
|
8
|
+
*
|
|
9
|
+
* Parallel to the same type aliases in `@sanity/ailf-core`'s team module:
|
|
10
|
+
* `shared` is the leaf of the dependency graph, so the studio schema can
|
|
11
|
+
* import the runtime tuple without pulling in core.
|
|
12
|
+
*/
|
|
13
|
+
export declare const KNOWN_MEMBER_ROLES: readonly ["lead", "member", "oncall"];
|
|
14
|
+
export type KnownMemberRole = (typeof KNOWN_MEMBER_ROLES)[number];
|
|
15
|
+
export type MemberRole = KnownMemberRole | (string & {});
|
|
16
|
+
export declare function isKnownMemberRole(value: string): value is KnownMemberRole;
|
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Known team-member roles and soft-enum helpers.
|
|
3
|
+
*
|
|
4
|
+
* Roles are free-form strings — teams can introduce custom roles (e.g.
|
|
5
|
+
* `"reviewer"`, `"sme"`) without a code change. This module seeds Studio
|
|
6
|
+
* comboboxes with canonical values and exposes a narrowing predicate
|
|
7
|
+
* without closing the enum.
|
|
8
|
+
*
|
|
9
|
+
* Parallel to the same type aliases in `@sanity/ailf-core`'s team module:
|
|
10
|
+
* `shared` is the leaf of the dependency graph, so the studio schema can
|
|
11
|
+
* import the runtime tuple without pulling in core.
|
|
12
|
+
*/
|
|
13
|
+
export const KNOWN_MEMBER_ROLES = ["lead", "member", "oncall"];
|
|
14
|
+
export function isKnownMemberRole(value) {
|
|
15
|
+
return KNOWN_MEMBER_ROLES.includes(value);
|
|
16
|
+
}
|
|
@@ -24,3 +24,22 @@ export declare const KNOWN_OWNER_TEAMS: readonly string[];
|
|
|
24
24
|
*/
|
|
25
25
|
export declare function normalizeOwnerTeam(value: string | undefined | null): string;
|
|
26
26
|
export declare function isKnownOwnerTeam(value: string): boolean;
|
|
27
|
+
/**
|
|
28
|
+
* Lightweight team lookup against an in-memory team list.
|
|
29
|
+
*
|
|
30
|
+
* Consumers fetch team docs via GROQ then call this helper to resolve a
|
|
31
|
+
* freeform `owner.team` string (D0037) into a Sanity reference. Unknown
|
|
32
|
+
* strings return null. Known aliases (via OWNER_TEAM_ALIASES) are honored.
|
|
33
|
+
*
|
|
34
|
+
* Returning null is the cue for the UI to render an "unresolved team"
|
|
35
|
+
* badge — not an error condition.
|
|
36
|
+
*/
|
|
37
|
+
export interface SlugLike {
|
|
38
|
+
_id: string;
|
|
39
|
+
slug: {
|
|
40
|
+
current: string;
|
|
41
|
+
};
|
|
42
|
+
}
|
|
43
|
+
export declare function resolveTeamRef(value: string | null | undefined, teams: readonly SlugLike[]): {
|
|
44
|
+
_ref: string;
|
|
45
|
+
} | null;
|
|
@@ -14,11 +14,21 @@
|
|
|
14
14
|
* @see docs/decisions/D0037-run-classification-and-ownership-taxonomy.md
|
|
15
15
|
*/
|
|
16
16
|
export const KNOWN_OWNER_TEAMS = [
|
|
17
|
+
"ai-growth",
|
|
18
|
+
"billing-and-integrations",
|
|
19
|
+
"content-agent",
|
|
17
20
|
"content-lake",
|
|
18
|
-
"
|
|
19
|
-
"
|
|
20
|
-
"
|
|
21
|
-
"
|
|
21
|
+
"data",
|
|
22
|
+
"design-and-research",
|
|
23
|
+
"docs",
|
|
24
|
+
"editorial-experience",
|
|
25
|
+
"engineering",
|
|
26
|
+
"identity",
|
|
27
|
+
"media-library",
|
|
28
|
+
"product",
|
|
29
|
+
"runtime",
|
|
30
|
+
"sdk",
|
|
31
|
+
"ssi",
|
|
22
32
|
"studio",
|
|
23
33
|
];
|
|
24
34
|
/**
|
|
@@ -26,8 +36,11 @@ export const KNOWN_OWNER_TEAMS = [
|
|
|
26
36
|
* drift has been observed belong here. Unknown values pass through.
|
|
27
37
|
*/
|
|
28
38
|
const OWNER_TEAM_ALIASES = {
|
|
29
|
-
|
|
30
|
-
|
|
39
|
+
"core-docs": "docs",
|
|
40
|
+
coredocs: "docs",
|
|
41
|
+
documentation: "docs",
|
|
42
|
+
growth: "ai-growth",
|
|
43
|
+
media: "media-library",
|
|
31
44
|
studio_team: "studio",
|
|
32
45
|
"studio-team": "studio",
|
|
33
46
|
};
|
|
@@ -50,3 +63,10 @@ export function normalizeOwnerTeam(value) {
|
|
|
50
63
|
export function isKnownOwnerTeam(value) {
|
|
51
64
|
return KNOWN_OWNER_TEAMS.includes(value);
|
|
52
65
|
}
|
|
66
|
+
export function resolveTeamRef(value, teams) {
|
|
67
|
+
const normalized = normalizeOwnerTeam(value ?? undefined);
|
|
68
|
+
if (!normalized || normalized === "unknown")
|
|
69
|
+
return null;
|
|
70
|
+
const match = teams.find((t) => t.slug?.current === normalized);
|
|
71
|
+
return match ? { _ref: match._id } : null;
|
|
72
|
+
}
|
|
@@ -14,7 +14,7 @@
|
|
|
14
14
|
* @see docs/decisions/D0032-run-anchored-artifact-store.md
|
|
15
15
|
* @see docs/design-docs/run-artifact-store.md (§ Drift Prevention)
|
|
16
16
|
*/
|
|
17
|
-
import type { EvalMode } from "./eval-modes.js";
|
|
17
|
+
import type { EvalMode, LiteracyVariant } from "./eval-modes.js";
|
|
18
18
|
import type { RunClassification, RunExecutor, RunHost, RunLineage, RunOwner, RunTool } from "./run-classification.js";
|
|
19
19
|
import type { RunTrigger } from "./run-trigger.js";
|
|
20
20
|
export interface RunContext {
|
|
@@ -75,4 +75,11 @@ export interface RunContext {
|
|
|
75
75
|
tool?: RunTool;
|
|
76
76
|
/** What initiated this run */
|
|
77
77
|
trigger: RunTrigger;
|
|
78
|
+
/**
|
|
79
|
+
* Literacy mode variant — `baseline`, `agentic`, `observed`, or `full`.
|
|
80
|
+
* Only meaningful when `mode === "literacy"`; absent for other modes.
|
|
81
|
+
* Surfaced on `ReportProvenance` so dashboards can disambiguate which
|
|
82
|
+
* variant produced a given report.
|
|
83
|
+
*/
|
|
84
|
+
variant?: LiteracyVariant;
|
|
78
85
|
}
|
|
@@ -16,6 +16,7 @@
|
|
|
16
16
|
import { mkdirSync, writeFileSync } from "fs";
|
|
17
17
|
import { join } from "path";
|
|
18
18
|
import { canonicalDocRefLabel, isIdRef, isPathRef, isPerspectiveRef, isSlugRef, } from "../../_vendor/ailf-core/index.js";
|
|
19
|
+
import { buildContextDocPath } from "../../_vendor/ailf-shared/index.js";
|
|
19
20
|
import { fetchUrlContent, } from "../../pipeline/fetch-url-content.js";
|
|
20
21
|
import { createPerspectiveClient, createPublishedClient, getSanityClient, } from "../../sanity/client.js";
|
|
21
22
|
import { extractSymbolsForDoc, renderDocument, } from "../../sanity/document-renderers.js";
|
|
@@ -376,7 +377,20 @@ export class SanityDocFetcher {
|
|
|
376
377
|
: getSanityClient(toSanityOverrides(source));
|
|
377
378
|
const allMetadata = await client.fetch(ARTICLES_METADATA_BY_SLUGS_QUERY, { slugs: [...allSlugs] });
|
|
378
379
|
return allMetadata
|
|
379
|
-
.map((m) =>
|
|
380
|
+
.map((m) => {
|
|
381
|
+
const path = buildContextDocPath({
|
|
382
|
+
sectionSlug: m.sectionSlug,
|
|
383
|
+
slug: m.slug,
|
|
384
|
+
});
|
|
385
|
+
return {
|
|
386
|
+
_id: m._id,
|
|
387
|
+
_rev: m._rev,
|
|
388
|
+
slug: m.slug,
|
|
389
|
+
...(m.sectionSlug ? { sectionSlug: m.sectionSlug } : {}),
|
|
390
|
+
...(path ? { path } : {}),
|
|
391
|
+
title: m.title,
|
|
392
|
+
};
|
|
393
|
+
})
|
|
380
394
|
.sort((a, b) => a.slug.localeCompare(b.slug));
|
|
381
395
|
}
|
|
382
396
|
// -----------------------------------------------------------------------
|
|
@@ -99,4 +99,68 @@ export declare const GraderJudgmentSchema: z.ZodObject<{
|
|
|
99
99
|
graderJudgmentsVersion: z.ZodString;
|
|
100
100
|
}, z.core.$strip>;
|
|
101
101
|
}, z.core.$strict>;
|
|
102
|
-
|
|
102
|
+
/**
|
|
103
|
+
* Wire-format schema — what the grader LLM is asked to emit.
|
|
104
|
+
*
|
|
105
|
+
* This is the subset of {@link GraderJudgmentSchema} that contains only
|
|
106
|
+
* fields the LLM can correctly produce. The pipeline parses untrusted
|
|
107
|
+
* grader output against this shape, then synthesizes the four
|
|
108
|
+
* pipeline-owned fields (`judgmentId`, `metadata.graderModel`,
|
|
109
|
+
* `metadata.graderJudgmentsVersion`, `hallucinationCheckedAgainst`) plus
|
|
110
|
+
* the three result-context fields (`taskId`, `modelId`, `dimension`) to
|
|
111
|
+
* build the full {@link GraderJudgment} storage shape.
|
|
112
|
+
*
|
|
113
|
+
* `.strict()` is retained — the LLM is told exactly which keys to emit,
|
|
114
|
+
* so any extras are either prompt-injection attempts or noise we want
|
|
115
|
+
* to surface as parse failures (which then drop to the
|
|
116
|
+
* `synthesizeUnparsedJudgment` fallback). The bar to hit that fallback
|
|
117
|
+
* is much higher than before W0273 — previously a missing pipeline-owned
|
|
118
|
+
* field tripped it on every emission.
|
|
119
|
+
*
|
|
120
|
+
* Asserts `satisfies z.ZodType<GraderEmittedJudgment>` against the
|
|
121
|
+
* independently-authored core type (D0045). The trust-boundary CI gate
|
|
122
|
+
* (`pnpm check-trust-boundary-satisfies`) covers this file.
|
|
123
|
+
*
|
|
124
|
+
* @see docs/audits/2026-05-22-empty-gap-analysis-regression.md
|
|
125
|
+
*/
|
|
126
|
+
export declare const GraderEmittedJudgmentSchema: z.ZodObject<{
|
|
127
|
+
score: z.ZodNumber;
|
|
128
|
+
reason: z.ZodString;
|
|
129
|
+
failureMode: z.ZodString;
|
|
130
|
+
subJudgments: z.ZodArray<z.ZodObject<{
|
|
131
|
+
criterionId: z.ZodString;
|
|
132
|
+
met: z.ZodBoolean;
|
|
133
|
+
evidence: z.ZodString;
|
|
134
|
+
confidence: z.ZodObject<{
|
|
135
|
+
level: z.ZodEnum<{
|
|
136
|
+
low: "low";
|
|
137
|
+
medium: "medium";
|
|
138
|
+
high: "high";
|
|
139
|
+
}>;
|
|
140
|
+
signalsPresent: z.ZodNumber;
|
|
141
|
+
derivation: z.ZodString;
|
|
142
|
+
}, z.core.$strip>;
|
|
143
|
+
}, z.core.$strip>>;
|
|
144
|
+
docCitations: z.ZodArray<z.ZodObject<{
|
|
145
|
+
documentId: z.ZodString;
|
|
146
|
+
slug: z.ZodOptional<z.ZodString>;
|
|
147
|
+
role: z.ZodEnum<{
|
|
148
|
+
supports: "supports";
|
|
149
|
+
contradicts: "contradicts";
|
|
150
|
+
missing: "missing";
|
|
151
|
+
irrelevant: "irrelevant";
|
|
152
|
+
}>;
|
|
153
|
+
hallucinated: z.ZodOptional<z.ZodBoolean>;
|
|
154
|
+
}, z.core.$strip>>;
|
|
155
|
+
confidence: z.ZodObject<{
|
|
156
|
+
level: z.ZodEnum<{
|
|
157
|
+
low: "low";
|
|
158
|
+
medium: "medium";
|
|
159
|
+
high: "high";
|
|
160
|
+
}>;
|
|
161
|
+
signalsPresent: z.ZodNumber;
|
|
162
|
+
derivation: z.ZodString;
|
|
163
|
+
}, z.core.$strip>;
|
|
164
|
+
outputFailure: z.ZodOptional<z.ZodBoolean>;
|
|
165
|
+
}, z.core.$strict>;
|
|
166
|
+
export type { GraderEmittedJudgment, GraderJudgment } from "../../_vendor/ailf-core/index.d.ts";
|
|
@@ -91,3 +91,38 @@ export const GraderJudgmentSchema = z
|
|
|
91
91
|
}),
|
|
92
92
|
})
|
|
93
93
|
.strict();
|
|
94
|
+
/**
|
|
95
|
+
* Wire-format schema — what the grader LLM is asked to emit.
|
|
96
|
+
*
|
|
97
|
+
* This is the subset of {@link GraderJudgmentSchema} that contains only
|
|
98
|
+
* fields the LLM can correctly produce. The pipeline parses untrusted
|
|
99
|
+
* grader output against this shape, then synthesizes the four
|
|
100
|
+
* pipeline-owned fields (`judgmentId`, `metadata.graderModel`,
|
|
101
|
+
* `metadata.graderJudgmentsVersion`, `hallucinationCheckedAgainst`) plus
|
|
102
|
+
* the three result-context fields (`taskId`, `modelId`, `dimension`) to
|
|
103
|
+
* build the full {@link GraderJudgment} storage shape.
|
|
104
|
+
*
|
|
105
|
+
* `.strict()` is retained — the LLM is told exactly which keys to emit,
|
|
106
|
+
* so any extras are either prompt-injection attempts or noise we want
|
|
107
|
+
* to surface as parse failures (which then drop to the
|
|
108
|
+
* `synthesizeUnparsedJudgment` fallback). The bar to hit that fallback
|
|
109
|
+
* is much higher than before W0273 — previously a missing pipeline-owned
|
|
110
|
+
* field tripped it on every emission.
|
|
111
|
+
*
|
|
112
|
+
* Asserts `satisfies z.ZodType<GraderEmittedJudgment>` against the
|
|
113
|
+
* independently-authored core type (D0045). The trust-boundary CI gate
|
|
114
|
+
* (`pnpm check-trust-boundary-satisfies`) covers this file.
|
|
115
|
+
*
|
|
116
|
+
* @see docs/audits/2026-05-22-empty-gap-analysis-regression.md
|
|
117
|
+
*/
|
|
118
|
+
export const GraderEmittedJudgmentSchema = z
|
|
119
|
+
.object({
|
|
120
|
+
score: z.number(),
|
|
121
|
+
reason: z.string(),
|
|
122
|
+
failureMode: z.string(),
|
|
123
|
+
subJudgments: z.array(CriterionSubJudgmentSchema),
|
|
124
|
+
docCitations: z.array(DocCitationSchema),
|
|
125
|
+
confidence: ConfidenceSchema,
|
|
126
|
+
outputFailure: z.boolean().optional(),
|
|
127
|
+
})
|
|
128
|
+
.strict();
|
|
@@ -0,0 +1,12 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Filter tasks by changed doc slugs.
|
|
3
|
+
*
|
|
4
|
+
* When `changedDocs` is set and non-empty, returns only tasks whose
|
|
5
|
+
* `context.docs[*].slug` intersects the provided list. Tasks without a
|
|
6
|
+
* `context.docs` array (e.g. knowledge-probe, mcp-server, agent-harness
|
|
7
|
+
* modes) are excluded — there's no way for them to "touch" a doc slug.
|
|
8
|
+
*
|
|
9
|
+
* An empty or undefined `changedDocs` is a no-op (returns input).
|
|
10
|
+
*/
|
|
11
|
+
import type { GeneralizedTaskDefinition } from "../../_vendor/ailf-core/index.d.ts";
|
|
12
|
+
export declare function filterByChangedDocs(tasks: readonly GeneralizedTaskDefinition[], changedDocs: readonly string[] | undefined): GeneralizedTaskDefinition[];
|
|
@@ -0,0 +1,30 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Every variant of `GeneralizedTaskDefinition` declares an optional
|
|
3
|
+
* `context?: { docs?: GeneralizedDocRef[]; ... }`, so structural access
|
|
4
|
+
* narrows correctly without an `as` cast. Returns `undefined` when the
|
|
5
|
+
* task carries no doc refs.
|
|
6
|
+
*/
|
|
7
|
+
function taskContextDocs(task) {
|
|
8
|
+
return task.context?.docs;
|
|
9
|
+
}
|
|
10
|
+
/**
|
|
11
|
+
* `GeneralizedDocRef` is a 4-way union — only `SlugDocRef` and `IdDocRef`
|
|
12
|
+
* carry a `slug`. Returns `undefined` for path / perspective refs.
|
|
13
|
+
*/
|
|
14
|
+
function docSlug(ref) {
|
|
15
|
+
return "slug" in ref ? ref.slug : undefined;
|
|
16
|
+
}
|
|
17
|
+
export function filterByChangedDocs(tasks, changedDocs) {
|
|
18
|
+
if (!changedDocs || changedDocs.length === 0)
|
|
19
|
+
return [...tasks];
|
|
20
|
+
const wanted = new Set(changedDocs);
|
|
21
|
+
return tasks.filter((task) => {
|
|
22
|
+
const docs = taskContextDocs(task);
|
|
23
|
+
if (!docs || docs.length === 0)
|
|
24
|
+
return false;
|
|
25
|
+
return docs.some((d) => {
|
|
26
|
+
const slug = docSlug(d);
|
|
27
|
+
return slug != null && wanted.has(slug);
|
|
28
|
+
});
|
|
29
|
+
});
|
|
30
|
+
}
|
|
@@ -15,6 +15,8 @@
|
|
|
15
15
|
* @see packages/core/src/ports/task-source.ts — TaskSource port
|
|
16
16
|
* @see docs/decisions/D0038-content-lake-authorable-task-modes.md
|
|
17
17
|
*/
|
|
18
|
+
import { buildContextDocPath } from "../../_vendor/ailf-shared/index.js";
|
|
19
|
+
import { filterByChangedDocs } from "./changed-docs-filter.js";
|
|
18
20
|
import { ContentLakeAuthorableTaskSchema } from "./repo-schemas.js";
|
|
19
21
|
// ---------------------------------------------------------------------------
|
|
20
22
|
// GROQ query — fetches ailf.task documents with resolved references
|
|
@@ -127,7 +129,7 @@ export class ContentLakeTaskSource {
|
|
|
127
129
|
console.warn(" ⚠️ ContentLakeTaskSource: no ailf.task documents found in the Content Lake. " +
|
|
128
130
|
"Have you run the migration (Phase 3) or created tasks in Studio?");
|
|
129
131
|
}
|
|
130
|
-
return definitions;
|
|
132
|
+
return filterByChangedDocs(definitions, filter?.changedDocs);
|
|
131
133
|
}
|
|
132
134
|
}
|
|
133
135
|
// ---------------------------------------------------------------------------
|
|
@@ -222,9 +224,11 @@ function mapCanonicalDocRef(raw) {
|
|
|
222
224
|
case "slug":
|
|
223
225
|
return raw.slug ? { slug: raw.slug, reason } : null;
|
|
224
226
|
case "path": {
|
|
225
|
-
|
|
226
|
-
|
|
227
|
-
|
|
227
|
+
const path = buildContextDocPath({
|
|
228
|
+
path: raw.path,
|
|
229
|
+
sectionSlug: raw.sectionSlug,
|
|
230
|
+
slug: raw.slug,
|
|
231
|
+
});
|
|
228
232
|
return path ? { path, reason } : null;
|
|
229
233
|
}
|
|
230
234
|
case "id": {
|
|
@@ -232,10 +236,12 @@ function mapCanonicalDocRef(raw) {
|
|
|
232
236
|
const id = raw.docId || raw.docRefId || null;
|
|
233
237
|
if (!id)
|
|
234
238
|
return null;
|
|
235
|
-
// Carry slug and derived path as optional DX annotations
|
|
236
|
-
|
|
237
|
-
|
|
238
|
-
:
|
|
239
|
+
// Carry slug and derived path as optional DX annotations — single
|
|
240
|
+
// source of truth in `buildContextDocPath` (@sanity/ailf-shared).
|
|
241
|
+
const derivedPath = buildContextDocPath({
|
|
242
|
+
sectionSlug: raw.sectionSlug,
|
|
243
|
+
slug: raw.slug,
|
|
244
|
+
});
|
|
239
245
|
return {
|
|
240
246
|
id,
|
|
241
247
|
reason,
|
|
@@ -22,6 +22,7 @@ import { existsSync, readdirSync, readFileSync } from "fs";
|
|
|
22
22
|
import { resolve } from "path";
|
|
23
23
|
import { load } from "js-yaml";
|
|
24
24
|
import { CANONICAL_EVAL_MODES } from "../../_vendor/ailf-shared/index.js";
|
|
25
|
+
import { filterByChangedDocs } from "./changed-docs-filter.js";
|
|
25
26
|
import { detectLegacyFieldNames, migratePromptShape, parseCanonicalTaskFile, } from "./repo-schemas.js";
|
|
26
27
|
import { discoverTsTaskFiles, loadTsTaskFile } from "./task-file-loader.js";
|
|
27
28
|
/** Set of canonical mode names for O(1) lookup */
|
|
@@ -111,7 +112,7 @@ export class RepoTaskSource {
|
|
|
111
112
|
}
|
|
112
113
|
}
|
|
113
114
|
}
|
|
114
|
-
return definitions;
|
|
115
|
+
return filterByChangedDocs(definitions, filter?.changedDocs);
|
|
115
116
|
}
|
|
116
117
|
}
|
|
117
118
|
// ---------------------------------------------------------------------------
|
|
@@ -98,13 +98,14 @@ export declare function computeResolvedOptions(opts: PipelineCliOptions): Resolv
|
|
|
98
98
|
/**
|
|
99
99
|
* Determine whether the post-run diagnosis summary hook should fire.
|
|
100
100
|
*
|
|
101
|
-
* 4-level precedence chain (
|
|
101
|
+
* 4-level precedence chain (D0054):
|
|
102
102
|
* Level 1 — CLI flag (absolute): if `cliOpts.summary` is boolean, use it.
|
|
103
103
|
* Level 2 — AILF_INTERPRET_ON_RUN env var (absolute): strict "1"/"0" parse;
|
|
104
104
|
* anything else falls through (T-06-11 spoofing mitigation).
|
|
105
105
|
* Level 3 — config `summary.onRun` (absolute): "always" → true; "never" → false;
|
|
106
106
|
* "auto" or absent falls through to level 4.
|
|
107
|
-
* Level 4 — default
|
|
107
|
+
* Level 4 — default always-on: diagnosis is a first-class output produced
|
|
108
|
+
* for every pipeline run unless explicitly opted out at levels 1–3.
|
|
108
109
|
*/
|
|
109
110
|
export declare function shouldRunPostSummary(cliOpts: PipelineCliOptions, resolvedOnRun: "auto" | "always" | "never" | undefined): boolean;
|
|
110
111
|
export declare function buildSynthesisTelemetry(diagnosis: Diagnosis): SynthesisCostTelemetry;
|
|
@@ -113,7 +114,7 @@ export declare function buildSynthesisTelemetry(diagnosis: Diagnosis): Synthesis
|
|
|
113
114
|
*
|
|
114
115
|
* Fires after orchestratePipeline() + writePipelineResult() (D6-02).
|
|
115
116
|
* Hook failure prints to stderr but does NOT change exit code (D6-03).
|
|
116
|
-
*
|
|
117
|
+
* Fires whenever shouldRunPostSummary returns true (D0054 — default on).
|
|
117
118
|
*
|
|
118
119
|
* @param ctx - App context (composition root wiring)
|
|
119
120
|
* @param result - Pipeline result (includes reportId when published)
|
|
@@ -388,13 +388,14 @@ function resolvePublishAuto(repoValue) {
|
|
|
388
388
|
/**
|
|
389
389
|
* Determine whether the post-run diagnosis summary hook should fire.
|
|
390
390
|
*
|
|
391
|
-
* 4-level precedence chain (
|
|
391
|
+
* 4-level precedence chain (D0054):
|
|
392
392
|
* Level 1 — CLI flag (absolute): if `cliOpts.summary` is boolean, use it.
|
|
393
393
|
* Level 2 — AILF_INTERPRET_ON_RUN env var (absolute): strict "1"/"0" parse;
|
|
394
394
|
* anything else falls through (T-06-11 spoofing mitigation).
|
|
395
395
|
* Level 3 — config `summary.onRun` (absolute): "always" → true; "never" → false;
|
|
396
396
|
* "auto" or absent falls through to level 4.
|
|
397
|
-
* Level 4 — default
|
|
397
|
+
* Level 4 — default always-on: diagnosis is a first-class output produced
|
|
398
|
+
* for every pipeline run unless explicitly opted out at levels 1–3.
|
|
398
399
|
*/
|
|
399
400
|
export function shouldRunPostSummary(cliOpts, resolvedOnRun) {
|
|
400
401
|
// Level 1: CLI flag wins absolutely
|
|
@@ -415,8 +416,9 @@ export function shouldRunPostSummary(cliOpts, resolvedOnRun) {
|
|
|
415
416
|
if (resolvedOnRun === "never")
|
|
416
417
|
return false;
|
|
417
418
|
// "auto" or undefined falls through
|
|
418
|
-
// Level 4:
|
|
419
|
-
|
|
419
|
+
// Level 4: diagnosis is on by default — emit a diagnosis artifact for every
|
|
420
|
+
// pipeline run unless an upstream level explicitly opted out (D0054).
|
|
421
|
+
return true;
|
|
420
422
|
}
|
|
421
423
|
/**
|
|
422
424
|
* Build a SynthesisCostTelemetry payload from a completed Diagnosis.
|
|
@@ -479,7 +481,7 @@ export function buildSynthesisTelemetry(diagnosis) {
|
|
|
479
481
|
*
|
|
480
482
|
* Fires after orchestratePipeline() + writePipelineResult() (D6-02).
|
|
481
483
|
* Hook failure prints to stderr but does NOT change exit code (D6-03).
|
|
482
|
-
*
|
|
484
|
+
* Fires whenever shouldRunPostSummary returns true (D0054 — default on).
|
|
483
485
|
*
|
|
484
486
|
* @param ctx - App context (composition root wiring)
|
|
485
487
|
* @param result - Pipeline result (includes reportId when published)
|
package/dist/commands/run.js
CHANGED
|
@@ -43,8 +43,8 @@ export function createRunCommand() {
|
|
|
43
43
|
.option("-p, --publish", "Write report to Sanity + fan out to sinks (auto-enabled for full runs when report store is configured)")
|
|
44
44
|
.option("--no-publish", "Suppress auto-publishing")
|
|
45
45
|
.option("--publish-tag <tag>", "Label for published report")
|
|
46
|
-
.option("--summary", "Force post-run diagnosis summary (overrides config and
|
|
47
|
-
.option("--no-summary", "Suppress post-run diagnosis summary")
|
|
46
|
+
.option("--summary", "Force post-run diagnosis summary (overrides config and env var)")
|
|
47
|
+
.option("--no-summary", "Suppress post-run diagnosis summary (default is on)")
|
|
48
48
|
.option("--config <path>", "Load pipeline config from a TS/JS/YAML/JSON file (overrides most CLI flags)")
|
|
49
49
|
.option("-o, --output <path>", "Write PR comment markdown to file")
|
|
50
50
|
.option("--promptfoo-url <url>", "Promptfoo share URL for report")
|
package/dist/config/rubrics.ts
CHANGED
|
@@ -15,10 +15,6 @@ import { defineRubrics } from "../_vendor/ailf-core/index.js"
|
|
|
15
15
|
// template entry below. Source of truth lives in packages/eval/src/grader/;
|
|
16
16
|
// the helper picks the right list by dimension family.
|
|
17
17
|
import { failureModesForDimension } from "../grader/index.js"
|
|
18
|
-
// Single source of truth for the wire-format version stamped into the
|
|
19
|
-
// grader-prompt footer (VER-01 D-02). Interpolated below so the
|
|
20
|
-
// announced version cannot drift from the schema's expected value.
|
|
21
|
-
import { graderJudgmentsVersion } from "../adapters/grader-outputs/index.js"
|
|
22
18
|
|
|
23
19
|
export default defineRubrics({
|
|
24
20
|
templates: {
|
|
@@ -242,20 +238,23 @@ export default defineRubrics({
|
|
|
242
238
|
"agent-harness": { gold: "agent-harness" },
|
|
243
239
|
},
|
|
244
240
|
|
|
245
|
-
//
|
|
246
|
-
//
|
|
247
|
-
//
|
|
248
|
-
//
|
|
241
|
+
// W0273 — the footer documents the wire-format subset of GraderJudgment
|
|
242
|
+
// that the grader LLM actually controls. The pipeline parses this against
|
|
243
|
+
// GraderEmittedJudgmentSchema and then synthesizes the four pipeline-owned
|
|
244
|
+
// fields (judgmentId, metadata.{graderModel,graderJudgmentsVersion},
|
|
245
|
+
// hallucinationCheckedAgainst) to build the storage GraderJudgment.
|
|
246
|
+
//
|
|
247
|
+
// See docs/audits/2026-05-22-empty-gap-analysis-regression.md for the
|
|
248
|
+
// rationale (Phase 3 GRAD-05 made these fields required + .strict(),
|
|
249
|
+
// and asking the LLM for pipeline-owned values caused 100% parse
|
|
250
|
+
// failures starting 2026-05-11).
|
|
249
251
|
footer: `Return ONLY a JSON object with this exact shape:
|
|
250
252
|
{
|
|
251
|
-
"judgmentId": "<string>",
|
|
252
253
|
"score": <number 0-100>,
|
|
253
254
|
"reason": "<explanation, ≤500 chars>",
|
|
255
|
+
"failureMode": "<one of the declared modes for this dimension or 'unclassified'>",
|
|
254
256
|
"subJudgments": [{ "criterionId": "<id>", "met": <bool>, "evidence": "<≤280 chars>", "confidence": { "level": "high|medium|low", "signalsPresent": <int>, "derivation": "<string>" } }],
|
|
255
257
|
"docCitations": [{ "documentId": "<id>", "slug": "<optional slug>", "role": "supports|contradicts|missing|irrelevant", "hallucinated": <bool> }],
|
|
256
|
-
"
|
|
257
|
-
"confidence": { "level": "high|medium|low", "signalsPresent": <int>, "derivation": "<string>" },
|
|
258
|
-
"hallucinationCheckedAgainst": ["<doc id>"],
|
|
259
|
-
"metadata": { "graderModel": "<string>", "graderJudgmentsVersion": "${graderJudgmentsVersion}" }
|
|
258
|
+
"confidence": { "level": "high|medium|low", "signalsPresent": <int>, "derivation": "<string>" }
|
|
260
259
|
}`,
|
|
261
260
|
})
|