@sanity/ailf 7.2.1 → 7.2.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/_vendor/ailf-core/ports/artifact-writer.d.ts +22 -0
- package/dist/_vendor/ailf-core/ports/index.d.ts +1 -1
- package/dist/_vendor/ailf-core/services/diagnosis/cards/top-recommendations.js +14 -0
- package/dist/_vendor/ailf-shared/generated/help-content.js +1 -1
- package/dist/_vendor/ailf-shared/glossary.d.ts +32 -0
- package/dist/_vendor/ailf-shared/glossary.js +35 -0
- package/dist/artifact-capture/fanout-artifact-writer.d.ts +8 -0
- package/dist/artifact-capture/fanout-artifact-writer.js +10 -0
- package/dist/artifact-capture/gcs-artifact-writer.d.ts +12 -2
- package/dist/artifact-capture/gcs-artifact-writer.js +18 -0
- package/dist/orchestration/required-eval-runs.d.ts +35 -0
- package/dist/orchestration/required-eval-runs.js +41 -0
- package/dist/orchestration/steps/calculate-scores-step.js +15 -22
- package/dist/orchestration/steps/compute-attribution-step.js +6 -3
- package/dist/orchestration/steps/gap-analysis-step.js +8 -4
- package/dist/orchestration/steps/run-eval-step.js +56 -3
- package/dist/pipeline/assert-grader-judgments-persisted.d.ts +35 -0
- package/dist/pipeline/assert-grader-judgments-persisted.js +58 -0
- package/dist/pipeline/cache-hit-restore.d.ts +37 -1
- package/dist/pipeline/cache-hit-restore.js +108 -1
- package/dist/pipeline/calculate-scores.js +18 -0
- package/package.json +1 -1
|
@@ -76,6 +76,28 @@ export interface ArtifactWriter {
|
|
|
76
76
|
*/
|
|
77
77
|
writePerEntry(type: ArtifactType, runId: RunId, entries: readonly ArtifactEntry[]): Promise<ArtifactRef | null>;
|
|
78
78
|
}
|
|
79
|
+
/**
|
|
80
|
+
* Optional capability: check whether an object physically exists at a
|
|
81
|
+
* store-relative `path`. Kept **separate** from `ArtifactWriter` so writers
|
|
82
|
+
* opt in by structural typing rather than every implementor being forced to
|
|
83
|
+
* provide it — consumers feature-detect it (see the writer-chain walkers in
|
|
84
|
+
* the eval package).
|
|
85
|
+
*
|
|
86
|
+
* Used by the cache-hit restore path (D0040 / D0057) to drop artifact refs
|
|
87
|
+
* a prior report over-claimed: a degraded source run can list `rawResults`
|
|
88
|
+
* entries whose GCS objects were never written, and `remapToCacheHitRefs`
|
|
89
|
+
* would otherwise copy those phantom entries into the new run's manifest.
|
|
90
|
+
*
|
|
91
|
+
* Contract:
|
|
92
|
+
* - resolves `true` — the object exists.
|
|
93
|
+
* - resolves `false` — the object is *definitively* absent.
|
|
94
|
+
* - **throws** — existence could not be determined (auth / network /
|
|
95
|
+
* transient error). Callers must treat a throw as "unknown" and
|
|
96
|
+
* fail open (keep the ref) rather than dropping a real artifact.
|
|
97
|
+
*/
|
|
98
|
+
export interface ArtifactObjectChecker {
|
|
99
|
+
objectExists(path: string): Promise<boolean>;
|
|
100
|
+
}
|
|
79
101
|
/**
|
|
80
102
|
* Thrown by writers that can't satisfy a method — e.g. an
|
|
81
103
|
* `ApiGatewayArtifactWriter` cannot implement `appendNdjson` until the batch
|
|
@@ -4,7 +4,7 @@
|
|
|
4
4
|
* Ports define the contracts between the domain kernel and the outside world.
|
|
5
5
|
* Adapters (in packages/eval) implement these interfaces.
|
|
6
6
|
*/
|
|
7
|
-
export type { ArtifactEntry, ArtifactWriter } from "./artifact-writer.js";
|
|
7
|
+
export type { ArtifactEntry, ArtifactObjectChecker, ArtifactWriter, } from "./artifact-writer.js";
|
|
8
8
|
export { NoOpArtifactWriter } from "./artifact-writer.js";
|
|
9
9
|
export type { CacheEntryMetadata, CacheKey, CacheLookupResult, CacheRecordInput, CacheStore, } from "./cache-store.js";
|
|
10
10
|
export type { ConfigSource } from "./config-source.js";
|
|
@@ -59,6 +59,20 @@ export const generateTopRecommendations = async (report, ctx) => {
|
|
|
59
59
|
}
|
|
60
60
|
// Build allow-list from the runtime report
|
|
61
61
|
const allowList = buildDocSlugAllowList(report);
|
|
62
|
+
// Short-circuit BEFORE the LLM call when the allow-list is empty — the shape
|
|
63
|
+
// a degraded run produces (enrichment never wrote `documentManifest`, and
|
|
64
|
+
// per-score `documents[]` carry no slugs). With an empty allow-list the
|
|
65
|
+
// system prompt instructs the model to omit every recommendation, so it
|
|
66
|
+
// returns `suggestions: []` and trips the `.min(1)` schema, surfacing as a
|
|
67
|
+
// degraded card with a raw ZodError. Degrade gracefully to "missing"
|
|
68
|
+
// instead — same structural seam the sibling attribution cards use.
|
|
69
|
+
if (allowList.size === 0) {
|
|
70
|
+
return {
|
|
71
|
+
status: "missing",
|
|
72
|
+
cardType: "top-recommendations",
|
|
73
|
+
reason: "no doc-slug allow-list for this run (enrichment incomplete)",
|
|
74
|
+
};
|
|
75
|
+
}
|
|
62
76
|
// Per-call schema: additive docSlug allow-list refine (AI-SPEC §3 Pitfall 1)
|
|
63
77
|
const PerCallSchema = z.object({
|
|
64
78
|
summary: z.string().min(1).max(500),
|
|
@@ -142,7 +142,7 @@ export const HELP_TOPICS = [
|
|
|
142
142
|
{
|
|
143
143
|
"id": "glossary",
|
|
144
144
|
"title": "Glossary",
|
|
145
|
-
"body": "**Overall Score**\n: A weighted average across all feature areas, using the gold scoring profile: Task Completion (50%), Code Correctness (25%), and Doc Coverage (25%).\n\n**Doc Lift**\n: How much the docs help, compared to the model's training data alone. Calculated as ceiling minus floor, where ceiling includes Doc Coverage and floor does not. Higher is better.\n\n**Actual Score**\n: How well an AI agent scores when it has to find docs on its own through web search and page fetching. This is the real-world scenario. Only available in full mode.\n\n**Retrieval Gap**\n: The score lost because agents can't find or use all the relevant docs. Calculated as ceiling minus actual. Lower is better; zero means agents find everything.\n\n**Infra Efficiency**\n: What percentage of the docs' potential quality actually reaches agents (actual ÷ ceiling). 100% means agents find and use all relevant docs perfectly.\n\n**Floor**\n: Output-quality composite without documentation — Task Completion (60%) and Code Correctness (40%) only. Doc Coverage is excluded because it's undefined when no docs are provided. This tells you what the model already knows from its training data.\n\n**Ceiling**\n: Score with gold-standard docs injected directly into the prompt. This is the best the documentation can do.\n\n**Actual**\n: Score when an AI agent finds docs on its own through web search and page fetching. This is the real-world experience.\n\n**Ret. Gap**\n: Quality lost to discoverability (ceiling minus actual). The gap between what the docs could deliver and what agents actually get.\n\n**Efficiency**\n: What fraction of the docs' quality reaches agents in practice (actual ÷ ceiling, shown as a percentage).\n\n**Inverted Retrieval Gap**\n: ⚠️ Inverted retrieval gap: agents that can't find the docs actually score higher, because the docs hurt performance. 
This usually means there's a doc quality problem.\n\n**Score**\n: Ceiling composite for this feature area: Task Completion × 50% + Code Correctness × 25% + Doc Coverage × 25%.\n\n**Task Completion**\n: Can the LLM implement the requested feature? Graded 0–100.\n\n**Code Correctness**\n: Is the generated code idiomatic, correct, and following best practices? Graded 0–100.\n\n**Doc Coverage**\n: Did the docs provide the information needed to implement the feature? Graded 0–100. This dimension only contributes to the ceiling composite (with docs) — it's excluded from the floor composite because it's undefined without documentation.\n\n**Tests**\n: Number of test cases in this feature area.\n\n**Overall Δ**\n: Change in overall score between the two runs. Positive means the experiment scored higher.\n\n**Actual Δ**\n: Change in actual (agent-retrieved) score between runs. Positive means agents did better.\n\n**Ret. Gap Δ**\n: Change in retrieval gap between runs. Negative is good here: it means the gap shrank and agents found more relevant docs.\n\n**Efficiency Δ**\n: Change in infrastructure efficiency between runs. Positive means agents are capturing more of the docs' potential.\n\n**Baseline**\n: The reference run you're comparing against.\n\n**Experiment**\n: The new run you're evaluating.\n\n**Delta**\n: Difference between experiment and baseline. Positive means improvement, negative means regression.\n\n**Change**\n: Whether the change is meaningful: improved, regressed, or unchanged (within the noise threshold).\n\n**Low-Scoring Judgments**\n: The grading model's explanations for tests that scored below 70/100.\n\n**Judgment Reason**\n: The grading model's natural language explanation of what went wrong.\n\n**Strong (80+)**\n: Feature areas scoring 80 or above. The docs are working well for these features — AI agents produce correct, complete implementations.\n\n**Needs Attention (70–79)**\n: Feature areas scoring 70–79. 
These are okay but could be improved — there may be gaps in specific dimensions like doc coverage or code correctness.\n\n**Weak (<70)**\n: Feature areas scoring below 70. The docs are not providing enough support for AI agents to implement these features correctly.\n\n**Negative Doc Lift**\n: Number of areas where the documentation actually hurts AI performance — the model scores higher without docs than with them. This usually means the docs contain outdated patterns or incorrect examples.\n\n**Weak Areas**\n: Feature areas where the overall score is below 70. These need the most attention — low scores mean AI agents consistently struggle to implement these features.\n\n**Docs Hurt Performance**\n: Areas where the floor score (no docs) is higher than the ceiling score (with docs). The documentation may be actively misleading the model. These docs should be reviewed.\n\n**Retrieval Issues**\n: Areas where AI agents can find less than 70% of the available doc quality. The docs exist and are good, but agents can't discover them through search. Consider improving page titles, metadata, or search engine indexing.\n\n**Dimension Weaknesses**\n: Individual grading dimensions scoring below 50 within an area. These are the specific skills where AI agents fail most — task completion (can it build the feature?), code correctness (is the code right?), or doc coverage (did it use the docs?).\n\n**Efficiency Anomalies**\n: Areas where agent efficiency exceeds 100% — meaning agents perform better with self-found docs than with gold-standard docs injected directly. This can indicate doc quality issues (injected docs confuse the model) or agent memorization.\n\n**Doc Lift Wins**\n: Areas where documentation boosts AI performance by 5 or more points. 
Higher doc lift means the docs are providing crucial information that the model doesn't already know.\n\n**Retrieval Excellence**\n: Areas where AI agents successfully find and use at least 85% of the available doc quality through web search. Good retrieval means your docs are well-indexed and easy for agents to discover.\n\n**Model Breakdown**\n: Break down scores by individual LLM model. The default 'All Models' view shows the cross-model average. Select a specific model to see how it performed independently — useful for spotting models that struggle with specific feature areas.\n\n**Strengths**\n: What's working well: high-scoring areas, dimensions where the docs are strong, and areas where AI agents successfully find and use the documentation.\n\n**Recommendations**\n: Prioritized remediation plan from gap analysis. Each recommendation identifies a documentation problem, the affected feature area, and the estimated score lift from fixing it.\n\n**Total Potential Lift**\n: Aggregate potential score lift if all identified gaps were fixed. This is a conservative estimate — each gap targets the median of non-bottlenecked dimensions, not 100.\n\n**Failure Mode**\n: The type of failure the grader emitted. Cross-cutting modes apply to any dimension (api-error, model-limitation, false-floor, unclassified). Per-dimension extensions cover documentation problems (missing-docs, incorrect-docs, outdated-docs, poor-structure), spec adherence (spec-mismatch), tool use (tool-misuse, chaotic-process, missing-recovery), and knowledge probes (factual-error, incompleteness, currency-violation, hallucination).\n\n**Estimated Lift**\n: Estimated composite score improvement if this gap is fully fixed. Based on raising bottleneck dimensions to the median of non-bottlenecked dimensions.\n\n**Confidence**\n: How confident we are in this diagnosis (D0049 ceiling-cross-check derivation). High = the grader's emitted failure mode agrees with the structural ceiling-decomposition signal. 
Medium = signals disagree (or the ceiling pattern is not informative for this score). Low = passing scores never classify; treat as absent.\n\n**Agent Behavior**\n: How AI agents interacted with your documentation during evaluation: what they searched for, which pages they visited, and how much time they spent on network requests.\n\n**Search Queries**\n: The exact search queries agents used to find documentation. Helps you understand how agents discover your content and whether your docs appear for relevant queries.\n\n**Unique Doc Slugs**\n: Documentation page slugs that agents actually visited during evaluation. Compare against canonical docs to see if agents found the right pages.\n\n**External Domains**\n: Non-Sanity domains that agents contacted during evaluation. High external domain counts may indicate agents couldn't find what they needed in your docs.\n\n**Avg Pages Visited**\n: Average number of documentation pages visited per test. Higher counts can mean agents need to consult many pages (complex task) or can't find the right one quickly.\n\n**Avg Searches**\n: Average number of web searches performed per test. High search counts can indicate docs are hard to discover through search engines.\n\n**Avg Network Time**\n: Average time spent on network requests per test. Includes page fetches, search queries, and API calls.\n\n**Total Requests**\n: Total number of HTTP requests the agent made during the test, including searches, page visits, and API calls.\n\n**Total Bytes Downloaded**\n: Total bytes downloaded by the agent. Large downloads may indicate the agent is fetching many pages or very large documents.\n\n**Task Completion Δ**\n: Change in task completion between runs. Positive means implementations are more complete.\n\n**Code Correctness Δ**\n: Change in code correctness between runs. Positive means better code quality.\n\n**Doc Coverage Δ**\n: Change in doc coverage between runs. 
Positive means the docs are providing more useful information.\n\n**Area Δ**\n: Score change for this area compared to the previous evaluation run.\n\n**Production**\n: Production source — docs fetched from the live production dataset. Scores reflect what real users and AI agents experience today.\n\n**Branch**\n: Branch source — docs fetched from a branch or draft dataset. Use this to preview how content changes affect scores before publishing.\n\n**Local**\n: Local source — docs fetched from local files or a local dev server. Useful for testing doc changes before pushing.\n\n**Score**\n: The overall ceiling composite for this evaluation run: Task Completion (50%), Code Correctness (25%), and Doc Coverage (25%), averaged across all feature areas.\n\n**Mode**\n: The evaluation mode determines which reference points are measured. Different modes test different aspects of how AI agents interact with documentation.\n\n**Trigger**\n: What initiated this evaluation run. Knowing the trigger helps you understand whether a score change was from a content edit, a code deploy, or a scheduled check.\n\n**Baseline**\n: Baseline mode — tests the model with gold-standard docs injected directly. Measures ceiling performance (best the docs can do).\n\n**Full**\n: Full mode — runs baseline + agentic. Compares ceiling (injected docs) against actual (agent-retrieved docs) to measure retrieval gap and infrastructure efficiency.\n\n**Agentic**\n: Agentic mode — the AI agent finds docs on its own via web search. Measures real-world performance: can agents actually discover and use your documentation?\n\n**Observed**\n: Observed mode — records how agents interact with docs without scoring. Captures search queries, pages visited, and browsing patterns for analysis.\n\n**Debug**\n: Debug mode — a diagnostic run for pipeline development. 
May use non-standard configurations or limited task sets.\n\n**Manual**\n: Manually triggered — someone ran the evaluation pipeline by hand, either locally or via the Studio UI.\n\n**CI**\n: CI-triggered — the evaluation ran automatically as part of a pull request or merge pipeline.\n\n**Scheduled**\n: Scheduled — the evaluation ran on a recurring schedule (e.g. nightly or weekly) to track score trends over time.\n\n**Webhook**\n: Webhook-triggered — a content change in Sanity triggered the evaluation automatically. Helps catch doc regressions early.\n\n**Cross-Repo**\n: Cross-repo — triggered from another repository via the dispatch API. Used when external repos want to validate their docs against AILF tasks.",
|
|
145
|
+
"body": "**Overall Score**\n: A weighted average across all feature areas, using the gold scoring profile: Task Completion (50%), Code Correctness (25%), and Doc Coverage (25%).\n\n**Doc Lift**\n: How much the docs help, compared to the model's training data alone. Calculated as ceiling minus floor, where ceiling includes Doc Coverage and floor does not. Higher is better.\n\n**Actual Score**\n: How well an AI agent scores when it has to find docs on its own through web search and page fetching. This is the real-world scenario. Only available in full mode.\n\n**Retrieval Gap**\n: The score lost because agents can't find or use all the relevant docs. Calculated as ceiling minus actual. Lower is better; zero means agents find everything.\n\n**Infra Efficiency**\n: What percentage of the docs' potential quality actually reaches agents (actual ÷ ceiling). 100% means agents find and use all relevant docs perfectly.\n\n**Floor**\n: Output-quality composite without documentation — Task Completion (60%) and Code Correctness (40%) only. Doc Coverage is excluded because it's undefined when no docs are provided. This tells you what the model already knows from its training data.\n\n**Ceiling**\n: Score with gold-standard docs injected directly into the prompt. This is the best the documentation can do.\n\n**Actual**\n: Score when an AI agent finds docs on its own through web search and page fetching. This is the real-world experience.\n\n**Ret. Gap**\n: Quality lost to discoverability (ceiling minus actual). The gap between what the docs could deliver and what agents actually get.\n\n**Efficiency**\n: What fraction of the docs' quality reaches agents in practice (actual ÷ ceiling, shown as a percentage).\n\n**Inverted Retrieval Gap**\n: ⚠️ Inverted retrieval gap: agents that can't find the docs actually score higher, because the docs hurt performance. 
This usually means there's a doc quality problem.\n\n**Score**\n: Ceiling composite for this feature area: Task Completion × 50% + Code Correctness × 25% + Doc Coverage × 25%.\n\n**Task Completion**\n: Can the LLM implement the requested feature? Graded 0–100.\n\n**Code Correctness**\n: Is the generated code idiomatic, correct, and following best practices? Graded 0–100.\n\n**Doc Coverage**\n: Did the docs provide the information needed to implement the feature? Graded 0–100. This dimension only contributes to the ceiling composite (with docs) — it's excluded from the floor composite because it's undefined without documentation.\n\n**Tests**\n: Number of test cases in this feature area.\n\n**Overall Δ**\n: Change in overall score between the two runs. Positive means the experiment scored higher.\n\n**Actual Δ**\n: Change in actual (agent-retrieved) score between runs. Positive means agents did better.\n\n**Ret. Gap Δ**\n: Change in retrieval gap between runs. Negative is good here: it means the gap shrank and agents found more relevant docs.\n\n**Efficiency Δ**\n: Change in infrastructure efficiency between runs. Positive means agents are capturing more of the docs' potential.\n\n**Baseline**\n: The reference run you're comparing against.\n\n**Experiment**\n: The new run you're evaluating.\n\n**Delta**\n: Difference between experiment and baseline. Positive means improvement, negative means regression.\n\n**Change**\n: Whether the change is meaningful: improved, regressed, or unchanged (within the noise threshold).\n\n**Low-Scoring Judgments**\n: The grading model's explanations for tests that scored below 70/100.\n\n**Judgment Reason**\n: The grading model's natural language explanation of what went wrong.\n\n**Strong (80+)**\n: Feature areas scoring 80 or above. The docs are working well for these features — AI agents produce correct, complete implementations.\n\n**Needs Attention (70–79)**\n: Feature areas scoring 70–79. 
These are okay but could be improved — there may be gaps in specific dimensions like doc coverage or code correctness.\n\n**Weak (<70)**\n: Feature areas scoring below 70. The docs are not providing enough support for AI agents to implement these features correctly.\n\n**Negative Doc Lift**\n: Number of areas where the documentation actually hurts AI performance — the model scores higher without docs than with them. This usually means the docs contain outdated patterns or incorrect examples.\n\n**Weak Areas**\n: Feature areas where the overall score is below 70. These need the most attention — low scores mean AI agents consistently struggle to implement these features.\n\n**Docs Hurt Performance**\n: Areas where the floor score (no docs) is higher than the ceiling score (with docs). The documentation may be actively misleading the model. These docs should be reviewed.\n\n**Retrieval Issues**\n: Areas where AI agents can find less than 70% of the available doc quality. The docs exist and are good, but agents can't discover them through search. Consider improving page titles, metadata, or search engine indexing.\n\n**Dimension Weaknesses**\n: Individual grading dimensions scoring below 50 within an area. These are the specific skills where AI agents fail most — task completion (can it build the feature?), code correctness (is the code right?), or doc coverage (did it use the docs?).\n\n**Efficiency Anomalies**\n: Areas where agent efficiency exceeds 100% — meaning agents perform better with self-found docs than with gold-standard docs injected directly. This can indicate doc quality issues (injected docs confuse the model) or agent memorization.\n\n**Doc Lift Wins**\n: Areas where documentation boosts AI performance by 5 or more points. 
Higher doc lift means the docs are providing crucial information that the model doesn't already know.\n\n**Retrieval Excellence**\n: Areas where AI agents successfully find and use at least 85% of the available doc quality through web search. Good retrieval means your docs are well-indexed and easy for agents to discover.\n\n**Model Breakdown**\n: Break down scores by individual LLM model. The default 'All Models' view shows the cross-model average. Select a specific model to see how it performed independently — useful for spotting models that struggle with specific feature areas.\n\n**Strengths**\n: What's working well: high-scoring areas, dimensions where the docs are strong, and areas where AI agents successfully find and use the documentation.\n\n**Recommendations**\n: Prioritized remediation plan from gap analysis. Each recommendation identifies a documentation problem, the affected feature area, and the estimated score lift from fixing it.\n\n**Total Potential Lift**\n: Aggregate potential score lift if all identified gaps were fixed. This is a conservative estimate — each gap targets the median of non-bottlenecked dimensions, not 100.\n\n**Failure Mode**\n: The type of failure the grader emitted. Cross-cutting modes apply to any dimension (api-error, model-limitation, false-floor, unclassified). Per-dimension extensions cover documentation problems (missing-docs, incorrect-docs, outdated-docs, poor-structure), spec adherence (spec-mismatch), tool use (tool-misuse, chaotic-process, missing-recovery), and knowledge probes (factual-error, incompleteness, currency-violation, hallucination).\n\n**Estimated Lift**\n: Estimated composite score improvement if this gap is fully fixed. Based on raising bottleneck dimensions to the median of non-bottlenecked dimensions.\n\n**Confidence**\n: How confident we are in this diagnosis (D0049 ceiling-cross-check derivation). High = the grader's emitted failure mode agrees with the structural ceiling-decomposition signal. 
Medium = signals disagree (or the ceiling pattern is not informative for this score). Low = passing scores never classify; treat as absent.\n\n**Agent Behavior**\n: How AI agents interacted with your documentation during evaluation: what they searched for, which pages they visited, and how much time they spent on network requests.\n\n**Search Queries**\n: The exact search queries agents used to find documentation. Helps you understand how agents discover your content and whether your docs appear for relevant queries.\n\n**Unique Doc Slugs**\n: Documentation page slugs that agents actually visited during evaluation. Compare against canonical docs to see if agents found the right pages.\n\n**External Domains**\n: Non-Sanity domains that agents contacted during evaluation. High external domain counts may indicate agents couldn't find what they needed in your docs.\n\n**Avg Pages Visited**\n: Average number of documentation pages visited per test. Higher counts can mean agents need to consult many pages (complex task) or can't find the right one quickly.\n\n**Avg Searches**\n: Average number of web searches performed per test. High search counts can indicate docs are hard to discover through search engines.\n\n**Avg Network Time**\n: Average time spent on network requests per test. Includes page fetches, search queries, and API calls.\n\n**Total Requests**\n: Total number of HTTP requests the agent made during the test, including searches, page visits, and API calls.\n\n**Total Bytes Downloaded**\n: Total bytes downloaded by the agent. Large downloads may indicate the agent is fetching many pages or very large documents.\n\n**Task Completion Δ**\n: Change in task completion between runs. Positive means implementations are more complete.\n\n**Code Correctness Δ**\n: Change in code correctness between runs. Positive means better code quality.\n\n**Doc Coverage Δ**\n: Change in doc coverage between runs. 
Positive means the docs are providing more useful information.\n\n**Area Δ**\n: Score change for this area compared to the previous evaluation run.\n\n**Production**\n: Production source — docs fetched from the live production dataset. Scores reflect what real users and AI agents experience today.\n\n**Branch**\n: Branch source — docs fetched from a branch or draft dataset. Use this to preview how content changes affect scores before publishing.\n\n**Local**\n: Local source — docs fetched from local files or a local dev server. Useful for testing doc changes before pushing.\n\n**Score**\n: The overall ceiling composite for this evaluation run: Task Completion (50%), Code Correctness (25%), and Doc Coverage (25%), averaged across all feature areas.\n\n**Mode**\n: The evaluation mode determines which reference points are measured. Different modes test different aspects of how AI agents interact with documentation.\n\n**Trigger**\n: What initiated this evaluation run. Knowing the trigger helps you understand whether a score change was from a content edit, a code deploy, or a scheduled check.\n\n**Baseline**\n: Baseline mode — tests the model with gold-standard docs injected directly. Measures ceiling performance (best the docs can do).\n\n**Full**\n: Full mode — runs baseline + agentic. Compares ceiling (injected docs) against actual (agent-retrieved docs) to measure retrieval gap and infrastructure efficiency.\n\n**Agentic**\n: Agentic mode — the AI agent finds docs on its own via web search. Measures real-world performance: can agents actually discover and use your documentation?\n\n**Observed**\n: Observed mode — records how agents interact with docs without scoring. Captures search queries, pages visited, and browsing patterns for analysis.\n\n**Debug**\n: Debug mode — a diagnostic run for pipeline development. 
May use non-standard configurations or limited task sets.\n\n**Manual**\n: Manually triggered — someone ran the evaluation pipeline by hand, either locally or via the Studio UI.\n\n**CI**\n: CI-triggered — the evaluation ran automatically as part of a pull request or merge pipeline.\n\n**Scheduled**\n: Scheduled — the evaluation ran on a recurring schedule (e.g. nightly or weekly) to track score trends over time.\n\n**Webhook**\n: Webhook-triggered — a content change in Sanity triggered the evaluation automatically. Helps catch doc regressions early.\n\n**Cross-Repo**\n: Cross-repo — triggered from another repository via the dispatch API. Used when external repos want to validate their docs against AILF tasks.\n\n**Gold**\n: Gold variant — the relevant docs were injected into the prompt as context. This is the ceiling condition: the best the documentation can do.\n\n**Baseline**\n: Baseline variant — no documentation in the prompt. This is the floor condition: what the model already knows from its training data, used as the control for doc lift.\n\n**Naive**\n: Naive — the in-house agentic harness with naive prompting. The model runs in an agent loop with no doc-targeting strategy.\n\n**Optimized**\n: Optimized — the in-house agentic harness with optimized prompting. The model runs in an agent loop tuned for documentation retrieval.\n\n**Normal**\n: Normal — a direct vendor-API call. The model produces a single-shot completion with no agent loop.\n\n**Fail**\n: Fail — the model produced no usable output (empty response, API error, or token exhaustion). Distinct from a low score on output that was produced.\n\n**Low dim**\n: Low dim — the run produced output but at least one grading dimension scored below 60.\n\n**OK**\n: OK — the run produced output and every grading dimension scored 60 or above.",
|
|
146
146
|
"source": "packages/shared/src/glossary.ts",
|
|
147
147
|
"tags": [
|
|
148
148
|
"reference",
|
|
@@ -314,5 +314,37 @@ export declare const GLOSSARY: {
|
|
|
314
314
|
readonly label: "Cross-Repo";
|
|
315
315
|
readonly long: "Cross-repo — triggered from another repository via the dispatch API. Used when external repos want to validate their docs against AILF tasks.";
|
|
316
316
|
};
|
|
317
|
+
readonly variantGold: {
|
|
318
|
+
readonly label: "Gold";
|
|
319
|
+
readonly long: "Gold variant — the relevant docs were injected into the prompt as context. This is the ceiling condition: the best the documentation can do.";
|
|
320
|
+
};
|
|
321
|
+
readonly variantBaseline: {
|
|
322
|
+
readonly label: "Baseline";
|
|
323
|
+
readonly long: "Baseline variant — no documentation in the prompt. This is the floor condition: what the model already knows from its training data, used as the control for doc lift.";
|
|
324
|
+
};
|
|
325
|
+
readonly engineNaive: {
|
|
326
|
+
readonly label: "Naive";
|
|
327
|
+
readonly long: "Naive — the in-house agentic harness with naive prompting. The model runs in an agent loop with no doc-targeting strategy.";
|
|
328
|
+
};
|
|
329
|
+
readonly engineOptimized: {
|
|
330
|
+
readonly label: "Optimized";
|
|
331
|
+
readonly long: "Optimized — the in-house agentic harness with optimized prompting. The model runs in an agent loop tuned for documentation retrieval.";
|
|
332
|
+
};
|
|
333
|
+
readonly engineNormal: {
|
|
334
|
+
readonly label: "Normal";
|
|
335
|
+
readonly long: "Normal — a direct vendor-API call. The model produces a single-shot completion with no agent loop.";
|
|
336
|
+
};
|
|
337
|
+
readonly statusFail: {
|
|
338
|
+
readonly label: "Fail";
|
|
339
|
+
readonly long: "Fail — the model produced no usable output (empty response, API error, or token exhaustion). Distinct from a low score on output that was produced.";
|
|
340
|
+
};
|
|
341
|
+
readonly statusLowDim: {
|
|
342
|
+
readonly label: "Low dim";
|
|
343
|
+
readonly long: "Low dim — the run produced output but at least one grading dimension scored below 60.";
|
|
344
|
+
};
|
|
345
|
+
readonly statusOk: {
|
|
346
|
+
readonly label: "OK";
|
|
347
|
+
readonly long: "OK — the run produced output and every grading dimension scored 60 or above.";
|
|
348
|
+
};
|
|
317
349
|
};
|
|
318
350
|
export type GlossarySlug = keyof typeof GLOSSARY;
|
|
@@ -327,4 +327,39 @@ export const GLOSSARY = {
|
|
|
327
327
|
label: "Cross-Repo",
|
|
328
328
|
long: "Cross-repo — triggered from another repository via the dispatch API. Used when external repos want to validate their docs against AILF tasks.",
|
|
329
329
|
},
|
|
330
|
+
// -- Variant values (per-test docs condition) ------------------------------
|
|
331
|
+
variantGold: {
|
|
332
|
+
label: "Gold",
|
|
333
|
+
long: "Gold variant — the relevant docs were injected into the prompt as context. This is the ceiling condition: the best the documentation can do.",
|
|
334
|
+
},
|
|
335
|
+
variantBaseline: {
|
|
336
|
+
label: "Baseline",
|
|
337
|
+
long: "Baseline variant — no documentation in the prompt. This is the floor condition: what the model already knows from its training data, used as the control for doc lift.",
|
|
338
|
+
},
|
|
339
|
+
// -- Execution mode values (how the model was driven) ----------------------
|
|
340
|
+
engineNaive: {
|
|
341
|
+
label: "Naive",
|
|
342
|
+
long: "Naive — the in-house agentic harness with naive prompting. The model runs in an agent loop with no doc-targeting strategy.",
|
|
343
|
+
},
|
|
344
|
+
engineOptimized: {
|
|
345
|
+
label: "Optimized",
|
|
346
|
+
long: "Optimized — the in-house agentic harness with optimized prompting. The model runs in an agent loop tuned for documentation retrieval.",
|
|
347
|
+
},
|
|
348
|
+
engineNormal: {
|
|
349
|
+
label: "Normal",
|
|
350
|
+
long: "Normal — a direct vendor-API call. The model produces a single-shot completion with no agent loop.",
|
|
351
|
+
},
|
|
352
|
+
// -- Status values (per-test outcome) --------------------------------------
|
|
353
|
+
statusFail: {
|
|
354
|
+
label: "Fail",
|
|
355
|
+
long: "Fail — the model produced no usable output (empty response, API error, or token exhaustion). Distinct from a low score on output that was produced.",
|
|
356
|
+
},
|
|
357
|
+
statusLowDim: {
|
|
358
|
+
label: "Low dim",
|
|
359
|
+
long: "Low dim — the run produced output but at least one grading dimension scored below 60.",
|
|
360
|
+
},
|
|
361
|
+
statusOk: {
|
|
362
|
+
label: "OK",
|
|
363
|
+
long: "OK — the run produced output and every grading dimension scored 60 or above.",
|
|
364
|
+
},
|
|
330
365
|
};
|
|
@@ -38,6 +38,14 @@ export declare class FanoutArtifactWriter implements ArtifactWriter {
|
|
|
38
38
|
private readonly writers;
|
|
39
39
|
private readonly progress?;
|
|
40
40
|
constructor(writers: readonly ArtifactWriter[], options?: FanoutArtifactWriterOptions);
|
|
41
|
+
/**
|
|
42
|
+
* The delegate writers in declaration order. Exposed read-only so callers
|
|
43
|
+
* walking the writer chain (e.g. to feature-detect an
|
|
44
|
+
* `ArtifactObjectChecker`) can descend into the fanout without it having to
|
|
45
|
+
* re-implement every optional capability. Mirrors the decorators' readonly
|
|
46
|
+
* `inner` accessor.
|
|
47
|
+
*/
|
|
48
|
+
get delegates(): readonly ArtifactWriter[];
|
|
41
49
|
private reportProgress;
|
|
42
50
|
emit<T extends ArtifactType>(type: T, association: AssociationValues, payload: unknown): Promise<ArtifactRef | null>;
|
|
43
51
|
appendNdjson<T extends ArtifactType>(type: T, association: AssociationValues, rows: readonly unknown[]): Promise<ArtifactRef | null>;
|
|
@@ -33,6 +33,16 @@ export class FanoutArtifactWriter {
|
|
|
33
33
|
this.writers = writers;
|
|
34
34
|
this.progress = options.progress;
|
|
35
35
|
}
|
|
36
|
+
/**
|
|
37
|
+
* The delegate writers in declaration order. Exposed read-only so callers
|
|
38
|
+
* walking the writer chain (e.g. to feature-detect an
|
|
39
|
+
* `ArtifactObjectChecker`) can descend into the fanout without it having to
|
|
40
|
+
* re-implement every optional capability. Mirrors the decorators' readonly
|
|
41
|
+
* `inner` accessor.
|
|
42
|
+
*/
|
|
43
|
+
get delegates() {
|
|
44
|
+
return this.writers;
|
|
45
|
+
}
|
|
36
46
|
reportProgress(ref) {
|
|
37
47
|
if (!this.progress)
|
|
38
48
|
return;
|
|
@@ -28,7 +28,7 @@
|
|
|
28
28
|
* @see docs/decisions/D0033-unified-run-anchored-artifact-capture.md
|
|
29
29
|
*/
|
|
30
30
|
import { Storage } from "@google-cloud/storage";
|
|
31
|
-
import { type ArtifactEntry, type ArtifactRef, type ArtifactType, type ArtifactWriter, type ArtifactWriterProgressOptions, type AssociationValues, type RunId, type RunManifest, type WriteSource } from "../_vendor/ailf-core/index.d.ts";
|
|
31
|
+
import { type ArtifactEntry, type ArtifactObjectChecker, type ArtifactRef, type ArtifactType, type ArtifactWriter, type ArtifactWriterProgressOptions, type AssociationValues, type RunId, type RunManifest, type WriteSource } from "../_vendor/ailf-core/index.d.ts";
|
|
32
32
|
import { type UploadMetricsSink } from "./upload-metrics.js";
|
|
33
33
|
export interface GcsArtifactWriterOptions {
|
|
34
34
|
/** GCS bucket name (e.g., "ailf-artifacts") */
|
|
@@ -61,7 +61,7 @@ export interface GcsArtifactWriterOptions {
|
|
|
61
61
|
*/
|
|
62
62
|
writerSource?: WriteSource;
|
|
63
63
|
}
|
|
64
|
-
export declare class GcsArtifactWriter implements ArtifactWriter {
|
|
64
|
+
export declare class GcsArtifactWriter implements ArtifactWriter, ArtifactObjectChecker {
|
|
65
65
|
private client;
|
|
66
66
|
private readonly options;
|
|
67
67
|
private readonly ndjsonStreams;
|
|
@@ -83,6 +83,16 @@ export declare class GcsArtifactWriter implements ArtifactWriter {
|
|
|
83
83
|
emit<T extends ArtifactType>(type: T, association: AssociationValues, payload: unknown): Promise<ArtifactRef | null>;
|
|
84
84
|
appendNdjson<T extends ArtifactType>(type: T, association: AssociationValues, rows: readonly unknown[]): Promise<ArtifactRef | null>;
|
|
85
85
|
writeManifest(runId: RunId, manifest: RunManifest): Promise<ArtifactRef | null>;
|
|
86
|
+
/**
|
|
87
|
+
* Existence probe used by the cache-hit restore prune (D0057). Unlike the
|
|
88
|
+
* write methods (P5 non-blocking — swallow errors, return null), this
|
|
89
|
+
* resolves `false` ONLY for a definitively-absent object and **throws** on
|
|
90
|
+
* any other failure (auth / network / quota) so the caller can fail open
|
|
91
|
+
* and keep the ref rather than dropping a real artifact on a transient
|
|
92
|
+
* blip. `file.exists()` rejects only on real errors; a missing object
|
|
93
|
+
* resolves `[false]`.
|
|
94
|
+
*/
|
|
95
|
+
objectExists(path: string): Promise<boolean>;
|
|
86
96
|
/** @deprecated Use `emit()` instead. Routes through the same GCS I/O. */
|
|
87
97
|
writeBulk(type: ArtifactType, runId: RunId, data: unknown): Promise<ArtifactRef | null>;
|
|
88
98
|
/** @deprecated Use `emit()` per entry instead. */
|
|
@@ -223,6 +223,24 @@ export class GcsArtifactWriter {
|
|
|
223
223
|
this.reportProgress(ref);
|
|
224
224
|
return ref;
|
|
225
225
|
}
|
|
226
|
+
// ---- ArtifactObjectChecker (D0057) --------------------------------------
|
|
227
|
+
/**
|
|
228
|
+
* Existence probe used by the cache-hit restore prune (D0057). Unlike the
|
|
229
|
+
* write methods (P5 non-blocking — swallow errors, return null), this
|
|
230
|
+
* resolves `false` ONLY for a definitively-absent object and **throws** on
|
|
231
|
+
* any other failure (auth / network / quota) so the caller can fail open
|
|
232
|
+
* and keep the ref rather than dropping a real artifact on a transient
|
|
233
|
+
* blip. `file.exists()` rejects only on real errors; a missing object
|
|
234
|
+
* resolves `[false]`.
|
|
235
|
+
*/
|
|
236
|
+
async objectExists(path) {
|
|
237
|
+
const storage = this.getClient();
|
|
238
|
+
const [exists] = await storage
|
|
239
|
+
.bucket(this.options.bucket)
|
|
240
|
+
.file(path)
|
|
241
|
+
.exists();
|
|
242
|
+
return exists;
|
|
243
|
+
}
|
|
226
244
|
// ---- Deprecated legacy surface (W0052) ----------------------------------
|
|
227
245
|
/** @deprecated Use `emit()` instead. Routes through the same GCS I/O. */
|
|
228
246
|
async writeBulk(type, runId, data) {
|
|
@@ -0,0 +1,35 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Which eval sub-runs a pipeline configuration requires, and whether every one
|
|
3
|
+
* of them was satisfied by a remote-cache hit.
|
|
4
|
+
*
|
|
5
|
+
* The post-scoring enrichment steps (gap-analysis, compute-attribution) may
|
|
6
|
+
* skip benignly when grader judgments are absent because ALL required runs came
|
|
7
|
+
* from the remote cache — a cache hit restores `score-summary.json` but never
|
|
8
|
+
* writes `grader-judgments.json`. They must NOT skip when at least one required
|
|
9
|
+
* run was evaluated fresh this pipeline: a fresh run that scored tests yet
|
|
10
|
+
* persisted no judgments is a degraded outcome that has to fail loud.
|
|
11
|
+
*
|
|
12
|
+
* Mirrors the required-run derivation in `calculate-scores-step` so the
|
|
13
|
+
* "all required runs cached" judgement is defined in exactly one place.
|
|
14
|
+
*/
|
|
15
|
+
interface EvalRunSelector {
|
|
16
|
+
mode: string;
|
|
17
|
+
variant?: string | null;
|
|
18
|
+
}
|
|
19
|
+
interface RemoteCacheState {
|
|
20
|
+
remoteCacheHits?: ReadonlySet<string>;
|
|
21
|
+
}
|
|
22
|
+
/**
|
|
23
|
+
* The eval sub-runs a configuration requires, keyed by the same strings
|
|
24
|
+
* `RunEvalStep` records in `state.remoteCacheHits` (`"baseline"`, `"agentic"`,
|
|
25
|
+
* or the bare mode name for non-literacy modes).
|
|
26
|
+
*/
|
|
27
|
+
export declare function requiredEvalRuns(config: EvalRunSelector): string[];
|
|
28
|
+
/**
|
|
29
|
+
* True only when every eval sub-run the configuration requires was satisfied by
|
|
30
|
+
* a remote-cache hit. A cache hit on a subset of required runs (e.g. agentic
|
|
31
|
+
* cached, baseline fresh) returns false — the fresh run's outputs are still the
|
|
32
|
+
* pipeline's responsibility.
|
|
33
|
+
*/
|
|
34
|
+
export declare function allRequiredEvalRunsCached(config: EvalRunSelector, state: RemoteCacheState | undefined): boolean;
|
|
35
|
+
export {};
|
|
@@ -0,0 +1,41 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Which eval sub-runs a pipeline configuration requires, and whether every one
|
|
3
|
+
* of them was satisfied by a remote-cache hit.
|
|
4
|
+
*
|
|
5
|
+
* The post-scoring enrichment steps (gap-analysis, compute-attribution) may
|
|
6
|
+
* skip benignly when grader judgments are absent because ALL required runs came
|
|
7
|
+
* from the remote cache — a cache hit restores `score-summary.json` but never
|
|
8
|
+
* writes `grader-judgments.json`. They must NOT skip when at least one required
|
|
9
|
+
* run was evaluated fresh this pipeline: a fresh run that scored tests yet
|
|
10
|
+
* persisted no judgments is a degraded outcome that has to fail loud.
|
|
11
|
+
*
|
|
12
|
+
* Mirrors the required-run derivation in `calculate-scores-step` so the
|
|
13
|
+
* "all required runs cached" judgement is defined in exactly one place.
|
|
14
|
+
*/
|
|
15
|
+
import { LiteracyVariant } from "../pipeline/normalize-mode.js";
|
|
16
|
+
/**
 * List the eval sub-run keys a pipeline configuration requires. The keys are
 * the same strings `RunEvalStep` records in `state.remoteCacheHits`.
 *
 * - Any mode other than `"literacy"` requires a single run keyed by the bare
 *   mode name.
 * - Literacy mode requires the configured variant, defaulting to
 *   `LiteracyVariant.STANDARD` when `variant` is null or undefined.
 * - The literacy `FULL` variant expands to two runs: `STANDARD` and `AGENTIC`.
 *
 * @param config Object carrying `mode` and an optional `variant`.
 * @returns A new array of required sub-run keys (never empty).
 */
export function requiredEvalRuns(config) {
    // Non-literacy modes: exactly one run, named after the mode.
    if (config.mode !== "literacy") {
        return [config.mode];
    }
    const selected = config.variant ?? LiteracyVariant.STANDARD;
    // "Full" is a composite of the standard and agentic sub-runs.
    if (selected === LiteracyVariant.FULL) {
        return [LiteracyVariant.STANDARD, LiteracyVariant.AGENTIC];
    }
    return [selected];
}
|
|
30
|
+
/**
 * Report whether every eval sub-run the configuration requires was served
 * from the remote cache.
 *
 * Returns false when `state` is missing, when it carries no
 * `remoteCacheHits`, or when that set is empty. A hit on only some of the
 * required runs (e.g. agentic cached, baseline fresh) also returns false.
 *
 * @param config Object carrying `mode` and an optional `variant`.
 * @param state  Pipeline state; may be undefined.
 * @returns True only when each required run key is present in
 *   `state.remoteCacheHits`.
 */
export function allRequiredEvalRunsCached(config, state) {
    const cachedRuns = state?.remoteCacheHits;
    // Nothing recorded as cached means nothing can be "all cached".
    if (!cachedRuns || cachedRuns.size === 0) {
        return false;
    }
    // A single required run missing from the hit set is enough to answer no.
    for (const run of requiredEvalRuns(config)) {
        if (!cachedRuns.has(run)) {
            return false;
        }
    }
    return true;
}
|
|
@@ -11,6 +11,7 @@ import { emitFileContents } from "../../artifact-capture/emit-file.js";
|
|
|
11
11
|
import { LiteracyVariant } from "../../pipeline/normalize-mode.js";
|
|
12
12
|
import { getStepInputPaths } from "../../pipeline/cache.js";
|
|
13
13
|
import { buildCacheContext } from "../cache-context.js";
|
|
14
|
+
import { allRequiredEvalRunsCached } from "../required-eval-runs.js";
|
|
14
15
|
import { calculateAndWriteScores } from "../../pipeline/calculate-scores.js";
|
|
15
16
|
import { checkResultsExist, checkScoreSummaryValid, } from "../../pipeline/checks.js";
|
|
16
17
|
import { resultsFileForMode } from "../../pipeline/eval-constants.js";
|
|
@@ -27,30 +28,22 @@ export class CalculateScoresStep {
|
|
|
27
28
|
}
|
|
28
29
|
async execute(ctx, state) {
|
|
29
30
|
const start = Date.now();
|
|
30
|
-
// When all required eval
|
|
31
|
+
// When all required eval runs were satisfied by remote cache hits,
|
|
31
32
|
// score-summary.json was already restored from the cached report.
|
|
32
|
-
// Skip re-calculation — the raw eval-results files don't exist.
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
const summaryIssues = checkScoreSummaryValid(ctx.config.rootDir);
|
|
45
|
-
const summaryErrors = summaryIssues.filter((i) => i.severity === "error");
|
|
46
|
-
if (summaryErrors.length === 0) {
|
|
47
|
-
return {
|
|
48
|
-
reason: "Remote cache hit — score-summary.json restored from cached report",
|
|
49
|
-
status: "skipped",
|
|
50
|
-
};
|
|
51
|
-
}
|
|
52
|
-
// If the summary is invalid, fall through to normal calculation
|
|
33
|
+
// Skip re-calculation — the raw eval-results files don't exist. A partial
|
|
34
|
+
// cache hit (only some required runs cached) falls through to normal
|
|
35
|
+
// calculation: the freshly-run sub-evals produced raw results to score.
|
|
36
|
+
if (allRequiredEvalRunsCached(ctx.config, state)) {
|
|
37
|
+
// Verify the restored score-summary.json is valid
|
|
38
|
+
const summaryIssues = checkScoreSummaryValid(ctx.config.rootDir);
|
|
39
|
+
const summaryErrors = summaryIssues.filter((i) => i.severity === "error");
|
|
40
|
+
if (summaryErrors.length === 0) {
|
|
41
|
+
return {
|
|
42
|
+
reason: "Remote cache hit — score-summary.json restored from cached report",
|
|
43
|
+
status: "skipped",
|
|
44
|
+
};
|
|
53
45
|
}
|
|
46
|
+
// If the summary is invalid, fall through to normal calculation
|
|
54
47
|
}
|
|
55
48
|
// Primary results file to score.
|
|
56
49
|
// For literacy: "full" variant uses baseline as primary; others use variant directly.
|
|
@@ -41,6 +41,7 @@ import { isSlugRef } from "../../_vendor/ailf-core/index.js";
|
|
|
41
41
|
import { calibrationSetVersion, embeddingModel, ensembleVersion, } from "../../pipeline/attribution.js";
|
|
42
42
|
import { V0_WEIGHTS, computeJudgmentAttribution, } from "../../pipeline/compute-attribution.js";
|
|
43
43
|
import { classifyEnrichmentInputs, degradedEnrichmentError, } from "../../pipeline/enrichment-preconditions.js";
|
|
44
|
+
import { allRequiredEvalRunsCached } from "../required-eval-runs.js";
|
|
44
45
|
// ---------------------------------------------------------------------------
|
|
45
46
|
// Step implementation
|
|
46
47
|
// ---------------------------------------------------------------------------
|
|
@@ -89,10 +90,12 @@ export class ComputeAttributionStep {
|
|
|
89
90
|
// grader judgments is a degraded run, not a benign skip. Fail loud so the
|
|
90
91
|
// outcome surfaces in pipeline-result and on the job document. A remote
|
|
91
92
|
// cache hit restores score-summary.json without grader-judgments.json, so
|
|
92
|
-
//
|
|
93
|
-
|
|
93
|
+
// missing judgments are legitimate ONLY when every required sub-eval came
|
|
94
|
+
// from the cache — a hybrid full run with a freshly-evaluated sub-eval that
|
|
95
|
+
// persisted no judgments is still degraded.
|
|
96
|
+
const allCached = allRequiredEvalRunsCached(ctx.config, state);
|
|
94
97
|
const inputs = classifyEnrichmentInputs(root);
|
|
95
|
-
if (inputs.kind === "judgments-missing-after-eval" && !
|
|
98
|
+
if (inputs.kind === "judgments-missing-after-eval" && !allCached) {
|
|
96
99
|
return {
|
|
97
100
|
durationMs: Date.now() - start,
|
|
98
101
|
status: "failed",
|
|
@@ -19,6 +19,7 @@ import { join, resolve } from "path";
|
|
|
19
19
|
import { assoc, isSlugRef } from "../../_vendor/ailf-core/index.js";
|
|
20
20
|
import { emitFileContents } from "../../artifact-capture/emit-file.js";
|
|
21
21
|
import { classifyEnrichmentInputs, degradedEnrichmentError, } from "../../pipeline/enrichment-preconditions.js";
|
|
22
|
+
import { allRequiredEvalRunsCached } from "../required-eval-runs.js";
|
|
22
23
|
export class GapAnalysisStep {
|
|
23
24
|
name = "gap-analysis";
|
|
24
25
|
optional = true;
|
|
@@ -46,11 +47,14 @@ export class GapAnalysisStep {
|
|
|
46
47
|
// reports publish with a score but no test details.
|
|
47
48
|
//
|
|
48
49
|
// A remote cache hit restores score-summary.json (with testCount) from a
|
|
49
|
-
// prior report but never writes grader-judgments.json, so judgments
|
|
50
|
-
//
|
|
51
|
-
|
|
50
|
+
// prior report but never writes grader-judgments.json, so absent judgments
|
|
51
|
+
// are legitimate ONLY when every required sub-eval came from the cache. In a
|
|
52
|
+
// hybrid full run (e.g. agentic cached, baseline evaluated fresh) the fresh
|
|
53
|
+
// run's missing judgments are still degraded — gate the skip on ALL required
|
|
54
|
+
// runs being cached, not merely any.
|
|
55
|
+
const allCached = allRequiredEvalRunsCached(ctx.config, state);
|
|
52
56
|
const inputs = classifyEnrichmentInputs(root);
|
|
53
|
-
if (inputs.kind === "judgments-missing-after-eval" && !
|
|
57
|
+
if (inputs.kind === "judgments-missing-after-eval" && !allCached) {
|
|
54
58
|
return {
|
|
55
59
|
durationMs: Date.now() - start,
|
|
56
60
|
status: "failed",
|
|
@@ -11,9 +11,11 @@ import { emitPerEntryEvalResults } from "../../pipeline/emit-eval-results.js";
|
|
|
11
11
|
import { emitSymbolPreflight } from "../../pipeline/preflight/emit-symbol-preflight.js";
|
|
12
12
|
import { loadPackageSurface } from "../../pipeline/preflight/load-package-surface.js";
|
|
13
13
|
import { AccumulatingArtifactWriter } from "../../artifact-capture/accumulating-artifact-writer.js";
|
|
14
|
+
import { FanoutArtifactWriter } from "../../artifact-capture/fanout-artifact-writer.js";
|
|
15
|
+
import { InstrumentedArtifactWriter } from "../../artifact-capture/instrumented-artifact-writer.js";
|
|
14
16
|
import { getStepInputPaths } from "../../pipeline/cache.js";
|
|
15
17
|
import { buildCacheContext } from "../cache-context.js";
|
|
16
|
-
import { remapToCacheHitRefs } from "../../pipeline/cache-hit-restore.js";
|
|
18
|
+
import { pruneToResolvableRefs, remapToCacheHitRefs, } from "../../pipeline/cache-hit-restore.js";
|
|
17
19
|
import { checkCanonicalContextsExist, checkGeneratedConfigsExist, checkResultsExist, } from "../../pipeline/checks.js";
|
|
18
20
|
import { computeEvalFingerprint } from "../../pipeline/eval-fingerprint.js";
|
|
19
21
|
import { loadGraderModel } from "../../pipeline/grader-api.js";
|
|
@@ -147,11 +149,29 @@ export class RunEvalStep {
|
|
|
147
149
|
remoteCacheResult.sourceRunId &&
|
|
148
150
|
ctx.artifactWriter instanceof AccumulatingArtifactWriter) {
|
|
149
151
|
const restored = remapToCacheHitRefs(remoteCacheResult.artifactManifest, { sourceRunId: remoteCacheResult.sourceRunId });
|
|
150
|
-
|
|
151
|
-
|
|
152
|
+
// W0350 / D0057 — a degraded source run can advertise per-entry
|
|
153
|
+
// artifacts (e.g. rawResults) whose objects were never written under
|
|
154
|
+
// its prefix. Drop those over-claims here, at the restore boundary,
|
|
155
|
+
// so the new run's manifest advertises only artifacts that resolve —
|
|
156
|
+
// rather than pushing per-object HEAD checks onto the read side's hot
|
|
157
|
+
// signing path (AC 3). When no object checker is reachable in the
|
|
158
|
+
// writer chain (local-only / NoOp / gateway backends), skip the prune
|
|
159
|
+
// and restore verbatim, preserving prior behavior.
|
|
160
|
+
const checker = findObjectChecker(ctx.artifactWriter);
|
|
161
|
+
const { manifest: resolvable, droppedEntries, droppedRefs, } = checker
|
|
162
|
+
? await pruneToResolvableRefs(restored, checker)
|
|
163
|
+
: { manifest: restored, droppedEntries: 0, droppedRefs: 0 };
|
|
164
|
+
ctx.artifactWriter.injectAccumulated(resolvable);
|
|
165
|
+
const count = Object.keys(resolvable).length;
|
|
152
166
|
if (count > 0) {
|
|
153
167
|
console.log(` ↪ Restored ${count} artifact ref${count === 1 ? "" : "s"} from run ${remoteCacheResult.sourceRunId}`);
|
|
154
168
|
}
|
|
169
|
+
if (droppedEntries > 0 || droppedRefs > 0) {
|
|
170
|
+
const refsNote = droppedRefs > 0
|
|
171
|
+
? ` and ${droppedRefs} ref${droppedRefs === 1 ? "" : "s"}`
|
|
172
|
+
: "";
|
|
173
|
+
console.log(` ⚠️ Dropped ${droppedEntries} unresolvable artifact entr${droppedEntries === 1 ? "y" : "ies"}${refsNote} over-claimed by cache parent ${remoteCacheResult.sourceRunId}`);
|
|
174
|
+
}
|
|
155
175
|
}
|
|
156
176
|
return {
|
|
157
177
|
durationMs: Date.now() - start,
|
|
@@ -275,6 +295,39 @@ export class RunEvalStep {
|
|
|
275
295
|
}
|
|
276
296
|
}
|
|
277
297
|
// ---------------------------------------------------------------------------
|
|
298
|
+
// Object-checker discovery (D0057 / W0350)
|
|
299
|
+
// ---------------------------------------------------------------------------
|
|
300
|
+
const FIND_CHECKER_MAX_STEPS = 16;
/**
 * Search a writer composition for a member that exposes `objectExists`
 * (the `ArtifactObjectChecker` capability).
 *
 * The walk descends through `AccumulatingArtifactWriter` and
 * `InstrumentedArtifactWriter` via their `inner` writer, and through
 * `FanoutArtifactWriter` via its `delegates`. Writers of any other class are
 * leaves. Pending writers are kept on a LIFO stack, so the most recently
 * pushed writer is inspected first.
 *
 * `FIND_CHECKER_MAX_STEPS` bounds the number of writers inspected; it is a
 * guard against a self-referential decorator chain. When the bound is hit or
 * the chain is exhausted without a match, the result is null and the
 * cache-hit restore skips pruning.
 *
 * @param writer Root of the writer chain to inspect.
 * @returns The first writer found with a callable `objectExists`, or null.
 */
function findObjectChecker(writer) {
    const pending = [writer];
    let inspected = 0;
    while (pending.length > 0 && inspected < FIND_CHECKER_MAX_STEPS) {
        // Count every pop, including skipped falsy entries.
        inspected += 1;
        const candidate = pending.pop();
        if (!candidate) {
            continue;
        }
        if (hasObjectExists(candidate)) {
            return candidate;
        }
        const isDecorator = candidate instanceof AccumulatingArtifactWriter ||
            candidate instanceof InstrumentedArtifactWriter;
        if (isDecorator) {
            // Single-child wrappers: keep walking into the wrapped writer.
            pending.push(candidate.inner);
        }
        else if (candidate instanceof FanoutArtifactWriter) {
            // Multi-child wrapper: queue every delegate.
            pending.push(...candidate.delegates);
        }
    }
    return null;
}
|
|
327
|
+
function hasObjectExists(w) {
    // Duck-typed capability probe: a writer counts as an object checker when
    // it carries a callable `objectExists` member.
    const probe = w.objectExists;
    return typeof probe === "function";
}
|
|
330
|
+
// ---------------------------------------------------------------------------
|
|
278
331
|
// Remote cache helpers
|
|
279
332
|
// ---------------------------------------------------------------------------
|
|
280
333
|
async function checkRemoteCache(fingerprint, reportStore, rootDir) {
|
|
@@ -0,0 +1,35 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* pipeline/assert-grader-judgments-persisted.ts
|
|
3
|
+
*
|
|
4
|
+
* Post-persist guard for the grader-judgments write junction in
|
|
5
|
+
* `calculateAndWriteScores`.
|
|
6
|
+
*
|
|
7
|
+
* `extractGraderJudgmentsResilient` returns N judgments in memory, after which
|
|
8
|
+
* `runBorderlinePass` may mutate the array in place and a `judgments.length > 0`
|
|
9
|
+
* guard decides whether `grader-judgments.json` is written. A transient read
|
|
10
|
+
* anomaly or an unexpected in-place emptying can leave the file absent or empty
|
|
11
|
+
* even though extraction yielded judgments. Silently skipping the write strands
|
|
12
|
+
* gap-analysis and ships a scored report with no test details.
|
|
13
|
+
*
|
|
14
|
+
* This guard re-reads the file from disk — the same read gap-analysis performs
|
|
15
|
+
* — and fails loud when a non-empty extraction did not round-trip. The check is
|
|
16
|
+
* deliberately narrow: it fires only on the catastrophic "extracted N>0,
|
|
17
|
+
* persisted 0" divergence, never on a genuinely judgment-free run.
|
|
18
|
+
*/
|
|
19
|
+
/** Injectable seam — counts the grader judgments actually on disk. */
|
|
20
|
+
export interface PersistVerificationDeps {
|
|
21
|
+
countPersisted: (path: string) => number;
|
|
22
|
+
}
|
|
23
|
+
/**
|
|
24
|
+
* Fail loud when a non-empty grader-judgment extraction did not round-trip to
|
|
25
|
+
* disk. No-ops when nothing was extracted — a judgment-free run (all api-errors
|
|
26
|
+
* / no llm-rubric) is valid and persists nothing by design.
|
|
27
|
+
*
|
|
28
|
+
* @param extractedCount Judgments returned by extraction, captured BEFORE any
|
|
29
|
+
* in-place mutation (e.g. the borderline-consensus pass) so the count
|
|
30
|
+
* reflects what extraction actually produced.
|
|
31
|
+
* @param judgmentsPath Absolute path to `grader-judgments.json`.
|
|
32
|
+
* @param deps Injectable disk reader; defaults to the real filesystem.
|
|
33
|
+
* @throws {Error} when `extractedCount > 0` but the persisted file holds 0.
|
|
34
|
+
*/
|
|
35
|
+
export declare function assertGraderJudgmentsPersisted(extractedCount: number, judgmentsPath: string, deps?: PersistVerificationDeps): void;
|
|
@@ -0,0 +1,58 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* pipeline/assert-grader-judgments-persisted.ts
|
|
3
|
+
*
|
|
4
|
+
* Post-persist guard for the grader-judgments write junction in
|
|
5
|
+
* `calculateAndWriteScores`.
|
|
6
|
+
*
|
|
7
|
+
* `extractGraderJudgmentsResilient` returns N judgments in memory, after which
|
|
8
|
+
* `runBorderlinePass` may mutate the array in place and a `judgments.length > 0`
|
|
9
|
+
* guard decides whether `grader-judgments.json` is written. A transient read
|
|
10
|
+
* anomaly or an unexpected in-place emptying can leave the file absent or empty
|
|
11
|
+
* even though extraction yielded judgments. Silently skipping the write strands
|
|
12
|
+
* gap-analysis and ships a scored report with no test details.
|
|
13
|
+
*
|
|
14
|
+
* This guard re-reads the file from disk — the same read gap-analysis performs
|
|
15
|
+
* — and fails loud when a non-empty extraction did not round-trip. The check is
|
|
16
|
+
* deliberately narrow: it fires only on the catastrophic "extracted N>0,
|
|
17
|
+
* persisted 0" divergence, never on a genuinely judgment-free run.
|
|
18
|
+
*/
|
|
19
|
+
import { existsSync, readFileSync } from "node:fs";
|
|
20
|
+
/**
 * Read `grader-judgments.json` and return the length of the array it holds.
 *
 * Every "no usable judgments" shape collapses to 0: a missing file, an
 * unreadable path, invalid JSON, or a JSON value that is not an array.
 *
 * @param path Path of the judgments file to inspect.
 * @returns Number of array elements persisted at `path`, or 0.
 */
function defaultCountPersisted(path) {
    try {
        // Read directly rather than checking existence first: a missing file
        // makes `readFileSync` throw, which the catch below maps to 0. This
        // avoids a check-then-read race and a redundant filesystem call.
        const parsed = JSON.parse(readFileSync(path, "utf-8"));
        return Array.isArray(parsed) ? parsed.length : 0;
    }
    catch {
        // Missing, unreadable, or malformed — all mean "nothing persisted".
        return 0;
    }
}
|
|
36
|
+
/**
 * Fail loud when a non-empty grader-judgment extraction did not round-trip to
 * disk. No-ops when nothing was extracted — a judgment-free run (all api-errors
 * / no llm-rubric) is valid and persists nothing by design.
 *
 * @param extractedCount Judgments returned by extraction, captured BEFORE any
 *   in-place mutation (e.g. the borderline-consensus pass) so the count
 *   reflects what extraction actually produced.
 * @param judgmentsPath Absolute path to `grader-judgments.json`.
 * @param deps Injectable disk reader; defaults to the real filesystem.
 * @throws {Error} when `extractedCount > 0` but the persisted count is not a
 *   positive number (0, negative, or not a usable number at all).
 */
export function assertGraderJudgmentsPersisted(extractedCount, judgmentsPath, deps = { countPersisted: defaultCountPersisted }) {
    // Nothing extracted means nothing is expected on disk; skip the read.
    if (extractedCount <= 0)
        return;
    const persisted = deps.countPersisted(judgmentsPath);
    // Negated `> 0` instead of `<= 0`: NaN or undefined from an injected
    // reader compares false both ways, so `<= 0` would let a broken reader
    // pass this guard silently. Only a positive count proves the round-trip.
    if (!(persisted > 0)) {
        throw new Error(`Grader judgments extract/persist divergence: extracted ${extractedCount} ` +
            `judgment(s) but grader-judgments.json persisted 0. Refusing to finish ` +
            `scoring — a scored report with no grader judgments would strand ` +
            `gap-analysis and ship with no test details.`);
    }
}
|
|
@@ -8,7 +8,7 @@
|
|
|
8
8
|
* @see docs/decisions/D0040-artifact-ref-source-run-id.md
|
|
9
9
|
* @see docs/design-docs/cache-hit-artifact-restoration.md
|
|
10
10
|
*/
|
|
11
|
-
import { type ArtifactManifest, type RunId } from "../_vendor/ailf-core/index.d.ts";
|
|
11
|
+
import { type ArtifactManifest, type ArtifactObjectChecker, type RunId } from "../_vendor/ailf-core/index.d.ts";
|
|
12
12
|
/**
|
|
13
13
|
* Copy an artifact manifest verbatim and stamp `sourceRunId` on every ref
|
|
14
14
|
* that doesn't already carry one.
|
|
@@ -47,3 +47,39 @@ import { type ArtifactManifest, type RunId } from "../_vendor/ailf-core/index.d.
|
|
|
47
47
|
export declare function remapToCacheHitRefs(source: ArtifactManifest, opts: {
|
|
48
48
|
sourceRunId: RunId;
|
|
49
49
|
}): ArtifactManifest;
|
|
50
|
+
/**
 * Outcome of `pruneToResolvableRefs`: the pruned manifest together with
 * counters describing how much was removed.
 */
export interface PruneResult {
    /** Manifest with over-claimed entries/refs removed. */
    readonly manifest: ArtifactManifest;
    /**
     * Per-entry entries dropped because their object did not exist. Entries
     * belonging to a per-entry ref that ended up dropped entirely are counted
     * here as well.
     */
    readonly droppedEntries: number;
    /** Refs dropped entirely (bulk missing, or per-entry left with no entries). */
    readonly droppedRefs: number;
}
|
|
59
|
+
/**
 * Drop artifact refs (and per-entry entries) a cached report over-claimed —
 * entries whose backing object was never written under the source run's
 * storage prefix (D0040 / D0057, W0350).
 *
 * A degraded source run can publish a manifest that lists `rawResults`
 * entries with no GCS object behind them; `remapToCacheHitRefs` copies those
 * phantom entries forward into the new run's manifest, and the read side then
 * signs URLs that 404 ("the specified key does not exist"). Pruning here, at
 * the cache-hit restore boundary, removes the over-claim at the source so the
 * written manifest's `entryCount` / `entries[]` reflect only artifacts that
 * actually resolve — instead of pushing per-object HEAD checks onto the hot
 * signing path (W0350 AC 3).
 *
 * Resolution mirrors the gateway: a per-entry object lives at
 * `descriptor.objectPath(sourceRunId, entry.key)`, where `sourceRunId` is the
 * runId encoded in `ref.path` (preferred — structurally tied to where bytes
 * physically live) falling back to `ref.sourceRunId` (the lineage hint).
 *
 * **Fail open.** `checker.objectExists` throws when existence can't be
 * determined (auth / network / quota). A throw KEEPS the ref/entry — we never
 * drop a real artifact on a transient blip; the read side already tolerates a
 * rare residual 404 (W0349). Only a definitive `false` drops an entry.
 *
 * Pure w.r.t. its inputs: returns a fresh manifest, never mutates `source`.
 *
 * @param source Manifest to prune; left untouched.
 * @param checker Existence probe whose `objectExists(path)` is consulted for
 *   every bulk ref path and every resolvable per-entry object path.
 * @returns The pruned manifest plus counts of dropped entries and dropped refs.
 */
export declare function pruneToResolvableRefs(source: ArtifactManifest, checker: ArtifactObjectChecker): Promise<PruneResult>;
|
|
@@ -8,7 +8,7 @@
|
|
|
8
8
|
* @see docs/decisions/D0040-artifact-ref-source-run-id.md
|
|
9
9
|
* @see docs/design-docs/cache-hit-artifact-restoration.md
|
|
10
10
|
*/
|
|
11
|
-
import { ARTIFACT_REGISTRY, } from "../_vendor/ailf-core/index.js";
|
|
11
|
+
import { ARTIFACT_REGISTRY, runId as parseRunId, } from "../_vendor/ailf-core/index.js";
|
|
12
12
|
/**
|
|
13
13
|
* Copy an artifact manifest verbatim and stamp `sourceRunId` on every ref
|
|
14
14
|
* that doesn't already carry one.
|
|
@@ -60,3 +60,110 @@ export function remapToCacheHitRefs(source, opts) {
|
|
|
60
60
|
}
|
|
61
61
|
return out;
|
|
62
62
|
}
|
|
63
|
+
/**
 * Prune a cache-hit manifest down to the artifacts that actually resolve
 * (D0040 / D0057, W0350).
 *
 * Background: a degraded source run can publish a manifest listing
 * `rawResults` entries that have no GCS object behind them.
 * `remapToCacheHitRefs` carries those phantom entries into the new run's
 * manifest, and the read side then signs URLs that 404 ("the specified key
 * does not exist"). Removing the over-claim here, at the cache-hit restore
 * boundary, keeps the written manifest's `entryCount` / `entries[]` honest
 * without adding per-object HEAD checks to the hot signing path (W0350 AC 3).
 *
 * Handling per ref:
 * - **bulk** — a single existence check against `ref.path`; a definitive
 *   miss drops the ref.
 * - **per-entry** — each entry is probed at
 *   `descriptor.objectPath(sourceRunId, entry.key)`, mirroring the gateway.
 *   `sourceRunId` is the runId encoded in `ref.path` (preferred, because it is
 *   structurally tied to where the bytes live), falling back to
 *   `ref.sourceRunId` (the lineage hint). Missing entries are removed and
 *   `entryCount` / `bytes` are recomputed from the survivors; a ref with no
 *   survivors is dropped outright.
 *
 * **Fail open.** When existence cannot be determined — the checker throws
 * (auth / network / quota), the descriptor or source run id is unknown, the
 * ref has no entries, or an entry key cannot be turned into an object path —
 * the ref/entry is KEPT. A real artifact is never dropped on a transient
 * blip; the read side already tolerates a rare residual 404 (W0349).
 *
 * Pure w.r.t. its inputs: builds a fresh manifest and never mutates `source`.
 *
 * @param source Manifest to prune.
 * @param checker Object-existence probe exposing `objectExists(path)`.
 * @returns `{ manifest, droppedEntries, droppedRefs }`.
 */
export async function pruneToResolvableRefs(source, checker) {
    const pruned = {};
    let droppedEntries = 0;
    let droppedRefs = 0;
    for (const [artifactType, ref] of Object.entries(source)) {
        if (!ref) {
            continue;
        }
        const descriptor = ARTIFACT_REGISTRY[artifactType];
        // Bulk layout: the whole artifact is one object stored at ref.path.
        if (ref.layout === "bulk") {
            const present = await existsOrKeep(checker, ref.path);
            if (present) {
                pruned[artifactType] = ref;
            }
            else {
                droppedRefs += 1;
            }
            continue;
        }
        // Per-entry layout: every entry is a separate object under the source
        // run's prefix.
        const sourceRunId = resolveSourceRunId(ref);
        const entries = ref.entries ?? [];
        const resolvable = Boolean(descriptor) && sourceRunId !== undefined && entries.length > 0;
        if (!resolvable) {
            // Object paths cannot be derived — fail open and keep the ref as is.
            pruned[artifactType] = ref;
            continue;
        }
        const probeEntry = async (entry) => {
            let objectPath;
            try {
                objectPath = descriptor.objectPath(sourceRunId, entry.key);
            }
            catch {
                // Malformed key: keep the entry rather than drop it.
                return true;
            }
            return existsOrKeep(checker, objectPath);
        };
        // All entries of one ref are probed concurrently.
        const verdicts = await Promise.all(entries.map((entry) => probeEntry(entry)));
        const survivors = [];
        entries.forEach((entry, idx) => {
            if (verdicts[idx]) {
                survivors.push(entry);
            }
        });
        const missing = entries.length - survivors.length;
        if (missing === 0) {
            // Nothing over-claimed — reuse the original ref untouched.
            pruned[artifactType] = ref;
            continue;
        }
        droppedEntries += missing;
        if (survivors.length === 0) {
            // Not a single entry resolves: the whole ref is an over-claim.
            droppedRefs += 1;
            continue;
        }
        let survivingBytes = 0;
        for (const entry of survivors) {
            survivingBytes += entry.bytes ?? 0;
        }
        pruned[artifactType] = {
            ...ref,
            entries: survivors,
            entryCount: survivors.length,
            bytes: survivingBytes,
        };
    }
    return { manifest: pruned, droppedEntries, droppedRefs };
}
|
|
143
|
+
/**
 * Work out which run's storage prefix holds a ref's bytes.
 *
 * The runId segment of `ref.path` (`runs/<runId>/…`) wins when it passes the
 * canonical run-id parser — validating it keeps a malformed manifest path
 * from leaking into a synthesized object name. Otherwise the
 * `ref.sourceRunId` lineage hint is returned (possibly `undefined`). This is
 * the same resolution order the gateway uses.
 */
function resolveSourceRunId(ref) {
    const match = /^runs\/([^/]+)/.exec(ref.path);
    const candidate = match?.[1];
    if (!candidate) {
        return ref.sourceRunId;
    }
    const outcome = parseRunId(candidate);
    return outcome.ok ? outcome.value : ref.sourceRunId;
}
|
|
158
|
+
/**
|
|
159
|
+
* `checker.objectExists` wrapper that returns `true` (keep) on a thrown,
|
|
160
|
+
* indeterminate result — only a definitive `false` drops the artifact.
|
|
161
|
+
*/
|
|
162
|
+
async function existsOrKeep(checker, path) {
|
|
163
|
+
try {
|
|
164
|
+
return await checker.objectExists(path);
|
|
165
|
+
}
|
|
166
|
+
catch {
|
|
167
|
+
return true;
|
|
168
|
+
}
|
|
169
|
+
}
|
|
@@ -42,6 +42,7 @@ import { loadSource } from "../sources.js";
|
|
|
42
42
|
import { LiteracyVariant } from "./normalize-mode.js";
|
|
43
43
|
import { scoreTestGroup, } from "./compiler/scoring-bridge.js";
|
|
44
44
|
import { extractGraderJudgmentsResilient, } from "./extract-grader-judgments-resilient.js";
|
|
45
|
+
import { assertGraderJudgmentsPersisted } from "./assert-grader-judgments-persisted.js";
|
|
45
46
|
// Re-export from core for backward compatibility.
|
|
46
47
|
// Existing imports from this file continue to work unchanged.
|
|
47
48
|
export { classifyRubric, detectFeatureArea, extractUrlMetadata, mergeScores, parseRubricScore, } from "../_vendor/ailf-core/index.js";
|
|
@@ -1544,6 +1545,7 @@ export async function calculateAndWriteScores(options) {
|
|
|
1544
1545
|
log.info("Score summary written to results/latest/score-summary.json");
|
|
1545
1546
|
// Extract and persist grader judgments
|
|
1546
1547
|
const judgments = await extractGraderJudgmentsResilient([baselineResultsPath], undefined, log, { deps: resilientJudgmentDeps });
|
|
1548
|
+
const extractedJudgmentCount = judgments.length;
|
|
1547
1549
|
const borderlineConsistency = await runBorderlinePass(judgments, [
|
|
1548
1550
|
baselineResultsPath,
|
|
1549
1551
|
]);
|
|
@@ -1555,6 +1557,10 @@ export async function calculateAndWriteScores(options) {
|
|
|
1555
1557
|
writeFileSync(join(outDir, "grader-judgments.json"), JSON.stringify(judgments, null, 2));
|
|
1556
1558
|
log.info(`Grader judgments written to results/latest/grader-judgments.json (${judgments.length} judgments)`);
|
|
1557
1559
|
}
|
|
1560
|
+
// Fail loud if a non-empty extraction did not round-trip to disk (a
|
|
1561
|
+
// transient divergence at the persist junction): otherwise gap-analysis
|
|
1562
|
+
// skips and the report ships a score with no test details.
|
|
1563
|
+
assertGraderJudgmentsPersisted(extractedJudgmentCount, join(outDir, "grader-judgments.json"));
|
|
1558
1564
|
// Extract and persist per-test results (D0029: model output + metadata)
|
|
1559
1565
|
// Agent-harness produces a single profile shared across detected variants
|
|
1560
1566
|
// (the docs/no-docs split doesn't apply — there is no gold/baseline pair).
|
|
@@ -1607,6 +1613,7 @@ export async function calculateAndWriteScores(options) {
|
|
|
1607
1613
|
writeFileSync(join(outDir, "score-summary.json"), JSON.stringify(summary, null, 2));
|
|
1608
1614
|
log.info("Score summary written to results/latest/score-summary.json");
|
|
1609
1615
|
const judgments = await extractGraderJudgmentsResilient([baselineResultsPath], undefined, log, { deps: resilientJudgmentDeps });
|
|
1616
|
+
const extractedJudgmentCount = judgments.length;
|
|
1610
1617
|
const borderlineConsistency = await runBorderlinePass(judgments, [
|
|
1611
1618
|
baselineResultsPath,
|
|
1612
1619
|
]);
|
|
@@ -1618,6 +1625,10 @@ export async function calculateAndWriteScores(options) {
|
|
|
1618
1625
|
writeFileSync(join(outDir, "grader-judgments.json"), JSON.stringify(judgments, null, 2));
|
|
1619
1626
|
log.info(`Grader judgments written to results/latest/grader-judgments.json (${judgments.length} judgments)`);
|
|
1620
1627
|
}
|
|
1628
|
+
// Fail loud if a non-empty extraction did not round-trip to disk (a
|
|
1629
|
+
// transient divergence at the persist junction): otherwise gap-analysis
|
|
1630
|
+
// skips and the report ships a score with no test details.
|
|
1631
|
+
assertGraderJudgmentsPersisted(extractedJudgmentCount, join(outDir, "grader-judgments.json"));
|
|
1621
1632
|
// Knowledge-probe deletes vars.docs in the compiler, so every entry's
|
|
1622
1633
|
// detected variant is "baseline" — supply the probe profile under both
|
|
1623
1634
|
// keys so the composite is populated regardless of detection.
|
|
@@ -1744,6 +1755,9 @@ export async function calculateAndWriteScores(options) {
|
|
|
1744
1755
|
? [baselineResultsPath, agenticResultsPath]
|
|
1745
1756
|
: [baselineResultsPath];
|
|
1746
1757
|
const judgments = await extractGraderJudgmentsResilient(judgmentResultPaths, { reliability, ...(options.runId ? { runId: options.runId } : {}) }, log, { deps: resilientJudgmentDeps });
|
|
1758
|
+
// Capture the extracted count before the borderline pass mutates the array
|
|
1759
|
+
// in place — the persist guard below compares it against what lands on disk.
|
|
1760
|
+
const extractedJudgmentCount = judgments.length;
|
|
1747
1761
|
// Borderline-consensus pass — re-grade the ±5 borderline subset N times
|
|
1748
1762
|
// and merge medians back into the canonical judgments BEFORE
|
|
1749
1763
|
// `validateGraderJudgmentsCalibration` runs, so the calibration counter
|
|
@@ -1774,6 +1788,10 @@ export async function calculateAndWriteScores(options) {
|
|
|
1774
1788
|
});
|
|
1775
1789
|
}
|
|
1776
1790
|
}
|
|
1791
|
+
// Fail loud if a non-empty extraction did not round-trip to disk (a transient
|
|
1792
|
+
// divergence at the persist junction): otherwise gap-analysis skips and the
|
|
1793
|
+
// report ships a score with no test details.
|
|
1794
|
+
assertGraderJudgmentsPersisted(extractedJudgmentCount, join(outDir, "grader-judgments.json"));
|
|
1777
1795
|
// Extract and persist per-test results (D0029: model output + metadata).
|
|
1778
1796
|
// Literacy gold (with-docs) entries score against the default profile;
|
|
1779
1797
|
// baseline (without-docs) entries score against the output-only profile.
|