@sanity/ailf 5.0.0 → 6.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/config/airbyte/ai_literacy_framework.connector.yaml +276 -0
- package/config/bigquery/views/synthesis_parse_failure_rate_7d.sql +42 -0
- package/config/diagnosis-cards.ts +318 -0
- package/config/models.ts +12 -0
- package/dist/_vendor/ailf-core/grader/failure-modes/agent-harness.d.ts +13 -0
- package/dist/_vendor/ailf-core/grader/failure-modes/agent-harness.js +16 -0
- package/dist/_vendor/ailf-core/grader/failure-modes/common.d.ts +14 -0
- package/dist/_vendor/ailf-core/grader/failure-modes/common.js +18 -0
- package/dist/_vendor/ailf-core/grader/failure-modes/index.d.ts +45 -0
- package/dist/_vendor/ailf-core/grader/failure-modes/index.js +109 -0
- package/dist/_vendor/ailf-core/grader/failure-modes/knowledge-probe.d.ts +13 -0
- package/dist/_vendor/ailf-core/grader/failure-modes/knowledge-probe.js +17 -0
- package/dist/_vendor/ailf-core/grader/failure-modes/literacy.d.ts +13 -0
- package/dist/_vendor/ailf-core/grader/failure-modes/literacy.js +17 -0
- package/dist/_vendor/ailf-core/grader/failure-modes/mcp.d.ts +13 -0
- package/dist/_vendor/ailf-core/grader/failure-modes/mcp.js +17 -0
- package/dist/_vendor/ailf-core/index.d.ts +1 -0
- package/dist/_vendor/ailf-core/index.js +4 -0
- package/dist/_vendor/ailf-core/ports/context.d.ts +12 -0
- package/dist/_vendor/ailf-core/schemas/eval-config.d.ts +7 -0
- package/dist/_vendor/ailf-core/schemas/eval-config.js +8 -0
- package/dist/_vendor/ailf-core/services/diagnosis/card-validators.d.ts +41 -0
- package/dist/_vendor/ailf-core/services/diagnosis/card-validators.js +40 -0
- package/dist/_vendor/ailf-core/services/diagnosis/cards/__tests__/area-summary.test.d.ts +7 -0
- package/dist/_vendor/ailf-core/services/diagnosis/cards/__tests__/area-summary.test.js +131 -0
- package/dist/_vendor/ailf-core/services/diagnosis/cards/__tests__/failure-mode-summary.test.d.ts +7 -0
- package/dist/_vendor/ailf-core/services/diagnosis/cards/__tests__/failure-mode-summary.test.js +230 -0
- package/dist/_vendor/ailf-core/services/diagnosis/cards/__tests__/no-issues.test.d.ts +7 -0
- package/dist/_vendor/ailf-core/services/diagnosis/cards/__tests__/no-issues.test.js +155 -0
- package/dist/_vendor/ailf-core/services/diagnosis/cards/area-summary.d.ts +17 -0
- package/dist/_vendor/ailf-core/services/diagnosis/cards/area-summary.js +43 -0
- package/dist/_vendor/ailf-core/services/diagnosis/cards/doc-attribution-spotlight.d.ts +46 -0
- package/dist/_vendor/ailf-core/services/diagnosis/cards/doc-attribution-spotlight.js +108 -0
- package/dist/_vendor/ailf-core/services/diagnosis/cards/failure-mode-summary.d.ts +28 -0
- package/dist/_vendor/ailf-core/services/diagnosis/cards/failure-mode-summary.js +140 -0
- package/dist/_vendor/ailf-core/services/diagnosis/cards/index.d.ts +49 -0
- package/dist/_vendor/ailf-core/services/diagnosis/cards/index.js +65 -0
- package/dist/_vendor/ailf-core/services/diagnosis/cards/low-confidence-attribution.d.ts +27 -0
- package/dist/_vendor/ailf-core/services/diagnosis/cards/low-confidence-attribution.js +93 -0
- package/dist/_vendor/ailf-core/services/diagnosis/cards/no-issues.d.ts +32 -0
- package/dist/_vendor/ailf-core/services/diagnosis/cards/no-issues.js +71 -0
- package/dist/_vendor/ailf-core/services/diagnosis/cards/regression-vs-baseline.d.ts +44 -0
- package/dist/_vendor/ailf-core/services/diagnosis/cards/regression-vs-baseline.js +130 -0
- package/dist/_vendor/ailf-core/services/diagnosis/cards/top-recommendations.d.ts +41 -0
- package/dist/_vendor/ailf-core/services/diagnosis/cards/top-recommendations.js +111 -0
- package/dist/_vendor/ailf-core/services/diagnosis/cards/weakest-area.d.ts +43 -0
- package/dist/_vendor/ailf-core/services/diagnosis/cards/weakest-area.js +118 -0
- package/dist/_vendor/ailf-core/services/diagnosis/prompt-builders.d.ts +72 -0
- package/dist/_vendor/ailf-core/services/diagnosis/prompt-builders.js +286 -0
- package/dist/_vendor/ailf-core/services/diagnosis/prompts/doc-attribution-spotlight.system.d.ts +17 -0
- package/dist/_vendor/ailf-core/services/diagnosis/prompts/doc-attribution-spotlight.system.js +58 -0
- package/dist/_vendor/ailf-core/services/diagnosis/prompts/index.d.ts +10 -0
- package/dist/_vendor/ailf-core/services/diagnosis/prompts/index.js +10 -0
- package/dist/_vendor/ailf-core/services/diagnosis/prompts/low-confidence-attribution.system.d.ts +15 -0
- package/dist/_vendor/ailf-core/services/diagnosis/prompts/low-confidence-attribution.system.js +53 -0
- package/dist/_vendor/ailf-core/services/diagnosis/prompts/regression-vs-baseline.system.d.ts +14 -0
- package/dist/_vendor/ailf-core/services/diagnosis/prompts/regression-vs-baseline.system.js +63 -0
- package/dist/_vendor/ailf-core/services/diagnosis/prompts/top-recommendations.system.d.ts +16 -0
- package/dist/_vendor/ailf-core/services/diagnosis/prompts/top-recommendations.system.js +78 -0
- package/dist/_vendor/ailf-core/services/diagnosis/prompts/weakest-area.system.d.ts +18 -0
- package/dist/_vendor/ailf-core/services/diagnosis/prompts/weakest-area.system.js +74 -0
- package/dist/_vendor/ailf-core/services/diagnosis/registry.d.ts +10 -0
- package/dist/_vendor/ailf-core/services/diagnosis/registry.js +10 -0
- package/dist/_vendor/ailf-core/services/diagnosis-runner.d.ts +119 -2
- package/dist/_vendor/ailf-core/services/diagnosis-runner.js +136 -2
- package/dist/_vendor/ailf-core/services/index.d.ts +5 -1
- package/dist/_vendor/ailf-core/services/index.js +15 -2
- package/dist/_vendor/ailf-core/services/llm-client-factory.d.ts +64 -0
- package/dist/_vendor/ailf-core/services/llm-client-factory.js +54 -0
- package/dist/_vendor/ailf-core/types/diagnosis.d.ts +115 -10
- package/dist/_vendor/ailf-core/types/diagnosis.js +3 -1
- package/dist/_vendor/ailf-core/types/index.d.ts +8 -1
- package/dist/_vendor/ailf-core/types/repo-config.d.ts +16 -0
- package/dist/_vendor/ailf-core/types/synthesis-telemetry.d.ts +101 -0
- package/dist/_vendor/ailf-core/types/synthesis-telemetry.js +18 -0
- package/dist/adapters/config-sources/file-config-adapter.js +8 -6
- package/dist/adapters/llm/fake-llm-client.d.ts +20 -0
- package/dist/adapters/llm/fake-llm-client.js +38 -1
- package/dist/adapters/llm/index.d.ts +1 -1
- package/dist/adapters/llm/index.js +1 -1
- package/dist/adapters/llm/openai-llm-client.js +59 -5
- package/dist/adapters/llm/retry.d.ts +18 -0
- package/dist/adapters/llm/retry.js +21 -0
- package/dist/adapters/synthesis/synthesis-telemetry-schema.d.ts +49 -0
- package/dist/adapters/synthesis/synthesis-telemetry-schema.js +55 -0
- package/dist/adapters/task-sources/content-lake-task-source.js +10 -5
- package/dist/adapters/task-sources/repo-schemas.d.ts +7 -0
- package/dist/adapters/task-sources/repo-schemas.js +10 -0
- package/dist/cli-program.js +3 -0
- package/dist/commands/interpret.d.ts +70 -0
- package/dist/commands/interpret.js +221 -0
- package/dist/commands/pipeline-action.d.ts +44 -0
- package/dist/commands/pipeline-action.js +193 -1
- package/dist/commands/run.d.ts +2 -0
- package/dist/commands/run.js +2 -0
- package/dist/composition-root.d.ts +21 -23
- package/dist/composition-root.js +107 -41
- package/dist/config/diagnosis-cards.ts +318 -0
- package/dist/config/models.ts +12 -0
- package/dist/grader/agent-harness.d.ts +5 -10
- package/dist/grader/agent-harness.js +5 -13
- package/dist/grader/common.d.ts +5 -13
- package/dist/grader/common.js +5 -17
- package/dist/grader/index.d.ts +15 -29
- package/dist/grader/index.js +15 -66
- package/dist/grader/knowledge-probe.d.ts +5 -10
- package/dist/grader/knowledge-probe.js +5 -14
- package/dist/grader/literacy.d.ts +5 -9
- package/dist/grader/literacy.js +5 -13
- package/dist/grader/mcp.d.ts +5 -10
- package/dist/grader/mcp.js +5 -14
- package/dist/orchestration/pipeline-orchestrator.js +3 -0
- package/dist/report-store.d.ts +26 -0
- package/dist/report-store.js +63 -0
- package/package.json +2 -2
|
@@ -0,0 +1,18 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Synthesis cost telemetry types — canonical TS-first shapes for
|
|
3
|
+
* Phase 6 DIAG-06 cost and parse-failure observability.
|
|
4
|
+
*
|
|
5
|
+
* These interfaces are authored independently of their Zod adapter schema
|
|
6
|
+
* (Plan 06-02) per D0045: the Zod schema declares
|
|
7
|
+
* `satisfies z.ZodType<SynthesisCostTelemetry>` against this independent
|
|
8
|
+
* type so drift is a build error, not a runtime bug.
|
|
9
|
+
*
|
|
10
|
+
* The 14 attribute paths on `SynthesisCostTelemetry` + `SynthesisPerCardTelemetry`
|
|
11
|
+
* land on the `ailf.report` Sanity doc under `summary.synthesis.diagnosis.*`
|
|
12
|
+
* (D6-09). No new sibling doc type (D0033 / D6-09).
|
|
13
|
+
*
|
|
14
|
+
* @see docs/decisions/D0045-type-architecture-and-contract-enforcement.md
|
|
15
|
+
* @see .planning/phases/06-post-run-integration-cost-telemetry/06-CONTEXT.md §D6-09
|
|
16
|
+
* @see .planning/phases/06-post-run-integration-cost-telemetry/06-CONTEXT.md §D6-12
|
|
17
|
+
*/
|
|
18
|
+
export {};
|
|
@@ -115,12 +115,10 @@ function mapEvalConfigToResolvedConfig(config, rootDir) {
|
|
|
115
115
|
compareBaseline: config.compareBaseline,
|
|
116
116
|
gapAnalysisEnabled: config.execution?.gapAnalysis ?? true,
|
|
117
117
|
// W0077 Phase 4 — `publish` is now a policy object. Map the auto value
|
|
118
|
-
//
|
|
119
|
-
//
|
|
120
|
-
//
|
|
121
|
-
publishEnabled: config.publish?.auto
|
|
122
|
-
? false
|
|
123
|
-
: config.publish?.auto !== undefined,
|
|
118
|
+
// to a boolean for the file-config path. Absence of publish.auto mirrors
|
|
119
|
+
// the CLI's "full-runs" default (enable publish; composition root gates on
|
|
120
|
+
// token availability). Only "never" explicitly disables auto-publish.
|
|
121
|
+
publishEnabled: config.publish?.auto !== "never",
|
|
124
122
|
publishTag: config.publish?.tag,
|
|
125
123
|
noCache: config.noCache ?? false,
|
|
126
124
|
noRemoteCache: config.noRemoteCache ?? false,
|
|
@@ -150,5 +148,9 @@ function mapEvalConfigToResolvedConfig(config, rootDir) {
|
|
|
150
148
|
? resolve(rootDir, config.taskSource.repoTasksPath)
|
|
151
149
|
: undefined,
|
|
152
150
|
presets: config.presets,
|
|
151
|
+
// Phase 6 / DIAG-06 — thread summary.onRun into ResolvedConfig so the
|
|
152
|
+
// file-config exit branch in executePipeline can pass it to
|
|
153
|
+
// runPostPipelineHooks.
|
|
154
|
+
summaryOnRun: config.summary?.onRun,
|
|
153
155
|
};
|
|
154
156
|
}
|
|
@@ -40,9 +40,29 @@ export declare class FakeLLMClient implements LLMClient {
|
|
|
40
40
|
readonly calls: FakeCallRecord[];
|
|
41
41
|
private readonly completeQueue;
|
|
42
42
|
private readonly structuredQueue;
|
|
43
|
+
/**
|
|
44
|
+
* Per-cardId keyed responses. A single-value entry is returned on every
|
|
45
|
+
* call for that cardId (repeated calls always get the same response). An
|
|
46
|
+
* array-value entry is consumed in order; once exhausted, calls for that
|
|
47
|
+
* cardId fall back to the FIFO structuredQueue.
|
|
48
|
+
*
|
|
49
|
+
* This is the substrate Plan 07's 17-fixture eval matrix uses to wire
|
|
50
|
+
* deterministic responses to specific LLM cards.
|
|
51
|
+
*/
|
|
52
|
+
private readonly keyedResponses;
|
|
43
53
|
constructor(args?: {
|
|
44
54
|
completeResponses?: FakeCompletionResponse[];
|
|
45
55
|
structuredResponses?: FakeStructuredResponse[];
|
|
56
|
+
/**
|
|
57
|
+
* Optional keyed-response map. Keys are `cardId` values from
|
|
58
|
+
* `args.context.cardId`. When a call matches a key the keyed entry is
|
|
59
|
+
* used instead of the FIFO queue.
|
|
60
|
+
*
|
|
61
|
+
* - Single-value entry: same response on every call for this cardId.
|
|
62
|
+
* - Array-value entry: entries consumed in insertion order; falls back
|
|
63
|
+
* to FIFO (or throws) when the array is exhausted.
|
|
64
|
+
*/
|
|
65
|
+
keyedResponses?: Record<string, FakeStructuredResponse | FakeStructuredResponse[]>;
|
|
46
66
|
});
|
|
47
67
|
complete(args: LLMCompleteArgs): Promise<LLMCompletion>;
|
|
48
68
|
completeStructured<T>(args: LLMCompleteStructuredArgs<T>): Promise<LLMStructuredCompletion<T>>;
|
|
@@ -11,9 +11,25 @@ export class FakeLLMClient {
|
|
|
11
11
|
calls = [];
|
|
12
12
|
completeQueue;
|
|
13
13
|
structuredQueue;
|
|
14
|
+
/**
|
|
15
|
+
* Per-cardId keyed responses. A single-value entry is returned on every
|
|
16
|
+
* call for that cardId (repeated calls always get the same response). An
|
|
17
|
+
* array-value entry is consumed in order; once exhausted, calls for that
|
|
18
|
+
* cardId fall back to the FIFO structuredQueue.
|
|
19
|
+
*
|
|
20
|
+
* This is the substrate Plan 07's 17-fixture eval matrix uses to wire
|
|
21
|
+
* deterministic responses to specific LLM cards.
|
|
22
|
+
*/
|
|
23
|
+
keyedResponses;
|
|
14
24
|
constructor(args = {}) {
|
|
15
25
|
this.completeQueue = [...(args.completeResponses ?? [])];
|
|
16
26
|
this.structuredQueue = [...(args.structuredResponses ?? [])];
|
|
27
|
+
// Shallow-copy each array so consuming entries never mutates the caller's fixture arrays.
|
|
28
|
+
const keyed = {};
|
|
29
|
+
for (const [key, val] of Object.entries(args.keyedResponses ?? {})) {
|
|
30
|
+
keyed[key] = Array.isArray(val) ? [...val] : val;
|
|
31
|
+
}
|
|
32
|
+
this.keyedResponses = keyed;
|
|
17
33
|
}
|
|
18
34
|
async complete(args) {
|
|
19
35
|
this.calls.push({
|
|
@@ -37,13 +53,34 @@ export class FakeLLMClient {
|
|
|
37
53
|
};
|
|
38
54
|
}
|
|
39
55
|
async completeStructured(args) {
|
|
56
|
+
// Record every call first so test assertions on this.calls are never
|
|
57
|
+
// affected by which branch (keyed vs FIFO) handles the response.
|
|
40
58
|
this.calls.push({
|
|
41
59
|
kind: "completeStructured",
|
|
42
60
|
model: args.model,
|
|
43
61
|
prompt: args.prompt,
|
|
44
62
|
...(args.context ? { context: args.context } : {}),
|
|
45
63
|
});
|
|
46
|
-
|
|
64
|
+
let next;
|
|
65
|
+
const cardId = args.context?.cardId;
|
|
66
|
+
if (cardId !== undefined && cardId in this.keyedResponses) {
|
|
67
|
+
const entry = this.keyedResponses[cardId];
|
|
68
|
+
if (Array.isArray(entry)) {
|
|
69
|
+
// Array-value: consume one entry per call. When exhausted, fall
|
|
70
|
+
// through to the FIFO queue below.
|
|
71
|
+
if (entry.length > 0) {
|
|
72
|
+
next = entry.shift();
|
|
73
|
+
}
|
|
74
|
+
}
|
|
75
|
+
else {
|
|
76
|
+
// Single-value: return the same response on every call.
|
|
77
|
+
next = entry;
|
|
78
|
+
}
|
|
79
|
+
}
|
|
80
|
+
if (next === undefined) {
|
|
81
|
+
// FIFO fallback (existing behavior)
|
|
82
|
+
next = this.structuredQueue.shift();
|
|
83
|
+
}
|
|
47
84
|
if (!next) {
|
|
48
85
|
throw new Error("FakeLLMClient: no more queued structured responses (call exceeded queue)");
|
|
49
86
|
}
|
|
@@ -5,5 +5,5 @@ export type { FakeCallRecord, FakeCompletionResponse, FakeStructuredResponse, }
|
|
|
5
5
|
export { OpenAILLMClient } from "./openai-llm-client.js";
|
|
6
6
|
export type { OpenAILLMClientOptions } from "./openai-llm-client.js";
|
|
7
7
|
export type { ModelPricing } from "./pricing.js";
|
|
8
|
-
export { DEFAULT_RETRY_POLICY, LLMHttpError, isRetryableStatus, parseRetryAfterSeconds, runWithRetry, } from "./retry.js";
|
|
8
|
+
export { DEFAULT_RETRY_POLICY, LLMHttpError, LLMParseError, isRetryableStatus, parseRetryAfterSeconds, runWithRetry, } from "./retry.js";
|
|
9
9
|
export type { RetryPolicy } from "./retry.js";
|
|
@@ -1,4 +1,4 @@
|
|
|
1
1
|
export { AnthropicLLMClient } from "./anthropic-llm-client.js";
|
|
2
2
|
export { FakeLLMClient } from "./fake-llm-client.js";
|
|
3
3
|
export { OpenAILLMClient } from "./openai-llm-client.js";
|
|
4
|
-
export { DEFAULT_RETRY_POLICY, LLMHttpError, isRetryableStatus, parseRetryAfterSeconds, runWithRetry, } from "./retry.js";
|
|
4
|
+
export { DEFAULT_RETRY_POLICY, LLMHttpError, LLMParseError, isRetryableStatus, parseRetryAfterSeconds, runWithRetry, } from "./retry.js";
|
|
@@ -10,8 +10,9 @@
|
|
|
10
10
|
* the adapter never reads `process.env`. The composition root maps env vars
|
|
11
11
|
* to typed constructor args.
|
|
12
12
|
*/
|
|
13
|
+
import { z } from "zod";
|
|
13
14
|
import { OpenAIChatResponseSchema, splitModelId, } from "../../_vendor/ailf-core/index.js";
|
|
14
|
-
import { DEFAULT_RETRY_POLICY, parseRetryAfterSeconds, runWithRetry, } from "./retry.js";
|
|
15
|
+
import { DEFAULT_RETRY_POLICY, LLMParseError, parseRetryAfterSeconds, runWithRetry, } from "./retry.js";
|
|
15
16
|
const DEFAULT_BASE_URL = "https://api.openai.com/v1/chat/completions";
|
|
16
17
|
/**
|
|
17
18
|
* Conservative defaults for the models in `packages/eval/config/models.ts`.
|
|
@@ -67,10 +68,25 @@ export class OpenAILLMClient {
|
|
|
67
68
|
}
|
|
68
69
|
async completeStructured(args) {
|
|
69
70
|
const { modelName } = splitModelId(args.model);
|
|
71
|
+
// Derive the JSON Schema from the caller's Zod schema. Zod v4 natively
|
|
72
|
+
// emits `additionalProperties: false` on every nested z.object node —
|
|
73
|
+
// this is required for OpenAI strict-mode.
|
|
74
|
+
const jsonSchema = z.toJSONSchema(args.schema, { target: "draft-2020-12" });
|
|
75
|
+
// OpenAI strict-mode requires the root to be a plain object schema (no
|
|
76
|
+
// anyOf/oneOf/allOf at the top level). Discriminated unions produce
|
|
77
|
+
// anyOf at the root — callers must wrap them in a discriminator object.
|
|
78
|
+
assertSchemaIsObjectRoot(jsonSchema, args.model);
|
|
70
79
|
const body = buildBody(modelName, args.prompt, {
|
|
71
|
-
temperature: args.temperature,
|
|
72
|
-
maxTokens: args.maxTokens,
|
|
73
|
-
responseFormat: {
|
|
80
|
+
temperature: args.temperature ?? 0.1,
|
|
81
|
+
maxTokens: args.maxTokens ?? 2000,
|
|
82
|
+
responseFormat: {
|
|
83
|
+
type: "json_schema",
|
|
84
|
+
json_schema: {
|
|
85
|
+
name: args.context?.cardId ?? "structured_output",
|
|
86
|
+
schema: jsonSchema,
|
|
87
|
+
strict: true,
|
|
88
|
+
},
|
|
89
|
+
},
|
|
74
90
|
});
|
|
75
91
|
const data = await this.callApi(body);
|
|
76
92
|
const raw = data.choices?.[0]?.message?.content;
|
|
@@ -82,8 +98,16 @@ export class OpenAILLMClient {
|
|
|
82
98
|
parsed = JSON.parse(raw);
|
|
83
99
|
}
|
|
84
100
|
catch (err) {
|
|
85
|
-
|
|
101
|
+
// Sanitize: SyntaxError.message embeds a snippet at the failure offset,
|
|
102
|
+
// which can leak prompt text or user content echoed back by the model.
|
|
103
|
+
// Keep the raw body on the instance for callers that opt in via .raw,
|
|
104
|
+
// mirroring the LLMHttpError pattern (verified by the "does not leak
|
|
105
|
+
// the response body" test in openai-llm-client.test.ts).
|
|
106
|
+
throw new LLMParseError(`OpenAI structured completion returned invalid JSON for model ${args.model}`, raw, { cause: err });
|
|
86
107
|
}
|
|
108
|
+
// strict:true guarantees a valid-against-the-schema JSON document, but
|
|
109
|
+
// the Zod parse is still load-bearing — it brands the result as T and is
|
|
110
|
+
// the only contract the engine trusts (D0045 parse-don't-validate).
|
|
87
111
|
const value = args.schema.parse(parsed);
|
|
88
112
|
const usage = extractUsage(data.usage);
|
|
89
113
|
const cost = this.computeCost(modelName, usage);
|
|
@@ -145,6 +169,36 @@ export class OpenAILLMClient {
|
|
|
145
169
|
`cost_usd=${cost.toFixed(6)}`);
|
|
146
170
|
}
|
|
147
171
|
}
|
|
172
|
+
/**
|
|
173
|
+
* Assert that the JSON Schema root is a plain object type.
|
|
174
|
+
*
|
|
175
|
+
* OpenAI strict-mode requires the root schema to be `{ type: "object" }`.
|
|
176
|
+
* A discriminated union (`z.union([...])`) produces `{ anyOf: [...] }` at
|
|
177
|
+
* the root — callers must wrap the union in a discriminator object before
|
|
178
|
+
* passing it to `completeStructured`.
|
|
179
|
+
*
|
|
180
|
+
* Per AI-SPEC §3 Pitfall 6 + T-05-03-01: caught at request-build time to
|
|
181
|
+
* avoid wasting API budget on a guaranteed 400.
|
|
182
|
+
*/
|
|
183
|
+
function assertSchemaIsObjectRoot(schema, modelId) {
|
|
184
|
+
if (typeof schema !== "object" || schema === null) {
|
|
185
|
+
throw new Error(`OpenAILLMClient: OpenAI strict-mode requires a single z.object at the ` +
|
|
186
|
+
`schema root for model ${modelId}; got non-object JSON Schema root.`);
|
|
187
|
+
}
|
|
188
|
+
const node = schema;
|
|
189
|
+
if (node.type !== "object") {
|
|
190
|
+
// Identify the kind so the error message is actionable.
|
|
191
|
+
const kind = "anyOf" in node
|
|
192
|
+
? "z.union"
|
|
193
|
+
: "oneOf" in node
|
|
194
|
+
? "z.discriminatedUnion"
|
|
195
|
+
: "allOf" in node
|
|
196
|
+
? "z.intersection"
|
|
197
|
+
: String(node.type ?? "unknown");
|
|
198
|
+
throw new Error(`OpenAILLMClient: OpenAI strict-mode requires a single z.object at the ` +
|
|
199
|
+
`schema root; got ${kind}. Wrap the union in a discriminator object.`);
|
|
200
|
+
}
|
|
201
|
+
}
|
|
148
202
|
function buildBody(modelName, prompt, opts) {
|
|
149
203
|
const body = {
|
|
150
204
|
model: modelName,
|
|
@@ -33,6 +33,24 @@ export declare class LLMHttpError extends Error {
|
|
|
33
33
|
readonly body: string;
|
|
34
34
|
constructor(status: number, body: string, attempts: number);
|
|
35
35
|
}
|
|
36
|
+
/**
|
|
37
|
+
* Sanitized error raised when an LLM adapter receives an HTTP-200 response
|
|
38
|
+
* whose body is not valid JSON. The raw response body (which may echo back
|
|
39
|
+
* user prompt content or even API-key fragments from prompts) is kept on the
|
|
40
|
+
* instance for callers that opt in via `.raw`, NOT in the message string.
|
|
41
|
+
*
|
|
42
|
+
* Mirrors the LLMHttpError pattern verified by the
|
|
43
|
+
* "does not leak the response body" test in openai-llm-client.test.ts.
|
|
44
|
+
*/
|
|
45
|
+
export declare class LLMParseError extends Error {
|
|
46
|
+
/** Full raw response body (kept on the instance, NOT in `message`). */
|
|
47
|
+
readonly raw: string;
|
|
48
|
+
/** Length of `raw` in UTF-16 code units (`raw.length`, not a byte count) — safe to include in the message. */
|
|
49
|
+
readonly rawLength: number;
|
|
50
|
+
constructor(message: string, raw: string, options?: {
|
|
51
|
+
cause?: unknown;
|
|
52
|
+
});
|
|
53
|
+
}
|
|
36
54
|
export declare function isRetryableStatus(status: number): boolean;
|
|
37
55
|
export interface RunWithRetryArgs<T> {
|
|
38
56
|
policy: RetryPolicy;
|
|
@@ -29,6 +29,27 @@ export class LLMHttpError extends Error {
|
|
|
29
29
|
this.body = body;
|
|
30
30
|
}
|
|
31
31
|
}
|
|
32
|
+
/**
|
|
33
|
+
* Sanitized error raised when an LLM adapter receives an HTTP-200 response
|
|
34
|
+
* whose body is not valid JSON. The raw response body (which may echo back
|
|
35
|
+
* user prompt content or even API-key fragments from prompts) is kept on the
|
|
36
|
+
* instance for callers that opt in via `.raw`, NOT in the message string.
|
|
37
|
+
*
|
|
38
|
+
* Mirrors the LLMHttpError pattern verified by the
|
|
39
|
+
* "does not leak the response body" test in openai-llm-client.test.ts.
|
|
40
|
+
*/
|
|
41
|
+
export class LLMParseError extends Error {
|
|
42
|
+
/** Full raw response body (kept on the instance, NOT in `message`). */
|
|
43
|
+
raw;
|
|
44
|
+
/** Length of `raw` in UTF-16 code units (`raw.length`, not a byte count) — safe to include in the message. */
|
|
45
|
+
rawLength;
|
|
46
|
+
constructor(message, raw, options) {
|
|
47
|
+
super(`${message} (raw=${raw.length}B)`, options);
|
|
48
|
+
this.name = "LLMParseError";
|
|
49
|
+
this.raw = raw;
|
|
50
|
+
this.rawLength = raw.length;
|
|
51
|
+
}
|
|
52
|
+
}
|
|
32
53
|
export function isRetryableStatus(status) {
|
|
33
54
|
return status === 429 || (status >= 500 && status < 600);
|
|
34
55
|
}
|
|
@@ -0,0 +1,49 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Zod adapter schema for SynthesisCostTelemetry at the trust boundary.
|
|
3
|
+
*
|
|
4
|
+
* This schema sits at `packages/eval/src/adapters/**` and is therefore
|
|
5
|
+
* scanned by `pnpm check-trust-boundary-satisfies` (D0045). The
|
|
6
|
+
* `satisfies z.ZodType<SynthesisCostTelemetry>` clause makes schema/type
|
|
7
|
+
* drift a build error, not a runtime bug.
|
|
8
|
+
*
|
|
9
|
+
* Used by:
|
|
10
|
+
* - Plan 06-04 `ReportStore.patchSynthesis` — validates telemetry before
|
|
11
|
+
* writing to Sanity (process memory → Sanity write boundary, T-06-04).
|
|
12
|
+
* - Any future Sanity-side reader of `summary.synthesis.diagnosis.*`
|
|
13
|
+
* (Sanity Content Lake → eval process boundary, T-06-04).
|
|
14
|
+
*
|
|
15
|
+
* Security constraints:
|
|
16
|
+
* - No `.passthrough()` — schema is closed to prevent PII leakage from
|
|
17
|
+
* card body text into the telemetry shape (T-06-05).
|
|
18
|
+
* - Satisfies clause is load-bearing (T-06-06); no exemption marker.
|
|
19
|
+
*
|
|
20
|
+
* @see packages/core/src/types/synthesis-telemetry.ts — independently authored domain types
|
|
21
|
+
* @see docs/decisions/D0045-type-architecture-and-contract-enforcement.md
|
|
22
|
+
* @see .planning/phases/06-post-run-integration-cost-telemetry/06-CONTEXT.md §D6-09
|
|
23
|
+
*/
|
|
24
|
+
import { z } from "zod";
|
|
25
|
+
export declare const SynthesisCostTelemetrySchema: z.ZodObject<{
|
|
26
|
+
cost: z.ZodNumber;
|
|
27
|
+
parseFailureCount: z.ZodNumber;
|
|
28
|
+
parseFailureRate: z.ZodNumber;
|
|
29
|
+
perCard: z.ZodArray<z.ZodObject<{
|
|
30
|
+
cardType: z.ZodEnum<{
|
|
31
|
+
"area-summary": "area-summary";
|
|
32
|
+
"failure-mode-summary": "failure-mode-summary";
|
|
33
|
+
"no-issues": "no-issues";
|
|
34
|
+
"top-recommendations": "top-recommendations";
|
|
35
|
+
"weakest-area": "weakest-area";
|
|
36
|
+
"low-confidence-attribution": "low-confidence-attribution";
|
|
37
|
+
"doc-attribution-spotlight": "doc-attribution-spotlight";
|
|
38
|
+
"regression-vs-baseline": "regression-vs-baseline";
|
|
39
|
+
}>;
|
|
40
|
+
cost: z.ZodOptional<z.ZodNumber>;
|
|
41
|
+
parseFailed: z.ZodBoolean;
|
|
42
|
+
latencyMs: z.ZodOptional<z.ZodNumber>;
|
|
43
|
+
tokenInput: z.ZodOptional<z.ZodNumber>;
|
|
44
|
+
tokenOutput: z.ZodOptional<z.ZodNumber>;
|
|
45
|
+
cardVersion: z.ZodString;
|
|
46
|
+
generatedAt: z.ZodString;
|
|
47
|
+
}, z.core.$strip>>;
|
|
48
|
+
}, z.core.$strip>;
|
|
49
|
+
export type { SynthesisCostTelemetry, SynthesisPerCardTelemetry, } from "../../_vendor/ailf-core/index.d.ts";
|
|
@@ -0,0 +1,55 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Zod adapter schema for SynthesisCostTelemetry at the trust boundary.
|
|
3
|
+
*
|
|
4
|
+
* This schema sits at `packages/eval/src/adapters/**` and is therefore
|
|
5
|
+
* scanned by `pnpm check-trust-boundary-satisfies` (D0045). The
|
|
6
|
+
* `satisfies z.ZodType<SynthesisCostTelemetry>` clause makes schema/type
|
|
7
|
+
* drift a build error, not a runtime bug.
|
|
8
|
+
*
|
|
9
|
+
* Used by:
|
|
10
|
+
* - Plan 06-04 `ReportStore.patchSynthesis` — validates telemetry before
|
|
11
|
+
* writing to Sanity (process memory → Sanity write boundary, T-06-04).
|
|
12
|
+
* - Any future Sanity-side reader of `summary.synthesis.diagnosis.*`
|
|
13
|
+
* (Sanity Content Lake → eval process boundary, T-06-04).
|
|
14
|
+
*
|
|
15
|
+
* Security constraints:
|
|
16
|
+
* - No `.passthrough()` — schema is closed to prevent PII leakage from
|
|
17
|
+
* card body text into the telemetry shape (T-06-05).
|
|
18
|
+
* - Satisfies clause is load-bearing (T-06-06); no exemption marker.
|
|
19
|
+
*
|
|
20
|
+
* @see packages/core/src/types/synthesis-telemetry.ts — independently authored domain types
|
|
21
|
+
* @see docs/decisions/D0045-type-architecture-and-contract-enforcement.md
|
|
22
|
+
* @see .planning/phases/06-post-run-integration-cost-telemetry/06-CONTEXT.md §D6-09
|
|
23
|
+
*/
|
|
24
|
+
import { z } from "zod";
|
|
25
|
+
/**
|
|
26
|
+
* Enum of all valid card types — mirrors `CardType` from diagnosis.ts.
|
|
27
|
+
* Using `z.enum()` (not `z.string()`) so the schema satisfies
|
|
28
|
+
* `z.ZodType<SynthesisPerCardTelemetry>` (which requires `cardType: CardType`).
|
|
29
|
+
*/
|
|
30
|
+
const CardTypeSchema = z.enum([
|
|
31
|
+
"area-summary",
|
|
32
|
+
"failure-mode-summary",
|
|
33
|
+
"no-issues",
|
|
34
|
+
"top-recommendations",
|
|
35
|
+
"weakest-area",
|
|
36
|
+
"low-confidence-attribution",
|
|
37
|
+
"doc-attribution-spotlight",
|
|
38
|
+
"regression-vs-baseline",
|
|
39
|
+
]);
|
|
40
|
+
const SynthesisPerCardSchema = z.object({
|
|
41
|
+
cardType: CardTypeSchema,
|
|
42
|
+
cost: z.number().nonnegative().optional(),
|
|
43
|
+
parseFailed: z.boolean(),
|
|
44
|
+
latencyMs: z.number().int().nonnegative().optional(),
|
|
45
|
+
tokenInput: z.number().int().nonnegative().optional(),
|
|
46
|
+
tokenOutput: z.number().int().nonnegative().optional(),
|
|
47
|
+
cardVersion: z.string(),
|
|
48
|
+
generatedAt: z.string().datetime({ offset: false }), // ISO 8601 UTC required
|
|
49
|
+
});
|
|
50
|
+
export const SynthesisCostTelemetrySchema = z.object({
|
|
51
|
+
cost: z.number().nonnegative(),
|
|
52
|
+
parseFailureCount: z.number().int().nonnegative(),
|
|
53
|
+
parseFailureRate: z.number().min(0).max(1),
|
|
54
|
+
perCard: z.array(SynthesisPerCardSchema),
|
|
55
|
+
});
|
|
@@ -286,16 +286,21 @@ function mapAssertions(raw) {
|
|
|
286
286
|
.map((c) => ({ id: c.id, text: c.text })),
|
|
287
287
|
template: a.template,
|
|
288
288
|
type: "llm-rubric",
|
|
289
|
-
|
|
289
|
+
// Use `!= null` (loose) so we drop both `undefined` AND `null`.
|
|
290
|
+
// GROQ projects missing scalar fields as `null`, but the domain
|
|
291
|
+
// schema's `z.number().optional()` accepts `T | undefined`, not
|
|
292
|
+
// `T | null` — a strict `!== undefined` check would forward
|
|
293
|
+
// `weight: null` and trigger Zod's "Invalid input" on assertions.
|
|
294
|
+
...(a.weight != null ? { weight: a.weight } : {}),
|
|
290
295
|
};
|
|
291
296
|
}
|
|
292
|
-
// Value-based assertion
|
|
297
|
+
// Value-based assertion — same null-vs-undefined hazard as above.
|
|
293
298
|
const result = { type: a.type };
|
|
294
|
-
if (a.value !== undefined)
|
|
299
|
+
if (a.value != null)
|
|
295
300
|
result.value = a.value;
|
|
296
|
-
if (a.threshold !== undefined)
|
|
301
|
+
if (a.threshold != null)
|
|
297
302
|
result.threshold = a.threshold;
|
|
298
|
-
if (a.weight !== undefined)
|
|
303
|
+
if (a.weight != null)
|
|
299
304
|
result.weight = a.weight;
|
|
300
305
|
return result;
|
|
301
306
|
});
|
|
@@ -1561,6 +1561,13 @@ export declare const RepoConfigSchema: z.ZodObject<{
|
|
|
1561
1561
|
dir: z.ZodOptional<z.ZodString>;
|
|
1562
1562
|
exclude: z.ZodOptional<z.ZodArray<z.ZodString>>;
|
|
1563
1563
|
}, z.core.$strip>>;
|
|
1564
|
+
summary: z.ZodOptional<z.ZodObject<{
|
|
1565
|
+
onRun: z.ZodOptional<z.ZodEnum<{
|
|
1566
|
+
never: "never";
|
|
1567
|
+
always: "always";
|
|
1568
|
+
auto: "auto";
|
|
1569
|
+
}>>;
|
|
1570
|
+
}, z.core.$strip>>;
|
|
1564
1571
|
taskSource: z.ZodOptional<z.ZodObject<{
|
|
1565
1572
|
type: z.ZodOptional<z.ZodEnum<{
|
|
1566
1573
|
"content-lake": "content-lake";
|
|
@@ -646,6 +646,15 @@ const OwnerConfigSchema = z
|
|
|
646
646
|
individual: z.string().min(1).optional(),
|
|
647
647
|
})
|
|
648
648
|
.optional();
|
|
649
|
+
/**
|
|
650
|
+
* Post-run diagnosis summary policy (Phase 6 / DIAG-06).
|
|
651
|
+
* Sits in the W0077 Phase-6a auto-load pathway.
|
|
652
|
+
*/
|
|
653
|
+
const SummaryConfigSchema = z
|
|
654
|
+
.object({
|
|
655
|
+
onRun: z.enum(["auto", "always", "never"]).optional(),
|
|
656
|
+
})
|
|
657
|
+
.optional();
|
|
649
658
|
/**
|
|
650
659
|
* Agentic-mode configuration (W0077 Phase 6f). Replaces the retired
|
|
651
660
|
* `--header` and `--allowed-origin` CLI flags. `headers` is a key/value
|
|
@@ -694,6 +703,7 @@ export const RepoConfigSchema = z.object({
|
|
|
694
703
|
owner: OwnerConfigSchema,
|
|
695
704
|
agentic: AgenticConfigSchema,
|
|
696
705
|
artifacts: ArtifactsConfigSchema,
|
|
706
|
+
summary: SummaryConfigSchema,
|
|
697
707
|
taskSource: TaskSourceConfigSchema,
|
|
698
708
|
triggers: z
|
|
699
709
|
.object({
|
package/dist/cli-program.js
CHANGED
|
@@ -32,6 +32,7 @@ import { createFetchDocsCommand } from "./commands/fetch-docs.js";
|
|
|
32
32
|
import { createGenerateConfigsCommand } from "./commands/generate-configs.js";
|
|
33
33
|
import { createGraderCommand } from "./commands/grader/index.js";
|
|
34
34
|
import { createInitCommand } from "./commands/init.js";
|
|
35
|
+
import { createInterpretCommand } from "./commands/interpret.js";
|
|
35
36
|
import { createInteractiveCommand } from "./commands/interactive.js";
|
|
36
37
|
import { createLookupDocCommand } from "./commands/lookup-doc.js";
|
|
37
38
|
import { createMeasureRetrievalCommand } from "./commands/measure-retrieval.js";
|
|
@@ -110,6 +111,8 @@ export function buildCliProgram(opts) {
|
|
|
110
111
|
.addCommand(createWeeklyDigestCommand())
|
|
111
112
|
.addCommand(createCheckStalenessCommand());
|
|
112
113
|
program.addCommand(reportCommand.helpGroup(CommandGroup.AnalysisReports));
|
|
114
|
+
// `ailf interpret <reportId>` — top-level (not nested under report) per AI-SPEC
|
|
115
|
+
program.addCommand(createInterpretCommand().helpGroup(CommandGroup.AnalysisReports));
|
|
113
116
|
// ── Grader Reliability ────────────────────────────────────────────────
|
|
114
117
|
program.addCommand(createGraderCommand().helpGroup(CommandGroup.GraderReliability));
|
|
115
118
|
// ── Setup & Configuration ─────────────────────────────────────────────
|
|
@@ -0,0 +1,70 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* interpret command — generate a Diagnosis for a Report.
|
|
3
|
+
*
|
|
4
|
+
* Wraps `getDiagnosisRunner(ctx)` from the composition root in a Commander
|
|
5
|
+
* command for consistent CLI integration. Closest analog: compare.ts.
|
|
6
|
+
*
|
|
7
|
+
* Entry points:
|
|
8
|
+
* ailf interpret <reportId> — one-line-per-card summary
|
|
9
|
+
* ailf interpret <reportId> --json — full Diagnosis JSON
|
|
10
|
+
* ailf interpret latest — most recent report
|
|
11
|
+
* ailf interpret <id> --compare <ref> — DIAG-05 regression comparison
|
|
12
|
+
* ailf interpret <id> --refresh — bypass version-keyed cache
|
|
13
|
+
*
|
|
14
|
+
* @see packages/eval/src/commands/compare.ts — CLI factory analog
|
|
15
|
+
* @see packages/eval/src/composition-root.ts — getDiagnosisRunner
|
|
16
|
+
* @see .planning/phases/05-diagnosis-engine-cli-llm-cards/05-AI-SPEC.md §6
|
|
17
|
+
*/
|
|
18
|
+
import { Command } from "commander";
|
|
19
|
+
import { type DiagnosisCard, type DiagnosisRunner, type VersionedInputs } from "../_vendor/ailf-core/index.d.ts";
|
|
20
|
+
interface MinimalReportStore {
|
|
21
|
+
read(id: string): Promise<unknown | null>;
|
|
22
|
+
latest(): Promise<unknown | null>;
|
|
23
|
+
}
|
|
24
|
+
export interface InterpretCommandOptions {
|
|
25
|
+
/**
|
|
26
|
+
* Override the runner factory for tests. When omitted, the command
|
|
27
|
+
* imports `getDiagnosisRunner` from the composition root at action time.
|
|
28
|
+
*/
|
|
29
|
+
readonly runnerFactory?: (ctx: unknown) => DiagnosisRunner;
|
|
30
|
+
/**
|
|
31
|
+
* Override the store factory for tests. When omitted, the command
|
|
32
|
+
* creates the app context and uses `ctx.reportStore` at action time.
|
|
33
|
+
*/
|
|
34
|
+
readonly storeFactory?: () => MinimalReportStore | null;
|
|
35
|
+
/**
|
|
36
|
+
* Override the versions resolver for tests. Receives the stored report
|
|
37
|
+
* record and returns the `VersionedInputs` needed by the runner.
|
|
38
|
+
* When omitted, the command derives versions from the report's metadata.
|
|
39
|
+
*/
|
|
40
|
+
readonly versionsFromReport?: (report: unknown) => VersionedInputs;
|
|
41
|
+
}
|
|
42
|
+
/**
|
|
43
|
+
* Visual status markers — locked visual contract per plan Test 7:
|
|
44
|
+
* ready: "✓", degraded: "⚠", missing: "—"
|
|
45
|
+
*
|
|
46
|
+
* Exported so Plan 06-04's post-run hook imports the SAME object and
|
|
47
|
+
* D6-04's "single formatter, single visual contract" is physically
|
|
48
|
+
* enforced — no copy/paste drift possible.
|
|
49
|
+
*/
|
|
50
|
+
export declare const STATUS_ICONS: Record<DiagnosisCard["status"], string>;
|
|
51
|
+
/**
|
|
52
|
+
* Format a single card as a one-line summary string.
|
|
53
|
+
*
|
|
54
|
+
* Format: `<icon> <cardType>: <summary>`
|
|
55
|
+
* Per AI-SPEC §6: distinct icons for ready / degraded / missing.
|
|
56
|
+
*
|
|
57
|
+
* Exported so Plan 06-04's post-run hook imports the SAME function and
|
|
58
|
+
* D6-04's "single formatter, single visual contract" is physically
|
|
59
|
+
* enforced — no copy/paste drift possible.
|
|
60
|
+
*/
|
|
61
|
+
export declare function formatCardSummaryLine(card: DiagnosisCard): string;
|
|
62
|
+
/**
|
|
63
|
+
* Create the `ailf interpret <reportId>` Commander command.
|
|
64
|
+
*
|
|
65
|
+
* Accepts optional `InterpretCommandOptions` for testability — tests can
|
|
66
|
+
* inject a fake runner factory and store factory without touching module
|
|
67
|
+
* mocks (preferred per testing.md).
|
|
68
|
+
*/
|
|
69
|
+
export declare function createInterpretCommand(options?: InterpretCommandOptions): Command;
|
|
70
|
+
export {};
|