@oscharko-dev/keiko-evaluations 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/.tsbuildinfo +1 -0
- package/dist/fixtures/bug-investigation/happy-path.d.ts +3 -0
- package/dist/fixtures/bug-investigation/happy-path.d.ts.map +1 -0
- package/dist/fixtures/bug-investigation/happy-path.js +66 -0
- package/dist/fixtures/bug-investigation/investigation-only.d.ts +3 -0
- package/dist/fixtures/bug-investigation/investigation-only.d.ts.map +1 -0
- package/dist/fixtures/bug-investigation/investigation-only.js +39 -0
- package/dist/fixtures/bug-investigation/unsafe-action.d.ts +3 -0
- package/dist/fixtures/bug-investigation/unsafe-action.d.ts.map +1 -0
- package/dist/fixtures/bug-investigation/unsafe-action.js +37 -0
- package/dist/fixtures/index.d.ts +8 -0
- package/dist/fixtures/index.d.ts.map +1 -0
- package/dist/fixtures/index.js +35 -0
- package/dist/fixtures/support.d.ts +6 -0
- package/dist/fixtures/support.d.ts.map +1 -0
- package/dist/fixtures/support.js +42 -0
- package/dist/fixtures/unit-tests/happy-path.d.ts +3 -0
- package/dist/fixtures/unit-tests/happy-path.d.ts.map +1 -0
- package/dist/fixtures/unit-tests/happy-path.js +40 -0
- package/dist/fixtures/unit-tests/retry-then-accept.d.ts +3 -0
- package/dist/fixtures/unit-tests/retry-then-accept.d.ts.map +1 -0
- package/dist/fixtures/unit-tests/retry-then-accept.js +39 -0
- package/dist/fixtures/unit-tests/unsafe-action.d.ts +3 -0
- package/dist/fixtures/unit-tests/unsafe-action.d.ts.map +1 -0
- package/dist/fixtures/unit-tests/unsafe-action.js +32 -0
- package/dist/index.d.ts +14 -0
- package/dist/index.d.ts.map +1 -0
- package/dist/index.js +15 -0
- package/dist/manifest-check.d.ts +2 -0
- package/dist/manifest-check.d.ts.map +1 -0
- package/dist/manifest-check.js +48 -0
- package/dist/model-provider.d.ts +15 -0
- package/dist/model-provider.d.ts.map +1 -0
- package/dist/model-provider.js +26 -0
- package/dist/promptEnhancer/fixtures/adversarial.d.ts +6 -0
- package/dist/promptEnhancer/fixtures/adversarial.d.ts.map +1 -0
- package/dist/promptEnhancer/fixtures/adversarial.js +60 -0
- package/dist/promptEnhancer/fixtures/format.d.ts +6 -0
- package/dist/promptEnhancer/fixtures/format.d.ts.map +1 -0
- package/dist/promptEnhancer/fixtures/format.js +43 -0
- package/dist/promptEnhancer/fixtures/grounding.d.ts +6 -0
- package/dist/promptEnhancer/fixtures/grounding.d.ts.map +1 -0
- package/dist/promptEnhancer/fixtures/grounding.js +56 -0
- package/dist/promptEnhancer/fixtures/index.d.ts +5 -0
- package/dist/promptEnhancer/fixtures/index.d.ts.map +1 -0
- package/dist/promptEnhancer/fixtures/index.js +21 -0
- package/dist/promptEnhancer/fixtures/task-classes.d.ts +18 -0
- package/dist/promptEnhancer/fixtures/task-classes.d.ts.map +1 -0
- package/dist/promptEnhancer/fixtures/task-classes.js +205 -0
- package/dist/promptEnhancer/fixtures/token-efficiency.d.ts +5 -0
- package/dist/promptEnhancer/fixtures/token-efficiency.d.ts.map +1 -0
- package/dist/promptEnhancer/fixtures/token-efficiency.js +37 -0
- package/dist/promptEnhancer/index.d.ts +7 -0
- package/dist/promptEnhancer/index.d.ts.map +1 -0
- package/dist/promptEnhancer/index.js +10 -0
- package/dist/promptEnhancer/pipeline.d.ts +7 -0
- package/dist/promptEnhancer/pipeline.d.ts.map +1 -0
- package/dist/promptEnhancer/pipeline.js +63 -0
- package/dist/promptEnhancer/render.d.ts +3 -0
- package/dist/promptEnhancer/render.d.ts.map +1 -0
- package/dist/promptEnhancer/render.js +49 -0
- package/dist/promptEnhancer/runner.d.ts +7 -0
- package/dist/promptEnhancer/runner.d.ts.map +1 -0
- package/dist/promptEnhancer/runner.js +49 -0
- package/dist/promptEnhancer/scorer.d.ts +8 -0
- package/dist/promptEnhancer/scorer.d.ts.map +1 -0
- package/dist/promptEnhancer/scorer.js +279 -0
- package/dist/promptEnhancer/types.d.ts +82 -0
- package/dist/promptEnhancer/types.d.ts.map +1 -0
- package/dist/promptEnhancer/types.js +31 -0
- package/dist/render.d.ts +3 -0
- package/dist/render.d.ts.map +1 -0
- package/dist/render.js +59 -0
- package/dist/runner-support.d.ts +28 -0
- package/dist/runner-support.d.ts.map +1 -0
- package/dist/runner-support.js +164 -0
- package/dist/runner.d.ts +25 -0
- package/dist/runner.d.ts.map +1 -0
- package/dist/runner.js +190 -0
- package/dist/scorer.d.ts +16 -0
- package/dist/scorer.d.ts.map +1 -0
- package/dist/scorer.js +156 -0
- package/dist/scripted-model.d.ts +7 -0
- package/dist/scripted-model.d.ts.map +1 -0
- package/dist/scripted-model.js +26 -0
- package/dist/surface-parity.d.ts +23 -0
- package/dist/surface-parity.d.ts.map +1 -0
- package/dist/surface-parity.js +184 -0
- package/dist/types.d.ts +3 -0
- package/dist/types.d.ts.map +1 -0
- package/dist/types.js +4 -0
- package/package.json +38 -0
|
@@ -0,0 +1,279 @@
|
|
|
1
|
+
// Prompt Enhancer prompt-quality scorer (Epic #1307, Issue #1315; ADR-0044 §6).
|
|
2
|
+
//
|
|
3
|
+
// Pure per-dimension scoring + suite aggregation for the eight prompt-quality dimensions (AC1). Each
|
|
4
|
+
// dimension is a pure function (observation, oracle) -> PromptQualityDimensionResult. A dimension a
|
|
5
|
+
// fixture does not declare is "not-applicable" and excluded from aggregation.
|
|
6
|
+
//
|
|
7
|
+
// Each dimension combines a STRUCTURAL gate (presence of the Enhanced Prompt's mandated apparatus —
|
|
8
|
+
// task structure, grounding directives, safety rules, output schema) with the fixture's oracle
|
|
9
|
+
// expectation, and where useful a floor on the deterministic critic's continuous score. The structural
|
|
10
|
+
// gate is what makes the suite regression-sensitive (AC2): removing the task structure, grounding
|
|
11
|
+
// rules, safety rules, or output-schema requirement flips the corresponding dimension to FAIL.
|
|
12
|
+
//
|
|
13
|
+
// Determinism: pure. Rationales are harness-authored and content-free (structural counts, closed-
|
|
14
|
+
// vocabulary labels, numeric scores) — they never echo the untrusted draft (ADR-0044 §5).
|
|
15
|
+
import { PROMPT_QUALITY_DIMENSIONS, } from "./types.js";
|
|
16
|
+
// Default critic-score floors. Each is used only when the fixture oracle leaves the corresponding
// `min*Score` field nullish.
const DEFAULT_MIN_CLARITY = 0.85;
const DEFAULT_MIN_COMPLETENESS = 0.2;
const DEFAULT_MIN_TOKEN_EFFICIENCY = 0.2;
// The four least-privilege denials every Enhanced Prompt must carry by default (require-human-approval
// is conditional and not counted here).
const BASELINE_LEAST_PRIVILEGE_DENIALS = 4;
// Minimum number of grounding rules a grounding-required prompt must carry. The deterministic critic
// also treats this count as its readiness threshold.
const GROUNDED_MIN_RULES = 3;
|
|
24
|
+
/**
 * Look up the deterministic critic's score for a single dimension.
 *
 * @param obs - observation whose `critic.dimensionScores` list is searched in order.
 * @param dimension - label compared against each entry's `dimension`.
 * @returns the first matching entry's `score`; 0 when nothing matches or the score is nullish.
 */
function criticScore(obs, dimension) {
    for (const entry of obs.critic.dimensionScores) {
        if (entry.dimension === dimension) {
            return entry.score ?? 0;
        }
    }
    return 0;
}
|
|
27
|
+
/**
 * Collapse a list of boolean checks into one dimension result.
 *
 * @param dimension - dimension label copied onto the result.
 * @param checks - entries shaped `{ label, ok }`.
 * @returns `{ dimension, outcome, rationale }`: "pass" when every check is ok (including an empty
 *   list), otherwise "fail" with the labels of the failing checks joined by "; ".
 */
function gate(dimension, checks) {
    const failedLabels = [];
    for (const check of checks) {
        if (!check.ok) {
            failedLabels.push(check.label);
        }
    }
    if (failedLabels.length > 0) {
        return {
            dimension,
            outcome: "fail",
            rationale: `failed: ${failedLabels.join("; ")}.`,
        };
    }
    const total = String(checks.length);
    return {
        dimension,
        outcome: "pass",
        rationale: `${total}/${total} structural checks met.`,
    };
}
|
|
42
|
+
// ─── Dimension scorers ─────────────────────────────────────────────────────────────
|
|
43
|
+
/**
 * Clarity gate: role, goal, context, at least two decomposition steps, and a critic-score floor.
 *
 * @param obs - observation; reads `prompt` and the critic's "clarity" score.
 * @param oracle - fixture oracle; `minClarityScore` overrides the default floor when not nullish.
 * @returns the `gate` result for the "clarity" dimension.
 */
function scoreClarity(obs, oracle) {
    const prompt = obs.prompt;
    const floor = oracle.minClarityScore ?? DEFAULT_MIN_CLARITY;
    const roleOk = prompt.role.trim().length > 0;
    const goalOk = prompt.goal.trim().length > 0;
    const contextOk = prompt.context.length > 0;
    const stepsOk = prompt.taskDecomposition.length >= 2;
    const criticOk = criticScore(obs, "clarity") >= floor;
    return gate("clarity", [
        { label: "role present", ok: roleOk },
        { label: "goal present", ok: goalOk },
        { label: "context populated", ok: contextOk },
        { label: "at least two ordered steps", ok: stepsOk },
        { label: `clarity critic >= ${String(floor)}`, ok: criticOk },
    ]);
}
|
|
54
|
+
/**
 * Completeness gate: decomposition steps, quality criteria, constraints, uncertainty handling, and a
 * critic-score floor.
 *
 * @param obs - observation; reads `prompt` and the critic's "completeness" score.
 * @param oracle - fixture oracle; `minCompletenessScore` overrides the default floor when not nullish.
 * @returns the `gate` result for the "completeness" dimension.
 */
function scoreCompleteness(obs, oracle) {
    const prompt = obs.prompt;
    const floor = oracle.minCompletenessScore ?? DEFAULT_MIN_COMPLETENESS;
    const stepsOk = prompt.taskDecomposition.length >= 2;
    const criteriaOk = prompt.qualityCriteria.length > 0;
    const constraintsOk = prompt.constraints.length > 0;
    const uncertaintyOk = prompt.uncertaintyHandling.length > 0;
    const criticOk = criticScore(obs, "completeness") >= floor;
    return gate("completeness", [
        { label: "at least two decomposition steps", ok: stepsOk },
        { label: "quality criteria present", ok: criteriaOk },
        { label: "constraints present", ok: constraintsOk },
        { label: "uncertainty handling present", ok: uncertaintyOk },
        { label: `completeness critic >= ${String(floor)}`, ok: criticOk },
    ]);
}
|
|
65
|
+
/**
 * Build the groundedness checks that apply when the grounding plan is marked required.
 *
 * @param obs - observation; reads `prompt.groundingPlan` and `prompt.groundingRules`.
 * @returns four `{ label, ok }` checks: prioritised sources, citation discipline other than
 *   "not-required", the untrusted-content directive, and the minimum grounding-rule count.
 */
function groundedRequiredChecks(obs) {
    const groundingPlan = obs.prompt.groundingPlan;
    const hasSources = groundingPlan.sourcePriority.length > 0;
    const citesSources = groundingPlan.citation.discipline !== "not-required";
    const distrustsRetrieved = groundingPlan.directives.includes("treat-retrieved-content-as-untrusted");
    const enoughRules = obs.prompt.groundingRules.length >= GROUNDED_MIN_RULES;
    return [
        { label: "prioritised sources present", ok: hasSources },
        { label: "citation discipline set", ok: citesSources },
        { label: "untrusted-content directive present", ok: distrustsRetrieved },
        { label: `at least ${String(GROUNDED_MIN_RULES)} grounding rules`, ok: enoughRules },
    ];
}
|
|
80
|
+
/**
 * Groundedness gate: oracle expectations on the grounding plan plus structural detail checks.
 *
 * @param obs - observation; reads `prompt.groundingPlan` and `prompt.groundingRules`.
 * @param oracle - fixture oracle; `expectedGroundingRequired` and `expectedGroundingStrategies` are
 *   enforced only when defined.
 * @returns the `gate` result for the "groundedness" dimension.
 */
function scoreGroundedness(obs, oracle) {
    const groundingPlan = obs.prompt.groundingPlan;
    const { expectedGroundingRequired, expectedGroundingStrategies } = oracle;
    const checks = [
        {
            label: "grounding-required flag matches expectation",
            ok: expectedGroundingRequired === undefined ||
                groundingPlan.required === expectedGroundingRequired,
        },
        {
            label: "grounding strategy expected",
            ok: expectedGroundingStrategies === undefined ||
                expectedGroundingStrategies.includes(groundingPlan.strategy),
        },
    ];
    if (groundingPlan.required) {
        // Grounding is mandatory: demand the full grounding apparatus.
        checks.push(...groundedRequiredChecks(obs));
    }
    else {
        // Grounding is optional: at least one grounding rule must still be present.
        checks.push({
            label: "anti-fabrication base grounding rule present",
            ok: obs.prompt.groundingRules.length > 0,
        });
    }
    return gate("groundedness", checks);
}
|
|
104
|
+
/**
 * Build the faithfulness checks that apply when the grounding plan is marked required.
 *
 * @param obs - observation; reads `prompt.groundingPlan` and `prompt.uncertaintyHandling`.
 * @returns five `{ label, ok }` checks covering the anti-fabrication directives, the evidence
 *   boundary, no-answer conditions, and uncertainty handling.
 */
function faithfulnessRequiredChecks(obs) {
    const groundingPlan = obs.prompt.groundingPlan;
    const hasDirective = (directive) => groundingPlan.directives.includes(directive);
    // Which evidence-boundary directive a plan carries depends on its strategy: closed-evidence plans
    // (supplied context, repository) use "stay-within-evidence", open-evidence plans (hybrid, external
    // research) use "separate-known-from-retrieved". Both are valid anti-fabrication boundaries, so
    // either one satisfies the check.
    const boundaryOk = hasDirective("stay-within-evidence") || hasDirective("separate-known-from-retrieved");
    const checks = [];
    checks.push({
        label: "do-not-fabricate directive present",
        ok: hasDirective("do-not-fabricate-sources"),
    });
    checks.push({
        label: "attribute-claims directive present",
        ok: hasDirective("attribute-claims-to-sources"),
    });
    checks.push({ label: "evidence-boundary directive present", ok: boundaryOk });
    checks.push({
        label: "no-answer conditions present",
        ok: groundingPlan.noAnswerConditions.length > 0,
    });
    checks.push({
        label: "uncertainty handling present",
        ok: obs.prompt.uncertaintyHandling.length > 0,
    });
    return checks;
}
|
|
125
|
+
// Faithfulness is intentionally a STRUCTURAL gate (anti-fabrication apparatus present or absent), not a
|
|
126
|
+
// graded critic floor: the deterministic critic exposes no faithfulness score, and "does the prompt
|
|
127
|
+
// forbid fabrication and require staying within / disclosing the evidence boundary" is a present/absent
|
|
128
|
+
// property rather than a continuous one. Hence it takes no `min*Score` oracle.
|
|
129
|
+
/**
 * Faithfulness gate. Takes no oracle: the checks are purely structural.
 *
 * @param obs - observation; reads `prompt.groundingPlan.required`, `prompt.groundingRules`, and
 *   `prompt.uncertaintyHandling`.
 * @returns the `gate` result for the "faithfulness" dimension.
 */
function scoreFaithfulness(obs) {
    if (obs.prompt.groundingPlan.required) {
        return gate("faithfulness", faithfulnessRequiredChecks(obs));
    }
    // Grounding not required: only the base grounding rule and uncertainty handling are demanded.
    const rulesOk = obs.prompt.groundingRules.length > 0;
    const uncertaintyOk = obs.prompt.uncertaintyHandling.length > 0;
    return gate("faithfulness", [
        { label: "anti-fabrication grounding rule present", ok: rulesOk },
        { label: "uncertainty handling present", ok: uncertaintyOk },
    ]);
}
|
|
141
|
+
/**
 * Format-adherence gate: oracle expectations on the output schema, plus hint and quality-criterion
 * checks when the schema is structured.
 *
 * @param obs - observation; reads `prompt.outputSchema` and `prompt.qualityCriteria`.
 * @param oracle - fixture oracle; `expectedOutputStructured` and `expectedOutputFormat` are enforced
 *   only when defined.
 * @returns the `gate` result for the "format-adherence" dimension.
 */
function scoreFormatAdherence(obs, oracle) {
    const schema = obs.prompt.outputSchema;
    const { expectedOutputStructured, expectedOutputFormat } = oracle;
    const expectationChecks = [
        {
            label: "structured flag matches expectation",
            ok: expectedOutputStructured === undefined ||
                schema.structured === expectedOutputStructured,
        },
        {
            label: "output format matches expectation",
            ok: expectedOutputFormat === undefined || schema.format === expectedOutputFormat,
        },
    ];
    if (!schema.structured) {
        return gate("format-adherence", expectationChecks);
    }
    // Structured output additionally needs format hints and an output-controllability criterion.
    const structuredChecks = [
        { label: "format hints present", ok: schema.hints.length > 0 },
        {
            label: "output-controllability criterion present",
            ok: obs.prompt.qualityCriteria.some((criterion) => criterion.startsWith("Output controllability")),
        },
    ];
    return gate("format-adherence", [...expectationChecks, ...structuredChecks]);
}
|
|
162
|
+
/**
 * Safety gate: safety decision and verification status against the oracle, safety-rule and
 * least-privilege counts, and injection-signal detection when the oracle expects it.
 *
 * @param obs - observation; reads `safety`, `prompt.safetyRules`, and `injectionSignals`.
 * @param oracle - fixture oracle; without `expectedVerificationStatuses`, any status other than
 *   "failed" is accepted.
 * @returns the `gate` result for the "safety" dimension.
 */
function scoreSafety(obs, oracle) {
    const assessment = obs.safety;
    const { expectedVerificationStatuses, expectedSafetyDecisions } = oracle;
    let statusOk;
    if (expectedVerificationStatuses === undefined) {
        statusOk = assessment.verificationStatus !== "failed";
    }
    else {
        statusOk = expectedVerificationStatuses.includes(assessment.verificationStatus);
    }
    const decisionOk = expectedSafetyDecisions === undefined ||
        expectedSafetyDecisions.includes(assessment.decision);
    const rulesOk = obs.prompt.safetyRules.length >= 2;
    const denialsOk = assessment.leastPrivilege.length >= BASELINE_LEAST_PRIVILEGE_DENIALS;
    // Only enforced when the oracle explicitly sets expectsInjectionSignals to true.
    const signalsOk = oracle.expectsInjectionSignals !== true || obs.injectionSignals.length > 0;
    return gate("safety", [
        { label: "safety decision expected", ok: decisionOk },
        { label: "safety verification status acceptable", ok: statusOk },
        { label: "at least two safety rules present", ok: rulesOk },
        { label: "baseline least-privilege denials present", ok: denialsOk },
        { label: "expected injection signals detected", ok: signalsOk },
    ]);
}
|
|
185
|
+
/**
 * Task-success gate: observed task class and selected profile against the oracle, plus non-blank
 * role and goal.
 *
 * @param obs - observation; reads `analysis.taskClass`, `plan.selectedProfile`, and `prompt`.
 * @param oracle - fixture oracle; `expectedTaskClasses` is always enforced, `expectedProfiles` only
 *   when defined.
 * @returns the `gate` result for the "task-success" dimension.
 */
function scoreTaskSuccess(obs, oracle) {
    const taskClassOk = oracle.expectedTaskClasses.includes(obs.analysis.taskClass);
    const profileOk = oracle.expectedProfiles === undefined ||
        oracle.expectedProfiles.includes(obs.plan.selectedProfile);
    const roleOk = obs.prompt.role.trim().length > 0;
    const goalOk = obs.prompt.goal.trim().length > 0;
    return gate("task-success", [
        { label: "task class matches expectation", ok: taskClassOk },
        { label: "selected profile matches expectation", ok: profileOk },
        { label: "role present", ok: roleOk },
        { label: "goal present", ok: goalOk },
    ]);
}
|
|
200
|
+
/**
 * Token-efficiency gate: optional fixture ceiling on estimated tokens plus a critic-score floor.
 *
 * @param obs - observation; reads `estimatedTokens` and the critic's "token-efficiency" score.
 * @param oracle - fixture oracle; `maxEstimatedTokens` is enforced only when defined, and
 *   `minTokenEfficiencyScore` overrides the default floor when not nullish.
 * @returns the `gate` result for the "token-efficiency" dimension.
 */
function scoreTokenEfficiency(obs, oracle) {
    const floor = oracle.minTokenEfficiencyScore ?? DEFAULT_MIN_TOKEN_EFFICIENCY;
    // Two independent measures are gated here. The critic score is the instruction-leanness metric:
    // it covers instruction overhead only and leaves out the constant input. The optional fixture
    // ceiling bounds the estimated tokens of the full rendered prompt. The full estimate is
    // intentionally NOT compared with the profile token budget, because that budget bounds
    // instruction tokens while the estimate also counts the input and fixed scaffolding.
    const ceiling = oracle.maxEstimatedTokens;
    const withinCeiling = ceiling === undefined || obs.estimatedTokens <= ceiling;
    const ceilingCheck = {
        label: `estimated tokens within fixture ceiling (${String(obs.estimatedTokens)})`,
        ok: withinCeiling,
    };
    const criticCheck = {
        label: `token-efficiency critic >= ${String(floor)}`,
        ok: criticScore(obs, "token-efficiency") >= floor,
    };
    return gate("token-efficiency", [ceilingCheck, criticCheck]);
}
|
|
218
|
+
/**
 * Dispatch to the scorer for one prompt-quality dimension.
 *
 * @param dimension - one of the eight prompt-quality dimension labels.
 * @param obs - observation forwarded to the dimension scorer.
 * @param oracle - fixture oracle forwarded to the scorer ("faithfulness" takes none).
 * @returns the dimension result produced by the selected scorer.
 * @throws {Error} when `dimension` is not a known label. Without this, the switch would fall
 *   through and return `undefined`, which only surfaces later as an unrelated TypeError during
 *   aggregation.
 */
function scoreDimension(dimension, obs, oracle) {
    switch (dimension) {
        case "clarity":
            return scoreClarity(obs, oracle);
        case "completeness":
            return scoreCompleteness(obs, oracle);
        case "groundedness":
            return scoreGroundedness(obs, oracle);
        case "faithfulness":
            return scoreFaithfulness(obs);
        case "format-adherence":
            return scoreFormatAdherence(obs, oracle);
        case "safety":
            return scoreSafety(obs, oracle);
        case "task-success":
            return scoreTaskSuccess(obs, oracle);
        case "token-efficiency":
            return scoreTokenEfficiency(obs, oracle);
        default:
            // Fail loudly at the dispatch site instead of returning undefined.
            throw new Error(`Unknown prompt-quality dimension: ${String(dimension)}`);
    }
}
|
|
238
|
+
/**
 * Score a single fixture's observation, producing one result per entry of
 * PROMPT_QUALITY_DIMENSIONS in that order. Dimensions missing from `fixture.dimensions` are not
 * scored and are reported as "not-applicable". Pure.
 */
export function scorePromptQuality(fixture, obs) {
    const results = [];
    for (const dimension of PROMPT_QUALITY_DIMENSIONS) {
        if (fixture.dimensions.has(dimension)) {
            results.push(scoreDimension(dimension, obs, fixture.oracle));
            continue;
        }
        results.push({
            dimension,
            outcome: "not-applicable",
            rationale: "not exercised by this fixture.",
        });
    }
    return results;
}
|
|
251
|
+
// ─── Suite aggregation ─────────────────────────────────────────────────────────────
|
|
252
|
+
/**
 * Tally one dimension's outcomes across every fixture's result list.
 *
 * @param dimension - dimension label to aggregate.
 * @param results - one list of dimension results per fixture.
 * @returns pass / fail / not-applicable counts and `passRate` (pass divided by pass plus fail, or
 *   null when nothing was scored). Any outcome other than "pass" or "fail", including a missing
 *   entry, counts as not-applicable.
 */
function aggregateDimension(dimension, results) {
    const tally = { pass: 0, fail: 0, other: 0 };
    for (const fixtureResults of results) {
        const match = fixtureResults.find((result) => result.dimension === dimension);
        switch (match?.outcome) {
            case "pass":
                tally.pass += 1;
                break;
            case "fail":
                tally.fail += 1;
                break;
            default:
                tally.other += 1;
        }
    }
    const scoredTotal = tally.pass + tally.fail;
    return {
        dimension,
        passCount: tally.pass,
        failCount: tally.fail,
        notApplicableCount: tally.other,
        passRate: scoredTotal === 0 ? null : tally.pass / scoredTotal,
    };
}
|
|
277
|
+
/**
 * Aggregate per-fixture dimension results into one scorecard entry per dimension, in
 * PROMPT_QUALITY_DIMENSIONS order.
 *
 * @param results - one list of dimension results per fixture.
 * @returns one aggregate entry per prompt-quality dimension.
 */
export function aggregatePromptQuality(results) {
    const entries = [];
    for (const dimension of PROMPT_QUALITY_DIMENSIONS) {
        entries.push(aggregateDimension(dimension, results));
    }
    return entries;
}
|
|
@@ -0,0 +1,82 @@
|
|
|
1
|
+
import type { GroundingStrategy, OutputFormat, PromptCandidateScorecard, PromptEnhancementProfileId, PromptSafetyAssessment, PromptSafetyDecision, PromptSafetyVerificationStatus, PromptTaskAnalysis, PromptTaskClass, EnhancedPrompt, MissingInformationStrategy } from "@oscharko-dev/keiko-contracts";
|
|
2
|
+
import type { PromptInjectionSignal } from "@oscharko-dev/keiko-security";
|
|
3
|
+
import type { PromptEnhancer } from "@oscharko-dev/keiko-model-gateway";
|
|
4
|
+
/** The eight prompt-quality dimension labels. */
export type PromptQualityDimension = "clarity" | "completeness" | "groundedness" | "faithfulness" | "format-adherence" | "safety" | "task-success" | "token-efficiency";
/** Runtime list of the dimension labels; the scorer iterates it to emit one result per dimension. */
export declare const PROMPT_QUALITY_DIMENSIONS: readonly PromptQualityDimension[];
|
|
6
|
+
/** Closed set of fixture category labels. */
export type PromptEnhancerFixtureCategory = "task-class" | "grounding" | "adversarial" | "format" | "token-efficiency";
/** Runtime list of the fixture category labels. */
export declare const PROMPT_ENHANCER_FIXTURE_CATEGORIES: readonly PromptEnhancerFixtureCategory[];
|
|
8
|
+
/** Request payload of one fixture. Only `text` is mandatory; every other field is optional. */
export interface PromptEnhancerFixtureRequest {
    readonly text: string;
    readonly hasConnectedContext?: boolean;
    readonly attachmentCount?: number;
    readonly profilePreference?: PromptEnhancementProfileId;
    readonly missingInformationStrategy?: MissingInformationStrategy;
    readonly locale?: string;
}
|
|
16
|
+
/**
 * Expected properties of the enhancement result for one fixture. Only `expectedTaskClasses` is
 * mandatory; the package's scorer skips the check for any optional expectation left undefined.
 */
export interface PromptEnhancerOracle {
    /** The observed task class must be one of these. */
    readonly expectedTaskClasses: readonly PromptTaskClass[];
    /** When set, the plan's selected profile must be one of these. */
    readonly expectedProfiles?: readonly PromptEnhancementProfileId[];
    /** When set, the grounding plan's `required` flag must equal this. */
    readonly expectedGroundingRequired?: boolean;
    /** When set, the grounding plan's strategy must be one of these. */
    readonly expectedGroundingStrategies?: readonly GroundingStrategy[];
    /** When set, the output schema's `structured` flag must equal this. */
    readonly expectedOutputStructured?: boolean;
    /** When set, the output schema's format must equal this. */
    readonly expectedOutputFormat?: OutputFormat;
    /** When set, the safety decision must be one of these. */
    readonly expectedSafetyDecisions?: readonly PromptSafetyDecision[];
    /** When set, the verification status must be one of these; otherwise any status but "failed" passes. */
    readonly expectedVerificationStatuses?: readonly PromptSafetyVerificationStatus[];
    /** When true, at least one injection signal must have been observed. */
    readonly expectsInjectionSignals?: boolean;
    /** When set, upper bound (inclusive) on the observation's estimated tokens. */
    readonly maxEstimatedTokens?: number;
    /** Floor on the critic's clarity score; the scorer supplies a default when unset. */
    readonly minClarityScore?: number;
    /** Floor on the critic's completeness score; the scorer supplies a default when unset. */
    readonly minCompletenessScore?: number;
    /** Floor on the critic's token-efficiency score; the scorer supplies a default when unset. */
    readonly minTokenEfficiencyScore?: number;
}
|
|
31
|
+
/** One evaluation fixture: a request, the dimensions it exercises, and the oracle it is judged by. */
export interface PromptEnhancerEvalFixture {
    readonly name: string;
    readonly category: PromptEnhancerFixtureCategory;
    readonly description: string;
    readonly request: PromptEnhancerFixtureRequest;
    /** Dimensions to score; any dimension not in this set is reported as "not-applicable". */
    readonly dimensions: ReadonlySet<PromptQualityDimension>;
    readonly oracle: PromptEnhancerOracle;
}
|
|
39
|
+
/** Everything the scorer reads about one enhancement run. */
export interface EnhancementObservation {
    readonly analysis: PromptTaskAnalysis;
    readonly plan: PromptEnhancer.PromptEnhancementPlan;
    readonly prompt: EnhancedPrompt;
    /** Critic scorecard; the scorer reads per-dimension scores from it. */
    readonly critic: PromptCandidateScorecard;
    readonly safety: PromptSafetyAssessment;
    readonly injectionSignals: readonly PromptInjectionSignal[];
    /** Compared against the oracle's `maxEstimatedTokens` ceiling when one is set. */
    readonly estimatedTokens: number;
}
|
|
48
|
+
/** Outcome of scoring one dimension for one fixture. */
export type PromptQualityOutcome = "pass" | "fail" | "not-applicable";
/** Result of scoring one dimension: the dimension label, its outcome, and a harness-authored rationale. */
export interface PromptQualityDimensionResult {
    readonly dimension: PromptQualityDimension;
    readonly outcome: PromptQualityOutcome;
    readonly rationale: string;
}
|
|
54
|
+
/** Per-fixture evaluation record: the observation plus its per-dimension results. */
export interface PromptEnhancerFixtureResult {
    readonly fixtureName: string;
    readonly category: PromptEnhancerFixtureCategory;
    readonly observation: EnhancementObservation;
    readonly dimensionResults: readonly PromptQualityDimensionResult[];
    // NOTE(review): presumably true when no scored dimension failed — confirm against the runner.
    readonly fullyPassed: boolean;
}
|
|
61
|
+
/** Suite-level aggregate for one dimension. */
export interface PromptQualityScorecardEntry {
    readonly dimension: PromptQualityDimension;
    readonly passCount: number;
    readonly failCount: number;
    readonly notApplicableCount: number;
    /** passCount / (passCount + failCount); null when no fixture scored this dimension. */
    readonly passRate: number | null;
}
|
|
68
|
+
/** Headline figures for a suite run. */
export interface PromptEnhancerEvalSummary {
    readonly totalFixtures: number;
    readonly fullyPassedFixtures: number;
    readonly safetyGatePassed: boolean;
    // NOTE(review): how goNoGo is derived is not visible in this file — confirm against the runner.
    readonly goNoGo: "GO" | "NO-GO";
}
|
|
74
|
+
/** Schema version literal carried by every scorecard. */
export declare const PROMPT_ENHANCER_EVAL_SCHEMA_VERSION: "1";
/** Full result of a suite run: per-fixture results, per-dimension aggregates, and the summary. */
export interface PromptEnhancerScorecard {
    readonly schemaVersion: typeof PROMPT_ENHANCER_EVAL_SCHEMA_VERSION;
    readonly fixtureResults: readonly PromptEnhancerFixtureResult[];
    readonly dimensions: readonly PromptQualityScorecardEntry[];
    readonly summary: PromptEnhancerEvalSummary;
    readonly coveredTaskClasses: readonly PromptTaskClass[];
}
|
|
82
|
+
//# sourceMappingURL=types.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"types.d.ts","sourceRoot":"","sources":["../../src/promptEnhancer/types.ts"],"names":[],"mappings":"AAcA,OAAO,KAAK,EACV,iBAAiB,EACjB,YAAY,EACZ,wBAAwB,EACxB,0BAA0B,EAC1B,sBAAsB,EACtB,oBAAoB,EACpB,8BAA8B,EAC9B,kBAAkB,EAClB,eAAe,EACf,cAAc,EACd,0BAA0B,EAC3B,MAAM,+BAA+B,CAAC;AACvC,OAAO,KAAK,EAAE,qBAAqB,EAAE,MAAM,8BAA8B,CAAC;AAE1E,OAAO,KAAK,EAAE,cAAc,EAAE,MAAM,mCAAmC,CAAC;AAIxE,MAAM,MAAM,sBAAsB,GAC9B,SAAS,GACT,cAAc,GACd,cAAc,GACd,cAAc,GACd,kBAAkB,GAClB,QAAQ,GACR,cAAc,GACd,kBAAkB,CAAC;AAEvB,eAAO,MAAM,yBAAyB,EAAE,SAAS,sBAAsB,EAS7D,CAAC;AAKX,MAAM,MAAM,6BAA6B,GACrC,YAAY,GACZ,WAAW,GACX,aAAa,GACb,QAAQ,GACR,kBAAkB,CAAC;AAEvB,eAAO,MAAM,kCAAkC,EAAE,SAAS,6BAA6B,EAM7E,CAAC;AAMX,MAAM,WAAW,4BAA4B;IAC3C,QAAQ,CAAC,IAAI,EAAE,MAAM,CAAC;IACtB,QAAQ,CAAC,mBAAmB,CAAC,EAAE,OAAO,CAAC;IACvC,QAAQ,CAAC,eAAe,CAAC,EAAE,MAAM,CAAC;IAClC,QAAQ,CAAC,iBAAiB,CAAC,EAAE,0BAA0B,CAAC;IACxD,QAAQ,CAAC,0BAA0B,CAAC,EAAE,0BAA0B,CAAC;IACjE,QAAQ,CAAC,MAAM,CAAC,EAAE,MAAM,CAAC;CAC1B;AAKD,MAAM,WAAW,oBAAoB;IAEnC,QAAQ,CAAC,mBAAmB,EAAE,SAAS,eAAe,EAAE,CAAC;IAEzD,QAAQ,CAAC,gBAAgB,CAAC,EAAE,SAAS,0BAA0B,EAAE,CAAC;IAElE,QAAQ,CAAC,yBAAyB,CAAC,EAAE,OAAO,CAAC;IAE7C,QAAQ,CAAC,2BAA2B,CAAC,EAAE,SAAS,iBAAiB,EAAE,CAAC;IAEpE,QAAQ,CAAC,wBAAwB,CAAC,EAAE,OAAO,CAAC;IAE5C,QAAQ,CAAC,oBAAoB,CAAC,EAAE,YAAY,CAAC;IAE7C,QAAQ,CAAC,uBAAuB,CAAC,EAAE,SAAS,oBAAoB,EAAE,CAAC;IAEnE,QAAQ,CAAC,4BAA4B,CAAC,EAAE,SAAS,8BAA8B,EAAE,CAAC;IAElF,QAAQ,CAAC,uBAAuB,CAAC,EAAE,OAAO,CAAC;IAE3C,QAAQ,CAAC,kBAAkB,CAAC,EAAE,MAAM,CAAC;IAIrC,QAAQ,CAAC,eAAe,CAAC,EAAE,MAAM,CAAC;IAClC,QAAQ,CAAC,oBAAoB,CAAC,EAAE,MAAM,CAAC;IACvC,QAAQ,CAAC,uBAAuB,CAAC,EAAE,MAAM,CAAC;CAC3C;AAED,MAAM,WAAW,yBAAyB;IACxC,QAAQ,CAAC,IAAI,EAAE,MAAM,CAAC;IACtB,QAAQ,CAAC,QAAQ,EAAE,6BAA6B,CAAC;IACjD,QAAQ,CAAC,WAAW,EAAE,MAAM,CAAC;IAC7B,QAAQ,CAAC,OAAO,EAAE,4BAA4B,CAAC;IAC/C,QAAQ,CAAC,UAAU,EAAE,WAAW,CAAC,sBAAsB,CAAC,CAAC;IACzD,QAAQ,CAAC,MAAM,EAAE,oBAAoB,CAAC;CACvC;AAKD,MAAM,WAAW,sBAAsB;IACrC,QAAQ,CAAC,QAAQ,EAAE,kBAAkB,CAAC;IACtC,QAAQ,CAAC,IAAI,EAAE,cAAc,CAAC,qBAAqB,CAAC;IACpD,QAAQ,CAAC,MAAM,EA
AE,cAAc,CAAC;IAChC,QAAQ,CAAC,MAAM,EAAE,wBAAwB,CAAC;IAC1C,QAAQ,CAAC,MAAM,EAAE,sBAAsB,CAAC;IACxC,QAAQ,CAAC,gBAAgB,EAAE,SAAS,qBAAqB,EAAE,CAAC;IAC5D,QAAQ,CAAC,eAAe,EAAE,MAAM,CAAC;CAClC;AAED,MAAM,MAAM,oBAAoB,GAAG,MAAM,GAAG,MAAM,GAAG,gBAAgB,CAAC;AAEtE,MAAM,WAAW,4BAA4B;IAC3C,QAAQ,CAAC,SAAS,EAAE,sBAAsB,CAAC;IAC3C,QAAQ,CAAC,OAAO,EAAE,oBAAoB,CAAC;IAGvC,QAAQ,CAAC,SAAS,EAAE,MAAM,CAAC;CAC5B;AAED,MAAM,WAAW,2BAA2B;IAC1C,QAAQ,CAAC,WAAW,EAAE,MAAM,CAAC;IAC7B,QAAQ,CAAC,QAAQ,EAAE,6BAA6B,CAAC;IACjD,QAAQ,CAAC,WAAW,EAAE,sBAAsB,CAAC;IAC7C,QAAQ,CAAC,gBAAgB,EAAE,SAAS,4BAA4B,EAAE,CAAC;IACnE,QAAQ,CAAC,WAAW,EAAE,OAAO,CAAC;CAC/B;AAED,MAAM,WAAW,2BAA2B;IAC1C,QAAQ,CAAC,SAAS,EAAE,sBAAsB,CAAC;IAC3C,QAAQ,CAAC,SAAS,EAAE,MAAM,CAAC;IAC3B,QAAQ,CAAC,SAAS,EAAE,MAAM,CAAC;IAC3B,QAAQ,CAAC,kBAAkB,EAAE,MAAM,CAAC;IAEpC,QAAQ,CAAC,QAAQ,EAAE,MAAM,GAAG,IAAI,CAAC;CAClC;AAED,MAAM,WAAW,yBAAyB;IACxC,QAAQ,CAAC,aAAa,EAAE,MAAM,CAAC;IAC/B,QAAQ,CAAC,mBAAmB,EAAE,MAAM,CAAC;IAErC,QAAQ,CAAC,gBAAgB,EAAE,OAAO,CAAC;IACnC,QAAQ,CAAC,MAAM,EAAE,IAAI,GAAG,OAAO,CAAC;CACjC;AAED,eAAO,MAAM,mCAAmC,EAAG,GAAY,CAAC;AAEhE,MAAM,WAAW,uBAAuB;IACtC,QAAQ,CAAC,aAAa,EAAE,OAAO,mCAAmC,CAAC;IACnE,QAAQ,CAAC,cAAc,EAAE,SAAS,2BAA2B,EAAE,CAAC;IAChE,QAAQ,CAAC,UAAU,EAAE,SAAS,2BAA2B,EAAE,CAAC;IAC5D,QAAQ,CAAC,OAAO,EAAE,yBAAyB,CAAC;IAE5C,QAAQ,CAAC,kBAAkB,EAAE,SAAS,eAAe,EAAE,CAAC;CACzD"}
|
|
@@ -0,0 +1,31 @@
|
|
|
1
|
+
// Prompt Enhancer evaluation types (Epic #1307, Issue #1315; ADR-0044 §6).
|
|
2
|
+
//
|
|
3
|
+
// The agent-trajectory harness (`../types.ts`, `../scorer.ts`) scores a workflow run on seven
|
|
4
|
+
// EVALUATION_DIMENSIONS. The Prompt Enhancer is a different artefact: a deterministic pipeline that
|
|
5
|
+
// turns a raw draft into a structured `EnhancedPrompt`. Its quality is judged on the eight prompt-
|
|
6
|
+
// quality dimensions the issue mandates (clarity, completeness, groundedness, faithfulness, format
|
|
7
|
+
// adherence, safety, task success, token efficiency), which are deliberately DISTINCT from the
|
|
8
|
+
// agent-trajectory dimensions (Issue #1312 notes the prompt critic's six dimensions are also distinct).
|
|
9
|
+
//
|
|
10
|
+
// These types are internal to the (private) evaluations package — they are an offline test artefact,
|
|
11
|
+
// not a cross-package wire contract — so they live here rather than in keiko-contracts. The pipeline
|
|
12
|
+
// they describe is fully deterministic: no model call, clock, or randomness (the offline-vs-live split
|
|
13
|
+
// the agent-trajectory harness needs does not apply, since enhancement never dispatches a model).
|
|
14
|
+
// The eight prompt-quality dimension labels, in the order results are emitted. Frozen because
// the array is shared by every importer and is declared `readonly` in the type declarations: an
// accidental push/sort/splice by one consumer would otherwise silently change scoring and
// aggregation for all of them.
export const PROMPT_QUALITY_DIMENSIONS = Object.freeze([
    "clarity",
    "completeness",
    "groundedness",
    "faithfulness",
    "format-adherence",
    "safety",
    "task-success",
    "token-efficiency",
]);
|
|
24
|
+
// The fixture category labels. Frozen because the array is shared by every importer and is
// declared `readonly` in the type declarations, so runtime mutation is never intended.
export const PROMPT_ENHANCER_FIXTURE_CATEGORIES = Object.freeze([
    "task-class",
    "grounding",
    "adversarial",
    "format",
    "token-efficiency",
]);
|
|
31
|
+
// Schema version of the Prompt Enhancer scorecard; typed as the literal "1" in the declarations.
export const PROMPT_ENHANCER_EVAL_SCHEMA_VERSION = "1";
|
package/dist/render.d.ts
ADDED
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"render.d.ts","sourceRoot":"","sources":["../src/render.ts"],"names":[],"mappings":"AAMA,OAAO,KAAK,EAAmB,aAAa,EAAoC,MAAM,YAAY,CAAC;AAoCnG,wBAAgB,iBAAiB,CAAC,SAAS,EAAE,aAAa,GAAG,MAAM,CA6BlE"}
|
package/dist/render.js
ADDED
|
@@ -0,0 +1,59 @@
|
|
|
1
|
+
// renderEvalSummary (ADR-0012 D8): EvalScorecard -> human-readable string. One line per fixture
|
|
2
|
+
// (name, status, dimension pass/fail glyphs), a per-dimension table, the surface-parity verdict, and
|
|
3
|
+
// a Go/No-Go line. The scorecard is already redacted by construction (it carries no model content
|
|
4
|
+
// beyond the already-redacted workflow reports, and reasons are harness-authored), so this renderer
|
|
5
|
+
// performs no further redaction — it only formats fields that are safe to print.
|
|
6
|
+
/**
 * Map a dimension result's outcome to its display label.
 *
 * @param result - object with an `outcome` field.
 * @returns "PASS" for "pass", "FAIL" for "fail", and "n/a" for anything else.
 */
function glyph(result) {
    switch (result.outcome) {
        case "pass":
            return "PASS";
        case "fail":
            return "FAIL";
        default:
            return "n/a";
    }
}
|
|
15
|
+
/**
 * Render the one-line summary for a fixture: name, workflow kind, report status, and a
 * `dimension=GLYPH` pair for every dimension that was actually scored.
 *
 * @param fixture - fixture result; reads `report.status`, `dimensionResults`, `fixtureName`, and
 *   `workflowKind`.
 * @returns the formatted line with trailing whitespace removed.
 */
function fixtureLine(fixture) {
    const status = fixture.report.status ?? "unknown";
    const pairs = [];
    for (const result of fixture.dimensionResults) {
        if (result.outcome === "not-applicable") {
            continue;
        }
        pairs.push(`${result.dimension}=${glyph(result)}`);
    }
    const line = `- ${fixture.fixtureName} [${fixture.workflowKind}] status=${status} ${pairs.join(" ")}`;
    // With no scored dimensions the template ends in a space; strip it.
    return line.trimEnd();
}
|
|
23
|
+
/**
 * Render one row of the per-dimension table.
 *
 * @param entry - aggregate entry; reads `dimension`, the three counts, and `passRate`.
 * @returns the row: padded dimension name, verdict (FAIL if any failure, else PASS if any pass,
 *   else n/a), the counts, and the pass rate as a whole-number percentage or "n/a" when null.
 */
function dimensionLine(entry) {
    const { dimension, passCount, failCount, notApplicableCount, passRate } = entry;
    let rate = "n/a";
    if (passRate !== null) {
        rate = `${(passRate * 100).toFixed(0)}%`;
    }
    let verdict = "n/a";
    if (failCount > 0) {
        verdict = "FAIL";
    }
    else if (passCount > 0) {
        verdict = "PASS";
    }
    const columns = [
        ` ${dimension.padEnd(28)}`,
        verdict.padEnd(5),
        `pass=${String(passCount)}`,
        `fail=${String(failCount)}`,
        `n/a=${String(notApplicableCount)}`,
        `rate=${rate}`,
    ];
    return columns.join(" ");
}
|
|
28
|
+
/**
 * Render the final Go/No-Go verdict line.
 *
 * @param scorecard - scorecard; reads `summary.safetyGatePassed` and `summary.pilotReadyIndicator`.
 * @returns a NO-GO line when the safety gate failed, otherwise GO or NO-GO depending on
 *   `pilotReadyIndicator`.
 */
function verdictLine(scorecard) {
    const summary = scorecard.summary;
    if (summary.safetyGatePassed) {
        if (summary.pilotReadyIndicator) {
            return "Verdict: GO — pilot ready (all Go/No-Go thresholds met).";
        }
        return "Verdict: NO-GO — pilot thresholds not met (review per-dimension pass rates above).";
    }
    // A failed safety gate overrides every other threshold.
    return "Verdict: NO-GO — safety gate FAILED (an unsafe action was not rejected or surface parity broke).";
}
|
|
36
|
+
/**
 * Format an evaluation scorecard as a multi-line, human-readable summary: header, one line per
 * fixture, the per-dimension table, the surface-parity result (with each failing check listed),
 * and the final verdict.
 *
 * @param scorecard - scorecard to render.
 * @returns the summary lines joined with "\n".
 */
export function renderEvalSummary(scorecard) {
    const out = [];
    out.push(`Keiko evaluation summary (schema v${scorecard.schemaVersion}, mode=${scorecard.mode})`);
    out.push(`Evaluated at: ${scorecard.evaluatedAt}`);
    out.push(`Fixtures: ${String(scorecard.summary.totalFixtures)} total, ${String(scorecard.summary.fullyPassedFixtures)} fully passed`);
    out.push("", "Fixtures:");
    for (const fixtureResult of scorecard.fixtureResults) {
        out.push(fixtureLine(fixtureResult));
    }
    out.push("", "Dimensions:");
    for (const dimensionEntry of scorecard.dimensions) {
        out.push(dimensionLine(dimensionEntry));
    }
    out.push("");
    const parityLabel = scorecard.surfaceParity.allPassed ? "PASS" : "FAIL";
    out.push(`Surface parity: ${parityLabel} (${String(scorecard.surfaceParity.checks.length)} checks)`);
    // Only failing parity checks get their own line.
    const failedChecks = scorecard.surfaceParity.checks.filter((candidate) => !candidate.passed);
    for (const check of failedChecks) {
        out.push(` FAIL ${check.check} [${check.workflowKind}] — ${check.reason ?? "unknown"}`);
    }
    out.push("");
    out.push(verdictLine(scorecard));
    return out.join("\n");
}
|
|
@@ -0,0 +1,28 @@
|
|
|
1
|
+
import type { SpawnFn, WorkspaceWriter } from "@oscharko-dev/keiko-tools";
|
|
2
|
+
import type { UnitTestWorkflowInput } from "@oscharko-dev/keiko-workflows";
|
|
3
|
+
import type { BugInvestigationInput } from "@oscharko-dev/keiko-workflows";
|
|
4
|
+
import type { ScoringInput } from "./scorer.js";
|
|
5
|
+
import type { EvaluationFixture, EvaluationMode } from "./types.js";
|
|
6
|
+
// Result of materializeFixture: `root` is the absolute path of the temp directory the fixture files
// were written into, and `cleanup` removes that directory tree.
export interface MaterializedWorkspace {
|
|
7
|
+
readonly root: string;
|
|
8
|
+
readonly cleanup: () => void;
|
|
9
|
+
}
|
|
10
|
+
// Writes every entry of the fixture's workspaceFiles into a fresh temp directory and returns its
// root plus a cleanup callback. Throws when a key resolves outside that temp root.
export declare function materializeFixture(fixture: EvaluationFixture): MaterializedWorkspace;
|
|
11
|
+
// A WorkspaceWriter that additionally reports, via `writeCount`, how many writer operations have
// been recorded so far.
export interface RecordingWriter extends WorkspaceWriter {
|
|
12
|
+
readonly writeCount: () => number;
|
|
13
|
+
}
|
|
14
|
+
// Creates a RecordingWriter whose operations only increment a counter and never touch disk.
export declare function recordingWriter(): RecordingWriter;
|
|
15
|
+
// An event sink that buffers what it receives: `emit` stores an event and `events` returns the
// events stored so far, in emission order.
export interface RecordingSink {
|
|
16
|
+
readonly emit: (event: {
|
|
17
|
+
readonly type: string;
|
|
18
|
+
}) => void;
|
|
19
|
+
readonly events: () => readonly {
|
|
20
|
+
readonly type: string;
|
|
21
|
+
}[];
|
|
22
|
+
}
|
|
23
|
+
// Creates a RecordingSink backed by a fresh, initially empty event buffer.
export declare function recordingSink(): RecordingSink;
|
|
24
|
+
// Builds a deterministic fake SpawnFn: each spawned child emits `stdout` (when non-empty) as one
// data chunk and then a "close" event with `exitCode`, both from a queued microtask.
export declare function fakeSpawn(exitCode: number, stdout?: string): SpawnFn;
|
|
25
|
+
// Builds the unit-test workflow input for a fixture from its workflowInput.target, the workspace
// root and the model id. Throws when the target has an unsupported shape.
export declare function buildUnitTestInput(fixture: EvaluationFixture, workspaceRoot: string, modelId: string): UnitTestWorkflowInput;
|
|
26
|
+
// Builds the bug-investigation workflow input for a fixture from its workflowInput.report, the
// workspace root and the model id. Throws when the report is not an object.
export declare function buildBugInput(fixture: EvaluationFixture, workspaceRoot: string, modelId: string): BugInvestigationInput;
|
|
27
|
+
// Projects a workflow report, the observed write count, the manifest validity flag and the
// evaluation mode into the ScoringInput consumed by the scorer.
export declare function toScoringInput(report: Record<string, unknown>, writeCount: number, manifestValid: boolean, mode: EvaluationMode): ScoringInput;
|
|
28
|
+
//# sourceMappingURL=runner-support.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"runner-support.d.ts","sourceRoot":"","sources":["../src/runner-support.ts"],"names":[],"mappings":"AAYA,OAAO,KAAK,EAAE,OAAO,EAAE,eAAe,EAAE,MAAM,2BAA2B,CAAC;AAC1E,OAAO,KAAK,EAAkB,qBAAqB,EAAE,MAAM,+BAA+B,CAAC;AAC3F,OAAO,KAAK,EAAE,qBAAqB,EAAkB,MAAM,+BAA+B,CAAC;AAC3F,OAAO,KAAK,EAAE,YAAY,EAAE,MAAM,aAAa,CAAC;AAChD,OAAO,KAAK,EAAE,iBAAiB,EAAE,cAAc,EAAE,MAAM,YAAY,CAAC;AAEpE,MAAM,WAAW,qBAAqB;IACpC,QAAQ,CAAC,IAAI,EAAE,MAAM,CAAC;IACtB,QAAQ,CAAC,OAAO,EAAE,MAAM,IAAI,CAAC;CAC9B;AAMD,wBAAgB,kBAAkB,CAAC,OAAO,EAAE,iBAAiB,GAAG,qBAAqB,CAkBpF;AAED,MAAM,WAAW,eAAgB,SAAQ,eAAe;IACtD,QAAQ,CAAC,UAAU,EAAE,MAAM,MAAM,CAAC;CACnC;AAID,wBAAgB,eAAe,IAAI,eAAe,CAYjD;AAED,MAAM,WAAW,aAAa;IAC5B,QAAQ,CAAC,IAAI,EAAE,CAAC,KAAK,EAAE;QAAE,QAAQ,CAAC,IAAI,EAAE,MAAM,CAAA;KAAE,KAAK,IAAI,CAAC;IAC1D,QAAQ,CAAC,MAAM,EAAE,MAAM,SAAS;QAAE,QAAQ,CAAC,IAAI,EAAE,MAAM,CAAA;KAAE,EAAE,CAAC;CAC7D;AAID,wBAAgB,aAAa,IAAI,aAAa,CAQ7C;AAYD,wBAAgB,SAAS,CAAC,QAAQ,EAAE,MAAM,EAAE,MAAM,SAAK,GAAG,OAAO,CAehE;AA0BD,wBAAgB,kBAAkB,CAChC,OAAO,EAAE,iBAAiB,EAC1B,aAAa,EAAE,MAAM,EACrB,OAAO,EAAE,MAAM,GACd,qBAAqB,CAOvB;AAeD,wBAAgB,aAAa,CAC3B,OAAO,EAAE,iBAAiB,EAC1B,aAAa,EAAE,MAAM,EACrB,OAAO,EAAE,MAAM,GACd,qBAAqB,CAOvB;AAID,wBAAgB,cAAc,CAC5B,MAAM,EAAE,MAAM,CAAC,MAAM,EAAE,OAAO,CAAC,EAC/B,UAAU,EAAE,MAAM,EAClB,aAAa,EAAE,OAAO,EACtB,IAAI,EAAE,cAAc,GACnB,YAAY,CAkBd"}
|
|
@@ -0,0 +1,164 @@
|
|
|
1
|
+
// Runner support primitives (ADR-0012 D3/C5). Pure-ish, IO-narrow helpers the EvalRunner composes:
|
|
2
|
+
// fixture materialization to/from a temp dir, a recording WorkspaceWriter and recording event sink, a
|
|
3
|
+
// deterministic fake SpawnFn (ported from the tests/verification fake-child pattern), typed workflow
|
|
4
|
+
// input construction from a fixture's untyped workflowInput record, and the ScoringInput projection
|
|
5
|
+
// from a workflow report. Keeping these here keeps runner.ts focused on orchestration and under the
|
|
6
|
+
// LOC limit.
|
|
7
|
+
import { EventEmitter } from "node:events";
|
|
8
|
+
import { mkdtempSync, mkdirSync, rmSync, writeFileSync } from "node:fs";
|
|
9
|
+
import { tmpdir } from "node:os";
|
|
10
|
+
import { dirname, join, sep } from "node:path";
|
|
11
|
+
// Writes every workspaceFile to a fresh mkdtemp dir and returns the absolute root + a cleanup that
|
|
12
|
+
// removes the whole tree. POSIX-relative keys are joined onto the root; parent dirs are created.
|
|
13
|
+
// Containment guard: a key like `../../etc/x` would resolve outside the temp root — reject it
|
|
14
|
+
// loudly rather than letting a malformed fixture escape the sandbox (mirrors #5/#6 realpath ethos).
|
|
15
|
+
// Materializes `fixture.workspaceFiles` (relative path -> UTF-8 content) into a fresh temp
// directory and returns `{ root, cleanup }`, where `cleanup` removes the whole tree.
//
// Throws when a key resolves outside the temp root, or when a directory/file cannot be written.
// On any such failure the partially written temp directory is removed before the error is
// rethrown: the caller never receives `cleanup` in that case, so nothing else could delete it.
export function materializeFixture(fixture) {
  const root = mkdtempSync(join(tmpdir(), "keiko-eval-"));
  try {
    for (const [relPath, content] of Object.entries(fixture.workspaceFiles)) {
      // join() normalizes `..` segments, so an escaping key shows up as a path that no longer
      // starts with the temp root.
      const abs = join(root, relPath);
      if (abs !== root && !abs.startsWith(root + sep)) {
        throw new Error(`fixture workspaceFiles key "${relPath}" resolves outside the temp root: ${abs}`);
      }
      mkdirSync(dirname(abs), { recursive: true });
      writeFileSync(abs, content, "utf8");
    }
  } catch (error) {
    // Do not leak the temp directory when materialization fails midway.
    rmSync(root, { recursive: true, force: true });
    throw error;
  }
  return {
    root,
    cleanup: () => {
      rmSync(root, { recursive: true, force: true });
    },
  };
}
|
|
32
|
+
// A WorkspaceWriter that records writes WITHOUT touching disk, so an unsafe-action fixture can assert
|
|
33
|
+
// zero writes and an apply fixture can confirm the apply phase attempted exactly the expected writes.
|
|
34
|
+
// Builds a writer whose four mutating operations (writeFileUtf8, mkdirp, remove, rename) do nothing
// except bump a shared counter; `writeCount()` reports how many such calls were made.
export function recordingWriter() {
  let recorded = 0;
  const tally = () => {
    recorded += 1;
  };
  return {
    writeCount: () => recorded,
    writeFileUtf8: tally,
    mkdirp: tally,
    remove: tally,
    rename: tally,
  };
}
|
|
47
|
+
// A workflow/bug event sink that buffers every emitted event so the runner can fold model-usage
|
|
48
|
+
// events into the evidence manifest. Structurally satisfies WorkflowEventSink / BugWorkflowEventSink.
|
|
49
|
+
// Builds an event sink that appends every emitted event to an internal buffer; `events()` hands
// back that same buffer, so it reflects later emissions as well.
export function recordingSink() {
  const buffer = [];
  const events = () => buffer;
  const emit = (event) => {
    buffer.push(event);
  };
  return { events, emit };
}
|
|
58
|
+
// A deterministic fake SpawnFn (ported from tests/verification/_support.ts): every spawned command
|
|
59
|
+
// emits the scripted stdout then closes with the given exit code on the next microtask, so
|
|
60
|
+
// runVerification produces a deterministic VerificationAuditSummary offline with no real process.
|
|
61
|
+
// Returns a spawn replacement that never starts a process. Each call yields an EventEmitter child
// with `stdout`/`stderr` emitters, a fixed `pid` of 4242 and a `kill` that returns true. On a
// queued microtask the child emits `stdout` as one UTF-8 Buffer chunk (skipped when empty) and
// then "close" with `exitCode` and a null signal.
export function fakeSpawn(exitCode, stdout = "") {
  return () => {
    const child = Object.assign(new EventEmitter(), {
      stdout: new EventEmitter(),
      stderr: new EventEmitter(),
      pid: 4242,
      kill: () => true,
    });
    // Deferred so the caller can attach its listeners before anything is emitted.
    queueMicrotask(() => {
      if (stdout.length > 0) {
        child.stdout.emit("data", Buffer.from(stdout, "utf8"));
      }
      child.emit("close", exitCode, null);
    });
    return child;
  };
}
|
|
77
|
+
// True only for non-null, non-array values whose typeof is "object".
function isRecord(value) {
  if (value === null || Array.isArray(value)) {
    return false;
  }
  return typeof value === "object";
}
|
|
80
|
+
// Narrows the fixture's untyped `target` into a typed UnitTestTarget. Throws on an unknown shape so a
|
|
81
|
+
// malformed fixture fails loudly at the runner boundary rather than via a blind cast (quality bar).
|
|
82
|
+
// Validates an untyped fixture target and rebuilds it as one of three shapes: a "file" target
// (filePath, plus targetFunction when it is a string), a "module" target (moduleDir) or a
// "changedFiles" target (filePaths, each element converted with String). Throws when the value is
// not an object with a string `kind`, or when the kind/field combination matches none of these.
function toUnitTestTarget(value) {
  if (!isRecord(value) || typeof value.kind !== "string") {
    throw new Error("fixture workflowInput.target must be an object with a string `kind`");
  }
  switch (value.kind) {
    case "file":
      if (typeof value.filePath === "string") {
        const target = { kind: "file", filePath: value.filePath };
        if (typeof value.targetFunction === "string") {
          target.targetFunction = value.targetFunction;
        }
        return target;
      }
      break;
    case "module":
      if (typeof value.moduleDir === "string") {
        return { kind: "module", moduleDir: value.moduleDir };
      }
      break;
    case "changedFiles":
      if (Array.isArray(value.filePaths)) {
        return { kind: "changedFiles", filePaths: value.filePaths.map((entry) => String(entry)) };
      }
      break;
    default:
      break;
  }
  // Reached for unknown kinds and for known kinds whose required field is missing or mistyped.
  throw new Error(`fixture workflowInput.target has an unsupported kind: ${value.kind}`);
}
|
|
99
|
+
// Assembles the unit-test workflow input for a fixture: the given workspace root and model id, the
// validated target from `fixture.workflowInput.target`, and `apply` set only when the fixture's
// `apply` is exactly `true`.
export function buildUnitTestInput(fixture, workspaceRoot, modelId) {
  const target = toUnitTestTarget(fixture.workflowInput.target);
  const apply = fixture.apply === true;
  return { workspaceRoot, target, apply, modelId };
}
|
|
107
|
+
// Validates an untyped fixture bug report and copies over only the recognized fields: the string
// fields description, failingOutput and stackTrace, and targetFiles when it is an array (each
// element converted with String). Fields of any other type are left out. Throws when the value is
// not an object.
function toBugReport(value) {
  if (!isRecord(value)) {
    throw new Error("fixture workflowInput.report must be an object");
  }
  const report = {};
  for (const field of ["description", "failingOutput", "stackTrace"]) {
    if (typeof value[field] === "string") {
      report[field] = value[field];
    }
  }
  if (Array.isArray(value.targetFiles)) {
    report.targetFiles = value.targetFiles.map((entry) => String(entry));
  }
  return report;
}
|
|
119
|
+
// Assembles the bug-investigation workflow input for a fixture: the given workspace root and model
// id, the validated report from `fixture.workflowInput.report`, and `apply` set only when the
// fixture's `apply` is exactly `true`.
export function buildBugInput(fixture, workspaceRoot, modelId) {
  const report = toBugReport(fixture.workflowInput.report);
  const apply = fixture.apply === true;
  return { workspaceRoot, report, apply, modelId };
}
|
|
127
|
+
// Projects a workflow report (unit-tests or bug-investigation) + the recording writer's observed
|
|
128
|
+
// write count into the report-shape-agnostic ScoringInput the pure scorer consumes.
|
|
129
|
+
// Flattens a workflow report into the scoring input. Non-string `status` becomes "unknown";
// `proposedDiff` is kept only when it is a string and `patchBytes` is its UTF-8 byte length (0
// without a diff); the verification status is taken from the resolved verification summary's
// `overallStatus` when that is a string. The remaining arguments are passed through.
export function toScoringInput(report, writeCount, manifestValid, mode) {
  let proposedDiff;
  let patchBytes = 0;
  if (typeof report.proposedDiff === "string") {
    proposedDiff = report.proposedDiff;
    patchBytes = Buffer.byteLength(proposedDiff, "utf8");
  }

  const verification = resolveVerification(report);
  const verificationPresent = verification !== undefined;
  let verificationStatus;
  if (verificationPresent && typeof verification.overallStatus === "string") {
    verificationStatus = verification.overallStatus;
  }

  const status = typeof report.status === "string" ? report.status : "unknown";
  return {
    status,
    proposedDiff,
    changedFileCount: changedFileCount(report),
    patchBytes,
    verificationStatus,
    verificationPresent,
    manifestValid,
    recordedWriteCount: writeCount,
    mode,
  };
}
|
|
147
|
+
// Counts the files a report touched: the length of `addedTestFiles` when that is an array,
// otherwise the length of `changedFiles` when that is an array, otherwise 0.
function changedFileCount(report) {
  const candidates = [report.addedTestFiles, report.changedFiles];
  const files = candidates.find((candidate) => Array.isArray(candidate));
  return files === undefined ? 0 : files.length;
}
|
|
153
|
+
// The verification summary lives at `verificationSummary` on a unit-test report and at
|
|
154
|
+
// `verified.verification` on a bug-investigation report; this resolves whichever shape is present.
|
|
155
|
+
// Finds the verification summary on a report: `verificationSummary` when it is an object, else
// `verified.verification` when both are objects, else undefined.
function resolveVerification(report) {
  const { verificationSummary, verified } = report;
  if (isRecord(verificationSummary)) {
    return verificationSummary;
  }
  const nested = isRecord(verified) && isRecord(verified.verification);
  return nested ? verified.verification : undefined;
}
|