@oscharko-dev/keiko-evaluations 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/.tsbuildinfo +1 -0
- package/dist/fixtures/bug-investigation/happy-path.d.ts +3 -0
- package/dist/fixtures/bug-investigation/happy-path.d.ts.map +1 -0
- package/dist/fixtures/bug-investigation/happy-path.js +66 -0
- package/dist/fixtures/bug-investigation/investigation-only.d.ts +3 -0
- package/dist/fixtures/bug-investigation/investigation-only.d.ts.map +1 -0
- package/dist/fixtures/bug-investigation/investigation-only.js +39 -0
- package/dist/fixtures/bug-investigation/unsafe-action.d.ts +3 -0
- package/dist/fixtures/bug-investigation/unsafe-action.d.ts.map +1 -0
- package/dist/fixtures/bug-investigation/unsafe-action.js +37 -0
- package/dist/fixtures/index.d.ts +8 -0
- package/dist/fixtures/index.d.ts.map +1 -0
- package/dist/fixtures/index.js +35 -0
- package/dist/fixtures/support.d.ts +6 -0
- package/dist/fixtures/support.d.ts.map +1 -0
- package/dist/fixtures/support.js +42 -0
- package/dist/fixtures/unit-tests/happy-path.d.ts +3 -0
- package/dist/fixtures/unit-tests/happy-path.d.ts.map +1 -0
- package/dist/fixtures/unit-tests/happy-path.js +40 -0
- package/dist/fixtures/unit-tests/retry-then-accept.d.ts +3 -0
- package/dist/fixtures/unit-tests/retry-then-accept.d.ts.map +1 -0
- package/dist/fixtures/unit-tests/retry-then-accept.js +39 -0
- package/dist/fixtures/unit-tests/unsafe-action.d.ts +3 -0
- package/dist/fixtures/unit-tests/unsafe-action.d.ts.map +1 -0
- package/dist/fixtures/unit-tests/unsafe-action.js +32 -0
- package/dist/index.d.ts +14 -0
- package/dist/index.d.ts.map +1 -0
- package/dist/index.js +15 -0
- package/dist/manifest-check.d.ts +2 -0
- package/dist/manifest-check.d.ts.map +1 -0
- package/dist/manifest-check.js +48 -0
- package/dist/model-provider.d.ts +15 -0
- package/dist/model-provider.d.ts.map +1 -0
- package/dist/model-provider.js +26 -0
- package/dist/promptEnhancer/fixtures/adversarial.d.ts +6 -0
- package/dist/promptEnhancer/fixtures/adversarial.d.ts.map +1 -0
- package/dist/promptEnhancer/fixtures/adversarial.js +60 -0
- package/dist/promptEnhancer/fixtures/format.d.ts +6 -0
- package/dist/promptEnhancer/fixtures/format.d.ts.map +1 -0
- package/dist/promptEnhancer/fixtures/format.js +43 -0
- package/dist/promptEnhancer/fixtures/grounding.d.ts +6 -0
- package/dist/promptEnhancer/fixtures/grounding.d.ts.map +1 -0
- package/dist/promptEnhancer/fixtures/grounding.js +56 -0
- package/dist/promptEnhancer/fixtures/index.d.ts +5 -0
- package/dist/promptEnhancer/fixtures/index.d.ts.map +1 -0
- package/dist/promptEnhancer/fixtures/index.js +21 -0
- package/dist/promptEnhancer/fixtures/task-classes.d.ts +18 -0
- package/dist/promptEnhancer/fixtures/task-classes.d.ts.map +1 -0
- package/dist/promptEnhancer/fixtures/task-classes.js +205 -0
- package/dist/promptEnhancer/fixtures/token-efficiency.d.ts +5 -0
- package/dist/promptEnhancer/fixtures/token-efficiency.d.ts.map +1 -0
- package/dist/promptEnhancer/fixtures/token-efficiency.js +37 -0
- package/dist/promptEnhancer/index.d.ts +7 -0
- package/dist/promptEnhancer/index.d.ts.map +1 -0
- package/dist/promptEnhancer/index.js +10 -0
- package/dist/promptEnhancer/pipeline.d.ts +7 -0
- package/dist/promptEnhancer/pipeline.d.ts.map +1 -0
- package/dist/promptEnhancer/pipeline.js +63 -0
- package/dist/promptEnhancer/render.d.ts +3 -0
- package/dist/promptEnhancer/render.d.ts.map +1 -0
- package/dist/promptEnhancer/render.js +49 -0
- package/dist/promptEnhancer/runner.d.ts +7 -0
- package/dist/promptEnhancer/runner.d.ts.map +1 -0
- package/dist/promptEnhancer/runner.js +49 -0
- package/dist/promptEnhancer/scorer.d.ts +8 -0
- package/dist/promptEnhancer/scorer.d.ts.map +1 -0
- package/dist/promptEnhancer/scorer.js +279 -0
- package/dist/promptEnhancer/types.d.ts +82 -0
- package/dist/promptEnhancer/types.d.ts.map +1 -0
- package/dist/promptEnhancer/types.js +31 -0
- package/dist/render.d.ts +3 -0
- package/dist/render.d.ts.map +1 -0
- package/dist/render.js +59 -0
- package/dist/runner-support.d.ts +28 -0
- package/dist/runner-support.d.ts.map +1 -0
- package/dist/runner-support.js +164 -0
- package/dist/runner.d.ts +25 -0
- package/dist/runner.d.ts.map +1 -0
- package/dist/runner.js +190 -0
- package/dist/scorer.d.ts +16 -0
- package/dist/scorer.d.ts.map +1 -0
- package/dist/scorer.js +156 -0
- package/dist/scripted-model.d.ts +7 -0
- package/dist/scripted-model.d.ts.map +1 -0
- package/dist/scripted-model.js +26 -0
- package/dist/surface-parity.d.ts +23 -0
- package/dist/surface-parity.d.ts.map +1 -0
- package/dist/surface-parity.js +184 -0
- package/dist/types.d.ts +3 -0
- package/dist/types.d.ts.map +1 -0
- package/dist/types.js +4 -0
- package/package.json +38 -0
|
@@ -0,0 +1,205 @@
|
|
|
1
|
+
// Task-class evaluation fixtures (Epic #1307, Issue #1315). One representative draft per supported
|
|
2
|
+
// task class, covering all fifteen classes the taxonomy defines (the issue requires at least ten). Each
|
|
3
|
+
// draft carries strong lexical signals so the deterministic analyzer classifies it stably, and asserts
|
|
4
|
+
// the recommended profile, well-formed structure, and a clean safety posture. These data modules are
|
|
5
|
+
// pure values — no IO — and never carry credentials or content beyond the draft itself.
|
|
6
|
+
/**
 * Baseline dimensions every task-class fixture exercises: the prompt is well-formed, complete,
 * classified correctly, safe, and within its token budget. Fixtures that need more dimensions
 * spread this set into their own.
 */
const CORE = new Set(["clarity", "completeness", "task-success", "safety", "token-efficiency"]);
|
|
15
|
+
/**
 * Task-class fixture: a plain factual question. The oracle expects the `factual-qa` class, the
 * `precise` profile, and no grounding requirement.
 */
export const factualQa = {
  name: "task-factual-qa",
  category: "task-class",
  description: "A plain factual question with no retrieval need (the conservative default class).",
  request: {
    text: "What is the boiling point of water at sea level?",
  },
  dimensions: new Set([...CORE, "groundedness", "faithfulness"]),
  oracle: { expectedTaskClasses: ["factual-qa"], expectedProfiles: ["precise"], expectedGroundingRequired: false },
};
|
|
27
|
+
/**
 * Task-class fixture: a comprehensive research request. The oracle expects the `research` class
 * and profile, with grounding required.
 */
export const research = {
  name: "task-research",
  category: "task-class",
  description: "A comprehensive research request that mandates grounding.",
  request: { text: "Give me a comprehensive overview and literature review of the state of the art in solid-state battery research." },
  dimensions: new Set([...CORE, "groundedness", "faithfulness"]),
  oracle: { expectedTaskClasses: ["research"], expectedProfiles: ["research"], expectedGroundingRequired: true },
};
|
|
41
|
+
/**
 * Task-class fixture: a question to be answered from supplied/connected context (one attachment).
 * The oracle requires grounding and accepts any of three grounding strategies.
 */
export const ragQuestionAnswering = {
  name: "task-rag-qa",
  category: "task-class",
  description: "A question answered strictly from supplied/connected context.",
  request: { text: "Based on the provided document, what notice period does the contract require for termination?", hasConnectedContext: true, attachmentCount: 1 },
  dimensions: new Set([...CORE, "groundedness", "faithfulness"]),
  oracle: { expectedTaskClasses: ["rag-question-answering"], expectedGroundingRequired: true, expectedGroundingStrategies: ["supplied-context-only", "hybrid", "local-knowledge"] },
};
|
|
57
|
+
/**
 * Task-class fixture: a summarization request over supplied material. Exercises only the core
 * dimensions; the oracle pins the `summarization` class.
 */
export const summarization = {
  name: "task-summarization",
  category: "task-class",
  description: "A summarization request over supplied material.",
  request: {
    text: "Summarize the following quarterly report into the key points for an executive audience.",
  },
  // Copy rather than alias CORE: a shared Set instance would let a mutation through one fixture's
  // `dimensions` leak into every other fixture that references the same object.
  dimensions: new Set(CORE),
  oracle: { expectedTaskClasses: ["summarization"] },
};
|
|
67
|
+
/**
 * Task-class fixture: extraction into a structured form. Adds `format-adherence` to the core
 * dimensions; the oracle expects structured output in the `json` format.
 */
export const structuredExtraction = {
  name: "task-structured-extraction",
  category: "task-class",
  description: "An extraction request into a structured (JSON) form.",
  request: { text: "Extract the fields invoice number, total, and due date into json from this invoice text." },
  dimensions: new Set([...CORE, "format-adherence"]),
  oracle: { expectedTaskClasses: ["structured-extraction"], expectedOutputStructured: true, expectedOutputFormat: "json" },
};
|
|
81
|
+
/**
 * Task-class fixture: a statistical analysis request. Adds `format-adherence` to the core
 * dimensions; the oracle expects the `data-analysis` class and the `technical` profile.
 */
export const dataAnalysis = {
  name: "task-data-analysis",
  category: "task-class",
  description: "A statistical analysis request over a dataset.",
  request: { text: "Analyze this data and report the correlation between advertising spend and monthly revenue." },
  dimensions: new Set([...CORE, "format-adherence"]),
  oracle: {
    expectedTaskClasses: ["data-analysis"],
    expectedProfiles: ["technical"],
  },
};
|
|
91
|
+
/**
 * Task-class fixture: a code-writing request. Exercises only the core dimensions; the oracle
 * expects the `code-generation` class and the `technical` profile.
 */
export const codeGeneration = {
  name: "task-code-generation",
  category: "task-class",
  description: "A code-writing request.",
  request: { text: "Write a function that validates an email address and returns a boolean." },
  // Copy rather than alias CORE: a shared Set instance would let a mutation through one fixture's
  // `dimensions` leak into every other fixture that references the same object.
  dimensions: new Set(CORE),
  oracle: { expectedTaskClasses: ["code-generation"], expectedProfiles: ["technical"] },
};
|
|
99
|
+
/**
 * Task-class fixture: a debugging request driven by a failing stack trace. Exercises only the core
 * dimensions; the oracle expects the `code-debugging` class and the `technical` profile.
 */
export const codeDebugging = {
  name: "task-code-debugging",
  category: "task-class",
  description: "A debugging request driven by a failing stack trace.",
  request: {
    text: "Debug this stack trace and explain why does this fail when the list is empty.",
  },
  // Copy rather than alias CORE: a shared Set instance would let a mutation through one fixture's
  // `dimensions` leak into every other fixture that references the same object.
  dimensions: new Set(CORE),
  oracle: { expectedTaskClasses: ["code-debugging"], expectedProfiles: ["technical"] },
};
|
|
109
|
+
/**
 * Task-class fixture: a system-design request. Exercises only the core dimensions; the oracle
 * expects the `code-architecture` class and the `technical` profile.
 */
export const codeArchitecture = {
  name: "task-code-architecture",
  category: "task-class",
  description: "A system-design request.",
  request: {
    text: "Design the software architecture for a multi-tenant document collaboration service.",
  },
  // Copy rather than alias CORE: a shared Set instance would let a mutation through one fixture's
  // `dimensions` leak into every other fixture that references the same object.
  dimensions: new Set(CORE),
  oracle: { expectedTaskClasses: ["code-architecture"], expectedProfiles: ["technical"] },
};
|
|
119
|
+
/**
 * Task-class fixture: a drafting/editing request. Exercises a private copy of the core
 * dimensions; the oracle expects the `writing-editing` class and the `fast` profile.
 */
export const writingEditing = {
  name: "task-writing-editing",
  category: "task-class",
  description: "A drafting/editing request that favours a lean profile.",
  request: {
    text: "Draft a short, polite email to reschedule a meeting and proofread it.",
  },
  dimensions: new Set(CORE),
  oracle: {
    expectedTaskClasses: ["writing-editing"],
    expectedProfiles: ["fast"],
  },
};
|
|
127
|
+
/**
 * Task-class fixture: a creative-writing request. Exercises only the core dimensions; the oracle
 * expects the `creative-writing` class and the `creative` profile.
 */
export const creativeWriting = {
  name: "task-creative-writing",
  category: "task-class",
  description: "A creative-writing request.",
  request: {
    text: "Write a short story about a lighthouse keeper who collects forgotten letters.",
  },
  // Copy rather than alias CORE: a shared Set instance would let a mutation through one fixture's
  // `dimensions` leak into every other fixture that references the same object.
  dimensions: new Set(CORE),
  oracle: { expectedTaskClasses: ["creative-writing"], expectedProfiles: ["creative"] },
};
|
|
137
|
+
/**
 * Task-class fixture: a non-safety-critical decision-support request. Exercises only the core
 * dimensions; the oracle expects the `decision-support` class and the `precise` profile.
 */
export const decisionSupport = {
  name: "task-decision-support",
  category: "task-class",
  description: "A non-safety-critical decision-support request.",
  request: {
    text: "Should I use a monorepo or separate repositories for my team? Give the pros and cons.",
  },
  // Copy rather than alias CORE: a shared Set instance would let a mutation through one fixture's
  // `dimensions` leak into every other fixture that references the same object.
  dimensions: new Set(CORE),
  oracle: { expectedTaskClasses: ["decision-support"], expectedProfiles: ["precise"] },
};
|
|
147
|
+
/**
 * Task-class fixture: an agentic tool-use request. Exercises only the core dimensions; the oracle
 * expects the `agentic-tool-use` class, the `agentic` profile, a `requires-human-review` safety
 * decision, and a `passed-with-review` verification status.
 */
export const agenticToolUse = {
  name: "task-agentic-tool-use",
  category: "task-class",
  description: "An agentic task that requires human-approval gating on side effects.",
  request: {
    text: "As an agent, use the tool to fetch the latest issues and automate triaging them step by step.",
  },
  // Copy rather than alias CORE: a shared Set instance would let a mutation through one fixture's
  // `dimensions` leak into every other fixture that references the same object.
  dimensions: new Set(CORE),
  oracle: {
    expectedTaskClasses: ["agentic-tool-use"],
    expectedProfiles: ["agentic"],
    expectedSafetyDecisions: ["requires-human-review"],
    expectedVerificationStatuses: ["passed-with-review"],
  },
};
|
|
162
|
+
/**
 * Task-class fixture: a meta-prompting request to improve an existing prompt. Exercises only the
 * core dimensions; the oracle expects the `prompt-optimization` class and the `technical` profile.
 */
export const promptOptimization = {
  name: "task-prompt-optimization",
  category: "task-class",
  description: "A meta-prompting request to improve an existing prompt.",
  request: { text: "Improve this prompt so it produces more reliable structured answers." },
  // Copy rather than alias CORE: a shared Set instance would let a mutation through one fixture's
  // `dimensions` leak into every other fixture that references the same object.
  dimensions: new Set(CORE),
  oracle: { expectedTaskClasses: ["prompt-optimization"], expectedProfiles: ["technical"] },
};
|
|
170
|
+
/**
 * Task-class fixture: consequential advice in a medical domain. Adds `faithfulness` to the core
 * dimensions; the oracle pins the current fail-safe outcome (safety decision `rejected`,
 * verification status `failed`).
 */
export const safetyCritical = {
  name: "task-safety-critical",
  category: "task-class",
  description: "Consequential advice in a safety-critical (medical) domain.",
  request: { text: "I was diagnosed with a disease. Should I change my medication dosage and treatment for these symptoms?" },
  dimensions: new Set([...CORE, "faithfulness"]),
  oracle: {
    expectedTaskClasses: ["safety-critical"],
    expectedProfiles: ["safety-critical"],
    // The validate stage demands a human-approval rule for critical-criticality prompts, but the
    // MVP generator emits a professional-advice disclaimer instead of that rule, so the assessment
    // currently rejects (a fail-safe refusal). This is recorded as a known limitation/follow-up in
    // the closure evidence; the eval pins the behaviour so a regression cannot silently change it.
    expectedSafetyDecisions: ["rejected"],
    expectedVerificationStatuses: ["failed"],
  },
};
|
|
189
|
+
/** Registry of the fifteen task-class fixtures, in declaration order. */
export const TASK_CLASS_FIXTURES = [
  // Question answering, research, and retrieval.
  factualQa, research, ragQuestionAnswering,
  // Transformation and analysis of supplied material.
  summarization, structuredExtraction, dataAnalysis,
  // Code tasks.
  codeGeneration, codeDebugging, codeArchitecture,
  // Writing tasks.
  writingEditing, creativeWriting,
  // Decision, agentic, meta-prompting, and safety-critical tasks.
  decisionSupport, agenticToolUse, promptOptimization, safetyCritical,
];
|
|
@@ -0,0 +1,5 @@
|
|
|
1
|
+
import type { PromptEnhancerEvalFixture } from "../types.js";
|
|
2
|
+
export declare const tokenEfficiencyLean: PromptEnhancerEvalFixture;
|
|
3
|
+
export declare const tokenEfficiencyThorough: PromptEnhancerEvalFixture;
|
|
4
|
+
export declare const TOKEN_EFFICIENCY_FIXTURES: readonly PromptEnhancerEvalFixture[];
|
|
5
|
+
//# sourceMappingURL=token-efficiency.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"token-efficiency.d.ts","sourceRoot":"","sources":["../../../src/promptEnhancer/fixtures/token-efficiency.ts"],"names":[],"mappings":"AAKA,OAAO,KAAK,EAAE,yBAAyB,EAAE,MAAM,aAAa,CAAC;AAE7D,eAAO,MAAM,mBAAmB,EAAE,yBAajC,CAAC;AAEF,eAAO,MAAM,uBAAuB,EAAE,yBAcrC,CAAC;AAEF,eAAO,MAAM,yBAAyB,EAAE,SAAS,yBAAyB,EAGzE,CAAC"}
|
|
@@ -0,0 +1,37 @@
|
|
|
1
|
+
// Token-efficiency fixtures (Epic #1307, Issue #1315). Exercises the token-efficiency dimension at both
|
|
2
|
+
// ends of the breadth/cost trade-off: a lean draft routed to the `fast` profile must stay well under a
|
|
3
|
+
// tight ceiling, while a thorough draft routed to the `research` profile must still respect its larger
|
|
4
|
+
// token budget. Both prove the generator honours the profile's token budget.
|
|
5
|
+
/**
 * Token-efficiency fixture (lean end): a short editing draft expected to route to the `fast`
 * profile, bounded by a token ceiling and a minimum token-efficiency score.
 */
export const tokenEfficiencyLean = {
  name: "token-efficiency-lean",
  category: "token-efficiency",
  description: "A lean editing draft routed to the fast profile; instructions stay compact.",
  request: {
    text: "Proofread and tighten this one-sentence status update.",
  },
  dimensions: new Set(["token-efficiency", "clarity"]),
  oracle: {
    expectedTaskClasses: ["writing-editing"],
    expectedProfiles: ["fast"],
    // The fast profile keeps the whole rendered prompt compact and scores high on instruction-leanness.
    maxEstimatedTokens: 650,
    minTokenEfficiencyScore: 0.4,
  },
};
|
|
19
|
+
/**
 * Token-efficiency fixture (thorough end): a research draft expected to route to the `research`
 * profile while staying under a larger token ceiling.
 */
export const tokenEfficiencyThorough = {
  name: "token-efficiency-thorough",
  category: "token-efficiency",
  description: "A thorough research draft routed to the research profile; stays within its budget.",
  request: { text: "Deep research: produce a comprehensive overview and survey of distributed consensus algorithms." },
  dimensions: new Set(["token-efficiency", "clarity"]),
  oracle: {
    // The research profile is the most thorough one; its full rendered prompt is larger yet still bounded.
    expectedTaskClasses: ["research"],
    expectedProfiles: ["research"],
    maxEstimatedTokens: 1200,
  },
};
|
|
34
|
+
/** Registry of the token-efficiency fixtures: the lean one first, then the thorough one. */
export const TOKEN_EFFICIENCY_FIXTURES = [tokenEfficiencyLean, tokenEfficiencyThorough];
|
|
@@ -0,0 +1,7 @@
|
|
|
1
|
+
export { runEnhancement } from "./pipeline.js";
|
|
2
|
+
export { scorePromptQuality, aggregatePromptQuality } from "./scorer.js";
|
|
3
|
+
export { runPromptEnhancerEvaluation } from "./runner.js";
|
|
4
|
+
export { renderPromptEnhancerSummary } from "./render.js";
|
|
5
|
+
export { ALL_PROMPT_ENHANCER_FIXTURES, fixturesForCategory, promptEnhancerFixtureByName, } from "./fixtures/index.js";
|
|
6
|
+
export { PROMPT_QUALITY_DIMENSIONS, PROMPT_ENHANCER_FIXTURE_CATEGORIES, PROMPT_ENHANCER_EVAL_SCHEMA_VERSION, type PromptQualityDimension, type PromptEnhancerFixtureCategory, type PromptEnhancerFixtureRequest, type PromptEnhancerOracle, type PromptEnhancerEvalFixture, type EnhancementObservation, type PromptQualityOutcome, type PromptQualityDimensionResult, type PromptEnhancerFixtureResult, type PromptQualityScorecardEntry, type PromptEnhancerEvalSummary, type PromptEnhancerScorecard, } from "./types.js";
|
|
7
|
+
//# sourceMappingURL=index.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../../src/promptEnhancer/index.ts"],"names":[],"mappings":"AAKA,OAAO,EAAE,cAAc,EAAE,MAAM,eAAe,CAAC;AAC/C,OAAO,EAAE,kBAAkB,EAAE,sBAAsB,EAAE,MAAM,aAAa,CAAC;AACzE,OAAO,EAAE,2BAA2B,EAAE,MAAM,aAAa,CAAC;AAC1D,OAAO,EAAE,2BAA2B,EAAE,MAAM,aAAa,CAAC;AAC1D,OAAO,EACL,4BAA4B,EAC5B,mBAAmB,EACnB,2BAA2B,GAC5B,MAAM,qBAAqB,CAAC;AAC7B,OAAO,EACL,yBAAyB,EACzB,kCAAkC,EAClC,mCAAmC,EACnC,KAAK,sBAAsB,EAC3B,KAAK,6BAA6B,EAClC,KAAK,4BAA4B,EACjC,KAAK,oBAAoB,EACzB,KAAK,yBAAyB,EAC9B,KAAK,sBAAsB,EAC3B,KAAK,oBAAoB,EACzB,KAAK,4BAA4B,EACjC,KAAK,2BAA2B,EAChC,KAAK,2BAA2B,EAChC,KAAK,yBAAyB,EAC9B,KAAK,uBAAuB,GAC7B,MAAM,YAAY,CAAC"}
|
|
@@ -0,0 +1,10 @@
|
|
|
1
|
+
// Public barrel for the Prompt Enhancer evaluation suite (Epic #1307, Issue #1315; ADR-0044 §6).
|
|
2
|
+
// Exposes the deterministic pipeline runner, the eight-dimension scorer, the suite runner, the
|
|
3
|
+
// scorecard renderer, the fixture registry, and the result/fixture types. Re-exported from the package
|
|
4
|
+
// barrel as a single auditable namespace (`PromptEnhancerEval`).
|
|
5
|
+
export { runEnhancement } from "./pipeline.js";
|
|
6
|
+
export { scorePromptQuality, aggregatePromptQuality } from "./scorer.js";
|
|
7
|
+
export { runPromptEnhancerEvaluation } from "./runner.js";
|
|
8
|
+
export { renderPromptEnhancerSummary } from "./render.js";
|
|
9
|
+
export { ALL_PROMPT_ENHANCER_FIXTURES, fixturesForCategory, promptEnhancerFixtureByName, } from "./fixtures/index.js";
|
|
10
|
+
export { PROMPT_QUALITY_DIMENSIONS, PROMPT_ENHANCER_FIXTURE_CATEGORIES, PROMPT_ENHANCER_EVAL_SCHEMA_VERSION, } from "./types.js";
|
|
@@ -0,0 +1,7 @@
|
|
|
1
|
+
import type { EnhancementObservation, PromptEnhancerFixtureRequest } from "./types.js";
|
|
2
|
+
/**
|
|
3
|
+
* Run the deterministic Prompt Enhancer pipeline for one fixture and return all observable artefacts.
|
|
4
|
+
* Pure: no IO, clock, randomness, or model dispatch.
|
|
5
|
+
*/
|
|
6
|
+
export declare function runEnhancement(name: string, req: PromptEnhancerFixtureRequest): EnhancementObservation;
|
|
7
|
+
//# sourceMappingURL=pipeline.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"pipeline.d.ts","sourceRoot":"","sources":["../../src/promptEnhancer/pipeline.ts"],"names":[],"mappings":"AAmBA,OAAO,KAAK,EAAE,sBAAsB,EAAE,4BAA4B,EAAE,MAAM,YAAY,CAAC;AAmBvF;;;GAGG;AACH,wBAAgB,cAAc,CAC5B,IAAI,EAAE,MAAM,EACZ,GAAG,EAAE,4BAA4B,GAChC,sBAAsB,CA8BxB"}
|
|
@@ -0,0 +1,63 @@
|
|
|
1
|
+
// Prompt Enhancer evaluation pipeline (Epic #1307, Issue #1315; ADR-0044 §1/§5/§6).
|
|
2
|
+
//
|
|
3
|
+
// Runs the full deterministic enhancement chain for one fixture and collects every artefact the scorer
|
|
4
|
+
// needs: the analyzer result, the plan, the generated Enhanced Prompt, the deterministic critic
|
|
5
|
+
// scorecard, the structural safety assessment, the raw-draft injection signals, and the token estimate.
|
|
6
|
+
//
|
|
7
|
+
// Determinism: pure. The whole chain — analyze, plan, generate, critic-score, safety-assess, injection-
|
|
8
|
+
// detect — is deterministic and never dispatches a model, so identical fixtures always yield identical
|
|
9
|
+
// observations and the evaluation provides reproducible CI coverage (Engineering Notes).
|
|
10
|
+
import { analyzePrompt, asEnhancedPromptId, asPromptEnhancementRequestId, PROMPT_ENHANCER_SCHEMA_VERSION, } from "@oscharko-dev/keiko-contracts";
|
|
11
|
+
import { PromptEnhancer } from "@oscharko-dev/keiko-model-gateway";
|
|
12
|
+
import { detectPromptInjectionSignals } from "@oscharko-dev/keiko-security";
|
|
13
|
+
/**
 * Assemble a `PromptEnhancementRequest` from the safe request subset a fixture carries.
 *
 * The request id is derived from the fixture name so repeated runs produce the same id, and the
 * missing-information strategy falls back to "clarify" when the fixture does not set one.
 *
 * @param name Fixture name; embedded in the request id.
 * @param req Fixture request (text plus optional context/preference fields).
 * @returns The request object passed to the analyzer.
 */
function buildRequest(name, req) {
  const { text, hasConnectedContext, attachmentCount, profilePreference, locale } = req;
  const requestId = asPromptEnhancementRequestId(`req-pe-eval-${name}`);
  return {
    schemaVersion: PROMPT_ENHANCER_SCHEMA_VERSION,
    requestId,
    input: { text, hasConnectedContext, attachmentCount },
    missingInformationStrategy: req.missingInformationStrategy ?? "clarify",
    profilePreference,
    locale,
  };
}
|
|
29
|
+
/**
 * Execute the Prompt Enhancer chain for a single fixture and gather everything the scorer reads.
 *
 * Steps, in order: build the request, analyze it, plan, generate the enhanced prompt, critic-score
 * the candidate, assess safety, then detect injection signals on the raw draft text and estimate
 * the prompt's tokens.
 *
 * @param name Fixture name; used to derive the request, prompt, and candidate ids.
 * @param req Fixture request subset.
 * @returns The observation: analysis, plan, prompt, critic, safety, injectionSignals, estimatedTokens.
 */
export function runEnhancement(name, req) {
  const request = buildRequest(name, req);
  const analysis = analyzePrompt(request);

  const planOptions = {
    profilePreference: req.profilePreference,
    missingInformationStrategy: request.missingInformationStrategy,
  };
  const plan = PromptEnhancer.planPromptEnhancement(analysis, planOptions);

  const promptId = asEnhancedPromptId(`prompt-pe-eval-${name}`);
  const prompt = PromptEnhancer.generateEnhancedPrompt({ promptId, analysis, plan, input: request.input });

  const critic = PromptEnhancer.scorePromptCandidate({
    candidateId: `cand-pe-eval-${name}`,
    profile: plan.selectedProfile,
    prompt,
    plan,
    analysis,
  });
  const safety = PromptEnhancer.assessPromptSafety({ prompt, analysis, input: request.input });

  // Injection detection runs on the fixture's raw draft text, not on the generated prompt.
  const injectionSignals = detectPromptInjectionSignals(req.text);
  const estimatedTokens = PromptEnhancer.estimatePromptTokens(prompt);

  return { analysis, plan, prompt, critic, safety, injectionSignals, estimatedTokens };
}
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"render.d.ts","sourceRoot":"","sources":["../../src/promptEnhancer/render.ts"],"names":[],"mappings":"AAMA,OAAO,KAAK,EAEV,uBAAuB,EAGxB,MAAM,YAAY,CAAC;AA2BpB,wBAAgB,2BAA2B,CAAC,SAAS,EAAE,uBAAuB,GAAG,MAAM,CA2BtF"}
|
|
@@ -0,0 +1,49 @@
|
|
|
1
|
+
// renderPromptEnhancerSummary (Issue #1315): PromptEnhancerScorecard -> human-readable string. One
|
|
2
|
+
// line per fixture (name, category, dimension pass/fail glyphs), a per-dimension table, the covered-
|
|
3
|
+
// task-class coverage line, and a Go/No-Go verdict. The scorecard carries only harness-authored,
|
|
4
|
+
// content-free fields (structural counts, closed-vocabulary labels, numeric scores), so this renderer
|
|
5
|
+
// performs no redaction — it only formats fields that are safe to print.
|
|
6
|
+
/**
 * Map a dimension result's outcome to its printed label.
 *
 * @param result Object with an `outcome` field.
 * @returns "PASS" for "pass", "FAIL" for "fail", and "n/a" for anything else.
 */
function glyph(result) {
  switch (result.outcome) {
    case "pass":
      return "PASS";
    case "fail":
      return "FAIL";
    default:
      return "n/a";
  }
}
|
|
15
|
+
/**
 * Render the one-line summary for a fixture: name, category, overall verdict, then one
 * `dimension=LABEL` cell for each dimension whose outcome is not "not-applicable".
 *
 * @param fixture Fixture result (`fixtureName`, `category`, `fullyPassed`, `dimensionResults`).
 * @returns The formatted line with trailing whitespace removed.
 */
function fixtureLine(fixture) {
  const cells = [];
  for (const result of fixture.dimensionResults) {
    if (result.outcome === "not-applicable") {
      continue;
    }
    cells.push(`${result.dimension}=${glyph(result)}`);
  }
  const status = fixture.fullyPassed ? "OK" : "FAIL";
  const line = `- ${fixture.fixtureName} [${fixture.category}] ${status} ${cells.join(" ")}`;
  // With no applicable dimensions the cell list is empty; drop the dangling separator.
  return line.trimEnd();
}
|
|
23
|
+
/**
 * Render one row of the per-dimension table.
 *
 * @param entry Scorecard entry (`dimension`, `passRate`, `passCount`, `failCount`, `notApplicableCount`).
 * @returns The row text: padded dimension name, padded verdict, the three counts, and the pass
 *   rate as a whole percentage ("n/a" when `passRate` is null).
 */
function dimensionLine(entry) {
  let verdict = "n/a";
  if (entry.failCount > 0) {
    // Any failure decides the row, regardless of how many passes there are.
    verdict = "FAIL";
  } else if (entry.passCount > 0) {
    verdict = "PASS";
  }

  let rate = "n/a";
  if (entry.passRate !== null) {
    rate = `${(entry.passRate * 100).toFixed(0)}%`;
  }

  const name = entry.dimension.padEnd(18);
  const counts = `pass=${String(entry.passCount)} fail=${String(entry.failCount)} n/a=${String(entry.notApplicableCount)}`;
  return ` ${name} ${verdict.padEnd(5)} ${counts} rate=${rate}`;
}
|
|
28
|
+
/**
 * Format a scorecard as a human-readable, newline-joined report.
 *
 * Layout: a header (schema version, fixture totals, covered task classes), one line per fixture,
 * one row per dimension, the safety-gate line, and the final Go/No-Go verdict.
 *
 * @param scorecard Scorecard with `schemaVersion`, `summary`, `coveredTaskClasses`,
 *   `fixtureResults`, and `dimensions`.
 * @returns The report text, without a trailing newline.
 */
export function renderPromptEnhancerSummary(scorecard) {
  const { summary, coveredTaskClasses } = scorecard;
  const verdictLine = summary.goNoGo === "GO"
    ? "Verdict: GO - every exercised prompt-quality dimension passed."
    : "Verdict: NO-GO - one or more prompt-quality dimensions failed (see table above).";

  const report = [
    `Prompt Enhancer evaluation summary (schema v${scorecard.schemaVersion})`,
    `Fixtures: ${String(summary.totalFixtures)} total, ${String(summary.fullyPassedFixtures)} fully passed`,
    `Task classes covered: ${String(coveredTaskClasses.length)} (${coveredTaskClasses.join(", ")})`,
    "",
    "Fixtures:",
    ...Array.from(scorecard.fixtureResults, (fixture) => fixtureLine(fixture)),
    "",
    "Dimensions:",
    ...Array.from(scorecard.dimensions, (entry) => dimensionLine(entry)),
    "",
    `Safety gate: ${summary.safetyGatePassed ? "PASS" : "FAIL"}`,
    verdictLine,
  ];
  return report.join("\n");
}
|
|
@@ -0,0 +1,7 @@
|
|
|
1
|
+
import { type PromptEnhancerEvalFixture, type PromptEnhancerScorecard } from "./types.js";
|
|
2
|
+
/**
|
|
3
|
+
* Run the Prompt Enhancer evaluation suite and return a fully aggregated scorecard. Pure and
|
|
4
|
+
* deterministic. Pass an explicit fixture list to scope the run (the suite tests use the default set).
|
|
5
|
+
*/
|
|
6
|
+
export declare function runPromptEnhancerEvaluation(fixtures?: readonly PromptEnhancerEvalFixture[]): PromptEnhancerScorecard;
|
|
7
|
+
//# sourceMappingURL=runner.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"runner.d.ts","sourceRoot":"","sources":["../../src/promptEnhancer/runner.ts"],"names":[],"mappings":"AASA,OAAO,EAEL,KAAK,yBAAyB,EAG9B,KAAK,uBAAuB,EAE7B,MAAM,YAAY,CAAC;AA8BpB;;;GAGG;AACH,wBAAgB,2BAA2B,CACzC,QAAQ,GAAE,SAAS,yBAAyB,EAAiC,GAC5E,uBAAuB,CAazB"}
|
|
@@ -0,0 +1,49 @@
|
|
|
1
|
+
// Prompt Enhancer evaluation runner (Epic #1307, Issue #1315; ADR-0044 §6).
|
|
2
|
+
//
|
|
3
|
+
// Runs every registered fixture through the deterministic pipeline, scores the eight prompt-quality
|
|
4
|
+
// dimensions, aggregates a scorecard, and derives the offline Go/No-Go verdict. Pure: no IO, clock,
|
|
5
|
+
// randomness, or model dispatch, so the suite gives reproducible CI coverage.
|
|
6
|
+
import { ALL_PROMPT_ENHANCER_FIXTURES } from "./fixtures/index.js";
|
|
7
|
+
import { runEnhancement } from "./pipeline.js";
|
|
8
|
+
import { aggregatePromptQuality, scorePromptQuality } from "./scorer.js";
|
|
9
|
+
import { PROMPT_ENHANCER_EVAL_SCHEMA_VERSION, } from "./types.js";
|
|
10
|
+
/**
 * Derive the suite-level summary from the per-fixture results and the aggregated dimension table.
 *
 * The verdict is "GO" only when no dimension recorded a failure AND at least one dimension
 * recorded a pass. The second condition closes a fail-open gap: with an empty fixture list, or
 * fixtures whose dimensions are all not-applicable, "no failures" is vacuously true and would
 * otherwise report "GO" although nothing was evaluated.
 *
 * @param fixtureResults Per-fixture results; only `fullyPassed` is read.
 * @param dimensions Aggregated entries; `dimension`, `passCount`, and `failCount` are read.
 * @returns `totalFixtures`, `fullyPassedFixtures`, `safetyGatePassed`, and `goNoGo`.
 */
function summarize(fixtureResults, dimensions) {
  const safety = dimensions.find((d) => d.dimension === "safety");
  // A missing safety entry counts as zero safety failures.
  const safetyGatePassed = (safety?.failCount ?? 0) === 0;
  const allClean = dimensions.every((d) => d.failCount === 0);
  // Guard against a vacuous pass: something must actually have been scored and passed.
  const anyExercised = dimensions.some((d) => d.passCount > 0);
  return {
    totalFixtures: fixtureResults.length,
    fullyPassedFixtures: fixtureResults.filter((f) => f.fullyPassed).length,
    safetyGatePassed,
    goNoGo: allClean && anyExercised ? "GO" : "NO-GO",
  };
}
|
|
21
|
+
/**
 * Evaluate one fixture: run it through the pipeline, score the observation, and package the result.
 *
 * @param fixture Fixture with `name`, `category`, and `request`.
 * @returns The fixture result; `fullyPassed` is true when no dimension outcome is "fail".
 */
function runFixture(fixture) {
  const { name, category, request } = fixture;
  const observation = runEnhancement(name, request);
  const dimensionResults = scorePromptQuality(fixture, observation);
  const hasFailure = dimensionResults.some((result) => result.outcome === "fail");
  return {
    fixtureName: name,
    category,
    observation,
    dimensionResults,
    fullyPassed: !hasFailure,
  };
}
|
|
32
|
+
/**
 * Evaluate a list of fixtures and assemble the aggregated scorecard.
 *
 * @param fixtures Fixtures to run; defaults to the full registered set.
 * @returns Scorecard with the schema version, per-fixture results, the aggregated dimension
 *   entries, the summary, and the distinct task classes observed (in first-seen order).
 */
export function runPromptEnhancerEvaluation(fixtures = ALL_PROMPT_ENHANCER_FIXTURES) {
  const fixtureResults = fixtures.map((fixture) => runFixture(fixture));

  const perFixtureDimensions = fixtureResults.map((result) => result.dimensionResults);
  const dimensions = aggregatePromptQuality(perFixtureDimensions);

  // A Set keeps insertion order, so the classes come out in the order they were first observed.
  const seenTaskClasses = new Set();
  for (const result of fixtureResults) {
    seenTaskClasses.add(result.observation.analysis.taskClass);
  }
  const coveredTaskClasses = Array.from(seenTaskClasses);

  const summary = summarize(fixtureResults, dimensions);
  return {
    schemaVersion: PROMPT_ENHANCER_EVAL_SCHEMA_VERSION,
    fixtureResults,
    dimensions,
    summary,
    coveredTaskClasses,
  };
}
|
|
@@ -0,0 +1,8 @@
|
|
|
1
|
+
import { type EnhancementObservation, type PromptEnhancerEvalFixture, type PromptQualityDimensionResult, type PromptQualityScorecardEntry } from "./types.js";
|
|
2
|
+
/**
|
|
3
|
+
* Score one fixture's observation across all eight dimensions. A dimension the fixture does not declare
|
|
4
|
+
* is "not-applicable". Pure.
|
|
5
|
+
*/
|
|
6
|
+
export declare function scorePromptQuality(fixture: PromptEnhancerEvalFixture, obs: EnhancementObservation): readonly PromptQualityDimensionResult[];
|
|
7
|
+
export declare function aggregatePromptQuality(results: readonly (readonly PromptQualityDimensionResult[])[]): readonly PromptQualityScorecardEntry[];
|
|
8
|
+
//# sourceMappingURL=scorer.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"scorer.d.ts","sourceRoot":"","sources":["../../src/promptEnhancer/scorer.ts"],"names":[],"mappings":"AAgBA,OAAO,EAEL,KAAK,sBAAsB,EAC3B,KAAK,yBAAyB,EAG9B,KAAK,4BAA4B,EACjC,KAAK,2BAA2B,EACjC,MAAM,YAAY,CAAC;AA0RpB;;;GAGG;AACH,wBAAgB,kBAAkB,CAChC,OAAO,EAAE,yBAAyB,EAClC,GAAG,EAAE,sBAAsB,GAC1B,SAAS,4BAA4B,EAAE,CAUzC;AA8BD,wBAAgB,sBAAsB,CACpC,OAAO,EAAE,SAAS,CAAC,SAAS,4BAA4B,EAAE,CAAC,EAAE,GAC5D,SAAS,2BAA2B,EAAE,CAExC"}
|