daftari 1.15.0 → 1.16.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +20 -0
- package/README.md +8 -2
- package/dist/cli.d.ts.map +1 -1
- package/dist/cli.js +6 -0
- package/dist/cli.js.map +1 -1
- package/dist/eval/generate.d.ts +12 -0
- package/dist/eval/generate.d.ts.map +1 -0
- package/dist/eval/generate.js +221 -0
- package/dist/eval/generate.js.map +1 -0
- package/dist/eval/index.d.ts +2 -0
- package/dist/eval/index.d.ts.map +1 -0
- package/dist/eval/index.js +311 -0
- package/dist/eval/index.js.map +1 -0
- package/dist/eval/llm.d.ts +47 -0
- package/dist/eval/llm.d.ts.map +1 -0
- package/dist/eval/llm.js +165 -0
- package/dist/eval/llm.js.map +1 -0
- package/dist/eval/prompts.d.ts +5 -0
- package/dist/eval/prompts.d.ts.map +1 -0
- package/dist/eval/prompts.js +44 -0
- package/dist/eval/prompts.js.map +1 -0
- package/dist/eval/run.d.ts +13 -0
- package/dist/eval/run.d.ts.map +1 -0
- package/dist/eval/run.js +78 -0
- package/dist/eval/run.js.map +1 -0
- package/dist/eval/score.d.ts +12 -0
- package/dist/eval/score.d.ts.map +1 -0
- package/dist/eval/score.js +154 -0
- package/dist/eval/score.js.map +1 -0
- package/dist/eval/storage.d.ts +10 -0
- package/dist/eval/storage.d.ts.map +1 -0
- package/dist/eval/storage.js +69 -0
- package/dist/eval/storage.js.map +1 -0
- package/dist/eval/subgraph.d.ts +17 -0
- package/dist/eval/subgraph.d.ts.map +1 -0
- package/dist/eval/subgraph.js +214 -0
- package/dist/eval/subgraph.js.map +1 -0
- package/dist/eval/tool-surface.d.ts +7 -0
- package/dist/eval/tool-surface.d.ts.map +1 -0
- package/dist/eval/tool-surface.js +160 -0
- package/dist/eval/tool-surface.js.map +1 -0
- package/dist/eval/types.d.ts +173 -0
- package/dist/eval/types.d.ts.map +1 -0
- package/dist/eval/types.js +44 -0
- package/dist/eval/types.js.map +1 -0
- package/package.json +2 -1
|
@@ -0,0 +1,311 @@
|
|
|
1
|
+
// src/eval/index.ts
|
|
2
|
+
// Top-level CLI dispatcher for `daftari eval`. Parses flags, routes to
|
|
3
|
+
// generate/run/score/top-level, translates Result<T, CortexEvalError> to exit
|
|
4
|
+
// codes (2 = config, 3 = runtime/llm).
|
|
5
|
+
import { createHash } from "node:crypto";
|
|
6
|
+
import { resolve } from "node:path";
|
|
7
|
+
import { generateQuestions } from "./generate.js";
|
|
8
|
+
import { createAnthropicClient } from "./llm.js";
|
|
9
|
+
import { PROMPT_VERSION } from "./prompts.js";
|
|
10
|
+
import { runAnswerer } from "./run.js";
|
|
11
|
+
import { aggregateScore, gradeAnswer } from "./score.js";
|
|
12
|
+
import { appendHistory, readQuestionSet, readResults, writeQuestionSet, writeResults, writeScore, } from "./storage.js";
|
|
13
|
+
import { sampleSubgraph } from "./subgraph.js";
|
|
14
|
+
import { SPEC_VERSION, TIERS, } from "./types.js";
|
|
15
|
+
// Usage text printed by `daftari eval --help` / `-h`.
// --questions (run) and --results (score) are shown without brackets because
// runRun / runScore exit with code 2 when they are missing. The top-level
// form lists --model because runTopLevel reads that flag.
const HELP = `daftari eval — cortex quality metric.

Usage:
daftari eval [--vault <path>] [--n <count>] [--k <count>] [--seed <str>] [--model <id>]
daftari eval generate [--vault <path>] [--n <count>] [--seed <str>]
daftari eval run --questions <id> [--vault <path>] [--model <id>] [--k <count>] [--resume <results-id>]
daftari eval score --results <id> [--vault <path>] [--grader-model <id>]

(--questions and --results take the artifact id printed by a prior stage,
not a file path; artifacts live under .daftari/eval/.)

Defaults:
--n 15 total questions across three tiers (5 each)
--k 2 runs per question for variance estimation
--model claude-sonnet-4-6 (DEFAULT_MODEL in src/eval/index.ts)
--vault current working directory

Environment:
ANTHROPIC_API_KEY required for any LLM-mediated stage

Disk usage:
.daftari/eval/results/ and scores/ grow without bound across runs. v1
recovery is a manual rm -rf .daftari/eval/results/; rerunning regenerates
what's needed. A daftari eval prune command is the planned v2 follow-up.

Exit codes:
0 — eval completed
2 — config error (missing API key, bad flags, no vault)
3 — runtime/LLM error (retries exhausted, vault I/O failure)
`;
|
|
45
|
+
/**
 * CLI entry point for `daftari eval`.
 *
 * Prints the help text and returns 0 when `--help` or `-h` appears anywhere in
 * `argv`. Otherwise the first token selects a stage (`generate`, `run`,
 * `score`); any other first token (or none) runs the full pipeline with the
 * complete, unshifted `argv`.
 *
 * @param argv arguments following `daftari eval`.
 * @returns the process exit code produced by the selected stage, or 2 when a
 *          stage throws.
 */
export async function runEval(argv) {
    const helpRequested = ["--help", "-h"].some((token) => argv.includes(token));
    if (helpRequested) {
        process.stdout.write(HELP);
        return 0;
    }
    // This function is the CLI boundary, so nothing may escape as an exception:
    // a malformed integer flag (intFlag throws) or any other unexpected error is
    // printed to stderr and reported as a config error (exit code 2).
    try {
        const [mode, ...rest] = argv;
        if (mode === "generate") {
            return await runGenerate(rest);
        }
        if (mode === "run") {
            return await runRun(rest);
        }
        if (mode === "score") {
            return await runScore(rest);
        }
        // No recognised subcommand: the first token is a flag (or absent), so the
        // top-level pipeline receives argv untouched.
        return await runTopLevel(argv);
    }
    catch (failure) {
        const text = failure instanceof Error ? failure.message : String(failure);
        process.stderr.write(`${text}\n`);
        return 2;
    }
}
|
|
70
|
+
/**
 * Looks up the value of a `--name value` style flag.
 *
 * @param argv argument tokens to search.
 * @param name flag name without the leading `--`.
 * @returns the token that follows the first occurrence of `--name`, or
 *          `undefined` when the flag is absent or is the last token.
 */
function flag(argv, name) {
    const i = argv.indexOf(`--${name}`);
    if (i < 0 || i + 1 >= argv.length)
        return undefined;
    return argv[i + 1];
}
/**
 * Reads an integer-valued flag.
 *
 * The whole token must be an optionally signed run of decimal digits. A bare
 * `parseInt` would accept any numeric prefix ("5abc" -> 5, "1.5" -> 1) and so
 * silently run with a value the user did not type.
 *
 * @param argv argument tokens to search.
 * @param name flag name without the leading `--`.
 * @param def value returned when the flag is absent (or has no value token).
 * @returns the parsed integer, or `def`.
 * @throws Error with message `--<name> must be an integer` when the value is
 *         not a whole decimal integer.
 */
function intFlag(argv, name, def) {
    const raw = flag(argv, name);
    if (raw === undefined)
        return def;
    // Surrounding whitespace is tolerated, as parseInt tolerated it before.
    const token = raw.trim();
    if (!/^[+-]?\d+$/.test(token))
        throw new Error(`--${name} must be an integer`);
    return Number.parseInt(token, 10);
}
|
|
85
|
+
/**
 * Derives a short identifier for a vault from its location.
 *
 * @param vault path to the vault; it is resolved to an absolute path first.
 * @returns the first 12 hex characters of the SHA-256 digest of the resolved
 *          path string (the path text is hashed, not the vault's contents).
 */
function vaultHash(vault) {
    const absolutePath = resolve(vault);
    const hasher = createHash("sha256");
    hasher.update(absolutePath);
    const hexDigest = hasher.digest("hex");
    return hexDigest.substring(0, 12);
}
|
|
88
|
+
/**
 * Builds the seed used when `--seed` is not given.
 *
 * @param vault path to the vault.
 * @returns `<vaultHash>-<date>`, where the date is the first 10 characters of
 *          the current time's ISO-8601 string (so the default seed changes
 *          when that date changes).
 */
function defaultSeed(vault) {
    const isoNow = new Date().toISOString();
    const today = isoNow.substring(0, 10);
    const hash = vaultHash(vault);
    return `${hash}-${today}`;
}
|
|
92
|
+
// Model id used by `eval generate`, and the fallback for --model (run and
// top-level) and --grader-model (score) when those flags are not supplied.
const DEFAULT_MODEL = "claude-sonnet-4-6";
|
|
93
|
+
/**
 * `daftari eval generate`: samples a subgraph from the vault, asks the LLM for
 * a question set, stamps it with a timestamp and id, and writes it to storage.
 *
 * Flags read: --vault (default: cwd), --n (default 15), --seed (default:
 * defaultSeed(vault)). The generator model is always DEFAULT_MODEL.
 *
 * @param argv arguments following the `generate` subcommand.
 * @returns 0 on success, 2 when ANTHROPIC_API_KEY is missing, 3 when sampling
 *          or question generation reports an error.
 */
async function runGenerate(argv) {
    // Writes one message to stderr and hands back the exit code to return.
    const fail = (code, message) => {
        process.stderr.write(message);
        return code;
    };
    if (!process.env.ANTHROPIC_API_KEY) {
        return fail(2, "ANTHROPIC_API_KEY required\n");
    }
    const vault = flag(argv, "vault") ?? process.cwd();
    const questionCount = intFlag(argv, "n", 15);
    const seed = flag(argv, "seed") ?? defaultSeed(vault);
    const subgraph = await sampleSubgraph(vault, seed, { maxNodes: 5 });
    if (!subgraph.ok) {
        return fail(3, `${subgraph.error.message}\n`);
    }
    const llm = createAnthropicClient();
    const generated = await generateQuestions(subgraph.value, llm, {
        n: questionCount,
        model: DEFAULT_MODEL,
        vaultHash: vaultHash(vault),
        seed,
    });
    if (!generated.ok) {
        return fail(3, `${generated.error.message}\n`);
    }
    // The id embeds the timestamp, so the timestamp has to be assigned first.
    const questionSet = generated.value;
    questionSet.timestamp = new Date().toISOString();
    questionSet.id = `${questionSet.vault_hash}-${questionSet.seed}-${questionSet.timestamp}`;
    await writeQuestionSet(vault, questionSet);
    process.stdout.write(`wrote question set ${questionSet.id} (${questionSet.questions.length} questions)\n`);
    return 0;
}
|
|
123
|
+
/**
 * `daftari eval run`: answers every question of a stored question set `k`
 * times and writes the results artifact.
 *
 * Flags read: --questions (required), --vault (default: cwd), --k (default 2),
 * --model (default DEFAULT_MODEL), --resume <results-id>.
 *
 * When --resume is given, the referenced results file must be readable. If it
 * is not, the command stops with exit code 3 instead of silently starting a
 * fresh run under a new id and repeating every LLM call.
 *
 * @param argv arguments following the `run` subcommand.
 * @returns 0 on success, 2 for a missing API key or --questions flag, 3 when
 *          the question set or resume file cannot be read or the answerer
 *          reports an error.
 */
async function runRun(argv) {
    if (!process.env.ANTHROPIC_API_KEY) {
        process.stderr.write("ANTHROPIC_API_KEY required\n");
        return 2;
    }
    const vault = flag(argv, "vault") ?? process.cwd();
    const questionsId = flag(argv, "questions");
    if (!questionsId) {
        process.stderr.write("--questions required\n");
        return 2;
    }
    const k = intFlag(argv, "k", 2);
    const model = flag(argv, "model") ?? DEFAULT_MODEL;
    const qsRead = await readQuestionSet(vault, questionsId);
    if (!qsRead.ok) {
        process.stderr.write(`${qsRead.error.message}\n`);
        return 3;
    }
    let resumeFrom;
    const resumeId = flag(argv, "resume");
    if (resumeId) {
        const prior = await readResults(vault, resumeId);
        if (!prior.ok) {
            // An explicit --resume that cannot be honoured is an error, not a
            // cue to start over.
            process.stderr.write(`cannot resume from ${resumeId}: ${prior.error.message}\n`);
            return 3;
        }
        resumeFrom = prior.value;
    }
    // Mint the stable id + timestamp up front so the on-disk file path is stable
    // across the run and any later --resume; persist incrementally so a mid-run
    // failure leaves a resumable partial file.
    const timestamp = new Date().toISOString();
    const runId = resumeFrom ? resumeFrom.id : `${qsRead.value.id}-${model}-${timestamp}`;
    const client = createAnthropicClient();
    const run = await runAnswerer(qsRead.value, vault, client, {
        k,
        model,
        resumeFrom,
        runId,
        timestamp,
        persist: (r) => writeResults(vault, r),
    });
    if (!run.ok) {
        process.stderr.write(`${run.error.message}\n`);
        process.stderr.write(`partial results saved as ${runId}; resume with: daftari eval run --questions ${questionsId} --resume ${runId}\n`);
        return 3;
    }
    await writeResults(vault, run.value); // final write (covers the zero-question edge where persist never fired)
    process.stdout.write(`wrote results ${run.value.id}\n`);
    return 0;
}
|
|
171
|
+
/**
 * `daftari eval score`: grades every completed answer of a results artifact,
 * aggregates the grades, writes the score file, appends a history entry and
 * prints the headline score plus per-tier means.
 *
 * Flags read: --results (required), --vault (default: cwd), --grader-model
 * (default DEFAULT_MODEL).
 *
 * Runs that are not complete, have no trace, or point at a question index
 * missing from the question set are skipped. An answer whose grading call
 * fails is left out of the aggregate and reported on stderr, so a score
 * computed from fewer grades than expected is visible to the user.
 *
 * @param argv arguments following the `score` subcommand.
 * @returns 0 on success, 2 for a missing --results flag or API key, 3 when
 *          the results or question set cannot be read.
 */
async function runScore(argv) {
    const vault = flag(argv, "vault") ?? process.cwd();
    const resultsId = flag(argv, "results");
    if (!resultsId) {
        process.stderr.write("--results required\n");
        return 2;
    }
    const graderModel = flag(argv, "grader-model") ?? DEFAULT_MODEL;
    if (!process.env.ANTHROPIC_API_KEY) {
        process.stderr.write("ANTHROPIC_API_KEY required\n");
        return 2;
    }
    const runRead = await readResults(vault, resultsId);
    if (!runRead.ok) {
        process.stderr.write(`${runRead.error.message}\n`);
        return 3;
    }
    const run = runRead.value;
    // The results artifact records which question set it answered.
    const qsRead = await readQuestionSet(vault, run.questions_id);
    if (!qsRead.ok) {
        process.stderr.write(`${qsRead.error.message}\n`);
        return 3;
    }
    const qs = qsRead.value;
    const grader = createAnthropicClient();
    const grades = [];
    const traces = new Map();
    let ungraded = 0;
    for (const pr of Object.values(run.runs)) {
        if (pr.status !== "complete" || !pr.trace)
            continue;
        const q = qs.questions[pr.question_index];
        if (!q)
            continue;
        const g = await gradeAnswer(q, pr.question_index, pr.k_index, pr.trace, grader, {
            model: graderModel,
        });
        if (!g.ok) {
            // Keep going: one failed grade should not discard the rest, but it
            // must not disappear silently either.
            ungraded += 1;
            process.stderr.write(`warning: grading failed for question ${pr.question_index} (k=${pr.k_index}): ${g.error.message}\n`);
            continue;
        }
        grades.push(g.value);
        traces.set(`${q.id}:${pr.k_index}`, pr.trace);
    }
    if (ungraded > 0) {
        process.stderr.write(`warning: ${ungraded} answer(s) could not be graded and are excluded from the score\n`);
    }
    const score = aggregateScore(grades, qs.questions, { traces });
    // Provenance fields are filled in here.
    score.models = {
        generator: qs.generator_model,
        answerer: run.answerer_model,
        grader: graderModel,
    };
    score.prompt_version = PROMPT_VERSION;
    score.spec_version = SPEC_VERSION;
    score.questions_id = qs.id;
    score.results_id = run.id;
    score.vault_hash = qs.vault_hash;
    score.k = run.k;
    score.n = qs.questions.length;
    score.timestamp = new Date().toISOString();
    await writeScore(vault, score);
    // History keeps a condensed record: per-tier means only, keyed by the
    // results id.
    const histEntry = {
        score_id: score.results_id,
        score: score.score,
        score_std: score.score_std,
        by_tier: {
            retrieval: score.by_tier.retrieval.mean,
            cross_reference: score.by_tier.cross_reference.mean,
            contradiction: score.by_tier.contradiction.mean,
        },
        vault_hash: score.vault_hash,
        timestamp: score.timestamp,
        n: score.n,
        k: score.k,
        models: score.models,
        prompt_version: score.prompt_version,
        spec_version: score.spec_version,
    };
    await appendHistory(vault, histEntry);
    // Pretty-print headline + per-tier means.
    process.stdout.write(`score: ${score.score.toFixed(3)} ± ${score.score_std.toFixed(3)}\n`);
    for (const t of TIERS) {
        const ts = score.by_tier[t];
        process.stdout.write(` ${t.padEnd(16)}: ${ts.mean.toFixed(3)} (n=${ts.n}, efficiency=${ts.trace_efficiency.toFixed(1)} calls)\n`);
    }
    return 0;
}
|
|
253
|
+
/**
 * `daftari eval` with no subcommand: generate -> run -> score in one shot.
 *
 * Flags read: --vault (default: cwd), --n (default 15), --k (default 2),
 * --seed (default: defaultSeed(vault)), --model (default DEFAULT_MODEL). The
 * same model is used as generator, answerer and grader.
 *
 * The question set and the results are written to storage as each stage
 * finishes, so a failure part-way through leaves the artifacts of the stages
 * that succeeded on disk for inspection or `--resume`.
 *
 * @param argv the full argument list passed to `daftari eval`.
 * @returns 2 when ANTHROPIC_API_KEY is missing, 3 when sampling, generation or
 *          answering reports an error, otherwise the exit code of runScore.
 */
async function runTopLevel(argv) {
    if (!process.env.ANTHROPIC_API_KEY) {
        process.stderr.write("ANTHROPIC_API_KEY required\n");
        return 2;
    }
    const vault = flag(argv, "vault") ?? process.cwd();
    const questionCount = intFlag(argv, "n", 15);
    const runsPerQuestion = intFlag(argv, "k", 2);
    const seed = flag(argv, "seed") ?? defaultSeed(vault);
    const model = flag(argv, "model") ?? DEFAULT_MODEL;

    // Stage 1: generate the question set.
    const subgraph = await sampleSubgraph(vault, seed, { maxNodes: 5 });
    if (!subgraph.ok) {
        process.stderr.write(`${subgraph.error.message}\n`);
        return 3;
    }
    const llm = createAnthropicClient();
    const generated = await generateQuestions(subgraph.value, llm, {
        n: questionCount,
        model,
        vaultHash: vaultHash(vault),
        seed,
    });
    if (!generated.ok) {
        process.stderr.write(`${generated.error.message}\n`);
        return 3;
    }
    const questionSet = generated.value;
    // The id embeds the timestamp, so the timestamp is assigned first.
    questionSet.timestamp = new Date().toISOString();
    questionSet.id = `${questionSet.vault_hash}-${questionSet.seed}-${questionSet.timestamp}`;
    await writeQuestionSet(vault, questionSet);
    process.stdout.write(`generated ${questionSet.questions.length} questions (id=${questionSet.id})\n`);

    // Stage 2: answer. The run id and timestamp are fixed before the first call
    // and results are persisted incrementally, so an interrupted run leaves a
    // partial file that `eval run --resume` can pick up.
    const startedAt = new Date().toISOString();
    const runId = `${questionSet.id}-${model}-${startedAt}`;
    const answered = await runAnswerer(questionSet, vault, llm, {
        k: runsPerQuestion,
        model,
        runId,
        timestamp: startedAt,
        persist: (partial) => writeResults(vault, partial),
    });
    if (!answered.ok) {
        process.stderr.write(`${answered.error.message}\n`);
        process.stderr.write(`partial results saved as ${runId}; resume with: daftari eval run --questions ${questionSet.id} --resume ${runId}\n`);
        return 3;
    }
    const results = answered.value;
    // Final write: covers the zero-question case where persist never fired.
    await writeResults(vault, results);
    process.stdout.write(`ran ${Object.keys(results.runs).length} answerer invocations (id=${results.id})\n`);

    // Stage 3: score, by delegating to runScore with equivalent flags.
    const scoreArgs = ["--vault", vault, "--results", results.id, "--grader-model", model];
    return await runScore(scoreArgs);
}
|
|
311
|
+
//# sourceMappingURL=index.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"index.js","sourceRoot":"","sources":["../../src/eval/index.ts"],"names":[],"mappings":"AAAA,oBAAoB;AACpB,uEAAuE;AACvE,8EAA8E;AAC9E,uCAAuC;AAEvC,OAAO,EAAE,UAAU,EAAE,MAAM,aAAa,CAAC;AACzC,OAAO,EAAE,OAAO,EAAE,MAAM,WAAW,CAAC;AACpC,OAAO,EAAE,iBAAiB,EAAE,MAAM,eAAe,CAAC;AAClD,OAAO,EAAE,qBAAqB,EAAE,MAAM,UAAU,CAAC;AACjD,OAAO,EAAE,cAAc,EAAE,MAAM,cAAc,CAAC;AAC9C,OAAO,EAAE,WAAW,EAAE,MAAM,UAAU,CAAC;AACvC,OAAO,EAAE,cAAc,EAAE,WAAW,EAAE,MAAM,YAAY,CAAC;AACzD,OAAO,EACL,aAAa,EACb,eAAe,EACf,WAAW,EACX,gBAAgB,EAChB,YAAY,EACZ,UAAU,GACX,MAAM,cAAc,CAAC;AACtB,OAAO,EAAE,cAAc,EAAE,MAAM,eAAe,CAAC;AAC/C,OAAO,EAIL,YAAY,EACZ,KAAK,GAEN,MAAM,YAAY,CAAC;AAEpB,MAAM,IAAI,GAAG;;;;;;;;;;;;;;;;;;;;;;;;;;;;;CA6BZ,CAAC;AAEF,MAAM,CAAC,KAAK,UAAU,OAAO,CAAC,IAAc;IAC1C,IAAI,IAAI,CAAC,QAAQ,CAAC,QAAQ,CAAC,IAAI,IAAI,CAAC,QAAQ,CAAC,IAAI,CAAC,EAAE,CAAC;QACnD,OAAO,CAAC,MAAM,CAAC,KAAK,CAAC,IAAI,CAAC,CAAC;QAC3B,OAAO,CAAC,CAAC;IACX,CAAC;IACD,qEAAqE;IACrE,0EAA0E;IAC1E,IAAI,CAAC;QACH,MAAM,CAAC,IAAI,EAAE,GAAG,IAAI,CAAC,GAAG,IAAI,CAAC;QAC7B,QAAQ,IAAI,EAAE,CAAC;YACb,KAAK,UAAU;gBACb,OAAO,MAAM,WAAW,CAAC,IAAI,CAAC,CAAC;YACjC,KAAK,KAAK;gBACR,OAAO,MAAM,MAAM,CAAC,IAAI,CAAC,CAAC;YAC5B,KAAK,OAAO;gBACV,OAAO,MAAM,QAAQ,CAAC,IAAI,CAAC,CAAC;YAC9B;gBACE,OAAO,MAAM,WAAW,CAAC,IAAI,CAAC,CAAC;QACnC,CAAC;IACH,CAAC;IAAC,OAAO,CAAC,EAAE,CAAC;QACX,OAAO,CAAC,MAAM,CAAC,KAAK,CAAC,GAAG,CAAC,YAAY,KAAK,CAAC,CAAC,CAAC,CAAC,CAAC,OAAO,CAAC,CAAC,CAAC,MAAM,CAAC,CAAC,CAAC,IAAI,CAAC,CAAC;QACxE,OAAO,CAAC,CAAC;IACX,CAAC;AACH,CAAC;AAED,SAAS,IAAI,CAAC,IAAc,EAAE,IAAY;IACxC,MAAM,CAAC,GAAG,IAAI,CAAC,OAAO,CAAC,KAAK,IAAI,EAAE,CAAC,CAAC;IACpC,IAAI,CAAC,GAAG,CAAC,IAAI,CAAC,GAAG,CAAC,IAAI,IAAI,CAAC,MAAM;QAAE,OAAO,SAAS,CAAC;IACpD,OAAO,IAAI,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC;AACrB,CAAC;AACD,SAAS,OAAO,CAAC,IAAc,EAAE,IAAY,EAAE,GAAW;IACxD,MAAM,CAAC,GAAG,IAAI,CAAC,IAAI,EAAE,IAAI,CAAC,CAAC;IAC3B,IAAI,CAAC,KAAK,SAAS;QAAE,OAAO,GAAG,CAAC;IAChC,MAAM,CAAC,GAAG,QAAQ,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC;IAC1B,IAAI,MAAM,CAAC,KAAK,CAAC,CAAC,CAAC;QAAE,MAAM,IAAI,KAAK,CAAC,KAAK,IAAI,qB
AAqB,CAAC,CAAC;IACrE,OAAO,CAAC,CAAC;AACX,CAAC;AAED,SAAS,SAAS,CAAC,KAAa;IAC9B,OAAO,UAAU,CAAC,QAAQ,CAAC,CAAC,MAAM,CAAC,OAAO,CAAC,KAAK,CAAC,CAAC,CAAC,MAAM,CAAC,KAAK,CAAC,CAAC,KAAK,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC;AAChF,CAAC;AAED,SAAS,WAAW,CAAC,KAAa;IAChC,MAAM,KAAK,GAAG,IAAI,IAAI,EAAE,CAAC,WAAW,EAAE,CAAC,KAAK,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC;IACpD,OAAO,GAAG,SAAS,CAAC,KAAK,CAAC,IAAI,KAAK,EAAE,CAAC;AACxC,CAAC;AAED,MAAM,aAAa,GAAG,mBAAmB,CAAC;AAE1C,KAAK,UAAU,WAAW,CAAC,IAAc;IACvC,IAAI,CAAC,OAAO,CAAC,GAAG,CAAC,iBAAiB,EAAE,CAAC;QACnC,OAAO,CAAC,MAAM,CAAC,KAAK,CAAC,8BAA8B,CAAC,CAAC;QACrD,OAAO,CAAC,CAAC;IACX,CAAC;IACD,MAAM,KAAK,GAAG,IAAI,CAAC,IAAI,EAAE,OAAO,CAAC,IAAI,OAAO,CAAC,GAAG,EAAE,CAAC;IACnD,MAAM,CAAC,GAAG,OAAO,CAAC,IAAI,EAAE,GAAG,EAAE,EAAE,CAAC,CAAC;IACjC,MAAM,IAAI,GAAG,IAAI,CAAC,IAAI,EAAE,MAAM,CAAC,IAAI,WAAW,CAAC,KAAK,CAAC,CAAC;IAEtD,MAAM,EAAE,GAAG,MAAM,cAAc,CAAC,KAAK,EAAE,IAAI,EAAE,EAAE,QAAQ,EAAE,CAAC,EAAE,CAAC,CAAC;IAC9D,IAAI,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC;QACX,OAAO,CAAC,MAAM,CAAC,KAAK,CAAC,GAAG,EAAE,CAAC,KAAK,CAAC,OAAO,IAAI,CAAC,CAAC;QAC9C,OAAO,CAAC,CAAC;IACX,CAAC;IACD,MAAM,MAAM,GAAG,qBAAqB,EAAE,CAAC;IACvC,MAAM,EAAE,GAAG,MAAM,iBAAiB,CAAC,EAAE,CAAC,KAAK,EAAE,MAAM,EAAE;QACnD,CAAC;QACD,KAAK,EAAE,aAAa;QACpB,SAAS,EAAE,SAAS,CAAC,KAAK,CAAC;QAC3B,IAAI;KACL,CAAC,CAAC;IACH,IAAI,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC;QACX,OAAO,CAAC,MAAM,CAAC,KAAK,CAAC,GAAG,EAAE,CAAC,KAAK,CAAC,OAAO,IAAI,CAAC,CAAC;QAC9C,OAAO,CAAC,CAAC;IACX,CAAC;IACD,EAAE,CAAC,KAAK,CAAC,SAAS,GAAG,IAAI,IAAI,EAAE,CAAC,WAAW,EAAE,CAAC;IAC9C,EAAE,CAAC,KAAK,CAAC,EAAE,GAAG,GAAG,EAAE,CAAC,KAAK,CAAC,UAAU,IAAI,EAAE,CAAC,KAAK,CAAC,IAAI,IAAI,EAAE,CAAC,KAAK,CAAC,SAAS,EAAE,CAAC;IAC9E,MAAM,gBAAgB,CAAC,KAAK,EAAE,EAAE,CAAC,KAAK,CAAC,CAAC;IACxC,OAAO,CAAC,MAAM,CAAC,KAAK,CAClB,sBAAsB,EAAE,CAAC,KAAK,CAAC,EAAE,KAAK,EAAE,CAAC,KAAK,CAAC,SAAS,CAAC,MAAM,eAAe,CAC/E,CAAC;IACF,OAAO,CAAC,CAAC;AACX,CAAC;AAED,KAAK,UAAU,MAAM,CAAC,IAAc;IAClC,IAAI,CAAC,OAAO,CAAC,GAAG,CAAC,iBAAiB,EAAE,CAAC;QACnC,OAAO,CAAC,MAAM,CAAC,KAAK,CAAC,8BAA8B,CAAC,CAAC;QACrD,OAAO,CAAC,CAAC;IACX
,CAAC;IACD,MAAM,KAAK,GAAG,IAAI,CAAC,IAAI,EAAE,OAAO,CAAC,IAAI,OAAO,CAAC,GAAG,EAAE,CAAC;IACnD,MAAM,WAAW,GAAG,IAAI,CAAC,IAAI,EAAE,WAAW,CAAC,CAAC;IAC5C,IAAI,CAAC,WAAW,EAAE,CAAC;QACjB,OAAO,CAAC,MAAM,CAAC,KAAK,CAAC,wBAAwB,CAAC,CAAC;QAC/C,OAAO,CAAC,CAAC;IACX,CAAC;IACD,MAAM,CAAC,GAAG,OAAO,CAAC,IAAI,EAAE,GAAG,EAAE,CAAC,CAAC,CAAC;IAChC,MAAM,KAAK,GAAG,IAAI,CAAC,IAAI,EAAE,OAAO,CAAC,IAAI,aAAa,CAAC;IAEnD,MAAM,MAAM,GAAG,MAAM,eAAe,CAAC,KAAK,EAAE,WAAW,CAAC,CAAC;IACzD,IAAI,CAAC,MAAM,CAAC,EAAE,EAAE,CAAC;QACf,OAAO,CAAC,MAAM,CAAC,KAAK,CAAC,GAAG,MAAM,CAAC,KAAK,CAAC,OAAO,IAAI,CAAC,CAAC;QAClD,OAAO,CAAC,CAAC;IACX,CAAC;IAED,IAAI,UAA+B,CAAC;IACpC,MAAM,QAAQ,GAAG,IAAI,CAAC,IAAI,EAAE,QAAQ,CAAC,CAAC;IACtC,IAAI,QAAQ,EAAE,CAAC;QACb,MAAM,CAAC,GAAG,MAAM,WAAW,CAAC,KAAK,EAAE,QAAQ,CAAC,CAAC;QAC7C,IAAI,CAAC,CAAC,EAAE;YAAE,UAAU,GAAG,CAAC,CAAC,KAAK,CAAC;IACjC,CAAC;IAED,6EAA6E;IAC7E,4EAA4E;IAC5E,2CAA2C;IAC3C,MAAM,SAAS,GAAG,IAAI,IAAI,EAAE,CAAC,WAAW,EAAE,CAAC;IAC3C,MAAM,KAAK,GAAG,UAAU,CAAC,CAAC,CAAC,UAAU,CAAC,EAAE,CAAC,CAAC,CAAC,GAAG,MAAM,CAAC,KAAK,CAAC,EAAE,IAAI,KAAK,IAAI,SAAS,EAAE,CAAC;IACtF,MAAM,MAAM,GAAG,qBAAqB,EAAE,CAAC;IACvC,MAAM,GAAG,GAAG,MAAM,WAAW,CAAC,MAAM,CAAC,KAAK,EAAE,KAAK,EAAE,MAAM,EAAE;QACzD,CAAC;QACD,KAAK;QACL,UAAU;QACV,KAAK;QACL,SAAS;QACT,OAAO,EAAE,CAAC,CAAC,EAAE,EAAE,CAAC,YAAY,CAAC,KAAK,EAAE,CAAC,CAAC;KACvC,CAAC,CAAC;IACH,IAAI,CAAC,GAAG,CAAC,EAAE,EAAE,CAAC;QACZ,OAAO,CAAC,MAAM,CAAC,KAAK,CAAC,GAAG,GAAG,CAAC,KAAK,CAAC,OAAO,IAAI,CAAC,CAAC;QAC/C,OAAO,CAAC,MAAM,CAAC,KAAK,CAClB,4BAA4B,KAAK,+CAA+C,WAAW,aAAa,KAAK,IAAI,CAClH,CAAC;QACF,OAAO,CAAC,CAAC;IACX,CAAC;IACD,MAAM,YAAY,CAAC,KAAK,EAAE,GAAG,CAAC,KAAK,CAAC,CAAC,CAAC,wEAAwE;IAC9G,OAAO,CAAC,MAAM,CAAC,KAAK,CAAC,iBAAiB,GAAG,CAAC,KAAK,CAAC,EAAE,IAAI,CAAC,CAAC;IACxD,OAAO,CAAC,CAAC;AACX,CAAC;AAED,KAAK,UAAU,QAAQ,CAAC,IAAc;IACpC,MAAM,KAAK,GAAG,IAAI,CAAC,IAAI,EAAE,OAAO,CAAC,IAAI,OAAO,CAAC,GAAG,EAAE,CAAC;IACnD,MAAM,SAAS,GAAG,IAAI,CAAC,IAAI,EAAE,SAAS,CAAC,CAAC;IACxC,IAAI,CAAC,SAAS,EAAE,CAAC;QACf,OAAO,CAAC,MAAM,CAAC,KAAK,CAAC,sBAAsB,CAAC,CAAC;QAC7C,OAAO,CAAC,CAAC
;IACX,CAAC;IACD,MAAM,WAAW,GAAG,IAAI,CAAC,IAAI,EAAE,cAAc,CAAC,IAAI,aAAa,CAAC;IAChE,IAAI,CAAC,OAAO,CAAC,GAAG,CAAC,iBAAiB,EAAE,CAAC;QACnC,OAAO,CAAC,MAAM,CAAC,KAAK,CAAC,8BAA8B,CAAC,CAAC;QACrD,OAAO,CAAC,CAAC;IACX,CAAC;IAED,MAAM,OAAO,GAAG,MAAM,WAAW,CAAC,KAAK,EAAE,SAAS,CAAC,CAAC;IACpD,IAAI,CAAC,OAAO,CAAC,EAAE,EAAE,CAAC;QAChB,OAAO,CAAC,MAAM,CAAC,KAAK,CAAC,GAAG,OAAO,CAAC,KAAK,CAAC,OAAO,IAAI,CAAC,CAAC;QACnD,OAAO,CAAC,CAAC;IACX,CAAC;IACD,MAAM,GAAG,GAAG,OAAO,CAAC,KAAK,CAAC;IAC1B,MAAM,MAAM,GAAG,MAAM,eAAe,CAAC,KAAK,EAAE,GAAG,CAAC,YAAY,CAAC,CAAC;IAC9D,IAAI,CAAC,MAAM,CAAC,EAAE,EAAE,CAAC;QACf,OAAO,CAAC,MAAM,CAAC,KAAK,CAAC,GAAG,MAAM,CAAC,KAAK,CAAC,OAAO,IAAI,CAAC,CAAC;QAClD,OAAO,CAAC,CAAC;IACX,CAAC;IACD,MAAM,EAAE,GAAG,MAAM,CAAC,KAAK,CAAC;IAExB,MAAM,MAAM,GAAG,qBAAqB,EAAE,CAAC;IACvC,MAAM,MAAM,GAAY,EAAE,CAAC;IAC3B,MAAM,MAAM,GAAG,IAAI,GAAG,EAAiB,CAAC;IACxC,KAAK,MAAM,CAAC,EAAE,EAAE,CAAC,IAAI,MAAM,CAAC,OAAO,CAAC,GAAG,CAAC,IAAI,CAAC,EAAE,CAAC;QAC9C,IAAI,EAAE,CAAC,MAAM,KAAK,UAAU,IAAI,CAAC,EAAE,CAAC,KAAK;YAAE,SAAS;QACpD,MAAM,CAAC,GAAG,EAAE,CAAC,SAAS,CAAC,EAAE,CAAC,cAAc,CAAC,CAAC;QAC1C,IAAI,CAAC,CAAC;YAAE,SAAS;QACjB,MAAM,CAAC,GAAG,MAAM,WAAW,CAAC,CAAC,EAAE,EAAE,CAAC,cAAc,EAAE,EAAE,CAAC,OAAO,EAAE,EAAE,CAAC,KAAK,EAAE,MAAM,EAAE;YAC9E,KAAK,EAAE,WAAW;SACnB,CAAC,CAAC;QACH,IAAI,CAAC,CAAC,EAAE,EAAE,CAAC;YACT,MAAM,CAAC,IAAI,CAAC,CAAC,CAAC,KAAK,CAAC,CAAC;YACrB,MAAM,CAAC,GAAG,CAAC,GAAG,CAAC,CAAC,EAAE,IAAI,EAAE,CAAC,OAAO,EAAE,EAAE,EAAE,CAAC,KAAK,CAAC,CAAC;QAChD,CAAC;IACH,CAAC;IACD,MAAM,KAAK,GAAG,cAAc,CAAC,MAAM,EAAE,EAAE,CAAC,SAAS,EAAE,EAAE,MAAM,EAAE,CAAC,CAAC;IAC/D,KAAK,CAAC,MAAM,GAAG;QACb,SAAS,EAAE,EAAE,CAAC,eAAe;QAC7B,QAAQ,EAAE,GAAG,CAAC,cAAc;QAC5B,MAAM,EAAE,WAAW;KACpB,CAAC;IACF,KAAK,CAAC,cAAc,GAAG,cAAc,CAAC;IACtC,KAAK,CAAC,YAAY,GAAG,YAAY,CAAC;IAClC,KAAK,CAAC,YAAY,GAAG,EAAE,CAAC,EAAE,CAAC;IAC3B,KAAK,CAAC,UAAU,GAAG,GAAG,CAAC,EAAE,CAAC;IAC1B,KAAK,CAAC,UAAU,GAAG,EAAE,CAAC,UAAU,CAAC;IACjC,KAAK,CAAC,CAAC,GAAG,GAAG,CAAC,CAAC,CAAC;IAChB,KAAK,CAAC,CAAC,GAAG,EAAE,CAAC,SAAS,CAAC,MAAM,CAAC;IAC9B,KAAK,CAAC,SAAS,GAAG,
IAAI,IAAI,EAAE,CAAC,WAAW,EAAE,CAAC;IAC3C,MAAM,UAAU,CAAC,KAAK,EAAE,KAAK,CAAC,CAAC;IAE/B,MAAM,SAAS,GAAiB;QAC9B,QAAQ,EAAE,KAAK,CAAC,UAAU;QAC1B,KAAK,EAAE,KAAK,CAAC,KAAK;QAClB,SAAS,EAAE,KAAK,CAAC,SAAS;QAC1B,OAAO,EAAE;YACP,SAAS,EAAE,KAAK,CAAC,OAAO,CAAC,SAAS,CAAC,IAAI;YACvC,eAAe,EAAE,KAAK,CAAC,OAAO,CAAC,eAAe,CAAC,IAAI;YACnD,aAAa,EAAE,KAAK,CAAC,OAAO,CAAC,aAAa,CAAC,IAAI;SAChD;QACD,UAAU,EAAE,KAAK,CAAC,UAAU;QAC5B,SAAS,EAAE,KAAK,CAAC,SAAS;QAC1B,CAAC,EAAE,KAAK,CAAC,CAAC;QACV,CAAC,EAAE,KAAK,CAAC,CAAC;QACV,MAAM,EAAE,KAAK,CAAC,MAAM;QACpB,cAAc,EAAE,KAAK,CAAC,cAAc;QACpC,YAAY,EAAE,KAAK,CAAC,YAAY;KACjC,CAAC;IACF,MAAM,aAAa,CAAC,KAAK,EAAE,SAAS,CAAC,CAAC;IAEtC,0CAA0C;IAC1C,OAAO,CAAC,MAAM,CAAC,KAAK,CAAC,UAAU,KAAK,CAAC,KAAK,CAAC,OAAO,CAAC,CAAC,CAAC,MAAM,KAAK,CAAC,SAAS,CAAC,OAAO,CAAC,CAAC,CAAC,IAAI,CAAC,CAAC;IAC3F,KAAK,MAAM,CAAC,IAAI,KAAK,EAAE,CAAC;QACtB,MAAM,EAAE,GAAG,KAAK,CAAC,OAAO,CAAC,CAAC,CAAC,CAAC;QAC5B,OAAO,CAAC,MAAM,CAAC,KAAK,CAClB,KAAK,CAAC,CAAC,MAAM,CAAC,EAAE,CAAC,KAAK,EAAE,CAAC,IAAI,CAAC,OAAO,CAAC,CAAC,CAAC,OAAO,EAAE,CAAC,CAAC,gBAAgB,EAAE,CAAC,gBAAgB,CAAC,OAAO,CAAC,CAAC,CAAC,WAAW,CAC7G,CAAC;IACJ,CAAC;IACD,OAAO,CAAC,CAAC;AACX,CAAC;AAED,KAAK,UAAU,WAAW,CAAC,IAAc;IACvC,4EAA4E;IAC5E,qEAAqE;IACrE,mEAAmE;IACnE,qCAAqC;IACrC,IAAI,CAAC,OAAO,CAAC,GAAG,CAAC,iBAAiB,EAAE,CAAC;QACnC,OAAO,CAAC,MAAM,CAAC,KAAK,CAAC,8BAA8B,CAAC,CAAC;QACrD,OAAO,CAAC,CAAC;IACX,CAAC;IACD,MAAM,KAAK,GAAG,IAAI,CAAC,IAAI,EAAE,OAAO,CAAC,IAAI,OAAO,CAAC,GAAG,EAAE,CAAC;IACnD,MAAM,CAAC,GAAG,OAAO,CAAC,IAAI,EAAE,GAAG,EAAE,EAAE,CAAC,CAAC;IACjC,MAAM,CAAC,GAAG,OAAO,CAAC,IAAI,EAAE,GAAG,EAAE,CAAC,CAAC,CAAC;IAChC,MAAM,IAAI,GAAG,IAAI,CAAC,IAAI,EAAE,MAAM,CAAC,IAAI,WAAW,CAAC,KAAK,CAAC,CAAC;IACtD,MAAM,KAAK,GAAG,IAAI,CAAC,IAAI,EAAE,OAAO,CAAC,IAAI,aAAa,CAAC;IAEnD,cAAc;IACd,MAAM,EAAE,GAAG,MAAM,cAAc,CAAC,KAAK,EAAE,IAAI,EAAE,EAAE,QAAQ,EAAE,CAAC,EAAE,CAAC,CAAC;IAC9D,IAAI,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC;QACX,OAAO,CAAC,MAAM,CAAC,KAAK,CAAC,GAAG,EAAE,CAAC,KAAK,CAAC,OAAO,IAAI,CAAC,CAAC;QAC9C,OAAO,CAAC,CAAC;IACX,CAAC;IACD,MAAM,SAAS,GAAG,qBAAqB,EAAE,CAAC
;IAC1C,MAAM,KAAK,GAAG,MAAM,iBAAiB,CAAC,EAAE,CAAC,KAAK,EAAE,SAAS,EAAE;QACzD,CAAC;QACD,KAAK;QACL,SAAS,EAAE,SAAS,CAAC,KAAK,CAAC;QAC3B,IAAI;KACL,CAAC,CAAC;IACH,IAAI,CAAC,KAAK,CAAC,EAAE,EAAE,CAAC;QACd,OAAO,CAAC,MAAM,CAAC,KAAK,CAAC,GAAG,KAAK,CAAC,KAAK,CAAC,OAAO,IAAI,CAAC,CAAC;QACjD,OAAO,CAAC,CAAC;IACX,CAAC;IACD,MAAM,EAAE,GAAG,KAAK,CAAC,KAAK,CAAC;IACvB,EAAE,CAAC,SAAS,GAAG,IAAI,IAAI,EAAE,CAAC,WAAW,EAAE,CAAC;IACxC,EAAE,CAAC,EAAE,GAAG,GAAG,EAAE,CAAC,UAAU,IAAI,EAAE,CAAC,IAAI,IAAI,EAAE,CAAC,SAAS,EAAE,CAAC;IACtD,MAAM,gBAAgB,CAAC,KAAK,EAAE,EAAE,CAAC,CAAC;IAClC,OAAO,CAAC,MAAM,CAAC,KAAK,CAAC,aAAa,EAAE,CAAC,SAAS,CAAC,MAAM,kBAAkB,EAAE,CAAC,EAAE,KAAK,CAAC,CAAC;IAEnF,6EAA6E;IAC7E,wDAAwD;IACxD,MAAM,YAAY,GAAG,IAAI,IAAI,EAAE,CAAC,WAAW,EAAE,CAAC;IAC9C,MAAM,KAAK,GAAG,GAAG,EAAE,CAAC,EAAE,IAAI,KAAK,IAAI,YAAY,EAAE,CAAC;IAClD,MAAM,MAAM,GAAG,MAAM,WAAW,CAAC,EAAE,EAAE,KAAK,EAAE,SAAS,EAAE;QACrD,CAAC;QACD,KAAK;QACL,KAAK;QACL,SAAS,EAAE,YAAY;QACvB,OAAO,EAAE,CAAC,CAAC,EAAE,EAAE,CAAC,YAAY,CAAC,KAAK,EAAE,CAAC,CAAC;KACvC,CAAC,CAAC;IACH,IAAI,CAAC,MAAM,CAAC,EAAE,EAAE,CAAC;QACf,OAAO,CAAC,MAAM,CAAC,KAAK,CAAC,GAAG,MAAM,CAAC,KAAK,CAAC,OAAO,IAAI,CAAC,CAAC;QAClD,OAAO,CAAC,MAAM,CAAC,KAAK,CAClB,4BAA4B,KAAK,+CAA+C,EAAE,CAAC,EAAE,aAAa,KAAK,IAAI,CAC5G,CAAC;QACF,OAAO,CAAC,CAAC;IACX,CAAC;IACD,MAAM,GAAG,GAAG,MAAM,CAAC,KAAK,CAAC;IACzB,MAAM,YAAY,CAAC,KAAK,EAAE,GAAG,CAAC,CAAC,CAAC,wEAAwE;IACxG,OAAO,CAAC,MAAM,CAAC,KAAK,CAAC,OAAO,MAAM,CAAC,IAAI,CAAC,GAAG,CAAC,IAAI,CAAC,CAAC,MAAM,6BAA6B,GAAG,CAAC,EAAE,KAAK,CAAC,CAAC;IAElG,sEAAsE;IACtE,OAAO,MAAM,QAAQ,CAAC,CAAC,SAAS,EAAE,KAAK,EAAE,WAAW,EAAE,GAAG,CAAC,EAAE,EAAE,gBAAgB,EAAE,KAAK,CAAC,CAAC,CAAC;AAC1F,CAAC"}
|
|
@@ -0,0 +1,47 @@
|
|
|
1
|
+
import { type Result } from "../frontmatter/types.js";
|
|
2
|
+
import type { CortexEvalError } from "./types.js";
|
|
3
|
+
/**
 * Options for a single-turn completion request.
 *
 * - `model`: model id sent with the request.
 * - `system`: system prompt.
 * - `user`: content of the single user message.
 * - `maxTokens`: optional output-token cap; the Anthropic-backed client in
 *   llm.js uses 4096 when it is omitted.
 */
export interface CompleteOpts {
|
|
4
|
+
model: string;
|
|
5
|
+
system: string;
|
|
6
|
+
user: string;
|
|
7
|
+
maxTokens?: number;
|
|
8
|
+
}
|
|
9
|
+
/**
 * Options for `completeJson`.
 *
 * `schema` is serialized with JSON.stringify and appended to the system prompt
 * as a hint for the model. The Anthropic-backed client does not validate the
 * response against it; callers must check the parsed shape themselves.
 */
export interface CompleteJsonOpts extends CompleteOpts {
|
|
10
|
+
schema: any;
|
|
11
|
+
}
|
|
12
|
+
/**
 * Description of one tool offered to the model via `completeWithTools`.
 *
 * NOTE(review): `input_schema` is presumably a JSON Schema object for the
 * tool's input, following the Anthropic tool-definition format. Confirm
 * against llm.ts.
 */
export interface ToolDef {
|
|
13
|
+
name: string;
|
|
14
|
+
description: string;
|
|
15
|
+
input_schema: any;
|
|
16
|
+
}
|
|
17
|
+
/**
 * Options for `completeWithTools`.
 *
 * - `tools`: tool definitions made available for the request.
 * - `toolHandler`: async callback that receives a tool name and its input and
 *   resolves to that tool's output.
 * - `maxRounds`: NOTE(review): presumably caps the number of tool-use round
 *   trips; the default is not visible in this declaration. Confirm in llm.ts.
 */
export interface CompleteWithToolsOpts extends CompleteOpts {
|
|
18
|
+
tools: ToolDef[];
|
|
19
|
+
toolHandler: (name: string, input: unknown) => Promise<unknown>;
|
|
20
|
+
maxRounds?: number;
|
|
21
|
+
}
|
|
22
|
+
/**
 * Outcome of a completion.
 *
 * As produced by the Anthropic-backed `complete` in llm.js: `text` is the
 * concatenation of the response's text blocks, the token counts come from the
 * response's usage record, and `stop_reason` is "unknown" when the API reports
 * none.
 */
export interface CompleteResult {
|
|
23
|
+
text: string;
|
|
24
|
+
input_tokens: number;
|
|
25
|
+
output_tokens: number;
|
|
26
|
+
stop_reason: string;
|
|
27
|
+
}
|
|
28
|
+
/**
 * Outcome of `completeJson`.
 *
 * `parsed` is the JSON.parse result of the response text after
 * `stripCodeFence`. It is typed `unknown` because it has not been validated
 * against the requested schema.
 */
export interface CompleteJsonResult extends CompleteResult {
|
|
29
|
+
parsed: unknown;
|
|
30
|
+
}
|
|
31
|
+
/**
 * Outcome of `completeWithTools`: the completion fields plus a record of each
 * tool invocation (tool name, input, output, latency).
 *
 * NOTE(review): `tool_calls` is presumably in call order and `latency_ms` in
 * milliseconds, as the name suggests. Confirm in llm.ts.
 */
export interface CompleteWithToolsResult extends CompleteResult {
|
|
32
|
+
tool_calls: {
|
|
33
|
+
tool: string;
|
|
34
|
+
input: unknown;
|
|
35
|
+
output: unknown;
|
|
36
|
+
latency_ms: number;
|
|
37
|
+
}[];
|
|
38
|
+
}
|
|
39
|
+
/**
 * Provider-independent LLM client used by the eval modules, which depend on
 * this interface rather than on the SDK so they can be tested with mocks.
 *
 * Each method resolves to a `Result`, carrying either the completion data or a
 * `CortexEvalError`.
 */
export interface LlmClient {
|
|
40
|
+
complete(opts: CompleteOpts): Promise<Result<CompleteResult, CortexEvalError>>;
|
|
41
|
+
completeJson(opts: CompleteJsonOpts): Promise<Result<CompleteJsonResult, CortexEvalError>>;
|
|
42
|
+
completeWithTools(opts: CompleteWithToolsOpts): Promise<Result<CompleteWithToolsResult, CortexEvalError>>;
|
|
43
|
+
}
|
|
44
|
+
/**
 * Creates an `LlmClient` backed by `@anthropic-ai/sdk`.
 *
 * @throws Error when the ANTHROPIC_API_KEY environment variable is unset or empty.
 */
export declare function createAnthropicClient(): LlmClient;
|
|
45
|
+
/**
 * Runs `fn` and resolves to a `Result` of the same type.
 *
 * NOTE(review): the attempt count, backoff and which errors are retried are
 * not visible in this declaration. See llm.ts.
 */
export declare function retry<T>(fn: () => Promise<Result<T, CortexEvalError>>): Promise<Result<T, CortexEvalError>>;
|
|
46
|
+
/**
 * Applied to model output by `completeJson` before JSON.parse.
 *
 * NOTE(review): presumably removes a surrounding Markdown code fence from `s`;
 * the implementation is not visible here. Confirm in llm.ts.
 */
export declare function stripCodeFence(s: string): string;
|
|
47
|
+
//# sourceMappingURL=llm.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"llm.d.ts","sourceRoot":"","sources":["../../src/eval/llm.ts"],"names":[],"mappings":"AAMA,OAAO,EAAW,KAAK,MAAM,EAAE,MAAM,yBAAyB,CAAC;AAC/D,OAAO,KAAK,EAAE,eAAe,EAAE,MAAM,YAAY,CAAC;AAElD,MAAM,WAAW,YAAY;IAC3B,KAAK,EAAE,MAAM,CAAC;IACd,MAAM,EAAE,MAAM,CAAC;IACf,IAAI,EAAE,MAAM,CAAC;IACb,SAAS,CAAC,EAAE,MAAM,CAAC;CACpB;AAED,MAAM,WAAW,gBAAiB,SAAQ,YAAY;IAEpD,MAAM,EAAE,GAAG,CAAC;CACb;AAED,MAAM,WAAW,OAAO;IACtB,IAAI,EAAE,MAAM,CAAC;IACb,WAAW,EAAE,MAAM,CAAC;IAEpB,YAAY,EAAE,GAAG,CAAC;CACnB;AAED,MAAM,WAAW,qBAAsB,SAAQ,YAAY;IACzD,KAAK,EAAE,OAAO,EAAE,CAAC;IACjB,WAAW,EAAE,CAAC,IAAI,EAAE,MAAM,EAAE,KAAK,EAAE,OAAO,KAAK,OAAO,CAAC,OAAO,CAAC,CAAC;IAChE,SAAS,CAAC,EAAE,MAAM,CAAC;CACpB;AAED,MAAM,WAAW,cAAc;IAC7B,IAAI,EAAE,MAAM,CAAC;IACb,YAAY,EAAE,MAAM,CAAC;IACrB,aAAa,EAAE,MAAM,CAAC;IACtB,WAAW,EAAE,MAAM,CAAC;CACrB;AAED,MAAM,WAAW,kBAAmB,SAAQ,cAAc;IACxD,MAAM,EAAE,OAAO,CAAC;CACjB;AAED,MAAM,WAAW,uBAAwB,SAAQ,cAAc;IAC7D,UAAU,EAAE;QAAE,IAAI,EAAE,MAAM,CAAC;QAAC,KAAK,EAAE,OAAO,CAAC;QAAC,MAAM,EAAE,OAAO,CAAC;QAAC,UAAU,EAAE,MAAM,CAAA;KAAE,EAAE,CAAC;CACrF;AAED,MAAM,WAAW,SAAS;IACxB,QAAQ,CAAC,IAAI,EAAE,YAAY,GAAG,OAAO,CAAC,MAAM,CAAC,cAAc,EAAE,eAAe,CAAC,CAAC,CAAC;IAC/E,YAAY,CAAC,IAAI,EAAE,gBAAgB,GAAG,OAAO,CAAC,MAAM,CAAC,kBAAkB,EAAE,eAAe,CAAC,CAAC,CAAC;IAC3F,iBAAiB,CACf,IAAI,EAAE,qBAAqB,GAC1B,OAAO,CAAC,MAAM,CAAC,uBAAuB,EAAE,eAAe,CAAC,CAAC,CAAC;CAC9D;AAED,wBAAgB,qBAAqB,IAAI,SAAS,CAiIjD;AAWD,wBAAsB,KAAK,CAAC,CAAC,EAC3B,EAAE,EAAE,MAAM,OAAO,CAAC,MAAM,CAAC,CAAC,EAAE,eAAe,CAAC,CAAC,GAC5C,OAAO,CAAC,MAAM,CAAC,CAAC,EAAE,eAAe,CAAC,CAAC,CAuBrC;AAED,wBAAgB,cAAc,CAAC,CAAC,EAAE,MAAM,GAAG,MAAM,CAGhD"}
|
package/dist/eval/llm.js
ADDED
|
@@ -0,0 +1,165 @@
|
|
|
1
|
+
// src/eval/llm.ts
|
|
2
|
+
// Single-point wrapper around @anthropic-ai/sdk. Other eval modules depend
|
|
3
|
+
// on the LlmClient interface, not the SDK, so they can be unit-tested with
|
|
4
|
+
// hand-rolled mocks.
|
|
5
|
+
import Anthropic from "@anthropic-ai/sdk";
|
|
6
|
+
import { err, ok } from "../frontmatter/types.js";
|
|
7
|
+
/**
 * Creates the LLM client used by the eval modules, backed by the Anthropic
 * Messages API. The API key is read from the ANTHROPIC_API_KEY env var.
 *
 * @returns An object exposing `complete`, `completeJson` and
 *   `completeWithTools`. Each resolves to a Result: `ok(...)` carrying the
 *   response text, token counts and stop reason, or
 *   `err({ kind: "llm", message, retryable })`.
 * @throws {Error} If ANTHROPIC_API_KEY is unset or empty.
 */
export function createAnthropicClient() {
    const apiKey = process.env.ANTHROPIC_API_KEY;
    if (!apiKey)
        throw new Error("ANTHROPIC_API_KEY env var is required for daftari eval");
    const client = new Anthropic({ apiKey });

    // Joins the text of every "text" content block; other block types are skipped.
    const joinTextBlocks = (blocks) => blocks
        .filter((b) => b.type === "text")
        .map((b) => b.text)
        .join("");

    // Invokes the caller's tool handler for one tool_use block and renders the
    // result as tool_result content. Serialization happens inside the try so a
    // value JSON.stringify rejects (circular reference, BigInt) is reported to
    // the model as a tool error instead of rejecting completeWithTools, which
    // is expected to resolve to a Result.
    const runTool = async (opts, tu) => {
        try {
            const output = await opts.toolHandler(tu.name, tu.input);
            // JSON.stringify(undefined) is undefined; send an explicit "null".
            const content = typeof output === "string" ? output : (JSON.stringify(output) ?? "null");
            return { output, content, failed: false };
        }
        catch (e) {
            const output = { tool_error: e instanceof Error ? e.message : String(e) };
            return { output, content: JSON.stringify(output), failed: true };
        }
    };

    /**
     * Single-turn completion: one user message, no tools.
     * `max_tokens` defaults to 4096 when `opts.maxTokens` is not supplied.
     */
    const complete = async (opts) => {
        return retry(async () => {
            const res = await client.messages.create({
                model: opts.model,
                max_tokens: opts.maxTokens ?? 4096,
                system: opts.system,
                messages: [{ role: "user", content: opts.user }],
            });
            return ok({
                text: joinTextBlocks(res.content),
                input_tokens: res.usage.input_tokens,
                output_tokens: res.usage.output_tokens,
                stop_reason: res.stop_reason ?? "unknown",
            });
        });
    };

    /**
     * Completion whose text is parsed as JSON and returned as `parsed`.
     * A parse failure becomes a non-retryable "llm" error that quotes the
     * first 200 characters of the raw output.
     */
    const completeJson = async (opts) => {
        // The schema is embedded in the system prompt as a hint to the LLM, then
        // the response goes through JSON.parse + a manual shape check by the
        // caller (see generate.ts and score.ts). This is NOT strict JSON Schema
        // validation — there is no schema validator dep in v1. Callers must
        // verify required fields exist after parse. If we ever need strict
        // validation, add `ajv` and validate `parsed` here.
        const sysWithSchema = `${opts.system}\n\nReturn JSON matching:\n${JSON.stringify(opts.schema, null, 2)}\nReturn ONLY JSON, no prose.`;
        const r = await complete({ ...opts, system: sysWithSchema });
        if (!r.ok)
            return r;
        try {
            const parsed = JSON.parse(stripCodeFence(r.value.text));
            return ok({ ...r.value, parsed });
        }
        catch (e) {
            const msg = e instanceof Error ? e.message : String(e);
            return err({
                kind: "llm",
                message: `JSON parse: ${msg} — output was: ${r.value.text.slice(0, 200)}`,
                retryable: false,
            });
        }
    };

    /**
     * Tool-use loop: calls the model, executes every requested tool through
     * `opts.toolHandler`, feeds the results back, and repeats until a response
     * contains no tool_use block or `opts.maxRounds` (default 12) is reached.
     * Token counts are summed over all rounds; `tool_calls` records each
     * invocation with its input, output and latency in milliseconds.
     */
    const completeWithTools = async (opts) => {
        const maxRounds = opts.maxRounds ?? 12;
        const toolCalls = [];
        const messages = [
            { role: "user", content: opts.user },
        ];
        let totalIn = 0;
        let totalOut = 0;
        let lastStop = "unknown";
        for (let round = 0; round < maxRounds; round++) {
            const res = await retry(async () => ok(await client.messages.create({
                model: opts.model,
                max_tokens: opts.maxTokens ?? 4096,
                system: opts.system,
                // biome-ignore lint/suspicious/noExplicitAny: SDK types
                tools: opts.tools,
                // biome-ignore lint/suspicious/noExplicitAny: SDK types
                messages: messages,
            })));
            if (!res.ok)
                return res;
            const message = res.value;
            totalIn += message.usage.input_tokens;
            totalOut += message.usage.output_tokens;
            lastStop = message.stop_reason ?? "unknown";
            // biome-ignore lint/suspicious/noExplicitAny: SDK content union
            const blocks = message.content;
            const toolUses = blocks.filter((b) => b.type === "tool_use");
            if (toolUses.length === 0) {
                // No tool requested: this response is the final answer.
                return ok({
                    text: joinTextBlocks(blocks),
                    input_tokens: totalIn,
                    output_tokens: totalOut,
                    stop_reason: lastStop,
                    tool_calls: toolCalls,
                });
            }
            // Echo the assistant turn back verbatim so each tool_result below
            // can be matched to its tool_use id.
            messages.push({ role: "assistant", content: blocks });
            const toolResults = [];
            for (const tu of toolUses) {
                const t0 = Date.now();
                const { output, content, failed } = await runTool(opts, tu);
                const latency = Date.now() - t0;
                toolCalls.push({ tool: tu.name, input: tu.input, output, latency_ms: latency });
                toolResults.push({
                    type: "tool_result",
                    tool_use_id: tu.id,
                    content,
                    // Flag failures explicitly so the model does not have to
                    // infer them from the payload.
                    ...(failed ? { is_error: true } : {}),
                });
            }
            messages.push({ role: "user", content: toolResults });
        }
        return err({
            kind: "llm",
            message: `exceeded maxRounds (${maxRounds}) without final answer`,
            retryable: false,
        });
    };
    return { complete, completeJson, completeWithTools };
}
|
|
126
|
+
// --- helpers ---
|
|
127
|
+
// Upper bound on the number of attempts retry() makes, counting the first call.
const MAX_RETRIES = 5;
|
|
128
|
+
// Delay in milliseconds before the second attempt; retry() doubles it after each further failure.
const BASE_BACKOFF_MS = 500;
|
|
129
|
+
// Ceiling in milliseconds applied to the computed backoff delay.
const MAX_BACKOFF_MS = 60_000;
|
|
130
|
+
// Exported for unit testing — these two pure helpers carry the trickiest logic
|
|
131
|
+
// in this module (retry arithmetic/predicate, fence stripping) and would
|
|
132
|
+
// otherwise be unreachable, since createAnthropicClient news up the SDK.
|
|
133
|
+
/**
 * Runs `fn` up to MAX_RETRIES times, sleeping with exponential backoff
 * (BASE_BACKOFF_MS doubled per attempt, capped at MAX_BACKOFF_MS) between
 * attempts.
 *
 * An attempt is retried only when it is transient:
 *  - `fn` resolves to a failed Result whose error has kind "llm" and a truthy
 *    `retryable` flag, or
 *  - `fn` throws something whose `status` is 429 or a number >= 500.
 * Any other Result is returned as-is; any other throw becomes a
 * non-retryable "llm" error carrying the thrown message.
 *
 * @param fn Async thunk resolving to a Result.
 * @returns The first non-transient Result, or `err(...)` with the last
 *   transient error once every attempt has been used.
 */
export async function retry(fn) {
    const finalAttempt = MAX_RETRIES - 1;
    let pendingError = null;
    for (let attempt = 0; attempt <= finalAttempt; attempt += 1) {
        try {
            const outcome = await fn();
            const transient = !outcome.ok &&
                Boolean(outcome.error) &&
                outcome.error.kind === "llm" &&
                Boolean(outcome.error.retryable);
            if (!transient)
                return outcome;
            pendingError = outcome.error;
        }
        catch (thrown) {
            const message = thrown instanceof Error ? thrown.message : String(thrown);
            const status = thrown?.status;
            const serverFault = typeof status === "number" && status >= 500;
            const transient = status === 429 || serverFault;
            pendingError = { kind: "llm", message, retryable: transient };
            if (!transient)
                return err(pendingError);
        }
        // The last attempt has nothing left to wait for: leave the loop and
        // report the failure immediately instead of sleeping first.
        if (attempt === finalAttempt)
            break;
        const delayMs = Math.min(BASE_BACKOFF_MS * 2 ** attempt, MAX_BACKOFF_MS);
        await new Promise((wake) => setTimeout(wake, delayMs));
    }
    return err(pendingError ?? { kind: "llm", message: "retries exhausted", retryable: false });
}
|
|
161
|
+
/**
 * Unwraps a Markdown code fence that encloses the whole string, returning
 * only the fenced body. Input that is not a single fenced block is returned
 * unchanged (not trimmed).
 *
 * Accepts an optional `json` language tag in any letter case, whitespace
 * around the fence, spaces or tabs after the tag, and both LF and CRLF line
 * endings, so common variations in model output still parse downstream.
 *
 * @param s Raw model output.
 * @returns The fenced body, or `s` itself when no enclosing fence is found.
 */
export function stripCodeFence(s) {
    const fenced = s.match(/^\s*```(?:json)?[ \t]*\r?\n([\s\S]*?)\r?\n```\s*$/i);
    return fenced ? fenced[1] : s;
}
|
|
165
|
+
//# sourceMappingURL=llm.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"llm.js","sourceRoot":"","sources":["../../src/eval/llm.ts"],"names":[],"mappings":"AAAA,kBAAkB;AAClB,2EAA2E;AAC3E,2EAA2E;AAC3E,qBAAqB;AAErB,OAAO,SAAS,MAAM,mBAAmB,CAAC;AAC1C,OAAO,EAAE,GAAG,EAAE,EAAE,EAAe,MAAM,yBAAyB,CAAC;AAmD/D,MAAM,UAAU,qBAAqB;IACnC,MAAM,MAAM,GAAG,OAAO,CAAC,GAAG,CAAC,iBAAiB,CAAC;IAC7C,IAAI,CAAC,MAAM;QAAE,MAAM,IAAI,KAAK,CAAC,wDAAwD,CAAC,CAAC;IACvF,MAAM,MAAM,GAAG,IAAI,SAAS,CAAC,EAAE,MAAM,EAAE,CAAC,CAAC;IAEzC,MAAM,QAAQ,GAAG,KAAK,EAAE,IAAkB,EAAoD,EAAE;QAC9F,OAAO,KAAK,CAAC,KAAK,IAAI,EAAE;YACtB,MAAM,GAAG,GAAG,MAAM,MAAM,CAAC,QAAQ,CAAC,MAAM,CAAC;gBACvC,KAAK,EAAE,IAAI,CAAC,KAAK;gBACjB,UAAU,EAAE,IAAI,CAAC,SAAS,IAAI,IAAI;gBAClC,MAAM,EAAE,IAAI,CAAC,MAAM;gBACnB,QAAQ,EAAE,CAAC,EAAE,IAAI,EAAE,MAAM,EAAE,OAAO,EAAE,IAAI,CAAC,IAAI,EAAE,CAAC;aACjD,CAAC,CAAC;YACH,MAAM,IAAI,GAAG,GAAG,CAAC,OAAO;iBACrB,MAAM,CAAC,CAAC,CAAC,EAAwD,EAAE,CAAC,CAAC,CAAC,IAAI,KAAK,MAAM,CAAC;iBACtF,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,IAAI,CAAC;iBAClB,IAAI,CAAC,EAAE,CAAC,CAAC;YACZ,OAAO,EAAE,CAAC;gBACR,IAAI;gBACJ,YAAY,EAAE,GAAG,CAAC,KAAK,CAAC,YAAY;gBACpC,aAAa,EAAE,GAAG,CAAC,KAAK,CAAC,aAAa;gBACtC,WAAW,EAAE,GAAG,CAAC,WAAW,IAAI,SAAS;aAC1C,CAAC,CAAC;QACL,CAAC,CAAC,CAAC;IACL,CAAC,CAAC;IAEF,MAAM,YAAY,GAAG,KAAK,EACxB,IAAsB,EACgC,EAAE;QACxD,yEAAyE;QACzE,qEAAqE;QACrE,wEAAwE;QACxE,oEAAoE;QACpE,mEAAmE;QACnE,oDAAoD;QACpD,MAAM,aAAa,GAAG,GAAG,IAAI,CAAC,MAAM,8BAA8B,IAAI,CAAC,SAAS,CAAC,IAAI,CAAC,MAAM,EAAE,IAAI,EAAE,CAAC,CAAC,+BAA+B,CAAC;QACtI,MAAM,CAAC,GAAG,MAAM,QAAQ,CAAC,EAAE,GAAG,IAAI,EAAE,MAAM,EAAE,aAAa,EAAE,CAAC,CAAC;QAC7D,IAAI,CAAC,CAAC,CAAC,EAAE;YAAE,OAAO,CAAC,CAAC;QACpB,IAAI,CAAC;YACH,MAAM,MAAM,GAAG,IAAI,CAAC,KAAK,CAAC,cAAc,CAAC,CAAC,CAAC,KAAK,CAAC,IAAI,CAAC,CAAC,CAAC;YACxD,OAAO,EAAE,CAAC,EAAE,GAAG,CAAC,CAAC,KAAK,EAAE,MAAM,EAAE,CAAC,CAAC;QACpC,CAAC;QAAC,OAAO,CAAC,EAAE,CAAC;YACX,MAAM,GAAG,GAAG,CAAC,YAAY,KAAK,CAAC,CAAC,CAAC,CAAC,CAAC,OAAO,CAAC,CAAC,CAAC,MAAM,CAAC,CAAC,CAAC,CAAC;YACvD,OAAO,GAAG,CAAC;gBACT,IAAI,EAAE,KAAK;gBACX,OAAO,EAAE,eAAe,GAAG,kBAAkB,CAAC,CAAC,KAAK,CAAC,IAAI,CAAC
,KAAK,CAAC,CAAC,EAAE,GAAG,CAAC,EAAE;gBACzE,SAAS,EAAE,KAAK;aACjB,CAAC,CAAC;QACL,CAAC;IACH,CAAC,CAAC;IAEF,MAAM,iBAAiB,GAAG,KAAK,EAC7B,IAA2B,EACgC,EAAE;QAC7D,MAAM,SAAS,GAAG,IAAI,CAAC,SAAS,IAAI,EAAE,CAAC;QACvC,MAAM,SAAS,GAA0C,EAAE,CAAC;QAC5D,MAAM,QAAQ,GAA4D;YACxE,EAAE,IAAI,EAAE,MAAM,EAAE,OAAO,EAAE,IAAI,CAAC,IAAI,EAAE;SACrC,CAAC;QACF,IAAI,OAAO,GAAG,CAAC,CAAC;QAChB,IAAI,QAAQ,GAAG,CAAC,CAAC;QACjB,IAAI,QAAQ,GAAG,SAAS,CAAC;QAEzB,KAAK,IAAI,KAAK,GAAG,CAAC,EAAE,KAAK,GAAG,SAAS,EAAE,KAAK,EAAE,EAAE,CAAC;YAC/C,MAAM,GAAG,GAAG,MAAM,KAAK,CAAC,KAAK,IAAI,EAAE,CACjC,EAAE,CACA,MAAM,MAAM,CAAC,QAAQ,CAAC,MAAM,CAAC;gBAC3B,KAAK,EAAE,IAAI,CAAC,KAAK;gBACjB,UAAU,EAAE,IAAI,CAAC,SAAS,IAAI,IAAI;gBAClC,MAAM,EAAE,IAAI,CAAC,MAAM;gBACnB,wDAAwD;gBACxD,KAAK,EAAE,IAAI,CAAC,KAAY;gBACxB,wDAAwD;gBACxD,QAAQ,EAAE,QAAe;aAC1B,CAAC,CACH,CACF,CAAC;YACF,IAAI,CAAC,GAAG,CAAC,EAAE;gBAAE,OAAO,GAAG,CAAC;YACxB,MAAM,OAAO,GAAG,GAAG,CAAC,KAAK,CAAC;YAC1B,OAAO,IAAI,OAAO,CAAC,KAAK,CAAC,YAAY,CAAC;YACtC,QAAQ,IAAI,OAAO,CAAC,KAAK,CAAC,aAAa,CAAC;YACxC,QAAQ,GAAG,OAAO,CAAC,WAAW,IAAI,SAAS,CAAC;YAE5C,gEAAgE;YAChE,MAAM,MAAM,GAAG,OAAO,CAAC,OAAgB,CAAC;YACxC,MAAM,QAAQ,GAAG,MAAM,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,IAAI,KAAK,UAAU,CAAC,CAAC;YAC7D,IAAI,QAAQ,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;gBAC1B,MAAM,IAAI,GAAG,MAAM;qBAChB,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,IAAI,KAAK,MAAM,CAAC;qBAChC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,IAAI,CAAC;qBAClB,IAAI,CAAC,EAAE,CAAC,CAAC;gBACZ,OAAO,EAAE,CAAC;oBACR,IAAI;oBACJ,YAAY,EAAE,OAAO;oBACrB,aAAa,EAAE,QAAQ;oBACvB,WAAW,EAAE,QAAQ;oBACrB,UAAU,EAAE,SAAS;iBACtB,CAAC,CAAC;YACL,CAAC;YAED,QAAQ,CAAC,IAAI,CAAC,EAAE,IAAI,EAAE,WAAW,EAAE,OAAO,EAAE,MAAM,EAAE,CAAC,CAAC;YAEtD,MAAM,WAAW,GAAc,EAAE,CAAC;YAClC,KAAK,MAAM,EAAE,IAAI,QAAQ,EAAE,CAAC;gBAC1B,MAAM,EAAE,GAAG,IAAI,CAAC,GAAG,EAAE,CAAC;gBACtB,IAAI,MAAe,CAAC;gBACpB,IAAI,CAAC;oBACH,MAAM,GAAG,MAAM,IAAI,CAAC,WAAW,CAAC,EAAE,CAAC,IAAI,EAAE,EAAE,CAAC,KAAK,CAAC,CAAC;gBACrD,CAAC;gBAAC,OAAO,CAAC,EAAE,CAAC;oBACX,MAAM,GAAG,EAAE,UAAU,EAAE,CAAC,YAAY,KAAK,CAAC,CAAC,C
AAC,CAAC,CAAC,OAAO,CAAC,CAAC,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,CAAC;gBACtE,CAAC;gBACD,MAAM,OAAO,GAAG,IAAI,CAAC,GAAG,EAAE,GAAG,EAAE,CAAC;gBAChC,SAAS,CAAC,IAAI,CAAC,EAAE,IAAI,EAAE,EAAE,CAAC,IAAI,EAAE,KAAK,EAAE,EAAE,CAAC,KAAK,EAAE,MAAM,EAAE,UAAU,EAAE,OAAO,EAAE,CAAC,CAAC;gBAChF,WAAW,CAAC,IAAI,CAAC;oBACf,IAAI,EAAE,aAAa;oBACnB,WAAW,EAAE,EAAE,CAAC,EAAE;oBAClB,OAAO,EAAE,OAAO,MAAM,KAAK,QAAQ,CAAC,CAAC,CAAC,MAAM,CAAC,CAAC,CAAC,IAAI,CAAC,SAAS,CAAC,MAAM,CAAC;iBACtE,CAAC,CAAC;YACL,CAAC;YACD,QAAQ,CAAC,IAAI,CAAC,EAAE,IAAI,EAAE,MAAM,EAAE,OAAO,EAAE,WAAW,EAAE,CAAC,CAAC;QACxD,CAAC;QACD,OAAO,GAAG,CAAC;YACT,IAAI,EAAE,KAAK;YACX,OAAO,EAAE,uBAAuB,SAAS,wBAAwB;YACjE,SAAS,EAAE,KAAK;SACjB,CAAC,CAAC;IACL,CAAC,CAAC;IAEF,OAAO,EAAE,QAAQ,EAAE,YAAY,EAAE,iBAAiB,EAAE,CAAC;AACvD,CAAC;AAED,kBAAkB;AAElB,MAAM,WAAW,GAAG,CAAC,CAAC;AACtB,MAAM,eAAe,GAAG,GAAG,CAAC;AAC5B,MAAM,cAAc,GAAG,MAAM,CAAC;AAE9B,+EAA+E;AAC/E,yEAAyE;AACzE,yEAAyE;AACzE,MAAM,CAAC,KAAK,UAAU,KAAK,CACzB,EAA6C;IAE7C,IAAI,OAAO,GAA2B,IAAI,CAAC;IAC3C,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,WAAW,EAAE,CAAC,EAAE,EAAE,CAAC;QACrC,IAAI,CAAC;YACH,MAAM,CAAC,GAAG,MAAM,EAAE,EAAE,CAAC;YACrB,IAAI,CAAC,CAAC,EAAE;gBAAE,OAAO,CAAC,CAAC;YACnB,IAAI,CAAC,CAAC,CAAC,KAAK,IAAI,CAAC,CAAC,KAAK,CAAC,IAAI,KAAK,KAAK,IAAI,CAAC,CAAC,CAAC,KAAK,CAAC,SAAS;gBAAE,OAAO,CAAC,CAAC;YACvE,OAAO,GAAG,CAAC,CAAC,KAAK,CAAC;QACpB,CAAC;QAAC,OAAO,CAAC,EAAE,CAAC;YACX,MAAM,GAAG,GAAG,CAAC,YAAY,KAAK,CAAC,CAAC,CAAC,CAAC,CAAC,OAAO,CAAC,CAAC,CAAC,MAAM,CAAC,CAAC,CAAC,CAAC;YACvD,MAAM,MAAM,GAAI,CAAyB,EAAE,MAAM,CAAC;YAClD,MAAM,SAAS,GAAG,MAAM,KAAK,GAAG,IAAI,CAAC,OAAO,MAAM,KAAK,QAAQ,IAAI,MAAM,IAAI,GAAG,CAAC,CAAC;YAClF,IAAI,CAAC,SAAS;gBAAE,OAAO,GAAG,CAAC,EAAE,IAAI,EAAE,KAAK,EAAE,OAAO,EAAE,GAAG,EAAE,SAAS,EAAE,KAAK,EAAE,CAAC,CAAC;YAC5E,OAAO,GAAG,EAAE,IAAI,EAAE,KAAK,EAAE,OAAO,EAAE,GAAG,EAAE,SAAS,EAAE,IAAI,EAAE,CAAC;QAC3D,CAAC;QACD,sEAAsE;QACtE,sEAAsE;QACtE,IAAI,CAAC,GAAG,WAAW,GAAG,CAAC,EAAE,CAAC;YACxB,MAAM,OAAO,GAAG,IAAI,CAAC,GAAG,CAAC,eAAe,GAAG,CAAC,IAAI,CAAC,EAAE,cAAc,CAAC,CAAC;YACnE,MAAM,IAAI,OAAO,CAAC,CAAC,GAAG,
EAAE,EAAE,CAAC,UAAU,CAAC,GAAG,EAAE,OAAO,CAAC,CAAC,CAAC;QACvD,CAAC;IACH,CAAC;IACD,OAAO,GAAG,CAAC,OAAO,IAAI,EAAE,IAAI,EAAE,KAAK,EAAE,OAAO,EAAE,mBAAmB,EAAE,SAAS,EAAE,KAAK,EAAE,CAAC,CAAC;AACzF,CAAC;AAED,MAAM,UAAU,cAAc,CAAC,CAAS;IACtC,MAAM,CAAC,GAAG,CAAC,CAAC,KAAK,CAAC,oCAAoC,CAAC,CAAC;IACxD,OAAO,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC;AACtB,CAAC"}
|
|
@@ -0,0 +1,5 @@
|
|
|
1
|
+
export declare const PROMPT_VERSION = 1;
|
|
2
|
+
export declare const GENERATOR_PROMPT = "You will read a connected subgraph of a Markdown knowledge vault and produce\nmulti-hop questions across three tiers. The questions must be answerable using\nONLY the docs provided. For each question, supply: question text, tier,\nexpected answer, source paths (must be a subset of the supplied docs).\n\nTiers:\n retrieval \u2014 single-doc lookup, 1-hop reasoning\n cross_reference \u2014 requires combining 2\u20133 docs\n contradiction \u2014 surfaces a tension or conflict across docs (use the\n tension log entries in the subgraph as seed material\n where present)\n\nReturn JSON matching the QuestionSetSchema declared in src/eval/types.ts.\nDo not include questions whose expected_sources are not in the supplied docs.\nDo not generate trivial yes/no questions.";
|
|
3
|
+
export declare const ANSWERER_SYSTEM_PROMPT = "You will answer a question about a Markdown knowledge vault using ONLY the\nprovided Daftari tools. Do not use training knowledge. Do not guess. If the\nvault does not contain the answer, say \"Vault does not contain the answer.\"\nCite source paths in your final answer using the format [path/to/doc.md].";
|
|
4
|
+
export declare const GRADER_PROMPT = "You are grading an answer to a question about a Markdown knowledge vault.\n\nQuestion: {{QUESTION}}\nExpected answer: {{EXPECTED_ANSWER}}\nExpected sources: {{EXPECTED_SOURCES}}\nClaimed answer: {{CLAIMED_ANSWER}}\nCited sources: {{CITED_SOURCES}}\n\nReturn JSON: {\"correct\": \"yes\" | \"partial\" | \"no\", \"reasoning\": \"<string>\"}\n\nDefinitions:\n yes \u2014 claimed answer is substantively correct and cites at least one\n expected source\n partial \u2014 claimed answer is partially correct OR cites the right sources\n but misses key content OR the answerer correctly said \"Vault\n does not contain the answer\" when the expected answer disagrees\n (records a question-set quality issue, not a cortex failure)\n no \u2014 claimed answer is wrong, hallucinated, or cites no expected\n sources";
|
|
5
|
+
//# sourceMappingURL=prompts.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"prompts.d.ts","sourceRoot":"","sources":["../../src/eval/prompts.ts"],"names":[],"mappings":"AAKA,eAAO,MAAM,cAAc,IAAI,CAAC;AAEhC,eAAO,MAAM,gBAAgB,4zBAca,CAAC;AAE3C,eAAO,MAAM,sBAAsB,sTAGuC,CAAC;AAE3E,eAAO,MAAM,aAAa,o4BAkBN,CAAC"}
|
|
@@ -0,0 +1,44 @@
|
|
|
1
|
+
// src/eval/prompts.ts
|
|
2
|
+
// Frozen prompts for the three eval LLM roles. Bumping any prompt requires
|
|
3
|
+
// bumping PROMPT_VERSION in the same commit. PROMPT_VERSION is recorded in
|
|
4
|
+
// every output file for forensics and cross-version comparison gates.
|
|
5
|
+
// Version stamp for the prompt texts exported from this module; it must be bumped in the same commit as any change to one of them.
export const PROMPT_VERSION = 1;
|
|
6
|
+
// Prompt for the question-generator role: asks for multi-hop questions in three
// tiers (retrieval, cross_reference, contradiction) that are answerable only
// from the supplied docs, each with an expected answer and source paths.
export const GENERATOR_PROMPT = `You will read a connected subgraph of a Markdown knowledge vault and produce
|
|
7
|
+
multi-hop questions across three tiers. The questions must be answerable using
|
|
8
|
+
ONLY the docs provided. For each question, supply: question text, tier,
|
|
9
|
+
expected answer, source paths (must be a subset of the supplied docs).
|
|
10
|
+
|
|
11
|
+
Tiers:
|
|
12
|
+
retrieval — single-doc lookup, 1-hop reasoning
|
|
13
|
+
cross_reference — requires combining 2–3 docs
|
|
14
|
+
contradiction — surfaces a tension or conflict across docs (use the
|
|
15
|
+
tension log entries in the subgraph as seed material
|
|
16
|
+
where present)
|
|
17
|
+
|
|
18
|
+
Return JSON matching the QuestionSetSchema declared in src/eval/types.ts.
|
|
19
|
+
Do not include questions whose expected_sources are not in the supplied docs.
|
|
20
|
+
Do not generate trivial yes/no questions.`;
|
|
21
|
+
// System prompt for the answerer role: restricts the model to the provided
// Daftari tools, fixes the exact refusal sentence to use when the vault lacks
// the answer, and sets the citation format to [path/to/doc.md].
export const ANSWERER_SYSTEM_PROMPT = `You will answer a question about a Markdown knowledge vault using ONLY the
|
|
22
|
+
provided Daftari tools. Do not use training knowledge. Do not guess. If the
|
|
23
|
+
vault does not contain the answer, say "Vault does not contain the answer."
|
|
24
|
+
Cite source paths in your final answer using the format [path/to/doc.md].`;
|
|
25
|
+
// Prompt template for the grader role. It contains the placeholders
// {{QUESTION}}, {{EXPECTED_ANSWER}}, {{EXPECTED_SOURCES}}, {{CLAIMED_ANSWER}}
// and {{CITED_SOURCES}} and requests a JSON verdict of "yes" | "partial" | "no"
// plus reasoning.
// NOTE(review): the placeholders are presumably substituted by the caller
// before the prompt is sent — confirm in the scoring module.
export const GRADER_PROMPT = `You are grading an answer to a question about a Markdown knowledge vault.
|
|
26
|
+
|
|
27
|
+
Question: {{QUESTION}}
|
|
28
|
+
Expected answer: {{EXPECTED_ANSWER}}
|
|
29
|
+
Expected sources: {{EXPECTED_SOURCES}}
|
|
30
|
+
Claimed answer: {{CLAIMED_ANSWER}}
|
|
31
|
+
Cited sources: {{CITED_SOURCES}}
|
|
32
|
+
|
|
33
|
+
Return JSON: {"correct": "yes" | "partial" | "no", "reasoning": "<string>"}
|
|
34
|
+
|
|
35
|
+
Definitions:
|
|
36
|
+
yes — claimed answer is substantively correct and cites at least one
|
|
37
|
+
expected source
|
|
38
|
+
partial — claimed answer is partially correct OR cites the right sources
|
|
39
|
+
but misses key content OR the answerer correctly said "Vault
|
|
40
|
+
does not contain the answer" when the expected answer disagrees
|
|
41
|
+
(records a question-set quality issue, not a cortex failure)
|
|
42
|
+
no — claimed answer is wrong, hallucinated, or cites no expected
|
|
43
|
+
sources`;
|
|
44
|
+
//# sourceMappingURL=prompts.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"prompts.js","sourceRoot":"","sources":["../../src/eval/prompts.ts"],"names":[],"mappings":"AAAA,sBAAsB;AACtB,2EAA2E;AAC3E,2EAA2E;AAC3E,sEAAsE;AAEtE,MAAM,CAAC,MAAM,cAAc,GAAG,CAAC,CAAC;AAEhC,MAAM,CAAC,MAAM,gBAAgB,GAAG;;;;;;;;;;;;;;0CAcU,CAAC;AAE3C,MAAM,CAAC,MAAM,sBAAsB,GAAG;;;0EAGoC,CAAC;AAE3E,MAAM,CAAC,MAAM,aAAa,GAAG;;;;;;;;;;;;;;;;;;oBAkBT,CAAC"}
|
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
import { type Result } from "../frontmatter/types.js";
|
|
2
|
+
import type { LlmClient } from "./llm.js";
|
|
3
|
+
import type { CortexEvalError, EvalRun, QuestionSet } from "./types.js";
|
|
4
|
+
/**
 * Options bag accepted as the `opts` parameter of `runAnswerer`.
 * Only the field types are visible in this declaration file; the notes on
 * individual fields are assumptions to confirm against src/eval/run.ts.
 */
export interface RunOptions {
|
|
5
|
+
k: number; // NOTE(review): meaning of k is not visible here — confirm in src/eval/run.ts
|
|
6
|
+
model: string; // presumably the model identifier passed on to the LlmClient — TODO confirm
|
|
7
|
+
resumeFrom?: EvalRun; // presumably an earlier run to continue from — TODO confirm
|
|
8
|
+
runId?: string; // optional; NOTE(review): default when omitted is not visible here
|
|
9
|
+
timestamp?: string; // optional; NOTE(review): expected format is not visible here
|
|
10
|
+
persist?: (run: EvalRun) => Promise<void>; // optional async callback given an EvalRun; when it is invoked is not visible here
|
|
11
|
+
}
|
|
12
|
+
/**
 * Declaration of the answerer run entry point.
 *
 * @param questions The question set to run.
 * @param vaultRoot Vault root as a string (presumably a filesystem path — TODO confirm).
 * @param llm The LlmClient used for completions.
 * @param opts Run options; see RunOptions.
 * @returns A promise resolving to a Result holding an EvalRun on success or a CortexEvalError on failure.
 */
export declare function runAnswerer(questions: QuestionSet, vaultRoot: string, llm: LlmClient, opts: RunOptions): Promise<Result<EvalRun, CortexEvalError>>;
|
|
13
|
+
//# sourceMappingURL=run.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"run.d.ts","sourceRoot":"","sources":["../../src/eval/run.ts"],"names":[],"mappings":"AAMA,OAAO,EAAW,KAAK,MAAM,EAAE,MAAM,yBAAyB,CAAC;AAC/D,OAAO,KAAK,EAAE,SAAS,EAAW,MAAM,UAAU,CAAC;AAGnD,OAAO,KAAK,EAAE,eAAe,EAAE,OAAO,EAAgB,WAAW,EAAS,MAAM,YAAY,CAAC;AAE7F,MAAM,WAAW,UAAU;IACzB,CAAC,EAAE,MAAM,CAAC;IACV,KAAK,EAAE,MAAM,CAAC;IACd,UAAU,CAAC,EAAE,OAAO,CAAC;IACrB,KAAK,CAAC,EAAE,MAAM,CAAC;IACf,SAAS,CAAC,EAAE,MAAM,CAAC;IACnB,OAAO,CAAC,EAAE,CAAC,GAAG,EAAE,OAAO,KAAK,OAAO,CAAC,IAAI,CAAC,CAAC;CAC3C;AAED,wBAAsB,WAAW,CAC/B,SAAS,EAAE,WAAW,EACtB,SAAS,EAAE,MAAM,EACjB,GAAG,EAAE,SAAS,EACd,IAAI,EAAE,UAAU,GACf,OAAO,CAAC,MAAM,CAAC,OAAO,EAAE,eAAe,CAAC,CAAC,CAyE3C"}
|