daftari 1.14.0 → 1.16.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +65 -0
- package/README.md +8 -2
- package/dist/cli.d.ts.map +1 -1
- package/dist/cli.js +6 -0
- package/dist/cli.js.map +1 -1
- package/dist/curation/lint.d.ts +16 -1
- package/dist/curation/lint.d.ts.map +1 -1
- package/dist/curation/lint.js +91 -78
- package/dist/curation/lint.js.map +1 -1
- package/dist/curation/tension-blast.d.ts +37 -0
- package/dist/curation/tension-blast.d.ts.map +1 -0
- package/dist/curation/tension-blast.js +223 -0
- package/dist/curation/tension-blast.js.map +1 -0
- package/dist/curation/tension-clusters.d.ts +18 -0
- package/dist/curation/tension-clusters.d.ts.map +1 -0
- package/dist/curation/tension-clusters.js +157 -0
- package/dist/curation/tension-clusters.js.map +1 -0
- package/dist/curation/tension.d.ts +4 -0
- package/dist/curation/tension.d.ts.map +1 -1
- package/dist/curation/tension.js +54 -0
- package/dist/curation/tension.js.map +1 -1
- package/dist/curation/vault-docs.d.ts +14 -0
- package/dist/curation/vault-docs.d.ts.map +1 -0
- package/dist/curation/vault-docs.js +90 -0
- package/dist/curation/vault-docs.js.map +1 -0
- package/dist/eval/generate.d.ts +12 -0
- package/dist/eval/generate.d.ts.map +1 -0
- package/dist/eval/generate.js +221 -0
- package/dist/eval/generate.js.map +1 -0
- package/dist/eval/index.d.ts +2 -0
- package/dist/eval/index.d.ts.map +1 -0
- package/dist/eval/index.js +311 -0
- package/dist/eval/index.js.map +1 -0
- package/dist/eval/llm.d.ts +47 -0
- package/dist/eval/llm.d.ts.map +1 -0
- package/dist/eval/llm.js +165 -0
- package/dist/eval/llm.js.map +1 -0
- package/dist/eval/prompts.d.ts +5 -0
- package/dist/eval/prompts.d.ts.map +1 -0
- package/dist/eval/prompts.js +44 -0
- package/dist/eval/prompts.js.map +1 -0
- package/dist/eval/run.d.ts +13 -0
- package/dist/eval/run.d.ts.map +1 -0
- package/dist/eval/run.js +78 -0
- package/dist/eval/run.js.map +1 -0
- package/dist/eval/score.d.ts +12 -0
- package/dist/eval/score.d.ts.map +1 -0
- package/dist/eval/score.js +154 -0
- package/dist/eval/score.js.map +1 -0
- package/dist/eval/storage.d.ts +10 -0
- package/dist/eval/storage.d.ts.map +1 -0
- package/dist/eval/storage.js +69 -0
- package/dist/eval/storage.js.map +1 -0
- package/dist/eval/subgraph.d.ts +17 -0
- package/dist/eval/subgraph.d.ts.map +1 -0
- package/dist/eval/subgraph.js +214 -0
- package/dist/eval/subgraph.js.map +1 -0
- package/dist/eval/tool-surface.d.ts +7 -0
- package/dist/eval/tool-surface.d.ts.map +1 -0
- package/dist/eval/tool-surface.js +160 -0
- package/dist/eval/tool-surface.js.map +1 -0
- package/dist/eval/types.d.ts +173 -0
- package/dist/eval/types.d.ts.map +1 -0
- package/dist/eval/types.js +44 -0
- package/dist/eval/types.js.map +1 -0
- package/dist/tools/curation.d.ts +4 -0
- package/dist/tools/curation.d.ts.map +1 -1
- package/dist/tools/curation.js +101 -0
- package/dist/tools/curation.js.map +1 -1
- package/package.json +2 -1
|
@@ -0,0 +1,90 @@
|
|
|
1
|
+
// Shared vault-document loading and in-vault link helpers.
|
|
2
|
+
//
|
|
3
|
+
// Extracted from lint.ts so the same loader and link-resolution machinery can
|
|
4
|
+
// back any curation surface that needs the full set of vault docs plus a
|
|
5
|
+
// reverse-link view (lint, tension blast radius). Keeps the link-extraction
|
|
6
|
+
// regexes and the path-normalisation rules in one place so the two callers
|
|
7
|
+
// can't drift apart.
|
|
8
|
+
import { posix } from "node:path";
|
|
9
|
+
import { parseDocument } from "../frontmatter/parser.js";
|
|
10
|
+
import { ok } from "../frontmatter/types.js";
|
|
11
|
+
import { listFiles, readFile, resolveVaultPath } from "../storage/local.js";
|
|
12
|
+
/**
 * Reads and parses every file that `listFiles` reports for the vault.
 *
 * Loading is best-effort: a file whose path `resolveVaultPath` rejects, that
 * `readFile` cannot read, or that `parseDocument` cannot parse is skipped
 * without surfacing an error, so one malformed file never breaks a curation
 * surface.
 *
 * @param {string} vaultRoot - Root directory of the vault.
 * @returns {Promise<object>} The failed `listFiles` result unchanged, or
 *   `ok([...])` holding one `{ path, frontmatter, content }` record per file
 *   that loaded, in listing order.
 */
export async function loadDocuments(vaultRoot) {
  const listing = await listFiles(vaultRoot);
  if (!listing.ok) return listing;

  const loaded = [];
  for (const relPath of listing.value) {
    const absolute = resolveVaultPath(vaultRoot, relPath);
    if (absolute.ok) {
      const raw = await readFile(absolute.value);
      if (raw.ok) {
        const doc = parseDocument(raw.value);
        if (doc.ok) {
          const { frontmatter, content } = doc.value;
          loaded.push({ path: relPath, frontmatter, content });
        }
      }
    }
  }
  return ok(loaded);
}
|
|
38
|
+
/**
 * Collects every in-vault link target found in a markdown body.
 *
 * Two syntaxes are recognised. Wikilink targets are returned first, then
 * markdown-link targets, each group in document order:
 *   - `[[target]]`, `[[target|alias]]`, `[[target#heading]]`
 *   - `[text](target)` and `[text](target "title")`
 *
 * Heading anchors are stripped, and targets that are empty after stripping
 * (pure `#anchor` links) are dropped. External markdown-link targets are
 * dropped too: any `scheme://` URL, protocol-relative `//host` URLs, and
 * `http:`, `https:`, `mailto:` and `tel:` URIs.
 *
 * @param {string} content - Markdown body to scan.
 * @returns {string[]} Raw link targets, not yet resolved to vault paths.
 */
export function extractLinks(content) {
  const wikilinkPattern = /\[\[([^\]]+)\]\]/g;
  // The optional quoted title after the destination is matched but not captured.
  const markdownLinkPattern = /\[[^\]]*\]\(([^)\s]+)(?:\s+(?:"[^"]*"|'[^']*'))?\s*\)/g;
  const externalPattern = /^(?:[a-z][a-z0-9+.-]*:\/\/|\/\/|https?:|mailto:|tel:)/i;

  const targets = [];

  for (const match of content.matchAll(wikilinkPattern)) {
    // A wikilink may carry a |display alias and/or a #heading anchor.
    const target = match[1].split("|")[0].split("#")[0].trim();
    if (target) targets.push(target);
  }

  for (const match of content.matchAll(markdownLinkPattern)) {
    const target = match[1].split("#")[0].trim();
    // Empty means the link was a bare in-page anchor such as (#top).
    if (!target) continue;
    if (externalPattern.test(target)) continue;
    targets.push(target);
  }

  return targets;
}
|
|
58
|
+
/**
 * Maps a raw link target onto a known vault-relative path.
 *
 * Candidates are tried in this order, and the first one present in `byPath`
 * wins:
 *   1. the target exactly as written,
 *   2. the target with `.md` appended (when it does not already end in `.md`),
 *   3. the target joined onto the linking file's directory,
 *   4. that joined path with `.md` appended.
 * If none match, the target's basename (minus a trailing `.md`) is looked up
 * in `byBasename`, which covers the bare `[[note-name]]` wikilink form.
 *
 * @param {string} rawTarget - Link target as extracted from the document.
 * @param {string} fromPath - Vault-relative path of the linking document.
 * @param {Set<string>} byPath - Every known vault-relative path.
 * @param {Map<string, string>} byBasename - Basename to vault-relative path.
 * @returns {string|null} The resolved path, or null when nothing matches.
 */
export function resolveLink(rawTarget, fromPath, byPath, byBasename) {
  const ensureMd = (candidate) => (candidate.endsWith(".md") ? candidate : `${candidate}.md`);
  // Returns the candidate (or its .md form) when the vault knows it, else null.
  const firstKnown = (candidate) => {
    if (byPath.has(candidate)) return candidate;
    const suffixed = ensureMd(candidate);
    return byPath.has(suffixed) ? suffixed : null;
  };

  const direct = firstKnown(rawTarget);
  if (direct !== null) return direct;

  const relative = posix.normalize(posix.join(posix.dirname(fromPath), rawTarget));
  const viaRelative = firstKnown(relative);
  if (viaRelative !== null) return viaRelative;

  const stem = posix.basename(rawTarget).replace(/\.md$/, "");
  return byBasename.get(stem) ?? null;
}
|
|
76
|
+
/**
 * Builds the two lookup structures that `resolveLink` consults.
 *
 * `byPath` is the set of every document path. `byBasename` maps each file's
 * basename (with a trailing `.md` removed) to a path; when several documents
 * share a basename, the one that appears first in `docs` is kept.
 *
 * @param {Array<{path: string}>} docs - Loaded vault documents.
 * @returns {{byPath: Set<string>, byBasename: Map<string, string>}}
 */
export function buildPathIndexes(docs) {
  const byPath = new Set();
  const byBasename = new Map();
  for (const { path } of docs) {
    byPath.add(path);
    const stem = posix.basename(path).replace(/\.md$/, "");
    // Keep the earliest document for a basename; later collisions are ignored.
    if (byBasename.has(stem)) continue;
    byBasename.set(stem, path);
  }
  return { byPath, byBasename };
}
|
|
90
|
+
//# sourceMappingURL=vault-docs.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"vault-docs.js","sourceRoot":"","sources":["../../src/curation/vault-docs.ts"],"names":[],"mappings":"AAAA,2DAA2D;AAC3D,EAAE;AACF,8EAA8E;AAC9E,yEAAyE;AACzE,4EAA4E;AAC5E,2EAA2E;AAC3E,qBAAqB;AAErB,OAAO,EAAE,KAAK,EAAE,MAAM,WAAW,CAAC;AAClC,OAAO,EAAE,aAAa,EAAE,MAAM,0BAA0B,CAAC;AACzD,OAAO,EAAoB,EAAE,EAAe,MAAM,yBAAyB,CAAC;AAC5E,OAAO,EAAE,SAAS,EAAE,QAAQ,EAAE,gBAAgB,EAAE,MAAM,qBAAqB,CAAC;AAQ5E,0EAA0E;AAC1E,4EAA4E;AAC5E,8EAA8E;AAC9E,MAAM,CAAC,KAAK,UAAU,aAAa,CAAC,SAAiB;IACnD,MAAM,IAAI,GAAG,MAAM,SAAS,CAAC,SAAS,CAAC,CAAC;IACxC,IAAI,CAAC,IAAI,CAAC,EAAE;QAAE,OAAO,IAAI,CAAC;IAE1B,MAAM,IAAI,GAAgB,EAAE,CAAC;IAC7B,KAAK,MAAM,OAAO,IAAI,IAAI,CAAC,KAAK,EAAE,CAAC;QACjC,MAAM,QAAQ,GAAG,gBAAgB,CAAC,SAAS,EAAE,OAAO,CAAC,CAAC;QACtD,IAAI,CAAC,QAAQ,CAAC,EAAE;YAAE,SAAS;QAC3B,MAAM,IAAI,GAAG,MAAM,QAAQ,CAAC,QAAQ,CAAC,KAAK,CAAC,CAAC;QAC5C,IAAI,CAAC,IAAI,CAAC,EAAE;YAAE,SAAS;QACvB,MAAM,MAAM,GAAG,aAAa,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC;QACzC,IAAI,CAAC,MAAM,CAAC,EAAE;YAAE,SAAS;QACzB,IAAI,CAAC,IAAI,CAAC;YACR,IAAI,EAAE,OAAO;YACb,WAAW,EAAE,MAAM,CAAC,KAAK,CAAC,WAAW;YACrC,OAAO,EAAE,MAAM,CAAC,KAAK,CAAC,OAAO;SAC9B,CAAC,CAAC;IACL,CAAC;IACD,OAAO,EAAE,CAAC,IAAI,CAAC,CAAC;AAClB,CAAC;AAED,8EAA8E;AAC9E,4EAA4E;AAC5E,MAAM,UAAU,YAAY,CAAC,OAAe;IAC1C,MAAM,OAAO,GAAa,EAAE,CAAC;IAE7B,KAAK,MAAM,CAAC,IAAI,OAAO,CAAC,QAAQ,CAAC,mBAAmB,CAAC,EAAE,CAAC;QACtD,kEAAkE;QAClE,MAAM,GAAG,GAAI,CAAC,CAAC,CAAC,CAAY,CAAC,KAAK,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,EAAE,KAAK,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,EAAE,IAAI,EAAE,CAAC;QAClE,IAAI,GAAG;YAAE,OAAO,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC;IAC7B,CAAC;IAED,KAAK,MAAM,CAAC,IAAI,OAAO,CAAC,QAAQ,CAAC,0BAA0B,CAAC,EAAE,CAAC;QAC7D,MAAM,GAAG,GAAI,CAAC,CAAC,CAAC,CAAY,CAAC,KAAK,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,EAAE,IAAI,EAAE,CAAC;QACnD,IAAI,CAAC,GAAG;YAAE,SAAS;QACnB,IAAI,uBAAuB,CAAC,IAAI,CAAC,GAAG,CAAC;YAAE,SAAS;QAChD,OAAO,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC;IACpB,CAAC;IAED,OAAO,OAAO,CAAC;AACjB,CAAC;AAED,4EAA4E;AAC5E,0EAA0E;AAC1E,4EAA4E;AAC5E,uCAAuC;AACvC,MAAM,UAAU,WAAW,CACzB,SAAiB,EACjB,QAAgB,EAChB,MAAmB,EACnB,UAA+B;IAE/
B,MAAM,MAAM,GAAG,CAAC,CAAS,EAAE,EAAE,CAAC,CAAC,CAAC,CAAC,QAAQ,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,GAAG,CAAC,KAAK,CAAC,CAAC;IAElE,IAAI,MAAM,CAAC,GAAG,CAAC,SAAS,CAAC;QAAE,OAAO,SAAS,CAAC;IAC5C,IAAI,MAAM,CAAC,GAAG,CAAC,MAAM,CAAC,SAAS,CAAC,CAAC;QAAE,OAAO,MAAM,CAAC,SAAS,CAAC,CAAC;IAE5D,MAAM,MAAM,GAAG,KAAK,CAAC,SAAS,CAAC,KAAK,CAAC,IAAI,CAAC,KAAK,CAAC,OAAO,CAAC,QAAQ,CAAC,EAAE,SAAS,CAAC,CAAC,CAAC;IAC/E,IAAI,MAAM,CAAC,GAAG,CAAC,MAAM,CAAC;QAAE,OAAO,MAAM,CAAC;IACtC,IAAI,MAAM,CAAC,GAAG,CAAC,MAAM,CAAC,MAAM,CAAC,CAAC;QAAE,OAAO,MAAM,CAAC,MAAM,CAAC,CAAC;IAEtD,MAAM,IAAI,GAAG,KAAK,CAAC,QAAQ,CAAC,SAAS,CAAC,CAAC,OAAO,CAAC,OAAO,EAAE,EAAE,CAAC,CAAC;IAC5D,OAAO,UAAU,CAAC,GAAG,CAAC,IAAI,CAAC,IAAI,IAAI,CAAC;AACtC,CAAC;AAED,2EAA2E;AAC3E,sEAAsE;AACtE,uEAAuE;AACvE,6BAA6B;AAC7B,MAAM,UAAU,gBAAgB,CAAC,IAAiB;IAIhD,MAAM,MAAM,GAAG,IAAI,GAAG,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC;IAChD,MAAM,UAAU,GAAG,IAAI,GAAG,EAAkB,CAAC;IAC7C,KAAK,MAAM,CAAC,IAAI,IAAI,EAAE,CAAC;QACrB,MAAM,IAAI,GAAG,KAAK,CAAC,QAAQ,CAAC,CAAC,CAAC,IAAI,CAAC,CAAC,OAAO,CAAC,OAAO,EAAE,EAAE,CAAC,CAAC;QACzD,IAAI,CAAC,UAAU,CAAC,GAAG,CAAC,IAAI,CAAC;YAAE,UAAU,CAAC,GAAG,CAAC,IAAI,EAAE,CAAC,CAAC,IAAI,CAAC,CAAC;IAC1D,CAAC;IACD,OAAO,EAAE,MAAM,EAAE,UAAU,EAAE,CAAC;AAChC,CAAC"}
|
|
@@ -0,0 +1,12 @@
|
|
|
1
|
+
import { type Result } from "../frontmatter/types.js";
|
|
2
|
+
import type { LlmClient } from "./llm.js";
|
|
3
|
+
import type { Subgraph } from "./subgraph.js";
|
|
4
|
+
import { type CortexEvalError, type QuestionSet } from "./types.js";
|
|
5
|
+
/** Options accepted by `generateQuestions`. */
export interface GenerateOptions {
    /** Total number of questions requested across all tiers. */
    n: number;
    /** Model identifier used for the generator LLM calls. */
    model: string;
    /** Optional hash identifying the vault the questions were built from. */
    vaultHash?: string;
    /** Optional seed string for the run. */
    seed?: string;
}
|
|
11
|
+
/**
 * Generates a question set for a sampled subgraph using the supplied LLM client.
 *
 * @param subgraph - Sampled subgraph the questions are generated from.
 * @param llm - Client used for the generator LLM calls.
 * @param opts - Requested question count, model, and optional vault hash / seed.
 * @returns A promise of a `Result` holding the `QuestionSet` on success or a
 *   `CortexEvalError` on failure.
 */
export declare function generateQuestions(subgraph: Subgraph, llm: LlmClient, opts: GenerateOptions): Promise<Result<QuestionSet, CortexEvalError>>;
|
|
12
|
+
//# sourceMappingURL=generate.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"generate.d.ts","sourceRoot":"","sources":["../../src/eval/generate.ts"],"names":[],"mappings":"AAUA,OAAO,EAAW,KAAK,MAAM,EAAE,MAAM,yBAAyB,CAAC;AAC/D,OAAO,KAAK,EAAE,SAAS,EAAE,MAAM,UAAU,CAAC;AAE1C,OAAO,KAAK,EAAE,QAAQ,EAAE,MAAM,eAAe,CAAC;AAC9C,OAAO,EACL,KAAK,eAAe,EAEpB,KAAK,WAAW,EAIjB,MAAM,YAAY,CAAC;AAEpB,MAAM,WAAW,eAAe;IAC9B,CAAC,EAAE,MAAM,CAAC;IACV,KAAK,EAAE,MAAM,CAAC;IACd,SAAS,CAAC,EAAE,MAAM,CAAC;IACnB,IAAI,CAAC,EAAE,MAAM,CAAC;CACf;AAED,wBAAsB,iBAAiB,CACrC,QAAQ,EAAE,QAAQ,EAClB,GAAG,EAAE,SAAS,EACd,IAAI,EAAE,eAAe,GACpB,OAAO,CAAC,MAAM,CAAC,WAAW,EAAE,eAAe,CAAC,CAAC,CA0E/C"}
|
|
@@ -0,0 +1,221 @@
|
|
|
1
|
+
// Question-set generation for the cortex quality metric.
|
|
2
|
+
//
|
|
3
|
+
// Given a sampled subgraph, ask the generator LLM for multi-hop questions
|
|
4
|
+
// across three tiers, then validate/filter the output: drop questions whose
|
|
5
|
+
// expected_sources escape the subgraph, and drop trivial yes/no answers. If the
|
|
6
|
+
// generator under-produces any tier, make ONE focused top-up call (spec §5.3) —
|
|
7
|
+
// remaining imbalance is accepted and recorded, not treated as an error.
|
|
8
|
+
// Finally, augment the contradiction tier from logged tension edges.
|
|
9
|
+
import { createHash } from "node:crypto";
|
|
10
|
+
import { err, ok } from "../frontmatter/types.js";
|
|
11
|
+
import { GENERATOR_PROMPT, PROMPT_VERSION } from "./prompts.js";
|
|
12
|
+
import { QuestionSetSchema, TIERS, } from "./types.js";
|
|
13
|
+
/**
 * Builds a question set for one sampled subgraph.
 *
 * Steps:
 *   1. Ask the generator for `floor(opts.n / TIERS.length)` questions per tier.
 *      A failed call, or output without a `questions` array, is fatal.
 *   2. Validate the output. If any tier is short, make exactly one top-up call
 *      that requests only the missing counts. A failed or non-conforming
 *      top-up is ignored and the imbalance is accepted.
 *   3. Append contradiction questions derived from tension edges, deduped by id.
 *
 * @param {object} subgraph - Sampled subgraph (`seed_doc`, `nodes`, `edges`).
 * @param {object} llm - Client exposing `completeJson`.
 * @param {object} opts - `n`, `model`, and optional `vaultHash` / `seed`.
 * @returns {Promise<object>} `ok(questionSet)`, the failed first LLM result,
 *   or an `llm` error when the first response is non-conforming.
 */
export async function generateQuestions(subgraph, llm, opts) {
  const quota = Math.floor(opts.n / TIERS.length);
  const tierCountsRequested = {
    retrieval: quota,
    cross_reference: quota,
    contradiction: quota,
  };
  const knownPaths = new Set(subgraph.nodes.map((node) => node.path));

  // Both generator calls differ only in the user prompt.
  const askGenerator = (user) =>
    llm.completeJson({
      model: opts.model,
      system: GENERATOR_PROMPT,
      user,
      schema: QuestionSetSchema,
    });

  // First generation: full per-tier counts. Any failure here aborts the run.
  const initial = await askGenerator(renderUserPrompt(subgraph, tierCountsRequested));
  if (!initial.ok) return initial;
  const initialRaw = extractQuestions(initial.value.parsed);
  if (initialRaw === null) {
    return err({
      kind: "llm",
      message: "generator returned non-conforming JSON",
      retryable: false,
    });
  }
  const generated = validateAndMap(initialRaw, knownPaths);

  // Top-up: shortfall is measured on generator output only, before augmentation.
  let merged = generated;
  const shortfall = computeShortfall(tierCountsRequested, generated);
  if (hasShortfall(shortfall)) {
    const retry = await askGenerator(renderTopUpPrompt(subgraph, shortfall));
    // A failed or malformed top-up contributes nothing; it is never an error.
    const retryRaw = retry.ok ? extractQuestions(retry.value.parsed) : null;
    if (retryRaw !== null) {
      merged = dedupeById(generated, validateAndMap(retryRaw, knownPaths));
    }
  }

  // Augment: additive contradiction questions from tension edges.
  const questions = dedupeById(
    merged,
    augmentFromTensions(subgraph, tierCountsRequested.contradiction),
  );

  const timestamp = "2026-01-01T00:00:00Z"; // placeholder; the caller overwrites this.
  const vaultHash = opts.vaultHash ?? "";
  const seed = opts.seed ?? "";
  return ok({
    id: `${vaultHash}-${seed}-${timestamp}`,
    vault_hash: vaultHash,
    seed,
    timestamp,
    subgraph: {
      seed_doc: subgraph.seed_doc,
      nodes: subgraph.nodes.map((node) => node.path),
      edges: subgraph.edges,
    },
    questions,
    generator_model: opts.model,
    prompt_version: PROMPT_VERSION,
    tier_counts_requested: tierCountsRequested,
    tier_counts_produced: countByTier(questions),
  });
}
|
|
83
|
+
/**
 * Returns the `questions` array from parsed generator JSON.
 *
 * Only the outer shape is checked here; individual elements are validated by
 * `validateAndMap`.
 *
 * @param {unknown} parsed - Parsed generator output.
 * @returns {Array|null} The `questions` array itself, or null when `parsed` is
 *   not an object or `questions` is missing or not an array.
 */
function extractQuestions(parsed) {
  const isObject = parsed !== null && typeof parsed === "object";
  if (!isObject) return null;
  const { questions } = parsed;
  return Array.isArray(questions) ? questions : null;
}
|
|
95
|
+
/**
 * Filters raw generator questions and converts the survivors to question
 * records with a content-derived id and origin "generated". Used for both the
 * first generation call and the top-up call.
 *
 * A raw entry is dropped when any of these hold:
 *   - it is not an object,
 *   - its tier is not one of `TIERS`,
 *   - its question or expected answer is not a non-blank string,
 *   - `expected_sources` is not a non-empty array of strings that are all in
 *     `validNodes`,
 *   - `isTrivial` flags its answer.
 *
 * @param {Array} rawQuestions - Unvalidated entries from the generator.
 * @param {Set<string>} validNodes - Paths of the documents in the subgraph.
 * @returns {Array<object>} Validated question records, in input order.
 */
function validateAndMap(rawQuestions, validNodes) {
  const isNonBlankString = (value) => typeof value === "string" && value.trim().length !== 0;
  const accepted = [];
  for (const candidate of rawQuestions) {
    if (candidate === null || typeof candidate !== "object") continue;
    const {
      tier,
      question,
      expected_answer: answer,
      expected_sources: sources,
    } = candidate;
    const wellFormed =
      isTier(tier) &&
      isNonBlankString(question) &&
      isNonBlankString(answer) &&
      Array.isArray(sources) &&
      sources.length > 0 &&
      sources.every((source) => typeof source === "string" && validNodes.has(source)) &&
      !isTrivial(question, answer);
    if (!wellFormed) continue;
    accepted.push({
      id: questionId(tier, question, sources),
      tier,
      question,
      expected_answer: answer,
      expected_sources: sources,
      origin: "generated",
    });
  }
  return accepted;
}
|
|
134
|
+
/**
 * Reports whether `value` is one of the tier names listed in `TIERS`.
 *
 * @param {unknown} value - Candidate tier.
 * @returns {boolean} True only for a string contained in `TIERS`.
 */
function isTier(value) {
  if (typeof value !== "string") return false;
  return TIERS.includes(value);
}
|
|
137
|
+
/**
 * Computes how many questions each tier still lacks relative to the request.
 *
 * @param {object} requested - Requested count per tier.
 * @param {Array<object>} produced - Questions to count (generator output only).
 * @returns {object} Per-tier gap, clamped at zero so over-production never
 *   shows up as a negative number.
 */
function computeShortfall(requested, produced) {
  const have = countByTier(produced);
  const missing = {
    retrieval: 0,
    cross_reference: 0,
    contradiction: 0,
  };
  for (const tier of TIERS) {
    const gap = requested[tier] - have[tier];
    missing[tier] = Math.max(0, gap);
  }
  return missing;
}
|
|
151
|
+
/**
 * Reports whether at least one tier in `TIERS` has a positive shortfall.
 *
 * @param {object} shortfall - Per-tier shortfall counts.
 * @returns {boolean} True when any tier is still short.
 */
function hasShortfall(shortfall) {
  for (const tier of TIERS) {
    if (shortfall[tier] > 0) return true;
  }
  return false;
}
|
|
154
|
+
/**
 * Tallies questions by their `tier` field.
 *
 * @param {Iterable<{tier: string}>} questions - Questions to count.
 * @returns {{retrieval: number, cross_reference: number, contradiction: number}}
 *   Counts per tier; tiers with no questions stay at zero.
 */
function countByTier(questions) {
  const tally = {
    retrieval: 0,
    cross_reference: 0,
    contradiction: 0,
  };
  for (const { tier } of questions) {
    tally[tier] += 1;
  }
  return tally;
}
|
|
164
|
+
/**
 * Appends to a copy of `base` every question in `extra` whose id has not been
 * seen yet, either in `base` or earlier in `extra`. `base` itself is not
 * modified, and its entries are kept as-is even if they repeat an id.
 *
 * @param {Array<{id: string}>} base - Questions that are always kept.
 * @param {Iterable<{id: string}>} extra - Candidates to append.
 * @returns {Array<object>} New array: `base` followed by the unseen extras.
 */
function dedupeById(base, extra) {
  const knownIds = new Set(base.map(({ id }) => id));
  const combined = Array.from(base);
  for (const candidate of extra) {
    if (!knownIds.has(candidate.id)) {
      knownIds.add(candidate.id);
      combined.push(candidate);
    }
  }
  return combined;
}
|
|
178
|
+
/**
 * Renders the user prompt for the first generation call: every subgraph
 * document under a `## <path>` heading, followed by the per-tier counts.
 *
 * @param {{nodes: Array<{path: string, body: string}>}} sg - Sampled subgraph.
 * @param {object} counts - Requested number of questions per tier.
 * @returns {string} The prompt text.
 */
function renderUserPrompt(sg, counts) {
  const sections = [];
  for (const node of sg.nodes) {
    sections.push(`## ${node.path}\n\n${node.body}\n`);
  }
  const docs = sections.join("\n");
  const { retrieval, cross_reference, contradiction } = counts;
  return `Subgraph docs:\n\n${docs}\n\nProduce exactly ${retrieval} retrieval, ${cross_reference} cross_reference, and ${contradiction} contradiction questions.`;
}
|
|
182
|
+
/**
 * Renders the user prompt for the single top-up call. Only tiers with a
 * positive shortfall are mentioned, each with the number still missing;
 * tiers at zero are left out of the request entirely.
 *
 * @param {{nodes: Array<{path: string, body: string}>}} sg - Sampled subgraph.
 * @param {object} shortfall - Missing question count per tier.
 * @returns {string} The prompt text.
 */
function renderTopUpPrompt(sg, shortfall) {
  const docs = sg.nodes.map((node) => `## ${node.path}\n\n${node.body}\n`).join("\n");
  const requests = [];
  for (const tier of TIERS) {
    if (shortfall[tier] > 0) {
      requests.push(`${shortfall[tier]} more ${tier}`);
    }
  }
  const wanted = requests.join(" and ");
  return `Subgraph docs:\n\n${docs}\n\nYou previously produced too few of some tiers. Produce exactly ${wanted} questions, same rules. Do not produce any other tiers.`;
}
|
|
191
|
+
/**
 * Decides whether an expected answer is too trivial to keep.
 *
 * An answer is trivial when, after trimming and lower-casing, it is shorter
 * than three characters, or when it is a bare "yes" / "no" once leading and
 * trailing punctuation is ignored (so "Yes.", "No!" and '"yes"' count too).
 *
 * @param {string} _question - Unused; kept so the call signature stays stable.
 * @param {string} answer - Expected answer text.
 * @returns {boolean} True when the question should be dropped.
 */
function isTrivial(_question, answer) {
  const normalized = answer.trim().toLowerCase();
  if (normalized.length < 3) return true;
  // Strip only the surrounding punctuation so "yes, because ..." stays valid.
  const bare = normalized.replace(/^[^a-z0-9]+|[^a-z0-9]+$/g, "");
  return bare === "yes" || bare === "no";
}
|
|
195
|
+
/**
 * Derives a question id from its content: the first 16 hex characters of the
 * SHA-256 of tier, question text and the sorted sources, joined with NUL
 * bytes. Sorting a copy of `sources` makes the id independent of source order
 * and leaves the caller's array untouched.
 *
 * @param {string} tier - Question tier.
 * @param {string} question - Question text.
 * @param {string[]} sources - Expected source paths.
 * @returns {string} 16-character lowercase hex id.
 */
function questionId(tier, question, sources) {
  const orderedSources = Array.from(sources).sort();
  const material = `${tier}\x00${question}\x00${orderedSources.join("\x00")}`;
  return createHash("sha256").update(material).digest("hex").slice(0, 16);
}
|
|
200
|
+
/**
 * Builds extra contradiction questions from the subgraph's tension edges,
 * without calling the generator LLM (origin "augmented").
 *
 * The number produced is `max(1, floor(0.2 * contradictionBudget))`, limited
 * by how many tension edges exist; edges are taken in subgraph order. No
 * tension edges means no questions.
 *
 * @param {{edges: Array<{kind: string, from: string, to: string}>}} sg - Sampled subgraph.
 * @param {number} contradictionBudget - Requested contradiction-tier count.
 * @returns {Array<object>} Contradiction question records.
 */
function augmentFromTensions(sg, contradictionBudget) {
  const tensions = sg.edges.filter(({ kind }) => kind === "tension");
  if (tensions.length === 0) return [];

  const quota = Math.max(1, Math.floor(0.2 * contradictionBudget));
  const picked = tensions.slice(0, quota);
  return picked.map(({ from, to }) => {
    const question = `${from} and ${to} appear to disagree on a specific point. Read both docs, identify the disagreement, and cite the position each takes. Cite both docs in your answer.`;
    const sources = [from, to];
    return {
      id: questionId("contradiction", question, sources),
      tier: "contradiction",
      question,
      expected_answer: `A correct answer identifies the substantive contradiction between ${from} and ${to} and cites both source paths.`,
      expected_sources: sources,
      origin: "augmented",
    };
  });
}
|
|
221
|
+
//# sourceMappingURL=generate.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"generate.js","sourceRoot":"","sources":["../../src/eval/generate.ts"],"names":[],"mappings":"AAAA,yDAAyD;AACzD,EAAE;AACF,0EAA0E;AAC1E,4EAA4E;AAC5E,gFAAgF;AAChF,gFAAgF;AAChF,yEAAyE;AACzE,qEAAqE;AAErE,OAAO,EAAE,UAAU,EAAE,MAAM,aAAa,CAAC;AACzC,OAAO,EAAE,GAAG,EAAE,EAAE,EAAe,MAAM,yBAAyB,CAAC;AAE/D,OAAO,EAAE,gBAAgB,EAAE,cAAc,EAAE,MAAM,cAAc,CAAC;AAEhE,OAAO,EAIL,iBAAiB,EACjB,KAAK,GAEN,MAAM,YAAY,CAAC;AASpB,MAAM,CAAC,KAAK,UAAU,iBAAiB,CACrC,QAAkB,EAClB,GAAc,EACd,IAAqB;IAErB,MAAM,OAAO,GAAG,IAAI,CAAC,KAAK,CAAC,IAAI,CAAC,CAAC,GAAG,KAAK,CAAC,MAAM,CAAC,CAAC;IAClD,MAAM,mBAAmB,GAAyB;QAChD,SAAS,EAAE,OAAO;QAClB,eAAe,EAAE,OAAO;QACxB,aAAa,EAAE,OAAO;KACvB,CAAC;IAEF,MAAM,UAAU,GAAG,IAAI,GAAG,CAAC,QAAQ,CAAC,KAAK,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC;IAE9D,oEAAoE;IACpE,MAAM,QAAQ,GAAG,MAAM,GAAG,CAAC,YAAY,CAAC;QACtC,KAAK,EAAE,IAAI,CAAC,KAAK;QACjB,MAAM,EAAE,gBAAgB;QACxB,IAAI,EAAE,gBAAgB,CAAC,QAAQ,EAAE,mBAAmB,CAAC;QACrD,MAAM,EAAE,iBAAiB;KAC1B,CAAC,CAAC;IACH,IAAI,CAAC,QAAQ,CAAC,EAAE;QAAE,OAAO,QAAQ,CAAC;IAClC,MAAM,QAAQ,GAAG,gBAAgB,CAAC,QAAQ,CAAC,KAAK,CAAC,MAAM,CAAC,CAAC;IACzD,IAAI,QAAQ,KAAK,IAAI,EAAE,CAAC;QACtB,OAAO,GAAG,CAAC;YACT,IAAI,EAAE,KAAK;YACX,OAAO,EAAE,wCAAwC;YACjD,SAAS,EAAE,KAAK;SACjB,CAAC,CAAC;IACL,CAAC;IACD,MAAM,SAAS,GAAG,cAAc,CAAC,QAAQ,EAAE,UAAU,CAAC,CAAC;IAEvD,8EAA8E;IAC9E,MAAM,SAAS,GAAG,gBAAgB,CAAC,mBAAmB,EAAE,SAAS,CAAC,CAAC;IACnE,IAAI,MAAM,GAAG,SAAS,CAAC;IACvB,IAAI,YAAY,CAAC,SAAS,CAAC,EAAE,CAAC;QAC5B,4EAA4E;QAC5E,sEAAsE;QACtE,MAAM,QAAQ,GAAG,MAAM,GAAG,CAAC,YAAY,CAAC;YACtC,KAAK,EAAE,IAAI,CAAC,KAAK;YACjB,MAAM,EAAE,gBAAgB;YACxB,IAAI,EAAE,iBAAiB,CAAC,QAAQ,EAAE,SAAS,CAAC;YAC5C,MAAM,EAAE,iBAAiB;SAC1B,CAAC,CAAC;QACH,IAAI,QAAQ,CAAC,EAAE,EAAE,CAAC;YAChB,MAAM,QAAQ,GAAG,gBAAgB,CAAC,QAAQ,CAAC,KAAK,CAAC,MAAM,CAAC,CAAC;YACzD,IAAI,QAAQ,KAAK,IAAI,EAAE,CAAC;gBACtB,MAAM,KAAK,GAAG,cAAc,CAAC,QAAQ,EAAE,UAAU,CAAC,CAAC;gBACnD,MAAM,GAAG,UAAU,CAAC,SAAS,EAAE,KAAK,CAAC,CAAC;YACxC,CAAC;QACH,CAAC;IACH,CAAC;IAED,gFAAgF;IAChF,MAAM,SAAS,GAAG,mBAAmB,CAAC,QAAQ,EAAE,m
BAAmB,CAAC,aAAa,CAAC,CAAC;IACnF,MAAM,GAAG,GAAG,UAAU,CAAC,MAAM,EAAE,SAAS,CAAC,CAAC;IAE1C,MAAM,kBAAkB,GAAG,WAAW,CAAC,GAAG,CAAC,CAAC;IAC5C,MAAM,EAAE,GAAG,sBAAsB,CAAC,CAAC,2CAA2C;IAC9E,MAAM,SAAS,GAAG,IAAI,CAAC,SAAS,IAAI,EAAE,CAAC;IACvC,MAAM,IAAI,GAAG,IAAI,CAAC,IAAI,IAAI,EAAE,CAAC;IAE7B,OAAO,EAAE,CAAC;QACR,EAAE,EAAE,GAAG,SAAS,IAAI,IAAI,IAAI,EAAE,EAAE;QAChC,UAAU,EAAE,SAAS;QACrB,IAAI;QACJ,SAAS,EAAE,EAAE;QACb,QAAQ,EAAE;YACR,QAAQ,EAAE,QAAQ,CAAC,QAAQ;YAC3B,KAAK,EAAE,QAAQ,CAAC,KAAK,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,IAAI,CAAC;YACxC,KAAK,EAAE,QAAQ,CAAC,KAAK;SACtB;QACD,SAAS,EAAE,GAAG;QACd,eAAe,EAAE,IAAI,CAAC,KAAK;QAC3B,cAAc,EAAE,cAAc;QAC9B,qBAAqB,EAAE,mBAAmB;QAC1C,oBAAoB,EAAE,kBAAkB;KACzC,CAAC,CAAC;AACL,CAAC;AAED,2EAA2E;AAC3E,6EAA6E;AAC7E,2BAA2B;AAC3B,SAAS,gBAAgB,CAAC,MAAe;IACvC,IAAI,OAAO,MAAM,KAAK,QAAQ,IAAI,MAAM,KAAK,IAAI;QAAE,OAAO,IAAI,CAAC;IAC/D,+EAA+E;IAC/E,MAAM,SAAS,GAAI,MAAc,CAAC,SAAS,CAAC;IAC5C,IAAI,CAAC,KAAK,CAAC,OAAO,CAAC,SAAS,CAAC;QAAE,OAAO,IAAI,CAAC;IAC3C,OAAO,SAAS,CAAC;AACnB,CAAC;AAED,8EAA8E;AAC9E,2EAA2E;AAC3E,kEAAkE;AAClE,gEAAgE;AAChE,SAAS,cAAc,CAAC,YAAuB,EAAE,UAAuB;IACtE,MAAM,GAAG,GAAe,EAAE,CAAC;IAC3B,KAAK,MAAM,GAAG,IAAI,YAAY,EAAE,CAAC;QAC/B,IAAI,OAAO,GAAG,KAAK,QAAQ,IAAI,GAAG,KAAK,IAAI;YAAE,SAAS;QACtD,+EAA+E;QAC/E,MAAM,CAAC,GAAG,GAAU,CAAC;QACrB,MAAM,IAAI,GAAG,CAAC,CAAC,IAAI,CAAC;QACpB,IAAI,CAAC,MAAM,CAAC,IAAI,CAAC;YAAE,SAAS;QAC5B,MAAM,QAAQ,GAAG,CAAC,CAAC,QAAQ,CAAC;QAC5B,MAAM,MAAM,GAAG,CAAC,CAAC,eAAe,CAAC;QACjC,IAAI,OAAO,QAAQ,KAAK,QAAQ,IAAI,QAAQ,CAAC,IAAI,EAAE,CAAC,MAAM,KAAK,CAAC;YAAE,SAAS;QAC3E,IAAI,OAAO,MAAM,KAAK,QAAQ,IAAI,MAAM,CAAC,IAAI,EAAE,CAAC,MAAM,KAAK,CAAC;YAAE,SAAS;QACvE,MAAM,OAAO,GAAG,CAAC,CAAC,gBAAgB,CAAC;QACnC,IAAI,CAAC,KAAK,CAAC,OAAO,CAAC,OAAO,CAAC,IAAI,OAAO,CAAC,MAAM,KAAK,CAAC;YAAE,SAAS;QAC9D,IAAI,CAAC,OAAO,CAAC,KAAK,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,OAAO,CAAC,KAAK,QAAQ,IAAI,UAAU,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC;YAAE,SAAS;QAChF,IAAI,SAAS,CAAC,QAAQ,EAAE,MAAM,CAAC;YAAE,SAAS;QAC1C,MAAM,eAAe,GAAG,OAAmB,CAAC;QAC5C,GAAG,CAAC,IAAI,CAAC;YACP,EAAE,EAAE,
UAAU,CAAC,IAAI,EAAE,QAAQ,EAAE,eAAe,CAAC;YAC/C,IAAI;YACJ,QAAQ;YACR,eAAe,EAAE,MAAM;YACvB,gBAAgB,EAAE,eAAe;YACjC,MAAM,EAAE,WAAW;SACpB,CAAC,CAAC;IACL,CAAC;IACD,OAAO,GAAG,CAAC;AACb,CAAC;AAED,SAAS,MAAM,CAAC,KAAc;IAC5B,OAAO,OAAO,KAAK,KAAK,QAAQ,IAAK,KAA2B,CAAC,QAAQ,CAAC,KAAK,CAAC,CAAC;AACnF,CAAC;AAED,4EAA4E;AAC5E,iDAAiD;AACjD,SAAS,gBAAgB,CACvB,SAA+B,EAC/B,QAAoB;IAEpB,MAAM,MAAM,GAAG,WAAW,CAAC,QAAQ,CAAC,CAAC;IACrC,MAAM,SAAS,GAAyB;QACtC,SAAS,EAAE,CAAC;QACZ,eAAe,EAAE,CAAC;QAClB,aAAa,EAAE,CAAC;KACjB,CAAC;IACF,KAAK,MAAM,IAAI,IAAI,KAAK,EAAE,CAAC;QACzB,SAAS,CAAC,IAAI,CAAC,GAAG,IAAI,CAAC,GAAG,CAAC,CAAC,EAAE,SAAS,CAAC,IAAI,CAAC,GAAG,MAAM,CAAC,IAAI,CAAC,CAAC,CAAC;IAChE,CAAC;IACD,OAAO,SAAS,CAAC;AACnB,CAAC;AAED,SAAS,YAAY,CAAC,SAA+B;IACnD,OAAO,KAAK,CAAC,IAAI,CAAC,CAAC,IAAI,EAAE,EAAE,CAAC,SAAS,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC,CAAC;AACnD,CAAC;AAED,SAAS,WAAW,CAAC,SAAqB;IACxC,MAAM,MAAM,GAAyB;QACnC,SAAS,EAAE,CAAC;QACZ,eAAe,EAAE,CAAC;QAClB,aAAa,EAAE,CAAC;KACjB,CAAC;IACF,KAAK,MAAM,CAAC,IAAI,SAAS;QAAE,MAAM,CAAC,CAAC,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;IAC/C,OAAO,MAAM,CAAC;AAChB,CAAC;AAED,+EAA+E;AAC/E,gFAAgF;AAChF,2CAA2C;AAC3C,SAAS,UAAU,CAAC,IAAgB,EAAE,KAAiB;IACrD,MAAM,IAAI,GAAG,IAAI,GAAG,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC;IAC5C,MAAM,GAAG,GAAG,CAAC,GAAG,IAAI,CAAC,CAAC;IACtB,KAAK,MAAM,CAAC,IAAI,KAAK,EAAE,CAAC;QACtB,IAAI,IAAI,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,CAAC;YAAE,SAAS;QAC7B,IAAI,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC;QACf,GAAG,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;IACd,CAAC;IACD,OAAO,GAAG,CAAC;AACb,CAAC;AAED,SAAS,gBAAgB,CAAC,EAAY,EAAE,MAA4B;IAClE,MAAM,IAAI,GAAG,EAAE,CAAC,KAAK,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,MAAM,CAAC,CAAC,IAAI,OAAO,CAAC,CAAC,IAAI,IAAI,CAAC,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;IAC3E,OAAO,qBAAqB,IAAI,uBAAuB,MAAM,CAAC,SAAS,eAAe,MAAM,CAAC,eAAe,yBAAyB,MAAM,CAAC,aAAa,2BAA2B,CAAC;AACvL,CAAC;AAED,iFAAiF;AACjF,+CAA+C;AAC/C,SAAS,iBAAiB,CAAC,EAAY,EAAE,SAA+B;IACtE,MAAM,IAAI,GAAG,EAAE,CAAC,KAAK,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,MAAM,CAAC,CAAC,IAAI,OAAO,CAAC
,CAAC,IAAI,IAAI,CAAC,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;IAC3E,MAAM,MAAM,GAAG,KAAK,CAAC,MAAM,CAAC,CAAC,IAAI,EAAE,EAAE,CAAC,SAAS,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC;SACvD,GAAG,CAAC,CAAC,IAAI,EAAE,EAAE,CAAC,GAAG,SAAS,CAAC,IAAI,CAAC,SAAS,IAAI,EAAE,CAAC;SAChD,IAAI,CAAC,OAAO,CAAC,CAAC;IACjB,OAAO,qBAAqB,IAAI,sEAAsE,MAAM,yDAAyD,CAAC;AACxK,CAAC;AAED,SAAS,SAAS,CAAC,SAAiB,EAAE,MAAc;IAClD,MAAM,CAAC,GAAG,MAAM,CAAC,IAAI,EAAE,CAAC,WAAW,EAAE,CAAC;IACtC,OAAO,CAAC,KAAK,KAAK,IAAI,CAAC,KAAK,IAAI,IAAI,CAAC,CAAC,MAAM,GAAG,CAAC,CAAC;AACnD,CAAC;AAED,SAAS,UAAU,CAAC,IAAY,EAAE,QAAgB,EAAE,OAAiB;IACnE,MAAM,CAAC,GAAG,UAAU,CAAC,QAAQ,CAAC,CAAC;IAC/B,CAAC,CAAC,MAAM,CAAC,GAAG,IAAI,OAAO,QAAQ,OAAO,CAAC,GAAG,OAAO,CAAC,CAAC,IAAI,EAAE,CAAC,IAAI,CAAC,MAAM,CAAC,EAAE,CAAC,CAAC;IAC1E,OAAO,CAAC,CAAC,MAAM,CAAC,KAAK,CAAC,CAAC,KAAK,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC;AACtC,CAAC;AAED,2EAA2E;AAC3E,iFAAiF;AACjF,uEAAuE;AACvE,SAAS,mBAAmB,CAAC,EAAY,EAAE,mBAA2B;IACpE,MAAM,YAAY,GAAG,EAAE,CAAC,KAAK,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,IAAI,KAAK,SAAS,CAAC,CAAC;IAClE,IAAI,YAAY,CAAC,MAAM,KAAK,CAAC;QAAE,OAAO,EAAE,CAAC;IACzC,MAAM,KAAK,GAAG,IAAI,CAAC,GAAG,CAAC,CAAC,EAAE,IAAI,CAAC,KAAK,CAAC,GAAG,GAAG,mBAAmB,CAAC,CAAC,CAAC;IACjE,OAAO,YAAY,CAAC,KAAK,CAAC,CAAC,EAAE,KAAK,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE;QAC5C,MAAM,CAAC,GAAG,GAAG,CAAC,CAAC,IAAI,QAAQ,CAAC,CAAC,EAAE,sJAAsJ,CAAC;QACtL,MAAM,OAAO,GAAG,CAAC,CAAC,CAAC,IAAI,EAAE,CAAC,CAAC,EAAE,CAAC,CAAC;QAC/B,OAAO;YACL,EAAE,EAAE,UAAU,CAAC,eAAe,EAAE,CAAC,EAAE,OAAO,CAAC;YAC3C,IAAI,EAAE,eAAwB;YAC9B,QAAQ,EAAE,CAAC;YACX,eAAe,EAAE,qEAAqE,CAAC,CAAC,IAAI,QAAQ,CAAC,CAAC,EAAE,+BAA+B;YACvI,gBAAgB,EAAE,OAAO;YACzB,MAAM,EAAE,WAAoB;SAC7B,CAAC;IACJ,CAAC,CAAC,CAAC;AACL,CAAC"}
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../../src/eval/index.ts"],"names":[],"mappings":"AA6DA,wBAAsB,OAAO,CAAC,IAAI,EAAE,MAAM,EAAE,GAAG,OAAO,CAAC,MAAM,CAAC,CAuB7D"}
|
|
@@ -0,0 +1,311 @@
|
|
|
1
|
+
// src/eval/index.ts
|
|
2
|
+
// Top-level CLI dispatcher for `daftari eval`. Parses flags, routes to
|
|
3
|
+
// generate/run/score/top-level, translates Result<T, CortexEvalError> to exit
|
|
4
|
+
// codes (2 = config, 3 = runtime/llm).
|
|
5
|
+
import { createHash } from "node:crypto";
|
|
6
|
+
import { resolve } from "node:path";
|
|
7
|
+
import { generateQuestions } from "./generate.js";
|
|
8
|
+
import { createAnthropicClient } from "./llm.js";
|
|
9
|
+
import { PROMPT_VERSION } from "./prompts.js";
|
|
10
|
+
import { runAnswerer } from "./run.js";
|
|
11
|
+
import { aggregateScore, gradeAnswer } from "./score.js";
|
|
12
|
+
import { appendHistory, readQuestionSet, readResults, writeQuestionSet, writeResults, writeScore, } from "./storage.js";
|
|
13
|
+
import { sampleSubgraph } from "./subgraph.js";
|
|
14
|
+
import { SPEC_VERSION, TIERS, } from "./types.js";
|
|
15
|
+
const HELP = `daftari eval — cortex quality metric.
|
|
16
|
+
|
|
17
|
+
Usage:
|
|
18
|
+
daftari eval [--vault <path>] [--n <count>] [--k <count>] [--seed <str>]
|
|
19
|
+
daftari eval generate [--vault <path>] [--n <count>] [--seed <str>]
|
|
20
|
+
daftari eval run [--questions <id>] [--vault <path>] [--model <id>] [--k <count>] [--resume <results-id>]
|
|
21
|
+
daftari eval score [--results <id>] [--vault <path>] [--grader-model <id>]
|
|
22
|
+
|
|
23
|
+
(--questions and --results take the artifact id printed by a prior stage,
|
|
24
|
+
not a file path; artifacts live under .daftari/eval/.)
|
|
25
|
+
|
|
26
|
+
Defaults:
|
|
27
|
+
--n 15 total questions across three tiers (5 each)
|
|
28
|
+
--k 2 runs per question for variance estimation
|
|
29
|
+
--model claude-sonnet-4-6 (DEFAULT_MODEL in src/eval/index.ts)
|
|
30
|
+
--vault current working directory
|
|
31
|
+
|
|
32
|
+
Environment:
|
|
33
|
+
ANTHROPIC_API_KEY required for any LLM-mediated stage
|
|
34
|
+
|
|
35
|
+
Disk usage:
|
|
36
|
+
.daftari/eval/results/ and scores/ grow without bound across runs. v1
|
|
37
|
+
recovery is a manual rm -rf .daftari/eval/results/; rerunning regenerates
|
|
38
|
+
what's needed. A daftari eval prune command is the planned v2 follow-up.
|
|
39
|
+
|
|
40
|
+
Exit codes:
|
|
41
|
+
0 — eval completed
|
|
42
|
+
2 — config error (missing API key, bad flags, no vault)
|
|
43
|
+
3 — runtime/LLM error (retries exhausted, vault I/O failure)
|
|
44
|
+
`;
|
|
45
|
+
/**
 * CLI entry point for `daftari eval`.
 *
 * Prints the help text when `--help` or `-h` appears anywhere in `argv`.
 * Otherwise the first argument selects a stage (`generate`, `run`, `score`);
 * when it is absent or is itself a flag, the one-shot pipeline runs with the
 * full `argv`.
 *
 * @param {string[]} argv - arguments following `daftari eval`.
 * @returns {Promise<number>} exit code: whatever the selected stage returns,
 *   or 2 for an unknown subcommand or any thrown error.
 */
export async function runEval(argv) {
    if (argv.includes("--help") || argv.includes("-h")) {
        process.stdout.write(HELP);
        return 0;
    }
    // runEval is the CLI boundary and must not throw: a bad integer flag
    // (intFlag) or any unexpected error becomes a config-error exit code (2).
    try {
        const [mode, ...rest] = argv;
        switch (mode) {
            case "generate":
                return await runGenerate(rest);
            case "run":
                return await runRun(rest);
            case "score":
                return await runScore(rest);
            default:
                break;
        }
        // A bare word that is not a known stage is most likely a typo
        // ("scroe"). Refuse it instead of silently launching the whole
        // LLM-backed pipeline; flags (or no arguments) still mean "one-shot".
        if (mode !== undefined && !mode.startsWith("-")) {
            process.stderr.write(`unknown eval subcommand: ${mode} (expected generate, run or score)\n`);
            return 2;
        }
        return await runTopLevel(argv);
    }
    catch (e) {
        process.stderr.write(`${e instanceof Error ? e.message : String(e)}\n`);
        return 2;
    }
}
|
|
70
|
+
/**
 * Look up the value that follows `--<name>` in an argument list.
 *
 * Only the first occurrence of the flag is considered, and whatever token
 * comes next is returned verbatim.
 *
 * @param {string[]} argv - raw CLI arguments.
 * @param {string} name - flag name without the leading dashes.
 * @returns {string | undefined} the following token, or undefined when the
 *   flag is missing or is the last argument.
 */
function flag(argv, name) {
    const position = argv.indexOf(`--${name}`);
    const hasValue = position >= 0 && position + 1 < argv.length;
    return hasValue ? argv[position + 1] : undefined;
}
|
|
76
|
+
/**
 * Read an integer-valued `--<name>` flag.
 *
 * The whole token must be a base-10 integer (optional sign, surrounding
 * whitespace tolerated). `parseInt` alone would accept "5abc" as 5 and "1.5"
 * as 1, so the shape is checked before parsing.
 *
 * @param {string[]} argv - raw CLI arguments.
 * @param {string} name - flag name without the leading dashes.
 * @param {number} def - value returned when the flag is absent or has no value.
 * @returns {number} the parsed integer, or `def`.
 * @throws {Error} when the flag's value is not an integer.
 */
function intFlag(argv, name, def) {
    const raw = flag(argv, name);
    if (raw === undefined)
        return def;
    if (!/^[+-]?\d+$/.test(raw.trim()))
        throw new Error(`--${name} must be an integer`);
    return Number.parseInt(raw, 10);
}
|
|
85
|
+
/**
 * Short identifier for a vault: the first 12 hex characters of the SHA-256
 * of its resolved (absolute) path.
 *
 * @param {string} vault - vault path, relative or absolute.
 * @returns {string} 12-character lowercase hex string.
 */
function vaultHash(vault) {
    const absolutePath = resolve(vault);
    const hasher = createHash("sha256");
    hasher.update(absolutePath);
    const hexDigest = hasher.digest("hex");
    return hexDigest.slice(0, 12);
}
|
|
88
|
+
/**
 * Seed used when `--seed` is not given: the vault hash joined to the current
 * date (the first 10 characters of the ISO-8601 UTC timestamp).
 *
 * @param {string} vault - vault path.
 * @returns {string} `<vaultHash>-<YYYY-MM-DD>`.
 */
function defaultSeed(vault) {
    const isoNow = new Date().toISOString();
    const utcDay = isoNow.substring(0, 10);
    const prefix = vaultHash(vault);
    return `${prefix}-${utcDay}`;
}
|
|
92
|
+
const DEFAULT_MODEL = "claude-sonnet-4-6";
|
|
93
|
+
/**
 * `daftari eval generate`: sample a subgraph from the vault, ask the LLM for
 * questions about it, and store the resulting question set.
 *
 * Flags read: `--vault` (default: cwd), `--n` (default 15), `--seed`
 * (default: defaultSeed(vault)). Generation always uses DEFAULT_MODEL.
 *
 * @param {string[]} argv - arguments after the `generate` subcommand.
 * @returns {Promise<number>} 0 on success, 2 when ANTHROPIC_API_KEY is unset,
 *   3 when sampling or question generation reports a failure.
 */
async function runGenerate(argv) {
    const apiKey = process.env.ANTHROPIC_API_KEY;
    if (!apiKey) {
        process.stderr.write("ANTHROPIC_API_KEY required\n");
        return 2;
    }

    const vault = flag(argv, "vault") ?? process.cwd();
    const questionCount = intFlag(argv, "n", 15);
    const seed = flag(argv, "seed") ?? defaultSeed(vault);

    const subgraph = await sampleSubgraph(vault, seed, { maxNodes: 5 });
    if (!subgraph.ok) {
        process.stderr.write(`${subgraph.error.message}\n`);
        return 3;
    }

    const llm = createAnthropicClient();
    const options = {
        n: questionCount,
        model: DEFAULT_MODEL,
        vaultHash: vaultHash(vault),
        seed,
    };
    const generated = await generateQuestions(subgraph.value, llm, options);
    if (!generated.ok) {
        process.stderr.write(`${generated.error.message}\n`);
        return 3;
    }

    // Stamp the set here; its id is built from vault hash, seed and timestamp.
    const questionSet = generated.value;
    questionSet.timestamp = new Date().toISOString();
    questionSet.id = `${questionSet.vault_hash}-${questionSet.seed}-${questionSet.timestamp}`;
    await writeQuestionSet(vault, questionSet);

    process.stdout.write(`wrote question set ${questionSet.id} (${questionSet.questions.length} questions)\n`);
    return 0;
}
|
|
123
|
+
/**
 * `daftari eval run`: answer a stored question set and persist the results.
 *
 * Flags read: `--vault` (default: cwd), `--questions` (required id), `--k`
 * (default 2), `--model` (default DEFAULT_MODEL), `--resume` (results id).
 *
 * @param {string[]} argv - arguments after the `run` subcommand.
 * @returns {Promise<number>} 0 on success, 2 when ANTHROPIC_API_KEY or
 *   `--questions` is missing, 3 when the question set cannot be read or the
 *   answerer reports a failure.
 */
async function runRun(argv) {
    if (!process.env.ANTHROPIC_API_KEY) {
        process.stderr.write("ANTHROPIC_API_KEY required\n");
        return 2;
    }
    const vault = flag(argv, "vault") ?? process.cwd();
    const questionsId = flag(argv, "questions");
    if (!questionsId) {
        process.stderr.write("--questions required\n");
        return 2;
    }
    const k = intFlag(argv, "k", 2);
    const model = flag(argv, "model") ?? DEFAULT_MODEL;
    const qsRead = await readQuestionSet(vault, questionsId);
    if (!qsRead.ok) {
        process.stderr.write(`${qsRead.error.message}\n`);
        return 3;
    }
    let resumeFrom;
    const resumeId = flag(argv, "resume");
    if (resumeId) {
        const prior = await readResults(vault, resumeId);
        if (prior.ok) {
            resumeFrom = prior.value;
        }
        else {
            // Still fall back to a fresh run, but say so: a mistyped id would
            // otherwise redo (and re-bill) every question without any hint.
            process.stderr.write(`warning: could not load --resume ${resumeId} (${prior.error?.message ?? "unknown error"}); starting a fresh run\n`);
        }
    }
    // Mint the stable id + timestamp up front so the on-disk file path is stable
    // across the run and any later --resume; persist incrementally so a mid-run
    // failure leaves a resumable partial file.
    const timestamp = new Date().toISOString();
    const runId = resumeFrom ? resumeFrom.id : `${qsRead.value.id}-${model}-${timestamp}`;
    const client = createAnthropicClient();
    const run = await runAnswerer(qsRead.value, vault, client, {
        k,
        model,
        resumeFrom,
        runId,
        timestamp,
        persist: (r) => writeResults(vault, r),
    });
    if (!run.ok) {
        process.stderr.write(`${run.error.message}\n`);
        process.stderr.write(`partial results saved as ${runId}; resume with: daftari eval run --questions ${questionsId} --resume ${runId}\n`);
        return 3;
    }
    await writeResults(vault, run.value); // final write (covers the zero-question edge where persist never fired)
    process.stdout.write(`wrote results ${run.value.id}\n`);
    return 0;
}
|
|
171
|
+
/**
 * `daftari eval score`: grade a stored results file and record the score.
 *
 * Flags read: `--vault` (default: cwd), `--results` (required id),
 * `--grader-model` (default DEFAULT_MODEL). The question set is looked up via
 * the results' `questions_id`. Runs that are not "complete", have no trace,
 * or point at a question index missing from the set are skipped. Grader
 * failures are reported on stderr and excluded from the aggregate.
 *
 * @param {string[]} argv - arguments after the `score` subcommand.
 * @returns {Promise<number>} 0 on success, 2 when `--results` or
 *   ANTHROPIC_API_KEY is missing, 3 when results or questions cannot be read.
 */
async function runScore(argv) {
    const vault = flag(argv, "vault") ?? process.cwd();
    const resultsId = flag(argv, "results");
    if (!resultsId) {
        process.stderr.write("--results required\n");
        return 2;
    }
    const graderModel = flag(argv, "grader-model") ?? DEFAULT_MODEL;
    if (!process.env.ANTHROPIC_API_KEY) {
        process.stderr.write("ANTHROPIC_API_KEY required\n");
        return 2;
    }
    const runRead = await readResults(vault, resultsId);
    if (!runRead.ok) {
        process.stderr.write(`${runRead.error.message}\n`);
        return 3;
    }
    const run = runRead.value;
    const qsRead = await readQuestionSet(vault, run.questions_id);
    if (!qsRead.ok) {
        process.stderr.write(`${qsRead.error.message}\n`);
        return 3;
    }
    const qs = qsRead.value;
    const grader = createAnthropicClient();
    const grades = [];
    const traces = new Map();
    let failedGrades = 0;
    for (const pr of Object.values(run.runs)) {
        if (pr.status !== "complete" || !pr.trace)
            continue;
        const q = qs.questions[pr.question_index];
        if (!q)
            continue;
        const g = await gradeAnswer(q, pr.question_index, pr.k_index, pr.trace, grader, {
            model: graderModel,
        });
        if (!g.ok) {
            // Keep going (best effort), but make the gap visible: the
            // aggregate below is computed without this answer.
            failedGrades += 1;
            process.stderr.write(`warning: grading failed for question ${pr.question_index} (run ${pr.k_index}): ${g.error?.message ?? "unknown error"}\n`);
            continue;
        }
        grades.push(g.value);
        traces.set(`${q.id}:${pr.k_index}`, pr.trace);
    }
    if (failedGrades > 0) {
        process.stderr.write(`warning: ${failedGrades} answer(s) could not be graded and are excluded from the score\n`);
    }
    const score = aggregateScore(grades, qs.questions, { traces });
    score.models = {
        generator: qs.generator_model,
        answerer: run.answerer_model,
        grader: graderModel,
    };
    score.prompt_version = PROMPT_VERSION;
    score.spec_version = SPEC_VERSION;
    score.questions_id = qs.id;
    score.results_id = run.id;
    score.vault_hash = qs.vault_hash;
    score.k = run.k;
    score.n = qs.questions.length;
    score.timestamp = new Date().toISOString();
    await writeScore(vault, score);
    // History entry: headline numbers plus per-tier means, keyed by results id.
    const histEntry = {
        score_id: score.results_id,
        score: score.score,
        score_std: score.score_std,
        by_tier: {
            retrieval: score.by_tier.retrieval.mean,
            cross_reference: score.by_tier.cross_reference.mean,
            contradiction: score.by_tier.contradiction.mean,
        },
        vault_hash: score.vault_hash,
        timestamp: score.timestamp,
        n: score.n,
        k: score.k,
        models: score.models,
        prompt_version: score.prompt_version,
        spec_version: score.spec_version,
    };
    await appendHistory(vault, histEntry);
    // Pretty-print headline + per-tier means.
    process.stdout.write(`score: ${score.score.toFixed(3)} ± ${score.score_std.toFixed(3)}\n`);
    for (const t of TIERS) {
        const ts = score.by_tier[t];
        process.stdout.write(`  ${t.padEnd(16)}: ${ts.mean.toFixed(3)}  (n=${ts.n}, efficiency=${ts.trace_efficiency.toFixed(1)} calls)\n`);
    }
    return 0;
}
|
|
253
|
+
/**
 * One-shot `daftari eval`: generate → run → score in a single invocation.
 *
 * Each stage writes its artifact before the next one starts, so a failure
 * part-way through leaves the earlier artifacts on disk. The `--model` value
 * is used for generation and answering and is forwarded to scoring as the
 * grader model.
 *
 * @param {string[]} argv - full argument list (flags only, no subcommand).
 * @returns {Promise<number>} 2 when ANTHROPIC_API_KEY is unset, 3 when
 *   sampling, generation or answering fails, otherwise runScore's exit code.
 */
async function runTopLevel(argv) {
    const apiKey = process.env.ANTHROPIC_API_KEY;
    if (!apiKey) {
        process.stderr.write("ANTHROPIC_API_KEY required\n");
        return 2;
    }

    const vault = flag(argv, "vault") ?? process.cwd();
    const questionCount = intFlag(argv, "n", 15);
    const repeats = intFlag(argv, "k", 2);
    const seed = flag(argv, "seed") ?? defaultSeed(vault);
    const model = flag(argv, "model") ?? DEFAULT_MODEL;

    // Stage 1: generate the question set.
    const subgraph = await sampleSubgraph(vault, seed, { maxNodes: 5 });
    if (!subgraph.ok) {
        process.stderr.write(`${subgraph.error.message}\n`);
        return 3;
    }
    const llm = createAnthropicClient();
    const generateOptions = {
        n: questionCount,
        model,
        vaultHash: vaultHash(vault),
        seed,
    };
    const generated = await generateQuestions(subgraph.value, llm, generateOptions);
    if (!generated.ok) {
        process.stderr.write(`${generated.error.message}\n`);
        return 3;
    }
    const questionSet = generated.value;
    questionSet.timestamp = new Date().toISOString();
    questionSet.id = `${questionSet.vault_hash}-${questionSet.seed}-${questionSet.timestamp}`;
    await writeQuestionSet(vault, questionSet);
    process.stdout.write(`generated ${questionSet.questions.length} questions (id=${questionSet.id})\n`);

    // Stage 2: answer. The id and timestamp are fixed before the run starts
    // and results are persisted as they arrive, so a partial file is resumable.
    const runTimestamp = new Date().toISOString();
    const runId = `${questionSet.id}-${model}-${runTimestamp}`;
    const saveResults = (partial) => writeResults(vault, partial);
    const answered = await runAnswerer(questionSet, vault, llm, {
        k: repeats,
        model,
        runId,
        timestamp: runTimestamp,
        persist: saveResults,
    });
    if (!answered.ok) {
        process.stderr.write(`${answered.error.message}\n`);
        process.stderr.write(`partial results saved as ${runId}; resume with: daftari eval run --questions ${questionSet.id} --resume ${runId}\n`);
        return 3;
    }
    const results = answered.value;
    // Final write also covers a run where persist was never invoked.
    await writeResults(vault, results);
    process.stdout.write(`ran ${Object.keys(results.runs).length} answerer invocations (id=${results.id})\n`);

    // Stage 3: score, reusing runScore's logic in-process.
    const scoreArgs = ["--vault", vault, "--results", results.id, "--grader-model", model];
    return await runScore(scoreArgs);
}
|
|
311
|
+
//# sourceMappingURL=index.js.map
|