daftari 1.15.0 → 1.17.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (103) hide show
  1. package/CHANGELOG.md +72 -0
  2. package/README.md +8 -2
  3. package/dist/backfill/apply.d.ts +14 -0
  4. package/dist/backfill/apply.d.ts.map +1 -0
  5. package/dist/backfill/apply.js +111 -0
  6. package/dist/backfill/apply.js.map +1 -0
  7. package/dist/backfill/derive.d.ts +25 -0
  8. package/dist/backfill/derive.d.ts.map +1 -0
  9. package/dist/backfill/derive.js +142 -0
  10. package/dist/backfill/derive.js.map +1 -0
  11. package/dist/backfill/index.d.ts +2 -0
  12. package/dist/backfill/index.d.ts.map +1 -0
  13. package/dist/backfill/index.js +232 -0
  14. package/dist/backfill/index.js.map +1 -0
  15. package/dist/backfill/plan.d.ts +19 -0
  16. package/dist/backfill/plan.d.ts.map +1 -0
  17. package/dist/backfill/plan.js +157 -0
  18. package/dist/backfill/plan.js.map +1 -0
  19. package/dist/backfill/types.d.ts +19 -0
  20. package/dist/backfill/types.d.ts.map +1 -0
  21. package/dist/backfill/types.js +10 -0
  22. package/dist/backfill/types.js.map +1 -0
  23. package/dist/cli.d.ts.map +1 -1
  24. package/dist/cli.js +15 -0
  25. package/dist/cli.js.map +1 -1
  26. package/dist/curation/lint.d.ts +3 -0
  27. package/dist/curation/lint.d.ts.map +1 -1
  28. package/dist/curation/lint.js +5 -0
  29. package/dist/curation/lint.js.map +1 -1
  30. package/dist/curation/staged-actions.d.ts +68 -0
  31. package/dist/curation/staged-actions.d.ts.map +1 -0
  32. package/dist/curation/staged-actions.js +394 -0
  33. package/dist/curation/staged-actions.js.map +1 -0
  34. package/dist/eval/generate.d.ts +12 -0
  35. package/dist/eval/generate.d.ts.map +1 -0
  36. package/dist/eval/generate.js +221 -0
  37. package/dist/eval/generate.js.map +1 -0
  38. package/dist/eval/index.d.ts +2 -0
  39. package/dist/eval/index.d.ts.map +1 -0
  40. package/dist/eval/index.js +311 -0
  41. package/dist/eval/index.js.map +1 -0
  42. package/dist/eval/llm.d.ts +47 -0
  43. package/dist/eval/llm.d.ts.map +1 -0
  44. package/dist/eval/llm.js +165 -0
  45. package/dist/eval/llm.js.map +1 -0
  46. package/dist/eval/prompts.d.ts +5 -0
  47. package/dist/eval/prompts.d.ts.map +1 -0
  48. package/dist/eval/prompts.js +44 -0
  49. package/dist/eval/prompts.js.map +1 -0
  50. package/dist/eval/run.d.ts +13 -0
  51. package/dist/eval/run.d.ts.map +1 -0
  52. package/dist/eval/run.js +78 -0
  53. package/dist/eval/run.js.map +1 -0
  54. package/dist/eval/score.d.ts +12 -0
  55. package/dist/eval/score.d.ts.map +1 -0
  56. package/dist/eval/score.js +154 -0
  57. package/dist/eval/score.js.map +1 -0
  58. package/dist/eval/storage.d.ts +10 -0
  59. package/dist/eval/storage.d.ts.map +1 -0
  60. package/dist/eval/storage.js +69 -0
  61. package/dist/eval/storage.js.map +1 -0
  62. package/dist/eval/subgraph.d.ts +17 -0
  63. package/dist/eval/subgraph.d.ts.map +1 -0
  64. package/dist/eval/subgraph.js +214 -0
  65. package/dist/eval/subgraph.js.map +1 -0
  66. package/dist/eval/tool-surface.d.ts +7 -0
  67. package/dist/eval/tool-surface.d.ts.map +1 -0
  68. package/dist/eval/tool-surface.js +160 -0
  69. package/dist/eval/tool-surface.js.map +1 -0
  70. package/dist/eval/types.d.ts +173 -0
  71. package/dist/eval/types.d.ts.map +1 -0
  72. package/dist/eval/types.js +44 -0
  73. package/dist/eval/types.js.map +1 -0
  74. package/dist/index.d.ts.map +1 -1
  75. package/dist/index.js +11 -1
  76. package/dist/index.js.map +1 -1
  77. package/dist/search/reindex.d.ts.map +1 -1
  78. package/dist/search/reindex.js +6 -0
  79. package/dist/search/reindex.js.map +1 -1
  80. package/dist/server.d.ts.map +1 -1
  81. package/dist/server.js +2 -0
  82. package/dist/server.js.map +1 -1
  83. package/dist/storage/index-db.d.ts +19 -0
  84. package/dist/storage/index-db.d.ts.map +1 -1
  85. package/dist/storage/index-db.js +56 -0
  86. package/dist/storage/index-db.js.map +1 -1
  87. package/dist/tools/curation.d.ts +2 -1
  88. package/dist/tools/curation.d.ts.map +1 -1
  89. package/dist/tools/curation.js +18 -4
  90. package/dist/tools/curation.js.map +1 -1
  91. package/dist/tools/staged-actions.d.ts +18 -0
  92. package/dist/tools/staged-actions.d.ts.map +1 -0
  93. package/dist/tools/staged-actions.js +275 -0
  94. package/dist/tools/staged-actions.js.map +1 -0
  95. package/dist/utils/config.d.ts +1 -0
  96. package/dist/utils/config.d.ts.map +1 -1
  97. package/dist/utils/config.js +32 -0
  98. package/dist/utils/config.js.map +1 -1
  99. package/dist/utils/git.d.ts +6 -0
  100. package/dist/utils/git.d.ts.map +1 -1
  101. package/dist/utils/git.js +34 -0
  102. package/dist/utils/git.js.map +1 -1
  103. package/package.json +2 -1
@@ -0,0 +1,165 @@
1
+ // src/eval/llm.ts
2
+ // Single-point wrapper around @anthropic-ai/sdk. Other eval modules depend
3
+ // on the LlmClient interface, not the SDK, so they can be unit-tested with
4
+ // hand-rolled mocks.
5
+ import Anthropic from "@anthropic-ai/sdk";
6
+ import { err, ok } from "../frontmatter/types.js";
7
+ export function createAnthropicClient() {
8
+ const apiKey = process.env.ANTHROPIC_API_KEY;
9
+ if (!apiKey)
10
+ throw new Error("ANTHROPIC_API_KEY env var is required for daftari eval");
11
+ const client = new Anthropic({ apiKey });
12
+ const complete = async (opts) => {
13
+ return retry(async () => {
14
+ const res = await client.messages.create({
15
+ model: opts.model,
16
+ max_tokens: opts.maxTokens ?? 4096,
17
+ system: opts.system,
18
+ messages: [{ role: "user", content: opts.user }],
19
+ });
20
+ const text = res.content
21
+ .filter((b) => b.type === "text")
22
+ .map((b) => b.text)
23
+ .join("");
24
+ return ok({
25
+ text,
26
+ input_tokens: res.usage.input_tokens,
27
+ output_tokens: res.usage.output_tokens,
28
+ stop_reason: res.stop_reason ?? "unknown",
29
+ });
30
+ });
31
+ };
32
+ const completeJson = async (opts) => {
33
+ // The schema is embedded in the system prompt as a hint to the LLM, then
34
+ // the response goes through JSON.parse + a manual shape check by the
35
+ // caller (see generate.ts and score.ts). This is NOT strict JSON Schema
36
+ // validation — there is no schema validator dep in v1. Callers must
37
+ // verify required fields exist after parse. If we ever need strict
38
+ // validation, add `ajv` and validate `parsed` here.
39
+ const sysWithSchema = `${opts.system}\n\nReturn JSON matching:\n${JSON.stringify(opts.schema, null, 2)}\nReturn ONLY JSON, no prose.`;
40
+ const r = await complete({ ...opts, system: sysWithSchema });
41
+ if (!r.ok)
42
+ return r;
43
+ try {
44
+ const parsed = JSON.parse(stripCodeFence(r.value.text));
45
+ return ok({ ...r.value, parsed });
46
+ }
47
+ catch (e) {
48
+ const msg = e instanceof Error ? e.message : String(e);
49
+ return err({
50
+ kind: "llm",
51
+ message: `JSON parse: ${msg} — output was: ${r.value.text.slice(0, 200)}`,
52
+ retryable: false,
53
+ });
54
+ }
55
+ };
56
+ const completeWithTools = async (opts) => {
57
+ const maxRounds = opts.maxRounds ?? 12;
58
+ const toolCalls = [];
59
+ const messages = [
60
+ { role: "user", content: opts.user },
61
+ ];
62
+ let totalIn = 0;
63
+ let totalOut = 0;
64
+ let lastStop = "unknown";
65
+ for (let round = 0; round < maxRounds; round++) {
66
+ const res = await retry(async () => ok(await client.messages.create({
67
+ model: opts.model,
68
+ max_tokens: opts.maxTokens ?? 4096,
69
+ system: opts.system,
70
+ // biome-ignore lint/suspicious/noExplicitAny: SDK types
71
+ tools: opts.tools,
72
+ // biome-ignore lint/suspicious/noExplicitAny: SDK types
73
+ messages: messages,
74
+ })));
75
+ if (!res.ok)
76
+ return res;
77
+ const message = res.value;
78
+ totalIn += message.usage.input_tokens;
79
+ totalOut += message.usage.output_tokens;
80
+ lastStop = message.stop_reason ?? "unknown";
81
+ // biome-ignore lint/suspicious/noExplicitAny: SDK content union
82
+ const blocks = message.content;
83
+ const toolUses = blocks.filter((b) => b.type === "tool_use");
84
+ if (toolUses.length === 0) {
85
+ const text = blocks
86
+ .filter((b) => b.type === "text")
87
+ .map((b) => b.text)
88
+ .join("");
89
+ return ok({
90
+ text,
91
+ input_tokens: totalIn,
92
+ output_tokens: totalOut,
93
+ stop_reason: lastStop,
94
+ tool_calls: toolCalls,
95
+ });
96
+ }
97
+ messages.push({ role: "assistant", content: blocks });
98
+ const toolResults = [];
99
+ for (const tu of toolUses) {
100
+ const t0 = Date.now();
101
+ let output;
102
+ try {
103
+ output = await opts.toolHandler(tu.name, tu.input);
104
+ }
105
+ catch (e) {
106
+ output = { tool_error: e instanceof Error ? e.message : String(e) };
107
+ }
108
+ const latency = Date.now() - t0;
109
+ toolCalls.push({ tool: tu.name, input: tu.input, output, latency_ms: latency });
110
+ toolResults.push({
111
+ type: "tool_result",
112
+ tool_use_id: tu.id,
113
+ content: typeof output === "string" ? output : JSON.stringify(output),
114
+ });
115
+ }
116
+ messages.push({ role: "user", content: toolResults });
117
+ }
118
+ return err({
119
+ kind: "llm",
120
+ message: `exceeded maxRounds (${maxRounds}) without final answer`,
121
+ retryable: false,
122
+ });
123
+ };
124
+ return { complete, completeJson, completeWithTools };
125
+ }
126
+ // --- helpers ---
127
+ const MAX_RETRIES = 5;
128
+ const BASE_BACKOFF_MS = 500;
129
+ const MAX_BACKOFF_MS = 60_000;
130
+ // Exported for unit testing — these two pure helpers carry the trickiest logic
131
+ // in this module (retry arithmetic/predicate, fence stripping) and would
132
+ // otherwise be unreachable, since createAnthropicClient news up the SDK.
133
+ export async function retry(fn) {
134
+ let lastErr = null;
135
+ for (let i = 0; i < MAX_RETRIES; i++) {
136
+ try {
137
+ const r = await fn();
138
+ if (r.ok)
139
+ return r;
140
+ if (!r.error || r.error.kind !== "llm" || !r.error.retryable)
141
+ return r;
142
+ lastErr = r.error;
143
+ }
144
+ catch (e) {
145
+ const msg = e instanceof Error ? e.message : String(e);
146
+ const status = e?.status;
147
+ const retryable = status === 429 || (typeof status === "number" && status >= 500);
148
+ if (!retryable)
149
+ return err({ kind: "llm", message: msg, retryable: false });
150
+ lastErr = { kind: "llm", message: msg, retryable: true };
151
+ }
152
+ // Don't sleep after the final attempt — the loop is about to exit and
153
+ // surface the error; a trailing backoff would just delay the failure.
154
+ if (i < MAX_RETRIES - 1) {
155
+ const backoff = Math.min(BASE_BACKOFF_MS * 2 ** i, MAX_BACKOFF_MS);
156
+ await new Promise((res) => setTimeout(res, backoff));
157
+ }
158
+ }
159
+ return err(lastErr ?? { kind: "llm", message: "retries exhausted", retryable: false });
160
+ }
161
/**
 * Remove a single Markdown code fence wrapped around an LLM reply.
 *
 * Accepts an optional `json` language tag (any letter case), leading or
 * trailing whitespace around the fence, spaces after the tag, and LF or CRLF
 * line endings. If the whole string is not one fenced block it is returned
 * unchanged, so plain JSON replies pass straight through.
 *
 * @param s raw model output.
 * @returns the text between the fences, or `s` itself when no fence wraps it.
 */
export function stripCodeFence(s) {
    const fenced = s.match(/^\s*```[ \t]*(?:json)?[ \t]*\r?\n([\s\S]*?)\r?\n[ \t]*```\s*$/i);
    return fenced ? fenced[1] : s;
}
165
+ //# sourceMappingURL=llm.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"llm.js","sourceRoot":"","sources":["../../src/eval/llm.ts"],"names":[],"mappings":"AAAA,kBAAkB;AAClB,2EAA2E;AAC3E,2EAA2E;AAC3E,qBAAqB;AAErB,OAAO,SAAS,MAAM,mBAAmB,CAAC;AAC1C,OAAO,EAAE,GAAG,EAAE,EAAE,EAAe,MAAM,yBAAyB,CAAC;AAmD/D,MAAM,UAAU,qBAAqB;IACnC,MAAM,MAAM,GAAG,OAAO,CAAC,GAAG,CAAC,iBAAiB,CAAC;IAC7C,IAAI,CAAC,MAAM;QAAE,MAAM,IAAI,KAAK,CAAC,wDAAwD,CAAC,CAAC;IACvF,MAAM,MAAM,GAAG,IAAI,SAAS,CAAC,EAAE,MAAM,EAAE,CAAC,CAAC;IAEzC,MAAM,QAAQ,GAAG,KAAK,EAAE,IAAkB,EAAoD,EAAE;QAC9F,OAAO,KAAK,CAAC,KAAK,IAAI,EAAE;YACtB,MAAM,GAAG,GAAG,MAAM,MAAM,CAAC,QAAQ,CAAC,MAAM,CAAC;gBACvC,KAAK,EAAE,IAAI,CAAC,KAAK;gBACjB,UAAU,EAAE,IAAI,CAAC,SAAS,IAAI,IAAI;gBAClC,MAAM,EAAE,IAAI,CAAC,MAAM;gBACnB,QAAQ,EAAE,CAAC,EAAE,IAAI,EAAE,MAAM,EAAE,OAAO,EAAE,IAAI,CAAC,IAAI,EAAE,CAAC;aACjD,CAAC,CAAC;YACH,MAAM,IAAI,GAAG,GAAG,CAAC,OAAO;iBACrB,MAAM,CAAC,CAAC,CAAC,EAAwD,EAAE,CAAC,CAAC,CAAC,IAAI,KAAK,MAAM,CAAC;iBACtF,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,IAAI,CAAC;iBAClB,IAAI,CAAC,EAAE,CAAC,CAAC;YACZ,OAAO,EAAE,CAAC;gBACR,IAAI;gBACJ,YAAY,EAAE,GAAG,CAAC,KAAK,CAAC,YAAY;gBACpC,aAAa,EAAE,GAAG,CAAC,KAAK,CAAC,aAAa;gBACtC,WAAW,EAAE,GAAG,CAAC,WAAW,IAAI,SAAS;aAC1C,CAAC,CAAC;QACL,CAAC,CAAC,CAAC;IACL,CAAC,CAAC;IAEF,MAAM,YAAY,GAAG,KAAK,EACxB,IAAsB,EACgC,EAAE;QACxD,yEAAyE;QACzE,qEAAqE;QACrE,wEAAwE;QACxE,oEAAoE;QACpE,mEAAmE;QACnE,oDAAoD;QACpD,MAAM,aAAa,GAAG,GAAG,IAAI,CAAC,MAAM,8BAA8B,IAAI,CAAC,SAAS,CAAC,IAAI,CAAC,MAAM,EAAE,IAAI,EAAE,CAAC,CAAC,+BAA+B,CAAC;QACtI,MAAM,CAAC,GAAG,MAAM,QAAQ,CAAC,EAAE,GAAG,IAAI,EAAE,MAAM,EAAE,aAAa,EAAE,CAAC,CAAC;QAC7D,IAAI,CAAC,CAAC,CAAC,EAAE;YAAE,OAAO,CAAC,CAAC;QACpB,IAAI,CAAC;YACH,MAAM,MAAM,GAAG,IAAI,CAAC,KAAK,CAAC,cAAc,CAAC,CAAC,CAAC,KAAK,CAAC,IAAI,CAAC,CAAC,CAAC;YACxD,OAAO,EAAE,CAAC,EAAE,GAAG,CAAC,CAAC,KAAK,EAAE,MAAM,EAAE,CAAC,CAAC;QACpC,CAAC;QAAC,OAAO,CAAC,EAAE,CAAC;YACX,MAAM,GAAG,GAAG,CAAC,YAAY,KAAK,CAAC,CAAC,CAAC,CAAC,CAAC,OAAO,CAAC,CAAC,CAAC,MAAM,CAAC,CAAC,CAAC,CAAC;YACvD,OAAO,GAAG,CAAC;gBACT,IAAI,EAAE,KAAK;gBACX,OAAO,EAAE,eAAe,GAAG,kBAAkB,CAAC,CAAC,KAAK,CAAC,IAAI,CA
AC,KAAK,CAAC,CAAC,EAAE,GAAG,CAAC,EAAE;gBACzE,SAAS,EAAE,KAAK;aACjB,CAAC,CAAC;QACL,CAAC;IACH,CAAC,CAAC;IAEF,MAAM,iBAAiB,GAAG,KAAK,EAC7B,IAA2B,EACgC,EAAE;QAC7D,MAAM,SAAS,GAAG,IAAI,CAAC,SAAS,IAAI,EAAE,CAAC;QACvC,MAAM,SAAS,GAA0C,EAAE,CAAC;QAC5D,MAAM,QAAQ,GAA4D;YACxE,EAAE,IAAI,EAAE,MAAM,EAAE,OAAO,EAAE,IAAI,CAAC,IAAI,EAAE;SACrC,CAAC;QACF,IAAI,OAAO,GAAG,CAAC,CAAC;QAChB,IAAI,QAAQ,GAAG,CAAC,CAAC;QACjB,IAAI,QAAQ,GAAG,SAAS,CAAC;QAEzB,KAAK,IAAI,KAAK,GAAG,CAAC,EAAE,KAAK,GAAG,SAAS,EAAE,KAAK,EAAE,EAAE,CAAC;YAC/C,MAAM,GAAG,GAAG,MAAM,KAAK,CAAC,KAAK,IAAI,EAAE,CACjC,EAAE,CACA,MAAM,MAAM,CAAC,QAAQ,CAAC,MAAM,CAAC;gBAC3B,KAAK,EAAE,IAAI,CAAC,KAAK;gBACjB,UAAU,EAAE,IAAI,CAAC,SAAS,IAAI,IAAI;gBAClC,MAAM,EAAE,IAAI,CAAC,MAAM;gBACnB,wDAAwD;gBACxD,KAAK,EAAE,IAAI,CAAC,KAAY;gBACxB,wDAAwD;gBACxD,QAAQ,EAAE,QAAe;aAC1B,CAAC,CACH,CACF,CAAC;YACF,IAAI,CAAC,GAAG,CAAC,EAAE;gBAAE,OAAO,GAAG,CAAC;YACxB,MAAM,OAAO,GAAG,GAAG,CAAC,KAAK,CAAC;YAC1B,OAAO,IAAI,OAAO,CAAC,KAAK,CAAC,YAAY,CAAC;YACtC,QAAQ,IAAI,OAAO,CAAC,KAAK,CAAC,aAAa,CAAC;YACxC,QAAQ,GAAG,OAAO,CAAC,WAAW,IAAI,SAAS,CAAC;YAE5C,gEAAgE;YAChE,MAAM,MAAM,GAAG,OAAO,CAAC,OAAgB,CAAC;YACxC,MAAM,QAAQ,GAAG,MAAM,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,IAAI,KAAK,UAAU,CAAC,CAAC;YAC7D,IAAI,QAAQ,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;gBAC1B,MAAM,IAAI,GAAG,MAAM;qBAChB,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,IAAI,KAAK,MAAM,CAAC;qBAChC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,IAAI,CAAC;qBAClB,IAAI,CAAC,EAAE,CAAC,CAAC;gBACZ,OAAO,EAAE,CAAC;oBACR,IAAI;oBACJ,YAAY,EAAE,OAAO;oBACrB,aAAa,EAAE,QAAQ;oBACvB,WAAW,EAAE,QAAQ;oBACrB,UAAU,EAAE,SAAS;iBACtB,CAAC,CAAC;YACL,CAAC;YAED,QAAQ,CAAC,IAAI,CAAC,EAAE,IAAI,EAAE,WAAW,EAAE,OAAO,EAAE,MAAM,EAAE,CAAC,CAAC;YAEtD,MAAM,WAAW,GAAc,EAAE,CAAC;YAClC,KAAK,MAAM,EAAE,IAAI,QAAQ,EAAE,CAAC;gBAC1B,MAAM,EAAE,GAAG,IAAI,CAAC,GAAG,EAAE,CAAC;gBACtB,IAAI,MAAe,CAAC;gBACpB,IAAI,CAAC;oBACH,MAAM,GAAG,MAAM,IAAI,CAAC,WAAW,CAAC,EAAE,CAAC,IAAI,EAAE,EAAE,CAAC,KAAK,CAAC,CAAC;gBACrD,CAAC;gBAAC,OAAO,CAAC,EAAE,CAAC;oBACX,MAAM,GAAG,EAAE,UAAU,EAAE,CAAC,YAAY,KAAK,CAAC,CAAC
,CAAC,CAAC,CAAC,OAAO,CAAC,CAAC,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,CAAC;gBACtE,CAAC;gBACD,MAAM,OAAO,GAAG,IAAI,CAAC,GAAG,EAAE,GAAG,EAAE,CAAC;gBAChC,SAAS,CAAC,IAAI,CAAC,EAAE,IAAI,EAAE,EAAE,CAAC,IAAI,EAAE,KAAK,EAAE,EAAE,CAAC,KAAK,EAAE,MAAM,EAAE,UAAU,EAAE,OAAO,EAAE,CAAC,CAAC;gBAChF,WAAW,CAAC,IAAI,CAAC;oBACf,IAAI,EAAE,aAAa;oBACnB,WAAW,EAAE,EAAE,CAAC,EAAE;oBAClB,OAAO,EAAE,OAAO,MAAM,KAAK,QAAQ,CAAC,CAAC,CAAC,MAAM,CAAC,CAAC,CAAC,IAAI,CAAC,SAAS,CAAC,MAAM,CAAC;iBACtE,CAAC,CAAC;YACL,CAAC;YACD,QAAQ,CAAC,IAAI,CAAC,EAAE,IAAI,EAAE,MAAM,EAAE,OAAO,EAAE,WAAW,EAAE,CAAC,CAAC;QACxD,CAAC;QACD,OAAO,GAAG,CAAC;YACT,IAAI,EAAE,KAAK;YACX,OAAO,EAAE,uBAAuB,SAAS,wBAAwB;YACjE,SAAS,EAAE,KAAK;SACjB,CAAC,CAAC;IACL,CAAC,CAAC;IAEF,OAAO,EAAE,QAAQ,EAAE,YAAY,EAAE,iBAAiB,EAAE,CAAC;AACvD,CAAC;AAED,kBAAkB;AAElB,MAAM,WAAW,GAAG,CAAC,CAAC;AACtB,MAAM,eAAe,GAAG,GAAG,CAAC;AAC5B,MAAM,cAAc,GAAG,MAAM,CAAC;AAE9B,+EAA+E;AAC/E,yEAAyE;AACzE,yEAAyE;AACzE,MAAM,CAAC,KAAK,UAAU,KAAK,CACzB,EAA6C;IAE7C,IAAI,OAAO,GAA2B,IAAI,CAAC;IAC3C,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,WAAW,EAAE,CAAC,EAAE,EAAE,CAAC;QACrC,IAAI,CAAC;YACH,MAAM,CAAC,GAAG,MAAM,EAAE,EAAE,CAAC;YACrB,IAAI,CAAC,CAAC,EAAE;gBAAE,OAAO,CAAC,CAAC;YACnB,IAAI,CAAC,CAAC,CAAC,KAAK,IAAI,CAAC,CAAC,KAAK,CAAC,IAAI,KAAK,KAAK,IAAI,CAAC,CAAC,CAAC,KAAK,CAAC,SAAS;gBAAE,OAAO,CAAC,CAAC;YACvE,OAAO,GAAG,CAAC,CAAC,KAAK,CAAC;QACpB,CAAC;QAAC,OAAO,CAAC,EAAE,CAAC;YACX,MAAM,GAAG,GAAG,CAAC,YAAY,KAAK,CAAC,CAAC,CAAC,CAAC,CAAC,OAAO,CAAC,CAAC,CAAC,MAAM,CAAC,CAAC,CAAC,CAAC;YACvD,MAAM,MAAM,GAAI,CAAyB,EAAE,MAAM,CAAC;YAClD,MAAM,SAAS,GAAG,MAAM,KAAK,GAAG,IAAI,CAAC,OAAO,MAAM,KAAK,QAAQ,IAAI,MAAM,IAAI,GAAG,CAAC,CAAC;YAClF,IAAI,CAAC,SAAS;gBAAE,OAAO,GAAG,CAAC,EAAE,IAAI,EAAE,KAAK,EAAE,OAAO,EAAE,GAAG,EAAE,SAAS,EAAE,KAAK,EAAE,CAAC,CAAC;YAC5E,OAAO,GAAG,EAAE,IAAI,EAAE,KAAK,EAAE,OAAO,EAAE,GAAG,EAAE,SAAS,EAAE,IAAI,EAAE,CAAC;QAC3D,CAAC;QACD,sEAAsE;QACtE,sEAAsE;QACtE,IAAI,CAAC,GAAG,WAAW,GAAG,CAAC,EAAE,CAAC;YACxB,MAAM,OAAO,GAAG,IAAI,CAAC,GAAG,CAAC,eAAe,GAAG,CAAC,IAAI,CAAC,EAAE,cAAc,CAAC,CAAC;YACnE,MAAM,IAAI,OAAO,CAAC,CAAC,GAA
G,EAAE,EAAE,CAAC,UAAU,CAAC,GAAG,EAAE,OAAO,CAAC,CAAC,CAAC;QACvD,CAAC;IACH,CAAC;IACD,OAAO,GAAG,CAAC,OAAO,IAAI,EAAE,IAAI,EAAE,KAAK,EAAE,OAAO,EAAE,mBAAmB,EAAE,SAAS,EAAE,KAAK,EAAE,CAAC,CAAC;AACzF,CAAC;AAED,MAAM,UAAU,cAAc,CAAC,CAAS;IACtC,MAAM,CAAC,GAAG,CAAC,CAAC,KAAK,CAAC,oCAAoC,CAAC,CAAC;IACxD,OAAO,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC;AACtB,CAAC"}
@@ -0,0 +1,5 @@
1
/** Version number of the prompt set declared in this module. */
+ export declare const PROMPT_VERSION = 1;
2
/**
 * Prompt for the question-generator role: asks for multi-hop questions in the
 * tiers retrieval, cross_reference and contradiction, answerable only from the
 * supplied docs.
 */
+ export declare const GENERATOR_PROMPT = "You will read a connected subgraph of a Markdown knowledge vault and produce\nmulti-hop questions across three tiers. The questions must be answerable using\nONLY the docs provided. For each question, supply: question text, tier,\nexpected answer, source paths (must be a subset of the supplied docs).\n\nTiers:\n retrieval \u2014 single-doc lookup, 1-hop reasoning\n cross_reference \u2014 requires combining 2\u20133 docs\n contradiction \u2014 surfaces a tension or conflict across docs (use the\n tension log entries in the subgraph as seed material\n where present)\n\nReturn JSON matching the QuestionSetSchema declared in src/eval/types.ts.\nDo not include questions whose expected_sources are not in the supplied docs.\nDo not generate trivial yes/no questions.";
3
/**
 * System prompt for the answerer role: answer using only the provided tools,
 * reply "Vault does not contain the answer." when applicable, and cite sources
 * as [path/to/doc.md].
 */
+ export declare const ANSWERER_SYSTEM_PROMPT = "You will answer a question about a Markdown knowledge vault using ONLY the\nprovided Daftari tools. Do not use training knowledge. Do not guess. If the\nvault does not contain the answer, say \"Vault does not contain the answer.\"\nCite source paths in your final answer using the format [path/to/doc.md].";
4
/**
 * Template for the grader role. Contains the placeholders {{QUESTION}},
 * {{EXPECTED_ANSWER}}, {{EXPECTED_SOURCES}}, {{CLAIMED_ANSWER}} and
 * {{CITED_SOURCES}} — presumably substituted by the caller before sending;
 * the substitution code is not part of this declaration file.
 */
+ export declare const GRADER_PROMPT = "You are grading an answer to a question about a Markdown knowledge vault.\n\nQuestion: {{QUESTION}}\nExpected answer: {{EXPECTED_ANSWER}}\nExpected sources: {{EXPECTED_SOURCES}}\nClaimed answer: {{CLAIMED_ANSWER}}\nCited sources: {{CITED_SOURCES}}\n\nReturn JSON: {\"correct\": \"yes\" | \"partial\" | \"no\", \"reasoning\": \"<string>\"}\n\nDefinitions:\n yes \u2014 claimed answer is substantively correct and cites at least one\n expected source\n partial \u2014 claimed answer is partially correct OR cites the right sources\n but misses key content OR the answerer correctly said \"Vault\n does not contain the answer\" when the expected answer disagrees\n (records a question-set quality issue, not a cortex failure)\n no \u2014 claimed answer is wrong, hallucinated, or cites no expected\n sources";
5
+ //# sourceMappingURL=prompts.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"prompts.d.ts","sourceRoot":"","sources":["../../src/eval/prompts.ts"],"names":[],"mappings":"AAKA,eAAO,MAAM,cAAc,IAAI,CAAC;AAEhC,eAAO,MAAM,gBAAgB,4zBAca,CAAC;AAE3C,eAAO,MAAM,sBAAsB,sTAGuC,CAAC;AAE3E,eAAO,MAAM,aAAa,o4BAkBN,CAAC"}
@@ -0,0 +1,44 @@
1
+ // src/eval/prompts.ts
2
+ // Frozen prompts for the three eval LLM roles. Bumping any prompt requires
3
+ // bumping PROMPT_VERSION in the same commit. PROMPT_VERSION is recorded in
4
+ // every output file for forensics and cross-version comparison gates.
5
// Version stamp for the prompt set below; per the header comment it must be
// bumped in the same commit as any prompt change.
+ export const PROMPT_VERSION = 1;
6
// Generator role: asks the LLM for multi-hop questions in three tiers
// (retrieval, cross_reference, contradiction) answerable only from the
// supplied docs.
// NOTE(review): the text refers to "QuestionSetSchema declared in
// src/eval/types.ts", which the model cannot read — presumably the caller
// supplies the schema separately; confirm in generate.ts.
+ export const GENERATOR_PROMPT = `You will read a connected subgraph of a Markdown knowledge vault and produce
7
+ multi-hop questions across three tiers. The questions must be answerable using
8
+ ONLY the docs provided. For each question, supply: question text, tier,
9
+ expected answer, source paths (must be a subset of the supplied docs).
10
+
11
+ Tiers:
12
+ retrieval — single-doc lookup, 1-hop reasoning
13
+ cross_reference — requires combining 2–3 docs
14
+ contradiction — surfaces a tension or conflict across docs (use the
15
+ tension log entries in the subgraph as seed material
16
+ where present)
17
+
18
+ Return JSON matching the QuestionSetSchema declared in src/eval/types.ts.
19
+ Do not include questions whose expected_sources are not in the supplied docs.
20
+ Do not generate trivial yes/no questions.`;
21
// Answerer role: system prompt restricting the model to the provided tools,
// with a fixed refusal sentence and a [path/to/doc.md] citation format.
+ export const ANSWERER_SYSTEM_PROMPT = `You will answer a question about a Markdown knowledge vault using ONLY the
22
+ provided Daftari tools. Do not use training knowledge. Do not guess. If the
23
+ vault does not contain the answer, say "Vault does not contain the answer."
24
+ Cite source paths in your final answer using the format [path/to/doc.md].`;
25
// Grader role: template with {{QUESTION}}, {{EXPECTED_ANSWER}},
// {{EXPECTED_SOURCES}}, {{CLAIMED_ANSWER}} and {{CITED_SOURCES}} placeholders;
// asks for a JSON verdict of "yes" | "partial" | "no" plus reasoning.
// Presumably the placeholders are filled in by the grading code — verify there.
+ export const GRADER_PROMPT = `You are grading an answer to a question about a Markdown knowledge vault.
26
+
27
+ Question: {{QUESTION}}
28
+ Expected answer: {{EXPECTED_ANSWER}}
29
+ Expected sources: {{EXPECTED_SOURCES}}
30
+ Claimed answer: {{CLAIMED_ANSWER}}
31
+ Cited sources: {{CITED_SOURCES}}
32
+
33
+ Return JSON: {"correct": "yes" | "partial" | "no", "reasoning": "<string>"}
34
+
35
+ Definitions:
36
+ yes — claimed answer is substantively correct and cites at least one
37
+ expected source
38
+ partial — claimed answer is partially correct OR cites the right sources
39
+ but misses key content OR the answerer correctly said "Vault
40
+ does not contain the answer" when the expected answer disagrees
41
+ (records a question-set quality issue, not a cortex failure)
42
+ no — claimed answer is wrong, hallucinated, or cites no expected
43
+ sources`;
44
+ //# sourceMappingURL=prompts.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"prompts.js","sourceRoot":"","sources":["../../src/eval/prompts.ts"],"names":[],"mappings":"AAAA,sBAAsB;AACtB,2EAA2E;AAC3E,2EAA2E;AAC3E,sEAAsE;AAEtE,MAAM,CAAC,MAAM,cAAc,GAAG,CAAC,CAAC;AAEhC,MAAM,CAAC,MAAM,gBAAgB,GAAG;;;;;;;;;;;;;;0CAcU,CAAC;AAE3C,MAAM,CAAC,MAAM,sBAAsB,GAAG;;;0EAGoC,CAAC;AAE3E,MAAM,CAAC,MAAM,aAAa,GAAG;;;;;;;;;;;;;;;;;;oBAkBT,CAAC"}
@@ -0,0 +1,13 @@
1
+ import { type Result } from "../frontmatter/types.js";
2
+ import type { LlmClient } from "./llm.js";
3
+ import type { CortexEvalError, EvalRun, QuestionSet } from "./types.js";
4
/** Options controlling a `runAnswerer` invocation. */
+ export interface RunOptions {
5
/** Number of independent answerer runs performed per question. */
+ k: number;
6
/** Model id passed to the answerer LLM and recorded on the run as `answerer_model`. */
+ model: string;
7
/** Earlier run to continue: its id and recorded pairs are reused, and pairs already marked complete are skipped. */
+ resumeFrom?: EvalRun;
8
/** Explicit run id, used when `resumeFrom` does not supply one. */
+ runId?: string;
9
/** Timestamp recorded on the run; also part of the generated id when neither `resumeFrom` nor `runId` gives one. */
+ timestamp?: string;
10
/** Awaited with the current run snapshot after each (question, k) pair is recorded, whether complete or incomplete. */
+ persist?: (run: EvalRun) => Promise<void>;
11
+ }
12
/**
 * Runs the answerer LLM `opts.k` times for every question against the tool
 * surface built for `vaultRoot`, recording one trace per (question, k) pair.
 *
 * @param questions question set to answer.
 * @param vaultRoot vault location handed to the tool surface.
 * @param llm client whose `completeWithTools` performs each answerer run.
 * @param opts run configuration, see `RunOptions`.
 * @returns the finished `EvalRun`, or the error of the first failed LLM call
 *   (the failing pair is marked incomplete and persisted before returning).
 */
+ export declare function runAnswerer(questions: QuestionSet, vaultRoot: string, llm: LlmClient, opts: RunOptions): Promise<Result<EvalRun, CortexEvalError>>;
13
+ //# sourceMappingURL=run.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"run.d.ts","sourceRoot":"","sources":["../../src/eval/run.ts"],"names":[],"mappings":"AAMA,OAAO,EAAW,KAAK,MAAM,EAAE,MAAM,yBAAyB,CAAC;AAC/D,OAAO,KAAK,EAAE,SAAS,EAAW,MAAM,UAAU,CAAC;AAGnD,OAAO,KAAK,EAAE,eAAe,EAAE,OAAO,EAAgB,WAAW,EAAS,MAAM,YAAY,CAAC;AAE7F,MAAM,WAAW,UAAU;IACzB,CAAC,EAAE,MAAM,CAAC;IACV,KAAK,EAAE,MAAM,CAAC;IACd,UAAU,CAAC,EAAE,OAAO,CAAC;IACrB,KAAK,CAAC,EAAE,MAAM,CAAC;IACf,SAAS,CAAC,EAAE,MAAM,CAAC;IACnB,OAAO,CAAC,EAAE,CAAC,GAAG,EAAE,OAAO,KAAK,OAAO,CAAC,IAAI,CAAC,CAAC;CAC3C;AAED,wBAAsB,WAAW,CAC/B,SAAS,EAAE,WAAW,EACtB,SAAS,EAAE,MAAM,EACjB,GAAG,EAAE,SAAS,EACd,IAAI,EAAE,UAAU,GACf,OAAO,CAAC,MAAM,CAAC,OAAO,EAAE,eAAe,CAAC,CAAC,CAyE3C"}
@@ -0,0 +1,78 @@
1
+ // src/eval/run.ts
2
+ // The answerer loop. For each question, run the answerer LLM k independent
3
+ // times against the in-process tool surface, recording a full trace per run.
4
+ // Results are keyed by `"${question_index}:${k_index}"` so --resume can skip
5
+ // pairs already marked complete and re-run only the rest.
6
+ import { err, ok } from "../frontmatter/types.js";
7
+ import { ANSWERER_SYSTEM_PROMPT, PROMPT_VERSION } from "./prompts.js";
8
+ import { buildToolSurface } from "./tool-surface.js";
9
/**
 * Answerer loop: for each question, run the answerer LLM `opts.k` independent
 * times against the in-process tool surface and record a trace per run.
 *
 * Results are keyed by `"${question_index}:${k_index}"`. When `opts.resumeFrom`
 * is given, its id and recorded pairs are reused and pairs already marked
 * "complete" are skipped. `opts.persist`, if provided, is awaited with a fresh
 * snapshot after every pair so a crash leaves a resumable file behind.
 *
 * @param questions question set; `questions.id` and `questions.questions` are read.
 * @param vaultRoot vault location passed to `buildToolSurface`.
 * @param llm client providing `completeWithTools`.
 * @param opts `{ k, model, resumeFrom?, runId?, timestamp?, persist? }`.
 * @returns `ok(EvalRun)` when every pair completed, otherwise `err` with the
 *   error of the first failed LLM call.
 */
export async function runAnswerer(questions, vaultRoot, llm, opts) {
    // Default to the actual wall-clock time. A fixed literal here would stamp
    // every run with the same false timestamp and make the generated ids of
    // distinct runs (same question set + model) collide.
    const ts = opts.timestamp ?? new Date().toISOString();
    const id = opts.resumeFrom?.id ?? opts.runId ?? `${questions.id}-${opts.model}-${ts}`;
    // Shallow copy so the caller's resumeFrom object is not mutated.
    const runs = { ...(opts.resumeFrom?.runs ?? {}) };
    // Builds the current EvalRun from the live `runs` map. Snapshotting after
    // each (q,k) status change lets the caller persist partial progress, so a
    // failure/crash leaves a resumable file on disk (see --resume).
    const snapshot = () => ({
        id,
        questions_id: questions.id,
        answerer_model: opts.model,
        prompt_version: PROMPT_VERSION,
        timestamp: ts,
        k: opts.k,
        runs,
    });
    const tools = buildToolSurface(vaultRoot);
    const toolDefs = tools.defs;
    for (let qi = 0; qi < questions.questions.length; qi++) {
        const q = questions.questions[qi];
        for (let k = 0; k < opts.k; k++) {
            const key = `${qi}:${k}`;
            if (runs[key]?.status === "complete")
                continue;
            const t0 = Date.now();
            const r = await llm.completeWithTools({
                model: opts.model,
                system: ANSWERER_SYSTEM_PROMPT,
                user: q.question,
                tools: toolDefs,
                toolHandler: tools.handler,
            });
            const wall_ms = Date.now() - t0;
            if (!r.ok) {
                // Mark this pair incomplete, persist the partial run (completed pairs
                // plus this incomplete one), then surface the error. Persisting here
                // is what makes --resume work: progress made before the failure is
                // saved, so a re-run skips the completed pairs and retries the rest.
                runs[key] = {
                    question_id: q.id,
                    question_index: qi,
                    k_index: k,
                    status: "incomplete",
                    trace: null,
                };
                await opts.persist?.(snapshot());
                return err(r.error);
            }
            const trace = {
                tool_calls: r.value.tool_calls,
                final_answer: r.value.text,
                total_tool_calls: r.value.tool_calls.length,
                input_tokens: r.value.input_tokens,
                output_tokens: r.value.output_tokens,
                wall_ms,
                stop_reason: r.value.stop_reason,
            };
            runs[key] = {
                question_id: q.id,
                question_index: qi,
                k_index: k,
                status: "complete",
                trace,
            };
            await opts.persist?.(snapshot());
        }
    }
    return ok(snapshot());
}
78
+ //# sourceMappingURL=run.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"run.js","sourceRoot":"","sources":["../../src/eval/run.ts"],"names":[],"mappings":"AAAA,kBAAkB;AAClB,2EAA2E;AAC3E,6EAA6E;AAC7E,6EAA6E;AAC7E,0DAA0D;AAE1D,OAAO,EAAE,GAAG,EAAE,EAAE,EAAe,MAAM,yBAAyB,CAAC;AAE/D,OAAO,EAAE,sBAAsB,EAAE,cAAc,EAAE,MAAM,cAAc,CAAC;AACtE,OAAO,EAAE,gBAAgB,EAAE,MAAM,mBAAmB,CAAC;AAYrD,MAAM,CAAC,KAAK,UAAU,WAAW,CAC/B,SAAsB,EACtB,SAAiB,EACjB,GAAc,EACd,IAAgB;IAEhB,MAAM,EAAE,GAAG,IAAI,CAAC,SAAS,IAAI,sBAAsB,CAAC;IACpD,MAAM,EAAE,GAAG,IAAI,CAAC,UAAU,EAAE,EAAE,IAAI,IAAI,CAAC,KAAK,IAAI,GAAG,SAAS,CAAC,EAAE,IAAI,IAAI,CAAC,KAAK,IAAI,EAAE,EAAE,CAAC;IACtF,MAAM,IAAI,GAAiC,EAAE,GAAG,CAAC,IAAI,CAAC,UAAU,EAAE,IAAI,IAAI,EAAE,CAAC,EAAE,CAAC;IAEhF,0EAA0E;IAC1E,0EAA0E;IAC1E,gEAAgE;IAChE,MAAM,QAAQ,GAAG,GAAY,EAAE,CAAC,CAAC;QAC/B,EAAE;QACF,YAAY,EAAE,SAAS,CAAC,EAAE;QAC1B,cAAc,EAAE,IAAI,CAAC,KAAK;QAC1B,cAAc,EAAE,cAAc;QAC9B,SAAS,EAAE,EAAE;QACb,CAAC,EAAE,IAAI,CAAC,CAAC;QACT,IAAI;KACL,CAAC,CAAC;IAEH,MAAM,KAAK,GAAG,gBAAgB,CAAC,SAAS,CAAC,CAAC;IAC1C,MAAM,QAAQ,GAAc,KAAK,CAAC,IAAI,CAAC;IAEvC,KAAK,IAAI,EAAE,GAAG,CAAC,EAAE,EAAE,GAAG,SAAS,CAAC,SAAS,CAAC,MAAM,EAAE,EAAE,EAAE,EAAE,CAAC;QACvD,MAAM,CAAC,GAAG,SAAS,CAAC,SAAS,CAAC,EAAE,CAAC,CAAC;QAClC,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,IAAI,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC;YAChC,MAAM,GAAG,GAAG,GAAG,EAAE,IAAI,CAAC,EAAE,CAAC;YACzB,IAAI,IAAI,CAAC,GAAG,CAAC,EAAE,MAAM,KAAK,UAAU;gBAAE,SAAS;YAE/C,MAAM,EAAE,GAAG,IAAI,CAAC,GAAG,EAAE,CAAC;YACtB,MAAM,CAAC,GAAG,MAAM,GAAG,CAAC,iBAAiB,CAAC;gBACpC,KAAK,EAAE,IAAI,CAAC,KAAK;gBACjB,MAAM,EAAE,sBAAsB;gBAC9B,IAAI,EAAE,CAAC,CAAC,QAAQ;gBAChB,KAAK,EAAE,QAAQ;gBACf,WAAW,EAAE,KAAK,CAAC,OAAO;aAC3B,CAAC,CAAC;YACH,MAAM,OAAO,GAAG,IAAI,CAAC,GAAG,EAAE,GAAG,EAAE,CAAC;YAChC,IAAI,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC;gBACV,sEAAsE;gBACtE,qEAAqE;gBACrE,mEAAmE;gBACnE,qEAAqE;gBACrE,IAAI,CAAC,GAAG,CAAC,GAAG;oBACV,WAAW,EAAE,CAAC,CAAC,EAAE;oBACjB,cAAc,EAAE,EAAE;oBAClB,OAAO,EAAE,CAAC;oBACV,MAAM,EAAE,YAAY;oBACpB,KAAK,EAAE,IAAI;iBACZ,CAAC;gBACF,MAAM,IAAI,CAAC,OAAO,EAAE,CAAC,QAAQ,EAAE,CAAC,CAAC;gBACjC,OAAO,GAAG,CAAC,CA
AC,CAAC,KAAK,CAAC,CAAC;YACtB,CAAC;YAED,MAAM,KAAK,GAAU;gBACnB,UAAU,EAAE,CAAC,CAAC,KAAK,CAAC,UAAU;gBAC9B,YAAY,EAAE,CAAC,CAAC,KAAK,CAAC,IAAI;gBAC1B,gBAAgB,EAAE,CAAC,CAAC,KAAK,CAAC,UAAU,CAAC,MAAM;gBAC3C,YAAY,EAAE,CAAC,CAAC,KAAK,CAAC,YAAY;gBAClC,aAAa,EAAE,CAAC,CAAC,KAAK,CAAC,aAAa;gBACpC,OAAO;gBACP,WAAW,EAAE,CAAC,CAAC,KAAK,CAAC,WAAW;aACjC,CAAC;YACF,IAAI,CAAC,GAAG,CAAC,GAAG;gBACV,WAAW,EAAE,CAAC,CAAC,EAAE;gBACjB,cAAc,EAAE,EAAE;gBAClB,OAAO,EAAE,CAAC;gBACV,MAAM,EAAE,UAAU;gBAClB,KAAK;aACN,CAAC;YACF,MAAM,IAAI,CAAC,OAAO,EAAE,CAAC,QAAQ,EAAE,CAAC,CAAC;QACnC,CAAC;IACH,CAAC;IAED,OAAO,EAAE,CAAC,QAAQ,EAAE,CAAC,CAAC;AACxB,CAAC"}
@@ -0,0 +1,12 @@
1
+ import { type Result } from "../frontmatter/types.js";
2
+ import type { LlmClient } from "./llm.js";
3
+ import { type CortexEvalError, type Grade, type Question, type Score, type Trace } from "./types.js";
4
/** Extra inputs for `aggregateScore`. */
+ export interface AggregateOptions {
5
/** Traces keyed by `"${question_id}:${k_index}"`; the aggregator reads `total_tool_calls` from the traces of grades with a positive verdict value. */
+ traces: Map<string, Trace>;
6
+ }
7
/**
 * Aggregates per-(question, k) grades into a `Score`.
 *
 * Grades are grouped by `question_id`; for each tier the per-question mean of
 * the verdict values (yes = 1, partial = 0.5, no = 0; ungraded is excluded) is
 * computed, and the tier mean is the mean of those per-question means.
 *
 * @param grades grades to aggregate.
 * @param questions questions supplying each grade's tier.
 * @param opts traces used for the tool-call efficiency figures.
 */
+ export declare function aggregateScore(grades: Grade[], questions: Question[], opts: AggregateOptions): Score;
8
/** Options for `gradeAnswer`. */
+ export interface GradeOptions {
9
/** Model id for the grading call — presumably forwarded to the LLM client; the implementation is not visible here. */
+ model: string;
10
+ }
11
/**
 * Grades one answer trace for one (question, k) pair and resolves to a
 * `Grade` or a `CortexEvalError`.
 * NOTE(review): presumably sends GRADER_PROMPT through `llm` using
 * `opts.model` — the implementation is not visible here; confirm in score.ts.
 *
 * @param question the question that was asked.
 * @param questionIndex index of the question within its set.
 * @param kIndex index of the answerer run being graded.
 * @param trace the answerer trace to grade.
 * @param llm LLM client used for grading.
 * @param opts grading options, see `GradeOptions`.
 */
+ export declare function gradeAnswer(question: Question, questionIndex: number, kIndex: number, trace: Trace, llm: LlmClient, opts: GradeOptions): Promise<Result<Grade, CortexEvalError>>;
12
+ //# sourceMappingURL=score.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"score.d.ts","sourceRoot":"","sources":["../../src/eval/score.ts"],"names":[],"mappings":"AAKA,OAAO,EAAM,KAAK,MAAM,EAAE,MAAM,yBAAyB,CAAC;AAC1D,OAAO,KAAK,EAAE,SAAS,EAAE,MAAM,UAAU,CAAC;AAE1C,OAAO,EACL,KAAK,eAAe,EACpB,KAAK,KAAK,EAEV,KAAK,QAAQ,EACb,KAAK,KAAK,EAKV,KAAK,KAAK,EACX,MAAM,YAAY,CAAC;AAEpB,MAAM,WAAW,gBAAgB;IAE/B,MAAM,EAAE,GAAG,CAAC,MAAM,EAAE,KAAK,CAAC,CAAC;CAC5B;AAUD,wBAAgB,cAAc,CAC5B,MAAM,EAAE,KAAK,EAAE,EACf,SAAS,EAAE,QAAQ,EAAE,EACrB,IAAI,EAAE,gBAAgB,GACrB,KAAK,CAmFP;AAiCD,MAAM,WAAW,YAAY;IAC3B,KAAK,EAAE,MAAM,CAAC;CACf;AAED,wBAAsB,WAAW,CAC/B,QAAQ,EAAE,QAAQ,EAClB,aAAa,EAAE,MAAM,EACrB,MAAM,EAAE,MAAM,EACd,KAAK,EAAE,KAAK,EACZ,GAAG,EAAE,SAAS,EACd,IAAI,EAAE,YAAY,GACjB,OAAO,CAAC,MAAM,CAAC,KAAK,EAAE,eAAe,CAAC,CAAC,CAiCzC"}
@@ -0,0 +1,154 @@
1
+ // src/eval/score.ts
2
+ // Aggregation of per-(question, k) grades into the headline tier-weighted
3
+ // score (aggregateScore, pure math), plus gradeAnswer, which asks the LLM
4
+ // grader for a verdict on a single (question, k) answer.
5
+ import { ok } from "../frontmatter/types.js";
6
+ import { GRADER_PROMPT } from "./prompts.js";
7
+ import { TIER_WEIGHT, TIERS, } from "./types.js";
8
// Numeric weight of each grader verdict. A null weight marks a verdict that
// carries no score and is therefore left out of every aggregate.
const VERDICT_VALUE = {
    ungraded: null,
    no: 0,
    partial: 0.5,
    yes: 1,
};
15
/**
 * Folds per-(question, k) grades into per-tier statistics and the headline
 * tier-weighted score.
 *
 * Within a tier, each question contributes the mean of its graded verdict
 * values; a question with no graded verdict is skipped and does not count
 * towards the tier's `n`. `trace_efficiency` is the mean `total_tool_calls`
 * over the traces of grades that scored above zero.
 *
 * @param grades    Grades for any number of (question, k) pairs.
 * @param questions Questions being scored; each one's `tier` selects its bucket.
 * @param opts      `opts.traces` maps `${question_id}:${k_index}` to a trace.
 * @returns A Score whose `score`, `score_std` and `by_tier` are computed here.
 *          The run-identity fields (models, ids, versions, k, n, timestamp)
 *          are returned as empty strings / zeros.
 */
export function aggregateScore(grades, questions, opts) {
    const byTier = blankByTier();
    const gradesByQuestion = new Map();
    for (const grade of grades) {
        pushTo(gradesByQuestion, grade.question_id, grade);
    }
    for (const tier of TIERS) {
        const questionMeans = [];
        const toolCallCounts = [];
        for (const question of questions) {
            if (question.tier !== tier)
                continue;
            const values = [];
            for (const grade of gradesByQuestion.get(question.id) ?? []) {
                const value = VERDICT_VALUE[grade.verdict];
                // Keep numeric verdicts only. This drops "ungraded" (null) and
                // also any verdict string missing from VERDICT_VALUE
                // (undefined), which would otherwise turn the mean into NaN.
                if (typeof value !== "number")
                    continue;
                values.push(value);
                if (value > 0) {
                    const trace = opts.traces.get(`${grade.question_id}:${grade.k_index}`);
                    if (trace)
                        toolCallCounts.push(trace.total_tool_calls);
                }
            }
            if (values.length === 0)
                continue;
            questionMeans.push(avg(values));
        }
        const hasData = questionMeans.length > 0;
        byTier[tier] = {
            mean: hasData ? avg(questionMeans) : 0,
            std: hasData ? stddev(questionMeans) : 0,
            n: questionMeans.length,
            trace_efficiency: toolCallCounts.length > 0 ? avg(toolCallCounts) : 0,
        };
    }
    // Weighted aggregate: sum(weight * tier_mean * tier_n) / sum(weight * tier_n)
    let num = 0;
    let denom = 0;
    for (const tier of TIERS) {
        const w = TIER_WEIGHT[tier];
        const stats = byTier[tier];
        num += w * stats.mean * stats.n;
        denom += w * stats.n;
    }
    // Spread of the headline score: square root of the tier variances
    // combined with the same (weight * n) / denom shares as the means.
    let pooledVariance = 0;
    if (denom > 0) {
        for (const tier of TIERS) {
            const w = TIER_WEIGHT[tier];
            const stats = byTier[tier];
            pooledVariance = pooledVariance + ((w * stats.n) / denom) * stats.std ** 2;
        }
    }
    return {
        score: denom > 0 ? num / denom : 0,
        score_std: denom > 0 ? Math.sqrt(pooledVariance) : 0,
        by_tier: byTier,
        // Everything below is a blank placeholder; this function has no
        // access to run metadata.
        models: { generator: "", answerer: "", grader: "" },
        prompt_version: 0,
        spec_version: 0,
        questions_id: "",
        results_id: "",
        vault_hash: "",
        k: 0,
        n: 0,
        timestamp: "",
    };
}
89
// Builds the per-tier statistics table with every figure zeroed. Each tier
// gets its own fresh object, so callers can overwrite one tier freely.
function blankByTier() {
    const zeroed = () => ({ mean: 0, std: 0, n: 0, trace_efficiency: 0 });
    return {
        retrieval: zeroed(),
        cross_reference: zeroed(),
        contradiction: zeroed(),
    };
}
96
+ // Appends `value` to the array at `key`, creating the array on first use.
97
+ // Mirrors the pushTo pattern from subgraph.ts to avoid non-null assertions.
98
+ function pushTo(m, key, value) {
99
+ const arr = m.get(key);
100
+ if (arr) {
101
+ arr.push(value);
102
+ }
103
+ else {
104
+ m.set(key, [value]);
105
+ }
106
+ }
107
// Arithmetic mean of `xs`. An empty list yields 0 rather than NaN (0 / 0),
// the same fallback the call sites apply when they have nothing to average.
function avg(xs) {
    if (xs.length === 0)
        return 0;
    let total = 0;
    for (const x of xs) {
        total += x;
    }
    return total / xs.length;
}
110
// Population standard deviation of `xs`: the squared deviations from the
// mean are divided by the element count, not count - 1. Lists with fewer
// than two elements have no spread and give 0.
function stddev(xs) {
    const count = xs.length;
    if (count < 2) {
        return 0;
    }
    const center = avg(xs);
    let squaredDeviations = 0;
    for (const x of xs) {
        squaredDeviations = squaredDeviations + (x - center) ** 2;
    }
    return Math.sqrt(squaredDeviations / count);
}
116
/**
 * Asks the LLM grader for a verdict on one answer.
 *
 * The grader prompt is filled with the question, its expected answer and
 * sources, the trace's final answer and the citations extracted from that
 * answer. The reply is requested as JSON with a `correct` verdict and a
 * `reasoning` string.
 *
 * @param question      Question being graded.
 * @param questionIndex Stored on the grade as `question_index`.
 * @param kIndex        Stored on the grade as `k_index`.
 * @param trace         Trace whose `final_answer` is graded.
 * @param llm           Client providing `completeJson`.
 * @param opts          `opts.model` is the grader model.
 * @returns The failed result of `completeJson` unchanged, or an ok result
 *          holding the grade. A reply whose `correct` is not "yes", "partial"
 *          or "no" produces the verdict "ungraded"; a non-string `reasoning`
 *          becomes "".
 */
export async function gradeAnswer(question, questionIndex, kIndex, trace, llm, opts) {
    const cited = extractCitations(trace.final_answer);
    const fills = {
        QUESTION: question.question,
        EXPECTED_ANSWER: question.expected_answer,
        EXPECTED_SOURCES: question.expected_sources.join(", "),
        CLAIMED_ANSWER: trace.final_answer,
        CITED_SOURCES: cited.join(", "),
    };
    // Fill the template in one pass with a replacer function. A string
    // replacement would interpret `$&`, `$$`, "$`" and `$'` inside the inserted
    // text, and chained replaces would substitute placeholders that merely
    // appear inside an already-inserted question or answer. Only the first
    // occurrence of each placeholder is filled.
    const unfilled = new Set(Object.keys(fills));
    const user = GRADER_PROMPT.replace(/\{\{([A-Z_]+)\}\}/g, (placeholder, name) => unfilled.delete(name) ? fills[name] : placeholder);
    const schema = {
        type: "object",
        required: ["correct", "reasoning"],
        properties: {
            correct: { enum: ["yes", "partial", "no"] },
            reasoning: { type: "string" },
        },
    };
    const r = await llm.completeJson({ model: opts.model, system: "", user, schema });
    if (!r.ok)
        return r;
    // The parsed reply is untyped JSON: check each field before trusting it.
    const parsed = r.value.parsed;
    const verdict = parsed?.correct === "yes" || parsed?.correct === "partial" || parsed?.correct === "no"
        ? parsed.correct
        : "ungraded";
    return ok({
        question_id: question.id,
        question_index: questionIndex,
        k_index: kIndex,
        verdict,
        reasoning: typeof parsed?.reasoning === "string" ? parsed.reasoning : "",
        grader_model: opts.model,
    });
}
148
// Collects, in order of appearance, every bracketed `.md` reference in
// `answer`: the text between "[" and "]" when it ends in ".md". Repeated
// citations are kept as often as they occur.
function extractCitations(answer) {
    return Array.from(answer.matchAll(/\[([^\]]+\.md)\]/g), (match) => match[1]);
}
154
+ //# sourceMappingURL=score.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"score.js","sourceRoot":"","sources":["../../src/eval/score.ts"],"names":[],"mappings":"AAAA,oBAAoB;AACpB,0EAA0E;AAC1E,qEAAqE;AACrE,8CAA8C;AAE9C,OAAO,EAAE,EAAE,EAAe,MAAM,yBAAyB,CAAC;AAE1D,OAAO,EAAE,aAAa,EAAE,MAAM,cAAc,CAAC;AAC7C,OAAO,EAML,WAAW,EACX,KAAK,GAIN,MAAM,YAAY,CAAC;AAOpB,qEAAqE;AACrE,MAAM,aAAa,GAA4C;IAC7D,GAAG,EAAE,GAAG;IACR,OAAO,EAAE,GAAG;IACZ,EAAE,EAAE,GAAG;IACP,QAAQ,EAAE,IAAI;CACf,CAAC;AAEF,MAAM,UAAU,cAAc,CAC5B,MAAe,EACf,SAAqB,EACrB,IAAsB;IAEtB,MAAM,MAAM,GAA4B,WAAW,EAAE,CAAC;IAEtD,2DAA2D;IAC3D,MAAM,UAAU,GAAG,IAAI,GAAG,EAAmB,CAAC;IAC9C,KAAK,MAAM,KAAK,IAAI,MAAM,EAAE,CAAC;QAC3B,MAAM,CAAC,UAAU,EAAE,KAAK,CAAC,WAAW,EAAE,KAAK,CAAC,CAAC;IAC/C,CAAC;IAED,KAAK,MAAM,IAAI,IAAI,KAAK,EAAE,CAAC;QACzB,MAAM,aAAa,GAAG,SAAS,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,IAAI,KAAK,IAAI,CAAC,CAAC;QAC/D,MAAM,gBAAgB,GAAa,EAAE,CAAC;QACtC,MAAM,cAAc,GAAa,EAAE,CAAC;QAEpC,KAAK,MAAM,CAAC,IAAI,aAAa,EAAE,CAAC;YAC9B,wEAAwE;YACxE,MAAM,OAAO,GAAG,CAAC,UAAU,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,CAAC,IAAI,EAAE,CAAC,CAAC,MAAM,CAAC,CAAC,KAAK,EAAE,EAAE;gBAC5D,MAAM,GAAG,GAAG,aAAa,CAAC,KAAK,CAAC,OAAO,CAAC,CAAC;gBACzC,OAAO,GAAG,KAAK,IAAI,CAAC;YACtB,CAAC,CAAC,CAAC;YACH,IAAI,OAAO,CAAC,MAAM,KAAK,CAAC;gBAAE,SAAS;YAEnC,MAAM,MAAM,GAAG,OAAO,CAAC,GAAG,CAAC,CAAC,KAAK,EAAE,EAAE;gBACnC,yCAAyC;gBACzC,MAAM,GAAG,GAAG,aAAa,CAAC,KAAK,CAAC,OAAO,CAAC,CAAC;gBACzC,OAAO,GAAG,KAAK,IAAI,CAAC,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,CAAC;YAChC,CAAC,CAAC,CAAC;YACH,MAAM,IAAI,GAAG,GAAG,CAAC,MAAM,CAAC,CAAC;YACzB,gBAAgB,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;YAE5B,KAAK,MAAM,KAAK,IAAI,OAAO,EAAE,CAAC;gBAC5B,MAAM,GAAG,GAAG,aAAa,CAAC,KAAK,CAAC,OAAO,CAAC,CAAC;gBACzC,IAAI,GAAG,KAAK,IAAI,IAAI,GAAG,GAAG,CAAC,EAAE,CAAC;oBAC5B,MAAM,CAAC,GAAG,IAAI,CAAC,MAAM,CAAC,GAAG,CAAC,GAAG,KAAK,CAAC,WAAW,IAAI,KAAK,CAAC,OAAO,EAAE,CAAC,CAAC;oBACnE,IAAI,CAAC;wBAAE,cAAc,CAAC,IAAI,CAAC,CAAC,CAAC,gBAAgB,CAAC,CAAC;gBACjD,CAAC;YACH,CAAC;QACH,CAAC;QAED,MAAM,CAAC,IAAI,CAAC,GAAG;YACb,IAAI,EAAE,gBAAgB,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC,CAAC,GAAG,CAAC,gBAAgB,CAAC,CA
AC,CAAC,CAAC,CAAC;YAC7D,GAAG,EAAE,gBAAgB,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC,CAAC,MAAM,CAAC,gBAAgB,CAAC,CAAC,CAAC,CAAC,CAAC;YAC/D,CAAC,EAAE,gBAAgB,CAAC,MAAM;YAC1B,gBAAgB,EAAE,cAAc,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC,CAAC,GAAG,CAAC,cAAc,CAAC,CAAC,CAAC,CAAC,CAAC;SACtE,CAAC;IACJ,CAAC;IAED,8EAA8E;IAC9E,IAAI,GAAG,GAAG,CAAC,CAAC;IACZ,IAAI,KAAK,GAAG,CAAC,CAAC;IACd,KAAK,MAAM,IAAI,IAAI,KAAK,EAAE,CAAC;QACzB,MAAM,CAAC,GAAG,WAAW,CAAC,IAAI,CAAC,CAAC;QAC5B,MAAM,EAAE,GAAG,MAAM,CAAC,IAAI,CAAC,CAAC;QACxB,GAAG,IAAI,CAAC,GAAG,EAAE,CAAC,IAAI,GAAG,EAAE,CAAC,CAAC,CAAC;QAC1B,KAAK,IAAI,CAAC,GAAG,EAAE,CAAC,CAAC,CAAC;IACpB,CAAC;IACD,MAAM,KAAK,GAAG,KAAK,GAAG,CAAC,CAAC,CAAC,CAAC,GAAG,GAAG,KAAK,CAAC,CAAC,CAAC,CAAC,CAAC;IAE1C,MAAM,QAAQ,GACZ,KAAK,GAAG,CAAC;QACP,CAAC,CAAC,IAAI,CAAC,IAAI,CACP,KAAK,CAAC,MAAM,CAAC,CAAC,GAAG,EAAE,CAAC,EAAE,EAAE;YACtB,MAAM,CAAC,GAAG,WAAW,CAAC,CAAC,CAAC,CAAC;YACzB,MAAM,EAAE,GAAG,MAAM,CAAC,CAAC,CAAC,CAAC;YACrB,OAAO,GAAG,GAAG,CAAC,CAAC,CAAC,GAAG,EAAE,CAAC,CAAC,CAAC,GAAG,KAAK,CAAC,GAAG,EAAE,CAAC,GAAG,IAAI,CAAC,CAAC;QAClD,CAAC,EAAE,CAAC,CAAC,CACN;QACH,CAAC,CAAC,CAAC,CAAC;IAER,OAAO;QACL,KAAK;QACL,SAAS,EAAE,QAAQ;QACnB,OAAO,EAAE,MAAM;QACf,MAAM,EAAE,EAAE,SAAS,EAAE,EAAE,EAAE,QAAQ,EAAE,EAAE,EAAE,MAAM,EAAE,EAAE,EAAE;QACnD,cAAc,EAAE,CAAC;QACjB,YAAY,EAAE,CAAC;QACf,YAAY,EAAE,EAAE;QAChB,UAAU,EAAE,EAAE;QACd,UAAU,EAAE,EAAE;QACd,CAAC,EAAE,CAAC;QACJ,CAAC,EAAE,CAAC;QACJ,SAAS,EAAE,EAAE;KACd,CAAC;AACJ,CAAC;AAED,SAAS,WAAW;IAClB,OAAO;QACL,SAAS,EAAE,EAAE,IAAI,EAAE,CAAC,EAAE,GAAG,EAAE,CAAC,EAAE,CAAC,EAAE,CAAC,EAAE,gBAAgB,EAAE,CAAC,EAAE;QACzD,eAAe,EAAE,EAAE,IAAI,EAAE,CAAC,EAAE,GAAG,EAAE,CAAC,EAAE,CAAC,EAAE,CAAC,EAAE,gBAAgB,EAAE,CAAC,EAAE;QAC/D,aAAa,EAAE,EAAE,IAAI,EAAE,CAAC,EAAE,GAAG,EAAE,CAAC,EAAE,CAAC,EAAE,CAAC,EAAE,gBAAgB,EAAE,CAAC,EAAE;KAC9D,CAAC;AACJ,CAAC;AAED,0EAA0E;AAC1E,4EAA4E;AAC5E,SAAS,MAAM,CAAI,CAAmB,EAAE,GAAW,EAAE,KAAQ;IAC3D,MAAM,GAAG,GAAG,CAAC,CAAC,GAAG,CAAC,GAAG,CAAC,CAAC;IACvB,IAAI,GAAG,EAAE,CAAC;QACR,GAAG,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC;IAClB,CAAC;SAAM,CAAC;QACN,CAAC,CAAC,GAAG,CAAC,GAAG,EAAE,CAA
C,KAAK,CAAC,CAAC,CAAC;IACtB,CAAC;AACH,CAAC;AAED,SAAS,GAAG,CAAC,EAAY;IACvB,OAAO,EAAE,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC,CAAC,GAAG,CAAC,EAAE,CAAC,CAAC,GAAG,EAAE,CAAC,MAAM,CAAC;AACnD,CAAC;AAED,SAAS,MAAM,CAAC,EAAY;IAC1B,IAAI,EAAE,CAAC,MAAM,GAAG,CAAC;QAAE,OAAO,CAAC,CAAC;IAC5B,MAAM,CAAC,GAAG,GAAG,CAAC,EAAE,CAAC,CAAC;IAClB,OAAO,IAAI,CAAC,IAAI,CAAC,EAAE,CAAC,MAAM,CAAC,CAAC,GAAG,EAAE,CAAC,EAAE,EAAE,CAAC,GAAG,GAAG,CAAC,CAAC,GAAG,CAAC,CAAC,IAAI,CAAC,EAAE,CAAC,CAAC,GAAG,EAAE,CAAC,MAAM,CAAC,CAAC;AAC7E,CAAC;AAQD,MAAM,CAAC,KAAK,UAAU,WAAW,CAC/B,QAAkB,EAClB,aAAqB,EACrB,MAAc,EACd,KAAY,EACZ,GAAc,EACd,IAAkB;IAElB,MAAM,KAAK,GAAG,gBAAgB,CAAC,KAAK,CAAC,YAAY,CAAC,CAAC;IACnD,MAAM,IAAI,GAAG,aAAa,CAAC,OAAO,CAAC,cAAc,EAAE,QAAQ,CAAC,QAAQ,CAAC;SAClE,OAAO,CAAC,qBAAqB,EAAE,QAAQ,CAAC,eAAe,CAAC;SACxD,OAAO,CAAC,sBAAsB,EAAE,QAAQ,CAAC,gBAAgB,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;SACrE,OAAO,CAAC,oBAAoB,EAAE,KAAK,CAAC,YAAY,CAAC;SACjD,OAAO,CAAC,mBAAmB,EAAE,KAAK,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC,CAAC;IAElD,MAAM,MAAM,GAAG;QACb,IAAI,EAAE,QAAQ;QACd,QAAQ,EAAE,CAAC,SAAS,EAAE,WAAW,CAAC;QAClC,UAAU,EAAE;YACV,OAAO,EAAE,EAAE,IAAI,EAAE,CAAC,KAAK,EAAE,SAAS,EAAE,IAAI,CAAC,EAAE;YAC3C,SAAS,EAAE,EAAE,IAAI,EAAE,QAAQ,EAAE;SAC9B;KACO,CAAC;IAEX,MAAM,CAAC,GAAG,MAAM,GAAG,CAAC,YAAY,CAAC,EAAE,KAAK,EAAE,IAAI,CAAC,KAAK,EAAE,MAAM,EAAE,EAAE,EAAE,IAAI,EAAE,MAAM,EAAE,CAAC,CAAC;IAClF,IAAI,CAAC,CAAC,CAAC,EAAE;QAAE,OAAO,CAAC,CAAC;IACpB,0DAA0D;IAC1D,MAAM,MAAM,GAAG,CAAC,CAAC,KAAK,CAAC,MAAa,CAAC;IACrC,MAAM,OAAO,GACX,MAAM,EAAE,OAAO,KAAK,KAAK,IAAI,MAAM,EAAE,OAAO,KAAK,SAAS,IAAI,MAAM,EAAE,OAAO,KAAK,IAAI;QACpF,CAAC,CAAC,MAAM,CAAC,OAAO;QAChB,CAAC,CAAC,UAAU,CAAC;IACjB,OAAO,EAAE,CAAC;QACR,WAAW,EAAE,QAAQ,CAAC,EAAE;QACxB,cAAc,EAAE,aAAa;QAC7B,OAAO,EAAE,MAAM;QACf,OAAO;QACP,SAAS,EAAE,OAAO,MAAM,EAAE,SAAS,KAAK,QAAQ,CAAC,CAAC,CAAC,MAAM,CAAC,SAAS,CAAC,CAAC,CAAC,EAAE;QACxE,YAAY,EAAE,IAAI,CAAC,KAAK;KACzB,CAAC,CAAC;AACL,CAAC;AAED,SAAS,gBAAgB,CAAC,MAAc;IACtC,MAAM,GAAG,GAAa,EAAE,CAAC;IACzB,KAAK,MAAM,CAAC,IAAI,MAAM,CAAC,QAAQ,CAAC,mBAAmB,CAAC;QAAE,GAAG,CAAC,IAAI
,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC;IACrE,OAAO,GAAG,CAAC;AACb,CAAC"}
@@ -0,0 +1,10 @@
1
+ import { type Result } from "../frontmatter/types.js";
2
+ import { type CortexEvalError, type EvalRun, type HistoryEntry, type HistoryFile, type QuestionSet, type Score } from "./types.js";
3
/** Writes `qs` as pretty-printed JSON to `<vault>/.daftari/eval/questions/<qs.id>.json`, creating the directory if needed. */
+ export declare function writeQuestionSet(vault: string, qs: QuestionSet): Promise<void>;
4
/** Reads and JSON-parses `<vault>/.daftari/eval/questions/<id>.json`. Read or parse failures come back as an error result of kind "runtime"; the parsed value is not validated against the QuestionSet shape. */
+ export declare function readQuestionSet(vault: string, id: string): Promise<Result<QuestionSet, CortexEvalError>>;
5
/** Writes `run` as pretty-printed JSON to `<vault>/.daftari/eval/results/<run.id>.json`, creating the directory if needed. */
+ export declare function writeResults(vault: string, run: EvalRun): Promise<void>;
6
/** Reads and JSON-parses `<vault>/.daftari/eval/results/<id>.json`. Read or parse failures come back as an error result of kind "runtime"; the parsed value is not validated against the EvalRun shape. */
+ export declare function readResults(vault: string, id: string): Promise<Result<EvalRun, CortexEvalError>>;
7
/** Writes `score` as pretty-printed JSON to `<vault>/.daftari/eval/scores/<score.results_id>.json`, creating the directory if needed. */
+ export declare function writeScore(vault: string, score: Score): Promise<void>;
8
/** Appends `entry` to `<vault>/.daftari/eval/history.json`, keeps only the last HISTORY_RETENTION runs and rewrites the file with `version: 1`. If the existing history cannot be read, the file is rewritten with just `entry`. */
+ export declare function appendHistory(vault: string, entry: HistoryEntry): Promise<void>;
9
/** Reads `<vault>/.daftari/eval/history.json`. A missing file gives an empty version-1 history. A file whose numeric `version` is above 1 triggers a warning on stderr and also gives an empty history. Read or parse failures come back as an error result. */
+ export declare function readHistory(vault: string): Promise<Result<HistoryFile, CortexEvalError>>;
10
+ //# sourceMappingURL=storage.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"storage.d.ts","sourceRoot":"","sources":["../../src/eval/storage.ts"],"names":[],"mappings":"AAOA,OAAO,EAAW,KAAK,MAAM,EAAE,MAAM,yBAAyB,CAAC;AAC/D,OAAO,EACL,KAAK,eAAe,EACpB,KAAK,OAAO,EAEZ,KAAK,YAAY,EACjB,KAAK,WAAW,EAChB,KAAK,WAAW,EAChB,KAAK,KAAK,EACX,MAAM,YAAY,CAAC;AA0BpB,wBAAsB,gBAAgB,CAAC,KAAK,EAAE,MAAM,EAAE,EAAE,EAAE,WAAW,GAAG,OAAO,CAAC,IAAI,CAAC,CAGpF;AAED,wBAAgB,eAAe,CAC7B,KAAK,EAAE,MAAM,EACb,EAAE,EAAE,MAAM,GACT,OAAO,CAAC,MAAM,CAAC,WAAW,EAAE,eAAe,CAAC,CAAC,CAE/C;AAED,wBAAsB,YAAY,CAAC,KAAK,EAAE,MAAM,EAAE,GAAG,EAAE,OAAO,GAAG,OAAO,CAAC,IAAI,CAAC,CAG7E;AAED,wBAAgB,WAAW,CAAC,KAAK,EAAE,MAAM,EAAE,EAAE,EAAE,MAAM,GAAG,OAAO,CAAC,MAAM,CAAC,OAAO,EAAE,eAAe,CAAC,CAAC,CAEhG;AAED,wBAAsB,UAAU,CAAC,KAAK,EAAE,MAAM,EAAE,KAAK,EAAE,KAAK,GAAG,OAAO,CAAC,IAAI,CAAC,CAG3E;AAED,wBAAsB,aAAa,CAAC,KAAK,EAAE,MAAM,EAAE,KAAK,EAAE,YAAY,GAAG,OAAO,CAAC,IAAI,CAAC,CAOrF;AAED,wBAAsB,WAAW,CAAC,KAAK,EAAE,MAAM,GAAG,OAAO,CAAC,MAAM,CAAC,WAAW,EAAE,eAAe,CAAC,CAAC,CAY9F"}
@@ -0,0 +1,69 @@
1
+ // src/eval/storage.ts
2
+ // JSON I/O under .daftari/eval/. No business logic — just paths, schemas,
3
+ // rotation. Read paths are read-only-compatible per spec §12 resolution 5.
4
+ import { existsSync } from "node:fs";
5
+ import { mkdir, readFile, writeFile } from "node:fs/promises";
6
+ import { join } from "node:path";
7
+ import { err, ok } from "../frontmatter/types.js";
8
+ import { HISTORY_RETENTION, } from "./types.js";
9
// Locations of the eval artefacts. Everything lives below
// <vault>/.daftari/eval; each helper maps a vault root to one entry there.
const EVAL_DIR = (vault) => {
    return join(vault, ".daftari", "eval");
};
// Builds a helper that resolves `name` inside a vault's eval directory.
const inEvalDir = (name) => (vault) => join(EVAL_DIR(vault), name);
const QS_DIR = inEvalDir("questions");
const RES_DIR = inEvalDir("results");
const SCORE_DIR = inEvalDir("scores");
const HIST_FILE = inEvalDir("history.json");
14
// Creates directory `p` together with any missing parents. Resolves with
// no value once the directory exists.
async function ensureDir(p) {
    const options = { recursive: true };
    await mkdir(p, options);
}
17
// Serialises `value` as JSON indented by two spaces, adds a trailing
// newline and writes it to `path` as UTF-8. Returns the promise of the write.
function writeJson(path, value) {
    const text = JSON.stringify(value, null, 2);
    return writeFile(path, text + "\n", "utf8");
}
20
// Reads `path` as UTF-8 and parses it as JSON. Never rejects for read or
// parse problems: those come back as an error result of kind "runtime"
// whose message names the path. The parsed value is returned as-is, with no
// check of its shape.
async function readJson(path) {
    try {
        const text = await readFile(path, "utf8");
        const value = JSON.parse(text);
        return ok(value);
    }
    catch (cause) {
        const detail = cause instanceof Error ? cause.message : String(cause);
        return err({ kind: "runtime", message: `read ${path}: ${detail}` });
    }
}
30
/**
 * Stores a question set as `<qs.id>.json` in the vault's questions
 * directory, creating that directory first if it does not exist.
 */
export async function writeQuestionSet(vault, qs) {
    const dir = QS_DIR(vault);
    await ensureDir(dir);
    const file = join(dir, `${qs.id}.json`);
    await writeJson(file, qs);
}
34
/**
 * Loads the question set stored as `<id>.json` in the vault's questions
 * directory. Resolves to the result produced by readJson, so read and parse
 * failures arrive as an error result rather than a rejection.
 */
export function readQuestionSet(vault, id) {
    const file = join(QS_DIR(vault), `${id}.json`);
    return readJson(file);
}
37
/**
 * Stores an eval run as `<run.id>.json` in the vault's results directory,
 * creating that directory first if it does not exist.
 */
export async function writeResults(vault, run) {
    const dir = RES_DIR(vault);
    await ensureDir(dir);
    const file = join(dir, `${run.id}.json`);
    await writeJson(file, run);
}
41
/**
 * Loads the eval run stored as `<id>.json` in the vault's results directory.
 * Resolves to the result produced by readJson, so read and parse failures
 * arrive as an error result rather than a rejection.
 */
export function readResults(vault, id) {
    const file = join(RES_DIR(vault), `${id}.json`);
    return readJson(file);
}
44
/**
 * Stores a score in the vault's scores directory, creating that directory
 * first if it does not exist. The file is named after the results it
 * belongs to: `<score.results_id>.json`.
 */
export async function writeScore(vault, score) {
    const dir = SCORE_DIR(vault);
    await ensureDir(dir);
    const file = join(dir, `${score.results_id}.json`);
    await writeJson(file, score);
}
48
/**
 * Appends `entry` to the vault's history.json, keeping only the last
 * HISTORY_RETENTION runs, and rewrites the file with `version: 1`.
 *
 * A history file whose numeric `version` is above 1 is never rewritten: a
 * warning goes to stderr and the entry is not recorded, so a file written by
 * a newer release is not replaced with a version-1 file that has lost its
 * runs. A file that cannot be read or parsed, or whose `runs` is not an
 * array, is treated as empty and replaced by a history holding just `entry`.
 */
export async function appendHistory(vault, entry) {
    await ensureDir(EVAL_DIR(vault));
    const path = HIST_FILE(vault);
    let previous = [];
    if (existsSync(path)) {
        const onDisk = await readJson(path);
        if (onDisk.ok) {
            const version = onDisk.value?.version;
            if (typeof version === "number" && version > 1) {
                process.stderr.write(`daftari eval: history.json version ${version} is newer than supported (1); leaving untouched\n`);
                return;
            }
            // Guard the spread below: a missing or malformed `runs` must
            // not throw.
            if (Array.isArray(onDisk.value?.runs)) {
                previous = onDisk.value.runs;
            }
        }
    }
    const trimmed = [...previous, entry].slice(-HISTORY_RETENTION);
    await writeJson(path, { version: 1, runs: trimmed });
}
56
/**
 * Loads the vault's history.json.
 *
 * - No file: resolves to an ok result with an empty version-1 history.
 * - Unreadable or unparsable file: resolves to the error result from readJson.
 * - Numeric `version` above 1: warns on stderr and resolves to an empty
 *   version-1 history.
 * - Otherwise: resolves to the parsed file.
 */
export async function readHistory(vault) {
    const file = HIST_FILE(vault);
    if (existsSync(file)) {
        const loaded = await readJson(file);
        if (!loaded.ok) {
            return loaded;
        }
        const tooNew = typeof loaded.value.version === "number" && loaded.value.version > 1;
        if (!tooNew) {
            return loaded;
        }
        process.stderr.write(`daftari eval: history.json version ${loaded.value.version} is newer than supported (1); leaving untouched\n`);
    }
    // Reached when the file is missing or was written by a newer version.
    return ok({ version: 1, runs: [] });
}
69
+ //# sourceMappingURL=storage.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"storage.js","sourceRoot":"","sources":["../../src/eval/storage.ts"],"names":[],"mappings":"AAAA,sBAAsB;AACtB,0EAA0E;AAC1E,2EAA2E;AAE3E,OAAO,EAAE,UAAU,EAAE,MAAM,SAAS,CAAC;AACrC,OAAO,EAAE,KAAK,EAAE,QAAQ,EAAE,SAAS,EAAE,MAAM,kBAAkB,CAAC;AAC9D,OAAO,EAAE,IAAI,EAAE,MAAM,WAAW,CAAC;AACjC,OAAO,EAAE,GAAG,EAAE,EAAE,EAAe,MAAM,yBAAyB,CAAC;AAC/D,OAAO,EAGL,iBAAiB,GAKlB,MAAM,YAAY,CAAC;AAEpB,MAAM,QAAQ,GAAG,CAAC,KAAa,EAAE,EAAE,CAAC,IAAI,CAAC,KAAK,EAAE,UAAU,EAAE,MAAM,CAAC,CAAC;AACpE,MAAM,MAAM,GAAG,CAAC,KAAa,EAAE,EAAE,CAAC,IAAI,CAAC,QAAQ,CAAC,KAAK,CAAC,EAAE,WAAW,CAAC,CAAC;AACrE,MAAM,OAAO,GAAG,CAAC,KAAa,EAAE,EAAE,CAAC,IAAI,CAAC,QAAQ,CAAC,KAAK,CAAC,EAAE,SAAS,CAAC,CAAC;AACpE,MAAM,SAAS,GAAG,CAAC,KAAa,EAAE,EAAE,CAAC,IAAI,CAAC,QAAQ,CAAC,KAAK,CAAC,EAAE,QAAQ,CAAC,CAAC;AACrE,MAAM,SAAS,GAAG,CAAC,KAAa,EAAE,EAAE,CAAC,IAAI,CAAC,QAAQ,CAAC,KAAK,CAAC,EAAE,cAAc,CAAC,CAAC;AAE3E,KAAK,UAAU,SAAS,CAAC,CAAS;IAChC,MAAM,KAAK,CAAC,CAAC,EAAE,EAAE,SAAS,EAAE,IAAI,EAAE,CAAC,CAAC;AACtC,CAAC;AAED,SAAS,SAAS,CAAI,IAAY,EAAE,KAAQ;IAC1C,OAAO,SAAS,CAAC,IAAI,EAAE,GAAG,IAAI,CAAC,SAAS,CAAC,KAAK,EAAE,IAAI,EAAE,CAAC,CAAC,IAAI,EAAE,MAAM,CAAC,CAAC;AACxE,CAAC;AAED,KAAK,UAAU,QAAQ,CAAI,IAAY;IACrC,IAAI,CAAC;QACH,MAAM,GAAG,GAAG,MAAM,QAAQ,CAAC,IAAI,EAAE,MAAM,CAAC,CAAC;QACzC,OAAO,EAAE,CAAC,IAAI,CAAC,KAAK,CAAC,GAAG,CAAM,CAAC,CAAC;IAClC,CAAC;IAAC,OAAO,CAAC,EAAE,CAAC;QACX,MAAM,GAAG,GAAG,CAAC,YAAY,KAAK,CAAC,CAAC,CAAC,CAAC,CAAC,OAAO,CAAC,CAAC,CAAC,MAAM,CAAC,CAAC,CAAC,CAAC;QACvD,OAAO,GAAG,CAAC,EAAE,IAAI,EAAE,SAAS,EAAE,OAAO,EAAE,QAAQ,IAAI,KAAK,GAAG,EAAE,EAAE,CAAC,CAAC;IACnE,CAAC;AACH,CAAC;AAED,MAAM,CAAC,KAAK,UAAU,gBAAgB,CAAC,KAAa,EAAE,EAAe;IACnE,MAAM,SAAS,CAAC,MAAM,CAAC,KAAK,CAAC,CAAC,CAAC;IAC/B,MAAM,SAAS,CAAC,IAAI,CAAC,MAAM,CAAC,KAAK,CAAC,EAAE,GAAG,EAAE,CAAC,EAAE,OAAO,CAAC,EAAE,EAAE,CAAC,CAAC;AAC5D,CAAC;AAED,MAAM,UAAU,eAAe,CAC7B,KAAa,EACb,EAAU;IAEV,OAAO,QAAQ,CAAc,IAAI,CAAC,MAAM,CAAC,KAAK,CAAC,EAAE,GAAG,EAAE,OAAO,CAAC,CAAC,CAAC;AAClE,CAAC;AAED,MAAM,CAAC,KAAK,UAAU,YAAY,CAAC,KAAa,EAAE,GAAY;IAC5D,MAAM,SAAS,CAAC,OAAO,CAAC,KAA
K,CAAC,CAAC,CAAC;IAChC,MAAM,SAAS,CAAC,IAAI,CAAC,OAAO,CAAC,KAAK,CAAC,EAAE,GAAG,GAAG,CAAC,EAAE,OAAO,CAAC,EAAE,GAAG,CAAC,CAAC;AAC/D,CAAC;AAED,MAAM,UAAU,WAAW,CAAC,KAAa,EAAE,EAAU;IACnD,OAAO,QAAQ,CAAU,IAAI,CAAC,OAAO,CAAC,KAAK,CAAC,EAAE,GAAG,EAAE,OAAO,CAAC,CAAC,CAAC;AAC/D,CAAC;AAED,MAAM,CAAC,KAAK,UAAU,UAAU,CAAC,KAAa,EAAE,KAAY;IAC1D,MAAM,SAAS,CAAC,SAAS,CAAC,KAAK,CAAC,CAAC,CAAC;IAClC,MAAM,SAAS,CAAC,IAAI,CAAC,SAAS,CAAC,KAAK,CAAC,EAAE,GAAG,KAAK,CAAC,UAAU,OAAO,CAAC,EAAE,KAAK,CAAC,CAAC;AAC7E,CAAC;AAED,MAAM,CAAC,KAAK,UAAU,aAAa,CAAC,KAAa,EAAE,KAAmB;IACpE,MAAM,SAAS,CAAC,QAAQ,CAAC,KAAK,CAAC,CAAC,CAAC;IACjC,MAAM,OAAO,GAAG,MAAM,WAAW,CAAC,KAAK,CAAC,CAAC;IACzC,MAAM,IAAI,GAAmB,OAAO,CAAC,EAAE,CAAC,CAAC,CAAC,CAAC,GAAG,OAAO,CAAC,KAAK,CAAC,IAAI,EAAE,KAAK,CAAC,CAAC,CAAC,CAAC,CAAC,KAAK,CAAC,CAAC;IACnF,MAAM,OAAO,GAAG,IAAI,CAAC,KAAK,CAAC,CAAC,iBAAiB,CAAC,CAAC;IAC/C,MAAM,GAAG,GAAgB,EAAE,OAAO,EAAE,CAAC,EAAE,IAAI,EAAE,OAAO,EAAE,CAAC;IACvD,MAAM,SAAS,CAAC,SAAS,CAAC,KAAK,CAAC,EAAE,GAAG,CAAC,CAAC;AACzC,CAAC;AAED,MAAM,CAAC,KAAK,UAAU,WAAW,CAAC,KAAa;IAC7C,MAAM,IAAI,GAAG,SAAS,CAAC,KAAK,CAAC,CAAC;IAC9B,IAAI,CAAC,UAAU,CAAC,IAAI,CAAC;QAAE,OAAO,EAAE,CAAC,EAAE,OAAO,EAAE,CAAC,EAAE,IAAI,EAAE,EAAE,EAAE,CAAC,CAAC;IAC3D,MAAM,CAAC,GAAG,MAAM,QAAQ,CAAc,IAAI,CAAC,CAAC;IAC5C,IAAI,CAAC,CAAC,CAAC,EAAE;QAAE,OAAO,CAAC,CAAC;IACpB,IAAI,OAAO,CAAC,CAAC,KAAK,CAAC,OAAO,KAAK,QAAQ,IAAI,CAAC,CAAC,KAAK,CAAC,OAAO,GAAG,CAAC,EAAE,CAAC;QAC/D,OAAO,CAAC,MAAM,CAAC,KAAK,CAClB,sCAAsC,CAAC,CAAC,KAAK,CAAC,OAAO,mDAAmD,CACzG,CAAC;QACF,OAAO,EAAE,CAAC,EAAE,OAAO,EAAE,CAAC,EAAE,IAAI,EAAE,EAAE,EAAE,CAAC,CAAC;IACtC,CAAC;IACD,OAAO,CAAC,CAAC;AACX,CAAC"}
@@ -0,0 +1,17 @@
1
+ import { type Result } from "../frontmatter/types.js";
2
+ import type { CortexEvalError, SubgraphEdge } from "./types.js";
3
/** Optional settings for sampleSubgraph. */
+ export interface SubgraphOptions {
4
/** NOTE(review): presumably an upper bound on the number of nodes in the sampled subgraph — confirm against subgraph.js, which is not visible here. */
+ maxNodes?: number;
5
+ }
6
/** One document in a sampled subgraph: its path, its body text and its frontmatter as a key/value record. */
+ export interface SubgraphNode {
7
+ path: string;
8
+ body: string;
9
+ frontmatter: Record<string, unknown>;
10
+ }
11
/** Result of sampleSubgraph: the seed document plus the nodes and edges collected for it. */
+ export interface Subgraph {
12
+ seed_doc: string;
13
+ nodes: SubgraphNode[];
14
+ edges: SubgraphEdge[];
15
+ }
16
/**
 * Resolves to a Subgraph for `seed` within the vault at `vaultRoot`, or to a
 * CortexEvalError result.
 * NOTE(review): how nodes and edges are selected, and when an error result
 * is produced, cannot be told from this declaration — see subgraph.js.
 */
+ export declare function sampleSubgraph(vaultRoot: string, seed: string, opts?: SubgraphOptions): Promise<Result<Subgraph, CortexEvalError>>;
17
+ //# sourceMappingURL=subgraph.d.ts.map