nemoris 0.1.0 → 0.1.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (248) hide show
  1. package/.env.example +49 -49
  2. package/LICENSE +21 -21
  3. package/README.md +209 -209
  4. package/SECURITY.md +59 -119
  5. package/bin/nemoris +46 -46
  6. package/config/agents/agent.toml.example +28 -28
  7. package/config/agents/content.toml +23 -0
  8. package/config/agents/default.toml +22 -22
  9. package/config/agents/heartbeat.toml +35 -0
  10. package/config/agents/iris.toml +23 -0
  11. package/config/agents/lab.toml +23 -0
  12. package/config/agents/main.toml +45 -0
  13. package/config/agents/nemo.toml +21 -0
  14. package/config/agents/ops.toml +38 -0
  15. package/config/agents/orchestrator.toml +18 -18
  16. package/config/agents/revenue.toml +23 -0
  17. package/config/agents/testyboo.toml +19 -0
  18. package/config/delivery.toml +73 -73
  19. package/config/embeddings.toml +5 -5
  20. package/config/identity/content-purpose.md +11 -0
  21. package/config/identity/content-soul.md +45 -0
  22. package/config/identity/default-purpose.md +1 -1
  23. package/config/identity/default-soul.md +3 -3
  24. package/config/identity/heartbeat-purpose.md +9 -0
  25. package/config/identity/heartbeat-soul.md +16 -0
  26. package/config/identity/iris-purpose.md +17 -0
  27. package/config/identity/iris-soul.md +68 -0
  28. package/config/identity/lab-purpose.md +10 -0
  29. package/config/identity/lab-soul.md +38 -0
  30. package/config/identity/main-purpose.md +17 -0
  31. package/config/identity/main-soul.md +66 -0
  32. package/config/identity/main-user.md +22 -0
  33. package/config/identity/ops-purpose.md +9 -0
  34. package/config/identity/ops-soul.md +16 -0
  35. package/config/identity/orchestrator-purpose.md +1 -1
  36. package/config/identity/orchestrator-soul.md +1 -1
  37. package/config/identity/revenue-purpose.md +9 -0
  38. package/config/identity/revenue-soul.md +41 -0
  39. package/config/identity/testyboo-purpose.md +13 -0
  40. package/config/identity/testyboo-soul.md +20 -0
  41. package/config/improvement-targets.toml +15 -15
  42. package/config/jobs/heartbeat-check.toml +30 -30
  43. package/config/jobs/memory-rollup.toml +46 -46
  44. package/config/jobs/workspace-health.toml +63 -63
  45. package/config/mcp.toml +16 -16
  46. package/config/output-contracts.toml +17 -17
  47. package/config/peers.toml +32 -32
  48. package/config/peers.toml.example +32 -32
  49. package/config/policies/memory-default.toml +10 -10
  50. package/config/policies/memory-heartbeat.toml +5 -5
  51. package/config/policies/memory-ops.toml +10 -10
  52. package/config/policies/tools-heartbeat-minimal.toml +8 -8
  53. package/config/policies/tools-interactive-safe.toml +8 -8
  54. package/config/policies/tools-ops-bounded.toml +8 -8
  55. package/config/policies/tools-orchestrator.toml +7 -7
  56. package/config/providers/anthropic.toml +15 -15
  57. package/config/providers/ollama.toml +5 -5
  58. package/config/providers/openai-codex.toml +9 -9
  59. package/config/providers/openrouter.toml +5 -5
  60. package/config/router.toml +22 -22
  61. package/config/runtime.toml +114 -114
  62. package/config/skills/self-improvement.toml +15 -15
  63. package/config/skills/telegram-onboarding-spec.md +240 -240
  64. package/config/skills/workspace-monitor.toml +15 -15
  65. package/config/task-router.toml +42 -42
  66. package/install.sh +50 -50
  67. package/package.json +91 -90
  68. package/src/auth/auth-profiles.js +169 -169
  69. package/src/auth/openai-codex-oauth.js +285 -285
  70. package/src/battle.js +449 -449
  71. package/src/cli/help.js +265 -265
  72. package/src/cli/output-filter.js +49 -49
  73. package/src/cli/runtime-control.js +704 -704
  74. package/src/cli-main.js +2763 -2763
  75. package/src/cli.js +78 -78
  76. package/src/config/loader.js +332 -332
  77. package/src/config/schema-validator.js +214 -214
  78. package/src/config/toml-lite.js +8 -8
  79. package/src/daemon/action-handlers.js +71 -71
  80. package/src/daemon/healing-tick.js +87 -87
  81. package/src/daemon/health-probes.js +90 -90
  82. package/src/daemon/notifier.js +57 -57
  83. package/src/daemon/nurse.js +218 -218
  84. package/src/daemon/repair-log.js +106 -106
  85. package/src/daemon/rule-staging.js +90 -90
  86. package/src/daemon/rules.js +29 -29
  87. package/src/daemon/telegram-commands.js +54 -54
  88. package/src/daemon/updater.js +85 -85
  89. package/src/jobs/job-runner.js +78 -78
  90. package/src/mcp/consumer.js +129 -129
  91. package/src/memory/active-recall.js +171 -171
  92. package/src/memory/backend-manager.js +97 -97
  93. package/src/memory/backends/file-backend.js +38 -38
  94. package/src/memory/backends/qmd-backend.js +219 -219
  95. package/src/memory/embedding-guards.js +24 -24
  96. package/src/memory/embedding-index.js +118 -118
  97. package/src/memory/embedding-service.js +179 -179
  98. package/src/memory/file-index.js +177 -177
  99. package/src/memory/memory-signature.js +5 -5
  100. package/src/memory/memory-store.js +648 -648
  101. package/src/memory/retrieval-planner.js +66 -66
  102. package/src/memory/scoring.js +145 -145
  103. package/src/memory/simhash.js +78 -78
  104. package/src/memory/sqlite-active-store.js +824 -824
  105. package/src/memory/write-policy.js +36 -36
  106. package/src/onboarding/aliases.js +33 -33
  107. package/src/onboarding/auth/api-key.js +224 -224
  108. package/src/onboarding/auth/ollama-detect.js +42 -42
  109. package/src/onboarding/clack-prompter.js +77 -77
  110. package/src/onboarding/doctor.js +530 -530
  111. package/src/onboarding/lock.js +42 -42
  112. package/src/onboarding/model-catalog.js +344 -344
  113. package/src/onboarding/phases/auth.js +576 -589
  114. package/src/onboarding/phases/build.js +130 -130
  115. package/src/onboarding/phases/choose.js +82 -82
  116. package/src/onboarding/phases/detect.js +98 -98
  117. package/src/onboarding/phases/hatch.js +216 -216
  118. package/src/onboarding/phases/identity.js +79 -79
  119. package/src/onboarding/phases/ollama.js +345 -345
  120. package/src/onboarding/phases/scaffold.js +99 -99
  121. package/src/onboarding/phases/telegram.js +377 -377
  122. package/src/onboarding/phases/validate.js +204 -204
  123. package/src/onboarding/phases/verify.js +206 -206
  124. package/src/onboarding/platform.js +482 -482
  125. package/src/onboarding/status-bar.js +95 -95
  126. package/src/onboarding/templates.js +794 -794
  127. package/src/onboarding/toml-writer.js +38 -38
  128. package/src/onboarding/tui.js +250 -250
  129. package/src/onboarding/uninstall.js +153 -153
  130. package/src/onboarding/wizard.js +516 -499
  131. package/src/providers/anthropic.js +168 -168
  132. package/src/providers/base.js +247 -247
  133. package/src/providers/circuit-breaker.js +136 -136
  134. package/src/providers/ollama.js +163 -163
  135. package/src/providers/openai-codex.js +149 -149
  136. package/src/providers/openrouter.js +136 -136
  137. package/src/providers/registry.js +36 -36
  138. package/src/providers/router.js +16 -16
  139. package/src/runtime/bootstrap-cache.js +47 -47
  140. package/src/runtime/capabilities-prompt.js +25 -25
  141. package/src/runtime/completion-ping.js +99 -99
  142. package/src/runtime/config-validator.js +121 -121
  143. package/src/runtime/context-ledger.js +360 -360
  144. package/src/runtime/cutover-readiness.js +42 -42
  145. package/src/runtime/daemon.js +729 -729
  146. package/src/runtime/delivery-ack.js +195 -195
  147. package/src/runtime/delivery-adapters/local-file.js +41 -41
  148. package/src/runtime/delivery-adapters/openclaw-cli.js +94 -94
  149. package/src/runtime/delivery-adapters/openclaw-peer.js +98 -98
  150. package/src/runtime/delivery-adapters/shadow.js +13 -13
  151. package/src/runtime/delivery-adapters/standalone-http.js +98 -98
  152. package/src/runtime/delivery-adapters/telegram.js +104 -104
  153. package/src/runtime/delivery-adapters/tui.js +128 -128
  154. package/src/runtime/delivery-manager.js +807 -807
  155. package/src/runtime/delivery-store.js +168 -168
  156. package/src/runtime/dependency-health.js +118 -118
  157. package/src/runtime/envelope.js +114 -114
  158. package/src/runtime/evaluation.js +1089 -1089
  159. package/src/runtime/exec-approvals.js +216 -216
  160. package/src/runtime/executor.js +500 -500
  161. package/src/runtime/failure-ping.js +67 -67
  162. package/src/runtime/flows.js +83 -83
  163. package/src/runtime/guards.js +45 -45
  164. package/src/runtime/handoff.js +51 -51
  165. package/src/runtime/identity-cache.js +28 -28
  166. package/src/runtime/improvement-engine.js +109 -109
  167. package/src/runtime/improvement-harness.js +581 -581
  168. package/src/runtime/input-sanitiser.js +72 -72
  169. package/src/runtime/interaction-contract.js +347 -347
  170. package/src/runtime/lane-readiness.js +226 -226
  171. package/src/runtime/migration.js +323 -323
  172. package/src/runtime/model-resolution.js +78 -78
  173. package/src/runtime/network.js +64 -64
  174. package/src/runtime/notification-store.js +97 -97
  175. package/src/runtime/notifier.js +256 -256
  176. package/src/runtime/orchestrator.js +53 -53
  177. package/src/runtime/orphan-reaper.js +41 -41
  178. package/src/runtime/output-contract-schema.js +139 -139
  179. package/src/runtime/output-contract-validator.js +439 -439
  180. package/src/runtime/peer-readiness.js +69 -69
  181. package/src/runtime/peer-registry.js +133 -133
  182. package/src/runtime/pilot-status.js +108 -108
  183. package/src/runtime/prompt-builder.js +261 -261
  184. package/src/runtime/provider-attempt.js +582 -582
  185. package/src/runtime/report-fallback.js +71 -71
  186. package/src/runtime/result-normalizer.js +183 -183
  187. package/src/runtime/retention.js +74 -74
  188. package/src/runtime/review.js +244 -244
  189. package/src/runtime/route-job.js +15 -15
  190. package/src/runtime/run-store.js +38 -38
  191. package/src/runtime/schedule.js +88 -88
  192. package/src/runtime/scheduler-state.js +434 -434
  193. package/src/runtime/scheduler.js +656 -656
  194. package/src/runtime/session-compactor.js +182 -182
  195. package/src/runtime/session-search.js +155 -155
  196. package/src/runtime/slack-inbound.js +249 -249
  197. package/src/runtime/ssrf.js +102 -102
  198. package/src/runtime/status-aggregator.js +330 -330
  199. package/src/runtime/task-contract.js +140 -140
  200. package/src/runtime/task-packet.js +107 -107
  201. package/src/runtime/task-router.js +140 -140
  202. package/src/runtime/telegram-inbound.js +1565 -1565
  203. package/src/runtime/token-counter.js +134 -134
  204. package/src/runtime/token-estimator.js +59 -59
  205. package/src/runtime/tool-loop.js +200 -200
  206. package/src/runtime/transport-server.js +311 -311
  207. package/src/runtime/tui-server.js +411 -411
  208. package/src/runtime/ulid.js +44 -44
  209. package/src/security/ssrf-check.js +197 -197
  210. package/src/setup.js +369 -369
  211. package/src/shadow/bridge.js +303 -303
  212. package/src/skills/loader.js +84 -84
  213. package/src/tools/catalog.json +49 -49
  214. package/src/tools/cli-delegate.js +44 -44
  215. package/src/tools/mcp-client.js +106 -106
  216. package/src/tools/micro/cancel-task.js +6 -6
  217. package/src/tools/micro/complete-task.js +6 -6
  218. package/src/tools/micro/fail-task.js +6 -6
  219. package/src/tools/micro/http-fetch.js +74 -74
  220. package/src/tools/micro/index.js +36 -36
  221. package/src/tools/micro/lcm-recall.js +60 -60
  222. package/src/tools/micro/list-dir.js +17 -17
  223. package/src/tools/micro/list-skills.js +46 -46
  224. package/src/tools/micro/load-skill.js +38 -38
  225. package/src/tools/micro/memory-search.js +45 -45
  226. package/src/tools/micro/read-file.js +11 -11
  227. package/src/tools/micro/session-search.js +54 -54
  228. package/src/tools/micro/shell-exec.js +43 -43
  229. package/src/tools/micro/trigger-job.js +79 -79
  230. package/src/tools/micro/web-search.js +58 -58
  231. package/src/tools/micro/workspace-paths.js +39 -39
  232. package/src/tools/micro/write-file.js +14 -14
  233. package/src/tools/micro/write-memory.js +41 -41
  234. package/src/tools/registry.js +348 -348
  235. package/src/tools/tool-result-contract.js +36 -36
  236. package/src/tui/chat.js +835 -835
  237. package/src/tui/renderer.js +175 -175
  238. package/src/tui/socket-client.js +217 -217
  239. package/src/utils/canonical-json.js +29 -29
  240. package/src/utils/compaction.js +30 -30
  241. package/src/utils/env-loader.js +5 -5
  242. package/src/utils/errors.js +80 -80
  243. package/src/utils/fs.js +101 -101
  244. package/src/utils/ids.js +5 -5
  245. package/src/utils/model-context-limits.js +30 -30
  246. package/src/utils/token-budget.js +74 -74
  247. package/src/utils/usage-cost.js +25 -25
  248. package/src/utils/usage-metrics.js +14 -14
@@ -1,1089 +1,1089 @@
1
- import path from "node:path";
2
- import { RunReviewer } from "./review.js";
3
- import { Scheduler } from "./scheduler.js";
4
- import { RunStore } from "./run-store.js";
5
- import { NotificationStore } from "./notification-store.js";
6
- import { DeliveryStore } from "./delivery-store.js";
7
- import { validateOutputContract } from "./output-contract-validator.js";
8
-
9
- function groupBy(items, keyFn) {
10
- const map = new Map();
11
- for (const item of items) {
12
- const key = keyFn(item);
13
- if (!map.has(key)) map.set(key, []);
14
- map.get(key).push(item);
15
- }
16
- return map;
17
- }
18
-
19
- function clamp(value, min = 0, max = 1) {
20
- return Math.max(min, Math.min(max, value));
21
- }
22
-
23
- function average(values) {
24
- if (values.length === 0) return null;
25
- return values.reduce((sum, value) => sum + value, 0) / values.length;
26
- }
27
-
28
- function choosePrimaryV2Run(runs) {
29
- if (!runs.length) return null;
30
- return runs.find((run) => run.mode === "provider") || runs[0];
31
- }
32
-
33
- function pickBestLiveMatch(liveMatches) {
34
- return liveMatches.find((match) => match.latestRun) || liveMatches[0] || null;
35
- }
36
-
37
- function normalizeText(value) {
38
- return String(value || "")
39
- .replace(/\s+/g, " ")
40
- .trim();
41
- }
42
-
43
- function stripOuterFence(text) {
44
- const raw = String(text || "").trim();
45
- const fencedMatch = raw.match(/^```(?:json|markdown|md|text)?\s*([\s\S]*?)\s*```$/i);
46
- return fencedMatch ? fencedMatch[1].trim() : raw;
47
- }
48
-
49
- function parseJsonCandidate(text) {
50
- const candidate = stripOuterFence(text);
51
- if (!(candidate.startsWith("{") && candidate.endsWith("}"))) {
52
- return null;
53
- }
54
-
55
- try {
56
- return JSON.parse(candidate);
57
- } catch {
58
- return null;
59
- }
60
- }
61
-
62
- function titleCase(value) {
63
- return String(value || "")
64
- .replace(/[_-]+/g, " ")
65
- .replace(/^#+\s*/, "")
66
- .replace(/\b\w/g, (char) => char.toUpperCase())
67
- .trim();
68
- }
69
-
70
- function normalizeInline(value) {
71
- return normalizeText(value) || "None";
72
- }
73
-
74
- function normalizeSectionKey(value) {
75
- return String(value || "")
76
- .toLowerCase()
77
- .replace(/^#+\s*/, "")
78
- .replace(/[*:_-]+/g, " ")
79
- .replace(/\s+/g, " ")
80
- .trim();
81
- }
82
-
83
- function renderValue(value, { contract = null, summary = null } = {}) {
84
- if (value == null) return "";
85
-
86
- if (typeof value === "string") {
87
- const parsed = parseJsonCandidate(value);
88
- if (parsed && typeof parsed === "object") {
89
- if (parsed.output !== undefined) {
90
- return renderValue(parsed.output, {
91
- contract,
92
- summary: parsed.summary || summary
93
- });
94
- }
95
-
96
- return renderValue(parsed, { contract, summary });
97
- }
98
-
99
- return stripOuterFence(value);
100
- }
101
-
102
- if (Array.isArray(value)) {
103
- return value
104
- .map((item) => renderValue(item, { contract, summary }))
105
- .filter(Boolean)
106
- .join("\n");
107
- }
108
-
109
- if (typeof value === "object") {
110
- const entries = Object.entries(value);
111
- const sectionOrder = contract?.requiredSections?.length
112
- ? contract.requiredSections
113
- : entries.map(([key]) => key);
114
- const valueMap = new Map(entries.map(([key, entryValue]) => [normalizeSectionKey(key), entryValue]));
115
- const sectionStyle = contract?.profile?.sectionStyle || (contract?.format === "structured_rollup" ? "headings" : "bullets");
116
- const requireStatus = contract?.profile?.requireStatus ?? (contract?.format === "bulleted_briefing");
117
-
118
- if (sectionStyle === "bullets") {
119
- const lines = [];
120
- if (requireStatus && summary) lines.push(`Status: ${normalizeInline(summary)}`);
121
- for (const section of sectionOrder) {
122
- lines.push(`- ${titleCase(section)}: ${normalizeInline(valueMap.get(normalizeSectionKey(section)))}`);
123
- }
124
- return lines.join("\n");
125
- }
126
-
127
- if (sectionStyle === "headings") {
128
- const lines = [];
129
- for (const section of sectionOrder) {
130
- lines.push(`## ${titleCase(section)}`);
131
- lines.push(`- ${normalizeInline(valueMap.get(normalizeSectionKey(section)))}`);
132
- lines.push("");
133
- }
134
- return lines.join("\n").trim();
135
- }
136
-
137
- return entries
138
- .map(([key, entryValue]) => `${titleCase(key)}: ${normalizeInline(entryValue)}`)
139
- .join("\n");
140
- }
141
-
142
- return String(value);
143
- }
144
-
145
- function extractRunText(run, contract = null) {
146
- if (!run) return "";
147
- const summary = run?.result?.summary || run?.summary || "";
148
- const value = run?.result?.output ?? run?.output ?? summary;
149
- return renderValue(value, { contract, summary });
150
- }
151
-
152
- function extractLiveText(run) {
153
- if (!run) return "";
154
- return renderValue(run.summary || run.error || "");
155
- }
156
-
157
- function qualitySignals(text) {
158
- const raw = String(text || "");
159
- const normalized = normalizeText(text);
160
- const lower = normalized.toLowerCase();
161
- const findings = [];
162
- let score = 1;
163
-
164
- if (!normalized) {
165
- findings.push("Output is empty.");
166
- return { score: 0, findings };
167
- }
168
-
169
- if (normalized.length < 40) {
170
- findings.push("Output is very short and may not carry enough signal.");
171
- score -= 0.18;
172
- }
173
-
174
- if (/\[(title|link|summary|sub)\]/i.test(normalized)) {
175
- findings.push("Output still contains placeholder text.");
176
- score -= 0.4;
177
- }
178
-
179
- if (/i('| a)?m sorry|does not support|cannot complete|failed to/i.test(lower)) {
180
- findings.push("Output contains apology or tool-failure language.");
181
- score -= 0.28;
182
- }
183
-
184
- if (/```/.test(raw)) {
185
- findings.push("Output is wrapped in a code block instead of a direct report.");
186
- score -= 0.12;
187
- }
188
-
189
- if (/timed out|timeout/i.test(lower)) {
190
- findings.push("Output references a timeout or stalled execution.");
191
- score -= 0.35;
192
- }
193
-
194
- const repeatedLinePenalty = detectRepeatedLinePenalty(normalized);
195
- if (repeatedLinePenalty > 0) {
196
- findings.push("Output repeats nearly identical numbered lines.");
197
- score -= repeatedLinePenalty;
198
- }
199
-
200
- return {
201
- score: clamp(Number(score.toFixed(4))),
202
- findings
203
- };
204
- }
205
-
206
- function detectRepeatedLinePenalty(text) {
207
- const lines = text
208
- .split("\n")
209
- .map((line) => line.trim())
210
- .filter(Boolean)
211
- .filter((line) => /^\d+\./.test(line));
212
-
213
- if (lines.length < 3) return 0;
214
-
215
- const normalized = lines.map((line) => line.replace(/^\d+\.\s*/, "").toLowerCase());
216
- const unique = new Set(normalized);
217
- const duplicateRatio = 1 - unique.size / normalized.length;
218
- if (duplicateRatio < 0.5) return 0;
219
- return Number((duplicateRatio * 0.25).toFixed(4));
220
- }
221
-
222
- function expandRelatedNotificationFiles(baseFiles, notifications) {
223
- const related = new Set(baseFiles);
224
- let changed = true;
225
- while (changed) {
226
- changed = false;
227
- for (const item of notifications) {
228
- if (!related.has(item.filePath)) continue;
229
- for (const generated of item.generatedNotificationFiles || []) {
230
- if (related.has(generated)) continue;
231
- related.add(generated);
232
- changed = true;
233
- }
234
- }
235
- }
236
- return related;
237
- }
238
-
239
- function evaluateLiveHistory(liveRuns) {
240
- if (liveRuns.length === 0) {
241
- return {
242
- runCount: 0,
243
- okCount: 0,
244
- errorCount: 0,
245
- skippedCount: 0,
246
- failureRate: null,
247
- avgDurationMs: null,
248
- avgTotalTokens: null,
249
- avgQualityScore: null,
250
- findings: ["No recent live history available."]
251
- };
252
- }
253
-
254
- const okCount = liveRuns.filter((run) => run.status === "ok").length;
255
- const errorCount = liveRuns.filter((run) => run.status === "error").length;
256
- const skippedCount = liveRuns.filter((run) => run.status === "skipped").length;
257
- const failureRate = errorCount / liveRuns.length;
258
- const durations = liveRuns.map((run) => run.durationMs).filter((value) => Number.isFinite(value));
259
- const tokens = liveRuns.map((run) => run.usage?.total_tokens).filter((value) => Number.isFinite(value));
260
- const qualityScores = liveRuns
261
- .map((run) => qualitySignals(run.summary || run.error || "").score)
262
- .filter((value) => Number.isFinite(value));
263
-
264
- const findings = [];
265
- if (failureRate >= 0.34) findings.push("Live cron lane is failing often.");
266
- if (skippedCount > 0) findings.push("Live cron history includes intentional skips.");
267
- if (tokens.some((value) => value > 12000)) findings.push("Live cron lane shows high token usage spikes.");
268
-
269
- return {
270
- runCount: liveRuns.length,
271
- okCount,
272
- errorCount,
273
- skippedCount,
274
- failureRate: Number(failureRate.toFixed(4)),
275
- avgDurationMs: durations.length ? Math.round(average(durations)) : null,
276
- avgTotalTokens: tokens.length ? Math.round(average(tokens)) : null,
277
- avgQualityScore: qualityScores.length ? Number(average(qualityScores).toFixed(4)) : null,
278
- findings
279
- };
280
- }
281
-
282
- function scoreStatus(status) {
283
- if (status === "ok") return 1;
284
- if (status === "skipped") return 0.6;
285
- if (status === "error") return 0;
286
- return 0.5;
287
- }
288
-
289
- function buildRubric({ v2Run, liveHistory, matchedLiveJob, outputContract = null, contractCheck = null }) {
290
- const findings = [];
291
- const v2Text = extractRunText(v2Run, outputContract);
292
- const v2Quality = qualitySignals(v2Text);
293
- const latestLive = liveHistory[0] || null;
294
- const latestLiveQuality = qualitySignals(extractLiveText(latestLive));
295
- const history = evaluateLiveHistory(liveHistory);
296
- const isDryRun = v2Run != null && v2Run.mode !== "provider";
297
-
298
- const comparisonReadiness = v2Run
299
- ? v2Run.mode === "provider"
300
- ? 1
301
- : 0.45
302
- : 0;
303
- const comparisonCoverage = matchedLiveJob ? 1 : 0.2;
304
- const contractAdherence = contractCheck ? contractCheck.satisfiedRatio : null;
305
- if (!v2Run) findings.push("No V2 run exists for this lane yet.");
306
- else if (isDryRun) findings.push("Only a dry-run V2 artifact exists; output quality and contract adherence are excluded from the rubric score.");
307
- if (!matchedLiveJob) findings.push("Comparison coverage is low because no live cron analogue is mapped.");
308
-
309
- if (!isDryRun && v2Quality.findings.length) findings.push(...v2Quality.findings.map((item) => `V2: ${item}`));
310
- if (latestLiveQuality.findings.length) findings.push(...latestLiveQuality.findings.map((item) => `Live: ${item}`));
311
- if (history.findings.length) findings.push(...history.findings);
312
-
313
- const fieldScores = contractCheck?.fieldScores || null;
314
- if (!isDryRun && fieldScores) {
315
- if (fieldScores.weakFields.length) {
316
- findings.push(`Field-level quality is low for: ${fieldScores.weakFields.join(", ")}`);
317
- }
318
- if (fieldScores.placeholderFields.length) {
319
- findings.push(`Placeholder text detected in: ${fieldScores.placeholderFields.join(", ")}`);
320
- }
321
- }
322
-
323
- if (!matchedLiveJob) {
324
- findings.push("No live cron analogue matched for this V2 job.");
325
- }
326
-
327
- const statusAlignment =
328
- latestLive && v2Run
329
- ? Number((scoreStatus(v2Run.result ? "ok" : v2Run.status) * scoreStatus(latestLive.status)).toFixed(4))
330
- : null;
331
- const liveReliability = history.failureRate == null ? null : Number((1 - history.failureRate).toFixed(4));
332
-
333
- const fieldLevelQuality = !isDryRun && fieldScores ? fieldScores.averageScore : null;
334
-
335
- const components = [
336
- isDryRun ? null : v2Quality.score,
337
- comparisonReadiness,
338
- comparisonCoverage,
339
- isDryRun ? null : contractAdherence,
340
- fieldLevelQuality,
341
- liveReliability,
342
- statusAlignment
343
- ].filter((value) => value != null);
344
- const overallScore = components.length ? Number(average(components).toFixed(4)) : 0;
345
-
346
- return {
347
- overallScore,
348
- dryRunExcluded: isDryRun,
349
- components: {
350
- v2OutputQuality: isDryRun ? null : v2Quality.score,
351
- comparisonReadiness,
352
- comparisonCoverage,
353
- contractAdherence: isDryRun ? null : contractAdherence,
354
- fieldLevelQuality,
355
- liveReliability,
356
- statusAlignment
357
- },
358
- findings: uniqueStrings(findings),
359
- metrics: {
360
- liveRunCount: history.runCount,
361
- liveOkCount: history.okCount,
362
- liveErrorCount: history.errorCount,
363
- liveSkippedCount: history.skippedCount,
364
- liveFailureRate: history.failureRate,
365
- liveAvgDurationMs: history.avgDurationMs,
366
- liveAvgTotalTokens: history.avgTotalTokens,
367
- liveAvgQualityScore: history.avgQualityScore,
368
- latestLiveQualityScore: latestLive ? latestLiveQuality.score : null
369
- }
370
- };
371
- }
372
-
373
- function uniqueStrings(items) {
374
- return [...new Set(items.filter(Boolean))];
375
- }
376
-
377
- function summarizeV2Run(run) {
378
- if (!run) return null;
379
- return {
380
- timestamp: run.timestamp,
381
- mode: run.mode || null,
382
- providerId: run.providerId || null,
383
- modelId: run.modelId || null,
384
- summary: run.result?.summary || run.summary || null,
385
- output: run.result?.output || run.output || null,
386
- retrievalMeta: run.retrievalMeta || null,
387
- retrievedMemoryCount: run.retrievedMemory?.length || 0,
388
- interaction: summarizeInteraction(run),
389
- fallback: run.fallback || null
390
- };
391
- }
392
-
393
- function summarizeInteraction(run) {
394
- const interaction = run?.interaction || null;
395
- if (!interaction) return null;
396
- return {
397
- ackRequired: interaction.ack?.required ?? false,
398
- completionRequired: interaction.completion?.required ?? false,
399
- handoffRequired: interaction.handoff?.required ?? false,
400
- yielded: run?.yielded || false,
401
- yieldSignal: run?.yieldSignal || null,
402
- followUpState: run?.followUpState || null,
403
- followUpQueued: run?.followUpQueued || false,
404
- followUpConsumed: run?.followUpConsumed || false,
405
- followUpTarget: run?.followUpTarget || null,
406
- followUpCompleted: run?.followUpCompleted || false,
407
- followUpExpired: run?.followUpExpired || false,
408
- followUpEscalated: run?.followUpEscalated || false,
409
- followUpEscalationFilePath: run?.followUpEscalationFilePath || null,
410
- handoffTarget: interaction.handoff?.target || null,
411
- suggestedPeerCount: interaction.handoff?.suggestions?.length || 0,
412
- handoffChosenPeerId: run?.handoffChosenPeerId || null,
413
- handoffChosenBy: run?.handoffChosenBy || null,
414
- handoffDelivered: run?.handoffDelivered || false,
415
- handoffDeliveryState: run?.handoffDeliveryState || null,
416
- deliveryDeduped: run?.deliveryDeduped || false,
417
- deliveryRetried: run?.deliveryRetried || false,
418
- deliveryUncertain: run?.deliveryUncertain || false
419
- };
420
- }
421
-
422
- function analyzeRetrieval(run) {
423
- const retrievalMeta = run?.retrievalMeta || run?.plan?.packet?.layers?.retrievalMeta || null;
424
- if (!run?.retrievedMemory?.length) {
425
- return {
426
- memoryCount: 0,
427
- lexicalCount: 0,
428
- semanticCount: 0,
429
- qmdCount: 0,
430
- freshEmbeddingCount: 0,
431
- staleEmbeddingCount: 0,
432
- missingEmbeddingCount: 0,
433
- failedEmbeddingCount: retrievalMeta?.embeddingHealth?.failedCount || 0,
434
- embeddingQueryMode: retrievalMeta?.embeddingQueryMode || "lexical_only",
435
- embeddingError: retrievalMeta?.embeddingError || retrievalMeta?.embeddingHealth?.lastError || null,
436
- findings: ["No retrieved memory was captured for this run."],
437
- items: []
438
- };
439
- }
440
-
441
- const items = run.retrievedMemory.map((item) => ({
442
- entryId: item.entryId || null,
443
- title: item.title || null,
444
- sourceBackend: item.sourceBackend || "file",
445
- candidateSource: item.candidateSource || "indexed",
446
- lexicalScore: item.lexicalScore ?? 0,
447
- embeddingSimilarity: item.embeddingSimilarity ?? 0,
448
- embeddingFreshness: item.embeddingFreshness || "missing",
449
- retrievalSources: item.retrievalSources || []
450
- }));
451
-
452
- const lexicalCount = items.filter((item) => item.retrievalSources.includes("lexical")).length;
453
- const semanticCount = items.filter((item) => item.retrievalSources.includes("semantic")).length;
454
- const qmdCount = items.filter((item) => item.retrievalSources.includes("qmd") || item.sourceBackend === "qmd").length;
455
- const freshEmbeddingCount = items.filter((item) => item.embeddingFreshness === "fresh").length;
456
- const staleEmbeddingCount = items.filter((item) => item.embeddingFreshness === "stale").length;
457
- const missingEmbeddingCount = items.filter((item) => item.embeddingFreshness === "missing").length;
458
- const failedEmbeddingCount = items.filter((item) => item.embeddingFreshness === "failed").length;
459
- const findings = [];
460
-
461
- if (semanticCount === 0) findings.push("No semantic retrieval candidates reached the final packet.");
462
- if (staleEmbeddingCount > 0) findings.push("Some retrieved items have stale embeddings.");
463
- if (freshEmbeddingCount === 0 && missingEmbeddingCount > 0) findings.push("Retrieved file memory is falling back to non-embedded retrieval.");
464
- if ((retrievalMeta?.embeddingError || retrievalMeta?.embeddingHealth?.lastError) && !findings.includes("Embedding query failed and retrieval fell back to lexical mode.")) {
465
- findings.push("Embedding query failed and retrieval fell back to lexical mode.");
466
- }
467
-
468
- return {
469
- memoryCount: items.length,
470
- lexicalCount,
471
- semanticCount,
472
- qmdCount,
473
- freshEmbeddingCount,
474
- staleEmbeddingCount,
475
- missingEmbeddingCount,
476
- failedEmbeddingCount,
477
- embeddingQueryMode: retrievalMeta?.embeddingQueryMode || "lexical_only",
478
- embeddingError: retrievalMeta?.embeddingError || retrievalMeta?.embeddingHealth?.lastError || null,
479
- embeddingHealth: retrievalMeta?.embeddingHealth || null,
480
- findings,
481
- items
482
- };
483
- }
484
-
485
/**
 * Condenses interaction lifecycle flags into one diagnosis record.
 *
 * The first matching rule wins, in this order: no findings (healthy), missing
 * required notifications, incomplete follow-up, missing delivery evidence,
 * incomplete handoff, uncertain delivery, and finally a generic warning that
 * surfaces the first finding.
 *
 * @param {object} evidence - Lifecycle flags plus the `findings` string list.
 * @returns {{status: string, code: (string|null), summary: string, missingStages: string[], findings: string[]}}
 */
function summarizeInteractionDiagnosis({
  ackRequired,
  ackQueued,
  completionRequired,
  completionQueued,
  handoffRequired,
  handoffQueued,
  yielded,
  followUpQueued,
  followUpConsumed,
  handoffDelivered,
  deliveryEvidenceRequired,
  deliveryEvidenceHealthy,
  deliveryUncertain,
  findings
}) {
  // `yielded` suppresses the completion and handoff checks; ack is always checked.
  const missingStages = [
    ["ack", ackRequired && !ackQueued],
    ["completion", completionRequired && !completionQueued && !yielded],
    ["handoff", handoffRequired && !handoffQueued && !yielded]
  ]
    .filter(([, isMissing]) => isMissing)
    .map(([stage]) => stage);

  const diagnose = (status, code, summary, reported = findings) => ({
    status,
    code,
    summary,
    missingStages,
    findings: reported
  });

  if (!findings?.length) {
    return diagnose("healthy", null, "Interaction lifecycle evidence is complete.", []);
  }

  // Ordered rule table: [applies, status, code, summary].
  const rules = [
    [
      missingStages.length,
      "action_required",
      "missing_required_notifications",
      `Required interaction notifications are missing: ${missingStages.join(", ")}.`
    ],
    [
      yielded && followUpQueued && !followUpConsumed,
      "action_required",
      "follow_up_incomplete",
      "Yielded follow-up is still pending or incomplete."
    ],
    [
      deliveryEvidenceRequired && !deliveryEvidenceHealthy,
      "action_required",
      "missing_delivery_evidence",
      "Required pingback notifications do not have delivery receipts yet."
    ],
    [
      handoffRequired && handoffQueued && !handoffDelivered,
      "action_required",
      "handoff_incomplete",
      "Handoff was queued but not fully delivered."
    ],
    [
      deliveryUncertain,
      "warning",
      "delivery_uncertain",
      "Interaction lifecycle completed with uncertain delivery evidence."
    ]
  ];

  const matched = rules.find(([applies]) => applies);
  if (matched) {
    const [, status, code, summary] = matched;
    return diagnose(status, code, summary);
  }

  return diagnose(
    "warning",
    "interaction_findings_present",
    findings[0] || "Interaction lifecycle has unresolved findings."
  );
}
574
-
575
/**
 * Derives interaction lifecycle evidence for a run from its notifications and
 * delivery receipts, collects human-readable findings, and attaches a
 * diagnosis produced by `summarizeInteractionDiagnosis`.
 *
 * @param {object|null} run - Run record; only `run.interaction` is read.
 * @param {object[]} [notifications=[]] - Notification records (matched by `stage`).
 * @param {object[]} [deliveries=[]] - Delivery receipts (matched by `notificationFilePath`).
 * @returns {object} Flat evidence record including `diagnosis` and `findings`.
 */
function analyzeInteraction(run, notifications = [], deliveries = []) {
  const interaction = run?.interaction || null;
  const isQueued = (stage) => notifications.some((entry) => entry.stage === stage);
  const firstQueued = (stage) => notifications.find((entry) => entry.stage === stage) || null;

  // Required-versus-queued flags per lifecycle stage.
  const ackRequired = interaction?.ack?.required ?? false;
  const completionRequired = interaction?.completion?.required ?? false;
  const handoffRequired = interaction?.handoff?.required ?? false;
  const ackQueued = isQueued("ack");
  const completionQueued = isQueued("completion");
  const handoffQueued = isQueued("handoff");
  const followUpQueued = isQueued("follow_up");
  const followUpNotification = firstQueued("follow_up");
  const handoffNotification = firstQueued("handoff");

  // Handoff state, taken from the first handoff notification.
  const handoffPendingChoice = handoffNotification?.status === "awaiting_choice";
  const handoffState = handoffNotification?.handoffState || (handoffPendingChoice ? "pending" : null);
  const handoffChosenPeerId = handoffNotification?.chosenPeer?.peerId || null;
  const handoffChosen = Boolean(handoffChosenPeerId);
  const handoffChosenBy = handoffNotification?.chosenBy || null;
  let handoffDelivery = null;
  if (handoffNotification) {
    handoffDelivery =
      deliveries.find((receipt) => receipt.notificationFilePath === handoffNotification.filePath) || null;
  }
  const handoffDelivered = Boolean(handoffDelivery);
  const handoffDeliveryState = handoffDelivery?.delivery?.status || null;

  // Yield / follow-up state.
  const deliveryStates = deliveries.map((receipt) => receipt.delivery?.status || receipt.stage || "unknown");
  const yielded = interaction?.yield?.required ?? false;
  const yieldSignal = interaction?.yield?.signal || null;
  const followUpState = followUpNotification?.followUpState || followUpNotification?.yieldState || null;
  const followUpConsumed = ["status", "yieldState", "followUpState"].some(
    (field) => followUpNotification?.[field] === "consumed"
  );
  const followUpTarget = followUpNotification?.targetSurface || interaction?.yield?.targetSurface || null;
  const generatedFiles = new Set(followUpNotification?.generatedNotificationFiles || []);
  const isGeneratedReceipt = (receipt) => generatedFiles.has(receipt.notificationFilePath);
  const followUpCompleted = deliveries.some(isGeneratedReceipt);
  const followUpExpired = followUpState === "expired";
  const followUpEscalated = followUpState === "escalated";

  // Delivery evidence.
  const deliveryDeduped = deliveryStates.includes("duplicate_prevented");
  const deliveryRetried = deliveries.some((receipt) => Number(receipt.attempt) > 1);
  const deliveryUncertain = deliveryStates.some(
    (state) => state === "delivery_uncertain" || state === "uncertain"
  );
  const yieldedCompletionHandled = yielded && followUpQueued && Boolean(followUpNotification?.payload?.completion);
  const yieldedHandoffHandled = yielded && followUpQueued && Boolean(followUpNotification?.payload?.handoff);
  // For yielded runs, receipts of follow-up-generated notifications are not counted.
  const visibleDeliveryCount = yielded
    ? deliveries.filter((receipt) => !isGeneratedReceipt(receipt)).length
    : deliveries.length;
  const deliveryEvidenceRequired = !yielded && Boolean(ackQueued || completionQueued || handoffQueued);
  const deliveryEvidenceHealthy = !deliveryEvidenceRequired || visibleDeliveryCount > 0;

  // Ordered [condition, message] table; every truthy condition contributes its message.
  const checks = [
    [ackRequired && !ackQueued, "Required ack notification was not queued."],
    [
      completionRequired && !completionQueued && !yieldedCompletionHandled,
      "Required completion notification was not queued."
    ],
    [handoffRequired && !handoffQueued && !yieldedHandoffHandled, "Configured handoff was not queued."],
    [yielded && !followUpQueued, "Run yielded but no follow-up payload was persisted."],
    [followUpQueued && !followUpConsumed, "Follow-up payload was persisted but never consumed."],
    [followUpExpired, "Follow-up expired before it was consumed."],
    [followUpEscalated, "Follow-up expired and was escalated to the operator."],
    [handoffPendingChoice, "Handoff is awaiting an explicit peer choice."],
    [handoffState === "expired", "Handoff expired without an operator choice."],
    [handoffState === "escalated", "Handoff expired and was escalated to the operator."],
    [
      handoffState === "blocked",
      `Handoff is blocked${handoffNotification?.blockedReason ? `: ${handoffNotification.blockedReason}` : "."}`
    ],
    [
      handoffQueued && !handoffPendingChoice && !handoffChosen,
      "Handoff was queued but no peer choice was recorded."
    ],
    [handoffChosen && !handoffDelivered, "Handoff was promoted to a peer but not delivered yet."],
    [
      handoffDeliveryState && /error|blocked/i.test(handoffDeliveryState),
      `Handoff delivery did not complete cleanly: ${handoffDeliveryState}.`
    ],
    [!deliveryEvidenceHealthy, "No delivery receipts exist yet for queued interaction notifications."],
    [deliveryDeduped, "A duplicate delivery attempt was prevented by dedupe policy."],
    [deliveryUncertain, "A delivery attempt was marked uncertain and will not be blindly retried."]
  ];
  const findings = checks.filter(([failed]) => failed).map(([, message]) => message);

  const diagnosis = summarizeInteractionDiagnosis({
    ackRequired,
    ackQueued,
    completionRequired,
    completionQueued,
    handoffRequired,
    handoffQueued,
    yielded,
    followUpQueued,
    followUpConsumed,
    handoffDelivered,
    deliveryEvidenceRequired,
    deliveryEvidenceHealthy,
    deliveryUncertain,
    findings
  });

  return {
    ackRequired,
    ackQueued,
    completionRequired,
    completionQueued,
    handoffRequired,
    handoffQueued,
    yielded,
    yieldSignal,
    followUpState,
    followUpQueued,
    followUpConsumed,
    followUpTarget,
    followUpCompleted,
    followUpExpired,
    followUpEscalated,
    followUpEscalationFilePath: followUpNotification?.escalationNotificationFilePath || null,
    handoffPendingChoice,
    handoffState,
    handoffExpired: handoffState === "expired",
    handoffEscalated: handoffState === "escalated",
    handoffBlocked: handoffState === "blocked",
    handoffChosen,
    handoffChosenPeerId,
    handoffChosenBy,
    handoffDelivered,
    handoffDeliveryState,
    handoffBlockedReason: handoffNotification?.blockedReason || null,
    deliveryEvidenceRequired,
    deliveryEvidenceHealthy,
    deliveryReceiptCount: visibleDeliveryCount,
    deliveryStates,
    deliveryDeduped,
    deliveryRetried,
    deliveryUncertain,
    diagnosis,
    findings
  };
}
697
-
698
/**
 * Projects a live run record onto the fixed set of fields reported in
 * evaluations, substituting `null` for anything absent.
 *
 * @param {object|null|undefined} run - Live run record.
 * @returns {object|null} Summary object, or `null` when `run` is falsy.
 */
function summarizeLiveRun(run) {
  if (!run) return null;
  const { ts, status, summary, error, durationMs, provider, model, usage } = run;
  return {
    ts: ts || null,
    status: status || null,
    summary: summary || null,
    error: error || null,
    // `??` keeps a legitimate duration of 0.
    durationMs: durationMs ?? null,
    provider: provider || null,
    model: model || null,
    usage: usage || null
  };
}
711
-
712
// Keywords searched for (as lowercase substrings) when an output has no
// explicit section headings to compare.
const SECTION_HINTS = [
  "calendar", "weather", "projects", "project",
  "inbox", "issues", "backlog", "update",
  "summary", "next actions", "alerts", "status"
];

// Maps singular section names onto the canonical plural form.
const SECTION_ALIASES = new Map()
  .set("project", "projects")
  .set("next action", "next actions");
731
-
732
/**
 * Normalizes a section label and resolves it through `SECTION_ALIASES`.
 *
 * @param {*} value - Raw section label.
 * @returns {string} The alias target when one exists, otherwise the normalized key.
 */
function canonicalSectionKey(value) {
  const key = normalizeSectionKey(value);
  const alias = SECTION_ALIASES.get(key);
  return alias || key;
}
736
-
737
/**
 * Collects the distinct canonical section keys found in a list of lines.
 *
 * Each line is tried against the patterns in order (markdown heading, bold
 * line, "- label: text" bullet, "label: text"). The first pattern that yields
 * a key of at least two characters decides the line; later patterns are not
 * consulted for it.
 *
 * @param {string[]} nonEmptyLines - Trimmed, non-empty lines.
 * @returns {string[]} Keys in order of first appearance, without duplicates.
 */
function extractSectionKeys(nonEmptyLines) {
  const patterns = [
    /^#{1,6}\s+(.+)$/,
    /^\*\*([^*]+)\*\*$/,
    /^[-*]\s+\**([^:*]+?)\**:\s+.+$/,
    /^([^:]{2,40}):\s+.+$/
  ];
  // A Set iterates in insertion order, so it doubles as the ordered result.
  const seen = new Set();

  for (const line of nonEmptyLines) {
    for (const pattern of patterns) {
      const hit = line.match(pattern);
      if (!hit) continue;
      const key = canonicalSectionKey(hit[1]);
      if (!key || key.length < 2) continue;
      seen.add(key);
      break;
    }
  }

  return [...seen];
}
762
-
763
/**
 * Picks the section list used for comparison: explicit section keys when any
 * were extracted, otherwise the keyword hints.
 *
 * @param {{sectionKeys: string[], sectionHints: string[]}} structure
 * @returns {string[]}
 */
function comparableSections(structure) {
  const { sectionKeys, sectionHints } = structure;
  if (sectionKeys.length) return sectionKeys;
  return sectionHints;
}
766
-
767
/**
 * Computes coarse structural statistics for a piece of report text.
 *
 * @param {*} text - Report text; falsy values are treated as an empty string.
 * @returns {object} Counts (characters, non-empty lines, headings, bullets,
 *   numbered items, code fences), extracted section keys, matched section
 *   hints, and table/emoji flags.
 */
function analyzeOutputStructure(text) {
  const raw = String(text || "");
  const trimmed = raw.trim();
  const lower = trimmed.toLowerCase();
  const nonEmptyLines = raw
    .split("\n")
    .map((line) => line.trim())
    .filter(Boolean);
  const countLines = (predicate) => nonEmptyLines.filter(predicate).length;
  const fenceMarkerCount = (raw.match(/```/g) || []).length;

  return {
    charCount: trimmed.length,
    lineCount: nonEmptyLines.length,
    headingCount: countLines((line) => /^#{1,6}\s/.test(line) || /^\*\*[^*]+\*\*/.test(line)),
    bulletCount: countLines((line) => /^[-*]\s/.test(line)),
    numberedCount: countLines((line) => /^\d+\.\s/.test(line)),
    // Count complete open/close pairs only; an unterminated fence previously
    // produced a fractional count such as 0.5.
    codeFenceCount: Math.floor(fenceMarkerCount / 2),
    sectionKeys: extractSectionKeys(nonEmptyLines),
    sectionHints: SECTION_HINTS.filter((hint) => lower.includes(hint)),
    hasMarkdownTable: /\|.+\|/.test(raw),
    hasEmoji: /\p{Extended_Pictographic}/u.test(raw)
  };
}
788
-
789
/**
 * Compares the structure of a V2 run's output with a live run's output.
 *
 * @param {object|null} v2Run - V2 run record.
 * @param {object|null} liveRun - Live run record.
 * @param {object|null} [contract=null] - Output contract used when rendering the V2 text.
 * @returns {{v2: object, live: object, alignment: object, findings: string[]}}
 */
function buildOutputDiff(v2Run, liveRun, contract = null) {
  const liveText = extractLiveText(liveRun);
  const v2 = analyzeOutputStructure(extractRunText(v2Run, contract));
  const live = analyzeOutputStructure(liveText);

  const v2Sections = comparableSections(v2);
  const liveSections = comparableSections(live);
  const inLive = (section) => liveSections.includes(section);
  const inV2 = (section) => v2Sections.includes(section);
  const sharedSections = v2Sections.filter(inLive);
  const missingFromV2 = liveSections.filter((section) => !inV2(section));
  const extraInV2 = v2Sections.filter((section) => !inLive(section));

  // Bullets and numbered items together stand in for "list structure".
  const v2ListItems = v2.bulletCount + v2.numberedCount;
  const liveListItems = live.bulletCount + live.numberedCount;

  const findings = [];
  if (liveText) {
    if (missingFromV2.length) {
      findings.push(`V2 is missing live sections: ${missingFromV2.join(", ")}.`);
    }
    if (v2ListItems < liveListItems) {
      findings.push("V2 output is less structured than the live report.");
    }
    if (v2.charCount > 0 && live.charCount > 0) {
      const verbosityRatio = Number((v2.charCount / live.charCount).toFixed(4));
      if (verbosityRatio < 0.55) findings.push("V2 output is much shorter than the live report.");
      if (verbosityRatio > 1.8) findings.push("V2 output is much longer than the live report.");
    }
  } else {
    findings.push("No live output is available for structural comparison.");
  }

  return {
    v2,
    live,
    alignment: {
      sharedSections,
      missingFromV2,
      extraInV2,
      bulletDelta: v2ListItems - liveListItems,
      headingDelta: v2.headingCount - live.headingCount,
      charDelta: v2.charCount - live.charCount
    },
    findings
  };
}
829
-
830
/**
 * Validates both the V2 output and the live output against an output
 * contract and merges the two results into one report.
 *
 * @param {object|null} contract - Output contract; `null` short-circuits.
 * @param {object|null} v2Run - V2 run; `result.output` is preferred over `output`.
 * @param {object|null} liveRun - Live run; `summary` is preferred over `error`.
 * @returns {object|null} Contract assessment, or `null` when no contract is given.
 */
function assessOutputContract(contract, v2Run, liveRun) {
  if (!contract) return null;

  const v2Validation = validateOutputContract(contract, v2Run?.result?.output ?? v2Run?.output ?? "");
  const liveValidation = validateOutputContract(contract, liveRun?.summary || liveRun?.error || "");

  // Guard `findings` itself, not just the validation object, so a result
  // without a findings list cannot throw here (matches the other fields).
  const labelFindings = (validation, label) =>
    (validation?.findings || []).map((item) => `${label}: ${item}`);

  return {
    format: contract.format || null,
    requiredSections: contract.requiredSections || [],
    styleHints: contract.styleHints || [],
    satisfiedRatio: v2Validation?.satisfiedRatio ?? 0,
    missingFromV2: v2Validation?.missingSections || [],
    missingFromLive: liveValidation?.missingSections || [],
    emptyInV2: v2Validation?.emptySections || [],
    emptyInLive: liveValidation?.emptySections || [],
    parsedV2: v2Validation?.parsed || null,
    parsedLive: liveValidation?.parsed || null,
    fieldScores: v2Validation?.fieldScores || null,
    findings: [...labelFindings(v2Validation, "V2"), ...labelFindings(liveValidation, "Live")]
  };
}
857
-
858
- export { buildRubric };
859
-
860
/**
 * Builds per-job evaluation reports by combining reviewed V2 runs, matched
 * live cron runs, notification/delivery evidence, and the scoring rubric.
 */
export class Evaluator {
  /**
   * @param {object} options
   * @param {string} options.projectRoot - Passed through to the Scheduler.
   * @param {string} options.liveRoot - Passed through to the Scheduler.
   * @param {string} options.stateRoot - Root for the evaluations, notifications
   *   and deliveries stores.
   */
  constructor({ projectRoot, liveRoot, stateRoot }) {
    this.liveRoot = liveRoot;
    this.stateRoot = stateRoot;
    this.reviewer = new RunReviewer({ stateRoot });
    this.scheduler = new Scheduler({ projectRoot, liveRoot, stateRoot });
    this.evalStore = new RunStore({ rootDir: path.join(stateRoot, "evaluations") });
    this.notificationStore = new NotificationStore({ rootDir: path.join(stateRoot, "notifications") });
    this.deliveryStore = new DeliveryStore({ rootDir: path.join(stateRoot, "deliveries") });
  }

  /**
   * Gathers the notifications and delivery receipts that belong to a run,
   * following `generatedNotificationFiles` links transitively.
   *
   * @param {object|null} run - Run whose `notificationFiles` seed the search.
   * @param {object|null} [review=null] - Review whose `recentNotifications` are reused.
   * @returns {Promise<{notificationFiles: Set<string>, notifications: object[], deliveries: object[]}>}
   */
  async resolveInteractionArtifacts(run, review = null) {
    const reviewNotifications = review?.recentNotifications || [];
    const baseFiles = run?.notificationFiles || [];
    const relatedNotificationFiles = expandRelatedNotificationFiles(baseFiles, reviewNotifications);
    const knownNotifications = reviewNotifications.filter((item) => relatedNotificationFiles.has(item.filePath));

    // Only files the review did not already provide are loaded from the store.
    const knownFilePaths = new Set(knownNotifications.map((item) => item.filePath));
    const missingNotificationFiles = [...relatedNotificationFiles].filter(
      (filePath) => !knownFilePaths.has(filePath)
    );
    const loadedNotifications = await this.notificationStore.getNotifications(missingNotificationFiles);
    const notifications = [...knownNotifications, ...loadedNotifications].sort((a, b) =>
      String(a.timestamp || "").localeCompare(String(b.timestamp || ""))
    );
    // Expand again: freshly loaded notifications may reference further files.
    const expandedNotificationFiles = expandRelatedNotificationFiles(baseFiles, notifications);

    // Load all delivery receipts once for deterministic interaction evidence.
    const allDeliveries = await this.deliveryStore.listAll();
    const deliveries = allDeliveries.filter((item) => expandedNotificationFiles.has(item.notificationFilePath));
    return {
      notificationFiles: expandedNotificationFiles,
      notifications,
      deliveries
    };
  }

  /**
   * Evaluates every job that has recent (non-comparison) runs.
   *
   * @param {number} [limit=20] - Passed to the reviewer.
   * @returns {Promise<{jobs: object[], recentRuns: object[], scheduler: *}>}
   *   `jobs` is sorted by ascending overall rubric score.
   */
  async evaluate(limit = 20) {
    const [review, comparisons] = await Promise.all([
      this.reviewer.review(limit),
      this.scheduler.compareJobs()
    ]);

    const recentRuns = review.recentRuns.filter((run) => !run.jobId.endsWith("-comparison"));
    const grouped = groupBy(recentRuns, (run) => run.jobId);

    const jobs = await Promise.all(
      Array.from(grouped.entries()).map(async ([jobId, runs]) => this.evaluateJob(jobId, {
        runs,
        review,
        comparisons
      }))
    );

    return {
      jobs: [...jobs].sort((a, b) => a.rubric.overallScore - b.rubric.overallScore),
      recentRuns,
      scheduler: review.scheduler
    };
  }

  /**
   * Builds the evaluation report for a single job.
   *
   * @param {string} jobId - Job identifier.
   * @param {object} [options={}] - Optional precomputed `review`, `comparisons`
   *   and `runs`, plus `limit` for the reviewer when `review` is absent.
   * @returns {Promise<object>} Evaluation report.
   */
  async evaluateJob(jobId, options = {}) {
    const runtime = await this.scheduler.loadRuntime();
    const jobConfig = runtime.jobs[jobId] || null;
    const review = options.review || (await this.reviewer.review(options.limit ?? 20));
    const comparisons = options.comparisons || (await this.scheduler.compareJobs());
    const runs = options.runs || review.recentRuns.filter((run) => run.jobId === jobId && !run.jobId.endsWith("-comparison"));
    const schedulerState = review.scheduler.find((item) => item.jobId === jobId) || null;
    const comparison = comparisons.find((item) => item.v2JobId === jobId) || null;
    const liveMatches = await Promise.all(
      (comparison?.closestLiveJobs || []).map(async (match) => ({
        ...match,
        recentRuns: await this.scheduler.bridge.loadCronRunHistory(match.id, 5),
        latestRun: (await this.scheduler.bridge.loadCronRunHistory(match.id, 1))[0] || null
      }))
    );

    const selectedRun = choosePrimaryV2Run(runs);
    // Interaction evidence is taken from the first run in the list, which may
    // differ from the provider-mode run selected for scoring.
    const interactionRun = runs[0] || selectedRun;
    const selectedLiveMatch = pickBestLiveMatch(liveMatches);
    const liveHistory = selectedLiveMatch?.recentRuns || [];
    const outputContract = normalizeOutputContract(jobConfig?.outputContract || null);
    const outputDiff = buildOutputDiff(selectedRun, selectedLiveMatch?.latestRun || null, outputContract);
    const contractCheck = assessOutputContract(outputContract, selectedRun, selectedLiveMatch?.latestRun || null);
    const retrieval = analyzeRetrieval(selectedRun);
    const interactionArtifacts = await this.resolveInteractionArtifacts(interactionRun, review);
    const interaction = analyzeInteraction(interactionRun, interactionArtifacts.notifications, interactionArtifacts.deliveries);
    const rubric = buildRubric({
      v2Run: selectedRun,
      liveHistory,
      matchedLiveJob: selectedLiveMatch,
      outputContract,
      contractCheck
    });

    return {
      jobId,
      schedulerState,
      maintenance: review?.maintenance || null,
      latestRun: runs[0] || null,
      primaryRun: summarizeV2Run(selectedRun),
      interactionRunTimestamp: interactionRun?.timestamp || null,
      runCount: runs.length,
      modeCounts: countModes(runs),
      liveMatches: liveMatches.map((match) => ({
        id: match.id,
        name: match.name,
        description: match.description || null,
        enabled: match.enabled,
        schedule: match.schedule,
        kind: match.kind || null,
        lastStatus: match.lastStatus || null,
        latestRun: summarizeLiveRun(match.latestRun)
      })),
      selectedLiveMatch: selectedLiveMatch
        ? {
            id: selectedLiveMatch.id,
            name: selectedLiveMatch.name,
            latestRun: summarizeLiveRun(selectedLiveMatch.latestRun)
          }
        : null,
      retrieval,
      interaction,
      outputContract,
      contractCheck,
      outputDiff,
      rubric,
      comparisonNotes: buildComparisonNotes({
        jobId,
        selectedRun,
        selectedLiveMatch,
        schedulerState,
        maintenance: review?.maintenance || null,
        rubric,
        outputDiff,
        contractCheck,
        retrieval,
        interaction
      }),
      interactionArtifacts: {
        notificationCount: interactionArtifacts.notifications.length,
        deliveryCount: interactionArtifacts.deliveries.length,
        notificationFileCount: interactionArtifacts.notificationFiles.size
      }
    };
  }

  /**
   * Evaluates a job and saves the report through the evaluation store.
   *
   * @param {string} jobId - Job identifier.
   * @param {object} [options={}] - Forwarded to `evaluateJob`.
   * @returns {Promise<object>} The saved artifact plus the `filePath` returned by the store.
   */
  async evaluateAndPersistJob(jobId, options = {}) {
    const report = await this.evaluateJob(jobId, options);
    const artifact = {
      timestamp: new Date().toISOString(),
      kind: "job-evaluation",
      ...report
    };
    const filePath = await this.evalStore.saveRun(jobId, artifact);
    return {
      filePath,
      ...artifact
    };
  }
}
1023
-
1024
/**
 * Tallies runs by their `mode`; runs without a mode are counted as "unknown".
 *
 * @param {object[]} runs - Run records.
 * @returns {Object<string, number>} Mode name to occurrence count.
 */
function countModes(runs) {
  const tally = {};
  for (const run of runs) {
    const mode = run.mode || "unknown";
    tally[mode] = (tally[mode] || 0) + 1;
  }
  return tally;
}
1031
-
1032
/**
 * Assembles the de-duplicated list of notes shown alongside a job evaluation:
 * run/scheduler/fallback/maintenance observations first, followed by the
 * findings of the individual analyses.
 *
 * @param {object} context - Evaluation pieces (`jobId`, `selectedRun`,
 *   `selectedLiveMatch`, `schedulerState`, `maintenance`, `rubric`,
 *   `outputDiff`, `contractCheck`, `retrieval`, `interaction`).
 * @returns {string[]} Notes passed through `uniqueStrings`.
 */
function buildComparisonNotes({
  jobId,
  selectedRun,
  selectedLiveMatch,
  schedulerState,
  maintenance,
  rubric,
  outputDiff,
  contractCheck,
  retrieval = null,
  interaction = null
}) {
  const notes = [];
  const fallback = selectedRun?.fallback;

  if (!selectedRun) notes.push("No V2 runs recorded.");
  if (schedulerState?.lastStatus === "ok") notes.push("Latest V2 scheduler state is healthy.");

  if (fallback?.attempted) {
    if (fallback.success) {
      notes.push(
        `Run used report fallback from ${selectedRun.fallback.sourceLane || "local"} to ${selectedRun.fallback.finalSourceLane || "remote"} after ${selectedRun.fallback.trigger || "failure"}.`
      );
    } else {
      notes.push(`Report fallback was attempted but did not succeed${selectedRun.fallback?.fallbackError ? `: ${selectedRun.fallback.fallbackError}` : "."}`);
    }
  } else if (fallback?.allowed === false && fallback?.trigger) {
    notes.push(`Report fallback was blocked after ${selectedRun.fallback.trigger}${selectedRun.fallback?.blockedReason ? `: ${selectedRun.fallback.blockedReason}` : "."}`);
  }

  // Any WAL action other than "none"/"error" counts as checkpoint work.
  const didWalWork = maintenance?.wal?.some(
    (item) => item.action && item.action !== "none" && item.action !== "error"
  );
  if (didWalWork) {
    notes.push("Daemon maintenance recently performed WAL checkpoint work.");
  }
  if (maintenance?.handoffs?.expiredCount > 0) {
    notes.push(`Daemon maintenance expired or escalated ${maintenance.handoffs.expiredCount} pending handoff(s).`);
  }

  const contractFindings = contractCheck?.findings || [];
  const interactionFindings = interaction?.findings || [];

  if (!selectedLiveMatch) {
    notes.push("No live cron analogue matched.");
    // NOTE(review): retrieval findings are not included on this path, unlike
    // the live-match path below; confirm whether that omission is intended.
    return uniqueStrings([
      ...notes,
      ...rubric.findings,
      ...outputDiff.findings,
      ...contractFindings,
      ...interactionFindings
    ]);
  }

  const latestLive = selectedLiveMatch.latestRun;
  notes.push(`Closest live cron match: ${selectedLiveMatch.name || selectedLiveMatch.id}.`);
  if (latestLive?.status) notes.push(`Latest live status: ${latestLive.status}.`);
  if (latestLive?.provider) notes.push(`Latest live provider: ${latestLive.provider}.`);
  if (latestLive?.durationMs != null) notes.push(`Latest live duration: ${latestLive.durationMs}ms.`);

  const retrievalFindings = retrieval?.findings || [];
  // For the memory-rollup job, a (near-)fully satisfied contract suppresses
  // the "less structured" / "much shorter" structural findings.
  let outputDiffFindings = outputDiff.findings || [];
  if (jobId === "memory-rollup" && Number(contractCheck?.satisfiedRatio || 0) >= 0.99) {
    outputDiffFindings = outputDiffFindings.filter((item) => !/less structured|much shorter/i.test(item));
  }

  return uniqueStrings([
    ...notes,
    ...rubric.findings,
    ...outputDiffFindings,
    ...contractFindings,
    ...retrievalFindings,
    ...interactionFindings
  ]);
}
1080
-
1081
/**
 * Copies the recognised fields of an output contract, filling in defaults
 * for the ones that are missing.
 *
 * @param {object|null|undefined} contract - Raw contract from job configuration.
 * @returns {{format: *, requiredSections: Array, styleHints: Array, profile: *}|null}
 *   Normalized contract, or `null` when `contract` is falsy.
 */
function normalizeOutputContract(contract) {
  if (!contract) return null;
  const { format, requiredSections, styleHints, profile } = contract;
  return {
    format: format || null,
    requiredSections: requiredSections || [],
    styleHints: styleHints || [],
    profile: profile || null
  };
}
1
+ import path from "node:path";
2
+ import { RunReviewer } from "./review.js";
3
+ import { Scheduler } from "./scheduler.js";
4
+ import { RunStore } from "./run-store.js";
5
+ import { NotificationStore } from "./notification-store.js";
6
+ import { DeliveryStore } from "./delivery-store.js";
7
+ import { validateOutputContract } from "./output-contract-validator.js";
8
+
9
/**
 * Groups items into a Map keyed by `keyFn(item)`, keeping first-seen key
 * order and the original item order within each group.
 *
 * @param {Iterable<*>} items - Items to group.
 * @param {function(*): *} keyFn - Computes the grouping key for an item.
 * @returns {Map<*, Array>} Key to list of items.
 */
function groupBy(items, keyFn) {
  const groups = new Map();
  for (const item of items) {
    const key = keyFn(item);
    const bucket = groups.get(key);
    if (bucket) {
      bucket.push(item);
    } else {
      groups.set(key, [item]);
    }
  }
  return groups;
}
18
+
19
/**
 * Restricts `value` to the range [`min`, `max`].
 *
 * @param {number} value - Value to restrict.
 * @param {number} [min=0] - Lower bound.
 * @param {number} [max=1] - Upper bound.
 * @returns {number}
 */
function clamp(value, min = 0, max = 1) {
  const capped = Math.min(max, value);
  return Math.max(min, capped);
}
22
+
23
/**
 * Arithmetic mean of a list of numbers.
 *
 * @param {number[]} values - Numbers to average.
 * @returns {number|null} The mean, or `null` for an empty list.
 */
function average(values) {
  const count = values.length;
  if (count === 0) return null;
  let total = 0;
  for (const value of values) {
    total += value;
  }
  return total / count;
}
27
+
28
/**
 * Selects the run used for scoring: the first run whose mode is "provider",
 * otherwise the first run in the list.
 *
 * @param {object[]} runs - Candidate runs.
 * @returns {object|null} Selected run, or `null` when the list is empty.
 */
function choosePrimaryV2Run(runs) {
  if (!runs.length) return null;
  const providerRun = runs.find(({ mode }) => mode === "provider");
  return providerRun || runs[0];
}
32
+
33
/**
 * Selects the live match to compare against: the first match that has a
 * `latestRun`, otherwise the first match.
 *
 * @param {object[]} liveMatches - Candidate live jobs.
 * @returns {object|null} Selected match, or `null` when there are none.
 */
function pickBestLiveMatch(liveMatches) {
  const withRun = liveMatches.find(({ latestRun }) => latestRun);
  return withRun || liveMatches[0] || null;
}
36
+
37
/**
 * Collapses every run of whitespace (including newlines) to a single space
 * and trims the ends.
 *
 * @param {*} value - Input; falsy values are treated as an empty string.
 * @returns {string}
 */
function normalizeText(value) {
  const text = String(value || "");
  const collapsed = text.replace(/\s+/g, " ");
  return collapsed.trim();
}
42
+
43
/**
 * Removes a single markdown code fence that wraps the whole text, including
 * an optional json/markdown/md/text language tag. Text that is not entirely
 * wrapped in one fence is returned trimmed.
 *
 * @param {*} text - Input; falsy values are treated as an empty string.
 * @returns {string}
 */
function stripOuterFence(text) {
  const raw = String(text || "").trim();
  const fencedMatch = raw.match(/^```(?:json|markdown|md|text)?\s*([\s\S]*?)\s*```$/i);
  if (!fencedMatch) return raw;
  const [, inner] = fencedMatch;
  return inner.trim();
}
48
+
49
/**
 * Attempts to parse text as a JSON object after removing an outer code
 * fence. Only text that starts with "{" and ends with "}" is attempted.
 *
 * @param {*} text - Candidate text.
 * @returns {*} Parsed value, or `null` when the text is not brace-delimited
 *   or is not valid JSON.
 */
function parseJsonCandidate(text) {
  const candidate = stripOuterFence(text);
  const looksLikeObject = candidate.startsWith("{") && candidate.endsWith("}");
  if (!looksLikeObject) return null;

  try {
    return JSON.parse(candidate);
  } catch {
    // Invalid JSON is an expected outcome here, not an error.
    return null;
  }
}
61
+
62
/**
 * Turns a key such as "next_actions" into a display label ("Next Actions"):
 * underscores/hyphens become spaces, leading "#" markers are dropped, and
 * the first letter of every word is upper-cased.
 *
 * @param {*} value - Input; falsy values are treated as an empty string.
 * @returns {string}
 */
function titleCase(value) {
  const spaced = String(value || "").replace(/[_-]+/g, " ");
  const withoutHashes = spaced.replace(/^#+\s*/, "");
  const capitalized = withoutHashes.replace(/\b\w/g, (char) => char.toUpperCase());
  return capitalized.trim();
}
69
+
70
/**
 * Normalizes a value to single-line text, substituting "None" when the
 * normalized text is empty.
 *
 * @param {*} value - Value to render inline.
 * @returns {string}
 */
function normalizeInline(value) {
  const text = normalizeText(value);
  return text ? text : "None";
}
73
+
74
/**
 * Reduces a section label to a lowercase comparison key: leading "#" markers
 * are removed, and runs of "*", ":", "_" or "-" become single spaces.
 *
 * @param {*} value - Raw label; falsy values are treated as an empty string.
 * @returns {string}
 */
function normalizeSectionKey(value) {
  const lowered = String(value || "").toLowerCase();
  const withoutHashes = lowered.replace(/^#+\s*/, "");
  const withoutMarkup = withoutHashes.replace(/[*:_-]+/g, " ");
  return withoutMarkup.replace(/\s+/g, " ").trim();
}
82
+
83
/**
 * Renders an arbitrary run output (string, JSON string, array, or object)
 * as report text.
 *
 * - Strings holding a JSON object are parsed and rendered recursively; an
 *   `output` property, when present, is rendered in place of the wrapper.
 * - Arrays are rendered item by item and joined with newlines.
 * - Objects are rendered section by section in the style given by the
 *   contract ("bullets", "headings", or plain "Key: value" lines otherwise).
 *
 * @param {*} value - Value to render.
 * @param {object} [options]
 * @param {object|null} [options.contract=null] - Output contract steering section order and style.
 * @param {string|null} [options.summary=null] - Summary used for the optional "Status:" line.
 * @returns {string}
 */
function renderValue(value, { contract = null, summary = null } = {}) {
  if (value == null) return "";

  if (typeof value === "string") {
    const parsed = parseJsonCandidate(value);
    if (!parsed || typeof parsed !== "object") {
      return stripOuterFence(value);
    }
    if (parsed.output === undefined) {
      return renderValue(parsed, { contract, summary });
    }
    // A wrapper's own summary takes precedence over the inherited one.
    return renderValue(parsed.output, {
      contract,
      summary: parsed.summary || summary
    });
  }

  if (Array.isArray(value)) {
    const rendered = [];
    for (const item of value) {
      const text = renderValue(item, { contract, summary });
      if (text) rendered.push(text);
    }
    return rendered.join("\n");
  }

  if (typeof value !== "object") {
    return String(value);
  }

  const entries = Object.entries(value);
  const valueMap = new Map(entries.map(([key, entryValue]) => [normalizeSectionKey(key), entryValue]));
  const sectionOrder = contract?.requiredSections?.length
    ? contract.requiredSections
    : entries.map(([key]) => key);
  const sectionStyle = contract?.profile?.sectionStyle || (contract?.format === "structured_rollup" ? "headings" : "bullets");
  const requireStatus = contract?.profile?.requireStatus ?? (contract?.format === "bulleted_briefing");
  // Sections are matched to object keys by normalized name.
  const sectionText = (section) => normalizeInline(valueMap.get(normalizeSectionKey(section)));

  switch (sectionStyle) {
    case "bullets": {
      const lines = [];
      if (requireStatus && summary) lines.push(`Status: ${normalizeInline(summary)}`);
      for (const section of sectionOrder) {
        lines.push(`- ${titleCase(section)}: ${sectionText(section)}`);
      }
      return lines.join("\n");
    }
    case "headings": {
      const lines = [];
      for (const section of sectionOrder) {
        lines.push(`## ${titleCase(section)}`, `- ${sectionText(section)}`, "");
      }
      return lines.join("\n").trim();
    }
    default:
      return entries
        .map(([key, entryValue]) => `${titleCase(key)}: ${normalizeInline(entryValue)}`)
        .join("\n");
  }
}
144
+
145
/**
 * Produces the report text of a V2 run. The output is taken from
 * `result.output`, then `output`, then the summary.
 *
 * @param {object|null} run - V2 run record.
 * @param {object|null} [contract=null] - Output contract forwarded to `renderValue`.
 * @returns {string} Rendered text, or "" when there is no run.
 */
function extractRunText(run, contract = null) {
  if (!run) return "";
  const summary = run?.result?.summary || run?.summary || "";
  const output = run?.result?.output ?? run?.output ?? summary;
  return renderValue(output, { contract, summary });
}
151
+
152
/**
 * Produces the report text of a live run from its summary, falling back to
 * its error text.
 *
 * @param {object|null} run - Live run record.
 * @returns {string} Rendered text, or "" when there is no run.
 */
function extractLiveText(run) {
  if (!run) return "";
  const source = run.summary || run.error || "";
  return renderValue(source);
}
156
+
157
/**
 * Heuristically scores report text for quality, starting from 1 and
 * subtracting a penalty for each problem detected.
 *
 * @param {*} text - Report text; falsy values are treated as empty.
 * @returns {{score: number, findings: string[]}} Score clamped by `clamp`
 *   (default range 0..1) and the list of detected problems. Empty text scores 0.
 */
function qualitySignals(text) {
  const raw = String(text || "");
  const normalized = normalizeText(text);
  const lower = normalized.toLowerCase();
  const findings = [];
  let score = 1;
  const penalize = (amount, finding) => {
    findings.push(finding);
    score -= amount;
  };

  if (!normalized) {
    findings.push("Output is empty.");
    return { score: 0, findings };
  }

  if (normalized.length < 40) {
    penalize(0.18, "Output is very short and may not carry enough signal.");
  }

  if (/\[(title|link|summary|sub)\]/i.test(normalized)) {
    penalize(0.4, "Output still contains placeholder text.");
  }

  if (/i('| a)?m sorry|does not support|cannot complete|failed to/i.test(lower)) {
    penalize(0.28, "Output contains apology or tool-failure language.");
  }

  if (/```/.test(raw)) {
    penalize(0.12, "Output is wrapped in a code block instead of a direct report.");
  }

  if (/timed out|timeout/i.test(lower)) {
    penalize(0.35, "Output references a timeout or stalled execution.");
  }

  // The repeated-line check splits on "\n", so it needs the raw text:
  // `normalized` has all newlines collapsed to spaces, which previously made
  // this penalty unreachable.
  const repeatedLinePenalty = detectRepeatedLinePenalty(raw);
  if (repeatedLinePenalty > 0) {
    penalize(repeatedLinePenalty, "Output repeats nearly identical numbered lines.");
  }

  return {
    score: clamp(Number(score.toFixed(4))),
    findings
  };
}
205
+
206
/**
 * Computes a score penalty for numbered lists whose items repeat.
 *
 * Only lines beginning with "<digits>." are considered. With at least three
 * such lines and at least half of them duplicates (case-insensitive, number
 * prefix ignored), the penalty is a quarter of the duplicate ratio.
 *
 * @param {string} text - Multi-line text.
 * @returns {number} Penalty rounded to four decimals, or 0 when the rule does not apply.
 */
function detectRepeatedLinePenalty(text) {
  const itemBodies = [];
  for (const rawLine of text.split("\n")) {
    const line = rawLine.trim();
    if (line && /^\d+\./.test(line)) {
      itemBodies.push(line.replace(/^\d+\.\s*/, "").toLowerCase());
    }
  }

  if (itemBodies.length < 3) return 0;

  const distinctCount = new Set(itemBodies).size;
  const duplicateRatio = 1 - distinctCount / itemBodies.length;
  return duplicateRatio < 0.5 ? 0 : Number((duplicateRatio * 0.25).toFixed(4));
}
221
+
222
/**
 * Expands a set of notification file paths with every file reachable
 * through `generatedNotificationFiles` links, repeating full passes over the
 * notifications until a pass adds nothing new.
 *
 * @param {Iterable<string>} baseFiles - Starting file paths.
 * @param {object[]} notifications - Records with `filePath` and optional
 *   `generatedNotificationFiles`.
 * @returns {Set<string>} Base files plus all transitively generated files.
 */
function expandRelatedNotificationFiles(baseFiles, notifications) {
  const related = new Set(baseFiles);
  let sizeBeforePass;
  do {
    sizeBeforePass = related.size;
    for (const { filePath, generatedNotificationFiles } of notifications) {
      if (!related.has(filePath)) continue;
      for (const generated of generatedNotificationFiles || []) {
        related.add(generated);
      }
    }
    // The set only grows, so an unchanged size means the pass found nothing new.
  } while (related.size !== sizeBeforePass);
  return related;
}
238
+
239
/**
 * Aggregate recent live cron runs into counts, averages and findings.
 *
 * @param {Array<object>} liveRuns - Live run records; reads `status`,
 *   `durationMs`, `usage.total_tokens`, `summary` and `error`.
 * @returns {object} Run/ok/error/skipped counts, `failureRate` (errors over
 *   all runs, four decimals), rounded average duration and tokens, average
 *   quality score, and a `findings` list. Averages are null when no finite
 *   samples exist; with no runs every metric is zero or null.
 */
function evaluateLiveHistory(liveRuns) {
  const runCount = liveRuns.length;
  if (runCount === 0) {
    return {
      runCount: 0,
      okCount: 0,
      errorCount: 0,
      skippedCount: 0,
      failureRate: null,
      avgDurationMs: null,
      avgTotalTokens: null,
      avgQualityScore: null,
      findings: ["No recent live history available."]
    };
  }

  const countByStatus = (status) =>
    liveRuns.reduce((total, run) => (run.status === status ? total + 1 : total), 0);
  const finiteOnly = (values) => values.filter((value) => Number.isFinite(value));
  const meanOrNull = (values, finish) => (values.length ? finish(average(values)) : null);

  const okCount = countByStatus("ok");
  const errorCount = countByStatus("error");
  const skippedCount = countByStatus("skipped");
  const failureRate = errorCount / runCount;

  const durations = finiteOnly(liveRuns.map((run) => run.durationMs));
  const tokens = finiteOnly(liveRuns.map((run) => run.usage?.total_tokens));
  // Errored runs are scored on their error text when no summary exists.
  const qualityScores = finiteOnly(
    liveRuns.map((run) => qualitySignals(run.summary || run.error || "").score)
  );

  const findings = [];
  if (failureRate >= 0.34) {
    findings.push("Live cron lane is failing often.");
  }
  if (skippedCount > 0) {
    findings.push("Live cron history includes intentional skips.");
  }
  if (tokens.some((value) => value > 12000)) {
    findings.push("Live cron lane shows high token usage spikes.");
  }

  return {
    runCount,
    okCount,
    errorCount,
    skippedCount,
    failureRate: Number(failureRate.toFixed(4)),
    avgDurationMs: meanOrNull(durations, Math.round),
    avgTotalTokens: meanOrNull(tokens, Math.round),
    avgQualityScore: meanOrNull(qualityScores, (mean) => Number(mean.toFixed(4))),
    findings
  };
}
281
+
282
/**
 * Map a run status to a numeric score.
 *
 * @param {*} status - Run status value.
 * @returns {number} 1 for "ok", 0.6 for "skipped", 0 for "error", and 0.5 for
 *   anything else.
 */
function scoreStatus(status) {
  switch (status) {
    case "ok":
      return 1;
    case "skipped":
      return 0.6;
    case "error":
      return 0;
    default:
      return 0.5;
  }
}
288
+
289
/**
 * Build the comparison rubric for one V2 job against its live cron analogue.
 *
 * The overall score is the plain average of every component that is not null.
 * For dry-run V2 artifacts (a run whose `mode` is not "provider") the output
 * quality, contract adherence and field-level quality components are excluded.
 *
 * @param {object} params
 * @param {object|null} params.v2Run - Selected V2 run, or null when none exists.
 * @param {Array<object>} [params.liveHistory] - Live runs; index 0 is treated
 *   as the latest. A missing or null value is treated as an empty history.
 * @param {object|null} params.matchedLiveJob - Matched live job, if any.
 * @param {object|null} [params.outputContract] - Contract passed to `extractRunText`.
 * @param {object|null} [params.contractCheck] - Contract assessment; reads
 *   `satisfiedRatio` and `fieldScores`.
 * @returns {{overallScore: number, dryRunExcluded: boolean, components: object,
 *   findings: string[], metrics: object}}
 */
function buildRubric({ v2Run, liveHistory, matchedLiveJob, outputContract = null, contractCheck = null }) {
  // This function is exported; tolerate callers that omit the live history
  // instead of throwing on `liveHistory[0]`.
  const liveRuns = liveHistory ?? [];

  const findings = [];
  const v2Text = extractRunText(v2Run, outputContract);
  const v2Quality = qualitySignals(v2Text);
  const latestLive = liveRuns[0] || null;
  const latestLiveQuality = qualitySignals(extractLiveText(latestLive));
  const history = evaluateLiveHistory(liveRuns);
  const isDryRun = v2Run != null && v2Run.mode !== "provider";

  // Readiness: 1 for a provider run, 0.45 for any other run mode, 0 with no run.
  const comparisonReadiness = v2Run
    ? v2Run.mode === "provider"
      ? 1
      : 0.45
    : 0;
  const comparisonCoverage = matchedLiveJob ? 1 : 0.2;
  const contractAdherence = contractCheck ? contractCheck.satisfiedRatio : null;
  if (!v2Run) findings.push("No V2 run exists for this lane yet.");
  else if (isDryRun) findings.push("Only a dry-run V2 artifact exists; output quality and contract adherence are excluded from the rubric score.");
  if (!matchedLiveJob) findings.push("Comparison coverage is low because no live cron analogue is mapped.");

  if (!isDryRun && v2Quality.findings.length) findings.push(...v2Quality.findings.map((item) => `V2: ${item}`));
  if (latestLiveQuality.findings.length) findings.push(...latestLiveQuality.findings.map((item) => `Live: ${item}`));
  if (history.findings.length) findings.push(...history.findings);

  const fieldScores = contractCheck?.fieldScores || null;
  if (!isDryRun && fieldScores) {
    if (fieldScores.weakFields.length) {
      findings.push(`Field-level quality is low for: ${fieldScores.weakFields.join(", ")}`);
    }
    if (fieldScores.placeholderFields.length) {
      findings.push(`Placeholder text detected in: ${fieldScores.placeholderFields.join(", ")}`);
    }
  }

  if (!matchedLiveJob) {
    findings.push("No live cron analogue matched for this V2 job.");
  }

  // A V2 run that carries a result counts as "ok" regardless of its status field.
  const statusAlignment =
    latestLive && v2Run
      ? Number((scoreStatus(v2Run.result ? "ok" : v2Run.status) * scoreStatus(latestLive.status)).toFixed(4))
      : null;
  const liveReliability = history.failureRate == null ? null : Number((1 - history.failureRate).toFixed(4));

  const fieldLevelQuality = !isDryRun && fieldScores ? fieldScores.averageScore : null;

  // Null components are left out so they neither raise nor lower the average.
  const components = [
    isDryRun ? null : v2Quality.score,
    comparisonReadiness,
    comparisonCoverage,
    isDryRun ? null : contractAdherence,
    fieldLevelQuality,
    liveReliability,
    statusAlignment
  ].filter((value) => value != null);
  const overallScore = components.length ? Number(average(components).toFixed(4)) : 0;

  return {
    overallScore,
    dryRunExcluded: isDryRun,
    components: {
      v2OutputQuality: isDryRun ? null : v2Quality.score,
      comparisonReadiness,
      comparisonCoverage,
      contractAdherence: isDryRun ? null : contractAdherence,
      fieldLevelQuality,
      liveReliability,
      statusAlignment
    },
    findings: uniqueStrings(findings),
    metrics: {
      liveRunCount: history.runCount,
      liveOkCount: history.okCount,
      liveErrorCount: history.errorCount,
      liveSkippedCount: history.skippedCount,
      liveFailureRate: history.failureRate,
      liveAvgDurationMs: history.avgDurationMs,
      liveAvgTotalTokens: history.avgTotalTokens,
      liveAvgQualityScore: history.avgQualityScore,
      latestLiveQualityScore: latestLive ? latestLiveQuality.score : null
    }
  };
}
372
+
373
/**
 * Drop falsy entries and duplicates, keeping first-seen order.
 *
 * @param {Array<*>} items - Values to deduplicate.
 * @returns {Array<*>} Truthy values in order of first appearance.
 */
function uniqueStrings(items) {
  const kept = new Set();
  for (const item of items) {
    if (item) kept.add(item);
  }
  return Array.from(kept);
}
376
+
377
/**
 * Reduce a V2 run artifact to the fields used in evaluation reports.
 *
 * `summary` and `output` prefer the values nested under `run.result` and fall
 * back to the top-level fields.
 *
 * @param {object|null|undefined} run - V2 run artifact.
 * @returns {object|null} Condensed run, or null when `run` is falsy.
 */
function summarizeV2Run(run) {
  if (!run) {
    return null;
  }

  const { result, retrievedMemory } = run;
  const summary = result?.summary || run.summary || null;
  const output = result?.output || run.output || null;
  const retrievedMemoryCount = retrievedMemory?.length || 0;

  return {
    timestamp: run.timestamp,
    mode: run.mode || null,
    providerId: run.providerId || null,
    modelId: run.modelId || null,
    summary,
    output,
    retrievalMeta: run.retrievalMeta || null,
    retrievedMemoryCount,
    interaction: summarizeInteraction(run),
    fallback: run.fallback || null
  };
}
392
+
393
/**
 * Flatten a run's interaction settings and lifecycle flags into one record.
 *
 * The `*Required`, `handoffTarget` and `suggestedPeerCount` fields come from
 * `run.interaction`; every other field is read from the run itself.
 *
 * @param {object|null|undefined} run - V2 run artifact.
 * @returns {object|null} Flat summary, or null when the run has no interaction.
 */
function summarizeInteraction(run) {
  const interaction = run?.interaction || null;
  if (interaction === null) return null;

  const { ack, completion, handoff } = interaction;
  const flag = (value) => value || false;
  const orNull = (value) => value || null;

  return {
    ackRequired: ack?.required ?? false,
    completionRequired: completion?.required ?? false,
    handoffRequired: handoff?.required ?? false,
    yielded: flag(run.yielded),
    yieldSignal: orNull(run.yieldSignal),
    followUpState: orNull(run.followUpState),
    followUpQueued: flag(run.followUpQueued),
    followUpConsumed: flag(run.followUpConsumed),
    followUpTarget: orNull(run.followUpTarget),
    followUpCompleted: flag(run.followUpCompleted),
    followUpExpired: flag(run.followUpExpired),
    followUpEscalated: flag(run.followUpEscalated),
    followUpEscalationFilePath: orNull(run.followUpEscalationFilePath),
    handoffTarget: orNull(handoff?.target),
    suggestedPeerCount: handoff?.suggestions?.length || 0,
    handoffChosenPeerId: orNull(run.handoffChosenPeerId),
    handoffChosenBy: orNull(run.handoffChosenBy),
    handoffDelivered: flag(run.handoffDelivered),
    handoffDeliveryState: orNull(run.handoffDeliveryState),
    deliveryDeduped: flag(run.deliveryDeduped),
    deliveryRetried: flag(run.deliveryRetried),
    deliveryUncertain: flag(run.deliveryUncertain)
  };
}
421
+
422
/**
 * Summarize which retrieval paths contributed memory to a run and how fresh
 * the embeddings of the retrieved items were.
 *
 * Retrieval metadata is read from `run.retrievalMeta`, falling back to
 * `run.plan.packet.layers.retrievalMeta`.
 *
 * @param {object|null|undefined} run - V2 run artifact; reads `retrievedMemory`.
 * @returns {object} Counts per retrieval source and embedding freshness, the
 *   embedding query mode / error / health, a `findings` list and the
 *   normalized `items`. Both the empty and the populated result expose the
 *   same keys, including `embeddingHealth`.
 */
function analyzeRetrieval(run) {
  const retrievalMeta = run?.retrievalMeta || run?.plan?.packet?.layers?.retrievalMeta || null;
  const embeddingHealth = retrievalMeta?.embeddingHealth || null;
  const embeddingError = retrievalMeta?.embeddingError || embeddingHealth?.lastError || null;
  const embeddingQueryMode = retrievalMeta?.embeddingQueryMode || "lexical_only";

  if (!run?.retrievedMemory?.length) {
    return {
      memoryCount: 0,
      lexicalCount: 0,
      semanticCount: 0,
      qmdCount: 0,
      freshEmbeddingCount: 0,
      staleEmbeddingCount: 0,
      missingEmbeddingCount: 0,
      // With no items to inspect, the failure count comes from the health record.
      failedEmbeddingCount: embeddingHealth?.failedCount || 0,
      embeddingQueryMode,
      embeddingError,
      // Included so this result has the same shape as the populated one.
      embeddingHealth,
      findings: ["No retrieved memory was captured for this run."],
      items: []
    };
  }

  const items = run.retrievedMemory.map((item) => ({
    entryId: item.entryId || null,
    title: item.title || null,
    sourceBackend: item.sourceBackend || "file",
    candidateSource: item.candidateSource || "indexed",
    lexicalScore: item.lexicalScore ?? 0,
    embeddingSimilarity: item.embeddingSimilarity ?? 0,
    embeddingFreshness: item.embeddingFreshness || "missing",
    retrievalSources: item.retrievalSources || []
  }));

  const countWhere = (predicate) => items.filter(predicate).length;
  const countFreshness = (state) => countWhere((item) => item.embeddingFreshness === state);

  const lexicalCount = countWhere((item) => item.retrievalSources.includes("lexical"));
  const semanticCount = countWhere((item) => item.retrievalSources.includes("semantic"));
  const qmdCount = countWhere((item) => item.retrievalSources.includes("qmd") || item.sourceBackend === "qmd");
  const freshEmbeddingCount = countFreshness("fresh");
  const staleEmbeddingCount = countFreshness("stale");
  const missingEmbeddingCount = countFreshness("missing");
  const failedEmbeddingCount = countFreshness("failed");

  const findings = [];
  if (semanticCount === 0) findings.push("No semantic retrieval candidates reached the final packet.");
  if (staleEmbeddingCount > 0) findings.push("Some retrieved items have stale embeddings.");
  if (freshEmbeddingCount === 0 && missingEmbeddingCount > 0) findings.push("Retrieved file memory is falling back to non-embedded retrieval.");
  if (embeddingError) findings.push("Embedding query failed and retrieval fell back to lexical mode.");

  return {
    memoryCount: items.length,
    lexicalCount,
    semanticCount,
    qmdCount,
    freshEmbeddingCount,
    staleEmbeddingCount,
    missingEmbeddingCount,
    failedEmbeddingCount,
    embeddingQueryMode,
    embeddingError,
    embeddingHealth,
    findings,
    items
  };
}
484
+
485
/**
 * Collapse interaction lifecycle evidence into a single diagnosis.
 *
 * Checks run in priority order and the first match wins: missing required
 * notifications, no findings (healthy), pending follow-up, missing delivery
 * evidence, incomplete handoff, uncertain delivery, then a generic warning.
 *
 * A missing required stage is reported even when `findings` is empty, so the
 * result can never be "healthy" while `missingStages` lists something.
 *
 * @param {object} evidence - Boolean lifecycle flags plus the `findings` list.
 * @returns {{status: string, code: (string|null), summary: string,
 *   missingStages: string[], findings: string[]}}
 */
function summarizeInteractionDiagnosis({
  ackRequired,
  ackQueued,
  completionRequired,
  completionQueued,
  handoffRequired,
  handoffQueued,
  yielded,
  followUpQueued,
  followUpConsumed,
  handoffDelivered,
  deliveryEvidenceRequired,
  deliveryEvidenceHealthy,
  deliveryUncertain,
  findings
}) {
  // A yielded run is not expected to queue completion or handoff itself.
  const missingStages = [];
  if (ackRequired && !ackQueued) missingStages.push("ack");
  if (completionRequired && !completionQueued && !yielded) missingStages.push("completion");
  if (handoffRequired && !handoffQueued && !yielded) missingStages.push("handoff");

  // Checked before the empty-findings shortcut so that missing stages are
  // never reported as healthy.
  if (missingStages.length) {
    return {
      status: "action_required",
      code: "missing_required_notifications",
      summary: `Required interaction notifications are missing: ${missingStages.join(", ")}.`,
      missingStages,
      findings: findings ?? []
    };
  }

  if (!findings?.length) {
    return {
      status: "healthy",
      code: null,
      summary: "Interaction lifecycle evidence is complete.",
      missingStages,
      findings: []
    };
  }

  const actionRequired = (code, summary) => ({
    status: "action_required",
    code,
    summary,
    missingStages,
    findings
  });
  const warning = (code, summary) => ({
    status: "warning",
    code,
    summary,
    missingStages,
    findings
  });

  if (yielded && followUpQueued && !followUpConsumed) {
    return actionRequired("follow_up_incomplete", "Yielded follow-up is still pending or incomplete.");
  }

  if (deliveryEvidenceRequired && !deliveryEvidenceHealthy) {
    return actionRequired(
      "missing_delivery_evidence",
      "Required pingback notifications do not have delivery receipts yet."
    );
  }

  if (handoffRequired && handoffQueued && !handoffDelivered) {
    return actionRequired("handoff_incomplete", "Handoff was queued but not fully delivered.");
  }

  if (deliveryUncertain) {
    return warning("delivery_uncertain", "Interaction lifecycle completed with uncertain delivery evidence.");
  }

  return warning(
    "interaction_findings_present",
    findings[0] || "Interaction lifecycle has unresolved findings."
  );
}
574
+
575
/**
 * Derive interaction lifecycle evidence for a run from its notifications and
 * delivery receipts, and collect human-readable findings.
 *
 * @param {object|null|undefined} run - V2 run; reads `run.interaction`.
 * @param {Array<object>} [notifications] - Notifications related to the run,
 *   matched by their `stage` ("ack", "completion", "handoff", "follow_up").
 * @param {Array<object>} [deliveries] - Delivery receipts, linked to
 *   notifications through `notificationFilePath`.
 * @returns {object} Lifecycle flags, delivery states, the `diagnosis` from
 *   `summarizeInteractionDiagnosis`, and the `findings` list.
 */
function analyzeInteraction(run, notifications = [], deliveries = []) {
  const interaction = run?.interaction || null;
  const hasStage = (stage) => notifications.some((item) => item.stage === stage);
  const firstOfStage = (stage) => notifications.find((item) => item.stage === stage) || null;

  // What the run declared as required.
  const ackRequired = interaction?.ack?.required ?? false;
  const completionRequired = interaction?.completion?.required ?? false;
  const handoffRequired = interaction?.handoff?.required ?? false;
  const yielded = interaction?.yield?.required ?? false;
  const yieldSignal = interaction?.yield?.signal || null;

  // What was actually queued.
  const ackQueued = hasStage("ack");
  const completionQueued = hasStage("completion");
  const handoffQueued = hasStage("handoff");
  const followUpQueued = hasStage("follow_up");

  // Handoff state, taken from the first handoff notification.
  const handoffNotification = firstOfStage("handoff");
  const handoffPendingChoice = handoffNotification?.status === "awaiting_choice";
  let handoffState = handoffNotification?.handoffState || null;
  if (!handoffState && handoffPendingChoice) {
    handoffState = "pending";
  }
  const handoffChosenPeerId = handoffNotification?.chosenPeer?.peerId || null;
  const handoffChosen = handoffChosenPeerId !== null;
  const handoffChosenBy = handoffNotification?.chosenBy || null;
  let handoffDelivery = null;
  if (handoffNotification) {
    handoffDelivery =
      deliveries.find((item) => item.notificationFilePath === handoffNotification.filePath) || null;
  }
  const handoffDelivered = handoffDelivery !== null;
  const handoffDeliveryState = handoffDelivery?.delivery?.status || null;

  // Follow-up state, taken from the first follow-up notification.
  const followUpNotification = firstOfStage("follow_up");
  const followUpState = followUpNotification?.followUpState || followUpNotification?.yieldState || null;
  const followUpConsumed = [
    followUpNotification?.status,
    followUpNotification?.yieldState,
    followUpNotification?.followUpState
  ].includes("consumed");
  const followUpTarget = followUpNotification?.targetSurface || interaction?.yield?.targetSurface || null;
  const generatedFiles = new Set(followUpNotification?.generatedNotificationFiles || []);
  const isFollowUpOutput = (item) => generatedFiles.has(item.notificationFilePath);
  const followUpCompleted = deliveries.some((item) => isFollowUpOutput(item));
  const followUpExpired = followUpState === "expired";
  const followUpEscalated = followUpState === "escalated";
  const followUpPayload = followUpNotification?.payload;
  const yieldedCompletionHandled = yielded && followUpQueued && Boolean(followUpPayload?.completion);
  const yieldedHandoffHandled = yielded && followUpQueued && Boolean(followUpPayload?.handoff);

  // Delivery receipts.
  const deliveryStates = deliveries.map((item) => item.delivery?.status || item.stage || "unknown");
  const deliveryDeduped = deliveryStates.includes("duplicate_prevented");
  const deliveryRetried = deliveries.some((item) => Number(item.attempt) > 1);
  const deliveryUncertain = deliveryStates.includes("delivery_uncertain") || deliveryStates.includes("uncertain");
  // For yielded runs, receipts for follow-up generated notifications are not counted.
  const visibleDeliveryCount = yielded
    ? deliveries.filter((item) => !isFollowUpOutput(item)).length
    : deliveries.length;
  const deliveryEvidenceRequired = !yielded && Boolean(ackQueued || completionQueued || handoffQueued);
  const deliveryEvidenceHealthy = !deliveryEvidenceRequired || visibleDeliveryCount > 0;

  // Ordered [condition, message] pairs; messages are emitted in this order.
  const checks = [
    [ackRequired && !ackQueued, "Required ack notification was not queued."],
    [
      completionRequired && !completionQueued && !yieldedCompletionHandled,
      "Required completion notification was not queued."
    ],
    [handoffRequired && !handoffQueued && !yieldedHandoffHandled, "Configured handoff was not queued."],
    [yielded && !followUpQueued, "Run yielded but no follow-up payload was persisted."],
    [followUpQueued && !followUpConsumed, "Follow-up payload was persisted but never consumed."],
    [followUpExpired, "Follow-up expired before it was consumed."],
    [followUpEscalated, "Follow-up expired and was escalated to the operator."],
    [handoffPendingChoice, "Handoff is awaiting an explicit peer choice."],
    [handoffState === "expired", "Handoff expired without an operator choice."],
    [handoffState === "escalated", "Handoff expired and was escalated to the operator."],
    [
      handoffState === "blocked",
      `Handoff is blocked${handoffNotification?.blockedReason ? `: ${handoffNotification.blockedReason}` : "."}`
    ],
    [
      handoffQueued && !handoffPendingChoice && !handoffChosen,
      "Handoff was queued but no peer choice was recorded."
    ],
    [handoffChosen && !handoffDelivered, "Handoff was promoted to a peer but not delivered yet."],
    [
      handoffDeliveryState && /error|blocked/i.test(handoffDeliveryState),
      `Handoff delivery did not complete cleanly: ${handoffDeliveryState}.`
    ],
    [!deliveryEvidenceHealthy, "No delivery receipts exist yet for queued interaction notifications."],
    [deliveryDeduped, "A duplicate delivery attempt was prevented by dedupe policy."],
    [deliveryUncertain, "A delivery attempt was marked uncertain and will not be blindly retried."]
  ];
  const findings = [];
  for (const [triggered, message] of checks) {
    if (triggered) findings.push(message);
  }

  const diagnosis = summarizeInteractionDiagnosis({
    ackRequired,
    ackQueued,
    completionRequired,
    completionQueued,
    handoffRequired,
    handoffQueued,
    yielded,
    followUpQueued,
    followUpConsumed,
    handoffDelivered,
    deliveryEvidenceRequired,
    deliveryEvidenceHealthy,
    deliveryUncertain,
    findings
  });

  return {
    ackRequired,
    ackQueued,
    completionRequired,
    completionQueued,
    handoffRequired,
    handoffQueued,
    yielded,
    yieldSignal,
    followUpState,
    followUpQueued,
    followUpConsumed,
    followUpTarget,
    followUpCompleted,
    followUpExpired,
    followUpEscalated,
    followUpEscalationFilePath: followUpNotification?.escalationNotificationFilePath || null,
    handoffPendingChoice,
    handoffState,
    handoffExpired: handoffState === "expired",
    handoffEscalated: handoffState === "escalated",
    handoffBlocked: handoffState === "blocked",
    handoffChosen,
    handoffChosenPeerId,
    handoffChosenBy,
    handoffDelivered,
    handoffDeliveryState,
    handoffBlockedReason: handoffNotification?.blockedReason || null,
    deliveryEvidenceRequired,
    deliveryEvidenceHealthy,
    deliveryReceiptCount: visibleDeliveryCount,
    deliveryStates,
    deliveryDeduped,
    deliveryRetried,
    deliveryUncertain,
    diagnosis,
    findings
  };
}
697
+
698
/**
 * Reduce a live cron run record to the fields shown in evaluation reports.
 *
 * @param {object|null|undefined} run - Live run record.
 * @returns {object|null} Condensed record with missing fields set to null, or
 *   null when `run` is falsy. A `durationMs` of 0 is preserved.
 */
function summarizeLiveRun(run) {
  if (!run) {
    return null;
  }

  const { ts, status, summary, error, durationMs, provider, model, usage } = run;
  return {
    ts: ts || null,
    status: status || null,
    summary: summary || null,
    error: error || null,
    durationMs: durationMs ?? null,
    provider: provider || null,
    model: model || null,
    usage: usage || null
  };
}
711
+
712
// Lower-case keywords looked for anywhere in an output when it has no
// explicit section headings.
const SECTION_HINTS = [
  "calendar", "weather",
  "projects", "project",
  "inbox", "issues", "backlog",
  "update", "summary",
  "next actions",
  "alerts", "status"
];

// Section-name variants mapped to the canonical key they are reported under.
const SECTION_ALIASES = new Map(
  Object.entries({
    project: "projects",
    "next action": "next actions"
  })
);
731
+
732
/**
 * Normalize a section label and resolve it through the alias table.
 *
 * @param {*} value - Raw section label.
 * @returns {*} The alias target when the normalized label has one, otherwise
 *   the normalized label itself.
 */
function canonicalSectionKey(value) {
  const key = normalizeSectionKey(value);
  const alias = SECTION_ALIASES.get(key);
  return alias || key;
}
736
+
737
/**
 * Collect canonical section keys from heading-like lines, in first-seen order.
 *
 * Each line is tried against four shapes in order: markdown heading, a line
 * that is entirely bold, a bullet with a "label: value" pair, and a plain
 * "label: value" pair. The first shape that yields a key of at least two
 * characters decides the line; shorter keys fall through to the next shape.
 *
 * @param {string[]} nonEmptyLines - Trimmed, non-empty output lines.
 * @returns {string[]} Unique canonical section keys.
 */
function extractSectionKeys(nonEmptyLines) {
  const shapes = [
    /^#{1,6}\s+(.+)$/,
    /^\*\*([^*]+)\*\*$/,
    /^[-*]\s+\**([^:*]+?)\**:\s+.+$/,
    /^([^:]{2,40}):\s+.+$/
  ];
  // A Set keeps insertion order and ignores repeats of an already-seen key.
  const orderedKeys = new Set();

  for (const line of nonEmptyLines) {
    let resolved = null;
    for (let index = 0; index < shapes.length && resolved === null; index += 1) {
      const captured = shapes[index].exec(line);
      if (captured === null) continue;
      const candidate = canonicalSectionKey(captured[1]);
      if (candidate && candidate.length >= 2) {
        resolved = candidate;
      }
    }
    if (resolved !== null) {
      orderedKeys.add(resolved);
    }
  }

  return [...orderedKeys];
}
762
+
763
/**
 * Choose the section list used when comparing two outputs.
 *
 * @param {{sectionKeys: string[], sectionHints: string[]}} structure
 * @returns {string[]} `sectionKeys` when non-empty, otherwise `sectionHints`.
 */
function comparableSections(structure) {
  const { sectionKeys, sectionHints } = structure;
  if (sectionKeys.length) {
    return sectionKeys;
  }
  return sectionHints;
}
766
+
767
/**
 * Measure the structural shape of a report: size, headings, list items, code
 * fences, detected sections, tables and emoji.
 *
 * `sectionHints` are keyword matches against the lower-cased text. They are
 * resolved through SECTION_ALIASES and deduplicated, so text mentioning
 * "projects" reports a single "projects" hint rather than "projects" plus
 * the substring match "project".
 *
 * @param {*} text - Output text; falsy values are treated as an empty string.
 * @returns {object} Structural metrics for the text.
 */
function analyzeOutputStructure(text) {
  const raw = String(text || "");
  const trimmed = raw.trim();
  const lines = raw.split("\n").map((line) => line.trim());
  const nonEmptyLines = lines.filter(Boolean);
  const lower = trimmed.toLowerCase();
  const sectionKeys = extractSectionKeys(nonEmptyLines);
  const countLines = (pattern) => nonEmptyLines.filter((line) => pattern.test(line)).length;

  // Canonicalize hints so they compare cleanly with canonical section keys.
  const sectionHints = [
    ...new Set(
      SECTION_HINTS
        .filter((hint) => lower.includes(hint))
        .map((hint) => SECTION_ALIASES.get(hint) || hint)
    )
  ];

  return {
    charCount: trimmed.length,
    lineCount: nonEmptyLines.length,
    headingCount: nonEmptyLines.filter((line) => /^#{1,6}\s/.test(line) || /^\*\*[^*]+\*\*/.test(line)).length,
    bulletCount: countLines(/^[-*]\s/),
    numberedCount: countLines(/^\d+\.\s/),
    // NOTE(review): an odd number of fence markers yields a fractional count
    // (e.g. 0.5 for one unclosed fence) — confirm whether that is intended.
    codeFenceCount: (raw.match(/```/g) || []).length / 2,
    sectionKeys,
    sectionHints,
    hasMarkdownTable: /\|.+\|/.test(raw),
    hasEmoji: /\p{Extended_Pictographic}/u.test(raw)
  };
}
788
+
789
/**
 * Compare the structure of a V2 run's output with a live run's output.
 *
 * @param {object|null} v2Run - V2 run passed to `extractRunText`.
 * @param {object|null} liveRun - Live run passed to `extractLiveText`.
 * @param {object|null} [contract] - Output contract forwarded to `extractRunText`.
 * @returns {{v2: object, live: object, alignment: object, findings: string[]}}
 *   Structure metrics for both sides, section/list/size deltas, and findings.
 *   Size and structure findings are only produced when live text exists.
 */
function buildOutputDiff(v2Run, liveRun, contract = null) {
  const liveText = extractLiveText(liveRun);
  const v2 = analyzeOutputStructure(extractRunText(v2Run, contract));
  const live = analyzeOutputStructure(liveText);

  const v2Sections = comparableSections(v2);
  const liveSections = comparableSections(live);
  const v2SectionSet = new Set(v2Sections);
  const liveSectionSet = new Set(liveSections);

  const sharedSections = v2Sections.filter((section) => liveSectionSet.has(section));
  const missingFromV2 = liveSections.filter((section) => !v2SectionSet.has(section));
  const extraInV2 = v2Sections.filter((section) => !liveSectionSet.has(section));

  // Bullets and numbered items together count as "list items".
  const v2ListItems = v2.bulletCount + v2.numberedCount;
  const liveListItems = live.bulletCount + live.numberedCount;

  const findings = [];
  if (liveText) {
    if (missingFromV2.length) {
      findings.push(`V2 is missing live sections: ${missingFromV2.join(", ")}.`);
    }
    if (v2ListItems < liveListItems) {
      findings.push("V2 output is less structured than the live report.");
    }
    if (v2.charCount > 0 && live.charCount > 0) {
      const verbosityRatio = Number((v2.charCount / live.charCount).toFixed(4));
      if (verbosityRatio < 0.55) findings.push("V2 output is much shorter than the live report.");
      if (verbosityRatio > 1.8) findings.push("V2 output is much longer than the live report.");
    }
  } else {
    findings.push("No live output is available for structural comparison.");
  }

  return {
    v2,
    live,
    alignment: {
      sharedSections,
      missingFromV2,
      extraInV2,
      bulletDelta: v2ListItems - liveListItems,
      headingDelta: v2.headingCount - live.headingCount,
      charDelta: v2.charCount - live.charCount
    },
    findings
  };
}
829
+
830
/**
 * Validate the V2 output and the live output against an output contract.
 *
 * The V2 side validates `v2Run.result.output` (falling back to `v2Run.output`);
 * the live side validates the live run's `summary`, or its `error` text.
 *
 * @param {object|null} contract - Output contract; null disables the check.
 * @param {object|null} v2Run - Selected V2 run.
 * @param {object|null} liveRun - Latest live run.
 * @returns {object|null} Contract metadata, the V2 `satisfiedRatio`, missing
 *   and empty sections per side, parsed payloads, V2 field scores, and prefixed
 *   findings; null when no contract is given.
 */
function assessOutputContract(contract, v2Run, liveRun) {
  if (!contract) return null;

  const v2Validation = validateOutputContract(contract, v2Run?.result?.output ?? v2Run?.output ?? "");
  const liveValidation = validateOutputContract(contract, liveRun?.summary || liveRun?.error || "");

  // A validation result without a `findings` array contributes no findings
  // instead of throwing.
  const prefixed = (validation, label) =>
    (validation?.findings || []).map((item) => `${label}: ${item}`);
  const findings = [...prefixed(v2Validation, "V2"), ...prefixed(liveValidation, "Live")];

  return {
    format: contract.format || null,
    requiredSections: contract.requiredSections || [],
    styleHints: contract.styleHints || [],
    satisfiedRatio: v2Validation?.satisfiedRatio ?? 0,
    missingFromV2: v2Validation?.missingSections || [],
    missingFromLive: liveValidation?.missingSections || [],
    emptyInV2: v2Validation?.emptySections || [],
    emptyInLive: liveValidation?.emptySections || [],
    parsedV2: v2Validation?.parsed || null,
    parsedLive: liveValidation?.parsed || null,
    fieldScores: v2Validation?.fieldScores || null,
    findings
  };
}
857
+
858
+ export { buildRubric };
859
+
860
/**
 * Evaluates V2 jobs against their live cron analogues and persists the
 * resulting reports.
 */
export class Evaluator {
  /**
   * @param {object} options
   * @param {string} options.projectRoot - Passed to the scheduler.
   * @param {string} options.liveRoot - Passed to the scheduler.
   * @param {string} options.stateRoot - Base directory for the "evaluations",
   *   "notifications" and "deliveries" stores.
   */
  constructor({ projectRoot, liveRoot, stateRoot }) {
    this.liveRoot = liveRoot;
    this.stateRoot = stateRoot;
    this.reviewer = new RunReviewer({ stateRoot });
    this.scheduler = new Scheduler({ projectRoot, liveRoot, stateRoot });
    this.evalStore = new RunStore({ rootDir: path.join(stateRoot, "evaluations") });
    this.notificationStore = new NotificationStore({ rootDir: path.join(stateRoot, "notifications") });
    this.deliveryStore = new DeliveryStore({ rootDir: path.join(stateRoot, "deliveries") });
  }

  /**
   * Gather every notification and delivery receipt related to a run.
   *
   * Notifications already present in the review are reused; the remaining
   * related files are loaded from the notification store.
   *
   * @param {object|null} run - Run whose `notificationFiles` seed the lookup.
   * @param {object|null} [review] - Review whose `recentNotifications` are reused.
   * @returns {Promise<{notificationFiles: Set<string>, notifications: object[],
   *   deliveries: object[]}>} Notifications are sorted by timestamp string.
   */
  async resolveInteractionArtifacts(run, review = null) {
    const reviewNotifications = review?.recentNotifications || [];
    const baseFiles = run?.notificationFiles || [];
    const relatedNotificationFiles = expandRelatedNotificationFiles(
      baseFiles,
      reviewNotifications
    );
    const knownNotifications = reviewNotifications.filter((item) => relatedNotificationFiles.has(item.filePath));
    const knownFilePaths = new Set(knownNotifications.map((item) => item.filePath));
    const missingNotificationFiles = [...relatedNotificationFiles].filter(
      (filePath) => !knownFilePaths.has(filePath)
    );
    const loadedNotifications = await this.notificationStore.getNotifications(missingNotificationFiles);
    const notifications = [...knownNotifications, ...loadedNotifications].sort((a, b) =>
      String(a.timestamp || "").localeCompare(String(b.timestamp || ""))
    );
    // Expand again: freshly loaded notifications may reference further files.
    const expandedNotificationFiles = expandRelatedNotificationFiles(baseFiles, notifications);

    // Load all delivery receipts once for deterministic interaction evidence.
    const allDeliveries = await this.deliveryStore.listAll();
    const deliveries = allDeliveries.filter((item) => expandedNotificationFiles.has(item.notificationFilePath));
    return {
      notificationFiles: expandedNotificationFiles,
      notifications,
      deliveries
    };
  }

  /**
   * Evaluate every job that has recent runs.
   *
   * @param {number} [limit=20] - Limit passed to the run reviewer.
   * @returns {Promise<{jobs: object[], recentRuns: object[], scheduler: *}>}
   *   Jobs are sorted by ascending overall rubric score; runs whose job id
   *   ends in "-comparison" are excluded.
   */
  async evaluate(limit = 20) {
    const [review, comparisons] = await Promise.all([
      this.reviewer.review(limit),
      this.scheduler.compareJobs()
    ]);

    const recentRuns = review.recentRuns.filter((run) => !run.jobId.endsWith("-comparison"));
    const grouped = groupBy(recentRuns, (run) => run.jobId);

    const jobs = await Promise.all(
      Array.from(grouped.entries()).map(async ([jobId, runs]) => this.evaluateJob(jobId, {
        runs,
        review,
        comparisons
      }))
    );

    return {
      jobs: [...jobs].sort((a, b) => a.rubric.overallScore - b.rubric.overallScore),
      recentRuns,
      scheduler: review.scheduler
    };
  }

  /**
   * Build the full evaluation report for one job.
   *
   * @param {string} jobId - V2 job identifier.
   * @param {object} [options] - Optional preloaded `runs`, `review` and
   *   `comparisons`, plus a review `limit` used when `review` is not supplied.
   * @returns {Promise<object>} Report with run summaries, live matches,
   *   retrieval and interaction analysis, contract check, output diff, rubric
   *   and comparison notes.
   */
  async evaluateJob(jobId, options = {}) {
    const runtime = await this.scheduler.loadRuntime();
    const jobConfig = runtime.jobs[jobId] || null;
    const review = options.review || (await this.reviewer.review(options.limit ?? 20));
    const comparisons = options.comparisons || (await this.scheduler.compareJobs());
    const runs = options.runs || review.recentRuns.filter((run) => run.jobId === jobId && !run.jobId.endsWith("-comparison"));
    const schedulerState = review.scheduler.find((item) => item.jobId === jobId) || null;
    const comparison = comparisons.find((item) => item.v2JobId === jobId) || null;
    const liveMatches = await Promise.all(
      (comparison?.closestLiveJobs || []).map(async (match) => {
        // The two history reads are independent, so issue them together.
        const [recentRuns, latestRuns] = await Promise.all([
          this.scheduler.bridge.loadCronRunHistory(match.id, 5),
          this.scheduler.bridge.loadCronRunHistory(match.id, 1)
        ]);
        return {
          ...match,
          recentRuns,
          latestRun: latestRuns[0] || null
        };
      })
    );

    const selectedRun = choosePrimaryV2Run(runs);
    // Interaction evidence is taken from the first listed run, falling back to the primary run.
    const interactionRun = runs[0] || selectedRun;
    const selectedLiveMatch = pickBestLiveMatch(liveMatches);
    const liveHistory = selectedLiveMatch?.recentRuns || [];
    const outputContract = normalizeOutputContract(jobConfig?.outputContract || null);
    const outputDiff = buildOutputDiff(selectedRun, selectedLiveMatch?.latestRun || null, outputContract);
    const contractCheck = assessOutputContract(outputContract, selectedRun, selectedLiveMatch?.latestRun || null);
    const retrieval = analyzeRetrieval(selectedRun);
    const interactionArtifacts = await this.resolveInteractionArtifacts(interactionRun, review);
    const interaction = analyzeInteraction(interactionRun, interactionArtifacts.notifications, interactionArtifacts.deliveries);
    const rubric = buildRubric({
      v2Run: selectedRun,
      liveHistory,
      matchedLiveJob: selectedLiveMatch,
      outputContract,
      contractCheck
    });

    return {
      jobId,
      schedulerState,
      maintenance: review?.maintenance || null,
      latestRun: runs[0] || null,
      primaryRun: summarizeV2Run(selectedRun),
      interactionRunTimestamp: interactionRun?.timestamp || null,
      runCount: runs.length,
      modeCounts: countModes(runs),
      liveMatches: liveMatches.map((match) => ({
        id: match.id,
        name: match.name,
        description: match.description || null,
        enabled: match.enabled,
        schedule: match.schedule,
        kind: match.kind || null,
        lastStatus: match.lastStatus || null,
        latestRun: summarizeLiveRun(match.latestRun)
      })),
      selectedLiveMatch: selectedLiveMatch
        ? {
            id: selectedLiveMatch.id,
            name: selectedLiveMatch.name,
            latestRun: summarizeLiveRun(selectedLiveMatch.latestRun)
          }
        : null,
      retrieval,
      interaction,
      outputContract,
      contractCheck,
      outputDiff,
      rubric,
      comparisonNotes: buildComparisonNotes({
        jobId,
        selectedRun,
        selectedLiveMatch,
        schedulerState,
        maintenance: review?.maintenance || null,
        rubric,
        outputDiff,
        contractCheck,
        retrieval,
        interaction
      }),
      interactionArtifacts: {
        notificationCount: interactionArtifacts.notifications.length,
        deliveryCount: interactionArtifacts.deliveries.length,
        notificationFileCount: interactionArtifacts.notificationFiles.size
      }
    };
  }

  /**
   * Evaluate a job and save the report through the evaluation store.
   *
   * @param {string} jobId - V2 job identifier.
   * @param {object} [options] - Forwarded to `evaluateJob`.
   * @returns {Promise<object>} The saved artifact (report plus `timestamp` and
   *   `kind: "job-evaluation"`) together with the `filePath` returned by the store.
   */
  async evaluateAndPersistJob(jobId, options = {}) {
    const report = await this.evaluateJob(jobId, options);
    const artifact = {
      timestamp: new Date().toISOString(),
      kind: "job-evaluation",
      ...report
    };
    const filePath = await this.evalStore.saveRun(jobId, artifact);
    return {
      filePath,
      ...artifact
    };
  }
}
1023
+
1024
/**
 * Tally run records by their `mode` field.
 *
 * Runs whose `mode` is falsy are counted under "unknown".
 *
 * @param {Array<{mode?: string}>} runs - Run records to tally.
 * @returns {Object<string, number>} Plain object mapping each mode name to
 *   the number of runs that reported it.
 */
function countModes(runs) {
  // Count in a Map rather than a plain object: looking up a mode such as
  // "constructor" or "toString" on `{}` would return the inherited
  // Object.prototype member and corrupt the count.
  const tally = new Map();
  for (const run of runs) {
    const mode = String(run.mode || "unknown");
    tally.set(mode, (tally.get(mode) || 0) + 1);
  }
  // Object.fromEntries defines own data properties, so even "__proto__" is
  // recorded as an ordinary key.
  return Object.fromEntries(tally);
}
1031
+
1032
/**
 * Assemble the human-readable comparison notes for a job evaluation.
 *
 * Notes are emitted in a fixed order: run / scheduler / fallback / maintenance
 * observations first, then live-match details, then the findings contributed
 * by the rubric, output diff, contract check, retrieval and interaction
 * analyses. The combined list is passed through `uniqueStrings`.
 *
 * Every `findings` source is treated as optional; a missing object or a
 * missing `findings` array contributes nothing instead of throwing.
 *
 * @param {object} params
 * @param {string} params.jobId - Job identifier; "memory-rollup" gets special diff filtering.
 * @param {object|null} params.selectedRun - Run under evaluation, if any.
 * @param {object|null} params.selectedLiveMatch - Matched live cron job, if any.
 * @param {object|null} params.schedulerState - Scheduler state; only `lastStatus` is read.
 * @param {object|null} params.maintenance - Maintenance summary; `wal` and `handoffs.expiredCount` are read.
 * @param {object|null} params.rubric - Rubric result; only `findings` is read.
 * @param {object|null} params.outputDiff - Output diff result; only `findings` is read.
 * @param {object|null} params.contractCheck - Contract check; `findings` and `satisfiedRatio` are read.
 * @param {object|null} [params.retrieval=null] - Retrieval analysis; only `findings` is read.
 * @param {object|null} [params.interaction=null] - Interaction analysis; only `findings` is read.
 * @returns {*} Whatever `uniqueStrings` returns for the ordered list of notes.
 */
function buildComparisonNotes({
  jobId,
  selectedRun,
  selectedLiveMatch,
  schedulerState,
  maintenance,
  rubric,
  outputDiff,
  contractCheck,
  retrieval = null,
  interaction = null
}) {
  // Resolve every findings source once, with the same guard on each, so both
  // return paths below behave identically when a source is absent.
  const rubricFindings = rubric?.findings || [];
  const rawOutputDiffFindings = outputDiff?.findings || [];
  const contractFindings = contractCheck?.findings || [];
  const retrievalFindings = retrieval?.findings || [];
  const interactionFindings = interaction?.findings || [];

  const notes = [];
  if (!selectedRun) notes.push("No V2 runs recorded.");
  if (schedulerState?.lastStatus === "ok") notes.push("Latest V2 scheduler state is healthy.");

  if (selectedRun?.fallback?.attempted && selectedRun?.fallback?.success) {
    notes.push(
      `Run used report fallback from ${selectedRun.fallback.sourceLane || "local"} to ${selectedRun.fallback.finalSourceLane || "remote"} after ${selectedRun.fallback.trigger || "failure"}.`
    );
  } else if (selectedRun?.fallback?.attempted && !selectedRun?.fallback?.success) {
    notes.push(`Report fallback was attempted but did not succeed${selectedRun.fallback?.fallbackError ? `: ${selectedRun.fallback.fallbackError}` : "."}`);
  } else if (selectedRun?.fallback?.allowed === false && selectedRun?.fallback?.trigger) {
    notes.push(`Report fallback was blocked after ${selectedRun.fallback.trigger}${selectedRun.fallback?.blockedReason ? `: ${selectedRun.fallback.blockedReason}` : "."}`);
  }

  // Any WAL entry whose action is set and is neither "none" nor "error"
  // counts as checkpoint work having been performed.
  if (maintenance?.wal?.some((item) => item.action && item.action !== "none" && item.action !== "error")) {
    notes.push("Daemon maintenance recently performed WAL checkpoint work.");
  }
  if (maintenance?.handoffs?.expiredCount > 0) {
    notes.push(`Daemon maintenance expired or escalated ${maintenance.handoffs.expiredCount} pending handoff(s).`);
  }

  if (!selectedLiveMatch) {
    notes.push("No live cron analogue matched.");
    // NOTE(review): this path reports the unfiltered output-diff findings and
    // omits retrieval findings, unlike the live-match path below. Kept as-is;
    // confirm whether the omission is intentional.
    return uniqueStrings([...notes, ...rubricFindings, ...rawOutputDiffFindings, ...contractFindings, ...interactionFindings]);
  }

  const latestLive = selectedLiveMatch.latestRun;
  notes.push(`Closest live cron match: ${selectedLiveMatch.name || selectedLiveMatch.id}.`);
  if (latestLive?.status) notes.push(`Latest live status: ${latestLive.status}.`);
  if (latestLive?.provider) notes.push(`Latest live provider: ${latestLive.provider}.`);
  if (latestLive?.durationMs != null) notes.push(`Latest live duration: ${latestLive.durationMs}ms.`);

  // For "memory-rollup", once the contract is (almost) fully satisfied, drop
  // diff findings that mention "less structured" or "much shorter".
  const outputDiffFindings =
    jobId === "memory-rollup" && Number(contractCheck?.satisfiedRatio || 0) >= 0.99
      ? rawOutputDiffFindings.filter((item) => !/less structured|much shorter/i.test(item))
      : rawOutputDiffFindings;

  return uniqueStrings([...notes, ...rubricFindings, ...outputDiffFindings, ...contractFindings, ...retrievalFindings, ...interactionFindings]);
}
1080
+
1081
/**
 * Reduce an output contract to its four recognised fields with defaults.
 *
 * @param {object|null|undefined} contract - Raw contract from job config.
 * @returns {{format: *, requiredSections: *, styleHints: *, profile: *}|null}
 *   `null` when `contract` is falsy; otherwise an object where falsy `format`
 *   and `profile` become `null`, and falsy `requiredSections` and `styleHints`
 *   become empty arrays. Truthy values are passed through by reference, and
 *   any other keys on `contract` are dropped.
 */
function normalizeOutputContract(contract) {
  if (contract) {
    const { format, requiredSections, styleHints, profile } = contract;
    return {
      format: format || null,
      requiredSections: requiredSections || [],
      styleHints: styleHints || [],
      profile: profile || null
    };
  }
  return null;
}