@roackb2/heddle 0.0.37 → 0.0.38

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (153) hide show
  1. package/README.md +2 -0
  2. package/dist/examples/repo-investigator.js +1 -2
  3. package/dist/examples/repo-investigator.js.map +1 -1
  4. package/dist/src/cli/ask.d.ts.map +1 -1
  5. package/dist/src/cli/ask.js +11 -0
  6. package/dist/src/cli/ask.js.map +1 -1
  7. package/dist/src/cli/chat/App.js +43 -27
  8. package/dist/src/cli/chat/App.js.map +1 -1
  9. package/dist/src/cli/chat/components/ModelPickerPanel.d.ts +2 -1
  10. package/dist/src/cli/chat/components/ModelPickerPanel.d.ts.map +1 -1
  11. package/dist/src/cli/chat/components/ModelPickerPanel.js +8 -4
  12. package/dist/src/cli/chat/components/ModelPickerPanel.js.map +1 -1
  13. package/dist/src/cli/chat/debug/tui-debug-snapshot.d.ts +2 -1
  14. package/dist/src/cli/chat/debug/tui-debug-snapshot.d.ts.map +1 -1
  15. package/dist/src/cli/chat/debug/tui-debug-snapshot.js +14 -1
  16. package/dist/src/cli/chat/debug/tui-debug-snapshot.js.map +1 -1
  17. package/dist/src/cli/chat/hooks/tui-ordinary-turn.d.ts.map +1 -1
  18. package/dist/src/cli/chat/hooks/tui-ordinary-turn.js +0 -1
  19. package/dist/src/cli/chat/hooks/tui-ordinary-turn.js.map +1 -1
  20. package/dist/src/cli/chat/hooks/useApprovalFlow.d.ts.map +1 -1
  21. package/dist/src/cli/chat/hooks/useApprovalFlow.js +3 -3
  22. package/dist/src/cli/chat/hooks/useApprovalFlow.js.map +1 -1
  23. package/dist/src/cli/chat/hooks/useChatPickers.d.ts +6 -3
  24. package/dist/src/cli/chat/hooks/useChatPickers.d.ts.map +1 -1
  25. package/dist/src/cli/chat/hooks/useChatPickers.js +17 -4
  26. package/dist/src/cli/chat/hooks/useChatPickers.js.map +1 -1
  27. package/dist/src/cli/chat/hooks/useChatSessions.d.ts +1 -1
  28. package/dist/src/cli/chat/hooks/useChatSessions.d.ts.map +1 -1
  29. package/dist/src/cli/chat/hooks/useChatSessions.js +23 -20
  30. package/dist/src/cli/chat/hooks/useChatSessions.js.map +1 -1
  31. package/dist/src/cli/chat/hooks/usePromptSubmission.d.ts +2 -1
  32. package/dist/src/cli/chat/hooks/usePromptSubmission.d.ts.map +1 -1
  33. package/dist/src/cli/chat/hooks/usePromptSubmission.js +4 -1
  34. package/dist/src/cli/chat/hooks/usePromptSubmission.js.map +1 -1
  35. package/dist/src/cli/chat/state/local-commands.d.ts +2 -0
  36. package/dist/src/cli/chat/state/local-commands.d.ts.map +1 -1
  37. package/dist/src/cli/chat/state/local-commands.js +17 -3
  38. package/dist/src/cli/chat/state/local-commands.js.map +1 -1
  39. package/dist/src/cli/chat/submit.d.ts.map +1 -1
  40. package/dist/src/cli/chat/submit.js +13 -1
  41. package/dist/src/cli/chat/submit.js.map +1 -1
  42. package/dist/src/cli/eval/index.d.ts +24 -0
  43. package/dist/src/cli/eval/index.d.ts.map +1 -0
  44. package/dist/src/cli/eval/index.js +232 -0
  45. package/dist/src/cli/eval/index.js.map +1 -0
  46. package/dist/src/cli/main.js +25 -3
  47. package/dist/src/cli/main.js.map +1 -1
  48. package/dist/src/cli/remote/control-plane-client.d.ts +5 -1
  49. package/dist/src/cli/remote/control-plane-client.d.ts.map +1 -1
  50. package/dist/src/core/agent/mutation-tracking.d.ts +0 -7
  51. package/dist/src/core/agent/mutation-tracking.d.ts.map +1 -1
  52. package/dist/src/core/agent/mutation-tracking.js +5 -63
  53. package/dist/src/core/agent/mutation-tracking.js.map +1 -1
  54. package/dist/src/core/agent/post-mutation.d.ts +2 -2
  55. package/dist/src/core/agent/post-mutation.d.ts.map +1 -1
  56. package/dist/src/core/agent/post-mutation.js +5 -20
  57. package/dist/src/core/agent/post-mutation.js.map +1 -1
  58. package/dist/src/core/agent/progress-reminders.d.ts +1 -4
  59. package/dist/src/core/agent/progress-reminders.d.ts.map +1 -1
  60. package/dist/src/core/agent/progress-reminders.js +4 -56
  61. package/dist/src/core/agent/progress-reminders.js.map +1 -1
  62. package/dist/src/core/agent/run-agent.d.ts.map +1 -1
  63. package/dist/src/core/agent/run-agent.js +3 -103
  64. package/dist/src/core/agent/run-agent.js.map +1 -1
  65. package/dist/src/core/chat/session-submit.d.ts.map +1 -1
  66. package/dist/src/core/chat/session-submit.js +0 -1
  67. package/dist/src/core/chat/session-submit.js.map +1 -1
  68. package/dist/src/core/chat/types.d.ts +1 -0
  69. package/dist/src/core/chat/types.d.ts.map +1 -1
  70. package/dist/src/core/config.d.ts +1 -1
  71. package/dist/src/core/config.d.ts.map +1 -1
  72. package/dist/src/core/config.js +1 -1
  73. package/dist/src/core/config.js.map +1 -1
  74. package/dist/src/core/eval/agent-runner.d.ts +24 -0
  75. package/dist/src/core/eval/agent-runner.d.ts.map +1 -0
  76. package/dist/src/core/eval/agent-runner.js +151 -0
  77. package/dist/src/core/eval/agent-runner.js.map +1 -0
  78. package/dist/src/core/eval/case-loader.d.ts +7 -0
  79. package/dist/src/core/eval/case-loader.d.ts.map +1 -0
  80. package/dist/src/core/eval/case-loader.js +34 -0
  81. package/dist/src/core/eval/case-loader.js.map +1 -0
  82. package/dist/src/core/eval/check-runner.d.ts +8 -0
  83. package/dist/src/core/eval/check-runner.d.ts.map +1 -0
  84. package/dist/src/core/eval/check-runner.js +33 -0
  85. package/dist/src/core/eval/check-runner.js.map +1 -0
  86. package/dist/src/core/eval/cleanup.d.ts +20 -0
  87. package/dist/src/core/eval/cleanup.d.ts.map +1 -0
  88. package/dist/src/core/eval/cleanup.js +42 -0
  89. package/dist/src/core/eval/cleanup.js.map +1 -0
  90. package/dist/src/core/eval/git-artifacts.d.ts +26 -0
  91. package/dist/src/core/eval/git-artifacts.d.ts.map +1 -0
  92. package/dist/src/core/eval/git-artifacts.js +211 -0
  93. package/dist/src/core/eval/git-artifacts.js.map +1 -0
  94. package/dist/src/core/eval/process.d.ts +22 -0
  95. package/dist/src/core/eval/process.d.ts.map +1 -0
  96. package/dist/src/core/eval/process.js +65 -0
  97. package/dist/src/core/eval/process.js.map +1 -0
  98. package/dist/src/core/eval/progress.d.ts +28 -0
  99. package/dist/src/core/eval/progress.d.ts.map +1 -0
  100. package/dist/src/core/eval/progress.js +94 -0
  101. package/dist/src/core/eval/progress.js.map +1 -0
  102. package/dist/src/core/eval/report-writer.d.ts +7 -0
  103. package/dist/src/core/eval/report-writer.d.ts.map +1 -0
  104. package/dist/src/core/eval/report-writer.js +159 -0
  105. package/dist/src/core/eval/report-writer.js.map +1 -0
  106. package/dist/src/core/eval/schema.d.ts +206 -0
  107. package/dist/src/core/eval/schema.d.ts.map +1 -0
  108. package/dist/src/core/eval/schema.js +104 -0
  109. package/dist/src/core/eval/schema.js.map +1 -0
  110. package/dist/src/core/eval/trace-analyzer.d.ts +6 -0
  111. package/dist/src/core/eval/trace-analyzer.d.ts.map +1 -0
  112. package/dist/src/core/eval/trace-analyzer.js +106 -0
  113. package/dist/src/core/eval/trace-analyzer.js.map +1 -0
  114. package/dist/src/core/eval/workspace-fixture.d.ts +14 -0
  115. package/dist/src/core/eval/workspace-fixture.d.ts.map +1 -0
  116. package/dist/src/core/eval/workspace-fixture.js +235 -0
  117. package/dist/src/core/eval/workspace-fixture.js.map +1 -0
  118. package/dist/src/core/llm/model-policy.d.ts +26 -0
  119. package/dist/src/core/llm/model-policy.d.ts.map +1 -1
  120. package/dist/src/core/llm/model-policy.js +47 -0
  121. package/dist/src/core/llm/model-policy.js.map +1 -1
  122. package/dist/src/core/prompts/system-prompt.d.ts +1 -1
  123. package/dist/src/core/prompts/system-prompt.d.ts.map +1 -1
  124. package/dist/src/core/prompts/system-prompt.js +19 -100
  125. package/dist/src/core/prompts/system-prompt.js.map +1 -1
  126. package/dist/src/core/runtime/default-tools.d.ts.map +1 -1
  127. package/dist/src/core/runtime/default-tools.js +0 -2
  128. package/dist/src/core/runtime/default-tools.js.map +1 -1
  129. package/dist/src/index.d.ts +0 -1
  130. package/dist/src/index.d.ts.map +1 -1
  131. package/dist/src/index.js +0 -1
  132. package/dist/src/index.js.map +1 -1
  133. package/dist/src/server/features/control-plane/router.d.ts +5 -1
  134. package/dist/src/server/features/control-plane/router.d.ts.map +1 -1
  135. package/dist/src/server/features/control-plane/router.js +16 -2
  136. package/dist/src/server/features/control-plane/router.js.map +1 -1
  137. package/dist/src/server/features/control-plane/services/chat-sessions.d.ts +2 -1
  138. package/dist/src/server/features/control-plane/services/chat-sessions.d.ts.map +1 -1
  139. package/dist/src/server/features/control-plane/services/chat-sessions.js +9 -3
  140. package/dist/src/server/features/control-plane/services/chat-sessions.js.map +1 -1
  141. package/dist/src/server/router.d.ts +5 -1
  142. package/dist/src/server/router.d.ts.map +1 -1
  143. package/dist/src/web/assets/{MonacoDiffViewer-DM8Cy5Xf.js → MonacoDiffViewer-DP7GeCEC.js} +1 -1
  144. package/dist/src/web/assets/index-CYd4sslC.css +2 -0
  145. package/dist/src/web/assets/index-PUxjg447.js +56 -0
  146. package/dist/src/web/index.html +2 -2
  147. package/package.json +7 -2
  148. package/dist/src/core/tools/report-state.d.ts +0 -3
  149. package/dist/src/core/tools/report-state.d.ts.map +0 -1
  150. package/dist/src/core/tools/report-state.js +0 -63
  151. package/dist/src/core/tools/report-state.js.map +0 -1
  152. package/dist/src/web/assets/index-BEeN-RT5.css +0 -2
  153. package/dist/src/web/assets/index-BKDg9H_-.js +0 -56
@@ -0,0 +1,94 @@
1
+ import { appendFileSync, mkdirSync } from 'node:fs';
2
+ import { dirname } from 'node:path';
3
/**
 * Records progress events for a single eval case.
 *
 * Each event is appended as one JSON line to `progressPath`. Unless
 * `writeStdout` is turned off, a short human-readable line is also written
 * to stdout.
 */
export class EvalProgressReporter {
    progressPath;
    caseId;
    writeStdout;
    /**
     * @param args Object with `caseId`, `progressPath` and an optional
     *   `writeStdout` flag (treated as `true` when null/undefined). The directory
     *   that contains `progressPath` is created recursively right away.
     */
    constructor(args) {
        const { caseId, progressPath, writeStdout } = args;
        this.caseId = caseId;
        this.progressPath = progressPath;
        this.writeStdout = writeStdout ?? true;
        mkdirSync(dirname(progressPath), { recursive: true });
    }
    /**
     * Emits a one-off `info` event (no elapsed time attached).
     *
     * @param phase Phase label stored on the event.
     * @param message Free-form text stored on the event.
     */
    info(phase, message) {
        const event = {
            timestamp: new Date().toISOString(),
            caseId: this.caseId,
            phase,
            status: 'info',
            message,
        };
        this.write(event);
    }
    /**
     * Runs `args.run()` and brackets it with `started` / `completed` or
     * `failed` events, emitting `heartbeat` events in between.
     *
     * @param args `phase`, `message`, `run`, plus optional `heartbeatMs`
     *   (30 000 when null/undefined; values that are not greater than zero
     *   disable heartbeats) and optional `heartbeatMessage`.
     * @returns Whatever `args.run()` resolves to.
     * @throws Re-throws the error raised inside the tracked section after a
     *   `failed` event has been written.
     */
    async track(args) {
        const startedAt = Date.now();
        // Shared event builder; the key order matches the JSON lines on disk.
        const emit = (status, message, elapsedMs, timestamp = new Date().toISOString()) => {
            this.write({
                timestamp,
                caseId: this.caseId,
                phase: args.phase,
                status,
                message,
                elapsedMs,
            });
        };
        // The start event is stamped with the exact instant used for elapsed math.
        emit('started', args.message, 0, new Date(startedAt).toISOString());
        const heartbeatMs = args.heartbeatMs ?? 30_000;
        let heartbeat;
        if (heartbeatMs > 0) {
            heartbeat = setInterval(() => {
                emit('heartbeat', args.heartbeatMessage ?? `still running ${args.message}`, Date.now() - startedAt);
            }, heartbeatMs);
        }
        try {
            const result = await args.run();
            // Written inside the try block, so a failing write is reported as 'failed'.
            emit('completed', args.message, Date.now() - startedAt);
            return result;
        }
        catch (error) {
            const reason = error instanceof Error ? error.message : String(error);
            emit('failed', reason, Date.now() - startedAt);
            throw error;
        }
        finally {
            if (heartbeat !== undefined) {
                clearInterval(heartbeat);
            }
        }
    }
    /**
     * Appends the event to the progress file as a JSON line and, when enabled,
     * mirrors it to stdout.
     *
     * @param event Event object to persist.
     */
    write(event) {
        const jsonLine = `${JSON.stringify(event)}\n`;
        appendFileSync(this.progressPath, jsonLine, 'utf8');
        if (!this.writeStdout) {
            return;
        }
        process.stdout.write(formatProgressLine(event));
    }
}
84
/**
 * Renders a progress event as a single stdout line:
 * `[caseId] status: message`, followed by ` (elapsed)` when the event
 * carries an `elapsedMs` value, and a trailing newline.
 *
 * @param event Progress event to render.
 * @returns The formatted line, newline included.
 */
function formatProgressLine(event) {
    let elapsedSuffix = '';
    if (event.elapsedMs !== undefined) {
        elapsedSuffix = ` (${formatElapsed(event.elapsedMs)})`;
    }
    const head = `[${event.caseId}] ${event.status}: ${event.message}`;
    return `${head}${elapsedSuffix}\n`;
}
88
/**
 * Formats a duration for display: values below one second are shown in
 * milliseconds, everything else is rounded to whole seconds.
 *
 * @param elapsedMs Duration in milliseconds.
 * @returns e.g. `"250ms"` or `"12s"`.
 */
function formatElapsed(elapsedMs) {
    return elapsedMs < 1000
        ? `${elapsedMs}ms`
        : `${Math.round(elapsedMs / 1000)}s`;
}
94
+ //# sourceMappingURL=progress.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"progress.js","sourceRoot":"","sources":["../../../../src/core/eval/progress.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,cAAc,EAAE,SAAS,EAAE,MAAM,SAAS,CAAC;AACpD,OAAO,EAAE,OAAO,EAAE,MAAM,WAAW,CAAC;AAWpC,MAAM,OAAO,oBAAoB;IACtB,YAAY,CAAS;IACb,MAAM,CAAS;IACf,WAAW,CAAU;IAEtC,YAAY,IAIX;QACC,IAAI,CAAC,MAAM,GAAG,IAAI,CAAC,MAAM,CAAC;QAC1B,IAAI,CAAC,YAAY,GAAG,IAAI,CAAC,YAAY,CAAC;QACtC,IAAI,CAAC,WAAW,GAAG,IAAI,CAAC,WAAW,IAAI,IAAI,CAAC;QAC5C,SAAS,CAAC,OAAO,CAAC,IAAI,CAAC,YAAY,CAAC,EAAE,EAAE,SAAS,EAAE,IAAI,EAAE,CAAC,CAAC;IAC7D,CAAC;IAED,IAAI,CAAC,KAAa,EAAE,OAAe;QACjC,IAAI,CAAC,KAAK,CAAC;YACT,SAAS,EAAE,IAAI,IAAI,EAAE,CAAC,WAAW,EAAE;YACnC,MAAM,EAAE,IAAI,CAAC,MAAM;YACnB,KAAK;YACL,MAAM,EAAE,MAAM;YACd,OAAO;SACR,CAAC,CAAC;IACL,CAAC;IAED,KAAK,CAAC,KAAK,CAAI,IAMd;QACC,MAAM,SAAS,GAAG,IAAI,CAAC,GAAG,EAAE,CAAC;QAC7B,IAAI,CAAC,KAAK,CAAC;YACT,SAAS,EAAE,IAAI,IAAI,CAAC,SAAS,CAAC,CAAC,WAAW,EAAE;YAC5C,MAAM,EAAE,IAAI,CAAC,MAAM;YACnB,KAAK,EAAE,IAAI,CAAC,KAAK;YACjB,MAAM,EAAE,SAAS;YACjB,OAAO,EAAE,IAAI,CAAC,OAAO;YACrB,SAAS,EAAE,CAAC;SACb,CAAC,CAAC;QAEH,MAAM,WAAW,GAAG,IAAI,CAAC,WAAW,IAAI,MAAM,CAAC;QAC/C,MAAM,QAAQ,GAAG,WAAW,GAAG,CAAC,CAAC,CAAC;YAChC,WAAW,CAAC,GAAG,EAAE;gBACf,MAAM,SAAS,GAAG,IAAI,CAAC,GAAG,EAAE,GAAG,SAAS,CAAC;gBACzC,IAAI,CAAC,KAAK,CAAC;oBACT,SAAS,EAAE,IAAI,IAAI,EAAE,CAAC,WAAW,EAAE;oBACnC,MAAM,EAAE,IAAI,CAAC,MAAM;oBACnB,KAAK,EAAE,IAAI,CAAC,KAAK;oBACjB,MAAM,EAAE,WAAW;oBACnB,OAAO,EAAE,IAAI,CAAC,gBAAgB,IAAI,iBAAiB,IAAI,CAAC,OAAO,EAAE;oBACjE,SAAS;iBACV,CAAC,CAAC;YACL,CAAC,EAAE,WAAW,CAAC;YACjB,CAAC,CAAC,SAAS,CAAC;QAEZ,IAAI,CAAC;YACH,MAAM,MAAM,GAAG,MAAM,IAAI,CAAC,GAAG,EAAE,CAAC;YAChC,MAAM,SAAS,GAAG,IAAI,CAAC,GAAG,EAAE,GAAG,SAAS,CAAC;YACzC,IAAI,CAAC,KAAK,CAAC;gBACT,SAAS,EAAE,IAAI,IAAI,EAAE,CAAC,WAAW,EAAE;gBACnC,MAAM,EAAE,IAAI,CAAC,MAAM;gBACnB,KAAK,EAAE,IAAI,CAAC,KAAK;gBACjB,MAAM,EAAE,WAAW;gBACnB,OAAO,EAAE,IAAI,CAAC,OAAO;gBACrB,SAAS;aACV,CAAC,CAAC;YACH,OAAO,MAAM,CAAC;QAChB,CAAC;QAAC,OAAO,KAAK,EAAE,CAAC;YACf,MAAM,SAAS,GAAG,IAAI,CAAC,GAAG,EAAE,GAAG,SAAS,CAAC;YACzC,IAAI,CAAC,KA
AK,CAAC;gBACT,SAAS,EAAE,IAAI,IAAI,EAAE,CAAC,WAAW,EAAE;gBACnC,MAAM,EAAE,IAAI,CAAC,MAAM;gBACnB,KAAK,EAAE,IAAI,CAAC,KAAK;gBACjB,MAAM,EAAE,QAAQ;gBAChB,OAAO,EAAE,KAAK,YAAY,KAAK,CAAC,CAAC,CAAC,KAAK,CAAC,OAAO,CAAC,CAAC,CAAC,MAAM,CAAC,KAAK,CAAC;gBAC/D,SAAS;aACV,CAAC,CAAC;YACH,MAAM,KAAK,CAAC;QACd,CAAC;gBAAS,CAAC;YACT,IAAI,QAAQ,EAAE,CAAC;gBACb,aAAa,CAAC,QAAQ,CAAC,CAAC;YAC1B,CAAC;QACH,CAAC;IACH,CAAC;IAEO,KAAK,CAAC,KAAwB;QACpC,cAAc,CAAC,IAAI,CAAC,YAAY,EAAE,GAAG,IAAI,CAAC,SAAS,CAAC,KAAK,CAAC,IAAI,EAAE,MAAM,CAAC,CAAC;QACxE,IAAI,IAAI,CAAC,WAAW,EAAE,CAAC;YACrB,OAAO,CAAC,MAAM,CAAC,KAAK,CAAC,kBAAkB,CAAC,KAAK,CAAC,CAAC,CAAC;QAClD,CAAC;IACH,CAAC;CACF;AAED,SAAS,kBAAkB,CAAC,KAAwB;IAClD,MAAM,OAAO,GAAG,KAAK,CAAC,SAAS,KAAK,SAAS,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,KAAK,aAAa,CAAC,KAAK,CAAC,SAAS,CAAC,GAAG,CAAC;IAC5F,OAAO,IAAI,KAAK,CAAC,MAAM,KAAK,KAAK,CAAC,MAAM,KAAK,KAAK,CAAC,OAAO,GAAG,OAAO,IAAI,CAAC;AAC3E,CAAC;AAED,SAAS,aAAa,CAAC,SAAiB;IACtC,IAAI,SAAS,GAAG,IAAI,EAAE,CAAC;QACrB,OAAO,GAAG,SAAS,IAAI,CAAC;IAC1B,CAAC;IACD,OAAO,GAAG,IAAI,CAAC,KAAK,CAAC,SAAS,GAAG,IAAI,CAAC,GAAG,CAAC;AAC5C,CAAC"}
@@ -0,0 +1,7 @@
1
+ import type { EvalSuiteReport } from './schema.js';
2
/**
 * Writes the suite report into `report.resultsDir` as `report.json` and
 * `report.md`, creating the directory if needed.
 *
 * @param report Suite report to persist.
 * @returns The paths of the JSON and Markdown files that were written.
 */
+ export declare function writeEvalSuiteReport(report: EvalSuiteReport): {
3
+ jsonPath: string;
4
+ markdownPath: string;
5
+ };
6
/**
 * Renders the suite report as a Markdown document: a header, a summary table
 * with one row per result, and a detail section per result.
 *
 * @param report Suite report to render.
 * @returns The Markdown text, ending with a single newline.
 */
+ export declare function formatEvalSuiteMarkdown(report: EvalSuiteReport): string;
7
+ //# sourceMappingURL=report-writer.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"report-writer.d.ts","sourceRoot":"","sources":["../../../../src/core/eval/report-writer.ts"],"names":[],"mappings":"AAEA,OAAO,KAAK,EAAiB,eAAe,EAAE,MAAM,aAAa,CAAC;AAElE,wBAAgB,oBAAoB,CAAC,MAAM,EAAE,eAAe,GAAG;IAAE,QAAQ,EAAE,MAAM,CAAC;IAAC,YAAY,EAAE,MAAM,CAAA;CAAE,CAOxG;AAED,wBAAgB,uBAAuB,CAAC,MAAM,EAAE,eAAe,GAAG,MAAM,CAqBvE"}
@@ -0,0 +1,159 @@
1
+ import { mkdirSync, writeFileSync } from 'node:fs';
2
+ import { relative, join } from 'node:path';
3
/**
 * Persists a suite report next to its results.
 *
 * Creates `report.resultsDir` (recursively), then writes `report.json`
 * (2-space indented JSON plus a trailing newline) and `report.md`
 * (the output of `formatEvalSuiteMarkdown`).
 *
 * @param report Suite report to write.
 * @returns `{ jsonPath, markdownPath }` for the two files written.
 */
export function writeEvalSuiteReport(report) {
    const targetDir = report.resultsDir;
    mkdirSync(targetDir, { recursive: true });
    const written = {
        jsonPath: join(targetDir, 'report.json'),
        markdownPath: join(targetDir, 'report.md'),
    };
    const serialized = `${JSON.stringify(report, null, 2)}\n`;
    writeFileSync(written.jsonPath, serialized, 'utf8');
    writeFileSync(written.markdownPath, formatEvalSuiteMarkdown(report), 'utf8');
    return written;
}
11
/**
 * Builds the Markdown report for a whole eval suite.
 *
 * Layout: title, target/start/finish/pass-count lines, a summary table with
 * one row per result, then one detail section per result. Trailing whitespace
 * is trimmed and exactly one newline is appended.
 *
 * @param report Suite report to render.
 * @returns The Markdown document.
 */
export function formatEvalSuiteMarkdown(report) {
    const { results, resultsDir } = report;
    let passedCount = 0;
    for (const entry of results) {
        if (entry.status === 'passed') {
            passedCount += 1;
        }
    }
    const overview = [
        '# Heddle Agent Eval Report',
        '',
        `Target: ${report.target}`,
        `Started: ${report.startedAt}`,
        `Finished: ${report.finishedAt}`,
        `Results: ${passedCount}/${results.length} passed`,
        '',
        '| Case | Status | Model | Checks | Outcome | Turns | Mutations | Verification After Mutation |',
        '| --- | --- | --- | ---: | --- | ---: | ---: | ---: |',
    ];
    const summaryRows = results.map((entry) => formatSummaryRow(entry));
    // Each detail section is followed by a blank separator line.
    const detailLines = results.flatMap((entry) => [...formatRunDetail(entry, resultsDir), '']);
    const markdown = [...overview, ...summaryRows, '', ...detailLines].join('\n');
    return `${markdown.trimEnd()}\n`;
}
31
/**
 * Renders one row of the summary table for a single eval result.
 *
 * Columns: case id, status, model (or `default`), passed/total checks,
 * outcome (or `exit <code>` when no outcome is recorded), assistant turns,
 * mutations, and verification commands after mutation.
 *
 * @param result Eval result for one case.
 * @returns A Markdown table row with every cell escaped.
 */
function formatSummaryRow(result) {
    const { checks, metrics } = result;
    const passedChecks = checks.reduce((count, check) => (check.passed ? count + 1 : count), 0);
    const outcome = metrics.outcome ?? `exit ${result.agent.exitCode ?? 'unknown'}`;
    const cells = [
        result.caseId,
        result.status,
        result.model ?? 'default',
        `${passedChecks}/${checks.length}`,
        outcome,
        String(metrics.assistantTurns),
        String(metrics.mutations),
        String(metrics.verificationCommandsAfterMutation),
    ];
    const escaped = cells.map((cell) => escapeCell(cell));
    return `| ${escaped.join(' | ')} |`;
}
44
/**
 * Renders the detail section for one eval result as Markdown lines.
 *
 * Sections, in order: field table, milestone review, changed files,
 * post-run checks, metrics, agent verification commands, rubric, and
 * (only when present) the final summary.
 *
 * @param result Eval result for one case.
 * @param resultsDir Base directory; artifact paths inside it are shown relative to it.
 * @returns The lines of the section (not yet joined).
 */
function formatRunDetail(result, resultsDir) {
    const { artifacts, metrics, review } = result;
    // Small cell helpers: inline-code wrapping and path shortening.
    const asCode = (text) => `\`${escapeCell(text)}\``;
    const pathCell = (target) => asCode(formatPath(target, resultsDir));
    const optionalPathCell = (target) => (target ? pathCell(target) : 'none');
    const tracePaths = artifacts.traceFiles.map((file) => formatPath(file, resultsDir));
    const agentExit = `${result.agent.exitCode ?? 'unknown'}${result.agent.timedOut ? ' (timed out)' : ''}`;
    const fields = [
        ['Status', escapeCell(result.status)],
        ['Model', escapeCell(result.model ?? 'default')],
        ['Max steps', escapeCell(String(result.maxSteps ?? 'default'))],
        ['Agent exit', escapeCell(agentExit)],
        ['Fixture', escapeCell(formatFixture(result))],
        ['Workspace', pathCell(result.workspaceRoot)],
        ['Output', pathCell(result.outputDir)],
        ['Diff', pathCell(artifacts.gitDiffPath)],
        ['Diff stat', pathCell(artifacts.gitDiffStatPath)],
        ['Changed files JSON', pathCell(artifacts.changedFilesPath)],
        ['Git status', pathCell(artifacts.gitStatusPath)],
        ['Progress', optionalPathCell(artifacts.progressPath)],
        ['Session catalog', optionalPathCell(artifacts.sessionCatalogPath)],
        ['Trace files', tracePaths.length ? tracePaths.map((file) => asCode(file)).join('<br>') : 'none'],
    ];
    const out = [
        `## ${result.caseId}`,
        '',
        '| Field | Value |',
        '| --- | --- |',
        ...fields.map(([label, value]) => `| ${label} | ${value} |`),
        '',
        '### Milestone Review',
        '',
        ...formatReviewSection(result),
        '',
        '### Changed Files',
        '',
    ];
    if (artifacts.changedFiles.length !== 0) {
        out.push('| File | Status | + | - |', '| --- | --- | ---: | ---: |');
        for (const file of artifacts.changedFiles) {
            const added = file.additions ?? '';
            const removed = file.deletions ?? '';
            out.push(`| ${escapeCell(file.path)} | ${escapeCell(file.status)} | ${added} | ${removed} |`);
        }
    }
    else {
        out.push('- none');
    }
    out.push('', '### Post-Run Checks', '');
    if (result.checks.length !== 0) {
        for (const check of result.checks) {
            const verdict = check.passed ? 'PASS' : 'FAIL';
            out.push(`- ${verdict} ${check.name}: \`${check.command}\` (${check.exitCode ?? 'unknown'}, ${check.durationMs}ms)`);
        }
    }
    else {
        out.push('- none');
    }
    const metricRows = [
        ['Assistant turns', metrics.assistantTurns],
        ['Tool calls', metrics.toolCalls],
        ['Mutations', metrics.mutations],
        ['Verification after mutation', metrics.verificationCommandsAfterMutation],
        ['Approvals requested', metrics.approvalsRequested],
        ['Approvals resolved', metrics.approvalsResolved],
        ['Tool errors', metrics.toolErrors],
    ];
    out.push('', '### Metrics', '', '| Metric | Value |', '| --- | ---: |');
    for (const [label, value] of metricRows) {
        out.push(`| ${label} | ${value} |`);
    }
    out.push('', '### Agent Verification Commands', '');
    if (metrics.verificationCommandDetails.length !== 0) {
        for (const command of metrics.verificationCommandDetails) {
            out.push(`- ${asCode(command)}`);
        }
    }
    else {
        out.push('- none detected after first mutation');
    }
    out.push('', '### Rubric', '');
    // Required outcomes come first, then the questions for the human reviewer.
    const rubricItems = [...review.requiredOutcomes, ...review.humanQuestions];
    if (rubricItems.length === 0) {
        out.push('- none');
    }
    else {
        for (const item of rubricItems) {
            out.push(`- [ ] ${item}`);
        }
    }
    if (metrics.summary) {
        out.push('', '### Final Summary', '', metrics.summary);
    }
    return out;
}
115
/**
 * Renders the milestone-review part of a result: optional milestone line,
 * optional intent paragraph, then a table listing required outcomes,
 * allowed scope, out-of-scope items and human questions.
 *
 * @param result Eval result whose `review` is rendered.
 * @returns Markdown lines for the review section.
 */
function formatReviewSection(result) {
    const { review } = result;
    const out = [];
    if (review.milestone) {
        out.push(`Milestone: ${review.milestone}`);
    }
    if (review.intent) {
        out.push('', review.intent);
    }
    const rows = [
        ['Required outcomes', review.requiredOutcomes],
        ['Allowed scope', review.allowedScope],
        ['Out of scope', review.outOfScope],
        ['Human questions', review.humanQuestions],
    ];
    out.push('', '| Review Field | Items |', '| --- | --- |');
    for (const [label, items] of rows) {
        out.push(`| ${label} | ${formatListCell(items)} |`);
    }
    return out;
}
130
/**
 * Renders a list of strings inside one table cell: each item becomes
 * `- item` (escaped), items are separated by `<br>`; an empty list is `none`.
 *
 * @param items Strings to list.
 * @returns The cell content.
 */
function formatListCell(items) {
    if (!items.length) {
        return 'none';
    }
    const bullets = items.map((item) => `- ${escapeCell(item)}`);
    return bullets.join('<br>');
}
133
/**
 * Describes the fixture of a result in one line.
 *
 * For `git-worktree` fixtures: `git-worktree`, then (when set) the ref, the
 * shortened resolved commit, and the shortened baseline commit if it differs
 * from the resolved one. Any other fixture is reported as `inline`, with the
 * shortened baseline commit when there is one.
 *
 * @param result Eval result whose `fixture` is described.
 * @returns Comma-separated description.
 */
function formatFixture(result) {
    const { fixture } = result;
    if (fixture.type !== 'git-worktree') {
        return fixture.baselineCommit ? `inline, baseline ${shortSha(fixture.baselineCommit)}` : 'inline';
    }
    const parts = ['git-worktree'];
    if (fixture.ref) {
        parts.push(`ref ${fixture.ref}`);
    }
    if (fixture.resolvedRef) {
        parts.push(`commit ${shortSha(fixture.resolvedRef)}`);
    }
    // The baseline is only worth showing when it is not the resolved commit itself.
    if (fixture.baselineCommit && fixture.baselineCommit !== fixture.resolvedRef) {
        parts.push(`baseline ${shortSha(fixture.baselineCommit)}`);
    }
    return parts.join(', ');
}
146
/**
 * Shortens a commit identifier to its first 12 characters.
 *
 * @param value Full identifier string.
 * @returns At most the first 12 characters of `value`.
 */
function shortSha(value) {
    const SHORT_LENGTH = 12;
    return value.substring(0, SHORT_LENGTH);
}
149
/**
 * Makes a string safe to place inside a Markdown table cell.
 *
 * Pipes are backslash-escaped so they are not read as column separators, and
 * every line break is flattened to a single space. Besides `\n` this covers
 * `\r\n` and a lone `\r`: Markdown treats a carriage return as a line ending
 * too, so leaving one in the cell would still split the table row.
 *
 * @param value Raw cell text.
 * @returns The escaped, single-line text.
 */
function escapeCell(value) {
    return value.replace(/\|/g, '\\|').replace(/\r\n|\r|\n/g, ' ');
}
152
/**
 * Shortens a path for display relative to `basePath`.
 *
 * Returns `.` when both paths are the same string, the relative path when
 * `path` lies inside `basePath`, and the unchanged `path` otherwise.
 *
 * @param path Path to display.
 * @param basePath Directory the result should be relative to.
 * @returns The display form of `path`.
 */
function formatPath(path, basePath) {
    if (path === basePath) {
        return '.';
    }
    const relativePath = relative(basePath, path);
    // Only a leading `..` path segment means the target is outside basePath.
    // A plain prefix check would also reject entries such as `..cache/trace.json`
    // that merely begin with two dots.
    const leavesBase = relativePath === '..'
        || relativePath.startsWith('../')
        || relativePath.startsWith('..\\');
    return relativePath && !leavesBase ? relativePath : path;
}
159
+ //# sourceMappingURL=report-writer.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"report-writer.js","sourceRoot":"","sources":["../../../../src/core/eval/report-writer.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,SAAS,EAAE,aAAa,EAAE,MAAM,SAAS,CAAC;AACnD,OAAO,EAAE,QAAQ,EAAE,IAAI,EAAE,MAAM,WAAW,CAAC;AAG3C,MAAM,UAAU,oBAAoB,CAAC,MAAuB;IAC1D,SAAS,CAAC,MAAM,CAAC,UAAU,EAAE,EAAE,SAAS,EAAE,IAAI,EAAE,CAAC,CAAC;IAClD,MAAM,QAAQ,GAAG,IAAI,CAAC,MAAM,CAAC,UAAU,EAAE,aAAa,CAAC,CAAC;IACxD,MAAM,YAAY,GAAG,IAAI,CAAC,MAAM,CAAC,UAAU,EAAE,WAAW,CAAC,CAAC;IAC1D,aAAa,CAAC,QAAQ,EAAE,GAAG,IAAI,CAAC,SAAS,CAAC,MAAM,EAAE,IAAI,EAAE,CAAC,CAAC,IAAI,EAAE,MAAM,CAAC,CAAC;IACxE,aAAa,CAAC,YAAY,EAAE,uBAAuB,CAAC,MAAM,CAAC,EAAE,MAAM,CAAC,CAAC;IACrE,OAAO,EAAE,QAAQ,EAAE,YAAY,EAAE,CAAC;AACpC,CAAC;AAED,MAAM,UAAU,uBAAuB,CAAC,MAAuB;IAC7D,MAAM,MAAM,GAAG,MAAM,CAAC,OAAO,CAAC,MAAM,CAAC,CAAC,MAAM,EAAE,EAAE,CAAC,MAAM,CAAC,MAAM,KAAK,QAAQ,CAAC,CAAC,MAAM,CAAC;IACpF,MAAM,KAAK,GAAG;QACZ,4BAA4B;QAC5B,EAAE;QACF,WAAW,MAAM,CAAC,MAAM,EAAE;QAC1B,YAAY,MAAM,CAAC,SAAS,EAAE;QAC9B,aAAa,MAAM,CAAC,UAAU,EAAE;QAChC,YAAY,MAAM,IAAI,MAAM,CAAC,OAAO,CAAC,MAAM,SAAS;QACpD,EAAE;QACF,gGAAgG;QAChG,uDAAuD;QACvD,GAAG,MAAM,CAAC,OAAO,CAAC,GAAG,CAAC,gBAAgB,CAAC;QACvC,EAAE;KACH,CAAC;IAEF,KAAK,MAAM,MAAM,IAAI,MAAM,CAAC,OAAO,EAAE,CAAC;QACpC,KAAK,CAAC,IAAI,CAAC,GAAG,eAAe,CAAC,MAAM,EAAE,MAAM,CAAC,UAAU,CAAC,EAAE,EAAE,CAAC,CAAC;IAChE,CAAC;IAED,OAAO,GAAG,KAAK,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC,OAAO,EAAE,IAAI,CAAC;AAC3C,CAAC;AAED,SAAS,gBAAgB,CAAC,MAAqB;IAC7C,MAAM,YAAY,GAAG,MAAM,CAAC,MAAM,CAAC,MAAM,CAAC,CAAC,KAAK,EAAE,EAAE,CAAC,KAAK,CAAC,MAAM,CAAC,CAAC,MAAM,CAAC;IAC1E,OAAO;QACL,MAAM,CAAC,MAAM;QACb,MAAM,CAAC,MAAM;QACb,MAAM,CAAC,KAAK,IAAI,SAAS;QACzB,GAAG,YAAY,IAAI,MAAM,CAAC,MAAM,CAAC,MAAM,EAAE;QACzC,MAAM,CAAC,OAAO,CAAC,OAAO,IAAI,QAAQ,MAAM,CAAC,KAAK,CAAC,QAAQ,IAAI,SAAS,EAAE;QACtE,MAAM,CAAC,MAAM,CAAC,OAAO,CAAC,cAAc,CAAC;QACrC,MAAM,CAAC,MAAM,CAAC,OAAO,CAAC,SAAS,CAAC;QAChC,MAAM,CAAC,MAAM,CAAC,OAAO,CAAC,iCAAiC,CAAC;KACzD,CAAC,GAAG,CAAC,UAAU,CAAC,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC,OAAO,CAAC,GAAG,EAAE,IAAI,CAAC,CAAC,OAAO,CAAC,GAAG,EAAE,IAAI,CA
AC,CAAC;AACtE,CAAC;AAED,SAAS,eAAe,CAAC,MAAqB,EAAE,UAAkB;IAChE,MAAM,UAAU,GAAG,MAAM,CAAC,SAAS,CAAC,UAAU,CAAC,GAAG,CAAC,CAAC,IAAI,EAAE,EAAE,CAAC,UAAU,CAAC,IAAI,EAAE,UAAU,CAAC,CAAC,CAAC;IAC3F,MAAM,KAAK,GAAG;QACZ,MAAM,MAAM,CAAC,MAAM,EAAE;QACrB,EAAE;QACF,mBAAmB;QACnB,eAAe;QACf,cAAc,UAAU,CAAC,MAAM,CAAC,MAAM,CAAC,IAAI;QAC3C,aAAa,UAAU,CAAC,MAAM,CAAC,KAAK,IAAI,SAAS,CAAC,IAAI;QACtD,iBAAiB,UAAU,CAAC,MAAM,CAAC,MAAM,CAAC,QAAQ,IAAI,SAAS,CAAC,CAAC,IAAI;QACrE,kBAAkB,UAAU,CAAC,GAAG,MAAM,CAAC,KAAK,CAAC,QAAQ,IAAI,SAAS,GAAG,MAAM,CAAC,KAAK,CAAC,QAAQ,CAAC,CAAC,CAAC,cAAc,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,IAAI;QACvH,eAAe,UAAU,CAAC,aAAa,CAAC,MAAM,CAAC,CAAC,IAAI;QACpD,mBAAmB,UAAU,CAAC,UAAU,CAAC,MAAM,CAAC,aAAa,EAAE,UAAU,CAAC,CAAC,MAAM;QACjF,gBAAgB,UAAU,CAAC,UAAU,CAAC,MAAM,CAAC,SAAS,EAAE,UAAU,CAAC,CAAC,MAAM;QAC1E,cAAc,UAAU,CAAC,UAAU,CAAC,MAAM,CAAC,SAAS,CAAC,WAAW,EAAE,UAAU,CAAC,CAAC,MAAM;QACpF,mBAAmB,UAAU,CAAC,UAAU,CAAC,MAAM,CAAC,SAAS,CAAC,eAAe,EAAE,UAAU,CAAC,CAAC,MAAM;QAC7F,4BAA4B,UAAU,CAAC,UAAU,CAAC,MAAM,CAAC,SAAS,CAAC,gBAAgB,EAAE,UAAU,CAAC,CAAC,MAAM;QACvG,oBAAoB,UAAU,CAAC,UAAU,CAAC,MAAM,CAAC,SAAS,CAAC,aAAa,EAAE,UAAU,CAAC,CAAC,MAAM;QAC5F,gBAAgB,MAAM,CAAC,SAAS,CAAC,YAAY,CAAC,CAAC,CAAC,KAAK,UAAU,CAAC,UAAU,CAAC,MAAM,CAAC,SAAS,CAAC,YAAY,EAAE,UAAU,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,MAAM,IAAI;QACvI,uBAAuB,MAAM,CAAC,SAAS,CAAC,kBAAkB,CAAC,CAAC,CAAC,KAAK,UAAU,CAAC,UAAU,CAAC,MAAM,CAAC,SAAS,CAAC,kBAAkB,EAAE,UAAU,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,MAAM,IAAI;QAC1J,mBAAmB,UAAU,CAAC,MAAM,CAAC,CAAC,CAAC,UAAU,CAAC,GAAG,CAAC,CAAC,IAAI,EAAE,EAAE,CAAC,KAAK,UAAU,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC,IAAI,CAAC,MAAM,CAAC,CAAC,CAAC,CAAC,MAAM,IAAI;QACpH,EAAE;QACF,sBAAsB;QACtB,EAAE;KACH,CAAC;IAEF,KAAK,CAAC,IAAI,CAAC,GAAG,mBAAmB,CAAC,MAAM,CAAC,EAAE,EAAE,EAAE,mBAAmB,EAAE,EAAE,CAAC,CAAC;IACxE,IAAI,MAAM,CAAC,SAAS,CAAC,YAAY,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;QAC/C,KAAK,CAAC,IAAI,CAAC,QAAQ,CAAC,CAAC;IACvB,CAAC;SAAM,CAAC;QACN,KAAK,CAAC,IAAI,CAAC,2BAA2B,EAAE,6BAA6B,CAAC,CAAC;QACvE,KAAK,MAAM,IAAI,IAAI,MAAM,CAAC,SAAS,CAAC,YAAY,EAAE,CAAC;YACjD,KAAK,CAAC,IA
AI,CAAC,KAAK,UAAU,CAAC,IAAI,CAAC,IAAI,CAAC,MAAM,UAAU,CAAC,IAAI,CAAC,MAAM,CAAC,MAAM,IAAI,CAAC,SAAS,IAAI,EAAE,MAAM,IAAI,CAAC,SAAS,IAAI,EAAE,IAAI,CAAC,CAAC;QAC9H,CAAC;IACH,CAAC;IAED,KAAK,CAAC,IAAI,CAAC,EAAE,EAAE,qBAAqB,EAAE,EAAE,CAAC,CAAC;IAC1C,IAAI,MAAM,CAAC,MAAM,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;QAC/B,KAAK,CAAC,IAAI,CAAC,QAAQ,CAAC,CAAC;IACvB,CAAC;SAAM,CAAC;QACN,KAAK,MAAM,KAAK,IAAI,MAAM,CAAC,MAAM,EAAE,CAAC;YAClC,KAAK,CAAC,IAAI,CAAC,KAAK,KAAK,CAAC,MAAM,CAAC,CAAC,CAAC,MAAM,CAAC,CAAC,CAAC,MAAM,IAAI,KAAK,CAAC,IAAI,OAAO,KAAK,CAAC,OAAO,OAAO,KAAK,CAAC,QAAQ,IAAI,SAAS,KAAK,KAAK,CAAC,UAAU,KAAK,CAAC,CAAC;QAChJ,CAAC;IACH,CAAC;IAED,KAAK,CAAC,IAAI,CACR,EAAE,EACF,aAAa,EACb,EAAE,EACF,oBAAoB,EACpB,gBAAgB,EAChB,uBAAuB,MAAM,CAAC,OAAO,CAAC,cAAc,IAAI,EACxD,kBAAkB,MAAM,CAAC,OAAO,CAAC,SAAS,IAAI,EAC9C,iBAAiB,MAAM,CAAC,OAAO,CAAC,SAAS,IAAI,EAC7C,mCAAmC,MAAM,CAAC,OAAO,CAAC,iCAAiC,IAAI,EACvF,2BAA2B,MAAM,CAAC,OAAO,CAAC,kBAAkB,IAAI,EAChE,0BAA0B,MAAM,CAAC,OAAO,CAAC,iBAAiB,IAAI,EAC9D,mBAAmB,MAAM,CAAC,OAAO,CAAC,UAAU,IAAI,CACjD,CAAC;IAEF,KAAK,CAAC,IAAI,CAAC,EAAE,EAAE,iCAAiC,EAAE,EAAE,CAAC,CAAC;IACtD,IAAI,MAAM,CAAC,OAAO,CAAC,0BAA0B,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;QAC3D,KAAK,CAAC,IAAI,CAAC,sCAAsC,CAAC,CAAC;IACrD,CAAC;SAAM,CAAC;QACN,KAAK,MAAM,OAAO,IAAI,MAAM,CAAC,OAAO,CAAC,0BAA0B,EAAE,CAAC;YAChE,KAAK,CAAC,IAAI,CAAC,OAAO,UAAU,CAAC,OAAO,CAAC,IAAI,CAAC,CAAC;QAC7C,CAAC;IACH,CAAC;IAED,KAAK,CAAC,IAAI,CAAC,EAAE,EAAE,YAAY,EAAE,EAAE,CAAC,CAAC;IACjC,IAAI,MAAM,CAAC,MAAM,CAAC,gBAAgB,CAAC,MAAM,KAAK,CAAC,IAAI,MAAM,CAAC,MAAM,CAAC,cAAc,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;QAC7F,KAAK,CAAC,IAAI,CAAC,QAAQ,CAAC,CAAC;IACvB,CAAC;SAAM,CAAC;QACN,KAAK,MAAM,OAAO,IAAI,MAAM,CAAC,MAAM,CAAC,gBAAgB,EAAE,CAAC;YACrD,KAAK,CAAC,IAAI,CAAC,SAAS,OAAO,EAAE,CAAC,CAAC;QACjC,CAAC;QACD,KAAK,MAAM,QAAQ,IAAI,MAAM,CAAC,MAAM,CAAC,cAAc,EAAE,CAAC;YACpD,KAAK,CAAC,IAAI,CAAC,SAAS,QAAQ,EAAE,CAAC,CAAC;QAClC,CAAC;IACH,CAAC;IAED,IAAI,MAAM,CAAC,OAAO,CAAC,OAAO,EAAE,CAAC;QAC3B,KAAK,CAAC,IAAI,CAAC,EAAE,EAAE,mBAAmB,EAAE,EAAE,EAAE,MAAM,CAAC,OAAO,CAAC,OAAO,CAAC,CAAC;IAClE,CAAC;IAE
D,OAAO,KAAK,CAAC;AACf,CAAC;AAED,SAAS,mBAAmB,CAAC,MAAqB;IAChD,MAAM,KAAK,GAAa,EAAE,CAAC;IAC3B,IAAI,MAAM,CAAC,MAAM,CAAC,SAAS,EAAE,CAAC;QAC5B,KAAK,CAAC,IAAI,CAAC,cAAc,MAAM,CAAC,MAAM,CAAC,SAAS,EAAE,CAAC,CAAC;IACtD,CAAC;IACD,IAAI,MAAM,CAAC,MAAM,CAAC,MAAM,EAAE,CAAC;QACzB,KAAK,CAAC,IAAI,CAAC,EAAE,EAAE,MAAM,CAAC,MAAM,CAAC,MAAM,CAAC,CAAC;IACvC,CAAC;IACD,KAAK,CAAC,IAAI,CAAC,EAAE,EAAE,0BAA0B,EAAE,eAAe,CAAC,CAAC;IAC5D,KAAK,CAAC,IAAI,CAAC,yBAAyB,cAAc,CAAC,MAAM,CAAC,MAAM,CAAC,gBAAgB,CAAC,IAAI,CAAC,CAAC;IACxF,KAAK,CAAC,IAAI,CAAC,qBAAqB,cAAc,CAAC,MAAM,CAAC,MAAM,CAAC,YAAY,CAAC,IAAI,CAAC,CAAC;IAChF,KAAK,CAAC,IAAI,CAAC,oBAAoB,cAAc,CAAC,MAAM,CAAC,MAAM,CAAC,UAAU,CAAC,IAAI,CAAC,CAAC;IAC7E,KAAK,CAAC,IAAI,CAAC,uBAAuB,cAAc,CAAC,MAAM,CAAC,MAAM,CAAC,cAAc,CAAC,IAAI,CAAC,CAAC;IACpF,OAAO,KAAK,CAAC;AACf,CAAC;AAED,SAAS,cAAc,CAAC,KAAe;IACrC,OAAO,KAAK,CAAC,MAAM,CAAC,CAAC,CAAC,KAAK,CAAC,GAAG,CAAC,CAAC,IAAI,EAAE,EAAE,CAAC,KAAK,UAAU,CAAC,IAAI,CAAC,EAAE,CAAC,CAAC,IAAI,CAAC,MAAM,CAAC,CAAC,CAAC,CAAC,MAAM,CAAC;AAC3F,CAAC;AAED,SAAS,aAAa,CAAC,MAAqB;IAC1C,IAAI,MAAM,CAAC,OAAO,CAAC,IAAI,KAAK,cAAc,EAAE,CAAC;QAC3C,OAAO;YACL,cAAc;YACd,MAAM,CAAC,OAAO,CAAC,GAAG,CAAC,CAAC,CAAC,OAAO,MAAM,CAAC,OAAO,CAAC,GAAG,EAAE,CAAC,CAAC,CAAC,SAAS;YAC5D,MAAM,CAAC,OAAO,CAAC,WAAW,CAAC,CAAC,CAAC,UAAU,QAAQ,CAAC,MAAM,CAAC,OAAO,CAAC,WAAW,CAAC,EAAE,CAAC,CAAC,CAAC,SAAS;YACzF,MAAM,CAAC,OAAO,CAAC,cAAc,IAAI,MAAM,CAAC,OAAO,CAAC,cAAc,KAAK,MAAM,CAAC,OAAO,CAAC,WAAW,CAAC,CAAC;gBAC7F,YAAY,QAAQ,CAAC,MAAM,CAAC,OAAO,CAAC,cAAc,CAAC,EAAE;gBACvD,CAAC,CAAC,SAAS;SACZ,CAAC,MAAM,CAAC,OAAO,CAAC,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;IAC/B,CAAC;IACD,OAAO,MAAM,CAAC,OAAO,CAAC,cAAc,CAAC,CAAC,CAAC,oBAAoB,QAAQ,CAAC,MAAM,CAAC,OAAO,CAAC,cAAc,CAAC,EAAE,CAAC,CAAC,CAAC,QAAQ,CAAC;AAClH,CAAC;AAED,SAAS,QAAQ,CAAC,KAAa;IAC7B,OAAO,KAAK,CAAC,KAAK,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC;AAC5B,CAAC;AAED,SAAS,UAAU,CAAC,KAAa;IAC/B,OAAO,KAAK,CAAC,UAAU,CAAC,GAAG,EAAE,KAAK,CAAC,CAAC,UAAU,CAAC,IAAI,EAAE,GAAG,CAAC,CAAC;AAC5D,CAAC;AAED,SAAS,UAAU,CAAC,IAAY,EAAE,QAAgB;IAChD,IAAI,IAAI,KAAK,QAAQ,EAAE,CAAC;QACtB,
OAAO,GAAG,CAAC;IACb,CAAC;IAED,MAAM,YAAY,GAAG,QAAQ,CAAC,QAAQ,EAAE,IAAI,CAAC,CAAC;IAC9C,OAAO,YAAY,IAAI,CAAC,YAAY,CAAC,UAAU,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC,YAAY,CAAC,CAAC,CAAC,IAAI,CAAC;AAC9E,CAAC"}
@@ -0,0 +1,206 @@
1
+ import { z } from 'zod';
2
+ export declare const evalCheckSchema: z.ZodObject<{ // Declared type of the Zod schema for one command-based check; the same object shape appears under setup `commands` and case `checks`.
3
+ name: z.ZodOptional<z.ZodString>; // optional label
4
+ command: z.ZodString; // the only required field
5
+ timeoutMs: z.ZodOptional<z.ZodNumber>; // optional; milliseconds per the field name
6
+ }, z.core.$strip>; // NOTE(review): `$strip` presumably means unknown keys are dropped on parse - confirm against the zod v4 typings
7
+ export declare const evalSetupSchema: z.ZodObject<{ // Declared type of the workspace-setup schema; every field is optional.
8
+ copyFiles: z.ZodOptional<z.ZodRecord<z.ZodString, z.ZodString>>; // string -> string map
9
+ files: z.ZodOptional<z.ZodRecord<z.ZodString, z.ZodString>>; // string -> string map
10
+ commands: z.ZodOptional<z.ZodArray<z.ZodObject<{ // array of objects with the same shape as evalCheckSchema
11
+ name: z.ZodOptional<z.ZodString>;
12
+ command: z.ZodString;
13
+ timeoutMs: z.ZodOptional<z.ZodNumber>;
14
+ }, z.core.$strip>>>;
15
+ commitMessage: z.ZodOptional<z.ZodString>;
16
+ }, z.core.$strip>;
17
+ export declare const evalInlineFixtureSchema: z.ZodObject<{ // Fixture variant that carries nothing but its tag.
18
+ type: z.ZodLiteral<"inline">; // discriminant value for this variant
19
+ }, z.core.$strip>;
20
+ export declare const evalGitWorktreeFixtureSchema: z.ZodObject<{ // Fixture variant identified by a repo and a ref.
21
+ type: z.ZodLiteral<"git-worktree">; // discriminant value for this variant
22
+ repo: z.ZodDefault<z.ZodString>; // has a default, so callers may omit it on input
23
+ ref: z.ZodString; // required
24
+ }, z.core.$strip>;
25
+ export declare const evalFixtureSchema: z.ZodDiscriminatedUnion<[z.ZodObject<{ // Union of the two fixture variants, discriminated on the "type" key.
26
+ type: z.ZodLiteral<"inline">;
27
+ }, z.core.$strip>, z.ZodObject<{
28
+ type: z.ZodLiteral<"git-worktree">;
29
+ repo: z.ZodDefault<z.ZodString>;
30
+ ref: z.ZodString;
31
+ }, z.core.$strip>], "type">; // "type" is the discriminator key
32
+ export declare const evalMilestoneReviewSchema: z.ZodObject<{ // Declared type of the review-metadata schema: two optional strings plus four defaulted string arrays.
33
+ milestone: z.ZodOptional<z.ZodString>;
34
+ intent: z.ZodOptional<z.ZodString>;
35
+ requiredOutcomes: z.ZodDefault<z.ZodArray<z.ZodString>>; // defaulted, so always present after parsing
36
+ allowedScope: z.ZodDefault<z.ZodArray<z.ZodString>>; // defaulted
37
+ outOfScope: z.ZodDefault<z.ZodArray<z.ZodString>>; // defaulted
38
+ humanQuestions: z.ZodDefault<z.ZodArray<z.ZodString>>; // defaulted
39
+ }, z.core.$strip>;
40
+ export declare const agentEvalCaseSchema: z.ZodObject<{ // Declared type of the eval-case schema; nests the setup, fixture, review and check shapes declared above.
41
+ id: z.ZodString; // required
42
+ kind: z.ZodLiteral<"coding">; // the only accepted kind in this declaration
43
+ description: z.ZodOptional<z.ZodString>;
44
+ prompt: z.ZodString; // required
45
+ model: z.ZodOptional<z.ZodString>;
46
+ maxSteps: z.ZodOptional<z.ZodNumber>;
47
+ setup: z.ZodDefault<z.ZodObject<{ // same shape as evalSetupSchema, with a default
48
+ copyFiles: z.ZodOptional<z.ZodRecord<z.ZodString, z.ZodString>>;
49
+ files: z.ZodOptional<z.ZodRecord<z.ZodString, z.ZodString>>;
50
+ commands: z.ZodOptional<z.ZodArray<z.ZodObject<{
51
+ name: z.ZodOptional<z.ZodString>;
52
+ command: z.ZodString;
53
+ timeoutMs: z.ZodOptional<z.ZodNumber>;
54
+ }, z.core.$strip>>>;
55
+ commitMessage: z.ZodOptional<z.ZodString>;
56
+ }, z.core.$strip>>;
57
+ fixture: z.ZodDefault<z.ZodDiscriminatedUnion<[z.ZodObject<{ // same shape as evalFixtureSchema, with a default
58
+ type: z.ZodLiteral<"inline">;
59
+ }, z.core.$strip>, z.ZodObject<{
60
+ type: z.ZodLiteral<"git-worktree">;
61
+ repo: z.ZodDefault<z.ZodString>;
62
+ ref: z.ZodString;
63
+ }, z.core.$strip>], "type">>;
64
+ review: z.ZodDefault<z.ZodObject<{ // same shape as evalMilestoneReviewSchema, with a default
65
+ milestone: z.ZodOptional<z.ZodString>;
66
+ intent: z.ZodOptional<z.ZodString>;
67
+ requiredOutcomes: z.ZodDefault<z.ZodArray<z.ZodString>>;
68
+ allowedScope: z.ZodDefault<z.ZodArray<z.ZodString>>;
69
+ outOfScope: z.ZodDefault<z.ZodArray<z.ZodString>>;
70
+ humanQuestions: z.ZodDefault<z.ZodArray<z.ZodString>>;
71
+ }, z.core.$strip>>;
72
+ checks: z.ZodDefault<z.ZodArray<z.ZodObject<{ // array of evalCheckSchema-shaped objects, with a default
73
+ name: z.ZodOptional<z.ZodString>;
74
+ command: z.ZodString;
75
+ timeoutMs: z.ZodOptional<z.ZodNumber>;
76
+ }, z.core.$strip>>>;
77
+ rubric: z.ZodDefault<z.ZodArray<z.ZodString>>;
78
+ tags: z.ZodDefault<z.ZodArray<z.ZodString>>;
79
+ }, z.core.$strip>;
80
+ export declare const evalCaseSchema: z.ZodObject<{ // Declared with exactly the same shape as agentEvalCaseSchema.
81
+ id: z.ZodString; // required
82
+ kind: z.ZodLiteral<"coding">;
83
+ description: z.ZodOptional<z.ZodString>;
84
+ prompt: z.ZodString; // required
85
+ model: z.ZodOptional<z.ZodString>;
86
+ maxSteps: z.ZodOptional<z.ZodNumber>;
87
+ setup: z.ZodDefault<z.ZodObject<{ // defaulted setup object
88
+ copyFiles: z.ZodOptional<z.ZodRecord<z.ZodString, z.ZodString>>;
89
+ files: z.ZodOptional<z.ZodRecord<z.ZodString, z.ZodString>>;
90
+ commands: z.ZodOptional<z.ZodArray<z.ZodObject<{
91
+ name: z.ZodOptional<z.ZodString>;
92
+ command: z.ZodString;
93
+ timeoutMs: z.ZodOptional<z.ZodNumber>;
94
+ }, z.core.$strip>>>;
95
+ commitMessage: z.ZodOptional<z.ZodString>;
96
+ }, z.core.$strip>>;
97
+ fixture: z.ZodDefault<z.ZodDiscriminatedUnion<[z.ZodObject<{ // defaulted union discriminated on "type"
98
+ type: z.ZodLiteral<"inline">;
99
+ }, z.core.$strip>, z.ZodObject<{
100
+ type: z.ZodLiteral<"git-worktree">;
101
+ repo: z.ZodDefault<z.ZodString>;
102
+ ref: z.ZodString;
103
+ }, z.core.$strip>], "type">>;
104
+ review: z.ZodDefault<z.ZodObject<{ // defaulted review metadata
105
+ milestone: z.ZodOptional<z.ZodString>;
106
+ intent: z.ZodOptional<z.ZodString>;
107
+ requiredOutcomes: z.ZodDefault<z.ZodArray<z.ZodString>>;
108
+ allowedScope: z.ZodDefault<z.ZodArray<z.ZodString>>;
109
+ outOfScope: z.ZodDefault<z.ZodArray<z.ZodString>>;
110
+ humanQuestions: z.ZodDefault<z.ZodArray<z.ZodString>>;
111
+ }, z.core.$strip>>;
112
+ checks: z.ZodDefault<z.ZodArray<z.ZodObject<{ // defaulted array of check objects
113
+ name: z.ZodOptional<z.ZodString>;
114
+ command: z.ZodString;
115
+ timeoutMs: z.ZodOptional<z.ZodNumber>;
116
+ }, z.core.$strip>>>;
117
+ rubric: z.ZodDefault<z.ZodArray<z.ZodString>>;
118
+ tags: z.ZodDefault<z.ZodArray<z.ZodString>>;
119
+ }, z.core.$strip>;
120
+ export type EvalCheck = z.infer<typeof evalCheckSchema>; // Static types derived from the schemas via z.infer, so they track schema changes automatically.
121
+ export type EvalSetup = z.infer<typeof evalSetupSchema>;
122
+ export type EvalFixture = z.infer<typeof evalFixtureSchema>; // union of the two fixture variants
123
+ export type EvalMilestoneReview = z.infer<typeof evalMilestoneReviewSchema>;
124
+ export type AgentEvalCase = z.infer<typeof agentEvalCaseSchema>;
125
+ export type EvalCase = z.infer<typeof evalCaseSchema>; // structurally the same as AgentEvalCase, since both schemas are declared with the same shape
126
+ export type EvalCheckResult = { // Recorded outcome of running one check command.
127
+ name: string; // required here, although `name` is optional on the check schema
128
+ command: string;
129
+ exitCode: number | null; // NOTE(review): null presumably means the process produced no exit code (e.g. killed or timed out) - confirm in the runner
130
+ stdout: string;
131
+ stderr: string;
132
+ durationMs: number;
133
+ passed: boolean;
134
+ timedOut: boolean;
135
+ };
136
+ export type EvalChangedFile = { // One entry in a run's list of changed files.
137
+ path: string;
138
+ status: string; // free-form string; the set of possible values is not constrained by this type
139
+ additions?: number; // optional line counts
140
+ deletions?: number;
141
+ };
142
+ export type EvalTraceMetrics = { // Metrics object returned by analyzeTrace / analyzeTraceFiles in trace-analyzer.
143
+ assistantTurns: number; // counters (all required)
144
+ toolCalls: number;
145
+ toolResults: number;
146
+ mutations: number;
147
+ approvalsRequested: number;
148
+ approvalsResolved: number;
149
+ toolErrors: number;
150
+ verificationCommandsAfterMutation: number;
151
+ firstMutationStep?: number; // NOTE(review): presumably absent when the trace contains no mutation - confirm in trace-analyzer
152
+ outcome?: string;
153
+ summary?: string;
154
+ toolsByName: Record<string, number>; // string-keyed map to numbers; per the name, a per-tool count
155
+ readOrSearchBeforeMutation: string[];
156
+ verificationCommandDetails: string[];
157
+ };
158
+ export type EvalRunResult = { // Full record of one eval case run: identity, timing, agent process info, artifact paths, check results, metrics.
159
+ caseId: string;
160
+ target: string;
161
+ status: 'passed' | 'failed'; // only two outcomes are representable
162
+ workspaceRoot: string;
163
+ outputDir: string;
164
+ fixture: { // fixture details; everything except `type` is optional
165
+ type: EvalFixture['type']; // 'inline' | 'git-worktree', taken from the fixture schema
166
+ repo?: string;
167
+ ref?: string;
168
+ resolvedRef?: string;
169
+ baselineCommit?: string;
170
+ };
171
+ startedAt: string; // NOTE(review): timestamp format is not visible in this declaration - presumably ISO 8601, confirm in the runner
172
+ finishedAt: string;
173
+ durationMs: number;
174
+ agent: { // the agent process invocation and where its output was written
175
+ command: string[]; // argv-style list
176
+ exitCode: number | null; // NOTE(review): null presumably means no exit code was produced - confirm in the runner
177
+ stdoutPath: string;
178
+ stderrPath: string;
179
+ timedOut: boolean;
180
+ };
181
+ artifacts: { // paths to captured artifacts, plus the parsed changed-file list
182
+ gitStatusPath: string;
183
+ gitDiffPath: string;
184
+ gitDiffStatPath: string;
185
+ changedFilesPath: string;
186
+ progressPath?: string; // optional artifacts
187
+ sessionCatalogPath?: string;
188
+ traceFiles: string[];
189
+ changedFiles: EvalChangedFile[];
190
+ };
191
+ checks: EvalCheckResult[]; // one result per executed check
192
+ metrics: EvalTraceMetrics;
193
+ review: EvalMilestoneReview; // review metadata in its parsed form
194
+ model?: string;
195
+ maxSteps?: number;
196
+ };
197
+ export type EvalSuiteReport = { // Top-level report aggregating the results of a suite run.
198
+ version: 1; // literal type: only report format version 1 is representable
199
+ target: string;
200
+ repoRoot: string;
201
+ startedAt: string;
202
+ finishedAt: string;
203
+ resultsDir: string;
204
+ results: EvalRunResult[]; // one entry per case run
205
+ };
206
+ //# sourceMappingURL=schema.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"schema.d.ts","sourceRoot":"","sources":["../../../../src/core/eval/schema.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,CAAC,EAAE,MAAM,KAAK,CAAC;AAExB,eAAO,MAAM,eAAe;;;;iBASoG,CAAC;AAEjI,eAAO,MAAM,eAAe;;;;;;;;;iBAmB8E,CAAC;AAE3G,eAAO,MAAM,uBAAuB;;iBAG+C,CAAC;AAEpF,eAAO,MAAM,4BAA4B;;;;iBAQ8C,CAAC;AAExF,eAAO,MAAM,iBAAiB;;;;;;2BAGyC,CAAC;AAExE,eAAO,MAAM,yBAAyB;;;;;;;iBA2BoE,CAAC;AAE3G,eAAO,MAAM,mBAAmB;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;iBA2CmF,CAAC;AAEpH,eAAO,MAAM,cAAc;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;iBAAsB,CAAC;AAElD,MAAM,MAAM,SAAS,GAAG,CAAC,CAAC,KAAK,CAAC,OAAO,eAAe,CAAC,CAAC;AACxD,MAAM,MAAM,SAAS,GAAG,CAAC,CAAC,KAAK,CAAC,OAAO,eAAe,CAAC,CAAC;AACxD,MAAM,MAAM,WAAW,GAAG,CAAC,CAAC,KAAK,CAAC,OAAO,iBAAiB,CAAC,CAAC;AAC5D,MAAM,MAAM,mBAAmB,GAAG,CAAC,CAAC,KAAK,CAAC,OAAO,yBAAyB,CAAC,CAAC;AAC5E,MAAM,MAAM,aAAa,GAAG,CAAC,CAAC,KAAK,CAAC,OAAO,mBAAmB,CAAC,CAAC;AAChE,MAAM,MAAM,QAAQ,GAAG,CAAC,CAAC,KAAK,CAAC,OAAO,cAAc,CAAC,CAAC;AAEtD,MAAM,MAAM,eAAe,GAAG;IAC5B,IAAI,EAAE,MAAM,CAAC;IACb,OAAO,EAAE,MAAM,CAAC;IAChB,QAAQ,EAAE,MAAM,GAAG,IAAI,CAAC;IACxB,MAAM,EAAE,MAAM,CAAC;IACf,MAAM,EAAE,MAAM,CAAC;IACf,UAAU,EAAE,MAAM,CAAC;IACnB,MAAM,EAAE,OAAO,CAAC;IAChB,QAAQ,EAAE,OAAO,CAAC;CACnB,CAAC;AAEF,MAAM,MAAM,eAAe,GAAG;IAC5B,IAAI,EAAE,MAAM,CAAC;IACb,MAAM,EAAE,MAAM,CAAC;IACf,SAAS,CAAC,EAAE,MAAM,CAAC;IACnB,SAAS,CAAC,EAAE,MAAM,CAAC;CACpB,CAAC;AAEF,MAAM,MAAM,gBAAgB,GAAG;IAC7B,cAAc,EAAE,MAAM,CAAC;IACvB,SAAS,EAAE,MAAM,CAAC;IAClB,WAAW,EAAE,MAAM,CAAC;IACpB,SAAS,EAAE,MAAM,CAAC;IAClB,kBAAkB,EAAE,MAAM,CAAC;IAC3B,iBAAiB,EAAE,MAAM,CAAC;IAC1B,UAAU,EAAE,MAAM,CAAC;IACnB,iCAAiC,EAAE,MAAM,CAAC;IAC1C,iBAAiB,CAAC,EAAE,MAAM,CAAC;IAC3B,OAAO,CAAC,EAAE,MAAM,CAAC;IACjB,OAAO,CAAC,EAAE,MAAM,CAAC;IACjB,WAAW,EAAE,MAAM,CAAC,MAAM,EAAE,MAAM,CAAC,CAAC;IACpC,0BAA0B,EAAE,MAAM,EAAE,CAAC;IACrC,0BAA0B,EAAE,MAAM,EAAE,CAAC;CACtC,CAAC;AAEF,MAAM,MAAM,aAAa,GAAG;IAC1B,MAAM,EAAE,MAAM,CAAC;IACf,MAAM,EAAE,MAAM,CAAC;IACf,MAAM,EAAE,QAAQ,GAAG,QAAQ,CAAC;IAC5B,aAAa,EAAE,MAAM,CAAC;IACtB,SAAS,EAAE,MAAM,CAAC;IAClB,OAAO,EAAE;QACP,IAAI
,EAAE,WAAW,CAAC,MAAM,CAAC,CAAC;QAC1B,IAAI,CAAC,EAAE,MAAM,CAAC;QACd,GAAG,CAAC,EAAE,MAAM,CAAC;QACb,WAAW,CAAC,EAAE,MAAM,CAAC;QACrB,cAAc,CAAC,EAAE,MAAM,CAAC;KACzB,CAAC;IACF,SAAS,EAAE,MAAM,CAAC;IAClB,UAAU,EAAE,MAAM,CAAC;IACnB,UAAU,EAAE,MAAM,CAAC;IACnB,KAAK,EAAE;QACL,OAAO,EAAE,MAAM,EAAE,CAAC;QAClB,QAAQ,EAAE,MAAM,GAAG,IAAI,CAAC;QACxB,UAAU,EAAE,MAAM,CAAC;QACnB,UAAU,EAAE,MAAM,CAAC;QACnB,QAAQ,EAAE,OAAO,CAAC;KACnB,CAAC;IACF,SAAS,EAAE;QACT,aAAa,EAAE,MAAM,CAAC;QACtB,WAAW,EAAE,MAAM,CAAC;QACpB,eAAe,EAAE,MAAM,CAAC;QACxB,gBAAgB,EAAE,MAAM,CAAC;QACzB,YAAY,CAAC,EAAE,MAAM,CAAC;QACtB,kBAAkB,CAAC,EAAE,MAAM,CAAC;QAC5B,UAAU,EAAE,MAAM,EAAE,CAAC;QACrB,YAAY,EAAE,eAAe,EAAE,CAAC;KACjC,CAAC;IACF,MAAM,EAAE,eAAe,EAAE,CAAC;IAC1B,OAAO,EAAE,gBAAgB,CAAC;IAC1B,MAAM,EAAE,mBAAmB,CAAC;IAC5B,KAAK,CAAC,EAAE,MAAM,CAAC;IACf,QAAQ,CAAC,EAAE,MAAM,CAAC;CACnB,CAAC;AAEF,MAAM,MAAM,eAAe,GAAG;IAC5B,OAAO,EAAE,CAAC,CAAC;IACX,MAAM,EAAE,MAAM,CAAC;IACf,QAAQ,EAAE,MAAM,CAAC;IACjB,SAAS,EAAE,MAAM,CAAC;IAClB,UAAU,EAAE,MAAM,CAAC;IACnB,UAAU,EAAE,MAAM,CAAC;IACnB,OAAO,EAAE,aAAa,EAAE,CAAC;CAC1B,CAAC"}
@@ -0,0 +1,104 @@
1
+ import { z } from 'zod';
2
+ export const evalCheckSchema = z.object({ // Schema for one shell-command check; also used as the element schema for setup commands and case checks.
3
+ name: z.string().trim().min(1) // trimmed before the length check, so whitespace-only names are rejected
4
+ .describe('Human-readable label for this check in reports. Defaults to the command when omitted.')
5
+ .optional(), // no .default() here: the fallback to the command is not applied by this schema (presumably by the runner - confirm)
6
+ command: z.string().trim().min(1) // required; must be non-empty after trimming
7
+ .describe('Shell command to run inside the disposable eval workspace after the agent finishes. Exit code 0 means the check passed.'),
8
+ timeoutMs: z.number().int().positive() // positive integer only
9
+ .describe('Optional timeout for this check command in milliseconds.')
10
+ .optional(),
11
+ }).describe('A deterministic post-run command used to decide whether the agent-produced workspace passes objective validation.');
12
+ export const evalSetupSchema = z.object({ // Schema describing how the disposable workspace is prepared; every field is optional.
13
+ copyFiles: z.record(z.string().trim().min(1).describe('Repository-relative source file path to copy from the eval harness repo.'), z.string().trim().min(1).describe('Workspace-relative destination file path to create before the eval starts.')) // record of source path -> destination path; both sides must be non-empty after trimming
14
+ .describe('Fixture files to copy from the eval harness repo into the disposable workspace before committing the initial Git baseline.')
15
+ .optional(),
16
+ files: z.record(z.string().trim().min(1).describe('Workspace-relative file path to create before the eval starts.'), z.string().describe('Exact UTF-8 file contents to write for the fixture file.')) // record of path -> contents; contents are not trimmed and may be empty
17
+ .describe('Fixture files to write into the disposable workspace before committing the initial Git baseline.')
18
+ .optional(),
19
+ commands: z.array(evalCheckSchema) // reuses the check schema, so setup commands accept name/command/timeoutMs
20
+ .describe('Setup commands to run after fixture files are written and before the initial Git baseline is committed.')
21
+ .optional(),
22
+ commitMessage: z.string().trim().min(1)
23
+ .describe('Commit message for the initial fixture Git baseline. Defaults to a generated eval fixture message.')
24
+ .optional(), // no .default() here: the generated message is not produced by this schema
25
+ }).describe('Instructions for creating the disposable repository state that the agent will work against.');
26
+ export const evalInlineFixtureSchema = z.object({ // Fixture variant with no fields other than its discriminant.
27
+ type: z.literal('inline') // discriminant value used by evalFixtureSchema
28
+ .describe('Create a small disposable Git repository from the case setup files and commands.'),
29
+ }).describe('A synthetic disposable repository built from inline eval setup data.');
30
+ export const evalGitWorktreeFixtureSchema = z.object({ // Fixture variant backed by an existing repository at a given ref.
31
+ type: z.literal('git-worktree') // discriminant value used by evalFixtureSchema
32
+ .describe('Create a disposable Git worktree from an existing repository at a pinned ref.'),
33
+ repo: z.string().trim().min(1)
34
+ .describe('Repository path to create the worktree from. Relative paths are resolved from the Heddle repo root.')
35
+ .default('.'), // omitted repo falls back to '.'
36
+ ref: z.string().trim().min(1) // required; there is no default ref
37
+ .describe('Pinned target ref for the worktree, such as a release tag or commit SHA. Avoid moving HEAD for comparable evals.'),
38
+ }).describe('A realistic disposable repository fixture created from a pinned Git ref.');
39
+ export const evalFixtureSchema = z.discriminatedUnion('type', [ // The `type` field selects which variant schema validates the value.
40
+ evalInlineFixtureSchema, // type: 'inline'
41
+ evalGitWorktreeFixtureSchema, // type: 'git-worktree'
42
+ ]).describe('How to prepare the disposable workspace the agent edits.');
43
+ export const evalMilestoneReviewSchema = z.object({ // Human-review metadata: two optional strings and four string lists that default to empty.
44
+ milestone: z.string().trim().min(1)
45
+ .describe('Short name for the user-intended milestone this case evaluates.')
46
+ .optional(),
47
+ intent: z.string().trim().min(1)
48
+ .describe('Human-readable statement of what the agent should accomplish beyond merely passing checks.')
49
+ .optional(),
50
+ requiredOutcomes: z.array(z.string().trim().min(1).describe('Observable outcome a human reviewer should look for in the final diff, trace, or answer.')) // each entry must be non-empty after trimming
51
+ .describe('Milestone outcomes expected for a high-quality completion.')
52
+ .default([]), // a missing list becomes an empty array
53
+ allowedScope: z.array(z.string().trim().min(1).describe('Files, modules, or behavior areas the agent is allowed or expected to touch.'))
54
+ .describe('Expected implementation scope for judging whether the diff stayed on task.')
55
+ .default([]),
56
+ outOfScope: z.array(z.string().trim().min(1).describe('Files, modules, or behavior areas that should not be changed for this case.'))
57
+ .describe('Boundaries a human reviewer should use to spot unrelated churn.')
58
+ .default([]),
59
+ humanQuestions: z.array(z.string().trim().min(1).describe('Question for human review after the run completes.'))
60
+ .describe('Review prompts that help judge task completion quality beyond deterministic checks.')
61
+ .default([]),
62
+ }).describe('Human-review metadata for milestone-style eval cases where pass/fail checks are not enough.');
63
+ export const agentEvalCaseSchema = z.object({
64
+ id: z.string().trim().regex(/^[a-zA-Z0-9._-]+$/, 'Use a filesystem-safe case id.')
65
+ .describe('Stable filesystem-safe case id used in result paths, filtering, and reports.'),
66
+ kind: z.literal('coding')
67
+ .describe('Eval case type. The first harness slice supports coding cases run through ask --new-session.'),
68
+ description: z.string().trim()
69
+ .describe('Optional short explanation of what behavior this case is meant to exercise.')
70
+ .optional(),
71
+ prompt: z.string().trim().min(1)
72
+ .describe('User prompt sent to Heddle in the disposable workspace. This should ask for real coding work, not just Q&A.'),
73
+ model: z.string().trim().min(1)
74
+ .describe('Optional model override for this case. The CLI-level --model takes precedence when supplied.')
75
+ .optional(),
76
+ maxSteps: z.number().int().positive()
77
+ .describe('Optional maximum agent loop steps for this case. The CLI-level --max-steps takes precedence when supplied.')
78
+ .optional(),
79
+ setup: evalSetupSchema
80
+ .describe('Disposable workspace setup for this case.')
81
+ .default({}),
82
+ fixture: evalFixtureSchema
83
+ .describe('Workspace fixture source. Defaults to an inline synthetic repository.')
84
+ .default({ type: 'inline' }),
85
+ review: evalMilestoneReviewSchema
86
+ .describe('Optional milestone-completion review guidance included in reports.')
87
+ .default({
88
+ requiredOutcomes: [],
89
+ allowedScope: [],
90
+ outOfScope: [],
91
+ humanQuestions: [],
92
+ }),
93
+ checks: z.array(evalCheckSchema)
94
+ .describe('Deterministic post-agent commands that must pass for the case to be marked passed.')
95
+ .default([]),
96
+ rubric: z.array(z.string().trim().min(1).describe('Qualitative behavior criterion for human or future LLM judging.'))
97
+ .describe('Non-deterministic quality criteria preserved in reports for manual or future judge review.')
98
+ .default([]),
99
+ tags: z.array(z.string().trim().min(1).describe('Free-form label for filtering or grouping eval cases.'))
100
+ .describe('Case labels such as bugfix, refactor, verification, multi-file, or tui.')
101
+ .default([]),
102
+ }).describe('A live coding-task eval case run in a disposable Git workspace through Heddle ask/session execution.');
103
+ export const evalCaseSchema = agentEvalCaseSchema; // Alias: the very same schema object under a second exported name, not a copy.
104
+ //# sourceMappingURL=schema.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"schema.js","sourceRoot":"","sources":["../../../../src/core/eval/schema.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,CAAC,EAAE,MAAM,KAAK,CAAC;AAExB,MAAM,CAAC,MAAM,eAAe,GAAG,CAAC,CAAC,MAAM,CAAC;IACtC,IAAI,EAAE,CAAC,CAAC,MAAM,EAAE,CAAC,IAAI,EAAE,CAAC,GAAG,CAAC,CAAC,CAAC;SAC3B,QAAQ,CAAC,uFAAuF,CAAC;SACjG,QAAQ,EAAE;IACb,OAAO,EAAE,CAAC,CAAC,MAAM,EAAE,CAAC,IAAI,EAAE,CAAC,GAAG,CAAC,CAAC,CAAC;SAC9B,QAAQ,CAAC,yHAAyH,CAAC;IACtI,SAAS,EAAE,CAAC,CAAC,MAAM,EAAE,CAAC,GAAG,EAAE,CAAC,QAAQ,EAAE;SACnC,QAAQ,CAAC,0DAA0D,CAAC;SACpE,QAAQ,EAAE;CACd,CAAC,CAAC,QAAQ,CAAC,mHAAmH,CAAC,CAAC;AAEjI,MAAM,CAAC,MAAM,eAAe,GAAG,CAAC,CAAC,MAAM,CAAC;IACtC,SAAS,EAAE,CAAC,CAAC,MAAM,CACjB,CAAC,CAAC,MAAM,EAAE,CAAC,IAAI,EAAE,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,QAAQ,CAAC,0EAA0E,CAAC,EAC7G,CAAC,CAAC,MAAM,EAAE,CAAC,IAAI,EAAE,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,QAAQ,CAAC,4EAA4E,CAAC,CAChH;SACE,QAAQ,CAAC,4HAA4H,CAAC;SACtI,QAAQ,EAAE;IACb,KAAK,EAAE,CAAC,CAAC,MAAM,CACb,CAAC,CAAC,MAAM,EAAE,CAAC,IAAI,EAAE,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,QAAQ,CAAC,gEAAgE,CAAC,EACnG,CAAC,CAAC,MAAM,EAAE,CAAC,QAAQ,CAAC,0DAA0D,CAAC,CAChF;SACE,QAAQ,CAAC,kGAAkG,CAAC;SAC5G,QAAQ,EAAE;IACb,QAAQ,EAAE,CAAC,CAAC,KAAK,CAAC,eAAe,CAAC;SAC/B,QAAQ,CAAC,yGAAyG,CAAC;SACnH,QAAQ,EAAE;IACb,aAAa,EAAE,CAAC,CAAC,MAAM,EAAE,CAAC,IAAI,EAAE,CAAC,GAAG,CAAC,CAAC,CAAC;SACpC,QAAQ,CAAC,oGAAoG,CAAC;SAC9G,QAAQ,EAAE;CACd,CAAC,CAAC,QAAQ,CAAC,6FAA6F,CAAC,CAAC;AAE3G,MAAM,CAAC,MAAM,uBAAuB,GAAG,CAAC,CAAC,MAAM,CAAC;IAC9C,IAAI,EAAE,CAAC,CAAC,OAAO,CAAC,QAAQ,CAAC;SACtB,QAAQ,CAAC,kFAAkF,CAAC;CAChG,CAAC,CAAC,QAAQ,CAAC,sEAAsE,CAAC,CAAC;AAEpF,MAAM,CAAC,MAAM,4BAA4B,GAAG,CAAC,CAAC,MAAM,CAAC;IACnD,IAAI,EAAE,CAAC,CAAC,OAAO,CAAC,cAAc,CAAC;SAC5B,QAAQ,CAAC,+EAA+E,CAAC;IAC5F,IAAI,EAAE,CAAC,CAAC,MAAM,EAAE,CAAC,IAAI,EAAE,CAAC,GAAG,CAAC,CAAC,CAAC;SAC3B,QAAQ,CAAC,qGAAqG,CAAC;SAC/G,OAAO,CAAC,GAAG,CAAC;IACf,GAAG,EAAE,CAAC,CAAC,MAAM,EAAE,CAAC,IAAI,EAAE,CAAC,GAAG,CAAC,CAAC,CAAC;SAC1B,QAAQ,CAAC,kHAAkH,CAAC;CAChI,CAAC,CAAC,QAAQ,CAAC,0EAA0E,CAAC,CAAC;AAExF,MAAM,CAAC,MAAM,iBAAiB,GAAG,CAAC,CAAC,kBAA
kB,CAAC,MAAM,EAAE;IAC5D,uBAAuB;IACvB,4BAA4B;CAC7B,CAAC,CAAC,QAAQ,CAAC,0DAA0D,CAAC,CAAC;AAExE,MAAM,CAAC,MAAM,yBAAyB,GAAG,CAAC,CAAC,MAAM,CAAC;IAChD,SAAS,EAAE,CAAC,CAAC,MAAM,EAAE,CAAC,IAAI,EAAE,CAAC,GAAG,CAAC,CAAC,CAAC;SAChC,QAAQ,CAAC,iEAAiE,CAAC;SAC3E,QAAQ,EAAE;IACb,MAAM,EAAE,CAAC,CAAC,MAAM,EAAE,CAAC,IAAI,EAAE,CAAC,GAAG,CAAC,CAAC,CAAC;SAC7B,QAAQ,CAAC,4FAA4F,CAAC;SACtG,QAAQ,EAAE;IACb,gBAAgB,EAAE,CAAC,CAAC,KAAK,CACvB,CAAC,CAAC,MAAM,EAAE,CAAC,IAAI,EAAE,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,QAAQ,CAAC,0FAA0F,CAAC,CAC9H;SACE,QAAQ,CAAC,4DAA4D,CAAC;SACtE,OAAO,CAAC,EAAE,CAAC;IACd,YAAY,EAAE,CAAC,CAAC,KAAK,CACnB,CAAC,CAAC,MAAM,EAAE,CAAC,IAAI,EAAE,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,QAAQ,CAAC,8EAA8E,CAAC,CAClH;SACE,QAAQ,CAAC,4EAA4E,CAAC;SACtF,OAAO,CAAC,EAAE,CAAC;IACd,UAAU,EAAE,CAAC,CAAC,KAAK,CACjB,CAAC,CAAC,MAAM,EAAE,CAAC,IAAI,EAAE,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,QAAQ,CAAC,6EAA6E,CAAC,CACjH;SACE,QAAQ,CAAC,iEAAiE,CAAC;SAC3E,OAAO,CAAC,EAAE,CAAC;IACd,cAAc,EAAE,CAAC,CAAC,KAAK,CACrB,CAAC,CAAC,MAAM,EAAE,CAAC,IAAI,EAAE,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,QAAQ,CAAC,oDAAoD,CAAC,CACxF;SACE,QAAQ,CAAC,qFAAqF,CAAC;SAC/F,OAAO,CAAC,EAAE,CAAC;CACf,CAAC,CAAC,QAAQ,CAAC,6FAA6F,CAAC,CAAC;AAE3G,MAAM,CAAC,MAAM,mBAAmB,GAAG,CAAC,CAAC,MAAM,CAAC;IAC1C,EAAE,EAAE,CAAC,CAAC,MAAM,EAAE,CAAC,IAAI,EAAE,CAAC,KAAK,CAAC,mBAAmB,EAAE,gCAAgC,CAAC;SAC/E,QAAQ,CAAC,8EAA8E,CAAC;IAC3F,IAAI,EAAE,CAAC,CAAC,OAAO,CAAC,QAAQ,CAAC;SACtB,QAAQ,CAAC,8FAA8F,CAAC;IAC3G,WAAW,EAAE,CAAC,CAAC,MAAM,EAAE,CAAC,IAAI,EAAE;SAC3B,QAAQ,CAAC,6EAA6E,CAAC;SACvF,QAAQ,EAAE;IACb,MAAM,EAAE,CAAC,CAAC,MAAM,EAAE,CAAC,IAAI,EAAE,CAAC,GAAG,CAAC,CAAC,CAAC;SAC7B,QAAQ,CAAC,6GAA6G,CAAC;IAC1H,KAAK,EAAE,CAAC,CAAC,MAAM,EAAE,CAAC,IAAI,EAAE,CAAC,GAAG,CAAC,CAAC,CAAC;SAC5B,QAAQ,CAAC,8FAA8F,CAAC;SACxG,QAAQ,EAAE;IACb,QAAQ,EAAE,CAAC,CAAC,MAAM,EAAE,CAAC,GAAG,EAAE,CAAC,QAAQ,EAAE;SAClC,QAAQ,CAAC,4GAA4G,CAAC;SACtH,QAAQ,EAAE;IACb,KAAK,EAAE,eAAe;SACnB,QAAQ,CAAC,2CAA2C,CAAC;SACrD,OAAO,CAAC,EAAE,CAAC;IACd,OAAO,EAAE,iBAAiB;SACvB,QAAQ,CAAC,uEAAuE,CAAC;SACjF,OAAO,CAAC,EAAE,IAAI,EAAE,QAAQ,EAAE,CAAC;IAC9B,M
AAM,EAAE,yBAAyB;SAC9B,QAAQ,CAAC,oEAAoE,CAAC;SAC9E,OAAO,CAAC;QACP,gBAAgB,EAAE,EAAE;QACpB,YAAY,EAAE,EAAE;QAChB,UAAU,EAAE,EAAE;QACd,cAAc,EAAE,EAAE;KACnB,CAAC;IACJ,MAAM,EAAE,CAAC,CAAC,KAAK,CAAC,eAAe,CAAC;SAC7B,QAAQ,CAAC,oFAAoF,CAAC;SAC9F,OAAO,CAAC,EAAE,CAAC;IACd,MAAM,EAAE,CAAC,CAAC,KAAK,CACb,CAAC,CAAC,MAAM,EAAE,CAAC,IAAI,EAAE,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,QAAQ,CAAC,iEAAiE,CAAC,CACrG;SACE,QAAQ,CAAC,4FAA4F,CAAC;SACtG,OAAO,CAAC,EAAE,CAAC;IACd,IAAI,EAAE,CAAC,CAAC,KAAK,CACX,CAAC,CAAC,MAAM,EAAE,CAAC,IAAI,EAAE,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,QAAQ,CAAC,uDAAuD,CAAC,CAC3F;SACE,QAAQ,CAAC,yEAAyE,CAAC;SACnF,OAAO,CAAC,EAAE,CAAC;CACf,CAAC,CAAC,QAAQ,CAAC,sGAAsG,CAAC,CAAC;AAEpH,MAAM,CAAC,MAAM,cAAc,GAAG,mBAAmB,CAAC"}
@@ -0,0 +1,6 @@
1
+ import type { TraceEvent } from '../types.js';
2
+ import type { EvalTraceMetrics } from './schema.js';
3
+ export declare function analyzeTraceFiles(paths: string[]): EvalTraceMetrics; // Takes a list of paths and returns a single metrics object. NOTE(review): presumably reads each path and analyzes the combined events - confirm in the implementation.
4
+ export declare function analyzeTrace(trace: TraceEvent[]): EvalTraceMetrics; // Computes the metrics object from an in-memory list of trace events.
5
+ export declare function readTraceFile(path: string): TraceEvent[]; // Returns the trace events for one path; synchronous per the non-Promise return type. NOTE(review): file format and error behavior are not visible here.
6
+ //# sourceMappingURL=trace-analyzer.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"trace-analyzer.d.ts","sourceRoot":"","sources":["../../../../src/core/eval/trace-analyzer.ts"],"names":[],"mappings":"AACA,OAAO,KAAK,EAAE,UAAU,EAAE,MAAM,aAAa,CAAC;AAC9C,OAAO,KAAK,EAAE,gBAAgB,EAAE,MAAM,aAAa,CAAC;AAKpD,wBAAgB,iBAAiB,CAAC,KAAK,EAAE,MAAM,EAAE,GAAG,gBAAgB,CAEnE;AAED,wBAAgB,YAAY,CAAC,KAAK,EAAE,UAAU,EAAE,GAAG,gBAAgB,CAiFlE;AAED,wBAAgB,aAAa,CAAC,IAAI,EAAE,MAAM,GAAG,UAAU,EAAE,CAMxD"}