@xera-ai/core 0.1.6 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (210) hide show
  1. package/dist/bin/internal.js +2039 -725
  2. package/dist/{adapter → core/src/adapter}/types.d.ts +1 -1
  3. package/dist/core/src/adapter/types.d.ts.map +1 -0
  4. package/dist/core/src/artifact/hash.d.ts.map +1 -0
  5. package/dist/core/src/artifact/meta.d.ts.map +1 -0
  6. package/dist/core/src/artifact/paths.d.ts.map +1 -0
  7. package/dist/core/src/artifact/status.d.ts.map +1 -0
  8. package/dist/core/src/auth/encrypt.d.ts.map +1 -0
  9. package/dist/core/src/auth/key.d.ts.map +1 -0
  10. package/dist/core/src/auth/refresh.d.ts.map +1 -0
  11. package/dist/core/src/auth/state.d.ts.map +1 -0
  12. package/dist/core/src/bin-internal/doctor.d.ts +5 -0
  13. package/dist/core/src/bin-internal/doctor.d.ts.map +1 -0
  14. package/dist/core/src/bin-internal/eval-deterministic.d.ts +5 -0
  15. package/dist/core/src/bin-internal/eval-deterministic.d.ts.map +1 -0
  16. package/dist/core/src/bin-internal/eval-prepare.d.ts +7 -0
  17. package/dist/core/src/bin-internal/eval-prepare.d.ts.map +1 -0
  18. package/dist/core/src/bin-internal/eval-report.d.ts +5 -0
  19. package/dist/core/src/bin-internal/eval-report.d.ts.map +1 -0
  20. package/dist/core/src/bin-internal/exec.d.ts.map +1 -0
  21. package/dist/core/src/bin-internal/fetch.d.ts.map +1 -0
  22. package/dist/core/src/bin-internal/heal-prepare.d.ts +19 -0
  23. package/dist/core/src/bin-internal/heal-prepare.d.ts.map +1 -0
  24. package/dist/core/src/bin-internal/index.d.ts.map +1 -0
  25. package/dist/core/src/bin-internal/lint.d.ts.map +1 -0
  26. package/dist/core/src/bin-internal/normalize.d.ts.map +1 -0
  27. package/dist/core/src/bin-internal/post.d.ts.map +1 -0
  28. package/dist/core/src/bin-internal/promote.d.ts.map +1 -0
  29. package/dist/core/src/bin-internal/report.d.ts.map +1 -0
  30. package/dist/core/src/bin-internal/status-cmd.d.ts.map +1 -0
  31. package/dist/core/src/bin-internal/typecheck.d.ts.map +1 -0
  32. package/dist/core/src/bin-internal/unlock.d.ts.map +1 -0
  33. package/dist/core/src/bin-internal/validate-feature.d.ts.map +1 -0
  34. package/dist/core/src/bin-internal/verify-prompts.d.ts +7 -0
  35. package/dist/core/src/bin-internal/verify-prompts.d.ts.map +1 -0
  36. package/dist/core/src/classifier/aggregate.d.ts.map +1 -0
  37. package/dist/core/src/classifier/history.d.ts.map +1 -0
  38. package/dist/core/src/classifier/types.d.ts.map +1 -0
  39. package/dist/core/src/config/define.d.ts.map +1 -0
  40. package/dist/core/src/config/load.d.ts.map +1 -0
  41. package/dist/{config → core/src/config}/schema.d.ts.map +1 -1
  42. package/dist/core/src/eval/paths.d.ts +15 -0
  43. package/dist/core/src/eval/paths.d.ts.map +1 -0
  44. package/dist/core/src/eval/run-id.d.ts +6 -0
  45. package/dist/core/src/eval/run-id.d.ts.map +1 -0
  46. package/dist/core/src/eval/types.d.ts +551 -0
  47. package/dist/core/src/eval/types.d.ts.map +1 -0
  48. package/dist/core/src/index.d.ts.map +1 -0
  49. package/dist/core/src/jira/client.d.ts.map +1 -0
  50. package/dist/core/src/jira/fields.d.ts.map +1 -0
  51. package/dist/core/src/jira/mcp-backend.d.ts.map +1 -0
  52. package/dist/core/src/jira/rest-backend.d.ts.map +1 -0
  53. package/dist/core/src/jira/retry.d.ts.map +1 -0
  54. package/dist/core/src/jira/types.d.ts.map +1 -0
  55. package/dist/core/src/lock/file-lock.d.ts.map +1 -0
  56. package/dist/core/src/logging/ndjson-logger.d.ts.map +1 -0
  57. package/dist/core/src/reporter/jira-comment.d.ts.map +1 -0
  58. package/dist/core/src/reporter/status-writer.d.ts.map +1 -0
  59. package/dist/src/index.js +19 -12
  60. package/dist/web/src/adapter.d.ts +3 -0
  61. package/dist/web/src/adapter.d.ts.map +1 -0
  62. package/dist/web/src/auth-setup/define.d.ts +16 -0
  63. package/dist/web/src/auth-setup/define.d.ts.map +1 -0
  64. package/dist/web/src/auth-setup/playwright-state.d.ts +2 -0
  65. package/dist/web/src/auth-setup/playwright-state.d.ts.map +1 -0
  66. package/dist/web/src/auth-setup/runner.d.ts +12 -0
  67. package/dist/web/src/auth-setup/runner.d.ts.map +1 -0
  68. package/dist/web/src/executor/index.d.ts +18 -0
  69. package/dist/web/src/executor/index.d.ts.map +1 -0
  70. package/dist/web/src/executor/playwright-args.d.ts +7 -0
  71. package/dist/web/src/executor/playwright-args.d.ts.map +1 -0
  72. package/dist/web/src/generator/gherkin-validate.d.ts +9 -0
  73. package/dist/web/src/generator/gherkin-validate.d.ts.map +1 -0
  74. package/dist/web/src/generator/lint.d.ts +9 -0
  75. package/dist/web/src/generator/lint.d.ts.map +1 -0
  76. package/dist/web/src/generator/pom-scan.d.ts +6 -0
  77. package/dist/web/src/generator/pom-scan.d.ts.map +1 -0
  78. package/dist/web/src/generator/promote.d.ts +7 -0
  79. package/dist/web/src/generator/promote.d.ts.map +1 -0
  80. package/dist/web/src/generator/selector-rules.d.ts +10 -0
  81. package/dist/web/src/generator/selector-rules.d.ts.map +1 -0
  82. package/dist/web/src/generator/typecheck.d.ts +11 -0
  83. package/dist/web/src/generator/typecheck.d.ts.map +1 -0
  84. package/dist/web/src/index.d.ts +18 -0
  85. package/dist/web/src/index.d.ts.map +1 -0
  86. package/dist/web/src/trace-normalizer/normalize.d.ts +7 -0
  87. package/dist/web/src/trace-normalizer/normalize.d.ts.map +1 -0
  88. package/dist/web/src/trace-normalizer/parse.d.ts +37 -0
  89. package/dist/web/src/trace-normalizer/parse.d.ts.map +1 -0
  90. package/dist/web/src/trace-normalizer/scrub-rules.d.ts +12 -0
  91. package/dist/web/src/trace-normalizer/scrub-rules.d.ts.map +1 -0
  92. package/dist/web/src/trace-normalizer/scrub.d.ts +29 -0
  93. package/dist/web/src/trace-normalizer/scrub.d.ts.map +1 -0
  94. package/dist/web/src/trace-normalizer/unzip.d.ts +6 -0
  95. package/dist/web/src/trace-normalizer/unzip.d.ts.map +1 -0
  96. package/package.json +3 -2
  97. package/src/adapter/types.ts +5 -2
  98. package/src/artifact/meta.ts +1 -1
  99. package/src/artifact/status.ts +1 -1
  100. package/src/auth/encrypt.ts +2 -2
  101. package/src/auth/key.ts +1 -2
  102. package/src/auth/refresh.ts +4 -1
  103. package/src/auth/state.ts +2 -2
  104. package/src/bin-internal/doctor.ts +133 -0
  105. package/src/bin-internal/eval-deterministic.ts +149 -0
  106. package/src/bin-internal/eval-prepare.ts +214 -0
  107. package/src/bin-internal/eval-report.ts +177 -0
  108. package/src/bin-internal/exec.ts +38 -16
  109. package/src/bin-internal/fetch.ts +21 -10
  110. package/src/bin-internal/heal-prepare.ts +230 -0
  111. package/src/bin-internal/index.ts +25 -11
  112. package/src/bin-internal/lint.ts +11 -4
  113. package/src/bin-internal/normalize.ts +23 -9
  114. package/src/bin-internal/post.ts +10 -4
  115. package/src/bin-internal/report.ts +3 -3
  116. package/src/bin-internal/status-cmd.ts +11 -3
  117. package/src/bin-internal/typecheck.ts +9 -3
  118. package/src/bin-internal/unlock.ts +12 -4
  119. package/src/bin-internal/validate-feature.ts +14 -5
  120. package/src/bin-internal/verify-prompts.ts +59 -0
  121. package/src/classifier/aggregate.ts +13 -6
  122. package/src/config/define.ts +3 -1
  123. package/src/config/load.ts +1 -1
  124. package/src/config/schema.ts +43 -37
  125. package/src/eval/paths.ts +32 -0
  126. package/src/eval/run-id.ts +30 -0
  127. package/src/eval/types.ts +101 -0
  128. package/src/jira/client.ts +4 -2
  129. package/src/jira/fields.ts +4 -2
  130. package/src/jira/mcp-backend.ts +1 -1
  131. package/src/jira/rest-backend.ts +17 -5
  132. package/src/jira/retry.ts +2 -2
  133. package/src/lock/file-lock.ts +2 -2
  134. package/src/logging/ndjson-logger.ts +2 -2
  135. package/src/reporter/jira-comment.ts +13 -7
  136. package/src/reporter/status-writer.ts +2 -2
  137. package/dist/adapter/types.d.ts.map +0 -1
  138. package/dist/artifact/hash.d.ts.map +0 -1
  139. package/dist/artifact/meta.d.ts.map +0 -1
  140. package/dist/artifact/paths.d.ts.map +0 -1
  141. package/dist/artifact/status.d.ts.map +0 -1
  142. package/dist/auth/encrypt.d.ts.map +0 -1
  143. package/dist/auth/key.d.ts.map +0 -1
  144. package/dist/auth/refresh.d.ts.map +0 -1
  145. package/dist/auth/state.d.ts.map +0 -1
  146. package/dist/bin-internal/exec.d.ts.map +0 -1
  147. package/dist/bin-internal/fetch.d.ts.map +0 -1
  148. package/dist/bin-internal/index.d.ts.map +0 -1
  149. package/dist/bin-internal/lint.d.ts.map +0 -1
  150. package/dist/bin-internal/normalize.d.ts.map +0 -1
  151. package/dist/bin-internal/post.d.ts.map +0 -1
  152. package/dist/bin-internal/promote.d.ts.map +0 -1
  153. package/dist/bin-internal/report.d.ts.map +0 -1
  154. package/dist/bin-internal/status-cmd.d.ts.map +0 -1
  155. package/dist/bin-internal/typecheck.d.ts.map +0 -1
  156. package/dist/bin-internal/unlock.d.ts.map +0 -1
  157. package/dist/bin-internal/validate-feature.d.ts.map +0 -1
  158. package/dist/classifier/aggregate.d.ts.map +0 -1
  159. package/dist/classifier/history.d.ts.map +0 -1
  160. package/dist/classifier/types.d.ts.map +0 -1
  161. package/dist/config/define.d.ts.map +0 -1
  162. package/dist/config/load.d.ts.map +0 -1
  163. package/dist/index.d.ts.map +0 -1
  164. package/dist/jira/client.d.ts.map +0 -1
  165. package/dist/jira/fields.d.ts.map +0 -1
  166. package/dist/jira/mcp-backend.d.ts.map +0 -1
  167. package/dist/jira/rest-backend.d.ts.map +0 -1
  168. package/dist/jira/retry.d.ts.map +0 -1
  169. package/dist/jira/types.d.ts.map +0 -1
  170. package/dist/lock/file-lock.d.ts.map +0 -1
  171. package/dist/logging/ndjson-logger.d.ts.map +0 -1
  172. package/dist/reporter/jira-comment.d.ts.map +0 -1
  173. package/dist/reporter/status-writer.d.ts.map +0 -1
  174. /package/dist/{artifact → core/src/artifact}/hash.d.ts +0 -0
  175. /package/dist/{artifact → core/src/artifact}/meta.d.ts +0 -0
  176. /package/dist/{artifact → core/src/artifact}/paths.d.ts +0 -0
  177. /package/dist/{artifact → core/src/artifact}/status.d.ts +0 -0
  178. /package/dist/{auth → core/src/auth}/encrypt.d.ts +0 -0
  179. /package/dist/{auth → core/src/auth}/key.d.ts +0 -0
  180. /package/dist/{auth → core/src/auth}/refresh.d.ts +0 -0
  181. /package/dist/{auth → core/src/auth}/state.d.ts +0 -0
  182. /package/dist/{bin-internal → core/src/bin-internal}/exec.d.ts +0 -0
  183. /package/dist/{bin-internal → core/src/bin-internal}/fetch.d.ts +0 -0
  184. /package/dist/{bin-internal → core/src/bin-internal}/index.d.ts +0 -0
  185. /package/dist/{bin-internal → core/src/bin-internal}/lint.d.ts +0 -0
  186. /package/dist/{bin-internal → core/src/bin-internal}/normalize.d.ts +0 -0
  187. /package/dist/{bin-internal → core/src/bin-internal}/post.d.ts +0 -0
  188. /package/dist/{bin-internal → core/src/bin-internal}/promote.d.ts +0 -0
  189. /package/dist/{bin-internal → core/src/bin-internal}/report.d.ts +0 -0
  190. /package/dist/{bin-internal → core/src/bin-internal}/status-cmd.d.ts +0 -0
  191. /package/dist/{bin-internal → core/src/bin-internal}/typecheck.d.ts +0 -0
  192. /package/dist/{bin-internal → core/src/bin-internal}/unlock.d.ts +0 -0
  193. /package/dist/{bin-internal → core/src/bin-internal}/validate-feature.d.ts +0 -0
  194. /package/dist/{classifier → core/src/classifier}/aggregate.d.ts +0 -0
  195. /package/dist/{classifier → core/src/classifier}/history.d.ts +0 -0
  196. /package/dist/{classifier → core/src/classifier}/types.d.ts +0 -0
  197. /package/dist/{config → core/src/config}/define.d.ts +0 -0
  198. /package/dist/{config → core/src/config}/load.d.ts +0 -0
  199. /package/dist/{config → core/src/config}/schema.d.ts +0 -0
  200. /package/dist/{index.d.ts → core/src/index.d.ts} +0 -0
  201. /package/dist/{jira → core/src/jira}/client.d.ts +0 -0
  202. /package/dist/{jira → core/src/jira}/fields.d.ts +0 -0
  203. /package/dist/{jira → core/src/jira}/mcp-backend.d.ts +0 -0
  204. /package/dist/{jira → core/src/jira}/rest-backend.d.ts +0 -0
  205. /package/dist/{jira → core/src/jira}/retry.d.ts +0 -0
  206. /package/dist/{jira → core/src/jira}/types.d.ts +0 -0
  207. /package/dist/{lock → core/src/lock}/file-lock.d.ts +0 -0
  208. /package/dist/{logging → core/src/logging}/ndjson-logger.d.ts +0 -0
  209. /package/dist/{reporter → core/src/reporter}/jira-comment.d.ts +0 -0
  210. /package/dist/{reporter → core/src/reporter}/status-writer.d.ts +0 -0
@@ -0,0 +1,133 @@
1
+ import { existsSync, readFileSync, readdirSync } from 'node:fs';
2
+ import { join } from 'node:path';
3
+ import type { Stage } from '../eval/types';
4
+ import { verifyPrompts } from './verify-prompts';
5
+
6
/** Options accepted by `doctorCmd`. */
export interface DoctorOpts {
  /** Directory treated as the repository root; `doctorCmd` uses `process.cwd()` when omitted. */
  cwd?: string;
}
9
+
10
/** One finding produced by a doctor check. */
interface CheckResult {
  /** Outcome flag; the checks defined alongside this interface only ever emit `false`. */
  ok: boolean;
  /** Human-readable description, printed by `doctorCmd` behind a `[xera:doctor]` prefix. */
  message: string;
}
14
+
15
/**
 * Files, relative to a golden-eval ticket directory, that must exist for every
 * stage the ticket declares in its `meta.json`.
 */
const REQUIRED_FILES_PER_STAGE: Record<Stage, string[]> = {
  'feature-from-story': ['golden/test.feature'],
  'script-from-feature': ['golden/spec-requirements.md'],
  // This stage requires no golden files.
  'diagnose-failure': [],
};
20
+
21
/** Script names that `checkRootScripts` expects in the root `package.json`. */
const REQUIRED_SCRIPTS: string[] = [
  'xera:eval-prepare',
  'xera:eval-deterministic',
  'xera:eval-report',
  'xera:verify-prompts',
  'xera:doctor',
];
28
+
29
/**
 * Reads a single-token scalar from the first `field: value` line in `content`.
 *
 * The value must be one run of non-whitespace characters; a line whose value
 * contains spaces does not match.
 *
 * @param content Text to search (matched line by line via the `m` flag).
 * @param field Key name, matched literally.
 * @returns The captured value, or `null` when no such line exists.
 */
function frontmatterField(content: string, field: string): string | null {
  // Escape regex metacharacters so a key such as "a.b" cannot match "axb:".
  const literalField = field.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
  const m = content.match(new RegExp(`^${literalField}:\\s*(\\S+)\\s*$`, 'm'));
  return m?.[1] ?? null;
}
33
+
34
/**
 * Validates the golden-eval fixture tree at `<repoRoot>/fixtures/golden-eval`.
 *
 * A failure is reported when the directory is absent, when it holds fewer than
 * three non-hidden ticket directories, and, per ticket directory, when
 * `meta.json` is missing, unparsable or not a JSON object, when `meta.stages`
 * is empty or not an array, when `story.md` is missing, or when a file listed
 * in `REQUIRED_FILES_PER_STAGE` for a declared stage is missing.
 *
 * @param repoRoot Repository root to inspect.
 * @returns One entry per problem found; an empty array means nothing was flagged.
 */
function checkGoldenEvalDir(repoRoot: string): CheckResult[] {
  const root = join(repoRoot, 'fixtures/golden-eval');
  if (!existsSync(root)) return [{ ok: false, message: 'fixtures/golden-eval/ does not exist' }];
  const dirs = readdirSync(root, { withFileTypes: true }).filter(
    (e) => e.isDirectory() && !e.name.startsWith('.'),
  );
  const results: CheckResult[] = [];
  if (dirs.length < 3) {
    results.push({
      ok: false,
      message: `fixtures/golden-eval/ has ${dirs.length} ticket dir(s); need ≥ 3`,
    });
  }
  for (const entry of dirs) {
    const dir = join(root, entry.name);
    const metaPath = join(dir, 'meta.json');
    if (!existsSync(metaPath)) {
      results.push({ ok: false, message: `${entry.name}: meta.json missing` });
      continue;
    }
    let parsed: unknown;
    try {
      parsed = JSON.parse(readFileSync(metaPath, 'utf8'));
    } catch (err) {
      const reason = err instanceof Error ? err.message : String(err);
      results.push({ ok: false, message: `${entry.name}: meta.json parse error: ${reason}` });
      continue;
    }
    // Valid JSON is not necessarily an object: `null` would otherwise throw on
    // the property reads below and abort every remaining check.
    if (typeof parsed !== 'object' || parsed === null || Array.isArray(parsed)) {
      results.push({ ok: false, message: `${entry.name}: meta.json must contain a JSON object` });
      continue;
    }
    const meta = parsed as { id?: string; stages?: unknown[] };
    const stages = Array.isArray(meta.stages) ? (meta.stages as Stage[]) : [];
    if (stages.length === 0) {
      results.push({ ok: false, message: `${entry.name}: meta.stages is empty` });
    }
    if (!existsSync(join(dir, 'story.md'))) {
      results.push({ ok: false, message: `${entry.name}: story.md missing` });
    }
    for (const stage of stages) {
      // Stage names absent from the table simply have no required files.
      const required = REQUIRED_FILES_PER_STAGE[stage] ?? [];
      for (const rel of required) {
        if (!existsSync(join(dir, rel))) {
          results.push({
            ok: false,
            message: `${meta.id ?? entry.name}: stage "${stage}" declared but ${rel} missing`,
          });
        }
      }
    }
  }
  return results;
}
83
+
84
/**
 * Checks that `packages/prompts/eval-rubric.md` exists and carries the
 * expected frontmatter.
 *
 * @param repoRoot Repository root to inspect.
 * @returns At most one failure, checked in this order: file missing, `id` not
 *   equal to `eval-rubric`, `version` missing. Empty when all three pass.
 */
function checkRubricPrompt(repoRoot: string): CheckResult[] {
  const rubricPath = join(repoRoot, 'packages/prompts/eval-rubric.md');
  const fail = (message: string): CheckResult[] => [{ ok: false, message }];

  if (!existsSync(rubricPath)) {
    return fail('packages/prompts/eval-rubric.md missing');
  }

  const body = readFileSync(rubricPath, 'utf8');
  if (frontmatterField(body, 'id') !== 'eval-rubric') {
    return fail('eval-rubric.md frontmatter "id" must be "eval-rubric"');
  }
  if (!frontmatterField(body, 'version')) {
    return fail('eval-rubric.md frontmatter "version" missing');
  }
  return [];
}
95
+
96
/**
 * Checks that `packages/skills/xera-eval.md` exists and has a `name` field in
 * its frontmatter.
 *
 * @param repoRoot Repository root to inspect.
 * @returns A single failure for the first problem found, or an empty array.
 */
function checkEvalSkill(repoRoot: string): CheckResult[] {
  const skillPath = join(repoRoot, 'packages/skills/xera-eval.md');
  if (!existsSync(skillPath)) {
    return [{ ok: false, message: 'packages/skills/xera-eval.md missing' }];
  }

  const hasName = Boolean(frontmatterField(readFileSync(skillPath, 'utf8'), 'name'));
  return hasName ? [] : [{ ok: false, message: 'xera-eval.md frontmatter "name" missing' }];
}
104
+
105
/** Runs `verifyPrompts` against the repository root and returns its findings unchanged. */
function checkPromptInjectionPreamble(repoRoot: string): CheckResult[] {
  const findings = verifyPrompts(repoRoot);
  return findings;
}
108
+
109
/**
 * Checks that the root `package.json` defines every script in `REQUIRED_SCRIPTS`.
 *
 * @param repoRoot Repository root to inspect.
 * @returns A single failure when `package.json` is missing or cannot be parsed;
 *   otherwise one failure per required script whose entry is not a string.
 */
function checkRootScripts(repoRoot: string): CheckResult[] {
  const path = join(repoRoot, 'package.json');
  if (!existsSync(path)) return [{ ok: false, message: 'root package.json missing' }];

  let pkg: unknown;
  try {
    pkg = JSON.parse(readFileSync(path, 'utf8'));
  } catch (err) {
    // Report the problem as a finding instead of crashing the whole doctor run.
    const reason = err instanceof Error ? err.message : String(err);
    return [{ ok: false, message: `root package.json parse error: ${reason}` }];
  }

  // Anything other than an object-valued "scripts" means no scripts are defined.
  const rawScripts =
    typeof pkg === 'object' && pkg !== null ? (pkg as { scripts?: unknown }).scripts : undefined;
  const scripts: Record<string, unknown> =
    typeof rawScripts === 'object' && rawScripts !== null
      ? (rawScripts as Record<string, unknown>)
      : {};

  return REQUIRED_SCRIPTS.filter((s) => typeof scripts[s] !== 'string').map((s) => ({
    ok: false,
    message: `root package.json missing script: ${s}`,
  }));
}
117
+
118
/**
 * Entry point of the doctor command: runs every repository check and prints
 * the findings.
 *
 * @param _argv Command-line arguments; unused.
 * @param opts Optional overrides; `cwd` replaces `process.cwd()` as the repo root.
 * @returns `0` after logging `[xera:doctor] ok` when no check produced a result,
 *   otherwise `1` after writing each result's message to stderr.
 */
export async function doctorCmd(_argv: string[], opts: DoctorOpts = {}): Promise<number> {
  const repoRoot = opts.cwd ?? process.cwd();

  // Checks run in this order; their findings are concatenated in the same order.
  const checks = [
    checkGoldenEvalDir,
    checkRubricPrompt,
    checkEvalSkill,
    checkPromptInjectionPreamble,
    checkRootScripts,
  ];
  const results: CheckResult[] = checks.flatMap((check) => check(repoRoot));

  if (results.length > 0) {
    results.forEach((r) => console.error(`[xera:doctor] ${r.message}`));
    return 1;
  }
  console.log('[xera:doctor] ok');
  return 0;
}
@@ -0,0 +1,149 @@
1
+ import { existsSync, readFileSync, writeFileSync } from 'node:fs';
2
+ import { join } from 'node:path';
3
+ import { validateGherkin } from '@xera-ai/web';
4
+ import { resolveEvalPaths } from '../eval/paths';
5
+ import {
6
+ type DeterministicEntry,
7
+ type DeterministicScores,
8
+ DeterministicScoresSchema,
9
+ ManifestSchema,
10
+ } from '../eval/types';
11
+
12
/** Options accepted by `evalDeterministicCmd`. */
export interface EvalDeterministicOpts {
  /** Base directory handed to `resolveEvalPaths`; `process.cwd()` is used when omitted. */
  cwd?: string;
}
15
+
16
/** One scenario entry as read from `classifier-input.json` or `classification.json`. */
interface ClassifierScenario {
  /** Scenario name; used to pair a golden scenario with its counterpart in the actual output. */
  name: string;
  /** Classification label; compared for strict equality between golden and actual. */
  class: string;
}
20
+
21
/**
 * Deterministic check for the `feature-from-story` stage: the generated
 * feature file must exist and pass `validateGherkin`.
 *
 * @param actualFeaturePath Path of the generated `test.feature`.
 * @returns `passed: true` when validation succeeds; otherwise `passed: false`
 *   with `error` describing the missing file, the joined validation errors, or
 *   the message of an exception raised while reading or validating.
 */
function checkFeatureFromStory(actualFeaturePath: string): {
  passed: boolean;
  checks: string[];
  error?: string;
} {
  const fail = (error: string) => ({ passed: false, checks: ['validate-feature'], error });

  if (!existsSync(actualFeaturePath)) {
    return fail('actual missing: test.feature');
  }
  try {
    const outcome = validateGherkin(readFileSync(actualFeaturePath, 'utf8'));
    if (!outcome.ok) {
      return fail(outcome.errors.map((e) => `line ${e.line}: ${e.message}`).join('; '));
    }
    return { passed: true, checks: ['validate-feature'] };
  } catch (err) {
    return fail((err as Error).message);
  }
}
41
+
42
/**
 * Deterministic check for the `script-from-feature` stage: `spec.ts` must
 * exist in the ticket's actual-output directory.
 *
 * @param actualTicketDir Directory holding the generated artifacts for one ticket.
 * @returns `passed: true` when `spec.ts` is present, otherwise a failure with
 *   `error` set to `actual missing: spec.ts`.
 */
function checkScriptFromFeature(actualTicketDir: string): {
  passed: boolean;
  checks: string[];
  error?: string;
} {
  // v0.2 deviation: only file presence is verified. Wiring in typecheck/lint is
  // deferred to v0.2.1 because v0.1's lintTicket/typecheckTicket resolve paths from
  // .xera/<TICKET>/ (consumer project layout) rather than
  // .xera/eval/<run-id>/actual/<ticket>/. The judge dimensions "Requirements
  // satisfied", "Wait strategy" and "No dead code" cover the lint surface meanwhile.
  const hasSpec = existsSync(join(actualTicketDir, 'spec.ts'));
  return hasSpec
    ? { passed: true, checks: ['file-presence'] }
    : { passed: false, checks: ['file-presence'], error: 'actual missing: spec.ts' };
}
57
+
58
/**
 * Deterministic check for the `diagnose-failure` stage: every scenario in the
 * golden `classifier-input.json` must appear in the generated
 * `classification.json` with the same `class`.
 *
 * Scenarios present only in the actual output are not flagged. Unreadable or
 * malformed JSON on either side is returned as a failed check instead of being
 * thrown, so one bad file cannot abort the surrounding run.
 *
 * @param inputsTicketDir Directory containing `classifier-input.json`.
 * @param actualTicketDir Directory containing `classification.json`.
 * @returns `passed: true` when all golden scenarios match; otherwise
 *   `passed: false` with `error` describing the missing file, the invalid
 *   file, or the list of mismatches.
 */
function checkDiagnoseFailure(
  inputsTicketDir: string,
  actualTicketDir: string,
): { passed: boolean; checks: string[]; error?: string } {
  const fail = (error: string) => ({ passed: false, checks: ['bucket-match'], error });

  // Returns the file's `scenarios` array ([] when absent), or a reason string when
  // the file is not valid JSON or `scenarios` is not an array.
  const loadScenarios = (path: string): ClassifierScenario[] | string => {
    let doc: unknown;
    try {
      doc = JSON.parse(readFileSync(path, 'utf8'));
    } catch (err) {
      return err instanceof Error ? err.message : String(err);
    }
    const scenarios =
      typeof doc === 'object' && doc !== null
        ? (doc as { scenarios?: unknown }).scenarios
        : undefined;
    if (scenarios === undefined || scenarios === null) return [];
    if (!Array.isArray(scenarios)) return '"scenarios" must be an array';
    return scenarios as ClassifierScenario[];
  };

  const inputPath = join(inputsTicketDir, 'classifier-input.json');
  const actualPath = join(actualTicketDir, 'classification.json');
  if (!existsSync(actualPath)) return fail('actual missing: classification.json');
  if (!existsSync(inputPath)) return fail('inputs missing: classifier-input.json');

  const goldScens = loadScenarios(inputPath);
  if (typeof goldScens === 'string') {
    return fail(`inputs invalid: classifier-input.json: ${goldScens}`);
  }
  const actScens = loadScenarios(actualPath);
  if (typeof actScens === 'string') {
    return fail(`actual invalid: classification.json: ${actScens}`);
  }

  const mismatches: string[] = [];
  for (const g of goldScens) {
    // Optional chaining tolerates null entries in the generated array.
    const a = actScens.find((s) => s?.name === g.name);
    if (!a) {
      mismatches.push(`missing scenario "${g.name}"`);
      continue;
    }
    if (a.class !== g.class) {
      mismatches.push(`scenario "${g.name}": expected class ${g.class}, got ${a.class}`);
    }
  }
  if (mismatches.length > 0) {
    return fail(`bucket mismatch — ${mismatches.join('; ')}`);
  }
  return { passed: true, checks: ['bucket-match'] };
}
101
+
102
/**
 * Entry point of `eval-deterministic <run-id>`: runs the deterministic check
 * for every (ticket, stage) pair in the run's manifest and writes the scores.
 *
 * @param argv Command-line arguments; `argv[0]` is the run id.
 * @param opts Optional overrides; `cwd` replaces `process.cwd()`.
 * @returns `1` when the run id is absent or the manifest file does not exist,
 *   `0` after the scores file has been written.
 * @throws Whatever `ManifestSchema.parse`, `DeterministicScoresSchema.parse`,
 *   `JSON.parse` or the file-system calls throw.
 */
export async function evalDeterministicCmd(
  argv: string[],
  opts: EvalDeterministicOpts = {},
): Promise<number> {
  const cwd = opts.cwd ?? process.cwd();
  const [runId] = argv;
  if (!runId) {
    console.error('[xera:eval-deterministic] usage: eval-deterministic <run-id>');
    return 1;
  }

  const paths = resolveEvalPaths(cwd, runId);
  if (!existsSync(paths.manifest)) {
    console.error(`[xera:eval-deterministic] missing manifest.json at ${paths.manifest}`);
    return 1;
  }
  const manifest = ManifestSchema.parse(JSON.parse(readFileSync(paths.manifest, 'utf8')));

  const entries: DeterministicEntry[] = [];
  for (const [ticket, ticketStages] of Object.entries(manifest.ticket_stages)) {
    for (const stage of ticketStages) {
      const inputsDir = paths.ticketInputsDir(ticket);
      const actualDir = paths.ticketActualDir(ticket);

      let outcome: { passed: boolean; checks: string[]; error?: string };
      switch (stage) {
        case 'feature-from-story':
          outcome = checkFeatureFromStory(join(actualDir, 'test.feature'));
          break;
        case 'script-from-feature':
          outcome = checkScriptFromFeature(actualDir);
          break;
        default:
          // Every other stage is handled by the diagnose-failure check.
          outcome = checkDiagnoseFailure(inputsDir, actualDir);
      }

      const { passed, checks, error } = outcome;
      const entry: DeterministicEntry = { ticket, stage, passed, checks };
      // Set `error` only when the check produced one, so the key stays absent otherwise.
      if (error !== undefined) entry.error = error;
      entries.push(entry);
    }
  }

  const scores: DeterministicScores = { run_id: runId, entries };
  // Validate the assembled scores before persisting them.
  DeterministicScoresSchema.parse(scores);
  writeFileSync(paths.deterministicScores, JSON.stringify(scores, null, 2));
  console.log(`[xera:eval-deterministic] wrote ${entries.length} entries`);
  return 0;
}
@@ -0,0 +1,214 @@
1
+ import {
2
+ copyFileSync,
3
+ existsSync,
4
+ mkdirSync,
5
+ readFileSync,
6
+ readdirSync,
7
+ writeFileSync,
8
+ } from 'node:fs';
9
+ import { join } from 'node:path';
10
+ import { resolveEvalPaths } from '../eval/paths';
11
+ import { generateRunId } from '../eval/run-id';
12
+ import { type Manifest, ManifestSchema, STAGES, type Stage } from '../eval/types';
13
+ import { acquireLock } from '../lock/file-lock';
14
+
15
/** Options accepted by `evalPrepareCmd`. */
export interface EvalPrepareOpts {
  /** Repository root; `process.cwd()` is used when omitted. */
  cwd?: string;
  /** Clock override; forwarded to `generateRunId` and used for the manifest's `started_at`. */
  now?: () => Date;
  /** Git SHA provider override; forwarded to `generateRunId` when supplied. */
  getGitSha?: () => string | null;
}
20
+
21
/** Result of parsing the `eval-prepare` command-line flags. */
interface ParsedFlags {
  /** `true` when `--force` was passed. */
  force: boolean;
  /** Stage given via `--prompt=<stage>`, or `null` when the flag is absent. */
  only_prompt: Stage | null;
  /** Ticket id given via `--ticket=<id>`, or `null` when the flag is absent. */
  only_ticket: string | null;
}
26
+
27
/**
 * Parses the `eval-prepare` arguments.
 *
 * Recognised arguments are `--force`, `--prompt=<stage>` (must be one of
 * `STAGES`) and `--ticket=<id>` (must be non-empty). Parsing stops at the first
 * invalid argument.
 *
 * @param argv Raw command-line arguments.
 * @returns The parsed flags, or `{ error }` describing the first invalid argument.
 */
function parseFlags(argv: string[]): ParsedFlags | { error: string } {
  const flags: ParsedFlags = { force: false, only_prompt: null, only_ticket: null };
  for (const arg of argv) {
    if (arg === '--force') {
      flags.force = true;
    } else if (arg.startsWith('--prompt=')) {
      const v = arg.slice('--prompt='.length);
      if (!STAGES.includes(v as Stage)) {
        return { error: `Unknown stage: ${v}. Valid: ${STAGES.join(', ')}.` };
      }
      flags.only_prompt = v as Stage;
    } else if (arg.startsWith('--ticket=')) {
      const ticket = arg.slice('--ticket='.length);
      // An empty id is falsy, so the caller would treat it as "no filter" and
      // silently run every ticket; reject it instead.
      if (ticket === '') {
        return { error: 'Missing value for --ticket=' };
      }
      flags.only_ticket = ticket;
    } else {
      return { error: `Unknown argument: ${arg}` };
    }
  }
  return flags;
}
45
+
46
/**
 * Reads the `version:` value from `packages/prompts/<name>.md`.
 *
 * @param repoRoot Repository root.
 * @param name Prompt file name without the `.md` extension.
 * @returns The first single-token `version:` value found, or `'0.0.0'` when
 *   the file or the field is missing.
 */
function readPromptVersion(repoRoot: string, name: string): string {
  const fallback = '0.0.0';
  const promptFile = join(repoRoot, 'packages/prompts', `${name}.md`);
  if (!existsSync(promptFile)) {
    return fallback;
  }
  const match = readFileSync(promptFile, 'utf8').match(/^version:\s*(\S+)\s*$/m);
  return match?.[1] ?? fallback;
}
53
+
54
/**
 * Lists the golden-eval tickets under `<repoRoot>/fixtures/golden-eval`.
 *
 * Each non-hidden sub-directory that contains a `meta.json` contributes one
 * ticket; directories without `meta.json` are skipped.
 *
 * @param repoRoot Repository root.
 * @returns Tickets sorted by `id`; empty when the fixtures directory is absent.
 * @throws Error naming the offending file when `meta.json` is not valid JSON or
 *   lacks a string `id` / array `stages`, so the problem is not deferred to an
 *   unrelated TypeError further down the pipeline.
 */
function discoverEvalTickets(repoRoot: string): { id: string; dir: string; stages: Stage[] }[] {
  const root = join(repoRoot, 'fixtures/golden-eval');
  if (!existsSync(root)) return [];
  const out: { id: string; dir: string; stages: Stage[] }[] = [];
  for (const entry of readdirSync(root, { withFileTypes: true })) {
    if (!entry.isDirectory()) continue;
    if (entry.name === 'README.md' || entry.name.startsWith('.')) continue;
    const dir = join(root, entry.name);
    const metaPath = join(dir, 'meta.json');
    if (!existsSync(metaPath)) continue;

    let meta: unknown;
    try {
      meta = JSON.parse(readFileSync(metaPath, 'utf8'));
    } catch (err) {
      const reason = err instanceof Error ? err.message : String(err);
      throw new Error(`${metaPath}: invalid JSON (${reason})`);
    }
    const { id, stages } = (typeof meta === 'object' && meta !== null ? meta : {}) as {
      id?: unknown;
      stages?: unknown;
    };
    if (typeof id !== 'string' || !Array.isArray(stages)) {
      throw new Error(`${metaPath}: expected a string "id" and an array "stages"`);
    }
    out.push({ id, dir, stages: stages as Stage[] });
  }
  return out.sort((a, b) => a.id.localeCompare(b.id));
}
69
+
70
/**
 * Lists classifier fixtures under `<repoRoot>/fixtures/golden-tickets`.
 *
 * Every `*.json` file whose top-level `ticket` property is a string
 * contributes one entry; files without such a property (including files whose
 * content is not a JSON object) are skipped.
 *
 * @param repoRoot Repository root.
 * @returns Entries sorted by `id`; empty when the fixtures directory is absent.
 * @throws Error naming the offending file when a fixture is not valid JSON.
 */
function discoverClassifierTickets(repoRoot: string): { id: string; path: string }[] {
  const root = join(repoRoot, 'fixtures/golden-tickets');
  if (!existsSync(root)) return [];
  const out: { id: string; path: string }[] = [];
  for (const entry of readdirSync(root, { withFileTypes: true })) {
    if (!entry.isFile() || !entry.name.endsWith('.json')) continue;
    const path = join(root, entry.name);

    let data: unknown;
    try {
      data = JSON.parse(readFileSync(path, 'utf8'));
    } catch (err) {
      const reason = err instanceof Error ? err.message : String(err);
      throw new Error(`${path}: invalid JSON (${reason})`);
    }
    // `null` is valid JSON; guard before reading the property.
    const ticket =
      typeof data === 'object' && data !== null ? (data as { ticket?: unknown }).ticket : undefined;
    if (typeof ticket === 'string') out.push({ id: ticket, path });
  }
  return out.sort((a, b) => a.id.localeCompare(b.id));
}
82
+
83
/**
 * Entry point of `eval-prepare`: selects golden fixtures, creates the run
 * directory, copies the inputs, writes the manifest and acquires the run lock.
 *
 * @param argv Command-line arguments (`--force`, `--prompt=<stage>`, `--ticket=<id>`).
 * @param opts Optional overrides for the repo root, the clock and the git SHA provider.
 * @returns `0` on success (after printing `RUN_ID=<id>`), `1` for invalid flags,
 *   an empty selection or an already existing run directory without `--force`,
 *   and `4` when the lock cannot be acquired.
 */
export async function evalPrepareCmd(argv: string[], opts: EvalPrepareOpts = {}): Promise<number> {
  const repoRoot = opts.cwd ?? process.cwd();

  const flags = parseFlags(argv);
  if ('error' in flags) {
    console.error(`[xera:eval-prepare] ${flags.error}`);
    return 1;
  }

  const evalTickets = discoverEvalTickets(repoRoot);
  const classifierTickets = discoverClassifierTickets(repoRoot);

  // Either the single requested stage or every known stage.
  const stages: Stage[] = flags.only_prompt ? [flags.only_prompt] : [...STAGES];

  // Eval fixtures serve every stage except diagnose-failure; classifier fixtures
  // serve diagnose-failure only.
  const candidateIds: string[] = [];
  if (stages.some((s) => s !== 'diagnose-failure')) {
    for (const t of evalTickets) candidateIds.push(t.id);
  }
  if (stages.includes('diagnose-failure')) {
    for (const t of classifierTickets) candidateIds.push(t.id);
  }
  let selectedTickets: string[] = Array.from(new Set(candidateIds)).sort();

  if (flags.only_ticket) {
    if (!selectedTickets.includes(flags.only_ticket)) {
      console.error(`[xera:eval-prepare] No golden fixture for ${flags.only_ticket}`);
      return 1;
    }
    selectedTickets = [flags.only_ticket];
  }

  if (selectedTickets.length === 0) {
    console.error('[xera:eval-prepare] No tickets selected (after filters).');
    return 1;
  }

  // Per ticket: the stages it declares, restricted to the requested stages.
  // Tickets whose restriction is empty get no entry.
  const stagesByTicket: Record<string, Stage[]> = {};
  for (const ticket of selectedTickets) {
    const evalT = evalTickets.find((t) => t.id === ticket);
    // A ticket without an eval fixture is a classifier ticket: diagnose-failure only.
    const declared: Stage[] = evalT ? evalT.stages : ['diagnose-failure'];
    const applicable = declared.filter((s) => stages.includes(s));
    if (applicable.length > 0) {
      stagesByTicket[ticket] = applicable;
    }
  }

  // Keep only tickets that ended up with at least one applicable stage.
  selectedTickets = selectedTickets.filter((t) => stagesByTicket[t] !== undefined);

  if (selectedTickets.length === 0) {
    console.error('[xera:eval-prepare] No tickets applicable to requested stages.');
    return 1;
  }

  const runId = generateRunId({
    ...(opts.now ? { now: opts.now } : {}),
    ...(opts.getGitSha ? { getGitSha: opts.getGitSha } : {}),
  });
  const paths = resolveEvalPaths(repoRoot, runId);

  const runDirExists = existsSync(paths.root);
  if (runDirExists && !flags.force) {
    console.error(
      `[xera:eval-prepare] run dir already exists: ${paths.root}. Pass --force to re-run.`,
    );
    return 1;
  }
  mkdirSync(paths.inputsDir, { recursive: true });
  mkdirSync(paths.actualDir, { recursive: true });

  // Copy each ticket's inputs into the run directory.
  for (const ticket of selectedTickets) {
    const ticketInputs = paths.ticketInputsDir(ticket);
    mkdirSync(ticketInputs, { recursive: true });

    const evalT = evalTickets.find((t) => t.id === ticket);
    const classT = classifierTickets.find((t) => t.id === ticket);
    if (evalT) {
      copyFileSync(join(evalT.dir, 'story.md'), join(ticketInputs, 'story.md'));
      // The golden feature file is optional and copied only when present.
      const featurePath = join(evalT.dir, 'golden/test.feature');
      if (existsSync(featurePath)) {
        copyFileSync(featurePath, join(ticketInputs, 'test.feature'));
      }
    }
    if (classT) {
      copyFileSync(classT.path, join(ticketInputs, 'classifier-input.json'));
    }
  }

  // Assemble the manifest describing this run.
  const startedAt = opts.now ? opts.now() : new Date();
  const promptVersions = {
    'feature-from-story': readPromptVersion(repoRoot, 'feature-from-story'),
    'script-from-feature': readPromptVersion(repoRoot, 'script-from-feature'),
    'diagnose-failure': readPromptVersion(repoRoot, 'diagnose-failure'),
    'eval-rubric': readPromptVersion(repoRoot, 'eval-rubric'),
  };
  const manifest: Manifest = {
    run_id: runId,
    started_at: startedAt.toISOString(),
    // Third dash-separated segment of the run id, or 'nogit' when it is absent.
    git_sha: runId.split('-')[2] ?? 'nogit',
    tickets: selectedTickets,
    stages,
    ticket_stages: stagesByTicket,
    prompt_versions: promptVersions,
    flags: {
      force: flags.force,
      only_prompt: flags.only_prompt,
      only_ticket: flags.only_ticket,
      judge_only: false,
    },
  };

  // Schema validation throws before anything is written.
  ManifestSchema.parse(manifest);
  writeFileSync(paths.manifest, JSON.stringify(manifest, null, 2));

  const locked = acquireLock(paths.lock, runId);
  if (!locked) {
    console.error(`[xera:eval-prepare] failed to acquire lock at ${paths.lock}`);
    return 4;
  }

  console.log(
    `[xera:eval-prepare] prepared ${selectedTickets.length} ticket(s) for stages: ${stages.join(', ')}`,
  );
  console.log(`RUN_ID=${runId}`);
  return 0;
}
@@ -0,0 +1,177 @@
1
+ import { existsSync, readFileSync, writeFileSync } from 'node:fs';
2
+ import { resolveEvalPaths } from '../eval/paths';
3
+ import {
4
+ type DeterministicScores,
5
+ DeterministicScoresSchema,
6
+ type JudgeScores,
7
+ JudgeScoresSchema,
8
+ type Judgment,
9
+ ManifestSchema,
10
+ type Result,
11
+ type Summary,
12
+ SummarySchema,
13
+ } from '../eval/types';
14
+ import { releaseLock } from '../lock/file-lock';
15
+
16
/**
 * Optional settings accepted by `evalReportCmd`.
 */
export interface EvalReportOpts {
  /**
   * Directory used as the base when resolving the run's eval paths.
   * `evalReportCmd` falls back to `process.cwd()` when this is omitted.
   */
  cwd?: string;
}
19
+
20
/**
 * Collapses a judgment's per-dimension verdicts into a pass flag and a score.
 *
 * Dimensions whose verdict is `'NA'` are ignored. The score is the fraction of
 * the remaining dimensions whose verdict is `'PASS'`, and the judgment passes
 * only when every remaining dimension is `'PASS'`. When nothing remains after
 * dropping `'NA'` dimensions, the result is a pass with a score of 1.
 *
 * @param j - The judgment whose `dimensions` are scored.
 * @returns The overall pass flag and a score in the range 0..1.
 */
function scoreJudgment(j: Judgment): { passed: boolean; score: number } {
  let graded = 0;
  let passing = 0;
  for (const dimension of j.dimensions) {
    if (dimension.verdict === 'NA') continue;
    graded += 1;
    if (dimension.verdict === 'PASS') passing += 1;
  }
  if (graded === 0) {
    return { passed: true, score: 1 };
  }
  return { passed: passing === graded, score: passing / graded };
}
28
+
29
/**
 * Renders a run summary as a Markdown report.
 *
 * The report contains, in order: a title with the run id, the git SHA, the
 * prompt versions, the overall pass count and score, a results table with one
 * row per result, and a per-result breakdown of judge dimensions (results
 * without a judge block or without dimensions are left out of the breakdown).
 *
 * @param summary - The summary to render.
 * @returns The Markdown text, with lines joined by `\n`.
 */
function renderReport(summary: Summary): string {
  // A table cell has to stay on a single line and must not contain a bare `|`,
  // otherwise the row is split into extra columns or cut short.
  const cell = (text: string): string => text.replace(/\|/g, '\\|').replace(/\r?\n|\r/g, ' ');
  const percent = (fraction: number): string => `${(fraction * 100).toFixed(0)}%`;

  const lines: string[] = [
    `# xera eval report ${summary.run_id}`,
    '',
    `**Git SHA:** \`${summary.git_sha}\``,
    '',
    '**Prompt versions:**',
  ];
  for (const [k, v] of Object.entries(summary.prompt_versions)) lines.push(`- \`${k}\`: ${v}`);
  lines.push(
    '',
    `**Overall:** ${summary.overall.passed}/${summary.overall.total} PASS (score ${percent(summary.overall.score)})`,
    '',
    '## Results',
    '',
    '| Ticket | Stage | Deterministic | Judge | Score |',
    '|---|---|---|---|---|',
  );

  for (const r of summary.results) {
    let det: string;
    if (r.deterministic.passed) {
      det = 'PASS';
    } else if (r.deterministic.error) {
      det = `FAIL (${cell(r.deterministic.error)})`;
    } else {
      // No error text recorded: avoid rendering an empty "FAIL ()".
      det = 'FAIL';
    }

    // A result is shown as SKIPPED when it is flagged as skipped or carries no judge block.
    let judge = 'SKIPPED';
    if (!r.skipped && r.judge) judge = r.judge.passed ? 'PASS' : 'FAIL';

    const score = r.judge ? percent(r.judge.score) : '—';
    lines.push(`| ${cell(r.ticket)} | ${cell(r.stage)} | ${det} | ${judge} | ${score} |`);
  }

  lines.push('', '## Dimension breakdown', '');
  for (const r of summary.results) {
    if (!r.judge || r.judge.dimensions.length === 0) continue;
    lines.push(`### ${r.ticket} — ${r.stage}`, '');
    for (const d of r.judge.dimensions) lines.push(`- **${d.name}** — ${d.verdict}: ${d.notes}`);
    lines.push('');
  }
  return lines.join('\n');
}
64
+
65
/** Extracts a printable message from a caught value of unknown type. */
function errorMessage(err: unknown): string {
  return err instanceof Error ? err.message : String(err);
}

/**
 * Reads `file` as JSON and validates it with `schema`. On any failure (read,
 * JSON syntax, or schema) logs an `invalid <label>` error and returns `undefined`.
 */
function readValidated<T>(
  file: string,
  label: string,
  schema: { parse(input: unknown): T },
): T | undefined {
  try {
    return schema.parse(JSON.parse(readFileSync(file, 'utf8')));
  } catch (err) {
    console.error(`[xera:eval-report] invalid ${label}: ${errorMessage(err)}`);
    return undefined;
  }
}

/** Combines one deterministic entry with its judgment (if any) into a `Result`. */
function buildResult(
  detEntry: DeterministicScores['entries'][number],
  judgment: Judgment | undefined,
): Result {
  const base = {
    ticket: detEntry.ticket,
    stage: detEntry.stage,
    deterministic: {
      passed: detEntry.passed,
      checks: detEntry.checks,
      // Only include `error` when it is set, so the key is absent rather than undefined.
      ...(detEntry.error !== undefined ? { error: detEntry.error } : {}),
    },
  };
  if (judgment) {
    const { passed, score } = scoreJudgment(judgment);
    return { ...base, judge: { passed, dimensions: judgment.dimensions, score } };
  }
  if (detEntry.error?.startsWith('actual missing')) {
    // No judgment and the deterministic error starts with 'actual missing': record as skipped.
    return { ...base, judge: null, skipped: true };
  }
  // Judge entry expected but missing: count as FAIL not SKIPPED.
  return { ...base, judge: { passed: false, dimensions: [], score: 0 } };
}

/**
 * Builds the summary and Markdown report for an eval run.
 *
 * Reads the run's manifest, deterministic scores and judge scores, pairs each
 * deterministic entry with the judgment that has the same ticket and stage,
 * writes the summary JSON and the rendered report, and logs the pass count.
 * Results marked as skipped are excluded from the pass/fail counts and from the
 * average score. A result whose deterministic check failed contributes 0 to the
 * average.
 *
 * Once the manifest is known to exist, the run lock is released on every exit
 * path, including an unreadable or invalid manifest.
 *
 * @param argv - Command arguments; `argv[0]` is the run id.
 * @param opts - Optional settings; `opts.cwd` defaults to `process.cwd()`.
 * @returns `0` on success, `1` when the run id or the manifest file is missing,
 *   `2` when the manifest or either score file cannot be read or validated.
 * @throws Whatever `SummarySchema.parse` or `writeFileSync` throws while the
 *   summary and report are being written.
 */
export async function evalReportCmd(argv: string[], opts: EvalReportOpts = {}): Promise<number> {
  const cwd = opts.cwd ?? process.cwd();
  const runId = argv[0];
  if (!runId) {
    console.error('[xera:eval-report] usage: eval-report <run-id>');
    return 1;
  }
  const paths = resolveEvalPaths(cwd, runId);
  if (!existsSync(paths.manifest)) {
    console.error(`[xera:eval-report] missing manifest.json at ${paths.manifest}`);
    return 1;
  }

  try {
    // Validated inside the try so a corrupt manifest is reported like the score
    // files (exit code 2) and the lock is still released by the finally below.
    const manifest = readValidated(paths.manifest, 'manifest.json', ManifestSchema);
    if (manifest === undefined) return 2;

    const det: DeterministicScores | undefined = readValidated(
      paths.deterministicScores,
      'deterministic-scores.json',
      DeterministicScoresSchema,
    );
    if (det === undefined) return 2;

    const judge: JudgeScores | undefined = readValidated(
      paths.judgeScores,
      'judge-scores.json',
      JudgeScoresSchema,
    );
    if (judge === undefined) return 2;

    const results: Result[] = det.entries.map((detEntry) =>
      buildResult(
        detEntry,
        judge.judgments.find((j) => j.ticket === detEntry.ticket && j.stage === detEntry.stage),
      ),
    );

    const counted = results.filter((r) => !r.skipped);
    const passedCount = counted.filter(
      (r) => r.deterministic.passed && r.judge?.passed === true,
    ).length;
    const failedCount = counted.length - passedCount;
    const avgScore =
      counted.length === 0
        ? 0
        : counted.reduce(
            (acc, r) => acc + (r.deterministic.passed && r.judge ? r.judge.score : 0),
            0,
          ) / counted.length;

    const summary: Summary = {
      run_id: runId,
      git_sha: manifest.git_sha,
      prompt_versions: manifest.prompt_versions,
      results,
      overall: { passed: passedCount, failed: failedCount, total: counted.length, score: avgScore },
    };
    // Validate before writing.
    SummarySchema.parse(summary);
    writeFileSync(paths.summary, JSON.stringify(summary, null, 2));
    writeFileSync(paths.report, renderReport(summary));

    console.log(
      `[xera:eval-report] ${passedCount}/${counted.length} PASS (avg ${(avgScore * 100).toFixed(0)}%)`,
    );
    return 0;
  } finally {
    releaseLock(paths.lock);
  }
}