ai-spec-dev 0.31.0 → 0.33.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,241 @@
1
+ import * as fs from "fs-extra";
2
+ import * as path from "path";
3
+ import chalk from "chalk";
4
+ import { RunLog } from "./run-logger";
5
+
6
+ const LOG_DIR = ".ai-spec-logs";
7
+
8
+ // ─── Types ────────────────────────────────────────────────────────────────────
9
+
10
+ export interface TrendEntry { // one run, flattened from a RunLog by buildTrendReport
11
+ runId: string; // copied from the run log
12
+ startedAt: string; // copied from the run log; formatDate() shows its first 10 characters as YYYY-MM-DD (presumably an ISO-8601 timestamp)
13
+ promptHash: string | null; // null when the run log carries no prompt hash
14
+ harnessScore: number | null; // null when the run log carries no score; buildTrendReport drops such entries
15
+ specPath: string | null; // null when absent; printed as its basename
16
+ provider: string | null; // null when absent from the run log
17
+ model: string | null; // null when absent from the run log
18
+ filesWritten: number; // length of the log's filesWritten list, 0 when the list is absent
19
+ totalDurationMs: number | null; // milliseconds; null when absent from the run log
20
+ errors: number; // length of the log's errors list, 0 when the list is absent
21
+ }
22
+
23
+ export interface PromptGroupSummary { // score statistics for all runs that share one prompt hash
24
+ promptHash: string; // the shared hash, or "(none)" for runs without one
25
+ runs: number; // number of runs in the group
26
+ avg: number; // mean harness score, rounded to one decimal place
27
+ best: number; // highest harness score in the group
28
+ worst: number; // lowest harness score in the group
29
+ firstSeen: string; // startedAt of the last run in the group's list (the oldest when logs are newest-first)
30
+ lastSeen: string; // startedAt of the first run in the group's list (the newest when logs are newest-first)
31
+ /** true if this is the most recently used prompt hash */
32
+ isCurrent: boolean;
33
+ }
34
+
35
+ export interface TrendReport { // result of buildTrendReport, consumed by printTrendReport
36
+ entries: TrendEntry[]; // scored runs left after the optional prompt filter and "last N" limit
37
+ promptGroups: PromptGroupSummary[]; // one summary per prompt hash, most recently seen first
38
+ totalRuns: number; // same value as entries.length
39
+ }
40
+
41
+ // ─── Loader ──────────────────────────────────────────────────────────────────
42
+
43
/**
 * Read every RunLog JSON file from `.ai-spec-logs/` under `workingDir`.
 *
 * Files are ordered by file name, descending. That is newest-first only if
 * log file names sort chronologically.
 * NOTE(review): confirm against the naming scheme used by the run logger.
 *
 * Unreadable or corrupt files are skipped silently, and so are files whose
 * content is not an object with non-empty string `runId` and `startedAt`
 * fields. Later stages call string methods on `startedAt`, so a log with a
 * non-string value there must not get through.
 *
 * @param workingDir Directory that contains the `.ai-spec-logs/` folder.
 * @returns The usable run logs; an empty array when the folder is missing.
 */
export async function loadRunLogs(workingDir: string): Promise<RunLog[]> {
  const logDir = path.join(workingDir, LOG_DIR);
  if (!(await fs.pathExists(logDir))) return [];

  const jsonFiles = (await fs.readdir(logDir))
    .filter((f) => f.endsWith(".json"))
    .sort()
    .reverse();

  // The reads are independent, so run them concurrently. Promise.all keeps
  // the results in `jsonFiles` order, which preserves the ordering above.
  const loaded = await Promise.all(
    jsonFiles.map(async (file): Promise<RunLog | null> => {
      try {
        const parsed: unknown = await fs.readJson(path.join(logDir, file));
        return isUsableRunLog(parsed) ? parsed : null;
      } catch {
        // corrupt or unreadable file: skipping is deliberate (best effort)
        return null;
      }
    })
  );

  return loaded.filter((log): log is RunLog => log !== null);
}

/** Minimal validity check: an object with non-empty string `runId` and `startedAt`. */
function isUsableRunLog(value: unknown): value is RunLog {
  if (typeof value !== "object" || value === null) return false;
  const candidate = value as { runId?: unknown; startedAt?: unknown };
  return (
    typeof candidate.runId === "string" &&
    candidate.runId !== "" &&
    typeof candidate.startedAt === "string" &&
    candidate.startedAt !== ""
  );
}
68
+
69
+ // ─── Aggregation ─────────────────────────────────────────────────────────────
70
+
71
/**
 * Flatten run logs into trend entries and summarise them per prompt hash.
 *
 * Steps, in order: keep only runs with a finite numeric harness score, apply
 * the optional prompt-hash prefix filter, keep the first `last` entries, then
 * group what is left by prompt hash.
 *
 * @param logs Run logs, expected newest-first. The "last N" limit, the
 *   "current" marker and firstSeen/lastSeen all rely on that order.
 * @param opts.last Keep only the first N entries; ignored unless positive.
 * @param opts.promptFilter Keep only runs whose prompt hash starts with this
 *   prefix. Runs without a hash never match.
 * @returns The kept entries, one summary per prompt hash (most recently seen
 *   first) and the number of kept entries.
 */
export function buildTrendReport(
  logs: RunLog[],
  opts: { last?: number; promptFilter?: string } = {}
): TrendReport {
  // Group key for runs that carry no prompt hash.
  const NO_HASH_KEY = "(none)";
  const { last, promptFilter } = opts;

  let entries: TrendEntry[] = logs
    .map((log) => ({
      runId: log.runId,
      startedAt: log.startedAt,
      promptHash: log.promptHash ?? null,
      harnessScore: log.harnessScore ?? null,
      specPath: log.specPath ?? null,
      provider: log.provider ?? null,
      model: log.model ?? null,
      filesWritten: log.filesWritten?.length ?? 0,
      totalDurationMs: log.totalDurationMs ?? null,
      errors: log.errors?.length ?? 0,
    }))
    // Only scored runs (create runs) take part. Logs are parsed from disk
    // without schema validation, so also reject scores that are not finite
    // numbers; they would break the statistics and the display.
    .filter(
      (e) => typeof e.harnessScore === "number" && Number.isFinite(e.harnessScore)
    );

  if (promptFilter) {
    entries = entries.filter((e) => e.promptHash?.startsWith(promptFilter) === true);
  }

  if (last && last > 0) {
    entries = entries.slice(0, last);
  }

  // Group the kept entries by prompt hash, preserving input order per group.
  const groupMap = new Map<string, TrendEntry[]>();
  for (const entry of entries) {
    const key = entry.promptHash ?? NO_HASH_KEY;
    const bucket = groupMap.get(key);
    if (bucket) bucket.push(entry);
    else groupMap.set(key, [entry]);
  }

  // "Current" is the group of the most recent run. Compare group keys rather
  // than raw hashes so a newest run without a hash marks the "(none)" group.
  const newest = entries[0];
  const currentKey = newest ? newest.promptHash ?? NO_HASH_KEY : null;

  const promptGroups: PromptGroupSummary[] = [];
  for (const [hash, group] of groupMap) {
    let total = 0;
    let best = -Infinity;
    let worst = Infinity;
    for (const { harnessScore } of group) {
      if (harnessScore === null) continue; // unreachable after the filter above
      total += harnessScore;
      if (harnessScore > best) best = harnessScore;
      if (harnessScore < worst) worst = harnessScore;
    }

    promptGroups.push({
      promptHash: hash,
      runs: group.length,
      avg: Math.round((total / group.length) * 10) / 10, // one decimal place
      best,
      worst,
      firstSeen: group[group.length - 1].startedAt,
      lastSeen: group[0].startedAt,
      isCurrent: hash === currentKey,
    });
  }

  // Most recently used prompt first.
  promptGroups.sort((a, b) => b.lastSeen.localeCompare(a.lastSeen));

  return { entries, promptGroups, totalRuns: entries.length };
}
134
+
135
+ // ─── Display ─────────────────────────────────────────────────────────────────
136
+
137
/**
 * Render a score as a fixed-width bar of 10 cells.
 *
 * The score is clamped to [0, 10] before rounding, because
 * `String.prototype.repeat` throws a RangeError for a negative count and an
 * out-of-range score would otherwise crash the report.
 *
 * @param score Harness score, nominally 0-10. NaN renders as an empty bar.
 * @returns Exactly 10 characters: filled cells followed by empty cells.
 */
function scoreBar(score: number): string {
  const WIDTH = 10;
  const clamped = Number.isNaN(score) ? 0 : Math.min(WIDTH, Math.max(0, score));
  const filled = Math.round(clamped);
  return "█".repeat(filled) + "░".repeat(WIDTH - filled);
}
141
+
142
/**
 * Colour `text` by score band: green from 8 up, yellow from 6 up, red below
 * that (and for NaN, which fails both comparisons).
 */
function scoreColor(score: number, text: string): string {
  return score >= 8
    ? chalk.green(text)
    : score >= 6
      ? chalk.yellow(text)
      : chalk.red(text);
}
147
+
148
/** Keep only the leading date portion (YYYY-MM-DD) of a timestamp string. */
function formatDate(iso: string): string {
  const dateLength = "YYYY-MM-DD".length;
  return iso.substring(0, dateLength);
}
151
+
152
/**
 * Format a duration as a short label: seconds only below one minute
 * ("42s"), minutes and seconds otherwise ("3m7s").
 *
 * @param ms Duration in milliseconds, or null when it was not recorded.
 * @returns The label, or the " — " placeholder when `ms` is null or not a
 *   finite number. Run logs are read from disk without schema validation, so
 *   a malformed value must not be printed as "NaNmNaNs".
 */
function formatDuration(ms: number | null): string {
  const UNKNOWN = " — ";
  if (ms === null || !Number.isFinite(ms)) return UNKNOWN;

  const totalSeconds = Math.round(ms / 1000);
  if (totalSeconds < 60) return `${totalSeconds}s`;
  return `${Math.floor(totalSeconds / 60)}m${totalSeconds % 60}s`;
}
158
+
159
/**
 * Display name for a spec file: its basename, or a gray dash when no spec
 * path is available (null or empty string).
 */
function shortSpec(specPath: string | null): string {
  return specPath ? path.basename(specPath) : chalk.gray("—");
}
163
+
164
+ export function printTrendReport(report: TrendReport, workingDir: string): void { // Prints the report to the console: banner, prompt-version table, per-run history, footer.
165
+ const { entries, promptGroups } = report;
166
+
167
+ console.log(chalk.cyan("\n─── Harness Trend ───────────────────────────────────────────"));
168
+
169
+ if (entries.length === 0) { // nothing scored: print a hint plus the closing rule, then stop
170
+ console.log(chalk.gray(" No scored runs found. Run `ai-spec create` to start tracking."));
171
+ console.log(chalk.cyan("─".repeat(63)));
172
+ return;
173
+ }
174
+
175
+ // ── Prompt Version Summary ────────────────────────────────────────
176
+ if (promptGroups.length > 0) {
177
+ console.log(chalk.bold("\n Prompt Versions:\n"));
178
+
179
+ const colWidths = { // only `hash` and `runs` are read below; avg/best/worst are not referenced
180
+ hash: 10,
181
+ runs: 5,
182
+ avg: 5,
183
+ best: 5,
184
+ worst: 5,
185
+ };
186
+
187
+ // header
188
+ console.log(
189
+ chalk.gray(
190
+ " " +
191
+ "Hash ".padEnd(colWidths.hash) + " " +
192
+ "Runs ".padStart(colWidths.runs) + " " +
193
+ " Avg" + " " +
194
+ " Best" + " " +
195
+ "Worst" + " " +
196
+ "Last seen"
197
+ )
198
+ );
199
+ console.log(chalk.gray(" " + "─".repeat(55)));
200
+
201
+ for (const g of promptGroups) { // one table row per prompt hash, in the order given by the report
202
+ const currentMark = g.isCurrent ? chalk.cyan(" ◀ current") : "";
203
+ const avgStr = scoreColor(g.avg, g.avg.toFixed(1).padStart(5)); // coloured by the scoreColor bands
204
+ const bestStr = chalk.green(g.best.toFixed(1).padStart(5)); // always green, whatever the value
205
+ const worstStr = g.worst < 6 ? chalk.red(g.worst.toFixed(1).padStart(5)) : chalk.yellow(g.worst.toFixed(1).padStart(5)); // red below 6, otherwise yellow (never green)
206
+
207
+ console.log(
208
+ " " +
209
+ chalk.white(g.promptHash.padEnd(colWidths.hash)) + " " +
210
+ chalk.gray(String(g.runs).padStart(colWidths.runs)) + " " +
211
+ avgStr + " " +
212
+ bestStr + " " +
213
+ worstStr + " " +
214
+ chalk.gray(formatDate(g.lastSeen)) +
215
+ currentMark
216
+ );
217
+ }
218
+ }
219
+
220
+ // ── Run History ───────────────────────────────────────────────────
221
+ console.log(chalk.bold("\n Run History:\n"));
222
+
223
+ for (const e of entries) { // one line per run, in the order given by the report
224
+ const score = e.harnessScore as number; // assumes the report only contains scored entries (buildTrendReport filters out null scores)
225
+ const bar = scoreColor(score, `[${scoreBar(score)}]`);
226
+ const scoreStr = scoreColor(score, score.toFixed(1).padStart(4));
227
+ const hash = e.promptHash ? chalk.gray(e.promptHash) : chalk.gray("(no hash)");
228
+ const dur = chalk.gray(formatDuration(e.totalDurationMs));
229
+ const errMark = e.errors > 0 ? chalk.yellow(` ⚠${e.errors}err`) : ""; // shown only when the run recorded errors
230
+ const spec = chalk.gray(shortSpec(e.specPath));
231
+
232
+ console.log(
233
+ ` ${chalk.gray(formatDate(e.startedAt))} ${bar}${scoreStr} ${hash} ${dur}${errMark} ${spec}`
234
+ );
235
+ }
236
+
237
+ // ── Footer ────────────────────────────────────────────────────────
238
+ const logRelDir = path.relative(workingDir, path.join(workingDir, LOG_DIR)); // log folder path expressed relative to workingDir
239
+ console.log(chalk.gray(`\n ${entries.length} run(s) shown · logs: ${logRelDir}/`));
240
+ console.log(chalk.cyan("─".repeat(63)));
241
+ }
@@ -18,8 +18,14 @@ export interface SelfEvalResult {
18
18
  detail: {
19
19
  endpointsTotal: number;
20
20
  endpointLayerCovered: boolean;
21
+ /** Number of endpoint-layer files generated */
22
+ endpointLayerFiles: number;
21
23
  modelsTotal: number;
22
24
  modelLayerCovered: boolean;
25
+ /** 0-1: fraction of DSL model names found in generated file paths */
26
+ modelNameCoverage: number;
27
+ /** Number of DSL model names actually matched in file paths */
28
+ modelNameMatched: number;
23
29
  filesWritten: number;
24
30
  };
25
31
  }
@@ -57,6 +63,32 @@ function extractReviewScore(reviewText: string): number | null {
57
63
 
58
64
  // ─── Main ─────────────────────────────────────────────────────────────────────
59
65
 
66
/**
 * Turn a model name into the lowercase search tokens that could appear in a
 * generated file path.
 *
 *   "OrderItem"  → ["orderitem", "order-item", "order_item"]
 *   "User"       → ["user"]
 *   "APIKey"     → ["apikey", "api-key", "api_key"]
 *   "order_item" → ["order_item", "orderitem", "order-item"]
 *   ""           → []
 *
 * Words are split at camelCase boundaries and at `-`, `_` or whitespace.
 * A run of capitals is kept together as one word, so acronyms are not split
 * letter by letter. A blank name yields no tokens: an empty token would match
 * every path through `includes("")`.
 *
 * @param name PascalCase, camelCase, kebab-case or snake_case model name.
 * @returns Unique tokens, the plain lowercased name first.
 */
export function modelNameTokens(name: string): string[] {
  const trimmed = name.trim();
  if (trimmed === "") return [];

  const parts = trimmed
    .replace(/([A-Z]+)([A-Z][a-z])/g, "$1-$2") // "APIKey" → "API-Key"
    .replace(/([a-z0-9])([A-Z])/g, "$1-$2") // "orderItem" → "order-Item"
    .toLowerCase()
    .split(/[-_\s]+/)
    .filter(Boolean);

  const tokens = new Set<string>([trimmed.toLowerCase()]);
  if (parts.length > 1) {
    tokens.add(parts.join(""));
    tokens.add(parts.join("-"));
    tokens.add(parts.join("_"));
  }
  return [...tokens];
}
91
+
60
92
  /**
61
93
  * Run a lightweight self-evaluation at the end of `ai-spec create`.
62
94
  *
@@ -71,6 +103,18 @@ function extractReviewScore(reviewText: string): number | null {
71
103
  * | DSL Coverage | 40 % | 55 % |
72
104
  * | Compile/Error | 30 % | 45 % |
73
105
  * | Review Score | 30 % | — |
106
+ *
107
+ * DSL Coverage Score breakdown (0-10):
108
+ * Tier 1 — Layer existence (same as before):
109
+ * - No files generated → 0 (early exit)
110
+ * - Endpoints declared but no endpoint layer → -4
111
+ * - Models declared but no model layer → -3
112
+ * Tier 2 — Model name coverage (new):
113
+ * - coverage < 50 % → -2
114
+ * - coverage 50–79 % → -1
115
+ * - coverage ≥ 80 % → 0
116
+ * Tier 3 — Endpoint file adequacy (new):
117
+ * - ≥5 endpoints declared but only 1 endpoint-layer file → -1
74
118
  */
75
119
  export function runSelfEval(opts: {
76
120
  dsl: SpecDSL | null;
@@ -91,18 +135,55 @@ export function runSelfEval(opts: {
91
135
  const endpointLayerCovered = generatedFiles.some((f) =>
92
136
  ENDPOINT_LAYER_PATTERNS.some((p) => p.test(f))
93
137
  );
138
+ const endpointLayerFiles = generatedFiles.filter((f) =>
139
+ ENDPOINT_LAYER_PATTERNS.some((p) => p.test(f))
140
+ ).length;
94
141
  const modelLayerCovered = generatedFiles.some((f) =>
95
142
  MODEL_LAYER_PATTERNS.some((p) => p.test(f))
96
143
  );
97
144
 
145
+ // ── Tier 2: Model name coverage ───────────────────────────────────────────
146
+ // For each DSL model, check if its name (lowercased/tokenized) appears
147
+ // in any generated file path. This catches "User model was declared but
148
+ // no user.ts / user.model.ts was generated".
149
+ let modelNameMatched = 0;
150
+ if (modelsTotal > 0 && dsl?.models) {
151
+ for (const model of dsl.models) {
152
+ const tokens = modelNameTokens(model.name);
153
+ const found = generatedFiles.some((f) => {
154
+ const lf = f.toLowerCase();
155
+ return tokens.some((t) => lf.includes(t));
156
+ });
157
+ if (found) modelNameMatched++;
158
+ }
159
+ }
160
+ const modelNameCoverage = modelsTotal > 0 ? modelNameMatched / modelsTotal : 1;
161
+
162
+ // ── Compute DSL Coverage Score ────────────────────────────────────────────
98
163
  let dslCoverageScore = 10;
164
+
99
165
  if (generatedFiles.length === 0) {
100
166
  dslCoverageScore = 0;
101
167
  } else {
168
+ // Tier 1: layer existence
102
169
  if (endpointsTotal > 0 && !endpointLayerCovered) dslCoverageScore -= 4;
103
170
  if (modelsTotal > 0 && !modelLayerCovered) dslCoverageScore -= 3;
171
+
172
+ // Tier 2: model name coverage (only meaningful when model layer exists)
173
+ if (modelsTotal > 0 && modelLayerCovered) {
174
+ if (modelNameCoverage < 0.5) dslCoverageScore -= 2;
175
+ else if (modelNameCoverage < 0.8) dslCoverageScore -= 1;
176
+ }
177
+
178
+ // Tier 3: endpoint file adequacy (many endpoints, very few files)
179
+ if (endpointsTotal >= 5 && endpointLayerCovered && endpointLayerFiles < 2) {
180
+ dslCoverageScore -= 1;
181
+ }
104
182
  }
105
183
 
184
+ // clamp to [0, 10]
185
+ dslCoverageScore = Math.max(0, Math.min(10, dslCoverageScore));
186
+
106
187
  // ── Compile Score ─────────────────────────────────────────────────────────
107
188
  // 10 = clean pass, 5 = error feedback ran but didn't fully clear / was skipped
108
189
  const compileScore = compilePassed ? 10 : 5;
@@ -124,8 +205,11 @@ export function runSelfEval(opts: {
124
205
  detail: {
125
206
  endpointsTotal,
126
207
  endpointLayerCovered,
208
+ endpointLayerFiles,
127
209
  modelsTotal,
128
210
  modelLayerCovered,
211
+ modelNameCoverage: Math.round(modelNameCoverage * 100) / 100,
212
+ modelNameMatched,
129
213
  filesWritten: generatedFiles.length,
130
214
  },
131
215
  };
@@ -138,6 +222,9 @@ export function runSelfEval(opts: {
138
222
  compileScore,
139
223
  reviewScore: reviewScore ?? undefined,
140
224
  promptHash,
225
+ modelNameCoverage: result.detail.modelNameCoverage,
226
+ modelNameMatched: result.detail.modelNameMatched,
227
+ endpointLayerFiles: result.detail.endpointLayerFiles,
141
228
  });
142
229
 
143
230
  return result;
@@ -161,12 +248,29 @@ export function printSelfEval(result: SelfEvalResult): void {
161
248
  ? `Review: ${result.reviewScore}/10`
162
249
  : chalk.gray("Review: skipped");
163
250
 
251
+ // Model coverage tag (only shown when there are declared models)
252
+ let modelCoverageTag = "";
253
+ if (result.detail.modelsTotal > 0) {
254
+ const pct = Math.round(result.detail.modelNameCoverage * 100);
255
+ const tag = `Models: ${result.detail.modelNameMatched}/${result.detail.modelsTotal} (${pct}%)`;
256
+ modelCoverageTag = pct >= 80
257
+ ? chalk.green(tag)
258
+ : pct >= 50
259
+ ? chalk.yellow(tag)
260
+ : chalk.red(tag);
261
+ }
262
+
164
263
  console.log(chalk.cyan("\n─── Harness Self-Eval ───────────────────────────"));
165
264
  console.log(` Score : ${scoreColor(`[${bar}] ${result.harnessScore}/10`)}`);
166
265
  console.log(
167
- ` DSL : ${scoreColor(result.dslCoverageScore + "/10")} ` +
266
+ ` DSL : ${scoreColor(String(result.dslCoverageScore) + "/10")} ` +
168
267
  `Compile: ${compileTag} ${reviewTag}`
169
268
  );
269
+ if (modelCoverageTag) {
270
+ console.log(` Detail : ${modelCoverageTag} ` +
271
+ chalk.gray(`Endpoints: ${result.detail.endpointsTotal} Files: ${result.detail.filesWritten}`)
272
+ );
273
+ }
170
274
  console.log(chalk.gray(` Prompt : ${result.promptHash}`));
171
- console.log(chalk.gray("─".repeat(49)));
275
+ console.log(chalk.cyan("─".repeat(49)));
172
276
  }