ai-spec-dev 0.30.1 → 0.33.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.claude/settings.local.json +5 -1
- package/README.md +29 -1
- package/RELEASE_LOG.md +188 -0
- package/cli/commands/config.ts +93 -0
- package/cli/commands/export.ts +66 -0
- package/cli/commands/init.ts +153 -0
- package/cli/commands/learn.ts +30 -0
- package/cli/commands/logs.ts +106 -0
- package/cli/commands/model.ts +156 -0
- package/cli/commands/restore.ts +22 -0
- package/cli/commands/review.ts +63 -0
- package/cli/commands/trend.ts +36 -0
- package/cli/commands/update.ts +178 -0
- package/cli/commands/workspace.ts +219 -0
- package/cli/index.ts +301 -1
- package/cli/utils.ts +83 -0
- package/core/dsl-feedback.ts +255 -0
- package/core/prompt-hasher.ts +42 -0
- package/core/run-logger.ts +21 -0
- package/core/run-trend.ts +241 -0
- package/core/self-evaluator.ts +276 -0
- package/dist/cli/index.js +1089 -445
- package/dist/cli/index.js.map +1 -1
- package/dist/cli/index.mjs +1089 -445
- package/dist/cli/index.mjs.map +1 -1
- package/dist/index.js.map +1 -1
- package/dist/index.mjs.map +1 -1
- package/package.json +6 -3
- package/purpose.md +189 -2
- package/tests/dsl-extractor.test.ts +264 -0
- package/tests/dsl-feedback.test.ts +266 -0
- package/tests/dsl-validator.test.ts +283 -0
- package/tests/error-feedback.test.ts +292 -0
- package/tests/provider-utils.test.ts +173 -0
- package/tests/run-trend.test.ts +186 -0
- package/tests/self-evaluator.test.ts +339 -0
- package/tests/spec-assessor.test.ts +142 -0
- package/tests/task-generator.test.ts +230 -0
|
@@ -0,0 +1,276 @@
|
|
|
1
|
+
import chalk from "chalk";
|
|
2
|
+
import { SpecDSL } from "./dsl-types";
|
|
3
|
+
import { RunLogger } from "./run-logger";
|
|
4
|
+
|
|
5
|
+
// ─── Types ────────────────────────────────────────────────────────────────────
|
|
6
|
+
|
|
7
|
+
/**
 * Outcome of one deterministic self-evaluation pass.
 *
 * All scores are on a 0-10 scale. `harnessScore` is the weighted blend of the
 * other three and is the value recorded in the RunLog next to `promptHash`.
 */
export interface SelfEvalResult {
  /**
   * 0-10: how well the generated files cover the endpoint and model layers
   * declared in the DSL.
   */
  dslCoverageScore: number;

  /** 0-10: 10 when error feedback passed cleanly, 5 when partial or skipped. */
  compileScore: number;

  /** 0-10 score pulled from the 3-pass review text; `null` if review was skipped. */
  reviewScore: number | null;

  /** 0-10 weighted overall value — the "Harness Score" recorded in RunLog. */
  harnessScore: number;

  /** Hash of the prompt set that was active when this run executed. */
  promptHash: string;

  /** Raw counters behind the scores, kept for display and logging. */
  detail: {
    /** Number of endpoints declared in the DSL. */
    endpointsTotal: number;
    /** Whether at least one generated file sits in an endpoint-layer path. */
    endpointLayerCovered: boolean;
    /** Number of endpoint-layer files generated. */
    endpointLayerFiles: number;
    /** Number of models declared in the DSL. */
    modelsTotal: number;
    /** Whether at least one generated file sits in a model-layer path. */
    modelLayerCovered: boolean;
    /** 0-1: fraction of DSL model names found in generated file paths. */
    modelNameCoverage: number;
    /** Number of DSL model names actually matched in file paths. */
    modelNameMatched: number;
    /** Total number of generated files. */
    filesWritten: number;
  };
}
|
|
32
|
+
|
|
33
|
+
// ─── Helpers ──────────────────────────────────────────────────────────────────
|
|
34
|
+
|
|
35
|
+
/**
 * Path fragments that mark a generated file as belonging to the
 * API / controller / route layer. A file counts as endpoint-layer when any
 * one of these expressions matches somewhere in its path.
 */
const ENDPOINT_LAYER_PATTERNS: readonly RegExp[] = [
  /src\/api/, /src\/routes?/, /src\/controller/,
  /src\/handler/, /src\/endpoints?/,
];
|
|
43
|
+
|
|
44
|
+
/**
 * Path fragments that mark a generated file as belonging to the
 * data / model / schema layer. A file counts as model-layer when any one of
 * these expressions matches somewhere in its path.
 */
const MODEL_LAYER_PATTERNS: readonly RegExp[] = [
  /src\/model/, /src\/schema/, /src\/entit/, /src\/db/,
  /prisma/, /src\/data/, /src\/domain/,
];
|
|
54
|
+
|
|
55
|
+
/**
 * Extract a numeric score from review text.
 *
 * Matches the same "Score: X/10" pattern as `reviewer.ts → extractScore()`:
 * case-insensitive, optional whitespace around the number and the slash, and
 * an optional decimal part. Only the first occurrence in the text is used.
 *
 * The result is capped at 10 so that a malformed review such as
 * "Score: 15/10" cannot push downstream scores outside the 0-10 scale.
 *
 * @param reviewText - Raw review output to scan.
 * @returns The score in the range 0-10, or `null` when no score is present.
 */
function extractReviewScore(reviewText: string): number | null {
  const captured = reviewText.match(/Score:\s*(\d+(?:\.\d+)?)\s*\/\s*10/i)?.[1];
  if (captured === undefined) return null;

  // The pattern only admits unsigned numbers, so only the upper bound needs capping.
  return Math.min(10, parseFloat(captured));
}
|
|
63
|
+
|
|
64
|
+
// ─── Main ─────────────────────────────────────────────────────────────────────
|
|
65
|
+
|
|
66
|
+
/**
 * Normalize a model name to the set of lowercase search tokens that could
 * appear in a generated file path.
 *
 *   "OrderItem"  → ["orderitem", "order-item", "order_item"]
 *   "User"       → ["user"]
 *   "APIKey"     → ["apikey", "api-key", "api_key"]
 *   "order_item" → ["order_item", "orderitem", "order-item"]
 *
 * Words are split on camelCase / PascalCase boundaries and on existing
 * `-`, `_` or whitespace separators. A run of capitals is treated as one
 * word, so acronyms are not broken into single letters.
 *
 * @param name - Model name as declared in the DSL.
 * @returns Unique lowercase tokens; empty when `name` is blank, so that a
 *          blank name can never match a path via `includes("")`.
 */
export function modelNameTokens(name: string): string[] {
  const trimmed = name.trim();
  if (trimmed === "") return [];

  const parts = trimmed
    // lower/digit followed by upper: "orderItem" → "order-Item"
    .replace(/([a-z0-9])([A-Z])/g, "$1-$2")
    // end of a capital run before a capitalised word: "APIKey" → "API-Key"
    .replace(/([A-Z]+)([A-Z][a-z])/g, "$1-$2")
    .toLowerCase()
    .split(/[-_\s]+/)
    .filter(Boolean);

  // The Set keeps insertion order and drops duplicates.
  const tokens = new Set<string>([trimmed.toLowerCase()]);
  if (parts.length > 1) {
    tokens.add(parts.join(""));
    tokens.add(parts.join("-"));
    tokens.add(parts.join("_"));
  }
  return [...tokens];
}
|
|
91
|
+
|
|
92
|
+
/**
 * Run a lightweight self-evaluation at the end of `ai-spec create`.
 *
 * Design goals (Harness Engineering):
 *   - Zero AI calls: all scoring is deterministic path + text checks
 *   - Produces a single `harnessScore` (0-10) stored in RunLog alongside `promptHash`
 *   - Lets you compare runs across prompt versions: did harnessScore go up or down?
 *
 * Scoring weights:
 *   | Dimension     | Weight (with review) | Weight (review skipped) |
 *   |---------------|----------------------|-------------------------|
 *   | DSL Coverage  | 40 %                 | 55 %                    |
 *   | Compile/Error | 30 %                 | 45 %                    |
 *   | Review Score  | 30 %                 | —                       |
 *
 * DSL Coverage Score breakdown (starts at 10, clamped to 0-10):
 *   Tier 1 — Layer existence:
 *     - No files generated                          → 0 (no further checks)
 *     - Endpoints declared but no endpoint layer    → -4
 *     - Models declared but no model layer          → -3
 *   Tier 2 — Model name coverage (only when a model layer exists):
 *     - coverage < 50 %                             → -2
 *     - coverage 50–79 %                            → -1
 *     - coverage ≥ 80 %                             →  0
 *   Tier 3 — Endpoint file adequacy:
 *     - ≥5 endpoints declared but only 1 endpoint-layer file → -1
 *
 * Path separators are normalized to `/` before matching, so file lists that
 * use `\` are scored the same as their `/` equivalents.
 *
 * Side effects: calls `logger.setHarnessScore()` and
 * `logger.stageEnd("self_eval", …)` with the computed values.
 *
 * @param opts - Inputs gathered from the preceding pipeline stages.
 * @returns The individual scores, the blended harness score and raw counters.
 */
export function runSelfEval(opts: {
  dsl: SpecDSL | null;
  generatedFiles: string[];
  /** true = error-feedback loop ended with all checks passing */
  compilePassed: boolean;
  /** Full text of the 3-pass review output; empty string if review was skipped */
  reviewText: string;
  promptHash: string;
  logger: RunLogger;
}): SelfEvalResult {
  const { dsl, generatedFiles, compilePassed, reviewText, promptHash, logger } = opts;

  // The layer patterns are written with "/" only; normalize once up front so
  // backslash-separated paths are recognised too.
  const normalizedPaths = generatedFiles.map((f) => f.replace(/\\/g, "/"));
  // Lowercase once instead of once per (model, file) pair.
  const lowerPaths = normalizedPaths.map((f) => f.toLowerCase());

  // ── Layer detection ───────────────────────────────────────────────────────
  const endpointsTotal = dsl?.endpoints?.length ?? 0;
  const modelsTotal = dsl?.models?.length ?? 0;

  const endpointLayerFiles = normalizedPaths.filter((f) =>
    ENDPOINT_LAYER_PATTERNS.some((p) => p.test(f))
  ).length;
  const endpointLayerCovered = endpointLayerFiles > 0;
  const modelLayerCovered = normalizedPaths.some((f) =>
    MODEL_LAYER_PATTERNS.some((p) => p.test(f))
  );

  // ── Tier 2 input: model name coverage ─────────────────────────────────────
  // A model counts as matched when any of its name tokens appears in any
  // generated path. This catches "User model was declared but no user.ts /
  // user.model.ts was generated".
  let modelNameMatched = 0;
  for (const model of dsl?.models ?? []) {
    const tokens = modelNameTokens(model.name);
    // Skip empty tokens: "".includes-matching would mark every model as found.
    const found = lowerPaths.some((p) => tokens.some((t) => t !== "" && p.includes(t)));
    if (found) modelNameMatched++;
  }
  // No declared models means nothing can be missing → full coverage.
  const modelNameCoverage = modelsTotal > 0 ? modelNameMatched / modelsTotal : 1;

  // ── DSL Coverage Score ────────────────────────────────────────────────────
  let dslCoverageScore = 10;

  if (generatedFiles.length === 0) {
    dslCoverageScore = 0;
  } else {
    // Tier 1: layer existence
    if (endpointsTotal > 0 && !endpointLayerCovered) dslCoverageScore -= 4;
    if (modelsTotal > 0 && !modelLayerCovered) dslCoverageScore -= 3;

    // Tier 2: model name coverage (only meaningful when model layer exists)
    if (modelsTotal > 0 && modelLayerCovered) {
      if (modelNameCoverage < 0.5) dslCoverageScore -= 2;
      else if (modelNameCoverage < 0.8) dslCoverageScore -= 1;
    }

    // Tier 3: endpoint file adequacy (many endpoints, very few files)
    if (endpointsTotal >= 5 && endpointLayerCovered && endpointLayerFiles < 2) {
      dslCoverageScore -= 1;
    }
  }

  // clamp to [0, 10]
  dslCoverageScore = Math.max(0, Math.min(10, dslCoverageScore));

  // ── Compile Score ─────────────────────────────────────────────────────────
  // 10 = clean pass, 5 = error feedback ran but didn't fully clear / was skipped
  const compileScore = compilePassed ? 10 : 5;

  // ── Review Score ──────────────────────────────────────────────────────────
  // Clamp here as well so an out-of-range value in the review text cannot
  // push the blended score outside 0-10.
  const extractedReview = reviewText ? extractReviewScore(reviewText) : null;
  const reviewScore =
    extractedReview === null ? null : Math.max(0, Math.min(10, extractedReview));

  // ── Harness Score (weighted average, one decimal place) ───────────────────
  const weighted =
    reviewScore !== null
      ? dslCoverageScore * 0.4 + compileScore * 0.3 + reviewScore * 0.3
      : dslCoverageScore * 0.55 + compileScore * 0.45;
  const harnessScore = Math.round(weighted * 10) / 10;

  const result: SelfEvalResult = {
    dslCoverageScore,
    compileScore,
    reviewScore,
    harnessScore,
    promptHash,
    detail: {
      endpointsTotal,
      endpointLayerCovered,
      endpointLayerFiles,
      modelsTotal,
      modelLayerCovered,
      // stored with two decimals
      modelNameCoverage: Math.round(modelNameCoverage * 100) / 100,
      modelNameMatched,
      filesWritten: generatedFiles.length,
    },
  };

  // Persist to RunLog
  logger.setHarnessScore(harnessScore);
  logger.stageEnd("self_eval", {
    harnessScore,
    dslCoverageScore,
    compileScore,
    reviewScore: reviewScore ?? undefined,
    promptHash,
    modelNameCoverage: result.detail.modelNameCoverage,
    modelNameMatched: result.detail.modelNameMatched,
    endpointLayerFiles: result.detail.endpointLayerFiles,
  });

  return result;
}
|
|
232
|
+
|
|
233
|
+
// ─── Display ──────────────────────────────────────────────────────────────────
|
|
234
|
+
|
|
235
|
+
/**
 * Print a compact, colourised summary of a self-evaluation result to stdout.
 *
 * Output consists of a header rule, a 10-cell score bar with the harness
 * score, a line with the DSL / compile / review sub-scores, an optional
 * detail line (only when the DSL declared models), the prompt hash, and a
 * closing rule.
 *
 * Colour thresholds for 0-10 scores: ≥ 8 green, ≥ 6 yellow, otherwise red.
 *
 * @param result - The evaluation result to render.
 */
export function printSelfEval(result: SelfEvalResult): void {
  const colorFor = (score: number) =>
    score >= 8 ? chalk.green :
    score >= 6 ? chalk.yellow :
    chalk.red;

  const scoreColor = colorFor(result.harnessScore);
  // The DSL sub-score is coloured by its own value, not by the overall score,
  // so a perfect 10/10 is not shown in red just because the blend is low.
  const dslColor = colorFor(result.dslCoverageScore);

  // String.prototype.repeat throws a RangeError on a negative count, so keep
  // the fill level inside the 10-cell bar even for out-of-range or NaN scores.
  const rounded = Math.round(result.harnessScore);
  const filled = Number.isFinite(rounded) ? Math.max(0, Math.min(10, rounded)) : 0;
  const bar = "█".repeat(filled) + "░".repeat(10 - filled);

  const compileTag = result.compileScore === 10
    ? chalk.green("pass")
    : chalk.yellow("partial");
  const reviewTag = result.reviewScore !== null
    ? `Review: ${result.reviewScore}/10`
    : chalk.gray("Review: skipped");

  // Model coverage tag (only shown when there are declared models)
  let modelCoverageTag = "";
  if (result.detail.modelsTotal > 0) {
    const pct = Math.round(result.detail.modelNameCoverage * 100);
    const tag = `Models: ${result.detail.modelNameMatched}/${result.detail.modelsTotal} (${pct}%)`;
    modelCoverageTag = pct >= 80
      ? chalk.green(tag)
      : pct >= 50
        ? chalk.yellow(tag)
        : chalk.red(tag);
  }

  console.log(chalk.cyan("\n─── Harness Self-Eval ───────────────────────────"));
  console.log(` Score : ${scoreColor(`[${bar}] ${result.harnessScore}/10`)}`);
  console.log(
    ` DSL : ${dslColor(String(result.dslCoverageScore) + "/10")} ` +
    `Compile: ${compileTag} ${reviewTag}`
  );
  if (modelCoverageTag) {
    console.log(` Detail : ${modelCoverageTag} ` +
      chalk.gray(`Endpoints: ${result.detail.endpointsTotal} Files: ${result.detail.filesWritten}`)
    );
  }
  console.log(chalk.gray(` Prompt : ${result.promptHash}`));
  console.log(chalk.cyan("─".repeat(49)));
}
|