ai-spec-dev 0.30.1 → 0.33.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.claude/settings.local.json +5 -1
- package/README.md +29 -1
- package/RELEASE_LOG.md +188 -0
- package/cli/commands/config.ts +93 -0
- package/cli/commands/export.ts +66 -0
- package/cli/commands/init.ts +153 -0
- package/cli/commands/learn.ts +30 -0
- package/cli/commands/logs.ts +106 -0
- package/cli/commands/model.ts +156 -0
- package/cli/commands/restore.ts +22 -0
- package/cli/commands/review.ts +63 -0
- package/cli/commands/trend.ts +36 -0
- package/cli/commands/update.ts +178 -0
- package/cli/commands/workspace.ts +219 -0
- package/cli/index.ts +301 -1
- package/cli/utils.ts +83 -0
- package/core/dsl-feedback.ts +255 -0
- package/core/prompt-hasher.ts +42 -0
- package/core/run-logger.ts +21 -0
- package/core/run-trend.ts +241 -0
- package/core/self-evaluator.ts +276 -0
- package/dist/cli/index.js +1089 -445
- package/dist/cli/index.js.map +1 -1
- package/dist/cli/index.mjs +1089 -445
- package/dist/cli/index.mjs.map +1 -1
- package/dist/index.js.map +1 -1
- package/dist/index.mjs.map +1 -1
- package/package.json +6 -3
- package/purpose.md +189 -2
- package/tests/dsl-extractor.test.ts +264 -0
- package/tests/dsl-feedback.test.ts +266 -0
- package/tests/dsl-validator.test.ts +283 -0
- package/tests/error-feedback.test.ts +292 -0
- package/tests/provider-utils.test.ts +173 -0
- package/tests/run-trend.test.ts +186 -0
- package/tests/self-evaluator.test.ts +339 -0
- package/tests/spec-assessor.test.ts +142 -0
- package/tests/task-generator.test.ts +230 -0
|
@@ -0,0 +1,255 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* dsl-feedback.ts — Two pipeline feedback loops for ai-spec create
|
|
3
|
+
*
|
|
4
|
+
* Loop 1 (DSL → Spec): after DSL extraction, detect sparse/incomplete DSL
|
|
5
|
+
* and offer a targeted spec refinement pass before codegen starts.
|
|
6
|
+
*
|
|
7
|
+
* Loop 2 (Review → DSL): after 3-pass review, detect design-level findings
|
|
8
|
+
* (as opposed to implementation issues) and offer to amend the spec + DSL
|
|
9
|
+
* so the next update/regen starts from a corrected contract.
|
|
10
|
+
*
|
|
11
|
+
* Design constraints:
|
|
12
|
+
* - Both loops are SKIPPED in --auto / --fast / --skip-dsl modes.
|
|
13
|
+
* - Zero extra AI calls until the user explicitly opts in.
|
|
14
|
+
* - Non-blocking: user can always skip.
|
|
15
|
+
*/
|
|
16
|
+
|
|
17
|
+
import chalk from "chalk";
|
|
18
|
+
import { SpecDSL } from "./dsl-types";
|
|
19
|
+
|
|
20
|
+
// ─── Loop 1 Types ─────────────────────────────────────────────────────────────
|
|
21
|
+
|
|
22
|
+
/**
 * One heuristic completeness gap found in an extracted SpecDSL.
 * Produced by `assessDslRichness`; consumed by the Loop 1 refinement prompt
 * and serialised into the RunLog.
 */
export interface DslGap {
  /** Short machine key for RunLog serialisation */
  code: "sparse_model" | "missing_errors" | "generic_endpoint_desc" | "no_models_no_endpoints";
  /** Human-readable message shown to the user */
  message: string;
  /** Concrete suggestion injected into the refinement prompt */
  hint: string;
}
|
|
30
|
+
|
|
31
|
+
// ─── Loop 1: DSL Richness Assessment ─────────────────────────────────────────
|
|
32
|
+
|
|
33
|
+
/**
|
|
34
|
+
* Inspect a freshly-extracted DSL for common completeness gaps.
|
|
35
|
+
* Returns a list of DslGap objects (empty = DSL looks adequate).
|
|
36
|
+
*
|
|
37
|
+
* All checks are pure heuristics — zero AI calls.
|
|
38
|
+
*/
|
|
39
|
+
export function assessDslRichness(dsl: SpecDSL): DslGap[] {
|
|
40
|
+
const gaps: DslGap[] = [];
|
|
41
|
+
|
|
42
|
+
// ── No endpoints AND no models ────────────────────────────────────────────
|
|
43
|
+
if (dsl.endpoints.length === 0 && dsl.models.length === 0) {
|
|
44
|
+
gaps.push({
|
|
45
|
+
code: "no_models_no_endpoints",
|
|
46
|
+
message: "DSL has no endpoints and no models — spec may be too abstract for structured extraction",
|
|
47
|
+
hint: "Please add explicit API endpoint definitions (method, path, request/response) and any data models that this feature requires.",
|
|
48
|
+
});
|
|
49
|
+
return gaps; // no point checking the rest
|
|
50
|
+
}
|
|
51
|
+
|
|
52
|
+
// ── Endpoints with very generic / short descriptions ─────────────────────
|
|
53
|
+
const GENERIC_DESC_KEYWORDS = ["handles", "processes", "manages", "操作", "处理", "管理"];
|
|
54
|
+
const GENERIC_DESC_MIN_LEN = 15;
|
|
55
|
+
|
|
56
|
+
for (const ep of dsl.endpoints) {
|
|
57
|
+
const desc = (ep.description ?? "").trim();
|
|
58
|
+
const isGeneric =
|
|
59
|
+
desc.length < GENERIC_DESC_MIN_LEN ||
|
|
60
|
+
GENERIC_DESC_KEYWORDS.some((kw) => desc.toLowerCase().startsWith(kw));
|
|
61
|
+
|
|
62
|
+
if (isGeneric) {
|
|
63
|
+
gaps.push({
|
|
64
|
+
code: "generic_endpoint_desc",
|
|
65
|
+
message: `Endpoint ${ep.method} ${ep.path} has a vague description: "${desc}"`,
|
|
66
|
+
hint: `Clarify what ${ep.method} ${ep.path} does: what inputs are required, what the success response contains, and what business rule it enforces.`,
|
|
67
|
+
});
|
|
68
|
+
}
|
|
69
|
+
}
|
|
70
|
+
|
|
71
|
+
// ── Endpoints with no error definitions (but spec text likely mentions them) ──
|
|
72
|
+
const endpointsWithoutErrors = dsl.endpoints.filter(
|
|
73
|
+
(ep) => !ep.errors || ep.errors.length === 0
|
|
74
|
+
);
|
|
75
|
+
if (endpointsWithoutErrors.length > 0 && dsl.endpoints.length >= 2) {
|
|
76
|
+
gaps.push({
|
|
77
|
+
code: "missing_errors",
|
|
78
|
+
message: `${endpointsWithoutErrors.length}/${dsl.endpoints.length} endpoints have no error definitions`,
|
|
79
|
+
hint: `For each endpoint, specify at least the main error cases: e.g. 400 validation errors, 401 auth failures, 404 not found, 409 conflict. Include an error code (e.g. INVALID_INPUT) and description for each.`,
|
|
80
|
+
});
|
|
81
|
+
}
|
|
82
|
+
|
|
83
|
+
// ── Models with fewer than 2 fields ──────────────────────────────────────
|
|
84
|
+
for (const model of dsl.models) {
|
|
85
|
+
if (!model.fields || model.fields.length < 2) {
|
|
86
|
+
gaps.push({
|
|
87
|
+
code: "sparse_model",
|
|
88
|
+
message: `Model "${model.name}" has only ${model.fields?.length ?? 0} field(s) — likely incomplete`,
|
|
89
|
+
hint: `List all fields for "${model.name}" with their types and whether they are required. Include at minimum an id, created_at, and the core domain fields this model needs.`,
|
|
90
|
+
});
|
|
91
|
+
}
|
|
92
|
+
}
|
|
93
|
+
|
|
94
|
+
return gaps;
|
|
95
|
+
}
|
|
96
|
+
|
|
97
|
+
// ─── Loop 1: Targeted Spec Refinement Prompt ─────────────────────────────────
|
|
98
|
+
|
|
99
|
+
/**
|
|
100
|
+
* Build a targeted AI refinement prompt that focuses the LLM on filling
|
|
101
|
+
* only the specific gaps detected by `assessDslRichness`.
|
|
102
|
+
*/
|
|
103
|
+
export function buildDslGapRefinementPrompt(spec: string, gaps: DslGap[]): string {
|
|
104
|
+
const gapList = gaps
|
|
105
|
+
.map((g, i) => `${i + 1}. [${g.code}] ${g.message}\n → ${g.hint}`)
|
|
106
|
+
.join("\n\n");
|
|
107
|
+
|
|
108
|
+
return `The following feature spec has been structurally analysed. The DSL extracted from it was found to be incomplete in these specific areas:
|
|
109
|
+
|
|
110
|
+
${gapList}
|
|
111
|
+
|
|
112
|
+
Your task: revise the spec below to address ONLY the gaps listed above.
|
|
113
|
+
- Do NOT change the overall feature scope or business logic.
|
|
114
|
+
- Do NOT rewrite sections that are already complete.
|
|
115
|
+
- Add missing error cases, clarify vague endpoint descriptions, complete sparse model field lists.
|
|
116
|
+
- Output ONLY the complete revised Markdown spec. No preamble, no explanation.
|
|
117
|
+
|
|
118
|
+
=== Current Spec ===
|
|
119
|
+
${spec}`;
|
|
120
|
+
}
|
|
121
|
+
|
|
122
|
+
// ─── Loop 2 Types ─────────────────────────────────────────────────────────────
|
|
123
|
+
|
|
124
|
+
export interface StructuralFinding {
|
|
125
|
+
/** Short label for display + RunLog */
|
|
126
|
+
category: "auth_design" | "model_design" | "api_contract" | "layer_violation" | "other_design";
|
|
127
|
+
description: string;
|
|
128
|
+
}
|
|
129
|
+
|
|
130
|
+
// ─── Loop 2: Review Structural Issue Classifier ───────────────────────────────
|
|
131
|
+
|
|
132
|
+
/**
|
|
133
|
+
* Parse a 3-pass review text to extract Pass 1 (architecture) findings
|
|
134
|
+
* that indicate design-level issues in the Spec/DSL — as opposed to
|
|
135
|
+
* implementation-level issues that belong in §9 knowledge.
|
|
136
|
+
*
|
|
137
|
+
* Returns an empty array if no structural issues are found or if the
|
|
138
|
+
* review score for Pass 1 is high (≥ 8), indicating overall approval.
|
|
139
|
+
*/
|
|
140
|
+
export function extractStructuralFindings(reviewText: string): StructuralFinding[] {
|
|
141
|
+
// Split by the separator used between passes ("─────...")
|
|
142
|
+
const parts = reviewText.split(/─{20,}/);
|
|
143
|
+
// Pass 1 is always the first section
|
|
144
|
+
const pass1Text = parts[0] ?? "";
|
|
145
|
+
|
|
146
|
+
// If Pass 1 scored well, treat as no structural issues
|
|
147
|
+
const pass1Score = extractPassScore(pass1Text);
|
|
148
|
+
if (pass1Score !== null && pass1Score >= 8) return [];
|
|
149
|
+
|
|
150
|
+
const findings: StructuralFinding[] = [];
|
|
151
|
+
|
|
152
|
+
// ── Auth / 认证 design issues ──────────────────────────────────────────
|
|
153
|
+
if (
|
|
154
|
+
/缺少认证|missing auth|auth.*false|未加认证|鉴权.*缺|endpoint.*public.*should/i.test(pass1Text)
|
|
155
|
+
) {
|
|
156
|
+
const match = pass1Text.match(/[^。\n]*(?:缺少认证|missing auth|auth.*false|未加认证|鉴权.*缺|endpoint.*public.*should)[^。\n]*/i);
|
|
157
|
+
findings.push({
|
|
158
|
+
category: "auth_design",
|
|
159
|
+
description: match ? match[0].trim() : "One or more endpoints may have incorrect authentication requirements",
|
|
160
|
+
});
|
|
161
|
+
}
|
|
162
|
+
|
|
163
|
+
// ── API contract / 接口设计 issues ────────────────────────────────────
|
|
164
|
+
if (
|
|
165
|
+
/接口设计.*问题|接口.*不合理|API design|response.*missing|request.*missing|接口.*缺少/i.test(pass1Text)
|
|
166
|
+
) {
|
|
167
|
+
const match = pass1Text.match(/[^。\n]*(?:接口设计.*问题|接口.*不合理|API design|response.*missing|接口.*缺少)[^。\n]*/i);
|
|
168
|
+
findings.push({
|
|
169
|
+
category: "api_contract",
|
|
170
|
+
description: match ? match[0].trim() : "API contract design may have issues",
|
|
171
|
+
});
|
|
172
|
+
}
|
|
173
|
+
|
|
174
|
+
// ── Model / 数据模型 design issues ────────────────────────────────────
|
|
175
|
+
if (
|
|
176
|
+
/模型.*缺少字段|model.*missing field|数据结构.*问题|schema.*incomplete|字段.*missing/i.test(pass1Text)
|
|
177
|
+
) {
|
|
178
|
+
const match = pass1Text.match(/[^。\n]*(?:模型.*缺少字段|model.*missing field|数据结构.*问题|schema.*incomplete)[^。\n]*/i);
|
|
179
|
+
findings.push({
|
|
180
|
+
category: "model_design",
|
|
181
|
+
description: match ? match[0].trim() : "Data model design may be incomplete",
|
|
182
|
+
});
|
|
183
|
+
}
|
|
184
|
+
|
|
185
|
+
// ── Layer separation / 层级分离 violations ────────────────────────────
|
|
186
|
+
if (
|
|
187
|
+
/层级.*违反|layer.*violation|business logic.*controller|controller.*service.*混|分层.*问题/i.test(pass1Text)
|
|
188
|
+
) {
|
|
189
|
+
const match = pass1Text.match(/[^。\n]*(?:层级.*违反|layer.*violation|business logic.*controller|分层.*问题)[^。\n]*/i);
|
|
190
|
+
findings.push({
|
|
191
|
+
category: "layer_violation",
|
|
192
|
+
description: match ? match[0].trim() : "Layer separation may be violated in the generated code",
|
|
193
|
+
});
|
|
194
|
+
}
|
|
195
|
+
|
|
196
|
+
return findings;
|
|
197
|
+
}
|
|
198
|
+
|
|
199
|
+
/** Extract the numeric score from a single pass section. */
|
|
200
|
+
function extractPassScore(text: string): number | null {
|
|
201
|
+
const m = text.match(/Score:\s*(\d+(?:\.\d+)?)\s*\/\s*10/i);
|
|
202
|
+
return m ? parseFloat(m[1]) : null;
|
|
203
|
+
}
|
|
204
|
+
|
|
205
|
+
// ─── Loop 2: Spec Amendment Prompt ────────────────────────────────────────────
|
|
206
|
+
|
|
207
|
+
/**
|
|
208
|
+
* Build a prompt asking the AI to produce a minimal spec amendment
|
|
209
|
+
* that addresses the structural findings from the review.
|
|
210
|
+
*
|
|
211
|
+
* The amendment is a targeted addition/correction — NOT a full rewrite.
|
|
212
|
+
*/
|
|
213
|
+
export function buildStructuralAmendmentPrompt(
|
|
214
|
+
spec: string,
|
|
215
|
+
findings: StructuralFinding[]
|
|
216
|
+
): string {
|
|
217
|
+
const findingList = findings
|
|
218
|
+
.map((f, i) => `${i + 1}. [${f.category}] ${f.description}`)
|
|
219
|
+
.join("\n");
|
|
220
|
+
|
|
221
|
+
return `A code review of the feature built from this spec found the following DESIGN-LEVEL issues.
|
|
222
|
+
These are problems in the spec/contract itself, not in the implementation.
|
|
223
|
+
|
|
224
|
+
=== Structural Findings ===
|
|
225
|
+
${findingList}
|
|
226
|
+
|
|
227
|
+
Your task:
|
|
228
|
+
- Revise the spec below to correct the design issues listed above.
|
|
229
|
+
- Do NOT change the feature scope, business logic, or sections unrelated to these findings.
|
|
230
|
+
- Be minimal: only change what is necessary to fix the design issues.
|
|
231
|
+
- Output ONLY the complete revised Markdown spec. No preamble, no explanation.
|
|
232
|
+
|
|
233
|
+
=== Current Spec ===
|
|
234
|
+
${spec}`;
|
|
235
|
+
}
|
|
236
|
+
|
|
237
|
+
// ─── Display Helpers ──────────────────────────────────────────────────────────
|
|
238
|
+
|
|
239
|
+
export function printDslGaps(gaps: DslGap[]): void {
|
|
240
|
+
console.log(chalk.yellow("\n ⚠ DSL Completeness Check — gaps detected:"));
|
|
241
|
+
for (const gap of gaps) {
|
|
242
|
+
console.log(chalk.yellow(` · ${gap.message}`));
|
|
243
|
+
}
|
|
244
|
+
console.log(chalk.gray(" → A targeted spec refinement can fill these gaps before codegen."));
|
|
245
|
+
}
|
|
246
|
+
|
|
247
|
+
export function printStructuralFindings(findings: StructuralFinding[]): void {
|
|
248
|
+
console.log(chalk.yellow("\n ⚠ Review — structural (design-level) issues found:"));
|
|
249
|
+
for (const f of findings) {
|
|
250
|
+
const label = chalk.gray(`[${f.category}]`);
|
|
251
|
+
console.log(` ${label} ${f.description}`);
|
|
252
|
+
}
|
|
253
|
+
console.log(chalk.gray(" → These are contract issues in the Spec/DSL, not just implementation problems."));
|
|
254
|
+
console.log(chalk.gray(" → Fixing the spec now means the next run generates correct code from the start."));
|
|
255
|
+
}
|
|
@@ -0,0 +1,42 @@
|
|
|
1
|
+
import { createHash } from "crypto";
|
|
2
|
+
|
|
3
|
+
import { codeGenSystemPrompt } from "../prompts/codegen.prompt";
|
|
4
|
+
import {
|
|
5
|
+
reviewArchitectureSystemPrompt,
|
|
6
|
+
reviewImplementationSystemPrompt,
|
|
7
|
+
reviewImpactComplexitySystemPrompt,
|
|
8
|
+
} from "../prompts/codegen.prompt";
|
|
9
|
+
import { dslSystemPrompt } from "../prompts/dsl.prompt";
|
|
10
|
+
import { specPrompt } from "../prompts/spec.prompt";
|
|
11
|
+
|
|
12
|
+
/**
|
|
13
|
+
* Compute a short deterministic hash of the key prompt strings used in a run.
|
|
14
|
+
*
|
|
15
|
+
* Why this matters (Harness Engineering):
|
|
16
|
+
* When you change a prompt and re-run `ai-spec create`, the resulting RunLog
|
|
17
|
+
* will have a different promptHash. Cross-referencing RunLogs by promptHash
|
|
18
|
+
* lets you quantify whether a prompt change improved or degraded harnessScore
|
|
19
|
+
* without keeping a separate changelog.
|
|
20
|
+
*
|
|
21
|
+
* Coverage: codegen system prompt (TS), DSL extractor, spec generator, and all
|
|
22
|
+
* three review-pass prompts — these drive the vast majority of token spend and
|
|
23
|
+
* output variance.
|
|
24
|
+
*
|
|
25
|
+
* Returns: 8-char lowercase hex (e.g. "a3f2c1d8"). Collision probability for
|
|
26
|
+
* practical prompt-tweak scenarios is negligible.
|
|
27
|
+
*/
|
|
28
|
+
export function computePromptHash(): string {
|
|
29
|
+
const segments = [
|
|
30
|
+
codeGenSystemPrompt,
|
|
31
|
+
dslSystemPrompt,
|
|
32
|
+
specPrompt,
|
|
33
|
+
reviewArchitectureSystemPrompt,
|
|
34
|
+
reviewImplementationSystemPrompt,
|
|
35
|
+
reviewImpactComplexitySystemPrompt,
|
|
36
|
+
];
|
|
37
|
+
|
|
38
|
+
return createHash("sha256")
|
|
39
|
+
.update(segments.join("\x00")) // \x00 separator prevents segment-boundary collisions
|
|
40
|
+
.digest("hex")
|
|
41
|
+
.slice(0, 8);
|
|
42
|
+
}
|
package/core/run-logger.ts
CHANGED
|
@@ -20,6 +20,15 @@ export interface RunLog {
|
|
|
20
20
|
provider?: string;
|
|
21
21
|
model?: string;
|
|
22
22
|
specPath?: string;
|
|
23
|
+
/**
|
|
24
|
+
* 8-char hex hash of the key prompt strings used in this run.
|
|
25
|
+
* Changes whenever any of: codegen, DSL, spec, or review prompts are edited.
|
|
26
|
+
* Use this to correlate RunLogs across runs and measure whether a prompt
|
|
27
|
+
* change improved or degraded harnessScore (Harness Engineering observability).
|
|
28
|
+
*/
|
|
29
|
+
promptHash?: string;
|
|
30
|
+
/** Harness self-evaluation score recorded at end of `create` (0-10). */
|
|
31
|
+
harnessScore?: number;
|
|
23
32
|
entries: LogEntry[];
|
|
24
33
|
filesWritten: string[];
|
|
25
34
|
errors: string[];
|
|
@@ -73,6 +82,18 @@ export class RunLogger {
|
|
|
73
82
|
this.flush();
|
|
74
83
|
}
|
|
75
84
|
|
|
85
|
+
  /**
   * Record the prompt hash for this run (call once at run start).
   * @param hash 8-char hex from `computePromptHash`.
   */
  setPromptHash(hash: string): void {
    this.log.promptHash = hash;
    this.flush(); // write-through: persist immediately, like the other setters
  }
|
|
90
|
+
|
|
91
|
+
  /**
   * Record the harness self-eval score (call once at run end).
   * @param score Self-evaluation score, 0-10 per the RunLog contract.
   */
  setHarnessScore(score: number): void {
    this.log.harnessScore = score;
    this.flush(); // write-through: persist immediately, like the other setters
  }
|
|
96
|
+
|
|
76
97
|
fileWritten(filePath: string): void {
|
|
77
98
|
if (!this.log.filesWritten.includes(filePath)) {
|
|
78
99
|
this.log.filesWritten.push(filePath);
|
|
@@ -0,0 +1,241 @@
|
|
|
1
|
+
import * as fs from "fs-extra";
|
|
2
|
+
import * as path from "path";
|
|
3
|
+
import chalk from "chalk";
|
|
4
|
+
import { RunLog } from "./run-logger";
|
|
5
|
+
|
|
6
|
+
const LOG_DIR = ".ai-spec-logs";
|
|
7
|
+
|
|
8
|
+
// ─── Types ────────────────────────────────────────────────────────────────────
|
|
9
|
+
|
|
10
|
+
/** One flattened, display-ready row derived from a single RunLog. */
export interface TrendEntry {
  runId: string;
  /** ISO-8601 timestamp; display code assumes a YYYY-MM-DD prefix */
  startedAt: string;
  /** null when the run was logged before prompt hashing existed */
  promptHash: string | null;
  /** null when the run was never self-evaluated (non-`create` runs) */
  harnessScore: number | null;
  specPath: string | null;
  provider: string | null;
  model: string | null;
  /** count of files written, not the file list itself */
  filesWritten: number;
  totalDurationMs: number | null;
  /** count of logged errors */
  errors: number;
}
|
|
22
|
+
|
|
23
|
+
/** Aggregate harness-score statistics for all runs sharing one prompt hash. */
export interface PromptGroupSummary {
  /** the shared hash; runs without one are bucketed under "(none)" */
  promptHash: string;
  /** number of scored runs in this group */
  runs: number;
  /** mean harnessScore, rounded to one decimal place */
  avg: number;
  best: number;
  worst: number;
  /** startedAt of the oldest run in the group */
  firstSeen: string;
  /** startedAt of the newest run in the group */
  lastSeen: string;
  /** true if this is the most recently used prompt hash */
  isCurrent: boolean;
}
|
|
34
|
+
|
|
35
|
+
/** Full output of `buildTrendReport`: filtered rows plus per-hash groups. */
export interface TrendReport {
  /** newest-first, already score-filtered and limited to `last` */
  entries: TrendEntry[];
  /** sorted most-recently-used first */
  promptGroups: PromptGroupSummary[];
  /** equals entries.length after filtering/limiting */
  totalRuns: number;
}
|
|
40
|
+
|
|
41
|
+
// ─── Loader ──────────────────────────────────────────────────────────────────
|
|
42
|
+
|
|
43
|
+
/**
|
|
44
|
+
* Read all RunLog JSON files from `.ai-spec-logs/`, sorted newest-first.
|
|
45
|
+
* Silently skips unreadable / corrupt files.
|
|
46
|
+
*/
|
|
47
|
+
export async function loadRunLogs(workingDir: string): Promise<RunLog[]> {
|
|
48
|
+
const logDir = path.join(workingDir, LOG_DIR);
|
|
49
|
+
if (!(await fs.pathExists(logDir))) return [];
|
|
50
|
+
|
|
51
|
+
const files = await fs.readdir(logDir);
|
|
52
|
+
const jsonFiles = files.filter((f) => f.endsWith(".json")).sort().reverse();
|
|
53
|
+
|
|
54
|
+
const logs: RunLog[] = [];
|
|
55
|
+
for (const file of jsonFiles) {
|
|
56
|
+
try {
|
|
57
|
+
const log: RunLog = await fs.readJson(path.join(logDir, file));
|
|
58
|
+
// only include runs that have a startedAt (minimal validity check)
|
|
59
|
+
if (log.runId && log.startedAt) {
|
|
60
|
+
logs.push(log);
|
|
61
|
+
}
|
|
62
|
+
} catch {
|
|
63
|
+
// corrupt file — skip silently
|
|
64
|
+
}
|
|
65
|
+
}
|
|
66
|
+
return logs;
|
|
67
|
+
}
|
|
68
|
+
|
|
69
|
+
// ─── Aggregation ─────────────────────────────────────────────────────────────
|
|
70
|
+
|
|
71
|
+
export function buildTrendReport(
|
|
72
|
+
logs: RunLog[],
|
|
73
|
+
opts: { last?: number; promptFilter?: string } = {}
|
|
74
|
+
): TrendReport {
|
|
75
|
+
let entries: TrendEntry[] = logs.map((log) => ({
|
|
76
|
+
runId: log.runId,
|
|
77
|
+
startedAt: log.startedAt,
|
|
78
|
+
promptHash: log.promptHash ?? null,
|
|
79
|
+
harnessScore: log.harnessScore ?? null,
|
|
80
|
+
specPath: log.specPath ?? null,
|
|
81
|
+
provider: log.provider ?? null,
|
|
82
|
+
model: log.model ?? null,
|
|
83
|
+
filesWritten: log.filesWritten?.length ?? 0,
|
|
84
|
+
totalDurationMs: log.totalDurationMs ?? null,
|
|
85
|
+
errors: log.errors?.length ?? 0,
|
|
86
|
+
}));
|
|
87
|
+
|
|
88
|
+
// filter: only runs with a harnessScore (create runs)
|
|
89
|
+
entries = entries.filter((e) => e.harnessScore !== null);
|
|
90
|
+
|
|
91
|
+
// filter by prompt hash if requested
|
|
92
|
+
if (opts.promptFilter) {
|
|
93
|
+
entries = entries.filter((e) =>
|
|
94
|
+
e.promptHash?.startsWith(opts.promptFilter!)
|
|
95
|
+
);
|
|
96
|
+
}
|
|
97
|
+
|
|
98
|
+
// limit to last N
|
|
99
|
+
if (opts.last && opts.last > 0) {
|
|
100
|
+
entries = entries.slice(0, opts.last);
|
|
101
|
+
}
|
|
102
|
+
|
|
103
|
+
// build prompt group summaries (only from filtered entries)
|
|
104
|
+
const groupMap = new Map<string, TrendEntry[]>();
|
|
105
|
+
for (const e of entries) {
|
|
106
|
+
const key = e.promptHash ?? "(none)";
|
|
107
|
+
if (!groupMap.has(key)) groupMap.set(key, []);
|
|
108
|
+
groupMap.get(key)!.push(e);
|
|
109
|
+
}
|
|
110
|
+
|
|
111
|
+
// determine "current" = the prompt hash of the most recent run
|
|
112
|
+
const currentHash = entries[0]?.promptHash ?? null;
|
|
113
|
+
|
|
114
|
+
const promptGroups: PromptGroupSummary[] = [];
|
|
115
|
+
for (const [hash, group] of groupMap.entries()) {
|
|
116
|
+
const scores = group.map((e) => e.harnessScore as number);
|
|
117
|
+
promptGroups.push({
|
|
118
|
+
promptHash: hash,
|
|
119
|
+
runs: group.length,
|
|
120
|
+
avg: Math.round((scores.reduce((a, b) => a + b, 0) / scores.length) * 10) / 10,
|
|
121
|
+
best: Math.max(...scores),
|
|
122
|
+
worst: Math.min(...scores),
|
|
123
|
+
firstSeen: group[group.length - 1].startedAt,
|
|
124
|
+
lastSeen: group[0].startedAt,
|
|
125
|
+
isCurrent: hash === currentHash,
|
|
126
|
+
});
|
|
127
|
+
}
|
|
128
|
+
|
|
129
|
+
// sort groups: most recently used first
|
|
130
|
+
promptGroups.sort((a, b) => b.lastSeen.localeCompare(a.lastSeen));
|
|
131
|
+
|
|
132
|
+
return { entries, promptGroups, totalRuns: entries.length };
|
|
133
|
+
}
|
|
134
|
+
|
|
135
|
+
// ─── Display ─────────────────────────────────────────────────────────────────
|
|
136
|
+
|
|
137
|
+
function scoreBar(score: number): string {
|
|
138
|
+
const filled = Math.round(score);
|
|
139
|
+
return "█".repeat(filled) + "░".repeat(10 - filled);
|
|
140
|
+
}
|
|
141
|
+
|
|
142
|
+
function scoreColor(score: number, text: string): string {
|
|
143
|
+
if (score >= 8) return chalk.green(text);
|
|
144
|
+
if (score >= 6) return chalk.yellow(text);
|
|
145
|
+
return chalk.red(text);
|
|
146
|
+
}
|
|
147
|
+
|
|
148
|
+
function formatDate(iso: string): string {
|
|
149
|
+
return iso.slice(0, 10); // YYYY-MM-DD
|
|
150
|
+
}
|
|
151
|
+
|
|
152
|
+
function formatDuration(ms: number | null): string {
|
|
153
|
+
if (ms === null) return " — ";
|
|
154
|
+
const s = Math.round(ms / 1000);
|
|
155
|
+
if (s < 60) return `${s}s`;
|
|
156
|
+
return `${Math.floor(s / 60)}m${s % 60}s`;
|
|
157
|
+
}
|
|
158
|
+
|
|
159
|
+
function shortSpec(specPath: string | null): string {
|
|
160
|
+
if (!specPath) return chalk.gray("—");
|
|
161
|
+
return path.basename(specPath);
|
|
162
|
+
}
|
|
163
|
+
|
|
164
|
+
/**
 * Pretty-print a TrendReport to the console: a per-prompt-hash summary
 * table, a newest-first run history with score bars, and a footer.
 *
 * @param report     Aggregated trend data from `buildTrendReport`.
 * @param workingDir Project root; used only to render the log directory
 *                   path relative to it in the footer.
 */
export function printTrendReport(report: TrendReport, workingDir: string): void {
  const { entries, promptGroups } = report;

  console.log(chalk.cyan("\n─── Harness Trend ───────────────────────────────────────────"));

  // Nothing scored yet → short hint and bail out early.
  if (entries.length === 0) {
    console.log(chalk.gray(" No scored runs found. Run `ai-spec create` to start tracking."));
    console.log(chalk.cyan("─".repeat(63)));
    return;
  }

  // ── Prompt Version Summary ────────────────────────────────────────
  if (promptGroups.length > 0) {
    console.log(chalk.bold("\n Prompt Versions:\n"));

    // Fixed column widths for the summary table (characters).
    const colWidths = {
      hash: 10,
      runs: 5,
      avg: 5,
      best: 5,
      worst: 5,
    };

    // header
    console.log(
      chalk.gray(
        " " +
        "Hash ".padEnd(colWidths.hash) + " " +
        "Runs ".padStart(colWidths.runs) + " " +
        " Avg" + " " +
        " Best" + " " +
        "Worst" + " " +
        "Last seen"
      )
    );
    console.log(chalk.gray(" " + "─".repeat(55)));

    for (const g of promptGroups) {
      // Mark the hash used by the most recent run.
      const currentMark = g.isCurrent ? chalk.cyan(" ◀ current") : "";
      const avgStr = scoreColor(g.avg, g.avg.toFixed(1).padStart(5));
      const bestStr = chalk.green(g.best.toFixed(1).padStart(5));
      // Worst is red only when it falls below the "acceptable" band (< 6).
      const worstStr = g.worst < 6 ? chalk.red(g.worst.toFixed(1).padStart(5)) : chalk.yellow(g.worst.toFixed(1).padStart(5));

      console.log(
        " " +
        chalk.white(g.promptHash.padEnd(colWidths.hash)) + " " +
        chalk.gray(String(g.runs).padStart(colWidths.runs)) + " " +
        avgStr + " " +
        bestStr + " " +
        worstStr + " " +
        chalk.gray(formatDate(g.lastSeen)) +
        currentMark
      );
    }
  }

  // ── Run History ───────────────────────────────────────────────────
  console.log(chalk.bold("\n Run History:\n"));

  for (const e of entries) {
    // Safe cast: buildTrendReport filters out entries with a null score.
    const score = e.harnessScore as number;
    const bar = scoreColor(score, `[${scoreBar(score)}]`);
    const scoreStr = scoreColor(score, score.toFixed(1).padStart(4));
    const hash = e.promptHash ? chalk.gray(e.promptHash) : chalk.gray("(no hash)");
    const dur = chalk.gray(formatDuration(e.totalDurationMs));
    const errMark = e.errors > 0 ? chalk.yellow(` ⚠${e.errors}err`) : "";
    const spec = chalk.gray(shortSpec(e.specPath));

    console.log(
      ` ${chalk.gray(formatDate(e.startedAt))} ${bar}${scoreStr} ${hash} ${dur}${errMark} ${spec}`
    );
  }

  // ── Footer ────────────────────────────────────────────────────────
  const logRelDir = path.relative(workingDir, path.join(workingDir, LOG_DIR));
  console.log(chalk.gray(`\n ${entries.length} run(s) shown · logs: ${logRelDir}/`));
  console.log(chalk.cyan("─".repeat(63)));
}
|