@yuaone/core 0.2.0 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (41) hide show
  1. package/dist/agent-loop.d.ts +40 -0
  2. package/dist/agent-loop.d.ts.map +1 -1
  3. package/dist/agent-loop.js +182 -4
  4. package/dist/agent-loop.js.map +1 -1
  5. package/dist/benchmark-runner.d.ts +141 -0
  6. package/dist/benchmark-runner.d.ts.map +1 -0
  7. package/dist/benchmark-runner.js +526 -0
  8. package/dist/benchmark-runner.js.map +1 -0
  9. package/dist/codebase-context.d.ts +49 -0
  10. package/dist/codebase-context.d.ts.map +1 -1
  11. package/dist/codebase-context.js +146 -0
  12. package/dist/codebase-context.js.map +1 -1
  13. package/dist/cost-optimizer.d.ts +159 -0
  14. package/dist/cost-optimizer.d.ts.map +1 -0
  15. package/dist/cost-optimizer.js +406 -0
  16. package/dist/cost-optimizer.js.map +1 -0
  17. package/dist/execution-policy-engine.d.ts +133 -0
  18. package/dist/execution-policy-engine.d.ts.map +1 -0
  19. package/dist/execution-policy-engine.js +367 -0
  20. package/dist/execution-policy-engine.js.map +1 -0
  21. package/dist/failure-recovery.d.ts +228 -0
  22. package/dist/failure-recovery.d.ts.map +1 -0
  23. package/dist/failure-recovery.js +664 -0
  24. package/dist/failure-recovery.js.map +1 -0
  25. package/dist/hierarchical-planner.d.ts +69 -1
  26. package/dist/hierarchical-planner.d.ts.map +1 -1
  27. package/dist/hierarchical-planner.js +117 -0
  28. package/dist/hierarchical-planner.js.map +1 -1
  29. package/dist/impact-analyzer.d.ts +92 -0
  30. package/dist/impact-analyzer.d.ts.map +1 -0
  31. package/dist/impact-analyzer.js +615 -0
  32. package/dist/impact-analyzer.js.map +1 -0
  33. package/dist/index.d.ts +14 -2
  34. package/dist/index.d.ts.map +1 -1
  35. package/dist/index.js +13 -0
  36. package/dist/index.js.map +1 -1
  37. package/dist/world-state.d.ts +87 -0
  38. package/dist/world-state.d.ts.map +1 -0
  39. package/dist/world-state.js +435 -0
  40. package/dist/world-state.js.map +1 -0
  41. package/package.json +11 -21
@@ -0,0 +1,141 @@
1
+ /**
2
+ * @module benchmark-runner
3
+ * @description Runs benchmarks against the YUAN coding agent to measure performance objectively.
4
+ *
5
+ * The BenchmarkRunner does NOT instantiate AgentLoop directly (avoiding circular dependencies).
6
+ * Instead, it records task specs and validates results. The actual agent execution is performed
7
+ * by the caller (CLI or test harness) which passes in AgentLoop results.
8
+ */
9
/** Kind of work a benchmark task asks the agent to perform. */
export type BenchmarkCategory =
    | "bug_fix"
    | "feature"
    | "refactor"
    | "test"
    | "docs";
11
/** Relative difficulty label attached to a benchmark task. */
export type BenchmarkDifficulty =
    | "easy"
    | "medium"
    | "hard";
13
/**
 * Specification of one benchmark task: what the agent is asked to do and
 * how the outcome is checked afterwards.
 */
export interface BenchmarkTask {
    /** Identifier used to match agent results and baseline entries to this task. */
    id: string;
    /** Human-readable name shown in reports. */
    name: string;
    /** Longer explanation of what the task covers. */
    description: string;
    /** Category used for the per-category breakdown. */
    category: BenchmarkCategory;
    /** Difficulty used for the per-difficulty breakdown. */
    difficulty: BenchmarkDifficulty;
    /** Path to the test project directory; the validation script runs with this as its working directory. */
    setupDir: string;
    /** User prompt to give the agent. */
    prompt: string;
    /** Files the agent is expected to modify; any that are missing from the agent's changed files are reported as an error. */
    expectedFiles?: string[];
    /** Shell command that validates the result (exit code 0 = pass). */
    validationScript?: string;
    /** Token budget for this task. */
    maxTokens?: number;
    /** Timeout in milliseconds (default 300000 = 5 min). */
    timeoutMs?: number;
}
33
/** Outcome recorded for a single benchmark task. */
export interface BenchmarkResult {
    /** `id` of the task this result belongs to. */
    taskId: string;
    /** `name` of the task, copied for reporting. */
    taskName: string;
    /** Whether the task counts as passed. */
    success: boolean;
    /** Tokens consumed by the agent run (0 when the agent was not executed). */
    tokensUsed: number;
    /** Time in milliseconds measured by the runner while producing this result. */
    durationMs: number;
    /** Files the agent reported as changed. */
    filesChanged: string[];
    /** Runner-generated and agent-reported error messages. */
    errors: string[];
    /** Captured output of the validation script, when one was run. */
    validationOutput?: string;
    /** Why the agent run ended, as reported by the caller. */
    terminationReason: string;
}
45
/** Aggregate statistics for one run of a benchmark suite. */
export interface BenchmarkSummary {
    /** Number of tasks in the suite. */
    totalTasks: number;
    /** Number of tasks whose result was successful. */
    passed: number;
    /** Number of tasks whose result was not successful. */
    failed: number;
    /** Success rate in the range 0-1. */
    successRate: number;
    /** Mean tokens used per task. */
    avgTokensPerTask: number;
    /** Mean duration per task, in milliseconds. */
    avgDurationMs: number;
    /** Estimated total cost of the run in USD. */
    totalCostEstimateUSD: number;
    /** Pass/total counts keyed by task category. */
    byCategory: Record<string, {
        passed: number;
        total: number;
    }>;
    /** Pass/total counts keyed by task difficulty. */
    byDifficulty: Record<string, {
        passed: number;
        total: number;
    }>;
    /** Task IDs that passed in the baseline but fail now. */
    regressions: string[];
    /** Task IDs that failed in the baseline but pass now. */
    improvements: string[];
    /** ISO-8601 timestamp of when the summary was produced. */
    timestamp: string;
    /** Per-task results. */
    results: BenchmarkResult[];
}
71
/** Options accepted by the `BenchmarkRunner` constructor. */
export interface BenchmarkRunnerConfig {
    /** Directory where result files are stored (default ".yuan/benchmarks"). */
    resultsDir: string;
    /** Maximum number of tasks processed at once (default 1 = sequential). */
    maxConcurrent?: number;
    /** Whether suite results are written to disk (default true). */
    saveResults?: boolean;
    /** Whether suite results are compared against the latest saved baseline (default true). */
    compareBaseline?: boolean;
}
82
/**
 * Validates and aggregates benchmark results for agent runs that the caller
 * has already executed.
 */
export declare class BenchmarkRunner {
    private readonly config;
    constructor(config: BenchmarkRunnerConfig);
    /**
     * Evaluate a single benchmark task.
     *
     * The runner never invokes the agent itself. When `agentResult` is
     * supplied, the task's validation script (if any) is executed and the
     * expected-files list is checked against the reported changes. When it
     * is omitted, a failed placeholder result with termination reason
     * `"no_execution"` is returned.
     */
    runTask(task: BenchmarkTask, agentResult?: {
        tokensUsed: number;
        filesChanged: string[];
        errors: string[];
        terminationReason: string;
    }): Promise<BenchmarkResult>;
    /**
     * Evaluate every task of a suite and aggregate the outcome.
     *
     * Tasks are processed sequentially by default, or in batches of up to
     * `maxConcurrent`. Agent results are looked up by task id in
     * `taskResults`; a task without an entry yields a failed placeholder.
     */
    runSuite(tasks: BenchmarkTask[], taskResults?: Map<string, {
        tokensUsed: number;
        filesChanged: string[];
        errors: string[];
        terminationReason: string;
    }>): Promise<BenchmarkSummary>;
    /**
     * Load the most recent saved benchmark summary from the results directory.
     * Resolves to `null` when no previous result can be read.
     */
    loadBaseline(): Promise<BenchmarkSummary | null>;
    /**
     * Persist a summary to the results directory using an atomic write.
     * Resolves to the path of the written file.
     */
    saveResults(summary: BenchmarkSummary): Promise<string>;
    /**
     * Diff the current run against a baseline run.
     * Regressions passed before and fail now; improvements failed before and pass now.
     */
    compareWithBaseline(current: BenchmarkSummary, baseline: BenchmarkSummary): {
        regressions: string[];
        improvements: string[];
    };
    /**
     * Render a benchmark summary as a Markdown report.
     */
    generateReport(summary: BenchmarkSummary): string;
    /**
     * Built-in sample tasks, intended for quickly exercising the benchmark
     * infrastructure itself.
     */
    static getSampleTasks(): BenchmarkTask[];
}
141
+ //# sourceMappingURL=benchmark-runner.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"benchmark-runner.d.ts","sourceRoot":"","sources":["../src/benchmark-runner.ts"],"names":[],"mappings":"AAAA;;;;;;;GAOG;AAUH,8BAA8B;AAC9B,MAAM,MAAM,iBAAiB,GAAG,SAAS,GAAG,SAAS,GAAG,UAAU,GAAG,MAAM,GAAG,MAAM,CAAC;AAErF,gCAAgC;AAChC,MAAM,MAAM,mBAAmB,GAAG,MAAM,GAAG,QAAQ,GAAG,MAAM,CAAC;AAE7D,yCAAyC;AACzC,MAAM,WAAW,aAAa;IAC5B,EAAE,EAAE,MAAM,CAAC;IACX,IAAI,EAAE,MAAM,CAAC;IACb,WAAW,EAAE,MAAM,CAAC;IACpB,QAAQ,EAAE,iBAAiB,CAAC;IAC5B,UAAU,EAAE,mBAAmB,CAAC;IAChC,qCAAqC;IACrC,QAAQ,EAAE,MAAM,CAAC;IACjB,oCAAoC;IACpC,MAAM,EAAE,MAAM,CAAC;IACf,oCAAoC;IACpC,aAAa,CAAC,EAAE,MAAM,EAAE,CAAC;IACzB,uDAAuD;IACvD,gBAAgB,CAAC,EAAE,MAAM,CAAC;IAC1B,iCAAiC;IACjC,SAAS,CAAC,EAAE,MAAM,CAAC;IACnB,4CAA4C;IAC5C,SAAS,CAAC,EAAE,MAAM,CAAC;CACpB;AAED,gDAAgD;AAChD,MAAM,WAAW,eAAe;IAC9B,MAAM,EAAE,MAAM,CAAC;IACf,QAAQ,EAAE,MAAM,CAAC;IACjB,OAAO,EAAE,OAAO,CAAC;IACjB,UAAU,EAAE,MAAM,CAAC;IACnB,UAAU,EAAE,MAAM,CAAC;IACnB,YAAY,EAAE,MAAM,EAAE,CAAC;IACvB,MAAM,EAAE,MAAM,EAAE,CAAC;IACjB,gBAAgB,CAAC,EAAE,MAAM,CAAC;IAC1B,iBAAiB,EAAE,MAAM,CAAC;CAC3B;AAED,kDAAkD;AAClD,MAAM,WAAW,gBAAgB;IAC/B,UAAU,EAAE,MAAM,CAAC;IACnB,MAAM,EAAE,MAAM,CAAC;IACf,MAAM,EAAE,MAAM,CAAC;IACf,uBAAuB;IACvB,WAAW,EAAE,MAAM,CAAC;IACpB,gBAAgB,EAAE,MAAM,CAAC;IACzB,aAAa,EAAE,MAAM,CAAC;IACtB,oBAAoB,EAAE,MAAM,CAAC;IAC7B,UAAU,EAAE,MAAM,CAAC,MAAM,EAAE;QAAE,MAAM,EAAE,MAAM,CAAC;QAAC,KAAK,EAAE,MAAM,CAAA;KAAE,CAAC,CAAC;IAC9D,YAAY,EAAE,MAAM,CAAC,MAAM,EAAE;QAAE,MAAM,EAAE,MAAM,CAAC;QAAC,KAAK,EAAE,MAAM,CAAA;KAAE,CAAC,CAAC;IAChE,mDAAmD;IACnD,WAAW,EAAE,MAAM,EAAE,CAAC;IACtB,mDAAmD;IACnD,YAAY,EAAE,MAAM,EAAE,CAAC;IACvB,SAAS,EAAE,MAAM,CAAC;IAClB,8BAA8B;IAC9B,OAAO,EAAE,eAAe,EAAE,CAAC;CAC5B;AAED,4CAA4C;AAC5C,MAAM,WAAW,qBAAqB;IACpC,8DAA8D;IAC9D,UAAU,EAAE,MAAM,CAAC;IACnB,oDAAoD;IACpD,aAAa,CAAC,EAAE,MAAM,CAAC;IACvB,qDAAqD;IACrD,WAAW,CAAC,EAAE,OAAO,CAAC;IACtB,yDAAyD;IACzD,eAAe,CAAC,EAAE,OAAO,CAAC;CAC3B;AAsGD,qBAAa,eAAe;IAC1B,OAAO,CAAC,QAAQ,CAAC,MAAM,CAAkC;gBAE7C,MAAM,EAAE,qBAAqB;IASzC;;;;;;;;;OASG;IACG,OAAO,CACX,IAAI,EAAE,aAAa,EACnB,WAAW,CAAC,EAAE;QACZ,UAAU,EAAE,M
AAM,CAAC;QACnB,YAAY,EAAE,MAAM,EAAE,CAAC;QACvB,MAAM,EAAE,MAAM,EAAE,CAAC;QACjB,iBAAiB,EAAE,MAAM,CAAC;KAC3B,GACA,OAAO,CAAC,eAAe,CAAC;IAmE3B;;;;;OAKG;IACG,QAAQ,CACZ,KAAK,EAAE,aAAa,EAAE,EACtB,WAAW,CAAC,EAAE,GAAG,CACf,MAAM,EACN;QACE,UAAU,EAAE,MAAM,CAAC;QACnB,YAAY,EAAE,MAAM,EAAE,CAAC;QACvB,MAAM,EAAE,MAAM,EAAE,CAAC;QACjB,iBAAiB,EAAE,MAAM,CAAC;KAC3B,CACF,GACA,OAAO,CAAC,gBAAgB,CAAC;IA6G5B;;;OAGG;IACG,YAAY,IAAI,OAAO,CAAC,gBAAgB,GAAG,IAAI,CAAC;IAkBtD;;;OAGG;IACG,WAAW,CAAC,OAAO,EAAE,gBAAgB,GAAG,OAAO,CAAC,MAAM,CAAC;IAc7D;;;OAGG;IACH,mBAAmB,CACjB,OAAO,EAAE,gBAAgB,EACzB,QAAQ,EAAE,gBAAgB,GACzB;QAAE,WAAW,EAAE,MAAM,EAAE,CAAC;QAAC,YAAY,EAAE,MAAM,EAAE,CAAA;KAAE;IA8BpD;;OAEG;IACH,cAAc,CAAC,OAAO,EAAE,gBAAgB,GAAG,MAAM;IA4GjD;;;OAGG;IACH,MAAM,CAAC,cAAc,IAAI,aAAa,EAAE;CA0EzC"}
@@ -0,0 +1,526 @@
1
+ /**
2
+ * @module benchmark-runner
3
+ * @description Runs benchmarks against the YUAN coding agent to measure performance objectively.
4
+ *
5
+ * The BenchmarkRunner does NOT instantiate AgentLoop directly (avoiding circular dependencies).
6
+ * Instead, it records task specs and validates results. The actual agent execution is performed
7
+ * by the caller (CLI or test harness) which passes in AgentLoop results.
8
+ */
9
+ import { mkdir, readdir, readFile, writeFile, rename } from "fs/promises";
10
+ import { join } from "path";
11
+ import { exec } from "child_process";
12
+ import { randomUUID } from "crypto";
13
+ // ─── Cost Constants ───
14
/**
 * Approximate USD price per one million tokens, used only for estimation
 * (Claude Sonnet-class).
 */
const COST_PER_MILLION_INPUT = 3;
const COST_PER_MILLION_OUTPUT = 15;
/** Rough split of a total token count: assume 70% input and 30% output. */
const INPUT_RATIO = 0.7;
const OUTPUT_RATIO = 0.3;
20
+ // ─── Helpers ───
21
+ /**
22
+ * Execute a shell command with timeout. Returns { stdout, stderr, exitCode }.
23
+ */
24
+ function execWithTimeout(command, cwd, timeoutMs) {
25
+ return new Promise((resolve) => {
26
+ const child = exec(command, { cwd, timeout: timeoutMs, maxBuffer: 10 * 1024 * 1024 }, (error, stdout, stderr) => {
27
+ resolve({
28
+ stdout: typeof stdout === "string" ? stdout : "",
29
+ stderr: typeof stderr === "string" ? stderr : "",
30
+ exitCode: error ? error.code ?? 1 : 0,
31
+ });
32
+ });
33
+ // Safety: kill if still running after timeout + grace period
34
+ setTimeout(() => {
35
+ try {
36
+ child.kill("SIGKILL");
37
+ }
38
+ catch {
39
+ // process already exited
40
+ }
41
+ }, timeoutMs + 5000);
42
+ });
43
+ }
44
/**
 * Estimate the USD cost of a total token count.
 *
 * The count is split into input and output shares using INPUT_RATIO and
 * OUTPUT_RATIO, and each share is priced at its per-million-token rate.
 *
 * @param tokens Total number of tokens used.
 * @returns Estimated cost in USD.
 */
function estimateCostUSD(tokens) {
    const tokensPerMillion = 1_000_000;
    const inputShare = tokens * INPUT_RATIO;
    const outputShare = tokens * OUTPUT_RATIO;
    const inputCost = (inputShare / tokensPerMillion) * COST_PER_MILLION_INPUT;
    const outputCost = (outputShare / tokensPerMillion) * COST_PER_MILLION_OUTPUT;
    return inputCost + outputCost;
}
52
/**
 * Atomic write: write the content to a sibling temp file, then rename it
 * over the destination.
 *
 * If either step fails, the temp file is removed (best effort) before the
 * original error is rethrown, so failed writes do not leave `*.tmp.*`
 * files behind in the target directory.
 *
 * @param filePath Destination path.
 * @param content  Text to write, encoded as UTF-8.
 */
async function atomicWrite(filePath, content) {
    const tmpPath = `${filePath}.tmp.${randomUUID().slice(0, 8)}`;
    try {
        await writeFile(tmpPath, content, "utf-8");
        await rename(tmpPath, filePath);
    }
    catch (err) {
        // Imported locally so the file-level import list stays untouched.
        const { unlink } = await import("fs/promises");
        try {
            await unlink(tmpPath);
        }
        catch {
            // Temp file was never created or is already gone; nothing to clean up.
        }
        throw err;
    }
}
60
+ /**
61
+ * List files that changed between two directory snapshots (shallow compare by mtime/size).
62
+ * Returns relative paths.
63
+ */
64
+ async function listChangedFiles(originalDir, modifiedDir) {
65
+ const changed = [];
66
+ async function walk(dir, base) {
67
+ const { readdir: rd, stat } = await import("fs/promises");
68
+ let entries;
69
+ try {
70
+ entries = await rd(dir, { withFileTypes: true });
71
+ }
72
+ catch {
73
+ return;
74
+ }
75
+ for (const entry of entries) {
76
+ const rel = base ? `${base}/${entry.name}` : entry.name;
77
+ const fullPath = join(dir, entry.name);
78
+ if (entry.isDirectory()) {
79
+ // Skip node_modules, .git, etc.
80
+ if (entry.name === "node_modules" || entry.name === ".git")
81
+ continue;
82
+ await walk(fullPath, rel);
83
+ }
84
+ else if (entry.isFile()) {
85
+ const origPath = join(originalDir, rel);
86
+ try {
87
+ const [origStat, modStat] = await Promise.all([stat(origPath), stat(fullPath)]);
88
+ if (origStat.mtimeMs !== modStat.mtimeMs || origStat.size !== modStat.size) {
89
+ changed.push(rel);
90
+ }
91
+ }
92
+ catch {
93
+ // File exists in modified but not original → new file
94
+ changed.push(rel);
95
+ }
96
+ }
97
+ }
98
+ }
99
+ await walk(modifiedDir, "");
100
+ return changed;
101
+ }
102
+ // ─── BenchmarkRunner ───
103
+ export class BenchmarkRunner {
104
+ config;
105
+ constructor(config) {
106
+ this.config = {
107
+ resultsDir: config.resultsDir || ".yuan/benchmarks",
108
+ maxConcurrent: config.maxConcurrent ?? 1,
109
+ saveResults: config.saveResults ?? true,
110
+ compareBaseline: config.compareBaseline ?? true,
111
+ };
112
+ }
113
+ /**
114
+ * Run a single benchmark task.
115
+ *
116
+ * This method prepares the working directory and validates the result,
117
+ * but does NOT call AgentLoop itself. The caller is responsible for
118
+ * actually running the agent between `runTask` setup and finalization.
119
+ *
120
+ * If `agentResult` is provided, it is used directly. Otherwise, a
121
+ * placeholder result is returned indicating the task is ready for execution.
122
+ */
123
+ async runTask(task, agentResult) {
124
+ const startTime = Date.now();
125
+ const timeoutMs = task.timeoutMs ?? 300_000;
126
+ const errors = [];
127
+ // If no agent result provided, return a placeholder indicating setup-only mode
128
+ if (!agentResult) {
129
+ return {
130
+ taskId: task.id,
131
+ taskName: task.name,
132
+ success: false,
133
+ tokensUsed: 0,
134
+ durationMs: Date.now() - startTime,
135
+ filesChanged: [],
136
+ errors: ["no_agent_result: task prepared but agent was not executed"],
137
+ terminationReason: "no_execution",
138
+ };
139
+ }
140
+ // Validate with validation script if provided
141
+ let validationOutput;
142
+ let validationPassed = true;
143
+ if (task.validationScript) {
144
+ try {
145
+ const result = await execWithTimeout(task.validationScript, task.setupDir, Math.min(timeoutMs, 60_000));
146
+ validationOutput = result.stdout + (result.stderr ? `\n[stderr] ${result.stderr}` : "");
147
+ if (result.exitCode !== 0) {
148
+ validationPassed = false;
149
+ errors.push(`validation_failed: exit code ${result.exitCode}`);
150
+ }
151
+ }
152
+ catch (err) {
153
+ validationPassed = false;
154
+ const msg = err instanceof Error ? err.message : String(err);
155
+ errors.push(`validation_error: ${msg}`);
156
+ }
157
+ }
158
+ // Check expected files if specified
159
+ if (task.expectedFiles && task.expectedFiles.length > 0) {
160
+ const missing = task.expectedFiles.filter((f) => !agentResult.filesChanged.includes(f));
161
+ if (missing.length > 0) {
162
+ errors.push(`missing_expected_files: ${missing.join(", ")}`);
163
+ }
164
+ }
165
+ // Merge agent errors
166
+ errors.push(...agentResult.errors);
167
+ // Determine success: no fatal errors and validation passed
168
+ const success = validationPassed && agentResult.errors.length === 0;
169
+ const durationMs = Date.now() - startTime;
170
+ return {
171
+ taskId: task.id,
172
+ taskName: task.name,
173
+ success,
174
+ tokensUsed: agentResult.tokensUsed,
175
+ durationMs,
176
+ filesChanged: agentResult.filesChanged,
177
+ errors,
178
+ validationOutput,
179
+ terminationReason: agentResult.terminationReason,
180
+ };
181
+ }
182
+ /**
183
+ * Run all tasks in a benchmark suite.
184
+ *
185
+ * Tasks are run sequentially by default, or concurrently up to maxConcurrent.
186
+ * Each task must be provided with an agent result via the `taskResults` map.
187
+ */
188
+ async runSuite(tasks, taskResults) {
189
+ const results = [];
190
+ const maxConcurrent = this.config.maxConcurrent;
191
+ if (maxConcurrent <= 1) {
192
+ // Sequential execution
193
+ for (const task of tasks) {
194
+ const agentResult = taskResults?.get(task.id);
195
+ const result = await this.runTask(task, agentResult);
196
+ results.push(result);
197
+ }
198
+ }
199
+ else {
200
+ // Concurrent execution in batches
201
+ for (let i = 0; i < tasks.length; i += maxConcurrent) {
202
+ const batch = tasks.slice(i, i + maxConcurrent);
203
+ const batchResults = await Promise.all(batch.map((task) => {
204
+ const agentResult = taskResults?.get(task.id);
205
+ return this.runTask(task, agentResult);
206
+ }));
207
+ results.push(...batchResults);
208
+ }
209
+ }
210
+ // Aggregate stats
211
+ const passed = results.filter((r) => r.success).length;
212
+ const failed = results.length - passed;
213
+ const totalTokens = results.reduce((sum, r) => sum + r.tokensUsed, 0);
214
+ const totalDuration = results.reduce((sum, r) => sum + r.durationMs, 0);
215
+ // Group by category
216
+ const byCategory = {};
217
+ for (const task of tasks) {
218
+ if (!byCategory[task.category]) {
219
+ byCategory[task.category] = { passed: 0, total: 0 };
220
+ }
221
+ byCategory[task.category].total++;
222
+ const result = results.find((r) => r.taskId === task.id);
223
+ if (result?.success) {
224
+ byCategory[task.category].passed++;
225
+ }
226
+ }
227
+ // Group by difficulty
228
+ const byDifficulty = {};
229
+ for (const task of tasks) {
230
+ if (!byDifficulty[task.difficulty]) {
231
+ byDifficulty[task.difficulty] = { passed: 0, total: 0 };
232
+ }
233
+ byDifficulty[task.difficulty].total++;
234
+ const result = results.find((r) => r.taskId === task.id);
235
+ if (result?.success) {
236
+ byDifficulty[task.difficulty].passed++;
237
+ }
238
+ }
239
+ // Compare with baseline if enabled
240
+ let regressions = [];
241
+ let improvements = [];
242
+ if (this.config.compareBaseline) {
243
+ const baseline = await this.loadBaseline();
244
+ if (baseline) {
245
+ const summary = {
246
+ totalTasks: tasks.length,
247
+ passed,
248
+ failed,
249
+ successRate: tasks.length > 0 ? passed / tasks.length : 0,
250
+ avgTokensPerTask: tasks.length > 0 ? totalTokens / tasks.length : 0,
251
+ avgDurationMs: tasks.length > 0 ? totalDuration / tasks.length : 0,
252
+ totalCostEstimateUSD: estimateCostUSD(totalTokens),
253
+ byCategory,
254
+ byDifficulty,
255
+ regressions: [],
256
+ improvements: [],
257
+ timestamp: new Date().toISOString(),
258
+ results,
259
+ };
260
+ const comparison = this.compareWithBaseline(summary, baseline);
261
+ regressions = comparison.regressions;
262
+ improvements = comparison.improvements;
263
+ }
264
+ }
265
+ const summary = {
266
+ totalTasks: tasks.length,
267
+ passed,
268
+ failed,
269
+ successRate: tasks.length > 0 ? passed / tasks.length : 0,
270
+ avgTokensPerTask: tasks.length > 0 ? totalTokens / tasks.length : 0,
271
+ avgDurationMs: tasks.length > 0 ? totalDuration / tasks.length : 0,
272
+ totalCostEstimateUSD: estimateCostUSD(totalTokens),
273
+ byCategory,
274
+ byDifficulty,
275
+ regressions,
276
+ improvements,
277
+ timestamp: new Date().toISOString(),
278
+ results,
279
+ };
280
+ // Save results if configured
281
+ if (this.config.saveResults) {
282
+ await this.saveResults(summary);
283
+ }
284
+ return summary;
285
+ }
286
+ /**
287
+ * Load the most recent baseline benchmark result from resultsDir.
288
+ * Returns null if no previous results exist.
289
+ */
290
+ async loadBaseline() {
291
+ try {
292
+ const files = await readdir(this.config.resultsDir);
293
+ const benchmarkFiles = files
294
+ .filter((f) => f.startsWith("benchmark-") && f.endsWith(".json"))
295
+ .sort()
296
+ .reverse();
297
+ if (benchmarkFiles.length === 0)
298
+ return null;
299
+ const latestFile = join(this.config.resultsDir, benchmarkFiles[0]);
300
+ const content = await readFile(latestFile, "utf-8");
301
+ return JSON.parse(content);
302
+ }
303
+ catch {
304
+ return null;
305
+ }
306
+ }
307
+ /**
308
+ * Save benchmark results to disk with atomic write.
309
+ * Returns the path to the saved file.
310
+ */
311
+ async saveResults(summary) {
312
+ await mkdir(this.config.resultsDir, { recursive: true });
313
+ // Format timestamp for filename: 2026-03-09T12-30-00Z
314
+ const ts = summary.timestamp.replace(/:/g, "-").replace(/\.\d+Z$/, "Z");
315
+ const filename = `benchmark-${ts}.json`;
316
+ const filePath = join(this.config.resultsDir, filename);
317
+ const content = JSON.stringify(summary, null, 2);
318
+ await atomicWrite(filePath, content);
319
+ return filePath;
320
+ }
321
+ /**
322
+ * Compare current results with a baseline.
323
+ * Identifies regressions (was pass, now fail) and improvements (was fail, now pass).
324
+ */
325
+ compareWithBaseline(current, baseline) {
326
+ const regressions = [];
327
+ const improvements = [];
328
+ // Build lookup maps from results arrays
329
+ const baselineMap = new Map();
330
+ for (const result of baseline.results ?? []) {
331
+ baselineMap.set(result.taskId, result.success);
332
+ }
333
+ const currentMap = new Map();
334
+ for (const result of current.results ?? []) {
335
+ currentMap.set(result.taskId, result.success);
336
+ }
337
+ // Compare tasks that exist in both runs
338
+ for (const [taskId, currentSuccess] of currentMap) {
339
+ const baselineSuccess = baselineMap.get(taskId);
340
+ if (baselineSuccess === undefined)
341
+ continue; // New task, skip
342
+ if (baselineSuccess && !currentSuccess) {
343
+ regressions.push(taskId);
344
+ }
345
+ else if (!baselineSuccess && currentSuccess) {
346
+ improvements.push(taskId);
347
+ }
348
+ }
349
+ return { regressions, improvements };
350
+ }
351
+ /**
352
+ * Generate a Markdown report from benchmark summary.
353
+ */
354
+ generateReport(summary) {
355
+ const lines = [];
356
+ lines.push("# YUAN Benchmark Report");
357
+ lines.push("");
358
+ lines.push(`**Date:** ${summary.timestamp}`);
359
+ lines.push("");
360
+ // ─── Overview ───
361
+ lines.push("## Overview");
362
+ lines.push("");
363
+ lines.push("| Metric | Value |");
364
+ lines.push("|--------|-------|");
365
+ lines.push(`| Total Tasks | ${summary.totalTasks} |`);
366
+ lines.push(`| Passed | ${summary.passed} |`);
367
+ lines.push(`| Failed | ${summary.failed} |`);
368
+ lines.push(`| Success Rate | ${(summary.successRate * 100).toFixed(1)}% |`);
369
+ lines.push(`| Avg Tokens/Task | ${Math.round(summary.avgTokensPerTask).toLocaleString()} |`);
370
+ lines.push(`| Avg Duration/Task | ${(summary.avgDurationMs / 1000).toFixed(1)}s |`);
371
+ lines.push(`| Est. Total Cost | $${summary.totalCostEstimateUSD.toFixed(4)} |`);
372
+ lines.push("");
373
+ // ─── By Category ───
374
+ lines.push("## Results by Category");
375
+ lines.push("");
376
+ lines.push("| Category | Passed | Total | Rate |");
377
+ lines.push("|----------|--------|-------|------|");
378
+ for (const [category, stats] of Object.entries(summary.byCategory)) {
379
+ const rate = stats.total > 0 ? ((stats.passed / stats.total) * 100).toFixed(0) : "0";
380
+ lines.push(`| ${category} | ${stats.passed} | ${stats.total} | ${rate}% |`);
381
+ }
382
+ lines.push("");
383
+ // ─── By Difficulty ───
384
+ lines.push("## Results by Difficulty");
385
+ lines.push("");
386
+ lines.push("| Difficulty | Passed | Total | Rate |");
387
+ lines.push("|------------|--------|-------|------|");
388
+ for (const [difficulty, stats] of Object.entries(summary.byDifficulty)) {
389
+ const rate = stats.total > 0 ? ((stats.passed / stats.total) * 100).toFixed(0) : "0";
390
+ lines.push(`| ${difficulty} | ${stats.passed} | ${stats.total} | ${rate}% |`);
391
+ }
392
+ lines.push("");
393
+ // ─── Regressions & Improvements ───
394
+ if (summary.regressions.length > 0) {
395
+ lines.push("## Regressions");
396
+ lines.push("");
397
+ lines.push("Tasks that previously passed but now fail:");
398
+ lines.push("");
399
+ for (const id of summary.regressions) {
400
+ lines.push(`- \`${id}\``);
401
+ }
402
+ lines.push("");
403
+ }
404
+ if (summary.improvements.length > 0) {
405
+ lines.push("## Improvements");
406
+ lines.push("");
407
+ lines.push("Tasks that previously failed but now pass:");
408
+ lines.push("");
409
+ for (const id of summary.improvements) {
410
+ lines.push(`- \`${id}\``);
411
+ }
412
+ lines.push("");
413
+ }
414
+ // ─── Individual Results ───
415
+ if (summary.results && summary.results.length > 0) {
416
+ lines.push("## Task Details");
417
+ lines.push("");
418
+ lines.push("| Task | Status | Tokens | Duration | Reason |");
419
+ lines.push("|------|--------|--------|----------|--------|");
420
+ for (const r of summary.results) {
421
+ const status = r.success ? "PASS" : "FAIL";
422
+ const tokens = r.tokensUsed.toLocaleString();
423
+ const duration = `${(r.durationMs / 1000).toFixed(1)}s`;
424
+ const reason = r.terminationReason.slice(0, 40);
425
+ lines.push(`| ${r.taskName} | ${status} | ${tokens} | ${duration} | ${reason} |`);
426
+ }
427
+ lines.push("");
428
+ // ─── Error Details ───
429
+ const failedResults = summary.results.filter((r) => !r.success && r.errors.length > 0);
430
+ if (failedResults.length > 0) {
431
+ lines.push("## Error Details");
432
+ lines.push("");
433
+ for (const r of failedResults) {
434
+ lines.push(`### ${r.taskName} (\`${r.taskId}\`)`);
435
+ lines.push("");
436
+ for (const err of r.errors) {
437
+ lines.push(`- ${err}`);
438
+ }
439
+ if (r.validationOutput) {
440
+ lines.push("");
441
+ lines.push("**Validation output:**");
442
+ lines.push("```");
443
+ lines.push(r.validationOutput.slice(0, 500));
444
+ lines.push("```");
445
+ }
446
+ lines.push("");
447
+ }
448
+ }
449
+ }
450
+ return lines.join("\n");
451
+ }
452
+ /**
453
+ * Built-in sample tasks for quick testing.
454
+ * These are simple tasks that can validate the benchmark infrastructure itself.
455
+ */
456
+ static getSampleTasks() {
457
+ return [
458
+ {
459
+ id: "fix-typo",
460
+ name: "Fix Typo",
461
+ description: "Fix a typo in a README file: 'recieve' should be 'receive'",
462
+ category: "bug_fix",
463
+ difficulty: "easy",
464
+ setupDir: "",
465
+ prompt: "Fix the typo in README.md: 'recieve' should be 'receive'",
466
+ expectedFiles: ["README.md"],
467
+ validationScript: "grep -q 'receive' README.md && ! grep -q 'recieve' README.md",
468
+ maxTokens: 1000,
469
+ timeoutMs: 60_000,
470
+ },
471
+ {
472
+ id: "add-function",
473
+ name: "Add Function",
474
+ description: "Add a simple utility function that returns the sum of two numbers",
475
+ category: "feature",
476
+ difficulty: "easy",
477
+ setupDir: "",
478
+ prompt: "Add an exported function 'add(a: number, b: number): number' to src/utils.ts that returns the sum of a and b",
479
+ expectedFiles: ["src/utils.ts"],
480
+ validationScript: "grep -q 'export function add' src/utils.ts",
481
+ maxTokens: 2000,
482
+ timeoutMs: 60_000,
483
+ },
484
+ {
485
+ id: "rename-variable",
486
+ name: "Rename Variable",
487
+ description: "Rename all occurrences of 'data' to 'payload' in src/handler.ts",
488
+ category: "refactor",
489
+ difficulty: "medium",
490
+ setupDir: "",
491
+ prompt: "Rename the variable 'data' to 'payload' in src/handler.ts (all occurrences)",
492
+ expectedFiles: ["src/handler.ts"],
493
+ validationScript: "grep -q 'payload' src/handler.ts && ! grep -q 'const data' src/handler.ts",
494
+ maxTokens: 5000,
495
+ timeoutMs: 120_000,
496
+ },
497
+ {
498
+ id: "add-unit-test",
499
+ name: "Add Unit Test",
500
+ description: "Add unit tests for the multiply function in src/math.ts",
501
+ category: "test",
502
+ difficulty: "medium",
503
+ setupDir: "",
504
+ prompt: "Write unit tests for the multiply(a, b) function in src/math.ts. Create src/__tests__/math.test.ts with at least 3 test cases including edge cases (zero, negative numbers).",
505
+ expectedFiles: ["src/__tests__/math.test.ts"],
506
+ validationScript: "test -f src/__tests__/math.test.ts && grep -c 'test\\|it(' src/__tests__/math.test.ts",
507
+ maxTokens: 8000,
508
+ timeoutMs: 180_000,
509
+ },
510
+ {
511
+ id: "multi-file-refactor",
512
+ name: "Multi-File Refactor",
513
+ description: "Extract a shared interface from two files that define similar types, and update both files to import from the shared module",
514
+ category: "refactor",
515
+ difficulty: "hard",
516
+ setupDir: "",
517
+ prompt: "Both src/user-service.ts and src/admin-service.ts define a 'UserRecord' interface with the same fields. Extract it to src/types/user.ts and update both files to import from there.",
518
+ expectedFiles: ["src/types/user.ts", "src/user-service.ts", "src/admin-service.ts"],
519
+ validationScript: 'test -f src/types/user.ts && grep -q "import.*UserRecord.*from" src/user-service.ts && grep -q "import.*UserRecord.*from" src/admin-service.ts',
520
+ maxTokens: 15000,
521
+ timeoutMs: 300_000,
522
+ },
523
+ ];
524
+ }
525
+ }
526
+ //# sourceMappingURL=benchmark-runner.js.map