@vibe-forge/core 0.7.5 → 0.8.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +4 -46
- package/src/env.ts +5 -25
- package/src/index.ts +0 -5
- package/src/types.ts +13 -72
- package/src/ws.ts +2 -12
- package/src/adapter/index.ts +0 -6
- package/src/adapter/loader.ts +0 -11
- package/src/adapter/type.ts +0 -117
- package/src/config/load.ts +0 -122
- package/src/config/types.ts +0 -289
- package/src/config.ts +0 -2
- package/src/controllers/benchmark/discover.ts +0 -89
- package/src/controllers/benchmark/index.ts +0 -24
- package/src/controllers/benchmark/result-store.ts +0 -46
- package/src/controllers/benchmark/runner.ts +0 -415
- package/src/controllers/benchmark/schema.ts +0 -60
- package/src/controllers/benchmark/types.ts +0 -80
- package/src/controllers/benchmark/utils.ts +0 -144
- package/src/controllers/benchmark/workspace.ts +0 -179
- package/src/controllers/config/index.ts +0 -214
- package/src/controllers/system/assets/completed.mp3 +0 -0
- package/src/controllers/system/assets/mcp.png +0 -0
- package/src/controllers/system/index.ts +0 -102
- package/src/controllers/task/generate-adapter-query-options.ts +0 -25
- package/src/controllers/task/index.ts +0 -2
- package/src/controllers/task/prepare.ts +0 -74
- package/src/controllers/task/run.ts +0 -231
- package/src/controllers/task/schema.ts +0 -131
- package/src/controllers/task/type.ts +0 -6
- package/src/hooks/bridge.ts +0 -368
- package/src/hooks/call.ts +0 -74
- package/src/hooks/index.ts +0 -41
- package/src/hooks/loader.ts +0 -79
- package/src/hooks/native.ts +0 -116
- package/src/hooks/runtime.ts +0 -139
- package/src/hooks/type.ts +0 -145
- package/src/utils/cache.ts +0 -58
- package/src/utils/create-logger.ts +0 -89
- package/src/utils/definition-loader.ts +0 -530
- package/src/utils/filter.ts +0 -26
- package/src/utils/string-transform.ts +0 -37
- package/src/utils/uuid.ts +0 -6
- package/src/utils/workspace-assets.ts +0 -919
|
@@ -1,415 +0,0 @@
|
|
|
1
|
-
import { readFile } from 'node:fs/promises'
|
|
2
|
-
import process from 'node:process'
|
|
3
|
-
|
|
4
|
-
import { run as runTask } from '#~/controllers/task/index.js'
|
|
5
|
-
import { uuid } from '#~/utils/uuid.js'
|
|
6
|
-
|
|
7
|
-
import { getBenchmarkCase, listBenchmarkCases } from './discover'
|
|
8
|
-
import { writeBenchmarkResult } from './result-store'
|
|
9
|
-
import type { BenchmarkRunCaseInput, BenchmarkRunCaseOutput, BenchmarkRunCategoryInput, BenchmarkRunCategoryOutput, BenchmarkRunEvent } from './types'
|
|
10
|
-
import type { BenchmarkResult } from './schema'
|
|
11
|
-
import { createCaseWorkspace, disposeCaseWorkspace } from './workspace'
|
|
12
|
-
import { execCommand, execShellCommand, parseDiffFiles, summarizeText } from './utils'
|
|
13
|
-
|
|
14
|
-
/** Outcome of one agent task run as observed by the benchmark runner. */
interface TaskExecutionResult {
  /** Identifier generated for the task session. */
  sessionId: string
  /** Exit code reported by the task's `exit` event, or `-1` when none was reported or the run timed out. */
  exitCode: number
  /** Stderr text reported by the task's `exit` event, or the timeout message. */
  stderr: string
}
|
|
19
|
-
|
|
20
|
-
/** Verdict computed by `judgeResult` for a single benchmark case. */
interface JudgedResult {
  /** Overall verdict for the case. */
  status: BenchmarkResult['status']
  /** Weighted combination of the individual scores, rounded to two decimals. */
  finalScore: number
  /** The individual test, goal and reference scores. */
  scores: BenchmarkResult['scores']
  /** Human-readable one-line summary of the verdict. */
  judgeSummary: string
  /** Problems noticed while judging; empty when nothing was flagged. */
  issues: string[]
  /** Files touched by the candidate patch. */
  changedFiles: string[]
}
|
|
28
|
-
|
|
29
|
-
/**
 * Delivers a benchmark progress event to the caller-supplied `onEvent` listener.
 *
 * Does nothing when the input carries no listener. Otherwise the event is stamped
 * with the current time as an ISO-8601 string before it is delivered.
 *
 * @param input Run input whose optional `onEvent` callback receives the event.
 * @param event Event payload without its `timestamp`.
 */
const emitRunEvent = (
  input: BenchmarkRunCaseInput | BenchmarkRunCategoryInput,
  event: Omit<BenchmarkRunEvent, 'timestamp'>
) => {
  if (input.onEvent == null) return

  const stamped: BenchmarkRunEvent = {
    ...event,
    timestamp: new Date().toISOString()
  }
  input.onEvent(stamped)
}
|
|
38
|
-
|
|
39
|
-
/**
 * Runs the agent task for a benchmark case and resolves as soon as the task
 * reports an `exit` event or the timeout elapses, whichever comes first.
 *
 * @param input Benchmark run input supplying the adapter, model, system prompt,
 * permission mode, runtime (defaults to `'cli'`) and environment.
 * @param taskDescription Text passed to the task as its `description`.
 * @param cwd Working directory the task runs in.
 * @param timeoutSec Seconds to wait, counted from the moment the session has
 * been created, before the session is killed.
 * @returns The session id together with the exit code and stderr reported by
 * the task. On timeout the exit code is `-1` and stderr is
 * `'Task execution timed out'`.
 * @throws Rejects with whatever `runTask` throws while creating the session.
 */
const runAgentTask = async (
  input: BenchmarkRunCaseInput,
  taskDescription: string,
  cwd: string,
  timeoutSec: number
): Promise<TaskExecutionResult> => {
  const sessionId = uuid()

  return new Promise((resolve, reject) => {
    let settled = false
    let stderr = ''
    let exitCode = -1
    let timer: NodeJS.Timeout | undefined

    // Claims the right to settle the promise; only the first caller wins.
    const claim = (): boolean => {
      if (settled) return false
      settled = true
      if (timer != null) clearTimeout(timer)
      return true
    }
    const succeed = (value: TaskExecutionResult) => {
      if (claim()) resolve(value)
    }
    const fail = (reason: unknown) => {
      if (claim()) reject(reason)
    }

    void (async () => {
      try {
        const { session } = await runTask({
          adapter: input.adapter,
          cwd,
          env: input.env
        }, {
          type: 'create',
          runtime: input.runtime ?? 'cli',
          sessionId,
          model: input.model,
          systemPrompt: input.systemPrompt,
          permissionMode: input.permissionMode,
          mode: 'stream',
          description: taskDescription,
          onEvent: (event) => {
            if (event.type !== 'exit') return
            stderr = event.data.stderr ?? stderr
            exitCode = event.data.exitCode ?? exitCode
            succeed({ sessionId, exitCode, stderr })
          }
        })

        // The exit event can fire while the session is still being created.
        // Arming the timer after that would leave a timeout nobody clears: it
        // keeps the event loop alive for `timeoutSec` and then kills a session
        // that has already finished.
        if (settled) return

        const sessionHandle: { kill: () => void } | undefined = session
        timer = setTimeout(() => {
          sessionHandle?.kill()
          succeed({
            sessionId,
            exitCode: -1,
            stderr: 'Task execution timed out'
          })
        }, timeoutSec * 1000)
      } catch (error) {
        fail(error)
      }
    })()
  })
}
|
|
108
|
-
|
|
109
|
-
/**
 * Collects the changes made inside `cwd` as a single `git diff` patch.
 *
 * `git add -N .` runs first and its result is not inspected; per the git
 * documentation it records an intent-to-add for untracked paths so they show up
 * in the subsequent diff.
 *
 * @param cwd Directory of the git work tree to inspect.
 * @returns The stdout of `git diff --binary --no-ext-diff --submodule=diff -- .`.
 * @throws Error carrying git's stderr (or a generic message) when the diff exits non-zero.
 */
const collectCandidatePatch = async (cwd: string) => {
  const git = (args: string[]) => execCommand({ command: 'git', args, cwd })

  await git(['add', '-N', '.'])

  const { exitCode, stderr, stdout } = await git(
    ['diff', '--binary', '--no-ext-diff', '--submodule=diff', '--', '.']
  )
  if (exitCode === 0) return stdout

  throw new Error(stderr || 'Failed to collect candidate patch')
}
|
|
125
|
-
|
|
126
|
-
/**
 * Applies a patch file to the work tree at `cwd` using `git apply`.
 *
 * @param cwd Directory of the git work tree to patch.
 * @param patchPath Path of the patch file handed to `git apply`.
 * @throws Error carrying git's stderr, then stdout, then a generic message when
 * `git apply` exits non-zero.
 */
const applyPatchFromFile = async (cwd: string, patchPath: string) => {
  const applyArgs = ['apply', '--allow-empty', '--recount', '--whitespace=nowarn', '--binary', patchPath]
  const { exitCode, stderr, stdout } = await execCommand({ command: 'git', args: applyArgs, cwd })

  if (exitCode === 0) return

  throw new Error(stderr || stdout || `Failed to apply patch: ${patchPath}`)
}
|
|
136
|
-
|
|
137
|
-
/**
 * Scores a benchmark run from the test outcome and the overlap between the
 * candidate patch and the reference patch.
 *
 * Scores, each in `[0, 1]`:
 * - `testScore`: `1` when the test command exited with `0`, else `0`.
 * - `referenceScore`: fraction of reference files also touched by the candidate
 *   (two decimals); with an empty reference it is `1` if anything changed, else `0`.
 * - `goalScore`: equals `testScore`, except `0` when nothing changed, and
 *   `max(0.2, referenceScore * 0.5)` when something changed but tests failed.
 *
 * `finalScore = 0.7 * testScore + 0.2 * goalScore + 0.1 * referenceScore`,
 * rounded to two decimals. The status is `pass` when tests pass and
 * `goalScore >= 0.8`, `partial` when `finalScore > 0`, otherwise `fail`.
 *
 * @param params Exit codes, stderr and the two patches to compare.
 * @returns The verdict, scores, summary, detected issues and changed files.
 */
const judgeResult = (params: {
  testExitCode: number
  candidatePatch: string
  referencePatch: string
  taskExitCode: number
  taskStderr: string
}) => {
  const roundTo2 = (value: number) => Number(value.toFixed(2))

  const changedFiles = parseDiffFiles(params.candidatePatch)
  const referenceFiles = parseDiffFiles(params.referencePatch)
  const changedLookup = new Set(changedFiles)
  const overlapCount = referenceFiles.reduce(
    (count, file) => (changedLookup.has(file) ? count + 1 : count),
    0
  )
  const hasChanges = changedFiles.length > 0
  const hasReference = referenceFiles.length > 0

  let referenceScore: number
  if (hasReference) {
    referenceScore = roundTo2(overlapCount / referenceFiles.length)
  } else {
    referenceScore = hasChanges ? 1 : 0
  }

  const testsPassed = params.testExitCode === 0
  const testScore = testsPassed ? 1 : 0

  const issues: string[] = []
  let goalScore: number = testScore
  if (!hasChanges) {
    goalScore = 0
    issues.push('Agent 未产出代码改动')
  } else if (!testsPassed) {
    goalScore = Math.max(0.2, roundTo2(referenceScore * 0.5))
    issues.push('验收测试未通过')
  }

  if (params.taskExitCode !== 0) {
    issues.push(`任务执行异常退出:${params.taskExitCode}`)
  }
  if (params.taskStderr.trim() !== '') {
    issues.push(`任务执行 stderr:${summarizeText(params.taskStderr, 240)}`)
  }
  if (hasReference && overlapCount === 0 && hasChanges) {
    issues.push('改动文件与参考实现无重叠,需要人工复核目标一致性')
  }

  const finalScore = Number((0.7 * testScore + 0.2 * goalScore + 0.1 * referenceScore).toFixed(2))

  let status: BenchmarkResult['status']
  if (testsPassed && goalScore >= 0.8) {
    status = 'pass'
  } else if (finalScore > 0) {
    status = 'partial'
  } else {
    status = 'fail'
  }

  const summaryByStatus: Record<BenchmarkResult['status'], string> = {
    pass: '验收测试通过,任务目标完成度良好。',
    partial: '实现存在部分有效改动,但仍需进一步完善。',
    fail: '当前实现未完成任务目标。'
  }
  const judgeSummary = summaryByStatus[status]

  return {
    status,
    finalScore,
    scores: {
      testScore,
      goalScore,
      referenceScore
    },
    judgeSummary,
    issues,
    changedFiles
  } satisfies JudgedResult
}
|
|
200
|
-
|
|
201
|
-
/**
 * Runs one benchmark case end to end and persists its result.
 *
 * Steps: load the case, create a case workspace, run the agent task with the
 * case's RFC body, collect the candidate patch, apply the case's test patch,
 * run the test command, judge the outcome and write the result. Progress is
 * reported through `input.onEvent`.
 *
 * Any error raised after the case has been found is turned into a persisted
 * `fail` result rather than being thrown. The workspace, once created, is
 * always disposed.
 *
 * @param input Case selector plus agent options; `workspaceFolder` defaults to
 * the current working directory and `runId` to a fresh uuid.
 * @returns The run id and the persisted benchmark result.
 * @throws Error when no case matches `input.category` / `input.title`.
 */
export const runBenchmarkCase = async (input: BenchmarkRunCaseInput): Promise<BenchmarkRunCaseOutput> => {
  const workspaceFolder = input.workspaceFolder ?? process.cwd()
  const runId = input.runId ?? uuid()
  const startedAt = Date.now()

  // All progress events of this function share the same run and case identity.
  const announce = (phase: BenchmarkRunEvent['phase'], message: string) => {
    emitRunEvent(input, {
      runId,
      category: input.category,
      title: input.title,
      scope: 'case',
      phase,
      message
    })
  }

  announce('discover', 'Loading benchmark case')

  const caseItem = await getBenchmarkCase({
    workspaceFolder,
    category: input.category,
    title: input.title
  })
  if (caseItem == null) {
    throw new Error(`Benchmark case not found: ${input.category}/${input.title}`)
  }
  const { frontmatter } = caseItem

  announce('workspace', 'Preparing isolated benchmark workspace')

  let workspaceState: Awaited<ReturnType<typeof createCaseWorkspace>> | null = null

  try {
    workspaceState = await createCaseWorkspace({
      workspaceFolder,
      category: input.category,
      title: input.title,
      runId,
      baseCommit: frontmatter.baseCommit,
      setupCommand: frontmatter.setupCommand,
      timeoutSec: frontmatter.timeoutSec
    })
    const caseWorkspacePath = workspaceState.caseWorkspacePath

    announce('task', 'Running task agent')

    const taskResult = await runAgentTask(
      input,
      caseItem.rfcBody,
      caseWorkspacePath,
      frontmatter.timeoutSec
    )
    // Capture the agent's changes before the test patch touches the work tree.
    const candidatePatch = await collectCandidatePatch(caseWorkspacePath)

    announce('verify', 'Applying benchmark test patch')

    await applyPatchFromFile(caseWorkspacePath, caseItem.patchTestPath)
    const testResult = await execShellCommand({
      command: frontmatter.testCommand,
      cwd: caseWorkspacePath,
      timeoutMs: frontmatter.timeoutSec * 1000,
      env: input.env
    })
    const referencePatch = await readFile(caseItem.patchPath, 'utf-8')
    const judged = judgeResult({
      testExitCode: testResult.exitCode,
      candidatePatch,
      referencePatch,
      taskExitCode: taskResult.exitCode,
      taskStderr: taskResult.stderr
    })

    const result: BenchmarkResult = {
      category: caseItem.category,
      title: caseItem.title,
      status: judged.status,
      finalScore: judged.finalScore,
      scores: judged.scores,
      baseCommit: frontmatter.baseCommit,
      durationMs: Date.now() - startedAt,
      setupCommand: frontmatter.setupCommand,
      testCommand: frontmatter.testCommand,
      testExitCode: testResult.exitCode,
      judgeSummary: judged.judgeSummary,
      issues: judged.issues,
      timestamp: new Date().toISOString(),
      runId,
      taskSessionId: taskResult.sessionId,
      taskExitCode: taskResult.exitCode,
      changedFiles: judged.changedFiles,
      categoryRunId: input.categoryRunId
    }

    await writeBenchmarkResult(workspaceFolder, result)

    announce('result', `Benchmark completed with status ${result.status}`)

    return { runId, result }
  } catch (error) {
    // Record the failure as a regular result so callers still get an entry for this case.
    const failureMessage = error instanceof Error ? error.message : String(error)
    const result: BenchmarkResult = {
      category: caseItem.category,
      title: caseItem.title,
      status: 'fail',
      finalScore: 0,
      scores: {
        testScore: 0,
        goalScore: 0,
        referenceScore: 0
      },
      baseCommit: frontmatter.baseCommit,
      durationMs: Date.now() - startedAt,
      setupCommand: frontmatter.setupCommand,
      testCommand: frontmatter.testCommand,
      testExitCode: -1,
      judgeSummary: 'Benchmark 运行失败。',
      issues: [failureMessage],
      timestamp: new Date().toISOString(),
      runId,
      categoryRunId: input.categoryRunId
    }
    await writeBenchmarkResult(workspaceFolder, result)
    return { runId, result }
  } finally {
    if (workspaceState != null) {
      await disposeCaseWorkspace(workspaceState)
    }
  }
}
|
|
358
|
-
|
|
359
|
-
/**
 * Runs the benchmark cases of one category with a bounded number of concurrent workers.
 *
 * When `input.titles` is non-empty only the listed cases are run. Each case is
 * executed through `runBenchmarkCase` with this run's id as its `categoryRunId`.
 *
 * @param input Category selector plus agent options. `workspaceFolder` defaults
 * to the current working directory, `runId` to a fresh uuid and `concurrency`
 * to `2`. The concurrency is clamped to `[1, number of selected cases]`.
 * @returns The run id, the category and all case results sorted by title.
 * @throws Error when no case is selected for the category.
 */
export const runBenchmarkCategory = async (input: BenchmarkRunCategoryInput): Promise<BenchmarkRunCategoryOutput> => {
  const workspaceFolder = input.workspaceFolder ?? process.cwd()
  const runId = input.runId ?? uuid()
  const allCases = await listBenchmarkCases({
    workspaceFolder,
    category: input.category
  })

  const wantedTitles = input.titles != null && input.titles.length > 0
    ? new Set(input.titles)
    : null
  const selectedCases = wantedTitles == null
    ? allCases
    : allCases.filter(item => wantedTitles.has(item.title))

  if (selectedCases.length === 0) {
    throw new Error(`No benchmark cases found for category: ${input.category}`)
  }

  // A NaN concurrency would survive Math.min/Math.max and make Array.from create
  // zero workers, so the run would "succeed" with no results. Fall back to the
  // default instead, and floor fractions to a whole worker count.
  const requested = input.concurrency ?? 2
  const workerCount = Number.isNaN(requested) ? 2 : Math.floor(requested)
  const concurrency = Math.max(1, Math.min(workerCount, selectedCases.length))

  const results: BenchmarkResult[] = []
  let nextIndex = 0

  // Each worker repeatedly claims the next unclaimed case. Claiming happens
  // synchronously (no await between read and increment), so no case is taken twice.
  const worker = async () => {
    while (nextIndex < selectedCases.length) {
      const item = selectedCases[nextIndex]
      nextIndex += 1
      if (item == null) continue

      emitRunEvent(input, {
        runId,
        category: input.category,
        title: item.title,
        scope: 'category',
        phase: 'discover',
        message: `Scheduling case ${item.title}`
      })
      const output = await runBenchmarkCase({
        workspaceFolder,
        category: input.category,
        title: item.title,
        adapter: input.adapter,
        model: input.model,
        systemPrompt: input.systemPrompt,
        permissionMode: input.permissionMode,
        runtime: input.runtime,
        env: input.env,
        categoryRunId: runId,
        onEvent: input.onEvent
      })
      results.push(output.result)
    }
  }

  await Promise.all(Array.from({ length: concurrency }, () => worker()))

  return {
    runId,
    category: input.category,
    results: results.sort((a, b) => a.title.localeCompare(b.title))
  }
}
|
|
@@ -1,60 +0,0 @@
|
|
|
1
|
-
import z from 'zod'
|
|
2
|
-
|
|
3
|
-
/** Builds a string schema that trims its input and then requires at least one character. */
const nonEmptyText = () => z.string().trim().min(1)

/** Builds a number schema restricted to the closed interval `[0, 1]`. */
const unitScore = () => z.number().min(0).max(1)

/** Frontmatter of a benchmark case; `timeoutSec` defaults to 900. */
export const BenchmarkFrontmatterSchema = z.object({
  title: z.string().trim().optional(),
  description: z.string().trim().optional(),
  baseCommit: nonEmptyText(),
  setupCommand: nonEmptyText(),
  testCommand: nonEmptyText(),
  timeoutSec: z.number().int().positive().default(900)
})

/** Verdict of a judged benchmark case. */
export const BenchmarkStatusSchema = z.enum(['pass', 'partial', 'fail'])

/** Individual scores of a judged benchmark case, each within `[0, 1]`. */
export const BenchmarkScoresSchema = z.object({
  testScore: unitScore(),
  goalScore: unitScore(),
  referenceScore: unitScore()
})

/** Persisted result of one benchmark case run. */
export const BenchmarkResultSchema = z.object({
  category: nonEmptyText(),
  title: nonEmptyText(),
  status: BenchmarkStatusSchema,
  finalScore: unitScore(),
  scores: BenchmarkScoresSchema,
  baseCommit: nonEmptyText(),
  durationMs: z.number().int().nonnegative(),
  setupCommand: nonEmptyText(),
  testCommand: nonEmptyText(),
  testExitCode: z.number().int(),
  judgeSummary: z.string(),
  issues: z.array(z.string()),
  timestamp: nonEmptyText(),
  runId: nonEmptyText().optional(),
  taskSessionId: nonEmptyText().optional(),
  taskExitCode: z.number().int().optional(),
  changedFiles: z.array(z.string()).optional(),
  categoryRunId: nonEmptyText().optional()
})

/** Summary of a benchmark run covering either a single case (`result`) or several (`results`). */
export const BenchmarkRunSummarySchema = z.object({
  runId: nonEmptyText(),
  category: nonEmptyText(),
  title: nonEmptyText().optional(),
  status: z.enum(['queued', 'running', 'completed', 'failed']),
  startedAt: z.number().int().positive(),
  finishedAt: z.number().int().positive().optional(),
  completedCount: z.number().int().nonnegative().optional(),
  totalCount: z.number().int().positive().optional(),
  result: BenchmarkResultSchema.optional(),
  results: z.array(BenchmarkResultSchema).optional(),
  error: z.string().optional(),
  lastMessage: z.string().optional()
})

export type BenchmarkFrontmatter = z.infer<typeof BenchmarkFrontmatterSchema>
export type BenchmarkStatus = z.infer<typeof BenchmarkStatusSchema>
export type BenchmarkScores = z.infer<typeof BenchmarkScoresSchema>
export type BenchmarkResult = z.infer<typeof BenchmarkResultSchema>
export type BenchmarkRunSummary = z.infer<typeof BenchmarkRunSummarySchema>
|
|
@@ -1,80 +0,0 @@
|
|
|
1
|
-
import type { BenchmarkFrontmatter, BenchmarkResult } from './schema'
|
|
2
|
-
|
|
3
|
-
/** A discovered benchmark case together with the files and metadata that describe it. */
export interface BenchmarkCase {
  id: string
  category: string
  title: string
  caseDir: string
  rfcPath: string
  /** Path of the reference patch the candidate patch is compared against. */
  patchPath: string
  /** Path of the patch applied to the workspace before the test command runs. */
  patchTestPath: string
  /** Text handed to the agent as the task description. */
  rfcBody: string
  rfcRaw: string
  summary: string
  /** Validated frontmatter (base commit, setup/test commands, timeout). */
  frontmatter: BenchmarkFrontmatter
  latestResult?: BenchmarkResult | null
}

/** Aggregate view of one benchmark category. */
export interface BenchmarkCategory {
  category: string
  caseCount: number
  /** Counter per result status. */
  lastStatuses: Record<'pass' | 'partial' | 'fail', number>
}

/** Options for listing benchmark cases. */
export interface BenchmarkListOptions {
  workspaceFolder?: string
  category?: string
}

/** Identifies a single benchmark case by category and title. */
export interface BenchmarkCaseSelector {
  workspaceFolder?: string
  category: string
  title: string
}

/** Permission modes that can be forwarded to the agent task. */
export type BenchmarkPermissionMode = 'default' | 'acceptEdits' | 'plan' | 'dontAsk' | 'bypassPermissions'

/** Agent settings forwarded to the task run of a benchmark. */
export interface BenchmarkAgentOptions {
  adapter?: string
  model?: string
  systemPrompt?: string
  permissionMode?: BenchmarkPermissionMode
  /** Task runtime; the runner falls back to `'cli'` when omitted. */
  runtime?: 'cli' | 'server'
  /** Environment entries passed to the agent task and to the test command. */
  env?: Record<string, string | null | undefined>
}

/** Progress notification emitted while a benchmark runs. */
export interface BenchmarkRunEvent {
  runId: string
  category: string
  title?: string
  /** Whether the event was emitted by a category run or by a single case run. */
  scope: 'category' | 'case'
  phase: 'discover' | 'workspace' | 'setup' | 'task' | 'verify' | 'result'
  message: string
  /** ISO-8601 time at which the event was emitted. */
  timestamp: string
}

/** Input of a single-case benchmark run. */
export interface BenchmarkRunCaseInput extends BenchmarkCaseSelector, BenchmarkAgentOptions {
  /** Run id; a fresh uuid is generated when omitted. */
  runId?: string
  /** Run id of the enclosing category run, when the case is part of one. */
  categoryRunId?: string
  /** Receives progress events. */
  onEvent?: (event: BenchmarkRunEvent) => void
}

/** Output of a single-case benchmark run. */
export interface BenchmarkRunCaseOutput {
  runId: string
  result: BenchmarkResult
}

/** Input of a category-wide benchmark run. */
export interface BenchmarkRunCategoryInput extends BenchmarkAgentOptions {
  workspaceFolder?: string
  category: string
  /** When non-empty, only cases whose title is listed are run. */
  titles?: string[]
  /** Number of cases run at once; defaults to 2 and is clamped to `[1, selected case count]`. */
  concurrency?: number
  /** Run id; a fresh uuid is generated when omitted. */
  runId?: string
  /** Receives progress events of the category run and of every case run. */
  onEvent?: (event: BenchmarkRunEvent) => void
}

/** Output of a category-wide benchmark run. */
export interface BenchmarkRunCategoryOutput {
  runId: string
  category: string
  /** Case results sorted by title. */
  results: BenchmarkResult[]
}
|
|
@@ -1,144 +0,0 @@
|
|
|
1
|
-
import { spawn } from 'node:child_process'
|
|
2
|
-
import { access, mkdir, readFile, symlink } from 'node:fs/promises'
|
|
3
|
-
import { constants } from 'node:fs'
|
|
4
|
-
import { dirname, join } from 'node:path'
|
|
5
|
-
import process from 'node:process'
|
|
6
|
-
|
|
7
|
-
/** Captured outcome of a finished child process. */
export interface CommandResult {
  /** Exit code of the process, or `-1` when it closed without one. */
  exitCode: number
  /** Everything the process wrote to stdout. */
  stdout: string
  /** Everything the process wrote to stderr. */
  stderr: string
  /** Wall-clock milliseconds between the call and the process closing. */
  durationMs: number
}
|
|
13
|
-
|
|
14
|
-
/** Parameters accepted by `execCommand`. */
interface ExecCommandInput {
  /** Executable to spawn. */
  command: string
  /** Arguments passed to the executable; defaults to none. */
  args?: string[]
  /** Working directory of the child process. */
  cwd: string
  /** Extra environment entries merged over `process.env`. */
  env?: Record<string, string | null | undefined>
  /** When set, the child receives `SIGTERM` after this many milliseconds. */
  timeoutMs?: number
  /** Text written to the child's stdin before it is closed. */
  input?: string
}
|
|
22
|
-
|
|
23
|
-
/**
 * Reports whether `targetPath` is visible to the current process.
 *
 * @param targetPath Path to probe.
 * @returns `true` when `access` with `F_OK` succeeds, `false` on any failure.
 */
export const pathExists = async (targetPath: string) => {
  return access(targetPath, constants.F_OK).then(
    () => true,
    () => false
  )
}
|
|
31
|
-
|
|
32
|
-
/**
 * Reads a file as UTF-8 text when it exists.
 *
 * @param targetPath Path of the file to read.
 * @returns The file content, or `null` when the path does not exist.
 */
export const readTextIfExists = async (targetPath: string) => {
  const present = await pathExists(targetPath)
  return present ? readFile(targetPath, 'utf-8') : null
}
|
|
36
|
-
|
|
37
|
-
/**
 * Creates the directory that would contain `targetPath`, including missing ancestors.
 *
 * @param targetPath Path whose parent directory must exist afterwards.
 */
export const ensureParentDir = async (targetPath: string) => {
  const parentDir = dirname(targetPath)
  await mkdir(parentDir, { recursive: true })
}
|
|
40
|
-
|
|
41
|
-
/**
 * Spawns a child process, captures its output and resolves once it has closed.
 *
 * The child inherits `process.env` with `input.env` merged on top. Entries of
 * `input.env` whose value is `null` or `undefined` remove the variable from the
 * child's environment. A non-zero exit code does not reject; it is reported
 * through `exitCode`.
 *
 * @param input Command, arguments, working directory and optional env, timeout and stdin text.
 * @returns Exit code (`-1` when the process closed without one, e.g. after being
 * signalled), captured stdout/stderr and the elapsed milliseconds.
 * @throws Rejects when the process cannot be spawned, or when writing to its
 * stdin fails for a reason other than the child having closed the pipe.
 */
export const execCommand = async (input: ExecCommandInput): Promise<CommandResult> => {
  const startedAt = Date.now()
  const {
    command,
    args = [],
    cwd,
    env,
    timeoutMs,
    input: stdin
  } = input

  // Build the child environment explicitly: spreading a `null` value into the
  // spawn options would hand the child the literal string "null".
  const childEnv: NodeJS.ProcessEnv = { ...process.env }
  for (const [key, value] of Object.entries(env ?? {})) {
    if (value == null) {
      delete childEnv[key]
    } else {
      childEnv[key] = value
    }
  }

  return new Promise((resolve, reject) => {
    const child = spawn(command, args, {
      cwd,
      env: childEnv,
      stdio: 'pipe'
    })

    const stdout: string[] = []
    const stderr: string[] = []
    let settled = false
    // Declared before `finish` so the closure never reads an uninitialised binding.
    let timer: NodeJS.Timeout | undefined

    const finish = (value: CommandResult | Error, isError = false) => {
      if (settled) return
      settled = true
      if (timer != null) clearTimeout(timer)
      if (isError) {
        reject(value)
        return
      }
      resolve(value as CommandResult)
    }

    // Let the streams decode UTF-8 themselves so a multi-byte character split
    // across two chunks is not corrupted by per-chunk string conversion.
    child.stdout.setEncoding('utf8')
    child.stderr.setEncoding('utf8')
    child.stdout.on('data', (chunk) => {
      stdout.push(String(chunk))
    })
    child.stderr.on('data', (chunk) => {
      stderr.push(String(chunk))
    })
    child.on('error', (error) => {
      finish(error, true)
    })
    child.on('close', (code) => {
      finish({
        exitCode: code ?? -1,
        stdout: stdout.join(''),
        stderr: stderr.join(''),
        durationMs: Date.now() - startedAt
      })
    })
    // A child that exits without reading its stdin makes the write fail with
    // EPIPE. Without a listener that would surface as an uncaught exception;
    // the `close` handler still reports the real outcome.
    child.stdin.on('error', (error: NodeJS.ErrnoException) => {
      if (error.code !== 'EPIPE') finish(error, true)
    })

    if (timeoutMs != null) {
      timer = setTimeout(() => {
        child.kill('SIGTERM')
      }, timeoutMs)
    }

    if (stdin != null && stdin !== '') {
      child.stdin.write(stdin)
    }
    child.stdin.end()
  })
}
|
|
107
|
-
|
|
108
|
-
/**
 * Runs a command line through the user's shell.
 *
 * The shell is `$SHELL`, falling back to `/bin/sh` when the variable is unset
 * or empty, and is invoked with `-lc <command>`.
 *
 * @param input Command line plus working directory and optional env, timeout and stdin text.
 * @returns The result reported by `execCommand`.
 */
export const execShellCommand = async (input: Omit<ExecCommandInput, 'command' | 'args'> & { command: string }) => {
  const { command: script, cwd, env, timeoutMs, input: stdin } = input
  // `||` on purpose: an empty SHELL must fall back as well.
  const shell = process.env.SHELL || '/bin/sh'

  return execCommand({
    command: shell,
    args: ['-lc', script],
    cwd,
    env,
    timeoutMs,
    input: stdin
  })
}
|
|
119
|
-
|
|
120
|
-
/**
 * Lists the files named in the `diff --git a/<old> b/<new>` header lines of a patch.
 *
 * The new path is reported, except when it is `/dev/null`, in which case the old
 * path is used. Each file appears once, in order of first occurrence.
 *
 * @param patch Patch text to scan.
 * @returns The distinct file paths found; empty when the patch has no such headers.
 */
export const parseDiffFiles = (patch: string) => {
  const headerPattern = /^diff --git a\/(.+?) b\/(.+)$/gm
  const seen = new Set<string>()

  let header: RegExpExecArray | null
  while ((header = headerPattern.exec(patch)) !== null) {
    const oldPath = header[1]
    const newPath = header[2]
    seen.add(newPath === '/dev/null' ? oldPath : newPath)
  }

  return Array.from(seen)
}
|
|
129
|
-
|
|
130
|
-
/**
 * Trims a text and shortens it to at most `limit` characters.
 *
 * @param value Text to summarise.
 * @param limit Maximum number of characters kept from the trimmed text; defaults to 1200.
 * @returns The trimmed text, or its first `limit` characters followed by `...`
 * when it is longer than `limit`.
 */
export const summarizeText = (value: string, limit = 1200) => {
  const trimmed = value.trim()
  return trimmed.length > limit ? `${trimmed.slice(0, limit)}...` : trimmed
}
|
|
135
|
-
|
|
136
|
-
/**
 * Links the `node_modules` directory of `sourceDir` into `targetDir`.
 *
 * Nothing happens when the source directory has no `node_modules` or the target
 * already has one. The link is created as a junction on Windows and as a
 * directory symlink elsewhere.
 *
 * @param sourceDir Directory that holds the prepared `node_modules`.
 * @param targetDir Directory that should receive the link.
 */
export const linkPreparedNodeModules = async (sourceDir: string, targetDir: string) => {
  const linkType = process.platform === 'win32' ? 'junction' : 'dir'

  for (const name of ['node_modules']) {
    const sourcePath = join(sourceDir, name)
    const targetPath = join(targetDir, name)

    const sourceMissing = !await pathExists(sourcePath)
    if (sourceMissing) continue
    if (await pathExists(targetPath)) continue

    await symlink(sourcePath, targetPath, linkType)
  }
}
|