zenkit 0.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CONTRIBUTING.md +63 -0
- package/LICENSE +21 -0
- package/README.md +242 -0
- package/agents/backend-architect.md +19 -0
- package/agents/frontend-architect.md +19 -0
- package/agents/implementation-auditor.md +19 -0
- package/agents/product-manager.md +19 -0
- package/agents/qa-test-engineer.md +19 -0
- package/agents/security-specialist.md +19 -0
- package/agents/system-architect.md +19 -0
- package/agents/technical-writer.md +19 -0
- package/agents/ux-engineer.md +19 -0
- package/benchmark/feature-specs/cli-tool.json +58 -0
- package/benchmark/feature-specs/handoff-system.json +69 -0
- package/benchmark/feature-specs/protocol-completeness.json +85 -0
- package/benchmark/feature-specs/schema-validator-baseline.json +93 -0
- package/benchmark/feature-specs/schema-validator-playground.json +92 -0
- package/benchmark/feature-specs/self-audit.json +76 -0
- package/benchmark/fixtures/valid-handoff.json +13 -0
- package/benchmark/scripts/compare.ts +172 -0
- package/benchmark/scripts/report.ts +102 -0
- package/benchmark/scripts/run-all.ts +125 -0
- package/benchmark/scripts/run.ts +595 -0
- package/benchmark/scripts/visualize.ts +120 -0
- package/bin/zenkit.js +24 -0
- package/commands/audit.md +28 -0
- package/commands/build.md +26 -0
- package/commands/checkpoint.md +28 -0
- package/commands/handoff.md +28 -0
- package/commands/plan.md +27 -0
- package/commands/refactor.md +27 -0
- package/commands/ship.md +28 -0
- package/commands/spec.md +26 -0
- package/dist/cli.d.ts +2 -0
- package/dist/cli.d.ts.map +1 -0
- package/dist/cli.js +174 -0
- package/dist/cli.js.map +1 -0
- package/dist/index.d.ts +765 -0
- package/dist/index.d.ts.map +1 -0
- package/dist/index.js +121 -0
- package/dist/index.js.map +1 -0
- package/dist/schemas/audit.schema.json +63 -0
- package/dist/schemas/benchmark.schema.json +118 -0
- package/dist/schemas/checkpoint.schema.json +64 -0
- package/dist/schemas/feature-spec.schema.json +76 -0
- package/dist/schemas/handoff.schema.json +78 -0
- package/dist/schemas/schemas/audit.schema.json +63 -0
- package/dist/schemas/schemas/benchmark.schema.json +118 -0
- package/dist/schemas/schemas/checkpoint.schema.json +64 -0
- package/dist/schemas/schemas/feature-spec.schema.json +76 -0
- package/dist/schemas/schemas/handoff.schema.json +78 -0
- package/dist/schemas/schemas/task.schema.json +69 -0
- package/dist/schemas/task.schema.json +69 -0
- package/docs/agent-contract.md +36 -0
- package/docs/architecture.md +88 -0
- package/docs/benchmarking.md +51 -0
- package/docs/command-model.md +43 -0
- package/docs/philosophy.md +35 -0
- package/docs/roadmap.md +43 -0
- package/docs/self-audit.md +29 -0
- package/hooks/post-change.md +30 -0
- package/hooks/pre-change.md +27 -0
- package/hooks/pre-ship.md +30 -0
- package/package.json +92 -0
- package/rubrics/architectural-alignment.md +26 -0
- package/rubrics/execution-quality.md +26 -0
- package/rubrics/verbosity-score.md +26 -0
- package/schemas/audit.schema.json +63 -0
- package/schemas/benchmark.schema.json +118 -0
- package/schemas/checkpoint.schema.json +64 -0
- package/schemas/feature-spec.schema.json +76 -0
- package/schemas/handoff.schema.json +78 -0
- package/schemas/task.schema.json +69 -0
- package/skills/architecture-review.md +17 -0
- package/skills/backend-change.md +17 -0
- package/skills/bug-triage.md +17 -0
- package/skills/frontend-change.md +17 -0
- package/skills/prompt-pruning.md +17 -0
- package/skills/release-check.md +17 -0
- package/skills/security-review.md +17 -0
- package/templates/agent.template.md +18 -0
- package/templates/command.template.md +21 -0
- package/templates/skill.template.md +15 -0
- package/templates/task.template.md +19 -0
|
@@ -0,0 +1,595 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* ZenKit Benchmark Runner
|
|
3
|
+
*
|
|
4
|
+
* Verifies acceptance criteria from a feature spec against the actual
|
|
5
|
+
* implementation. Produces structured results distinguishing validated
|
|
6
|
+
* checks from estimated/illustrative data.
|
|
7
|
+
*
|
|
8
|
+
* Usage: npx tsx benchmark/scripts/run.ts [feature-spec-path]
|
|
9
|
+
*/
|
|
10
|
+
import fs from 'fs'
|
|
11
|
+
import path from 'path'
|
|
12
|
+
import Ajv from 'ajv'
|
|
13
|
+
import addFormats from 'ajv-formats'
|
|
14
|
+
|
|
15
|
+
// --- Types ---
|
|
16
|
+
|
|
17
|
+
/**
 * Describes how one acceptance criterion is checked.
 *
 * `type` selects the check that `verifyCriterion` performs; the optional
 * fields carry the inputs read by that particular check.
 */
interface Verification {
  /** Which check to run. */
  type:
    | 'file_exists'
    | 'file_contains'
    | 'schema_count'
    | 'examples_valid'
    | 'schemas_consistent'
    | 'test_passes'
    | 'json_path_equals'
  /** Project-relative file path (read by file_exists, file_contains, json_path_equals). */
  path?: string
  /** Literal substring searched for in the file (read by file_contains). */
  pattern?: string
  /** Expected number of schema files (read by schema_count). */
  expected?: number
  /** Command to execute (read by test_passes; 'npm test' is used when absent). */
  command?: string
  /** Dot-separated property path into the parsed JSON file (read by json_path_equals). */
  json_path?: string
  /** Value the resolved property must equal (read by json_path_equals). */
  equals?: unknown
}
|
|
26
|
+
|
|
27
|
+
/** A single acceptance criterion from a feature spec, paired with the check that verifies it. */
interface AcceptanceCriterion {
  /** Identifier copied into the criterion's result and shown in failure listings. */
  id: string
  /** Human-readable statement of the criterion. */
  description: string
  /** The automated check used to decide pass/fail. */
  verification: Verification
}
|
|
32
|
+
|
|
33
|
+
/** Shape of a feature-spec JSON file consumed by the benchmark runner. */
interface FeatureSpec {
  /** Used to build the benchmark id and the result file name. */
  feature_id: string
  /** Display name; the spec stage requires it to be non-empty. */
  name: string
  description: string
  /** Copied verbatim into the benchmark result. */
  mode: 'zenkit' | 'baseline'
  /** Criteria verified one by one during the audit stage. */
  acceptance_criteria: AcceptanceCriterion[]
  constraints: string[]
  /** Project-relative paths that the build stage expects to exist. */
  expected_files: string[]
  assigned_commands: string[]
  /** 'high' and 'medium' scale the token estimate; any other value applies no scaling. */
  estimated_complexity: string
  /** Declared scope limitations; the spec stage fails when none are given. */
  limitations: string[]
}
|
|
45
|
+
|
|
46
|
+
/** Outcome of verifying one acceptance criterion. */
interface CriterionResult {
  /** Id of the criterion this result belongs to. */
  id: string
  /** Description copied from the criterion. */
  description: string
  status: 'pass' | 'fail'
  /** Short explanation of what was observed. */
  evidence: string
  /** Name of the check that produced the result, or 'unknown' for unrecognised types. */
  verification_type: string
}
|
|
53
|
+
|
|
54
|
+
/** Summary of one benchmark stage (spec, schemas, build, audit). */
interface StageResult {
  /** Stage name. */
  name: string
  status: 'pass' | 'fail' | 'skipped'
  /** Wall-clock time spent in the stage, in milliseconds. */
  duration_ms: number
  /** Number of individual checks the stage counts. */
  checks_run: number
  /** Number of those checks that succeeded. */
  checks_passed: number
  /** One line of detail per check or finding. */
  details: string[]
}
|
|
62
|
+
|
|
63
|
+
/** Full benchmark report written to `benchmark/results/<feature_id>-live.json`. */
interface BenchmarkResult {
  /** `bench-<feature_id>-<timestamp>`. */
  benchmark_id: string
  /** Version string of the result format emitted by the runner. */
  version: string
  mode: 'zenkit' | 'baseline'
  /** Name of the feature spec that was benchmarked. */
  task_name: string
  /** Spec path as given on the command line (or the default path). */
  feature_spec: string
  /** ISO-8601 timestamps bracketing the run. */
  started_at: string
  completed_at: string
  duration_ms: number
  /** 'pass' when every stage passed, 'partial' when at least one did, otherwise 'fail'. */
  status: 'pass' | 'fail' | 'partial'
  expected_files: string[]
  files_found: string[]
  files_missing: string[]
  acceptance_criteria_results: CriterionResult[]
  stages: StageResult[]
  /** Aggregated pass/fail counts and headline validity flags. */
  validation_summary: {
    total_criteria: number
    criteria_passed: number
    criteria_failed: number
    schemas_valid: boolean
    examples_valid: boolean
  }
  /** Token/cost figures; `estimated` is heuristic, `actual` is null when nothing was measured. */
  telemetry: {
    estimated: {
      tokens: number
      cost_usd: number
      /** Plain-language description of how the estimate was derived. */
      basis: string
    }
    actual: null | {
      tokens: number
      cost_usd: number
    }
  }
  /** Caveats about how to interpret the numbers in this report. */
  uncertainty: string[]
  /** Limitations copied from the feature spec. */
  limitations: string[]
}
|
|
99
|
+
|
|
100
|
+
// --- Helpers ---
|
|
101
|
+
|
|
102
|
+
// Base directory for all project-relative paths: two levels above this script's directory.
const ROOT = path.resolve(__dirname, '../..')
|
|
103
|
+
|
|
104
|
+
/**
 * Converts a project-relative path into an absolute path anchored at ROOT.
 *
 * @param p Path to resolve against ROOT.
 * @returns The result of `path.resolve(ROOT, p)`.
 */
function resolve(p: string): string {
  const absolute = path.resolve(ROOT, p)
  return absolute
}
|
|
107
|
+
|
|
108
|
+
/**
 * Reports whether a project-relative path exists on disk.
 *
 * @param p Path relative to the project root.
 * @returns `true` when `fs.existsSync` finds the resolved path.
 */
function fileExists(p: string): boolean {
  const target = resolve(p)
  return fs.existsSync(target)
}
|
|
111
|
+
|
|
112
|
+
/**
 * Checks whether a project-relative file contains a literal substring.
 *
 * @param p Path relative to the project root.
 * @param pattern Substring to look for (plain text, not a regular expression).
 * @returns `false` when the file does not exist; otherwise whether the
 *   UTF-8 contents include `pattern`.
 */
function fileContains(p: string, pattern: string): boolean {
  if (fileExists(p)) {
    const text = fs.readFileSync(resolve(p), 'utf-8')
    return text.includes(pattern)
  }
  return false
}
|
|
117
|
+
|
|
118
|
+
/**
 * Compiles every `*.schema.json` file in the project's `schemas` directory.
 *
 * A file that cannot be read, parsed, or compiled is recorded in `errors`
 * (prefixed with its file name) rather than aborting the whole run.
 *
 * @returns `count` — number of schema files found; `errors` — one entry per
 *   failing file; `valid` — `true` when `errors` is empty.
 * @throws If the `schemas` directory itself cannot be listed.
 */
function compileAllSchemas(): { valid: boolean; count: number; errors: string[] } {
  const schemasDir = resolve('schemas')
  const files = fs.readdirSync(schemasDir).filter(f => f.endsWith('.schema.json'))
  const errors: string[] = []

  for (const file of files) {
    // Each schema gets its own Ajv instance, so nothing compiled for one
    // file is visible while compiling the next.
    const localAjv = new Ajv({ allErrors: true, strict: false })
    addFormats(localAjv)
    try {
      // Parsing happens inside the try so malformed JSON is reported as a
      // per-file error instead of crashing the runner.
      const schema = JSON.parse(fs.readFileSync(path.join(schemasDir, file), 'utf-8'))
      localAjv.compile(schema)
    } catch (err) {
      errors.push(`${file}: ${err}`)
    }
  }

  return { valid: errors.length === 0, count: files.length, errors }
}
|
|
136
|
+
|
|
137
|
+
/**
 * Checks that all `*.schema.json` files in `schemas` declare the same
 * JSON Schema draft via their `$schema` field.
 *
 * The check fails when no schema files exist, when any file omits
 * `$schema`, or when more than one distinct draft is declared.
 *
 * @returns `consistent` plus a one-line human-readable explanation.
 * @throws If the directory cannot be listed or a schema file is not valid JSON.
 */
function checkSchemasConsistent(): { consistent: boolean; details: string } {
  const schemasDir = resolve('schemas')
  const files = fs.readdirSync(schemasDir).filter(f => f.endsWith('.schema.json'))
  const drafts = new Set<string>()
  // Files without a $schema declaration; tracked so the "all schemas use X"
  // message is never claimed for files that declare nothing.
  const undeclared: string[] = []

  for (const file of files) {
    const schema = JSON.parse(fs.readFileSync(path.join(schemasDir, file), 'utf-8'))
    if (schema.$schema) drafts.add(schema.$schema)
    else undeclared.push(file)
  }

  if (files.length === 0) {
    return { consistent: false, details: 'No schema files found' }
  }

  const problems: string[] = []
  if (drafts.size > 1) problems.push(`Inconsistent drafts: ${[...drafts].join(', ')}`)
  if (undeclared.length > 0) problems.push(`Missing $schema in: ${undeclared.join(', ')}`)

  if (problems.length > 0) {
    return { consistent: false, details: problems.join('; ') }
  }

  return {
    consistent: true,
    details: `All ${files.length} schemas use ${[...drafts][0]}`,
  }
}
|
|
155
|
+
|
|
156
|
+
/**
 * Validates example data shipped with the project.
 *
 * Two independent checks are performed, each only if its input exists:
 *  1. every `*.json` file in `benchmark/fixtures` is validated against
 *     `schemas/handoff.schema.json`;
 *  2. `src/lib/schemas.ts` is scanned for a `<name>:` entry for each known
 *     schema name (a textual check, not an import).
 *
 * @returns `valid` — `false` if any performed check failed; `details` — one
 *   line per check, or a single note when there was nothing to check.
 * @throws If fixtures exist but the handoff schema cannot be read or compiled.
 */
function checkExamplesValid(): { valid: boolean; details: string[] } {
  const schemasDir = resolve('schemas')
  // Schema names expected to be registered in src/lib/schemas.ts.
  const schemaNames = ['handoff', 'task', 'audit', 'checkpoint', 'benchmark']

  const details: string[] = []
  let allValid = true

  const fixtureDir = resolve('benchmark/fixtures')
  const fixtures = fs.existsSync(fixtureDir)
    ? fs.readdirSync(fixtureDir).filter(f => f.endsWith('.json'))
    : []

  if (fixtures.length > 0) {
    // Compile the handoff schema once; every fixture is validated against it.
    const localAjv = new Ajv({ allErrors: true, strict: false })
    addFormats(localAjv)
    const schema = JSON.parse(fs.readFileSync(path.join(schemasDir, 'handoff.schema.json'), 'utf-8'))
    const validate = localAjv.compile(schema)

    for (const fixture of fixtures) {
      let data: unknown
      try {
        data = JSON.parse(fs.readFileSync(path.join(fixtureDir, fixture), 'utf-8'))
      } catch (err) {
        // An unreadable or malformed fixture is a failed example, not a runner crash.
        details.push(`${fixture}: INVALID — ${err}`)
        allValid = false
        continue
      }

      if (validate(data)) {
        details.push(`${fixture}: valid against handoff.schema.json`)
      } else {
        details.push(`${fixture}: INVALID — ${validate.errors?.map(e => e.message).join(', ')}`)
        allValid = false
      }
    }
  }

  // Also check that the schema library file registers all schemas
  const schemasTs = resolve('src/lib/schemas.ts')
  if (fs.existsSync(schemasTs)) {
    const content = fs.readFileSync(schemasTs, 'utf-8')
    for (const name of schemaNames) {
      if (content.includes(`${name}:`)) {
        details.push(`schemas.ts registers '${name}'`)
      } else {
        details.push(`schemas.ts MISSING registration for '${name}'`)
        allValid = false
      }
    }
  }

  // The result stays "valid" when there was nothing to check; say so
  // explicitly so the evidence string is never empty.
  if (details.length === 0) {
    details.push('No fixtures or src/lib/schemas.ts found — nothing was validated')
  }

  return { valid: allValid, details }
}
|
|
208
|
+
|
|
209
|
+
// --- Criterion Verification ---
|
|
210
|
+
|
|
211
|
+
/**
 * Runs the check described by `criterion.verification` and reports the outcome.
 *
 * The function never throws for a malformed criterion: a missing required
 * field or an unrecognised verification type yields a `fail` result whose
 * `evidence` explains the problem.
 *
 * @param criterion Acceptance criterion taken from the feature spec.
 * @returns A result carrying the criterion's id/description, pass/fail
 *   status, evidence text, and the verification type that produced it.
 */
function verifyCriterion(criterion: AcceptanceCriterion): CriterionResult {
  const { verification } = criterion
  const kind: string = verification.type

  // Every branch reports the same id/description; only the outcome differs.
  const report = (passed: boolean, evidence: string, verificationType: string = kind): CriterionResult => ({
    id: criterion.id,
    description: criterion.description,
    status: passed ? 'pass' : 'fail',
    evidence,
    verification_type: verificationType,
  })

  // Spec files are unvalidated JSON, so required inputs may be absent at runtime.
  const missingField = (field: string): CriterionResult =>
    report(false, `Verification '${kind}' is missing required field '${field}'`)

  switch (verification.type) {
    case 'file_exists': {
      const { path: target } = verification
      if (typeof target !== 'string') return missingField('path')
      const exists = fileExists(target)
      return report(exists, exists ? `${target} exists` : `${target} not found`)
    }

    case 'file_contains': {
      const { path: target, pattern } = verification
      if (typeof target !== 'string') return missingField('path')
      if (typeof pattern !== 'string') return missingField('pattern')
      const found = fileContains(target, pattern)
      return report(
        found,
        found ? `${target} contains '${pattern}'` : `${target} does not contain '${pattern}'`,
      )
    }

    case 'schema_count': {
      const { expected } = verification
      if (typeof expected !== 'number') return missingField('expected')
      const result = compileAllSchemas()
      // Only the file count decides pass/fail; compilation errors are reported as evidence.
      return report(
        result.count === expected,
        `${result.count} schemas found (expected ${expected}), ${result.errors.length} compilation errors`,
      )
    }

    case 'examples_valid': {
      const result = checkExamplesValid()
      return report(result.valid, result.details.join('; '))
    }

    case 'schemas_consistent': {
      const result = checkSchemasConsistent()
      return report(result.consistent, result.details)
    }

    case 'test_passes': {
      // NOTE(review): the command comes from the spec file and is executed
      // through a shell — only run specs you trust.
      const cmd = verification.command || 'npm test'
      try {
        const { execSync } = require('child_process')
        execSync(cmd, { cwd: ROOT, encoding: 'utf-8', timeout: 60000, stdio: 'pipe' })
        return report(true, `Command '${cmd}' exited with code 0`)
      } catch (err: unknown) {
        // The thrown error carries the exit code in `status` when one is available.
        const status =
          typeof err === 'object' && err !== null ? (err as { status?: unknown }).status : undefined
        return report(false, `Command '${cmd}' failed with exit code ${status || 'unknown'}`)
      }
    }

    case 'json_path_equals': {
      const { path: filePath, json_path: jsonPath, equals: expectedValue } = verification
      if (typeof filePath !== 'string') return missingField('path')
      if (typeof jsonPath !== 'string') return missingField('json_path')
      if (!fileExists(filePath)) return report(false, `File not found: ${filePath}`)

      try {
        const data = JSON.parse(fs.readFileSync(resolve(filePath), 'utf-8'))
        // Simple dot-path traversal
        let current: any = data
        for (const part of jsonPath.split('.')) {
          if (current === undefined || current === null) break
          current = current[part]
        }
        // Values are compared by their JSON text. An unresolved path never
        // matches, even when `equals` was omitted from the spec.
        const match = current !== undefined && JSON.stringify(current) === JSON.stringify(expectedValue)
        return report(
          match,
          match
            ? `${filePath}:${jsonPath} equals ${JSON.stringify(expectedValue)}`
            : `${filePath}:${jsonPath} is ${JSON.stringify(current)}, expected ${JSON.stringify(expectedValue)}`,
        )
      } catch (err) {
        return report(false, `Error reading ${filePath}: ${err}`)
      }
    }

    default:
      return report(false, `Unknown verification type: ${kind}`, 'unknown')
  }
}
|
|
349
|
+
|
|
350
|
+
// --- Stage Runners ---
|
|
351
|
+
|
|
352
|
+
/**
 * Spec stage: sanity-checks the feature spec itself.
 *
 * Four checks are counted:
 *  1. the spec file validates against `schemas/feature-spec.schema.json`
 *     (when that schema is absent the check is logged as SKIP and still
 *     counts as not passed);
 *  2. `name` is non-empty;
 *  3. at least one acceptance criterion is defined;
 *  4. at least one limitation is declared.
 *
 * Missing or mistyped fields are reported as failed checks instead of
 * raising, because `spec` is unvalidated JSON at this point.
 *
 * @param spec Parsed feature spec.
 * @param specPath Path of the spec file; re-read for schema validation.
 * @returns Stage summary; status is 'pass' only when all four checks pass.
 */
function runSpecStage(spec: FeatureSpec, specPath: string): StageResult {
  const start = Date.now()
  const checks: string[] = []
  let passed = 0
  const total = 4

  // Validate spec against feature-spec.schema.json
  const specSchemaPath = resolve('schemas/feature-spec.schema.json')
  if (fs.existsSync(specSchemaPath)) {
    const localAjv = new Ajv({ allErrors: true, strict: false })
    addFormats(localAjv)
    const specSchema = JSON.parse(fs.readFileSync(specSchemaPath, 'utf-8'))
    const validate = localAjv.compile(specSchema)
    const specData = JSON.parse(fs.readFileSync(path.resolve(specPath), 'utf-8'))
    if (validate(specData)) {
      passed++
      checks.push('spec validates against feature-spec.schema.json')
    } else {
      checks.push(`FAIL: spec schema validation — ${validate.errors?.map(e => e.message).join(', ')}`)
    }
  } else {
    checks.push('SKIP: feature-spec.schema.json not found')
  }

  // Treat absent or wrongly-typed fields as empty so they fail their check
  // rather than throwing a TypeError.
  const nameLength = typeof spec.name === 'string' ? spec.name.length : 0
  const criteriaCount = Array.isArray(spec.acceptance_criteria) ? spec.acceptance_criteria.length : 0
  const limitationCount = Array.isArray(spec.limitations) ? spec.limitations.length : 0

  if (nameLength > 0) {
    passed++
    checks.push('name present')
  } else {
    checks.push('FAIL: name empty')
  }

  if (criteriaCount > 0) {
    passed++
    checks.push(`${criteriaCount} acceptance criteria defined`)
  } else {
    checks.push('FAIL: no acceptance criteria')
  }

  if (limitationCount > 0) {
    passed++
    checks.push(`${limitationCount} limitations declared`)
  } else {
    checks.push('FAIL: no limitations declared — specs should be honest about scope')
  }

  return {
    name: 'spec',
    status: passed === total ? 'pass' : 'fail',
    duration_ms: Date.now() - start,
    checks_run: total,
    checks_passed: passed,
    details: checks,
  }
}
|
|
393
|
+
|
|
394
|
+
/**
 * Build stage: confirms that every file listed in `spec.expected_files`
 * exists in the project.
 *
 * @param spec Parsed feature spec.
 * @returns Stage summary with one detail line per expected file; status is
 *   'pass' only when all of them were found.
 */
function runBuildStage(spec: FeatureSpec): StageResult {
  const begunAt = Date.now()
  const notes: string[] = []
  let present = 0

  spec.expected_files.forEach(file => {
    if (!fileExists(file)) {
      notes.push(`FAIL: ${file} not found`)
      return
    }
    present += 1
    notes.push(`${file} exists`)
  })

  const expectedCount = spec.expected_files.length
  return {
    name: 'build',
    status: present === expectedCount ? 'pass' : 'fail',
    duration_ms: Date.now() - begunAt,
    checks_run: expectedCount,
    checks_passed: present,
    details: notes,
  }
}
|
|
417
|
+
|
|
418
|
+
/**
 * Audit stage: summarises already-computed acceptance-criterion results.
 *
 * @param criteriaResults Results produced by verifying each criterion.
 * @returns Stage summary. Status is 'pass' only when every criterion passed
 *   (an empty list therefore passes); `details` holds one
 *   `[STATUS] id: evidence` line per criterion.
 */
function runAuditStage(criteriaResults: CriterionResult[]): StageResult {
  const start = Date.now()
  const total = criteriaResults.length
  const passed = criteriaResults.filter(c => c.status === 'pass').length

  return {
    name: 'audit',
    // Any failing criterion fails the stage; there is no partial status here.
    status: passed === total ? 'pass' : 'fail',
    duration_ms: Date.now() - start,
    checks_run: total,
    checks_passed: passed,
    details: criteriaResults.map(c => `[${c.status.toUpperCase()}] ${c.id}: ${c.evidence}`),
  }
}
|
|
432
|
+
|
|
433
|
+
/**
 * Schema stage: compiles all project schemas and checks that they agree on
 * a single `$schema` draft.
 *
 * The stage counts one check per schema file plus one for draft consistency.
 *
 * @returns Stage summary; status is 'pass' only when every schema compiled
 *   and the drafts are consistent.
 */
function runSchemaStage(): StageResult {
  const startedAt = Date.now()
  const compilation = compileAllSchemas()
  const draftCheck = checkSchemasConsistent()

  const failures = compilation.errors.length
  const consistencyPoint = draftCheck.consistent ? 1 : 0
  const stagePassed = compilation.valid && draftCheck.consistent

  return {
    name: 'schemas',
    status: stagePassed ? 'pass' : 'fail',
    duration_ms: Date.now() - startedAt,
    checks_run: compilation.count + 1,
    checks_passed: compilation.count - failures + consistencyPoint,
    details: [
      `${compilation.count} schemas compiled, ${failures} errors`,
      draftCheck.details,
    ].concat(compilation.errors),
  }
}
|
|
451
|
+
|
|
452
|
+
// --- Telemetry ---
|
|
453
|
+
|
|
454
|
+
/**
 * Heuristic token estimate for implementing a feature spec.
 *
 * Formula: (5000 + 2500 per acceptance criterion) scaled by 2.0 for 'high'
 * complexity, 1.5 for 'medium', and 1.0 for anything else, rounded to the
 * nearest integer.
 *
 * @param spec Parsed feature spec.
 * @returns Estimated token count (not a measurement).
 */
function estimateTokens(spec: FeatureSpec): number {
  const BASE_TOKENS = 5000
  const TOKENS_PER_CRITERION = 2500

  let complexityFactor: number
  switch (spec.estimated_complexity) {
    case 'high':
      complexityFactor = 2.0
      break
    case 'medium':
      complexityFactor = 1.5
      break
    default:
      complexityFactor = 1.0
  }

  const unscaled = BASE_TOKENS + spec.acceptance_criteria.length * TOKENS_PER_CRITERION
  return Math.round(unscaled * complexityFactor)
}
|
|
461
|
+
|
|
462
|
+
/**
 * Converts a token estimate into an estimated cost in US dollars.
 *
 * Assumes 60% of tokens are input priced at $3 per million and 40% are
 * output priced at $15 per million.
 *
 * @param tokens Estimated total token count.
 * @returns Estimated cost in USD.
 */
function estimateCost(tokens: number): number {
  const inputCost = tokens * 0.6 * 3
  const outputCost = tokens * 0.4 * 15
  return (inputCost + outputCost) / 1_000_000
}
|
|
466
|
+
|
|
467
|
+
// --- Main ---
|
|
468
|
+
|
|
469
|
+
/**
 * CLI entry point for the benchmark runner.
 *
 * Loads a feature spec (path from `process.argv[2]`, or the bundled
 * schema-validator-playground spec), runs the spec, schemas, build and audit
 * stages, writes the result to `benchmark/results/<feature_id>-live.json`,
 * and prints a summary. Exits with code 1 when the spec file does not exist.
 */
async function main() {
  const specPath = process.argv[2] || 'benchmark/feature-specs/schema-validator-playground.json'
  // A user-supplied path is relative to the current directory; the built-in
  // default is project-relative, so it is anchored at ROOT to work from any cwd.
  const resolvedPath = process.argv[2] ? path.resolve(specPath) : resolve(specPath)

  if (!fs.existsSync(resolvedPath)) {
    console.error(`Feature spec not found: ${resolvedPath}`)
    process.exit(1)
  }

  console.log('ZenKit Benchmark Runner v0.2')
  console.log('============================\n')

  const startTime = new Date()
  const spec: FeatureSpec = JSON.parse(fs.readFileSync(resolvedPath, 'utf-8'))

  console.log(`Feature: ${spec.name}`)
  console.log(`Mode: ${spec.mode}`)
  console.log(`Criteria: ${spec.acceptance_criteria.length}`)
  console.log(`Expected files: ${spec.expected_files.length}`)
  console.log()

  // Run stages
  const stages: StageResult[] = []

  // 1. Spec validation
  const specStage = runSpecStage(spec, resolvedPath)
  stages.push(specStage)
  console.log(` [${specStage.status}] spec (${specStage.checks_passed}/${specStage.checks_run})`)

  // 2. Schema compilation
  const schemaStage = runSchemaStage()
  stages.push(schemaStage)
  console.log(` [${schemaStage.status}] schemas (${schemaStage.checks_passed}/${schemaStage.checks_run})`)

  // 3. Build verification (expected files)
  const buildStage = runBuildStage(spec)
  stages.push(buildStage)
  console.log(` [${buildStage.status}] build (${buildStage.checks_passed}/${buildStage.checks_run})`)

  // 4. Acceptance criteria audit
  const criteriaResults = spec.acceptance_criteria.map(verifyCriterion)
  const auditStage = runAuditStage(criteriaResults)
  stages.push(auditStage)
  console.log(` [${auditStage.status}] audit (${auditStage.checks_passed}/${auditStage.checks_run})`)

  const endTime = new Date()
  const allPassed = stages.every(s => s.status === 'pass')
  const anyPassed = stages.some(s => s.status === 'pass')
  const estimatedTokens = estimateTokens(spec)

  // Determine actual files found/missing (one existence check per file)
  const filesFound: string[] = []
  const filesMissing: string[] = []
  for (const file of spec.expected_files) {
    if (fileExists(file)) {
      filesFound.push(file)
    } else {
      filesMissing.push(file)
    }
  }

  // Identify example-validation criteria by their verification type rather
  // than by a fixed criterion id, so the flag is correct for every spec.
  const exampleChecks = criteriaResults.filter(c => c.verification_type === 'examples_valid')

  const result: BenchmarkResult = {
    benchmark_id: `bench-${spec.feature_id}-${Date.now()}`,
    version: '0.2.0',
    mode: spec.mode,
    task_name: spec.name,
    feature_spec: specPath,
    started_at: startTime.toISOString(),
    completed_at: endTime.toISOString(),
    duration_ms: endTime.getTime() - startTime.getTime(),
    status: allPassed ? 'pass' : anyPassed ? 'partial' : 'fail',
    expected_files: spec.expected_files,
    files_found: filesFound,
    files_missing: filesMissing,
    acceptance_criteria_results: criteriaResults,
    stages,
    validation_summary: {
      total_criteria: criteriaResults.length,
      criteria_passed: criteriaResults.filter(c => c.status === 'pass').length,
      criteria_failed: criteriaResults.filter(c => c.status === 'fail').length,
      schemas_valid: schemaStage.status === 'pass',
      examples_valid: exampleChecks.length > 0 && exampleChecks.every(c => c.status === 'pass'),
    },
    telemetry: {
      estimated: {
        tokens: estimatedTokens,
        cost_usd: estimateCost(estimatedTokens),
        basis: 'Heuristic: 5000 base + 2500 per criterion, scaled by complexity',
      },
      actual: null,
    },
    uncertainty: [
      'Token and cost figures are estimates — no actual API telemetry is captured by this runner',
      'Acceptance criteria verify code structure and schema validity, not runtime UI behavior',
      'Stage durations reflect verification time, not original implementation time',
    ],
    limitations: spec.limitations,
  }

  // Write result
  const resultPath = resolve(`benchmark/results/${spec.feature_id}-live.json`)
  fs.mkdirSync(path.dirname(resultPath), { recursive: true })
  fs.writeFileSync(resultPath, JSON.stringify(result, null, 2))

  // Summary
  const totalChecks = stages.reduce((sum, s) => sum + s.checks_run, 0)
  const totalPassed = stages.reduce((sum, s) => sum + s.checks_passed, 0)

  console.log(`\n${'='.repeat(50)}`)
  console.log(`Status: ${result.status.toUpperCase()}`)
  console.log(`Checks: ${totalPassed}/${totalChecks} passed`)
  console.log(`Criteria: ${result.validation_summary.criteria_passed}/${result.validation_summary.total_criteria} passed`)
  console.log(`Files: ${filesFound.length}/${spec.expected_files.length} found`)
  console.log(`Duration: ${result.duration_ms}ms`)
  console.log(`Est tokens: ~${estimatedTokens.toLocaleString()} (estimated)`)
  console.log(`Est cost: ~$${result.telemetry.estimated.cost_usd.toFixed(2)} (estimated)`)
  console.log(`Result: ${resultPath}`)

  if (filesMissing.length > 0) {
    console.log(`\nMissing files:`)
    filesMissing.forEach(f => console.log(` - ${f}`))
  }

  const failedCriteria = criteriaResults.filter(c => c.status === 'fail')
  if (failedCriteria.length > 0) {
    console.log(`\nFailed criteria:`)
    failedCriteria.forEach(c => console.log(` - ${c.id}: ${c.evidence}`))
  }
}
|
|
591
|
+
|
|
592
|
+
// Start the runner; any unhandled failure is logged and turns into exit code 1.
main().catch((failure) => {
  console.error('Benchmark failed:', failure)
  process.exit(1)
})
|
|
@@ -0,0 +1,120 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* ZenKit Benchmark Visualizer
|
|
3
|
+
*
|
|
4
|
+
* Generates Mermaid diagram syntax from a benchmark result or summary.
|
|
5
|
+
*
|
|
6
|
+
* Usage:
|
|
7
|
+
* npx tsx benchmark/scripts/visualize.ts [result-path]
|
|
8
|
+
* npx tsx benchmark/scripts/visualize.ts --summary
|
|
9
|
+
*/
|
|
10
|
+
import fs from 'fs'
|
|
11
|
+
import path from 'path'
|
|
12
|
+
|
|
13
|
+
// Base directory for result files: two levels above this script's directory.
const ROOT = path.resolve(__dirname, '../..')
|
|
14
|
+
|
|
15
|
+
/**
 * Renders the stages of one benchmark result as a left-to-right Mermaid graph.
 *
 * Each stage becomes a node labelled `<name>\n<passed>/<run>`; passing stages
 * are rectangles styled green, all others are hexagons styled red, and
 * consecutive stages are linked by an edge.
 *
 * @param resultPath Path to a benchmark result JSON file.
 * @returns Mermaid source text.
 * @throws If the file cannot be read/parsed or has no `stages` array.
 */
function visualizeResult(resultPath: string): string {
  const r = JSON.parse(fs.readFileSync(resultPath, 'utf-8'))
  if (!Array.isArray(r.stages)) {
    throw new Error(`${resultPath} does not contain a "stages" array`)
  }
  const stages: any[] = r.stages

  // Node ids are the stage name reduced to letters. Names that reduce to the
  // same (or an empty) string get a numeric suffix so that every stage keeps
  // its own node instead of merging with another one.
  const taken = new Set<string>()
  const ids = stages.map((s, i) => {
    const base = String(s.name).replace(/[^a-zA-Z]/g, '') || 'stage'
    let id = base
    for (let suffix = i; taken.has(id); suffix++) {
      id = `${base}${suffix}`
    }
    taken.add(id)
    return id
  })

  const lines: string[] = ['graph LR']

  // Stages as nodes
  stages.forEach((s, i) => {
    const id = ids[i]
    const label = `${s.name}\\n${s.checks_passed}/${s.checks_run}`
    lines.push(s.status === 'pass' ? ` ${id}[${label}]` : ` ${id}{{${label}}}`)
    if (i > 0) {
      lines.push(` ${ids[i - 1]} --> ${id}`)
    }
  })

  // Style pass/fail
  lines.push('')
  stages.forEach((s, i) => {
    lines.push(
      s.status === 'pass'
        ? ` style ${ids[i]} fill:#064e3b,stroke:#059669,color:#d1fae5`
        : ` style ${ids[i]} fill:#7f1d1d,stroke:#dc2626,color:#fecaca`,
    )
  })

  return lines.join('\n')
}
|
|
48
|
+
|
|
49
|
+
/**
 * Renders `benchmark/results/summary.json` as a top-down Mermaid graph.
 *
 * Every entry in the summary's `results` becomes a node (`spec0`, `spec1`, …)
 * labelled with its spec name, criteria and checks, linked to a central
 * `summary` node showing `<passed>/<total> passed`. Passing specs are styled
 * green, all others red. Exits the process with code 1 when the summary file
 * does not exist.
 *
 * @returns Mermaid source text.
 */
function visualizeSummary(): string {
  const summaryPath = path.join(ROOT, 'benchmark/results/summary.json')
  if (!fs.existsSync(summaryPath)) {
    console.error('No summary.json found. Run benchmark:all first.')
    process.exit(1)
  }

  const summary = JSON.parse(fs.readFileSync(summaryPath, 'utf-8'))

  // One node per benchmarked spec: rectangle when passing, hexagon otherwise.
  const nodeLines: string[] = summary.results.map((entry: any, index: number) => {
    const readableName = entry.spec.replace('.json', '').replace(/-/g, ' ')
    const label = `${readableName}\\n${entry.criteria} criteria\\n${entry.checks} checks`
    return entry.status === 'pass'
      ? ` spec${index}[${label}]`
      : ` spec${index}{{${label}}}`
  })

  // Connect to a central summary node
  const summaryNode = ` summary((${summary.passed}/${summary.total} passed))`
  const edgeLines: string[] = summary.results.map((_entry: any, index: number) => ` spec${index} --> summary`)

  const styleLines: string[] = summary.results.map((entry: any, index: number) =>
    entry.status === 'pass'
      ? ` style spec${index} fill:#064e3b,stroke:#059669,color:#d1fae5`
      : ` style spec${index} fill:#7f1d1d,stroke:#dc2626,color:#fecaca`,
  )

  return [
    'graph TD',
    ...nodeLines,
    summaryNode,
    ...edgeLines,
    '',
    ...styleLines,
    ` style summary fill:#292824,stroke:#918f7e,color:#ededea`,
  ].join('\n')
}
|
|
90
|
+
|
|
91
|
+
/**
 * CLI entry point for the visualizer.
 *
 * - `--summary` renders `benchmark/results/summary.json`;
 * - any other argument is treated as the path of a result file;
 * - with no argument, `benchmark/results/svp-001-live.json` is used.
 *
 * The Mermaid text is printed and also written to
 * `benchmark/results/workflow-summary.mermaid` (summary mode) or
 * `benchmark/results/workflow.mermaid`. Exits with code 1 when no input
 * file is available.
 */
function main() {
  const arg = process.argv[2]

  let mermaid: string

  if (arg === '--summary') {
    mermaid = visualizeSummary()
  } else if (arg) {
    mermaid = visualizeResult(path.resolve(arg))
  } else {
    // Default: the live result of the svp-001 benchmark (a fixed file name,
    // not the most recently written result).
    const defaultResult = path.join(ROOT, 'benchmark/results/svp-001-live.json')
    if (fs.existsSync(defaultResult)) {
      mermaid = visualizeResult(defaultResult)
    } else {
      console.error('No result file found. Specify a path or run a benchmark first.')
      process.exit(1)
    }
  }

  console.log(mermaid)

  // Also write to file. The results directory may not exist yet when a result
  // path from elsewhere was supplied, so create it first.
  const outDir = path.join(ROOT, 'benchmark/results')
  const outFile = arg === '--summary' ? 'workflow-summary.mermaid' : 'workflow.mermaid'
  fs.mkdirSync(outDir, { recursive: true })
  fs.writeFileSync(path.join(outDir, outFile), mermaid)
  console.log(`\nWritten to: benchmark/results/${outFile}`)
}
|
|
119
|
+
|
|
120
|
+
// Run the visualizer as soon as the script is loaded.
main()
|