zenkit 0.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (84) hide show
  1. package/CONTRIBUTING.md +63 -0
  2. package/LICENSE +21 -0
  3. package/README.md +242 -0
  4. package/agents/backend-architect.md +19 -0
  5. package/agents/frontend-architect.md +19 -0
  6. package/agents/implementation-auditor.md +19 -0
  7. package/agents/product-manager.md +19 -0
  8. package/agents/qa-test-engineer.md +19 -0
  9. package/agents/security-specialist.md +19 -0
  10. package/agents/system-architect.md +19 -0
  11. package/agents/technical-writer.md +19 -0
  12. package/agents/ux-engineer.md +19 -0
  13. package/benchmark/feature-specs/cli-tool.json +58 -0
  14. package/benchmark/feature-specs/handoff-system.json +69 -0
  15. package/benchmark/feature-specs/protocol-completeness.json +85 -0
  16. package/benchmark/feature-specs/schema-validator-baseline.json +93 -0
  17. package/benchmark/feature-specs/schema-validator-playground.json +92 -0
  18. package/benchmark/feature-specs/self-audit.json +76 -0
  19. package/benchmark/fixtures/valid-handoff.json +13 -0
  20. package/benchmark/scripts/compare.ts +172 -0
  21. package/benchmark/scripts/report.ts +102 -0
  22. package/benchmark/scripts/run-all.ts +125 -0
  23. package/benchmark/scripts/run.ts +595 -0
  24. package/benchmark/scripts/visualize.ts +120 -0
  25. package/bin/zenkit.js +24 -0
  26. package/commands/audit.md +28 -0
  27. package/commands/build.md +26 -0
  28. package/commands/checkpoint.md +28 -0
  29. package/commands/handoff.md +28 -0
  30. package/commands/plan.md +27 -0
  31. package/commands/refactor.md +27 -0
  32. package/commands/ship.md +28 -0
  33. package/commands/spec.md +26 -0
  34. package/dist/cli.d.ts +2 -0
  35. package/dist/cli.d.ts.map +1 -0
  36. package/dist/cli.js +174 -0
  37. package/dist/cli.js.map +1 -0
  38. package/dist/index.d.ts +765 -0
  39. package/dist/index.d.ts.map +1 -0
  40. package/dist/index.js +121 -0
  41. package/dist/index.js.map +1 -0
  42. package/dist/schemas/audit.schema.json +63 -0
  43. package/dist/schemas/benchmark.schema.json +118 -0
  44. package/dist/schemas/checkpoint.schema.json +64 -0
  45. package/dist/schemas/feature-spec.schema.json +76 -0
  46. package/dist/schemas/handoff.schema.json +78 -0
  47. package/dist/schemas/schemas/audit.schema.json +63 -0
  48. package/dist/schemas/schemas/benchmark.schema.json +118 -0
  49. package/dist/schemas/schemas/checkpoint.schema.json +64 -0
  50. package/dist/schemas/schemas/feature-spec.schema.json +76 -0
  51. package/dist/schemas/schemas/handoff.schema.json +78 -0
  52. package/dist/schemas/schemas/task.schema.json +69 -0
  53. package/dist/schemas/task.schema.json +69 -0
  54. package/docs/agent-contract.md +36 -0
  55. package/docs/architecture.md +88 -0
  56. package/docs/benchmarking.md +51 -0
  57. package/docs/command-model.md +43 -0
  58. package/docs/philosophy.md +35 -0
  59. package/docs/roadmap.md +43 -0
  60. package/docs/self-audit.md +29 -0
  61. package/hooks/post-change.md +30 -0
  62. package/hooks/pre-change.md +27 -0
  63. package/hooks/pre-ship.md +30 -0
  64. package/package.json +92 -0
  65. package/rubrics/architectural-alignment.md +26 -0
  66. package/rubrics/execution-quality.md +26 -0
  67. package/rubrics/verbosity-score.md +26 -0
  68. package/schemas/audit.schema.json +63 -0
  69. package/schemas/benchmark.schema.json +118 -0
  70. package/schemas/checkpoint.schema.json +64 -0
  71. package/schemas/feature-spec.schema.json +76 -0
  72. package/schemas/handoff.schema.json +78 -0
  73. package/schemas/task.schema.json +69 -0
  74. package/skills/architecture-review.md +17 -0
  75. package/skills/backend-change.md +17 -0
  76. package/skills/bug-triage.md +17 -0
  77. package/skills/frontend-change.md +17 -0
  78. package/skills/prompt-pruning.md +17 -0
  79. package/skills/release-check.md +17 -0
  80. package/skills/security-review.md +17 -0
  81. package/templates/agent.template.md +18 -0
  82. package/templates/command.template.md +21 -0
  83. package/templates/skill.template.md +15 -0
  84. package/templates/task.template.md +19 -0
@@ -0,0 +1,595 @@
1
+ /**
2
+ * ZenKit Benchmark Runner
3
+ *
4
+ * Verifies acceptance criteria from a feature spec against the actual
5
+ * implementation. Produces structured results distinguishing validated
6
+ * checks from estimated/illustrative data.
7
+ *
8
+ * Usage: npx tsx benchmark/scripts/run.ts [feature-spec-path]
9
+ */
10
+ import fs from 'fs'
11
+ import path from 'path'
12
+ import Ajv from 'ajv'
13
+ import addFormats from 'ajv-formats'
14
+
15
+ // --- Types ---
16
+
17
/**
 * Describes how one acceptance criterion is checked.
 *
 * `type` selects the check performed by `verifyCriterion`; the optional
 * fields are the inputs that the selected check reads.
 */
interface Verification {
  /** Which check to run. */
  type:
    | 'file_exists'
    | 'file_contains'
    | 'schema_count'
    | 'examples_valid'
    | 'schemas_consistent'
    | 'test_passes'
    | 'json_path_equals'
  /** File to inspect, resolved against the repository root (file_exists, file_contains, json_path_equals). */
  path?: string
  /** Literal substring that must occur in the file (file_contains). */
  pattern?: string
  /** Number of `*.schema.json` files expected in `schemas/` (schema_count). */
  expected?: number
  /** Shell command to execute; the runner falls back to `npm test` (test_passes). */
  command?: string
  /** Dot-separated property path into the parsed JSON file (json_path_equals). */
  json_path?: string
  /** Value the property at `json_path` must equal (json_path_equals). */
  equals?: unknown
}
26
+
27
/** One acceptance criterion of a feature spec together with its check. */
interface AcceptanceCriterion {
  /** Identifier copied into the result and printed in the audit details. */
  id: string
  /** Human-readable statement of the criterion. */
  description: string
  /** How the runner verifies the criterion. */
  verification: Verification
}
32
+
33
/** Shape of a feature-spec JSON file as consumed by this runner. */
interface FeatureSpec {
  /** Used to build the benchmark id and the result file name. */
  feature_id: string
  /** Display name; the spec stage requires it to be non-empty. */
  name: string
  description: string
  /** Copied verbatim into the benchmark result. */
  mode: 'zenkit' | 'baseline'
  /** Criteria verified during the audit stage. */
  acceptance_criteria: AcceptanceCriterion[]
  constraints: string[]
  /** Paths (relative to the repository root) that the build stage expects to exist. */
  expected_files: string[]
  assigned_commands: string[]
  /** 'high' and 'medium' scale the token estimate; any other value uses the base multiplier. */
  estimated_complexity: string
  /** Declared scope limitations; the spec stage requires at least one. */
  limitations: string[]
}
45
+
46
/** Outcome of verifying a single acceptance criterion. */
interface CriterionResult {
  /** Id of the criterion this result belongs to. */
  id: string
  /** Description copied from the criterion. */
  description: string
  status: 'pass' | 'fail'
  /** Short explanation of what was observed. */
  evidence: string
  /** The verification type that was run, or 'unknown' for unrecognised types. */
  verification_type: string
}
53
+
54
/** Aggregated outcome of one runner stage (spec, schemas, build, audit). */
interface StageResult {
  /** Stage name as printed in the console summary. */
  name: string
  status: 'pass' | 'fail' | 'skipped'
  /** Wall-clock time spent inside the stage function. */
  duration_ms: number
  /** Number of individual checks the stage counted. */
  checks_run: number
  /** Number of those checks that passed. */
  checks_passed: number
  /** One human-readable line per check. */
  details: string[]
}
62
+
63
/** Document written to `benchmark/results/<feature_id>-live.json` after a run. */
interface BenchmarkResult {
  benchmark_id: string
  version: string
  mode: 'zenkit' | 'baseline'
  task_name: string
  /** Path of the feature spec as given on the command line (or the default). */
  feature_spec: string
  /** ISO-8601 timestamps bracketing the run. */
  started_at: string
  completed_at: string
  duration_ms: number
  /** 'pass' when every stage passed, 'partial' when at least one did, otherwise 'fail'. */
  status: 'pass' | 'fail' | 'partial'
  expected_files: string[]
  files_found: string[]
  files_missing: string[]
  acceptance_criteria_results: CriterionResult[]
  stages: StageResult[]
  validation_summary: {
    total_criteria: number
    criteria_passed: number
    criteria_failed: number
    schemas_valid: boolean
    examples_valid: boolean
  }
  telemetry: {
    /** Heuristic figures; `basis` explains how they were derived. */
    estimated: {
      tokens: number
      cost_usd: number
      basis: string
    }
    /** Measured figures; this runner always writes `null`. */
    actual: null | {
      tokens: number
      cost_usd: number
    }
  }
  uncertainty: string[]
  limitations: string[]
}
99
+
100
+ // --- Helpers ---
101
+
102
/** Repository root: two directory levels above this script's directory. */
const ROOT = path.resolve(__dirname, '..', '..')

/**
 * Turns a repository-relative path into an absolute one.
 *
 * @param p Path relative to `ROOT` (an absolute path is returned as-is by `path.resolve`).
 * @returns The absolute path.
 */
function resolve(p: string): string {
  const absolute = path.resolve(ROOT, p)
  return absolute
}
107
+
108
/**
 * Reports whether a repository-relative path exists on disk.
 *
 * @param p Path relative to the repository root.
 * @returns `true` when the resolved path exists.
 */
function fileExists(p: string): boolean {
  const absolute = resolve(p)
  return fs.existsSync(absolute)
}
111
+
112
/**
 * Reports whether a file contains a literal substring.
 *
 * @param p Path relative to the repository root.
 * @param pattern Substring to look for (plain text, not a regular expression).
 * @returns `false` when the file does not exist or the substring is absent.
 */
function fileContains(p: string, pattern: string): boolean {
  const absolute = resolve(p)
  if (!fs.existsSync(absolute)) {
    return false
  }
  const text = fs.readFileSync(absolute, 'utf-8')
  return text.includes(pattern)
}
117
+
118
/**
 * Compiles every `*.schema.json` file in `schemas/` with Ajv.
 *
 * Each schema is compiled with its own Ajv instance. A file that cannot be
 * read, parsed as JSON, or compiled is recorded in `errors` instead of
 * aborting the run, so one broken schema does not hide the others.
 *
 * @returns `count` of schema files found, the per-file `errors`, and
 *          `valid` (true when no file produced an error).
 * @throws If the `schemas/` directory itself cannot be listed.
 */
function compileAllSchemas(): { valid: boolean; count: number; errors: string[] } {
  const schemasDir = resolve('schemas')
  const files = fs.readdirSync(schemasDir).filter(f => f.endsWith('.schema.json'))
  const errors: string[] = []

  for (const file of files) {
    const localAjv = new Ajv({ allErrors: true, strict: false })
    addFormats(localAjv)
    try {
      // Reading and parsing happen inside the try so malformed JSON is reported per file.
      const schema = JSON.parse(fs.readFileSync(path.join(schemasDir, file), 'utf-8'))
      localAjv.compile(schema)
    } catch (err) {
      errors.push(`${file}: ${err}`)
    }
  }

  return { valid: errors.length === 0, count: files.length, errors }
}
136
+
137
/**
 * Checks that every `*.schema.json` file in `schemas/` declares the same
 * `$schema` draft.
 *
 * A schema that omits `$schema` counts as an inconsistency, because the
 * success message claims that all schemas use the reported draft.
 *
 * @returns `consistent` plus a one-line `details` message naming the shared
 *          draft or describing what differs.
 * @throws If the directory cannot be listed or a schema file is not valid JSON.
 */
function checkSchemasConsistent(): { consistent: boolean; details: string } {
  const schemasDir = resolve('schemas')
  const files = fs.readdirSync(schemasDir).filter(f => f.endsWith('.schema.json'))
  const drafts = new Set<string>()
  const undeclared: string[] = []

  for (const file of files) {
    const schema = JSON.parse(fs.readFileSync(path.join(schemasDir, file), 'utf-8'))
    if (schema.$schema) drafts.add(schema.$schema)
    else undeclared.push(file)
  }

  if (files.length === 0) {
    return { consistent: false, details: 'No schema files found' }
  }

  if (drafts.size === 1 && undeclared.length === 0) {
    return { consistent: true, details: `All ${files.length} schemas use ${[...drafts][0]}` }
  }

  const problems: string[] = []
  if (drafts.size > 1) problems.push(`Inconsistent drafts: ${[...drafts].join(', ')}`)
  if (undeclared.length > 0) problems.push(`Missing $schema: ${undeclared.join(', ')}`)

  return { consistent: false, details: problems.join('; ') }
}
155
+
156
/**
 * Validates the example fixtures and the schema registry source file.
 *
 * Two independent checks are performed, each only when its input exists:
 *  1. every `*.json` file in `benchmark/fixtures` is validated against
 *     `schemas/handoff.schema.json`;
 *  2. `src/lib/schemas.ts` is searched for a `<name>:` entry for each of the
 *     expected schema names.
 *
 * @returns `valid` (false as soon as any check fails) and one `details`
 *          line per fixture / schema name examined.
 * @throws If the handoff schema cannot be read, parsed or compiled.
 */
function checkExamplesValid(): { valid: boolean; details: string[] } {
  const schemasDir = resolve('schemas')
  // Names that src/lib/schemas.ts is expected to register.
  const expectedNames = ['handoff', 'task', 'audit', 'checkpoint', 'benchmark']

  const details: string[] = []
  let allValid = true

  const fixtureDir = resolve('benchmark/fixtures')
  if (fs.existsSync(fixtureDir)) {
    const fixtures = fs.readdirSync(fixtureDir).filter(f => f.endsWith('.json'))
    if (fixtures.length > 0) {
      // All fixtures are checked against the handoff schema, so compile it once.
      const localAjv = new Ajv({ allErrors: true, strict: false })
      addFormats(localAjv)
      const schema = JSON.parse(fs.readFileSync(path.join(schemasDir, 'handoff.schema.json'), 'utf-8'))
      const validate = localAjv.compile(schema)

      for (const fixture of fixtures) {
        let data: unknown
        try {
          data = JSON.parse(fs.readFileSync(path.join(fixtureDir, fixture), 'utf-8'))
        } catch (err) {
          // An unreadable fixture is a failed example, not a reason to abort the run.
          details.push(`${fixture}: INVALID — ${err}`)
          allValid = false
          continue
        }

        if (validate(data)) {
          details.push(`${fixture}: valid against handoff.schema.json`)
        } else {
          details.push(`${fixture}: INVALID — ${validate.errors?.map(e => e.message).join(', ')}`)
          allValid = false
        }
      }
    }
  }

  // Also check that the schema library file registers all schemas
  const schemasTs = resolve('src/lib/schemas.ts')
  if (fs.existsSync(schemasTs)) {
    const content = fs.readFileSync(schemasTs, 'utf-8')
    for (const name of expectedNames) {
      if (content.includes(`${name}:`)) {
        details.push(`schemas.ts registers '${name}'`)
      } else {
        details.push(`schemas.ts MISSING registration for '${name}'`)
        allValid = false
      }
    }
  }

  return { valid: allValid, details }
}
208
+
209
+ // --- Criterion Verification ---
210
+
211
/**
 * Runs the check described by `criterion.verification` and reports the outcome.
 *
 * The function never throws for a malformed criterion: a verification that
 * lacks a field its type needs is reported as a failure whose evidence names
 * the missing field.
 *
 * @param criterion The acceptance criterion to verify.
 * @returns A result carrying the criterion's id and description, a
 *          pass/fail status, and a short evidence string.
 */
function verifyCriterion(criterion: AcceptanceCriterion): CriterionResult {
  const { verification } = criterion

  // Builds a result for this criterion; keeps the branches below short.
  const outcome = (
    status: 'pass' | 'fail',
    evidence: string,
    verificationType: string,
  ): CriterionResult => ({
    id: criterion.id,
    description: criterion.description,
    status,
    evidence,
    verification_type: verificationType,
  })

  // Failure used when the spec omits a field the selected check depends on.
  const missing = (field: string, verificationType: string): CriterionResult =>
    outcome('fail', `Missing '${field}' for ${verificationType} verification`, verificationType)

  switch (verification.type) {
    case 'file_exists': {
      const target = verification.path
      if (target === undefined) return missing('path', 'file_exists')
      const exists = fileExists(target)
      return outcome(
        exists ? 'pass' : 'fail',
        exists ? `${target} exists` : `${target} not found`,
        'file_exists',
      )
    }

    case 'file_contains': {
      const target = verification.path
      const pattern = verification.pattern
      if (target === undefined) return missing('path', 'file_contains')
      if (pattern === undefined) return missing('pattern', 'file_contains')
      const found = fileContains(target, pattern)
      return outcome(
        found ? 'pass' : 'fail',
        found ? `${target} contains '${pattern}'` : `${target} does not contain '${pattern}'`,
        'file_contains',
      )
    }

    case 'schema_count': {
      if (verification.expected === undefined) return missing('expected', 'schema_count')
      const result = compileAllSchemas()
      // Only the file count decides pass/fail; compilation errors are reported as evidence.
      const pass = result.count === verification.expected
      return outcome(
        pass ? 'pass' : 'fail',
        `${result.count} schemas found (expected ${verification.expected}), ${result.errors.length} compilation errors`,
        'schema_count',
      )
    }

    case 'examples_valid': {
      const result = checkExamplesValid()
      return outcome(result.valid ? 'pass' : 'fail', result.details.join('; '), 'examples_valid')
    }

    case 'schemas_consistent': {
      const result = checkSchemasConsistent()
      return outcome(result.consistent ? 'pass' : 'fail', result.details, 'schemas_consistent')
    }

    case 'test_passes': {
      // An empty command is treated like an absent one, hence `||` rather than `??`.
      const cmd = verification.command || 'npm test'
      try {
        // NOTE(review): the command string comes from the feature-spec file and is run
        // through the shell; only run specs from sources you trust.
        const { execSync } = require('child_process')
        execSync(cmd, { cwd: ROOT, encoding: 'utf-8', timeout: 60000, stdio: 'pipe' })
        return outcome('pass', `Command '${cmd}' exited with code 0`, 'test_passes')
      } catch (err: unknown) {
        // execSync attaches the exit status to the thrown error; it is null when
        // the process was killed (for example by the timeout).
        const exitCode =
          typeof err === 'object' && err !== null && 'status' in err
            ? (err as { status?: number | null }).status
            : undefined
        return outcome(
          'fail',
          `Command '${cmd}' failed with exit code ${exitCode ?? 'unknown'}`,
          'test_passes',
        )
      }
    }

    case 'json_path_equals': {
      const filePath = verification.path
      const jsonPath = verification.json_path
      const expectedValue = verification.equals
      if (filePath === undefined) return missing('path', 'json_path_equals')
      if (jsonPath === undefined) return missing('json_path', 'json_path_equals')
      // Without an expected value, two `undefined`s would compare equal and pass vacuously.
      if (expectedValue === undefined) return missing('equals', 'json_path_equals')

      if (!fileExists(filePath)) {
        return outcome('fail', `File not found: ${filePath}`, 'json_path_equals')
      }

      try {
        const data: unknown = JSON.parse(fs.readFileSync(resolve(filePath), 'utf-8'))

        // Simple dot-path traversal. `reached` records whether every segment
        // could be followed, so a null or absent intermediate value cannot be
        // mistaken for the value at the end of the path.
        let current: unknown = data
        let reached = true
        for (const part of jsonPath.split('.')) {
          if (current === undefined || current === null) {
            reached = false
            break
          }
          current = (current as Record<string, unknown>)[part]
        }
        // Parsed JSON never contains `undefined`, so it always means "absent".
        if (current === undefined) reached = false

        if (!reached) {
          return outcome(
            'fail',
            `${filePath}:${jsonPath} not found, expected ${JSON.stringify(expectedValue)}`,
            'json_path_equals',
          )
        }

        const match = JSON.stringify(current) === JSON.stringify(expectedValue)
        return outcome(
          match ? 'pass' : 'fail',
          match
            ? `${filePath}:${jsonPath} equals ${JSON.stringify(expectedValue)}`
            : `${filePath}:${jsonPath} is ${JSON.stringify(current)}, expected ${JSON.stringify(expectedValue)}`,
          'json_path_equals',
        )
      } catch (err) {
        return outcome('fail', `Error reading ${filePath}: ${err}`, 'json_path_equals')
      }
    }

    default:
      // Reached only when the spec file contains a type outside the declared union.
      return outcome('fail', `Unknown verification type: ${verification.type}`, 'unknown')
  }
}
349
+
350
+ // --- Stage Runners ---
351
+
352
/**
 * Spec stage: sanity-checks the feature spec itself.
 *
 * Checks performed:
 *  - the raw spec file validates against `schemas/feature-spec.schema.json`
 *    (skipped, and not counted, when that schema file does not exist);
 *  - `name` is a non-empty string;
 *  - at least one acceptance criterion is defined;
 *  - at least one limitation is declared.
 *
 * @param spec The parsed feature spec.
 * @param specPath Path of the spec file; it is re-read so that the schema
 *                 check sees the raw document.
 * @returns The stage result; `status` is 'pass' only when every counted check passed.
 */
function runSpecStage(spec: FeatureSpec, specPath: string): StageResult {
  const start = Date.now()
  const checks: string[] = []
  let passed = 0
  let total = 0

  // Validate spec against feature-spec.schema.json
  const specSchemaPath = resolve('schemas/feature-spec.schema.json')
  if (fs.existsSync(specSchemaPath)) {
    total++
    const localAjv = new Ajv({ allErrors: true, strict: false })
    addFormats(localAjv)
    const specSchema = JSON.parse(fs.readFileSync(specSchemaPath, 'utf-8'))
    const validate = localAjv.compile(specSchema)
    const specData = JSON.parse(fs.readFileSync(path.resolve(specPath), 'utf-8'))
    if (validate(specData)) {
      passed++
      checks.push('spec validates against feature-spec.schema.json')
    } else {
      checks.push(`FAIL: spec schema validation — ${validate.errors?.map(e => e.message).join(', ')}`)
    }
  } else {
    // A skipped check is neither run nor failed, so it must not count toward the total.
    checks.push('SKIP: feature-spec.schema.json not found')
  }

  // The spec is unvalidated JSON at this point, so guard the field shapes
  // instead of trusting the static type.
  total++
  if (typeof spec.name === 'string' && spec.name.length > 0) {
    passed++
    checks.push('name present')
  } else {
    checks.push('FAIL: name empty')
  }

  total++
  if (Array.isArray(spec.acceptance_criteria) && spec.acceptance_criteria.length > 0) {
    passed++
    checks.push(`${spec.acceptance_criteria.length} acceptance criteria defined`)
  } else {
    checks.push('FAIL: no acceptance criteria')
  }

  total++
  if (Array.isArray(spec.limitations) && spec.limitations.length > 0) {
    passed++
    checks.push(`${spec.limitations.length} limitations declared`)
  } else {
    checks.push('FAIL: no limitations declared — specs should be honest about scope')
  }

  return {
    name: 'spec',
    status: passed === total ? 'pass' : 'fail',
    duration_ms: Date.now() - start,
    checks_run: total,
    checks_passed: passed,
    details: checks,
  }
}
393
+
394
/**
 * Build stage: confirms that every file listed in `spec.expected_files` exists.
 *
 * @param spec The parsed feature spec.
 * @returns The stage result with one detail line per expected file; the
 *          stage passes only when all of them were found.
 */
function runBuildStage(spec: FeatureSpec): StageResult {
  const startedAt = Date.now()
  const expected = spec.expected_files

  // Probe each file once, then derive both the messages and the tally.
  const present = expected.map(file => fileExists(file))
  const details = expected.map((file, index) =>
    present[index] ? `${file} exists` : `FAIL: ${file} not found`,
  )
  const foundCount = present.filter(Boolean).length

  return {
    name: 'build',
    status: foundCount === expected.length ? 'pass' : 'fail',
    duration_ms: Date.now() - startedAt,
    checks_run: expected.length,
    checks_passed: foundCount,
    details,
  }
}
417
+
418
/**
 * Audit stage: summarises already-computed acceptance-criterion results.
 *
 * @param criteriaResults Results produced by `verifyCriterion`.
 * @returns The stage result; it passes only when every criterion passed
 *          (an empty list therefore passes), and `details` holds one
 *          `[STATUS] id: evidence` line per criterion.
 */
function runAuditStage(criteriaResults: CriterionResult[]): StageResult {
  const start = Date.now()
  const passed = criteriaResults.filter(c => c.status === 'pass').length
  const total = criteriaResults.length

  return {
    name: 'audit',
    // Anything short of a full pass is a failure; there is no partial state for a stage.
    status: passed === total ? 'pass' : 'fail',
    duration_ms: Date.now() - start,
    checks_run: total,
    checks_passed: passed,
    details: criteriaResults.map(c => `[${c.status.toUpperCase()}] ${c.id}: ${c.evidence}`),
  }
}
432
+
433
/**
 * Schema stage: compiles all schemas and checks that they share one draft.
 *
 * Each schema file counts as one check and the draft-consistency test as
 * one more.
 *
 * @returns The stage result; it passes only when every schema compiled and
 *          the drafts are consistent.
 */
function runSchemaStage(): StageResult {
  const startedAt = Date.now()
  const compilation = compileAllSchemas()
  const consistency = checkSchemasConsistent()

  const compiledOk = compilation.count - compilation.errors.length
  const consistencyPoint = consistency.consistent ? 1 : 0
  const stagePassed = compilation.valid && consistency.consistent

  const details: string[] = [
    `${compilation.count} schemas compiled, ${compilation.errors.length} errors`,
    consistency.details,
  ]
  for (const message of compilation.errors) {
    details.push(message)
  }

  return {
    name: 'schemas',
    status: stagePassed ? 'pass' : 'fail',
    duration_ms: Date.now() - startedAt,
    checks_run: compilation.count + 1,
    checks_passed: compiledOk + consistencyPoint,
    details,
  }
}
451
+
452
+ // --- Telemetry ---
453
+
454
/**
 * Heuristic token estimate for implementing a feature spec.
 *
 * Formula: (5000 + 2500 per acceptance criterion) scaled by 2.0 for 'high'
 * complexity, 1.5 for 'medium', and 1.0 for anything else, rounded to the
 * nearest integer.
 *
 * @param spec The parsed feature spec.
 * @returns The estimated token count.
 */
function estimateTokens(spec: FeatureSpec): number {
  const baseTokens = 5000
  const tokensPerCriterion = 2500

  let scale: number
  switch (spec.estimated_complexity) {
    case 'high':
      scale = 2.0
      break
    case 'medium':
      scale = 1.5
      break
    default:
      scale = 1.0
  }

  const unscaled = baseTokens + spec.acceptance_criteria.length * tokensPerCriterion
  return Math.round(unscaled * scale)
}
461
+
462
/**
 * Converts a token count into an estimated cost in US dollars.
 *
 * Assumes 60% of the tokens are input priced at $3 per million and 40% are
 * output priced at $15 per million.
 *
 * @param tokens Total token count.
 * @returns Estimated cost in USD.
 */
function estimateCost(tokens: number): number {
  const inputPortion = tokens * 0.6 * 3
  const outputPortion = tokens * 0.4 * 15
  return (inputPortion + outputPortion) / 1_000_000
}
466
+
467
+ // --- Main ---
468
+
469
/**
 * Entry point: loads a feature spec, runs the four stages (spec, schemas,
 * build, audit), writes the result to
 * `benchmark/results/<feature_id>-live.json`, and prints a summary.
 *
 * The spec path is taken from the first command-line argument and falls
 * back to the schema-validator-playground spec. The process exits with
 * code 1 when the spec file does not exist.
 */
async function main() {
  const specPath = process.argv[2] || 'benchmark/feature-specs/schema-validator-playground.json'
  const resolvedPath = path.resolve(specPath)

  if (!fs.existsSync(resolvedPath)) {
    console.error(`Feature spec not found: ${resolvedPath}`)
    process.exit(1)
  }

  console.log('ZenKit Benchmark Runner v0.2')
  console.log('============================\n')

  const startTime = new Date()
  const spec: FeatureSpec = JSON.parse(fs.readFileSync(resolvedPath, 'utf-8'))

  console.log(`Feature: ${spec.name}`)
  console.log(`Mode: ${spec.mode}`)
  console.log(`Criteria: ${spec.acceptance_criteria.length}`)
  console.log(`Expected files: ${spec.expected_files.length}`)
  console.log()

  // Run stages
  const stages: StageResult[] = []

  // 1. Spec validation
  const specStage = runSpecStage(spec, resolvedPath)
  stages.push(specStage)
  console.log(` [${specStage.status}] spec (${specStage.checks_passed}/${specStage.checks_run})`)

  // 2. Schema compilation
  const schemaStage = runSchemaStage()
  stages.push(schemaStage)
  console.log(` [${schemaStage.status}] schemas (${schemaStage.checks_passed}/${schemaStage.checks_run})`)

  // 3. Build verification (expected files)
  const buildStage = runBuildStage(spec)
  stages.push(buildStage)
  console.log(` [${buildStage.status}] build (${buildStage.checks_passed}/${buildStage.checks_run})`)

  // 4. Acceptance criteria audit
  const criteriaResults = spec.acceptance_criteria.map(verifyCriterion)
  const auditStage = runAuditStage(criteriaResults)
  stages.push(auditStage)
  console.log(` [${auditStage.status}] audit (${auditStage.checks_passed}/${auditStage.checks_run})`)

  const endTime = new Date()
  const allPassed = stages.every(s => s.status === 'pass')
  const anyPassed = stages.some(s => s.status === 'pass')
  const estimatedTokens = estimateTokens(spec)

  // Partition the expected files in one pass so each path is probed only once.
  const filesFound: string[] = []
  const filesMissing: string[] = []
  for (const file of spec.expected_files) {
    if (fileExists(file)) filesFound.push(file)
    else filesMissing.push(file)
  }

  // Examples are considered valid when the spec contains at least one
  // 'examples_valid' criterion and all such criteria passed. Keying on the
  // verification type keeps this independent of how a spec numbers its criteria.
  const exampleChecks = criteriaResults.filter(c => c.verification_type === 'examples_valid')
  const examplesValid = exampleChecks.length > 0 && exampleChecks.every(c => c.status === 'pass')

  const result: BenchmarkResult = {
    benchmark_id: `bench-${spec.feature_id}-${Date.now()}`,
    version: '0.2.0',
    mode: spec.mode,
    task_name: spec.name,
    feature_spec: specPath,
    started_at: startTime.toISOString(),
    completed_at: endTime.toISOString(),
    duration_ms: endTime.getTime() - startTime.getTime(),
    status: allPassed ? 'pass' : anyPassed ? 'partial' : 'fail',
    expected_files: spec.expected_files,
    files_found: filesFound,
    files_missing: filesMissing,
    acceptance_criteria_results: criteriaResults,
    stages,
    validation_summary: {
      total_criteria: criteriaResults.length,
      criteria_passed: criteriaResults.filter(c => c.status === 'pass').length,
      criteria_failed: criteriaResults.filter(c => c.status === 'fail').length,
      schemas_valid: schemaStage.status === 'pass',
      examples_valid: examplesValid,
    },
    telemetry: {
      estimated: {
        tokens: estimatedTokens,
        cost_usd: estimateCost(estimatedTokens),
        basis: 'Heuristic: 5000 base + 2500 per criterion, scaled by complexity',
      },
      actual: null,
    },
    uncertainty: [
      'Token and cost figures are estimates — no actual API telemetry is captured by this runner',
      'Acceptance criteria verify code structure and schema validity, not runtime UI behavior',
      'Stage durations reflect verification time, not original implementation time',
    ],
    limitations: spec.limitations,
  }

  // Write result
  const resultPath = resolve(`benchmark/results/${spec.feature_id}-live.json`)
  fs.mkdirSync(path.dirname(resultPath), { recursive: true })
  fs.writeFileSync(resultPath, JSON.stringify(result, null, 2))

  // Summary
  const totalChecks = stages.reduce((sum, s) => sum + s.checks_run, 0)
  const totalPassed = stages.reduce((sum, s) => sum + s.checks_passed, 0)

  console.log(`\n${'='.repeat(50)}`)
  console.log(`Status: ${result.status.toUpperCase()}`)
  console.log(`Checks: ${totalPassed}/${totalChecks} passed`)
  console.log(`Criteria: ${result.validation_summary.criteria_passed}/${result.validation_summary.total_criteria} passed`)
  console.log(`Files: ${filesFound.length}/${spec.expected_files.length} found`)
  console.log(`Duration: ${result.duration_ms}ms`)
  console.log(`Est tokens: ~${estimatedTokens.toLocaleString()} (estimated)`)
  console.log(`Est cost: ~$${result.telemetry.estimated.cost_usd.toFixed(2)} (estimated)`)
  console.log(`Result: ${resultPath}`)

  if (filesMissing.length > 0) {
    console.log(`\nMissing files:`)
    filesMissing.forEach(f => console.log(` - ${f}`))
  }

  const failedCriteria = criteriaResults.filter(c => c.status === 'fail')
  if (failedCriteria.length > 0) {
    console.log(`\nFailed criteria:`)
    failedCriteria.forEach(c => console.log(` - ${c.id}: ${c.evidence}`))
  }
}

// Any uncaught error (unreadable spec, missing schemas directory, ...) ends the run with exit code 1.
main().catch(err => {
  console.error('Benchmark failed:', err)
  process.exit(1)
})
@@ -0,0 +1,120 @@
1
+ /**
2
+ * ZenKit Benchmark Visualizer
3
+ *
4
+ * Generates Mermaid diagram syntax from a benchmark result or summary.
5
+ *
6
+ * Usage:
7
+ * npx tsx benchmark/scripts/visualize.ts [result-path]
8
+ * npx tsx benchmark/scripts/visualize.ts --summary
9
+ */
10
+ import fs from 'fs'
11
+ import path from 'path'
12
+
13
/** Repository root: two directory levels above this script's directory. */
const ROOT = path.resolve(__dirname, '..', '..')
14
+
15
/**
 * Renders the stages of one benchmark result as a left-to-right Mermaid graph.
 *
 * Each stage becomes a node labelled `name\npassed/run`; consecutive stages
 * are linked with arrows. Passing stages use a rectangular node with green
 * styling, all others a hexagon node with red styling. Node ids are the
 * stage name with every non-letter removed.
 *
 * @param resultPath Path of a benchmark result JSON file containing a `stages` array.
 * @returns The Mermaid source as a single string.
 */
function visualizeResult(resultPath: string): string {
  interface StageView {
    name: string
    status: string
    checks_passed: number
    checks_run: number
  }

  const parsed = JSON.parse(fs.readFileSync(resultPath, 'utf-8'))
  const stages: StageView[] = parsed.stages
  const toNodeId = (name: string): string => name.replace(/[^a-zA-Z]/g, '')

  const output: string[] = []
  output.push('graph LR')

  // Stages as nodes, each linked to its predecessor
  let previousId: string | null = null
  for (const stage of stages) {
    const nodeId = toNodeId(stage.name)
    const caption = `${stage.name}\\n${stage.checks_passed}/${stage.checks_run}`
    if (stage.status === 'pass') {
      output.push(` ${nodeId}[${caption}]`)
    } else {
      output.push(` ${nodeId}{{${caption}}}`)
    }

    if (previousId !== null) {
      output.push(` ${previousId} --> ${nodeId}`)
    }
    previousId = nodeId
  }

  // Style pass/fail
  output.push('')
  for (const stage of stages) {
    const colours =
      stage.status === 'pass'
        ? 'fill:#064e3b,stroke:#059669,color:#d1fae5'
        : 'fill:#7f1d1d,stroke:#dc2626,color:#fecaca'
    output.push(` style ${toNodeId(stage.name)} ${colours}`)
  }

  return output.join('\n')
}
48
+
49
/**
 * Renders `benchmark/results/summary.json` as a top-down Mermaid graph.
 *
 * Every entry of `results` becomes a node (`spec0`, `spec1`, ...) labelled
 * with the spec name, criteria count and check count; all nodes point at a
 * central `summary` node showing `passed/total`. Exits the process with
 * code 1 when the summary file does not exist.
 *
 * @returns The Mermaid source as a single string.
 */
function visualizeSummary(): string {
  interface SummaryEntry {
    spec: string
    status: string
    criteria: unknown
    checks: unknown
  }

  const summaryPath = path.join(ROOT, 'benchmark/results/summary.json')
  if (!fs.existsSync(summaryPath)) {
    console.error('No summary.json found. Run benchmark:all first.')
    process.exit(1)
  }

  const summary = JSON.parse(fs.readFileSync(summaryPath, 'utf-8'))
  const entries: SummaryEntry[] = summary.results
  const output: string[] = ['graph TD']

  // One node per spec
  entries.forEach((entry, index) => {
    const nodeId = `spec${index}`
    const title = entry.spec.replace('.json', '').replace(/-/g, ' ')
    const caption = `${title}\\n${entry.criteria} criteria\\n${entry.checks} checks`
    if (entry.status === 'pass') {
      output.push(` ${nodeId}[${caption}]`)
    } else {
      output.push(` ${nodeId}{{${caption}}}`)
    }
  })

  // Connect to a central summary node
  output.push(` summary((${summary.passed}/${summary.total} passed))`)
  entries.forEach((_entry, index) => {
    output.push(` spec${index} --> summary`)
  })

  // Style pass/fail, then the summary node
  output.push('')
  entries.forEach((entry, index) => {
    const colours =
      entry.status === 'pass'
        ? 'fill:#064e3b,stroke:#059669,color:#d1fae5'
        : 'fill:#7f1d1d,stroke:#dc2626,color:#fecaca'
    output.push(` style spec${index} ${colours}`)
  })
  output.push(` style summary fill:#292824,stroke:#918f7e,color:#ededea`)

  return output.join('\n')
}
90
+
91
/**
 * Entry point: prints a Mermaid diagram and writes it to `benchmark/results`.
 *
 *  - `--summary` renders `summary.json` into `workflow-summary.mermaid`;
 *  - any other argument is treated as the path of a result file;
 *  - with no argument, `benchmark/results/svp-001-live.json` is used, and
 *    the process exits with code 1 when that file does not exist.
 */
function main() {
  const arg = process.argv[2]

  let mermaid: string

  if (arg === '--summary') {
    mermaid = visualizeSummary()
  } else if (arg) {
    mermaid = visualizeResult(path.resolve(arg))
  } else {
    // Default: the live result of the svp-001 feature spec
    const defaultResult = path.join(ROOT, 'benchmark/results/svp-001-live.json')
    if (fs.existsSync(defaultResult)) {
      mermaid = visualizeResult(defaultResult)
    } else {
      console.error('No result file found. Specify a path or run a benchmark first.')
      process.exit(1)
    }
  }

  console.log(mermaid)

  // Also write to file. The results directory may not exist yet when an
  // explicit result path from elsewhere was given, so create it first.
  const outDir = path.join(ROOT, 'benchmark/results')
  const outFile = arg === '--summary' ? 'workflow-summary.mermaid' : 'workflow.mermaid'
  fs.mkdirSync(outDir, { recursive: true })
  fs.writeFileSync(path.join(outDir, outFile), mermaid)
  console.log(`\nWritten to: benchmark/results/${outFile}`)
}

main()