npm - zenkit - Versions diffs - 0.5.0 - Mend

zenkit 0.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (84) hide show

package/CONTRIBUTING.md +63 -0
package/LICENSE +21 -0
package/README.md +242 -0
package/agents/backend-architect.md +19 -0
package/agents/frontend-architect.md +19 -0
package/agents/implementation-auditor.md +19 -0
package/agents/product-manager.md +19 -0
package/agents/qa-test-engineer.md +19 -0
package/agents/security-specialist.md +19 -0
package/agents/system-architect.md +19 -0
package/agents/technical-writer.md +19 -0
package/agents/ux-engineer.md +19 -0
package/benchmark/feature-specs/cli-tool.json +58 -0
package/benchmark/feature-specs/handoff-system.json +69 -0
package/benchmark/feature-specs/protocol-completeness.json +85 -0
package/benchmark/feature-specs/schema-validator-baseline.json +93 -0
package/benchmark/feature-specs/schema-validator-playground.json +92 -0
package/benchmark/feature-specs/self-audit.json +76 -0
package/benchmark/fixtures/valid-handoff.json +13 -0
package/benchmark/scripts/compare.ts +172 -0
package/benchmark/scripts/report.ts +102 -0
package/benchmark/scripts/run-all.ts +125 -0
package/benchmark/scripts/run.ts +595 -0
package/benchmark/scripts/visualize.ts +120 -0
package/bin/zenkit.js +24 -0
package/commands/audit.md +28 -0
package/commands/build.md +26 -0
package/commands/checkpoint.md +28 -0
package/commands/handoff.md +28 -0
package/commands/plan.md +27 -0
package/commands/refactor.md +27 -0
package/commands/ship.md +28 -0
package/commands/spec.md +26 -0
package/dist/cli.d.ts +2 -0
package/dist/cli.d.ts.map +1 -0
package/dist/cli.js +174 -0
package/dist/cli.js.map +1 -0
package/dist/index.d.ts +765 -0
package/dist/index.d.ts.map +1 -0
package/dist/index.js +121 -0
package/dist/index.js.map +1 -0
package/dist/schemas/audit.schema.json +63 -0
package/dist/schemas/benchmark.schema.json +118 -0
package/dist/schemas/checkpoint.schema.json +64 -0
package/dist/schemas/feature-spec.schema.json +76 -0
package/dist/schemas/handoff.schema.json +78 -0
package/dist/schemas/schemas/audit.schema.json +63 -0
package/dist/schemas/schemas/benchmark.schema.json +118 -0
package/dist/schemas/schemas/checkpoint.schema.json +64 -0
package/dist/schemas/schemas/feature-spec.schema.json +76 -0
package/dist/schemas/schemas/handoff.schema.json +78 -0
package/dist/schemas/schemas/task.schema.json +69 -0
package/dist/schemas/task.schema.json +69 -0
package/docs/agent-contract.md +36 -0
package/docs/architecture.md +88 -0
package/docs/benchmarking.md +51 -0
package/docs/command-model.md +43 -0
package/docs/philosophy.md +35 -0
package/docs/roadmap.md +43 -0
package/docs/self-audit.md +29 -0
package/hooks/post-change.md +30 -0
package/hooks/pre-change.md +27 -0
package/hooks/pre-ship.md +30 -0
package/package.json +92 -0
package/rubrics/architectural-alignment.md +26 -0
package/rubrics/execution-quality.md +26 -0
package/rubrics/verbosity-score.md +26 -0
package/schemas/audit.schema.json +63 -0
package/schemas/benchmark.schema.json +118 -0
package/schemas/checkpoint.schema.json +64 -0
package/schemas/feature-spec.schema.json +76 -0
package/schemas/handoff.schema.json +78 -0
package/schemas/task.schema.json +69 -0
package/skills/architecture-review.md +17 -0
package/skills/backend-change.md +17 -0
package/skills/bug-triage.md +17 -0
package/skills/frontend-change.md +17 -0
package/skills/prompt-pruning.md +17 -0
package/skills/release-check.md +17 -0
package/skills/security-review.md +17 -0
package/templates/agent.template.md +18 -0
package/templates/command.template.md +21 -0
package/templates/skill.template.md +15 -0
package/templates/task.template.md +19 -0

package/docs/command-model.md ADDED Viewed

@@ -0,0 +1,43 @@
+# Command Model
+ZenKit defines eight commands. Each takes structured input and produces structured output per the standard contract.
+## The eight commands
+| Command | Purpose | Position |
+|---------|---------|----------|
+| `/spec` | Convert requirement into structured specification | Entry point |
+| `/plan` | Break spec into tasks, dependencies, risks | After spec |
+| `/build` | Execute plan, produce deliverables | After plan |
+| `/audit` | Validate build against spec and rubrics | After build |
+| `/refactor` | Improve code without behavior change | Any stage (requires green tests) |
+| `/handoff` | Transfer context between agents | Any stage |
+| `/checkpoint` | Snapshot workflow state for resumption | Any stage |
+| `/ship` | Package and release validated output | Terminal (after audit passes) |
+See `commands/*.md` for per-command details and examples.
+## Chaining
+```
+spec --> plan --> build --> audit --+--> ship
+                   ^               |
+                   +-- refactor <--+  (on failure)
+```
+At any point: `checkpoint` snapshots state, `handoff` transfers between agents.
+## Standard output contract
+Every command output includes:
+- **context** — What the agent knew when deciding.
+- **assumptions** — What was assumed but not verified. Each is an explicit risk.
+- **constraints** — Hard limits the agent operated within.
+- **decision** — What was decided and why, including rejected alternatives.
+- **deliverable** — The actual output: code, plan, report, or artifact.
+- **risks** — Specific risks in this output, not general concerns.
+- **open_questions** — What could not be resolved. Must be answered before downstream work proceeds.
+- **next_agent** — Who receives this output next.
+This contract makes agents interchangeable at the interface level. Swap the implementation, keep the contract.

package/docs/philosophy.md ADDED Viewed

@@ -0,0 +1,35 @@
+# ZenKit Design Philosophy
+ZenKit is a lightweight protocol layer for AI-assisted software building. It is not a framework, not a platform, and not an orchestration engine. It is a set of contracts that keep AI agents useful and honest.
+## Thin Over Grand
+ZenKit does the minimum required to make AI-assisted workflows repeatable and auditable. There is no runtime, no daemon, no server. The protocol is a set of JSON schemas, shell commands, and conventions. If you can run a shell script and read JSON, you can use ZenKit.
+Grand architectures create grand maintenance burdens. ZenKit stays thin so teams can adopt it incrementally without rewriting their toolchain.
+## Protocol Over Persona
+AI agents do not need personalities. They need input contracts, output contracts, and clear boundaries. ZenKit defines what an agent receives, what it must produce, and what it must not do. The rest is implementation detail.
+Persona-driven agents drift into unpredictable behavior. Protocol-driven agents produce artifacts you can diff, review, and version.
+## Bounded Autonomy Over Fake Certainty
+An agent that says "I'm 95% confident" without evidence is worse than one that says "I don't know." ZenKit requires agents to declare their assumptions, constraints, and open questions explicitly. Autonomy is bounded by schema validation, not by trust.
+Every command output includes an `open_questions` field. Silence is not confidence -- it is a schema violation.
+## Validation Over Narration
+ZenKit does not care what an agent says it did. It cares what artifacts exist, whether they parse, and whether they satisfy the spec. The audit system checks deliverables against schemas, not against prose descriptions.
+## Low Drift
+Each command produces a structured output that feeds into the next command. The chain is explicit. There is no hidden state, no ambient context that accumulates silently, no memory that decays. If context is needed, it is passed as input.
+## Real Benchmarkability
+If you cannot measure it, you cannot improve it. ZenKit's benchmark system validates actual artifacts -- not claims about artifacts. Telemetry distinguishes between measured values and estimated values. Results are reproducible because inputs and outputs are versioned.
+ZenKit is intentionally small because the problem it solves -- keeping AI agents accountable in a build workflow -- does not require a large solution. It requires a precise one.

package/docs/roadmap.md ADDED Viewed

@@ -0,0 +1,43 @@
+# Roadmap
+## Done in v0.2
+- Acceptance-criteria-driven benchmark runner (23 checks, 8 criteria)
+- Baseline vs zenkit comparison architecture (illustrative data)
+- Self-audit documentation with circular-validation safeguards
+- 72% reduction in protocol artifact size
+- Telemetry honesty: estimated/actual separation with basis field
+- Uncertainty and limitations as first-class result fields
+## Done in v0.3
+- 12 Playwright E2E browser tests
+- 4 benchmark feature specs (35 criteria, 101 checks)
+- ESLint integration (clean)
+- GitHub Actions CI pipeline (lint, test, benchmark, build, E2E)
+- Multi-spec benchmark runner with aggregate summary
+- Landing page: benchmark summary, comparison section, self-audit section
+- `zenkit` CLI tool (validate, benchmark, init, status)
+- CLAUDE.md for Claude Code integration
+- Mermaid workflow visualization
+- README with accurate test counts and full command reference
+## Next priorities
+### Real telemetry adapters
+Provider-specific adapters that capture actual token usage and cost from API responses. Initial targets: Anthropic, OpenAI.
+### A/B workflow comparison
+Execute the same feature spec twice — once with ZenKit structure, once without — and measure drift, retries, and rework. Prerequisite for meaningful (non-illustrative) comparison data.
+### Custom schema extensions
+Allow teams to extend the standard output contract with domain-specific fields without breaking compatibility.
+### npm package publishing
+Publish `zenkit` as an npm package so `zenkit init` and `zenkit validate` work globally without cloning the repo.
+### Interactive workflow visualization
+Render Mermaid diagrams directly in the landing page, with clickable stages showing check details.
+### More verification types
+Add `test_passes` (run a specific test and check exit code), `url_responds` (HTTP health check), and `json_path_equals` (check specific values in JSON files).

package/docs/self-audit.md ADDED Viewed

@@ -0,0 +1,29 @@
+# Self-Audit
+ZenKit can structure audits of its own repository. This is useful but requires explicit safeguards against circular self-certification.
+## What self-audit does
+Using ZenKit commands and schemas to audit ZenKit produces structured artifacts: findings, rubric scores, and uncertainty notes. This is valuable because it:
+- Tests whether ZenKit's own primitives are expressive enough to describe real work
+- Produces concrete evidence of what passes and what does not
+- Forces the same honesty requirements (explicit uncertainty, bounded claims) on ZenKit itself
+## What self-audit does NOT do
+- It does not prove ZenKit is correct. A system auditing itself can only check what it knows to check.
+- It does not replace independent inspection. External review finds the blind spots self-audit cannot.
+- It does not validate the rubrics themselves. If a rubric is poorly designed, self-audit scores are meaningless.
+## Safeguards
+1. **Benchmark checks are verifiable.** The runner checks file existence, pattern matching, schema compilation, and example validation. These are inspectable — run the benchmark yourself and verify.
+2. **Uncertainty is required, not optional.** Every benchmark result must include an `uncertainty` array. Empty uncertainty is a red flag, not a sign of perfection.
+3. **Limitations are inherited from specs.** The feature spec declares what verification cannot cover. These propagate to results.
+4. **Illustrative data is labeled.** Comparison artifacts and example data are explicitly marked as `illustrative` where applicable.
+5. **Telemetry is never fabricated.** Estimated figures state their estimation basis. Actual telemetry is null when not instrumented.
+## The honest framing
+ZenKit structures its own audits, but its claims are only as strong as the checks behind them. The v0.2 benchmark verifies 23 specific checks across 8 acceptance criteria. It does not verify runtime UI behavior, performance under load, or multi-agent workflow fidelity. Those remain unvalidated until specific checks are added.

package/hooks/post-change.md ADDED Viewed

@@ -0,0 +1,30 @@
+# Post-Change Hook
+> When: After code changes are made
+## Purpose
+Validates that the codebase remains healthy after modifications and that changes stay within the planned scope.
+## Trigger
+Fires automatically after code modifications are applied to the working tree. It runs once per step, after all file edits for that step are complete and before the agent reports success.
+## Checks
+- All tests pass (no regressions introduced by the change).
+- No lint errors exist in modified files.
+- Schema validation passes for any modified configuration or data files.
+- Changes match the approved plan scope (no unplanned files modified, no scope creep).
+- No new security warnings introduced (basic static analysis).
+## On Failure
+- The change is flagged as incomplete or non-conforming.
+- The agent receives a detailed report of which checks failed and why.
+- The agent must fix the issues before the change is considered done.
+- If scope drift is detected, the agent must either revert out-of-scope changes or request a plan amendment.
+## Configuration
+- `require_tests_pass`: (boolean, default `true`) Whether all tests must remain green.
+- `require_lint_clean`: (boolean, default `true`) Whether lint must pass on changed files.
+- `require_schema_valid`: (boolean, default `true`) Whether schema validation is enforced.
+- `require_scope_match`: (boolean, default `true`) Whether changes must stay within plan scope.
+- `lint_command`: (string, default `"npm run lint"`) Command used for linting.
+- `test_command`: (string, default `"npm test"`) Command used for running tests.

package/hooks/pre-change.md ADDED Viewed

@@ -0,0 +1,27 @@
+# Pre-Change Hook
+> When: Before any code change begins
+## Purpose
+Validates that all preconditions are met before modifying code, ensuring changes are planned, the environment is stable, and no work is lost.
+## Trigger
+Fires automatically before any code modification is applied to the working tree. This includes new feature work, bug fixes, refactors, and any file edits initiated by an agent.
+## Checks
+- A plan exists and has been approved for the current task.
+- All existing tests pass (green test suite).
+- The working tree is clean (no uncommitted or unstaged changes that could be overwritten).
+- The current branch is up to date with its upstream tracking branch.
+- No conflicting changes are in progress from another agent or task.
+## On Failure
+- The code change is blocked and does not proceed.
+- The failing check is reported with a clear reason and remediation hint.
+- The agent is prompted to resolve the issue (e.g., commit or stash uncommitted work, fix failing tests, create a plan) before retrying.
+## Configuration
+- `require_plan`: (boolean, default `true`) Whether an approved plan must exist.
+- `require_green_tests`: (boolean, default `true`) Whether the full test suite must pass.
+- `require_clean_tree`: (boolean, default `true`) Whether the working tree must have no uncommitted changes.
+- `allowed_dirty_paths`: (list, default `[]`) Paths excluded from the clean-tree check.

package/hooks/pre-ship.md ADDED Viewed

@@ -0,0 +1,30 @@
+# Pre-Ship Hook
+> When: Before shipping or deploying
+## Purpose
+Validates that all quality, security, and documentation gates are satisfied before code leaves the development environment.
+## Trigger
+Fires automatically before any ship or deploy action, including creating a release PR, tagging a release, or triggering a deployment pipeline.
+## Checks
+- All audit findings from the implementation-auditor are addressed or explicitly accepted.
+- All quality gates pass (execution-quality rubric score meets threshold).
+- The changelog is updated with an entry describing the shipped changes.
+- A checkpoint (commit or snapshot) exists that captures the exact state being shipped.
+- No open blocking issues remain on the task.
+- All agent handoffs in the chain have been completed.
+## On Failure
+- The ship action is blocked and does not proceed.
+- A summary of unmet conditions is presented with links to the relevant findings.
+- The responsible agent is directed to resolve each blocker before retrying.
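+- If audit findings are intentionally deferred, they must be marked as accepted with a rationale. Deferred findings only pass this gate when `allow_deferred_findings` is set to `true` (it defaults to `false`).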
+## Configuration
+- `require_audit_clear`: (boolean, default `true`) Whether all audit findings must be resolved.
+- `require_changelog`: (boolean, default `true`) Whether the changelog must have a new entry.
+- `require_checkpoint`: (boolean, default `true`) Whether a checkpoint commit must exist.
+- `min_quality_score`: (number, default `7`) Minimum execution-quality rubric score to ship.
+- `allow_deferred_findings`: (boolean, default `false`) Whether accepted-but-unresolved findings are permitted.

package/package.json ADDED Viewed

@@ -0,0 +1,92 @@
+{
+  "name": "zenkit",
+  "version": "0.5.0",
+  "private": false,
+  "description": "Disciplined workflows for coding agents — a lightweight open-source protocol layer for AI-assisted software building.",
+  "license": "MIT",
+  "repository": {
+    "type": "git",
+    "url": "https://github.com/carl0zen/zenkit.git"
+  },
+  "keywords": ["ai", "agents", "workflow", "protocol", "benchmark", "schema", "handoff", "cli"],
+  "main": "./dist/index.js",
+  "types": "./dist/index.d.ts",
+  "exports": {
+    ".": {
+      "require": "./dist/index.js",
+      "types": "./dist/index.d.ts"
+    },
+    "./schemas/*": "./schemas/*"
+  },
+  "bin": {
+    "zenkit": "./bin/zenkit.js"
+  },
+  "files": [
+    "dist/",
+    "bin/zenkit.js",
+    "schemas/",
+    "commands/",
+    "skills/",
+    "hooks/",
+    "agents/",
+    "rubrics/",
+    "templates/",
+    "benchmark/feature-specs/",
+    "benchmark/scripts/",
+    "benchmark/fixtures/",
+    "docs/",
+    "README.md",
+    "LICENSE",
+    "CONTRIBUTING.md"
+  ],
+  "scripts": {
+    "zenkit": "tsx bin/zenkit.ts",
+    "dev": "next dev",
+    "build": "next build",
+    "build:lib": "tsc -p tsconfig.lib.json && cp -r schemas dist/schemas",
+    "start": "next start",
+    "lint": "next lint",
+    "test": "vitest run",
+    "test:watch": "vitest",
+    "test:e2e": "playwright test",
+    "benchmark": "tsx benchmark/scripts/run.ts",
+    "benchmark:report": "tsx benchmark/scripts/report.ts",
+    "benchmark:compare": "tsx benchmark/scripts/compare.ts",
+    "benchmark:baseline": "tsx benchmark/scripts/run.ts benchmark/feature-specs/schema-validator-baseline.json",
+    "benchmark:all": "tsx benchmark/scripts/run-all.ts",
+    "benchmark:visualize": "tsx benchmark/scripts/visualize.ts",
+    "validate:schemas": "tsx src/lib/validate-schemas.ts",
+    "prepublishOnly": "npm run build:lib"
+  },
+  "dependencies": {
+    "ajv": "^8.17.1",
+    "ajv-formats": "^3.0.1"
+  },
+  "devDependencies": {
+    "@playwright/test": "^1.59.1",
+    "@types/node": "^22.10.0",
+    "@types/react": "^18.3.12",
+    "@types/react-dom": "^18.3.1",
+    "autoprefixer": "^10.4.20",
+    "eslint": "^8.57.1",
+    "eslint-config-next": "^14.2.35",
+    "next": "^14.2.18",
+    "postcss": "^8.4.49",
+    "react": "^18.3.1",
+    "react-dom": "^18.3.1",
+    "tailwindcss": "^3.4.16",
+    "tsx": "^4.19.2",
+    "typescript": "^5.7.2",
+    "vitest": "^2.1.8"
+  },
+  "peerDependencies": {
+    "next": ">=14",
+    "react": ">=18",
+    "react-dom": ">=18"
+  },
+  "peerDependenciesMeta": {
+    "next": { "optional": true },
+    "react": { "optional": true },
+    "react-dom": { "optional": true }
+  }
+}

package/rubrics/architectural-alignment.md ADDED Viewed

@@ -0,0 +1,26 @@
+# Architectural Alignment
+> Measures how well the implementation follows the planned architecture and project conventions.
+## Scale
+0-10, where 0 is a complete departure from the architecture and 10 is perfect alignment.
+## Criteria
+### Score 0-2: Divergent
+The implementation ignores the architecture plan. Components are in the wrong layers, interfaces do not match contracts, and conventions are not followed. A redesign or major refactor is required to bring the work into alignment.
+### Score 3-4: Misaligned
+The implementation loosely follows the architecture but deviates in significant ways. Some component boundaries are violated, naming conventions are inconsistent, or data flows through unplanned paths. Multiple corrections are needed.
+### Score 5-6: Partially Aligned
+The implementation follows the architecture for major components but drifts on details. Most conventions are followed, but some shortcuts bypass the planned structure. Minor refactoring would bring it into full alignment.
+### Score 7-8: Aligned
+The implementation faithfully follows the architecture plan. Component boundaries are respected, interfaces match contracts, and project conventions are consistently applied. Minor deviations exist but are justified and documented.
+### Score 9-10: Exemplary Alignment
+The implementation is a textbook execution of the architecture plan. Every component, interface, and convention is followed precisely. Deviations, where they exist, improve on the original plan and are documented with rationale.
+## How to Apply
+Use this rubric during implementation-auditor review to assess structural compliance. Compare the implementation against the system-architect's design document, checking component boundaries, interface contracts, data flow paths, and naming conventions. Score each dimension separately and average for the final score. Minimum score to ship is configurable (default: 7).

package/rubrics/execution-quality.md ADDED Viewed

@@ -0,0 +1,26 @@
+# Execution Quality
+> Measures correctness, completeness, and production-readiness of delivered work.
+## Scale
+0-10, where 0 is entirely broken and 10 is flawless production-ready work.
+## Criteria
+### Score 0-2: Failing
+The deliverable does not function. Core requirements are unmet, tests fail or are absent, and the output cannot be used without a full rewrite. Fundamental misunderstandings of the requirements are evident.
+### Score 3-4: Incomplete
+The deliverable partially works but has significant gaps. Some requirements are met, but critical paths are broken or untested. Error handling is missing or incorrect. The work needs substantial rework before it can be reviewed.
+### Score 5-6: Functional
+The deliverable meets most requirements and works for the happy path. Tests exist but coverage is thin. Error handling covers common cases but misses edge cases. The work is reviewable but needs fixes before shipping.
+### Score 7-8: Solid
+The deliverable meets all requirements and handles edge cases well. Test coverage is thorough, error handling is robust, and the code follows project conventions. Minor improvements may be suggested but nothing blocks shipping.
+### Score 9-10: Exemplary
+The deliverable exceeds expectations. All requirements are met with comprehensive test coverage, excellent error handling, clear documentation, and thoughtful handling of edge cases. The work is a reference example for future implementations.
+## How to Apply
+Use this rubric during implementation-auditor review to score the overall quality of a deliverable. The minimum score to pass the pre-ship gate is configurable (default: 7). Score each dimension (correctness, completeness, test coverage, error handling, conventions) individually, then average for the final score.

package/rubrics/verbosity-score.md ADDED Viewed

@@ -0,0 +1,26 @@
+# Verbosity Score
+> Measures conciseness and signal-to-noise ratio of output (lower verbosity = higher score).
+## Scale
+0-10, where 0 is either excessively verbose or so terse that critical context is missing, and 10 is perfectly concise with maximum signal.
+## Criteria
+### Score 0-2: Noise-Dominant
+Output is overwhelmed by filler, repetition, or unnecessary detail. Key information is buried. Alternatively, output is so terse that critical context is missing entirely. The reader cannot extract the needed information efficiently.
+### Score 3-4: Verbose
+Output contains the needed information but is padded with redundant explanations, obvious statements, or excessive caveats. Could be reduced by 40-60% without losing meaning. Reader must skim aggressively to find value.
+### Score 5-6: Adequate
+Output is reasonably concise but has room for tightening. Some redundancy exists, and a few sections could be compressed. Reader gets the information but spends more time than necessary.
+### Score 7-8: Concise
+Output is tight and well-structured. Every paragraph earns its place. Minimal redundancy. Technical detail is present where needed and absent where it is not. Reader extracts information quickly.
+### Score 9-10: Optimal
+Output achieves maximum information density without sacrificing clarity. Every sentence carries meaning. Structure aids rapid comprehension. Nothing can be removed without losing value, and nothing is missing that would add value.
+## How to Apply
+Use this rubric when reviewing agent output, documentation, and reports. Apply it to the technical-writer's deliverables as a shipping gate (minimum: 7). Also use it for self-assessment by any agent producing written output. Score by estimating what percentage of content could be removed without information loss, then mapping to the scale.

package/schemas/audit.schema.json ADDED Viewed

@@ -0,0 +1,63 @@
+{
+  "$schema": "http://json-schema.org/draft-07/schema#",
+  "$id": "https://zenkit.dev/schemas/audit.schema.json",
+  "title": "ZenKit Audit Report",
+  "description": "Structured audit output from a ZenKit audit command.",
+  "type": "object",
+  "required": ["task_id", "auditor", "verdict", "findings"],
+  "properties": {
+    "task_id": {
+      "type": "string"
+    },
+    "auditor": {
+      "type": "string",
+      "description": "Agent or role that performed the audit."
+    },
+    "timestamp": {
+      "type": "string",
+      "format": "date-time"
+    },
+    "verdict": {
+      "type": "string",
+      "enum": ["pass", "fail", "conditional", "needs_review"]
+    },
+    "findings": {
+      "type": "array",
+      "items": {
+        "type": "object",
+        "required": ["category", "severity", "description"],
+        "properties": {
+          "category": {
+            "type": "string",
+            "enum": ["correctness", "security", "performance", "style", "architecture", "testing", "documentation"]
+          },
+          "severity": {
+            "type": "string",
+            "enum": ["info", "warning", "error", "critical"]
+          },
+          "description": { "type": "string" },
+          "file": { "type": "string" },
+          "line": { "type": "integer" },
+          "suggestion": { "type": "string" }
+        }
+      }
+    },
+    "rubric_scores": {
+      "type": "object",
+      "properties": {
+        "execution_quality": { "type": "number", "minimum": 0, "maximum": 10 },
+        "verbosity_score": { "type": "number", "minimum": 0, "maximum": 10 },
+        "architectural_alignment": { "type": "number", "minimum": 0, "maximum": 10 }
+      }
+    },
+    "open_questions": {
+      "type": "array",
+      "items": { "type": "string" }
+    },
+    "recommendations": {
+      "type": "array",
+      "items": { "type": "string" }
+    }
+  },
+  "additionalProperties": false
+}

package/schemas/benchmark.schema.json ADDED Viewed

@@ -0,0 +1,118 @@
+{
+  "$schema": "http://json-schema.org/draft-07/schema#",
+  "$id": "https://zenkit.dev/schemas/benchmark.schema.json",
+  "title": "ZenKit Benchmark Result",
+  "description": "Structured output from a ZenKit benchmark run with acceptance criteria verification.",
+  "type": "object",
+  "required": ["benchmark_id", "version", "mode", "task_name", "started_at", "completed_at", "status", "validation_summary", "acceptance_criteria_results"],
+  "properties": {
+    "benchmark_id": { "type": "string" },
+    "version": { "type": "string" },
+    "mode": {
+      "type": "string",
+      "enum": ["zenkit", "baseline"],
+      "description": "Whether this run used ZenKit workflow structure or an unstructured baseline."
+    },
+    "task_name": { "type": "string" },
+    "feature_spec": { "type": "string" },
+    "started_at": { "type": "string", "format": "date-time" },
+    "completed_at": { "type": "string", "format": "date-time" },
+    "duration_ms": { "type": "integer", "minimum": 0 },
+    "status": {
+      "type": "string",
+      "enum": ["pass", "fail", "partial"]
+    },
+    "expected_files": {
+      "type": "array",
+      "items": { "type": "string" }
+    },
+    "files_found": {
+      "type": "array",
+      "items": { "type": "string" }
+    },
+    "files_missing": {
+      "type": "array",
+      "items": { "type": "string" }
+    },
+    "acceptance_criteria_results": {
+      "type": "array",
+      "items": {
+        "type": "object",
+        "required": ["id", "description", "status", "evidence", "verification_type"],
+        "properties": {
+          "id": { "type": "string" },
+          "description": { "type": "string" },
+          "status": { "type": "string", "enum": ["pass", "fail"] },
+          "evidence": { "type": "string", "description": "What was actually checked and found." },
+          "verification_type": { "type": "string" }
+        }
+      }
+    },
+    "stages": {
+      "type": "array",
+      "items": {
+        "type": "object",
+        "required": ["name", "status", "checks_run", "checks_passed"],
+        "properties": {
+          "name": { "type": "string" },
+          "status": { "type": "string", "enum": ["pass", "fail", "skipped"] },
+          "duration_ms": { "type": "integer" },
+          "checks_run": { "type": "integer", "minimum": 0 },
+          "checks_passed": { "type": "integer", "minimum": 0 },
+          "details": { "type": "array", "items": { "type": "string" } }
+        }
+      }
+    },
+    "validation_summary": {
+      "type": "object",
+      "required": ["total_criteria", "criteria_passed", "criteria_failed"],
+      "properties": {
+        "total_criteria": { "type": "integer" },
+        "criteria_passed": { "type": "integer" },
+        "criteria_failed": { "type": "integer" },
+        "schemas_valid": { "type": "boolean" },
+        "examples_valid": { "type": "boolean" }
+      }
+    },
+    "telemetry": {
+      "type": "object",
+      "required": ["estimated"],
+      "properties": {
+        "estimated": {
+          "type": "object",
+          "required": ["tokens", "cost_usd", "basis"],
+          "properties": {
+            "tokens": { "type": "integer" },
+            "cost_usd": { "type": "number" },
+            "basis": { "type": "string", "description": "How the estimate was calculated." }
+          }
+        },
+        "actual": {
+          "oneOf": [
+            { "type": "null" },
+            {
+              "type": "object",
+              "required": ["tokens", "cost_usd"],
+              "properties": {
+                "tokens": { "type": "integer" },
+                "cost_usd": { "type": "number" }
+              }
+            }
+          ],
+          "description": "Null when no real telemetry is available. Never fabricated."
+        }
+      }
+    },
+    "uncertainty": {
+      "type": "array",
+      "items": { "type": "string" },
+      "description": "What this benchmark does NOT prove."
+    },
+    "limitations": {
+      "type": "array",
+      "items": { "type": "string" },
+      "description": "Inherited from the feature spec — scope boundaries of the verification."
+    }
+  },
+  "additionalProperties": false
+}