@percepta/kaizen 0.6.0 → 0.7.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (137)
  1. package/README.md +54 -126
  2. package/agent/claude-command.md +23 -0
  3. package/agent/evals.md +41 -0
  4. package/agent/overview.md +53 -0
  5. package/agent/variant-builder.md +22 -0
  6. package/agent/views.md +51 -0
  7. package/dashboard/.next/standalone/packages/kaizen/dashboard/.next/BUILD_ID +1 -1
  8. package/dashboard/.next/standalone/packages/kaizen/dashboard/.next/build-manifest.json +22 -22
  9. package/dashboard/.next/standalone/packages/kaizen/dashboard/.next/prerender-manifest.json +3 -3
  10. package/dashboard/.next/standalone/packages/kaizen/dashboard/.next/routes-manifest.json +30 -10
  11. package/dashboard/.next/standalone/packages/kaizen/dashboard/.next/server/chunks/27.js +1 -0
  12. package/dashboard/.next/standalone/packages/kaizen/dashboard/.next/server/chunks/516.js +8 -0
  13. package/dashboard/.next/standalone/packages/kaizen/dashboard/.next/server/chunks/913.js +1 -0
  14. package/dashboard/.next/standalone/packages/kaizen/dashboard/.next/server/middleware-build-manifest.js +1 -1
  15. package/dashboard/.next/standalone/packages/kaizen/dashboard/.next/server/pages/404.html +1 -1
  16. package/dashboard/.next/standalone/packages/kaizen/dashboard/.next/server/pages/500.html +1 -1
  17. package/dashboard/.next/standalone/packages/kaizen/dashboard/.next/server/pages/[system]/benchmarks.html +1 -1
  18. package/dashboard/.next/standalone/packages/kaizen/dashboard/.next/server/pages/[system]/benchmarks.js.nft.json +1 -1
  19. package/dashboard/.next/standalone/packages/kaizen/dashboard/.next/server/pages/[system]/data/[[...path]].html +1 -0
  20. package/dashboard/.next/standalone/packages/kaizen/dashboard/.next/server/pages/[system]/data/[[...path]].js.nft.json +1 -0
  21. package/dashboard/.next/standalone/packages/kaizen/dashboard/.next/server/pages/[system]/eval.html +1 -1
  22. package/dashboard/.next/standalone/packages/kaizen/dashboard/.next/server/pages/[system]/eval.js.nft.json +1 -1
  23. package/dashboard/.next/standalone/packages/kaizen/dashboard/.next/server/pages/[system]/experiments/[[...path]].html +1 -0
  24. package/dashboard/.next/standalone/packages/kaizen/dashboard/.next/server/pages/[system]/experiments/[[...path]].js.nft.json +1 -0
  25. package/dashboard/.next/standalone/packages/kaizen/dashboard/.next/server/pages/[system]/ideas.html +1 -1
  26. package/dashboard/.next/standalone/packages/kaizen/dashboard/.next/server/pages/[system]/ideas.js.nft.json +1 -1
  27. package/dashboard/.next/standalone/packages/kaizen/dashboard/.next/server/pages/api/langfuse-action.js +1 -0
  28. package/dashboard/.next/standalone/packages/kaizen/dashboard/.next/server/pages/api/langfuse-action.js.nft.json +1 -0
  29. package/dashboard/.next/standalone/packages/kaizen/dashboard/.next/server/pages/api/langfuse-dataset-item.js +1 -1
  30. package/dashboard/.next/standalone/packages/kaizen/dashboard/.next/server/pages/api/langfuse-dataset-item.js.nft.json +1 -1
  31. package/dashboard/.next/standalone/packages/kaizen/dashboard/.next/server/pages/api/langfuse-dataset-mutation.js +1 -0
  32. package/dashboard/.next/standalone/packages/kaizen/dashboard/.next/server/pages/api/langfuse-dataset-mutation.js.nft.json +1 -0
  33. package/dashboard/.next/standalone/packages/kaizen/dashboard/.next/server/pages/api/langfuse-dataset.js +1 -1
  34. package/dashboard/.next/standalone/packages/kaizen/dashboard/.next/server/pages/api/langfuse-dataset.js.nft.json +1 -1
  35. package/dashboard/.next/standalone/packages/kaizen/dashboard/.next/server/pages/api/langfuse-datasets.js +1 -1
  36. package/dashboard/.next/standalone/packages/kaizen/dashboard/.next/server/pages/api/langfuse-datasets.js.nft.json +1 -1
  37. package/dashboard/.next/standalone/packages/kaizen/dashboard/.next/server/pages/api/langfuse-trace.js +1 -1
  38. package/dashboard/.next/standalone/packages/kaizen/dashboard/.next/server/pages/api/langfuse-trace.js.nft.json +1 -1
  39. package/dashboard/.next/standalone/packages/kaizen/dashboard/.next/server/pages/api/langfuse-traces.js +1 -0
  40. package/dashboard/.next/standalone/packages/kaizen/dashboard/.next/server/pages/api/langfuse-traces.js.nft.json +1 -0
  41. package/dashboard/.next/standalone/packages/kaizen/dashboard/.next/server/pages/api/linear-ideas.js +2 -2
  42. package/dashboard/.next/standalone/packages/kaizen/dashboard/.next/server/pages/api/linear-ideas.js.nft.json +1 -1
  43. package/dashboard/.next/standalone/packages/kaizen/dashboard/.next/server/pages/api/run-events.js +1 -1
  44. package/dashboard/.next/standalone/packages/kaizen/dashboard/.next/server/pages/api/run-events.js.nft.json +1 -1
  45. package/dashboard/.next/standalone/packages/kaizen/dashboard/.next/server/pages/api/run-failures.js +1 -1
  46. package/dashboard/.next/standalone/packages/kaizen/dashboard/.next/server/pages/api/run-failures.js.nft.json +1 -1
  47. package/dashboard/.next/standalone/packages/kaizen/dashboard/.next/server/pages/api/run-traces.js +1 -1
  48. package/dashboard/.next/standalone/packages/kaizen/dashboard/.next/server/pages/api/run-traces.js.nft.json +1 -1
  49. package/dashboard/.next/standalone/packages/kaizen/dashboard/.next/server/pages/api/runs.js +2 -2
  50. package/dashboard/.next/standalone/packages/kaizen/dashboard/.next/server/pages/api/runs.js.nft.json +1 -1
  51. package/dashboard/.next/standalone/packages/kaizen/dashboard/.next/server/pages/api/systems.js +2 -2
  52. package/dashboard/.next/standalone/packages/kaizen/dashboard/.next/server/pages/api/systems.js.nft.json +1 -1
  53. package/dashboard/.next/standalone/packages/kaizen/dashboard/.next/server/pages/api/trace-renderer.js +1 -1
  54. package/dashboard/.next/standalone/packages/kaizen/dashboard/.next/server/pages/api/trace-renderer.js.nft.json +1 -1
  55. package/dashboard/.next/standalone/packages/kaizen/dashboard/.next/server/pages/index.html +1 -1
  56. package/dashboard/.next/standalone/packages/kaizen/dashboard/.next/server/pages/index.js.nft.json +1 -1
  57. package/dashboard/.next/standalone/packages/kaizen/dashboard/.next/server/pages-manifest.json +8 -5
  58. package/dashboard/.next/standalone/packages/kaizen/dashboard/.next/static/SCF0o7YxElB9rzWaOohsA/_buildManifest.js +1 -0
  59. package/dashboard/.next/standalone/packages/kaizen/dashboard/.next/static/chunks/253-85c76c34f33c9604.js +8 -0
  60. package/dashboard/.next/standalone/packages/kaizen/dashboard/.next/static/chunks/pages/[system]/{benchmarks-559dc9df52db3af4.js → benchmarks-30a17b7659010b8c.js} +1 -1
  61. package/dashboard/.next/standalone/packages/kaizen/dashboard/.next/static/chunks/pages/[system]/data/[[...path]]-e5f4083fe9ffe429.js +1 -0
  62. package/dashboard/.next/standalone/packages/kaizen/dashboard/.next/static/chunks/pages/[system]/{eval-3c911ea8744631fd.js → eval-160237a604b47416.js} +1 -1
  63. package/dashboard/.next/standalone/packages/kaizen/dashboard/.next/static/chunks/pages/[system]/experiments/[[...path]]-91e47a4893093600.js +1 -0
  64. package/dashboard/.next/standalone/packages/kaizen/dashboard/.next/static/chunks/pages/[system]/{ideas-6829a271003150a9.js → ideas-96e58e4624952e26.js} +1 -1
  65. package/dashboard/.next/standalone/packages/kaizen/dashboard/.next/static/chunks/pages/{index-1d8b6719f49e4ae0.js → index-d3306bb6f5d7d235.js} +1 -1
  66. package/dashboard/.next/standalone/packages/kaizen/dashboard/.next/static/css/cd3873236eb77caa.css +1 -0
  67. package/dashboard/.next/standalone/packages/kaizen/package.json +5 -3
  68. package/dashboard/.next/standalone/packages/kaizen/shared/workspace-paths.js +84 -0
  69. package/dist/commands/create-view.js +58 -0
  70. package/dist/commands/create-view.js.map +1 -0
  71. package/dist/commands/guide.js +66 -0
  72. package/dist/commands/guide.js.map +1 -0
  73. package/dist/commands/ideas.js +4 -8
  74. package/dist/commands/ideas.js.map +1 -1
  75. package/dist/commands/init-system.js +22 -20
  76. package/dist/commands/init-system.js.map +1 -1
  77. package/dist/commands/init.js +28 -64
  78. package/dist/commands/init.js.map +1 -1
  79. package/dist/commands/log.js +5 -11
  80. package/dist/commands/log.js.map +1 -1
  81. package/dist/commands/rebuild.js +7 -9
  82. package/dist/commands/rebuild.js.map +1 -1
  83. package/dist/commands/run.js +5 -9
  84. package/dist/commands/run.js.map +1 -1
  85. package/dist/commands/studio.js +3 -3
  86. package/dist/commands/studio.js.map +1 -1
  87. package/dist/index.js +17 -21
  88. package/dist/index.js.map +1 -1
  89. package/dist/lib/cli.js +20 -0
  90. package/dist/lib/cli.js.map +1 -0
  91. package/dist/lib/events.js.map +1 -1
  92. package/dist/lib/fs-utils.js +3 -27
  93. package/dist/lib/fs-utils.js.map +1 -1
  94. package/dist/lib/leaderboard.js +1 -1
  95. package/dist/lib/leaderboard.js.map +1 -1
  96. package/dist/lib/paths.js +3 -3
  97. package/dist/lib/paths.js.map +1 -1
  98. package/dist/lib/promotion.js.map +1 -1
  99. package/dist/lib/run-dir.js +1 -1
  100. package/dist/lib/run-dir.js.map +1 -1
  101. package/dist/lib/runner.js +6 -5
  102. package/dist/lib/runner.js.map +1 -1
  103. package/dist/lib/system.js +4 -2
  104. package/dist/lib/system.js.map +1 -1
  105. package/dist/package.js +5 -3
  106. package/dist/shared/view-types.d.ts +67 -0
  107. package/dist/shared/view-types.d.ts.map +1 -0
  108. package/dist/shared/workspace-paths.js +84 -0
  109. package/dist/shared/workspace-paths.js.map +1 -0
  110. package/dist/types.d.ts +3 -30
  111. package/dist/types.d.ts.map +1 -1
  112. package/package.json +5 -3
  113. package/shared/view-types.d.ts +69 -0
  114. package/shared/view-types.js +1 -0
  115. package/shared/workspace-paths.d.ts +19 -0
  116. package/shared/workspace-paths.js +84 -0
  117. package/templates/system/eval.py +13 -6
  118. package/templates/system/eval.ts +11 -5
  119. package/templates/system/rubric.md +1 -1
  120. package/templates/system/system.md +6 -5
  121. package/templates/view/dataset-item.tsx +63 -0
  122. package/templates/view/trace.tsx +10 -0
  123. package/dashboard/.next/standalone/packages/kaizen/dashboard/.next/server/chunks/715.js +0 -6
  124. package/dashboard/.next/standalone/packages/kaizen/dashboard/.next/server/pages/[system]/data.html +0 -1
  125. package/dashboard/.next/standalone/packages/kaizen/dashboard/.next/server/pages/[system]/data.js.nft.json +0 -1
  126. package/dashboard/.next/standalone/packages/kaizen/dashboard/.next/server/pages/[system]/experiments.html +0 -1
  127. package/dashboard/.next/standalone/packages/kaizen/dashboard/.next/server/pages/[system]/experiments.js.nft.json +0 -1
  128. package/dashboard/.next/standalone/packages/kaizen/dashboard/.next/static/YpQ-I4VL-aEdQrM5uN7_3/_buildManifest.js +0 -1
  129. package/dashboard/.next/standalone/packages/kaizen/dashboard/.next/static/chunks/673-ed4be46027ae7a37.js +0 -6
  130. package/dashboard/.next/standalone/packages/kaizen/dashboard/.next/static/chunks/pages/[system]/data-644e4280b4c86fe0.js +0 -1
  131. package/dashboard/.next/standalone/packages/kaizen/dashboard/.next/static/chunks/pages/[system]/experiments-42f31600c2bb47ad.js +0 -1
  132. package/dashboard/.next/standalone/packages/kaizen/dashboard/.next/static/css/b18a6732b96168e1.css +0 -1
  133. package/dist/lib/env.js +0 -2
  134. package/dist/shared/env.js +0 -4
  135. package/templates/workspace/.claude/agents/variant-builder.md +0 -51
  136. package/templates/workspace/.claude/commands/kaizen.md +0 -65
  137. package/dashboard/.next/standalone/packages/kaizen/dashboard/.next/static/{YpQ-I4VL-aEdQrM5uN7_3 → SCF0o7YxElB9rzWaOohsA}/_ssgManifest.js +0 -0
package/README.md CHANGED
@@ -1,176 +1,104 @@
1
1
  # Kaizen
2
2
 
3
- Automated AI researcher that improves AI systems. Kaizen investigates production traces, builds evaluation datasets, records scored runs, and helps prepare improvements -- driven by Claude Code with a live dashboard.
3
+ Kaizen is an agentic eval platform for AI systems. It helps a coding agent create a system definition, curate a Langfuse-backed dataset, write an eval script, run a baseline, and iterate on variants while Kaizen records scored runs under `kaizen/.kaizen/runs/`.
4
4
 
5
5
  ## Install In A Target Repo
6
6
 
7
- For a persistent local `kaizen` command:
8
-
9
7
  ```bash
10
8
  npm install -g @percepta/kaizen
11
9
  kaizen init
12
- kaizen create system <system-id> # add --eval-language ts for TypeScript
13
- kaizen run --system <system-id> --variant baseline --hypothesis "starting baseline"
10
+ kaizen guide
11
+ kaizen create system <system-id>
12
+ kaizen create view <system-id> --type trace
13
+ kaizen create view <system-id> --type dataset-item
14
+ kaizen run --system <system-id> --variant baseline --diagnostic --hypothesis "starting baseline"
14
15
  kaizen studio
15
16
  ```
16
17
 
17
- For one-off use without a global install:
18
+ For one-off use:
18
19
 
19
20
  ```bash
20
21
  npx @percepta/kaizen init
21
22
  ```
22
23
 
23
- For a repo-local dev dependency:
24
+ Kaizen is installed inside the customer repo. The customer-owned footprint is intentionally small:
24
25
 
25
- ```bash
26
- pnpm add -D @percepta/kaizen
27
- pnpm exec kaizen studio
28
- ```
26
+ - `kaizen/config.ts`
27
+ - `kaizen/systems/<system-id>/system.md`
28
+ - `kaizen/systems/<system-id>/eval.py|ts`
29
+ - optional `kaizen/systems/<system-id>/trace.tsx`
30
+ - optional `kaizen/systems/<system-id>/dataset-item.tsx`
31
+ - optional `kaizen/systems/<system-id>/rubric.md`
32
+ - `kaizen/.kaizen/runs/`
29
33
 
30
- Kaizen is installed inside the customer repo. System definitions, eval scripts, custom views, and `.kaizen/runs/` all live there; the CLI, runner, dashboard shell, and recipes come from the package.
31
-
32
- ## Developing This Repo
33
-
34
- ```bash
35
- pnpm install
36
- pnpm --filter @percepta/kaizen dev:studio
37
- ```
34
+ Package-owned agent guidance is printed with `kaizen guide`. Customer-specific durable notes belong in `kaizen/systems/<system-id>/system.md`; Kaizen does not create repo-level agent markdown such as `KAIZEN.md`, `AGENTS.md`, or `CLAUDE.md`.
38
35
 
39
- This starts the dashboard at http://localhost:6789 against `examples/legacy-workspace`, a transitional fixture with historical customer and system definitions. The CLI and dashboard both live in `packages/kaizen` (CLI in `src/`, Next.js dashboard in `dashboard/`).
36
+ ## Lifecycle
40
37
 
41
- Other dev scripts:
38
+ 1. Run `kaizen init` once in the target repo.
39
+ 2. Run `kaizen create system <system-id>` and fill in `kaizen/systems/<system-id>/system.md`.
40
+ 3. Use Studio Data to create or select a Langfuse dataset, add useful source traces, and label dataset items.
41
+ 4. Replace `kaizen/systems/<system-id>/eval.py|ts` with a real eval that reads the dataset named by `dataset_version`.
42
+ 5. Run a diagnostic baseline, then a full baseline.
43
+ 6. Run variants with `kaizen run`, inspect `kaizen log`, and use Studio to compare runs and failures.
42
44
 
43
- | Script | What it does |
44
- | ------------------------------------------- | --------------------------------- |
45
- | `pnpm --filter @percepta/kaizen dev:studio` | Start the Studio dashboard |
46
- | `pnpm --filter @percepta/kaizen dev:next` | Start only the Next.js dev server |
47
- | `pnpm typecheck` | Typecheck all packages |
48
- | `pnpm test` | Run package tests |
45
+ The eval script emits NDJSON events to `--out-fd`; the runner owns process supervision, `kaizen/.kaizen/runs/`, crash recording, and automatic promotion. For Langfuse-backed evals, the eval should also link each dataset item to the fresh trace generated by that run and write the primary metric as a trace score.
49
46
 
50
- ## Publishing
47
+ ## Custom Views
51
48
 
52
- Publishing `@percepta/kaizen` to npm is automated with Changesets. For changes
53
- that affect the published package, add a changeset:
49
+ Custom views are plain React components co-located with the system:
54
50
 
55
51
  ```bash
56
- pnpm changeset
52
+ kaizen create view <system-id> --type trace
53
+ kaizen create view <system-id> --type dataset-item
57
54
  ```
58
55
 
59
- Merging to `main` runs `.github/workflows/build-and-publish.yml`, which builds
60
- the CLI and bundled Studio, then either opens a version PR or publishes to npm
61
- using the `NPM_TOKEN` repository secret.
62
-
63
- ## How It Works
64
-
65
- Kaizen closes the eval loop for AI systems:
66
-
67
- 1. **Investigate** -- pull production traces from Langfuse, analyze failure patterns
68
- 2. **Build dataset** -- create versioned eval datasets from traces with ground truth
69
- 3. **Annotate** -- label ground truth via the dashboard's inline annotation view
70
- 4. **Record runs** -- test system variants against ground truth, scored automatically
71
- 5. **Improve** -- prepare a PR from the latest promoted baseline when a human asks
72
-
73
- The `/kaizen` slash command in Claude Code orchestrates this workflow. Variant-builder agents can execute in parallel worktrees, but they pass the main checkout's `.kaizen` path via `KAIZEN_STATE_DIR` or `--state-dir` so the dashboard always reads one canonical state tree.
74
-
75
- ## Dashboard
76
-
77
- The web app (Next.js, pages router) provides:
56
+ `trace.tsx` receives the full Langfuse trace payload plus actions for writing scores. `dataset-item.tsx` receives the dataset item, the linked source trace when available, and actions for updating the dataset item or linking run items. Browser-side credentials are not required; Studio proxies the write actions through local API routes.
78
57
 
79
- - **Data** -- inspect Langfuse datasets, dataset items, and source traces
80
- - **Experiments** -- inspect local Kaizen runs from the customer repo's `.kaizen/runs/` store
81
- - **Ideas** -- inspect Linear issues scoped to the system's configured project and the shared `Kaizen` label
82
- - **Source indicators** -- show whether a field is sourced from repo code, Langfuse, Linear, or the local filesystem
58
+ Run `kaizen guide views` for the exact prop and action interfaces.
83
59
 
84
- ### Keyboard Shortcuts
85
-
86
- | Shortcut | Action |
87
- | -------- | -------------- |
88
- | `Cmd+[` | Toggle sidebar |
89
- | `Cmd+/` | Show shortcuts |
90
-
91
- ## System Definitions
92
-
93
- In real use, each target/customer repo owns its own `customers/`, `systems/`, `rubrics/`, `eval/`, and optional `views/` directories. Each system is defined in `systems/*.md` with YAML frontmatter:
60
+ ## Developing This Repo
94
61
 
95
- ```yaml
96
- run_eval: eval/<system>.ts # or .py
97
- eval_version: 1
98
- dataset_version: v1
99
- eval_style: ground-truth
100
- primary_metric: score
62
+ ```bash
63
+ pnpm install
64
+ pnpm --filter @percepta/kaizen dev:studio
101
65
  ```
102
66
 
103
- The eval script emits NDJSON events to `--out-fd`; the runner owns `.kaizen/runs/`.
104
- Kaizen runs Python evals with `python3`, JavaScript evals with `node`, and
105
- TypeScript evals with the package's bundled `tsx` loader. New system scaffolds
106
- default to Python; pass `--eval-language ts` to create a TypeScript eval.
107
- For Langfuse-backed production evals, the same script should also link each
108
- dataset item to the fresh trace produced by that run in a Langfuse dataset run
109
- and write the primary metric as a trace score. Those writes are for durable
110
- trace inspection; the NDJSON `complete.score` remains Kaizen's required result
111
- contract.
112
-
113
- This repo keeps historical definitions under `examples/legacy-workspace/` only as sample data for local Studio development:
114
-
115
- | Customer | System | Primary Metric |
116
- | ---------------- | ---------------------------- | ----------------------- |
117
- | Transcarent | EMO HIE Processing | F2 |
118
- | Transcarent | EMO Facility Processing | F2 |
119
- | Transcarent | EMO Cost Savings Agent | Classification Accuracy |
120
- | Transcarent | EMO Summarization | -- |
121
- | Transcarent | Orbit Call Summarization | Judge Quality |
122
- | Cityblock Health | BOI Chaselist Impact | Calibration Error |
123
- | Cityblock Health | Concurrent Review Agent | -- |
124
- | Cityblock Health | Contract Exclusion Detection | -- |
125
- | Cityblock Health | Quality Gap Modeling | Calibration Error |
126
- | Janus Henderson | Portfolio Analytics | -- |
127
- | Summa Health | Agentic BI (SLCC) | -- |
128
-
129
- ## Repository Structure
67
+ This starts Studio at `http://localhost:6789` against `examples/demo-workspace`, a local fixture for package development. The CLI lives in `src/`; the bundled Next.js Studio lives in `dashboard/`.
130
68
 
131
- ```
132
- kaizen/
133
- ├── packages/kaizen/ # Published @percepta/kaizen package
134
- │ ├── src/ # CLI source
135
- │ ├── dashboard/ # Next.js Studio (built into the published bundle)
136
- │ └── examples/legacy-workspace/ # Transitional customer/system fixture for local dev
137
- ```
138
-
139
- ## Tech Stack
69
+ Useful scripts:
140
70
 
141
- - **Frontend**: Next.js (pages router), TypeScript, CSS modules, dark theme
142
- - **AI**: Claude Code with `/kaizen` skill + variant-builder agents
143
- - **Observability**: Langfuse for traces, datasets, and annotation state; `.kaizen/runs/` for local run truth
144
- - **Package manager**: pnpm
71
+ | Script | What it does |
72
+ | ------------------------------------------- | ---------------------------------- |
73
+ | `pnpm --filter @percepta/kaizen dev:studio` | Start Studio with the demo fixture |
74
+ | `pnpm --filter @percepta/kaizen dev:next` | Start only the Next.js dev server |
75
+ | `pnpm --filter @percepta/kaizen typecheck` | Typecheck the package |
76
+ | `pnpm --filter @percepta/kaizen test` | Run package tests |
145
77
 
146
- ## Environment Setup
78
+ ## Environment
147
79
 
148
- Create `.env.local` in the workspace repo root with Kaizen credentials:
80
+ Create `.env.local` in the workspace repo root:
149
81
 
150
- ```
82
+ ```text
151
83
  LANGFUSE_HOST=https://...
152
84
  LANGFUSE_PUBLIC_KEY=pk-lf-...
153
85
  LANGFUSE_SECRET_KEY=sk-lf-...
154
86
  LINEAR_API_KEY=lin_api_...
87
+ LINEAR_TEAM_KEY=ENG
155
88
  ```
156
89
 
157
- Kaizen Studio reads Langfuse and Linear credentials from the workspace root
158
- `.env.local`. Put the values there instead of app package env files so stale
159
- package-level placeholders cannot shadow the credentials Studio needs.
160
-
161
- Langfuse credentials power the Data surface. `LINEAR_API_KEY` powers the Ideas
162
- surface and the `kaizen ideas --system <id>` CLI command.
90
+ Langfuse credentials power the Data surface and custom view actions. `LINEAR_API_KEY` and `LINEAR_TEAM_KEY` power `kaizen ideas --system <id>`.
163
91
 
164
- System Ideas configuration should use a stable Linear project URL or ID:
92
+ System Ideas configuration should use a stable Linear project URL or ID in `system.md`:
165
93
 
166
94
  ```yaml
167
- linear_project: https://linear.app/aitco/project/kaizen-v0-555399b53e23
95
+ linear_project: https://linear.app/<workspace>/project/<project-slug>
168
96
  ```
169
97
 
170
- Project names are intentionally not used for the connection because they can
171
- change in Linear without changing project identity.
98
+ ## Publishing
172
99
 
173
- ## Docs
100
+ Publishing `@percepta/kaizen` to npm is automated with Changesets. For changes that affect the published package, add a changeset:
174
101
 
175
- - [Langfuse Standards](docs/langfuse-standards.md) -- how we structure Langfuse across all customers
176
- - [Eval Framework](docs/eval-framework.md) -- evaluation philosophy and requirements
102
+ ```bash
103
+ pnpm changeset
104
+ ```
package/agent/claude-command.md ADDED
@@ -0,0 +1,23 @@
1
+ # Kaizen Claude Command Guide
2
+
3
+ Use this as a Claude Code command body when Claude should drive the Kaizen lifecycle. It is package-owned; do not copy it into customer repos as durable markdown unless the user explicitly asks.
4
+
5
+ ## Rules
6
+
7
+ - Never commit PHI or credentials.
8
+ - Run `kaizen guide` first when guidance is not already in context.
9
+ - Put customer-specific notes in `kaizen/systems/<system-id>/system.md`.
10
+ - Use Studio for dataset curation and custom dataset item views for labeling workflows.
11
+
12
+ ## Workflow
13
+
14
+ 1. Select a system. If none is given, list `kaizen/systems/*/system.md` and ask which one.
15
+ 2. Read `system.md`, the relevant application code, and the current output of `kaizen log --system <system-id> --json`.
16
+ 3. If the system is new, run `kaizen create system <system-id> --eval-language py|ts` and fill in the scaffold.
17
+ 4. Use Studio Data to create/select a dataset, add traces, and label expected outputs.
18
+ 5. Replace the starter eval with real code that loads the dataset named by `--dataset`, runs the candidate, emits NDJSON events, and persists Langfuse links/scores when available.
19
+ 6. Run a diagnostic baseline, then the full baseline.
20
+ 7. Iterate variants with `kaizen run`.
21
+ 8. Create `trace.tsx` or `dataset-item.tsx` only when the default views are insufficient.
22
+
23
+ For exact eval and view contracts, run `kaizen guide evals` and `kaizen guide views`.
package/agent/evals.md ADDED
@@ -0,0 +1,41 @@
1
+ # Kaizen Eval Guide
2
+
3
+ Eval scripts are customer-owned executable code stored at `kaizen/systems/<system-id>/eval.py|ts`. `kaizen run` invokes the path named by `run_eval` in `kaizen/systems/<system-id>/system.md`:
4
+
5
+ ```bash
6
+ <run_eval> --variant <variant-id> --dataset <dataset_version> --out-fd 3 [--max-items <n>]
7
+ ```
8
+
9
+ The eval must write NDJSON events to `--out-fd`. Do not write these events to normal stdout.
10
+
11
+ ```json
12
+ {"type":"start","n":10,"eval_version":1,"dataset_version":"v1"}
13
+ {"type":"item","id":"item-1","score":0.8,"breakdown":{"score":0.8},"trace_id":"trace-id-or-null"}
14
+ {"type":"complete","score":0.82,"n":10,"breakdown":{"score":0.82},"worst_traces":[{"id":"item-1","score":0.8,"trace_id":"trace-id-or-null"}]}
15
+ ```
16
+
17
+ The terminal `complete.score` is Kaizen's authoritative result. It must be a number in `[0, 1]`.
18
+
19
+ ## Langfuse Persistence
20
+
21
+ For Langfuse-backed evals:
22
+
23
+ - Treat `--dataset` as the Langfuse dataset name unless `system.md` says otherwise.
24
+ - Load dataset items from that dataset.
25
+ - Run the candidate system for each item.
26
+ - Capture the fresh Langfuse trace id for that item.
27
+ - Link the dataset item to the fresh trace in a Langfuse dataset run.
28
+ - Write the primary metric as a Langfuse score on the fresh trace.
29
+ - Emit the same item score through Kaizen's NDJSON stream.
30
+
31
+ Langfuse stores trace inspection, dataset-run history, and score metadata. `kaizen/.kaizen/runs/` remains the source of truth for promotion and run state.
32
+
33
+ ## Baseline
34
+
35
+ Run a diagnostic baseline first:
36
+
37
+ ```bash
38
+ kaizen run --system <system-id> --variant baseline --diagnostic --hypothesis "starting baseline"
39
+ ```
40
+
41
+ If setup, credentials, dataset access, and event schema are valid, run the full baseline without `--diagnostic`.
package/agent/overview.md ADDED
@@ -0,0 +1,53 @@
1
+ # Kaizen Agent Guide
2
+
3
+ Kaizen helps a coding agent define, evaluate, inspect, and improve an AI system inside the customer repo. This guide is package-owned; rerun `kaizen guide` after package upgrades.
4
+
5
+ Do not create extra long-lived agent markdown files. Customer-specific notes belong in `kaizen/systems/<system-id>/system.md`; repo-owned code belongs beside it in `kaizen/systems/<system-id>/`.
6
+
7
+ ## Commands
8
+
9
+ Run commands from the repo root:
10
+
11
+ - `kaizen init` - scaffold Kaizen once.
12
+ - `kaizen guide topics` - list focused guide topics.
13
+ - `kaizen create system <system-id> --eval-language py|ts` - create `kaizen/systems/<system-id>/system.md` and `kaizen/systems/<system-id>/eval.py|ts`.
14
+ - `kaizen create view <system-id> --type trace` - create `kaizen/systems/<system-id>/trace.tsx`.
15
+ - `kaizen create view <system-id> --type dataset-item` - create `kaizen/systems/<system-id>/dataset-item.tsx`.
16
+ - `kaizen studio` - open Studio for dataset curation, trace inspection, and run review.
17
+ - `kaizen run --system <system-id> --variant <variant-id> --hypothesis "<why>"` - record one eval run.
18
+ - `kaizen run --system <system-id> --variant <variant-id> --diagnostic --hypothesis "<why>"` - run a small diagnostic sample first.
19
+ - `kaizen log --system <system-id> --json` - inspect the promoted baseline and recent runs.
20
+
21
+ Run state is written to `kaizen/.kaizen/`. When evaluating from a Git linked worktree, Kaizen automatically stores run state in the primary checkout's `kaizen/.kaizen/`.
22
+
23
+ ## Files
24
+
25
+ - `kaizen/systems/<system-id>/system.md` is the durable system definition. It should explain the workflow, key files, setup, dataset, metric, known failures, and variant ideas.
26
+ - `kaizen/systems/<system-id>/eval.py|ts` is the eval entrypoint named by `run_eval`.
27
+ - `kaizen/systems/<system-id>/trace.tsx` is an optional custom trace view.
28
+ - `kaizen/systems/<system-id>/dataset-item.tsx` is an optional custom dataset labeling view.
29
+ - `kaizen/systems/<system-id>/rubric.md` is optional and only needed for LLM-as-judge or hybrid evals.
30
+
31
+ Each `system.md` must include:
32
+
33
+ ```yaml
34
+ run_eval: kaizen/systems/<system-id>/eval.py
35
+ eval_version: 1
36
+ dataset_version: <langfuse-dataset-name>
37
+ eval_style: ground-truth
38
+ primary_metric: score
39
+ target: 0.90
40
+ ```
41
+
42
+ ## Lifecycle
43
+
44
+ 1. Run `kaizen create system <system-id>` unless the system already exists.
45
+ 2. Read the codebase and fill in `system.md` with real key files, setup, data sources, dataset, and metric.
46
+ 3. Use Studio Data to create/select a dataset, add representative traces, and label expected outputs.
47
+ 4. Replace the starter eval with real code that loads the dataset named by `--dataset`.
48
+ 5. Run a diagnostic baseline.
49
+ 6. Run the full baseline.
50
+ 7. Iterate on variants with `kaizen run`; read `kaizen log` and Studio failures between attempts.
51
+ 8. Create custom views only when the default JSON views are not enough for trace inspection or dataset labeling.
52
+
53
+ For eval details, run `kaizen guide evals`. For view props and actions, run `kaizen guide views`.
package/agent/variant-builder.md ADDED
@@ -0,0 +1,22 @@
1
+ # Kaizen Variant Builder Guide
2
+
3
+ You implement and evaluate one variant, record one run with `kaizen run`, then stop.
4
+
5
+ ## Setup
6
+
7
+ 1. Work in the assigned worktree, not the main checkout.
8
+ 2. Let Kaizen auto-detect the primary checkout for run state. Runs from linked worktrees are recorded under the primary checkout's `kaizen/.kaizen/`.
9
+ 3. Read `kaizen/systems/<system-id>/system.md`, the parent run manifest when present, and the parent failures.
10
+ 4. Install or start only what the system setup section requires.
11
+
12
+ ## Run
13
+
14
+ ```bash
15
+ kaizen run \
16
+ --system <system-id> \
17
+ --variant <variant-id> \
18
+ --parent <parent-run-id> \
19
+ --hypothesis "<what changed and why>"
20
+ ```
21
+
22
+ The runner owns process supervision, `kaizen/.kaizen/runs/`, crash recording, and promotion. Read the single summary line it prints and include the run id and score in your handoff.
package/agent/views.md ADDED
@@ -0,0 +1,51 @@
1
+ # Kaizen Custom Views Guide
2
+
3
+ Custom views are customer-owned React components co-located with the system.
4
+
5
+ ```bash
6
+ kaizen create view <system-id> --type trace
7
+ kaizen create view <system-id> --type dataset-item
8
+ ```
9
+
10
+ Studio loads:
11
+
12
+ - `kaizen/systems/<system-id>/trace.tsx`
13
+ - `kaizen/systems/<system-id>/dataset-item.tsx`
14
+
15
+ No `system.md` frontmatter field is required.
16
+
17
+ ## Trace View
18
+
19
+ ```tsx
20
+ import type { TraceRendererProps } from "@percepta/kaizen";
21
+
22
+ export default function TraceView({ trace, actions }: TraceRendererProps) {
23
+ return <pre>{JSON.stringify(trace, null, 2)}</pre>;
24
+ }
25
+ ```
26
+
27
+ Trace views receive `{ trace, context, actions }`. `actions.createScore(...)` writes a Langfuse score for the current or supplied trace id.
28
+
29
+ ## Dataset Item View
30
+
31
+ ```tsx
32
+ import type { DatasetItemRendererProps } from "@percepta/kaizen";
33
+
34
+ export default function DatasetItemView({
35
+ datasetItem,
36
+ trace,
37
+ actions,
38
+ }: DatasetItemRendererProps) {
39
+ return <pre>{JSON.stringify({ datasetItem, trace }, null, 2)}</pre>;
40
+ }
41
+ ```
42
+
43
+ Dataset item views receive `{ datasetItem, trace, context, actions }`. Use them for labeling expected output, metadata, review status, and scoring workflows.
44
+
45
+ Available dataset actions:
46
+
47
+ - `actions.updateDatasetItem({ expectedOutput?, metadata?, input?, sourceTraceId?, status? })`
48
+ - `actions.createDatasetRunItem({ runName, datasetItemId?, traceId?, runDescription?, metadata? })`
49
+ - `actions.createScore({ name, value, traceId?, comment?, metadata? })`
50
+
51
+ When omitted, target identifiers such as `datasetName`, `itemId`/`datasetItemId`, and `traceId` default to the current Studio selection where Studio can infer them.
@@ -1 +1 @@
1
- YpQ-I4VL-aEdQrM5uN7_3
1
+ SCF0o7YxElB9rzWaOohsA
@@ -4,8 +4,8 @@
4
4
  ],
5
5
  "devFiles": [],
6
6
  "lowPriorityFiles": [
7
- "static/YpQ-I4VL-aEdQrM5uN7_3/_buildManifest.js",
8
- "static/YpQ-I4VL-aEdQrM5uN7_3/_ssgManifest.js"
7
+ "static/SCF0o7YxElB9rzWaOohsA/_buildManifest.js",
8
+ "static/SCF0o7YxElB9rzWaOohsA/_ssgManifest.js"
9
9
  ],
10
10
  "rootMainFiles": [],
11
11
  "rootMainFilesTree": {},
@@ -15,9 +15,9 @@
15
15
  "static/chunks/framework-7089c270fe56b51f.js",
16
16
  "static/chunks/main-7ac7f96d288497aa.js",
17
17
  "static/chunks/431-43358ce3c29e5e1b.js",
18
- "static/css/b18a6732b96168e1.css",
19
- "static/chunks/673-ed4be46027ae7a37.js",
20
- "static/chunks/pages/index-1d8b6719f49e4ae0.js"
18
+ "static/css/cd3873236eb77caa.css",
19
+ "static/chunks/253-85c76c34f33c9604.js",
20
+ "static/chunks/pages/index-d3306bb6f5d7d235.js"
21
21
  ],
22
22
  "/[system]": [
23
23
  "static/chunks/webpack-8c7966d82a2912f0.js",
@@ -30,45 +30,45 @@
30
30
  "static/chunks/framework-7089c270fe56b51f.js",
31
31
  "static/chunks/main-7ac7f96d288497aa.js",
32
32
  "static/chunks/431-43358ce3c29e5e1b.js",
33
- "static/css/b18a6732b96168e1.css",
34
- "static/chunks/673-ed4be46027ae7a37.js",
35
- "static/chunks/pages/[system]/benchmarks-559dc9df52db3af4.js"
33
+ "static/css/cd3873236eb77caa.css",
34
+ "static/chunks/253-85c76c34f33c9604.js",
35
+ "static/chunks/pages/[system]/benchmarks-30a17b7659010b8c.js"
36
36
  ],
37
- "/[system]/data": [
37
+ "/[system]/data/[[...path]]": [
38
38
  "static/chunks/webpack-8c7966d82a2912f0.js",
39
39
  "static/chunks/framework-7089c270fe56b51f.js",
40
40
  "static/chunks/main-7ac7f96d288497aa.js",
41
41
  "static/chunks/431-43358ce3c29e5e1b.js",
42
- "static/css/b18a6732b96168e1.css",
43
- "static/chunks/673-ed4be46027ae7a37.js",
44
- "static/chunks/pages/[system]/data-644e4280b4c86fe0.js"
42
+ "static/css/cd3873236eb77caa.css",
43
+ "static/chunks/253-85c76c34f33c9604.js",
44
+ "static/chunks/pages/[system]/data/[[...path]]-e5f4083fe9ffe429.js"
45
45
  ],
46
46
  "/[system]/eval": [
47
47
  "static/chunks/webpack-8c7966d82a2912f0.js",
48
48
  "static/chunks/framework-7089c270fe56b51f.js",
49
49
  "static/chunks/main-7ac7f96d288497aa.js",
50
50
  "static/chunks/431-43358ce3c29e5e1b.js",
51
- "static/css/b18a6732b96168e1.css",
52
- "static/chunks/673-ed4be46027ae7a37.js",
53
- "static/chunks/pages/[system]/eval-3c911ea8744631fd.js"
51
+ "static/css/cd3873236eb77caa.css",
52
+ "static/chunks/253-85c76c34f33c9604.js",
53
+ "static/chunks/pages/[system]/eval-160237a604b47416.js"
54
54
  ],
55
- "/[system]/experiments": [
55
+ "/[system]/experiments/[[...path]]": [
56
56
  "static/chunks/webpack-8c7966d82a2912f0.js",
57
57
  "static/chunks/framework-7089c270fe56b51f.js",
58
58
  "static/chunks/main-7ac7f96d288497aa.js",
59
59
  "static/chunks/431-43358ce3c29e5e1b.js",
60
- "static/css/b18a6732b96168e1.css",
61
- "static/chunks/673-ed4be46027ae7a37.js",
62
- "static/chunks/pages/[system]/experiments-42f31600c2bb47ad.js"
60
+ "static/css/cd3873236eb77caa.css",
61
+ "static/chunks/253-85c76c34f33c9604.js",
62
+ "static/chunks/pages/[system]/experiments/[[...path]]-91e47a4893093600.js"
63
63
  ],
64
64
  "/[system]/ideas": [
65
65
  "static/chunks/webpack-8c7966d82a2912f0.js",
66
66
  "static/chunks/framework-7089c270fe56b51f.js",
67
67
  "static/chunks/main-7ac7f96d288497aa.js",
68
68
  "static/chunks/431-43358ce3c29e5e1b.js",
69
- "static/css/b18a6732b96168e1.css",
70
- "static/chunks/673-ed4be46027ae7a37.js",
71
- "static/chunks/pages/[system]/ideas-6829a271003150a9.js"
69
+ "static/css/cd3873236eb77caa.css",
70
+ "static/chunks/253-85c76c34f33c9604.js",
71
+ "static/chunks/pages/[system]/ideas-96e58e4624952e26.js"
72
72
  ],
73
73
  "/_app": [
74
74
  "static/chunks/webpack-8c7966d82a2912f0.js",
@@ -3,9 +3,9 @@
3
3
  "routes": {},
4
4
  "dynamicRoutes": {},
5
5
  "preview": {
6
- "previewModeId": "02bf50b6d1c114ab64891ff63b9ae67b",
7
- "previewModeSigningKey": "fe223618cfb9bb61306a9dea8261c44ddd7141789a6543970e870a61bf19da51",
8
- "previewModeEncryptionKey": "a64d9df7a36d787a5996f6fb7171afb1fc8b7d35ad4962f94f482a0b687146c9"
6
+ "previewModeId": "0ba1834cfc7d7c8ea8708ad29269e503",
7
+ "previewModeSigningKey": "4cdb34c8c9deccef53be3f014ab0ccdd422a7b57e71290f1ce447fd0a2e2a138",
8
+ "previewModeEncryptionKey": "c48cc1326503410c45e8e0baf97d801d067e317fa759e378825440098271163f"
9
9
  },
10
10
  "notFoundRoutes": []
11
11
  }
@@ -38,12 +38,13 @@
38
38
  "namedRegex": "^/(?<nxtPsystem>[^/]+?)/benchmarks(?:/)?$"
39
39
  },
40
40
  {
41
- "page": "/[system]/data",
42
- "regex": "^/([^/]+?)/data(?:/)?$",
41
+ "page": "/[system]/data/[[...path]]",
42
+ "regex": "^/([^/]+?)/data(?:/(.+?))?(?:/)?$",
43
43
  "routeKeys": {
44
- "nxtPsystem": "nxtPsystem"
44
+ "nxtPsystem": "nxtPsystem",
45
+ "nxtPpath": "nxtPpath"
45
46
  },
46
- "namedRegex": "^/(?<nxtPsystem>[^/]+?)/data(?:/)?$"
47
+ "namedRegex": "^/(?<nxtPsystem>[^/]+?)/data(?:/(?<nxtPpath>.+?))?(?:/)?$"
47
48
  },
48
49
  {
49
50
  "page": "/[system]/eval",
@@ -54,12 +55,13 @@
54
55
  "namedRegex": "^/(?<nxtPsystem>[^/]+?)/eval(?:/)?$"
55
56
  },
56
57
  {
57
- "page": "/[system]/experiments",
58
- "regex": "^/([^/]+?)/experiments(?:/)?$",
58
+ "page": "/[system]/experiments/[[...path]]",
59
+ "regex": "^/([^/]+?)/experiments(?:/(.+?))?(?:/)?$",
59
60
  "routeKeys": {
60
- "nxtPsystem": "nxtPsystem"
61
+ "nxtPsystem": "nxtPsystem",
62
+ "nxtPpath": "nxtPpath"
61
63
  },
62
- "namedRegex": "^/(?<nxtPsystem>[^/]+?)/experiments(?:/)?$"
64
+ "namedRegex": "^/(?<nxtPsystem>[^/]+?)/experiments(?:/(?<nxtPpath>.+?))?(?:/)?$"
63
65
  },
64
66
  {
65
67
  "page": "/[system]/ideas",
@@ -77,6 +79,12 @@
77
79
  "routeKeys": {},
78
80
  "namedRegex": "^/(?:/)?$"
79
81
  },
82
+ {
83
+ "page": "/api/langfuse-action",
84
+ "regex": "^/api/langfuse\\-action(?:/)?$",
85
+ "routeKeys": {},
86
+ "namedRegex": "^/api/langfuse\\-action(?:/)?$"
87
+ },
80
88
  {
81
89
  "page": "/api/langfuse-dataset",
82
90
  "regex": "^/api/langfuse\\-dataset(?:/)?$",
@@ -89,6 +97,12 @@
89
97
  "routeKeys": {},
90
98
  "namedRegex": "^/api/langfuse\\-dataset\\-item(?:/)?$"
91
99
  },
100
+ {
101
+ "page": "/api/langfuse-dataset-mutation",
102
+ "regex": "^/api/langfuse\\-dataset\\-mutation(?:/)?$",
103
+ "routeKeys": {},
104
+ "namedRegex": "^/api/langfuse\\-dataset\\-mutation(?:/)?$"
105
+ },
92
106
  {
93
107
  "page": "/api/langfuse-datasets",
94
108
  "regex": "^/api/langfuse\\-datasets(?:/)?$",
@@ -101,6 +115,12 @@
101
115
  "routeKeys": {},
102
116
  "namedRegex": "^/api/langfuse\\-trace(?:/)?$"
103
117
  },
118
+ {
119
+ "page": "/api/langfuse-traces",
120
+ "regex": "^/api/langfuse\\-traces(?:/)?$",
121
+ "routeKeys": {},
122
+ "namedRegex": "^/api/langfuse\\-traces(?:/)?$"
123
+ },
104
124
  {
105
125
  "page": "/api/linear-ideas",
106
126
  "regex": "^/api/linear\\-ideas(?:/)?$",
@@ -150,8 +170,8 @@
150
170
  "routeKeys": {
151
171
  "nxtPsystem": "nxtPsystem"
152
172
  },
153
- "dataRouteRegex": "^/_next/data/YpQ\\-I4VL\\-aEdQrM5uN7_3/([^/]+?)\\.json$",
154
- "namedDataRouteRegex": "^/_next/data/YpQ\\-I4VL\\-aEdQrM5uN7_3/(?<nxtPsystem>[^/]+?)\\.json$"
173
+ "dataRouteRegex": "^/_next/data/SCF0o7YxElB9rzWaOohsA/([^/]+?)\\.json$",
174
+ "namedDataRouteRegex": "^/_next/data/SCF0o7YxElB9rzWaOohsA/(?<nxtPsystem>[^/]+?)\\.json$"
155
175
  }
156
176
  ],
157
177
  "rsc": {