@percepta/kaizen 0.6.0 → 0.7.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (137) hide show
  1. package/README.md +54 -126
  2. package/agent/claude-command.md +23 -0
  3. package/agent/evals.md +41 -0
  4. package/agent/overview.md +53 -0
  5. package/agent/variant-builder.md +22 -0
  6. package/agent/views.md +51 -0
  7. package/dashboard/.next/standalone/packages/kaizen/dashboard/.next/BUILD_ID +1 -1
  8. package/dashboard/.next/standalone/packages/kaizen/dashboard/.next/build-manifest.json +22 -22
  9. package/dashboard/.next/standalone/packages/kaizen/dashboard/.next/prerender-manifest.json +3 -3
  10. package/dashboard/.next/standalone/packages/kaizen/dashboard/.next/routes-manifest.json +30 -10
  11. package/dashboard/.next/standalone/packages/kaizen/dashboard/.next/server/chunks/27.js +1 -0
  12. package/dashboard/.next/standalone/packages/kaizen/dashboard/.next/server/chunks/516.js +8 -0
  13. package/dashboard/.next/standalone/packages/kaizen/dashboard/.next/server/chunks/913.js +1 -0
  14. package/dashboard/.next/standalone/packages/kaizen/dashboard/.next/server/middleware-build-manifest.js +1 -1
  15. package/dashboard/.next/standalone/packages/kaizen/dashboard/.next/server/pages/404.html +1 -1
  16. package/dashboard/.next/standalone/packages/kaizen/dashboard/.next/server/pages/500.html +1 -1
  17. package/dashboard/.next/standalone/packages/kaizen/dashboard/.next/server/pages/[system]/benchmarks.html +1 -1
  18. package/dashboard/.next/standalone/packages/kaizen/dashboard/.next/server/pages/[system]/benchmarks.js.nft.json +1 -1
  19. package/dashboard/.next/standalone/packages/kaizen/dashboard/.next/server/pages/[system]/data/[[...path]].html +1 -0
  20. package/dashboard/.next/standalone/packages/kaizen/dashboard/.next/server/pages/[system]/data/[[...path]].js.nft.json +1 -0
  21. package/dashboard/.next/standalone/packages/kaizen/dashboard/.next/server/pages/[system]/eval.html +1 -1
  22. package/dashboard/.next/standalone/packages/kaizen/dashboard/.next/server/pages/[system]/eval.js.nft.json +1 -1
  23. package/dashboard/.next/standalone/packages/kaizen/dashboard/.next/server/pages/[system]/experiments/[[...path]].html +1 -0
  24. package/dashboard/.next/standalone/packages/kaizen/dashboard/.next/server/pages/[system]/experiments/[[...path]].js.nft.json +1 -0
  25. package/dashboard/.next/standalone/packages/kaizen/dashboard/.next/server/pages/[system]/ideas.html +1 -1
  26. package/dashboard/.next/standalone/packages/kaizen/dashboard/.next/server/pages/[system]/ideas.js.nft.json +1 -1
  27. package/dashboard/.next/standalone/packages/kaizen/dashboard/.next/server/pages/api/langfuse-action.js +1 -0
  28. package/dashboard/.next/standalone/packages/kaizen/dashboard/.next/server/pages/api/langfuse-action.js.nft.json +1 -0
  29. package/dashboard/.next/standalone/packages/kaizen/dashboard/.next/server/pages/api/langfuse-dataset-item.js +1 -1
  30. package/dashboard/.next/standalone/packages/kaizen/dashboard/.next/server/pages/api/langfuse-dataset-item.js.nft.json +1 -1
  31. package/dashboard/.next/standalone/packages/kaizen/dashboard/.next/server/pages/api/langfuse-dataset-mutation.js +1 -0
  32. package/dashboard/.next/standalone/packages/kaizen/dashboard/.next/server/pages/api/langfuse-dataset-mutation.js.nft.json +1 -0
  33. package/dashboard/.next/standalone/packages/kaizen/dashboard/.next/server/pages/api/langfuse-dataset.js +1 -1
  34. package/dashboard/.next/standalone/packages/kaizen/dashboard/.next/server/pages/api/langfuse-dataset.js.nft.json +1 -1
  35. package/dashboard/.next/standalone/packages/kaizen/dashboard/.next/server/pages/api/langfuse-datasets.js +1 -1
  36. package/dashboard/.next/standalone/packages/kaizen/dashboard/.next/server/pages/api/langfuse-datasets.js.nft.json +1 -1
  37. package/dashboard/.next/standalone/packages/kaizen/dashboard/.next/server/pages/api/langfuse-trace.js +1 -1
  38. package/dashboard/.next/standalone/packages/kaizen/dashboard/.next/server/pages/api/langfuse-trace.js.nft.json +1 -1
  39. package/dashboard/.next/standalone/packages/kaizen/dashboard/.next/server/pages/api/langfuse-traces.js +1 -0
  40. package/dashboard/.next/standalone/packages/kaizen/dashboard/.next/server/pages/api/langfuse-traces.js.nft.json +1 -0
  41. package/dashboard/.next/standalone/packages/kaizen/dashboard/.next/server/pages/api/linear-ideas.js +2 -2
  42. package/dashboard/.next/standalone/packages/kaizen/dashboard/.next/server/pages/api/linear-ideas.js.nft.json +1 -1
  43. package/dashboard/.next/standalone/packages/kaizen/dashboard/.next/server/pages/api/run-events.js +1 -1
  44. package/dashboard/.next/standalone/packages/kaizen/dashboard/.next/server/pages/api/run-events.js.nft.json +1 -1
  45. package/dashboard/.next/standalone/packages/kaizen/dashboard/.next/server/pages/api/run-failures.js +1 -1
  46. package/dashboard/.next/standalone/packages/kaizen/dashboard/.next/server/pages/api/run-failures.js.nft.json +1 -1
  47. package/dashboard/.next/standalone/packages/kaizen/dashboard/.next/server/pages/api/run-traces.js +1 -1
  48. package/dashboard/.next/standalone/packages/kaizen/dashboard/.next/server/pages/api/run-traces.js.nft.json +1 -1
  49. package/dashboard/.next/standalone/packages/kaizen/dashboard/.next/server/pages/api/runs.js +2 -2
  50. package/dashboard/.next/standalone/packages/kaizen/dashboard/.next/server/pages/api/runs.js.nft.json +1 -1
  51. package/dashboard/.next/standalone/packages/kaizen/dashboard/.next/server/pages/api/systems.js +2 -2
  52. package/dashboard/.next/standalone/packages/kaizen/dashboard/.next/server/pages/api/systems.js.nft.json +1 -1
  53. package/dashboard/.next/standalone/packages/kaizen/dashboard/.next/server/pages/api/trace-renderer.js +1 -1
  54. package/dashboard/.next/standalone/packages/kaizen/dashboard/.next/server/pages/api/trace-renderer.js.nft.json +1 -1
  55. package/dashboard/.next/standalone/packages/kaizen/dashboard/.next/server/pages/index.html +1 -1
  56. package/dashboard/.next/standalone/packages/kaizen/dashboard/.next/server/pages/index.js.nft.json +1 -1
  57. package/dashboard/.next/standalone/packages/kaizen/dashboard/.next/server/pages-manifest.json +8 -5
  58. package/dashboard/.next/standalone/packages/kaizen/dashboard/.next/static/SCF0o7YxElB9rzWaOohsA/_buildManifest.js +1 -0
  59. package/dashboard/.next/standalone/packages/kaizen/dashboard/.next/static/chunks/253-85c76c34f33c9604.js +8 -0
  60. package/dashboard/.next/standalone/packages/kaizen/dashboard/.next/static/chunks/pages/[system]/{benchmarks-559dc9df52db3af4.js → benchmarks-30a17b7659010b8c.js} +1 -1
  61. package/dashboard/.next/standalone/packages/kaizen/dashboard/.next/static/chunks/pages/[system]/data/[[...path]]-e5f4083fe9ffe429.js +1 -0
  62. package/dashboard/.next/standalone/packages/kaizen/dashboard/.next/static/chunks/pages/[system]/{eval-3c911ea8744631fd.js → eval-160237a604b47416.js} +1 -1
  63. package/dashboard/.next/standalone/packages/kaizen/dashboard/.next/static/chunks/pages/[system]/experiments/[[...path]]-91e47a4893093600.js +1 -0
  64. package/dashboard/.next/standalone/packages/kaizen/dashboard/.next/static/chunks/pages/[system]/{ideas-6829a271003150a9.js → ideas-96e58e4624952e26.js} +1 -1
  65. package/dashboard/.next/standalone/packages/kaizen/dashboard/.next/static/chunks/pages/{index-1d8b6719f49e4ae0.js → index-d3306bb6f5d7d235.js} +1 -1
  66. package/dashboard/.next/standalone/packages/kaizen/dashboard/.next/static/css/cd3873236eb77caa.css +1 -0
  67. package/dashboard/.next/standalone/packages/kaizen/package.json +5 -3
  68. package/dashboard/.next/standalone/packages/kaizen/shared/workspace-paths.js +84 -0
  69. package/dist/commands/create-view.js +58 -0
  70. package/dist/commands/create-view.js.map +1 -0
  71. package/dist/commands/guide.js +66 -0
  72. package/dist/commands/guide.js.map +1 -0
  73. package/dist/commands/ideas.js +4 -8
  74. package/dist/commands/ideas.js.map +1 -1
  75. package/dist/commands/init-system.js +22 -20
  76. package/dist/commands/init-system.js.map +1 -1
  77. package/dist/commands/init.js +28 -64
  78. package/dist/commands/init.js.map +1 -1
  79. package/dist/commands/log.js +5 -11
  80. package/dist/commands/log.js.map +1 -1
  81. package/dist/commands/rebuild.js +7 -9
  82. package/dist/commands/rebuild.js.map +1 -1
  83. package/dist/commands/run.js +5 -9
  84. package/dist/commands/run.js.map +1 -1
  85. package/dist/commands/studio.js +3 -3
  86. package/dist/commands/studio.js.map +1 -1
  87. package/dist/index.js +17 -21
  88. package/dist/index.js.map +1 -1
  89. package/dist/lib/cli.js +20 -0
  90. package/dist/lib/cli.js.map +1 -0
  91. package/dist/lib/events.js.map +1 -1
  92. package/dist/lib/fs-utils.js +3 -27
  93. package/dist/lib/fs-utils.js.map +1 -1
  94. package/dist/lib/leaderboard.js +1 -1
  95. package/dist/lib/leaderboard.js.map +1 -1
  96. package/dist/lib/paths.js +3 -3
  97. package/dist/lib/paths.js.map +1 -1
  98. package/dist/lib/promotion.js.map +1 -1
  99. package/dist/lib/run-dir.js +1 -1
  100. package/dist/lib/run-dir.js.map +1 -1
  101. package/dist/lib/runner.js +6 -5
  102. package/dist/lib/runner.js.map +1 -1
  103. package/dist/lib/system.js +4 -2
  104. package/dist/lib/system.js.map +1 -1
  105. package/dist/package.js +5 -3
  106. package/dist/shared/view-types.d.ts +67 -0
  107. package/dist/shared/view-types.d.ts.map +1 -0
  108. package/dist/shared/workspace-paths.js +84 -0
  109. package/dist/shared/workspace-paths.js.map +1 -0
  110. package/dist/types.d.ts +3 -30
  111. package/dist/types.d.ts.map +1 -1
  112. package/package.json +5 -3
  113. package/shared/view-types.d.ts +69 -0
  114. package/shared/view-types.js +1 -0
  115. package/shared/workspace-paths.d.ts +19 -0
  116. package/shared/workspace-paths.js +84 -0
  117. package/templates/system/eval.py +13 -6
  118. package/templates/system/eval.ts +11 -5
  119. package/templates/system/rubric.md +1 -1
  120. package/templates/system/system.md +6 -5
  121. package/templates/view/dataset-item.tsx +63 -0
  122. package/templates/view/trace.tsx +10 -0
  123. package/dashboard/.next/standalone/packages/kaizen/dashboard/.next/server/chunks/715.js +0 -6
  124. package/dashboard/.next/standalone/packages/kaizen/dashboard/.next/server/pages/[system]/data.html +0 -1
  125. package/dashboard/.next/standalone/packages/kaizen/dashboard/.next/server/pages/[system]/data.js.nft.json +0 -1
  126. package/dashboard/.next/standalone/packages/kaizen/dashboard/.next/server/pages/[system]/experiments.html +0 -1
  127. package/dashboard/.next/standalone/packages/kaizen/dashboard/.next/server/pages/[system]/experiments.js.nft.json +0 -1
  128. package/dashboard/.next/standalone/packages/kaizen/dashboard/.next/static/YpQ-I4VL-aEdQrM5uN7_3/_buildManifest.js +0 -1
  129. package/dashboard/.next/standalone/packages/kaizen/dashboard/.next/static/chunks/673-ed4be46027ae7a37.js +0 -6
  130. package/dashboard/.next/standalone/packages/kaizen/dashboard/.next/static/chunks/pages/[system]/data-644e4280b4c86fe0.js +0 -1
  131. package/dashboard/.next/standalone/packages/kaizen/dashboard/.next/static/chunks/pages/[system]/experiments-42f31600c2bb47ad.js +0 -1
  132. package/dashboard/.next/standalone/packages/kaizen/dashboard/.next/static/css/b18a6732b96168e1.css +0 -1
  133. package/dist/lib/env.js +0 -2
  134. package/dist/shared/env.js +0 -4
  135. package/templates/workspace/.claude/agents/variant-builder.md +0 -51
  136. package/templates/workspace/.claude/commands/kaizen.md +0 -65
  137. /package/dashboard/.next/standalone/packages/kaizen/dashboard/.next/static/{YpQ-I4VL-aEdQrM5uN7_3 → SCF0o7YxElB9rzWaOohsA}/_ssgManifest.js +0 -0
package/dist/types.d.ts CHANGED
@@ -1,38 +1,11 @@
1
+ import { DatasetItemData, DatasetItemRendererActions, DatasetItemRendererProps, TraceData, TraceRendererActions, TraceRendererContext, TraceRendererProps } from "./shared/view-types.js";
2
+
1
3
  //#region src/types.d.ts
2
4
  interface KaizenConfig {
3
5
  customer: {
4
- slug: string;
5
6
  name: string;
6
7
  };
7
- langfuse?: {
8
- host?: string;
9
- publicKeyEnv?: string;
10
- secretKeyEnv?: string;
11
- };
12
- studio?: {
13
- port?: number;
14
- };
15
- }
16
- /** Data shape passed to custom trace renderers. */
17
- interface TraceData {
18
- id?: string;
19
- name?: string;
20
- tags?: string[];
21
- timestamp?: string;
22
- metadata?: unknown;
23
- input?: unknown;
24
- output?: unknown;
25
- }
26
- /** Props contract for custom trace renderer components. */
27
- interface TraceRendererProps {
28
- trace: TraceData;
29
- datasetItem?: {
30
- id: string;
31
- input?: unknown;
32
- expectedOutput?: unknown;
33
- metadata?: Record<string, unknown> | null;
34
- } | null;
35
8
  }
36
9
  //#endregion
37
- export { KaizenConfig, TraceData, TraceRendererProps };
10
+ export { type DatasetItemData, type DatasetItemRendererActions, type DatasetItemRendererProps, KaizenConfig, type TraceData, type TraceRendererActions, type TraceRendererContext, type TraceRendererProps };
38
11
  //# sourceMappingURL=types.d.ts.map
@@ -1 +1 @@
1
- {"version":3,"file":"types.d.ts","names":[],"sources":["../src/types.ts"],"mappings":";UAAiB,YAAA;EACf,QAAA;IACE,IAAA;IACA,IAAA;EAAA;EAEF,QAAA;IACE,IAAA;IACA,YAAA;IACA,YAAA;EAAA;EAEF,MAAA;IACE,IAAA;EAAA;AAAA;;UAKa,SAAA;EACf,EAAA;EACA,IAAA;EACA,IAAA;EACA,SAAA;EACA,QAAA;EACA,KAAA;EACA,MAAA;AAAA;;UAIe,kBAAA;EACf,KAAA,EAAO,SAAA;EACP,WAAA;IACE,EAAA;IACA,KAAA;IACA,cAAA;IACA,QAAA,GAAW,MAAA;EAAA;AAAA"}
1
+ {"version":3,"file":"types.d.ts","names":[],"sources":["../src/types.ts"],"mappings":";;;UAAiB,YAAA;EACf,QAAA;IACE,IAAA;EAAA;AAAA"}
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@percepta/kaizen",
3
- "version": "0.6.0",
3
+ "version": "0.7.0",
4
4
  "description": "Automated AI researcher that improves AI systems",
5
5
  "keywords": [
6
6
  "ai",
@@ -14,6 +14,7 @@
14
14
  "kaizen": "./bin/kaizen.js"
15
15
  },
16
16
  "files": [
17
+ "agent",
17
18
  "bin",
18
19
  "dashboard/.next/standalone",
19
20
  "dist",
@@ -71,8 +72,9 @@
71
72
  "typecheck:dashboard": "tsc -p dashboard --noEmit",
72
73
  "test": "vitest run",
73
74
  "dev": "tsx src/index.ts",
74
- "dev:studio": "KAIZEN_WORKSPACE=examples/legacy-workspace next dev dashboard --webpack --port 6789",
75
- "dev:next": "KAIZEN_WORKSPACE=examples/legacy-workspace next dev dashboard --webpack --port 6789",
75
+ "dev:studio": "KAIZEN_DEMO_MODE=1 KAIZEN_WORKSPACE=$PWD/examples/demo-workspace next dev dashboard --webpack --port 6789",
76
+ "dev:next": "KAIZEN_DEMO_MODE=1 KAIZEN_WORKSPACE=$PWD/examples/demo-workspace next dev dashboard --webpack --port 6789",
77
+ "seed:demo-data": "node scripts/seed-demo-traces.mjs --workspace $PWD/examples/demo-workspace",
76
78
  "kaizen": "tsx src/index.ts"
77
79
  }
78
80
  }
@@ -0,0 +1,69 @@
1
+ /** Data shape passed to custom trace renderers. */
2
+ export interface TraceData {
3
+ id?: string;
4
+ name?: string;
5
+ tags?: string[];
6
+ timestamp?: string;
7
+ metadata?: unknown;
8
+ input?: unknown;
9
+ output?: unknown;
10
+ [key: string]: unknown;
11
+ }
12
+
13
+ /** Props contract for custom trace renderer components. */
14
+ export interface TraceRendererProps {
15
+ trace: TraceData;
16
+ context: TraceRendererContext;
17
+ actions: TraceRendererActions;
18
+ }
19
+
20
+ export interface DatasetItemRendererProps {
21
+ datasetItem: DatasetItemData;
22
+ trace?: TraceData | null;
23
+ context: TraceRendererContext;
24
+ actions: DatasetItemRendererActions;
25
+ }
26
+
27
+ export interface DatasetItemData {
28
+ id: string;
29
+ input?: unknown;
30
+ expectedOutput?: unknown;
31
+ metadata?: Record<string, unknown> | null;
32
+ [key: string]: unknown;
33
+ }
34
+
35
+ export interface TraceRendererContext {
36
+ systemId: string;
37
+ surface: "trace" | "dataset-item" | "run-trace";
38
+ datasetName?: string | null;
39
+ runId?: string | null;
40
+ }
41
+
42
+ export interface TraceRendererActions {
43
+ createScore(input: {
44
+ traceId?: string;
45
+ name: string;
46
+ value: number | string | boolean;
47
+ comment?: string;
48
+ metadata?: Record<string, unknown>;
49
+ }): Promise<unknown>;
50
+ }
51
+
52
+ export interface DatasetItemRendererActions extends TraceRendererActions {
53
+ updateDatasetItem(input: {
54
+ datasetName?: string;
55
+ itemId?: string;
56
+ expectedOutput?: unknown;
57
+ metadata?: Record<string, unknown> | null;
58
+ input?: unknown;
59
+ sourceTraceId?: string | null;
60
+ status?: string | null;
61
+ }): Promise<unknown>;
62
+ createDatasetRunItem(input: {
63
+ datasetItemId?: string;
64
+ traceId?: string;
65
+ runName: string;
66
+ runDescription?: string;
67
+ metadata?: Record<string, unknown>;
68
+ }): Promise<unknown>;
69
+ }
@@ -0,0 +1 @@
1
+ export {};
@@ -0,0 +1,19 @@
1
+ export declare const KAIZEN_DIR = "kaizen";
2
+ export declare const KAIZEN_CONFIG = "config.ts";
3
+ export declare const KAIZEN_STATE_DIR = ".kaizen";
4
+ export declare const KAIZEN_SYSTEMS_DIR = "systems";
5
+
6
+ export declare function kaizenDir(workspaceRoot: string): string;
7
+ export declare function kaizenConfigPath(workspaceRoot: string): string;
8
+ export declare function kaizenSystemsDir(workspaceRoot: string): string;
9
+ export declare function kaizenSystemDir(
10
+ workspaceRoot: string,
11
+ systemId: string,
12
+ ): string;
13
+ export declare function kaizenSystemPath(
14
+ workspaceRoot: string,
15
+ systemId: string,
16
+ ): string;
17
+ export declare function defaultKaizenStateDir(workspaceRoot: string): string;
18
+ export declare function resolveKaizenStateDir(workspaceRoot: string): string;
19
+ export declare function primaryWorktreeRoot(workspaceRoot: string): string;
@@ -0,0 +1,84 @@
1
+ import { execFileSync } from "node:child_process";
2
+ import { realpathSync } from "node:fs";
3
+ import { join, resolve } from "node:path";
4
+
5
+ export const KAIZEN_DIR = "kaizen";
6
+ export const KAIZEN_CONFIG = "config.ts";
7
+ export const KAIZEN_STATE_DIR = ".kaizen";
8
+ export const KAIZEN_SYSTEMS_DIR = "systems";
9
+
10
+ export function kaizenDir(workspaceRoot) {
11
+ return join(workspaceRoot, KAIZEN_DIR);
12
+ }
13
+
14
+ export function kaizenConfigPath(workspaceRoot) {
15
+ return join(kaizenDir(workspaceRoot), KAIZEN_CONFIG);
16
+ }
17
+
18
+ export function kaizenSystemsDir(workspaceRoot) {
19
+ return join(kaizenDir(workspaceRoot), KAIZEN_SYSTEMS_DIR);
20
+ }
21
+
22
+ export function kaizenSystemDir(workspaceRoot, systemId) {
23
+ return join(kaizenSystemsDir(workspaceRoot), systemId);
24
+ }
25
+
26
+ export function kaizenSystemPath(workspaceRoot, systemId) {
27
+ return join(kaizenSystemDir(workspaceRoot, systemId), "system.md");
28
+ }
29
+
30
+ export function defaultKaizenStateDir(workspaceRoot) {
31
+ return join(primaryWorktreeRoot(workspaceRoot), KAIZEN_DIR, KAIZEN_STATE_DIR);
32
+ }
33
+
34
+ export function resolveKaizenStateDir(workspaceRoot) {
35
+ const raw = process.env.KAIZEN_STATE_DIR;
36
+ return raw
37
+ ? resolve(workspaceRoot, raw)
38
+ : defaultKaizenStateDir(workspaceRoot);
39
+ }
40
+
41
+ export function primaryWorktreeRoot(workspaceRoot) {
42
+ const normalizedWorkspace = canonicalPath(workspaceRoot);
43
+ const gitRoot = gitTopLevel(normalizedWorkspace);
44
+ if (!gitRoot || canonicalPath(gitRoot) !== normalizedWorkspace) {
45
+ return normalizedWorkspace;
46
+ }
47
+
48
+ try {
49
+ const out = execFileSync(
50
+ "git",
51
+ ["-C", normalizedWorkspace, "worktree", "list", "--porcelain"],
52
+ { encoding: "utf-8", stdio: ["ignore", "pipe", "ignore"] },
53
+ );
54
+ const first = out.split("\n").find((line) => line.startsWith("worktree "));
55
+ return first
56
+ ? canonicalPath(first.slice("worktree ".length))
57
+ : normalizedWorkspace;
58
+ } catch {
59
+ return normalizedWorkspace;
60
+ }
61
+ }
62
+
63
+ function gitTopLevel(workspaceRoot) {
64
+ try {
65
+ return execFileSync(
66
+ "git",
67
+ ["-C", workspaceRoot, "rev-parse", "--show-toplevel"],
68
+ {
69
+ encoding: "utf-8",
70
+ stdio: ["ignore", "pipe", "ignore"],
71
+ },
72
+ ).trim();
73
+ } catch {
74
+ return null;
75
+ }
76
+ }
77
+
78
+ function canonicalPath(path) {
79
+ try {
80
+ return realpathSync(path);
81
+ } catch {
82
+ return resolve(path);
83
+ }
84
+ }
@@ -7,7 +7,7 @@ well-formed event stream:
7
7
  {"type": "start", "n": <int>, "eval_version": <int>, "dataset_version": "<str>"}
8
8
  {"type": "item", "id": "<str>", "score": <float in [0,1]>, "breakdown": {...}, "trace_id": "<str|null>"}
9
9
  ... one item event per dataset item ...
10
- {"type": "complete", "score": <float>, "breakdown": {...}, "worst_traces": [...]}
10
+ {"type": "complete", "score": <float>, "n": <int>, "breakdown": {...}, "worst_traces": [...]}
11
11
 
12
12
  If something goes wrong, emit `{"type": "error", "message": "<str>"}` and exit non-zero.
13
13
  The supervisor will record the run as `crashed` if no `complete` event is seen.
@@ -19,8 +19,9 @@ For production evals backed by Langfuse, keep the NDJSON stream as the required
19
19
  Kaizen contract and also persist results back to Langfuse as a best-effort side
20
20
  effect: load the versioned dataset, run the system to create a fresh trace for
21
21
  each item, link that trace to the dataset item in a dataset run, and write the
22
- primary metric as a score on the trace. Include that fresh trace id in the
23
- Kaizen item event so failure analysis can jump directly to Langfuse.
22
+ primary metric as a score on the trace. Treat --dataset as the Langfuse dataset
23
+ name unless system.md says otherwise. Include that fresh trace id in the Kaizen
24
+ item event so failure analysis can jump directly to Langfuse.
24
25
  """
25
26
  from __future__ import annotations
26
27
 
@@ -41,8 +42,14 @@ def emit(out, event: dict[str, Any]) -> None:
41
42
  out.flush()
42
43
 
43
44
 
44
- def iter_items(dataset_version: str, max_items: int | None) -> Iterator[dict[str, Any]]:
45
- """Yield dataset items. Replace with your real loader."""
45
+ def load_items(dataset_name: str, max_items: int | None) -> Iterator[dict[str, Any]]:
46
+ """Yield dataset items.
47
+
48
+ Replace this with your real dataset loader. For Langfuse-backed evals, list
49
+ items from dataset_name and yield stable ids, inputs, expected outputs, and
50
+ any metadata your scorer needs.
51
+ """
52
+ del dataset_name
46
53
  items = [
47
54
  {"id": "demo-1", "input": "hello", "expected": "hi"},
48
55
  {"id": "demo-2", "input": "world", "expected": "world"},
@@ -87,7 +94,7 @@ def main() -> int:
87
94
  else:
88
95
  out = sys.stdout
89
96
 
90
- items = list(iter_items(args.dataset, args.max_items))
97
+ items = list(load_items(args.dataset, args.max_items))
91
98
  emit(out, {
92
99
  "type": "start",
93
100
  "n": len(items),
@@ -8,14 +8,16 @@
8
8
  * {"type":"start","n":<int>,"eval_version":<int>,"dataset_version":"<str>"}
9
9
  * {"type":"item","id":"<str>","score":<float in [0,1]>,"breakdown":{},"trace_id":"<str|null>"}
10
10
  * ... one item event per dataset item ...
11
- * {"type":"complete","score":<float>,"breakdown":{},"worst_traces":[]}
11
+ * {"type":"complete","score":<float>,"n":<int>,"breakdown":{},"worst_traces":[]}
12
12
  *
13
13
  * For Langfuse-backed production evals, keep the NDJSON stream as the required
14
14
  * Kaizen contract and also persist results back to Langfuse as a best-effort
15
15
  * side effect: load the versioned dataset, run the system to create a fresh
16
16
  * trace for each item, link that trace to the dataset item in a dataset run,
17
- * and write the primary metric as a score on the trace. Include that fresh
18
- * trace id in the Kaizen item event so failure analysis can jump to Langfuse.
17
+ * and write the primary metric as a score on the trace. Treat --dataset as the
18
+ * Langfuse dataset name unless your system.md says otherwise. Include each
19
+ * fresh trace id in the Kaizen item event so failure analysis can jump to
20
+ * Langfuse.
19
21
  */
20
22
  import { closeSync, writeSync } from "node:fs";
21
23
 
@@ -82,7 +84,11 @@ function parseArgs(argv: string[]): EvalArgs {
82
84
  return args;
83
85
  }
84
86
 
85
- function iterItems(maxItems: number | null): DemoItem[] {
87
+ function loadItems(datasetName: string, maxItems: number | null): DemoItem[] {
88
+ void datasetName;
89
+ // Replace this with your real dataset loader. For Langfuse-backed evals,
90
+ // list dataset items from --dataset and return objects with stable ids,
91
+ // inputs, expected outputs, and any metadata your scorer needs.
86
92
  const items = [
87
93
  { id: "demo-1", input: "hello", expected: "hi" },
88
94
  { id: "demo-2", input: "world", expected: "world" },
@@ -111,7 +117,7 @@ function average(values: number[]): number {
111
117
 
112
118
  async function main(): Promise<void> {
113
119
  const args = parseArgs(process.argv.slice(2));
114
- const items = iterItems(args.maxItems);
120
+ const items = loadItems(args.dataset, args.maxItems);
115
121
 
116
122
  emit(args.outFd, {
117
123
  type: "start",
@@ -25,4 +25,4 @@ The judge is itself an LLM system. Calibrate it against human labels until they
25
25
 
26
26
  ## Calibration
27
27
 
28
- Run the judge on 30 items that also have human labels. Iterate the rubric/prompt until agreement ≥ 95%. Track in `.kaizen/runs/<system>/<judge_run_id>/`.
28
+ Run the judge on 30 items that also have human labels. Iterate the rubric/prompt until agreement ≥ 95%. Track in `kaizen/.kaizen/runs/<system>/<judge_run_id>/`.
@@ -1,13 +1,13 @@
1
1
  ---
2
2
  name: {{name}}
3
3
  description: {{description}}
4
- run_eval: eval/{{name}}.{{eval_ext}}
4
+ run_eval: kaizen/systems/{{name}}/{{eval_file}}
5
5
  eval_version: 1
6
6
  dataset_version: v1
7
7
  eval_style: {{eval_style}}
8
8
  primary_metric: {{primary_metric}}
9
9
  target: {{target}}
10
- execution_mode: in_process
10
+ {{rubric_frontmatter}}
11
11
  # Optional: stable Linear project URL or ID for Kaizen Ideas.
12
12
  # linear_project: https://linear.app/<workspace>/project/<project-slug>
13
13
  created_at: {{iso_now}}
@@ -21,7 +21,7 @@ created_at: {{iso_now}}
21
21
 
22
22
  ## Key files
23
23
 
24
- <!-- Paths in this repo that a variant-builder agent needs to read to understand the system. -->
24
+ <!-- Paths in this repo that a coding agent needs to read to understand the system. -->
25
25
 
26
26
  - `path/to/main_workflow.py` — orchestrator
27
27
  - `path/to/prompts.py` — prompt templates
@@ -29,7 +29,7 @@ created_at: {{iso_now}}
29
29
 
30
30
  ## Setup
31
31
 
32
- <!-- What does the runner or variant-builder agent need before invoking the eval? E.g.:
32
+ <!-- What does the runner or coding agent need before invoking the eval? E.g.:
33
33
  - start servers
34
34
  - install deps
35
35
  - set env vars
@@ -39,6 +39,7 @@ created_at: {{iso_now}}
39
39
 
40
40
  <!-- For Langfuse-backed production evals:
41
41
  - Load dataset items from the `dataset_version` named in frontmatter.
42
+ - Treat `dataset_version` as the Langfuse dataset name unless this section says otherwise.
42
43
  - For each item, run the candidate system and capture the fresh Langfuse trace id.
43
44
  - Link the dataset item to that trace in a Langfuse dataset run named for the Kaizen run.
44
45
  - Write the primary metric as a Langfuse score on the fresh trace, with secondary metrics in metadata.
@@ -63,6 +64,6 @@ graph TD
63
64
 
64
65
  ## Variant candidates
65
66
 
66
- <!-- A scratchpad of ideas to try. The /kaizen skill reads this to seed variant generation. -->
67
+ <!-- A scratchpad of ideas to try. Coding agents read this to seed variant generation. -->
67
68
 
68
69
  - _none yet_
@@ -0,0 +1,63 @@
1
+ import { useState } from "react";
2
+ import type { DatasetItemRendererProps } from "@percepta/kaizen";
3
+
4
+ export default function DatasetItemView({
5
+ datasetItem,
6
+ trace,
7
+ actions,
8
+ }: DatasetItemRendererProps) {
9
+ const [expectedOutput, setExpectedOutput] = useState(() =>
10
+ JSON.stringify(datasetItem.expectedOutput ?? null, null, 2),
11
+ );
12
+ const [status, setStatus] = useState<"idle" | "saving" | "saved" | "error">(
13
+ "idle",
14
+ );
15
+
16
+ async function saveExpectedOutput() {
17
+ setStatus("saving");
18
+ try {
19
+ await actions.updateDatasetItem({
20
+ expectedOutput: JSON.parse(expectedOutput),
21
+ });
22
+ setStatus("saved");
23
+ } catch {
24
+ setStatus("error");
25
+ }
26
+ }
27
+
28
+ return (
29
+ <main>
30
+ <h2>{datasetItem.id}</h2>
31
+ <section>
32
+ <h3>Expected output</h3>
33
+ <textarea
34
+ value={expectedOutput}
35
+ onChange={(event) => {
36
+ setExpectedOutput(event.target.value);
37
+ setStatus("idle");
38
+ }}
39
+ rows={10}
40
+ style={{ width: "100%" }}
41
+ />
42
+ <button type="button" onClick={saveExpectedOutput}>
43
+ Save label
44
+ </button>
45
+ {status !== "idle" ? <span>{status}</span> : null}
46
+ </section>
47
+ <section>
48
+ <h3>Input</h3>
49
+ <pre>{JSON.stringify(datasetItem.input, null, 2)}</pre>
50
+ </section>
51
+ <section>
52
+ <h3>Metadata</h3>
53
+ <pre>{JSON.stringify(datasetItem.metadata, null, 2)}</pre>
54
+ </section>
55
+ {trace ? (
56
+ <section>
57
+ <h3>Source trace</h3>
58
+ <pre>{JSON.stringify(trace, null, 2)}</pre>
59
+ </section>
60
+ ) : null}
61
+ </main>
62
+ );
63
+ }
@@ -0,0 +1,10 @@
1
+ import type { TraceRendererProps } from "@percepta/kaizen";
2
+
3
+ export default function TraceView({ trace }: TraceRendererProps) {
4
+ return (
5
+ <main>
6
+ <h2>{trace.name ?? trace.id ?? "Trace"}</h2>
7
+ <pre>{JSON.stringify(trace, null, 2)}</pre>
8
+ </main>
9
+ );
10
+ }