@percepta/kaizen 0.6.0 → 0.7.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +54 -126
- package/agent/claude-command.md +23 -0
- package/agent/evals.md +41 -0
- package/agent/overview.md +53 -0
- package/agent/variant-builder.md +22 -0
- package/agent/views.md +51 -0
- package/dashboard/.next/standalone/packages/kaizen/dashboard/.next/BUILD_ID +1 -1
- package/dashboard/.next/standalone/packages/kaizen/dashboard/.next/build-manifest.json +22 -22
- package/dashboard/.next/standalone/packages/kaizen/dashboard/.next/prerender-manifest.json +3 -3
- package/dashboard/.next/standalone/packages/kaizen/dashboard/.next/routes-manifest.json +30 -10
- package/dashboard/.next/standalone/packages/kaizen/dashboard/.next/server/chunks/27.js +1 -0
- package/dashboard/.next/standalone/packages/kaizen/dashboard/.next/server/chunks/516.js +8 -0
- package/dashboard/.next/standalone/packages/kaizen/dashboard/.next/server/chunks/913.js +1 -0
- package/dashboard/.next/standalone/packages/kaizen/dashboard/.next/server/middleware-build-manifest.js +1 -1
- package/dashboard/.next/standalone/packages/kaizen/dashboard/.next/server/pages/404.html +1 -1
- package/dashboard/.next/standalone/packages/kaizen/dashboard/.next/server/pages/500.html +1 -1
- package/dashboard/.next/standalone/packages/kaizen/dashboard/.next/server/pages/[system]/benchmarks.html +1 -1
- package/dashboard/.next/standalone/packages/kaizen/dashboard/.next/server/pages/[system]/benchmarks.js.nft.json +1 -1
- package/dashboard/.next/standalone/packages/kaizen/dashboard/.next/server/pages/[system]/data/[[...path]].html +1 -0
- package/dashboard/.next/standalone/packages/kaizen/dashboard/.next/server/pages/[system]/data/[[...path]].js.nft.json +1 -0
- package/dashboard/.next/standalone/packages/kaizen/dashboard/.next/server/pages/[system]/eval.html +1 -1
- package/dashboard/.next/standalone/packages/kaizen/dashboard/.next/server/pages/[system]/eval.js.nft.json +1 -1
- package/dashboard/.next/standalone/packages/kaizen/dashboard/.next/server/pages/[system]/experiments/[[...path]].html +1 -0
- package/dashboard/.next/standalone/packages/kaizen/dashboard/.next/server/pages/[system]/experiments/[[...path]].js.nft.json +1 -0
- package/dashboard/.next/standalone/packages/kaizen/dashboard/.next/server/pages/[system]/ideas.html +1 -1
- package/dashboard/.next/standalone/packages/kaizen/dashboard/.next/server/pages/[system]/ideas.js.nft.json +1 -1
- package/dashboard/.next/standalone/packages/kaizen/dashboard/.next/server/pages/api/langfuse-action.js +1 -0
- package/dashboard/.next/standalone/packages/kaizen/dashboard/.next/server/pages/api/langfuse-action.js.nft.json +1 -0
- package/dashboard/.next/standalone/packages/kaizen/dashboard/.next/server/pages/api/langfuse-dataset-item.js +1 -1
- package/dashboard/.next/standalone/packages/kaizen/dashboard/.next/server/pages/api/langfuse-dataset-item.js.nft.json +1 -1
- package/dashboard/.next/standalone/packages/kaizen/dashboard/.next/server/pages/api/langfuse-dataset-mutation.js +1 -0
- package/dashboard/.next/standalone/packages/kaizen/dashboard/.next/server/pages/api/langfuse-dataset-mutation.js.nft.json +1 -0
- package/dashboard/.next/standalone/packages/kaizen/dashboard/.next/server/pages/api/langfuse-dataset.js +1 -1
- package/dashboard/.next/standalone/packages/kaizen/dashboard/.next/server/pages/api/langfuse-dataset.js.nft.json +1 -1
- package/dashboard/.next/standalone/packages/kaizen/dashboard/.next/server/pages/api/langfuse-datasets.js +1 -1
- package/dashboard/.next/standalone/packages/kaizen/dashboard/.next/server/pages/api/langfuse-datasets.js.nft.json +1 -1
- package/dashboard/.next/standalone/packages/kaizen/dashboard/.next/server/pages/api/langfuse-trace.js +1 -1
- package/dashboard/.next/standalone/packages/kaizen/dashboard/.next/server/pages/api/langfuse-trace.js.nft.json +1 -1
- package/dashboard/.next/standalone/packages/kaizen/dashboard/.next/server/pages/api/langfuse-traces.js +1 -0
- package/dashboard/.next/standalone/packages/kaizen/dashboard/.next/server/pages/api/langfuse-traces.js.nft.json +1 -0
- package/dashboard/.next/standalone/packages/kaizen/dashboard/.next/server/pages/api/linear-ideas.js +2 -2
- package/dashboard/.next/standalone/packages/kaizen/dashboard/.next/server/pages/api/linear-ideas.js.nft.json +1 -1
- package/dashboard/.next/standalone/packages/kaizen/dashboard/.next/server/pages/api/run-events.js +1 -1
- package/dashboard/.next/standalone/packages/kaizen/dashboard/.next/server/pages/api/run-events.js.nft.json +1 -1
- package/dashboard/.next/standalone/packages/kaizen/dashboard/.next/server/pages/api/run-failures.js +1 -1
- package/dashboard/.next/standalone/packages/kaizen/dashboard/.next/server/pages/api/run-failures.js.nft.json +1 -1
- package/dashboard/.next/standalone/packages/kaizen/dashboard/.next/server/pages/api/run-traces.js +1 -1
- package/dashboard/.next/standalone/packages/kaizen/dashboard/.next/server/pages/api/run-traces.js.nft.json +1 -1
- package/dashboard/.next/standalone/packages/kaizen/dashboard/.next/server/pages/api/runs.js +2 -2
- package/dashboard/.next/standalone/packages/kaizen/dashboard/.next/server/pages/api/runs.js.nft.json +1 -1
- package/dashboard/.next/standalone/packages/kaizen/dashboard/.next/server/pages/api/systems.js +2 -2
- package/dashboard/.next/standalone/packages/kaizen/dashboard/.next/server/pages/api/systems.js.nft.json +1 -1
- package/dashboard/.next/standalone/packages/kaizen/dashboard/.next/server/pages/api/trace-renderer.js +1 -1
- package/dashboard/.next/standalone/packages/kaizen/dashboard/.next/server/pages/api/trace-renderer.js.nft.json +1 -1
- package/dashboard/.next/standalone/packages/kaizen/dashboard/.next/server/pages/index.html +1 -1
- package/dashboard/.next/standalone/packages/kaizen/dashboard/.next/server/pages/index.js.nft.json +1 -1
- package/dashboard/.next/standalone/packages/kaizen/dashboard/.next/server/pages-manifest.json +8 -5
- package/dashboard/.next/standalone/packages/kaizen/dashboard/.next/static/SCF0o7YxElB9rzWaOohsA/_buildManifest.js +1 -0
- package/dashboard/.next/standalone/packages/kaizen/dashboard/.next/static/chunks/253-85c76c34f33c9604.js +8 -0
- package/dashboard/.next/standalone/packages/kaizen/dashboard/.next/static/chunks/pages/[system]/{benchmarks-559dc9df52db3af4.js → benchmarks-30a17b7659010b8c.js} +1 -1
- package/dashboard/.next/standalone/packages/kaizen/dashboard/.next/static/chunks/pages/[system]/data/[[...path]]-e5f4083fe9ffe429.js +1 -0
- package/dashboard/.next/standalone/packages/kaizen/dashboard/.next/static/chunks/pages/[system]/{eval-3c911ea8744631fd.js → eval-160237a604b47416.js} +1 -1
- package/dashboard/.next/standalone/packages/kaizen/dashboard/.next/static/chunks/pages/[system]/experiments/[[...path]]-91e47a4893093600.js +1 -0
- package/dashboard/.next/standalone/packages/kaizen/dashboard/.next/static/chunks/pages/[system]/{ideas-6829a271003150a9.js → ideas-96e58e4624952e26.js} +1 -1
- package/dashboard/.next/standalone/packages/kaizen/dashboard/.next/static/chunks/pages/{index-1d8b6719f49e4ae0.js → index-d3306bb6f5d7d235.js} +1 -1
- package/dashboard/.next/standalone/packages/kaizen/dashboard/.next/static/css/cd3873236eb77caa.css +1 -0
- package/dashboard/.next/standalone/packages/kaizen/package.json +5 -3
- package/dashboard/.next/standalone/packages/kaizen/shared/workspace-paths.js +84 -0
- package/dist/commands/create-view.js +58 -0
- package/dist/commands/create-view.js.map +1 -0
- package/dist/commands/guide.js +66 -0
- package/dist/commands/guide.js.map +1 -0
- package/dist/commands/ideas.js +4 -8
- package/dist/commands/ideas.js.map +1 -1
- package/dist/commands/init-system.js +22 -20
- package/dist/commands/init-system.js.map +1 -1
- package/dist/commands/init.js +28 -64
- package/dist/commands/init.js.map +1 -1
- package/dist/commands/log.js +5 -11
- package/dist/commands/log.js.map +1 -1
- package/dist/commands/rebuild.js +7 -9
- package/dist/commands/rebuild.js.map +1 -1
- package/dist/commands/run.js +5 -9
- package/dist/commands/run.js.map +1 -1
- package/dist/commands/studio.js +3 -3
- package/dist/commands/studio.js.map +1 -1
- package/dist/index.js +17 -21
- package/dist/index.js.map +1 -1
- package/dist/lib/cli.js +20 -0
- package/dist/lib/cli.js.map +1 -0
- package/dist/lib/events.js.map +1 -1
- package/dist/lib/fs-utils.js +3 -27
- package/dist/lib/fs-utils.js.map +1 -1
- package/dist/lib/leaderboard.js +1 -1
- package/dist/lib/leaderboard.js.map +1 -1
- package/dist/lib/paths.js +3 -3
- package/dist/lib/paths.js.map +1 -1
- package/dist/lib/promotion.js.map +1 -1
- package/dist/lib/run-dir.js +1 -1
- package/dist/lib/run-dir.js.map +1 -1
- package/dist/lib/runner.js +6 -5
- package/dist/lib/runner.js.map +1 -1
- package/dist/lib/system.js +4 -2
- package/dist/lib/system.js.map +1 -1
- package/dist/package.js +5 -3
- package/dist/shared/view-types.d.ts +67 -0
- package/dist/shared/view-types.d.ts.map +1 -0
- package/dist/shared/workspace-paths.js +84 -0
- package/dist/shared/workspace-paths.js.map +1 -0
- package/dist/types.d.ts +3 -30
- package/dist/types.d.ts.map +1 -1
- package/package.json +5 -3
- package/shared/view-types.d.ts +69 -0
- package/shared/view-types.js +1 -0
- package/shared/workspace-paths.d.ts +19 -0
- package/shared/workspace-paths.js +84 -0
- package/templates/system/eval.py +13 -6
- package/templates/system/eval.ts +11 -5
- package/templates/system/rubric.md +1 -1
- package/templates/system/system.md +6 -5
- package/templates/view/dataset-item.tsx +63 -0
- package/templates/view/trace.tsx +10 -0
- package/dashboard/.next/standalone/packages/kaizen/dashboard/.next/server/chunks/715.js +0 -6
- package/dashboard/.next/standalone/packages/kaizen/dashboard/.next/server/pages/[system]/data.html +0 -1
- package/dashboard/.next/standalone/packages/kaizen/dashboard/.next/server/pages/[system]/data.js.nft.json +0 -1
- package/dashboard/.next/standalone/packages/kaizen/dashboard/.next/server/pages/[system]/experiments.html +0 -1
- package/dashboard/.next/standalone/packages/kaizen/dashboard/.next/server/pages/[system]/experiments.js.nft.json +0 -1
- package/dashboard/.next/standalone/packages/kaizen/dashboard/.next/static/YpQ-I4VL-aEdQrM5uN7_3/_buildManifest.js +0 -1
- package/dashboard/.next/standalone/packages/kaizen/dashboard/.next/static/chunks/673-ed4be46027ae7a37.js +0 -6
- package/dashboard/.next/standalone/packages/kaizen/dashboard/.next/static/chunks/pages/[system]/data-644e4280b4c86fe0.js +0 -1
- package/dashboard/.next/standalone/packages/kaizen/dashboard/.next/static/chunks/pages/[system]/experiments-42f31600c2bb47ad.js +0 -1
- package/dashboard/.next/standalone/packages/kaizen/dashboard/.next/static/css/b18a6732b96168e1.css +0 -1
- package/dist/lib/env.js +0 -2
- package/dist/shared/env.js +0 -4
- package/templates/workspace/.claude/agents/variant-builder.md +0 -51
- package/templates/workspace/.claude/commands/kaizen.md +0 -65
- /package/dashboard/.next/standalone/packages/kaizen/dashboard/.next/static/{YpQ-I4VL-aEdQrM5uN7_3 → SCF0o7YxElB9rzWaOohsA}/_ssgManifest.js +0 -0
package/dist/types.d.ts
CHANGED
|
@@ -1,38 +1,11 @@
|
|
|
1
|
+
import { DatasetItemData, DatasetItemRendererActions, DatasetItemRendererProps, TraceData, TraceRendererActions, TraceRendererContext, TraceRendererProps } from "./shared/view-types.js";
|
|
2
|
+
|
|
1
3
|
//#region src/types.d.ts
|
|
2
4
|
interface KaizenConfig {
|
|
3
5
|
customer: {
|
|
4
|
-
slug: string;
|
|
5
6
|
name: string;
|
|
6
7
|
};
|
|
7
|
-
langfuse?: {
|
|
8
|
-
host?: string;
|
|
9
|
-
publicKeyEnv?: string;
|
|
10
|
-
secretKeyEnv?: string;
|
|
11
|
-
};
|
|
12
|
-
studio?: {
|
|
13
|
-
port?: number;
|
|
14
|
-
};
|
|
15
|
-
}
|
|
16
|
-
/** Data shape passed to custom trace renderers. */
|
|
17
|
-
interface TraceData {
|
|
18
|
-
id?: string;
|
|
19
|
-
name?: string;
|
|
20
|
-
tags?: string[];
|
|
21
|
-
timestamp?: string;
|
|
22
|
-
metadata?: unknown;
|
|
23
|
-
input?: unknown;
|
|
24
|
-
output?: unknown;
|
|
25
|
-
}
|
|
26
|
-
/** Props contract for custom trace renderer components. */
|
|
27
|
-
interface TraceRendererProps {
|
|
28
|
-
trace: TraceData;
|
|
29
|
-
datasetItem?: {
|
|
30
|
-
id: string;
|
|
31
|
-
input?: unknown;
|
|
32
|
-
expectedOutput?: unknown;
|
|
33
|
-
metadata?: Record<string, unknown> | null;
|
|
34
|
-
} | null;
|
|
35
8
|
}
|
|
36
9
|
//#endregion
|
|
37
|
-
export { KaizenConfig, TraceData, TraceRendererProps };
|
|
10
|
+
export { type DatasetItemData, type DatasetItemRendererActions, type DatasetItemRendererProps, KaizenConfig, type TraceData, type TraceRendererActions, type TraceRendererContext, type TraceRendererProps };
|
|
38
11
|
//# sourceMappingURL=types.d.ts.map
|
package/dist/types.d.ts.map
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"types.d.ts","names":[],"sources":["../src/types.ts"],"mappings":"
|
|
1
|
+
{"version":3,"file":"types.d.ts","names":[],"sources":["../src/types.ts"],"mappings":";;;UAAiB,YAAA;EACf,QAAA;IACE,IAAA;EAAA;AAAA"}
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@percepta/kaizen",
|
|
3
|
-
"version": "0.6.0",
|
|
3
|
+
"version": "0.7.0",
|
|
4
4
|
"description": "Automated AI researcher that improves AI systems",
|
|
5
5
|
"keywords": [
|
|
6
6
|
"ai",
|
|
@@ -14,6 +14,7 @@
|
|
|
14
14
|
"kaizen": "./bin/kaizen.js"
|
|
15
15
|
},
|
|
16
16
|
"files": [
|
|
17
|
+
"agent",
|
|
17
18
|
"bin",
|
|
18
19
|
"dashboard/.next/standalone",
|
|
19
20
|
"dist",
|
|
@@ -71,8 +72,9 @@
|
|
|
71
72
|
"typecheck:dashboard": "tsc -p dashboard --noEmit",
|
|
72
73
|
"test": "vitest run",
|
|
73
74
|
"dev": "tsx src/index.ts",
|
|
74
|
-
"dev:studio": "KAIZEN_WORKSPACE
|
|
75
|
-
"dev:next": "KAIZEN_WORKSPACE
|
|
75
|
+
"dev:studio": "KAIZEN_DEMO_MODE=1 KAIZEN_WORKSPACE=$PWD/examples/demo-workspace next dev dashboard --webpack --port 6789",
|
|
76
|
+
"dev:next": "KAIZEN_DEMO_MODE=1 KAIZEN_WORKSPACE=$PWD/examples/demo-workspace next dev dashboard --webpack --port 6789",
|
|
77
|
+
"seed:demo-data": "node scripts/seed-demo-traces.mjs --workspace $PWD/examples/demo-workspace",
|
|
76
78
|
"kaizen": "tsx src/index.ts"
|
|
77
79
|
}
|
|
78
80
|
}
|
|
package/shared/view-types.d.ts
ADDED
|
@@ -0,0 +1,69 @@
|
|
|
1
|
+
/** Data shape passed to custom trace renderers. */
|
|
2
|
+
export interface TraceData {
|
|
3
|
+
id?: string;
|
|
4
|
+
name?: string;
|
|
5
|
+
tags?: string[];
|
|
6
|
+
timestamp?: string;
|
|
7
|
+
metadata?: unknown;
|
|
8
|
+
input?: unknown;
|
|
9
|
+
output?: unknown;
|
|
10
|
+
[key: string]: unknown;
|
|
11
|
+
}
|
|
12
|
+
|
|
13
|
+
/** Props contract for custom trace renderer components. */
|
|
14
|
+
export interface TraceRendererProps {
|
|
15
|
+
trace: TraceData;
|
|
16
|
+
context: TraceRendererContext;
|
|
17
|
+
actions: TraceRendererActions;
|
|
18
|
+
}
|
|
19
|
+
|
|
20
|
+
export interface DatasetItemRendererProps {
|
|
21
|
+
datasetItem: DatasetItemData;
|
|
22
|
+
trace?: TraceData | null;
|
|
23
|
+
context: TraceRendererContext;
|
|
24
|
+
actions: DatasetItemRendererActions;
|
|
25
|
+
}
|
|
26
|
+
|
|
27
|
+
export interface DatasetItemData {
|
|
28
|
+
id: string;
|
|
29
|
+
input?: unknown;
|
|
30
|
+
expectedOutput?: unknown;
|
|
31
|
+
metadata?: Record<string, unknown> | null;
|
|
32
|
+
[key: string]: unknown;
|
|
33
|
+
}
|
|
34
|
+
|
|
35
|
+
export interface TraceRendererContext {
|
|
36
|
+
systemId: string;
|
|
37
|
+
surface: "trace" | "dataset-item" | "run-trace";
|
|
38
|
+
datasetName?: string | null;
|
|
39
|
+
runId?: string | null;
|
|
40
|
+
}
|
|
41
|
+
|
|
42
|
+
export interface TraceRendererActions {
|
|
43
|
+
createScore(input: {
|
|
44
|
+
traceId?: string;
|
|
45
|
+
name: string;
|
|
46
|
+
value: number | string | boolean;
|
|
47
|
+
comment?: string;
|
|
48
|
+
metadata?: Record<string, unknown>;
|
|
49
|
+
}): Promise<unknown>;
|
|
50
|
+
}
|
|
51
|
+
|
|
52
|
+
export interface DatasetItemRendererActions extends TraceRendererActions {
|
|
53
|
+
updateDatasetItem(input: {
|
|
54
|
+
datasetName?: string;
|
|
55
|
+
itemId?: string;
|
|
56
|
+
expectedOutput?: unknown;
|
|
57
|
+
metadata?: Record<string, unknown> | null;
|
|
58
|
+
input?: unknown;
|
|
59
|
+
sourceTraceId?: string | null;
|
|
60
|
+
status?: string | null;
|
|
61
|
+
}): Promise<unknown>;
|
|
62
|
+
createDatasetRunItem(input: {
|
|
63
|
+
datasetItemId?: string;
|
|
64
|
+
traceId?: string;
|
|
65
|
+
runName: string;
|
|
66
|
+
runDescription?: string;
|
|
67
|
+
metadata?: Record<string, unknown>;
|
|
68
|
+
}): Promise<unknown>;
|
|
69
|
+
}
|
|
package/shared/view-types.js
ADDED
|
@@ -0,0 +1 @@
|
|
|
1
|
+
export {};
|
|
package/shared/workspace-paths.d.ts
ADDED
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
export declare const KAIZEN_DIR = "kaizen";
|
|
2
|
+
export declare const KAIZEN_CONFIG = "config.ts";
|
|
3
|
+
export declare const KAIZEN_STATE_DIR = ".kaizen";
|
|
4
|
+
export declare const KAIZEN_SYSTEMS_DIR = "systems";
|
|
5
|
+
|
|
6
|
+
export declare function kaizenDir(workspaceRoot: string): string;
|
|
7
|
+
export declare function kaizenConfigPath(workspaceRoot: string): string;
|
|
8
|
+
export declare function kaizenSystemsDir(workspaceRoot: string): string;
|
|
9
|
+
export declare function kaizenSystemDir(
|
|
10
|
+
workspaceRoot: string,
|
|
11
|
+
systemId: string,
|
|
12
|
+
): string;
|
|
13
|
+
export declare function kaizenSystemPath(
|
|
14
|
+
workspaceRoot: string,
|
|
15
|
+
systemId: string,
|
|
16
|
+
): string;
|
|
17
|
+
export declare function defaultKaizenStateDir(workspaceRoot: string): string;
|
|
18
|
+
export declare function resolveKaizenStateDir(workspaceRoot: string): string;
|
|
19
|
+
export declare function primaryWorktreeRoot(workspaceRoot: string): string;
|
|
package/shared/workspace-paths.js
ADDED
|
@@ -0,0 +1,84 @@
|
|
|
1
|
+
import { execFileSync } from "node:child_process";
|
|
2
|
+
import { realpathSync } from "node:fs";
|
|
3
|
+
import { join, resolve } from "node:path";
|
|
4
|
+
|
|
5
|
+
export const KAIZEN_DIR = "kaizen";
|
|
6
|
+
export const KAIZEN_CONFIG = "config.ts";
|
|
7
|
+
export const KAIZEN_STATE_DIR = ".kaizen";
|
|
8
|
+
export const KAIZEN_SYSTEMS_DIR = "systems";
|
|
9
|
+
|
|
10
|
+
export function kaizenDir(workspaceRoot) {
|
|
11
|
+
return join(workspaceRoot, KAIZEN_DIR);
|
|
12
|
+
}
|
|
13
|
+
|
|
14
|
+
export function kaizenConfigPath(workspaceRoot) {
|
|
15
|
+
return join(kaizenDir(workspaceRoot), KAIZEN_CONFIG);
|
|
16
|
+
}
|
|
17
|
+
|
|
18
|
+
export function kaizenSystemsDir(workspaceRoot) {
|
|
19
|
+
return join(kaizenDir(workspaceRoot), KAIZEN_SYSTEMS_DIR);
|
|
20
|
+
}
|
|
21
|
+
|
|
22
|
+
export function kaizenSystemDir(workspaceRoot, systemId) {
|
|
23
|
+
return join(kaizenSystemsDir(workspaceRoot), systemId);
|
|
24
|
+
}
|
|
25
|
+
|
|
26
|
+
export function kaizenSystemPath(workspaceRoot, systemId) {
|
|
27
|
+
return join(kaizenSystemDir(workspaceRoot, systemId), "system.md");
|
|
28
|
+
}
|
|
29
|
+
|
|
30
|
+
export function defaultKaizenStateDir(workspaceRoot) {
|
|
31
|
+
return join(primaryWorktreeRoot(workspaceRoot), KAIZEN_DIR, KAIZEN_STATE_DIR);
|
|
32
|
+
}
|
|
33
|
+
|
|
34
|
+
export function resolveKaizenStateDir(workspaceRoot) {
|
|
35
|
+
const raw = process.env.KAIZEN_STATE_DIR;
|
|
36
|
+
return raw
|
|
37
|
+
? resolve(workspaceRoot, raw)
|
|
38
|
+
: defaultKaizenStateDir(workspaceRoot);
|
|
39
|
+
}
|
|
40
|
+
|
|
41
|
+
export function primaryWorktreeRoot(workspaceRoot) {
|
|
42
|
+
const normalizedWorkspace = canonicalPath(workspaceRoot);
|
|
43
|
+
const gitRoot = gitTopLevel(normalizedWorkspace);
|
|
44
|
+
if (!gitRoot || canonicalPath(gitRoot) !== normalizedWorkspace) {
|
|
45
|
+
return normalizedWorkspace;
|
|
46
|
+
}
|
|
47
|
+
|
|
48
|
+
try {
|
|
49
|
+
const out = execFileSync(
|
|
50
|
+
"git",
|
|
51
|
+
["-C", normalizedWorkspace, "worktree", "list", "--porcelain"],
|
|
52
|
+
{ encoding: "utf-8", stdio: ["ignore", "pipe", "ignore"] },
|
|
53
|
+
);
|
|
54
|
+
const first = out.split("\n").find((line) => line.startsWith("worktree "));
|
|
55
|
+
return first
|
|
56
|
+
? canonicalPath(first.slice("worktree ".length))
|
|
57
|
+
: normalizedWorkspace;
|
|
58
|
+
} catch {
|
|
59
|
+
return normalizedWorkspace;
|
|
60
|
+
}
|
|
61
|
+
}
|
|
62
|
+
|
|
63
|
+
function gitTopLevel(workspaceRoot) {
|
|
64
|
+
try {
|
|
65
|
+
return execFileSync(
|
|
66
|
+
"git",
|
|
67
|
+
["-C", workspaceRoot, "rev-parse", "--show-toplevel"],
|
|
68
|
+
{
|
|
69
|
+
encoding: "utf-8",
|
|
70
|
+
stdio: ["ignore", "pipe", "ignore"],
|
|
71
|
+
},
|
|
72
|
+
).trim();
|
|
73
|
+
} catch {
|
|
74
|
+
return null;
|
|
75
|
+
}
|
|
76
|
+
}
|
|
77
|
+
|
|
78
|
+
function canonicalPath(path) {
|
|
79
|
+
try {
|
|
80
|
+
return realpathSync(path);
|
|
81
|
+
} catch {
|
|
82
|
+
return resolve(path);
|
|
83
|
+
}
|
|
84
|
+
}
|
package/templates/system/eval.py
CHANGED
|
@@ -7,7 +7,7 @@ well-formed event stream:
|
|
|
7
7
|
{"type": "start", "n": <int>, "eval_version": <int>, "dataset_version": "<str>"}
|
|
8
8
|
{"type": "item", "id": "<str>", "score": <float in [0,1]>, "breakdown": {...}, "trace_id": "<str|null>"}
|
|
9
9
|
... one item event per dataset item ...
|
|
10
|
-
{"type": "complete", "score": <float>, "breakdown": {...}, "worst_traces": [...]}
|
|
10
|
+
{"type": "complete", "score": <float>, "n": <int>, "breakdown": {...}, "worst_traces": [...]}
|
|
11
11
|
|
|
12
12
|
If something goes wrong, emit `{"type": "error", "message": "<str>"}` and exit non-zero.
|
|
13
13
|
The supervisor will record the run as `crashed` if no `complete` event is seen.
|
|
@@ -19,8 +19,9 @@ For production evals backed by Langfuse, keep the NDJSON stream as the required
|
|
|
19
19
|
Kaizen contract and also persist results back to Langfuse as a best-effort side
|
|
20
20
|
effect: load the versioned dataset, run the system to create a fresh trace for
|
|
21
21
|
each item, link that trace to the dataset item in a dataset run, and write the
|
|
22
|
-
primary metric as a score on the trace.
|
|
23
|
-
|
|
22
|
+
primary metric as a score on the trace. Treat --dataset as the Langfuse dataset
|
|
23
|
+
name unless system.md says otherwise. Include that fresh trace id in the Kaizen
|
|
24
|
+
item event so failure analysis can jump directly to Langfuse.
|
|
24
25
|
"""
|
|
25
26
|
from __future__ import annotations
|
|
26
27
|
|
|
@@ -41,8 +42,14 @@ def emit(out, event: dict[str, Any]) -> None:
|
|
|
41
42
|
out.flush()
|
|
42
43
|
|
|
43
44
|
|
|
44
|
-
def
|
|
45
|
-
"""Yield dataset items.
|
|
45
|
+
def load_items(dataset_name: str, max_items: int | None) -> Iterator[dict[str, Any]]:
|
|
46
|
+
"""Yield dataset items.
|
|
47
|
+
|
|
48
|
+
Replace this with your real dataset loader. For Langfuse-backed evals, list
|
|
49
|
+
items from dataset_name and yield stable ids, inputs, expected outputs, and
|
|
50
|
+
any metadata your scorer needs.
|
|
51
|
+
"""
|
|
52
|
+
del dataset_name
|
|
46
53
|
items = [
|
|
47
54
|
{"id": "demo-1", "input": "hello", "expected": "hi"},
|
|
48
55
|
{"id": "demo-2", "input": "world", "expected": "world"},
|
|
@@ -87,7 +94,7 @@ def main() -> int:
|
|
|
87
94
|
else:
|
|
88
95
|
out = sys.stdout
|
|
89
96
|
|
|
90
|
-
items = list(
|
|
97
|
+
items = list(load_items(args.dataset, args.max_items))
|
|
91
98
|
emit(out, {
|
|
92
99
|
"type": "start",
|
|
93
100
|
"n": len(items),
|
package/templates/system/eval.ts
CHANGED
|
@@ -8,14 +8,16 @@
|
|
|
8
8
|
* {"type":"start","n":<int>,"eval_version":<int>,"dataset_version":"<str>"}
|
|
9
9
|
* {"type":"item","id":"<str>","score":<float in [0,1]>,"breakdown":{},"trace_id":"<str|null>"}
|
|
10
10
|
* ... one item event per dataset item ...
|
|
11
|
-
* {"type":"complete","score":<float>,"breakdown":{},"worst_traces":[]}
|
|
11
|
+
* {"type":"complete","score":<float>,"n":<int>,"breakdown":{},"worst_traces":[]}
|
|
12
12
|
*
|
|
13
13
|
* For Langfuse-backed production evals, keep the NDJSON stream as the required
|
|
14
14
|
* Kaizen contract and also persist results back to Langfuse as a best-effort
|
|
15
15
|
* side effect: load the versioned dataset, run the system to create a fresh
|
|
16
16
|
* trace for each item, link that trace to the dataset item in a dataset run,
|
|
17
|
-
* and write the primary metric as a score on the trace.
|
|
18
|
-
*
|
|
17
|
+
* and write the primary metric as a score on the trace. Treat --dataset as the
|
|
18
|
+
* Langfuse dataset name unless your system.md says otherwise. Include each
|
|
19
|
+
* fresh trace id in the Kaizen item event so failure analysis can jump to
|
|
20
|
+
* Langfuse.
|
|
19
21
|
*/
|
|
20
22
|
import { closeSync, writeSync } from "node:fs";
|
|
21
23
|
|
|
@@ -82,7 +84,11 @@ function parseArgs(argv: string[]): EvalArgs {
|
|
|
82
84
|
return args;
|
|
83
85
|
}
|
|
84
86
|
|
|
85
|
-
function
|
|
87
|
+
function loadItems(datasetName: string, maxItems: number | null): DemoItem[] {
|
|
88
|
+
void datasetName;
|
|
89
|
+
// Replace this with your real dataset loader. For Langfuse-backed evals,
|
|
90
|
+
// list dataset items from --dataset and return objects with stable ids,
|
|
91
|
+
// inputs, expected outputs, and any metadata your scorer needs.
|
|
86
92
|
const items = [
|
|
87
93
|
{ id: "demo-1", input: "hello", expected: "hi" },
|
|
88
94
|
{ id: "demo-2", input: "world", expected: "world" },
|
|
@@ -111,7 +117,7 @@ function average(values: number[]): number {
|
|
|
111
117
|
|
|
112
118
|
async function main(): Promise<void> {
|
|
113
119
|
const args = parseArgs(process.argv.slice(2));
|
|
114
|
-
const items =
|
|
120
|
+
const items = loadItems(args.dataset, args.maxItems);
|
|
115
121
|
|
|
116
122
|
emit(args.outFd, {
|
|
117
123
|
type: "start",
|
|
package/templates/system/rubric.md
CHANGED
|
@@ -25,4 +25,4 @@ The judge is itself an LLM system. Calibrate it against human labels until they
|
|
|
25
25
|
|
|
26
26
|
## Calibration
|
|
27
27
|
|
|
28
|
-
Run the judge on 30 items that also have human labels. Iterate the rubric/prompt until agreement ≥ 95%. Track in
|
|
28
|
+
Run the judge on 30 items that also have human labels. Iterate the rubric/prompt until agreement ≥ 95%. Track in `kaizen/.kaizen/runs/<system>/<judge_run_id>/`.
|
|
package/templates/system/system.md
CHANGED
|
@@ -1,13 +1,13 @@
|
|
|
1
1
|
---
|
|
2
2
|
name: {{name}}
|
|
3
3
|
description: {{description}}
|
|
4
|
-
run_eval:
|
|
4
|
+
run_eval: kaizen/systems/{{name}}/{{eval_file}}
|
|
5
5
|
eval_version: 1
|
|
6
6
|
dataset_version: v1
|
|
7
7
|
eval_style: {{eval_style}}
|
|
8
8
|
primary_metric: {{primary_metric}}
|
|
9
9
|
target: {{target}}
|
|
10
|
-
|
|
10
|
+
{{rubric_frontmatter}}
|
|
11
11
|
# Optional: stable Linear project URL or ID for Kaizen Ideas.
|
|
12
12
|
# linear_project: https://linear.app/<workspace>/project/<project-slug>
|
|
13
13
|
created_at: {{iso_now}}
|
|
@@ -21,7 +21,7 @@ created_at: {{iso_now}}
|
|
|
21
21
|
|
|
22
22
|
## Key files
|
|
23
23
|
|
|
24
|
-
<!-- Paths in this repo that a
|
|
24
|
+
<!-- Paths in this repo that a coding agent needs to read to understand the system. -->
|
|
25
25
|
|
|
26
26
|
- `path/to/main_workflow.py` — orchestrator
|
|
27
27
|
- `path/to/prompts.py` — prompt templates
|
|
@@ -29,7 +29,7 @@ created_at: {{iso_now}}
|
|
|
29
29
|
|
|
30
30
|
## Setup
|
|
31
31
|
|
|
32
|
-
<!-- What does the runner or
|
|
32
|
+
<!-- What does the runner or coding agent need before invoking the eval? E.g.:
|
|
33
33
|
- start servers
|
|
34
34
|
- install deps
|
|
35
35
|
- set env vars
|
|
@@ -39,6 +39,7 @@ created_at: {{iso_now}}
|
|
|
39
39
|
|
|
40
40
|
<!-- For Langfuse-backed production evals:
|
|
41
41
|
- Load dataset items from the `dataset_version` named in frontmatter.
|
|
42
|
+
- Treat `dataset_version` as the Langfuse dataset name unless this section says otherwise.
|
|
42
43
|
- For each item, run the candidate system and capture the fresh Langfuse trace id.
|
|
43
44
|
- Link the dataset item to that trace in a Langfuse dataset run named for the Kaizen run.
|
|
44
45
|
- Write the primary metric as a Langfuse score on the fresh trace, with secondary metrics in metadata.
|
|
@@ -63,6 +64,6 @@ graph TD
|
|
|
63
64
|
|
|
64
65
|
## Variant candidates
|
|
65
66
|
|
|
66
|
-
<!-- A scratchpad of ideas to try.
|
|
67
|
+
<!-- A scratchpad of ideas to try. Coding agents read this to seed variant generation. -->
|
|
67
68
|
|
|
68
69
|
- _none yet_
|
|
package/templates/view/dataset-item.tsx
ADDED
|
@@ -0,0 +1,63 @@
|
|
|
1
|
+
import { useState } from "react";
|
|
2
|
+
import type { DatasetItemRendererProps } from "@percepta/kaizen";
|
|
3
|
+
|
|
4
|
+
export default function DatasetItemView({
|
|
5
|
+
datasetItem,
|
|
6
|
+
trace,
|
|
7
|
+
actions,
|
|
8
|
+
}: DatasetItemRendererProps) {
|
|
9
|
+
const [expectedOutput, setExpectedOutput] = useState(() =>
|
|
10
|
+
JSON.stringify(datasetItem.expectedOutput ?? null, null, 2),
|
|
11
|
+
);
|
|
12
|
+
const [status, setStatus] = useState<"idle" | "saving" | "saved" | "error">(
|
|
13
|
+
"idle",
|
|
14
|
+
);
|
|
15
|
+
|
|
16
|
+
async function saveExpectedOutput() {
|
|
17
|
+
setStatus("saving");
|
|
18
|
+
try {
|
|
19
|
+
await actions.updateDatasetItem({
|
|
20
|
+
expectedOutput: JSON.parse(expectedOutput),
|
|
21
|
+
});
|
|
22
|
+
setStatus("saved");
|
|
23
|
+
} catch {
|
|
24
|
+
setStatus("error");
|
|
25
|
+
}
|
|
26
|
+
}
|
|
27
|
+
|
|
28
|
+
return (
|
|
29
|
+
<main>
|
|
30
|
+
<h2>{datasetItem.id}</h2>
|
|
31
|
+
<section>
|
|
32
|
+
<h3>Expected output</h3>
|
|
33
|
+
<textarea
|
|
34
|
+
value={expectedOutput}
|
|
35
|
+
onChange={(event) => {
|
|
36
|
+
setExpectedOutput(event.target.value);
|
|
37
|
+
setStatus("idle");
|
|
38
|
+
}}
|
|
39
|
+
rows={10}
|
|
40
|
+
style={{ width: "100%" }}
|
|
41
|
+
/>
|
|
42
|
+
<button type="button" onClick={saveExpectedOutput}>
|
|
43
|
+
Save label
|
|
44
|
+
</button>
|
|
45
|
+
{status !== "idle" ? <span>{status}</span> : null}
|
|
46
|
+
</section>
|
|
47
|
+
<section>
|
|
48
|
+
<h3>Input</h3>
|
|
49
|
+
<pre>{JSON.stringify(datasetItem.input, null, 2)}</pre>
|
|
50
|
+
</section>
|
|
51
|
+
<section>
|
|
52
|
+
<h3>Metadata</h3>
|
|
53
|
+
<pre>{JSON.stringify(datasetItem.metadata, null, 2)}</pre>
|
|
54
|
+
</section>
|
|
55
|
+
{trace ? (
|
|
56
|
+
<section>
|
|
57
|
+
<h3>Source trace</h3>
|
|
58
|
+
<pre>{JSON.stringify(trace, null, 2)}</pre>
|
|
59
|
+
</section>
|
|
60
|
+
) : null}
|
|
61
|
+
</main>
|
|
62
|
+
);
|
|
63
|
+
}
|
|
package/templates/view/trace.tsx
ADDED
|
@@ -0,0 +1,10 @@
|
|
|
1
|
+
import type { TraceRendererProps } from "@percepta/kaizen";
|
|
2
|
+
|
|
3
|
+
export default function TraceView({ trace }: TraceRendererProps) {
|
|
4
|
+
return (
|
|
5
|
+
<main>
|
|
6
|
+
<h2>{trace.name ?? trace.id ?? "Trace"}</h2>
|
|
7
|
+
<pre>{JSON.stringify(trace, null, 2)}</pre>
|
|
8
|
+
</main>
|
|
9
|
+
);
|
|
10
|
+
}
|