@agentgrader/optimizer 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,193 @@
1
+ import { z } from 'zod';
2
+ import { AgentConfig } from '@agentgrader/core';
3
+
4
/** Zod shape of the variable axes: every axis is an optional list of candidate values. */
type MatrixDimensionsShape = {
    model: z.ZodOptional<z.ZodArray<z.ZodString, "many">>;
    temperature: z.ZodOptional<z.ZodArray<z.ZodNumber, "many">>;
    system_prompt: z.ZodOptional<z.ZodArray<z.ZodString, "many">>;
    max_steps: z.ZodOptional<z.ZodArray<z.ZodNumber, "many">>;
    toolkits: z.ZodOptional<z.ZodArray<z.ZodArray<z.ZodString, "many">, "many">>;
};
/** Plain value of the dimensions object; the schema's input and output types are identical. */
type MatrixDimensionsValue = {
    model?: string[] | undefined;
    max_steps?: number[] | undefined;
    temperature?: number[] | undefined;
    system_prompt?: string[] | undefined;
    toolkits?: string[][] | undefined;
};
/**
 * Schema for the axes of a matrix. Every populated array contributes one
 * axis to the cartesian product; an axis that is left `undefined` is not
 * varied, and its value (if any) is taken from `base`.
 */
declare const MatrixDimensionsSchema: z.ZodEffects<
    z.ZodObject<MatrixDimensionsShape, "strip", z.ZodTypeAny, MatrixDimensionsValue, MatrixDimensionsValue>,
    MatrixDimensionsValue,
    MatrixDimensionsValue
>;
/** Zod shape of `base`: one optional scalar (or list, for tools/toolkits) per config field. */
type MatrixBaseShape = {
    model: z.ZodOptional<z.ZodString>;
    max_steps: z.ZodOptional<z.ZodNumber>;
    temperature: z.ZodOptional<z.ZodNumber>;
    system_prompt: z.ZodOptional<z.ZodString>;
    tools: z.ZodOptional<z.ZodArray<z.ZodString, "many">>;
    toolkits: z.ZodOptional<z.ZodArray<z.ZodString, "many">>;
};
/** Plain value of the `base` object; the schema's input and output types are identical. */
type MatrixBaseValue = {
    model?: string | undefined;
    max_steps?: number | undefined;
    temperature?: number | undefined;
    system_prompt?: string | undefined;
    tools?: string[] | undefined;
    toolkits?: string[] | undefined;
};
/**
 * Schema for a whole matrix definition. `base` may be omitted on input and
 * is always present on the parsed output (it is wrapped in `ZodDefault`).
 */
declare const MatrixSchema: z.ZodObject<{
    /** used as the prefix for generated agent config ids/names */
    name: z.ZodString;
    base: z.ZodDefault<z.ZodObject<MatrixBaseShape, "strip", z.ZodTypeAny, MatrixBaseValue, MatrixBaseValue>>;
    dimensions: typeof MatrixDimensionsSchema;
}, "strip", z.ZodTypeAny, {
    name: string;
    base: MatrixBaseValue;
    dimensions: MatrixDimensionsValue;
}, {
    name: string;
    dimensions: MatrixDimensionsValue;
    base?: MatrixBaseValue | undefined;
}>;
/** Parsed (output) form of a matrix definition. */
type Matrix = z.infer<typeof MatrixSchema>;
131
/**
 * Builds the cartesian product of the populated `dimensions` of `matrix`
 * and returns one `AgentConfig` for each combination.
 *
 * Values missing from a combination fall back to `matrix.base`. Each
 * config receives a deterministic, slugified `id` (also used as its
 * `name`) that starts with `matrix.name`.
 *
 * @param matrix - The parsed matrix definition to expand.
 * @returns One agent config per combination of dimension values.
 * @throws Error if no dimension contains a non-empty array.
 */
declare function expandMatrix(matrix: Matrix): Array<AgentConfig>;
137
+
138
/**
 * The subset of a persisted run row that aggregation reads. It is
 * compatible with `runs` rows from `@agentgrader/store` (where `metrics`
 * is a JSON string) as well as `RunSingleResult`-like objects (where
 * `metrics` is already a parsed object).
 */
interface RunRecord {
    /** Key used to group runs; runs sharing this id are aggregated together. */
    agentConfigId: string;
    /** Any truthy value counts the run as passed; `null`/`undefined` count as not passed. */
    passed?: boolean | null;
    /** Treated as 0 when `null` or missing while averaging. */
    costUsd?: number | null;
    /** Treated as 0 when `null` or missing while averaging. */
    durationMs?: number | null;
    /** Treated as 0 when `null` or missing while averaging. */
    tokensIn?: number | null;
    /** Treated as 0 when `null` or missing while averaging. */
    tokensOut?: number | null;
    /**
     * Scorer output, either as a JSON string or as an already parsed object.
     * Aggregation reads `metrics["static-quality"].quality` from it.
     */
    metrics?: Record<string, any> | string | null;
}
152
/**
 * Arithmetic means of the numeric quality values reported by the runs of
 * one agent config. A key is present only when every run of that config
 * reported a number for it.
 */
interface QualityAverages {
    diffLines?: number;
    filesModified?: number;
    todosIntroduced?: number;
    linterViolations?: number;
    llmJudgeScore?: number;
}
/** Summary statistics for all runs that share one `agentConfigId`. */
interface AggregateResult {
    /** The grouping key copied from the runs. */
    agentConfigId: string;
    /** Name of the matching agent config, or `agentConfigId` when none matches. */
    agentConfigName: string;
    /** Number of runs in the group. */
    totalRuns: number;
    /** Number of runs in the group whose `passed` value is truthy. */
    passedRuns: number;
    /** `passedRuns / totalRuns`. */
    solveRate: number;
    /** Mean cost per run; runs without a cost contribute 0. */
    avgCostUsd: number;
    /** Mean duration per run; runs without a duration contribute 0. */
    avgDurationMs: number;
    /** Mean input tokens per run; runs without a value contribute 0. */
    avgTokensIn: number;
    /** Mean output tokens per run; runs without a value contribute 0. */
    avgTokensOut: number;
    /** only set when every run for this config has `metrics["static-quality"].quality` */
    avgQuality?: QualityAverages;
}
172
/**
 * Buckets `runs` by their `agentConfigId` and produces one summary per
 * bucket: solve rate plus the mean cost, duration and token usage.
 *
 * Average quality metrics (the values `StaticQualityScorer`/`LlmJudgeScorer`
 * record) are included only when every run in the bucket carries them.
 *
 * @param runs - The run records to summarise.
 * @param configs - Agent configs used to look up a display name per id.
 * @returns One aggregate per distinct `agentConfigId` found in `runs`.
 */
declare function aggregateResults(runs: Array<RunRecord>, configs: Array<AgentConfig>): Array<AggregateResult>;
178
+
179
/**
 * Returns the Pareto-optimal subset of `aggregates`.
 *
 * Objectives:
 * - `solveRate` - higher is better
 * - `avgCostUsd` - lower is better
 * - `avgQuality.linterViolations` - lower is better; this objective is
 *   used only when every aggregate has it set, because `avgQuality` is
 *   optional (it is populated only when a static-quality scorer ran for
 *   all runs of a config).
 *
 * An aggregate is removed when it is dominated, i.e. when some other
 * aggregate is at least as good on every objective and strictly better on
 * at least one of them.
 *
 * @param aggregates - The candidates to filter.
 * @returns The non-dominated aggregates, in their original order.
 */
declare function paretoFront(aggregates: Array<AggregateResult>): Array<AggregateResult>;
192
+
193
+ export { type AggregateResult, type Matrix, MatrixDimensionsSchema, MatrixSchema, type QualityAverages, type RunRecord, aggregateResults, expandMatrix, paretoFront };
package/dist/index.js ADDED
@@ -0,0 +1,158 @@
1
+ import { z } from 'zod';
2
+
3
+ // src/matrix.ts
4
// Fixed values shared by every generated config; every field may be omitted.
var MatrixBaseSchema = z.object({
  model: z.string(),
  max_steps: z.number(),
  temperature: z.number(),
  system_prompt: z.string(),
  tools: z.array(z.string()),
  toolkits: z.array(z.string())
}).partial();

// Candidate values per axis; every axis may be omitted, but at least one
// axis has to hold a non-empty array for the matrix to be valid.
var MatrixDimensionsSchema = z.object({
  model: z.array(z.string()),
  temperature: z.array(z.number()),
  system_prompt: z.array(z.string()),
  max_steps: z.array(z.number()),
  toolkits: z.array(z.array(z.string()))
}).partial().refine(
  (dims) => {
    for (const axis of Object.values(dims)) {
      if (Array.isArray(axis) && axis.length > 0) return true;
    }
    return false;
  },
  { message: "Matrix must define at least one non-empty dimension" }
);

// Complete matrix definition; `base` falls back to an empty object.
var MatrixSchema = z.object({
  /** used as the prefix for generated agent config ids/names */
  name: z.string(),
  base: MatrixBaseSchema.default({}),
  dimensions: MatrixDimensionsSchema
});
27
+ var DEFAULT_MODEL = "anthropic/claude-sonnet-4-6";
28
+ var DEFAULT_MAX_STEPS = 30;
29
+ function expandMatrix(matrix) {
30
+ const { name, base, dimensions } = matrix;
31
+ const dimEntries = Object.entries(dimensions).filter(
32
+ (entry) => Array.isArray(entry[1]) && entry[1].length > 0
33
+ );
34
+ if (dimEntries.length === 0) {
35
+ throw new Error("Matrix must define at least one non-empty dimension");
36
+ }
37
+ let combinations = [{}];
38
+ for (const [key, values] of dimEntries) {
39
+ const next = [];
40
+ for (const combo of combinations) {
41
+ for (const value of values) {
42
+ next.push({ ...combo, [key]: value });
43
+ }
44
+ }
45
+ combinations = next;
46
+ }
47
+ return combinations.map((combo) => {
48
+ const id = `${name}-${dimEntries.map(([key]) => slugifyValue(combo[key])).join("-")}`;
49
+ const config = {
50
+ id,
51
+ name: id,
52
+ model: combo.model ?? base.model ?? DEFAULT_MODEL,
53
+ max_steps: combo.max_steps ?? base.max_steps ?? DEFAULT_MAX_STEPS
54
+ };
55
+ const temperature = combo.temperature ?? base.temperature;
56
+ if (temperature !== void 0) config.temperature = temperature;
57
+ const systemPrompt = combo.system_prompt ?? base.system_prompt;
58
+ if (systemPrompt !== void 0) config.system_prompt = systemPrompt;
59
+ const toolkits = combo.toolkits ?? base.toolkits;
60
+ if (toolkits !== void 0) config.toolkits = toolkits;
61
+ if (base.tools !== void 0) config.tools = base.tools;
62
+ return config;
63
+ });
64
+ }
65
+ function slugifyValue(value) {
66
+ if (Array.isArray(value)) {
67
+ return value.length > 0 ? value.map(slugifyValue).join("+") : "none";
68
+ }
69
+ return String(value).toLowerCase().replace(/[^a-z0-9]+/g, "-").replace(/^-+|-+$/g, "");
70
+ }
71
+
72
+ // src/aggregate.ts
73
+ var QUALITY_KEYS = [
74
+ "diffLines",
75
+ "filesModified",
76
+ "todosIntroduced",
77
+ "linterViolations",
78
+ "llmJudgeScore"
79
+ ];
80
+ function aggregateResults(runs, configs) {
81
+ const configsById = new Map(configs.map((c) => [c.id || c.name, c]));
82
+ const byConfig = /* @__PURE__ */ new Map();
83
+ for (const run of runs) {
84
+ const list = byConfig.get(run.agentConfigId);
85
+ if (list) list.push(run);
86
+ else byConfig.set(run.agentConfigId, [run]);
87
+ }
88
+ const results = [];
89
+ for (const [agentConfigId, configRuns] of byConfig) {
90
+ const total = configRuns.length;
91
+ const passed = configRuns.filter((r) => r.passed).length;
92
+ const sum = (f) => configRuns.reduce((acc, r) => acc + f(r), 0);
93
+ const qualities = configRuns.map((r) => extractQuality(r.metrics)).filter((q) => q !== void 0);
94
+ let avgQuality;
95
+ if (total > 0 && qualities.length === total) {
96
+ const avg = {};
97
+ for (const key of QUALITY_KEYS) {
98
+ const values = qualities.map((q) => q[key]).filter((v) => typeof v === "number");
99
+ if (values.length === qualities.length) {
100
+ avg[key] = values.reduce((a, b) => a + b, 0) / values.length;
101
+ }
102
+ }
103
+ if (Object.keys(avg).length > 0) avgQuality = avg;
104
+ }
105
+ results.push({
106
+ agentConfigId,
107
+ agentConfigName: configsById.get(agentConfigId)?.name ?? agentConfigId,
108
+ totalRuns: total,
109
+ passedRuns: passed,
110
+ solveRate: total > 0 ? passed / total : 0,
111
+ avgCostUsd: total > 0 ? sum((r) => r.costUsd ?? 0) / total : 0,
112
+ avgDurationMs: total > 0 ? sum((r) => r.durationMs ?? 0) / total : 0,
113
+ avgTokensIn: total > 0 ? sum((r) => r.tokensIn ?? 0) / total : 0,
114
+ avgTokensOut: total > 0 ? sum((r) => r.tokensOut ?? 0) / total : 0,
115
+ avgQuality
116
+ });
117
+ }
118
+ return results;
119
+ }
120
+ function extractQuality(metrics) {
121
+ if (!metrics) return void 0;
122
+ const parsed = typeof metrics === "string" ? safeParseJson(metrics) : metrics;
123
+ return parsed?.["static-quality"]?.quality;
124
+ }
125
+ function safeParseJson(value) {
126
+ try {
127
+ return JSON.parse(value);
128
+ } catch {
129
+ return void 0;
130
+ }
131
+ }
132
+
133
+ // src/pareto.ts
134
/**
 * Returns the aggregates that no other aggregate dominates.
 *
 * Objectives are `solveRate` (maximize) and `avgCostUsd` (minimize);
 * `avgQuality.linterViolations` (minimize) is added only when every
 * aggregate has it as a number. One aggregate dominates another when it is
 * at least as good on every objective and strictly better on at least one.
 *
 * A value that is NaN or not a number is ranked as the worst possible
 * value for its objective. Without this, two aggregates could each appear
 * to dominate the other (every comparison against NaN is false), and the
 * front of a non-empty input could come back empty.
 *
 * @param aggregates - Candidates to filter.
 * @returns The non-dominated aggregates, in their original order.
 */
function paretoFront(aggregates) {
  if (aggregates.length === 0) return [];
  const objectives = [
    { get: (a) => a.solveRate, maximize: true },
    { get: (a) => a.avgCostUsd, maximize: false }
  ];
  if (aggregates.every((a) => typeof a.avgQuality?.linterViolations === "number")) {
    objectives.push({ get: (a) => a.avgQuality.linterViolations, maximize: false });
  }
  // Normalise every objective to "higher is better" so one comparison fits all.
  const toScore = (value, maximize) => {
    const oriented = maximize ? value : -value;
    return typeof oriented === "number" && !Number.isNaN(oriented) ? oriented : -Infinity;
  };
  // Scores are computed once per aggregate instead of once per comparison.
  const scores = aggregates.map(
    (a) => objectives.map(({ get, maximize }) => toScore(get(a), maximize))
  );
  const dominates = (x, y) => {
    let strictlyBetter = false;
    for (let i = 0; i < x.length; i++) {
      if (x[i] < y[i]) return false;
      if (x[i] > y[i]) strictlyBetter = true;
    }
    return strictlyBetter;
  };
  return aggregates.filter(
    (_, i) => !scores.some((other, j) => j !== i && dominates(other, scores[i]))
  );
}
157
+
158
+ export { MatrixDimensionsSchema, MatrixSchema, aggregateResults, expandMatrix, paretoFront };
package/package.json ADDED
@@ -0,0 +1,33 @@
1
+ {
2
+ "name": "@agentgrader/optimizer",
3
+ "version": "0.1.0",
4
+ "description": "Architecture optimizer for the Agentgrader framework: agent-config matrix expansion, result aggregation, and Pareto-front analysis",
5
+ "license": "MIT",
6
+ "type": "module",
7
+ "main": "./dist/index.js",
8
+ "module": "./dist/index.js",
9
+ "types": "./dist/index.d.ts",
"exports": {
  ".": {
    "types": "./dist/index.d.ts",
    "import": "./dist/index.js"
  }
},
16
+ "files": [
17
+ "dist"
18
+ ],
19
+ "scripts": {
20
+ "build": "tsup src/index.ts --format esm --dts --clean --treeshake",
21
+ "build:watch": "tsup src/index.ts --format esm --dts --watch"
22
+ },
23
+ "dependencies": {
24
+ "@agentgrader/core": "^1.1.0",
25
+ "zod": "^3.23.8"
26
+ },
27
+ "devDependencies": {
28
+ "tsup": "^8.5.1"
29
+ },
30
+ "peerDependencies": {
31
+ "@agentgrader/core": "^1.1.0"
32
+ }
33
+ }