@agentgrader/optimizer 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.d.ts +193 -0
- package/dist/index.js +158 -0
- package/package.json +33 -0
package/dist/index.d.ts
ADDED
|
@@ -0,0 +1,193 @@
|
|
|
1
|
+
import { z } from 'zod';
|
|
2
|
+
import { AgentConfig } from '@agentgrader/core';
|
|
3
|
+
|
|
4
|
+
/**
|
|
5
|
+
* Each populated array becomes one axis of the cartesian product. Axes left
|
|
6
|
+
`undefined` or empty are not varied - their value (if any) comes from `base`.
|
|
7
|
+
*/
|
|
8
|
+
declare const MatrixDimensionsSchema: z.ZodEffects<z.ZodObject<{
|
|
9
|
+
model: z.ZodOptional<z.ZodArray<z.ZodString, "many">>;
|
|
10
|
+
temperature: z.ZodOptional<z.ZodArray<z.ZodNumber, "many">>;
|
|
11
|
+
system_prompt: z.ZodOptional<z.ZodArray<z.ZodString, "many">>;
|
|
12
|
+
max_steps: z.ZodOptional<z.ZodArray<z.ZodNumber, "many">>;
|
|
13
|
+
toolkits: z.ZodOptional<z.ZodArray<z.ZodArray<z.ZodString, "many">, "many">>;
|
|
14
|
+
}, "strip", z.ZodTypeAny, {
|
|
15
|
+
model?: string[] | undefined;
|
|
16
|
+
max_steps?: number[] | undefined;
|
|
17
|
+
temperature?: number[] | undefined;
|
|
18
|
+
system_prompt?: string[] | undefined;
|
|
19
|
+
toolkits?: string[][] | undefined;
|
|
20
|
+
}, {
|
|
21
|
+
model?: string[] | undefined;
|
|
22
|
+
max_steps?: number[] | undefined;
|
|
23
|
+
temperature?: number[] | undefined;
|
|
24
|
+
system_prompt?: string[] | undefined;
|
|
25
|
+
toolkits?: string[][] | undefined;
|
|
26
|
+
}>, {
|
|
27
|
+
model?: string[] | undefined;
|
|
28
|
+
max_steps?: number[] | undefined;
|
|
29
|
+
temperature?: number[] | undefined;
|
|
30
|
+
system_prompt?: string[] | undefined;
|
|
31
|
+
toolkits?: string[][] | undefined;
|
|
32
|
+
}, {
|
|
33
|
+
model?: string[] | undefined;
|
|
34
|
+
max_steps?: number[] | undefined;
|
|
35
|
+
temperature?: number[] | undefined;
|
|
36
|
+
system_prompt?: string[] | undefined;
|
|
37
|
+
toolkits?: string[][] | undefined;
|
|
38
|
+
}>;
|
|
39
|
+
declare const MatrixSchema: z.ZodObject<{
|
|
40
|
+
/** used as the prefix for generated agent config ids/names */
|
|
41
|
+
name: z.ZodString;
|
|
42
|
+
base: z.ZodDefault<z.ZodObject<{
|
|
43
|
+
model: z.ZodOptional<z.ZodString>;
|
|
44
|
+
max_steps: z.ZodOptional<z.ZodNumber>;
|
|
45
|
+
temperature: z.ZodOptional<z.ZodNumber>;
|
|
46
|
+
system_prompt: z.ZodOptional<z.ZodString>;
|
|
47
|
+
tools: z.ZodOptional<z.ZodArray<z.ZodString, "many">>;
|
|
48
|
+
toolkits: z.ZodOptional<z.ZodArray<z.ZodString, "many">>;
|
|
49
|
+
}, "strip", z.ZodTypeAny, {
|
|
50
|
+
model?: string | undefined;
|
|
51
|
+
max_steps?: number | undefined;
|
|
52
|
+
temperature?: number | undefined;
|
|
53
|
+
system_prompt?: string | undefined;
|
|
54
|
+
tools?: string[] | undefined;
|
|
55
|
+
toolkits?: string[] | undefined;
|
|
56
|
+
}, {
|
|
57
|
+
model?: string | undefined;
|
|
58
|
+
max_steps?: number | undefined;
|
|
59
|
+
temperature?: number | undefined;
|
|
60
|
+
system_prompt?: string | undefined;
|
|
61
|
+
tools?: string[] | undefined;
|
|
62
|
+
toolkits?: string[] | undefined;
|
|
63
|
+
}>>;
|
|
64
|
+
dimensions: z.ZodEffects<z.ZodObject<{
|
|
65
|
+
model: z.ZodOptional<z.ZodArray<z.ZodString, "many">>;
|
|
66
|
+
temperature: z.ZodOptional<z.ZodArray<z.ZodNumber, "many">>;
|
|
67
|
+
system_prompt: z.ZodOptional<z.ZodArray<z.ZodString, "many">>;
|
|
68
|
+
max_steps: z.ZodOptional<z.ZodArray<z.ZodNumber, "many">>;
|
|
69
|
+
toolkits: z.ZodOptional<z.ZodArray<z.ZodArray<z.ZodString, "many">, "many">>;
|
|
70
|
+
}, "strip", z.ZodTypeAny, {
|
|
71
|
+
model?: string[] | undefined;
|
|
72
|
+
max_steps?: number[] | undefined;
|
|
73
|
+
temperature?: number[] | undefined;
|
|
74
|
+
system_prompt?: string[] | undefined;
|
|
75
|
+
toolkits?: string[][] | undefined;
|
|
76
|
+
}, {
|
|
77
|
+
model?: string[] | undefined;
|
|
78
|
+
max_steps?: number[] | undefined;
|
|
79
|
+
temperature?: number[] | undefined;
|
|
80
|
+
system_prompt?: string[] | undefined;
|
|
81
|
+
toolkits?: string[][] | undefined;
|
|
82
|
+
}>, {
|
|
83
|
+
model?: string[] | undefined;
|
|
84
|
+
max_steps?: number[] | undefined;
|
|
85
|
+
temperature?: number[] | undefined;
|
|
86
|
+
system_prompt?: string[] | undefined;
|
|
87
|
+
toolkits?: string[][] | undefined;
|
|
88
|
+
}, {
|
|
89
|
+
model?: string[] | undefined;
|
|
90
|
+
max_steps?: number[] | undefined;
|
|
91
|
+
temperature?: number[] | undefined;
|
|
92
|
+
system_prompt?: string[] | undefined;
|
|
93
|
+
toolkits?: string[][] | undefined;
|
|
94
|
+
}>;
|
|
95
|
+
}, "strip", z.ZodTypeAny, {
|
|
96
|
+
name: string;
|
|
97
|
+
base: {
|
|
98
|
+
model?: string | undefined;
|
|
99
|
+
max_steps?: number | undefined;
|
|
100
|
+
temperature?: number | undefined;
|
|
101
|
+
system_prompt?: string | undefined;
|
|
102
|
+
tools?: string[] | undefined;
|
|
103
|
+
toolkits?: string[] | undefined;
|
|
104
|
+
};
|
|
105
|
+
dimensions: {
|
|
106
|
+
model?: string[] | undefined;
|
|
107
|
+
max_steps?: number[] | undefined;
|
|
108
|
+
temperature?: number[] | undefined;
|
|
109
|
+
system_prompt?: string[] | undefined;
|
|
110
|
+
toolkits?: string[][] | undefined;
|
|
111
|
+
};
|
|
112
|
+
}, {
|
|
113
|
+
name: string;
|
|
114
|
+
dimensions: {
|
|
115
|
+
model?: string[] | undefined;
|
|
116
|
+
max_steps?: number[] | undefined;
|
|
117
|
+
temperature?: number[] | undefined;
|
|
118
|
+
system_prompt?: string[] | undefined;
|
|
119
|
+
toolkits?: string[][] | undefined;
|
|
120
|
+
};
|
|
121
|
+
base?: {
|
|
122
|
+
model?: string | undefined;
|
|
123
|
+
max_steps?: number | undefined;
|
|
124
|
+
temperature?: number | undefined;
|
|
125
|
+
system_prompt?: string | undefined;
|
|
126
|
+
tools?: string[] | undefined;
|
|
127
|
+
toolkits?: string[] | undefined;
|
|
128
|
+
} | undefined;
|
|
129
|
+
}>;
|
|
130
|
+
/** Parsed (output) shape of `MatrixSchema`; `base` is always present here because the schema declares a default for it. */
type Matrix = z.infer<typeof MatrixSchema>;
|
|
131
|
+
/**
|
|
132
|
+
* Expands a `Matrix` into the cartesian product of its `dimensions`,
|
|
133
|
+
* producing one `AgentConfig` per combination with `base` values as
|
|
134
|
+
* defaults and a deterministic, slugified `id`/`name`.
|
|
135
|
+
*
* Only dimensions holding a non-empty array take part in the product. The
* generated `id` and `name` are the same string, prefixed with the matrix
* `name`. `model` and `max_steps` fall back to built-in defaults when neither
* the combination nor `base` provides them.
*
* @param matrix - Matrix definition in the shape inferred from `MatrixSchema`.
* @returns One `AgentConfig` per combination of dimension values.
* @throws Error if no dimension contains a non-empty array.
*/
|
|
136
|
+
declare function expandMatrix(matrix: Matrix): AgentConfig[];
|
|
137
|
+
|
|
138
|
+
/**
|
|
139
|
+
* Minimal shape of a persisted run row needed for aggregation. Matches both
|
|
140
|
+
* `@agentgrader/store`'s `runs` rows (`metrics` as a JSON string) and
|
|
141
|
+
* `RunSingleResult`-like objects (`metrics` as a parsed object).
|
|
142
|
+
*/
|
|
143
|
+
interface RunRecord {
|
|
144
|
+
/** Grouping key: runs sharing this value are aggregated together. */
agentConfigId: string;
|
|
145
|
+
/** Counted as a pass only when truthy; `null`/`undefined` count as not passed. */
passed?: boolean | null;
|
|
146
|
+
/** Averaged per config; `null`/`undefined` contributes 0 to the average. */
costUsd?: number | null;
|
|
147
|
+
/** Averaged per config; `null`/`undefined` contributes 0 to the average. */
durationMs?: number | null;
|
|
148
|
+
/** Averaged per config; `null`/`undefined` contributes 0 to the average. */
tokensIn?: number | null;
|
|
149
|
+
/** Averaged per config; `null`/`undefined` contributes 0 to the average. */
tokensOut?: number | null;
|
|
150
|
+
/** JSON string or already-parsed object; aggregation reads `["static-quality"].quality` from it. */
metrics?: string | Record<string, any> | null;
|
|
151
|
+
}
|
|
152
|
+
/**
 * Per-config averages of the numeric fields found under
 * `metrics["static-quality"].quality` on each run. A field is only present
 * when every run of the config supplied a numeric value for it.
 */
interface QualityAverages {
|
|
153
|
+
diffLines?: number;
|
|
154
|
+
filesModified?: number;
|
|
155
|
+
todosIntroduced?: number;
|
|
156
|
+
linterViolations?: number;
|
|
157
|
+
llmJudgeScore?: number;
|
|
158
|
+
}
|
|
159
|
+
/** Summary statistics for all runs that share one `agentConfigId`. */
interface AggregateResult {
|
|
160
|
+
/** The `agentConfigId` shared by the aggregated runs. */
agentConfigId: string;
|
|
161
|
+
/** `name` of the matching entry in `configs`; falls back to `agentConfigId` when there is no match or no name. */
agentConfigName: string;
|
|
162
|
+
/** Number of runs in the group. */
totalRuns: number;
|
|
163
|
+
/** Number of runs in the group whose `passed` is truthy. */
passedRuns: number;
|
|
164
|
+
/** `passedRuns / totalRuns`. */
solveRate: number;
|
|
165
|
+
/** Mean of `costUsd` over the group; missing values count as 0. */
avgCostUsd: number;
|
|
166
|
+
/** Mean of `durationMs` over the group; missing values count as 0. */
avgDurationMs: number;
|
|
167
|
+
/** Mean of `tokensIn` over the group; missing values count as 0. */
avgTokensIn: number;
|
|
168
|
+
/** Mean of `tokensOut` over the group; missing values count as 0. */
avgTokensOut: number;
|
|
169
|
+
/** only set when every run for this config has `metrics["static-quality"].quality` */
|
|
170
|
+
avgQuality?: QualityAverages;
|
|
171
|
+
}
|
|
172
|
+
/**
|
|
173
|
+
* Groups `runs` by `agentConfigId` and computes solve rate, average cost,
|
|
174
|
+
* duration, token usage, and (if present on every run) average quality
|
|
175
|
+
* metrics from `StaticQualityScorer`/`LlmJudgeScorer`.
|
|
176
|
+
*
* @param runs - Run records to aggregate; may be empty.
* @param configs - Used only to look up a display name for each group. A
*   config is matched by its `id`, or by its `name` when `id` is falsy.
* @returns One entry per distinct `agentConfigId` found in `runs`, in order
*   of first appearance. Configs without any run produce no entry.
*/
|
|
177
|
+
declare function aggregateResults(runs: RunRecord[], configs: AgentConfig[]): AggregateResult[];
|
|
178
|
+
|
|
179
|
+
/**
|
|
180
|
+
* Filters `aggregates` down to the Pareto-optimal set across:
|
|
181
|
+
* - `solveRate` (maximize)
|
|
182
|
+
* - `avgCostUsd` (minimize)
|
|
183
|
+
* - `avgQuality.linterViolations` (minimize) - only included as an
|
|
184
|
+
* objective if every aggregate has it set, since `avgQuality` is itself
|
|
185
|
+
* optional (only populated when a static-quality scorer ran for all runs
|
|
186
|
+
* of a config).
|
|
187
|
+
*
|
|
188
|
+
* An aggregate is dropped if another aggregate is at least as good on every
|
|
189
|
+
* objective and strictly better on at least one (i.e. it is dominated).
|
|
190
|
+
*
* @param aggregates - Candidates to compare; the array is not modified.
* @returns The non-dominated aggregates, in their input order (an empty
*   array for empty input).
*/
|
|
191
|
+
declare function paretoFront(aggregates: AggregateResult[]): AggregateResult[];
|
|
192
|
+
|
|
193
|
+
export { type AggregateResult, type Matrix, MatrixDimensionsSchema, MatrixSchema, type QualityAverages, type RunRecord, aggregateResults, expandMatrix, paretoFront };
|
package/dist/index.js
ADDED
|
@@ -0,0 +1,158 @@
|
|
|
1
|
+
import { z } from 'zod';
|
|
2
|
+
|
|
3
|
+
// src/matrix.ts
|
|
4
|
+
/**
 * Fixed agent settings shared by every generated config. Every field is
 * optional; a field that is also varied in the dimensions is overridden by
 * the dimension value during expansion.
 */
var MatrixBaseSchema = z.object({
|
|
5
|
+
model: z.string().optional(),
|
|
6
|
+
max_steps: z.number().optional(),
|
|
7
|
+
temperature: z.number().optional(),
|
|
8
|
+
system_prompt: z.string().optional(),
|
|
9
|
+
// `tools` has no counterpart in the dimensions schema, so it can only be set here.
tools: z.array(z.string()).optional(),
|
|
10
|
+
toolkits: z.array(z.string()).optional()
|
|
11
|
+
});
|
|
12
|
+
/**
 * Axes of the matrix: each key is an optional array of candidate values.
 * For `toolkits` every candidate is itself an array of strings. The
 * refinement rejects objects in which no key holds a non-empty array.
 */
var MatrixDimensionsSchema = z.object({
|
|
13
|
+
model: z.array(z.string()).optional(),
|
|
14
|
+
temperature: z.array(z.number()).optional(),
|
|
15
|
+
system_prompt: z.array(z.string()).optional(),
|
|
16
|
+
max_steps: z.array(z.number()).optional(),
|
|
17
|
+
toolkits: z.array(z.array(z.string())).optional()
|
|
18
|
+
// At least one axis must actually contain values, otherwise there is nothing to expand.
}).refine((dims) => Object.values(dims).some((v) => Array.isArray(v) && v.length > 0), {
|
|
19
|
+
message: "Matrix must define at least one non-empty dimension"
|
|
20
|
+
});
|
|
21
|
+
/**
 * Top-level matrix definition: a `name`, optional fixed `base` settings
 * (defaulting to an empty object when omitted) and the `dimensions` to vary.
 */
var MatrixSchema = z.object({
|
|
22
|
+
/** used as the prefix for generated agent config ids/names */
|
|
23
|
+
name: z.string(),
|
|
24
|
+
base: MatrixBaseSchema.default({}),
|
|
25
|
+
dimensions: MatrixDimensionsSchema
|
|
26
|
+
});
|
|
27
|
+
/** Model used when neither the combination nor `base` specifies one. */
var DEFAULT_MODEL = "anthropic/claude-sonnet-4-6";

/** Step budget used when neither the combination nor `base` specifies one. */
var DEFAULT_MAX_STEPS = 30;

/**
 * Expands a matrix into the cartesian product of its non-empty dimensions and
 * returns one agent config per combination.
 *
 * Each config takes its value for a field from the combination first, then
 * from `base`; `model` and `max_steps` additionally fall back to
 * `DEFAULT_MODEL` / `DEFAULT_MAX_STEPS`. `tools` can only come from `base`.
 * Optional fields are omitted from the config when no value is available.
 *
 * The `id` (also used as `name`) is `<matrix name>-<slug of each varied
 * value>`. Distinct values can slugify to the same text (e.g. two prompts that
 * differ only in punctuation), so later duplicates receive a numeric suffix
 * (`-2`, `-3`, ...) to keep every id unique; ids that were already unique are
 * left untouched.
 *
 * @param matrix - Object with `name`, optional `base`, and `dimensions`.
 * @returns Array of agent configs, ordered with the first dimension varying slowest.
 * @throws Error if no dimension contains a non-empty array.
 */
function expandMatrix(matrix) {
  // `base` is defaulted so that a matrix that skipped schema parsing still works.
  const { name, base = {}, dimensions } = matrix;
  const dimEntries = Object.entries(dimensions).filter(
    ([, values]) => Array.isArray(values) && values.length > 0
  );
  if (dimEntries.length === 0) {
    throw new Error("Matrix must define at least one non-empty dimension");
  }

  // Build the product one axis at a time; every existing partial combination
  // is extended with each candidate value of the current axis.
  let combinations = [{}];
  for (const [key, values] of dimEntries) {
    combinations = combinations.flatMap(
      (combo) => values.map((value) => ({ ...combo, [key]: value }))
    );
  }

  const usedIds = new Set();
  return combinations.map((combo) => {
    // Combination value wins over the base value for the same field.
    const pick = (field) => combo[field] ?? base[field];

    const slug = `${name}-${dimEntries.map(([key]) => slugifyValue(combo[key])).join("-")}`;
    // Disambiguate ids that collide after slugification so that downstream
    // grouping by id never merges two different configs.
    let id = slug;
    for (let suffix = 2; usedIds.has(id); suffix += 1) {
      id = `${slug}-${suffix}`;
    }
    usedIds.add(id);

    const temperature = pick("temperature");
    const systemPrompt = pick("system_prompt");
    const toolkits = pick("toolkits");
    return {
      id,
      name: id,
      model: pick("model") ?? DEFAULT_MODEL,
      max_steps: pick("max_steps") ?? DEFAULT_MAX_STEPS,
      ...(temperature !== void 0 ? { temperature } : {}),
      ...(systemPrompt !== void 0 ? { system_prompt: systemPrompt } : {}),
      ...(toolkits !== void 0 ? { toolkits } : {}),
      ...(base.tools !== void 0 ? { tools: base.tools } : {})
    };
  });
}

/**
 * Turns a dimension value into an id-safe fragment.
 *
 * Arrays are slugified element-wise and joined with "+" (an empty array
 * becomes "none"). Any other value is stringified, lower-cased, has every run
 * of characters outside `a-z0-9` replaced by a single "-", and has leading and
 * trailing dashes removed.
 *
 * @param value - Dimension value (string, number, or array of those).
 * @returns The slug fragment.
 */
function slugifyValue(value) {
  if (Array.isArray(value)) {
    return value.length > 0 ? value.map(slugifyValue).join("+") : "none";
  }
  return String(value).toLowerCase().replace(/[^a-z0-9]+/g, "-").replace(/^-+|-+$/g, "");
}
|
|
71
|
+
|
|
72
|
+
// src/aggregate.ts
|
|
73
|
+
/** Quality fields that are averaged when present as numbers on every run. */
var QUALITY_KEYS = [
  "diffLines",
  "filesModified",
  "todosIntroduced",
  "linterViolations",
  "llmJudgeScore"
];

/**
 * Groups `runs` by `agentConfigId` and summarises each group.
 *
 * For every group it reports the run count, the number of runs with a truthy
 * `passed`, the solve rate, and the mean of `costUsd`, `durationMs`,
 * `tokensIn` and `tokensOut` (a missing value contributes 0). `avgQuality` is
 * filled in only when every run of the group carries a quality object.
 *
 * @param runs - Run records; may be empty.
 * @param configs - Used only to resolve a display name; a config is keyed by
 *   `id`, or by `name` when `id` is falsy.
 * @returns One summary per distinct `agentConfigId`, in order of first appearance.
 */
function aggregateResults(runs, configs) {
  const configsById = new Map(configs.map((c) => [c.id || c.name, c]));

  const byConfig = /* @__PURE__ */ new Map();
  for (const run of runs) {
    const group = byConfig.get(run.agentConfigId);
    if (group) group.push(run);
    else byConfig.set(run.agentConfigId, [run]);
  }

  // Every group holds at least one run, so dividing by `total` is always safe.
  return Array.from(byConfig, ([agentConfigId, configRuns]) => {
    const total = configRuns.length;
    const passed = configRuns.filter((r) => r.passed).length;
    const mean = (field) => configRuns.reduce((acc, r) => acc + (r[field] ?? 0), 0) / total;
    return {
      agentConfigId,
      agentConfigName: configsById.get(agentConfigId)?.name ?? agentConfigId,
      totalRuns: total,
      passedRuns: passed,
      solveRate: passed / total,
      avgCostUsd: mean("costUsd"),
      avgDurationMs: mean("durationMs"),
      avgTokensIn: mean("tokensIn"),
      avgTokensOut: mean("tokensOut"),
      avgQuality: averageQuality(configRuns)
    };
  });
}

/**
 * Averages the quality fields of one group of runs.
 *
 * @param configRuns - Non-empty list of runs belonging to one config.
 * @returns Object with the mean of each `QUALITY_KEYS` field that is numeric
 *   on every run, or `undefined` when any run lacks a quality object or no
 *   field qualifies.
 */
function averageQuality(configRuns) {
  const qualities = [];
  for (const run of configRuns) {
    const quality = extractQuality(run.metrics);
    // One run without quality data disables quality averaging for the config.
    if (quality === void 0) return void 0;
    qualities.push(quality);
  }
  const avg = {};
  for (const key of QUALITY_KEYS) {
    const values = qualities.map((q) => q[key]);
    if (values.every((v) => typeof v === "number")) {
      avg[key] = values.reduce((a, b) => a + b, 0) / values.length;
    }
  }
  return Object.keys(avg).length > 0 ? avg : void 0;
}

/**
 * Reads `["static-quality"].quality` from a run's metrics.
 *
 * @param metrics - JSON string, parsed object, or nullish.
 * @returns The quality object, or `undefined` when metrics are missing,
 *   unparseable, or `quality` is not an object (including `null`, which would
 *   otherwise crash the per-key lookup in the caller).
 */
function extractQuality(metrics) {
  if (!metrics) return void 0;
  const parsed = typeof metrics === "string" ? safeParseJson(metrics) : metrics;
  const quality = parsed?.["static-quality"]?.quality;
  return quality !== null && typeof quality === "object" ? quality : void 0;
}

/**
 * Parses JSON without throwing.
 *
 * @param value - Text to parse.
 * @returns The parsed value, or `undefined` if `value` is not valid JSON.
 */
function safeParseJson(value) {
  try {
    return JSON.parse(value);
  } catch {
    // Malformed metrics are treated as "no metrics" rather than failing the whole aggregation.
    return void 0;
  }
}
|
|
132
|
+
|
|
133
|
+
// src/pareto.ts
|
|
134
|
+
/**
 * Returns the Pareto-optimal subset of `aggregates`.
 *
 * Objectives: `solveRate` (higher is better), `avgCostUsd` (lower is better)
 * and, only when every aggregate has a numeric
 * `avgQuality.linterViolations`, that value as well (lower is better).
 *
 * An aggregate is removed when some other aggregate is at least as good on
 * every objective and strictly better on at least one. Values that cannot be
 * ordered (`NaN`) make the pair incomparable, so neither side dominates the
 * other; previously such a pair dominated each other and both were dropped.
 *
 * @param aggregates - Candidates to compare; not modified.
 * @returns The non-dominated aggregates in input order.
 */
function paretoFront(aggregates) {
  if (aggregates.length === 0) return [];

  const objectives = [
    { get: (a) => a.solveRate, maximize: true },
    { get: (a) => a.avgCostUsd, maximize: false }
  ];
  // The linter objective is all-or-nothing: comparing configs where only some
  // have the metric would be meaningless.
  if (aggregates.every((a) => typeof a.avgQuality?.linterViolations === "number")) {
    objectives.push({ get: (a) => a.avgQuality.linterViolations, maximize: false });
  }

  /** True when `a` is no worse than `b` everywhere and strictly better somewhere. */
  const dominates = (a, b) => {
    let strictlyBetter = false;
    for (const { get, maximize } of objectives) {
      const av = get(a);
      const bv = get(b);
      const worse = maximize ? av < bv : av > bv;
      if (worse) return false;
      const better = maximize ? av > bv : av < bv;
      if (better) {
        strictlyBetter = true;
      } else if (av !== bv) {
        // Neither better, worse, nor equal: the values are unordered (NaN).
        return false;
      }
    }
    return strictlyBetter;
  };

  return aggregates.filter(
    (candidate) => !aggregates.some((other) => other !== candidate && dominates(other, candidate))
  );
}
|
|
157
|
+
|
|
158
|
+
export { MatrixDimensionsSchema, MatrixSchema, aggregateResults, expandMatrix, paretoFront };
|
package/package.json
ADDED
|
@@ -0,0 +1,33 @@
|
|
|
1
|
+
{
|
|
2
|
+
"name": "@agentgrader/optimizer",
|
|
3
|
+
"version": "0.1.0",
|
|
4
|
+
"description": "Architecture optimizer for the Agentgrader framework: agent-config matrix expansion, result aggregation, and Pareto-front analysis",
|
|
5
|
+
"license": "MIT",
|
|
6
|
+
"type": "module",
|
|
7
|
+
"main": "./dist/index.js",
|
|
8
|
+
"module": "./dist/index.js",
|
|
9
|
+
"types": "./dist/index.d.ts",
|
|
10
|
+
"exports": {
|
|
11
|
+
".": {
|
|
12
|
+
"import": "./dist/index.js",
|
|
13
|
+
"types": "./dist/index.d.ts"
|
|
14
|
+
}
|
|
15
|
+
},
|
|
16
|
+
"files": [
|
|
17
|
+
"dist"
|
|
18
|
+
],
|
|
19
|
+
"scripts": {
|
|
20
|
+
"build": "tsup src/index.ts --format esm --dts --clean --treeshake",
|
|
21
|
+
"build:watch": "tsup src/index.ts --format esm --dts --watch"
|
|
22
|
+
},
|
|
23
|
+
"dependencies": {
|
|
24
|
+
"@agentgrader/core": "^1.1.0",
|
|
25
|
+
"zod": "^3.23.8"
|
|
26
|
+
},
|
|
27
|
+
"devDependencies": {
|
|
28
|
+
"tsup": "^8.5.1"
|
|
29
|
+
},
|
|
30
|
+
"peerDependencies": {
|
|
31
|
+
"@agentgrader/core": "^1.1.0"
|
|
32
|
+
}
|
|
33
|
+
}
|