agency-lang 0.6.1 → 0.6.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/lib/agents/agency-agent/agent.js +3 -8
- package/dist/lib/agents/agency-agent/subagents/review.js +0 -2
- package/dist/lib/agents/docs/cli/cli/optimize.md +195 -72
- package/dist/lib/agents/docs/cli/optimize.md +195 -72
- package/dist/lib/agents/policy/agent.js +0 -1
- package/dist/lib/agents/review/agent.js +0 -1
- package/dist/lib/backends/typescriptBuilder.d.ts +1 -0
- package/dist/lib/backends/typescriptBuilder.js +9 -1
- package/package.json +1 -1
- package/stdlib/agency.agency +10 -10
- package/stdlib/agency.js +10 -10
- package/stdlib/cli.agency +2 -2
- package/stdlib/cli.js +2 -3
- package/stdlib/clipboard.agency +1 -1
- package/stdlib/clipboard.js +1 -2
- package/stdlib/http.agency +3 -3
- package/stdlib/http.js +3 -3
- package/stdlib/index.agency +13 -3
- package/stdlib/index.js +4 -4
- package/stdlib/keyring.agency +1 -1
- package/stdlib/keyring.js +1 -1
- package/stdlib/layout.agency +3 -3
- package/stdlib/layout.js +3 -3
- package/stdlib/memory.agency +2 -2
- package/stdlib/memory.js +2 -4
- package/stdlib/policy.js +9 -18
- package/stdlib/shell.agency +3 -3
- package/stdlib/shell.js +3 -3
- package/stdlib/statelog.agency +4 -4
- package/stdlib/statelog.js +4 -8
- package/stdlib/syntax.agency +1 -1
- package/stdlib/syntax.js +1 -2
- package/stdlib/table.agency +1 -1
- package/stdlib/table.js +1 -1
- package/stdlib/threads.agency +1 -1
- package/stdlib/threads.js +1 -1
- package/stdlib/ui.agency +6 -6
- package/stdlib/ui.js +6 -6
- package/stdlib/validators.agency +11 -11
- package/stdlib/validators.js +11 -22
|
@@ -1444,7 +1444,6 @@ async function __loadAgentsMd_impl(dir) {
|
|
|
1444
1444
|
]);
|
|
1445
1445
|
await runner.handle(2, async (__data) => approve(), async (runner2) => {
|
|
1446
1446
|
await runner2.step(0, async (runner3) => {
|
|
1447
|
-
__self.__retryable = false;
|
|
1448
1447
|
__stack.locals.result = await __call(read, {
|
|
1449
1448
|
type: "positional",
|
|
1450
1449
|
args: [`AGENTS.md`, __stack.args.dir]
|
|
@@ -1456,10 +1455,7 @@ async function __loadAgentsMd_impl(dir) {
|
|
|
1456
1455
|
}
|
|
1457
1456
|
});
|
|
1458
1457
|
});
|
|
1459
|
-
await runner.
|
|
1460
|
-
__self.__retryable = false;
|
|
1461
|
-
});
|
|
1462
|
-
await runner.ifElse(4, [
|
|
1458
|
+
await runner.ifElse(3, [
|
|
1463
1459
|
{
|
|
1464
1460
|
condition: async () => await isFailure(__stack.locals.result),
|
|
1465
1461
|
body: async (runner2) => {
|
|
@@ -1471,7 +1467,7 @@ async function __loadAgentsMd_impl(dir) {
|
|
|
1471
1467
|
}
|
|
1472
1468
|
}
|
|
1473
1469
|
]);
|
|
1474
|
-
await runner.step(
|
|
1470
|
+
await runner.step(4, async (runner2) => {
|
|
1475
1471
|
__functionCompleted = true;
|
|
1476
1472
|
runner2.halt(`
|
|
1477
1473
|
|
|
@@ -3147,7 +3143,6 @@ async function __printHeader_impl() {
|
|
|
3147
3143
|
}
|
|
3148
3144
|
});
|
|
3149
3145
|
await runner.step(2, async (runner2) => {
|
|
3150
|
-
__self.__retryable = false;
|
|
3151
3146
|
__stack.locals.data = await __call(box, {
|
|
3152
3147
|
type: "named",
|
|
3153
3148
|
positionalArgs: [],
|
|
@@ -4899,7 +4894,7 @@ Agent crashed: ${__error.message}`);
|
|
|
4899
4894
|
}
|
|
4900
4895
|
}
|
|
4901
4896
|
var stdin_default = graph;
|
|
4902
|
-
const __sourceMap = { "dist/lib/agents/agency-agent/agent.agency:__cb_top_0": { "1": { "line": 97, "col": 2 }, "1.0": { "line": 98, "col": 4 } }, "dist/lib/agents/agency-agent/agent.agency:__cb_top_1": { "1": { "line": 103, "col": 2 }, "1.0.0": { "line": 104, "col": 4 }, "1.0.1": { "line": 105, "col": 6 }, "1.0.2": { "line": 106, "col": 11 }, "1.0.3": { "line": 107, "col": 6 }, "1.0.4": { "line": 109, "col": 6 }, "1.0": { "line": 104, "col": 4 } }, "dist/lib/agents/agency-agent/agent.agency:__cb_top_2": { "1": { "line": 115, "col": 2 }, "1.0": { "line": 116, "col": 4 } }, "dist/lib/agents/agency-agent/agent.agency:__cb_top_3": { "1": { "line": 121, "col": 2 }, "1.0": { "line": 122, "col": 4 } }, "dist/lib/agents/agency-agent/agent.agency:_showTraces": { "1": { "line": 93, "col": 2 } }, "dist/lib/agents/agency-agent/agent.agency:renderLLMCallResponse": { "1": { "line": 142, "col": 2 }, "2": { "line": 143, "col": 2 }, "3": { "line": 146, "col": 2 }, "5": { "line": 151, "col": 2 }, "2.0": { "line": 144, "col": 4 }, "3.0.0": { "line": 148, "col": 6 }, "3.0": { "line": 147, "col": 4 } }, "dist/lib/agents/agency-agent/agent.agency:loadAgentsMd": { "1": { "line": 162, "col": 2 }, "2": { "line": 165, "col": 2 }, "
|
|
4897
|
+
const __sourceMap = { "dist/lib/agents/agency-agent/agent.agency:__cb_top_0": { "1": { "line": 97, "col": 2 }, "1.0": { "line": 98, "col": 4 } }, "dist/lib/agents/agency-agent/agent.agency:__cb_top_1": { "1": { "line": 103, "col": 2 }, "1.0.0": { "line": 104, "col": 4 }, "1.0.1": { "line": 105, "col": 6 }, "1.0.2": { "line": 106, "col": 11 }, "1.0.3": { "line": 107, "col": 6 }, "1.0.4": { "line": 109, "col": 6 }, "1.0": { "line": 104, "col": 4 } }, "dist/lib/agents/agency-agent/agent.agency:__cb_top_2": { "1": { "line": 115, "col": 2 }, "1.0": { "line": 116, "col": 4 } }, "dist/lib/agents/agency-agent/agent.agency:__cb_top_3": { "1": { "line": 121, "col": 2 }, "1.0": { "line": 122, "col": 4 } }, "dist/lib/agents/agency-agent/agent.agency:_showTraces": { "1": { "line": 93, "col": 2 } }, "dist/lib/agents/agency-agent/agent.agency:renderLLMCallResponse": { "1": { "line": 142, "col": 2 }, "2": { "line": 143, "col": 2 }, "3": { "line": 146, "col": 2 }, "5": { "line": 151, "col": 2 }, "2.0": { "line": 144, "col": 4 }, "3.0.0": { "line": 148, "col": 6 }, "3.0": { "line": 147, "col": 4 } }, "dist/lib/agents/agency-agent/agent.agency:loadAgentsMd": { "1": { "line": 162, "col": 2 }, "2": { "line": 165, "col": 2 }, "3": { "line": 166, "col": 2 }, "4": { "line": 169, "col": 2 }, "1.0": { "line": 163, "col": 4 }, "2.0": { "line": 165, "col": 2 }, "3.0": { "line": 167, "col": 4 } }, "dist/lib/agents/agency-agent/agent.agency:builtinPalette": { "1": { "line": 183, "col": 2 } }, "dist/lib/agents/agency-agent/agent.agency:mergedPalette": { "1": { "line": 195, "col": 2 }, "2": { "line": 196, "col": 2 }, "3": { "line": 203, "col": 2 }, "4": { "line": 204, "col": 2 }, "5": { "line": 207, "col": 2 }, "2.0": { "line": 197, "col": 4 }, "2.1.0": { "line": 199, "col": 6 }, "2.1": { "line": 198, "col": 4 }, "2.2": { "line": 201, "col": 4 }, "4.0": { "line": 205, "col": 4 } }, "dist/lib/agents/agency-agent/agent.agency:_runTurn": { "2": { "line": 218, "col": 2 }, "3": { "line": 219, "col": 2 
}, "4": { "line": 222, "col": 2 }, "5": { "line": 225, "col": 2 }, "7": { "line": 229, "col": 2 }, "9": { "line": 233, "col": 2 }, "11": { "line": 261, "col": 2 }, "12": { "line": 262, "col": 2 }, "14": { "line": 267, "col": 2 }, "3.0": { "line": 220, "col": 4 }, "4.0": { "line": 223, "col": 4 }, "5.0": { "line": 226, "col": 4 }, "5.1": { "line": 227, "col": 4 }, "7.0": { "line": 230, "col": 4 }, "7.1": { "line": 231, "col": 4 }, "9.1": { "line": 241, "col": 4 }, "9.2": { "line": 242, "col": 4 }, "9.3": { "line": 243, "col": 4 }, "9.4.0": { "line": 245, "col": 6 }, "9.4.1": { "line": 246, "col": 6 }, "9.4": { "line": 244, "col": 4 }, "9.5": { "line": 248, "col": 4 }, "9.6": { "line": 249, "col": 4 }, "9.7.0": { "line": 251, "col": 6 }, "9.7.1": { "line": 252, "col": 6 }, "9.7.2.0": { "line": 254, "col": 8 }, "9.7.2": { "line": 253, "col": 6 }, "9.7": { "line": 250, "col": 4 }, "9.9": { "line": 257, "col": 4 }, "12.0": { "line": 263, "col": 4 }, "12.1": { "line": 265, "col": 4 } }, "dist/lib/agents/agency-agent/agent.agency:mainAgent": { "1": { "line": 450, "col": 2 }, "3": { "line": 464, "col": 2 }, "1.0": { "line": 451, "col": 4 }, "1.1.1": { "line": 456, "col": 6 }, "1.1.2": { "line": 457, "col": 6 }, "1.1": { "line": 452, "col": 4 }, "1.3": { "line": 459, "col": 4 } }, "dist/lib/agents/agency-agent/agent.agency:agentReplyVia": { "1": { "line": 473, "col": 2 }, "2": { "line": 474, "col": 2 }, "4": { "line": 477, "col": 2 }, "6": { "line": 480, "col": 2 }, "8": { "line": 483, "col": 2 }, "10": { "line": 486, "col": 2 }, "12": { "line": 489, "col": 2 }, "2.0": { "line": 475, "col": 4 }, "4.0": { "line": 478, "col": 4 }, "6.0": { "line": 481, "col": 4 }, "8.0": { "line": 484, "col": 4 }, "10.0": { "line": 487, "col": 4 } }, "dist/lib/agents/agency-agent/agent.agency:agentReply": { "1": { "line": 499, "col": 2 } }, "dist/lib/agents/agency-agent/agent.agency:roundedCost": { "1": { "line": 503, "col": 2 } }, "dist/lib/agents/agency-agent/agent.agency:_buildStatus": { 
"1": { "line": 507, "col": 2 } }, "dist/lib/agents/agency-agent/agent.agency:sample": { "1": { "line": 515, "col": 2 } }, "dist/lib/agents/agency-agent/agent.agency:printHeader": { "1": { "line": 519, "col": 2 }, "2": { "line": 520, "col": 2 }, "3": { "line": 542, "col": 2 } }, "dist/lib/agents/agency-agent/agent.agency:__block_0": { "2.0": { "line": 527, "col": 4 } }, "dist/lib/agents/agency-agent/agent.agency:__block_1": { "2.0.0": { "line": 528, "col": 6 }, "2.0.1": { "line": 536, "col": 6 }, "2.0.2": { "line": 537, "col": 6 } }, "dist/lib/agents/agency-agent/agent.agency:__block_2": { "2.0.0.0": { "line": 529, "col": 8 }, "2.0.0.1": { "line": 530, "col": 8 }, "2.0.0.2": { "line": 531, "col": 8 }, "2.0.0.3": { "line": 532, "col": 8 }, "2.0.0.4": { "line": 533, "col": 8 }, "2.0.0.5": { "line": 534, "col": 8 } }, "dist/lib/agents/agency-agent/agent.agency:__block_3": { "2.0.2.0": { "line": 538, "col": 8 } }, "dist/lib/agents/agency-agent/agent.agency:givePolicyChoice": { "1": { "line": 546, "col": 2 }, "2": { "line": 547, "col": 2 }, "3": { "line": 548, "col": 2 }, "4": { "line": 559, "col": 2 }, "5": { "line": 560, "col": 2 } }, "dist/lib/agents/agency-agent/agent.agency:setupSession": { "2": { "line": 578, "col": 2 }, "3": { "line": 583, "col": 2 }, "4": { "line": 584, "col": 2 }, "5": { "line": 586, "col": 2 }, "6": { "line": 587, "col": 2 }, "8": { "line": 604, "col": 2 }, "3.0": { "line": 583, "col": 2 }, "6.0": { "line": 587, "col": 2 }, "6.1.0.0": { "line": 590, "col": 8 }, "6.1.0.1": { "line": 591, "col": 8 }, "6.1.0.2": { "line": 593, "col": 8 }, "6.1.0": { "line": 589, "col": 6 }, "6.1.2": { "line": 596, "col": 6 }, "6.1.3": { "line": 597, "col": 6 }, "6.1": { "line": 588, "col": 4 }, "6.3": { "line": 600, "col": 4 } }, "dist/lib/agents/agency-agent/agent.agency:oneShotAgent": { "1": { "line": 614, "col": 2 }, "2": { "line": 615, "col": 2 }, "3": { "line": 616, "col": 2 }, "4": { "line": 617, "col": 2 }, "5": { "line": 622, "col": 2 }, "4.0": { "line": 
618, "col": 4 } }, "dist/lib/agents/agency-agent/agent.agency:_runSeedTurn": { "1": { "line": 629, "col": 2 }, "2": { "line": 630, "col": 2 }, "3": { "line": 631, "col": 2 }, "3.0": { "line": 632, "col": 4 }, "3.1": { "line": 634, "col": 4 } }, "dist/lib/agents/agency-agent/agent.agency:startInteractive": { "1": { "line": 645, "col": 2 }, "3": { "line": 660, "col": 2 }, "1.0.0": { "line": 647, "col": 6 }, "1.0": { "line": 646, "col": 4 }, "1.1": { "line": 649, "col": 4 } }, "dist/lib/agents/agency-agent/agent.agency:main": { "2": { "line": 666, "col": 2 }, "3": { "line": 714, "col": 2 }, "4": { "line": 717, "col": 2 }, "6": { "line": 728, "col": 2 }, "7": { "line": 738, "col": 2 }, "8": { "line": 739, "col": 2 }, "9": { "line": 740, "col": 2 }, "10": { "line": 741, "col": 2 }, "11": { "line": 744, "col": 2 }, "13": { "line": 759, "col": 2 }, "15": { "line": 778, "col": 2 }, "16": { "line": 779, "col": 2 }, "18": { "line": 797, "col": 2 }, "19": { "line": 798, "col": 2 }, "20": { "line": 799, "col": 2 }, "21": { "line": 800, "col": 2 }, "22": { "line": 801, "col": 2 }, "3.0": { "line": 715, "col": 4 }, "4.0": { "line": 718, "col": 4 }, "11.0": { "line": 745, "col": 4 }, "11.1": { "line": 750, "col": 4 }, "13.0": { "line": 760, "col": 4 }, "13.1": { "line": 761, "col": 4 }, "13.2": { "line": 762, "col": 4 }, "13.3": { "line": 763, "col": 4 }, "13.4": { "line": 764, "col": 4 }, "13.5": { "line": 765, "col": 4 }, "16.0": { "line": 780, "col": 4 }, "16.1.0": { "line": 782, "col": 6 }, "16.1.1.0": { "line": 784, "col": 8 }, "16.1.1": { "line": 783, "col": 6 }, "16.1.2": { "line": 786, "col": 6 }, "16.1": { "line": 781, "col": 4 }, "16.2": { "line": 788, "col": 4 }, "16.3": { "line": 789, "col": 4 } } };
|
|
4903
4898
|
export {
|
|
4904
4899
|
__getCheckpoints,
|
|
4905
4900
|
__mainNodeParams,
|
|
@@ -1175,7 +1175,6 @@ async function ___typecheck_impl(agencyCode) {
|
|
|
1175
1175
|
});
|
|
1176
1176
|
});
|
|
1177
1177
|
await runner.step(1, async (runner2) => {
|
|
1178
|
-
__self.__retryable = false;
|
|
1179
1178
|
__stack.locals.result = await __call(typecheck, {
|
|
1180
1179
|
type: "positional",
|
|
1181
1180
|
args: [__stack.args.agencyCode]
|
|
@@ -1378,7 +1377,6 @@ async function ___parse_impl(agencyCode) {
|
|
|
1378
1377
|
});
|
|
1379
1378
|
});
|
|
1380
1379
|
await runner.step(1, async (runner2) => {
|
|
1381
|
-
__self.__retryable = false;
|
|
1382
1380
|
__stack.locals.result = await __call(parseAST, {
|
|
1383
1381
|
type: "positional",
|
|
1384
1382
|
args: [__stack.args.agencyCode]
|
|
@@ -5,117 +5,237 @@ description: Documents `agency eval optimize` — the eval-driven loop that rewr
|
|
|
5
5
|
|
|
6
6
|
# Optimizing agents
|
|
7
7
|
|
|
8
|
-
`agency
|
|
8
|
+
`agency optimize` improves an agent by rewriting your prompts for you.
|
|
9
9
|
|
|
10
|
-
|
|
11
|
-
|
|
12
|
-
|
|
13
|
-
|
|
10
|
+
For example, let's say you are writing an agent to return the capital of India. Here's your code:
|
|
11
|
+
|
|
12
|
+
```ts
|
|
13
|
+
node main() {
|
|
14
|
+
const prompt = "What is the capital of France?"
|
|
15
|
+
const response = llm(prompt)
|
|
16
|
+
return response
|
|
17
|
+
}
|
|
18
|
+
```
|
|
19
|
+
|
|
20
|
+
Notice that the prompt incorrectly asks for the capital of France. We're going to have the optimizer rewrite this prompt so that it asks about India instead. It's really easy to get started with the optimizer for a toy example like this. First, we need to mark the targets we want the optimizer to optimize:
|
|
21
|
+
|
|
22
|
+
```ts
|
|
23
|
+
node main() {
|
|
24
|
+
// added `optimize` to next line
|
|
25
|
+
optimize const prompt = "What is the capital of France?"
|
|
26
|
+
const response = llm(prompt)
|
|
27
|
+
return response
|
|
28
|
+
}
|
|
29
|
+
```
|
|
30
|
+
|
|
31
|
+
The only change needed is the `optimize` modifier on the `prompt` variable declaration. Now call the `optimize` command, giving it your agency file and a goal:
|
|
32
|
+
|
|
33
|
+
```
|
|
34
|
+
agency optimize foo.agency --goal 'Return the capital of India'
|
|
35
|
+
```
|
|
36
|
+
|
|
37
|
+
If you run this command, you'll see output similar to this:
|
|
38
|
+
|
|
39
|
+
```
|
|
40
|
+
grading:
|
|
41
|
+
- goal
|
|
42
|
+
first input: input-1 — goal: Return the capital of India
|
|
43
|
+
|
|
44
|
+
== optimize greedy (run demo-run): 1 target(s), 1 input(s), up to 5 iteration(s) ==
|
|
45
|
+
- bar.agency:main:prompt = "What is the capital of France?"
|
|
46
|
+
baseline objective 0.000
|
|
47
|
+
iter 1/5 accepted objective 1.000 (6.3s)
|
|
48
|
+
~ bar.agency:main:prompt:
|
|
49
|
+
- What is the capital of France?
|
|
50
|
+
+ What is the capital of India?
|
|
51
|
+
The change focuses on directly addressing the goal of retrieving the capital of India by modifying the prompt to reflect…
|
|
52
|
+
reached the maximum objective (1.000) — stopping early
|
|
53
|
+
|
|
54
|
+
== Optimized variables ==
|
|
55
|
+
~ bar.agency:main:prompt:
|
|
56
|
+
- What is the capital of France?
|
|
57
|
+
+ What is the capital of India?
|
|
58
|
+
|
|
59
|
+
Complete: champion iteration 1, accepted 1, rejected 0, invalid 0 (10.0s)
|
|
60
|
+
Optimize demo-run completed: 1 accepted, 0 rejected
|
|
14
61
|
```
|
|
15
62
|
|
|
16
|
-
|
|
63
|
+
You can put `optimize` on any string `const` or `let` to tell the optimizer to rewrite it.
|
|
64
|
+
|
|
65
|
+
## Inputs, graders, optimizers
|
|
66
|
+
|
|
67
|
+
The `--goal` flag makes it really easy to get started with the optimizer, but gives you limited control. Now let's look at a more real-world example. But first I need to explain how the optimizer works.
|
|
17
68
|
|
|
18
|
-
|
|
69
|
+
An optimization run involves three core pieces: inputs, graders, and the optimizer itself.
|
|
19
70
|
|
|
20
|
-
|
|
21
|
-
|
|
71
|
+
### Inputs
|
|
72
|
+
Inputs are examples you give to the optimizer. They are example input-output pairs.
|
|
22
73
|
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
|
|
74
|
+
For example, let's say we're optimizing this code:
|
|
75
|
+
|
|
76
|
+
```ts
|
|
77
|
+
node main(country) {
|
|
78
|
+
// note prompt incorrectly says "area" instead of "capital"
|
|
79
|
+
optimize const prompt = `What is the area of ${country}?`
|
|
80
|
+
const response = llm(prompt)
|
|
81
|
+
return response
|
|
27
82
|
}
|
|
28
83
|
```
|
|
29
84
|
|
|
30
|
-
|
|
85
|
+
It is very similar to the code we just saw, but now there's a `country` parameter for the node. We might give these inputs to the optimizer:
|
|
86
|
+
|
|
87
|
+
```
|
|
88
|
+
{
|
|
89
|
+
"inputs": [
|
|
90
|
+
{ "args": { "country": "India" }, "expected": "New Delhi" },
|
|
91
|
+
{ "args": { "country": "Japan" }, "expected": "Tokyo" },
|
|
92
|
+
{ "args": { "country": "Brazil" }, "expected": "Brasília" }
|
|
93
|
+
]
|
|
94
|
+
}
|
|
95
|
+
```
|
|
96
|
+
|
|
97
|
+
Save this as `inputs.json` and run the optimizer again:
|
|
98
|
+
|
|
99
|
+
```
|
|
100
|
+
agency optimize foo.agency --goal 'Return the capital of India' --inputs inputs.json
|
|
101
|
+
|
|
102
|
+
```
|
|
31
103
|
|
|
32
|
-
|
|
104
|
+
This will run the optimizer the same as earlier, except now it also has three example inputs to look at. The optimizer will run `foo.agency` once for each input. That means it will run your agent with `country` set to `"India"` for the first input, `"Japan"` for the second input, and so on, and look at the return value of the node each time.
|
|
33
105
|
|
|
34
|
-
You
|
|
106
|
+
You can optionally also provide other values:
|
|
35
107
|
|
|
36
|
-
```
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
|
|
108
|
+
```ts
|
|
109
|
+
export type Input = {
|
|
110
|
+
/** Unique id. Generated for you if not given.*/
|
|
111
|
+
id?: string;
|
|
112
|
+
/** What the agent should accomplish — read by the goal judge and the
|
|
113
|
+
* pairwise judge suite. This is a per-input goal.*/
|
|
114
|
+
goal?: string;
|
|
115
|
+
/** Entry node to run. Defaults to `main`. */
|
|
116
|
+
node?: string;
|
|
117
|
+
/** Freeform, grader-agnostic metadata (tags, expectedOutput, …). */
|
|
118
|
+
metadata?: Record<string, any>;
|
|
119
|
+
};
|
|
41
120
|
```
|
|
42
121
|
|
|
43
|
-
- `--inputs
|
|
44
|
-
|
|
45
|
-
|
|
122
|
+
Notice that you can pass in a per-input goal, or an overall goal, as we have been doing with the `--goal` flag. You can pass in either one or both, but at least one goal is required. The `--goal` flag only fills in goals for inputs that don't have their own; the two goals are never combined. So if an input already has a goal, the `--goal` flag's value won't be used for that input.
|
|
123
|
+
|
|
124
|
+
### Graders
|
|
125
|
+
So, we pass in an input, an expected output, and a goal to the optimizer. How does the optimizer compare the agent's actual output against the expected output? In our example with capitals, the expected output for India was `"New Delhi"`. What if the agent instead returned `"the capital of India is New Delhi"`? It's the job of the *grader* to decide how well the agent did. Let's look at some examples of graders.
|
|
126
|
+
|
|
127
|
+
#### ExactMatchGrader
|
|
128
|
+
Returns a binary pass-fail. Not the most useful grader, because it would give both of these the same score, which makes it hard for the optimizer to see if its changes to the agent are making any progress:
|
|
46
129
|
|
|
47
|
-
|
|
130
|
+
```
|
|
131
|
+
// these responses would get the same score:
|
|
132
|
+
response1 = "asdadasdasd"
|
|
133
|
+
response2 = "the capital of India is New Delhi"
|
|
134
|
+
```
|
|
48
135
|
|
|
49
|
-
|
|
136
|
+
#### ContainsGrader
|
|
137
|
+
Also returns a binary pass/fail like exact match, but this one checks to see if the expected output is anywhere in the response. Slightly better.
|
|
50
138
|
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
| `<file>[:<node>]` | Required agent target. A directory resolves to `main.agency`; the node defaults to `main`. |
|
|
54
|
-
| `--inputs <file\|dir>` | Input suite file or directory. |
|
|
55
|
-
| `--goal <text>` | Overall goal (combinable with `--inputs`; or a single inline input on its own). |
|
|
56
|
-
| `--graders <file>` | A TypeScript grading module that replaces the default goal judge. See [Custom graders](#custom-graders). |
|
|
57
|
-
| `--validation-inputs <file\|dir>` | Held-out validation suite. See [Validation sets](#validation-sets). |
|
|
58
|
-
| `--validation-split <ratio>` | Hold out this fraction of `--inputs` (seeded by `--seed`) when `--validation-inputs` is absent. |
|
|
59
|
-
| `--optimizer <name>` | `greedy` (default), `gepa`, or `example`. |
|
|
60
|
-
| `--iterations <n>` | Max candidate iterations after the baseline. Default `5`. |
|
|
61
|
-
| `--minibatch <n>` | GEPA minibatch size (gepa only). Default `8`. |
|
|
62
|
-
| `--seed <n>` | RNG seed for reproducible search / validation split. |
|
|
63
|
-
| `--mutator-model <model>` | Model override for proposing mutations. |
|
|
64
|
-
| `--no-writeback` | Don't write the champion back to the source files. |
|
|
65
|
-
| `--silent` | Print nothing; artifacts are still written. |
|
|
66
|
-
| `--run-id <id>` | Output run id (must not already exist). |
|
|
67
|
-
| `--runs-dir <path>` | Output root. Defaults to `eval.optimizeRunsDir`, then `eval.runsDir/optimize`, then `runs/optimize`. |
|
|
139
|
+
#### SimilarityGrader
|
|
140
|
+
Calculates the Levenshtein distance between the response and the expected output and returns a score between 0 and 1 (0 = no match, 1 = perfect match).
|
|
68
141
|
|
|
69
|
-
|
|
142
|
+
#### LLM Judge
|
|
143
|
+
Asks an LLM to return a score between 0 and 1 (0 = no match, 1 = perfect match) for how well the response matches the expected output.
|
|
70
144
|
|
|
71
|
-
|
|
145
|
+
This is the default grader.
|
|
72
146
|
|
|
73
|
-
|
|
147
|
+
### Custom graders
|
|
74
148
|
|
|
75
|
-
|
|
149
|
+
So far, we have just been using the LLM Judge, which is the default grader. But we can also specify a custom grader using the `--graders` flag.
|
|
150
|
+
|
|
151
|
+
First write a grader file:
|
|
76
152
|
|
|
77
153
|
```ts
|
|
78
|
-
|
|
154
|
+
// graders.ts
|
|
155
|
+
import { type Grader } from "agency-lang/optimize";
|
|
156
|
+
|
|
157
|
+
// `input` is the typed Input; the gold answer is at `input.expected`
|
|
158
|
+
// `output` is the actual response from your agent.
|
|
159
|
+
const exact: Grader = ({ output, input }) => {
|
|
160
|
+
// return a number (0..1), a boolean, or a Grade
|
|
161
|
+
return output === input.expected ? 1 : 0;
|
|
162
|
+
}
|
|
163
|
+
|
|
164
|
+
export default exact;
|
|
165
|
+
```
|
|
166
|
+
|
|
167
|
+
Use the grader:
|
|
168
|
+
|
|
169
|
+
```
|
|
170
|
+
agency optimize foo.agency --goal 'Return the capital of India' --graders graders.ts
|
|
171
|
+
```
|
|
172
|
+
|
|
173
|
+
That's a really simple example where we're writing a custom function to use as the grader. It's an exact match function which, as we know, isn't very good. We can easily change this though. Let's see some options.
|
|
79
174
|
|
|
80
|
-
|
|
81
|
-
// `input` is the typed Input; the gold answer is `input.expected`
|
|
82
|
-
// (extra per-input data can also live under `input.metadata`).
|
|
83
|
-
const exact: Grader = ({ output, input }) =>
|
|
84
|
-
output === input.expected ? 1 : 0; // return a number (0..1), a boolean, or a Grade
|
|
175
|
+
We could call an LLM judge, passing it a custom judge prompt:
|
|
85
176
|
|
|
86
|
-
|
|
177
|
+
```ts
|
|
178
|
+
import { scalar, type Grader } from "agency-lang/optimize";
|
|
87
179
|
const judged: Grader = async ({ output, input, judge }) => {
|
|
88
|
-
const v = await judge({ goal:
|
|
89
|
-
|
|
180
|
+
const v = await judge({ goal:
|
|
181
|
+
`Hi this is my custom LLM judge prompt. The output should match this expected value: ${input.expected}.`,
|
|
182
|
+
output
|
|
183
|
+
});
|
|
184
|
+
|
|
185
|
+
// Agency func to return a scalar score + reasoning for the score.
|
|
186
|
+
// Generates something like:
|
|
187
|
+
//
|
|
188
|
+
// ```
|
|
189
|
+
// { score: { kind: "scalar", value: v.score }, feedback: v.reasoning }
|
|
190
|
+
// ```
|
|
191
|
+
return scalar(v.score, v.reasoning);
|
|
90
192
|
};
|
|
193
|
+
```
|
|
91
194
|
|
|
92
|
-
|
|
93
|
-
const gate = grader(exact, { mustPass: true, name: "capital-exact" });
|
|
94
|
-
|
|
95
|
-
// (c) a configured built-in — matchOn defaults to ["expected"]
|
|
96
|
-
const has = new Contains({}); // output contains input.expected
|
|
97
|
-
const judge = new LlmJudge({ goal: "Return the capital.", samples: 3 });
|
|
195
|
+
We could use a built-in grader:
|
|
98
196
|
|
|
99
|
-
|
|
197
|
+
```ts
|
|
198
|
+
import { Contains } from "agency-lang/optimize";
|
|
199
|
+
export default (new Contains({}));
|
|
100
200
|
```
|
|
101
201
|
|
|
102
|
-
|
|
202
|
+
Instead of a single grader, we can also export an array of graders:
|
|
103
203
|
|
|
104
|
-
|
|
204
|
+
```ts
|
|
205
|
+
import { Contains, Grader, scalar } from "agency-lang/optimize";
|
|
105
206
|
|
|
106
|
-
|
|
207
|
+
const judged: Grader = async ({ output, input, judge }) => {
|
|
208
|
+
const v = await judge({
|
|
209
|
+
goal:
|
|
210
|
+
`Hi this is my custom LLM judge prompt. The output should match this expected value: ${input.expected}.`,
|
|
211
|
+
output
|
|
212
|
+
});
|
|
107
213
|
|
|
108
|
-
|
|
214
|
+
return scalar(v.score, v.reasoning);
|
|
215
|
+
};
|
|
109
216
|
|
|
110
|
-
|
|
217
|
+
export default [new Contains({}), judged];
|
|
218
|
+
```
|
|
111
219
|
|
|
112
|
-
|
|
220
|
+
Finally, you can use the `grader` function to wrap a custom function and supply some metadata:
|
|
113
221
|
|
|
114
|
-
|
|
222
|
+
```ts
|
|
223
|
+
// use the `exact` function as the grader.
|
|
224
|
+
// mustPass = if this grader fails, consider this entire iteration failed.
|
|
225
|
+
// name = shown in debug output.
|
|
226
|
+
const gate = grader(exact, { mustPass: true, name: "capital-exact" });
|
|
227
|
+
```
|
|
228
|
+
|
|
229
|
+
To recap:
|
|
230
|
+
- A grading module **default-exports one grader or an array of graders**.
|
|
231
|
+
- A metric function returns a **number** (0..1 scalar), a **boolean** (1.0/0.0), or a full **Grade**. For a Grade with feedback, the `scalar(value, feedback?)` and `binary(pass, feedback?)` constructors are the ergonomic way to build one.
|
|
232
|
+
|
|
233
|
+
#### How grades become the objective
|
|
234
|
+
Every grade counts: a number contributes its value (0..1), and a boolean / `ExactMatch` / `Contains` result contributes `1.0` (pass) or `0.0` (fail) — so a binary-only grader gives you plain accuracy. The objective for an input is the weighted mean of its grades, and the run objective is the mean across inputs. `mustPass` is an orthogonal **gate**: a failed `mustPass` grader zeroes that input regardless of its other grades.
|
|
115
235
|
|
|
116
236
|
## Validation sets
|
|
117
237
|
|
|
118
|
-
Pass `--validation-inputs <file|dir>` to grade the champion against held-out inputs, or `--validation-split <ratio>` to hold out a seeded fraction of `--inputs`. Search and candidate acceptance run on the **training** inputs; with the default `greedy` optimizer the champion written back is the one with the best **validation** objective, and `report.md` shows train-vs-validation side by side so an overfit prompt (high train, flat validation) is visible.
|
|
238
|
+
Pass `--validation-inputs <file|dir>` to grade the champion against held-out inputs, or `--validation-split <ratio>` to hold out a seeded fraction of `--inputs`. Search and candidate acceptance run on the **training** inputs; with the default `greedy` optimizer the champion written back is the one with the best **validation** objective, and `report.md` shows train-vs-validation side by side so an overfit prompt (high train, flat validation) is visible.
|
|
119
239
|
|
|
120
240
|
## Configuration
|
|
121
241
|
|
|
@@ -152,6 +272,9 @@ runs/optimize/<run-id>/
|
|
|
152
272
|
|
|
153
273
|
By default the optimizer also prints progress to the console (the resolved grading setup, per-iteration decisions, and the start→end value of every optimized variable). `--silent` suppresses console output; artifacts are still written.
|
|
154
274
|
|
|
275
|
+
## Optimizers
|
|
276
|
+
Agency comes with two built-in optimizers, `greedy` and `gepa`. `greedy` is the default. You can specify the optimizer using the `--optimizer` flag. You can also write your own optimizers.
|
|
277
|
+
|
|
155
278
|
## Writing your own optimizer
|
|
156
279
|
|
|
157
280
|
`greedy`, `gepa`, and `example` are built on a shared `BaseOptimizer`, which you can extend. Write a module that default-exports a **factory** `(config) => Optimizer`, then point `--optimizer` (or `eval.optimize.optimizer`) at its path — exactly like `--graders`:
|
|
@@ -178,4 +301,4 @@ agency optimize foo.agency --inputs inputs.json --optimizer ./myOptimizer.ts
|
|
|
178
301
|
|
|
179
302
|
## Notes
|
|
180
303
|
|
|
181
|
-
The CLI installs an approval handler for the internal `std::agency.run(...)` calls used by eval execution. The stdlib `agency.eval.optimize(...)` function does **not** install a handler; Agency callers should wrap it in their own handler when they want auto-approval.
|
|
304
|
+
The CLI installs an approval handler for the internal `std::agency.run(...)` calls used by eval execution. The stdlib `agency.eval.optimize(...)` function does **not** install a handler; Agency callers should wrap it in their own handler when they want auto-approval.
|