agentv 0.16.0 → 0.18.0

This diff compares the contents of publicly released package versions as published to a supported registry. It is provided for informational purposes only and reflects the packages exactly as they appear in their public registries.
@@ -34,9 +34,9 @@ var __toESM = (mod, isNodeMode, target) => (target = mod != null ? __create(__ge
    mod
  ));

- // ../../node_modules/.pnpm/@vercel+oidc@3.0.5/node_modules/@vercel/oidc/dist/token-error.js
+ // ../../node_modules/.bun/@vercel+oidc@3.0.5/node_modules/@vercel/oidc/dist/token-error.js
  var require_token_error = __commonJS({
-   "../../node_modules/.pnpm/@vercel+oidc@3.0.5/node_modules/@vercel/oidc/dist/token-error.js"(exports, module) {
+   "../../node_modules/.bun/@vercel+oidc@3.0.5/node_modules/@vercel/oidc/dist/token-error.js"(exports, module) {
      "use strict";
      var __defProp2 = Object.defineProperty;
      var __getOwnPropDesc2 = Object.getOwnPropertyDescriptor;
@@ -83,4 +83,4 @@ export {
    __toESM,
    require_token_error
  };
- //# sourceMappingURL=chunk-7XYYGJAC.js.map
+ //# sourceMappingURL=chunk-UE4GLFVL.js.map
@@ -0,0 +1 @@
+ {"version":3,"sources":["../../../node_modules/.bun/@vercel+oidc@3.0.5/node_modules/@vercel/oidc/dist/token-error.js"],"sourcesContent":["\"use strict\";\nvar __defProp = Object.defineProperty;\nvar __getOwnPropDesc = Object.getOwnPropertyDescriptor;\nvar __getOwnPropNames = Object.getOwnPropertyNames;\nvar __hasOwnProp = Object.prototype.hasOwnProperty;\nvar __export = (target, all) => {\n for (var name in all)\n __defProp(target, name, { get: all[name], enumerable: true });\n};\nvar __copyProps = (to, from, except, desc) => {\n if (from && typeof from === \"object\" || typeof from === \"function\") {\n for (let key of __getOwnPropNames(from))\n if (!__hasOwnProp.call(to, key) && key !== except)\n __defProp(to, key, { get: () => from[key], enumerable: !(desc = __getOwnPropDesc(from, key)) || desc.enumerable });\n }\n return to;\n};\nvar __toCommonJS = (mod) => __copyProps(__defProp({}, \"__esModule\", { value: true }), mod);\nvar token_error_exports = {};\n__export(token_error_exports, {\n VercelOidcTokenError: () => VercelOidcTokenError\n});\nmodule.exports = __toCommonJS(token_error_exports);\nclass VercelOidcTokenError extends Error {\n constructor(message, cause) {\n super(message);\n this.name = \"VercelOidcTokenError\";\n this.cause = cause;\n }\n toString() {\n if (this.cause) {\n return `${this.name}: ${this.message}: ${this.cause}`;\n }\n return `${this.name}: ${this.message}`;\n }\n}\n// Annotate the CommonJS export names for ESM import in node:\n0 && (module.exports = {\n VercelOidcTokenError\n});\n"],"mappings":";;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;AAAA;AAAA;AAAA;AACA,QAAIA,aAAY,OAAO;AACvB,QAAIC,oBAAmB,OAAO;AAC9B,QAAIC,qBAAoB,OAAO;AAC/B,QAAIC,gBAAe,OAAO,UAAU;AACpC,QAAIC,YAAW,CAAC,QAAQ,QAAQ;AAC9B,eAAS,QAAQ;AACf,QAAAJ,WAAU,QAAQ,MAAM,EAAE,KAAK,IAAI,IAAI,GAAG,YAAY,KAAK,CAAC;AAAA,IAChE;AACA,QAAIK,eAAc,CAAC,IAAI,MAAM,QAAQ,SAAS;AAC5C,UAAI,QAAQ,OAAO,SAAS,YAAY,OAAO,SAAS,YAAY;AAClE,iBAAS,OAAOH,mBAAkB,IAAI;AACpC,cAAI,CAACC,cAAa,KAAK,IAAI,GAAG,KAAK,QAAQ;AACzC,YAAAH,WAAU,IAAI,KAAK,EAAE,KAAK,MAAM,KAAK,GAAG,GAAG,YAAY,EAAE,OAAOC,kBAAiB,MAAM,GAAG,MAAM,KAAK,WAAW,CAAC;AAAA,MACvH;AACA,aAAO;AAAA,IACT;AACA,QAAI,eAAe,CAAC,QAAQI,aAAYL,WAAU,CAAC,GAAG,cAAc,EAAE,OAAO,KAAK,CAAC,GAAG,GAAG;AACzF,QAAI,sBAAsB,CAAC;AAC3B,IAAAI,UAAS,qBAAqB;AAAA,MAC5B,sBAAsB,MAAM;AAAA,IAC9B,CAAC;AACD,WAAO,UAAU,aAAa,mBAAmB;AACjD,QAAM,uBAAN,cAAmC,MAAM;AAAA,MACvC,YAAY,SAAS,OAAO;AAC1B,cAAM,OAAO;AACb,aAAK,OAAO;AACZ,aAAK,QAAQ;AAAA,MACf;AAAA,MACA,WAAW;AACT,YAAI,KAAK,OAAO;AACd,iBAAO,GAAG,KAAK,IAAI,KAAK,KAAK,OAAO,KAAK,KAAK,KAAK;AAAA,QACrD;AACA,eAAO,GAAG,KAAK,IAAI,KAAK,KAAK,OAAO;AAAA,MACtC;AAAA,IACF;AAAA;AAAA;","names":["__defProp","__getOwnPropDesc","__getOwnPropNames","__hasOwnProp","__export","__copyProps"]}
package/dist/cli.js CHANGED
@@ -1,8 +1,8 @@
  #!/usr/bin/env node
  import {
    runCli
- } from "./chunk-HWGALLUR.js";
- import "./chunk-7XYYGJAC.js";
+ } from "./chunk-PHILDJ3W.js";
+ import "./chunk-UE4GLFVL.js";

  // src/cli.ts
  void runCli();
package/dist/cli.js.map CHANGED
@@ -1 +1 @@
- {"version":3,"sources":["../src/cli.ts"],"sourcesContent":["#!/usr/bin/env node\r\nimport { runCli } from './index.js';\r\n\r\nvoid runCli();\r\n"],"mappings":";;;;;;;AAGA,KAAK,OAAO;","names":[]}
+ {"version":3,"sources":["../src/cli.ts"],"sourcesContent":["#!/usr/bin/env node\r\nimport { runCli } from \"./index.js\";\r\n\r\nvoid runCli();\r\n"],"mappings":";;;;;;;AAGA,KAAK,OAAO;","names":[]}
package/dist/index.js CHANGED
@@ -1,8 +1,8 @@
  import {
    createProgram,
    runCli
- } from "./chunk-HWGALLUR.js";
- import "./chunk-7XYYGJAC.js";
+ } from "./chunk-PHILDJ3W.js";
+ import "./chunk-UE4GLFVL.js";
  export {
    createProgram,
    runCli
@@ -1,27 +1,27 @@
- {
-   "$schema": "http://json-schema.org/draft-07/schema#",
-   "title": "AgentV Config Schema",
-   "description": "Schema for .agentv/config.yaml configuration files",
-   "type": "object",
-   "properties": {
-     "$schema": {
-       "type": "string",
-       "description": "Schema identifier",
-       "enum": ["agentv-config-v2"]
-     },
-     "guideline_patterns": {
-       "type": "array",
-       "description": "Glob patterns for identifying guideline files (instructions, prompts). Files matching these patterns are treated as guidelines, while non-matching files are treated as regular file content.",
-       "items": {
-         "type": "string",
-         "description": "Glob pattern (e.g., '**/*.instructions.md', '**/prompts/**')"
-       },
-       "examples": [
-         ["**/*.instructions.md", "**/instructions/**", "**/*.prompt.md", "**/prompts/**"],
-         ["**/*.guide.md", "**/guidelines/**", "docs/AGENTS.md"]
-       ]
-     }
-   },
-   "required": ["$schema"],
-   "additionalProperties": false
- }
+ {
+   "$schema": "http://json-schema.org/draft-07/schema#",
+   "title": "AgentV Config Schema",
+   "description": "Schema for .agentv/config.yaml configuration files",
+   "type": "object",
+   "properties": {
+     "$schema": {
+       "type": "string",
+       "description": "Schema identifier",
+       "enum": ["agentv-config-v2"]
+     },
+     "guideline_patterns": {
+       "type": "array",
+       "description": "Glob patterns for identifying guideline files (instructions, prompts). Files matching these patterns are treated as guidelines, while non-matching files are treated as regular file content.",
+       "items": {
+         "type": "string",
+         "description": "Glob pattern (e.g., '**/*.instructions.md', '**/prompts/**')"
+       },
+       "examples": [
+         ["**/*.instructions.md", "**/instructions/**", "**/*.prompt.md", "**/prompts/**"],
+         ["**/*.guide.md", "**/guidelines/**", "docs/AGENTS.md"]
+       ]
+     }
+   },
+   "required": ["$schema"],
+   "additionalProperties": false
+ }
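For reference, a minimal `.agentv/config.yaml` accepted by this schema might look like the sketch below. Only `$schema` is required; the pattern values are illustrative and borrowed from the schema's own `examples`.

```yaml
# Hypothetical config conforming to the agentv-config-v2 schema above.
# Pattern values mirror the schema's "examples" entries.
$schema: agentv-config-v2
guideline_patterns:
  - "**/*.instructions.md"
  - "**/instructions/**"
  - "**/*.prompt.md"
  - "**/prompts/**"
```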
@@ -1,217 +1,217 @@
- {
-   "$schema": "http://json-schema.org/draft-07/schema#",
-   "title": "AgentV Eval Schema",
-   "description": "Schema for YAML evaluation files with conversation flows, multiple evaluators, and execution configuration",
-   "type": "object",
-   "properties": {
-     "$schema": {
-       "type": "string",
-       "description": "Schema identifier",
-       "enum": ["agentv-eval-v2"]
-     },
-     "description": {
-       "type": "string",
-       "description": "Description of what this eval suite covers"
-     },
-     "target": {
-       "type": "string",
-       "description": "(Deprecated: use execution.target instead) Default target configuration name. Can be overridden per eval case."
-     },
-     "execution": {
-       "type": "object",
-       "description": "Default execution configuration for all eval cases (can be overridden per case)",
-       "properties": {
-         "target": {
-           "type": "string",
-           "description": "Default target configuration name (e.g., default, azure_base, vscode_projectx). Can be overridden per eval case."
-         },
-         "evaluators": {
-           "type": "array",
-           "description": "Default evaluators for all eval cases (code-based and LLM judges)",
-           "items": {
-             "type": "object",
-             "properties": {
-               "name": {
-                 "type": "string",
-                 "description": "Evaluator name/identifier"
-               },
-               "type": {
-                 "type": "string",
-                 "enum": ["code", "llm_judge"],
-                 "description": "Evaluator type: 'code' for scripts/regex/keywords, 'llm_judge' for LLM-based evaluation"
-               },
-               "script": {
-                 "type": "string",
-                 "description": "Path to evaluator script (for type: code)"
-               },
-               "prompt": {
-                 "type": "string",
-                 "description": "Path to judge prompt file (for type: llm_judge)"
-               }
-             },
-             "required": ["name", "type"],
-             "additionalProperties": true
-           }
-         }
-       },
-       "additionalProperties": true
-     },
-     "evalcases": {
-       "type": "array",
-       "description": "Array of evaluation cases",
-       "minItems": 1,
-       "items": {
-         "type": "object",
-         "properties": {
-           "id": {
-             "type": "string",
-             "description": "Unique identifier for the eval case"
-           },
-           "conversation_id": {
-             "type": "string",
-             "description": "Optional conversation identifier for threading multiple eval cases together"
-           },
-           "outcome": {
-             "type": "string",
-             "description": "Description of what the AI should accomplish in this eval"
-           },
-           "note": {
-             "type": "string",
-             "description": "Optional note or additional context for the eval case. Use this to document test-specific considerations, known limitations, or rationale for expected behavior."
-           },
-           "input_messages": {
-             "type": "array",
-             "description": "Input messages for the conversation",
-             "minItems": 1,
-             "items": {
-               "type": "object",
-               "properties": {
-                 "role": {
-                   "type": "string",
-                   "enum": ["system", "user", "assistant", "tool"],
-                   "description": "Message role"
-                 },
-                 "content": {
-                   "oneOf": [
-                     {
-                       "type": "string",
-                       "description": "Simple text content"
-                     },
-                     {
-                       "type": "array",
-                       "description": "Mixed content items (text and file references)",
-                       "items": {
-                         "type": "object",
-                         "properties": {
-                           "type": {
-                             "type": "string",
-                             "enum": ["text", "file"],
-                             "description": "Content type: 'text' for inline content, 'file' for file references"
-                           },
-                           "value": {
-                             "type": "string",
-                             "description": "Text content or file path. Relative paths (e.g., ../prompts/file.md) are resolved from eval file directory. Absolute paths (e.g., /docs/examples/prompts/file.md) are resolved from repo root."
-                           }
-                         },
-                         "required": ["type", "value"],
-                         "additionalProperties": false
-                       }
-                     }
-                   ]
-                 }
-               },
-               "required": ["role", "content"],
-               "additionalProperties": false
-             }
-           },
-           "expected_messages": {
-             "type": "array",
-             "description": "Expected response messages",
-             "minItems": 1,
-             "items": {
-               "type": "object",
-               "properties": {
-                 "role": {
-                   "type": "string",
-                   "enum": ["system", "user", "assistant", "tool"],
-                   "description": "Message role"
-                 },
-                 "content": {
-                   "oneOf": [
-                     {
-                       "type": "string",
-                       "description": "Simple text content"
-                     },
-                     {
-                       "type": "array",
-                       "description": "Mixed content items",
-                       "items": {
-                         "type": "object",
-                         "properties": {
-                           "type": {
-                             "type": "string",
-                             "enum": ["text", "file"]
-                           },
-                           "value": {
-                             "type": "string"
-                           }
-                         },
-                         "required": ["type", "value"],
-                         "additionalProperties": false
-                       }
-                     }
-                   ]
-                 }
-               },
-               "required": ["role", "content"],
-               "additionalProperties": false
-             }
-           },
-           "execution": {
-             "type": "object",
-             "description": "Per-case execution configuration",
-             "properties": {
-               "target": {
-                 "type": "string",
-                 "description": "Override target for this specific eval case"
-               },
-               "evaluators": {
-                 "type": "array",
-                 "description": "Multiple evaluators (code-based and LLM judges)",
-                 "items": {
-                   "type": "object",
-                   "properties": {
-                     "name": {
-                       "type": "string",
-                       "description": "Evaluator name/identifier"
-                     },
-                     "type": {
-                       "type": "string",
-                       "enum": ["code", "llm_judge"],
-                       "description": "Evaluator type: 'code' for scripts/regex/keywords, 'llm_judge' for LLM-based evaluation"
-                     },
-                     "script": {
-                       "type": "string",
-                       "description": "Path to evaluator script (for type: code)"
-                     },
-                     "prompt": {
-                       "type": "string",
-                       "description": "Path to judge prompt file (for type: llm_judge)"
-                     }
-                   },
-                   "required": ["name", "type"],
-                   "additionalProperties": true
-                 }
-               }
-             },
-             "additionalProperties": true
-           }
-         },
-         "required": ["id", "outcome", "input_messages", "expected_messages"],
-         "additionalProperties": false
-       }
-     }
-   },
-   "required": ["evalcases"],
-   "additionalProperties": false
- }
+ {
+   "$schema": "http://json-schema.org/draft-07/schema#",
+   "title": "AgentV Eval Schema",
+   "description": "Schema for YAML evaluation files with conversation flows, multiple evaluators, and execution configuration",
+   "type": "object",
+   "properties": {
+     "$schema": {
+       "type": "string",
+       "description": "Schema identifier",
+       "enum": ["agentv-eval-v2"]
+     },
+     "description": {
+       "type": "string",
+       "description": "Description of what this eval suite covers"
+     },
+     "target": {
+       "type": "string",
+       "description": "(Deprecated: use execution.target instead) Default target configuration name. Can be overridden per eval case."
+     },
+     "execution": {
+       "type": "object",
+       "description": "Default execution configuration for all eval cases (can be overridden per case)",
+       "properties": {
+         "target": {
+           "type": "string",
+           "description": "Default target configuration name (e.g., default, azure_base, vscode_projectx). Can be overridden per eval case."
+         },
+         "evaluators": {
+           "type": "array",
+           "description": "Default evaluators for all eval cases (code-based and LLM judges)",
+           "items": {
+             "type": "object",
+             "properties": {
+               "name": {
+                 "type": "string",
+                 "description": "Evaluator name/identifier"
+               },
+               "type": {
+                 "type": "string",
+                 "enum": ["code", "llm_judge"],
+                 "description": "Evaluator type: 'code' for scripts/regex/keywords, 'llm_judge' for LLM-based evaluation"
+               },
+               "script": {
+                 "type": "string",
+                 "description": "Path to evaluator script (for type: code)"
+               },
+               "prompt": {
+                 "type": "string",
+                 "description": "Path to judge prompt file (for type: llm_judge)"
+               }
+             },
+             "required": ["name", "type"],
+             "additionalProperties": true
+           }
+         }
+       },
+       "additionalProperties": true
+     },
+     "evalcases": {
+       "type": "array",
+       "description": "Array of evaluation cases",
+       "minItems": 1,
+       "items": {
+         "type": "object",
+         "properties": {
+           "id": {
+             "type": "string",
+             "description": "Unique identifier for the eval case"
+           },
+           "conversation_id": {
+             "type": "string",
+             "description": "Optional conversation identifier for threading multiple eval cases together"
+           },
+           "outcome": {
+             "type": "string",
+             "description": "Description of what the AI should accomplish in this eval"
+           },
+           "note": {
+             "type": "string",
+             "description": "Optional note or additional context for the eval case. Use this to document test-specific considerations, known limitations, or rationale for expected behavior."
+           },
+           "input_messages": {
+             "type": "array",
+             "description": "Input messages for the conversation",
+             "minItems": 1,
+             "items": {
+               "type": "object",
+               "properties": {
+                 "role": {
+                   "type": "string",
+                   "enum": ["system", "user", "assistant", "tool"],
+                   "description": "Message role"
+                 },
+                 "content": {
+                   "oneOf": [
+                     {
+                       "type": "string",
+                       "description": "Simple text content"
+                     },
+                     {
+                       "type": "array",
+                       "description": "Mixed content items (text and file references)",
+                       "items": {
+                         "type": "object",
+                         "properties": {
+                           "type": {
+                             "type": "string",
+                             "enum": ["text", "file"],
+                             "description": "Content type: 'text' for inline content, 'file' for file references"
+                           },
+                           "value": {
+                             "type": "string",
+                             "description": "Text content or file path. Relative paths (e.g., ../prompts/file.md) are resolved from eval file directory. Absolute paths (e.g., /docs/examples/prompts/file.md) are resolved from repo root."
+                           }
+                         },
+                         "required": ["type", "value"],
+                         "additionalProperties": false
+                       }
+                     }
+                   ]
+                 }
+               },
+               "required": ["role", "content"],
+               "additionalProperties": false
+             }
+           },
+           "expected_messages": {
+             "type": "array",
+             "description": "Expected response messages",
+             "minItems": 1,
+             "items": {
+               "type": "object",
+               "properties": {
+                 "role": {
+                   "type": "string",
+                   "enum": ["system", "user", "assistant", "tool"],
+                   "description": "Message role"
+                 },
+                 "content": {
+                   "oneOf": [
+                     {
+                       "type": "string",
+                       "description": "Simple text content"
+                     },
+                     {
+                       "type": "array",
+                       "description": "Mixed content items",
+                       "items": {
+                         "type": "object",
+                         "properties": {
+                           "type": {
+                             "type": "string",
+                             "enum": ["text", "file"]
+                           },
+                           "value": {
+                             "type": "string"
+                           }
+                         },
+                         "required": ["type", "value"],
+                         "additionalProperties": false
+                       }
+                     }
+                   ]
+                 }
+               },
+               "required": ["role", "content"],
+               "additionalProperties": false
+             }
+           },
+           "execution": {
+             "type": "object",
+             "description": "Per-case execution configuration",
+             "properties": {
+               "target": {
+                 "type": "string",
+                 "description": "Override target for this specific eval case"
+               },
+               "evaluators": {
+                 "type": "array",
+                 "description": "Multiple evaluators (code-based and LLM judges)",
+                 "items": {
+                   "type": "object",
+                   "properties": {
+                     "name": {
+                       "type": "string",
+                       "description": "Evaluator name/identifier"
+                     },
+                     "type": {
+                       "type": "string",
+                       "enum": ["code", "llm_judge"],
+                       "description": "Evaluator type: 'code' for scripts/regex/keywords, 'llm_judge' for LLM-based evaluation"
+                     },
+                     "script": {
+                       "type": "string",
+                       "description": "Path to evaluator script (for type: code)"
+                     },
+                     "prompt": {
+                       "type": "string",
+                       "description": "Path to judge prompt file (for type: llm_judge)"
+                     }
+                   },
+                   "required": ["name", "type"],
+                   "additionalProperties": true
+                 }
+               }
+             },
+             "additionalProperties": true
+           }
+         },
+         "required": ["id", "outcome", "input_messages", "expected_messages"],
+         "additionalProperties": false
+       }
+     }
+   },
+   "required": ["evalcases"],
+   "additionalProperties": false
+ }
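To make the shape concrete, here is a minimal sketch of an eval file this schema accepts. The target name, case id, evaluator name, file paths, and message text are all hypothetical; only the keys and enum values come from the schema above.

```yaml
# Hypothetical agentv-eval-v2 file; ids, target, and paths are invented for illustration.
$schema: agentv-eval-v2
description: Smoke test for a greeting prompt
execution:
  target: default
  evaluators:
    - name: tone_judge
      type: llm_judge          # 'code' or 'llm_judge'
      prompt: ../judges/tone.md # judge prompt path (for type: llm_judge)
evalcases:
  - id: greeting_basic
    outcome: The assistant greets the user politely.
    input_messages:
      - role: user
        content: Hello there!   # content may be a plain string...
    expected_messages:
      - role: assistant
        content:                # ...or a list of text/file items
          - type: text
            value: A polite greeting that addresses the user.
```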
@@ -21,41 +21,48 @@ description: Iteratively optimize prompt files against AgentV evaluation dataset
       - If not, create a new one in the parent directory of the eval files: `optimization-[timestamp].md`.
     - Read content of the identified prompt file.

- 2. **Optimization Loop** (Max 5 iterations)
-    - **Execute (The Generator)**: Run `pnpm agentv eval <eval-path>`.
+ 2. **Optimization Loop** (Max 10 iterations)
+    - **Execute (The Generator)**: Run `agentv eval <eval-path>`.
+      - *Targeted Run*: If iterating on specific stubborn failures, use `--eval-id <case_id>` to run only the relevant eval cases.
     - **Analyze (The Reflector)**:
       - Locate the results file path from the console output (e.g., `.agentv/results/eval_...jsonl`).
-      - Read the results file. Calculate pass rate.
-      - **Root Cause Analysis**: For each failure, perform a deep dive:
-        - **Error Identification**: What exactly went wrong? (e.g., "Predicted 'High' but expected 'Low'")
-        - **Root Cause**: Why did it happen? (e.g., "Ambiguous definition of 'High' severity", "Hallucinated a constraint", "Incorrect test expectation")
-        - **Correct Approach**: What *should* the model have done?
-        - **Key Insight**: What general rule or pattern can we learn from this?
-        - **Regression Check**: Did this change break previously passing tests? If so, mark the previous change as "Harmful".
+      - **Orchestrate Subagent**: Use `runSubagent` to analyze the results.
+        - **Task**: Read the results file, calculate pass rate, and perform root cause analysis.
+        - **Output**: Return a structured analysis including:
+          - **Score**: Current pass rate.
+          - **Root Cause**: Why failures occurred (e.g., "Ambiguous definition", "Hallucination").
+          - **Insight**: Key learning or pattern identified from the failures.
+          - **Strategy**: High-level plan to fix the prompt (e.g., "Clarify section X", "Add negative constraint").
     - **Decide**:
       - If **100% pass**: STOP and report success.
       - If **Score decreased**: Revert last change, try different approach.
       - If **No improvement** (2x): STOP and report stagnation.
-    - **Log Result**:
-      - Append the result of this iteration to the identified optimization log file.
-      - **Format**:
-        ```markdown
-        ### Iteration [N]
-        - **Change**: [Description of edit]
-        - **Rationale**: [Root Cause / Why this fix was chosen]
-        - **Outcome**: [Success / Failure / Harmful] (Score: X% -> Y%)
-        - **Insight**: [Key learning or pattern identified]
-        ```
     - **Refine (The Curator)**:
-      - Modify the relevant `<prompt-file>` (pick the one most likely to be the root cause) to address failures.
-      - **Strategy**: Treat the prompt as a structured set of rules and instructions.
-        - **Clarify**: If ambiguous, make the existing instruction more specific.
-        - **Add Rule**: If a constraint was missed, add a specific bullet point to the relevant section.
-        - **Negative Constraint**: If hallucinating, explicitly state what NOT to do.
-        - **Consolidate**: Check for redundant or overlapping instructions and merge them.
+      - **Orchestrate Subagent**: Use `runSubagent` to apply the fix.
+        - **Task**: Read the relevant prompt file(s), apply the **Strategy** from the Reflector, and generate the log entry.
+        - **Output**: The **Log Entry** describing the specific operation performed.
+          ```markdown
+          ### Iteration [N]
+          - **Operation**: [ADD / UPDATE / DELETE]
+          - **Target**: [Section Name]
+          - **Change**: [Specific text added/modified]
+          - **Trigger**: [Specific failing test case or error pattern]
+          - **Rationale**: [From Reflector: Root Cause]
+          - **Score**: [From Reflector: Current Pass Rate]
+          - **Insight**: [From Reflector: Key Learning]
+          ```
+      - **Strategy**: Treat the prompt as a structured set of rules. Execute atomic operations:
+        - **ADD**: Insert a new rule if a constraint was missed.
+        - **UPDATE**: Refine an existing rule to be clearer or more general.
+          - *Clarify*: Make ambiguous instructions specific.
+          - *Generalize*: Refactor specific fixes into high-level principles (First Principles).
+        - **DELETE**: Remove obsolete, redundant, or harmful rules.
+          - *Prune*: If a general rule covers specific cases, delete the specific ones.
+        - **Negative Constraint**: If hallucinating, explicitly state what NOT to do. Prefer generalized prohibitions over specific forbidden tokens where possible.
       - **Safety Check**: Ensure new rules don't contradict existing ones (unless intended).
       - **Constraint**: Avoid rewriting large sections. Make surgical, additive changes to preserve existing behavior.
-      - **Apply**: Use `replace_string_in_file`.
+    - **Log Result**:
+      - Append the **Log Entry** returned by the Curator to the optimization log file.

  3. **Completion**
     - Report final score.
@@ -63,7 +70,8 @@ description: Iteratively optimize prompt files against AgentV evaluation dataset
     - **Finalize Optimization Log**: Add a summary header to the optimization log file indicating the session completion and final score.

  ## Guidelines
- - **Simplicity ("Less is More")**: Avoid adding specific rules for rare edge cases ("hotfixes"). Focus on universally applicable instructions.
+ - **Generalization First**: Prefer broad, principle-based guidelines over specific examples or "hotfixes". Only use specific rules if generalized instructions fail to achieve the desired score.
+ - **Simplicity ("Less is More")**: Avoid overfitting to the test set. If a specific rule doesn't significantly improve the score compared to a general one, choose the general one.
  - **Structure**: Maintain existing Markdown headers/sections.
  - **Progressive Disclosure**: If the prompt grows too large (>200 lines), consider moving specialized logic into a separate file or skill.
  - **Quality Criteria**: Ensure the prompt defines a clear persona, specific task, and measurable success criteria.