codexapi 0.5.3__tar.gz → 0.5.5__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {codexapi-0.5.3/src/codexapi.egg-info → codexapi-0.5.5}/PKG-INFO +21 -7
- {codexapi-0.5.3 → codexapi-0.5.5}/README.md +20 -6
- {codexapi-0.5.3 → codexapi-0.5.5}/pyproject.toml +1 -1
- {codexapi-0.5.3 → codexapi-0.5.5}/src/codexapi/__init__.py +1 -1
- {codexapi-0.5.3 → codexapi-0.5.5}/src/codexapi/cli.py +81 -18
- {codexapi-0.5.3 → codexapi-0.5.5}/src/codexapi/foreach.py +9 -11
- {codexapi-0.5.3 → codexapi-0.5.5}/src/codexapi/task.py +238 -110
- codexapi-0.5.5/src/codexapi/taskfile.py +123 -0
- {codexapi-0.5.3 → codexapi-0.5.5/src/codexapi.egg-info}/PKG-INFO +21 -7
- codexapi-0.5.3/src/codexapi/taskfile.py +0 -108
- {codexapi-0.5.3 → codexapi-0.5.5}/LICENSE +0 -0
- {codexapi-0.5.3 → codexapi-0.5.5}/setup.cfg +0 -0
- {codexapi-0.5.3 → codexapi-0.5.5}/src/codexapi/__main__.py +0 -0
- {codexapi-0.5.3 → codexapi-0.5.5}/src/codexapi/agent.py +0 -0
- {codexapi-0.5.3 → codexapi-0.5.5}/src/codexapi/ralph.py +0 -0
- {codexapi-0.5.3 → codexapi-0.5.5}/src/codexapi.egg-info/SOURCES.txt +0 -0
- {codexapi-0.5.3 → codexapi-0.5.5}/src/codexapi.egg-info/dependency_links.txt +0 -0
- {codexapi-0.5.3 → codexapi-0.5.5}/src/codexapi.egg-info/entry_points.txt +0 -0
- {codexapi-0.5.3 → codexapi-0.5.5}/src/codexapi.egg-info/requires.txt +0 -0
- {codexapi-0.5.3 → codexapi-0.5.5}/src/codexapi.egg-info/top_level.txt +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: codexapi
|
|
3
|
-
Version: 0.5.3
|
|
3
|
+
Version: 0.5.5
|
|
4
4
|
Summary: Minimal Python API for running the Codex CLI.
|
|
5
5
|
License: MIT
|
|
6
6
|
Keywords: codex,agent,cli,openai
|
|
@@ -73,7 +73,14 @@ echo "Say hello." | codexapi run
|
|
|
73
73
|
```bash
|
|
74
74
|
codexapi task "Fix the failing tests." --max-iterations 5
|
|
75
75
|
codexapi task -f task.yaml
|
|
76
|
+
codexapi task -f task.yaml -i README.md
|
|
76
77
|
```
|
|
78
|
+
Progress is shown by default for `codexapi task`; use `--quiet` to suppress it.
|
|
79
|
+
When using `--item`, the task file must include at least one `{{item}}` placeholder.
|
|
80
|
+
|
|
81
|
+
Task files default to using the standard check prompt for the task. Set `check: "None"` to skip verification.
|
|
82
|
+
Use `max_iterations` in the task file to override the default attempt cap (0 means unlimited).
|
|
83
|
+
Checks are wrapped with the verifier prompt, include the agent output, and expect JSON with `success`/`reason`.
|
|
77
84
|
|
|
78
85
|
Show running sessions and their latest activity:
|
|
79
86
|
|
|
@@ -115,6 +122,8 @@ Run a task file across a list file:
|
|
|
115
122
|
```bash
|
|
116
123
|
codexapi foreach list.txt task.yaml
|
|
117
124
|
codexapi foreach list.txt task.yaml -n 4
|
|
125
|
+
codexapi foreach list.txt task.yaml --retry-failed
|
|
126
|
+
codexapi foreach list.txt task.yaml --retry-all
|
|
118
127
|
```
|
|
119
128
|
|
|
120
129
|
## API
|
|
@@ -139,26 +148,31 @@ the same conversation and returns only the agent's message.
|
|
|
139
148
|
- `yolo` (bool): pass `--yolo` to Codex when true (defaults to true).
|
|
140
149
|
- `flags` (str | None): extra CLI flags to pass to Codex.
|
|
141
150
|
|
|
142
|
-
### `task(prompt, check=None,
|
|
151
|
+
### `task(prompt, check=None, max_iterations=10, cwd=None, yolo=True, flags=None, progress=False, set_up=None, tear_down=None, on_success=None, on_failure=None) -> str`
|
|
143
152
|
|
|
144
153
|
Runs a task with checker-driven retries and returns the success summary.
|
|
145
154
|
Raises `TaskFailed` when the maximum attempts are reached.
|
|
146
155
|
|
|
147
|
-
- `check` (str | None | False): custom check prompt, default checker, or `False` to skip.
|
|
148
|
-
- `
|
|
156
|
+
- `check` (str | None | False): custom check prompt, default checker, or `False`/`"None"` to skip.
|
|
157
|
+
- `max_iterations` (int): maximum number of task attempts (0 means unlimited).
|
|
158
|
+
- `progress` (bool): print progress after each verification round.
|
|
159
|
+
- `set_up`/`tear_down`/`on_success`/`on_failure` (str | None): optional hook prompts.
|
|
149
160
|
|
|
150
|
-
### `task_result(prompt, check=None,
|
|
161
|
+
### `task_result(prompt, check=None, max_iterations=10, cwd=None, yolo=True, flags=None, progress=False, set_up=None, tear_down=None, on_success=None, on_failure=None) -> TaskResult`
|
|
151
162
|
|
|
152
163
|
Runs a task with checker-driven retries and returns a `TaskResult` without
|
|
153
164
|
raising `TaskFailed`.
|
|
165
|
+
Arguments mirror `task()` (including hooks).
|
|
154
166
|
|
|
155
167
|
### `Task(prompt, max_attempts=10, cwd=None, yolo=True, thread_id=None, flags=None)`
|
|
156
168
|
|
|
157
169
|
Runs a Codex task with checker-driven retries. Subclass it and implement
|
|
158
170
|
`check()` to return an error string when the task is incomplete, or return
|
|
159
171
|
`None`/`""` when the task passes.
|
|
172
|
+
If you do not override `check()`, the default verifier wrapper runs with the
|
|
173
|
+
default check prompt and includes the agent output.
|
|
160
174
|
|
|
161
|
-
- `__call__() -> TaskResult`: run the task.
|
|
175
|
+
- `__call__(debug=False, progress=False) -> TaskResult`: run the task.
|
|
162
176
|
- `set_up()`: optional setup hook.
|
|
163
177
|
- `tear_down()`: optional cleanup hook.
|
|
164
178
|
- `check(output=None) -> str | None`: return an error description or `None`/`""`. `output` is the last agent response.
|
|
@@ -177,7 +191,7 @@ Simple result object returned by `Task.__call__`.
|
|
|
177
191
|
|
|
178
192
|
### `TaskFailed`
|
|
179
193
|
|
|
180
|
-
Exception raised by `task()` when
|
|
194
|
+
Exception raised by `task()` when attempts are exhausted.
|
|
181
195
|
|
|
182
196
|
- `summary` (str): failure summary text.
|
|
183
197
|
- `attempts` (int | None): attempts made when the task failed.
|
|
@@ -59,7 +59,14 @@ echo "Say hello." | codexapi run
|
|
|
59
59
|
```bash
|
|
60
60
|
codexapi task "Fix the failing tests." --max-iterations 5
|
|
61
61
|
codexapi task -f task.yaml
|
|
62
|
+
codexapi task -f task.yaml -i README.md
|
|
62
63
|
```
|
|
64
|
+
Progress is shown by default for `codexapi task`; use `--quiet` to suppress it.
|
|
65
|
+
When using `--item`, the task file must include at least one `{{item}}` placeholder.
|
|
66
|
+
|
|
67
|
+
Task files default to using the standard check prompt for the task. Set `check: "None"` to skip verification.
|
|
68
|
+
Use `max_iterations` in the task file to override the default attempt cap (0 means unlimited).
|
|
69
|
+
Checks are wrapped with the verifier prompt, include the agent output, and expect JSON with `success`/`reason`.
|
|
63
70
|
|
|
64
71
|
Show running sessions and their latest activity:
|
|
65
72
|
|
|
@@ -101,6 +108,8 @@ Run a task file across a list file:
|
|
|
101
108
|
```bash
|
|
102
109
|
codexapi foreach list.txt task.yaml
|
|
103
110
|
codexapi foreach list.txt task.yaml -n 4
|
|
111
|
+
codexapi foreach list.txt task.yaml --retry-failed
|
|
112
|
+
codexapi foreach list.txt task.yaml --retry-all
|
|
104
113
|
```
|
|
105
114
|
|
|
106
115
|
## API
|
|
@@ -125,26 +134,31 @@ the same conversation and returns only the agent's message.
|
|
|
125
134
|
- `yolo` (bool): pass `--yolo` to Codex when true (defaults to true).
|
|
126
135
|
- `flags` (str | None): extra CLI flags to pass to Codex.
|
|
127
136
|
|
|
128
|
-
### `task(prompt, check=None,
|
|
137
|
+
### `task(prompt, check=None, max_iterations=10, cwd=None, yolo=True, flags=None, progress=False, set_up=None, tear_down=None, on_success=None, on_failure=None) -> str`
|
|
129
138
|
|
|
130
139
|
Runs a task with checker-driven retries and returns the success summary.
|
|
131
140
|
Raises `TaskFailed` when the maximum attempts are reached.
|
|
132
141
|
|
|
133
|
-
- `check` (str | None | False): custom check prompt, default checker, or `False` to skip.
|
|
134
|
-
- `
|
|
142
|
+
- `check` (str | None | False): custom check prompt, default checker, or `False`/`"None"` to skip.
|
|
143
|
+
- `max_iterations` (int): maximum number of task attempts (0 means unlimited).
|
|
144
|
+
- `progress` (bool): print progress after each verification round.
|
|
145
|
+
- `set_up`/`tear_down`/`on_success`/`on_failure` (str | None): optional hook prompts.
|
|
135
146
|
|
|
136
|
-
### `task_result(prompt, check=None,
|
|
147
|
+
### `task_result(prompt, check=None, max_iterations=10, cwd=None, yolo=True, flags=None, progress=False, set_up=None, tear_down=None, on_success=None, on_failure=None) -> TaskResult`
|
|
137
148
|
|
|
138
149
|
Runs a task with checker-driven retries and returns a `TaskResult` without
|
|
139
150
|
raising `TaskFailed`.
|
|
151
|
+
Arguments mirror `task()` (including hooks).
|
|
140
152
|
|
|
141
153
|
### `Task(prompt, max_attempts=10, cwd=None, yolo=True, thread_id=None, flags=None)`
|
|
142
154
|
|
|
143
155
|
Runs a Codex task with checker-driven retries. Subclass it and implement
|
|
144
156
|
`check()` to return an error string when the task is incomplete, or return
|
|
145
157
|
`None`/`""` when the task passes.
|
|
158
|
+
If you do not override `check()`, the default verifier wrapper runs with the
|
|
159
|
+
default check prompt and includes the agent output.
|
|
146
160
|
|
|
147
|
-
- `__call__() -> TaskResult`: run the task.
|
|
161
|
+
- `__call__(debug=False, progress=False) -> TaskResult`: run the task.
|
|
148
162
|
- `set_up()`: optional setup hook.
|
|
149
163
|
- `tear_down()`: optional cleanup hook.
|
|
150
164
|
- `check(output=None) -> str | None`: return an error description or `None`/`""`. `output` is the last agent response.
|
|
@@ -163,7 +177,7 @@ Simple result object returned by `Task.__call__`.
|
|
|
163
177
|
|
|
164
178
|
### `TaskFailed`
|
|
165
179
|
|
|
166
|
-
Exception raised by `task()` when
|
|
180
|
+
Exception raised by `task()` when attempts are exhausted.
|
|
167
181
|
|
|
168
182
|
- `summary` (str): failure summary text.
|
|
169
183
|
- `attempts` (int | None): attempts made when the task failed.
|
|
@@ -14,8 +14,8 @@ from pathlib import Path
|
|
|
14
14
|
from .agent import Agent, agent
|
|
15
15
|
from .foreach import foreach
|
|
16
16
|
from .ralph import cancel_ralph_loop, run_ralph_loop
|
|
17
|
-
from .task import TaskFailed, task
|
|
18
|
-
from .taskfile import
|
|
17
|
+
from .task import DEFAULT_MAX_ITERATIONS, TaskFailed, task
|
|
18
|
+
from .taskfile import TaskFile, load_task_file, task_def_uses_item
|
|
19
19
|
|
|
20
20
|
_SESSION_ID_RE = re.compile(
|
|
21
21
|
r"[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}"
|
|
@@ -62,6 +62,7 @@ _COLUMN_TITLES = {
|
|
|
62
62
|
"perm": "PERM",
|
|
63
63
|
"cwd": "CWD",
|
|
64
64
|
}
|
|
65
|
+
_FOREACH_STATUS_MARKERS = {"⏳", "✅", "❌"}
|
|
65
66
|
|
|
66
67
|
|
|
67
68
|
def _read_prompt(prompt):
|
|
@@ -871,6 +872,37 @@ def _print_top_once(show):
|
|
|
871
872
|
print(_format_session(session, layout))
|
|
872
873
|
|
|
873
874
|
|
|
875
|
+
def _clean_foreach_list(path, retry_failed, retry_all):
|
|
876
|
+
with open(path, "r", encoding="utf-8") as handle:
|
|
877
|
+
data = handle.read()
|
|
878
|
+
ends_with_newline = data.endswith("\n")
|
|
879
|
+
lines = data.splitlines()
|
|
880
|
+
|
|
881
|
+
cleaned = []
|
|
882
|
+
changed = False
|
|
883
|
+
for line in lines:
|
|
884
|
+
new_line = line
|
|
885
|
+
if retry_all or (retry_failed and new_line.startswith("❌")):
|
|
886
|
+
if new_line and new_line[0] in _FOREACH_STATUS_MARKERS:
|
|
887
|
+
new_line = new_line[1:]
|
|
888
|
+
if new_line.startswith(" "):
|
|
889
|
+
new_line = new_line[1:]
|
|
890
|
+
pipe = new_line.find("|")
|
|
891
|
+
if pipe != -1:
|
|
892
|
+
new_line = new_line[:pipe].rstrip()
|
|
893
|
+
if new_line != line:
|
|
894
|
+
changed = True
|
|
895
|
+
cleaned.append(new_line)
|
|
896
|
+
|
|
897
|
+
if not changed:
|
|
898
|
+
return
|
|
899
|
+
text = "\n".join(cleaned)
|
|
900
|
+
if ends_with_newline:
|
|
901
|
+
text += "\n"
|
|
902
|
+
with open(path, "w", encoding="utf-8") as handle:
|
|
903
|
+
handle.write(text)
|
|
904
|
+
|
|
905
|
+
|
|
874
906
|
def _run_top(argv):
|
|
875
907
|
if argv and argv[0] in ("-h", "--help"):
|
|
876
908
|
print("usage: codexapi top")
|
|
@@ -995,6 +1027,11 @@ def main(argv=None):
|
|
|
995
1027
|
"--task-file",
|
|
996
1028
|
help="YAML task file to run.",
|
|
997
1029
|
)
|
|
1030
|
+
task_parser.add_argument(
|
|
1031
|
+
"-i",
|
|
1032
|
+
"--item",
|
|
1033
|
+
help="Item value for task files that use {{item}} placeholders.",
|
|
1034
|
+
)
|
|
998
1035
|
task_parser.add_argument(
|
|
999
1036
|
"prompt",
|
|
1000
1037
|
nargs="?",
|
|
@@ -1008,7 +1045,10 @@ def main(argv=None):
|
|
|
1008
1045
|
"--max-iterations",
|
|
1009
1046
|
type=int,
|
|
1010
1047
|
default=None,
|
|
1011
|
-
help=
|
|
1048
|
+
help=(
|
|
1049
|
+
"Max agent attempts (0 means unlimited). "
|
|
1050
|
+
f"Defaults to {DEFAULT_MAX_ITERATIONS}."
|
|
1051
|
+
),
|
|
1012
1052
|
)
|
|
1013
1053
|
task_parser.add_argument("--cwd", help="Working directory for the Codex session.")
|
|
1014
1054
|
task_parser.add_argument(
|
|
@@ -1022,9 +1062,9 @@ def main(argv=None):
|
|
|
1022
1062
|
help="Additional raw CLI flags to pass to Codex (quoted as needed).",
|
|
1023
1063
|
)
|
|
1024
1064
|
task_parser.add_argument(
|
|
1025
|
-
"--
|
|
1065
|
+
"--quiet",
|
|
1026
1066
|
action="store_true",
|
|
1027
|
-
help="
|
|
1067
|
+
help="Suppress progress output during verification.",
|
|
1028
1068
|
)
|
|
1029
1069
|
|
|
1030
1070
|
ralph_parser = subparsers.add_parser(
|
|
@@ -1145,6 +1185,17 @@ def main(argv=None):
|
|
|
1145
1185
|
"task_file",
|
|
1146
1186
|
help="Path to the YAML task file.",
|
|
1147
1187
|
)
|
|
1188
|
+
foreach_retry_group = foreach_parser.add_mutually_exclusive_group()
|
|
1189
|
+
foreach_retry_group.add_argument(
|
|
1190
|
+
"--retry-failed",
|
|
1191
|
+
action="store_true",
|
|
1192
|
+
help="Reset failed (❌) items for re-run.",
|
|
1193
|
+
)
|
|
1194
|
+
foreach_retry_group.add_argument(
|
|
1195
|
+
"--retry-all",
|
|
1196
|
+
action="store_true",
|
|
1197
|
+
help="Reset all items for re-run.",
|
|
1198
|
+
)
|
|
1148
1199
|
foreach_parser.add_argument(
|
|
1149
1200
|
"-n",
|
|
1150
1201
|
type=int,
|
|
@@ -1178,6 +1229,12 @@ def main(argv=None):
|
|
|
1178
1229
|
if args.command == "foreach":
|
|
1179
1230
|
if args.n is not None and args.n < 1:
|
|
1180
1231
|
raise SystemExit("-n must be >= 1.")
|
|
1232
|
+
if args.retry_failed or args.retry_all:
|
|
1233
|
+
_clean_foreach_list(
|
|
1234
|
+
args.list_file,
|
|
1235
|
+
args.retry_failed,
|
|
1236
|
+
args.retry_all,
|
|
1237
|
+
)
|
|
1181
1238
|
result = foreach(
|
|
1182
1239
|
args.list_file,
|
|
1183
1240
|
args.task_file,
|
|
@@ -1222,21 +1279,25 @@ def main(argv=None):
|
|
|
1222
1279
|
if args.command == "task" and args.task_file:
|
|
1223
1280
|
if args.prompt:
|
|
1224
1281
|
raise SystemExit("task -f does not take a prompt.")
|
|
1282
|
+
if args.item is not None:
|
|
1283
|
+
task_def = load_task_file(args.task_file)
|
|
1284
|
+
if not task_def_uses_item(task_def):
|
|
1285
|
+
raise SystemExit(
|
|
1286
|
+
"task -f --item requires {{item}} in the task file."
|
|
1287
|
+
)
|
|
1225
1288
|
if args.check is not None:
|
|
1226
1289
|
raise SystemExit("--check is not allowed with -f.")
|
|
1227
1290
|
if args.max_iterations is not None:
|
|
1228
1291
|
raise SystemExit("--max-iterations is not allowed with -f.")
|
|
1229
|
-
|
|
1230
|
-
|
|
1231
|
-
|
|
1232
|
-
|
|
1233
|
-
|
|
1234
|
-
|
|
1235
|
-
args.
|
|
1236
|
-
None,
|
|
1237
|
-
args.flags,
|
|
1292
|
+
task_runner = TaskFile(
|
|
1293
|
+
args.task_file,
|
|
1294
|
+
args.item,
|
|
1295
|
+
cwd=args.cwd,
|
|
1296
|
+
yolo=args.yolo,
|
|
1297
|
+
thread_id=None,
|
|
1298
|
+
flags=args.flags,
|
|
1238
1299
|
)
|
|
1239
|
-
result = task_runner()
|
|
1300
|
+
result = task_runner(progress=not args.quiet)
|
|
1240
1301
|
print(result.summary)
|
|
1241
1302
|
if not result.success:
|
|
1242
1303
|
raise SystemExit(1)
|
|
@@ -1278,11 +1339,13 @@ def main(argv=None):
|
|
|
1278
1339
|
)
|
|
1279
1340
|
return
|
|
1280
1341
|
if args.command == "task":
|
|
1342
|
+
if args.item is not None:
|
|
1343
|
+
raise SystemExit("--item is only supported with -f.")
|
|
1281
1344
|
if args.max_iterations is None:
|
|
1282
|
-
args.max_iterations =
|
|
1345
|
+
args.max_iterations = DEFAULT_MAX_ITERATIONS
|
|
1283
1346
|
if args.max_iterations < 0:
|
|
1284
1347
|
raise SystemExit("--max-iterations must be >= 0.")
|
|
1285
|
-
check = args.check
|
|
1348
|
+
check = args.check
|
|
1286
1349
|
try:
|
|
1287
1350
|
message = task(
|
|
1288
1351
|
prompt,
|
|
@@ -1291,7 +1354,7 @@ def main(argv=None):
|
|
|
1291
1354
|
args.cwd,
|
|
1292
1355
|
args.yolo,
|
|
1293
1356
|
args.flags,
|
|
1294
|
-
args.
|
|
1357
|
+
not args.quiet,
|
|
1295
1358
|
)
|
|
1296
1359
|
except TaskFailed as exc:
|
|
1297
1360
|
message = exc.summary
|
|
@@ -6,7 +6,7 @@ from concurrent.futures import ThreadPoolExecutor, as_completed
|
|
|
6
6
|
|
|
7
7
|
from tqdm import tqdm
|
|
8
8
|
|
|
9
|
-
from .taskfile import
|
|
9
|
+
from .taskfile import TaskFile
|
|
10
10
|
|
|
11
11
|
_STATUS_RUNNING = "⏳"
|
|
12
12
|
_STATUS_SUCCESS = "✅"
|
|
@@ -43,7 +43,6 @@ def foreach(
|
|
|
43
43
|
flags=None,
|
|
44
44
|
):
|
|
45
45
|
"""Run a task file over each item in list_file and update the file."""
|
|
46
|
-
task_def = load_task_file(task_file)
|
|
47
46
|
lines, ends_with_newline = _read_lines(list_file)
|
|
48
47
|
items, skipped = _collect_items(lines)
|
|
49
48
|
|
|
@@ -69,7 +68,7 @@ def foreach(
|
|
|
69
68
|
_run_item,
|
|
70
69
|
index,
|
|
71
70
|
item,
|
|
72
|
-
|
|
71
|
+
task_file,
|
|
73
72
|
lines,
|
|
74
73
|
ends_with_newline,
|
|
75
74
|
list_file,
|
|
@@ -165,7 +164,7 @@ def _format_turns(used, total):
|
|
|
165
164
|
def _run_item(
|
|
166
165
|
index,
|
|
167
166
|
item,
|
|
168
|
-
|
|
167
|
+
task_file,
|
|
169
168
|
lines,
|
|
170
169
|
ends_with_newline,
|
|
171
170
|
list_file,
|
|
@@ -189,14 +188,13 @@ def _run_item(
|
|
|
189
188
|
attempts = None
|
|
190
189
|
max_attempts = None
|
|
191
190
|
try:
|
|
192
|
-
task =
|
|
193
|
-
|
|
191
|
+
task = TaskFile(
|
|
192
|
+
task_file,
|
|
194
193
|
item,
|
|
195
|
-
|
|
196
|
-
|
|
197
|
-
|
|
198
|
-
|
|
199
|
-
flags,
|
|
194
|
+
cwd=cwd,
|
|
195
|
+
yolo=yolo,
|
|
196
|
+
thread_id=None,
|
|
197
|
+
flags=flags,
|
|
200
198
|
)
|
|
201
199
|
max_attempts = task.max_attempts
|
|
202
200
|
result = task()
|
|
@@ -10,9 +10,12 @@ _logger = logging.getLogger(__name__)
|
|
|
10
10
|
|
|
11
11
|
_CHECK_PREFIX = (
|
|
12
12
|
"You are a verification agent. Explore this workspace and carefully evaluate it "
|
|
13
|
-
"against the
|
|
14
|
-
"and tracing through code, but do not change any of the code.\n"
|
|
15
|
-
"
|
|
13
|
+
"against the instructions below. Collect evidence by running any tests and/or "
|
|
14
|
+
"reading and tracing through code, but do not change any of the code.\n"
|
|
15
|
+
"You will receive the task or check instructions first, then the agent output "
|
|
16
|
+
"under the heading 'AGENT OUTPUT', which is provided for context and does not "
|
|
17
|
+
"replace or supersede collecting your own evidence unless it is clear from the "
|
|
18
|
+
"instructions that the agent's output IS the expected output of the task.\n"
|
|
16
19
|
"Return only JSON with keys: success (boolean) and reason (string).\n"
|
|
17
20
|
"Set success to true only if everything matches the intent."
|
|
18
21
|
)
|
|
@@ -23,6 +26,7 @@ _PROGRESS_PROMPT = (
|
|
|
23
26
|
"Each value must be a single line with no newlines.\n"
|
|
24
27
|
"Do not run commands or change any files."
|
|
25
28
|
)
|
|
29
|
+
DEFAULT_MAX_ITERATIONS = 10
|
|
26
30
|
|
|
27
31
|
|
|
28
32
|
def _default_check(prompt):
|
|
@@ -35,8 +39,27 @@ def _default_check(prompt):
|
|
|
35
39
|
)
|
|
36
40
|
|
|
37
41
|
|
|
38
|
-
def _build_check_prompt(check):
|
|
39
|
-
|
|
42
|
+
def _build_check_prompt(check, agent_output):
|
|
43
|
+
output = agent_output or ""
|
|
44
|
+
return (
|
|
45
|
+
f"{_CHECK_PREFIX}\n\n"
|
|
46
|
+
f"{check}\n\n"
|
|
47
|
+
"AGENT OUTPUT:\n"
|
|
48
|
+
f"{output}\n\n"
|
|
49
|
+
f"{_CHECK_SUFFIX}"
|
|
50
|
+
)
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
def _resolve_check_text(prompt, check):
|
|
54
|
+
if check is False:
|
|
55
|
+
return None, True
|
|
56
|
+
if check is None:
|
|
57
|
+
return _default_check(prompt), False
|
|
58
|
+
if not isinstance(check, str):
|
|
59
|
+
raise TypeError("check must be a string or False")
|
|
60
|
+
if check.strip() == "None":
|
|
61
|
+
return None, True
|
|
62
|
+
return check, False
|
|
40
63
|
|
|
41
64
|
|
|
42
65
|
def _build_progress_prompt(agent_output, check_output):
|
|
@@ -111,7 +134,17 @@ def _format_duration(seconds):
|
|
|
111
134
|
return " ".join(parts)
|
|
112
135
|
|
|
113
136
|
|
|
114
|
-
def
|
|
137
|
+
def _progress_round_label(attempt, total):
|
|
138
|
+
if not total:
|
|
139
|
+
return f"Round {attempt}/unlimited"
|
|
140
|
+
return f"Round {attempt}/{total}"
|
|
141
|
+
|
|
142
|
+
|
|
143
|
+
def _print_progress_start(attempt, total):
|
|
144
|
+
print(_progress_round_label(attempt, total), flush=True)
|
|
145
|
+
|
|
146
|
+
|
|
147
|
+
def _print_progress_result(
|
|
115
148
|
attempt,
|
|
116
149
|
total,
|
|
117
150
|
start_time,
|
|
@@ -120,24 +153,27 @@ def _print_progress(
|
|
|
120
153
|
cwd,
|
|
121
154
|
yolo,
|
|
122
155
|
flags,
|
|
156
|
+
success,
|
|
123
157
|
):
|
|
124
158
|
elapsed = time.monotonic() - start_time
|
|
125
159
|
remaining = 0
|
|
126
|
-
|
|
160
|
+
remaining_text = "unknown"
|
|
161
|
+
if total and attempt:
|
|
127
162
|
remaining = (elapsed / attempt) * (total - attempt)
|
|
163
|
+
remaining_text = _format_duration(remaining)
|
|
128
164
|
|
|
129
165
|
summary_prompt = _build_progress_prompt(agent_output, check_output)
|
|
130
166
|
summary = agent(summary_prompt, cwd, yolo, flags)
|
|
131
167
|
agent_summary, check_summary = _progress_result(summary)
|
|
132
168
|
|
|
133
169
|
elapsed_text = _format_duration(elapsed)
|
|
134
|
-
|
|
170
|
+
print(f"Agent: {agent_summary}", flush=True)
|
|
171
|
+
print(f"Check: {check_summary}", flush=True)
|
|
172
|
+
verdict = "success" if success else "failure"
|
|
135
173
|
print(
|
|
136
|
-
f"
|
|
174
|
+
f"Verdict: {verdict} ({elapsed_text} elapsed, {remaining_text} remaining)",
|
|
137
175
|
flush=True,
|
|
138
176
|
)
|
|
139
|
-
print(f"Agent: {agent_summary}", flush=True)
|
|
140
|
-
print(f"Check: {check_summary}", flush=True)
|
|
141
177
|
print("", flush=True)
|
|
142
178
|
|
|
143
179
|
def _fix_prompt(error):
|
|
@@ -174,26 +210,42 @@ class TaskFailed(RuntimeError):
|
|
|
174
210
|
self.errors = errors
|
|
175
211
|
|
|
176
212
|
|
|
213
|
+
def _validate_hook(name, value):
|
|
214
|
+
if value is None:
|
|
215
|
+
return None
|
|
216
|
+
if isinstance(value, str):
|
|
217
|
+
return value
|
|
218
|
+
raise TypeError(f"{name} must be a string or None")
|
|
219
|
+
|
|
220
|
+
|
|
177
221
|
def task(
|
|
178
222
|
prompt,
|
|
179
223
|
check=None,
|
|
180
|
-
|
|
224
|
+
max_iterations=DEFAULT_MAX_ITERATIONS,
|
|
181
225
|
cwd=None,
|
|
182
226
|
yolo=True,
|
|
183
227
|
flags=None,
|
|
184
228
|
progress=False,
|
|
229
|
+
set_up=None,
|
|
230
|
+
tear_down=None,
|
|
231
|
+
on_success=None,
|
|
232
|
+
on_failure=None,
|
|
185
233
|
):
|
|
186
234
|
"""Run a prompt with optional checker-driven retries.
|
|
187
235
|
|
|
188
236
|
Args:
|
|
189
237
|
prompt: The task prompt to run.
|
|
190
238
|
check: False to skip verification, None for the default check, or
|
|
191
|
-
a string check prompt.
|
|
192
|
-
|
|
239
|
+
a string check prompt. The string "None" skips verification.
|
|
240
|
+
max_iterations: Maximum number of task attempts (0 means unlimited).
|
|
193
241
|
cwd: Optional working directory for the Codex session.
|
|
194
242
|
yolo: Whether to pass --yolo to Codex.
|
|
195
243
|
flags: Additional raw CLI flags to pass to Codex.
|
|
196
244
|
progress: Whether to print progress after each verification round.
|
|
245
|
+
set_up: Optional setup prompt to run before the task.
|
|
246
|
+
tear_down: Optional cleanup prompt to run after the task.
|
|
247
|
+
on_success: Optional prompt to run after a successful task.
|
|
248
|
+
on_failure: Optional prompt to run after a failed task.
|
|
197
249
|
|
|
198
250
|
Returns:
|
|
199
251
|
The agent's response text when the task succeeds.
|
|
@@ -201,7 +253,19 @@ def task(
|
|
|
201
253
|
Raises:
|
|
202
254
|
TaskFailed: when the task reaches the maximum attempts without success.
|
|
203
255
|
"""
|
|
204
|
-
result = task_result(
|
|
256
|
+
result = task_result(
|
|
257
|
+
prompt,
|
|
258
|
+
check,
|
|
259
|
+
max_iterations,
|
|
260
|
+
cwd,
|
|
261
|
+
yolo,
|
|
262
|
+
flags,
|
|
263
|
+
progress,
|
|
264
|
+
set_up,
|
|
265
|
+
tear_down,
|
|
266
|
+
on_success,
|
|
267
|
+
on_failure,
|
|
268
|
+
)
|
|
205
269
|
if result.success:
|
|
206
270
|
return result.summary
|
|
207
271
|
raise TaskFailed(result.summary, result.attempts, result.errors)
|
|
@@ -210,78 +274,46 @@ def task(
|
|
|
210
274
|
def task_result(
|
|
211
275
|
prompt,
|
|
212
276
|
check=None,
|
|
213
|
-
|
|
277
|
+
max_iterations=DEFAULT_MAX_ITERATIONS,
|
|
214
278
|
cwd=None,
|
|
215
279
|
yolo=True,
|
|
216
280
|
flags=None,
|
|
217
281
|
progress=False,
|
|
282
|
+
set_up=None,
|
|
283
|
+
tear_down=None,
|
|
284
|
+
on_success=None,
|
|
285
|
+
on_failure=None,
|
|
218
286
|
):
|
|
219
287
|
"""Run a prompt with optional checker-driven retries and return TaskResult.
|
|
220
288
|
|
|
221
289
|
The runner keeps a single session. Each verification attempt uses a fresh,
|
|
222
290
|
stateless agent call. When progress is True, print a summary each round.
|
|
291
|
+
|
|
292
|
+
Hook strings mirror task file keys: set_up, tear_down, on_success, on_failure.
|
|
223
293
|
"""
|
|
224
|
-
if
|
|
225
|
-
|
|
226
|
-
|
|
227
|
-
summary = runner(prompt)
|
|
228
|
-
if progress:
|
|
229
|
-
_print_progress(
|
|
230
|
-
1,
|
|
231
|
-
1,
|
|
232
|
-
start_time,
|
|
233
|
-
summary,
|
|
234
|
-
"Verification skipped.",
|
|
235
|
-
cwd,
|
|
236
|
-
yolo,
|
|
237
|
-
flags,
|
|
238
|
-
)
|
|
239
|
-
return TaskResult(True, summary, 1, None, runner.thread_id)
|
|
240
|
-
if check is None:
|
|
241
|
-
check = _default_check(prompt)
|
|
242
|
-
if not isinstance(check, str):
|
|
294
|
+
if max_iterations < 0:
|
|
295
|
+
raise ValueError("max_iterations must be >= 0")
|
|
296
|
+
if not (check is None or check is False or isinstance(check, str)):
|
|
243
297
|
raise TypeError("check must be a string or False")
|
|
244
|
-
if n < 0:
|
|
245
|
-
raise ValueError("n must be >= 0")
|
|
246
|
-
|
|
247
|
-
runner = Agent(cwd, yolo, None, flags)
|
|
248
|
-
start_time = time.monotonic()
|
|
249
|
-
last_output = runner(prompt)
|
|
250
|
-
check_prompt = _build_check_prompt(check)
|
|
251
298
|
|
|
252
|
-
|
|
253
|
-
|
|
254
|
-
|
|
255
|
-
|
|
256
|
-
|
|
257
|
-
|
|
258
|
-
|
|
259
|
-
|
|
260
|
-
|
|
261
|
-
|
|
262
|
-
|
|
263
|
-
|
|
264
|
-
|
|
265
|
-
|
|
266
|
-
|
|
267
|
-
|
|
268
|
-
|
|
269
|
-
|
|
270
|
-
summary,
|
|
271
|
-
attempt + 1,
|
|
272
|
-
None,
|
|
273
|
-
runner.thread_id,
|
|
274
|
-
)
|
|
275
|
-
if attempt == n:
|
|
276
|
-
summary = runner(_failure_prompt(reason))
|
|
277
|
-
return TaskResult(
|
|
278
|
-
False,
|
|
279
|
-
summary,
|
|
280
|
-
attempt + 1,
|
|
281
|
-
reason,
|
|
282
|
-
runner.thread_id,
|
|
283
|
-
)
|
|
284
|
-
last_output = runner(_fix_prompt(reason))
|
|
299
|
+
set_up_text = _validate_hook("set_up", set_up)
|
|
300
|
+
tear_down_text = _validate_hook("tear_down", tear_down)
|
|
301
|
+
on_success_text = _validate_hook("on_success", on_success)
|
|
302
|
+
on_failure_text = _validate_hook("on_failure", on_failure)
|
|
303
|
+
runner = AutoTask(
|
|
304
|
+
prompt,
|
|
305
|
+
check,
|
|
306
|
+
max_iterations,
|
|
307
|
+
cwd,
|
|
308
|
+
yolo,
|
|
309
|
+
None,
|
|
310
|
+
flags,
|
|
311
|
+
set_up=set_up_text,
|
|
312
|
+
tear_down=tear_down_text,
|
|
313
|
+
on_success=on_success_text,
|
|
314
|
+
on_failure=on_failure_text,
|
|
315
|
+
)
|
|
316
|
+
return runner(progress=progress)
|
|
285
317
|
|
|
286
318
|
|
|
287
319
|
class TaskResult:
|
|
@@ -320,18 +352,23 @@ class Task:
|
|
|
320
352
|
def __init__(
|
|
321
353
|
self,
|
|
322
354
|
prompt,
|
|
323
|
-
max_attempts=
|
|
355
|
+
max_attempts=DEFAULT_MAX_ITERATIONS,
|
|
324
356
|
cwd=None,
|
|
325
357
|
yolo=True,
|
|
326
358
|
thread_id=None,
|
|
327
359
|
flags=None,
|
|
328
360
|
):
|
|
329
|
-
if max_attempts <
|
|
330
|
-
raise ValueError("max_attempts must be >=
|
|
361
|
+
if max_attempts < 0:
|
|
362
|
+
raise ValueError("max_attempts must be >= 0")
|
|
331
363
|
self.prompt = prompt
|
|
332
364
|
self.max_attempts = max_attempts
|
|
333
365
|
self.cwd = cwd
|
|
334
366
|
self.last_output = None
|
|
367
|
+
self.last_check_output = None
|
|
368
|
+
self.check_skipped = False
|
|
369
|
+
self.check_text = None
|
|
370
|
+
self._yolo = yolo
|
|
371
|
+
self._flags = flags
|
|
335
372
|
self.agent = Agent(
|
|
336
373
|
cwd,
|
|
337
374
|
yolo,
|
|
@@ -346,11 +383,26 @@ class Task:
|
|
|
346
383
|
"""Delete the directory etc."""
|
|
347
384
|
|
|
348
385
|
def check(self, output=None):
|
|
349
|
-
"""
|
|
350
|
-
|
|
351
|
-
|
|
352
|
-
|
|
353
|
-
|
|
386
|
+
"""Check if the task is done, return a string describing problems if not.
|
|
387
|
+
|
|
388
|
+
The default implementation runs the verifier agent with the standard
|
|
389
|
+
check wrapper and expects JSON output.
|
|
390
|
+
"""
|
|
391
|
+
self.last_check_output = None
|
|
392
|
+
self.check_skipped = False
|
|
393
|
+
check_text, skip = _resolve_check_text(self.prompt, self.check_text)
|
|
394
|
+
if skip:
|
|
395
|
+
self.check_skipped = True
|
|
396
|
+
return None
|
|
397
|
+
last_output = output if output is not None else self.last_output
|
|
398
|
+
last_output = last_output or ""
|
|
399
|
+
check_prompt = _build_check_prompt(check_text, last_output)
|
|
400
|
+
check_output = agent(check_prompt, self.cwd, self._yolo, self._flags)
|
|
401
|
+
self.last_check_output = check_output
|
|
402
|
+
success, reason = _check_result(check_output)
|
|
403
|
+
if success:
|
|
404
|
+
return None
|
|
405
|
+
return reason
|
|
354
406
|
|
|
355
407
|
def on_success(self, result):
|
|
356
408
|
"""Hook called after a successful task, e.g. commit the changes."""
|
|
@@ -365,23 +417,22 @@ class Task:
|
|
|
365
417
|
f"{error}\n\n"
|
|
366
418
|
"Take another look and see whether you agree and, if so, please take "
|
|
367
419
|
"this feedback into consideration and use it to continue to make "
|
|
368
|
-
"progress towards our original goal and intent."
|
|
420
|
+
"progress towards our original goal and intent. Don't propose next steps, "
|
|
421
|
+
"use your best judgement and work towards the goal!"
|
|
369
422
|
)
|
|
370
423
|
|
|
371
424
|
def success_prompt(self):
|
|
372
425
|
"""Ask the agent to summarize what it did."""
|
|
373
|
-
return
|
|
426
|
+
return _success_prompt()
|
|
374
427
|
|
|
375
428
|
def failure_prompt(self, error):
|
|
376
429
|
"""Ask the agent to summarize remaining issues after retries."""
|
|
377
|
-
return (
|
|
378
|
-
"We ran out of attempts. Can you please look back at everything you tried and summarize what it was that made this task too hard to complete, including anything you wish you'd known at the start that would have helped improve things?\n\n"
|
|
379
|
-
f"Outstanding issues:\n{error}"
|
|
380
|
-
)
|
|
430
|
+
return _failure_prompt(error)
|
|
381
431
|
|
|
382
|
-
def __call__(self, debug=False):
|
|
432
|
+
def __call__(self, debug=False, progress=False):
|
|
383
433
|
"""Run the task with checker-driven retries.
|
|
384
434
|
If debug is True, log debug messages.
|
|
435
|
+
If progress is True, print progress after each verification round.
|
|
385
436
|
"""
|
|
386
437
|
try:
|
|
387
438
|
# If this fails in the middle we will still try to tear down
|
|
@@ -392,35 +443,112 @@ class Task:
|
|
|
392
443
|
self.last_output = output
|
|
393
444
|
if debug:
|
|
394
445
|
_logger.debug("Initial output: %s", output)
|
|
395
|
-
|
|
446
|
+
|
|
396
447
|
# Try correcting it up to max_attempts times
|
|
397
|
-
|
|
448
|
+
start_time = time.monotonic()
|
|
449
|
+
error = None
|
|
450
|
+
attempt = 0
|
|
451
|
+
while True:
|
|
452
|
+
attempt += 1
|
|
453
|
+
if progress:
|
|
454
|
+
_print_progress_start(
|
|
455
|
+
attempt,
|
|
456
|
+
self.max_attempts,
|
|
457
|
+
)
|
|
398
458
|
error = self.check(self.last_output)
|
|
399
459
|
if debug:
|
|
400
460
|
_logger.debug("Check error: %s", error)
|
|
401
|
-
|
|
402
|
-
if
|
|
403
|
-
|
|
404
|
-
|
|
405
|
-
|
|
406
|
-
|
|
407
|
-
|
|
408
|
-
|
|
409
|
-
|
|
461
|
+
|
|
462
|
+
if progress:
|
|
463
|
+
check_output = self.last_check_output
|
|
464
|
+
if self.check_skipped:
|
|
465
|
+
check_output = "Verification skipped."
|
|
466
|
+
_print_progress_result(
|
|
467
|
+
attempt,
|
|
468
|
+
self.max_attempts,
|
|
469
|
+
start_time,
|
|
470
|
+
self.last_output,
|
|
471
|
+
check_output or "",
|
|
472
|
+
self.cwd,
|
|
473
|
+
self._yolo,
|
|
474
|
+
self._flags,
|
|
475
|
+
not error,
|
|
476
|
+
)
|
|
477
|
+
if not error:
|
|
410
478
|
summary = self.agent(self.success_prompt())
|
|
411
479
|
if debug:
|
|
412
480
|
_logger.debug("Success summary: %s", summary)
|
|
413
|
-
result = TaskResult(
|
|
481
|
+
result = TaskResult(
|
|
482
|
+
True,
|
|
483
|
+
summary,
|
|
484
|
+
attempt,
|
|
485
|
+
None,
|
|
486
|
+
self.agent.thread_id,
|
|
487
|
+
)
|
|
414
488
|
self.on_success(result)
|
|
415
489
|
return result
|
|
416
|
-
|
|
417
|
-
|
|
418
|
-
|
|
419
|
-
|
|
420
|
-
|
|
421
|
-
|
|
422
|
-
|
|
423
|
-
|
|
490
|
+
if self.max_attempts and attempt >= self.max_attempts:
|
|
491
|
+
summary = self.agent(self.failure_prompt(error))
|
|
492
|
+
if debug:
|
|
493
|
+
_logger.debug("Failure summary: %s", summary)
|
|
494
|
+
result = TaskResult(
|
|
495
|
+
False,
|
|
496
|
+
summary,
|
|
497
|
+
attempt,
|
|
498
|
+
error,
|
|
499
|
+
self.agent.thread_id,
|
|
500
|
+
)
|
|
501
|
+
self.on_failure(result)
|
|
502
|
+
return result
|
|
503
|
+
output = self.agent(self.fix_prompt(error))
|
|
504
|
+
self.last_output = output
|
|
505
|
+
if debug:
|
|
506
|
+
_logger.debug("Fix output: %s", output)
|
|
424
507
|
finally:
|
|
425
508
|
# No matter what, once we have set_up we will always tear_down
|
|
426
509
|
self.tear_down()
|
|
510
|
+
|
|
511
|
+
|
|
512
|
+
class AutoTask(Task):
|
|
513
|
+
"""Task subclass that maps prompt strings onto Task hooks."""
|
|
514
|
+
|
|
515
|
+
def __init__(
|
|
516
|
+
self,
|
|
517
|
+
prompt,
|
|
518
|
+
check=None,
|
|
519
|
+
max_attempts=DEFAULT_MAX_ITERATIONS,
|
|
520
|
+
cwd=None,
|
|
521
|
+
yolo=True,
|
|
522
|
+
thread_id=None,
|
|
523
|
+
flags=None,
|
|
524
|
+
set_up=None,
|
|
525
|
+
tear_down=None,
|
|
526
|
+
on_success=None,
|
|
527
|
+
on_failure=None,
|
|
528
|
+
):
|
|
529
|
+
if not (check is None or check is False or isinstance(check, str)):
|
|
530
|
+
raise TypeError("check must be a string or False")
|
|
531
|
+
if max_attempts < 0:
|
|
532
|
+
raise ValueError("max_attempts must be >= 0")
|
|
533
|
+
super().__init__(prompt, max_attempts, cwd, yolo, thread_id, flags)
|
|
534
|
+
self.check_text = check
|
|
535
|
+
self._set_up = _validate_hook("set_up", set_up)
|
|
536
|
+
self._tear_down = _validate_hook("tear_down", tear_down)
|
|
537
|
+
self._on_success = _validate_hook("on_success", on_success)
|
|
538
|
+
self._on_failure = _validate_hook("on_failure", on_failure)
|
|
539
|
+
|
|
540
|
+
def _run_hook(self, text):
|
|
541
|
+
if text:
|
|
542
|
+
agent(text, self.cwd, self._yolo, self._flags)
|
|
543
|
+
|
|
544
|
+
def set_up(self):
|
|
545
|
+
self._run_hook(self._set_up)
|
|
546
|
+
|
|
547
|
+
def tear_down(self):
|
|
548
|
+
self._run_hook(self._tear_down)
|
|
549
|
+
|
|
550
|
+
def on_success(self, result):
|
|
551
|
+
self._run_hook(self._on_success)
|
|
552
|
+
|
|
553
|
+
def on_failure(self, result):
|
|
554
|
+
self._run_hook(self._on_failure)
|
|
@@ -0,0 +1,123 @@
|
|
|
1
|
+
"""Load YAML task files and map them onto Task hooks."""
|
|
2
|
+
|
|
3
|
+
import yaml
|
|
4
|
+
|
|
5
|
+
from .task import AutoTask
|
|
6
|
+
|
|
7
|
+
_ITEM_TOKEN = "{{item}}"
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
def load_task_file(path):
|
|
11
|
+
"""Load a YAML task file and return a normalized task definition."""
|
|
12
|
+
if not path:
|
|
13
|
+
raise ValueError("task file path is required")
|
|
14
|
+
with open(path, "r", encoding="utf-8") as handle:
|
|
15
|
+
data = yaml.safe_load(handle) or {}
|
|
16
|
+
if not isinstance(data, dict):
|
|
17
|
+
raise ValueError("Task file must be a YAML mapping.")
|
|
18
|
+
|
|
19
|
+
prompt = data.get("prompt")
|
|
20
|
+
if not isinstance(prompt, str) or not prompt.strip():
|
|
21
|
+
raise ValueError("Task file missing non-empty 'prompt'.")
|
|
22
|
+
|
|
23
|
+
max_iterations = data.get("max_iterations")
|
|
24
|
+
if max_iterations is not None:
|
|
25
|
+
if not isinstance(max_iterations, int):
|
|
26
|
+
raise ValueError("Task file max_iterations must be an integer.")
|
|
27
|
+
if max_iterations < 0:
|
|
28
|
+
raise ValueError("Task file max_iterations must be >= 0.")
|
|
29
|
+
|
|
30
|
+
return {
|
|
31
|
+
"prompt": prompt,
|
|
32
|
+
"set_up": _optional_str(data.get("set_up")),
|
|
33
|
+
"tear_down": _optional_str(data.get("tear_down")),
|
|
34
|
+
"check": _optional_str(data.get("check")),
|
|
35
|
+
"on_success": _optional_str(data.get("on_success")),
|
|
36
|
+
"on_failure": _optional_str(data.get("on_failure")),
|
|
37
|
+
"max_iterations": max_iterations,
|
|
38
|
+
}
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
def _optional_str(value):
|
|
42
|
+
if value is None:
|
|
43
|
+
return None
|
|
44
|
+
if isinstance(value, str):
|
|
45
|
+
return value if value.strip() else None
|
|
46
|
+
raise ValueError("Task file values must be strings.")
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
def _render(text, item):
|
|
50
|
+
if text is None:
|
|
51
|
+
return None
|
|
52
|
+
if item is None:
|
|
53
|
+
return text
|
|
54
|
+
return text.replace(_ITEM_TOKEN, item)
|
|
55
|
+
|
|
56
|
+
|
|
57
|
+
def task_def_uses_item(task_def):
|
|
58
|
+
"""Return True if a task definition includes the {{item}} placeholder."""
|
|
59
|
+
if not isinstance(task_def, dict):
|
|
60
|
+
raise TypeError("task definition must be a dict")
|
|
61
|
+
for key in ("prompt", "set_up", "tear_down", "check", "on_success", "on_failure"):
|
|
62
|
+
value = task_def.get(key)
|
|
63
|
+
if isinstance(value, str) and _ITEM_TOKEN in value:
|
|
64
|
+
return True
|
|
65
|
+
return False
|
|
66
|
+
|
|
67
|
+
|
|
68
|
+
class TaskFile(AutoTask):
|
|
69
|
+
"""Task subclass that maps a YAML task file onto Task hooks."""
|
|
70
|
+
|
|
71
|
+
def __init__(
|
|
72
|
+
self,
|
|
73
|
+
path,
|
|
74
|
+
item=None,
|
|
75
|
+
max_iterations=None,
|
|
76
|
+
cwd=None,
|
|
77
|
+
yolo=True,
|
|
78
|
+
thread_id=None,
|
|
79
|
+
flags=None,
|
|
80
|
+
):
|
|
81
|
+
task_def = load_task_file(path)
|
|
82
|
+
if max_iterations is None:
|
|
83
|
+
max_iterations = task_def.get("max_iterations")
|
|
84
|
+
elif not isinstance(max_iterations, int):
|
|
85
|
+
raise ValueError("max_iterations must be an integer.")
|
|
86
|
+
elif max_iterations < 0:
|
|
87
|
+
raise ValueError("max_iterations must be >= 0.")
|
|
88
|
+
item_text = "" if item is None else str(item)
|
|
89
|
+
rendered = {
|
|
90
|
+
"prompt": _render(task_def.get("prompt"), item_text),
|
|
91
|
+
"set_up": _render(task_def.get("set_up"), item_text),
|
|
92
|
+
"tear_down": _render(task_def.get("tear_down"), item_text),
|
|
93
|
+
"check": _render(task_def.get("check"), item_text),
|
|
94
|
+
"on_success": _render(task_def.get("on_success"), item_text),
|
|
95
|
+
"on_failure": _render(task_def.get("on_failure"), item_text),
|
|
96
|
+
}
|
|
97
|
+
if max_iterations is None:
|
|
98
|
+
super().__init__(
|
|
99
|
+
rendered["prompt"],
|
|
100
|
+
rendered["check"],
|
|
101
|
+
cwd=cwd,
|
|
102
|
+
yolo=yolo,
|
|
103
|
+
thread_id=thread_id,
|
|
104
|
+
flags=flags,
|
|
105
|
+
set_up=rendered["set_up"],
|
|
106
|
+
tear_down=rendered["tear_down"],
|
|
107
|
+
on_success=rendered["on_success"],
|
|
108
|
+
on_failure=rendered["on_failure"],
|
|
109
|
+
)
|
|
110
|
+
return
|
|
111
|
+
super().__init__(
|
|
112
|
+
rendered["prompt"],
|
|
113
|
+
rendered["check"],
|
|
114
|
+
max_iterations,
|
|
115
|
+
cwd,
|
|
116
|
+
yolo,
|
|
117
|
+
thread_id,
|
|
118
|
+
flags,
|
|
119
|
+
set_up=rendered["set_up"],
|
|
120
|
+
tear_down=rendered["tear_down"],
|
|
121
|
+
on_success=rendered["on_success"],
|
|
122
|
+
on_failure=rendered["on_failure"],
|
|
123
|
+
)
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: codexapi
|
|
3
|
-
Version: 0.5.
|
|
3
|
+
Version: 0.5.5
|
|
4
4
|
Summary: Minimal Python API for running the Codex CLI.
|
|
5
5
|
License: MIT
|
|
6
6
|
Keywords: codex,agent,cli,openai
|
|
@@ -73,7 +73,14 @@ echo "Say hello." | codexapi run
|
|
|
73
73
|
```bash
|
|
74
74
|
codexapi task "Fix the failing tests." --max-iterations 5
|
|
75
75
|
codexapi task -f task.yaml
|
|
76
|
+
codexapi task -f task.yaml -i README.md
|
|
76
77
|
```
|
|
78
|
+
Progress is shown by default for `codexapi task`; use `--quiet` to suppress it.
|
|
79
|
+
When using `--item`, the task file must include at least one `{{item}}` placeholder.
|
|
80
|
+
|
|
81
|
+
Task files default to using the standard check prompt for the task. Set `check: "None"` to skip verification.
|
|
82
|
+
Use `max_iterations` in the task file to override the default attempt cap (0 means unlimited).
|
|
83
|
+
Checks are wrapped with the verifier prompt, include the agent output, and expect JSON with `success`/`reason`.
|
|
77
84
|
|
|
78
85
|
Show running sessions and their latest activity:
|
|
79
86
|
|
|
@@ -115,6 +122,8 @@ Run a task file across a list file:
|
|
|
115
122
|
```bash
|
|
116
123
|
codexapi foreach list.txt task.yaml
|
|
117
124
|
codexapi foreach list.txt task.yaml -n 4
|
|
125
|
+
codexapi foreach list.txt task.yaml --retry-failed
|
|
126
|
+
codexapi foreach list.txt task.yaml --retry-all
|
|
118
127
|
```
|
|
119
128
|
|
|
120
129
|
## API
|
|
@@ -139,26 +148,31 @@ the same conversation and returns only the agent's message.
|
|
|
139
148
|
- `yolo` (bool): pass `--yolo` to Codex when true (defaults to true).
|
|
140
149
|
- `flags` (str | None): extra CLI flags to pass to Codex.
|
|
141
150
|
|
|
142
|
-
### `task(prompt, check=None,
|
|
151
|
+
### `task(prompt, check=None, max_iterations=10, cwd=None, yolo=True, flags=None, progress=False, set_up=None, tear_down=None, on_success=None, on_failure=None) -> str`
|
|
143
152
|
|
|
144
153
|
Runs a task with checker-driven retries and returns the success summary.
|
|
145
154
|
Raises `TaskFailed` when the maximum attempts are reached.
|
|
146
155
|
|
|
147
|
-
- `check` (str | None | False): custom check prompt, default checker, or `False` to skip.
|
|
148
|
-
- `
|
|
156
|
+
- `check` (str | None | False): custom check prompt, default checker, or `False`/`"None"` to skip.
|
|
157
|
+
- `max_iterations` (int): maximum number of task attempts (0 means unlimited).
|
|
158
|
+
- `progress` (bool): print progress after each verification round.
|
|
159
|
+
- `set_up`/`tear_down`/`on_success`/`on_failure` (str | None): optional hook prompts.
|
|
149
160
|
|
|
150
|
-
### `task_result(prompt, check=None,
|
|
161
|
+
### `task_result(prompt, check=None, max_iterations=10, cwd=None, yolo=True, flags=None, progress=False, set_up=None, tear_down=None, on_success=None, on_failure=None) -> TaskResult`
|
|
151
162
|
|
|
152
163
|
Runs a task with checker-driven retries and returns a `TaskResult` without
|
|
153
164
|
raising `TaskFailed`.
|
|
165
|
+
Arguments mirror `task()` (including hooks).
|
|
154
166
|
|
|
155
167
|
### `Task(prompt, max_attempts=10, cwd=None, yolo=True, thread_id=None, flags=None)`
|
|
156
168
|
|
|
157
169
|
Runs a Codex task with checker-driven retries. Subclass it and implement
|
|
158
170
|
`check()` to return an error string when the task is incomplete, or return
|
|
159
171
|
`None`/`""` when the task passes.
|
|
172
|
+
If you do not override `check()`, the default verifier wrapper runs with the
|
|
173
|
+
default check prompt and includes the agent output.
|
|
160
174
|
|
|
161
|
-
- `__call__() -> TaskResult`: run the task.
|
|
175
|
+
- `__call__(debug=False, progress=False) -> TaskResult`: run the task.
|
|
162
176
|
- `set_up()`: optional setup hook.
|
|
163
177
|
- `tear_down()`: optional cleanup hook.
|
|
164
178
|
- `check(output=None) -> str | None`: return an error description or `None`/`""`. `output` is the last agent response.
|
|
@@ -177,7 +191,7 @@ Simple result object returned by `Task.__call__`.
|
|
|
177
191
|
|
|
178
192
|
### `TaskFailed`
|
|
179
193
|
|
|
180
|
-
Exception raised by `task()` when
|
|
194
|
+
Exception raised by `task()` when attempts are exhausted.
|
|
181
195
|
|
|
182
196
|
- `summary` (str): failure summary text.
|
|
183
197
|
- `attempts` (int | None): attempts made when the task failed.
|
|
@@ -1,108 +0,0 @@
|
|
|
1
|
-
"""Load YAML task files and map them onto Task hooks."""
|
|
2
|
-
|
|
3
|
-
import yaml
|
|
4
|
-
|
|
5
|
-
from .agent import agent
|
|
6
|
-
from .task import Task
|
|
7
|
-
|
|
8
|
-
_ITEM_TOKEN = "{{item}}"
|
|
9
|
-
|
|
10
|
-
|
|
11
|
-
def load_task_file(path):
|
|
12
|
-
"""Load a YAML task file and return a normalized task definition."""
|
|
13
|
-
if not path:
|
|
14
|
-
raise ValueError("task file path is required")
|
|
15
|
-
with open(path, "r", encoding="utf-8") as handle:
|
|
16
|
-
data = yaml.safe_load(handle) or {}
|
|
17
|
-
if not isinstance(data, dict):
|
|
18
|
-
raise ValueError("Task file must be a YAML mapping.")
|
|
19
|
-
|
|
20
|
-
prompt = data.get("prompt")
|
|
21
|
-
if not isinstance(prompt, str) or not prompt.strip():
|
|
22
|
-
raise ValueError("Task file missing non-empty 'prompt'.")
|
|
23
|
-
|
|
24
|
-
return {
|
|
25
|
-
"prompt": prompt,
|
|
26
|
-
"set_up": _optional_str(data.get("set_up")),
|
|
27
|
-
"tear_down": _optional_str(data.get("tear_down")),
|
|
28
|
-
"check": _optional_str(data.get("check")),
|
|
29
|
-
"on_success": _optional_str(data.get("on_success")),
|
|
30
|
-
"on_failure": _optional_str(data.get("on_failure")),
|
|
31
|
-
}
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
def _optional_str(value):
|
|
35
|
-
if value is None:
|
|
36
|
-
return None
|
|
37
|
-
if isinstance(value, str):
|
|
38
|
-
return value if value.strip() else None
|
|
39
|
-
raise ValueError("Task file values must be strings.")
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
def _render(text, item):
|
|
43
|
-
if text is None:
|
|
44
|
-
return None
|
|
45
|
-
if item is None:
|
|
46
|
-
return text
|
|
47
|
-
return text.replace(_ITEM_TOKEN, item)
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
class AutoTask(Task):
|
|
51
|
-
"""Task subclass that maps YAML strings onto Task hooks."""
|
|
52
|
-
|
|
53
|
-
def __init__(
|
|
54
|
-
self,
|
|
55
|
-
config,
|
|
56
|
-
item=None,
|
|
57
|
-
max_attempts=10,
|
|
58
|
-
cwd=None,
|
|
59
|
-
yolo=True,
|
|
60
|
-
thread_id=None,
|
|
61
|
-
flags=None,
|
|
62
|
-
):
|
|
63
|
-
if not isinstance(config, dict):
|
|
64
|
-
raise TypeError("config must be a task definition dict")
|
|
65
|
-
self._config = config
|
|
66
|
-
self._item = "" if item is None else str(item)
|
|
67
|
-
self._yolo = yolo
|
|
68
|
-
self._flags = flags
|
|
69
|
-
prompt = _render(config.get("prompt"), self._item)
|
|
70
|
-
super().__init__(prompt, max_attempts, cwd, yolo, thread_id, flags)
|
|
71
|
-
|
|
72
|
-
def _hook(self, name):
|
|
73
|
-
return _render(self._config.get(name), self._item)
|
|
74
|
-
|
|
75
|
-
def set_up(self):
|
|
76
|
-
text = self._hook("set_up")
|
|
77
|
-
if text:
|
|
78
|
-
agent(text, self.cwd, self._yolo, self._flags)
|
|
79
|
-
|
|
80
|
-
def tear_down(self):
|
|
81
|
-
text = self._hook("tear_down")
|
|
82
|
-
if text:
|
|
83
|
-
agent(text, self.cwd, self._yolo, self._flags)
|
|
84
|
-
|
|
85
|
-
def check(self, output=None):
|
|
86
|
-
text = self._hook("check")
|
|
87
|
-
if not text:
|
|
88
|
-
return None
|
|
89
|
-
last_output = output if output is not None else self.last_output
|
|
90
|
-
last_output = last_output or ""
|
|
91
|
-
if last_output:
|
|
92
|
-
prompt = f"{text}\n\nAGENT OUTPUT:\n{last_output}"
|
|
93
|
-
else:
|
|
94
|
-
prompt = text
|
|
95
|
-
result = agent(prompt, self.cwd, self._yolo, self._flags)
|
|
96
|
-
if not isinstance(result, str) or not result.strip():
|
|
97
|
-
return None
|
|
98
|
-
return result
|
|
99
|
-
|
|
100
|
-
def on_success(self, result):
|
|
101
|
-
text = self._hook("on_success")
|
|
102
|
-
if text:
|
|
103
|
-
agent(text, self.cwd, self._yolo, self._flags)
|
|
104
|
-
|
|
105
|
-
def on_failure(self, result):
|
|
106
|
-
text = self._hook("on_failure")
|
|
107
|
-
if text:
|
|
108
|
-
agent(text, self.cwd, self._yolo, self._flags)
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|