agent-cli-dispatcher 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,5 @@
1
+ Metadata-Version: 2.4
2
+ Name: agent-cli-dispatcher
3
+ Version: 0.1.0
4
+ Summary: Structured LLM task runner with outcome routing via file signals
5
+ Requires-Python: >=3.11
@@ -0,0 +1,12 @@
1
+ llm_eval/__init__.py,sha256=ntKJRXSfXzS2pJTRn6NzfZM2wCzeesWO7rbSwGjAMLQ,3918
2
+ llm_eval/job.py,sha256=iz-KIojEV5_7emGjLHXv1-KnCMFhXGq9R7alm9Qf4HY,543
3
+ llm_eval/llm_svc.py,sha256=ZS_FMQPdWtcJHTOSgjy6DOm1kPyO3shRz9pGwr5TUM4,9048
4
+ llm_eval/llm_target.py,sha256=E_tCZHpHmZ-tpTW__bAESVFZOu80Ucy8owI9el1NYkg,368
5
+ llm_eval/preflight.py,sha256=kJX_ozQiwZl7qp4P3hdp8Vol8Ybz8jjtQWVRZsyTRHw,3614
6
+ llm_eval/prompt_builder.py,sha256=BZrgcAhpWqTsJSW_EsMB-I_vQu3Q82fDMQpgv185Cuc,1166
7
+ llm_eval/status_resolver.py,sha256=BcADbc-0RfrCdwcGLq3xALt7f5spUEvk9IeVLdGQqmQ,2000
8
+ llm_eval/workspace.py,sha256=M03nffvpHqUDMGlAHZRSCdmo0GJvPEWYGohJwiNvMhY,426
9
+ agent_cli_dispatcher-0.1.0.dist-info/METADATA,sha256=7XU_mh8v_dq3O0uCXxWQmkeOb9aLzCWUHL6TOZYo4wA,162
10
+ agent_cli_dispatcher-0.1.0.dist-info/WHEEL,sha256=aeYiig01lYGDzBgS8HxWXOg3uV61G9ijOsup-k9o1sk,91
11
+ agent_cli_dispatcher-0.1.0.dist-info/top_level.txt,sha256=qHnZEKloXHXIzVDe-vmFJ8gMIp8Kxy_PftF1f3HMCxs,9
12
+ agent_cli_dispatcher-0.1.0.dist-info/RECORD,,
@@ -0,0 +1,5 @@
1
+ Wheel-Version: 1.0
2
+ Generator: setuptools (82.0.1)
3
+ Root-Is-Purelib: true
4
+ Tag: py3-none-any
5
+
@@ -0,0 +1 @@
1
+ llm_eval
llm_eval/__init__.py ADDED
@@ -0,0 +1,108 @@
1
+ import logging
2
+ import time
3
+ from pathlib import Path
4
+ from typing import Callable, Union
5
+
6
+ from llm_eval import llm_svc
7
+ from llm_eval.job import JobResult, Outcome
8
+ from llm_eval.llm_svc import LLMEvaluationError
9
+ from llm_eval.llm_target import LLMTarget, parse_targets
10
+ from llm_eval.preflight import TargetStatus, check_all, check_target
11
+ from llm_eval.prompt_builder import build_prompt
12
+ from llm_eval.status_resolver import resolve
13
+ from llm_eval.workspace import cleanup_workspace, create_workspace
14
+
15
# Names re-exported as the package's public API.
__all__ = [
    "evaluate",
    "Outcome",
    "JobResult",
    "LLMTarget",
    "LLMEvaluationError",
    "check_target",
    "check_all",
    "TargetStatus",
    "parse_targets",
]

# Package-level logger; the fallback loop uses it to report failed targets.
logger = logging.getLogger(__name__)
19
+
20
+
21
def evaluate(
    target: LLMTarget | None,
    purpose: Union[str, Callable[[Path], str]],
    outcomes: list[Outcome],
    *,
    targets: list[LLMTarget] | None = None,
    on_exception: Callable[[Exception], None] | None = None,
    model: str | None = None,
    timeout: float = 1800,
    cwd: str | None = None,
) -> None:
    """Run an LLM evaluation against one or more targets.

    Pass ``targets`` (list) to enable ordered fallback: each target is tried in
    sequence and the next is used only when the previous raises LLMEvaluationError.
    Pass ``target`` (single) for the original single-target behaviour.
    Providing both raises ValueError; providing neither raises ValueError.

    ``purpose`` may be a plain string or a callable that receives the workspace
    Path and returns a string. Use the callable form when the prompt must embed
    the workspace path as the LLM output directory so that resolve() can find
    the files written by the LLM.

    Args:
        target: Single target to run, or None when ``targets`` is given.
        purpose: Task description, or a callable producing it from the workspace.
        outcomes: Possible conclusions; the matched one has its callback invoked
            with the resulting JobResult.
        targets: Ordered fallback list of targets (mutually exclusive with
            ``target``).
        on_exception: If given, receives any exception raised while running the
            LLM or resolving its result, and evaluate() returns normally. If
            omitted, such exceptions propagate.
        model: Optional model name forwarded to the LLM runner.
        timeout: Per-invocation subprocess timeout in seconds.
        cwd: Base directory under which the job workspace is created.

    Raises:
        ValueError: If both or neither of ``target``/``targets`` are provided.
        Exception: Anything raised by the LLM layer when ``on_exception`` is
            None, and anything raised by ``purpose`` or the outcome callback.
    """
    if targets is not None and target is not None:
        raise ValueError("Provide either 'target' or 'targets', not both.")
    _targets: list[LLMTarget] = targets if targets is not None else ([target] if target is not None else [])
    if not _targets:
        raise ValueError("evaluate() requires 'target' or 'targets'.")

    job_id, workspace = create_workspace(cwd)
    # The outer try/finally guarantees the workspace is removed on every exit
    # path, including a failing ``purpose`` callable or prompt construction
    # (those errors are caller/business-logic errors and still propagate).
    try:
        purpose_str = purpose(workspace) if callable(purpose) else purpose
        prompt = build_prompt(purpose_str, outcomes)
        start = time.monotonic()

        try:
            stdout, winning_target = _run_with_fallback(
                _targets, prompt, model=model, cwd=str(workspace), timeout=timeout
            )
            duration = time.monotonic() - start
            matched_outcome, result = resolve(
                workspace=workspace,
                outcomes=outcomes,
                job_id=job_id,
                target=winning_target.value,
                duration_seconds=duration,
                stdout=stdout,
            )
        except Exception as exc:
            # Clean up *before* notifying the handler, as callers may rely on
            # the workspace being gone by the time on_exception runs. The
            # second cleanup in ``finally`` is a harmless no-op.
            cleanup_workspace(workspace)
            if on_exception is None:
                raise
            on_exception(exc)
            return

        # Callback exceptions are intentionally NOT caught here — they originate from
        # business logic, not the LLM layer, and must propagate directly to the caller.
        matched_outcome.callback(result)
    finally:
        cleanup_workspace(workspace)
90
+
91
+
92
+ def _run_with_fallback(
93
+ targets: list[LLMTarget],
94
+ prompt: str,
95
+ *,
96
+ model: str | None = None,
97
+ cwd: str | None = None,
98
+ timeout: float = 1800,
99
+ ) -> tuple[str, LLMTarget]:
100
+ """Try each target in order; return (stdout, winning_target) on first success."""
101
+ last_exc: llm_svc.LLMEvaluationError | None = None
102
+ for t in targets:
103
+ try:
104
+ return llm_svc.run(t, prompt, model=model, cwd=cwd, timeout=timeout), t
105
+ except llm_svc.LLMEvaluationError as exc:
106
+ logger.warning("evaluate fallback: %s failed, trying next. error: %s", t.value, exc)
107
+ last_exc = exc
108
+ raise last_exc # type: ignore[misc]
llm_eval/job.py ADDED
@@ -0,0 +1,24 @@
1
+ from dataclasses import dataclass, field
2
+ from typing import Callable
3
+
4
+
5
+ @dataclass(frozen=True)
6
+ class Outcome:
7
+ description: str
8
+ callback: Callable[["JobResult"], None]
9
+ status: str | None = None
10
+ output_files: list[str] | None = None
11
+
12
+
13
def effective_status(outcome: Outcome, index: int) -> str:
    """Return the outcome's status label, or its list position when unset.

    An explicit ``status`` (including the empty string) is returned as-is;
    only ``None`` falls back to ``str(index)``.
    """
    if outcome.status is None:
        return str(index)
    return outcome.status
15
+
16
+
17
@dataclass(frozen=True)
class JobResult:
    """Immutable record of a finished LLM job, handed to the outcome callback."""

    # Identifier of the job workspace.
    job_id: str
    # Status label that was resolved for the job.
    status: str
    # Value of the LLM target that produced the result.
    target: str
    # Wall-clock time spent running the LLM, in seconds.
    duration_seconds: float
    # Contents of the outcome's declared output files, keyed by file name.
    files: dict[str, bytes]
    # Text captured from the LLM CLI.
    stdout: str
llm_eval/llm_svc.py ADDED
@@ -0,0 +1,233 @@
1
+ import json
2
+ import logging
3
+ import os
4
+ import re
5
+ import shutil
6
+ import subprocess
7
+ import time
8
+ import uuid
9
+ from datetime import datetime, timezone
10
+ from pathlib import Path
11
+
12
+ from llm_eval.llm_target import LLMTarget
13
+
14
+ logger = logging.getLogger(__name__)
15
+
16
+
17
class LLMEvaluationError(RuntimeError):
    """Raised when an LLM CLI invocation fails.

    Covers a non-zero exit status, quota exhaustion, a timeout, or any other
    error while executing the subprocess.
    """
19
+
20
+ _QUOTA_ERROR_PATTERNS: list[re.Pattern] = [
21
+ re.compile(p, re.IGNORECASE)
22
+ for p in [
23
+ r"exceeded your monthly token limit",
24
+ r"exceeded your current quota",
25
+ r"insufficient.quota",
26
+ r"quota.exceeded",
27
+ r"billing hard limit",
28
+ r"credit balance is too low",
29
+ r"out of credits",
30
+ r"rate.limit.exceeded",
31
+ r"429",
32
+ r"payment required",
33
+ ]
34
+ ]
35
+
36
+ _QUOTA_RETRY_INTERVAL_SECONDS: int = int(os.getenv("LLM_QUOTA_RETRY_INTERVAL", "300"))
37
+ _QUOTA_MAX_RETRIES: int = int(os.getenv("LLM_QUOTA_MAX_RETRIES", "288"))
38
+
39
+ _ALLOW_ALL_OPENCODE_PERMISSION = {
40
+ "bash": "allow", "read": "allow", "edit": "allow", "task": "allow",
41
+ "glob": "allow", "grep": "allow", "list": "allow",
42
+ "external_directory": "allow", "todowrite": "allow", "todoread": "allow",
43
+ "question": "allow", "webfetch": "allow", "websearch": "allow",
44
+ "codesearch": "allow", "lsp": "allow", "doom_loop": "allow", "skill": "allow",
45
+ }
46
+
47
+
48
def _is_quota_error(text: str) -> bool:
    """Return True when ``text`` matches any known quota/rate-limit pattern."""
    return any(pattern.search(text) for pattern in _QUOTA_ERROR_PATTERNS)
53
+
54
+
55
+ def _resolve_cli(command_name: str) -> str:
56
+ if os.name == "nt":
57
+ cmd_candidate = shutil.which(f"{command_name}.cmd")
58
+ if cmd_candidate:
59
+ return cmd_candidate
60
+ resolved = shutil.which(command_name)
61
+ return resolved if resolved else command_name
62
+
63
+
64
def _extract_opencode_text(raw_stdout: str) -> str:
    """Collapse opencode's JSON-lines event stream into its plain text parts.

    Returns ``raw_stdout`` unchanged when a line is not valid JSON or when no
    text content is found. Raises LLMEvaluationError when the stream contains
    an event of type "error", so that fallback logic treats it as a failed run.
    """
    chunks: list[str] = []
    try:
        for line in raw_stdout.splitlines():
            if not line.strip():
                continue
            event = json.loads(line)
            if not isinstance(event, dict):
                # Valid JSON but not an event object (e.g. a bare number).
                continue
            if event.get("type") == "error":
                error = event.get("error")
                data = error.get("data") if isinstance(error, dict) else None
                msg = data.get("message", "") if isinstance(data, dict) else ""
                raise LLMEvaluationError(str(msg) or "opencode reported an error event")
            message = event.get("message")
            if isinstance(message, dict):
                for item in (message.get("content") or []):
                    if isinstance(item, dict) and item.get("type") == "text":
                        text = item.get("text")
                        if text is not None:
                            chunks.append(str(text))
    except json.JSONDecodeError:
        return raw_stdout
    return "\n".join(chunks).strip() if chunks else raw_stdout


def run(
    target: LLMTarget,
    prompt: str,
    *,
    model: str | None = None,
    cwd: str | None = None,
    timeout: float | None = 1800,
    encoding: str = "utf-8",
    quota_retry_interval: int | None = None,
    quota_max_retries: int | None = None,
) -> str:
    """Run ``prompt`` through the CLI of ``target`` and return its text output.

    The prompt is staged in ``<cwd>/.llm_io/prompt_<id>.txt`` and passed to the
    CLI either on stdin (claude, codex, opencode) or as an argument (gemini,
    copilot). Failures whose output looks like a quota/rate-limit error are
    retried after a pause; any other failure raises immediately.

    Args:
        target: Which CLI to invoke.
        prompt: Prompt text; must contain non-whitespace characters.
        model: Model name; forwarded only to the claude and copilot CLIs.
        cwd: Working directory for the subprocess (defaults to the current one).
        timeout: Per-attempt subprocess timeout in seconds; None disables it.
        encoding: Encoding for the staged files and the subprocess pipes.
        quota_retry_interval: Seconds to sleep between quota retries; defaults
            to the module-level setting.
        quota_max_retries: Maximum number of quota retries; defaults to the
            module-level setting.

    Returns:
        The stripped stdout of the CLI (for opencode, the concatenated text
        parts of its JSON event stream when they can be extracted).

    Raises:
        ValueError: If the prompt is blank or the target is unsupported.
        LLMEvaluationError: On subprocess errors (including timeouts), non-zero
            exit, exhausted quota retries, or an opencode error event.
    """
    if not prompt.strip():
        raise ValueError("prompt must not be empty.")

    _retry_interval = quota_retry_interval if quota_retry_interval is not None else _QUOTA_RETRY_INTERVAL_SECONDS
    _max_retries = quota_max_retries if quota_max_retries is not None else _QUOTA_MAX_RETRIES
    # Clamp so a negative setting cannot skip the attempt loop entirely or
    # make time.sleep() raise.
    _retry_interval = max(0, _retry_interval)
    _max_retries = max(0, _max_retries)

    work_dir = str(Path(cwd).resolve()) if cwd else None
    effective_dir = work_dir or str(Path.cwd())

    run_id = uuid.uuid4().hex[:8]
    io_dir = Path(effective_dir) / ".llm_io"
    io_dir.mkdir(parents=True, exist_ok=True)
    prompt_file = io_dir / f"prompt_{run_id}.txt"
    output_file = io_dir / f"output_{run_id}.txt"
    prompt_file.write_text(prompt, encoding=encoding)

    stdin_input: str = ""
    env = dict(os.environ)

    try:
        if target == LLMTarget.CLAUDE:
            command = [_resolve_cli("claude"), "--print", "--dangerously-skip-permissions"]
            if model:
                command.extend(["--model", model])
            stdin_input = prompt_file.read_text(encoding=encoding)

        elif target == LLMTarget.GEMINI:
            command = [_resolve_cli("gemini"), "--approval-mode", "auto_edit",
                       "--prompt", prompt_file.read_text(encoding=encoding)]

        elif target == LLMTarget.CODEX:
            # Pass prompt via stdin to avoid Windows cmd line length limits (~8191 chars).
            # codex reads stdin when no PROMPT arg is given (or arg is "-").
            command = [_resolve_cli("codex"), "exec", "--dangerously-bypass-approvals-and-sandbox",
                       "--skip-git-repo-check"]
            stdin_input = prompt_file.read_text(encoding=encoding)

        elif target == LLMTarget.OPENCODE:
            env.setdefault("OPENCODE_PERMISSION", json.dumps(_ALLOW_ALL_OPENCODE_PERMISSION))
            # Give opencode private XDG directories under the working dir
            # unless the caller's environment already defines them.
            runtime_root = Path(effective_dir).resolve() / "data" / "tool-runtime" / "opencode"
            for subdir in ("config", "data", "state"):
                (runtime_root / subdir).mkdir(parents=True, exist_ok=True)
            env.setdefault("XDG_CONFIG_HOME", str(runtime_root / "config"))
            env.setdefault("XDG_DATA_HOME", str(runtime_root / "data"))
            env.setdefault("XDG_STATE_HOME", str(runtime_root / "state"))
            command = [_resolve_cli("opencode"), "run",
                       "--dir", effective_dir, "--format", "json", "-"]
            stdin_input = prompt_file.read_text(encoding=encoding)

        elif target == LLMTarget.COPILOT:
            command = [_resolve_cli("copilot"), "-p", prompt_file.read_text(encoding=encoding),
                       "--allow-all", "--no-ask-user", "--output-format", "text", "--silent",
                       "--add-dir", effective_dir]
            if model:
                command.extend(["--model", model])

        else:
            raise ValueError(f"Unsupported LLM target: {target}")

        logger.info("run [%s] cwd=%s", target.value, work_dir or "(inherit)")
        logger.debug("run [%s] command=%s", target.value, command)
        logger.debug("run [%s] prompt_file=%s\n%s", target.value, prompt_file, prompt)

        # One initial attempt plus up to _max_retries quota retries.
        for quota_attempt in range(_max_retries + 1):
            try:
                completed = subprocess.run(
                    command,
                    input=stdin_input,
                    capture_output=True,
                    text=True,
                    encoding=encoding,
                    errors="replace",
                    cwd=work_dir,
                    env=env,
                    timeout=timeout,
                )
            except Exception as e:
                # Includes TimeoutExpired and a missing executable.
                logger.error("execute cmd exception: %s", e)
                raise LLMEvaluationError(f"{target.value} subprocess error: {e}") from e

            if completed.returncode == 0:
                break

            stderr = (completed.stderr or "").strip()
            stdout = (completed.stdout or "").strip()
            parts = [s for s in [stderr, stdout] if s]
            detail = "\n".join(parts) if parts else "(no output)"

            if not _is_quota_error(detail):
                raise LLMEvaluationError(
                    f"{target.value} CLI failed (exit {completed.returncode}): {detail[:500]}"
                )

            if quota_attempt >= _max_retries:
                raise LLMEvaluationError(
                    f"{target.value} quota exhausted after {_max_retries} retries. "
                    f"Last error: {detail[:300]}"
                )

            logger.warning(
                "[QUOTA EXHAUSTED] %s | attempt=%d/%d | Retrying in %ds. Detail: %s",
                target.value, quota_attempt + 1, _max_retries + 1, _retry_interval, detail[:300],
            )
            time.sleep(_retry_interval)

        raw_stdout = (completed.stdout or "").strip()

        if target == LLMTarget.OPENCODE and raw_stdout:
            raw_stdout = _extract_opencode_text(raw_stdout)

        logger.info("run [%s] done. stdout_len=%d", target.value, len(raw_stdout))
        # NOTE(review): the write/read round trip through output_file looks
        # redundant, but it also normalizes line endings in the returned text;
        # kept so callers observe the same output as before.
        output_file.write_text(raw_stdout, encoding=encoding)
        return output_file.read_text(encoding=encoding)

    finally:
        prompt_file.unlink(missing_ok=True)
        output_file.unlink(missing_ok=True)
213
+
214
+
215
def run_with_fallback(
    targets: list[LLMTarget],
    prompt: str,
    *,
    model: str | None = None,
    cwd: str | None = None,
    timeout: float | None = 1800,
    encoding: str = "utf-8",
) -> str:
    """Run ``prompt`` against each target in order and return the first output.

    A target that raises LLMEvaluationError is logged and skipped; when every
    target fails, the last such error is re-raised.

    Raises:
        ValueError: If ``targets`` is empty.
        LLMEvaluationError: If all targets fail.
    """
    if not targets:
        raise ValueError("targets must not be empty")

    failure: LLMEvaluationError | None = None
    for candidate in targets:
        try:
            output = run(candidate, prompt, model=model, cwd=cwd, timeout=timeout, encoding=encoding)
        except LLMEvaluationError as exc:
            logger.warning("run_with_fallback: %s failed, trying next. error: %s", candidate.value, exc)
            failure = exc
        else:
            return output
    raise failure
llm_eval/llm_target.py ADDED
@@ -0,0 +1,14 @@
1
+ from enum import Enum
2
+
3
+
4
class LLMTarget(Enum):
    """Supported LLM command-line tools; each value is the tool's identifier string."""

    CLAUDE = "claude"
    GEMINI = "gemini"
    CODEX = "codex"
    OPENCODE = "opencode"
    COPILOT = "copilot"
10
+
11
+
12
def parse_targets(value: str) -> list[LLMTarget]:
    """Parse 'claude,gemini' → [LLMTarget.CLAUDE, LLMTarget.GEMINI].

    Entries are whitespace-trimmed and empty entries are skipped. An unknown
    name raises ValueError from the LLMTarget lookup.
    """
    parsed: list[LLMTarget] = []
    for raw_name in value.split(","):
        name = raw_name.strip()
        if name:
            parsed.append(LLMTarget(name))
    return parsed
llm_eval/preflight.py ADDED
@@ -0,0 +1,101 @@
1
+ import os
2
+ import shutil
3
+ import subprocess
4
+ from dataclasses import dataclass
5
+ from typing import Callable
6
+
7
+ from llm_eval.llm_target import LLMTarget
8
+
9
+
10
+ @dataclass(frozen=True)
11
+ class TargetStatus:
12
+ ok: bool
13
+ reason: str | None = None
14
+
15
+
16
+ def _resolve_cli(command_name: str) -> str:
17
+ if os.name == "nt":
18
+ cmd_candidate = shutil.which(f"{command_name}.cmd")
19
+ if cmd_candidate:
20
+ return cmd_candidate
21
+ resolved = shutil.which(command_name)
22
+ return resolved if resolved else command_name
23
+
24
+
25
def _check_claude() -> TargetStatus:
    """Check that the claude CLI is installed and reports a logged-in session.

    Runs ``claude auth status --json`` and succeeds when the command exits
    with 0 and its output indicates ``loggedIn`` is true. Never raises: every
    failure is reported through ``TargetStatus.reason`` (capped at 200 chars).
    """
    import json  # local import: this module does not import json at file level

    binary = _resolve_cli("claude")
    try:
        result = subprocess.run(
            [binary, "auth", "status", "--json"],
            capture_output=True, text=True, timeout=15, input="",
        )
        if result.returncode == 0:
            # The literal probe only matches one exact spacing; parsing the
            # JSON also accepts compact output such as {"loggedIn":true}.
            logged_in = '"loggedIn": true' in result.stdout
            if not logged_in:
                try:
                    payload = json.loads(result.stdout)
                except json.JSONDecodeError:
                    payload = None
                logged_in = isinstance(payload, dict) and payload.get("loggedIn") is True
            if logged_in:
                return TargetStatus(ok=True)
        reason = result.stderr.strip() or result.stdout.strip() or "loggedIn not true"
        return TargetStatus(ok=False, reason=reason[:200])
    except FileNotFoundError:
        return TargetStatus(ok=False, reason="claude CLI not found on PATH")
    except subprocess.TimeoutExpired:
        return TargetStatus(ok=False, reason="claude auth status timed out")
    except Exception as e:
        return TargetStatus(ok=False, reason=str(e)[:200])
42
+
43
+
44
def _check_codex() -> TargetStatus:
    """Check that the codex CLI is installed and logged in.

    Runs ``codex login status``; the check passes when the exit status is 0
    and stderr contains "Logged in". Never raises: failures are returned as a
    TargetStatus with a reason of at most 200 characters.
    """
    cli = _resolve_cli("codex")
    try:
        proc = subprocess.run(
            [cli, "login", "status"],
            input="",
            capture_output=True,
            text=True,
            timeout=15,
        )
        logged_in = proc.returncode == 0 and "Logged in" in proc.stderr
        if not logged_in:
            detail = proc.stderr.strip() or proc.stdout.strip() or "not logged in"
            return TargetStatus(ok=False, reason=detail[:200])
        return TargetStatus(ok=True)
    except FileNotFoundError:
        return TargetStatus(ok=False, reason="codex CLI not found on PATH")
    except subprocess.TimeoutExpired:
        return TargetStatus(ok=False, reason="codex login status timed out")
    except Exception as err:
        return TargetStatus(ok=False, reason=str(err)[:200])
61
+
62
+
63
def _check_via_version(tool: str) -> TargetStatus:
    """Check that ``tool`` is installed by running ``<tool> --version``.

    Passes when the command exits with 0. This only shows that the CLI runs;
    it does not probe authentication. Never raises: failures are returned as a
    TargetStatus with a reason of at most 200 characters.
    """
    binary = _resolve_cli(tool)
    try:
        result = subprocess.run(
            [binary, "--version"],
            capture_output=True, text=True, timeout=15, input="",
        )
        if result.returncode == 0:
            return TargetStatus(ok=True)
        # Strip each stream before choosing, so whitespace-only stderr does
        # not mask stdout or leave an empty reason (matches the other checkers).
        reason = result.stderr.strip() or result.stdout.strip() or "non-zero exit"
        return TargetStatus(ok=False, reason=reason[:200])
    except FileNotFoundError:
        return TargetStatus(ok=False, reason=f"{tool} CLI not found on PATH")
    except subprocess.TimeoutExpired:
        return TargetStatus(ok=False, reason=f"{tool} --version timed out")
    except Exception as e:
        return TargetStatus(ok=False, reason=str(e)[:200])
80
+
81
+
82
# Dispatch table: one zero-argument checker per target. claude and codex have
# dedicated login probes; the remaining tools are only probed with --version.
# Insertion order is the order in which check_all() runs the checks.
_CHECKERS: dict[LLMTarget, Callable[[], TargetStatus]] = {
    LLMTarget.CLAUDE: _check_claude,
    LLMTarget.GEMINI: (lambda: _check_via_version("gemini")),
    LLMTarget.CODEX: _check_codex,
    LLMTarget.OPENCODE: (lambda: _check_via_version("opencode")),
    LLMTarget.COPILOT: (lambda: _check_via_version("copilot")),
}
89
+
90
+
91
def check_target(target: LLMTarget) -> TargetStatus:
    """Run the preflight check registered for ``target`` and return its status.

    Raises KeyError if no checker is registered for ``target``.
    """
    checker = _CHECKERS[target]
    return checker()
94
+
95
+
96
def check_all() -> dict[LLMTarget, TargetStatus]:
    """Run every registered preflight check and map each target to its status.

    Checks are executed on every call; results are not cached.
    """
    statuses: dict[LLMTarget, TargetStatus] = {}
    for target, checker in _CHECKERS.items():
        statuses[target] = checker()
    return statuses
99
+
100
if __name__ == '__main__':
    # Manual smoke test: show the codex check result instead of discarding it.
    print(_check_codex())
llm_eval/prompt_builder.py ADDED
@@ -0,0 +1,34 @@
1
+ from llm_eval.job import Outcome, effective_status
2
+
3
+
4
def build_prompt(purpose: str, outcomes: list[Outcome]) -> str:
    """Compose the full prompt: the task text plus status-file instructions.

    After ``purpose`` the prompt lists one ``status_<label>`` file per outcome
    (label from effective_status) with its description, then, for each outcome
    that declares output files, the files to write, and finally the rules
    about creating exactly one empty status file.
    """
    labelled = [(effective_status(outcome, i), outcome) for i, outcome in enumerate(outcomes)]

    lines = [
        purpose,
        "",
        "---",
        "",
        "After completing your analysis, you MUST create exactly one of the following "
        "empty files in your current working directory to signal your conclusion. "
        "This must be the last action you take.",
        "",
        "Status files (create exactly one, leave it empty):",
    ]

    # One line per outcome: padded status-file name, then its meaning.
    lines.extend(
        f" status_{status:<20} — {outcome.description}"
        for status, outcome in labelled
    )
    lines.append("")

    # Extra deliverables, only for outcomes that declare any.
    for status, outcome in labelled:
        if not outcome.output_files:
            continue
        lines.append(f'If the outcome is "{status}", also write these files:')
        lines.extend(f" {filename}" for filename in outcome.output_files)
        lines.append("")

    lines += [
        "Do not create more than one status file.",
        "Do not write anything inside the status file.",
    ]
    return "\n".join(lines)
llm_eval/status_resolver.py ADDED
@@ -0,0 +1,67 @@
1
+ import logging
2
+ from pathlib import Path
3
+
4
+ from llm_eval.job import JobResult, Outcome, effective_status
5
+
6
+ logger = logging.getLogger(__name__)
7
+
8
+
9
def resolve(
    workspace: Path,
    outcomes: list[Outcome],
    job_id: str,
    target: str,
    duration_seconds: float,
    stdout: str,
) -> tuple[Outcome, JobResult]:
    """Map the status file left in ``workspace`` to an outcome and build the result.

    The first ``status_*`` entry (sorted by path) decides the outcome. When no
    status file exists, the outcome whose effective status is "error" is used.
    The matched outcome's declared output files are read into the result.

    Args:
        workspace: Directory in which the LLM was asked to create its files.
        outcomes: Candidate outcomes, matched by effective status.
        job_id: Identifier copied into the result.
        target: Target value copied into the result.
        duration_seconds: Run duration copied into the result.
        stdout: LLM output copied into the result.

    Returns:
        ``(matched_outcome, job_result)``.

    Raises:
        RuntimeError: If there is no status file and no "error" outcome, if the
            status file matches no outcome, or if a declared output file is
            missing.
    """

    def find_outcome(name: str) -> Outcome | None:
        return next(
            (o for i, o in enumerate(outcomes) if effective_status(o, i) == name),
            None,
        )

    status_files = sorted(workspace.glob("status_*"))

    if len(status_files) > 1:
        logger.warning(
            "Multiple status files found: %s. Using %s.",
            [p.name for p in status_files],
            status_files[0].name,
        )

    if not status_files:
        status_name = "error"
        matched = find_outcome(status_name)
        if matched is None:
            raise RuntimeError(
                "No status file created by LLM and no 'error' outcome defined."
            )
    else:
        chosen = status_files[0]
        prefix_len = len("status_")
        # Try the exact file name first so statuses containing a dot
        # ("needs.review") match; then the stem, so that an extension added by
        # the LLM ("status_ok.txt") is still tolerated.
        candidates = list(dict.fromkeys([chosen.name[prefix_len:], chosen.stem[prefix_len:]]))
        matched = None
        status_name = candidates[0]
        for candidate in candidates:
            matched = find_outcome(candidate)
            if matched is not None:
                status_name = candidate
                break
        if matched is None:
            raise RuntimeError(
                f"Status file '{chosen.name}' does not match any defined outcome."
            )

    if matched.output_files is not None:
        missing = [f for f in matched.output_files if not (workspace / f).exists()]
        if missing:
            raise RuntimeError(
                f"Outcome '{status_name}' declared output_files {missing} "
                "but LLM did not create them."
            )

    declared = matched.output_files or []
    files: dict[str, bytes] = {f: (workspace / f).read_bytes() for f in declared}

    result = JobResult(
        job_id=job_id,
        status=status_name,
        target=target,
        duration_seconds=duration_seconds,
        files=files,
        stdout=stdout,
    )
    return matched, result
llm_eval/workspace.py ADDED
@@ -0,0 +1,15 @@
1
+ import shutil
2
+ import uuid
3
+ from pathlib import Path
4
+
5
+
6
+ def create_workspace(base_dir: str | None) -> tuple[str, Path]:
7
+ job_id = uuid.uuid4().hex[:8]
8
+ base = Path(base_dir) if base_dir else Path.cwd()
9
+ workspace = base / ".llm_eval" / job_id
10
+ workspace.mkdir(parents=True, exist_ok=True)
11
+ return job_id, workspace
12
+
13
+
14
def cleanup_workspace(workspace: Path) -> None:
    """Delete the ``workspace`` directory tree, ignoring any errors.

    Safe to call more than once or on a path that no longer exists.
    """
    shutil.rmtree(path=workspace, ignore_errors=True)