agent-cli-dispatcher 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- agent_cli_dispatcher-0.1.0.dist-info/METADATA +5 -0
- agent_cli_dispatcher-0.1.0.dist-info/RECORD +12 -0
- agent_cli_dispatcher-0.1.0.dist-info/WHEEL +5 -0
- agent_cli_dispatcher-0.1.0.dist-info/top_level.txt +1 -0
- llm_eval/__init__.py +108 -0
- llm_eval/job.py +24 -0
- llm_eval/llm_svc.py +233 -0
- llm_eval/llm_target.py +14 -0
- llm_eval/preflight.py +101 -0
- llm_eval/prompt_builder.py +34 -0
- llm_eval/status_resolver.py +67 -0
- llm_eval/workspace.py +15 -0
|
@@ -0,0 +1,12 @@
|
|
|
1
|
+
llm_eval/__init__.py,sha256=ntKJRXSfXzS2pJTRn6NzfZM2wCzeesWO7rbSwGjAMLQ,3918
|
|
2
|
+
llm_eval/job.py,sha256=iz-KIojEV5_7emGjLHXv1-KnCMFhXGq9R7alm9Qf4HY,543
|
|
3
|
+
llm_eval/llm_svc.py,sha256=ZS_FMQPdWtcJHTOSgjy6DOm1kPyO3shRz9pGwr5TUM4,9048
|
|
4
|
+
llm_eval/llm_target.py,sha256=E_tCZHpHmZ-tpTW__bAESVFZOu80Ucy8owI9el1NYkg,368
|
|
5
|
+
llm_eval/preflight.py,sha256=kJX_ozQiwZl7qp4P3hdp8Vol8Ybz8jjtQWVRZsyTRHw,3614
|
|
6
|
+
llm_eval/prompt_builder.py,sha256=BZrgcAhpWqTsJSW_EsMB-I_vQu3Q82fDMQpgv185Cuc,1166
|
|
7
|
+
llm_eval/status_resolver.py,sha256=BcADbc-0RfrCdwcGLq3xALt7f5spUEvk9IeVLdGQqmQ,2000
|
|
8
|
+
llm_eval/workspace.py,sha256=M03nffvpHqUDMGlAHZRSCdmo0GJvPEWYGohJwiNvMhY,426
|
|
9
|
+
agent_cli_dispatcher-0.1.0.dist-info/METADATA,sha256=7XU_mh8v_dq3O0uCXxWQmkeOb9aLzCWUHL6TOZYo4wA,162
|
|
10
|
+
agent_cli_dispatcher-0.1.0.dist-info/WHEEL,sha256=aeYiig01lYGDzBgS8HxWXOg3uV61G9ijOsup-k9o1sk,91
|
|
11
|
+
agent_cli_dispatcher-0.1.0.dist-info/top_level.txt,sha256=qHnZEKloXHXIzVDe-vmFJ8gMIp8Kxy_PftF1f3HMCxs,9
|
|
12
|
+
agent_cli_dispatcher-0.1.0.dist-info/RECORD,,
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
llm_eval
|
llm_eval/__init__.py
ADDED
|
@@ -0,0 +1,108 @@
|
|
|
1
|
+
import logging
|
|
2
|
+
import time
|
|
3
|
+
from pathlib import Path
|
|
4
|
+
from typing import Callable, Union
|
|
5
|
+
|
|
6
|
+
from llm_eval import llm_svc
|
|
7
|
+
from llm_eval.job import JobResult, Outcome
|
|
8
|
+
from llm_eval.llm_svc import LLMEvaluationError
|
|
9
|
+
from llm_eval.llm_target import LLMTarget, parse_targets
|
|
10
|
+
from llm_eval.preflight import TargetStatus, check_all, check_target
|
|
11
|
+
from llm_eval.prompt_builder import build_prompt
|
|
12
|
+
from llm_eval.status_resolver import resolve
|
|
13
|
+
from llm_eval.workspace import cleanup_workspace, create_workspace
|
|
14
|
+
|
|
15
|
+
# Public API re-exported by the package.
__all__ = [
    "evaluate",
    "Outcome",
    "JobResult",
    "LLMTarget",
    "LLMEvaluationError",
    "check_target",
    "check_all",
    "TargetStatus",
    "parse_targets",
]
|
|
17
|
+
|
|
18
|
+
logger = logging.getLogger(__name__)
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
def evaluate(
    target: LLMTarget | None,
    purpose: Union[str, Callable[[Path], str]],
    outcomes: list[Outcome],
    *,
    targets: list[LLMTarget] | None = None,
    on_exception: Callable[[Exception], None] | None = None,
    model: str | None = None,
    timeout: float = 1800,
    cwd: str | None = None,
) -> None:
    """Run an LLM evaluation against one or more targets.

    Pass ``targets`` (list) to enable ordered fallback: each target is tried in
    sequence and the next is used only when the previous raises LLMEvaluationError.
    Pass ``target`` (single) for the original single-target behaviour.
    Providing both raises ValueError; providing neither raises ValueError.

    ``purpose`` may be a plain string or a callable that receives the workspace
    Path and returns a string. Use the callable form when the prompt must embed
    the workspace path as the LLM output directory so that resolve() can find
    the files written by the LLM.

    Args:
        target: Single target to run, or None when ``targets`` is given.
        purpose: Prompt text, or a callable producing it from the workspace path.
        outcomes: Possible conclusions; the callback of the matched one is invoked.
        targets: Ordered fallback list of targets.
        on_exception: When given, exceptions raised while running the LLM or
            resolving its result are passed to it and ``evaluate`` returns
            normally; when None they are re-raised.
        model: Model name forwarded to the LLM runner.
        timeout: Timeout in seconds forwarded to the LLM runner.
        cwd: Base directory under which the workspace is created.

    Raises:
        ValueError: If both or neither of ``target`` / ``targets`` are provided.
    """
    if targets is not None and target is not None:
        raise ValueError("Provide either 'target' or 'targets', not both.")
    _targets: list[LLMTarget] = targets if targets is not None else ([target] if target is not None else [])
    if not _targets:
        raise ValueError("evaluate() requires 'target' or 'targets'.")

    job_id, workspace = create_workspace(cwd)
    try:
        # Building the prompt happens inside the try so that a failing
        # ``purpose`` callable no longer leaves the workspace behind.
        purpose_str = purpose(workspace) if callable(purpose) else purpose
        prompt = build_prompt(purpose_str, outcomes)

        start = time.monotonic()
        try:
            stdout, winning_target = _run_with_fallback(
                _targets, prompt, model=model, cwd=str(workspace), timeout=timeout
            )
            duration = time.monotonic() - start
            matched_outcome, result = resolve(
                workspace=workspace,
                outcomes=outcomes,
                job_id=job_id,
                target=winning_target.value,
                duration_seconds=duration,
                stdout=stdout,
            )
        except Exception as exc:
            # Keep the original ordering: the workspace is removed before the
            # handler is notified. The outer ``finally`` makes this idempotent.
            cleanup_workspace(workspace)
            if on_exception is None:
                raise
            on_exception(exc)
            return

        # Callback exceptions are intentionally NOT caught here — they originate from
        # business logic, not the LLM layer, and must propagate directly to the caller.
        matched_outcome.callback(result)
    finally:
        cleanup_workspace(workspace)
|
|
90
|
+
|
|
91
|
+
|
|
92
|
+
def _run_with_fallback(
    targets: list[LLMTarget],
    prompt: str,
    *,
    model: str | None = None,
    cwd: str | None = None,
    timeout: float = 1800,
) -> tuple[str, LLMTarget]:
    """Try each target in order; return (stdout, winning_target) on first success.

    Only ``LLMEvaluationError`` triggers a fallback to the next target; any
    other exception propagates immediately.

    Raises:
        ValueError: If ``targets`` is empty.
        llm_svc.LLMEvaluationError: The last target's error when all targets fail.
    """
    if not targets:
        # Without this guard the final ``raise`` would be handed ``None``.
        raise ValueError("targets must not be empty")

    last_exc: llm_svc.LLMEvaluationError | None = None
    for t in targets:
        try:
            return llm_svc.run(t, prompt, model=model, cwd=cwd, timeout=timeout), t
        except llm_svc.LLMEvaluationError as exc:
            logger.warning("evaluate fallback: %s failed, trying next. error: %s", t.value, exc)
            last_exc = exc
    raise last_exc  # type: ignore[misc]
|
llm_eval/job.py
ADDED
|
@@ -0,0 +1,24 @@
|
|
|
1
|
+
from dataclasses import dataclass, field
|
|
2
|
+
from typing import Callable
|
|
3
|
+
|
|
4
|
+
|
|
5
|
+
@dataclass(frozen=True)
|
|
6
|
+
class Outcome:
|
|
7
|
+
description: str
|
|
8
|
+
callback: Callable[["JobResult"], None]
|
|
9
|
+
status: str | None = None
|
|
10
|
+
output_files: list[str] | None = None
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
def effective_status(outcome: Outcome, index: int) -> str:
    """Return the outcome's explicit status, or ``str(index)`` when none is set."""
    if outcome.status is None:
        return str(index)
    return outcome.status
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
@dataclass(frozen=True)
class JobResult:
    """Immutable record describing one finished evaluation job."""

    # Identifier of the job this result belongs to.
    job_id: str
    # Name of the status that was resolved for the job.
    status: str
    # Name of the LLM target that produced the result.
    target: str
    # Duration of the run, in seconds.
    duration_seconds: float
    # Contents of the collected output files, keyed by file name.
    files: dict[str, bytes]
    # Text captured from the LLM run.
    stdout: str
|
llm_eval/llm_svc.py
ADDED
|
@@ -0,0 +1,233 @@
|
|
|
1
|
+
import json
|
|
2
|
+
import logging
|
|
3
|
+
import os
|
|
4
|
+
import re
|
|
5
|
+
import shutil
|
|
6
|
+
import subprocess
|
|
7
|
+
import time
|
|
8
|
+
import uuid
|
|
9
|
+
from datetime import datetime, timezone
|
|
10
|
+
from pathlib import Path
|
|
11
|
+
|
|
12
|
+
from llm_eval.llm_target import LLMTarget
|
|
13
|
+
|
|
14
|
+
logger = logging.getLogger(__name__)
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
class LLMEvaluationError(RuntimeError):
    """Raised when running an LLM CLI fails.

    Covers a non-zero exit, quota exhaustion, a timeout, or any other error
    while executing the subprocess.
    """
|
|
19
|
+
|
|
20
|
+
# Case-insensitive signatures that mark CLI output as a quota / rate-limit failure.
# A match makes run() wait and retry instead of failing immediately.
_QUOTA_ERROR_PATTERNS: list[re.Pattern] = [
    re.compile(p, re.IGNORECASE)
    for p in [
        r"exceeded your monthly token limit",
        r"exceeded your current quota",
        r"insufficient.quota",
        r"quota.exceeded",
        r"billing hard limit",
        r"credit balance is too low",
        r"out of credits",
        r"rate.limit.exceeded",
        # Match the HTTP status only as a standalone number: a bare "429" also
        # hit longer numbers (byte counts, ids) and caused needless retry waits.
        r"(?<!\d)429(?!\d)",
        r"payment required",
    ]
]
|
|
35
|
+
|
|
36
|
+
# Seconds to wait between quota retries; overridable via LLM_QUOTA_RETRY_INTERVAL.
_QUOTA_RETRY_INTERVAL_SECONDS: int = int(os.environ.get("LLM_QUOTA_RETRY_INTERVAL", "300"))
# Maximum number of quota retries; overridable via LLM_QUOTA_MAX_RETRIES.
_QUOTA_MAX_RETRIES: int = int(os.environ.get("LLM_QUOTA_MAX_RETRIES", "288"))
|
|
38
|
+
|
|
39
|
+
# Permission map that sets every listed OpenCode permission to "allow".
# run() serialises it to JSON as the default for the OPENCODE_PERMISSION variable.
_ALLOW_ALL_OPENCODE_PERMISSION = dict.fromkeys(
    (
        "bash",
        "read",
        "edit",
        "task",
        "glob",
        "grep",
        "list",
        "external_directory",
        "todowrite",
        "todoread",
        "question",
        "webfetch",
        "websearch",
        "codesearch",
        "lsp",
        "doom_loop",
        "skill",
    ),
    "allow",
)
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
def _is_quota_error(text: str) -> bool:
    """Return True when ``text`` matches any known quota / rate-limit pattern."""
    return any(pattern.search(text) for pattern in _QUOTA_ERROR_PATTERNS)
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
def _resolve_cli(command_name: str) -> str:
    """Locate ``command_name`` on PATH, falling back to the bare name.

    On Windows (``os.name == "nt"``) a ``<name>.cmd`` file is looked up first.
    """
    lookup_order = [command_name]
    if os.name == "nt":
        lookup_order.insert(0, f"{command_name}.cmd")

    for candidate in lookup_order:
        located = shutil.which(candidate)
        if located:
            return located
    # Nothing found: hand back the name unchanged and let the caller's spawn fail.
    return command_name
|
|
62
|
+
|
|
63
|
+
|
|
64
|
+
def _extract_opencode_text(raw_stdout: str) -> str:
    """Reduce OpenCode's line-delimited JSON events to the contained text parts.

    Returns ``raw_stdout`` unchanged when a line is not valid JSON or when no
    text parts are found.

    Raises:
        LLMEvaluationError: If an event of type ``"error"`` is present.
    """
    chunks: list[str] = []
    try:
        for line in raw_stdout.splitlines():
            if not line.strip():
                continue
            event = json.loads(line)
            if not isinstance(event, dict):
                # Valid JSON but not an event object; nothing to extract.
                continue
            if event.get("type") == "error":
                error = event.get("error")
                data = error.get("data") if isinstance(error, dict) else None
                msg = data.get("message", "") if isinstance(data, dict) else ""
                # LLMEvaluationError (a RuntimeError subclass) keeps existing
                # ``except RuntimeError`` handlers working and lets the
                # fallback chain move on to the next target.
                raise LLMEvaluationError(str(msg))
            message = event.get("message")
            if isinstance(message, dict):
                for item in (message.get("content") or []):
                    if isinstance(item, dict) and item.get("type") == "text":
                        chunks.append(str(item.get("text", "")))
    except json.JSONDecodeError:
        # Output is not pure JSON lines: keep it as-is.
        return raw_stdout
    return "\n".join(chunks).strip() if chunks else raw_stdout


def run(
    target: LLMTarget,
    prompt: str,
    *,
    model: str | None = None,
    cwd: str | None = None,
    timeout: float | None = 1800,
    encoding: str = "utf-8",
    quota_retry_interval: int | None = None,
    quota_max_retries: int | None = None,
) -> str:
    """Run ``prompt`` through the CLI of ``target`` and return its stripped stdout.

    The prompt is written to a temporary file under ``<cwd>/.llm_io`` and then
    passed to the CLI on stdin (Claude, Codex, OpenCode) or as a command-line
    argument (Gemini, Copilot). A non-zero exit whose output matches a quota
    pattern is retried after a wait; any other non-zero exit fails immediately.

    Args:
        target: Which CLI to invoke.
        prompt: Prompt text; must contain non-whitespace characters.
        model: Forwarded as ``--model`` for the Claude and Copilot targets only.
        cwd: Working directory for the subprocess; defaults to the current one.
        timeout: Per-invocation timeout in seconds passed to ``subprocess.run``.
        encoding: Encoding for the prompt/output files and subprocess pipes.
        quota_retry_interval: Seconds between quota retries; defaults to
            ``_QUOTA_RETRY_INTERVAL_SECONDS``.
        quota_max_retries: Maximum quota retries; defaults to ``_QUOTA_MAX_RETRIES``.

    Raises:
        ValueError: If ``prompt`` is blank or ``target`` is unsupported.
        LLMEvaluationError: On subprocess errors (including timeouts), non-zero
            exit, exhausted quota retries, or an OpenCode error event.
    """
    if not prompt.strip():
        raise ValueError("prompt must not be empty.")

    _retry_interval = quota_retry_interval if quota_retry_interval is not None else _QUOTA_RETRY_INTERVAL_SECONDS
    _max_retries = quota_max_retries if quota_max_retries is not None else _QUOTA_MAX_RETRIES
    # Negative settings would skip the loop entirely or make time.sleep() raise.
    _retry_interval = max(_retry_interval, 0)
    _max_retries = max(_max_retries, 0)

    work_dir = str(Path(cwd).resolve()) if cwd else None
    effective_dir = work_dir or str(Path.cwd())

    run_id = uuid.uuid4().hex[:8]
    io_dir = Path(effective_dir) / ".llm_io"
    io_dir.mkdir(parents=True, exist_ok=True)
    prompt_file = io_dir / f"prompt_{run_id}.txt"
    output_file = io_dir / f"output_{run_id}.txt"
    prompt_file.write_text(prompt, encoding=encoding)

    stdin_input: str = ""
    env = dict(os.environ)

    try:
        # The prompt is read back from the file, as the original code did.
        prompt_text = prompt_file.read_text(encoding=encoding)

        if target == LLMTarget.CLAUDE:
            command = [_resolve_cli("claude"), "--print", "--dangerously-skip-permissions"]
            if model:
                command.extend(["--model", model])
            stdin_input = prompt_text

        elif target == LLMTarget.GEMINI:
            command = [_resolve_cli("gemini"), "--approval-mode", "auto_edit",
                       "--prompt", prompt_text]

        elif target == LLMTarget.CODEX:
            # Pass prompt via stdin to avoid Windows cmd line length limits (~8191 chars).
            # codex reads stdin when no PROMPT arg is given (or arg is "-").
            command = [_resolve_cli("codex"), "exec", "--dangerously-bypass-approvals-and-sandbox",
                       "--skip-git-repo-check"]
            stdin_input = prompt_text

        elif target == LLMTarget.OPENCODE:
            # setdefault: values already present in the environment win.
            env.setdefault("OPENCODE_PERMISSION", json.dumps(_ALLOW_ALL_OPENCODE_PERMISSION))
            runtime_root = Path(effective_dir).resolve() / "data" / "tool-runtime" / "opencode"
            for subdir in ("config", "data", "state"):
                (runtime_root / subdir).mkdir(parents=True, exist_ok=True)
            env.setdefault("XDG_CONFIG_HOME", str(runtime_root / "config"))
            env.setdefault("XDG_DATA_HOME", str(runtime_root / "data"))
            env.setdefault("XDG_STATE_HOME", str(runtime_root / "state"))
            command = [_resolve_cli("opencode"), "run",
                       "--dir", effective_dir, "--format", "json", "-"]
            stdin_input = prompt_text

        elif target == LLMTarget.COPILOT:
            command = [_resolve_cli("copilot"), "-p", prompt_text,
                       "--allow-all", "--no-ask-user", "--output-format", "text", "--silent",
                       "--add-dir", effective_dir]
            if model:
                command.extend(["--model", model])

        else:
            raise ValueError(f"Unsupported LLM target: {target}")

        logger.info("run [%s] cwd=%s", target.value, work_dir or "(inherit)")
        logger.debug("run [%s] command=%s", target.value, command)
        logger.debug("run [%s] prompt_file=%s\n%s", target.value, prompt_file, prompt)

        completed = None
        for quota_attempt in range(_max_retries + 1):
            try:
                completed = subprocess.run(
                    command,
                    input=stdin_input,
                    capture_output=True,
                    text=True,
                    encoding=encoding,
                    errors="replace",
                    cwd=work_dir,
                    env=env,
                    timeout=timeout,
                )
            except Exception as e:
                # Includes subprocess.TimeoutExpired and a missing executable.
                logger.error("execute cmd exception: %s", e)
                raise LLMEvaluationError(f"{target.value} subprocess error: {e}") from e

            if completed.returncode == 0:
                break

            stderr = (completed.stderr or "").strip()
            stdout = (completed.stdout or "").strip()
            parts = [s for s in [stderr, stdout] if s]
            detail = "\n".join(parts) if parts else "(no output)"

            if not _is_quota_error(detail):
                raise LLMEvaluationError(
                    f"{target.value} CLI failed (exit {completed.returncode}): {detail[:500]}"
                )

            if quota_attempt >= _max_retries:
                raise LLMEvaluationError(
                    f"{target.value} quota exhausted after {_max_retries} retries. "
                    f"Last error: {detail[:300]}"
                )

            # Only announce a retry when one will actually happen.
            logger.warning(
                "[QUOTA EXHAUSTED] %s | attempt=%d/%d | Retrying in %ds. Detail: %s",
                target.value, quota_attempt + 1, _max_retries + 1, _retry_interval, detail[:300],
            )
            time.sleep(_retry_interval)

        raw_stdout = (completed.stdout or "").strip()

        if target == LLMTarget.OPENCODE and raw_stdout:
            raw_stdout = _extract_opencode_text(raw_stdout)

        logger.info("run [%s] done. stdout_len=%d", target.value, len(raw_stdout))
        # Round-trip through the output file is kept from the original code.
        output_file.write_text(raw_stdout, encoding=encoding)
        return output_file.read_text(encoding=encoding)

    finally:
        # Always remove the per-run files, whether the run succeeded or not.
        prompt_file.unlink(missing_ok=True)
        output_file.unlink(missing_ok=True)
|
|
213
|
+
|
|
214
|
+
|
|
215
|
+
def run_with_fallback(
    targets: list[LLMTarget],
    prompt: str,
    *,
    model: str | None = None,
    cwd: str | None = None,
    timeout: float | None = 1800,
    encoding: str = "utf-8",
) -> str:
    """Run ``prompt`` against each target in order and return the first success.

    A target that raises ``LLMEvaluationError`` is logged and skipped; any
    other exception propagates immediately.

    Raises:
        ValueError: If ``targets`` is empty.
        LLMEvaluationError: The last target's error when every target fails.
    """
    if not targets:
        raise ValueError("targets must not be empty")

    failure: LLMEvaluationError | None = None
    for candidate in targets:
        try:
            output = run(candidate, prompt, model=model, cwd=cwd, timeout=timeout, encoding=encoding)
        except LLMEvaluationError as exc:
            logger.warning("run_with_fallback: %s failed, trying next. error: %s", candidate.value, exc)
            failure = exc
        else:
            return output
    raise failure
|
llm_eval/llm_target.py
ADDED
|
@@ -0,0 +1,14 @@
|
|
|
1
|
+
from enum import Enum
|
|
2
|
+
|
|
3
|
+
|
|
4
|
+
class LLMTarget(Enum):
    """Supported LLM command-line tools; each value is the tool's lowercase name."""

    CLAUDE = "claude"
    GEMINI = "gemini"
    CODEX = "codex"
    OPENCODE = "opencode"
    COPILOT = "copilot"
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
def parse_targets(value: str) -> list[LLMTarget]:
    """Parse 'claude,gemini' → [LLMTarget.CLAUDE, LLMTarget.GEMINI].

    Names are matched case-insensitively, surrounding whitespace is ignored
    and empty entries are skipped.

    Raises:
        ValueError: If an entry is not a known target name.
    """
    # Every LLMTarget value is lowercase, so lowering the input only widens
    # what is accepted (e.g. "Claude") without changing valid results.
    names = (part.strip().lower() for part in value.split(","))
    return [LLMTarget(name) for name in names if name]
|
llm_eval/preflight.py
ADDED
|
@@ -0,0 +1,101 @@
|
|
|
1
|
+
import os
|
|
2
|
+
import shutil
|
|
3
|
+
import subprocess
|
|
4
|
+
from dataclasses import dataclass
|
|
5
|
+
from typing import Callable
|
|
6
|
+
|
|
7
|
+
from llm_eval.llm_target import LLMTarget
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
@dataclass(frozen=True)
|
|
11
|
+
class TargetStatus:
|
|
12
|
+
ok: bool
|
|
13
|
+
reason: str | None = None
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
def _resolve_cli(command_name: str) -> str:
    """Return the PATH location of ``command_name``, or the name itself if not found.

    When ``os.name`` is ``"nt"`` a ``<name>.cmd`` file is tried before the plain name.
    """
    if os.name == "nt":
        windows_script = shutil.which(f"{command_name}.cmd")
        if windows_script:
            return windows_script

    found = shutil.which(command_name)
    if found:
        return found
    return command_name
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
def _check_claude() -> TargetStatus:
    """Check the Claude CLI by running ``claude auth status --json``.

    The check passes when the command exits with 0 and its stdout reports
    ``loggedIn`` as true. Failure reasons are truncated to 200 characters.
    """
    import json  # local import: this module's import header does not include json

    binary = _resolve_cli("claude")
    try:
        result = subprocess.run(
            [binary, "auth", "status", "--json"],
            capture_output=True, text=True, timeout=15, input="",
        )
        logged_in = '"loggedIn": true' in result.stdout
        if not logged_in:
            # The substring test depends on exact JSON spacing; parse the
            # payload so compact output such as {"loggedIn":true} also counts.
            try:
                payload = json.loads(result.stdout)
            except ValueError:
                payload = None
            logged_in = isinstance(payload, dict) and payload.get("loggedIn") is True

        if result.returncode == 0 and logged_in:
            return TargetStatus(ok=True)
        reason = result.stderr.strip() or result.stdout.strip() or "loggedIn not true"
        return TargetStatus(ok=False, reason=reason[:200])
    except FileNotFoundError:
        return TargetStatus(ok=False, reason="claude CLI not found on PATH")
    except subprocess.TimeoutExpired:
        return TargetStatus(ok=False, reason="claude auth status timed out")
    except Exception as e:
        return TargetStatus(ok=False, reason=str(e)[:200])
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
def _check_codex() -> TargetStatus:
    """Check the Codex CLI by running ``codex login status``.

    The check passes when the command exits with 0 and its stderr contains
    ``"Logged in"``. Failure reasons are truncated to 200 characters.
    """
    cli = _resolve_cli("codex")
    try:
        proc = subprocess.run(
            [cli, "login", "status"],
            capture_output=True,
            text=True,
            timeout=15,
            input="",
        )
        authenticated = proc.returncode == 0 and "Logged in" in proc.stderr
        if not authenticated:
            why = proc.stderr.strip() or proc.stdout.strip() or "not logged in"
            return TargetStatus(ok=False, reason=why[:200])
        return TargetStatus(ok=True)
    except FileNotFoundError:
        return TargetStatus(ok=False, reason="codex CLI not found on PATH")
    except subprocess.TimeoutExpired:
        return TargetStatus(ok=False, reason="codex login status timed out")
    except Exception as err:
        return TargetStatus(ok=False, reason=str(err)[:200])
|
|
61
|
+
|
|
62
|
+
|
|
63
|
+
def _check_via_version(tool: str) -> TargetStatus:
    """Check a CLI by running ``<tool> --version``.

    The check passes when the command exits with 0; this verifies only that
    the tool runs, not that it is authenticated. Failure reasons are
    truncated to 200 characters.
    """
    cli = _resolve_cli(tool)
    try:
        proc = subprocess.run(
            [cli, "--version"],
            capture_output=True,
            text=True,
            timeout=15,
            input="",
        )
        if proc.returncode != 0:
            why = (proc.stderr or proc.stdout or "non-zero exit").strip()
            return TargetStatus(ok=False, reason=why[:200])
        return TargetStatus(ok=True)
    except FileNotFoundError:
        return TargetStatus(ok=False, reason=f"{tool} CLI not found on PATH")
    except subprocess.TimeoutExpired:
        return TargetStatus(ok=False, reason=f"{tool} --version timed out")
    except Exception as err:
        return TargetStatus(ok=False, reason=str(err)[:200])
|
|
80
|
+
|
|
81
|
+
|
|
82
|
+
# Zero-argument checker for every target. Claude and Codex have dedicated
# login checks; the remaining targets are only probed with ``--version``.
_CHECKERS: dict[LLMTarget, Callable[[], TargetStatus]] = {
    LLMTarget.CLAUDE: _check_claude,
    LLMTarget.GEMINI: (lambda: _check_via_version("gemini")),
    LLMTarget.CODEX: _check_codex,
    LLMTarget.OPENCODE: (lambda: _check_via_version("opencode")),
    LLMTarget.COPILOT: (lambda: _check_via_version("copilot")),
}
|
|
89
|
+
|
|
90
|
+
|
|
91
|
+
def check_target(target: LLMTarget) -> TargetStatus:
    """Run the preflight check registered for ``target`` and return its status."""
    checker = _CHECKERS[target]
    return checker()
|
|
94
|
+
|
|
95
|
+
|
|
96
|
+
def check_all() -> dict[LLMTarget, TargetStatus]:
    """Run every registered checker and return the statuses keyed by target.

    Each call executes the checks again; nothing is cached.
    """
    statuses: dict[LLMTarget, TargetStatus] = {}
    for target, checker in _CHECKERS.items():
        statuses[target] = checker()
    return statuses
|
|
99
|
+
|
|
100
|
+
if __name__ == '__main__':
    # Manual smoke check: print the Codex status instead of discarding it.
    print(_check_codex())
|
|
@@ -0,0 +1,34 @@
|
|
|
1
|
+
from llm_eval.job import Outcome, effective_status
|
|
2
|
+
|
|
3
|
+
|
|
4
|
+
def build_prompt(purpose: str, outcomes: list[Outcome]) -> str:
    """Compose the full prompt: the purpose followed by status-file instructions.

    The instructions list one ``status_<name>`` file per outcome and, for
    outcomes that declare ``output_files``, the extra files to write.
    """
    # Resolve each outcome's status name once; it is needed in both sections.
    named_outcomes = [
        (effective_status(outcome, position), outcome)
        for position, outcome in enumerate(outcomes)
    ]

    instruction = (
        "After completing your analysis, you MUST create exactly one of the following "
        "empty files in your current working directory to signal your conclusion. "
        "This must be the last action you take."
    )
    lines = [purpose, "", "---", "", instruction, ""]
    lines.append("Status files (create exactly one, leave it empty):")

    lines.extend(
        f" status_{status:<20} — {outcome.description}"
        for status, outcome in named_outcomes
    )
    lines.append("")

    for status, outcome in named_outcomes:
        if not outcome.output_files:
            continue
        lines.append(f'If the outcome is "{status}", also write these files:')
        lines.extend(f" {filename}" for filename in outcome.output_files)
        lines.append("")

    lines += [
        "Do not create more than one status file.",
        "Do not write anything inside the status file.",
    ]
    return "\n".join(lines)
|
|
@@ -0,0 +1,67 @@
|
|
|
1
|
+
import logging
|
|
2
|
+
from pathlib import Path
|
|
3
|
+
|
|
4
|
+
from llm_eval.job import JobResult, Outcome, effective_status
|
|
5
|
+
|
|
6
|
+
logger = logging.getLogger(__name__)
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
def resolve(
    workspace: Path,
    outcomes: list[Outcome],
    job_id: str,
    target: str,
    duration_seconds: float,
    stdout: str,
) -> tuple[Outcome, JobResult]:
    """Match the status file in ``workspace`` to an outcome and build the result.

    The first ``status_*`` entry (sorted by path) selects the outcome; a
    warning is logged when several exist. With no status file, the outcome
    whose status is ``"error"`` is used.

    Returns:
        The matched outcome and a ``JobResult`` holding the bytes of every
        file listed in that outcome's ``output_files``.

    Raises:
        RuntimeError: If no outcome matches, or a declared output file is missing.
    """
    # First outcome wins when two share a status, as with a linear search.
    by_status: dict[str, Outcome] = {}
    for i, o in enumerate(outcomes):
        by_status.setdefault(effective_status(o, i), o)

    status_files = sorted(workspace.glob("status_*"))

    if len(status_files) > 1:
        logger.warning(
            "Multiple status files found: %s. Using %s.",
            [p.name for p in status_files],
            status_files[0].name,
        )

    if not status_files:
        status_name = "error"
        matched = by_status.get("error")
        if matched is None:
            raise RuntimeError(
                "No status file created by LLM and no 'error' outcome defined."
            )
    else:
        marker = status_files[0]
        prefix_len = len("status_")
        full_name = marker.name[prefix_len:]
        # Prefer the complete file name so statuses containing a dot (e.g.
        # "v1.0") can match; otherwise fall back to the stem, which ignores
        # an extension the LLM may have added (e.g. "status_ok.txt").
        status_name = full_name if full_name in by_status else marker.stem[prefix_len:]
        matched = by_status.get(status_name)
        if matched is None:
            raise RuntimeError(
                f"Status file 'status_{status_name}' does not match any defined outcome."
            )

    if matched.output_files is not None:
        missing = [f for f in matched.output_files if not (workspace / f).exists()]
        if missing:
            raise RuntimeError(
                f"Outcome '{status_name}' declared output_files {missing} "
                "but LLM did not create them."
            )

    declared = matched.output_files or []
    files: dict[str, bytes] = {f: (workspace / f).read_bytes() for f in declared}

    result = JobResult(
        job_id=job_id,
        status=status_name,
        target=target,
        duration_seconds=duration_seconds,
        files=files,
        stdout=stdout,
    )
    return matched, result
|
llm_eval/workspace.py
ADDED
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
import shutil
|
|
2
|
+
import uuid
|
|
3
|
+
from pathlib import Path
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
def create_workspace(base_dir: str | None) -> tuple[str, Path]:
|
|
7
|
+
job_id = uuid.uuid4().hex[:8]
|
|
8
|
+
base = Path(base_dir) if base_dir else Path.cwd()
|
|
9
|
+
workspace = base / ".llm_eval" / job_id
|
|
10
|
+
workspace.mkdir(parents=True, exist_ok=True)
|
|
11
|
+
return job_id, workspace
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
def cleanup_workspace(workspace: Path) -> None:
    """Delete ``workspace`` and its contents, ignoring any errors (best effort)."""
    shutil.rmtree(workspace, ignore_errors=True)
|