agent-cli-dispatcher 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- agent_cli_dispatcher-0.1.0/PKG-INFO +5 -0
- agent_cli_dispatcher-0.1.0/README.md +166 -0
- agent_cli_dispatcher-0.1.0/agent_cli_dispatcher.egg-info/PKG-INFO +5 -0
- agent_cli_dispatcher-0.1.0/agent_cli_dispatcher.egg-info/SOURCES.txt +22 -0
- agent_cli_dispatcher-0.1.0/agent_cli_dispatcher.egg-info/dependency_links.txt +1 -0
- agent_cli_dispatcher-0.1.0/agent_cli_dispatcher.egg-info/top_level.txt +1 -0
- agent_cli_dispatcher-0.1.0/llm_eval/__init__.py +108 -0
- agent_cli_dispatcher-0.1.0/llm_eval/job.py +24 -0
- agent_cli_dispatcher-0.1.0/llm_eval/llm_svc.py +233 -0
- agent_cli_dispatcher-0.1.0/llm_eval/llm_target.py +14 -0
- agent_cli_dispatcher-0.1.0/llm_eval/preflight.py +101 -0
- agent_cli_dispatcher-0.1.0/llm_eval/prompt_builder.py +34 -0
- agent_cli_dispatcher-0.1.0/llm_eval/status_resolver.py +67 -0
- agent_cli_dispatcher-0.1.0/llm_eval/workspace.py +15 -0
- agent_cli_dispatcher-0.1.0/pyproject.toml +14 -0
- agent_cli_dispatcher-0.1.0/setup.cfg +4 -0
- agent_cli_dispatcher-0.1.0/tests/test_codex_diagnose.py +328 -0
- agent_cli_dispatcher-0.1.0/tests/test_evaluate.py +120 -0
- agent_cli_dispatcher-0.1.0/tests/test_job.py +55 -0
- agent_cli_dispatcher-0.1.0/tests/test_llm_target.py +19 -0
- agent_cli_dispatcher-0.1.0/tests/test_preflight.py +72 -0
- agent_cli_dispatcher-0.1.0/tests/test_prompt_builder.py +61 -0
- agent_cli_dispatcher-0.1.0/tests/test_status_resolver.py +91 -0
- agent_cli_dispatcher-0.1.0/tests/test_workspace.py +46 -0
|
@@ -0,0 +1,166 @@
|
|
|
1
|
+
# llm-eval
|
|
2
|
+
|
|
3
|
+
A Python library for running structured LLM tasks with outcome routing. Define what the agent should conclude, and receive a typed callback for whichever outcome the LLM signals — no polling, no parsing.
|
|
4
|
+
|
|
5
|
+
## How it works
|
|
6
|
+
|
|
7
|
+
1. You provide a `purpose` (the task) and a list of `Outcome` objects (what the LLM might decide).
|
|
8
|
+
2. The library builds a prompt that tells the LLM to create exactly one empty signal file (`status_<name>`) plus any declared output files.
|
|
9
|
+
3. The LLM CLI runs in an isolated workspace directory.
|
|
10
|
+
4. The library scans the workspace, routes to the matching outcome, and calls its `callback` with a `JobResult`.
|
|
11
|
+
5. The workspace is deleted unconditionally after the callback returns.
|
|
12
|
+
|
|
13
|
+
## Installation
|
|
14
|
+
|
|
15
|
+
```bash
|
|
16
|
+
pip install -e .
|
|
17
|
+
```
|
|
18
|
+
|
|
19
|
+
Requires Python ≥ 3.11. The LLM CLI tools must be installed separately and available on your `PATH` (e.g. `claude`, `gemini`, `codex`, `opencode`, `copilot`).
|
|
20
|
+
|
|
21
|
+
## Quick start
|
|
22
|
+
|
|
23
|
+
```python
|
|
24
|
+
from llm_eval import evaluate, Outcome, JobResult, LLMTarget
|
|
25
|
+
|
|
26
|
+
def on_complete(result: JobResult) -> None:
|
|
27
|
+
print(f"Done in {result.duration_seconds:.1f}s")
|
|
28
|
+
|
|
29
|
+
def on_incomplete(result: JobResult) -> None:
|
|
30
|
+
questions = result.files.get("questions.txt", "")
|
|
31
|
+
print("Gaps found:\n", questions)
|
|
32
|
+
|
|
33
|
+
def on_error(result: JobResult) -> None:
|
|
34
|
+
print("LLM could not determine completeness")
|
|
35
|
+
|
|
36
|
+
evaluate(
|
|
37
|
+
target=LLMTarget.CLAUDE,
|
|
38
|
+
purpose="Review this spec and determine whether it is complete:\n\n<spec>\n...\n</spec>",
|
|
39
|
+
outcomes=[
|
|
40
|
+
Outcome(
|
|
41
|
+
status="complete",
|
|
42
|
+
description="The spec is complete and well-formed",
|
|
43
|
+
callback=on_complete,
|
|
44
|
+
),
|
|
45
|
+
Outcome(
|
|
46
|
+
status="incomplete",
|
|
47
|
+
description="The spec has gaps or missing sections",
|
|
48
|
+
output_files=["questions.txt"],
|
|
49
|
+
callback=on_incomplete,
|
|
50
|
+
),
|
|
51
|
+
Outcome(
|
|
52
|
+
status="error",
|
|
53
|
+
description="Cannot determine completeness",
|
|
54
|
+
callback=on_error,
|
|
55
|
+
),
|
|
56
|
+
],
|
|
57
|
+
)
|
|
58
|
+
```
|
|
59
|
+
|
|
60
|
+
## API reference
|
|
61
|
+
|
|
62
|
+
### `evaluate()`
|
|
63
|
+
|
|
64
|
+
```python
|
|
65
|
+
from llm_eval import evaluate, LLMTarget
|
|
66
|
+
|
|
67
|
+
evaluate(
|
|
68
|
+
target, # LLMTarget — LLM CLI to use (see Supported targets)
|
|
69
|
+
purpose, # str — task description, embedded verbatim in the prompt
|
|
70
|
+
outcomes, # list[Outcome] — possible conclusions the LLM can signal
|
|
71
|
+
*,
|
|
72
|
+
on_exception=None, # Callable[[Exception], None] — called on subprocess failure
|
|
73
|
+
model=None, # str | None — model override passed to the CLI
|
|
74
|
+
timeout=1800, # float — subprocess timeout in seconds
|
|
75
|
+
cwd=None, # str | None — base dir for the workspace (default: cwd)
|
|
76
|
+
)
|
|
77
|
+
```
|
|
78
|
+
|
|
79
|
+
`evaluate()` is **synchronous and blocking**. For concurrent calls, manage threads or a process pool in the calling code.
|
|
80
|
+
|
|
81
|
+
#### Supported targets
|
|
82
|
+
|
|
83
|
+
| `LLMTarget` member | CLI binary |
|
|
84
|
+
|---|---|
|
|
85
|
+
| `LLMTarget.CLAUDE` | `claude` |
|
|
86
|
+
| `LLMTarget.GEMINI` | `gemini` |
|
|
87
|
+
| `LLMTarget.CODEX` | `codex` |
|
|
88
|
+
| `LLMTarget.OPENCODE` | `opencode` |
|
|
89
|
+
| `LLMTarget.COPILOT` | `copilot` |
|
|
90
|
+
|
|
91
|
+
### `Outcome`
|
|
92
|
+
|
|
93
|
+
```python
|
|
94
|
+
from llm_eval import Outcome
|
|
95
|
+
|
|
96
|
+
Outcome(
|
|
97
|
+
status, # str — identifier, e.g. "complete"
|
|
98
|
+
description, # str — shown to the LLM: when should it pick this outcome
|
|
99
|
+
callback, # Callable[[JobResult], None]
|
|
100
|
+
output_files=None, # list[str] | None — files the LLM must write for this outcome
|
|
101
|
+
)
|
|
102
|
+
```
|
|
103
|
+
|
|
104
|
+
### `JobResult`
|
|
105
|
+
|
|
106
|
+
Passed to the matching `callback`.
|
|
107
|
+
|
|
108
|
+
| Field | Type | Description |
|
|
109
|
+
|---|---|---|
|
|
110
|
+
| `job_id` | `str` | Unique ID for this call (8-char hex) |
|
|
111
|
+
| `status` | `str` | The outcome that was triggered |
|
|
112
|
+
| `target` | `str` | LLM target used |
|
|
113
|
+
| `duration_seconds` | `float` | Wall time from LLM start to completion |
|
|
114
|
+
| `files` | `dict[str, str]` | All non-status files in the workspace: `{filename: content}` |
|
|
115
|
+
| `stdout` | `str` | Raw LLM stdout |
|
|
116
|
+
|
|
117
|
+
## Error handling
|
|
118
|
+
|
|
119
|
+
| Situation | Behaviour |
|
|
120
|
+
|---|---|
|
|
121
|
+
| LLM subprocess fails (non-zero exit, timeout, binary not found) | `on_exception(exc)` is called; if not defined, the exception propagates |
|
|
122
|
+
| LLM produces no status file, `"error"` outcome defined | `"error"` outcome callback is called |
|
|
123
|
+
| LLM produces no status file, no `"error"` outcome | `RuntimeError` raised |
|
|
124
|
+
| Declared `output_files` not created by the LLM | `RuntimeError` raised; workspace still cleaned up |
|
|
125
|
+
| Multiple status files found | First alphabetically is used; a warning is logged |
|
|
126
|
+
| Callback raises | Exception propagates to `evaluate()` caller; workspace still cleaned up |
|
|
127
|
+
|
|
128
|
+
```python
|
|
129
|
+
import logging
|
|
130
|
+
|
|
131
|
+
evaluate(
|
|
132
|
+
target=LLMTarget.CLAUDE,
|
|
133
|
+
purpose="...",
|
|
134
|
+
outcomes=[...],
|
|
135
|
+
on_exception=lambda exc: logging.error("LLM failed: %s", exc),
|
|
136
|
+
)
|
|
137
|
+
```
|
|
138
|
+
|
|
139
|
+
## Quota retry
|
|
140
|
+
|
|
141
|
+
`llm_svc.run()` (used internally by `evaluate()`) automatically retries on quota / rate-limit errors. Configure via environment variables, which are read once when the module is imported:
|
|
142
|
+
|
|
143
|
+
| Variable | Default | Description |
|
|
144
|
+
|---|---|---|
|
|
145
|
+
| `LLM_QUOTA_RETRY_INTERVAL` | `300` | Seconds to wait between quota retries |
|
|
146
|
+
| `LLM_QUOTA_MAX_RETRIES` | `288` | Maximum number of quota retries before raising |
|
|
147
|
+
|
|
148
|
+
## Workspace
|
|
149
|
+
|
|
150
|
+
Each call creates an isolated directory at `{cwd}/.llm_eval/{job_id}/`. The LLM CLI runs with this as its working directory and must write its signal file there. The directory is deleted after the callback returns (or `on_exception` returns).
|
|
151
|
+
|
|
152
|
+
## Logging
|
|
153
|
+
|
|
154
|
+
The library uses the standard `logging` module under the `llm_eval` logger hierarchy. Enable debug output to see prompt content and subprocess details:
|
|
155
|
+
|
|
156
|
+
```python
|
|
157
|
+
import logging
|
|
158
|
+
logging.basicConfig(level=logging.DEBUG)
|
|
159
|
+
```
|
|
160
|
+
|
|
161
|
+
## Running tests
|
|
162
|
+
|
|
163
|
+
```bash
|
|
164
|
+
pip install pytest
|
|
165
|
+
pytest
|
|
166
|
+
```
|
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
README.md
|
|
2
|
+
pyproject.toml
|
|
3
|
+
agent_cli_dispatcher.egg-info/PKG-INFO
|
|
4
|
+
agent_cli_dispatcher.egg-info/SOURCES.txt
|
|
5
|
+
agent_cli_dispatcher.egg-info/dependency_links.txt
|
|
6
|
+
agent_cli_dispatcher.egg-info/top_level.txt
|
|
7
|
+
llm_eval/__init__.py
|
|
8
|
+
llm_eval/job.py
|
|
9
|
+
llm_eval/llm_svc.py
|
|
10
|
+
llm_eval/llm_target.py
|
|
11
|
+
llm_eval/preflight.py
|
|
12
|
+
llm_eval/prompt_builder.py
|
|
13
|
+
llm_eval/status_resolver.py
|
|
14
|
+
llm_eval/workspace.py
|
|
15
|
+
tests/test_codex_diagnose.py
|
|
16
|
+
tests/test_evaluate.py
|
|
17
|
+
tests/test_job.py
|
|
18
|
+
tests/test_llm_target.py
|
|
19
|
+
tests/test_preflight.py
|
|
20
|
+
tests/test_prompt_builder.py
|
|
21
|
+
tests/test_status_resolver.py
|
|
22
|
+
tests/test_workspace.py
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
llm_eval
|
|
@@ -0,0 +1,108 @@
|
|
|
1
|
+
import logging
|
|
2
|
+
import time
|
|
3
|
+
from pathlib import Path
|
|
4
|
+
from typing import Callable, Union
|
|
5
|
+
|
|
6
|
+
from llm_eval import llm_svc
|
|
7
|
+
from llm_eval.job import JobResult, Outcome
|
|
8
|
+
from llm_eval.llm_svc import LLMEvaluationError
|
|
9
|
+
from llm_eval.llm_target import LLMTarget, parse_targets
|
|
10
|
+
from llm_eval.preflight import TargetStatus, check_all, check_target
|
|
11
|
+
from llm_eval.prompt_builder import build_prompt
|
|
12
|
+
from llm_eval.status_resolver import resolve
|
|
13
|
+
from llm_eval.workspace import cleanup_workspace, create_workspace
|
|
14
|
+
|
|
15
|
+
__all__ = ["evaluate", "Outcome", "JobResult", "LLMTarget", "LLMEvaluationError",
|
|
16
|
+
"check_target", "check_all", "TargetStatus", "parse_targets"]
|
|
17
|
+
|
|
18
|
+
logger = logging.getLogger(__name__)
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
def evaluate(
    target: LLMTarget | None,
    purpose: Union[str, Callable[[Path], str]],
    outcomes: list[Outcome],
    *,
    targets: list[LLMTarget] | None = None,
    on_exception: Callable[[Exception], None] | None = None,
    model: str | None = None,
    timeout: float = 1800,
    cwd: str | None = None,
) -> None:
    """Run an LLM evaluation against one or more targets.

    Pass ``targets`` (list) to enable ordered fallback: each target is tried in
    sequence and the next is used only when the previous raises LLMEvaluationError.
    Pass ``target`` (single) for the original single-target behaviour.
    Providing both raises ValueError; providing neither raises ValueError.

    ``purpose`` may be a plain string or a callable that receives the workspace
    Path and returns a string. Use the callable form when the prompt must embed
    the workspace path as the LLM output directory so that resolve() can find
    the files written by the LLM.

    Args:
        target: Single target to run, or None when ``targets`` is given.
        purpose: Task text, or a callable building it from the workspace path.
        outcomes: Possible conclusions; the matched one has its callback invoked.
        targets: Ordered list of fallback targets (mutually exclusive with ``target``).
        on_exception: If given, receives any exception raised while running the
            LLM or resolving its outcome, and ``evaluate`` then returns normally.
            If omitted, that exception propagates.
        model: Optional model override forwarded to the LLM runner.
        timeout: Subprocess timeout in seconds forwarded to the LLM runner.
        cwd: Base directory handed to ``create_workspace``.

    Raises:
        ValueError: If both or neither of ``target`` / ``targets`` are provided.
        Exception: Whatever the LLM/resolve stage raises when ``on_exception``
            is None, and anything raised by ``purpose`` or the outcome callback.
    """
    if targets is not None and target is not None:
        raise ValueError("Provide either 'target' or 'targets', not both.")
    _targets: list[LLMTarget] = targets if targets is not None else ([target] if target is not None else [])
    if not _targets:
        raise ValueError("evaluate() requires 'target' or 'targets'.")

    job_id, workspace = create_workspace(cwd)
    # A single try/finally guarantees the workspace is removed exactly once on
    # every exit path, including failures while building the prompt and
    # non-Exception interrupts (e.g. KeyboardInterrupt) during the LLM run.
    try:
        purpose_str = purpose(workspace) if callable(purpose) else purpose
        prompt = build_prompt(purpose_str, outcomes)
        start = time.monotonic()

        try:
            stdout, winning_target = _run_with_fallback(
                _targets, prompt, model=model, cwd=str(workspace), timeout=timeout
            )
            duration = time.monotonic() - start
            matched_outcome, result = resolve(
                workspace=workspace,
                outcomes=outcomes,
                job_id=job_id,
                target=winning_target.value,
                duration_seconds=duration,
                stdout=stdout,
            )
        except Exception as exc:
            if on_exception is None:
                raise
            on_exception(exc)
            return

        # Callback exceptions are intentionally NOT routed to on_exception — they
        # originate from business logic, not the LLM layer, and must propagate
        # directly to the caller.
        matched_outcome.callback(result)
    finally:
        cleanup_workspace(workspace)
|
|
90
|
+
|
|
91
|
+
|
|
92
|
+
def _run_with_fallback(
    targets: list[LLMTarget],
    prompt: str,
    *,
    model: str | None = None,
    cwd: str | None = None,
    timeout: float = 1800,
) -> tuple[str, LLMTarget]:
    """Try each target in order; return (stdout, winning_target) on first success.

    Only ``LLMEvaluationError`` triggers a fallback to the next target; any
    other exception propagates immediately.

    Raises:
        ValueError: If ``targets`` is empty.
        llm_svc.LLMEvaluationError: The last failure, when every target failed.
    """
    # Without this guard an empty list would fall through to ``raise None``,
    # which surfaces as an unhelpful TypeError.
    if not targets:
        raise ValueError("targets must not be empty")

    last_exc: llm_svc.LLMEvaluationError | None = None
    for t in targets:
        try:
            return llm_svc.run(t, prompt, model=model, cwd=cwd, timeout=timeout), t
        except llm_svc.LLMEvaluationError as exc:
            logger.warning("evaluate fallback: %s failed, trying next. error: %s", t.value, exc)
            last_exc = exc
    raise last_exc  # type: ignore[misc]
|
|
@@ -0,0 +1,24 @@
|
|
|
1
|
+
from dataclasses import dataclass, field
|
|
2
|
+
from typing import Callable
|
|
3
|
+
|
|
4
|
+
|
|
5
|
+
@dataclass(frozen=True)
class Outcome:
    """One possible conclusion of an LLM job, plus the handler to run for it."""

    # Explanation of when this outcome applies (the README describes it as
    # the text shown to the LLM).
    description: str
    # Invoked by evaluate() with the JobResult when this outcome is matched.
    callback: Callable[["JobResult"], None]
    # Outcome identifier; when None, effective_status() substitutes the
    # outcome's index rendered as a string.
    status: str | None = None
    # Files associated with this outcome (per the README, files the LLM must
    # write); None when nothing is declared.
    output_files: list[str] | None = None
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
def effective_status(outcome: Outcome, index: int) -> str:
    """Return the outcome's status name, defaulting to its index as a string.

    Args:
        outcome: The outcome whose status is wanted.
        index: Position used as the fallback name when ``outcome.status`` is None.
    """
    if outcome.status is None:
        return str(index)
    return outcome.status
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
@dataclass(frozen=True)
class JobResult:
    """Immutable record handed to an Outcome callback after a job finishes."""

    # Identifier of this job (evaluate() obtains it from create_workspace()).
    job_id: str
    # Name of the outcome that was matched.
    status: str
    # Value of the LLMTarget that produced the result (e.g. "claude").
    target: str
    # Seconds elapsed while running the LLM, measured with time.monotonic().
    duration_seconds: float
    # Workspace files keyed by name.
    # NOTE(review): annotated as bytes here but the README documents
    # dict[str, str] — confirm against status_resolver.
    files: dict[str, bytes]
    # Text returned by the LLM runner for the winning target.
    stdout: str
|
|
@@ -0,0 +1,233 @@
|
|
|
1
|
+
import json
|
|
2
|
+
import logging
|
|
3
|
+
import os
|
|
4
|
+
import re
|
|
5
|
+
import shutil
|
|
6
|
+
import subprocess
|
|
7
|
+
import time
|
|
8
|
+
import uuid
|
|
9
|
+
from datetime import datetime, timezone
|
|
10
|
+
from pathlib import Path
|
|
11
|
+
|
|
12
|
+
from llm_eval.llm_target import LLMTarget
|
|
13
|
+
|
|
14
|
+
logger = logging.getLogger(__name__)
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
class LLMEvaluationError(RuntimeError):
    """Raised when invoking an LLM CLI fails.

    Covers a non-zero exit status, quota exhaustion after all retries, and any
    error raised while executing the subprocess (including timeouts).
    """
|
|
19
|
+
|
|
20
|
+
# Case-insensitive patterns that identify quota / rate-limit / billing failures
# in CLI output. A match makes run() wait and retry instead of failing at once.
_QUOTA_ERROR_PATTERNS: list[re.Pattern] = [
    re.compile(p, re.IGNORECASE)
    for p in [
        r"exceeded your monthly token limit",
        r"exceeded your current quota",
        r"insufficient.quota",
        r"quota.exceeded",
        r"billing hard limit",
        r"credit balance is too low",
        r"out of credits",
        r"rate.limit.exceeded",
        # Word boundaries keep the HTTP status from matching inside longer
        # numbers (e.g. "14290 tokens"), which would otherwise trigger a
        # spurious multi-minute quota wait.
        r"\b429\b",
        r"payment required",
    ]
]

# Seconds to sleep between quota retries; read once at import time.
_QUOTA_RETRY_INTERVAL_SECONDS: int = int(os.getenv("LLM_QUOTA_RETRY_INTERVAL", "300"))
# Maximum number of quota retries before giving up; read once at import time.
_QUOTA_MAX_RETRIES: int = int(os.getenv("LLM_QUOTA_MAX_RETRIES", "288"))

# Serialized to JSON and used as the default OPENCODE_PERMISSION value so that
# OpenCode runs without interactive permission prompts.
_ALLOW_ALL_OPENCODE_PERMISSION = {
    "bash": "allow", "read": "allow", "edit": "allow", "task": "allow",
    "glob": "allow", "grep": "allow", "list": "allow",
    "external_directory": "allow", "todowrite": "allow", "todoread": "allow",
    "question": "allow", "webfetch": "allow", "websearch": "allow",
    "codesearch": "allow", "lsp": "allow", "doom_loop": "allow", "skill": "allow",
}
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
def _is_quota_error(text: str) -> bool:
    """Return True when ``text`` matches any known quota / rate-limit pattern."""
    return any(pattern.search(text) for pattern in _QUOTA_ERROR_PATTERNS)
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
def _resolve_cli(command_name: str) -> str:
    """Resolve a CLI name to an executable path found on PATH.

    On Windows a ``<name>.cmd`` wrapper is preferred when one exists. If no
    lookup succeeds the bare name is returned unchanged so the subsequent
    subprocess call reports the missing binary itself.
    """
    lookup_order = [command_name]
    if os.name == "nt":
        lookup_order.insert(0, f"{command_name}.cmd")

    for candidate in lookup_order:
        located = shutil.which(candidate)
        if located:
            return located
    return command_name
|
|
62
|
+
|
|
63
|
+
|
|
64
|
+
def _build_command(
    target: LLMTarget,
    prompt_text: str,
    model: str | None,
    effective_dir: str,
    env: dict[str, str],
) -> tuple[list[str], str]:
    """Return ``(argv, stdin_text)`` for ``target``; may add OpenCode defaults to ``env``."""
    if target == LLMTarget.CLAUDE:
        command = [_resolve_cli("claude"), "--print", "--dangerously-skip-permissions"]
        if model:
            command.extend(["--model", model])
        return command, prompt_text

    if target == LLMTarget.GEMINI:
        # NOTE(review): ``model`` is not forwarded to gemini/codex/opencode —
        # confirm whether that is intentional.
        command = [_resolve_cli("gemini"), "--approval-mode", "auto_edit",
                   "--prompt", prompt_text]
        return command, ""

    if target == LLMTarget.CODEX:
        # Pass prompt via stdin to avoid Windows cmd line length limits (~8191 chars).
        # codex reads stdin when no PROMPT arg is given (or arg is "-").
        command = [_resolve_cli("codex"), "exec", "--dangerously-bypass-approvals-and-sandbox",
                   "--skip-git-repo-check"]
        return command, prompt_text

    if target == LLMTarget.OPENCODE:
        # setdefault: values already present in the caller's environment win.
        env.setdefault("OPENCODE_PERMISSION", json.dumps(_ALLOW_ALL_OPENCODE_PERMISSION))
        runtime_root = Path(effective_dir).resolve() / "data" / "tool-runtime" / "opencode"
        for subdir in ("config", "data", "state"):
            (runtime_root / subdir).mkdir(parents=True, exist_ok=True)
        env.setdefault("XDG_CONFIG_HOME", str(runtime_root / "config"))
        env.setdefault("XDG_DATA_HOME", str(runtime_root / "data"))
        env.setdefault("XDG_STATE_HOME", str(runtime_root / "state"))
        command = [_resolve_cli("opencode"), "run",
                   "--dir", effective_dir, "--format", "json", "-"]
        return command, prompt_text

    if target == LLMTarget.COPILOT:
        command = [_resolve_cli("copilot"), "-p", prompt_text,
                   "--allow-all", "--no-ask-user", "--output-format", "text", "--silent",
                   "--add-dir", effective_dir]
        if model:
            command.extend(["--model", model])
        return command, ""

    raise ValueError(f"Unsupported LLM target: {target}")


def _extract_opencode_text(raw_stdout: str) -> str:
    """Join the text items of OpenCode's JSON-lines output; return the input if it is not JSON."""
    chunks: list[str] = []
    try:
        for line in raw_stdout.splitlines():
            if not line.strip():
                continue
            event = json.loads(line)
            if not isinstance(event, dict):
                # Valid JSON but not an event object; nothing to extract.
                continue
            if event.get("type") == "error":
                error = event.get("error")
                data = error.get("data") if isinstance(error, dict) else None
                msg = data.get("message", "") if isinstance(data, dict) else ""
                # LLMEvaluationError (a RuntimeError subclass) lets target
                # fallback treat this like any other LLM failure.
                raise LLMEvaluationError(str(msg) or "opencode reported an error")
            message = event.get("message")
            if isinstance(message, dict):
                for item in (message.get("content") or []):
                    if isinstance(item, dict) and item.get("type") == "text" and "text" in item:
                        chunks.append(str(item["text"]))
    except json.JSONDecodeError:
        # Output is not (entirely) JSON lines: hand back the raw text untouched.
        return raw_stdout
    return "\n".join(chunks).strip() if chunks else raw_stdout


def run(
    target: LLMTarget,
    prompt: str,
    *,
    model: str | None = None,
    cwd: str | None = None,
    timeout: float | None = 1800,
    encoding: str = "utf-8",
    quota_retry_interval: int | None = None,
    quota_max_retries: int | None = None,
) -> str:
    """Run one LLM CLI with ``prompt`` and return its stripped stdout.

    The prompt is written to ``<dir>/.llm_io/prompt_<id>.txt`` for the duration
    of the call and removed afterwards. When the CLI exits non-zero with output
    that looks like a quota / rate-limit error, the call sleeps and retries.

    Args:
        target: Which CLI to invoke.
        prompt: Prompt text; must contain non-whitespace characters.
        model: Optional model name (forwarded to claude and copilot).
        cwd: Working directory for the subprocess; None inherits the current one.
        timeout: Subprocess timeout in seconds, or None for no timeout.
        encoding: Text encoding for the prompt file and the subprocess pipes.
        quota_retry_interval: Seconds between quota retries; defaults to the
            module-level setting. Negative values are treated as 0.
        quota_max_retries: Maximum quota retries; defaults to the module-level
            setting. Negative values are treated as 0.

    Returns:
        The CLI's stdout, stripped. For OpenCode, the joined text items of its
        JSON event stream when any are present.

    Raises:
        ValueError: If ``prompt`` is blank or ``target`` is unsupported.
        LLMEvaluationError: On subprocess errors (including timeouts), non-zero
            exit, quota exhaustion, or an error event reported by OpenCode.
    """
    if not prompt.strip():
        raise ValueError("prompt must not be empty.")

    _retry_interval = quota_retry_interval if quota_retry_interval is not None else _QUOTA_RETRY_INTERVAL_SECONDS
    _max_retries = quota_max_retries if quota_max_retries is not None else _QUOTA_MAX_RETRIES
    # Clamp so the subprocess always runs at least once and sleep() never
    # receives a negative duration.
    _retry_interval = max(0, _retry_interval)
    _max_retries = max(0, _max_retries)

    work_dir = str(Path(cwd).resolve()) if cwd else None
    effective_dir = work_dir or str(Path.cwd())

    run_id = uuid.uuid4().hex[:8]
    io_dir = Path(effective_dir) / ".llm_io"
    io_dir.mkdir(parents=True, exist_ok=True)
    prompt_file = io_dir / f"prompt_{run_id}.txt"

    env = dict(os.environ)

    try:
        # Written inside the try so a partially written file is still removed.
        prompt_file.write_text(prompt, encoding=encoding)
        prompt_text = prompt_file.read_text(encoding=encoding)
        command, stdin_input = _build_command(target, prompt_text, model, effective_dir, env)

        logger.info("run [%s] cwd=%s", target.value, work_dir or "(inherit)")
        logger.debug("run [%s] command=%s", target.value, command)
        logger.debug("run [%s] prompt_file=%s\n%s", target.value, prompt_file, prompt)

        for quota_attempt in range(_max_retries + 1):
            try:
                completed = subprocess.run(
                    command,
                    input=stdin_input,
                    capture_output=True,
                    text=True,
                    encoding=encoding,
                    errors="replace",
                    cwd=work_dir,
                    env=env,
                    timeout=timeout,
                )
            except Exception as e:
                logger.error("execute cmd exception: %s", e)
                raise LLMEvaluationError(f"{target.value} subprocess error: {e}") from e

            if completed.returncode == 0:
                break

            stderr = (completed.stderr or "").strip()
            stdout = (completed.stdout or "").strip()
            parts = [s for s in [stderr, stdout] if s]
            detail = "\n".join(parts) if parts else "(no output)"

            if not _is_quota_error(detail):
                raise LLMEvaluationError(
                    f"{target.value} CLI failed (exit {completed.returncode}): {detail[:500]}"
                )

            if quota_attempt >= _max_retries:
                raise LLMEvaluationError(
                    f"{target.value} quota exhausted after {_max_retries} retries. "
                    f"Last error: {detail[:300]}"
                )

            # Only announce a retry when one is actually going to happen.
            logger.warning(
                "[QUOTA EXHAUSTED] %s | attempt=%d/%d | Retrying in %ds. Detail: %s",
                target.value, quota_attempt + 1, _max_retries, _retry_interval, detail[:300],
            )
            time.sleep(_retry_interval)

        raw_stdout = (completed.stdout or "").strip()

        if target == LLMTarget.OPENCODE and raw_stdout:
            raw_stdout = _extract_opencode_text(raw_stdout)

        logger.info("run [%s] done. stdout_len=%d", target.value, len(raw_stdout))
        # Returned directly: round-tripping through a file could fail to encode
        # U+FFFD (produced by errors="replace") under non-Unicode encodings.
        return raw_stdout

    finally:
        prompt_file.unlink(missing_ok=True)
|
|
213
|
+
|
|
214
|
+
|
|
215
|
+
def run_with_fallback(
    targets: list[LLMTarget],
    prompt: str,
    *,
    model: str | None = None,
    cwd: str | None = None,
    timeout: float | None = 1800,
    encoding: str = "utf-8",
) -> str:
    """Run ``prompt`` against each target in turn and return the first success.

    A target is skipped only when it raises ``LLMEvaluationError``; other
    exceptions propagate immediately.

    Raises:
        ValueError: If ``targets`` is empty.
        LLMEvaluationError: The most recent failure when every target failed.
    """
    if not targets:
        raise ValueError("targets must not be empty")

    most_recent_failure: LLMEvaluationError | None = None
    for candidate in targets:
        try:
            output = run(candidate, prompt, model=model, cwd=cwd, timeout=timeout, encoding=encoding)
        except LLMEvaluationError as failure:
            logger.warning("run_with_fallback: %s failed, trying next. error: %s", candidate.value, failure)
            most_recent_failure = failure
        else:
            return output
    raise most_recent_failure
|
|
@@ -0,0 +1,14 @@
|
|
|
1
|
+
from enum import Enum
|
|
2
|
+
|
|
3
|
+
|
|
4
|
+
class LLMTarget(Enum):
    """Supported LLM command-line tools; each value is the CLI's name."""

    CLAUDE = "claude"
    GEMINI = "gemini"
    CODEX = "codex"
    OPENCODE = "opencode"
    COPILOT = "copilot"


def parse_targets(value: str) -> list[LLMTarget]:
    """Parse 'claude,gemini' → [LLMTarget.CLAUDE, LLMTarget.GEMINI].

    Names are matched case-insensitively and surrounding whitespace is
    ignored; empty entries (e.g. from a trailing comma) are skipped.

    Raises:
        ValueError: If an entry does not name a known target.
    """
    names = (part.strip().lower() for part in value.split(","))
    return [LLMTarget(name) for name in names if name]
|
|
@@ -0,0 +1,101 @@
|
|
|
1
|
+
import os
|
|
2
|
+
import shutil
|
|
3
|
+
import subprocess
|
|
4
|
+
from dataclasses import dataclass
|
|
5
|
+
from typing import Callable
|
|
6
|
+
|
|
7
|
+
from llm_eval.llm_target import LLMTarget
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
@dataclass(frozen=True)
class TargetStatus:
    """Result of a preflight check for one LLM target."""

    # True when the check passed.
    ok: bool
    # Short explanation of the failure (the checkers in this module truncate
    # it to 200 characters); None when no reason is given.
    reason: str | None = None
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
def _resolve_cli(command_name: str) -> str:
    """Return the PATH location of ``command_name``, or the bare name if not found.

    On Windows a ``<name>.cmd`` wrapper takes precedence when present.
    """
    windows_wrapper = shutil.which(f"{command_name}.cmd") if os.name == "nt" else None
    return windows_wrapper or shutil.which(command_name) or command_name
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
def _check_claude() -> TargetStatus:
    """Check that the ``claude`` CLI is installed and reports a logged-in session.

    Runs ``claude auth status --json`` with a 15 second timeout. Never raises:
    every failure is reported as ``TargetStatus(ok=False, reason=...)``.
    """
    import json  # local import: this module does not import json at file level

    binary = _resolve_cli("claude")
    try:
        result = subprocess.run(
            [binary, "auth", "status", "--json"],
            capture_output=True, text=True, timeout=15, input="",
        )
        if result.returncode == 0:
            logged_in = '"loggedIn": true' in result.stdout
            if not logged_in:
                # The literal match above depends on exact whitespace; also
                # accept any JSON formatting (e.g. compact {"loggedIn":true}).
                try:
                    payload = json.loads(result.stdout)
                except ValueError:
                    payload = None
                logged_in = isinstance(payload, dict) and payload.get("loggedIn") is True
            if logged_in:
                return TargetStatus(ok=True)
        reason = result.stderr.strip() or result.stdout.strip() or "loggedIn not true"
        return TargetStatus(ok=False, reason=reason[:200])
    except FileNotFoundError:
        return TargetStatus(ok=False, reason="claude CLI not found on PATH")
    except subprocess.TimeoutExpired:
        return TargetStatus(ok=False, reason="claude auth status timed out")
    except Exception as e:
        return TargetStatus(ok=False, reason=str(e)[:200])
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
def _check_codex() -> TargetStatus:
    """Check that the ``codex`` CLI is installed and reports a logged-in session.

    Runs ``codex login status`` with a 15 second timeout. Never raises: every
    failure is reported as ``TargetStatus(ok=False, reason=...)``.
    """
    binary = _resolve_cli("codex")
    try:
        result = subprocess.run(
            [binary, "login", "status"],
            capture_output=True, text=True, timeout=15, input="",
        )
        # Accept the marker on either stream so the check does not depend on
        # which one the installed CLI writes its status to.
        logged_in = any("Logged in" in stream for stream in (result.stderr, result.stdout))
        if result.returncode == 0 and logged_in:
            return TargetStatus(ok=True)
        reason = result.stderr.strip() or result.stdout.strip() or "not logged in"
        return TargetStatus(ok=False, reason=reason[:200])
    except FileNotFoundError:
        return TargetStatus(ok=False, reason="codex CLI not found on PATH")
    except subprocess.TimeoutExpired:
        return TargetStatus(ok=False, reason="codex login status timed out")
    except Exception as e:
        return TargetStatus(ok=False, reason=str(e)[:200])
|
|
61
|
+
|
|
62
|
+
|
|
63
|
+
def _check_via_version(tool: str) -> TargetStatus:
    """Check that ``tool`` is installed by running ``<tool> --version``.

    Uses a 15 second timeout and only verifies that the command exits with
    status 0. Never raises: every failure is reported as
    ``TargetStatus(ok=False, reason=...)``.
    """
    binary = _resolve_cli(tool)
    try:
        result = subprocess.run(
            [binary, "--version"],
            capture_output=True, text=True, timeout=15, input="",
        )
        if result.returncode == 0:
            return TargetStatus(ok=True)
        # Strip each stream before choosing, so whitespace-only stderr does not
        # hide a useful stdout message or produce an empty reason.
        reason = result.stderr.strip() or result.stdout.strip() or "non-zero exit"
        return TargetStatus(ok=False, reason=reason[:200])
    except FileNotFoundError:
        return TargetStatus(ok=False, reason=f"{tool} CLI not found on PATH")
    except subprocess.TimeoutExpired:
        return TargetStatus(ok=False, reason=f"{tool} --version timed out")
    except Exception as e:
        return TargetStatus(ok=False, reason=str(e)[:200])
|
|
80
|
+
|
|
81
|
+
|
|
82
|
+
# Maps every LLMTarget to a zero-argument probe returning its TargetStatus.
# Claude and Codex use dedicated login-status checks; the remaining targets
# are only verified by running `<tool> --version`.
_CHECKERS: dict[LLMTarget, Callable[[], TargetStatus]] = {
    LLMTarget.CLAUDE: _check_claude,
    LLMTarget.GEMINI: lambda: _check_via_version("gemini"),
    LLMTarget.CODEX: _check_codex,
    LLMTarget.OPENCODE: lambda: _check_via_version("opencode"),
    LLMTarget.COPILOT: lambda: _check_via_version("copilot"),
}
|
|
89
|
+
|
|
90
|
+
|
|
91
|
+
def check_target(target: LLMTarget) -> TargetStatus:
    """Run the preflight check registered for ``target`` and return its status.

    Raises:
        KeyError: If ``target`` has no registered checker.
    """
    checker = _CHECKERS[target]
    return checker()
|
|
94
|
+
|
|
95
|
+
|
|
96
|
+
def check_all() -> dict[LLMTarget, TargetStatus]:
    """Run every registered preflight check and return the results by target.

    Each call executes the checks afresh; nothing is cached.
    """
    statuses: dict[LLMTarget, TargetStatus] = {}
    for target, checker in _CHECKERS.items():
        statuses[target] = checker()
    return statuses
|
|
99
|
+
|
|
100
|
+
if __name__ == '__main__':
    # Manual diagnostic entry point: print the Codex check result so running
    # the module produces visible output instead of discarding the status.
    print(_check_codex())
|