zcode-supervisor 0.0.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- tools/__init__.py +2 -0
- tools/zcode_control/__init__.py +16 -0
- tools/zcode_control/browser_scripts.mjs +106 -0
- tools/zcode_control/provider_errors.mjs +135 -0
- tools/zcode_control/zcodectl.mjs +2097 -0
- tools/zcode_eval/__init__.py +2 -0
- tools/zcode_eval/duel_import.py +304 -0
- tools/zcode_eval/zcode_eval.py +687 -0
- tools/zcode_eval/zcode_release.py +221 -0
- tools/zcode_supervisor/__init__.py +2 -0
- tools/zcode_supervisor/auto_route.py +393 -0
- tools/zcode_supervisor/repo_setup.py +439 -0
- tools/zcode_supervisor/zcode_supervisor.py +696 -0
- zcode_supervisor-0.0.1.dist-info/METADATA +928 -0
- zcode_supervisor-0.0.1.dist-info/RECORD +19 -0
- zcode_supervisor-0.0.1.dist-info/WHEEL +5 -0
- zcode_supervisor-0.0.1.dist-info/entry_points.txt +7 -0
- zcode_supervisor-0.0.1.dist-info/licenses/LICENSE +21 -0
- zcode_supervisor-0.0.1.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,687 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""Small harness for evaluating ZCode against Claude Code GLM workers.
|
|
3
|
+
|
|
4
|
+
The CLI deliberately starts with inspection and measurement. Direct ZCode GUI
|
|
5
|
+
automation should only be added after the local app exposes a stable control
|
|
6
|
+
surface such as a CLI, URL scheme, or documented IPC.
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
from __future__ import annotations
|
|
10
|
+
|
|
11
|
+
import argparse
|
|
12
|
+
import json
|
|
13
|
+
import math
|
|
14
|
+
import plistlib
|
|
15
|
+
import statistics
|
|
16
|
+
import subprocess
|
|
17
|
+
import sys
|
|
18
|
+
from dataclasses import dataclass
|
|
19
|
+
from datetime import datetime, timezone
|
|
20
|
+
from pathlib import Path
|
|
21
|
+
from typing import Any
|
|
22
|
+
|
|
23
|
+
try:
|
|
24
|
+
from .duel_import import import_duel_results
|
|
25
|
+
except ImportError:
|
|
26
|
+
from duel_import import import_duel_results
|
|
27
|
+
|
|
28
|
+
# Directories scanned by find_zcode_apps() when no explicit search path is given.
DEFAULT_APP_DIRS = (
    Path("/Applications"),
    Path.home() / "Applications",
)

# Candidate locations of the bundled CLI script probed by find_cli_info().
DEFAULT_CLI_PATHS = (
    Path("/Applications/ZCode.app/Contents/Resources/glm/zcode.cjs"),
)

# Ledger file used by every sub-command unless --path overrides it.
DEFAULT_LEDGER = Path("artifacts/evals/zcode-vs-claude.jsonl")

# Values accepted by `append-result --tool`.
SUPPORTED_TOOLS = ("zcode", "claude-code-glm52")

# Numeric per-run measurements: each becomes an `append-result` option, is
# copied into the record when set, and is averaged/totalled by summarize_tool().
STAT_FIELDS = (
    "duration_seconds",
    "manual_interventions",
    "tokens_total",
    "tokens_before",
    "tokens_after",
    "tokens_used",
    "quota_units",
    "quota_percent_before",
    "quota_percent_after",
    "quota_percent_used",
    "files_changed",
    "lines_added",
    "lines_deleted",
    "tests_passed",
    "tests_failed",
)

# Subset of STAT_FIELDS that stat_type() validates as a 0-100 percentage.
PERCENT_FIELDS = {
    "quota_percent_before",
    "quota_percent_after",
    "quota_percent_used",
}

# Optional metadata copied verbatim from the parsed arguments into the record.
PROVIDER_META_FIELDS = (
    "supervisor_state",
    "provider_code",
    "provider_message",
    "provider_request_id",
    "provider_error_line",
    "provider_id",
    "provider_kind",
    "attempt_count",
    "attempts",
    "retry_count",
    "retry_delays_ms",
    "no_usage_reason",
    "quota_percent_status",
    "quota_percent_unavailable_reason",
    "source_run_dir",
    "source_result_path",
    "preview",
    "task_kind",
)

# Tri-state flags (True / False / unset) exposed as --flag / --no-flag options.
PROVIDER_BOOL_FIELDS = (
    "provider_error",
    "retryable_provider_error",
    "partial_artifacts_possible",
    "safe_to_retry_later",
    "usage_available",
)
|
|
77
|
+
|
|
78
|
+
|
|
79
|
+
@dataclass(frozen=True)
class ZCodeAppInfo:
    """Immutable summary of one application bundle as read from its Info.plist."""

    # Filesystem path of the .app bundle, stored as a string.
    path: str
    # CFBundleIdentifier, or None when the plist does not provide it.
    bundle_id: str | None
    # CFBundleShortVersionString, falling back to CFBundleVersion.
    version: str | None
    # CFBundleExecutable, or None when absent.
    executable: str | None
    # All CFBundleURLSchemes values collected across CFBundleURLTypes entries.
    url_schemes: list[str]
|
|
86
|
+
|
|
87
|
+
|
|
88
|
+
def utc_now() -> str:
    """Return the current UTC time as an ISO-8601 string with whole-second precision."""
    moment = datetime.now(timezone.utc)
    truncated = moment.replace(microsecond=0)
    return truncated.isoformat()
|
|
90
|
+
|
|
91
|
+
|
|
92
|
+
def find_zcode_apps(search_dirs: tuple[Path, ...] = DEFAULT_APP_DIRS) -> list[Path]:
    """Return the sorted ``*.app`` bundles whose name contains "zcode" (case-insensitive).

    Directories in *search_dirs* that do not exist are skipped; only the
    direct children of each directory are examined.
    """
    matches = [
        candidate
        for directory in search_dirs
        if directory.exists()
        for candidate in directory.glob("*.app")
        if "zcode" in candidate.name.lower()
    ]
    matches.sort()
    return matches
|
|
101
|
+
|
|
102
|
+
|
|
103
|
+
def read_app_info(app_path: Path) -> ZCodeAppInfo:
    """Read bundle metadata for *app_path* from ``Contents/Info.plist``.

    A missing, unreadable, malformed, or non-dictionary plist is treated as
    "no metadata": the returned record then carries ``None`` fields and an
    empty scheme list instead of aborting the whole inspection.

    Args:
        app_path: Path of the ``.app`` bundle directory.

    Returns:
        A ``ZCodeAppInfo`` describing the bundle.
    """
    # Imported locally: plistlib lets expat errors escape for malformed XML plists.
    from xml.parsers.expat import ExpatError

    info_plist = app_path / "Contents" / "Info.plist"
    data: dict[str, Any] = {}
    if info_plist.exists():
        try:
            with info_plist.open("rb") as handle:
                loaded = plistlib.load(handle)
        except (OSError, ValueError, ExpatError):
            # plistlib.InvalidFileException is a ValueError subclass.
            loaded = None
        if isinstance(loaded, dict):
            data = loaded

    schemes: list[str] = []
    for entry in data.get("CFBundleURLTypes") or []:
        # Skip malformed entries rather than failing on ``.get``.
        if isinstance(entry, dict):
            schemes.extend(entry.get("CFBundleURLSchemes") or [])

    return ZCodeAppInfo(
        path=str(app_path),
        bundle_id=data.get("CFBundleIdentifier"),
        version=data.get("CFBundleShortVersionString") or data.get("CFBundleVersion"),
        executable=data.get("CFBundleExecutable"),
        url_schemes=schemes,
    )
|
|
121
|
+
|
|
122
|
+
|
|
123
|
+
def infer_control_surface(apps: list[ZCodeAppInfo]) -> str:
    """Classify how the discovered apps could be driven.

    Returns ``"install_required"`` when *apps* is empty,
    ``"url_scheme_candidate"`` when any app declares at least one URL scheme,
    and ``"gui_or_bundle_only"`` otherwise.
    """
    if not apps:
        return "install_required"
    for app in apps:
        if app.url_schemes:
            return "url_scheme_candidate"
    return "gui_or_bundle_only"
|
|
129
|
+
|
|
130
|
+
|
|
131
|
+
def print_doctor(as_json: bool) -> int:
    """Print what is known about local ZCode installs and return 0.

    With *as_json* true the report is emitted as sorted, indented JSON;
    otherwise it is rendered as a short human-readable listing.
    """
    apps = [read_app_info(app_path) for app_path in find_zcode_apps()]
    payload = {
        "status": "found" if apps else "not_found",
        "apps": [app.__dict__ for app in apps],
        "cli": find_cli_info(),
        "control_surface": infer_control_surface(apps),
    }

    if as_json:
        print(json.dumps(payload, indent=2, sort_keys=True))
        return 0

    report = [f"ZCode app: {payload['status']}"]
    for app in apps:
        report.append(f"- path: {app.path}")
        report.append(f"  bundle_id: {app.bundle_id or 'unknown'}")
        report.append(f"  version: {app.version or 'unknown'}")
        report.append(f"  executable: {app.executable or 'unknown'}")
        report.append(f"  url_schemes: {', '.join(app.url_schemes) or 'none'}")

    cli = payload["cli"]
    report.append(f"cli: {cli['status']}")
    if cli.get("path"):
        report.append(f"  path: {cli['path']}")
        report.append(f"  version: {cli.get('version') or 'unknown'}")
    report.append(f"control_surface: {payload['control_surface']}")

    print("\n".join(report))
    return 0
|
|
156
|
+
|
|
157
|
+
|
|
158
|
+
def find_cli_info() -> dict[str, Any]:
    """Describe the first existing CLI script among ``DEFAULT_CLI_PATHS``.

    The script is run through ``node <path> --version`` with a 5 second
    timeout. The version is the stripped stdout on a zero exit status and
    ``None`` when the probe fails, times out, or exits non-zero.
    """
    cli_path = next((candidate for candidate in DEFAULT_CLI_PATHS if candidate.exists()), None)
    if cli_path is None:
        return {"status": "not_found"}

    command = ["node", str(cli_path), "--version"]
    try:
        completed = subprocess.run(
            command,
            text=True,
            capture_output=True,
            check=False,
            timeout=5,
        )
    except (OSError, subprocess.TimeoutExpired):
        version = None
    else:
        version = completed.stdout.strip() if completed.returncode == 0 else None
    return {"status": "found", "path": str(cli_path), "version": version}
|
|
177
|
+
|
|
178
|
+
|
|
179
|
+
def load_records(path: Path) -> list[dict[str, Any]]:
    """Load every record from the JSONL ledger at *path*.

    Blank lines are ignored and a missing file yields an empty list.

    Args:
        path: Location of the JSONL ledger.

    Returns:
        The decoded records in file order.

    Raises:
        ValueError: If a line is not valid JSON, or decodes to something
            other than a JSON object. The message carries the file path and
            1-based line number.
    """
    if not path.exists():
        return []
    records: list[dict[str, Any]] = []
    with path.open("r", encoding="utf-8") as handle:
        for line_no, line in enumerate(handle, start=1):
            line = line.strip()
            if not line:
                continue
            try:
                record = json.loads(line)
            except json.JSONDecodeError as exc:
                raise ValueError(f"{path}:{line_no}: invalid JSONL: {exc}") from exc
            # Consumers call ``record.get``; reject scalars/arrays here so the
            # failure points at the offending line instead of surfacing later.
            if not isinstance(record, dict):
                raise ValueError(
                    f"{path}:{line_no}: invalid JSONL: expected an object, "
                    f"got {type(record).__name__}"
                )
            records.append(record)
    return records
|
|
193
|
+
|
|
194
|
+
|
|
195
|
+
def read_json(path: Path) -> dict[str, Any]:
    """Decode and return the UTF-8 JSON document stored at *path*."""
    with path.open("r", encoding="utf-8") as handle:
        return json.load(handle)
|
|
197
|
+
|
|
198
|
+
|
|
199
|
+
def non_negative_float(raw: str) -> float:
    """Parse *raw* as a finite float that is greater than or equal to zero.

    Intended as an ``argparse`` ``type=`` callable.

    Raises:
        ValueError: If *raw* is not a number at all (raised by ``float``).
        argparse.ArgumentTypeError: If the number is negative, NaN, or infinite.
    """
    value = float(raw)
    # ``float`` accepts "nan"/"inf"; NaN also slips past ``value < 0`` and
    # would later be serialized as non-standard JSON.
    if not math.isfinite(value):
        raise argparse.ArgumentTypeError("value must be a finite number")
    if value < 0:
        raise argparse.ArgumentTypeError("value must be non-negative")
    return value
|
|
204
|
+
|
|
205
|
+
|
|
206
|
+
def bounded_percent(raw: str) -> float:
    """Parse *raw* as a percentage in the closed range 0 to 100.

    Intended as an ``argparse`` ``type=`` callable; the lower bound is
    enforced by ``non_negative_float``.

    Raises:
        argparse.ArgumentTypeError: If the value is outside 0-100 or is NaN.
    """
    value = non_negative_float(raw)
    # Written as a negated range test so NaN (which compares false to
    # everything) is rejected instead of passing a ``value > 100`` check.
    if not 0 <= value <= 100:
        raise argparse.ArgumentTypeError("percent must be between 0 and 100")
    return value
|
|
211
|
+
|
|
212
|
+
|
|
213
|
+
def non_negative_int(raw: str) -> int:
    """Parse *raw* as an integer and reject negative values.

    Raises ``argparse.ArgumentTypeError`` for a negative number; a string
    that ``int`` cannot parse raises ``ValueError``.
    """
    number = int(raw)
    if number >= 0:
        return number
    raise argparse.ArgumentTypeError("value must be non-negative")
|
|
218
|
+
|
|
219
|
+
|
|
220
|
+
def parse_retry_delays_ms(raw: str) -> list[int]:
    """Parse retry delays given as a JSON list or a comma-separated list.

    Input starting with ``[`` is decoded as JSON; anything else is split on
    commas with empty items dropped. Blank input yields an empty list.

    Args:
        raw: The raw option value.

    Returns:
        The delays as non-negative integers, in input order.

    Raises:
        argparse.ArgumentTypeError: If the JSON value is not a list, or a
            delay is negative, a boolean, or a non-integral number.
        ValueError: If the JSON is malformed or an item is not numeric.
    """
    text = raw.strip()
    if not text:
        return []
    if text.startswith("["):
        items = json.loads(text)
        if not isinstance(items, list):
            raise argparse.ArgumentTypeError("retry delays must be a JSON list or comma list")
    else:
        items = [part.strip() for part in text.split(",") if part.strip()]

    parsed: list[int] = []
    for item in items:
        # ``int`` would silently turn ``true`` into 1 and 1.5 into 1; refuse
        # values that cannot be represented exactly as whole milliseconds.
        if isinstance(item, bool) or (isinstance(item, float) and not item.is_integer()):
            raise argparse.ArgumentTypeError("retry delays must be whole numbers")
        delay = int(item)
        if delay < 0:
            raise argparse.ArgumentTypeError("retry delays must be non-negative")
        parsed.append(delay)
    return parsed
|
|
238
|
+
|
|
239
|
+
|
|
240
|
+
def stat_type(field: str):
    """Return the argparse converter for the stat named *field*.

    Fields listed in ``PERCENT_FIELDS`` use ``bounded_percent``; all other
    fields use ``non_negative_float``.
    """
    if field in PERCENT_FIELDS:
        return bounded_percent
    return non_negative_float
|
|
242
|
+
|
|
243
|
+
|
|
244
|
+
def finite_number(value: Any) -> float | None:
    """Return *value* as a float when it is a real, finite number.

    Booleans, non-numeric values, NaN, infinities, and integers too large to
    be represented as a float all yield ``None``.
    """
    if isinstance(value, bool) or not isinstance(value, (int, float)):
        return None
    try:
        number = float(value)
    except OverflowError:
        # JSON may contain arbitrarily large integers; treat them as unusable
        # rather than letting the conversion abort the caller.
        return None
    return number if math.isfinite(number) else None
|
|
250
|
+
|
|
251
|
+
|
|
252
|
+
def normalize_codexbar_usage_snapshot(payload: Any) -> dict[str, Any]:
    """Convert a CodexBar usage payload into the common snapshot shape.

    *payload* may be a single row or a list of rows. The row whose
    ``provider`` is ``"zai"`` is preferred; otherwise the first dictionary
    row is used. Each of the ``primary``/``secondary``/``tertiary`` usage
    windows that is a dictionary is normalized, and every window carrying a
    usable percentage contributes a quota candidate. The first candidate is
    reported as ``best``.

    A percentage is usable only when it is a real, finite number: booleans,
    NaN, infinities and non-numeric values are recorded as ``None``.

    Returns:
        A dictionary with ``source``, ``provider``, ``windows``, ``best``,
        ``token_candidates`` and ``quota_percent_candidates`` keys.
    """
    rows = payload if isinstance(payload, list) else [payload]
    dict_rows = [item for item in rows if isinstance(item, dict)]
    row = next((item for item in dict_rows if item.get("provider") == "zai"), None)
    if row is None:
        row = dict_rows[0] if dict_rows else {}
    usage = row.get("usage", {})

    windows: dict[str, dict[str, Any]] = {}
    quota_candidates: list[dict[str, Any]] = []
    if isinstance(usage, dict):
        for name in ("primary", "secondary", "tertiary"):
            window = usage.get(name)
            if not isinstance(window, dict):
                continue
            raw_percent = window.get("usedPercent", window.get("used_percent"))

            # Validate the same way finite_number() does elsewhere in this
            # module: bool is an int subclass and NaN/inf are floats, so a
            # bare isinstance check would accept all of them.
            percent_value: float | None = None
            if isinstance(raw_percent, (int, float)) and not isinstance(raw_percent, bool):
                try:
                    converted = float(raw_percent)
                except OverflowError:
                    converted = math.inf
                if math.isfinite(converted):
                    percent_value = converted

            normalized = {
                "name": name,
                "used_percent": raw_percent if percent_value is not None else None,
                "reset_description": window.get("resetDescription"),
                "resets_at": window.get("resetsAt"),
            }
            windows[name] = normalized
            if percent_value is not None:
                quota_candidates.append(
                    {
                        "name": name,
                        "value": percent_value,
                        "line": f"{name}.usedPercent ({normalized['reset_description'] or 'quota window'})",
                    }
                )

    best = quota_candidates[0] if quota_candidates else None
    return {
        "source": "codexbar",
        "provider": row.get("provider"),
        "windows": windows,
        "best": {
            "tokens_total": None,
            "quota_percent": best["value"] if best else None,
            "quota_percent_line": best["line"] if best else None,
        },
        "token_candidates": [],
        "quota_percent_candidates": quota_candidates,
    }
|
|
293
|
+
|
|
294
|
+
|
|
295
|
+
def normalize_zai_api_usage_snapshot(payload: Any) -> dict[str, Any]:
    """Convert a z.ai quota API payload into the common snapshot shape.

    Entries of ``payload["data"]["limits"]`` whose ``type`` is
    ``TOKENS_LIMIT`` or ``TIME_LIMIT`` become the ``primary`` and
    ``secondary`` windows respectively; other entries are ignored. Only the
    primary window feeds ``quota_percent_candidates``, while every window
    with a percentage is listed in ``quota_percent_raw_candidates``.
    """
    data = payload.get("data", {}) if isinstance(payload, dict) else {}
    data_is_mapping = isinstance(data, dict)
    limits = data.get("limits", []) if data_is_mapping else []
    if not isinstance(limits, list):
        limits = []

    window_names = {"TOKENS_LIMIT": "primary", "TIME_LIMIT": "secondary"}
    windows: dict[str, dict[str, Any]] = {}
    primary_candidates: list[dict[str, Any]] = []
    raw_candidates: list[dict[str, Any]] = []

    for limit in limits:
        if not isinstance(limit, dict):
            continue
        limit_type = limit.get("type")
        window_name = window_names.get(limit_type)
        if window_name is None:
            continue

        # The first key that is present wins, even when its value is None.
        percent_source = None
        for key in ("percentage", "usedPercent", "used_percent"):
            if key in limit:
                percent_source = limit[key]
                break
        used_percent = finite_number(percent_source)
        used_amount = finite_number(limit.get("usage"))
        remaining_amount = finite_number(limit.get("remaining"))

        has_percent = used_percent is not None
        missing_reason = None if has_percent else "zai_limit_missing_percentage"
        description = "Tokens limit" if window_name == "primary" else "Time limit"

        windows[window_name] = {
            "name": window_name,
            "type": limit_type,
            "used_percent": used_percent,
            "raw_used_percent": used_percent,
            "non_authoritative_used_percent": None,
            "authoritative": has_percent,
            "token_counts_available": used_amount is not None and remaining_amount is not None,
            "quota_percent_unavailable_reason": missing_reason,
            "reset_description": description,
            "resets_at": limit.get("nextResetTime"),
            "usage": used_amount,
            "remaining": remaining_amount,
        }

        if not has_percent:
            continue
        source_line = f"{limit_type}.percentage ({description})"
        if window_name == "primary":
            primary_candidates.append(
                {"name": window_name, "value": used_percent, "line": source_line}
            )
        raw_candidates.append(
            {
                "name": window_name,
                "value": used_percent,
                "line": source_line,
                "authoritative": has_percent,
                "unavailable_reason": missing_reason,
            }
        )

    best = primary_candidates[0] if primary_candidates else None
    raw_best = raw_candidates[0] if raw_candidates else None
    primary_window = windows.get("primary")

    # Only report a reason when no authoritative percentage was found.
    if best is not None or not isinstance(primary_window, dict):
        best_reason = None
    else:
        best_reason = primary_window.get("quota_percent_unavailable_reason")

    return {
        "source": "zai-api",
        "provider": "zai",
        "plan": data.get("level") if data_is_mapping else None,
        "windows": windows,
        "best": {
            "tokens_total": None,
            "quota_percent": None if best is None else best["value"],
            "quota_percent_line": None if best is None else best["line"],
            "raw_quota_percent": None if raw_best is None else raw_best["value"],
            "quota_percent_authoritative": best is not None,
            "quota_percent_unavailable_reason": best_reason,
        },
        "token_candidates": [],
        "quota_percent_candidates": primary_candidates,
        "quota_percent_raw_candidates": raw_candidates,
    }
|
|
377
|
+
|
|
378
|
+
|
|
379
|
+
def unwrap_usage_snapshot(payload: Any) -> dict[str, Any]:
    """Detect the shape of a usage payload and return a normalized snapshot.

    Checks are applied in order: a list is treated as CodexBar rows; a
    non-dictionary yields ``{}``; a ``value`` dictionary is unwrapped; a
    ``data.limits`` list is treated as a z.ai API response; an already
    normalized snapshot is returned as-is; a raw ``zai`` row with a
    ``usage`` dictionary is normalized; anything else is returned unchanged.
    """
    if isinstance(payload, list):
        return normalize_codexbar_usage_snapshot(payload)
    if not isinstance(payload, dict):
        return {}

    wrapped = payload.get("value")
    if isinstance(wrapped, dict):
        return wrapped

    data = payload.get("data")
    if isinstance(data, dict) and isinstance(data.get("limits"), list):
        return normalize_zai_api_usage_snapshot(payload)

    # Snapshots already in normalized form pass straight through.
    already_normalized = isinstance(payload.get("windows"), dict) and payload.get("source") in (
        "codexbar",
        "zai-api",
    )
    if already_normalized:
        return payload

    if payload.get("provider") == "zai" and isinstance(payload.get("usage"), dict):
        return normalize_codexbar_usage_snapshot(payload)
    return payload
|
|
395
|
+
|
|
396
|
+
|
|
397
|
+
def load_usage_snapshot(path: Path) -> dict[str, Any]:
    """Read the JSON file at *path* and return it as a normalized usage snapshot."""
    raw_payload = read_json(path)
    return unwrap_usage_snapshot(raw_payload)
|
|
399
|
+
|
|
400
|
+
|
|
401
|
+
def best_usage_value(snapshot: dict[str, Any], key: str) -> float | None:
    """Return the preferred numeric reading for *key* from *snapshot*.

    ``snapshot["best"][key]`` is used when it is numeric. Otherwise the first
    numeric ``value`` among the candidates is returned: ``token_candidates``
    when *key* is ``"tokens_total"`` and ``quota_percent_candidates`` for any
    other key. ``None`` means no reading was found.
    """
    best = snapshot.get("best")
    if isinstance(best, dict):
        direct = best.get(key)
        if isinstance(direct, (int, float)):
            return float(direct)

    if key == "tokens_total":
        candidates = snapshot.get("token_candidates", [])
    else:
        candidates = snapshot.get("quota_percent_candidates", [])
    if not isinstance(candidates, list):
        return None

    numeric_values = (
        candidate["value"]
        for candidate in candidates
        if isinstance(candidate, dict) and isinstance(candidate.get("value"), (int, float))
    )
    first = next(numeric_values, None)
    return None if first is None else float(first)
|
|
412
|
+
|
|
413
|
+
|
|
414
|
+
def snapshot_quota_unavailable_reason(snapshot: dict[str, Any]) -> str | None:
    """Return why *snapshot* carries no quota percentage, if it says so.

    Looks, in order, at ``best.quota_percent_unavailable_reason``, at the
    same key on the ``primary`` window, and finally, when ``ok`` is
    ``False``, at ``error_type``, ``reason`` or ``message``. Only string
    reasons are returned.
    """
    reason_key = "quota_percent_unavailable_reason"

    windows = snapshot.get("windows")
    primary = windows.get("primary") if isinstance(windows, dict) else None
    for holder in (snapshot.get("best"), primary):
        if isinstance(holder, dict) and isinstance(holder.get(reason_key), str):
            return holder[reason_key]

    if snapshot.get("ok") is not False:
        return None
    fallback = snapshot.get("error_type") or snapshot.get("reason") or snapshot.get("message")
    return fallback if isinstance(fallback, str) else None
|
|
428
|
+
|
|
429
|
+
|
|
430
|
+
def usage_snapshot_quota_unavailable_reason(args: argparse.Namespace) -> str:
    """Explain why no quota percentage could be derived for this run.

    The "after" snapshot is consulted before the "before" snapshot; the
    first reason either of them reports wins. Without one, the generic
    ``"quota_percent_unavailable"`` is returned.
    """
    snapshot_paths = [args.usage_after, args.usage_before]
    for snapshot_path in snapshot_paths:
        if not snapshot_path:
            continue
        found = snapshot_quota_unavailable_reason(load_usage_snapshot(snapshot_path))
        if found:
            return found
    return "quota_percent_unavailable"
|
|
437
|
+
|
|
438
|
+
|
|
439
|
+
def apply_usage_snapshot_defaults(args: argparse.Namespace, stats: dict[str, float | None]) -> None:
    """Fill missing before/after token and quota stats from snapshot files.

    For each of ``args.usage_before`` and ``args.usage_after`` that is set,
    the snapshot is loaded and used for the matching ``tokens_*`` and
    ``quota_percent_*`` entries of *stats*. Values that are already present
    are left untouched. *stats* is modified in place.
    """
    for moment in ("before", "after"):
        snapshot_path = getattr(args, f"usage_{moment}")
        if not snapshot_path:
            continue
        snapshot = load_usage_snapshot(snapshot_path)

        tokens_key = f"tokens_{moment}"
        if stats[tokens_key] is None:
            stats[tokens_key] = best_usage_value(snapshot, "tokens_total")

        quota_key = f"quota_percent_{moment}"
        if stats[quota_key] is None:
            stats[quota_key] = best_usage_value(snapshot, "quota_percent")
|
|
458
|
+
|
|
459
|
+
|
|
460
|
+
def derive_usage_stats(args: argparse.Namespace, stats: dict[str, float | None]) -> None:
    """Derive ``tokens_used`` and ``quota_percent_used`` from before/after readings.

    A value is derived only when it is not already set, both readings exist,
    and the resulting difference is not negative. For quota, the difference
    is ``before - after`` when ``args.quota_percent_direction`` is
    ``"remaining"`` and ``after - before`` otherwise. Results are rounded to
    four decimals and stored in *stats* in place.
    """
    if stats.get("tokens_used") is None:
        start_tokens = stats.get("tokens_before")
        end_tokens = stats.get("tokens_after")
        both_known = start_tokens is not None and end_tokens is not None
        if both_known and end_tokens >= start_tokens:
            stats["tokens_used"] = round(end_tokens - start_tokens, 4)

    if stats.get("quota_percent_used") is not None:
        return
    start_quota = stats.get("quota_percent_before")
    end_quota = stats.get("quota_percent_after")
    if start_quota is None or end_quota is None:
        return

    counts_down = args.quota_percent_direction == "remaining"
    consumed = start_quota - end_quota if counts_down else end_quota - start_quota
    if consumed >= 0:
        stats["quota_percent_used"] = round(consumed, 4)
|
|
476
|
+
|
|
477
|
+
|
|
478
|
+
def init_ledger(path: Path) -> int:
    """Ensure the ledger file at *path* exists, print its path, and return 0.

    Missing parent directories are created and an existing file is kept.
    """
    parent_dir = path.parent
    parent_dir.mkdir(parents=True, exist_ok=True)
    path.touch(exist_ok=True)
    print(f"{path}")
    return 0
|
|
483
|
+
|
|
484
|
+
|
|
485
|
+
def append_result(args: argparse.Namespace) -> int:
    """Build one benchmark record from *args*, append it to the ledger, and return 0.

    The record combines the identifying fields, every stat that is set
    (after filling gaps from usage snapshots and deriving usage deltas),
    quota bookkeeping, and any provider metadata or flags that were
    supplied. It is written as one sorted JSON line and echoed to stdout as
    indented JSON.
    """
    ledger_path = args.path
    ledger_path.parent.mkdir(parents=True, exist_ok=True)

    record = {
        "recorded_at": utc_now(),
        "run_id": args.run_id,
        "tool": args.tool,
        "task_id": args.task_id,
        "task_name": args.task_name,
        "status": args.status,
        "validation": args.validation,
        "notes": args.notes,
    }

    stats = {name: getattr(args, name) for name in STAT_FIELDS}
    apply_usage_snapshot_defaults(args, stats)
    derive_usage_stats(args, stats)

    # The direction is only meaningful when some quota input was provided.
    snapshot_given = bool(args.usage_before or args.usage_after)
    quota_reading_known = (
        stats.get("quota_percent_before") is not None
        or stats.get("quota_percent_after") is not None
    )
    if snapshot_given or quota_reading_known:
        record["quota_percent_direction"] = args.quota_percent_direction

    # An explicit status wins; otherwise infer it from what could be derived.
    status = args.quota_percent_status
    reason = args.quota_percent_unavailable_reason
    if status is None:
        if stats.get("quota_percent_used") is not None:
            status = "measured"
        elif snapshot_given:
            status = "unavailable"
    if status == "unavailable" and reason is None:
        reason = usage_snapshot_quota_unavailable_reason(args)
    if status is not None:
        record["quota_percent_status"] = status
    if reason is not None:
        record["quota_percent_unavailable_reason"] = reason

    if args.usage_available is False and args.no_usage_reason is None:
        record["no_usage_reason"] = "usage_marked_unavailable"

    for name in STAT_FIELDS:
        measured = stats[name]
        if measured is not None:
            record[name] = measured
    for name in (*PROVIDER_META_FIELDS, *PROVIDER_BOOL_FIELDS):
        supplied = getattr(args, name)
        if supplied is not None:
            record[name] = supplied

    with ledger_path.open("a", encoding="utf-8") as handle:
        handle.write(json.dumps(record, sort_keys=True) + "\n")
    print(json.dumps(record, indent=2, sort_keys=True))
    return 0
|
|
539
|
+
|
|
540
|
+
|
|
541
|
+
def build_summary(records: list[dict[str, Any]]) -> dict[str, Any]:
    """Group *records* by their ``tool`` field and summarize each group.

    Records without a ``tool`` key are grouped under ``"unknown"``. The
    result holds the total record count and one summary per tool, ordered by
    tool name.
    """
    grouped: dict[str, list[dict[str, Any]]] = {}
    for entry in records:
        tool_name = entry.get("tool", "unknown")
        if tool_name not in grouped:
            grouped[tool_name] = []
        grouped[tool_name].append(entry)

    per_tool: dict[str, Any] = {}
    for tool_name in sorted(grouped):
        per_tool[tool_name] = summarize_tool(grouped[tool_name])
    return {"records": len(records), "tools": per_tool}
|
|
553
|
+
|
|
554
|
+
|
|
555
|
+
def summarize_tool(records: list[dict[str, Any]]) -> dict[str, Any]:
    """Aggregate the records of a single tool.

    Args:
        records: Ledger records that all belong to the same tool.

    Returns:
        A dictionary with the run count, pass rate (rounded to three
        decimals, ``0.0`` when *records* is empty), counters for provider
        errors, retryable errors and partial successes, and for every stat
        field with at least one usable value an ``avg_<field>`` and
        ``total_<field>`` entry rounded to two decimals.
    """
    run_count = len(records)
    passed = sum(1 for record in records if record.get("status") == "pass")
    result: dict[str, Any] = {
        "runs": run_count,
        # Guard the division so an empty group reports 0.0 instead of raising.
        "pass_rate": round(passed / run_count, 3) if run_count else 0.0,
        "provider_errors": sum(1 for record in records if record.get("provider_error") is True),
        # NOTE(review): this counts ``safe_to_retry_later`` although a separate
        # ``retryable_provider_error`` flag exists — confirm which is intended.
        "retryable_provider_errors": sum(1 for record in records if record.get("safe_to_retry_later") is True),
        "partial_successes": sum(1 for record in records if record.get("supervisor_state") == "partial_success"),
    }
    for field in STAT_FIELDS:
        # finite_number() rejects booleans and NaN/inf, which a plain
        # isinstance check would fold into the averages and totals.
        values = [
            record[field]
            for record in records
            if finite_number(record.get(field)) is not None
        ]
        if values:
            result[f"avg_{field}"] = round(statistics.fmean(values), 2)
            result[f"total_{field}"] = round(sum(values), 2)
    return result
|
|
574
|
+
|
|
575
|
+
|
|
576
|
+
def summarize(path: Path) -> int:
    """Print a JSON summary of the ledger at *path* and return 0.

    Prints ``No records.`` instead when the ledger is missing or empty.
    """
    records = load_records(path)
    if records:
        summary = build_summary(records)
        print(json.dumps(summary, indent=2, sort_keys=True))
    else:
        print("No records.")
    return 0
|
|
583
|
+
|
|
584
|
+
|
|
585
|
+
def show_log(path: Path, limit: int, as_json: bool) -> int:
    """Print the most recent ledger records and return 0.

    Args:
        path: Location of the JSONL ledger.
        limit: Number of trailing records to show. Zero, a negative number,
            or ``None`` shows every record.
        as_json: Emit the selected records as indented JSON instead of one
            summary line per record.
    """
    records = load_records(path)
    # A negative limit would make ``records[-limit:]`` skip the *first*
    # records instead of keeping the last ones, so only slice when positive.
    selected = records[-limit:] if limit and limit > 0 else records
    if as_json:
        print(json.dumps(selected, indent=2, sort_keys=True))
        return 0
    for record in selected:
        tokens = record.get("tokens_used", record.get("tokens_total", "unknown"))
        quota = record.get("quota_percent_used", "unknown")
        print(
            f"{record.get('recorded_at', 'unknown')} "
            f"{record.get('tool', 'unknown')} "
            f"{record.get('task_id', 'unknown')} "
            f"status={record.get('status', 'unknown')} "
            f"tokens_used={tokens} quota_percent_used={quota}"
        )
    return 0
|
|
602
|
+
|
|
603
|
+
|
|
604
|
+
def build_parser() -> argparse.ArgumentParser:
    """Create the command-line parser with all sub-commands registered.

    Every sub-command stores its handler under ``func`` so the caller can
    dispatch with ``args.func(args)``.
    """
    parser = argparse.ArgumentParser(description="Evaluate ZCode vs Claude Code GLM workers.")
    commands = parser.add_subparsers(dest="command", required=True)

    # doctor: inspect the local installation.
    doctor_cmd = commands.add_parser("doctor", help="Inspect whether ZCode.app is installed.")
    doctor_cmd.add_argument("--json", action="store_true", help="Print machine-readable app info.")
    doctor_cmd.set_defaults(func=lambda args: print_doctor(args.json))

    # init-ledger: create an empty ledger file.
    init_cmd = commands.add_parser("init-ledger", help="Create the JSONL evaluation ledger.")
    init_cmd.add_argument("--path", type=Path, default=DEFAULT_LEDGER)
    init_cmd.set_defaults(func=lambda args: init_ledger(args.path))

    # append-result: record one benchmark run.
    append_cmd = commands.add_parser("append-result", help="Append one benchmark result.")
    append_cmd.add_argument("--path", type=Path, default=DEFAULT_LEDGER)
    append_cmd.add_argument("--run-id", required=True)
    append_cmd.add_argument("--tool", choices=SUPPORTED_TOOLS, required=True)
    append_cmd.add_argument("--task-id", required=True)
    append_cmd.add_argument("--task-name", required=True)
    append_cmd.add_argument("--status", choices=("pass", "fail", "partial", "blocked"), required=True)
    append_cmd.add_argument("--validation", default="")
    append_cmd.add_argument("--notes", default="")
    append_cmd.add_argument(
        "--supervisor-state",
        choices=("success", "partial_success", "retryable_provider_error", "unsafe_partial", "cli_error"),
    )
    for flag in (
        "--provider-code",
        "--provider-message",
        "--provider-request-id",
        "--provider-error-line",
        "--provider-id",
        "--provider-kind",
    ):
        append_cmd.add_argument(flag)
    for flag in ("--attempt-count", "--attempts", "--retry-count"):
        append_cmd.add_argument(flag, type=non_negative_int)
    append_cmd.add_argument("--retry-delays-ms", type=parse_retry_delays_ms)
    append_cmd.add_argument("--no-usage-reason")
    append_cmd.add_argument("--quota-percent-status", choices=("measured", "estimated", "unavailable"))
    for flag in (
        "--quota-percent-unavailable-reason",
        "--source-run-dir",
        "--source-result-path",
        "--preview",
        "--task-kind",
    ):
        append_cmd.add_argument(flag)
    # Tri-state flags: --name, --no-name, or unset (None).
    for name in PROVIDER_BOOL_FIELDS:
        append_cmd.add_argument(
            "--" + name.replace("_", "-"),
            action=argparse.BooleanOptionalAction,
            default=None,
        )
    append_cmd.add_argument("--usage-before", type=Path)
    append_cmd.add_argument("--usage-after", type=Path)
    append_cmd.add_argument(
        "--quota-percent-direction",
        choices=("remaining", "used"),
        default="remaining",
        help="How to derive quota-percent-used from before/after snapshots.",
    )
    for name in STAT_FIELDS:
        append_cmd.add_argument("--" + name.replace("_", "-"), type=stat_type(name))
    append_cmd.set_defaults(func=append_result)

    # import-duel-results: bulk import from an external results file.
    import_cmd = commands.add_parser(
        "import-duel-results",
        help="Append rows from an external supervisor duel results.json.",
    )
    import_cmd.add_argument("--source", type=Path, required=True)
    import_cmd.add_argument("--path", type=Path, default=DEFAULT_LEDGER)
    import_cmd.add_argument("--tool", choices=("zcode", "claude-code-glm52", "all"), default="zcode")
    import_cmd.add_argument("--allow-duplicates", action="store_true")
    import_cmd.set_defaults(func=import_duel_results)

    # summarize: aggregate the ledger per tool.
    summarize_cmd = commands.add_parser("summarize", help="Summarize recorded benchmark results.")
    summarize_cmd.add_argument("--path", type=Path, default=DEFAULT_LEDGER)
    summarize_cmd.set_defaults(func=lambda args: summarize(args.path))

    # show-log: print the latest records.
    show_cmd = commands.add_parser("show-log", help="Show recent JSONL result records.")
    show_cmd.add_argument("--path", type=Path, default=DEFAULT_LEDGER)
    show_cmd.add_argument("--limit", type=int, default=10)
    show_cmd.add_argument("--json", action="store_true")
    show_cmd.set_defaults(func=lambda args: show_log(args.path, args.limit, args.json))

    return parser
|
|
678
|
+
|
|
679
|
+
|
|
680
|
+
def main(argv: list[str] | None = None) -> int:
    """Parse *argv* and run the selected sub-command, returning its exit status.

    When *argv* is ``None`` it is passed through to ``parse_args`` unchanged.
    """
    namespace = build_parser().parse_args(argv)
    handler = namespace.func
    return handler(namespace)
|
|
684
|
+
|
|
685
|
+
|
|
686
|
+
# Script entry point: exit with the status returned by main().
if __name__ == "__main__":
    raise SystemExit(main())
|