zcode-supervisor 0.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,687 @@
1
+ #!/usr/bin/env python3
2
+ """Small harness for evaluating ZCode against Claude Code GLM workers.
3
+
4
+ The CLI deliberately starts with inspection and measurement. Direct ZCode GUI
5
+ automation should only be added after the local app exposes a stable control
6
+ surface such as a CLI, URL scheme, or documented IPC.
7
+ """
8
+
9
+ from __future__ import annotations
10
+
11
+ import argparse
12
+ import json
13
+ import math
14
+ import plistlib
15
+ import statistics
16
+ import subprocess
17
+ import sys
18
+ from dataclasses import dataclass
19
+ from datetime import datetime, timezone
20
+ from pathlib import Path
21
+ from typing import Any
22
+
23
+ try:
24
+ from .duel_import import import_duel_results
25
+ except ImportError:
26
+ from duel_import import import_duel_results
27
+
28
# Directories scanned for installed ZCode application bundles.
DEFAULT_APP_DIRS = (Path("/Applications"), Path.home() / "Applications")
# Candidate locations of the CLI script shipped inside the app bundle.
DEFAULT_CLI_PATHS = (Path("/Applications/ZCode.app/Contents/Resources/glm/zcode.cjs"),)
# Default JSONL ledger used by every sub-command that takes --path.
DEFAULT_LEDGER = Path("artifacts/evals/zcode-vs-claude.jsonl")
# Tool identifiers accepted by `append-result --tool`.
SUPPORTED_TOOLS = ("zcode", "claude-code-glm52")

# Numeric per-run fields; each one becomes an `append-result` flag and is
# aggregated (average / total) by the summary.
STAT_FIELDS = (
    "duration_seconds",
    "manual_interventions",
    "tokens_total",
    "tokens_before",
    "tokens_after",
    "tokens_used",
    "quota_units",
    "quota_percent_before",
    "quota_percent_after",
    "quota_percent_used",
    "files_changed",
    "lines_added",
    "lines_deleted",
    "tests_passed",
    "tests_failed",
)
# Subset of STAT_FIELDS that is additionally capped at 100 when parsed.
PERCENT_FIELDS = {
    "quota_percent_before",
    "quota_percent_after",
    "quota_percent_used",
}

# Optional metadata copied verbatim from the parsed arguments into a record.
PROVIDER_META_FIELDS = (
    "supervisor_state",
    "provider_code",
    "provider_message",
    "provider_request_id",
    "provider_error_line",
    "provider_id",
    "provider_kind",
    "attempt_count",
    "attempts",
    "retry_count",
    "retry_delays_ms",
    "no_usage_reason",
    "quota_percent_status",
    "quota_percent_unavailable_reason",
    "source_run_dir",
    "source_result_path",
    "preview",
    "task_kind",
)
# Tri-state flags (True / False / not given) exposed as --flag / --no-flag.
PROVIDER_BOOL_FIELDS = (
    "provider_error",
    "retryable_provider_error",
    "partial_artifacts_possible",
    "safe_to_retry_later",
    "usage_available",
)
77
+
78
+
79
@dataclass(frozen=True)
class ZCodeAppInfo:
    """Immutable description of one discovered ZCode application bundle."""

    # Filesystem location of the bundle, stored as a string.
    path: str
    # Bundle identifier, or None when not available.
    bundle_id: str | None
    # Version string, or None when not available.
    version: str | None
    # Name of the bundle executable, or None when not available.
    executable: str | None
    # URL schemes declared by the bundle; empty when none are declared.
    url_schemes: list[str]
86
+
87
+
88
def utc_now() -> str:
    """Return the current UTC time as an ISO-8601 string, truncated to whole seconds."""
    now = datetime.now(timezone.utc)
    truncated = now.replace(microsecond=0)
    return truncated.isoformat()
90
+
91
+
92
def find_zcode_apps(search_dirs: tuple[Path, ...] = DEFAULT_APP_DIRS) -> list[Path]:
    """Return the sorted ``*.app`` bundles whose name contains "zcode".

    The name match is case-insensitive. Search directories that do not exist
    are skipped.
    """
    matches = [
        bundle
        for directory in search_dirs
        if directory.exists()
        for bundle in directory.glob("*.app")
        if "zcode" in bundle.name.lower()
    ]
    matches.sort()
    return matches
101
+
102
+
103
def read_app_info(app_path: Path) -> ZCodeAppInfo:
    """Build a ``ZCodeAppInfo`` from the bundle's ``Contents/Info.plist``.

    When the plist file is missing, every optional field is ``None`` and the
    URL scheme list is empty.
    """
    plist_path = app_path / "Contents" / "Info.plist"
    plist: dict[str, Any] = {}
    if plist_path.exists():
        with plist_path.open("rb") as stream:
            plist = plistlib.load(stream)

    # A key that is present but null/empty is treated like a missing key.
    url_types = plist.get("CFBundleURLTypes", []) or []
    schemes: list[str] = [
        scheme
        for url_type in url_types
        for scheme in (url_type.get("CFBundleURLSchemes", []) or [])
    ]

    # Prefer the short version string; fall back to the bundle version.
    version = plist.get("CFBundleShortVersionString") or plist.get("CFBundleVersion")
    return ZCodeAppInfo(
        path=str(app_path),
        bundle_id=plist.get("CFBundleIdentifier"),
        version=version,
        executable=plist.get("CFBundleExecutable"),
        url_schemes=schemes,
    )
121
+
122
+
123
def infer_control_surface(apps: list[ZCodeAppInfo]) -> str:
    """Classify how the discovered apps could be driven.

    Returns ``"install_required"`` when no app was found,
    ``"url_scheme_candidate"`` when at least one app declares a URL scheme,
    and ``"gui_or_bundle_only"`` otherwise.
    """
    if not apps:
        return "install_required"
    for app in apps:
        if app.url_schemes:
            return "url_scheme_candidate"
    return "gui_or_bundle_only"
129
+
130
+
131
def print_doctor(as_json: bool) -> int:
    """Print what is known about the local ZCode install and return exit code 0.

    With ``as_json`` the report is printed as sorted, indented JSON; otherwise
    a short human-readable listing is printed.
    """
    apps = [read_app_info(bundle) for bundle in find_zcode_apps()]
    report = {
        "status": "found" if apps else "not_found",
        "apps": [app.__dict__ for app in apps],
        "cli": find_cli_info(),
        "control_surface": infer_control_surface(apps),
    }
    if as_json:
        print(json.dumps(report, indent=2, sort_keys=True))
        return 0

    print(f"ZCode app: {report['status']}")
    for app in apps:
        print(f"- path: {app.path}")
        print(f" bundle_id: {app.bundle_id or 'unknown'}")
        print(f" version: {app.version or 'unknown'}")
        print(f" executable: {app.executable or 'unknown'}")
        print(f" url_schemes: {', '.join(app.url_schemes) or 'none'}")
    cli = report["cli"]
    print(f"cli: {cli['status']}")
    if cli.get("path"):
        print(f" path: {cli['path']}")
        print(f" version: {cli.get('version') or 'unknown'}")
    print(f"control_surface: {report['control_surface']}")
    return 0
156
+
157
+
158
def find_cli_info() -> dict[str, Any]:
    """Describe the first existing CLI script from ``DEFAULT_CLI_PATHS``.

    Returns ``{"status": "not_found"}`` when none of the candidates exists.
    Otherwise the script is run through ``node ... --version`` with a
    five-second timeout; ``version`` is its stripped stdout, or ``None`` when
    the command cannot be started, times out, or exits non-zero.
    """
    cli_path = next(
        (candidate for candidate in DEFAULT_CLI_PATHS if candidate.exists()),
        None,
    )
    if cli_path is None:
        return {"status": "not_found"}

    try:
        completed = subprocess.run(
            ["node", str(cli_path), "--version"],
            text=True,
            capture_output=True,
            check=False,
            timeout=5,
        )
    except (OSError, subprocess.TimeoutExpired):
        # The version probe is best-effort; the path is still reported.
        version = None
    else:
        version = completed.stdout.strip() if completed.returncode == 0 else None
    return {"status": "found", "path": str(cli_path), "version": version}
177
+
178
+
179
def load_records(path: Path) -> list[dict[str, Any]]:
    """Read a JSONL ledger and return its records in file order.

    Blank lines are skipped. A missing file yields an empty list.

    Raises:
        ValueError: if a line is not valid JSON, or is valid JSON but not an
            object. The message carries ``path:line_no``. Rejecting
            non-objects here keeps every consumer free to call ``.get()``
            on a record.
    """
    if not path.exists():
        return []
    records: list[dict[str, Any]] = []
    with path.open("r", encoding="utf-8") as handle:
        for line_no, raw_line in enumerate(handle, start=1):
            text = raw_line.strip()
            if not text:
                continue
            try:
                record = json.loads(text)
            except json.JSONDecodeError as exc:
                raise ValueError(f"{path}:{line_no}: invalid JSONL: {exc}") from exc
            if not isinstance(record, dict):
                raise ValueError(
                    f"{path}:{line_no}: invalid JSONL: expected a JSON object, "
                    f"got {type(record).__name__}"
                )
            records.append(record)
    return records
193
+
194
+
195
def read_json(path: Path) -> dict[str, Any]:
    """Parse the UTF-8 file at ``path`` as JSON and return the decoded value."""
    text = path.read_text(encoding="utf-8")
    return json.loads(text)
197
+
198
+
199
def non_negative_float(raw: str) -> float:
    """argparse ``type=`` converter for a finite float that is >= 0.

    Raises:
        ValueError: if ``raw`` is not a number (argparse reports this as an
            invalid value).
        argparse.ArgumentTypeError: if the number is NaN, infinite or negative.
    """
    value = float(raw)
    # float() accepts "nan" and "inf"; NaN also slips past `value < 0`, and
    # neither can be written to the ledger as valid JSON.
    if not math.isfinite(value):
        raise argparse.ArgumentTypeError("value must be a finite number")
    if value < 0:
        raise argparse.ArgumentTypeError("value must be non-negative")
    return value
204
+
205
+
206
def bounded_percent(raw: str) -> float:
    """argparse ``type=`` converter for a percentage in the range 0..100.

    Raises:
        argparse.ArgumentTypeError: if the value is negative, above 100, or NaN.
    """
    value = non_negative_float(raw)
    # Written as `not (value <= 100)` so NaN, which compares False against
    # everything, is rejected as well as values above 100.
    if not value <= 100:
        raise argparse.ArgumentTypeError("percent must be between 0 and 100")
    return value
211
+
212
+
213
def non_negative_int(raw: str) -> int:
    """argparse ``type=`` converter for an integer that is >= 0.

    Raises:
        ValueError: if ``raw`` is not an integer literal.
        argparse.ArgumentTypeError: if the integer is negative.
    """
    number = int(raw)
    if number >= 0:
        return number
    raise argparse.ArgumentTypeError("value must be non-negative")
218
+
219
+
220
def parse_retry_delays_ms(raw: str) -> list[int]:
    """argparse ``type=`` converter for a list of retry delays in milliseconds.

    Accepts either a JSON list (``"[100, 200]"``) or a comma-separated list
    (``"100,200"``). Empty input yields an empty list.

    Raises:
        argparse.ArgumentTypeError: if the JSON value is not a list, or an
            item is a boolean, a non-integral or non-finite number, or negative.
        ValueError: if the JSON is malformed or an item is not an integer
            literal (argparse reports this as an invalid value).
    """
    text = raw.strip()
    if not text:
        return []
    if text.startswith("["):
        items = json.loads(text)
        if not isinstance(items, list):
            raise argparse.ArgumentTypeError("retry delays must be a JSON list or comma list")
    else:
        items = [piece.strip() for piece in text.split(",") if piece.strip()]

    delays: list[int] = []
    for item in items:
        # int() would silently turn true into 1 and 1.5 into 1, and would
        # raise OverflowError (not caught by argparse) for an infinite float.
        is_fractional = isinstance(item, float) and not item.is_integer()
        if isinstance(item, bool) or is_fractional:
            raise argparse.ArgumentTypeError("retry delays must be integers")
        delay = int(item)
        if delay < 0:
            raise argparse.ArgumentTypeError("retry delays must be non-negative")
        delays.append(delay)
    return delays
238
+
239
+
240
def stat_type(field: str):
    """Return the argparse converter for a stat field.

    Fields listed in ``PERCENT_FIELDS`` use ``bounded_percent``; every other
    field uses ``non_negative_float``.
    """
    if field in PERCENT_FIELDS:
        return bounded_percent
    return non_negative_float
242
+
243
+
244
def finite_number(value: Any) -> float | None:
    """Return ``value`` as a float when it is a finite int or float, else ``None``.

    Booleans are rejected even though ``bool`` is a subclass of ``int``.
    """
    is_real_number = isinstance(value, (int, float)) and not isinstance(value, bool)
    if is_real_number and math.isfinite(value):
        return float(value)
    return None
250
+
251
+
252
def normalize_codexbar_usage_snapshot(payload: Any) -> dict[str, Any]:
    """Convert a codexbar usage payload into the harness snapshot shape.

    ``payload`` may be a single row or a list of rows. The row whose
    ``provider`` is ``"zai"`` is preferred; otherwise the first dict row is
    used, and an empty row when there is none.

    The returned dict has ``source``, ``provider``, ``windows`` (keyed by
    ``primary`` / ``secondary`` / ``tertiary``), ``best``,
    ``token_candidates`` (always empty) and ``quota_percent_candidates``.
    ``best`` reflects the first window that carries a usable percentage.
    """
    rows = payload if isinstance(payload, list) else [payload]
    dict_rows = [item for item in rows if isinstance(item, dict)]
    row = next((item for item in dict_rows if item.get("provider") == "zai"), None)
    if row is None:
        row = dict_rows[0] if dict_rows else {}

    usage = row.get("usage", {})
    windows: dict[str, dict[str, Any]] = {}
    quota_candidates: list[dict[str, Any]] = []
    if isinstance(usage, dict):
        for name in ("primary", "secondary", "tertiary"):
            window = usage.get(name)
            if not isinstance(window, dict):
                continue
            raw_percent = window.get("usedPercent", window.get("used_percent"))
            # Same numeric validation as the z.ai normalizer: booleans and
            # NaN/inf are not usable percentages.
            used_percent = raw_percent if finite_number(raw_percent) is not None else None
            reset_description = window.get("resetDescription")
            windows[name] = {
                "name": name,
                "used_percent": used_percent,
                "reset_description": reset_description,
                "resets_at": window.get("resetsAt"),
            }
            if used_percent is not None:
                quota_candidates.append(
                    {
                        "name": name,
                        "value": float(used_percent),
                        "line": f"{name}.usedPercent ({reset_description or 'quota window'})",
                    }
                )

    best = quota_candidates[0] if quota_candidates else None
    return {
        "source": "codexbar",
        "provider": row.get("provider"),
        "windows": windows,
        "best": {
            "tokens_total": None,
            "quota_percent": best["value"] if best else None,
            "quota_percent_line": best["line"] if best else None,
        },
        "token_candidates": [],
        "quota_percent_candidates": quota_candidates,
    }
293
+
294
+
295
def normalize_zai_api_usage_snapshot(payload: Any) -> dict[str, Any]:
    """Convert a z.ai usage API response into the harness snapshot shape.

    Reads ``payload["data"]["limits"]``. Entries of type ``TOKENS_LIMIT`` map
    to the ``primary`` window and ``TIME_LIMIT`` to ``secondary``; any other
    entry is ignored. Only the primary window feeds ``best.quota_percent``,
    while every window with a percentage is listed in
    ``quota_percent_raw_candidates``.
    """
    data = payload.get("data", {}) if isinstance(payload, dict) else {}
    limits = data.get("limits", []) if isinstance(data, dict) else []
    windows: dict[str, dict[str, Any]] = {}
    quota_candidates: list[dict[str, Any]] = []
    raw_quota_candidates: list[dict[str, Any]] = []
    names_by_type = {"TOKENS_LIMIT": "primary", "TIME_LIMIT": "secondary"}

    for limit in limits if isinstance(limits, list) else []:
        if not isinstance(limit, dict):
            continue
        limit_type = limit.get("type")
        # The payload is external data: a list or dict here would be
        # unhashable and raise TypeError in the lookup.
        name = names_by_type.get(limit_type) if isinstance(limit_type, str) else None
        if name is None:
            continue

        raw_used_percent = finite_number(
            limit.get("percentage", limit.get("usedPercent", limit.get("used_percent")))
        )
        usage = finite_number(limit.get("usage"))
        remaining = finite_number(limit.get("remaining"))
        authoritative = raw_used_percent is not None
        unavailable_reason = None if authoritative else "zai_limit_missing_percentage"
        reset_description = "Tokens limit" if name == "primary" else "Time limit"
        line = f"{limit_type}.percentage ({reset_description})"

        windows[name] = {
            "name": name,
            "type": limit_type,
            "used_percent": raw_used_percent,
            "raw_used_percent": raw_used_percent,
            "non_authoritative_used_percent": None,
            "authoritative": authoritative,
            "token_counts_available": usage is not None and remaining is not None,
            "quota_percent_unavailable_reason": unavailable_reason,
            "reset_description": reset_description,
            "resets_at": limit.get("nextResetTime"),
            "usage": usage,
            "remaining": remaining,
        }
        if raw_used_percent is None:
            continue
        if name == "primary":
            quota_candidates.append({"name": name, "value": raw_used_percent, "line": line})
        raw_quota_candidates.append(
            {
                "name": name,
                "value": raw_used_percent,
                "line": line,
                "authoritative": authoritative,
                "unavailable_reason": unavailable_reason,
            }
        )

    best = quota_candidates[0] if quota_candidates else None
    raw_best = raw_quota_candidates[0] if raw_quota_candidates else None
    primary = windows.get("primary")
    if best or not isinstance(primary, dict):
        best_unavailable_reason = None
    else:
        best_unavailable_reason = primary.get("quota_percent_unavailable_reason")
    return {
        "source": "zai-api",
        "provider": "zai",
        "plan": data.get("level") if isinstance(data, dict) else None,
        "windows": windows,
        "best": {
            "tokens_total": None,
            "quota_percent": best["value"] if best else None,
            "quota_percent_line": best["line"] if best else None,
            "raw_quota_percent": raw_best["value"] if raw_best else None,
            "quota_percent_authoritative": bool(best),
            "quota_percent_unavailable_reason": best_unavailable_reason,
        },
        "token_candidates": [],
        "quota_percent_candidates": quota_candidates,
        "quota_percent_raw_candidates": raw_quota_candidates,
    }
377
+
378
+
379
def unwrap_usage_snapshot(payload: Any) -> dict[str, Any]:
    """Turn a decoded usage file into a snapshot dict.

    Dispatch order: a list is treated as codexbar rows; a non-dict yields
    ``{}``; a dict ``value`` entry is returned as the snapshot; a
    ``data.limits`` list is treated as a z.ai API response; a payload already
    marked ``source`` codexbar / zai-api with ``windows`` is returned as is;
    a single ``provider == "zai"`` row with ``usage`` is normalized as
    codexbar. Anything else is returned unchanged.
    """
    if isinstance(payload, list):
        return normalize_codexbar_usage_snapshot(payload)
    if not isinstance(payload, dict):
        return {}

    wrapped = payload.get("value")
    if isinstance(wrapped, dict):
        return wrapped

    data = payload.get("data")
    if isinstance(data, dict) and isinstance(data.get("limits"), list):
        return normalize_zai_api_usage_snapshot(payload)

    has_windows = isinstance(payload.get("windows"), dict)
    if has_windows and payload.get("source") in ("codexbar", "zai-api"):
        return payload

    if payload.get("provider") == "zai" and isinstance(payload.get("usage"), dict):
        return normalize_codexbar_usage_snapshot(payload)
    return payload
395
+
396
+
397
def load_usage_snapshot(path: Path) -> dict[str, Any]:
    """Read the JSON file at ``path`` and return it as a snapshot dict."""
    payload = read_json(path)
    return unwrap_usage_snapshot(payload)
399
+
400
+
401
def best_usage_value(snapshot: dict[str, Any], key: str) -> float | None:
    """Return the preferred numeric value for ``key`` from a usage snapshot.

    ``snapshot["best"][key]`` wins when it is a usable number. Otherwise the
    first usable ``value`` in the matching candidate list is returned:
    ``token_candidates`` for ``"tokens_total"``, ``quota_percent_candidates``
    for any other key. Returns ``None`` when nothing usable is found.
    """
    best = snapshot.get("best")
    if isinstance(best, dict):
        # finite_number rejects booleans and NaN/inf, which a plain
        # isinstance(..., (int, float)) check lets through.
        direct = finite_number(best.get(key))
        if direct is not None:
            return direct

    candidates_key = "token_candidates" if key == "tokens_total" else "quota_percent_candidates"
    candidates = snapshot.get(candidates_key, [])
    if isinstance(candidates, list):
        for candidate in candidates:
            if not isinstance(candidate, dict):
                continue
            value = finite_number(candidate.get("value"))
            if value is not None:
                return value
    return None
412
+
413
+
414
def snapshot_quota_unavailable_reason(snapshot: dict[str, Any]) -> str | None:
    """Return why a snapshot has no quota percentage, if it says so.

    Checked in order: ``best.quota_percent_unavailable_reason``, then
    ``windows.primary.quota_percent_unavailable_reason``, then (only when
    ``ok`` is exactly ``False``) the first truthy value among ``error_type``,
    ``reason`` and ``message``. Non-string values are ignored.
    """
    reason_key = "quota_percent_unavailable_reason"

    best = snapshot.get("best")
    if isinstance(best, dict):
        from_best = best.get(reason_key)
        if isinstance(from_best, str):
            return from_best

    windows = snapshot.get("windows")
    primary = windows.get("primary") if isinstance(windows, dict) else None
    if isinstance(primary, dict):
        from_primary = primary.get(reason_key)
        if isinstance(from_primary, str):
            return from_primary

    if snapshot.get("ok") is not False:
        return None
    reason = snapshot.get("error_type") or snapshot.get("reason") or snapshot.get("message")
    return reason if isinstance(reason, str) else None
428
+
429
+
430
def usage_snapshot_quota_unavailable_reason(args: argparse.Namespace) -> str:
    """Return the unavailable-reason recorded in the given usage snapshots.

    The ``usage_after`` snapshot is consulted before ``usage_before``. When
    neither file supplies a reason, ``"quota_percent_unavailable"`` is used.
    """
    for snapshot_path in (args.usage_after, args.usage_before):
        if not snapshot_path:
            continue
        reason = snapshot_quota_unavailable_reason(load_usage_snapshot(snapshot_path))
        if reason:
            return reason
    return "quota_percent_unavailable"
437
+
438
+
439
def apply_usage_snapshot_defaults(args: argparse.Namespace, stats: dict[str, float | None]) -> None:
    """Fill missing before/after token and quota stats from the snapshot files.

    ``stats`` is updated in place. Values that are already set are never
    overwritten; a snapshot is only read when its path was supplied.
    """
    phases = (
        (args.usage_before, "tokens_before", "quota_percent_before"),
        (args.usage_after, "tokens_after", "quota_percent_after"),
    )
    for snapshot_path, tokens_field, quota_field in phases:
        if not snapshot_path:
            continue
        snapshot = load_usage_snapshot(snapshot_path)
        if stats[tokens_field] is None:
            stats[tokens_field] = best_usage_value(snapshot, "tokens_total")
        if stats[quota_field] is None:
            stats[quota_field] = best_usage_value(snapshot, "quota_percent")
458
+
459
+
460
def derive_usage_stats(args: argparse.Namespace, stats: dict[str, float | None]) -> None:
    """Derive ``tokens_used`` and ``quota_percent_used`` from before/after values.

    ``stats`` is updated in place. A value is derived only when it is not
    already set and both endpoints are known. Negative deltas are discarded.
    ``args.quota_percent_direction`` selects whether the percentages count
    what remains ("remaining") or what has been used.
    """
    start_tokens = stats.get("tokens_before")
    end_tokens = stats.get("tokens_after")
    tokens_known = start_tokens is not None and end_tokens is not None
    if stats.get("tokens_used") is None and tokens_known and end_tokens >= start_tokens:
        stats["tokens_used"] = round(end_tokens - start_tokens, 4)

    start_quota = stats.get("quota_percent_before")
    end_quota = stats.get("quota_percent_after")
    if stats.get("quota_percent_used") is not None or start_quota is None or end_quota is None:
        return
    if args.quota_percent_direction == "remaining":
        consumed = start_quota - end_quota
    else:
        consumed = end_quota - start_quota
    if consumed >= 0:
        stats["quota_percent_used"] = round(consumed, 4)
476
+
477
+
478
def init_ledger(path: Path) -> int:
    """Create the ledger file and its parent directories, print its path, return 0.

    An existing ledger is left in place.
    """
    ledger_dir = path.parent
    ledger_dir.mkdir(parents=True, exist_ok=True)
    path.touch(exist_ok=True)
    print(str(path))
    return 0
483
+
484
+
485
def append_result(args: argparse.Namespace) -> int:
    """Append one benchmark record to the JSONL ledger and echo it; return 0.

    The record combines the identifying arguments, the stat fields (filled
    from usage snapshots and derived where possible), the quota status and
    any provider metadata that was supplied. Fields left as ``None`` are
    omitted.
    """
    ledger_path = args.path
    ledger_path.parent.mkdir(parents=True, exist_ok=True)
    record = {
        "recorded_at": utc_now(),
        "run_id": args.run_id,
        "tool": args.tool,
        "task_id": args.task_id,
        "task_name": args.task_name,
        "status": args.status,
        "validation": args.validation,
        "notes": args.notes,
    }

    stats = {field: getattr(args, field) for field in STAT_FIELDS}
    apply_usage_snapshot_defaults(args, stats)
    derive_usage_stats(args, stats)

    has_snapshot = bool(args.usage_before or args.usage_after)
    has_quota_endpoint = (
        stats.get("quota_percent_before") is not None
        or stats.get("quota_percent_after") is not None
    )
    if has_snapshot or has_quota_endpoint:
        record["quota_percent_direction"] = args.quota_percent_direction

    # An explicit status wins; otherwise infer it from what could be measured.
    status = args.quota_percent_status
    reason = args.quota_percent_unavailable_reason
    if status is None:
        if stats.get("quota_percent_used") is not None:
            status = "measured"
        elif has_snapshot:
            status = "unavailable"
    if status == "unavailable" and reason is None:
        reason = usage_snapshot_quota_unavailable_reason(args)
    if status is not None:
        record["quota_percent_status"] = status
    if reason is not None:
        record["quota_percent_unavailable_reason"] = reason

    if args.usage_available is False and args.no_usage_reason is None:
        record["no_usage_reason"] = "usage_marked_unavailable"

    record.update(
        {field: stats[field] for field in STAT_FIELDS if stats[field] is not None}
    )
    for field in (*PROVIDER_META_FIELDS, *PROVIDER_BOOL_FIELDS):
        supplied = getattr(args, field)
        if supplied is not None:
            record[field] = supplied

    with ledger_path.open("a", encoding="utf-8") as handle:
        handle.write(json.dumps(record, sort_keys=True) + "\n")
    print(json.dumps(record, indent=2, sort_keys=True))
    return 0
539
+
540
+
541
def build_summary(records: list[dict[str, Any]]) -> dict[str, Any]:
    """Group ledger records by tool and summarize each group.

    Returns ``{"records": <count>, "tools": {<tool>: <summary>, ...}}`` with
    the tools in sorted order. Records whose ``tool`` is missing or is not a
    string are grouped under ``"unknown"``.
    """
    by_tool: dict[str, list[dict[str, Any]]] = {}
    for record in records:
        tool = record.get("tool", "unknown")
        # A null tool would become a None key that cannot be sorted against
        # string keys; a list or dict would not be hashable at all.
        if not isinstance(tool, str):
            tool = "unknown"
        by_tool.setdefault(tool, []).append(record)

    return {
        "records": len(records),
        "tools": {
            tool: summarize_tool(tool_records)
            for tool, tool_records in sorted(by_tool.items())
        },
    }
553
+
554
+
555
def summarize_tool(records: list[dict[str, Any]]) -> dict[str, Any]:
    """Aggregate the records of a single tool.

    Returns run count, pass rate, provider-error counters and, for every stat
    field with at least one usable value, ``avg_<field>`` and
    ``total_<field>``. An empty record list yields a pass rate of ``0.0``
    instead of dividing by zero.
    """
    run_count = len(records)
    passed = sum(1 for record in records if record.get("status") == "pass")
    result: dict[str, Any] = {
        "runs": run_count,
        "pass_rate": round(passed / run_count, 3) if run_count else 0.0,
        "provider_errors": sum(1 for record in records if record.get("provider_error") is True),
        # NOTE(review): this counts `safe_to_retry_later`, not
        # `retryable_provider_error`; confirm which flag is intended.
        "retryable_provider_errors": sum(1 for record in records if record.get("safe_to_retry_later") is True),
        "partial_successes": sum(1 for record in records if record.get("supervisor_state") == "partial_success"),
    }
    for field in STAT_FIELDS:
        # Booleans and NaN/inf are not measurements; keep the stored value
        # itself so integer totals stay integers.
        values = [
            record[field]
            for record in records
            if finite_number(record.get(field)) is not None
        ]
        if values:
            result[f"avg_{field}"] = round(statistics.fmean(values), 2)
            result[f"total_{field}"] = round(sum(values), 2)
    return result
574
+
575
+
576
def summarize(path: Path) -> int:
    """Print the JSON summary of the ledger at ``path`` and return 0.

    Prints ``No records.`` when the ledger is missing or empty.
    """
    records = load_records(path)
    if records:
        report = json.dumps(build_summary(records), indent=2, sort_keys=True)
    else:
        report = "No records."
    print(report)
    return 0
583
+
584
+
585
def show_log(path: Path, limit: int, as_json: bool) -> int:
    """Print the most recent ledger records and return 0.

    Args:
        path: JSONL ledger to read.
        limit: Maximum number of trailing records to show. Zero or a negative
            value shows every record.
        as_json: Print the selected records as one JSON array instead of one
            summary line per record.
    """
    records = load_records(path)
    # A negative limit used to slice records[n:], which silently dropped the
    # oldest entries instead of limiting the output.
    selected = records[-limit:] if limit > 0 else records
    if as_json:
        print(json.dumps(selected, indent=2, sort_keys=True))
        return 0
    for record in selected:
        tokens = record.get("tokens_used", record.get("tokens_total", "unknown"))
        quota = record.get("quota_percent_used", "unknown")
        print(
            f"{record.get('recorded_at', 'unknown')} "
            f"{record.get('tool', 'unknown')} "
            f"{record.get('task_id', 'unknown')} "
            f"status={record.get('status', 'unknown')} "
            f"tokens_used={tokens} quota_percent_used={quota}"
        )
    return 0
602
+
603
+
604
def build_parser() -> argparse.ArgumentParser:
    """Build the command-line parser with one sub-command per harness action.

    Each sub-command stores its handler in ``func``; the handler takes the
    parsed namespace and returns the process exit code.
    """
    root = argparse.ArgumentParser(description="Evaluate ZCode vs Claude Code GLM workers.")
    commands = root.add_subparsers(dest="command", required=True)

    doctor_cmd = commands.add_parser("doctor", help="Inspect whether ZCode.app is installed.")
    doctor_cmd.add_argument("--json", action="store_true", help="Print machine-readable app info.")
    doctor_cmd.set_defaults(func=lambda args: print_doctor(args.json))

    init_cmd = commands.add_parser("init-ledger", help="Create the JSONL evaluation ledger.")
    init_cmd.add_argument("--path", type=Path, default=DEFAULT_LEDGER)
    init_cmd.set_defaults(func=lambda args: init_ledger(args.path))

    append_cmd = commands.add_parser("append-result", help="Append one benchmark result.")
    append_cmd.add_argument("--path", type=Path, default=DEFAULT_LEDGER)
    append_cmd.add_argument("--run-id", required=True)
    append_cmd.add_argument("--tool", choices=SUPPORTED_TOOLS, required=True)
    append_cmd.add_argument("--task-id", required=True)
    append_cmd.add_argument("--task-name", required=True)
    append_cmd.add_argument("--status", choices=("pass", "fail", "partial", "blocked"), required=True)
    append_cmd.add_argument("--validation", default="")
    append_cmd.add_argument("--notes", default="")
    append_cmd.add_argument(
        "--supervisor-state",
        choices=("success", "partial_success", "retryable_provider_error", "unsafe_partial", "cli_error"),
    )
    # Free-text provider metadata; registration order is kept so --help
    # output stays the same.
    for text_flag in (
        "--provider-code",
        "--provider-message",
        "--provider-request-id",
        "--provider-error-line",
        "--provider-id",
        "--provider-kind",
    ):
        append_cmd.add_argument(text_flag)
    for counter_flag in ("--attempt-count", "--attempts", "--retry-count"):
        append_cmd.add_argument(counter_flag, type=non_negative_int)
    append_cmd.add_argument("--retry-delays-ms", type=parse_retry_delays_ms)
    append_cmd.add_argument("--no-usage-reason")
    append_cmd.add_argument("--quota-percent-status", choices=("measured", "estimated", "unavailable"))
    append_cmd.add_argument("--quota-percent-unavailable-reason")
    for text_flag in ("--source-run-dir", "--source-result-path", "--preview", "--task-kind"):
        append_cmd.add_argument(text_flag)
    # Tri-state booleans: --flag, --no-flag, or None when not given.
    for bool_field in PROVIDER_BOOL_FIELDS:
        append_cmd.add_argument(
            f"--{bool_field.replace('_', '-')}",
            action=argparse.BooleanOptionalAction,
            default=None,
        )
    append_cmd.add_argument("--usage-before", type=Path)
    append_cmd.add_argument("--usage-after", type=Path)
    append_cmd.add_argument(
        "--quota-percent-direction",
        choices=("remaining", "used"),
        default="remaining",
        help="How to derive quota-percent-used from before/after snapshots.",
    )
    for stat_field in STAT_FIELDS:
        append_cmd.add_argument(f"--{stat_field.replace('_', '-')}", type=stat_type(stat_field))
    append_cmd.set_defaults(func=append_result)

    import_cmd = commands.add_parser(
        "import-duel-results",
        help="Append rows from an external supervisor duel results.json.",
    )
    import_cmd.add_argument("--source", type=Path, required=True)
    import_cmd.add_argument("--path", type=Path, default=DEFAULT_LEDGER)
    import_cmd.add_argument("--tool", choices=("zcode", "claude-code-glm52", "all"), default="zcode")
    import_cmd.add_argument("--allow-duplicates", action="store_true")
    import_cmd.set_defaults(func=import_duel_results)

    summarize_cmd = commands.add_parser("summarize", help="Summarize recorded benchmark results.")
    summarize_cmd.add_argument("--path", type=Path, default=DEFAULT_LEDGER)
    summarize_cmd.set_defaults(func=lambda args: summarize(args.path))

    show_cmd = commands.add_parser("show-log", help="Show recent JSONL result records.")
    show_cmd.add_argument("--path", type=Path, default=DEFAULT_LEDGER)
    show_cmd.add_argument("--limit", type=int, default=10)
    show_cmd.add_argument("--json", action="store_true")
    show_cmd.set_defaults(func=lambda args: show_log(args.path, args.limit, args.json))

    return root
678
+
679
+
680
def main(argv: list[str] | None = None) -> int:
    """Parse ``argv`` (``sys.argv[1:]`` when ``None``) and run the chosen sub-command.

    Returns the exit code produced by the sub-command handler.
    """
    namespace = build_parser().parse_args(argv)
    handler = namespace.func
    return handler(namespace)
684
+
685
+
686
# Script entry point: exit with the status returned by main().
if __name__ == "__main__":
    raise SystemExit(main())