codeer-cli 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,919 @@
1
+ from __future__ import annotations
2
+
3
+ import csv
4
+ import json
5
+ import mimetypes
6
+ import os
7
+ import time
8
+ from collections import defaultdict
9
+ from dataclasses import asdict
10
+ from pathlib import Path
11
+ from typing import Any
12
+
13
+ from .. import agents as agents_mod
14
+ from .. import eval_ as eval_mod
15
+ from ..client import CodeerClient
16
+ from ..parse import parse_eval_result, parse_eval_tool_calls, summarize_eval_tool_calls
17
+ from ._util import log, truncate
18
+
19
# Seconds slept between successive result polls in `eval run`.
POLL_INTERVAL = 5
# Default polling budget in seconds; used as the default of `--poll-timeout`.
POLL_TIMEOUT = 900
21
+
22
+
23
+ def _ids(csv_text: str | None) -> list[str] | None:
24
+ if not csv_text:
25
+ return None
26
+ return [x.strip() for x in csv_text.split(",") if x.strip()]
27
+
28
+
29
def register(subparsers):
    """Register the ``eval`` command group and all of its actions.

    Adds an ``eval`` parser to *subparsers* with one required sub-action
    (stored in ``args.action``). Every action parser records its handler
    function under the ``func`` default.
    """
    eval_cmd = subparsers.add_parser("eval", help="Eval suite operations")
    actions = eval_cmd.add_subparsers(dest="action", required=True)

    # codeer eval list
    list_cmd = actions.add_parser("list", help="List eval cases for an agent")
    list_cmd.add_argument("--agent", required=True)
    list_cmd.set_defaults(func=run_list)

    # codeer eval evaluators
    evaluators_cmd = actions.add_parser("evaluators", help="List evaluators in workspace")
    evaluators_cmd.set_defaults(func=run_evaluators)

    # codeer eval evaluator-create
    create_cmd = actions.add_parser(
        "evaluator-create", help="Create an evaluator in the workspace")
    create_cmd.add_argument("--name", required=True)
    # Exactly one prompt source (inline text or file) must be supplied.
    create_prompt = create_cmd.add_mutually_exclusive_group(required=True)
    create_prompt.add_argument(
        "--system-prompt-template", help="Evaluator system prompt template text")
    create_prompt.add_argument(
        "--system-prompt-template-file", help="Path to evaluator system prompt template")
    create_cmd.add_argument("--description", default=None)
    create_cmd.set_defaults(func=run_evaluator_create)

    # codeer eval evaluator-update
    update_cmd = actions.add_parser(
        "evaluator-update", help="Update an evaluator in the workspace")
    update_cmd.add_argument("--evaluator", required=True, help="Evaluator UUID")
    update_cmd.add_argument("--name", default=None)
    # At most one prompt source; both are optional for an update.
    update_prompt = update_cmd.add_mutually_exclusive_group()
    update_prompt.add_argument(
        "--system-prompt-template", help="Evaluator system prompt template text")
    update_prompt.add_argument(
        "--system-prompt-template-file", help="Path to evaluator system prompt template")
    update_cmd.add_argument("--description", default=None)
    update_cmd.set_defaults(func=run_evaluator_update)

    # codeer eval run
    run_cmd = actions.add_parser(
        "run", help="Trigger eval run, poll for results, print scores")
    run_cmd.add_argument("--agent", required=True)
    history_choice = run_cmd.add_mutually_exclusive_group()
    history_choice.add_argument(
        "--history", default=None, help="History UUID to pin the run to")
    history_choice.add_argument(
        "--latest", action="store_true",
        help="Auto-select the newest AgentHistory (default)")
    run_cmd.add_argument(
        "--cases", default=None, help="Comma-separated case UUIDs (default: all)")
    run_cmd.add_argument(
        "--evaluators", required=True, help="Comma-separated evaluator UUIDs")
    run_cmd.add_argument("--poll-timeout", type=int, default=POLL_TIMEOUT)
    run_cmd.add_argument("--out", default=None)
    run_cmd.set_defaults(func=run_run)

    # codeer eval export
    export_cmd = actions.add_parser(
        "export", help="Export eval table (CSV + JSON + summary MD)")
    export_cmd.add_argument("--agent", default=None)
    version_choice = export_cmd.add_mutually_exclusive_group()
    version_choice.add_argument(
        "--version", type=int, help="AgentHistory version_number")
    version_choice.add_argument(
        "--published", action="store_true", help="Use the published history")
    export_cmd.add_argument("--cases", default=None, help="Comma-separated case UUIDs")
    export_cmd.add_argument(
        "--evaluators", default=None, help="Comma-separated evaluator UUIDs")
    export_cmd.add_argument("--out-dir", default=".codeer/eval_table")
    export_cmd.set_defaults(func=run_export)

    # codeer eval reconcile
    reconcile_cmd = actions.add_parser(
        "reconcile", help="Audit local manifest vs server eval suite (read-only)")
    reconcile_cmd.add_argument("--manifest", default=".codeer/eval_cases.json")
    reconcile_cmd.add_argument("--agent", default=None)
    reconcile_cmd.add_argument(
        "--evaluators", default=None, help="Comma-separated evaluator UUIDs")
    reconcile_cmd.add_argument("--out", default=None)
    reconcile_cmd.set_defaults(func=run_reconcile)

    # codeer eval cases-apply
    cases_apply_cmd = actions.add_parser(
        "cases-apply", help="Create/update eval cases from JSON manifest")
    cases_apply_cmd.add_argument("--cases", required=True, help="Path to eval_cases.json")
    cases_apply_cmd.add_argument("--agent", required=True)
    cases_apply_cmd.add_argument("--attachments-dir", default=None, dest="attachments_dir")
    cases_apply_cmd.add_argument("--allow-duplicates", action="store_true")
    cases_apply_cmd.add_argument("--out", default=None)
    cases_apply_cmd.set_defaults(func=run_cases_apply)

    # codeer eval rubrics
    rubrics_cmd = actions.add_parser(
        "rubrics", help="Read per-(case, evaluator) rubrics")
    rubrics_cmd.add_argument("--agent", required=True)
    rubrics_cmd.add_argument(
        "--evaluators", default=None, help="Comma-separated evaluator UUIDs")
    rubrics_cmd.add_argument("--cases", default=None, help="Comma-separated case UUIDs")
    rubrics_cmd.add_argument("--out", default=None)
    rubrics_cmd.set_defaults(func=run_rubrics)

    # codeer eval rubrics-apply
    rubrics_apply_cmd = actions.add_parser(
        "rubrics-apply", help="Apply rubric changes from JSON file")
    rubrics_apply_cmd.add_argument("--rubrics", required=True, help="Path to rubrics JSON")
    rubrics_apply_cmd.add_argument("--dry-run", action="store_true")
    rubrics_apply_cmd.add_argument(
        "--force", action="store_true", help="Write all rubrics even if unchanged")
    rubrics_apply_cmd.add_argument("--out", default=None)
    rubrics_apply_cmd.set_defaults(func=run_rubrics_apply)
117
+
118
+
119
+ # ---------------------------------------------------------------------------
120
+ # eval list
121
+ # ---------------------------------------------------------------------------
122
+
123
def run_list(args, client) -> int:
    """Print the eval cases of ``args.agent`` as indented JSON and return 0."""
    agent_cases = eval_mod.list_cases(client, args.agent)
    rendered = json.dumps(agent_cases, ensure_ascii=False, indent=2, default=str)
    print(rendered)
    return 0
127
+
128
+
129
+ # ---------------------------------------------------------------------------
130
+ # eval evaluators
131
+ # ---------------------------------------------------------------------------
132
+
133
def run_evaluators(args, client) -> int:
    """Print the evaluators of the resolved workspace as indented JSON.

    The workspace is the first element returned by ``client.resolve_scope()``.
    Always returns 0.
    """
    workspace_id, _unused = client.resolve_scope()
    listing = eval_mod.list_evaluators(client, workspace_id)
    rendered = json.dumps(listing, ensure_ascii=False, indent=2, default=str)
    print(rendered)
    return 0
138
+
139
+
140
+ # ---------------------------------------------------------------------------
141
+ # eval evaluator-create
142
+ # ---------------------------------------------------------------------------
143
+
144
def run_evaluator_create(args, client) -> int:
    """Create an evaluator in the resolved workspace and print it as JSON.

    The system prompt template comes from ``--system-prompt-template-file``
    when given, otherwise from the inline ``--system-prompt-template`` value.
    Returns 0 after printing the created evaluator.
    """
    workspace_id, _ = client.resolve_scope()
    if args.system_prompt_template_file:
        # Decode explicitly as UTF-8: the platform default encoding would
        # corrupt or reject non-ASCII prompt text on non-UTF-8 locales.
        system_prompt_template = Path(args.system_prompt_template_file).read_text(
            encoding="utf-8")
    else:
        system_prompt_template = args.system_prompt_template
    evaluator = eval_mod.create_evaluator(
        client,
        workspace_id=workspace_id,
        name=args.name,
        system_prompt_template=system_prompt_template,
        description=args.description,
    )
    print(json.dumps(evaluator, ensure_ascii=False, indent=2, default=str))
    return 0
159
+
160
+
161
+ # ---------------------------------------------------------------------------
162
+ # eval evaluator-update
163
+ # ---------------------------------------------------------------------------
164
+
165
def run_evaluator_update(args, client) -> int:
    """Update an existing evaluator and print the result as JSON.

    At least one of name, description, or a system prompt template (inline
    or from file) must be supplied; otherwise an error is logged and 2 is
    returned. Returns 0 after printing the updated evaluator.
    """
    if args.system_prompt_template_file:
        # Decode explicitly as UTF-8: the platform default encoding would
        # corrupt or reject non-ASCII prompt text on non-UTF-8 locales.
        system_prompt_template = Path(args.system_prompt_template_file).read_text(
            encoding="utf-8")
    else:
        system_prompt_template = args.system_prompt_template

    if args.name is None and args.description is None and system_prompt_template is None:
        log(
            "error: provide at least one of --name, --description, "
            "--system-prompt-template, --system-prompt-template-file"
        )
        return 2

    evaluator = eval_mod.update_evaluator(
        client,
        evaluator_id=args.evaluator,
        name=args.name,
        system_prompt_template=system_prompt_template,
        description=args.description,
    )
    print(json.dumps(evaluator, ensure_ascii=False, indent=2, default=str))
    return 0
187
+
188
+
189
+ # ---------------------------------------------------------------------------
190
+ # eval run
191
+ # ---------------------------------------------------------------------------
192
+
193
def run_run(args, client) -> int:
    """Trigger an eval run, poll for scores, and report the results.

    Steps:
      1. Resolve the history to run against (``--history`` or the latest).
      2. Resolve the cases (``--cases`` or every case of the agent) and the
         evaluators (``--evaluators``).
      3. Trigger the run and poll until every (case, evaluator) pair has a
         score or ``--poll-timeout`` seconds have elapsed.
      4. Log a score table, print the full result as JSON, and optionally
         write it to ``--out``.

    Returns 0 when every result scored >= 1.0, 1 otherwise (including on
    timeout), and 2 on usage errors (no history, no cases, no evaluators).
    """
    workspace_id, _ = client.resolve_scope()
    if args.latest or not args.history:
        history_id = agents_mod.get_latest_history_id(client, args.agent)
        if not history_id:
            log("error: agent has no history versions; pass --history instead")
            return 2
        log(f"--latest -> {history_id}")
        args.history = history_id

    if args.cases:
        case_ids = _ids(args.cases) or []
        case_objs: list[dict] = []
        for cid in case_ids:
            # Case details are only used for display labels, so a failed
            # fetch is deliberately downgraded to a warning.
            try:
                case_objs.append(eval_mod.get_case(client, cid))
            except Exception as e:
                log(f"warning: could not fetch case {cid}: {e}")
                case_objs.append({"id": cid, "input": ""})
    else:
        case_objs = eval_mod.list_cases(client, args.agent)
        case_ids = [c["id"] for c in case_objs]
    if not case_ids:
        log("error: no cases to run")
        return 2

    evaluator_ids = _ids(args.evaluators) or []
    if not evaluator_ids:
        log("error: --evaluators is required")
        return 2
    evaluators = [eval_mod.get_evaluator(client, eid) for eid in evaluator_ids]

    case_label_by_id = {c["id"]: truncate(c.get("input") or "", 60) for c in case_objs}
    # Fall back to the id when the name is missing OR null, so the table
    # formatting and sorting below never see None.
    evaluator_name_by_id = {e["id"]: e.get("name") or e["id"] for e in evaluators}

    log(f"triggering: {len(case_ids)} cases x {len(evaluator_ids)} evaluators on history {args.history}")
    eval_mod.trigger(client, case_ids=case_ids, evaluator_ids=evaluator_ids,
                     agent_history_id=args.history)

    total = len(case_ids) * len(evaluator_ids)
    # Monotonic clock: the deadline must not move if the wall clock is adjusted.
    deadline = time.monotonic() + args.poll_timeout
    results_by_eval: dict[str, list[dict]] = {}
    while True:
        # Always poll at least once, and once more after the final sleep, so
        # the reported results are never older than the deadline.
        results_by_eval = {}
        done = 0
        for ev_id in evaluator_ids:
            rows = eval_mod.get_results(
                client, case_ids=case_ids, evaluator_id=ev_id,
                agent_history_id=args.history, workspace_id=workspace_id,
                include_output=True, include_reasoning_steps=True,
            )
            results_by_eval[ev_id] = rows
            done += sum(1 for r in rows if r.get("score") is not None)
        log(f" progress: {done}/{total}")
        if done >= total:
            break
        remaining = deadline - time.monotonic()
        if remaining <= 0:
            log(f"warning: timed out after {args.poll_timeout}s; "
                f"only {done}/{total} results scored")
            break
        time.sleep(min(POLL_INTERVAL, remaining))

    flat: list[dict] = []
    for ev_id, rows in results_by_eval.items():
        for r in rows:
            result_summary = parse_eval_result(r)
            tool_calls = parse_eval_tool_calls(r)
            total_tool_duration_ms = sum(
                tc.duration_ms for tc in tool_calls if tc.duration_ms is not None
            )
            case_key = r.get("case_id") or r.get("evaluation_case_id")
            flat.append({
                "case_id": case_key,
                "case_label": case_label_by_id.get(case_key, "?"),
                "evaluator_id": ev_id,
                "evaluator_name": evaluator_name_by_id.get(ev_id, ev_id),
                "score": r.get("score"),
                "status": result_summary.status,
                "reason": r.get("reason"),
                "output": r.get("output") or r.get("actual_output"),
                "execution_time_s": result_summary.execution_time_s,
                "cost_credits": result_summary.cost_credits,
                "tool_call_count": len(tool_calls),
                "tool_total_duration_ms": total_tool_duration_ms or None,
                "tool_calls_summary": summarize_eval_tool_calls(tool_calls),
                "tool_calls": [asdict(tc) for tc in tool_calls],
                "raw_result": r,
            })

    # A missing score counts as 0.0, and an empty result set is never "perfect".
    all_perfect = all((r.get("score") or 0.0) >= 1.0 for r in flat) if flat else False
    log("\n" + "=" * 80)
    log(f"RESULTS agent={args.agent} history={args.history}")
    log("=" * 80)
    log(f"{'score':>6} {'evaluator':<35} case")
    for r in sorted(flat, key=lambda x: (x.get("score") or 0.0, x["evaluator_name"])):
        score = r.get("score")
        score_str = f"{score:.2f}" if score is not None else " - "
        log(f"{score_str:>6} {r['evaluator_name'][:35]:<35} {r['case_label']}")

    non_perfect = [r for r in flat if (r.get("score") or 0.0) < 1.0]
    if non_perfect:
        log("\nNON-PERFECT RESULTS:\n")
        for r in non_perfect:
            log(f" [{r.get('score') or 0:.2f}] {r['evaluator_name']} — {r['case_label']}")
            if r.get("tool_calls_summary"):
                log(f" tools: {r['tool_calls_summary'][:600]}")
            log(f" reason: {(r.get('reason') or '').strip()[:600]}")
            log("")

    out = {
        "agent_id": args.agent,
        "history_id": args.history,
        "all_perfect": all_perfect,
        "results": flat,
    }
    out_text = json.dumps(out, indent=2, ensure_ascii=False)
    print(out_text)
    if args.out:
        # ensure_ascii=False emits raw non-ASCII text, so pin the file to UTF-8.
        Path(args.out).write_text(out_text + "\n", encoding="utf-8")
    return 0 if all_perfect else 1
308
+
309
+
310
+ # ---------------------------------------------------------------------------
311
+ # eval export
312
+ # ---------------------------------------------------------------------------
313
+
314
+ def _pick_history(versions: list[dict], args) -> dict:
315
+ if args.version is not None:
316
+ for v in versions:
317
+ if v.get("version_number") == args.version:
318
+ return v
319
+ raise SystemExit(f"no AgentHistory with version_number={args.version}")
320
+ if args.published:
321
+ current = [v for v in versions if v.get("status") == "published"]
322
+ if current:
323
+ return sorted(current, key=lambda v: v.get("version_number") or 0, reverse=True)[0]
324
+ previous = [v for v in versions if v.get("was_published")]
325
+ if previous:
326
+ return sorted(previous, key=lambda v: v.get("version_number") or 0, reverse=True)[0]
327
+ raise SystemExit("no published AgentHistory found")
328
+ return sorted(versions, key=lambda v: v.get("version_number") or 0, reverse=True)[0]
329
+
330
+
331
def run_export(args, client) -> int:
    """Export the eval table for one agent history to ``--out-dir``.

    Writes three files into the output directory (created if needed):
      * ``eval_table_full.json``: history, evaluators, cases, rubrics,
        raw results, and the flattened table.
      * ``eval_table.csv``: one row per (evaluator, case).
      * ``eval_table_summary.md``: a short Markdown table.

    A JSON summary of what was written is printed to stdout.

    Returns 0 on success, 2 when the agent id is missing or no cases or
    evaluators match. History selection errors surface as ``SystemExit``
    from ``_pick_history``.
    """
    def _md_cell(value) -> str:
        # A raw '|' or line break inside a cell would split the Markdown row.
        text = str(value).replace("\r", " ").replace("\n", " ")
        return text.replace("|", "\\|")

    agent_id = args.agent or client.agent_id or os.environ.get("CODEER_AGENT_ID")
    workspace_id, _ = client.resolve_scope()
    if not agent_id:
        log("error: --agent is required or set CODEER_AGENT_ID")
        return 2

    cases = eval_mod.list_cases(client, agent_id)
    wanted_cases = set(_ids(args.cases) or [])
    if wanted_cases:
        cases = [c for c in cases if c["id"] in wanted_cases]
    case_ids = [c["id"] for c in cases]
    if not case_ids:
        log("error: no eval cases matched")
        return 2

    if args.evaluators:
        evaluators = [eval_mod.get_evaluator(client, eid) for eid in (_ids(args.evaluators) or [])]
    else:
        evaluators = eval_mod.list_evaluators(client, workspace_id)
    if not evaluators:
        log("error: no evaluators matched")
        return 2

    versions = agents_mod.list_versions(client, agent_id)
    history = _pick_history(versions, args)

    rows: list[dict[str, Any]] = []
    all_rubrics: list[dict] = []
    all_results: list[dict] = []
    for evaluator in evaluators:
        evaluator_id = evaluator["id"]
        rubrics = eval_mod.get_rubrics_batch(
            client, case_ids=case_ids, evaluator_id=evaluator_id)
        results = eval_mod.get_results(
            client, case_ids=case_ids, evaluator_id=evaluator_id,
            agent_history_id=history["id"], workspace_id=workspace_id,
            include_output=True, include_reasoning_steps=True,
        )
        all_rubrics.extend(rubrics)
        # Tag each raw result with its evaluator so the flat list stays attributable.
        all_results.extend({**r, "evaluator_id": evaluator_id} for r in results)
        rubric_by_case = {
            (row.get("case_id") or row.get("evaluation_case_id")): row.get("rubric", "")
            for row in rubrics
        }
        result_by_case = {
            (row.get("case_id") or row.get("evaluation_case_id")): row
            for row in results
        }
        for order, case in enumerate(cases, 1):
            case_id = case["id"]
            # Cases with no result yet still get a row, with empty result fields.
            result = result_by_case.get(case_id, {})
            tool_calls = parse_eval_tool_calls(result)
            total_tool_duration_ms = sum(
                tc.duration_ms for tc in tool_calls if tc.duration_ms is not None
            )
            rows.append({
                "order": order,
                "case_id": case_id,
                "input": case.get("input") or "",
                "note": case.get("note") or "",
                "evaluator_id": evaluator_id,
                "evaluator_name": evaluator.get("name") or evaluator_id,
                "score": result.get("score"),
                "reason": result.get("reason") or "",
                "output": result.get("output") or result.get("actual_output") or "",
                "rubric": rubric_by_case.get(case_id, ""),
                "tool_call_count": len(tool_calls),
                "tool_total_duration_ms": total_tool_duration_ms or "",
                "tool_calls_summary": summarize_eval_tool_calls(tool_calls),
                "tool_calls_json": json.dumps(
                    [asdict(tc) for tc in tool_calls], ensure_ascii=False),
            })

    out_dir = Path(args.out_dir)
    out_dir.mkdir(parents=True, exist_ok=True)
    full = {
        "agent_id": agent_id,
        "workspace_id": workspace_id,
        "history": {
            "id": history["id"],
            "version_number": history.get("version_number"),
            "status": history.get("status"),
            "was_published": history.get("was_published"),
            "version_note": history.get("version_note"),
            "created_at": history.get("created_at"),
        },
        "evaluators": [
            {"id": e["id"], "name": e.get("name"), "description": e.get("description")}
            for e in evaluators
        ],
        "cases": cases,
        "rubrics": all_rubrics,
        "results": all_results,
        "table": rows,
    }
    # Every file is pinned to UTF-8: the content is not ASCII-escaped, so the
    # platform default encoding could fail or garble it on non-UTF-8 locales.
    (out_dir / "eval_table_full.json").write_text(
        json.dumps(full, ensure_ascii=False, indent=2) + "\n", encoding="utf-8")

    fields = [
        "order", "case_id", "input", "note", "evaluator_name", "score",
        "reason", "output", "rubric", "tool_call_count",
        "tool_calls_summary", "tool_total_duration_ms", "tool_calls_json",
        "evaluator_id",
    ]
    # newline="" lets the csv module control line endings itself.
    with (out_dir / "eval_table.csv").open("w", newline="", encoding="utf-8") as fh:
        writer = csv.DictWriter(fh, fieldnames=fields)
        writer.writeheader()
        writer.writerows(rows)

    with (out_dir / "eval_table_summary.md").open("w", encoding="utf-8") as fh:
        fh.write("# Codeer Eval Table Export\n\n")
        fh.write(f"Agent: `{agent_id}`\n\n")
        fh.write(f"History: v{history.get('version_number')} `{history['id']}`\n\n")
        fh.write("| # | Evaluator | Score | Case ID | Input |\n")
        fh.write("|---:|---|---:|---|---|\n")
        for row in rows:
            inp = _md_cell(truncate(row["input"], 80))
            fh.write(
                f"| {row['order']} | {_md_cell(row['evaluator_name'])} | {row['score']} | "
                f"`{row['case_id']}` | {inp} |\n")

    print(json.dumps({
        "out_dir": str(out_dir),
        "cases": len(cases),
        "evaluators": len(evaluators),
        "rows": len(rows),
        "history_id": history["id"],
        "version_number": history.get("version_number"),
    }, ensure_ascii=False, indent=2))
    return 0
460
+
461
+
462
+ # ---------------------------------------------------------------------------
463
+ # eval reconcile
464
+ # ---------------------------------------------------------------------------
465
+
466
+ def _normalize_manifest(payload: dict[str, Any]) -> list[dict[str, Any]]:
467
+ cases = payload.get("cases") or []
468
+ shared_style_rubric = payload.get("shared_style_rubric")
469
+ shared_style_evaluators = payload.get("shared_style_evaluators") or []
470
+ out: list[dict[str, Any]] = []
471
+ for idx, case in enumerate(cases):
472
+ rubrics = dict(case.get("rubrics") or case.get("rubrics_by_evaluator") or {})
473
+ if shared_style_rubric:
474
+ for ev_id in shared_style_evaluators:
475
+ rubrics.setdefault(ev_id, shared_style_rubric)
476
+ out.append({
477
+ "index": idx,
478
+ "label": case.get("label") or f"case[{idx}]",
479
+ "input": case.get("input") or "",
480
+ "expected_output": case.get("expected_output"),
481
+ "rubrics_by_evaluator": rubrics,
482
+ })
483
+ return out
484
+
485
+
486
+ def _by_input(rows: list[dict], *, input_key: str = "input") -> dict[str, list[dict]]:
487
+ grouped: dict[str, list[dict]] = defaultdict(list)
488
+ for row in rows:
489
+ grouped[row.get(input_key) or ""].append(row)
490
+ return dict(grouped)
491
+
492
+
493
def _duplicate_inputs(grouped: dict[str, list[dict]], *, kind: str) -> list[dict]:
    """Report every input text shared by more than one row.

    *grouped* maps an input text to the rows that carry it. For each group
    with two or more rows, one entry is emitted containing a truncated
    preview of the input, the row count, the ``case_id``/``label``/``index``
    of each row, and the caller-supplied *kind* tag.
    """
    return [
        {
            "input_preview": truncate(text, 120),
            "count": len(members),
            "items": [
                {
                    # Server rows expose "id"; fall back to "case_id" otherwise.
                    "case_id": member.get("id") or member.get("case_id"),
                    "label": member.get("label"),
                    "index": member.get("index"),
                }
                for member in members
            ],
            "kind": kind,
        }
        for text, members in grouped.items()
        if len(members) > 1
    ]
509
+
510
+
511
def run_reconcile(args, client) -> int:
    """Audit a local case manifest against the server-side eval suite.

    Local and server cases are matched by exact input text. The report
    covers duplicate inputs on either side, cases present on only one side,
    evaluator ids that do not exist in the workspace, rubrics missing on the
    server, and rubrics whose text differs. This function only reads from
    the server; nothing is modified.

    The report is printed as JSON and optionally written to ``--out``.

    Returns 0 when no issues were found, 1 when at least one issue was
    found, and 2 when the manifest file or the agent id is missing.
    """
    manifest_path = Path(args.manifest)
    if not manifest_path.exists():
        log(f"error: manifest not found: {manifest_path}")
        return 2

    # Decode explicitly as UTF-8 rather than the platform default encoding.
    payload = json.loads(manifest_path.read_text(encoding="utf-8"))
    local_cases = _normalize_manifest(payload)
    local_by_input = _by_input(local_cases)

    agent_id = args.agent or client.agent_id or os.environ.get("CODEER_AGENT_ID")
    workspace_id, _ = client.resolve_scope()
    if not agent_id:
        log("error: --agent or CODEER_AGENT_ID is required")
        return 2

    server_cases = eval_mod.list_cases(client, agent_id)
    server_by_input = _by_input(server_cases)

    all_evaluators = eval_mod.list_evaluators(client, workspace_id)
    evaluator_by_id = {e["id"]: e for e in all_evaluators}
    if args.evaluators:
        evaluator_ids = _ids(args.evaluators) or []
    else:
        evaluator_ids = [e["id"] for e in all_evaluators]

    manifest_evaluator_ids = {
        ev_id for case in local_cases
        for ev_id in (case.get("rubrics_by_evaluator") or {})
    }
    invalid_manifest_evaluator_ids = sorted(manifest_evaluator_ids - set(evaluator_by_id))
    invalid_requested_evaluator_ids = sorted(set(evaluator_ids) - set(evaluator_by_id))
    valid_evaluator_ids = [eid for eid in evaluator_ids if eid in evaluator_by_id]
    # Set copy for the membership tests inside the comparison loop.
    valid_evaluator_id_set = set(valid_evaluator_ids)

    server_case_ids = [c["id"] for c in server_cases]
    rubrics_by_case = eval_mod.get_case_rubrics(
        client, agent_id=agent_id, workspace_id=workspace_id,
        evaluator_ids=valid_evaluator_ids, case_ids=server_case_ids,
    ) if server_case_ids and valid_evaluator_ids else {}

    local_missing_on_server = []
    server_missing_in_manifest = []
    rubric_drift = []
    missing_server_rubrics = []

    for local in local_cases:
        matches = server_by_input.get(local["input"], [])
        if not matches:
            local_missing_on_server.append({
                "label": local["label"], "index": local["index"],
                "input_preview": truncate(local["input"], 120),
            })
            continue
        # With duplicate server inputs only the first match is compared;
        # the duplicates themselves are reported separately below.
        server = matches[0]
        server_rubrics = rubrics_by_case.get(server["id"]) or {}
        for ev_id, local_rubric in (local.get("rubrics_by_evaluator") or {}).items():
            if ev_id not in evaluator_by_id:
                continue
            if valid_evaluator_id_set and ev_id not in valid_evaluator_id_set:
                continue
            server_rubric = server_rubrics.get(ev_id, "")
            if not server_rubric:
                missing_server_rubrics.append({
                    "case_id": server["id"], "label": local["label"],
                    "evaluator_id": ev_id,
                    "evaluator_name": evaluator_by_id[ev_id].get("name"),
                    "input_preview": truncate(local["input"], 120),
                })
            elif server_rubric != local_rubric:
                rubric_drift.append({
                    "case_id": server["id"], "label": local["label"],
                    "evaluator_id": ev_id,
                    "evaluator_name": evaluator_by_id[ev_id].get("name"),
                    "input_preview": truncate(local["input"], 120),
                    "local_rubric_preview": truncate(local_rubric, 120),
                    "server_rubric_preview": truncate(server_rubric, 120),
                })

    for server in server_cases:
        # Normalize exactly like _by_input does (missing/None -> ""), so a
        # server case without input is matched against the same key its
        # local counterpart was grouped under.
        server_input = server.get("input") or ""
        if server_input not in local_by_input:
            server_missing_in_manifest.append({
                "case_id": server["id"],
                "input_preview": truncate(server_input, 120),
            })

    duplicate_local_inputs = _duplicate_inputs(local_by_input, kind="local")
    duplicate_server_inputs = _duplicate_inputs(server_by_input, kind="server")

    issue_counts = {
        "duplicate_local_inputs": len(duplicate_local_inputs),
        "duplicate_server_inputs": len(duplicate_server_inputs),
        "local_missing_on_server": len(local_missing_on_server),
        "server_missing_in_manifest": len(server_missing_in_manifest),
        "invalid_manifest_evaluator_ids": len(invalid_manifest_evaluator_ids),
        "invalid_requested_evaluator_ids": len(invalid_requested_evaluator_ids),
        "missing_server_rubrics": len(missing_server_rubrics),
        "rubric_drift": len(rubric_drift),
    }
    total_issues = sum(issue_counts.values())

    report = {
        "agent_id": agent_id, "workspace_id": workspace_id,
        "manifest": str(manifest_path),
        "local_case_count": len(local_cases),
        "server_case_count": len(server_cases),
        "compared_evaluators": [
            {"id": eid, "name": evaluator_by_id[eid].get("name")}
            for eid in valid_evaluator_ids
        ],
        "issue_counts": issue_counts,
        "issues": {
            "duplicate_local_inputs": duplicate_local_inputs,
            "duplicate_server_inputs": duplicate_server_inputs,
            "local_missing_on_server": local_missing_on_server,
            "server_missing_in_manifest": server_missing_in_manifest,
            "invalid_manifest_evaluator_ids": invalid_manifest_evaluator_ids,
            "invalid_requested_evaluator_ids": invalid_requested_evaluator_ids,
            "missing_server_rubrics": missing_server_rubrics,
            "rubric_drift": rubric_drift,
        },
    }

    log(f"reconcile: {len(local_cases)} local cases, {len(server_cases)} server cases, "
        f"{len(valid_evaluator_ids)} evaluators, {total_issues} issues")
    for key, count in issue_counts.items():
        if count:
            log(f" {key}: {count}")

    out_text = json.dumps(report, indent=2, ensure_ascii=False)
    print(out_text)
    if args.out:
        # ensure_ascii=False emits raw non-ASCII text, so pin the file to UTF-8.
        Path(args.out).write_text(out_text + "\n", encoding="utf-8")
    return 1 if total_issues else 0
644
+
645
+
646
+ # ---------------------------------------------------------------------------
647
+ # eval cases-apply
648
+ # ---------------------------------------------------------------------------
649
+
650
def _upload_attachment(client: CodeerClient, *, file_path: Path, workspace_id: str) -> str:
    """Upload *file_path* via ``POST /external/files`` and return its uuid.

    The content type is guessed from the file name and falls back to
    ``application/octet-stream``. ``workspace_id`` is accepted but not used
    in this function.

    Raises:
        RuntimeError: when the response is not a dict or has no ``uuid``.
    """
    guessed_type = mimetypes.guess_type(file_path.name)[0]
    content_type = guessed_type if guessed_type else "application/octet-stream"
    form_fields = {"scope": "persistent", "purpose": "evaluation_context"}
    multipart = {"file": (file_path.name, file_path.read_bytes(), content_type)}
    uploaded = client.post("/external/files", files=multipart, data=form_fields)
    file_uuid = None
    if isinstance(uploaded, dict):
        file_uuid = uploaded.get("uuid")
    if file_uuid:
        return file_uuid
    raise RuntimeError(f"upload-file response missing uuid for {file_path.name}: {uploaded}")
660
+
661
+
662
def run_cases_apply(args, client) -> int:
    """Create or update eval cases from a JSON manifest.

    The manifest is validated completely (rubrics present, ``input`` present,
    attachment files exist) before anything is sent to the server, so an
    invalid later case cannot leave the suite partially applied.

    Unless ``--allow-duplicates`` is given, a manifest case whose input text
    equals an existing server case reuses that case: its fields and rubrics
    are updated instead of creating a new case.

    Per-case rubrics are read from ``rubrics`` or, failing that, from
    ``rubrics_by_evaluator``. A payload-level ``shared_style_rubric`` is
    filled in for every evaluator in ``shared_style_evaluators`` that the
    case does not already cover.

    Prints (and optionally writes to ``--out``) the resulting case ids and
    labels, and which cases were created versus reused.

    Returns 0 on success and 2 on any validation error.
    """
    # Decode explicitly as UTF-8 rather than the platform default encoding.
    payload = json.loads(Path(args.cases).read_text(encoding="utf-8"))
    cases = payload.get("cases") or []
    if not cases:
        log("error: no cases in payload")
        return 2

    shared_style_rubric = payload.get("shared_style_rubric")
    shared_style_evals = payload.get("shared_style_evaluators") or []

    needs_attach = any(case.get("attachment_files") for case in cases)
    attach_dir: Path | None = Path(args.attachments_dir) if args.attachments_dir else None
    if needs_attach and attach_dir is None:
        log("error: at least one case has attachment_files, but --attachments-dir was not provided")
        return 2
    if needs_attach and attach_dir and not attach_dir.is_dir():
        log(f"error: --attachments-dir does not exist or is not a directory: {attach_dir}")
        return 2

    # Pass 1: validate every case locally before any server call.
    # Each plan is (case, label, rubrics, [(attachment name, resolved path)]).
    plans: list[tuple[dict, str, dict, list[tuple[str, Path]]]] = []
    for case in cases:
        # Accept both rubric keys, so manifests in the shape handled by
        # _normalize_manifest are not rejected here.
        rubrics = dict(case.get("rubrics") or case.get("rubrics_by_evaluator") or {})
        if shared_style_rubric:
            for ev_id in shared_style_evals:
                rubrics.setdefault(ev_id, shared_style_rubric)
        if not rubrics:
            log(f"error: case '{case.get('label')}' has no rubrics")
            return 2

        label = case.get("label", "(unlabeled)")
        if "input" not in case:
            # Previously this surfaced later as a raw KeyError.
            log(f"error: case '{label}' has no input")
            return 2

        attachments: list[tuple[str, Path]] = []
        for fname in case.get("attachment_files") or []:
            fp = (attach_dir / fname).resolve() if attach_dir else None
            if not fp or not fp.is_file():
                log(f"error: attachment file not found for case '{label}': {fname}")
                return 2
            attachments.append((fname, fp))
        plans.append((case, label, rubrics, attachments))

    workspace_id, _ = client.resolve_scope()

    existing_by_input: dict[str, dict] = {}
    if not args.allow_duplicates:
        for existing in eval_mod.list_cases(client, args.agent):
            existing_input = existing.get("input")
            # Keep the first server case seen for each distinct input text.
            if isinstance(existing_input, str) and existing_input not in existing_by_input:
                existing_by_input[existing_input] = existing

    # Pass 2: apply the validated plans.
    case_ids: list[str] = []
    labels: list[str] = []
    created: list[dict] = []
    reused: list[dict] = []
    for case, label, rubrics, attachments in plans:
        attachment_ids: list[str] = []
        for fname, fp in attachments:
            log(f" uploading attachment: {fname}")
            uid = _upload_attachment(client, file_path=fp, workspace_id=workspace_id)
            attachment_ids.append(uid)

        existing = existing_by_input.get(case["input"])
        if existing is not None:
            case_id = existing["id"]
            log(f"reusing existing case: {label} ({case_id[:8]})")
            has_field_updates = (
                case.get("expected_output") is not None
                or attachment_ids
                or case.get("meta") is not None
                or case.get("note") is not None
            )
            if has_field_updates:
                eval_mod.update_case(
                    client, case_id,
                    expected_output=case.get("expected_output"),
                    attachment_ids=attachment_ids or None,
                    meta=case.get("meta"),
                    note=case.get("note"),
                )
            for ev_id, rubric in rubrics.items():
                eval_mod.set_rubric(client, evaluation_case_id=case_id,
                                    evaluator_id=ev_id, rubric=rubric)
            case_ids.append(case_id)
            labels.append(label)
            reused.append({"case_id": case_id, "label": label})
            continue

        log(f"creating: {label}")
        result = eval_mod.create_case_with_rubrics(
            client, agent_id=args.agent, input=case["input"],
            expected_output=case.get("expected_output"),
            attachment_ids=attachment_ids or None,
            rubrics_by_evaluator=rubrics, meta=case.get("meta"),
            note=case.get("note"),
        )
        case_ids.append(result["id"])
        labels.append(label)
        created.append({"case_id": result["id"], "label": label})

    out = {"case_ids": case_ids, "labels": labels, "created": created, "reused": reused}
    out_text = json.dumps(out, indent=2, ensure_ascii=False)
    print(out_text)
    if args.out:
        # ensure_ascii=False emits raw non-ASCII text, so pin the file to UTF-8.
        Path(args.out).write_text(out_text + "\n", encoding="utf-8")
    return 0
752
+
753
+
754
+ # ---------------------------------------------------------------------------
755
+ # eval rubrics
756
+ # ---------------------------------------------------------------------------
757
+
758
def run_rubrics(args, client) -> int:
    """Read and display the rubric of every (case, evaluator) pair.

    Cases are those of ``args.agent``, optionally restricted by ``--cases``.
    Evaluators are those given via ``--evaluators`` or, by default, every
    evaluator in the resolved workspace. A readable listing is logged, and
    the structured result is printed as JSON and optionally written to
    ``--out``.

    Returns 0 on success and 2 when there are no cases or no evaluators.
    """
    workspace_id, _ = client.resolve_scope()
    cases = eval_mod.list_cases(client, args.agent)
    if args.cases:
        wanted = set(_ids(args.cases) or [])
        cases = [x for x in cases if x["id"] in wanted]
    case_ids = [x["id"] for x in cases]
    case_input = {x["id"]: (x.get("input") or "") for x in cases}
    if not case_ids:
        log("error: no cases for this agent")
        return 2

    if args.evaluators:
        evaluator_ids = _ids(args.evaluators) or []
        evaluators = [eval_mod.get_evaluator(client, eid) for eid in evaluator_ids]
    else:
        evaluators = eval_mod.list_evaluators(client, workspace_id)
        evaluator_ids = [e["id"] for e in evaluators]
    # Fall back to the id when the name is missing OR null, so the listing
    # never shows "[None]".
    evaluator_name = {e["id"]: e.get("name") or e["id"] for e in evaluators}
    if not evaluator_ids:
        log("error: no evaluators in workspace")
        return 2

    log(f"reading {len(case_ids)} cases x {len(evaluator_ids)} evaluators...")

    rubrics = eval_mod.get_case_rubrics(
        client, agent_id=args.agent, workspace_id=workspace_id,
        evaluator_ids=evaluator_ids, case_ids=case_ids,
    )

    for cid in case_ids:
        log("=" * 80)
        log(f"CASE {cid}")
        log(f" input: {truncate(case_input.get(cid, ''), 120)}")
        case_rubrics = rubrics.get(cid) or {}
        for ev_id in evaluator_ids:
            ev_name = evaluator_name.get(ev_id, ev_id)
            rubric_text = case_rubrics.get(ev_id, "")
            if not rubric_text:
                log(f" [{ev_name}] (rubric not set)")
            else:
                log(f" [{ev_name}]")
                for line in rubric_text.splitlines():
                    log(f" {line}")

    out = {
        "agent_id": args.agent,
        "workspace_id": workspace_id,
        "evaluators": [{"id": e["id"], "name": e.get("name")} for e in evaluators],
        "cases": [
            {
                "case_id": cid,
                "input": case_input.get(cid, ""),
                "rubrics_by_evaluator": {
                    ev_id: (rubrics.get(cid) or {}).get(ev_id)
                    for ev_id in evaluator_ids
                },
            }
            for cid in case_ids
        ],
    }
    out_text = json.dumps(out, indent=2, ensure_ascii=False)
    print(out_text)
    if args.out:
        # ensure_ascii=False emits raw non-ASCII text, so pin the file to UTF-8.
        Path(args.out).write_text(out_text + "\n", encoding="utf-8")
    return 0
822
+
823
+
824
+ # ---------------------------------------------------------------------------
825
+ # eval rubrics-apply
826
+ # ---------------------------------------------------------------------------
827
+
828
def run_rubrics_apply(args, client) -> int:
    """Apply rubrics from a JSON payload file to existing eval cases.

    The payload at ``args.rubrics`` is a JSON object with an optional
    ``agent_id`` and a ``cases`` list.  Each case carries ``case_id``, an
    optional ``input`` (used only for log/preview text) and a
    ``rubrics_by_evaluator`` mapping of evaluator ID to rubric text -- the
    same shape ``run_rubrics`` prints.

    Behaviour:
      * Case IDs are checked against the agent's cases only when the payload
        names an ``agent_id``; evaluator IDs are always checked against the
        workspace's evaluators.  Pairs that fail either check are reported
        under ``failed`` and never sent to the API.
      * Unless ``args.force`` is set, the current rubrics are fetched first
        and pairs whose rubric is unchanged are reported under ``skipped``.
      * With ``args.dry_run`` nothing is written; the pairs that would be
        written are still listed under ``updated``.

    A rubric of ``None`` (JSON ``null``, which is what ``run_rubrics`` emits
    for an unset rubric) is treated as the empty string for previews and for
    the "unchanged" comparison, so re-applying an exported payload does not
    try to rewrite every unset pair.

    The result is printed as JSON and also written to ``args.out`` when set.

    Returns:
        2 when the payload has no cases, 1 when any pair failed, else 0.
    """
    payload = json.loads(Path(args.rubrics).read_text())
    cases = payload.get("cases") or []
    if not cases:
        log("error: no cases in payload")
        return 2

    agent_id = payload.get("agent_id")
    workspace_id, _ = client.resolve_scope()

    # Start from everything the payload mentions, then narrow to known IDs.
    valid_case_ids = {c["case_id"] for c in cases}
    valid_evaluator_ids: set[str] = set()
    for c in cases:
        valid_evaluator_ids.update(c.get("rubrics_by_evaluator") or {})

    failed: list[dict] = []

    # Without an agent_id there is nothing to validate case IDs against.
    if agent_id:
        known_case_ids = {c["id"] for c in eval_mod.list_cases(client, agent_id)}
        invalid_case_ids = valid_case_ids - known_case_ids
        valid_case_ids &= known_case_ids
        if invalid_case_ids:
            log(f"warning: {len(invalid_case_ids)} case IDs are not part of agent {agent_id}")

    known_evaluator_ids = {e["id"] for e in eval_mod.list_evaluators(client, workspace_id)}
    invalid_evaluator_ids = valid_evaluator_ids - known_evaluator_ids
    valid_evaluator_ids &= known_evaluator_ids
    if invalid_evaluator_ids:
        log(f"warning: {len(invalid_evaluator_ids)} evaluator IDs not in workspace {workspace_id}")

    # Current rubrics are only needed to detect unchanged pairs; --force
    # rewrites everything, so the read is skipped in that mode.
    current: dict[str, dict[str, str]] = {}
    if not args.force:
        if not valid_case_ids or not valid_evaluator_ids:
            log("reading current rubrics skipped: no valid case/evaluator pairs")
        else:
            log(f"reading current rubrics for {len(valid_case_ids)} cases x {len(valid_evaluator_ids)} evaluators...")
            current = eval_mod.get_case_rubrics(
                client, agent_id=agent_id or "", workspace_id=workspace_id,
                evaluator_ids=list(valid_evaluator_ids), case_ids=list(valid_case_ids),
            )

    updated: list[dict] = []
    skipped: list[dict] = []

    for case in cases:
        case_id = case["case_id"]
        case_input = truncate(case.get("input") or "")
        rubrics_map = case.get("rubrics_by_evaluator") or {}

        for ev_id, new_rubric in rubrics_map.items():
            # Normalise null to "" for display and comparison only; the value
            # sent to the API below is left exactly as the payload gave it.
            new_text = "" if new_rubric is None else new_rubric
            entry = {
                "case_id": case_id, "evaluator_id": ev_id,
                "case_input": case_input,
                "new_rubric_preview": truncate(new_text, 80),
            }
            if case_id not in valid_case_ids:
                failed.append({**entry, "error": "case_id not found for agent"})
                continue
            if ev_id not in valid_evaluator_ids:
                failed.append({**entry, "error": "evaluator_id not found for workspace"})
                continue

            old_rubric = (current.get(case_id) or {}).get(ev_id)
            old_text = "" if old_rubric is None else old_rubric
            entry["old_rubric_preview"] = truncate(old_text, 80)
            if not args.force and new_text == old_text:
                skipped.append({"case_id": case_id, "evaluator_id": ev_id, "reason": "unchanged"})
                continue

            if args.dry_run:
                log(f" [dry-run] would update: {case_input} x {ev_id[:8]}...")
                updated.append(entry)
                continue

            # Best-effort per pair: one failing write must not abort the rest,
            # so the error is recorded and the loop carries on.
            try:
                eval_mod.set_rubric(client, evaluation_case_id=case_id,
                                    evaluator_id=ev_id, rubric=new_rubric)
            except Exception as e:
                log(f" FAILED: {case_input} x {ev_id[:8]}... -- {e}")
                failed.append({**entry, "error": str(e)})
            else:
                log(f" updated: {case_input} x {ev_id[:8]}...")
                updated.append(entry)

    log(f"\ndone: {len(updated)} updated, {len(skipped)} skipped (unchanged), {len(failed)} failed")

    out = {"updated": updated, "skipped": skipped, "failed": failed}
    out_text = json.dumps(out, indent=2, ensure_ascii=False)
    print(out_text)
    if args.out:
        Path(args.out).write_text(out_text + "\n")
    return 1 if failed else 0