codeer-cli 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- codeer_cli/__init__.py +54 -0
- codeer_cli/_validate.py +131 -0
- codeer_cli/agents.py +155 -0
- codeer_cli/chats.py +87 -0
- codeer_cli/cli.py +92 -0
- codeer_cli/client.py +277 -0
- codeer_cli/commands/__init__.py +0 -0
- codeer_cli/commands/_util.py +12 -0
- codeer_cli/commands/agent.py +186 -0
- codeer_cli/commands/check.py +66 -0
- codeer_cli/commands/eval_cmd.py +919 -0
- codeer_cli/commands/history.py +200 -0
- codeer_cli/commands/kb.py +126 -0
- codeer_cli/commands/profile.py +205 -0
- codeer_cli/constants.py +66 -0
- codeer_cli/eval_.py +423 -0
- codeer_cli/histories.py +156 -0
- codeer_cli/kb.py +226 -0
- codeer_cli/parse.py +567 -0
- codeer_cli-0.1.0.dist-info/METADATA +108 -0
- codeer_cli-0.1.0.dist-info/RECORD +23 -0
- codeer_cli-0.1.0.dist-info/WHEEL +4 -0
- codeer_cli-0.1.0.dist-info/entry_points.txt +2 -0
|
@@ -0,0 +1,919 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import csv
|
|
4
|
+
import json
|
|
5
|
+
import mimetypes
|
|
6
|
+
import os
|
|
7
|
+
import time
|
|
8
|
+
from collections import defaultdict
|
|
9
|
+
from dataclasses import asdict
|
|
10
|
+
from pathlib import Path
|
|
11
|
+
from typing import Any
|
|
12
|
+
|
|
13
|
+
from .. import agents as agents_mod
|
|
14
|
+
from .. import eval_ as eval_mod
|
|
15
|
+
from ..client import CodeerClient
|
|
16
|
+
from ..parse import parse_eval_result, parse_eval_tool_calls, summarize_eval_tool_calls
|
|
17
|
+
from ._util import log, truncate
|
|
18
|
+
|
|
19
|
+
# Seconds `eval run` sleeps between two result polls.
POLL_INTERVAL = 5
|
|
20
|
+
# Default for `eval run --poll-timeout`: seconds to keep polling for scores.
POLL_TIMEOUT = 900
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
def _ids(csv_text: str | None) -> list[str] | None:
|
|
24
|
+
if not csv_text:
|
|
25
|
+
return None
|
|
26
|
+
return [x.strip() for x in csv_text.split(",") if x.strip()]
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
def register(subparsers):
    """Attach the ``eval`` command and all of its sub-actions to *subparsers*.

    The chosen sub-action is stored under ``action`` and is mandatory. Each
    sub-action records its handler function under ``func`` via
    ``set_defaults``.
    """
    eval_parser = subparsers.add_parser("eval", help="Eval suite operations")
    actions = eval_parser.add_subparsers(dest="action", required=True)

    # codeer eval list
    list_cmd = actions.add_parser("list", help="List eval cases for an agent")
    list_cmd.add_argument("--agent", required=True)
    list_cmd.set_defaults(func=run_list)

    # codeer eval evaluators
    evaluators_cmd = actions.add_parser("evaluators", help="List evaluators in workspace")
    evaluators_cmd.set_defaults(func=run_evaluators)

    # codeer eval evaluator-create (exactly one prompt source is required)
    create_cmd = actions.add_parser(
        "evaluator-create", help="Create an evaluator in the workspace")
    create_cmd.add_argument("--name", required=True)
    create_prompt = create_cmd.add_mutually_exclusive_group(required=True)
    create_prompt.add_argument(
        "--system-prompt-template", help="Evaluator system prompt template text")
    create_prompt.add_argument(
        "--system-prompt-template-file", help="Path to evaluator system prompt template")
    create_cmd.add_argument("--description", default=None)
    create_cmd.set_defaults(func=run_evaluator_create)

    # codeer eval evaluator-update (prompt source optional, still exclusive)
    update_cmd = actions.add_parser(
        "evaluator-update", help="Update an evaluator in the workspace")
    update_cmd.add_argument("--evaluator", required=True, help="Evaluator UUID")
    update_cmd.add_argument("--name", default=None)
    update_prompt = update_cmd.add_mutually_exclusive_group()
    update_prompt.add_argument(
        "--system-prompt-template", help="Evaluator system prompt template text")
    update_prompt.add_argument(
        "--system-prompt-template-file", help="Path to evaluator system prompt template")
    update_cmd.add_argument("--description", default=None)
    update_cmd.set_defaults(func=run_evaluator_update)

    # codeer eval run
    run_cmd = actions.add_parser(
        "run", help="Trigger eval run, poll for results, print scores")
    run_cmd.add_argument("--agent", required=True)
    history_choice = run_cmd.add_mutually_exclusive_group()
    history_choice.add_argument(
        "--history", default=None, help="History UUID to pin the run to")
    history_choice.add_argument(
        "--latest", action="store_true",
        help="Auto-select the newest AgentHistory (default)")
    run_cmd.add_argument(
        "--cases", default=None, help="Comma-separated case UUIDs (default: all)")
    run_cmd.add_argument(
        "--evaluators", required=True, help="Comma-separated evaluator UUIDs")
    run_cmd.add_argument("--poll-timeout", type=int, default=POLL_TIMEOUT)
    run_cmd.add_argument("--out", default=None)
    run_cmd.set_defaults(func=run_run)

    # codeer eval export
    export_cmd = actions.add_parser(
        "export", help="Export eval table (CSV + JSON + summary MD)")
    export_cmd.add_argument("--agent", default=None)
    version_choice = export_cmd.add_mutually_exclusive_group()
    version_choice.add_argument(
        "--version", type=int, help="AgentHistory version_number")
    version_choice.add_argument(
        "--published", action="store_true", help="Use the published history")
    export_cmd.add_argument("--cases", default=None, help="Comma-separated case UUIDs")
    export_cmd.add_argument(
        "--evaluators", default=None, help="Comma-separated evaluator UUIDs")
    export_cmd.add_argument("--out-dir", default=".codeer/eval_table")
    export_cmd.set_defaults(func=run_export)

    # codeer eval reconcile
    reconcile_cmd = actions.add_parser(
        "reconcile", help="Audit local manifest vs server eval suite (read-only)")
    reconcile_cmd.add_argument("--manifest", default=".codeer/eval_cases.json")
    reconcile_cmd.add_argument("--agent", default=None)
    reconcile_cmd.add_argument(
        "--evaluators", default=None, help="Comma-separated evaluator UUIDs")
    reconcile_cmd.add_argument("--out", default=None)
    reconcile_cmd.set_defaults(func=run_reconcile)

    # codeer eval cases-apply
    cases_apply_cmd = actions.add_parser(
        "cases-apply", help="Create/update eval cases from JSON manifest")
    cases_apply_cmd.add_argument("--cases", required=True, help="Path to eval_cases.json")
    cases_apply_cmd.add_argument("--agent", required=True)
    cases_apply_cmd.add_argument("--attachments-dir", default=None, dest="attachments_dir")
    cases_apply_cmd.add_argument("--allow-duplicates", action="store_true")
    cases_apply_cmd.add_argument("--out", default=None)
    cases_apply_cmd.set_defaults(func=run_cases_apply)

    # codeer eval rubrics
    rubrics_cmd = actions.add_parser(
        "rubrics", help="Read per-(case, evaluator) rubrics")
    rubrics_cmd.add_argument("--agent", required=True)
    rubrics_cmd.add_argument(
        "--evaluators", default=None, help="Comma-separated evaluator UUIDs")
    rubrics_cmd.add_argument("--cases", default=None, help="Comma-separated case UUIDs")
    rubrics_cmd.add_argument("--out", default=None)
    rubrics_cmd.set_defaults(func=run_rubrics)

    # codeer eval rubrics-apply
    rubrics_apply_cmd = actions.add_parser(
        "rubrics-apply", help="Apply rubric changes from JSON file")
    rubrics_apply_cmd.add_argument("--rubrics", required=True, help="Path to rubrics JSON")
    rubrics_apply_cmd.add_argument("--dry-run", action="store_true")
    rubrics_apply_cmd.add_argument(
        "--force", action="store_true", help="Write all rubrics even if unchanged")
    rubrics_apply_cmd.add_argument("--out", default=None)
    rubrics_apply_cmd.set_defaults(func=run_rubrics_apply)
|
|
117
|
+
|
|
118
|
+
|
|
119
|
+
# ---------------------------------------------------------------------------
|
|
120
|
+
# eval list
|
|
121
|
+
# ---------------------------------------------------------------------------
|
|
122
|
+
|
|
123
|
+
def run_list(args, client) -> int:
    """Print the eval cases of ``args.agent`` as indented JSON; returns 0."""
    rendered = json.dumps(
        eval_mod.list_cases(client, args.agent),
        ensure_ascii=False,
        indent=2,
        default=str,
    )
    print(rendered)
    return 0
|
|
127
|
+
|
|
128
|
+
|
|
129
|
+
# ---------------------------------------------------------------------------
|
|
130
|
+
# eval evaluators
|
|
131
|
+
# ---------------------------------------------------------------------------
|
|
132
|
+
|
|
133
|
+
def run_evaluators(args, client) -> int:
    """Print the evaluators of the resolved workspace as indented JSON; returns 0."""
    workspace_id, _unused = client.resolve_scope()
    rendered = json.dumps(
        eval_mod.list_evaluators(client, workspace_id),
        ensure_ascii=False,
        indent=2,
        default=str,
    )
    print(rendered)
    return 0
|
|
138
|
+
|
|
139
|
+
|
|
140
|
+
# ---------------------------------------------------------------------------
|
|
141
|
+
# eval evaluator-create
|
|
142
|
+
# ---------------------------------------------------------------------------
|
|
143
|
+
|
|
144
|
+
def run_evaluator_create(args, client) -> int:
    """Create an evaluator in the resolved workspace and print it as JSON.

    The system prompt template comes from ``--system-prompt-template-file``
    when given, otherwise from ``--system-prompt-template``.

    Returns:
        0 after the created evaluator has been printed.
    """
    workspace_id, _ = client.resolve_scope()
    if args.system_prompt_template_file:
        # Decode explicitly as UTF-8: the platform default encoding is
        # locale-dependent and would corrupt or reject non-ASCII templates.
        system_prompt_template = Path(
            args.system_prompt_template_file).read_text(encoding="utf-8")
    else:
        system_prompt_template = args.system_prompt_template
    evaluator = eval_mod.create_evaluator(
        client,
        workspace_id=workspace_id,
        name=args.name,
        system_prompt_template=system_prompt_template,
        description=args.description,
    )
    print(json.dumps(evaluator, ensure_ascii=False, indent=2, default=str))
    return 0
|
|
159
|
+
|
|
160
|
+
|
|
161
|
+
# ---------------------------------------------------------------------------
|
|
162
|
+
# eval evaluator-update
|
|
163
|
+
# ---------------------------------------------------------------------------
|
|
164
|
+
|
|
165
|
+
def run_evaluator_update(args, client) -> int:
    """Update an evaluator's name, description and/or prompt template.

    The system prompt template comes from ``--system-prompt-template-file``
    when given, otherwise from ``--system-prompt-template`` (possibly
    ``None``).

    Returns:
        0 after the updated evaluator has been printed as JSON, or 2 when
        none of the updatable fields was supplied.
    """
    if args.system_prompt_template_file:
        # Decode explicitly as UTF-8: the platform default encoding is
        # locale-dependent and would corrupt or reject non-ASCII templates.
        system_prompt_template = Path(
            args.system_prompt_template_file).read_text(encoding="utf-8")
    else:
        system_prompt_template = args.system_prompt_template

    if args.name is None and args.description is None and system_prompt_template is None:
        log(
            "error: provide at least one of --name, --description, "
            "--system-prompt-template, --system-prompt-template-file"
        )
        return 2

    evaluator = eval_mod.update_evaluator(
        client,
        evaluator_id=args.evaluator,
        name=args.name,
        system_prompt_template=system_prompt_template,
        description=args.description,
    )
    print(json.dumps(evaluator, ensure_ascii=False, indent=2, default=str))
    return 0
|
|
187
|
+
|
|
188
|
+
|
|
189
|
+
# ---------------------------------------------------------------------------
|
|
190
|
+
# eval run
|
|
191
|
+
# ---------------------------------------------------------------------------
|
|
192
|
+
|
|
193
|
+
def run_run(args, client) -> int:
    """Trigger an eval run, poll for scores, log a report and print JSON.

    Resolves the history (``--history``, else the agent's latest), the cases
    (``--cases``, else every case of the agent) and the evaluators. It then
    triggers the run and polls ``eval_mod.get_results`` every
    ``POLL_INTERVAL`` seconds until every (case, evaluator) pair has a score
    or ``args.poll_timeout`` seconds have elapsed.

    The JSON report is printed to stdout and, when ``--out`` is given, also
    written to that path as UTF-8.

    Returns:
        0 when polling completed and every score is >= 1.0; 1 otherwise,
        including when polling timed out with partial results; 2 when there
        is no history, no case or no evaluator to run.
    """
    workspace_id, _ = client.resolve_scope()
    if args.latest or not args.history:
        history_id = agents_mod.get_latest_history_id(client, args.agent)
        if not history_id:
            log("error: agent has no history versions; pass --history instead")
            return 2
        log(f"--latest -> {history_id}")
        args.history = history_id

    if args.cases:
        case_ids = _ids(args.cases) or []
        case_objs: list[dict] = []
        for cid in case_ids:
            try:
                case_objs.append(eval_mod.get_case(client, cid))
            except Exception as e:
                # Best effort: the case object is only used for its display
                # label, so a failed lookup must not abort the run.
                log(f"warning: could not fetch case {cid}: {e}")
                case_objs.append({"id": cid, "input": ""})
    else:
        case_objs = eval_mod.list_cases(client, args.agent)
        case_ids = [c["id"] for c in case_objs]
    if not case_ids:
        log("error: no cases to run")
        return 2

    evaluator_ids = _ids(args.evaluators) or []
    if not evaluator_ids:
        log("error: --evaluators is required")
        return 2
    evaluators = [eval_mod.get_evaluator(client, eid) for eid in evaluator_ids]

    case_label_by_id = {c["id"]: truncate(c.get("input") or "", 60) for c in case_objs}
    # `or` (not a .get default) so an explicit null name also falls back to
    # the id; the report below slices and sorts on this value.
    evaluator_name_by_id = {e["id"]: e.get("name") or e["id"] for e in evaluators}

    log(f"triggering: {len(case_ids)} cases x {len(evaluator_ids)} evaluators on history {args.history}")
    eval_mod.trigger(client, case_ids=case_ids, evaluator_ids=evaluator_ids,
                     agent_history_id=args.history)

    total = len(case_ids) * len(evaluator_ids)
    # Monotonic clock: the deadline must not move if the wall clock is adjusted.
    deadline = time.monotonic() + args.poll_timeout
    results_by_eval: dict[str, list[dict]] = {}
    completed = False
    while time.monotonic() < deadline:
        results_by_eval = {}
        done = 0
        for ev_id in evaluator_ids:
            rows = eval_mod.get_results(
                client, case_ids=case_ids, evaluator_id=ev_id,
                agent_history_id=args.history, workspace_id=workspace_id,
                include_output=True, include_reasoning_steps=True,
            )
            results_by_eval[ev_id] = rows
            done += sum(1 for r in rows if r.get("score") is not None)
        log(f" progress: {done}/{total}")
        if done >= total:
            completed = True
            break
        time.sleep(POLL_INTERVAL)
    if not completed:
        log(f"warning: polling stopped after {args.poll_timeout}s before all "
            "results were scored; reporting partial results")

    flat: list[dict] = []
    for ev_id, rows in results_by_eval.items():
        for r in rows:
            result_summary = parse_eval_result(r)
            tool_calls = parse_eval_tool_calls(r)
            total_tool_duration_ms = sum(
                tc.duration_ms for tc in tool_calls if tc.duration_ms is not None
            )
            case_id = r.get("case_id") or r.get("evaluation_case_id")
            flat.append({
                "case_id": case_id,
                "case_label": case_label_by_id.get(case_id, "?"),
                "evaluator_id": ev_id,
                "evaluator_name": evaluator_name_by_id.get(ev_id, ev_id),
                "score": r.get("score"),
                "status": result_summary.status,
                "reason": r.get("reason"),
                "output": r.get("output") or r.get("actual_output"),
                "execution_time_s": result_summary.execution_time_s,
                "cost_credits": result_summary.cost_credits,
                "tool_call_count": len(tool_calls),
                "tool_total_duration_ms": total_tool_duration_ms or None,
                "tool_calls_summary": summarize_eval_tool_calls(tool_calls),
                "tool_calls": [asdict(tc) for tc in tool_calls],
                "raw_result": r,
            })

    # A timed-out run is never "perfect": some pairs are still unscored even
    # if every row that did come back has a full score.
    all_perfect = (
        completed
        and bool(flat)
        and all((r.get("score") or 0.0) >= 1.0 for r in flat)
    )
    log("\n" + "=" * 80)
    log(f"RESULTS agent={args.agent} history={args.history}")
    log("=" * 80)
    log(f"{'score':>6} {'evaluator':<35} case")
    for r in sorted(flat, key=lambda x: (x.get("score") or 0.0, x["evaluator_name"])):
        score = r.get("score")
        score_str = f"{score:.2f}" if score is not None else " - "
        log(f"{score_str:>6} {r['evaluator_name'][:35]:<35} {r['case_label']}")

    non_perfect = [r for r in flat if (r.get("score") or 0.0) < 1.0]
    if non_perfect:
        log("\nNON-PERFECT RESULTS:\n")
        for r in non_perfect:
            log(f" [{r.get('score') or 0:.2f}] {r['evaluator_name']} — {r['case_label']}")
            if r.get("tool_calls_summary"):
                log(f" tools: {r['tool_calls_summary'][:600]}")
            log(f" reason: {(r.get('reason') or '').strip()[:600]}")
            log("")

    out = {
        "agent_id": args.agent,
        "history_id": args.history,
        "all_perfect": all_perfect,
        "results": flat,
    }
    out_text = json.dumps(out, indent=2, ensure_ascii=False)
    print(out_text)
    if args.out:
        # ensure_ascii=False emits raw non-ASCII characters, so the file must
        # be written as UTF-8 rather than in the locale encoding.
        Path(args.out).write_text(out_text + "\n", encoding="utf-8")
    return 0 if all_perfect else 1
|
|
308
|
+
|
|
309
|
+
|
|
310
|
+
# ---------------------------------------------------------------------------
|
|
311
|
+
# eval export
|
|
312
|
+
# ---------------------------------------------------------------------------
|
|
313
|
+
|
|
314
|
+
def _pick_history(versions: list[dict], args) -> dict:
    """Select the AgentHistory record that an export should use.

    Selection order:
      1. ``args.version`` set: the record with that ``version_number``.
      2. ``args.published`` set: the highest-numbered record whose ``status``
         is ``"published"``, else the highest-numbered record with a truthy
         ``was_published``.
      3. Otherwise: the highest-numbered record overall.

    A missing or null ``version_number`` is ranked as 0. Among equal
    numbers, the record that appears first in *versions* wins.

    Raises:
        SystemExit: when no record satisfies the request, including when
            *versions* is empty.
    """
    def _newest(candidates: list[dict]) -> dict:
        # max() returns the first maximal element, which matches a stable
        # descending sort followed by [0].
        return max(candidates, key=lambda v: v.get("version_number") or 0)

    if args.version is not None:
        for v in versions:
            if v.get("version_number") == args.version:
                return v
        raise SystemExit(f"no AgentHistory with version_number={args.version}")
    if args.published:
        current = [v for v in versions if v.get("status") == "published"]
        if current:
            return _newest(current)
        previous = [v for v in versions if v.get("was_published")]
        if previous:
            return _newest(previous)
        raise SystemExit("no published AgentHistory found")
    if not versions:
        # Previously surfaced as a bare IndexError from sorted(...)[0].
        raise SystemExit("no AgentHistory versions found")
    return _newest(versions)
|
|
329
|
+
|
|
330
|
+
|
|
331
|
+
def run_export(args, client) -> int:
    """Export the eval table of one agent history as JSON, CSV and Markdown.

    The agent comes from ``--agent``, ``client.agent_id`` or the
    ``CODEER_AGENT_ID`` environment variable. Cases and evaluators may be
    narrowed with ``--cases`` / ``--evaluators``. The history is chosen by
    ``_pick_history``.

    Three files are written to ``--out-dir``, all as UTF-8:
    ``eval_table_full.json``, ``eval_table.csv`` and
    ``eval_table_summary.md``. A short JSON summary is printed to stdout.

    Returns:
        0 on success, 2 when no agent, case or evaluator could be resolved.
    """
    agent_id = args.agent or client.agent_id or os.environ.get("CODEER_AGENT_ID")
    workspace_id, _ = client.resolve_scope()
    if not agent_id:
        log("error: --agent is required or set CODEER_AGENT_ID")
        return 2

    cases = eval_mod.list_cases(client, agent_id)
    wanted_cases = set(_ids(args.cases) or [])
    if wanted_cases:
        cases = [c for c in cases if c["id"] in wanted_cases]
    case_ids = [c["id"] for c in cases]
    if not case_ids:
        log("error: no eval cases matched")
        return 2

    if args.evaluators:
        evaluators = [eval_mod.get_evaluator(client, eid) for eid in (_ids(args.evaluators) or [])]
    else:
        evaluators = eval_mod.list_evaluators(client, workspace_id)
    if not evaluators:
        log("error: no evaluators matched")
        return 2

    versions = agents_mod.list_versions(client, agent_id)
    history = _pick_history(versions, args)

    rows: list[dict[str, Any]] = []
    all_rubrics: list[dict] = []
    all_results: list[dict] = []
    for evaluator in evaluators:
        evaluator_id = evaluator["id"]
        rubrics = eval_mod.get_rubrics_batch(
            client, case_ids=case_ids, evaluator_id=evaluator_id)
        results = eval_mod.get_results(
            client, case_ids=case_ids, evaluator_id=evaluator_id,
            agent_history_id=history["id"], workspace_id=workspace_id,
            include_output=True, include_reasoning_steps=True,
        )
        all_rubrics.extend(rubrics)
        all_results.extend({**r, "evaluator_id": evaluator_id} for r in results)
        rubric_by_case = {
            (row.get("case_id") or row.get("evaluation_case_id")): row.get("rubric", "")
            for row in rubrics
        }
        result_by_case = {
            (row.get("case_id") or row.get("evaluation_case_id")): row
            for row in results
        }
        # One table row per (case, evaluator), including cases that have no
        # result yet (those get empty score/reason/output cells).
        for order, case in enumerate(cases, 1):
            case_id = case["id"]
            result = result_by_case.get(case_id, {})
            tool_calls = parse_eval_tool_calls(result)
            total_tool_duration_ms = sum(
                tc.duration_ms for tc in tool_calls if tc.duration_ms is not None
            )
            rows.append({
                "order": order,
                "case_id": case_id,
                "input": case.get("input") or "",
                "note": case.get("note") or "",
                "evaluator_id": evaluator_id,
                "evaluator_name": evaluator.get("name") or evaluator_id,
                "score": result.get("score"),
                "reason": result.get("reason") or "",
                "output": result.get("output") or result.get("actual_output") or "",
                "rubric": rubric_by_case.get(case_id, ""),
                "tool_call_count": len(tool_calls),
                "tool_total_duration_ms": total_tool_duration_ms or "",
                "tool_calls_summary": summarize_eval_tool_calls(tool_calls),
                "tool_calls_json": json.dumps(
                    [asdict(tc) for tc in tool_calls], ensure_ascii=False),
            })

    out_dir = Path(args.out_dir)
    out_dir.mkdir(parents=True, exist_ok=True)
    full = {
        "agent_id": agent_id,
        "workspace_id": workspace_id,
        "history": {
            "id": history["id"],
            "version_number": history.get("version_number"),
            "status": history.get("status"),
            "was_published": history.get("was_published"),
            "version_note": history.get("version_note"),
            "created_at": history.get("created_at"),
        },
        "evaluators": [
            {"id": e["id"], "name": e.get("name"), "description": e.get("description")}
            for e in evaluators
        ],
        "cases": cases,
        "rubrics": all_rubrics,
        "results": all_results,
        "table": rows,
    }
    # All three files are written as UTF-8: the payload is serialised with
    # ensure_ascii=False, so the locale default encoding could raise
    # UnicodeEncodeError or produce mojibake on non-UTF-8 systems.
    (out_dir / "eval_table_full.json").write_text(
        json.dumps(full, ensure_ascii=False, indent=2) + "\n", encoding="utf-8")
    with (out_dir / "eval_table.csv").open("w", newline="", encoding="utf-8") as fh:
        fields = [
            "order", "case_id", "input", "note", "evaluator_name", "score",
            "reason", "output", "rubric", "tool_call_count",
            "tool_calls_summary", "tool_total_duration_ms", "tool_calls_json",
            "evaluator_id",
        ]
        writer = csv.DictWriter(fh, fieldnames=fields)
        writer.writeheader()
        writer.writerows(rows)
    with (out_dir / "eval_table_summary.md").open("w", encoding="utf-8") as fh:
        fh.write("# Codeer Eval Table Export\n\n")
        fh.write(f"Agent: `{agent_id}`\n\n")
        fh.write(f"History: v{history.get('version_number')} `{history['id']}`\n\n")
        fh.write("| # | Evaluator | Score | Case ID | Input |\n")
        fh.write("|---:|---|---:|---|---|\n")
        for row in rows:
            # Escape pipes so the input text cannot break the table row.
            inp = truncate(row["input"], 80).replace("|", "\\|")
            fh.write(
                f"| {row['order']} | {row['evaluator_name']} | {row['score']} | "
                f"`{row['case_id']}` | {inp} |\n")

    print(json.dumps({
        "out_dir": str(out_dir),
        "cases": len(cases),
        "evaluators": len(evaluators),
        "rows": len(rows),
        "history_id": history["id"],
        "version_number": history.get("version_number"),
    }, ensure_ascii=False, indent=2))
    return 0
|
|
460
|
+
|
|
461
|
+
|
|
462
|
+
# ---------------------------------------------------------------------------
|
|
463
|
+
# eval reconcile
|
|
464
|
+
# ---------------------------------------------------------------------------
|
|
465
|
+
|
|
466
|
+
def _normalize_manifest(payload: dict[str, Any]) -> list[dict[str, Any]]:
    """Flatten a manifest payload into one normalized record per case.

    Each record carries ``index``, ``label`` (defaulting to ``case[<index>]``),
    ``input`` (defaulting to ``""``), ``expected_output`` and
    ``rubrics_by_evaluator``. The rubric mapping is copied from the case's
    ``rubrics`` or ``rubrics_by_evaluator``. When the payload defines a
    ``shared_style_rubric``, it is filled in for every evaluator listed in
    ``shared_style_evaluators`` that the case does not already cover.
    """
    shared_rubric = payload.get("shared_style_rubric")
    shared_evaluators = payload.get("shared_style_evaluators") or []
    normalized: list[dict[str, Any]] = []
    for position, entry in enumerate(payload.get("cases") or []):
        per_evaluator = dict(
            entry.get("rubrics") or entry.get("rubrics_by_evaluator") or {}
        )
        if shared_rubric:
            for evaluator_id in shared_evaluators:
                # Case-specific rubrics take precedence over the shared one.
                if evaluator_id not in per_evaluator:
                    per_evaluator[evaluator_id] = shared_rubric
        record = {
            "index": position,
            "label": entry.get("label") or f"case[{position}]",
            "input": entry.get("input") or "",
            "expected_output": entry.get("expected_output"),
            "rubrics_by_evaluator": per_evaluator,
        }
        normalized.append(record)
    return normalized
|
|
484
|
+
|
|
485
|
+
|
|
486
|
+
def _by_input(rows: list[dict], *, input_key: str = "input") -> dict[str, list[dict]]:
    """Group *rows* by the value stored under *input_key*.

    Rows whose value is missing or falsy are grouped under ``""``. Groups keep
    first-seen order, and rows keep their original order within a group.
    """
    buckets: dict[str, list[dict]] = {}
    for entry in rows:
        group_key = entry.get(input_key) or ""
        buckets.setdefault(group_key, []).append(entry)
    return buckets
|
|
491
|
+
|
|
492
|
+
|
|
493
|
+
def _duplicate_inputs(grouped: dict[str, list[dict]], *, kind: str) -> list[dict]:
    """Report every input text that is shared by more than one row.

    *grouped* maps input text to its rows. Each report entry holds a
    120-character preview of the input, the row count, one ``case_id`` /
    ``label`` / ``index`` item per row, and the given *kind* tag.
    """
    return [
        {
            "input_preview": truncate(text, 120),
            "count": len(group),
            "items": [
                {
                    "case_id": member.get("id") or member.get("case_id"),
                    "label": member.get("label"),
                    "index": member.get("index"),
                }
                for member in group
            ],
            "kind": kind,
        }
        for text, group in grouped.items()
        if len(group) > 1
    ]
|
|
509
|
+
|
|
510
|
+
|
|
511
|
+
def run_reconcile(args, client) -> int:
    """Audit the local case manifest against the server eval suite (read-only).

    Local and server cases are matched on their ``input`` text. The report
    lists duplicate inputs on either side, cases present on only one side,
    evaluator IDs unknown to the workspace, rubrics missing on the server,
    and rubrics that differ between manifest and server.

    The JSON report is printed to stdout and, when ``--out`` is given, also
    written to that path as UTF-8.

    Returns:
        0 when no issue was found, 1 when at least one issue was found,
        2 when the manifest file or the agent ID is missing.
    """
    manifest_path = Path(args.manifest)
    if not manifest_path.exists():
        log(f"error: manifest not found: {manifest_path}")
        return 2

    # Decode explicitly as UTF-8 rather than the locale-dependent default.
    payload = json.loads(manifest_path.read_text(encoding="utf-8"))
    local_cases = _normalize_manifest(payload)
    local_by_input = _by_input(local_cases)

    agent_id = args.agent or client.agent_id or os.environ.get("CODEER_AGENT_ID")
    workspace_id, _ = client.resolve_scope()
    if not agent_id:
        log("error: --agent or CODEER_AGENT_ID is required")
        return 2

    server_cases = eval_mod.list_cases(client, agent_id)
    server_by_input = _by_input(server_cases)

    all_evaluators = eval_mod.list_evaluators(client, workspace_id)
    evaluator_by_id = {e["id"]: e for e in all_evaluators}
    if args.evaluators:
        evaluator_ids = _ids(args.evaluators) or []
    else:
        evaluator_ids = [e["id"] for e in all_evaluators]

    manifest_evaluator_ids = {
        ev_id for case in local_cases
        for ev_id in (case.get("rubrics_by_evaluator") or {})
    }
    invalid_manifest_evaluator_ids = sorted(manifest_evaluator_ids - set(evaluator_by_id))
    invalid_requested_evaluator_ids = sorted(set(evaluator_ids) - set(evaluator_by_id))
    valid_evaluator_ids = [eid for eid in evaluator_ids if eid in evaluator_by_id]
    # Set copy for the membership test in the inner loop below.
    valid_evaluator_id_set = set(valid_evaluator_ids)

    server_case_ids = [c["id"] for c in server_cases]
    rubrics_by_case = eval_mod.get_case_rubrics(
        client, agent_id=agent_id, workspace_id=workspace_id,
        evaluator_ids=valid_evaluator_ids, case_ids=server_case_ids,
    ) if server_case_ids and valid_evaluator_ids else {}

    local_missing_on_server = []
    server_missing_in_manifest = []
    rubric_drift = []
    missing_server_rubrics = []

    for local in local_cases:
        matches = server_by_input.get(local["input"], [])
        if not matches:
            local_missing_on_server.append({
                "label": local["label"], "index": local["index"],
                "input_preview": truncate(local["input"], 120),
            })
            continue
        # With duplicate server inputs only the first match is compared; the
        # duplicates themselves are reported separately below.
        server = matches[0]
        server_rubrics = rubrics_by_case.get(server["id"]) or {}
        for ev_id, local_rubric in (local.get("rubrics_by_evaluator") or {}).items():
            if ev_id not in evaluator_by_id:
                continue
            if valid_evaluator_ids and ev_id not in valid_evaluator_id_set:
                continue
            server_rubric = server_rubrics.get(ev_id, "")
            if not server_rubric:
                missing_server_rubrics.append({
                    "case_id": server["id"], "label": local["label"],
                    "evaluator_id": ev_id,
                    "evaluator_name": evaluator_by_id[ev_id].get("name"),
                    "input_preview": truncate(local["input"], 120),
                })
            elif server_rubric != local_rubric:
                rubric_drift.append({
                    "case_id": server["id"], "label": local["label"],
                    "evaluator_id": ev_id,
                    "evaluator_name": evaluator_by_id[ev_id].get("name"),
                    "input_preview": truncate(local["input"], 120),
                    "local_rubric_preview": truncate(local_rubric, 120),
                    "server_rubric_preview": truncate(server_rubric, 120),
                })

    for server in server_cases:
        # Normalise a missing/null input to "" exactly as _by_input does, so a
        # null-input server case matches an empty-input manifest case instead
        # of being falsely reported as missing.
        if (server.get("input") or "") not in local_by_input:
            server_missing_in_manifest.append({
                "case_id": server["id"],
                "input_preview": truncate(server.get("input") or "", 120),
            })

    duplicate_local_inputs = _duplicate_inputs(local_by_input, kind="local")
    duplicate_server_inputs = _duplicate_inputs(server_by_input, kind="server")

    issue_counts = {
        "duplicate_local_inputs": len(duplicate_local_inputs),
        "duplicate_server_inputs": len(duplicate_server_inputs),
        "local_missing_on_server": len(local_missing_on_server),
        "server_missing_in_manifest": len(server_missing_in_manifest),
        "invalid_manifest_evaluator_ids": len(invalid_manifest_evaluator_ids),
        "invalid_requested_evaluator_ids": len(invalid_requested_evaluator_ids),
        "missing_server_rubrics": len(missing_server_rubrics),
        "rubric_drift": len(rubric_drift),
    }
    total_issues = sum(issue_counts.values())

    report = {
        "agent_id": agent_id, "workspace_id": workspace_id,
        "manifest": str(manifest_path),
        "local_case_count": len(local_cases),
        "server_case_count": len(server_cases),
        "compared_evaluators": [
            {"id": eid, "name": evaluator_by_id[eid].get("name")}
            for eid in valid_evaluator_ids
        ],
        "issue_counts": issue_counts,
        "issues": {
            "duplicate_local_inputs": duplicate_local_inputs,
            "duplicate_server_inputs": duplicate_server_inputs,
            "local_missing_on_server": local_missing_on_server,
            "server_missing_in_manifest": server_missing_in_manifest,
            "invalid_manifest_evaluator_ids": invalid_manifest_evaluator_ids,
            "invalid_requested_evaluator_ids": invalid_requested_evaluator_ids,
            "missing_server_rubrics": missing_server_rubrics,
            "rubric_drift": rubric_drift,
        },
    }

    log(f"reconcile: {len(local_cases)} local cases, {len(server_cases)} server cases, "
        f"{len(valid_evaluator_ids)} evaluators, {total_issues} issues")
    for key, count in issue_counts.items():
        if count:
            log(f" {key}: {count}")

    out_text = json.dumps(report, indent=2, ensure_ascii=False)
    print(out_text)
    if args.out:
        # ensure_ascii=False emits raw non-ASCII characters, so write UTF-8.
        Path(args.out).write_text(out_text + "\n", encoding="utf-8")
    return 1 if total_issues else 0
|
|
644
|
+
|
|
645
|
+
|
|
646
|
+
# ---------------------------------------------------------------------------
|
|
647
|
+
# eval cases-apply
|
|
648
|
+
# ---------------------------------------------------------------------------
|
|
649
|
+
|
|
650
|
+
def _upload_attachment(client: CodeerClient, *, file_path: Path, workspace_id: str) -> str:
    """Upload one local file as a persistent evaluation attachment.

    The file bytes are posted to ``/external/files`` as the multipart
    ``file`` field, together with the ``scope`` / ``purpose`` form values.

    Args:
        client: API client; only its ``post`` method is called.
        file_path: Local file to upload; sent under its base name.
        workspace_id: Not read by this function.

    Returns:
        The ``uuid`` value taken from the upload response.

    Raises:
        RuntimeError: If the response is not a dict or carries no truthy
            ``uuid``.
    """
    filename = file_path.name
    guessed_type = mimetypes.guess_type(filename)[0]
    # Fall back to a generic binary type when the extension is unknown.
    content_type = guessed_type if guessed_type else "application/octet-stream"

    response = client.post(
        "/external/files",
        files={"file": (filename, file_path.read_bytes(), content_type)},
        data={"scope": "persistent", "purpose": "evaluation_context"},
    )

    if isinstance(response, dict):
        file_uuid = response.get("uuid")
        if file_uuid:
            return file_uuid
    raise RuntimeError(f"upload-file response missing uuid for {file_path.name}: {response}")
|
|
660
|
+
|
|
661
|
+
|
|
662
|
+
def run_cases_apply(args, client) -> int:
    """Create or reuse evaluation cases described by a JSON payload file.

    Reads ``args.cases`` (JSON with a ``cases`` list and optional
    ``shared_style_rubric`` / ``shared_style_evaluators`` keys). Unless
    ``args.allow_duplicates`` is set, a payload case whose ``input`` equals
    the ``input`` of a case already listed for ``args.agent`` is reused: its
    fields and rubrics are updated instead of a new case being created.

    Every case is validated (input present, at least one rubric, attachment
    files on disk) before the first network call, so a malformed entry late
    in the payload cannot leave earlier cases half-applied.

    Args:
        args: Parsed CLI namespace; reads ``cases``, ``attachments_dir``,
            ``allow_duplicates``, ``agent`` and ``out``.
        client: API client, used for ``resolve_scope()``, attachment uploads
            and all ``eval_mod`` calls.

    Returns:
        ``0`` on success, ``2`` when the payload or attachment setup is
        invalid.
    """
    # JSON interchange is UTF-8; do not depend on the locale's default encoding.
    payload = json.loads(Path(args.cases).read_text(encoding="utf-8"))
    cases = payload.get("cases") or []
    if not cases:
        log("error: no cases in payload")
        return 2

    shared_style_rubric = payload.get("shared_style_rubric")
    shared_style_evals = payload.get("shared_style_evaluators") or []

    needs_attach = any(case.get("attachment_files") for case in cases)
    attach_dir: Path | None = Path(args.attachments_dir) if args.attachments_dir else None
    if needs_attach and attach_dir is None:
        log("error: at least one case has attachment_files, but --attachments-dir was not provided")
        return 2
    if needs_attach and attach_dir and not attach_dir.is_dir():
        log(f"error: --attachments-dir does not exist or is not a directory: {attach_dir}")
        return 2

    # Pass 1: validate everything locally before touching the server.
    prepared: list[tuple[dict, str, dict, list[tuple[str, Path]]]] = []
    for case in cases:
        rubrics = dict(case.get("rubrics") or {})
        if shared_style_rubric:
            # The shared rubric only fills evaluators the case did not set itself.
            for ev_id in shared_style_evals:
                rubrics.setdefault(ev_id, shared_style_rubric)
        if not rubrics:
            log(f"error: case '{case.get('label')}' has no rubrics")
            return 2

        label = case.get("label", "(unlabeled)")
        if "input" not in case:
            log(f"error: case '{label}' has no input")
            return 2

        attachment_paths: list[tuple[str, Path]] = []
        for fname in case.get("attachment_files") or []:
            fp = (attach_dir / fname).resolve() if attach_dir else None
            if not fp or not fp.is_file():
                log(f"error: attachment file not found for case '{label}': {fname}")
                return 2
            attachment_paths.append((fname, fp))

        prepared.append((case, label, rubrics, attachment_paths))

    workspace_id, _ = client.resolve_scope()

    # Index server cases by input; the first case seen for an input wins.
    existing_by_input: dict[str, dict] = {}
    if not args.allow_duplicates:
        for existing in eval_mod.list_cases(client, args.agent):
            existing_input = existing.get("input")
            if isinstance(existing_input, str) and existing_input not in existing_by_input:
                existing_by_input[existing_input] = existing

    case_ids: list[str] = []
    labels: list[str] = []
    created: list[dict] = []
    reused: list[dict] = []

    # Pass 2: upload attachments, then update the matching case or create one.
    for case, label, rubrics, attachment_paths in prepared:
        attachment_ids: list[str] = []
        for fname, fp in attachment_paths:
            log(f" uploading attachment: {fname}")
            uid = _upload_attachment(client, file_path=fp, workspace_id=workspace_id)
            attachment_ids.append(uid)

        existing = existing_by_input.get(case["input"])
        if existing is not None:
            case_id = existing["id"]
            log(f"reusing existing case: {label} ({case_id[:8]})")
            # Only send an update when the payload actually carries a field.
            has_field_update = (
                case.get("expected_output") is not None
                or attachment_ids
                or case.get("meta") is not None
                or case.get("note") is not None
            )
            if has_field_update:
                eval_mod.update_case(
                    client, case_id,
                    expected_output=case.get("expected_output"),
                    attachment_ids=attachment_ids or None,
                    meta=case.get("meta"),
                    note=case.get("note"),
                )
            for ev_id, rubric in rubrics.items():
                eval_mod.set_rubric(client, evaluation_case_id=case_id,
                                    evaluator_id=ev_id, rubric=rubric)
            case_ids.append(case_id)
            labels.append(label)
            reused.append({"case_id": case_id, "label": label})
            continue

        log(f"creating: {label}")
        result = eval_mod.create_case_with_rubrics(
            client, agent_id=args.agent, input=case["input"],
            expected_output=case.get("expected_output"),
            attachment_ids=attachment_ids or None,
            rubrics_by_evaluator=rubrics, meta=case.get("meta"),
            note=case.get("note"),
        )
        case_ids.append(result["id"])
        labels.append(label)
        created.append({"case_id": result["id"], "label": label})

    out = {"case_ids": case_ids, "labels": labels, "created": created, "reused": reused}
    out_text = json.dumps(out, indent=2, ensure_ascii=False)
    print(out_text)
    if args.out:
        # ensure_ascii=False may emit non-ASCII text, so write UTF-8 explicitly.
        Path(args.out).write_text(out_text + "\n", encoding="utf-8")
    return 0
|
|
752
|
+
|
|
753
|
+
|
|
754
|
+
# ---------------------------------------------------------------------------
|
|
755
|
+
# eval rubrics
|
|
756
|
+
# ---------------------------------------------------------------------------
|
|
757
|
+
|
|
758
|
+
def run_rubrics(args, client) -> int:
    """Show the rubric text of every (case, evaluator) pair for an agent.

    Cases come from ``eval_mod.list_cases`` for ``args.agent``, optionally
    narrowed to the IDs in ``args.cases``. Evaluators are either the IDs in
    ``args.evaluators`` or every evaluator listed for the resolved workspace.
    A readable listing goes through ``log``; the same data is printed as
    JSON on stdout and, when ``args.out`` is set, written to that file.

    Args:
        args: Parsed CLI namespace; reads ``agent``, ``cases``,
            ``evaluators`` and ``out``.
        client: API client, used for ``resolve_scope()`` and all
            ``eval_mod`` calls.

    Returns:
        ``0`` on success, ``2`` when no cases or no evaluators are selected.
    """
    workspace_id, _ = client.resolve_scope()
    cases = eval_mod.list_cases(client, args.agent)
    if args.cases:
        wanted = set(_ids(args.cases) or [])
        cases = [x for x in cases if x["id"] in wanted]
    case_ids = [x["id"] for x in cases]
    case_input = {x["id"]: (x.get("input") or "") for x in cases}
    if not case_ids:
        log("error: no cases for this agent")
        return 2

    if args.evaluators:
        evaluator_ids = _ids(args.evaluators) or []
        evaluators = [eval_mod.get_evaluator(client, eid) for eid in evaluator_ids]
    else:
        evaluators = eval_mod.list_evaluators(client, workspace_id)
        evaluator_ids = [e["id"] for e in evaluators]
    evaluator_name = {e["id"]: e.get("name", e["id"]) for e in evaluators}
    if not evaluator_ids:
        log("error: no evaluators in workspace")
        return 2

    log(f"reading {len(case_ids)} cases x {len(evaluator_ids)} evaluators...")

    rubrics = eval_mod.get_case_rubrics(
        client, agent_id=args.agent, workspace_id=workspace_id,
        evaluator_ids=evaluator_ids, case_ids=case_ids,
    )

    for cid in case_ids:
        log("=" * 80)
        log(f"CASE {cid}")
        log(f" input: {truncate(case_input.get(cid, ''), 120)}")
        for ev_id in evaluator_ids:
            ev_name = evaluator_name.get(ev_id, ev_id)
            rubric_text = (rubrics.get(cid) or {}).get(ev_id, "")
            if not rubric_text:
                log(f" [{ev_name}] (rubric not set)")
            else:
                log(f" [{ev_name}]")
                for line in rubric_text.splitlines():
                    log(f" {line}")

    out = {
        "agent_id": args.agent,
        "workspace_id": workspace_id,
        "evaluators": [{"id": e["id"], "name": e.get("name")} for e in evaluators],
        "cases": [
            {
                "case_id": cid,
                "input": case_input.get(cid, ""),
                # Pairs with no rubric are emitted as null.
                "rubrics_by_evaluator": {
                    ev_id: (rubrics.get(cid) or {}).get(ev_id)
                    for ev_id in evaluator_ids
                },
            }
            for cid in case_ids
        ],
    }
    # Serialise once and reuse the text for stdout and the optional file.
    out_text = json.dumps(out, indent=2, ensure_ascii=False)
    print(out_text)
    if args.out:
        # ensure_ascii=False may emit non-ASCII text, so write UTF-8 explicitly.
        Path(args.out).write_text(out_text + "\n", encoding="utf-8")
    return 0
|
|
822
|
+
|
|
823
|
+
|
|
824
|
+
# ---------------------------------------------------------------------------
|
|
825
|
+
# eval rubrics-apply
|
|
826
|
+
# ---------------------------------------------------------------------------
|
|
827
|
+
|
|
828
|
+
def run_rubrics_apply(args, client) -> int:
    """Apply rubric texts from a JSON payload to existing evaluation cases.

    Reads ``args.rubrics`` (JSON with a ``cases`` list of
    ``{"case_id", "input", "rubrics_by_evaluator"}`` entries and an optional
    ``agent_id``). Case IDs are checked against the agent's cases when
    ``agent_id`` is given; evaluator IDs are always checked against the
    resolved workspace. Unless ``args.force`` is set, current rubrics are
    fetched and pairs whose text is already identical are skipped. With
    ``args.dry_run`` nothing is written; intended updates are only reported.

    Args:
        args: Parsed CLI namespace; reads ``rubrics``, ``force``,
            ``dry_run`` and ``out``.
        client: API client, used for ``resolve_scope()`` and all
            ``eval_mod`` calls.

    Returns:
        ``2`` when the payload has no cases or a case lacks ``case_id``,
        ``1`` when at least one pair failed, otherwise ``0``.
    """
    # JSON interchange is UTF-8; do not depend on the locale's default encoding.
    payload = json.loads(Path(args.rubrics).read_text(encoding="utf-8"))
    cases = payload.get("cases") or []
    if not cases:
        log("error: no cases in payload")
        return 2

    # Report a malformed payload cleanly instead of failing with a KeyError.
    missing_case_id = sum(1 for c in cases if "case_id" not in c)
    if missing_case_id:
        log(f"error: {missing_case_id} cases in payload have no case_id")
        return 2

    agent_id = payload.get("agent_id")
    workspace_id, _ = client.resolve_scope()

    all_case_ids = [c["case_id"] for c in cases]
    all_evaluator_ids: set[str] = set()
    for c in cases:
        all_evaluator_ids.update((c.get("rubrics_by_evaluator") or {}).keys())

    failed: list[dict] = []
    valid_case_ids = set(all_case_ids)
    valid_evaluator_ids = set(all_evaluator_ids)

    # Without an agent_id in the payload, case IDs cannot be checked and are
    # all treated as valid.
    if agent_id:
        known_case_ids = {c["id"] for c in eval_mod.list_cases(client, agent_id)}
        invalid_case_ids = valid_case_ids - known_case_ids
        valid_case_ids &= known_case_ids
        if invalid_case_ids:
            log(f"warning: {len(invalid_case_ids)} case IDs are not part of agent {agent_id}")

    known_evaluator_ids = {e["id"] for e in eval_mod.list_evaluators(client, workspace_id)}
    invalid_evaluator_ids = valid_evaluator_ids - known_evaluator_ids
    valid_evaluator_ids &= known_evaluator_ids
    if invalid_evaluator_ids:
        log(f"warning: {len(invalid_evaluator_ids)} evaluator IDs not in workspace {workspace_id}")

    # Current server-side rubrics; stays empty under --force (no comparison).
    current: dict[str, dict[str, str]] = {}
    if not args.force:
        if not valid_case_ids or not valid_evaluator_ids:
            log("reading current rubrics skipped: no valid case/evaluator pairs")
        else:
            log(f"reading current rubrics for {len(valid_case_ids)} cases x {len(valid_evaluator_ids)} evaluators...")
            current = eval_mod.get_case_rubrics(
                client, agent_id=agent_id or "", workspace_id=workspace_id,
                evaluator_ids=list(valid_evaluator_ids), case_ids=list(valid_case_ids),
            )

    updated: list[dict] = []
    skipped: list[dict] = []

    for case in cases:
        case_id = case["case_id"]
        case_input = truncate(case.get("input") or "")
        rubrics_map = case.get("rubrics_by_evaluator") or {}

        for ev_id, new_rubric in rubrics_map.items():
            # The rubrics export in this file writes null for pairs with no
            # rubric; such entries carry nothing to apply, so leave the
            # server untouched rather than previewing or sending None.
            if new_rubric is None:
                skipped.append({"case_id": case_id, "evaluator_id": ev_id,
                                "reason": "no rubric provided"})
                continue

            entry = {
                "case_id": case_id, "evaluator_id": ev_id,
                "case_input": case_input,
                "new_rubric_preview": truncate(new_rubric, 80),
            }
            if case_id not in valid_case_ids:
                failed.append({**entry, "error": "case_id not found for agent"})
                continue
            if ev_id not in valid_evaluator_ids:
                failed.append({**entry, "error": "evaluator_id not found for workspace"})
                continue

            old_rubric = (current.get(case_id) or {}).get(ev_id, "")
            entry["old_rubric_preview"] = truncate(old_rubric, 80)
            if not args.force and new_rubric == old_rubric:
                skipped.append({"case_id": case_id, "evaluator_id": ev_id, "reason": "unchanged"})
                continue

            if args.dry_run:
                log(f" [dry-run] would update: {case_input} x {ev_id[:8]}...")
                updated.append(entry)
                continue

            # Best effort per pair: record the failure and keep going so one
            # bad pair does not abort the rest of the batch.
            try:
                eval_mod.set_rubric(client, evaluation_case_id=case_id,
                                    evaluator_id=ev_id, rubric=new_rubric)
                log(f" updated: {case_input} x {ev_id[:8]}...")
                updated.append(entry)
            except Exception as e:
                log(f" FAILED: {case_input} x {ev_id[:8]}... -- {e}")
                failed.append({**entry, "error": str(e)})

    log(f"\ndone: {len(updated)} updated, {len(skipped)} skipped (unchanged), {len(failed)} failed")

    out = {"updated": updated, "skipped": skipped, "failed": failed}
    out_text = json.dumps(out, indent=2, ensure_ascii=False)
    print(out_text)
    if args.out:
        # ensure_ascii=False may emit non-ASCII text, so write UTF-8 explicitly.
        Path(args.out).write_text(out_text + "\n", encoding="utf-8")
    return 1 if failed else 0
|