codeer-cli 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
codeer_cli/eval_.py ADDED
@@ -0,0 +1,423 @@
1
+ """Evaluation: cases, evaluators, trigger runs, read results.
2
+
3
+ 'eval' is a Python builtin, so this module is named ``eval_`` — import it as such::
4
+
5
+ from codeer_cli import eval_ as eval_mod
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+ from typing import Any, List, Optional
11
+
12
+ from .client import CodeerClient
13
+
14
+
15
+ # --- cases ----------------------------------------------------------------
16
+
17
def create_case(
    client: CodeerClient,
    *,
    agent_id: str,
    input: str,
    expected_output: Optional[str] = None,
    rubric: Optional[str] = None,
    attachment_ids: Optional[List[str]] = None,
    meta: Optional[dict] = None,
    note: Optional[str] = None,
) -> dict:
    """Create an eval case via ``POST /external/eval/cases``.

    ``agent_id`` and ``input`` are always sent. ``expected_output``,
    ``rubric`` and ``note`` are sent whenever they are not ``None`` (an empty
    string is still forwarded); ``attachment_ids`` and ``meta`` are sent only
    when non-empty.

    IMPORTANT: the case-level ``rubric`` is a default and is NOT what the Test
    Suite's ``Standard`` column displays. That column reads the per-evaluator
    rubric written by :func:`set_rubric`. To show a Standard for a given
    evaluator, also call :func:`set_rubric` for that (case, evaluator) pair,
    or use :func:`create_case_with_rubrics`, which does both.

    Returns whatever ``client.post`` returns for the request.
    """
    payload: dict[str, Any] = {"agent_id": agent_id, "input": input}

    # Text fields: forwarded as soon as the caller supplied a value.
    for field, value in (("expected_output", expected_output), ("rubric", rubric)):
        if value is not None:
            payload[field] = value

    # Container fields: an empty list/dict is treated the same as "not given".
    for field, value in (("attachment_ids", attachment_ids), ("meta", meta)):
        if value:
            payload[field] = value

    if note is not None:
        payload["note"] = note

    return client.post("/external/eval/cases", json=payload)
49
+
50
+
51
def list_cases(client: CodeerClient, agent_id: str) -> list[dict]:
    """Fetch the eval cases of an agent (``GET /external/eval/agents/{agent_id}/cases``)."""
    endpoint = f"/external/eval/agents/{agent_id}/cases"
    return client.get(endpoint)
53
+
54
+
55
def get_case(client: CodeerClient, case_id: str) -> dict:
    """Fetch a single eval case (``GET /external/eval/cases/{case_id}``)."""
    endpoint = f"/external/eval/cases/{case_id}"
    return client.get(endpoint)
57
+
58
+
59
def update_case(
    client: CodeerClient,
    case_id: str,
    *,
    input: Optional[str] = None,
    expected_output: Optional[str] = None,
    rubric: Optional[str] = None,
    attachment_ids: Optional[List[str]] = None,
    meta: Optional[dict] = None,
    note: Optional[str] = None,
) -> dict:
    """Partially update an eval case (``PUT /external/eval/cases/{case_id}``).

    Only the fields that are not ``None`` are included in the request body.
    Unlike :func:`create_case`, empty values (``""``, ``[]``, ``{}``) ARE
    sent, so they can be used to overwrite an existing value.

    Returns whatever ``client.put`` returns for the request.
    """
    candidates: dict[str, Any] = {
        "input": input,
        "expected_output": expected_output,
        "rubric": rubric,
        "attachment_ids": attachment_ids,
        "meta": meta,
        "note": note,
    }
    changes = {field: value for field, value in candidates.items() if value is not None}
    return client.put(f"/external/eval/cases/{case_id}", json=changes)
84
+
85
+
86
+
87
+ # --- evaluators -----------------------------------------------------------
88
+
89
def create_evaluator(
    client: CodeerClient,
    *,
    workspace_id: str,
    name: str,
    system_prompt_template: str,
    description: Optional[str] = None,
) -> dict:
    """Create an evaluator via ``POST /external/eval/evaluators``.

    The request body carries ``name`` and ``system_prompt_template``, plus
    ``description`` when it is not ``None``.

    NOTE: ``workspace_id`` is a required keyword argument but is not included
    in the request — this function never reads it.

    Returns whatever ``client.post`` returns for the request.
    """
    payload: dict[str, Any] = dict(
        name=name,
        system_prompt_template=system_prompt_template,
    )
    if description is not None:
        payload.update(description=description)
    return client.post("/external/eval/evaluators", json=payload)
104
+
105
+
106
def list_evaluators(client: CodeerClient, workspace_id: str) -> list[dict]:
    """Fetch the evaluators visible to the client (``GET /external/eval/evaluators``).

    NOTE: ``workspace_id`` is accepted but is not sent with the request.
    """
    endpoint = "/external/eval/evaluators"
    return client.get(endpoint)
108
+
109
+
110
def get_evaluator(client: CodeerClient, evaluator_id: str) -> dict:
    """Fetch a single evaluator (``GET /external/eval/evaluators/{evaluator_id}``)."""
    endpoint = f"/external/eval/evaluators/{evaluator_id}"
    return client.get(endpoint)
112
+
113
+
114
def update_evaluator(
    client: CodeerClient,
    evaluator_id: str,
    *,
    name: Optional[str] = None,
    system_prompt_template: Optional[str] = None,
    description: Optional[str] = None,
) -> dict:
    """Partially update an evaluator (``PUT /external/eval/evaluators/{evaluator_id}``).

    Only the fields that are not ``None`` are included in the request body;
    empty strings are still sent.

    Returns whatever ``client.put`` returns for the request.
    """
    candidates: dict[str, Any] = {
        "name": name,
        "system_prompt_template": system_prompt_template,
        "description": description,
    }
    changes = {field: value for field, value in candidates.items() if value is not None}
    return client.put(f"/external/eval/evaluators/{evaluator_id}", json=changes)
130
+
131
+
132
+
133
+ # --- runs + results -------------------------------------------------------
134
+
135
def trigger(
    client: CodeerClient,
    *,
    case_ids: List[str],
    evaluator_ids: List[str],
    agent_history_id: Optional[str] = None,
) -> dict:
    """Kick off an evaluation run via ``POST /external/eval/runs``.

    ``agent_history_id`` pins the run to a specific (possibly unpublished)
    agent version — this is the core of the apply → eval → publish loop. It
    is sent to the API as ``version_id``.

    Although the parameter defaults to ``None``, it is effectively required:
    leaving it as ``None`` raises before any request is made.

    Raises:
        ValueError: if ``agent_history_id`` is ``None``.
    """
    if agent_history_id is None:
        raise ValueError("agent_history_id is required for external eval runs")

    return client.post(
        "/external/eval/runs",
        json={
            "case_ids": case_ids,
            "evaluator_ids": evaluator_ids,
            "version_id": agent_history_id,
        },
    )
155
+
156
+
157
def stop(client: CodeerClient, *, case_id: str, evaluator_id: str) -> Any:
    """Request a stop for one (case, evaluator) pair via ``POST /external/eval/runs:stop``."""
    target = {"case_id": case_id, "evaluator_id": evaluator_id}
    return client.post("/external/eval/runs:stop", json=target)
159
+
160
+
161
def get_results(
    client: CodeerClient,
    *,
    case_ids: List[str],
    evaluator_id: str,
    agent_history_id: str,
    workspace_id: str,
    include_output: bool = True,
    include_reasoning_steps: bool = True,
) -> list[dict]:
    """Fetch scored results for a batch of cases under one evaluator + agent version.

    Issues ``POST /external/eval/results:batch``; ``agent_history_id`` is sent
    as ``version_id``.

    NOTE: ``workspace_id`` is a required keyword argument but is not included
    in the request.

    Returns whatever ``client.post`` returns for the request.
    """
    query = {
        "case_ids": case_ids,
        "evaluator_id": evaluator_id,
        "version_id": agent_history_id,
        "include_output": include_output,
        "include_reasoning_steps": include_reasoning_steps,
    }
    return client.post("/external/eval/results:batch", json=query)
182
+
183
+
184
def set_rubric(client: CodeerClient, *, evaluation_case_id: str, evaluator_id: str, rubric: str) -> Any:
    """Set the per-evaluator rubric (the ``Standard`` the UI displays) for a case.

    Issues ``PUT /external/eval/cases/{evaluation_case_id}/rubrics/{evaluator_id}``
    with ``{"rubric": rubric}`` as the body and returns the client's response.
    """
    endpoint = f"/external/eval/cases/{evaluation_case_id}/rubrics/{evaluator_id}"
    payload = {"rubric": rubric}
    return client.put(endpoint, json=payload)
190
+
191
+
192
+ # --- reading rubrics back -------------------------------------------------
193
+ #
194
+ # Rubrics are per-(case, evaluator) and version-independent. The proper read
195
+ # endpoint is ``POST /eval/rubrics/batch`` — it returns the rubric string
196
+ # directly out of the ``CaseEvaluatorInfo`` table (the same row that
197
+ # ``set_rubric`` writes to). It does NOT require an ``agent_history_id``.
198
+ #
199
+ # Do not try to scrape rubrics out of past eval ``reason`` text — the judge
200
+ # paraphrases and reformats them, and a case with a rubric set but never
201
+ # evaluated will look indistinguishable from a case with no rubric. The
202
+ # legacy ``parse_rubrics_from_reason()`` helper still ships in ``parse.py``
203
+ # but it's a fallback for analyzing already-fetched judge output, not a
204
+ # discovery tool for current rubrics.
205
+
206
def get_rubrics_batch(
    client: CodeerClient,
    *,
    case_ids: List[str],
    evaluator_id: str,
) -> list[dict]:
    """Read rubric strings for many cases under one evaluator in a single request.

    Issues ``POST /external/eval/rubrics:batch`` and returns the client's
    response unchanged. Per the API contract, that is a list of
    ``{"case_id", "evaluator_id", "rubric"}`` dicts in the same order as
    ``case_ids``; cases without a rubric are still present, with
    ``rubric == ""``.
    """
    query = {"case_ids": case_ids, "evaluator_id": evaluator_id}
    return client.post("/external/eval/rubrics:batch", json=query)
222
+
223
+
224
def get_case_rubrics(
    client: CodeerClient,
    *,
    agent_id: str,
    workspace_id: str,
    evaluator_ids: Optional[List[str]] = None,
    case_ids: Optional[List[str]] = None,
) -> dict[str, dict[str, str]]:
    """Read every (case, evaluator) rubric for an agent, in one nested dict.

    Returns ``{case_id: {evaluator_id: rubric_str}, ...}``. A missing or
    falsy rubric in a response row is normalised to ``""`` — treat the empty
    string as "no rubric currently set".

    If ``evaluator_ids`` is ``None``, every evaluator from
    :func:`list_evaluators` is scanned; if ``case_ids`` is ``None``, every
    case from :func:`list_cases` is scanned. When either list ends up empty
    the result is ``{}``.

    One :func:`get_rubrics_batch` request is made per evaluator. Response
    rows whose ``case_id`` is not among the requested cases are ignored.
    """
    if evaluator_ids is None:
        evaluator_ids = [ev["id"] for ev in list_evaluators(client, workspace_id)]
    if case_ids is None:
        case_ids = [case["id"] for case in list_cases(client, agent_id)]
    if not (evaluator_ids and case_ids):
        return {}

    # Seed an entry per requested case so cases with no rows still appear.
    rubrics: dict[str, dict[str, str]] = {case_id: {} for case_id in case_ids}
    for evaluator_id in evaluator_ids:
        batch = get_rubrics_batch(client, case_ids=case_ids, evaluator_id=evaluator_id)
        for entry in batch:
            per_case = rubrics.get(entry.get("case_id"))
            if per_case is None:
                continue
            per_case[evaluator_id] = entry.get("rubric") or ""
    return rubrics
257
+
258
+
259
def list_runs_for_case(
    client: CodeerClient,
    *,
    case_id: str,
    agent_id: str,
    workspace_id: str,
    evaluator_id: str,
    include_output: bool = False,
) -> list[dict]:
    """Score history for ONE case across every version of the agent.

    Use this when investigating regressions: "this case scored 1.0 on v38 but
    0 on v42 — when did it break?". Iterates the versions returned by
    ``GET /external/agents/{agent_id}/versions`` and calls :func:`get_results`
    once per version, keeping only the row that matches ``case_id``.

    Rows are emitted in the order the versions endpoint returns them; no
    sorting is done here. Versions without an ``id`` are skipped.

    Returns a list of dicts:
        {
          "history_id": str,
          "version_number": int,
          "version_note": str,
          "status": str,              # 'draft' | 'published'
          "was_published": bool,
          "created_at": str,
          "score": float | None,      # None if no result row matched
          "reason": str | None,
          "output": str | None,       # only when include_output=True
        }

    Versions where the case was never evaluated come back with ``score=None``
    rather than being omitted — useful for spotting "we forgot to add this
    case to the run" alongside true regressions.

    A failed results fetch for a version is also reported as ``score=None``
    (best-effort), but it is now logged as a warning so that an API outage is
    not mistaken for "never evaluated".
    """
    import logging  # function-scope import: module-level imports stay untouched

    log = logging.getLogger(__name__)

    versions = client.get(f"/external/agents/{agent_id}/versions")
    if not versions:
        return []

    history: list[dict] = []
    for version in versions:
        hid = version.get("id")
        if not hid:
            continue
        try:
            rows = get_results(
                client,
                case_ids=[case_id],
                evaluator_id=evaluator_id,
                agent_history_id=hid,
                workspace_id=workspace_id,
                include_output=include_output,
            )
        except Exception as exc:
            # Deliberately best-effort: keep going, but leave a trace.
            log.warning(
                "eval results fetch failed for case %s on version %s: %s",
                case_id, hid, exc,
            )
            rows = []

        # The response may key the case under either field name.
        match = next(
            (
                row
                for row in rows or []
                if (row.get("evaluation_case_id") or row.get("case_id")) == case_id
            ),
            None,
        )
        result = match if match is not None else {}
        history.append({
            "history_id": hid,
            "version_number": version.get("version_number"),
            "version_note": version.get("version_note") or "",
            "status": version.get("status"),
            "was_published": bool(version.get("was_published")),
            "created_at": version.get("created_at"),
            "score": result.get("score"),
            "reason": result.get("reason"),
            "output": result.get("output"),
        })
    return history
331
+
332
+
333
def list_results_across_versions(
    client: CodeerClient,
    *,
    agent_id: str,
    workspace_id: str,
    evaluator_id: str,
    case_ids: Optional[List[str]] = None,
) -> list[dict]:
    """Fetch every eval result for an agent across ALL of its versions.

    Results are fetched per version, so this helper iterates the versions
    returned by ``GET /external/agents/{agent_id}/versions``, calls
    :func:`get_results` for each (with ``include_output=False``) and
    concatenates the rows. Use it for score history over time or to find a
    specific past judge ``reason`` — NOT for fetching current rubrics (use
    :func:`get_case_rubrics` for that).

    If ``case_ids`` is ``None``, results are fetched for every case returned
    by :func:`list_cases`. Returns ``[]`` when there are no cases or no
    versions. Versions without an ``id`` are skipped.

    A failed fetch for one version does not abort the scan: that version is
    skipped and the failure is logged as a warning, so missing rows can be
    traced back to the error.
    """
    import logging  # function-scope import: module-level imports stay untouched

    log = logging.getLogger(__name__)

    versions = client.get(f"/external/agents/{agent_id}/versions")
    if case_ids is None:
        case_ids = [case["id"] for case in list_cases(client, agent_id)]
    if not case_ids or not versions:
        return []

    collected: list[dict] = []
    for version in versions:
        hid = version.get("id")
        if not hid:
            continue
        try:
            rows = get_results(
                client,
                case_ids=case_ids,
                evaluator_id=evaluator_id,
                agent_history_id=hid,
                workspace_id=workspace_id,
                include_output=False,
            )
        except Exception as exc:
            # Deliberately best-effort: skip this version, but leave a trace.
            log.warning("eval results fetch failed for version %s: %s", hid, exc)
            continue
        if rows:
            collected.extend(rows)
    return collected
373
+
374
+
375
def set_rubric_bulk(
    client: CodeerClient,
    *,
    evaluation_case_id: str,
    rubrics_by_evaluator: dict[str, str],
) -> list[Any]:
    """Set rubrics for one case across multiple evaluators.

    Convenience wrapper: issues one :func:`set_rubric` request per entry of
    ``rubrics_by_evaluator`` (``evaluator_id -> rubric text``), in the
    mapping's iteration order, and returns the responses in that same order.
    An empty mapping makes no requests and returns ``[]``.
    """
    responses: list[Any] = []
    for evaluator_id, rubric_text in rubrics_by_evaluator.items():
        response = set_rubric(
            client,
            evaluation_case_id=evaluation_case_id,
            evaluator_id=evaluator_id,
            rubric=rubric_text,
        )
        responses.append(response)
    return responses
386
+
387
+
388
def create_case_with_rubrics(
    client: CodeerClient,
    *,
    agent_id: str,
    input: str,
    rubrics_by_evaluator: dict[str, str],
    expected_output: Optional[str] = None,
    attachment_ids: Optional[List[str]] = None,
    meta: Optional[dict] = None,
    note: Optional[str] = None,
) -> dict:
    """Create a case AND populate the per-evaluator rubrics in one step.

    This is the shape you almost always want — a case whose ``Standard``
    column is filled in for every evaluator it will be judged by.

    ``rubrics_by_evaluator`` maps ``evaluator_id → rubric_text``. After the
    case is created through :func:`create_case`, the mapping is handed to
    :func:`set_rubric_bulk` using the new case's ``"id"``. Use different
    rubric wording per evaluator when the evaluators judge different aspects
    (e.g. Style/Tone vs Content Compliance).

    The case-level ``rubric`` of :func:`create_case` is not set here.

    Returns the created case as returned by :func:`create_case`.
    """
    case_fields = {
        "agent_id": agent_id,
        "input": input,
        "expected_output": expected_output,
        "attachment_ids": attachment_ids,
        "meta": meta,
        "note": note,
    }
    created = create_case(client, **case_fields)
    new_case_id = created["id"]
    set_rubric_bulk(
        client,
        evaluation_case_id=new_case_id,
        rubrics_by_evaluator=rubrics_by_evaluator,
    )
    return created
@@ -0,0 +1,156 @@
1
+ """Post-release analysis: read conversation histories and their feedback signals.
2
+
3
+ Use this after an agent has been published and running for a while, to pull
4
+ recent traffic, filter by feedback, and feed the failing cases back into the
5
+ evaluation loop.
6
+
7
+ Pagination: ``/histories`` uses ``limit`` + ``offset`` (NOT ``page`` /
8
+ ``page_size``). Default ``limit=500`` here is a deliberate choice for analysis
9
+ workflows — the backend caps responses anyway and returning everything in one
10
+ call removes a common foot-gun where the caller silently truncates at 10.
11
+ """
12
+
13
+ from __future__ import annotations
14
+
15
+ from typing import Any, Iterable, Optional
16
+
17
+ from .client import CodeerClient
18
+
19
+
20
def list(
    client: CodeerClient,
    *,
    agent_id: Optional[str] = None,
    workspace_id: Optional[str] = None,
    organization_id: Optional[str] = None,
    external_user_id: Optional[str] = None,
    feedback_filter: Optional[str] = None,
    exclude_users: Iterable[str] = (),
    limit: int = 500,
    offset: int = 0,
    order_by: str = "desc",
) -> list[dict]:
    """List conversation histories, optionally filtered by agent and feedback state.

    Issues ``GET /external/histories``. ``limit``, ``offset`` and ``order_by``
    are always sent; ``agent_id``, ``external_user_id`` and
    ``feedback_filter`` are sent only when truthy.

    ``exclude_users`` is applied client-side AFTER the fetch: histories whose
    ``external_user_id`` matches any of the given values (case-insensitive)
    are dropped, so fewer than ``limit`` rows may come back. Use this to
    exclude internal testing accounts from production analysis.

    feedback_filter values are defined by FeedbackFilterType in the backend —
    typical values include 'positive' / 'negative' / 'any'. Check the current
    enum before assuming.

    NOTE: ``workspace_id`` and ``organization_id`` are accepted but are not
    sent with the request. This function also shadows the builtin ``list``
    inside this module.
    """
    query: dict[str, Any] = {"limit": limit, "offset": offset, "order_by": order_by}
    optional_filters = (
        ("agent_id", agent_id),
        ("external_user_id", external_user_id),
        ("feedback_filter", feedback_filter),
    )
    for key, value in optional_filters:
        if value:
            query[key] = value

    histories = client.get("/external/histories", params=query)

    excluded = {user.lower() for user in exclude_users}
    if not excluded:
        return histories
    return [
        history
        for history in histories
        if (history.get("external_user_id") or "").lower() not in excluded
    ]
55
+
56
+
57
def list_negative_feedback_turns(
    client: CodeerClient,
    *,
    agent_id: str,
    workspace_id: Optional[str] = None,
    organization_id: Optional[str] = None,
    exclude_users: Iterable[str] = (),
    feedback_types: Iterable[str] = ("sys_improve",),
    limit: int = 500,
    user_excerpt_chars: int = 200,
    assistant_excerpt_chars: int = 400,
) -> list[dict]:
    """Walk every (filtered) history and surface assistant turns flagged by users.

    Returns a flat list of dicts, one per matching feedback row:
        {
          "history_id": int,
          "history_title": str,
          "external_user_id": str,
          "created_at": str,
          "turn_idx": int,
          "feedback_type": str,         # 'sys_improve' / 'sys_helpful' / etc.
          "feedback_text": str,
          "user_message": str,          # the user turn that preceded this assistant
          "assistant_excerpt": str,     # the assistant text (tool markers stripped)
        }

    ``user_message`` is truncated to ``user_excerpt_chars`` and
    ``assistant_excerpt`` to ``assistant_excerpt_chars``.

    Designed for "what's failing in production?" analysis: piping the result
    straight into a dataframe / spreadsheet should let you cluster failure
    modes without ever loading raw conversation JSON.

    The conversation feedback row shape is::

        {"id": N, "tag": "system", "type": "sys_improve",
         "identity": {...}, "content": "...", "created_at": "..."}

    The user-meaningful sentiment lives in ``type`` (NOT ``tag``, which is
    the source channel — usually "system"). Pass the desired sentiment(s)
    in ``feedback_types``; matching is case-insensitive.

    Cost: O(N histories) network calls — one conversations fetch per history.
    Filter aggressively via ``exclude_users`` and ``limit`` before invoking
    on a busy agent.

    A history whose conversations cannot be fetched is skipped (best-effort),
    and the failure is logged as a warning so gaps in the output are
    traceable.
    """
    import logging  # function-scope import: module-level imports stay untouched

    from .parse import strip_tool_markers  # local import to avoid cycle

    log = logging.getLogger(__name__)
    wanted_types = {t.lower() for t in feedback_types}

    # ``list`` here is this module's history-listing function, not the builtin.
    histories = list(
        client,
        agent_id=agent_id,
        workspace_id=workspace_id,
        organization_id=organization_id,
        exclude_users=exclude_users,
        limit=limit,
    )

    turns: list[dict] = []
    for history in histories:
        hid = history.get("id")
        if hid is None:
            continue
        try:
            convs = get_conversations(client, hid)
        except Exception as exc:
            # Deliberately best-effort: skip this history, but leave a trace.
            log.warning("conversation fetch failed for history %s: %s", hid, exc)
            continue

        for idx, turn in enumerate(convs):
            if (turn.get("role") or "") != "assistant":
                continue
            flagged = [
                fb
                for fb in (turn.get("feedbacks") or [])
                if (fb.get("type") or "").lower() in wanted_types
            ]
            if not flagged:
                continue

            # Computed once per turn: the nearest preceding user message and
            # the stripped assistant excerpt are the same for every feedback
            # row attached to this turn.
            prior_user = ""
            for earlier in reversed(convs[:idx]):
                if (earlier.get("role") or "") == "user":
                    prior_user = (earlier.get("content") or "")[:user_excerpt_chars]
                    break
            excerpt = strip_tool_markers(turn.get("content") or "")[:assistant_excerpt_chars]

            for fb in flagged:
                turns.append({
                    "history_id": hid,
                    "history_title": history.get("name") or history.get("title") or "",
                    "external_user_id": history.get("external_user_id") or "",
                    "created_at": history.get("created_at"),
                    "turn_idx": idx,
                    "feedback_type": (fb.get("type") or "").lower(),
                    "feedback_text": fb.get("content") or "",
                    "user_message": prior_user,
                    "assistant_excerpt": excerpt,
                })
    return turns
147
+
148
+
149
def get(client: CodeerClient, history_id: int) -> dict:
    """Fetch a single conversation history (``GET /external/histories/{history_id}``)."""
    endpoint = f"/external/histories/{history_id}"
    return client.get(endpoint)
151
+
152
+
153
def get_conversations(client: CodeerClient, history_id: int) -> list[dict]:
    """Return all conversation turns for a history — includes tool calls and reasoning.

    Issues ``GET /external/histories/{history_id}/conversations`` and returns
    the client's response unchanged.
    """
    endpoint = f"/external/histories/{history_id}/conversations"
    return client.get(endpoint)
156
+