selfevals 0.2.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (96) hide show
  1. selfevals/.agents/skills/error-analysis/SKILL.md +149 -0
  2. selfevals/__init__.py +19 -0
  3. selfevals/_errors.py +44 -0
  4. selfevals/_internal/__init__.py +0 -0
  5. selfevals/_internal/hashing.py +23 -0
  6. selfevals/_internal/ids.py +65 -0
  7. selfevals/_internal/time.py +17 -0
  8. selfevals/analysis/__init__.py +23 -0
  9. selfevals/analysis/bundle.py +162 -0
  10. selfevals/analysis/hypothesis.py +26 -0
  11. selfevals/analysis/ingest.py +185 -0
  12. selfevals/analysis/schemas.py +119 -0
  13. selfevals/analysis/staging.py +34 -0
  14. selfevals/api/__init__.py +24 -0
  15. selfevals/api/__main__.py +47 -0
  16. selfevals/api/app.py +351 -0
  17. selfevals/api/broker.py +210 -0
  18. selfevals/api/broker_bridge.py +29 -0
  19. selfevals/api/queries.py +447 -0
  20. selfevals/api/schemas.py +151 -0
  21. selfevals/api/sse.py +114 -0
  22. selfevals/cli/__init__.py +15 -0
  23. selfevals/cli/_friendly.py +180 -0
  24. selfevals/cli/_help.py +55 -0
  25. selfevals/cli/analyze_commands.py +169 -0
  26. selfevals/cli/commands.py +615 -0
  27. selfevals/cli/main.py +409 -0
  28. selfevals/decision/__init__.py +34 -0
  29. selfevals/decision/matrix.py +185 -0
  30. selfevals/examples/__init__.py +8 -0
  31. selfevals/examples/evals/datasets/pingpong.jsonl +2 -0
  32. selfevals/examples/evals/experiments/example_pingpong.yaml +58 -0
  33. selfevals/examples/pingpong.py +21 -0
  34. selfevals/graders/__init__.py +46 -0
  35. selfevals/graders/base.py +54 -0
  36. selfevals/graders/calibration.py +145 -0
  37. selfevals/graders/deterministic.py +143 -0
  38. selfevals/graders/llm_judge.py +187 -0
  39. selfevals/graders/registry.py +66 -0
  40. selfevals/optimization/__init__.py +47 -0
  41. selfevals/optimization/aggregator.py +246 -0
  42. selfevals/optimization/loop.py +432 -0
  43. selfevals/optimization/proposers.py +202 -0
  44. selfevals/py.typed +0 -0
  45. selfevals/repo/__init__.py +28 -0
  46. selfevals/repo/loader.py +276 -0
  47. selfevals/reporter/__init__.py +21 -0
  48. selfevals/reporter/_metrics.py +114 -0
  49. selfevals/reporter/compare.py +221 -0
  50. selfevals/reporter/json_report.py +105 -0
  51. selfevals/reporter/markdown.py +232 -0
  52. selfevals/runner/__init__.py +42 -0
  53. selfevals/runner/adapters.py +268 -0
  54. selfevals/runner/executor.py +234 -0
  55. selfevals/runner/otlp_receiver.py +343 -0
  56. selfevals/runner/otlp_to_recorder.py +180 -0
  57. selfevals/runner/sandbox.py +46 -0
  58. selfevals/schemas/__init__.py +213 -0
  59. selfevals/schemas/_base.py +82 -0
  60. selfevals/schemas/annotation.py +55 -0
  61. selfevals/schemas/dataset.py +111 -0
  62. selfevals/schemas/enums.py +324 -0
  63. selfevals/schemas/eval_case.py +189 -0
  64. selfevals/schemas/experiment.py +367 -0
  65. selfevals/schemas/failure_mode.py +76 -0
  66. selfevals/schemas/fleet.py +111 -0
  67. selfevals/schemas/grader_card.py +112 -0
  68. selfevals/schemas/iteration.py +219 -0
  69. selfevals/schemas/registry.py +125 -0
  70. selfevals/schemas/tool.py +43 -0
  71. selfevals/schemas/trace.py +384 -0
  72. selfevals/schemas/workspace.py +69 -0
  73. selfevals/sdk/__init__.py +24 -0
  74. selfevals/sdk/auto_instrument.py +165 -0
  75. selfevals/sdk/context.py +45 -0
  76. selfevals/sdk/exporter.py +50 -0
  77. selfevals/sdk/facade.py +203 -0
  78. selfevals/skills/__init__.py +61 -0
  79. selfevals/storage/__init__.py +53 -0
  80. selfevals/storage/errors.py +66 -0
  81. selfevals/storage/filesystem.py +137 -0
  82. selfevals/storage/interface.py +135 -0
  83. selfevals/storage/migrations/__init__.py +80 -0
  84. selfevals/storage/migrations/m0001_initial.py +57 -0
  85. selfevals/storage/seed.py +199 -0
  86. selfevals/storage/sqlite.py +232 -0
  87. selfevals/trace/__init__.py +31 -0
  88. selfevals/trace/otel_importer.py +455 -0
  89. selfevals/trace/payload_router.py +106 -0
  90. selfevals/trace/recorder.py +540 -0
  91. selfevals/version.py +1 -0
  92. selfevals-0.2.2.dist-info/METADATA +283 -0
  93. selfevals-0.2.2.dist-info/RECORD +96 -0
  94. selfevals-0.2.2.dist-info/WHEEL +4 -0
  95. selfevals-0.2.2.dist-info/entry_points.txt +2 -0
  96. selfevals-0.2.2.dist-info/licenses/LICENSE +17 -0
@@ -0,0 +1,58 @@
1
+ # Example experiment spec.
2
+ #
3
+ # Copy it with:
4
+ # selfevals examples copy pingpong
5
+ # Then run:
6
+ # selfevals run evals/experiments/example_pingpong.yaml --no-persist
7
+
8
+ workspace: ws_01HZZZZZZZZZZZZZZZZZZZZZZZ
9
+
10
+ experiment:
11
+ name: pingpong baseline
12
+ goal: warm up the end-to-end loop with a trivial echo agent
13
+ mode: handoff
14
+ taxonomy:
15
+ target_features:
16
+ - commerce.product_resolution
17
+ dataset_types:
18
+ - capability
19
+ datasets:
20
+ optimization: { id: ds_pingpong, version: 1 }
21
+ target:
22
+ primary: { name: pass@1, operator: ">=", value: 0.5 }
23
+ editable:
24
+ prompt: true
25
+ model_params: true
26
+ frozen:
27
+ fleet: { id: flt_demo }
28
+ agents:
29
+ - { id: ag_demo }
30
+ datasets:
31
+ - { id: ds_pingpong }
32
+ proposer:
33
+ strategy: grid
34
+ search_space:
35
+ model_params:
36
+ level: [0.0, 1.0]
37
+ run:
38
+ sandbox: mock
39
+ max_iterations: 4
40
+ convergence:
41
+ min_delta: 1.0e-6
42
+ patience: 10
43
+ reliability:
44
+ metrics:
45
+ - pass@1
46
+ error_analysis:
47
+ enabled: true
48
+ taxonomy: workspace
49
+ trigger:
50
+ when: fail_rate_above
51
+ threshold: 0.10
52
+ scope: failed_only
53
+
54
+ dataset:
55
+ cases_path: ../datasets/pingpong.jsonl
56
+
57
+ agent:
58
+ entrypoint: selfevals.examples.pingpong:run
@@ -0,0 +1,21 @@
1
+ """Trivial example agent for the pingpong experiment.
2
+
3
+ It returns "pong" when the proposer cranks `model_params.level >= 0.5`,
4
+ and "miss" otherwise. That lets the grid proposer demonstrate a real
5
+ improvement path from level=0.0 (fail) to level=1.0 (pass).
6
+
7
+ Real agents will replace this with a function that calls Anthropic /
8
+ OpenAI / their framework of choice. The contract is the same:
9
+
10
+ def run(req: AdapterRequest) -> AdapterResponse | str: ...
11
+ """
12
+
13
+ from __future__ import annotations
14
+
15
+ from selfevals.runner.adapters import AdapterRequest, AdapterResponse
16
+
17
+
18
def run(req: AdapterRequest) -> AdapterResponse:
    """Answer "pong" once ``model_params.level`` reaches 0.5, otherwise "miss".

    The level is looked up at ``req.parameters["model_params"]["level"]`` and
    falls back to 0.0 when either key is absent. The token counts on the
    response are fixed placeholder values.
    """
    model_params = req.parameters.get("model_params", {})
    level = model_params.get("level", 0.0)
    if level >= 0.5:
        content = "pong"
    else:
        content = "miss"
    return AdapterResponse(content=content, tokens_input=4, tokens_output=2)
@@ -0,0 +1,46 @@
1
+ """Graders: score traces against expectations.
2
+
3
+ A `Grader` reads a Trace + EvalCase and returns a `GradeResult` (label
4
+ + optional score + reason). Two concrete graders ship in MVP:
5
+
6
+ - `DeterministicGrader` evaluates rule-based expectations declared on
7
+ `EvalCase.expected` (must_include / forbidden_tools / regex / schema).
8
+ - `LLMJudgeGrader` invokes an `AgentAdapter` as a judge against a rubric
9
+ prompt; single-judge in MVP, panel infrastructure-ready for post-MVP.
10
+
11
+ Calibration helpers turn observed predictions + human annotations into
12
+ the metrics tracked on a `GraderCard`.
13
+ """
14
+
15
+ from selfevals.graders.base import GradeLabel, Grader, GraderContext, GradeResult
16
+ from selfevals.graders.calibration import (
17
+ CalibrationReport,
18
+ HumanLabel,
19
+ PredictedLabel,
20
+ compute_classification_metrics,
21
+ )
22
+ from selfevals.graders.deterministic import (
23
+ DeterministicGrader,
24
+ DeterministicRuleViolationError,
25
+ )
26
+ from selfevals.graders.llm_judge import (
27
+ JudgeDecision,
28
+ LLMJudgeGrader,
29
+ RubricTemplate,
30
+ )
31
+
32
+ __all__ = [
33
+ "CalibrationReport",
34
+ "DeterministicGrader",
35
+ "DeterministicRuleViolationError",
36
+ "GradeLabel",
37
+ "GradeResult",
38
+ "Grader",
39
+ "GraderContext",
40
+ "HumanLabel",
41
+ "JudgeDecision",
42
+ "LLMJudgeGrader",
43
+ "PredictedLabel",
44
+ "RubricTemplate",
45
+ "compute_classification_metrics",
46
+ ]
@@ -0,0 +1,54 @@
1
+ """Grader ABC + shared result types.
2
+
3
+ The grader contract is intentionally narrow: receive a context bundle
4
+ (case, trace, optional response) and return a `GradeResult`. The case
5
+ and the trace carry everything needed to score; graders never reach
6
+ into storage themselves.
7
+ """
8
+
9
+ from __future__ import annotations
10
+
11
+ from abc import ABC, abstractmethod
12
+ from dataclasses import dataclass, field
13
+ from enum import StrEnum
14
+ from typing import TYPE_CHECKING, Any
15
+
16
+ if TYPE_CHECKING:
17
+ from selfevals.runner.adapters import AdapterResponse
18
+ from selfevals.schemas.eval_case import EvalCase
19
+ from selfevals.schemas.trace import Trace
20
+
21
+
22
+ class GradeLabel(StrEnum):
23
+ PASS = "pass"
24
+ FAIL = "fail"
25
+ PARTIAL = "partial"
26
+ ERROR = "error"
27
+ SKIPPED = "skipped"
28
+
29
+
30
@dataclass(frozen=True)
class GradeResult:
    """Immutable outcome of a single grader run.

    Only ``grader``, ``label`` and ``reason`` are required; the remaining
    fields default to "not provided" (``None`` or an empty container).
    """

    # Name of the grader that produced this result.
    grader: str
    # Verdict assigned by the grader.
    label: GradeLabel
    # Human-readable justification for the verdict.
    reason: str
    # Optional numeric score and grader confidence.
    score: float | None = None
    confidence: float | None = None
    # Failure-mode tags attributed to this result (fresh list per instance).
    failure_modes: list[str] = field(default_factory=list)
    # Free-form debugging payload (fresh dict per instance).
    details: dict[str, Any] = field(default_factory=dict)
39
+
40
+
41
@dataclass(frozen=True)
class GraderContext:
    """Immutable bundle handed to ``Grader.grade``."""

    # The eval case being graded.
    case: EvalCase
    # The trace recorded for this case.
    trace: Trace
    # The adapter's response, when one is available.
    response: AdapterResponse | None = None
46
+
47
+
48
class Grader(ABC):
    """Abstract base for graders: turn a ``GraderContext`` into a ``GradeResult``."""

    # Stable identifier — used as the GradeResult.grader and the
    # Trace.grader_results[i].grader field.
    name: str

    @abstractmethod
    def grade(self, context: GraderContext) -> GradeResult:
        """Score ``context`` and return the resulting ``GradeResult``."""
@@ -0,0 +1,145 @@
1
+ """Calibration helpers: turn predictions + human labels into classification metrics.
2
+
3
+ These functions live outside any specific grader so they can be reused by
4
+ calibration scripts, dashboards, and the optimizer. They consume two flat
5
+ lists keyed by `case_id` and produce a `CalibrationReport`.
6
+
7
+ Metrics are computed treating `pass` as the positive class by default;
8
+ `positive_label` is configurable. Macro-F1 is averaged across all observed
9
+ labels.
10
+
11
+ A class-imbalance guard: if a label appears in only one of the two streams,
12
+ precision/recall for that class are reported as `None` rather than zero —
13
+ this avoids the "100% precision on a class with 0 predictions" trap.
14
+ """
15
+
16
+ from __future__ import annotations
17
+
18
+ from collections import Counter
19
+ from collections.abc import Mapping
20
+ from dataclasses import dataclass, field
21
+
22
+ from selfevals.graders.base import GradeLabel
23
+
24
+
25
@dataclass(frozen=True)
class PredictedLabel:
    """A grader's predicted label for one case."""

    # Join key used to pair this prediction with a human label.
    case_id: str
    label: GradeLabel
    # Optional confidence attached to the prediction.
    confidence: float | None = None
30
+
31
+
32
@dataclass(frozen=True)
class HumanLabel:
    """A human annotation for one case."""

    # Join key used to pair this annotation with a prediction.
    case_id: str
    label: GradeLabel
    # Marks the case as high risk; such cases feed the
    # high-risk false-negative count in the calibration report.
    high_risk: bool = False
37
+
38
+
39
@dataclass(frozen=True)
class CalibrationReport:
    """Classification metrics for grader predictions versus human labels."""

    # Number of cases that had both a prediction and a human label.
    n_pairs: int
    # Precision / recall / F1 of the positive label; None when undefined.
    precision: float | None
    recall: float | None
    f1: float | None
    # Mean of the per-label F1 values that are defined; None if there are none.
    macro_f1: float | None
    # Fraction of paired cases where prediction and human label agree.
    accuracy: float
    # High-risk cases a human marked positive but the grader did not.
    high_risk_false_negatives: int
    # Per-label metrics keyed by label value.
    per_label_precision: dict[str, float | None] = field(default_factory=dict)
    per_label_recall: dict[str, float | None] = field(default_factory=dict)
    # Pair counts keyed by (human label, predicted label).
    confusion: dict[tuple[str, str], int] = field(default_factory=dict)
51
+
52
+
53
+ def _safe_div(num: float, den: float) -> float | None:
54
+ if den == 0:
55
+ return None
56
+ return num / den
57
+
58
+
59
+ def _f1(p: float | None, r: float | None) -> float | None:
60
+ if p is None or r is None or (p + r) == 0:
61
+ return None
62
+ return 2 * p * r / (p + r)
63
+
64
+
65
def compute_classification_metrics(
    predictions: list[PredictedLabel],
    human_labels: list[HumanLabel],
    *,
    positive_label: GradeLabel = GradeLabel.PASS,
) -> CalibrationReport:
    """Compute precision/recall/F1/macro-F1 and the high-risk false-negative count.

    Predictions and human labels are joined on ``case_id``; a case present
    in only one of the two lists is ignored. If a ``case_id`` repeats within
    a list, the last entry wins. When nothing pairs up, the report has
    ``n_pairs=0``, ``accuracy=0.0`` and every other metric unset.

    Args:
        predictions: Grader predictions, one per case.
        human_labels: Human annotations, one per case.
        positive_label: Label whose precision/recall/F1 populate the
            top-level report fields and define a "false negative".

    Returns:
        A ``CalibrationReport`` covering the paired cases.
    """
    predicted = {item.case_id: item for item in predictions}
    annotated = {item.case_id: item for item in human_labels}
    shared_ids = sorted(predicted.keys() & annotated.keys())
    total = len(shared_ids)

    if not shared_ids:
        return CalibrationReport(
            n_pairs=0,
            precision=None,
            recall=None,
            f1=None,
            macro_f1=None,
            accuracy=0.0,
            high_risk_false_negatives=0,
        )

    positive = positive_label.value

    # Confusion cells are keyed (human label, predicted label).
    confusion: Counter[tuple[str, str]] = Counter()
    agreements = 0
    high_risk_fns = 0
    for case_id in shared_ids:
        truth = annotated[case_id]
        actual = truth.label.value
        guessed = predicted[case_id].label.value
        confusion[(actual, guessed)] += 1
        if actual == guessed:
            agreements += 1
        # A false negative: humans said positive, the grader did not.
        if actual == positive and guessed != positive and truth.high_risk:
            high_risk_fns += 1

    # Row/column totals of the confusion matrix, accumulated in one pass.
    actual_totals: Counter[str] = Counter()
    predicted_totals: Counter[str] = Counter()
    for (actual, guessed), count in confusion.items():
        actual_totals[actual] += count
        predicted_totals[guessed] += count

    per_label_precision: dict[str, float | None] = {}
    per_label_recall: dict[str, float | None] = {}
    defined_f1: list[float] = []
    for label in sorted(actual_totals.keys() | predicted_totals.keys()):
        hits = confusion[(label, label)]
        label_precision = _safe_div(hits, predicted_totals[label])
        label_recall = _safe_div(hits, actual_totals[label])
        per_label_precision[label] = label_precision
        per_label_recall[label] = label_recall
        label_f1 = _f1(label_precision, label_recall)
        if label_f1 is not None:
            defined_f1.append(label_f1)

    # Labels whose F1 is undefined are left out of the macro average.
    macro_f1 = sum(defined_f1) / len(defined_f1) if defined_f1 else None

    positive_precision = per_label_precision.get(positive)
    positive_recall = per_label_recall.get(positive)

    return CalibrationReport(
        n_pairs=total,
        precision=positive_precision,
        recall=positive_recall,
        f1=_f1(positive_precision, positive_recall),
        macro_f1=macro_f1,
        accuracy=agreements / total,
        high_risk_false_negatives=high_risk_fns,
        per_label_precision=per_label_precision,
        per_label_recall=per_label_recall,
        confusion=dict(confusion),
    )
@@ -0,0 +1,143 @@
1
+ """DeterministicGrader: declarative rules from EvalCase.expected.
2
+
3
+ Rules supported in MVP:
4
+ - must_include: every string must appear in the final response (case-
5
+ insensitive by default; controllable via constructor flag).
6
+ - must_not_include: none of the strings may appear.
7
+ - required_tools: every tool listed must appear in the trace.
8
+ - forbidden_tools: no tool listed may appear.
9
+ - regex_match: optional regex applied to the final response.
10
+ - structured_output equality: when EvalCase.expected.structured_output
11
+ is set, the adapter's structured_output must match exactly.
12
+
13
+ Each rule has a stable failure-mode tag emitted in GradeResult.failure_modes
14
+ so weighted scoring can attribute failures upstream.
15
+ """
16
+
17
+ from __future__ import annotations
18
+
19
+ import re
20
+ from dataclasses import dataclass
21
+ from typing import TYPE_CHECKING
22
+
23
+ from selfevals.graders.base import GradeLabel, Grader, GraderContext, GradeResult
24
+ from selfevals.schemas.trace import LLMCallSpan, ToolCallSpan
25
+
26
+ if TYPE_CHECKING:
27
+ from selfevals.schemas.eval_case import Expected
28
+ from selfevals.schemas.trace import Trace
29
+
30
+
31
class DeterministicRuleViolationError(RuntimeError):
    """Signals that a grader was asked to evaluate a contradictory rule set."""
33
+
34
+
35
+ @dataclass(frozen=True)
36
+ class _Violation:
37
+ failure_mode: str
38
+ detail: str
39
+
40
+
41
+ def _final_response_text(ctx: GraderContext) -> str:
42
+ if ctx.response is not None and ctx.response.content:
43
+ return ctx.response.content
44
+ # Fall back to the final structured output's "content" or empty.
45
+ if ctx.response is not None and ctx.response.structured_output is not None:
46
+ content = ctx.response.structured_output.get("content")
47
+ if isinstance(content, str):
48
+ return content
49
+ return ""
50
+
51
+
52
+ def _tools_invoked(trace: Trace) -> list[str]:
53
+ return [s.tool_name for s in trace.spans if isinstance(s, ToolCallSpan)]
54
+
55
+
56
+ def _llm_call_count(trace: Trace) -> int:
57
+ return sum(1 for s in trace.spans if isinstance(s, LLMCallSpan))
58
+
59
+
60
class DeterministicGrader(Grader):
    """Grade a response against the declarative rules on ``EvalCase.expected``.

    Every rule is checked on each call; the result is PASS (score 1.0) when
    none is violated and FAIL (score 0.0) otherwise, with one failure-mode
    tag per kind of violated rule.
    """

    def __init__(
        self,
        name: str = "deterministic",
        *,
        case_sensitive: bool = False,
        regex_match: re.Pattern[str] | str | None = None,
    ) -> None:
        """Configure the grader.

        Args:
            name: Identifier reported on every result; must be non-empty.
            case_sensitive: Whether substring rules compare case-sensitively.
            regex_match: Optional pattern (compiled or source string) that
                must be found somewhere in the final response text.

        Raises:
            ValueError: If ``name`` is empty.
        """
        if not name:
            raise ValueError("grader name must be non-empty")
        self.name = name
        self._case_sensitive = case_sensitive
        self._regex: re.Pattern[str] | None = (
            re.compile(regex_match) if isinstance(regex_match, str) else regex_match
        )

    def _fold(self, value: str) -> str:
        """Lower-case ``value`` unless the grader is case-sensitive."""
        return value if self._case_sensitive else value.lower()

    def grade(self, context: GraderContext) -> GradeResult:
        """Check every rule and return PASS, or FAIL listing all violations."""
        expected: Expected = context.case.expected
        text = _final_response_text(context)
        haystack = self._fold(text)
        invoked = _tools_invoked(context.trace)
        invoked_set = set(invoked)

        # Violations accumulate in rule order; that order shapes `reason`.
        violations: list[_Violation] = []
        violations += [
            _Violation(failure_mode="missing_required_substring", detail=needle)
            for needle in expected.must_include
            if self._fold(needle) not in haystack
        ]
        violations += [
            _Violation(failure_mode="forbidden_substring", detail=needle)
            for needle in expected.must_not_include
            if self._fold(needle) in haystack
        ]
        violations += [
            _Violation(failure_mode="missing_required_tool", detail=tool)
            for tool in expected.required_tools
            if tool not in invoked_set
        ]
        violations += [
            _Violation(failure_mode="forbidden_tool_invoked", detail=tool)
            for tool in expected.forbidden_tools
            if tool in invoked_set
        ]

        # The regex always sees the original text, regardless of case folding.
        pattern = self._regex
        if pattern is not None and not pattern.search(text):
            violations.append(_Violation(failure_mode="regex_mismatch", detail=pattern.pattern))

        wanted_struct = expected.structured_output
        if wanted_struct is not None:
            actual_struct = context.response.structured_output if context.response else None
            if actual_struct != wanted_struct:
                violations.append(
                    _Violation(
                        failure_mode="structured_output_mismatch",
                        detail="expected != actual",
                    )
                )

        # Loose hint metrics that don't fail but are useful for debug.
        details = {
            "tools_invoked": invoked,
            "llm_call_count": _llm_call_count(context.trace),
        }

        if violations:
            return GradeResult(
                grader=self.name,
                label=GradeLabel.FAIL,
                reason="; ".join(f"{v.failure_mode}:{v.detail}" for v in violations),
                score=0.0,
                failure_modes=sorted({v.failure_mode for v in violations}),
                details=details,
            )
        return GradeResult(
            grader=self.name,
            label=GradeLabel.PASS,
            reason="all deterministic rules satisfied",
            score=1.0,
            details=details,
        )
@@ -0,0 +1,187 @@
1
+ """LLMJudgeGrader: invoke an AgentAdapter as a judge.
2
+
3
+ The grader formats a rubric prompt with the case input, the agent's final
4
+ response, and the rubric instructions; the judge adapter returns a JSON
5
+ response with `label`, `reason`, optional `score`, optional `confidence`.
6
+
7
+ MVP ships single-judge. The constructor accepts an optional `card` (a
8
+ `GraderCard`) so future panel infra can pin behavior to a calibrated
9
+ configuration. When `card.blocking` and calibration metrics are below
10
+ thresholds, the grader degrades to advisory (returns SKIPPED) per the
11
+ operational spec — unless `force=True` is set.
12
+ """
13
+
14
+ from __future__ import annotations
15
+
16
+ import json
17
+ from dataclasses import dataclass
18
+ from string import Template
19
+ from typing import TYPE_CHECKING, Any
20
+
21
+ from selfevals.graders.base import GradeLabel, Grader, GraderContext, GradeResult
22
+
23
+ if TYPE_CHECKING:
24
+ from selfevals.runner.adapters import AdapterRequest, AgentAdapter
25
+ from selfevals.schemas.grader_card import GraderCard
26
+
27
+
28
+ _DEFAULT_RUBRIC = """You are an evaluator. Read the case input and the agent's response,
29
+ then decide whether the response meets the rubric.
30
+
31
+ Rubric:
32
+ $rubric
33
+
34
+ Case input:
35
+ $case_input
36
+
37
+ Agent response:
38
+ $agent_response
39
+
40
+ Return a single JSON object with keys:
41
+ - label: one of "pass", "fail", "partial"
42
+ - reason: short justification
43
+ - score: number in [0, 1] (optional)
44
+ - confidence: number in [0, 1] (optional)
45
+ """
46
+
47
+
48
@dataclass(frozen=True)
class RubricTemplate:
    """A rubric plus the ``string.Template`` prompt it is rendered into."""

    # Rubric instructions substituted for $rubric.
    rubric: str
    # Prompt skeleton; may reference $rubric, $case_input and $agent_response.
    template: str = _DEFAULT_RUBRIC

    def render(self, *, case_input: Any, agent_response: str) -> str:
        """Fill the template and return the judge prompt.

        ``case_input`` is JSON-encoded (non-ASCII kept as-is) before
        substitution. Placeholders the template uses beyond the three
        supplied ones are left in place rather than raising.
        """
        values = {
            "rubric": self.rubric,
            "case_input": json.dumps(case_input, ensure_ascii=False),
            "agent_response": agent_response,
        }
        return Template(self.template).safe_substitute(values)
59
+
60
+
61
@dataclass(frozen=True)
class JudgeDecision:
    """Parsed verdict from a judge's JSON reply."""

    label: GradeLabel
    # Justification text for the verdict.
    reason: str
    # Optional numeric fields; None when the judge omitted them.
    score: float | None = None
    confidence: float | None = None
67
+
68
+
69
def _parse_judge_output(text: str) -> JudgeDecision:
    """Parse the judge's raw reply into a ``JudgeDecision``.

    The reply must be a JSON object whose ``label`` is (case-insensitively)
    one of the ``GradeLabel`` values. ``reason`` falls back to a placeholder
    when missing, null or blank. ``score`` and ``confidence`` are optional
    and converted with ``float``.

    Raises:
        ValueError: For every malformed reply — invalid JSON, a non-object
            payload, an unknown label, or a score/confidence that cannot be
            converted to a float. Callers rely on ``ValueError`` being the
            only failure mode, so conversion ``TypeError``s are re-raised
            as ``ValueError``.
    """
    try:
        data = json.loads(text)
    except json.JSONDecodeError as exc:
        raise ValueError(f"judge did not return valid JSON: {exc}; text={text!r}") from exc
    if not isinstance(data, dict):
        raise ValueError(f"judge JSON must be an object, got {type(data).__name__}")

    raw_label = str(data.get("label", "")).strip().lower()
    if raw_label not in {label.value for label in GradeLabel}:
        raise ValueError(f"judge returned unknown label: {raw_label!r}")

    # A JSON null reason must not surface as the literal string "None".
    raw_reason = data.get("reason")
    reason = "" if raw_reason is None else str(raw_reason).strip()

    return JudgeDecision(
        label=GradeLabel(raw_label),
        reason=reason or "no reason supplied",
        score=_coerce_optional_float(data, "score"),
        confidence=_coerce_optional_float(data, "confidence"),
    )


def _coerce_optional_float(data: dict[str, Any], key: str) -> float | None:
    """Return ``data[key]`` as a float, ``None`` if absent/null, else raise ValueError."""
    value = data.get(key)
    if value is None:
        return None
    try:
        return float(value)
    except (TypeError, ValueError) as exc:
        # float() raises TypeError for lists/dicts and ValueError for
        # non-numeric strings; normalise both to one exception type.
        raise ValueError(f"judge returned non-numeric {key}: {value!r}") from exc
89
+
90
+
91
+ def _is_card_calibrated(card: GraderCard | None) -> bool:
92
+ if card is None:
93
+ return True
94
+ if not card.blocking:
95
+ return True
96
+ metrics = card.metrics
97
+ thresholds = card.thresholds
98
+ if thresholds.min_precision is not None and (
99
+ metrics.precision is None or metrics.precision < thresholds.min_precision
100
+ ):
101
+ return False
102
+ if thresholds.min_recall is not None and (
103
+ metrics.recall is None or metrics.recall < thresholds.min_recall
104
+ ):
105
+ return False
106
+ return not (
107
+ thresholds.max_high_risk_false_negatives is not None
108
+ and (
109
+ metrics.high_risk_false_negatives is None
110
+ or metrics.high_risk_false_negatives > thresholds.max_high_risk_false_negatives
111
+ )
112
+ )
113
+
114
+
115
class LLMJudgeGrader(Grader):
    """Grader that delegates the verdict to a judge adapter.

    The rubric is rendered into a prompt, sent to the judge, and the
    judge's JSON reply is parsed into the result. Judge failures never
    propagate: they come back as ERROR results.
    """

    def __init__(
        self,
        name: str,
        *,
        judge_adapter: AgentAdapter,
        rubric: RubricTemplate,
        card: GraderCard | None = None,
        force: bool = False,
    ) -> None:
        """Configure the grader.

        Args:
            name: Identifier reported on every result; must be non-empty.
            judge_adapter: Adapter invoked with the rendered rubric prompt.
            rubric: Template used to build that prompt.
            card: Optional grader card consulted for calibration gating.
            force: When true, skip the calibration gate entirely.

        Raises:
            ValueError: If ``name`` is empty.
        """
        if not name:
            raise ValueError("grader name must be non-empty")
        self.name = name
        self._judge = judge_adapter
        self._rubric = rubric
        self._card = card
        self._force = force

    def _errored(self, reason: str) -> GradeResult:
        """Build an ERROR result carrying ``reason``."""
        return GradeResult(grader=self.name, label=GradeLabel.ERROR, reason=reason)

    def grade(self, context: GraderContext) -> GradeResult:
        """Ask the judge for a verdict on ``context``.

        Returns SKIPPED when the card fails the calibration gate (and
        ``force`` is off), ERROR when the judge call raises or its output
        cannot be parsed, and the judge's own decision otherwise.
        """
        gate_open = self._force or _is_card_calibrated(self._card)
        if not gate_open:
            return GradeResult(
                grader=self.name,
                label=GradeLabel.SKIPPED,
                reason="blocking grader below calibration thresholds; degraded to advisory",
                score=None,
                confidence=None,
                details={"card_state": getattr(self._card, "state", None)},
            )

        prompt = self._rubric.render(
            case_input=context.case.input,
            agent_response=_extract_response_text(context),
        )
        request = _build_judge_request(context, prompt, self.name)

        # Any judge failure is reported as an ERROR grade instead of
        # propagating to the caller.
        try:
            reply = self._judge.invoke(request)
        except Exception as exc:
            return self._errored(f"judge invocation failed: {exc}")

        try:
            decision = _parse_judge_output(reply.content or "")
        except ValueError as exc:
            return self._errored(f"could not parse judge output: {exc}")
        else:
            return GradeResult(
                grader=self.name,
                label=decision.label,
                reason=decision.reason,
                score=decision.score,
                confidence=decision.confidence,
            )
171
+
172
+
173
+ def _extract_response_text(context: GraderContext) -> str:
174
+ if context.response is not None and context.response.content:
175
+ return context.response.content
176
+ return ""
177
+
178
+
179
def _build_judge_request(context: GraderContext, prompt: str, grader_name: str) -> AdapterRequest:
    """Wrap the rendered prompt in an ``AdapterRequest`` for the judge.

    The prompt is sent as a single user message; the request reuses the
    case's workspace and id, and its metadata marks it as a judge call
    made on behalf of ``grader_name``.
    """
    from selfevals.runner.adapters import AdapterRequest  # local import to avoid cycle

    case = context.case
    user_turn = {"role": "user", "content": prompt}
    return AdapterRequest(
        workspace_id=case.workspace_id,
        case_id=case.id,
        input={"messages": [user_turn]},
        metadata={"grader": grader_name, "judge": True},
    )