selfevals 0.2.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- selfevals/.agents/skills/error-analysis/SKILL.md +149 -0
- selfevals/__init__.py +19 -0
- selfevals/_errors.py +44 -0
- selfevals/_internal/__init__.py +0 -0
- selfevals/_internal/hashing.py +23 -0
- selfevals/_internal/ids.py +65 -0
- selfevals/_internal/time.py +17 -0
- selfevals/analysis/__init__.py +23 -0
- selfevals/analysis/bundle.py +162 -0
- selfevals/analysis/hypothesis.py +26 -0
- selfevals/analysis/ingest.py +185 -0
- selfevals/analysis/schemas.py +119 -0
- selfevals/analysis/staging.py +34 -0
- selfevals/api/__init__.py +24 -0
- selfevals/api/__main__.py +47 -0
- selfevals/api/app.py +351 -0
- selfevals/api/broker.py +210 -0
- selfevals/api/broker_bridge.py +29 -0
- selfevals/api/queries.py +447 -0
- selfevals/api/schemas.py +151 -0
- selfevals/api/sse.py +114 -0
- selfevals/cli/__init__.py +15 -0
- selfevals/cli/_friendly.py +180 -0
- selfevals/cli/_help.py +55 -0
- selfevals/cli/analyze_commands.py +169 -0
- selfevals/cli/commands.py +615 -0
- selfevals/cli/main.py +409 -0
- selfevals/decision/__init__.py +34 -0
- selfevals/decision/matrix.py +185 -0
- selfevals/examples/__init__.py +8 -0
- selfevals/examples/evals/datasets/pingpong.jsonl +2 -0
- selfevals/examples/evals/experiments/example_pingpong.yaml +58 -0
- selfevals/examples/pingpong.py +21 -0
- selfevals/graders/__init__.py +46 -0
- selfevals/graders/base.py +54 -0
- selfevals/graders/calibration.py +145 -0
- selfevals/graders/deterministic.py +143 -0
- selfevals/graders/llm_judge.py +187 -0
- selfevals/graders/registry.py +66 -0
- selfevals/optimization/__init__.py +47 -0
- selfevals/optimization/aggregator.py +246 -0
- selfevals/optimization/loop.py +432 -0
- selfevals/optimization/proposers.py +202 -0
- selfevals/py.typed +0 -0
- selfevals/repo/__init__.py +28 -0
- selfevals/repo/loader.py +276 -0
- selfevals/reporter/__init__.py +21 -0
- selfevals/reporter/_metrics.py +114 -0
- selfevals/reporter/compare.py +221 -0
- selfevals/reporter/json_report.py +105 -0
- selfevals/reporter/markdown.py +232 -0
- selfevals/runner/__init__.py +42 -0
- selfevals/runner/adapters.py +268 -0
- selfevals/runner/executor.py +234 -0
- selfevals/runner/otlp_receiver.py +343 -0
- selfevals/runner/otlp_to_recorder.py +180 -0
- selfevals/runner/sandbox.py +46 -0
- selfevals/schemas/__init__.py +213 -0
- selfevals/schemas/_base.py +82 -0
- selfevals/schemas/annotation.py +55 -0
- selfevals/schemas/dataset.py +111 -0
- selfevals/schemas/enums.py +324 -0
- selfevals/schemas/eval_case.py +189 -0
- selfevals/schemas/experiment.py +367 -0
- selfevals/schemas/failure_mode.py +76 -0
- selfevals/schemas/fleet.py +111 -0
- selfevals/schemas/grader_card.py +112 -0
- selfevals/schemas/iteration.py +219 -0
- selfevals/schemas/registry.py +125 -0
- selfevals/schemas/tool.py +43 -0
- selfevals/schemas/trace.py +384 -0
- selfevals/schemas/workspace.py +69 -0
- selfevals/sdk/__init__.py +24 -0
- selfevals/sdk/auto_instrument.py +165 -0
- selfevals/sdk/context.py +45 -0
- selfevals/sdk/exporter.py +50 -0
- selfevals/sdk/facade.py +203 -0
- selfevals/skills/__init__.py +61 -0
- selfevals/storage/__init__.py +53 -0
- selfevals/storage/errors.py +66 -0
- selfevals/storage/filesystem.py +137 -0
- selfevals/storage/interface.py +135 -0
- selfevals/storage/migrations/__init__.py +80 -0
- selfevals/storage/migrations/m0001_initial.py +57 -0
- selfevals/storage/seed.py +199 -0
- selfevals/storage/sqlite.py +232 -0
- selfevals/trace/__init__.py +31 -0
- selfevals/trace/otel_importer.py +455 -0
- selfevals/trace/payload_router.py +106 -0
- selfevals/trace/recorder.py +540 -0
- selfevals/version.py +1 -0
- selfevals-0.2.2.dist-info/METADATA +283 -0
- selfevals-0.2.2.dist-info/RECORD +96 -0
- selfevals-0.2.2.dist-info/WHEEL +4 -0
- selfevals-0.2.2.dist-info/entry_points.txt +2 -0
- selfevals-0.2.2.dist-info/licenses/LICENSE +17 -0
|
@@ -0,0 +1,58 @@
|
|
|
1
|
+
# Example experiment spec.
|
|
2
|
+
#
|
|
3
|
+
# Copy it with:
|
|
4
|
+
# selfevals examples copy pingpong
|
|
5
|
+
# Then run:
|
|
6
|
+
# selfevals run evals/experiments/example_pingpong.yaml --no-persist
|
|
7
|
+
|
|
8
|
+
workspace: ws_01HZZZZZZZZZZZZZZZZZZZZZZZ
|
|
9
|
+
|
|
10
|
+
experiment:
|
|
11
|
+
name: pingpong baseline
|
|
12
|
+
goal: warm up the end-to-end loop with a trivial echo agent
|
|
13
|
+
mode: handoff
|
|
14
|
+
taxonomy:
|
|
15
|
+
target_features:
|
|
16
|
+
- commerce.product_resolution
|
|
17
|
+
dataset_types:
|
|
18
|
+
- capability
|
|
19
|
+
datasets:
|
|
20
|
+
optimization: { id: ds_pingpong, version: 1 }
|
|
21
|
+
target:
|
|
22
|
+
primary: { name: pass@1, operator: ">=", value: 0.5 }
|
|
23
|
+
editable:
|
|
24
|
+
prompt: true
|
|
25
|
+
model_params: true
|
|
26
|
+
frozen:
|
|
27
|
+
fleet: { id: flt_demo }
|
|
28
|
+
agents:
|
|
29
|
+
- { id: ag_demo }
|
|
30
|
+
datasets:
|
|
31
|
+
- { id: ds_pingpong }
|
|
32
|
+
proposer:
|
|
33
|
+
strategy: grid
|
|
34
|
+
search_space:
|
|
35
|
+
model_params:
|
|
36
|
+
level: [0.0, 1.0]
|
|
37
|
+
run:
|
|
38
|
+
sandbox: mock
|
|
39
|
+
max_iterations: 4
|
|
40
|
+
convergence:
|
|
41
|
+
min_delta: 1.0e-6
|
|
42
|
+
patience: 10
|
|
43
|
+
reliability:
|
|
44
|
+
metrics:
|
|
45
|
+
- pass@1
|
|
46
|
+
error_analysis:
|
|
47
|
+
enabled: true
|
|
48
|
+
taxonomy: workspace
|
|
49
|
+
trigger:
|
|
50
|
+
when: fail_rate_above
|
|
51
|
+
threshold: 0.10
|
|
52
|
+
scope: failed_only
|
|
53
|
+
|
|
54
|
+
dataset:
|
|
55
|
+
cases_path: ../datasets/pingpong.jsonl
|
|
56
|
+
|
|
57
|
+
agent:
|
|
58
|
+
entrypoint: selfevals.examples.pingpong:run
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
"""Trivial example agent for the pingpong experiment.
|
|
2
|
+
|
|
3
|
+
It returns "pong" when the proposer cranks `model_params.level >= 0.5`,
|
|
4
|
+
and "miss" otherwise. That lets the grid proposer demonstrate a real
|
|
5
|
+
improvement path from level=0.0 (fail) to level=1.0 (pass).
|
|
6
|
+
|
|
7
|
+
Real agents will replace this with a function that calls Anthropic /
|
|
8
|
+
OpenAI / their framework of choice. The contract is the same:
|
|
9
|
+
|
|
10
|
+
def run(req: AdapterRequest) -> AdapterResponse | str: ...
|
|
11
|
+
"""
|
|
12
|
+
|
|
13
|
+
from __future__ import annotations
|
|
14
|
+
|
|
15
|
+
from selfevals.runner.adapters import AdapterRequest, AdapterResponse
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
def run(req: AdapterRequest) -> AdapterResponse:
    """Answer "pong" once the tuned level reaches 0.5, otherwise "miss".

    The level is read from ``req.parameters["model_params"]["level"]``.
    A missing or null ``model_params`` mapping, or a missing or null
    ``level``, is treated as level 0.0 rather than raising.

    Returns:
        An ``AdapterResponse`` whose content is "pong" or "miss", with
        fixed token counts (4 input, 2 output).
    """
    # `or {}` covers a key that is present but null, which the two-argument
    # `.get(key, {})` form does not.
    model_params = req.parameters.get("model_params") or {}
    level = model_params.get("level")
    if level is None:
        level = 0.0
    content = "pong" if level >= 0.5 else "miss"
    return AdapterResponse(content=content, tokens_input=4, tokens_output=2)
|
|
@@ -0,0 +1,46 @@
|
|
|
1
|
+
"""Graders: score traces against expectations.
|
|
2
|
+
|
|
3
|
+
A `Grader` reads a Trace + EvalCase and returns a `GradeResult` (label
|
|
4
|
+
+ optional score + reason). Two concrete graders ship in MVP:
|
|
5
|
+
|
|
6
|
+
- `DeterministicGrader` evaluates rule-based expectations declared on
|
|
7
|
+
`EvalCase.expected` (must_include / forbidden_tools / regex / schema).
|
|
8
|
+
- `LLMJudgeGrader` invokes an `AgentAdapter` as a judge against a rubric
|
|
9
|
+
prompt; single-judge in MVP, panel infrastructure-ready for post-MVP.
|
|
10
|
+
|
|
11
|
+
Calibration helpers turn observed predictions + human annotations into
|
|
12
|
+
the metrics tracked on a `GraderCard`.
|
|
13
|
+
"""
|
|
14
|
+
|
|
15
|
+
from selfevals.graders.base import GradeLabel, Grader, GraderContext, GradeResult
|
|
16
|
+
from selfevals.graders.calibration import (
|
|
17
|
+
CalibrationReport,
|
|
18
|
+
HumanLabel,
|
|
19
|
+
PredictedLabel,
|
|
20
|
+
compute_classification_metrics,
|
|
21
|
+
)
|
|
22
|
+
from selfevals.graders.deterministic import (
|
|
23
|
+
DeterministicGrader,
|
|
24
|
+
DeterministicRuleViolationError,
|
|
25
|
+
)
|
|
26
|
+
from selfevals.graders.llm_judge import (
|
|
27
|
+
JudgeDecision,
|
|
28
|
+
LLMJudgeGrader,
|
|
29
|
+
RubricTemplate,
|
|
30
|
+
)
|
|
31
|
+
|
|
32
|
+
__all__ = [
|
|
33
|
+
"CalibrationReport",
|
|
34
|
+
"DeterministicGrader",
|
|
35
|
+
"DeterministicRuleViolationError",
|
|
36
|
+
"GradeLabel",
|
|
37
|
+
"GradeResult",
|
|
38
|
+
"Grader",
|
|
39
|
+
"GraderContext",
|
|
40
|
+
"HumanLabel",
|
|
41
|
+
"JudgeDecision",
|
|
42
|
+
"LLMJudgeGrader",
|
|
43
|
+
"PredictedLabel",
|
|
44
|
+
"RubricTemplate",
|
|
45
|
+
"compute_classification_metrics",
|
|
46
|
+
]
|
|
@@ -0,0 +1,54 @@
|
|
|
1
|
+
"""Grader ABC + shared result types.
|
|
2
|
+
|
|
3
|
+
The grader contract is intentionally narrow: receive a context bundle
|
|
4
|
+
(case, trace, optional response) and return a `GradeResult`. The case
|
|
5
|
+
and the trace carry everything needed to score; graders never reach
|
|
6
|
+
into storage themselves.
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
from __future__ import annotations
|
|
10
|
+
|
|
11
|
+
from abc import ABC, abstractmethod
|
|
12
|
+
from dataclasses import dataclass, field
|
|
13
|
+
from enum import StrEnum
|
|
14
|
+
from typing import TYPE_CHECKING, Any
|
|
15
|
+
|
|
16
|
+
if TYPE_CHECKING:
|
|
17
|
+
from selfevals.runner.adapters import AdapterResponse
|
|
18
|
+
from selfevals.schemas.eval_case import EvalCase
|
|
19
|
+
from selfevals.schemas.trace import Trace
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
class GradeLabel(StrEnum):
    """Closed set of outcome labels a grader can attach to a result."""

    PASS = "pass"
    FAIL = "fail"
    PARTIAL = "partial"
    ERROR = "error"
    SKIPPED = "skipped"
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
@dataclass(frozen=True)
class GradeResult:
    """Immutable record of a single grader's verdict."""

    grader: str  # identifier of the grader that produced this result
    label: GradeLabel
    reason: str
    score: float | None = None
    confidence: float | None = None
    # default_factory gives every instance its own list/dict.
    failure_modes: list[str] = field(default_factory=list)
    details: dict[str, Any] = field(default_factory=dict)
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
@dataclass(frozen=True)
class GraderContext:
    """Bundle handed to a grader: the case, its trace, and optionally the response."""

    case: EvalCase
    trace: Trace
    response: AdapterResponse | None = None  # may be absent; graders must handle None
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
class Grader(ABC):
    """Abstract base class for graders; subclasses implement `grade`."""

    name: str
    """Stable identifier — used as the GradeResult.grader and the
    Trace.grader_results[i].grader field."""

    @abstractmethod
    def grade(self, context: GraderContext) -> GradeResult:
        """Score the supplied context and return a `GradeResult`."""
        ...
|
|
@@ -0,0 +1,145 @@
|
|
|
1
|
+
"""Calibration helpers: turn predictions + human labels into classification metrics.
|
|
2
|
+
|
|
3
|
+
These functions live outside any specific grader so they can be reused by
|
|
4
|
+
calibration scripts, dashboards, and the optimizer. They consume two flat
|
|
5
|
+
lists keyed by `case_id` and produce a `CalibrationReport`.
|
|
6
|
+
|
|
7
|
+
Metrics are computed treating `pass` as the positive class by default;
|
|
8
|
+
`positive_label` is configurable. Macro-F1 is averaged across all observed
|
|
9
|
+
labels.
|
|
10
|
+
|
|
11
|
+
A class-imbalance guard: if a label appears in only one of the two streams,
|
|
12
|
+
precision/recall for that class are reported as `None` rather than zero —
|
|
13
|
+
this avoids the "100% precision on a class with 0 predictions" trap.
|
|
14
|
+
"""
|
|
15
|
+
|
|
16
|
+
from __future__ import annotations
|
|
17
|
+
|
|
18
|
+
from collections import Counter
|
|
19
|
+
from collections.abc import Mapping
|
|
20
|
+
from dataclasses import dataclass, field
|
|
21
|
+
|
|
22
|
+
from selfevals.graders.base import GradeLabel
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
@dataclass(frozen=True)
class PredictedLabel:
    """A grader's predicted label for one case."""

    case_id: str  # join key used to pair predictions with human labels
    label: GradeLabel
    confidence: float | None = None
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
@dataclass(frozen=True)
class HumanLabel:
    """A human annotation for one case."""

    case_id: str  # join key used to pair human labels with predictions
    label: GradeLabel
    # When set, a positive human label that the prediction misses is counted
    # as a high-risk false negative.
    high_risk: bool = False
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
@dataclass(frozen=True)
class CalibrationReport:
    """Classification metrics computed over paired predictions and human labels."""

    n_pairs: int  # number of case_ids present in both input streams
    # precision / recall / f1 describe the configured positive label only.
    precision: float | None
    recall: float | None
    f1: float | None
    macro_f1: float | None  # mean of the per-label F1 values that are defined
    accuracy: float
    high_risk_false_negatives: int
    per_label_precision: dict[str, float | None] = field(default_factory=dict)
    per_label_recall: dict[str, float | None] = field(default_factory=dict)
    # Keyed by (human label, predicted label) -> number of paired cases.
    confusion: dict[tuple[str, str], int] = field(default_factory=dict)
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
def _safe_div(num: float, den: float) -> float | None:
    """Return ``num / den``, or ``None`` when the denominator is zero."""
    return None if den == 0 else num / den
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
def _f1(p: float | None, r: float | None) -> float | None:
    """Return the harmonic mean of precision ``p`` and recall ``r``.

    ``None`` is returned only when either input is undefined (``None``).
    When both are defined but sum to zero the class was never predicted
    correctly, so F1 is 0.0. Reporting ``None`` there would drop the class
    from any average built on top of this helper and overstate the result.
    """
    if p is None or r is None:
        return None
    total = p + r
    if total == 0:
        # Defined-but-zero precision and recall: a genuine F1 of zero.
        return 0.0
    return 2 * p * r / total
|
|
63
|
+
|
|
64
|
+
|
|
65
|
+
def compute_classification_metrics(
    predictions: list[PredictedLabel],
    human_labels: list[HumanLabel],
    *,
    positive_label: GradeLabel = GradeLabel.PASS,
) -> CalibrationReport:
    """Build a `CalibrationReport` from predictions and human labels.

    The two lists are joined on ``case_id``; if a ``case_id`` repeats within
    one list, the last entry wins. Cases present in only one list are
    ignored. With no overlapping cases the report has ``n_pairs=0``, all
    optional metrics ``None`` and ``accuracy=0.0``.

    ``precision``/``recall``/``f1`` refer to ``positive_label``; per-label
    values and the confusion counts cover every label seen in the pairs. A
    high-risk false negative is a pair whose human label is the positive
    label, whose prediction is not, and whose human entry is flagged
    ``high_risk``.
    """
    predicted = {item.case_id: item for item in predictions}
    annotated = {item.case_id: item for item in human_labels}
    shared_ids = sorted(predicted.keys() & annotated.keys())
    total = len(shared_ids)

    if total == 0:
        return CalibrationReport(
            n_pairs=0,
            precision=None,
            recall=None,
            f1=None,
            macro_f1=None,
            accuracy=0.0,
            high_risk_false_negatives=0,
        )

    target = positive_label.value

    # Confusion counts are keyed (human label, predicted label).
    matrix: Counter[tuple[str, str]] = Counter()
    hits = 0
    risky_misses = 0
    for cid in shared_ids:
        guess = predicted[cid].label.value
        truth_entry = annotated[cid]
        truth = truth_entry.label.value
        matrix[(truth, guess)] += 1
        if guess == truth:
            hits += 1
        if truth == target and guess != target and truth_entry.high_risk:
            risky_misses += 1

    # Row totals (per human label) and column totals (per predicted label).
    actual_totals: Counter[str] = Counter()
    predicted_totals: Counter[str] = Counter()
    for (truth, guess), count in matrix.items():
        actual_totals[truth] += count
        predicted_totals[guess] += count

    precision_by_label: dict[str, float | None] = {}
    recall_by_label: dict[str, float | None] = {}
    defined_f1: list[float] = []
    for name in sorted(set(actual_totals) | set(predicted_totals)):
        true_pos = matrix[(name, name)]
        label_precision = _safe_div(true_pos, predicted_totals[name])
        label_recall = _safe_div(true_pos, actual_totals[name])
        precision_by_label[name] = label_precision
        recall_by_label[name] = label_recall
        label_f1 = _f1(label_precision, label_recall)
        if label_f1 is not None:
            defined_f1.append(label_f1)

    if defined_f1:
        macro = sum(defined_f1) / len(defined_f1)
    else:
        macro = None

    target_precision = precision_by_label.get(target)
    target_recall = recall_by_label.get(target)

    return CalibrationReport(
        n_pairs=total,
        precision=target_precision,
        recall=target_recall,
        f1=_f1(target_precision, target_recall),
        macro_f1=macro,
        accuracy=hits / total,
        high_risk_false_negatives=risky_misses,
        per_label_precision=precision_by_label,
        per_label_recall=recall_by_label,
        confusion=dict(matrix),
    )
|
|
@@ -0,0 +1,143 @@
|
|
|
1
|
+
"""DeterministicGrader: declarative rules from EvalCase.expected.
|
|
2
|
+
|
|
3
|
+
Rules supported in MVP:
|
|
4
|
+
- must_include: every string must appear in the final response (case-
|
|
5
|
+
insensitive by default; controllable via constructor flag).
|
|
6
|
+
- must_not_include: none of the strings may appear.
|
|
7
|
+
- required_tools: every tool listed must appear in the trace.
|
|
8
|
+
- forbidden_tools: no tool listed may appear.
|
|
9
|
+
- regex_match: optional regex applied to the final response.
|
|
10
|
+
- structured_output equality: when EvalCase.expected.structured_output
|
|
11
|
+
is set, the adapter's structured_output must match exactly.
|
|
12
|
+
|
|
13
|
+
Each rule has a stable failure-mode tag emitted in GradeResult.failure_modes
|
|
14
|
+
so weighted scoring can attribute failures upstream.
|
|
15
|
+
"""
|
|
16
|
+
|
|
17
|
+
from __future__ import annotations
|
|
18
|
+
|
|
19
|
+
import re
|
|
20
|
+
from dataclasses import dataclass
|
|
21
|
+
from typing import TYPE_CHECKING
|
|
22
|
+
|
|
23
|
+
from selfevals.graders.base import GradeLabel, Grader, GraderContext, GradeResult
|
|
24
|
+
from selfevals.schemas.trace import LLMCallSpan, ToolCallSpan
|
|
25
|
+
|
|
26
|
+
if TYPE_CHECKING:
|
|
27
|
+
from selfevals.schemas.eval_case import Expected
|
|
28
|
+
from selfevals.schemas.trace import Trace
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
# NOTE(review): nothing in this module raises this error; presumably it is
# reserved for callers or future rule-set validation — confirm.
class DeterministicRuleViolationError(RuntimeError):
    """Raised if the grader is asked to evaluate a contradictory rule set."""
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
@dataclass(frozen=True)
class _Violation:
    """One failed rule: its failure-mode tag and the offending value."""

    failure_mode: str  # stable tag surfaced in GradeResult.failure_modes
    detail: str  # the substring, tool name, pattern, or note that failed
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
def _final_response_text(ctx: GraderContext) -> str:
    """Return the text the substring and regex rules are checked against.

    Uses the response's ``content`` when it is non-empty; otherwise falls
    back to a string stored under ``"content"`` in the structured output.
    Returns an empty string when neither is available.
    """
    response = ctx.response
    if response is None:
        return ""
    if response.content:
        return response.content
    structured = response.structured_output
    if structured is not None:
        fallback = structured.get("content")
        if isinstance(fallback, str):
            return fallback
    return ""
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
def _tools_invoked(trace: Trace) -> list[str]:
    """List the tool name of every tool-call span, in trace order."""
    names: list[str] = []
    for span in trace.spans:
        if isinstance(span, ToolCallSpan):
            names.append(span.tool_name)
    return names
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
def _llm_call_count(trace: Trace) -> int:
    """Count the LLM-call spans recorded in the trace."""
    llm_spans = [span for span in trace.spans if isinstance(span, LLMCallSpan)]
    return len(llm_spans)
|
|
58
|
+
|
|
59
|
+
|
|
60
|
+
class DeterministicGrader(Grader):
    """Rule-based grader driven by the expectations declared on the case.

    Substring rules are compared case-insensitively unless
    ``case_sensitive`` is set. The optional ``regex_match`` is searched
    against the unmodified response text.
    """

    def __init__(
        self,
        name: str = "deterministic",
        *,
        case_sensitive: bool = False,
        regex_match: re.Pattern[str] | str | None = None,
    ) -> None:
        """Configure the grader.

        Raises:
            ValueError: if ``name`` is empty.
        """
        if not name:
            raise ValueError("grader name must be non-empty")
        self.name = name
        self._case_sensitive = case_sensitive
        # Accept either a ready-made pattern or a pattern string to compile.
        self._regex: re.Pattern[str] | None = (
            re.compile(regex_match) if isinstance(regex_match, str) else regex_match
        )

    def grade(self, context: GraderContext) -> GradeResult:
        """Check every rule and return PASS (score 1.0) or FAIL (score 0.0).

        A FAIL carries the sorted, de-duplicated failure-mode tags and a
        reason listing each violation in rule order. Both outcomes include
        the invoked tool names and the LLM call count in ``details``.
        """
        rules: Expected = context.case.expected

        response_text = _final_response_text(context)
        comparable = response_text if self._case_sensitive else response_text.lower()
        tool_names = _tools_invoked(context.trace)
        used_tools = set(tool_names)

        def normalise(fragment: str) -> str:
            # Mirror the casing treatment applied to the response text.
            return fragment if self._case_sensitive else fragment.lower()

        found: list[_Violation] = [
            _Violation(failure_mode="missing_required_substring", detail=fragment)
            for fragment in rules.must_include
            if normalise(fragment) not in comparable
        ]
        found += [
            _Violation(failure_mode="forbidden_substring", detail=fragment)
            for fragment in rules.must_not_include
            if normalise(fragment) in comparable
        ]
        found += [
            _Violation(failure_mode="missing_required_tool", detail=tool)
            for tool in rules.required_tools
            if tool not in used_tools
        ]
        found += [
            _Violation(failure_mode="forbidden_tool_invoked", detail=tool)
            for tool in rules.forbidden_tools
            if tool in used_tools
        ]

        pattern = self._regex
        if pattern is not None and not pattern.search(response_text):
            found.append(_Violation(failure_mode="regex_mismatch", detail=pattern.pattern))

        if rules.structured_output is not None:
            actual_struct = context.response.structured_output if context.response else None
            if actual_struct != rules.structured_output:
                found.append(
                    _Violation(
                        failure_mode="structured_output_mismatch",
                        detail="expected != actual",
                    )
                )

        # Debug hints only; they never affect the verdict.
        details = {
            "tools_invoked": tool_names,
            "llm_call_count": _llm_call_count(context.trace),
        }

        if found:
            return GradeResult(
                grader=self.name,
                label=GradeLabel.FAIL,
                reason="; ".join(f"{item.failure_mode}:{item.detail}" for item in found),
                score=0.0,
                failure_modes=sorted({item.failure_mode for item in found}),
                details=details,
            )
        return GradeResult(
            grader=self.name,
            label=GradeLabel.PASS,
            reason="all deterministic rules satisfied",
            score=1.0,
            details=details,
        )
|
|
@@ -0,0 +1,187 @@
|
|
|
1
|
+
"""LLMJudgeGrader: invoke an AgentAdapter as a judge.
|
|
2
|
+
|
|
3
|
+
The grader formats a rubric prompt with the case input, the agent's final
|
|
4
|
+
response, and the rubric instructions; the judge adapter returns a JSON
|
|
5
|
+
response with `label`, `reason`, optional `score`, optional `confidence`.
|
|
6
|
+
|
|
7
|
+
MVP ships single-judge. The constructor accepts an optional `card` (a
|
|
8
|
+
`GraderCard`) so future panel infra can pin behavior to a calibrated
|
|
9
|
+
configuration. When `card.blocking` and calibration metrics are below
|
|
10
|
+
thresholds, the grader degrades to advisory (returns SKIPPED) per the
|
|
11
|
+
operational spec — unless `force=True` is set.
|
|
12
|
+
"""
|
|
13
|
+
|
|
14
|
+
from __future__ import annotations
|
|
15
|
+
|
|
16
|
+
import json
|
|
17
|
+
from dataclasses import dataclass
|
|
18
|
+
from string import Template
|
|
19
|
+
from typing import TYPE_CHECKING, Any
|
|
20
|
+
|
|
21
|
+
from selfevals.graders.base import GradeLabel, Grader, GraderContext, GradeResult
|
|
22
|
+
|
|
23
|
+
if TYPE_CHECKING:
|
|
24
|
+
from selfevals.runner.adapters import AdapterRequest, AgentAdapter
|
|
25
|
+
from selfevals.schemas.grader_card import GraderCard
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
_DEFAULT_RUBRIC = """You are an evaluator. Read the case input and the agent's response,
|
|
29
|
+
then decide whether the response meets the rubric.
|
|
30
|
+
|
|
31
|
+
Rubric:
|
|
32
|
+
$rubric
|
|
33
|
+
|
|
34
|
+
Case input:
|
|
35
|
+
$case_input
|
|
36
|
+
|
|
37
|
+
Agent response:
|
|
38
|
+
$agent_response
|
|
39
|
+
|
|
40
|
+
Return a single JSON object with keys:
|
|
41
|
+
- label: one of "pass", "fail", "partial"
|
|
42
|
+
- reason: short justification
|
|
43
|
+
- score: number in [0, 1] (optional)
|
|
44
|
+
- confidence: number in [0, 1] (optional)
|
|
45
|
+
"""
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
@dataclass(frozen=True)
class RubricTemplate:
    """Rubric text plus the ``string.Template`` prompt it is inserted into."""

    rubric: str
    template: str = _DEFAULT_RUBRIC

    def render(self, *, case_input: Any, agent_response: str) -> str:
        """Fill the template and return the judge prompt.

        ``case_input`` is embedded as JSON (non-ASCII characters kept as-is).
        Placeholders the template uses beyond ``rubric``, ``case_input`` and
        ``agent_response`` are left untouched by ``safe_substitute``.
        """
        values = {
            "rubric": self.rubric,
            "case_input": json.dumps(case_input, ensure_ascii=False),
            "agent_response": agent_response,
        }
        return Template(self.template).safe_substitute(values)
|
|
59
|
+
|
|
60
|
+
|
|
61
|
+
@dataclass(frozen=True)
class JudgeDecision:
    """Parsed form of the judge's JSON reply."""

    label: GradeLabel
    reason: str
    score: float | None = None  # optional in the judge's reply
    confidence: float | None = None  # optional in the judge's reply
|
|
67
|
+
|
|
68
|
+
|
|
69
|
+
def _parse_judge_output(text: str) -> JudgeDecision:
    """Parse the judge's reply into a `JudgeDecision`.

    The reply must be a JSON object whose ``label`` (case- and
    whitespace-insensitive) is a `GradeLabel` value. ``reason`` defaults to
    "no reason supplied"; ``score`` and ``confidence`` are optional numbers.

    Raises:
        ValueError: for invalid JSON, a non-object payload, an unknown
            label, or a ``score``/``confidence`` that cannot be converted to
            ``float``. Callers catch ``ValueError`` only, so a non-scalar
            value such as a list is reported as ``ValueError`` instead of
            leaking the ``TypeError`` that ``float()`` raises for it.
    """
    try:
        data = json.loads(text)
    except json.JSONDecodeError as exc:
        raise ValueError(f"judge did not return valid JSON: {exc}; text={text!r}") from exc
    if not isinstance(data, dict):
        raise ValueError(f"judge JSON must be an object, got {type(data).__name__}")

    raw_label = str(data.get("label", "")).strip().lower()
    if raw_label not in {label.value for label in GradeLabel}:
        raise ValueError(f"judge returned unknown label: {raw_label!r}")
    label = GradeLabel(raw_label)
    reason = str(data.get("reason", "")).strip() or "no reason supplied"

    def _optional_float(key: str) -> float | None:
        # Convert an optional numeric field, normalising failures to ValueError.
        value = data.get(key)
        if value is None:
            return None
        try:
            return float(value)
        except (TypeError, ValueError) as exc:
            raise ValueError(f"judge returned non-numeric {key}: {value!r}") from exc

    return JudgeDecision(
        label=label,
        reason=reason,
        score=_optional_float("score"),
        confidence=_optional_float("confidence"),
    )
|
|
89
|
+
|
|
90
|
+
|
|
91
|
+
def _is_card_calibrated(card: GraderCard | None) -> bool:
    """Report whether the card's metrics satisfy its thresholds.

    A missing card or a non-blocking card always counts as calibrated. For a
    blocking card, each threshold that is set must be met by a recorded
    metric: precision and recall at or above their minimums, high-risk false
    negatives at or below the maximum. A metric that is ``None`` while its
    threshold is set fails the check.
    """
    if card is None or not card.blocking:
        return True

    observed = card.metrics
    limits = card.thresholds

    precision_floor = limits.min_precision
    if precision_floor is not None:
        if observed.precision is None or observed.precision < precision_floor:
            return False

    recall_floor = limits.min_recall
    if recall_floor is not None:
        if observed.recall is None or observed.recall < recall_floor:
            return False

    miss_ceiling = limits.max_high_risk_false_negatives
    if miss_ceiling is None:
        return True
    misses = observed.high_risk_false_negatives
    return misses is not None and not (misses > miss_ceiling)
|
|
113
|
+
|
|
114
|
+
|
|
115
|
+
class LLMJudgeGrader(Grader):
    """Grader that asks a judge adapter to score the response against a rubric."""

    def __init__(
        self,
        name: str,
        *,
        judge_adapter: AgentAdapter,
        rubric: RubricTemplate,
        card: GraderCard | None = None,
        force: bool = False,
    ) -> None:
        """Store the judge, rubric and optional calibration card.

        ``force`` bypasses the calibration gate applied in `grade`.

        Raises:
            ValueError: if ``name`` is empty.
        """
        if not name:
            raise ValueError("grader name must be non-empty")
        self.name = name
        self._judge = judge_adapter
        self._rubric = rubric
        self._card = card
        self._force = force

    def grade(self, context: GraderContext) -> GradeResult:
        """Run the judge and translate its reply into a `GradeResult`.

        Returns SKIPPED when the card fails the calibration check and
        ``force`` is off, ERROR when the judge call raises or its output
        cannot be parsed, and otherwise the judge's own label, reason, score
        and confidence.
        """
        gate_open = self._force or _is_card_calibrated(self._card)
        if not gate_open:
            return GradeResult(
                grader=self.name,
                label=GradeLabel.SKIPPED,
                reason="blocking grader below calibration thresholds; degraded to advisory",
                score=None,
                confidence=None,
                details={"card_state": getattr(self._card, "state", None)},
            )

        rendered = self._rubric.render(
            case_input=context.case.input,
            agent_response=_extract_response_text(context),
        )
        judge_request = _build_judge_request(context, rendered, self.name)

        # Any adapter failure becomes an ERROR result rather than aborting the run.
        try:
            judge_reply = self._judge.invoke(judge_request)
        except Exception as exc:
            return GradeResult(
                grader=self.name,
                label=GradeLabel.ERROR,
                reason=f"judge invocation failed: {exc}",
            )

        try:
            verdict = _parse_judge_output(judge_reply.content or "")
        except ValueError as exc:
            return GradeResult(
                grader=self.name,
                label=GradeLabel.ERROR,
                reason=f"could not parse judge output: {exc}",
            )

        return GradeResult(
            grader=self.name,
            label=verdict.label,
            reason=verdict.reason,
            score=verdict.score,
            confidence=verdict.confidence,
        )
|
|
171
|
+
|
|
172
|
+
|
|
173
|
+
def _extract_response_text(context: GraderContext) -> str:
    """Return the response content, or "" when there is no response or no content."""
    reply = context.response
    if reply is None:
        return ""
    return reply.content or ""
|
|
177
|
+
|
|
178
|
+
|
|
179
|
+
def _build_judge_request(context: GraderContext, prompt: str, grader_name: str) -> AdapterRequest:
    """Wrap the rendered prompt in an `AdapterRequest` addressed to the judge.

    The prompt is sent as a single user message; the workspace and case ids
    are copied from the case, and the metadata records the grader name and
    marks the request as a judge call.
    """
    # Imported here, not at module level, to avoid an import cycle.
    from selfevals.runner.adapters import AdapterRequest

    case = context.case
    user_turn = {"role": "user", "content": prompt}
    return AdapterRequest(
        workspace_id=case.workspace_id,
        case_id=case.id,
        input={"messages": [user_turn]},
        metadata={"grader": grader_name, "judge": True},
    )
|