structured-eval 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- structured_eval/__init__.py +27 -0
- structured_eval/alignment/__init__.py +15 -0
- structured_eval/alignment/base.py +40 -0
- structured_eval/alignment/by_index.py +24 -0
- structured_eval/alignment/by_key.py +73 -0
- structured_eval/alignment/factory.py +28 -0
- structured_eval/alignment/hungarian.py +156 -0
- structured_eval/api.py +79 -0
- structured_eval/engine/__init__.py +15 -0
- structured_eval/engine/aggregator.py +96 -0
- structured_eval/engine/evaluator.py +72 -0
- structured_eval/engine/metric_runner.py +69 -0
- structured_eval/engine/parser.py +42 -0
- structured_eval/engine/report_builder.py +68 -0
- structured_eval/engine/tree_builder.py +319 -0
- structured_eval/formats/__init__.py +5 -0
- structured_eval/formats/base.py +19 -0
- structured_eval/formats/json_parser.py +44 -0
- structured_eval/formats/yaml_parser.py +24 -0
- structured_eval/integrations/__init__.py +11 -0
- structured_eval/integrations/_adapter.py +47 -0
- structured_eval/integrations/deepeval.py +74 -0
- structured_eval/integrations/langsmith.py +90 -0
- structured_eval/metrics/__init__.py +101 -0
- structured_eval/metrics/array_accuracy.py +28 -0
- structured_eval/metrics/array_cardinality.py +27 -0
- structured_eval/metrics/array_exact_match.py +48 -0
- structured_eval/metrics/array_f1.py +34 -0
- structured_eval/metrics/array_jaccard_similarity.py +60 -0
- structured_eval/metrics/array_precision.py +34 -0
- structured_eval/metrics/array_prf1.py +40 -0
- structured_eval/metrics/array_recall.py +33 -0
- structured_eval/metrics/base.py +144 -0
- structured_eval/metrics/character_f1.py +50 -0
- structured_eval/metrics/composite_score.py +46 -0
- structured_eval/metrics/coverage_leaf_score.py +29 -0
- structured_eval/metrics/date_distance_score.py +63 -0
- structured_eval/metrics/exact.py +21 -0
- structured_eval/metrics/exponential_numeric_score.py +47 -0
- structured_eval/metrics/field_faithfulness.py +38 -0
- structured_eval/metrics/fuzzy.py +64 -0
- structured_eval/metrics/invoker.py +90 -0
- structured_eval/metrics/levenshtein.py +16 -0
- structured_eval/metrics/mean_score.py +31 -0
- structured_eval/metrics/numeric.py +83 -0
- structured_eval/metrics/numeric_closeness.py +35 -0
- structured_eval/metrics/object_accuracy.py +47 -0
- structured_eval/metrics/object_exact_match.py +41 -0
- structured_eval/metrics/object_f1.py +47 -0
- structured_eval/metrics/object_precision.py +49 -0
- structured_eval/metrics/object_prf1.py +51 -0
- structured_eval/metrics/object_recall.py +44 -0
- structured_eval/metrics/object_type_validity.py +34 -0
- structured_eval/metrics/overall_leaf_score.py +32 -0
- structured_eval/metrics/presence.py +22 -0
- structured_eval/metrics/regex_match.py +51 -0
- structured_eval/metrics/rule_pass_rate/__init__.py +5 -0
- structured_eval/metrics/rule_pass_rate/dsl.py +224 -0
- structured_eval/metrics/rule_pass_rate/engine.py +24 -0
- structured_eval/metrics/rule_pass_rate/metric.py +33 -0
- structured_eval/metrics/schema_validity/__init__.py +7 -0
- structured_eval/metrics/schema_validity/metric.py +35 -0
- structured_eval/metrics/schema_validity/validator.py +119 -0
- structured_eval/metrics/structural_similarity.py +40 -0
- structured_eval/metrics/token_f1.py +44 -0
- structured_eval/metrics/type_match.py +35 -0
- structured_eval/metrics/utils/__init__.py +10 -0
- structured_eval/metrics/utils/array.py +31 -0
- structured_eval/metrics/utils/calculate.py +72 -0
- structured_eval/metrics/utils/number.py +46 -0
- structured_eval/metrics/utils/object_utils.py +87 -0
- structured_eval/models/__init__.py +72 -0
- structured_eval/models/config.py +124 -0
- structured_eval/models/context.py +25 -0
- structured_eval/models/metric_result.py +121 -0
- structured_eval/models/nodes/__init__.py +13 -0
- structured_eval/models/nodes/array_node.py +32 -0
- structured_eval/models/nodes/base.py +113 -0
- structured_eval/models/nodes/object_node.py +19 -0
- structured_eval/models/nodes/scalar.py +14 -0
- structured_eval/models/result.py +361 -0
- structured_eval/models/sample.py +19 -0
- structured_eval/py.typed +0 -0
- structured_eval/reporting/__init__.py +5 -0
- structured_eval/reporting/console.py +194 -0
- structured_eval/utils/__init__.py +16 -0
- structured_eval/utils/flatten.py +66 -0
- structured_eval/utils/paths.py +58 -0
- structured_eval/utils/structured_diff.py +159 -0
- structured_eval-0.1.0.dist-info/METADATA +322 -0
- structured_eval-0.1.0.dist-info/RECORD +94 -0
- structured_eval-0.1.0.dist-info/WHEEL +5 -0
- structured_eval-0.1.0.dist-info/licenses/LICENSE +201 -0
- structured_eval-0.1.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,361 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from enum import StrEnum
|
|
4
|
+
from pathlib import Path
|
|
5
|
+
from statistics import mean
|
|
6
|
+
from typing import Any
|
|
7
|
+
|
|
8
|
+
from pydantic import BaseModel, ConfigDict, Field
|
|
9
|
+
|
|
10
|
+
from structured_eval.models.metric_result import ( # noqa: TC001
|
|
11
|
+
MetricCollection,
|
|
12
|
+
MetricResult,
|
|
13
|
+
)
|
|
14
|
+
from structured_eval.models.nodes.array_node import ArrayMatchResult # noqa: TC001
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
class NodeType(StrEnum):
|
|
18
|
+
"""The kind of tree node a ``FieldScore`` describes."""
|
|
19
|
+
|
|
20
|
+
SCALAR = "scalar"
|
|
21
|
+
OBJECT = "object"
|
|
22
|
+
ARRAY = "array"
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
def _percentile(values: list[float], q: float) -> float:
|
|
26
|
+
"""Linear-interpolation percentile (q in [0, 1]) over a non-empty list."""
|
|
27
|
+
ordered = sorted(values)
|
|
28
|
+
if len(ordered) == 1:
|
|
29
|
+
return ordered[0]
|
|
30
|
+
pos = q * (len(ordered) - 1)
|
|
31
|
+
lo = int(pos)
|
|
32
|
+
hi = min(lo + 1, len(ordered) - 1)
|
|
33
|
+
frac = pos - lo
|
|
34
|
+
return ordered[lo] + (ordered[hi] - ordered[lo]) * frac
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
# ── Warnings ────────────────────────────────────────────────────────────────
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
class WarningType(StrEnum):
|
|
41
|
+
"""The kind of structural warning the engine raised while building the tree."""
|
|
42
|
+
|
|
43
|
+
EXTRA_KEY = (
|
|
44
|
+
"extra_key" # key present in actual but not expected (ExtraKeysPolicy.IGNORE)
|
|
45
|
+
)
|
|
46
|
+
MISSING_FIELD = "missing_field" # key present in expected but absent in actual
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
class EvalWarning(BaseModel):
    """One structural warning: its ``WarningType``, the ``path`` it concerns,
    and an optional human-readable ``message``.
    """

    type: WarningType
    path: str
    message: str = ""

    def __str__(self) -> str:
        """Format as ``[TYPE_NAME] <message>``, using the path when no message is set."""
        detail = self.message or self.path
        return f"[{self.type.name}] {detail}"
|
|
59
|
+
|
|
60
|
+
|
|
61
|
+
# ── Rules ─────────────────────────────────────────────────────────────────────
|
|
62
|
+
|
|
63
|
+
|
|
64
|
+
class RuleResult(BaseModel):
    """Outcome of evaluating one business rule."""

    # Rule identifier.
    name: str
    # Whether the rule passed.
    passed: bool
    # Optional explanatory text; empty by default.
    message: str = ""
|
|
70
|
+
|
|
71
|
+
|
|
72
|
+
# ── Per-path score ────────────────────────────────────────────────────────────
|
|
73
|
+
|
|
74
|
+
|
|
75
|
+
class FieldScore(BaseModel):
    """Evaluation result for a single tree node, addressed by a flat dot-notation path.

    ``metrics`` contains only the metrics that were requested and applied to
    this node (e.g. ``{"exact_match": 0.0, "token_f1": 0.62}``). Every value is
    a ``MetricResult`` — a ``float`` that additionally carries ``.extra``
    (structured detail the metric chose to surface). ``score`` is the key
    metric's value at this path and ``threshold`` is the bar applied to it.
    """

    model_config = ConfigDict(arbitrary_types_allowed=True)

    path: str
    node_type: NodeType
    actual: Any = None
    expected: Any = None
    metrics: dict[str, MetricResult] = Field(default_factory=dict)
    # TODO: Should be required by default key metric
    # None means no key metric was applied to this node.
    score: float | None = None
    # TODO: Reconsider default arguments - all possible should be defined in model as defaults
    # None means no explicit bar was set for this node.
    threshold: float | None = None
|
|
96
|
+
|
|
97
|
+
|
|
98
|
+
# ── Regression diff ─────────────────────────────────────────────────────────
|
|
99
|
+
|
|
100
|
+
|
|
101
|
+
class RegressionDiff(BaseModel):
    """Metric deltas between two ``EvalReport`` objects, computed as self minus other.

    ``deltas`` holds the per-metric change in the aggregate, while
    ``field_deltas`` maps every field path to that field's own per-metric
    changes. A positive number means improvement.
    """

    # metric name -> change in the aggregate value
    deltas: dict[str, float] = Field(default_factory=dict)
    # field path -> {metric name -> change at that field}
    field_deltas: dict[str, dict[str, float]] = Field(default_factory=dict)
|
|
110
|
+
|
|
111
|
+
|
|
112
|
+
# ── Eval report ───────────────────────────────────────────────────────────────
|
|
113
|
+
|
|
114
|
+
|
|
115
|
+
class EvalReport(BaseModel):
    """Full evaluation result for a single document.

    ``metrics`` maps each metric name to a ``MetricCollection`` — its value at
    every node that produced it, plus that metric's structured detail (schema
    errors, hallucinated paths, per-rule outcomes, …) on each value's ``.extra``.
    ``field_scores`` is a flat map of every tree node keyed by its path. On a
    parse error, ``parse_error`` is True and the metrics are left empty.
    """

    model_config = ConfigDict(arbitrary_types_allowed=True)

    score: float | None = None
    score_label: str | None = None
    metrics: dict[str, MetricCollection] = Field(default_factory=dict)
    field_scores: dict[str, FieldScore] = Field(default_factory=dict)
    array_matches: dict[str, ArrayMatchResult] = Field(default_factory=dict)
    parse_error: bool = False
    parse_error_message: str | None = None
    warnings: list[EvalWarning] = Field(default_factory=list)

    # ── Queries ───────────────────────────────────────────────────────────

    def failed_fields(self, threshold: float | None = None) -> dict[str, FieldScore]:
        """Return fields whose score falls below the applicable threshold.

        Keyed by field path (the same keys as ``field_scores``). Precedence per
        field: the ``threshold`` argument, else the field's own ``threshold``,
        else a perfect-match bar of 1.0. Fields without a score (no key metric
        applied) are skipped.

        Args:
            threshold: Bar applied to every field; ``None`` defers to each
                field's own threshold.

        Returns:
            The failing ``FieldScore`` objects, in ``field_scores`` order.
        """
        failed: dict[str, FieldScore] = {}
        for path, fs in self.field_scores.items():
            if fs.score is None:
                continue
            bar = threshold if threshold is not None else fs.threshold
            if bar is None:
                bar = 1.0
            if fs.score < bar:
                failed[path] = fs
        return failed

    # ── Reporting / serialization ─────────────────────────────────────────

    def print_summary(self) -> None:
        """Print a field-level summary table to stdout."""
        # Imported here rather than at module top: the console renderer
        # imports this module.
        from structured_eval.reporting import render

        print(render(self))  # noqa: T201

    def to_dict(self) -> dict[str, Any]:
        """Return a JSON-friendly dict of the full report."""
        return self.model_dump(mode="json")

    def to_json(self, path: str | Path) -> None:
        """Serialize the report to a UTF-8 JSON file (2-space indent).

        Args:
            path: Destination file; an existing file is overwritten.
        """
        Path(path).write_text(self.model_dump_json(indent=2), encoding="utf-8")

    @classmethod
    def from_dict(cls, data: dict[str, Any]) -> EvalReport:
        """Reconstruct a report from a dict produced by ``to_dict``."""
        return cls.model_validate(data)

    @classmethod
    def from_json(cls, path: str | Path) -> EvalReport:
        """Load a report from a UTF-8 JSON file.

        Args:
            path: File to read.
        """
        return cls.model_validate_json(Path(path).read_text(encoding="utf-8"))

    def diff_from(
        self, other: EvalReport, metrics: list[str] | None = None
    ) -> RegressionDiff:
        """Compute metric deltas relative to ``other`` (self minus other).

        ``deltas`` covers document-level metrics present in both reports (or the
        subset named in ``metrics``); ``field_deltas`` covers per-field metrics
        for paths present in both. Positive means improvement.

        Args:
            other: The baseline report to subtract.
            metrics: Metric names to compare; ``None`` means every metric of
                this report. Names missing from either report are skipped
                silently.

        Returns:
            A ``RegressionDiff``. A field appears in ``field_deltas`` only if
            at least one metric (or the key-metric ``"score"``) is available on
            both sides.
        """
        names = metrics if metrics is not None else sorted(self.metrics)
        deltas = {
            name: self.metrics[name].representative()
            - other.metrics[name].representative()
            for name in names
            if name in self.metrics and name in other.metrics
        }

        field_deltas: dict[str, dict[str, float]] = {}
        for path, fs in self.field_scores.items():
            other_fs = other.field_scores.get(path)
            if other_fs is None:
                continue
            per: dict[str, float] = {
                m: fs.metrics[m] - other_fs.metrics[m]
                for m in fs.metrics
                if m in other_fs.metrics
            }
            # The key-metric score is reported under the reserved name "score".
            if fs.score is not None and other_fs.score is not None:
                per["score"] = fs.score - other_fs.score
            if per:
                field_deltas[path] = per

        return RegressionDiff(deltas=deltas, field_deltas=field_deltas)

    # ── Assertions (pytest-style: raise AssertionError, else None) ────────

    def assert_no_parse_errors(self) -> None:
        """Fail if the document could not be parsed.

        Raises:
            AssertionError: If ``parse_error`` is set.
        """
        if self.parse_error:
            raise AssertionError(
                f"parse error: {self.parse_error_message or 'could not parse document'}"
            )

    def assert_score(self, min_score: float) -> None:
        """Fail if the key-metric score is below ``min_score``.

        Raises:
            AssertionError: On a parse error, when no score is available, or
                when the score is below ``min_score``.
        """
        self.assert_no_parse_errors()
        if self.score is None:
            raise AssertionError(
                "no score available (no key metric configured); use assert_metric() instead"
            )
        if self.score < min_score:
            label = self.score_label or "score"
            raise AssertionError(f"{label} {self.score:.4g} < required {min_score:.4g}")

    def assert_field(self, path: str, min_score: float) -> None:
        """Fail if the field at ``path`` scores below ``min_score``.

        Raises:
            AssertionError: If the field is absent, has no score, or scores
                below ``min_score``.
        """
        fs = self.field_scores.get(path)
        if fs is None:
            # If the document failed to parse, report that root cause rather
            # than a misleading "no field" message. Only reached when the
            # lookup already failed, so pass/fail behaviour is unchanged.
            self.assert_no_parse_errors()
            raise AssertionError(f"no field at path {path!r}")
        if fs.score is None:
            raise AssertionError(f"field {path!r} has no score (no key metric applied)")
        if fs.score < min_score:
            raise AssertionError(
                f"field {path!r} scored {fs.score:.4g} < required {min_score:.4g} "
                f"(actual={fs.actual!r}, expected={fs.expected!r})"
            )

    def assert_metric(self, metric_name: str, min_value: float) -> None:
        """Fail if metric ``metric_name`` is missing or below ``min_value``.

        Compares the metric's document-level value (the root, else its mean
        across the tree).

        Raises:
            AssertionError: If the metric was not computed or its value is
                below ``min_value``.
        """
        if metric_name not in self.metrics:
            # Same reasoning as assert_field: prefer the parse-error message
            # when that is why the metric is missing.
            self.assert_no_parse_errors()
            available = ", ".join(sorted(self.metrics)) or "none"
            raise AssertionError(
                f"metric {metric_name!r} not computed (available: {available})"
            )
        value = self.metrics[metric_name].representative()
        if value < min_value:
            raise AssertionError(
                f"metric {metric_name!r} {value:.4g} < required {min_value:.4g}"
            )

    def assert_schema_valid(self) -> None:
        """Fail if schema validation produced errors.

        Returns silently when no ``schema_validity`` metric was computed.

        Raises:
            AssertionError: If the metric's representative value is 0.0 or any
                ``schema_errors`` were recorded.
        """
        # NOTE(review): a report with parse_error=True and no metrics also
        # passes here, because the metric is simply absent — confirm intended.
        coll = self.metrics.get("schema_validity")
        if coll is None:
            return
        # Presumably an iterable of error strings (it is joined below) — verify
        # against MetricCollection.extra_values.
        errors = coll.extra_values("schema_errors")
        if coll.representative() == 0.0 or errors:
            message = "; ".join(errors) or "schema validation failed"
            raise AssertionError(f"schema invalid: {message}")
|
|
278
|
+
|
|
279
|
+
|
|
280
|
+
# ── Batch / consistency reports ───────────────────────────────────────────────
|
|
281
|
+
|
|
282
|
+
|
|
283
|
+
class BatchEvalReport(BaseModel):
    """Aggregate result over a list of documents (``evaluate(list[Sample])``).

    ``metrics`` holds, for each document-level metric, its mean across the
    samples that parsed successfully, and ``score`` is the mean key-metric
    score. ``perfect_response_rate`` is the fraction of samples that parsed
    and had no failing field; ``parse_error_rate`` is the fraction that
    failed to parse.
    """

    per_sample: list[EvalReport] = Field(default_factory=list)
    metrics: dict[str, float] = Field(default_factory=dict)
    score: float | None = None
    score_label: str | None = None
    perfect_response_rate: float = 0.0
    parse_error_rate: float = 0.0

    def field_breakdown(
        self, threshold: float | None = None
    ) -> dict[str, dict[str, float]]:
        """Summarise each path across the batch as mean/min/max/p95/fail_rate.

        Samples that failed to parse, and nodes without a score (no key metric
        applied), are ignored. ``fail_rate`` is the share of counted samples
        whose score fell below the bar: the ``threshold`` argument when given,
        otherwise the field's own threshold, otherwise 1.0.
        """
        collected: dict[str, list[float]] = {}
        fail_counts: dict[str, int] = {}

        for report in self.per_sample:
            if report.parse_error:
                continue
            for path, field in report.field_scores.items():
                value = field.score
                if value is None:
                    continue
                collected.setdefault(path, []).append(value)

                # Resolve the bar: explicit argument > field threshold > 1.0.
                if threshold is not None:
                    limit = threshold
                elif field.threshold is not None:
                    limit = field.threshold
                else:
                    limit = 1.0

                if value < limit:
                    fail_counts[path] = fail_counts.get(path, 0) + 1

        breakdown: dict[str, dict[str, float]] = {}
        for path, values in collected.items():
            breakdown[path] = {
                "mean": mean(values),
                "min": min(values),
                "max": max(values),
                "p95": _percentile(values, 0.95),
                "fail_rate": fail_counts.get(path, 0) / len(values),
            }
        return breakdown

    def print_summary(self) -> None:
        """Write the batch summary (aggregate metrics + field breakdown) to stdout."""
        from structured_eval.reporting import render

        text = render(self)
        print(text)  # noqa: T201
|
|
339
|
+
|
|
340
|
+
|
|
341
|
+
class ConsistencyReport(BaseModel):
    """How stable repeated runs of one prompt were (``evaluate_consistency``).

    ``field_variance`` records, per field, the variance of its score across
    runs. A field whose variance is at or below ``variance_threshold`` is
    listed in ``stable_fields``; every other field goes to ``unstable_fields``.
    ``score_variance`` is the variance of the document-level key-metric score.
    """

    # One report per run.
    per_run: list[EvalReport] = Field(default_factory=list)
    # field path -> variance of that field's score across runs
    field_variance: dict[str, float] = Field(default_factory=dict)
    stable_fields: list[str] = Field(default_factory=list)
    unstable_fields: list[str] = Field(default_factory=list)
    mean_score: float | None = None
    score_variance: float | None = None

    def print_summary(self) -> None:
        """Write the consistency summary (stable/unstable fields + variance) to stdout."""
        from structured_eval.reporting import render

        text = render(self)
        print(text)  # noqa: T201
|
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from typing import Any
|
|
4
|
+
|
|
5
|
+
from pydantic import BaseModel
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
class Sample(BaseModel):
    """A single document to evaluate.

    ``Sample`` exists to disambiguate a bare ``list``: a list given as
    ``actual`` means one document whose root is an array, while a
    ``list[Sample]`` means a batch of documents.
    """

    actual: dict[str, Any] | list[Any] | str
    expected: dict[str, Any] | list[Any] | str | None = None
    # Original text, for Faithfulness.
    source: str | None = None
    # Identifier in a BatchEvalReport.
    id: str | None = None
|
structured_eval/py.typed
ADDED
|
File without changes
|
|
@@ -0,0 +1,194 @@
|
|
|
1
|
+
"""Plain-text console rendering for the report types (no hard dependency).
|
|
2
|
+
|
|
3
|
+
``ConsoleRenderer().render(report)`` returns a string; the module-level
|
|
4
|
+
``render`` is a thin convenience over it, used by ``EvalReport.print_summary``.
|
|
5
|
+
The layout is pure stdlib so it works out of the box; Rich can be layered on
|
|
6
|
+
for colour later.
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
from __future__ import annotations
|
|
10
|
+
|
|
11
|
+
from structured_eval.models.result import (
|
|
12
|
+
BatchEvalReport,
|
|
13
|
+
ConsistencyReport,
|
|
14
|
+
EvalReport,
|
|
15
|
+
)
|
|
16
|
+
|
|
17
|
+
_RULE = "─"
|
|
18
|
+
_BAR = "━"
|
|
19
|
+
_WIDTH = 60
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
class ConsoleRenderer:
    """Renders ``EvalReport`` / ``BatchEvalReport`` / ``ConsistencyReport`` as plain text."""

    def render(self, report: EvalReport | BatchEvalReport | ConsistencyReport) -> str:
        """Return the text rendering of ``report``, dispatching on its type.

        Raises:
            NotImplementedError: If ``report`` is none of the supported types.
        """
        if isinstance(report, EvalReport):
            return self._render_eval(report)
        if isinstance(report, BatchEvalReport):
            return self._render_batch(report)
        if isinstance(report, ConsistencyReport):
            return self._render_consistency(report)
        raise NotImplementedError(
            f"unsupported report type: {type(report).__name__}"
        )

    # ── EvalReport ──────────────────────────────────────────────────────────

    def _render_eval(self, report: EvalReport) -> str:
        """Render one document: overall verdict, document metrics, field table."""
        bar = _BAR * _WIDTH
        out: list[str] = [bar]

        if report.parse_error:
            # Fall back to the same text assert_no_parse_errors() uses so a
            # missing message never renders as the literal "None".
            reason = report.parse_error_message or "could not parse document"
            out.append(f" PARSE ERROR: {reason}")
            out.append(bar)
            return "\n".join(out)

        if report.score is not None:
            # The overall verdict is PASS only for a perfect score.
            verdict = "✓ PASS" if report.score >= 1.0 else "✗ FAIL"
            label = report.score_label or ""
            out.append(f" OVERALL {report.score:.2f} {verdict} {label}")
        else:
            out.append(" OVERALL — (no ground truth)")

        # Document-level metrics: those a metric produced at the root ("$").
        doc_metrics: dict[str, float] = {}
        for name, coll in report.metrics.items():
            v = coll.root()
            if v is not None:
                doc_metrics[name] = v
        # The key metric is already shown on the OVERALL line, so skip it here.
        grid = self._metric_grid(doc_metrics, skip=report.score_label)
        if grid:
            out += ["", *grid]
        out.append(bar)

        # One row per node that has a key-metric score.
        rows: list[list[str]] = []
        for fs in report.field_scores.values():
            if fs.score is None:
                continue
            # Heuristic: label the row with the first metric whose value equals
            # the score; falls back to "score" when none matches.
            metric_name = next(
                (k for k, v in fs.metrics.items() if v == fs.score), "score"
            )
            rows.append(
                [
                    fs.path,
                    metric_name,
                    self._num(fs.score),
                    self._num(fs.threshold),
                    self._mark(fs.score, fs.threshold),
                ]
            )
        if rows:
            out += self._table(
                ["Field", "Metric", "Score", "Threshold", "Mark"],
                rows,
                ["<", "<", ">", ">", "^"],
            )
            out.append(bar)

        return "\n".join(out)

    # ── BatchEvalReport ───────────────────────────────────────────────────

    def _render_batch(self, report: BatchEvalReport) -> str:
        """Render a batch: aggregate rates and metrics, then per-field stats."""
        bar = _BAR * _WIDTH
        n = len(report.per_sample)
        out = [bar, f" BATCH {n} samples"]
        if report.score is not None:
            out.append(f" mean {report.score_label or 'score'} {report.score:.2f}")
        out.append(f" perfect_response_rate {report.perfect_response_rate:.2f}")
        out.append(f" parse_error_rate {report.parse_error_rate:.2f}")
        grid = self._metric_grid(report.metrics, skip=report.score_label)
        if grid:
            out += ["", *grid]
        out.append(bar)

        bd = report.field_breakdown()
        # Highest fail_rate first so the worst fields lead the table.
        ranked = sorted(bd.items(), key=lambda kv: kv[1]["fail_rate"], reverse=True)
        if ranked:
            rows = [
                [
                    path,
                    self._num(s["mean"]),
                    self._num(s["p95"]),
                    self._num(s["fail_rate"]),
                ]
                for path, s in ranked
            ]
            out.append(" Field breakdown (worst first)")
            out += self._table(
                ["Field", "mean", "p95", "fail_rate"], rows, ["<", ">", ">", ">"]
            )
            out.append(bar)
        return "\n".join(out)

    # ── ConsistencyReport ─────────────────────────────────────────────────

    def _render_consistency(self, report: ConsistencyReport) -> str:
        """Render a consistency run: summary lines, then variance per field."""
        bar = _BAR * _WIDTH
        out = [bar, f" CONSISTENCY {len(report.per_run)} runs"]
        if report.mean_score is not None:
            out.append(f" mean score {report.mean_score:.2f}")
        if report.score_variance is not None:
            out.append(f" score variance {report.score_variance:.4f}")
        out += [
            f" stable {', '.join(report.stable_fields) or '—'}",
            f" unstable {', '.join(report.unstable_fields) or '—'}",
            bar,
        ]
        # Largest variance first.
        ranked = sorted(
            report.field_variance.items(), key=lambda kv: kv[1], reverse=True
        )
        if ranked:
            rows = [[path, f"{var:.4f}"] for path, var in ranked]
            out += self._table(["Field", "variance"], rows, ["<", ">"])
            out.append(bar)
        return "\n".join(out)

    # ── formatting helpers ─────────────────────────────────────────────────

    @staticmethod
    def _num(value: float | None) -> str:
        """Format a number with two decimals, or an em dash for ``None``."""
        return "—" if value is None else f"{value:.2f}"

    @staticmethod
    def _mark(score: float | None, bar: float | None) -> str:
        """Return ✓ when ``score`` meets ``bar``, ✗ when not, blank if either is ``None``."""
        if score is None or bar is None:
            return " "
        return "✓" if score >= bar else "✗"

    @staticmethod
    def _table(
        headers: list[str], rows: list[list[str]], aligns: list[str] | None = None
    ) -> list[str]:
        """Render a simple monospace table as a list of lines.

        Args:
            headers: Column titles.
            rows: Cell text, one inner list per row.
            aligns: Format-spec alignment per column (``"<"``, ``">"``, ``"^"``);
                defaults to left-aligning every column.

        Returns:
            The header line, a rule line, then one line per row.
        """
        # Transpose header + rows into columns to measure each column's width.
        cols = (
            list(zip(*([headers, *rows]), strict=False))
            if rows
            else [[h] for h in headers]
        )
        widths = [max(len(c) for c in col) for col in cols]
        aligns = aligns or ["<"] * len(headers)

        def fmt(cells: list[str]) -> str:
            return " ".join(
                f"{c:{a}{w}}" for c, w, a in zip(cells, widths, aligns, strict=False)
            )

        lines = [fmt(headers), fmt([_RULE * w for w in widths])]
        lines += [fmt(r) for r in rows]
        return [" " + line for line in lines]

    @staticmethod
    def _metric_grid(metrics: dict[str, float], skip: str | None = None) -> list[str]:
        """Two-per-line key/value grid of document metrics.

        Args:
            metrics: Metric name to value.
            skip: Metric name to leave out, if any.

        Returns:
            The grid lines, or an empty list when nothing is left to show.
        """
        items = [(k, v) for k, v in metrics.items() if k != skip]
        if not items:
            return []
        width = max(len(k) for k, _ in items)
        cells = [f"{k:<{width}} {ConsoleRenderer._num(v)}" for k, v in items]
        return [" " + " ".join(cells[i : i + 2]) for i in range(0, len(cells), 2)]
|
|
190
|
+
|
|
191
|
+
|
|
192
|
+
def render(report: EvalReport | BatchEvalReport | ConsistencyReport) -> str:
    """Return a printable string for any supported report type.

    Convenience wrapper that builds a fresh ``ConsoleRenderer`` for the call.
    """
    renderer = ConsoleRenderer()
    return renderer.render(report)
|
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
from structured_eval.utils.flatten import extract_paths, flatten
|
|
2
|
+
from structured_eval.utils.structured_diff import (
|
|
3
|
+
DiffEntry,
|
|
4
|
+
DiffType,
|
|
5
|
+
StructuredDiff,
|
|
6
|
+
structured_diff,
|
|
7
|
+
)
|
|
8
|
+
|
|
9
|
+
__all__ = [
|
|
10
|
+
"DiffEntry",
|
|
11
|
+
"DiffType",
|
|
12
|
+
"StructuredDiff",
|
|
13
|
+
"extract_paths",
|
|
14
|
+
"flatten",
|
|
15
|
+
"structured_diff",
|
|
16
|
+
]
|
|
@@ -0,0 +1,66 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from typing import Any
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
def flatten(obj: dict[str, Any] | list[Any], prefix: str = "") -> dict[str, Any]:
|
|
7
|
+
"""Recursively flatten a nested dict/list into dot-and-bracket key paths.
|
|
8
|
+
|
|
9
|
+
Dict keys use dot notation: {"a": {"b": 1}} → {"a.b": 1}
|
|
10
|
+
List indices use brackets: {"a": [1, 2]} → {"a[0]": 1, "a[1]": 2}
|
|
11
|
+
Empty containers are left as-is: {"a": {}} → {"a": {}}
|
|
12
|
+
|
|
13
|
+
Args:
|
|
14
|
+
obj: Dict or list to flatten.
|
|
15
|
+
prefix: Internal prefix for recursive calls; do not pass externally.
|
|
16
|
+
|
|
17
|
+
Returns:
|
|
18
|
+
Flat dict mapping string paths to primitive (or empty container) values.
|
|
19
|
+
|
|
20
|
+
Example:
|
|
21
|
+
>>> flatten({"invoice": {"id": "1", "items": [{"price": 100}]}})
|
|
22
|
+
{"invoice.id": "1", "invoice.items[0].price": 100}
|
|
23
|
+
"""
|
|
24
|
+
result: dict[str, Any] = {}
|
|
25
|
+
if isinstance(obj, dict):
|
|
26
|
+
for key, value in obj.items():
|
|
27
|
+
path = f"{prefix}.{key}" if prefix else key
|
|
28
|
+
if isinstance(value, (dict, list)) and value:
|
|
29
|
+
result.update(flatten(value, path))
|
|
30
|
+
else:
|
|
31
|
+
result[path] = value
|
|
32
|
+
elif isinstance(obj, list):
|
|
33
|
+
for i, item in enumerate(obj):
|
|
34
|
+
path = f"{prefix}[{i}]"
|
|
35
|
+
if isinstance(item, (dict, list)) and item:
|
|
36
|
+
result.update(flatten(item, path))
|
|
37
|
+
else:
|
|
38
|
+
result[path] = item
|
|
39
|
+
return result
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
def extract_paths(value: Any, prefix: str = "") -> set[str]:
    """Collect every structural path of a JSON-like value, ignoring order and values.

    Both containers and leaves contribute a path, so the returned set describes
    the complete skeleton: dict keys (``a``, ``a.b``), list indices (``a[0]``)
    and whatever lies below them. Only the shape matters; the values do not.
    In contrast to :func:`flatten`, intermediate container paths are part of
    the result, and the result is a set of paths, not a path→value mapping.

    Example:
        >>> sorted(extract_paths({"a": {"b": 1}, "c": [2]}))
        ['a', 'a.b', 'c', 'c[0]']
    """
    if isinstance(value, dict):
        children = [
            (f"{prefix}.{key}" if prefix else str(key), child)
            for key, child in value.items()
        ]
    elif isinstance(value, list):
        children = [
            (f"{prefix}[{index}]", child) for index, child in enumerate(value)
        ]
    else:
        # A scalar has no paths of its own; its parent already recorded it.
        return set()

    found: set[str] = set()
    for here, child in children:
        found.add(here)
        found.update(extract_paths(child, here))
    return found
|