structured-eval 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- structured_eval/__init__.py +27 -0
- structured_eval/alignment/__init__.py +15 -0
- structured_eval/alignment/base.py +40 -0
- structured_eval/alignment/by_index.py +24 -0
- structured_eval/alignment/by_key.py +73 -0
- structured_eval/alignment/factory.py +28 -0
- structured_eval/alignment/hungarian.py +156 -0
- structured_eval/api.py +79 -0
- structured_eval/engine/__init__.py +15 -0
- structured_eval/engine/aggregator.py +96 -0
- structured_eval/engine/evaluator.py +72 -0
- structured_eval/engine/metric_runner.py +69 -0
- structured_eval/engine/parser.py +42 -0
- structured_eval/engine/report_builder.py +68 -0
- structured_eval/engine/tree_builder.py +319 -0
- structured_eval/formats/__init__.py +5 -0
- structured_eval/formats/base.py +19 -0
- structured_eval/formats/json_parser.py +44 -0
- structured_eval/formats/yaml_parser.py +24 -0
- structured_eval/integrations/__init__.py +11 -0
- structured_eval/integrations/_adapter.py +47 -0
- structured_eval/integrations/deepeval.py +74 -0
- structured_eval/integrations/langsmith.py +90 -0
- structured_eval/metrics/__init__.py +101 -0
- structured_eval/metrics/array_accuracy.py +28 -0
- structured_eval/metrics/array_cardinality.py +27 -0
- structured_eval/metrics/array_exact_match.py +48 -0
- structured_eval/metrics/array_f1.py +34 -0
- structured_eval/metrics/array_jaccard_similarity.py +60 -0
- structured_eval/metrics/array_precision.py +34 -0
- structured_eval/metrics/array_prf1.py +40 -0
- structured_eval/metrics/array_recall.py +33 -0
- structured_eval/metrics/base.py +144 -0
- structured_eval/metrics/character_f1.py +50 -0
- structured_eval/metrics/composite_score.py +46 -0
- structured_eval/metrics/coverage_leaf_score.py +29 -0
- structured_eval/metrics/date_distance_score.py +63 -0
- structured_eval/metrics/exact.py +21 -0
- structured_eval/metrics/exponential_numeric_score.py +47 -0
- structured_eval/metrics/field_faithfulness.py +38 -0
- structured_eval/metrics/fuzzy.py +64 -0
- structured_eval/metrics/invoker.py +90 -0
- structured_eval/metrics/levenshtein.py +16 -0
- structured_eval/metrics/mean_score.py +31 -0
- structured_eval/metrics/numeric.py +83 -0
- structured_eval/metrics/numeric_closeness.py +35 -0
- structured_eval/metrics/object_accuracy.py +47 -0
- structured_eval/metrics/object_exact_match.py +41 -0
- structured_eval/metrics/object_f1.py +47 -0
- structured_eval/metrics/object_precision.py +49 -0
- structured_eval/metrics/object_prf1.py +51 -0
- structured_eval/metrics/object_recall.py +44 -0
- structured_eval/metrics/object_type_validity.py +34 -0
- structured_eval/metrics/overall_leaf_score.py +32 -0
- structured_eval/metrics/presence.py +22 -0
- structured_eval/metrics/regex_match.py +51 -0
- structured_eval/metrics/rule_pass_rate/__init__.py +5 -0
- structured_eval/metrics/rule_pass_rate/dsl.py +224 -0
- structured_eval/metrics/rule_pass_rate/engine.py +24 -0
- structured_eval/metrics/rule_pass_rate/metric.py +33 -0
- structured_eval/metrics/schema_validity/__init__.py +7 -0
- structured_eval/metrics/schema_validity/metric.py +35 -0
- structured_eval/metrics/schema_validity/validator.py +119 -0
- structured_eval/metrics/structural_similarity.py +40 -0
- structured_eval/metrics/token_f1.py +44 -0
- structured_eval/metrics/type_match.py +35 -0
- structured_eval/metrics/utils/__init__.py +10 -0
- structured_eval/metrics/utils/array.py +31 -0
- structured_eval/metrics/utils/calculate.py +72 -0
- structured_eval/metrics/utils/number.py +46 -0
- structured_eval/metrics/utils/object_utils.py +87 -0
- structured_eval/models/__init__.py +72 -0
- structured_eval/models/config.py +124 -0
- structured_eval/models/context.py +25 -0
- structured_eval/models/metric_result.py +121 -0
- structured_eval/models/nodes/__init__.py +13 -0
- structured_eval/models/nodes/array_node.py +32 -0
- structured_eval/models/nodes/base.py +113 -0
- structured_eval/models/nodes/object_node.py +19 -0
- structured_eval/models/nodes/scalar.py +14 -0
- structured_eval/models/result.py +361 -0
- structured_eval/models/sample.py +19 -0
- structured_eval/py.typed +0 -0
- structured_eval/reporting/__init__.py +5 -0
- structured_eval/reporting/console.py +194 -0
- structured_eval/utils/__init__.py +16 -0
- structured_eval/utils/flatten.py +66 -0
- structured_eval/utils/paths.py +58 -0
- structured_eval/utils/structured_diff.py +159 -0
- structured_eval-0.1.0.dist-info/METADATA +322 -0
- structured_eval-0.1.0.dist-info/RECORD +94 -0
- structured_eval-0.1.0.dist-info/WHEEL +5 -0
- structured_eval-0.1.0.dist-info/licenses/LICENSE +201 -0
- structured_eval-0.1.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,27 @@
|
|
|
1
|
+
"""structured_eval — field-level evaluation of structured LLM outputs.
|
|
2
|
+
|
|
3
|
+
The top level exposes only the entrypoints. Everything else lives one level
|
|
4
|
+
down, imported explicitly from its subsystem:
|
|
5
|
+
|
|
6
|
+
- ``structured_eval.models`` — user-facing data models: ``Sample``,
|
|
7
|
+
``EvalConfig`` (+ the ``*FieldConfig`` family & policies), ``EvalReport`` /
|
|
8
|
+
``BatchEvalReport`` / ``ConsistencyReport``. Lower-level model pieces live in
|
|
9
|
+
precise submodules (``models.nodes`` / ``models.result`` /
|
|
10
|
+
``models.metric_result`` / ``models.context``).
|
|
11
|
+
- ``structured_eval.metrics`` — every metric plus the base hierarchy
|
|
12
|
+
(``Metric`` / ``FieldMetric`` / …), ``resolve_metric``, and the rule DSL
|
|
13
|
+
(``Rule`` / ``RulePassRate``).
|
|
14
|
+
- ``structured_eval.alignment`` / ``.formats`` / ``.utils`` — supporting
|
|
15
|
+
machinery (array alignment, parsers, ``flatten`` / ``structured_diff``).
|
|
16
|
+
|
|
17
|
+
``evaluate`` / ``evaluate_batch`` / ``evaluate_consistency`` are thin wrappers
|
|
18
|
+
over ``engine.Evaluator``.
|
|
19
|
+
"""
|
|
20
|
+
|
|
21
|
+
from structured_eval.api import evaluate, evaluate_batch, evaluate_consistency
|
|
22
|
+
|
|
23
|
+
__all__ = [
|
|
24
|
+
"evaluate",
|
|
25
|
+
"evaluate_batch",
|
|
26
|
+
"evaluate_consistency",
|
|
27
|
+
]
|
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
from structured_eval.alignment.base import ArrayAligner, key_value
|
|
2
|
+
from structured_eval.alignment.by_index import ByIndexAligner
|
|
3
|
+
from structured_eval.alignment.by_key import ByKeyAligner
|
|
4
|
+
from structured_eval.alignment.factory import make_aligner
|
|
5
|
+
from structured_eval.alignment.hungarian import HungarianAligner, Scorer
|
|
6
|
+
|
|
7
|
+
__all__ = [
|
|
8
|
+
"ArrayAligner",
|
|
9
|
+
"ByIndexAligner",
|
|
10
|
+
"ByKeyAligner",
|
|
11
|
+
"HungarianAligner",
|
|
12
|
+
"Scorer",
|
|
13
|
+
"key_value",
|
|
14
|
+
"make_aligner",
|
|
15
|
+
]
|
|
@@ -0,0 +1,40 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from abc import ABC, abstractmethod
|
|
4
|
+
from typing import TYPE_CHECKING, Any
|
|
5
|
+
|
|
6
|
+
from structured_eval.utils.paths import MISSING, navigate
|
|
7
|
+
|
|
8
|
+
if TYPE_CHECKING:
|
|
9
|
+
from structured_eval.models.nodes.array_node import ArrayMatchResult
|
|
10
|
+
|
|
11
|
+
# Sentinel for a key that cannot be extracted because the element is not a dict
# (a dict that merely lacks the field yields ``None`` from ``key_value`` instead).
|
|
12
|
+
_MISSING_KEY = object()
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
def key_value(element: Any, key: str | None) -> Any:
    """Extract the value an aligner should pair ``element`` on.

    Used by the key-based aligners (``ByKeyAligner``, ``HungarianAligner``).

    Args:
        element: One array item.
        key: Path of the field to pair on, or ``None`` to pair on the whole
            element.

    Returns:
        * ``element`` itself when ``key`` is ``None``;
        * the value ``navigate`` finds at ``key`` when ``element`` is a dict,
          with ``None`` standing in for a field that is ``MISSING``;
        * the private ``_MISSING_KEY`` sentinel when ``key`` is given but
          ``element`` is not a dict.
    """
    if key is None:
        return element
    if not isinstance(element, dict):
        # A key was requested but there is nothing to look it up in.
        return _MISSING_KEY
    found = navigate(element, key)
    if found is MISSING:
        return None
    return found
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
class ArrayAligner(ABC):
    """Abstract interface for pairing actual array items with expected ones.

    An aligner only decides *which* items correspond. Its ``align`` method
    returns an ``ArrayMatchResult`` holding the matched
    ``(expected_idx, actual_idx)`` pairs together with the indices left over
    on each side: unmatched expected items (missed) and unmatched actual items
    (spurious). Scoring the values of matched pairs is done later by the array
    metrics, not here.
    """

    @abstractmethod
    def align(self, expected: list[Any], actual: list[Any]) -> ArrayMatchResult:
        """Pair items of ``actual`` with items of ``expected``.

        Concrete aligners must override this method.
        """
        ...
|
|
@@ -0,0 +1,24 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from typing import Any
|
|
4
|
+
|
|
5
|
+
from structured_eval.alignment.base import ArrayAligner
|
|
6
|
+
from structured_eval.models.config import ArrayStrategy
|
|
7
|
+
from structured_eval.models.nodes.array_node import ArrayMatchResult
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
class ByIndexAligner(ArrayAligner):
    """Positional aligner: expected item ``i`` is paired with actual item ``i``.

    Intended for lists where position carries meaning (steps, time series,
    rankings). Item contents are never inspected.
    """

    def align(self, expected: list[Any], actual: list[Any]) -> ArrayMatchResult:
        """Pair the common prefix by position; the tail of the longer list is left over.

        Indices past the shorter list's length are reported as ``missed``
        (when ``expected`` is longer) or ``spurious`` (when ``actual`` is
        longer).
        """
        expected_len = len(expected)
        actual_len = len(actual)
        shared = min(expected_len, actual_len)

        pairs = [(pos, pos) for pos in range(shared)]
        unmatched_expected = list(range(shared, expected_len))
        unmatched_actual = list(range(shared, actual_len))

        return ArrayMatchResult(
            strategy=ArrayStrategy.BY_INDEX,
            matched=pairs,
            missed=unmatched_expected,
            spurious=unmatched_actual,
        )
|
|
@@ -0,0 +1,73 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from typing import Any
|
|
4
|
+
|
|
5
|
+
from structured_eval.alignment.base import ArrayAligner, key_value
|
|
6
|
+
from structured_eval.metrics.base import BaseMetric, resolve_metric
|
|
7
|
+
from structured_eval.metrics.exact import ExactMatch
|
|
8
|
+
from structured_eval.metrics.invoker import MetricInvoker
|
|
9
|
+
from structured_eval.models.config import ArrayStrategy
|
|
10
|
+
from structured_eval.models.nodes.array_node import ArrayMatchResult
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
class ByKeyAligner(ArrayAligner):
    """Greedy best-first aligner that pairs items on a comparison key.

    Each element contributes a key (the ``key`` field, or the whole element
    when ``key`` is ``None``). Keys are compared with ``key_metric`` (an
    ``ExactMatch`` instance when none is given) and a pair becomes a candidate
    when its score is at least ``threshold``. This subsumes value- and
    similarity-based matching (technical_details_v3 §5).

    Candidates are claimed **globally greedily**: they are ranked by score,
    highest first, and taken one-to-one, skipping any candidate whose expected
    or actual item is already claimed. A soft key therefore gets its strongest
    available partner rather than the first one encountered. Candidates with
    equal scores keep their ``(expected, actual)`` generation order, so when
    all passing scores tie the behaviour is plain first-match. This is a cheap,
    scipy-free approximation of the optimal assignment that
    ``HungarianAligner`` computes.
    """

    def __init__(
        self,
        key: str | None = None,
        key_metric: str | BaseMetric | None = None,
        threshold: float = 1.0,
    ):
        """Store the key path, wrap the key metric in an invoker, store the threshold."""
        self.key = key
        if key_metric is None:
            chosen_metric = ExactMatch()
        else:
            chosen_metric = resolve_metric(key_metric)
        self.scorer = MetricInvoker(chosen_metric)
        self.threshold = threshold

    def align(self, expected: list[Any], actual: list[Any]) -> ArrayMatchResult:
        """Pair expected and actual items whose keys score at least ``threshold``.

        Returns an ``ArrayMatchResult`` whose ``matched`` pairs are sorted by
        expected index, with the unclaimed expected indices in ``missed`` and
        the unclaimed actual indices in ``spurious``.
        """
        # Collect every passing (score, expected_idx, actual_idx) triple in
        # (expected, actual) order; the stable sort below relies on that order
        # to break score ties.
        passing: list[tuple[float, int, int]] = []
        for exp_idx, exp_item in enumerate(expected):
            exp_key = key_value(exp_item, self.key)
            passing.extend(
                (pair_score, exp_idx, act_idx)
                for act_idx, act_item in enumerate(actual)
                if (
                    pair_score := self.scorer.scalar_on_values(
                        key_value(act_item, self.key), exp_key
                    )
                )
                >= self.threshold
            )

        # Best score first; equal scores stay in generation order.
        ranked = sorted(passing, key=lambda triple: triple[0], reverse=True)

        claimed_expected: set[int] = set()
        claimed_actual: set[int] = set()
        pairs: list[tuple[int, int]] = []
        for _, exp_idx, act_idx in ranked:
            if exp_idx not in claimed_expected and act_idx not in claimed_actual:
                claimed_expected.add(exp_idx)
                claimed_actual.add(act_idx)
                pairs.append((exp_idx, act_idx))
        pairs.sort()  # report pairs in expected order

        return ArrayMatchResult(
            strategy=ArrayStrategy.BY_KEY,
            matched=pairs,
            missed=[i for i in range(len(expected)) if i not in claimed_expected],
            spurious=[j for j in range(len(actual)) if j not in claimed_actual],
        )
|
|
@@ -0,0 +1,28 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from typing import TYPE_CHECKING, Any
|
|
4
|
+
|
|
5
|
+
from structured_eval.alignment.by_index import ByIndexAligner
|
|
6
|
+
from structured_eval.alignment.by_key import ByKeyAligner
|
|
7
|
+
from structured_eval.alignment.hungarian import HungarianAligner
|
|
8
|
+
from structured_eval.models.config import ArrayStrategy
|
|
9
|
+
|
|
10
|
+
if TYPE_CHECKING:
|
|
11
|
+
from structured_eval.alignment.base import ArrayAligner
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
def make_aligner(
    strategy: ArrayStrategy = ArrayStrategy.BY_INDEX,
    params: dict[str, Any] | None = None,
) -> ArrayAligner:
    """Construct the aligner that corresponds to an array config's ``strategy``.

    Args:
        strategy: Which alignment strategy to build.
        params: Keyword arguments forwarded to the aligner's constructor. A key
            the constructor does not accept surfaces as a ``TypeError`` from
            that constructor. ``None`` or an empty dict means "no arguments".

    Returns:
        * ``ByIndexAligner()`` for ``ArrayStrategy.BY_INDEX`` (``params`` are
          not forwarded in this case);
        * ``HungarianAligner(**params)`` for ``ArrayStrategy.HUNGARIAN``;
        * ``ByKeyAligner(**params)`` for every other strategy value.
    """
    kwargs = params if params else {}
    if strategy == ArrayStrategy.BY_INDEX:
        aligner = ByIndexAligner()
    elif strategy == ArrayStrategy.HUNGARIAN:
        aligner = HungarianAligner(**kwargs)
    else:
        aligner = ByKeyAligner(**kwargs)
    return aligner
|
|
@@ -0,0 +1,156 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import warnings
|
|
4
|
+
from collections.abc import Callable
|
|
5
|
+
from typing import Any
|
|
6
|
+
|
|
7
|
+
from structured_eval.alignment.base import ArrayAligner, key_value
|
|
8
|
+
from structured_eval.metrics.base import FieldMetric, Metric, resolve_metric
|
|
9
|
+
from structured_eval.metrics.exact import ExactMatch
|
|
10
|
+
from structured_eval.metrics.invoker import MetricInvoker
|
|
11
|
+
from structured_eval.metrics.numeric_closeness import NumericCloseness
|
|
12
|
+
from structured_eval.models.config import ArrayStrategy
|
|
13
|
+
from structured_eval.models.nodes.array_node import ArrayMatchResult
|
|
14
|
+
|
|
15
|
+
_LARGE_MATRIX_WARN = 10_000 # rows*cols beyond which we warn (quadratic scoring cost)
|
|
16
|
+
|
|
17
|
+
# A per-element similarity: a Metric instance (every Metric has ``score``), its
|
|
18
|
+
# registered name, or a plain ``(actual, expected) -> float`` callable.
|
|
19
|
+
Scorer = Metric[Any] | str | Callable[[Any, Any], float]
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
class HungarianAligner(ArrayAligner):
    """Optimal one-to-one assignment via the Hungarian algorithm.

    Builds a similarity matrix ``S[i][j] = score(expected[i], actual[j])`` and
    solves ``min sum(1 - S)`` with ``scipy.optimize.linear_sum_assignment``.
    An assigned pair counts as matched only when its similarity is at least
    ``threshold``; otherwise both sides are left unmatched (a missed expected
    and a spurious actual).

    ``scorer`` is the element similarity. It may be:

    * a single ``Scorer`` (a metric instance, a registered metric name, or a
      plain ``(actual, expected) -> float`` callable) — applied to the whole
      element;
    * a ``dict[str, Scorer]`` — per-field scorers for arrays of objects; the
      element score is the mean over the union of fields, and a field with no
      entry falls back to the type default below;
    * ``None`` — type default: two dicts are scored field-by-field, a pair of
      non-bool numbers uses ``NumericCloseness``, and everything else
      (bools, strings, mixed types) uses ``ExactMatch``.

    ``key`` scores on a named sub-field instead of the whole element. Requires
    the ``align`` extra (scipy).
    """

    def __init__(
        self,
        scorer: Scorer | dict[str, Scorer] | None = None,
        threshold: float = 0.8,
        key: str | None = None,
    ):
        """Store the scorer specification, the match threshold and the key path."""
        self.scorer = scorer
        self.threshold = threshold
        self.key = key

    def align(self, expected: list[Any], actual: list[Any]) -> ArrayMatchResult:
        """Compute the minimum-cost assignment and keep pairs clearing ``threshold``.

        Returns an ``ArrayMatchResult`` with the accepted pairs in ``matched``
        and every index not part of an accepted pair in ``missed`` (expected
        side) or ``spurious`` (actual side). If either list is empty nothing
        is matched and scipy is not imported.

        Raises:
            ImportError: if scipy is not installed.
        """
        if not expected or not actual:
            return ArrayMatchResult(
                strategy=ArrayStrategy.HUNGARIAN,
                matched=[],
                missed=list(range(len(expected))),
                spurious=list(range(len(actual))),
            )

        if len(expected) * len(actual) > _LARGE_MATRIX_WARN:
            warnings.warn(
                f"HungarianAligner: large {len(expected)}x{len(actual)} similarity "
                "matrix; alignment may be slow.",
                stacklevel=2,
            )

        try:
            from scipy.optimize import linear_sum_assignment
        except ImportError as exc:  # pragma: no cover
            raise ImportError(
                "scipy is required for HungarianAligner. "
                "Install it with: pip install 'structured-eval[align]'"
            ) from exc

        # Keep the raw similarities: the threshold must be tested against the
        # score itself. Recovering it as ``1.0 - cost`` is not exact in floating
        # point (e.g. ``1 - (1 - 0.1) == 0.09999999999999998``) and could reject
        # a pair that scores exactly at the threshold.
        similarity = [[self._score(e, a) for a in actual] for e in expected]
        cost = [[1.0 - s for s in row] for row in similarity]
        rows, cols = linear_sum_assignment(cost)

        matched: list[tuple[int, int]] = []
        used_e: set[int] = set()
        used_a: set[int] = set()
        for row, col in zip(rows, cols, strict=True):
            ei, ai = int(row), int(col)  # plain ints instead of numpy integers
            if similarity[ei][ai] >= self.threshold:
                matched.append((ei, ai))
                used_e.add(ei)
                used_a.add(ai)

        missed = [ei for ei in range(len(expected)) if ei not in used_e]
        spurious = [ai for ai in range(len(actual)) if ai not in used_a]
        return ArrayMatchResult(
            strategy=ArrayStrategy.HUNGARIAN,
            matched=matched,
            missed=missed,
            spurious=spurious,
        )

    # ── element similarity ──────────────────────────────────────────────────

    def _score(self, expected: Any, actual: Any) -> float:
        """Similarity of one expected/actual element pair, taken on ``self.key``."""
        return self._similarity(
            key_value(expected, self.key),
            key_value(actual, self.key),
            self.scorer,
        )

    def _similarity(
        self,
        expected: Any,
        actual: Any,
        scorer: Scorer | dict[str, Scorer] | None,
    ) -> float:
        """Dispatch on the scorer specification.

        A dict scorer means per-field scoring; any other non-``None`` scorer is
        applied directly; with no scorer, two dicts are scored field-by-field
        and anything else gets the type default from ``_default_scorer``.
        """
        if isinstance(scorer, dict):
            return self._object_similarity(expected, actual, scorer)
        if scorer is not None:
            return self._apply(scorer, expected, actual)
        if isinstance(expected, dict) and isinstance(actual, dict):
            return self._object_similarity(expected, actual, {})
        return self._apply(self._default_scorer(expected, actual), expected, actual)

    def _object_similarity(
        self, expected: Any, actual: Any, scorers: dict[str, Scorer]
    ) -> float:
        """Mean per-field similarity over the union of both objects' keys.

        If either side is not a dict the result is 1.0 when the two values are
        equal and 0.0 otherwise. Two empty dicts score 1.0. A key present on
        only one side is compared against ``None`` (the ``dict.get`` default).
        """
        if not isinstance(expected, dict) or not isinstance(actual, dict):
            return 1.0 if expected == actual else 0.0
        keys = set(expected) | set(actual)
        if not keys:
            return 1.0
        total = sum(
            self._similarity(expected.get(k), actual.get(k), scorers.get(k))
            for k in keys
        )
        return total / len(keys)

    @staticmethod
    def _apply(scorer: Scorer, expected: Any, actual: Any) -> float:
        """Run one scorer on a pair of values.

        A plain callable is called as ``scorer(actual, expected)``; a metric
        instance or registered name is resolved with ``resolve_metric`` and run
        through ``MetricInvoker.scalar_on_values(actual, expected)``.
        """
        if callable(scorer) and not isinstance(scorer, (str, Metric)):
            return float(scorer(actual, expected))
        return MetricInvoker(resolve_metric(scorer)).scalar_on_values(actual, expected)

    @staticmethod
    def _default_scorer(expected: Any, actual: Any) -> FieldMetric:
        """Type-aware default similarity metric for a pair of values.

        * either value is a ``bool`` → :class:`ExactMatch` (checked first, so a
          bool is never treated as a number);
        * both values are ``int`` / ``float`` → :class:`NumericCloseness`;
        * anything else, including strings → :class:`ExactMatch`.

        NOTE(review): earlier documentation described a fuzzy string default;
        this method does not select one — confirm whether that is intended.
        """
        if isinstance(expected, bool) or isinstance(actual, bool):
            return ExactMatch()
        if isinstance(expected, (int, float)) and isinstance(actual, (int, float)):
            return NumericCloseness()
        return ExactMatch()
|
structured_eval/api.py
ADDED
|
@@ -0,0 +1,79 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from typing import TYPE_CHECKING, Any
|
|
4
|
+
|
|
5
|
+
from structured_eval.engine.evaluator import Evaluator
|
|
6
|
+
from structured_eval.models.sample import Sample
|
|
7
|
+
|
|
8
|
+
if TYPE_CHECKING:
|
|
9
|
+
from structured_eval.models.config import EvalConfig
|
|
10
|
+
from structured_eval.models.result import (
|
|
11
|
+
BatchEvalReport,
|
|
12
|
+
ConsistencyReport,
|
|
13
|
+
EvalReport,
|
|
14
|
+
)
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
def _is_batch(actual: Any) -> bool:
    """Tell a batch (a non-empty list of ``Sample``) from a single document.

    A bare list is a single array-root document, and that includes the empty
    list: ``all()`` over an empty iterable is vacuously true, so without the
    emptiness check ``[]`` would be classified as a batch and an empty
    array-root document could not be evaluated.

    Returns:
        ``True`` only when ``actual`` is a non-empty list whose items are all
        ``Sample`` instances.
    """
    if not isinstance(actual, list) or not actual:
        return False
    return all(isinstance(item, Sample) for item in actual)
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
def evaluate(
    actual: Any,
    expected: Any = None,
    config: EvalConfig | None = None,
    *,
    source: str | None = None,
) -> EvalReport:
    """Evaluate a single document against its reference and return an ``EvalReport``.

    Supported call shapes:

    - ``evaluate(actual, expected, config=...)`` — the arguments are wrapped
      into a ``Sample`` (together with ``source``);
    - ``evaluate(sample, config=...)`` — an existing ``Sample`` is used as is;
      ``expected`` and ``source`` are not consulted in this case.

    A bare ``list`` is one document with an array root, not a batch. Several
    samples go through :func:`evaluate_batch`. This is a thin wrapper over
    ``Evaluator``.

    Raises:
        TypeError: if ``actual`` is recognised as a batch of ``Sample`` objects.
    """
    if _is_batch(actual):
        raise TypeError(
            "evaluate() takes a single document; pass a list of Samples to evaluate_batch()"
        )

    if isinstance(actual, Sample):
        sample = actual
    else:
        sample = Sample(actual=actual, expected=expected, source=source)

    evaluator = Evaluator(config)
    return evaluator.evaluate_one(sample)
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
def evaluate_batch(
    samples: list[Sample],
    config: EvalConfig | None = None,
) -> BatchEvalReport:
    """Evaluate several ``Sample`` objects and return a ``BatchEvalReport``.

    Every sample brings its own ``actual`` / ``expected`` / ``source``. The
    returned report holds the per-sample reports alongside batch-level
    metrics. This is a thin wrapper: a fresh ``Evaluator`` is built from
    ``config`` and its ``evaluate_batch`` does the work.
    """
    evaluator = Evaluator(config)
    return evaluator.evaluate_batch(samples)
|
|
62
|
+
|
|
63
|
+
|
|
64
|
+
def evaluate_consistency(
    runs: list[Sample],
    config: EvalConfig | None = None,
    *,
    variance_threshold: float = 0.05,
) -> ConsistencyReport:
    """Report how stable repeated outputs of one prompt are from run to run.

    ``runs`` holds several outputs produced for the same input (with or
    without a shared ``expected``). A field whose score variance across runs
    is at most ``variance_threshold`` is reported as stable; the others are
    reported as unstable. This is a thin wrapper: a fresh ``Evaluator`` is
    built from ``config`` and its ``evaluate_consistency`` does the work.
    """
    evaluator = Evaluator(config)
    return evaluator.evaluate_consistency(runs, variance_threshold=variance_threshold)
|
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
from structured_eval.engine.aggregator import BatchAggregator
|
|
2
|
+
from structured_eval.engine.evaluator import Evaluator
|
|
3
|
+
from structured_eval.engine.metric_runner import MetricRunner
|
|
4
|
+
from structured_eval.engine.parser import Parser
|
|
5
|
+
from structured_eval.engine.report_builder import ReportBuilder
|
|
6
|
+
from structured_eval.engine.tree_builder import TreeBuilder
|
|
7
|
+
|
|
8
|
+
__all__ = [
|
|
9
|
+
"BatchAggregator",
|
|
10
|
+
"Evaluator",
|
|
11
|
+
"MetricRunner",
|
|
12
|
+
"Parser",
|
|
13
|
+
"ReportBuilder",
|
|
14
|
+
"TreeBuilder",
|
|
15
|
+
]
|
|
@@ -0,0 +1,96 @@
|
|
|
1
|
+
"""Aggregation over multiple EvalReports: batch and consistency statistics."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from statistics import mean, pvariance
|
|
6
|
+
|
|
7
|
+
from structured_eval.models.result import (
|
|
8
|
+
BatchEvalReport,
|
|
9
|
+
ConsistencyReport,
|
|
10
|
+
EvalReport,
|
|
11
|
+
NodeType,
|
|
12
|
+
)
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
class BatchAggregator:
    """Turns a list of per-document reports into batch or consistency summaries."""

    def batch(self, reports: list[EvalReport]) -> BatchEvalReport:
        """Summarise single-document reports as one ``BatchEvalReport``.

        Reports flagged with ``parse_error`` are excluded from the mean score,
        the score label and the "perfect" count, but still count in the
        denominator of both rates. An empty input yields rates of ``0.0``.
        """
        total = len(reports)
        parsed = [rep for rep in reports if not rep.parse_error]
        failed_to_parse = total - len(parsed)

        valid_scores = [rep.score for rep in parsed if rep.score is not None]
        overall = mean(valid_scores) if valid_scores else None

        # The label is taken from the first parsed report that carries one.
        label = None
        for rep in parsed:
            if rep.score_label is not None:
                label = rep.score_label
                break

        # "Perfect" means the report lists no failed fields.
        flawless = len([rep for rep in parsed if not rep.failed_fields()])

        if total:
            perfect_rate = flawless / total
            error_rate = failed_to_parse / total
        else:
            perfect_rate = 0.0
            error_rate = 0.0

        return BatchEvalReport(
            per_sample=reports,
            metrics=self._mean_metrics(reports),
            score=overall,
            score_label=label,
            perfect_response_rate=perfect_rate,
            parse_error_rate=error_rate,
        )

    def consistency(
        self, reports: list[EvalReport], variance_threshold: float = 0.05
    ) -> ConsistencyReport:
        """Quantify run-to-run stability over repeated outputs of one prompt.

        For every scalar field path the population variance of its scores
        across the parsed runs is computed (``0.0`` when only one score is
        available). Paths with variance at most ``variance_threshold`` are
        listed as stable, the rest as unstable.
        """
        parsed = [rep for rep in reports if not rep.parse_error]

        # Only leaf (scalar) fields are tracked. Object/array nodes carry an
        # aggregate representative score whose variance is a function of their
        # children's, so including them would be redundant (the same wobble
        # counted twice), non-actionable (a parent path names no concrete field
        # to fix) and noisy (an F1-over-children varies for different reasons
        # than a single atomic value).
        #
        # TODO: support per-node stability regardless of node type. Some users
        # want block-level wobble ("the whole `address` object is unstable")
        # without drilling into leaves. The fix is NOT to drop this filter
        # (that mixes scales) but to expose a separate, parallel view computed
        # over non-scalar nodes (e.g. ConsistencyReport.object_variance /
        # block_variance), keeping the leaf map clean and adding the aggregate
        # one alongside it.
        scores_by_path: dict[str, list[float]] = {}
        for rep in parsed:
            for path, field_score in rep.field_scores.items():
                if field_score.score is not None and field_score.node_type == NodeType.SCALAR:
                    scores_by_path.setdefault(path, []).append(field_score.score)

        field_variance: dict[str, float] = {}
        stable_paths: list[str] = []
        unstable_paths: list[str] = []
        for path, values in scores_by_path.items():
            spread = 0.0 if len(values) <= 1 else pvariance(values)
            field_variance[path] = spread
            target = stable_paths if spread <= variance_threshold else unstable_paths
            target.append(path)

        run_scores = [rep.score for rep in parsed if rep.score is not None]
        if not run_scores:
            average = None
            overall_spread = None
        else:
            average = mean(run_scores)
            overall_spread = pvariance(run_scores) if len(run_scores) > 1 else 0.0

        return ConsistencyReport(
            per_run=reports,
            field_variance=field_variance,
            stable_fields=stable_paths,
            unstable_fields=unstable_paths,
            mean_score=average,
            score_variance=overall_spread,
        )

    @staticmethod
    def _mean_metrics(reports: list[EvalReport]) -> dict[str, float]:
        """Average each metric over the reports that carry it.

        Reports flagged with ``parse_error`` are skipped. Each metric
        contributes the value returned by its collection's
        ``representative()``.
        """
        collected: dict[str, list[float]] = {}
        for rep in reports:
            if not rep.parse_error:
                for metric_name, collection in rep.metrics.items():
                    collected.setdefault(metric_name, []).append(
                        collection.representative()
                    )

        averages: dict[str, float] = {}
        for metric_name, values in collected.items():
            if values:
                averages[metric_name] = mean(values)
        return averages
|
|
@@ -0,0 +1,72 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from typing import TYPE_CHECKING, Any
|
|
4
|
+
|
|
5
|
+
from structured_eval.engine.aggregator import BatchAggregator
|
|
6
|
+
from structured_eval.engine.metric_runner import MetricRunner
|
|
7
|
+
from structured_eval.engine.parser import Parser
|
|
8
|
+
from structured_eval.engine.report_builder import ReportBuilder
|
|
9
|
+
from structured_eval.engine.tree_builder import TreeBuilder
|
|
10
|
+
from structured_eval.models.config import EvalConfig
|
|
11
|
+
from structured_eval.models.context import EvalContext
|
|
12
|
+
from structured_eval.models.result import BatchEvalReport, ConsistencyReport, EvalReport
|
|
13
|
+
from structured_eval.utils.flatten import flatten
|
|
14
|
+
|
|
15
|
+
if TYPE_CHECKING:
|
|
16
|
+
from structured_eval.models.sample import Sample
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
class Evaluator:
    """Drives the evaluation pipeline for one ``EvalConfig``.

    Owns the config and the phase collaborators (parser, metric runner, report
    builder) plus the batch aggregator. A document goes through: parse → build
    tree → run metrics → build report. The module-level ``evaluate`` /
    ``evaluate_consistency`` functions are thin wrappers over this class.
    """

    def __init__(self, config: EvalConfig | None = None):
        """Use ``config`` when given (and truthy), otherwise a default ``EvalConfig``."""
        self.config = config if config else EvalConfig()
        self._parser = Parser()
        self._runner = MetricRunner()
        self._report_builder = ReportBuilder()
        self._aggregator = BatchAggregator()

    def evaluate_one(self, sample: Sample) -> EvalReport:
        """Evaluate one document against its expected reference.

        Both sides are parsed first. If either parse reports an error, a report
        flagged with ``parse_error`` is returned immediately (the actual side's
        error takes precedence) and no metrics are run.
        """
        parsed_actual, actual_problem = self._parser.parse(sample.actual)
        parsed_expected, expected_problem = self._parser.parse(sample.expected)

        problem = actual_problem or expected_problem
        if problem is not None:
            return EvalReport(parse_error=True, parse_error_message=problem)

        context = EvalContext(
            actual=parsed_actual,
            expected=parsed_expected,
            source=sample.source,
            flat_actual=_flat(parsed_actual),
            flat_expected=_flat(parsed_expected),
            config=self.config,
        )

        # Phase 1: structure + per-node metrics.
        tree_builder = TreeBuilder(context)
        root, warnings = tree_builder.build()

        # Phase 2: compute post-order, each node's key_metric last.
        self._runner.run(root)

        # Phase 3: assemble the report.
        return self._report_builder.build(root, context, warnings)

    def evaluate_batch(self, samples: list[Sample]) -> BatchEvalReport:
        """Evaluate every sample in order and aggregate the reports."""
        reports = [self.evaluate_one(sample) for sample in samples]
        return self._aggregator.batch(reports)

    def evaluate_consistency(
        self, runs: list[Sample], *, variance_threshold: float = 0.05
    ) -> ConsistencyReport:
        """Evaluate every run and summarise run-to-run stability."""
        per_run = [self.evaluate_one(run) for run in runs]
        return self._aggregator.consistency(per_run, variance_threshold)
|
|
69
|
+
|
|
70
|
+
|
|
71
|
+
def _flat(data: Any) -> dict[str, Any]:
    """Flatten a dict or list with ``flatten``; any other value gives an empty dict."""
    if isinstance(data, (dict, list)):
        return flatten(data)
    return {}
|