structured-eval 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (94) hide show
  1. structured_eval/__init__.py +27 -0
  2. structured_eval/alignment/__init__.py +15 -0
  3. structured_eval/alignment/base.py +40 -0
  4. structured_eval/alignment/by_index.py +24 -0
  5. structured_eval/alignment/by_key.py +73 -0
  6. structured_eval/alignment/factory.py +28 -0
  7. structured_eval/alignment/hungarian.py +156 -0
  8. structured_eval/api.py +79 -0
  9. structured_eval/engine/__init__.py +15 -0
  10. structured_eval/engine/aggregator.py +96 -0
  11. structured_eval/engine/evaluator.py +72 -0
  12. structured_eval/engine/metric_runner.py +69 -0
  13. structured_eval/engine/parser.py +42 -0
  14. structured_eval/engine/report_builder.py +68 -0
  15. structured_eval/engine/tree_builder.py +319 -0
  16. structured_eval/formats/__init__.py +5 -0
  17. structured_eval/formats/base.py +19 -0
  18. structured_eval/formats/json_parser.py +44 -0
  19. structured_eval/formats/yaml_parser.py +24 -0
  20. structured_eval/integrations/__init__.py +11 -0
  21. structured_eval/integrations/_adapter.py +47 -0
  22. structured_eval/integrations/deepeval.py +74 -0
  23. structured_eval/integrations/langsmith.py +90 -0
  24. structured_eval/metrics/__init__.py +101 -0
  25. structured_eval/metrics/array_accuracy.py +28 -0
  26. structured_eval/metrics/array_cardinality.py +27 -0
  27. structured_eval/metrics/array_exact_match.py +48 -0
  28. structured_eval/metrics/array_f1.py +34 -0
  29. structured_eval/metrics/array_jaccard_similarity.py +60 -0
  30. structured_eval/metrics/array_precision.py +34 -0
  31. structured_eval/metrics/array_prf1.py +40 -0
  32. structured_eval/metrics/array_recall.py +33 -0
  33. structured_eval/metrics/base.py +144 -0
  34. structured_eval/metrics/character_f1.py +50 -0
  35. structured_eval/metrics/composite_score.py +46 -0
  36. structured_eval/metrics/coverage_leaf_score.py +29 -0
  37. structured_eval/metrics/date_distance_score.py +63 -0
  38. structured_eval/metrics/exact.py +21 -0
  39. structured_eval/metrics/exponential_numeric_score.py +47 -0
  40. structured_eval/metrics/field_faithfulness.py +38 -0
  41. structured_eval/metrics/fuzzy.py +64 -0
  42. structured_eval/metrics/invoker.py +90 -0
  43. structured_eval/metrics/levenshtein.py +16 -0
  44. structured_eval/metrics/mean_score.py +31 -0
  45. structured_eval/metrics/numeric.py +83 -0
  46. structured_eval/metrics/numeric_closeness.py +35 -0
  47. structured_eval/metrics/object_accuracy.py +47 -0
  48. structured_eval/metrics/object_exact_match.py +41 -0
  49. structured_eval/metrics/object_f1.py +47 -0
  50. structured_eval/metrics/object_precision.py +49 -0
  51. structured_eval/metrics/object_prf1.py +51 -0
  52. structured_eval/metrics/object_recall.py +44 -0
  53. structured_eval/metrics/object_type_validity.py +34 -0
  54. structured_eval/metrics/overall_leaf_score.py +32 -0
  55. structured_eval/metrics/presence.py +22 -0
  56. structured_eval/metrics/regex_match.py +51 -0
  57. structured_eval/metrics/rule_pass_rate/__init__.py +5 -0
  58. structured_eval/metrics/rule_pass_rate/dsl.py +224 -0
  59. structured_eval/metrics/rule_pass_rate/engine.py +24 -0
  60. structured_eval/metrics/rule_pass_rate/metric.py +33 -0
  61. structured_eval/metrics/schema_validity/__init__.py +7 -0
  62. structured_eval/metrics/schema_validity/metric.py +35 -0
  63. structured_eval/metrics/schema_validity/validator.py +119 -0
  64. structured_eval/metrics/structural_similarity.py +40 -0
  65. structured_eval/metrics/token_f1.py +44 -0
  66. structured_eval/metrics/type_match.py +35 -0
  67. structured_eval/metrics/utils/__init__.py +10 -0
  68. structured_eval/metrics/utils/array.py +31 -0
  69. structured_eval/metrics/utils/calculate.py +72 -0
  70. structured_eval/metrics/utils/number.py +46 -0
  71. structured_eval/metrics/utils/object_utils.py +87 -0
  72. structured_eval/models/__init__.py +72 -0
  73. structured_eval/models/config.py +124 -0
  74. structured_eval/models/context.py +25 -0
  75. structured_eval/models/metric_result.py +121 -0
  76. structured_eval/models/nodes/__init__.py +13 -0
  77. structured_eval/models/nodes/array_node.py +32 -0
  78. structured_eval/models/nodes/base.py +113 -0
  79. structured_eval/models/nodes/object_node.py +19 -0
  80. structured_eval/models/nodes/scalar.py +14 -0
  81. structured_eval/models/result.py +361 -0
  82. structured_eval/models/sample.py +19 -0
  83. structured_eval/py.typed +0 -0
  84. structured_eval/reporting/__init__.py +5 -0
  85. structured_eval/reporting/console.py +194 -0
  86. structured_eval/utils/__init__.py +16 -0
  87. structured_eval/utils/flatten.py +66 -0
  88. structured_eval/utils/paths.py +58 -0
  89. structured_eval/utils/structured_diff.py +159 -0
  90. structured_eval-0.1.0.dist-info/METADATA +322 -0
  91. structured_eval-0.1.0.dist-info/RECORD +94 -0
  92. structured_eval-0.1.0.dist-info/WHEEL +5 -0
  93. structured_eval-0.1.0.dist-info/licenses/LICENSE +201 -0
  94. structured_eval-0.1.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,27 @@
1
+ """structured_eval — field-level evaluation of structured LLM outputs.
2
+
3
+ The top level exposes only the entrypoints. Everything else lives one level
4
+ down, imported explicitly from its subsystem:
5
+
6
+ - ``structured_eval.models`` — user-facing data models: ``Sample``,
7
+ ``EvalConfig`` (+ the ``*FieldConfig`` family & policies), ``EvalReport`` /
8
+ ``BatchEvalReport`` / ``ConsistencyReport``. Lower-level model pieces live in
9
+ precise submodules (``models.nodes`` / ``models.result`` /
10
+ ``models.metric_result`` / ``models.context``).
11
+ - ``structured_eval.metrics`` — every metric plus the base hierarchy
12
+ (``Metric`` / ``FieldMetric`` / …), ``resolve_metric``, and the rule DSL
13
+ (``Rule`` / ``RulePassRate``).
14
+ - ``structured_eval.alignment`` / ``.formats`` / ``.utils`` — supporting
15
+ machinery (array alignment, parsers, ``flatten`` / ``structured_diff``).
16
+
17
+ ``evaluate`` / ``evaluate_batch`` / ``evaluate_consistency`` are thin wrappers
18
+ over ``engine.Evaluator``.
19
+ """
20
+
21
+ from structured_eval.api import evaluate, evaluate_batch, evaluate_consistency
22
+
23
+ __all__ = [
24
+ "evaluate",
25
+ "evaluate_batch",
26
+ "evaluate_consistency",
27
+ ]
@@ -0,0 +1,15 @@
1
+ from structured_eval.alignment.base import ArrayAligner, key_value
2
+ from structured_eval.alignment.by_index import ByIndexAligner
3
+ from structured_eval.alignment.by_key import ByKeyAligner
4
+ from structured_eval.alignment.factory import make_aligner
5
+ from structured_eval.alignment.hungarian import HungarianAligner, Scorer
6
+
7
+ __all__ = [
8
+ "ArrayAligner",
9
+ "ByIndexAligner",
10
+ "ByKeyAligner",
11
+ "HungarianAligner",
12
+ "Scorer",
13
+ "key_value",
14
+ "make_aligner",
15
+ ]
@@ -0,0 +1,40 @@
1
+ from __future__ import annotations
2
+
3
+ from abc import ABC, abstractmethod
4
+ from typing import TYPE_CHECKING, Any
5
+
6
+ from structured_eval.utils.paths import MISSING, navigate
7
+
8
+ if TYPE_CHECKING:
9
+ from structured_eval.models.nodes.array_node import ArrayMatchResult
10
+
11
+ # Sentinel for a key that cannot be extracted (absent, or element not a dict).
12
+ _MISSING_KEY = object()
13
+
14
+
15
def key_value(element: Any, key: str | None) -> Any:
    """Extract the value an aligner compares for ``element``.

    Used by the aligners that pair on a key (``ByKeyAligner``,
    ``HungarianAligner``).

    - ``key`` is ``None``: the element itself is the key.
    - ``element`` is a dict: whatever ``navigate`` finds under ``key``, or
      ``None`` when ``navigate`` reports ``MISSING``.
    - any other element: the private ``_MISSING_KEY`` sentinel, since a named
      field cannot be read from a non-dict.
    """
    if key is None:
        return element
    if not isinstance(element, dict):
        return _MISSING_KEY
    found = navigate(element, key)
    if found is MISSING:
        return None
    return found
28
+
29
+
30
class ArrayAligner(ABC):
    """Abstract base for strategies that pair actual array items with expected ones.

    An aligner only decides *which* items correspond. ``align`` reports the
    matched ``(expected_idx, actual_idx)`` pairs together with the expected
    indices left without a partner (missed) and the actual indices left
    without a partner (spurious). Scoring the values of matched pairs is done
    afterwards, by the array metrics.
    """

    @abstractmethod
    def align(self, expected: list[Any], actual: list[Any]) -> ArrayMatchResult:
        """Pair the items of ``actual`` with the items of ``expected``."""
@@ -0,0 +1,24 @@
1
+ from __future__ import annotations
2
+
3
+ from typing import Any
4
+
5
+ from structured_eval.alignment.base import ArrayAligner
6
+ from structured_eval.models.config import ArrayStrategy
7
+ from structured_eval.models.nodes.array_node import ArrayMatchResult
8
+
9
+
10
class ByIndexAligner(ArrayAligner):
    """Positional aligner: ``expected[i]`` is paired with ``actual[i]``.

    Meant for lists whose order carries meaning (steps, time series,
    rankings). Item contents are never inspected.
    """

    def align(self, expected: list[Any], actual: list[Any]) -> ArrayMatchResult:
        """Pair items position by position over the common prefix.

        Expected items beyond the shorter length are missed; actual items
        beyond it are spurious.
        """
        expected_len = len(expected)
        actual_len = len(actual)
        shared = expected_len if expected_len < actual_len else actual_len
        pairs = list(zip(range(shared), range(shared)))
        return ArrayMatchResult(
            strategy=ArrayStrategy.BY_INDEX,
            matched=pairs,
            missed=list(range(shared, expected_len)),
            spurious=list(range(shared, actual_len)),
        )
@@ -0,0 +1,73 @@
1
+ from __future__ import annotations
2
+
3
+ from typing import Any
4
+
5
+ from structured_eval.alignment.base import ArrayAligner, key_value
6
+ from structured_eval.metrics.base import BaseMetric, resolve_metric
7
+ from structured_eval.metrics.exact import ExactMatch
8
+ from structured_eval.metrics.invoker import MetricInvoker
9
+ from structured_eval.models.config import ArrayStrategy
10
+ from structured_eval.models.nodes.array_node import ArrayMatchResult
11
+
12
+
13
class ByKeyAligner(ArrayAligner):
    """Pairs items whose keys match, claiming the best-scoring pairs first.

    A key is extracted from every element with ``key_value`` (the ``key``
    field, or the whole element when ``key`` is None). Keys are compared with
    ``key_metric`` (``ExactMatch`` when none is given), and a pair becomes a
    candidate when its score is at least ``threshold``. The original notes
    say this subsumes value- and similarity-based matching
    (technical_details_v3 §5).

    Candidates are ranked by score, highest first, and claimed one-to-one; a
    candidate is skipped when either of its sides is already taken. Ties keep
    their ``(expected_idx, actual_idx)`` generation order, so with an exact
    key (every passing score equal) the result is first-match pairing. This is
    a greedy, scipy-free approximation of the optimal assignment computed by
    ``HungarianAligner``.
    """

    def __init__(
        self,
        key: str | None = None,
        key_metric: str | BaseMetric | None = None,
        threshold: float = 1.0,
    ):
        self.key = key
        if key_metric is None:
            metric = ExactMatch()
        else:
            metric = resolve_metric(key_metric)
        self.scorer = MetricInvoker(metric)
        self.threshold = threshold

    def align(self, expected: list[Any], actual: list[Any]) -> ArrayMatchResult:
        """Greedily pair expected and actual items by key similarity."""
        # Collect every (score, expected_idx, actual_idx) clearing the
        # threshold, in (expected_idx, actual_idx) order.
        passing: list[tuple[float, int, int]] = []
        for exp_idx, exp_item in enumerate(expected):
            exp_key = key_value(exp_item, self.key)
            for act_idx, act_item in enumerate(actual):
                act_key = key_value(act_item, self.key)
                similarity = self.scorer.scalar_on_values(act_key, exp_key)
                if similarity >= self.threshold:
                    passing.append((similarity, exp_idx, act_idx))

        # Stable sort on the score alone: equal scores keep generation order.
        ranked = sorted(passing, key=lambda cand: cand[0], reverse=True)

        taken_expected: set[int] = set()
        taken_actual: set[int] = set()
        pairs: list[tuple[int, int]] = []
        for _, exp_idx, act_idx in ranked:
            if exp_idx not in taken_expected and act_idx not in taken_actual:
                taken_expected.add(exp_idx)
                taken_actual.add(act_idx)
                pairs.append((exp_idx, act_idx))

        return ArrayMatchResult(
            strategy=ArrayStrategy.BY_KEY,
            matched=sorted(pairs),  # reported in expected order
            missed=[i for i in range(len(expected)) if i not in taken_expected],
            spurious=[j for j in range(len(actual)) if j not in taken_actual],
        )
@@ -0,0 +1,28 @@
1
+ from __future__ import annotations
2
+
3
+ from typing import TYPE_CHECKING, Any
4
+
5
+ from structured_eval.alignment.by_index import ByIndexAligner
6
+ from structured_eval.alignment.by_key import ByKeyAligner
7
+ from structured_eval.alignment.hungarian import HungarianAligner
8
+ from structured_eval.models.config import ArrayStrategy
9
+
10
+ if TYPE_CHECKING:
11
+ from structured_eval.alignment.base import ArrayAligner
12
+
13
+
14
def make_aligner(
    strategy: ArrayStrategy = ArrayStrategy.BY_INDEX,
    params: dict[str, Any] | None = None,
) -> ArrayAligner:
    """Instantiate the aligner selected by an array config's ``strategy``.

    ``params`` is forwarded as keyword arguments to ``HungarianAligner`` or
    ``ByKeyAligner``, so its keys must be constructor arguments of the chosen
    aligner; an unknown key raises ``TypeError`` from that constructor.
    ``ByIndexAligner`` takes no arguments, so ``params`` is not used for
    ``BY_INDEX``. Any strategy other than ``BY_INDEX`` or ``HUNGARIAN``
    yields a ``ByKeyAligner``.
    """
    kwargs = params if params else {}
    if strategy == ArrayStrategy.HUNGARIAN:
        return HungarianAligner(**kwargs)
    if strategy == ArrayStrategy.BY_INDEX:
        return ByIndexAligner()
    return ByKeyAligner(**kwargs)
@@ -0,0 +1,156 @@
1
+ from __future__ import annotations
2
+
3
+ import warnings
4
+ from collections.abc import Callable
5
+ from typing import Any
6
+
7
+ from structured_eval.alignment.base import ArrayAligner, key_value
8
+ from structured_eval.metrics.base import FieldMetric, Metric, resolve_metric
9
+ from structured_eval.metrics.exact import ExactMatch
10
+ from structured_eval.metrics.invoker import MetricInvoker
11
+ from structured_eval.metrics.numeric_closeness import NumericCloseness
12
+ from structured_eval.models.config import ArrayStrategy
13
+ from structured_eval.models.nodes.array_node import ArrayMatchResult
14
+
15
+ _LARGE_MATRIX_WARN = 10_000 # rows*cols beyond which we warn (quadratic scoring cost)
16
+
17
+ # A per-element similarity: a Metric instance (every Metric has ``score``), its
18
+ # registered name, or a plain ``(actual, expected) -> float`` callable.
19
+ Scorer = Metric[Any] | str | Callable[[Any, Any], float]
20
+
21
+
22
class HungarianAligner(ArrayAligner):
    """Optimal one-to-one array alignment via the Hungarian algorithm.

    A similarity matrix ``S[i][j] = score(expected[i], actual[j])`` is built
    and ``scipy.optimize.linear_sum_assignment`` minimises ``sum(1 - S)`` over
    one-to-one pairings, giving the globally optimal pairing. An assigned pair
    is reported as matched only when its similarity is at least
    ``threshold``; otherwise both sides stay unmatched (a missed expected item
    and a spurious actual item).

    ``scorer`` selects the element similarity:

    * a single ``Scorer`` (a metric instance, a registered metric name, or a
      plain ``(actual, expected) -> float`` callable) — applied to the whole
      element;
    * a ``dict[str, Scorer]`` — per-field scorers for arrays of objects; the
      element score is the mean over the union of both objects' fields, and a
      field without an entry uses the type-aware default;
    * ``None`` — type-aware default: dict pairs are scored field by field,
      bools exactly, number pairs with ``NumericCloseness``, and everything
      else (strings included) with ``ExactMatch``.

    ``key`` scores on a named sub-field (extracted with ``key_value``) instead
    of the whole element. scipy is imported lazily inside ``align``; it is
    provided by the ``align`` extra.
    """

    def __init__(
        self,
        scorer: Scorer | dict[str, Scorer] | None = None,
        threshold: float = 0.8,
        key: str | None = None,
    ):
        self.scorer = scorer
        self.threshold = threshold
        self.key = key

    def align(self, expected: list[Any], actual: list[Any]) -> ArrayMatchResult:
        """Compute the optimal pairing and keep pairs that clear ``threshold``.

        Returns an ``ArrayMatchResult`` whose ``matched`` pairs are
        ``(expected_idx, actual_idx)``; indices not in a kept pair are listed
        in ``missed`` (expected side) or ``spurious`` (actual side).

        Raises:
            ImportError: when scipy is not installed.
        """
        # Nothing to assign when either side is empty.
        if not expected or not actual:
            return ArrayMatchResult(
                strategy=ArrayStrategy.HUNGARIAN,
                matched=[],
                missed=list(range(len(expected))),
                spurious=list(range(len(actual))),
            )

        if len(expected) * len(actual) > _LARGE_MATRIX_WARN:
            warnings.warn(
                f"HungarianAligner: large {len(expected)}x{len(actual)} similarity "
                "matrix; alignment may be slow.",
                stacklevel=2,
            )

        try:
            from scipy.optimize import linear_sum_assignment
        except ImportError as exc:  # pragma: no cover
            raise ImportError(
                "scipy is required for HungarianAligner. "
                "Install it with: pip install 'structured-eval[align]'"
            ) from exc

        # Keep the raw similarities: recovering them as ``1.0 - cost`` is not
        # exact in floating point (1.0 - (1.0 - 0.1) == 0.09999999999999998),
        # which would reject a pair scoring exactly at the threshold.
        similarity = [[self._score(e, a) for a in actual] for e in expected]
        cost = [[1.0 - s for s in row] for row in similarity]
        rows, cols = linear_sum_assignment(cost)

        matched: list[tuple[int, int]] = []
        used_e: set[int] = set()
        used_a: set[int] = set()
        for row, col in zip(rows, cols, strict=True):
            ei, ai = int(row), int(col)  # plain ints instead of numpy integers
            if similarity[ei][ai] >= self.threshold:
                matched.append((ei, ai))
                used_e.add(ei)
                used_a.add(ai)

        missed = [ei for ei in range(len(expected)) if ei not in used_e]
        spurious = [ai for ai in range(len(actual)) if ai not in used_a]
        return ArrayMatchResult(
            strategy=ArrayStrategy.HUNGARIAN,
            matched=matched,
            missed=missed,
            spurious=spurious,
        )

    # ── element similarity ──────────────────────────────────────────────────

    def _score(self, expected: Any, actual: Any) -> float:
        """Similarity of one expected/actual element pair, honouring ``key``."""
        return self._similarity(
            key_value(expected, self.key),
            key_value(actual, self.key),
            self.scorer,
        )

    def _similarity(
        self,
        expected: Any,
        actual: Any,
        scorer: Scorer | dict[str, Scorer] | None,
    ) -> float:
        """Dispatch on the scorer kind.

        A dict scorer scores field by field, any other non-None scorer is
        applied directly, and ``None`` falls back to the type-aware default
        (field by field for a pair of dicts).
        """
        if isinstance(scorer, dict):
            return self._object_similarity(expected, actual, scorer)
        if scorer is not None:
            return self._apply(scorer, expected, actual)
        if isinstance(expected, dict) and isinstance(actual, dict):
            return self._object_similarity(expected, actual, {})
        return self._apply(self._default_scorer(expected, actual), expected, actual)

    def _object_similarity(
        self, expected: Any, actual: Any, scorers: dict[str, Scorer]
    ) -> float:
        """Mean per-field similarity over the union of both objects' keys.

        When either side is not a dict the result is 1.0 for equal values and
        0.0 otherwise. Two empty dicts score 1.0. A field present on only one
        side is compared against ``None``.
        """
        if not isinstance(expected, dict) or not isinstance(actual, dict):
            return 1.0 if expected == actual else 0.0
        keys = set(expected) | set(actual)
        if not keys:
            return 1.0
        total = sum(
            self._similarity(expected.get(k), actual.get(k), scorers.get(k))
            for k in keys
        )
        return total / len(keys)

    @staticmethod
    def _apply(scorer: Scorer, expected: Any, actual: Any) -> float:
        """Run one scorer on a value pair.

        A plain callable is called as ``scorer(actual, expected)``; a metric
        instance or registered name goes through ``resolve_metric`` and
        ``MetricInvoker.scalar_on_values``.
        """
        if callable(scorer) and not isinstance(scorer, (str, Metric)):
            return float(scorer(actual, expected))
        return MetricInvoker(resolve_metric(scorer)).scalar_on_values(actual, expected)

    @staticmethod
    def _default_scorer(expected: Any, actual: Any) -> FieldMetric:
        """Type-aware default similarity metric for a pair of scalar values.

        Either side ``bool`` → ``ExactMatch``; both sides ``int``/``float`` →
        graded :class:`NumericCloseness`; everything else, strings included,
        → ``ExactMatch``.

        NOTE(review): earlier documentation promised a fuzzy string metric
        here, but no such branch exists in this code; strings are compared
        exactly. Confirm which behaviour is intended.
        """
        # bool is checked first because bool is a subclass of int.
        if isinstance(expected, bool) or isinstance(actual, bool):
            return ExactMatch()
        if isinstance(expected, (int, float)) and isinstance(actual, (int, float)):
            return NumericCloseness()
        return ExactMatch()
structured_eval/api.py ADDED
@@ -0,0 +1,79 @@
1
+ from __future__ import annotations
2
+
3
+ from typing import TYPE_CHECKING, Any
4
+
5
+ from structured_eval.engine.evaluator import Evaluator
6
+ from structured_eval.models.sample import Sample
7
+
8
+ if TYPE_CHECKING:
9
+ from structured_eval.models.config import EvalConfig
10
+ from structured_eval.models.result import (
11
+ BatchEvalReport,
12
+ ConsistencyReport,
13
+ EvalReport,
14
+ )
15
+
16
+
17
+ def _is_batch(actual: Any) -> bool:
18
+ """A list of Samples is a batch; a bare list is a single array-root doc."""
19
+ return isinstance(actual, list) and all(isinstance(x, Sample) for x in actual)
20
+
21
+
22
def evaluate(
    actual: Any,
    expected: Any = None,
    config: EvalConfig | None = None,
    *,
    source: str | None = None,
) -> EvalReport:
    """Score a single document against its reference and return an ``EvalReport``.

    Accepted call shapes:
    - ``evaluate(actual, expected, config=...)`` — the values are wrapped in a
      ``Sample`` together with ``source``;
    - ``evaluate(sample, config=...)`` — an existing ``Sample`` is used as is
      (``expected`` and ``source`` are then not consulted).

    A bare ``list`` is one document with an array root, not a batch; use
    :func:`evaluate_batch` for several samples. The work is delegated to
    ``Evaluator.evaluate_one``.

    Raises:
        TypeError: when ``_is_batch`` classifies ``actual`` as a batch.
    """
    if _is_batch(actual):
        raise TypeError(
            "evaluate() takes a single document; pass a list of Samples to evaluate_batch()"
        )
    if isinstance(actual, Sample):
        sample = actual
    else:
        sample = Sample(actual=actual, expected=expected, source=source)
    evaluator = Evaluator(config)
    return evaluator.evaluate_one(sample)
49
+
50
+
51
def evaluate_batch(
    samples: list[Sample],
    config: EvalConfig | None = None,
) -> BatchEvalReport:
    """Score several ``Sample`` objects and return a ``BatchEvalReport``.

    Every sample brings its own ``actual`` / ``expected`` / ``source``. The
    returned report holds the per-sample reports plus batch-level metrics.
    The work is delegated to ``Evaluator.evaluate_batch``.
    """
    evaluator = Evaluator(config)
    report = evaluator.evaluate_batch(samples)
    return report
62
+
63
+
64
def evaluate_consistency(
    runs: list[Sample],
    config: EvalConfig | None = None,
    *,
    variance_threshold: float = 0.05,
) -> ConsistencyReport:
    """Report how stable repeated outputs of one prompt are across runs.

    ``runs`` holds several outputs produced for the same input, with or
    without a shared ``expected``. A field whose score variance across runs is
    at most ``variance_threshold`` is reported as stable; the others are
    reported as unstable. The work is delegated to
    ``Evaluator.evaluate_consistency``.
    """
    evaluator = Evaluator(config)
    report = evaluator.evaluate_consistency(
        runs,
        variance_threshold=variance_threshold,
    )
    return report
@@ -0,0 +1,15 @@
1
+ from structured_eval.engine.aggregator import BatchAggregator
2
+ from structured_eval.engine.evaluator import Evaluator
3
+ from structured_eval.engine.metric_runner import MetricRunner
4
+ from structured_eval.engine.parser import Parser
5
+ from structured_eval.engine.report_builder import ReportBuilder
6
+ from structured_eval.engine.tree_builder import TreeBuilder
7
+
8
+ __all__ = [
9
+ "BatchAggregator",
10
+ "Evaluator",
11
+ "MetricRunner",
12
+ "Parser",
13
+ "ReportBuilder",
14
+ "TreeBuilder",
15
+ ]
@@ -0,0 +1,96 @@
1
+ """Aggregation over multiple EvalReports: batch and consistency statistics."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from statistics import mean, pvariance
6
+
7
+ from structured_eval.models.result import (
8
+ BatchEvalReport,
9
+ ConsistencyReport,
10
+ EvalReport,
11
+ NodeType,
12
+ )
13
+
14
+
15
class BatchAggregator:
    """Folds per-document ``EvalReport`` objects into batch / consistency summaries."""

    def batch(self, reports: list[EvalReport]) -> BatchEvalReport:
        """Summarise single-document reports as one ``BatchEvalReport``.

        Reports flagged with ``parse_error`` are excluded from the mean score,
        the label and the per-metric means, but they still count in the
        denominator of both rates.
        """
        total = len(reports)
        parsed = [report for report in reports if not report.parse_error]
        parse_failures = total - len(parsed)

        known_scores = [report.score for report in parsed if report.score is not None]
        if known_scores:
            overall = mean(known_scores)
        else:
            overall = None

        # The label comes from the first parsed report that carries one.
        label = None
        for report in parsed:
            if report.score_label is not None:
                label = report.score_label
                break

        flawless = len([report for report in parsed if not report.failed_fields()])

        if total:
            perfect_rate = flawless / total
            error_rate = parse_failures / total
        else:
            perfect_rate = 0.0
            error_rate = 0.0

        return BatchEvalReport(
            per_sample=reports,
            metrics=self._mean_metrics(reports),
            score=overall,
            score_label=label,
            perfect_response_rate=perfect_rate,
            parse_error_rate=error_rate,
        )

    def consistency(
        self, reports: list[EvalReport], variance_threshold: float = 0.05
    ) -> ConsistencyReport:
        """Quantify run-to-run stability over repeated outputs of one prompt.

        For every scalar field path the population variance of its scores
        across the parsed runs is computed (0.0 when only one value exists).
        A path is stable when that variance is at most ``variance_threshold``.
        """
        parsed = [report for report in reports if not report.parse_error]

        # Only leaf (scalar) fields are tracked. An object/array node carries
        # an aggregate score whose variance merely reflects its children, so
        # listing it would double-count the same wobble, would not point at a
        # concrete field to fix, and would mix scores of a different nature.
        #
        # TODO: offer per-node stability for non-scalar nodes as a separate,
        # parallel view (e.g. ConsistencyReport.object_variance /
        # block_variance) rather than by dropping this filter, which would
        # mix scales in the leaf map.
        samples_by_path: dict[str, list[float]] = {}
        for report in parsed:
            for path, field_score in report.field_scores.items():
                is_scalar = field_score.node_type == NodeType.SCALAR
                if field_score.score is not None and is_scalar:
                    samples_by_path.setdefault(path, []).append(field_score.score)

        variance: dict[str, float] = {}
        stable: list[str] = []
        unstable: list[str] = []
        for path, values in samples_by_path.items():
            spread = 0.0
            if len(values) > 1:
                spread = pvariance(values)
            variance[path] = spread
            if spread <= variance_threshold:
                stable.append(path)
            else:
                unstable.append(path)

        run_scores = [report.score for report in parsed if report.score is not None]
        if not run_scores:
            mean_score = None
            score_variance = None
        else:
            mean_score = mean(run_scores)
            score_variance = pvariance(run_scores) if len(run_scores) > 1 else 0.0

        return ConsistencyReport(
            per_run=reports,
            field_variance=variance,
            stable_fields=stable,
            unstable_fields=unstable,
            mean_score=mean_score,
            score_variance=score_variance,
        )

    @staticmethod
    def _mean_metrics(reports: list[EvalReport]) -> dict[str, float]:
        """Average each metric over the parsed reports that carry it."""
        collected: dict[str, list[float]] = {}
        for report in (r for r in reports if not r.parse_error):
            for name, coll in report.metrics.items():
                collected.setdefault(name, []).append(coll.representative())
        return {name: mean(values) for name, values in collected.items() if values}
@@ -0,0 +1,72 @@
1
+ from __future__ import annotations
2
+
3
+ from typing import TYPE_CHECKING, Any
4
+
5
+ from structured_eval.engine.aggregator import BatchAggregator
6
+ from structured_eval.engine.metric_runner import MetricRunner
7
+ from structured_eval.engine.parser import Parser
8
+ from structured_eval.engine.report_builder import ReportBuilder
9
+ from structured_eval.engine.tree_builder import TreeBuilder
10
+ from structured_eval.models.config import EvalConfig
11
+ from structured_eval.models.context import EvalContext
12
+ from structured_eval.models.result import BatchEvalReport, ConsistencyReport, EvalReport
13
+ from structured_eval.utils.flatten import flatten
14
+
15
+ if TYPE_CHECKING:
16
+ from structured_eval.models.sample import Sample
17
+
18
+
19
class Evaluator:
    """Drives one evaluation for a fixed ``EvalConfig``.

    Owns the config and the collaborators for each step (parse → build tree →
    run metrics → build report) and aggregates multiple reports. The
    module-level ``evaluate`` / ``evaluate_consistency`` functions are thin
    wrappers over this class.
    """

    def __init__(self, config: EvalConfig | None = None):
        self.config = config if config else EvalConfig()
        self._parser = Parser()
        self._runner = MetricRunner()
        self._report_builder = ReportBuilder()
        self._aggregator = BatchAggregator()

    def evaluate_one(self, sample: Sample) -> EvalReport:
        """Evaluate one sample's ``actual`` against its ``expected``.

        Both sides are parsed first; if either yields an error message, a
        report flagged ``parse_error`` is returned and no metrics are run.
        """
        actual, actual_err = self._parser.parse(sample.actual)
        expected, expected_err = self._parser.parse(sample.expected)
        parse_failure = actual_err or expected_err
        if parse_failure is not None:
            return EvalReport(parse_error=True, parse_error_message=parse_failure)

        context = EvalContext(
            actual=actual,
            expected=expected,
            source=sample.source,
            flat_actual=_flat(actual),
            flat_expected=_flat(expected),
            config=self.config,
        )

        # Phase 1: structure + per-node metrics.
        tree_builder = TreeBuilder(context)
        root, warnings = tree_builder.build()
        # Phase 2: compute post-order, each node's key_metric last.
        self._runner.run(root)
        # Phase 3: assemble the report.
        return self._report_builder.build(root, context, warnings)

    def evaluate_batch(self, samples: list[Sample]) -> BatchEvalReport:
        """Evaluate every sample and aggregate the reports into one batch report."""
        reports = list(map(self.evaluate_one, samples))
        return self._aggregator.batch(reports)

    def evaluate_consistency(
        self, runs: list[Sample], *, variance_threshold: float = 0.05
    ) -> ConsistencyReport:
        """Evaluate repeated runs of one prompt and summarise their stability."""
        reports = list(map(self.evaluate_one, runs))
        return self._aggregator.consistency(reports, variance_threshold)
69
+
70
+
71
+ def _flat(data: Any) -> dict[str, Any]:
72
+ return flatten(data) if isinstance(data, (dict, list)) else {}