structured-eval 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (94) hide show
  1. structured_eval/__init__.py +27 -0
  2. structured_eval/alignment/__init__.py +15 -0
  3. structured_eval/alignment/base.py +40 -0
  4. structured_eval/alignment/by_index.py +24 -0
  5. structured_eval/alignment/by_key.py +73 -0
  6. structured_eval/alignment/factory.py +28 -0
  7. structured_eval/alignment/hungarian.py +156 -0
  8. structured_eval/api.py +79 -0
  9. structured_eval/engine/__init__.py +15 -0
  10. structured_eval/engine/aggregator.py +96 -0
  11. structured_eval/engine/evaluator.py +72 -0
  12. structured_eval/engine/metric_runner.py +69 -0
  13. structured_eval/engine/parser.py +42 -0
  14. structured_eval/engine/report_builder.py +68 -0
  15. structured_eval/engine/tree_builder.py +319 -0
  16. structured_eval/formats/__init__.py +5 -0
  17. structured_eval/formats/base.py +19 -0
  18. structured_eval/formats/json_parser.py +44 -0
  19. structured_eval/formats/yaml_parser.py +24 -0
  20. structured_eval/integrations/__init__.py +11 -0
  21. structured_eval/integrations/_adapter.py +47 -0
  22. structured_eval/integrations/deepeval.py +74 -0
  23. structured_eval/integrations/langsmith.py +90 -0
  24. structured_eval/metrics/__init__.py +101 -0
  25. structured_eval/metrics/array_accuracy.py +28 -0
  26. structured_eval/metrics/array_cardinality.py +27 -0
  27. structured_eval/metrics/array_exact_match.py +48 -0
  28. structured_eval/metrics/array_f1.py +34 -0
  29. structured_eval/metrics/array_jaccard_similarity.py +60 -0
  30. structured_eval/metrics/array_precision.py +34 -0
  31. structured_eval/metrics/array_prf1.py +40 -0
  32. structured_eval/metrics/array_recall.py +33 -0
  33. structured_eval/metrics/base.py +144 -0
  34. structured_eval/metrics/character_f1.py +50 -0
  35. structured_eval/metrics/composite_score.py +46 -0
  36. structured_eval/metrics/coverage_leaf_score.py +29 -0
  37. structured_eval/metrics/date_distance_score.py +63 -0
  38. structured_eval/metrics/exact.py +21 -0
  39. structured_eval/metrics/exponential_numeric_score.py +47 -0
  40. structured_eval/metrics/field_faithfulness.py +38 -0
  41. structured_eval/metrics/fuzzy.py +64 -0
  42. structured_eval/metrics/invoker.py +90 -0
  43. structured_eval/metrics/levenshtein.py +16 -0
  44. structured_eval/metrics/mean_score.py +31 -0
  45. structured_eval/metrics/numeric.py +83 -0
  46. structured_eval/metrics/numeric_closeness.py +35 -0
  47. structured_eval/metrics/object_accuracy.py +47 -0
  48. structured_eval/metrics/object_exact_match.py +41 -0
  49. structured_eval/metrics/object_f1.py +47 -0
  50. structured_eval/metrics/object_precision.py +49 -0
  51. structured_eval/metrics/object_prf1.py +51 -0
  52. structured_eval/metrics/object_recall.py +44 -0
  53. structured_eval/metrics/object_type_validity.py +34 -0
  54. structured_eval/metrics/overall_leaf_score.py +32 -0
  55. structured_eval/metrics/presence.py +22 -0
  56. structured_eval/metrics/regex_match.py +51 -0
  57. structured_eval/metrics/rule_pass_rate/__init__.py +5 -0
  58. structured_eval/metrics/rule_pass_rate/dsl.py +224 -0
  59. structured_eval/metrics/rule_pass_rate/engine.py +24 -0
  60. structured_eval/metrics/rule_pass_rate/metric.py +33 -0
  61. structured_eval/metrics/schema_validity/__init__.py +7 -0
  62. structured_eval/metrics/schema_validity/metric.py +35 -0
  63. structured_eval/metrics/schema_validity/validator.py +119 -0
  64. structured_eval/metrics/structural_similarity.py +40 -0
  65. structured_eval/metrics/token_f1.py +44 -0
  66. structured_eval/metrics/type_match.py +35 -0
  67. structured_eval/metrics/utils/__init__.py +10 -0
  68. structured_eval/metrics/utils/array.py +31 -0
  69. structured_eval/metrics/utils/calculate.py +72 -0
  70. structured_eval/metrics/utils/number.py +46 -0
  71. structured_eval/metrics/utils/object_utils.py +87 -0
  72. structured_eval/models/__init__.py +72 -0
  73. structured_eval/models/config.py +124 -0
  74. structured_eval/models/context.py +25 -0
  75. structured_eval/models/metric_result.py +121 -0
  76. structured_eval/models/nodes/__init__.py +13 -0
  77. structured_eval/models/nodes/array_node.py +32 -0
  78. structured_eval/models/nodes/base.py +113 -0
  79. structured_eval/models/nodes/object_node.py +19 -0
  80. structured_eval/models/nodes/scalar.py +14 -0
  81. structured_eval/models/result.py +361 -0
  82. structured_eval/models/sample.py +19 -0
  83. structured_eval/py.typed +0 -0
  84. structured_eval/reporting/__init__.py +5 -0
  85. structured_eval/reporting/console.py +194 -0
  86. structured_eval/utils/__init__.py +16 -0
  87. structured_eval/utils/flatten.py +66 -0
  88. structured_eval/utils/paths.py +58 -0
  89. structured_eval/utils/structured_diff.py +159 -0
  90. structured_eval-0.1.0.dist-info/METADATA +322 -0
  91. structured_eval-0.1.0.dist-info/RECORD +94 -0
  92. structured_eval-0.1.0.dist-info/WHEEL +5 -0
  93. structured_eval-0.1.0.dist-info/licenses/LICENSE +201 -0
  94. structured_eval-0.1.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,46 @@
1
+ """Lenient numeric parsing shared by the numeric field metrics.
2
+
3
+ One parsing behavior for ``Numeric`` and ``NumericCloseness`` so a value is read
4
+ the same way by both. Accepts int/float (rejecting ``bool``) and parses numeric
5
+ strings: currency symbols, thousands separators and whitespace are stripped,
6
+ accounting notation ``"(123)"`` is read as ``-123``, and scientific notation
7
+ ``"1e3"`` is supported. A ``"%"`` is only stripped, never interpreted
8
+ (``"50%"`` → ``50``). US format is assumed (``,`` = thousands, ``.`` = decimal);
9
+ anything that does not parse cleanly returns ``None``.
10
+ """
11
+
12
+ from __future__ import annotations
13
+
14
+ import re
15
+ from typing import Any
16
+
17
# Everything that is not part of a (possibly scientific) number. Kept: digits,
# decimal point, signs, and the exponent marker e/E, so float() parses
# scientific notation ("1e3" → 1000.0, "1.5e-3" → 0.0015).
# Because e/E survive the strip, stray letters such as the "e" in "5 euro"
# leave "5e", which float() rejects — such input yields ``None``.
_NON_NUMERIC = re.compile(r"[^0-9eE.+\-]")


def parse_number(value: Any) -> float | None:
    """Coerce ``value`` to a float, or ``None`` if it isn't cleanly numeric.

    Args:
        value: Anything. ``bool`` is rejected explicitly (it is an ``int``
            subclass); ``int``/``float`` are converted directly; ``str`` is
            cleaned and parsed; every other type yields ``None``.

    Returns:
        The parsed float, or ``None`` when the value is not numeric, the
        cleaned string is not accepted by ``float()``, or an integer is too
        large to be represented as a float.
    """
    if isinstance(value, bool):
        return None
    if isinstance(value, (int, float)):
        try:
            return float(value)
        except OverflowError:
            # An int beyond float range cannot be represented; treat it like
            # any other value that does not convert cleanly.
            return None
    if not isinstance(value, str):
        return None

    text = value.strip()
    negative = False
    # Accounting notation: "(123)" means -123. The sign of whatever is inside
    # the parentheses is flipped once more at the end.
    if text.startswith("(") and text.endswith(")"):
        text = text[1:-1]
        negative = True

    # Drop currency symbols, thousands separators, "%", whitespace, letters.
    text = _NON_NUMERIC.sub("", text)
    try:
        # float() itself rejects leftovers such as "", "-", "." and "-.".
        number = float(text)
    except ValueError:
        return None
    return -number if negative else number
@@ -0,0 +1,87 @@
1
+ """Verdicts for object metrics: matched fields → ``(score, threshold, weight)``.
2
+
3
+ A parent object does not re-compare its children; it reads each matched child's
4
+ already-computed representative score (``node.representative``) and pairs it with
5
+ the bar it must clear and the weight it carries. Those triples feed
6
+ ``calculate.prf_counts``.
7
+
8
+ ``score_policy`` (on ``ObjectF1`` / ``ObjectAccuracy`` / …) overrides the
9
+ criterion for a named field — a metric instance or its registered name, run on
10
+ that child via ``MetricInvoker`` (so it works for any child kind, not only
11
+ scalars). ``thresholds`` may be a per-field dict or a single float.
12
+
13
+ ``weight_mode`` (see ``calculate.WeightMode``) decides each child's weight:
14
+ ``NONE`` → ``1.0`` (plain counts), ``PROPORTIONAL`` → the child's configured
15
+ ``weight``. Missing (FN) and spurious (FP) children are weighted the same way
16
+ via ``missing_weight`` / ``spurious_weight``.
17
+ """
18
+
19
+ from __future__ import annotations
20
+
21
+ from typing import TYPE_CHECKING, Any
22
+
23
+ from structured_eval.metrics.base import resolve_metric
24
+ from structured_eval.metrics.invoker import MetricInvoker
25
+ from structured_eval.metrics.utils.calculate import WeightMode
26
+
27
+ if TYPE_CHECKING:
28
+ from structured_eval.models.nodes.base import EvalNode
29
+ from structured_eval.models.nodes.object_node import ObjectNode
30
+
31
+
32
def leaf_name(path: str) -> str:
    """Return the final dotted segment of ``path`` with any index suffix cut off.

    Example: ``"a.b[0]"`` → ``"b"``. Everything from the first ``"["`` of the
    last segment onward is discarded.
    """
    # Text after the last "." (the whole path when there is no dot).
    _, _, last_segment = path.rpartition(".")
    # Text before the first "[" (the whole segment when there is no bracket).
    field, _, _ = last_segment.partition("[")
    return field
35
+
36
+
37
+ def _resolve_threshold(
38
+ thresholds: float | dict[str, float] | None, name: str, fallback: float
39
+ ) -> float:
40
+ if isinstance(thresholds, dict):
41
+ return float(thresholds.get(name, fallback))
42
+ if thresholds is not None:
43
+ return float(thresholds)
44
+ return fallback
45
+
46
+
47
def _weight_of(child: EvalNode, weight_mode: WeightMode) -> float:
    """Weight a child carries: its own ``weight`` when proportional, else ``1.0``."""
    if weight_mode == WeightMode.PROPORTIONAL:
        return child.weight
    return 1.0
49
+
50
+
51
def matched_verdicts(
    node: ObjectNode,
    score_policy: dict[str, Any] | None = None,
    thresholds: float | dict[str, float] | None = None,
    weight_mode: WeightMode = WeightMode.PROPORTIONAL,
) -> list[tuple[float, float, float]]:
    """Build one ``(score, threshold, weight)`` triple per matched child.

    Args:
        node: Object node whose ``matched`` children are graded.
        score_policy: Optional mapping from field name to a metric spec. A
            child whose leaf name has an entry is re-scored by running the
            resolved metric on it through ``MetricInvoker``; every other
            child contributes its ``representative`` score.
        thresholds: Per-field dict or single value overriding the child's own
            ``threshold`` (see ``_resolve_threshold``).
        weight_mode: How each child's weight is chosen (see ``_weight_of``).

    Returns:
        The triples, in the order of ``node.matched``.
    """
    overrides = score_policy or {}
    verdicts: list[tuple[float, float, float]] = []
    for child in node.matched:
        field = leaf_name(child.path)
        override = overrides.get(field)
        score = (
            child.representative
            if override is None
            else MetricInvoker(resolve_metric(override)).scalar_on_node(child)
        )
        bar = _resolve_threshold(thresholds, field, child.threshold)
        verdicts.append((score, bar, _weight_of(child, weight_mode)))
    return verdicts
74
+
75
+
76
def missing_weight(
    node: ObjectNode, weight_mode: WeightMode = WeightMode.PROPORTIONAL
) -> float:
    """Total weight of the children listed in ``node.missing`` (the FN side).

    Each name is looked up in ``node.children`` and weighted by
    ``_weight_of``, so the total is a plain count when weights are uniform.
    """
    absent = (node.children[key] for key in node.missing)
    return sum(_weight_of(child, weight_mode) for child in absent)
81
+
82
+
83
def spurious_weight(
    node: ObjectNode, weight_mode: WeightMode = WeightMode.PROPORTIONAL
) -> float:
    """Total weight of the children listed in ``node.spurious`` (the FP side).

    Each name is looked up in ``node.children`` and weighted by
    ``_weight_of``, so the total is a plain count when weights are uniform.
    """
    extras = (node.children[key] for key in node.spurious)
    return sum(_weight_of(child, weight_mode) for child in extras)
@@ -0,0 +1,72 @@
1
+ """structured_eval.models — the pydantic data layer.
2
+
3
+ Single home for every user-facing data model, re-exported here so callers use
4
+ one path — ``from structured_eval.models import <X>`` — rather than reaching
5
+ into individual submodules:
6
+
7
+ - configuration: ``EvalConfig`` + the ``*FieldConfig`` family and policies;
8
+ - input: ``Sample``, ``EvalContext``;
9
+ - the ``EvalNode`` tree: ``EvalNode`` / ``ScalarNode`` / ``ObjectNode`` /
10
+ ``ArrayNode`` (+ ``ArrayMatchResult``);
11
+ - metric values: ``MetricResult`` / ``MetricCollection``;
12
+ - reports & scores: ``EvalReport`` / ``BatchEvalReport`` / ``ConsistencyReport``,
13
+ ``FieldScore`` / ``RuleResult`` / ``RegressionDiff`` / ``EvalWarning`` /
14
+ ``WarningType`` / ``NodeType``.
15
+ """
16
+
17
+ from structured_eval.models.config import (
18
+ ArrayFieldConfig,
19
+ ArrayStrategy,
20
+ EvalConfig,
21
+ ExtraKeysPolicy,
22
+ FieldConfig,
23
+ ObjectFieldConfig,
24
+ )
25
+ from structured_eval.models.context import EvalContext
26
+ from structured_eval.models.metric_result import MetricCollection, MetricResult
27
+ from structured_eval.models.nodes import (
28
+ ArrayMatchResult,
29
+ ArrayNode,
30
+ EvalNode,
31
+ ObjectNode,
32
+ ScalarNode,
33
+ )
34
+ from structured_eval.models.result import (
35
+ BatchEvalReport,
36
+ ConsistencyReport,
37
+ EvalReport,
38
+ EvalWarning,
39
+ FieldScore,
40
+ NodeType,
41
+ RegressionDiff,
42
+ RuleResult,
43
+ WarningType,
44
+ )
45
+ from structured_eval.models.sample import Sample
46
+
47
+ __all__ = [
48
+ "ArrayFieldConfig",
49
+ "ArrayMatchResult",
50
+ "ArrayNode",
51
+ "ArrayStrategy",
52
+ "BatchEvalReport",
53
+ "ConsistencyReport",
54
+ "EvalConfig",
55
+ "EvalContext",
56
+ "EvalNode",
57
+ "EvalReport",
58
+ "EvalWarning",
59
+ "ExtraKeysPolicy",
60
+ "FieldConfig",
61
+ "FieldScore",
62
+ "MetricCollection",
63
+ "MetricResult",
64
+ "NodeType",
65
+ "ObjectFieldConfig",
66
+ "ObjectNode",
67
+ "RegressionDiff",
68
+ "RuleResult",
69
+ "Sample",
70
+ "ScalarNode",
71
+ "WarningType",
72
+ ]
@@ -0,0 +1,124 @@
1
+ from __future__ import annotations
2
+
3
+ from enum import StrEnum
4
+ from typing import Any
5
+
6
+ from pydantic import BaseModel, ConfigDict, Field
7
+
8
+ # ── Defaults ──────────────────────────────────────────────────────────────────
9
+
10
+ DEFAULT_FIELD_WEIGHT: float = 1.0
11
+
12
+
13
+ # ── Enums ─────────────────────────────────────────────────────────────────────
14
+
15
+
16
class ExtraKeysPolicy(StrEnum):
    """How to treat keys present in actual but absent from expected.

    A ``StrEnum``: every member is also its lowercase string value.
    """

    IGNORE = "ignore"  # extra keys are skipped
    PENALIZE = "penalize"  # extra keys lower precision
21
+
22
+
23
class ArrayStrategy(StrEnum):
    """How to align actual array items with expected ones.

    A ``StrEnum``: every member is also its lowercase string value.
    """

    BY_INDEX = "by_index"  # pair the i-th with the i-th
    BY_KEY = "by_key"  # match on a shared unique field (see ArrayFieldConfig.params)
    HUNGARIAN = "hungarian"  # optimal one-to-one assignment by element similarity
29
+
30
+
31
+ # ── Field configs ───────────────────────────────────────────────────────────
32
+
33
+
34
class FieldConfig(BaseModel):
    """Configuration for a scalar (leaf) field.

    In v3 comparison is a metric: ``metrics`` is the field's metric list, *added*
    to the metrics cascading from ``EvalConfig.metrics``. ``key_metric`` names
    which of them is the match criterion the parent object/array uses (a metric
    instance or its registered name; ``None`` → ``ExactMatch``); ``threshold`` is
    the bar it must clear to count as a true positive.
    """

    # Metric instances are arbitrary (non-pydantic) objects.
    model_config = ConfigDict(arbitrary_types_allowed=True)

    metrics: list[Any] | None = (
        None  # list[Metric]; added to the cascading config.metrics
    )
    key_metric: Any = None  # Metric | name str used as the parent's match criterion
    threshold: float | None = None  # None means no threshold is set on this config
    weight: float = DEFAULT_FIELD_WEIGHT  # aggregation weight, read by weight_of()
52
+
53
+
54
class ObjectFieldConfig(BaseModel):
    """Configuration for an object (dict) field.

    ``fields`` maps a child key to that child's own config (scalar, object or
    array), so configs nest to any depth.
    """

    # Metric instances are arbitrary (non-pydantic) objects.
    model_config = ConfigDict(arbitrary_types_allowed=True)

    # AnyFieldConfig is defined further down the module; the forward reference
    # is resolved by the model_rebuild() call at the end of the file.
    fields: dict[str, AnyFieldConfig] = Field(default_factory=dict)
    weight: float = DEFAULT_FIELD_WEIGHT  # aggregation weight, read by weight_of()
    threshold: float | None = None  # None means no threshold is set on this config
    metrics: list[Any] | None = None  # presumably list[Metric], as on FieldConfig
63
+
64
+
65
class ArrayFieldConfig(BaseModel):
    """Configuration for an array (list) field.

    ``item`` describes the type and config of each element. ``strategy`` picks
    the aligner; ``params`` carries that strategy's options (interpreted by the
    aligner built in ``make_aligner``), so new strategies add no new fields here:

    * ``BY_INDEX`` → ``params`` empty.
    * ``BY_KEY`` → ``{"key": <field|None>, "key_metric": <metric|name>,
      "threshold": <float>}``. The generalized ``BY_KEY`` subsumes value- and
      similarity-based matching (technical_details_v3 §5).
    * ``HUNGARIAN`` → ``{"scorer": <Scorer | dict[str, Scorer] | None>,
      "threshold": <float>, "key": <field|None>}``. Optimal one-to-one
      assignment; ``scorer`` as a per-field dict scores arrays of objects.
    """

    # Metric instances are arbitrary (non-pydantic) objects.
    model_config = ConfigDict(arbitrary_types_allowed=True)

    # Element config; the annotation admits scalar or object items only.
    item: FieldConfig | ObjectFieldConfig | None = None
    strategy: ArrayStrategy = ArrayStrategy.BY_INDEX  # positional pairing by default
    params: dict[str, Any] = Field(default_factory=dict)  # strategy-specific options
    weight: float = DEFAULT_FIELD_WEIGHT  # aggregation weight, read by weight_of()
    threshold: float | None = None  # None means no threshold is set on this config
    metrics: list[Any] | None = None  # presumably list[Metric], as on FieldConfig
89
+
90
+
91
+ AnyFieldConfig = FieldConfig | ObjectFieldConfig | ArrayFieldConfig
92
+
93
+
94
def weight_of(cfg: AnyFieldConfig | None) -> float:
    """Return the aggregation weight of ``cfg``.

    A missing config (``None``) contributes ``DEFAULT_FIELD_WEIGHT``; any
    other config contributes its own ``weight``.
    """
    if cfg is None:
        return DEFAULT_FIELD_WEIGHT
    return cfg.weight
97
+
98
+
99
+ # ── Eval config ───────────────────────────────────────────────────────────────
100
+
101
+
102
class EvalConfig(BaseModel):
    """Top-level evaluation configuration.

    Metrics are class instances (e.g. ``ObjectF1()``, ``SchemaValidity(...)``).
    ``fields`` accepts canonical nested configs as well as dot-notation keys
    (``"vendor.name"``) as syntactic sugar. ``root`` explicitly declares the
    type of the root node; when omitted it is inferred from ``type(actual)``.
    """

    # Metric instances are arbitrary (non-pydantic) objects.
    model_config = ConfigDict(arbitrary_types_allowed=True)

    metrics: list[Any] = Field(
        default_factory=list
    )  # list[Metric]; cascade by type to all nodes
    fields: dict[str, AnyFieldConfig] = Field(default_factory=dict)
    # The annotation admits only object or array configs at the root.
    root: ObjectFieldConfig | ArrayFieldConfig | None = None
    key_metric: Any = None  # Metric whose value becomes report.score
    extra_keys: ExtraKeysPolicy = ExtraKeysPolicy.IGNORE  # extra keys skipped by default
120
+
121
+
122
+ ObjectFieldConfig.model_rebuild()
123
+ ArrayFieldConfig.model_rebuild()
124
+ EvalConfig.model_rebuild()
@@ -0,0 +1,25 @@
1
+ from __future__ import annotations
2
+
3
+ from typing import Any
4
+
5
+ from pydantic import BaseModel, ConfigDict
6
+
7
+ from structured_eval.models.config import EvalConfig # noqa: TC001
8
+
9
+
10
class EvalContext(BaseModel):
    """The single owner of a sample's data.

    Every ``EvalNode`` in the tree holds a reference to one ``EvalContext``;
    nothing is copied. ``flat_actual`` / ``flat_expected`` are the documents
    pre-flattened to dot-notation paths, computed once up front.

    No field declares a default value.
    """

    # The documents and the config may hold arbitrary (non-pydantic) objects.
    model_config = ConfigDict(arbitrary_types_allowed=True)

    actual: Any  # the document being evaluated
    expected: Any  # the reference document; EvalNode.expected returns None when this is None
    source: str | None  # NOTE(review): meaning not visible here — presumably the source text; confirm
    flat_actual: dict[str, Any]  # ``actual`` flattened to dot-notation paths
    flat_expected: dict[str, Any]  # ``expected`` flattened to dot-notation paths
    config: EvalConfig  # the evaluation configuration for this sample
@@ -0,0 +1,121 @@
1
+ """A metric's value: a float that also carries structured detail.
2
+
3
+ ``MetricResult`` is the single shape every metric value takes once it has passed
4
+ through ``MetricRunner._apply`` — whatever a metric's ``compute`` returns (a bare
5
+ ``float``, a ``dict`` of sub-scores, a ``(value, extra)`` tuple, or a
6
+ ``MetricResult``) is normalized to it. It *is* a ``float`` (so every existing
7
+ numeric use keeps working) and additionally exposes ``.extra`` — arbitrary
8
+ structured detail a metric wants to surface beyond the number (offending paths,
9
+ per-rule outcomes, an LLM judge's reasoning, …).
10
+
11
+ ``MetricCollection`` is the cross-field view: ``report.metrics[name]`` gathers a
12
+ named metric's value at every node that produced it, keyed by path, with numeric
13
+ reductions and the union of their ``extra`` payloads.
14
+ """
15
+
16
+ from __future__ import annotations
17
+
18
+ from statistics import mean
19
+ from typing import Any
20
+
21
+ from pydantic import BaseModel, ConfigDict, Field, GetCoreSchemaHandler
22
+ from pydantic_core import core_schema
23
+
24
+
25
class MetricResult(float):
    """A metric value: a ``float`` everywhere, plus structured ``.extra``.

    The numeric part behaves exactly like a ``float``; ``extra`` is a plain
    dict of additional detail attached to the instance.
    """

    extra: dict[str, Any]

    def __new__(cls, value: float, extra: dict[str, Any] | None = None) -> MetricResult:
        """Create the float and attach a shallow copy of ``extra`` (``{}`` if falsy)."""
        obj = super().__new__(cls, value)
        # Copy so later mutation of the caller's dict does not leak in.
        obj.extra = dict(extra) if extra else {}
        return obj

    def __repr__(self) -> str:
        """Show the number, and ``extra`` only when it is non-empty."""
        num = float.__repr__(self)
        return (
            f"MetricResult({num}, extra={self.extra!r})"
            if self.extra
            else f"MetricResult({num})"
        )

    # ── pydantic (round-trips extra: serialized as a bare float when empty,
    # else as ``{"value": ..., "extra": ...}``; both forms re-validate) ──
    @classmethod
    def _validate(cls, value: Any) -> MetricResult:
        """Accept a ``MetricResult``, a ``{"value", "extra"}`` dict, or a number.

        Raises:
            ValueError: if the input cannot be turned into a ``MetricResult``.
                Lookup and type failures are normalized to ``ValueError``
                because that is the exception pydantic turns into a
                ``ValidationError``; a raw ``KeyError`` / ``TypeError`` would
                escape validation unwrapped.
        """
        if isinstance(value, cls):
            return value
        try:
            if isinstance(value, dict):
                return cls(value["value"], value.get("extra"))
            return cls(value)
        except KeyError:
            raise ValueError(
                "MetricResult mapping must contain a 'value' key"
            ) from None
        except TypeError as exc:
            raise ValueError(
                f"cannot interpret {value!r} as a MetricResult"
            ) from exc

    @staticmethod
    def _serialize(value: MetricResult) -> Any:
        """Dump as a bare float, or as ``{"value", "extra"}`` when extra is set."""
        return (
            {"value": float(value), "extra": value.extra}
            if value.extra
            else float(value)
        )

    @classmethod
    def __get_pydantic_core_schema__(
        cls, source: Any, handler: GetCoreSchemaHandler
    ) -> core_schema.CoreSchema:
        """Register ``_validate`` / ``_serialize`` as this type's pydantic schema."""
        return core_schema.no_info_plain_validator_function(
            cls._validate,
            serialization=core_schema.plain_serializer_function_ser_schema(
                cls._serialize
            ),
        )
71
+
72
+
73
class MetricCollection(BaseModel):
    """All values of one named metric across the tree (``report.metrics[name]``).

    ``by_path`` maps each node path that produced the metric to its
    ``MetricResult``. ``mean`` / ``min`` / ``max`` reduce over every stored
    value and return ``0.0`` when there are none. ``root()`` is the value
    stored under path ``"$"``, if any. ``extra`` lists the non-empty detail
    payloads.
    """

    model_config = ConfigDict(arbitrary_types_allowed=True)

    name: str
    by_path: dict[str, MetricResult] = Field(default_factory=dict)

    def values(self) -> list[MetricResult]:
        """Every stored result, in ``by_path`` iteration order."""
        return [*self.by_path.values()]

    def mean(self) -> float:
        """Arithmetic mean of the stored values (``0.0`` when empty)."""
        scores = self.values()
        if not scores:
            return 0.0
        # Resolves to the module-level ``statistics.mean``, not this method.
        return mean(scores)

    def min(self) -> float:
        """Smallest stored value (``0.0`` when empty)."""
        scores = self.values()
        if not scores:
            return 0.0
        # Resolves to the builtin, not this method.
        return min(scores)

    def max(self) -> float:
        """Largest stored value (``0.0`` when empty)."""
        scores = self.values()
        if not scores:
            return 0.0
        # Resolves to the builtin, not this method.
        return max(scores)

    def root(self) -> MetricResult | None:
        """The value stored under path ``"$"``, or ``None`` when absent."""
        return self.by_path.get("$")

    def representative(self) -> float:
        """The ``"$"`` value as a plain float if present, otherwise ``mean()``."""
        at_root = self.root()
        if at_root is None:
            return self.mean()
        return float(at_root)

    @property
    def extra(self) -> list[dict[str, Any]]:
        """Non-empty ``extra`` payloads, in ``by_path`` iteration order."""
        payloads = (result.extra for result in self.values())
        return [payload for payload in payloads if payload]

    def extra_values(self, key: str) -> list[Any]:
        """Concatenate the list stored under ``extra[key]`` at every node.

        Nodes without ``key`` contribute nothing.
        """
        return [
            item
            for result in self.values()
            for item in result.extra.get(key, [])
        ]
@@ -0,0 +1,13 @@
1
+ from structured_eval.models.nodes.array_node import ArrayMatchResult, ArrayNode
2
+ from structured_eval.models.nodes.base import EvalNode, navigate
3
+ from structured_eval.models.nodes.object_node import ObjectNode
4
+ from structured_eval.models.nodes.scalar import ScalarNode
5
+
6
+ __all__ = [
7
+ "ArrayMatchResult",
8
+ "ArrayNode",
9
+ "EvalNode",
10
+ "ObjectNode",
11
+ "ScalarNode",
12
+ "navigate",
13
+ ]
@@ -0,0 +1,32 @@
1
+ from __future__ import annotations
2
+
3
+ from pydantic import BaseModel, ConfigDict, Field
4
+
5
+ from structured_eval.models.config import ArrayStrategy # noqa: TC001
6
+ from structured_eval.models.nodes.base import EvalNode
7
+
8
+
9
class ArrayMatchResult(BaseModel):
    """Alignment of an actual array against an expected array.

    A structural breakdown only: ``matched`` are ``(expected_idx, actual_idx)``
    pairs, ``missed`` are expected indices with no actual counterpart (FN),
    ``spurious`` are actual indices absent from expected (FP). For precision /
    recall / F1 use the **value-aware** array metrics (``ArrayPrecision`` /
    ``ArrayRecall`` / ``ArrayF1``), which grade each matched element rather than
    just counting it.
    """

    model_config = ConfigDict(arbitrary_types_allowed=True)

    strategy: ArrayStrategy  # required; the only field without a default
    matched: list[tuple[int, int]] = Field(default_factory=list)  # (expected_idx, actual_idx)
    missed: list[int] = Field(default_factory=list)  # expected indices (FN)
    spurious: list[int] = Field(default_factory=list)  # actual indices (FP)
26
+
27
+
28
class ArrayNode(EvalNode):
    """A list node. ``items`` are the per-element nodes after matching.

    ``EvalNode.children_nodes`` discovers the elements through the ``items``
    attribute.
    """

    match_result: ArrayMatchResult | None = None  # the alignment breakdown; None by default
    items: list[EvalNode] = Field(default_factory=list)  # per-element child nodes
@@ -0,0 +1,113 @@
1
+ from __future__ import annotations
2
+
3
+ from typing import TYPE_CHECKING, Any
4
+
5
+ from pydantic import BaseModel, ConfigDict, Field
6
+
7
+ from structured_eval.models.context import EvalContext # noqa: TC001
8
+ from structured_eval.models.metric_result import MetricResult # noqa: TC001
9
+ from structured_eval.utils.paths import MISSING, navigate
10
+
11
+ if TYPE_CHECKING:
12
+ from collections.abc import Iterator
13
+
14
+
15
+ # Re-exported for back-compat: ``navigate`` / ``MISSING`` now live in
16
+ # ``structured_eval.utils.paths`` (a lower layer with no model dependency).
17
+ __all__ = ["MISSING", "EvalNode", "navigate"]
18
+
19
+
20
class EvalNode(BaseModel):
    """One node of the evaluation tree.

    A node stores only its ``path`` and a shared ``EvalContext`` reference;
    ``actual`` / ``expected`` are looked up on demand by navigating the
    context's documents, so no data is copied. ``expected_path`` falls back to
    ``path`` and differs only for array items aligned out of order
    (``expected[1]`` ↔ ``actual[0]``), letting each side navigate its own
    index. ``metric_results`` collects the value of every requested metric at
    this node (filled by the engine in phase 2).

    ``key_metric`` is the node's *representative* metric: the one score that
    bubbles up into a parent's aggregation (and, at the root, becomes
    ``report.score``). It is computed last, since it may depend on the node's
    other metrics, and defaults to ``MeanScore`` (the arithmetic mean of the
    node's own metrics).
    """

    model_config = ConfigDict(arbitrary_types_allowed=True)

    path: str
    context: EvalContext
    expected_path: str | None = None
    # Relative importance for weighted aggregation (OverallLeafScore, object metrics).
    weight: float = 1.0
    # list[BaseMetric] resolved for this node.
    metrics: list[Any] = Field(default_factory=list)
    # BaseMetric: this node's representative score (parents read it).
    key_metric: Any = None
    # Bar the representative score must clear to count as a TP.
    threshold: float = 1.0
    metric_results: dict[str, MetricResult] = Field(default_factory=dict)

    @property
    def actual(self) -> Any:
        """Value at ``path`` in the actual document, or ``None`` when absent."""
        found = navigate(self.context.actual, self.path)
        if found is MISSING:
            return None
        return found

    @property
    def expected(self) -> Any:
        """Value at ``expected_path`` (or ``path``) in the expected document.

        ``None`` when the context has no expected document or the path is absent.
        """
        if self.context.expected is None:
            return None
        lookup_path = self.expected_path or self.path
        found = navigate(self.context.expected, lookup_path)
        if found is MISSING:
            return None
        return found

    @property
    def representative(self) -> float:
        """The node's single representative score: the value of its ``key_metric``.

        The engine gives every node a ``key_metric`` (``MeanScore`` by default)
        and at least one metric for it to summarise, so the value is expected
        to exist whenever this is read. Parents aggregate their children's
        already-computed representatives (post-order); the root's value is
        ``report.score``. A missing value is a programming error, hence the
        exception instead of a fallback.

        Raises:
            ValueError: if ``key_metric`` is unset or has no computed value.
        """
        km = self.key_metric
        if km is None:
            raise ValueError(f"node {self.path!r} has no key_metric")
        value = self.metric_results.get(km.name)
        if value is not None:
            return float(value)
        raise ValueError(
            f"node {self.path!r}: key_metric {km.name!r} has no computed value"
        )

    # ── traversal ──────────────────────────────────────────────────────────
    # Children are found by duck-typing (``children`` on objects, ``items`` on
    # arrays) so this base class never has to import its own subclasses.

    def children_nodes(self) -> Iterator[EvalNode]:
        """Yield the direct child nodes (nothing for a scalar leaf)."""
        by_key = getattr(self, "children", None)
        if isinstance(by_key, dict):
            for child in by_key.values():
                yield child
        elements = getattr(self, "items", None)
        if isinstance(elements, list):
            for element in elements:
                yield element

    def is_leaf(self) -> bool:
        """True for a scalar node (neither ``children`` nor ``items`` is set)."""
        return all(
            getattr(self, attr, None) is None for attr in ("children", "items")
        )

    def walk(self) -> Iterator[EvalNode]:
        """Depth-first traversal: this node first, then every descendant."""
        yield self
        for child in self.children_nodes():
            for descendant in child.walk():
                yield descendant

    def leaves(self) -> Iterator[EvalNode]:
        """Yield every scalar (leaf) node at or below this node."""
        yield from (node for node in self.walk() if node.is_leaf())
@@ -0,0 +1,19 @@
1
+ from __future__ import annotations
2
+
3
+ from pydantic import Field
4
+
5
+ from structured_eval.models.nodes.base import EvalNode
6
+
7
+
8
class ObjectNode(EvalNode):
    """A dict node.

    ``matched`` holds child nodes present in both actual and expected.
    ``missing`` / ``spurious`` hold keys present on only one side (FN / FP).
    ``children`` maps every child key to its node for tree traversal.
    """

    matched: list[EvalNode] = Field(default_factory=list)  # child nodes found on both sides
    missing: list[str] = Field(default_factory=list)  # keys on one side only (FN)
    spurious: list[str] = Field(default_factory=list)  # keys on one side only (FP)
    # Read by EvalNode.children_nodes for traversal.
    children: dict[str, EvalNode] = Field(default_factory=dict)
@@ -0,0 +1,14 @@
1
+ from __future__ import annotations
2
+
3
+ from structured_eval.models.nodes.base import EvalNode
4
+
5
+
6
class ScalarNode(EvalNode):
    """A leaf node: a single comparable value.

    In v3 there is no pre-computed ``similarity`` — comparison *is* a metric.
    The match criterion is the node's ``key_metric`` (defined on ``EvalNode``):
    its representative score, defaulting to ``MeanScore`` over the node's field
    metrics (a lone ``ExactMatch`` when none are configured). ``threshold`` is
    the bar that score must clear to count as a true positive.

    The class declares no fields of its own; everything is inherited from
    ``EvalNode``.
    """