structured-eval 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- structured_eval/__init__.py +27 -0
- structured_eval/alignment/__init__.py +15 -0
- structured_eval/alignment/base.py +40 -0
- structured_eval/alignment/by_index.py +24 -0
- structured_eval/alignment/by_key.py +73 -0
- structured_eval/alignment/factory.py +28 -0
- structured_eval/alignment/hungarian.py +156 -0
- structured_eval/api.py +79 -0
- structured_eval/engine/__init__.py +15 -0
- structured_eval/engine/aggregator.py +96 -0
- structured_eval/engine/evaluator.py +72 -0
- structured_eval/engine/metric_runner.py +69 -0
- structured_eval/engine/parser.py +42 -0
- structured_eval/engine/report_builder.py +68 -0
- structured_eval/engine/tree_builder.py +319 -0
- structured_eval/formats/__init__.py +5 -0
- structured_eval/formats/base.py +19 -0
- structured_eval/formats/json_parser.py +44 -0
- structured_eval/formats/yaml_parser.py +24 -0
- structured_eval/integrations/__init__.py +11 -0
- structured_eval/integrations/_adapter.py +47 -0
- structured_eval/integrations/deepeval.py +74 -0
- structured_eval/integrations/langsmith.py +90 -0
- structured_eval/metrics/__init__.py +101 -0
- structured_eval/metrics/array_accuracy.py +28 -0
- structured_eval/metrics/array_cardinality.py +27 -0
- structured_eval/metrics/array_exact_match.py +48 -0
- structured_eval/metrics/array_f1.py +34 -0
- structured_eval/metrics/array_jaccard_similarity.py +60 -0
- structured_eval/metrics/array_precision.py +34 -0
- structured_eval/metrics/array_prf1.py +40 -0
- structured_eval/metrics/array_recall.py +33 -0
- structured_eval/metrics/base.py +144 -0
- structured_eval/metrics/character_f1.py +50 -0
- structured_eval/metrics/composite_score.py +46 -0
- structured_eval/metrics/coverage_leaf_score.py +29 -0
- structured_eval/metrics/date_distance_score.py +63 -0
- structured_eval/metrics/exact.py +21 -0
- structured_eval/metrics/exponential_numeric_score.py +47 -0
- structured_eval/metrics/field_faithfulness.py +38 -0
- structured_eval/metrics/fuzzy.py +64 -0
- structured_eval/metrics/invoker.py +90 -0
- structured_eval/metrics/levenshtein.py +16 -0
- structured_eval/metrics/mean_score.py +31 -0
- structured_eval/metrics/numeric.py +83 -0
- structured_eval/metrics/numeric_closeness.py +35 -0
- structured_eval/metrics/object_accuracy.py +47 -0
- structured_eval/metrics/object_exact_match.py +41 -0
- structured_eval/metrics/object_f1.py +47 -0
- structured_eval/metrics/object_precision.py +49 -0
- structured_eval/metrics/object_prf1.py +51 -0
- structured_eval/metrics/object_recall.py +44 -0
- structured_eval/metrics/object_type_validity.py +34 -0
- structured_eval/metrics/overall_leaf_score.py +32 -0
- structured_eval/metrics/presence.py +22 -0
- structured_eval/metrics/regex_match.py +51 -0
- structured_eval/metrics/rule_pass_rate/__init__.py +5 -0
- structured_eval/metrics/rule_pass_rate/dsl.py +224 -0
- structured_eval/metrics/rule_pass_rate/engine.py +24 -0
- structured_eval/metrics/rule_pass_rate/metric.py +33 -0
- structured_eval/metrics/schema_validity/__init__.py +7 -0
- structured_eval/metrics/schema_validity/metric.py +35 -0
- structured_eval/metrics/schema_validity/validator.py +119 -0
- structured_eval/metrics/structural_similarity.py +40 -0
- structured_eval/metrics/token_f1.py +44 -0
- structured_eval/metrics/type_match.py +35 -0
- structured_eval/metrics/utils/__init__.py +10 -0
- structured_eval/metrics/utils/array.py +31 -0
- structured_eval/metrics/utils/calculate.py +72 -0
- structured_eval/metrics/utils/number.py +46 -0
- structured_eval/metrics/utils/object_utils.py +87 -0
- structured_eval/models/__init__.py +72 -0
- structured_eval/models/config.py +124 -0
- structured_eval/models/context.py +25 -0
- structured_eval/models/metric_result.py +121 -0
- structured_eval/models/nodes/__init__.py +13 -0
- structured_eval/models/nodes/array_node.py +32 -0
- structured_eval/models/nodes/base.py +113 -0
- structured_eval/models/nodes/object_node.py +19 -0
- structured_eval/models/nodes/scalar.py +14 -0
- structured_eval/models/result.py +361 -0
- structured_eval/models/sample.py +19 -0
- structured_eval/py.typed +0 -0
- structured_eval/reporting/__init__.py +5 -0
- structured_eval/reporting/console.py +194 -0
- structured_eval/utils/__init__.py +16 -0
- structured_eval/utils/flatten.py +66 -0
- structured_eval/utils/paths.py +58 -0
- structured_eval/utils/structured_diff.py +159 -0
- structured_eval-0.1.0.dist-info/METADATA +322 -0
- structured_eval-0.1.0.dist-info/RECORD +94 -0
- structured_eval-0.1.0.dist-info/WHEEL +5 -0
- structured_eval-0.1.0.dist-info/licenses/LICENSE +201 -0
- structured_eval-0.1.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,46 @@
|
|
|
1
|
+
"""Lenient numeric parsing shared by the numeric field metrics.
|
|
2
|
+
|
|
3
|
+
One parsing behavior for ``Numeric`` and ``NumericCloseness`` so a value is read
|
|
4
|
+
the same way by both. Accepts int/float (rejecting ``bool``) and parses numeric
|
|
5
|
+
strings: currency symbols, thousands separators and whitespace are stripped,
|
|
6
|
+
accounting notation ``"(123)"`` is read as ``-123``, and scientific notation
|
|
7
|
+
``"1e3"`` is supported. A ``"%"`` is only stripped, never interpreted
|
|
8
|
+
(``"50%"`` → ``50``). US format is assumed (``,`` = thousands, ``.`` = decimal);
|
|
9
|
+
anything that does not parse cleanly returns ``None``.
|
|
10
|
+
"""
|
|
11
|
+
|
|
12
|
+
from __future__ import annotations
|
|
13
|
+
|
|
14
|
+
import re
|
|
15
|
+
from typing import Any
|
|
16
|
+
|
|
17
|
+
# Characters that can never be part of a (possibly scientific) number. Kept:
# digits, the decimal point, signs, and the exponent marker e/E, so float() can
# parse scientific notation ("1e3" → 1000.0, "1.5e-3" → 0.0015).
_NON_NUMERIC = re.compile(r"[^0-9eE.+\-]")


def parse_number(value: Any) -> float | None:
    """Coerce ``value`` to a float, or ``None`` if it isn't cleanly numeric.

    Args:
        value: An ``int``/``float`` (``bool`` is rejected) or a numeric string.
            Strings are stripped of every character matched by
            ``_NON_NUMERIC`` (currency symbols, thousands separators, ``%``,
            whitespace, …) before being handed to ``float()``. A string fully
            wrapped in parentheses is read as accounting notation and negated.

    Returns:
        The parsed float, or ``None`` when ``value`` is a ``bool``, is neither
        a number nor a string, is an integer too large to fit in a float, or
        leaves nothing that ``float()`` accepts after stripping.
    """
    if isinstance(value, bool):
        # bool is a subclass of int; True/False must not be scored as 1/0.
        return None
    if isinstance(value, (int, float)):
        try:
            return float(value)
        except OverflowError:
            # An int beyond the float range (e.g. 10**400) cannot be
            # represented; report "not cleanly numeric" instead of crashing.
            return None
    if not isinstance(value, str):
        return None

    text = value.strip()
    negative = False
    # Accounting notation: "(123)" means -123. The sign of the inner text is
    # kept as-is and then flipped, so "(-5)" comes out as 5.0.
    if text.startswith("(") and text.endswith(")"):
        text = text[1:-1]
        negative = True

    text = _NON_NUMERIC.sub("", text)
    # Nothing but a bare sign / decimal point survived: no digits to parse.
    if text in ("", "-", ".", "-."):
        return None
    try:
        number = float(text)
    except ValueError:
        return None
    return -number if negative else number
|
|
@@ -0,0 +1,87 @@
|
|
|
1
|
+
"""Verdicts for object metrics: matched fields → ``(score, threshold, weight)``.
|
|
2
|
+
|
|
3
|
+
A parent object does not re-compare its children; it reads each matched child's
|
|
4
|
+
already-computed representative score (``node.representative``) and pairs it with
|
|
5
|
+
the bar it must clear and the weight it carries. Those triples feed
|
|
6
|
+
``calculate.prf_counts``.
|
|
7
|
+
|
|
8
|
+
``score_policy`` (on ``ObjectF1`` / ``ObjectAccuracy`` / …) overrides the
|
|
9
|
+
criterion for a named field — a metric instance or its registered name, run on
|
|
10
|
+
that child via ``MetricInvoker`` (so it works for any child kind, not only
|
|
11
|
+
scalars). ``thresholds`` may be a per-field dict or a single float.
|
|
12
|
+
|
|
13
|
+
``weight_mode`` (see ``calculate.WeightMode``) decides each child's weight:
|
|
14
|
+
``NONE`` → ``1.0`` (plain counts), ``PROPORTIONAL`` → the child's configured
|
|
15
|
+
``weight``. Missing (FN) and spurious (FP) children are weighted the same way
|
|
16
|
+
via ``missing_weight`` / ``spurious_weight``.
|
|
17
|
+
"""
|
|
18
|
+
|
|
19
|
+
from __future__ import annotations
|
|
20
|
+
|
|
21
|
+
from typing import TYPE_CHECKING, Any
|
|
22
|
+
|
|
23
|
+
from structured_eval.metrics.base import resolve_metric
|
|
24
|
+
from structured_eval.metrics.invoker import MetricInvoker
|
|
25
|
+
from structured_eval.metrics.utils.calculate import WeightMode
|
|
26
|
+
|
|
27
|
+
if TYPE_CHECKING:
|
|
28
|
+
from structured_eval.models.nodes.base import EvalNode
|
|
29
|
+
from structured_eval.models.nodes.object_node import ObjectNode
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
def leaf_name(path: str) -> str:
    """Return the final dotted segment of ``path`` with any index suffix removed.

    Everything up to and including the last ``"."`` is dropped, then everything
    from the first ``"["`` of what remains, e.g. ``"a.b[0]"`` → ``"b"``.
    """
    _, _, tail = path.rpartition(".")
    name, _, _ = tail.partition("[")
    return name
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
def _resolve_threshold(
|
|
38
|
+
thresholds: float | dict[str, float] | None, name: str, fallback: float
|
|
39
|
+
) -> float:
|
|
40
|
+
if isinstance(thresholds, dict):
|
|
41
|
+
return float(thresholds.get(name, fallback))
|
|
42
|
+
if thresholds is not None:
|
|
43
|
+
return float(thresholds)
|
|
44
|
+
return fallback
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
def _weight_of(child: EvalNode, weight_mode: WeightMode) -> float:
    """Weight ``child`` carries: its own ``weight`` when proportional, else ``1.0``."""
    if weight_mode == WeightMode.PROPORTIONAL:
        return child.weight
    return 1.0
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
def matched_verdicts(
    node: ObjectNode,
    score_policy: dict[str, Any] | None = None,
    thresholds: float | dict[str, float] | None = None,
    weight_mode: WeightMode = WeightMode.PROPORTIONAL,
) -> list[tuple[float, float, float]]:
    """Build one ``(score, threshold, weight)`` triple per matched child.

    By default a child's score is its ``representative`` value, whatever kind
    of node it is. When ``score_policy`` has an entry for the child's leaf
    name, that entry is resolved to a metric and the child is re-scored with it
    through ``MetricInvoker`` instead. The threshold comes from ``thresholds``
    (per-field dict or single value), falling back to the child's own
    ``threshold``; the weight follows ``weight_mode``.
    """
    overrides = score_policy or {}
    verdicts: list[tuple[float, float, float]] = []
    for child in node.matched:
        field = leaf_name(child.path)
        override = overrides.get(field)
        if override is None:
            score = child.representative
        else:
            score = MetricInvoker(resolve_metric(override)).scalar_on_node(child)
        bar = _resolve_threshold(thresholds, field, child.threshold)
        verdicts.append((score, bar, _weight_of(child, weight_mode)))
    return verdicts
|
|
74
|
+
|
|
75
|
+
|
|
76
|
+
def missing_weight(
    node: ObjectNode, weight_mode: WeightMode = WeightMode.PROPORTIONAL
) -> float:
    """Total weight of the children listed in ``node.missing`` (the FN side).

    Each missing key is looked up in ``node.children``; with a non-proportional
    ``weight_mode`` every child weighs ``1.0``, so the total is a plain count.
    """
    absent = (node.children[key] for key in node.missing)
    return sum(_weight_of(child, weight_mode) for child in absent)
|
|
81
|
+
|
|
82
|
+
|
|
83
|
+
def spurious_weight(
    node: ObjectNode, weight_mode: WeightMode = WeightMode.PROPORTIONAL
) -> float:
    """Total weight of the children listed in ``node.spurious`` (the FP side).

    Each spurious key is looked up in ``node.children``; with a
    non-proportional ``weight_mode`` every child weighs ``1.0``, so the total
    is a plain count.
    """
    extras = (node.children[key] for key in node.spurious)
    return sum(_weight_of(child, weight_mode) for child in extras)
|
|
@@ -0,0 +1,72 @@
|
|
|
1
|
+
"""structured_eval.models — the pydantic data layer.
|
|
2
|
+
|
|
3
|
+
Single home for every user-facing data model, re-exported here so callers use
|
|
4
|
+
one path — ``from structured_eval.models import <X>`` — rather than reaching
|
|
5
|
+
into individual submodules:
|
|
6
|
+
|
|
7
|
+
- configuration: ``EvalConfig`` + the ``*FieldConfig`` family and policies;
|
|
8
|
+
- input: ``Sample``, ``EvalContext``;
|
|
9
|
+
- the ``EvalNode`` tree: ``EvalNode`` / ``ScalarNode`` / ``ObjectNode`` /
|
|
10
|
+
``ArrayNode`` (+ ``ArrayMatchResult``);
|
|
11
|
+
- metric values: ``MetricResult`` / ``MetricCollection``;
|
|
12
|
+
- reports & scores: ``EvalReport`` / ``BatchEvalReport`` / ``ConsistencyReport``,
|
|
13
|
+
``FieldScore`` / ``RuleResult`` / ``RegressionDiff`` / ``EvalWarning`` /
|
|
14
|
+
``WarningType`` / ``NodeType``.
|
|
15
|
+
"""
|
|
16
|
+
|
|
17
|
+
from structured_eval.models.config import (
|
|
18
|
+
ArrayFieldConfig,
|
|
19
|
+
ArrayStrategy,
|
|
20
|
+
EvalConfig,
|
|
21
|
+
ExtraKeysPolicy,
|
|
22
|
+
FieldConfig,
|
|
23
|
+
ObjectFieldConfig,
|
|
24
|
+
)
|
|
25
|
+
from structured_eval.models.context import EvalContext
|
|
26
|
+
from structured_eval.models.metric_result import MetricCollection, MetricResult
|
|
27
|
+
from structured_eval.models.nodes import (
|
|
28
|
+
ArrayMatchResult,
|
|
29
|
+
ArrayNode,
|
|
30
|
+
EvalNode,
|
|
31
|
+
ObjectNode,
|
|
32
|
+
ScalarNode,
|
|
33
|
+
)
|
|
34
|
+
from structured_eval.models.result import (
|
|
35
|
+
BatchEvalReport,
|
|
36
|
+
ConsistencyReport,
|
|
37
|
+
EvalReport,
|
|
38
|
+
EvalWarning,
|
|
39
|
+
FieldScore,
|
|
40
|
+
NodeType,
|
|
41
|
+
RegressionDiff,
|
|
42
|
+
RuleResult,
|
|
43
|
+
WarningType,
|
|
44
|
+
)
|
|
45
|
+
from structured_eval.models.sample import Sample
|
|
46
|
+
|
|
47
|
+
__all__ = [
|
|
48
|
+
"ArrayFieldConfig",
|
|
49
|
+
"ArrayMatchResult",
|
|
50
|
+
"ArrayNode",
|
|
51
|
+
"ArrayStrategy",
|
|
52
|
+
"BatchEvalReport",
|
|
53
|
+
"ConsistencyReport",
|
|
54
|
+
"EvalConfig",
|
|
55
|
+
"EvalContext",
|
|
56
|
+
"EvalNode",
|
|
57
|
+
"EvalReport",
|
|
58
|
+
"EvalWarning",
|
|
59
|
+
"ExtraKeysPolicy",
|
|
60
|
+
"FieldConfig",
|
|
61
|
+
"FieldScore",
|
|
62
|
+
"MetricCollection",
|
|
63
|
+
"MetricResult",
|
|
64
|
+
"NodeType",
|
|
65
|
+
"ObjectFieldConfig",
|
|
66
|
+
"ObjectNode",
|
|
67
|
+
"RegressionDiff",
|
|
68
|
+
"RuleResult",
|
|
69
|
+
"Sample",
|
|
70
|
+
"ScalarNode",
|
|
71
|
+
"WarningType",
|
|
72
|
+
]
|
|
@@ -0,0 +1,124 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from enum import StrEnum
|
|
4
|
+
from typing import Any
|
|
5
|
+
|
|
6
|
+
from pydantic import BaseModel, ConfigDict, Field
|
|
7
|
+
|
|
8
|
+
# ── Defaults ──────────────────────────────────────────────────────────────────
|
|
9
|
+
|
|
10
|
+
DEFAULT_FIELD_WEIGHT: float = 1.0
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
# ── Enums ─────────────────────────────────────────────────────────────────────
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
class ExtraKeysPolicy(StrEnum):
|
|
17
|
+
"""How to treat keys present in actual but absent from expected."""
|
|
18
|
+
|
|
19
|
+
IGNORE = "ignore" # extra keys are skipped
|
|
20
|
+
PENALIZE = "penalize" # extra keys lower precision
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
class ArrayStrategy(StrEnum):
|
|
24
|
+
"""How to align actual array items with expected ones."""
|
|
25
|
+
|
|
26
|
+
BY_INDEX = "by_index" # pair the i-th with the i-th
|
|
27
|
+
BY_KEY = "by_key" # match on a shared unique field (see ArrayFieldConfig.params)
|
|
28
|
+
HUNGARIAN = "hungarian" # optimal one-to-one assignment by element similarity
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
# ── Field configs ───────────────────────────────────────────────────────────
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
class FieldConfig(BaseModel):
    """Configuration for a scalar (leaf) field.

    In v3 a comparison is expressed as a metric. ``metrics`` lists the field's
    own metrics, which are *added* to those cascading from
    ``EvalConfig.metrics``. ``key_metric`` selects which metric serves as the
    match criterion for the parent object/array — either a metric instance or
    its registered name, with ``None`` meaning ``ExactMatch``. ``threshold`` is
    the bar that criterion must clear to count as a true positive.
    """

    model_config = ConfigDict(arbitrary_types_allowed=True)

    # list[Metric]; added to the cascading config.metrics
    metrics: list[Any] | None = None
    # Metric | name str used as the parent's match criterion
    key_metric: Any = None
    threshold: float | None = None
    weight: float = DEFAULT_FIELD_WEIGHT
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
class ObjectFieldConfig(BaseModel):
    """Configuration for an object (dict) field."""

    model_config = ConfigDict(arbitrary_types_allowed=True)

    # Per-key configs of the object's children; each may be a scalar, object
    # or array config (``AnyFieldConfig`` is defined further down the module).
    fields: dict[str, AnyFieldConfig] = Field(default_factory=dict)
    # Aggregation weight; read back by ``weight_of``.
    weight: float = DEFAULT_FIELD_WEIGHT
    threshold: float | None = None
    # presumably list[Metric], as on FieldConfig — confirm against the engine
    metrics: list[Any] | None = None
|
|
63
|
+
|
|
64
|
+
|
|
65
|
+
class ArrayFieldConfig(BaseModel):
    """Configuration for an array (list) field.

    ``item`` describes the type and config of each element. ``strategy`` picks
    the aligner; ``params`` carries that strategy's options (interpreted by the
    aligner built in ``make_aligner``), so new strategies add no new fields here:

    * ``BY_INDEX`` → ``params`` empty.
    * ``BY_KEY`` → ``{"key": <field|None>, "key_metric": <metric|name>,
      "threshold": <float>}``. The generalized ``BY_KEY`` subsumes value- and
      similarity-based matching (technical_details_v3 §5).
    * ``HUNGARIAN`` → ``{"scorer": <Scorer | dict[str, Scorer] | None>,
      "threshold": <float>, "key": <field|None>}``. Optimal one-to-one
      assignment; ``scorer`` as a per-field dict scores arrays of objects.
    """

    model_config = ConfigDict(arbitrary_types_allowed=True)

    # Element config; only scalar or object element configs are accepted here
    # (no nested ArrayFieldConfig in the annotation).
    item: FieldConfig | ObjectFieldConfig | None = None
    # Defaults to positional pairing.
    strategy: ArrayStrategy = ArrayStrategy.BY_INDEX
    params: dict[str, Any] = Field(default_factory=dict)  # strategy-specific options
    weight: float = DEFAULT_FIELD_WEIGHT
    threshold: float | None = None
    metrics: list[Any] | None = None
|
|
89
|
+
|
|
90
|
+
|
|
91
|
+
AnyFieldConfig = FieldConfig | ObjectFieldConfig | ArrayFieldConfig
|
|
92
|
+
|
|
93
|
+
|
|
94
|
+
def weight_of(cfg: AnyFieldConfig | None) -> float:
    """Return the aggregation weight of ``cfg``, or the default weight when it is ``None``."""
    if cfg is None:
        return DEFAULT_FIELD_WEIGHT
    return cfg.weight
|
|
97
|
+
|
|
98
|
+
|
|
99
|
+
# ── Eval config ───────────────────────────────────────────────────────────────
|
|
100
|
+
|
|
101
|
+
|
|
102
|
+
class EvalConfig(BaseModel):
    """Top-level evaluation configuration.

    Metrics are given as class instances (e.g. ``ObjectF1()``,
    ``SchemaValidity(...)``). ``fields`` takes canonical nested configs and,
    as syntactic sugar, dot-notation keys such as ``"vendor.name"``. ``root``
    explicitly declares the type of the root node; when it is omitted the type
    is inferred from ``type(actual)``.
    """

    model_config = ConfigDict(arbitrary_types_allowed=True)

    # list[Metric]; cascade by type to all nodes
    metrics: list[Any] = Field(default_factory=list)
    fields: dict[str, AnyFieldConfig] = Field(default_factory=dict)
    root: ObjectFieldConfig | ArrayFieldConfig | None = None
    # Metric whose value becomes report.score
    key_metric: Any = None
    extra_keys: ExtraKeysPolicy = ExtraKeysPolicy.IGNORE
|
|
120
|
+
|
|
121
|
+
|
|
122
|
+
# ``ObjectFieldConfig.fields`` is annotated with ``AnyFieldConfig`` before that
# alias exists (annotations are lazy under ``from __future__ import
# annotations``), so the models are rebuilt once every name is defined.
# NOTE(review): the rebuilds of ArrayFieldConfig / EvalConfig presumably pick
# up the now-complete nested ObjectFieldConfig schema — confirm.
ObjectFieldConfig.model_rebuild()
ArrayFieldConfig.model_rebuild()
EvalConfig.model_rebuild()
|
|
@@ -0,0 +1,25 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from typing import Any
|
|
4
|
+
|
|
5
|
+
from pydantic import BaseModel, ConfigDict
|
|
6
|
+
|
|
7
|
+
from structured_eval.models.config import EvalConfig # noqa: TC001
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
class EvalContext(BaseModel):
    """The single owner of a sample's data.

    Every ``EvalNode`` in the tree holds a reference to one ``EvalContext``;
    nothing is copied. ``flat_actual`` / ``flat_expected`` are the documents
    pre-flattened to dot-notation paths, computed once up front.
    """

    model_config = ConfigDict(arbitrary_types_allowed=True)

    # The two documents being compared; nodes navigate into these by path.
    actual: Any
    expected: Any
    # No default is declared, so the field must be passed explicitly (``None``
    # is accepted). NOTE(review): presumably the raw source text — confirm.
    source: str | None
    # Dot-notation path → value views of ``actual`` / ``expected``.
    flat_actual: dict[str, Any]
    flat_expected: dict[str, Any]
    config: EvalConfig
|
|
@@ -0,0 +1,121 @@
|
|
|
1
|
+
"""A metric's value: a float that also carries structured detail.
|
|
2
|
+
|
|
3
|
+
``MetricResult`` is the single shape every metric value takes once it has passed
|
|
4
|
+
through ``MetricRunner._apply`` — whatever a metric's ``compute`` returns (a bare
|
|
5
|
+
``float``, a ``dict`` of sub-scores, a ``(value, extra)`` tuple, or a
|
|
6
|
+
``MetricResult``) is normalized to it. It *is* a ``float`` (so every existing
|
|
7
|
+
numeric use keeps working) and additionally exposes ``.extra`` — arbitrary
|
|
8
|
+
structured detail a metric wants to surface beyond the number (offending paths,
|
|
9
|
+
per-rule outcomes, an LLM judge's reasoning, …).
|
|
10
|
+
|
|
11
|
+
``MetricCollection`` is the cross-field view: ``report.metrics[name]`` gathers a
|
|
12
|
+
named metric's value at every node that produced it, keyed by path, with numeric
|
|
13
|
+
reductions and the union of their ``extra`` payloads.
|
|
14
|
+
"""
|
|
15
|
+
|
|
16
|
+
from __future__ import annotations
|
|
17
|
+
|
|
18
|
+
from statistics import mean
|
|
19
|
+
from typing import Any
|
|
20
|
+
|
|
21
|
+
from pydantic import BaseModel, ConfigDict, Field, GetCoreSchemaHandler
|
|
22
|
+
from pydantic_core import core_schema
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
class MetricResult(float):
    """A metric value that behaves as a ``float`` and also carries ``.extra``.

    ``extra`` is a dict of structured detail attached to the number; it is
    always a dict (empty when no detail was supplied).
    """

    extra: dict[str, Any]

    def __new__(cls, value: float, extra: dict[str, Any] | None = None) -> MetricResult:
        """Create the float and attach a shallow copy of ``extra`` (or ``{}``)."""
        instance = super().__new__(cls, value)
        instance.extra = {} if not extra else dict(extra)
        return instance

    def __repr__(self) -> str:
        """Show the number, plus ``extra`` only when it is non-empty."""
        number = float.__repr__(self)
        if not self.extra:
            return f"MetricResult({number})"
        return f"MetricResult({number}, extra={self.extra!r})"

    # ── pydantic integration ──
    # Serialized as a bare float when ``extra`` is empty, otherwise as
    # ``{"value": ..., "extra": ...}``; the validator accepts both forms.
    @classmethod
    def _validate(cls, value: Any) -> MetricResult:
        """Accept an existing instance, a ``{"value", "extra"}`` dict, or a number."""
        if isinstance(value, cls):
            return value
        if not isinstance(value, dict):
            return cls(value)
        return cls(value["value"], value.get("extra"))

    @staticmethod
    def _serialize(value: MetricResult) -> Any:
        """Return a plain float, or a ``value``/``extra`` dict when detail exists."""
        if value.extra:
            return {"value": float(value), "extra": value.extra}
        return float(value)

    @classmethod
    def __get_pydantic_core_schema__(
        cls, source: Any, handler: GetCoreSchemaHandler
    ) -> core_schema.CoreSchema:
        """Wire ``_validate`` / ``_serialize`` in as the type's pydantic schema."""
        serializer = core_schema.plain_serializer_function_ser_schema(
            cls._serialize
        )
        return core_schema.no_info_plain_validator_function(
            cls._validate,
            serialization=serializer,
        )
|
|
71
|
+
|
|
72
|
+
|
|
73
|
+
class MetricCollection(BaseModel):
    """One named metric's values across the tree (``report.metrics[name]``).

    ``by_path`` maps each node path that produced the metric to its
    ``MetricResult``. ``mean`` / ``min`` / ``max`` reduce over all of those
    values and return ``0.0`` when there are none. ``root()`` is the value
    stored under path ``"$"``, if any, and ``extra`` lists the non-empty detail
    payloads.
    """

    model_config = ConfigDict(arbitrary_types_allowed=True)

    name: str
    by_path: dict[str, MetricResult] = Field(default_factory=dict)

    def values(self) -> list[MetricResult]:
        """All recorded results, in ``by_path`` order."""
        return [*self.by_path.values()]

    def mean(self) -> float:
        """Arithmetic mean of the values (``0.0`` when empty)."""
        results = self.values()
        if not results:
            return 0.0
        # Inside a method body ``mean`` resolves to ``statistics.mean``.
        return mean(results)

    def min(self) -> float:
        """Smallest value (``0.0`` when empty)."""
        results = self.values()
        if not results:
            return 0.0
        return min(results)

    def max(self) -> float:
        """Largest value (``0.0`` when empty)."""
        results = self.values()
        if not results:
            return 0.0
        return max(results)

    def root(self) -> MetricResult | None:
        """The value stored under path ``"$"``, or ``None`` if there is none."""
        return self.by_path.get("$")

    def representative(self) -> float:
        """The ``"$"`` value when present, otherwise the mean across the tree."""
        at_root = self.root()
        if at_root is None:
            return self.mean()
        return float(at_root)

    @property
    def extra(self) -> list[dict[str, Any]]:
        """Every non-empty ``extra`` payload, in ``by_path`` order."""
        payloads = [result.extra for result in self.values()]
        return [payload for payload in payloads if payload]

    def extra_values(self, key: str) -> list[Any]:
        """Concatenate ``extra[key]`` from every result (absent keys add nothing)."""
        collected: list[Any] = []
        for result in self.values():
            collected += result.extra.get(key, [])
        return collected
|
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
from structured_eval.models.nodes.array_node import ArrayMatchResult, ArrayNode
|
|
2
|
+
from structured_eval.models.nodes.base import EvalNode, navigate
|
|
3
|
+
from structured_eval.models.nodes.object_node import ObjectNode
|
|
4
|
+
from structured_eval.models.nodes.scalar import ScalarNode
|
|
5
|
+
|
|
6
|
+
__all__ = [
|
|
7
|
+
"ArrayMatchResult",
|
|
8
|
+
"ArrayNode",
|
|
9
|
+
"EvalNode",
|
|
10
|
+
"ObjectNode",
|
|
11
|
+
"ScalarNode",
|
|
12
|
+
"navigate",
|
|
13
|
+
]
|
|
@@ -0,0 +1,32 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from pydantic import BaseModel, ConfigDict, Field
|
|
4
|
+
|
|
5
|
+
from structured_eval.models.config import ArrayStrategy # noqa: TC001
|
|
6
|
+
from structured_eval.models.nodes.base import EvalNode
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
class ArrayMatchResult(BaseModel):
    """Alignment of an actual array against an expected array.

    A structural breakdown only: ``matched`` are ``(expected_idx, actual_idx)``
    pairs, ``missed`` are expected indices with no actual counterpart (FN),
    ``spurious`` are actual indices absent from expected (FP). For precision /
    recall / F1 use the **value-aware** array metrics (``ArrayPrecision`` /
    ``ArrayRecall`` / ``ArrayF1``), which grade each matched element rather than
    just counting it.
    """

    model_config = ConfigDict(arbitrary_types_allowed=True)

    # The alignment strategy this result was produced with (required).
    strategy: ArrayStrategy
    # (expected_idx, actual_idx) pairs.
    matched: list[tuple[int, int]] = Field(default_factory=list)
    # Expected indices with no actual counterpart (FN).
    missed: list[int] = Field(default_factory=list)
    # Actual indices absent from expected (FP).
    spurious: list[int] = Field(default_factory=list)
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
class ArrayNode(EvalNode):
    """A list node. ``items`` are the per-element nodes after matching."""

    # Structural alignment of the two arrays; ``None`` until one is attached.
    match_result: ArrayMatchResult | None = None
    # Child nodes; ``EvalNode.children_nodes`` finds them via this attribute.
    items: list[EvalNode] = Field(default_factory=list)
|
|
@@ -0,0 +1,113 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from typing import TYPE_CHECKING, Any
|
|
4
|
+
|
|
5
|
+
from pydantic import BaseModel, ConfigDict, Field
|
|
6
|
+
|
|
7
|
+
from structured_eval.models.context import EvalContext # noqa: TC001
|
|
8
|
+
from structured_eval.models.metric_result import MetricResult # noqa: TC001
|
|
9
|
+
from structured_eval.utils.paths import MISSING, navigate
|
|
10
|
+
|
|
11
|
+
if TYPE_CHECKING:
|
|
12
|
+
from collections.abc import Iterator
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
# Re-exported for back-compat: ``navigate`` / ``MISSING`` now live in
|
|
16
|
+
# ``structured_eval.utils.paths`` (a lower layer with no model dependency).
|
|
17
|
+
__all__ = ["MISSING", "EvalNode", "navigate"]
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
class EvalNode(BaseModel):
    """A node in the evaluation tree.

    A node stores its ``path`` and a shared reference to the ``EvalContext``.
    Data is never copied: ``actual`` / ``expected`` are resolved on access by
    navigating the context's documents. ``expected_path`` falls back to
    ``path``; the two differ only for array items aligned out of order
    (``expected[1]`` ↔ ``actual[0]``), so that each side navigates its own
    index. ``metric_results`` collects the value of every requested metric at
    this node (filled by the engine in phase 2).

    ``key_metric`` is the node's *representative* metric: the single score
    that bubbles up into a parent's aggregation and, at the root, into
    ``report.score``. It is computed last, since it may depend on the node's
    other metrics, and defaults to ``MeanScore`` (the arithmetic mean of the
    node's own metrics).
    """

    model_config = ConfigDict(arbitrary_types_allowed=True)

    path: str
    context: EvalContext
    expected_path: str | None = None
    # relative importance for weighted aggregation (OverallLeafScore, object metrics)
    weight: float = 1.0
    # list[BaseMetric] resolved for this node
    metrics: list[Any] = Field(default_factory=list)
    # BaseMetric: this node's representative score (parents read it)
    key_metric: Any = None
    # bar the representative score must clear to count as a TP
    threshold: float = 1.0
    metric_results: dict[str, MetricResult] = Field(default_factory=dict)

    @property
    def actual(self) -> Any:
        """Value at ``path`` in the actual document, or ``None`` when absent."""
        found = navigate(self.context.actual, self.path)
        if found is MISSING:
            return None
        return found

    @property
    def expected(self) -> Any:
        """Value at ``expected_path`` (or ``path``) in the expected document.

        ``None`` when there is no expected document or the path is absent.
        """
        if self.context.expected is None:
            return None
        found = navigate(self.context.expected, self.expected_path or self.path)
        if found is MISSING:
            return None
        return found

    @property
    def representative(self) -> float:
        """The node's single representative score: its ``key_metric``'s value.

        The engine gives every node a ``key_metric`` (``MeanScore`` by default)
        and at least one metric for it to summarise, so the value is expected
        to exist by the time it is read. Parents read their already computed
        children's representatives (post-order); the root's is
        ``report.score``. A missing value is a programming error, not a
        fallback case.

        Raises:
            ValueError: if the node has no ``key_metric`` or that metric has
                no entry in ``metric_results``.
        """
        km = self.key_metric
        if km is None:
            raise ValueError(f"node {self.path!r} has no key_metric")
        result = self.metric_results.get(km.name)
        if result is None:
            raise ValueError(
                f"node {self.path!r}: key_metric {km.name!r} has no computed value"
            )
        return float(result)

    # ── traversal ──────────────────────────────────────────────────────────
    # Children are found by duck-typing (``children`` on objects, ``items`` on
    # arrays) so that the base node does not have to import its own subclasses.

    def children_nodes(self) -> Iterator[EvalNode]:
        """Yield the node's direct child nodes (none for a scalar leaf)."""
        mapping = getattr(self, "children", None)
        if isinstance(mapping, dict):
            for child in mapping.values():
                yield child
        sequence = getattr(self, "items", None)
        if isinstance(sequence, list):
            for item in sequence:
                yield item

    def is_leaf(self) -> bool:
        """True for a scalar node (no object children, no array items)."""
        if getattr(self, "children", None) is not None:
            return False
        return getattr(self, "items", None) is None

    def walk(self) -> Iterator[EvalNode]:
        """Depth-first traversal yielding this node and every descendant."""
        yield self
        for child in self.children_nodes():
            for descendant in child.walk():
                yield descendant

    def leaves(self) -> Iterator[EvalNode]:
        """Yield every scalar (leaf) node at or beneath this node."""
        yield from (node for node in self.walk() if node.is_leaf())
|
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from pydantic import Field
|
|
4
|
+
|
|
5
|
+
from structured_eval.models.nodes.base import EvalNode
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
class ObjectNode(EvalNode):
    """A dict node.

    ``matched`` holds child nodes present in both actual and expected.
    ``missing`` / ``spurious`` hold keys present on only one side (FN / FP).
    ``children`` maps every child key to its node for tree traversal.
    """

    # Child nodes present on both sides.
    matched: list[EvalNode] = Field(default_factory=list)
    # Keys present on only one side: ``missing`` is the FN list, ``spurious``
    # the FP list.
    missing: list[str] = Field(default_factory=list)
    spurious: list[str] = Field(default_factory=list)
    # Key → node; ``EvalNode.children_nodes`` finds children via this attribute.
    children: dict[str, EvalNode] = Field(default_factory=dict)
|
|
@@ -0,0 +1,14 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from structured_eval.models.nodes.base import EvalNode
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
class ScalarNode(EvalNode):
    """A leaf node: a single comparable value.

    In v3 there is no pre-computed ``similarity`` — comparison *is* a metric.
    The match criterion is the node's ``key_metric`` (defined on ``EvalNode``):
    its representative score, defaulting to ``MeanScore`` over the node's field
    metrics (a lone ``ExactMatch`` when none are configured). ``threshold`` is
    the bar that score must clear to count as a true positive.

    The class adds no fields of its own; everything is inherited from
    ``EvalNode``.
    """
|