structured-eval 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- structured_eval/__init__.py +27 -0
- structured_eval/alignment/__init__.py +15 -0
- structured_eval/alignment/base.py +40 -0
- structured_eval/alignment/by_index.py +24 -0
- structured_eval/alignment/by_key.py +73 -0
- structured_eval/alignment/factory.py +28 -0
- structured_eval/alignment/hungarian.py +156 -0
- structured_eval/api.py +79 -0
- structured_eval/engine/__init__.py +15 -0
- structured_eval/engine/aggregator.py +96 -0
- structured_eval/engine/evaluator.py +72 -0
- structured_eval/engine/metric_runner.py +69 -0
- structured_eval/engine/parser.py +42 -0
- structured_eval/engine/report_builder.py +68 -0
- structured_eval/engine/tree_builder.py +319 -0
- structured_eval/formats/__init__.py +5 -0
- structured_eval/formats/base.py +19 -0
- structured_eval/formats/json_parser.py +44 -0
- structured_eval/formats/yaml_parser.py +24 -0
- structured_eval/integrations/__init__.py +11 -0
- structured_eval/integrations/_adapter.py +47 -0
- structured_eval/integrations/deepeval.py +74 -0
- structured_eval/integrations/langsmith.py +90 -0
- structured_eval/metrics/__init__.py +101 -0
- structured_eval/metrics/array_accuracy.py +28 -0
- structured_eval/metrics/array_cardinality.py +27 -0
- structured_eval/metrics/array_exact_match.py +48 -0
- structured_eval/metrics/array_f1.py +34 -0
- structured_eval/metrics/array_jaccard_similarity.py +60 -0
- structured_eval/metrics/array_precision.py +34 -0
- structured_eval/metrics/array_prf1.py +40 -0
- structured_eval/metrics/array_recall.py +33 -0
- structured_eval/metrics/base.py +144 -0
- structured_eval/metrics/character_f1.py +50 -0
- structured_eval/metrics/composite_score.py +46 -0
- structured_eval/metrics/coverage_leaf_score.py +29 -0
- structured_eval/metrics/date_distance_score.py +63 -0
- structured_eval/metrics/exact.py +21 -0
- structured_eval/metrics/exponential_numeric_score.py +47 -0
- structured_eval/metrics/field_faithfulness.py +38 -0
- structured_eval/metrics/fuzzy.py +64 -0
- structured_eval/metrics/invoker.py +90 -0
- structured_eval/metrics/levenshtein.py +16 -0
- structured_eval/metrics/mean_score.py +31 -0
- structured_eval/metrics/numeric.py +83 -0
- structured_eval/metrics/numeric_closeness.py +35 -0
- structured_eval/metrics/object_accuracy.py +47 -0
- structured_eval/metrics/object_exact_match.py +41 -0
- structured_eval/metrics/object_f1.py +47 -0
- structured_eval/metrics/object_precision.py +49 -0
- structured_eval/metrics/object_prf1.py +51 -0
- structured_eval/metrics/object_recall.py +44 -0
- structured_eval/metrics/object_type_validity.py +34 -0
- structured_eval/metrics/overall_leaf_score.py +32 -0
- structured_eval/metrics/presence.py +22 -0
- structured_eval/metrics/regex_match.py +51 -0
- structured_eval/metrics/rule_pass_rate/__init__.py +5 -0
- structured_eval/metrics/rule_pass_rate/dsl.py +224 -0
- structured_eval/metrics/rule_pass_rate/engine.py +24 -0
- structured_eval/metrics/rule_pass_rate/metric.py +33 -0
- structured_eval/metrics/schema_validity/__init__.py +7 -0
- structured_eval/metrics/schema_validity/metric.py +35 -0
- structured_eval/metrics/schema_validity/validator.py +119 -0
- structured_eval/metrics/structural_similarity.py +40 -0
- structured_eval/metrics/token_f1.py +44 -0
- structured_eval/metrics/type_match.py +35 -0
- structured_eval/metrics/utils/__init__.py +10 -0
- structured_eval/metrics/utils/array.py +31 -0
- structured_eval/metrics/utils/calculate.py +72 -0
- structured_eval/metrics/utils/number.py +46 -0
- structured_eval/metrics/utils/object_utils.py +87 -0
- structured_eval/models/__init__.py +72 -0
- structured_eval/models/config.py +124 -0
- structured_eval/models/context.py +25 -0
- structured_eval/models/metric_result.py +121 -0
- structured_eval/models/nodes/__init__.py +13 -0
- structured_eval/models/nodes/array_node.py +32 -0
- structured_eval/models/nodes/base.py +113 -0
- structured_eval/models/nodes/object_node.py +19 -0
- structured_eval/models/nodes/scalar.py +14 -0
- structured_eval/models/result.py +361 -0
- structured_eval/models/sample.py +19 -0
- structured_eval/py.typed +0 -0
- structured_eval/reporting/__init__.py +5 -0
- structured_eval/reporting/console.py +194 -0
- structured_eval/utils/__init__.py +16 -0
- structured_eval/utils/flatten.py +66 -0
- structured_eval/utils/paths.py +58 -0
- structured_eval/utils/structured_diff.py +159 -0
- structured_eval-0.1.0.dist-info/METADATA +322 -0
- structured_eval-0.1.0.dist-info/RECORD +94 -0
- structured_eval-0.1.0.dist-info/WHEEL +5 -0
- structured_eval-0.1.0.dist-info/licenses/LICENSE +201 -0
- structured_eval-0.1.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,144 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from abc import ABC, abstractmethod
|
|
4
|
+
from typing import Any
|
|
5
|
+
|
|
6
|
+
from structured_eval.models.metric_result import MetricResult
|
|
7
|
+
from structured_eval.models.nodes.array_node import ArrayNode
|
|
8
|
+
from structured_eval.models.nodes.base import EvalNode
|
|
9
|
+
from structured_eval.models.nodes.object_node import ObjectNode
|
|
10
|
+
from structured_eval.models.nodes.scalar import ScalarNode
|
|
11
|
+
|
|
12
|
+
# Every shape a metric's ``compute`` is allowed to hand back;
# ``MetricRunner._apply`` turns each of them into a ``MetricResult``:
#   * a bare score, or a dict of named sub-scores;
#   * either of those paired with a structured ``extra`` dict in a 2-tuple;
#   * an already-built ``MetricResult``;
#   * ``None``.
_ScoreValue = float | dict[str, float]
MetricOutput = _ScoreValue | tuple[_ScoreValue, dict[str, Any]] | MetricResult | None
|
|
22
|
+
|
|
23
|
+
# Registry mapping a metric's ``name`` to the class that declared it. Filled in
# automatically by ``BaseMetric.__init_subclass__`` and read when a metric is
# referenced by its string name (e.g. from a YAML config).
_METRIC_REGISTRY: dict[str, type] = {}


class BaseMetric(ABC):  # noqa: B024 — registry root; subclasses define the interface
    """Registry root for every metric — no evaluation interface of its own.

    ``name`` is the key under which a scalar result lands in ``report.metrics``
    and ``FieldScore.metrics``. A metric that returns a ``dict`` instead writes
    each of its keys directly (the ``name`` is then only a registry handle).

    A subclass is registered only when it declares a non-empty string ``name``
    in its *own* class body. A subclass that merely inherits its parent's
    ``name`` is not registered, so it cannot silently replace the parent's
    registry entry.
    """

    name: str = ""

    def __init_subclass__(cls, **kwargs: Any) -> None:
        """Register ``cls`` under the ``name`` it declares, if it declares one."""
        super().__init_subclass__(**kwargs)
        # Look at the class's own namespace rather than ``getattr``: ``getattr``
        # would also see an inherited name and overwrite the parent's entry.
        declared = cls.__dict__.get("name")
        if isinstance(declared, str) and declared:
            _METRIC_REGISTRY[declared] = cls
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
class Metric[NodeT: EvalNode](BaseMetric):
|
|
46
|
+
"""The unified metric interface: ``compute(node)`` + ``score(actual, expected)``.
|
|
47
|
+
|
|
48
|
+
Every concrete metric is a ``Metric`` and therefore *has* a ``score`` — a
|
|
49
|
+
pure value-level comparison ``(actual, expected) -> float | dict`` reused by
|
|
50
|
+
array alignment. ``compute(node)`` is the node-level entry point; by default
|
|
51
|
+
it delegates to ``score`` on the node's values, so a leaf comparison need
|
|
52
|
+
only implement ``score``. Aggregating metrics override ``compute`` and leave
|
|
53
|
+
``score`` at its default (callers that require a scalar verdict check the
|
|
54
|
+
result type where it matters). The type parameter ``NodeT`` pins the node
|
|
55
|
+
type a subtype operates on (``ScalarNode`` for fields, ``ObjectNode`` …).
|
|
56
|
+
"""
|
|
57
|
+
|
|
58
|
+
def compute(self, node: NodeT) -> MetricOutput:
|
|
59
|
+
return self.score(node.actual, node.expected)
|
|
60
|
+
|
|
61
|
+
def score(self, actual: Any, expected: Any) -> float | dict[str, float]:
|
|
62
|
+
raise NotImplementedError
|
|
63
|
+
|
|
64
|
+
|
|
65
|
+
class FieldMetric(Metric[ScalarNode]):
    """Leaf-level comparison run on every ``ScalarNode``.

    A typical subclass supplies only ``score(actual, expected)`` and inherits
    ``compute``. A metric that needs the node itself rather than just its two
    values (``Presence``, for instance) overrides ``compute``. The class also
    serves as the marker the engine dispatches on for scalar nodes.
    """
|
|
72
|
+
|
|
73
|
+
|
|
74
|
+
class ObjectMetric(Metric[ObjectNode]):
    """Metric run on every ``ObjectNode`` — the root object and nested ones."""

    @abstractmethod
    def compute(self, node: ObjectNode) -> MetricOutput:
        """Grade a single object node; concrete subclasses must implement this."""
|
|
79
|
+
|
|
80
|
+
|
|
81
|
+
class ArrayMetric(Metric[ArrayNode]):
    """Metric run on every ``ArrayNode``."""

    @abstractmethod
    def compute(self, node: ArrayNode) -> MetricOutput:
        """Grade a single array node; concrete subclasses must implement this."""
|
|
86
|
+
|
|
87
|
+
|
|
88
|
+
class RootMetric(Metric[EvalNode]):
    """Metric admitted only at the root node (path == "$"), whatever its kind."""

    @abstractmethod
    def compute(self, node: EvalNode) -> MetricOutput:
        """Grade the root node; concrete subclasses must implement this."""
|
|
93
|
+
|
|
94
|
+
|
|
95
|
+
class AnyNodeMetric(Metric[EvalNode]):
    """Node-agnostic metric: one ``compute`` applied to *every* node alike.

    How it differs from its neighbours in the hierarchy:

    * the typed metrics (``FieldMetric`` / ``ObjectMetric`` / ``ArrayMetric``)
      are each pinned to one node type — this one is not;
    * ``GenericMetric`` dispatches to a different method per node kind — this
      one runs the same computation on any ``EvalNode``;
    * ``RootMetric`` is also a ``Metric[EvalNode]`` but is admitted only at the
      root — an ``AnyNodeMetric`` is admitted everywhere.

    ``MeanScore`` (the default representative) belongs to this branch. A custom
    uniform metric can be cascaded through ``config.metrics`` or selected as a
    ``key_metric``.
    """

    @abstractmethod
    def compute(self, node: EvalNode) -> MetricOutput:
        """Grade any node; concrete subclasses must implement this."""
|
|
110
|
+
|
|
111
|
+
|
|
112
|
+
class GenericMetric(BaseMetric):
    """Metric covering several node types, without a single ``compute``.

    A subclass implements whichever per-kind hooks make sense for it:

    * node mode — ``compute_scalar`` / ``compute_object`` / ``compute_array``;
    * value mode (optional) — ``score_scalar`` / ``score_object`` /
      ``score_array``.

    ``MetricInvoker`` picks the hook by kind, and ``TreeBuilder`` admits the
    metric onto a node only when the matching ``compute_<kind>`` exists.
    (This class replaces the former ``NodeMetric``.)
    """
|
|
122
|
+
|
|
123
|
+
|
|
124
|
+
def get_metric_class(name: str) -> type:
    """Look up the metric class registered under ``name`` (e.g. ``"object_f1"``).

    Raises:
        KeyError: if no metric is registered under ``name``; the message lists
            the known names in sorted order.
    """
    metric_cls = _METRIC_REGISTRY.get(name)
    if metric_cls is None:
        raise KeyError(f"Unknown metric: {name!r}. Known: {sorted(_METRIC_REGISTRY)}")
    return metric_cls
|
|
129
|
+
|
|
130
|
+
|
|
131
|
+
def resolve_metric(spec: str | BaseMetric) -> BaseMetric:
    """Coerce a metric spec to a ``BaseMetric`` instance.

    The single resolver shared by the engine, array alignment, and the
    match-criterion helper. ``None`` is *not* handled here — callers supply
    their own default. Score-needing call sites narrow the result to ``Metric``.

    Args:
        spec: Either a metric instance, returned unchanged, or the registered
            name of a metric class, which is instantiated with no arguments
            (so a class whose constructor requires arguments cannot be
            resolved by name).

    Returns:
        The resolved metric instance.

    Raises:
        KeyError: if ``spec`` is a string that names no registered metric.
        TypeError: if the outcome is not a ``BaseMetric`` instance — for
            example when a metric *class* or ``None`` is passed.
    """
    candidate = get_metric_class(spec)() if isinstance(spec, str) else spec
    # Raise explicitly rather than ``assert``: assertions are stripped under
    # ``python -O``, which would let an invalid spec through unchecked.
    if not isinstance(candidate, BaseMetric):
        raise TypeError(
            f"Cannot resolve {spec!r} to a BaseMetric instance "
            f"(got {type(candidate).__name__})"
        )
    return candidate
|
|
@@ -0,0 +1,50 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import re
|
|
4
|
+
from collections import Counter
|
|
5
|
+
from typing import Any
|
|
6
|
+
|
|
7
|
+
from structured_eval.metrics.base import FieldMetric
|
|
8
|
+
|
|
9
|
+
_NON_WORD = re.compile(r"[^\w\s]")
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
def _characters(value: Any) -> list[str]:
    """Return the comparable characters of ``value`` as a list.

    ``value`` is stringified and lowercased, every character matched by
    ``_NON_WORD`` is removed, and all whitespace is dropped. What remains is
    returned one character per list element, in order.
    """
    text = _NON_WORD.sub("", str(value).lower())
    # ``split()`` with no argument discards every run of whitespace.
    return [char for chunk in text.split() for char in chunk]
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
class CharacterF1(FieldMetric):
    """Character-overlap F1, intended for short free-text fields.

    Both strings are reduced to their normalized characters by
    ``_characters`` and compared as **multisets** (``Counter``): a repeated
    character counts only as often as it occurs on both sides. Precision is the
    overlap over the actual character count, recall the overlap over the
    expected character count, and the score is their harmonic mean.

    String-only: when either side is not a ``str`` the score is ``0.0`` and no
    coercion is attempted.
    """

    name = "character_f1"

    def score(self, actual: Any, expected: Any) -> float:
        """Return the character F1 of ``actual`` against ``expected`` in [0, 1]."""
        if not isinstance(actual, str) or not isinstance(expected, str):
            return 0.0

        actual_chars = _characters(actual)
        expected_chars = _characters(expected)

        if not actual_chars or not expected_chars:
            # Two empty sides agree perfectly; a single empty side cannot match.
            return 1.0 if not actual_chars and not expected_chars else 0.0

        overlap = (Counter(actual_chars) & Counter(expected_chars)).total()
        if overlap == 0:
            # Also keeps the harmonic mean below from dividing by zero.
            return 0.0

        precision = overlap / len(actual_chars)
        recall = overlap / len(expected_chars)
        return 2 * precision * recall / (precision + recall)
|
|
@@ -0,0 +1,46 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from typing import TYPE_CHECKING
|
|
4
|
+
|
|
5
|
+
from structured_eval.metrics.base import AnyNodeMetric
|
|
6
|
+
|
|
7
|
+
if TYPE_CHECKING:
|
|
8
|
+
from structured_eval.models.nodes.base import EvalNode
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
class CompositeScore(AnyNodeMetric):
    """Weighted blend of other metrics already computed on the same node.

    Constructed with ``weights={metric_name: weight}``. The weights are
    rescaled to sum to 1.0, and the score is::

        score = Σ wᵢ · metric_resultsᵢ

    taken over the named metrics found in ``node.metric_results``. Those
    metrics therefore have to be computed on the node first: list them in the
    node's ``metrics`` (or as cascaded ``config.metrics``) next to
    ``CompositeScore``. It works best as the node's ``key_metric``, which the
    engine runs **last**, after every other metric on the node.

    Metrics not named in ``weights`` are ignored. A named metric missing from
    the node adds nothing to the sum, although its weight still takes part in
    the normalization. The result is clamped to ``[0, 1]`` (each input is
    expected in ``[0, 1]``).
    """

    name = "composite_score"

    def __init__(self, weights: dict[str, float]) -> None:
        """Store ``weights`` normalized to sum to 1.0.

        Raises:
            ValueError: if ``weights`` is empty or its values sum to ``<= 0``.
        """
        if not weights:
            raise ValueError("CompositeScore requires at least one metric weight")
        weight_sum = sum(weights.values())
        if weight_sum <= 0:
            raise ValueError("Sum of weights must be > 0")
        normalized: dict[str, float] = {}
        for metric_name, raw_weight in weights.items():
            normalized[metric_name] = raw_weight / weight_sum
        self.weights: dict[str, float] = normalized

    def compute(self, node: EvalNode) -> float:
        """Return the clamped weighted sum of the named metrics present on ``node``."""
        contributions = [
            weight * float(node.metric_results[metric_name])
            for metric_name, weight in self.weights.items()
            if metric_name in node.metric_results
        ]
        blended = sum(contributions)
        return min(1.0, max(0.0, blended))
|
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from typing import TYPE_CHECKING
|
|
4
|
+
|
|
5
|
+
from structured_eval.metrics.base import RootMetric
|
|
6
|
+
|
|
7
|
+
if TYPE_CHECKING:
|
|
8
|
+
from structured_eval.models.nodes.base import EvalNode
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
class CoverageLeafScore(RootMetric):
    """Share of expected leaf fields that carry a non-null value in actual.

    Measures completeness over the whole document and ignores whether the
    values are correct. Only leaves whose ``expected`` is not ``None`` are
    counted; when there are none, the score is vacuously ``1.0``. (Array
    elements missed during alignment have no leaf node and are covered by the
    array metrics instead.)
    """

    name = "coverage_leaf_score"

    def compute(self, node: EvalNode) -> float:
        """Return covered / expected over the leaves of ``node``."""
        wanted = [leaf for leaf in node.leaves() if leaf.expected is not None]
        if not wanted:
            return 1.0
        present = sum(1 for leaf in wanted if leaf.actual is not None)
        return present / len(wanted)
|
|
@@ -0,0 +1,63 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from datetime import date, datetime
|
|
4
|
+
from typing import Any
|
|
5
|
+
|
|
6
|
+
from pydantic import TypeAdapter
|
|
7
|
+
|
|
8
|
+
from structured_eval.metrics.base import FieldMetric
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
# Built once and reused: constructing a ``TypeAdapter`` rebuilds its validator,
# so creating one per call adds avoidable work to every leaf comparison.
_DATE_ADAPTER = TypeAdapter(date)


def _to_date(value: Any) -> date | None:
    """Best-effort coercion of ``value`` to a ``date`` via pydantic.

    Returns:
        The validated ``date``, or ``None`` when pydantic rejects ``value``
        (or validation fails for any other reason).

    NOTE(review): pydantic's default (lax) date validation may accept inputs
    other than ISO-8601 strings, e.g. numeric timestamps — confirm whether
    that is intended for this metric.
    """
    try:
        return _DATE_ADAPTER.validate_python(value)
    except Exception:  # deliberate best-effort: anything unreadable is "no date"
        return None
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
class DateDistanceScore(FieldMetric):
    """Linear similarity for date and datetime fields.

    The score is::

        max(0, 1 - days_difference / max_days)

    which gives:

    - ``1.0`` when both sides fall on the same calendar date;
    - a linear drop as the gap in days grows;
    - ``0.0`` once the gap reaches or exceeds ``max_days``.

    ``date`` and ``datetime`` values are used directly; anything else
    (e.g. an ISO-8601 string such as ``"2026-06-29"``) goes through
    ``_to_date``. A ``datetime`` is reduced to its calendar date, so the
    time of day is ignored.

    When either side cannot be read as a date the score is ``0.0``.
    """

    name = "date_distance_score"

    def __init__(self, max_days: int = 30) -> None:
        """Set the day gap at which the score bottoms out at ``0.0``.

        Raises:
            ValueError: if ``max_days`` is not greater than 0.
        """
        if max_days <= 0:
            raise ValueError("max_days must be greater than 0")
        self.max_days = max_days

    @staticmethod
    def _calendar_date(value: Any) -> date | None:
        """Return ``value`` as a plain calendar date, or ``None`` if unreadable."""
        # ``datetime`` subclasses ``date``, so one check covers both.
        parsed = value if isinstance(value, date) else _to_date(value)
        if isinstance(parsed, datetime):
            return parsed.date()
        return parsed

    def score(self, actual: Any, expected: Any) -> float:
        """Return the linear date similarity of ``actual`` vs ``expected``."""
        actual_day = self._calendar_date(actual)
        expected_day = self._calendar_date(expected)
        if actual_day is None or expected_day is None:
            return 0.0

        gap_days = abs((actual_day - expected_day).days)
        return max(0.0, 1.0 - gap_days / self.max_days)
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from typing import Any
|
|
4
|
+
|
|
5
|
+
from structured_eval.metrics.base import FieldMetric
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
class ExactMatch(FieldMetric):
    """Equality check: ``1.0`` when ``actual == expected``, otherwise ``0.0``.

    This is the default scalar comparison and also the default key comparison
    for ``by_key`` array alignment. It is *not* what scores whole objects or
    arrays: object metrics read each child's representative, and array
    alignment defaults are type-aware. The only way ExactMatch sees a dict or
    list is through the value-level ``score`` path.
    """

    name = "exact_match"

    def score(self, actual: Any, expected: Any) -> float:
        """Return ``1.0`` if the two values compare equal with ``==``, else ``0.0``."""
        if actual == expected:
            return 1.0
        return 0.0
|
|
@@ -0,0 +1,47 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import math
|
|
4
|
+
from typing import Any
|
|
5
|
+
|
|
6
|
+
from structured_eval.metrics.base import FieldMetric
|
|
7
|
+
from structured_eval.metrics.utils.number import parse_number
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
class ExponentialNumericScore(FieldMetric):
    """Exponentially decaying similarity for numeric fields.

    The score is::

        exp(-abs(actual - expected) / scale)

    so an exact match gives ``1.0`` and the score decays smoothly towards
    ``0.0`` as the absolute error grows.

    ``scale`` sets how fast the score falls: the larger it is, the more
    tolerant the metric. Unlike the ratio-based :class:`NumericCloseness`, the
    decay acts on the **absolute** error, so ``scale`` should be chosen to
    match the field's units.

    Both sides are read with ``parse_number``, the same lenient parser used by
    :class:`Numeric` / :class:`NumericCloseness`, so numeric strings are graded
    too. The metric applies **only to numbers**: when either side isn't numeric
    (``None``, a non-numeric string, or a ``bool`` — ``True`` is not ``1``) the
    score is ``0.0``.
    """

    name = "exponential_numeric_score"

    def __init__(self, scale: float = 1.0) -> None:
        """Set the decay scale.

        Raises:
            ValueError: if ``scale`` is not greater than 0.
        """
        if scale <= 0:
            raise ValueError("scale must be greater than 0")
        self.scale = scale

    def score(self, actual: Any, expected: Any) -> float:
        """Return ``exp(-|actual - expected| / scale)``, or ``0.0`` if non-numeric."""
        parsed_actual = parse_number(actual)
        parsed_expected = parse_number(expected)
        if parsed_actual is None or parsed_expected is None:
            return 0.0
        absolute_error = abs(parsed_actual - parsed_expected)
        return math.exp(-absolute_error / self.scale)
|
|
@@ -0,0 +1,38 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from typing import TYPE_CHECKING
|
|
4
|
+
|
|
5
|
+
from structured_eval.metrics.base import FieldMetric
|
|
6
|
+
|
|
7
|
+
if TYPE_CHECKING:
|
|
8
|
+
from structured_eval.models.nodes.scalar import ScalarNode
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
class FieldFaithfulness(FieldMetric):
    """Is this leaf value grounded in the sample's ``source``? (L1 substring.)

    A per-field faithfulness check: a scalar leaf scores ``1.0`` when the
    string form of its actual value occurs in ``source`` (compared in lower
    case), and ``0.0`` otherwise (a hallucination). A leaf whose actual value
    is ``None`` returns ``None`` instead of a score.

    Cascade it with ``EvalConfig(metrics=[FieldFaithfulness()])``; aggregation
    is then the usual leaf roll-up (``MeanScore`` / ``OverallLeafScore``), and
    the hallucinated fields are the leaves scoring ``0.0``
    (``report.metrics["field_faithfulness"].by_path``).

    A grounding ``source`` on the sample is mandatory. Faithfulness is
    undefined without one, so a missing ``source`` raises ``ValueError``
    rather than silently omitting the metric.
    """

    name = "field_faithfulness"

    def compute(self, node: ScalarNode) -> float | None:
        """Score one leaf against the source text.

        Raises:
            ValueError: if the node's context has no ``source``.
        """
        grounding = node.context.source
        if grounding is None:
            raise ValueError(
                "Faithfulness requires a grounding `source`; pass source=... to evaluate()"
            )
        value = node.actual
        if value is None:
            return None
        needle = str(value).lower()
        haystack = grounding.lower()
        return 1.0 if needle in haystack else 0.0
|
|
@@ -0,0 +1,64 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from enum import StrEnum
|
|
4
|
+
from typing import Any
|
|
5
|
+
|
|
6
|
+
from structured_eval.metrics.base import FieldMetric
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
class FuzzyMethod(StrEnum):
|
|
10
|
+
"""RapidFuzz scorer used by :class:`Fuzzy`."""
|
|
11
|
+
|
|
12
|
+
RATIO = "ratio" # plain normalized Levenshtein ratio
|
|
13
|
+
PARTIAL_RATIO = "partial_ratio" # best matching substring
|
|
14
|
+
TOKEN_SORT_RATIO = "token_sort_ratio" # order-insensitive (default)
|
|
15
|
+
TOKEN_SET_RATIO = "token_set_ratio" # set-based, ignores duplicate tokens
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
class Fuzzy(FieldMetric):
    """Fuzzy string similarity backed by RapidFuzz (an optional dependency).

    ``method`` picks the RapidFuzz scorer:

    * ``ratio`` — plain normalized Levenshtein ratio;
    * ``partial_ratio`` — best matching substring;
    * ``token_sort_ratio`` (default) — order-insensitive, sorts tokens;
    * ``token_set_ratio`` — set-based, ignores duplicate/extra tokens.

    With ``normalize`` enabled, both strings are stripped of surrounding
    whitespace and lowercased before they are compared. The scorer's 0–100
    result is divided by 100.

    String-only: if either side is not a ``str`` the score is 0.0 (no
    coercion), consistent with the other text metrics.
    """

    name = "fuzzy"

    def __init__(
        self,
        method: FuzzyMethod = FuzzyMethod.TOKEN_SORT_RATIO,
        normalize: bool = True,
    ) -> None:
        """Store the scorer choice (coerced to ``FuzzyMethod``) and the normalize flag."""
        self.method = FuzzyMethod(method)
        self.normalize = normalize

    def score(self, actual: Any, expected: Any) -> float:
        """Return the chosen RapidFuzz similarity, scaled to ``[0, 1]``.

        Raises:
            ImportError: if both sides are strings but ``rapidfuzz`` is not
                installed.
        """
        # Type check comes first, so non-string input never needs rapidfuzz.
        if not isinstance(actual, str) or not isinstance(expected, str):
            return 0.0

        try:
            from rapidfuzz import fuzz
        except ImportError as exc:  # pragma: no cover
            raise ImportError(
                "rapidfuzz is required for the 'fuzzy' metric. "
                "Install it with: pip install 'structured-eval[fuzzy]'"
            ) from exc

        scorers = {
            FuzzyMethod.RATIO: fuzz.ratio,
            FuzzyMethod.PARTIAL_RATIO: fuzz.partial_ratio,
            FuzzyMethod.TOKEN_SORT_RATIO: fuzz.token_sort_ratio,
            FuzzyMethod.TOKEN_SET_RATIO: fuzz.token_set_ratio,
        }
        compare = scorers[self.method]

        if self.normalize:
            left, right = actual.strip().lower(), expected.strip().lower()
        else:
            left, right = actual, expected
        return float(compare(left, right)) / 100.0
|
|
@@ -0,0 +1,90 @@
|
|
|
1
|
+
"""The single way to run a metric, whatever input is available.
|
|
2
|
+
|
|
3
|
+
Every metric is invoked through ``MetricInvoker`` — never by calling ``compute``
|
|
4
|
+
/ ``compute_<kind>`` / ``score`` directly. Two input modes:
|
|
5
|
+
|
|
6
|
+
* ``on_node`` — a node is available: grade it. A ``Metric`` uses ``compute``; a
|
|
7
|
+
``GenericMetric`` dispatches to the ``compute_<kind>`` for the node's type.
|
|
8
|
+
* ``on_values`` — only raw ``actual`` / ``expected`` (array alignment, before any
|
|
9
|
+
node exists): compare them. A ``Metric`` uses ``score``; a ``GenericMetric``
|
|
10
|
+
dispatches to the ``score_<kind>`` for the kind inferred from the value's shape.
|
|
11
|
+
|
|
12
|
+
Each mode has a ``scalar_*`` variant that narrows the result to a single
|
|
13
|
+
``float`` (rejecting a dict of sub-scores) — that narrowing is the caller's
|
|
14
|
+
contract, hence its own method.
|
|
15
|
+
"""
|
|
16
|
+
|
|
17
|
+
from __future__ import annotations
|
|
18
|
+
|
|
19
|
+
from typing import TYPE_CHECKING, Any
|
|
20
|
+
|
|
21
|
+
from structured_eval.metrics.base import BaseMetric, GenericMetric, Metric, MetricOutput
|
|
22
|
+
from structured_eval.models.nodes.array_node import ArrayNode
|
|
23
|
+
from structured_eval.models.nodes.object_node import ObjectNode
|
|
24
|
+
from structured_eval.models.nodes.scalar import ScalarNode
|
|
25
|
+
|
|
26
|
+
if TYPE_CHECKING:
|
|
27
|
+
from structured_eval.models.nodes.base import EvalNode
|
|
28
|
+
|
|
29
|
+
# Node class paired with the kind suffix used in a GenericMetric's hook names.
_GENERIC_KINDS: tuple[tuple[type, str], ...] = (
    (ScalarNode, "scalar"),
    (ObjectNode, "object"),
    (ArrayNode, "array"),
)

# Node mode: node class -> name of the ``compute_<kind>`` hook.
GENERIC_NODE_METHOD: dict[type, str] = {
    node_cls: f"compute_{kind}" for node_cls, kind in _GENERIC_KINDS
}
# Value mode: node class -> name of the ``score_<kind>`` hook.
GENERIC_SCORE_METHOD: dict[type, str] = {
    node_cls: f"score_{kind}" for node_cls, kind in _GENERIC_KINDS
}
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
def _kind_of(actual: Any, expected: Any) -> type:
    """Return the node class a raw value pair would build (mirrors ``TreeBuilder``).

    The kind is read from ``expected``, falling back to ``actual`` only when
    ``expected`` is ``None``: a ``dict`` maps to ``ObjectNode``, a ``list`` to
    ``ArrayNode``, and everything else to ``ScalarNode``.
    """
    reference = actual if expected is None else expected
    for python_type, node_cls in ((dict, ObjectNode), (list, ArrayNode)):
        if isinstance(reference, python_type):
            return node_cls
    return ScalarNode
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
class MetricInvoker:
|
|
53
|
+
"""Runs ``self.metric`` in either input mode; see module docstring."""
|
|
54
|
+
|
|
55
|
+
def __init__(self, metric: BaseMetric):
|
|
56
|
+
self.metric = metric
|
|
57
|
+
|
|
58
|
+
def on_node(self, node: EvalNode) -> MetricOutput:
|
|
59
|
+
metric = self.metric
|
|
60
|
+
if isinstance(metric, GenericMetric):
|
|
61
|
+
return self._dispatch_generic(GENERIC_NODE_METHOD.get(type(node)), node)
|
|
62
|
+
assert isinstance(metric, Metric) # every non-generic metric has compute(node)
|
|
63
|
+
return metric.compute(node)
|
|
64
|
+
|
|
65
|
+
def on_values(self, actual: Any, expected: Any) -> MetricOutput:
|
|
66
|
+
metric = self.metric
|
|
67
|
+
if isinstance(metric, GenericMetric):
|
|
68
|
+
method = GENERIC_SCORE_METHOD.get(_kind_of(actual, expected))
|
|
69
|
+
return self._dispatch_generic(method, actual, expected)
|
|
70
|
+
assert isinstance(metric, Metric) # value comparison needs score()
|
|
71
|
+
return metric.score(actual, expected)
|
|
72
|
+
|
|
73
|
+
def scalar_on_node(self, node: EvalNode) -> float:
|
|
74
|
+
return self._scalar(self.on_node(node), node.path)
|
|
75
|
+
|
|
76
|
+
def scalar_on_values(self, actual: Any, expected: Any) -> float:
|
|
77
|
+
return self._scalar(self.on_values(actual, expected), "<values>")
|
|
78
|
+
|
|
79
|
+
def _dispatch_generic(self, method: str | None, *args: Any) -> MetricOutput:
|
|
80
|
+
if method is None or not hasattr(self.metric, method):
|
|
81
|
+
return None
|
|
82
|
+
result: MetricOutput = getattr(self.metric, method)(*args)
|
|
83
|
+
return result
|
|
84
|
+
|
|
85
|
+
def _scalar(self, result: Any, where: str) -> float:
|
|
86
|
+
assert isinstance(result, (int, float)), (
|
|
87
|
+
f"metric {self.metric.name!r} must yield a scalar at {where}, "
|
|
88
|
+
f"got {type(result).__name__}"
|
|
89
|
+
)
|
|
90
|
+
return float(result)
|
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from structured_eval.metrics.fuzzy import Fuzzy, FuzzyMethod
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
class Levenshtein(Fuzzy):
    """Normalized Levenshtein ratio — ``Fuzzy`` with ``RATIO`` as the default method.

    RapidFuzz's ``ratio`` *is* the normalized Levenshtein similarity, so this
    class exists only for discoverability. It adds no arithmetic of its own:
    scoring is inherited from ``Fuzzy``.
    """

    name = "levenshtein"

    def __init__(
        self,
        method: FuzzyMethod = FuzzyMethod.RATIO,
        normalize: bool = True,
    ) -> None:
        """Forward both settings unchanged to ``Fuzzy.__init__``."""
        super().__init__(method=method, normalize=normalize)
|
|
@@ -0,0 +1,31 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from typing import TYPE_CHECKING
|
|
4
|
+
|
|
5
|
+
from structured_eval.metrics.base import AnyNodeMetric
|
|
6
|
+
|
|
7
|
+
if TYPE_CHECKING:
|
|
8
|
+
from structured_eval.models.nodes.base import EvalNode
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
class MeanScore(AnyNodeMetric):
    """A node's representative score: the arithmetic mean of its own metrics.

    This is the default ``key_metric`` of every node — the one number that
    bubbles up to a parent's aggregation and, at the root, to ``report.score``.
    It is computed **last**, when the node's other metrics already populate
    ``metric_results``, and it averages those entries, skipping its own.

    It does not recurse into children. Cross-child aggregation belongs to the
    node's *own* metrics (``ObjectAccuracy`` / ``ObjectF1`` /
    ``ArrayAccuracy``), one of which the engine defaults onto every node.

    A node with no other computed metric (e.g. a leaf whose only metric opted
    out by returning ``None``) scores ``0.0``, so every node always has a
    representative.
    """

    name = "mean_score"

    def compute(self, node: EvalNode) -> float:
        """Return the mean of the node's other metric results, or ``0.0`` if none."""
        own_name = self.name
        scores = [
            float(value)
            for metric_name, value in node.metric_results.items()
            if metric_name != own_name
        ]
        if not scores:
            return 0.0
        return sum(scores) / len(scores)
|