structured-eval 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (94) hide show
  1. structured_eval/__init__.py +27 -0
  2. structured_eval/alignment/__init__.py +15 -0
  3. structured_eval/alignment/base.py +40 -0
  4. structured_eval/alignment/by_index.py +24 -0
  5. structured_eval/alignment/by_key.py +73 -0
  6. structured_eval/alignment/factory.py +28 -0
  7. structured_eval/alignment/hungarian.py +156 -0
  8. structured_eval/api.py +79 -0
  9. structured_eval/engine/__init__.py +15 -0
  10. structured_eval/engine/aggregator.py +96 -0
  11. structured_eval/engine/evaluator.py +72 -0
  12. structured_eval/engine/metric_runner.py +69 -0
  13. structured_eval/engine/parser.py +42 -0
  14. structured_eval/engine/report_builder.py +68 -0
  15. structured_eval/engine/tree_builder.py +319 -0
  16. structured_eval/formats/__init__.py +5 -0
  17. structured_eval/formats/base.py +19 -0
  18. structured_eval/formats/json_parser.py +44 -0
  19. structured_eval/formats/yaml_parser.py +24 -0
  20. structured_eval/integrations/__init__.py +11 -0
  21. structured_eval/integrations/_adapter.py +47 -0
  22. structured_eval/integrations/deepeval.py +74 -0
  23. structured_eval/integrations/langsmith.py +90 -0
  24. structured_eval/metrics/__init__.py +101 -0
  25. structured_eval/metrics/array_accuracy.py +28 -0
  26. structured_eval/metrics/array_cardinality.py +27 -0
  27. structured_eval/metrics/array_exact_match.py +48 -0
  28. structured_eval/metrics/array_f1.py +34 -0
  29. structured_eval/metrics/array_jaccard_similarity.py +60 -0
  30. structured_eval/metrics/array_precision.py +34 -0
  31. structured_eval/metrics/array_prf1.py +40 -0
  32. structured_eval/metrics/array_recall.py +33 -0
  33. structured_eval/metrics/base.py +144 -0
  34. structured_eval/metrics/character_f1.py +50 -0
  35. structured_eval/metrics/composite_score.py +46 -0
  36. structured_eval/metrics/coverage_leaf_score.py +29 -0
  37. structured_eval/metrics/date_distance_score.py +63 -0
  38. structured_eval/metrics/exact.py +21 -0
  39. structured_eval/metrics/exponential_numeric_score.py +47 -0
  40. structured_eval/metrics/field_faithfulness.py +38 -0
  41. structured_eval/metrics/fuzzy.py +64 -0
  42. structured_eval/metrics/invoker.py +90 -0
  43. structured_eval/metrics/levenshtein.py +16 -0
  44. structured_eval/metrics/mean_score.py +31 -0
  45. structured_eval/metrics/numeric.py +83 -0
  46. structured_eval/metrics/numeric_closeness.py +35 -0
  47. structured_eval/metrics/object_accuracy.py +47 -0
  48. structured_eval/metrics/object_exact_match.py +41 -0
  49. structured_eval/metrics/object_f1.py +47 -0
  50. structured_eval/metrics/object_precision.py +49 -0
  51. structured_eval/metrics/object_prf1.py +51 -0
  52. structured_eval/metrics/object_recall.py +44 -0
  53. structured_eval/metrics/object_type_validity.py +34 -0
  54. structured_eval/metrics/overall_leaf_score.py +32 -0
  55. structured_eval/metrics/presence.py +22 -0
  56. structured_eval/metrics/regex_match.py +51 -0
  57. structured_eval/metrics/rule_pass_rate/__init__.py +5 -0
  58. structured_eval/metrics/rule_pass_rate/dsl.py +224 -0
  59. structured_eval/metrics/rule_pass_rate/engine.py +24 -0
  60. structured_eval/metrics/rule_pass_rate/metric.py +33 -0
  61. structured_eval/metrics/schema_validity/__init__.py +7 -0
  62. structured_eval/metrics/schema_validity/metric.py +35 -0
  63. structured_eval/metrics/schema_validity/validator.py +119 -0
  64. structured_eval/metrics/structural_similarity.py +40 -0
  65. structured_eval/metrics/token_f1.py +44 -0
  66. structured_eval/metrics/type_match.py +35 -0
  67. structured_eval/metrics/utils/__init__.py +10 -0
  68. structured_eval/metrics/utils/array.py +31 -0
  69. structured_eval/metrics/utils/calculate.py +72 -0
  70. structured_eval/metrics/utils/number.py +46 -0
  71. structured_eval/metrics/utils/object_utils.py +87 -0
  72. structured_eval/models/__init__.py +72 -0
  73. structured_eval/models/config.py +124 -0
  74. structured_eval/models/context.py +25 -0
  75. structured_eval/models/metric_result.py +121 -0
  76. structured_eval/models/nodes/__init__.py +13 -0
  77. structured_eval/models/nodes/array_node.py +32 -0
  78. structured_eval/models/nodes/base.py +113 -0
  79. structured_eval/models/nodes/object_node.py +19 -0
  80. structured_eval/models/nodes/scalar.py +14 -0
  81. structured_eval/models/result.py +361 -0
  82. structured_eval/models/sample.py +19 -0
  83. structured_eval/py.typed +0 -0
  84. structured_eval/reporting/__init__.py +5 -0
  85. structured_eval/reporting/console.py +194 -0
  86. structured_eval/utils/__init__.py +16 -0
  87. structured_eval/utils/flatten.py +66 -0
  88. structured_eval/utils/paths.py +58 -0
  89. structured_eval/utils/structured_diff.py +159 -0
  90. structured_eval-0.1.0.dist-info/METADATA +322 -0
  91. structured_eval-0.1.0.dist-info/RECORD +94 -0
  92. structured_eval-0.1.0.dist-info/WHEEL +5 -0
  93. structured_eval-0.1.0.dist-info/licenses/LICENSE +201 -0
  94. structured_eval-0.1.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,144 @@
1
+ from __future__ import annotations
2
+
3
+ from abc import ABC, abstractmethod
4
+ from typing import Any
5
+
6
+ from structured_eval.models.metric_result import MetricResult
7
+ from structured_eval.models.nodes.array_node import ArrayNode
8
+ from structured_eval.models.nodes.base import EvalNode
9
+ from structured_eval.models.nodes.object_node import ObjectNode
10
+ from structured_eval.models.nodes.scalar import ScalarNode
11
+
12
# The shapes a metric's ``compute`` is allowed to return; ``MetricRunner._apply``
# turns whichever one it receives into a ``MetricResult``.
#
# * a bare score, or a dict of named sub-scores;
# * either of those paired (as a 2-tuple) with a structured ``extra`` dict;
# * an already-built ``MetricResult``;
# * ``None``.
_ScoreValue = float | dict[str, float]
MetricOutput = _ScoreValue | tuple[_ScoreValue, dict[str, Any]] | MetricResult | None
22
+
23
# Name → metric class. Filled in automatically as BaseMetric subclasses are
# declared; used by EvalConfig.from_yaml() to resolve string names (Stage 10).
_METRIC_REGISTRY: dict[str, type] = {}


class BaseMetric(ABC):  # noqa: B024 — registry root; subclasses define the interface
    """Registry root for every metric — no evaluation interface of its own.

    ``name`` is the key under which a scalar result lands in ``report.metrics``
    and ``FieldScore.metrics``. A metric that returns a ``dict`` instead writes
    each of its keys directly (the ``name`` is then only a registry handle).

    Declaring a subclass that sets a non-empty ``name`` *in its own class body*
    registers it automatically. A subclass that merely inherits ``name`` from a
    registered parent is not registered, so it cannot silently replace the
    parent's registry entry.
    """

    name: str = ""

    def __init_subclass__(cls, **kwargs: Any) -> None:
        """Register ``cls`` under its own ``name``, if it declares one."""
        super().__init_subclass__(**kwargs)
        # Read the class's own namespace rather than ``getattr``: an inherited
        # ``name`` would otherwise re-register the subclass under the parent's
        # key and overwrite the parent in the registry.
        if own_name := cls.__dict__.get("name"):
            _METRIC_REGISTRY[own_name] = cls
43
+
44
+
45
+ class Metric[NodeT: EvalNode](BaseMetric):
46
+ """The unified metric interface: ``compute(node)`` + ``score(actual, expected)``.
47
+
48
+ Every concrete metric is a ``Metric`` and therefore *has* a ``score`` — a
49
+ pure value-level comparison ``(actual, expected) -> float | dict`` reused by
50
+ array alignment. ``compute(node)`` is the node-level entry point; by default
51
+ it delegates to ``score`` on the node's values, so a leaf comparison need
52
+ only implement ``score``. Aggregating metrics override ``compute`` and leave
53
+ ``score`` at its default (callers that require a scalar verdict check the
54
+ result type where it matters). The type parameter ``NodeT`` pins the node
55
+ type a subtype operates on (``ScalarNode`` for fields, ``ObjectNode`` …).
56
+ """
57
+
58
+ def compute(self, node: NodeT) -> MetricOutput:
59
+ return self.score(node.actual, node.expected)
60
+
61
+ def score(self, actual: Any, expected: Any) -> float | dict[str, float]:
62
+ raise NotImplementedError
63
+
64
+
65
class FieldMetric(Metric[ScalarNode]):
    """Leaf-level comparison, pinned to ``ScalarNode``.

    A typical subclass supplies only ``score(actual, expected)`` and inherits
    ``compute`` from ``Metric``. A metric that needs the node itself rather
    than just its two values (``Presence`` is the documented example)
    overrides ``compute`` instead. The class doubles as the marker the engine
    dispatches on for scalar nodes.
    """
72
+
73
+
74
class ObjectMetric(Metric[ObjectNode]):
    """Metric evaluated on every ObjectNode — the root as well as nested ones."""

    @abstractmethod
    def compute(self, node: ObjectNode) -> MetricOutput:
        """Grade one object node; concrete subclasses must implement this."""
79
+
80
+
81
class ArrayMetric(Metric[ArrayNode]):
    """Metric evaluated on every ArrayNode."""

    @abstractmethod
    def compute(self, node: ArrayNode) -> MetricOutput:
        """Grade one array node; concrete subclasses must implement this."""
86
+
87
+
88
class RootMetric(Metric[EvalNode]):
    """Metric admitted only at the root (path == "$"); the node may be any EvalNode."""

    @abstractmethod
    def compute(self, node: EvalNode) -> MetricOutput:
        """Grade the root node; concrete subclasses must implement this."""
93
+
94
+
95
class AnyNodeMetric(Metric[EvalNode]):
    """Node-agnostic metric: one ``compute`` that runs on *every* kind of node.

    How it differs from its neighbours in the hierarchy:

    * the typed metrics (``FieldMetric`` / ``ObjectMetric`` / ``ArrayMetric``)
      are each pinned to a single node type — this one is not;
    * ``GenericMetric`` dispatches to a separate method per node kind — this
      one performs the same computation whatever the kind;
    * ``RootMetric`` is likewise a ``Metric[EvalNode]`` but is admitted only
      at the root — this one is admitted everywhere.

    ``MeanScore``, the default representative, belongs to this branch. A custom
    uniform metric can be cascaded through ``config.metrics`` or selected as a
    ``key_metric``.
    """

    @abstractmethod
    def compute(self, node: EvalNode) -> MetricOutput:
        """Grade ``node``, whatever its kind; subclasses must implement this."""
110
+
111
+
112
class GenericMetric(BaseMetric):
    """Metric covering several node types, so it has no single ``compute``.

    Subclasses implement whichever per-kind hooks they support:

    * node mode — ``compute_scalar`` / ``compute_object`` / ``compute_array``;
    * value mode (optional) — ``score_scalar`` / ``score_object`` /
      ``score_array``.

    ``MetricInvoker`` picks the hook by kind, and ``TreeBuilder`` admits the
    metric onto a node only when the matching ``compute_<kind>`` exists.
    (Replaces the former ``NodeMetric``.)
    """
122
+
123
+
124
def get_metric_class(name: str) -> type:
    """Look up the metric class registered under ``name`` (e.g. ``"object_f1"``).

    Raises:
        KeyError: if no metric is registered under ``name``; the message lists
            the known names in sorted order.
    """
    # Registry values are always classes, so ``None`` can only mean "absent".
    metric_cls = _METRIC_REGISTRY.get(name)
    if metric_cls is None:
        raise KeyError(f"Unknown metric: {name!r}. Known: {sorted(_METRIC_REGISTRY)}")
    return metric_cls
129
+
130
+
131
def resolve_metric(spec: str | BaseMetric) -> BaseMetric:
    """Coerce a metric spec to a ``BaseMetric`` instance.

    Accepts an instance as-is, or a registered name string, which is looked up
    with ``get_metric_class`` and instantiated with no arguments. The single
    resolver shared by the engine, array alignment, and the match-criterion
    helper. ``None`` is *not* handled here — callers supply their own default.
    Score-needing call sites narrow the result to ``Metric``.

    Raises:
        KeyError: if ``spec`` is a string that names no registered metric.
        TypeError: if ``spec`` is neither a string nor a ``BaseMetric``.
    """
    if isinstance(spec, str):
        instance = get_metric_class(spec)()
        # The registry is only populated from BaseMetric subclasses, so this
        # assert is type narrowing, not input validation.
        assert isinstance(instance, BaseMetric)
        return instance
    # Validate caller input with a real exception: an ``assert`` is stripped
    # under ``python -O`` and would let e.g. ``None`` through unchecked.
    if not isinstance(spec, BaseMetric):
        raise TypeError(
            "metric spec must be a registered name or a BaseMetric instance, "
            f"got {type(spec).__name__}"
        )
    return spec
@@ -0,0 +1,50 @@
1
+ from __future__ import annotations
2
+
3
+ import re
4
+ from collections import Counter
5
+ from typing import Any
6
+
7
+ from structured_eval.metrics.base import FieldMetric
8
+
9
+ _NON_WORD = re.compile(r"[^\w\s]")
10
+
11
+
12
def _characters(value: Any) -> list[str]:
    """Return the characters of ``value`` once lowercased and stripped.

    ``value`` is converted with ``str`` and lowercased, everything matched by
    ``_NON_WORD`` (neither a word character nor whitespace) is removed, and
    finally all whitespace is dropped. What remains is returned one character
    per list item.
    """
    lowered = str(value).lower()
    without_punctuation = _NON_WORD.sub("", lowered)
    return [char for char in without_punctuation if not char.isspace()]
17
+
18
+
19
class CharacterF1(FieldMetric):
    """F1 over the characters two short free-text values have in common.

    Both sides are reduced to character lists by ``_characters`` and matched as
    **multisets** (``Counter``): a repeated character counts only as often as
    it occurs on both sides. Precision is the shared count over the number of
    actual characters, recall the shared count over the number of expected
    characters, and the score is their harmonic mean.

    Strings only — if either side is not a ``str`` the score is ``0.0``, with
    no coercion. Two values that both reduce to nothing score ``1.0``; exactly
    one empty side scores ``0.0``.
    """

    name = "character_f1"

    def score(self, actual: Any, expected: Any) -> float:
        if not isinstance(actual, str) or not isinstance(expected, str):
            return 0.0

        actual_chars = _characters(actual)
        expected_chars = _characters(expected)

        # Nothing on the actual side: a match only if nothing was expected.
        if not actual_chars:
            return 0.0 if expected_chars else 1.0
        if not expected_chars:
            return 0.0

        overlap = Counter(actual_chars) & Counter(expected_chars)
        shared = sum(overlap.values())
        if shared == 0:
            return 0.0

        precision = shared / len(actual_chars)
        recall = shared / len(expected_chars)
        return 2 * precision * recall / (precision + recall)
@@ -0,0 +1,46 @@
1
+ from __future__ import annotations
2
+
3
+ from typing import TYPE_CHECKING
4
+
5
+ from structured_eval.metrics.base import AnyNodeMetric
6
+
7
+ if TYPE_CHECKING:
8
+ from structured_eval.models.nodes.base import EvalNode
9
+
10
+
11
class CompositeScore(AnyNodeMetric):
    """Weighted blend of metrics that were already computed on the same node.

    Built from ``weights={metric_name: weight}``. The weights are normalized
    once, at construction, so that they sum to 1.0; the score is then::

        score = Σ wᵢ · metric_resultsᵢ

    The metrics being blended have to be in ``node.metric_results`` by the
    time this one runs, so list them in the node's ``metrics`` (or as cascaded
    ``config.metrics``) next to ``CompositeScore``. It is best used as the
    node's ``key_metric``, which the engine runs **last**, after every other
    metric on the node.

    Only metrics named in ``weights`` take part: other metrics on the node are
    ignored, and a named metric missing from the node adds ``0``. The blend is
    clamped to ``[0, 1]`` (each input is expected to lie in ``[0, 1]``).
    """

    name = "composite_score"

    def __init__(self, weights: dict[str, float]) -> None:
        if not weights:
            raise ValueError("CompositeScore requires at least one metric weight")
        weight_sum = sum(weights.values())
        if weight_sum <= 0:
            raise ValueError("Sum of weights must be > 0")
        normalized: dict[str, float] = {}
        for metric_name, raw_weight in weights.items():
            normalized[metric_name] = raw_weight / weight_sum
        self.weights: dict[str, float] = normalized

    def compute(self, node: EvalNode) -> float:
        blended = 0
        for metric_name, weight in self.weights.items():
            # A metric that is not on the node simply adds nothing.
            if metric_name in node.metric_results:
                blended += weight * float(node.metric_results[metric_name])
        return min(1.0, max(0.0, blended))
@@ -0,0 +1,29 @@
1
+ from __future__ import annotations
2
+
3
+ from typing import TYPE_CHECKING
4
+
5
+ from structured_eval.metrics.base import RootMetric
6
+
7
+ if TYPE_CHECKING:
8
+ from structured_eval.models.nodes.base import EvalNode
9
+
10
+
11
class CoverageLeafScore(RootMetric):
    """Share of the expected leaf fields that actually carry a value.

    A completeness measure over the whole document that ignores whether the
    values are correct. Only leaves whose ``expected`` is non-null are counted,
    and such a leaf is covered when its ``actual`` is non-null too. If nothing
    is expected the score is vacuously 1.0. (Array elements missed during
    alignment have no leaf node and are covered by the array metrics instead.)
    """

    name = "coverage_leaf_score"

    def compute(self, node: EvalNode) -> float:
        wanted = [leaf for leaf in node.leaves() if leaf.expected is not None]
        if not wanted:
            return 1.0
        present = sum(1 for leaf in wanted if leaf.actual is not None)
        return present / len(wanted)
@@ -0,0 +1,63 @@
1
+ from __future__ import annotations
2
+
3
+ from datetime import date, datetime
4
+ from typing import Any
5
+
6
+ from pydantic import TypeAdapter
7
+
8
+ from structured_eval.metrics.base import FieldMetric
9
+
10
+
11
# Built once at import time. Constructing a ``TypeAdapter`` builds a validator
# for the type, which is wasted work when repeated for every scored value.
_DATE_ADAPTER = TypeAdapter(date)


def _to_date(value: Any) -> date | None:
    """Best-effort coercion of ``value`` to a ``date`` via pydantic.

    Returns whatever pydantic's ``date`` validation yields for ``value``, or
    ``None`` if validation fails for any reason.
    """
    try:
        return _DATE_ADAPTER.validate_python(value)
    except Exception:
        # Deliberately broad: an unreadable value is "not a date", never an
        # error, so the calling metric can score it as 0.0.
        return None
17
+
18
+
19
class DateDistanceScore(FieldMetric):
    """Linear similarity between two date or datetime values.

    The score is::

        max(0, 1 - days_difference / max_days)

    which gives:

    - ``1.0`` when both sides fall on the same day;
    - a linear drop as the gap in days grows;
    - ``0.0`` once the gap reaches or exceeds ``max_days``.

    ``date`` and ``datetime`` instances are used directly; any other value
    (e.g. the ISO-8601 string ``"2026-06-29"``) is passed through ``_to_date``
    for pydantic coercion. A ``datetime`` is reduced to its calendar date, so
    time-of-day is ignored.

    If either side cannot be read as a date — ``None``, an unparseable string,
    or any non-date type — the score is ``0.0``.
    """

    name = "date_distance_score"

    def __init__(self, max_days: int = 30) -> None:
        if max_days <= 0:
            raise ValueError("max_days must be greater than 0")
        self.max_days = max_days

    def score(self, actual: Any, expected: Any) -> float:
        calendar_days = []
        for raw in (actual, expected):
            # ``datetime`` subclasses ``date``, so one isinstance covers both.
            value = raw if isinstance(raw, date) else _to_date(raw)
            if not isinstance(value, date):
                return 0.0
            calendar_days.append(value.date() if isinstance(value, datetime) else value)

        actual_day, expected_day = calendar_days
        gap = abs((actual_day - expected_day).days)
        return max(0.0, 1.0 - gap / self.max_days)
@@ -0,0 +1,21 @@
1
+ from __future__ import annotations
2
+
3
+ from typing import Any
4
+
5
+ from structured_eval.metrics.base import FieldMetric
6
+
7
+
8
class ExactMatch(FieldMetric):
    """Strict equality: 1.0 when ``actual == expected``, otherwise 0.0.

    It is the default scalar comparison and also the default key comparison
    for ``by_key`` array alignment. Whole objects and arrays are *not* graded
    with it: object metrics read each child's representative and the array
    alignment defaults are type-aware, so the only way a dict or list reaches
    ExactMatch is through the value-level ``score`` path.
    """

    name = "exact_match"

    def score(self, actual: Any, expected: Any) -> float:
        if actual == expected:
            return 1.0
        return 0.0
@@ -0,0 +1,47 @@
1
+ from __future__ import annotations
2
+
3
+ import math
4
+ from typing import Any
5
+
6
+ from structured_eval.metrics.base import FieldMetric
7
+ from structured_eval.metrics.utils.number import parse_number
8
+
9
+
10
class ExponentialNumericScore(FieldMetric):
    """Similarity for numeric fields that decays exponentially with the error.

    The score is::

        exp(-abs(actual - expected) / scale)

    so that:

    - an exact match scores ``1.0``;
    - the score falls off smoothly as the absolute error grows;
    - the result lies in ``(0.0, 1.0]``.

    ``scale`` sets how fast the score drops; a larger ``scale`` tolerates
    larger differences. In contrast to the ratio-based
    :class:`NumericCloseness`, the decay acts on the **absolute** error and is
    therefore scale-aware — choose ``scale`` in the field's own units.

    Both sides are read with ``parse_number``, the lenient parser shared with
    :class:`Numeric` / :class:`NumericCloseness`, so numeric strings are graded
    as well. The metric applies **only to numbers**: if either side is not
    numeric (``None``, a non-numeric string, or a ``bool`` — ``True`` is not
    ``1``) the score is ``0.0``.
    """

    name = "exponential_numeric_score"

    def __init__(self, scale: float = 1.0) -> None:
        if scale <= 0:
            raise ValueError("scale must be greater than 0")
        self.scale = scale

    def score(self, actual: Any, expected: Any) -> float:
        parsed_actual = parse_number(actual)
        parsed_expected = parse_number(expected)
        if parsed_actual is None or parsed_expected is None:
            return 0.0
        absolute_error = abs(parsed_actual - parsed_expected)
        return math.exp(-absolute_error / self.scale)
@@ -0,0 +1,38 @@
1
+ from __future__ import annotations
2
+
3
+ from typing import TYPE_CHECKING
4
+
5
+ from structured_eval.metrics.base import FieldMetric
6
+
7
+ if TYPE_CHECKING:
8
+ from structured_eval.models.nodes.scalar import ScalarNode
9
+
10
+
11
class FieldFaithfulness(FieldMetric):
    """Per-leaf grounding check against the sample's ``source`` (L1 substring).

    A scalar leaf scores ``1.0`` when the string form of its ``actual`` value
    occurs verbatim — compared case-insensitively — inside ``source``, and
    ``0.0`` otherwise (a hallucination). A leaf whose ``actual`` is ``None``
    yields ``None`` rather than a score.

    Cascade it with ``EvalConfig(metrics=[FieldFaithfulness()])``. Aggregation
    is then the usual leaf roll-up (``MeanScore`` / ``OverallLeafScore``), and
    the hallucinated fields are the leaves that scored ``0.0``
    (``report.metrics["field_faithfulness"].by_path``).

    A grounding ``source`` is mandatory: faithfulness is undefined without
    one, so a missing ``source`` is reported as a configuration error
    (``ValueError``) instead of the metric being silently skipped.
    """

    name = "field_faithfulness"

    def compute(self, node: ScalarNode) -> float | None:
        grounding = node.context.source
        if grounding is None:
            raise ValueError(
                "Faithfulness requires a grounding `source`; "
                "pass source=... to evaluate()"
            )
        value = node.actual
        if value is None:
            return None
        needle = str(value).lower()
        haystack = grounding.lower()
        return 1.0 if needle in haystack else 0.0
@@ -0,0 +1,64 @@
1
+ from __future__ import annotations
2
+
3
+ from enum import StrEnum
4
+ from typing import Any
5
+
6
+ from structured_eval.metrics.base import FieldMetric
7
+
8
+
9
+ class FuzzyMethod(StrEnum):
10
+ """RapidFuzz scorer used by :class:`Fuzzy`."""
11
+
12
+ RATIO = "ratio" # plain normalized Levenshtein ratio
13
+ PARTIAL_RATIO = "partial_ratio" # best matching substring
14
+ TOKEN_SORT_RATIO = "token_sort_ratio" # order-insensitive (default)
15
+ TOKEN_SET_RATIO = "token_set_ratio" # set-based, ignores duplicate tokens
16
+
17
+
18
class Fuzzy(FieldMetric):
    """Fuzzy string similarity backed by RapidFuzz (an optional dependency).

    ``method`` chooses the RapidFuzz scorer:

    * ``ratio`` — plain normalized Levenshtein ratio;
    * ``partial_ratio`` — best matching substring;
    * ``token_sort_ratio`` (default) — order-insensitive, sorts tokens;
    * ``token_set_ratio`` — set-based, ignores duplicate/extra tokens.

    With ``normalize`` on, both strings are stripped of surrounding whitespace
    and lowercased before they are compared. The scorer's 0-100 result is
    divided by 100.

    Strings only: if either side is not a ``str`` the score is 0.0 (no
    coercion), consistent with the other text metrics. RapidFuzz is imported
    on first use; if it is missing, an ``ImportError`` with install
    instructions is raised.
    """

    name = "fuzzy"

    def __init__(
        self,
        method: FuzzyMethod = FuzzyMethod.TOKEN_SORT_RATIO,
        normalize: bool = True,
    ):
        # Accepts either a member or its plain string value.
        self.method = FuzzyMethod(method)
        self.normalize = normalize

    def score(self, actual: Any, expected: Any) -> float:
        if not isinstance(actual, str) or not isinstance(expected, str):
            return 0.0
        try:
            from rapidfuzz import fuzz
        except ImportError as exc:  # pragma: no cover
            raise ImportError(
                "rapidfuzz is required for the 'fuzzy' metric. "
                "Install it with: pip install 'structured-eval[fuzzy]'"
            ) from exc

        # StrEnum members hash and compare like their string values, so this
        # table is interchangeable with one keyed by the plain method names.
        scorers = {
            FuzzyMethod.RATIO: fuzz.ratio,
            FuzzyMethod.PARTIAL_RATIO: fuzz.partial_ratio,
            FuzzyMethod.TOKEN_SORT_RATIO: fuzz.token_sort_ratio,
            FuzzyMethod.TOKEN_SET_RATIO: fuzz.token_set_ratio,
        }
        compare = scorers[self.method]

        left, right = actual, expected
        if self.normalize:
            left = left.strip().lower()
            right = right.strip().lower()
        return float(compare(left, right)) / 100.0
@@ -0,0 +1,90 @@
1
+ """The single way to run a metric, whatever input is available.
2
+
3
+ Every metric is invoked through ``MetricInvoker`` — never by calling ``compute``
4
+ / ``compute_<kind>`` / ``score`` directly. Two input modes:
5
+
6
+ * ``on_node`` — a node is available: grade it. A ``Metric`` uses ``compute``; a
7
+ ``GenericMetric`` dispatches to the ``compute_<kind>`` for the node's type.
8
+ * ``on_values`` — only raw ``actual`` / ``expected`` (array alignment, before any
9
+ node exists): compare them. A ``Metric`` uses ``score``; a ``GenericMetric``
10
+ dispatches to the ``score_<kind>`` for the kind inferred from the value's shape.
11
+
12
+ Each mode has a ``scalar_*`` variant that narrows the result to a single
13
+ ``float`` (rejecting a dict of sub-scores) — that narrowing is the caller's
14
+ contract, hence its own method.
15
+ """
16
+
17
+ from __future__ import annotations
18
+
19
+ from typing import TYPE_CHECKING, Any
20
+
21
+ from structured_eval.metrics.base import BaseMetric, GenericMetric, Metric, MetricOutput
22
+ from structured_eval.models.nodes.array_node import ArrayNode
23
+ from structured_eval.models.nodes.object_node import ObjectNode
24
+ from structured_eval.models.nodes.scalar import ScalarNode
25
+
26
+ if TYPE_CHECKING:
27
+ from structured_eval.models.nodes.base import EvalNode
28
+
29
# Per-kind hook names of a GenericMetric, keyed by node class — one table per
# input mode. Both are derived from the same (node class, kind suffix) pairs.
_NODE_KINDS: tuple[tuple[type, str], ...] = (
    (ScalarNode, "scalar"),
    (ObjectNode, "object"),
    (ArrayNode, "array"),
)
# Node mode: ``compute_<kind>``.
GENERIC_NODE_METHOD: dict[type, str] = {
    node_cls: f"compute_{kind}" for node_cls, kind in _NODE_KINDS
}
# Value mode: ``score_<kind>``.
GENERIC_SCORE_METHOD: dict[type, str] = {
    node_cls: f"score_{kind}" for node_cls, kind in _NODE_KINDS
}
40
+
41
+
42
def _kind_of(actual: Any, expected: Any) -> type:
    """Return the node class a raw value pair would build (mirrors ``TreeBuilder``).

    The kind is read from ``expected``, falling back to ``actual`` when
    ``expected`` is ``None``: a ``dict`` maps to ``ObjectNode``, a ``list`` to
    ``ArrayNode``, and anything else to ``ScalarNode``.
    """
    reference = actual if expected is None else expected
    for container_type, node_cls in ((dict, ObjectNode), (list, ArrayNode)):
        if isinstance(reference, container_type):
            return node_cls
    return ScalarNode
50
+
51
+
52
class MetricInvoker:
    """Runs ``self.metric`` in either input mode.

    * ``on_node`` grades a node: a ``GenericMetric`` is dispatched to the
      ``compute_<kind>`` hook for the node's class, any other metric goes
      through ``compute``.
    * ``on_values`` compares raw ``actual`` / ``expected``: a ``GenericMetric``
      is dispatched to the ``score_<kind>`` hook for the kind inferred by
      ``_kind_of``, any other metric goes through ``score``.

    The ``scalar_*`` variants additionally require the result to be a single
    number and return it as a ``float``.
    """

    def __init__(self, metric: BaseMetric):
        self.metric = metric

    def on_node(self, node: EvalNode) -> MetricOutput:
        """Grade ``node``; ``None`` if a generic metric has no hook for its kind."""
        metric = self.metric
        if isinstance(metric, GenericMetric):
            return self._dispatch_generic(GENERIC_NODE_METHOD.get(type(node)), node)
        assert isinstance(metric, Metric)  # every non-generic metric has compute(node)
        return metric.compute(node)

    def on_values(self, actual: Any, expected: Any) -> MetricOutput:
        """Compare raw values; ``None`` if a generic metric has no matching hook."""
        metric = self.metric
        if isinstance(metric, GenericMetric):
            method = GENERIC_SCORE_METHOD.get(_kind_of(actual, expected))
            return self._dispatch_generic(method, actual, expected)
        assert isinstance(metric, Metric)  # value comparison needs score()
        return metric.score(actual, expected)

    def scalar_on_node(self, node: EvalNode) -> float:
        """Like ``on_node``, but the result must be a single number."""
        return self._scalar(self.on_node(node), node.path)

    def scalar_on_values(self, actual: Any, expected: Any) -> float:
        """Like ``on_values``, but the result must be a single number."""
        return self._scalar(self.on_values(actual, expected), "<values>")

    def _dispatch_generic(self, method: str | None, *args: Any) -> MetricOutput:
        """Call the named hook on the metric, or return ``None`` if it has none."""
        if method is None or not hasattr(self.metric, method):
            return None
        result: MetricOutput = getattr(self.metric, method)(*args)
        return result

    def _scalar(self, result: Any, where: str) -> float:
        """Narrow ``result`` to a ``float``.

        Raises:
            TypeError: if ``result`` is not an ``int`` or ``float`` (e.g. a
                dict of sub-scores or ``None``). ``where`` identifies the
                location in the message.
        """
        # Raised explicitly rather than asserted: the scalar contract must hold
        # under ``python -O`` too, where ``assert`` statements are removed.
        if not isinstance(result, (int, float)):
            raise TypeError(
                f"metric {self.metric.name!r} must yield a scalar at {where}, "
                f"got {type(result).__name__}"
            )
        return float(result)
@@ -0,0 +1,16 @@
1
+ from __future__ import annotations
2
+
3
+ from structured_eval.metrics.fuzzy import Fuzzy, FuzzyMethod
4
+
5
+
6
class Levenshtein(Fuzzy):
    """Normalized Levenshtein ratio, provided as a named shortcut for ``Fuzzy``.

    The class only changes the default ``method`` to ``FuzzyMethod.RATIO`` —
    RapidFuzz's ``ratio`` *is* the normalized Levenshtein similarity — and
    registers under its own name for discoverability. All scoring is inherited
    from ``Fuzzy``.
    """

    name = "levenshtein"

    def __init__(
        self,
        method: FuzzyMethod = FuzzyMethod.RATIO,
        normalize: bool = True,
    ):
        super().__init__(method=method, normalize=normalize)
@@ -0,0 +1,31 @@
1
+ from __future__ import annotations
2
+
3
+ from typing import TYPE_CHECKING
4
+
5
+ from structured_eval.metrics.base import AnyNodeMetric
6
+
7
+ if TYPE_CHECKING:
8
+ from structured_eval.models.nodes.base import EvalNode
9
+
10
+
11
class MeanScore(AnyNodeMetric):
    """A node's representative score: the plain average of its other metrics.

    This is the default ``key_metric`` of every node — the single number that
    bubbles up into a parent's aggregation and, at the root, becomes
    ``report.score``. It runs **last**, so the node's other metrics are already
    in ``metric_results``; it averages those values, leaving out its own
    entry, and does not recurse into children. Aggregating across children is
    the job of the node's *own* metrics (``ObjectAccuracy`` / ``ObjectF1`` /
    ``ArrayAccuracy``), one of which the engine defaults onto every node.

    If the node has no other computed metric (for instance a leaf whose only
    metric opted out by returning ``None``) the score is ``0.0``, so every
    node always has a representative.
    """

    name = "mean_score"

    def compute(self, node: EvalNode) -> float:
        others = [
            float(value)
            for metric_name, value in node.metric_results.items()
            if metric_name != self.name
        ]
        if not others:
            return 0.0
        return sum(others) / len(others)