structured-eval 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- structured_eval/__init__.py +27 -0
- structured_eval/alignment/__init__.py +15 -0
- structured_eval/alignment/base.py +40 -0
- structured_eval/alignment/by_index.py +24 -0
- structured_eval/alignment/by_key.py +73 -0
- structured_eval/alignment/factory.py +28 -0
- structured_eval/alignment/hungarian.py +156 -0
- structured_eval/api.py +79 -0
- structured_eval/engine/__init__.py +15 -0
- structured_eval/engine/aggregator.py +96 -0
- structured_eval/engine/evaluator.py +72 -0
- structured_eval/engine/metric_runner.py +69 -0
- structured_eval/engine/parser.py +42 -0
- structured_eval/engine/report_builder.py +68 -0
- structured_eval/engine/tree_builder.py +319 -0
- structured_eval/formats/__init__.py +5 -0
- structured_eval/formats/base.py +19 -0
- structured_eval/formats/json_parser.py +44 -0
- structured_eval/formats/yaml_parser.py +24 -0
- structured_eval/integrations/__init__.py +11 -0
- structured_eval/integrations/_adapter.py +47 -0
- structured_eval/integrations/deepeval.py +74 -0
- structured_eval/integrations/langsmith.py +90 -0
- structured_eval/metrics/__init__.py +101 -0
- structured_eval/metrics/array_accuracy.py +28 -0
- structured_eval/metrics/array_cardinality.py +27 -0
- structured_eval/metrics/array_exact_match.py +48 -0
- structured_eval/metrics/array_f1.py +34 -0
- structured_eval/metrics/array_jaccard_similarity.py +60 -0
- structured_eval/metrics/array_precision.py +34 -0
- structured_eval/metrics/array_prf1.py +40 -0
- structured_eval/metrics/array_recall.py +33 -0
- structured_eval/metrics/base.py +144 -0
- structured_eval/metrics/character_f1.py +50 -0
- structured_eval/metrics/composite_score.py +46 -0
- structured_eval/metrics/coverage_leaf_score.py +29 -0
- structured_eval/metrics/date_distance_score.py +63 -0
- structured_eval/metrics/exact.py +21 -0
- structured_eval/metrics/exponential_numeric_score.py +47 -0
- structured_eval/metrics/field_faithfulness.py +38 -0
- structured_eval/metrics/fuzzy.py +64 -0
- structured_eval/metrics/invoker.py +90 -0
- structured_eval/metrics/levenshtein.py +16 -0
- structured_eval/metrics/mean_score.py +31 -0
- structured_eval/metrics/numeric.py +83 -0
- structured_eval/metrics/numeric_closeness.py +35 -0
- structured_eval/metrics/object_accuracy.py +47 -0
- structured_eval/metrics/object_exact_match.py +41 -0
- structured_eval/metrics/object_f1.py +47 -0
- structured_eval/metrics/object_precision.py +49 -0
- structured_eval/metrics/object_prf1.py +51 -0
- structured_eval/metrics/object_recall.py +44 -0
- structured_eval/metrics/object_type_validity.py +34 -0
- structured_eval/metrics/overall_leaf_score.py +32 -0
- structured_eval/metrics/presence.py +22 -0
- structured_eval/metrics/regex_match.py +51 -0
- structured_eval/metrics/rule_pass_rate/__init__.py +5 -0
- structured_eval/metrics/rule_pass_rate/dsl.py +224 -0
- structured_eval/metrics/rule_pass_rate/engine.py +24 -0
- structured_eval/metrics/rule_pass_rate/metric.py +33 -0
- structured_eval/metrics/schema_validity/__init__.py +7 -0
- structured_eval/metrics/schema_validity/metric.py +35 -0
- structured_eval/metrics/schema_validity/validator.py +119 -0
- structured_eval/metrics/structural_similarity.py +40 -0
- structured_eval/metrics/token_f1.py +44 -0
- structured_eval/metrics/type_match.py +35 -0
- structured_eval/metrics/utils/__init__.py +10 -0
- structured_eval/metrics/utils/array.py +31 -0
- structured_eval/metrics/utils/calculate.py +72 -0
- structured_eval/metrics/utils/number.py +46 -0
- structured_eval/metrics/utils/object_utils.py +87 -0
- structured_eval/models/__init__.py +72 -0
- structured_eval/models/config.py +124 -0
- structured_eval/models/context.py +25 -0
- structured_eval/models/metric_result.py +121 -0
- structured_eval/models/nodes/__init__.py +13 -0
- structured_eval/models/nodes/array_node.py +32 -0
- structured_eval/models/nodes/base.py +113 -0
- structured_eval/models/nodes/object_node.py +19 -0
- structured_eval/models/nodes/scalar.py +14 -0
- structured_eval/models/result.py +361 -0
- structured_eval/models/sample.py +19 -0
- structured_eval/py.typed +0 -0
- structured_eval/reporting/__init__.py +5 -0
- structured_eval/reporting/console.py +194 -0
- structured_eval/utils/__init__.py +16 -0
- structured_eval/utils/flatten.py +66 -0
- structured_eval/utils/paths.py +58 -0
- structured_eval/utils/structured_diff.py +159 -0
- structured_eval-0.1.0.dist-info/METADATA +322 -0
- structured_eval-0.1.0.dist-info/RECORD +94 -0
- structured_eval-0.1.0.dist-info/WHEEL +5 -0
- structured_eval-0.1.0.dist-info/licenses/LICENSE +201 -0
- structured_eval-0.1.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,83 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from enum import StrEnum
|
|
4
|
+
from typing import Any
|
|
5
|
+
|
|
6
|
+
from structured_eval.metrics.base import FieldMetric
|
|
7
|
+
from structured_eval.metrics.utils.number import parse_number
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
class NumericMode(StrEnum):
|
|
11
|
+
"""Tolerance band for the single-band form of :class:`Numeric`."""
|
|
12
|
+
|
|
13
|
+
RELATIVE = "relative" # |a - e| / |e|
|
|
14
|
+
ABSOLUTE = "absolute" # |a - e|
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
class Numeric(FieldMetric):
    """Hard numeric verdict: 1.0 when within the tolerance band, else 0.0.

    Both sides are run through the shared lenient parser ``parse_number``:
    currency symbols and thousands separators are stripped
    (``"$1,234.50"`` → ``1234.50``), accounting notation is honored
    (``"(123)"`` → ``-123``) and scientific notation is supported
    (``"1e3"`` → ``1000``). A percent sign is stripped but **not**
    interpreted (``"50%"`` → ``50``, not ``0.5``). US formatting is assumed
    (``,`` for thousands, ``.`` for decimals); anything that does not parse
    cleanly scores 0.0.

    The tolerance can be configured in two ways:

    * ``tolerance`` + ``mode`` (``"relative"`` | ``"absolute"``) — the
      single-band form. ``relative`` compares ``|a - e| / |e|`` against the
      tolerance, ``absolute`` compares ``|a - e|``. A tolerance of 0 means
      exact numeric equality.
    * ``relative_tolerance`` and/or ``absolute_tolerance`` — explicit bands.
      A value matches when it falls inside *either* band. Supplying either
      one overrides ``tolerance``/``mode`` entirely.
    """

    name = "numeric"

    def __init__(
        self,
        tolerance: float = 0.01,
        mode: NumericMode = NumericMode.RELATIVE,
        relative_tolerance: float | None = None,
        absolute_tolerance: float | None = None,
    ):
        self.tolerance = tolerance
        # Coerce through the enum so string values are accepted (and
        # unknown ones rejected) at construction time.
        self.mode = NumericMode(mode)
        self.relative_tolerance = relative_tolerance
        self.absolute_tolerance = absolute_tolerance

    def score(self, actual: Any, expected: Any) -> float:
        """Return 1.0 if both sides parse and agree within tolerance, else 0.0."""
        a = parse_number(actual)
        e = parse_number(expected)
        if a is not None and e is not None and self._within_tolerance(a, e):
            return 1.0
        return 0.0

    def _within_tolerance(self, a: float, e: float) -> bool:
        """Decide whether parsed ``a`` is acceptably close to parsed ``e``."""
        # Exact equality always passes, whatever the configured bands are.
        if a == e:
            return True

        rel_band = self.relative_tolerance
        abs_band = self.absolute_tolerance

        if rel_band is None and abs_band is None:
            # Single-band form (tolerance + mode).
            if self.mode != NumericMode.RELATIVE:
                deviation = abs(a - e)
            elif e != 0:
                deviation = abs(a - e) / abs(e)
            elif a == 0:
                deviation = 0.0
            else:
                # Relative error against a zero expectation is unbounded.
                deviation = float("inf")
            return deviation <= self.tolerance

        # Explicit bands take precedence; a hit in either band is a match.
        if rel_band is not None:
            if e == 0:
                # No relative error is defined against zero; only 0 matches.
                relative_hit = a == 0
            else:
                relative_hit = abs(a - e) / abs(e) <= rel_band
            if relative_hit:
                return True
        return abs_band is not None and abs(a - e) <= abs_band
|
|
@@ -0,0 +1,35 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from typing import Any
|
|
4
|
+
|
|
5
|
+
from structured_eval.metrics.base import FieldMetric
|
|
6
|
+
from structured_eval.metrics.utils.number import parse_number
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
class NumericCloseness(FieldMetric):
    """Continuous numeric similarity in ``[0, 1]`` (no pass/fail threshold).

    The score is ``1 - |actual - expected| / max(|actual|, |expected|)``,
    i.e. the ratio of the smaller magnitude to the larger one for same-sign
    values (``min/max``). Equal values score 1.0, opposite signs trend toward
    0.0, and ``0/0`` is 1.0.

    Where :class:`Numeric` gives a hard 0/1 verdict against a tolerance, this
    metric is graded, which makes it the default element scorer for numbers
    under the Hungarian array aligner, where a graded cost matrix matters.

    Inputs go through the same shared lenient numeric parser as
    :class:`Numeric`, so numeric strings are graded too. The metric applies
    **only to numbers**: if either side is not numeric (``None``, a
    non-numeric string, or a ``bool`` — ``True`` is not ``1``) the score is
    0.0.
    """

    name = "numeric_closeness"

    def score(self, actual: Any, expected: Any) -> float:
        """Return the graded closeness of ``actual`` to ``expected``."""
        left = parse_number(actual)
        right = parse_number(expected)
        if left is None or right is None:
            return 0.0
        if left == right:
            return 1.0

        scale = max(abs(left), abs(right))
        if not scale:
            # Both magnitudes are zero: nothing to normalise by.
            return 1.0

        closeness = 1.0 - abs(left - right) / scale
        # Clamp at zero with a comparison so that a NaN result also collapses
        # to 0.0 (``nan > 0.0`` is False), matching ``max(0.0, nan)``.
        return closeness if closeness > 0.0 else 0.0
|
|
@@ -0,0 +1,47 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from typing import TYPE_CHECKING, Any
|
|
4
|
+
|
|
5
|
+
from structured_eval.metrics.base import ObjectMetric
|
|
6
|
+
from structured_eval.metrics.utils import calculate as stats
|
|
7
|
+
from structured_eval.metrics.utils import object_utils as obj
|
|
8
|
+
|
|
9
|
+
if TYPE_CHECKING:
|
|
10
|
+
from structured_eval.models.nodes.object_node import ObjectNode
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
class ObjectAccuracy(ObjectMetric):
    """Weighted soft mean of field correctness over the expected fields.

    This is **soft recall**: ``Σ weight·score / (matched_weight +
    missing_weight)``. A matched field contributes its ``representative``
    (for every child kind — nested objects and arrays count through their
    representative, not just scalars) unless ``score_policy`` overrides it.
    Missing expected fields contribute 0.0. **Spurious (extra) fields are not
    penalized**: only the expected side enters the denominator (use
    ``ObjectF1`` for a precision-aware score). An object with no expected
    fields is vacuously 1.0.

    ``weight_mode`` (default ``PROPORTIONAL``) weights the mean by each
    child's configured ``weight``; ``NONE`` gives the plain mean.
    """

    name = "object_accuracy"

    def __init__(
        self,
        score_policy: dict[str, Any] | None = None,
        weight_mode: stats.WeightMode = stats.WeightMode.PROPORTIONAL,
    ):
        self.score_policy = score_policy
        # Coerce so a raw enum value is accepted as well as a member.
        self.weight_mode = stats.WeightMode(weight_mode)

    def compute(self, node: ObjectNode) -> float:
        """Return the weighted score over matched plus missing fields."""
        verdicts = obj.matched_verdicts(
            node, self.score_policy, weight_mode=self.weight_mode
        )
        # Each verdict is a 3-tuple; only its first (score) and last (weight)
        # items are used here.
        matched_weight = sum(weight for _, _, weight in verdicts)
        total_weight = matched_weight + obj.missing_weight(node, self.weight_mode)
        if total_weight == 0:
            # Nothing was expected, so there is nothing to get wrong.
            return 1.0
        earned = sum(weight * score for score, _, weight in verdicts)
        return earned / total_weight
|
|
@@ -0,0 +1,41 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from typing import TYPE_CHECKING, Any
|
|
4
|
+
|
|
5
|
+
from structured_eval.metrics.base import ObjectMetric
|
|
6
|
+
|
|
7
|
+
if TYPE_CHECKING:
|
|
8
|
+
from structured_eval.models.nodes.object_node import ObjectNode
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
class ObjectExactMatch(ObjectMetric):
    """All-or-nothing deep equality for objects: 1.0 if identical, else 0.0.

    The raw ``actual`` / ``expected`` mappings are compared recursively: the
    key sets must agree and every value must be deep-equal, nested dicts and
    lists included. There is no partial credit and no coercion — the object
    is either entirely right or wrong. For field-level partial credit use the
    aggregating ``Object*`` metrics (``ObjectAccuracy`` / ``ObjectF1`` …).
    """

    name = "object_exact_match"

    def compute(self, node: ObjectNode) -> float:
        """Score the node's raw ``actual`` value against its ``expected``."""
        return self.score(node.actual, node.expected)

    def score(self, actual: Any, expected: Any) -> float:
        """Return 1.0 when the two values are strictly deep-equal, else 0.0."""
        if self._object_equal(actual, expected):
            return 1.0
        return 0.0

    def _object_equal(self, a: Any, b: Any) -> bool:
        """Deep strict equality for JSON-like structures."""
        # Exact type identity: an int never equals a float or a bool here.
        if type(a) is not type(b):
            return False
        if isinstance(a, dict):
            return set(a) == set(b) and all(
                self._object_equal(value, b[key]) for key, value in a.items()
            )
        if isinstance(a, list):
            # Lengths are checked first, so plain zip cannot drop elements.
            return len(a) == len(b) and all(
                self._object_equal(left, right) for left, right in zip(a, b)
            )
        return bool(a == b)
|
|
@@ -0,0 +1,47 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from typing import TYPE_CHECKING, Any
|
|
4
|
+
|
|
5
|
+
from structured_eval.metrics.base import ObjectMetric
|
|
6
|
+
from structured_eval.metrics.utils import calculate as stats
|
|
7
|
+
from structured_eval.metrics.utils import object_utils as obj
|
|
8
|
+
|
|
9
|
+
if TYPE_CHECKING:
|
|
10
|
+
from structured_eval.models.nodes.object_node import ObjectNode
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
class ObjectF1(ObjectMetric):
    """F1 (harmonic mean of precision and recall) over an object's fields.

    Slot-filling F1: a matched-and-correct field is a TP, a missing field an
    FN, an extra field an FP. The match criterion and ``mode`` behave as for
    ``ObjectPrecision`` (all child kinds count via their representative).
    """

    name = "object_f1"

    def __init__(
        self,
        score_policy: dict[str, Any] | None = None,
        threshold: float | None = None,
        mode: stats.GradingMode = stats.GradingMode.HARD,
        weight_mode: stats.WeightMode = stats.WeightMode.PROPORTIONAL,
    ):
        self.score_policy = score_policy
        self.threshold = threshold
        # Coerce so raw enum values are accepted as well as members.
        self.mode = stats.GradingMode(mode)
        self.weight_mode = stats.WeightMode(weight_mode)

    def compute(self, node: ObjectNode) -> float:
        """Return the F1 of the node's field-level precision and recall."""
        verdicts = obj.matched_verdicts(
            node, self.score_policy, self.threshold, self.weight_mode
        )
        missing = obj.missing_weight(node, self.weight_mode)
        spurious = obj.spurious_weight(node, self.weight_mode)
        tp, predicted, expected = stats.prf_counts(
            verdicts, missing, spurious, self.mode
        )
        precision = stats.precision(tp, predicted, expected)
        recall = stats.recall(tp, predicted, expected)
        return stats.f1(precision, recall)
|
|
@@ -0,0 +1,49 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from typing import TYPE_CHECKING, Any
|
|
4
|
+
|
|
5
|
+
from structured_eval.metrics.base import ObjectMetric
|
|
6
|
+
from structured_eval.metrics.utils import calculate as stats
|
|
7
|
+
from structured_eval.metrics.utils import object_utils as obj
|
|
8
|
+
|
|
9
|
+
if TYPE_CHECKING:
|
|
10
|
+
from structured_eval.models.nodes.object_node import ObjectNode
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
class ObjectPrecision(ObjectMetric):
    """Slot-filling precision, TP / (TP + FP), over an object's fields.

    A matched field is a TP when its ``representative`` clears its threshold
    (for every child kind — nested objects and arrays count via their
    representative); extra fields are FP. The per-field score is resolved as
    ``score_policy`` → the child's ``key_metric`` → ``ExactMatch``. The
    default is ``mode=HARD`` with the field threshold (``1.0`` unless
    configured), so a field only counts when it is a perfect match;
    ``mode="soft"`` drops the threshold and uses the field score
    fractionally.
    """

    name = "object_precision"

    def __init__(
        self,
        score_policy: dict[str, Any] | None = None,
        threshold: float | None = None,
        mode: stats.GradingMode = stats.GradingMode.HARD,
        weight_mode: stats.WeightMode = stats.WeightMode.PROPORTIONAL,
    ):
        self.score_policy = score_policy
        self.threshold = threshold
        # Coerce so raw enum values are accepted as well as members.
        self.mode = stats.GradingMode(mode)
        self.weight_mode = stats.WeightMode(weight_mode)

    def compute(self, node: ObjectNode) -> float:
        """Return the precision over the node's fields."""
        verdicts = obj.matched_verdicts(
            node, self.score_policy, self.threshold, self.weight_mode
        )
        missing = obj.missing_weight(node, self.weight_mode)
        spurious = obj.spurious_weight(node, self.weight_mode)
        tp, predicted, expected = stats.prf_counts(
            verdicts, missing, spurious, self.mode
        )
        return stats.precision(tp, predicted, expected)
|
|
@@ -0,0 +1,51 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from typing import TYPE_CHECKING, Any
|
|
4
|
+
|
|
5
|
+
from structured_eval.metrics.base import ObjectMetric
|
|
6
|
+
from structured_eval.metrics.utils import calculate as stats
|
|
7
|
+
from structured_eval.metrics.utils import object_utils as obj
|
|
8
|
+
|
|
9
|
+
if TYPE_CHECKING:
|
|
10
|
+
from structured_eval.models.nodes.object_node import ObjectNode
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
class ObjectPRF1(ObjectMetric):
    """Object precision, recall and F1 computed together in one pass.

    ``compute`` returns a dict; the engine writes each key
    (``object_precision``, ``object_recall``, ``object_f1``) into
    ``report.metrics`` directly. The match criterion and ``mode`` behave as
    for ``ObjectPrecision``.
    """

    name = "object_prf1"

    def __init__(
        self,
        score_policy: dict[str, Any] | None = None,
        threshold: float | None = None,
        mode: stats.GradingMode = stats.GradingMode.HARD,
        weight_mode: stats.WeightMode = stats.WeightMode.PROPORTIONAL,
    ):
        self.score_policy = score_policy
        self.threshold = threshold
        # Coerce so raw enum values are accepted as well as members.
        self.mode = stats.GradingMode(mode)
        self.weight_mode = stats.WeightMode(weight_mode)

    def compute(self, node: ObjectNode) -> dict[str, float]:
        """Return precision, recall and F1 keyed by their metric names."""
        verdicts = obj.matched_verdicts(
            node, self.score_policy, self.threshold, self.weight_mode
        )
        missing = obj.missing_weight(node, self.weight_mode)
        spurious = obj.spurious_weight(node, self.weight_mode)
        tp, predicted, expected = stats.prf_counts(
            verdicts, missing, spurious, self.mode
        )
        precision = stats.precision(tp, predicted, expected)
        recall = stats.recall(tp, predicted, expected)
        f1 = stats.f1(precision, recall)
        return {
            "object_precision": precision,
            "object_recall": recall,
            "object_f1": f1,
        }
|
|
@@ -0,0 +1,44 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from typing import TYPE_CHECKING, Any
|
|
4
|
+
|
|
5
|
+
from structured_eval.metrics.base import ObjectMetric
|
|
6
|
+
from structured_eval.metrics.utils import calculate as stats
|
|
7
|
+
from structured_eval.metrics.utils import object_utils as obj
|
|
8
|
+
|
|
9
|
+
if TYPE_CHECKING:
|
|
10
|
+
from structured_eval.models.nodes.object_node import ObjectNode
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
class ObjectRecall(ObjectMetric):
    """Slot-filling recall, TP / (TP + FN), over an object's fields.

    Missing expected fields are FN. The match criterion and ``mode`` behave
    as for ``ObjectPrecision`` (all child kinds count via their
    representative).
    """

    name = "object_recall"

    def __init__(
        self,
        score_policy: dict[str, Any] | None = None,
        threshold: float | None = None,
        mode: stats.GradingMode = stats.GradingMode.HARD,
        weight_mode: stats.WeightMode = stats.WeightMode.PROPORTIONAL,
    ):
        self.score_policy = score_policy
        self.threshold = threshold
        # Coerce so raw enum values are accepted as well as members.
        self.mode = stats.GradingMode(mode)
        self.weight_mode = stats.WeightMode(weight_mode)

    def compute(self, node: ObjectNode) -> float:
        """Return the recall over the node's fields."""
        verdicts = obj.matched_verdicts(
            node, self.score_policy, self.threshold, self.weight_mode
        )
        missing = obj.missing_weight(node, self.weight_mode)
        spurious = obj.spurious_weight(node, self.weight_mode)
        tp, predicted, expected = stats.prf_counts(
            verdicts, missing, spurious, self.mode
        )
        return stats.recall(tp, predicted, expected)
|
|
@@ -0,0 +1,34 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from typing import TYPE_CHECKING
|
|
4
|
+
|
|
5
|
+
from structured_eval.metrics.base import ObjectMetric
|
|
6
|
+
from structured_eval.metrics.invoker import MetricInvoker
|
|
7
|
+
from structured_eval.metrics.type_match import TypeMatch
|
|
8
|
+
|
|
9
|
+
if TYPE_CHECKING:
|
|
10
|
+
from structured_eval.models.nodes.object_node import ObjectNode
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
class ObjectTypeValidity(ObjectMetric):
    """Share of present fields whose value has the right type.

    A structural sanity check that ignores value correctness: of the fields
    present on both sides, how many carry the right JSON type. ``TypeMatch``
    covers every JSON type, so scalars (``"100"`` vs ``100``) *and*
    containers (a ``list`` where an object was expected) are validated alike
    — a basic type check, not a deep one. An object with no present fields is
    vacuously 1.0.
    """

    name = "object_type_validity"

    def __init__(self) -> None:
        # One invoker wrapping TypeMatch, reused for every field checked.
        self._type_match = MetricInvoker(TypeMatch())

    def compute(self, node: ObjectNode) -> float:
        """Return the mean ``TypeMatch`` result over the node's matched fields."""
        fields = node.matched
        if not fields:
            return 1.0
        check = self._type_match.scalar_on_node
        type_valid = sum(check(field) for field in fields)
        return type_valid / len(fields)
|
|
@@ -0,0 +1,32 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from typing import TYPE_CHECKING
|
|
4
|
+
|
|
5
|
+
from structured_eval.metrics.base import RootMetric
|
|
6
|
+
from structured_eval.models.nodes.scalar import ScalarNode
|
|
7
|
+
|
|
8
|
+
if TYPE_CHECKING:
|
|
9
|
+
from structured_eval.models.nodes.base import EvalNode
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
class OverallLeafScore(RootMetric):
    """Document-wide weighted mean of leaf match-criterion scores.

    Every scalar field contributes its match-criterion verdict, weighted by
    its configured ``weight`` (``FieldConfig.weight``). Missing expected
    leaves score 0, and a document without leaves is vacuously 1.0. This is
    the headline number.

    Leaf-style: the tree is flattened to its scalar leaves, in contrast to
    the hierarchical object metrics, which aggregate only a node's direct
    children.
    """

    name = "overall_leaf_score"

    def compute(self, node: EvalNode) -> float:
        """Return ``Σ weight·representative / Σ weight`` over all leaves."""
        weight_sum = 0.0
        score_sum = 0.0
        for leaf in node.leaves():
            # Narrows the type for checkers; leaves are expected to be scalars.
            assert isinstance(leaf, ScalarNode)
            weight_sum += leaf.weight
            score_sum += leaf.weight * leaf.representative
        if not weight_sum:
            # No leaves, or only zero-weight ones: nothing to average.
            return 1.0
        return score_sum / weight_sum
|
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from typing import TYPE_CHECKING
|
|
4
|
+
|
|
5
|
+
from structured_eval.metrics.base import FieldMetric
|
|
6
|
+
|
|
7
|
+
if TYPE_CHECKING:
|
|
8
|
+
from structured_eval.models.nodes.scalar import ScalarNode
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
class Presence(FieldMetric):
    """Populated-field check: 1.0 when ``actual`` is non-null, else 0.0.

    This looks at a single value only — ``expected`` is ignored and only
    ``actual`` is inspected (a missing key surfaces as ``None`` through the
    node). Because it is not a comparison of two values, it overrides
    ``compute`` rather than ``score``.
    """

    name = "presence"

    def compute(self, node: ScalarNode) -> float:
        """Return 1.0 if the node's ``actual`` value is not ``None``."""
        if node.actual is None:
            return 0.0
        return 1.0
|
|
@@ -0,0 +1,51 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import re
|
|
4
|
+
from typing import Any
|
|
5
|
+
|
|
6
|
+
from structured_eval.metrics.base import FieldMetric
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
class RegexMatch(FieldMetric):
    """Exact string comparison after an optional regex rewrite: 1.0 or 0.0.

    This metric is **string-only**: when either side is not a ``str`` the
    score is ``0.0`` (use ``Numeric`` for numbers and ``ExactMatch`` for
    verbatim equality). Two strings are each normalised by applying, in
    order, the optional ``lower`` and ``strip`` steps and then replacing
    every match of ``pattern`` with ``repl``; the normalised results are
    compared exactly.

    With the defaults ``pattern=r"\\s+", repl=" "`` (and ``lower``/``strip``
    enabled) whitespace runs are collapsed and casing is ignored. Other
    rewrites, e.g.::

        RegexMatch(pattern=r"[^\\w\\s]", repl="")   # drop punctuation
        RegexMatch(pattern=r"[-_]", repl=" ")      # dashes/underscores → spaces
        RegexMatch(lower=False)                    # case-sensitive
    """

    name = "regex_match"

    def __init__(
        self,
        pattern: str | re.Pattern[str] = r"\s+",
        repl: str = " ",
        lower: bool = True,
        strip: bool = True,
    ):
        # Accept either a pattern string or an already-compiled pattern.
        if isinstance(pattern, str):
            pattern = re.compile(pattern)
        self.pattern = pattern
        self.repl = repl
        self.lower = lower
        self.strip = strip

    def _normalize(self, value: str) -> str:
        """Apply the lower / strip / substitute pipeline to one string."""
        text = value.lower() if self.lower else value
        if self.strip:
            text = text.strip()
        text = self.pattern.sub(self.repl, text)
        if self.strip:
            # Strip again: the substitution may leave whitespace at the edges.
            text = text.strip()
        return text

    def score(self, actual: Any, expected: Any) -> float:
        """Return 1.0 when both sides are strings that normalise identically."""
        if isinstance(actual, str) and isinstance(expected, str):
            if self._normalize(actual) == self._normalize(expected):
                return 1.0
        return 0.0
|