structured-eval 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (94):
  1. structured_eval/__init__.py +27 -0
  2. structured_eval/alignment/__init__.py +15 -0
  3. structured_eval/alignment/base.py +40 -0
  4. structured_eval/alignment/by_index.py +24 -0
  5. structured_eval/alignment/by_key.py +73 -0
  6. structured_eval/alignment/factory.py +28 -0
  7. structured_eval/alignment/hungarian.py +156 -0
  8. structured_eval/api.py +79 -0
  9. structured_eval/engine/__init__.py +15 -0
  10. structured_eval/engine/aggregator.py +96 -0
  11. structured_eval/engine/evaluator.py +72 -0
  12. structured_eval/engine/metric_runner.py +69 -0
  13. structured_eval/engine/parser.py +42 -0
  14. structured_eval/engine/report_builder.py +68 -0
  15. structured_eval/engine/tree_builder.py +319 -0
  16. structured_eval/formats/__init__.py +5 -0
  17. structured_eval/formats/base.py +19 -0
  18. structured_eval/formats/json_parser.py +44 -0
  19. structured_eval/formats/yaml_parser.py +24 -0
  20. structured_eval/integrations/__init__.py +11 -0
  21. structured_eval/integrations/_adapter.py +47 -0
  22. structured_eval/integrations/deepeval.py +74 -0
  23. structured_eval/integrations/langsmith.py +90 -0
  24. structured_eval/metrics/__init__.py +101 -0
  25. structured_eval/metrics/array_accuracy.py +28 -0
  26. structured_eval/metrics/array_cardinality.py +27 -0
  27. structured_eval/metrics/array_exact_match.py +48 -0
  28. structured_eval/metrics/array_f1.py +34 -0
  29. structured_eval/metrics/array_jaccard_similarity.py +60 -0
  30. structured_eval/metrics/array_precision.py +34 -0
  31. structured_eval/metrics/array_prf1.py +40 -0
  32. structured_eval/metrics/array_recall.py +33 -0
  33. structured_eval/metrics/base.py +144 -0
  34. structured_eval/metrics/character_f1.py +50 -0
  35. structured_eval/metrics/composite_score.py +46 -0
  36. structured_eval/metrics/coverage_leaf_score.py +29 -0
  37. structured_eval/metrics/date_distance_score.py +63 -0
  38. structured_eval/metrics/exact.py +21 -0
  39. structured_eval/metrics/exponential_numeric_score.py +47 -0
  40. structured_eval/metrics/field_faithfulness.py +38 -0
  41. structured_eval/metrics/fuzzy.py +64 -0
  42. structured_eval/metrics/invoker.py +90 -0
  43. structured_eval/metrics/levenshtein.py +16 -0
  44. structured_eval/metrics/mean_score.py +31 -0
  45. structured_eval/metrics/numeric.py +83 -0
  46. structured_eval/metrics/numeric_closeness.py +35 -0
  47. structured_eval/metrics/object_accuracy.py +47 -0
  48. structured_eval/metrics/object_exact_match.py +41 -0
  49. structured_eval/metrics/object_f1.py +47 -0
  50. structured_eval/metrics/object_precision.py +49 -0
  51. structured_eval/metrics/object_prf1.py +51 -0
  52. structured_eval/metrics/object_recall.py +44 -0
  53. structured_eval/metrics/object_type_validity.py +34 -0
  54. structured_eval/metrics/overall_leaf_score.py +32 -0
  55. structured_eval/metrics/presence.py +22 -0
  56. structured_eval/metrics/regex_match.py +51 -0
  57. structured_eval/metrics/rule_pass_rate/__init__.py +5 -0
  58. structured_eval/metrics/rule_pass_rate/dsl.py +224 -0
  59. structured_eval/metrics/rule_pass_rate/engine.py +24 -0
  60. structured_eval/metrics/rule_pass_rate/metric.py +33 -0
  61. structured_eval/metrics/schema_validity/__init__.py +7 -0
  62. structured_eval/metrics/schema_validity/metric.py +35 -0
  63. structured_eval/metrics/schema_validity/validator.py +119 -0
  64. structured_eval/metrics/structural_similarity.py +40 -0
  65. structured_eval/metrics/token_f1.py +44 -0
  66. structured_eval/metrics/type_match.py +35 -0
  67. structured_eval/metrics/utils/__init__.py +10 -0
  68. structured_eval/metrics/utils/array.py +31 -0
  69. structured_eval/metrics/utils/calculate.py +72 -0
  70. structured_eval/metrics/utils/number.py +46 -0
  71. structured_eval/metrics/utils/object_utils.py +87 -0
  72. structured_eval/models/__init__.py +72 -0
  73. structured_eval/models/config.py +124 -0
  74. structured_eval/models/context.py +25 -0
  75. structured_eval/models/metric_result.py +121 -0
  76. structured_eval/models/nodes/__init__.py +13 -0
  77. structured_eval/models/nodes/array_node.py +32 -0
  78. structured_eval/models/nodes/base.py +113 -0
  79. structured_eval/models/nodes/object_node.py +19 -0
  80. structured_eval/models/nodes/scalar.py +14 -0
  81. structured_eval/models/result.py +361 -0
  82. structured_eval/models/sample.py +19 -0
  83. structured_eval/py.typed +0 -0
  84. structured_eval/reporting/__init__.py +5 -0
  85. structured_eval/reporting/console.py +194 -0
  86. structured_eval/utils/__init__.py +16 -0
  87. structured_eval/utils/flatten.py +66 -0
  88. structured_eval/utils/paths.py +58 -0
  89. structured_eval/utils/structured_diff.py +159 -0
  90. structured_eval-0.1.0.dist-info/METADATA +322 -0
  91. structured_eval-0.1.0.dist-info/RECORD +94 -0
  92. structured_eval-0.1.0.dist-info/WHEEL +5 -0
  93. structured_eval-0.1.0.dist-info/licenses/LICENSE +201 -0
  94. structured_eval-0.1.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,83 @@
1
+ from __future__ import annotations
2
+
3
+ from enum import StrEnum
4
+ from typing import Any
5
+
6
+ from structured_eval.metrics.base import FieldMetric
7
+ from structured_eval.metrics.utils.number import parse_number
8
+
9
+
10
+ class NumericMode(StrEnum):
11
+ """Tolerance band for the single-band form of :class:`Numeric`."""
12
+
13
+ RELATIVE = "relative" # |a - e| / |e|
14
+ ABSOLUTE = "absolute" # |a - e|
15
+
16
+
17
class Numeric(FieldMetric):
    """Pass/fail numeric comparison: 1.0 inside the tolerance band, else 0.0.

    Both sides go through the shared lenient ``parse_number`` helper before
    being compared. Per the package's description of that parser, currency
    symbols and thousands separators are dropped (``"$1,234.50"`` → ``1234.50``),
    accounting parentheses mean negative (``"(123)"`` → ``-123``), scientific
    notation is accepted (``"1e3"`` → ``1000``), and a percent sign is removed
    **without** rescaling (``"50%"`` → ``50``, not ``0.5``). US conventions are
    assumed (``,`` = thousands, ``.`` = decimal). Anything the parser rejects
    (it returns ``None``) scores 0.0.

    The tolerance can be configured in two ways:

    * ``tolerance`` + ``mode`` (``"relative"`` | ``"absolute"``) — one band.
      ``relative`` compares ``|a - e| / |e|`` against ``tolerance``;
      ``absolute`` compares ``|a - e|``. A tolerance of 0 demands equality.
    * ``relative_tolerance`` and/or ``absolute_tolerance`` — explicit bands.
      The value passes when it is inside *either* supplied band. As soon as
      one of them is given, ``tolerance``/``mode`` are ignored.
    """

    name = "numeric"

    def __init__(
        self,
        tolerance: float = 0.01,
        mode: NumericMode = NumericMode.RELATIVE,
        relative_tolerance: float | None = None,
        absolute_tolerance: float | None = None,
    ):
        self.tolerance = tolerance
        # Accept either the enum member or its string value.
        self.mode = NumericMode(mode)
        self.relative_tolerance = relative_tolerance
        self.absolute_tolerance = absolute_tolerance

    def score(self, actual: Any, expected: Any) -> float:
        """Return 1.0 when both values parse and agree within tolerance."""
        parsed_actual = parse_number(actual)
        parsed_expected = parse_number(expected)
        if parsed_actual is None or parsed_expected is None:
            return 0.0
        if self._within_tolerance(parsed_actual, parsed_expected):
            return 1.0
        return 0.0

    def _within_tolerance(self, a: float, e: float) -> bool:
        """Decide whether ``a`` is acceptably close to ``e``."""
        # Identical values always pass, whatever the configuration.
        if a == e:
            return True

        rel_band = self.relative_tolerance
        abs_band = self.absolute_tolerance

        if rel_band is None and abs_band is None:
            # Single-band form (tolerance + mode).
            if self.mode != NumericMode.RELATIVE:
                deviation = abs(a - e)
            elif e != 0:
                deviation = abs(a - e) / abs(e)
            elif a == 0:
                deviation = 0.0
            else:
                # A relative deviation from zero is unbounded.
                deviation = float("inf")
            return deviation <= self.tolerance

        # Explicit bands take precedence; being inside either one is enough.
        if rel_band is not None:
            if e != 0:
                if abs(a - e) / abs(e) <= rel_band:
                    return True
            elif a == 0:
                return True
        if abs_band is None:
            return False
        return abs(a - e) <= abs_band
@@ -0,0 +1,35 @@
1
+ from __future__ import annotations
2
+
3
+ from typing import Any
4
+
5
+ from structured_eval.metrics.base import FieldMetric
6
+ from structured_eval.metrics.utils.number import parse_number
7
+
8
+
9
class NumericCloseness(FieldMetric):
    """Graded numeric similarity in ``[0, 1]`` (not a pass/fail tolerance).

    The score is ``1 - |actual - expected| / max(|actual|, |expected|)``. For
    same-sign values this is the ratio of the smaller magnitude to the larger
    (``min/max``): equal values score 1.0, opposite signs are clamped to 0.0,
    and ``0`` vs ``0`` is 1.0. Unlike :class:`Numeric` (a hard 0/1 verdict
    against a tolerance), this yields a continuous score, making it the
    default element scorer for numbers under the Hungarian array aligner where
    a graded cost matrix matters.

    Values are parsed with the shared lenient numeric parser (same as
    :class:`Numeric`), so numeric strings are graded too. The metric applies
    **only to numbers**: if either side isn't numeric (``None``, a non-numeric
    string, or a ``bool`` — ``True`` is not ``1``) the score is 0.0. A NaN on
    either side is likewise treated as "not a usable number" and scores 0.0.
    """

    name = "numeric_closeness"

    def score(self, actual: Any, expected: Any) -> float:
        """Return the closeness of ``actual`` to ``expected`` in ``[0, 1]``.

        Returns 0.0 when either value fails to parse or is NaN, and 1.0 when
        the parsed values compare equal.
        """
        a = parse_number(actual)
        e = parse_number(expected)
        if a is None or e is None:
            return 0.0
        if a == e:
            return 1.0
        # NaN is the only value unequal to itself. Without this guard,
        # ``max(abs(0.0), abs(nan))`` evaluates to 0.0 and a zero-vs-NaN pair
        # would fall into the "0/0" branch below and score a perfect 1.0.
        # NOTE(review): whether parse_number can actually yield NaN is not
        # visible from this module — confirm against the parser.
        if a != a or e != e:
            return 0.0
        denom = max(abs(a), abs(e))
        if not denom:
            # Defensive: unequal, non-NaN values should always give denom > 0.
            return 1.0
        # Clamp at 0.0: opposite signs make the ratio exceed 1.
        return max(0.0, 1.0 - abs(a - e) / denom)
@@ -0,0 +1,47 @@
1
+ from __future__ import annotations
2
+
3
+ from typing import TYPE_CHECKING, Any
4
+
5
+ from structured_eval.metrics.base import ObjectMetric
6
+ from structured_eval.metrics.utils import calculate as stats
7
+ from structured_eval.metrics.utils import object_utils as obj
8
+
9
+ if TYPE_CHECKING:
10
+ from structured_eval.models.nodes.object_node import ObjectNode
11
+
12
+
13
class ObjectAccuracy(ObjectMetric):
    """Weighted soft mean of field correctness across the expected fields.

    This is **soft recall**: ``Σ weight·score / (matched_weight +
    missing_weight)``. A matched field contributes its ``representative``
    (whatever the child kind — nested objects and arrays count through their
    representative, not just scalars) unless ``score_policy`` overrides it.
    An expected field that is missing contributes 0.0. **Spurious (extra)
    fields carry no penalty**, because only the expected side feeds the
    denominator; reach for ``ObjectF1`` when precision should matter. When
    there are no expected fields the result is vacuously 1.0.

    ``weight_mode`` defaults to ``PROPORTIONAL``, weighting each child by its
    configured ``weight``; ``NONE`` gives the plain unweighted mean.
    """

    name = "object_accuracy"

    def __init__(
        self,
        score_policy: dict[str, Any] | None = None,
        weight_mode: stats.WeightMode = stats.WeightMode.PROPORTIONAL,
    ):
        self.score_policy = score_policy
        # Accept either the enum member or its raw value.
        self.weight_mode = stats.WeightMode(weight_mode)

    def compute(self, node: ObjectNode) -> float:
        """Return the weighted soft-recall score for ``node``."""
        verdicts = obj.matched_verdicts(
            node, self.score_policy, weight_mode=self.weight_mode
        )
        matched_weight = sum(weight for _, _, weight in verdicts)
        total_weight = matched_weight + obj.missing_weight(node, self.weight_mode)
        if total_weight == 0:
            # Nothing was expected, so nothing can be wrong.
            return 1.0
        earned = sum(weight * score for score, _, weight in verdicts)
        return earned / total_weight
@@ -0,0 +1,41 @@
1
+ from __future__ import annotations
2
+
3
+ from typing import TYPE_CHECKING, Any
4
+
5
+ from structured_eval.metrics.base import ObjectMetric
6
+
7
+ if TYPE_CHECKING:
8
+ from structured_eval.models.nodes.object_node import ObjectNode
9
+
10
+
11
class ObjectExactMatch(ObjectMetric):
    """All-or-nothing deep equality for objects: 1.0 if identical, else 0.0.

    The raw ``actual`` and ``expected`` mappings are walked recursively: the
    key sets must be the same and every value must be deep-equal, including
    nested dicts and lists. There is no partial credit and no coercion — a
    type mismatch at any level fails the whole object. When field-level
    partial credit is wanted, use the aggregating ``Object*`` metrics
    (``ObjectAccuracy`` / ``ObjectF1`` …) instead.
    """

    name = "object_exact_match"

    def compute(self, node: ObjectNode) -> float:
        """Score the node by comparing its raw actual and expected values."""
        return self.score(node.actual, node.expected)

    def score(self, actual: Any, expected: Any) -> float:
        """Return 1.0 when ``actual`` deep-equals ``expected``, else 0.0."""
        if self._object_equal(actual, expected):
            return 1.0
        return 0.0

    def _object_equal(self, a: Any, b: Any) -> bool:
        """Deep strict equality for JSON-like structures."""
        # Exact type identity: no coercion between e.g. int and float.
        if type(a) is not type(b):
            return False
        if isinstance(a, dict):
            same_keys = set(a.keys()) == set(b.keys())
            return same_keys and all(self._object_equal(a[key], b[key]) for key in a)
        if isinstance(a, list):
            same_length = len(a) == len(b)
            return same_length and all(
                self._object_equal(left, right)
                for left, right in zip(a, b, strict=False)
            )
        return bool(a == b)
@@ -0,0 +1,47 @@
1
+ from __future__ import annotations
2
+
3
+ from typing import TYPE_CHECKING, Any
4
+
5
+ from structured_eval.metrics.base import ObjectMetric
6
+ from structured_eval.metrics.utils import calculate as stats
7
+ from structured_eval.metrics.utils import object_utils as obj
8
+
9
+ if TYPE_CHECKING:
10
+ from structured_eval.models.nodes.object_node import ObjectNode
11
+
12
+
13
class ObjectF1(ObjectMetric):
    """F1 (harmonic mean of precision and recall) over an object's fields.

    Slot-filling F1: a matched-and-correct field is a TP, a missing expected
    field an FN, and an extra field an FP. The match criterion and ``mode``
    follow ``ObjectPrecision`` (every child kind counts through its
    representative).
    """

    name = "object_f1"

    def __init__(
        self,
        score_policy: dict[str, Any] | None = None,
        threshold: float | None = None,
        mode: stats.GradingMode = stats.GradingMode.HARD,
        weight_mode: stats.WeightMode = stats.WeightMode.PROPORTIONAL,
    ):
        self.score_policy = score_policy
        self.threshold = threshold
        # Accept enum members or their raw values.
        self.mode = stats.GradingMode(mode)
        self.weight_mode = stats.WeightMode(weight_mode)

    def compute(self, node: ObjectNode) -> float:
        """Return the F1 score for ``node``'s fields."""
        verdicts = obj.matched_verdicts(
            node, self.score_policy, self.threshold, self.weight_mode
        )
        missing = obj.missing_weight(node, self.weight_mode)
        spurious = obj.spurious_weight(node, self.weight_mode)
        tp, predicted, expected = stats.prf_counts(
            verdicts, missing, spurious, self.mode
        )
        precision_score = stats.precision(tp, predicted, expected)
        recall_score = stats.recall(tp, predicted, expected)
        return stats.f1(precision_score, recall_score)
@@ -0,0 +1,49 @@
1
+ from __future__ import annotations
2
+
3
+ from typing import TYPE_CHECKING, Any
4
+
5
+ from structured_eval.metrics.base import ObjectMetric
6
+ from structured_eval.metrics.utils import calculate as stats
7
+ from structured_eval.metrics.utils import object_utils as obj
8
+
9
+ if TYPE_CHECKING:
10
+ from structured_eval.models.nodes.object_node import ObjectNode
11
+
12
+
13
class ObjectPrecision(ObjectMetric):
    """Slot-filling precision, TP / (TP + FP), over an object's fields.

    A matched field counts as a TP when its ``representative`` clears its
    threshold; this applies to every child kind, with nested objects and
    arrays counted through their representative. Extra fields are FP. The
    per-field score is resolved from ``score_policy``, then the child's
    ``key_metric``, then ``ExactMatch``. By default ``mode=HARD`` with the
    field threshold (``1.0`` unless configured), so only a perfect score
    counts; ``mode="soft"`` drops the threshold and uses the field score
    fractionally.
    """

    name = "object_precision"

    def __init__(
        self,
        score_policy: dict[str, Any] | None = None,
        threshold: float | None = None,
        mode: stats.GradingMode = stats.GradingMode.HARD,
        weight_mode: stats.WeightMode = stats.WeightMode.PROPORTIONAL,
    ):
        self.score_policy = score_policy
        self.threshold = threshold
        # Accept enum members or their raw values.
        self.mode = stats.GradingMode(mode)
        self.weight_mode = stats.WeightMode(weight_mode)

    def compute(self, node: ObjectNode) -> float:
        """Return the precision of ``node``'s fields."""
        verdicts = obj.matched_verdicts(
            node, self.score_policy, self.threshold, self.weight_mode
        )
        missing = obj.missing_weight(node, self.weight_mode)
        spurious = obj.spurious_weight(node, self.weight_mode)
        tp, predicted, expected = stats.prf_counts(
            verdicts, missing, spurious, self.mode
        )
        return stats.precision(tp, predicted, expected)
@@ -0,0 +1,51 @@
1
+ from __future__ import annotations
2
+
3
+ from typing import TYPE_CHECKING, Any
4
+
5
+ from structured_eval.metrics.base import ObjectMetric
6
+ from structured_eval.metrics.utils import calculate as stats
7
+ from structured_eval.metrics.utils import object_utils as obj
8
+
9
+ if TYPE_CHECKING:
10
+ from structured_eval.models.nodes.object_node import ObjectNode
11
+
12
+
13
class ObjectPRF1(ObjectMetric):
    """Precision, recall and F1 computed together in a single pass.

    ``compute`` returns a dict keyed ``object_precision``, ``object_recall``
    and ``object_f1``; the engine writes each key into ``report.metrics``
    directly. The match criterion and ``mode`` follow ``ObjectPrecision``.
    """

    name = "object_prf1"

    def __init__(
        self,
        score_policy: dict[str, Any] | None = None,
        threshold: float | None = None,
        mode: stats.GradingMode = stats.GradingMode.HARD,
        weight_mode: stats.WeightMode = stats.WeightMode.PROPORTIONAL,
    ):
        self.score_policy = score_policy
        self.threshold = threshold
        # Accept enum members or their raw values.
        self.mode = stats.GradingMode(mode)
        self.weight_mode = stats.WeightMode(weight_mode)

    def compute(self, node: ObjectNode) -> dict[str, float]:
        """Return precision, recall and F1 for ``node`` as a dict."""
        verdicts = obj.matched_verdicts(
            node, self.score_policy, self.threshold, self.weight_mode
        )
        missing = obj.missing_weight(node, self.weight_mode)
        spurious = obj.spurious_weight(node, self.weight_mode)
        tp, predicted, expected = stats.prf_counts(
            verdicts, missing, spurious, self.mode
        )
        precision_score = stats.precision(tp, predicted, expected)
        recall_score = stats.recall(tp, predicted, expected)
        f1_score = stats.f1(precision_score, recall_score)
        return {
            "object_precision": precision_score,
            "object_recall": recall_score,
            "object_f1": f1_score,
        }
@@ -0,0 +1,44 @@
1
+ from __future__ import annotations
2
+
3
+ from typing import TYPE_CHECKING, Any
4
+
5
+ from structured_eval.metrics.base import ObjectMetric
6
+ from structured_eval.metrics.utils import calculate as stats
7
+ from structured_eval.metrics.utils import object_utils as obj
8
+
9
+ if TYPE_CHECKING:
10
+ from structured_eval.models.nodes.object_node import ObjectNode
11
+
12
+
13
class ObjectRecall(ObjectMetric):
    """Slot-filling recall, TP / (TP + FN), over an object's fields.

    Expected fields that are missing count as FN. The match criterion and
    ``mode`` follow ``ObjectPrecision`` (every child kind counts through its
    representative).
    """

    name = "object_recall"

    def __init__(
        self,
        score_policy: dict[str, Any] | None = None,
        threshold: float | None = None,
        mode: stats.GradingMode = stats.GradingMode.HARD,
        weight_mode: stats.WeightMode = stats.WeightMode.PROPORTIONAL,
    ):
        self.score_policy = score_policy
        self.threshold = threshold
        # Accept enum members or their raw values.
        self.mode = stats.GradingMode(mode)
        self.weight_mode = stats.WeightMode(weight_mode)

    def compute(self, node: ObjectNode) -> float:
        """Return the recall of ``node``'s fields."""
        verdicts = obj.matched_verdicts(
            node, self.score_policy, self.threshold, self.weight_mode
        )
        missing = obj.missing_weight(node, self.weight_mode)
        spurious = obj.spurious_weight(node, self.weight_mode)
        tp, predicted, expected = stats.prf_counts(
            verdicts, missing, spurious, self.mode
        )
        return stats.recall(tp, predicted, expected)
@@ -0,0 +1,34 @@
1
+ from __future__ import annotations
2
+
3
+ from typing import TYPE_CHECKING
4
+
5
+ from structured_eval.metrics.base import ObjectMetric
6
+ from structured_eval.metrics.invoker import MetricInvoker
7
+ from structured_eval.metrics.type_match import TypeMatch
8
+
9
+ if TYPE_CHECKING:
10
+ from structured_eval.models.nodes.object_node import ObjectNode
11
+
12
+
13
class ObjectTypeValidity(ObjectMetric):
    """Share of present fields whose value has the right JSON type.

    A structural sanity check that ignores value correctness: among the
    fields present on both sides, how many carry the expected JSON type.
    ``TypeMatch`` covers every JSON type, so scalars (``"100"`` vs ``100``)
    and containers (a ``list`` where an object was expected) are validated
    alike. The check is shallow, not deep. With no present fields the result
    is vacuously 1.0.
    """

    name = "object_type_validity"

    def __init__(self) -> None:
        # One shared invoker wrapping TypeMatch, reused for every child.
        self._type_match = MetricInvoker(TypeMatch())

    def compute(self, node: ObjectNode) -> float:
        """Return the fraction of ``node.matched`` children that type-check."""
        matched_fields = node.matched
        if not matched_fields:
            return 1.0
        type_valid = sum(
            self._type_match.scalar_on_node(child) for child in matched_fields
        )
        return type_valid / len(matched_fields)
@@ -0,0 +1,32 @@
1
+ from __future__ import annotations
2
+
3
+ from typing import TYPE_CHECKING
4
+
5
+ from structured_eval.metrics.base import RootMetric
6
+ from structured_eval.models.nodes.scalar import ScalarNode
7
+
8
+ if TYPE_CHECKING:
9
+ from structured_eval.models.nodes.base import EvalNode
10
+
11
+
12
class OverallLeafScore(RootMetric):
    """Document-wide weighted mean of leaf match-criterion scores.

    Every scalar field contributes its match-criterion verdict, weighted by
    its configured ``weight`` (``FieldConfig.weight``). Missing expected
    leaves score 0, and a document with no leaves is vacuously 1.0. This is
    the headline number.

    Leaf-style: the tree is flattened to its scalar leaves, in contrast to the
    hierarchical object metrics, which aggregate only a node's direct
    children.
    """

    name = "overall_leaf_score"

    def compute(self, node: EvalNode) -> float:
        """Return the weight-averaged ``representative`` over all leaves."""
        weight_sum = 0.0
        score_sum = 0.0
        for leaf in node.leaves():
            # Type-narrowing check: leaves are expected to be scalar nodes.
            assert isinstance(leaf, ScalarNode)
            leaf_weight = leaf.weight
            weight_sum += leaf_weight
            score_sum += leaf_weight * leaf.representative
        if not weight_sum:
            # No leaves (or zero total weight): vacuously perfect.
            return 1.0
        return score_sum / weight_sum
@@ -0,0 +1,22 @@
1
+ from __future__ import annotations
2
+
3
+ from typing import TYPE_CHECKING
4
+
5
+ from structured_eval.metrics.base import FieldMetric
6
+
7
+ if TYPE_CHECKING:
8
+ from structured_eval.models.nodes.scalar import ScalarNode
9
+
10
+
11
class Presence(FieldMetric):
    """Population check: 1.0 when the field is present and non-null, else 0.0.

    This inspects a single value: ``expected`` is ignored and only ``actual``
    is examined (a missing key surfaces as ``None`` through the node). Because
    it does not compare two values, it overrides ``compute`` rather than
    ``score``.
    """

    name = "presence"

    def compute(self, node: ScalarNode) -> float:
        """Return 0.0 when ``node.actual`` is ``None``, otherwise 1.0."""
        if node.actual is None:
            return 0.0
        return 1.0
@@ -0,0 +1,51 @@
1
+ from __future__ import annotations
2
+
3
+ import re
4
+ from typing import Any
5
+
6
+ from structured_eval.metrics.base import FieldMetric
7
+
8
+
9
class RegexMatch(FieldMetric):
    """Exact string comparison after an optional regex rewrite: 1.0 or 0.0.

    This metric is **string-only**: when either side is not a ``str`` the
    score is ``0.0`` (use ``Numeric`` for numbers, ``ExactMatch`` for verbatim
    equality). Two strings are each normalized — optional ``lower``, optional
    ``strip``, then every match of ``pattern`` replaced with ``repl`` — and
    the normalized results are compared exactly.

    With the defaults ``pattern=r"\\s+", repl=" "`` (and ``lower``/``strip``
    enabled) the comparison collapses whitespace and ignores casing. The
    rewrite can be tuned, e.g.::

        RegexMatch(pattern=r"[^\\w\\s]", repl="")  # drop punctuation
        RegexMatch(pattern=r"[-_]", repl=" ")     # dashes/underscores → spaces
        RegexMatch(lower=False)                   # case-sensitive
    """

    name = "regex_match"

    def __init__(
        self,
        pattern: str | re.Pattern[str] = r"\s+",
        repl: str = " ",
        lower: bool = True,
        strip: bool = True,
    ):
        # Compile string patterns once; precompiled patterns are used as-is.
        if isinstance(pattern, str):
            pattern = re.compile(pattern)
        self.pattern = pattern
        self.repl = repl
        self.lower = lower
        self.strip = strip

    def _normalize(self, value: str) -> str:
        """Apply the configured lower/strip/substitute pipeline to ``value``."""
        text = value.lower() if self.lower else value
        if not self.strip:
            return self.pattern.sub(self.repl, text)
        # Strip again after substituting: the rewrite may leave edge whitespace.
        return self.pattern.sub(self.repl, text.strip()).strip()

    def score(self, actual: Any, expected: Any) -> float:
        """Return 1.0 when both values are strings that normalize equally."""
        if not isinstance(actual, str) or not isinstance(expected, str):
            return 0.0
        if self._normalize(actual) == self._normalize(expected):
            return 1.0
        return 0.0
@@ -0,0 +1,5 @@
1
+ from structured_eval.metrics.rule_pass_rate.dsl import Rule
2
+ from structured_eval.metrics.rule_pass_rate.engine import RuleProcessor
3
+ from structured_eval.metrics.rule_pass_rate.metric import RulePassRate
4
+
5
+ __all__ = ["Rule", "RulePassRate", "RuleProcessor"]