structured-eval 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (94) hide show
  1. structured_eval/__init__.py +27 -0
  2. structured_eval/alignment/__init__.py +15 -0
  3. structured_eval/alignment/base.py +40 -0
  4. structured_eval/alignment/by_index.py +24 -0
  5. structured_eval/alignment/by_key.py +73 -0
  6. structured_eval/alignment/factory.py +28 -0
  7. structured_eval/alignment/hungarian.py +156 -0
  8. structured_eval/api.py +79 -0
  9. structured_eval/engine/__init__.py +15 -0
  10. structured_eval/engine/aggregator.py +96 -0
  11. structured_eval/engine/evaluator.py +72 -0
  12. structured_eval/engine/metric_runner.py +69 -0
  13. structured_eval/engine/parser.py +42 -0
  14. structured_eval/engine/report_builder.py +68 -0
  15. structured_eval/engine/tree_builder.py +319 -0
  16. structured_eval/formats/__init__.py +5 -0
  17. structured_eval/formats/base.py +19 -0
  18. structured_eval/formats/json_parser.py +44 -0
  19. structured_eval/formats/yaml_parser.py +24 -0
  20. structured_eval/integrations/__init__.py +11 -0
  21. structured_eval/integrations/_adapter.py +47 -0
  22. structured_eval/integrations/deepeval.py +74 -0
  23. structured_eval/integrations/langsmith.py +90 -0
  24. structured_eval/metrics/__init__.py +101 -0
  25. structured_eval/metrics/array_accuracy.py +28 -0
  26. structured_eval/metrics/array_cardinality.py +27 -0
  27. structured_eval/metrics/array_exact_match.py +48 -0
  28. structured_eval/metrics/array_f1.py +34 -0
  29. structured_eval/metrics/array_jaccard_similarity.py +60 -0
  30. structured_eval/metrics/array_precision.py +34 -0
  31. structured_eval/metrics/array_prf1.py +40 -0
  32. structured_eval/metrics/array_recall.py +33 -0
  33. structured_eval/metrics/base.py +144 -0
  34. structured_eval/metrics/character_f1.py +50 -0
  35. structured_eval/metrics/composite_score.py +46 -0
  36. structured_eval/metrics/coverage_leaf_score.py +29 -0
  37. structured_eval/metrics/date_distance_score.py +63 -0
  38. structured_eval/metrics/exact.py +21 -0
  39. structured_eval/metrics/exponential_numeric_score.py +47 -0
  40. structured_eval/metrics/field_faithfulness.py +38 -0
  41. structured_eval/metrics/fuzzy.py +64 -0
  42. structured_eval/metrics/invoker.py +90 -0
  43. structured_eval/metrics/levenshtein.py +16 -0
  44. structured_eval/metrics/mean_score.py +31 -0
  45. structured_eval/metrics/numeric.py +83 -0
  46. structured_eval/metrics/numeric_closeness.py +35 -0
  47. structured_eval/metrics/object_accuracy.py +47 -0
  48. structured_eval/metrics/object_exact_match.py +41 -0
  49. structured_eval/metrics/object_f1.py +47 -0
  50. structured_eval/metrics/object_precision.py +49 -0
  51. structured_eval/metrics/object_prf1.py +51 -0
  52. structured_eval/metrics/object_recall.py +44 -0
  53. structured_eval/metrics/object_type_validity.py +34 -0
  54. structured_eval/metrics/overall_leaf_score.py +32 -0
  55. structured_eval/metrics/presence.py +22 -0
  56. structured_eval/metrics/regex_match.py +51 -0
  57. structured_eval/metrics/rule_pass_rate/__init__.py +5 -0
  58. structured_eval/metrics/rule_pass_rate/dsl.py +224 -0
  59. structured_eval/metrics/rule_pass_rate/engine.py +24 -0
  60. structured_eval/metrics/rule_pass_rate/metric.py +33 -0
  61. structured_eval/metrics/schema_validity/__init__.py +7 -0
  62. structured_eval/metrics/schema_validity/metric.py +35 -0
  63. structured_eval/metrics/schema_validity/validator.py +119 -0
  64. structured_eval/metrics/structural_similarity.py +40 -0
  65. structured_eval/metrics/token_f1.py +44 -0
  66. structured_eval/metrics/type_match.py +35 -0
  67. structured_eval/metrics/utils/__init__.py +10 -0
  68. structured_eval/metrics/utils/array.py +31 -0
  69. structured_eval/metrics/utils/calculate.py +72 -0
  70. structured_eval/metrics/utils/number.py +46 -0
  71. structured_eval/metrics/utils/object_utils.py +87 -0
  72. structured_eval/models/__init__.py +72 -0
  73. structured_eval/models/config.py +124 -0
  74. structured_eval/models/context.py +25 -0
  75. structured_eval/models/metric_result.py +121 -0
  76. structured_eval/models/nodes/__init__.py +13 -0
  77. structured_eval/models/nodes/array_node.py +32 -0
  78. structured_eval/models/nodes/base.py +113 -0
  79. structured_eval/models/nodes/object_node.py +19 -0
  80. structured_eval/models/nodes/scalar.py +14 -0
  81. structured_eval/models/result.py +361 -0
  82. structured_eval/models/sample.py +19 -0
  83. structured_eval/py.typed +0 -0
  84. structured_eval/reporting/__init__.py +5 -0
  85. structured_eval/reporting/console.py +194 -0
  86. structured_eval/utils/__init__.py +16 -0
  87. structured_eval/utils/flatten.py +66 -0
  88. structured_eval/utils/paths.py +58 -0
  89. structured_eval/utils/structured_diff.py +159 -0
  90. structured_eval-0.1.0.dist-info/METADATA +322 -0
  91. structured_eval-0.1.0.dist-info/RECORD +94 -0
  92. structured_eval-0.1.0.dist-info/WHEEL +5 -0
  93. structured_eval-0.1.0.dist-info/licenses/LICENSE +201 -0
  94. structured_eval-0.1.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,74 @@
1
+ """deepeval adapter: structured-eval as a ``BaseMetric``.
2
+
3
+ Usage (requires ``structured-eval[deepeval]``)::
4
+
5
+ from structured_eval.integrations.deepeval import StructuredMetric
6
+ from deepeval import assert_test
7
+ from deepeval.test_case import LLMTestCase
8
+
9
+ metric = StructuredMetric(config=cfg, threshold=0.85)
10
+ assert_test(LLMTestCase(input=..., actual_output=raw, expected_output=ref), [metric])
11
+
12
+ ``actual_output``/``expected_output`` may be JSON strings or already-parsed
13
+ objects — ``evaluate`` handles both. ``report.score`` becomes ``metric.score``;
14
+ failing fields are summarised into ``metric.reason``. Importing this module
15
+ requires deepeval to be installed (the ``[deepeval]`` extra).
16
+ """
17
+
18
+ from __future__ import annotations
19
+
20
+ from typing import TYPE_CHECKING, Any
21
+
22
+ try:
23
+ from deepeval.metrics import BaseMetric
24
+ except ImportError as exc: # pragma: no cover
25
+ raise ImportError(
26
+ "deepeval is required for this integration. "
27
+ "Install it with: pip install structured-eval[deepeval]"
28
+ ) from exc
29
+
30
+ from structured_eval.api import evaluate
31
+ from structured_eval.integrations._adapter import verdict
32
+ from structured_eval.models.config import EvalConfig
33
+
34
+ if TYPE_CHECKING:
35
+ from structured_eval.models.result import EvalReport
36
+
37
+
38
class StructuredMetric(BaseMetric):
    """deepeval ``BaseMetric`` backed by a structured-eval report.

    ``measure`` runs ``evaluate`` on the test case's ``actual_output`` and
    ``expected_output`` with the stored config, then asks ``verdict`` for the
    score, pass/fail flag and reason. The results are kept on the instance
    (``score``, ``success``, ``reason``, ``report``).
    """

    def __init__(
        self,
        config: EvalConfig | None = None,
        threshold: float = 0.5,
        *,
        include_reason: bool = True,
    ) -> None:
        """Store the evaluation settings and reset the result attributes.

        Args:
            config: Evaluation config; a falsy value falls back to
                ``EvalConfig()``.
            threshold: Passed to ``verdict`` on every ``measure`` call.
            include_reason: When false, ``reason`` is left as ``None``.
        """
        self.config = config if config else EvalConfig()
        self.threshold = threshold
        self.include_reason = include_reason
        # Result slots; every call to ``measure`` overwrites them.
        self.score: float = 0.0
        self.success: bool = False
        self.reason: str | None = None
        self.report: EvalReport | None = None

    def measure(self, test_case: Any, *args: Any, **kwargs: Any) -> float:
        """Evaluate ``test_case`` and return the resulting score.

        Extra positional/keyword arguments are accepted and ignored.
        """
        actual, expected = test_case.actual_output, test_case.expected_output
        self.report = evaluate(actual, expected, self.config)
        raw_score, passed, explanation = verdict(self.report, self.threshold)
        # ``verdict`` may yield no score; that case is stored as 0.0.
        self.score = raw_score if raw_score is not None else 0.0
        self.success = passed
        self.reason = explanation if self.include_reason else None
        return self.score

    async def a_measure(self, test_case: Any, *args: Any, **kwargs: Any) -> float:
        """Async entry point; delegates directly to the synchronous ``measure``."""
        return self.measure(test_case, *args, **kwargs)

    def is_successful(self) -> bool:
        """Return the pass/fail flag recorded by the latest ``measure`` call."""
        return self.success

    @property
    def __name__(self) -> str:  # shown in deepeval output
        return "Structured Eval"
@@ -0,0 +1,90 @@
1
+ """LangSmith adapter: structured-eval as an evaluator function.
2
+
3
+ Usage (requires ``structured-eval[langsmith]``)::
4
+
5
+ from langsmith import evaluate
6
+ from structured_eval.integrations.langsmith import structured_evaluator
7
+
8
+ evaluator = structured_evaluator(config=cfg, threshold=0.85)
9
+ evaluate(target, data=dataset, evaluators=[evaluator])
10
+
11
+ The returned callable follows LangSmith's ``(run, example) -> dict`` contract and
12
+ emits a single feedback key with ``report.score`` plus a ``comment`` summarising
13
+ failures. By default the actual output is read from ``run.outputs`` and the
14
+ reference from ``example.outputs``; pass ``extract_actual``/``extract_expected``
15
+ to point at a nested field or adapt a different object shape.
16
+ """
17
+
18
+ from __future__ import annotations
19
+
20
+ from collections.abc import Callable
21
+ from typing import Any
22
+
23
+ from structured_eval.api import evaluate
24
+ from structured_eval.integrations._adapter import verdict
25
+ from structured_eval.models.config import EvalConfig
26
+ from structured_eval.models.result import EvalReport
27
+
28
+ Extractor = Callable[[Any], Any]
29
+
30
+
31
+ def _outputs(obj: Any) -> Any:
32
+ """Default extraction: the ``outputs`` payload of a run/example."""
33
+ if obj is None:
34
+ return None
35
+ if isinstance(obj, dict):
36
+ return obj.get("outputs", obj)
37
+ return getattr(obj, "outputs", obj)
38
+
39
+
40
class StructuredEvaluator:
    """Callable LangSmith evaluator that scores structured outputs.

    An instance is called as ``evaluator(run, example)`` and returns a feedback
    dict with ``key``, ``score`` and ``comment``. ``threshold`` is forwarded to
    ``verdict``; only the score and the reason it returns are emitted, the
    boolean is discarded. ``extract_actual`` / ``extract_expected`` pick the
    payload out of the run / example and default to ``_outputs``.
    """

    def __init__(
        self,
        config: EvalConfig | None = None,
        *,
        key: str = "structured_eval",
        threshold: float = 0.5,
        extract_actual: Extractor | None = None,
        extract_expected: Extractor | None = None,
    ) -> None:
        """Store the config, feedback key, threshold and payload extractors."""
        self.config = config if config else EvalConfig()
        self.key = key
        self.threshold = threshold
        self._get_actual = extract_actual if extract_actual else _outputs
        self._get_expected = extract_expected if extract_expected else _outputs
        # Expose the feedback key as the callable's name.
        self.__name__ = key

    def __call__(self, run: Any, example: Any) -> dict[str, Any]:
        """Evaluate one run against its example and build the feedback dict."""
        actual = self._get_actual(run)
        expected = self._get_expected(example)
        report = evaluate(actual, expected, self.config)
        assert isinstance(report, EvalReport)  # single-document evaluation
        score, _passed, reason = verdict(report, self.threshold)
        feedback: dict[str, Any] = {
            "key": self.key,
            "score": score,
            "comment": reason,
        }
        return feedback
73
+
74
+
75
def structured_evaluator(
    config: EvalConfig | None = None,
    *,
    key: str = "structured_eval",
    threshold: float = 0.5,
    extract_actual: Extractor | None = None,
    extract_expected: Extractor | None = None,
) -> StructuredEvaluator:
    """Build a ``StructuredEvaluator``, forwarding every argument unchanged."""
    evaluator = StructuredEvaluator(
        config,
        key=key,
        threshold=threshold,
        extract_actual=extract_actual,
        extract_expected=extract_expected,
    )
    return evaluator
@@ -0,0 +1,101 @@
1
+ from structured_eval.metrics.array_accuracy import ArrayAccuracy
2
+ from structured_eval.metrics.array_cardinality import ArrayCardinality
3
+ from structured_eval.metrics.array_exact_match import ArrayExactMatch
4
+ from structured_eval.metrics.array_f1 import ArrayF1
5
+ from structured_eval.metrics.array_jaccard_similarity import ArrayJaccardSimilarity
6
+ from structured_eval.metrics.array_precision import ArrayPrecision
7
+ from structured_eval.metrics.array_prf1 import ArrayPRF1
8
+ from structured_eval.metrics.array_recall import ArrayRecall
9
+ from structured_eval.metrics.base import (
10
+ ArrayMetric,
11
+ BaseMetric,
12
+ FieldMetric,
13
+ GenericMetric,
14
+ Metric,
15
+ ObjectMetric,
16
+ RootMetric,
17
+ get_metric_class,
18
+ resolve_metric,
19
+ )
20
+ from structured_eval.metrics.character_f1 import CharacterF1
21
+ from structured_eval.metrics.composite_score import CompositeScore
22
+ from structured_eval.metrics.coverage_leaf_score import CoverageLeafScore
23
+ from structured_eval.metrics.date_distance_score import DateDistanceScore
24
+ from structured_eval.metrics.exact import ExactMatch
25
+ from structured_eval.metrics.exponential_numeric_score import ExponentialNumericScore
26
+ from structured_eval.metrics.field_faithfulness import FieldFaithfulness
27
+ from structured_eval.metrics.fuzzy import Fuzzy
28
+ from structured_eval.metrics.levenshtein import Levenshtein
29
+ from structured_eval.metrics.mean_score import MeanScore
30
+ from structured_eval.metrics.numeric import Numeric
31
+ from structured_eval.metrics.numeric_closeness import NumericCloseness
32
+ from structured_eval.metrics.object_accuracy import ObjectAccuracy
33
+ from structured_eval.metrics.object_exact_match import ObjectExactMatch
34
+ from structured_eval.metrics.object_f1 import ObjectF1
35
+ from structured_eval.metrics.object_precision import ObjectPrecision
36
+ from structured_eval.metrics.object_prf1 import ObjectPRF1
37
+ from structured_eval.metrics.object_recall import ObjectRecall
38
+ from structured_eval.metrics.object_type_validity import ObjectTypeValidity
39
+ from structured_eval.metrics.overall_leaf_score import OverallLeafScore
40
+ from structured_eval.metrics.presence import Presence
41
+ from structured_eval.metrics.regex_match import RegexMatch
42
+ from structured_eval.metrics.rule_pass_rate import Rule, RulePassRate
43
+ from structured_eval.metrics.schema_validity import SchemaValidity
44
+ from structured_eval.metrics.structural_similarity import StructuralSimilarity
45
+ from structured_eval.metrics.token_f1 import TokenF1
46
+ from structured_eval.metrics.type_match import TypeMatch
47
+
48
__all__ = [
    # NOTE: entries are kept in sorted (ASCII) order -- class names first, then
    # the two lowercase helper functions -- so neighbouring names do not
    # necessarily belong to the same metric family.
    "ArrayAccuracy",
    "ArrayCardinality",
    "ArrayExactMatch",
    "ArrayF1",
    "ArrayJaccardSimilarity",
    "ArrayMetric",
    "ArrayPRF1",
    "ArrayPrecision",
    "ArrayRecall",
    "BaseMetric",
    "CharacterF1",
    "CompositeScore",
    "CoverageLeafScore",
    "DateDistanceScore",
    "ExactMatch",
    "ExponentialNumericScore",
    "FieldFaithfulness",
    "FieldMetric",
    "Fuzzy",
    "GenericMetric",
    "Levenshtein",
    "MeanScore",
    "Metric",
    "Numeric",
    "NumericCloseness",
    "ObjectAccuracy",
    "ObjectExactMatch",
    "ObjectF1",
    "ObjectMetric",
    "ObjectPRF1",
    "ObjectPrecision",
    "ObjectRecall",
    "ObjectTypeValidity",
    "OverallLeafScore",
    "Presence",
    "RegexMatch",
    "RootMetric",
    "Rule",
    "RulePassRate",
    "SchemaValidity",
    "StructuralSimilarity",
    "TokenF1",
    "TypeMatch",
    "get_metric_class",
    "resolve_metric",
]
@@ -0,0 +1,28 @@
1
+ from __future__ import annotations
2
+
3
+ from typing import TYPE_CHECKING
4
+
5
+ from structured_eval.metrics.base import ArrayMetric
6
+
7
+ if TYPE_CHECKING:
8
+ from structured_eval.models.nodes.array_node import ArrayNode
9
+
10
+
11
class ArrayAccuracy(ArrayMetric):
    """Soft mean of element scores over the aligned items.

    Sums each item's ``representative`` score and divides by
    ``len(items) + len(missed)``, so every missed expected item contributes
    0.0. When that denominator is zero the score is vacuously 1.0. The default
    array metric, and the array branch of the old ``structural_score``.
    """

    name = "array_accuracy"

    def compute(self, node: ArrayNode) -> float:
        """Return the mean representative score for ``node``."""
        alignment = node.match_result
        missed_count = len(alignment.missed) if alignment else 0
        total = len(node.items) + missed_count
        if total == 0:
            # Nothing produced and nothing missed: vacuously perfect.
            return 1.0
        earned = sum(item.representative for item in node.items)
        return earned / total
@@ -0,0 +1,27 @@
1
+ from __future__ import annotations
2
+
3
+ from typing import TYPE_CHECKING
4
+
5
+ from structured_eval.metrics.base import ArrayMetric
6
+
7
+ if TYPE_CHECKING:
8
+ from structured_eval.models.nodes.array_node import ArrayNode
9
+
10
+
11
class ArrayCardinality(ArrayMetric):
    """Length agreement: ``min(|actual|, |expected|) / max(|actual|, |expected|)``.

    Counts come from the alignment result: actual = matched + spurious,
    expected = matched + missed. Element correctness is not considered. With
    no alignment result, or two empty sides, the score is vacuously 1.0.
    """

    name = "array_cardinality"

    def compute(self, node: ArrayNode) -> float:
        """Return the ratio of the smaller element count to the larger one."""
        alignment = node.match_result
        if alignment is None:
            return 1.0
        n_matched = len(alignment.matched)
        n_actual = n_matched + len(alignment.spurious)
        n_expected = n_matched + len(alignment.missed)
        larger = max(n_actual, n_expected)
        if larger == 0:
            return 1.0
        return min(n_actual, n_expected) / larger
@@ -0,0 +1,48 @@
1
+ from __future__ import annotations
2
+
3
+ from typing import TYPE_CHECKING, Any
4
+
5
+ from structured_eval.metrics.base import ArrayMetric
6
+
7
+ if TYPE_CHECKING:
8
+ from structured_eval.models.nodes.array_node import ArrayNode
9
+
10
+
11
class ArrayExactMatch(ArrayMetric):
    """All-or-nothing array equality: 1.0 for identical lists, otherwise 0.0.

    The raw ``actual`` / ``expected`` lists are compared position by position,
    **order-sensitively**, descending into nested dicts and lists. Values must
    also have the same concrete type to count as equal. There is no alignment
    and no partial credit; for set-style or value-aware scoring use
    :class:`ArrayJaccardSimilarity` or the aligned ``Array*`` P/R/F1 metrics.
    """

    name = "array_exact_match"

    def compute(self, node: ArrayNode) -> float:
        """Score the node's raw ``actual`` list against its ``expected`` list."""
        return self.score(node.actual, node.expected)

    def score(self, actual: Any, expected: Any) -> float:
        """Return 1.0 when both values are deeply equal lists, else 0.0."""
        if self._array_equal(actual, expected):
            return 1.0
        return 0.0

    def _array_equal(self, a: Any, b: Any) -> bool:
        """Order-sensitive list comparison; non-lists never compare equal."""
        if not isinstance(a, list) or not isinstance(b, list):
            return False
        if len(a) != len(b):
            return False
        for left, right in zip(a, b, strict=False):
            if not self._deep_equal(left, right):
                return False
        return True

    def _deep_equal(self, a: Any, b: Any) -> bool:
        """Recursive equality that also requires identical concrete types."""
        if type(a) is not type(b):
            return False
        if isinstance(a, list):
            return self._array_equal(a, b)
        if isinstance(a, dict):
            if a.keys() != b.keys():
                return False
            for k in a:
                if not self._deep_equal(a[k], b[k]):
                    return False
            return True
        return bool(a == b)
@@ -0,0 +1,34 @@
1
+ from __future__ import annotations
2
+
3
+ from typing import TYPE_CHECKING
4
+
5
+ from structured_eval.metrics.base import ArrayMetric
6
+ from structured_eval.metrics.utils import array as astats
7
+ from structured_eval.metrics.utils import calculate as stats
8
+
9
+ if TYPE_CHECKING:
10
+ from structured_eval.models.nodes.array_node import ArrayNode
11
+
12
+
13
class ArrayF1(ArrayMetric):
    """F1 of array precision and recall over aligned elements.

    ``threshold`` is handed to ``astats.verdicts`` and ``mode`` to
    ``stats.prf_counts``; both behave as for ``ArrayPrecision``.
    """

    name = "array_f1"

    def __init__(
        self, threshold: float = 1.0, mode: stats.GradingMode = stats.GradingMode.HARD
    ):
        """Store the threshold and coerce ``mode`` to a ``GradingMode``."""
        self.threshold = threshold
        self.mode = stats.GradingMode(mode)

    def compute(self, node: ArrayNode) -> float:
        """Return ``stats.f1`` of the node's precision and recall."""
        missing, spurious = astats.missing_spurious(node)
        item_verdicts = astats.verdicts(node, self.threshold)
        tp, predicted, expected = stats.prf_counts(
            item_verdicts, missing, spurious, self.mode
        )
        precision = stats.precision(tp, predicted, expected)
        recall = stats.recall(tp, predicted, expected)
        return stats.f1(precision, recall)
@@ -0,0 +1,60 @@
1
+ from __future__ import annotations
2
+
3
+ import json
4
+ from typing import TYPE_CHECKING, Any
5
+
6
+ from structured_eval.metrics.base import ArrayMetric
7
+
8
+ if TYPE_CHECKING:
9
+ from structured_eval.models.nodes.array_node import ArrayNode
10
+
11
+
12
+ def _member(value: Any) -> Any:
13
+ """A hashable, comparison-stable set key for one element.
14
+
15
+ Scalars are used as-is; an unhashable element (dict/list) is keyed by its
16
+ canonical JSON so set membership still works without a TypeError.
17
+ """
18
+ if isinstance(value, (dict, list)):
19
+ return json.dumps(value, sort_keys=True, default=str)
20
+ return value
21
+
22
+
23
class ArrayJaccardSimilarity(ArrayMetric):
    """Jaccard set overlap for arrays; ignores order and duplicate counts.

    Both lists are reduced to **sets** and scored as ``|A ∩ B| / |A ∪ B|``:

    - ``1.0`` when the sets are identical (two empty sides → vacuously ``1.0``);
    - ``0.0`` when nothing overlaps (or exactly one side is empty);
    - a value in ``(0, 1)`` otherwise.

    Built for arrays of scalars — tags, labels, categories. Membership is exact
    equality (no partial credit); for value-aware element matching use the
    aligned ``Array*`` P/R/F1 metrics instead.
    """

    name = "array_jaccard_similarity"

    def compute(self, node: ArrayNode) -> float:
        """Score the node's raw ``actual`` values against its ``expected`` ones."""
        return self.score(node.actual, node.expected)

    def score(self, actual: Any, expected: Any) -> float:
        """Return the Jaccard similarity of the two values viewed as sets."""
        actual_set = self._to_set(actual)
        expected_set = self._to_set(expected)

        if not (actual_set or expected_set):
            return 1.0
        if not (actual_set and expected_set):
            return 0.0

        shared = actual_set & expected_set
        combined = actual_set | expected_set
        return len(shared) / len(combined)

    def _to_set(self, value: Any) -> set[Any]:
        """Turn a value into a set of hashable members.

        ``None`` becomes the empty set, a set/list/tuple contributes each of
        its items, and any other value becomes a one-element set.
        """
        if value is None:
            return set()
        members = value if isinstance(value, (set, list, tuple)) else (value,)
        return {_member(entry) for entry in members}
@@ -0,0 +1,34 @@
1
+ from __future__ import annotations
2
+
3
+ from typing import TYPE_CHECKING
4
+
5
+ from structured_eval.metrics.base import ArrayMetric
6
+ from structured_eval.metrics.utils import array as astats
7
+ from structured_eval.metrics.utils import calculate as stats
8
+
9
+ if TYPE_CHECKING:
10
+ from structured_eval.models.nodes.array_node import ArrayNode
11
+
12
+
13
class ArrayPrecision(ArrayMetric):
    """Precision, TP / (TP + FP), over aligned array elements.

    An aligned item counts as a TP when its ``element_score`` clears
    ``threshold`` (``mode="soft"`` adds the score fractionally instead), and
    ``spurious`` items count as FP. A wrong-but-aligned element therefore
    lowers precision.
    """

    name = "array_precision"

    def __init__(
        self, threshold: float = 1.0, mode: stats.GradingMode = stats.GradingMode.HARD
    ):
        """Store the threshold and coerce ``mode`` to a ``GradingMode``."""
        self.threshold = threshold
        self.mode = stats.GradingMode(mode)

    def compute(self, node: ArrayNode) -> float:
        """Return ``stats.precision`` for the counts derived from ``node``."""
        missing, spurious = astats.missing_spurious(node)
        item_verdicts = astats.verdicts(node, self.threshold)
        tp, predicted, expected = stats.prf_counts(
            item_verdicts, missing, spurious, self.mode
        )
        return stats.precision(tp, predicted, expected)
@@ -0,0 +1,40 @@
1
+ from __future__ import annotations
2
+
3
+ from typing import TYPE_CHECKING
4
+
5
+ from structured_eval.metrics.base import ArrayMetric
6
+ from structured_eval.metrics.utils import array as astats
7
+ from structured_eval.metrics.utils import calculate as stats
8
+
9
+ if TYPE_CHECKING:
10
+ from structured_eval.models.nodes.array_node import ArrayNode
11
+
12
+
13
class ArrayPRF1(ArrayMetric):
    """Array precision, recall and F1 computed together in a single pass.

    ``compute`` returns a dict keyed ``array_precision`` / ``array_recall`` /
    ``array_f1``; the engine writes those keys into ``report.metrics``
    directly. Threshold and ``mode`` behave as for ``ArrayPrecision``.
    """

    name = "array_prf1"

    def __init__(
        self, threshold: float = 1.0, mode: stats.GradingMode = stats.GradingMode.HARD
    ):
        """Store the threshold and coerce ``mode`` to a ``GradingMode``."""
        self.threshold = threshold
        self.mode = stats.GradingMode(mode)

    def compute(self, node: ArrayNode) -> dict[str, float]:
        """Return precision, recall and F1 for ``node`` as one dict."""
        missing, spurious = astats.missing_spurious(node)
        item_verdicts = astats.verdicts(node, self.threshold)
        tp, predicted, expected = stats.prf_counts(
            item_verdicts, missing, spurious, self.mode
        )
        precision = stats.precision(tp, predicted, expected)
        recall = stats.recall(tp, predicted, expected)
        f1 = stats.f1(precision, recall)
        return {
            "array_precision": precision,
            "array_recall": recall,
            "array_f1": f1,
        }
@@ -0,0 +1,33 @@
1
+ from __future__ import annotations
2
+
3
+ from typing import TYPE_CHECKING
4
+
5
+ from structured_eval.metrics.base import ArrayMetric
6
+ from structured_eval.metrics.utils import array as astats
7
+ from structured_eval.metrics.utils import calculate as stats
8
+
9
+ if TYPE_CHECKING:
10
+ from structured_eval.models.nodes.array_node import ArrayNode
11
+
12
+
13
class ArrayRecall(ArrayMetric):
    """Recall, TP / (TP + FN), over aligned array elements.

    ``missed`` expected items count as FN; threshold and ``mode`` behave as for
    ``ArrayPrecision``.
    """

    name = "array_recall"

    def __init__(
        self, threshold: float = 1.0, mode: stats.GradingMode = stats.GradingMode.HARD
    ):
        """Store the threshold and coerce ``mode`` to a ``GradingMode``."""
        self.threshold = threshold
        self.mode = stats.GradingMode(mode)

    def compute(self, node: ArrayNode) -> float:
        """Return ``stats.recall`` for the counts derived from ``node``."""
        missing, spurious = astats.missing_spurious(node)
        item_verdicts = astats.verdicts(node, self.threshold)
        tp, predicted, expected = stats.prf_counts(
            item_verdicts, missing, spurious, self.mode
        )
        return stats.recall(tp, predicted, expected)