structured-eval 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- structured_eval/__init__.py +27 -0
- structured_eval/alignment/__init__.py +15 -0
- structured_eval/alignment/base.py +40 -0
- structured_eval/alignment/by_index.py +24 -0
- structured_eval/alignment/by_key.py +73 -0
- structured_eval/alignment/factory.py +28 -0
- structured_eval/alignment/hungarian.py +156 -0
- structured_eval/api.py +79 -0
- structured_eval/engine/__init__.py +15 -0
- structured_eval/engine/aggregator.py +96 -0
- structured_eval/engine/evaluator.py +72 -0
- structured_eval/engine/metric_runner.py +69 -0
- structured_eval/engine/parser.py +42 -0
- structured_eval/engine/report_builder.py +68 -0
- structured_eval/engine/tree_builder.py +319 -0
- structured_eval/formats/__init__.py +5 -0
- structured_eval/formats/base.py +19 -0
- structured_eval/formats/json_parser.py +44 -0
- structured_eval/formats/yaml_parser.py +24 -0
- structured_eval/integrations/__init__.py +11 -0
- structured_eval/integrations/_adapter.py +47 -0
- structured_eval/integrations/deepeval.py +74 -0
- structured_eval/integrations/langsmith.py +90 -0
- structured_eval/metrics/__init__.py +101 -0
- structured_eval/metrics/array_accuracy.py +28 -0
- structured_eval/metrics/array_cardinality.py +27 -0
- structured_eval/metrics/array_exact_match.py +48 -0
- structured_eval/metrics/array_f1.py +34 -0
- structured_eval/metrics/array_jaccard_similarity.py +60 -0
- structured_eval/metrics/array_precision.py +34 -0
- structured_eval/metrics/array_prf1.py +40 -0
- structured_eval/metrics/array_recall.py +33 -0
- structured_eval/metrics/base.py +144 -0
- structured_eval/metrics/character_f1.py +50 -0
- structured_eval/metrics/composite_score.py +46 -0
- structured_eval/metrics/coverage_leaf_score.py +29 -0
- structured_eval/metrics/date_distance_score.py +63 -0
- structured_eval/metrics/exact.py +21 -0
- structured_eval/metrics/exponential_numeric_score.py +47 -0
- structured_eval/metrics/field_faithfulness.py +38 -0
- structured_eval/metrics/fuzzy.py +64 -0
- structured_eval/metrics/invoker.py +90 -0
- structured_eval/metrics/levenshtein.py +16 -0
- structured_eval/metrics/mean_score.py +31 -0
- structured_eval/metrics/numeric.py +83 -0
- structured_eval/metrics/numeric_closeness.py +35 -0
- structured_eval/metrics/object_accuracy.py +47 -0
- structured_eval/metrics/object_exact_match.py +41 -0
- structured_eval/metrics/object_f1.py +47 -0
- structured_eval/metrics/object_precision.py +49 -0
- structured_eval/metrics/object_prf1.py +51 -0
- structured_eval/metrics/object_recall.py +44 -0
- structured_eval/metrics/object_type_validity.py +34 -0
- structured_eval/metrics/overall_leaf_score.py +32 -0
- structured_eval/metrics/presence.py +22 -0
- structured_eval/metrics/regex_match.py +51 -0
- structured_eval/metrics/rule_pass_rate/__init__.py +5 -0
- structured_eval/metrics/rule_pass_rate/dsl.py +224 -0
- structured_eval/metrics/rule_pass_rate/engine.py +24 -0
- structured_eval/metrics/rule_pass_rate/metric.py +33 -0
- structured_eval/metrics/schema_validity/__init__.py +7 -0
- structured_eval/metrics/schema_validity/metric.py +35 -0
- structured_eval/metrics/schema_validity/validator.py +119 -0
- structured_eval/metrics/structural_similarity.py +40 -0
- structured_eval/metrics/token_f1.py +44 -0
- structured_eval/metrics/type_match.py +35 -0
- structured_eval/metrics/utils/__init__.py +10 -0
- structured_eval/metrics/utils/array.py +31 -0
- structured_eval/metrics/utils/calculate.py +72 -0
- structured_eval/metrics/utils/number.py +46 -0
- structured_eval/metrics/utils/object_utils.py +87 -0
- structured_eval/models/__init__.py +72 -0
- structured_eval/models/config.py +124 -0
- structured_eval/models/context.py +25 -0
- structured_eval/models/metric_result.py +121 -0
- structured_eval/models/nodes/__init__.py +13 -0
- structured_eval/models/nodes/array_node.py +32 -0
- structured_eval/models/nodes/base.py +113 -0
- structured_eval/models/nodes/object_node.py +19 -0
- structured_eval/models/nodes/scalar.py +14 -0
- structured_eval/models/result.py +361 -0
- structured_eval/models/sample.py +19 -0
- structured_eval/py.typed +0 -0
- structured_eval/reporting/__init__.py +5 -0
- structured_eval/reporting/console.py +194 -0
- structured_eval/utils/__init__.py +16 -0
- structured_eval/utils/flatten.py +66 -0
- structured_eval/utils/paths.py +58 -0
- structured_eval/utils/structured_diff.py +159 -0
- structured_eval-0.1.0.dist-info/METADATA +322 -0
- structured_eval-0.1.0.dist-info/RECORD +94 -0
- structured_eval-0.1.0.dist-info/WHEEL +5 -0
- structured_eval-0.1.0.dist-info/licenses/LICENSE +201 -0
- structured_eval-0.1.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,74 @@
|
|
|
1
|
+
"""deepeval adapter: structured-eval as a ``BaseMetric``.
|
|
2
|
+
|
|
3
|
+
Usage (requires ``structured-eval[deepeval]``)::
|
|
4
|
+
|
|
5
|
+
from structured_eval.integrations.deepeval import StructuredMetric
|
|
6
|
+
from deepeval import assert_test
|
|
7
|
+
from deepeval.test_case import LLMTestCase
|
|
8
|
+
|
|
9
|
+
metric = StructuredMetric(config=cfg, threshold=0.85)
|
|
10
|
+
assert_test(LLMTestCase(input=..., actual_output=raw, expected_output=ref), [metric])
|
|
11
|
+
|
|
12
|
+
``actual_output``/``expected_output`` may be JSON strings or already-parsed
|
|
13
|
+
objects — ``evaluate`` handles both. ``report.score`` becomes ``metric.score``;
|
|
14
|
+
failing fields are summarised into ``metric.reason``. Importing this module
|
|
15
|
+
requires deepeval to be installed (the ``[deepeval]`` extra).
|
|
16
|
+
"""
|
|
17
|
+
|
|
18
|
+
from __future__ import annotations
|
|
19
|
+
|
|
20
|
+
from typing import TYPE_CHECKING, Any
|
|
21
|
+
|
|
22
|
+
try:
|
|
23
|
+
from deepeval.metrics import BaseMetric
|
|
24
|
+
except ImportError as exc: # pragma: no cover
|
|
25
|
+
raise ImportError(
|
|
26
|
+
"deepeval is required for this integration. "
|
|
27
|
+
"Install it with: pip install structured-eval[deepeval]"
|
|
28
|
+
) from exc
|
|
29
|
+
|
|
30
|
+
from structured_eval.api import evaluate
|
|
31
|
+
from structured_eval.integrations._adapter import verdict
|
|
32
|
+
from structured_eval.models.config import EvalConfig
|
|
33
|
+
|
|
34
|
+
if TYPE_CHECKING:
|
|
35
|
+
from structured_eval.models.result import EvalReport
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
class StructuredMetric(BaseMetric):
    """deepeval metric that grades structured output field by field.

    Each ``measure`` call runs ``evaluate`` on the test case's
    ``actual_output`` / ``expected_output`` with the stored config, keeps the
    resulting report on ``self.report`` and mirrors the verdict into
    ``score``, ``success`` and ``reason``.
    """

    def __init__(
        self,
        config: EvalConfig | None = None,
        threshold: float = 0.5,
        *,
        include_reason: bool = True,
    ) -> None:
        # A falsy ``config`` is replaced by a default ``EvalConfig``.
        self.config = config if config else EvalConfig()
        self.threshold = threshold
        self.include_reason = include_reason
        # Result slots, overwritten by every ``measure`` call.
        self.score: float = 0.0
        self.success: bool = False
        self.reason: str | None = None
        self.report: EvalReport | None = None

    def measure(self, test_case: Any, *args: Any, **kwargs: Any) -> float:
        """Evaluate ``test_case`` and return the resulting score.

        Extra positional/keyword arguments are accepted but not used.
        """
        report = evaluate(
            test_case.actual_output, test_case.expected_output, self.config
        )
        self.report = report
        raw_score, passed, explanation = verdict(report, self.threshold)
        # ``verdict`` may hand back ``None`` as the score; store 0.0 instead.
        self.score = raw_score if raw_score is not None else 0.0
        self.success = passed
        # The reason is only kept when the caller asked for it.
        self.reason = explanation if self.include_reason else None
        return self.score

    async def a_measure(self, test_case: Any, *args: Any, **kwargs: Any) -> float:
        """Async entry point; delegates to the synchronous ``measure``."""
        return self.measure(test_case, *args, **kwargs)

    def is_successful(self) -> bool:
        """Return the ``success`` flag set by the latest ``measure`` call."""
        return self.success

    @property
    def __name__(self) -> str:  # shown in deepeval output
        return "Structured Eval"
|
|
@@ -0,0 +1,90 @@
|
|
|
1
|
+
"""LangSmith adapter: structured-eval as an evaluator function.
|
|
2
|
+
|
|
3
|
+
Usage (requires ``structured-eval[langsmith]``)::
|
|
4
|
+
|
|
5
|
+
from langsmith import evaluate
|
|
6
|
+
from structured_eval.integrations.langsmith import structured_evaluator
|
|
7
|
+
|
|
8
|
+
evaluator = structured_evaluator(config=cfg, threshold=0.85)
|
|
9
|
+
evaluate(target, data=dataset, evaluators=[evaluator])
|
|
10
|
+
|
|
11
|
+
The returned callable follows LangSmith's ``(run, example) -> dict`` contract and
|
|
12
|
+
emits a single feedback key with ``report.score`` plus a ``comment`` summarising
|
|
13
|
+
failures. By default the actual output is read from ``run.outputs`` and the
|
|
14
|
+
reference from ``example.outputs``; pass ``extract_actual``/``extract_expected``
|
|
15
|
+
to point at a nested field or adapt a different object shape.
|
|
16
|
+
"""
|
|
17
|
+
|
|
18
|
+
from __future__ import annotations
|
|
19
|
+
|
|
20
|
+
from collections.abc import Callable
|
|
21
|
+
from typing import Any
|
|
22
|
+
|
|
23
|
+
from structured_eval.api import evaluate
|
|
24
|
+
from structured_eval.integrations._adapter import verdict
|
|
25
|
+
from structured_eval.models.config import EvalConfig
|
|
26
|
+
from structured_eval.models.result import EvalReport
|
|
27
|
+
|
|
28
|
+
# Signature of the ``extract_actual`` / ``extract_expected`` hooks: a callable
# that takes one object (the run or the example) and returns the payload to
# evaluate.
Extractor = Callable[[Any], Any]
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
def _outputs(obj: Any) -> Any:
|
|
32
|
+
"""Default extraction: the ``outputs`` payload of a run/example."""
|
|
33
|
+
if obj is None:
|
|
34
|
+
return None
|
|
35
|
+
if isinstance(obj, dict):
|
|
36
|
+
return obj.get("outputs", obj)
|
|
37
|
+
return getattr(obj, "outputs", obj)
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
class StructuredEvaluator:
    """Callable LangSmith evaluator scoring structured outputs field by field.

    Calling an instance with ``(run, example)`` returns a feedback dict with
    the keys ``"key"``, ``"score"`` and ``"comment"``. ``key`` is the feedback
    key placed in that dict. ``threshold`` is passed to ``verdict``; the
    pass/fail flag it yields is discarded, so the threshold can only influence
    the ``comment`` while the numeric score is returned as-is.
    ``extract_actual`` / ``extract_expected`` adapt the run/example shape and
    default to reading their ``outputs`` payload.
    """

    def __init__(
        self,
        config: EvalConfig | None = None,
        *,
        key: str = "structured_eval",
        threshold: float = 0.5,
        extract_actual: Extractor | None = None,
        extract_expected: Extractor | None = None,
    ) -> None:
        # Falsy arguments fall back to the defaults.
        self.config = config if config else EvalConfig()
        self.key = key
        self.threshold = threshold
        self._get_actual = extract_actual if extract_actual else _outputs
        self._get_expected = extract_expected if extract_expected else _outputs
        # Give the instance a function-like ``__name__`` equal to the key.
        self.__name__ = key

    def __call__(self, run: Any, example: Any) -> dict[str, Any]:
        """Evaluate one run against its example and build the feedback dict."""
        actual = self._get_actual(run)
        expected = self._get_expected(example)
        report = evaluate(actual, expected, self.config)
        assert isinstance(report, EvalReport)  # single-document evaluation
        score, _passed, comment = verdict(report, self.threshold)
        return {"key": self.key, "score": score, "comment": comment}
|
|
73
|
+
|
|
74
|
+
|
|
75
|
+
def structured_evaluator(
    config: EvalConfig | None = None,
    *,
    key: str = "structured_eval",
    threshold: float = 0.5,
    extract_actual: Extractor | None = None,
    extract_expected: Extractor | None = None,
) -> StructuredEvaluator:
    """Build a ``StructuredEvaluator``, forwarding every argument unchanged."""
    evaluator = StructuredEvaluator(
        config,
        key=key,
        threshold=threshold,
        extract_actual=extract_actual,
        extract_expected=extract_expected,
    )
    return evaluator
|
|
@@ -0,0 +1,101 @@
|
|
|
1
|
+
from structured_eval.metrics.array_accuracy import ArrayAccuracy
|
|
2
|
+
from structured_eval.metrics.array_cardinality import ArrayCardinality
|
|
3
|
+
from structured_eval.metrics.array_exact_match import ArrayExactMatch
|
|
4
|
+
from structured_eval.metrics.array_f1 import ArrayF1
|
|
5
|
+
from structured_eval.metrics.array_jaccard_similarity import ArrayJaccardSimilarity
|
|
6
|
+
from structured_eval.metrics.array_precision import ArrayPrecision
|
|
7
|
+
from structured_eval.metrics.array_prf1 import ArrayPRF1
|
|
8
|
+
from structured_eval.metrics.array_recall import ArrayRecall
|
|
9
|
+
from structured_eval.metrics.base import (
|
|
10
|
+
ArrayMetric,
|
|
11
|
+
BaseMetric,
|
|
12
|
+
FieldMetric,
|
|
13
|
+
GenericMetric,
|
|
14
|
+
Metric,
|
|
15
|
+
ObjectMetric,
|
|
16
|
+
RootMetric,
|
|
17
|
+
get_metric_class,
|
|
18
|
+
resolve_metric,
|
|
19
|
+
)
|
|
20
|
+
from structured_eval.metrics.character_f1 import CharacterF1
|
|
21
|
+
from structured_eval.metrics.composite_score import CompositeScore
|
|
22
|
+
from structured_eval.metrics.coverage_leaf_score import CoverageLeafScore
|
|
23
|
+
from structured_eval.metrics.date_distance_score import DateDistanceScore
|
|
24
|
+
from structured_eval.metrics.exact import ExactMatch
|
|
25
|
+
from structured_eval.metrics.exponential_numeric_score import ExponentialNumericScore
|
|
26
|
+
from structured_eval.metrics.field_faithfulness import FieldFaithfulness
|
|
27
|
+
from structured_eval.metrics.fuzzy import Fuzzy
|
|
28
|
+
from structured_eval.metrics.levenshtein import Levenshtein
|
|
29
|
+
from structured_eval.metrics.mean_score import MeanScore
|
|
30
|
+
from structured_eval.metrics.numeric import Numeric
|
|
31
|
+
from structured_eval.metrics.numeric_closeness import NumericCloseness
|
|
32
|
+
from structured_eval.metrics.object_accuracy import ObjectAccuracy
|
|
33
|
+
from structured_eval.metrics.object_exact_match import ObjectExactMatch
|
|
34
|
+
from structured_eval.metrics.object_f1 import ObjectF1
|
|
35
|
+
from structured_eval.metrics.object_precision import ObjectPrecision
|
|
36
|
+
from structured_eval.metrics.object_prf1 import ObjectPRF1
|
|
37
|
+
from structured_eval.metrics.object_recall import ObjectRecall
|
|
38
|
+
from structured_eval.metrics.object_type_validity import ObjectTypeValidity
|
|
39
|
+
from structured_eval.metrics.overall_leaf_score import OverallLeafScore
|
|
40
|
+
from structured_eval.metrics.presence import Presence
|
|
41
|
+
from structured_eval.metrics.regex_match import RegexMatch
|
|
42
|
+
from structured_eval.metrics.rule_pass_rate import Rule, RulePassRate
|
|
43
|
+
from structured_eval.metrics.schema_validity import SchemaValidity
|
|
44
|
+
from structured_eval.metrics.structural_similarity import StructuralSimilarity
|
|
45
|
+
from structured_eval.metrics.token_f1 import TokenF1
|
|
46
|
+
from structured_eval.metrics.type_match import TypeMatch
|
|
47
|
+
|
|
48
|
+
# Public API of this package: the metric base hierarchy, the array, object,
# field, any-node and root metrics, the rules DSL, and the lookup helpers.
# The names are kept in one ASCII-sorted run (uppercase before lowercase), so
# they are not grouped by metric family inside the list.
__all__ = [
    "ArrayAccuracy",
    "ArrayCardinality",
    "ArrayExactMatch",
    "ArrayF1",
    "ArrayJaccardSimilarity",
    "ArrayMetric",
    "ArrayPRF1",
    "ArrayPrecision",
    "ArrayRecall",
    "BaseMetric",
    "CharacterF1",
    "CompositeScore",
    "CoverageLeafScore",
    "DateDistanceScore",
    "ExactMatch",
    "ExponentialNumericScore",
    "FieldFaithfulness",
    "FieldMetric",
    "Fuzzy",
    "GenericMetric",
    "Levenshtein",
    "MeanScore",
    "Metric",
    "Numeric",
    "NumericCloseness",
    "ObjectAccuracy",
    "ObjectExactMatch",
    "ObjectF1",
    "ObjectMetric",
    "ObjectPRF1",
    "ObjectPrecision",
    "ObjectRecall",
    "ObjectTypeValidity",
    "OverallLeafScore",
    "Presence",
    "RegexMatch",
    "RootMetric",
    "Rule",
    "RulePassRate",
    "SchemaValidity",
    "StructuralSimilarity",
    "TokenF1",
    "TypeMatch",
    "get_metric_class",
    "resolve_metric",
]
|
|
@@ -0,0 +1,28 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from typing import TYPE_CHECKING
|
|
4
|
+
|
|
5
|
+
from structured_eval.metrics.base import ArrayMetric
|
|
6
|
+
|
|
7
|
+
if TYPE_CHECKING:
|
|
8
|
+
from structured_eval.models.nodes.array_node import ArrayNode
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
class ArrayAccuracy(ArrayMetric):
    """Soft mean of element scores over the aligned items.

    Sums each item's ``representative`` score and divides by
    ``len(items) + len(missed)``, so every missed expected item contributes
    0.0 to the mean. When that denominator is zero (no items and nothing
    missed) the result is vacuously 1.0; an array with only missed items
    scores 0.0. Described by its authors as the default array metric and the
    array branch of the old ``structural_score``.
    """

    name = "array_accuracy"

    def compute(self, node: ArrayNode) -> float:
        """Return the mean representative score for ``node``."""
        alignment = node.match_result
        # Without a (truthy) match result there is nothing recorded as missed.
        missed_count = len(alignment.missed) if alignment else 0
        total = len(node.items) + missed_count
        if total == 0:
            return 1.0
        score_sum = sum(item.representative for item in node.items)
        return score_sum / total
|
|
@@ -0,0 +1,27 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from typing import TYPE_CHECKING
|
|
4
|
+
|
|
5
|
+
from structured_eval.metrics.base import ArrayMetric
|
|
6
|
+
|
|
7
|
+
if TYPE_CHECKING:
|
|
8
|
+
from structured_eval.models.nodes.array_node import ArrayNode
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
class ArrayCardinality(ArrayMetric):
    """Length agreement between the actual and expected arrays.

    Computes ``min(|actual|, |expected|) / max(|actual|, |expected|)`` from the
    alignment counts only, so element correctness plays no role. Two empty
    arrays, or a node without a match result, score 1.0.
    """

    name = "array_cardinality"

    def compute(self, node: ArrayNode) -> float:
        """Return the length ratio for ``node``."""
        alignment = node.match_result
        if alignment is None:
            return 1.0
        n_matched = len(alignment.matched)
        # actual = matched + spurious, expected = matched + missed
        n_actual = n_matched + len(alignment.spurious)
        n_expected = n_matched + len(alignment.missed)
        larger = max(n_actual, n_expected)
        if larger == 0:
            return 1.0
        return min(n_actual, n_expected) / larger
|
|
@@ -0,0 +1,48 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from typing import TYPE_CHECKING, Any
|
|
4
|
+
|
|
5
|
+
from structured_eval.metrics.base import ArrayMetric
|
|
6
|
+
|
|
7
|
+
if TYPE_CHECKING:
|
|
8
|
+
from structured_eval.models.nodes.array_node import ArrayNode
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
class ArrayExactMatch(ArrayMetric):
    """All-or-nothing array equality: 1.0 for identical lists, otherwise 0.0.

    The raw ``actual`` and ``expected`` lists are walked position by position,
    so the comparison is **order-sensitive**, and nested dicts/lists are
    compared recursively. There is no alignment and no partial credit. Choose
    it when element order is part of correctness; for set-style or value-aware
    scoring see :class:`ArrayJaccardSimilarity` or the aligned ``Array*``
    P/R/F1 metrics.
    """

    name = "array_exact_match"

    def compute(self, node: ArrayNode) -> float:
        """Score the node's raw ``actual`` list against its ``expected`` list."""
        return self.score(node.actual, node.expected)

    def score(self, actual: Any, expected: Any) -> float:
        """Return 1.0 when both values are equal lists, else 0.0."""
        if self._array_equal(actual, expected):
            return 1.0
        return 0.0

    def _array_equal(self, a: Any, b: Any) -> bool:
        """Position-wise list comparison; non-lists never compare equal."""
        if not isinstance(a, list) or not isinstance(b, list):
            return False
        if len(a) != len(b):
            return False
        for left, right in zip(a, b, strict=False):
            if not self._deep_equal(left, right):
                return False
        return True

    def _deep_equal(self, a: Any, b: Any) -> bool:
        """Recursive equality that also requires identical concrete types."""
        # Exact type identity: e.g. ``1`` and ``1.0`` are treated as different.
        if type(a) is not type(b):
            return False
        if isinstance(a, dict):
            if a.keys() != b.keys():
                return False
            for key in a:
                if not self._deep_equal(a[key], b[key]):
                    return False
            return True
        if isinstance(a, list):
            return self._array_equal(a, b)
        return bool(a == b)
|
|
@@ -0,0 +1,34 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from typing import TYPE_CHECKING
|
|
4
|
+
|
|
5
|
+
from structured_eval.metrics.base import ArrayMetric
|
|
6
|
+
from structured_eval.metrics.utils import array as astats
|
|
7
|
+
from structured_eval.metrics.utils import calculate as stats
|
|
8
|
+
|
|
9
|
+
if TYPE_CHECKING:
|
|
10
|
+
from structured_eval.models.nodes.array_node import ArrayNode
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
class ArrayF1(ArrayMetric):
    """F1 (harmonic mean of precision and recall) over aligned array elements.

    ``threshold`` is handed to ``verdicts`` and ``mode`` to ``prf_counts``;
    both carry the same meaning as on ``ArrayPrecision``.
    """

    name = "array_f1"

    def __init__(
        self, threshold: float = 1.0, mode: stats.GradingMode = stats.GradingMode.HARD
    ):
        self.threshold = threshold
        # Coerce through the enum so a raw enum value is accepted as well.
        self.mode = stats.GradingMode(mode)

    def compute(self, node: ArrayNode) -> float:
        """Return the F1 score for ``node``."""
        missed, extra = astats.missing_spurious(node)
        item_verdicts = astats.verdicts(node, self.threshold)
        hits, n_predicted, n_expected = stats.prf_counts(
            item_verdicts, missed, extra, self.mode
        )
        precision = stats.precision(hits, n_predicted, n_expected)
        recall = stats.recall(hits, n_predicted, n_expected)
        return stats.f1(precision, recall)
|
|
@@ -0,0 +1,60 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import json
|
|
4
|
+
from typing import TYPE_CHECKING, Any
|
|
5
|
+
|
|
6
|
+
from structured_eval.metrics.base import ArrayMetric
|
|
7
|
+
|
|
8
|
+
if TYPE_CHECKING:
|
|
9
|
+
from structured_eval.models.nodes.array_node import ArrayNode
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
def _member(value: Any) -> Any:
|
|
13
|
+
"""A hashable, comparison-stable set key for one element.
|
|
14
|
+
|
|
15
|
+
Scalars are used as-is; an unhashable element (dict/list) is keyed by its
|
|
16
|
+
canonical JSON so set membership still works without a TypeError.
|
|
17
|
+
"""
|
|
18
|
+
if isinstance(value, (dict, list)):
|
|
19
|
+
return json.dumps(value, sort_keys=True, default=str)
|
|
20
|
+
return value
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
class ArrayJaccardSimilarity(ArrayMetric):
    """Jaccard set overlap between two arrays; ignores order and duplicates.

    Both lists are reduced to **sets** and scored as ``|A ∩ B| / |A ∪ B|``:

    - ``1.0`` for identical sets (two empty sets count as identical);
    - ``0.0`` when nothing is shared, including when only one side is empty;
    - a value strictly between 0 and 1 otherwise.

    Intended for arrays of scalars such as tags, labels or categories.
    Membership is exact equality with no partial credit; for value-aware
    element matching use the aligned ``Array*`` P/R/F1 metrics instead.
    """

    name = "array_jaccard_similarity"

    def compute(self, node: ArrayNode) -> float:
        """Score the node's raw ``actual`` value against its ``expected`` one."""
        return self.score(node.actual, node.expected)

    def score(self, actual: Any, expected: Any) -> float:
        """Return the Jaccard similarity of the two values viewed as sets."""
        actual_set = self._to_set(actual)
        expected_set = self._to_set(expected)

        if not (actual_set or expected_set):
            return 1.0
        if not (actual_set and expected_set):
            return 0.0

        shared = actual_set.intersection(expected_set)
        combined = actual_set.union(expected_set)
        return len(shared) / len(combined)

    def _to_set(self, value: Any) -> set[Any]:
        """Build a set of hashable members from ``value``.

        ``None`` becomes the empty set, a set/list/tuple contributes each of
        its elements, and any other value becomes a one-element set.
        """
        if value is None:
            return set()
        elements = value if isinstance(value, (set, list, tuple)) else (value,)
        return {_member(element) for element in elements}
|
|
@@ -0,0 +1,34 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from typing import TYPE_CHECKING
|
|
4
|
+
|
|
5
|
+
from structured_eval.metrics.base import ArrayMetric
|
|
6
|
+
from structured_eval.metrics.utils import array as astats
|
|
7
|
+
from structured_eval.metrics.utils import calculate as stats
|
|
8
|
+
|
|
9
|
+
if TYPE_CHECKING:
|
|
10
|
+
from structured_eval.models.nodes.array_node import ArrayNode
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
class ArrayPrecision(ArrayMetric):
    """Precision, TP / (TP + FP), over aligned array elements.

    An aligned item counts as a true positive when its ``element_score``
    clears ``threshold``; with ``mode="soft"`` the score is added fractionally
    instead. ``spurious`` items are false positives, so an element that is
    aligned but wrong lowers precision.
    """

    name = "array_precision"

    def __init__(
        self, threshold: float = 1.0, mode: stats.GradingMode = stats.GradingMode.HARD
    ):
        self.threshold = threshold
        # Coerce through the enum so a raw enum value is accepted as well.
        self.mode = stats.GradingMode(mode)

    def compute(self, node: ArrayNode) -> float:
        """Return the precision for ``node``."""
        missed, extra = astats.missing_spurious(node)
        item_verdicts = astats.verdicts(node, self.threshold)
        hits, n_predicted, n_expected = stats.prf_counts(
            item_verdicts, missed, extra, self.mode
        )
        return stats.precision(hits, n_predicted, n_expected)
|
|
@@ -0,0 +1,40 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from typing import TYPE_CHECKING
|
|
4
|
+
|
|
5
|
+
from structured_eval.metrics.base import ArrayMetric
|
|
6
|
+
from structured_eval.metrics.utils import array as astats
|
|
7
|
+
from structured_eval.metrics.utils import calculate as stats
|
|
8
|
+
|
|
9
|
+
if TYPE_CHECKING:
|
|
10
|
+
from structured_eval.models.nodes.array_node import ArrayNode
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
class ArrayPRF1(ArrayMetric):
    """Array precision, recall and F1 computed together from one set of counts.

    ``compute`` returns a dict keyed ``array_precision`` / ``array_recall`` /
    ``array_f1``; per the original authors the engine writes those entries
    into ``report.metrics`` directly. ``threshold`` and ``mode`` carry the
    same meaning as on ``ArrayPrecision``.
    """

    name = "array_prf1"

    def __init__(
        self, threshold: float = 1.0, mode: stats.GradingMode = stats.GradingMode.HARD
    ):
        self.threshold = threshold
        # Coerce through the enum so a raw enum value is accepted as well.
        self.mode = stats.GradingMode(mode)

    def compute(self, node: ArrayNode) -> dict[str, float]:
        """Return precision, recall and F1 for ``node`` as a dict."""
        missed, extra = astats.missing_spurious(node)
        item_verdicts = astats.verdicts(node, self.threshold)
        hits, n_predicted, n_expected = stats.prf_counts(
            item_verdicts, missed, extra, self.mode
        )
        precision = stats.precision(hits, n_predicted, n_expected)
        recall = stats.recall(hits, n_predicted, n_expected)
        f_score = stats.f1(precision, recall)
        return {
            "array_precision": precision,
            "array_recall": recall,
            "array_f1": f_score,
        }
|
|
@@ -0,0 +1,33 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from typing import TYPE_CHECKING
|
|
4
|
+
|
|
5
|
+
from structured_eval.metrics.base import ArrayMetric
|
|
6
|
+
from structured_eval.metrics.utils import array as astats
|
|
7
|
+
from structured_eval.metrics.utils import calculate as stats
|
|
8
|
+
|
|
9
|
+
if TYPE_CHECKING:
|
|
10
|
+
from structured_eval.models.nodes.array_node import ArrayNode
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
class ArrayRecall(ArrayMetric):
    """Recall, TP / (TP + FN), over aligned array elements.

    ``missed`` expected items are false negatives. ``threshold`` and ``mode``
    carry the same meaning as on ``ArrayPrecision``.
    """

    name = "array_recall"

    def __init__(
        self, threshold: float = 1.0, mode: stats.GradingMode = stats.GradingMode.HARD
    ):
        self.threshold = threshold
        # Coerce through the enum so a raw enum value is accepted as well.
        self.mode = stats.GradingMode(mode)

    def compute(self, node: ArrayNode) -> float:
        """Return the recall for ``node``."""
        missed, extra = astats.missing_spurious(node)
        item_verdicts = astats.verdicts(node, self.threshold)
        hits, n_predicted, n_expected = stats.prf_counts(
            item_verdicts, missed, extra, self.mode
        )
        return stats.recall(hits, n_predicted, n_expected)
|