structured-eval 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (94) hide show
  1. structured_eval/__init__.py +27 -0
  2. structured_eval/alignment/__init__.py +15 -0
  3. structured_eval/alignment/base.py +40 -0
  4. structured_eval/alignment/by_index.py +24 -0
  5. structured_eval/alignment/by_key.py +73 -0
  6. structured_eval/alignment/factory.py +28 -0
  7. structured_eval/alignment/hungarian.py +156 -0
  8. structured_eval/api.py +79 -0
  9. structured_eval/engine/__init__.py +15 -0
  10. structured_eval/engine/aggregator.py +96 -0
  11. structured_eval/engine/evaluator.py +72 -0
  12. structured_eval/engine/metric_runner.py +69 -0
  13. structured_eval/engine/parser.py +42 -0
  14. structured_eval/engine/report_builder.py +68 -0
  15. structured_eval/engine/tree_builder.py +319 -0
  16. structured_eval/formats/__init__.py +5 -0
  17. structured_eval/formats/base.py +19 -0
  18. structured_eval/formats/json_parser.py +44 -0
  19. structured_eval/formats/yaml_parser.py +24 -0
  20. structured_eval/integrations/__init__.py +11 -0
  21. structured_eval/integrations/_adapter.py +47 -0
  22. structured_eval/integrations/deepeval.py +74 -0
  23. structured_eval/integrations/langsmith.py +90 -0
  24. structured_eval/metrics/__init__.py +101 -0
  25. structured_eval/metrics/array_accuracy.py +28 -0
  26. structured_eval/metrics/array_cardinality.py +27 -0
  27. structured_eval/metrics/array_exact_match.py +48 -0
  28. structured_eval/metrics/array_f1.py +34 -0
  29. structured_eval/metrics/array_jaccard_similarity.py +60 -0
  30. structured_eval/metrics/array_precision.py +34 -0
  31. structured_eval/metrics/array_prf1.py +40 -0
  32. structured_eval/metrics/array_recall.py +33 -0
  33. structured_eval/metrics/base.py +144 -0
  34. structured_eval/metrics/character_f1.py +50 -0
  35. structured_eval/metrics/composite_score.py +46 -0
  36. structured_eval/metrics/coverage_leaf_score.py +29 -0
  37. structured_eval/metrics/date_distance_score.py +63 -0
  38. structured_eval/metrics/exact.py +21 -0
  39. structured_eval/metrics/exponential_numeric_score.py +47 -0
  40. structured_eval/metrics/field_faithfulness.py +38 -0
  41. structured_eval/metrics/fuzzy.py +64 -0
  42. structured_eval/metrics/invoker.py +90 -0
  43. structured_eval/metrics/levenshtein.py +16 -0
  44. structured_eval/metrics/mean_score.py +31 -0
  45. structured_eval/metrics/numeric.py +83 -0
  46. structured_eval/metrics/numeric_closeness.py +35 -0
  47. structured_eval/metrics/object_accuracy.py +47 -0
  48. structured_eval/metrics/object_exact_match.py +41 -0
  49. structured_eval/metrics/object_f1.py +47 -0
  50. structured_eval/metrics/object_precision.py +49 -0
  51. structured_eval/metrics/object_prf1.py +51 -0
  52. structured_eval/metrics/object_recall.py +44 -0
  53. structured_eval/metrics/object_type_validity.py +34 -0
  54. structured_eval/metrics/overall_leaf_score.py +32 -0
  55. structured_eval/metrics/presence.py +22 -0
  56. structured_eval/metrics/regex_match.py +51 -0
  57. structured_eval/metrics/rule_pass_rate/__init__.py +5 -0
  58. structured_eval/metrics/rule_pass_rate/dsl.py +224 -0
  59. structured_eval/metrics/rule_pass_rate/engine.py +24 -0
  60. structured_eval/metrics/rule_pass_rate/metric.py +33 -0
  61. structured_eval/metrics/schema_validity/__init__.py +7 -0
  62. structured_eval/metrics/schema_validity/metric.py +35 -0
  63. structured_eval/metrics/schema_validity/validator.py +119 -0
  64. structured_eval/metrics/structural_similarity.py +40 -0
  65. structured_eval/metrics/token_f1.py +44 -0
  66. structured_eval/metrics/type_match.py +35 -0
  67. structured_eval/metrics/utils/__init__.py +10 -0
  68. structured_eval/metrics/utils/array.py +31 -0
  69. structured_eval/metrics/utils/calculate.py +72 -0
  70. structured_eval/metrics/utils/number.py +46 -0
  71. structured_eval/metrics/utils/object_utils.py +87 -0
  72. structured_eval/models/__init__.py +72 -0
  73. structured_eval/models/config.py +124 -0
  74. structured_eval/models/context.py +25 -0
  75. structured_eval/models/metric_result.py +121 -0
  76. structured_eval/models/nodes/__init__.py +13 -0
  77. structured_eval/models/nodes/array_node.py +32 -0
  78. structured_eval/models/nodes/base.py +113 -0
  79. structured_eval/models/nodes/object_node.py +19 -0
  80. structured_eval/models/nodes/scalar.py +14 -0
  81. structured_eval/models/result.py +361 -0
  82. structured_eval/models/sample.py +19 -0
  83. structured_eval/py.typed +0 -0
  84. structured_eval/reporting/__init__.py +5 -0
  85. structured_eval/reporting/console.py +194 -0
  86. structured_eval/utils/__init__.py +16 -0
  87. structured_eval/utils/flatten.py +66 -0
  88. structured_eval/utils/paths.py +58 -0
  89. structured_eval/utils/structured_diff.py +159 -0
  90. structured_eval-0.1.0.dist-info/METADATA +322 -0
  91. structured_eval-0.1.0.dist-info/RECORD +94 -0
  92. structured_eval-0.1.0.dist-info/WHEEL +5 -0
  93. structured_eval-0.1.0.dist-info/licenses/LICENSE +201 -0
  94. structured_eval-0.1.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,224 @@
1
+ from __future__ import annotations
2
+
3
+ import ast
4
+ import operator
5
+ import re
6
+ from typing import TYPE_CHECKING, Any
7
+
8
+ from structured_eval.models.result import RuleResult
9
+
10
+ if TYPE_CHECKING:
11
+ from collections.abc import Callable
12
+
13
+ # Matches a bare JSONPath like "$.field" or "$.nested.child"
14
+ _PLAIN_PATH_RE = re.compile(r"^\$(?:\.[a-zA-Z_][a-zA-Z0-9_]*)+$")
15
+ # Matches any JSONPath fragment inside a larger expression
16
+ _PATH_IN_EXPR_RE = re.compile(r"\$(?:\.[a-zA-Z_][a-zA-Z0-9_]*)+")
17
+
18
+ _ARITH_OPS: dict[type[ast.operator], Any] = {
19
+ ast.Add: operator.add,
20
+ ast.Sub: operator.sub,
21
+ ast.Mult: operator.mul,
22
+ ast.Div: operator.truediv,
23
+ }
24
+
25
+
26
+ # ── jsonpath-ng lazy import ────────────────────────────────────────────────────
27
+
28
+
29
+ def _ensure_jsonpath() -> None:
30
+ try:
31
+ import jsonpath_ng # noqa: F401
32
+ except ImportError as e:
33
+ raise ImportError(
34
+ "jsonpath-ng is required for the Rule DSL. "
35
+ "Install it with: pip install 'structured-eval[rules]'"
36
+ ) from e
37
+
38
+
39
def _resolve_path(path: str, document: dict[str, Any]) -> Any:
    """Return the value of the first JSONPath match for ``path`` in ``document``.

    Args:
        path: JSONPath expression handed to ``jsonpath_ng.parse``.
        document: The document to search.

    Raises:
        ImportError: if ``jsonpath-ng`` is not installed.
        KeyError: if the path matches nothing in the document.
    """
    _ensure_jsonpath()
    from jsonpath_ng import parse

    found = parse(path).find(document)
    if found:
        # Only the first match is used, even if the path matches several nodes.
        return found[0].value
    raise KeyError(f"Path {path!r} not found in document")
47
+
48
+
49
+ # ── Arithmetic expression evaluator ───────────────────────────────────────────
50
+
51
+
52
def _eval_arithmetic(expr: str, document: dict[str, Any]) -> Any:
    """Evaluate an arithmetic expression that embeds JSONPath references.

    Each ``$.a.b`` fragment in ``expr`` is replaced by the ``repr`` of the
    value it resolves to in ``document``. The resulting text is parsed as a
    Python expression and reduced by ``_eval_node``, which only accepts
    constants, unary minus and the operators in ``_ARITH_OPS``.

    Errors from path resolution, parsing or evaluation propagate to the caller.
    """

    def _literal_for(hit: re.Match[str]) -> str:
        value = _resolve_path(hit.group(), document)
        return repr(value)

    substituted = _PATH_IN_EXPR_RE.sub(_literal_for, expr)
    parsed = ast.parse(substituted, mode="eval")
    return _eval_node(parsed.body)
61
+
62
+
63
+ def _eval_node(node: ast.expr) -> Any:
64
+ if isinstance(node, ast.Constant):
65
+ return node.value
66
+ if isinstance(node, ast.UnaryOp) and isinstance(node.op, ast.USub):
67
+ return -_eval_node(node.operand)
68
+ if isinstance(node, ast.BinOp):
69
+ op_fn = _ARITH_OPS.get(type(node.op))
70
+ if op_fn is None:
71
+ raise ValueError(f"Unsupported operator: {type(node.op).__name__}")
72
+ return op_fn(_eval_node(node.left), _eval_node(node.right))
73
+ raise ValueError(f"Unsupported expression node: {type(node).__name__}")
74
+
75
+
76
+ # ── CustomRule ────────────────────────────────────────────────────────────────
77
+
78
+
79
+ class _CustomRule:
80
+ """Returned by Rule.custom(). Evaluates an arbitrary function over a document."""
81
+
82
+ def __init__(self, fn: Callable[[dict[str, Any]], bool], *, name: str = "") -> None:
83
+ self._fn = fn
84
+ self._name = name
85
+
86
+ @property
87
+ def name(self) -> str:
88
+ return self._name or "custom"
89
+
90
+ def evaluate(self, document: dict[str, Any]) -> RuleResult:
91
+ try:
92
+ passed = bool(self._fn(document))
93
+ except Exception as exc:
94
+ return RuleResult(name=self.name, passed=False, message=str(exc))
95
+ return RuleResult(
96
+ name=self.name,
97
+ passed=passed,
98
+ message="" if passed else "custom rule failed",
99
+ )
100
+
101
+
102
+ # ── Rule ──────────────────────────────────────────────────────────────────────
103
+
104
+
105
class Rule:
    """JSONPath-based document constraint.

    Chain a comparison method to create a bound rule, then pass it to
    EvalConfig.rules or call evaluate() directly.

    A string right-hand side is treated as a reference only when it contains
    a ``$.field`` path: a bare path is resolved against the document, and any
    other string containing a path is evaluated as arithmetic. Strings that
    merely contain ``$`` (``"$100"``, ``"US$"``) are compared as literals.

    Examples::

        Rule("$.status").eq("paid")
        Rule("$.total").gte(0)
        Rule("$.total").eq("$.subtotal + $.tax")
        Rule("$.currency").in_(["USD", "EUR"])
        Rule.custom(lambda doc: doc["amount"] > 0, name="positive_amount")
    """

    def __init__(self, path: str, *, name: str = "") -> None:
        self._path = path
        self._name = name
        # Both stay unset until a comparison method binds them.
        self._op: str | None = None
        self._rhs: Any = None

    # ── Builder ───────────────────────────────────────────────────────────────

    def _bind(self, op: str, rhs: Any) -> Rule:
        """Return a new rule with ``op``/``rhs`` bound; ``self`` is not modified."""
        r = Rule(self._path, name=self._name)
        r._op = op
        r._rhs = rhs
        return r

    def eq(self, rhs: Any) -> Rule:
        """Bind ``lhs == rhs``."""
        return self._bind("eq", rhs)

    def lt(self, rhs: Any) -> Rule:
        """Bind ``lhs < rhs``."""
        return self._bind("lt", rhs)

    def gt(self, rhs: Any) -> Rule:
        """Bind ``lhs > rhs``."""
        return self._bind("gt", rhs)

    def lte(self, rhs: Any) -> Rule:
        """Bind ``lhs <= rhs``."""
        return self._bind("lte", rhs)

    def gte(self, rhs: Any) -> Rule:
        """Bind ``lhs >= rhs``."""
        return self._bind("gte", rhs)

    def in_(self, collection: Any) -> Rule:
        """Bind ``lhs in collection``."""
        return self._bind("in", collection)

    @classmethod
    def custom(
        cls, fn: Callable[[dict[str, Any]], bool], *, name: str = ""
    ) -> _CustomRule:
        """Wrap an arbitrary function as a rule.

        Args:
            fn: Callable(document) -> bool. Receives the full document dict.
            name: Human-readable name shown in reports.
        """
        return _CustomRule(fn=fn, name=name)

    # ── Evaluation ────────────────────────────────────────────────────────────

    @property
    def name(self) -> str:
        """Explicit name if set, else ``"<path> <op> <rhs>"``, else the path."""
        if self._name:
            return self._name
        if self._op is not None:
            rhs_str = self._rhs if isinstance(self._rhs, str) else repr(self._rhs)
            return f"{self._path} {self._op} {rhs_str}"
        return self._path

    def evaluate(self, document: dict[str, Any]) -> RuleResult:
        """Check the bound comparison against ``document``.

        Failures to resolve either side, and ``TypeError`` from the comparison
        itself, are reported as a failed ``RuleResult`` carrying the error text.

        Raises:
            ValueError: if no comparison has been bound to this rule.
        """
        if self._op is None:
            raise ValueError(
                f"Rule {self._path!r} has no comparison — call .eq(), .lt(), etc."
            )

        try:
            lhs = _resolve_path(self._path, document)
        except (KeyError, ImportError) as exc:
            return RuleResult(name=self.name, passed=False, message=str(exc))

        try:
            rhs = self._resolve_rhs(self._rhs, document)
        except Exception as exc:
            # Covers a missing path, syntax errors and arithmetic errors alike.
            return RuleResult(name=self.name, passed=False, message=str(exc))

        try:
            passed = self._compare(lhs, rhs)
        except TypeError as exc:
            return RuleResult(name=self.name, passed=False, message=str(exc))

        msg = (
            ""
            if passed
            else f"{self._path!r} ({lhs!r}) does not satisfy {self._op}({rhs!r})"
        )
        return RuleResult(name=self.name, passed=passed, message=msg)

    def _resolve_rhs(self, rhs: Any, document: dict[str, Any]) -> Any:
        """Turn the bound right-hand side into a concrete value.

        Non-strings, and strings without a ``$.field`` reference, are returned
        unchanged. A bare path is looked up in ``document``; any other string
        containing a path is evaluated as an arithmetic expression.
        """
        if not isinstance(rhs, str) or "$" not in rhs:
            return rhs
        stripped = rhs.strip()
        if _PATH_IN_EXPR_RE.search(stripped) is None:
            # A "$" with no "$.field" reference (e.g. "$100", "US$") is a plain
            # literal; parsing it as an expression would always fail the rule.
            return rhs
        if _PLAIN_PATH_RE.match(stripped):
            return _resolve_path(stripped, document)
        return _eval_arithmetic(stripped, document)

    def _compare(self, lhs: Any, rhs: Any) -> bool:
        """Apply the bound operator to ``lhs`` and ``rhs``.

        Raises:
            ValueError: if the bound operator is not a known one.
        """
        if self._op == "eq":
            return bool(lhs == rhs)
        if self._op == "lt":
            return bool(lhs < rhs)
        if self._op == "gt":
            return bool(lhs > rhs)
        if self._op == "lte":
            return bool(lhs <= rhs)
        if self._op == "gte":
            return bool(lhs >= rhs)
        if self._op == "in":
            return lhs in rhs
        raise ValueError(f"Unknown operator: {self._op!r}")
@@ -0,0 +1,24 @@
1
+ from __future__ import annotations
2
+
3
+ from typing import TYPE_CHECKING, Any
4
+
5
+ if TYPE_CHECKING:
6
+ from structured_eval.models.result import RuleResult
7
+
8
+
9
class RuleProcessor:
    """Runs a list of business rules against one document.

    Every rule must provide ``evaluate(document) -> RuleResult``; both ``Rule``
    and the object returned by ``Rule.custom()`` do. ``run`` returns the
    per-rule results together with the fraction that passed, which is 1.0
    when there are no rules.
    """

    def run(
        self, rules: list[Any], document: dict[str, Any]
    ) -> tuple[list[RuleResult], float]:
        """Evaluate ``rules`` in order and return ``(results, pass_rate)``."""
        outcomes: list[RuleResult] = []
        for rule in rules:
            outcomes.append(rule.evaluate(document))

        total = len(outcomes)
        if total == 0:
            # No rules: vacuously a full pass.
            return outcomes, 1.0

        passed = len([outcome for outcome in outcomes if outcome.passed])
        return outcomes, passed / total
@@ -0,0 +1,33 @@
1
+ from __future__ import annotations
2
+
3
+ from typing import TYPE_CHECKING, Any
4
+
5
+ from structured_eval.metrics.base import RootMetric
6
+ from structured_eval.metrics.rule_pass_rate.engine import RuleProcessor
7
+
8
+ if TYPE_CHECKING:
9
+ from structured_eval.models.nodes.base import EvalNode
10
+
11
+
12
class RulePassRate(RootMetric):
    """Fraction of business rules that hold for the document.

    ``rules`` is a list of ``Rule`` (DSL) or ``Rule.custom(...)`` objects, each
    exposing ``evaluate(document) -> RuleResult``. Per-rule outcomes are
    returned under ``extra["rule_results"]`` as serialized ``RuleResult``
    dicts — read via
    ``report.metrics["rule_pass_rate"].extra_values("rule_results")``. An empty
    rule list scores 1.0 (vacuously true).
    """

    name = "rule_pass_rate"

    def __init__(self, rules: list[Any]):
        self.rules = rules
        self.processor = RuleProcessor()

    def compute(self, node: EvalNode) -> tuple[float, dict[str, Any]]:
        """Score ``node.actual`` against the configured rules."""
        actual = node.actual
        if not isinstance(actual, dict):
            # Rules run against an empty document when the actual value is
            # not a dict.
            actual = {}
        outcomes, rate = self.processor.run(self.rules, actual)
        serialized = [outcome.model_dump() for outcome in outcomes]
        return rate, {"rule_results": serialized}
@@ -0,0 +1,7 @@
1
+ from structured_eval.metrics.schema_validity.metric import SchemaValidity
2
+ from structured_eval.metrics.schema_validity.validator import (
3
+ SchemaResult,
4
+ SchemaValidator,
5
+ )
6
+
7
+ __all__ = ["SchemaResult", "SchemaValidator", "SchemaValidity"]
@@ -0,0 +1,35 @@
1
+ from __future__ import annotations
2
+
3
+ from typing import TYPE_CHECKING, Any
4
+
5
+ from structured_eval.metrics.base import RootMetric
6
+ from structured_eval.metrics.schema_validity.validator import SchemaValidator
7
+
8
+ if TYPE_CHECKING:
9
+ from pydantic import BaseModel
10
+
11
+ from structured_eval.models.nodes.base import EvalNode
12
+
13
+
14
class SchemaValidity(RootMetric):
    """Score 1.0 when the actual document validates against ``schema``, else 0.0.

    ``schema`` is a Pydantic model class or a JSON Schema dict. Validation
    errors are returned under ``extra["schema_errors"]`` — read via
    ``report.metrics["schema_validity"].extra_values("schema_errors")``.
    """

    name = "schema_validity"

    def __init__(self, schema: type[BaseModel] | dict[str, Any]):
        self.validator = SchemaValidator(schema)

    def compute(self, node: EvalNode) -> tuple[float, dict[str, Any]]:
        """Validate ``node.actual`` and return the score with error details."""
        outcome = self.validator.validate(node.actual)
        score = 1.0 if outcome.valid else 0.0
        details = {
            "type_errors": outcome.type_errors,
            "missing_required": outcome.missing_required,
            "extra_fields": outcome.extra_fields,
        }
        return score, {"schema_errors": details}
@@ -0,0 +1,119 @@
1
+ from __future__ import annotations
2
+
3
+ from typing import Any
4
+
5
+ from pydantic import BaseModel, ValidationError, computed_field
6
+
7
+
8
class SchemaResult(BaseModel):
    """Result of checking an actual document against a schema."""

    valid: bool
    type_errors: list[str] = []
    missing_required: list[str] = []
    extra_fields: list[str] = []
    total_fields: int = 0

    @computed_field  # type: ignore[prop-decorator]
    @property
    def type_error_rate(self) -> float | None:
        """Type errors per schema field, or ``None`` when there are no fields."""
        field_count = self.total_fields
        return len(self.type_errors) / field_count if field_count else None
23
+
24
+
25
class SchemaValidator:
    """Validates a document against a Pydantic model class or JSON Schema dict.

    Constructed with the ``schema`` once; ``validate(actual)`` returns a
    ``SchemaResult`` describing type errors, missing required and extra fields.
    Both back-ends report field locations as dot-joined paths.
    """

    def __init__(self, schema: type[BaseModel] | dict[str, Any]):
        self.schema = schema

    def validate(self, actual: Any) -> SchemaResult:
        """Validate ``actual`` with the back-end matching the schema's kind.

        Raises:
            TypeError: if the schema is neither a ``BaseModel`` subclass nor a
                dict.
            ImportError: if a dict schema is used and ``jsonschema`` is not
                installed.
        """
        schema = self.schema
        if isinstance(schema, type) and issubclass(schema, BaseModel):
            return self._validate_pydantic(actual, schema)
        if isinstance(schema, dict):
            return self._validate_jsonschema(actual, schema)
        raise TypeError(
            f"schema must be a Pydantic BaseModel subclass or a dict, got {type(schema)!r}"
        )

    # ── Pydantic ──────────────────────────────────────────────────────────

    def _validate_pydantic(self, actual: Any, model: type[BaseModel]) -> SchemaResult:
        """Validate via ``model.model_validate`` and bucket the reported errors."""
        total = len(model.model_fields)
        try:
            model.model_validate(actual)
        except ValidationError as exc:
            type_errors: list[str] = []
            missing_required: list[str] = []
            extra_fields: list[str] = []

            for err in exc.errors():
                loc = ".".join(str(p) for p in err["loc"])
                kind = err["type"]
                if kind == "missing":
                    missing_required.append(loc)
                elif kind == "extra_forbidden":
                    extra_fields.append(loc)
                else:
                    # Every other error kind is grouped under type errors.
                    type_errors.append(loc)

            return SchemaResult(
                valid=False,
                type_errors=type_errors,
                missing_required=missing_required,
                extra_fields=extra_fields,
                total_fields=total,
            )
        return SchemaResult(valid=True, total_fields=total)

    # ── JSON Schema ─────────────────────────────────────────────────────────

    def _validate_jsonschema(self, actual: Any, schema: dict[str, Any]) -> SchemaResult:
        """Validate with ``jsonschema``'s Draft 7 validator and bucket the errors.

        Missing and unexpected fields are reported with their full dotted path
        (``address.city``), matching the Pydantic back-end.
        """
        import re  # local: only needed to match ``patternProperties`` keys

        try:
            from jsonschema import Draft7Validator
        except ImportError as exc:  # pragma: no cover
            raise ImportError(
                "jsonschema is required for dict-schema validation. "
                "Install it with: pip install 'structured-eval[jsonschema]'"
            ) from exc

        # Only top-level ``properties`` are counted as fields.
        total = len(schema.get("properties", {}))
        validator = Draft7Validator(schema)
        errors = list(validator.iter_errors(actual))

        if not errors:
            return SchemaResult(valid=True, total_fields=total)

        type_errors: list[str] = []
        missing_required: list[str] = []
        extra_fields: list[str] = []

        for err in errors:
            # Path to the offending instance; for ``required`` and
            # ``additionalProperties`` this is the enclosing object.
            parents = [str(p) for p in err.absolute_path]
            loc = ".".join(parents) if parents else err.json_path

            if err.validator == "required":
                if "'" in err.message:
                    # err.message names the missing field; prefix it with the
                    # enclosing object's path so nested fields stay unambiguous.
                    field = err.message.split("'")[1]
                    missing_required.append(".".join([*parents, field]))
                else:
                    missing_required.append(loc)
            elif err.validator == "additionalProperties":
                extras: list[str] = []
                if isinstance(err.instance, dict) and isinstance(err.schema, dict):
                    declared = err.schema.get("properties", {})
                    patterns = err.schema.get("patternProperties", {})
                    # A key is "additional" when it is neither declared nor
                    # matched by any patternProperties regex.
                    extras = [
                        ".".join([*parents, str(key)])
                        for key in err.instance
                        if key not in declared
                        and not any(re.search(p, key) for p in patterns)
                    ]
                if extras:
                    extra_fields.extend(extras)
                else:
                    extra_fields.append(loc or err.message)
            else:
                # type and other validators (pattern, minLength, …) → type errors
                type_errors.append(loc)

        return SchemaResult(
            valid=False,
            type_errors=type_errors,
            missing_required=missing_required,
            extra_fields=extra_fields,
            total_fields=total,
        )
@@ -0,0 +1,40 @@
1
+ from __future__ import annotations
2
+
3
+ from typing import TYPE_CHECKING
4
+
5
+ from structured_eval.metrics.base import RootMetric
6
+ from structured_eval.utils.flatten import extract_paths
7
+
8
+ if TYPE_CHECKING:
9
+ from structured_eval.metrics.base import MetricOutput
10
+ from structured_eval.models.nodes.base import EvalNode
11
+
12
+
13
class StructuralSimilarity(RootMetric):
    """Structural similarity of two documents — Jaccard over their paths.

    Only the *shape* of ``actual`` and ``expected`` is compared; values are
    ignored::

        |paths_actual ∩ paths_expected| / |paths_actual ∪ paths_expected|

    Paths come from :func:`~structured_eval.utils.flatten.extract_paths`, which
    enumerates one for every dict key, list index and nested sub-path
    (containers and leaves alike). The score is ``1.0`` for identical
    structure (both empty → vacuously ``1.0``), ``0.0`` when no path is shared,
    and a value in ``(0, 1)`` otherwise. It complements the value-aware
    metrics by answering "did the model produce the right skeleton" regardless
    of whether the values are correct.
    """

    name = "structural_similarity"

    def compute(self, node: EvalNode) -> MetricOutput:
        """Return the Jaccard index of the actual and expected path sets."""
        actual_paths = extract_paths(node.context.actual)
        expected_paths = extract_paths(node.context.expected)

        if not (actual_paths or expected_paths):
            # Two empty documents have identical (empty) structure.
            return 1.0
        if not (actual_paths and expected_paths):
            return 0.0

        shared = actual_paths & expected_paths
        combined = actual_paths | expected_paths
        return len(shared) / len(combined)
@@ -0,0 +1,44 @@
1
+ from __future__ import annotations
2
+
3
+ import re
4
+ from collections import Counter
5
+ from typing import Any
6
+
7
+ from structured_eval.metrics.base import FieldMetric
8
+
9
+ _NON_WORD = re.compile(r"[^\w\s]")
10
+
11
+
12
+ def _tokenize(value: Any) -> list[str]:
13
+ """Lowercase, drop punctuation, split on whitespace."""
14
+ return _NON_WORD.sub(" ", str(value).lower()).split()
15
+
16
+
17
class TokenF1(FieldMetric):
    """SQuAD-style token-overlap F1 — a default for free-text fields.

    Tokens are compared as a **multiset** (``Counter``): a shared token counts
    with its multiplicity, as in the official SQuAD F1, so a repeated token
    only helps as often as it appears on both sides (``"the the cat"`` vs
    ``"the cat"`` is 0.8, not 1.0). Precision and recall are taken over the
    token *counts* and the score is their harmonic mean. String-only: if
    either side is not a ``str`` the score is 0.0 (no coercion).
    """

    name = "token_f1"

    def score(self, actual: Any, expected: Any) -> float:
        """Return the token F1 of ``actual`` against ``expected``."""
        if not isinstance(actual, str) or not isinstance(expected, str):
            return 0.0

        got = _tokenize(actual)
        want = _tokenize(expected)
        if not got or not want:
            # Both empty is a perfect match; exactly one empty is a miss.
            return 1.0 if (not got and not want) else 0.0

        overlap = sum((Counter(got) & Counter(want)).values())
        if overlap == 0:
            return 0.0

        precision = overlap / len(got)
        recall = overlap / len(want)
        return 2 * precision * recall / (precision + recall)
@@ -0,0 +1,35 @@
1
+ from __future__ import annotations
2
+
3
+ from typing import Any
4
+
5
+ from structured_eval.metrics.base import FieldMetric
6
+
7
+
8
+ def _json_type(value: Any) -> str:
9
+ """Map a Python value to its JSON type name (bool before int)."""
10
+ if isinstance(value, bool):
11
+ return "bool"
12
+ if isinstance(value, (int, float)):
13
+ return "number"
14
+ if isinstance(value, str):
15
+ return "string"
16
+ if isinstance(value, list):
17
+ return "array"
18
+ if isinstance(value, dict):
19
+ return "object"
20
+ if value is None:
21
+ return "null"
22
+ return type(value).__name__
23
+
24
+
25
class TypeMatch(FieldMetric):
    """Right JSON type? 1.0 if actual and expected share a type, else 0.0.

    Catches a common LLM error — emitting ``"100"`` (string) where ``100``
    (number) is expected — independently of whether the value is correct.
    """

    name = "type_match"

    def score(self, actual: Any, expected: Any) -> float:
        """Compare the JSON type names of the two values."""
        same_type = _json_type(actual) == _json_type(expected)
        return 1.0 if same_type else 0.0
@@ -0,0 +1,10 @@
1
+ """Utilities shared across metric implementations (metric-layer only).
2
+
3
+ Clearly-scoped modules:
4
+
5
+ * ``calculate`` — the precision / recall / F1 arithmetic (and the ``GradingMode``
6
+ hard/soft enum) used by every P/R/F1 metric, object and array alike.
7
+ * ``object_utils`` — turning an object's matched fields into the
8
+ ``(score, threshold, weight)`` triples that ``calculate.prf_counts`` consumes.
9
+ * ``array`` — the same for an array's aligned items, plus missing/spurious counts.
10
+ """
@@ -0,0 +1,31 @@
1
+ """Verdicts for array metrics: aligned items → ``(score, threshold, weight)``.
2
+
3
+ An aligned item is graded by its representative score against a single
4
+ ``threshold`` (hard) or contributes that score fractionally (soft) — mirroring
5
+ how object fields are graded. ``missed`` items are FN, ``spurious`` items FP.
6
+ The verdicts feed ``calculate.prf_counts``.
7
+
8
+ Array elements share one ``item`` config, so they carry no individual weights:
9
+ every item (and every missed/spurious slot) weighs ``1.0`` and array metrics are
10
+ effectively count-based.
11
+ """
12
+
13
+ from __future__ import annotations
14
+
15
+ from typing import TYPE_CHECKING
16
+
17
+ if TYPE_CHECKING:
18
+ from structured_eval.models.nodes.array_node import ArrayNode
19
+
20
+
21
def verdicts(node: ArrayNode, threshold: float) -> list[tuple[float, float, float]]:
    """Build one ``(representative, threshold, 1.0)`` triple per aligned item.

    Every item gets the same ``threshold`` and a fixed weight of ``1.0``.
    """
    triples: list[tuple[float, float, float]] = []
    for aligned in node.items:
        triples.append((aligned.representative, threshold, 1.0))
    return triples
24
+
25
+
26
def missing_spurious(node: ArrayNode) -> tuple[int, int]:
    """Return ``(n_missed, n_spurious)`` from the array's alignment result.

    Both counts are zero when the node has no ``match_result``.
    """
    alignment = node.match_result
    return (
        (0, 0)
        if alignment is None
        else (len(alignment.missed), len(alignment.spurious))
    )
@@ -0,0 +1,72 @@
1
+ """Precision / recall / F1 arithmetic over resolved field/item verdicts.
2
+
3
+ Each matched scalar field (or array item) is both a *predicted* and an
4
+ *expected* entry; ``spurious`` entries add to predicted (FP), ``missing`` ones
5
+ add to expected (FN). So a present-but-wrong entry lowers both precision and
6
+ recall. Nested object/array children are graded at their own node and are not
7
+ counted here.
8
+
9
+ A ``verdicts`` argument is a list of ``(score, threshold, weight)`` from
10
+ the ``verdicts`` helper in ``structured_eval.metrics.utils.array`` (or the matching ``object_utils`` helper). Each entry contributes its
11
+ ``weight`` (``1.0`` by default → plain counts) rather than a flat 1: in
12
+ ``GradingMode.HARD`` an entry is a TP when ``score >= threshold`` (counts its
13
+ weight); in ``GradingMode.SOFT`` it contributes ``weight * score`` (threshold
14
+ ignored). ``missing_weight`` / ``spurious_weight`` are the summed weights of the
15
+ FN / FP entries (counts when uniform). How those weights are derived is the
16
+ caller's choice (see ``WeightMode``).
17
+ """
18
+
19
+ from __future__ import annotations
20
+
21
+ from enum import StrEnum
22
+
23
+
24
+ class GradingMode(StrEnum):
25
+ """How a verdict counts toward true positives."""
26
+
27
+ HARD = "hard" # threshold gate: TP iff score >= threshold (counts its weight)
28
+ SOFT = "soft" # graded: weight * score contributes, no threshold
29
+
30
+
31
+ class WeightMode(StrEnum):
32
+ """How a node's children contribute to its weighted aggregate.
33
+
34
+ Extensible: more strategies (e.g. only first-level weights, or uniform per
35
+ level) can be added without touching the arithmetic below.
36
+ """
37
+
38
+ NONE = "none" # ignore configured weights — every child counts 1.0
39
+ PROPORTIONAL = "proportional" # weight each child by its configured ``weight``
40
+
41
+
42
def prf_counts(
    verdicts: list[tuple[float, float, float]],
    missing_weight: float,
    spurious_weight: float,
    mode: GradingMode = GradingMode.HARD,
) -> tuple[float, float, float]:
    """Return weighted ``(tp, predicted, expected)`` totals.

    Args:
        verdicts: ``(score, threshold, weight)`` for each matched entry.
        missing_weight: Summed weight of the FN entries; added to expected.
        spurious_weight: Summed weight of the FP entries; added to predicted.
        mode: ``HARD`` counts an entry's weight when ``score >= threshold``;
            ``SOFT`` counts ``weight * score`` and ignores the threshold.

    With uniform weights of 1.0 the totals are plain counts.
    """
    soft = mode == GradingMode.SOFT
    weights_total = sum(w for _score, _threshold, w in verdicts)

    if soft:
        true_pos = sum(w * s for s, _threshold, w in verdicts)
    else:
        true_pos = sum(w for s, t, w in verdicts if s >= t)

    # Matched entries count on both sides; spurious only as predicted,
    # missing only as expected.
    return true_pos, weights_total + spurious_weight, weights_total + missing_weight
57
+
58
+
59
def precision(tp: float, predicted: float, expected: float) -> float:
    """Return ``tp / predicted``.

    With nothing predicted the result is 1.0 when nothing was expected either
    (an empty object is vacuously precise) and 0.0 otherwise.
    """
    if predicted != 0:
        return tp / predicted
    if expected == 0:
        return 1.0
    return 0.0
63
+
64
+
65
def recall(tp: float, predicted: float, expected: float) -> float:
    """Return ``tp / expected``.

    With nothing expected the result is 1.0 when nothing was predicted either
    and 0.0 otherwise.
    """
    if expected != 0:
        return tp / expected
    if predicted == 0:
        return 1.0
    return 0.0
69
+
70
+
71
def f1(p: float, r: float) -> float:
    """Return the harmonic mean of precision ``p`` and recall ``r``.

    The result is 0.0 when ``p + r`` is zero.
    """
    total = p + r
    if not total:
        return 0.0
    return 2 * p * r / total