structured-eval 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- structured_eval/__init__.py +27 -0
- structured_eval/alignment/__init__.py +15 -0
- structured_eval/alignment/base.py +40 -0
- structured_eval/alignment/by_index.py +24 -0
- structured_eval/alignment/by_key.py +73 -0
- structured_eval/alignment/factory.py +28 -0
- structured_eval/alignment/hungarian.py +156 -0
- structured_eval/api.py +79 -0
- structured_eval/engine/__init__.py +15 -0
- structured_eval/engine/aggregator.py +96 -0
- structured_eval/engine/evaluator.py +72 -0
- structured_eval/engine/metric_runner.py +69 -0
- structured_eval/engine/parser.py +42 -0
- structured_eval/engine/report_builder.py +68 -0
- structured_eval/engine/tree_builder.py +319 -0
- structured_eval/formats/__init__.py +5 -0
- structured_eval/formats/base.py +19 -0
- structured_eval/formats/json_parser.py +44 -0
- structured_eval/formats/yaml_parser.py +24 -0
- structured_eval/integrations/__init__.py +11 -0
- structured_eval/integrations/_adapter.py +47 -0
- structured_eval/integrations/deepeval.py +74 -0
- structured_eval/integrations/langsmith.py +90 -0
- structured_eval/metrics/__init__.py +101 -0
- structured_eval/metrics/array_accuracy.py +28 -0
- structured_eval/metrics/array_cardinality.py +27 -0
- structured_eval/metrics/array_exact_match.py +48 -0
- structured_eval/metrics/array_f1.py +34 -0
- structured_eval/metrics/array_jaccard_similarity.py +60 -0
- structured_eval/metrics/array_precision.py +34 -0
- structured_eval/metrics/array_prf1.py +40 -0
- structured_eval/metrics/array_recall.py +33 -0
- structured_eval/metrics/base.py +144 -0
- structured_eval/metrics/character_f1.py +50 -0
- structured_eval/metrics/composite_score.py +46 -0
- structured_eval/metrics/coverage_leaf_score.py +29 -0
- structured_eval/metrics/date_distance_score.py +63 -0
- structured_eval/metrics/exact.py +21 -0
- structured_eval/metrics/exponential_numeric_score.py +47 -0
- structured_eval/metrics/field_faithfulness.py +38 -0
- structured_eval/metrics/fuzzy.py +64 -0
- structured_eval/metrics/invoker.py +90 -0
- structured_eval/metrics/levenshtein.py +16 -0
- structured_eval/metrics/mean_score.py +31 -0
- structured_eval/metrics/numeric.py +83 -0
- structured_eval/metrics/numeric_closeness.py +35 -0
- structured_eval/metrics/object_accuracy.py +47 -0
- structured_eval/metrics/object_exact_match.py +41 -0
- structured_eval/metrics/object_f1.py +47 -0
- structured_eval/metrics/object_precision.py +49 -0
- structured_eval/metrics/object_prf1.py +51 -0
- structured_eval/metrics/object_recall.py +44 -0
- structured_eval/metrics/object_type_validity.py +34 -0
- structured_eval/metrics/overall_leaf_score.py +32 -0
- structured_eval/metrics/presence.py +22 -0
- structured_eval/metrics/regex_match.py +51 -0
- structured_eval/metrics/rule_pass_rate/__init__.py +5 -0
- structured_eval/metrics/rule_pass_rate/dsl.py +224 -0
- structured_eval/metrics/rule_pass_rate/engine.py +24 -0
- structured_eval/metrics/rule_pass_rate/metric.py +33 -0
- structured_eval/metrics/schema_validity/__init__.py +7 -0
- structured_eval/metrics/schema_validity/metric.py +35 -0
- structured_eval/metrics/schema_validity/validator.py +119 -0
- structured_eval/metrics/structural_similarity.py +40 -0
- structured_eval/metrics/token_f1.py +44 -0
- structured_eval/metrics/type_match.py +35 -0
- structured_eval/metrics/utils/__init__.py +10 -0
- structured_eval/metrics/utils/array.py +31 -0
- structured_eval/metrics/utils/calculate.py +72 -0
- structured_eval/metrics/utils/number.py +46 -0
- structured_eval/metrics/utils/object_utils.py +87 -0
- structured_eval/models/__init__.py +72 -0
- structured_eval/models/config.py +124 -0
- structured_eval/models/context.py +25 -0
- structured_eval/models/metric_result.py +121 -0
- structured_eval/models/nodes/__init__.py +13 -0
- structured_eval/models/nodes/array_node.py +32 -0
- structured_eval/models/nodes/base.py +113 -0
- structured_eval/models/nodes/object_node.py +19 -0
- structured_eval/models/nodes/scalar.py +14 -0
- structured_eval/models/result.py +361 -0
- structured_eval/models/sample.py +19 -0
- structured_eval/py.typed +0 -0
- structured_eval/reporting/__init__.py +5 -0
- structured_eval/reporting/console.py +194 -0
- structured_eval/utils/__init__.py +16 -0
- structured_eval/utils/flatten.py +66 -0
- structured_eval/utils/paths.py +58 -0
- structured_eval/utils/structured_diff.py +159 -0
- structured_eval-0.1.0.dist-info/METADATA +322 -0
- structured_eval-0.1.0.dist-info/RECORD +94 -0
- structured_eval-0.1.0.dist-info/WHEEL +5 -0
- structured_eval-0.1.0.dist-info/licenses/LICENSE +201 -0
- structured_eval-0.1.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,224 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import ast
|
|
4
|
+
import operator
|
|
5
|
+
import re
|
|
6
|
+
from typing import TYPE_CHECKING, Any
|
|
7
|
+
|
|
8
|
+
from structured_eval.models.result import RuleResult
|
|
9
|
+
|
|
10
|
+
if TYPE_CHECKING:
|
|
11
|
+
from collections.abc import Callable
|
|
12
|
+
|
|
13
|
+
# Matches a bare JSONPath like "$.field" or "$.nested.child"
|
|
14
|
+
_PLAIN_PATH_RE = re.compile(r"^\$(?:\.[a-zA-Z_][a-zA-Z0-9_]*)+$")
|
|
15
|
+
# Matches any JSONPath fragment inside a larger expression
|
|
16
|
+
_PATH_IN_EXPR_RE = re.compile(r"\$(?:\.[a-zA-Z_][a-zA-Z0-9_]*)+")
|
|
17
|
+
|
|
18
|
+
_ARITH_OPS: dict[type[ast.operator], Any] = {
|
|
19
|
+
ast.Add: operator.add,
|
|
20
|
+
ast.Sub: operator.sub,
|
|
21
|
+
ast.Mult: operator.mul,
|
|
22
|
+
ast.Div: operator.truediv,
|
|
23
|
+
}
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
# ── jsonpath-ng lazy import ────────────────────────────────────────────────────
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
def _ensure_jsonpath() -> None:
|
|
30
|
+
try:
|
|
31
|
+
import jsonpath_ng # noqa: F401
|
|
32
|
+
except ImportError as e:
|
|
33
|
+
raise ImportError(
|
|
34
|
+
"jsonpath-ng is required for the Rule DSL. "
|
|
35
|
+
"Install it with: pip install 'structured-eval[rules]'"
|
|
36
|
+
) from e
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
def _resolve_path(path: str, document: dict[str, Any]) -> Any:
    """Return the value of the first JSONPath match for ``path`` in ``document``.

    Args:
        path: JSONPath expression handed to ``jsonpath_ng.parse``.
        document: Document the parsed path is searched in.

    Returns:
        The ``value`` of the first match; later matches are ignored.

    Raises:
        KeyError: If the path matches nothing in ``document``.
        ImportError: If ``jsonpath_ng`` is not installed.
    """
    _ensure_jsonpath()
    from jsonpath_ng import parse

    found = parse(path).find(document)
    if found:
        return found[0].value
    raise KeyError(f"Path {path!r} not found in document")
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
# ── Arithmetic expression evaluator ───────────────────────────────────────────
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
def _eval_arithmetic(expr: str, document: dict[str, Any]) -> Any:
    """Substitute the JSONPath fragments in ``expr`` and evaluate the result.

    Every fragment matched by ``_PATH_IN_EXPR_RE`` is replaced by the ``repr``
    of the value it resolves to in ``document``. The substituted text is then
    parsed as a single expression and evaluated by ``_eval_node``.

    Args:
        expr: Expression text such as ``"$.subtotal + $.tax"``.
        document: Document the path fragments are resolved against.

    Returns:
        Whatever ``_eval_node`` computes for the parsed expression.
    """
    substituted = _PATH_IN_EXPR_RE.sub(
        lambda fragment: repr(_resolve_path(fragment.group(), document)),
        expr,
    )
    parsed = ast.parse(substituted, mode="eval")
    return _eval_node(parsed.body)
|
|
61
|
+
|
|
62
|
+
|
|
63
|
+
def _eval_node(node: ast.expr) -> Any:
|
|
64
|
+
if isinstance(node, ast.Constant):
|
|
65
|
+
return node.value
|
|
66
|
+
if isinstance(node, ast.UnaryOp) and isinstance(node.op, ast.USub):
|
|
67
|
+
return -_eval_node(node.operand)
|
|
68
|
+
if isinstance(node, ast.BinOp):
|
|
69
|
+
op_fn = _ARITH_OPS.get(type(node.op))
|
|
70
|
+
if op_fn is None:
|
|
71
|
+
raise ValueError(f"Unsupported operator: {type(node.op).__name__}")
|
|
72
|
+
return op_fn(_eval_node(node.left), _eval_node(node.right))
|
|
73
|
+
raise ValueError(f"Unsupported expression node: {type(node).__name__}")
|
|
74
|
+
|
|
75
|
+
|
|
76
|
+
# ── CustomRule ────────────────────────────────────────────────────────────────
|
|
77
|
+
|
|
78
|
+
|
|
79
|
+
class _CustomRule:
|
|
80
|
+
"""Returned by Rule.custom(). Evaluates an arbitrary function over a document."""
|
|
81
|
+
|
|
82
|
+
def __init__(self, fn: Callable[[dict[str, Any]], bool], *, name: str = "") -> None:
|
|
83
|
+
self._fn = fn
|
|
84
|
+
self._name = name
|
|
85
|
+
|
|
86
|
+
@property
|
|
87
|
+
def name(self) -> str:
|
|
88
|
+
return self._name or "custom"
|
|
89
|
+
|
|
90
|
+
def evaluate(self, document: dict[str, Any]) -> RuleResult:
|
|
91
|
+
try:
|
|
92
|
+
passed = bool(self._fn(document))
|
|
93
|
+
except Exception as exc:
|
|
94
|
+
return RuleResult(name=self.name, passed=False, message=str(exc))
|
|
95
|
+
return RuleResult(
|
|
96
|
+
name=self.name,
|
|
97
|
+
passed=passed,
|
|
98
|
+
message="" if passed else "custom rule failed",
|
|
99
|
+
)
|
|
100
|
+
|
|
101
|
+
|
|
102
|
+
# ── Rule ──────────────────────────────────────────────────────────────────────
|
|
103
|
+
|
|
104
|
+
|
|
105
|
+
class Rule:
    """JSONPath-based document constraint.

    Chain a comparison method to create a bound rule, then pass it to
    EvalConfig.rules or call evaluate() directly.

    Examples::

        Rule("$.status").eq("paid")
        Rule("$.total").gte(0)
        Rule("$.total").eq("$.subtotal + $.tax")
        Rule("$.currency").in_(["USD", "EUR"])
        Rule.custom(lambda doc: doc["amount"] > 0, name="positive_amount")

    A string right-hand side is interpreted only when it contains a JSONPath
    fragment (``$.field``): a bare path resolves to that field's value, any
    other text is evaluated as arithmetic over the resolved fields. Strings
    that merely contain a dollar sign (``"$5.00"``, ``"US$"``) are compared
    literally.

    ``eq`` treats two real numbers as equal when they differ only by
    floating-point rounding noise, so ``0.1 + 0.2`` equals ``0.3``. The
    ordering comparisons (``lt``, ``gt``, ``lte``, ``gte``) stay exact.
    """

    # Tolerances for ``eq`` on real numbers when at least one side is a float.
    _FLOAT_REL_TOL = 1e-9
    _FLOAT_ABS_TOL = 1e-12

    # Ordering operators by DSL name.
    _ORDERING: dict[str, Any] = {
        "lt": operator.lt,
        "gt": operator.gt,
        "lte": operator.le,
        "gte": operator.ge,
    }

    def __init__(self, path: str, *, name: str = "") -> None:
        self._path = path
        self._name = name
        self._op: str | None = None  # None until a comparison is bound
        self._rhs: Any = None

    # ── Builder ───────────────────────────────────────────────────────────────

    def _bind(self, op: str, rhs: Any) -> Rule:
        """Return a new rule with ``op``/``rhs`` bound; ``self`` is left untouched."""
        bound = Rule(self._path, name=self._name)
        bound._op = op
        bound._rhs = rhs
        return bound

    def eq(self, rhs: Any) -> Rule:
        """Bind ``path == rhs`` (tolerant of float rounding noise)."""
        return self._bind("eq", rhs)

    def lt(self, rhs: Any) -> Rule:
        """Bind ``path < rhs``."""
        return self._bind("lt", rhs)

    def gt(self, rhs: Any) -> Rule:
        """Bind ``path > rhs``."""
        return self._bind("gt", rhs)

    def lte(self, rhs: Any) -> Rule:
        """Bind ``path <= rhs``."""
        return self._bind("lte", rhs)

    def gte(self, rhs: Any) -> Rule:
        """Bind ``path >= rhs``."""
        return self._bind("gte", rhs)

    def in_(self, collection: Any) -> Rule:
        """Bind ``path in collection``."""
        return self._bind("in", collection)

    @classmethod
    def custom(
        cls, fn: Callable[[dict[str, Any]], bool], *, name: str = ""
    ) -> _CustomRule:
        """Wrap an arbitrary function as a rule.

        Args:
            fn: Callable(document) -> bool. Receives the full document dict.
            name: Human-readable name shown in reports.
        """
        return _CustomRule(fn=fn, name=name)

    # ── Evaluation ────────────────────────────────────────────────────────────

    @property
    def name(self) -> str:
        """Explicit name if set, else ``"<path> <op> <rhs>"``, else the bare path."""
        if self._name:
            return self._name
        if self._op is not None:
            rhs_str = self._rhs if isinstance(self._rhs, str) else repr(self._rhs)
            return f"{self._path} {self._op} {rhs_str}"
        return self._path

    def evaluate(self, document: dict[str, Any]) -> RuleResult:
        """Check the bound comparison against ``document``.

        Failures to resolve the left-hand path (``KeyError``/``ImportError``),
        any error while resolving the right-hand side, and a ``TypeError``
        raised by the comparison are all reported as a failed ``RuleResult``
        carrying ``str(exc)`` rather than being raised.

        Raises:
            ValueError: If no comparison has been bound yet.
        """
        if self._op is None:
            raise ValueError(
                f"Rule {self._path!r} has no comparison — call .eq(), .lt(), etc."
            )

        try:
            lhs = _resolve_path(self._path, document)
        except (KeyError, ImportError) as exc:
            return RuleResult(name=self.name, passed=False, message=str(exc))

        try:
            rhs = self._resolve_rhs(self._rhs, document)
        except Exception as exc:
            return RuleResult(name=self.name, passed=False, message=str(exc))

        try:
            passed = self._compare(lhs, rhs)
        except TypeError as exc:
            return RuleResult(name=self.name, passed=False, message=str(exc))

        msg = (
            ""
            if passed
            else f"{self._path!r} ({lhs!r}) does not satisfy {self._op}({rhs!r})"
        )
        return RuleResult(name=self.name, passed=passed, message=msg)

    def _resolve_rhs(self, rhs: Any, document: dict[str, Any]) -> Any:
        """Turn the bound right-hand side into the value to compare against.

        Non-strings and strings without ``$`` are returned unchanged. A string
        that is exactly one JSONPath resolves to that path's value; a string
        containing at least one path fragment is evaluated as arithmetic. A
        string with ``$`` but no path fragment is an ordinary literal and is
        returned unchanged.
        """
        if not isinstance(rhs, str) or "$" not in rhs:
            return rhs
        stripped = rhs.strip()
        if _PLAIN_PATH_RE.match(stripped):
            return _resolve_path(stripped, document)
        if _PATH_IN_EXPR_RE.search(stripped) is None:
            # "$5.00", "US$", ... — nothing to resolve, so this is plain text
            # rather than an expression that would fail to parse.
            return rhs
        return _eval_arithmetic(stripped, document)

    @classmethod
    def _approx_equal(cls, lhs: Any, rhs: Any) -> bool:
        """``lhs == rhs``, additionally accepting float rounding noise.

        The tolerance only applies when both sides are real numbers (bools
        excluded) and at least one is a float; every other pairing is decided
        by ``==`` alone.
        """
        import math

        if bool(lhs == rhs):
            return True
        if isinstance(lhs, bool) or isinstance(rhs, bool):
            return False
        real = (int, float)
        if not (isinstance(lhs, real) and isinstance(rhs, real)):
            return False
        if not (isinstance(lhs, float) or isinstance(rhs, float)):
            return False
        try:
            return math.isclose(
                lhs, rhs, rel_tol=cls._FLOAT_REL_TOL, abs_tol=cls._FLOAT_ABS_TOL
            )
        except OverflowError:
            # An int too large to convert to float cannot be close to a float.
            return False

    def _compare(self, lhs: Any, rhs: Any) -> bool:
        """Apply the bound operator to ``lhs`` and ``rhs``.

        Raises:
            TypeError: If the operands do not support the comparison.
            ValueError: If the bound operator is not one the DSL defines.
        """
        op = self._op
        if op == "eq":
            return self._approx_equal(lhs, rhs)
        ordering = self._ORDERING.get(op) if op is not None else None
        if ordering is not None:
            return bool(ordering(lhs, rhs))
        if op == "in":
            return lhs in rhs
        raise ValueError(f"Unknown operator: {self._op!r}")
|
|
@@ -0,0 +1,24 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from typing import TYPE_CHECKING, Any
|
|
4
|
+
|
|
5
|
+
if TYPE_CHECKING:
|
|
6
|
+
from structured_eval.models.result import RuleResult
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
class RuleProcessor:
    """Runs a list of business rules over one document.

    A rule is anything exposing ``evaluate(document) -> RuleResult`` — both
    ``Rule`` and the object returned by ``Rule.custom()`` qualify.
    """

    def run(
        self, rules: list[Any], document: dict[str, Any]
    ) -> tuple[list[RuleResult], float]:
        """Evaluate every rule and return ``(results, pass_rate)``.

        ``pass_rate`` is the share of results whose ``passed`` is truthy; with
        no rules it is ``1.0``.
        """
        outcomes: list[RuleResult] = []
        for rule in rules:
            outcomes.append(rule.evaluate(document))

        if len(outcomes) == 0:
            return outcomes, 1.0

        passed_count = len([outcome for outcome in outcomes if outcome.passed])
        return outcomes, passed_count / len(outcomes)
|
|
@@ -0,0 +1,33 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from typing import TYPE_CHECKING, Any
|
|
4
|
+
|
|
5
|
+
from structured_eval.metrics.base import RootMetric
|
|
6
|
+
from structured_eval.metrics.rule_pass_rate.engine import RuleProcessor
|
|
7
|
+
|
|
8
|
+
if TYPE_CHECKING:
|
|
9
|
+
from structured_eval.models.nodes.base import EvalNode
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
class RulePassRate(RootMetric):
    """Share of the configured business rules that the document satisfies.

    ``rules`` holds ``Rule`` (DSL) or ``Rule.custom(...)`` objects; each must
    expose ``evaluate(document) -> RuleResult``. The per-rule outcomes travel
    in the result's ``extra["rule_results"]`` as serialized ``RuleResult``
    dicts — read via
    ``report.metrics["rule_pass_rate"].extra_values("rule_results")``. With no
    rules the score is 1.0 (vacuously true).
    """

    name = "rule_pass_rate"

    def __init__(self, rules: list[Any]):
        self.rules = rules
        self.processor = RuleProcessor()

    def compute(self, node: EvalNode) -> tuple[float, dict[str, Any]]:
        """Score ``node.actual`` against the rules.

        When ``node.actual`` is not a dict the rules are evaluated against an
        empty dict instead.
        """
        actual = node.actual
        if not isinstance(actual, dict):
            actual = {}
        rule_results, score = self.processor.run(self.rules, actual)
        serialized = [result.model_dump() for result in rule_results]
        return score, {"rule_results": serialized}
|
|
@@ -0,0 +1,35 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from typing import TYPE_CHECKING, Any
|
|
4
|
+
|
|
5
|
+
from structured_eval.metrics.base import RootMetric
|
|
6
|
+
from structured_eval.metrics.schema_validity.validator import SchemaValidator
|
|
7
|
+
|
|
8
|
+
if TYPE_CHECKING:
|
|
9
|
+
from pydantic import BaseModel
|
|
10
|
+
|
|
11
|
+
from structured_eval.models.nodes.base import EvalNode
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
class SchemaValidity(RootMetric):
    """Binary check: 1.0 when the actual document validates against ``schema``, else 0.0.

    ``schema`` may be a Pydantic model class or a JSON Schema dict. The
    validation findings are attached as the result's ``extra["schema_errors"]``
    — read via
    ``report.metrics["schema_validity"].extra_values("schema_errors")``.
    """

    name = "schema_validity"

    def __init__(self, schema: type[BaseModel] | dict[str, Any]):
        self.validator = SchemaValidator(schema)

    def compute(self, node: EvalNode) -> tuple[float, dict[str, Any]]:
        """Validate ``node.actual`` and return ``(score, {"schema_errors": ...})``."""
        outcome = self.validator.validate(node.actual)
        score = 1.0 if outcome.valid else 0.0
        details = {
            "type_errors": outcome.type_errors,
            "missing_required": outcome.missing_required,
            "extra_fields": outcome.extra_fields,
        }
        return score, {"schema_errors": details}
|
|
@@ -0,0 +1,119 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from typing import Any
|
|
4
|
+
|
|
5
|
+
from pydantic import BaseModel, ValidationError, computed_field
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
class SchemaResult(BaseModel):
    """What came out of validating an actual document against a schema."""

    # True when validation produced no errors.
    valid: bool
    # Locations of every error that is neither "missing" nor "extra".
    type_errors: list[str] = []
    # Locations of required fields that were absent.
    missing_required: list[str] = []
    # Locations of fields the schema does not allow.
    extra_fields: list[str] = []
    # Number of fields the schema declares; denominator of the error rate.
    total_fields: int = 0

    @computed_field  # type: ignore[prop-decorator]
    @property
    def type_error_rate(self) -> float | None:
        """Type errors per declared field, or ``None`` when no fields are declared."""
        declared = self.total_fields
        if declared != 0:
            return len(self.type_errors) / declared
        return None
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
class SchemaValidator:
    """Validates a document against a Pydantic model class or JSON Schema dict.

    Constructed with the ``schema`` once; ``validate(actual)`` returns a
    ``SchemaResult`` describing type errors, missing required and extra fields.
    Both back-ends report locations as dotted paths (``"address.city"``), so
    the three lists read the same regardless of the schema flavour.
    """

    def __init__(self, schema: type[BaseModel] | dict[str, Any]):
        self.schema = schema

    def validate(self, actual: Any) -> SchemaResult:
        """Validate ``actual`` with the back-end matching the schema's type.

        Raises:
            TypeError: If the schema is neither a ``BaseModel`` subclass nor a dict.
            ImportError: If a dict schema is used and ``jsonschema`` is not installed.
        """
        schema = self.schema
        if isinstance(schema, type) and issubclass(schema, BaseModel):
            return self._validate_pydantic(actual, schema)
        if isinstance(schema, dict):
            return self._validate_jsonschema(actual, schema)
        raise TypeError(
            f"schema must be a Pydantic BaseModel subclass or a dict, got {type(schema)!r}"
        )

    # ── Pydantic ──────────────────────────────────────────────────────────

    def _validate_pydantic(self, actual: Any, model: type[BaseModel]) -> SchemaResult:
        """Validate with ``model.model_validate`` and bucket the reported errors.

        Errors of type ``"missing"`` go to ``missing_required``,
        ``"extra_forbidden"`` to ``extra_fields``, everything else to
        ``type_errors``. ``total_fields`` is the number of fields on ``model``.
        """
        total = len(model.model_fields)
        try:
            model.model_validate(actual)
        except ValidationError as exc:
            type_errors: list[str] = []
            missing_required: list[str] = []
            extra_fields: list[str] = []

            for err in exc.errors():
                loc = ".".join(str(p) for p in err["loc"])
                kind = err["type"]
                if kind == "missing":
                    missing_required.append(loc)
                elif kind == "extra_forbidden":
                    extra_fields.append(loc)
                else:
                    type_errors.append(loc)

            return SchemaResult(
                valid=False,
                type_errors=type_errors,
                missing_required=missing_required,
                extra_fields=extra_fields,
                total_fields=total,
            )
        return SchemaResult(valid=True, total_fields=total)

    # ── JSON Schema ─────────────────────────────────────────────────────────

    @staticmethod
    def _child_path(parent: str, key: Any) -> str:
        """Dotted path of ``key`` under ``parent`` (just ``key`` at the root)."""
        return f"{parent}.{key}" if parent else str(key)

    def _missing_paths(self, err: Any, parent: str) -> list[str]:
        """Paths of the required properties absent from the failing object.

        Derived from the error's ``validator_value`` (the ``required`` list) and
        ``instance`` instead of parsing the human-readable message. Falls back
        to the object's own location when either has an unexpected shape.
        """
        required = err.validator_value
        instance = err.instance
        if isinstance(required, (list, tuple)) and isinstance(instance, dict):
            return [
                self._child_path(parent, field)
                for field in required
                if field not in instance
            ]
        return [parent or err.json_path]

    def _extra_paths(self, err: Any, parent: str) -> list[str]:
        """Paths of the properties rejected by ``additionalProperties``.

        A property is extra when it is neither listed under ``properties`` nor
        matched by a ``patternProperties`` pattern of the failing sub-schema.
        Falls back to the object's own location when nothing can be derived.
        """
        import re

        instance = err.instance
        sub_schema = err.schema
        fallback = [parent or err.json_path]
        if not (isinstance(instance, dict) and isinstance(sub_schema, dict)):
            return fallback

        declared = sub_schema.get("properties", {})
        patterns = list(sub_schema.get("patternProperties", {}))
        extras = [
            self._child_path(parent, key)
            for key in instance
            if key not in declared
            and not any(re.search(pattern, str(key)) for pattern in patterns)
        ]
        return extras or fallback

    def _validate_jsonschema(self, actual: Any, schema: dict[str, Any]) -> SchemaResult:
        """Validate with ``Draft7Validator`` and bucket the reported errors.

        ``required`` errors go to ``missing_required`` and
        ``additionalProperties`` errors to ``extra_fields``, each naming the
        offending property by its dotted path. Every other validator (type,
        pattern, minLength, …) is recorded in ``type_errors`` at the location of
        the failing value, or the root JSON path when that location is empty.
        ``total_fields`` is the number of top-level ``properties`` in ``schema``.

        Raises:
            ImportError: If ``jsonschema`` is not installed.
        """
        try:
            from jsonschema import Draft7Validator
        except ImportError as exc:  # pragma: no cover
            raise ImportError(
                "jsonschema is required for dict-schema validation. "
                "Install it with: pip install 'structured-eval[jsonschema]'"
            ) from exc

        total = len(schema.get("properties", {}))
        errors = list(Draft7Validator(schema).iter_errors(actual))

        if not errors:
            return SchemaResult(valid=True, total_fields=total)

        type_errors: list[str] = []
        missing_required: list[str] = []
        extra_fields: list[str] = []

        for err in errors:
            # Location of the value the error is about; empty for the root.
            parent = ".".join(str(p) for p in err.absolute_path)
            if err.validator == "required":
                # One error is raised per missing property, each carrying the
                # full ``required`` list, so skip paths already recorded.
                for path in self._missing_paths(err, parent):
                    if path not in missing_required:
                        missing_required.append(path)
            elif err.validator == "additionalProperties":
                for path in self._extra_paths(err, parent):
                    if path not in extra_fields:
                        extra_fields.append(path)
            else:
                type_errors.append(parent or err.json_path)

        return SchemaResult(
            valid=False,
            type_errors=type_errors,
            missing_required=missing_required,
            extra_fields=extra_fields,
            total_fields=total,
        )
|
|
@@ -0,0 +1,40 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from typing import TYPE_CHECKING
|
|
4
|
+
|
|
5
|
+
from structured_eval.metrics.base import RootMetric
|
|
6
|
+
from structured_eval.utils.flatten import extract_paths
|
|
7
|
+
|
|
8
|
+
if TYPE_CHECKING:
|
|
9
|
+
from structured_eval.metrics.base import MetricOutput
|
|
10
|
+
from structured_eval.models.nodes.base import EvalNode
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
class StructuralSimilarity(RootMetric):
    """Jaccard similarity between the path sets of two documents.

    Only the *shape* of ``actual`` and ``expected`` is compared; values are
    ignored::

        |paths_actual ∩ paths_expected| / |paths_actual ∪ paths_expected|

    The paths come from
    :func:`~structured_eval.utils.flatten.extract_paths`. The score is ``1.0``
    when both path sets are empty (vacuously identical), ``0.0`` when exactly
    one of them is empty or they share nothing, and the Jaccard ratio
    otherwise. It complements the value-aware metrics by answering "did the
    model produce the right skeleton" regardless of whether the values are
    correct.
    """

    name = "structural_similarity"

    def compute(self, node: EvalNode) -> MetricOutput:
        """Return the Jaccard ratio of the actual and expected path sets."""
        actual_paths = extract_paths(node.context.actual)
        expected_paths = extract_paths(node.context.expected)

        if not (actual_paths or expected_paths):
            return 1.0
        if not (actual_paths and expected_paths):
            return 0.0

        shared = actual_paths & expected_paths
        combined = actual_paths | expected_paths
        return len(shared) / len(combined)
|
|
@@ -0,0 +1,44 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import re
|
|
4
|
+
from collections import Counter
|
|
5
|
+
from typing import Any
|
|
6
|
+
|
|
7
|
+
from structured_eval.metrics.base import FieldMetric
|
|
8
|
+
|
|
9
|
+
_NON_WORD = re.compile(r"[^\w\s]")
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
def _tokenize(value: Any) -> list[str]:
|
|
13
|
+
"""Lowercase, drop punctuation, split on whitespace."""
|
|
14
|
+
return _NON_WORD.sub(" ", str(value).lower()).split()
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
class TokenF1(FieldMetric):
    """SQuAD-style token-overlap F1 — a default for free-text fields.

    Both strings are tokenized and compared as **multisets** (``Counter``):
    a token counts as shared only as many times as it occurs on both sides, so
    ``"the the cat"`` vs ``"the cat"`` scores 0.8, not 1.0. Precision divides
    the shared count by the number of actual tokens, recall by the number of
    expected tokens, and the score is their harmonic mean. String-only: if
    either side is not a ``str`` the score is 0.0 (no coercion).
    """

    name = "token_f1"

    def score(self, actual: Any, expected: Any) -> float:
        """Return the token F1 of ``actual`` against ``expected``.

        Two token-less strings score 1.0; a token-less string against a
        non-empty one, or no shared tokens at all, scores 0.0.
        """
        if not isinstance(actual, str) or not isinstance(expected, str):
            return 0.0

        actual_tokens = _tokenize(actual)
        expected_tokens = _tokenize(expected)
        if not (actual_tokens or expected_tokens):
            return 1.0
        if not (actual_tokens and expected_tokens):
            return 0.0

        overlap = Counter(actual_tokens) & Counter(expected_tokens)
        shared = sum(overlap.values())
        if shared == 0:
            return 0.0

        precision = shared / len(actual_tokens)
        recall = shared / len(expected_tokens)
        return 2 * precision * recall / (precision + recall)
|
|
@@ -0,0 +1,35 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from typing import Any
|
|
4
|
+
|
|
5
|
+
from structured_eval.metrics.base import FieldMetric
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
def _json_type(value: Any) -> str:
|
|
9
|
+
"""Map a Python value to its JSON type name (bool before int)."""
|
|
10
|
+
if isinstance(value, bool):
|
|
11
|
+
return "bool"
|
|
12
|
+
if isinstance(value, (int, float)):
|
|
13
|
+
return "number"
|
|
14
|
+
if isinstance(value, str):
|
|
15
|
+
return "string"
|
|
16
|
+
if isinstance(value, list):
|
|
17
|
+
return "array"
|
|
18
|
+
if isinstance(value, dict):
|
|
19
|
+
return "object"
|
|
20
|
+
if value is None:
|
|
21
|
+
return "null"
|
|
22
|
+
return type(value).__name__
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
class TypeMatch(FieldMetric):
    """Type agreement: 1.0 when actual and expected have the same JSON type, else 0.0.

    Flags a frequent LLM slip — producing ``"100"`` (string) where ``100``
    (number) is expected — without looking at whether the value is right.
    """

    name = "type_match"

    def score(self, actual: Any, expected: Any) -> float:
        """Compare the JSON type names of ``actual`` and ``expected``."""
        actual_type = _json_type(actual)
        expected_type = _json_type(expected)
        if actual_type == expected_type:
            return 1.0
        return 0.0
|
|
@@ -0,0 +1,10 @@
|
|
|
1
|
+
"""Utilities shared across metric implementations (metric-layer only).
|
|
2
|
+
|
|
3
|
+
Clearly-scoped modules:
|
|
4
|
+
|
|
5
|
+
* ``calculate`` — the precision / recall / F1 arithmetic (and the ``GradingMode``
|
|
6
|
+
hard/soft enum) used by every P/R/F1 metric, object and array alike.
|
|
7
|
+
* ``object_utils`` — turning an object's matched fields into the
|
|
8
|
+
``(score, threshold, weight)`` triples that ``calculate.prf_counts`` consumes.
|
|
9
|
+
* ``array`` — the same for an array's aligned items, plus missing/spurious counts.
|
|
10
|
+
"""
|
|
@@ -0,0 +1,31 @@
|
|
|
1
|
+
"""Verdicts for array metrics: aligned items → ``(score, threshold, weight)``.
|
|
2
|
+
|
|
3
|
+
An aligned item is graded by its representative score against a single
|
|
4
|
+
``threshold`` (hard) or contributes that score fractionally (soft) — mirroring
|
|
5
|
+
how object fields are graded. ``missed`` items are FN, ``spurious`` items FP.
|
|
6
|
+
The verdicts feed ``calculate.prf_counts``.
|
|
7
|
+
|
|
8
|
+
Array elements share one ``item`` config, so they carry no individual weights:
|
|
9
|
+
every item (and every missed/spurious slot) weighs ``1.0`` and array metrics are
|
|
10
|
+
effectively count-based.
|
|
11
|
+
"""
|
|
12
|
+
|
|
13
|
+
from __future__ import annotations
|
|
14
|
+
|
|
15
|
+
from typing import TYPE_CHECKING
|
|
16
|
+
|
|
17
|
+
if TYPE_CHECKING:
|
|
18
|
+
from structured_eval.models.nodes.array_node import ArrayNode
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
def verdicts(node: ArrayNode, threshold: float) -> list[tuple[float, float, float]]:
    """Build one ``(representative, threshold, 1.0)`` verdict per item in ``node.items``.

    Every item gets the same ``threshold`` and a fixed weight of ``1.0``.
    """
    graded: list[tuple[float, float, float]] = []
    for aligned in node.items:
        graded.append((aligned.representative, threshold, 1.0))
    return graded
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
def missing_spurious(node: ArrayNode) -> tuple[int, int]:
    """Count the missed and spurious entries of ``node.match_result``.

    Returns ``(0, 0)`` when the node has no match result.
    """
    alignment = node.match_result
    if alignment is not None:
        return len(alignment.missed), len(alignment.spurious)
    return 0, 0
|
|
@@ -0,0 +1,72 @@
|
|
|
1
|
+
"""Precision / recall / F1 arithmetic over resolved field/item verdicts.
|
|
2
|
+
|
|
3
|
+
Each matched scalar field (or array item) is both a *predicted* and an
|
|
4
|
+
*expected* entry; ``spurious`` entries add to predicted (FP), ``missing`` ones
|
|
5
|
+
add to expected (FN). So a present-but-wrong entry lowers both precision and
|
|
6
|
+
recall. Nested object/array children are graded at their own node and are not
|
|
7
|
+
counted here.
|
|
8
|
+
|
|
9
|
+
A ``verdicts`` argument is a list of ``(score, threshold, weight)`` from
|
|
10
|
+
the verdict helpers under ``structured_eval.metrics.utils`` (e.g. ``array.verdicts``). Each entry contributes its
|
|
11
|
+
``weight`` (``1.0`` by default → plain counts) rather than a flat 1: in
|
|
12
|
+
``GradingMode.HARD`` an entry is a TP when ``score >= threshold`` (counts its
|
|
13
|
+
weight); in ``GradingMode.SOFT`` it contributes ``weight * score`` (threshold
|
|
14
|
+
ignored). ``missing_weight`` / ``spurious_weight`` are the summed weights of the
|
|
15
|
+
FN / FP entries (counts when uniform). How those weights are derived is the
|
|
16
|
+
caller's choice (see ``WeightMode``).
|
|
17
|
+
"""
|
|
18
|
+
|
|
19
|
+
from __future__ import annotations
|
|
20
|
+
|
|
21
|
+
from enum import StrEnum
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
class GradingMode(StrEnum):
|
|
25
|
+
"""How a verdict counts toward true positives."""
|
|
26
|
+
|
|
27
|
+
HARD = "hard" # threshold gate: TP iff score >= threshold (counts its weight)
|
|
28
|
+
SOFT = "soft" # graded: weight * score contributes, no threshold
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
class WeightMode(StrEnum):
|
|
32
|
+
"""How a node's children contribute to its weighted aggregate.
|
|
33
|
+
|
|
34
|
+
Extensible: more strategies (e.g. only first-level weights, or uniform per
|
|
35
|
+
level) can be added without touching the arithmetic below.
|
|
36
|
+
"""
|
|
37
|
+
|
|
38
|
+
NONE = "none" # ignore configured weights — every child counts 1.0
|
|
39
|
+
PROPORTIONAL = "proportional" # weight each child by its configured ``weight``
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
def prf_counts(
    verdicts: list[tuple[float, float, float]],
    missing_weight: float,
    spurious_weight: float,
    mode: GradingMode = GradingMode.HARD,
) -> tuple[float, float, float]:
    """Compute the weighted ``(tp, predicted, expected)`` totals.

    Args:
        verdicts: ``(score, threshold, weight)`` for every matched entry.
        missing_weight: Summed weight of the expected-but-absent entries (FN).
        spurious_weight: Summed weight of the unexpected entries (FP).
        mode: ``SOFT`` credits ``weight * score`` per verdict; any other mode
            credits ``weight`` for each verdict with ``score >= threshold``.

    Returns:
        ``tp`` as described above, ``predicted`` = matched weight + spurious
        weight, ``expected`` = matched weight + missing weight. With uniform
        weights of 1.0 these are plain counts.
    """
    total_matched = sum(w for _score, _threshold, w in verdicts)

    if mode == GradingMode.SOFT:
        true_positive = sum(w * s for s, _threshold, w in verdicts)
    else:
        true_positive = sum(w for s, t, w in verdicts if s >= t)

    return (
        true_positive,
        total_matched + spurious_weight,
        total_matched + missing_weight,
    )
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
def precision(tp: float, predicted: float, expected: float) -> float:
    """Return ``tp / predicted``.

    With nothing predicted the result is 1.0 if nothing was expected either
    (an empty object is vacuously precise) and 0.0 otherwise.
    """
    if predicted != 0:
        return tp / predicted
    if expected == 0:
        return 1.0
    return 0.0
|
|
63
|
+
|
|
64
|
+
|
|
65
|
+
def recall(tp: float, predicted: float, expected: float) -> float:
    """Return ``tp / expected``.

    With nothing expected the result is 1.0 if nothing was predicted either
    and 0.0 otherwise.
    """
    if expected != 0:
        return tp / expected
    if predicted == 0:
        return 1.0
    return 0.0
|
|
69
|
+
|
|
70
|
+
|
|
71
|
+
def f1(p: float, r: float) -> float:
    """Return the harmonic mean of precision ``p`` and recall ``r``, or 0.0 when both are 0."""
    if not (p + r):
        return 0.0
    return 2 * p * r / (p + r)
|