structured-eval 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- structured_eval/__init__.py +27 -0
- structured_eval/alignment/__init__.py +15 -0
- structured_eval/alignment/base.py +40 -0
- structured_eval/alignment/by_index.py +24 -0
- structured_eval/alignment/by_key.py +73 -0
- structured_eval/alignment/factory.py +28 -0
- structured_eval/alignment/hungarian.py +156 -0
- structured_eval/api.py +79 -0
- structured_eval/engine/__init__.py +15 -0
- structured_eval/engine/aggregator.py +96 -0
- structured_eval/engine/evaluator.py +72 -0
- structured_eval/engine/metric_runner.py +69 -0
- structured_eval/engine/parser.py +42 -0
- structured_eval/engine/report_builder.py +68 -0
- structured_eval/engine/tree_builder.py +319 -0
- structured_eval/formats/__init__.py +5 -0
- structured_eval/formats/base.py +19 -0
- structured_eval/formats/json_parser.py +44 -0
- structured_eval/formats/yaml_parser.py +24 -0
- structured_eval/integrations/__init__.py +11 -0
- structured_eval/integrations/_adapter.py +47 -0
- structured_eval/integrations/deepeval.py +74 -0
- structured_eval/integrations/langsmith.py +90 -0
- structured_eval/metrics/__init__.py +101 -0
- structured_eval/metrics/array_accuracy.py +28 -0
- structured_eval/metrics/array_cardinality.py +27 -0
- structured_eval/metrics/array_exact_match.py +48 -0
- structured_eval/metrics/array_f1.py +34 -0
- structured_eval/metrics/array_jaccard_similarity.py +60 -0
- structured_eval/metrics/array_precision.py +34 -0
- structured_eval/metrics/array_prf1.py +40 -0
- structured_eval/metrics/array_recall.py +33 -0
- structured_eval/metrics/base.py +144 -0
- structured_eval/metrics/character_f1.py +50 -0
- structured_eval/metrics/composite_score.py +46 -0
- structured_eval/metrics/coverage_leaf_score.py +29 -0
- structured_eval/metrics/date_distance_score.py +63 -0
- structured_eval/metrics/exact.py +21 -0
- structured_eval/metrics/exponential_numeric_score.py +47 -0
- structured_eval/metrics/field_faithfulness.py +38 -0
- structured_eval/metrics/fuzzy.py +64 -0
- structured_eval/metrics/invoker.py +90 -0
- structured_eval/metrics/levenshtein.py +16 -0
- structured_eval/metrics/mean_score.py +31 -0
- structured_eval/metrics/numeric.py +83 -0
- structured_eval/metrics/numeric_closeness.py +35 -0
- structured_eval/metrics/object_accuracy.py +47 -0
- structured_eval/metrics/object_exact_match.py +41 -0
- structured_eval/metrics/object_f1.py +47 -0
- structured_eval/metrics/object_precision.py +49 -0
- structured_eval/metrics/object_prf1.py +51 -0
- structured_eval/metrics/object_recall.py +44 -0
- structured_eval/metrics/object_type_validity.py +34 -0
- structured_eval/metrics/overall_leaf_score.py +32 -0
- structured_eval/metrics/presence.py +22 -0
- structured_eval/metrics/regex_match.py +51 -0
- structured_eval/metrics/rule_pass_rate/__init__.py +5 -0
- structured_eval/metrics/rule_pass_rate/dsl.py +224 -0
- structured_eval/metrics/rule_pass_rate/engine.py +24 -0
- structured_eval/metrics/rule_pass_rate/metric.py +33 -0
- structured_eval/metrics/schema_validity/__init__.py +7 -0
- structured_eval/metrics/schema_validity/metric.py +35 -0
- structured_eval/metrics/schema_validity/validator.py +119 -0
- structured_eval/metrics/structural_similarity.py +40 -0
- structured_eval/metrics/token_f1.py +44 -0
- structured_eval/metrics/type_match.py +35 -0
- structured_eval/metrics/utils/__init__.py +10 -0
- structured_eval/metrics/utils/array.py +31 -0
- structured_eval/metrics/utils/calculate.py +72 -0
- structured_eval/metrics/utils/number.py +46 -0
- structured_eval/metrics/utils/object_utils.py +87 -0
- structured_eval/models/__init__.py +72 -0
- structured_eval/models/config.py +124 -0
- structured_eval/models/context.py +25 -0
- structured_eval/models/metric_result.py +121 -0
- structured_eval/models/nodes/__init__.py +13 -0
- structured_eval/models/nodes/array_node.py +32 -0
- structured_eval/models/nodes/base.py +113 -0
- structured_eval/models/nodes/object_node.py +19 -0
- structured_eval/models/nodes/scalar.py +14 -0
- structured_eval/models/result.py +361 -0
- structured_eval/models/sample.py +19 -0
- structured_eval/py.typed +0 -0
- structured_eval/reporting/__init__.py +5 -0
- structured_eval/reporting/console.py +194 -0
- structured_eval/utils/__init__.py +16 -0
- structured_eval/utils/flatten.py +66 -0
- structured_eval/utils/paths.py +58 -0
- structured_eval/utils/structured_diff.py +159 -0
- structured_eval-0.1.0.dist-info/METADATA +322 -0
- structured_eval-0.1.0.dist-info/RECORD +94 -0
- structured_eval-0.1.0.dist-info/WHEEL +5 -0
- structured_eval-0.1.0.dist-info/licenses/LICENSE +201 -0
- structured_eval-0.1.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,69 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from typing import TYPE_CHECKING, Any
|
|
4
|
+
|
|
5
|
+
from structured_eval.metrics.invoker import MetricInvoker
|
|
6
|
+
from structured_eval.models.metric_result import MetricResult
|
|
7
|
+
|
|
8
|
+
if TYPE_CHECKING:
|
|
9
|
+
from structured_eval.metrics.base import BaseMetric, MetricOutput
|
|
10
|
+
from structured_eval.models.nodes.base import EvalNode
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
class MetricRunner:
    """Phase 2: compute each node's own metrics across the tree, in place.

    Every node carries the metrics resolved for it by ``TreeBuilder``; this
    phase walks the tree **post-order** (children before their parent), so an
    aggregating parent reads its children's already-computed representative
    scores — computation is uniform and fully recursive at any nesting depth.
    Within a node the ``key_metric`` runs *last*: it is the representative score
    and its logic may depend on the node's other metrics (the default
    ``MeanScore`` averages them). A metric returning ``None`` (e.g.
    ``Faithfulness`` without a source) is skipped.
    """

    def run(self, root: EvalNode) -> None:
        """Compute metrics for ``root`` and all of its descendants, in place."""
        self._visit(root)

    def _visit(self, node: EvalNode) -> None:
        """Process ``node``'s children first, then its own metrics."""
        for child in node.children_nodes():
            self._visit(child)
        key_metric = node.key_metric
        for metric in node.metrics:
            # Identity check: the key metric may also be listed in
            # ``node.metrics``; it is deferred so it runs exactly once, last.
            if metric is key_metric:
                continue
            self._apply(metric, node)
        if key_metric is not None:
            self._apply(key_metric, node)

    def _apply(self, metric: BaseMetric, node: EvalNode) -> None:
        """Run ``metric`` on ``node`` and merge its output into ``node.metric_results``."""
        result = MetricInvoker(metric).on_node(node)
        node.metric_results.update(self._normalize(metric.name, result))

    @staticmethod
    def _normalize(name: str, result: MetricOutput) -> dict[str, MetricResult]:
        """Coerce any ``compute`` return into ``{key: MetricResult}``.

        Accepts ``None`` (skip), a bare value, a ``dict`` of sub-scores, a
        ``MetricResult``, or a ``(value | dict, extra)`` tuple — so a metric can
        attach structured ``extra`` regardless of how it shapes its score. A
        tuple's ``extra`` is attached to every key it produces.

        A tuple whose score part is ``None`` is skipped exactly like a bare
        ``None``, and a tuple whose ``extra`` part is ``None`` is treated as
        carrying no extra detail.
        """
        if result is None:
            return {}
        extra: dict[str, Any] = {}
        if isinstance(result, tuple):
            result, tuple_extra = result
            if result is None:
                # "(None, extra)" means "nothing to report", same as bare None.
                return {}
            if tuple_extra is not None:
                extra = tuple_extra
        if isinstance(result, dict):
            # Per-key extras (when a value is itself a MetricResult) are kept;
            # the tuple-level extra wins on key collisions.
            return {
                k: MetricResult(v, {**getattr(v, "extra", {}), **extra})
                for k, v in result.items()
            }
        if isinstance(result, MetricResult):
            if not extra:
                # Nothing to merge: pass the metric's own result through untouched.
                return {name: result}
            return {name: MetricResult(result, {**result.extra, **extra})}
        return {name: MetricResult(result, extra)}
|
|
@@ -0,0 +1,42 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from typing import Any
|
|
4
|
+
|
|
5
|
+
from structured_eval.formats.base import ParseError
|
|
6
|
+
from structured_eval.formats.json_parser import JsonParser
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
class Parser:
    """Turn raw sample input into Python values, reporting failure as data.

    Anything that is not a string (dict, list, ``None``, scalars, ...) is
    returned unchanged. A string is first decoded as JSON; when that fails it
    is decoded as YAML (if PyYAML is available), and the YAML result is kept
    only when it is a dict or a list. ``parse`` yields ``(value, None)`` on
    success and ``(None, message)`` on failure, which lets the engine record
    the parse error in the ``EvalReport`` instead of propagating an exception.
    """

    def __init__(self) -> None:
        self._json = JsonParser()

    def parse(self, raw: Any) -> tuple[Any, str | None]:
        """Return ``(value, None)`` or ``(None, error_message)`` for ``raw``."""
        if not isinstance(raw, str):
            return raw, None
        try:
            parsed = self._json.parse(raw)
        except ParseError as json_error:
            fallback = self._try_yaml(raw)
            if fallback is None:
                # The JSON error is the one reported, even though YAML was tried too.
                return None, str(json_error)
            return fallback, None
        else:
            return parsed, None

    @staticmethod
    def _try_yaml(text: str) -> Any | None:
        """Decode ``text`` as YAML; give back a dict/list, else ``None``."""
        from structured_eval.formats.yaml_parser import YamlParser

        try:
            candidate = YamlParser().parse(text)
        except (ParseError, ImportError):
            return None
        if isinstance(candidate, (dict, list)):
            return candidate
        return None
|
|
@@ -0,0 +1,68 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from typing import TYPE_CHECKING, ClassVar
|
|
4
|
+
|
|
5
|
+
from structured_eval.models.metric_result import MetricCollection
|
|
6
|
+
from structured_eval.models.nodes.array_node import ArrayNode
|
|
7
|
+
from structured_eval.models.nodes.object_node import ObjectNode
|
|
8
|
+
from structured_eval.models.nodes.scalar import ScalarNode
|
|
9
|
+
from structured_eval.models.result import EvalReport, EvalWarning, FieldScore, NodeType
|
|
10
|
+
|
|
11
|
+
if TYPE_CHECKING:
|
|
12
|
+
from structured_eval.models.context import EvalContext
|
|
13
|
+
from structured_eval.models.nodes.base import EvalNode
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
class ReportBuilder:
    """Phase 3: flatten the computed node tree into an ``EvalReport``."""

    # Node class → the node type recorded on each ``FieldScore``.
    _NODE_TYPE: ClassVar[dict[type, NodeType]] = {
        ScalarNode: NodeType.SCALAR,
        ObjectNode: NodeType.OBJECT,
        ArrayNode: NodeType.ARRAY,
    }

    def build(
        self, root: EvalNode, context: EvalContext, warnings: list[EvalWarning]
    ) -> EvalReport:
        """Walk the tree under ``root`` and assemble the report.

        ``context`` is accepted for interface compatibility and is not read
        here. ``warnings`` is passed through to the report unchanged.
        """
        field_scores = {}
        array_matches = {}
        # report.metrics is a cross-field view: each metric name → its value at
        # every node that produced it (a MetricCollection), built as we walk.
        # A metric's structured detail (schema errors, hallucinated paths, …)
        # rides along on each value's ``.extra``.
        collections: dict[str, MetricCollection] = {}
        for node in root.walk():
            field_scores[node.path] = self._field_score(node)
            for name, result in node.metric_results.items():
                coll = collections.get(name)
                if coll is None:
                    # Create the collection only on first sight of the metric;
                    # ``setdefault(name, MetricCollection(...))`` would build a
                    # discarded instance for every later occurrence.
                    coll = collections[name] = MetricCollection(name=name)
                coll.by_path[node.path] = result
            if isinstance(node, ArrayNode) and node.match_result is not None:
                array_matches[node.path] = node.match_result

        # The headline number is the root node's representative (key) metric.
        score_label = root.key_metric.name if root.key_metric is not None else None
        root_score = (
            root.metric_results.get(score_label) if score_label is not None else None
        )
        score = float(root_score) if root_score is not None else None

        return EvalReport(
            score=score,
            score_label=score_label,
            metrics=collections,
            field_scores=field_scores,
            array_matches=array_matches,
            warnings=warnings,
        )

    @classmethod
    def _node_type(cls, node: EvalNode) -> NodeType:
        """Map ``node`` to its ``NodeType``, honouring subclasses of the node classes.

        Walks the node's MRO so a subclass of ``ObjectNode``/``ArrayNode`` is
        reported as its base kind; falls back to ``NodeType.SCALAR`` when no
        known node class is found.
        """
        for klass in type(node).__mro__:
            node_type = cls._NODE_TYPE.get(klass)
            if node_type is not None:
                return node_type
        return NodeType.SCALAR

    def _field_score(self, node: EvalNode) -> FieldScore:
        """Snapshot one node as a ``FieldScore`` (metrics are shallow-copied)."""
        return FieldScore(
            path=node.path,
            node_type=self._node_type(node),
            actual=node.actual,
            expected=node.expected,
            metrics=dict(node.metric_results),
            score=node.representative,  # the node's representative (key-metric) value
            threshold=node.threshold,
        )
|
|
@@ -0,0 +1,319 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from typing import TYPE_CHECKING, Any
|
|
4
|
+
|
|
5
|
+
from structured_eval.alignment import make_aligner
|
|
6
|
+
from structured_eval.metrics.array_accuracy import ArrayAccuracy
|
|
7
|
+
from structured_eval.metrics.base import (
|
|
8
|
+
AnyNodeMetric,
|
|
9
|
+
ArrayMetric,
|
|
10
|
+
BaseMetric,
|
|
11
|
+
FieldMetric,
|
|
12
|
+
GenericMetric,
|
|
13
|
+
Metric,
|
|
14
|
+
ObjectMetric,
|
|
15
|
+
RootMetric,
|
|
16
|
+
resolve_metric,
|
|
17
|
+
)
|
|
18
|
+
from structured_eval.metrics.exact import ExactMatch
|
|
19
|
+
from structured_eval.metrics.invoker import GENERIC_NODE_METHOD
|
|
20
|
+
from structured_eval.metrics.mean_score import MeanScore
|
|
21
|
+
from structured_eval.metrics.object_accuracy import ObjectAccuracy
|
|
22
|
+
from structured_eval.models.config import (
|
|
23
|
+
AnyFieldConfig,
|
|
24
|
+
ArrayFieldConfig,
|
|
25
|
+
ArrayStrategy,
|
|
26
|
+
EvalConfig,
|
|
27
|
+
ExtraKeysPolicy,
|
|
28
|
+
ObjectFieldConfig,
|
|
29
|
+
weight_of,
|
|
30
|
+
)
|
|
31
|
+
from structured_eval.models.nodes.array_node import ArrayNode
|
|
32
|
+
from structured_eval.models.nodes.base import MISSING, EvalNode, navigate
|
|
33
|
+
from structured_eval.models.nodes.object_node import ObjectNode
|
|
34
|
+
from structured_eval.models.nodes.scalar import ScalarNode
|
|
35
|
+
from structured_eval.models.result import EvalWarning, WarningType
|
|
36
|
+
|
|
37
|
+
if TYPE_CHECKING:
|
|
38
|
+
from structured_eval.models.context import EvalContext
|
|
39
|
+
|
|
40
|
+
# Fallback metric class per node type, used when no configured metric applies to
# a node, so every node always carries at least one metric for its key_metric
# (MeanScore by default) to average.
DEFAULT_SCALAR_METRIC = ExactMatch
DEFAULT_OBJECT_METRIC = ObjectAccuracy
DEFAULT_ARRAY_METRIC = ArrayAccuracy
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
class TreeBuilder:
    """Phase 1: build the EvalNode tree and resolve each node's metric list.

    ``build`` returns ``(root_node, warnings)``. This phase is purely
    structural: it shapes the tree, resolves which metrics apply to each node
    (cascading the config's global metrics by type and adding any per-node
    ``cfg.metrics``), and attaches them to ``node.metrics``. Computation happens
    later, uniformly, in ``MetricRunner``. Each node carries an ``actual``-side
    ``path`` and, when arrays reorder elements, a diverging ``expected_path`` so
    each side navigates its own index.
    """

    def __init__(self, context: EvalContext):
        self.context = context
        self.config: EvalConfig = context.config
        self.warnings: list[EvalWarning] = []
        self._globals = self._resolve_globals()

    def build(self) -> tuple[EvalNode, list[EvalWarning]]:
        """Build the tree from the root path ``"$"`` and return it with the warnings."""
        root = self.node("$", "$", self.root_config())
        return root, self.warnings

    # ── config resolution ──────────────────────────────────────────────────

    def root_config(self) -> ObjectFieldConfig | ArrayFieldConfig | None:
        """Config for the root node.

        An explicit ``config.root`` wins; otherwise non-empty ``config.fields``
        are wrapped in an ``ObjectFieldConfig``; otherwise there is no config.
        """
        if self.config.root is not None:
            return self.config.root
        if self.config.fields:
            return ObjectFieldConfig(fields=dict(self.config.fields))
        return None

    @staticmethod
    def _applies_to(metric: BaseMetric, node_cls: type, is_root: bool) -> bool:
        """Whether ``metric`` should be resolved onto a node of ``node_cls``.

        Typed metrics match their node type (a ``RootMetric`` only at the root);
        an ``AnyNodeMetric`` matches every node; a ``GenericMetric`` matches iff
        it defines the node's ``compute_<kind>``.
        """
        # The order of these checks is significant: the first matching base
        # class decides.
        if isinstance(metric, RootMetric):
            return is_root
        if isinstance(metric, AnyNodeMetric):
            return True
        if isinstance(metric, FieldMetric):
            return issubclass(node_cls, ScalarNode)
        if isinstance(metric, ObjectMetric):
            return issubclass(node_cls, ObjectNode)
        if isinstance(metric, ArrayMetric):
            return issubclass(node_cls, ArrayNode)
        if isinstance(metric, GenericMetric):
            method = GENERIC_NODE_METHOD.get(node_cls)
            return method is not None and hasattr(metric, method)
        return False

    def _resolve_globals(self) -> list[BaseMetric]:
        """The cascade set: ``config.metrics``, deduped by identity.

        ``key_metric`` is *not* cascaded here — it is each node's representative
        metric, resolved per node by ``_key_metric`` (and computed last).
        """
        out: list[BaseMetric] = []
        for spec in self.config.metrics:
            metric = resolve_metric(spec)
            if not any(metric is seen for seen in out):
                out.append(metric)
        return out

    def _resolve_metrics(
        self, node_cls: type, cfg: AnyFieldConfig | None, is_root: bool
    ) -> list[BaseMetric]:
        """Metrics for one node: applicable globals + this node's own (additive).

        Globals cascade by type (a ``RootMetric`` only at the root); per-node
        ``cfg.metrics`` are *added* (not a replacement), deduped by identity.
        ``out`` only ever holds metrics applicable to this node (``add`` filters
        by ``_applies_to``); if it ends up empty the node gets the default for
        its type so every node always carries at least one metric for its
        ``key_metric`` to summarise (a different default is set by putting a
        metric of that type in ``config.metrics``, which cascades).
        """
        out: list[BaseMetric] = []

        def add(metric: BaseMetric) -> None:
            if self._applies_to(metric, node_cls, is_root) and not any(
                metric is s for s in out
            ):
                out.append(metric)

        for metric in self._globals:
            add(metric)
        for spec in getattr(cfg, "metrics", None) or []:
            add(resolve_metric(spec))

        if not out:
            if issubclass(node_cls, ScalarNode):
                add(DEFAULT_SCALAR_METRIC())
            elif issubclass(node_cls, ObjectNode):
                add(DEFAULT_OBJECT_METRIC())
            elif issubclass(node_cls, ArrayNode):
                add(DEFAULT_ARRAY_METRIC())
        return out

    def _key_metric(
        self,
        node_cls: type,
        cfg: AnyFieldConfig | None,
        is_root: bool,
        metrics: list[BaseMetric],
    ) -> Metric[Any]:
        """The node's representative metric (computed last).

        Prefers an explicit ``cfg.key_metric``, then a distributable
        ``config.key_metric`` (each applied only where its type fits), else the
        default ``MeanScore`` (the mean of the node's own metrics).

        A *name string* is resolved against the node's already-resolved
        ``metrics`` first: an equally-named metric is **reused as-is** (same
        instance, same params, no duplicate computation). It is instantiated
        fresh only when the name is not already on the node.

        Raises:
            TypeError: if the applicable key metric is not a ``Metric``.
        """
        for spec in (getattr(cfg, "key_metric", None), self.config.key_metric):
            if spec is None:
                continue
            # Reuse an equally-named metric already on the node; else resolve fresh.
            metric = None
            if isinstance(spec, str):
                metric = next((m for m in metrics if m.name == spec), None)
            if metric is None:
                # Explicit ``is None`` so reuse never depends on a metric's truthiness.
                metric = resolve_metric(spec)
            if not self._applies_to(metric, node_cls, is_root):
                continue
            if not isinstance(metric, Metric):
                # A key metric has compute()/score(). Raised explicitly rather
                # than asserted so the check survives ``python -O``.
                raise TypeError(
                    f"key metric must be a Metric, got {type(metric).__name__}"
                )
            return metric
        return MeanScore()

    # ── tree construction ────────────────────────────────────────────────

    def _value(self, doc: Any, path: str) -> Any:
        """Value at ``path`` in ``doc``, with the ``MISSING`` sentinel mapped to ``None``."""
        value = navigate(doc, path)
        return None if value is MISSING else value

    def _child(self, path: str, key: str) -> str:
        """Path of ``key`` under ``path``; children of the root (``"$"``/``""``) are bare keys."""
        return key if path in ("$", "") else f"{path}.{key}"

    def _node_kwargs(
        self, node_cls: type, apath: str, epath: str, cfg: AnyFieldConfig | None
    ) -> dict[str, Any]:
        """Constructor arguments shared by every node type."""
        is_root = apath == "$"
        metrics = self._resolve_metrics(node_cls, cfg, is_root)
        return {
            "path": apath,
            "context": self.context,
            # Only stored when the expected side navigates a different path.
            "expected_path": epath if epath != apath else None,
            "weight": weight_of(cfg),
            "metrics": metrics,
            "key_metric": self._key_metric(node_cls, cfg, is_root, metrics),
            "threshold": self._threshold(cfg),
        }

    def node(self, apath: str, epath: str, cfg: AnyFieldConfig | None) -> EvalNode:
        """Build the node for ``apath`` (actual side) / ``epath`` (expected side).

        The node kind follows the expected value, falling back to the actual
        value when expected is ``None``: dict → object, list → array, anything
        else → scalar.
        """
        actual = self._value(self.context.actual, apath)
        expected = self._value(self.context.expected, epath)
        ref = expected if expected is not None else actual

        if isinstance(ref, dict):
            return self._object(apath, epath, cfg, actual, expected)
        if isinstance(ref, list):
            return self._array(apath, epath, cfg, actual, expected)
        return self._scalar(apath, epath, cfg)

    def _object(
        self,
        apath: str,
        epath: str,
        cfg: AnyFieldConfig | None,
        actual: Any,
        expected: Any,
    ) -> ObjectNode:
        """Build an object node with one child per key seen on either side.

        Records an ``EXTRA_KEY`` warning per actual-only key when the policy is
        not ``PENALIZE``, and a ``MISSING_FIELD`` warning per expected-only key.
        """
        # A side that is not a dict contributes no keys.
        a_keys = set(actual) if isinstance(actual, dict) else set()
        e_keys = set(expected) if isinstance(expected, dict) else set()
        both = a_keys & e_keys
        missing = sorted(e_keys - a_keys)  # in expected only → FN
        extra = sorted(a_keys - e_keys)  # in actual only → FP (subject to policy)

        if self.config.extra_keys == ExtraKeysPolicy.PENALIZE:
            spurious = extra
        else:
            spurious = []
            for key in extra:
                path = self._child(apath, key)
                self.warnings.append(
                    EvalWarning(
                        type=WarningType.EXTRA_KEY,
                        path=path,
                        message=f"{path!r} not in expected (ExtraKeysPolicy.IGNORE)",
                    )
                )

        fields = cfg.fields if isinstance(cfg, ObjectFieldConfig) else {}
        children: dict[str, Any] = {}
        matched: list[Any] = []
        for key in sorted(a_keys | e_keys):
            child = self.node(
                self._child(apath, key), self._child(epath, key), fields.get(key)
            )
            children[key] = child
            if key in both:
                matched.append(child)
        for key in missing:
            path = self._child(apath, key)
            self.warnings.append(
                EvalWarning(
                    type=WarningType.MISSING_FIELD,
                    path=path,
                    message=f"{path!r} absent in actual",
                )
            )

        return ObjectNode(
            **self._node_kwargs(ObjectNode, apath, epath, cfg),
            matched=matched,
            missing=missing,
            spurious=spurious,
            children=children,
        )

    def _array(
        self,
        apath: str,
        epath: str,
        cfg: AnyFieldConfig | None,
        actual: Any,
        expected: Any,
    ) -> ArrayNode:
        """Build an array node with one item node per aligned (expected, actual) pair.

        The aligner comes from ``cfg`` when it is an ``ArrayFieldConfig``;
        otherwise elements are aligned with ``ArrayStrategy.BY_INDEX``.
        """
        # A side that is not a list is treated as empty.
        a_list: list[Any] = actual if isinstance(actual, list) else []
        e_list: list[Any] = expected if isinstance(expected, list) else []
        if isinstance(cfg, ArrayFieldConfig):
            aligner = make_aligner(strategy=cfg.strategy, params=cfg.params)
            item_cfg = cfg.item
        else:
            aligner = make_aligner(strategy=ArrayStrategy.BY_INDEX, params=None)
            item_cfg = None
        result = aligner.align(e_list, a_list)
        # TODO: with no expected list (faithfulness / schema-only mode) there are
        # no matched pairs, so array elements get no nodes — value-on-actual
        # metrics (FieldFaithfulness) can't reach them. Materialize actual
        # elements directly in that mode. Roadmap.
        items = [
            # Each side indexes its own list, so the two paths may diverge.
            self.node(f"{apath}[{aidx}]", f"{epath}[{eidx}]", item_cfg)
            for eidx, aidx in result.matched
        ]
        return ArrayNode(
            **self._node_kwargs(ArrayNode, apath, epath, cfg),
            match_result=result,
            items=items,
        )

    def _scalar(self, apath: str, epath: str, cfg: AnyFieldConfig | None) -> ScalarNode:
        """Build a leaf node for a non-container value."""
        return ScalarNode(**self._node_kwargs(ScalarNode, apath, epath, cfg))

    @staticmethod
    def _threshold(cfg: AnyFieldConfig | None) -> float:
        """``cfg.threshold`` as a float, or ``1.0`` when it is absent or ``None``."""
        threshold = getattr(cfg, "threshold", None)
        return float(threshold) if threshold is not None else 1.0
|
|
@@ -0,0 +1,5 @@
|
|
|
1
|
+
from structured_eval.formats.base import ParseError, Parser
|
|
2
|
+
from structured_eval.formats.json_parser import JsonlParser, JsonParser
|
|
3
|
+
from structured_eval.formats.yaml_parser import YamlParser
|
|
4
|
+
|
|
5
|
+
__all__ = ["JsonParser", "JsonlParser", "ParseError", "Parser", "YamlParser"]
|
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from typing import Any, Protocol, runtime_checkable
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
class ParseError(ValueError):
    """Signal that input text could not be parsed into a structured value.

    Derives from ``ValueError``, so handlers written for ``ValueError`` also
    catch it.
    """
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
@runtime_checkable
class Parser(Protocol):
    """Structural interface for objects that turn a text string into a Python object.

    Implementations are required to raise ``ParseError`` when the input is
    malformed. The return annotation is ``Any`` because the result may be a
    dict, a list, or a scalar depending on the input (e.g. JSONL returns an
    iterator of dicts).
    """

    def parse(self, text: str) -> Any:
        """Parse ``text`` and return the resulting Python object."""
|
|
@@ -0,0 +1,44 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import json
|
|
4
|
+
from typing import TYPE_CHECKING
|
|
5
|
+
|
|
6
|
+
from structured_eval.formats.base import ParseError
|
|
7
|
+
|
|
8
|
+
if TYPE_CHECKING:
|
|
9
|
+
from collections.abc import Iterator
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
class JsonParser:
    """Parse a JSON string into a Python object.

    Accepts any valid JSON value (object, array, string, number, bool, null).
    Raises ParseError on malformed input, including input nested so deeply
    that the decoder exhausts the recursion limit.
    """

    def parse(self, text: str) -> object:
        """Decode ``text`` with ``json.loads``.

        Raises:
            ParseError: if ``text`` is not valid JSON or is too deeply nested.
        """
        try:
            return json.loads(text)
        except json.JSONDecodeError as exc:
            raise ParseError(f"Invalid JSON: {exc}") from exc
        except RecursionError as exc:
            # Pathologically nested documents blow the decoder's recursion
            # budget; report that as a parse failure instead of letting it
            # escape callers that only handle ParseError.
            raise ParseError("Invalid JSON: document is nested too deeply") from exc
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
class JsonlParser:
    """Parse a JSONL (JSON Lines) string into an iterator of Python objects.

    Each non-empty line must be a valid JSON value. Blank lines are skipped.
    Raises ParseError on the first malformed line, including the line number.
    Parsing is lazy: the error surfaces when the iterator reaches the bad line,
    not when ``parse`` is called.
    """

    def parse(self, text: str) -> Iterator[object]:
        """Return a lazy iterator over the values decoded from ``text``."""
        return self._iter(text)

    def _iter(self, text: str) -> Iterator[object]:
        """Yield one decoded value per non-blank line (line numbers are 1-based)."""
        for lineno, raw_line in enumerate(text.splitlines(), start=1):
            line = raw_line.strip()
            if not line:
                continue
            try:
                yield json.loads(line)
            except json.JSONDecodeError as exc:
                raise ParseError(f"Invalid JSON on line {lineno}: {exc}") from exc
            except RecursionError as exc:
                # A pathologically nested line exhausts the decoder's recursion
                # budget; surface it as a parse failure like any other bad line.
                raise ParseError(
                    f"Invalid JSON on line {lineno}: value is nested too deeply"
                ) from exc
|
|
@@ -0,0 +1,24 @@
|
|
|
1
|
+
from typing import Any
|
|
2
|
+
|
|
3
|
+
from structured_eval.formats.base import ParseError
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
class YamlParser:
    """Parse a YAML string into a Python object.

    Uses yaml.safe_load — arbitrary Python object construction is disabled.
    Raises ParseError on malformed input. PyYAML is imported lazily so the
    core package stays importable without the ``yaml`` extra.
    """

    def parse(self, text: str) -> Any:
        """Decode ``text`` with ``yaml.safe_load``.

        Raises:
            ImportError: if PyYAML is not installed.
            ParseError: if ``text`` cannot be loaded as YAML.
        """
        try:
            import yaml
        except ImportError as exc:
            raise ImportError(
                "PyYAML is required for YAML parsing. Install it with: pip install pyyaml"
            ) from exc
        try:
            return yaml.safe_load(text)
        except (yaml.YAMLError, ValueError, RecursionError) as exc:
            # Besides YAMLError, the safe constructors can raise plain
            # ValueError (e.g. a date-shaped scalar with an impossible day) and
            # deep nesting can raise RecursionError; all of these mean the text
            # is not usable YAML, so they are reported uniformly.
            raise ParseError(f"Invalid YAML: {exc}") from exc
|
|
@@ -0,0 +1,11 @@
|
|
|
1
|
+
"""Adapters that plug structured-eval into host eval frameworks.
|
|
2
|
+
|
|
3
|
+
The core (``evaluate`` → ``EvalReport``) is framework-agnostic; each adapter
|
|
4
|
+
lazily imports its host library so ``import structured_eval`` never requires
|
|
5
|
+
deepeval/langsmith. Install with the matching extra (``structured-eval[deepeval]``
|
|
6
|
+
or ``[langsmith]``).
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
from structured_eval.integrations._adapter import reason_text, verdict
|
|
10
|
+
|
|
11
|
+
__all__ = ["reason_text", "verdict"]
|
|
@@ -0,0 +1,47 @@
|
|
|
1
|
+
"""Framework-agnostic mapping from an EvalReport to a (score, success, reason).
|
|
2
|
+
|
|
3
|
+
Shared by every integration so the host-specific classes stay thin. Tested
|
|
4
|
+
directly, without any host library installed.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
from __future__ import annotations
|
|
8
|
+
|
|
9
|
+
from structured_eval.models.result import EvalReport, NodeType
|
|
10
|
+
|
|
11
|
+
# Maximum number of failed fields spelled out individually in ``reason_text``;
# any remainder is summarised as "... +N more".
_MAX_REASONS = 5
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
def reason_text(report: EvalReport) -> str:
    """Explain a report in plain text, concentrating on the failures.

    A parse error is reported on its own; a report with no failed fields yields
    ``"all fields passed"``; otherwise the first ``_MAX_REASONS`` failed fields
    are listed, followed by a count of any that were left out.
    """
    if report.parse_error:
        detail = report.parse_error_message or "could not parse output"
        return f"parse error: {detail}"

    failed = report.failed_fields()
    if not failed:
        return "all fields passed"

    def describe(fs) -> str:
        # Scalars show the mismatching values; containers show their score when
        # one exists, else just the path.
        if fs.node_type == NodeType.SCALAR:
            return f"{fs.path}: {fs.actual!r} != {fs.expected!r}"
        if fs.score is not None:
            return f"{fs.path}: score {fs.score:.2g}"
        return fs.path

    total = len(failed)
    shown = list(failed.values())[:_MAX_REASONS]
    parts = [describe(fs) for fs in shown]
    if total > _MAX_REASONS:
        parts.append(f"... +{total - _MAX_REASONS} more")

    return f"{total} field(s) failed: " + "; ".join(parts)
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
def verdict(report: EvalReport, threshold: float) -> tuple[float | None, bool, str]:
    """Collapse a report into ``(score, success, reason)`` for a host framework.

    ``score`` is ``report.score`` (the key-metric value, ``None`` when there is
    no key metric / no ground truth). ``success`` needs a document that parsed
    and a ``score`` at or above ``threshold``; with a ``None`` score the bar
    cannot be applied, so the result is ``False``. ``reason`` comes from
    ``reason_text``.
    """
    score = report.score
    if report.parse_error or score is None:
        success = False
    else:
        success = score >= threshold
    return score, success, reason_text(report)
|