structured-eval 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (94) hide show
  1. structured_eval/__init__.py +27 -0
  2. structured_eval/alignment/__init__.py +15 -0
  3. structured_eval/alignment/base.py +40 -0
  4. structured_eval/alignment/by_index.py +24 -0
  5. structured_eval/alignment/by_key.py +73 -0
  6. structured_eval/alignment/factory.py +28 -0
  7. structured_eval/alignment/hungarian.py +156 -0
  8. structured_eval/api.py +79 -0
  9. structured_eval/engine/__init__.py +15 -0
  10. structured_eval/engine/aggregator.py +96 -0
  11. structured_eval/engine/evaluator.py +72 -0
  12. structured_eval/engine/metric_runner.py +69 -0
  13. structured_eval/engine/parser.py +42 -0
  14. structured_eval/engine/report_builder.py +68 -0
  15. structured_eval/engine/tree_builder.py +319 -0
  16. structured_eval/formats/__init__.py +5 -0
  17. structured_eval/formats/base.py +19 -0
  18. structured_eval/formats/json_parser.py +44 -0
  19. structured_eval/formats/yaml_parser.py +24 -0
  20. structured_eval/integrations/__init__.py +11 -0
  21. structured_eval/integrations/_adapter.py +47 -0
  22. structured_eval/integrations/deepeval.py +74 -0
  23. structured_eval/integrations/langsmith.py +90 -0
  24. structured_eval/metrics/__init__.py +101 -0
  25. structured_eval/metrics/array_accuracy.py +28 -0
  26. structured_eval/metrics/array_cardinality.py +27 -0
  27. structured_eval/metrics/array_exact_match.py +48 -0
  28. structured_eval/metrics/array_f1.py +34 -0
  29. structured_eval/metrics/array_jaccard_similarity.py +60 -0
  30. structured_eval/metrics/array_precision.py +34 -0
  31. structured_eval/metrics/array_prf1.py +40 -0
  32. structured_eval/metrics/array_recall.py +33 -0
  33. structured_eval/metrics/base.py +144 -0
  34. structured_eval/metrics/character_f1.py +50 -0
  35. structured_eval/metrics/composite_score.py +46 -0
  36. structured_eval/metrics/coverage_leaf_score.py +29 -0
  37. structured_eval/metrics/date_distance_score.py +63 -0
  38. structured_eval/metrics/exact.py +21 -0
  39. structured_eval/metrics/exponential_numeric_score.py +47 -0
  40. structured_eval/metrics/field_faithfulness.py +38 -0
  41. structured_eval/metrics/fuzzy.py +64 -0
  42. structured_eval/metrics/invoker.py +90 -0
  43. structured_eval/metrics/levenshtein.py +16 -0
  44. structured_eval/metrics/mean_score.py +31 -0
  45. structured_eval/metrics/numeric.py +83 -0
  46. structured_eval/metrics/numeric_closeness.py +35 -0
  47. structured_eval/metrics/object_accuracy.py +47 -0
  48. structured_eval/metrics/object_exact_match.py +41 -0
  49. structured_eval/metrics/object_f1.py +47 -0
  50. structured_eval/metrics/object_precision.py +49 -0
  51. structured_eval/metrics/object_prf1.py +51 -0
  52. structured_eval/metrics/object_recall.py +44 -0
  53. structured_eval/metrics/object_type_validity.py +34 -0
  54. structured_eval/metrics/overall_leaf_score.py +32 -0
  55. structured_eval/metrics/presence.py +22 -0
  56. structured_eval/metrics/regex_match.py +51 -0
  57. structured_eval/metrics/rule_pass_rate/__init__.py +5 -0
  58. structured_eval/metrics/rule_pass_rate/dsl.py +224 -0
  59. structured_eval/metrics/rule_pass_rate/engine.py +24 -0
  60. structured_eval/metrics/rule_pass_rate/metric.py +33 -0
  61. structured_eval/metrics/schema_validity/__init__.py +7 -0
  62. structured_eval/metrics/schema_validity/metric.py +35 -0
  63. structured_eval/metrics/schema_validity/validator.py +119 -0
  64. structured_eval/metrics/structural_similarity.py +40 -0
  65. structured_eval/metrics/token_f1.py +44 -0
  66. structured_eval/metrics/type_match.py +35 -0
  67. structured_eval/metrics/utils/__init__.py +10 -0
  68. structured_eval/metrics/utils/array.py +31 -0
  69. structured_eval/metrics/utils/calculate.py +72 -0
  70. structured_eval/metrics/utils/number.py +46 -0
  71. structured_eval/metrics/utils/object_utils.py +87 -0
  72. structured_eval/models/__init__.py +72 -0
  73. structured_eval/models/config.py +124 -0
  74. structured_eval/models/context.py +25 -0
  75. structured_eval/models/metric_result.py +121 -0
  76. structured_eval/models/nodes/__init__.py +13 -0
  77. structured_eval/models/nodes/array_node.py +32 -0
  78. structured_eval/models/nodes/base.py +113 -0
  79. structured_eval/models/nodes/object_node.py +19 -0
  80. structured_eval/models/nodes/scalar.py +14 -0
  81. structured_eval/models/result.py +361 -0
  82. structured_eval/models/sample.py +19 -0
  83. structured_eval/py.typed +0 -0
  84. structured_eval/reporting/__init__.py +5 -0
  85. structured_eval/reporting/console.py +194 -0
  86. structured_eval/utils/__init__.py +16 -0
  87. structured_eval/utils/flatten.py +66 -0
  88. structured_eval/utils/paths.py +58 -0
  89. structured_eval/utils/structured_diff.py +159 -0
  90. structured_eval-0.1.0.dist-info/METADATA +322 -0
  91. structured_eval-0.1.0.dist-info/RECORD +94 -0
  92. structured_eval-0.1.0.dist-info/WHEEL +5 -0
  93. structured_eval-0.1.0.dist-info/licenses/LICENSE +201 -0
  94. structured_eval-0.1.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,69 @@
1
+ from __future__ import annotations
2
+
3
+ from typing import TYPE_CHECKING, Any
4
+
5
+ from structured_eval.metrics.invoker import MetricInvoker
6
+ from structured_eval.models.metric_result import MetricResult
7
+
8
+ if TYPE_CHECKING:
9
+ from structured_eval.metrics.base import BaseMetric, MetricOutput
10
+ from structured_eval.models.nodes.base import EvalNode
11
+
12
+
13
class MetricRunner:
    """Phase 2: evaluate the metrics attached to every node, in place.

    ``TreeBuilder`` has already attached a metric list to each node. This
    phase traverses the tree **post-order** — a node's children are fully
    computed before the node itself — so an aggregating parent can read its
    children's representative scores at any nesting depth.

    Inside a single node the ``key_metric`` is always applied *last*, because
    it is the node's representative score and may depend on the node's other
    results (the default ``MeanScore`` averages them). A metric whose output
    is ``None`` contributes no result.
    """

    def run(self, root: EvalNode) -> None:
        """Compute metrics for ``root`` and its whole subtree."""
        self._visit(root)

    def _visit(self, node: EvalNode) -> None:
        """Process ``node``'s children first, then its own metrics."""
        for child in node.children_nodes():
            self._visit(child)

        representative = node.key_metric
        for metric in node.metrics:
            # The key metric is deferred so it can see every other result.
            if metric is not representative:
                self._apply(metric, node)
        if representative is not None:
            self._apply(representative, node)

    def _apply(self, metric: BaseMetric, node: EvalNode) -> None:
        """Invoke ``metric`` on ``node`` and merge its output into the node."""
        output = MetricInvoker(metric).on_node(node)
        node.metric_results.update(self._normalize(metric.name, output))

    @staticmethod
    def _normalize(name: str, result: MetricOutput) -> dict[str, MetricResult]:
        """Turn whatever a metric returned into ``{key: MetricResult}``.

        Supported shapes: ``None`` (no result), a bare value, a ``dict`` of
        sub-scores, a ``MetricResult``, or a ``(value | dict, extra)`` tuple.
        The ``extra`` of a tuple is merged onto every key it yields, taking
        precedence over any ``extra`` already carried by the value.
        """
        if result is None:
            return {}

        if isinstance(result, tuple):
            value, extra = result
        else:
            value, extra = result, {}

        if isinstance(value, dict):
            normalized: dict[str, MetricResult] = {}
            for key, sub_score in value.items():
                merged = {**getattr(sub_score, "extra", {}), **extra}
                normalized[key] = MetricResult(sub_score, merged)
            return normalized

        if not isinstance(value, MetricResult):
            return {name: MetricResult(value, extra)}

        if extra:
            return {name: MetricResult(value, {**value.extra, **extra})}
        # Nothing to merge: hand back the very same MetricResult instance.
        return {name: value}
@@ -0,0 +1,42 @@
1
+ from __future__ import annotations
2
+
3
+ from typing import Any
4
+
5
+ from structured_eval.formats.base import ParseError
6
+ from structured_eval.formats.json_parser import JsonParser
7
+
8
+
9
class Parser:
    """Turn raw sample input into Python values, reporting errors as data.

    Non-string input (dict/list/None/scalars) is returned untouched. A string
    is first decoded as JSON; when that fails, YAML is attempted and its
    result is used only if it is a dict or a list. ``parse`` yields
    ``(value, None)`` on success and ``(None, message)`` on failure, letting
    the engine record the parse error in the ``EvalReport``.
    """

    def __init__(self) -> None:
        self._json = JsonParser()

    def parse(self, raw: Any) -> tuple[Any, str | None]:
        """Return ``(value, None)`` or ``(None, error_message)`` for ``raw``."""
        if isinstance(raw, str):
            try:
                decoded = self._json.parse(raw)
            except ParseError as json_error:
                fallback = self._try_yaml(raw)
                if fallback is None:
                    # The JSON message is the one reported to the caller.
                    return None, str(json_error)
                return fallback, None
            return decoded, None
        return raw, None

    @staticmethod
    def _try_yaml(text: str) -> Any | None:
        """Decode ``text`` as YAML; give back a dict/list, else ``None``."""
        from structured_eval.formats.yaml_parser import YamlParser

        try:
            candidate = YamlParser().parse(text)
        except (ParseError, ImportError):
            return None
        if isinstance(candidate, (dict, list)):
            return candidate
        return None
@@ -0,0 +1,68 @@
1
+ from __future__ import annotations
2
+
3
+ from typing import TYPE_CHECKING, ClassVar
4
+
5
+ from structured_eval.models.metric_result import MetricCollection
6
+ from structured_eval.models.nodes.array_node import ArrayNode
7
+ from structured_eval.models.nodes.object_node import ObjectNode
8
+ from structured_eval.models.nodes.scalar import ScalarNode
9
+ from structured_eval.models.result import EvalReport, EvalWarning, FieldScore, NodeType
10
+
11
+ if TYPE_CHECKING:
12
+ from structured_eval.models.context import EvalContext
13
+ from structured_eval.models.nodes.base import EvalNode
14
+
15
+
16
class ReportBuilder:
    """Phase 3: flatten the computed node tree into an ``EvalReport``."""

    # Exact node class → reported node type (anything else reports SCALAR).
    _NODE_TYPE: ClassVar[dict[type, NodeType]] = {
        ScalarNode: NodeType.SCALAR,
        ObjectNode: NodeType.OBJECT,
        ArrayNode: NodeType.ARRAY,
    }

    def build(
        self, root: EvalNode, context: EvalContext, warnings: list[EvalWarning]
    ) -> EvalReport:
        """Walk ``root`` and assemble the per-field and per-metric views."""
        field_scores = {}
        array_matches = {}
        # ``collections`` is the cross-field view behind ``report.metrics``:
        # metric name → the value it produced at each path. Any structured
        # detail a metric attached travels on each value's ``.extra``.
        collections: dict[str, MetricCollection] = {}

        for node in root.walk():
            path = node.path
            field_scores[path] = self._field_score(node)
            for metric_name, value in node.metric_results.items():
                collections.setdefault(
                    metric_name, MetricCollection(name=metric_name)
                ).by_path[node.path] = value
            if isinstance(node, ArrayNode) and node.match_result is not None:
                array_matches[node.path] = node.match_result

        # Headline score: the root node's representative (key) metric.
        score_label = None
        if root.key_metric is not None:
            score_label = root.key_metric.name
        score = None
        if score_label is not None:
            root_score = root.metric_results.get(score_label)
            if root_score is not None:
                score = float(root_score)

        return EvalReport(
            score=score,
            score_label=score_label,
            metrics=collections,
            field_scores=field_scores,
            array_matches=array_matches,
            warnings=warnings,
        )

    def _field_score(self, node: EvalNode) -> FieldScore:
        """Snapshot one node as a ``FieldScore``."""
        return FieldScore(
            path=node.path,
            node_type=self._NODE_TYPE.get(type(node), NodeType.SCALAR),
            actual=node.actual,
            expected=node.expected,
            metrics=dict(node.metric_results),
            # ``representative`` is the node's key-metric value.
            score=node.representative,
            threshold=node.threshold,
        )
@@ -0,0 +1,319 @@
1
+ from __future__ import annotations
2
+
3
+ from typing import TYPE_CHECKING, Any
4
+
5
+ from structured_eval.alignment import make_aligner
6
+ from structured_eval.metrics.array_accuracy import ArrayAccuracy
7
+ from structured_eval.metrics.base import (
8
+ AnyNodeMetric,
9
+ ArrayMetric,
10
+ BaseMetric,
11
+ FieldMetric,
12
+ GenericMetric,
13
+ Metric,
14
+ ObjectMetric,
15
+ RootMetric,
16
+ resolve_metric,
17
+ )
18
+ from structured_eval.metrics.exact import ExactMatch
19
+ from structured_eval.metrics.invoker import GENERIC_NODE_METHOD
20
+ from structured_eval.metrics.mean_score import MeanScore
21
+ from structured_eval.metrics.object_accuracy import ObjectAccuracy
22
+ from structured_eval.models.config import (
23
+ AnyFieldConfig,
24
+ ArrayFieldConfig,
25
+ ArrayStrategy,
26
+ EvalConfig,
27
+ ExtraKeysPolicy,
28
+ ObjectFieldConfig,
29
+ weight_of,
30
+ )
31
+ from structured_eval.models.nodes.array_node import ArrayNode
32
+ from structured_eval.models.nodes.base import MISSING, EvalNode, navigate
33
+ from structured_eval.models.nodes.object_node import ObjectNode
34
+ from structured_eval.models.nodes.scalar import ScalarNode
35
+ from structured_eval.models.result import EvalWarning, WarningType
36
+
37
+ if TYPE_CHECKING:
38
+ from structured_eval.models.context import EvalContext
39
+
40
# Per-type fallback metric, used when no configured metric applies to a node.
# This guarantees every node has at least one metric for its key_metric
# (MeanScore by default) to summarise.
DEFAULT_SCALAR_METRIC = ExactMatch
DEFAULT_OBJECT_METRIC = ObjectAccuracy
DEFAULT_ARRAY_METRIC = ArrayAccuracy
45
+
46
+
47
+ class TreeBuilder:
48
+ """Phase 1: build the EvalNode tree and resolve each node's metric list.
49
+
50
+ ``build`` returns ``(root_node, warnings)``. This phase is purely
51
+ structural: it shapes the tree, resolves which metrics apply to each node
52
+ (cascading the config's global metrics by type and adding any per-node
53
+ ``cfg.metrics``), and attaches them to ``node.metrics``. Computation happens
54
+ later, uniformly, in ``MetricRunner``. Each node carries an ``actual``-side
55
+ ``path`` and, when arrays reorder elements, a diverging ``expected_path`` so
56
+ each side navigates its own index.
57
+ """
58
+
59
+ def __init__(self, context: EvalContext):
60
+ self.context = context
61
+ self.config: EvalConfig = context.config
62
+ self.warnings: list[EvalWarning] = []
63
+ self._globals = self._resolve_globals()
64
+
65
+ def build(self) -> tuple[EvalNode, list[EvalWarning]]:
66
+ root = self.node("$", "$", self.root_config())
67
+ return root, self.warnings
68
+
69
+ # ── config resolution ──────────────────────────────────────────────────
70
+
71
+ def root_config(self) -> ObjectFieldConfig | ArrayFieldConfig | None:
72
+ if self.config.root is not None:
73
+ return self.config.root
74
+ if self.config.fields:
75
+ return ObjectFieldConfig(fields=dict(self.config.fields))
76
+ return None
77
+
78
+ @staticmethod
79
+ def _applies_to(metric: BaseMetric, node_cls: type, is_root: bool) -> bool:
80
+ """Whether ``metric`` should be resolved onto a node of ``node_cls``.
81
+
82
+ Typed metrics match their node type (a ``RootMetric`` only at the root);
83
+ an ``AnyNodeMetric`` matches every node; a ``GenericMetric`` matches iff
84
+ it defines the node's ``compute_<kind>``.
85
+ """
86
+ if isinstance(metric, RootMetric):
87
+ return is_root
88
+ if isinstance(metric, AnyNodeMetric):
89
+ return True
90
+ if isinstance(metric, FieldMetric):
91
+ return issubclass(node_cls, ScalarNode)
92
+ if isinstance(metric, ObjectMetric):
93
+ return issubclass(node_cls, ObjectNode)
94
+ if isinstance(metric, ArrayMetric):
95
+ return issubclass(node_cls, ArrayNode)
96
+ if isinstance(metric, GenericMetric):
97
+ method = GENERIC_NODE_METHOD.get(node_cls)
98
+ return method is not None and hasattr(metric, method)
99
+ return False
100
+
101
+ def _resolve_globals(self) -> list[BaseMetric]:
102
+ """The cascade set: ``config.metrics``, deduped by identity.
103
+
104
+ ``key_metric`` is *not* cascaded here — it is each node's representative
105
+ metric, resolved per node by ``_key_metric`` (and computed last).
106
+ """
107
+ out: list[BaseMetric] = []
108
+ for spec in self.config.metrics:
109
+ metric = resolve_metric(spec)
110
+ if not any(metric is seen for seen in out):
111
+ out.append(metric)
112
+ return out
113
+
114
+ def _resolve_metrics(
115
+ self, node_cls: type, cfg: AnyFieldConfig | None, is_root: bool
116
+ ) -> list[BaseMetric]:
117
+ """Metrics for one node: applicable globals + this node's own (additive).
118
+
119
+ Globals cascade by type (a ``RootMetric`` only at the root); per-node
120
+ ``cfg.metrics`` are *added* (not a replacement), deduped by identity.
121
+ ``out`` only ever holds metrics applicable to this node (``add`` filters
122
+ by ``_applies_to``); if it ends up empty the node gets the default for
123
+ its type so every node always carries at least one metric for its
124
+ ``key_metric`` to summarise (a different default is set by putting a
125
+ metric of that type in ``config.metrics``, which cascades).
126
+ """
127
+ out: list[BaseMetric] = []
128
+
129
+ def add(metric: BaseMetric) -> None:
130
+ if self._applies_to(metric, node_cls, is_root) and not any(
131
+ metric is s for s in out
132
+ ):
133
+ out.append(metric)
134
+
135
+ for metric in self._globals:
136
+ add(metric)
137
+ for spec in getattr(cfg, "metrics", None) or []:
138
+ add(resolve_metric(spec))
139
+
140
+ if not out:
141
+ if issubclass(node_cls, ScalarNode):
142
+ add(DEFAULT_SCALAR_METRIC())
143
+ elif issubclass(node_cls, ObjectNode):
144
+ add(DEFAULT_OBJECT_METRIC())
145
+ elif issubclass(node_cls, ArrayNode):
146
+ add(DEFAULT_ARRAY_METRIC())
147
+ return out
148
+
149
+ def _key_metric(
150
+ self,
151
+ node_cls: type,
152
+ cfg: AnyFieldConfig | None,
153
+ is_root: bool,
154
+ metrics: list[BaseMetric],
155
+ ) -> Metric[Any]:
156
+ """The node's representative metric (computed last).
157
+
158
+ Prefers an explicit ``cfg.key_metric``, then a distributable
159
+ ``config.key_metric`` (each applied only where its type fits), else the
160
+ default ``MeanScore`` (the mean of the node's own metrics).
161
+
162
+ A *name string* is resolved against the node's already-resolved
163
+ ``metrics`` first: an equally-named metric is **reused as-is** (same
164
+ instance, same params, no duplicate computation). It is instantiated
165
+ fresh only when the name is not already on the node.
166
+ """
167
+ for spec in (getattr(cfg, "key_metric", None), self.config.key_metric):
168
+ if spec is None:
169
+ continue
170
+ # Reuse an equally-named metric already on the node; else resolve fresh.
171
+ metric = next(
172
+ (m for m in metrics if isinstance(spec, str) and m.name == spec),
173
+ None,
174
+ ) or resolve_metric(spec)
175
+ if self._applies_to(metric, node_cls, is_root):
176
+ assert isinstance(metric, Metric) # a key metric has compute()/score()
177
+ return metric
178
+ return MeanScore()
179
+
180
+ # ── tree construction ────────────────────────────────────────────────
181
+
182
+ def _value(self, doc: Any, path: str) -> Any:
183
+ value = navigate(doc, path)
184
+ return None if value is MISSING else value
185
+
186
+ def _child(self, path: str, key: str) -> str:
187
+ return key if path in ("$", "") else f"{path}.{key}"
188
+
189
+ def node(self, apath: str, epath: str, cfg: AnyFieldConfig | None) -> EvalNode:
190
+ actual = self._value(self.context.actual, apath)
191
+ expected = self._value(self.context.expected, epath)
192
+ ref = expected if expected is not None else actual
193
+
194
+ if isinstance(ref, dict):
195
+ return self._object(apath, epath, cfg, actual, expected)
196
+ if isinstance(ref, list):
197
+ return self._array(apath, epath, cfg, actual, expected)
198
+ return self._scalar(apath, epath, cfg)
199
+
200
+ def _object(
201
+ self,
202
+ apath: str,
203
+ epath: str,
204
+ cfg: AnyFieldConfig | None,
205
+ actual: Any,
206
+ expected: Any,
207
+ ) -> ObjectNode:
208
+ a_keys = set(actual) if isinstance(actual, dict) else set()
209
+ e_keys = set(expected) if isinstance(expected, dict) else set()
210
+ both = a_keys & e_keys
211
+ missing = sorted(e_keys - a_keys) # in expected only → FN
212
+ extra = sorted(a_keys - e_keys) # in actual only → FP (subject to policy)
213
+
214
+ if self.config.extra_keys == ExtraKeysPolicy.PENALIZE:
215
+ spurious = extra
216
+ else:
217
+ spurious = []
218
+ for key in extra:
219
+ path = self._child(apath, key)
220
+ self.warnings.append(
221
+ EvalWarning(
222
+ type=WarningType.EXTRA_KEY,
223
+ path=path,
224
+ message=f"{path!r} not in expected (ExtraKeysPolicy.IGNORE)",
225
+ )
226
+ )
227
+
228
+ fields = cfg.fields if isinstance(cfg, ObjectFieldConfig) else {}
229
+ children: dict[str, Any] = {}
230
+ matched: list[Any] = []
231
+ for key in sorted(a_keys | e_keys):
232
+ child = self.node(
233
+ self._child(apath, key), self._child(epath, key), fields.get(key)
234
+ )
235
+ children[key] = child
236
+ if key in both:
237
+ matched.append(child)
238
+ for key in missing:
239
+ path = self._child(apath, key)
240
+ self.warnings.append(
241
+ EvalWarning(
242
+ type=WarningType.MISSING_FIELD,
243
+ path=path,
244
+ message=f"{path!r} absent in actual",
245
+ )
246
+ )
247
+
248
+ is_root = apath == "$"
249
+ metrics = self._resolve_metrics(ObjectNode, cfg, is_root)
250
+ return ObjectNode(
251
+ path=apath,
252
+ context=self.context,
253
+ expected_path=epath if epath != apath else None,
254
+ weight=weight_of(cfg),
255
+ metrics=metrics,
256
+ key_metric=self._key_metric(ObjectNode, cfg, is_root, metrics),
257
+ threshold=self._threshold(cfg),
258
+ matched=matched,
259
+ missing=missing,
260
+ spurious=spurious,
261
+ children=children,
262
+ )
263
+
264
+ def _array(
265
+ self,
266
+ apath: str,
267
+ epath: str,
268
+ cfg: AnyFieldConfig | None,
269
+ actual: Any,
270
+ expected: Any,
271
+ ) -> ArrayNode:
272
+ a_list: list[Any] = actual if isinstance(actual, list) else []
273
+ e_list: list[Any] = expected if isinstance(expected, list) else []
274
+ if isinstance(cfg, ArrayFieldConfig):
275
+ aligner = make_aligner(strategy=cfg.strategy, params=cfg.params)
276
+ item_cfg = cfg.item
277
+ else:
278
+ aligner = make_aligner(strategy=ArrayStrategy.BY_INDEX, params=None)
279
+ item_cfg = None
280
+ result = aligner.align(e_list, a_list)
281
+ # TODO: with no expected list (faithfulness / schema-only mode) there are
282
+ # no matched pairs, so array elements get no nodes — value-on-actual
283
+ # metrics (FieldFaithfulness) can't reach them. Materialize actual
284
+ # elements directly in that mode. Roadmap.
285
+ items = [
286
+ self.node(f"{apath}[{aidx}]", f"{epath}[{eidx}]", item_cfg)
287
+ for eidx, aidx in result.matched
288
+ ]
289
+ is_root = apath == "$"
290
+ metrics = self._resolve_metrics(ArrayNode, cfg, is_root)
291
+ return ArrayNode(
292
+ path=apath,
293
+ context=self.context,
294
+ expected_path=epath if epath != apath else None,
295
+ weight=weight_of(cfg),
296
+ metrics=metrics,
297
+ key_metric=self._key_metric(ArrayNode, cfg, is_root, metrics),
298
+ threshold=self._threshold(cfg),
299
+ match_result=result,
300
+ items=items,
301
+ )
302
+
303
+ def _scalar(self, apath: str, epath: str, cfg: AnyFieldConfig | None) -> ScalarNode:
304
+ is_root = apath == "$"
305
+ metrics = self._resolve_metrics(ScalarNode, cfg, is_root)
306
+ return ScalarNode(
307
+ path=apath,
308
+ context=self.context,
309
+ expected_path=epath if epath != apath else None,
310
+ weight=weight_of(cfg),
311
+ metrics=metrics,
312
+ key_metric=self._key_metric(ScalarNode, cfg, is_root, metrics),
313
+ threshold=self._threshold(cfg),
314
+ )
315
+
316
+ @staticmethod
317
+ def _threshold(cfg: AnyFieldConfig | None) -> float:
318
+ threshold = getattr(cfg, "threshold", None)
319
+ return float(threshold) if threshold is not None else 1.0
@@ -0,0 +1,5 @@
1
+ from structured_eval.formats.base import ParseError, Parser
2
+ from structured_eval.formats.json_parser import JsonlParser, JsonParser
3
+ from structured_eval.formats.yaml_parser import YamlParser
4
+
5
+ __all__ = ["JsonParser", "JsonlParser", "ParseError", "Parser", "YamlParser"]
@@ -0,0 +1,19 @@
1
+ from __future__ import annotations
2
+
3
+ from typing import Any, Protocol, runtime_checkable
4
+
5
+
6
class ParseError(ValueError):
    """Signals that input text could not be turned into a structured value."""
8
+
9
+
10
@runtime_checkable
class Parser(Protocol):
    """Structural interface for anything that parses text into a Python object.

    Implementations are expected to raise ParseError on malformed input.
    The result is typed ``Any`` because the shape depends on the format and
    the input: a dict, a list, a scalar, or (for JSONL) an iterator.
    """

    def parse(self, text: str) -> Any:
        ...
@@ -0,0 +1,44 @@
1
+ from __future__ import annotations
2
+
3
+ import json
4
+ from typing import TYPE_CHECKING
5
+
6
+ from structured_eval.formats.base import ParseError
7
+
8
+ if TYPE_CHECKING:
9
+ from collections.abc import Iterator
10
+
11
+
12
class JsonParser:
    """Decode a JSON string into the corresponding Python object.

    Any valid JSON value is accepted (object, array, string, number, bool,
    null). Malformed input raises ParseError.
    """

    def parse(self, text: str) -> object:
        try:
            decoded = json.loads(text)
        except json.JSONDecodeError as exc:
            raise ParseError(f"Invalid JSON: {exc}") from exc
        return decoded
24
+
25
+
26
class JsonlParser:
    """Parse a JSONL (JSON Lines) string into an iterator of Python objects.

    Records are separated by ``"\\n"``; a trailing ``"\\r"`` (CRLF input) is
    removed together with other surrounding whitespace. Each non-blank line
    must be a valid JSON value; blank lines are skipped.

    Parsing is lazy: ``parse`` returns a generator, so ParseError is raised
    while iterating, at the first malformed line, and its message includes
    that line's 1-based number.
    """

    def parse(self, text: str) -> Iterator[object]:
        """Return a lazy iterator over the values decoded from ``text``."""
        return self._iter(text)

    def _iter(self, text: str) -> Iterator[object]:
        """Yield one decoded value per non-blank line of ``text``."""
        # Split on "\n" only. ``str.splitlines()`` also breaks on U+2028,
        # U+2029 and U+0085, which may appear unescaped inside a JSON string,
        # so it would cut a valid record in half and report it as malformed.
        for lineno, raw_line in enumerate(text.split("\n"), start=1):
            line = raw_line.strip()
            if not line:
                continue
            try:
                yield json.loads(line)
            except json.JSONDecodeError as exc:
                raise ParseError(f"Invalid JSON on line {lineno}: {exc}") from exc
@@ -0,0 +1,24 @@
1
+ from typing import Any
2
+
3
+ from structured_eval.formats.base import ParseError
4
+
5
+
6
class YamlParser:
    """Decode a YAML string into a Python object.

    Loading goes through ``yaml.safe_load``, so arbitrary Python object
    construction is disabled. Malformed input raises ParseError. PyYAML is
    imported inside ``parse`` so that the core package can be imported
    without the ``yaml`` extra installed.
    """

    def parse(self, text: str) -> Any:
        try:
            import yaml
        except ImportError as exc:
            raise ImportError(
                "PyYAML is required for YAML parsing. Install it with: pip install pyyaml"
            ) from exc

        try:
            loaded = yaml.safe_load(text)
        except yaml.YAMLError as exc:
            raise ParseError(f"Invalid YAML: {exc}") from exc
        return loaded
@@ -0,0 +1,11 @@
1
+ """Adapters that plug structured-eval into host eval frameworks.
2
+
3
+ The core (``evaluate`` → ``EvalReport``) is framework-agnostic; each adapter
4
+ lazily imports its host library so ``import structured_eval`` never requires
5
+ deepeval/langsmith. Install with the matching extra (``structured-eval[deepeval]``
6
+ or ``[langsmith]``).
7
+ """
8
+
9
+ from structured_eval.integrations._adapter import reason_text, verdict
10
+
11
+ __all__ = ["reason_text", "verdict"]
@@ -0,0 +1,47 @@
1
+ """Framework-agnostic mapping from an EvalReport to a (score, success, reason).
2
+
3
+ Shared by every integration so the host-specific classes stay thin. Tested
4
+ directly, without any host library installed.
5
+ """
6
+
7
+ from __future__ import annotations
8
+
9
+ from structured_eval.models.result import EvalReport, NodeType
10
+
11
# Upper bound on how many failed fields are spelled out in a reason string.
_MAX_REASONS = 5


def reason_text(report: EvalReport) -> str:
    """Explain a report in one line, concentrating on the failures.

    A parse error takes precedence. Otherwise up to ``_MAX_REASONS`` failed
    fields are listed (scalars as ``actual != expected``, containers by
    score when one exists), followed by a count of any that were left out.
    """
    if report.parse_error:
        detail = report.parse_error_message or 'could not parse output'
        return f"parse error: {detail}"

    failed = report.failed_fields()
    if not failed:
        return "all fields passed"

    parts = []
    for fs in list(failed.values())[:_MAX_REASONS]:
        if fs.node_type == NodeType.SCALAR:
            parts.append(f"{fs.path}: {fs.actual!r} != {fs.expected!r}")
        elif fs.score is not None:
            parts.append(f"{fs.path}: score {fs.score:.2g}")
        else:
            parts.append(fs.path)

    omitted = len(failed) - _MAX_REASONS
    if omitted > 0:
        parts.append(f"... +{omitted} more")

    return f"{len(failed)} field(s) failed: " + "; ".join(parts)
36
+
37
+
38
def verdict(report: EvalReport, threshold: float) -> tuple[float | None, bool, str]:
    """Collapse a report into ``(score, success, reason)`` for a host framework.

    ``score`` is the report's key-metric value and may be ``None`` (no key
    metric / no ground truth). ``success`` is true only for a parsed document
    whose ``score`` is at least ``threshold``; with no score the bar cannot be
    applied, so the verdict is ``False``.
    """
    score = report.score
    if report.parse_error or score is None:
        success = False
    else:
        success = score >= threshold
    return score, success, reason_text(report)