structured-eval 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (94) hide show
  1. structured_eval/__init__.py +27 -0
  2. structured_eval/alignment/__init__.py +15 -0
  3. structured_eval/alignment/base.py +40 -0
  4. structured_eval/alignment/by_index.py +24 -0
  5. structured_eval/alignment/by_key.py +73 -0
  6. structured_eval/alignment/factory.py +28 -0
  7. structured_eval/alignment/hungarian.py +156 -0
  8. structured_eval/api.py +79 -0
  9. structured_eval/engine/__init__.py +15 -0
  10. structured_eval/engine/aggregator.py +96 -0
  11. structured_eval/engine/evaluator.py +72 -0
  12. structured_eval/engine/metric_runner.py +69 -0
  13. structured_eval/engine/parser.py +42 -0
  14. structured_eval/engine/report_builder.py +68 -0
  15. structured_eval/engine/tree_builder.py +319 -0
  16. structured_eval/formats/__init__.py +5 -0
  17. structured_eval/formats/base.py +19 -0
  18. structured_eval/formats/json_parser.py +44 -0
  19. structured_eval/formats/yaml_parser.py +24 -0
  20. structured_eval/integrations/__init__.py +11 -0
  21. structured_eval/integrations/_adapter.py +47 -0
  22. structured_eval/integrations/deepeval.py +74 -0
  23. structured_eval/integrations/langsmith.py +90 -0
  24. structured_eval/metrics/__init__.py +101 -0
  25. structured_eval/metrics/array_accuracy.py +28 -0
  26. structured_eval/metrics/array_cardinality.py +27 -0
  27. structured_eval/metrics/array_exact_match.py +48 -0
  28. structured_eval/metrics/array_f1.py +34 -0
  29. structured_eval/metrics/array_jaccard_similarity.py +60 -0
  30. structured_eval/metrics/array_precision.py +34 -0
  31. structured_eval/metrics/array_prf1.py +40 -0
  32. structured_eval/metrics/array_recall.py +33 -0
  33. structured_eval/metrics/base.py +144 -0
  34. structured_eval/metrics/character_f1.py +50 -0
  35. structured_eval/metrics/composite_score.py +46 -0
  36. structured_eval/metrics/coverage_leaf_score.py +29 -0
  37. structured_eval/metrics/date_distance_score.py +63 -0
  38. structured_eval/metrics/exact.py +21 -0
  39. structured_eval/metrics/exponential_numeric_score.py +47 -0
  40. structured_eval/metrics/field_faithfulness.py +38 -0
  41. structured_eval/metrics/fuzzy.py +64 -0
  42. structured_eval/metrics/invoker.py +90 -0
  43. structured_eval/metrics/levenshtein.py +16 -0
  44. structured_eval/metrics/mean_score.py +31 -0
  45. structured_eval/metrics/numeric.py +83 -0
  46. structured_eval/metrics/numeric_closeness.py +35 -0
  47. structured_eval/metrics/object_accuracy.py +47 -0
  48. structured_eval/metrics/object_exact_match.py +41 -0
  49. structured_eval/metrics/object_f1.py +47 -0
  50. structured_eval/metrics/object_precision.py +49 -0
  51. structured_eval/metrics/object_prf1.py +51 -0
  52. structured_eval/metrics/object_recall.py +44 -0
  53. structured_eval/metrics/object_type_validity.py +34 -0
  54. structured_eval/metrics/overall_leaf_score.py +32 -0
  55. structured_eval/metrics/presence.py +22 -0
  56. structured_eval/metrics/regex_match.py +51 -0
  57. structured_eval/metrics/rule_pass_rate/__init__.py +5 -0
  58. structured_eval/metrics/rule_pass_rate/dsl.py +224 -0
  59. structured_eval/metrics/rule_pass_rate/engine.py +24 -0
  60. structured_eval/metrics/rule_pass_rate/metric.py +33 -0
  61. structured_eval/metrics/schema_validity/__init__.py +7 -0
  62. structured_eval/metrics/schema_validity/metric.py +35 -0
  63. structured_eval/metrics/schema_validity/validator.py +119 -0
  64. structured_eval/metrics/structural_similarity.py +40 -0
  65. structured_eval/metrics/token_f1.py +44 -0
  66. structured_eval/metrics/type_match.py +35 -0
  67. structured_eval/metrics/utils/__init__.py +10 -0
  68. structured_eval/metrics/utils/array.py +31 -0
  69. structured_eval/metrics/utils/calculate.py +72 -0
  70. structured_eval/metrics/utils/number.py +46 -0
  71. structured_eval/metrics/utils/object_utils.py +87 -0
  72. structured_eval/models/__init__.py +72 -0
  73. structured_eval/models/config.py +124 -0
  74. structured_eval/models/context.py +25 -0
  75. structured_eval/models/metric_result.py +121 -0
  76. structured_eval/models/nodes/__init__.py +13 -0
  77. structured_eval/models/nodes/array_node.py +32 -0
  78. structured_eval/models/nodes/base.py +113 -0
  79. structured_eval/models/nodes/object_node.py +19 -0
  80. structured_eval/models/nodes/scalar.py +14 -0
  81. structured_eval/models/result.py +361 -0
  82. structured_eval/models/sample.py +19 -0
  83. structured_eval/py.typed +0 -0
  84. structured_eval/reporting/__init__.py +5 -0
  85. structured_eval/reporting/console.py +194 -0
  86. structured_eval/utils/__init__.py +16 -0
  87. structured_eval/utils/flatten.py +66 -0
  88. structured_eval/utils/paths.py +58 -0
  89. structured_eval/utils/structured_diff.py +159 -0
  90. structured_eval-0.1.0.dist-info/METADATA +322 -0
  91. structured_eval-0.1.0.dist-info/RECORD +94 -0
  92. structured_eval-0.1.0.dist-info/WHEEL +5 -0
  93. structured_eval-0.1.0.dist-info/licenses/LICENSE +201 -0
  94. structured_eval-0.1.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,361 @@
1
+ from __future__ import annotations
2
+
3
+ from enum import StrEnum
4
+ from pathlib import Path
5
+ from statistics import mean
6
+ from typing import Any
7
+
8
+ from pydantic import BaseModel, ConfigDict, Field
9
+
10
+ from structured_eval.models.metric_result import ( # noqa: TC001
11
+ MetricCollection,
12
+ MetricResult,
13
+ )
14
+ from structured_eval.models.nodes.array_node import ArrayMatchResult # noqa: TC001
15
+
16
+
17
class NodeType(StrEnum):
    """The kind of tree node a ``FieldScore`` describes.

    Being a ``StrEnum``, each member compares equal to (and serializes as)
    its lowercase string value.
    """

    SCALAR = "scalar"
    OBJECT = "object"
    ARRAY = "array"
23
+
24
+
25
+ def _percentile(values: list[float], q: float) -> float:
26
+ """Linear-interpolation percentile (q in [0, 1]) over a non-empty list."""
27
+ ordered = sorted(values)
28
+ if len(ordered) == 1:
29
+ return ordered[0]
30
+ pos = q * (len(ordered) - 1)
31
+ lo = int(pos)
32
+ hi = min(lo + 1, len(ordered) - 1)
33
+ frac = pos - lo
34
+ return ordered[lo] + (ordered[hi] - ordered[lo]) * frac
35
+
36
+
37
+ # ── Warnings ────────────────────────────────────────────────────────────────
38
+
39
+
40
class WarningType(StrEnum):
    """The kind of structural warning the engine raised while building the tree.

    Members are string-valued, so they serialize as their lowercase value.
    """

    EXTRA_KEY = (
        "extra_key"  # key present in actual but not expected (ExtraKeysPolicy.IGNORE)
    )
    MISSING_FIELD = "missing_field"  # key present in expected but absent in actual
47
+
48
+
49
class EvalWarning(BaseModel):
    """One structural warning: its ``WarningType``, the ``path`` it concerns,
    and an optional human-readable ``message``."""

    type: WarningType
    path: str
    message: str = ""

    def __str__(self) -> str:
        """Format as ``[TYPE_NAME] detail``.

        The detail is the message when one is set, otherwise the path.
        """
        label = f"[{self.type.name}]"
        detail = self.message if self.message else self.path
        return f"{label} {detail}"
59
+
60
+
61
+ # ── Rules ─────────────────────────────────────────────────────────────────────
62
+
63
+
64
class RuleResult(BaseModel):
    """Result of evaluating a single business rule."""

    # Identifier of the rule that was evaluated.
    name: str
    # Whether the rule held.
    passed: bool
    # Optional explanation; defaults to the empty string.
    message: str = ""
70
+
71
+
72
+ # ── Per-path score ────────────────────────────────────────────────────────────
73
+
74
+
75
class FieldScore(BaseModel):
    """Evaluation result for one node of the tree (flat, dot-notation path).

    ``metrics`` holds only the metrics that were requested and applied to this
    node (e.g. ``{"exact_match": 0.0, "token_f1": 0.62}``). Each value is a
    ``MetricResult`` — a ``float`` that also carries ``.extra`` (structured
    detail the metric chose to surface). ``score`` is the value of the key metric
    at this path, ``threshold`` the bar applied to it.
    """

    # MetricResult is not a pydantic-native type, hence arbitrary types are allowed.
    model_config = ConfigDict(arbitrary_types_allowed=True)

    path: str
    node_type: NodeType
    # Raw values found at this path in the actual / expected documents.
    actual: Any = None
    expected: Any = None
    metrics: dict[str, MetricResult] = Field(default_factory=dict)
    # TODO: Should be required by default key metric
    # None means no key metric was applied at this node.
    score: float | None = None
    # TODO: Reconsider default arguments - all possible should be defined in model as defaults
    # None means no per-field bar was recorded; consumers pick their own fallback.
    threshold: float | None = None
96
+
97
+
98
+ # ── Regression diff ─────────────────────────────────────────────────────────
99
+
100
+
101
class RegressionDiff(BaseModel):
    """Metric deltas between two EvalReports (self minus other).

    ``deltas`` are per-metric changes in the aggregate; ``field_deltas`` maps
    each field path to its own per-metric changes. Positive means improvement.
    """

    # metric name -> change in the document-level value
    deltas: dict[str, float] = Field(default_factory=dict)
    # field path -> {metric name -> change at that path}
    field_deltas: dict[str, dict[str, float]] = Field(default_factory=dict)
110
+
111
+
112
+ # ── Eval report ───────────────────────────────────────────────────────────────
113
+
114
+
115
class EvalReport(BaseModel):
    """Full evaluation result for a single document.

    ``metrics`` maps each metric name to a ``MetricCollection`` — its value at
    every node that produced it, plus that metric's structured detail (schema
    errors, hallucinated paths, per-rule outcomes, …) on each value's ``.extra``.
    ``field_scores`` is a flat map of every tree node keyed by its path. On a
    parse error, ``parse_error`` is True and the metrics are left empty.
    """

    model_config = ConfigDict(arbitrary_types_allowed=True)

    score: float | None = None
    score_label: str | None = None
    metrics: dict[str, MetricCollection] = Field(default_factory=dict)
    field_scores: dict[str, FieldScore] = Field(default_factory=dict)
    array_matches: dict[str, ArrayMatchResult] = Field(default_factory=dict)
    parse_error: bool = False
    parse_error_message: str | None = None
    warnings: list[EvalWarning] = Field(default_factory=list)

    # ── Queries ───────────────────────────────────────────────────────────

    def failed_fields(self, threshold: float | None = None) -> dict[str, FieldScore]:
        """Return fields whose score falls below the applicable threshold.

        Keyed by field path (the same keys as ``field_scores``). Precedence per
        field: the ``threshold`` argument, else the field's own ``threshold``,
        else a perfect-match bar of 1.0. Fields without a score (no key metric
        applied) are skipped.
        """
        failed: dict[str, FieldScore] = {}
        for path, fs in self.field_scores.items():
            if fs.score is None:
                continue
            bar = threshold if threshold is not None else fs.threshold
            if bar is None:
                bar = 1.0
            if fs.score < bar:
                failed[path] = fs
        return failed

    # ── Reporting / serialization ─────────────────────────────────────────

    def print_summary(self) -> None:
        """Print a field-level summary table to stdout."""
        # Imported lazily: the reporting module imports this module.
        from structured_eval.reporting import render

        print(render(self))  # noqa: T201

    def to_dict(self) -> dict[str, Any]:
        """Return a JSON-friendly dict of the full report."""
        return self.model_dump(mode="json")

    def to_json(self, path: str | Path) -> None:
        """Serialize the report to a UTF-8 JSON file at ``path``."""
        with Path(path).open("w", encoding="utf-8") as fh:
            fh.write(self.model_dump_json(indent=2))

    @classmethod
    def from_dict(cls, data: dict[str, Any]) -> EvalReport:
        """Reconstruct a report from a dict produced by ``to_dict``."""
        return cls.model_validate(data)

    @classmethod
    def from_json(cls, path: str | Path) -> EvalReport:
        """Load a report from a UTF-8 JSON file at ``path``."""
        with Path(path).open(encoding="utf-8") as fh:
            return cls.model_validate_json(fh.read())

    def diff_from(
        self, other: EvalReport, metrics: list[str] | None = None
    ) -> RegressionDiff:
        """Compute metric deltas relative to ``other`` (self minus other).

        ``deltas`` covers document-level metrics present in both reports (or the
        subset named in ``metrics``); ``field_deltas`` covers per-field metrics
        for paths present in both. Positive means improvement.
        """
        names = metrics if metrics is not None else sorted(self.metrics)
        deltas = {
            name: self.metrics[name].representative()
            - other.metrics[name].representative()
            for name in names
            if name in self.metrics and name in other.metrics
        }

        field_deltas: dict[str, dict[str, float]] = {}
        for path, fs in self.field_scores.items():
            other_fs = other.field_scores.get(path)
            if other_fs is None:
                continue
            per: dict[str, float] = {
                m: fs.metrics[m] - other_fs.metrics[m]
                for m in fs.metrics
                if m in other_fs.metrics
            }
            if fs.score is not None and other_fs.score is not None:
                per["score"] = fs.score - other_fs.score
            # Paths with nothing comparable are omitted rather than mapped to {}.
            if per:
                field_deltas[path] = per

        return RegressionDiff(deltas=deltas, field_deltas=field_deltas)

    # ── Assertions (pytest-style: raise AssertionError, else None) ────────

    def assert_no_parse_errors(self) -> None:
        """Fail if the document could not be parsed."""
        if self.parse_error:
            raise AssertionError(
                f"parse error: {self.parse_error_message or 'could not parse document'}"
            )

    def assert_score(self, min_score: float) -> None:
        """Fail if the key-metric score is below ``min_score``.

        Also fails on a parse error or when no score is available.
        """
        self.assert_no_parse_errors()
        if self.score is None:
            raise AssertionError(
                "no score available (no key metric configured); use assert_metric() instead"
            )
        if self.score < min_score:
            label = self.score_label or "score"
            raise AssertionError(f"{label} {self.score:.4g} < required {min_score:.4g}")

    def assert_field(self, path: str, min_score: float) -> None:
        """Fail if the field at ``path`` scores below ``min_score``.

        Also fails on a parse error, when no field exists at ``path``, or when
        the field has no score.
        """
        # Report the root cause first: an unparsed document would otherwise
        # surface as a misleading "no field at path" failure.
        self.assert_no_parse_errors()
        fs = self.field_scores.get(path)
        if fs is None:
            raise AssertionError(f"no field at path {path!r}")
        if fs.score is None:
            raise AssertionError(f"field {path!r} has no score (no key metric applied)")
        if fs.score < min_score:
            raise AssertionError(
                f"field {path!r} scored {fs.score:.4g} < required {min_score:.4g} "
                f"(actual={fs.actual!r}, expected={fs.expected!r})"
            )

    def assert_metric(self, metric_name: str, min_value: float) -> None:
        """Fail if metric ``metric_name`` is missing or below ``min_value``.

        Compares the metric's document-level value (the root, else its mean
        across the tree). Also fails on a parse error.
        """
        # Metrics are left empty on a parse error; say so instead of reporting
        # "not computed (available: none)".
        self.assert_no_parse_errors()
        if metric_name not in self.metrics:
            available = ", ".join(sorted(self.metrics)) or "none"
            raise AssertionError(
                f"metric {metric_name!r} not computed (available: {available})"
            )
        value = self.metrics[metric_name].representative()
        if value < min_value:
            raise AssertionError(
                f"metric {metric_name!r} {value:.4g} < required {min_value:.4g}"
            )

    def assert_schema_valid(self) -> None:
        """Fail if schema validation produced errors.

        Fails on a parse error as well: metrics are empty in that case, so
        without this guard an unparseable document would pass silently.
        Returns without error when no ``schema_validity`` metric was computed
        for a successfully parsed document.
        """
        self.assert_no_parse_errors()
        coll = self.metrics.get("schema_validity")
        if coll is None:
            return
        errors = coll.extra_values("schema_errors")
        if coll.representative() == 0.0 or errors:
            message = "; ".join(errors) or "schema validation failed"
            raise AssertionError(f"schema invalid: {message}")
278
+
279
+
280
+ # ── Batch / consistency reports ───────────────────────────────────────────────
281
+
282
+
283
class BatchEvalReport(BaseModel):
    """Aggregate result over a list of documents (``evaluate(list[Sample])``).

    ``metrics`` is the mean of each document-level metric across successfully
    parsed samples; ``score`` is the mean key-metric score. ``perfect_response_rate``
    is the fraction of samples that parsed and had no failing field;
    ``parse_error_rate`` the fraction that failed to parse.
    """

    per_sample: list[EvalReport] = Field(default_factory=list)
    metrics: dict[str, float] = Field(default_factory=dict)
    score: float | None = None
    score_label: str | None = None
    perfect_response_rate: float = 0.0
    parse_error_rate: float = 0.0

    def field_breakdown(
        self, threshold: float | None = None
    ) -> dict[str, dict[str, float]]:
        """Summarise each field path across the batch.

        For every path that received a score in at least one parsed sample,
        report ``mean``, ``min``, ``max``, ``p95`` and ``fail_rate``. Samples
        with a parse error and nodes without a score are ignored. A field fails
        when its score is below its bar: the ``threshold`` argument if given,
        otherwise the field's own threshold, otherwise 1.0. ``fail_rate`` is the
        number of failures divided by the number of scored occurrences.
        """
        observed: dict[str, list[float]] = {}
        below_bar: dict[str, int] = {}

        parsed_reports = (rep for rep in self.per_sample if not rep.parse_error)
        for rep in parsed_reports:
            for path, node in rep.field_scores.items():
                value = node.score
                if value is None:
                    continue
                observed.setdefault(path, []).append(value)

                if threshold is not None:
                    bar = threshold
                elif node.threshold is not None:
                    bar = node.threshold
                else:
                    bar = 1.0
                if value < bar:
                    below_bar[path] = below_bar.get(path, 0) + 1

        breakdown: dict[str, dict[str, float]] = {}
        for path, vals in observed.items():
            breakdown[path] = {
                "mean": mean(vals),
                "min": min(vals),
                "max": max(vals),
                "p95": _percentile(vals, 0.95),
                "fail_rate": below_bar.get(path, 0) / len(vals),
            }
        return breakdown

    def print_summary(self) -> None:
        """Write the batch summary (aggregate metrics + field breakdown) to stdout."""
        from structured_eval.reporting import render

        text = render(self)
        print(text)  # noqa: T201
339
+
340
+
341
class ConsistencyReport(BaseModel):
    """Stability of repeated runs of one prompt (``evaluate_consistency``).

    ``field_variance`` is the variance of each field's score across runs;
    fields with variance at or below ``variance_threshold`` are ``stable_fields``,
    the rest ``unstable_fields``. ``score_variance`` is the variance of the
    document-level key-metric score.
    """

    # One EvalReport per repeated run.
    per_run: list[EvalReport] = Field(default_factory=list)
    # field path -> variance of that field's score across runs
    field_variance: dict[str, float] = Field(default_factory=dict)
    stable_fields: list[str] = Field(default_factory=list)
    unstable_fields: list[str] = Field(default_factory=list)
    # Both remain None when not populated by the producer of this report.
    mean_score: float | None = None
    score_variance: float | None = None

    def print_summary(self) -> None:
        """Print a consistency summary (stable/unstable fields + variance)."""
        # Imported inside the method; the reporting module imports this module.
        from structured_eval.reporting import render

        print(render(self))  # noqa: T201
@@ -0,0 +1,19 @@
1
+ from __future__ import annotations
2
+
3
+ from typing import Any
4
+
5
+ from pydantic import BaseModel
6
+
7
+
8
class Sample(BaseModel):
    """One document to evaluate.

    Wrapping in ``Sample`` removes the ambiguity of a bare ``list``: a list
    passed as ``actual`` is a single document whose root is an array, whereas
    ``list[Sample]`` is a batch of documents.
    """

    # The document under evaluation: an object, an array, or a string.
    actual: dict[str, Any] | list[Any] | str
    # The reference document in the same forms; optional.
    expected: dict[str, Any] | list[Any] | str | None = None
    source: str | None = None  # original text, for Faithfulness
    id: str | None = None  # identifier in a BatchEvalReport
File without changes
@@ -0,0 +1,5 @@
1
+ """Report rendering: plain-text console summaries for the report types."""
2
+
3
+ from structured_eval.reporting.console import render
4
+
5
+ __all__ = ["render"]
@@ -0,0 +1,194 @@
1
+ """Plain-text console rendering for the report types (no hard dependency).
2
+
3
+ ``ConsoleRenderer().render(report)`` returns a string; the module-level
4
+ ``render`` is a thin convenience over it, used by ``EvalReport.print_summary``.
5
+ The layout is pure stdlib so it works out of the box; Rich can be layered on
6
+ for colour later.
7
+ """
8
+
9
+ from __future__ import annotations
10
+
11
+ from structured_eval.models.result import (
12
+ BatchEvalReport,
13
+ ConsistencyReport,
14
+ EvalReport,
15
+ )
16
+
17
+ _RULE = "─"
18
+ _BAR = "━"
19
+ _WIDTH = 60
20
+
21
+
22
+ class ConsoleRenderer:
23
+ """Renders ``EvalReport`` / ``BatchEvalReport`` / ``ConsistencyReport``."""
24
+
25
+ def render(self, report: EvalReport | BatchEvalReport | ConsistencyReport) -> str:
26
+ if isinstance(report, EvalReport):
27
+ return self._render_eval(report)
28
+ if isinstance(report, BatchEvalReport):
29
+ return self._render_batch(report)
30
+ if isinstance(report, ConsistencyReport):
31
+ return self._render_consistency(report)
32
+ raise NotImplementedError
33
+
34
+ # ── EvalReport ──────────────────────────────────────────────────────────
35
+
36
+ def _render_eval(self, report: EvalReport) -> str:
37
+ out: list[str] = [_BAR * _WIDTH]
38
+
39
+ if report.parse_error:
40
+ out.append(f" PARSE ERROR: {report.parse_error_message}")
41
+ out.append(_BAR * _WIDTH)
42
+ return "\n".join(out)
43
+
44
+ bar = _BAR * _WIDTH
45
+ if report.score is not None:
46
+ verdict = "✓ PASS" if report.score >= 1.0 else "✗ FAIL"
47
+ label = report.score_label or ""
48
+ out.append(f" OVERALL {report.score:.2f} {verdict} {label}")
49
+ else:
50
+ out.append(" OVERALL — (no ground truth)")
51
+
52
+ # Document-level metrics: those a metric produced at the root ("$").
53
+ doc_metrics: dict[str, float] = {}
54
+ for name, coll in report.metrics.items():
55
+ v = coll.root()
56
+ if v is not None:
57
+ doc_metrics[name] = v
58
+ grid = self._metric_grid(doc_metrics, skip=report.score_label)
59
+ if grid:
60
+ out += ["", *grid]
61
+ out.append(bar)
62
+
63
+ # scalar leaves with a key-metric score
64
+ rows = []
65
+ for fs in report.field_scores.values():
66
+ if fs.score is None:
67
+ continue
68
+ metric_name = next(
69
+ (k for k, v in fs.metrics.items() if v == fs.score), "score"
70
+ )
71
+ rows.append(
72
+ [
73
+ fs.path,
74
+ metric_name,
75
+ self._num(fs.score),
76
+ self._num(fs.threshold),
77
+ self._mark(fs.score, fs.threshold),
78
+ ]
79
+ )
80
+ if rows:
81
+ out += self._table(
82
+ ["Field", "Metric", "Score", "Threshold", "Mark"],
83
+ rows,
84
+ ["<", "<", ">", ">", "^"],
85
+ )
86
+ out.append(bar)
87
+
88
+ return "\n".join(out)
89
+
90
+ # ── BatchEvalReport ───────────────────────────────────────────────────
91
+
92
+ def _render_batch(self, report: BatchEvalReport) -> str:
93
+ bar = _BAR * _WIDTH
94
+ n = len(report.per_sample)
95
+ out = [bar, f" BATCH {n} samples"]
96
+ if report.score is not None:
97
+ out.append(f" mean {report.score_label or 'score'} {report.score:.2f}")
98
+ out.append(f" perfect_response_rate {report.perfect_response_rate:.2f}")
99
+ out.append(f" parse_error_rate {report.parse_error_rate:.2f}")
100
+ grid = self._metric_grid(report.metrics, skip=report.score_label)
101
+ if grid:
102
+ out += ["", *grid]
103
+ out.append(bar)
104
+
105
+ bd = report.field_breakdown()
106
+ ranked = sorted(bd.items(), key=lambda kv: kv[1]["fail_rate"], reverse=True)
107
+ if ranked:
108
+ rows = [
109
+ [
110
+ path,
111
+ self._num(s["mean"]),
112
+ self._num(s["p95"]),
113
+ self._num(s["fail_rate"]),
114
+ ]
115
+ for path, s in ranked
116
+ ]
117
+ out.append(" Field breakdown (worst first)")
118
+ out += self._table(
119
+ ["Field", "mean", "p95", "fail_rate"], rows, ["<", ">", ">", ">"]
120
+ )
121
+ out.append(bar)
122
+ return "\n".join(out)
123
+
124
+ # ── ConsistencyReport ─────────────────────────────────────────────────
125
+
126
+ def _render_consistency(self, report: ConsistencyReport) -> str:
127
+ bar = _BAR * _WIDTH
128
+ out = [bar, f" CONSISTENCY {len(report.per_run)} runs"]
129
+ if report.mean_score is not None:
130
+ out.append(f" mean score {report.mean_score:.2f}")
131
+ if report.score_variance is not None:
132
+ out.append(f" score variance {report.score_variance:.4f}")
133
+ out += [
134
+ f" stable {', '.join(report.stable_fields) or '—'}",
135
+ f" unstable {', '.join(report.unstable_fields) or '—'}",
136
+ bar,
137
+ ]
138
+ ranked = sorted(
139
+ report.field_variance.items(), key=lambda kv: kv[1], reverse=True
140
+ )
141
+ if ranked:
142
+ rows = [[path, f"{var:.4f}"] for path, var in ranked]
143
+ out += self._table(["Field", "variance"], rows, ["<", ">"])
144
+ out.append(bar)
145
+ return "\n".join(out)
146
+
147
+ # ── formatting helpers ─────────────────────────────────────────────────
148
+
149
+ @staticmethod
150
+ def _num(value: float | None) -> str:
151
+ return "—" if value is None else f"{value:.2f}"
152
+
153
+ @staticmethod
154
+ def _mark(score: float | None, bar: float | None) -> str:
155
+ if score is None or bar is None:
156
+ return " "
157
+ return "✓" if score >= bar else "✗"
158
+
159
+ @staticmethod
160
+ def _table(
161
+ headers: list[str], rows: list[list[str]], aligns: list[str] | None = None
162
+ ) -> list[str]:
163
+ """Render a simple monospace table as a list of lines."""
164
+ cols = (
165
+ list(zip(*([headers, *rows]), strict=False))
166
+ if rows
167
+ else [[h] for h in headers]
168
+ )
169
+ widths = [max(len(c) for c in col) for col in cols]
170
+ aligns = aligns or ["<"] * len(headers)
171
+
172
+ def fmt(cells: list[str]) -> str:
173
+ return " ".join(
174
+ f"{c:{a}{w}}" for c, w, a in zip(cells, widths, aligns, strict=False)
175
+ )
176
+
177
+ lines = [fmt(headers), fmt([_RULE * w for w in widths])]
178
+ lines += [fmt(r) for r in rows]
179
+ return [" " + line for line in lines]
180
+
181
+ @staticmethod
182
+ def _metric_grid(metrics: dict[str, float], skip: str | None = None) -> list[str]:
183
+ """Two-per-line key/value grid of document metrics."""
184
+ items = [(k, v) for k, v in metrics.items() if k != skip]
185
+ if not items:
186
+ return []
187
+ width = max(len(k) for k, _ in items)
188
+ cells = [f"{k:<{width}} {ConsoleRenderer._num(v)}" for k, v in items]
189
+ return [" " + " ".join(cells[i : i + 2]) for i in range(0, len(cells), 2)]
190
+
191
+
192
def render(report: EvalReport | BatchEvalReport | ConsistencyReport) -> str:
    """Return the console text for ``report`` using a fresh ``ConsoleRenderer``."""
    renderer = ConsoleRenderer()
    return renderer.render(report)
@@ -0,0 +1,16 @@
1
+ from structured_eval.utils.flatten import extract_paths, flatten
2
+ from structured_eval.utils.structured_diff import (
3
+ DiffEntry,
4
+ DiffType,
5
+ StructuredDiff,
6
+ structured_diff,
7
+ )
8
+
9
+ __all__ = [
10
+ "DiffEntry",
11
+ "DiffType",
12
+ "StructuredDiff",
13
+ "extract_paths",
14
+ "flatten",
15
+ "structured_diff",
16
+ ]
@@ -0,0 +1,66 @@
1
+ from __future__ import annotations
2
+
3
+ from typing import Any
4
+
5
+
6
def flatten(obj: dict[str, Any] | list[Any], prefix: str = "") -> dict[str, Any]:
    """Recursively flatten a nested dict/list into dot-and-bracket key paths.

    Dict keys use dot notation:      {"a": {"b": 1}} → {"a.b": 1}
    List indices use brackets:       {"a": [1, 2]}   → {"a[0]": 1, "a[1]": 2}
    Empty containers are left as-is: {"a": {}}       → {"a": {}}

    Keys are always rendered with ``str``, so a non-string key (for example an
    integer key in a YAML mapping) yields a string path at every depth,
    including the top level.

    Args:
        obj: Dict or list to flatten. Any other value yields an empty dict.
        prefix: Internal prefix for recursive calls; do not pass externally.

    Returns:
        Flat dict mapping string paths to primitive (or empty container) values.

    Example:
        >>> flatten({"invoice": {"id": "1", "items": [{"price": 100}]}})
        {'invoice.id': '1', 'invoice.items[0].price': 100}
    """
    # Pair every direct child with its full path; dicts and lists differ only
    # in how that path is spelled.
    if isinstance(obj, dict):
        children = (
            (f"{prefix}.{key}" if prefix else str(key), value)
            for key, value in obj.items()
        )
    elif isinstance(obj, list):
        children = ((f"{prefix}[{i}]", item) for i, item in enumerate(obj))
    else:
        return {}

    result: dict[str, Any] = {}
    for path, value in children:
        # Only non-empty containers are descended into; empty ones are kept as
        # leaf values so they do not vanish from the output.
        if isinstance(value, (dict, list)) and value:
            result.update(flatten(value, path))
        else:
            result[path] = value
    return result
40
+
41
+
42
def extract_paths(value: Any, prefix: str = "") -> set[str]:
    """Collect the path of every container and leaf in a JSON-like value.

    Only the shape matters: dict keys contribute dotted segments (``a``,
    ``a.b``), list positions contribute bracketed indices (``a[0]``), and the
    values stored at the leaves are never inspected. In contrast to
    :func:`flatten`, intermediate containers appear in the result alongside the
    leaves, and the result is an unordered set rather than a path→value map.
    A value that is neither a dict nor a list contributes no paths.

    Example:
        >>> sorted(extract_paths({"a": {"b": 1}, "c": [2]}))
        ['a', 'a.b', 'c', 'c[0]']
    """
    if isinstance(value, dict):
        labelled = [
            (f"{prefix}.{key}" if prefix else str(key), child)
            for key, child in value.items()
        ]
    elif isinstance(value, list):
        labelled = [
            (f"{prefix}[{index}]", child) for index, child in enumerate(value)
        ]
    else:
        return set()

    collected: set[str] = set()
    for here, child in labelled:
        collected.add(here)
        collected.update(extract_paths(child, here))
    return collected