structured-eval 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (94) hide show
  1. structured_eval/__init__.py +27 -0
  2. structured_eval/alignment/__init__.py +15 -0
  3. structured_eval/alignment/base.py +40 -0
  4. structured_eval/alignment/by_index.py +24 -0
  5. structured_eval/alignment/by_key.py +73 -0
  6. structured_eval/alignment/factory.py +28 -0
  7. structured_eval/alignment/hungarian.py +156 -0
  8. structured_eval/api.py +79 -0
  9. structured_eval/engine/__init__.py +15 -0
  10. structured_eval/engine/aggregator.py +96 -0
  11. structured_eval/engine/evaluator.py +72 -0
  12. structured_eval/engine/metric_runner.py +69 -0
  13. structured_eval/engine/parser.py +42 -0
  14. structured_eval/engine/report_builder.py +68 -0
  15. structured_eval/engine/tree_builder.py +319 -0
  16. structured_eval/formats/__init__.py +5 -0
  17. structured_eval/formats/base.py +19 -0
  18. structured_eval/formats/json_parser.py +44 -0
  19. structured_eval/formats/yaml_parser.py +24 -0
  20. structured_eval/integrations/__init__.py +11 -0
  21. structured_eval/integrations/_adapter.py +47 -0
  22. structured_eval/integrations/deepeval.py +74 -0
  23. structured_eval/integrations/langsmith.py +90 -0
  24. structured_eval/metrics/__init__.py +101 -0
  25. structured_eval/metrics/array_accuracy.py +28 -0
  26. structured_eval/metrics/array_cardinality.py +27 -0
  27. structured_eval/metrics/array_exact_match.py +48 -0
  28. structured_eval/metrics/array_f1.py +34 -0
  29. structured_eval/metrics/array_jaccard_similarity.py +60 -0
  30. structured_eval/metrics/array_precision.py +34 -0
  31. structured_eval/metrics/array_prf1.py +40 -0
  32. structured_eval/metrics/array_recall.py +33 -0
  33. structured_eval/metrics/base.py +144 -0
  34. structured_eval/metrics/character_f1.py +50 -0
  35. structured_eval/metrics/composite_score.py +46 -0
  36. structured_eval/metrics/coverage_leaf_score.py +29 -0
  37. structured_eval/metrics/date_distance_score.py +63 -0
  38. structured_eval/metrics/exact.py +21 -0
  39. structured_eval/metrics/exponential_numeric_score.py +47 -0
  40. structured_eval/metrics/field_faithfulness.py +38 -0
  41. structured_eval/metrics/fuzzy.py +64 -0
  42. structured_eval/metrics/invoker.py +90 -0
  43. structured_eval/metrics/levenshtein.py +16 -0
  44. structured_eval/metrics/mean_score.py +31 -0
  45. structured_eval/metrics/numeric.py +83 -0
  46. structured_eval/metrics/numeric_closeness.py +35 -0
  47. structured_eval/metrics/object_accuracy.py +47 -0
  48. structured_eval/metrics/object_exact_match.py +41 -0
  49. structured_eval/metrics/object_f1.py +47 -0
  50. structured_eval/metrics/object_precision.py +49 -0
  51. structured_eval/metrics/object_prf1.py +51 -0
  52. structured_eval/metrics/object_recall.py +44 -0
  53. structured_eval/metrics/object_type_validity.py +34 -0
  54. structured_eval/metrics/overall_leaf_score.py +32 -0
  55. structured_eval/metrics/presence.py +22 -0
  56. structured_eval/metrics/regex_match.py +51 -0
  57. structured_eval/metrics/rule_pass_rate/__init__.py +5 -0
  58. structured_eval/metrics/rule_pass_rate/dsl.py +224 -0
  59. structured_eval/metrics/rule_pass_rate/engine.py +24 -0
  60. structured_eval/metrics/rule_pass_rate/metric.py +33 -0
  61. structured_eval/metrics/schema_validity/__init__.py +7 -0
  62. structured_eval/metrics/schema_validity/metric.py +35 -0
  63. structured_eval/metrics/schema_validity/validator.py +119 -0
  64. structured_eval/metrics/structural_similarity.py +40 -0
  65. structured_eval/metrics/token_f1.py +44 -0
  66. structured_eval/metrics/type_match.py +35 -0
  67. structured_eval/metrics/utils/__init__.py +10 -0
  68. structured_eval/metrics/utils/array.py +31 -0
  69. structured_eval/metrics/utils/calculate.py +72 -0
  70. structured_eval/metrics/utils/number.py +46 -0
  71. structured_eval/metrics/utils/object_utils.py +87 -0
  72. structured_eval/models/__init__.py +72 -0
  73. structured_eval/models/config.py +124 -0
  74. structured_eval/models/context.py +25 -0
  75. structured_eval/models/metric_result.py +121 -0
  76. structured_eval/models/nodes/__init__.py +13 -0
  77. structured_eval/models/nodes/array_node.py +32 -0
  78. structured_eval/models/nodes/base.py +113 -0
  79. structured_eval/models/nodes/object_node.py +19 -0
  80. structured_eval/models/nodes/scalar.py +14 -0
  81. structured_eval/models/result.py +361 -0
  82. structured_eval/models/sample.py +19 -0
  83. structured_eval/py.typed +0 -0
  84. structured_eval/reporting/__init__.py +5 -0
  85. structured_eval/reporting/console.py +194 -0
  86. structured_eval/utils/__init__.py +16 -0
  87. structured_eval/utils/flatten.py +66 -0
  88. structured_eval/utils/paths.py +58 -0
  89. structured_eval/utils/structured_diff.py +159 -0
  90. structured_eval-0.1.0.dist-info/METADATA +322 -0
  91. structured_eval-0.1.0.dist-info/RECORD +94 -0
  92. structured_eval-0.1.0.dist-info/WHEEL +5 -0
  93. structured_eval-0.1.0.dist-info/licenses/LICENSE +201 -0
  94. structured_eval-0.1.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,58 @@
1
+ """Path navigation over nested dict/list documents.
2
+
3
+ A pure utility (no model dependencies) shared by the node tree, the engine's
4
+ tree builder, and array alignment. ``navigate`` walks a dot-and-bracket path;
5
+ ``MISSING`` is the sentinel for an unresolvable step (distinct from ``None`` so
6
+ callers can tell "absent" from "present but null").
7
+ """
8
+
9
+ from __future__ import annotations
10
+
11
+ import re
12
+ from typing import Any
13
+
14
# Tokenizer for dot-and-bracket paths: each match is either a bare key
# (a run of characters other than ".", "[" and "]") or one bracketed group.
#   "roles[0].name" → ["roles", "[0]", "name"]
#   "items[0]"      → ["items", "[0]"]
#   "a.b"           → ["a", "b"]
_PATH_TOKEN = re.compile(r"[^.\[\]]+|\[[^\]]*\]")

# Unique marker handed back when a path step cannot be resolved. It is a
# dedicated object rather than None so a stored null stays distinguishable
# from a key or index that does not exist.
MISSING = object()


def navigate(obj: Any, path: str) -> Any:
    """Resolve a dot-and-bracket ``path`` against a nested dict/list ``obj``.

    The paths ``"$"`` and ``""`` both denote the root, which is returned as
    is. Bare tokens are looked up as dict keys; bracketed tokens are parsed
    as integers and used as list indices (negative indices that fall inside
    the list are accepted).

    Args:
        obj: Root document to walk.
        path: Path such as ``"roles[0].name"``.

    Returns:
        The value found at ``path``, or ``MISSING`` as soon as one step
        fails: a key step on a non-dict or with an unknown key, a bracket
        step on a non-list, a bracket whose content is not an integer, or
        an index outside the list bounds.

    Examples:
        >>> navigate({"a": {"b": 1}}, "a.b")
        1
        >>> navigate({"items": [1, 2]}, "items[0]")
        1
    """
    if path in ("$", ""):
        return obj

    node = obj
    for step in _PATH_TOKEN.findall(path):
        if step[0] != "[":
            # Key step: only dicts that actually hold the key can continue.
            if isinstance(node, dict) and step in node:
                node = node[step]
                continue
            return MISSING

        # Index step: the current node has to be a list.
        if not isinstance(node, list):
            return MISSING
        try:
            position = int(step[1:-1])
        except ValueError:
            return MISSING
        size = len(node)
        if position >= size or position < -size:
            return MISSING
        node = node[position]
    return node
@@ -0,0 +1,159 @@
1
+ from __future__ import annotations
2
+
3
+ import re
4
+ from enum import StrEnum
5
+ from typing import Any
6
+
7
+ from pydantic import BaseModel, Field
8
+
9
+
10
class DiffType(StrEnum):
    """Kind of difference recorded for one field path.

    The direction is expressed from the point of view of the actual
    document relative to the expected one.
    """

    ADDED = "added"  # present in actual, absent in expected
    REMOVED = "removed"  # present in expected, absent in actual
    CHANGED = "changed"  # present in both but value differs
14
+
15
+
16
class DiffEntry(BaseModel):
    """Single difference between actual and expected at one field path.

    ``actual`` and ``expected`` are typed ``Any``, so ``None`` can be either
    the placeholder for the side on which the field is absent or a real
    value; check ``diff_type`` to tell which side is missing.
    """

    # Location of the differing field, in dot/bracket form.
    path: str = Field(description="Dot/bracket path to the differing field.")
    # Which of the three difference kinds this entry represents.
    diff_type: DiffType = Field(
        description="Type of difference: added, removed, or changed."
    )
    # Value found on the actual side.
    actual: Any = Field(description="Value in actual (None for removed entries).")
    # Value found on the expected side.
    expected: Any = Field(description="Value in expected (None for added entries).")
25
+
26
+
27
class StructuredDiff(BaseModel):
    """Field-level diff between an actual and an expected document.

    Holds the flat list of differences in ``entries``. The ``added``,
    ``removed`` and ``changed`` properties give per-type views of that list,
    and ``is_equal`` reports whether any difference was recorded at all.
    """

    entries: list[DiffEntry] = Field(default_factory=list)

    def _entries_of(self, kind: DiffType) -> list[DiffEntry]:
        """Return the entries whose ``diff_type`` equals ``kind``, in stored order."""
        selected: list[DiffEntry] = []
        for entry in self.entries:
            if entry.diff_type == kind:
                selected.append(entry)
        return selected

    @property
    def added(self) -> list[DiffEntry]:
        """Entries for fields that exist in actual but not in expected."""
        return self._entries_of(DiffType.ADDED)

    @property
    def removed(self) -> list[DiffEntry]:
        """Entries for fields that exist in expected but not in actual."""
        return self._entries_of(DiffType.REMOVED)

    @property
    def changed(self) -> list[DiffEntry]:
        """Entries for fields that exist on both sides with different values."""
        return self._entries_of(DiffType.CHANGED)

    @property
    def is_equal(self) -> bool:
        """True when no difference was recorded."""
        return not self.entries
55
+
56
+
57
def structured_diff(
    actual: dict[str, Any],
    expected: dict[str, Any],
) -> StructuredDiff:
    """Build a field-level diff of ``actual`` against ``expected``.

    DeepDiff compares the two documents; six of its report categories are
    translated into ``DiffEntry`` objects and returned sorted by path.

    Args:
        actual: LLM output document.
        expected: Ground truth document.

    Returns:
        StructuredDiff with one DiffEntry per reported path, ordered by the
        readable path string.

    Raises:
        ImportError: If deepdiff is not installed.
    """
    try:
        from deepdiff import DeepDiff
    except ImportError as exc:  # pragma: no cover
        raise ImportError(
            "deepdiff is required for structured_diff. "
            "Install it with: pip install 'structured-eval[diff]'"
        ) from exc

    def only_in_actual(path: str, value: Any) -> DiffEntry:
        # The field exists on the actual side only.
        return DiffEntry(
            path=path, diff_type=DiffType.ADDED, actual=value, expected=None
        )

    def only_in_expected(path: str, value: Any) -> DiffEntry:
        # The field exists on the expected side only.
        return DiffEntry(
            path=path, diff_type=DiffType.REMOVED, actual=None, expected=value
        )

    def differs(path: str, change: Any) -> DiffEntry:
        # DeepDiff's "new" side is actual and its "old" side is expected.
        return DiffEntry(
            path=path,
            diff_type=DiffType.CHANGED,
            actual=change["new_value"],
            expected=change["old_value"],
        )

    # DeepDiff report categories, in the order their entries are collected,
    # each paired with the builder that turns one item into a DiffEntry.
    builders = (
        ("dictionary_item_added", only_in_actual),
        ("dictionary_item_removed", only_in_expected),
        ("values_changed", differs),
        ("type_changes", differs),
        ("iterable_item_added", only_in_actual),
        ("iterable_item_removed", only_in_expected),
    )

    # DeepDiff(old, new): expected is passed as old, actual as new.
    report = DeepDiff(expected, actual, verbose_level=2)
    found = [
        build(_to_readable_path(raw_path), payload)
        for category, build in builders
        for raw_path, payload in report.get(category, {}).items()
    ]
    return StructuredDiff(entries=sorted(found, key=lambda entry: entry.path))
150
+
151
+
152
def _to_readable_path(deepdiff_path: str) -> str:
    """Convert DeepDiff path notation to dot/bracket notation.

    root['a']['b'][0] → a.b[0]

    Quoted keys become dot-separated names; unquoted bracket groups (list
    indices) are left untouched. Both quoting styles are handled, because
    DeepDiff switches to double quotes for keys that contain an apostrophe
    (``root["it's"]``).

    Args:
        deepdiff_path: Path string as reported by DeepDiff.

    Returns:
        The readable path; an empty string when the path is the root itself.
    """
    # Drop the "root" marker only when it is really there, instead of
    # blindly slicing off the first four characters.
    path = deepdiff_path.removeprefix("root")
    # One pass over both quoting styles, so text produced by one replacement
    # is never re-scanned by the other.
    path = re.sub(
        r"""\['([^']+)'\]|\["([^"]+)"\]""",
        lambda match: "." + (match.group(1) or match.group(2)),
        path,
    )
    # Remove only the separator added in front of the first key; lstrip(".")
    # would also eat dots that belong to a key starting with ".".
    return path.removeprefix(".")
@@ -0,0 +1,322 @@
1
+ Metadata-Version: 2.4
2
+ Name: structured-eval
3
+ Version: 0.1.0
4
+ Summary: The LLM Structured Output Evaluation Framework
5
+ License: Apache-2.0
6
+ Project-URL: Homepage, https://github.com/kirillpechurin/structured-eval
7
+ Project-URL: Repository, https://github.com/kirillpechurin/structured-eval
8
+ Project-URL: Issues, https://github.com/kirillpechurin/structured-eval/issues
9
+ Keywords: structured llm evaluation,structured eval,json eval,llm,structured output,pydantic
10
+ Classifier: Development Status :: 4 - Beta
11
+ Classifier: Intended Audience :: Developers
12
+ Classifier: License :: OSI Approved :: Apache Software License
13
+ Classifier: Operating System :: OS Independent
14
+ Classifier: Programming Language :: Python :: 3
15
+ Classifier: Programming Language :: Python :: 3.12
16
+ Classifier: Programming Language :: Python :: 3.13
17
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
18
+ Classifier: Topic :: Software Development :: Testing
19
+ Classifier: Typing :: Typed
20
+ Requires-Python: >=3.12
21
+ Description-Content-Type: text/markdown
22
+ License-File: LICENSE
23
+ Requires-Dist: pydantic>=2.5.0
24
+ Provides-Extra: yaml
25
+ Requires-Dist: pyyaml>=6.0; extra == "yaml"
26
+ Provides-Extra: fuzzy
27
+ Requires-Dist: rapidfuzz>=3.0.0; extra == "fuzzy"
28
+ Provides-Extra: jsonschema
29
+ Requires-Dist: jsonschema>=4.20.0; extra == "jsonschema"
30
+ Provides-Extra: rules
31
+ Requires-Dist: jsonpath-ng>=1.6.0; extra == "rules"
32
+ Provides-Extra: diff
33
+ Requires-Dist: deepdiff>=7.0.0; extra == "diff"
34
+ Provides-Extra: align
35
+ Requires-Dist: scipy>=1.13.0; extra == "align"
36
+ Provides-Extra: deepeval
37
+ Requires-Dist: deepeval>=3.0.0; extra == "deepeval"
38
+ Provides-Extra: langsmith
39
+ Requires-Dist: langsmith>=0.8.0; extra == "langsmith"
40
+ Provides-Extra: all
41
+ Requires-Dist: structured-eval[align,diff,fuzzy,jsonschema,rules,yaml]; extra == "all"
42
+ Dynamic: license-file
43
+
44
+ # structured-eval
45
+
46
+ **A declarative, field-level evaluation framework for LLM structured outputs (JSON/YAML).**
47
+
48
+ Getting an LLM to return well-formed JSON is mostly a solved problem — it parses,
49
+ it fits the schema, the types line up. But well-formed isn't the same as *right*.
50
+ The shape can be flawless while a price is wrong, a date is invented, or a status
51
+ quietly contradicts the rest of the record. Structural checks wave all of that
52
+ through.
53
+
54
+ structured-eval looks at what those checks skip: **the values themselves**. It
55
+ scores your output field by field, so you don't just learn *that* a response is
56
+ off — you see *which* fields matched and which didn't, where to look first, and,
57
+ across a dataset, which fields your model keeps getting wrong.
58
+
59
+ structured-eval lets you check not just that the JSON is valid, but that the
60
+ data itself is correct.
61
+
62
+ ## The gap it closes
63
+
64
+ Correctness is a ladder — each level assumes the ones below it:
65
+
66
+ | | |
67
+ |-----------|------------------------------------------------------|
68
+ | **L0–L3** | structure: parses · types · required · no extras |
69
+ | **L4** | values are close to expected |
70
+ | **L5** | values are grounded in the source (no hallucination) |
71
+ | **L6** | fields are logically consistent with one another |
72
+
73
+ L0–L3 is where most tools stop. **L4–L6 is where structured-eval earns its keep.**
74
+ See [the introduction](docs/introduction.md) for the full ladder.
75
+
76
+ ## Install
77
+
78
+ ```bash
79
+ pip install structured-eval # core depends only on Pydantic
80
+ pip install "structured-eval[all]" # + YAML, fuzzy, schema, rules, scipy alignment…
81
+ ```
82
+
83
+ Optional features live behind [extras](docs/getting-started.md#install) — install
84
+ only what you need.
85
+
86
+ ## Quick start
87
+
88
+ A model extracted a course record; you have the canonical one to check it against.
89
+ The two are *structurally* identical — same keys, mixed types, a nested array of
90
+ objects — but several values are off. A small config says *how* to judge each
91
+ field, and the report tells you exactly where the output stands:
92
+
93
+ ```python
94
+ from structured_eval import evaluate
95
+ from structured_eval.models import EvalConfig, FieldConfig
96
+ from structured_eval.metrics import Numeric, TokenF1
97
+
98
+ expected = {
99
+ "course_id": "COURSE-101",
100
+ "title": "Introduction to Python",
101
+ "published": True,
102
+ "duration_hours": 12,
103
+ "rating": 4.8,
104
+ "modules": [
105
+ {"name": "Basics", "lessons": 5},
106
+ {"name": "Functions", "lessons": 4},
107
+ {"name": "Classes", "lessons": 3},
108
+ ],
109
+ }
110
+
111
+ actual = {
112
+ "course_id": "COURSE-101",
113
+ "title": "Intro to Python", # paraphrased
114
+ "published": True,
115
+ "duration_hours": 10, # off by 2
116
+ "rating": 4.5, # off by 0.3
117
+ "modules": [
118
+ {"name": "Basics", "lessons": 5},
119
+ {"name": "Functions", "lessons": 4},
120
+ ], # "Classes" module missing
121
+ }
122
+
123
+ config = EvalConfig(fields={
124
+ "title": FieldConfig(metrics=[TokenF1()]), # reward paraphrases
125
+ "duration_hours": FieldConfig(metrics=[Numeric(tolerance=2)]),
126
+ "rating": FieldConfig(metrics=[Numeric(tolerance=0.5)]), # close enough is fine
127
+ })
128
+
129
+ report = evaluate(actual, expected, config)
130
+
131
+ report.score # 0.8889 — close, with the gaps pinpointed
132
+ report.field_scores["title"].score # 0.6667 — paraphrase gets partial credit
133
+ report.field_scores["duration_hours"].score # 1.0 — within tolerance
134
+ report.field_scores["modules"].score # 0.6667 — 2 of 3 modules recovered
135
+ report.field_scores["modules[0]"].score # 1.0 — first module is spot-on
136
+ ```
137
+
138
+ Every field is scored — nested objects and array elements included — so you see
139
+ not a single pass/fail but exactly which fields hold up and which don't.
140
+
141
+ ### Sensible default metrics
142
+
143
+ The config is optional. structured-eval ships a default metric for every node
144
+ type, so you only configure the fields where the default isn't what you want —
145
+ the rest just work. With no config at all, the same data is scored by those
146
+ defaults:
147
+
148
+ ```python
149
+ report = evaluate(actual, expected) # no config
150
+
151
+ report.score # 0.4444 — scored entirely by the defaults
152
+ report.field_scores["title"].score # 0.0 — exact match: "Intro to Python" ≠ "Introduction to Python"
153
+ ```
154
+
155
+ Each node type gets a structural default, and every node's headline score (its
156
+ *representative*) defaults to the mean of its own metrics:
157
+
158
+ | Node | Default metric | What it does |
159
+ |----------------------|------------------|------------------------------------------------|
160
+ | scalar (leaf) | `ExactMatch` | the value must match exactly |
161
+ | object | `ObjectAccuracy` | mean correctness of its fields |
162
+ | array | `ArrayAccuracy` | mean correctness of its aligned elements |
163
+ | any node (headline) | `MeanScore` | the node's representative = mean of its metrics |
164
+
165
+ Exact match is a strict baseline — it punishes every paraphrase and rounded value
166
+ as wrong, which is why the no-config score is low. Tuning metrics per field, as in
167
+ the first example, is how you tell the evaluator what "close enough" means for
168
+ *your* data. The defaults and the representative score are covered in
169
+ [the evaluation model](docs/core-concepts/evaluation-model.md) and the
170
+ [metric catalog](docs/metrics/index.md).
171
+
172
+ ## Explore — every level of correctness
173
+
174
+ structured-eval covers the whole ladder, L0 through L6. Each level has a tool and
175
+ a concept page behind it:
176
+
177
+ | Level | The question | Reach for | Learn more |
178
+ |---------------------|-------------------------------------|-------------------------------------------------------------|------------------------------------------------------------------------|
179
+ | **L0–L3** structure | does it parse / fit the schema? | `SchemaValidity` | [schema validity](docs/metrics/catalog/schema-validity.md) |
180
+ | **L4** values | is each value right? | field metrics — `ExactMatch`, `Numeric`, `TokenF1`, `Fuzzy` | [comparison is a metric](docs/core-concepts/comparison-is-a-metric.md) |
181
+ | **L4** roll-up | how do fields & elements aggregate? | `ObjectF1` / `ArrayF1`, alignment, weights | [array alignment](docs/core-concepts/array-alignment.md) |
182
+ | **L5** faithfulness | is it grounded in the source? | `FieldFaithfulness(source=…)` | [field faithfulness](docs/metrics/catalog/field_faithfulness.md) |
183
+ | **L6** logic | are fields mutually consistent? | `RulePassRate` + `Rule` DSL | [rule pass rate](docs/metrics/catalog/rule-pass-rate.md) |
184
+
185
+ **L0–L3 — structure.** Validate against a Pydantic model or JSON Schema, with no
186
+ ground-truth answer:
187
+
188
+ ```python
189
+ from pydantic import BaseModel
190
+ from structured_eval import evaluate
191
+ from structured_eval.models import EvalConfig
192
+ from structured_eval.metrics import SchemaValidity
193
+
194
+
195
+ class Course(BaseModel):
196
+ title: str
197
+ duration_hours: int
198
+
199
+
200
+ report = evaluate(
201
+ actual={"title": "ML", "duration_hours": "twelve"},
202
+ expected=None,
203
+ config=EvalConfig(key_metric=SchemaValidity(Course))
204
+ )
205
+ report.score # 0.0
206
+ report.metrics["schema_validity"].root().extra["schema_errors"]
207
+ # {'type_errors': ['duration_hours'], 'missing_required': [], 'extra_fields': []}
208
+ ```
209
+
210
+ **L4 — values.** Pick *how* each field is judged — exact match is just the default
211
+ (*comparison is a metric*):
212
+
213
+ ```python
214
+ from structured_eval import evaluate
215
+ from structured_eval.models import EvalConfig, FieldConfig
216
+ from structured_eval.metrics import Numeric, TokenF1
217
+
218
+ report = evaluate(
219
+ actual={"title": "Intro to Python", "duration_hours": 11},
220
+ expected={"title": "Introduction to Python", "duration_hours": 12},
221
+ config=EvalConfig(fields={
222
+ "title": FieldConfig(metrics=[TokenF1()]), # token overlap
223
+ "duration_hours": FieldConfig(metrics=[Numeric(tolerance=2)]),
224
+ }),
225
+ )
226
+ report.field_scores["title"].score # 0.6667 — partial credit for a paraphrase
227
+ report.field_scores["duration_hours"].score # 1.0 — within tolerance
228
+ ```
229
+
230
+ Fields roll up into objects and arrays with precision / recall / F1, and arrays are
231
+ [aligned](docs/core-concepts/array-alignment.md) by index, key, or optimally:
232
+
233
+ ```python
234
+ from structured_eval import evaluate
235
+ from structured_eval.models import EvalConfig
236
+ from structured_eval.metrics import ObjectF1
237
+
238
+ report = evaluate(
239
+ actual={"a": 1, "b": 9},
240
+ expected={"a": 1, "b": 2, "c": 3},
241
+ config=EvalConfig(metrics=[ObjectF1()])
242
+ )
243
+
244
+ report.metrics["object_f1"].root() # 0.4
245
+ ```
246
+
247
+ **L5 — faithfulness.** Catch hallucinations by checking each value against its
248
+ source — no `expected` required:
249
+
250
+ ```python
251
+ from structured_eval import evaluate
252
+ from structured_eval.models import EvalConfig
253
+ from structured_eval.metrics import FieldFaithfulness
254
+
255
+ report = evaluate(
256
+ actual={"title": "Introduction to Python", "duration_hours": 40},
257
+ expected=None,
258
+ config=EvalConfig(metrics=[FieldFaithfulness()]),
259
+ source="Course: Introduction to Python. Duration: 12 hours.",
260
+ )
261
+
262
+ report.metrics["field_faithfulness"].by_path # {'title': 1.0, 'duration_hours': 0.0 ← 40 ≠ 12}
263
+ ```
264
+
265
+ **L6 — logic.** Assert cross-field business rules with a small DSL:
266
+
267
+ ```python
268
+ from structured_eval import evaluate
269
+ from structured_eval.models import EvalConfig
270
+ from structured_eval.metrics import Rule, RulePassRate
271
+
272
+ report = evaluate(
273
+ actual={"subtotal": 100, "tax": 20, "total": 130},
274
+ expected=None,
275
+ config=EvalConfig(key_metric=RulePassRate([Rule("$.total").eq("$.subtotal + $.tax")]))
276
+ )
277
+
278
+ report.score # 0.0 — 130 ≠ 120
279
+ ```
280
+
281
+ ## Scale it
282
+
283
+ Evaluate a whole dataset, or measure how stable a prompt is across repeated runs:
284
+
285
+ ```python
286
+ from structured_eval import evaluate_batch, evaluate_consistency
287
+ from structured_eval.models import Sample
288
+
289
+ # one report per sample + dataset aggregates
290
+ batch = evaluate_batch([
291
+ Sample(actual={"x": 1}, expected={"x": 1}),
292
+ Sample(actual={"x": 1}, expected={"x": 2}),
293
+ ])
294
+ batch.score # 0.5
295
+ batch.perfect_response_rate # 0.5
296
+
297
+ # repeated runs of the same prompt → which fields drift?
298
+ runs = [
299
+ Sample(actual={"sentiment": "positive", "score": 0.9}, expected={"sentiment": "positive", "score": 0.9}),
300
+ Sample(actual={"sentiment": "positive", "score": 0.9}, expected={"sentiment": "positive", "score": 0.9}),
301
+ Sample(actual={"sentiment": "neutral", "score": 0.9}, expected={"sentiment": "positive", "score": 0.9}),
302
+ ]
303
+ report = evaluate_consistency(runs, variance_threshold=0.05)
304
+ report.stable_fields # ['score']
305
+ report.unstable_fields # ['sentiment'] — flipped on one run
306
+ ```
307
+
308
+ ## Documentation
309
+
310
+ - **[Introduction](docs/introduction.md)** — the L0–L6 ladder and why values matter.
311
+ - **[Getting started](docs/getting-started.md)** — install → first evaluation →
312
+ reading and tuning the report.
313
+ - **Core concepts** — [the evaluation model](docs/core-concepts/evaluation-model.md) ·
314
+ [comparison is a metric](docs/core-concepts/comparison-is-a-metric.md) ·
315
+ [array alignment](docs/core-concepts/array-alignment.md)
316
+ - **[Evaluation functions](docs/evaluation/index.md)** — `evaluate`,
317
+ `evaluate_batch`, `evaluate_consistency`.
318
+ - **[Metric catalog](docs/metrics/index.md)** — every metric, plus how to write your own.
319
+
320
+ ## License
321
+
322
+ Apache-2.0 — see [LICENSE](LICENSE).
@@ -0,0 +1,94 @@
1
+ structured_eval/__init__.py,sha256=HVWqijhPNqs4-GSFTv0Ew6i1dLlJchgGGq5Q8Vgx5I0,1142
2
+ structured_eval/api.py,sha256=ECYaLPH6C866W8BatYyNYBhlHXUpljjmwm6gI_bwMHc,2529
3
+ structured_eval/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
4
+ structured_eval/alignment/__init__.py,sha256=wAb9899fDPt_HbclQUD0iW6QhL62DokA0Zn8TAnajtM,471
5
+ structured_eval/alignment/base.py,sha256=Qp-VmrZkU2JSLre2pxXwkdxBhk2oemTridbgvPDvZXk,1374
6
+ structured_eval/alignment/by_index.py,sha256=58vsTb87k7i6yaDe_AhZ1C42Ojk3OF1KNgDuZ8_LQtc,820
7
+ structured_eval/alignment/by_key.py,sha256=nB3mdD7F0z85xfjZedpKEjsI5m-FgPt6DmI5jiX-wDQ,3260
8
+ structured_eval/alignment/factory.py,sha256=vxdFxmCTvv0z-6tkkTDMnpErWImsjET-BgaADv8slpQ,981
9
+ structured_eval/alignment/hungarian.py,sha256=8sOfQwwI1o4tGI816icceGNy3vm8VwmNf1cZJDMMYys,6421
10
+ structured_eval/engine/__init__.py,sha256=WEyDiUfi-EZXEAUFiIC3vSmo6URpFSSTzh65R3gK_hg,481
11
+ structured_eval/engine/aggregator.py,sha256=Xm7hPbv8L8dUhz3RZifHqqidEO5bprvxKoSdTheX8Yw,4125
12
+ structured_eval/engine/evaluator.py,sha256=9vJv5aVUnWRmmag_I2p_TCtUwe8SPUGfk1dSjRKGGto,2929
13
+ structured_eval/engine/metric_runner.py,sha256=XpjhWX2ulSdnIRkvPFpevPIbi5JN9t4TP_KmGZW5LyM,2745
14
+ structured_eval/engine/parser.py,sha256=Ux10hDIG_TtMQ07JNO-9OaEln46sS2SG9C1SWoSQju4,1505
15
+ structured_eval/engine/report_builder.py,sha256=EnaHevbQvbsGkYYW_W8CYvhdmE2bMUw5iSSJFGJPY_Y,2791
16
+ structured_eval/engine/tree_builder.py,sha256=RAjXLJ42_W70QgfT0v7pZWLAmh1pmnWb3Ycf1HfazUI,12800
17
+ structured_eval/formats/__init__.py,sha256=uJ2_LF-8KtA_oW4mOjBMDz8_1X7KYz3HzTsvavou-pE,270
18
+ structured_eval/formats/base.py,sha256=isWyP1dYBIMGl6Uf_oH4YEnF98hqIKnesJyHf-RzW6k,547
19
+ structured_eval/formats/json_parser.py,sha256=GDkPICl42p0tkUHUktD5AsNzLrWCAq_avDJgX_WE_KE,1334
20
+ structured_eval/formats/yaml_parser.py,sha256=DUBGwm0YGUrRQH5jc7wJTbPcJj1KIGSIojczB1bUMk0,779
21
+ structured_eval/integrations/__init__.py,sha256=PMcSdcjDWcKBjWWf9koeqWs8DoT2gbeHMzs-ajN4q-Q,440
22
+ structured_eval/integrations/_adapter.py,sha256=iAza_T-85cEWLezW8NTFa2iO5-phLHN5e_67qOGJlks,1721
23
+ structured_eval/integrations/deepeval.py,sha256=efHjgJp2006cQiUygUq5Con25BaZgP8IaTqJ2Pk-30Y,2567
24
+ structured_eval/integrations/langsmith.py,sha256=7yUJ-4AK6UgoYsUWlGNGz9gQ1aX0rD4ufwGUb4XapFY,3232
25
+ structured_eval/metrics/__init__.py,sha256=NqCKu0nbhF7ZUpvUnDTLRNPJ15qrV3RZf9wzHph_i3s,3553
26
+ structured_eval/metrics/array_accuracy.py,sha256=NSvotfYyK3FrU8XjkY8LCwt4w7oVMZabDvQ5iSorkXU,959
27
+ structured_eval/metrics/array_cardinality.py,sha256=XzXZvZhICszt7GcMqyYVV2dh1hZL9xaKiPzNDuvWR0M,825
28
+ structured_eval/metrics/array_exact_match.py,sha256=15uL2fQ6x-2r-61VYavV1qfKVwS40YSF0dX7Y6SE_nY,1789
29
+ structured_eval/metrics/array_f1.py,sha256=SBZLNxVUDeog37Iini-LGwE2_OsI0bmJDYRTLkOU_mg,1107
30
+ structured_eval/metrics/array_jaccard_similarity.py,sha256=FZ7J5KOfG2rQX_22QcVqqNV9SVXtnHIQmYJuDtwvwAI,1947
31
+ structured_eval/metrics/array_precision.py,sha256=UUYXeSHMNNpuGSTCqblYOG6PHOYHA1CJvUYZ_aKBzOQ,1175
32
+ structured_eval/metrics/array_prf1.py,sha256=l8W6-ayc-REi96i5kQxYmgj0S5Fwh0lEb3aj8DevzbY,1331
33
+ structured_eval/metrics/array_recall.py,sha256=MK84b-KV18otCYDbPD4e4AcWFnTtMW_D7RVyaYIHJvQ,1050
34
+ structured_eval/metrics/base.py,sha256=wuxWK6pNTo-RsIhpDaFWjGsTWXO6Xtvpj4Q3CgsEa5s,5776
35
+ structured_eval/metrics/character_f1.py,sha256=ofYuOR0NnF6f-rBfjmJCCe9HpgMSFb1LvqwcnZPOhKk,1476
36
+ structured_eval/metrics/composite_score.py,sha256=okiZNzeKltBktavcIEFcP_kOCrvhJ41o_-jK-GM3RBQ,1788
37
+ structured_eval/metrics/coverage_leaf_score.py,sha256=5SjCIlA9XfrbvjELNRmuuCZJdtEyJlcgXDkIKGgsOxk,967
38
+ structured_eval/metrics/date_distance_score.py,sha256=In9VdFjFmazCHPK7GBDAfl8kIu1wh1POh0l28SkIZ7I,1960
39
+ structured_eval/metrics/exact.py,sha256=etBVsT9aJEqwiuorBaxOo75imMjwidk9CrPjZVaMJVQ,689
40
+ structured_eval/metrics/exponential_numeric_score.py,sha256=nfSVGCx6wlO8bJfsKWCLVAFWsrYgLuWu5nfoiV4AEAM,1608
41
+ structured_eval/metrics/field_faithfulness.py,sha256=0C8ye79C9o0esxp7SE46H6TN6-QI-GDXHByqnFfooBE,1519
42
+ structured_eval/metrics/fuzzy.py,sha256=ytjjkvYLx3dPT32tP9ivT8s_Se-QX5ol1lQfbddU88s,2181
43
+ structured_eval/metrics/invoker.py,sha256=P3RrJMXVLvBXpUihov-axAmehiexUuTFEyWttept7VM,3618
44
+ structured_eval/metrics/levenshtein.py,sha256=84zqiSmcRwzuS6ww0WyHFTsQ3CPDqy5MgwCkvJnqNwE,540
45
+ structured_eval/metrics/mean_score.py,sha256=Vjh9SHVVq1XyC_iLvvv60sjbFikd9tMC83mRsP-Aoj4,1283
46
+ structured_eval/metrics/numeric.py,sha256=Uw7d7r6ATzZKk__iy3vKKne_HoQV-U8VKFOSKMRrbf8,3094
47
+ structured_eval/metrics/numeric_closeness.py,sha256=1xcaRW8zBWcyiuuc2Y6XF1OpoErGISYmSeDXH-p_KWw,1423
48
+ structured_eval/metrics/object_accuracy.py,sha256=ZNFoRUKt4j7jbePbWswvuebdOeyzAVKI-gw0avToRxM,1849
49
+ structured_eval/metrics/object_exact_match.py,sha256=oGH3P2ciSoM4nAh_-41IL_WSA5TVARAQfu6QlRKgPeI,1526
50
+ structured_eval/metrics/object_f1.py,sha256=d-sNwb3ZPpB3dFN5zlw7Ukd89ogyYv7w_9VUfiRWsEQ,1645
51
+ structured_eval/metrics/object_precision.py,sha256=tNhfGE8bHVjSor9dmvnYE9OMEVNdm95LpWWMhznk9tI,1884
52
+ structured_eval/metrics/object_prf1.py,sha256=LThknlAb58QhP_GagYPE7N5C6ld7iAPedZ25XvOO3EM,1742
53
+ structured_eval/metrics/object_recall.py,sha256=vI2UV0FZyx4EEgWYeIzGlgcWeUTPbNexCuJEWyeryiY,1518
54
+ structured_eval/metrics/object_type_validity.py,sha256=apGdukmyFFaBP8XHumaWCVPJGSYxqAR4lxBAEZmLnDs,1186
55
+ structured_eval/metrics/overall_leaf_score.py,sha256=rQDg9RRLefjt_iyzE6mGKINMre--aDL7GPdHJxOsnsU,1148
56
+ structured_eval/metrics/presence.py,sha256=D9PChEw3Cy_Fm9BBj-gUjill4ddtmQrjg5emalz7HxY,678
57
+ structured_eval/metrics/regex_match.py,sha256=-Kp4uoffrz_fLWj8f1U9Fq0Qg94gxKCKNNubgX-gr-s,1806
58
+ structured_eval/metrics/structural_similarity.py,sha256=N6MY-decMEK3MHPSOd_BLTf0ZS-JKwWh08boOuSt1rc,1466
59
+ structured_eval/metrics/token_f1.py,sha256=E2lE5398JbaQdZ3RwC54Liyyw_A1Lz5hTlI2dq3Mw6U,1466
60
+ structured_eval/metrics/type_match.py,sha256=n1KBgLYVaDvfSmapvoAzLR7vlpYH2ttgnvtMWowOjFg,996
61
+ structured_eval/metrics/rule_pass_rate/__init__.py,sha256=oFt5_NOdjFtmOrMPE2lemrTduxlO2RPRF6uvUi3wWhM,256
62
+ structured_eval/metrics/rule_pass_rate/dsl.py,sha256=sPKdRgr7cmGC5DPf7fWtcqEVMXVWVWxX3vguOfebEEg,7788
63
+ structured_eval/metrics/rule_pass_rate/engine.py,sha256=m02eH7DL6HD4KyASLlkGzmOnvVkTqI4lEzLfOTx6Cjg,820
64
+ structured_eval/metrics/rule_pass_rate/metric.py,sha256=Cr67SY2kF5UbfEmBZeTeuelfSYhKtK33FsZrjVJ5e-A,1211
65
+ structured_eval/metrics/schema_validity/__init__.py,sha256=9OBRWGFPL_Z_0NmQLyJs0E358Y4hS4qsMiiUMzNwLd0,244
66
+ structured_eval/metrics/schema_validity/metric.py,sha256=-PCg4-0clke0UVVQa1nH2QKK36nuGnMEBfSblq05tTI,1205
67
+ structured_eval/metrics/schema_validity/validator.py,sha256=aBoxB7Txq4Quf2n3Sjkwj4Q6wSQ4Q1mEhcIQIn4nhOA,4507
68
+ structured_eval/metrics/utils/__init__.py,sha256=JggEJ034wTR1QIBOsT6M0gQyKjiRm59ncWddOBJFb14,479
69
+ structured_eval/metrics/utils/array.py,sha256=c9YS66yHZGQrWVTCGVNR5SVkYF_M3GBwvef0IU_O-sQ,1181
70
+ structured_eval/metrics/utils/calculate.py,sha256=Cwwxd-d5-so3S4z1lvvpv2eLLGMrMkuCSZwJ0Ogo_Hs,2799
71
+ structured_eval/metrics/utils/number.py,sha256=JQ2x19GtZGIXQm5uJ1KL8Z87Fo6l-8ZNmUIlo9PGkcg,1638
72
+ structured_eval/metrics/utils/object_utils.py,sha256=8N0Z9Fy02r_D8I2nTBeeeRbca2CWnsvoB89a2-gHFgc,3533
73
+ structured_eval/models/__init__.py,sha256=Cb123VhFlVG7EOfdc32F4zRpkrdMsENQn-0axb8cBsU,1903
74
+ structured_eval/models/config.py,sha256=te4clYdn1CR5RQ_AYgvQdCQQ4u_w2qeUli0poW3kPg0,5069
75
+ structured_eval/models/context.py,sha256=FxnsdA3DyK5pmFnA6ubj5bmO9qHPLnnXy0TIaFMM7_Y,684
76
+ structured_eval/models/metric_result.py,sha256=n0w-AN5Fd2lXUUi1GJaao6jczp7C1X1ocZhjZfWpLWE,4351
77
+ structured_eval/models/result.py,sha256=uKJ_M3_Y_T4gKGincCrLuhax0JabOJS8pF_ftr8CwJM,14517
78
+ structured_eval/models/sample.py,sha256=0KfX6a78NyYNkkh9BTWhyFyweKjm5MnRng2B3qF6cOQ,593
79
+ structured_eval/models/nodes/__init__.py,sha256=mEKvtT5qWypdaOZiW3rV1JDeQ0ERgQZJbwql-0gvdGY,392
80
+ structured_eval/models/nodes/array_node.py,sha256=259OXuamXZecfkce-HgLJ3zi9xFI3LioWGggP_L5RXg,1207
81
+ structured_eval/models/nodes/base.py,sha256=P4dHPdvZa9KgeA7Oafhha6dSVJpRuF2cXhujWdYyNws,4756
82
+ structured_eval/models/nodes/object_node.py,sha256=DAvrc7OedJkmu7RUpHUoKpyyJwmJ6YAcKkpxBQ-Ama4,626
83
+ structured_eval/models/nodes/scalar.py,sha256=nE1KTH88gmqFBYoBam43qeT4HnpXeFWGEVCXVP3ADwc,559
84
+ structured_eval/reporting/__init__.py,sha256=NCR1g2TlWvTChC-gjZMR0g3MkyDlw_anHBwWAJp6iCU,151
85
+ structured_eval/reporting/console.py,sha256=ee4CM_yjrHR4ymCZYpNUGT7xcYDX5cf37geMNctAulU,7576
86
+ structured_eval/utils/__init__.py,sha256=2azRMRqcHctvvHPQIhwL3IT7hsjefq5uCVgSBvDRJyU,318
87
+ structured_eval/utils/flatten.py,sha256=1aSlCxCh-tl3BomYldQC3pPGcDCUIpq9-F-1Lfw8_LA,2550
88
+ structured_eval/utils/paths.py,sha256=WHBh6nZxvTmZhvlLjDOHP5dGnl6y0rGlis63jC1IeE8,1930
89
+ structured_eval/utils/structured_diff.py,sha256=hDz749pKcklMel4tNEexZKi3zqWU4kpXGasQsva0Kj0,5078
90
+ structured_eval-0.1.0.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
91
+ structured_eval-0.1.0.dist-info/METADATA,sha256=l480hpihBw_NBj7XW9v837QFGGZFC4xXIMbeG7tS-6k,13434
92
+ structured_eval-0.1.0.dist-info/WHEEL,sha256=K260EYznzXsJYBQGqmI8VTxEdiZYNvDZwW9cBh9-_MA,91
93
+ structured_eval-0.1.0.dist-info/top_level.txt,sha256=lXeN2DTo1LTRN5ujZgaxlssE0DDw0NdQqLI0VfhON2A,16
94
+ structured_eval-0.1.0.dist-info/RECORD,,
@@ -0,0 +1,5 @@
1
+ Wheel-Version: 1.0
2
+ Generator: setuptools (83.0.0)
3
+ Root-Is-Purelib: true
4
+ Tag: py3-none-any
5
+