extract-bench 0.1.0.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (46)
  1. extract_bench-0.1.0/.env.example +46 -0
  2. extract_bench-0.1.0/.gitignore +62 -0
  3. extract_bench-0.1.0/LICENSE +21 -0
  4. extract_bench-0.1.0/PKG-INFO +342 -0
  5. extract_bench-0.1.0/README.md +323 -0
  6. extract_bench-0.1.0/pyproject.toml +36 -0
  7. extract_bench-0.1.0/src/extract_bench/__init__.py +42 -0
  8. extract_bench-0.1.0/src/extract_bench/evaluation/__init__.py +42 -0
  9. extract_bench-0.1.0/src/extract_bench/evaluation/evaluation_config.py +33 -0
  10. extract_bench-0.1.0/src/extract_bench/evaluation/metric_id_collector.py +41 -0
  11. extract_bench-0.1.0/src/extract_bench/evaluation/metric_registry.py +120 -0
  12. extract_bench-0.1.0/src/extract_bench/evaluation/metrics/__init__.py +43 -0
  13. extract_bench-0.1.0/src/extract_bench/evaluation/metrics/array_metrics.py +116 -0
  14. extract_bench-0.1.0/src/extract_bench/evaluation/metrics/base_metric.py +40 -0
  15. extract_bench-0.1.0/src/extract_bench/evaluation/metrics/boolean_metrics.py +56 -0
  16. extract_bench-0.1.0/src/extract_bench/evaluation/metrics/llm_metrics.py +231 -0
  17. extract_bench-0.1.0/src/extract_bench/evaluation/metrics/metric_descriptors.py +36 -0
  18. extract_bench-0.1.0/src/extract_bench/evaluation/metrics/metric_prompts/__init__.py +16 -0
  19. extract_bench-0.1.0/src/extract_bench/evaluation/metrics/metric_prompts/array_llm.txt +35 -0
  20. extract_bench-0.1.0/src/extract_bench/evaluation/metrics/metric_prompts/llm_judge.txt +15 -0
  21. extract_bench-0.1.0/src/extract_bench/evaluation/metrics/metric_prompts/string_semantic.txt +17 -0
  22. extract_bench-0.1.0/src/extract_bench/evaluation/metrics/metric_utils.py +123 -0
  23. extract_bench-0.1.0/src/extract_bench/evaluation/metrics/number_metrics.py +148 -0
  24. extract_bench-0.1.0/src/extract_bench/evaluation/metrics/policy_metric.py +44 -0
  25. extract_bench-0.1.0/src/extract_bench/evaluation/metrics/string_metrics.py +195 -0
  26. extract_bench-0.1.0/src/extract_bench/evaluation/presets.py +109 -0
  27. extract_bench-0.1.0/src/extract_bench/evaluation/reporting/README.md +191 -0
  28. extract_bench-0.1.0/src/extract_bench/evaluation/reporting/__init__.py +47 -0
  29. extract_bench-0.1.0/src/extract_bench/evaluation/reporting/content_stats.py +160 -0
  30. extract_bench-0.1.0/src/extract_bench/evaluation/reporting/formatters.py +195 -0
  31. extract_bench-0.1.0/src/extract_bench/evaluation/reporting/models.py +181 -0
  32. extract_bench-0.1.0/src/extract_bench/evaluation/reporting/outcome_stats.py +290 -0
  33. extract_bench-0.1.0/src/extract_bench/evaluation/reporting/report_builder.py +169 -0
  34. extract_bench-0.1.0/src/extract_bench/evaluation/reporting/schema_stats.py +104 -0
  35. extract_bench-0.1.0/src/extract_bench/evaluation/schema_config_helpers.py +107 -0
  36. extract_bench-0.1.0/src/extract_bench/evaluation/schema_value_instantiator.py +213 -0
  37. extract_bench-0.1.0/src/extract_bench/evaluation/structured_evaluator.py +226 -0
  38. extract_bench-0.1.0/src/extract_bench/infra/__init__.py +60 -0
  39. extract_bench-0.1.0/src/extract_bench/infra/asyncio_utils.py +53 -0
  40. extract_bench-0.1.0/src/extract_bench/infra/construct_ast.py +110 -0
  41. extract_bench-0.1.0/src/extract_bench/infra/nodes.py +384 -0
  42. extract_bench-0.1.0/src/extract_bench/infra/ref_expander.py +43 -0
  43. extract_bench-0.1.0/src/extract_bench/infra/schema_instance_visitor.py +125 -0
  44. extract_bench-0.1.0/src/extract_bench/infra/visitors.py +452 -0
  45. extract_bench-0.1.0/tests/test_evaluation.py +386 -0
  46. extract_bench-0.1.0/tests/test_real_llm.py +246 -0
@@ -0,0 +1,46 @@
# Structured Extraction Evaluation Suite - Environment Configuration
#
# Copy this file to .env and fill in your credentials.
# The .env file should NOT be committed to version control.
#
# LLM-based metrics use LiteLLM, so provider-specific environment variables
# are detected automatically. Configure only one provider unless needed.

# ============================================================================
# LLM Provider Configuration (choose one)
# ============================================================================

# Option 1: Vertex AI (Google Cloud) - Recommended
# Requires: gcloud CLI authenticated, or service account JSON.
# If using gcloud, run: gcloud auth application-default login
# GOOGLE_APPLICATION_CREDENTIALS=/path/to/service-account.json
# VERTEX_AI_PROJECT=your-gcp-project-id
# VERTEX_AI_LOCATION=us-central1

# Option 2: OpenAI
# OPENAI_API_KEY=sk-...

# Option 3: Anthropic
# ANTHROPIC_API_KEY=sk-ant-...

# Option 4: Azure OpenAI
# AZURE_API_KEY=...
# AZURE_API_BASE=https://your-resource.openai.azure.com/
# AZURE_API_VERSION=2024-02-15-preview

# ============================================================================
# Default Model Override (optional)
# ============================================================================
# Model IDs follow LiteLLM naming, e.g.:
#   - openai/gpt-4o-mini
#   - anthropic/claude-3-5-sonnet-20241022
#   - vertex_ai/gemini-2.5-flash
#
# If unset, the default is vertex_ai/gemini-2.5-flash.
# You can override this globally by setting:
# DEFAULT_LLM_MODEL=openai/gpt-4o-mini

# ============================================================================
# Logging Configuration (optional)
# ============================================================================
# LOG_LEVEL=INFO
@@ -0,0 +1,62 @@
.DS_Store
sync-drive.sh

# LaTeX compiler output
*.aux
*.bbl
*.blg
*.log
*.out
*.fls
*.fdb_latexmk
*.synctex.gz
*.toc
*.lof
*.lot
*.nav
*.snm
*.vrb
comment.cut

# Python
__pycache__/
*.py[cod]
*$py.class
*.so
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
*.egg-info/
.installed.cfg
*.egg

# Virtual environments
.env
.venv
env/
venv/
ENV/

# IDE
.idea/
.vscode/
*.swp
*.swo

# Test
.pytest_cache/
.coverage
htmlcov/

# Logs
*.log
@@ -0,0 +1,21 @@
MIT License

Copyright (c) 2025 Contextual AI

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
@@ -0,0 +1,342 @@
Metadata-Version: 2.4
Name: extract-bench
Version: 0.1.0
Summary: Evaluate structured extraction quality with per-field metrics
Author: Contextual AI
License-Expression: MIT
License-File: LICENSE
Requires-Python: >=3.11
Requires-Dist: jsonschema>=4.0
Requires-Dist: litellm>=1.0
Requires-Dist: loguru>=0.7
Requires-Dist: pydantic>=2.0
Provides-Extra: dev
Requires-Dist: google-cloud-aiplatform==1.135.0; extra == 'dev'
Requires-Dist: pytest-asyncio>=0.21; extra == 'dev'
Requires-Dist: pytest>=7.0; extra == 'dev'
Requires-Dist: python-dotenv>=1.0; extra == 'dev'
Description-Content-Type: text/markdown

# Extract Bench Evaluation Suite

A standalone Python package for evaluating structured extraction quality by comparing predicted JSON against gold JSON with per-field metrics.

## Installation

```bash
pip install -e .

# With dev dependencies
pip install -e ".[dev]"
```

## Quick Start

```python
import json
from pathlib import Path
from extract_bench import ReportBuilder, ReportConfig

# Load your data
schema = json.load(open("schema.json"))
gold = json.load(open("gold.json"))
extracted = json.load(open("model_output.json"))

# Configure and build report
config = ReportConfig(
    output_dir=Path("./eval_results"),
    output_name="nvidia-10k-extract-gemini-flash",  # Identifies this experiment
)
builder = ReportBuilder(config)
report = builder.build(schema, gold, extracted)

# Save all outputs
output_path = builder.save(report)
print(f"Results saved to: {output_path}")
```

This creates `eval_results/nvidia-10k-extract-gemini-flash/` containing:

| File          | Purpose                                                   |
| ------------- | --------------------------------------------------------- |
| `report.json` | Machine-readable full report (for programmatic analysis)  |
| `summary.txt` | Human-readable one-page summary (for quick inspection)    |
| `fields.csv`  | Per-field outcomes (for CSV analysis)                     |
| `fields.md`   | Markdown table (for documentation/sharing)                |

### Key Metrics in the Report

```python
# After building the report
print(f"Overall pass rate: {report.overall_pass_rate:.1%}")
print(f"Overall score: {report.overall_score:.3f}")
print(f"Fields evaluated: {report.outcomes.total_evaluated}")
print(f"Passed: {report.outcomes.total_passed}")
print(f"Failed: {report.outcomes.total_failed}")
```

## Batch Evaluation Example

For running many experiments:

```python
import asyncio
import json
from pathlib import Path
from extract_bench import ReportBuilder, ReportConfig

async def evaluate_model_outputs(
    schema_path: Path,
    gold_path: Path,
    outputs_dir: Path,
    results_dir: Path,
):
    """Evaluate all model outputs in a directory."""
    schema = json.load(schema_path.open())
    gold = json.load(gold_path.open())

    results = []
    for output_file in outputs_dir.glob("*.json"):
        extracted = json.load(output_file.open())

        config = ReportConfig(
            output_dir=results_dir,
            output_name=output_file.stem,  # Use filename as experiment ID
        )
        builder = ReportBuilder(config)
        report = await builder.build_async(schema, gold, extracted)
        builder.save(report)

        results.append({
            "model": output_file.stem,
            "pass_rate": report.overall_pass_rate,
            "score": report.overall_score,
        })

    return results

# Run batch evaluation
results = asyncio.run(evaluate_model_outputs(
    schema_path=Path("schema.json"),
    gold_path=Path("gold.json"),
    outputs_dir=Path("./model_outputs"),
    results_dir=Path("./eval_results"),
))

# Print comparison
for r in sorted(results, key=lambda x: -x["score"]):
    print(f"{r['model']}: {r['pass_rate']:.1%} pass, {r['score']:.3f} avg score")
```

## ReportConfig Options

```python
config = ReportConfig(
    output_dir=Path("./outputs"),    # Where to save reports
    output_name="my-experiment",     # Subdirectory name (auto-generated if None)
    max_reasoning_length=200,        # Truncate LLM reasoning in outputs
    top_n_lowest_fields=5,           # Track N lowest-scoring fields
    save_json=True,                  # Generate report.json
    save_text=True,                  # Generate summary.txt
    save_csv=True,                   # Generate fields.csv
    save_markdown=True,              # Generate fields.md
)
```

## Understanding Report Output

### summary.txt Structure

```
================================================================================
EVALUATION REPORT: my-experiment
================================================================================

OVERALL RESULTS
---------------
Pass Rate: 85.2% (23/27 fields)
Average Score: 0.891

SCHEMA SHAPE
------------
Total nodes: 45
By type: object=12, string=18, number=8, array=5, boolean=2

COVERAGE
--------
Present in both: 25
Missing in extracted: 2
Spurious in extracted: 0

PASS/FAIL BY METRIC
-------------------
string_semantic: 15/18 passed (83.3%)
number_tolerance: 6/6 passed (100.0%)
integer_exact: 2/3 passed (66.7%)

LOWEST SCORING FIELDS
---------------------
1. borrower.address (0.45) - Partial match, missing suite number
2. terms.rate_type (0.60) - Semantic mismatch
...
```

### fields.csv Columns

| Column            | Description                                 |
| ----------------- | ------------------------------------------- |
| `path`            | Full JSONPath to the field                  |
| `normalized_path` | Human-readable path (e.g., `borrower.name`) |
| `metric_id`       | Metric used for evaluation                  |
| `score`           | Numeric score (0.0-1.0)                     |
| `passed`          | Boolean pass/fail                           |
| `gold_value`      | Expected value                              |
| `extracted_value` | Model's output value                        |
| `reasoning`       | LLM reasoning (for semantic metrics)        |
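
For quick triage outside the package, `fields.csv` can be read with the standard library. A minimal sketch, assuming the column names above and that `passed` is serialized as text (e.g. `True`/`False`) and `score` as a number:

```python
import csv
from pathlib import Path

# Hypothetical location: <output_dir>/<output_name>/fields.csv from an earlier run.
csv_path = Path("eval_results/my-experiment/fields.csv")

with csv_path.open(newline="") as f:
    rows = list(csv.DictReader(f))

# Collect fields that did not pass and sort them by score, lowest first.
failed = [r for r in rows if r["passed"].strip().lower() not in {"true", "1"}]
for r in sorted(failed, key=lambda r: float(r["score"])):
    print(f"{r['normalized_path']} [{r['metric_id']}]: {r['score']} - {r['reasoning'][:80]}")
```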

## Low-Level API

For direct access to evaluation results without reporting:

```python
from extract_bench import StructuredEvaluator, StructuredEvaluatorConfig

evaluator = StructuredEvaluator(StructuredEvaluatorConfig(metrics=[]))
result = evaluator.evaluate(schema, gold, predicted)

# Raw results dict: path -> metric_id -> MetricResult
for path, metrics in result["results"].items():
    for metric_id, metric_result in metrics.items():
        print(f"{path} [{metric_id}]: passed={metric_result.passed}, score={metric_result.score}")
```

Use `evaluate_async()` for better performance with LLM-based metrics.
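
A minimal async sketch, assuming `evaluate_async()` accepts the same `(schema, gold, predicted)` arguments and returns the same results structure as `evaluate()`:

```python
import asyncio
import json

from extract_bench import StructuredEvaluator, StructuredEvaluatorConfig

async def main() -> None:
    schema = json.load(open("schema.json"))
    gold = json.load(open("gold.json"))
    predicted = json.load(open("model_output.json"))

    evaluator = StructuredEvaluator(StructuredEvaluatorConfig(metrics=[]))
    # Assumed to mirror evaluate(); awaiting it lets LLM-backed metrics run
    # concurrently instead of one call at a time.
    result = await evaluator.evaluate_async(schema, gold, predicted)

    for path, metrics in result["results"].items():
        for metric_id, metric_result in metrics.items():
            print(f"{path} [{metric_id}]: passed={metric_result.passed}")

asyncio.run(main())
```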

## Configuration

### Environment Setup

LLM-based metrics use LiteLLM. Configure your provider:

```bash
# Vertex AI (Google Cloud)
gcloud auth application-default login

# OpenAI
export OPENAI_API_KEY=sk-...

# Or copy .env.example to .env
```

### LLM Model Configuration

Default model: `vertex_ai/gemini-2.5-flash` (or set `DEFAULT_LLM_MODEL` in `.env`).

Override per-field in schema:

```python
schema = {
    "type": "object",
    "properties": {
        "company": {
            "type": "string",
            "evaluation_config": {
                "metrics": [{"metric_id": "string_semantic", "params": {"model": "openai/gpt-4o-mini"}}]
            },
        }
    },
}
```

### Evaluation Presets

Specify `evaluation_config` in schema fields to control metrics:

| Preset                    | Description                                           |
| ------------------------- | ----------------------------------------------------- |
| `string_exact`            | Case-sensitive exact match                            |
| `string_fuzzy`            | Levenshtein similarity (case-insensitive by default)  |
| `string_case_insensitive` | Case-insensitive match                                |
| `string_semantic`         | LLM-based semantic similarity (default for strings)   |
| `number_exact`            | Exact numeric equality                                |
| `number_tolerance`        | Match within tolerance (default for numbers)          |
| `integer_exact`           | Exact integer equality (default for integers)         |
| `boolean_exact`           | Exact boolean equality (default for booleans)         |
| `array_llm`               | LLM evaluation of entire array (default for arrays)   |
| `skip`                    | Skip evaluation for this node                         |

### Custom Metric Configuration

```python
schema = {
    "type": "object",
    "properties": {
        "price": {
            "type": "number",
            "evaluation_config": {
                "metrics": [{"metric_id": "number_tolerance", "params": {"tolerance": 0.01}}]
            }
        },
        "description": {
            "type": "string",
            "evaluation_config": "string_fuzzy"  # Use preset
        }
    }
}
```

## Available Metrics

| Category | Metric                    | Description                             |
| -------- | ------------------------- | --------------------------------------- |
| String   | `string_exact`            | Case-sensitive exact match              |
|          | `string_case_insensitive` | Case-insensitive match                  |
|          | `string_fuzzy`            | Levenshtein similarity                  |
|          | `string_semantic`         | LLM-based semantic comparison (default) |
| Number   | `number_exact`            | Exact numeric equality                  |
|          | `number_tolerance`        | Match within tolerance (default)        |
|          | `integer_exact`           | Exact integer equality                  |
| Boolean  | `boolean_exact`           | Exact boolean equality                  |
| Array    | `array_llm`               | LLM-based array comparison              |
| General  | `string_llm`              | LLM judge for any comparison            |

## Custom Metrics

```python
from extract_bench import global_metric_registry
from extract_bench.evaluation.metrics import BaseMetric, MetricResult

class MyCustomMetric(BaseMetric):
    metric_id = "my_custom"

    async def evaluate(self, node, config=None):
        gold = node.get_gold_value()
        extracted = node.get_extracted_value()
        return MetricResult(
            metric_id=self.metric_id,
            score=1.0,
            passed=True,
            details={"custom": "data"}
        )

global_metric_registry.register_metric(MyCustomMetric)
```
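
Once registered, the custom `metric_id` should be selectable from a field's `evaluation_config` in the same way as the built-in metric IDs above; the snippet below is a sketch under that assumption.

```python
# Sketch only: assumes a registered custom metric ID is resolved through the
# same evaluation_config mechanism as the built-in metrics.
schema = {
    "type": "object",
    "properties": {
        "ticker": {
            "type": "string",
            "evaluation_config": {"metrics": [{"metric_id": "my_custom"}]},
        }
    },
}
```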

## Architecture

```
extract_bench/
├── infra/            # Schema AST (nodes, visitors)
├── evaluation/
│   ├── metrics/      # Metric implementations
│   └── reporting/    # Report generation (see reporting/README.md)
```

Schema → AST → Values instantiated → Metrics evaluated async in parallel → Report generated.

## Development

```bash
pip install -e ".[dev]"
pytest tests/ -v
```