extract-bench 0.1.0.tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- extract_bench-0.1.0/.env.example +46 -0
- extract_bench-0.1.0/.gitignore +62 -0
- extract_bench-0.1.0/LICENSE +21 -0
- extract_bench-0.1.0/PKG-INFO +342 -0
- extract_bench-0.1.0/README.md +323 -0
- extract_bench-0.1.0/pyproject.toml +36 -0
- extract_bench-0.1.0/src/extract_bench/__init__.py +42 -0
- extract_bench-0.1.0/src/extract_bench/evaluation/__init__.py +42 -0
- extract_bench-0.1.0/src/extract_bench/evaluation/evaluation_config.py +33 -0
- extract_bench-0.1.0/src/extract_bench/evaluation/metric_id_collector.py +41 -0
- extract_bench-0.1.0/src/extract_bench/evaluation/metric_registry.py +120 -0
- extract_bench-0.1.0/src/extract_bench/evaluation/metrics/__init__.py +43 -0
- extract_bench-0.1.0/src/extract_bench/evaluation/metrics/array_metrics.py +116 -0
- extract_bench-0.1.0/src/extract_bench/evaluation/metrics/base_metric.py +40 -0
- extract_bench-0.1.0/src/extract_bench/evaluation/metrics/boolean_metrics.py +56 -0
- extract_bench-0.1.0/src/extract_bench/evaluation/metrics/llm_metrics.py +231 -0
- extract_bench-0.1.0/src/extract_bench/evaluation/metrics/metric_descriptors.py +36 -0
- extract_bench-0.1.0/src/extract_bench/evaluation/metrics/metric_prompts/__init__.py +16 -0
- extract_bench-0.1.0/src/extract_bench/evaluation/metrics/metric_prompts/array_llm.txt +35 -0
- extract_bench-0.1.0/src/extract_bench/evaluation/metrics/metric_prompts/llm_judge.txt +15 -0
- extract_bench-0.1.0/src/extract_bench/evaluation/metrics/metric_prompts/string_semantic.txt +17 -0
- extract_bench-0.1.0/src/extract_bench/evaluation/metrics/metric_utils.py +123 -0
- extract_bench-0.1.0/src/extract_bench/evaluation/metrics/number_metrics.py +148 -0
- extract_bench-0.1.0/src/extract_bench/evaluation/metrics/policy_metric.py +44 -0
- extract_bench-0.1.0/src/extract_bench/evaluation/metrics/string_metrics.py +195 -0
- extract_bench-0.1.0/src/extract_bench/evaluation/presets.py +109 -0
- extract_bench-0.1.0/src/extract_bench/evaluation/reporting/README.md +191 -0
- extract_bench-0.1.0/src/extract_bench/evaluation/reporting/__init__.py +47 -0
- extract_bench-0.1.0/src/extract_bench/evaluation/reporting/content_stats.py +160 -0
- extract_bench-0.1.0/src/extract_bench/evaluation/reporting/formatters.py +195 -0
- extract_bench-0.1.0/src/extract_bench/evaluation/reporting/models.py +181 -0
- extract_bench-0.1.0/src/extract_bench/evaluation/reporting/outcome_stats.py +290 -0
- extract_bench-0.1.0/src/extract_bench/evaluation/reporting/report_builder.py +169 -0
- extract_bench-0.1.0/src/extract_bench/evaluation/reporting/schema_stats.py +104 -0
- extract_bench-0.1.0/src/extract_bench/evaluation/schema_config_helpers.py +107 -0
- extract_bench-0.1.0/src/extract_bench/evaluation/schema_value_instantiator.py +213 -0
- extract_bench-0.1.0/src/extract_bench/evaluation/structured_evaluator.py +226 -0
- extract_bench-0.1.0/src/extract_bench/infra/__init__.py +60 -0
- extract_bench-0.1.0/src/extract_bench/infra/asyncio_utils.py +53 -0
- extract_bench-0.1.0/src/extract_bench/infra/construct_ast.py +110 -0
- extract_bench-0.1.0/src/extract_bench/infra/nodes.py +384 -0
- extract_bench-0.1.0/src/extract_bench/infra/ref_expander.py +43 -0
- extract_bench-0.1.0/src/extract_bench/infra/schema_instance_visitor.py +125 -0
- extract_bench-0.1.0/src/extract_bench/infra/visitors.py +452 -0
- extract_bench-0.1.0/tests/test_evaluation.py +386 -0
- extract_bench-0.1.0/tests/test_real_llm.py +246 -0

extract_bench-0.1.0/.env.example
@@ -0,0 +1,46 @@
# Structured Extraction Evaluation Suite - Environment Configuration
#
# Copy this file to .env and fill in your credentials.
# The .env file should NOT be committed to version control.
#
# LLM-based metrics use LiteLLM, so provider-specific environment variables
# are detected automatically. Configure only one provider unless needed.

# ============================================================================
# LLM Provider Configuration (choose one)
# ============================================================================

# Option 1: Vertex AI (Google Cloud) - Recommended
# Requires: gcloud CLI authenticated, or service account JSON.
# If using gcloud, run: gcloud auth application-default login
# GOOGLE_APPLICATION_CREDENTIALS=/path/to/service-account.json
# VERTEX_AI_PROJECT=your-gcp-project-id
# VERTEX_AI_LOCATION=us-central1

# Option 2: OpenAI
# OPENAI_API_KEY=sk-...

# Option 3: Anthropic
# ANTHROPIC_API_KEY=sk-ant-...

# Option 4: Azure OpenAI
# AZURE_API_KEY=...
# AZURE_API_BASE=https://your-resource.openai.azure.com/
# AZURE_API_VERSION=2024-02-15-preview

# ============================================================================
# Default Model Override (optional)
# ============================================================================
# Model IDs follow LiteLLM naming, e.g.:
# - openai/gpt-4o-mini
# - anthropic/claude-3-5-sonnet-20241022
# - vertex_ai/gemini-2.5-flash
#
# If unset, the default is vertex_ai/gemini-2.5-flash.
# You can override this globally by setting:
# DEFAULT_LLM_MODEL=openai/gpt-4o-mini

# ============================================================================
# Logging Configuration (optional)
# ============================================================================
# LOG_LEVEL=INFO

extract_bench-0.1.0/.gitignore
@@ -0,0 +1,62 @@
.DS_Store
sync-drive.sh

# LaTeX compiler output
*.aux
*.bbl
*.blg
*.log
*.out
*.fls
*.fdb_latexmk
*.synctex.gz
*.toc
*.lof
*.lot
*.nav
*.snm
*.vrb
comment.cut

# Python
__pycache__/
*.py[cod]
*$py.class
*.so
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
*.egg-info/
.installed.cfg
*.egg

# Virtual environments
.env
.venv
env/
venv/
ENV/

# IDE
.idea/
.vscode/
*.swp
*.swo

# Test
.pytest_cache/
.coverage
htmlcov/

# Logs
*.log

extract_bench-0.1.0/LICENSE
@@ -0,0 +1,21 @@
MIT License

Copyright (c) 2025 Contextual AI

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

extract_bench-0.1.0/PKG-INFO
@@ -0,0 +1,342 @@
Metadata-Version: 2.4
Name: extract-bench
Version: 0.1.0
Summary: Evaluate structured extraction quality with per-field metrics
Author: Contextual AI
License-Expression: MIT
License-File: LICENSE
Requires-Python: >=3.11
Requires-Dist: jsonschema>=4.0
Requires-Dist: litellm>=1.0
Requires-Dist: loguru>=0.7
Requires-Dist: pydantic>=2.0
Provides-Extra: dev
Requires-Dist: google-cloud-aiplatform==1.135.0; extra == 'dev'
Requires-Dist: pytest-asyncio>=0.21; extra == 'dev'
Requires-Dist: pytest>=7.0; extra == 'dev'
Requires-Dist: python-dotenv>=1.0; extra == 'dev'
Description-Content-Type: text/markdown

# Extract Bench Evaluation Suite

A standalone Python package for evaluating structured extraction quality by comparing predicted JSON against gold JSON with per-field metrics.

## Installation

```bash
pip install -e .

# With dev dependencies
pip install -e ".[dev]"
```

## Quick Start

```python
import json
from pathlib import Path
from extract_bench import ReportBuilder, ReportConfig

# Load your data
schema = json.load(open("schema.json"))
gold = json.load(open("gold.json"))
extracted = json.load(open("model_output.json"))

# Configure and build report
config = ReportConfig(
    output_dir=Path("./eval_results"),
    output_name="nvidia-10k-extract-gemini-flash",  # Identifies this experiment
)
builder = ReportBuilder(config)
report = builder.build(schema, gold, extracted)

# Save all outputs
output_path = builder.save(report)
print(f"Results saved to: {output_path}")
```

This creates `eval_results/nvidia-10k-extract-gemini-flash/` containing:

| File | Purpose |
| ------------- | -------------------------------------------------------- |
| `report.json` | Machine-readable full report (for programmatic analysis) |
| `summary.txt` | Human-readable one-page summary (for quick inspection) |
| `fields.csv` | Per-field outcomes (for csv analysis) |
| `fields.md` | Markdown table (for documentation/sharing) |

### Key Metrics in the Report

```python
# After building the report
print(f"Overall pass rate: {report.overall_pass_rate:.1%}")
print(f"Overall score: {report.overall_score:.3f}")
print(f"Fields evaluated: {report.outcomes.total_evaluated}")
print(f"Passed: {report.outcomes.total_passed}")
print(f"Failed: {report.outcomes.total_failed}")
```

## Batch Evaluation Example

For running many experiments:

```python
import asyncio
import json
from pathlib import Path
from extract_bench import ReportBuilder, ReportConfig

async def evaluate_model_outputs(
    schema_path: Path,
    gold_path: Path,
    outputs_dir: Path,
    results_dir: Path,
):
    """Evaluate all model outputs in a directory."""
    schema = json.load(schema_path.open())
    gold = json.load(gold_path.open())

    results = []
    for output_file in outputs_dir.glob("*.json"):
        extracted = json.load(output_file.open())

        config = ReportConfig(
            output_dir=results_dir,
            output_name=output_file.stem,  # Use filename as experiment ID
        )
        builder = ReportBuilder(config)
        report = await builder.build_async(schema, gold, extracted)
        builder.save(report)

        results.append({
            "model": output_file.stem,
            "pass_rate": report.overall_pass_rate,
            "score": report.overall_score,
        })

    return results

# Run batch evaluation
results = asyncio.run(evaluate_model_outputs(
    schema_path=Path("schema.json"),
    gold_path=Path("gold.json"),
    outputs_dir=Path("./model_outputs"),
    results_dir=Path("./eval_results"),
))

# Print comparison
for r in sorted(results, key=lambda x: -x["score"]):
    print(f"{r['model']}: {r['pass_rate']:.1%} pass, {r['score']:.3f} avg score")
```

## ReportConfig Options

```python
config = ReportConfig(
    output_dir=Path("./outputs"),  # Where to save reports
    output_name="my-experiment",   # Subdirectory name (auto-generated if None)
    max_reasoning_length=200,      # Truncate LLM reasoning in outputs
    top_n_lowest_fields=5,         # Track N lowest-scoring fields
    save_json=True,                # Generate report.json
    save_text=True,                # Generate summary.txt
    save_csv=True,                 # Generate fields.csv
    save_markdown=True,            # Generate fields.md
)
```

## Understanding Report Output

### summary.txt Structure

```
================================================================================
EVALUATION REPORT: my-experiment
================================================================================

OVERALL RESULTS
---------------
Pass Rate: 85.2% (23/27 fields)
Average Score: 0.891

SCHEMA SHAPE
------------
Total nodes: 45
By type: object=12, string=18, number=8, array=5, boolean=2

COVERAGE
--------
Present in both: 25
Missing in extracted: 2
Spurious in extracted: 0

PASS/FAIL BY METRIC
-------------------
string_semantic: 15/18 passed (83.3%)
number_tolerance: 6/6 passed (100.0%)
integer_exact: 2/3 passed (66.7%)

LOWEST SCORING FIELDS
---------------------
1. borrower.address (0.45) - Partial match, missing suite number
2. terms.rate_type (0.60) - Semantic mismatch
...
```

### fields.csv Columns

| Column | Description |
| ----------------- | ------------------------------------------- |
| `path` | Full JSONPath to the field |
| `normalized_path` | Human-readable path (e.g., `borrower.name`) |
| `metric_id` | Metric used for evaluation |
| `score` | Numeric score (0.0-1.0) |
| `passed` | Boolean pass/fail |
| `gold_value` | Expected value |
| `extracted_value` | Model's output value |
| `reasoning` | LLM reasoning (for semantic metrics) |

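For quick triage, `fields.csv` can be sliced with the standard library. The sketch below assumes the column names documented above and that `passed` serializes as the text `True`/`False` (not confirmed here); the output directory comes from the Quick Start example.

```python
import csv
from pathlib import Path

# Sketch: list failing fields from fields.csv, lowest score first.
# The exact serialization of `passed` and `score` is an assumption.
csv_path = Path("eval_results/nvidia-10k-extract-gemini-flash/fields.csv")
with csv_path.open(newline="") as f:
    rows = list(csv.DictReader(f))

failed = [r for r in rows if r["passed"].strip().lower() != "true"]
for r in sorted(failed, key=lambda r: float(r["score"])):
    print(f"{r['normalized_path']} [{r['metric_id']}]: score={r['score']}")
```
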
## Low-Level API

For direct access to evaluation results without reporting:

```python
from extract_bench import StructuredEvaluator, StructuredEvaluatorConfig

evaluator = StructuredEvaluator(StructuredEvaluatorConfig(metrics=[]))
result = evaluator.evaluate(schema, gold, predicted)

# Raw results dict: path -> metric_id -> MetricResult
for path, metrics in result["results"].items():
    for metric_id, metric_result in metrics.items():
        print(f"{path} [{metric_id}]: passed={metric_result.passed}, score={metric_result.score}")
```

Use `evaluate_async()` for better performance with LLM-based metrics.

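Continuing the snippet above, a minimal async sketch; it assumes `evaluate_async()` takes the same `(schema, gold, predicted)` arguments and returns the same results structure as `evaluate()`, which is not spelled out here.

```python
import asyncio

async def main():
    # Assumption: evaluate_async() mirrors evaluate()'s signature and result shape.
    result = await evaluator.evaluate_async(schema, gold, predicted)
    failures = [
        (path, metric_id)
        for path, metrics in result["results"].items()
        for metric_id, metric_result in metrics.items()
        if not metric_result.passed
    ]
    print(f"{len(failures)} field/metric checks failed")

asyncio.run(main())
```
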
## Configuration

### Environment Setup

LLM-based metrics use LiteLLM. Configure your provider:

```bash
# Vertex AI (Google Cloud)
gcloud auth application-default login

# OpenAI
export OPENAI_API_KEY=sk-...

# Or copy .env.example to .env
```

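If you keep credentials in a local `.env` copied from `.env.example`, one option is to load it explicitly before building reports; `python-dotenv` ships with the `dev` extra. This is a usage sketch only: whether the package loads `.env` on its own is not documented here, so loading it yourself is the safe route.

```python
from dotenv import load_dotenv

# Reads ./.env (e.g. OPENAI_API_KEY, DEFAULT_LLM_MODEL) into the process
# environment so LiteLLM can pick the variables up.
load_dotenv()
```
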
### LLM Model Configuration

Default model: `vertex_ai/gemini-2.5-flash` (or set `DEFAULT_LLM_MODEL` in `.env`).

Override per-field in schema:

```python
schema = {
    "type": "object",
    "properties": {
        "company": {
            "type": "string",
            "evaluation_config": {
                "metrics": [{"metric_id": "string_semantic", "params": {"model": "openai/gpt-4o-mini"}}]
            },
        }
    },
}
```

### Evaluation Presets

Specify `evaluation_config` in schema fields to control metrics:

| Preset | Description |
| ------------------------- | ---------------------------------------------------- |
| `string_exact` | Case-sensitive exact match |
| `string_fuzzy` | Levenshtein similarity (case-insensitive by default) |
| `string_case_insensitive` | Case-insensitive match |
| `string_semantic` | LLM-based semantic similarity (default for strings) |
| `number_exact` | Exact numeric equality |
| `number_tolerance` | Match within tolerance (default for numbers) |
| `integer_exact` | Exact integer equality (default for integers) |
| `boolean_exact` | Exact boolean equality (default for booleans) |
| `array_llm` | LLM evaluation of entire array (default for arrays) |
| `skip` | Skip evaluation for this node |

### Custom Metric Configuration

```python
schema = {
    "type": "object",
    "properties": {
        "price": {
            "type": "number",
            "evaluation_config": {
                "metrics": [{"metric_id": "number_tolerance", "params": {"tolerance": 0.01}}]
            }
        },
        "description": {
            "type": "string",
            "evaluation_config": "string_fuzzy"  # Use preset
        }
    }
}
```

## Available Metrics

| Category | Metric | Description |
| -------- | ------------------------- | --------------------------------------- |
| String | `string_exact` | Case-sensitive exact match |
| | `string_case_insensitive` | Case-insensitive match |
| | `string_fuzzy` | Levenshtein similarity |
| | `string_semantic` | LLM-based semantic comparison (default) |
| Number | `number_exact` | Exact numeric equality |
| | `number_tolerance` | Match within tolerance (default) |
| | `integer_exact` | Exact integer equality |
| Boolean | `boolean_exact` | Exact boolean equality |
| Array | `array_llm` | LLM-based array comparison |
| General | `string_llm` | LLM judge for any comparison |

## Custom Metrics

```python
from extract_bench import global_metric_registry
from extract_bench.evaluation.metrics import BaseMetric, MetricResult

class MyCustomMetric(BaseMetric):
    metric_id = "my_custom"

    async def evaluate(self, node, config=None):
        gold = node.get_gold_value()
        extracted = node.get_extracted_value()
        return MetricResult(
            metric_id=self.metric_id,
            score=1.0,
            passed=True,
            details={"custom": "data"}
        )

global_metric_registry.register_metric(MyCustomMetric)
```

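Once registered, the custom metric should be selectable from a schema by its `metric_id`, following the same `evaluation_config` shape used for built-in metrics. A sketch (the `status` field is just an illustrative name; it assumes custom ids resolve through the same registry lookup as built-ins):

```python
schema = {
    "type": "object",
    "properties": {
        "status": {
            "type": "string",
            # Assumes ids registered via register_metric() are resolvable here.
            "evaluation_config": {"metrics": [{"metric_id": "my_custom"}]},
        }
    },
}
```
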
## Architecture

```
extract_bench/
├── infra/             # Schema AST (nodes, visitors)
├── evaluation/
│   ├── metrics/       # Metric implementations
│   └── reporting/     # Report generation (see reporting/README.md)
```

Schema → AST → Values instantiated → Metrics evaluated async in parallel → Report generated.

## Development

```bash
pip install -e ".[dev]"
pytest tests/ -v
```