dataforge-07-evals 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- dataforge_07_evals-0.1.0/PKG-INFO +263 -0
- dataforge_07_evals-0.1.0/README.md +223 -0
- dataforge_07_evals-0.1.0/dataforge_07_evals.egg-info/PKG-INFO +263 -0
- dataforge_07_evals-0.1.0/dataforge_07_evals.egg-info/SOURCES.txt +32 -0
- dataforge_07_evals-0.1.0/dataforge_07_evals.egg-info/dependency_links.txt +1 -0
- dataforge_07_evals-0.1.0/dataforge_07_evals.egg-info/entry_points.txt +3 -0
- dataforge_07_evals-0.1.0/dataforge_07_evals.egg-info/requires.txt +25 -0
- dataforge_07_evals-0.1.0/dataforge_07_evals.egg-info/top_level.txt +1 -0
- dataforge_07_evals-0.1.0/dataforge_evals/__init__.py +37 -0
- dataforge_07_evals-0.1.0/dataforge_evals/agents/__init__.py +23 -0
- dataforge_07_evals-0.1.0/dataforge_evals/agents/base.py +208 -0
- dataforge_07_evals-0.1.0/dataforge_evals/agents/cerebras_llama.py +65 -0
- dataforge_07_evals-0.1.0/dataforge_evals/agents/gemini_flash.py +159 -0
- dataforge_07_evals-0.1.0/dataforge_evals/agents/groq_llama.py +66 -0
- dataforge_07_evals-0.1.0/dataforge_evals/agents/hf_local.py +209 -0
- dataforge_07_evals-0.1.0/dataforge_evals/agents/local_ollama.py +62 -0
- dataforge_07_evals-0.1.0/dataforge_evals/agents/mock.py +41 -0
- dataforge_07_evals-0.1.0/dataforge_evals/agents/openrouter.py +66 -0
- dataforge_07_evals-0.1.0/dataforge_evals/agents/provider_base.py +319 -0
- dataforge_07_evals-0.1.0/dataforge_evals/cli.py +326 -0
- dataforge_07_evals-0.1.0/dataforge_evals/grader.py +152 -0
- dataforge_07_evals-0.1.0/dataforge_evals/harness.py +406 -0
- dataforge_07_evals-0.1.0/dataforge_evals/py.typed +1 -0
- dataforge_07_evals-0.1.0/dataforge_evals/repair_contract.py +332 -0
- dataforge_07_evals-0.1.0/dataforge_evals/report.py +182 -0
- dataforge_07_evals-0.1.0/dataforge_evals/tasks.py +250 -0
- dataforge_07_evals-0.1.0/pyproject.toml +106 -0
- dataforge_07_evals-0.1.0/setup.cfg +4 -0
- dataforge_07_evals-0.1.0/tests/test_grader.py +225 -0
- dataforge_07_evals-0.1.0/tests/test_harness.py +339 -0
- dataforge_07_evals-0.1.0/tests/test_hf_local_agent.py +139 -0
- dataforge_07_evals-0.1.0/tests/test_package_naming.py +19 -0
- dataforge_07_evals-0.1.0/tests/test_provider_base.py +91 -0
- dataforge_07_evals-0.1.0/tests/test_release_truth.py +21 -0
|
@@ -0,0 +1,263 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: dataforge_07_evals
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Agent-agnostic evaluation harness for data-quality repair agents.
|
|
5
|
+
License-Expression: Apache-2.0
|
|
6
|
+
Project-URL: Homepage, https://github.com/Aegis15/dataforge
|
|
7
|
+
Project-URL: Repository, https://github.com/Aegis15/dataforge
|
|
8
|
+
Project-URL: Documentation, https://dataforge.praneshrajan15.workers.dev/playground
|
|
9
|
+
Keywords: data-quality,evaluation,agents,benchmarks,llm
|
|
10
|
+
Classifier: Development Status :: 3 - Alpha
|
|
11
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
12
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
13
|
+
Classifier: Intended Audience :: Developers
|
|
14
|
+
Classifier: Intended Audience :: Science/Research
|
|
15
|
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
16
|
+
Classifier: Typing :: Typed
|
|
17
|
+
Requires-Python: <3.13,>=3.11
|
|
18
|
+
Description-Content-Type: text/markdown
|
|
19
|
+
Requires-Dist: httpx>=0.27
|
|
20
|
+
Requires-Dist: pandas>=2.2
|
|
21
|
+
Requires-Dist: pydantic>=2.7
|
|
22
|
+
Requires-Dist: python-dotenv>=1.0
|
|
23
|
+
Requires-Dist: rich>=13.7
|
|
24
|
+
Requires-Dist: tenacity>=8.3
|
|
25
|
+
Requires-Dist: typer>=0.12
|
|
26
|
+
Provides-Extra: dataforge
|
|
27
|
+
Requires-Dist: dataforge_07>=0.1.0; extra == "dataforge"
|
|
28
|
+
Provides-Extra: dev
|
|
29
|
+
Requires-Dist: pytest>=8.2; extra == "dev"
|
|
30
|
+
Requires-Dist: pytest-cov>=5.0; extra == "dev"
|
|
31
|
+
Requires-Dist: ruff>=0.11; extra == "dev"
|
|
32
|
+
Requires-Dist: mypy>=1.10; extra == "dev"
|
|
33
|
+
Requires-Dist: pandas-stubs>=2.2; extra == "dev"
|
|
34
|
+
Provides-Extra: all
|
|
35
|
+
Requires-Dist: dataforge_07_evals[dataforge,dev]; extra == "all"
|
|
36
|
+
Provides-Extra: hf
|
|
37
|
+
Requires-Dist: transformers>=4.44; extra == "hf"
|
|
38
|
+
Requires-Dist: torch>=2.3; extra == "hf"
|
|
39
|
+
Requires-Dist: accelerate>=0.33; extra == "hf"
|
|
40
|
+
|
|
41
|
+
# dataforge-evals
|
|
42
|
+
|
|
43
|
+
`dataforge-evals` is an agent-agnostic evaluation harness for data-quality repair agents.
|
|
44
|
+
|
|
45
|
+
It gives any agent the same task, accepts only proposed cell fixes, and lets the grader compute exact precision, recall, F1, steps, failures, and free-tier quota usage. The harness can load DataForge's canonical Hospital, Flights, and Beers benchmark tasks when the `dataforge_07` distribution is installed; that distribution keeps the `dataforge` import namespace for the 0.1 line.
|
|
46
|
+
The PyPI package is not published yet; use the source install instructions
|
|
47
|
+
below until release ownership is configured.
|
|
48
|
+
|
|
49
|
+
```bash
|
|
50
|
+
pip install -e ".[dev]"
|
|
51
|
+
dataforge-evals run --agent mock --dataset synthetic --trials 3
|
|
52
|
+
```
|
|
53
|
+
|
|
54
|
+
## Install
|
|
55
|
+
|
|
56
|
+
### From source (development)
|
|
57
|
+
|
|
58
|
+
```bash
|
|
59
|
+
python -m venv .venv
|
|
60
|
+
# Linux/macOS:
|
|
61
|
+
source .venv/bin/activate
|
|
62
|
+
# Windows PowerShell:
|
|
63
|
+
.\.venv\Scripts\Activate.ps1
|
|
64
|
+
|
|
65
|
+
pip install -e ".[dev]"
|
|
66
|
+
```
|
|
67
|
+
|
|
68
|
+
### With canonical DataForge datasets
|
|
69
|
+
|
|
70
|
+
```bash
|
|
71
|
+
pip install -e "../data_quality_env"
|
|
72
|
+
dataforge-evals run --agent mock --dataset hospital --trials 3
|
|
73
|
+
```
|
|
74
|
+
|
|
75
|
+
## Run a provider
|
|
76
|
+
|
|
77
|
+
```bash
|
|
78
|
+
export GROQ_API_KEY=...  # Windows cmd: set GROQ_API_KEY=...
|
|
79
|
+
dataforge-evals run --agent groq-llama-70b --dataset hospital --trials 3 --output reports/groq-hospital.md
|
|
80
|
+
```
|
|
81
|
+
|
|
82
|
+
### Bounded Groq smoke test
|
|
83
|
+
|
|
84
|
+
Use a single synthetic trial to verify Groq wiring without turning the smoke
|
|
85
|
+
check into a benchmark:
|
|
86
|
+
|
|
87
|
+
```bash
|
|
88
|
+
dataforge-evals run --agent groq-llama-70b --dataset synthetic --trials 1 --seed 0 --timeout-s 20 --output reports/groq-synthetic-smoke.md --output-json reports/groq-synthetic-smoke.json
|
|
89
|
+
```
|
|
90
|
+
|
|
91
|
+
For this smoke path, `trials_completed=1` and `Failures=none` prove the
|
|
92
|
+
integration completed successfully. F1 is a quality signal for the model's
|
|
93
|
+
proposed repairs, not the API health check. The JSON report includes the
|
|
94
|
+
normalized proposed `fixes` for debugging; Markdown stays summary-only.
|
|
95
|
+
|
|
96
|
+
### Built-in adapters
|
|
97
|
+
|
|
98
|
+
| Agent ID | Provider | Required Setup |
|
|
99
|
+
| --- | --- | --- |
|
|
100
|
+
| `mock` | local deterministic oracle for tests | none |
|
|
101
|
+
| `groq-llama-70b` | Groq | `GROQ_API_KEY` |
|
|
102
|
+
| `gemini-flash` | Gemini | `GEMINI_API_KEY` |
|
|
103
|
+
| `cerebras-llama` | Cerebras | `CEREBRAS_API_KEY` |
|
|
104
|
+
| `openrouter` | OpenRouter | `OPENROUTER_API_KEY` |
|
|
105
|
+
| `local-ollama` | local Ollama OpenAI-compatible endpoint | Ollama server on `localhost:11434` |
|
|
106
|
+
| `hf-local` | Hugging Face Transformers | optional `HF_TOKEN`; install `.[hf]` |
|
|
107
|
+
|
|
108
|
+
### Evaluating the historical DataForge SFT checkpoint
|
|
109
|
+
|
|
110
|
+
Use `hf-local` for base-vs-SFT checks with the same exact-match grader used by
|
|
111
|
+
hosted providers:
|
|
112
|
+
|
|
113
|
+
```bash
|
|
114
|
+
pip install -e ".[hf]"
|
|
115
|
+
dataforge-evals run --agent hf-local --dataset synthetic --trials 1 \
|
|
116
|
+
--model-id Praneshrajan15/DataForge-0.5B-SFT \
|
|
117
|
+
--output reports/dataforge-sft-smoke.md \
|
|
118
|
+
--output-json reports/dataforge-sft-smoke.json
|
|
119
|
+
```
|
|
120
|
+
|
|
121
|
+
If `--model-id` is omitted, the adapter uses `DATAFORGE_EVAL_MODEL`, then the
|
|
122
|
+
authenticated `HF_TOKEN` user's `DataForge-0.5B-SFT`, then
|
|
123
|
+
`Praneshrajan15/DataForge-0.5B-SFT`.
|
|
124
|
+
|
|
125
|
+
### Discover agents and datasets
|
|
126
|
+
|
|
127
|
+
```bash
|
|
128
|
+
dataforge-evals list-agents
|
|
129
|
+
dataforge-evals list-datasets
|
|
130
|
+
```
|
|
131
|
+
|
|
132
|
+
## Custom CSV-pair evaluation
|
|
133
|
+
|
|
134
|
+
Bring your own dirty and clean CSV files:
|
|
135
|
+
|
|
136
|
+
```bash
|
|
137
|
+
dataforge-evals run --agent mock --dataset my-data \
|
|
138
|
+
--dirty-csv path/to/dirty.csv \
|
|
139
|
+
--clean-csv path/to/clean.csv \
|
|
140
|
+
--trials 3
|
|
141
|
+
```
|
|
142
|
+
|
|
143
|
+
The dirty and clean CSVs must have the same number of rows and columns. Column names are taken from the clean file.
|
|
144
|
+
|
|
145
|
+
## Agent protocol
|
|
146
|
+
|
|
147
|
+
Any agent can plug in by implementing:
|
|
148
|
+
|
|
149
|
+
```python
|
|
150
|
+
from dataforge_evals import AgentTask, Fix
|
|
151
|
+
|
|
152
|
+
class MyAgent:
|
|
153
|
+
name = "my-agent"
|
|
154
|
+
|
|
155
|
+
def run(self, task: AgentTask) -> list[Fix]:
|
|
156
|
+
return [Fix(row=0, column="Score", new_value="4.5", reason="example")]
|
|
157
|
+
```
|
|
158
|
+
|
|
159
|
+
Agents never report their own score. They return candidate fixes only. The grader is the only source of truth.
|
|
160
|
+
Normal agents receive a label-hidden `AgentTask`; only the built-in `mock`
|
|
161
|
+
oracle used by tests is marked to receive full ground truth.
|
|
162
|
+
|
|
163
|
+
### What agents receive
|
|
164
|
+
|
|
165
|
+
- `task.name` — dataset identifier
|
|
166
|
+
- `task.dirty_df` — pandas DataFrame with data-quality issues (all values as strings)
|
|
167
|
+
- `task.canonical_columns` — ordered column names from the clean reference
|
|
168
|
+
- `task.metadata` — provenance and descriptive metadata
|
|
169
|
+
|
|
170
|
+
### What agents return
|
|
171
|
+
|
|
172
|
+
Either a `list[Fix]` or an `AgentRunResult` with usage accounting, returned from inside `run`:
|
|
173
|
+
|
|
174
|
+
```python
|
|
175
|
+
from dataforge_evals import AgentRunResult, Fix, Usage
|
|
176
|
+
|
|
177
|
+
return AgentRunResult(
|
|
178
|
+
fixes=[Fix(row=0, column="Score", new_value="4.5")],
|
|
179
|
+
usage=Usage(calls=1, prompt_tokens=500, completion_tokens=100, quota_units=0.001),
|
|
180
|
+
steps=1,
|
|
181
|
+
model="my-model-v1",
|
|
182
|
+
)
|
|
183
|
+
```
|
|
184
|
+
|
|
185
|
+
## What is graded
|
|
186
|
+
|
|
187
|
+
A `Fix` is correct only when `(row, column, new_value)` exactly matches a ground-truth dirty-to-clean cell correction. Duplicate predictions for the same cell use last-write-wins normalization. A wrong value on the right cell counts as both a false positive and a false negative.
|
|
188
|
+
|
|
189
|
+
## Quota accounting
|
|
190
|
+
|
|
191
|
+
Each report uses provider-normalized free-tier quota units rather than dollars. Built-in adapters record raw calls, prompt tokens, completion tokens, and quota units.
|
|
192
|
+
|
|
193
|
+
Provider-specific normalization (as of 2026-05-01):
|
|
194
|
+
|
|
195
|
+
| Provider | Free-tier basis | 1 quota unit = |
|
|
196
|
+
| --- | --- | --- |
|
|
197
|
+
| Groq | 14,400 RPD | 1 request |
|
|
198
|
+
| Gemini | 1,500 RPD | 1 request |
|
|
199
|
+
| Cerebras | 1,000 RPD | 1 request |
|
|
200
|
+
| OpenRouter | Nominal 1,000 RPD | 1 request |
|
|
201
|
+
| Ollama | unlimited (local) | n/a (quota units are always 0) |
|
|
202
|
+
|
|
203
|
+
On HTTP 429, the adapter waits with exponential backoff and logs `waiting N seconds for quota reset` to stderr. It does not fall back to another provider because fallback would contaminate the comparison.
|
|
204
|
+
|
|
205
|
+
## Reproducibility
|
|
206
|
+
|
|
207
|
+
Each report records:
|
|
208
|
+
|
|
209
|
+
- `dataforge-evals` commit hash
|
|
210
|
+
- `dataforge` source commit hash when canonical datasets are loaded through DataForge
|
|
211
|
+
- exact seeds
|
|
212
|
+
- provider model identifiers
|
|
213
|
+
- UTC run date
|
|
214
|
+
- dependency versions (pandas, pydantic, httpx, etc.)
|
|
215
|
+
- an explicit nondeterminism note
|
|
216
|
+
|
|
217
|
+
Deterministic and mock agents reproduce exactly from the recorded seeds. Hosted LLM providers may still change outputs because providers can update model weights, routing, safety systems, or tokenization without notice.
|
|
218
|
+
|
|
219
|
+
### Reproducibility limitations
|
|
220
|
+
|
|
221
|
+
- Provider model identifiers (e.g., `llama-3.3-70b-versatile`) may point to different weights on different dates.
|
|
222
|
+
- Token counts and quota units depend on provider-side tokenization, which can change.
|
|
223
|
+
- Network latency, rate limiting, and provider availability affect runtime measurements.
|
|
224
|
+
- Temperature 0 does not guarantee determinism across all providers.
|
|
225
|
+
|
|
226
|
+
## Not a leaderboard by default
|
|
227
|
+
|
|
228
|
+
Only compare reports when dataset versions, seeds, provider model identifiers, run date, and prompt/adapter code are identical. Otherwise the report is an evaluation artifact, not a leaderboard row.
|
|
229
|
+
|
|
230
|
+
## When dataforge-evals is the wrong tool
|
|
231
|
+
|
|
232
|
+
Do not use `dataforge-evals` if:
|
|
233
|
+
|
|
234
|
+
- **Your agent operates on streaming data** — the harness is batch-oriented and expects a complete dirty DataFrame.
|
|
235
|
+
- **You need end-to-end pipeline evaluation** — this tool evaluates cell-level repair accuracy, not detection, diagnosis, or pipeline orchestration.
|
|
236
|
+
- **Your ground truth is fuzzy or approximate** — the grader uses exact string match. If multiple correct values exist for a cell, you need a custom grader.
|
|
237
|
+
- **You need sub-second latency benchmarking** — the harness measures wall-clock time but is not designed as a latency benchmarking tool.
|
|
238
|
+
- **Your data has more than 100K rows** — the harness loads the full DataFrame into memory and passes it to agents. For large-scale evaluation, sample first.
|
|
239
|
+
|
|
240
|
+
## Development
|
|
241
|
+
|
|
242
|
+
```bash
|
|
243
|
+
make setup # pip install -e ".[dev]"
|
|
244
|
+
make lint # ruff check
|
|
245
|
+
make format # ruff format --check
|
|
246
|
+
make type # mypy --strict
|
|
247
|
+
make test # pytest
|
|
248
|
+
make test-cov # pytest with coverage
|
|
249
|
+
make smoke # end-to-end smoke test with mock agent
|
|
250
|
+
```
|
|
251
|
+
|
|
252
|
+
## Environment Variables
|
|
253
|
+
|
|
254
|
+
Provider keys belong in a root `.env` file (gitignored) loaded with `python-dotenv`:
|
|
255
|
+
|
|
256
|
+
- `GROQ_API_KEY`
|
|
257
|
+
- `GEMINI_API_KEY`
|
|
258
|
+
- `CEREBRAS_API_KEY`
|
|
259
|
+
- `OPENROUTER_API_KEY`
|
|
260
|
+
|
|
261
|
+
## License
|
|
262
|
+
|
|
263
|
+
Apache-2.0.
|
|
@@ -0,0 +1,223 @@
|
|
|
1
|
+
# dataforge-evals
|
|
2
|
+
|
|
3
|
+
`dataforge-evals` is an agent-agnostic evaluation harness for data-quality repair agents.
|
|
4
|
+
|
|
5
|
+
It gives any agent the same task, accepts only proposed cell fixes, and lets the grader compute exact precision, recall, F1, steps, failures, and free-tier quota usage. The harness can load DataForge's canonical Hospital, Flights, and Beers benchmark tasks when the `dataforge_07` distribution is installed; that distribution keeps the `dataforge` import namespace for the 0.1 line.
|
|
6
|
+
The PyPI package is not published yet; use the source install instructions
|
|
7
|
+
below until release ownership is configured.
|
|
8
|
+
|
|
9
|
+
```bash
|
|
10
|
+
pip install -e ".[dev]"
|
|
11
|
+
dataforge-evals run --agent mock --dataset synthetic --trials 3
|
|
12
|
+
```
|
|
13
|
+
|
|
14
|
+
## Install
|
|
15
|
+
|
|
16
|
+
### From source (development)
|
|
17
|
+
|
|
18
|
+
```bash
|
|
19
|
+
python -m venv .venv
|
|
20
|
+
# Linux/macOS:
|
|
21
|
+
source .venv/bin/activate
|
|
22
|
+
# Windows PowerShell:
|
|
23
|
+
.\.venv\Scripts\Activate.ps1
|
|
24
|
+
|
|
25
|
+
pip install -e ".[dev]"
|
|
26
|
+
```
|
|
27
|
+
|
|
28
|
+
### With canonical DataForge datasets
|
|
29
|
+
|
|
30
|
+
```bash
|
|
31
|
+
pip install -e "../data_quality_env"
|
|
32
|
+
dataforge-evals run --agent mock --dataset hospital --trials 3
|
|
33
|
+
```
|
|
34
|
+
|
|
35
|
+
## Run a provider
|
|
36
|
+
|
|
37
|
+
```bash
|
|
38
|
+
export GROQ_API_KEY=...  # Windows cmd: set GROQ_API_KEY=...
|
|
39
|
+
dataforge-evals run --agent groq-llama-70b --dataset hospital --trials 3 --output reports/groq-hospital.md
|
|
40
|
+
```
|
|
41
|
+
|
|
42
|
+
### Bounded Groq smoke test
|
|
43
|
+
|
|
44
|
+
Use a single synthetic trial to verify Groq wiring without turning the smoke
|
|
45
|
+
check into a benchmark:
|
|
46
|
+
|
|
47
|
+
```bash
|
|
48
|
+
dataforge-evals run --agent groq-llama-70b --dataset synthetic --trials 1 --seed 0 --timeout-s 20 --output reports/groq-synthetic-smoke.md --output-json reports/groq-synthetic-smoke.json
|
|
49
|
+
```
|
|
50
|
+
|
|
51
|
+
For this smoke path, `trials_completed=1` and `Failures=none` prove the
|
|
52
|
+
integration completed successfully. F1 is a quality signal for the model's
|
|
53
|
+
proposed repairs, not the API health check. The JSON report includes the
|
|
54
|
+
normalized proposed `fixes` for debugging; Markdown stays summary-only.
|
|
55
|
+
|
|
56
|
+
### Built-in adapters
|
|
57
|
+
|
|
58
|
+
| Agent ID | Provider | Required Setup |
|
|
59
|
+
| --- | --- | --- |
|
|
60
|
+
| `mock` | local deterministic oracle for tests | none |
|
|
61
|
+
| `groq-llama-70b` | Groq | `GROQ_API_KEY` |
|
|
62
|
+
| `gemini-flash` | Gemini | `GEMINI_API_KEY` |
|
|
63
|
+
| `cerebras-llama` | Cerebras | `CEREBRAS_API_KEY` |
|
|
64
|
+
| `openrouter` | OpenRouter | `OPENROUTER_API_KEY` |
|
|
65
|
+
| `local-ollama` | local Ollama OpenAI-compatible endpoint | Ollama server on `localhost:11434` |
|
|
66
|
+
| `hf-local` | Hugging Face Transformers | optional `HF_TOKEN`; install `.[hf]` |
|
|
67
|
+
|
|
68
|
+
### Evaluating the historical DataForge SFT checkpoint
|
|
69
|
+
|
|
70
|
+
Use `hf-local` for base-vs-SFT checks with the same exact-match grader used by
|
|
71
|
+
hosted providers:
|
|
72
|
+
|
|
73
|
+
```bash
|
|
74
|
+
pip install -e ".[hf]"
|
|
75
|
+
dataforge-evals run --agent hf-local --dataset synthetic --trials 1 \
|
|
76
|
+
--model-id Praneshrajan15/DataForge-0.5B-SFT \
|
|
77
|
+
--output reports/dataforge-sft-smoke.md \
|
|
78
|
+
--output-json reports/dataforge-sft-smoke.json
|
|
79
|
+
```
|
|
80
|
+
|
|
81
|
+
If `--model-id` is omitted, the adapter uses `DATAFORGE_EVAL_MODEL`, then the
|
|
82
|
+
authenticated `HF_TOKEN` user's `DataForge-0.5B-SFT`, then
|
|
83
|
+
`Praneshrajan15/DataForge-0.5B-SFT`.
|
|
84
|
+
|
|
85
|
+
### Discover agents and datasets
|
|
86
|
+
|
|
87
|
+
```bash
|
|
88
|
+
dataforge-evals list-agents
|
|
89
|
+
dataforge-evals list-datasets
|
|
90
|
+
```
|
|
91
|
+
|
|
92
|
+
## Custom CSV-pair evaluation
|
|
93
|
+
|
|
94
|
+
Bring your own dirty and clean CSV files:
|
|
95
|
+
|
|
96
|
+
```bash
|
|
97
|
+
dataforge-evals run --agent mock --dataset my-data \
|
|
98
|
+
--dirty-csv path/to/dirty.csv \
|
|
99
|
+
--clean-csv path/to/clean.csv \
|
|
100
|
+
--trials 3
|
|
101
|
+
```
|
|
102
|
+
|
|
103
|
+
The dirty and clean CSVs must have the same number of rows and columns. Column names are taken from the clean file.
|
|
104
|
+
|
|
105
|
+
## Agent protocol
|
|
106
|
+
|
|
107
|
+
Any agent can plug in by implementing:
|
|
108
|
+
|
|
109
|
+
```python
|
|
110
|
+
from dataforge_evals import AgentTask, Fix
|
|
111
|
+
|
|
112
|
+
class MyAgent:
|
|
113
|
+
name = "my-agent"
|
|
114
|
+
|
|
115
|
+
def run(self, task: AgentTask) -> list[Fix]:
|
|
116
|
+
return [Fix(row=0, column="Score", new_value="4.5", reason="example")]
|
|
117
|
+
```
|
|
118
|
+
|
|
119
|
+
Agents never report their own score. They return candidate fixes only. The grader is the only source of truth.
|
|
120
|
+
Normal agents receive a label-hidden `AgentTask`; only the built-in `mock`
|
|
121
|
+
oracle used by tests is marked to receive full ground truth.
|
|
122
|
+
|
|
123
|
+
### What agents receive
|
|
124
|
+
|
|
125
|
+
- `task.name` — dataset identifier
|
|
126
|
+
- `task.dirty_df` — pandas DataFrame with data-quality issues (all values as strings)
|
|
127
|
+
- `task.canonical_columns` — ordered column names from the clean reference
|
|
128
|
+
- `task.metadata` — provenance and descriptive metadata
|
|
129
|
+
|
|
130
|
+
### What agents return
|
|
131
|
+
|
|
132
|
+
Either a `list[Fix]` or an `AgentRunResult` with usage accounting, returned from inside `run`:
|
|
133
|
+
|
|
134
|
+
```python
|
|
135
|
+
from dataforge_evals import AgentRunResult, Fix, Usage
|
|
136
|
+
|
|
137
|
+
return AgentRunResult(
|
|
138
|
+
fixes=[Fix(row=0, column="Score", new_value="4.5")],
|
|
139
|
+
usage=Usage(calls=1, prompt_tokens=500, completion_tokens=100, quota_units=0.001),
|
|
140
|
+
steps=1,
|
|
141
|
+
model="my-model-v1",
|
|
142
|
+
)
|
|
143
|
+
```
|
|
144
|
+
|
|
145
|
+
## What is graded
|
|
146
|
+
|
|
147
|
+
A `Fix` is correct only when `(row, column, new_value)` exactly matches a ground-truth dirty-to-clean cell correction. Duplicate predictions for the same cell use last-write-wins normalization. A wrong value on the right cell counts as both a false positive and a false negative.
|
|
148
|
+
|
|
149
|
+
## Quota accounting
|
|
150
|
+
|
|
151
|
+
Each report uses provider-normalized free-tier quota units rather than dollars. Built-in adapters record raw calls, prompt tokens, completion tokens, and quota units.
|
|
152
|
+
|
|
153
|
+
Provider-specific normalization (as of 2026-05-01):
|
|
154
|
+
|
|
155
|
+
| Provider | Free-tier basis | 1 quota unit = |
|
|
156
|
+
| --- | --- | --- |
|
|
157
|
+
| Groq | 14,400 RPD | 1 request |
|
|
158
|
+
| Gemini | 1,500 RPD | 1 request |
|
|
159
|
+
| Cerebras | 1,000 RPD | 1 request |
|
|
160
|
+
| OpenRouter | Nominal 1,000 RPD | 1 request |
|
|
161
|
+
| Ollama | unlimited (local) | n/a (quota units are always 0) |
|
|
162
|
+
|
|
163
|
+
On HTTP 429, the adapter waits with exponential backoff and logs `waiting N seconds for quota reset` to stderr. It does not fall back to another provider because fallback would contaminate the comparison.
|
|
164
|
+
|
|
165
|
+
## Reproducibility
|
|
166
|
+
|
|
167
|
+
Each report records:
|
|
168
|
+
|
|
169
|
+
- `dataforge-evals` commit hash
|
|
170
|
+
- `dataforge` source commit hash when canonical datasets are loaded through DataForge
|
|
171
|
+
- exact seeds
|
|
172
|
+
- provider model identifiers
|
|
173
|
+
- UTC run date
|
|
174
|
+
- dependency versions (pandas, pydantic, httpx, etc.)
|
|
175
|
+
- an explicit nondeterminism note
|
|
176
|
+
|
|
177
|
+
Deterministic and mock agents reproduce exactly from the recorded seeds. Hosted LLM providers may still change outputs because providers can update model weights, routing, safety systems, or tokenization without notice.
|
|
178
|
+
|
|
179
|
+
### Reproducibility limitations
|
|
180
|
+
|
|
181
|
+
- Provider model identifiers (e.g., `llama-3.3-70b-versatile`) may point to different weights on different dates.
|
|
182
|
+
- Token counts and quota units depend on provider-side tokenization, which can change.
|
|
183
|
+
- Network latency, rate limiting, and provider availability affect runtime measurements.
|
|
184
|
+
- Temperature 0 does not guarantee determinism across all providers.
|
|
185
|
+
|
|
186
|
+
## Not a leaderboard by default
|
|
187
|
+
|
|
188
|
+
Only compare reports when dataset versions, seeds, provider model identifiers, run date, and prompt/adapter code are identical. Otherwise the report is an evaluation artifact, not a leaderboard row.
|
|
189
|
+
|
|
190
|
+
## When dataforge-evals is the wrong tool
|
|
191
|
+
|
|
192
|
+
Do not use `dataforge-evals` if:
|
|
193
|
+
|
|
194
|
+
- **Your agent operates on streaming data** — the harness is batch-oriented and expects a complete dirty DataFrame.
|
|
195
|
+
- **You need end-to-end pipeline evaluation** — this tool evaluates cell-level repair accuracy, not detection, diagnosis, or pipeline orchestration.
|
|
196
|
+
- **Your ground truth is fuzzy or approximate** — the grader uses exact string match. If multiple correct values exist for a cell, you need a custom grader.
|
|
197
|
+
- **You need sub-second latency benchmarking** — the harness measures wall-clock time but is not designed as a latency benchmarking tool.
|
|
198
|
+
- **Your data has more than 100K rows** — the harness loads the full DataFrame into memory and passes it to agents. For large-scale evaluation, sample first.
|
|
199
|
+
|
|
200
|
+
## Development
|
|
201
|
+
|
|
202
|
+
```bash
|
|
203
|
+
make setup # pip install -e ".[dev]"
|
|
204
|
+
make lint # ruff check
|
|
205
|
+
make format # ruff format --check
|
|
206
|
+
make type # mypy --strict
|
|
207
|
+
make test # pytest
|
|
208
|
+
make test-cov # pytest with coverage
|
|
209
|
+
make smoke # end-to-end smoke test with mock agent
|
|
210
|
+
```
|
|
211
|
+
|
|
212
|
+
## Environment Variables
|
|
213
|
+
|
|
214
|
+
Provider keys belong in a root `.env` file (gitignored) loaded with `python-dotenv`:
|
|
215
|
+
|
|
216
|
+
- `GROQ_API_KEY`
|
|
217
|
+
- `GEMINI_API_KEY`
|
|
218
|
+
- `CEREBRAS_API_KEY`
|
|
219
|
+
- `OPENROUTER_API_KEY`
|
|
220
|
+
|
|
221
|
+
## License
|
|
222
|
+
|
|
223
|
+
Apache-2.0.
|