dataforge-07-evals 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- dataforge_07_evals-0.1.0.dist-info/METADATA +263 -0
- dataforge_07_evals-0.1.0.dist-info/RECORD +23 -0
- dataforge_07_evals-0.1.0.dist-info/WHEEL +5 -0
- dataforge_07_evals-0.1.0.dist-info/entry_points.txt +3 -0
- dataforge_07_evals-0.1.0.dist-info/top_level.txt +1 -0
- dataforge_evals/__init__.py +37 -0
- dataforge_evals/agents/__init__.py +23 -0
- dataforge_evals/agents/base.py +208 -0
- dataforge_evals/agents/cerebras_llama.py +65 -0
- dataforge_evals/agents/gemini_flash.py +159 -0
- dataforge_evals/agents/groq_llama.py +66 -0
- dataforge_evals/agents/hf_local.py +209 -0
- dataforge_evals/agents/local_ollama.py +62 -0
- dataforge_evals/agents/mock.py +41 -0
- dataforge_evals/agents/openrouter.py +66 -0
- dataforge_evals/agents/provider_base.py +319 -0
- dataforge_evals/cli.py +326 -0
- dataforge_evals/grader.py +152 -0
- dataforge_evals/harness.py +406 -0
- dataforge_evals/py.typed +1 -0
- dataforge_evals/repair_contract.py +332 -0
- dataforge_evals/report.py +182 -0
- dataforge_evals/tasks.py +250 -0
|
@@ -0,0 +1,263 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: dataforge_07_evals
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Agent-agnostic evaluation harness for data-quality repair agents.
|
|
5
|
+
License-Expression: Apache-2.0
|
|
6
|
+
Project-URL: Homepage, https://github.com/Aegis15/dataforge
|
|
7
|
+
Project-URL: Repository, https://github.com/Aegis15/dataforge
|
|
8
|
+
Project-URL: Documentation, https://dataforge.praneshrajan15.workers.dev/playground
|
|
9
|
+
Keywords: data-quality,evaluation,agents,benchmarks,llm
|
|
10
|
+
Classifier: Development Status :: 3 - Alpha
|
|
11
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
12
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
13
|
+
Classifier: Intended Audience :: Developers
|
|
14
|
+
Classifier: Intended Audience :: Science/Research
|
|
15
|
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
16
|
+
Classifier: Typing :: Typed
|
|
17
|
+
Requires-Python: <3.13,>=3.11
|
|
18
|
+
Description-Content-Type: text/markdown
|
|
19
|
+
Requires-Dist: httpx>=0.27
|
|
20
|
+
Requires-Dist: pandas>=2.2
|
|
21
|
+
Requires-Dist: pydantic>=2.7
|
|
22
|
+
Requires-Dist: python-dotenv>=1.0
|
|
23
|
+
Requires-Dist: rich>=13.7
|
|
24
|
+
Requires-Dist: tenacity>=8.3
|
|
25
|
+
Requires-Dist: typer>=0.12
|
|
26
|
+
Provides-Extra: dataforge
|
|
27
|
+
Requires-Dist: dataforge_07>=0.1.0; extra == "dataforge"
|
|
28
|
+
Provides-Extra: dev
|
|
29
|
+
Requires-Dist: pytest>=8.2; extra == "dev"
|
|
30
|
+
Requires-Dist: pytest-cov>=5.0; extra == "dev"
|
|
31
|
+
Requires-Dist: ruff>=0.11; extra == "dev"
|
|
32
|
+
Requires-Dist: mypy>=1.10; extra == "dev"
|
|
33
|
+
Requires-Dist: pandas-stubs>=2.2; extra == "dev"
|
|
34
|
+
Provides-Extra: all
|
|
35
|
+
Requires-Dist: dataforge_07_evals[dataforge,dev]; extra == "all"
|
|
36
|
+
Provides-Extra: hf
|
|
37
|
+
Requires-Dist: transformers>=4.44; extra == "hf"
|
|
38
|
+
Requires-Dist: torch>=2.3; extra == "hf"
|
|
39
|
+
Requires-Dist: accelerate>=0.33; extra == "hf"
|
|
40
|
+
|
|
41
|
+
# dataforge-evals
|
|
42
|
+
|
|
43
|
+
`dataforge-evals` is an agent-agnostic evaluation harness for data-quality repair agents.
|
|
44
|
+
|
|
45
|
+
It gives any agent the same task, accepts only proposed cell fixes, and lets the grader compute exact precision, recall, F1, steps, failures, and free-tier quota usage. The harness can load DataForge's canonical Hospital, Flights, and Beers benchmark tasks when the `dataforge_07` distribution is installed; that distribution is still imported as `dataforge` for the 0.1 line.
|
|
46
|
+
The PyPI package is not published yet; use the source install instructions
|
|
47
|
+
below until release ownership is configured.
|
|
48
|
+
|
|
49
|
+
```bash
|
|
50
|
+
pip install -e ".[dev]"
|
|
51
|
+
dataforge-evals run --agent mock --dataset synthetic --trials 3
|
|
52
|
+
```
|
|
53
|
+
|
|
54
|
+
## Install
|
|
55
|
+
|
|
56
|
+
### From source (development)
|
|
57
|
+
|
|
58
|
+
```bash
|
|
59
|
+
python -m venv .venv
|
|
60
|
+
# Linux/macOS:
|
|
61
|
+
source .venv/bin/activate
|
|
62
|
+
# Windows PowerShell:
|
|
63
|
+
.\.venv\Scripts\Activate.ps1
|
|
64
|
+
|
|
65
|
+
pip install -e ".[dev]"
|
|
66
|
+
```
|
|
67
|
+
|
|
68
|
+
### With canonical DataForge datasets
|
|
69
|
+
|
|
70
|
+
```bash
|
|
71
|
+
pip install -e "../data_quality_env"
|
|
72
|
+
dataforge-evals run --agent mock --dataset hospital --trials 3
|
|
73
|
+
```
|
|
74
|
+
|
|
75
|
+
## Run a provider
|
|
76
|
+
|
|
77
|
+
```bash
|
|
78
|
+
export GROQ_API_KEY=...   # Windows cmd: set GROQ_API_KEY=...
|
|
79
|
+
dataforge-evals run --agent groq-llama-70b --dataset hospital --trials 3 --output reports/groq-hospital.md
|
|
80
|
+
```
|
|
81
|
+
|
|
82
|
+
### Bounded Groq smoke test
|
|
83
|
+
|
|
84
|
+
Use a single synthetic trial to verify Groq wiring without turning the smoke
|
|
85
|
+
check into a benchmark:
|
|
86
|
+
|
|
87
|
+
```bash
|
|
88
|
+
dataforge-evals run --agent groq-llama-70b --dataset synthetic --trials 1 --seed 0 --timeout-s 20 --output reports/groq-synthetic-smoke.md --output-json reports/groq-synthetic-smoke.json
|
|
89
|
+
```
|
|
90
|
+
|
|
91
|
+
For this smoke path, `trials_completed=1` and `Failures=none` prove the
|
|
92
|
+
integration completed successfully. F1 is a quality signal for the model's
|
|
93
|
+
proposed repairs, not the API health check. The JSON report includes the
|
|
94
|
+
normalized proposed `fixes` for debugging; Markdown stays summary-only.
|
|
95
|
+
|
|
96
|
+
### Built-in adapters
|
|
97
|
+
|
|
98
|
+
| Agent ID | Provider | Required Setup |
|
|
99
|
+
| --- | --- | --- |
|
|
100
|
+
| `mock` | local deterministic oracle for tests | none |
|
|
101
|
+
| `groq-llama-70b` | Groq | `GROQ_API_KEY` |
|
|
102
|
+
| `gemini-flash` | Gemini | `GEMINI_API_KEY` |
|
|
103
|
+
| `cerebras-llama` | Cerebras | `CEREBRAS_API_KEY` |
|
|
104
|
+
| `openrouter` | OpenRouter | `OPENROUTER_API_KEY` |
|
|
105
|
+
| `local-ollama` | local Ollama OpenAI-compatible endpoint | Ollama server on `localhost:11434` |
|
|
106
|
+
| `hf-local` | Hugging Face Transformers | optional `HF_TOKEN`; install `.[hf]` |
|
|
107
|
+
|
|
108
|
+
### Evaluating the historical DataForge SFT checkpoint
|
|
109
|
+
|
|
110
|
+
Use `hf-local` for base-vs-SFT checks with the same exact-match grader used by
|
|
111
|
+
hosted providers:
|
|
112
|
+
|
|
113
|
+
```bash
|
|
114
|
+
pip install -e ".[hf]"
|
|
115
|
+
dataforge-evals run --agent hf-local --dataset synthetic --trials 1 \
|
|
116
|
+
--model-id Praneshrajan15/DataForge-0.5B-SFT \
|
|
117
|
+
--output reports/dataforge-sft-smoke.md \
|
|
118
|
+
--output-json reports/dataforge-sft-smoke.json
|
|
119
|
+
```
|
|
120
|
+
|
|
121
|
+
If `--model-id` is omitted, the adapter uses `DATAFORGE_EVAL_MODEL`, then the
|
|
122
|
+
authenticated `HF_TOKEN` user's `DataForge-0.5B-SFT`, then
|
|
123
|
+
`Praneshrajan15/DataForge-0.5B-SFT`.
|
|
124
|
+
|
|
125
|
+
### Discover agents and datasets
|
|
126
|
+
|
|
127
|
+
```bash
|
|
128
|
+
dataforge-evals list-agents
|
|
129
|
+
dataforge-evals list-datasets
|
|
130
|
+
```
|
|
131
|
+
|
|
132
|
+
## Custom CSV-pair evaluation
|
|
133
|
+
|
|
134
|
+
Bring your own dirty and clean CSV files:
|
|
135
|
+
|
|
136
|
+
```bash
|
|
137
|
+
dataforge-evals run --agent mock --dataset my-data \
|
|
138
|
+
--dirty-csv path/to/dirty.csv \
|
|
139
|
+
--clean-csv path/to/clean.csv \
|
|
140
|
+
--trials 3
|
|
141
|
+
```
|
|
142
|
+
|
|
143
|
+
The dirty and clean CSVs must have the same number of rows and columns. Column names are taken from the clean file.
|
|
144
|
+
|
|
145
|
+
## Agent protocol
|
|
146
|
+
|
|
147
|
+
Any agent can plug in by implementing:
|
|
148
|
+
|
|
149
|
+
```python
|
|
150
|
+
from dataforge_evals import AgentTask, Fix
|
|
151
|
+
|
|
152
|
+
class MyAgent:
|
|
153
|
+
name = "my-agent"
|
|
154
|
+
|
|
155
|
+
def run(self, task: AgentTask) -> list[Fix]:
|
|
156
|
+
return [Fix(row=0, column="Score", new_value="4.5", reason="example")]
|
|
157
|
+
```
|
|
158
|
+
|
|
159
|
+
Agents never report their own score. They return candidate fixes only. The grader is the only source of truth.
|
|
160
|
+
Normal agents receive a label-hidden `AgentTask`; only the built-in `mock`
|
|
161
|
+
oracle used by tests is marked to receive full ground truth.
|
|
162
|
+
|
|
163
|
+
### What agents receive
|
|
164
|
+
|
|
165
|
+
- `task.name` — dataset identifier
|
|
166
|
+
- `task.dirty_df` — pandas DataFrame with data-quality issues (all values as strings)
|
|
167
|
+
- `task.canonical_columns` — ordered column names from the clean reference
|
|
168
|
+
- `task.metadata` — provenance and descriptive metadata
|
|
169
|
+
|
|
170
|
+
### What agents return
|
|
171
|
+
|
|
172
|
+
Either a `list[Fix]` or an `AgentRunResult` with usage accounting:
|
|
173
|
+
|
|
174
|
+
```python
|
|
175
|
+
from dataforge_evals import AgentRunResult, Fix, Usage
|
|
176
|
+
|
|
177
|
+
return AgentRunResult(
|
|
178
|
+
fixes=[Fix(row=0, column="Score", new_value="4.5")],
|
|
179
|
+
usage=Usage(calls=1, prompt_tokens=500, completion_tokens=100, quota_units=0.001),
|
|
180
|
+
steps=1,
|
|
181
|
+
model="my-model-v1",
|
|
182
|
+
)
|
|
183
|
+
```
|
|
184
|
+
|
|
185
|
+
## What is graded
|
|
186
|
+
|
|
187
|
+
A `Fix` is correct only when `(row, column, new_value)` exactly matches a ground-truth dirty-to-clean cell correction. Duplicate predictions for the same cell use last-write-wins normalization. A wrong value on the right cell counts as both a false positive and a false negative.
|
|
188
|
+
|
|
189
|
+
## Quota accounting
|
|
190
|
+
|
|
191
|
+
Each report uses provider-normalized free-tier quota units rather than dollars. Built-in adapters record raw calls, prompt tokens, completion tokens, and quota units.
|
|
192
|
+
|
|
193
|
+
Provider-specific normalization (as of 2026-05-01):
|
|
194
|
+
|
|
195
|
+
| Provider | Free-tier basis | 1 quota unit = |
|
|
196
|
+
| --- | --- | --- |
|
|
197
|
+
| Groq | 14,400 RPD | 1 request |
|
|
198
|
+
| Gemini | 1,500 RPD | 1 request |
|
|
199
|
+
| Cerebras | 1,000 RPD | 1 request |
|
|
200
|
+
| OpenRouter | Nominal 1,000 RPD | 1 request |
|
|
201
|
+
| Ollama | unlimited (local) | always 0 |
|
|
202
|
+
|
|
203
|
+
On HTTP 429, the adapter waits with exponential backoff and logs `waiting N seconds for quota reset` to stderr. It does not fall back to another provider because fallback would contaminate the comparison.
|
|
204
|
+
|
|
205
|
+
## Reproducibility
|
|
206
|
+
|
|
207
|
+
Each report records:
|
|
208
|
+
|
|
209
|
+
- `dataforge-evals` commit hash
|
|
210
|
+
- `dataforge` source commit hash when canonical datasets are loaded through DataForge
|
|
211
|
+
- exact seeds
|
|
212
|
+
- provider model identifiers
|
|
213
|
+
- UTC run date
|
|
214
|
+
- dependency versions (pandas, pydantic, httpx, etc.)
|
|
215
|
+
- an explicit nondeterminism note
|
|
216
|
+
|
|
217
|
+
Deterministic and mock agents reproduce exactly from the recorded seeds. Hosted LLM providers may still change outputs because providers can update model weights, routing, safety systems, or tokenization without notice.
|
|
218
|
+
|
|
219
|
+
### Reproducibility limitations
|
|
220
|
+
|
|
221
|
+
- Provider model identifiers (e.g., `llama-3.3-70b-versatile`) may point to different weights on different dates.
|
|
222
|
+
- Token counts and quota units depend on provider-side tokenization, which can change.
|
|
223
|
+
- Network latency, rate limiting, and provider availability affect runtime measurements.
|
|
224
|
+
- Temperature 0 does not guarantee determinism across all providers.
|
|
225
|
+
|
|
226
|
+
## Not a leaderboard by default
|
|
227
|
+
|
|
228
|
+
Only compare reports when dataset versions, seeds, provider model identifiers, run date, and prompt/adapter code are identical. Otherwise the report is an evaluation artifact, not a leaderboard row.
|
|
229
|
+
|
|
230
|
+
## When dataforge-evals is the wrong tool
|
|
231
|
+
|
|
232
|
+
Do not use `dataforge-evals` if:
|
|
233
|
+
|
|
234
|
+
- **Your agent operates on streaming data** — the harness is batch-oriented and expects a complete dirty DataFrame.
|
|
235
|
+
- **You need end-to-end pipeline evaluation** — this tool evaluates cell-level repair accuracy, not detection, diagnosis, or pipeline orchestration.
|
|
236
|
+
- **Your ground truth is fuzzy or approximate** — the grader uses exact string match. If multiple correct values exist for a cell, you need a custom grader.
|
|
237
|
+
- **You need sub-second latency benchmarking** — the harness measures wall-clock time but is not designed as a latency benchmarking tool.
|
|
238
|
+
- **Your data is > 100K rows** — the harness loads the full DataFrame into memory and passes it to agents. For large-scale evaluation, sample first.
|
|
239
|
+
|
|
240
|
+
## Development
|
|
241
|
+
|
|
242
|
+
```bash
|
|
243
|
+
make setup # pip install -e ".[dev]"
|
|
244
|
+
make lint # ruff check
|
|
245
|
+
make format # ruff format --check
|
|
246
|
+
make type # mypy --strict
|
|
247
|
+
make test # pytest
|
|
248
|
+
make test-cov # pytest with coverage
|
|
249
|
+
make smoke # end-to-end smoke test with mock agent
|
|
250
|
+
```
|
|
251
|
+
|
|
252
|
+
## Environment Variables
|
|
253
|
+
|
|
254
|
+
Provider keys belong in a root `.env` file (gitignored) loaded with `python-dotenv`:
|
|
255
|
+
|
|
256
|
+
- `GROQ_API_KEY`
|
|
257
|
+
- `GEMINI_API_KEY`
|
|
258
|
+
- `CEREBRAS_API_KEY`
|
|
259
|
+
- `OPENROUTER_API_KEY`
|
|
260
|
+
|
|
261
|
+
## License
|
|
262
|
+
|
|
263
|
+
Apache-2.0.
|
|
@@ -0,0 +1,23 @@
|
|
|
1
|
+
dataforge_evals/__init__.py,sha256=DnQGiVeXrBPvnOimCab6KxHirNsqu2iqd-wN1xQzI-g,844
|
|
2
|
+
dataforge_evals/cli.py,sha256=A2GwU2Du0N5q3VWHwZ2YM5l5g424HnzpNSFTD8o_jEY,11078
|
|
3
|
+
dataforge_evals/grader.py,sha256=0tsvXhTIMBBSKnxfgjfHbkaiNaS4aai2mHbN5aocwuI,5241
|
|
4
|
+
dataforge_evals/harness.py,sha256=Fb3rLgh5U-aIJnqcmdSPTgRjJqQYJvpPuvhW--e8lqk,14597
|
|
5
|
+
dataforge_evals/py.typed,sha256=umvyuNnMuIqk4trirUMN2k3zkXXle1csrUNb-H4-iog,65
|
|
6
|
+
dataforge_evals/repair_contract.py,sha256=-6g-5fiRKHwZDIQ_cYMiWWXzEGM0lNTteb7oFM-agdw,11598
|
|
7
|
+
dataforge_evals/report.py,sha256=-0VnQgRVy_-dEJeeJBq84YUaHNDuHUS0OIBAqnjfNhk,6469
|
|
8
|
+
dataforge_evals/tasks.py,sha256=AYPbxonDqJ5_-Yw1zQHomd9DAKrmd3RW5pRHWCKSj0Q,9310
|
|
9
|
+
dataforge_evals/agents/__init__.py,sha256=rbuME8zQrlMQsHdpUrqUthtAI-DZZPax_OkXKQT8RKw,412
|
|
10
|
+
dataforge_evals/agents/base.py,sha256=SoTmJFL99M0CWN_YaexAtAJlGP0iuSRT437-jm7_IUk,7600
|
|
11
|
+
dataforge_evals/agents/cerebras_llama.py,sha256=_DDwT6Gd9OXwtRXovdmpZMh4DHLUV2EtekZl7rMHOn8,2268
|
|
12
|
+
dataforge_evals/agents/gemini_flash.py,sha256=gBAJijxL58U3qhWXxAcHlcPGw_xR4l5mC0xfCh4ITgk,5965
|
|
13
|
+
dataforge_evals/agents/groq_llama.py,sha256=IBeomteTaPgkZ46iZO2YKSW9SE8Ki0mnmvD8OMPM7LQ,2287
|
|
14
|
+
dataforge_evals/agents/hf_local.py,sha256=lPXbkduG4G5VJIVQ64haMWDy32BqHdtJbS7Z1E9deSk,8326
|
|
15
|
+
dataforge_evals/agents/local_ollama.py,sha256=5JDTYQ5fRWyp5fKkHqhv8TLmvPt5K-9bPcjYBxLxj-U,2008
|
|
16
|
+
dataforge_evals/agents/mock.py,sha256=GlsYdpHUyhN6zGnjIOshUpPogJCE_hhfvk3MRMHRxFE,1400
|
|
17
|
+
dataforge_evals/agents/openrouter.py,sha256=iTY-AH-i3HW9pPjfzcze2CW1ufZ4Z1T0hT7pnVqRw8s,2290
|
|
18
|
+
dataforge_evals/agents/provider_base.py,sha256=1W9WLjjIsQJ7HkBI_y0AbYdKRpKnPjkVe2zfDPGTZPQ,11643
|
|
19
|
+
dataforge_07_evals-0.1.0.dist-info/METADATA,sha256=MAxqSo-z3jaYC2RJYT6sjpCwB9fnx_TEuFPs8HIuzrA,9669
|
|
20
|
+
dataforge_07_evals-0.1.0.dist-info/WHEEL,sha256=aeYiig01lYGDzBgS8HxWXOg3uV61G9ijOsup-k9o1sk,91
|
|
21
|
+
dataforge_07_evals-0.1.0.dist-info/entry_points.txt,sha256=-sylQUAkVa_SrzMCztTHJcPqBm1934IBt70N0lOnZ7w,104
|
|
22
|
+
dataforge_07_evals-0.1.0.dist-info/top_level.txt,sha256=3e16aDQIlTzh-A41CGEfnIy3up3eNtOM7EGWuV0pEdM,16
|
|
23
|
+
dataforge_07_evals-0.1.0.dist-info/RECORD,,
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
dataforge_evals
|
|
@@ -0,0 +1,37 @@
|
|
|
1
|
+
"""dataforge-evals: agent-agnostic evaluation harness for data-quality repair agents."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from importlib.metadata import PackageNotFoundError, version
|
|
6
|
+
|
|
7
|
+
from dataforge_evals.agents.base import (
|
|
8
|
+
Agent,
|
|
9
|
+
AgentRunResult,
|
|
10
|
+
AgentTask,
|
|
11
|
+
Fix,
|
|
12
|
+
GroundTruthCell,
|
|
13
|
+
Task,
|
|
14
|
+
Usage,
|
|
15
|
+
)
|
|
16
|
+
from dataforge_evals.grader import Grade
|
|
17
|
+
|
|
18
|
+
# Resolve ``__version__`` from installed distribution metadata, trying each
# candidate distribution name in order and keeping the first one that exists.
for _distribution_name in (
    "dataforge_07_evals",
    "dataforge-evals",
    "dataforge15-evals",
):
    try:
        __version__: str = version(_distribution_name)
    except PackageNotFoundError:
        # No metadata under this name; fall through to the next candidate.
        continue
    # Metadata was found, so stop probing.
    break
else:  # pragma: no cover - editable install normally has metadata
    # The loop finished without ``break``: none of the candidates is installed.
    __version__ = "0.0.0-dev"
|
|
26
|
+
|
|
27
|
+
# Names exported by ``from dataforge_evals import *``.  Every entry is either
# imported at the top of this module (agent models and ``Grade``) or assigned
# in it (``__version__``).
__all__ = [
    "Agent",
    "AgentRunResult",
    "AgentTask",
    "Fix",
    "Grade",
    "GroundTruthCell",
    "Task",
    "Usage",
    "__version__",
]
|
|
@@ -0,0 +1,23 @@
|
|
|
1
|
+
"""Agent adapter re-exports for the dataforge-evals public API."""
|
|
2
|
+
|
|
3
|
+
from dataforge_evals.agents.base import (
|
|
4
|
+
Agent,
|
|
5
|
+
AgentRunResult,
|
|
6
|
+
AgentTask,
|
|
7
|
+
Fix,
|
|
8
|
+
GroundTruthCell,
|
|
9
|
+
Task,
|
|
10
|
+
Usage,
|
|
11
|
+
)
|
|
12
|
+
from dataforge_evals.agents.hf_local import HfLocalAgent
|
|
13
|
+
|
|
14
|
+
# Names exported by ``from dataforge_evals.agents import *``: the shared data
# models re-exported from ``agents.base`` plus the ``HfLocalAgent`` adapter.
__all__ = [
    "Agent",
    "AgentRunResult",
    "AgentTask",
    "Fix",
    "GroundTruthCell",
    "HfLocalAgent",
    "Task",
    "Usage",
]
|
|
@@ -0,0 +1,208 @@
|
|
|
1
|
+
"""Public data models and agent protocol for dataforge-evals.
|
|
2
|
+
|
|
3
|
+
This module defines the stable public contract that every agent adapter,
|
|
4
|
+
the grader, the harness, and external consumers depend on. Changes to
|
|
5
|
+
these types require a spec update and version bump.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from __future__ import annotations
|
|
9
|
+
|
|
10
|
+
from dataclasses import dataclass
|
|
11
|
+
from typing import Literal, Protocol, runtime_checkable
|
|
12
|
+
|
|
13
|
+
import pandas as pd
|
|
14
|
+
from pydantic import BaseModel, Field
|
|
15
|
+
|
|
16
|
+
# Closed set of string labels accepted by ``AgentTask.inferability``.
# NOTE(review): the label names suggest they describe what information is
# needed to infer a task's repairs (normalization rules, surrounding context,
# an external reference, or nothing available in the prompt) — confirm against
# the code that assigns them.
InferabilityLabel = Literal[
    "deterministic_normalization",
    "context_derivable",
    "external_reference_required",
    "not_inferable_from_prompt",
]
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
class Fix(BaseModel):
    """A single cell repair proposed by an agent.

    The target cell is addressed by ``(row, column)`` and ``new_value`` is the
    content the agent wants written there.  ``reason`` exists only for human
    audit and observability; scoring never looks at it.

    Instances are immutable (``frozen`` model config).

    Attributes:
        row: Zero-based row index in the dirty DataFrame (must be >= 0).
        column: Non-empty column name from the canonical column set.
        new_value: Proposed corrected cell value, as a string.
        reason: Non-empty human-readable rationale; defaults to
            ``"agent proposal"``.
    """

    model_config = {"frozen": True}

    row: int = Field(
        ge=0,
        description="Zero-based row index in the dirty DataFrame.",
    )
    column: str = Field(
        min_length=1,
        description="Column name in the canonical column set.",
    )
    new_value: str = Field(
        description="Proposed corrected cell value as a string.",
    )
    reason: str = Field(
        default="agent proposal",
        min_length=1,
        description="Human-readable rationale for the repair proposal.",
    )

    def __repr__(self) -> str:
        # ``reason`` is deliberately left out to keep the repr compact.
        parts = (
            f"row={self.row}",
            f"column={self.column!r}",
            f"new_value={self.new_value!r}",
        )
        return f"Fix({', '.join(parts)})"
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
class GroundTruthCell(BaseModel):
    """One authoritative dirty-to-clean cell correction used by the grader.

    Ground-truth cells are derived from aligned dirty and clean DataFrames and
    are the reference answer that proposed fixes are scored against.

    Instances are immutable (``frozen`` model config).

    Attributes:
        row: Zero-based row index in the aligned DataFrames (must be >= 0).
        column: Non-empty column name from the canonical (clean) column set.
        dirty_value: The original, incorrect cell content.
        clean_value: The authoritative corrected cell content.
    """

    model_config = {"frozen": True}

    row: int = Field(
        ge=0,
        description="Zero-based row index.",
    )
    column: str = Field(
        min_length=1,
        description="Canonical column name.",
    )
    dirty_value: str = Field(
        description="Original incorrect cell content.",
    )
    clean_value: str = Field(
        description="Authoritative corrected cell content.",
    )

    def __repr__(self) -> str:
        # Render as "where" followed by "what changed".
        location = f"row={self.row}, column={self.column!r}"
        change = f"dirty={self.dirty_value!r} -> clean={self.clean_value!r}"
        return f"GroundTruthCell({location}, {change})"
|
|
78
|
+
|
|
79
|
+
|
|
80
|
+
@dataclass(frozen=True, kw_only=True)
class AgentTask:
    """A label-hidden data-quality repair task passed to normal agents.

    This is the public runtime view of an evaluation task. It deliberately
    omits ground-truth labels; the harness keeps labels separately for grading.

    All fields are keyword-only. ``frozen=True`` prevents rebinding the
    attributes, but ``dirty_df`` and ``metadata`` are themselves mutable
    objects, so agents should treat them as read-only.

    Attributes:
        name: Human-readable task identifier (e.g. ``"hospital"``).
        dirty_df: The DataFrame containing data-quality issues.
        canonical_columns: Ordered column names from the clean reference.
        metadata: Provenance and descriptive metadata for reporting.
        inferability: One of the ``InferabilityLabel`` values; defaults to
            ``"deterministic_normalization"``.
    """

    name: str
    dirty_df: pd.DataFrame
    canonical_columns: tuple[str, ...]
    metadata: dict[str, str | int | float | tuple[str, ...]]
    # Only field with a default, so existing constructors need not pass it.
    inferability: InferabilityLabel = "deterministic_normalization"
|
|
99
|
+
|
|
100
|
+
|
|
101
|
+
@dataclass(frozen=True, kw_only=True)
class Task(AgentTask):
    """Full grading task retained inside the harness and oracle tests only.

    Normal agents receive ``AgentTask``. Only adapters explicitly marked with
    ``uses_ground_truth = True`` receive this full task.

    Inherits every ``AgentTask`` field and adds the labels used for grading.

    Attributes:
        ground_truth: The dirty-to-clean cell corrections for this task, as an
            immutable tuple of ``GroundTruthCell`` objects.
    """

    # Required keyword argument; ``kw_only=True`` lets it follow the defaulted
    # ``inferability`` field inherited from ``AgentTask``.
    ground_truth: tuple[GroundTruthCell, ...]
|
|
110
|
+
|
|
111
|
+
|
|
112
|
+
class Usage(BaseModel):
    """Provider usage accounting for one agent run.

    Tracks raw API call counts, token consumption, and a provider-normalized
    free-tier quota fraction. The ``quota_units`` field represents a
    fraction of the provider's free-tier allocation consumed, enabling
    cross-provider cost comparison on a common scale.

    Instances are immutable (``frozen`` model config); use ``+`` to combine
    two records into a new one.

    Attributes:
        calls: Number of HTTP requests made to the provider (>= 0).
        prompt_tokens: Total prompt/input tokens consumed (>= 0).
        completion_tokens: Total completion/output tokens consumed (>= 0).
        quota_units: Provider-normalized free-tier fraction consumed (>= 0).
    """

    calls: int = Field(default=0, ge=0, description="Number of HTTP requests made.")
    prompt_tokens: int = Field(default=0, ge=0, description="Total prompt/input tokens.")
    completion_tokens: int = Field(default=0, ge=0, description="Total completion/output tokens.")
    quota_units: float = Field(
        default=0.0, ge=0.0, description="Provider-normalized quota fraction."
    )

    model_config = {"frozen": True}

    def __repr__(self) -> str:
        # Display-only formatting: quota is shown with four decimals here even
        # though the stored value may carry more precision.
        return (
            f"Usage(calls={self.calls}, prompt={self.prompt_tokens}, "
            f"completion={self.completion_tokens}, quota={self.quota_units:.4f})"
        )

    def __add__(self, other: Usage) -> Usage:
        """Accumulate usage across multiple API calls within a single run.

        Args:
            other: Another Usage instance to merge.

        Returns:
            A new Usage with summed fields. ``quota_units`` is rounded to six
            decimal places to suppress floating-point noise.
        """
        return Usage(
            calls=self.calls + other.calls,
            prompt_tokens=self.prompt_tokens + other.prompt_tokens,
            completion_tokens=self.completion_tokens + other.completion_tokens,
            # Keep six decimals when accumulating. Per-call fractions can be
            # far below 1e-4 (one request out of a 14,400/day allowance is
            # about 0.000069); rounding the running total to four decimals
            # would bump it by a full 0.0001 on every addition and overstate
            # the quota consumed.
            quota_units=round(self.quota_units + other.quota_units, 6),
        )
|
|
157
|
+
|
|
158
|
+
|
|
159
|
+
class AgentRunResult(BaseModel):
    """Normalized result returned by built-in adapters.

    Wraps the agent's proposed fixes alongside usage accounting,
    step count, and optional model identification for reproducibility.
    Only ``fixes`` is required; every other field has a default.

    Attributes:
        fixes: Ordered list of proposed cell repairs.
        usage: Provider usage accounting for this run; defaults to an
            all-zero ``Usage``.
        steps: Number of reasoning steps the agent performed (>= 0,
            default 1).
        model: Provider model identifier for reproducibility, or ``None``.
        warnings: Non-fatal diagnostic messages from the adapter; defaults to
            an empty list.
    """

    fixes: list[Fix]
    # ``default_factory`` builds a fresh default per instance.
    usage: Usage = Field(default_factory=Usage)
    steps: int = Field(default=1, ge=0, description="Reasoning steps performed.")
    model: str | None = Field(default=None, description="Provider model identifier.")
    warnings: list[str] = Field(default_factory=list)
|
|
178
|
+
|
|
179
|
+
|
|
180
|
+
@runtime_checkable
class Agent(Protocol):
    """Protocol implemented by every data-quality repair agent adapter.

    Any object with a ``name`` attribute and a ``run`` method matching
    this signature can be used as an agent in the evaluation harness.
    The agent receives an ``AgentTask`` and returns proposed ``Fix`` objects,
    either as a plain list or wrapped in an ``AgentRunResult``.
    Agents must never set their own metrics — the grader is the sole
    source of truth.

    Because the protocol is ``runtime_checkable``, ``isinstance(obj, Agent)``
    works, but it only checks that ``name`` and ``run`` exist; it does not
    validate the signature of ``run``.

    Example:
        >>> class MyAgent:
        ...     name = "my-agent"
        ...     def run(self, task: AgentTask) -> list[Fix]:
        ...         return [Fix(row=0, column="Score", new_value="4.5")]
    """

    # Identifier of the agent.
    name: str

    def run(self, task: AgentTask) -> list[Fix] | AgentRunResult:
        """Run the agent on a task and return proposed fixes.

        Args:
            task: The data-quality repair task to evaluate.

        Returns:
            A list of Fix objects or an AgentRunResult with usage accounting.
        """
        ...
|
|
@@ -0,0 +1,65 @@
|
|
|
1
|
+
"""Cerebras-hosted Llama adapter for data-quality repair evaluation.
|
|
2
|
+
|
|
3
|
+
Cerebras free tier (as of 2026-05-01): 30 RPM, 1,000 RPD on Llama 3.1 70B.
|
|
4
|
+
Quota unit = fraction of the daily request allocation consumed per API call.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
from __future__ import annotations
|
|
8
|
+
|
|
9
|
+
from dataforge_evals.agents.provider_base import ChatProviderAgent
|
|
10
|
+
|
|
11
|
+
# Daily request ceiling of the Cerebras free tier (as of 2026-05-01); used as
# the denominator when normalizing quota usage.
_CEREBRAS_FREE_DAILY_REQUESTS = 1_000


class CerebrasLlamaAgent(ChatProviderAgent):
    """Adapter that evaluates a Cerebras-hosted Llama model on repair tasks.

    Requests go to Cerebras' OpenAI-compatible chat-completions endpoint and
    default to the ``llama3.1-70b`` model.  Quota usage is reported as a
    fraction of the free-tier daily request allocation.

    Attributes:
        name: CLI identifier ``"cerebras-llama"``.
        provider: ``"cerebras"``.
    """

    name = "cerebras-llama"
    provider = "cerebras"

    def __init__(
        self,
        *,
        api_key: str,
        model: str = "llama3.1-70b",
        http_timeout_s: float = 15.0,
    ) -> None:
        """Set up the adapter by delegating to ``ChatProviderAgent``.

        Args:
            api_key: Cerebras API key (``CEREBRAS_API_KEY``).
            model: Cerebras model identifier.
            http_timeout_s: Per-request HTTP timeout in seconds.
        """
        super().__init__(
            api_key=api_key,
            model=model,
            http_timeout_s=http_timeout_s,
        )

    @property
    def endpoint(self) -> str:
        """URL of the Cerebras OpenAI-compatible chat-completions endpoint."""
        return "https://api.cerebras.ai/v1/chat/completions"

    def headers(self) -> dict[str, str]:
        """Build the HTTP headers for a request: bearer auth plus JSON content type."""
        # ``_api_key`` is not assigned in this class; presumably the base
        # class constructor stores it.
        bearer = f"Bearer {self._api_key}"
        return {
            "Authorization": bearer,
            "Content-Type": "application/json",
        }

    def quota_units(self, *, calls: int, prompt_tokens: int, completion_tokens: int) -> float:
        """Convert a request count into a fraction of the daily free-tier quota.

        Args:
            calls: Number of HTTP requests made.
            prompt_tokens: Prompt tokens; accepted for interface parity but not
                used in the calculation.
            completion_tokens: Completion tokens; accepted for interface parity
                but not used in the calculation.

        Returns:
            ``calls`` divided by the free-tier daily request limit, rounded to
            six decimal places.
        """
        consumed = calls / _CEREBRAS_FREE_DAILY_REQUESTS
        return round(consumed, 6)
|