dataforge-07-evals 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (34) hide show
  1. dataforge_07_evals-0.1.0/PKG-INFO +263 -0
  2. dataforge_07_evals-0.1.0/README.md +223 -0
  3. dataforge_07_evals-0.1.0/dataforge_07_evals.egg-info/PKG-INFO +263 -0
  4. dataforge_07_evals-0.1.0/dataforge_07_evals.egg-info/SOURCES.txt +32 -0
  5. dataforge_07_evals-0.1.0/dataforge_07_evals.egg-info/dependency_links.txt +1 -0
  6. dataforge_07_evals-0.1.0/dataforge_07_evals.egg-info/entry_points.txt +3 -0
  7. dataforge_07_evals-0.1.0/dataforge_07_evals.egg-info/requires.txt +25 -0
  8. dataforge_07_evals-0.1.0/dataforge_07_evals.egg-info/top_level.txt +1 -0
  9. dataforge_07_evals-0.1.0/dataforge_evals/__init__.py +37 -0
  10. dataforge_07_evals-0.1.0/dataforge_evals/agents/__init__.py +23 -0
  11. dataforge_07_evals-0.1.0/dataforge_evals/agents/base.py +208 -0
  12. dataforge_07_evals-0.1.0/dataforge_evals/agents/cerebras_llama.py +65 -0
  13. dataforge_07_evals-0.1.0/dataforge_evals/agents/gemini_flash.py +159 -0
  14. dataforge_07_evals-0.1.0/dataforge_evals/agents/groq_llama.py +66 -0
  15. dataforge_07_evals-0.1.0/dataforge_evals/agents/hf_local.py +209 -0
  16. dataforge_07_evals-0.1.0/dataforge_evals/agents/local_ollama.py +62 -0
  17. dataforge_07_evals-0.1.0/dataforge_evals/agents/mock.py +41 -0
  18. dataforge_07_evals-0.1.0/dataforge_evals/agents/openrouter.py +66 -0
  19. dataforge_07_evals-0.1.0/dataforge_evals/agents/provider_base.py +319 -0
  20. dataforge_07_evals-0.1.0/dataforge_evals/cli.py +326 -0
  21. dataforge_07_evals-0.1.0/dataforge_evals/grader.py +152 -0
  22. dataforge_07_evals-0.1.0/dataforge_evals/harness.py +406 -0
  23. dataforge_07_evals-0.1.0/dataforge_evals/py.typed +1 -0
  24. dataforge_07_evals-0.1.0/dataforge_evals/repair_contract.py +332 -0
  25. dataforge_07_evals-0.1.0/dataforge_evals/report.py +182 -0
  26. dataforge_07_evals-0.1.0/dataforge_evals/tasks.py +250 -0
  27. dataforge_07_evals-0.1.0/pyproject.toml +106 -0
  28. dataforge_07_evals-0.1.0/setup.cfg +4 -0
  29. dataforge_07_evals-0.1.0/tests/test_grader.py +225 -0
  30. dataforge_07_evals-0.1.0/tests/test_harness.py +339 -0
  31. dataforge_07_evals-0.1.0/tests/test_hf_local_agent.py +139 -0
  32. dataforge_07_evals-0.1.0/tests/test_package_naming.py +19 -0
  33. dataforge_07_evals-0.1.0/tests/test_provider_base.py +91 -0
  34. dataforge_07_evals-0.1.0/tests/test_release_truth.py +21 -0
@@ -0,0 +1,263 @@
1
+ Metadata-Version: 2.4
2
+ Name: dataforge_07_evals
3
+ Version: 0.1.0
4
+ Summary: Agent-agnostic evaluation harness for data-quality repair agents.
5
+ License-Expression: Apache-2.0
6
+ Project-URL: Homepage, https://github.com/Aegis15/dataforge
7
+ Project-URL: Repository, https://github.com/Aegis15/dataforge
8
+ Project-URL: Documentation, https://dataforge.praneshrajan15.workers.dev/playground
9
+ Keywords: data-quality,evaluation,agents,benchmarks,llm
10
+ Classifier: Development Status :: 3 - Alpha
11
+ Classifier: Programming Language :: Python :: 3.11
12
+ Classifier: Programming Language :: Python :: 3.12
13
+ Classifier: Intended Audience :: Developers
14
+ Classifier: Intended Audience :: Science/Research
15
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
16
+ Classifier: Typing :: Typed
17
+ Requires-Python: <3.13,>=3.11
18
+ Description-Content-Type: text/markdown
19
+ Requires-Dist: httpx>=0.27
20
+ Requires-Dist: pandas>=2.2
21
+ Requires-Dist: pydantic>=2.7
22
+ Requires-Dist: python-dotenv>=1.0
23
+ Requires-Dist: rich>=13.7
24
+ Requires-Dist: tenacity>=8.3
25
+ Requires-Dist: typer>=0.12
26
+ Provides-Extra: dataforge
27
+ Requires-Dist: dataforge_07>=0.1.0; extra == "dataforge"
28
+ Provides-Extra: dev
29
+ Requires-Dist: pytest>=8.2; extra == "dev"
30
+ Requires-Dist: pytest-cov>=5.0; extra == "dev"
31
+ Requires-Dist: ruff>=0.11; extra == "dev"
32
+ Requires-Dist: mypy>=1.10; extra == "dev"
33
+ Requires-Dist: pandas-stubs>=2.2; extra == "dev"
34
+ Provides-Extra: all
35
+ Requires-Dist: dataforge_07_evals[dataforge,dev]; extra == "all"
36
+ Provides-Extra: hf
37
+ Requires-Dist: transformers>=4.44; extra == "hf"
38
+ Requires-Dist: torch>=2.3; extra == "hf"
39
+ Requires-Dist: accelerate>=0.33; extra == "hf"
40
+
41
+ # dataforge-evals
42
+
43
+ `dataforge-evals` is an agent-agnostic evaluation harness for data-quality repair agents.
44
+
45
+ It gives any agent the same task, accepts only proposed cell fixes, and lets the grader compute exact precision, recall, F1, steps, failures, and free-tier quota usage. The harness can load DataForge's canonical Hospital, Flights, and Beers benchmark tasks when `dataforge_07` is installed, while the import namespace remains `dataforge` for the 0.1 line.
46
+ The PyPI package is not published yet; use the source install instructions
47
+ below until release ownership is configured.
48
+
49
+ ```bash
50
+ pip install -e ".[dev]"
51
+ dataforge-evals run --agent mock --dataset synthetic --trials 3
52
+ ```
53
+
54
+ ## Install
55
+
56
+ ### From source (development)
57
+
58
+ ```bash
59
+ python -m venv .venv
60
+ # Linux/macOS:
61
+ source .venv/bin/activate
62
+ # Windows PowerShell:
63
+ .\.venv\Scripts\Activate.ps1
64
+
65
+ pip install -e ".[dev]"
66
+ ```
67
+
68
+ ### With canonical DataForge datasets
69
+
70
+ ```bash
71
+ pip install -e "../data_quality_env"
72
+ dataforge-evals run --agent mock --dataset hospital --trials 3
73
+ ```
74
+
75
+ ## Run a provider
76
+
77
+ ```bash
78
+ export GROQ_API_KEY=...  # Windows cmd: set GROQ_API_KEY=...
79
+ dataforge-evals run --agent groq-llama-70b --dataset hospital --trials 3 --output reports/groq-hospital.md
80
+ ```
81
+
82
+ ### Bounded Groq smoke test
83
+
84
+ Use a single synthetic trial to verify Groq wiring without turning the smoke
85
+ check into a benchmark:
86
+
87
+ ```bash
88
+ dataforge-evals run --agent groq-llama-70b --dataset synthetic --trials 1 --seed 0 --timeout-s 20 --output reports/groq-synthetic-smoke.md --output-json reports/groq-synthetic-smoke.json
89
+ ```
90
+
91
+ For this smoke path, `trials_completed=1` and `Failures=none` prove the
92
+ integration completed successfully. F1 is a quality signal for the model's
93
+ proposed repairs, not the API health check. The JSON report includes the
94
+ normalized proposed `fixes` for debugging; Markdown stays summary-only.
95
+
96
+ ### Built-in adapters
97
+
98
+ | Agent ID | Provider | Required Setup |
99
+ | --- | --- | --- |
100
+ | `mock` | local deterministic oracle for tests | none |
101
+ | `groq-llama-70b` | Groq | `GROQ_API_KEY` |
102
+ | `gemini-flash` | Gemini | `GEMINI_API_KEY` |
103
+ | `cerebras-llama` | Cerebras | `CEREBRAS_API_KEY` |
104
+ | `openrouter` | OpenRouter | `OPENROUTER_API_KEY` |
105
+ | `local-ollama` | local Ollama OpenAI-compatible endpoint | Ollama server on `localhost:11434` |
106
+ | `hf-local` | Hugging Face Transformers | optional `HF_TOKEN`; install `.[hf]` |
107
+
108
+ ### Evaluating the historical DataForge SFT checkpoint
109
+
110
+ Use `hf-local` for base-vs-SFT checks with the same exact-match grader used by
111
+ hosted providers:
112
+
113
+ ```bash
114
+ pip install -e ".[hf]"
115
+ dataforge-evals run --agent hf-local --dataset synthetic --trials 1 \
116
+ --model-id Praneshrajan15/DataForge-0.5B-SFT \
117
+ --output reports/dataforge-sft-smoke.md \
118
+ --output-json reports/dataforge-sft-smoke.json
119
+ ```
120
+
121
+ If `--model-id` is omitted, the adapter uses `DATAFORGE_EVAL_MODEL`, then the
122
+ authenticated `HF_TOKEN` user's `DataForge-0.5B-SFT`, then
123
+ `Praneshrajan15/DataForge-0.5B-SFT`.
124
+
125
+ ### Discover agents and datasets
126
+
127
+ ```bash
128
+ dataforge-evals list-agents
129
+ dataforge-evals list-datasets
130
+ ```
131
+
132
+ ## Custom CSV-pair evaluation
133
+
134
+ Bring your own dirty and clean CSV files:
135
+
136
+ ```bash
137
+ dataforge-evals run --agent mock --dataset my-data \
138
+ --dirty-csv path/to/dirty.csv \
139
+ --clean-csv path/to/clean.csv \
140
+ --trials 3
141
+ ```
142
+
143
+ The dirty and clean CSVs must have the same number of rows and columns. Column names are taken from the clean file.
144
+
145
+ ## Agent protocol
146
+
147
+ Any agent can plug in by implementing:
148
+
149
+ ```python
150
+ from dataforge_evals import AgentTask, Fix
151
+
152
+ class MyAgent:
153
+     name = "my-agent"
154
+
155
+     def run(self, task: AgentTask) -> list[Fix]:
156
+         return [Fix(row=0, column="Score", new_value="4.5", reason="example")]
157
+ ```
158
+
159
+ Agents never report their own score. They return candidate fixes only. The grader is the only source of truth.
160
+ Normal agents receive a label-hidden `AgentTask`; only the built-in `mock`
161
+ oracle used by tests is marked to receive full ground truth.
162
+
163
+ ### What agents receive
164
+
165
+ - `task.name` — dataset identifier
166
+ - `task.dirty_df` — pandas DataFrame with data-quality issues (all values as strings)
167
+ - `task.canonical_columns` — ordered column names from the clean reference
168
+ - `task.metadata` — provenance and descriptive metadata
169
+
170
+ ### What agents return
171
+
172
+ Either a `list[Fix]` or an `AgentRunResult` with usage accounting:
173
+
174
+ ```python
175
+ from dataforge_evals import AgentRunResult, Fix, Usage
176
+
177
+ return AgentRunResult(
178
+ fixes=[Fix(row=0, column="Score", new_value="4.5")],
179
+ usage=Usage(calls=1, prompt_tokens=500, completion_tokens=100, quota_units=0.001),
180
+ steps=1,
181
+ model="my-model-v1",
182
+ )
183
+ ```
184
+
185
+ ## What is graded
186
+
187
+ A `Fix` is correct only when `(row, column, new_value)` exactly matches a ground-truth dirty-to-clean cell correction. Duplicate predictions for the same cell use last-write-wins normalization. A wrong value on the right cell counts as both a false positive and a false negative.
188
+
189
+ ## Quota accounting
190
+
191
+ Each report uses provider-normalized free-tier quota units rather than dollars. Built-in adapters record raw calls, prompt tokens, completion tokens, and quota units.
192
+
193
+ Provider-specific normalization (as of 2026-05-01):
194
+
195
+ | Provider | Free-tier basis | 1 quota unit = |
196
+ | --- | --- | --- |
197
+ | Groq | 14,400 RPD | 1 request |
198
+ | Gemini | 1,500 RPD | 1 request |
199
+ | Cerebras | 1,000 RPD | 1 request |
200
+ | OpenRouter | Nominal 1,000 RPD | 1 request |
201
+ | Ollama | unlimited (local) | always 0 |
202
+
203
+ On HTTP 429, the adapter waits with exponential backoff and logs `waiting N seconds for quota reset` to stderr. It does not fall back to another provider because fallback would contaminate the comparison.
204
+
205
+ ## Reproducibility
206
+
207
+ Each report records:
208
+
209
+ - `dataforge-evals` commit hash
210
+ - `dataforge` source commit hash when canonical datasets are loaded through DataForge
211
+ - exact seeds
212
+ - provider model identifiers
213
+ - UTC run date
214
+ - dependency versions (pandas, pydantic, httpx, etc.)
215
+ - an explicit nondeterminism note
216
+
217
+ Deterministic and mock agents reproduce exactly from the recorded seeds. Hosted LLM providers may still change outputs because providers can update model weights, routing, safety systems, or tokenization without notice.
218
+
219
+ ### Reproducibility limitations
220
+
221
+ - Provider model identifiers (e.g., `llama-3.3-70b-versatile`) may point to different weights on different dates.
222
+ - Token counts and quota units depend on provider-side tokenization, which can change.
223
+ - Network latency, rate limiting, and provider availability affect runtime measurements.
224
+ - Temperature 0 does not guarantee determinism across all providers.
225
+
226
+ ## Not a leaderboard by default
227
+
228
+ Only compare reports when dataset versions, seeds, provider model identifiers, run date, and prompt/adapter code are identical. Otherwise the report is an evaluation artifact, not a leaderboard row.
229
+
230
+ ## When dataforge-evals is the wrong tool
231
+
232
+ Do not use `dataforge-evals` if:
233
+
234
+ - **Your agent operates on streaming data** — the harness is batch-oriented and expects a complete dirty DataFrame.
235
+ - **You need end-to-end pipeline evaluation** — this tool evaluates cell-level repair accuracy, not detection, diagnosis, or pipeline orchestration.
236
+ - **Your ground truth is fuzzy or approximate** — the grader uses exact string match. If multiple correct values exist for a cell, you need a custom grader.
237
+ - **You need sub-second latency benchmarking** — the harness measures wall-clock time but is not designed as a latency benchmarking tool.
238
+ - **Your data has more than 100K rows** — the harness loads the full DataFrame into memory and passes it to agents. For large-scale evaluation, sample first.
239
+
240
+ ## Development
241
+
242
+ ```bash
243
+ make setup # pip install -e ".[dev]"
244
+ make lint # ruff check
245
+ make format # ruff format --check
246
+ make type # mypy --strict
247
+ make test # pytest
248
+ make test-cov # pytest with coverage
249
+ make smoke # end-to-end smoke test with mock agent
250
+ ```
251
+
252
+ ## Environment Variables
253
+
254
+ Provider keys belong in a root `.env` file (gitignored) loaded with `python-dotenv`:
255
+
256
+ - `GROQ_API_KEY`
257
+ - `GEMINI_API_KEY`
258
+ - `CEREBRAS_API_KEY`
259
+ - `OPENROUTER_API_KEY`
260
+
261
+ ## License
262
+
263
+ Apache-2.0.
@@ -0,0 +1,223 @@
1
+ # dataforge-evals
2
+
3
+ `dataforge-evals` is an agent-agnostic evaluation harness for data-quality repair agents.
4
+
5
+ It gives any agent the same task, accepts only proposed cell fixes, and lets the grader compute exact precision, recall, F1, steps, failures, and free-tier quota usage. The harness can load DataForge's canonical Hospital, Flights, and Beers benchmark tasks when `dataforge_07` is installed, while the import namespace remains `dataforge` for the 0.1 line.
6
+ The PyPI package is not published yet; use the source install instructions
7
+ below until release ownership is configured.
8
+
9
+ ```bash
10
+ pip install -e ".[dev]"
11
+ dataforge-evals run --agent mock --dataset synthetic --trials 3
12
+ ```
13
+
14
+ ## Install
15
+
16
+ ### From source (development)
17
+
18
+ ```bash
19
+ python -m venv .venv
20
+ # Linux/macOS:
21
+ source .venv/bin/activate
22
+ # Windows PowerShell:
23
+ .\.venv\Scripts\Activate.ps1
24
+
25
+ pip install -e ".[dev]"
26
+ ```
27
+
28
+ ### With canonical DataForge datasets
29
+
30
+ ```bash
31
+ pip install -e "../data_quality_env"
32
+ dataforge-evals run --agent mock --dataset hospital --trials 3
33
+ ```
34
+
35
+ ## Run a provider
36
+
37
+ ```bash
38
+ export GROQ_API_KEY=...  # Windows cmd: set GROQ_API_KEY=...
39
+ dataforge-evals run --agent groq-llama-70b --dataset hospital --trials 3 --output reports/groq-hospital.md
40
+ ```
41
+
42
+ ### Bounded Groq smoke test
43
+
44
+ Use a single synthetic trial to verify Groq wiring without turning the smoke
45
+ check into a benchmark:
46
+
47
+ ```bash
48
+ dataforge-evals run --agent groq-llama-70b --dataset synthetic --trials 1 --seed 0 --timeout-s 20 --output reports/groq-synthetic-smoke.md --output-json reports/groq-synthetic-smoke.json
49
+ ```
50
+
51
+ For this smoke path, `trials_completed=1` and `Failures=none` prove the
52
+ integration completed successfully. F1 is a quality signal for the model's
53
+ proposed repairs, not the API health check. The JSON report includes the
54
+ normalized proposed `fixes` for debugging; Markdown stays summary-only.
55
+
56
+ ### Built-in adapters
57
+
58
+ | Agent ID | Provider | Required Setup |
59
+ | --- | --- | --- |
60
+ | `mock` | local deterministic oracle for tests | none |
61
+ | `groq-llama-70b` | Groq | `GROQ_API_KEY` |
62
+ | `gemini-flash` | Gemini | `GEMINI_API_KEY` |
63
+ | `cerebras-llama` | Cerebras | `CEREBRAS_API_KEY` |
64
+ | `openrouter` | OpenRouter | `OPENROUTER_API_KEY` |
65
+ | `local-ollama` | local Ollama OpenAI-compatible endpoint | Ollama server on `localhost:11434` |
66
+ | `hf-local` | Hugging Face Transformers | optional `HF_TOKEN`; install `.[hf]` |
67
+
68
+ ### Evaluating the historical DataForge SFT checkpoint
69
+
70
+ Use `hf-local` for base-vs-SFT checks with the same exact-match grader used by
71
+ hosted providers:
72
+
73
+ ```bash
74
+ pip install -e ".[hf]"
75
+ dataforge-evals run --agent hf-local --dataset synthetic --trials 1 \
76
+ --model-id Praneshrajan15/DataForge-0.5B-SFT \
77
+ --output reports/dataforge-sft-smoke.md \
78
+ --output-json reports/dataforge-sft-smoke.json
79
+ ```
80
+
81
+ If `--model-id` is omitted, the adapter uses `DATAFORGE_EVAL_MODEL`, then the
82
+ authenticated `HF_TOKEN` user's `DataForge-0.5B-SFT`, then
83
+ `Praneshrajan15/DataForge-0.5B-SFT`.
84
+
85
+ ### Discover agents and datasets
86
+
87
+ ```bash
88
+ dataforge-evals list-agents
89
+ dataforge-evals list-datasets
90
+ ```
91
+
92
+ ## Custom CSV-pair evaluation
93
+
94
+ Bring your own dirty and clean CSV files:
95
+
96
+ ```bash
97
+ dataforge-evals run --agent mock --dataset my-data \
98
+ --dirty-csv path/to/dirty.csv \
99
+ --clean-csv path/to/clean.csv \
100
+ --trials 3
101
+ ```
102
+
103
+ The dirty and clean CSVs must have the same number of rows and columns. Column names are taken from the clean file.
104
+
105
+ ## Agent protocol
106
+
107
+ Any agent can plug in by implementing:
108
+
109
+ ```python
110
+ from dataforge_evals import AgentTask, Fix
111
+
112
+ class MyAgent:
113
+     name = "my-agent"
114
+
115
+     def run(self, task: AgentTask) -> list[Fix]:
116
+         return [Fix(row=0, column="Score", new_value="4.5", reason="example")]
117
+ ```
118
+
119
+ Agents never report their own score. They return candidate fixes only. The grader is the only source of truth.
120
+ Normal agents receive a label-hidden `AgentTask`; only the built-in `mock`
121
+ oracle used by tests is marked to receive full ground truth.
122
+
123
+ ### What agents receive
124
+
125
+ - `task.name` — dataset identifier
126
+ - `task.dirty_df` — pandas DataFrame with data-quality issues (all values as strings)
127
+ - `task.canonical_columns` — ordered column names from the clean reference
128
+ - `task.metadata` — provenance and descriptive metadata
129
+
130
+ ### What agents return
131
+
132
+ Either a `list[Fix]` or an `AgentRunResult` with usage accounting:
133
+
134
+ ```python
135
+ from dataforge_evals import AgentRunResult, Fix, Usage
136
+
137
+ return AgentRunResult(
138
+ fixes=[Fix(row=0, column="Score", new_value="4.5")],
139
+ usage=Usage(calls=1, prompt_tokens=500, completion_tokens=100, quota_units=0.001),
140
+ steps=1,
141
+ model="my-model-v1",
142
+ )
143
+ ```
144
+
145
+ ## What is graded
146
+
147
+ A `Fix` is correct only when `(row, column, new_value)` exactly matches a ground-truth dirty-to-clean cell correction. Duplicate predictions for the same cell use last-write-wins normalization. A wrong value on the right cell counts as both a false positive and a false negative.
148
+
149
+ ## Quota accounting
150
+
151
+ Each report uses provider-normalized free-tier quota units rather than dollars. Built-in adapters record raw calls, prompt tokens, completion tokens, and quota units.
152
+
153
+ Provider-specific normalization (as of 2026-05-01):
154
+
155
+ | Provider | Free-tier basis | 1 quota unit = |
156
+ | --- | --- | --- |
157
+ | Groq | 14,400 RPD | 1 request |
158
+ | Gemini | 1,500 RPD | 1 request |
159
+ | Cerebras | 1,000 RPD | 1 request |
160
+ | OpenRouter | Nominal 1,000 RPD | 1 request |
161
+ | Ollama | unlimited (local) | always 0 |
162
+
163
+ On HTTP 429, the adapter waits with exponential backoff and logs `waiting N seconds for quota reset` to stderr. It does not fall back to another provider because fallback would contaminate the comparison.
164
+
165
+ ## Reproducibility
166
+
167
+ Each report records:
168
+
169
+ - `dataforge-evals` commit hash
170
+ - `dataforge` source commit hash when canonical datasets are loaded through DataForge
171
+ - exact seeds
172
+ - provider model identifiers
173
+ - UTC run date
174
+ - dependency versions (pandas, pydantic, httpx, etc.)
175
+ - an explicit nondeterminism note
176
+
177
+ Deterministic and mock agents reproduce exactly from the recorded seeds. Hosted LLM providers may still change outputs because providers can update model weights, routing, safety systems, or tokenization without notice.
178
+
179
+ ### Reproducibility limitations
180
+
181
+ - Provider model identifiers (e.g., `llama-3.3-70b-versatile`) may point to different weights on different dates.
182
+ - Token counts and quota units depend on provider-side tokenization, which can change.
183
+ - Network latency, rate limiting, and provider availability affect runtime measurements.
184
+ - Temperature 0 does not guarantee determinism across all providers.
185
+
186
+ ## Not a leaderboard by default
187
+
188
+ Only compare reports when dataset versions, seeds, provider model identifiers, run date, and prompt/adapter code are identical. Otherwise the report is an evaluation artifact, not a leaderboard row.
189
+
190
+ ## When dataforge-evals is the wrong tool
191
+
192
+ Do not use `dataforge-evals` if:
193
+
194
+ - **Your agent operates on streaming data** — the harness is batch-oriented and expects a complete dirty DataFrame.
195
+ - **You need end-to-end pipeline evaluation** — this tool evaluates cell-level repair accuracy, not detection, diagnosis, or pipeline orchestration.
196
+ - **Your ground truth is fuzzy or approximate** — the grader uses exact string match. If multiple correct values exist for a cell, you need a custom grader.
197
+ - **You need sub-second latency benchmarking** — the harness measures wall-clock time but is not designed as a latency benchmarking tool.
198
+ - **Your data has more than 100K rows** — the harness loads the full DataFrame into memory and passes it to agents. For large-scale evaluation, sample first.
199
+
200
+ ## Development
201
+
202
+ ```bash
203
+ make setup # pip install -e ".[dev]"
204
+ make lint # ruff check
205
+ make format # ruff format --check
206
+ make type # mypy --strict
207
+ make test # pytest
208
+ make test-cov # pytest with coverage
209
+ make smoke # end-to-end smoke test with mock agent
210
+ ```
211
+
212
+ ## Environment Variables
213
+
214
+ Provider keys belong in a root `.env` file (gitignored) loaded with `python-dotenv`:
215
+
216
+ - `GROQ_API_KEY`
217
+ - `GEMINI_API_KEY`
218
+ - `CEREBRAS_API_KEY`
219
+ - `OPENROUTER_API_KEY`
220
+
221
+ ## License
222
+
223
+ Apache-2.0.