evaluatorq-1.0.0.tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- evaluatorq-1.0.0/.gitignore +51 -0
- evaluatorq-1.0.0/.python-version +1 -0
- evaluatorq-1.0.0/PKG-INFO +500 -0
- evaluatorq-1.0.0/README.md +487 -0
- evaluatorq-1.0.0/project.json +74 -0
- evaluatorq-1.0.0/pyproject.toml +42 -0
- evaluatorq-1.0.0/src/evaluatorq/__init__.py +42 -0
- evaluatorq-1.0.0/src/evaluatorq/evaluatorq.py +181 -0
- evaluatorq-1.0.0/src/evaluatorq/fetch_data.py +76 -0
- evaluatorq-1.0.0/src/evaluatorq/job_helper.py +103 -0
- evaluatorq-1.0.0/src/evaluatorq/processings.py +225 -0
- evaluatorq-1.0.0/src/evaluatorq/progress.py +220 -0
- evaluatorq-1.0.0/src/evaluatorq/py.typed +0 -0
- evaluatorq-1.0.0/src/evaluatorq/send_results.py +122 -0
- evaluatorq-1.0.0/src/evaluatorq/table_display.py +280 -0
- evaluatorq-1.0.0/src/evaluatorq/types.py +108 -0
- evaluatorq-1.0.0/src/py.typed +0 -0
- evaluatorq-1.0.0/uv.lock +646 -0
evaluatorq-1.0.0/.gitignore

@@ -0,0 +1,51 @@
# See https://docs.github.com/en/get-started/getting-started-with-git/ignoring-files for more about ignoring files.

# compiled output
dist
tmp
out-tsc
*.tsbuildinfo
**/__pycache__/**
*.egg-info/
*.egg

# dependencies
node_modules

# IDEs and editors
/.idea
.project
.classpath
.c9/
*.launch
.settings/
*.sublime-workspace

# IDE - VSCode
.vscode/*
!.vscode/settings.json
!.vscode/tasks.json
!.vscode/launch.json
!.vscode/extensions.json

# misc
/.sass-cache
/connect.lock
/coverage
/libpeerconnection.log
npm-debug.log
yarn-error.log
testem.log
/typings

# System Files
.DS_Store
Thumbs.db

.nx/cache
.nx/workspace-data
.cursor/rules/nx-rules.mdc
.github/instructions/nx.instructions.md
.npmrc
.venv
**/*.tsbuildinfo
evaluatorq-1.0.0/.python-version

@@ -0,0 +1 @@
3.10
evaluatorq-1.0.0/PKG-INFO

@@ -0,0 +1,500 @@
Metadata-Version: 2.4
Name: evaluatorq
Version: 1.0.0
Summary: An evaluation framework library for Python that provides a flexible way to run parallel evaluations and optionally integrate with the Orq AI platform.
License: MIT
Requires-Python: >=3.10
Requires-Dist: httpx>=0.28.1
Requires-Dist: pydantic>=2.0
Requires-Dist: rich>=14.2.0
Provides-Extra: orq
Requires-Dist: orq-ai-sdk>=3.13.16; extra == 'orq'
Description-Content-Type: text/markdown

# evaluatorq-py

An evaluation framework library for Python that provides a flexible way to run parallel evaluations and optionally integrate with the Orq AI platform.

## 🎯 Features

- **Parallel Execution**: Run multiple evaluation jobs concurrently with progress tracking
- **Flexible Data Sources**: Support for inline data, async iterables, and Orq platform datasets
- **Type-safe**: Fully typed with Python type hints and Pydantic models with runtime validation
- **Rich Terminal UI**: Beautiful progress indicators and result tables powered by Rich
- **Orq Platform Integration**: Seamlessly fetch and evaluate datasets from Orq AI (optional)

## 📥 Installation

```bash
pip install evaluatorq
# or
uv add evaluatorq
# or
poetry add evaluatorq
```

### Optional Dependencies

If you want to use the Orq platform integration:

```bash
pip install orq-ai-sdk
```

The package also declares an `orq` extra in its metadata, so `pip install "evaluatorq[orq]"` should pull in the SDK in one step.

## 🚀 Quick Start

### Basic Usage

```python
import asyncio
from evaluatorq import evaluatorq, job, DataPoint, EvaluationResult

@job("text-analyzer")
async def text_analyzer(data: DataPoint, row: int):
    """Analyze text data and return analysis results."""
    text = data.inputs["text"]
    analysis = {
        "length": len(text),
        "word_count": len(text.split()),
        "uppercase": text.upper(),
    }

    return analysis

async def length_check_scorer(params):
    """Evaluate if output length is sufficient."""
    output = params["output"]
    passes_check = output["length"] > 10

    return EvaluationResult(
        value=1 if passes_check else 0,
        explanation=(
            "Output length is sufficient"
            if passes_check
            else f"Output too short ({output['length']} chars, need >10)"
        )
    )

async def main():
    await evaluatorq(
        "text-analysis",
        data=[
            DataPoint(inputs={"text": "Hello world"}),
            DataPoint(inputs={"text": "Testing evaluation"}),
        ],
        jobs=[text_analyzer],
        evaluators=[
            {
                "name": "length-check",
                "scorer": length_check_scorer,
            }
        ],
    )

if __name__ == "__main__":
    asyncio.run(main())
```

### Using Orq Platform Datasets

```python
import asyncio
from evaluatorq import evaluatorq, job, DataPoint, EvaluationResult

@job("processor")
async def processor(data: DataPoint, row: int):
    """Process each data point from the dataset."""
    result = await process_data(data)  # placeholder: your own processing logic
    return result

async def accuracy_scorer(params):
    """Calculate accuracy by comparing output with expected results."""
    data = params["data"]
    output = params["output"]

    score = calculate_score(output, data.expected_output)  # placeholder: your own scoring logic

    if score > 0.8:
        explanation = "High accuracy match"
    elif score > 0.5:
        explanation = "Partial match"
    else:
        explanation = "Low accuracy match"

    return EvaluationResult(value=score, explanation=explanation)

async def main():
    # Requires ORQ_API_KEY environment variable
    await evaluatorq(
        "dataset-evaluation",
        data={"dataset_id": "your-dataset-id"},  # From Orq platform
        jobs=[processor],
        evaluators=[
            {
                "name": "accuracy",
                "scorer": accuracy_scorer,
            }
        ],
    )

if __name__ == "__main__":
    asyncio.run(main())
```

### Advanced Features

#### Multiple Jobs

Run multiple jobs in parallel for each data point:

```python
from evaluatorq import job

@job("preprocessor")
async def preprocessor(data: DataPoint, row: int):
    result = await preprocess(data)  # preprocess/analyze/transform are placeholders
    return result

@job("analyzer")
async def analyzer(data: DataPoint, row: int):
    result = await analyze(data)
    return result

@job("transformer")
async def transformer(data: DataPoint, row: int):
    result = await transform(data)
    return result

# (inside an async function)
await evaluatorq(
    "multi-job-eval",
    data=[...],
    jobs=[preprocessor, analyzer, transformer],
    evaluators=[...],
)
```

#### The `@job()` Decorator

The `@job()` decorator provides two key benefits:

1. **Eliminates boilerplate** - No need to manually wrap returns with `{"name": ..., "output": ...}`
2. **Preserves job names in errors** - When a job fails, the error will include the job name for better debugging

**Decorator pattern (recommended):**
```python
from evaluatorq import job

@job("text-processor")
async def process_text(data: DataPoint, row: int):
    # Clean return - just the data!
    return {"result": data.inputs["text"].upper()}
```

**Functional pattern (for lambdas):**
```python
from evaluatorq import job

# Simple transformations with lambda
uppercase_job = job("uppercase", lambda data, row: data.inputs["text"].upper())
word_count_job = job("word-count", lambda data, row: len(data.inputs["text"].split()))
```

**Manual pattern (not recommended):**
```python
# Without decorator - requires manual wrapper every time
async def process_text(data: DataPoint, row: int):
    return {"name": "text-processor", "output": {"result": data.inputs["text"].upper()}}
```

#### Automatic Error Handling

The `@job()` decorator automatically preserves job names even when errors occur:

```python
from evaluatorq import job

@job("risky-job")
async def risky_operation(data: DataPoint, row: int):
    # If this raises an error, the job name "risky-job" will be preserved
    result = await potentially_failing_operation(data)  # placeholder for fallible work
    return result

await evaluatorq(
    "error-handling",
    data=[...],
    jobs=[risky_operation],
    evaluators=[...],
)

# Error output will show: "Job 'risky-job' failed: <error details>"
# Without @job decorator, you'd only see: "<error details>"
```

#### Async Data Sources

```python
import asyncio

# Create an array of coroutines for async data
async def get_data_point(i: int) -> DataPoint:
    await asyncio.sleep(0.01)  # Simulate async data fetching
    return DataPoint(inputs={"value": i})

data_promises = [get_data_point(i) for i in range(1000)]

await evaluatorq(
    "async-eval",
    data=data_promises,
    jobs=[...],
    evaluators=[...],
)
```

#### Controlling Parallelism

```python
await evaluatorq(
    "parallel-eval",
    data=[...],
    jobs=[...],
    evaluators=[...],
    parallelism=10,  # Run up to 10 jobs concurrently
)
```

#### Disable Progress Display

```python
# Get raw results without terminal output
results = await evaluatorq(
    "silent-eval",
    data=[...],
    jobs=[...],
    evaluators=[...],
    print_results=False,  # Disable progress and table display
)

# Process results programmatically
for result in results:
    print(result.data_point.inputs)
    for job_result in result.job_results:
        print(f"{job_result.job_name}: {job_result.output}")
```

## 🔧 Configuration

### Environment Variables

- `ORQ_API_KEY`: API key for Orq platform integration (required for dataset access and sending results)
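
If you want to fail fast when the key is missing, a minimal pre-flight check might look like the sketch below (how the library itself reports a missing key is not documented in this README):

```python
import os

# Hedged sketch: verify the key is present before a dataset-backed run.
if not os.environ.get("ORQ_API_KEY"):
    raise RuntimeError(
        "ORQ_API_KEY is not set; Orq dataset access and result upload will not work"
    )
```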

### Evaluation Parameters

Parameters are validated at runtime using Pydantic. The `evaluatorq` function supports three calling styles:

```python
from evaluatorq import evaluatorq, EvaluatorParams

# 1. Keyword arguments (recommended)
await evaluatorq(
    "my-eval",
    data=[...],
    jobs=[...],
    parallelism=5,
)

# 2. Dict style
await evaluatorq("my-eval", {
    "data": [...],
    "jobs": [...],
    "parallelism": 5,
})

# 3. EvaluatorParams instance
await evaluatorq("my-eval", EvaluatorParams(
    data=[...],
    jobs=[...],
    parallelism=5,
))
```

#### Parameter Reference

| Parameter | Type | Default | Description |
|-----------|------|---------|-------------|
| `data` | `list[DataPoint]` \| `list[Awaitable[DataPoint]]` \| `DatasetIdInput` | **required** | Data to evaluate |
| `jobs` | `list[Job]` | **required** | Jobs to run on each data point |
| `evaluators` | `list[Evaluator]` \| `None` | `None` | Evaluators to score job outputs |
| `parallelism` | `int` (≥1) | `1` | Number of concurrent jobs |
| `print_results` | `bool` | `True` | Display progress and results table |
| `description` | `str` \| `None` | `None` | Optional evaluation description |
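
The dataset example earlier passed `data={"dataset_id": ...}` as a plain dict. Since parameters are validated with Pydantic, the explicit model form should be equivalent; the sketch below assumes `DatasetIdInput` is importable from the package root, which the API reference suggests but this README never shows:

```python
from evaluatorq import evaluatorq, DatasetIdInput  # DatasetIdInput import is an assumption

# Explicit-model equivalent of data={"dataset_id": "your-dataset-id"}.
await evaluatorq(
    "dataset-evaluation",
    data=DatasetIdInput(dataset_id="your-dataset-id"),
    jobs=[...],
)
```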

## 📊 Orq Platform Integration

### Automatic Result Sending

When the `ORQ_API_KEY` environment variable is set, evaluatorq automatically sends evaluation results to the Orq platform for visualization and analysis.

```python
# Results are automatically sent when ORQ_API_KEY is set
await evaluatorq(
    "my-evaluation",
    data=[...],
    jobs=[...],
    evaluators=[...],
)
```

#### What Gets Sent

When `ORQ_API_KEY` is set, the following information is sent to Orq:

- Evaluation name
- Dataset ID (when using Orq datasets)
- Job results with outputs and errors
- Evaluator scores with values and explanations
- Execution timing information

Note: Evaluator explanations are included in the data sent to Orq but are not displayed in the terminal output to keep the console clean.

#### Result Visualization

After successful submission, you'll see a console message with a link to view your results:

```
📊 View your evaluation results at: <url to the evaluation>
```

The Orq platform provides:

- Interactive result tables
- Score statistics
- Performance metrics
- Historical comparisons

## 📚 API Reference

### `evaluatorq(name, params?, *, data?, jobs?, evaluators?, parallelism?, print_results?, description?) -> EvaluatorqResult`

Main async function to run evaluations.

#### Signature:

```python
async def evaluatorq(
    name: str,
    params: EvaluatorParams | dict[str, Any] | None = None,
    *,
    data: DatasetIdInput | Sequence[Awaitable[DataPoint] | DataPoint] | None = None,
    jobs: list[Job] | None = None,
    evaluators: list[Evaluator] | None = None,
    parallelism: int = 1,
    print_results: bool = True,
    description: str | None = None,
) -> EvaluatorqResult
```

#### Parameters:

- `name`: String identifier for the evaluation run
- `params`: (Optional) `EvaluatorParams` instance or dict with evaluation parameters
- `data`: List of `DataPoint` objects, awaitables, or a `DatasetIdInput`
- `jobs`: List of job functions to run on each data point
- `evaluators`: Optional list of evaluator configurations
- `parallelism`: Number of concurrent jobs (default: 1, must be ≥1)
- `print_results`: Whether to display progress and results (default: True)
- `description`: Optional description for the evaluation run

> **Note:** Parameters can be passed either via the `params` argument (as dict or `EvaluatorParams`) or as keyword arguments. Keyword arguments take precedence over `params` values.
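
A quick illustration of that precedence rule, as a sketch derived from the note rather than from a documented example:

```python
# "parallelism" appears both in the params dict and as a keyword argument;
# per the note above, the keyword argument wins, so this run uses parallelism=5.
await evaluatorq(
    "my-eval",
    {"data": [...], "jobs": [...], "parallelism": 1},
    parallelism=5,
)
```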

#### Returns:

`EvaluatorqResult` - List of `DataPointResult` objects containing job outputs and evaluator scores.
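
To walk the full result tree, including per-job evaluator scores, here is a sketch assembled from the type definitions in the Types section below (field names follow those definitions):

```python
results = await evaluatorq("my-eval", data=[...], jobs=[...], evaluators=[...])

for dp_result in results:  # one DataPointResult per data point
    for job_result in dp_result.job_results or []:
        print(job_result.job_name, job_result.output)
        for es in job_result.evaluator_scores or []:
            # es.score is an EvaluationResult with .value and .explanation
            print(f"  {es.evaluator_name}: {es.score.value}")
```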

### Types

```python
from typing import Any, Awaitable, Callable, Sequence  # Sequence added: used by EvaluatorParams below
from pydantic import BaseModel, Field
from typing_extensions import TypedDict

# Output type alias
Output = str | int | float | bool | dict[str, Any] | None

class DataPoint(BaseModel):
    """A data point for evaluation."""
    inputs: dict[str, Any]
    expected_output: Output | None = None

class EvaluationResult(BaseModel):
    """Result from an evaluator."""
    value: str | float | bool
    explanation: str | None = None

class EvaluatorScore(BaseModel):
    """Score from an evaluator for a job output."""
    evaluator_name: str
    score: EvaluationResult
    error: str | None = None

class JobResult(BaseModel):
    """Result from a job execution."""
    job_name: str
    output: Output
    error: str | None = None
    evaluator_scores: list[EvaluatorScore] | None = None

class DataPointResult(BaseModel):
    """Result for a single data point."""
    data_point: DataPoint
    error: str | None = None
    job_results: list[JobResult] | None = None

# Type aliases
EvaluatorqResult = list[DataPointResult]

class DatasetIdInput(BaseModel):
    """Input for fetching a dataset from Orq platform."""
    dataset_id: str

class EvaluatorParams(BaseModel):
    """Parameters for running an evaluation (validated at runtime)."""
    data: DatasetIdInput | Sequence[Awaitable[DataPoint] | DataPoint]
    jobs: list[Job]
    evaluators: list[Evaluator] | None = None
    parallelism: int = Field(default=1, ge=1)
    print_results: bool = True
    description: str | None = None

class JobReturn(TypedDict):
    """Job return structure."""
    name: str
    output: Output

Job = Callable[[DataPoint, int], Awaitable[JobReturn]]

class ScorerParameter(TypedDict):
    """Parameters passed to scorer functions."""
    data: DataPoint
    output: Output

Scorer = Callable[[ScorerParameter], Awaitable[EvaluationResult]]

class Evaluator(TypedDict):
    """Evaluator configuration."""
    name: str
    scorer: Scorer
```
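
The scorers in the earlier examples took an untyped `params` dict. With the types above, the same kind of scorer can be written against `ScorerParameter` for editor support; whether `ScorerParameter` is re-exported from the package root is an assumption in this sketch:

```python
from evaluatorq import EvaluationResult, ScorerParameter  # ScorerParameter re-export is an assumption

async def exact_match_scorer(params: ScorerParameter) -> EvaluationResult:
    """Score 1 when the job output equals the data point's expected output."""
    expected = params["data"].expected_output
    matched = params["output"] == expected
    return EvaluationResult(
        value=1 if matched else 0,
        explanation=None if matched else f"expected {expected!r}",
    )
```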

## 🛠️ Development

```bash
# Install dependencies
uv sync

# Run type checking
pyright

# Format code
ruff format

# Lint code
ruff check
```