evaluatorq-1.0.0.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,51 @@
# See https://docs.github.com/en/get-started/getting-started-with-git/ignoring-files for more about ignoring files.

# compiled output
dist
tmp
out-tsc
*.tsbuildinfo
**/__pycache__/**
*.egg-info/
*.egg

# dependencies
node_modules

# IDEs and editors
/.idea
.project
.classpath
.c9/
*.launch
.settings/
*.sublime-workspace

# IDE - VSCode
.vscode/*
!.vscode/settings.json
!.vscode/tasks.json
!.vscode/launch.json
!.vscode/extensions.json

# misc
/.sass-cache
/connect.lock
/coverage
/libpeerconnection.log
npm-debug.log
yarn-error.log
testem.log
/typings

# System Files
.DS_Store
Thumbs.db

.nx/cache
.nx/workspace-data
.cursor/rules/nx-rules.mdc
.github/instructions/nx.instructions.md
.npmrc
.venv
**/*.tsbuildinfo
@@ -0,0 +1 @@
3.10
@@ -0,0 +1,500 @@
Metadata-Version: 2.4
Name: evaluatorq
Version: 1.0.0
Summary: An evaluation framework library for Python that provides a flexible way to run parallel evaluations and optionally integrate with the Orq AI platform.
License: MIT
Requires-Python: >=3.10
Requires-Dist: httpx>=0.28.1
Requires-Dist: pydantic>=2.0
Requires-Dist: rich>=14.2.0
Provides-Extra: orq
Requires-Dist: orq-ai-sdk>=3.13.16; extra == 'orq'
Description-Content-Type: text/markdown

# evaluatorq-py

An evaluation framework library for Python that provides a flexible way to run parallel evaluations and optionally integrate with the Orq AI platform.

## 🎯 Features

- **Parallel Execution**: Run multiple evaluation jobs concurrently with progress tracking
- **Flexible Data Sources**: Support for inline data, async iterables, and Orq platform datasets
- **Type-safe**: Fully typed with Python type hints and Pydantic models with runtime validation
- **Rich Terminal UI**: Beautiful progress indicators and result tables powered by Rich
- **Orq Platform Integration**: Seamlessly fetch and evaluate datasets from Orq AI (optional)

## 📥 Installation

```bash
pip install evaluatorq
# or
uv add evaluatorq
# or
poetry add evaluatorq
```

### Optional Dependencies

If you want to use the Orq platform integration:

```bash
pip install orq-ai-sdk
# or install it via the package's optional extra
pip install "evaluatorq[orq]"
```

## 🚀 Quick Start

### Basic Usage

```python
import asyncio
from evaluatorq import evaluatorq, job, DataPoint, EvaluationResult

@job("text-analyzer")
async def text_analyzer(data: DataPoint, row: int):
    """Analyze text data and return analysis results."""
    text = data.inputs["text"]
    analysis = {
        "length": len(text),
        "word_count": len(text.split()),
        "uppercase": text.upper(),
    }

    return analysis

async def length_check_scorer(params):
    """Evaluate if output length is sufficient."""
    output = params["output"]
    passes_check = output["length"] > 10

    return EvaluationResult(
        value=1 if passes_check else 0,
        explanation=(
            "Output length is sufficient"
            if passes_check
            else f"Output too short ({output['length']} chars, need >10)"
        )
    )

async def main():
    await evaluatorq(
        "text-analysis",
        data=[
            DataPoint(inputs={"text": "Hello world"}),
            DataPoint(inputs={"text": "Testing evaluation"}),
        ],
        jobs=[text_analyzer],
        evaluators=[
            {
                "name": "length-check",
                "scorer": length_check_scorer,
            }
        ],
    )

if __name__ == "__main__":
    asyncio.run(main())
```

### Using Orq Platform Datasets

```python
import asyncio
from evaluatorq import evaluatorq, job, DataPoint, EvaluationResult

@job("processor")
async def processor(data: DataPoint, row: int):
    """Process each data point from the dataset."""
    result = await process_data(data)
    return result

async def accuracy_scorer(params):
    """Calculate accuracy by comparing output with expected results."""
    data = params["data"]
    output = params["output"]

    score = calculate_score(output, data.expected_output)

    if score > 0.8:
        explanation = "High accuracy match"
    elif score > 0.5:
        explanation = "Partial match"
    else:
        explanation = "Low accuracy match"

    return EvaluationResult(value=score, explanation=explanation)

async def main():
    # Requires ORQ_API_KEY environment variable
    await evaluatorq(
        "dataset-evaluation",
        data={"dataset_id": "your-dataset-id"},  # From Orq platform
        jobs=[processor],
        evaluators=[
            {
                "name": "accuracy",
                "scorer": accuracy_scorer,
            }
        ],
    )

if __name__ == "__main__":
    asyncio.run(main())
```

### Advanced Features

#### Multiple Jobs

Run multiple jobs in parallel for each data point:

```python
from evaluatorq import evaluatorq, job, DataPoint

@job("preprocessor")
async def preprocessor(data: DataPoint, row: int):
    result = await preprocess(data)
    return result

@job("analyzer")
async def analyzer(data: DataPoint, row: int):
    result = await analyze(data)
    return result

@job("transformer")
async def transformer(data: DataPoint, row: int):
    result = await transform(data)
    return result

await evaluatorq(
    "multi-job-eval",
    data=[...],
    jobs=[preprocessor, analyzer, transformer],
    evaluators=[...],
)
```

#### The `@job()` Decorator

The `@job()` decorator provides two key benefits:

1. **Eliminates boilerplate** - No need to manually wrap returns with `{"name": ..., "output": ...}`
2. **Preserves job names in errors** - When a job fails, the error will include the job name for better debugging

**Decorator pattern (recommended):**
```python
from evaluatorq import job, DataPoint

@job("text-processor")
async def process_text(data: DataPoint, row: int):
    # Clean return - just the data!
    return {"result": data.inputs["text"].upper()}
```

**Functional pattern (for lambdas):**
```python
from evaluatorq import job

# Simple transformations with lambda
uppercase_job = job("uppercase", lambda data, row: data.inputs["text"].upper())
word_count_job = job("word-count", lambda data, row: len(data.inputs["text"].split()))
```

**Manual pattern (not recommended):**
```python
# Without decorator - requires manual wrapper every time
async def process_text(data: DataPoint, row: int):
    return {"name": "text-processor", "output": {"result": data.inputs["text"].upper()}}
```

#### Automatic Error Handling

The `@job()` decorator automatically preserves job names even when errors occur:

```python
from evaluatorq import job

@job("risky-job")
async def risky_operation(data: DataPoint, row: int):
    # If this raises an error, the job name "risky-job" will be preserved
    result = await potentially_failing_operation(data)
    return result

await evaluatorq(
    "error-handling",
    data=[...],
    jobs=[risky_operation],
    evaluators=[...],
)

# Error output will show: "Job 'risky-job' failed: <error details>"
# Without @job decorator, you'd only see: "<error details>"
```

#### Async Data Sources

```python
import asyncio
from evaluatorq import evaluatorq, DataPoint

# Create a list of coroutines for async data
async def get_data_point(i: int) -> DataPoint:
    await asyncio.sleep(0.01)  # Simulate async data fetching
    return DataPoint(inputs={"value": i})

data_promises = [get_data_point(i) for i in range(1000)]

await evaluatorq(
    "async-eval",
    data=data_promises,
    jobs=[...],
    evaluators=[...],
)
```

#### Controlling Parallelism

```python
await evaluatorq(
    "parallel-eval",
    data=[...],
    jobs=[...],
    evaluators=[...],
    parallelism=10,  # Run up to 10 jobs concurrently
)
```

#### Disable Progress Display

```python
# Get raw results without terminal output
results = await evaluatorq(
    "silent-eval",
    data=[...],
    jobs=[...],
    evaluators=[...],
    print_results=False,  # Disable progress and table display
)

# Process results programmatically
for result in results:
    print(result.data_point.inputs)
    for job_result in result.job_results:
        print(f"{job_result.job_name}: {job_result.output}")
```

## 🔧 Configuration

### Environment Variables

- `ORQ_API_KEY`: API key for Orq platform integration (required for dataset access and sending results)
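
You can verify the key is available before kicking off a run. This is a minimal sketch of such a pre-flight check, not something evaluatorq itself requires:

```python
import os

# Illustrative guard: fail fast when the Orq integration is expected but not configured.
if os.environ.get("ORQ_API_KEY") is None:
    raise RuntimeError(
        "ORQ_API_KEY is not set; Orq dataset access and result upload will not work."
    )
```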

### Evaluation Parameters

Parameters are validated at runtime using Pydantic. The `evaluatorq` function supports three calling styles:

```python
from evaluatorq import evaluatorq, EvaluatorParams

# 1. Keyword arguments (recommended)
await evaluatorq(
    "my-eval",
    data=[...],
    jobs=[...],
    parallelism=5,
)

# 2. Dict style
await evaluatorq("my-eval", {
    "data": [...],
    "jobs": [...],
    "parallelism": 5,
})

# 3. EvaluatorParams instance
await evaluatorq("my-eval", EvaluatorParams(
    data=[...],
    jobs=[...],
    parallelism=5,
))
```

#### Parameter Reference

| Parameter | Type | Default | Description |
|-----------|------|---------|-------------|
| `data` | `list[DataPoint]` \| `list[Awaitable[DataPoint]]` \| `DatasetIdInput` | **required** | Data to evaluate |
| `jobs` | `list[Job]` | **required** | Jobs to run on each data point |
| `evaluators` | `list[Evaluator]` \| `None` | `None` | Evaluators to score job outputs |
| `parallelism` | `int` (≥1) | `1` | Number of concurrent jobs |
| `print_results` | `bool` | `True` | Display progress and results table |
| `description` | `str` \| `None` | `None` | Optional evaluation description |

## 📊 Orq Platform Integration

### Automatic Result Sending

When the `ORQ_API_KEY` environment variable is set, evaluatorq automatically sends evaluation results to the Orq platform for visualization and analysis.

```python
# Results are automatically sent when ORQ_API_KEY is set
await evaluatorq(
    "my-evaluation",
    data=[...],
    jobs=[...],
    evaluators=[...],
)
```

#### What Gets Sent

When `ORQ_API_KEY` is set, the following information is sent to Orq:
- Evaluation name
- Dataset ID (when using Orq datasets)
- Job results with outputs and errors
- Evaluator scores with values and explanations
- Execution timing information

Note: Evaluator explanations are included in the data sent to Orq but are not displayed in the terminal output to keep the console clean.

#### Result Visualization

After successful submission, you'll see a console message with a link to view your results:

```
📊 View your evaluation results at: <url to the evaluation>
```

The Orq platform provides:
- Interactive result tables
- Score statistics
- Performance metrics
- Historical comparisons

## 📚 API Reference

### `evaluatorq(name, params?, *, data?, jobs?, evaluators?, parallelism?, print_results?, description?) -> EvaluatorqResult`

Main async function to run evaluations.

#### Signature:

```python
async def evaluatorq(
    name: str,
    params: EvaluatorParams | dict[str, Any] | None = None,
    *,
    data: DatasetIdInput | Sequence[Awaitable[DataPoint] | DataPoint] | None = None,
    jobs: list[Job] | None = None,
    evaluators: list[Evaluator] | None = None,
    parallelism: int = 1,
    print_results: bool = True,
    description: str | None = None,
) -> EvaluatorqResult
```

#### Parameters:

- `name`: String identifier for the evaluation run
- `params`: (Optional) `EvaluatorParams` instance or dict with evaluation parameters
- `data`: List of DataPoint objects, awaitables, or `DatasetIdInput`
- `jobs`: List of job functions to run on each data point
- `evaluators`: Optional list of evaluator configurations
- `parallelism`: Number of concurrent jobs (default: 1, must be ≥1)
- `print_results`: Whether to display progress and results (default: True)
- `description`: Optional description for the evaluation run

> **Note:** Parameters can be passed either via the `params` argument (as dict or `EvaluatorParams`) or as keyword arguments. Keyword arguments take precedence over `params` values.
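
A small sketch of that precedence rule (the names and values here are arbitrary); the explicit keyword wins, so this run uses `parallelism=8`:

```python
# parallelism=2 inside the params dict is overridden by the keyword argument below
await evaluatorq(
    "precedence-demo",
    {"data": [...], "jobs": [...], "parallelism": 2},
    parallelism=8,
)
```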
406
+
407
+ #### Returns:
408
+
409
+ `EvaluatorqResult` - List of `DataPointResult` objects containing job outputs and evaluator scores.
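
A minimal sketch of walking that structure (attribute names follow the types documented below; the `or []` guards are only there because `job_results` and `evaluator_scores` are optional):

```python
results = await evaluatorq("my-eval", data=[...], jobs=[...], evaluators=[...])

for data_point_result in results:
    if data_point_result.error:
        print(f"Data point failed: {data_point_result.error}")
        continue
    for job_result in data_point_result.job_results or []:
        print(f"{job_result.job_name} -> {job_result.output}")
        for ev_score in job_result.evaluator_scores or []:
            print(f"  {ev_score.evaluator_name}: {ev_score.score.value} ({ev_score.score.explanation})")
```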

### Types

```python
from typing import Any, Callable, Awaitable, Sequence
from pydantic import BaseModel, Field
from typing_extensions import TypedDict

# Output type alias
Output = str | int | float | bool | dict[str, Any] | None

class DataPoint(BaseModel):
    """A data point for evaluation."""
    inputs: dict[str, Any]
    expected_output: Output | None = None

class EvaluationResult(BaseModel):
    """Result from an evaluator."""
    value: str | float | bool
    explanation: str | None = None

class EvaluatorScore(BaseModel):
    """Score from an evaluator for a job output."""
    evaluator_name: str
    score: EvaluationResult
    error: str | None = None

class JobResult(BaseModel):
    """Result from a job execution."""
    job_name: str
    output: Output
    error: str | None = None
    evaluator_scores: list[EvaluatorScore] | None = None

class DataPointResult(BaseModel):
    """Result for a single data point."""
    data_point: DataPoint
    error: str | None = None
    job_results: list[JobResult] | None = None

# Type aliases
EvaluatorqResult = list[DataPointResult]

class DatasetIdInput(BaseModel):
    """Input for fetching a dataset from Orq platform."""
    dataset_id: str

class EvaluatorParams(BaseModel):
    """Parameters for running an evaluation (validated at runtime)."""
    data: DatasetIdInput | Sequence[Awaitable[DataPoint] | DataPoint]
    jobs: list[Job]
    evaluators: list[Evaluator] | None = None
    parallelism: int = Field(default=1, ge=1)
    print_results: bool = True
    description: str | None = None

class JobReturn(TypedDict):
    """Job return structure."""
    name: str
    output: Output

Job = Callable[[DataPoint, int], Awaitable[JobReturn]]

class ScorerParameter(TypedDict):
    """Parameters passed to scorer functions."""
    data: DataPoint
    output: Output

Scorer = Callable[[ScorerParameter], Awaitable[EvaluationResult]]

class Evaluator(TypedDict):
    """Evaluator configuration."""
    name: str
    scorer: Scorer
```
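
As a quick illustration of how these pieces compose (continuing from the definitions above; the exact-match scorer is a hypothetical example, not part of the library):

```python
async def exact_match_scorer(params: ScorerParameter) -> EvaluationResult:
    """Score 1.0 when the job output equals the expected output."""
    expected = params["data"].expected_output
    matched = params["output"] == expected
    return EvaluationResult(
        value=1.0 if matched else 0.0,
        explanation="Exact match" if matched else f"Expected {expected!r}",
    )

# An Evaluator is just a name paired with a Scorer
exact_match: Evaluator = {"name": "exact-match", "scorer": exact_match_scorer}
```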

## 🛠️ Development

```bash
# Install dependencies
uv sync

# Run type checking
pyright

# Format code
ruff format

# Lint code
ruff check
```