scorebook 0.0.9__py3-none-any.whl → 0.0.10__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (31)
  1. scorebook/__init__.py +12 -4
  2. scorebook/cli/auth.py +1 -1
  3. scorebook/evaluate/__init__.py +15 -0
  4. scorebook/evaluate/_async/__init__.py +0 -0
  5. scorebook/evaluate/_async/evaluate_async.py +413 -0
  6. scorebook/evaluate/_sync/__init__.py +0 -0
  7. scorebook/evaluate/_sync/evaluate.py +413 -0
  8. scorebook/evaluate/evaluate_helpers.py +365 -0
  9. scorebook/inference/__init__.py +4 -0
  10. scorebook/inference/clients/__init__.py +8 -0
  11. scorebook/inference/{openai.py → clients/openai.py} +35 -23
  12. scorebook/{inference_pipeline.py → inference/inference_pipeline.py} +66 -4
  13. scorebook/settings.py +18 -0
  14. scorebook/trismik/__init__.py +10 -0
  15. scorebook/utils/__init__.py +9 -2
  16. scorebook/utils/async_utils.py +20 -1
  17. scorebook/utils/progress_bars.py +22 -61
  18. {scorebook-0.0.9.dist-info → scorebook-0.0.10.dist-info}/METADATA +3 -4
  19. scorebook-0.0.10.dist-info/RECORD +41 -0
  20. scorebook/evaluate.py +0 -623
  21. scorebook/trismik_services/__init__.py +0 -6
  22. scorebook/trismik_services/adaptive_testing_service.py +0 -141
  23. scorebook/trismik_services/upload_classic_eval_run.py +0 -102
  24. scorebook-0.0.9.dist-info/RECORD +0 -36
  25. /scorebook/inference/{bedrock.py → clients/bedrock.py} +0 -0
  26. /scorebook/inference/{portkey.py → clients/portkey.py} +0 -0
  27. /scorebook/inference/{vertex.py → clients/vertex.py} +0 -0
  28. /scorebook/{trismik_services/login.py → trismik/credentials.py} +0 -0
  29. {scorebook-0.0.9.dist-info → scorebook-0.0.10.dist-info}/WHEEL +0 -0
  30. {scorebook-0.0.9.dist-info → scorebook-0.0.10.dist-info}/entry_points.txt +0 -0
  31. {scorebook-0.0.9.dist-info → scorebook-0.0.10.dist-info}/licenses/LICENSE +0 -0
scorebook/evaluate.py DELETED
@@ -1,623 +0,0 @@
1
- """
2
- Model evaluation functionality for the Scorebook framework.
3
-
4
- This module provides the core evaluation logic to assess model predictions
5
- against ground truth labels using configurable metrics. It supports:
6
-
7
- - Batch evaluation of models across multiple datasets
8
- - Flexible metric computation and aggregation
9
- - Optional parameter sweeping and experiment tracking
10
- - Customizable inference functions
11
-
12
- The main entry point is the `evaluate()` function which handles running
13
- models on datasets and computing metric scores.
14
- """
15
-
16
- import asyncio
17
- import logging
18
- from typing import Any, Callable, Dict, List, Literal, Optional, Union
19
-
20
- from scorebook.eval_dataset import EvalDataset
21
- from scorebook.exceptions import (
22
- DataMismatchError,
23
- MetricComputationError,
24
- ParallelExecutionError,
25
- ParameterValidationError,
26
- ScoreBookError,
27
- )
28
- from scorebook.trismik_services import run_adaptive_evaluation
29
- from scorebook.trismik_services.login import get_token
30
- from scorebook.trismik_services.upload_classic_eval_run import upload_classic_eval_run
31
- from scorebook.types import (
32
- AdaptiveEvalDataset,
33
- AdaptiveEvalRunResult,
34
- AdaptiveEvalRunSpec,
35
- ClassicEvalRunResult,
36
- EvalResult,
37
- EvalRunSpec,
38
- )
39
- from scorebook.utils import evaluation_progress, expand_dict, is_awaitable
40
-
41
- logger = logging.getLogger(__name__)
42
-
43
-
44
def evaluate(
    inference: Callable,
    datasets: Union[str, EvalDataset, List[Union[str, EvalDataset]]],
    hyperparameters: Optional[Union[Dict[str, Any], List[Dict[str, Any]]]] = None,
    experiment_id: Optional[str] = None,
    project_id: Optional[str] = None,
    metadata: Optional[Dict[str, Any]] = None,
    upload_results: Union[Literal["auto"], bool] = "auto",
    sample_size: Optional[int] = None,
    parallel: bool = False,
    return_dict: bool = True,
    return_aggregates: bool = True,
    return_items: bool = False,
    return_output: bool = False,
) -> Union[Dict, List, EvalResult]:
    """
    Evaluate a model and collection of hyperparameters over datasets with specified metrics.

    This is the synchronous entry point: it drives the asynchronous implementation with
    ``asyncio.run``, so it must not be called from code that is already running inside an
    event loop (``asyncio.run`` raises ``RuntimeError`` in that situation).

    Args:
        inference: A callable that runs model inference over a list of evaluation items.
        datasets: One or more evaluation datasets to run evaluation on.
        hyperparameters: Optional list of hyperparameter configurations or grid to evaluate.
        experiment_id: Optional ID of the experiment to upload results to on Trismik's dashboard.
        project_id: Optional ID of the project to upload results to on Trismik's dashboard.
        metadata: Optional metadata to attach to the evaluation.
        upload_results: If True, uploads results to Trismik's dashboard. With the default
            "auto", results are uploaded only when a Trismik token is available.
        sample_size: Optional number of items to sample from each dataset.
        parallel: If True, runs evaluation in parallel. Requires the inference callable to be async.
        return_dict: If True, returns eval results as a dict.
        return_aggregates: If True, returns aggregate scores for each dataset.
        return_items: If True, returns individual items for each dataset.
        return_output: If True, returns model outputs for each dataset item evaluated.

    Returns:
        Union[Dict, List, EvalResult]:
            The evaluation results in the format specified by the return parameters:
            - If return_dict=False: an EvalResult object containing all run results.
            - If return_dict=True and both aggregates and items are requested: a dict
              with the keys "aggregate_results" and "item_results".
            - If return_dict=True and only one of the two is requested: that collection
              on its own.
    """
    logger.info(
        "Starting evaluation: experiment_id=%s, project_id=%s, parallel=%s",
        experiment_id,
        project_id,
        parallel,
    )

    return asyncio.run(
        _evaluate_async(
            inference=inference,
            datasets=datasets,
            hyperparameters=hyperparameters,
            metadata=metadata,
            experiment_id=experiment_id,
            project_id=project_id,
            parallel=parallel,
            return_dict=return_dict,
            return_aggregates=return_aggregates,
            return_items=return_items,
            return_output=return_output,
            upload_results=upload_results,
            sample_size=sample_size,
        )
    )
108
-
109
-
110
async def _evaluate_async(
    inference: Callable,
    datasets: Union[str, EvalDataset, List[Union[str, EvalDataset]]],
    hyperparameters: Optional[Union[Dict[str, Any], List[Dict[str, Any]]]] = None,
    metadata: Optional[Dict[str, Any]] = None,
    experiment_id: Optional[str] = None,
    project_id: Optional[str] = None,
    return_dict: bool = True,
    return_aggregates: bool = True,
    return_items: bool = False,
    return_output: bool = False,
    parallel: bool = False,
    upload_results: Union[Literal["auto"], bool] = "auto",
    sample_size: Optional[int] = None,
) -> Union[Dict, List]:
    """Resolve the inputs, execute every evaluation run, and format the results."""
    upload_results = _resolve_upload_results(upload_results)

    # The validator receives a snapshot of the arguments (with upload_results already
    # resolved); take it before any additional local name is bound.
    _validate_parameters(locals())
    datasets = _prepare_datasets(datasets, sample_size)
    hyperparameter_configs = _prepare_hyperparameter_configs(hyperparameters)

    logger.info(
        "Prepared %d datasets and %d hyperparameter configurations",
        len(datasets),
        len(hyperparameter_configs),
    )

    # Canonical ordering: by dataset first, then by hyperparameter configuration.
    eval_run_specs = sorted(
        _build_eval_run_specs(
            datasets, hyperparameter_configs, experiment_id, project_id, metadata
        ),
        key=lambda spec: (spec.dataset_index, spec.hyperparameters_index),
    )

    logger.info("Created %d evaluation run specs", len(eval_run_specs))

    # Both orchestration paths share one signature, so pick the coroutine up front.
    run_all = _run_parallel if parallel else _run_sequential

    with evaluation_progress(
        datasets, len(hyperparameter_configs), parallel, len(eval_run_specs)
    ) as progress_bars:
        eval_result = await run_all(
            inference,
            eval_run_specs,
            progress_bars,
            experiment_id,
            project_id,
            metadata,
            upload_results,
        )

    logger.info("Evaluation completed successfully")

    return _format_results(eval_result, return_dict, return_aggregates, return_items, return_output)
173
-
174
-
175
- # ===== ORCHESTRATION PATHS =====
176
-
177
-
178
async def _run_parallel(
    inference: Callable,
    runs: List[Union[EvalRunSpec, AdaptiveEvalRunSpec]],
    progress_bars: Any,
    experiment_id: Optional[str] = None,
    project_id: Optional[str] = None,
    metadata: Optional[Dict[str, Any]] = None,
    upload_results: bool = False,
) -> EvalResult:
    """Execute all runs concurrently and return their results in canonical order."""
    logger.debug("Running inference in parallel")

    # Uploading needs the flag plus both identifiers; none of them vary per run.
    uploads_enabled = upload_results and experiment_id and project_id

    async def worker(
        run: Union[EvalRunSpec, AdaptiveEvalRunSpec]
    ) -> Union[ClassicEvalRunResult, AdaptiveEvalRunResult]:
        """Execute one run, report progress, and upload it when applicable."""
        outcome = await _execute_run(inference, run, experiment_id, project_id, metadata)
        progress_bars.on_eval_run_completed(run.dataset_index)

        # Only classic runs that completed successfully are uploaded.
        if (
            uploads_enabled
            and isinstance(outcome, ClassicEvalRunResult)
            and outcome.run_completed
        ):
            outcome.run_id = await _upload_classic_run(
                outcome, experiment_id, project_id, inference, metadata
            )

        return outcome

    gathered = await asyncio.gather(*(worker(run) for run in runs))

    # Return in canonical (dataset_idx, hp_idx) order for stability.
    ordered = sorted(
        gathered,
        key=lambda item: (item.run_spec.dataset_index, item.run_spec.hyperparameters_index),
    )
    return EvalResult(ordered)
220
-
221
-
222
async def _run_sequential(
    inference: Callable,
    runs: List[Union[EvalRunSpec, AdaptiveEvalRunSpec]],
    progress_bars: Any,
    experiment_id: Optional[str] = None,
    project_id: Optional[str] = None,
    metadata: Optional[Dict[str, Any]] = None,
    upload_results: bool = False,
) -> EvalResult:
    """Execute the runs one after another, uploading each classic result as it finishes."""
    logger.debug("Running inference sequentially")

    # Uploading needs the flag plus both identifiers; none of them vary per run.
    uploads_enabled = upload_results and experiment_id and project_id

    completed: List[Union[ClassicEvalRunResult, AdaptiveEvalRunResult]] = []
    for run in runs:
        outcome = await _execute_run(inference, run, experiment_id, project_id, metadata)
        completed.append(outcome)
        progress_bars.on_hyperparam_completed(outcome.run_spec.dataset_index)

        # Upload immediately, but only classic runs that completed successfully.
        if (
            uploads_enabled
            and isinstance(outcome, ClassicEvalRunResult)
            and outcome.run_completed
        ):
            outcome.run_id = await _upload_classic_run(
                outcome, experiment_id, project_id, inference, metadata
            )

    return EvalResult(completed)
256
-
257
-
258
- # ===== EVALUATION RUN EXECUTIONS =====
259
-
260
-
261
async def _execute_run(
    inference: Callable,
    run: Union[EvalRunSpec, AdaptiveEvalRunSpec],
    experiment_id: Optional[str] = None,
    project_id: Optional[str] = None,
    metadata: Optional[Dict[str, Any]] = None,
) -> Union[ClassicEvalRunResult, AdaptiveEvalRunResult]:
    """Dispatch one run spec to the classic or the adaptive executor.

    Raises:
        ScoreBookError: If the run is adaptive but experiment_id or project_id is
            missing, or if the run is neither a classic nor an adaptive spec.
    """
    if isinstance(run, EvalRunSpec):
        return await _execute_classic_eval_run(inference, run)

    if not isinstance(run, AdaptiveEvalRunSpec):
        raise ScoreBookError(f"An internal error occurred: {type(run)} is not a valid run type")

    if not (experiment_id and project_id):
        raise ScoreBookError(
            "experiment_id and project_id are required for adaptive evaluations"
        )

    return await _execute_adaptive_eval_run(inference, run, experiment_id, project_id, metadata)
282
-
283
-
284
async def _execute_classic_eval_run(inference: Callable, run: EvalRunSpec) -> ClassicEvalRunResult:
    """Run inference and scoring for one classic run.

    Failures are not propagated: any exception is logged as a warning and a result
    flagged as not completed is returned, carrying whatever was produced before the
    failure (``None`` for the parts that were never reached).
    """
    logger.debug("Executing classic eval run for %s", run)

    # Pre-bind both values so the failure path can report partial progress.
    outputs = None
    scores = None

    try:
        outputs = await _run_inference_callable(
            inference, run.dataset.items, run.hyperparameter_config
        )
        scores = _score_metrics(run.dataset, outputs, run.labels)
        logger.debug("Classic evaluation completed for run %s", run)
        return ClassicEvalRunResult(run, True, outputs, scores)
    except Exception as error:
        logger.warning("Failed to complete classic eval run for %s: %s", run, str(error))

    return ClassicEvalRunResult(run, False, outputs, scores)
302
-
303
-
304
async def _execute_adaptive_eval_run(
    inference: Callable,
    run: AdaptiveEvalRunSpec,
    experiment_id: str,
    project_id: str,
    metadata: Optional[Dict[str, Any]] = None,
) -> AdaptiveEvalRunResult:
    """Delegate one adaptive run to ``run_adaptive_evaluation`` and return its result."""
    logger.debug("Executing adaptive run for %s", run)

    result = await run_adaptive_evaluation(inference, run, experiment_id, project_id, metadata)

    logger.debug("Adaptive evaluation completed for run %s", result)
    return result
320
-
321
-
322
- # ===== HELPER FUNCTIONS =====
323
-
324
-
325
- def _resolve_upload_results(upload_results: Union[Literal["auto"], bool]) -> bool:
326
- """Resolve the upload_results parameter based on trismik login status."""
327
-
328
- if upload_results == "auto":
329
- upload_results = get_token() is not None
330
- logger.debug("Auto upload results resolved to: %s", upload_results)
331
-
332
- return upload_results
333
-
334
-
335
def _validate_parameters(params: Dict[str, Any]) -> None:
    """Check the evaluation arguments for mutually inconsistent settings.

    Args:
        params: Mapping of argument names to values, as passed to the evaluation call.

    Raises:
        ParameterValidationError: If a dict is requested without aggregates or items,
            or if uploading is enabled without experiment_id and project_id.
        ParallelExecutionError: If parallel execution is requested with an inference
            callable that is not awaitable.
    """
    # A dict result needs something to put in it.
    wants_content = params["return_aggregates"] or params["return_items"]
    if params["return_dict"] and not wants_content:
        raise ParameterValidationError(
            "When return_dict=True, at least one of return_aggregates or return_items must be True"
        )

    # Concurrent execution only works with an awaitable inference callable.
    if params["parallel"] and not is_awaitable(params["inference"]):
        raise ParallelExecutionError(
            "parallel=True requires the inference_callable to be async. "
            "Please make your inference function async or set parallel=False."
        )

    # Uploads must be addressed to a specific experiment and project.
    if params["upload_results"] and (
        params["experiment_id"] is None or params["project_id"] is None
    ):
        raise ParameterValidationError(
            "experiment_id and project_id are required for upload_results=True"
        )

    logger.debug("Parameter validation successful")
359
-
360
-
361
def _prepare_datasets(
    datasets: Union[str, EvalDataset, List[Union[str, EvalDataset]]],
    sample_size: Optional[int] = None,
) -> List[Union[EvalDataset, AdaptiveEvalDataset]]:
    """Normalize the ``datasets`` argument into a list of dataset objects.

    Args:
        datasets: A single dataset or a list of datasets. Each entry is either an
            EvalDataset or a string ending in ":adaptive".
        sample_size: If given, every EvalDataset is replaced by
            ``dataset.sample(sample_size)``. Adaptive datasets are not sampled.

    Returns:
        The prepared datasets, in input order. Strings ending in ":adaptive" become
        AdaptiveEvalDataset objects named after the string with that suffix removed.
        Plain dataset-name strings are not supported yet: they are skipped, and a
        warning is logged so the omission is visible.

    Raises:
        ParameterValidationError: If an entry is neither an EvalDataset nor a string.
    """
    adaptive_suffix = ":adaptive"

    # Ensure datasets is always a list for consistent processing.
    if not isinstance(datasets, list):
        datasets = [datasets]

    prepared: List[Union[EvalDataset, AdaptiveEvalDataset]] = []
    for dataset in datasets:
        if isinstance(dataset, EvalDataset):
            # Classic dataset, optionally down-sampled.
            if sample_size is not None:
                dataset = dataset.sample(sample_size)
            prepared.append(dataset)

        elif isinstance(dataset, str) and dataset.endswith(adaptive_suffix):
            # Strip only the trailing marker so a name that happens to contain
            # ":adaptive" elsewhere is left intact.
            prepared.append(AdaptiveEvalDataset(dataset[: -len(adaptive_suffix)]))

        elif isinstance(dataset, str):
            # TODO: dataset name string registry
            logger.warning(
                "Skipping dataset %r: plain dataset names are not supported yet", dataset
            )

        else:
            raise ParameterValidationError(f"Unrecognized dataset type: {type(dataset)}")

    return prepared
394
-
395
-
396
- def _prepare_hyperparameter_configs(
397
- hyperparameters: Optional[Union[Dict[str, Any], List[Dict[str, Any]]]]
398
- ) -> List[Dict[str, Any]]:
399
- """Prepare hyperparameters for evaluation by returning a list of hyper-param configs."""
400
- if hyperparameters is None:
401
- return [{}]
402
- if not isinstance(hyperparameters, list): # TODO: THIS LOOKS BROKEN
403
- expanded: List[Dict[str, Any]] = expand_dict(hyperparameters or {})
404
- return expanded
405
-
406
- logger.info("Evaluating with hyperparameters: %s", hyperparameters)
407
-
408
- return hyperparameters
409
-
410
-
411
def _build_eval_run_specs(
    datasets: List[Union[EvalDataset, AdaptiveEvalDataset]],
    hyperparameters: Any,
    experiment_id: Optional[str],
    project_id: Optional[str],
    metadata: Optional[Dict[str, Any]] = None,
) -> List[Union[EvalRunSpec, AdaptiveEvalRunSpec]]:
    """Build one run spec for every dataset/hyperparameter combination.

    Args:
        datasets: Prepared datasets; EvalDataset entries produce classic run specs and
            AdaptiveEvalDataset entries produce adaptive run specs.
        hyperparameters: Iterable of hyperparameter configurations.
        experiment_id: Experiment identifier; required for adaptive datasets.
        project_id: Project identifier; required for adaptive datasets.
        metadata: Optional metadata forwarded to adaptive run specs.

    Returns:
        The run specs, datasets in the outer loop and configurations in the inner loop.

    Raises:
        ScoreBookError: If an adaptive dataset is combined with a missing
            experiment_id or project_id.
    """
    eval_run_specs: List[Union[EvalRunSpec, AdaptiveEvalRunSpec]] = []
    for dataset_index, dataset in enumerate(datasets):
        for hyperparameters_index, hyperparameter_config in enumerate(hyperparameters):

            # Create classic eval run spec
            if isinstance(dataset, EvalDataset):
                eval_run_specs.append(
                    _build_classic_eval_run_spec(
                        dataset, dataset_index, hyperparameter_config, hyperparameters_index
                    )
                )

            # Create adaptive eval run spec from the adaptive dataset's name
            elif isinstance(dataset, AdaptiveEvalDataset):
                if not experiment_id or not project_id:
                    raise ScoreBookError(
                        "experiment_id and project_id are required for adaptive evaluations"
                    )
                eval_run_specs.append(
                    _build_adaptive_eval_run_spec(
                        dataset.name,
                        dataset_index,
                        hyperparameter_config,
                        hyperparameters_index,
                        experiment_id,
                        project_id,
                        metadata,
                    )
                )

            # Log warning - should never happen
            else:
                logger.warning("Unrecognized dataset type: %s", dataset)

    return eval_run_specs
455
-
456
-
457
def _build_classic_eval_run_spec(
    dataset: EvalDataset,
    dataset_index: int,
    hyperparameters: Dict[str, Any],
    hyperparameters_index: int,
) -> EvalRunSpec:
    """Create the EvalRunSpec for one classic dataset and one hyperparameter configuration.

    The labels are read from every item under the key named by ``dataset.label``;
    items without that key contribute ``None``.
    """
    dataset_items = dataset.items
    expected_labels = [entry.get(dataset.label) for entry in dataset_items]

    spec = EvalRunSpec(
        dataset,
        dataset_index,
        hyperparameters,
        hyperparameters_index,
        dataset_items,
        expected_labels,
    )
    logger.debug("Built EvalRunSpec: %s", spec)
    return spec
476
-
477
-
478
def _build_adaptive_eval_run_spec(
    adaptive_dataset: str,
    dataset_index: int,
    hyperparameter_config: Dict[str, Any],
    hyperparameter_config_index: int,
    experiment_id: str,
    project_id: str,
    metadata: Optional[Dict[str, Any]] = None,
) -> AdaptiveEvalRunSpec:
    """Create the AdaptiveEvalRunSpec for one adaptive dataset and one configuration.

    Every ":adaptive" marker is removed from the dataset name before the spec is built.
    """
    dataset_name = adaptive_dataset.replace(":adaptive", "")
    spec = AdaptiveEvalRunSpec(
        dataset_name,
        dataset_index,
        hyperparameter_config,
        hyperparameter_config_index,
        experiment_id,
        project_id,
        metadata,
    )
    logger.debug("Built AdaptiveEvalRunSpec: %s", spec)
    return spec
500
-
501
-
502
async def _run_inference_callable(
    inference: Callable,
    items: List[Dict[str, Any]],
    hyperparameter_config: Dict[str, Any],
) -> Any:
    """Call ``inference`` on the items, awaiting the result when the callable is awaitable.

    The hyperparameter configuration is passed to the callable as keyword arguments.
    """
    needs_await = is_awaitable(inference)
    outcome = inference(items, **hyperparameter_config)
    if needs_await:
        return await outcome
    return outcome
512
-
513
-
514
- def _score_metrics(
515
- eval_dataset: EvalDataset, outputs: List[Any], labels: List[Any]
516
- ) -> Dict[str, Dict[str, Any]]:
517
- """Compute metric scores for a given dataset and inference outputs."""
518
- metric_scores: Dict[str, Dict[str, Any]] = {}
519
-
520
- if len(outputs) != len(labels):
521
- raise DataMismatchError(len(outputs), len(labels), eval_dataset.name)
522
-
523
- for metric in eval_dataset.metrics:
524
- try:
525
- aggregate_scores, item_scores = metric.score(outputs, labels)
526
- metric_scores[metric.name] = {
527
- "aggregate_scores": aggregate_scores,
528
- "item_scores": item_scores,
529
- }
530
- except Exception as e:
531
- logger.error(
532
- "Failed to compute metric '%s' for dataset '%s': %s",
533
- metric.name,
534
- eval_dataset.name,
535
- str(e),
536
- )
537
- raise MetricComputationError(metric.name, eval_dataset.name, e)
538
-
539
- return metric_scores
540
-
541
-
542
async def _upload_classic_run(
    run_result: ClassicEvalRunResult,
    experiment_id: str,
    project_id: str,
    inference_callable: Optional[Callable] = None,
    metadata: Optional[Dict[str, Any]] = None,
) -> Optional[str]:
    """Upload one classic run result to Trismik.

    Returns:
        The id of the uploaded run as a string, or ``None`` when the upload failed.
        Failures are logged and never propagated to the caller.
    """
    logger.debug("Uploading classic eval run: %s", run_result.run_spec)
    try:
        upload_response = await upload_classic_eval_run(
            run=run_result,
            experiment_id=experiment_id,
            project_id=project_id,
            model=_get_model_name(inference_callable, metadata),
            metadata=metadata,
        )
        uploaded_id = upload_response.id
        logger.info("Successfully uploaded classic eval run: %s", uploaded_id)
        return str(uploaded_id)
    except Exception as error:
        # Best effort: an upload problem must not abort the evaluation itself.
        logger.error("Failed to upload classic eval run: %s", str(error))
        return None
567
-
568
-
569
- def _get_model_name(
570
- inference_callable: Optional[Callable] = None, metadata: Optional[Dict[str, Any]] = None
571
- ) -> str:
572
- """Determine a model's name with the fallback "unspecified"."""
573
-
574
- # First priority: metadata.model
575
- if metadata and "model" in metadata:
576
- return str(metadata["model"])
577
-
578
- # Second priority: inference_pipeline.model (if callable is an InferencePipeline)
579
- if inference_callable and hasattr(inference_callable, "model"):
580
- return str(inference_callable.model)
581
-
582
- # Fallback: "unspecified"
583
- return "unspecified"
584
-
585
-
586
- def _format_results(
587
- eval_result: EvalResult,
588
- return_dict: bool,
589
- return_aggregates: bool,
590
- return_items: bool,
591
- return_output: bool,
592
- ) -> Union[EvalResult, Dict, List]:
593
-
594
- # Return results as a dict
595
- if return_dict:
596
- results = {}
597
-
598
- if return_aggregates:
599
- results["aggregate_results"] = eval_result.aggregate_scores
600
-
601
- if return_items:
602
- item_scores = eval_result.item_scores
603
-
604
- # Remove inference output if not requested
605
- if not return_output:
606
- for item in item_scores:
607
- item.pop("inference_output", None)
608
-
609
- results["item_results"] = item_scores
610
-
611
- # If both are requested, return the combined structure
612
- if return_aggregates and return_items:
613
- return results
614
- # If only aggregates requested, return just the list
615
- elif return_aggregates:
616
- return results["aggregate_results"]
617
- # If only items requested, return just the list
618
- else:
619
- return results["item_results"]
620
-
621
- # Return results as an EvalResult object
622
- else:
623
- return eval_result
@@ -1,6 +0,0 @@
1
- """Trismik authentication and API integration."""
2
-
3
- from .adaptive_testing_service import run_adaptive_evaluation
4
- from .login import get_stored_token, get_token, login, logout, whoami
5
-
6
- __all__ = ["login", "logout", "whoami", "get_stored_token", "get_token", "run_adaptive_evaluation"]