scorebook 0.0.8__py3-none-any.whl → 0.0.10__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (31)
  1. scorebook/__init__.py +12 -4
  2. scorebook/cli/auth.py +1 -1
  3. scorebook/evaluate/__init__.py +15 -0
  4. scorebook/evaluate/_async/__init__.py +0 -0
  5. scorebook/evaluate/_async/evaluate_async.py +413 -0
  6. scorebook/evaluate/_sync/__init__.py +0 -0
  7. scorebook/evaluate/_sync/evaluate.py +413 -0
  8. scorebook/evaluate/evaluate_helpers.py +365 -0
  9. scorebook/inference/__init__.py +4 -0
  10. scorebook/inference/clients/__init__.py +8 -0
  11. scorebook/inference/{openai.py → clients/openai.py} +35 -23
  12. scorebook/{inference_pipeline.py → inference/inference_pipeline.py} +66 -4
  13. scorebook/settings.py +18 -0
  14. scorebook/trismik/__init__.py +10 -0
  15. scorebook/utils/__init__.py +9 -2
  16. scorebook/utils/async_utils.py +20 -1
  17. scorebook/utils/progress_bars.py +22 -61
  18. {scorebook-0.0.8.dist-info → scorebook-0.0.10.dist-info}/METADATA +3 -4
  19. scorebook-0.0.10.dist-info/RECORD +41 -0
  20. scorebook/evaluate.py +0 -623
  21. scorebook/trismik_services/__init__.py +0 -6
  22. scorebook/trismik_services/adaptive_testing_service.py +0 -141
  23. scorebook/trismik_services/upload_classic_eval_run.py +0 -102
  24. scorebook-0.0.8.dist-info/RECORD +0 -36
  25. /scorebook/inference/{bedrock.py → clients/bedrock.py} +0 -0
  26. /scorebook/inference/{portkey.py → clients/portkey.py} +0 -0
  27. /scorebook/inference/{vertex.py → clients/vertex.py} +0 -0
  28. /scorebook/{trismik_services/login.py → trismik/credentials.py} +0 -0
  29. {scorebook-0.0.8.dist-info → scorebook-0.0.10.dist-info}/WHEEL +0 -0
  30. {scorebook-0.0.8.dist-info → scorebook-0.0.10.dist-info}/entry_points.txt +0 -0
  31. {scorebook-0.0.8.dist-info → scorebook-0.0.10.dist-info}/licenses/LICENSE +0 -0
scorebook/evaluate/evaluate_helpers.py ADDED
@@ -0,0 +1,365 @@
+ """Helper utilities shared by synchronous and asynchronous evaluation flows."""
+
+ import asyncio
+ import dataclasses
+ import inspect
+ import logging
+ from typing import Any, Callable, Dict, Iterable, List, Literal, Mapping, Optional, Union
+
+ from trismik._async.client import TrismikAsyncClient
+ from trismik._sync.client import TrismikClient
+ from trismik.types import TrismikMultipleChoiceTextItem
+
+ from scorebook import EvalDataset
+ from scorebook.exceptions import (
+     DataMismatchError,
+     MetricComputationError,
+     ParameterValidationError,
+     ScoreBookError,
+ )
+ from scorebook.settings import TRISMIK_SERVICE_URL
+ from scorebook.trismik.credentials import get_token
+ from scorebook.types import AdaptiveEvalDataset, AdaptiveEvalRunSpec, EvalResult, EvalRunSpec
+ from scorebook.utils import expand_dict, is_awaitable
+
+ logger = logging.getLogger(__name__)
+
+
+ def resolve_upload_results(upload_results: Union[Literal["auto"], bool]) -> bool:
+     """Resolve the upload_results parameter based on trismik login status."""
+
+     if upload_results == "auto":
+         upload_results = get_token() is not None
+         logger.debug("Auto upload results resolved to: %s", upload_results)
+
+     return upload_results
+
+
+ def validate_parameters(params: Dict[str, Any], caller: Callable[..., Any]) -> None:
+     """Validate all parameters for evaluation."""
+
+     caller_is_async = is_awaitable(caller)
+
+     # Sync evaluate() should only accept sync inference functions
+     if not caller_is_async and is_awaitable(params.get("inference")):
+         raise ParameterValidationError(
+             "evaluate() only accepts synchronous inference functions. "
+             "Use evaluate_async() for async inference functions."
+         )
+
+     # Async evaluate_async() should only accept async inference functions
+     if caller_is_async and not is_awaitable(params.get("inference")):
+         raise ParameterValidationError(
+             "evaluate_async() only accepts asynchronous inference functions. "
+             "Use evaluate() for sync inference functions."
+         )
+
+     # If returning a dict, it must contain items and/or aggregates
+     if params["return_dict"] and not params["return_aggregates"] and not params["return_items"]:
+         raise ParameterValidationError(
+             "When return_dict=True, at least one of return_aggregates or return_items must be True"
+         )
+
+     # If uploading results, experiment_id and project_id must be specified
+     if params["upload_results"]:
+         if params["experiment_id"] is None or params["project_id"] is None:
+             raise ParameterValidationError(
+                 "experiment_id and project_id are required for upload_results=True"
+             )
+
+     logger.debug("Parameter validation successful")
+
+
+ def prepare_datasets(
+     datasets: Union[str, EvalDataset, List[Union[str, EvalDataset]]],
+     sample_size: Optional[int] = None,
+ ) -> List[Union[EvalDataset, AdaptiveEvalDataset]]:
+     """Prepare and separate input datasets into classic and adaptive evaluation datasets."""
+
+     # Ensure datasets is always a list for consistent processing
+     if not isinstance(datasets, list):
+         datasets = [datasets]
+
+     datasets_out: List[Union[EvalDataset, AdaptiveEvalDataset]] = []
+     for dataset in datasets:
+
+         # Prepare classic datasets
+         if isinstance(dataset, EvalDataset):
+
+             if sample_size is not None:
+                 dataset = dataset.sample(sample_size)
+
+             datasets_out.append(dataset)
+
+         # Prepare adaptive datasets
+         elif isinstance(dataset, str) and dataset.endswith(":adaptive"):
+             datasets_out.append(AdaptiveEvalDataset(dataset.replace(":adaptive", "")))
+
+         # TODO: dataset name string registry
+         elif isinstance(dataset, str):
+             pass
+
+         else:
+             raise ParameterValidationError(f"Unrecognized dataset type: {type(dataset)}")
+
+     return datasets_out
+
+
+ def prepare_hyperparameter_configs(
+     hyperparameters: Optional[Union[Dict[str, Any], List[Dict[str, Any]]]]
+ ) -> List[Dict[str, Any]]:
+     """Prepare hyperparameters for evaluation by returning a list of hyper-param configs."""
+     if hyperparameters is None:
+         return [{}]
+     if not isinstance(hyperparameters, list):
+         expanded: List[Dict[str, Any]] = expand_dict(hyperparameters or {})
+         return expanded
+
+     logger.info("Evaluating with hyperparameters: %s", hyperparameters)
+
+     return hyperparameters
+
+
+ def build_eval_run_specs(
+     datasets: List[Union[EvalDataset, str]],
+     hyperparameters: Any,
+     experiment_id: Optional[str],
+     project_id: Optional[str],
+     metadata: Optional[Dict[str, Any]] = None,
+ ) -> List[Union[EvalRunSpec, AdaptiveEvalRunSpec]]:
+     """Build All RunSpec objects for each dataset/hyperparameter combination."""
+
+     eval_run_specs: List[Union[EvalRunSpec, AdaptiveEvalRunSpec]] = []
+     for dataset_index, dataset in enumerate(datasets):
+         for hyperparameters_index, hyperparameter_config in enumerate(hyperparameters):
+
+             # Create classic eval run spec
+             if isinstance(dataset, EvalDataset):
+                 eval_run_specs.append(
+                     build_classic_eval_run_spec(
+                         dataset, dataset_index, hyperparameter_config, hyperparameters_index
+                     )
+                 )
+
+             # Create adaptive eval run spec from string
+             elif isinstance(dataset, AdaptiveEvalDataset):
+                 if not experiment_id or not project_id:
+                     raise ScoreBookError(
+                         "experiment_id and project_id are required for adaptive evaluations"
+                     )
+                 eval_run_specs.append(
+                     build_adaptive_eval_run_spec(
+                         dataset.name,
+                         dataset_index,
+                         hyperparameter_config,
+                         hyperparameters_index,
+                         experiment_id,
+                         project_id,
+                         metadata,
+                     )
+                 )
+
+             # Log warning - should never happen
+             else:
+                 logger.warning("Unrecognized dataset type: %s", dataset)
+
+     return eval_run_specs
+
+
+ def build_classic_eval_run_spec(
+     dataset: EvalDataset,
+     dataset_index: int,
+     hyperparameters: Dict[str, Any],
+     hyperparameters_index: int,
+ ) -> EvalRunSpec:
+     """Build EvalRunSpec objects for a classic dataset and hyperparameter combination."""
+     items = dataset.items
+     labels = [item.get(dataset.label) for item in items]
+     eval_run_spec = EvalRunSpec(
+         dataset,
+         dataset_index,
+         hyperparameters,
+         hyperparameters_index,
+         items,
+         labels,
+     )
+     logger.debug("Built EvalRunSpec: %s", eval_run_spec)
+     return eval_run_spec
+
+
+ def build_adaptive_eval_run_spec(
+     adaptive_dataset: str,
+     dataset_index: int,
+     hyperparameter_config: Dict[str, Any],
+     hyperparameter_config_index: int,
+     experiment_id: str,
+     project_id: str,
+     metadata: Optional[Dict[str, Any]] = None,
+ ) -> AdaptiveEvalRunSpec:
+     """Build AdaptiveEvalRunSpec objects for a dataset/hyperparameter combination."""
+     dataset = adaptive_dataset.replace(":adaptive", "")
+     adaptive_eval_run_spec = AdaptiveEvalRunSpec(
+         dataset,
+         dataset_index,
+         hyperparameter_config,
+         hyperparameter_config_index,
+         experiment_id,
+         project_id,
+         metadata,
+     )
+     logger.debug("Built AdaptiveEvalRunSpec: %s", adaptive_eval_run_spec)
+     return adaptive_eval_run_spec
+
+
+ def score_metrics(
+     eval_dataset: EvalDataset, outputs: List[Any], labels: List[Any]
+ ) -> Dict[str, Dict[str, Any]]:
+     """Compute metric scores for a given dataset and inference outputs."""
+     metric_scores: Dict[str, Dict[str, Any]] = {}
+
+     if len(outputs) != len(labels):
+         raise DataMismatchError(len(outputs), len(labels), eval_dataset.name)
+
+     for metric in eval_dataset.metrics:
+         try:
+             aggregate_scores, item_scores = metric.score(outputs, labels)
+             metric_scores[metric.name] = {
+                 "aggregate_scores": aggregate_scores,
+                 "item_scores": item_scores,
+             }
+         except Exception as e:
+             logger.error(
+                 "Failed to compute metric '%s' for dataset '%s': %s",
+                 metric.name,
+                 eval_dataset.name,
+                 str(e),
+             )
+             raise MetricComputationError(metric.name, eval_dataset.name, e)
+
+     return metric_scores
+
+
+ def create_trismik_async_client() -> TrismikAsyncClient:
+     """Create a new async Trismik client instance."""
+     api_key = get_token()
+     logger.debug("Creating new async Trismik client")
+     return TrismikAsyncClient(service_url=TRISMIK_SERVICE_URL, api_key=api_key)
+
+
+ def create_trismik_sync_client() -> TrismikClient:
+     """Create a new sync Trismik client instance."""
+     api_key = get_token()
+     logger.debug("Creating new sync Trismik client")
+     return TrismikClient(service_url=TRISMIK_SERVICE_URL, api_key=api_key)
+
+
+ def get_model_name(
+     inference_callable: Optional[Callable] = None, metadata: Optional[Dict[str, Any]] = None
+ ) -> str:
+     """Determine a model's name with the fallback "unspecified"."""
+
+     # First priority: metadata.model
+     if metadata and "model" in metadata:
+         return str(metadata["model"])
+
+     # Second priority: inference_pipeline.model (if callable is an InferencePipeline)
+     if inference_callable and hasattr(inference_callable, "model"):
+         return str(inference_callable.model)
+
+     # Fallback: "unspecified"
+     return "unspecified"
+
+
+ def format_results(
+     eval_result: EvalResult,
+     return_dict: bool,
+     return_aggregates: bool,
+     return_items: bool,
+     return_output: bool,
+ ) -> Union[EvalResult, Dict, List]:
+     """Format an `EvalResult` into the requested output structure."""
+
+     # Return results as a dict
+     if return_dict:
+         results = {}
+
+         if return_aggregates:
+             results["aggregate_results"] = eval_result.aggregate_scores
+
+         if return_items:
+             item_scores = eval_result.item_scores
+
+             # Remove inference output if not requested
+             if not return_output:
+                 for item in item_scores:
+                     item.pop("inference_output", None)
+
+             results["item_results"] = item_scores
+
+         # If both are requested, return the combined structure
+         if return_aggregates and return_items:
+             return results
+         # If only aggregates requested, return just the list
+         elif return_aggregates:
+             return results["aggregate_results"]
+         # If only items requested, return just the list
+         else:
+             return results["item_results"]
+
+     # Return results as an EvalResult object
+     else:
+         return eval_result
+
+
+ def make_trismik_inference(
+     inference_function: Callable[..., Any],
+     return_list: bool = False,
+ ) -> Callable[[Any], Any]:
+     """Wrap an inference function for flexible input handling.
+
+     Takes a function expecting list[dict] and makes it accept single dict
+     or TrismikMultipleChoiceTextItem.
+     """
+
+     # Check if the inference function is async
+     is_async = inspect.iscoroutinefunction(inference_function) or (
+         hasattr(inference_function, "__call__")
+         and inspect.iscoroutinefunction(inference_function.__call__)
+     )
+
+     def sync_trismik_inference_function(eval_items: Any, **kwargs: Any) -> Any:
+         # Single TrismikMultipleChoiceTextItem dataclass
+         if isinstance(eval_items, TrismikMultipleChoiceTextItem):
+             eval_item_dict = dataclasses.asdict(eval_items)
+             results = inference_function([eval_item_dict], **kwargs)
+             if is_async:
+                 results = asyncio.run(results)
+             return results if return_list else results[0]
+
+         # Single item (a mapping)
+         if isinstance(eval_items, Mapping):
+             results = inference_function([eval_items], **kwargs)
+             if is_async:
+                 results = asyncio.run(results)
+             return results if return_list else results[0]
+
+         # Iterable of items (but not a string/bytes)
+         if isinstance(eval_items, Iterable) and not isinstance(eval_items, (str, bytes)):
+             # Convert any TrismikMultipleChoiceTextItem instances to dicts
+             converted_items = []
+             for item in eval_items:
+                 if isinstance(item, TrismikMultipleChoiceTextItem):
+                     converted_items.append(dataclasses.asdict(item))
+                 else:
+                     converted_items.append(item)
+             results = inference_function(converted_items, **kwargs)
+             if is_async:
+                 results = asyncio.run(results)
+             return results
+
+         raise TypeError(
+             "Expected a single item (Mapping[str, Any] or TrismikMultipleChoiceTextItem) "
+             "or an iterable of such items."
+         )
+
+     return sync_trismik_inference_function
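The helpers above are internal to the new evaluate module. A minimal usage sketch follows (illustrative only; it assumes expand_dict performs a Cartesian-product expansion of list-valued hyperparameters, which this diff does not show):

# Illustrative sketch; expand_dict's grid expansion is an assumption.
from scorebook.evaluate.evaluate_helpers import (
    prepare_hyperparameter_configs,
    resolve_upload_results,
)

configs = prepare_hyperparameter_configs({"temperature": [0.0, 0.7], "top_p": 0.9})
# Expected shape: [{"temperature": 0.0, "top_p": 0.9}, {"temperature": 0.7, "top_p": 0.9}]

upload = resolve_upload_results("auto")  # True only when a Trismik token is stored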
scorebook/inference/__init__.py
@@ -5,3 +5,7 @@ This module provides functionality for running inference with various models
  and processing their responses. It includes utilities for both single and
  batch inference operations.
  """
+
+ from scorebook.inference.inference_pipeline import InferencePipeline
+
+ __all__ = ["InferencePipeline"]
scorebook/inference/clients/__init__.py ADDED
@@ -0,0 +1,8 @@
+ """
+ Inference clients for various LLM providers.
+
+ This module provides client implementations for different LLM providers including
+ OpenAI, AWS Bedrock, Google Vertex AI, and Portkey.
+ """
+
+ __all__ = ["bedrock", "openai", "portkey", "vertex"]
scorebook/inference/{openai.py → clients/openai.py}
@@ -10,7 +10,7 @@ import asyncio
  import json
  import logging
  import tempfile
- from typing import Any, List
+ from typing import Any, List, Optional

  from openai import AsyncOpenAI

@@ -18,7 +18,10 @@ logger = logging.getLogger(__name__)


  async def responses(
-     items: List[Any], model: str = "gpt-4.1-nano", client: Any = None, **hyperparameters: Any
+     items: List[Any],
+     model: str = "gpt-4.1-nano",
+     client: Optional[AsyncOpenAI] = None,
+     **hyperparameters: Any,
  ) -> List[Any]:
      """Process multiple inference requests using OpenAI's Async API.

@@ -28,23 +31,28 @@ Args:
      Args:
          items: List of preprocessed items to process.
          model: OpenAI model to use.
-         client: Optional OpenAI client instance.
+         client: Optional OpenAI client instance. If not provided, creates a new client
+             with automatic cleanup using a context manager.
          hyperparameters: Dictionary of hyperparameters for inference.

      Returns:
          List of raw model responses.
-
-     Raises:
-         NotImplementedError: Currently not implemented.
      """
+     if client is None:
+         async with AsyncOpenAI() as client:
+             return await _do_responses(items, model, client, **hyperparameters)
+     else:
+         return await _do_responses(items, model, client, **hyperparameters)
+
+
+ async def _do_responses(
+     items: List[Any], model: str, client: AsyncOpenAI, **hyperparameters: Any
+ ) -> List[Any]:
+     """Process responses internally with provided client."""
      logger.debug("OpenAI responses function called with %d items", len(items))
      logger.debug("Using model: %s", model)
      logger.debug("Hyperparameters: %s", hyperparameters)

-     if client is None:
-         logger.debug("Creating new AsyncOpenAI client")
-         client = AsyncOpenAI()
-
      # Create all tasks concurrently for true parallelism
      tasks = []
      for i, item in enumerate(items):
@@ -127,7 +135,7 @@
  async def batch(
      items: List[Any],
      model: str = "gpt-4.1-nano",
-     client: Any = None,
+     client: Optional[AsyncOpenAI] = None,
      **hyperparameters: Any,
  ) -> List[Any]:
      """Process multiple inference requests in batch using OpenAI's API.
@@ -138,18 +146,24 @@ Args:
      Args:
          items: List of preprocessed items to process.
          model: OpenAI model to use.
-         client: Optional OpenAI client instance.
+         client: Optional OpenAI client instance. If not provided, creates a new client
+             with automatic cleanup using a context manager.
          hyperparameters: Dictionary of hyperparameters for inference.

      Returns:
          A list of raw model responses.
-
-     Raises:
-         NotImplementedError: Currently not implemented.
      """
      if client is None:
-         client = AsyncOpenAI()
+         async with AsyncOpenAI() as client:
+             return await _do_batch(items, model, client, **hyperparameters)
+     else:
+         return await _do_batch(items, model, client, **hyperparameters)

+
+ async def _do_batch(
+     items: List[Any], model: str, client: AsyncOpenAI, **hyperparameters: Any
+ ) -> List[Any]:
+     """Process batch internally with provided client."""
      file_id = await _upload_batch(items, client)
      batch_id = await _start_batch(file_id, client)

@@ -173,18 +187,16 @@
      return batch_result


- async def _upload_batch(items: List[Any], client: Any) -> str:
+ async def _upload_batch(items: List[Any], client: AsyncOpenAI) -> str:
      """Create a .jsonl file from preprocessed items and upload to OpenAI for batch processing.

      Args:
          items: A list of preprocessed items, each representing a single dataset eval item.
+         client: OpenAI client instance.

      Returns:
          The file ID returned by OpenAI after uploading.
      """
-     # Instantiate OpenAI client
-     if client is None:
-         client = AsyncOpenAI()

      # Create temp .jsonl file
      with tempfile.NamedTemporaryFile(mode="w+", suffix=".jsonl", delete=False) as f:
@@ -206,7 +218,7 @@ async def _upload_batch(items: List[Any], client: Any) -> str:
      return str(response.id)


- async def _start_batch(file_id: str, client: Any) -> str:
+ async def _start_batch(file_id: str, client: AsyncOpenAI) -> str:
      batch_response = await client.batches.create(
          input_file_id=file_id,
          endpoint="/v1/chat/completions",
@@ -215,12 +227,12 @@ async def _start_batch(file_id: str, client: Any) -> str:
      return str(batch_response.id)


- async def _get_batch(batch_id: str, client: Any) -> Any:
+ async def _get_batch(batch_id: str, client: AsyncOpenAI) -> Any:
      batch_object = await client.batches.retrieve(batch_id)
      return batch_object


- async def _get_results_file(output_file_id: str, client: Any) -> List[str]:
+ async def _get_results_file(output_file_id: str, client: AsyncOpenAI) -> List[str]:
      """Download and parse the batch results file from OpenAI."""
      response = await client.files.content(output_file_id)

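A hedged sketch of the reworked client handling (the item payload below is a hypothetical placeholder; the real preprocessed item shape is defined elsewhere in scorebook, and OPENAI_API_KEY must be set):

import asyncio

from openai import AsyncOpenAI
from scorebook.inference.clients import openai as openai_client

items = [{"messages": [{"role": "user", "content": "What is 2 + 2?"}]}]  # hypothetical shape

async def main() -> None:
    # No client passed: a temporary AsyncOpenAI client is created and closed automatically.
    await openai_client.responses(items, model="gpt-4.1-nano")

    # Caller-managed client: reused for the call and never closed by scorebook.
    async with AsyncOpenAI() as client:
        await openai_client.responses(items, model="gpt-4.1-nano", client=client)

asyncio.run(main())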
scorebook/{inference_pipeline.py → inference/inference_pipeline.py}
@@ -9,6 +9,8 @@ configurable way.
  import asyncio
  from typing import Any, Callable, Dict, List, Optional, cast

+ from scorebook.utils import is_awaitable
+


  class InferencePipeline:
      """A pipeline for processing items through model inference.
@@ -18,6 +20,8 @@ class InferencePipeline:
      2. Model inference
      3. Postprocessing of model outputs

+     The pipeline automatically adapts to sync or async execution based on the
+     inference function provided during initialization.

      Attributes:
          model: Name or identifier of the model being used
@@ -35,6 +39,9 @@
      ) -> None:
          """Initialize the inference pipeline.

+         The pipeline will automatically become sync or async based on the
+         inference_function provided.
+
          Args:
              model: Name or identifier of the model to use
              inference_function: Function that performs model inference
@@ -46,8 +53,59 @@
          self.preprocessor: Optional[Callable] = preprocessor
          self.postprocessor: Optional[Callable] = postprocessor

+         # Dynamically change the class to provide appropriate sync/async interface
+         self.__class__ = (
+             _AsyncInferencePipeline if is_awaitable(inference_function) else _SyncInferencePipeline
+         )
+
+
+ class _SyncInferencePipeline(InferencePipeline):
+     """Synchronous version of InferencePipeline."""
+
+     def run(self, items: List[Dict[str, Any]], **hyperparameters: Any) -> List[Any]:
+         """Execute the complete inference pipeline synchronously.
+
+         Args:
+             items: List of items to process through the pipeline
+             **hyperparameters: Model-specific parameters for inference
+
+         Returns:
+             List of processed outputs after running through the complete pipeline
+         """
+         if self.preprocessor:
+             input_items = [self.preprocessor(item, **hyperparameters) for item in items]
+         else:
+             input_items = items
+
+         # Sync inference function - call directly
+         inference_outputs = self.inference_function(input_items, **hyperparameters)
+
+         if self.postprocessor:
+             return [
+                 self.postprocessor(inference_output, **hyperparameters)
+                 for inference_output in inference_outputs
+             ]
+         else:
+             return cast(List[Any], inference_outputs)
+
+     def __call__(self, items: List[Dict[str, Any]], **hyperparameters: Any) -> List[Any]:
+         """Make the pipeline instance callable synchronously.
+
+         Args:
+             items: List of items to process through the pipeline
+             **hyperparameters: Model-specific parameters for inference
+
+         Returns:
+             List of processed outputs after running through the complete pipeline
+         """
+         return self.run(items, **hyperparameters)
+
+
+ class _AsyncInferencePipeline(InferencePipeline):
+     """Asynchronous version of InferencePipeline."""
+
      async def run(self, items: List[Dict[str, Any]], **hyperparameters: Any) -> List[Any]:
-         """Execute the complete inference pipeline on a list of items.
+         """Execute the complete inference pipeline asynchronously.

          Args:
              items: List of items to process through the pipeline
@@ -61,10 +119,14 @@
          else:
              input_items = items

-         if asyncio.iscoroutinefunction(self.inference_function):
+         # Handle both sync and async inference functions
+         if is_awaitable(self.inference_function):
              inference_outputs = await self.inference_function(input_items, **hyperparameters)
          else:
-             inference_outputs = self.inference_function(input_items, **hyperparameters)
+             # Run sync function in thread pool to avoid blocking
+             inference_outputs = await asyncio.to_thread(
+                 self.inference_function, input_items, **hyperparameters
+             )

          if self.postprocessor:
              return [
@@ -75,7 +137,7 @@
              return cast(List[Any], inference_outputs)

      async def __call__(self, items: List[Dict[str, Any]], **hyperparameters: Any) -> List[Any]:
-         """Make the pipeline instance callable by wrapping the run method.
+         """Make the pipeline instance callable asynchronously.

          Args:
              items: List of items to process through the pipeline
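A small sketch of the new sync/async split (the toy inference functions are stand-ins, not part of scorebook, and it assumes the constructor accepts only model, inference_function, and the optional pre/post-processors):

import asyncio

from scorebook.inference import InferencePipeline

def sync_infer(items, **hyperparameters):
    return [f"echo: {item}" for item in items]

async def async_infer(items, **hyperparameters):
    return [f"echo: {item}" for item in items]

# A sync inference function yields a plain-callable pipeline.
sync_pipe = InferencePipeline(model="toy-model", inference_function=sync_infer)
print(sync_pipe([{"q": "hi"}]))

# An async inference function yields an awaitable pipeline.
async_pipe = InferencePipeline(model="toy-model", inference_function=async_infer)
print(asyncio.run(async_pipe([{"q": "hi"}])))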
scorebook/settings.py ADDED
@@ -0,0 +1,18 @@
+ """Configuration settings for Scorebook."""
+
+ import os
+
+ # Optional: Load environment variables from .env file if python-dotenv is available
+ try:
+     from dotenv import load_dotenv
+
+     load_dotenv(verbose=False)
+ except ImportError:  # pragma: no cover
+     pass  # python-dotenv not installed, skip .env file loading
+
+ # Trismik API settings
+ TRISMIK_API_BASE_URL = "https://api.trismik.com"
+ TRISMIK_ADAPTIVE_TESTING_URL = f"{TRISMIK_API_BASE_URL}/adaptive-testing"
+
+ # Allow override via environment variable
+ TRISMIK_SERVICE_URL = os.environ.get("TRISMIK_SERVICE_URL", TRISMIK_ADAPTIVE_TESTING_URL)
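Because TRISMIK_SERVICE_URL is resolved once at import time, an override must be in place before scorebook.settings is imported. A short sketch (the URL is a placeholder, not a real endpoint):

import os

# Set before importing scorebook.settings (or place it in a .env file).
os.environ["TRISMIK_SERVICE_URL"] = "https://trismik.example.internal/adaptive-testing"

from scorebook.settings import TRISMIK_SERVICE_URL

print(TRISMIK_SERVICE_URL)  # -> the overridden URL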
scorebook/trismik/__init__.py ADDED
@@ -0,0 +1,10 @@
+ """Trismik authentication and API integration.
+
+ Note: Trismik evaluation functionality has been moved to scorebook.evaluate module.
+ This module now only provides authentication functions.
+ """
+
+ # Import shared credential functions
+ from .credentials import get_stored_token, get_token, login, logout, whoami
+
+ __all__ = ["login", "logout", "whoami", "get_stored_token", "get_token"]
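The re-exported credential helpers are what resolve_upload_results("auto") in evaluate_helpers.py keys off. A minimal sketch (the return type of get_token is assumed, not confirmed by this diff):

from scorebook.trismik import get_token

token = get_token()  # assumed to return the stored API token, or None when logged out
if token is None:
    print("Not logged in: upload_results='auto' will resolve to False.")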
scorebook/utils/__init__.py
@@ -1,9 +1,16 @@
  """Utility functions and common helpers for the Scorebook framework."""

- from scorebook.utils.async_utils import is_awaitable
+ from scorebook.utils.async_utils import async_nullcontext, is_awaitable
  from scorebook.utils.build_prompt import build_prompt
  from scorebook.utils.io_helpers import validate_path
  from scorebook.utils.progress_bars import evaluation_progress
  from scorebook.utils.transform_helpers import expand_dict

- __all__ = ["is_awaitable", "validate_path", "expand_dict", "evaluation_progress", "build_prompt"]
+ __all__ = [
+     "async_nullcontext",
+     "is_awaitable",
+     "validate_path",
+     "expand_dict",
+     "evaluation_progress",
+     "build_prompt",
+ ]
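async_utils.py itself is not shown in this diff; the newly exported async_nullcontext presumably mirrors contextlib.nullcontext for `async with` blocks. A sketch of that typical shape (not the scorebook implementation):

from contextlib import asynccontextmanager
from typing import Any, AsyncIterator, Optional

@asynccontextmanager
async def async_nullcontext(enter_result: Optional[Any] = None) -> AsyncIterator[Any]:
    # Drop-in no-op for `async with`, useful when a progress bar or client context is optional.
    yield enter_result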