scorebook 0.0.9__py3-none-any.whl → 0.0.11__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- scorebook/__init__.py +14 -6
- scorebook/cli/auth.py +1 -1
- scorebook/eval_datasets/__init__.py +5 -0
- scorebook/eval_datasets/eval_dataset.py +719 -0
- scorebook/evaluate/__init__.py +15 -0
- scorebook/evaluate/_async/__init__.py +0 -0
- scorebook/evaluate/_async/evaluate_async.py +443 -0
- scorebook/evaluate/_sync/__init__.py +0 -0
- scorebook/evaluate/_sync/evaluate.py +443 -0
- scorebook/evaluate/evaluate_helpers.py +388 -0
- scorebook/exceptions.py +48 -0
- scorebook/inference/__init__.py +4 -0
- scorebook/inference/clients/__init__.py +8 -0
- scorebook/inference/{bedrock.py → clients/bedrock.py} +1 -1
- scorebook/inference/{openai.py → clients/openai.py} +35 -23
- scorebook/inference/{portkey.py → clients/portkey.py} +1 -1
- scorebook/inference/{vertex.py → clients/vertex.py} +1 -1
- scorebook/{inference_pipeline.py → inference/inference_pipeline.py} +66 -4
- scorebook/settings.py +21 -0
- scorebook/trismik/__init__.py +10 -0
- scorebook/types.py +8 -5
- scorebook/utils/__init__.py +11 -4
- scorebook/utils/async_utils.py +20 -1
- scorebook/utils/io_helpers.py +18 -5
- scorebook/utils/progress_bars.py +739 -96
- scorebook/utils/{build_prompt.py → render_template.py} +13 -12
- {scorebook-0.0.9.dist-info → scorebook-0.0.11.dist-info}/METADATA +4 -4
- scorebook-0.0.11.dist-info/RECORD +42 -0
- scorebook/eval_dataset.py +0 -404
- scorebook/evaluate.py +0 -623
- scorebook/trismik_services/__init__.py +0 -6
- scorebook/trismik_services/adaptive_testing_service.py +0 -141
- scorebook/trismik_services/upload_classic_eval_run.py +0 -102
- scorebook-0.0.9.dist-info/RECORD +0 -36
- /scorebook/{trismik_services/login.py → trismik/credentials.py} +0 -0
- {scorebook-0.0.9.dist-info → scorebook-0.0.11.dist-info}/WHEEL +0 -0
- {scorebook-0.0.9.dist-info → scorebook-0.0.11.dist-info}/entry_points.txt +0 -0
- {scorebook-0.0.9.dist-info → scorebook-0.0.11.dist-info}/licenses/LICENSE +0 -0
|
@@ -9,6 +9,8 @@ configurable way.
|
|
|
9
9
|
import asyncio
|
|
10
10
|
from typing import Any, Callable, Dict, List, Optional, cast
|
|
11
11
|
|
|
12
|
+
from scorebook.utils import is_awaitable
|
|
13
|
+
|
|
12
14
|
|
|
13
15
|
class InferencePipeline:
|
|
14
16
|
"""A pipeline for processing items through model inference.
|
|
@@ -18,6 +20,8 @@ class InferencePipeline:
|
|
|
18
20
|
2. Model inference
|
|
19
21
|
3. Postprocessing of model outputs
|
|
20
22
|
|
|
23
|
+
The pipeline automatically adapts to sync or async execution based on the
|
|
24
|
+
inference function provided during initialization.
|
|
21
25
|
|
|
22
26
|
Attributes:
|
|
23
27
|
model: Name or identifier of the model being used
|
|
@@ -35,6 +39,9 @@ class InferencePipeline:
|
|
|
35
39
|
) -> None:
|
|
36
40
|
"""Initialize the inference pipeline.
|
|
37
41
|
|
|
42
|
+
The pipeline will automatically become sync or async based on the
|
|
43
|
+
inference_function provided.
|
|
44
|
+
|
|
38
45
|
Args:
|
|
39
46
|
model: Name or identifier of the model to use
|
|
40
47
|
inference_function: Function that performs model inference
|
|
@@ -46,8 +53,59 @@ class InferencePipeline:
|
|
|
46
53
|
self.preprocessor: Optional[Callable] = preprocessor
|
|
47
54
|
self.postprocessor: Optional[Callable] = postprocessor
|
|
48
55
|
|
|
56
|
+
# Dynamically change the class to provide appropriate sync/async interface
|
|
57
|
+
self.__class__ = (
|
|
58
|
+
_AsyncInferencePipeline if is_awaitable(inference_function) else _SyncInferencePipeline
|
|
59
|
+
)
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
class _SyncInferencePipeline(InferencePipeline):
|
|
63
|
+
"""Synchronous version of InferencePipeline."""
|
|
64
|
+
|
|
65
|
+
def run(self, items: List[Dict[str, Any]], **hyperparameters: Any) -> List[Any]:
|
|
66
|
+
"""Execute the complete inference pipeline synchronously.
|
|
67
|
+
|
|
68
|
+
Args:
|
|
69
|
+
items: List of items to process through the pipeline
|
|
70
|
+
**hyperparameters: Model-specific parameters for inference
|
|
71
|
+
|
|
72
|
+
Returns:
|
|
73
|
+
List of processed outputs after running through the complete pipeline
|
|
74
|
+
"""
|
|
75
|
+
if self.preprocessor:
|
|
76
|
+
input_items = [self.preprocessor(item, **hyperparameters) for item in items]
|
|
77
|
+
else:
|
|
78
|
+
input_items = items
|
|
79
|
+
|
|
80
|
+
# Sync inference function - call directly
|
|
81
|
+
inference_outputs = self.inference_function(input_items, **hyperparameters)
|
|
82
|
+
|
|
83
|
+
if self.postprocessor:
|
|
84
|
+
return [
|
|
85
|
+
self.postprocessor(inference_output, **hyperparameters)
|
|
86
|
+
for inference_output in inference_outputs
|
|
87
|
+
]
|
|
88
|
+
else:
|
|
89
|
+
return cast(List[Any], inference_outputs)
|
|
90
|
+
|
|
91
|
+
def __call__(self, items: List[Dict[str, Any]], **hyperparameters: Any) -> List[Any]:
|
|
92
|
+
"""Make the pipeline instance callable synchronously.
|
|
93
|
+
|
|
94
|
+
Args:
|
|
95
|
+
items: List of items to process through the pipeline
|
|
96
|
+
**hyperparameters: Model-specific parameters for inference
|
|
97
|
+
|
|
98
|
+
Returns:
|
|
99
|
+
List of processed outputs after running through the complete pipeline
|
|
100
|
+
"""
|
|
101
|
+
return self.run(items, **hyperparameters)
|
|
102
|
+
|
|
103
|
+
|
|
104
|
+
class _AsyncInferencePipeline(InferencePipeline):
|
|
105
|
+
"""Asynchronous version of InferencePipeline."""
|
|
106
|
+
|
|
49
107
|
async def run(self, items: List[Dict[str, Any]], **hyperparameters: Any) -> List[Any]:
|
|
50
|
-
"""Execute the complete inference pipeline
|
|
108
|
+
"""Execute the complete inference pipeline asynchronously.
|
|
51
109
|
|
|
52
110
|
Args:
|
|
53
111
|
items: List of items to process through the pipeline
|
|
@@ -61,10 +119,14 @@ class InferencePipeline:
|
|
|
61
119
|
else:
|
|
62
120
|
input_items = items
|
|
63
121
|
|
|
64
|
-
|
|
122
|
+
# Handle both sync and async inference functions
|
|
123
|
+
if is_awaitable(self.inference_function):
|
|
65
124
|
inference_outputs = await self.inference_function(input_items, **hyperparameters)
|
|
66
125
|
else:
|
|
67
|
-
|
|
126
|
+
# Run sync function in thread pool to avoid blocking
|
|
127
|
+
inference_outputs = await asyncio.to_thread(
|
|
128
|
+
self.inference_function, input_items, **hyperparameters
|
|
129
|
+
)
|
|
68
130
|
|
|
69
131
|
if self.postprocessor:
|
|
70
132
|
return [
|
|
@@ -75,7 +137,7 @@ class InferencePipeline:
|
|
|
75
137
|
return cast(List[Any], inference_outputs)
|
|
76
138
|
|
|
77
139
|
async def __call__(self, items: List[Dict[str, Any]], **hyperparameters: Any) -> List[Any]:
|
|
78
|
-
"""Make the pipeline instance callable
|
|
140
|
+
"""Make the pipeline instance callable asynchronously.
|
|
79
141
|
|
|
80
142
|
Args:
|
|
81
143
|
items: List of items to process through the pipeline
|
scorebook/settings.py
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
"""Configuration settings for Scorebook."""
|
|
2
|
+
|
|
3
|
+
import os
|
|
4
|
+
|
|
5
|
+
# Optional: Load environment variables from .env file if python-dotenv is available
|
|
6
|
+
try:
|
|
7
|
+
from dotenv import load_dotenv
|
|
8
|
+
|
|
9
|
+
load_dotenv(verbose=False)
|
|
10
|
+
except ImportError: # pragma: no cover
|
|
11
|
+
pass # python-dotenv not installed, skip .env file loading
|
|
12
|
+
|
|
13
|
+
# Trismik API settings
|
|
14
|
+
TRISMIK_API_BASE_URL = "https://api.trismik.com"
|
|
15
|
+
TRISMIK_ADAPTIVE_TESTING_URL = f"{TRISMIK_API_BASE_URL}/adaptive-testing"
|
|
16
|
+
|
|
17
|
+
# Allow override via environment variable
|
|
18
|
+
TRISMIK_SERVICE_URL = os.environ.get("TRISMIK_SERVICE_URL", TRISMIK_ADAPTIVE_TESTING_URL)
|
|
19
|
+
|
|
20
|
+
# Progress bar configuration
|
|
21
|
+
SHOW_PROGRESS_BARS = os.environ.get("SCOREBOOK_SHOW_PROGRESS_BARS", "true").lower() == "true"
|
|
@@ -0,0 +1,10 @@
|
|
|
1
|
+
"""Trismik authentication and API integration.
|
|
2
|
+
|
|
3
|
+
Note: Trismik evaluation functionality has been moved to scorebook.evaluate module.
|
|
4
|
+
This module now only provides authentication functions.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
# Import shared credential functions
|
|
8
|
+
from .credentials import get_stored_token, get_token, login, logout, whoami
|
|
9
|
+
|
|
10
|
+
__all__ = ["login", "logout", "whoami", "get_stored_token", "get_token"]
|
scorebook/types.py
CHANGED
|
@@ -3,7 +3,7 @@
|
|
|
3
3
|
from dataclasses import dataclass
|
|
4
4
|
from typing import Any, Dict, List, Optional, Union
|
|
5
5
|
|
|
6
|
-
from scorebook.
|
|
6
|
+
from scorebook.eval_datasets import EvalDataset
|
|
7
7
|
|
|
8
8
|
|
|
9
9
|
@dataclass
|
|
@@ -21,7 +21,7 @@ class EvalRunSpec:
|
|
|
21
21
|
dataset_index: int
|
|
22
22
|
hyperparameter_config: Dict[str, Any]
|
|
23
23
|
hyperparameters_index: int
|
|
24
|
-
|
|
24
|
+
inputs: List[Any]
|
|
25
25
|
labels: List[Any]
|
|
26
26
|
|
|
27
27
|
def __str__(self) -> str:
|
|
@@ -64,13 +64,15 @@ class ClassicEvalRunResult:
|
|
|
64
64
|
|
|
65
65
|
if self.outputs:
|
|
66
66
|
for idx, output in enumerate(self.outputs):
|
|
67
|
-
if idx >= len(self.run_spec.
|
|
67
|
+
if idx >= len(self.run_spec.inputs):
|
|
68
68
|
break
|
|
69
69
|
|
|
70
70
|
result = {
|
|
71
|
-
"
|
|
71
|
+
"id": idx,
|
|
72
72
|
"dataset_name": self.run_spec.dataset.name,
|
|
73
|
-
"
|
|
73
|
+
"input": self.run_spec.inputs[idx],
|
|
74
|
+
"label": self.run_spec.labels[idx] if idx < len(self.run_spec.labels) else None,
|
|
75
|
+
"output": output,
|
|
74
76
|
**self.run_spec.hyperparameter_config,
|
|
75
77
|
}
|
|
76
78
|
|
|
@@ -125,6 +127,7 @@ class AdaptiveEvalRunResult:
|
|
|
125
127
|
"""Results from executing an adaptive evaluation run."""
|
|
126
128
|
|
|
127
129
|
run_spec: AdaptiveEvalRunSpec
|
|
130
|
+
run_completed: bool
|
|
128
131
|
scores: Dict[str, Any]
|
|
129
132
|
|
|
130
133
|
@property
|
scorebook/utils/__init__.py
CHANGED
|
@@ -1,9 +1,16 @@
|
|
|
1
1
|
"""Utility functions and common helpers for the Scorebook framework."""
|
|
2
2
|
|
|
3
|
-
from scorebook.utils.async_utils import is_awaitable
|
|
4
|
-
from scorebook.utils.build_prompt import build_prompt
|
|
3
|
+
from scorebook.utils.async_utils import async_nullcontext, is_awaitable
|
|
5
4
|
from scorebook.utils.io_helpers import validate_path
|
|
6
|
-
from scorebook.utils.progress_bars import
|
|
5
|
+
from scorebook.utils.progress_bars import evaluation_progress_context
|
|
6
|
+
from scorebook.utils.render_template import render_template
|
|
7
7
|
from scorebook.utils.transform_helpers import expand_dict
|
|
8
8
|
|
|
9
|
-
__all__ = [
|
|
9
|
+
__all__ = [
|
|
10
|
+
"async_nullcontext",
|
|
11
|
+
"is_awaitable",
|
|
12
|
+
"validate_path",
|
|
13
|
+
"expand_dict",
|
|
14
|
+
"evaluation_progress_context",
|
|
15
|
+
"render_template",
|
|
16
|
+
]
|
scorebook/utils/async_utils.py
CHANGED
|
@@ -1,7 +1,10 @@
|
|
|
1
1
|
"""Async utilities for handling callable objects and coroutines."""
|
|
2
2
|
|
|
3
3
|
import asyncio
|
|
4
|
-
from
|
|
4
|
+
from contextlib import asynccontextmanager
|
|
5
|
+
from typing import AsyncIterator, Callable, Optional, TypeVar
|
|
6
|
+
|
|
7
|
+
T = TypeVar("T")
|
|
5
8
|
|
|
6
9
|
|
|
7
10
|
def is_awaitable(obj: Callable) -> bool:
|
|
@@ -25,3 +28,19 @@ def is_awaitable(obj: Callable) -> bool:
|
|
|
25
28
|
return True
|
|
26
29
|
|
|
27
30
|
return False
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
@asynccontextmanager
|
|
34
|
+
async def async_nullcontext(value: Optional[T] = None) -> AsyncIterator[Optional[T]]:
|
|
35
|
+
"""Async version of contextlib.nullcontext for Python 3.9 compatibility.
|
|
36
|
+
|
|
37
|
+
contextlib.nullcontext() is sync-only and cannot be used with async with on Python 3.9.
|
|
38
|
+
This provides an async equivalent that can be used with async context managers.
|
|
39
|
+
|
|
40
|
+
Args:
|
|
41
|
+
value: Optional value to yield from the context manager
|
|
42
|
+
|
|
43
|
+
Yields:
|
|
44
|
+
The provided value
|
|
45
|
+
"""
|
|
46
|
+
yield value
|
scorebook/utils/io_helpers.py
CHANGED
|
@@ -1,15 +1,18 @@
|
|
|
1
1
|
"""Input/output helper functions for Scorebook."""
|
|
2
2
|
|
|
3
3
|
from pathlib import Path
|
|
4
|
-
from typing import Optional
|
|
4
|
+
from typing import Optional, Tuple, Union
|
|
5
5
|
|
|
6
6
|
|
|
7
|
-
def validate_path(
|
|
7
|
+
def validate_path(
|
|
8
|
+
file_path: Union[str, Path], expected_suffix: Optional[Union[str, Tuple[str, ...]]] = None
|
|
9
|
+
) -> Path:
|
|
8
10
|
"""Validate that a file path exists and optionally check its suffix.
|
|
9
11
|
|
|
10
12
|
Args:
|
|
11
13
|
file_path: Path to the file as string or Path object
|
|
12
|
-
expected_suffix: Optional file extension to validate
|
|
14
|
+
expected_suffix: Optional file extension(s) to validate.
|
|
15
|
+
Can be a single string (e.g. ".json") or tuple of strings (e.g. (".yaml", ".yml"))
|
|
13
16
|
|
|
14
17
|
Returns:
|
|
15
18
|
Path object for the validated file path
|
|
@@ -22,7 +25,17 @@ def validate_path(file_path: str, expected_suffix: Optional[str] = None) -> Path
|
|
|
22
25
|
if not path.exists():
|
|
23
26
|
raise FileNotFoundError(f"File not found: {file_path}")
|
|
24
27
|
|
|
25
|
-
if expected_suffix
|
|
26
|
-
|
|
28
|
+
if expected_suffix:
|
|
29
|
+
# Convert single suffix to tuple for uniform handling
|
|
30
|
+
allowed_suffixes = (
|
|
31
|
+
(expected_suffix,) if isinstance(expected_suffix, str) else expected_suffix
|
|
32
|
+
)
|
|
33
|
+
allowed_suffixes_lower = tuple(s.lower() for s in allowed_suffixes)
|
|
34
|
+
|
|
35
|
+
if path.suffix.lower() not in allowed_suffixes_lower:
|
|
36
|
+
suffix_list = ", ".join(f"'{s}'" for s in allowed_suffixes)
|
|
37
|
+
raise ValueError(
|
|
38
|
+
f"File must have one of ({suffix_list}) extensions, got: '{path.suffix}'"
|
|
39
|
+
)
|
|
27
40
|
|
|
28
41
|
return path
|