scorebook-0.0.1-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- scorebook/__init__.py +15 -0
- scorebook/evaluator.py +228 -0
- scorebook/inference/__init__.py +11 -0
- scorebook/inference/openai.py +185 -0
- scorebook/inference/portkey.py +186 -0
- scorebook/metrics/__init__.py +18 -0
- scorebook/metrics/accuracy.py +42 -0
- scorebook/metrics/metric_base.py +28 -0
- scorebook/metrics/metric_registry.py +105 -0
- scorebook/metrics/precision.py +19 -0
- scorebook/types/__init__.py +11 -0
- scorebook/types/eval_dataset.py +310 -0
- scorebook/types/eval_result.py +129 -0
- scorebook/types/inference_pipeline.py +84 -0
- scorebook/utils/__init__.py +8 -0
- scorebook/utils/async_utils.py +27 -0
- scorebook/utils/io_helpers.py +28 -0
- scorebook/utils/mappers.py +36 -0
- scorebook/utils/progress_bars.py +89 -0
- scorebook/utils/transform_helpers.py +25 -0
- scorebook-0.0.1.dist-info/LICENSE +21 -0
- scorebook-0.0.1.dist-info/METADATA +376 -0
- scorebook-0.0.1.dist-info/RECORD +24 -0
- scorebook-0.0.1.dist-info/WHEEL +4 -0
scorebook/utils/__init__.py
@@ -0,0 +1,8 @@
+"""Utility functions and common helpers for the Scorebook framework."""
+
+from scorebook.utils.async_utils import is_awaitable
+from scorebook.utils.io_helpers import validate_path
+from scorebook.utils.progress_bars import evaluation_progress
+from scorebook.utils.transform_helpers import expand_dict
+
+__all__ = ["is_awaitable", "validate_path", "expand_dict", "evaluation_progress"]
scorebook/utils/async_utils.py
@@ -0,0 +1,27 @@
+"""Async utilities for handling callable objects and coroutines."""
+
+import asyncio
+from typing import Callable
+
+
+def is_awaitable(obj: Callable) -> bool:
+    """
+    Check if a callable object is awaitable.
+
+    This handles both coroutine functions and callable instances (like classes
+    with __call__ methods) that may return coroutines.
+
+    Args:
+        obj: The callable object to check
+
+    Returns:
+        True if the object is awaitable, False otherwise
+    """
+    if asyncio.iscoroutinefunction(obj):
+        return True
+
+    # Check if it's a callable instance with an awaitable __call__ method
+    if hasattr(obj, "__call__") and asyncio.iscoroutinefunction(obj.__call__):
+        return True
+
+    return False
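`is_awaitable` lets a caller dispatch synchronous and asynchronous inference callables through one code path, which is presumably how the evaluator handles both kinds of inference functions. A minimal usage sketch (the `run` helper and the toy model callables below are illustrative, not part of the package):

```python
import asyncio

from scorebook.utils.async_utils import is_awaitable


async def async_model(items):
    # A coroutine-based inference callable.
    return [item.upper() for item in items]


def sync_model(items):
    # A plain synchronous inference callable.
    return [item.upper() for item in items]


def run(fn, items):
    # Dispatch: await coroutine functions, call plain functions directly.
    if is_awaitable(fn):
        return asyncio.run(fn(items))
    return fn(items)


print(run(async_model, ["a"]))  # ['A']
print(run(sync_model, ["b"]))   # ['B']
```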
scorebook/utils/io_helpers.py
@@ -0,0 +1,28 @@
+"""Input/output helper functions for Scorebook."""
+
+from pathlib import Path
+from typing import Optional
+
+
+def validate_path(file_path: str, expected_suffix: Optional[str] = None) -> Path:
+    """Validate that a file path exists and optionally check its suffix.
+
+    Args:
+        file_path: Path to the file as string or Path object
+        expected_suffix: Optional file extension to validate (e.g. ".json", ".csv")
+
+    Returns:
+        Path object for the validated file path
+
+    Raises:
+        FileNotFoundError: If the file does not exist
+        ValueError: If the file has the wrong extension
+    """
+    path = Path(file_path)
+    if not path.exists():
+        raise FileNotFoundError(f"File not found: {file_path}")
+
+    if expected_suffix and path.suffix.lower() != expected_suffix.lower():
+        raise ValueError(f"File must have {expected_suffix} extension, got: {path.suffix}")
+
+    return path
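`validate_path` guards file-based loading, presumably backing `EvalDataset.from_csv` and `EvalDataset.from_json`. A short sketch of the observable behaviour (`dataset.json` here is a stand-in for any local file):

```python
from scorebook.utils.io_helpers import validate_path

# Returns a pathlib.Path when the file exists and the suffix matches.
path = validate_path("dataset.json", expected_suffix=".json")
print(path.resolve())

# A missing file raises FileNotFoundError; a wrong extension raises ValueError.
try:
    validate_path("dataset.json", expected_suffix=".csv")
except (FileNotFoundError, ValueError) as exc:
    print(exc)
```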
scorebook/utils/mappers.py
@@ -0,0 +1,36 @@
+"""Utility functions for mapping and converting data types in Scorebook."""
+
+from typing import Any, Literal
+
+ClassificationResult = Literal["true_positive", "false_positive", "true_negative", "false_negative"]
+
+
+def to_binary(value: Any) -> int:
+    """Transform various input types to binary (0/1) classification value."""
+    if value is None:
+        return 0
+    if isinstance(value, str):
+        if value.upper() in ["A", "1", "TRUE", "YES", "Y"]:
+            return 1
+        return 0
+    return 1 if value else 0
+
+
+def to_binary_classification(prediction: Any, reference: Any) -> ClassificationResult:
+    """
+    Determine classification result based on prediction and reference values.
+
+    Args:
+        prediction: Predicted value (will be converted to binary)
+        reference: Reference/true value (will be converted to binary)
+
+    Returns:
+        Classification result as one of: "true_positive", "false_positive",
+        "true_negative", "false_negative"
+    """
+    pred_binary = to_binary(prediction)
+    ref_binary = to_binary(reference)
+
+    if pred_binary == 1:
+        return "true_positive" if ref_binary == 1 else "false_positive"
+    return "false_negative" if ref_binary == 1 else "true_negative"
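The mappers normalise arbitrary predictions and labels into confusion-matrix categories, presumably feeding the precision metric. A quick sketch of the behaviour defined above:

```python
from scorebook.utils.mappers import to_binary, to_binary_classification

print(to_binary("YES"))    # 1 - recognised truthy string
print(to_binary("maybe"))  # 0 - unrecognised strings map to 0
print(to_binary(None))     # 0

print(to_binary_classification("A", "1"))      # 'true_positive'
print(to_binary_classification("no", "TRUE"))  # 'false_negative'
```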
scorebook/utils/progress_bars.py
@@ -0,0 +1,89 @@
+"""Progress bar utilities for evaluation tracking."""
+
+from contextlib import contextmanager
+from typing import Any, Generator, List, Optional
+
+from tqdm import tqdm
+
+
+class EvaluationProgressBars:
+    """Manages nested progress bars for evaluation tracking."""
+
+    def __init__(self, datasets: List[Any], hyperparam_count: int) -> None:
+        """Initialize progress bar manager.
+
+        Args:
+            datasets: List of datasets being evaluated
+            hyperparam_count: Number of hyperparameter configurations per dataset
+        """
+        self.datasets = datasets
+        self.hyperparam_count = hyperparam_count
+        self.dataset_pbar: Optional[tqdm] = None
+        self.hyperparam_pbar: Optional[tqdm] = None
+
+    def start_dataset_progress(self) -> None:
+        """Start the outer progress bar for datasets."""
+        self.dataset_pbar = tqdm(
+            total=len(self.datasets),
+            desc="Datasets ",
+            unit="dataset",
+            position=0,
+            leave=True,
+            ncols=80,
+            bar_format="{desc} {percentage:3.0f}%|{bar:40}| {n_fmt}/{total_fmt}",
+        )
+
+    def update_dataset_progress(self) -> None:
+        """Update the dataset progress bar."""
+        if self.dataset_pbar:
+            self.dataset_pbar.update(1)
+
+    def close_dataset_progress(self) -> None:
+        """Close the dataset progress bar."""
+        if self.dataset_pbar:
+            self.dataset_pbar.close()
+            self.dataset_pbar = None
+
+    @contextmanager
+    def hyperparam_progress_context(self) -> Generator[tqdm, None, None]:
+        """Context manager for hyperparameter progress bar."""
+        self.hyperparam_pbar = tqdm(
+            total=self.hyperparam_count,
+            desc="Hyperparams",
+            unit="config",
+            position=1,
+            leave=False,
+            ncols=80,
+            bar_format="{desc} {percentage:3.0f}%|{bar:40}| {n_fmt}/{total_fmt}",
+        )
+        try:
+            yield self.hyperparam_pbar
+        finally:
+            self.hyperparam_pbar.close()
+            self.hyperparam_pbar = None
+
+    def update_hyperparam_progress(self) -> None:
+        """Update the hyperparameter progress bar."""
+        if self.hyperparam_pbar:
+            self.hyperparam_pbar.update(1)
+
+
+@contextmanager
+def evaluation_progress(
+    datasets: List[Any], hyperparam_count: int
+) -> Generator[EvaluationProgressBars, None, None]:
+    """Context manager for evaluation progress bars.
+
+    Args:
+        datasets: List of datasets being evaluated
+        hyperparam_count: Number of hyperparameter configurations per dataset
+
+    Yields:
+        EvaluationProgressBars: Progress bar manager instance
+    """
+    progress_bars = EvaluationProgressBars(datasets, hyperparam_count)
+    progress_bars.start_dataset_progress()
+    try:
+        yield progress_bars
+    finally:
+        progress_bars.close_dataset_progress()
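`evaluation_progress` wraps an outer tqdm bar over datasets with a nested bar per hyperparameter sweep, which is presumably what `evaluate()` drives internally. A sketch of how a loop could use it (the dataset and config placeholders are illustrative):

```python
from scorebook.utils.progress_bars import evaluation_progress

datasets = ["dataset_a", "dataset_b"]                   # placeholders for EvalDataset objects
configs = [{"temperature": 0.7}, {"temperature": 1.0}]  # hyperparameter configurations

with evaluation_progress(datasets, len(configs)) as bars:
    for _dataset in datasets:
        with bars.hyperparam_progress_context():
            for _config in configs:
                # ... run inference and scoring for this configuration ...
                bars.update_hyperparam_progress()
        bars.update_dataset_progress()
```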
scorebook/utils/transform_helpers.py
@@ -0,0 +1,25 @@
+"""Utility functions for transforming and manipulating data structures."""
+
+from itertools import product
+
+
+def expand_dict(data: dict) -> list[dict]:
+    """Expand a dictionary with list values into multiple dictionaries.
+
+    Takes a dictionary that may contain list values and expands it into a list of dictionaries,
+    where each dictionary represents one possible combination of values from the lists.
+    Non-list values remain constant across all generated dictionaries.
+
+    Args:
+        data: A dictionary potentially containing list values to be expanded
+
+    Returns:
+        A list of dictionaries representing all possible combinations of the input values
+    """
+    fixed = {k: v for k, v in data.items() if not isinstance(v, list)}
+    expandables = {k: v for k, v in data.items() if isinstance(v, list)}
+
+    keys, values = zip(*expandables.items()) if expandables else ([], [])
+    combinations = product(*values) if values else [()]
+
+    return [{**fixed, **dict(zip(keys, combo))} for combo in combinations]
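`expand_dict` appears to be the Cartesian-product expansion behind hyperparameter sweeping (the README's 3 × 3 × 2 = 18 configurations). A small sketch with a hypothetical grid:

```python
from scorebook.utils.transform_helpers import expand_dict

grid = {"model": "my-model", "temperature": [0.7, 1.0], "top_p": [0.8, 0.9]}
configs = expand_dict(grid)

print(len(configs))  # 4 combinations: 2 temperatures x 2 top_p values
print(configs[0])    # {'model': 'my-model', 'temperature': 0.7, 'top_p': 0.8}
```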
scorebook-0.0.1.dist-info/LICENSE
@@ -0,0 +1,21 @@
+MIT License
+
+Copyright (c) 2025 Trismik Ltd
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
scorebook-0.0.1.dist-info/METADATA
@@ -0,0 +1,376 @@
+Metadata-Version: 2.3
+Name: scorebook
+Version: 0.0.1
+Summary: A Python project for LLM evaluation.
+Author: Euan Campbell
+Author-email: euan@trismik.com
+Requires-Python: >=3.9
+Classifier: Programming Language :: Python :: 3
+Classifier: Programming Language :: Python :: 3.9
+Classifier: Programming Language :: Python :: 3.10
+Classifier: Programming Language :: Python :: 3.11
+Classifier: Programming Language :: Python :: 3.12
+Classifier: Programming Language :: Python :: 3.13
+Provides-Extra: examples
+Provides-Extra: openai
+Provides-Extra: portkey
+Requires-Dist: accelerate ; extra == "examples"
+Requires-Dist: datasets (>=3.6.0)
+Requires-Dist: notebook ; extra == "examples"
+Requires-Dist: openai ; extra == "openai"
+Requires-Dist: portkey-ai ; extra == "portkey"
+Requires-Dist: python-dotenv ; extra == "openai"
+Requires-Dist: python-dotenv ; extra == "portkey"
+Requires-Dist: torch ; extra == "examples"
+Requires-Dist: torchaudio ; extra == "examples"
+Requires-Dist: torchvision ; extra == "examples"
+Requires-Dist: transformers ; extra == "examples"
+Description-Content-Type: text/markdown
+
+# Scorebook
+
+**A Python library for LLM evaluation**
+
+<p align="center">
+  <img alt="Dynamic TOML Badge" src="https://img.shields.io/badge/dynamic/toml?url=https%3A%2F%2Fraw.githubusercontent.com%2Ftrismik%2Fscorebook%2Frefs%2Fheads%2Fmain%2Fpyproject.toml&query=tool.poetry.version&style=flat&label=version">
+  <img alt="Python Version" src="https://img.shields.io/badge/python-3.9%2B-blue">
+  <img alt="License" src="https://img.shields.io/badge/license-MIT-green">
+</p>
+
+Scorebook is a flexible and extensible framework for evaluating Large Language Models (LLMs). It provides clear contracts for data loading, model inference, and metrics computation, making it easy to run comprehensive evaluations across different datasets, models, and metrics.
+
+## Key Features
+
+- **Flexible Data Loading**: Support for Hugging Face datasets, CSV, JSON, and Python lists
+- **Model Agnostic**: Works with any model or inference provider
+- **Extensible Metric Engine**: Use the metrics we provide or implement your own
+- **Automated Sweeping**: Test multiple model configurations automatically
+- **Rich Results**: Export results to JSON, CSV, or structured formats like pandas DataFrames
+
+## Quick Start
+
+### Installation
+
+```bash
+pip install scorebook
+```
+
+For OpenAI integration:
+```bash
+pip install scorebook[openai]
+```
+
+For local model examples:
+```bash
+pip install scorebook[examples]
+```
+
+### Basic Usage
+
+```python
+from scorebook import EvalDataset, evaluate
+from scorebook.metrics import Accuracy
+
+# 1. Create an evaluation dataset
+data = [
+    {"question": "What is 2 + 2?", "answer": "4"},
+    {"question": "What is the capital of France?", "answer": "Paris"},
+    {"question": "Who wrote Romeo and Juliet?", "answer": "William Shakespeare"}
+]
+
+dataset = EvalDataset.from_list(
+    name="basic_qa",
+    label="answer",
+    metrics=[Accuracy],
+    data=data
+)
+
+# 2. Define your inference function
+def my_inference_function(items, **hyperparameters):
+    # Your model logic here
+    predictions = []
+    for item in items:
+        # Process each item and generate prediction
+        prediction = your_model.predict(item["question"])
+        predictions.append(prediction)
+    return predictions
+
+# 3. Run evaluation
+results = evaluate(my_inference_function, dataset)
+print(results)
+```
+
+## Core Components
+
+### 1. Evaluation Datasets
+
+Scorebook supports multiple data sources through the `EvalDataset` class:
+
+#### From Hugging Face
+```python
+dataset = EvalDataset.from_huggingface(
+    "TIGER-Lab/MMLU-Pro",
+    label="answer",
+    metrics=[Accuracy],
+    split="validation"
+)
+```
+
+#### From CSV
+```python
+dataset = EvalDataset.from_csv(
+    "dataset.csv",
+    label="answer",
+    metrics=[Accuracy]
+)
+```
+
+#### From JSON
```python
+dataset = EvalDataset.from_json(
+    "dataset.json",
+    label="answer",
+    metrics=[Accuracy]
+)
+```
+
+#### From Python List
+```python
+dataset = EvalDataset.from_list(
+    name="custom_dataset",
+    label="answer",
+    metrics=[Accuracy],
+    data=[{"question": "...", "answer": "..."}]
+)
+```
+
+### 2. Model Integration
+
+Scorebook offers two approaches for model integration:
+
+#### Inference Functions
+A single function that handles the complete pipeline:
+
+```python
+def inference_function(eval_items, **hyperparameters):
+    results = []
+    for item in eval_items:
+        # 1. Preprocessing
+        prompt = format_prompt(item)
+
+        # 2. Inference
+        output = model.generate(prompt)
+
+        # 3. Postprocessing
+        prediction = extract_answer(output)
+        results.append(prediction)
+
+    return results
+```
+
+#### Inference Pipelines
+Modular approach with separate stages:
+
+```python
+from scorebook.types.inference_pipeline import InferencePipeline
+
+def preprocessor(item):
+    return {"messages": [{"role": "user", "content": item["question"]}]}
+
+def inference_function(processed_items, **hyperparameters):
+    return [model.generate(item) for item in processed_items]
+
+def postprocessor(output):
+    return output.strip()
+
+pipeline = InferencePipeline(
+    model="my-model",
+    preprocessor=preprocessor,
+    inference_function=inference_function,
+    postprocessor=postprocessor
+)
+
+results = evaluate(pipeline, dataset)
+```
+
+### 3. Metrics System
+
+#### Built-in Metrics
+- **Accuracy**: Percentage of correct predictions
+- **Precision**: Accuracy of positive predictions
+
+```python
+from scorebook.metrics import Accuracy, Precision
+
+dataset = EvalDataset.from_list(
+    name="test",
+    label="answer",
+    metrics=[Accuracy, Precision],  # Multiple metrics
+    data=data
+)
+```
+
+#### Custom Metrics
+Create custom metrics by extending `MetricBase`:
+
+```python
+from scorebook.metrics import MetricBase, MetricRegistry
+
+@MetricRegistry.register()
+class F1Score(MetricBase):
+    @staticmethod
+    def score(outputs, labels):
+        # Calculate F1 score
+        item_scores = [calculate_f1_item(o, l) for o, l in zip(outputs, labels)]
+        aggregate_score = {"f1": sum(item_scores) / len(item_scores)}
+        return aggregate_score, item_scores
+
+# Use by string name or class
+dataset = EvalDataset.from_list(..., metrics=["f1score"])
+# or
+dataset = EvalDataset.from_list(..., metrics=[F1Score])
+```
+
+### 4. Hyperparameter Sweeping
+
+Test multiple configurations automatically:
+
+```python
+hyperparameters = {
+    "temperature": [0.7, 0.9, 1.0],
+    "max_tokens": [50, 100, 150],
+    "top_p": [0.8, 0.9]
+}
+
+results = evaluate(
+    inference_function,
+    dataset,
+    hyperparameters=hyperparameters,
+    score_type="all"
+)
+
+# Results include all combinations: 3 × 3 × 2 = 18 configurations
+```
+
+### 5. Results and Export
+
+Control result format with `score_type`:
+
+```python
+# Only aggregate scores (default)
+results = evaluate(model, dataset, score_type="aggregate")
+
+# Only per-item scores
+results = evaluate(model, dataset, score_type="item")
+
+# Both aggregate and per-item
+results = evaluate(model, dataset, score_type="all")
+```
+
+Export results:
+
+```python
+# Get EvalResult objects for advanced usage
+results = evaluate(model, dataset, return_type="object")
+
+# Export to files
+for result in results:
+    result.to_json("results.json")
+    result.to_csv("results.csv")
+```
+
+## OpenAI Integration
+
+Scorebook includes built-in OpenAI support for both single requests and batch processing:
+
+```python
+from scorebook.inference.openai import responses, batch
+from scorebook.types.inference_pipeline import InferencePipeline
+
+# For single requests
+pipeline = InferencePipeline(
+    model="gpt-4o-mini",
+    preprocessor=format_for_openai,
+    inference_function=responses,
+    postprocessor=extract_response
+)
+
+# For batch processing (more efficient for large datasets)
+batch_pipeline = InferencePipeline(
+    model="gpt-4o-mini",
+    preprocessor=format_for_openai,
+    inference_function=batch,
+    postprocessor=extract_response
+)
+```
+
+## Examples
+
+The `examples/` directory contains comprehensive examples:
+
+- **`basic_example.py`**: Local model evaluation with Hugging Face
+- **`openai_responses_api.py`**: OpenAI API integration
+- **`openai_batch_api.py`**: OpenAI Batch API for large-scale evaluation
+- **`hyperparam_sweep.py`**: Hyperparameter optimization
+- **`scorebook_showcase.ipynb`**: Interactive Jupyter notebook tutorial
+
+Run an example:
+
+```bash
+cd examples/
+python basic_example.py --output-dir ./my_results
+```
+
+## Architecture
+
+Scorebook follows a modular architecture:
+
+```
+┌───────────────────┐    ┌───────────────┐    ┌───────────────────┐
+│    EvalDataset    │    │   Inference   │    │      Metrics      │
+│                   │    │   Pipeline    │    │                   │
+│ • Data Loading    │    │               │    │ • Accuracy        │
+│ • HF Integration  │    │ • Preprocess  │    │ • Precision       │
+│ • CSV/JSON        │    │ • Inference   │    │ • Custom          │
+│ • Validation      │    │ • Postprocess │    │ • Registry        │
+└───────────────────┘    └───────────────┘    └───────────────────┘
+          │                      │                      │
+          └──────────────────────┼──────────────────────┘
+                                 │
+                     ┌────────────────────────┐
+                     │       evaluate()       │
+                     │                        │
+                     │ • Orchestration        │
+                     │ • Progress Tracking    │
+                     │ • Result Formatting    │
+                     │ • Export Options       │
+                     └────────────────────────┘
+```
+
+## Use Cases
+
+Scorebook is designed for:
+
+- **Model Benchmarking**: Compare different models on standard datasets
+- **Hyperparameter Optimization**: Find optimal model configurations
+- **Dataset Analysis**: Understand model performance across different data types
+- **A/B Testing**: Compare model versions or approaches
+- **Research Experiments**: Reproducible evaluation workflows
+- **Production Monitoring**: Track model performance over time
+
+## Contributing
+
+We welcome contributions! Please see [CONTRIBUTING.md](CONTRIBUTING.md) for guidelines.
+
+## License
+
+This project is licensed under the MIT License - see the [LICENSE](LICENSE) file for details.
+
+## About
+
+Scorebook is developed by [Trismik](https://trismik.com) to speed up your LLM evaluation.
+
+---
+
+*For more examples and detailed documentation, check out the Jupyter notebook in `examples/scorebook_showcase.ipynb`*
+
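The README's OpenAI pipelines reference `format_for_openai` and `extract_response` without defining them. A minimal sketch of what such helpers might look like, assuming the preprocessor builds chat-style messages from an evaluation item and the postprocessor receives the model's text output; both shapes are assumptions, not the package's documented contract:

```python
def format_for_openai(item: dict) -> list[dict]:
    # Assumed shape: chat-style messages built from an evaluation item.
    return [
        {"role": "system", "content": "Answer with a single short answer."},
        {"role": "user", "content": item["question"]},
    ]


def extract_response(output: str) -> str:
    # Assumed shape: the inference function yields text; normalise whitespace.
    return output.strip()
```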
scorebook-0.0.1.dist-info/RECORD
@@ -0,0 +1,24 @@
+scorebook/__init__.py,sha256=cYv8bT3_7o2MTxPVKiv51DcpaPtH_A9qOH5yF_FULZo,336
+scorebook/evaluator.py,sha256=Ce4KerLVPlaF63xng9RKH9M1l-ldo3mdrd3T2dBs_YE,8908
+scorebook/inference/__init__.py,sha256=sU_ZSN9eO7ajZ-QklNpx8_gf3jCdDn69J-SfU0z07-E,333
+scorebook/inference/openai.py,sha256=XD1dbPrEHQJVXOMtqCt9a0yQ-qR381N5XXhCrgz8jio,5826
+scorebook/inference/portkey.py,sha256=OHSS-sa2aLxuO6fEfG8MsPlhXc_95_-6j7ImbCkY8KE,5952
+scorebook/metrics/__init__.py,sha256=be_riJNojebXw2xfkMsHHjl3HFKgk9jQWlLkXJHhheI,782
+scorebook/metrics/accuracy.py,sha256=5KQ4hfOn9M94sB7WsXUelJWJiuKfoCGQEl5q5q9vNfo,1467
+scorebook/metrics/metric_base.py,sha256=I3L0DGcRojFp93UGFnXG1tZ2UK9ilTcXXJG6lj5ddXA,857
+scorebook/metrics/metric_registry.py,sha256=jWwt9P3zvtFLlEYrd60v7LS7X251nZczouE02zcCxWg,3402
+scorebook/metrics/precision.py,sha256=AaYPYYKnY74Nwqp_p3jd2Ewf3VHNOJjoRWf5fhb-tXk,563
+scorebook/types/__init__.py,sha256=xQMOae_fIBbeyeuqoa7SbNwjxAiVinPBbckOcUzo57U,358
+scorebook/types/eval_dataset.py,sha256=TeIeVHQ597NxedxaTEXohZO8gR5iAiDtJbCja_u69EI,11703
+scorebook/types/eval_result.py,sha256=y0vLN6RMgiz1lyai5ltmzDibBHE25-k9bTrQ7U27RZ8,4552
+scorebook/types/inference_pipeline.py,sha256=M3JgchpcVdhRJPzn3mh5ys6iivSt8eBmHIj4F5LcFYU,3167
+scorebook/utils/__init__.py,sha256=DmhS61OZ2nNWkGxDfVrMBwwiH7dmLAbg3MHuNgaXhQg,382
+scorebook/utils/async_utils.py,sha256=OeNvMrOT9P4rIyaCf5IbR3ZIFMtEzXgoAArNbINRtMU,728
+scorebook/utils/io_helpers.py,sha256=ksOJ9ILcZqqt-HwRUYy1NMQbS6RuMh8i2ZzUADLMlQ8,913
+scorebook/utils/mappers.py,sha256=OcUnPBrnSUxZNhAzJhVmVWUWmqIKFXLTrK-xLi6_SUg,1259
+scorebook/utils/progress_bars.py,sha256=BlKqYlXDbik5eUn5nf5f7QnMvnTj8CU_CfXKxCWp3Ww,2909
+scorebook/utils/transform_helpers.py,sha256=UnVLtFvcJrtmBEmLsuA4rrX4iJlNUKxm2DkIOGLl-2o,1030
+scorebook-0.0.1.dist-info/LICENSE,sha256=JLH1g9FhxHZf6CBCeQ_xAisPtICVObuNGW1bLPiTYEs,1068
+scorebook-0.0.1.dist-info/METADATA,sha256=oiwYbuJkRVkoFZkIAQej09LdG5xBLxhKPy2ozWTV-_w,10976
+scorebook-0.0.1.dist-info/WHEEL,sha256=b4K_helf-jlQoXBBETfwnf4B04YC67LOev0jo4fX5m8,88
+scorebook-0.0.1.dist-info/RECORD,,