scorebook-0.0.13-py3-none-any.whl → scorebook-0.0.15-py3-none-any.whl
This diff compares the contents of publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the changes between those versions as they appear in their public registries.
- scorebook/__init__.py +12 -5
- scorebook/cli/auth.py +1 -1
- scorebook/dashboard/__init__.py +1 -0
- scorebook/dashboard/create_project.py +91 -0
- scorebook/{trismik → dashboard}/credentials.py +57 -12
- scorebook/{trismik → dashboard}/upload_results.py +1 -1
- scorebook/eval_datasets/__init__.py +0 -4
- scorebook/eval_datasets/eval_dataset.py +4 -2
- scorebook/evaluate/__init__.py +1 -15
- scorebook/evaluate/_async/evaluate_async.py +36 -19
- scorebook/evaluate/_sync/evaluate.py +36 -19
- scorebook/evaluate/evaluate_helpers.py +4 -3
- scorebook/inference/__init__.py +1 -11
- scorebook/inference/clients/__init__.py +1 -8
- scorebook/inference/inference_pipeline.py +1 -1
- scorebook/metrics/README.md +121 -0
- scorebook/metrics/__init__.py +7 -16
- scorebook/metrics/accuracy.py +2 -6
- scorebook/metrics/bertscore.py +50 -0
- scorebook/metrics/bleu.py +82 -0
- scorebook/metrics/core/__init__.py +1 -0
- scorebook/metrics/{metric_base.py → core/metric_base.py} +1 -2
- scorebook/metrics/core/metric_registry.py +195 -0
- scorebook/metrics/exactmatch.py +95 -0
- scorebook/metrics/f1.py +96 -0
- scorebook/metrics/precision.py +84 -9
- scorebook/metrics/recall.py +94 -0
- scorebook/metrics/rouge.py +85 -0
- scorebook/score/__init__.py +0 -5
- scorebook/score/_async/score_async.py +3 -2
- scorebook/score/_sync/score.py +3 -2
- scorebook/score/score_helpers.py +29 -12
- scorebook/types.py +3 -3
- scorebook/utils/__init__.py +0 -22
- scorebook/utils/common_helpers.py +1 -1
- scorebook/utils/mock_llm/__init__.py +41 -0
- scorebook/utils/mock_llm/data/mock_llm_data.json +21970 -0
- scorebook/utils/progress_bars.py +58 -786
- scorebook-0.0.15.dist-info/METADATA +300 -0
- scorebook-0.0.15.dist-info/RECORD +110 -0
- {scorebook-0.0.13.dist-info → scorebook-0.0.15.dist-info}/WHEEL +1 -1
- tutorials/README.md +147 -0
- tutorials/__init__.py +5 -0
- tutorials/examples/1-score/1-scoring_model_accuracy.py +47 -0
- tutorials/examples/1-score/2-scoring_model_bleu.py +46 -0
- tutorials/examples/1-score/3-scoring_model_f1.py +64 -0
- tutorials/examples/1-score/4-scoring_model_rouge.py +64 -0
- tutorials/examples/1-score/5-scoring_model_exact_match.py +84 -0
- tutorials/examples/1-score/6-scoring_with_bertscore.py +57 -0
- tutorials/examples/1-score/__init__.py +0 -0
- tutorials/examples/2-evaluate/1-evaluating_local_models.py +106 -0
- tutorials/examples/2-evaluate/2-evaluating_local_models_with_batching.py +108 -0
- tutorials/examples/2-evaluate/3-evaluating_cloud_models.py +109 -0
- tutorials/examples/2-evaluate/4-evaluating_cloud_models_with_batching.py +170 -0
- tutorials/examples/2-evaluate/5-hyperparameter_sweeps.py +122 -0
- tutorials/examples/2-evaluate/6-inference_pipelines.py +141 -0
- tutorials/examples/3-evaluation_datasets/1-evaluation_datasets_from_files.py +110 -0
- tutorials/examples/3-evaluation_datasets/2-evaluation_datasets_from_huggingface.py +101 -0
- tutorials/examples/3-evaluation_datasets/3-evaluation_datasets_from_huggingface_with_yaml_configs.py +110 -0
- tutorials/examples/3-evaluation_datasets/example_datasets/basic_questions.csv +11 -0
- tutorials/examples/3-evaluation_datasets/example_datasets/basic_questions.json +42 -0
- tutorials/examples/3-evaluation_datasets/example_yaml_configs/Cais-MMLU.yaml +19 -0
- tutorials/examples/3-evaluation_datasets/example_yaml_configs/TIGER-Lab-MMLU-Pro.yaml +18 -0
- tutorials/examples/4-adaptive_evaluations/1-adaptive_evaluation.py +114 -0
- tutorials/examples/4-adaptive_evaluations/2-adaptive_dataset_splits.py +106 -0
- tutorials/examples/5-upload_results/1-uploading_score_results.py +92 -0
- tutorials/examples/5-upload_results/2-uploading_evaluate_results.py +117 -0
- tutorials/examples/5-upload_results/3-uploading_your_results.py +153 -0
- tutorials/examples/6-providers/aws/__init__.py +1 -0
- tutorials/examples/6-providers/aws/batch_example.py +219 -0
- tutorials/examples/6-providers/portkey/__init__.py +1 -0
- tutorials/examples/6-providers/portkey/batch_example.py +120 -0
- tutorials/examples/6-providers/portkey/messages_example.py +121 -0
- tutorials/examples/6-providers/vertex/__init__.py +1 -0
- tutorials/examples/6-providers/vertex/batch_example.py +166 -0
- tutorials/examples/6-providers/vertex/messages_example.py +142 -0
- tutorials/examples/__init__.py +0 -0
- tutorials/notebooks/1-scoring.ipynb +162 -0
- tutorials/notebooks/2-evaluating.ipynb +316 -0
- tutorials/notebooks/3.1-adaptive_evaluation_phi.ipynb +354 -0
- tutorials/notebooks/3.2-adaptive_evaluation_gpt.ipynb +243 -0
- tutorials/notebooks/4-uploading_results.ipynb +175 -0
- tutorials/quickstarts/adaptive_evaluations/adaptive_evaluation_openai_demo.ipynb +229 -0
- tutorials/quickstarts/adaptive_evaluations/adaptive_evaluation_qwen_demo.ipynb +256 -0
- tutorials/quickstarts/classical_evaluations/classical_evaluation_demo.ipynb +277 -0
- tutorials/quickstarts/getting_started.ipynb +197 -0
- tutorials/utils/__init__.py +35 -0
- tutorials/utils/args_parser.py +132 -0
- tutorials/utils/output.py +23 -0
- tutorials/utils/setup.py +98 -0
- scorebook/metrics/metric_registry.py +0 -105
- scorebook/trismik/__init__.py +0 -10
- scorebook-0.0.13.dist-info/METADATA +0 -389
- scorebook-0.0.13.dist-info/RECORD +0 -50
- {scorebook-0.0.13.dist-info → scorebook-0.0.15.dist-info}/entry_points.txt +0 -0
- {scorebook-0.0.13.dist-info → scorebook-0.0.15.dist-info}/licenses/LICENSE +0 -0
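Two internal packages are renamed in this release: `scorebook/trismik/*` becomes `scorebook/dashboard/*`, and `scorebook/metrics/metric_base.py` moves under `scorebook/metrics/core/`. As a minimal sketch (based only on the module paths visible in the listing above, not on documented scorebook API), downstream code that imports these modules directly could guard against the move like this:

```python
# Hypothetical compatibility shim for the scorebook.trismik -> scorebook.dashboard
# rename shown in this diff; module paths are taken from the file listing above.
try:
    # 0.0.15 layout
    from scorebook.dashboard import credentials, upload_results
except ImportError:
    # 0.0.13 layout
    from scorebook.trismik import credentials, upload_results
```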
scorebook-0.0.13.dist-info/METADATA

@@ -1,389 +0,0 @@
-Metadata-Version: 2.4
-Name: scorebook
-Version: 0.0.13
-Summary: A Python project for LLM evaluation.
-License-File: LICENSE
-Author: Euan Campbell
-Author-email: euan@trismik.com
-Requires-Python: >=3.9, <3.14
-Classifier: Programming Language :: Python :: 3
-Classifier: Programming Language :: Python :: 3.9
-Classifier: Programming Language :: Python :: 3.10
-Classifier: Programming Language :: Python :: 3.11
-Classifier: Programming Language :: Python :: 3.12
-Classifier: Programming Language :: Python :: 3.13
-Provides-Extra: bedrock
-Provides-Extra: examples
-Provides-Extra: openai
-Provides-Extra: portkey
-Provides-Extra: vertex
-Requires-Dist: accelerate ; extra == "examples"
-Requires-Dist: boto3 (==1.40.0) ; extra == "bedrock"
-Requires-Dist: datasets (>=3.6.0)
-Requires-Dist: fsspec[gcs] ; extra == "vertex"
-Requires-Dist: google-cloud-storage ; extra == "vertex"
-Requires-Dist: google-genai ; extra == "vertex"
-Requires-Dist: ipywidgets (>=8.0.0)
-Requires-Dist: notebook (>=7.4.5,<8.0.0)
-Requires-Dist: notebook ; extra == "examples"
-Requires-Dist: openai ; extra == "openai"
-Requires-Dist: pandas ; extra == "vertex"
-Requires-Dist: portkey-ai ; extra == "portkey"
-Requires-Dist: python-dotenv ; extra == "bedrock"
-Requires-Dist: python-dotenv ; extra == "openai"
-Requires-Dist: python-dotenv ; extra == "portkey"
-Requires-Dist: python-dotenv ; extra == "vertex"
-Requires-Dist: torch ; extra == "examples"
-Requires-Dist: torchaudio ; extra == "examples"
-Requires-Dist: torchvision ; extra == "examples"
-Requires-Dist: transformers ; extra == "examples"
-Requires-Dist: trismik (==1.0.2)
-Description-Content-Type: text/markdown
-
-# Scorebook
-
-**A Python library for LLM evaluation**
-
-<p align="center">
-    <img alt="Dynamic TOML Badge" src="https://img.shields.io/badge/dynamic/toml?url=https%3A%2F%2Fraw.githubusercontent.com%2Ftrismik%2Fscorebook%2Frefs%2Fheads%2Fmain%2Fpyproject.toml&query=tool.poetry.version&style=flat&label=version">
-    <img alt="Python Version" src="https://img.shields.io/badge/python-3.9%2B-blue">
-    <img alt="License" src="https://img.shields.io/badge/license-MIT-green">
-</p>
-
-Scorebook is a flexible and extensible framework for evaluating Large Language Models (LLMs). It provides clear contracts for data loading, model inference, and metrics computation, making it easy to run comprehensive evaluations across different datasets, models, and metrics.
-
-## ✨ Key Features
-
-- **🔌 Flexible Data Loading**: Support for Hugging Face datasets, CSV, JSON, and Python lists
-- **🚀 Model Agnostic**: Works with any model or inference provider
-- **📊 Extensible Metric Engine**: Use the metrics we provide or implement your own
-- **🔄 Automated Sweeping**: Test multiple model configurations automatically
-- **📈 Rich Results**: Export results to JSON, CSV, or structured formats like pandas DataFrames
-
-## 🚀 Quick Start
-
-### Installation
-
-```bash
-pip install scorebook
-```
-
-For OpenAI integration:
-```bash
-pip install scorebook[openai]
-```
-
-For local model examples:
-```bash
-pip install scorebook[examples]
-```
-
-### Basic Usage
-
-```python
-from scorebook import EvalDataset, evaluate
-from scorebook.metrics import Accuracy
-
-# 1. Create an evaluation dataset
-data = [
-    {"question": "What is 2 + 2?", "answer": "4"},
-    {"question": "What is the capital of France?", "answer": "Paris"},
-    {"question": "Who wrote Romeo and Juliet?", "answer": "William Shakespeare"}
-]
-
-dataset = EvalDataset.from_list(
-    name="basic_qa",
-    label="answer",
-    metrics=[Accuracy],
-    data=data
-)
-
-# 2. Define your inference function
-def my_inference_function(items, **hyperparameters):
-    # Your model logic here
-    predictions = []
-    for item in items:
-        # Process each item and generate prediction
-        prediction = your_model.predict(item["question"])
-        predictions.append(prediction)
-    return predictions
-
-# 3. Run evaluation
-results = evaluate(my_inference_function, dataset)
-print(results)
-```
-
-## 📊 Core Components
-
-### 1. Evaluation Datasets
-
-Scorebook supports multiple data sources through the `EvalDataset` class:
-
-#### From Hugging Face
-```python
-dataset = EvalDataset.from_huggingface(
-    "TIGER-Lab/MMLU-Pro",
-    label="answer",
-    metrics=[Accuracy],
-    split="validation"
-)
-```
-
-#### From CSV
-```python
-dataset = EvalDataset.from_csv(
-    "dataset.csv",
-    label="answer",
-    metrics=[Accuracy]
-)
-```
-
-#### From JSON
-```python
-dataset = EvalDataset.from_json(
-    "dataset.json",
-    label="answer",
-    metrics=[Accuracy]
-)
-```
-
-#### From Python List
-```python
-dataset = EvalDataset.from_list(
-    name="custom_dataset",
-    label="answer",
-    metrics=[Accuracy],
-    data=[{"question": "...", "answer": "..."}]
-)
-```
-
-### 2. Model Integration
-
-Scorebook offers two approaches for model integration:
-
-#### Inference Functions
-A single function that handles the complete pipeline:
-
-```python
-def inference_function(eval_items, **hyperparameters):
-    results = []
-    for item in eval_items:
-        # 1. Preprocessing
-        prompt = format_prompt(item)
-
-        # 2. Inference
-        output = model.generate(prompt)
-
-        # 3. Postprocessing
-        prediction = extract_answer(output)
-        results.append(prediction)
-
-    return results
-```
-
-#### Inference Pipelines
-Modular approach with separate stages:
-
-```python
-from scorebook.types.inference_pipeline import InferencePipeline
-
-def preprocessor(item):
-    return {"messages": [{"role": "user", "content": item["question"]}]}
-
-def inference_function(processed_items, **hyperparameters):
-    return [model.generate(item) for item in processed_items]
-
-def postprocessor(output):
-    return output.strip()
-
-pipeline = InferencePipeline(
-    model="my-model",
-    preprocessor=preprocessor,
-    inference_function=inference_function,
-    postprocessor=postprocessor
-)
-
-results = evaluate(pipeline, dataset)
-```
-
-### 3. Metrics System
-
-#### Built-in Metrics
-- **Accuracy**: Percentage of correct predictions
-- **Precision**: Accuracy of positive predictions
-
-```python
-from scorebook.metrics import Accuracy, Precision
-
-dataset = EvalDataset.from_list(
-    name="test",
-    label="answer",
-    metrics=[Accuracy, Precision],  # Multiple metrics
-    data=data
-)
-```
-
-#### Custom Metrics
-Create custom metrics by extending `MetricBase`:
-
-```python
-from scorebook.metrics import MetricBase, MetricRegistry
-
-@MetricRegistry.register()
-class F1Score(MetricBase):
-    @staticmethod
-    def score(outputs, labels):
-        # Calculate F1 score
-        item_scores = [calculate_f1_item(o, l) for o, l in zip(outputs, labels)]
-        aggregate_score = {"f1": sum(item_scores) / len(item_scores)}
-        return aggregate_score, item_scores
-
-# Use by string name or class
-dataset = EvalDataset.from_list(..., metrics=["f1score"])
-# or
-dataset = EvalDataset.from_list(..., metrics=[F1Score])
-```
-
-### 4. Hyperparameter Sweeping
-
-Test multiple configurations automatically:
-
-```python
-hyperparameters = {
-    "temperature": [0.7, 0.9, 1.0],
-    "max_tokens": [50, 100, 150],
-    "top_p": [0.8, 0.9]
-}
-
-results = evaluate(
-    inference_function,
-    dataset,
-    hyperparameters=hyperparameters,
-    score_type="all"
-)
-
-# Results include all combinations: 3 × 3 × 2 = 18 configurations
-```
-
-### 5. Results and Export
-
-Control result format with `score_type`:
-
-```python
-# Only aggregate scores (default)
-results = evaluate(model, dataset, score_type="aggregate")
-
-# Only per-item scores
-results = evaluate(model, dataset, score_type="item")
-
-# Both aggregate and per-item
-results = evaluate(model, dataset, score_type="all")
-```
-
-Export results:
-
-```python
-# Get EvalResult objects for advanced usage
-results = evaluate(model, dataset, return_type="object")
-
-# Export to files
-for result in results:
-    result.to_json("results.json")
-    result.to_csv("results.csv")
-```
-
-## 🔧 OpenAI Integration
-
-Scorebook includes built-in OpenAI support for both single requests and batch processing:
-
-```python
-from scorebook.inference.openai import responses, batch
-from scorebook.types.inference_pipeline import InferencePipeline
-
-# For single requests
-pipeline = InferencePipeline(
-    model="gpt-4o-mini",
-    preprocessor=format_for_openai,
-    inference_function=responses,
-    postprocessor=extract_response
-)
-
-# For batch processing (more efficient for large datasets)
-batch_pipeline = InferencePipeline(
-    model="gpt-4o-mini",
-    preprocessor=format_for_openai,
-    inference_function=batch,
-    postprocessor=extract_response
-)
-```
-
-## 📋 Examples
-
-The `examples/` directory contains comprehensive examples:
-
-- **`basic_example.py`**: Local model evaluation with Hugging Face
-- **`openai_responses_api.py`**: OpenAI API integration
-- **`openai_batch_api.py`**: OpenAI Batch API for large-scale evaluation
-- **`hyperparam_sweep.py`**: Hyperparameter optimization
-- **`scorebook_showcase.ipynb`**: Interactive Jupyter notebook tutorial
-
-Run an example:
-
-```bash
-cd examples/
-python basic_example.py --output-dir ./my_results
-```
-
-## 🏗️ Architecture
-
-Scorebook follows a modular architecture:
-
-```
-┌─────────────────┐    ┌──────────────┐    ┌─────────────────┐
-│   EvalDataset   │    │  Inference   │    │     Metrics     │
-│                 │    │   Pipeline   │    │                 │
-│ • Data Loading  │    │              │    │ • Accuracy      │
-│ • HF Integration│    │ • Preprocess │    │ • Precision     │
-│ • CSV/JSON      │    │ • Inference  │    │ • Custom        │
-│ • Validation    │    │ • Postprocess│    │ • Registry      │
-└─────────────────┘    └──────────────┘    └─────────────────┘
-         │                     │                     │
-         └─────────────────────┼─────────────────────┘
-                               │
-                    ┌─────────────────────┐
-                    │      evaluate()     │
-                    │                     │
-                    │ • Orchestration     │
-                    │ • Progress Tracking │
-                    │ • Result Formatting │
-                    │ • Export Options    │
-                    └─────────────────────┘
-```
-
-## 🎯 Use Cases
-
-Scorebook is designed for:
-
-- **🏆 Model Benchmarking**: Compare different models on standard datasets
-- **⚙️ Hyperparameter Optimization**: Find optimal model configurations
-- **📊 Dataset Analysis**: Understand model performance across different data types
-- **🔄 A/B Testing**: Compare model versions or approaches
-- **🔬 Research Experiments**: Reproducible evaluation workflows
-- **📈 Production Monitoring**: Track model performance over time
-
-## 🤝 Contributing
-
-We welcome contributions! Please see [CONTRIBUTING.md](CONTRIBUTING.md) for guidelines.
-
-## 📄 License
-
-This project is licensed under the MIT License - see the [LICENSE](LICENSE) file for details.
-
-## 🏢 About
-
-Scorebook is developed by [Trismik](https://trismik.com) to speed up your LLM evaluation.
-
----
-
-*For more examples and detailed documentation, check out the Jupyter notebook in `examples/scorebook_showcase.ipynb`*
-
scorebook-0.0.13.dist-info/RECORD

@@ -1,50 +0,0 @@
-scorebook/__init__.py,sha256=dcaqd4-qxLHPCw6p-LS_0b8JumEpHDtEilgwP8qNKRY,868
-scorebook/cli/__init__.py,sha256=E89jR1DljFSHhfjEGSRKLgz0KhxGyRQ9a3vpUOmQL9o,32
-scorebook/cli/auth.py,sha256=T6-5662Jh-HEhZvfUgy82BvxIiRzjZne-4LRp9Gb2JE,2937
-scorebook/cli/main.py,sha256=cEvShENl6L6feX_sa7FGNTeoz5UtwqzwenmcHaON1hg,1589
-scorebook/eval_datasets/__init__.py,sha256=9YPjxjdaMaOrBUzJwvsUlFPl-KdYMgUGTV3WNd7OCU0,128
-scorebook/eval_datasets/eval_dataset.py,sha256=rgNLPajIe6RqEq_qEeV6UExT1ZRx4dFX6XA_qnpbRIM,27930
-scorebook/evaluate/__init__.py,sha256=m3mCjeLildghT86ZDwY4GxCmaYZmhjbxkuTk0M9S_mc,423
-scorebook/evaluate/_async/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-scorebook/evaluate/_async/evaluate_async.py,sha256=QAvAA6_upg_vK2M8BV_DVegQ7hxZCO_PaOaM3yj3e6Y,16284
-scorebook/evaluate/_sync/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-scorebook/evaluate/_sync/evaluate.py,sha256=MQ53VMPLlVBcZm2mOUs3zXeM3hvoWz3fLj_BNvrOLvU,16087
-scorebook/evaluate/evaluate_helpers.py,sha256=j-oTKAfVhyH-kxdM7YjuoolS_SwBb2WbjWhe5jeZiwU,16942
-scorebook/exceptions.py,sha256=3sxCWhFqYgXiWNUAMRR2ggLfqvbDI8e5vLjnT9V7X1M,3649
-scorebook/inference/__init__.py,sha256=u3TmfftO0oMkz8ngwxAKLPfL1so1w2hbK7c5UNlRq-M,345
-scorebook/inference/clients/__init__.py,sha256=QCjbrXYeFd7xK-5ZH7o7bSaKUJqHtGnH5285pezNKyY,242
-scorebook/inference/clients/bedrock.py,sha256=bsnz0IB6ufjZVPd6syD3yVaOelerB5G_YAmPAVqmBmI,10071
-scorebook/inference/clients/openai.py,sha256=JPPcJdxYwwZNUfXCGTRRzzUUA8I8WiV3bu6-pgS1_UE,9043
-scorebook/inference/clients/portkey.py,sha256=RCuEZB8xNAVeGEt3IJ0esh_wqreZNB3jrDKiWH6miV0,5949
-scorebook/inference/clients/vertex.py,sha256=g6oNnag0qcLOYCtQ4SXAXfnqKtvPAVdigB--I7wU1yM,9871
-scorebook/inference/inference_pipeline.py,sha256=SOr1xnglPvFMcJFSpDRLQ6222NJgy_-fVtZLC423TUE,5559
-scorebook/metrics/__init__.py,sha256=be_riJNojebXw2xfkMsHHjl3HFKgk9jQWlLkXJHhheI,782
-scorebook/metrics/accuracy.py,sha256=5KQ4hfOn9M94sB7WsXUelJWJiuKfoCGQEl5q5q9vNfo,1467
-scorebook/metrics/metric_base.py,sha256=I3L0DGcRojFp93UGFnXG1tZ2UK9ilTcXXJG6lj5ddXA,857
-scorebook/metrics/metric_registry.py,sha256=jWwt9P3zvtFLlEYrd60v7LS7X251nZczouE02zcCxWg,3402
-scorebook/metrics/precision.py,sha256=AaYPYYKnY74Nwqp_p3jd2Ewf3VHNOJjoRWf5fhb-tXk,563
-scorebook/score/__init__.py,sha256=pwjSEb8Tc1edQpYDuu49wnupazISpRX3DQGD2cfiJek,208
-scorebook/score/_async/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-scorebook/score/_async/score_async.py,sha256=GM84UcuFvW1x6ZIePEshG2cwVNB9GvwhhjouOduUwTA,6097
-scorebook/score/_sync/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-scorebook/score/_sync/score.py,sha256=rbJhYEhu8auHG4AwpZIkmzw_0ZK1bzbDiIK7Q0ApxhY,6043
-scorebook/score/score_helpers.py,sha256=lq0t5UrOgxa_pDiwL3yHbBlT2BL5B-SkWw1nyaXVoZU,7074
-scorebook/settings.py,sha256=qZrNiki6rFXn43udmhjQSmdDKOEaX62WYoEs2Rbggr0,720
-scorebook/trismik/__init__.py,sha256=kWZkEC57LJscRZNLE3sJR1L5w-ltb5mEbQd3_ePtZPQ,380
-scorebook/trismik/credentials.py,sha256=WtJLaNmBMwCi6gT1Bgp4J9x2tq5HDrDI9U074r08TnU,3275
-scorebook/trismik/upload_results.py,sha256=jgT9EVFpuv6OmrYgZVi032cbRrcCOyX4ulLDeWPFBWU,9743
-scorebook/types.py,sha256=f6X3lNtpIB6jC12WeLugtSpOgY4KlQgUlhxRG_d2GGE,4891
-scorebook/utils/__init__.py,sha256=crefSaTUWkhFF-w4kotUzcz9_GGZukQDgRit4HxJRHY,805
-scorebook/utils/async_utils.py,sha256=2ewk_VOePib8z7DTRl-pZQBGzVI3L3JvnEuYW-DTkRA,1325
-scorebook/utils/common_helpers.py,sha256=jewPdQH4JqTWcYT31wn1WNucOPLtGbrGdViwwlYRhD4,1216
-scorebook/utils/io_helpers.py,sha256=ORO6DwtXOKWJq9v_isuunUrz0viE3xy2qYO4lrgU-TM,1437
-scorebook/utils/jinja_helpers.py,sha256=ksIKHiKdj8N0o7ZJZGasfbSNoAY6K5d9X_KM6mcKYD4,4208
-scorebook/utils/mappers.py,sha256=OcUnPBrnSUxZNhAzJhVmVWUWmqIKFXLTrK-xLi6_SUg,1259
-scorebook/utils/progress_bars.py,sha256=gdT6dJ9LMLYzs7TospP3wQNY9htm_FhVLdX0ueluC6E,31890
-scorebook/utils/render_template.py,sha256=NOaZt-N1WcR5MA7at1XxzD-4sFMFKo9X0k7fKq6oSSM,1654
-scorebook/utils/transform_helpers.py,sha256=UnVLtFvcJrtmBEmLsuA4rrX4iJlNUKxm2DkIOGLl-2o,1030
-scorebook-0.0.13.dist-info/METADATA,sha256=Z4ZyN6upriTbl-8RciJGFufcwbuY_eNO3O7aSeIAr6U,11508
-scorebook-0.0.13.dist-info/WHEEL,sha256=zp0Cn7JsFoX2ATtOhtaFYIiE2rmFAD4OcMhtUki8W3U,88
-scorebook-0.0.13.dist-info/entry_points.txt,sha256=9gNd3Q0MEozhJ7fog-Q-Z_PrcGMnF-404Jon40MH2_U,53
-scorebook-0.0.13.dist-info/licenses/LICENSE,sha256=JLH1g9FhxHZf6CBCeQ_xAisPtICVObuNGW1bLPiTYEs,1068
-scorebook-0.0.13.dist-info/RECORD,,
{scorebook-0.0.13.dist-info → scorebook-0.0.15.dist-info}/entry_points.txt
File without changes

{scorebook-0.0.13.dist-info → scorebook-0.0.15.dist-info}/licenses/LICENSE
File without changes