scorebook-0.0.1-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
--- /dev/null
+++ scorebook/utils/__init__.py
@@ -0,0 +1,8 @@
+"""Utility functions and common helpers for the Scorebook framework."""
+
+from scorebook.utils.async_utils import is_awaitable
+from scorebook.utils.io_helpers import validate_path
+from scorebook.utils.progress_bars import evaluation_progress
+from scorebook.utils.transform_helpers import expand_dict
+
+__all__ = ["is_awaitable", "validate_path", "expand_dict", "evaluation_progress"]
--- /dev/null
+++ scorebook/utils/async_utils.py
@@ -0,0 +1,27 @@
+"""Async utilities for handling callable objects and coroutines."""
+
+import asyncio
+from typing import Callable
+
+
+def is_awaitable(obj: Callable) -> bool:
+    """
+    Check if a callable object is awaitable.
+
+    This handles both coroutine functions and callable instances (like classes
+    with __call__ methods) that may return coroutines.
+
+    Args:
+        obj: The callable object to check
+
+    Returns:
+        True if the object is awaitable, False otherwise
+    """
+    if asyncio.iscoroutinefunction(obj):
+        return True
+
+    # Check if it's a callable instance with an awaitable __call__ method
+    if hasattr(obj, "__call__") and asyncio.iscoroutinefunction(obj.__call__):
+        return True
+
+    return False
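
For reference, a minimal usage sketch of `is_awaitable` as exported from `scorebook.utils`; the `async_fn`, `sync_fn`, and `AsyncModel` names below are illustrative, not part of the package:

```python
from scorebook.utils import is_awaitable


async def async_fn(item):
    return item


def sync_fn(item):
    return item


class AsyncModel:
    """Callable instance whose __call__ is a coroutine function."""

    async def __call__(self, item):
        return item


print(is_awaitable(async_fn))      # True  (coroutine function)
print(is_awaitable(sync_fn))       # False (plain function)
print(is_awaitable(AsyncModel()))  # True  (async __call__ on an instance)
```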
--- /dev/null
+++ scorebook/utils/io_helpers.py
@@ -0,0 +1,28 @@
+"""Input/output helper functions for Scorebook."""
+
+from pathlib import Path
+from typing import Optional
+
+
+def validate_path(file_path: str, expected_suffix: Optional[str] = None) -> Path:
+    """Validate that a file path exists and optionally check its suffix.
+
+    Args:
+        file_path: Path to the file as string or Path object
+        expected_suffix: Optional file extension to validate (e.g. ".json", ".csv")
+
+    Returns:
+        Path object for the validated file path
+
+    Raises:
+        FileNotFoundError: If the file does not exist
+        ValueError: If the file has the wrong extension
+    """
+    path = Path(file_path)
+    if not path.exists():
+        raise FileNotFoundError(f"File not found: {file_path}")
+
+    if expected_suffix and path.suffix.lower() != expected_suffix.lower():
+        raise ValueError(f"File must have {expected_suffix} extension, got: {path.suffix}")
+
+    return path
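
A quick sketch of how `validate_path` is used; the `dataset.json` path is illustrative and assumed to exist:

```python
from scorebook.utils import validate_path

# Returns a pathlib.Path when the file exists and the suffix matches.
path = validate_path("dataset.json", expected_suffix=".json")

# Raises FileNotFoundError if the file is missing, and ValueError if the
# suffix differs, e.g. validate_path("dataset.json", expected_suffix=".csv").
```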
--- /dev/null
+++ scorebook/utils/mappers.py
@@ -0,0 +1,36 @@
+"""Utility functions for mapping and converting data types in Scorebook."""
+
+from typing import Any, Literal
+
+ClassificationResult = Literal["true_positive", "false_positive", "true_negative", "false_negative"]
+
+
+def to_binary(value: Any) -> int:
+    """Transform various input types to binary (0/1) classification value."""
+    if value is None:
+        return 0
+    if isinstance(value, str):
+        if value.upper() in ["A", "1", "TRUE", "YES", "Y"]:
+            return 1
+        return 0
+    return 1 if value else 0
+
+
+def to_binary_classification(prediction: Any, reference: Any) -> ClassificationResult:
+    """
+    Determine classification result based on prediction and reference values.
+
+    Args:
+        prediction: Predicted value (will be converted to binary)
+        reference: Reference/true value (will be converted to binary)
+
+    Returns:
+        Classification result as one of: "true_positive", "false_positive",
+        "true_negative", "false_negative"
+    """
+    pred_binary = to_binary(prediction)
+    ref_binary = to_binary(reference)
+
+    if pred_binary == 1:
+        return "true_positive" if ref_binary == 1 else "false_positive"
+    return "false_negative" if ref_binary == 1 else "true_negative"
--- /dev/null
+++ scorebook/utils/progress_bars.py
@@ -0,0 +1,89 @@
+"""Progress bar utilities for evaluation tracking."""
+
+from contextlib import contextmanager
+from typing import Any, Generator, List, Optional
+
+from tqdm import tqdm
+
+
+class EvaluationProgressBars:
+    """Manages nested progress bars for evaluation tracking."""
+
+    def __init__(self, datasets: List[Any], hyperparam_count: int) -> None:
+        """Initialize progress bar manager.
+
+        Args:
+            datasets: List of datasets being evaluated
+            hyperparam_count: Number of hyperparameter configurations per dataset
+        """
+        self.datasets = datasets
+        self.hyperparam_count = hyperparam_count
+        self.dataset_pbar: Optional[tqdm] = None
+        self.hyperparam_pbar: Optional[tqdm] = None
+
+    def start_dataset_progress(self) -> None:
+        """Start the outer progress bar for datasets."""
+        self.dataset_pbar = tqdm(
+            total=len(self.datasets),
+            desc="Datasets ",
+            unit="dataset",
+            position=0,
+            leave=True,
+            ncols=80,
+            bar_format="{desc} {percentage:3.0f}%|{bar:40}| {n_fmt}/{total_fmt}",
+        )
+
+    def update_dataset_progress(self) -> None:
+        """Update the dataset progress bar."""
+        if self.dataset_pbar:
+            self.dataset_pbar.update(1)
+
+    def close_dataset_progress(self) -> None:
+        """Close the dataset progress bar."""
+        if self.dataset_pbar:
+            self.dataset_pbar.close()
+            self.dataset_pbar = None
+
+    @contextmanager
+    def hyperparam_progress_context(self) -> Generator[tqdm, None, None]:
+        """Context manager for hyperparameter progress bar."""
+        self.hyperparam_pbar = tqdm(
+            total=self.hyperparam_count,
+            desc="Hyperparams",
+            unit="config",
+            position=1,
+            leave=False,
+            ncols=80,
+            bar_format="{desc} {percentage:3.0f}%|{bar:40}| {n_fmt}/{total_fmt}",
+        )
+        try:
+            yield self.hyperparam_pbar
+        finally:
+            self.hyperparam_pbar.close()
+            self.hyperparam_pbar = None
+
+    def update_hyperparam_progress(self) -> None:
+        """Update the hyperparameter progress bar."""
+        if self.hyperparam_pbar:
+            self.hyperparam_pbar.update(1)
+
+
+@contextmanager
+def evaluation_progress(
+    datasets: List[Any], hyperparam_count: int
+) -> Generator[EvaluationProgressBars, None, None]:
+    """Context manager for evaluation progress bars.
+
+    Args:
+        datasets: List of datasets being evaluated
+        hyperparam_count: Number of hyperparameter configurations per dataset
+
+    Yields:
+        EvaluationProgressBars: Progress bar manager instance
+    """
+    progress_bars = EvaluationProgressBars(datasets, hyperparam_count)
+    progress_bars.start_dataset_progress()
+    try:
+        yield progress_bars
+    finally:
+        progress_bars.close_dataset_progress()
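
A minimal sketch of how `evaluation_progress` appears intended to be driven, with an outer bar over datasets and an inner bar over hyperparameter configurations; the dataset and configuration values are placeholders:

```python
from scorebook.utils import evaluation_progress

datasets = ["dataset_a", "dataset_b"]                   # stand-ins for EvalDataset objects
configs = [{"temperature": 0.7}, {"temperature": 1.0}]  # stand-ins for hyperparameter configs

with evaluation_progress(datasets, len(configs)) as progress:
    for dataset in datasets:
        with progress.hyperparam_progress_context():
            for config in configs:
                ...  # run inference and scoring for (dataset, config) here
                progress.update_hyperparam_progress()
        progress.update_dataset_progress()
```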
--- /dev/null
+++ scorebook/utils/transform_helpers.py
@@ -0,0 +1,25 @@
+"""Utility functions for transforming and manipulating data structures."""
+
+from itertools import product
+
+
+def expand_dict(data: dict) -> list[dict]:
+    """Expand a dictionary with list values into multiple dictionaries.
+
+    Takes a dictionary that may contain list values and expands it into a list of dictionaries,
+    where each dictionary represents one possible combination of values from the lists.
+    Non-list values remain constant across all generated dictionaries.
+
+    Args:
+        data: A dictionary potentially containing list values to be expanded
+
+    Returns:
+        A list of dictionaries representing all possible combinations of the input values
+    """
+    fixed = {k: v for k, v in data.items() if not isinstance(v, list)}
+    expandables = {k: v for k, v in data.items() if isinstance(v, list)}
+
+    keys, values = zip(*expandables.items()) if expandables else ([], [])
+    combinations = product(*values) if values else [()]
+
+    return [{**fixed, **dict(zip(keys, combo))} for combo in combinations]
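
A worked example of `expand_dict` on a dict-of-lists grid like the one in the README's hyperparameter-sweeping section; non-list values are carried through unchanged:

```python
from scorebook.utils import expand_dict

grid = {"model": "my-model", "temperature": [0.7, 1.0], "top_p": [0.8, 0.9]}

for config in expand_dict(grid):
    print(config)
# {'model': 'my-model', 'temperature': 0.7, 'top_p': 0.8}
# {'model': 'my-model', 'temperature': 0.7, 'top_p': 0.9}
# {'model': 'my-model', 'temperature': 1.0, 'top_p': 0.8}
# {'model': 'my-model', 'temperature': 1.0, 'top_p': 0.9}
```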
--- /dev/null
+++ scorebook-0.0.1.dist-info/LICENSE
@@ -0,0 +1,21 @@
+MIT License
+
+Copyright (c) 2025 Trismik Ltd
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
--- /dev/null
+++ scorebook-0.0.1.dist-info/METADATA
@@ -0,0 +1,376 @@
+Metadata-Version: 2.3
+Name: scorebook
+Version: 0.0.1
+Summary: A Python project for LLM evaluation.
+Author: Euan Campbell
+Author-email: euan@trismik.com
+Requires-Python: >=3.9
+Classifier: Programming Language :: Python :: 3
+Classifier: Programming Language :: Python :: 3.9
+Classifier: Programming Language :: Python :: 3.10
+Classifier: Programming Language :: Python :: 3.11
+Classifier: Programming Language :: Python :: 3.12
+Classifier: Programming Language :: Python :: 3.13
+Provides-Extra: examples
+Provides-Extra: openai
+Provides-Extra: portkey
+Requires-Dist: accelerate ; extra == "examples"
+Requires-Dist: datasets (>=3.6.0)
+Requires-Dist: notebook ; extra == "examples"
+Requires-Dist: openai ; extra == "openai"
+Requires-Dist: portkey-ai ; extra == "portkey"
+Requires-Dist: python-dotenv ; extra == "openai"
+Requires-Dist: python-dotenv ; extra == "portkey"
+Requires-Dist: torch ; extra == "examples"
+Requires-Dist: torchaudio ; extra == "examples"
+Requires-Dist: torchvision ; extra == "examples"
+Requires-Dist: transformers ; extra == "examples"
+Description-Content-Type: text/markdown
+
+# Scorebook
+
+**A Python library for LLM evaluation**
+
+<p align="center">
+  <img alt="Dynamic TOML Badge" src="https://img.shields.io/badge/dynamic/toml?url=https%3A%2F%2Fraw.githubusercontent.com%2Ftrismik%2Fscorebook%2Frefs%2Fheads%2Fmain%2Fpyproject.toml&query=tool.poetry.version&style=flat&label=version">
+  <img alt="Python Version" src="https://img.shields.io/badge/python-3.9%2B-blue">
+  <img alt="License" src="https://img.shields.io/badge/license-MIT-green">
+</p>
+
+Scorebook is a flexible and extensible framework for evaluating Large Language Models (LLMs). It provides clear contracts for data loading, model inference, and metrics computation, making it easy to run comprehensive evaluations across different datasets, models, and metrics.
+
+## ✨ Key Features
+
+- **πŸ”Œ Flexible Data Loading**: Support for Hugging Face datasets, CSV, JSON, and Python lists
+- **πŸš€ Model Agnostic**: Works with any model or inference provider
+- **πŸ“Š Extensible Metric Engine**: Use the metrics we provide or implement your own
+- **πŸ”„ Automated Sweeping**: Test multiple model configurations automatically
+- **πŸ“ˆ Rich Results**: Export results to JSON, CSV, or structured formats like pandas DataFrames
+
+## πŸš€ Quick Start
+
+### Installation
+
+```bash
+pip install scorebook
+```
+
+For OpenAI integration:
+```bash
+pip install scorebook[openai]
+```
+
+For local model examples:
+```bash
+pip install scorebook[examples]
+```
+
+### Basic Usage
+
+```python
+from scorebook import EvalDataset, evaluate
+from scorebook.metrics import Accuracy
+
+# 1. Create an evaluation dataset
+data = [
+    {"question": "What is 2 + 2?", "answer": "4"},
+    {"question": "What is the capital of France?", "answer": "Paris"},
+    {"question": "Who wrote Romeo and Juliet?", "answer": "William Shakespeare"}
+]
+
+dataset = EvalDataset.from_list(
+    name="basic_qa",
+    label="answer",
+    metrics=[Accuracy],
+    data=data
+)
+
+# 2. Define your inference function
+def my_inference_function(items, **hyperparameters):
+    # Your model logic here
+    predictions = []
+    for item in items:
+        # Process each item and generate prediction
+        prediction = your_model.predict(item["question"])
+        predictions.append(prediction)
+    return predictions
+
+# 3. Run evaluation
+results = evaluate(my_inference_function, dataset)
+print(results)
+```
+
+## πŸ“Š Core Components
+
+### 1. Evaluation Datasets
+
+Scorebook supports multiple data sources through the `EvalDataset` class:
+
+#### From Hugging Face
+```python
+dataset = EvalDataset.from_huggingface(
+    "TIGER-Lab/MMLU-Pro",
+    label="answer",
+    metrics=[Accuracy],
+    split="validation"
+)
+```
+
+#### From CSV
+```python
+dataset = EvalDataset.from_csv(
+    "dataset.csv",
+    label="answer",
+    metrics=[Accuracy]
+)
+```
+
+#### From JSON
+```python
+dataset = EvalDataset.from_json(
+    "dataset.json",
+    label="answer",
+    metrics=[Accuracy]
+)
+```
+
+#### From Python List
+```python
+dataset = EvalDataset.from_list(
+    name="custom_dataset",
+    label="answer",
+    metrics=[Accuracy],
+    data=[{"question": "...", "answer": "..."}]
+)
+```
+
+### 2. Model Integration
+
+Scorebook offers two approaches for model integration:
+
+#### Inference Functions
+A single function that handles the complete pipeline:
+
+```python
+def inference_function(eval_items, **hyperparameters):
+    results = []
+    for item in eval_items:
+        # 1. Preprocessing
+        prompt = format_prompt(item)
+
+        # 2. Inference
+        output = model.generate(prompt)
+
+        # 3. Postprocessing
+        prediction = extract_answer(output)
+        results.append(prediction)
+
+    return results
+```
+
+#### Inference Pipelines
+Modular approach with separate stages:
+
+```python
+from scorebook.types.inference_pipeline import InferencePipeline
+
+def preprocessor(item):
+    return {"messages": [{"role": "user", "content": item["question"]}]}
+
+def inference_function(processed_items, **hyperparameters):
+    return [model.generate(item) for item in processed_items]
+
+def postprocessor(output):
+    return output.strip()
+
+pipeline = InferencePipeline(
+    model="my-model",
+    preprocessor=preprocessor,
+    inference_function=inference_function,
+    postprocessor=postprocessor
+)
+
+results = evaluate(pipeline, dataset)
+```
+
+### 3. Metrics System
+
+#### Built-in Metrics
+- **Accuracy**: Percentage of correct predictions
+- **Precision**: Accuracy of positive predictions
+
+```python
+from scorebook.metrics import Accuracy, Precision
+
+dataset = EvalDataset.from_list(
+    name="test",
+    label="answer",
+    metrics=[Accuracy, Precision],  # Multiple metrics
+    data=data
+)
+```
+
+#### Custom Metrics
+Create custom metrics by extending `MetricBase`:
+
+```python
+from scorebook.metrics import MetricBase, MetricRegistry
+
+@MetricRegistry.register()
+class F1Score(MetricBase):
+    @staticmethod
+    def score(outputs, labels):
+        # Calculate F1 score
+        item_scores = [calculate_f1_item(o, l) for o, l in zip(outputs, labels)]
+        aggregate_score = {"f1": sum(item_scores) / len(item_scores)}
+        return aggregate_score, item_scores
+
+# Use by string name or class
+dataset = EvalDataset.from_list(..., metrics=["f1score"])
+# or
+dataset = EvalDataset.from_list(..., metrics=[F1Score])
+```
+
+### 4. Hyperparameter Sweeping
+
+Test multiple configurations automatically:
+
+```python
+hyperparameters = {
+    "temperature": [0.7, 0.9, 1.0],
+    "max_tokens": [50, 100, 150],
+    "top_p": [0.8, 0.9]
+}
+
+results = evaluate(
+    inference_function,
+    dataset,
+    hyperparameters=hyperparameters,
+    score_type="all"
+)

+# Results include all combinations: 3 Γ— 3 Γ— 2 = 18 configurations
+```
+
+### 5. Results and Export
+
+Control result format with `score_type`:
+
+```python
+# Only aggregate scores (default)
+results = evaluate(model, dataset, score_type="aggregate")
+
+# Only per-item scores
+results = evaluate(model, dataset, score_type="item")
+
+# Both aggregate and per-item
+results = evaluate(model, dataset, score_type="all")
+```
+
+Export results:
+
+```python
+# Get EvalResult objects for advanced usage
+results = evaluate(model, dataset, return_type="object")
+
+# Export to files
+for result in results:
+    result.to_json("results.json")
+    result.to_csv("results.csv")
+```
+
+## πŸ”§ OpenAI Integration
+
+Scorebook includes built-in OpenAI support for both single requests and batch processing:
+
+```python
+from scorebook.inference.openai import responses, batch
+from scorebook.types.inference_pipeline import InferencePipeline
+
+# For single requests
+pipeline = InferencePipeline(
+    model="gpt-4o-mini",
+    preprocessor=format_for_openai,
+    inference_function=responses,
+    postprocessor=extract_response
+)
+
+# For batch processing (more efficient for large datasets)
+batch_pipeline = InferencePipeline(
+    model="gpt-4o-mini",
+    preprocessor=format_for_openai,
+    inference_function=batch,
+    postprocessor=extract_response
+)
+```
+
+## πŸ“‹ Examples
+
+The `examples/` directory contains comprehensive examples:
+
+- **`basic_example.py`**: Local model evaluation with Hugging Face
+- **`openai_responses_api.py`**: OpenAI API integration
+- **`openai_batch_api.py`**: OpenAI Batch API for large-scale evaluation
+- **`hyperparam_sweep.py`**: Hyperparameter optimization
+- **`scorebook_showcase.ipynb`**: Interactive Jupyter notebook tutorial
+
+Run an example:
+
+```bash
+cd examples/
+python basic_example.py --output-dir ./my_results
+```
+
+## πŸ—οΈ Architecture
+
+Scorebook follows a modular architecture:
+
+```
+β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”      β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”       β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”
+β”‚   EvalDataset   β”‚      β”‚  Inference   β”‚       β”‚     Metrics     β”‚
+β”‚                 β”‚      β”‚  Pipeline    β”‚       β”‚                 β”‚
+β”‚ β€’ Data Loading  β”‚      β”‚              β”‚       β”‚ β€’ Accuracy      β”‚
+β”‚ β€’ HF Integrationβ”‚      β”‚ β€’ Preprocess β”‚       β”‚ β€’ Precision     β”‚
+β”‚ β€’ CSV/JSON      β”‚      β”‚ β€’ Inference  β”‚       β”‚ β€’ Custom        β”‚
+β”‚ β€’ Validation    β”‚      β”‚ β€’ Postprocessβ”‚       β”‚ β€’ Registry      β”‚
+β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜      β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜       β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜
+         β”‚                       β”‚                       β”‚
+         β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”Όβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜
+                                 β”‚
+                      β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”
+                      β”‚      evaluate()     β”‚
+                      β”‚                     β”‚
+                      β”‚ β€’ Orchestration     β”‚
+                      β”‚ β€’ Progress Tracking β”‚
+                      β”‚ β€’ Result Formatting β”‚
+                      β”‚ β€’ Export Options    β”‚
+                      β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜
+```
+
+## 🎯 Use Cases
+
+Scorebook is designed for:
+
+- **πŸ† Model Benchmarking**: Compare different models on standard datasets
+- **βš™οΈ Hyperparameter Optimization**: Find optimal model configurations
+- **πŸ“Š Dataset Analysis**: Understand model performance across different data types
+- **πŸ”„ A/B Testing**: Compare model versions or approaches
+- **πŸ”¬ Research Experiments**: Reproducible evaluation workflows
+- **πŸ“ˆ Production Monitoring**: Track model performance over time
+
+## 🀝 Contributing
+
+We welcome contributions! Please see [CONTRIBUTING.md](CONTRIBUTING.md) for guidelines.
+
+## πŸ“„ License
+
+This project is licensed under the MIT License - see the [LICENSE](LICENSE) file for details.
+
+## 🏒 About
+
+Scorebook is developed by [Trismik](https://trismik.com) to speed up your LLM evaluation.
+
+---
+
+*For more examples and detailed documentation, check out the Jupyter notebook in `examples/scorebook_showcase.ipynb`*
+
--- /dev/null
+++ scorebook-0.0.1.dist-info/RECORD
@@ -0,0 +1,24 @@
+scorebook/__init__.py,sha256=cYv8bT3_7o2MTxPVKiv51DcpaPtH_A9qOH5yF_FULZo,336
+scorebook/evaluator.py,sha256=Ce4KerLVPlaF63xng9RKH9M1l-ldo3mdrd3T2dBs_YE,8908
+scorebook/inference/__init__.py,sha256=sU_ZSN9eO7ajZ-QklNpx8_gf3jCdDn69J-SfU0z07-E,333
+scorebook/inference/openai.py,sha256=XD1dbPrEHQJVXOMtqCt9a0yQ-qR381N5XXhCrgz8jio,5826
+scorebook/inference/portkey.py,sha256=OHSS-sa2aLxuO6fEfG8MsPlhXc_95_-6j7ImbCkY8KE,5952
+scorebook/metrics/__init__.py,sha256=be_riJNojebXw2xfkMsHHjl3HFKgk9jQWlLkXJHhheI,782
+scorebook/metrics/accuracy.py,sha256=5KQ4hfOn9M94sB7WsXUelJWJiuKfoCGQEl5q5q9vNfo,1467
+scorebook/metrics/metric_base.py,sha256=I3L0DGcRojFp93UGFnXG1tZ2UK9ilTcXXJG6lj5ddXA,857
+scorebook/metrics/metric_registry.py,sha256=jWwt9P3zvtFLlEYrd60v7LS7X251nZczouE02zcCxWg,3402
+scorebook/metrics/precision.py,sha256=AaYPYYKnY74Nwqp_p3jd2Ewf3VHNOJjoRWf5fhb-tXk,563
+scorebook/types/__init__.py,sha256=xQMOae_fIBbeyeuqoa7SbNwjxAiVinPBbckOcUzo57U,358
+scorebook/types/eval_dataset.py,sha256=TeIeVHQ597NxedxaTEXohZO8gR5iAiDtJbCja_u69EI,11703
+scorebook/types/eval_result.py,sha256=y0vLN6RMgiz1lyai5ltmzDibBHE25-k9bTrQ7U27RZ8,4552
+scorebook/types/inference_pipeline.py,sha256=M3JgchpcVdhRJPzn3mh5ys6iivSt8eBmHIj4F5LcFYU,3167
+scorebook/utils/__init__.py,sha256=DmhS61OZ2nNWkGxDfVrMBwwiH7dmLAbg3MHuNgaXhQg,382
+scorebook/utils/async_utils.py,sha256=OeNvMrOT9P4rIyaCf5IbR3ZIFMtEzXgoAArNbINRtMU,728
+scorebook/utils/io_helpers.py,sha256=ksOJ9ILcZqqt-HwRUYy1NMQbS6RuMh8i2ZzUADLMlQ8,913
+scorebook/utils/mappers.py,sha256=OcUnPBrnSUxZNhAzJhVmVWUWmqIKFXLTrK-xLi6_SUg,1259
+scorebook/utils/progress_bars.py,sha256=BlKqYlXDbik5eUn5nf5f7QnMvnTj8CU_CfXKxCWp3Ww,2909
+scorebook/utils/transform_helpers.py,sha256=UnVLtFvcJrtmBEmLsuA4rrX4iJlNUKxm2DkIOGLl-2o,1030
+scorebook-0.0.1.dist-info/LICENSE,sha256=JLH1g9FhxHZf6CBCeQ_xAisPtICVObuNGW1bLPiTYEs,1068
+scorebook-0.0.1.dist-info/METADATA,sha256=oiwYbuJkRVkoFZkIAQej09LdG5xBLxhKPy2ozWTV-_w,10976
+scorebook-0.0.1.dist-info/WHEEL,sha256=b4K_helf-jlQoXBBETfwnf4B04YC67LOev0jo4fX5m8,88
+scorebook-0.0.1.dist-info/RECORD,,
--- /dev/null
+++ scorebook-0.0.1.dist-info/WHEEL
@@ -0,0 +1,4 @@
+Wheel-Version: 1.0
+Generator: poetry-core 2.1.3
+Root-Is-Purelib: true
+Tag: py3-none-any