scorebook 0.0.13-py3-none-any.whl → 0.0.15-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- scorebook/__init__.py +12 -5
- scorebook/cli/auth.py +1 -1
- scorebook/dashboard/__init__.py +1 -0
- scorebook/dashboard/create_project.py +91 -0
- scorebook/{trismik → dashboard}/credentials.py +57 -12
- scorebook/{trismik → dashboard}/upload_results.py +1 -1
- scorebook/eval_datasets/__init__.py +0 -4
- scorebook/eval_datasets/eval_dataset.py +4 -2
- scorebook/evaluate/__init__.py +1 -15
- scorebook/evaluate/_async/evaluate_async.py +36 -19
- scorebook/evaluate/_sync/evaluate.py +36 -19
- scorebook/evaluate/evaluate_helpers.py +4 -3
- scorebook/inference/__init__.py +1 -11
- scorebook/inference/clients/__init__.py +1 -8
- scorebook/inference/inference_pipeline.py +1 -1
- scorebook/metrics/README.md +121 -0
- scorebook/metrics/__init__.py +7 -16
- scorebook/metrics/accuracy.py +2 -6
- scorebook/metrics/bertscore.py +50 -0
- scorebook/metrics/bleu.py +82 -0
- scorebook/metrics/core/__init__.py +1 -0
- scorebook/metrics/{metric_base.py → core/metric_base.py} +1 -2
- scorebook/metrics/core/metric_registry.py +195 -0
- scorebook/metrics/exactmatch.py +95 -0
- scorebook/metrics/f1.py +96 -0
- scorebook/metrics/precision.py +84 -9
- scorebook/metrics/recall.py +94 -0
- scorebook/metrics/rouge.py +85 -0
- scorebook/score/__init__.py +0 -5
- scorebook/score/_async/score_async.py +3 -2
- scorebook/score/_sync/score.py +3 -2
- scorebook/score/score_helpers.py +29 -12
- scorebook/types.py +3 -3
- scorebook/utils/__init__.py +0 -22
- scorebook/utils/common_helpers.py +1 -1
- scorebook/utils/mock_llm/__init__.py +41 -0
- scorebook/utils/mock_llm/data/mock_llm_data.json +21970 -0
- scorebook/utils/progress_bars.py +58 -786
- scorebook-0.0.15.dist-info/METADATA +300 -0
- scorebook-0.0.15.dist-info/RECORD +110 -0
- {scorebook-0.0.13.dist-info → scorebook-0.0.15.dist-info}/WHEEL +1 -1
- tutorials/README.md +147 -0
- tutorials/__init__.py +5 -0
- tutorials/examples/1-score/1-scoring_model_accuracy.py +47 -0
- tutorials/examples/1-score/2-scoring_model_bleu.py +46 -0
- tutorials/examples/1-score/3-scoring_model_f1.py +64 -0
- tutorials/examples/1-score/4-scoring_model_rouge.py +64 -0
- tutorials/examples/1-score/5-scoring_model_exact_match.py +84 -0
- tutorials/examples/1-score/6-scoring_with_bertscore.py +57 -0
- tutorials/examples/1-score/__init__.py +0 -0
- tutorials/examples/2-evaluate/1-evaluating_local_models.py +106 -0
- tutorials/examples/2-evaluate/2-evaluating_local_models_with_batching.py +108 -0
- tutorials/examples/2-evaluate/3-evaluating_cloud_models.py +109 -0
- tutorials/examples/2-evaluate/4-evaluating_cloud_models_with_batching.py +170 -0
- tutorials/examples/2-evaluate/5-hyperparameter_sweeps.py +122 -0
- tutorials/examples/2-evaluate/6-inference_pipelines.py +141 -0
- tutorials/examples/3-evaluation_datasets/1-evaluation_datasets_from_files.py +110 -0
- tutorials/examples/3-evaluation_datasets/2-evaluation_datasets_from_huggingface.py +101 -0
- tutorials/examples/3-evaluation_datasets/3-evaluation_datasets_from_huggingface_with_yaml_configs.py +110 -0
- tutorials/examples/3-evaluation_datasets/example_datasets/basic_questions.csv +11 -0
- tutorials/examples/3-evaluation_datasets/example_datasets/basic_questions.json +42 -0
- tutorials/examples/3-evaluation_datasets/example_yaml_configs/Cais-MMLU.yaml +19 -0
- tutorials/examples/3-evaluation_datasets/example_yaml_configs/TIGER-Lab-MMLU-Pro.yaml +18 -0
- tutorials/examples/4-adaptive_evaluations/1-adaptive_evaluation.py +114 -0
- tutorials/examples/4-adaptive_evaluations/2-adaptive_dataset_splits.py +106 -0
- tutorials/examples/5-upload_results/1-uploading_score_results.py +92 -0
- tutorials/examples/5-upload_results/2-uploading_evaluate_results.py +117 -0
- tutorials/examples/5-upload_results/3-uploading_your_results.py +153 -0
- tutorials/examples/6-providers/aws/__init__.py +1 -0
- tutorials/examples/6-providers/aws/batch_example.py +219 -0
- tutorials/examples/6-providers/portkey/__init__.py +1 -0
- tutorials/examples/6-providers/portkey/batch_example.py +120 -0
- tutorials/examples/6-providers/portkey/messages_example.py +121 -0
- tutorials/examples/6-providers/vertex/__init__.py +1 -0
- tutorials/examples/6-providers/vertex/batch_example.py +166 -0
- tutorials/examples/6-providers/vertex/messages_example.py +142 -0
- tutorials/examples/__init__.py +0 -0
- tutorials/notebooks/1-scoring.ipynb +162 -0
- tutorials/notebooks/2-evaluating.ipynb +316 -0
- tutorials/notebooks/3.1-adaptive_evaluation_phi.ipynb +354 -0
- tutorials/notebooks/3.2-adaptive_evaluation_gpt.ipynb +243 -0
- tutorials/notebooks/4-uploading_results.ipynb +175 -0
- tutorials/quickstarts/adaptive_evaluations/adaptive_evaluation_openai_demo.ipynb +229 -0
- tutorials/quickstarts/adaptive_evaluations/adaptive_evaluation_qwen_demo.ipynb +256 -0
- tutorials/quickstarts/classical_evaluations/classical_evaluation_demo.ipynb +277 -0
- tutorials/quickstarts/getting_started.ipynb +197 -0
- tutorials/utils/__init__.py +35 -0
- tutorials/utils/args_parser.py +132 -0
- tutorials/utils/output.py +23 -0
- tutorials/utils/setup.py +98 -0
- scorebook/metrics/metric_registry.py +0 -105
- scorebook/trismik/__init__.py +0 -10
- scorebook-0.0.13.dist-info/METADATA +0 -389
- scorebook-0.0.13.dist-info/RECORD +0 -50
- {scorebook-0.0.13.dist-info → scorebook-0.0.15.dist-info}/entry_points.txt +0 -0
- {scorebook-0.0.13.dist-info → scorebook-0.0.15.dist-info}/licenses/LICENSE +0 -0
tutorials/quickstarts/getting_started.ipynb
ADDED
@@ -0,0 +1,197 @@
+{
+"cells": [
+{
+"cell_type": "markdown",
+"id": "bda7b5add96e4d97",
+"metadata": {},
+"source": [
+"# Getting Started with Trismik's Adaptive Testing\n",
+"\n",
+"This notebook demonstrates how to run Trismik's adaptive evaluations using Scorebook.\n",
+"\n",
+"## What is Adaptive Testing?\n",
+"\n",
+"Trismik’s adaptive testing service leverages item response theory (IRT), a psychometric framework, to evaluate large language models. Using computerized adaptive testing (CAT) it dynamically selects the most informative items, enabling faster, more cost-efficient model evaluations with fewer items required.\n",
+"\n",
+"## Setup\n",
+"\n",
+"### Install Scorebook\n"
+]
+},
+{
+"cell_type": "code",
+"id": "286eca2349c6ddc6",
+"metadata": {},
+"source": "!pip install scorebook",
+"outputs": [],
+"execution_count": null
+},
+{
+"cell_type": "markdown",
+"id": "d2fa56528e14c46d",
+"metadata": {},
+"source": [
+"\n",
+"\n",
+"### Generate a Trismik API Key\n",
+"\n",
+"To run an adaptive evaluation, a Trismik API key is required. You can [sign up](https://dashboard.trismik.com/signup) for a free Trismik account and generate an API key.\n",
+"\n",
+"**How to generate an API key from the Trismik dashboard**:\n",
+"1. click on your initials in the top-right corner of the screen.\n",
+"2. click on \"API Keys\" in the drop-down menu.\n",
+"3. click \"Create API Key\" to create a new API key.\n",
+"\n",
+"### Set Trismik API Key"
+]
+},
+{
+"cell_type": "code",
+"id": "5ed8a62ac56560e9",
+"metadata": {},
+"source": [
+"# Set your API key here and run this cell to login\n",
+"TRISMIK_API_KEY = \"your-trismik-api-key-here\""
+],
+"outputs": [],
+"execution_count": null
+},
+{
+"cell_type": "markdown",
+"id": "10d842e5b99e95bd",
+"metadata": {},
+"source": [
+"### Login with Trismik API Key"
+]
+},
+{
+"cell_type": "code",
+"id": "initial_id",
+"metadata": {},
+"source": [
+"import scorebook\n",
+"\n",
+"scorebook.login(TRISMIK_API_KEY)\n",
+"print(\"✓ Logged in to Trismik\")"
+],
+"outputs": [],
+"execution_count": null
+},
+{
+"cell_type": "markdown",
+"id": "eedbc41adba92248",
+"metadata": {},
+"source": [
+"### Create a Project\n",
+"\n",
+"When running an adaptive evaluation, your evaluation results are stored under a project on the Trismik dashboard. Projects can be created from the dashboard's interface, or programmatically via Scorebook."
+]
+},
+{
+"cell_type": "code",
+"id": "bd7271800bf91478",
+"metadata": {},
+"source": [
+"from scorebook import create_project\n",
+"\n",
+"# Create a project\n",
+"project = create_project(\n",
+" name = \"Quick-Start Guides\",\n",
+" description = \"A project created for Trismik's quick-start guides.\"\n",
+")\n",
+"\n",
+"print(\"✓ Project created\")\n",
+"print(f\"Project ID: {project.id}\")"
+],
+"outputs": [],
+"execution_count": null
+},
+{
+"cell_type": "markdown",
+"id": "e30ea2a2674e6005",
+"metadata": {},
+"source": [
+"## Run an Adaptive Evaluation\n",
+"\n",
+"For this quick-start guide, we will use a mock model, that replicates the responses generated by an Amazon LLM (Nova-Pro). `mock_llm` is an inference function which accepts a list of model inputs and returns a list of model outputs for scoring. In this example, we use model responses that we pre-computed to showcase how adaptive evaluations work."
+]
+},
+{
+"cell_type": "code",
+"id": "d9f96d063e08c4fd",
+"metadata": {},
+"source": [
+"from scorebook.utils.mock_llm import mock_llm\n",
+"\n",
+"# Run adaptive evaluation\n",
+"results = scorebook.evaluate(\n",
+" inference = mock_llm,\n",
+" datasets = \"trismik/MMLUPro:adaptive\",\n",
+" experiment_id = \"Getting-Started\",\n",
+" project_id = project.id,\n",
+")\n",
+"\n",
+"# Print the adaptive evaluation results\n",
+"print(\"✓ Adaptive evaluation complete!\")\n",
+"print(\"Results: \", results[0][\"score\"])\n",
+"\n",
+"print(f\"You can view your results here: https://dashboard.trismik.com/projects/{project.id}\")"
+],
+"outputs": [],
+"execution_count": null
+},
+{
+"cell_type": "markdown",
+"id": "63d3f34a76e90ac0",
+"metadata": {},
+"source": [
+"### Adaptive Evaluation Results\n",
+"\n",
+"The metrics generated by an adaptive evaluation are:\n",
+"\n",
+"- Theta (θ): The primary score measuring model ability on the dataset, a higher value represents better performance.\n",
+"- Standard Error: The theta score is a proxy for the underlying metric, and the standard error is the uncertainty in the theta estimate.\n",
+"\n",
+"You can find more information about adaptive testing [here](https://docs.trismik.com/adaptiveTesting/adaptive-testing-introduction/)!\n",
+"\n",
+"---\n",
+"\n",
+"## Next Steps\n",
+"\n",
+"**More Quick-Start Guides**:\n",
+"\n",
+"1. [HuggingFace Adaptive Evaluation](https://colab.research.google.com/github/trismik/scorebook/blob/main/tutorials/quickstarts/adaptive_evaluations/adaptive_evaluation_qwen_demo.ipynb): For a demo showcasing how Scorebook's adaptive evaluations can be used for local HF-based models. \n",
+"2. [API-based Adaptive Evaluation](https://colab.research.google.com/github/trismik/scorebook/blob/main/tutorials/quickstarts/adaptive_evaluations/adaptive_evaluation_openai_demo.ipynb): For a demo showcasing how Scorebook's adaptive evaluations can be used for API-based models, using the OpenAI API.\n",
+"3. [HuggingFace Full Dataset Evaluation](https://colab.research.google.com/github/trismik/scorebook/blob/main/tutorials/quickstarts/classical_evaluations/classical_evaluation_demo.ipynb): For a demo showcasing how Scorebook can be used for full dataset evaluations with familiar metrics, like `Accuracy`.\n",
+"\n",
+"**More details on Adaptive Testing and Scorebook**:\n",
+"\n",
+"- [Introduction to Adaptive Testing](https://docs.trismik.com/adaptiveTesting/adaptive-testing-introduction/): a quick introduction to Adaptive Testing.\n",
+"- [Dataset Page](https://dashboard.trismik.com/datasets): Trismik's full set of currently adaptive datasets from the Trismik dashboard.\n",
+"- [Scorebook Docs](https://docs.trismik.com/scorebook/introduction-to-scorebook/): Scorebook's full documentation.\n",
+"- [Scorebook Repository](https://github.com/trismik/scorebook): Scorebook is an open-source library, view the code and more examples."
+]
+}
+],
+"metadata": {
+"kernelspec": {
+"display_name": "Python 3 (ipykernel)",
+"language": "python",
+"name": "python3"
+},
+"language_info": {
+"codemirror_mode": {
+"name": "ipython",
+"version": 3
+},
+"file_extension": ".py",
+"mimetype": "text/x-python",
+"name": "python",
+"nbconvert_exporter": "python",
+"pygments_lexer": "ipython3",
+"version": "3.13.7"
+}
+},
+"nbformat": 4,
+"nbformat_minor": 5
+}
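Taken together, the notebook cells above form a short end-to-end flow: install, log in, create a project, then run an adaptive evaluation against a mock model. The sketch below consolidates those cells into a plain script. It only uses calls that appear in this diff (`scorebook.login`, `create_project`, `scorebook.evaluate`, and the bundled `mock_llm`); the argument names and the `trismik/MMLUPro:adaptive` dataset identifier are copied from the notebook rather than verified against the released API, so treat it as illustrative.

```python
# Illustrative consolidation of getting_started.ipynb; assumes the scorebook 0.0.15
# API surface shown in this diff (login, create_project, evaluate, mock_llm).
import scorebook
from scorebook import create_project
from scorebook.utils.mock_llm import mock_llm

scorebook.login("your-trismik-api-key-here")  # placeholder key

# Results are stored under a project on the Trismik dashboard.
project = create_project(
    name="Quick-Start Guides",
    description="A project created for Trismik's quick-start guides.",
)

# Adaptive run against the pre-computed Nova-Pro responses bundled with the wheel.
results = scorebook.evaluate(
    inference=mock_llm,
    datasets="trismik/MMLUPro:adaptive",
    experiment_id="Getting-Started",
    project_id=project.id,
)

# The notebook reports theta and its standard error from results[0]["score"].
print("Results:", results[0]["score"])
print(f"View results: https://dashboard.trismik.com/projects/{project.id}")
```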
tutorials/utils/__init__.py
ADDED
@@ -0,0 +1,35 @@
+"""
+Helper utilities for Scorebook examples.
+
+This module provides common helper functions used across multiple Scorebook examples
+for setup, output handling, and argument parsing.
+"""
+
+# Argument parsing utilities
+from .args_parser import (
+    add_model_selection_arg,
+    create_parser,
+    parse_args_with_config,
+    setup_batch_model_parser,
+    setup_openai_model_parser,
+)
+
+# Output utilities
+from .output import save_results_to_json
+
+# Setup utilities
+from .setup import setup_logging, setup_output_directory
+
+__all__ = [
+    # Setup
+    "setup_logging",
+    "setup_output_directory",
+    # Output
+    "save_results_to_json",
+    # Argument parsing
+    "create_parser",
+    "add_model_selection_arg",
+    "setup_openai_model_parser",
+    "setup_batch_model_parser",
+    "parse_args_with_config",
+]
tutorials/utils/args_parser.py
ADDED
@@ -0,0 +1,132 @@
+"""
+Generic argument parsing utilities for Scorebook examples.
+
+This module provides reusable argument parsing functions that can be used
+across multiple Scorebook examples for consistent command-line interfaces.
+"""
+
+import argparse
+from typing import Any, Dict, List, Optional
+
+
+def create_parser(description: str) -> argparse.ArgumentParser:
+    """Create a basic argument parser with a description.
+
+    Args:
+        description: Description for the argument parser
+
+    Returns:
+        Configured ArgumentParser instance
+    """
+    return argparse.ArgumentParser(description=description)
+
+
+def add_model_selection_arg(
+    parser: argparse.ArgumentParser,
+    default: str = "gpt-4o-mini",
+    help_text: Optional[str] = None,
+    supported_models: Optional[List[str]] = None,
+) -> argparse.ArgumentParser:
+    """Add model selection argument to parser.
+
+    Args:
+        parser: ArgumentParser to add the argument to
+        default: Default model name
+        help_text: Custom help text for the argument
+        supported_models: List of supported models for validation
+
+    Returns:
+        The modified parser
+    """
+    if help_text is None:
+        help_text = f"OpenAI model to use for inference (default: {default})"
+        if supported_models:
+            help_text += f". Supported models: {', '.join(supported_models)}"
+
+    parser.add_argument(
+        "--model",
+        type=str,
+        default=default,
+        help=help_text,
+    )
+    return parser
+
+
+def setup_openai_model_parser(
+    description: str = "Select OpenAI model for evaluation.",
+    default: str = "gpt-4o-mini",
+    supported_models: Optional[List[str]] = None,
+) -> str:
+    """Set up and parse OpenAI model selection arguments.
+
+    Args:
+        description: Description for the argument parser
+        default: Default model name
+        supported_models: List of supported models for help text
+
+    Returns:
+        Selected model name
+    """
+    parser = create_parser(description)
+    add_model_selection_arg(parser, default=default, supported_models=supported_models)
+    args = parser.parse_args()
+    return str(args.model)
+
+
+def setup_batch_model_parser(
+    description: str = "Select OpenAI model for batch evaluation.", default: str = "gpt-4o-mini"
+) -> str:
+    """Set up and parse OpenAI model selection arguments for batch inference.
+
+    Args:
+        description: Description for the argument parser
+        default: Default model name
+
+    Returns:
+        Selected model name
+    """
+    supported_models = ["gpt-4o", "gpt-4o-mini", "gpt-4-turbo", "gpt-3.5-turbo"]
+    help_text = (
+        f"OpenAI model to use for batch inference. "
+        f"Note: Only select models support the Batch API. "
+        f"Supported models include: {', '.join(supported_models)}. "
+        f"Default: {default}"
+    )
+
+    parser = create_parser(description)
+    add_model_selection_arg(parser, default=default, help_text=help_text)
+    args = parser.parse_args()
+    return str(args.model)
+
+
+def parse_args_with_config(config: Dict[str, Dict[str, Any]]) -> Dict[str, Any]:
+    """Parse arguments using a configuration dictionary.
+
+    Args:
+        config: Dictionary defining arguments to add. Format:
+            {
+                "arg_name": {
+                    "type": str,
+                    "default": "default_value",
+                    "help": "Help text",
+                    "required": False # optional
+                }
+            }
+
+    Returns:
+        Dictionary of parsed argument values
+    """
+    parser = argparse.ArgumentParser()
+
+    for arg_name, arg_config in config.items():
+        kwargs = {"type": arg_config.get("type", str), "help": arg_config.get("help", "")}
+
+        if "default" in arg_config:
+            kwargs["default"] = arg_config["default"]
+        if "required" in arg_config:
+            kwargs["required"] = arg_config["required"]
+
+        parser.add_argument(f"--{arg_name.replace('_', '-')}", **kwargs)
+
+    args = parser.parse_args()
+    return vars(args)
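To show how a tutorial script consumes the config-driven parser above, here is a small usage sketch; the flag names and defaults in the config dict are hypothetical, chosen only to illustrate the format documented in the docstring.

```python
# Hypothetical caller of tutorials/utils/args_parser.parse_args_with_config.
# Each config key becomes a --flag (underscores are converted to dashes).
from tutorials.utils.args_parser import parse_args_with_config

args = parse_args_with_config(
    {
        "output_dir": {"type": str, "default": "results", "help": "Where to write results"},
        "num_items": {"type": int, "default": 10, "help": "How many items to evaluate"},
    }
)
# With no command-line flags this prints {'output_dir': 'results', 'num_items': 10}.
print(args)
```

Scripts that only need a model name can instead call `setup_openai_model_parser(...)` or `setup_batch_model_parser(...)`, both of which return the selected model as a string.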
tutorials/utils/output.py
ADDED
@@ -0,0 +1,23 @@
+"""
+Utility functions for saving Scorebook evaluation results.
+
+This module provides common helper functions used across multiple Scorebook examples
+for saving evaluation results to files.
+"""
+
+import json
+from pathlib import Path
+from typing import Any
+
+
+def save_results_to_json(results: Any, output_dir: Path, filename: str) -> None:
+    """Save evaluation results to a JSON file.
+
+    Args:
+        results: The evaluation results to save
+        output_dir: Directory to save the file in
+        filename: Name of the output file (should include .json extension)
+    """
+    output_path = output_dir / filename
+    with open(output_path, "w") as output_file:
+        json.dump(results, output_file, indent=4)
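A quick illustration of the helper above, with a hypothetical output directory and result payload:

```python
# Hypothetical example; save_results_to_json expects the directory to already exist.
from pathlib import Path

from tutorials.utils.output import save_results_to_json

out_dir = Path("example_results")
out_dir.mkdir(parents=True, exist_ok=True)
save_results_to_json({"accuracy": 0.82}, out_dir, "accuracy_results.json")
```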
tutorials/utils/setup.py
ADDED
@@ -0,0 +1,98 @@
+"""
+Utility functions for setting up Scorebook examples.
+
+This module provides common helper functions used across multiple Scorebook examples
+for output directory setup and logging configuration.
+"""
+
+import logging
+from datetime import datetime
+from pathlib import Path
+from typing import Optional
+
+
+def setup_output_directory() -> Path:
+    """Parse command line arguments and setup output directory."""
+    import argparse
+
+    parser = argparse.ArgumentParser(description="Run evaluation and save results.")
+    parser.add_argument(
+        "--output-dir",
+        type=str,
+        default=str(Path.cwd() / "examples/example_results"),
+        help=(
+            "Directory to save evaluation outputs (CSV and JSON). "
+            "Defaults to ./examples/example_results in the current working directory."
+        ),
+    )
+    args = parser.parse_args()
+    output_dir = Path(args.output_dir)
+    output_dir.mkdir(parents=True, exist_ok=True)
+    print(f"Saving results to {output_dir}")
+    return output_dir
+
+
+def setup_logging(
+    log_dir: str = "logs",
+    experiment_id: Optional[str] = None,
+    base_dir: Optional[Path] = None,
+) -> Path:
+    """Configure logging for evaluation runs.
+
+    Args:
+        log_dir: Name of the log directory (default: "logs")
+        experiment_id: Optional identifier for the experiment
+        base_dir: Base directory where log_dir should be created.
+            If None, uses current working directory.
+    """
+    if base_dir is None:
+        base_dir = Path.cwd()
+
+    log_dir_path: Path = base_dir / log_dir
+    log_dir_path.mkdir(exist_ok=True, parents=True)
+
+    timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
+    if experiment_id:
+        log_file = log_dir_path / f"evaluation_{experiment_id}_{timestamp}.log"
+    else:
+        log_file = log_dir_path / f"evaluation_{timestamp}.log"
+
+    # Create file handler for all logs (same as before)
+    file_handler = logging.FileHandler(log_file)
+    file_handler.setLevel(logging.DEBUG)
+    file_handler.setFormatter(
+        logging.Formatter("%(asctime)s - %(levelname)s - %(name)s - %(message)s")
+    )
+
+    # Create console handler for warnings and errors only
+    console_handler = logging.StreamHandler()
+    console_handler.setLevel(logging.WARNING)
+    console_handler.setFormatter(logging.Formatter("%(levelname)s - %(name)s - %(message)s"))
+
+    # Configure root logger with both handlers
+    logging.basicConfig(
+        level=logging.INFO,
+        handlers=[file_handler, console_handler],
+        force=True,
+    )
+
+    # Set scorebook loggers to DEBUG level to capture all scorebook logs
+    scorebook_logger = logging.getLogger("scorebook")
+    scorebook_logger.setLevel(logging.DEBUG)
+
+    # Ensure trismik_services logs are captured at DEBUG level
+    trismik_services_logger = logging.getLogger("scorebook.trismik_services")
+    trismik_services_logger.setLevel(logging.DEBUG)
+
+    # Ensure evaluate logs are captured at DEBUG level
+    evaluate_logger = logging.getLogger("scorebook.evaluate._sync.evaluate")
+    evaluate_logger.setLevel(logging.DEBUG)
+    evaluate_logger = logging.getLogger("scorebook.evaluate._async.evaluate_async")
+    evaluate_logger.setLevel(logging.DEBUG)
+
+    # Exclude OpenAI inference logs to reduce noise
+    openai_logger = logging.getLogger("scorebook.inference.openai")
+    openai_logger.setLevel(logging.WARNING)  # Only log warnings and errors
+
+    print(f"Logging to {log_file}")
+    return log_file
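A sketch of how a tutorial entry point might wire these two helpers together; the experiment id is a made-up placeholder. Note that `setup_output_directory()` parses `--output-dir` from the command line itself.

```python
# Hypothetical tutorial entry point using the setup helpers added above.
import logging

from tutorials.utils.setup import setup_logging, setup_output_directory

output_dir = setup_output_directory()               # reads --output-dir, creates the directory
log_file = setup_logging(experiment_id="demo-run")  # DEBUG to file, WARNING+ to console

# INFO-level scorebook messages land in the log file but stay off the console.
logging.getLogger("scorebook").info("Logging to %s, saving results to %s", log_file, output_dir)
```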
scorebook/metrics/metric_registry.py
DELETED
@@ -1,105 +0,0 @@
-"""
-Registry module for evaluation metrics.
-
-This module maintains a centralized registry of available evaluation metrics
-that can be used to assess model performance. It provides a single access point
-to retrieve all implemented metric classes.
-"""
-
-from typing import Any, Callable, Dict, List, Type, Union
-
-from scorebook.metrics.metric_base import MetricBase
-
-
-class MetricRegistry:
-    """A registry for evaluation metrics.
-
-    This class provides a central registry for all evaluation metrics in the system.
-    It allows metrics to be registered with unique names and retrieved either by
-    name or by class. The registry ensures that metrics are properly initialized
-    and accessible throughout the application.
-
-    The registry supports:
-    - Registering new metric classes with optional custom names
-    - Retrieving metric instances by name or class
-    - Listing all available metrics
-
-    Usage:
-        @MetricRegistry.register("custom_name")
-        class MyMetric(MetricBase):
-            ...
-
-        # Get by name
-        metric = MetricRegistry.get("custom_name")
-
-        # Get by class
-        metric = MetricRegistry.get(MyMetric)
-
-        # List available metrics
-        metrics = MetricRegistry.list_metrics()
-    """
-
-    _registry: Dict[str, Type[MetricBase]] = {}
-
-    @classmethod
-    def register(cls) -> Callable[[Type[MetricBase]], Type[MetricBase]]:
-        """
-        Register a metric class in the registry.
-
-        Returns:
-            A decorator that registers the class and returns it.
-
-        Raises:
-            ValueError: If a metric with the given name is already registered.
-        """
-
-        def decorator(metric_cls: Type[MetricBase]) -> Type[MetricBase]:
-
-            key = metric_cls.__name__.lower()
-            if key in cls._registry:
-                raise ValueError(f"Metric '{key}' is already registered")
-            cls._registry[key] = metric_cls
-            return metric_cls
-
-        return decorator
-
-    @classmethod
-    def get(cls, name_or_class: Union[str, Type[MetricBase]], **kwargs: Any) -> MetricBase:
-        """
-        Get an instance of a registered metric by name or class.
-
-        Args:
-            name_or_class: The metric name (string) or class (subclass of BaseMetric).
-            **kwargs: Additional arguments to pass to the metric's constructor.
-
-        Returns:
-            An instance of the requested metric.
-
-        Raises:
-            ValueError: If the metric name is not registered.
-        """
-        # If input is a class that's a subclass of BaseMetric, instantiate it directly
-        if isinstance(name_or_class, type) and issubclass(name_or_class, MetricBase):
-            return name_or_class(**kwargs)
-
-        # If input is a string, look up the class in the registry
-        if isinstance(name_or_class, str):
-            key = name_or_class.lower()
-            if key not in cls._registry:
-                raise ValueError(f"Metric '{name_or_class}' not registered.")
-            return cls._registry[key](**kwargs)
-
-        raise ValueError(
-            f"Invalid metric type: {type(name_or_class)}."
-            f"Must be string name or BaseMetric subclass"
-        )
-
-    @classmethod
-    def list_metrics(cls) -> List[str]:
-        """
-        List all registered metrics.
-
-        Returns:
-            A list of metric names.
-        """
-        return list(cls._registry.keys())
scorebook/trismik/__init__.py
DELETED
@@ -1,10 +0,0 @@
-"""Trismik authentication and API integration.
-
-Note: Trismik evaluation functionality has been moved to scorebook.evaluate module.
-This module now only provides authentication functions.
-"""
-
-# Import shared credential functions
-from .credentials import get_stored_token, get_token, login, logout, whoami
-
-__all__ = ["login", "logout", "whoami", "get_stored_token", "get_token"]
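Per the file list at the top, `credentials.py` moved from `scorebook/trismik/` to `scorebook/dashboard/`, and the getting-started notebook in this diff calls `scorebook.login(...)` directly. A hedged before/after sketch of the import change (top-level exports other than `login` and `create_project` are not confirmed by this diff):

```python
# scorebook 0.0.13 (this module is deleted in 0.0.15):
# from scorebook.trismik import login, logout, whoami

# scorebook 0.0.15, as exercised by tutorials/quickstarts/getting_started.ipynb:
import scorebook

scorebook.login("your-trismik-api-key-here")  # placeholder key
```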