scorebook 0.0.13-py3-none-any.whl → 0.0.15-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (96)
  1. scorebook/__init__.py +12 -5
  2. scorebook/cli/auth.py +1 -1
  3. scorebook/dashboard/__init__.py +1 -0
  4. scorebook/dashboard/create_project.py +91 -0
  5. scorebook/{trismik → dashboard}/credentials.py +57 -12
  6. scorebook/{trismik → dashboard}/upload_results.py +1 -1
  7. scorebook/eval_datasets/__init__.py +0 -4
  8. scorebook/eval_datasets/eval_dataset.py +4 -2
  9. scorebook/evaluate/__init__.py +1 -15
  10. scorebook/evaluate/_async/evaluate_async.py +36 -19
  11. scorebook/evaluate/_sync/evaluate.py +36 -19
  12. scorebook/evaluate/evaluate_helpers.py +4 -3
  13. scorebook/inference/__init__.py +1 -11
  14. scorebook/inference/clients/__init__.py +1 -8
  15. scorebook/inference/inference_pipeline.py +1 -1
  16. scorebook/metrics/README.md +121 -0
  17. scorebook/metrics/__init__.py +7 -16
  18. scorebook/metrics/accuracy.py +2 -6
  19. scorebook/metrics/bertscore.py +50 -0
  20. scorebook/metrics/bleu.py +82 -0
  21. scorebook/metrics/core/__init__.py +1 -0
  22. scorebook/metrics/{metric_base.py → core/metric_base.py} +1 -2
  23. scorebook/metrics/core/metric_registry.py +195 -0
  24. scorebook/metrics/exactmatch.py +95 -0
  25. scorebook/metrics/f1.py +96 -0
  26. scorebook/metrics/precision.py +84 -9
  27. scorebook/metrics/recall.py +94 -0
  28. scorebook/metrics/rouge.py +85 -0
  29. scorebook/score/__init__.py +0 -5
  30. scorebook/score/_async/score_async.py +3 -2
  31. scorebook/score/_sync/score.py +3 -2
  32. scorebook/score/score_helpers.py +29 -12
  33. scorebook/types.py +3 -3
  34. scorebook/utils/__init__.py +0 -22
  35. scorebook/utils/common_helpers.py +1 -1
  36. scorebook/utils/mock_llm/__init__.py +41 -0
  37. scorebook/utils/mock_llm/data/mock_llm_data.json +21970 -0
  38. scorebook/utils/progress_bars.py +58 -786
  39. scorebook-0.0.15.dist-info/METADATA +300 -0
  40. scorebook-0.0.15.dist-info/RECORD +110 -0
  41. {scorebook-0.0.13.dist-info → scorebook-0.0.15.dist-info}/WHEEL +1 -1
  42. tutorials/README.md +147 -0
  43. tutorials/__init__.py +5 -0
  44. tutorials/examples/1-score/1-scoring_model_accuracy.py +47 -0
  45. tutorials/examples/1-score/2-scoring_model_bleu.py +46 -0
  46. tutorials/examples/1-score/3-scoring_model_f1.py +64 -0
  47. tutorials/examples/1-score/4-scoring_model_rouge.py +64 -0
  48. tutorials/examples/1-score/5-scoring_model_exact_match.py +84 -0
  49. tutorials/examples/1-score/6-scoring_with_bertscore.py +57 -0
  50. tutorials/examples/1-score/__init__.py +0 -0
  51. tutorials/examples/2-evaluate/1-evaluating_local_models.py +106 -0
  52. tutorials/examples/2-evaluate/2-evaluating_local_models_with_batching.py +108 -0
  53. tutorials/examples/2-evaluate/3-evaluating_cloud_models.py +109 -0
  54. tutorials/examples/2-evaluate/4-evaluating_cloud_models_with_batching.py +170 -0
  55. tutorials/examples/2-evaluate/5-hyperparameter_sweeps.py +122 -0
  56. tutorials/examples/2-evaluate/6-inference_pipelines.py +141 -0
  57. tutorials/examples/3-evaluation_datasets/1-evaluation_datasets_from_files.py +110 -0
  58. tutorials/examples/3-evaluation_datasets/2-evaluation_datasets_from_huggingface.py +101 -0
  59. tutorials/examples/3-evaluation_datasets/3-evaluation_datasets_from_huggingface_with_yaml_configs.py +110 -0
  60. tutorials/examples/3-evaluation_datasets/example_datasets/basic_questions.csv +11 -0
  61. tutorials/examples/3-evaluation_datasets/example_datasets/basic_questions.json +42 -0
  62. tutorials/examples/3-evaluation_datasets/example_yaml_configs/Cais-MMLU.yaml +19 -0
  63. tutorials/examples/3-evaluation_datasets/example_yaml_configs/TIGER-Lab-MMLU-Pro.yaml +18 -0
  64. tutorials/examples/4-adaptive_evaluations/1-adaptive_evaluation.py +114 -0
  65. tutorials/examples/4-adaptive_evaluations/2-adaptive_dataset_splits.py +106 -0
  66. tutorials/examples/5-upload_results/1-uploading_score_results.py +92 -0
  67. tutorials/examples/5-upload_results/2-uploading_evaluate_results.py +117 -0
  68. tutorials/examples/5-upload_results/3-uploading_your_results.py +153 -0
  69. tutorials/examples/6-providers/aws/__init__.py +1 -0
  70. tutorials/examples/6-providers/aws/batch_example.py +219 -0
  71. tutorials/examples/6-providers/portkey/__init__.py +1 -0
  72. tutorials/examples/6-providers/portkey/batch_example.py +120 -0
  73. tutorials/examples/6-providers/portkey/messages_example.py +121 -0
  74. tutorials/examples/6-providers/vertex/__init__.py +1 -0
  75. tutorials/examples/6-providers/vertex/batch_example.py +166 -0
  76. tutorials/examples/6-providers/vertex/messages_example.py +142 -0
  77. tutorials/examples/__init__.py +0 -0
  78. tutorials/notebooks/1-scoring.ipynb +162 -0
  79. tutorials/notebooks/2-evaluating.ipynb +316 -0
  80. tutorials/notebooks/3.1-adaptive_evaluation_phi.ipynb +354 -0
  81. tutorials/notebooks/3.2-adaptive_evaluation_gpt.ipynb +243 -0
  82. tutorials/notebooks/4-uploading_results.ipynb +175 -0
  83. tutorials/quickstarts/adaptive_evaluations/adaptive_evaluation_openai_demo.ipynb +229 -0
  84. tutorials/quickstarts/adaptive_evaluations/adaptive_evaluation_qwen_demo.ipynb +256 -0
  85. tutorials/quickstarts/classical_evaluations/classical_evaluation_demo.ipynb +277 -0
  86. tutorials/quickstarts/getting_started.ipynb +197 -0
  87. tutorials/utils/__init__.py +35 -0
  88. tutorials/utils/args_parser.py +132 -0
  89. tutorials/utils/output.py +23 -0
  90. tutorials/utils/setup.py +98 -0
  91. scorebook/metrics/metric_registry.py +0 -105
  92. scorebook/trismik/__init__.py +0 -10
  93. scorebook-0.0.13.dist-info/METADATA +0 -389
  94. scorebook-0.0.13.dist-info/RECORD +0 -50
  95. {scorebook-0.0.13.dist-info → scorebook-0.0.15.dist-info}/entry_points.txt +0 -0
  96. {scorebook-0.0.13.dist-info → scorebook-0.0.15.dist-info}/licenses/LICENSE +0 -0
tutorials/quickstarts/getting_started.ipynb
@@ -0,0 +1,197 @@
+ {
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "id": "bda7b5add96e4d97",
+ "metadata": {},
+ "source": [
+ "# Getting Started with Trismik's Adaptive Testing\n",
+ "\n",
+ "This notebook demonstrates how to run Trismik's adaptive evaluations using Scorebook.\n",
+ "\n",
+ "## What is Adaptive Testing?\n",
+ "\n",
+ "Trismik’s adaptive testing service leverages item response theory (IRT), a psychometric framework, to evaluate large language models. Using computerized adaptive testing (CAT), it dynamically selects the most informative items, enabling faster, more cost-efficient model evaluations that require fewer items.\n",
+ "\n",
+ "## Setup\n",
+ "\n",
+ "### Install Scorebook\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "id": "286eca2349c6ddc6",
+ "metadata": {},
+ "source": "!pip install scorebook",
+ "outputs": [],
+ "execution_count": null
+ },
+ {
+ "cell_type": "markdown",
+ "id": "d2fa56528e14c46d",
+ "metadata": {},
+ "source": [
+ "\n",
+ "\n",
+ "### Generate a Trismik API Key\n",
+ "\n",
+ "To run an adaptive evaluation, a Trismik API key is required. You can [sign up](https://dashboard.trismik.com/signup) for a free Trismik account and generate an API key.\n",
+ "\n",
+ "**How to generate an API key from the Trismik dashboard**:\n",
+ "1. Click on your initials in the top-right corner of the screen.\n",
+ "2. Click on \"API Keys\" in the drop-down menu.\n",
+ "3. Click \"Create API Key\" to create a new API key.\n",
+ "\n",
+ "### Set Trismik API Key"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "id": "5ed8a62ac56560e9",
+ "metadata": {},
+ "source": [
+ "# Set your API key here and run this cell to login\n",
+ "TRISMIK_API_KEY = \"your-trismik-api-key-here\""
+ ],
+ "outputs": [],
+ "execution_count": null
+ },
+ {
+ "cell_type": "markdown",
+ "id": "10d842e5b99e95bd",
+ "metadata": {},
+ "source": [
+ "### Login with Trismik API Key"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "id": "initial_id",
+ "metadata": {},
+ "source": [
+ "import scorebook\n",
+ "\n",
+ "scorebook.login(TRISMIK_API_KEY)\n",
+ "print(\"✓ Logged in to Trismik\")"
+ ],
+ "outputs": [],
+ "execution_count": null
+ },
+ {
+ "cell_type": "markdown",
+ "id": "eedbc41adba92248",
+ "metadata": {},
+ "source": [
+ "### Create a Project\n",
+ "\n",
+ "When running an adaptive evaluation, your evaluation results are stored under a project on the Trismik dashboard. Projects can be created from the dashboard's interface or programmatically via Scorebook."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "id": "bd7271800bf91478",
+ "metadata": {},
+ "source": [
+ "from scorebook import create_project\n",
+ "\n",
+ "# Create a project\n",
+ "project = create_project(\n",
+ "    name = \"Quick-Start Guides\",\n",
+ "    description = \"A project created for Trismik's quick-start guides.\"\n",
+ ")\n",
+ "\n",
+ "print(\"✓ Project created\")\n",
+ "print(f\"Project ID: {project.id}\")"
+ ],
+ "outputs": [],
+ "execution_count": null
+ },
+ {
+ "cell_type": "markdown",
+ "id": "e30ea2a2674e6005",
+ "metadata": {},
+ "source": [
+ "## Run an Adaptive Evaluation\n",
+ "\n",
+ "For this quick-start guide, we will use a mock model that replicates the responses generated by an Amazon LLM (Nova-Pro). `mock_llm` is an inference function that accepts a list of model inputs and returns a list of model outputs for scoring. In this example, we use pre-computed model responses to showcase how adaptive evaluations work."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "id": "d9f96d063e08c4fd",
+ "metadata": {},
+ "source": [
+ "from scorebook.utils.mock_llm import mock_llm\n",
+ "\n",
+ "# Run adaptive evaluation\n",
+ "results = scorebook.evaluate(\n",
+ "    inference = mock_llm,\n",
+ "    datasets = \"trismik/MMLUPro:adaptive\",\n",
+ "    experiment_id = \"Getting-Started\",\n",
+ "    project_id = project.id,\n",
+ ")\n",
+ "\n",
+ "# Print the adaptive evaluation results\n",
+ "print(\"✓ Adaptive evaluation complete!\")\n",
+ "print(\"Results: \", results[0][\"score\"])\n",
+ "\n",
+ "print(f\"You can view your results here: https://dashboard.trismik.com/projects/{project.id}\")"
+ ],
+ "outputs": [],
+ "execution_count": null
+ },
+ {
+ "cell_type": "markdown",
+ "id": "63d3f34a76e90ac0",
+ "metadata": {},
+ "source": [
+ "### Adaptive Evaluation Results\n",
+ "\n",
+ "The metrics generated by an adaptive evaluation are:\n",
+ "\n",
+ "- Theta (θ): The primary score measuring model ability on the dataset; a higher value represents better performance.\n",
+ "- Standard Error: The uncertainty in the theta estimate; theta is a proxy for the underlying metric, so each estimate carries an associated uncertainty.\n",
+ "\n",
+ "You can find more information about adaptive testing [here](https://docs.trismik.com/adaptiveTesting/adaptive-testing-introduction/)!\n",
+ "\n",
+ "---\n",
+ "\n",
+ "## Next Steps\n",
+ "\n",
+ "**More Quick-Start Guides**:\n",
+ "\n",
+ "1. [HuggingFace Adaptive Evaluation](https://colab.research.google.com/github/trismik/scorebook/blob/main/tutorials/quickstarts/adaptive_evaluations/adaptive_evaluation_qwen_demo.ipynb): A demo showcasing how Scorebook's adaptive evaluations can be used for local HF-based models.\n",
+ "2. [API-based Adaptive Evaluation](https://colab.research.google.com/github/trismik/scorebook/blob/main/tutorials/quickstarts/adaptive_evaluations/adaptive_evaluation_openai_demo.ipynb): A demo showcasing how Scorebook's adaptive evaluations can be used for API-based models, using the OpenAI API.\n",
+ "3. [HuggingFace Full Dataset Evaluation](https://colab.research.google.com/github/trismik/scorebook/blob/main/tutorials/quickstarts/classical_evaluations/classical_evaluation_demo.ipynb): A demo showcasing how Scorebook can be used for full dataset evaluations with familiar metrics, like `Accuracy`.\n",
+ "\n",
+ "**More details on Adaptive Testing and Scorebook**:\n",
+ "\n",
+ "- [Introduction to Adaptive Testing](https://docs.trismik.com/adaptiveTesting/adaptive-testing-introduction/): A quick introduction to Adaptive Testing.\n",
+ "- [Dataset Page](https://dashboard.trismik.com/datasets): Trismik's full set of currently available adaptive datasets on the Trismik dashboard.\n",
+ "- [Scorebook Docs](https://docs.trismik.com/scorebook/introduction-to-scorebook/): Scorebook's full documentation.\n",
+ "- [Scorebook Repository](https://github.com/trismik/scorebook): Scorebook is an open-source library; view the code and more examples."
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3 (ipykernel)",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.13.7"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+ }
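
For orientation, the notebook above condenses to a short plain-Python script. The sketch below is illustrative only and is not itself part of the wheel: it reuses only the calls shown in the cells (scorebook.login, create_project, scorebook.evaluate, mock_llm), and reading the key from a TRISMIK_API_KEY environment variable is an assumption made for the example in place of the hard-coded key.

# Illustrative sketch of the getting_started.ipynb flow; not part of the package diff.
import os

import scorebook
from scorebook import create_project
from scorebook.utils.mock_llm import mock_llm

# Assumption: the API key is supplied via an environment variable rather than
# hard-coded as in the notebook cell.
scorebook.login(os.environ["TRISMIK_API_KEY"])

project = create_project(
    name="Quick-Start Guides",
    description="A project created for Trismik's quick-start guides.",
)

results = scorebook.evaluate(
    inference=mock_llm,                   # pre-computed mock responses bundled with the wheel
    datasets="trismik/MMLUPro:adaptive",  # adaptive dataset used in the notebook
    experiment_id="Getting-Started",
    project_id=project.id,
)

print("Results:", results[0]["score"])    # theta and standard error
print(f"https://dashboard.trismik.com/projects/{project.id}")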
tutorials/utils/__init__.py
@@ -0,0 +1,35 @@
+ """
+ Helper utilities for Scorebook examples.
+
+ This module provides common helper functions used across multiple Scorebook examples
+ for setup, output handling, and argument parsing.
+ """
+
+ # Argument parsing utilities
+ from .args_parser import (
+     add_model_selection_arg,
+     create_parser,
+     parse_args_with_config,
+     setup_batch_model_parser,
+     setup_openai_model_parser,
+ )
+
+ # Output utilities
+ from .output import save_results_to_json
+
+ # Setup utilities
+ from .setup import setup_logging, setup_output_directory
+
+ __all__ = [
+     # Setup
+     "setup_logging",
+     "setup_output_directory",
+     # Output
+     "save_results_to_json",
+     # Argument parsing
+     "create_parser",
+     "add_model_selection_arg",
+     "setup_openai_model_parser",
+     "setup_batch_model_parser",
+     "parse_args_with_config",
+ ]
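
Because of these re-exports, tutorial scripts can import every helper from tutorials.utils directly rather than from the individual submodules; a minimal sketch, assuming the wheel's tutorials package is importable, and not itself part of the diff:

# Illustrative only; assumes the tutorials package from this wheel is on the import path.
from tutorials.utils import (
    parse_args_with_config,
    save_results_to_json,
    setup_logging,
    setup_output_directory,
)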
tutorials/utils/args_parser.py
@@ -0,0 +1,132 @@
+ """
+ Generic argument parsing utilities for Scorebook examples.
+
+ This module provides reusable argument parsing functions that can be used
+ across multiple Scorebook examples for consistent command-line interfaces.
+ """
+
+ import argparse
+ from typing import Any, Dict, List, Optional
+
+
+ def create_parser(description: str) -> argparse.ArgumentParser:
+     """Create a basic argument parser with a description.
+
+     Args:
+         description: Description for the argument parser
+
+     Returns:
+         Configured ArgumentParser instance
+     """
+     return argparse.ArgumentParser(description=description)
+
+
+ def add_model_selection_arg(
+     parser: argparse.ArgumentParser,
+     default: str = "gpt-4o-mini",
+     help_text: Optional[str] = None,
+     supported_models: Optional[List[str]] = None,
+ ) -> argparse.ArgumentParser:
+     """Add model selection argument to parser.
+
+     Args:
+         parser: ArgumentParser to add the argument to
+         default: Default model name
+         help_text: Custom help text for the argument
+         supported_models: List of supported models for validation
+
+     Returns:
+         The modified parser
+     """
+     if help_text is None:
+         help_text = f"OpenAI model to use for inference (default: {default})"
+         if supported_models:
+             help_text += f". Supported models: {', '.join(supported_models)}"
+
+     parser.add_argument(
+         "--model",
+         type=str,
+         default=default,
+         help=help_text,
+     )
+     return parser
+
+
+ def setup_openai_model_parser(
+     description: str = "Select OpenAI model for evaluation.",
+     default: str = "gpt-4o-mini",
+     supported_models: Optional[List[str]] = None,
+ ) -> str:
+     """Set up and parse OpenAI model selection arguments.
+
+     Args:
+         description: Description for the argument parser
+         default: Default model name
+         supported_models: List of supported models for help text
+
+     Returns:
+         Selected model name
+     """
+     parser = create_parser(description)
+     add_model_selection_arg(parser, default=default, supported_models=supported_models)
+     args = parser.parse_args()
+     return str(args.model)
+
+
+ def setup_batch_model_parser(
+     description: str = "Select OpenAI model for batch evaluation.", default: str = "gpt-4o-mini"
+ ) -> str:
+     """Set up and parse OpenAI model selection arguments for batch inference.
+
+     Args:
+         description: Description for the argument parser
+         default: Default model name
+
+     Returns:
+         Selected model name
+     """
+     supported_models = ["gpt-4o", "gpt-4o-mini", "gpt-4-turbo", "gpt-3.5-turbo"]
+     help_text = (
+         f"OpenAI model to use for batch inference. "
+         f"Note: Only select models support the Batch API. "
+         f"Supported models include: {', '.join(supported_models)}. "
+         f"Default: {default}"
+     )
+
+     parser = create_parser(description)
+     add_model_selection_arg(parser, default=default, help_text=help_text)
+     args = parser.parse_args()
+     return str(args.model)
+
+
+ def parse_args_with_config(config: Dict[str, Dict[str, Any]]) -> Dict[str, Any]:
+     """Parse arguments using a configuration dictionary.
+
+     Args:
+         config: Dictionary defining arguments to add. Format:
+             {
+                 "arg_name": {
+                     "type": str,
+                     "default": "default_value",
+                     "help": "Help text",
+                     "required": False # optional
+                 }
+             }
+
+     Returns:
+         Dictionary of parsed argument values
+     """
+     parser = argparse.ArgumentParser()
+
+     for arg_name, arg_config in config.items():
+         kwargs = {"type": arg_config.get("type", str), "help": arg_config.get("help", "")}
+
+         if "default" in arg_config:
+             kwargs["default"] = arg_config["default"]
+         if "required" in arg_config:
+             kwargs["required"] = arg_config["required"]
+
+         parser.add_argument(f"--{arg_name.replace('_', '-')}", **kwargs)
+
+     args = parser.parse_args()
+     return vars(args)
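
As a usage illustration (not itself in the wheel), the config format documented in parse_args_with_config maps argument names to argparse keyword arguments; the argument names and defaults below are invented for the example:

# Illustrative only; "model" and "item_limit" are made-up arguments.
from tutorials.utils.args_parser import parse_args_with_config

args = parse_args_with_config(
    {
        "model": {"type": str, "default": "gpt-4o-mini", "help": "Model to evaluate"},
        "item_limit": {"type": int, "default": 10, "help": "Maximum items per dataset"},
    }
)
# Flags become --model and --item-limit; argparse maps them back to these keys.
print(args["model"], args["item_limit"])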
tutorials/utils/output.py
@@ -0,0 +1,23 @@
+ """
+ Utility functions for saving Scorebook evaluation results.
+
+ This module provides common helper functions used across multiple Scorebook examples
+ for saving evaluation results to files.
+ """
+
+ import json
+ from pathlib import Path
+ from typing import Any
+
+
+ def save_results_to_json(results: Any, output_dir: Path, filename: str) -> None:
+     """Save evaluation results to a JSON file.
+
+     Args:
+         results: The evaluation results to save
+         output_dir: Directory to save the file in
+         filename: Name of the output file (should include .json extension)
+     """
+     output_path = output_dir / filename
+     with open(output_path, "w") as output_file:
+         json.dump(results, output_file, indent=4)
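
As a usage illustration (not itself in the wheel), note that save_results_to_json opens output_dir / filename directly, so the target directory must already exist; the results value below is a placeholder:

# Illustrative only; the results value and paths are placeholders.
from pathlib import Path

from tutorials.utils.output import save_results_to_json

output_dir = Path("example_results")
output_dir.mkdir(parents=True, exist_ok=True)  # the helper does not create the directory itself
save_results_to_json([{"dataset": "demo", "score": {"accuracy": 0.9}}], output_dir, "demo.json")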
tutorials/utils/setup.py
@@ -0,0 +1,98 @@
+ """
+ Utility functions for setting up Scorebook examples.
+
+ This module provides common helper functions used across multiple Scorebook examples
+ for output directory setup and logging configuration.
+ """
+
+ import logging
+ from datetime import datetime
+ from pathlib import Path
+ from typing import Optional
+
+
+ def setup_output_directory() -> Path:
+     """Parse command line arguments and setup output directory."""
+     import argparse
+
+     parser = argparse.ArgumentParser(description="Run evaluation and save results.")
+     parser.add_argument(
+         "--output-dir",
+         type=str,
+         default=str(Path.cwd() / "examples/example_results"),
+         help=(
+             "Directory to save evaluation outputs (CSV and JSON). "
+             "Defaults to ./examples/example_results in the current working directory."
+         ),
+     )
+     args = parser.parse_args()
+     output_dir = Path(args.output_dir)
+     output_dir.mkdir(parents=True, exist_ok=True)
+     print(f"Saving results to {output_dir}")
+     return output_dir
+
+
+ def setup_logging(
+     log_dir: str = "logs",
+     experiment_id: Optional[str] = None,
+     base_dir: Optional[Path] = None,
+ ) -> Path:
+     """Configure logging for evaluation runs.
+
+     Args:
+         log_dir: Name of the log directory (default: "logs")
+         experiment_id: Optional identifier for the experiment
+         base_dir: Base directory where log_dir should be created.
+             If None, uses current working directory.
+     """
+     if base_dir is None:
+         base_dir = Path.cwd()
+
+     log_dir_path: Path = base_dir / log_dir
+     log_dir_path.mkdir(exist_ok=True, parents=True)
+
+     timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
+     if experiment_id:
+         log_file = log_dir_path / f"evaluation_{experiment_id}_{timestamp}.log"
+     else:
+         log_file = log_dir_path / f"evaluation_{timestamp}.log"
+
+     # Create file handler for all logs (same as before)
+     file_handler = logging.FileHandler(log_file)
+     file_handler.setLevel(logging.DEBUG)
+     file_handler.setFormatter(
+         logging.Formatter("%(asctime)s - %(levelname)s - %(name)s - %(message)s")
+     )
+
+     # Create console handler for warnings and errors only
+     console_handler = logging.StreamHandler()
+     console_handler.setLevel(logging.WARNING)
+     console_handler.setFormatter(logging.Formatter("%(levelname)s - %(name)s - %(message)s"))
+
+     # Configure root logger with both handlers
+     logging.basicConfig(
+         level=logging.INFO,
+         handlers=[file_handler, console_handler],
+         force=True,
+     )
+
+     # Set scorebook loggers to DEBUG level to capture all scorebook logs
+     scorebook_logger = logging.getLogger("scorebook")
+     scorebook_logger.setLevel(logging.DEBUG)
+
+     # Ensure trismik_services logs are captured at DEBUG level
+     trismik_services_logger = logging.getLogger("scorebook.trismik_services")
+     trismik_services_logger.setLevel(logging.DEBUG)
+
+     # Ensure evaluate logs are captured at DEBUG level
+     evaluate_logger = logging.getLogger("scorebook.evaluate._sync.evaluate")
+     evaluate_logger.setLevel(logging.DEBUG)
+     evaluate_logger = logging.getLogger("scorebook.evaluate._async.evaluate_async")
+     evaluate_logger.setLevel(logging.DEBUG)
+
+     # Exclude OpenAI inference logs to reduce noise
+     openai_logger = logging.getLogger("scorebook.inference.openai")
+     openai_logger.setLevel(logging.WARNING)  # Only log warnings and errors
+
+     print(f"Logging to {log_file}")
+     return log_file
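
As a usage illustration (not itself in the wheel), an example script might combine the two helpers above as follows; setup_output_directory() defines and parses its own --output-dir flag, so it acts as the script's argument parser, and the experiment_id value here is arbitrary:

# Illustrative only.
from tutorials.utils.setup import setup_logging, setup_output_directory

output_dir = setup_output_directory()           # parses --output-dir and creates the directory
log_file = setup_logging(experiment_id="demo")  # writes ./logs/evaluation_demo_<timestamp>.log
print(output_dir, log_file)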
scorebook/metrics/metric_registry.py
@@ -1,105 +0,0 @@
- """
- Registry module for evaluation metrics.
-
- This module maintains a centralized registry of available evaluation metrics
- that can be used to assess model performance. It provides a single access point
- to retrieve all implemented metric classes.
- """
-
- from typing import Any, Callable, Dict, List, Type, Union
-
- from scorebook.metrics.metric_base import MetricBase
-
-
- class MetricRegistry:
-     """A registry for evaluation metrics.
-
-     This class provides a central registry for all evaluation metrics in the system.
-     It allows metrics to be registered with unique names and retrieved either by
-     name or by class. The registry ensures that metrics are properly initialized
-     and accessible throughout the application.
-
-     The registry supports:
-     - Registering new metric classes with optional custom names
-     - Retrieving metric instances by name or class
-     - Listing all available metrics
-
-     Usage:
-         @MetricRegistry.register("custom_name")
-         class MyMetric(MetricBase):
-             ...
-
-         # Get by name
-         metric = MetricRegistry.get("custom_name")
-
-         # Get by class
-         metric = MetricRegistry.get(MyMetric)
-
-         # List available metrics
-         metrics = MetricRegistry.list_metrics()
-     """
-
-     _registry: Dict[str, Type[MetricBase]] = {}
-
-     @classmethod
-     def register(cls) -> Callable[[Type[MetricBase]], Type[MetricBase]]:
-         """
-         Register a metric class in the registry.
-
-         Returns:
-             A decorator that registers the class and returns it.
-
-         Raises:
-             ValueError: If a metric with the given name is already registered.
-         """
-
-         def decorator(metric_cls: Type[MetricBase]) -> Type[MetricBase]:
-
-             key = metric_cls.__name__.lower()
-             if key in cls._registry:
-                 raise ValueError(f"Metric '{key}' is already registered")
-             cls._registry[key] = metric_cls
-             return metric_cls
-
-         return decorator
-
-     @classmethod
-     def get(cls, name_or_class: Union[str, Type[MetricBase]], **kwargs: Any) -> MetricBase:
-         """
-         Get an instance of a registered metric by name or class.
-
-         Args:
-             name_or_class: The metric name (string) or class (subclass of BaseMetric).
-             **kwargs: Additional arguments to pass to the metric's constructor.
-
-         Returns:
-             An instance of the requested metric.
-
-         Raises:
-             ValueError: If the metric name is not registered.
-         """
-         # If input is a class that's a subclass of BaseMetric, instantiate it directly
-         if isinstance(name_or_class, type) and issubclass(name_or_class, MetricBase):
-             return name_or_class(**kwargs)
-
-         # If input is a string, look up the class in the registry
-         if isinstance(name_or_class, str):
-             key = name_or_class.lower()
-             if key not in cls._registry:
-                 raise ValueError(f"Metric '{name_or_class}' not registered.")
-             return cls._registry[key](**kwargs)
-
-         raise ValueError(
-             f"Invalid metric type: {type(name_or_class)}."
-             f"Must be string name or BaseMetric subclass"
-         )
-
-     @classmethod
-     def list_metrics(cls) -> List[str]:
-         """
-         List all registered metrics.
-
-         Returns:
-             A list of metric names.
-         """
-         return list(cls._registry.keys())
scorebook/trismik/__init__.py
@@ -1,10 +0,0 @@
- """Trismik authentication and API integration.
-
- Note: Trismik evaluation functionality has been moved to scorebook.evaluate module.
- This module now only provides authentication functions.
- """
-
- # Import shared credential functions
- from .credentials import get_stored_token, get_token, login, logout, whoami
-
- __all__ = ["login", "logout", "whoami", "get_stored_token", "get_token"]