scorebook-0.0.4-py3-none-any.whl → scorebook-0.0.6-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
scorebook/trismik/adaptive_testing_service.py ADDED
@@ -0,0 +1,141 @@
+ """Trismik adaptive testing service integration."""
+
+ import asyncio
+ import dataclasses
+ import inspect
+ import logging
+ from typing import Any, Callable, Iterable, Mapping
+
+ from trismik.adaptive_test import AdaptiveTest
+ from trismik.client_async import TrismikAsyncClient
+ from trismik.types import TrismikMultipleChoiceTextItem, TrismikRunMetadata
+
+ from scorebook.types import AdaptiveEvalRunResult, AdaptiveEvalRunSpec
+
+ from .login import get_token
+
+ logger = logging.getLogger(__name__)
+
+
+ async def run_adaptive_evaluation(
+     inference: Callable,
+     adaptive_run_spec: AdaptiveEvalRunSpec,
+     experiment_id: str,
+     project_id: str,
+     metadata: Any,
+ ) -> AdaptiveEvalRunResult:
+     """Run an adaptive evaluation using the Trismik API.
+
+     Args:
+         inference: Function to run inference.
+         adaptive_run_spec: Specification for the adaptive evaluation.
+         experiment_id: Experiment identifier.
+         project_id: Trismik project ID.
+         metadata: Additional metadata.
+
+     Returns:
+         Results from the adaptive evaluation.
+     """
+     runner = AdaptiveTest(
+         make_trismik_inference(inference),
+         client=TrismikAsyncClient(
+             service_url="https://api-stage.trismik.com/adaptive-testing", api_key=get_token()
+         ),
+     )
+
+     logger.debug(
+         "test_id: %s, project_id: %s, experiment_id: %s",
+         adaptive_run_spec.dataset,
+         project_id,
+         experiment_id,
+     )
+     trismik_results = runner.run(
+         adaptive_run_spec.dataset,
+         project_id,
+         experiment_id,
+         run_metadata=TrismikRunMetadata(
+             model_metadata=TrismikRunMetadata.ModelMetadata(name="unknown"),
+             test_configuration={},
+             inference_setup={},
+         ),
+         return_dict=False,
+     )
+
+     # Convert TrismikRunResults to AdaptiveEvalRunResult by extracting
+     # the scores from the Trismik results.
+     scores = {}
+     if hasattr(trismik_results, "scores") and trismik_results.scores:
+         scores = trismik_results.scores
+     elif hasattr(trismik_results, "__dict__"):
+         # If scores aren't directly available, include all public attributes as scores.
+         scores = {k: v for k, v in trismik_results.__dict__.items() if not k.startswith("_")}
+
+     # Convert AdaptiveTestScore objects to JSON-serializable dictionaries.
+     def make_json_serializable(obj: Any) -> Any:
+         if hasattr(obj, "theta") and hasattr(obj, "std_error"):
+             # This is likely an AdaptiveTestScore object.
+             return {"theta": obj.theta, "std_error": obj.std_error}
+         elif isinstance(obj, dict):
+             return {k: make_json_serializable(v) for k, v in obj.items()}
+         elif isinstance(obj, (list, tuple)):
+             return [make_json_serializable(item) for item in obj]
+         else:
+             return obj
+
+     # Make the scores JSON serializable.
+     scores = make_json_serializable(scores)
+
+     return AdaptiveEvalRunResult(run_spec=adaptive_run_spec, scores=scores)
+
+
+ def make_trismik_inference(
+     inference_function: Callable,
+     return_list: bool = False,
+ ) -> Callable[[Any], Any]:
+     """Wrap an inference function for flexible input handling.
+
+     Takes a function expecting a list of dicts and makes it also accept a
+     single dict or TrismikMultipleChoiceTextItem.
+     """
+
+     # Check if the inference function is async.
+     is_async = inspect.iscoroutinefunction(inference_function) or (
+         hasattr(inference_function, "__call__")
+         and inspect.iscoroutinefunction(inference_function.__call__)
+     )
+
+     def sync_trismik_inference_function(eval_items: Any, **kwargs: Any) -> Any:
+         # Single TrismikMultipleChoiceTextItem dataclass.
+         if isinstance(eval_items, TrismikMultipleChoiceTextItem):
+             eval_item_dict = dataclasses.asdict(eval_items)
+             results = inference_function([eval_item_dict], **kwargs)
+             if is_async:
+                 results = asyncio.run(results)
+             return results if return_list else results[0]
+
+         # Single item (a mapping).
+         if isinstance(eval_items, Mapping):
+             results = inference_function([eval_items], **kwargs)
+             if is_async:
+                 results = asyncio.run(results)
+             return results if return_list else results[0]
+
+         # Iterable of items (but not a string/bytes).
+         if isinstance(eval_items, Iterable) and not isinstance(eval_items, (str, bytes)):
+             # Convert any TrismikMultipleChoiceTextItem instances to dicts.
+             converted_items = []
+             for item in eval_items:
+                 if isinstance(item, TrismikMultipleChoiceTextItem):
+                     converted_items.append(dataclasses.asdict(item))
+                 else:
+                     converted_items.append(item)
+             results = inference_function(converted_items, **kwargs)
+             if is_async:
+                 results = asyncio.run(results)
+             return results
+
+         raise TypeError(
+             "Expected a single item (Mapping[str, Any] or TrismikMultipleChoiceTextItem) "
+             "or an iterable of such items."
+         )
+
+     return sync_trismik_inference_function
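
To make the wrapper's dispatch behavior concrete, here is a small usage sketch. The my_inference function and its item keys are invented for illustration; only make_trismik_inference comes from the module above.

    # Hypothetical batch inference function: expects a list of dicts,
    # returns one answer per item.
    def my_inference(items):
        return [item["question"].upper() for item in items]

    wrapped = make_trismik_inference(my_inference)

    item = {"question": "capital of France?"}
    print(wrapped(item))    # single mapping -> unwrapped result: "CAPITAL OF FRANCE?"
    print(wrapped([item]))  # iterable -> list of results: ["CAPITAL OF FRANCE?"]

Because return_list defaults to False, a single input yields a single result rather than a one-element list.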
scorebook/trismik/login.py ADDED
@@ -0,0 +1,120 @@
+ """Authentication and token management for the Trismik API."""
+
+ import logging
+ import os
+ import pathlib
+ from typing import Optional
+
+ logger = logging.getLogger(__name__)
+
+
+ def get_scorebook_config_dir() -> str:
+     """Get the scorebook config directory."""
+     return os.path.join(os.path.expanduser("~"), ".scorebook")
+
+
+ def get_token_path() -> str:
+     """Get the path where the Trismik token is stored."""
+     if "TRISMIK_TOKEN_PATH" in os.environ:
+         return os.environ["TRISMIK_TOKEN_PATH"]
+     return os.path.join(get_scorebook_config_dir(), "config")
+
+
+ def save_token(token: str) -> None:
+     """Save the token to the local cache directory."""
+     token_path = get_token_path()
+
+     # Create the config directory if it doesn't exist.
+     os.makedirs(os.path.dirname(token_path), exist_ok=True)
+
+     # Write the token to the file.
+     pathlib.Path(token_path).write_text(token.strip())
+
+     # Set restrictive permissions (owner read/write only).
+     os.chmod(token_path, 0o600)
+
+
+ def get_stored_token() -> Optional[str]:
+     """Retrieve the stored token from the cache directory."""
+     token_path = get_token_path()
+
+     if not os.path.exists(token_path):
+         return None
+
+     try:
+         token = pathlib.Path(token_path).read_text().strip()
+         return token if token else None
+     except (OSError, IOError) as e:
+         logger.warning(f"Failed to read token from {token_path}: {e}")
+         return None
+
+
+ def get_token() -> Optional[str]:
+     """Get the Trismik API token in order of priority.
+
+     Priority order:
+     1. TRISMIK_API_KEY environment variable
+     2. Stored token file
+     """
+     # Check the environment variable first.
+     env_token = os.environ.get("TRISMIK_API_KEY")
+     if env_token:
+         return env_token.strip()
+
+     # Fall back to the stored token.
+     return get_stored_token()
+
+
+ def validate_token(token: str) -> bool:
+     """Validate the token by making a test API call to Trismik."""
+     # TODO: Implement actual API validation once an endpoint is available.
+     # This would typically make a request like:
+     #   response = requests.get("https://api.trismik.com/whoami",
+     #                           headers={"Authorization": f"Bearer {token}"})
+     #   return response.status_code == 200
+
+     # For now, just check that the token isn't empty.
+     return bool(token and token.strip())
+
+
+ def login(trismik_api_key: str) -> None:
+     """Log in to Trismik by saving the API key locally.
+
+     Args:
+         trismik_api_key: The API key to use.
+
+     Raises:
+         ValueError: If the API key is empty or invalid.
+     """
+     if not trismik_api_key:
+         raise ValueError("API key cannot be empty")
+
+     # Validate the token.
+     if not validate_token(trismik_api_key):
+         raise ValueError("Invalid API key provided")
+
+     # Save the token.
+     save_token(trismik_api_key)
+
+
+ def logout() -> bool:
+     """Remove the stored token.
+
+     Returns:
+         bool: True if a token was removed, False if no token was found.
+     """
+     token_path = get_token_path()
+
+     if os.path.exists(token_path):
+         os.remove(token_path)
+         return True
+     else:
+         return False
+
+
+ def whoami() -> Optional[str]:
+     """Return information about the current user/token.
+
+     Returns:
+         str: The stored token if logged in, None if not logged in.
+     """
+     return get_stored_token()
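
The precedence implemented by get_token (environment variable first, then the stored file) can be exercised with a short script. This is a sketch assuming a clean environment; the key values are placeholders.

    import os
    from scorebook.trismik.login import get_token, login, logout

    login("placeholder-key")                  # validates, then writes the token file
    assert get_token() == "placeholder-key"

    os.environ["TRISMIK_API_KEY"] = "env-key"
    assert get_token() == "env-key"           # the environment variable takes priority

    del os.environ["TRISMIK_API_KEY"]
    logout()                                  # removes the stored token file
    assert get_token() is None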
scorebook/types.py ADDED
@@ -0,0 +1,165 @@
+ """Type definitions for the scorebook evaluation framework."""
+
+ from dataclasses import dataclass
+ from typing import Any, Dict, List, Optional, Union
+
+ from scorebook.eval_dataset import EvalDataset
+
+
+ @dataclass
+ class AdaptiveEvalDataset:
+     """Represents a dataset configured for adaptive evaluation."""
+
+     name: str
+
+
+ @dataclass
+ class EvalRunSpec:
+     """Specification for a single evaluation run with dataset and hyperparameters."""
+
+     dataset: EvalDataset
+     dataset_index: int
+     hyperparameter_config: Dict[str, Any]
+     hyperparameters_index: int
+     items: List[Dict[str, Any]]
+     labels: List[Any]
+
+     def __str__(self) -> str:
+         """Return the string representation of this EvalRunSpec."""
+         return (
+             f"EvalRunSpec(dataset={self.dataset.name}, "
+             f"dataset_index={self.dataset_index}, "
+             f"hyperparameter_config={self.hyperparameter_config}, "
+             f"hyperparameters_index={self.hyperparameters_index})"
+         )
+
+
+ @dataclass
+ class AdaptiveEvalRunSpec:
+     """Specification for an adaptive evaluation run."""
+
+     dataset: str
+     dataset_index: int
+     hyperparameter_config: Dict[str, Any]
+     hyperparameters_index: int
+     experiment_id: str
+     project_id: str
+     metadata: Optional[Dict[str, Any]] = None
+
+
+ @dataclass
+ class ClassicEvalRunResult:
+     """Results from executing a classic evaluation run."""
+
+     run_spec: EvalRunSpec
+     outputs: List[Any]
+     scores: Dict[str, Any]
+
+     @property
+     def item_scores(self) -> List[Dict[str, Any]]:
+         """Return a list of dictionaries containing scores for each evaluated item."""
+         results = []
+
+         for idx, output in enumerate(self.outputs):
+             if idx >= len(self.run_spec.items):
+                 break
+
+             result = {
+                 "item_id": idx,
+                 "dataset_name": self.run_spec.dataset.name,
+                 "inference_output": output,
+                 **self.run_spec.hyperparameter_config,
+             }
+
+             # Add individual item scores if available.
+             for metric_name, metric_data in self.scores.items():
+                 if isinstance(metric_data, dict) and "item_scores" in metric_data:
+                     if idx < len(metric_data["item_scores"]):
+                         result[metric_name] = metric_data["item_scores"][idx]
+                 else:
+                     # If the score is a single value, replicate it for each item.
+                     result[metric_name] = metric_data
+
+             results.append(result)
+
+         return results
+
+     @property
+     def aggregate_scores(self) -> Dict[str, Any]:
+         """Return the aggregated scores for this run."""
+         result = {
+             "dataset": self.run_spec.dataset.name,
+             **self.run_spec.hyperparameter_config,
+         }
+
+         # Add aggregate scores from metrics.
+         for metric_name, metric_data in self.scores.items():
+             if isinstance(metric_data, dict) and "aggregate_scores" in metric_data:
+                 # Flatten the aggregate scores from each metric.
+                 for key, value in metric_data["aggregate_scores"].items():
+                     score_key = key if key == metric_name else f"{metric_name}_{key}"
+                     result[score_key] = value
+             else:
+                 # If the score is a single value, use it as is.
+                 result[metric_name] = metric_data
+
+         return result
+
+
+ @dataclass
+ class AdaptiveEvalRunResult:
+     """Results from executing an adaptive evaluation run."""
+
+     run_spec: AdaptiveEvalRunSpec
+     scores: Dict[str, Any]
+
+     @property
+     def aggregate_scores(self) -> Dict[str, Any]:
+         """Return the aggregated scores for this adaptive run."""
+         result = {
+             "dataset": self.run_spec.dataset,
+             "experiment_id": self.run_spec.experiment_id,
+             "project_id": self.run_spec.project_id,
+         }
+
+         # Safely unpack hyperparameter_config if it's not None.
+         if self.run_spec.hyperparameter_config:
+             result.update(self.run_spec.hyperparameter_config)
+
+         # Safely unpack metadata if it's not None.
+         if self.run_spec.metadata:
+             result.update(self.run_spec.metadata)
+
+         # Safely unpack scores if they're not None.
+         if self.scores:
+             result.update(self.scores)
+
+         return result
+
+
+ @dataclass
+ class EvalResult:
+     """Container for evaluation results across multiple runs."""
+
+     run_results: List[Union[ClassicEvalRunResult, AdaptiveEvalRunResult]]
+
+     @property
+     def item_scores(self) -> List[Dict[str, Any]]:
+         """Return a list of dictionaries containing scores for each evaluated item."""
+         results = []
+
+         for run_result in self.run_results:
+             if isinstance(run_result, ClassicEvalRunResult):
+                 results.extend(run_result.item_scores)
+
+         return results
+
+     @property
+     def aggregate_scores(self) -> List[Dict[str, Any]]:
+         """Return the aggregated scores across all evaluated runs."""
+         results = []
+
+         for run_result in self.run_results:
+             results.append(run_result.aggregate_scores)
+
+         return results
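
As a sketch of how AdaptiveEvalRunResult.aggregate_scores merges its pieces, with made-up identifiers and score values:

    spec = AdaptiveEvalRunSpec(
        dataset="example-test-id",      # hypothetical Trismik test id
        dataset_index=0,
        hyperparameter_config={"temperature": 0.0},
        hyperparameters_index=0,
        experiment_id="exp-123",
        project_id="proj-456",
    )
    result = AdaptiveEvalRunResult(run_spec=spec, scores={"theta": 1.2, "std_error": 0.3})

    print(result.aggregate_scores)
    # {'dataset': 'example-test-id', 'experiment_id': 'exp-123', 'project_id': 'proj-456',
    #  'temperature': 0.0, 'theta': 1.2, 'std_error': 0.3}

The same property on EvalResult simply collects one such dictionary per run.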
scorebook-0.0.4.dist-info/METADATA → scorebook-0.0.6.dist-info/METADATA
@@ -1,6 +1,6 @@
  Metadata-Version: 2.3
  Name: scorebook
- Version: 0.0.4
+ Version: 0.0.6
  Summary: A Python project for LLM evaluation.
  Author: Euan Campbell
  Author-email: euan@trismik.com
@@ -35,6 +35,7 @@ Requires-Dist: torch ; extra == "examples"
  Requires-Dist: torchaudio ; extra == "examples"
  Requires-Dist: torchvision ; extra == "examples"
  Requires-Dist: transformers ; extra == "examples"
+ Requires-Dist: trismik (>=0.9.3rc3)
  Description-Content-Type: text/markdown

  # Scorebook
scorebook-0.0.4.dist-info/RECORD → scorebook-0.0.6.dist-info/RECORD
@@ -1,31 +1,35 @@
- scorebook/__init__.py,sha256=7ac3KpXU3kKFekq8mZ3cVbF7oQ6Q9E-uqX7ijyte1Q0,406
- scorebook/evaluator.py,sha256=mS3G3PI26nHzqkYX4tqusQZJL5Q1xTxzqshAdwscl0s,14170
+ scorebook/__init__.py,sha256=yHhNIHeLeRwjdyfSg3jtCz-NbQXMlN9fLhHJ1QzPQGQ,548
+ scorebook/cli/__init__.py,sha256=E89jR1DljFSHhfjEGSRKLgz0KhxGyRQ9a3vpUOmQL9o,32
+ scorebook/cli/auth.py,sha256=bv3imsgmY_t52wFoMJt9iu-cKPwvKYkVqZ7nE8EVc6E,2931
+ scorebook/cli/main.py,sha256=cEvShENl6L6feX_sa7FGNTeoz5UtwqzwenmcHaON1hg,1589
+ scorebook/eval_dataset.py,sha256=LSTyxUkT06iEAVYCnjIDFxFgZzRejwiS5CZA-jvy1ns,15098
+ scorebook/evaluate.py,sha256=0Begs5Py9rpapoMixpqjlS2ofigQaGofbjqucABRfuM,19088
  scorebook/exceptions.py,sha256=emq2QY-4mW6VXlq1dxunPjt-xZpLQIxo8Ck_gYxz1VE,1827
  scorebook/inference/__init__.py,sha256=tqSXSyVurc_YRfPypYed8iTH7Fwt7iFCXMxBXnqY-9I,242
  scorebook/inference/bedrock.py,sha256=wllq0ysNFQKWJDEqoN-k96Jx43BHCAvfxm14zMRCf90,10074
- scorebook/inference/openai.py,sha256=FqXua4v4PTYSHrdTm_9fM0Us8Mo2n2LSN94CwRipRw4,7658
+ scorebook/inference/openai.py,sha256=iJVWp0HT9skyM4KXf21yaEjLafdETT5kK3HKl7MZ1hg,8292
  scorebook/inference/portkey.py,sha256=OHSS-sa2aLxuO6fEfG8MsPlhXc_95_-6j7ImbCkY8KE,5952
  scorebook/inference/vertex.py,sha256=jv_Nbt1NJQ6mMUyEuW_idxhj_3fugBojshtpGP9fMeY,9874
+ scorebook/inference_pipeline.py,sha256=-HcGGbwM34fGJ_FlXcyqj_pV6DjWIXRGgICiN_63UsU,3251
  scorebook/metrics/__init__.py,sha256=be_riJNojebXw2xfkMsHHjl3HFKgk9jQWlLkXJHhheI,782
  scorebook/metrics/accuracy.py,sha256=5KQ4hfOn9M94sB7WsXUelJWJiuKfoCGQEl5q5q9vNfo,1467
  scorebook/metrics/metric_base.py,sha256=I3L0DGcRojFp93UGFnXG1tZ2UK9ilTcXXJG6lj5ddXA,857
  scorebook/metrics/metric_registry.py,sha256=jWwt9P3zvtFLlEYrd60v7LS7X251nZczouE02zcCxWg,3402
  scorebook/metrics/precision.py,sha256=AaYPYYKnY74Nwqp_p3jd2Ewf3VHNOJjoRWf5fhb-tXk,563
- scorebook/types/__init__.py,sha256=dXY3Y-GiMipVExzVu7H5pbdFfg4HBMEKxqSTfENywSs,427
- scorebook/types/eval_dataset.py,sha256=dCqOHjGaEb7pGG1VF4aGFn6hngFvlxpxddqsDtM4nTs,13870
- scorebook/types/eval_result.py,sha256=R2zuWrx8p9_4A2W3Gmlp-xGgmelPdg8QB5PoV1hiqRc,4728
- scorebook/types/eval_run_spec.py,sha256=nf7LGa_dG60Qb385W6O6qiu7VlJ03-dpo2X1PgKGcRQ,845
- scorebook/types/inference_pipeline.py,sha256=-HcGGbwM34fGJ_FlXcyqj_pV6DjWIXRGgICiN_63UsU,3251
+ scorebook/trismik/__init__.py,sha256=CiGl1u4GcfYhWmB_fGOlsJPwYeKXtIr-uCXoOv4O8yg,284
+ scorebook/trismik/adaptive_testing_service.py,sha256=S1yAVnrzqtVWprsiNS_l3q5FibQkMuAs7I7YaSFNtKM,5109
+ scorebook/trismik/login.py,sha256=WtJLaNmBMwCi6gT1Bgp4J9x2tq5HDrDI9U074r08TnU,3275
+ scorebook/types.py,sha256=wQNFewn9Ji7nQJhXwRS-hVAL4XV6ePcLVdVQiMfWYzg,5149
  scorebook/utils/__init__.py,sha256=l_bfi9lAMz1oyGnuyKuzYasQKt2DJwffqsbfSl4-GIQ,452
  scorebook/utils/async_utils.py,sha256=OeNvMrOT9P4rIyaCf5IbR3ZIFMtEzXgoAArNbINRtMU,728
  scorebook/utils/build_prompt.py,sha256=L_Y84a1ewm3GvwnSSuUXfPO_M0QL1Dl8UgOS_l_zvh4,1617
  scorebook/utils/io_helpers.py,sha256=ksOJ9ILcZqqt-HwRUYy1NMQbS6RuMh8i2ZzUADLMlQ8,913
  scorebook/utils/jinja_helpers.py,sha256=ksIKHiKdj8N0o7ZJZGasfbSNoAY6K5d9X_KM6mcKYD4,4208
- scorebook/utils/logging_utils.py,sha256=M4BXt369mJo037WYpvuWDoe3oGWVdHWaGo4Vbl0WDL0,60
  scorebook/utils/mappers.py,sha256=OcUnPBrnSUxZNhAzJhVmVWUWmqIKFXLTrK-xLi6_SUg,1259
  scorebook/utils/progress_bars.py,sha256=TBz41w3yFujsO9n8vUjeubgOrmdiAMI2P2SSVqTJzAA,5269
  scorebook/utils/transform_helpers.py,sha256=UnVLtFvcJrtmBEmLsuA4rrX4iJlNUKxm2DkIOGLl-2o,1030
- scorebook-0.0.4.dist-info/LICENSE,sha256=JLH1g9FhxHZf6CBCeQ_xAisPtICVObuNGW1bLPiTYEs,1068
- scorebook-0.0.4.dist-info/METADATA,sha256=7odU7Q8SHfuHru2oBBk1XlZ2tXLi2WaSShbUhfmX60A,11409
- scorebook-0.0.4.dist-info/WHEEL,sha256=b4K_helf-jlQoXBBETfwnf4B04YC67LOev0jo4fX5m8,88
- scorebook-0.0.4.dist-info/RECORD,,
+ scorebook-0.0.6.dist-info/LICENSE,sha256=JLH1g9FhxHZf6CBCeQ_xAisPtICVObuNGW1bLPiTYEs,1068
+ scorebook-0.0.6.dist-info/METADATA,sha256=nAnw4oMdvTXlE79pXqhOmTSZ3ITmVJbFDjc7o_vwOD8,11445
+ scorebook-0.0.6.dist-info/WHEEL,sha256=b4K_helf-jlQoXBBETfwnf4B04YC67LOev0jo4fX5m8,88
+ scorebook-0.0.6.dist-info/entry_points.txt,sha256=9gNd3Q0MEozhJ7fog-Q-Z_PrcGMnF-404Jon40MH2_U,53
+ scorebook-0.0.6.dist-info/RECORD,,
scorebook-0.0.6.dist-info/entry_points.txt ADDED
@@ -0,0 +1,3 @@
+ [console_scripts]
+ scorebook=scorebook.cli.main:main
+
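
The entry point above is what exposes a scorebook console command after installation; it dispatches to scorebook.cli.main:main. A sketch of the equivalent direct call (console-script targets are invoked with no arguments):

    from scorebook.cli.main import main

    main()  # same function the installed `scorebook` command runs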