scorebook 0.0.10__py3-none-any.whl → 0.0.12__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,254 @@
+"""Upload evaluation and scoring results to Trismik's experimentation platform."""
+
+import logging
+from typing import Any, Dict, List, Optional
+
+from trismik.types import (
+    TrismikClassicEvalItem,
+    TrismikClassicEvalMetric,
+    TrismikClassicEvalRequest,
+    TrismikClassicEvalResponse,
+)
+
+from scorebook.evaluate.evaluate_helpers import (
+    create_trismik_async_client,
+    create_trismik_sync_client,
+    get_model_name,
+    normalize_metric_value,
+)
+
+logger = logging.getLogger(__name__)
+
+# Known fields that are not metrics or hyperparameters
+KNOWN_AGGREGATE_FIELDS = {"dataset", "run_id", "run_completed"}
+KNOWN_ITEM_FIELDS = {"id", "dataset_name", "input", "output", "label", "run_id"}
+
+
+def upload_result(
+    run_result: Dict[str, List[Dict[str, Any]]],
+    experiment_id: str,
+    project_id: str,
+    dataset_name: Optional[str] = None,
+    hyperparameters: Optional[Dict[str, Any]] = None,
+    metadata: Optional[Dict[str, Any]] = None,
+    model_name: Optional[str] = None,
+) -> str:
+    """Upload evaluation or scoring results to Trismik's platform (synchronous).
+
+    This function uploads results in the format returned by the evaluate or score
+    functions to the Trismik platform for tracking and analysis.
+
+    Args:
+        run_result: Dict with keys 'aggregate_results' and 'item_results' containing
+            evaluation/scoring results. Structure matches the output of evaluate()/score().
+        experiment_id: Trismik experiment identifier
+        project_id: Trismik project identifier
+        dataset_name: Optional dataset name. If not provided, extracted from metadata
+            or defaults to "Dataset"
+        hyperparameters: Optional dict of hyperparameters. If not provided, extracted
+            from run_result.
+        metadata: Optional metadata dict (can include 'model' and 'dataset' keys)
+        model_name: Optional model name. If not provided, extracted from metadata
+            or defaults to "Model"
+
+    Returns:
+        str: Run ID assigned by Trismik
+
+    Raises:
+        Exception: If upload fails (re-raises underlying exceptions)
+    """
+    # Create Trismik client
+    trismik_client = create_trismik_sync_client()
+
+    # Get model name - use provided model_name, or extract from metadata, or use default
+    if model_name is not None:
+        model = model_name
+    else:
+        model = get_model_name(metadata=metadata)
+
+    # Get dataset name - use provided dataset_name, or extract from metadata, or use default
+    if dataset_name is None:
+        if metadata and "dataset" in metadata:
+            dataset_name = str(metadata["dataset"])
+        else:
+            dataset_name = "Dataset"
+
+    # Extract aggregate and item results
+    aggregate_results = run_result.get("aggregate_results", [])
+    item_results = run_result.get("item_results", [])
+
+    # Use provided hyperparameters or default to empty dict
+    # Note: We don't extract hyperparameters from aggregate_results to avoid
+    # misclassifying metrics as hyperparameters
+    if hyperparameters is None:
+        hyperparameters = {}
+
+    # Create eval items from item_results
+    trismik_items: List[TrismikClassicEvalItem] = []
+    for item in item_results:
+        # Extract inputs, outputs, labels
+        item_id = str(item.get("id", 0))
+        model_input = str(item.get("input", ""))
+        model_output = str(item.get("output", ""))
+        gold_output = str(item.get("label", ""))
+
+        # Extract item-level metrics (exclude known fields and hyperparameters)
+        item_metrics: Dict[str, Any] = {}
+        for key, value in item.items():
+            if key not in KNOWN_ITEM_FIELDS and key not in (hyperparameters or {}):
+                # Normalize metric value for API compatibility
+                item_metrics[key] = normalize_metric_value(value)
+
+        eval_item = TrismikClassicEvalItem(  # pragma: allowlist secret
+            datasetItemId=item_id,
+            modelInput=model_input,
+            modelOutput=model_output,
+            goldOutput=gold_output,
+            metrics=item_metrics,
+        )
+        trismik_items.append(eval_item)
+
+    # Extract aggregate metrics from aggregate_results
+    trismik_metrics: List[TrismikClassicEvalMetric] = []
+    if aggregate_results:
+        for key, value in aggregate_results[0].items():
+            if key not in KNOWN_AGGREGATE_FIELDS and key not in (hyperparameters or {}):
+                # This is a metric  # pragma: allowlist secret
+                metric = TrismikClassicEvalMetric(metricId=key, value=normalize_metric_value(value))
+                trismik_metrics.append(metric)  # pragma: allowlist secret
+
+    # Create classic eval request
+    classic_eval_request = TrismikClassicEvalRequest(
+        project_id,
+        experiment_id,
+        dataset_name,
+        model,
+        hyperparameters,
+        trismik_items,
+        trismik_metrics,
+    )
+
+    # Submit to Trismik  # pragma: allowlist secret
+    response: TrismikClassicEvalResponse = trismik_client.submit_classic_eval(
+        classic_eval_request
+    )  # pragma: allowlist secret
+
+    run_id: str = response.id
+    logger.info(f"Run result uploaded successfully to Trismik with run_id: {run_id}")
+
+    return run_id
+
+
+async def upload_result_async(
+    run_result: Dict[str, List[Dict[str, Any]]],
+    experiment_id: str,
+    project_id: str,
+    dataset_name: Optional[str] = None,
+    hyperparameters: Optional[Dict[str, Any]] = None,
+    metadata: Optional[Dict[str, Any]] = None,
+    model_name: Optional[str] = None,
+) -> str:
+    """Upload evaluation or scoring results to Trismik's platform (asynchronous).
+
+    This function uploads results in the format returned by the evaluate or
+    score functions to the Trismik platform for tracking and analysis.
+
+    Args:
+        run_result: Dict with keys 'aggregate_results' and 'item_results' containing
+            evaluation/scoring results. Structure matches the output of evaluate()/score().
+        experiment_id: Trismik experiment identifier
+        project_id: Trismik project identifier
+        dataset_name: Optional dataset name. If not provided, extracted from metadata
+            or defaults to "Dataset"
+        hyperparameters: Optional dict of hyperparameters. If not provided, extracted
+            from run_result.
+        metadata: Optional metadata dict (can include 'model' and 'dataset' keys)
+        model_name: Optional model name. If not provided, extracted from metadata
+            or defaults to "Model"
+
+    Returns:
+        str: Run ID assigned by Trismik
+
+    Raises:
+        Exception: If upload fails (re-raises underlying exceptions)
+    """
+    # Create Trismik async client
+    trismik_client = create_trismik_async_client()
+
+    # Get model name - use provided model_name, or extract from metadata, or use default
+    if model_name is not None:
+        model = model_name
+    else:
+        model = get_model_name(metadata=metadata)
+
+    # Get dataset name - use provided dataset_name, or extract from metadata, or use default
+    if dataset_name is None:
+        if metadata and "dataset" in metadata:
+            dataset_name = str(metadata["dataset"])
+        else:
+            dataset_name = "Dataset"
+
+    # Extract aggregate and item results
+    aggregate_results = run_result.get("aggregate_results", [])
+    item_results = run_result.get("item_results", [])
+
+    # Use provided hyperparameters or default to empty dict
+    # Note: We don't extract hyperparameters from aggregate_results to avoid
+    # misclassifying metrics as hyperparameters
+    if hyperparameters is None:
+        hyperparameters = {}
+
+    # Create eval items from item_results
+    trismik_items: List[TrismikClassicEvalItem] = []
+    for item in item_results:
+        # Extract inputs, outputs, labels
+        item_id = str(item.get("id", 0))
+        model_input = str(item.get("input", ""))
+        model_output = str(item.get("output", ""))
+        gold_output = str(item.get("label", ""))
+
+        # Extract item-level metrics (exclude known fields and hyperparameters)
+        item_metrics: Dict[str, Any] = {}
+        for key, value in item.items():
+            if key not in KNOWN_ITEM_FIELDS and key not in (hyperparameters or {}):
+                # Normalize metric value for API compatibility
+                item_metrics[key] = normalize_metric_value(value)
+
+        eval_item = TrismikClassicEvalItem(  # pragma: allowlist secret
+            datasetItemId=item_id,
+            modelInput=model_input,
+            modelOutput=model_output,
+            goldOutput=gold_output,
+            metrics=item_metrics,
+        )
+        trismik_items.append(eval_item)
+
+    # Extract aggregate metrics from aggregate_results
+    trismik_metrics: List[TrismikClassicEvalMetric] = []
+    if aggregate_results:
+        for key, value in aggregate_results[0].items():
+            if key not in KNOWN_AGGREGATE_FIELDS and key not in (hyperparameters or {}):
+                # This is a metric  # pragma: allowlist secret
+                metric = TrismikClassicEvalMetric(metricId=key, value=normalize_metric_value(value))
+                trismik_metrics.append(metric)  # pragma: allowlist secret
+
+    # Create classic eval request
+    classic_eval_request = TrismikClassicEvalRequest(
+        project_id,
+        experiment_id,
+        dataset_name,
+        model,
+        hyperparameters,
+        trismik_items,
+        trismik_metrics,
+    )
+
+    # Submit to Trismik (async)  # pragma: allowlist secret
+    response: TrismikClassicEvalResponse = await trismik_client.submit_classic_eval(
+        classic_eval_request
+    )  # pragma: allowlist secret
+
+    run_id: str = response.id
+    logger.info(f"Run result uploaded successfully to Trismik with run_id: {run_id}")
+
+    return run_id
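For orientation, a minimal usage sketch of the new upload helper. It is a hypothetical example, not code from the package: the run_result shape follows the docstring above, the IDs, metric names, and values are placeholders, and the import path is assumed because the new module's filename is not shown in this diff.

# Hypothetical import path -- the new module's filename is not shown in this diff.
from scorebook import upload_result

run_result = {
    "aggregate_results": [
        {"dataset": "demo_dataset", "run_completed": True, "accuracy": 0.5}
    ],
    "item_results": [
        {"id": 0, "input": "2 + 2 = ?", "output": "4", "label": "4", "accuracy": 1.0},
        {"id": 1, "input": "3 + 5 = ?", "output": "7", "label": "8", "accuracy": 0.0},
    ],
}

run_id = upload_result(
    run_result,
    experiment_id="exp-123",  # placeholder Trismik experiment id
    project_id="proj-456",  # placeholder Trismik project id
    hyperparameters={"temperature": 0.0},
    metadata={"model": "demo-model", "dataset": "demo_dataset"},
)
print(run_id)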
scorebook/types.py CHANGED
@@ -1,9 +1,15 @@
 """Type definitions for scorebook evaluation framework."""
 
 from dataclasses import dataclass
-from typing import Any, Dict, List, Optional, Union
+from typing import Any, Dict, List, Optional, Sequence, Type, Union
 
-from scorebook.eval_dataset import EvalDataset
+from scorebook.eval_datasets import EvalDataset
+from scorebook.metrics.metric_base import MetricBase
+
+# Type alias for metrics parameter
+Metrics = Union[
+    str, "MetricBase", Type["MetricBase"], Sequence[Union[str, "MetricBase", Type["MetricBase"]]]
+]
 
 
 @dataclass
@@ -21,7 +27,7 @@ class EvalRunSpec:
     dataset_index: int
     hyperparameter_config: Dict[str, Any]
     hyperparameters_index: int
-    items: List[Dict[str, Any]]
+    inputs: List[Any]
     labels: List[Any]
 
     def __str__(self) -> str:
@@ -54,77 +60,44 @@ class ClassicEvalRunResult:
     run_spec: EvalRunSpec
     run_completed: bool
    outputs: Optional[List[Any]]
-    scores: Optional[Dict[str, Any]]
+    scores: Optional[Dict[str, List[Dict[str, Any]]]]  # score_async format
     run_id: Optional[str] = None
 
    @property
     def item_scores(self) -> List[Dict[str, Any]]:
         """Return a list of dictionaries containing scores for each evaluated item."""
-        results = []
-
-        if self.outputs:
-            for idx, output in enumerate(self.outputs):
-                if idx >= len(self.run_spec.items):
-                    break
-
-                result = {
-                    "item_id": idx,
-                    "dataset_name": self.run_spec.dataset.name,
-                    "inference_output": output,
-                    **self.run_spec.hyperparameter_config,
-                }
-
-                # Add run_id if available
-                if self.run_id is not None:
-                    result["run_id"] = self.run_id
-
-                # Add individual item scores if available
-                if self.scores is not None:
-                    for metric_name, metric_data in self.scores.items():
-                        if isinstance(metric_data, dict) and "item_scores" in metric_data:
                            if idx < len(metric_data["item_scores"]):
-                                result[metric_name] = metric_data["item_scores"][idx]
-                        else:
-                            # If scores is just a single value, replicate it for each item
-                            result[metric_name] = metric_data
-
-                results.append(result)
-
-        return results
+        if self.scores and "item_results" in self.scores:
+            # score_async already built this in the exact format we need
+            return self.scores["item_results"]
+        return []
 
     @property
     def aggregate_scores(self) -> Dict[str, Any]:
         """Return the aggregated scores for this run."""
-        result = {
+        if (
+            self.scores
+            and "aggregate_results" in self.scores
+            and len(self.scores["aggregate_results"]) > 0
+        ):
+            result = self.scores["aggregate_results"][0].copy()
+            # Add run_completed (not included in score_async format)
+            result["run_completed"] = self.run_completed
+            return result
+
+        # Fallback if no scores available
+        return {
             "dataset": self.run_spec.dataset.name,
             "run_completed": self.run_completed,
            **self.run_spec.hyperparameter_config,
         }
 
-        # Add run_id if available
-        if self.run_id is not None:
-            result["run_id"] = self.run_id
-
-        # Add aggregate scores from metrics
-        if self.scores is not None:
-            for metric_name, metric_data in self.scores.items():
-                if isinstance(metric_data, dict) and "aggregate_scores" in metric_data:
-                    # Flatten the aggregate scores from each metric
-                    for key, value in metric_data["aggregate_scores"].items():
-                        score_key = key if key == metric_name else f"{metric_name}_{key}"
-                        result[score_key] = value
-                else:
-                    # If scores is just a single value, use it as is
-                    result[metric_name] = metric_data
-
-        return result
-
 
 @dataclass
 class AdaptiveEvalRunResult:
     """Results from executing an adaptive evaluation run."""
 
     run_spec: AdaptiveEvalRunSpec
+    run_completed: bool
     scores: Dict[str, Any]
 
     @property
@@ -177,3 +150,12 @@ class EvalResult:
             results.append(run_result.aggregate_scores)
 
         return results
+
+
+@dataclass
+class MetricScore:
+    """Container for metric scores across multiple runs."""
+
+    metric_name: str
+    aggregate_scores: Dict[str, Any]
+    item_scores: List[Dict[str, Any]]
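The rewritten item_scores and aggregate_scores properties no longer assemble result rows themselves; they read the dict produced by the scoring path. An illustrative sketch of that shape follows; only the two top-level keys come from the diff, the field names and values inside are made up.

scores = {
    "aggregate_results": [
        {"dataset": "demo_dataset", "run_id": "run-1", "accuracy": 0.5, "temperature": 0.0}
    ],
    "item_results": [
        {"id": 0, "dataset_name": "demo_dataset", "output": "4", "label": "4", "accuracy": 1.0},
        {"id": 1, "dataset_name": "demo_dataset", "output": "7", "label": "8", "accuracy": 0.0},
    ],
}
# item_scores returns scores["item_results"] as-is;
# aggregate_scores returns scores["aggregate_results"][0] plus a "run_completed" key.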
@@ -1,16 +1,23 @@
 """Utility functions and common helpers for the Scorebook framework."""
 
+from contextlib import nullcontext
+
 from scorebook.utils.async_utils import async_nullcontext, is_awaitable
-from scorebook.utils.build_prompt import build_prompt
+from scorebook.utils.common_helpers import resolve_show_progress, resolve_upload_results
 from scorebook.utils.io_helpers import validate_path
-from scorebook.utils.progress_bars import evaluation_progress
+from scorebook.utils.progress_bars import evaluation_progress_context, scoring_progress_context
+from scorebook.utils.render_template import render_template
 from scorebook.utils.transform_helpers import expand_dict
 
 __all__ = [
     "async_nullcontext",
+    "nullcontext",
     "is_awaitable",
+    "resolve_show_progress",
+    "resolve_upload_results",
     "validate_path",
     "expand_dict",
-    "evaluation_progress",
-    "build_prompt",
+    "evaluation_progress_context",
+    "scoring_progress_context",
+    "render_template",
 ]
@@ -0,0 +1,41 @@
+"""Common helper functions shared across scorebook modules."""
+
+import logging
+from typing import Literal, Optional, Union
+
+logger = logging.getLogger(__name__)
+
+
+def resolve_upload_results(upload_results: Union[Literal["auto"], bool]) -> bool:
+    """Resolve the upload_results parameter based on trismik login status.
+
+    Args:
+        upload_results: Can be True, False, or "auto". When "auto", resolves to True
+            if user is logged in to Trismik, False otherwise.
+
+    Returns:
+        bool: Whether to upload results to Trismik
+    """
+    if upload_results == "auto":
+        from scorebook.trismik.credentials import get_token
+
+        upload_results = get_token() is not None
+        logger.debug("Auto upload results resolved to: %s", upload_results)
+
+    return upload_results
+
+
+def resolve_show_progress(show_progress: Optional[bool]) -> bool:
+    """Resolve whether to show progress bars.
+
+    Args:
+        show_progress: Explicit setting (None uses default from settings)
+
+    Returns:
+        bool: Whether to show progress bars
+    """
+    if show_progress is None:
+        from scorebook.settings import SHOW_PROGRESS_BARS
+
+        return bool(SHOW_PROGRESS_BARS)
+    return show_progress
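Both helpers are re-exported from scorebook.utils (see the __init__ diff above). A short usage sketch, assuming a default scorebook configuration:

from scorebook.utils import resolve_show_progress, resolve_upload_results

# "auto" resolves to True only when a Trismik token is available; booleans pass through.
should_upload = resolve_upload_results("auto")

# None falls back to scorebook.settings.SHOW_PROGRESS_BARS; booleans pass through.
show_progress = resolve_show_progress(None)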
@@ -1,15 +1,18 @@
 """Input/output helper functions for Scorebook."""
 
 from pathlib import Path
-from typing import Optional
+from typing import Optional, Tuple, Union
 
 
-def validate_path(file_path: str, expected_suffix: Optional[str] = None) -> Path:
+def validate_path(
+    file_path: Union[str, Path], expected_suffix: Optional[Union[str, Tuple[str, ...]]] = None
+) -> Path:
     """Validate that a file path exists and optionally check its suffix.
 
     Args:
         file_path: Path to the file as string or Path object
-        expected_suffix: Optional file extension to validate (e.g. ".json", ".csv")
+        expected_suffix: Optional file extension(s) to validate.
+            Can be a single string (e.g. ".json") or tuple of strings (e.g. (".yaml", ".yml"))
 
     Returns:
         Path object for the validated file path
@@ -22,7 +25,17 @@ def validate_path(file_path: str, expected_suffix: Optional[str] = None) -> Path
     if not path.exists():
         raise FileNotFoundError(f"File not found: {file_path}")
 
-    if expected_suffix and path.suffix.lower() != expected_suffix.lower():
-        raise ValueError(f"File must have {expected_suffix} extension, got: {path.suffix}")
+    if expected_suffix:
+        # Convert single suffix to tuple for uniform handling
+        allowed_suffixes = (
+            (expected_suffix,) if isinstance(expected_suffix, str) else expected_suffix
+        )
+        allowed_suffixes_lower = tuple(s.lower() for s in allowed_suffixes)
+
+        if path.suffix.lower() not in allowed_suffixes_lower:
+            suffix_list = ", ".join(f"'{s}'" for s in allowed_suffixes)
+            raise ValueError(
+                f"File must have one of ({suffix_list}) extensions, got: '{path.suffix}'"
+            )
 
     return path
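validate_path now accepts either a single suffix or a tuple of allowed suffixes. A hedged sketch, assuming the referenced files exist in the working directory:

from pathlib import Path

from scorebook.utils import validate_path

# Single suffix, as before (assumes results.json exists).
results_path: Path = validate_path("results.json", expected_suffix=".json")

# Tuple of suffixes, new in this version (assumes config.yml exists).
config_path: Path = validate_path("config.yml", expected_suffix=(".yaml", ".yml"))

# A mismatch now reports all accepted extensions, e.g.
# ValueError: File must have one of ('.yaml', '.yml') extensions, got: '.csv'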