scorebook-0.0.14-py3-none-any.whl → scorebook-0.0.15-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (77)
  1. scorebook/__init__.py +2 -0
  2. scorebook/dashboard/credentials.py +34 -4
  3. scorebook/eval_datasets/eval_dataset.py +2 -2
  4. scorebook/evaluate/_async/evaluate_async.py +27 -11
  5. scorebook/evaluate/_sync/evaluate.py +27 -11
  6. scorebook/metrics/README.md +121 -0
  7. scorebook/metrics/__init__.py +8 -0
  8. scorebook/metrics/accuracy.py +2 -6
  9. scorebook/metrics/bertscore.py +50 -0
  10. scorebook/metrics/bleu.py +82 -0
  11. scorebook/metrics/core/__init__.py +1 -0
  12. scorebook/metrics/{metric_base.py → core/metric_base.py} +1 -2
  13. scorebook/metrics/core/metric_registry.py +195 -0
  14. scorebook/metrics/exactmatch.py +95 -0
  15. scorebook/metrics/f1.py +96 -0
  16. scorebook/metrics/precision.py +84 -9
  17. scorebook/metrics/recall.py +94 -0
  18. scorebook/metrics/rouge.py +85 -0
  19. scorebook/score/score_helpers.py +28 -11
  20. scorebook/types.py +2 -2
  21. scorebook/utils/progress_bars.py +58 -786
  22. {scorebook-0.0.14.dist-info → scorebook-0.0.15.dist-info}/METADATA +32 -24
  23. scorebook-0.0.15.dist-info/RECORD +110 -0
  24. {scorebook-0.0.14.dist-info → scorebook-0.0.15.dist-info}/WHEEL +1 -1
  25. tutorials/README.md +147 -0
  26. tutorials/__init__.py +5 -0
  27. tutorials/examples/1-score/1-scoring_model_accuracy.py +47 -0
  28. tutorials/examples/1-score/2-scoring_model_bleu.py +46 -0
  29. tutorials/examples/1-score/3-scoring_model_f1.py +64 -0
  30. tutorials/examples/1-score/4-scoring_model_rouge.py +64 -0
  31. tutorials/examples/1-score/5-scoring_model_exact_match.py +84 -0
  32. tutorials/examples/1-score/6-scoring_with_bertscore.py +57 -0
  33. tutorials/examples/1-score/__init__.py +0 -0
  34. tutorials/examples/2-evaluate/1-evaluating_local_models.py +106 -0
  35. tutorials/examples/2-evaluate/2-evaluating_local_models_with_batching.py +108 -0
  36. tutorials/examples/2-evaluate/3-evaluating_cloud_models.py +109 -0
  37. tutorials/examples/2-evaluate/4-evaluating_cloud_models_with_batching.py +170 -0
  38. tutorials/examples/2-evaluate/5-hyperparameter_sweeps.py +122 -0
  39. tutorials/examples/2-evaluate/6-inference_pipelines.py +141 -0
  40. tutorials/examples/3-evaluation_datasets/1-evaluation_datasets_from_files.py +110 -0
  41. tutorials/examples/3-evaluation_datasets/2-evaluation_datasets_from_huggingface.py +101 -0
  42. tutorials/examples/3-evaluation_datasets/3-evaluation_datasets_from_huggingface_with_yaml_configs.py +110 -0
  43. tutorials/examples/3-evaluation_datasets/example_datasets/basic_questions.csv +11 -0
  44. tutorials/examples/3-evaluation_datasets/example_datasets/basic_questions.json +42 -0
  45. tutorials/examples/3-evaluation_datasets/example_yaml_configs/Cais-MMLU.yaml +19 -0
  46. tutorials/examples/3-evaluation_datasets/example_yaml_configs/TIGER-Lab-MMLU-Pro.yaml +18 -0
  47. tutorials/examples/4-adaptive_evaluations/1-adaptive_evaluation.py +114 -0
  48. tutorials/examples/4-adaptive_evaluations/2-adaptive_dataset_splits.py +106 -0
  49. tutorials/examples/5-upload_results/1-uploading_score_results.py +92 -0
  50. tutorials/examples/5-upload_results/2-uploading_evaluate_results.py +117 -0
  51. tutorials/examples/5-upload_results/3-uploading_your_results.py +153 -0
  52. tutorials/examples/6-providers/aws/__init__.py +1 -0
  53. tutorials/examples/6-providers/aws/batch_example.py +219 -0
  54. tutorials/examples/6-providers/portkey/__init__.py +1 -0
  55. tutorials/examples/6-providers/portkey/batch_example.py +120 -0
  56. tutorials/examples/6-providers/portkey/messages_example.py +121 -0
  57. tutorials/examples/6-providers/vertex/__init__.py +1 -0
  58. tutorials/examples/6-providers/vertex/batch_example.py +166 -0
  59. tutorials/examples/6-providers/vertex/messages_example.py +142 -0
  60. tutorials/examples/__init__.py +0 -0
  61. tutorials/notebooks/1-scoring.ipynb +162 -0
  62. tutorials/notebooks/2-evaluating.ipynb +316 -0
  63. tutorials/notebooks/3.1-adaptive_evaluation_phi.ipynb +354 -0
  64. tutorials/notebooks/3.2-adaptive_evaluation_gpt.ipynb +243 -0
  65. tutorials/notebooks/4-uploading_results.ipynb +175 -0
  66. tutorials/quickstarts/adaptive_evaluations/adaptive_evaluation_openai_demo.ipynb +229 -0
  67. tutorials/quickstarts/adaptive_evaluations/adaptive_evaluation_qwen_demo.ipynb +256 -0
  68. tutorials/quickstarts/classical_evaluations/classical_evaluation_demo.ipynb +277 -0
  69. tutorials/quickstarts/getting_started.ipynb +197 -0
  70. tutorials/utils/__init__.py +35 -0
  71. tutorials/utils/args_parser.py +132 -0
  72. tutorials/utils/output.py +23 -0
  73. tutorials/utils/setup.py +98 -0
  74. scorebook/metrics/metric_registry.py +0 -107
  75. scorebook-0.0.14.dist-info/RECORD +0 -53
  76. {scorebook-0.0.14.dist-info → scorebook-0.0.15.dist-info}/entry_points.txt +0 -0
  77. {scorebook-0.0.14.dist-info → scorebook-0.0.15.dist-info}/licenses/LICENSE +0 -0
scorebook/score/score_helpers.py CHANGED
@@ -4,8 +4,8 @@ import logging
 from typing import Any, Dict, List, Mapping, Optional, Type, Union
 
 from scorebook.exceptions import DataMismatchError, ParameterValidationError
-from scorebook.metrics.metric_base import MetricBase
-from scorebook.metrics.metric_registry import MetricRegistry
+from scorebook.metrics.core.metric_base import MetricBase
+from scorebook.metrics.core.metric_registry import MetricRegistry
 from scorebook.types import MetricScore
 from scorebook.utils.async_utils import is_awaitable
 
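For downstream code that imported these modules directly, the move under scorebook.metrics.core means updating import paths. A minimal sketch of old versus new paths, based only on this diff (whether the old paths remain re-exported anywhere is not shown here):

    # Imports under 0.0.14 (old layout) -- removed in this release:
    # from scorebook.metrics.metric_base import MetricBase
    # from scorebook.metrics.metric_registry import MetricRegistry

    # Imports under 0.0.15 (new core layout):
    from scorebook.metrics.core.metric_base import MetricBase
    from scorebook.metrics.core.metric_registry import MetricRegistry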
@@ -89,7 +89,7 @@ async def calculate_metric_scores_async(
     for metric in metrics:
 
         if progress_bar is not None:
-            progress_bar.set_current_metric(metric.name)
+            progress_bar.set_postfix(metric=metric.name)
 
         if is_awaitable(metric.score):
             aggregate_scores, item_scores = await metric.score(outputs, labels)
@@ -134,7 +134,7 @@ def calculate_metric_scores(
     for metric in metrics:
 
         if progress_bar is not None:
-            progress_bar.set_current_metric(metric.name)
+            progress_bar.set_postfix(metric=metric.name)
 
         if is_awaitable(metric.score):
             raise ParameterValidationError(
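Both the async and sync scoring paths now call set_postfix, the standard tqdm method, instead of the bespoke set_current_metric helper (the custom progress-bar code also shrinks sharply, see scorebook/utils/progress_bars.py +58 -786). A hedged sketch of the resulting behaviour, assuming progress_bar is a tqdm-compatible object; the metric names here are placeholders:

    from tqdm import tqdm

    metric_names = ["accuracy", "f1"]  # placeholder metric names
    with tqdm(total=len(metric_names), desc="Scoring") as progress_bar:
        for name in metric_names:
            # Appends e.g. "metric=accuracy" to the right of the bar
            progress_bar.set_postfix(metric=name)
            # ... metric.score(outputs, labels) would run here ...
            progress_bar.update(1)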
@@ -164,18 +164,27 @@ def format_results(
     hyperparameters = hyperparameters or {}
     dataset_name = dataset_name or "scored_items"
 
+    # Detect key collisions across all metrics (for both aggregate and item scores)
+    all_keys: Dict[str, set] = {}
+    for metric_score in metric_scores:
+        for key in metric_score.aggregate_scores.keys():
+            all_keys.setdefault(key, set()).add(metric_score.metric_name)
+        # Also check item_scores keys if they are dicts
+        if metric_score.item_scores and isinstance(metric_score.item_scores[0], dict):
+            for key in metric_score.item_scores[0].keys():
+                all_keys.setdefault(key, set()).add(metric_score.metric_name)
+    colliding_keys = {k for k, metrics in all_keys.items() if len(metrics) > 1}
+
     # Build aggregate results
-    aggregate_result = {
+    aggregate_result: Dict[str, Any] = {
         "dataset": dataset_name,
         **hyperparameters,
     }
 
-    # Add aggregate scores from metrics
+    # Add aggregate scores from metrics (flat, with suffix on collision)
     for metric_score in metric_scores:
         for key, value in metric_score.aggregate_scores.items():
-            score_key = (
-                key if key == metric_score.metric_name else f"{metric_score.metric_name}_{key}"
-            )
+            score_key = f"{key}_{metric_score.metric_name}" if key in colliding_keys else key
             aggregate_result[score_key] = value
 
     # Build item results
@@ -193,10 +202,18 @@ def format_results(
         if inputs is not None and inputs[idx] is not None:
             item_result["input"] = inputs[idx]
 
-        # Add item-level metric scores
+        # Add item-level metric scores (flat, with suffix on collision)
        for metric_score in metric_scores:
             if idx < len(metric_score.item_scores):
-                item_result[metric_score.metric_name] = metric_score.item_scores[idx]
+                item_scores = metric_score.item_scores[idx]
+                if isinstance(item_scores, dict):
+                    for key, value in item_scores.items():
+                        score_key = (
+                            f"{key}_{metric_score.metric_name}" if key in colliding_keys else key
+                        )
+                        item_result[score_key] = value
+                else:
+                    item_result[metric_score.metric_name] = item_scores
         item_results.append(item_result)
 
 
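The practical effect of the new flat naming: a score key reported by only one metric keeps its name, while a key reported by several metrics gets "_<metric_name>" appended. A standalone sketch of the same collision rule, using made-up metric names and values (not library code):

    # Hypothetical aggregate scores from two metrics that both report "precision".
    aggregate_scores = {
        "bleu": {"bleu": 0.41, "precision": 0.52},
        "rouge": {"rouge_l": 0.38, "precision": 0.47},
    }

    # Same collision detection as above: which keys appear under more than one metric?
    all_keys: dict = {}
    for metric_name, scores in aggregate_scores.items():
        for key in scores:
            all_keys.setdefault(key, set()).add(metric_name)
    colliding = {k for k, names in all_keys.items() if len(names) > 1}  # {"precision"}

    # Flatten, suffixing only the colliding keys with the metric name.
    flat = {}
    for metric_name, scores in aggregate_scores.items():
        for key, value in scores.items():
            flat[f"{key}_{metric_name}" if key in colliding else key] = value

    # flat == {"bleu": 0.41, "precision_bleu": 0.52,
    #          "rouge_l": 0.38, "precision_rouge": 0.47}

Under 0.0.14 the rule ran the other way: every key was prefixed with the metric name unless it equalled it, so the "precision" score from the hypothetical BLEU metric above would have appeared as "bleu_precision".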
scorebook/types.py CHANGED
@@ -4,11 +4,11 @@ from dataclasses import dataclass
 from typing import Any, Dict, List, Optional, Sequence, Type, Union
 
 from scorebook.eval_datasets.eval_dataset import EvalDataset
-from scorebook.metrics.metric_base import MetricBase
+from scorebook.metrics.core.metric_base import MetricBase
 
 # Type alias for metrics parameter
 Metrics = Union[
-    str, "MetricBase", Type["MetricBase"], Sequence[Union[str, "MetricBase", Type["MetricBase"]]]
+    str, MetricBase, Type[MetricBase], Sequence[Union[str, MetricBase, Type[MetricBase]]]
 ]
 
 