scorebook-0.0.14-py3-none-any.whl → scorebook-0.0.15-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (77)
  1. scorebook/__init__.py +2 -0
  2. scorebook/dashboard/credentials.py +34 -4
  3. scorebook/eval_datasets/eval_dataset.py +2 -2
  4. scorebook/evaluate/_async/evaluate_async.py +27 -11
  5. scorebook/evaluate/_sync/evaluate.py +27 -11
  6. scorebook/metrics/README.md +121 -0
  7. scorebook/metrics/__init__.py +8 -0
  8. scorebook/metrics/accuracy.py +2 -6
  9. scorebook/metrics/bertscore.py +50 -0
  10. scorebook/metrics/bleu.py +82 -0
  11. scorebook/metrics/core/__init__.py +1 -0
  12. scorebook/metrics/{metric_base.py → core/metric_base.py} +1 -2
  13. scorebook/metrics/core/metric_registry.py +195 -0
  14. scorebook/metrics/exactmatch.py +95 -0
  15. scorebook/metrics/f1.py +96 -0
  16. scorebook/metrics/precision.py +84 -9
  17. scorebook/metrics/recall.py +94 -0
  18. scorebook/metrics/rouge.py +85 -0
  19. scorebook/score/score_helpers.py +28 -11
  20. scorebook/types.py +2 -2
  21. scorebook/utils/progress_bars.py +58 -786
  22. {scorebook-0.0.14.dist-info → scorebook-0.0.15.dist-info}/METADATA +32 -24
  23. scorebook-0.0.15.dist-info/RECORD +110 -0
  24. {scorebook-0.0.14.dist-info → scorebook-0.0.15.dist-info}/WHEEL +1 -1
  25. tutorials/README.md +147 -0
  26. tutorials/__init__.py +5 -0
  27. tutorials/examples/1-score/1-scoring_model_accuracy.py +47 -0
  28. tutorials/examples/1-score/2-scoring_model_bleu.py +46 -0
  29. tutorials/examples/1-score/3-scoring_model_f1.py +64 -0
  30. tutorials/examples/1-score/4-scoring_model_rouge.py +64 -0
  31. tutorials/examples/1-score/5-scoring_model_exact_match.py +84 -0
  32. tutorials/examples/1-score/6-scoring_with_bertscore.py +57 -0
  33. tutorials/examples/1-score/__init__.py +0 -0
  34. tutorials/examples/2-evaluate/1-evaluating_local_models.py +106 -0
  35. tutorials/examples/2-evaluate/2-evaluating_local_models_with_batching.py +108 -0
  36. tutorials/examples/2-evaluate/3-evaluating_cloud_models.py +109 -0
  37. tutorials/examples/2-evaluate/4-evaluating_cloud_models_with_batching.py +170 -0
  38. tutorials/examples/2-evaluate/5-hyperparameter_sweeps.py +122 -0
  39. tutorials/examples/2-evaluate/6-inference_pipelines.py +141 -0
  40. tutorials/examples/3-evaluation_datasets/1-evaluation_datasets_from_files.py +110 -0
  41. tutorials/examples/3-evaluation_datasets/2-evaluation_datasets_from_huggingface.py +101 -0
  42. tutorials/examples/3-evaluation_datasets/3-evaluation_datasets_from_huggingface_with_yaml_configs.py +110 -0
  43. tutorials/examples/3-evaluation_datasets/example_datasets/basic_questions.csv +11 -0
  44. tutorials/examples/3-evaluation_datasets/example_datasets/basic_questions.json +42 -0
  45. tutorials/examples/3-evaluation_datasets/example_yaml_configs/Cais-MMLU.yaml +19 -0
  46. tutorials/examples/3-evaluation_datasets/example_yaml_configs/TIGER-Lab-MMLU-Pro.yaml +18 -0
  47. tutorials/examples/4-adaptive_evaluations/1-adaptive_evaluation.py +114 -0
  48. tutorials/examples/4-adaptive_evaluations/2-adaptive_dataset_splits.py +106 -0
  49. tutorials/examples/5-upload_results/1-uploading_score_results.py +92 -0
  50. tutorials/examples/5-upload_results/2-uploading_evaluate_results.py +117 -0
  51. tutorials/examples/5-upload_results/3-uploading_your_results.py +153 -0
  52. tutorials/examples/6-providers/aws/__init__.py +1 -0
  53. tutorials/examples/6-providers/aws/batch_example.py +219 -0
  54. tutorials/examples/6-providers/portkey/__init__.py +1 -0
  55. tutorials/examples/6-providers/portkey/batch_example.py +120 -0
  56. tutorials/examples/6-providers/portkey/messages_example.py +121 -0
  57. tutorials/examples/6-providers/vertex/__init__.py +1 -0
  58. tutorials/examples/6-providers/vertex/batch_example.py +166 -0
  59. tutorials/examples/6-providers/vertex/messages_example.py +142 -0
  60. tutorials/examples/__init__.py +0 -0
  61. tutorials/notebooks/1-scoring.ipynb +162 -0
  62. tutorials/notebooks/2-evaluating.ipynb +316 -0
  63. tutorials/notebooks/3.1-adaptive_evaluation_phi.ipynb +354 -0
  64. tutorials/notebooks/3.2-adaptive_evaluation_gpt.ipynb +243 -0
  65. tutorials/notebooks/4-uploading_results.ipynb +175 -0
  66. tutorials/quickstarts/adaptive_evaluations/adaptive_evaluation_openai_demo.ipynb +229 -0
  67. tutorials/quickstarts/adaptive_evaluations/adaptive_evaluation_qwen_demo.ipynb +256 -0
  68. tutorials/quickstarts/classical_evaluations/classical_evaluation_demo.ipynb +277 -0
  69. tutorials/quickstarts/getting_started.ipynb +197 -0
  70. tutorials/utils/__init__.py +35 -0
  71. tutorials/utils/args_parser.py +132 -0
  72. tutorials/utils/output.py +23 -0
  73. tutorials/utils/setup.py +98 -0
  74. scorebook/metrics/metric_registry.py +0 -107
  75. scorebook-0.0.14.dist-info/RECORD +0 -53
  76. {scorebook-0.0.14.dist-info → scorebook-0.0.15.dist-info}/entry_points.txt +0 -0
  77. {scorebook-0.0.14.dist-info → scorebook-0.0.15.dist-info}/licenses/LICENSE +0 -0
scorebook/score/score_helpers.py CHANGED
@@ -4,8 +4,8 @@ import logging
 from typing import Any, Dict, List, Mapping, Optional, Type, Union
 
 from scorebook.exceptions import DataMismatchError, ParameterValidationError
-from scorebook.metrics.metric_base import MetricBase
-from scorebook.metrics.metric_registry import MetricRegistry
+from scorebook.metrics.core.metric_base import MetricBase
+from scorebook.metrics.core.metric_registry import MetricRegistry
 from scorebook.types import MetricScore
 from scorebook.utils.async_utils import is_awaitable
 
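For downstream code that imported these modules directly, the move under scorebook.metrics.core means updating import paths. A minimal sketch of old versus new paths, based only on this diff (whether the old paths remain re-exported anywhere is not shown here):

    # Imports under 0.0.14 (old layout) -- removed in this release:
    # from scorebook.metrics.metric_base import MetricBase
    # from scorebook.metrics.metric_registry import MetricRegistry

    # Imports under 0.0.15 (new core layout):
    from scorebook.metrics.core.metric_base import MetricBase
    from scorebook.metrics.core.metric_registry import MetricRegistry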
@@ -89,7 +89,7 @@ async def calculate_metric_scores_async(
     for metric in metrics:
 
         if progress_bar is not None:
-            progress_bar.set_current_metric(metric.name)
+            progress_bar.set_postfix(metric=metric.name)
 
         if is_awaitable(metric.score):
             aggregate_scores, item_scores = await metric.score(outputs, labels)
@@ -134,7 +134,7 @@ def calculate_metric_scores(
     for metric in metrics:
 
         if progress_bar is not None:
-            progress_bar.set_current_metric(metric.name)
+            progress_bar.set_postfix(metric=metric.name)
 
         if is_awaitable(metric.score):
             raise ParameterValidationError(
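Both the async and sync scoring paths now call set_postfix, the standard tqdm method, instead of the bespoke set_current_metric helper (the custom progress-bar code also shrinks sharply, see scorebook/utils/progress_bars.py +58 -786). A hedged sketch of the resulting behaviour, assuming progress_bar is a tqdm-compatible object; the metric names here are placeholders:

    from tqdm import tqdm

    metric_names = ["accuracy", "f1"]  # placeholder metric names
    with tqdm(total=len(metric_names), desc="Scoring") as progress_bar:
        for name in metric_names:
            # Appends e.g. "metric=accuracy" to the right of the bar
            progress_bar.set_postfix(metric=name)
            # ... metric.score(outputs, labels) would run here ...
            progress_bar.update(1)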
@@ -164,18 +164,27 @@ def format_results(
     hyperparameters = hyperparameters or {}
     dataset_name = dataset_name or "scored_items"
 
+    # Detect key collisions across all metrics (for both aggregate and item scores)
+    all_keys: Dict[str, set] = {}
+    for metric_score in metric_scores:
+        for key in metric_score.aggregate_scores.keys():
+            all_keys.setdefault(key, set()).add(metric_score.metric_name)
+        # Also check item_scores keys if they are dicts
+        if metric_score.item_scores and isinstance(metric_score.item_scores[0], dict):
+            for key in metric_score.item_scores[0].keys():
+                all_keys.setdefault(key, set()).add(metric_score.metric_name)
+    colliding_keys = {k for k, metrics in all_keys.items() if len(metrics) > 1}
+
     # Build aggregate results
-    aggregate_result = {
+    aggregate_result: Dict[str, Any] = {
         "dataset": dataset_name,
         **hyperparameters,
     }
 
-    # Add aggregate scores from metrics
+    # Add aggregate scores from metrics (flat, with suffix on collision)
     for metric_score in metric_scores:
         for key, value in metric_score.aggregate_scores.items():
-            score_key = (
-                key if key == metric_score.metric_name else f"{metric_score.metric_name}_{key}"
-            )
+            score_key = f"{key}_{metric_score.metric_name}" if key in colliding_keys else key
             aggregate_result[score_key] = value
 
     # Build item results
@@ -193,10 +202,18 @@ def format_results(
         if inputs is not None and inputs[idx] is not None:
             item_result["input"] = inputs[idx]
 
-        # Add item-level metric scores
+        # Add item-level metric scores (flat, with suffix on collision)
        for metric_score in metric_scores:
             if idx < len(metric_score.item_scores):
-                item_result[metric_score.metric_name] = metric_score.item_scores[idx]
+                item_scores = metric_score.item_scores[idx]
+                if isinstance(item_scores, dict):
+                    for key, value in item_scores.items():
+                        score_key = (
+                            f"{key}_{metric_score.metric_name}" if key in colliding_keys else key
+                        )
+                        item_result[score_key] = value
+                else:
+                    item_result[metric_score.metric_name] = item_scores
         item_results.append(item_result)
 
 
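The practical effect of the new flat naming: a score key reported by only one metric keeps its name, while a key reported by several metrics gets "_<metric_name>" appended. A standalone sketch of the same collision rule, using made-up metric names and values (not library code):

    # Hypothetical aggregate scores from two metrics that both report "precision".
    aggregate_scores = {
        "bleu": {"bleu": 0.41, "precision": 0.52},
        "rouge": {"rouge_l": 0.38, "precision": 0.47},
    }

    # Same collision detection as above: which keys appear under more than one metric?
    all_keys: dict = {}
    for metric_name, scores in aggregate_scores.items():
        for key in scores:
            all_keys.setdefault(key, set()).add(metric_name)
    colliding = {k for k, names in all_keys.items() if len(names) > 1}  # {"precision"}

    # Flatten, suffixing only the colliding keys with the metric name.
    flat = {}
    for metric_name, scores in aggregate_scores.items():
        for key, value in scores.items():
            flat[f"{key}_{metric_name}" if key in colliding else key] = value

    # flat == {"bleu": 0.41, "precision_bleu": 0.52,
    #          "rouge_l": 0.38, "precision_rouge": 0.47}

Under 0.0.14 the rule ran the other way: every key was prefixed with the metric name unless it equalled it, so the "precision" score from the hypothetical BLEU metric above would have appeared as "bleu_precision".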
scorebook/types.py CHANGED
@@ -4,11 +4,11 @@ from dataclasses import dataclass
 from typing import Any, Dict, List, Optional, Sequence, Type, Union
 
 from scorebook.eval_datasets.eval_dataset import EvalDataset
-from scorebook.metrics.metric_base import MetricBase
+from scorebook.metrics.core.metric_base import MetricBase
 
 # Type alias for metrics parameter
 Metrics = Union[
-    str, "MetricBase", Type["MetricBase"], Sequence[Union[str, "MetricBase", Type["MetricBase"]]]
+    str, MetricBase, Type[MetricBase], Sequence[Union[str, MetricBase, Type[MetricBase]]]
 ]
 
 