scorebook 0.0.14__py3-none-any.whl → 0.0.16__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (77)
  1. scorebook/__init__.py +2 -0
  2. scorebook/dashboard/credentials.py +34 -4
  3. scorebook/eval_datasets/eval_dataset.py +2 -2
  4. scorebook/evaluate/_async/evaluate_async.py +27 -11
  5. scorebook/evaluate/_sync/evaluate.py +27 -11
  6. scorebook/metrics/README.md +121 -0
  7. scorebook/metrics/__init__.py +8 -0
  8. scorebook/metrics/accuracy.py +2 -6
  9. scorebook/metrics/bertscore.py +50 -0
  10. scorebook/metrics/bleu.py +82 -0
  11. scorebook/metrics/core/__init__.py +1 -0
  12. scorebook/metrics/{metric_base.py → core/metric_base.py} +1 -2
  13. scorebook/metrics/core/metric_registry.py +195 -0
  14. scorebook/metrics/exactmatch.py +95 -0
  15. scorebook/metrics/f1.py +96 -0
  16. scorebook/metrics/precision.py +84 -9
  17. scorebook/metrics/recall.py +94 -0
  18. scorebook/metrics/rouge.py +85 -0
  19. scorebook/score/score_helpers.py +28 -11
  20. scorebook/types.py +2 -2
  21. scorebook/utils/progress_bars.py +58 -786
  22. {scorebook-0.0.14.dist-info → scorebook-0.0.16.dist-info}/METADATA +32 -24
  23. scorebook-0.0.16.dist-info/RECORD +110 -0
  24. {scorebook-0.0.14.dist-info → scorebook-0.0.16.dist-info}/WHEEL +1 -1
  25. tutorials/README.md +147 -0
  26. tutorials/__init__.py +5 -0
  27. tutorials/examples/1-score/1-scoring_model_accuracy.py +47 -0
  28. tutorials/examples/1-score/2-scoring_model_bleu.py +46 -0
  29. tutorials/examples/1-score/3-scoring_model_f1.py +64 -0
  30. tutorials/examples/1-score/4-scoring_model_rouge.py +64 -0
  31. tutorials/examples/1-score/5-scoring_model_exact_match.py +84 -0
  32. tutorials/examples/1-score/6-scoring_with_bertscore.py +57 -0
  33. tutorials/examples/1-score/__init__.py +0 -0
  34. tutorials/examples/2-evaluate/1-evaluating_local_models.py +106 -0
  35. tutorials/examples/2-evaluate/2-evaluating_local_models_with_batching.py +108 -0
  36. tutorials/examples/2-evaluate/3-evaluating_cloud_models.py +109 -0
  37. tutorials/examples/2-evaluate/4-evaluating_cloud_models_with_batching.py +170 -0
  38. tutorials/examples/2-evaluate/5-hyperparameter_sweeps.py +122 -0
  39. tutorials/examples/2-evaluate/6-inference_pipelines.py +141 -0
  40. tutorials/examples/3-evaluation_datasets/1-evaluation_datasets_from_files.py +110 -0
  41. tutorials/examples/3-evaluation_datasets/2-evaluation_datasets_from_huggingface.py +101 -0
  42. tutorials/examples/3-evaluation_datasets/3-evaluation_datasets_from_huggingface_with_yaml_configs.py +110 -0
  43. tutorials/examples/3-evaluation_datasets/example_datasets/basic_questions.csv +11 -0
  44. tutorials/examples/3-evaluation_datasets/example_datasets/basic_questions.json +42 -0
  45. tutorials/examples/3-evaluation_datasets/example_yaml_configs/Cais-MMLU.yaml +19 -0
  46. tutorials/examples/3-evaluation_datasets/example_yaml_configs/TIGER-Lab-MMLU-Pro.yaml +18 -0
  47. tutorials/examples/4-adaptive_evaluations/1-adaptive_evaluation.py +114 -0
  48. tutorials/examples/4-adaptive_evaluations/2-adaptive_dataset_splits.py +106 -0
  49. tutorials/examples/5-upload_results/1-uploading_score_results.py +92 -0
  50. tutorials/examples/5-upload_results/2-uploading_evaluate_results.py +117 -0
  51. tutorials/examples/5-upload_results/3-uploading_your_results.py +153 -0
  52. tutorials/examples/6-providers/aws/__init__.py +1 -0
  53. tutorials/examples/6-providers/aws/batch_example.py +219 -0
  54. tutorials/examples/6-providers/portkey/__init__.py +1 -0
  55. tutorials/examples/6-providers/portkey/batch_example.py +120 -0
  56. tutorials/examples/6-providers/portkey/messages_example.py +121 -0
  57. tutorials/examples/6-providers/vertex/__init__.py +1 -0
  58. tutorials/examples/6-providers/vertex/batch_example.py +166 -0
  59. tutorials/examples/6-providers/vertex/messages_example.py +142 -0
  60. tutorials/examples/__init__.py +0 -0
  61. tutorials/notebooks/1-scoring.ipynb +162 -0
  62. tutorials/notebooks/2-evaluating.ipynb +316 -0
  63. tutorials/notebooks/3.1-adaptive_evaluation_phi.ipynb +354 -0
  64. tutorials/notebooks/3.2-adaptive_evaluation_gpt.ipynb +243 -0
  65. tutorials/notebooks/4-uploading_results.ipynb +175 -0
  66. tutorials/quickstarts/adaptive_evaluations/adaptive_evaluation_openai_demo.ipynb +229 -0
  67. tutorials/quickstarts/adaptive_evaluations/adaptive_evaluation_qwen_demo.ipynb +256 -0
  68. tutorials/quickstarts/classical_evaluations/classical_evaluation_demo.ipynb +277 -0
  69. tutorials/quickstarts/getting_started.ipynb +197 -0
  70. tutorials/utils/__init__.py +35 -0
  71. tutorials/utils/args_parser.py +132 -0
  72. tutorials/utils/output.py +23 -0
  73. tutorials/utils/setup.py +98 -0
  74. scorebook/metrics/metric_registry.py +0 -107
  75. scorebook-0.0.14.dist-info/RECORD +0 -53
  76. {scorebook-0.0.14.dist-info → scorebook-0.0.16.dist-info}/entry_points.txt +0 -0
  77. {scorebook-0.0.14.dist-info → scorebook-0.0.16.dist-info}/licenses/LICENSE +0 -0
tutorials/utils/setup.py
@@ -0,0 +1,98 @@
+ """
+ Utility functions for setting up Scorebook examples.
+
+ This module provides common helper functions used across multiple Scorebook examples
+ for output directory setup and logging configuration.
+ """
+
+ import logging
+ from datetime import datetime
+ from pathlib import Path
+ from typing import Optional
+
+
+ def setup_output_directory() -> Path:
+     """Parse command line arguments and setup output directory."""
+     import argparse
+
+     parser = argparse.ArgumentParser(description="Run evaluation and save results.")
+     parser.add_argument(
+         "--output-dir",
+         type=str,
+         default=str(Path.cwd() / "examples/example_results"),
+         help=(
+             "Directory to save evaluation outputs (CSV and JSON). "
+             "Defaults to ./examples/example_results in the current working directory."
+         ),
+     )
+     args = parser.parse_args()
+     output_dir = Path(args.output_dir)
+     output_dir.mkdir(parents=True, exist_ok=True)
+     print(f"Saving results to {output_dir}")
+     return output_dir
+
+
+ def setup_logging(
+     log_dir: str = "logs",
+     experiment_id: Optional[str] = None,
+     base_dir: Optional[Path] = None,
+ ) -> Path:
+     """Configure logging for evaluation runs.
+
+     Args:
+         log_dir: Name of the log directory (default: "logs")
+         experiment_id: Optional identifier for the experiment
+         base_dir: Base directory where log_dir should be created.
+             If None, uses current working directory.
+     """
+     if base_dir is None:
+         base_dir = Path.cwd()
+
+     log_dir_path: Path = base_dir / log_dir
+     log_dir_path.mkdir(exist_ok=True, parents=True)
+
+     timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
+     if experiment_id:
+         log_file = log_dir_path / f"evaluation_{experiment_id}_{timestamp}.log"
+     else:
+         log_file = log_dir_path / f"evaluation_{timestamp}.log"
+
+     # Create file handler for all logs (same as before)
+     file_handler = logging.FileHandler(log_file)
+     file_handler.setLevel(logging.DEBUG)
+     file_handler.setFormatter(
+         logging.Formatter("%(asctime)s - %(levelname)s - %(name)s - %(message)s")
+     )
+
+     # Create console handler for warnings and errors only
+     console_handler = logging.StreamHandler()
+     console_handler.setLevel(logging.WARNING)
+     console_handler.setFormatter(logging.Formatter("%(levelname)s - %(name)s - %(message)s"))
+
+     # Configure root logger with both handlers
+     logging.basicConfig(
+         level=logging.INFO,
+         handlers=[file_handler, console_handler],
+         force=True,
+     )
+
+     # Set scorebook loggers to DEBUG level to capture all scorebook logs
+     scorebook_logger = logging.getLogger("scorebook")
+     scorebook_logger.setLevel(logging.DEBUG)
+
+     # Ensure trismik_services logs are captured at DEBUG level
+     trismik_services_logger = logging.getLogger("scorebook.trismik_services")
+     trismik_services_logger.setLevel(logging.DEBUG)
+
+     # Ensure evaluate logs are captured at DEBUG level
+     evaluate_logger = logging.getLogger("scorebook.evaluate._sync.evaluate")
+     evaluate_logger.setLevel(logging.DEBUG)
+     evaluate_logger = logging.getLogger("scorebook.evaluate._async.evaluate_async")
+     evaluate_logger.setLevel(logging.DEBUG)
+
+     # Exclude OpenAI inference logs to reduce noise
+     openai_logger = logging.getLogger("scorebook.inference.openai")
+     openai_logger.setLevel(logging.WARNING)  # Only log warnings and errors
+
+     print(f"Logging to {log_file}")
+     return log_file
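For orientation, here is a minimal usage sketch (not part of the package) showing how a tutorial script might call these new helpers; the import path tutorials.utils.setup is assumed from the file layout above:

# Hypothetical usage sketch; assumes the module is importable as tutorials.utils.setup.
from tutorials.utils.setup import setup_logging, setup_output_directory

output_dir = setup_output_directory()           # parses --output-dir and creates the directory
log_file = setup_logging(experiment_id="demo")  # logs to ./logs/evaluation_demo_<timestamp>.log
print(output_dir, log_file)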
scorebook/metrics/metric_registry.py
@@ -1,107 +0,0 @@
- """
- Registry module for evaluation metrics.
-
- This module maintains a centralized registry of available evaluation metrics
- that can be used to assess model performance. It provides a single access point
- to retrieve all implemented metric classes.
- """
-
- from typing import Any, Callable, Dict, List, Type, Union
-
- from scorebook.metrics.metric_base import MetricBase
-
-
- class MetricRegistry:
-     """A registry for evaluation metrics.
-
-     This class provides a central registry for all evaluation metrics in the system.
-     It allows metrics to be registered with unique names and retrieved either by
-     name or by class. The registry ensures that metrics are properly initialized
-     and accessible throughout the application.
-
-     The registry supports:
-     - Registering new metric classes with optional custom names
-     - Retrieving metric instances by name or class
-     - Listing all available metrics
-
-     Usage:
-         @MetricRegistry.register("custom_name")
-         class MyMetric(MetricBase):
-             ...
-
-         # Get by name
-         metric = MetricRegistry.get("custom_name")
-
-         # Get by class
-         metric = MetricRegistry.get(MyMetric)
-
-         # List available metrics
-         metrics = MetricRegistry.list_metrics()
-     """
-
-     _registry: Dict[str, Type[MetricBase]] = {}
-
-     @classmethod
-     def register(cls) -> Callable[[Type[MetricBase]], Type[MetricBase]]:
-         """
-         Register a metric class in the registry.
-
-         Returns:
-             A decorator that registers the class and returns it.
-
-         Raises:
-             ValueError: If a metric with the given name is already registered.
-         """
-
-         def decorator(metric_cls: Type[MetricBase]) -> Type[MetricBase]:
-
-             key = metric_cls.__name__.lower()
-             if key in cls._registry:
-                 raise ValueError(f"Metric '{key}' is already registered")
-             cls._registry[key] = metric_cls
-             return metric_cls
-
-         return decorator
-
-     @classmethod
-     def get(cls, name_or_class: Union[str, Type[MetricBase]], **kwargs: Any) -> MetricBase:
-         """
-         Get an instance of a registered metric by name or class.
-
-         Args:
-             name_or_class: The metric name (string) or class (subclass of BaseMetric).
-             **kwargs: Additional arguments to pass to the metric's constructor.
-
-         Returns:
-             An instance of the requested metric.
-
-         Raises:
-             ValueError: If the metric name is not registered.
-         """
-         # If input is a class that's a subclass of BaseMetric, instantiate it directly
-         if isinstance(name_or_class, type) and issubclass(name_or_class, MetricBase):
-             return name_or_class(**kwargs)
-
-         # If input is a string, look up the class in the registry
-         if isinstance(name_or_class, str):
-             key = name_or_class.lower()
-
-             if key not in cls._registry:
-                 raise ValueError(f"Metric '{name_or_class}' not registered.")
-
-             return cls._registry[key](**kwargs)
-
-         raise ValueError(
-             f"Invalid metric type: {type(name_or_class)}."
-             f"Must be string name or BaseMetric subclass"
-         )
-
-     @classmethod
-     def list_metrics(cls) -> List[str]:
-         """
-         List all registered metrics.
-
-         Returns:
-             A list of metric names.
-         """
-         return list(cls._registry.keys())
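Note that the removed class's docstring example did not match its implementation: register() accepted no name argument and keyed each entry on the lowercased class name, so registering under "custom_name" was never actually supported. A short sketch consistent with the removed code follows; MyMetric is hypothetical, and the replacement registry now lives in scorebook/metrics/core/metric_registry.py (per the file list above), where the API may differ:

# Sketch against the removed API; MyMetric is hypothetical and assumed to
# implement whatever abstract interface MetricBase requires.
from scorebook.metrics.metric_base import MetricBase
from scorebook.metrics.metric_registry import MetricRegistry

@MetricRegistry.register()  # no name argument; the registry key becomes "mymetric"
class MyMetric(MetricBase):
    ...

metric = MetricRegistry.get("MyMetric")  # string lookup is lowercased, so this resolves
metric = MetricRegistry.get(MyMetric)    # a MetricBase subclass is instantiated directly
print(MetricRegistry.list_metrics())     # ["mymetric"]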
scorebook-0.0.14.dist-info/RECORD
@@ -1,53 +0,0 @@
- scorebook/__init__.py,sha256=S2JaZZsx76p0EjXtKz4UPdSzuO60jAjOvooYP-idBu8,1144
- scorebook/cli/__init__.py,sha256=E89jR1DljFSHhfjEGSRKLgz0KhxGyRQ9a3vpUOmQL9o,32
- scorebook/cli/auth.py,sha256=VGS5T0CSeS0n_7bntNggrYx-vDwxJJHdYxbKedFAq74,2939
- scorebook/cli/main.py,sha256=cEvShENl6L6feX_sa7FGNTeoz5UtwqzwenmcHaON1hg,1589
- scorebook/dashboard/__init__.py,sha256=36DxO3oXVcZ2I6kizLFCcJkLBpXOU8UIXFT_ZjeFTB4,50
- scorebook/dashboard/create_project.py,sha256=RK90aMN0_XVM-DnawTY_b59yPJaRnpb_GoidCqXB5Vw,2845
- scorebook/dashboard/credentials.py,sha256=Q_khY5AX3fnyWshHe6LaesBHcCmNBse6a_XFGT8OOaw,3474
- scorebook/dashboard/upload_results.py,sha256=sdgOEf0C7QLt7t2QiXvSoceQpAiiPmlG_4SFEEzVPlc,9738
- scorebook/eval_datasets/__init__.py,sha256=wsmFNyuZJdBxjokcKG4NRfuUzPZKuzsKX3aG21zfFV4,39
- scorebook/eval_datasets/eval_dataset.py,sha256=xnG7VaceWUmg8Wrk2IGnVFZs9umzmZrW8F7THvtWMqs,28041
- scorebook/evaluate/__init__.py,sha256=Qqe-l4y3Nu81Fdx83RbtCQESoXC0XukBgOC3DPSWZZA,39
- scorebook/evaluate/_async/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
- scorebook/evaluate/_async/evaluate_async.py,sha256=G0RB_A1f5mQ42D82DnxkzAZhyV5kgbxi9Lr7qKaKUyY,16590
- scorebook/evaluate/_sync/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
- scorebook/evaluate/_sync/evaluate.py,sha256=OIUsW2U1IrdwYIIPsfpTCOfJDAYJ6BYl-6pQQiafSNE,16364
- scorebook/evaluate/evaluate_helpers.py,sha256=NnanxLEeHwoZNztGXQJc6u_WqKfDkn1vYmck2BrKF-c,17028
- scorebook/exceptions.py,sha256=3sxCWhFqYgXiWNUAMRR2ggLfqvbDI8e5vLjnT9V7X1M,3649
- scorebook/inference/__init__.py,sha256=gGuZG1rdpxKYC54q0eAS6oTHQbRYhgxlBeAqonqHvRU,60
- scorebook/inference/clients/__init__.py,sha256=VaLW7mi4tywJtR3Q9wr2pPci8NlEQ3bJanZyM5S81Z4,51
- scorebook/inference/clients/bedrock.py,sha256=bsnz0IB6ufjZVPd6syD3yVaOelerB5G_YAmPAVqmBmI,10071
- scorebook/inference/clients/openai.py,sha256=JPPcJdxYwwZNUfXCGTRRzzUUA8I8WiV3bu6-pgS1_UE,9043
- scorebook/inference/clients/portkey.py,sha256=RCuEZB8xNAVeGEt3IJ0esh_wqreZNB3jrDKiWH6miV0,5949
- scorebook/inference/clients/vertex.py,sha256=g6oNnag0qcLOYCtQ4SXAXfnqKtvPAVdigB--I7wU1yM,9871
- scorebook/inference/inference_pipeline.py,sha256=1qSmfI4fBJFS3EcAhRlA-f4-8aI6wDiupSJu-vNXoYI,5571
- scorebook/metrics/__init__.py,sha256=bsEq15LpFt3h0AQQFbnvL4CU7KpIpifVdJAsfduPGXk,48
- scorebook/metrics/accuracy.py,sha256=5KQ4hfOn9M94sB7WsXUelJWJiuKfoCGQEl5q5q9vNfo,1467
- scorebook/metrics/metric_base.py,sha256=I3L0DGcRojFp93UGFnXG1tZ2UK9ilTcXXJG6lj5ddXA,857
- scorebook/metrics/metric_registry.py,sha256=YcbKGf2kPMQqyqJ9NYVq_-J19rARXSo22HjTW5WU-QU,3404
- scorebook/metrics/precision.py,sha256=AaYPYYKnY74Nwqp_p3jd2Ewf3VHNOJjoRWf5fhb-tXk,563
- scorebook/score/__init__.py,sha256=CqkslUvOw8QfCCbSfwZgGrbmXeSLpZqIVo4ntrctYuY,66
- scorebook/score/_async/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
- scorebook/score/_async/score_async.py,sha256=SatV9hEUT8MAru2ACSyM03weKX6VTFx7crW59_uX0L8,6155
- scorebook/score/_sync/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
- scorebook/score/_sync/score.py,sha256=nANQbuyYyIaWnoTQzyGMwPZRMFP6MmyIyHb1GO1mktQ,6101
- scorebook/score/score_helpers.py,sha256=Gjx2Lgd94ISvunb5CHj-tDWYVEOVj9ySjjVYnnhpk_Q,7086
- scorebook/settings.py,sha256=qZrNiki6rFXn43udmhjQSmdDKOEaX62WYoEs2Rbggr0,720
- scorebook/types.py,sha256=2lv1YUky7aDGIEPjgj18aKTpBMdmqD01TKLbwli19pQ,4904
- scorebook/utils/__init__.py,sha256=oBTybVHI5EdHIgzb0TeoAnSLMQdUh20Ww6vcL9542Pk,72
- scorebook/utils/async_utils.py,sha256=2ewk_VOePib8z7DTRl-pZQBGzVI3L3JvnEuYW-DTkRA,1325
- scorebook/utils/common_helpers.py,sha256=lJIqO9XGf1T3S3rdGBTjZJ1BzVPvaU_XTONEfPApnEM,1218
- scorebook/utils/io_helpers.py,sha256=ORO6DwtXOKWJq9v_isuunUrz0viE3xy2qYO4lrgU-TM,1437
- scorebook/utils/jinja_helpers.py,sha256=ksIKHiKdj8N0o7ZJZGasfbSNoAY6K5d9X_KM6mcKYD4,4208
- scorebook/utils/mappers.py,sha256=OcUnPBrnSUxZNhAzJhVmVWUWmqIKFXLTrK-xLi6_SUg,1259
- scorebook/utils/mock_llm/__init__.py,sha256=dK70wNVBKk4hv1o3fceDTBG1_maFbkMvoOtTriPCe78,1293
- scorebook/utils/mock_llm/data/mock_llm_data.json,sha256=b28j7OCR0igpP0rkXDJAR2NWIiuVkOaAkzB-Miv665Y,381567
- scorebook/utils/progress_bars.py,sha256=gdT6dJ9LMLYzs7TospP3wQNY9htm_FhVLdX0ueluC6E,31890
- scorebook/utils/render_template.py,sha256=NOaZt-N1WcR5MA7at1XxzD-4sFMFKo9X0k7fKq6oSSM,1654
- scorebook/utils/transform_helpers.py,sha256=UnVLtFvcJrtmBEmLsuA4rrX4iJlNUKxm2DkIOGLl-2o,1030
- scorebook-0.0.14.dist-info/METADATA,sha256=jPqVszfpCiAKf3yt45XD6lXfIJL1-TFvSMDVGrIoCPs,9491
- scorebook-0.0.14.dist-info/WHEEL,sha256=zp0Cn7JsFoX2ATtOhtaFYIiE2rmFAD4OcMhtUki8W3U,88
- scorebook-0.0.14.dist-info/entry_points.txt,sha256=9gNd3Q0MEozhJ7fog-Q-Z_PrcGMnF-404Jon40MH2_U,53
- scorebook-0.0.14.dist-info/licenses/LICENSE,sha256=JLH1g9FhxHZf6CBCeQ_xAisPtICVObuNGW1bLPiTYEs,1068
- scorebook-0.0.14.dist-info/RECORD,,