scorebook 0.0.13-py3-none-any.whl → 0.0.15-py3-none-any.whl
This diff compares the contents of two publicly released package versions as they appear in their public registries. It is provided for informational purposes only.
- scorebook/__init__.py +12 -5
- scorebook/cli/auth.py +1 -1
- scorebook/dashboard/__init__.py +1 -0
- scorebook/dashboard/create_project.py +91 -0
- scorebook/{trismik → dashboard}/credentials.py +57 -12
- scorebook/{trismik → dashboard}/upload_results.py +1 -1
- scorebook/eval_datasets/__init__.py +0 -4
- scorebook/eval_datasets/eval_dataset.py +4 -2
- scorebook/evaluate/__init__.py +1 -15
- scorebook/evaluate/_async/evaluate_async.py +36 -19
- scorebook/evaluate/_sync/evaluate.py +36 -19
- scorebook/evaluate/evaluate_helpers.py +4 -3
- scorebook/inference/__init__.py +1 -11
- scorebook/inference/clients/__init__.py +1 -8
- scorebook/inference/inference_pipeline.py +1 -1
- scorebook/metrics/README.md +121 -0
- scorebook/metrics/__init__.py +7 -16
- scorebook/metrics/accuracy.py +2 -6
- scorebook/metrics/bertscore.py +50 -0
- scorebook/metrics/bleu.py +82 -0
- scorebook/metrics/core/__init__.py +1 -0
- scorebook/metrics/{metric_base.py → core/metric_base.py} +1 -2
- scorebook/metrics/core/metric_registry.py +195 -0
- scorebook/metrics/exactmatch.py +95 -0
- scorebook/metrics/f1.py +96 -0
- scorebook/metrics/precision.py +84 -9
- scorebook/metrics/recall.py +94 -0
- scorebook/metrics/rouge.py +85 -0
- scorebook/score/__init__.py +0 -5
- scorebook/score/_async/score_async.py +3 -2
- scorebook/score/_sync/score.py +3 -2
- scorebook/score/score_helpers.py +29 -12
- scorebook/types.py +3 -3
- scorebook/utils/__init__.py +0 -22
- scorebook/utils/common_helpers.py +1 -1
- scorebook/utils/mock_llm/__init__.py +41 -0
- scorebook/utils/mock_llm/data/mock_llm_data.json +21970 -0
- scorebook/utils/progress_bars.py +58 -786
- scorebook-0.0.15.dist-info/METADATA +300 -0
- scorebook-0.0.15.dist-info/RECORD +110 -0
- {scorebook-0.0.13.dist-info → scorebook-0.0.15.dist-info}/WHEEL +1 -1
- tutorials/README.md +147 -0
- tutorials/__init__.py +5 -0
- tutorials/examples/1-score/1-scoring_model_accuracy.py +47 -0
- tutorials/examples/1-score/2-scoring_model_bleu.py +46 -0
- tutorials/examples/1-score/3-scoring_model_f1.py +64 -0
- tutorials/examples/1-score/4-scoring_model_rouge.py +64 -0
- tutorials/examples/1-score/5-scoring_model_exact_match.py +84 -0
- tutorials/examples/1-score/6-scoring_with_bertscore.py +57 -0
- tutorials/examples/1-score/__init__.py +0 -0
- tutorials/examples/2-evaluate/1-evaluating_local_models.py +106 -0
- tutorials/examples/2-evaluate/2-evaluating_local_models_with_batching.py +108 -0
- tutorials/examples/2-evaluate/3-evaluating_cloud_models.py +109 -0
- tutorials/examples/2-evaluate/4-evaluating_cloud_models_with_batching.py +170 -0
- tutorials/examples/2-evaluate/5-hyperparameter_sweeps.py +122 -0
- tutorials/examples/2-evaluate/6-inference_pipelines.py +141 -0
- tutorials/examples/3-evaluation_datasets/1-evaluation_datasets_from_files.py +110 -0
- tutorials/examples/3-evaluation_datasets/2-evaluation_datasets_from_huggingface.py +101 -0
- tutorials/examples/3-evaluation_datasets/3-evaluation_datasets_from_huggingface_with_yaml_configs.py +110 -0
- tutorials/examples/3-evaluation_datasets/example_datasets/basic_questions.csv +11 -0
- tutorials/examples/3-evaluation_datasets/example_datasets/basic_questions.json +42 -0
- tutorials/examples/3-evaluation_datasets/example_yaml_configs/Cais-MMLU.yaml +19 -0
- tutorials/examples/3-evaluation_datasets/example_yaml_configs/TIGER-Lab-MMLU-Pro.yaml +18 -0
- tutorials/examples/4-adaptive_evaluations/1-adaptive_evaluation.py +114 -0
- tutorials/examples/4-adaptive_evaluations/2-adaptive_dataset_splits.py +106 -0
- tutorials/examples/5-upload_results/1-uploading_score_results.py +92 -0
- tutorials/examples/5-upload_results/2-uploading_evaluate_results.py +117 -0
- tutorials/examples/5-upload_results/3-uploading_your_results.py +153 -0
- tutorials/examples/6-providers/aws/__init__.py +1 -0
- tutorials/examples/6-providers/aws/batch_example.py +219 -0
- tutorials/examples/6-providers/portkey/__init__.py +1 -0
- tutorials/examples/6-providers/portkey/batch_example.py +120 -0
- tutorials/examples/6-providers/portkey/messages_example.py +121 -0
- tutorials/examples/6-providers/vertex/__init__.py +1 -0
- tutorials/examples/6-providers/vertex/batch_example.py +166 -0
- tutorials/examples/6-providers/vertex/messages_example.py +142 -0
- tutorials/examples/__init__.py +0 -0
- tutorials/notebooks/1-scoring.ipynb +162 -0
- tutorials/notebooks/2-evaluating.ipynb +316 -0
- tutorials/notebooks/3.1-adaptive_evaluation_phi.ipynb +354 -0
- tutorials/notebooks/3.2-adaptive_evaluation_gpt.ipynb +243 -0
- tutorials/notebooks/4-uploading_results.ipynb +175 -0
- tutorials/quickstarts/adaptive_evaluations/adaptive_evaluation_openai_demo.ipynb +229 -0
- tutorials/quickstarts/adaptive_evaluations/adaptive_evaluation_qwen_demo.ipynb +256 -0
- tutorials/quickstarts/classical_evaluations/classical_evaluation_demo.ipynb +277 -0
- tutorials/quickstarts/getting_started.ipynb +197 -0
- tutorials/utils/__init__.py +35 -0
- tutorials/utils/args_parser.py +132 -0
- tutorials/utils/output.py +23 -0
- tutorials/utils/setup.py +98 -0
- scorebook/metrics/metric_registry.py +0 -105
- scorebook/trismik/__init__.py +0 -10
- scorebook-0.0.13.dist-info/METADATA +0 -389
- scorebook-0.0.13.dist-info/RECORD +0 -50
- {scorebook-0.0.13.dist-info → scorebook-0.0.15.dist-info}/entry_points.txt +0 -0
- {scorebook-0.0.13.dist-info → scorebook-0.0.15.dist-info}/licenses/LICENSE +0 -0
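
The biggest functional additions in this release are the expanded metrics package (new BLEU, ROUGE, F1, exact match, recall and BERTScore modules plus a reworked `core/metric_registry.py`) and the `tutorials/` tree covering `score()` and `evaluate()`. As a quick orientation, here is a minimal sketch of the `score()` entry point, assembled from the cells of `tutorials/notebooks/1-scoring.ipynb` shown later in this diff; the item values are illustrative.

```python
# Minimal sketch of the score() usage introduced by the new tutorials; the item
# values are illustrative, everything else mirrors 1-scoring.ipynb below.
from pprint import pprint

from scorebook import score
from scorebook.metrics.accuracy import Accuracy

items = [
    {"input": "What is 2 + 2?", "output": "4", "label": "4"},
    {"input": "What is the capital of France?", "output": "Paris", "label": "Paris"},
]

results = score(
    items=items,
    metrics=Accuracy,
    dataset_name="basic_questions",
    model_name="example-model",
    upload_results=False,  # set to True to upload to the Trismik dashboard
)
pprint(results)
```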
tutorials/examples/6-providers/vertex/batch_example.py
@@ -0,0 +1,166 @@
+"""
+Google Cloud Vertex AI Batch Inference Example.
+
+This example demonstrates how to leverage Google Cloud Vertex AI's Batch API for
+cost-effective, large-scale model evaluation using Scorebook. It uses Gemini models
+for batch processing with automatic GCS upload/download and job management.
+
+This example requires Google Cloud SDK (gsutil) to be installed and authenticated,
+and a Google Cloud project with Vertex AI enabled. Set the project ID in the
+GOOGLE_CLOUD_PROJECT environment variable or pass it as a command line argument.
+
+Compare with the Portkey batch example to understand the differences
+between different cloud providers' batch processing approaches.
+"""
+
+import json
+import os
+from pathlib import Path
+from typing import Any
+
+from dotenv import load_dotenv
+
+from scorebook import EvalDataset, InferencePipeline, evaluate
+from scorebook.inference.clients.vertex import batch
+from scorebook.metrics.accuracy import Accuracy
+
+
+def main() -> None:
+    """Run the Vertex AI batch inference example."""
+    # Load environment variables from .env file for configuration
+    load_dotenv()
+
+    output_dir, model_name, input_bucket, output_bucket, project_id = setup_arguments()
+
+    # Step 1: Load the evaluation dataset
+    dataset = EvalDataset.from_json(
+        "examples/example_datasets/dataset.json", label="answer", metrics=[Accuracy]
+    )
+
+    # Step 2: Define the preprocessing function for Vertex AI Batch API
+    def preprocessor(eval_item: dict) -> list:
+        """Pre-process dataset items into Vertex AI Batch API format."""
+        prompt = eval_item["question"]
+
+        # Create the batch API request messages format for Vertex AI
+        messages = [
+            {
+                "role": "system",
+                "content": "Answer the question directly and concisely as a single word",
+            },
+            {"role": "user", "content": prompt},
+        ]
+
+        return messages
+
+    # Step 3: Define the postprocessing function
+    def postprocessor(response: str) -> str:
+        """Post-process Vertex AI batch response to extract the answer."""
+        # The batch function returns the message content directly
+        return response.strip()
+
+    # Step 4: Create the inference pipeline for batch processing
+
+    async def inference_function(items: list, **hyperparams: Any) -> Any:  # noqa
+        return await batch(
+            items,
+            model=model_name,
+            project_id=project_id,
+            input_bucket=input_bucket,
+            output_bucket=output_bucket,
+            **hyperparams,
+        )
+
+    inference_pipeline = InferencePipeline(
+        model=model_name,
+        preprocessor=preprocessor,
+        inference_function=inference_function,
+        postprocessor=postprocessor,
+    )
+
+    # Step 5: Run the batch evaluation
+    print(f"Running Vertex AI Batch API evaluation with model: {model_name}")
+    print(f"Project ID: {project_id}")
+    print(f"Input bucket: {input_bucket}")
+    print(f"Output bucket: {output_bucket}")
+    print(f"Processing {len(dataset)} items using batch inference...")
+    print("Note: Batch processing may take several minutes to complete.")
+
+    # For demonstration, limit to 25 items
+    results = evaluate(inference_pipeline, dataset, item_limit=25, score_type="all")
+    print("\nBatch evaluation completed!")
+    print(results)
+
+    # Step 6: Save results to file
+    output_file = output_dir / "vertex_batch_output.json"
+    with open(output_file, "w") as f:
+        json.dump(results, f, indent=4)
+    print(f"Results saved in {output_file}")
+
+
+# ============================================================================
+# Utility Functions
+# ============================================================================
+
+
+def setup_arguments() -> tuple[Path, str, str, str, str]:
+    """Parse command line arguments."""
+    import argparse
+
+    parser = argparse.ArgumentParser(
+        description="Run Vertex AI Batch API evaluation and save results."
+    )
+    parser.add_argument(
+        "--output-dir",
+        type=str,
+        default=str(Path.cwd() / "results"),
+        help="Directory to save evaluation outputs (JSON).",
+    )
+    parser.add_argument(
+        "--model",
+        type=str,
+        required=True,
+        help="Gemini model to use for batch inference (e.g., gemini-2.0-flash-001)",
+    )
+    parser.add_argument(
+        "--input-bucket",
+        type=str,
+        required=True,
+        help="GCS bucket name for input data (without gs:// prefix)",
+    )
+    parser.add_argument(
+        "--output-bucket",
+        type=str,
+        required=True,
+        help="GCS bucket name for output data (without gs:// prefix)",
+    )
+    parser.add_argument(
+        "--project-id",
+        type=str,
+        help="Google Cloud Project ID (defaults to GOOGLE_CLOUD_PROJECT env var)",
+    )
+
+    args = parser.parse_args()
+
+    output_dir = Path(args.output_dir)
+    output_dir.mkdir(parents=True, exist_ok=True)
+
+    # Handle project ID fallback
+    project_id = args.project_id or os.getenv("GOOGLE_CLOUD_PROJECT")
+    if not project_id:
+        raise ValueError(
+            "Project ID must be provided via --project-id or "
+            "GOOGLE_CLOUD_PROJECT environment variable"
+        )
+
+    return (
+        output_dir,
+        str(args.model),
+        str(args.input_bucket),
+        str(args.output_bucket),
+        str(project_id),
+    )
+
+
+if __name__ == "__main__":
+    main()
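
For context on the new batch example above, the sketch below shows how the script would typically be invoked and what its preprocessor emits for a single dataset item. The command line is hypothetical: the model, bucket, and project names are placeholders, and the sample item is illustrative.

```python
# Hypothetical invocation of the batch example (placeholder model/bucket/project names):
#
#   python batch_example.py --model gemini-2.0-flash-001 \
#       --input-bucket my-input-bucket --output-bucket my-output-bucket \
#       --project-id my-gcp-project
#
# For one dataset item, the preprocessor defined in the example builds a Vertex AI
# batch request in this messages format (the item itself is illustrative):
eval_item = {"question": "What is the capital of France?", "answer": "Paris"}
messages = [
    {"role": "system", "content": "Answer the question directly and concisely as a single word"},
    {"role": "user", "content": eval_item["question"]},
]
print(messages)
```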
tutorials/examples/6-providers/vertex/messages_example.py
@@ -0,0 +1,142 @@
+"""
+Google Cloud Vertex AI Model Inference Example.
+
+This example demonstrates how to evaluate language models using Google Cloud
+Vertex AI's Gemini models with Scorebook for real-time API calls.
+
+Prerequisites: Google Cloud SDK (gcloud) authenticated and GOOGLE_CLOUD_PROJECT
+environment variable set, or pass project ID as command line argument.
+"""
+
+import json
+from pathlib import Path
+from typing import Any, Dict
+
+from dotenv import load_dotenv
+
+from scorebook import EvalDataset, InferencePipeline, evaluate
+from scorebook.inference.clients.vertex import responses
+from scorebook.metrics.accuracy import Accuracy
+
+
+def main() -> None:
+    """Run the Vertex AI inference example."""
+    # Load environment variables from .env file for configuration
+    load_dotenv()
+
+    output_dir, model_name, project_id = setup_arguments()
+
+    # Step 1: Load the evaluation dataset
+    # Create an EvalDataset from local JSON file
+    # - Uses 'answer' field as ground truth labels
+    # - Configures Accuracy metric for evaluation
+    # - Loads from examples/example_datasets/dataset.json
+    dataset = EvalDataset.from_json(
+        "examples/example_datasets/dataset.json", label="answer", metrics=[Accuracy]
+    )
+
+    # Step 2: Define the preprocessing function
+    # Convert raw dataset items into Vertex AI API-compatible format
+    # This function formats the question for the Gemini model
+    def preprocessor(eval_item: Dict[str, str]) -> str:
+        """Pre-process dataset items into Vertex AI string format."""
+        return eval_item["question"]
+
+    # Step 3: Define the postprocessing function
+    # Extract the final answer from Vertex AI API response
+    # Handles response parsing and returns the response text
+    def postprocessor(response: Any) -> str:
+        """Post-process Vertex AI response to extract the answer."""
+        return str(response.text.strip())
+
+    # Step 4: Create the inference pipeline for cloud-based evaluation
+    # Combine preprocessing, Vertex AI inference, and postprocessing
+    # Uses scorebook's built-in Vertex AI responses function for API calls
+
+    # Create a system message with instructions for direct answers
+    system_prompt = """
+    Answer the question directly and concisely.
+    Do not provide lengthy explanations unless specifically asked.
+    """.strip()
+
+    async def inference_function(items: list, **hyperparams: Any) -> Any:
+        return await responses(
+            items,
+            model=model_name,
+            project_id=project_id,
+            system_instruction=system_prompt,
+            **hyperparams,
+        )
+
+    inference_pipeline = InferencePipeline(
+        model=model_name,
+        preprocessor=preprocessor,
+        inference_function=inference_function,
+        postprocessor=postprocessor,
+    )
+
+    # Step 5: Run the cloud-based evaluation
+    # Execute evaluation using Vertex AI with the inference pipeline
+    # - Uses score_type="all" to get both aggregate and per-item results
+    # - Limits to 10 items for quick demonstration and cost control
+    print(f"Running Vertex AI evaluation with model: {model_name}")
+    print(f"Project ID: {project_id}")
+    print("Evaluating 10 items from local dataset...")
+
+    results = evaluate(inference_pipeline, dataset, item_limit=10, score_type="all")
+    print(results)
+
+    # Step 6: Save results to file
+    # Export evaluation results as JSON for later analysis
+    output_file = output_dir / "vertex_messages_output.json"
+    with open(output_file, "w") as f:
+        json.dump(results, f, indent=4)
+    print(f"Results saved in {output_file}")
+
+
+# ============================================================================
+# Utility Functions
+# ============================================================================
+
+
+def setup_arguments() -> tuple[Path, str, str]:
+    """Parse command line arguments."""
+    import argparse
+    import os
+
+    parser = argparse.ArgumentParser(description="Run Vertex AI evaluation and save results.")
+    parser.add_argument(
+        "--output-dir",
+        type=str,
+        default=str(Path.cwd() / "results"),
+        help="Directory to save evaluation outputs (JSON).",
+    )
+    parser.add_argument(
+        "--model",
+        type=str,
+        required=True,
+        help="Gemini model to use for inference (e.g., gemini-2.0-flash-001)",
+    )
+    parser.add_argument(
+        "--project-id",
+        type=str,
+        help="Google Cloud Project ID (defaults to GOOGLE_CLOUD_PROJECT env var)",
+    )
+    args = parser.parse_args()
+
+    output_dir = Path(args.output_dir)
+    output_dir.mkdir(parents=True, exist_ok=True)
+
+    # Handle project ID fallback
+    project_id = args.project_id or os.getenv("GOOGLE_CLOUD_PROJECT")
+    if not project_id:
+        raise ValueError(
+            "Project ID must be provided via --project-id or "
+            "GOOGLE_CLOUD_PROJECT environment variable"
+        )
+
+    return output_dir, str(args.model), str(project_id)
+
+
+if __name__ == "__main__":
+    main()
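
The messages example wraps `scorebook.inference.clients.vertex.responses` in an `InferencePipeline`. Below is a minimal sketch of calling that client directly, using only the arguments that appear in the example above; the model name and project ID are placeholders, and the assumption that each returned element exposes a `.text` attribute simply mirrors the example's postprocessor.

```python
# Minimal sketch, assuming authenticated Google Cloud credentials and that the
# responses() client yields objects with a .text attribute (as the example's
# postprocessor implies). Model name and project ID are placeholders.
import asyncio

from scorebook.inference.clients.vertex import responses


async def demo() -> None:
    outputs = await responses(
        ["What is the capital of France?"],
        model="gemini-2.0-flash-001",
        project_id="my-gcp-project",
        system_instruction="Answer the question directly and concisely.",
    )
    for response in outputs:
        print(str(response.text).strip())


# Uncomment to run against a live project:
# asyncio.run(demo())
```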
tutorials/notebooks/1-scoring.ipynb
@@ -0,0 +1,162 @@
+{
+ "cells": [
+  {
+   "metadata": {},
+   "cell_type": "markdown",
+   "source": "# Scoring Model Outputs with Scorebook\n\nThis notebook demonstrates how to use Scorebook's `score()` function to evaluate pre-generated model predictions.\n\n## When to use `score()`\n\n- You already have model predictions and want to compute metrics\n- You want to re-score existing results with different metrics\n- You're importing evaluation results from another framework\n"
+  },
+  {
+   "metadata": {},
+   "cell_type": "markdown",
+   "source": [
+    "## Setup\n",
+    "\n",
+    "First, let's import the necessary modules:"
+   ]
+  },
+  {
+   "metadata": {},
+   "cell_type": "code",
+   "source": [
+    "from pprint import pprint\n",
+    "from scorebook import score\n",
+    "from scorebook.metrics.accuracy import Accuracy"
+   ],
+   "outputs": [],
+   "execution_count": null
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Prepare Your Data\n",
+    "\n",
+    "The `score()` function expects a list of items, where each item is a dictionary with:\n",
+    "- `input`: The input to the model (optional, for reference)\n",
+    "- `output`: The model's prediction\n",
+    "- `label`: The ground truth answer"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "metadata": {},
+   "source": [
+    "# Example: Pre-generated model outputs\n",
+    "items = [\n",
+    "    {\n",
+    "        \"input\": \"What is 2 + 2?\",\n",
+    "        \"output\": \"4\",\n",
+    "        \"label\": \"4\"\n",
+    "    },\n",
+    "    {\n",
+    "        \"input\": \"What is the capital of France?\",\n",
+    "        \"output\": \"Paris\",\n",
+    "        \"label\": \"Paris\"\n",
+    "    },\n",
+    "    {\n",
+    "        \"input\": \"Who wrote Romeo and Juliet?\",\n",
+    "        \"output\": \"William Shakespeare\",\n",
+    "        \"label\": \"William Shakespeare\"\n",
+    "    },\n",
+    "    {\n",
+    "        \"input\": \"What is 5 * 6?\",\n",
+    "        \"output\": \"30\",\n",
+    "        \"label\": \"30\"\n",
+    "    },\n",
+    "    {\n",
+    "        \"input\": \"What is the largest planet in our solar system?\",\n",
+    "        \"output\": \"Jupiter\",\n",
+    "        \"label\": \"Jupiter\"\n",
+    "    },\n",
+    "]\n",
+    "\n",
+    "print(f\"Prepared {len(items)} items for scoring\")"
+   ],
+   "outputs": [],
+   "execution_count": null
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Score the Results\n",
+    "\n",
+    "Now we'll use the `score()` function to compute accuracy metrics:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "metadata": {},
+   "source": [
+    "results = score(\n",
+    "    items=items,\n",
+    "    metrics=Accuracy,\n",
+    "    dataset_name=\"basic_questions\",\n",
+    "    model_name=\"example-model\",\n",
+    "    upload_results=False,  # Set to True to upload to Trismik\n",
+    ")\n",
+    "\n",
+    "pprint(results)"
+   ],
+   "outputs": [],
+   "execution_count": null
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Understanding the Results\n",
+    "\n",
+    "The results dictionary contains:\n",
+    "- `aggregate_results`: Overall metrics (e.g., accuracy across all items)\n",
+    "- `item_results`: Per-item scores and predictions\n",
+    "- `metadata`: Information about the dataset and model"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "metadata": {},
+   "source": [
+    "# View aggregate metrics\n",
+    "print(\"\\nAggregate Metrics:\")\n",
+    "print(f\"\\nAccuracy: {results['aggregate_results'][0]['accuracy']}\")\n",
+    "\n",
+    "# View per-item scores\n",
+    "print(\"\\nPer-Item Scores:\")\n",
+    "for i, item in enumerate(results['item_results'][:3], 1):\n",
+    "    print(f\"\\nItem {i}:\")\n",
+    "    print(f\"  Output: {item['output']}\")\n",
+    "    print(f\"  Label: {item['label']}\")\n",
+    "    print(f\"  Accuracy: {item['accuracy']}\")"
+   ],
+   "outputs": [],
+   "execution_count": null
+  },
+  {
+   "cell_type": "markdown",
+   "source": "## Next Steps\n\n- Try the **Evaluate** notebook to learn how to run inference and scoring together\n- See the **Upload Results** notebook to upload your scores to Trismik's dashboard\n- Explore custom metrics in the Scorebook documentation",
+   "metadata": {}
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.11.0"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}
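
The scoring notebook above sticks to `Accuracy`, but the same call should extend to the other metric modules added in this release. The sketch below is an assumption-heavy illustration: the `F1` class name and passing a list of metric classes to `score()` are inferred from the `Accuracy` pattern and the `metrics=[Accuracy]` usage in the Vertex examples, not confirmed by this diff; the new `scorebook/metrics/README.md` documents the actual names.

```python
# Sketch only: the F1 class name and list-of-metrics argument are assumptions
# inferred from the Accuracy pattern and EvalDataset's metrics=[Accuracy] usage;
# see scorebook/metrics/README.md for the authoritative API.
from scorebook import score
from scorebook.metrics.accuracy import Accuracy
from scorebook.metrics.f1 import F1  # assumed class name

items = [
    {"input": "Who wrote Romeo and Juliet?", "output": "William Shakespeare", "label": "William Shakespeare"},
    {"input": "What is 5 * 6?", "output": "30", "label": "30"},
]

results = score(
    items=items,
    metrics=[Accuracy, F1],  # assumed: score() accepts a list of metric classes
    dataset_name="basic_questions",
    model_name="example-model",
    upload_results=False,
)
print(results["aggregate_results"])
```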