scorebook 0.0.14-py3-none-any.whl → 0.0.15-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (77)
  1. scorebook/__init__.py +2 -0
  2. scorebook/dashboard/credentials.py +34 -4
  3. scorebook/eval_datasets/eval_dataset.py +2 -2
  4. scorebook/evaluate/_async/evaluate_async.py +27 -11
  5. scorebook/evaluate/_sync/evaluate.py +27 -11
  6. scorebook/metrics/README.md +121 -0
  7. scorebook/metrics/__init__.py +8 -0
  8. scorebook/metrics/accuracy.py +2 -6
  9. scorebook/metrics/bertscore.py +50 -0
  10. scorebook/metrics/bleu.py +82 -0
  11. scorebook/metrics/core/__init__.py +1 -0
  12. scorebook/metrics/{metric_base.py → core/metric_base.py} +1 -2
  13. scorebook/metrics/core/metric_registry.py +195 -0
  14. scorebook/metrics/exactmatch.py +95 -0
  15. scorebook/metrics/f1.py +96 -0
  16. scorebook/metrics/precision.py +84 -9
  17. scorebook/metrics/recall.py +94 -0
  18. scorebook/metrics/rouge.py +85 -0
  19. scorebook/score/score_helpers.py +28 -11
  20. scorebook/types.py +2 -2
  21. scorebook/utils/progress_bars.py +58 -786
  22. {scorebook-0.0.14.dist-info → scorebook-0.0.15.dist-info}/METADATA +32 -24
  23. scorebook-0.0.15.dist-info/RECORD +110 -0
  24. {scorebook-0.0.14.dist-info → scorebook-0.0.15.dist-info}/WHEEL +1 -1
  25. tutorials/README.md +147 -0
  26. tutorials/__init__.py +5 -0
  27. tutorials/examples/1-score/1-scoring_model_accuracy.py +47 -0
  28. tutorials/examples/1-score/2-scoring_model_bleu.py +46 -0
  29. tutorials/examples/1-score/3-scoring_model_f1.py +64 -0
  30. tutorials/examples/1-score/4-scoring_model_rouge.py +64 -0
  31. tutorials/examples/1-score/5-scoring_model_exact_match.py +84 -0
  32. tutorials/examples/1-score/6-scoring_with_bertscore.py +57 -0
  33. tutorials/examples/1-score/__init__.py +0 -0
  34. tutorials/examples/2-evaluate/1-evaluating_local_models.py +106 -0
  35. tutorials/examples/2-evaluate/2-evaluating_local_models_with_batching.py +108 -0
  36. tutorials/examples/2-evaluate/3-evaluating_cloud_models.py +109 -0
  37. tutorials/examples/2-evaluate/4-evaluating_cloud_models_with_batching.py +170 -0
  38. tutorials/examples/2-evaluate/5-hyperparameter_sweeps.py +122 -0
  39. tutorials/examples/2-evaluate/6-inference_pipelines.py +141 -0
  40. tutorials/examples/3-evaluation_datasets/1-evaluation_datasets_from_files.py +110 -0
  41. tutorials/examples/3-evaluation_datasets/2-evaluation_datasets_from_huggingface.py +101 -0
  42. tutorials/examples/3-evaluation_datasets/3-evaluation_datasets_from_huggingface_with_yaml_configs.py +110 -0
  43. tutorials/examples/3-evaluation_datasets/example_datasets/basic_questions.csv +11 -0
  44. tutorials/examples/3-evaluation_datasets/example_datasets/basic_questions.json +42 -0
  45. tutorials/examples/3-evaluation_datasets/example_yaml_configs/Cais-MMLU.yaml +19 -0
  46. tutorials/examples/3-evaluation_datasets/example_yaml_configs/TIGER-Lab-MMLU-Pro.yaml +18 -0
  47. tutorials/examples/4-adaptive_evaluations/1-adaptive_evaluation.py +114 -0
  48. tutorials/examples/4-adaptive_evaluations/2-adaptive_dataset_splits.py +106 -0
  49. tutorials/examples/5-upload_results/1-uploading_score_results.py +92 -0
  50. tutorials/examples/5-upload_results/2-uploading_evaluate_results.py +117 -0
  51. tutorials/examples/5-upload_results/3-uploading_your_results.py +153 -0
  52. tutorials/examples/6-providers/aws/__init__.py +1 -0
  53. tutorials/examples/6-providers/aws/batch_example.py +219 -0
  54. tutorials/examples/6-providers/portkey/__init__.py +1 -0
  55. tutorials/examples/6-providers/portkey/batch_example.py +120 -0
  56. tutorials/examples/6-providers/portkey/messages_example.py +121 -0
  57. tutorials/examples/6-providers/vertex/__init__.py +1 -0
  58. tutorials/examples/6-providers/vertex/batch_example.py +166 -0
  59. tutorials/examples/6-providers/vertex/messages_example.py +142 -0
  60. tutorials/examples/__init__.py +0 -0
  61. tutorials/notebooks/1-scoring.ipynb +162 -0
  62. tutorials/notebooks/2-evaluating.ipynb +316 -0
  63. tutorials/notebooks/3.1-adaptive_evaluation_phi.ipynb +354 -0
  64. tutorials/notebooks/3.2-adaptive_evaluation_gpt.ipynb +243 -0
  65. tutorials/notebooks/4-uploading_results.ipynb +175 -0
  66. tutorials/quickstarts/adaptive_evaluations/adaptive_evaluation_openai_demo.ipynb +229 -0
  67. tutorials/quickstarts/adaptive_evaluations/adaptive_evaluation_qwen_demo.ipynb +256 -0
  68. tutorials/quickstarts/classical_evaluations/classical_evaluation_demo.ipynb +277 -0
  69. tutorials/quickstarts/getting_started.ipynb +197 -0
  70. tutorials/utils/__init__.py +35 -0
  71. tutorials/utils/args_parser.py +132 -0
  72. tutorials/utils/output.py +23 -0
  73. tutorials/utils/setup.py +98 -0
  74. scorebook/metrics/metric_registry.py +0 -107
  75. scorebook-0.0.14.dist-info/RECORD +0 -53
  76. {scorebook-0.0.14.dist-info → scorebook-0.0.15.dist-info}/entry_points.txt +0 -0
  77. {scorebook-0.0.14.dist-info → scorebook-0.0.15.dist-info}/licenses/LICENSE +0 -0
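
Most of the new surface in 0.0.15 is the expanded metrics package (BLEU, ROUGE, F1, exact match, recall, BERTScore, plus a core registry) and the tutorials that exercise it. As a quick orientation before the per-file hunks below, here is a minimal sketch of the scoring workflow those tutorials demonstrate; it mirrors the code cells of tutorials/notebooks/1-scoring.ipynb as added in this release, and the score() keyword arguments and the aggregate_results / item_results keys are taken from that notebook rather than from separate API documentation.

from pprint import pprint

from scorebook import score
from scorebook.metrics.accuracy import Accuracy

# Pre-generated predictions: each item pairs a model output with its ground-truth label.
items = [
    {"input": "What is 2 + 2?", "output": "4", "label": "4"},
    {"input": "What is the capital of France?", "output": "Paris", "label": "Paris"},
]

# Compute metrics over existing outputs; no inference is run here.
results = score(
    items=items,
    metrics=Accuracy,
    dataset_name="basic_questions",
    model_name="example-model",
    upload_results=False,  # True would upload to the Trismik dashboard
)

pprint(results["aggregate_results"])  # aggregate metrics, e.g. overall accuracy
pprint(results["item_results"][:2])   # per-item scores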
tutorials/examples/6-providers/vertex/messages_example.py
@@ -0,0 +1,142 @@
+ """
+ Google Cloud Vertex AI Model Inference Example.
+
+ This example demonstrates how to evaluate language models using Google Cloud
+ Vertex AI's Gemini models with Scorebook for real-time API calls.
+
+ Prerequisites: Google Cloud SDK (gcloud) authenticated and GOOGLE_CLOUD_PROJECT
+ environment variable set, or pass project ID as command line argument.
+ """
+
+ import json
+ from pathlib import Path
+ from typing import Any, Dict
+
+ from dotenv import load_dotenv
+
+ from scorebook import EvalDataset, InferencePipeline, evaluate
+ from scorebook.inference.clients.vertex import responses
+ from scorebook.metrics.accuracy import Accuracy
+
+
+ def main() -> None:
+     """Run the Vertex AI inference example."""
+     # Load environment variables from .env file for configuration
+     load_dotenv()
+
+     output_dir, model_name, project_id = setup_arguments()
+
+     # Step 1: Load the evaluation dataset
+     # Create an EvalDataset from local JSON file
+     # - Uses 'answer' field as ground truth labels
+     # - Configures Accuracy metric for evaluation
+     # - Loads from examples/example_datasets/dataset.json
+     dataset = EvalDataset.from_json(
+         "examples/example_datasets/dataset.json", label="answer", metrics=[Accuracy]
+     )
+
+     # Step 2: Define the preprocessing function
+     # Convert raw dataset items into Vertex AI API-compatible format
+     # This function formats the question for the Gemini model
+     def preprocessor(eval_item: Dict[str, str]) -> str:
+         """Pre-process dataset items into Vertex AI string format."""
+         return eval_item["question"]
+
+     # Step 3: Define the postprocessing function
+     # Extract the final answer from Vertex AI API response
+     # Handles response parsing and returns the response text
+     def postprocessor(response: Any) -> str:
+         """Post-process Vertex AI response to extract the answer."""
+         return str(response.text.strip())
+
+     # Step 4: Create the inference pipeline for cloud-based evaluation
+     # Combine preprocessing, Vertex AI inference, and postprocessing
+     # Uses scorebook's built-in Vertex AI responses function for API calls
+
+     # Create a system message with instructions for direct answers
+     system_prompt = """
+     Answer the question directly and concisely.
+     Do not provide lengthy explanations unless specifically asked.
+     """.strip()
+
+     async def inference_function(items: list, **hyperparams: Any) -> Any:
+         return await responses(
+             items,
+             model=model_name,
+             project_id=project_id,
+             system_instruction=system_prompt,
+             **hyperparams,
+         )
+
+     inference_pipeline = InferencePipeline(
+         model=model_name,
+         preprocessor=preprocessor,
+         inference_function=inference_function,
+         postprocessor=postprocessor,
+     )
+
+     # Step 5: Run the cloud-based evaluation
+     # Execute evaluation using Vertex AI with the inference pipeline
+     # - Uses score_type="all" to get both aggregate and per-item results
+     # - Limits to 10 items for quick demonstration and cost control
+     print(f"Running Vertex AI evaluation with model: {model_name}")
+     print(f"Project ID: {project_id}")
+     print("Evaluating 10 items from local dataset...")
+
+     results = evaluate(inference_pipeline, dataset, item_limit=10, score_type="all")
+     print(results)
+
+     # Step 6: Save results to file
+     # Export evaluation results as JSON for later analysis
+     output_file = output_dir / "vertex_messages_output.json"
+     with open(output_file, "w") as f:
+         json.dump(results, f, indent=4)
+     print(f"Results saved in {output_file}")
+
+
+ # ============================================================================
+ # Utility Functions
+ # ============================================================================
+
+
+ def setup_arguments() -> tuple[Path, str, str]:
+     """Parse command line arguments."""
+     import argparse
+     import os
+
+     parser = argparse.ArgumentParser(description="Run Vertex AI evaluation and save results.")
+     parser.add_argument(
+         "--output-dir",
+         type=str,
+         default=str(Path.cwd() / "results"),
+         help="Directory to save evaluation outputs (JSON).",
+     )
+     parser.add_argument(
+         "--model",
+         type=str,
+         required=True,
+         help="Gemini model to use for inference (e.g., gemini-2.0-flash-001)",
+     )
+     parser.add_argument(
+         "--project-id",
+         type=str,
+         help="Google Cloud Project ID (defaults to GOOGLE_CLOUD_PROJECT env var)",
+     )
+     args = parser.parse_args()
+
+     output_dir = Path(args.output_dir)
+     output_dir.mkdir(parents=True, exist_ok=True)
+
+     # Handle project ID fallback
+     project_id = args.project_id or os.getenv("GOOGLE_CLOUD_PROJECT")
+     if not project_id:
+         raise ValueError(
+             "Project ID must be provided via --project-id or "
+             "GOOGLE_CLOUD_PROJECT environment variable"
+         )
+
+     return output_dir, str(args.model), str(project_id)
+
+
+ if __name__ == "__main__":
+     main()
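
A note on running this example: --model is required, while the project ID can come from either --project-id or the GOOGLE_CLOUD_PROJECT environment variable, so a typical invocation (with gcloud authentication already configured and a placeholder project ID) looks like python messages_example.py --model gemini-2.0-flash-001 --project-id my-gcp-project. Results land in ./results/vertex_messages_output.json unless --output-dir is overridden, and the dataset path examples/example_datasets/dataset.json is resolved relative to the working directory.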
tutorials/notebooks/1-scoring.ipynb
@@ -0,0 +1,162 @@
+ {
+ "cells": [
+ {
+ "metadata": {},
+ "cell_type": "markdown",
+ "source": "# Scoring Model Outputs with Scorebook\n\nThis notebook demonstrates how to use Scorebook's `score()` function to evaluate pre-generated model predictions.\n\n## When to use `score()`\n\n- You already have model predictions and want to compute metrics\n- You want to re-score existing results with different metrics\n- You're importing evaluation results from another framework\n"
+ },
+ {
+ "metadata": {},
+ "cell_type": "markdown",
+ "source": [
+ "## Setup\n",
+ "\n",
+ "First, let's import the necessary modules:"
+ ]
+ },
+ {
+ "metadata": {},
+ "cell_type": "code",
+ "source": [
+ "from pprint import pprint\n",
+ "from scorebook import score\n",
+ "from scorebook.metrics.accuracy import Accuracy"
+ ],
+ "outputs": [],
+ "execution_count": null
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Prepare Your Data\n",
+ "\n",
+ "The `score()` function expects a list of items, where each item is a dictionary with:\n",
+ "- `input`: The input to the model (optional, for reference)\n",
+ "- `output`: The model's prediction\n",
+ "- `label`: The ground truth answer"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "metadata": {},
+ "source": [
+ "# Example: Pre-generated model outputs\n",
+ "items = [\n",
+ " {\n",
+ " \"input\": \"What is 2 + 2?\",\n",
+ " \"output\": \"4\",\n",
+ " \"label\": \"4\"\n",
+ " },\n",
+ " {\n",
+ " \"input\": \"What is the capital of France?\",\n",
+ " \"output\": \"Paris\",\n",
+ " \"label\": \"Paris\"\n",
+ " },\n",
+ " {\n",
+ " \"input\": \"Who wrote Romeo and Juliet?\",\n",
+ " \"output\": \"William Shakespeare\",\n",
+ " \"label\": \"William Shakespeare\"\n",
+ " },\n",
+ " {\n",
+ " \"input\": \"What is 5 * 6?\",\n",
+ " \"output\": \"30\",\n",
+ " \"label\": \"30\"\n",
+ " },\n",
+ " {\n",
+ " \"input\": \"What is the largest planet in our solar system?\",\n",
+ " \"output\": \"Jupiter\",\n",
+ " \"label\": \"Jupiter\"\n",
+ " },\n",
+ "]\n",
+ "\n",
+ "print(f\"Prepared {len(items)} items for scoring\")"
+ ],
+ "outputs": [],
+ "execution_count": null
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Score the Results\n",
+ "\n",
+ "Now we'll use the `score()` function to compute accuracy metrics:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "metadata": {},
+ "source": [
+ "results = score(\n",
+ " items=items,\n",
+ " metrics=Accuracy,\n",
+ " dataset_name=\"basic_questions\",\n",
+ " model_name=\"example-model\",\n",
+ " upload_results=False, # Set to True to upload to Trismik\n",
+ ")\n",
+ "\n",
+ "pprint(results)"
+ ],
+ "outputs": [],
+ "execution_count": null
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Understanding the Results\n",
+ "\n",
+ "The results dictionary contains:\n",
+ "- `aggregates`: Overall metrics (e.g., accuracy across all items)\n",
+ "- `items`: Per-item scores and predictions\n",
+ "- `metadata`: Information about the dataset and model"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "metadata": {},
+ "source": [
+ "# View aggregate metrics\n",
+ "print(\"\\nAggregate Metrics:\")\n",
+ "print(f\"\\nAccuracy: {results['aggregate_results'][0]['accuracy']}\")\n",
+ "\n",
+ "# View per-item scores\n",
+ "print(\"\\nPer-Item Scores:\")\n",
+ "for i, item in enumerate(results['item_results'][:3], 1):\n",
+ " print(f\"\\nItem {i}:\")\n",
+ " print(f\" Output: {item['output']}\")\n",
+ " print(f\" Label: {item['label']}\")\n",
+ " print(f\" Accuracy: {item['accuracy']}\")"
+ ],
+ "outputs": [],
+ "execution_count": null
+ },
+ {
+ "cell_type": "markdown",
+ "source": "## Next Steps\n\n- Try the **Evaluate** notebook to learn how to run inference and scoring together\n- See the **Upload Results** notebook to upload your scores to Trismik's dashboard\n- Explore custom metrics in the Scorebook documentation",
+ "metadata": {}
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.11.0"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+ }
tutorials/notebooks/2-evaluating.ipynb
@@ -0,0 +1,316 @@
+ {
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Evaluating Models with Scorebook\n",
+ "\n",
+ "This notebook demonstrates how to use Scorebook's `evaluate()` function to run inference and compute metrics in a single step.\n",
+ "\n",
+ "## When to use `evaluate()`\n",
+ "\n",
+ "- You want to run inference on a dataset and score the results\n",
+ "- You're comparing different models on the same dataset\n",
+ "- You want to track hyperparameters alongside results\n",
+ "\n",
+ "## Prerequisites\n",
+ "\n",
+ "This example uses a local HuggingFace model. For cloud models (OpenAI), see the examples directory."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Setup\n",
+ "\n",
+ "Import necessary modules:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "metadata": {},
+ "source": [
+ "from pprint import pprint\n",
+ "from typing import Any, List\n",
+ "import transformers\n",
+ "\n",
+ "from scorebook import EvalDataset, evaluate"
+ ],
+ "outputs": [],
+ "execution_count": null
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Initialize Your Model\n",
+ "\n",
+ "Set up a HuggingFace pipeline for inference:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "metadata": {},
+ "source": [
+ "model_name = \"microsoft/Phi-4-mini-instruct\"\n",
+ "\n",
+ "pipeline = transformers.pipeline(\n",
+ " \"text-generation\",\n",
+ " model=model_name,\n",
+ " model_kwargs={\"torch_dtype\": \"auto\"},\n",
+ " device_map=\"auto\",\n",
+ ")\n",
+ "\n",
+ "print(f\"Model loaded: {model_name}\")"
+ ],
+ "outputs": [],
+ "execution_count": null
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Define Your Inference Function\n",
+ "\n",
+ "Create a function that processes inputs and returns outputs:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "metadata": {},
+ "source": [
+ "def inference(inputs: List[Any], **hyperparameters: Any) -> List[Any]:\n",
+ " \"\"\"Process inputs through the model.\n",
+ " \n",
+ " Args:\n",
+ " inputs: List of input values from the dataset\n",
+ " hyperparameters: Model hyperparameters (e.g., temperature, system_message)\n",
+ " \n",
+ " Returns:\n",
+ " List of model outputs\n",
+ " \"\"\"\n",
+ " outputs = []\n",
+ " \n",
+ " for input_val in inputs:\n",
+ " # Build messages for the model\n",
+ " messages = [\n",
+ " {\n",
+ " \"role\": \"system\",\n",
+ " \"content\": hyperparameters.get(\"system_message\", \"You are a helpful assistant.\")\n",
+ " },\n",
+ " {\"role\": \"user\", \"content\": str(input_val)},\n",
+ " ]\n",
+ " \n",
+ " # Run inference\n",
+ " result = pipeline(\n",
+ " messages,\n",
+ " max_new_tokens=hyperparameters.get(\"max_new_tokens\", 100),\n",
+ " )\n",
+ " \n",
+ " # Extract the answer\n",
+ " output = str(result[0][\"generated_text\"][-1][\"content\"])\n",
+ " outputs.append(output)\n",
+ " \n",
+ " return outputs"
+ ],
+ "outputs": [],
+ "execution_count": null
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Load Your Dataset\n",
+ "\n",
+ "Create an evaluation dataset from a JSON file:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "metadata": {},
+ "source": [
+ "# Create a sample dataset\n",
+ "sample_data = [\n",
+ " {\"question\": \"What is 2 + 2?\", \"answer\": \"4\"},\n",
+ " {\"question\": \"What is the capital of France?\", \"answer\": \"Paris\"},\n",
+ " {\"question\": \"Who wrote Romeo and Juliet?\", \"answer\": \"William Shakespeare\"},\n",
+ "]\n",
+ "\n",
+ "# Create EvalDataset directly from list\n",
+ "dataset = EvalDataset.from_list(\n",
+ " name=\"sample_questions\",\n",
+ " metrics=\"accuracy\",\n",
+ " items=sample_data,\n",
+ " input=\"question\",\n",
+ " label=\"answer\",\n",
+ ")\n",
+ "\n",
+ "print(f\"Loaded dataset with {len(dataset.items)} items\")"
+ ],
+ "outputs": [],
+ "execution_count": null
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Run Evaluation\n",
+ "\n",
+ "Use `evaluate()` to run inference and compute metrics:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "metadata": {},
+ "source": [
+ "results = evaluate(\n",
+ " inference,\n",
+ " dataset,\n",
+ " hyperparameters={\n",
+ " \"system_message\": \"Answer the question directly and concisely.\",\n",
+ " \"max_new_tokens\": 50,\n",
+ " },\n",
+ " return_aggregates=True,\n",
+ " return_items=True,\n",
+ " return_output=True,\n",
+ " upload_results=False, # Set to True to upload to Trismik\n",
+ ")\n",
+ "\n",
+ "pprint(results)"
+ ],
+ "outputs": [],
+ "execution_count": null
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Analyze Results\n",
+ "\n",
+ "Examine the outputs and metrics:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "metadata": {},
+ "source": [
+ "# Overall accuracy\n",
+ "print(f\"\\nOverall Accuracy: {results['aggregate_results'][0]['accuracy']:.2%}\")\n",
+ "\n",
+ "# Per-item results\n",
+ "print(\"\\nPer-Item Results:\")\n",
+ "for i, item in enumerate(results['item_results'], 1):\n",
+ " print(f\"\\nQuestion {i}: {item['input']}\")\n",
+ " print(f\" Model Output: {item['output']}\")\n",
+ " print(f\" Expected: {item['label']}\")\n",
+ " print(f\" Correct: {'✓' if item['accuracy'] == 1.0 else '✗'}\")"
+ ],
+ "outputs": [],
+ "execution_count": null
+ },
+ {
+ "metadata": {},
+ "cell_type": "markdown",
+ "source": [
+ "## Hyperparameter Sweeps\n",
+ "\n",
+ "Evaluate with different hyperparameters to find optimal settings:"
+ ]
+ },
+ {
+ "metadata": {},
+ "cell_type": "code",
+ "source": [
+ "# Define an inference function\n",
+ "def inference(inputs: List[Any], **hyperparameters: Any) -> List[Any]:\n",
+ " \"\"\"Process inputs through the model.\n",
+ "\n",
+ " Args:\n",
+ " inputs: Input values from an EvalDataset.\n",
+ " hyperparameters: Model hyperparameters including system_message, temperature, top_p, top_k.\n",
+ "\n",
+ " Returns:\n",
+ " List of model outputs for all inputs.\n",
+ " \"\"\"\n",
+ " outputs = []\n",
+ " for input_val in inputs:\n",
+ " # Preprocess: Build messages\n",
+ " messages = [\n",
+ " {\"role\": \"system\", \"content\": hyperparameters[\"system_message\"]},\n",
+ " {\"role\": \"user\", \"content\": str(input_val)},\n",
+ " ]\n",
+ "\n",
+ " # Run inference\n",
+ " result = pipeline(\n",
+ " messages,\n",
+ " temperature=hyperparameters[\"temperature\"],\n",
+ " top_p=hyperparameters.get(\"top_p\"),\n",
+ " top_k=hyperparameters.get(\"top_k\"),\n",
+ " )\n",
+ "\n",
+ " # Postprocess: Extract the answer\n",
+ " output = str(result[0][\"generated_text\"][-1][\"content\"])\n",
+ " outputs.append(output)\n",
+ "\n",
+ " return outputs\n",
+ "\n",
+ "# Define hyperparameters with lists of values to create a sweep\n",
+ "hyperparameters = {\n",
+ " \"system_message\": \"Answer the question directly and concisely.\",\n",
+ " \"temperature\": [0.6, 0.7, 0.8],\n",
+ " \"top_p\": [0.7, 0.8, 0.9],\n",
+ " \"top_k\": [10, 20, 30],\n",
+ "}\n",
+ "\n",
+ "# Run evaluation across all hyperparameter combinations\n",
+ "results = evaluate(\n",
+ " inference,\n",
+ " dataset,\n",
+ " hyperparameters=hyperparameters,\n",
+ " return_aggregates=True,\n",
+ " return_items=True,\n",
+ " return_output=True,\n",
+ " upload_results=False,\n",
+ ")\n",
+ "\n",
+ "pprint(results)"
+ ],
+ "outputs": [],
+ "execution_count": null
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Next Steps\n",
+ "\n",
+ "- Try the **Adaptive Evaluations** notebook for efficient testing with fewer questions\n",
+ "- See the **Upload Results** notebook to track results in Trismik's dashboard\n",
+ "- Explore batch processing for faster evaluation of large datasets"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3 (ipykernel)",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.13.5"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+ }
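
One detail worth spelling out in the sweep cell above: system_message is fixed while temperature, top_p, and top_k each list three candidate values, so (assuming, as the notebook's comment says, that every combination of listed values is evaluated) evaluate() runs the full cross-product of 3 × 3 × 3 = 27 hyperparameter configurations over the three-item dataset, with aggregate_results then presumably carrying one entry per configuration.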