scorebook-0.0.13-py3-none-any.whl → scorebook-0.0.15-py3-none-any.whl
This diff compares the contents of two publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only.
- scorebook/__init__.py +12 -5
- scorebook/cli/auth.py +1 -1
- scorebook/dashboard/__init__.py +1 -0
- scorebook/dashboard/create_project.py +91 -0
- scorebook/{trismik → dashboard}/credentials.py +57 -12
- scorebook/{trismik → dashboard}/upload_results.py +1 -1
- scorebook/eval_datasets/__init__.py +0 -4
- scorebook/eval_datasets/eval_dataset.py +4 -2
- scorebook/evaluate/__init__.py +1 -15
- scorebook/evaluate/_async/evaluate_async.py +36 -19
- scorebook/evaluate/_sync/evaluate.py +36 -19
- scorebook/evaluate/evaluate_helpers.py +4 -3
- scorebook/inference/__init__.py +1 -11
- scorebook/inference/clients/__init__.py +1 -8
- scorebook/inference/inference_pipeline.py +1 -1
- scorebook/metrics/README.md +121 -0
- scorebook/metrics/__init__.py +7 -16
- scorebook/metrics/accuracy.py +2 -6
- scorebook/metrics/bertscore.py +50 -0
- scorebook/metrics/bleu.py +82 -0
- scorebook/metrics/core/__init__.py +1 -0
- scorebook/metrics/{metric_base.py → core/metric_base.py} +1 -2
- scorebook/metrics/core/metric_registry.py +195 -0
- scorebook/metrics/exactmatch.py +95 -0
- scorebook/metrics/f1.py +96 -0
- scorebook/metrics/precision.py +84 -9
- scorebook/metrics/recall.py +94 -0
- scorebook/metrics/rouge.py +85 -0
- scorebook/score/__init__.py +0 -5
- scorebook/score/_async/score_async.py +3 -2
- scorebook/score/_sync/score.py +3 -2
- scorebook/score/score_helpers.py +29 -12
- scorebook/types.py +3 -3
- scorebook/utils/__init__.py +0 -22
- scorebook/utils/common_helpers.py +1 -1
- scorebook/utils/mock_llm/__init__.py +41 -0
- scorebook/utils/mock_llm/data/mock_llm_data.json +21970 -0
- scorebook/utils/progress_bars.py +58 -786
- scorebook-0.0.15.dist-info/METADATA +300 -0
- scorebook-0.0.15.dist-info/RECORD +110 -0
- {scorebook-0.0.13.dist-info → scorebook-0.0.15.dist-info}/WHEEL +1 -1
- tutorials/README.md +147 -0
- tutorials/__init__.py +5 -0
- tutorials/examples/1-score/1-scoring_model_accuracy.py +47 -0
- tutorials/examples/1-score/2-scoring_model_bleu.py +46 -0
- tutorials/examples/1-score/3-scoring_model_f1.py +64 -0
- tutorials/examples/1-score/4-scoring_model_rouge.py +64 -0
- tutorials/examples/1-score/5-scoring_model_exact_match.py +84 -0
- tutorials/examples/1-score/6-scoring_with_bertscore.py +57 -0
- tutorials/examples/1-score/__init__.py +0 -0
- tutorials/examples/2-evaluate/1-evaluating_local_models.py +106 -0
- tutorials/examples/2-evaluate/2-evaluating_local_models_with_batching.py +108 -0
- tutorials/examples/2-evaluate/3-evaluating_cloud_models.py +109 -0
- tutorials/examples/2-evaluate/4-evaluating_cloud_models_with_batching.py +170 -0
- tutorials/examples/2-evaluate/5-hyperparameter_sweeps.py +122 -0
- tutorials/examples/2-evaluate/6-inference_pipelines.py +141 -0
- tutorials/examples/3-evaluation_datasets/1-evaluation_datasets_from_files.py +110 -0
- tutorials/examples/3-evaluation_datasets/2-evaluation_datasets_from_huggingface.py +101 -0
- tutorials/examples/3-evaluation_datasets/3-evaluation_datasets_from_huggingface_with_yaml_configs.py +110 -0
- tutorials/examples/3-evaluation_datasets/example_datasets/basic_questions.csv +11 -0
- tutorials/examples/3-evaluation_datasets/example_datasets/basic_questions.json +42 -0
- tutorials/examples/3-evaluation_datasets/example_yaml_configs/Cais-MMLU.yaml +19 -0
- tutorials/examples/3-evaluation_datasets/example_yaml_configs/TIGER-Lab-MMLU-Pro.yaml +18 -0
- tutorials/examples/4-adaptive_evaluations/1-adaptive_evaluation.py +114 -0
- tutorials/examples/4-adaptive_evaluations/2-adaptive_dataset_splits.py +106 -0
- tutorials/examples/5-upload_results/1-uploading_score_results.py +92 -0
- tutorials/examples/5-upload_results/2-uploading_evaluate_results.py +117 -0
- tutorials/examples/5-upload_results/3-uploading_your_results.py +153 -0
- tutorials/examples/6-providers/aws/__init__.py +1 -0
- tutorials/examples/6-providers/aws/batch_example.py +219 -0
- tutorials/examples/6-providers/portkey/__init__.py +1 -0
- tutorials/examples/6-providers/portkey/batch_example.py +120 -0
- tutorials/examples/6-providers/portkey/messages_example.py +121 -0
- tutorials/examples/6-providers/vertex/__init__.py +1 -0
- tutorials/examples/6-providers/vertex/batch_example.py +166 -0
- tutorials/examples/6-providers/vertex/messages_example.py +142 -0
- tutorials/examples/__init__.py +0 -0
- tutorials/notebooks/1-scoring.ipynb +162 -0
- tutorials/notebooks/2-evaluating.ipynb +316 -0
- tutorials/notebooks/3.1-adaptive_evaluation_phi.ipynb +354 -0
- tutorials/notebooks/3.2-adaptive_evaluation_gpt.ipynb +243 -0
- tutorials/notebooks/4-uploading_results.ipynb +175 -0
- tutorials/quickstarts/adaptive_evaluations/adaptive_evaluation_openai_demo.ipynb +229 -0
- tutorials/quickstarts/adaptive_evaluations/adaptive_evaluation_qwen_demo.ipynb +256 -0
- tutorials/quickstarts/classical_evaluations/classical_evaluation_demo.ipynb +277 -0
- tutorials/quickstarts/getting_started.ipynb +197 -0
- tutorials/utils/__init__.py +35 -0
- tutorials/utils/args_parser.py +132 -0
- tutorials/utils/output.py +23 -0
- tutorials/utils/setup.py +98 -0
- scorebook/metrics/metric_registry.py +0 -105
- scorebook/trismik/__init__.py +0 -10
- scorebook-0.0.13.dist-info/METADATA +0 -389
- scorebook-0.0.13.dist-info/RECORD +0 -50
- {scorebook-0.0.13.dist-info → scorebook-0.0.15.dist-info}/entry_points.txt +0 -0
- {scorebook-0.0.13.dist-info → scorebook-0.0.15.dist-info}/licenses/LICENSE +0 -0
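
Release 0.0.15 reorganizes the metrics package (adding BLEU, ROUGE, F1, exact match, recall, and BERTScore alongside accuracy and precision, plus a new `metrics/core` registry) and ships a `tutorials/` tree. The sketch below illustrates the `score()` pattern used by the added tutorials for scoring pre-computed outputs; it is adapted from the notebook contents shown in the diff hunks that follow, and the assumption that the upload-related keyword arguments (`experiment_id`, `project_id`, `upload_results`) can be omitted for a local-only run is ours, not confirmed by this diff.

```python
# Minimal sketch of the score() pattern from the 0.0.15 tutorials (see the
# notebook diffs below). Assumption: the upload-related keyword arguments
# shown in those notebooks are optional and are omitted here.
from scorebook import score
from scorebook.metrics.accuracy import Accuracy

# Pre-computed model outputs paired with reference labels
items = [
    {"input": "What is 2 + 2?", "output": "4", "label": "4"},
    {"input": "What is the capital of France?", "output": "Paris", "label": "Paris"},
]

results = score(
    items=items,
    metrics=Accuracy,
    dataset_name="basic_questions",
    model_name="example-model-v1",
)
print(f"Accuracy: {results['aggregate_results'][0]['accuracy']:.2%}")
```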
tutorials/notebooks/3.2-adaptive_evaluation_gpt.ipynb
@@ -0,0 +1,243 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Adaptive Evaluations with Scorebook (GPT)\n",
+    "\n",
+    "This notebook demonstrates Trismik's adaptive evaluation feature using **OpenAI's GPT models** for high-accuracy results.\n",
+    "\n",
+    "> **Looking for a version without API costs?** See `3.1-adaptive_evaluation_phi.ipynb` for a version using local open-source models (Phi-3) that runs on your machine without API keys.\n",
+    "\n",
+    "## What are Adaptive Evaluations?\n",
+    "\n",
+    "Adaptive evaluations dynamically select questions based on a model's previous responses, similar to adaptive testing in education (like the GRE or GMAT).\n",
+    "\n",
+    "### Benefits:\n",
+    "- **More efficient**: Fewer questions needed to assess capability\n",
+    "- **Precise measurement**: Better statistical confidence intervals\n",
+    "- **Optimal difficulty**: Questions adapt to the model's skill level\n",
+    "\n",
+    "## Prerequisites\n",
+    "\n",
+    "- **Trismik API key**: Get yours at https://app.trismik.com/settings\n",
+    "- **Trismik Project**: Create a project at https://app.trismik.com and copy its Project ID\n",
+    "- **OpenAI API key**: For high-accuracy results on complex reasoning tasks"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": "## Setup Credentials\n\nSet your API credentials here:"
+  },
+  {
+   "cell_type": "code",
+   "metadata": {},
+   "source": "# STEP 1: Get your Trismik API key from https://app.trismik.com/settings\n# STEP 2: Create a project at https://app.trismik.com and copy the Project ID\n# STEP 3: Get your OpenAI API key from https://platform.openai.com/api-keys\n\n# Set your credentials here\nTRISMIK_API_KEY = \"your-trismik-api-key\"\nTRISMIK_PROJECT_ID = \"your-project-id\"\nOPENAI_API_KEY = \"your-openai-api-key\"",
+   "outputs": [],
+   "execution_count": null
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Import Dependencies"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "metadata": {},
+   "source": [
+    "import asyncio\n",
+    "import string\n",
+    "from pprint import pprint\n",
+    "from typing import Any, List\n",
+    "\n",
+    "from openai import AsyncOpenAI\n",
+    "from scorebook import evaluate_async, login"
+   ],
+   "outputs": [],
+   "execution_count": null
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Login to Trismik\n",
+    "\n",
+    "Authenticate with your Trismik account:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "metadata": {},
+   "source": "if not TRISMIK_API_KEY or TRISMIK_API_KEY == \"your-trismik-api-key\":\n    raise ValueError(\"Please set TRISMIK_API_KEY. Get your API key from https://app.trismik.com/settings\")\n\nlogin(TRISMIK_API_KEY)\nprint(\"✓ Logged in to Trismik\")\n\nif not TRISMIK_PROJECT_ID or TRISMIK_PROJECT_ID == \"your-project-id\":\n    raise ValueError(\"Please set TRISMIK_PROJECT_ID. Create a project at https://app.trismik.com\")\n\nprint(f\"✓ Using project: {TRISMIK_PROJECT_ID}\")",
+   "outputs": [],
+   "execution_count": null
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Initialize OpenAI Client"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "metadata": {},
+   "source": "if not OPENAI_API_KEY or OPENAI_API_KEY == \"your-openai-api-key\":\n    raise ValueError(\"Please set OPENAI_API_KEY. Get your API key from https://platform.openai.com/api-keys\")\n\nclient = AsyncOpenAI(api_key=OPENAI_API_KEY)  # pragma: allowlist secret\nmodel_name = \"gpt-4o-mini\"\n\nprint(f\"✓ Using model: {model_name}\")",
+   "outputs": [],
+   "execution_count": null
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Define Async Inference Function\n",
+    "\n",
+    "Create an async function to process inputs through the OpenAI API:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "metadata": {},
+   "source": [
+    "async def inference(inputs: List[Any], **hyperparameters: Any) -> List[Any]:\n",
+    "    \"\"\"Process inputs through OpenAI's API.\n",
+    "\n",
+    "    Args:\n",
+    "        inputs: Input values from an EvalDataset. For adaptive MMLU-Pro,\n",
+    "            each input is a dict with 'question' and 'options' keys.\n",
+    "        hyperparameters: Model hyperparameters.\n",
+    "\n",
+    "    Returns:\n",
+    "        List of model outputs for all inputs.\n",
+    "    \"\"\"\n",
+    "    outputs = []\n",
+    "\n",
+    "    for input_val in inputs:\n",
+    "        # Handle dict input from adaptive dataset\n",
+    "        if isinstance(input_val, dict):\n",
+    "            prompt = input_val.get(\"question\", \"\")\n",
+    "            if \"options\" in input_val:\n",
+    "                prompt += \"\\nOptions:\\n\" + \"\\n\".join(\n",
+    "                    f\"{letter}: {choice}\"\n",
+    "                    for letter, choice in zip(string.ascii_uppercase, input_val[\"options\"])\n",
+    "                )\n",
+    "        else:\n",
+    "            prompt = str(input_val)\n",
+    "\n",
+    "        # Build messages for OpenAI API\n",
+    "        messages = [\n",
+    "            {\n",
+    "                \"role\": \"system\",\n",
+    "                \"content\": \"Answer the question with a single letter representing the correct answer from the list of choices. Do not provide any additional explanation or output beyond the single letter.\",\n",
+    "            },\n",
+    "            {\"role\": \"user\", \"content\": prompt},\n",
+    "        ]\n",
+    "\n",
+    "        # Call OpenAI API\n",
+    "        try:\n",
+    "            response = await client.chat.completions.create(\n",
+    "                model=model_name,\n",
+    "                messages=messages,\n",
+    "                temperature=0.7,\n",
+    "            )\n",
+    "            output = response.choices[0].message.content.strip()\n",
+    "        except Exception as e:\n",
+    "            output = f\"Error: {str(e)}\"\n",
+    "\n",
+    "        outputs.append(output)\n",
+    "\n",
+    "    return outputs"
+   ],
+   "outputs": [],
+   "execution_count": null
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Run Adaptive Evaluation\n",
+    "\n",
+    "Use `evaluate_async()` with an adaptive dataset (indicated by the `:adaptive` suffix):"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "metadata": {},
+   "source": [
+    "print(f\"Running adaptive evaluation on Common Sense QA with model: {model_name}\")\n",
+    "print(\"Note: Adaptive evaluation selects questions dynamically based on responses.\\n\")\n",
+    "\n",
+    "# Run adaptive evaluation\n",
+    "results = await evaluate_async(\n",
+    "    inference,\n",
+    "    datasets=\"trismik/CommonSenseQA:adaptive\",  # Adaptive datasets have the \":adaptive\" suffix\n",
+    "    experiment_id=\"Adaptive-Common-Sense-QA-Notebook\",\n",
+    "    project_id=TRISMIK_PROJECT_ID,\n",
+    "    return_dict=True,\n",
+    "    return_aggregates=True,\n",
+    "    return_items=True,\n",
+    "    return_output=True,\n",
+    ")\n",
+    "\n",
+    "print(\"\\n✓ Adaptive evaluation complete!\")"
+   ],
+   "outputs": [],
+   "execution_count": null
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## View Results"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "metadata": {},
+   "source": [
+    "pprint(results)"
+   ],
+   "outputs": [],
+   "execution_count": null
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## View Results on Dashboard\n",
+    "\n",
+    "Your results have been uploaded to Trismik's dashboard for visualization and tracking:"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": "## Understanding Adaptive Testing\n\n### How it works:\n1. **Initial Questions**: Start with medium-difficulty questions\n2. **Adaptation**: If the model answers correctly, harder questions follow; if incorrect, easier questions are selected\n3. **Convergence**: The test converges to the model's true ability level\n4. **Stopping Criteria**: Stops when sufficient confidence is reached\n\n### Benefits vs. Traditional Testing:\n- **Efficiency**: Typically requires 50-70% fewer questions for the same precision\n- **Precision**: Better estimates of model capability\n- **Engagement**: Questions are appropriately challenging\n\n## Next Steps\n\n- Try adaptive evaluation with different models to compare\n- **Don't have an OpenAI API key?** See `3.1-adaptive_evaluation_phi.ipynb` to run adaptive evaluations with local open-source models (Phi-3, Llama, etc.)\n- Explore other adaptive datasets available on Trismik\n- See the **Upload Results** notebook for non-adaptive result tracking"
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.11.0"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}
tutorials/notebooks/4-uploading_results.ipynb
@@ -0,0 +1,175 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": "# Uploading Results to Trismik Dashboard\n\nThis notebook demonstrates three ways to upload evaluation results to Trismik's dashboard for tracking and visualization.\n\n## Why Upload Results?\n\n- **Track Progress**: Monitor model performance over time\n- **Compare Models**: Visualize performance across different models and experiments\n- **Share Results**: Collaborate with your team on evaluation insights\n- **Historical Analysis**: Maintain a record of all evaluations\n\n## Prerequisites\n\n- **Trismik API key**: Get yours at https://app.trismik.com/settings\n- **Trismik Project**: Create a project at https://app.trismik.com and copy its Project ID"
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": "## Setup Credentials\n\nSet your Trismik credentials here:"
+  },
+  {
+   "cell_type": "code",
+   "metadata": {},
+   "source": "# STEP 1: Get your Trismik API key from https://app.trismik.com/settings\n# STEP 2: Create a project at https://app.trismik.com and copy the Project ID\n\n# Set your credentials here\nTRISMIK_API_KEY = \"your-trismik-api-key\"\nTRISMIK_PROJECT_ID = \"your-project-id\"",
+   "outputs": [],
+   "execution_count": null
+  },
+  {
+   "cell_type": "code",
+   "metadata": {},
+   "source": [
+    "from pprint import pprint\n",
+    "from scorebook import score, login\n",
+    "from scorebook.metrics.accuracy import Accuracy"
+   ],
+   "outputs": [],
+   "execution_count": null
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Login to Trismik"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "metadata": {},
+   "source": "if not TRISMIK_API_KEY or TRISMIK_API_KEY == \"your-trismik-api-key\":\n    raise ValueError(\"Please set TRISMIK_API_KEY. Get your API key from https://app.trismik.com/settings\")\n\nlogin(TRISMIK_API_KEY)\nprint(\"✓ Logged in to Trismik\")\n\nif not TRISMIK_PROJECT_ID or TRISMIK_PROJECT_ID == \"your-project-id\":\n    raise ValueError(\"Please set TRISMIK_PROJECT_ID. Create a project at https://app.trismik.com\")\n\nprint(f\"✓ Using project: {TRISMIK_PROJECT_ID}\")",
+   "outputs": [],
+   "execution_count": null
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Method 1: Upload score() Results\n",
+    "\n",
+    "Score pre-computed outputs and upload to Trismik:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "metadata": {},
+   "source": "# Prepare items with pre-computed outputs\nitems = [\n    {\"input\": \"What is 2 + 2?\", \"output\": \"4\", \"label\": \"4\"},\n    {\"input\": \"What is the capital of France?\", \"output\": \"Paris\", \"label\": \"Paris\"},\n    {\"input\": \"Who wrote Romeo and Juliet?\", \"output\": \"William Shakespeare\", \"label\": \"William Shakespeare\"},\n    {\"input\": \"What is 5 * 6?\", \"output\": \"30\", \"label\": \"30\"},\n    {\"input\": \"What is the largest planet?\", \"output\": \"Jupiter\", \"label\": \"Jupiter\"},\n]\n\n# Score and upload\nresults = score(\n    items=items,\n    metrics=Accuracy,\n    dataset_name=\"basic_questions\",\n    model_name=\"example-model-v1\",\n    experiment_id=\"Score-Upload-Notebook\",\n    project_id=TRISMIK_PROJECT_ID,\n    metadata={\n        \"description\": \"Example from Jupyter notebook\",\n        \"note\": \"Pre-computed outputs uploaded via score()\",\n    },\n    upload_results=True,  # Enable uploading\n)\n\nprint(f\"\\n✓ Results uploaded successfully!\")\nprint(f\"Accuracy: {results['aggregate_results'][0]['accuracy']:.2%}\")",
+   "outputs": [],
+   "execution_count": null
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Method 2: Upload evaluate() Results\n",
+    "\n",
+    "Run inference and automatically upload results:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "metadata": {},
+   "source": "from typing import Any, List\nfrom scorebook import EvalDataset, evaluate\n\n# Create a simple dataset\nimport json\nfrom pathlib import Path\n\nsample_data = [\n    {\"question\": \"What is 10 + 5?\", \"answer\": \"15\"},\n    {\"question\": \"What is the capital of Spain?\", \"answer\": \"Madrid\"},\n]\n\ntemp_file = Path(\"temp_eval_dataset.json\")\nwith open(temp_file, \"w\") as f:\n    json.dump(sample_data, f)\n\ndataset = EvalDataset.from_json(\n    path=str(temp_file),\n    metrics=\"accuracy\",\n    input=\"question\",\n    label=\"answer\",\n)\n\n# Define a simple inference function (mock)\ndef mock_inference(inputs: List[Any], **hyperparameters: Any) -> List[Any]:\n    \"\"\"Mock inference that returns the expected answers.\"\"\"\n    # In practice, this would call your model\n    return [\"15\", \"Madrid\"]  # Mock perfect answers\n\n# Run evaluation with upload\neval_results = evaluate(\n    mock_inference,\n    dataset,\n    hyperparameters={\"temperature\": 0.7},\n    experiment_id=\"Evaluate-Upload-Notebook\",\n    project_id=TRISMIK_PROJECT_ID,\n    metadata={\n        \"model\": \"mock-model\",\n        \"description\": \"Evaluation results from notebook\",\n    },\n    return_aggregates=True,\n    return_items=True,\n    return_output=True,\n)\n\nprint(f\"\\n✓ Evaluation results uploaded!\")\nprint(f\"Accuracy: {eval_results['aggregate_results'][0]['accuracy']:.2%}\")\n\n# Cleanup\ntemp_file.unlink()",
+   "outputs": [],
+   "execution_count": null
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Method 3: Upload External Results\n",
+    "\n",
+    "Import results from external evaluation frameworks or historical data:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "metadata": {},
+   "source": "# Example: Import results from another evaluation framework\nexternal_results = [\n    {\"input\": \"Translate 'hello' to Spanish\", \"output\": \"hola\", \"label\": \"hola\"},\n    {\"input\": \"Translate 'goodbye' to Spanish\", \"output\": \"adiós\", \"label\": \"adiós\"},\n    {\"input\": \"Translate 'thank you' to Spanish\", \"output\": \"gracias\", \"label\": \"gracias\"},\n    {\"input\": \"Translate 'please' to Spanish\", \"output\": \"por favor\", \"label\": \"por favor\"},\n]\n\n# Upload external results\nexternal_upload = score(\n    items=external_results,\n    metrics=\"accuracy\",\n    dataset_name=\"spanish_translation\",\n    model_name=\"external-translator-v2\",\n    experiment_id=\"External-Results-Upload\",\n    project_id=TRISMIK_PROJECT_ID,\n    metadata={\n        \"description\": \"Historical results imported from external framework\",\n        \"source\": \"Custom evaluation pipeline\",\n        \"date\": \"2025-01-15\",\n    },\n    upload_results=True,\n)\n\nprint(f\"\\n✓ External results uploaded!\")\nprint(f\"Accuracy: {external_upload['aggregate_results'][0]['accuracy']:.2%}\")",
+   "outputs": [],
+   "execution_count": null
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## View Results on Dashboard\n",
+    "\n",
+    "All uploaded results are now visible on your Trismik dashboard:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "metadata": {},
+   "source": "from IPython.display import display, Markdown\n\ndashboard_url = f\"https://app.trismik.com/projects/{TRISMIK_PROJECT_ID}\"\ndisplay(Markdown(f\"### 📊 [View All Results on Dashboard]({dashboard_url})\"))\nprint(f\"\\nDirect link: {dashboard_url}\")\nprint(\"\\nYou should see three experiments:\")\nprint(\"  1. Score-Upload-Notebook\")\nprint(\"  2. Evaluate-Upload-Notebook\")\nprint(\"  3. External-Results-Upload\")",
+   "outputs": [],
+   "execution_count": null
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Organizing Results with Metadata\n",
+    "\n",
+    "Use metadata to add context and organization to your results:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "metadata": {},
+   "source": "# Example: Organizing a model comparison experiment\nmodels_to_test = [\n    {\"name\": \"model-a\", \"version\": \"1.0\"},\n    {\"name\": \"model-b\", \"version\": \"2.0\"},\n]\n\ntest_items = [\n    {\"output\": \"positive\", \"label\": \"positive\"},\n    {\"output\": \"negative\", \"label\": \"negative\"},\n]\n\nfor model_info in models_to_test:\n    result = score(\n        items=test_items,\n        metrics=Accuracy,\n        dataset_name=\"sentiment_test\",\n        model_name=model_info[\"name\"],\n        experiment_id=\"Model-Comparison-Notebook\",\n        project_id=TRISMIK_PROJECT_ID,\n        metadata={\n            \"model_version\": model_info[\"version\"],\n            \"comparison_group\": \"sentiment_analysis\",\n            \"date\": \"2025-01-26\",\n            \"notes\": f\"Testing {model_info['name']} v{model_info['version']}\",\n        },\n        upload_results=True,\n    )\n    print(f\"✓ Uploaded results for {model_info['name']} v{model_info['version']}\")",
+   "outputs": [],
+   "execution_count": null
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Best Practices\n",
+    "\n",
+    "### Experiment Naming\n",
+    "- Use descriptive `experiment_id` values (e.g., \"GPT4-MMLU-Baseline\")\n",
+    "- Group related runs under the same experiment ID\n",
+    "- Use different experiment IDs for different types of tests\n",
+    "\n",
+    "### Metadata\n",
+    "- Include model version, hyperparameters, and configuration\n",
+    "- Add timestamps and descriptions for historical tracking\n",
+    "- Use consistent keys across experiments for easy comparison\n",
+    "\n",
+    "### Organization\n",
+    "- Create separate projects for different use cases\n",
+    "- Use tags or metadata fields to categorize experiments\n",
+    "- Document your evaluation methodology in metadata\n",
+    "\n",
+    "## Next Steps\n",
+    "\n",
+    "- Explore the Trismik dashboard to visualize trends and comparisons\n",
+    "- Set up automated evaluation pipelines with result uploading\n",
+    "- Try the **Adaptive Evaluations** notebook for efficient testing with automatic uploads"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.11.0"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}
tutorials/quickstarts/adaptive_evaluations/adaptive_evaluation_openai_demo.ipynb
@@ -0,0 +1,229 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "bc3ba3cd77800bb4",
+   "metadata": {},
+   "source": [
+    "# Adaptive Evaluations with Scorebook - Evaluating an OpenAI GPT Model\n",
+    "\n",
+    "This quick-start guide showcases an adaptive evaluation of OpenAI's GPT-4o Mini model.\n",
+    "\n",
+    "We recommend that you first see our [getting started quick-start guide](https://colab.research.google.com/github/trismik/scorebook/blob/main/tutorials/quickstarts/getting_started.ipynb) if you have not done so already, for a more detailed overview of adaptive testing and setting up Trismik credentials.\n",
+    "\n",
+    "## Prerequisites\n",
+    "\n",
+    "- **Trismik API key**: Generate a Trismik API key from the [Trismik dashboard's settings page](https://app.trismik.com/settings).\n",
+    "- **Trismik Project ID**: We recommend you use the project ID generated in the [Getting Started Quick-Start Guide](https://colab.research.google.com/github/trismik/scorebook/blob/main/tutorials/quickstarts/getting_started.ipynb).\n",
+    "- **OpenAI API key**: Generate an OpenAI API key from [OpenAI's API Platform](https://openai.com/api/).\n",
+    "\n",
+    "## Install Scorebook"
+   ]
+  },
+  {
+   "metadata": {},
+   "cell_type": "code",
+   "source": [
+    "!pip install scorebook\n",
+    "# if you're running this locally, please run !pip install \"scorebook[examples, providers]\""
+   ],
+   "id": "f454e876551a4a0c",
+   "outputs": [],
+   "execution_count": null
+  },
+  {
+   "metadata": {},
+   "cell_type": "markdown",
+   "source": [
+    "\n",
+    "## Setup Credentials\n",
+    "\n",
+    "Enter your Trismik API key, project ID, and OpenAI API key below."
+   ],
+   "id": "cad992b287d4d0ac"
+  },
+  {
+   "cell_type": "code",
+   "id": "14e576282749edb7",
+   "metadata": {},
+   "source": [
+    "# Set your credentials here\n",
+    "TRISMIK_API_KEY = \"your-trismik-api-key-here\"\n",
+    "TRISMIK_PROJECT_ID = \"your-trismik-project-id-here\"\n",
+    "OPENAI_API_KEY = \"your-openai-api-key-here\""
+   ],
+   "outputs": [],
+   "execution_count": null
+  },
+  {
+   "cell_type": "markdown",
+   "id": "700950d039e4c0f6",
+   "metadata": {},
+   "source": [
+    "## Login with Trismik API Key"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "id": "initial_id",
+   "metadata": {},
+   "source": [
+    "from scorebook import login\n",
+    "\n",
+    "# Login to Trismik\n",
+    "login(TRISMIK_API_KEY)\n",
+    "print(\"✓ Logged in to Trismik\")"
+   ],
+   "outputs": [],
+   "execution_count": null
+  },
+  {
+   "cell_type": "markdown",
+   "id": "13084db21e549ccf",
+   "metadata": {},
+   "source": [
+    "## Define an Inference Function\n",
+    "\n",
+    "To evaluate a model with Scorebook, it must be encapsulated within an inference function. An inference function must accept a list of model inputs, pass them to the model for inference, and collect and return the generated outputs.\n",
+    "\n",
+    "An inference function can be defined to encapsulate any model, local or cloud-hosted. There is flexibility in how an inference function can be defined; the only requirement is the function signature. An inference function must:\n",
+    "\n",
+    "Accept:\n",
+    "\n",
+    "- A list of model inputs.\n",
+    "- Hyperparameters, which can be optionally accessed via kwargs.\n",
+    "\n",
+    "Return:\n",
+    "\n",
+    "- A list of parsed model outputs for scoring."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "id": "8aa99f513db6241a",
+   "metadata": {},
+   "source": [
+    "from openai import OpenAI\n",
+    "from typing import Any, List\n",
+    "import string\n",
+    "\n",
+    "client = OpenAI(api_key=OPENAI_API_KEY)\n",
+    "\n",
+    "# define an inference function for GPT-4o Mini.\n",
+    "def gpt4o_mini(inputs: List[Any], **hyperparameters: Any) -> List[Any]:\n",
+    "    \"\"\"Process inputs through OpenAI's API\"\"\"\n",
+    "\n",
+    "    outputs = []\n",
+    "    for idx, input_item in enumerate(inputs):\n",
+    "\n",
+    "        # Format prompt\n",
+    "        choices = input_item.get(\"options\", [])\n",
+    "        prompt = (\n",
+    "            str(input_item.get(\"question\", \"\"))\n",
+    "            + \"\\nOptions:\\n\"\n",
+    "            + \"\\n\".join(\n",
+    "                f\"{letter}: {choice['text'] if isinstance(choice, dict) else choice}\"\n",
+    "                for letter, choice in zip(string.ascii_uppercase, choices)\n",
+    "            )\n",
+    "        )\n",
+    "\n",
+    "        # Build messages for OpenAI API\n",
+    "        messages = [\n",
+    "            {\n",
+    "                \"role\": \"system\",\n",
+    "                \"content\": hyperparameters[\"system_message\"]\n",
+    "            },\n",
+    "            {\"role\": \"user\", \"content\": prompt},\n",
+    "        ]\n",
+    "\n",
+    "        # Call OpenAI API and extract output from the response\n",
+    "        try:\n",
+    "            response = client.chat.completions.create(\n",
+    "                model=\"gpt-4o-mini\",\n",
+    "                messages=messages,\n",
+    "                temperature=0.7,\n",
+    "            )\n",
+    "            output = response.choices[0].message.content.strip()\n",
+    "\n",
+    "        except Exception as e:\n",
+    "            output = f\"Error: {str(e)}\"\n",
+    "\n",
+    "        outputs.append(output)\n",
+    "\n",
+    "    return outputs"
+   ],
+   "outputs": [],
+   "execution_count": null
+  },
+  {
+   "cell_type": "markdown",
+   "id": "efa5c3ea791bbcd1",
+   "metadata": {},
+   "source": [
+    "## Run an Adaptive Evaluation\n",
+    "\n",
+    "When running an adaptive evaluation, we can use one or more adaptive datasets and specify a split to be evaluated."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "id": "3cbf1b2f13d5553e",
+   "metadata": {},
+   "source": [
+    "from scorebook import evaluate\n",
+    "\n",
+    "# Run adaptive evaluation\n",
+    "results = evaluate(\n",
+    "    inference = gpt4o_mini,\n",
+    "    datasets = \"trismik/figQA:adaptive\",\n",
+    "    hyperparameters = {\"system_message\": \"Answer the question with only the letter of the correct option. No additional text or context\"},\n",
+    "    split = \"validation\",\n",
+    "    experiment_id = \"GPT-4o-Mini-Adaptive-Evaluation\",\n",
+    "    project_id = TRISMIK_PROJECT_ID,\n",
+    ")\n",
+    "\n",
+    "# Print the adaptive evaluation results\n",
+    "print(\"✓ Adaptive evaluation complete!\")\n",
+    "print(\"Results: \", results[0][\"score\"])"
+   ],
+   "outputs": [],
+   "execution_count": null
+  },
+  {
+   "cell_type": "markdown",
+   "id": "d37cb5e87cc297fe",
+   "metadata": {},
+   "source": [
+    "---\n",
+    "\n",
+    "## Next Steps\n",
+    "\n",
+    "- [Adaptive Testing White Paper](https://docs.trismik.com/adaptiveTesting/adaptive-testing-introduction/): An in-depth overview of the science behind the adaptive testing methodology.\n",
+    "- [Dataset Page](https://dashboard.trismik.com/datasets): Trismik's full set of currently available adaptive datasets on the Trismik dashboard.\n",
+    "- [Scorebook Docs](https://docs.trismik.com/scorebook/introduction-to-scorebook/): Scorebook's full documentation.\n",
+    "- [Scorebook Repository](https://github.com/trismik/scorebook): Scorebook is an open-source library; view the code and more examples."
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.13.5"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}