themis-eval 0.1.1__py3-none-any.whl → 0.2.1__py3-none-any.whl
This diff shows the content of publicly available package versions as released to a supported registry. It is provided for informational purposes only and reflects the changes between the two versions as they appear in the public registry.
- themis/__init__.py +12 -1
- themis/_version.py +2 -2
- themis/api.py +429 -0
- themis/backends/__init__.py +17 -0
- themis/backends/execution.py +197 -0
- themis/backends/storage.py +260 -0
- themis/cli/commands/results.py +252 -0
- themis/cli/main.py +427 -57
- themis/comparison/__init__.py +25 -0
- themis/comparison/engine.py +348 -0
- themis/comparison/reports.py +283 -0
- themis/comparison/statistics.py +402 -0
- themis/core/entities.py +23 -3
- themis/evaluation/metrics/code/__init__.py +19 -0
- themis/evaluation/metrics/code/codebleu.py +144 -0
- themis/evaluation/metrics/code/execution.py +280 -0
- themis/evaluation/metrics/code/pass_at_k.py +181 -0
- themis/evaluation/metrics/nlp/__init__.py +21 -0
- themis/evaluation/metrics/nlp/bertscore.py +138 -0
- themis/evaluation/metrics/nlp/bleu.py +129 -0
- themis/evaluation/metrics/nlp/meteor.py +153 -0
- themis/evaluation/metrics/nlp/rouge.py +136 -0
- themis/evaluation/pipelines/standard_pipeline.py +68 -8
- themis/experiment/cache_manager.py +8 -3
- themis/experiment/export.py +110 -2
- themis/experiment/orchestrator.py +109 -11
- themis/experiment/storage.py +1457 -110
- themis/generation/providers/litellm_provider.py +46 -0
- themis/generation/runner.py +22 -6
- themis/integrations/huggingface.py +12 -1
- themis/integrations/wandb.py +13 -1
- themis/interfaces/__init__.py +86 -0
- themis/presets/__init__.py +10 -0
- themis/presets/benchmarks.py +354 -0
- themis/presets/models.py +190 -0
- themis/server/__init__.py +28 -0
- themis/server/app.py +337 -0
- themis_eval-0.2.1.dist-info/METADATA +596 -0
- {themis_eval-0.1.1.dist-info → themis_eval-0.2.1.dist-info}/RECORD +42 -19
- {themis_eval-0.1.1.dist-info → themis_eval-0.2.1.dist-info}/WHEEL +1 -1
- themis_eval-0.1.1.dist-info/METADATA +0 -758
- {themis_eval-0.1.1.dist-info → themis_eval-0.2.1.dist-info}/licenses/LICENSE +0 -0
- {themis_eval-0.1.1.dist-info → themis_eval-0.2.1.dist-info}/top_level.txt +0 -0
themis/presets/models.py
ADDED
@@ -0,0 +1,190 @@
"""Model name parsing and provider detection.

This module automatically detects the appropriate provider based on
model names, eliminating the need for users to specify providers manually.
"""

from __future__ import annotations

import re
from typing import Any


def parse_model_name(model: str, **kwargs: Any) -> tuple[str, str, dict[str, Any]]:
    """Parse model name and detect provider.

    Args:
        model: Model identifier (e.g., "gpt-4", "claude-3-opus", "llama-2-70b")
        **kwargs: Additional provider-specific options

    Returns:
        Tuple of (provider_name, model_id, provider_options)

    Examples:
        >>> parse_model_name("gpt-4")
        ("litellm", "gpt-4", {})

        >>> parse_model_name("claude-3-opus-20240229")
        ("litellm", "claude-3-opus-20240229", {})

        >>> parse_model_name("local-llm", base_url="http://localhost:1234/v1")
        ("litellm", "local-llm", {"base_url": "http://localhost:1234/v1"})
    """
    model_lower = model.lower()

    # OpenAI models
    if any(pattern in model_lower for pattern in ["gpt-", "o1-", "text-davinci"]):
        return "litellm", model, _extract_provider_options(kwargs)

    # Anthropic models
    if "claude" in model_lower:
        return "litellm", model, _extract_provider_options(kwargs)

    # Google models
    if any(pattern in model_lower for pattern in ["gemini", "palm"]):
        return "litellm", model, _extract_provider_options(kwargs)

    # Meta models
    if "llama" in model_lower:
        return "litellm", model, _extract_provider_options(kwargs)

    # Mistral models
    if "mistral" in model_lower or "mixtral" in model_lower:
        return "litellm", model, _extract_provider_options(kwargs)

    # Cohere models
    if "command" in model_lower and "xl" in model_lower:
        return "litellm", model, _extract_provider_options(kwargs)

    # AI21 models
    if "j2-" in model_lower:
        return "litellm", model, _extract_provider_options(kwargs)

    # Fake model for testing
    if "fake" in model_lower:
        return "fake", model, {}

    # Default: assume it's a litellm-compatible model
    # User can provide base_url for custom endpoints
    return "litellm", model, _extract_provider_options(kwargs)


def _extract_provider_options(kwargs: dict[str, Any]) -> dict[str, Any]:
    """Extract provider-specific options from kwargs.

    Args:
        kwargs: Dictionary of options

    Returns:
        Dictionary of provider options
    """
    provider_options = {}

    # Known provider options
    option_keys = [
        "api_key",
        "base_url",
        "api_base",
        "api_version",
        "timeout",
        "max_retries",
        "n_parallel",
        "organization",
        "api_type",
        "region_name",
    ]

    for key in option_keys:
        if key in kwargs:
            provider_options[key] = kwargs[key]

    return provider_options


def get_provider_for_model(model: str) -> str:
    """Get provider name for a model (without parsing full options).

    Args:
        model: Model identifier

    Returns:
        Provider name

    Examples:
        >>> get_provider_for_model("gpt-4")
        "litellm"

        >>> get_provider_for_model("claude-3-opus")
        "litellm"
    """
    provider, _, _ = parse_model_name(model)
    return provider


# Model family detection for preset selection
def get_model_family(model: str) -> str:
    """Get the model family for capability detection.

    Args:
        model: Model identifier

    Returns:
        Model family name

    Examples:
        >>> get_model_family("gpt-4-turbo")
        "gpt-4"

        >>> get_model_family("claude-3-opus-20240229")
        "claude-3-opus"
    """
    model_lower = model.lower()

    # OpenAI families
    if "gpt-4" in model_lower:
        return "gpt-4"
    if "gpt-3.5" in model_lower:
        return "gpt-3.5"
    if "o1" in model_lower:
        return "o1"

    # Anthropic families
    if "claude-3" in model_lower:
        if "opus" in model_lower:
            return "claude-3-opus"
        elif "sonnet" in model_lower:
            return "claude-3-sonnet"
        elif "haiku" in model_lower:
            return "claude-3-haiku"
        return "claude-3"
    if "claude-2" in model_lower:
        return "claude-2"

    # Google families
    if "gemini-pro" in model_lower:
        return "gemini-pro"
    if "gemini-ultra" in model_lower:
        return "gemini-ultra"

    # Meta families
    if "llama-2" in model_lower:
        if "70b" in model_lower:
            return "llama-2-70b"
        elif "13b" in model_lower:
            return "llama-2-13b"
        elif "7b" in model_lower:
            return "llama-2-7b"
        return "llama-2"
    if "llama-3" in model_lower:
        return "llama-3"

    # Mistral families
    if "mixtral" in model_lower:
        return "mixtral"
    if "mistral" in model_lower:
        return "mistral"

    return "unknown"


__all__ = ["parse_model_name", "get_provider_for_model", "get_model_family"]
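For orientation, a minimal usage sketch of the API above (not part of the wheel); the expected values follow directly from the implementation:

from themis.presets.models import parse_model_name, get_model_family

# Known vendor prefixes and unknown names alike route to litellm;
# only "fake" models get the dedicated test provider.
assert parse_model_name("gpt-4") == ("litellm", "gpt-4", {})

# Custom endpoints pass base_url through as a provider option.
provider, model_id, opts = parse_model_name(
    "local-llm", base_url="http://localhost:1234/v1"
)
assert (provider, opts) == ("litellm", {"base_url": "http://localhost:1234/v1"})

# Family detection for preset/capability selection.
assert get_model_family("gpt-4-turbo") == "gpt-4"
assert get_model_family("claude-3-opus-20240229") == "claude-3-opus"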
themis/server/__init__.py
ADDED
@@ -0,0 +1,28 @@
"""FastAPI server for Themis web dashboard.

This module provides a REST API and WebSocket interface for:
- Listing and viewing experiment runs
- Comparing multiple runs
- Real-time monitoring of running experiments
- Exporting results in various formats

The server is optional and requires the 'server' extra:
    pip install themis[server]
    # or
    uv pip install themis[server]

Usage:
    # Start the server
    themis serve --port 8080

    # Or programmatically
    from themis.server import create_app
    app = create_app(storage_path=".cache/experiments")

    # Run with uvicorn
    uvicorn themis.server:app --host 0.0.0.0 --port 8080
"""

from themis.server.app import create_app

__all__ = ["create_app"]
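A minimal launch script along the programmatic path the docstring describes, assuming the 'server' extra (FastAPI plus uvicorn) is installed; the script name is hypothetical, and the storage path and port mirror the docstring:

# launch_themis.py -- hypothetical helper script, not shipped in the wheel.
import uvicorn

from themis.server import create_app

app = create_app(storage_path=".cache/experiments")

if __name__ == "__main__":
    # Equivalent to: uvicorn launch_themis:app --host 0.0.0.0 --port 8080
    uvicorn.run(app, host="0.0.0.0", port=8080)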
themis/server/app.py
ADDED
@@ -0,0 +1,337 @@
"""FastAPI application for Themis server.

This module defines the main FastAPI app with REST endpoints and WebSocket support.
"""

from __future__ import annotations

import json
from pathlib import Path
from typing import Any, Dict, List

from fastapi import FastAPI, HTTPException, WebSocket, WebSocketDisconnect
from fastapi.middleware.cors import CORSMiddleware
from fastapi.responses import JSONResponse
from fastapi.staticfiles import StaticFiles
from pydantic import BaseModel, Field

from themis.comparison import compare_runs
from themis.comparison.statistics import StatisticalTest
from themis.experiment.storage import ExperimentStorage


class RunSummary(BaseModel):
    """Summary of an experiment run."""

    run_id: str
    experiment_id: str = "default"
    status: str
    num_samples: int = 0
    metrics: Dict[str, float] = Field(default_factory=dict)
    created_at: str | None = None


class RunDetail(BaseModel):
    """Detailed information about a run."""

    run_id: str
    experiment_id: str = "default"
    status: str
    num_samples: int
    metrics: Dict[str, float]
    samples: List[Dict[str, Any]] = Field(default_factory=list)
    metadata: Dict[str, Any] = Field(default_factory=dict)


class ComparisonRequest(BaseModel):
    """Request to compare multiple runs."""

    run_ids: List[str]
    metrics: List[str] | None = None
    statistical_test: str = "bootstrap"
    alpha: float = 0.05


class ErrorResponse(BaseModel):
    """Error response model."""

    error: str
    detail: str | None = None


def create_app(storage_path: str | Path = ".cache/experiments") -> FastAPI:
    """Create FastAPI application.

    Args:
        storage_path: Path to experiment storage

    Returns:
        Configured FastAPI application
    """
    app = FastAPI(
        title="Themis API",
        description="REST API for Themis experiment management",
        version="2.0.0",
    )

    # Enable CORS for web dashboard
    app.add_middleware(
        CORSMiddleware,
        allow_origins=["*"],  # Configure appropriately for production
        allow_credentials=True,
        allow_methods=["*"],
        allow_headers=["*"],
    )

    # Initialize storage
    storage = ExperimentStorage(storage_path)

    # Mount static files (dashboard)
    static_dir = Path(__file__).parent / "static"
    if static_dir.exists():
        app.mount("/dashboard", StaticFiles(directory=str(static_dir), html=True), name="static")

    # WebSocket connection manager
    class ConnectionManager:
        def __init__(self):
            self.active_connections: List[WebSocket] = []

        async def connect(self, websocket: WebSocket):
            await websocket.accept()
            self.active_connections.append(websocket)

        def disconnect(self, websocket: WebSocket):
            self.active_connections.remove(websocket)

        async def broadcast(self, message: dict):
            for connection in self.active_connections:
                await connection.send_json(message)

    manager = ConnectionManager()

    # ===== REST ENDPOINTS =====

    @app.get("/", tags=["health"])
    async def root():
        """Health check endpoint."""
        return {
            "status": "ok",
            "service": "themis-api",
            "version": "2.0.0",
        }

    @app.get("/api/runs", response_model=List[RunSummary], tags=["runs"])
    async def list_runs():
        """List all experiment runs."""
        run_ids = storage.list_runs()

        summaries = []
        for run_id in run_ids:
            # Load basic info
            eval_records = storage.load_cached_evaluations(run_id)

            # Calculate average metrics
            metrics_dict: Dict[str, List[float]] = {}
            for record in eval_records.values():
                for metric_name, score_obj in record.scores.items():
                    if metric_name not in metrics_dict:
                        metrics_dict[metric_name] = []

                    # Extract numeric score
                    if hasattr(score_obj, 'value'):
                        metrics_dict[metric_name].append(score_obj.value)
                    elif isinstance(score_obj, (int, float)):
                        metrics_dict[metric_name].append(float(score_obj))

            # Average metrics
            avg_metrics = {
                name: sum(scores) / len(scores) if scores else 0.0
                for name, scores in metrics_dict.items()
            }

            summaries.append(RunSummary(
                run_id=run_id,
                experiment_id="default",
                status="completed",
                num_samples=len(eval_records),
                metrics=avg_metrics,
            ))

        return summaries

    @app.get("/api/runs/{run_id}", response_model=RunDetail, tags=["runs"])
    async def get_run(run_id: str):
        """Get detailed information about a run."""
        if run_id not in storage.list_runs():
            raise HTTPException(status_code=404, detail=f"Run not found: {run_id}")

        # Load records
        eval_records = storage.load_cached_evaluations(run_id)
        gen_records_dict = storage.load_cached_records(run_id)

        # Calculate metrics
        metrics_dict: Dict[str, List[float]] = {}
        samples = []

        for cache_key, eval_record in eval_records.items():
            # Get generation record
            gen_record = gen_records_dict.get(cache_key)

            # Extract scores
            scores = {}
            for metric_name, score_obj in eval_record.scores.items():
                if hasattr(score_obj, 'value'):
                    value = score_obj.value
                elif isinstance(score_obj, (int, float)):
                    value = float(score_obj)
                else:
                    continue

                scores[metric_name] = value

                if metric_name not in metrics_dict:
                    metrics_dict[metric_name] = []
                metrics_dict[metric_name].append(value)

            # Build sample
            sample = {
                "id": gen_record.id if gen_record else cache_key,
                "prompt": gen_record.prompt if gen_record else "",
                "response": gen_record.response if gen_record else "",
                "scores": scores,
            }
            samples.append(sample)

        # Average metrics
        avg_metrics = {
            name: sum(scores) / len(scores) if scores else 0.0
            for name, scores in metrics_dict.items()
        }

        return RunDetail(
            run_id=run_id,
            experiment_id="default",
            status="completed",
            num_samples=len(eval_records),
            metrics=avg_metrics,
            samples=samples,
        )

    @app.delete("/api/runs/{run_id}", tags=["runs"])
    async def delete_run(run_id: str):
        """Delete a run."""
        if run_id not in storage.list_runs():
            raise HTTPException(status_code=404, detail=f"Run not found: {run_id}")

        # Note: Current storage doesn't implement delete
        # This is a placeholder for future implementation
        raise HTTPException(
            status_code=501,
            detail="Delete not implemented in current storage"
        )

    @app.post("/api/compare", tags=["comparison"])
    async def compare_runs_api(request: ComparisonRequest):
        """Compare multiple runs."""
        # Validate runs exist
        existing_runs = set(storage.list_runs())
        for run_id in request.run_ids:
            if run_id not in existing_runs:
                raise HTTPException(
                    status_code=404,
                    detail=f"Run not found: {run_id}"
                )

        if len(request.run_ids) < 2:
            raise HTTPException(
                status_code=400,
                detail="Need at least 2 runs to compare"
            )

        # Parse statistical test
        try:
            test_enum = StatisticalTest(request.statistical_test)
        except ValueError:
            raise HTTPException(
                status_code=400,
                detail=f"Invalid statistical test: {request.statistical_test}"
            )

        # Run comparison
        report = compare_runs(
            run_ids=request.run_ids,
            storage_path=storage._base_dir,
            metrics=request.metrics,
            statistical_test=test_enum,
            alpha=request.alpha,
        )

        return report.to_dict()

    @app.get("/api/benchmarks", tags=["presets"])
    async def list_benchmarks():
        """List available benchmark presets."""
        from themis.presets import list_benchmarks

        benchmarks = list_benchmarks()
        return {"benchmarks": benchmarks}

    # ===== WEBSOCKET ENDPOINTS =====

    @app.websocket("/ws")
    async def websocket_endpoint(websocket: WebSocket):
        """WebSocket endpoint for real-time updates.

        Messages sent from server:
        - {"type": "run_started", "run_id": "...", "data": {...}}
        - {"type": "run_progress", "run_id": "...", "progress": 0.5}
        - {"type": "run_completed", "run_id": "...", "data": {...}}
        - {"type": "error", "message": "..."}

        Messages expected from client:
        - {"type": "subscribe", "run_id": "..."}
        - {"type": "unsubscribe", "run_id": "..."}
        - {"type": "ping"}
        """
        await manager.connect(websocket)

        try:
            while True:
                # Receive message from client
                data = await websocket.receive_text()
                message = json.loads(data)

                msg_type = message.get("type")

                if msg_type == "ping":
                    await websocket.send_json({"type": "pong"})

                elif msg_type == "subscribe":
                    run_id = message.get("run_id")
                    # TODO: Implement run subscription logic
                    await websocket.send_json({
                        "type": "subscribed",
                        "run_id": run_id
                    })

                elif msg_type == "unsubscribe":
                    run_id = message.get("run_id")
                    # TODO: Implement unsubscribe logic
                    await websocket.send_json({
                        "type": "unsubscribed",
                        "run_id": run_id
                    })

                else:
                    await websocket.send_json({
                        "type": "error",
                        "message": f"Unknown message type: {msg_type}"
                    })

        except WebSocketDisconnect:
            manager.disconnect(websocket)

    return app


__all__ = ["create_app"]
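For orientation, a minimal client sketch (not shipped in the wheel) exercising the endpoints above against a locally running server; the base URL, port, and the run id "run-1" are illustrative assumptions:

import asyncio
import json

import requests
import websockets


def rest_demo(base: str = "http://localhost:8080") -> None:
    # Health check, then list runs and request a pairwise comparison.
    print(requests.get(f"{base}/").json())

    runs = requests.get(f"{base}/api/runs").json()
    run_ids = [run["run_id"] for run in runs]

    if len(run_ids) >= 2:  # /api/compare rejects fewer than 2 runs
        report = requests.post(
            f"{base}/api/compare",
            json={
                "run_ids": run_ids[:2],
                "statistical_test": "bootstrap",
                "alpha": 0.05,
            },
        )
        print(report.json())


async def ws_demo(uri: str = "ws://localhost:8080/ws") -> None:
    # Ping/pong plus a subscribe round-trip, per the protocol docstring.
    async with websockets.connect(uri) as ws:
        await ws.send(json.dumps({"type": "ping"}))
        print(await ws.recv())  # {"type": "pong"}

        await ws.send(json.dumps({"type": "subscribe", "run_id": "run-1"}))
        print(await ws.recv())  # {"type": "subscribed", "run_id": "run-1"}


if __name__ == "__main__":
    rest_demo()
    asyncio.run(ws_demo())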