PyPI - maxllm-gate - Versions diffs - 0.2.0__py3-none-any.whl - Mend

maxllm-gate 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (43)

llm_scheduler/__init__.py +8 -0
llm_scheduler/api/__init__.py +6 -0
llm_scheduler/api/dependencies.py +10 -0
llm_scheduler/api/routes.py +275 -0
llm_scheduler/api/schemas.py +135 -0
llm_scheduler/config.py +117 -0
llm_scheduler/core/__init__.py +8 -0
llm_scheduler/core/dispatcher.py +225 -0
llm_scheduler/core/queue_manager.py +251 -0
llm_scheduler/core/scheduler.py +236 -0
llm_scheduler/core/token_estimator.py +201 -0
llm_scheduler/main.py +86 -0
llm_scheduler/models/__init__.py +6 -0
llm_scheduler/models/provider.py +103 -0
llm_scheduler/models/request.py +101 -0
llm_scheduler/observability/__init__.py +6 -0
llm_scheduler/observability/logging.py +65 -0
llm_scheduler/observability/metrics.py +92 -0
llm_scheduler/rate_limiting/__init__.py +7 -0
llm_scheduler/rate_limiting/key_manager.py +252 -0
llm_scheduler/rate_limiting/token_bucket.py +152 -0
llm_scheduler/rate_limiting/tracker.py +281 -0
llm_scheduler/strategies/__init__.py +7 -0
llm_scheduler/strategies/base.py +56 -0
llm_scheduler/strategies/fallback.py +52 -0
llm_scheduler/strategies/least_utilized.py +30 -0
llm_scheduler/strategies/round_robin.py +29 -0
llm_scheduler/strategies/token_aware.py +46 -0
llm_scheduler/utils/__init__.py +6 -0
llm_scheduler/utils/retry.py +136 -0
llm_scheduler/utils/time_utils.py +115 -0
maxllm/__init__.py +77 -0
maxllm/client.py +598 -0
maxllm/config.py +181 -0
maxllm/rate_limiter.py +432 -0
maxllm/redis_backend.py +495 -0
maxllm/scheduler.py +559 -0
maxllm/validation.py +183 -0
maxllm_gate-0.2.0.dist-info/METADATA +771 -0
maxllm_gate-0.2.0.dist-info/RECORD +43 -0
maxllm_gate-0.2.0.dist-info/WHEEL +4 -0
maxllm_gate-0.2.0.dist-info/entry_points.txt +2 -0
maxllm_gate-0.2.0.dist-info/licenses/LICENSE +21 -0

llm_scheduler/__init__.py ADDED Viewed

@@ -0,0 +1,8 @@
+"""LLM Rate Limit Scheduler - Intelligent scheduling layer on top of LiteLLM."""
+# NOTE(review): this package reports 0.1.0 although it ships inside the
+# maxllm-gate 0.2.0 wheel - confirm whether the two versions are meant to differ.
+__version__ = "0.1.0"
+from llm_scheduler.config import settings
+from llm_scheduler.core.scheduler import Scheduler
+# Names re-exported as the package's public API.
+__all__ = ["settings", "Scheduler", "__version__"]

llm_scheduler/api/__init__.py ADDED Viewed

@@ -0,0 +1,6 @@
+"""API module initialization."""
+# Re-export the router and the chat request/response schemas at package level.
+from llm_scheduler.api.routes import router
+from llm_scheduler.api.schemas import ChatRequest, ChatResponse
+__all__ = ["router", "ChatRequest", "ChatResponse"]

llm_scheduler/api/dependencies.py ADDED Viewed

@@ -0,0 +1,10 @@
+"""FastAPI dependencies."""
+from fastapi import Request
+from llm_scheduler.core.scheduler import Scheduler
+def get_scheduler(request: Request) -> Scheduler:
+    """Return the scheduler stored on the application's state object."""
+    # The instance is looked up on the app that owns the incoming request.
+    app_state = request.app.state
+    return app_state.scheduler

llm_scheduler/api/routes.py ADDED Viewed

@@ -0,0 +1,275 @@
+"""FastAPI route definitions."""
+from fastapi import APIRouter, Depends, HTTPException
+from fastapi.responses import StreamingResponse
+from prometheus_client import generate_latest, CONTENT_TYPE_LATEST
+from llm_scheduler.api.schemas import (
+    ChatRequest,
+    ChatResponse,
+    BatchRequest,
+    BatchResponse,
+    HealthResponse,
+    StatusResponse,
+    CapacityResponse,
+    KeyStatus,
+)
+from llm_scheduler.api.dependencies import get_scheduler
+from llm_scheduler.core.scheduler import Scheduler, SchedulerError
+from llm_scheduler.observability.logging import get_logger
+# Router that collects every endpoint declared in this module.
+router = APIRouter()
+# Module-level logger shared by the route handlers below.
+logger = get_logger()
+@router.post("/chat", response_model=ChatResponse, tags=["LLM"])
+async def chat(
+    request: ChatRequest,
+    scheduler: Scheduler = Depends(get_scheduler),
+) -> ChatResponse:
+    """
+    Send a chat completion request.
+    The request is queued and scheduled based on:
+    - Priority (high > medium > low)
+    - Available API key capacity
+    - Rate limits (TPM/RPM)
+    If all keys are at capacity, the request is deferred until
+    capacity becomes available.
+    """
+    # Raises HTTPException 503 for SchedulerError and 500 for any other failure.
+    try:
+        messages = [m.model_dump() for m in request.messages]
+        result = await scheduler.schedule(
+            model=request.model,
+            messages=messages,
+            priority=request.priority,
+            max_tokens=request.max_tokens,
+            temperature=request.temperature,
+        )
+        # Resolve the first choice once so that a missing *or empty* "choices"
+        # list cannot raise IndexError when finish_reason is read below.
+        choices = result.get("choices") or []
+        first_choice = choices[0] if choices else {}
+        # Extract content: choice message -> choice text -> top-level content.
+        content = ""
+        if choices:
+            message = first_choice.get("message")
+            if message is not None:
+                content = message.get("content")
+            elif "text" in first_choice:
+                content = first_choice["text"]
+        elif "content" in result:
+            content = result["content"]
+        return ChatResponse(
+            id=result.get("id", ""),
+            model=result.get("model", request.model),
+            # ChatResponse.content is a required str, so a null content
+            # is normalised to the empty string.
+            content=content or "",
+            usage=result.get("usage"),
+            finish_reason=first_choice.get("finish_reason"),
+        )
+    except SchedulerError as e:
+        logger.warning("Scheduler error", error=str(e))
+        # Chain the cause so the original traceback stays attached.
+        raise HTTPException(status_code=503, detail=str(e)) from e
+    except Exception as e:
+        logger.error("Chat request failed", error=str(e))
+        raise HTTPException(status_code=500, detail=str(e)) from e
+@router.post("/chat/stream", tags=["LLM"])
+async def chat_stream(
+    request: ChatRequest,
+    scheduler: Scheduler = Depends(get_scheduler),
+):
+    """
+    Send a streaming chat completion request.
+    Returns a Server-Sent Events (SSE) stream of response chunks.
+    """
+    def sse_event(payload: str) -> str:
+        # Every line of an SSE payload needs its own "data:" field; a bare
+        # newline inside the payload would otherwise cut the event short.
+        # Single-line payloads still produce exactly "data: <payload>\n\n".
+        return "".join(f"data: {line}\n" for line in payload.split("\n")) + "\n"
+    async def generate():
+        try:
+            messages = [m.model_dump() for m in request.messages]
+            # For streaming, we need direct access to dispatcher
+            # This is a simplified version: the full result is awaited first
+            # and then replayed in fixed-size chunks.
+            result = await scheduler.schedule(
+                model=request.model,
+                messages=messages,
+                priority=request.priority,
+                max_tokens=request.max_tokens,
+                temperature=request.temperature,
+            )
+            # Prefer the top-level "content" key; when it is absent or empty,
+            # fall back to the first choice (message content, then text) so
+            # chat-completion-shaped results do not stream an empty body.
+            content = result.get("content")
+            if not content:
+                choices = result.get("choices") or []
+                first_choice = choices[0] if choices else {}
+                message = first_choice.get("message")
+                if message is not None:
+                    content = message.get("content")
+                elif "text" in first_choice:
+                    content = first_choice["text"]
+            content = content or ""
+            # Simulate streaming for collected content (10 characters per event)
+            for start in range(0, len(content), 10):
+                yield sse_event(content[start:start + 10])
+            yield sse_event("[DONE]")
+        except Exception as e:
+            # The response has already started, so errors are reported in-band.
+            yield sse_event(f"[ERROR] {str(e)}")
+    return StreamingResponse(
+        generate(),
+        media_type="text/event-stream",
+    )
+@router.post("/batch", response_model=BatchResponse, tags=["LLM"])
+async def batch(
+    request: BatchRequest,
+    scheduler: Scheduler = Depends(get_scheduler),
+) -> BatchResponse:
+    """
+    Process multiple chat requests in parallel.
+    Returns results for all requests, with errors inline.
+    """
+    results = []
+    successful = 0
+    failed = 0
+    # Flatten each ChatRequest into the plain dict handed to the scheduler.
+    batch_requests = [
+        {
+            "model": req.model,
+            "messages": [m.model_dump() for m in req.messages],
+            "priority": req.priority,
+            "max_tokens": req.max_tokens,
+            "temperature": req.temperature,
+        }
+        for req in request.requests
+    ]
+    raw_results = await scheduler.schedule_batch(batch_requests)
+    for raw in raw_results:
+        # Failures come back as exception instances and are reported inline.
+        if isinstance(raw, Exception):
+            failed += 1
+            results.append({"error": str(raw)})
+            continue
+        successful += 1
+        # Resolve the first choice once so that a missing *or empty* "choices"
+        # list cannot raise IndexError when finish_reason is read.
+        choices = raw.get("choices") or []
+        first_choice = choices[0] if choices else {}
+        # ChatResponse.content is a required str; normalise null to "".
+        content = (first_choice.get("message") or {}).get("content") or ""
+        results.append(ChatResponse(
+            id=raw.get("id", ""),
+            model=raw.get("model", ""),
+            content=content,
+            usage=raw.get("usage"),
+            finish_reason=first_choice.get("finish_reason"),
+        ))
+    return BatchResponse(
+        results=results,
+        total=len(request.requests),
+        successful=successful,
+        failed=failed,
+    )
+@router.get("/health", response_model=HealthResponse, tags=["System"])
+async def health(
+    scheduler: Scheduler = Depends(get_scheduler),
+) -> HealthResponse:
+    """
+    Health check endpoint.
+    Returns overall system health status.
+    """
+    snapshot = scheduler.get_status()
+    queue_stats = snapshot["queue"]
+    queue_size = queue_stats["queue_size"]
+    # Tally the keys whose "is_healthy" flag is truthy.
+    healthy_keys = 0
+    for key_info in snapshot["keys"].get("keys", {}).values():
+        if key_info.get("is_healthy", False):
+            healthy_keys += 1
+    # A stopped scheduler or zero healthy keys is unhealthy; a queue filled
+    # beyond 90% of its maximum size is degraded; anything else is healthy.
+    is_running = snapshot["running"]
+    if not is_running or healthy_keys == 0:
+        overall = "unhealthy"
+    elif queue_size > queue_stats["max_size"] * 0.9:
+        overall = "degraded"
+    else:
+        overall = "healthy"
+    return HealthResponse(
+        status=overall,
+        scheduler_running=snapshot["running"],
+        queue_size=queue_size,
+        keys_available=healthy_keys,
+    )
+@router.get("/status", response_model=StatusResponse, tags=["System"])
+async def status(
+    scheduler: Scheduler = Depends(get_scheduler),
+) -> StatusResponse:
+    """
+    Get detailed scheduler status.
+    Returns queue statistics, key states, and configuration.
+    """
+    # The scheduler's own status snapshot is returned unchanged.
+    snapshot = scheduler.get_status()
+    return snapshot
+@router.get("/capacity", response_model=CapacityResponse, tags=["System"])
+async def capacity(
+    scheduler: Scheduler = Depends(get_scheduler),
+) -> CapacityResponse:
+    """
+    Get current capacity across all API keys.
+    Useful for monitoring and understanding rate limit state.
+    """
+    keys_section = scheduler.get_status()["keys"]
+    # Aggregate figures fall back to empty mappings, i.e. zeros below.
+    total = keys_section.get("total_capacity", {})
+    available = keys_section.get("available_capacity", {})
+    # One KeyStatus per entry of the per-key mapping.
+    key_statuses = []
+    for key_info in keys_section.get("keys", {}).values():
+        key_statuses.append(KeyStatus(**key_info))
+    return CapacityResponse(
+        total_tpm=total.get("tpm", 0),
+        available_tpm=available.get("tpm", 0),
+        total_rpm=total.get("rpm", 0),
+        available_rpm=available.get("rpm", 0),
+        keys=key_statuses,
+    )
+@router.get("/metrics", tags=["System"])
+async def metrics():
+    """
+    Prometheus metrics endpoint.
+    Returns metrics in Prometheus exposition format.
+    """
+    from starlette.responses import Response
+    # Render the metrics first, then wrap them with the matching content type.
+    payload = generate_latest()
+    return Response(content=payload, media_type=CONTENT_TYPE_LATEST)
+@router.get("/", tags=["System"])
+async def root():
+    """Root endpoint with API information."""
+    # Imported at call time so the reported version always follows the
+    # package's own __version__ instead of a second hard-coded copy.
+    from llm_scheduler import __version__
+    return {
+        "name": "LLM Rate Limit Scheduler",
+        "version": __version__,
+        "description": (
+            "An intelligent scheduling and rate-limit-aware control layer "
+            "on top of LiteLLM that maximizes throughput and prevents 429 errors."
+        ),
+        "docs_url": "/docs",
+        "health_url": "/health",
+        "metrics_url": "/metrics",
+    }

llm_scheduler/api/schemas.py ADDED Viewed

@@ -0,0 +1,135 @@
+"""Pydantic schemas for API requests and responses."""
+from typing import Any, Literal
+from pydantic import BaseModel, Field
+class Message(BaseModel):
+    """Chat message."""
+    # Restricted to these three literal roles.
+    role: Literal["system", "user", "assistant"] = Field(
+        description="Role of the message sender"
+    )
+    # Required; no default is declared.
+    content: str = Field(description="Message content")
+class ChatRequest(BaseModel):
+    """Request body for /chat endpoint."""
+    # Required; no default is declared.
+    model: str = Field(
+        description="Model name (e.g., 'mixtral', 'gpt-4o-mini', 'llama-3.1-70b')"
+    )
+    # Required; no minimum length is declared, so an empty list is accepted.
+    messages: list[Message] = Field(
+        description="List of chat messages"
+    )
+    # Defaults to "medium" when omitted.
+    priority: Literal["high", "medium", "low"] = Field(
+        default="medium",
+        description="Request priority for queue ordering"
+    )
+    # Optional; no lower bound is declared on the value.
+    max_tokens: int | None = Field(
+        default=None,
+        description="Maximum tokens in response"
+    )
+    # Constrained to the inclusive range 0.0-2.0; defaults to 0.7.
+    temperature: float = Field(
+        default=0.7,
+        ge=0.0,
+        le=2.0,
+        description="Sampling temperature"
+    )
+    # Example payload attached to the generated JSON schema.
+    model_config = {
+        "json_schema_extra": {
+            "examples": [
+                {
+                    "model": "mixtral-8x7b-32768",
+                    "messages": [
+                        {"role": "system", "content": "You are a helpful assistant."},
+                        {"role": "user", "content": "Hello, how are you?"}
+                    ],
+                    "priority": "medium",
+                    "max_tokens": 1024,
+                    "temperature": 0.7
+                }
+            ]
+        }
+    }
+class ChatResponse(BaseModel):
+    """Response body for /chat endpoint."""
+    id: str = Field(description="Response ID")
+    model: str = Field(description="Model used")
+    # Required str: callers must supply a string, not None.
+    content: str = Field(description="Generated content")
+    # NOTE(review): values are typed as int; a provider usage payload with
+    # nested objects would fail validation - confirm against the dispatcher.
+    usage: dict[str, int] | None = Field(
+        default=None,
+        description="Token usage statistics"
+    )
+    finish_reason: str | None = Field(
+        default=None,
+        description="Reason for completion"
+    )
+class BatchRequest(BaseModel):
+    """Request body for /batch endpoint."""
+    # Required; no upper bound on the number of requests is declared here.
+    requests: list[ChatRequest] = Field(
+        description="List of chat requests to process"
+    )
+class BatchResponse(BaseModel):
+    """Response body for /batch endpoint."""
+    # Each entry is either a ChatResponse or a plain str-to-str dict
+    # (the /batch handler uses {"error": ...} for failed requests).
+    results: list[ChatResponse | dict[str, str]] = Field(
+        description="Results for each request (response or error)"
+    )
+    total: int = Field(description="Total requests")
+    successful: int = Field(description="Successful requests")
+    failed: int = Field(description="Failed requests")
+class HealthResponse(BaseModel):
+    """Response body for /health endpoint."""
+    # One of the three literal states; any other value fails validation.
+    status: Literal["healthy", "degraded", "unhealthy"] = Field(
+        description="Overall health status"
+    )
+    scheduler_running: bool = Field(description="Whether scheduler is running")
+    queue_size: int = Field(description="Current queue size")
+    keys_available: int = Field(description="Number of healthy API keys")
+class StatusResponse(BaseModel):
+    """Response body for /status endpoint."""
+    # The /status handler returns scheduler.get_status() directly, so these
+    # fields describe the top-level keys expected in that mapping.
+    running: bool
+    # Free-form mappings; their inner structure is not validated here.
+    queue: dict[str, Any]
+    keys: dict[str, Any]
+    strategy: str
+class KeyStatus(BaseModel):
+    """Status of a single API key."""
+    # The /capacity handler builds instances with KeyStatus(**key_info), so
+    # every field below must be present in the scheduler's per-key dict.
+    key_id: str
+    provider: str
+    tpm_available: int
+    tpm_capacity: int
+    rpm_available: int
+    rpm_capacity: int
+    # NOTE(review): scale (fraction vs. percentage) is not visible here - confirm.
+    utilization: float
+    total_requests: int
+    total_tokens_used: int
+    is_healthy: bool
+class CapacityResponse(BaseModel):
+    """Response body for /capacity endpoint."""
+    # Aggregate figures; the /capacity handler fills in 0 when the scheduler
+    # status does not provide a value.
+    total_tpm: int
+    available_tpm: int
+    total_rpm: int
+    available_rpm: int
+    # One entry per API key.
+    keys: list[KeyStatus]

llm_scheduler/config.py ADDED Viewed

@@ -0,0 +1,117 @@
+"""Configuration management for LLM Rate Limit Scheduler."""
+import json
+from pydantic import Field, field_validator
+from pydantic_settings import BaseSettings, SettingsConfigDict
+class APIKeyConfig:
+    """Plain value holder describing one API key and its rate limits."""
+    def __init__(
+        self,
+        key_id: str,
+        api_key: str,
+        provider: str,
+        models: list[str],
+        tpm_limit: int,
+        rpm_limit: int,
+    ):
+        # Identity and credential of the key.
+        self.key_id, self.api_key, self.provider = key_id, api_key, provider
+        # Models this key is listed for.
+        self.models = models
+        # Tokens-per-minute and requests-per-minute limits.
+        self.tpm_limit, self.rpm_limit = tpm_limit, rpm_limit
+    def __repr__(self) -> str:
+        # Only key_id and provider are shown; the api_key value is not included.
+        return "APIKeyConfig(key_id={}, provider={})".format(
+            self.key_id, self.provider
+        )
+class Settings(BaseSettings):
+    """Application settings loaded from environment variables."""
+    # Values are also read from a UTF-8 ".env" file; unknown entries are ignored.
+    model_config = SettingsConfigDict(
+        env_file=".env",
+        env_file_encoding="utf-8",
+        extra="ignore",
+    )
+    # Server
+    host: str = Field(default="0.0.0.0")
+    port: int = Field(default=8000)
+    debug: bool = Field(default=False)
+    log_level: str = Field(default="INFO")
+    # API Keys (JSON string mapping key id -> per-key settings object)
+    api_keys_config: str = Field(default="{}")
+    # Scheduling
+    default_strategy: str = Field(default="least_utilized")
+    # Token estimation
+    default_max_tokens: int = Field(default=1024)
+    token_estimation_buffer: float = Field(default=1.1)
+    # Retry
+    max_retries: int = Field(default=3)
+    retry_base_delay: float = Field(default=1.0)
+    retry_max_delay: float = Field(default=60.0)
+    # Queue
+    max_queue_size: int = Field(default=10000)
+    default_priority: str = Field(default="medium")
+    # Redis (optional)
+    redis_url: str | None = Field(default=None)
+    use_redis_queue: bool = Field(default=False)
+    @field_validator("default_strategy")
+    @classmethod
+    def validate_strategy(cls, v: str) -> str:
+        """Reject any strategy name outside the supported set."""
+        valid = {"least_utilized", "round_robin", "token_aware"}
+        if v not in valid:
+            raise ValueError(f"Strategy must be one of {valid}")
+        return v
+    @field_validator("default_priority")
+    @classmethod
+    def validate_priority(cls, v: str) -> str:
+        """Reject any priority outside high/medium/low."""
+        valid = {"high", "medium", "low"}
+        if v not in valid:
+            raise ValueError(f"Priority must be one of {valid}")
+        return v
+    def get_api_keys(self) -> dict[str, APIKeyConfig]:
+        """Parse API keys configuration into typed objects.
+        Returns a dict keyed by key id. Invalid JSON, a top-level value that
+        is not a JSON object, and individual entries that are not objects are
+        logged as warnings and skipped instead of raising, so a bad
+        configuration yields fewer (or no) keys rather than a crash.
+        """
+        # Function-scope import: this module does not import logging itself.
+        import logging
+        log = logging.getLogger(__name__)
+        try:
+            raw_config = json.loads(self.api_keys_config)
+        except json.JSONDecodeError as exc:
+            # Still best-effort, but no longer silent about the misconfiguration.
+            log.warning("api_keys_config is not valid JSON; no API keys loaded: %s", exc)
+            return {}
+        if not isinstance(raw_config, dict):
+            # e.g. "[]" or "null" would otherwise fail on .items() below.
+            log.warning(
+                "api_keys_config must be a JSON object, got %s; no API keys loaded",
+                type(raw_config).__name__,
+            )
+            return {}
+        result = {}
+        for key_id, config in raw_config.items():
+            if not isinstance(config, dict):
+                log.warning("Skipping API key %r: entry is not a JSON object", key_id)
+                continue
+            # Missing per-key fields fall back to the defaults given here.
+            result[key_id] = APIKeyConfig(
+                key_id=key_id,
+                api_key=config.get("api_key", ""),
+                provider=config.get("provider", ""),
+                models=config.get("models", []),
+                tpm_limit=config.get("tpm_limit", 10000),
+                rpm_limit=config.get("rpm_limit", 60),
+            )
+        return result
+    def get_keys_for_model(self, model: str) -> list[APIKeyConfig]:
+        """Get all API keys that support a given model."""
+        # Re-parses api_keys_config on every call.
+        keys = self.get_api_keys()
+        return [k for k in keys.values() if model in k.models]
+    def get_keys_for_provider(self, provider: str) -> list[APIKeyConfig]:
+        """Get all API keys for a given provider."""
+        # Re-parses api_keys_config on every call.
+        keys = self.get_api_keys()
+        return [k for k in keys.values() if k.provider == provider]
+# Global settings instance, created once when this module is imported; values
+# come from the environment and the ".env" file named in Settings.model_config.
+settings = Settings()

llm_scheduler/core/__init__.py ADDED Viewed

@@ -0,0 +1,8 @@
+"""Core module initialization."""
+# Re-export the core building blocks at package level.
+from llm_scheduler.core.scheduler import Scheduler
+from llm_scheduler.core.queue_manager import QueueManager, QueuedRequest
+from llm_scheduler.core.token_estimator import TokenEstimator
+from llm_scheduler.core.dispatcher import Dispatcher
+__all__ = ["Scheduler", "QueueManager", "QueuedRequest", "TokenEstimator", "Dispatcher"]