vllm_judge-0.1.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
vllm_judge/cli.py ADDED
@@ -0,0 +1,287 @@
+ """
+ Command-line interface for vLLM Judge.
+ """
+ import asyncio
+ import json
+ import sys
+ from typing import Optional
+ import click
+
+ from vllm_judge import Judge
+ from vllm_judge.api.server import start_server as start_api_server
+ from vllm_judge.api.client import JudgeClient
+ from vllm_judge.metrics import BUILTIN_METRICS
+
+
+ @click.group()
+ def cli():
+     """vLLM Judge - LLM-as-a-Judge evaluation tool."""
+     pass
+
+
+ @cli.command()
+ @click.option('--base-url', required=True, help='vLLM server URL')
+ @click.option('--model', help='Model name/path (auto-detected if not provided)')
+ @click.option('--host', default='0.0.0.0', help='API server host')
+ @click.option('--port', default=8080, help='API server port')
+ @click.option('--reload', is_flag=True, help='Enable auto-reload for development')
+ @click.option('--max-concurrent', default=50, help='Maximum concurrent requests')
+ @click.option('--timeout', default=30.0, help='Request timeout in seconds')
+ def serve(base_url: str, model: str, host: str, port: int, reload: bool, max_concurrent: int, timeout: float):
+     """Start the Judge API server."""
+     click.echo("Starting vLLM Judge API server...")
+     click.echo(f"Base URL: {base_url}")
+     click.echo(f"Model: {model or 'auto-detect'}")
+     click.echo(f"Server: http://{host}:{port}")
+
+     start_api_server(
+         base_url=base_url,
+         model=model,
+         host=host,
+         port=port,
+         reload=reload,
+         max_concurrent=max_concurrent,
+         timeout=timeout
+     )
+
+
+ @cli.command()
+ @click.option('--api-url', help='Judge API URL (if using remote server)')
+ @click.option('--base-url', help='vLLM server URL (if using local)')
+ @click.option('--model', help='Model name (if using local)')
+ @click.option('--response', required=True, help='Text to evaluate')
+ @click.option('--criteria', help='Evaluation criteria')
+ @click.option('--metric', help='Pre-defined metric name')
+ @click.option('--scale', nargs=2, type=int, help='Numeric scale (min max)')
+ @click.option('--rubric', help='Evaluation rubric')
+ @click.option('--context', help='Additional context')
+ @click.option('--output', type=click.Choice(['json', 'text']), default='text', help='Output format')
+ def evaluate(
+     api_url: Optional[str],
+     base_url: Optional[str],
+     model: Optional[str],
+     response: str,
+     criteria: Optional[str],
+     metric: Optional[str],
+     scale: Optional[tuple],
+     rubric: Optional[str],
+     context: Optional[str],
+     output: str
+ ):
+     """Evaluate a single response."""
+     async def run_evaluation():
+         if api_url:
+             # Use API client
+             async with JudgeClient(api_url) as client:
+                 result = await client.evaluate(
+                     response=response,
+                     criteria=criteria,
+                     metric=metric,
+                     scale=scale,
+                     rubric=rubric,
+                     context=context
+                 )
+         else:
+             # Use local Judge
+             if not base_url:
+                 click.echo("Error: Either --api-url or --base-url is required", err=True)
+                 sys.exit(1)
+
+             judge = Judge.from_url(base_url, model=model)
+             async with judge:
+                 result = await judge.evaluate(
+                     response=response,
+                     criteria=criteria,
+                     metric=metric,
+                     scale=scale,
+                     rubric=rubric,
+                     context=context
+                 )
+
+         # Format output
+         if output == 'json':
+             click.echo(json.dumps(result.model_dump(), indent=2))
+         else:
+             click.echo(f"Decision: {result.decision}")
+             if result.score is not None:
+                 click.echo(f"Score: {result.score}")
+             click.echo(f"Reasoning: {result.reasoning}")
+
+     asyncio.run(run_evaluation())
+
+
+ @cli.command()
+ @click.option('--api-url', help='Judge API URL (if using remote server)')
+ @click.option('--base-url', help='vLLM server URL (if using local)')
+ @click.option('--model', help='Model name (if using local)')
+ @click.option('--response-a', required=True, help='First response')
+ @click.option('--response-b', required=True, help='Second response')
+ @click.option('--criteria', required=True, help='Comparison criteria')
+ @click.option('--output', type=click.Choice(['json', 'text']), default='text', help='Output format')
+ def compare(
+     api_url: Optional[str],
+     base_url: Optional[str],
+     model: Optional[str],
+     response_a: str,
+     response_b: str,
+     criteria: str,
+     output: str
+ ):
+     """Compare two responses."""
+     async def run_comparison():
+         if api_url:
+             async with JudgeClient(api_url) as client:
+                 result = await client.compare(
+                     response_a=response_a,
+                     response_b=response_b,
+                     criteria=criteria
+                 )
+         else:
+             if not base_url:
+                 click.echo("Error: Either --api-url or --base-url is required", err=True)
+                 sys.exit(1)
+
+             judge = Judge.from_url(base_url, model=model)
+             async with judge:
+                 result = await judge.compare(
+                     response_a=response_a,
+                     response_b=response_b,
+                     criteria=criteria
+                 )
+
+         if output == 'json':
+             click.echo(json.dumps(result.model_dump(), indent=2))
+         else:
+             click.echo(f"Winner: {result.decision}")
+             click.echo(f"Reasoning: {result.reasoning}")
+
+     asyncio.run(run_comparison())
+
+
+ @cli.command()
+ @click.option('--api-url', required=True, help='Judge API URL')
+ def health(api_url: str):
+     """Check API health status."""
+     async def check_health():
+         async with JudgeClient(api_url) as client:
+             try:
+                 health_data = await client.health_check()
+                 click.echo(json.dumps(health_data, indent=2))
+             except Exception as e:
+                 click.echo(f"Health check failed: {e}", err=True)
+                 sys.exit(1)
+
+     asyncio.run(check_health())
+
+
+ @cli.command()
+ @click.option('--api-url', help='Judge API URL (if using remote server)')
+ @click.option('--filter', help='Filter metrics by name')
+ def list_metrics(api_url: Optional[str], filter: Optional[str]):
+     """List available metrics."""
+     async def list_all_metrics():
+         if api_url:
+             async with JudgeClient(api_url) as client:
+                 metrics = await client.list_metrics()
+                 for metric in metrics:
+                     if filter and filter.lower() not in metric.name.lower():
+                         continue
+                     click.echo(f"\n{metric.name}:")
+                     click.echo(f" Criteria: {metric.criteria}")
+                     if metric.has_scale:
+                         click.echo(f" Scale: {metric.scale}")
+                     click.echo(f" Has rubric: {metric.has_rubric}")
+                     click.echo(f" Examples: {metric.example_count}")
+         else:
+             # List built-in metrics
+             for name, metric in BUILTIN_METRICS.items():
+                 if filter and filter.lower() not in name.lower():
+                     continue
+                 click.echo(f"\n{name}:")
+                 click.echo(f" Criteria: {metric.criteria}")
+                 if metric.scale:
+                     click.echo(f" Scale: {metric.scale}")
+                 click.echo(f" Has rubric: {'Yes' if metric.rubric else 'No'}")
+                 click.echo(f" Examples: {len(metric.examples)}")
+
+     asyncio.run(list_all_metrics())
+
+
+ @cli.command()
+ @click.option('--api-url', required=True, help='Judge API URL')
+ @click.option('--file', required=True, type=click.File('r'), help='JSON file with batch data')
+ @click.option('--async', 'use_async', is_flag=True, help='Use async batch processing')
+ @click.option('--max-concurrent', type=int, help='Maximum concurrent requests')
+ @click.option('--output', type=click.File('w'), help='Output file (default: stdout)')
+ def batch(api_url: str, file, use_async: bool, max_concurrent: Optional[int], output):
+     """Run batch evaluation from JSON file."""
+     # Load batch data
+     try:
+         data = json.load(file)
+         if not isinstance(data, list):
+             click.echo("Error: Batch file must contain a JSON array", err=True)
+             sys.exit(1)
+     except json.JSONDecodeError as e:
+         click.echo(f"Error parsing JSON: {e}", err=True)
+         sys.exit(1)
+
+     async def run_batch():
+         async with JudgeClient(api_url) as client:
+             if use_async:
+                 click.echo(f"Starting async batch evaluation of {len(data)} items...")
+                 result = await client.async_batch_evaluate(
+                     data=data,
+                     max_concurrent=max_concurrent
+                 )
+             else:
+                 click.echo(f"Running batch evaluation of {len(data)} items...")
+                 result = await client.batch_evaluate(
+                     data=data,
+                     max_concurrent=max_concurrent
+                 )
+
+         # Format results
+         output_data = {
+             "total": result.total,
+             "successful": result.successful,
+             "failed": result.failed,
+             "success_rate": result.success_rate,
+             "duration_seconds": result.duration_seconds,
+             "results": []
+         }
+
+         for r in result.results:
+             if isinstance(r, Exception):
+                 output_data["results"].append({"error": str(r)})
+             else:
+                 output_data["results"].append({
+                     "decision": r.decision,
+                     "reasoning": r.reasoning,
+                     "score": r.score,
+                     "metadata": r.metadata
+                 })
+
+         # Write output
+         output_file = output or sys.stdout
+         json.dump(output_data, output_file, indent=2)
+         if output:
+             click.echo(f"Results written to {output.name}")
+
+         # Summary
+ click.echo(f"\nSummary:")
273
+ click.echo(f" Total: {result.total}")
274
+ click.echo(f" Successful: {result.successful}")
275
+ click.echo(f" Failed: {result.failed}")
276
+ click.echo(f" Success rate: {result.success_rate:.1%}")
277
+ click.echo(f" Duration: {result.duration_seconds:.1f}s")
278
+
279
+ asyncio.run(run_batch())
280
+
281
+
282
+ def main():
283
+ """Main entry point."""
284
+ cli()
285
+
286
+
287
+ if __name__ == '__main__':
288
+ main()
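
A quick way to smoke-test these commands without a live vLLM server is click's bundled test runner. The sketch below is illustrative rather than part of the package; it relies only on what the diff shows, namely that click derives the command name list-metrics from list_metrics, and that the command falls back to BUILTIN_METRICS when --api-url is omitted.

from click.testing import CliRunner

from vllm_judge.cli import cli


def test_list_metrics_offline():
    # list-metrics needs no server: without --api-url it iterates
    # BUILTIN_METRICS locally (assumed non-empty in the package).
    runner = CliRunner()
    result = runner.invoke(cli, ["list-metrics"])
    assert result.exit_code == 0
    assert "Criteria:" in result.output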
vllm_judge/client.py ADDED
@@ -0,0 +1,259 @@
+ from typing import List, Dict, Any
+ import httpx
+ from tenacity import (
+     retry,
+     stop_after_attempt,
+     wait_exponential,
+     retry_if_exception_type,
+ )
+
+ from vllm_judge.models import JudgeConfig
+ from vllm_judge.exceptions import (
+     ConnectionError,
+     TimeoutError,
+     ParseError,
+     RetryExhaustedError
+ )
+
+ CHAT_COMPLETIONS_ENDPOINT = "/v1/chat/completions"
+ COMPLETIONS_ENDPOINT = "/v1/completions"
+ MODELS_ENDPOINT = "/v1/models"
+
+ class VLLMClient:
+     """Async client for vLLM endpoints."""
+
+     def __init__(self, config: JudgeConfig):
+         """
+         Initialize vLLM client.
+
+         Args:
+             config: Judge configuration
+         """
+
+         if not config.model:
+             config.model = detect_model_sync(config.base_url)
+         self.config = config
+         self.session = httpx.AsyncClient(
+             base_url=config.base_url,
+             timeout=httpx.Timeout(config.timeout),
+             limits=httpx.Limits(
+                 max_connections=100,
+                 max_keepalive_connections=20
+             ),
+             headers={
+                 "Authorization": f"Bearer {config.api_key}",
+                 "Content-Type": "application/json"
+             }
+         )
+
+     async def __aenter__(self):
+         """Async context manager entry."""
+         return self
+
+     async def __aexit__(self, exc_type, exc_val, exc_tb):
+         """Async context manager exit."""
+         await self.close()
+
+     async def close(self):
+         """Close the HTTP session."""
+         await self.session.aclose()
+
+     def _log_retry(self, retry_state):
+         """Log retry attempts."""
+         attempt = retry_state.attempt_number
+         if attempt > 1:
+             print(f"Retry attempt {attempt} after error: {retry_state.outcome.exception()}")
+
+     @retry(
+         stop=stop_after_attempt(3),
+         wait=wait_exponential(multiplier=1, min=1, max=10),
+         retry=retry_if_exception_type((httpx.HTTPError, ConnectionError, TimeoutError))
+     )
+     async def _request_with_retry(self, endpoint: str, method: str = "POST", **kwargs) -> Dict[str, Any]:
+         """
+         Make HTTP request with retry logic.
+
+         Args:
+             endpoint: API endpoint
+             method: HTTP method (defaults to POST)
+             **kwargs: Request parameters
+
+         Returns:
+             Parsed JSON response
+
+         Raises:
+             ConnectionError: If unable to connect
+             TimeoutError: If request times out
+             RetryExhaustedError: If all retries fail
+         """
+         try:
+             response = await self.session.request(method, endpoint, **kwargs)
+             response.raise_for_status()
+             return response.json()
+         except httpx.ConnectError as e:
+             raise ConnectionError(f"Failed to connect to {self.config.base_url}: {e}")
+         except httpx.TimeoutException as e:
+             raise TimeoutError(f"Request timed out after {self.config.timeout}s: {e}")
+         except httpx.HTTPStatusError as e:
+             # Parse error message from response if available
+             try:
+                 error_detail = e.response.json().get('detail', str(e))
+             except Exception:
+                 error_detail = str(e)
+             raise ConnectionError(f"HTTP {e.response.status_code}: {error_detail}")
+         except Exception as e:
+             raise ConnectionError(f"Unexpected error: {e}")
+
+     async def chat_completion(self, messages: List[Dict[str, str]]) -> str:
+         """
+         Use chat completions endpoint (handles templates automatically).
+
+         Args:
+             messages: List of chat messages
+
+         Returns:
+             Model response content
+
+         Raises:
+             ConnectionError: If request fails
+             ParseError: If response parsing fails
+         """
+         request_data = {
+             "model": self.config.model,
+             "messages": messages,
+             "temperature": self.config.temperature,
+             "max_tokens": self.config.max_tokens,
+             # "top_p": self.config.top_p,
+         }
+
+         # # Request JSON response format if supported
+         # if self.config.temperature < 0.2: # Only for low temperature
+         #     request_data["response_format"] = {"type": "json_object"}
+
+         try:
+             response = await self._request_with_retry(
+                 CHAT_COMPLETIONS_ENDPOINT,
+                 json=request_data
+             )
+
+             # Extract content from response
+             if "choices" not in response or not response["choices"]:
+                 raise ParseError("Invalid response format: missing choices")
+
+             content = response["choices"][0]["message"]["content"]
+             return content
+
+         except RetryExhaustedError:
+             raise
+         except Exception as e:
+             if isinstance(e, (ConnectionError, TimeoutError, ParseError)):
+                 raise
+             raise ConnectionError(f"Chat completion failed: {e}")
+
+     async def completion(self, prompt: str) -> str:
+         """
+         Use completions endpoint for edge cases.
+
+         Args:
+             prompt: Text prompt
+
+         Returns:
+             Model response text
+
+         Raises:
+             ConnectionError: If request fails
+             ParseError: If response parsing fails
+         """
+         request_data = {
+             "model": self.config.model,
+             "prompt": prompt,
+             "temperature": self.config.temperature,
+             "max_tokens": self.config.max_tokens,
+             # "top_p": self.config.top_p,
+         }
+
+         try:
+             response = await self._request_with_retry(
+                 COMPLETIONS_ENDPOINT,
+                 json=request_data
+             )
+
+             # Extract text from response
+             if "choices" not in response or not response["choices"]:
+                 raise ParseError("Invalid response format: missing choices")
+
+             text = response["choices"][0]["text"]
+             return text
+
+         except RetryExhaustedError:
+             raise
+         except Exception as e:
+             if isinstance(e, (ConnectionError, TimeoutError, ParseError)):
+                 raise
+             raise ConnectionError(f"Completion failed: {e}")
+
+     async def list_models(self) -> List[str]:
+         """
+         List available models.
+
+         Returns:
+             List of model names
+
+         Raises:
+             ConnectionError: If request fails
+         """
+         try:
+             response = await self._request_with_retry(MODELS_ENDPOINT, method="GET")
+             models = response.get("data", [])
+             return [model["id"] for model in models]
+         except Exception as e:
+             if isinstance(e, ConnectionError):
+                 raise
+             raise ConnectionError(f"Failed to list models: {e}")
+
+     async def detect_model(self) -> str:
+         """
+         Auto-detect the first available model.
+
+         Returns:
+             Model name
+
+         Raises:
+             ConnectionError: If no models found
+         """
+         models = await self.list_models()
+         if not models:
+             raise ConnectionError("No models available on vLLM server")
+         return models[0]
+
+
+ def detect_model_sync(base_url: str, timeout: float = 30.0) -> str:
+     """
+     Synchronously detect the first available model.
+
+     Args:
+         base_url: vLLM server URL
+         timeout: Request timeout
+
+     Returns:
+         Model name
+
+     Raises:
+         ConnectionError: If no models found
+     """
+     url = f"{base_url}{MODELS_ENDPOINT}"
+     try:
+         with httpx.Client(timeout=timeout) as client:
+             response = client.get(url)
+             response.raise_for_status()
+             data = response.json().get("data", [])
+             models = [model["id"] for model in data]
+
+             if not models:
+                 raise ConnectionError("No models available on vLLM server")
+
+             model = models[0]
+             return model
+
+     except httpx.HTTPError as e:
+         raise ConnectionError(f"Failed to detect model: {e}")
vllm_judge/exceptions.py ADDED
@@ -0,0 +1,45 @@
+ from typing import Optional
+
+
+ class VLLMJudgeError(Exception):
+     """Base exception for all vLLM Judge errors."""
+     pass
+
+
+ class ConfigurationError(VLLMJudgeError):
+     """Raised when configuration is invalid."""
+     pass
+
+
+ class ConnectionError(VLLMJudgeError):
+     """Raised when unable to connect to vLLM server."""
+     pass
+
+
+ class TimeoutError(VLLMJudgeError):
+     """Raised when request times out."""
+     pass
+
+
+ class ParseError(VLLMJudgeError):
+     """Raised when unable to parse LLM response."""
+     def __init__(self, message: str, raw_response: Optional[str] = None):
+         super().__init__(message)
+         self.raw_response = raw_response
+
+
+ class MetricNotFoundError(VLLMJudgeError):
+     """Raised when requested metric is not found."""
+     pass
+
+
+ class InvalidInputError(VLLMJudgeError):
+     """Raised when input parameters are invalid."""
+     pass
+
+
+ class RetryExhaustedError(VLLMJudgeError):
+     """Raised when all retry attempts are exhausted."""
+     def __init__(self, message: str, last_error: Optional[Exception] = None):
+         super().__init__(message)
+         self.last_error = last_error
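
Because every class above derives from VLLMJudgeError, callers can handle failures at whatever granularity they need; note that ConnectionError and TimeoutError shadow the Python builtins of the same name, so aliasing them on import avoids confusion. An illustrative pattern, not from the package itself:

from vllm_judge.exceptions import (
    ConnectionError as JudgeConnectionError,
    ParseError,
    VLLMJudgeError,
)


async def safe_chat(client, messages):
    try:
        return await client.chat_completion(messages)
    except ParseError as e:
        # ParseError carries the raw model output for debugging
        print(f"Unparseable response: {e}; raw={e.raw_response!r}")
    except JudgeConnectionError as e:
        print(f"Server unreachable: {e}")
    except VLLMJudgeError as e:
        # Catch-all for any other library-defined failure
        print(f"Judge error: {e}")
    return None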