speedy-utils 1.0.9__py3-none-any.whl → 1.0.12__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- llm_utils/__init__.py +4 -1
- llm_utils/lm/__init__.py +2 -1
- llm_utils/lm/alm.py +447 -0
- llm_utils/lm/lm.py +282 -28
- llm_utils/scripts/vllm_load_balancer.py +7 -6
- llm_utils/scripts/vllm_serve.py +66 -136
- {speedy_utils-1.0.9.dist-info → speedy_utils-1.0.12.dist-info}/METADATA +1 -1
- {speedy_utils-1.0.9.dist-info → speedy_utils-1.0.12.dist-info}/RECORD +10 -9
- {speedy_utils-1.0.9.dist-info → speedy_utils-1.0.12.dist-info}/WHEEL +0 -0
- {speedy_utils-1.0.9.dist-info → speedy_utils-1.0.12.dist-info}/entry_points.txt +0 -0
llm_utils/__init__.py
CHANGED

@@ -9,7 +9,8 @@ from .chat_format import (
     format_msgs,
     display_chat_messages_as_html,
 )
-from .lm import LM
+from .lm.lm import LM, LMReasoner
+from .lm.alm import AsyncLM
 from .group_messages import (
     split_indices_by_length,
     group_messages_by_len,
@@ -27,5 +28,7 @@ __all__ = [
     "split_indices_by_length",
     "group_messages_by_len",
     "LM",
+    "LMReasoner",
+    "AsyncLM",
     "display_chat_messages_as_html",
 ]
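
The re-export change above means all three wrappers are now importable from the package root. A minimal sketch of the updated import surface, assuming an OpenAI-compatible endpoint is configured; the model name below is a placeholder, not something this diff prescribes:

    # Hypothetical usage of the 1.0.12 exports; "gpt-4o-mini" is a placeholder model name.
    from llm_utils import LM, LMReasoner, AsyncLM

    lm = LM(model="gpt-4o-mini")            # synchronous wrapper, same entry point as 1.0.9
    print(lm(prompt="Say hi in one word"))  # plain-text mode returns a str
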
llm_utils/lm/__init__.py
CHANGED
llm_utils/lm/alm.py
ADDED

@@ -0,0 +1,447 @@
+from __future__ import annotations
+
+"""An **asynchronous** drop‑in replacement for the original `LM` class.
+
+Usage example (Python ≥3.8):
+
+    from async_lm import AsyncLM
+    import asyncio
+
+    async def main():
+        lm = AsyncLM(model="gpt-4o-mini")
+        reply: str = await lm(prompt="Hello, world!")
+        print(reply)
+
+    asyncio.run(main())
+"""
+
+import asyncio
+import base64
+import hashlib
+import json
+import os
+from typing import (
+    Any,
+    Dict,
+    List,
+    Optional,
+    Sequence,
+    Type,
+    TypeVar,
+    Union,
+    overload,
+    cast,
+)
+
+from httpx import URL
+from openai import AsyncOpenAI, AuthenticationError, RateLimitError
+
+# from openai.pagination import AsyncSyncPage
+from openai.types.chat import (
+    ChatCompletionAssistantMessageParam,
+    ChatCompletionMessageParam,
+    ChatCompletionSystemMessageParam,
+    ChatCompletionToolMessageParam,
+    ChatCompletionUserMessageParam,
+)
+from openai.types.chat.parsed_chat_completion import ParsedChatCompletion
+from openai.types.model import Model
+from pydantic import BaseModel
+from loguru import logger
+from openai.pagination import AsyncPage as AsyncSyncPage
+
+# --------------------------------------------------------------------------- #
+# type helpers
+# --------------------------------------------------------------------------- #
+TModel = TypeVar("TModel", bound=BaseModel)
+Messages = List[ChatCompletionMessageParam]
+LegacyMsgs = List[Dict[str, str]]
+RawMsgs = Union[Messages, LegacyMsgs]
+
+# --------------------------------------------------------------------------- #
+# color helpers (unchanged)
+# --------------------------------------------------------------------------- #
+
+
+def _color(code: int, text: str) -> str:
+    return f"\x1b[{code}m{text}\x1b[0m"
+
+
+_red = lambda t: _color(31, t)
+_green = lambda t: _color(32, t)
+_blue = lambda t: _color(34, t)
+_yellow = lambda t: _color(33, t)
+
+
+class AsyncLM:
+    """Unified **async** language‑model wrapper with optional JSON parsing."""
+
+    def __init__(
+        self,
+        model: str | None = None,
+        *,
+        temperature: float = 0.0,
+        max_tokens: int = 2_000,
+        host: str = "localhost",
+        port: Optional[int | str] = None,
+        base_url: Optional[str] = None,
+        api_key: Optional[str] = None,
+        cache: bool = True,
+        ports: Optional[List[int]] = None,
+        **openai_kwargs: Any,
+    ) -> None:
+        self.model = model
+        self.temperature = temperature
+        self.max_tokens = max_tokens
+        self.port = port
+        self.host = host
+        self.base_url = base_url or (f"http://{host}:{port}/v1" if port else None)
+        self.api_key = api_key or os.getenv("OPENAI_API_KEY", "abc")
+        self.openai_kwargs = openai_kwargs
+        self.do_cache = cache
+        self.ports = ports
+
+    # Async client
+
+    @property
+    def client(self) -> AsyncOpenAI:
+        # if have multiple ports
+        if self.ports:
+            import random
+            port = random.choice(self.ports)
+            api_base = f"http://{self.host}:{port}/v1"
+            logger.debug(f"Using port: {port}")
+        else:
+            api_base = self.base_url or f"http://{self.host}:{self.port}/v1"
+        client = AsyncOpenAI(
+            api_key=self.api_key, base_url=api_base, **self.openai_kwargs
+        )
+        return client
+
+    # ------------------------------------------------------------------ #
+    # Public API – typed overloads
+    # ------------------------------------------------------------------ #
+    @overload
+    async def __call__(
+        self,
+        *,
+        prompt: str | None = ...,
+        messages: RawMsgs | None = ...,
+        response_format: type[str] = str,
+        return_openai_response: bool = ...,
+        **kwargs: Any,
+    ) -> str: ...
+
+    @overload
+    async def __call__(
+        self,
+        *,
+        prompt: str | None = ...,
+        messages: RawMsgs | None = ...,
+        response_format: Type[TModel],
+        return_openai_response: bool = ...,
+        **kwargs: Any,
+    ) -> TModel: ...
+
+    async def __call__(
+        self,
+        prompt: Optional[str] = None,
+        messages: Optional[RawMsgs] = None,
+        response_format: Union[type[str], Type[BaseModel]] = str,
+        cache: Optional[bool] = None,
+        max_tokens: Optional[int] = None,
+        return_openai_response: bool = False,
+        **kwargs: Any,
+    ):
+        if (prompt is None) == (messages is None):
+            raise ValueError("Provide *either* `prompt` or `messages` (but not both).")
+
+        if prompt is not None:
+            messages = [{"role": "user", "content": prompt}]
+
+        assert messages is not None
+        # assert self.model is not None, "Model must be set before calling."
+        if not self.model:
+            models = await self.list_models(port=self.port, host=self.host)
+            self.model = models[0] if models else None
+            logger.info(
+                f"No model specified. Using the first available model. {self.model}"
+            )
+        openai_msgs: Messages = (
+            self._convert_messages(cast(LegacyMsgs, messages))
+            if isinstance(messages[0], dict)
+            else cast(Messages, messages)
+        )
+
+        kw = dict(
+            self.openai_kwargs,
+            temperature=self.temperature,
+            max_tokens=max_tokens or self.max_tokens,
+        )
+        kw.update(kwargs)
+        use_cache = self.do_cache if cache is None else cache
+
+        raw_response = await self._call_raw(
+            openai_msgs,
+            response_format=response_format,
+            use_cache=use_cache,
+            **kw,
+        )
+
+        if return_openai_response:
+            response = raw_response
+        else:
+            response = self._parse_output(raw_response, response_format)
+
+        self.last_log = [prompt, messages, raw_response]
+        return response
+
+    # ------------------------------------------------------------------ #
+    # Model invocation (async)
+    # ------------------------------------------------------------------ #
+    async def _call_raw(
+        self,
+        messages: Sequence[ChatCompletionMessageParam],
+        response_format: Union[type[str], Type[BaseModel]],
+        use_cache: bool,
+        **kw: Any,
+    ):
+        assert self.model is not None, "Model must be set before making a call."
+        model: str = self.model
+
+        cache_key = (
+            self._cache_key(messages, kw, response_format) if use_cache else None
+        )
+        if cache_key and (hit := self._load_cache(cache_key)) is not None:
+            return hit
+
+        try:
+            if response_format is not str and issubclass(response_format, BaseModel):
+                openai_response = await self.client.beta.chat.completions.parse(
+                    model=model,
+                    messages=list(messages),
+                    response_format=response_format,  # type: ignore[arg-type]
+                    **kw,
+                )
+            else:
+                openai_response = await self.client.chat.completions.create(
+                    model=model,
+                    messages=list(messages),
+                    **kw,
+                )
+
+        except (AuthenticationError, RateLimitError) as exc:
+            logger.error(exc)
+            raise
+
+        if cache_key:
+            self._dump_cache(cache_key, openai_response)
+
+        return openai_response
+
+    # ------------------------------------------------------------------ #
+    # Utilities below are unchanged (sync I/O is acceptable)
+    # ------------------------------------------------------------------ #
+    @staticmethod
+    def _convert_messages(msgs: LegacyMsgs) -> Messages:
+        converted: Messages = []
+        for msg in msgs:
+            role = msg["role"]
+            content = msg["content"]
+            if role == "user":
+                converted.append(
+                    ChatCompletionUserMessageParam(role="user", content=content)
+                )
+            elif role == "assistant":
+                converted.append(
+                    ChatCompletionAssistantMessageParam(
+                        role="assistant", content=content
+                    )
+                )
+            elif role == "system":
+                converted.append(
+                    ChatCompletionSystemMessageParam(role="system", content=content)
+                )
+            elif role == "tool":
+                converted.append(
+                    ChatCompletionToolMessageParam(
+                        role="tool",
+                        content=content,
+                        tool_call_id=msg.get("tool_call_id") or "",
+                    )
+                )
+            else:
+                converted.append({"role": role, "content": content})  # type: ignore[arg-type]
+        return converted
+
+    @staticmethod
+    def _parse_output(
+        raw_response: Any, response_format: Union[type[str], Type[BaseModel]]
+    ) -> str | BaseModel:
+        if hasattr(raw_response, "model_dump"):
+            raw_response = raw_response.model_dump()
+
+        if response_format is str:
+            if isinstance(raw_response, dict) and "choices" in raw_response:
+                message = raw_response["choices"][0]["message"]
+                return message.get("content", "") or ""
+            return cast(str, raw_response)
+
+        model_cls = cast(Type[BaseModel], response_format)
+
+        if isinstance(raw_response, dict) and "choices" in raw_response:
+            message = raw_response["choices"][0]["message"]
+            if "parsed" in message:
+                return model_cls.model_validate(message["parsed"])
+            content = message.get("content")
+            if content is None:
+                raise ValueError("Model returned empty content")
+            try:
+                data = json.loads(content)
+                return model_cls.model_validate(data)
+            except Exception as exc:
+                raise ValueError(
+                    f"Failed to parse model output as JSON:\n{content}"
+                ) from exc
+
+        if isinstance(raw_response, model_cls):
+            return raw_response
+        if isinstance(raw_response, dict):
+            return model_cls.model_validate(raw_response)
+
+        try:
+            data = json.loads(raw_response)
+            return model_cls.model_validate(data)
+        except Exception as exc:
+            raise ValueError(
+                f"Model did not return valid JSON:\n---\n{raw_response}"
+            ) from exc
+
+    # ------------------------------------------------------------------ #
+    # Simple disk cache (sync)
+    # ------------------------------------------------------------------ #
+    @staticmethod
+    def _cache_key(
+        messages: Any, kw: Any, response_format: Union[type[str], Type[BaseModel]]
+    ) -> str:
+        tag = response_format.__name__ if response_format is not str else "text"
+        blob = json.dumps([messages, kw, tag], sort_keys=True).encode()
+        return base64.urlsafe_b64encode(hashlib.sha256(blob).digest()).decode()[:22]
+
+    @staticmethod
+    def _cache_path(key: str) -> str:
+        return os.path.expanduser(f"~/.cache/lm/{key}.json")
+
+    def _dump_cache(self, key: str, val: Any) -> None:
+        try:
+            path = self._cache_path(key)
+            os.makedirs(os.path.dirname(path), exist_ok=True)
+            with open(path, "w") as fh:
+                if isinstance(val, BaseModel):
+                    json.dump(val.model_dump(mode="json"), fh)
+                else:
+                    json.dump(val, fh)
+        except Exception as exc:
+            logger.debug(f"cache write skipped: {exc}")
+
+    def _load_cache(self, key: str) -> Any | None:
+        path = self._cache_path(key)
+        if not os.path.exists(path):
+            return None
+        try:
+            with open(path) as fh:
+                return json.load(fh)
+        except Exception:
+            return None
+
+    # ------------------------------------------------------------------ #
+    # Utility helpers
+    # ------------------------------------------------------------------ #
+    async def inspect_history(self) -> None:
+        if not hasattr(self, "last_log"):
+            raise ValueError("No history available. Please call the model first.")
+
+        prompt, messages, response = self.last_log
+        if hasattr(response, "model_dump"):
+            response = response.model_dump()
+        if not messages:
+            messages = [{"role": "user", "content": prompt}]
+
+        print("\n\n")
+        print(_blue("[Conversation History]") + "\n")
+
+        for msg in messages:
+            role = msg["role"]
+            content = msg["content"]
+            print(_red(f"{role.capitalize()}:"))
+            if isinstance(content, str):
+                print(content.strip())
+            elif isinstance(content, list):
+                for item in content:
+                    if item.get("type") == "text":
+                        print(item["text"].strip())
+                    elif item.get("type") == "image_url":
+                        image_url = item["image_url"]["url"]
+                        if "base64" in image_url:
+                            len_base64 = len(image_url.split("base64,")[1])
+                            print(_blue(f"<IMAGE BASE64 ENCODED({len_base64})>"))
+                        else:
+                            print(_blue(f"<image_url: {image_url}>"))
+            print("\n")
+
+        print(_red("Response:"))
+        if isinstance(response, dict) and response.get("choices"):
+            message = response["choices"][0].get("message", {})
+            reasoning = message.get("reasoning_content")
+            parsed = message.get("parsed")
+            content = message.get("content")
+            if reasoning:
+                print(_yellow("<think>"))
+                print(reasoning.strip())
+                print(_yellow("</think>\n"))
+            if parsed:
+                print(
+                    json.dumps(
+                        (
+                            parsed.model_dump()
+                            if hasattr(parsed, "model_dump")
+                            else parsed
+                        ),
+                        indent=2,
+                    )
+                    + "\n"
+                )
+            elif content:
+                print(content.strip())
+            else:
+                print(_green("[No content]"))
+            if len(response["choices"]) > 1:
+                print(
+                    _blue(f"\n(Plus {len(response['choices']) - 1} other completions)")
+                )
+        else:
+            print(_yellow("Warning: Not a standard OpenAI response object"))
+            if isinstance(response, str):
+                print(_green(response.strip()))
+            elif isinstance(response, dict):
+                print(_green(json.dumps(response, indent=2)))
+            else:
+                print(_green(str(response)))
+
+    # ------------------------------------------------------------------ #
+    # Misc helpers
+    # ------------------------------------------------------------------ #
+    def set_model(self, model: str) -> None:
+        self.model = model
+
+    @staticmethod
+    async def list_models(port=None, host="localhost") -> List[str]:
+        try:
+            client: AsyncOpenAI = AsyncLM(port=port, host=host).client  # type: ignore[arg-type]
+            base_url: URL = client.base_url
+            logger.debug(f"Base URL: {base_url}")
+            models: AsyncSyncPage[Model] = await client.models.list()  # type: ignore[assignment]
+            return [model.id for model in models.data]
+        except Exception as exc:
+            logger.error(f"Failed to list models: {exc}")
+            return []
llm_utils/lm/lm.py
CHANGED

@@ -4,6 +4,7 @@ import base64
 import hashlib
 import json
 import os
+from token import OP
 from typing import (
     Any,
     Dict,
@@ -18,7 +19,9 @@ from typing import (
 )
 
 from httpx import URL
+from huggingface_hub import repo_info
 from loguru import logger
+from numpy import isin
 from openai import OpenAI, AuthenticationError, RateLimitError
 from openai.pagination import SyncPage
 from openai.types.chat import (
@@ -42,6 +45,29 @@ LegacyMsgs = List[Dict[str, str]]  # old “…role/content…” dicts
 RawMsgs = Union[Messages, LegacyMsgs]  # what __call__ accepts
 
 
+# --------------------------------------------------------------------------- #
+# color formatting helpers
+# --------------------------------------------------------------------------- #
+def _red(text: str) -> str:
+    """Format text with red color."""
+    return f"\x1b[31m{text}\x1b[0m"
+
+
+def _green(text: str) -> str:
+    """Format text with green color."""
+    return f"\x1b[32m{text}\x1b[0m"
+
+
+def _blue(text: str) -> str:
+    """Format text with blue color."""
+    return f"\x1b[34m{text}\x1b[0m"
+
+
+def _yellow(text: str) -> str:
+    """Format text with yellow color."""
+    return f"\x1b[33m{text}\x1b[0m"
+
+
 class LM:
     """
     Unified language-model wrapper.
@@ -73,6 +99,7 @@ class LM:
         self.api_key = api_key or os.getenv("OPENAI_API_KEY", "abc")
         self.openai_kwargs = openai_kwargs
         self.do_cache = cache
+        self._init_port = port  # <-- store the port provided at init
 
         self.client = OpenAI(api_key=self.api_key, base_url=self.base_url)
 
@@ -90,6 +117,7 @@ class LM:
         prompt: str | None = ...,
         messages: RawMsgs | None = ...,
         response_format: type[str] = str,
+        return_openai_response: bool = ...,
         **kwargs: Any,
     ) -> str: ...
 
@@ -100,6 +128,7 @@ class LM:
         prompt: str | None = ...,
         messages: RawMsgs | None = ...,
         response_format: Type[TModel],
+        return_openai_response: bool = ...,
         **kwargs: Any,
     ) -> TModel: ...
 
@@ -111,6 +140,7 @@ class LM:
         response_format: Union[type[str], Type[BaseModel]] = str,
         cache: Optional[bool] = None,
         max_tokens: Optional[int] = None,
+        return_openai_response: bool = False,
         **kwargs: Any,
     ):
         # argument validation ------------------------------------------------
@@ -121,7 +151,20 @@ class LM:
             messages = [{"role": "user", "content": prompt}]
 
         assert messages is not None  # for type-checker
-
+
+        # If model is not specified, but port is provided, use the first available model
+        if self.model is None:
+            port = self._init_port
+            if port:
+                available_models = self.list_models(port=port)
+                if available_models:
+                    self.model = available_models[0]
+                    logger.info(f"Auto-selected model: {self.model}")
+                else:
+                    raise ValueError("No models available to select from.")
+            else:
+                raise AssertionError("Model must be set before calling.")
+
         openai_msgs: Messages = (
             self._convert_messages(cast(LegacyMsgs, messages))
             if isinstance(messages[0], dict)  # legacy style
@@ -132,17 +175,119 @@ class LM:
             self.openai_kwargs,
             temperature=self.temperature,
             max_tokens=max_tokens or self.max_tokens,
-            **kwargs,
         )
+        kw.update(kwargs)
         use_cache = self.do_cache if cache is None else cache
 
-
+        raw_response = self._call_raw(
             openai_msgs,
             response_format=response_format,
             use_cache=use_cache,
             **kw,
         )
-
+
+        if return_openai_response:
+            response = raw_response
+        else:
+            response = self._parse_output(raw_response, response_format)
+
+        self.last_log = [prompt, messages, raw_response]
+        return response
+
+    def inspect_history(self) -> None:
+        if not hasattr(self, "last_log"):
+            raise ValueError("No history available. Please call the model first.")
+
+        prompt, messages, response = self.last_log
+        # Ensure response is a dictionary
+        if hasattr(response, "model_dump"):
+            response = response.model_dump()
+
+        if not messages:
+            messages = [{"role": "user", "content": prompt}]
+
+        print("\n\n")
+        print(_blue("[Conversation History]") + "\n")
+
+        # Print all messages in the conversation
+        for msg in messages:
+            role = msg["role"]
+            content = msg["content"]
+            print(_red(f"{role.capitalize()}:"))
+
+            if isinstance(content, str):
+                print(content.strip())
+            elif isinstance(content, list):
+                # Handle multimodal content
+                for item in content:
+                    if item.get("type") == "text":
+                        print(item["text"].strip())
+                    elif item.get("type") == "image_url":
+                        image_url = item["image_url"]["url"]
+                        if "base64" in image_url:
+                            len_base64 = len(image_url.split("base64,")[1])
+                            print(_blue(f"<IMAGE BASE64 ENCODED({len_base64})>"))
+                        else:
+                            print(_blue(f"<image_url: {image_url}>"))
+            print("\n")
+
+        # Print the response - now always an OpenAI completion
+        print(_red("Response:"))
+
+        # Handle OpenAI response object
+        if isinstance(response, dict) and "choices" in response and response["choices"]:
+            message = response["choices"][0].get("message", {})
+
+            # Check for reasoning content (if available)
+            reasoning = message.get("reasoning_content")
+
+            # Check for parsed content (structured mode)
+            parsed = message.get("parsed")
+
+            # Get regular content
+            content = message.get("content")
+
+            # Display reasoning if available
+            if reasoning:
+                print(_yellow("<think>"))
+                print(reasoning.strip())
+                print(_yellow("</think>"))
+                print()
+
+            # Display parsed content for structured responses
+            if parsed:
+                # print(_green('<Parsed Structure>'))
+                if hasattr(parsed, "model_dump"):
+                    print(json.dumps(parsed.model_dump(), indent=2))
+                else:
+                    print(json.dumps(parsed, indent=2))
+                # print(_green('</Parsed Structure>'))
+                print()
+
+            else:
+                if content:
+                    # print(_green("<Content>"))
+                    print(content.strip())
+                    # print(_green("</Content>"))
+                else:
+                    print(_green("[No content]"))
+
+            # Show if there were multiple completions
+            if len(response["choices"]) > 1:
+                print(
+                    _blue(f"\n(Plus {len(response['choices']) - 1} other completions)")
+                )
+        else:
+            # Fallback for non-standard response objects or cached responses
+            print(_yellow("Warning: Not a standard OpenAI response object"))
+            if isinstance(response, str):
+                print(_green(response.strip()))
+            elif isinstance(response, dict):
+                print(_green(json.dumps(response, indent=2)))
+            else:
+                print(_green(str(response)))
+
+        # print("\n\n")
 
     # --------------------------------------------------------------------- #
     # low-level OpenAI call
@@ -156,6 +301,7 @@ class LM:
     ):
         assert self.model is not None, "Model must be set before making a call."
         model: str = self.model
+
         cache_key = (
             self._cache_key(messages, kw, response_format) if use_cache else None
         )
@@ -165,31 +311,28 @@ class LM:
         try:
             # structured mode
             if response_format is not str and issubclass(response_format, BaseModel):
-
-
-
-
-
-                    **kw,
-                )
+                openai_response = self.client.beta.chat.completions.parse(
+                    model=model,
+                    messages=list(messages),
+                    response_format=response_format,  # type: ignore[arg-type]
+                    **kw,
                 )
-                result: Any = rsp.choices[0].message.parsed  # already a model
             # plain-text mode
             else:
-
+                openai_response = self.client.chat.completions.create(
                     model=model,
                     messages=list(messages),
                     **kw,
                 )
-
+
         except (AuthenticationError, RateLimitError) as exc:  # pragma: no cover
             logger.error(exc)
             raise
 
         if cache_key:
-            self._dump_cache(cache_key,
+            self._dump_cache(cache_key, openai_response)
 
-        return
+        return openai_response
 
     # --------------------------------------------------------------------- #
     # legacy → typed messages
@@ -232,31 +375,67 @@ class LM:
     # --------------------------------------------------------------------- #
     @staticmethod
     def _parse_output(
-
+        raw_response: Any,
         response_format: Union[type[str], Type[BaseModel]],
     ) -> str | BaseModel:
+        # Convert any object to dict if needed
+        if hasattr(raw_response, "model_dump"):
+            raw_response = raw_response.model_dump()
+
         if response_format is str:
-
+            # Extract the content from OpenAI response dict
+            if isinstance(raw_response, dict) and "choices" in raw_response:
+                message = raw_response["choices"][0]["message"]
+                return message.get("content", "") or ""
+            return cast(str, raw_response)
 
         # For the type-checker: we *know* it's a BaseModel subclass here.
         model_cls = cast(Type[BaseModel], response_format)
 
-
-
-
-
+        # Handle structured response
+        if isinstance(raw_response, dict) and "choices" in raw_response:
+            message = raw_response["choices"][0]["message"]
+
+            # Check if already parsed by OpenAI client
+            if "parsed" in message:
+                return model_cls.model_validate(message["parsed"])
+
+            # Need to parse the content
+            content = message.get("content")
+            if content is None:
+                raise ValueError("Model returned empty content")
+
+            try:
+                data = json.loads(content)
+                return model_cls.model_validate(data)
+            except Exception as exc:
+                raise ValueError(
+                    f"Failed to parse model output as JSON:\n{content}"
+                ) from exc
+
+        # Handle cached response or other formats
+        if isinstance(raw_response, model_cls):
+            return raw_response
+        if isinstance(raw_response, dict):
+            return model_cls.model_validate(raw_response)
+
+        # Try parsing as JSON string
         try:
-            data = json.loads(
-
-
+            data = json.loads(raw_response)
+            return model_cls.model_validate(data)
+        except Exception as exc:
+            raise ValueError(
+                f"Model did not return valid JSON:\n---\n{raw_response}"
+            ) from exc
 
     # --------------------------------------------------------------------- #
     # tiny disk cache
     # --------------------------------------------------------------------- #
     @staticmethod
     def _cache_key(
-        messages: Any,
+        messages: Any,
+        kw: Any,
+        response_format: Union[type[str], Type[BaseModel]],
     ) -> str:
         tag = response_format.__name__ if response_format is not str else "text"
         blob = json.dumps([messages, kw, tag], sort_keys=True).encode()
@@ -289,7 +468,7 @@ class LM:
         return None
 
     @staticmethod
-    def list_models(port=None, host=
+    def list_models(port=None, host="localhost") -> List[str]:
         """
         List available models.
         """
@@ -302,3 +481,78 @@ class LM:
         except Exception as exc:
             logger.error(f"Failed to list models: {exc}")
             return []
+
+
+from functools import cache
+from llm_utils.lm.lm import LM, RawMsgs
+from pydantic import BaseModel
+import re
+import json
+from typing import *
+import re
+
+
+class LMReasoner(LM):
+    "Regex-based reasoning wrapper for LM."
+
+    def build_regex_from_pydantic(self, model: type[BaseModel]) -> str:
+        """
+        Build a regex pattern string for validating output that should match a Pydantic model.
+
+        Args:
+            model: A Pydantic BaseModel class
+
+        Returns:
+            A regex string that matches a JSON representation of the model
+        """
+        # regex = f"<think>\\n.*?\\n</think>\\n\\n\\```json\\n.*"
+        print(f"{regex=}")
+
+        return regex
+
+    def __call__(
+        self,
+        response_format: type[BaseModel],
+        prompt: Optional[str] = None,
+        messages: Optional[RawMsgs] = None,
+        **kwargs,
+    ):
+
+        if prompt is not None:
+            output = super().__call__(
+                prompt=prompt
+                + "\nresponse_format:"
+                + str(response_format.model_json_schema()),
+                response_format=str,
+                # extra_body={"guided_regex": regex},
+                **kwargs,
+            )  # type: ignore
+        elif messages is not None:
+            # append last message with the json schema
+            messages[-1]["content"] += "\nresponse_format:" + str(  # type: ignore
+                response_format.model_json_schema()
+            )
+            output = super().__call__(
+                messages=messages,
+                response_format=str,
+                # extra_body={"guided_regex": regex},
+                **kwargs,
+            )
+        else:
+            raise ValueError("Either prompt or messages must be provided.")
+        # import ipdb; ipdb.set_trace()
+        # parse using regex
+        pattern = re.compile(
+            r"<think>\n(?P<think>.*?)\n</think>\n\n(?P<json>\{.*\})",
+            re.DOTALL,
+        )
+        match = pattern.search(output)
+        if not match:
+            raise ValueError("Output does not match expected format")
+        parsed_output = match.group(0)
+        think_part = match.group("think")
+        json_part = match.group("json")
+
+        pydantic_object = response_format.model_validate(json.loads(json_part))
+        return pydantic_object
+
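
A sketch of how the new LMReasoner subclass might be driven. As the diff shows, it appends the Pydantic JSON schema to the prompt, calls the base LM in plain-text mode, then expects a <think>…</think> block followed by a JSON object in the reply, which it validates. The schema, model name, and port below are placeholders, and the sketch assumes the LM constructor accepts the same model/port arguments surfaced in this diff.

    from pydantic import BaseModel
    from llm_utils import LMReasoner

    class Answer(BaseModel):        # hypothetical response schema
        final_answer: str
        confidence: float

    # Placeholder endpoint: assumes an OpenAI-compatible server whose replies
    # contain a <think>...</think> block followed by a JSON object.
    reasoner = LMReasoner(model="my-reasoning-model", port=8140)
    result = reasoner(response_format=Answer, prompt="Is 17 prime?")
    print(result.final_answer, result.confidence)
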
llm_utils/scripts/vllm_load_balancer.py
CHANGED

@@ -5,18 +5,19 @@ import time
 from tabulate import tabulate
 import contextlib
 import aiohttp  # <-- Import aiohttp
+from speedy_utils import setup_logger
 from loguru import logger
-
+setup_logger(min_interval=5)
 # --- Configuration ---
 LOAD_BALANCER_HOST = "0.0.0.0"
 LOAD_BALANCER_PORT = 8008
 
 SCAN_TARGET_HOST = "localhost"
-SCAN_PORT_START =
+SCAN_PORT_START = 8140
 SCAN_PORT_END = 8170  # Inclusive
 SCAN_INTERVAL = 30
 # Timeout applies to the HTTP health check request now
-HEALTH_CHECK_TIMEOUT = 2
+HEALTH_CHECK_TIMEOUT = 2  # Increased slightly for HTTP requests
 
 STATUS_PRINT_INTERVAL = 5
 BUFFER_SIZE = 4096
@@ -83,14 +84,14 @@ async def check_server_health(session, host, port):
         # Check for a successful status code (2xx range)
         if 200 <= response.status < 300:
             logger.debug(
-                f"Health check success for {url} (Status: {response.status})"
+                f"[{LOAD_BALANCER_PORT=}] Health check success for {url} (Status: {response.status})"
             )
             # Ensure the connection is released back to the pool
             await response.release()
             return True
         else:
             logger.debug(
-                f"Health check failed for {url} (Status: {response.status})"
+                f"[{LOAD_BALANCER_PORT=}] Health check failed for {url} (Status: {response.status})"
            )
             await response.release()
             return False
@@ -180,7 +181,7 @@ async def scan_and_update_servers():
             if server not in connection_counts:
                 connection_counts[server] = 0
 
-        logger.debug(f"Scan complete. Active servers: {available_servers}")
+        logger.debug(f"[{LOAD_BALANCER_PORT=}]Scan complete. Active servers: {available_servers}")
 
     except asyncio.CancelledError:
         logger.info("Server scan task cancelled.")
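
The health-check context above amounts to an HTTP GET with a short timeout against each candidate port. A standalone sketch of the same idea follows; the /health path is an assumption (the actual URL is outside the changed lines), and the port is taken from the SCAN_PORT_START value set in this diff.

    import asyncio
    import aiohttp

    async def is_healthy(host: str, port: int, timeout: float = 2.0) -> bool:
        """Return True if the server answers a GET with a 2xx status."""
        url = f"http://{host}:{port}/health"   # assumed health endpoint
        try:
            async with aiohttp.ClientSession() as session:
                async with session.get(url, timeout=aiohttp.ClientTimeout(total=timeout)) as resp:
                    return 200 <= resp.status < 300
        except (aiohttp.ClientError, asyncio.TimeoutError):
            return False

    # e.g. probe the first port of the configured scan range 8140..8170
    print(asyncio.run(is_healthy("localhost", 8140)))
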
llm_utils/scripts/vllm_serve.py
CHANGED

@@ -9,19 +9,17 @@ Serve a base model:
     svllm serve --model MODEL_NAME --gpus GPU_GROUPS
 
 Add a LoRA to a served model:
-    svllm add-lora --lora LORA_NAME LORA_PATH --host_port host:port
+    svllm add-lora --lora LORA_NAME LORA_PATH --host_port host:port
+    (if add then the port must be specify)
 """
 
-from glob import glob
 import os
 import subprocess
-import
-from typing import List, Literal, Optional
-from fastcore.script import call_parse
-from loguru import logger
+from typing import List, Optional
 import argparse
 import requests
 import openai
+from loguru import logger
 
 from speedy_utils.common.utils_io import load_by_ext
 
@@ -32,63 +30,22 @@ HF_HOME: str = os.environ.get("HF_HOME", os.path.expanduser("~/.cache/huggingfac
 logger.info(f"LORA_DIR: {LORA_DIR}")
 
 
-def model_list(host_port, api_key="abc"):
+def model_list(host_port: str, api_key: str = "abc") -> None:
+    """List models from the vLLM server."""
     client = openai.OpenAI(base_url=f"http://{host_port}/v1", api_key=api_key)
     models = client.models.list()
     for model in models:
         print(f"Model ID: {model.id}")
 
 
-def kill_existing_vllm(vllm_binary: Optional[str] = None) -> None:
-    """Kill selected vLLM processes using fzf."""
-    if not vllm_binary:
-        vllm_binary = get_vllm()
-
-    # List running vLLM processes
-    result = subprocess.run(
-        f"ps aux | grep {vllm_binary} | grep -v grep",
-        shell=True,
-        capture_output=True,
-        text=True,
-    )
-    processes = result.stdout.strip().split("\n")
-
-    if not processes or processes == [""]:
-        print("No running vLLM processes found.")
-        return
-
-    # Use fzf to select processes to kill
-    fzf = subprocess.Popen(
-        ["fzf", "--multi"],
-        stdin=subprocess.PIPE,
-        stdout=subprocess.PIPE,
-        text=True,
-    )
-    selected, _ = fzf.communicate("\n".join(processes))
-
-    if not selected:
-        print("No processes selected.")
-        return
-
-    # Extract PIDs and kill selected processes
-    pids = [line.split()[1] for line in selected.strip().split("\n")]
-    for pid in pids:
-        subprocess.run(
-            f"kill -9 {pid}",
-            shell=True,
-            stdout=subprocess.DEVNULL,
-            stderr=subprocess.DEVNULL,
-        )
-    print(f"Killed processes: {', '.join(pids)}")
-
-
 def add_lora(
     lora_name_or_path: str,
     host_port: str,
     url: str = "http://HOST:PORT/v1/load_lora_adapter",
     served_model_name: Optional[str] = None,
-    lora_module: Optional[str] = None,
+    lora_module: Optional[str] = None,
 ) -> dict:
+    """Add a LoRA adapter to a running vLLM server."""
     url = url.replace("HOST:PORT", host_port)
     headers = {"Content-Type": "application/json"}
 
@@ -96,15 +53,12 @@ def add_lora(
         "lora_name": served_model_name,
         "lora_path": os.path.abspath(lora_name_or_path),
     }
-    if lora_module:
+    if lora_module:
         data["lora_module"] = lora_module
     logger.info(f"{data=}, {headers}, {url=}")
-    # logger.warning(f"Failed to unload LoRA adapter: {str(e)}")
     try:
-        response = requests.post(url, headers=headers, json=data)
+        response = requests.post(url, headers=headers, json=data, timeout=10)
         response.raise_for_status()
-
-        # Handle potential non-JSON responses
         try:
             return response.json()
         except ValueError:
@@ -116,113 +70,100 @@ def add_lora(
                 else "Request completed with empty response"
             ),
         }
-
     except requests.exceptions.RequestException as e:
         logger.error(f"Request failed: {str(e)}")
         return {"error": f"Request failed: {str(e)}"}
 
 
-def unload_lora(lora_name, host_port):
+def unload_lora(lora_name: str, host_port: str) -> Optional[dict]:
+    """Unload a LoRA adapter from a running vLLM server."""
     try:
         url = f"http://{host_port}/v1/unload_lora_adapter"
         logger.info(f"{url=}")
         headers = {"Content-Type": "application/json"}
         data = {"lora_name": lora_name}
         logger.info(f"Unloading LoRA adapter: {data=}")
-        response = requests.post(url, headers=headers, json=data)
+        response = requests.post(url, headers=headers, json=data, timeout=10)
         response.raise_for_status()
         logger.success(f"Unloaded LoRA adapter: {lora_name}")
     except requests.exceptions.RequestException as e:
         return {"error": f"Request failed: {str(e)}"}
 
 
-def serve(
-    model: str,
-    gpu_groups: str,
-    served_model_name: Optional[str] = None,
-    port_start: int = 8155,
-    gpu_memory_utilization: float = 0.93,
-    dtype: str = "bfloat16",
-    max_model_len: int = 8192,
-    enable_lora: bool = False,
-    is_bnb: bool = False,
-    eager: bool = False,
-    lora_modules: Optional[List[str]] = None,  # Updated type
-) -> None:
-    """Main function to start or kill vLLM containers."""
-
+def serve(args) -> None:
     """Start vLLM containers with dynamic args."""
     print("Starting vLLM containers...,")
-    gpu_groups_arr: List[str] = gpu_groups.split(",")
-
-    if enable_lora:
-
-
-
-
-
-
-
-
-
+    gpu_groups_arr: List[str] = args.gpu_groups.split(",")
+    vllm_binary: str = get_vllm()
+    if args.enable_lora:
+        vllm_binary = "VLLM_ALLOW_RUNTIME_LORA_UPDATING=True " + vllm_binary
+
+    if (
+        not args.bnb
+        and args.model
+        and ("bnb" in args.model.lower() or "4bit" in args.model.lower())
+    ):
+        args.bnb = True
+        print(f"Auto-detected quantization for model: {args.model}")
+
+    if args.enable_lora:
         os.environ["VLLM_ALLOW_RUNTIME_LORA_UPDATING"] = "True"
         print("Enabled runtime LoRA updating")
 
     for i, gpu_group in enumerate(gpu_groups_arr):
-        port =
+        port = int(args.host_port.split(":")[-1]) + i
         gpu_group = ",".join([str(x) for x in gpu_group])
         tensor_parallel = len(gpu_group.split(","))
 
         cmd = [
             f"CUDA_VISIBLE_DEVICES={gpu_group}",
-
+            vllm_binary,
             "serve",
-            model,
+            args.model,
             "--port",
             str(port),
             "--tensor-parallel",
             str(tensor_parallel),
             "--gpu-memory-utilization",
-            str(gpu_memory_utilization),
+            str(args.gpu_memory_utilization),
             "--dtype",
-            dtype,
+            args.dtype,
             "--max-model-len",
-            str(max_model_len),
+            str(args.max_model_len),
             "--enable-prefix-caching",
             "--disable-log-requests",
-            "--uvicorn-log-level critical",
+            # "--uvicorn-log-level critical",
         ]
         if HF_HOME:
-            # insert
             cmd.insert(0, f"HF_HOME={HF_HOME}")
-        if eager:
+        if args.eager:
             cmd.append("--enforce-eager")
 
-        if served_model_name:
-            cmd.extend(["--served-model-name", served_model_name])
+        if args.served_model_name:
+            cmd.extend(["--served-model-name", args.served_model_name])
 
-        if
+        if args.bnb:
             cmd.extend(
                 ["--quantization", "bitsandbytes", "--load-format", "bitsandbytes"]
             )
 
-        if enable_lora:
+        if args.enable_lora:
             cmd.extend(["--fully-sharded-loras", "--enable-lora"])
 
-        if lora_modules:
-
-            # len must be even and we will join tuple with `=`
-            assert len(lora_modules) % 2 == 0, "lora_modules must be even"
-            # lora_modulle = [f'{name}={module}' for name, module in zip(lora_module[::2], lora_module[1::2])]
-            # import ipdb;ipdb.set_trace()
+        if args.lora_modules:
+            assert len(args.lora_modules) % 2 == 0, "lora_modules must be even"
            s = ""
-            for i in range(0, len(lora_modules), 2):
-                name = lora_modules[i]
-                module = lora_modules[i + 1]
+            for i in range(0, len(args.lora_modules), 2):
+                name = args.lora_modules[i]
+                module = args.lora_modules[i + 1]
                 s += f"{name}={module} "
-
             cmd.extend(["--lora-modules", s])
-
+
+        if hasattr(args, "enable_reasoning") and args.enable_reasoning:
+            cmd.extend(["--enable-reasoning", "--reasoning-parser", "deepseek_r1"])
+            # Add VLLM_USE_V1=0 to the environment for reasoning mode
+            cmd.insert(0, "VLLM_USE_V1=0")
+
         final_cmd = " ".join(cmd)
         log_file = f"/tmp/vllm_{port}.txt"
         final_cmd_with_log = f'"{final_cmd} 2>&1 | tee {log_file}"'
@@ -235,14 +176,15 @@ def serve(
         os.system(run_in_tmux)
 
 
-def get_vllm():
-
-
-
+def get_vllm() -> str:
+    """Get the vLLM binary path."""
+    vllm_binary = subprocess.check_output("which vllm", shell=True, text=True).strip()
+    vllm_binary = os.getenv("VLLM_BINARY", vllm_binary)
+    logger.info(f"vLLM binary: {vllm_binary}")
     assert os.path.exists(
-
-    ), f"vLLM binary not found at {
-    return
+        vllm_binary
+    ), f"vLLM binary not found at {vllm_binary}, please set VLLM_BINARY env variable"
+    return vllm_binary
 
 
 def get_args():
@@ -292,11 +234,11 @@ def get_args():
         "--max_model_len", "-mml", type=int, default=8192, help="Maximum model length"
     )
     parser.add_argument(
-        "--
+        "--enable_lora",
         dest="enable_lora",
-        action="
+        action="store_true",
         help="Disable LoRA support",
-        default=
+        default=False,
     )
     parser.add_argument("--bnb", action="store_true", help="Enable quantization")
     parser.add_argument(
@@ -330,6 +272,9 @@ def get_args():
         type=str,
         help="List of LoRA modules in the format lora_name lora_module",
     )
+    parser.add_argument(
+        "--enable-reasoning", action="store_true", help="Enable reasoning"
+    )
     return parser.parse_args()
 
 
@@ -371,23 +316,8 @@ def main():
         logger.info(f"Model name from LoRA config: {model_name}")
         args.model = model_name
         # port_start from hostport
-
-        serve(
-            args.model,
-            args.gpu_groups,
-            args.served_model_name,
-            port_start,
-            args.gpu_memory_utilization,
-            args.dtype,
-            args.max_model_len,
-            args.enable_lora,
-            args.bnb,
-            args.eager,
-            args.lora_modules,
-        )
+        serve(args)
 
-    elif args.mode == "kill":
-        kill_existing_vllm(args.vllm_binary)
     elif args.mode == "add_lora":
         if args.lora:
             lora_name, lora_path = args.lora
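
As a quick illustration of the client-side helpers touched above, a hedged sketch of listing models and hot-loading then unloading a LoRA adapter against a running server. Host, port, adapter path, and adapter name are placeholders.

    from llm_utils.scripts.vllm_serve import model_list, add_lora, unload_lora

    HOST_PORT = "localhost:8140"   # placeholder host:port of a running vLLM server

    model_list(HOST_PORT)          # prints the IDs of the currently served models

    # Load an adapter from disk under a chosen name, then remove it again.
    result = add_lora("/path/to/my-lora", HOST_PORT, served_model_name="my-lora")
    print(result)
    unload_lora("my-lora", HOST_PORT)
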
{speedy_utils-1.0.9.dist-info → speedy_utils-1.0.12.dist-info}/RECORD
CHANGED

@@ -1,14 +1,15 @@
-llm_utils/__init__.py,sha256=
+llm_utils/__init__.py,sha256=JcRRsx6dtGLD8nwIz90Iowj6PLOO2_hUi254VbDpc_I,773
 llm_utils/chat_format/__init__.py,sha256=8dBIUqFJvkgQYedxBtcyxt-4tt8JxAKVap2JlTXmgaM,737
 llm_utils/chat_format/display.py,sha256=a3zWzo47SUf4i-uic-dwf-vxtu6gZWLbnJrszjjZjQ8,9801
 llm_utils/chat_format/transform.py,sha256=328V18FOgRQzljAl9Mh8NF4Tl-N3cZZIPmAwHQspXCY,5461
 llm_utils/chat_format/utils.py,sha256=xTxN4HrLHcRO2PfCTR43nH1M5zCa7v0kTTdzAcGkZg0,1229
 llm_utils/group_messages.py,sha256=wyiZzs7O8yK2lyIakV2x-1CrrWVT12sjnP1vVnmPet4,3606
-llm_utils/lm/__init__.py,sha256=
-llm_utils/lm/
+llm_utils/lm/__init__.py,sha256=e8eCWlLo39GZjq9CokludZGHYVZ7BnbWZ6GOJoiWGzE,110
+llm_utils/lm/alm.py,sha256=mJvB6uAzfakIjA7We19-VJNI9UKKkdfqeef1rJlKR9A,15773
+llm_utils/lm/lm.py,sha256=3mzLYKRbo50XjHp6_WuqkfG2HqTwmozXtQjYQC81m28,19516
 llm_utils/lm/utils.py,sha256=-fDNueiXKQI6RDoNHJYNyORomf2XlCf2doJZ3GEV2Io,4762
-llm_utils/scripts/vllm_load_balancer.py,sha256=
-llm_utils/scripts/vllm_serve.py,sha256=
+llm_utils/scripts/vllm_load_balancer.py,sha256=17zaq8RJseikHVoAibGOz0p_MCLcNlnhZDkk7g4cuLc,17519
+llm_utils/scripts/vllm_serve.py,sha256=CbW_3Y9Vt7eQYoGGPT3yj1nhbLYOc3b1LdJBy1sVX-Y,11976
 speedy_utils/__init__.py,sha256=I2bSfDIE9yRF77tnHW0vqfExDA2m1gUx4AH8C9XmGtg,1707
 speedy_utils/all.py,sha256=A9jiKGjo950eg1pscS9x38OWAjKGyusoAN5mrfweY4E,3090
 speedy_utils/common/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -24,7 +25,7 @@ speedy_utils/multi_worker/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJW
 speedy_utils/multi_worker/process.py,sha256=XwQlffxzRFnCVeKjDNBZDwFfUQHiJiuFA12MRGJVru8,6708
 speedy_utils/multi_worker/thread.py,sha256=9pXjvgjD0s0Hp0cZ6I3M0ndp1OlYZ1yvqbs_bcun_Kw,12775
 speedy_utils/scripts/mpython.py,sha256=ZzkBWI5Xw3vPoMx8xQt2x4mOFRjtwWqfvAJ5_ngyWgw,3816
-speedy_utils-1.0.
-speedy_utils-1.0.
-speedy_utils-1.0.
-speedy_utils-1.0.
+speedy_utils-1.0.12.dist-info/METADATA,sha256=obiUx5u8QPzhUDupqzgjZW-pHWyR4tRUt8iNhoZtZ10,7392
+speedy_utils-1.0.12.dist-info/WHEEL,sha256=b4K_helf-jlQoXBBETfwnf4B04YC67LOev0jo4fX5m8,88
+speedy_utils-1.0.12.dist-info/entry_points.txt,sha256=rP43satgw1uHcKUAlmVxS-MTAQImL-03-WwLIB5a300,165
+speedy_utils-1.0.12.dist-info/RECORD,,
{speedy_utils-1.0.9.dist-info → speedy_utils-1.0.12.dist-info}/WHEEL
File without changes

{speedy_utils-1.0.9.dist-info → speedy_utils-1.0.12.dist-info}/entry_points.txt
File without changes