speedy-utils 1.1.4__py3-none-any.whl → 1.1.6__py3-none-any.whl
This diff shows the changes between two publicly released versions of the package, as published to its public registry, and is provided for informational purposes only.
- llm_utils/chat_format/display.py +17 -4
- llm_utils/lm/async_lm/__init__.py +2 -0
- llm_utils/lm/async_lm/_utils.py +198 -0
- llm_utils/lm/async_lm/async_llm_task.py +154 -0
- llm_utils/lm/{async_lm.py → async_lm/async_lm.py} +191 -354
- llm_utils/scripts/vllm_load_balancer.py +220 -135
- {speedy_utils-1.1.4.dist-info → speedy_utils-1.1.6.dist-info}/METADATA +1 -1
- {speedy_utils-1.1.4.dist-info → speedy_utils-1.1.6.dist-info}/RECORD +10 -7
- {speedy_utils-1.1.4.dist-info → speedy_utils-1.1.6.dist-info}/WHEEL +0 -0
- {speedy_utils-1.1.4.dist-info → speedy_utils-1.1.6.dist-info}/entry_points.txt +0 -0
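Most of the changes below rework `AsyncLM.parse` in the renamed `async_lm.py`: structured parsing gains a `use_beta` switch, schema injection and `/think` handling move into a new `_build_system_prompt` helper, model selection moves into `_set_model`, and the request/parse round trip is factored into `_call_and_parse_completion`. The following is a rough usage sketch of the 1.1.6 `parse` signature as it appears in the diff; the constructor arguments mirror the example in the removed `AsyncLLMTask` docstring, and the import path, port, model name, and `Answer` model are placeholders rather than anything the package documents.

```python
# Hypothetical sketch of the reworked AsyncLM.parse call (signature taken from
# the diff below); port, model name, and the Answer model are illustrative only.
import asyncio

from pydantic import BaseModel

from llm_utils.lm.async_lm import AsyncLM  # assumed export path for 1.1.6


class Answer(BaseModel):
    reasoning: str
    final_answer: str


async def main() -> None:
    lm = AsyncLM(port=8130, cache=False, model="gpt-3.5-turbo")
    result = await lm.parse(
        response_model=Answer,
        instruction="Answer the question and explain your reasoning.",
        prompt="What is 17 * 24?",
        add_json_schema_to_instruction=True,  # required by parse() when use_beta=False
        use_beta=False,
    )
    print(result.parsed)        # Answer instance
    print(result.messages[-1])  # assistant turn with the <think> block prepended


asyncio.run(main())
```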
llm_utils/lm/{async_lm.py → async_lm/async_lm.py}

@@ -1,28 +1,23 @@
-
+# from ._utils import *
 import base64
 import hashlib
 import json
 import os
-from abc import ABC
-from functools import cache, lru_cache
 from typing import (
     Any,
     Dict,
-    Generic,
     List,
     Literal,
     Optional,
     Sequence,
     Type,
-    TypeVar,
     Union,
     cast,
     overload,
 )
-
+
 from httpx import URL
 from loguru import logger
-from numpy import isin
 from openai import AsyncOpenAI, AuthenticationError, BadRequestError, RateLimitError
 from openai.pagination import AsyncPage as AsyncSyncPage

@@ -36,49 +31,23 @@ from openai.types.chat import (
 )
 from openai.types.model import Model
 from pydantic import BaseModel
-from pydantic import ValidationError
-from llm_utils.chat_format.display import get_conversation_one_turn
-
-# --------------------------------------------------------------------------- #
-# type helpers
-# --------------------------------------------------------------------------- #
-TModel = TypeVar("TModel", bound=BaseModel)
-Messages = List[ChatCompletionMessageParam]
-LegacyMsgs = List[Dict[str, str]]
-RawMsgs = Union[Messages, LegacyMsgs]
-
-# --------------------------------------------------------------------------- #
-# color helpers (unchanged)
-# --------------------------------------------------------------------------- #
-
-
-def _color(code: int, text: str) -> str:
-    return f"\x1b[{code}m{text}\x1b[0m"
-
-
-def _red(t):
-    return _color(31, t)
-
-
-def _green(t):
-    return _color(32, t)
-
-
-def _blue(t):
-    return _color(34, t)
-
-
-def _yellow(t):
-    return _color(33, t)
-
-
-TParsed = TypeVar("TParsed", bound=BaseModel)

-
-
-
-
-
+from speedy_utils import jloads
+
+from ._utils import (
+    LegacyMsgs,
+    Messages,
+    ParsedOutput,
+    RawMsgs,
+    TModel,
+    TParsed,
+    _blue,
+    _green,
+    _red,
+    _yellow,
+    get_tokenizer,
+    inspect_word_probs_async,
+)


 class AsyncLM:
@@ -153,6 +122,14 @@ class AsyncLM:
         **kwargs: Any,
     ) -> TModel: ...

+    async def _set_model(self) -> None:
+        if not self.model:
+            models = await self.list_models(port=self.port, host=self.host)
+            self.model = models[0] if models else None
+            logger.info(
+                f"No model specified. Using the first available model. {self.model}"
+            )
+
     async def __call__(
         self,
         prompt: Optional[str] = None,
@@ -171,12 +148,8 @@ class AsyncLM:

         assert messages is not None
         # assert self.model is not None, "Model must be set before calling."
-
-
-            self.model = models[0] if models else None
-            logger.info(
-                f"No model specified. Using the first available model. {self.model}"
-            )
+        await self._set_model()
+
         openai_msgs: Messages = (
             self._convert_messages(cast(LegacyMsgs, messages))
             if isinstance(messages[0], dict)
@@ -203,7 +176,7 @@ class AsyncLM:
         else:
             response = self._parse_output(raw_response, response_format)

-        self.
+        self._last_log = [prompt, messages, raw_response]
         return response

     # ------------------------------------------------------------------ #
@@ -390,48 +363,44 @@ class AsyncLM:
     async def parse(
         self,
         response_model: Type[TParsed],
-        instruction
-        prompt
-        messages: Optional[RawMsgs] = None,
+        instruction,
+        prompt,
         think: Literal[True, False, None] = None,
         add_json_schema_to_instruction: bool = False,
         temperature: Optional[float] = None,
         max_tokens: Optional[int] = None,
-        cache: Optional[bool] =
+        cache: Optional[bool] = None,
+        use_beta: bool = False,
         **kwargs,
     ) -> ParsedOutput[TParsed]:
         """Parse response using guided JSON generation."""
-
-
-        assert
-
-
-
-                "content": instruction,
-            },
-            {
-                "role": "user",
-                "content": prompt,
-            },
-        ] # type: ignore
-
-        post_fix = ""
+
+        if not use_beta:
+            assert add_json_schema_to_instruction, (
+                "add_json_schema_to_instruction must be True when use_beta is False. otherwise model will not be able to parse the response."
+            )
+
         json_schema = response_model.model_json_schema()
-
-
-
-
-
-
-
-
-
-
-
-
-
+
+        # Build system message content in a single, clear block
+        assert instruction is not None, "Instruction must be provided."
+        assert prompt is not None, "Prompt must be provided."
+        system_content = instruction
+
+        # Add schema if needed
+        system_content = self._build_system_prompt(
+            response_model,
+            add_json_schema_to_instruction,
+            json_schema,
+            system_content,
+            think=think,
         )
-
+
+        # Rebuild messages with updated system message if needed
+        messages = [
+            {"role": "system", "content": system_content},
+            {"role": "user", "content": prompt},
+        ] # type: ignore

         model_kwargs = {}
         if temperature is not None:
@@ -443,38 +412,98 @@ class AsyncLM:
         use_cache = self.do_cache if cache is None else cache
         cache_key = None
         completion = None
+        choice = None
+        parsed = None
+
         if use_cache:
             cache_data = {
                 "messages": messages,
                 "model_kwargs": model_kwargs,
                 "guided_json": json_schema,
                 "response_format": response_model.__name__,
+                "use_beta": use_beta,
             }
             cache_key = self._cache_key(cache_data, {}, response_model)
             completion = self._load_cache(cache_key) # dict
+
         if not completion:
-            completion = await self.
-
-
-
-
+            completion, choice, parsed = await self._call_and_parse_completion(
+                messages,
+                response_model,
+                json_schema,
+                use_beta=use_beta,
+                model_kwargs=model_kwargs,
             )
-
+
             if cache_key:
                 self._dump_cache(cache_key, completion)
+        else:
+            # Extract choice and parsed from cached completion
+            choice = completion["choices"][0]["message"]
+            try:
+                parsed = self._parse_complete_output(completion, response_model)
+            except Exception as e:
+                raise ValueError(
+                    f"Failed to parse cached completion: {e}\nRaw: {choice.get('content')}"
+                ) from e
+
         assert isinstance(completion, dict), (
             "Completion must be a dictionary with OpenAI response format."
         )
-        self.
+        self._last_log = [prompt, messages, completion]
+
+        reasoning_content = choice.get("reasoning_content", "").strip()
+        _content = choice.get("content", "").lstrip("\n")
+        content = f"<think>\n{reasoning_content}\n</think>\n\n{_content}"
+
+        full_messages = messages + [{"role": "assistant", "content": content}]

-        output = cast(TParsed, self._parse_complete_output(completion, response_model))
-        full_messages = messages + [completion]
         return ParsedOutput(
             messages=full_messages,
             completion=completion,
-            parsed=
+            parsed=parsed, # type: ignore
         )

+    def _build_system_prompt(
+        self,
+        response_model,
+        add_json_schema_to_instruction,
+        json_schema,
+        system_content,
+        think,
+    ):
+        if add_json_schema_to_instruction and response_model:
+            schema_block = f"\n\n<output_json_schema>\n{json.dumps(json_schema, indent=2)}\n</output_json_schema>"
+            # if schema_block not in system_content:
+            if "<output_json_schema>" in system_content:
+                # remove exsting schema block
+                import re # replace
+
+                system_content = re.sub(
+                    r"<output_json_schema>.*?</output_json_schema>",
+                    "",
+                    system_content,
+                    flags=re.DOTALL,
+                )
+                system_content = system_content.strip()
+            system_content += schema_block
+
+        if think is True:
+            if "/think" in system_content:
+                pass
+            elif "/no_think" in system_content:
+                system_content = system_content.replace("/no_think", "/think")
+            else:
+                system_content += "\n\n/think"
+        elif think is False:
+            if "/no_think" in system_content:
+                pass
+            elif "/think" in system_content:
+                system_content = system_content.replace("/think", "/no_think")
+            else:
+                system_content += "\n\n/no_think"
+        return system_content
+
     def _parse_complete_output(
         self, completion: Any, response_model: Type[BaseModel]
     ) -> BaseModel:
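The `_build_system_prompt` helper added in the hunk above appends the response schema inside an `<output_json_schema>` block and toggles Qwen-style `/think` and `/no_think` markers on the system prompt. Below is a standalone restatement of that toggle for illustration only; `toggle_think` is not part of speedy-utils, and the real method also handles `think=None` by leaving the prompt untouched.

```python
# Standalone restatement of the /think vs /no_think handling in
# _build_system_prompt above; toggle_think is illustrative, not package API.
def toggle_think(system_content: str, think: bool) -> str:
    on, off = ("/think", "/no_think") if think else ("/no_think", "/think")
    if on in system_content:
        return system_content                   # already in the requested mode
    if off in system_content:
        return system_content.replace(off, on)  # flip the existing marker
    return system_content + f"\n\n{on}"         # no marker yet: append one


assert toggle_think("You are helpful.", think=True).endswith("/think")
assert toggle_think("Reply briefly. /think", think=False).endswith("/no_think")
```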
@@ -492,24 +521,24 @@ class AsyncLM:
         # Try to extract tokens from the completion for debugging
         input_tokens = None
         try:
-            input_tokens = completion.get(
+            input_tokens = completion.get("usage", {}).get("prompt_tokens")
         except Exception:
             input_tokens = None

         # Try to get the prompt/messages for tokenization
         prompt = None
         try:
-            prompt = completion.get(
+            prompt = completion.get("messages") or completion.get("prompt")
         except Exception:
             prompt = None

-        tokens_preview =
+        tokens_preview = ""
         if prompt is not None:
             try:
                 tokenizer = get_tokenizer(self.model)
                 if isinstance(prompt, list):
-                    prompt_text =
-                        m.get(
+                    prompt_text = "\n".join(
+                        m.get("content", "") for m in prompt if isinstance(m, dict)
                     )
                 else:
                     prompt_text = str(prompt)
@@ -518,17 +547,17 @@ class AsyncLM:
                 first_100 = tokens[:100]
                 last_100 = tokens[-100:] if n_tokens > 100 else []
                 tokens_preview = (
-                    f
-                    f
-                    f
+                    f"\nInput tokens: {n_tokens}"
+                    f"\nFirst 100 tokens: {first_100}"
+                    f"\nLast 100 tokens: {last_100}"
                 )
             except Exception as exc:
-                tokens_preview = f
+                tokens_preview = f"\n[Tokenization failed: {exc}]"

         raise ValueError(
-            f
-            f
-            f
+            f"Empty content in response."
+            f"\nInput tokens (if available): {input_tokens}"
+            f"{tokens_preview}"
         )

         try:
@@ -579,7 +608,7 @@ class AsyncLM:
         if not hasattr(self, "last_log"):
             return None

-        last_conv = self.
+        last_conv = self._last_log
         messages = last_conv[1] if len(last_conv) > 1 else None
         last_msg = last_conv[2]
         if not isinstance(last_msg, dict):
@@ -607,7 +636,7 @@ class AsyncLM:
         if not hasattr(self, "last_log"):
             raise ValueError("No history available. Please call the model first.")

-        prompt, messages, response = self.
+        prompt, messages, response = self._last_log
         if hasattr(response, "model_dump"):
             response = response.model_dump()
         if not messages:
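The final hunk below removes the old module-level helpers and the `AsyncLLMTask` class (which this release appears to move into the new `_utils.py` and `async_llm_task.py` modules) and adds `_call_and_parse_completion`, which first tries vLLM-style guided decoding via `extra_body={"guided_json": ...}` and falls back to a plain `response_format={"type": "json_object"}` request. The following is a minimal sketch of that request pattern against an OpenAI-compatible server, independent of `AsyncLM`; the base URL, API key, and model name are placeholders.

```python
# Minimal sketch of the guided-JSON request pattern used by
# _call_and_parse_completion below; base_url and model name are placeholders.
import asyncio

from openai import AsyncOpenAI
from pydantic import BaseModel


class Answer(BaseModel):
    reasoning: str
    final_answer: str


async def request_json(client: AsyncOpenAI, model: str, messages: list[dict]) -> dict:
    try:
        # vLLM accepts guided-decoding parameters through extra_body.
        completion = await client.chat.completions.create(
            model=model,
            messages=messages,
            extra_body={"guided_json": Answer.model_json_schema()},
        )
    except Exception:
        # Fallback for servers that reject the extra_body parameters.
        completion = await client.chat.completions.create(
            model=model,
            messages=messages,
            response_format={"type": "json_object"},
        )
    return completion.model_dump()


async def main() -> None:
    client = AsyncOpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")
    messages = [
        {"role": "system", "content": "Reply with a JSON object matching the schema."},
        {"role": "user", "content": "What is 17 * 24?"},
    ]
    print(await request_json(client, "Qwen/Qwen2.5-7B-Instruct", messages))


asyncio.run(main())
```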
@@ -692,251 +721,59 @@ class AsyncLM:
             logger.error(f"Failed to list models: {exc}")
             return []

-
-# --------------------------------------------------------------------------- #
-# Module-level utility functions (async versions)
-# --------------------------------------------------------------------------- #
-
-
-@lru_cache(maxsize=10)
-def get_tokenizer(model_name: str) -> Any:
-    """Get tokenizer for the given model."""
-    from transformers import AutoTokenizer # type: ignore
-
-    tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
-    return tokenizer
-
-
-async def inspect_word_probs_async(lm, tokenizer, messages):
-    """Async version of inspect_word_probs."""
-
-    import numpy as np
-
-
-    async def compute_word_log_probs(
-        tokenizer: Any,
-        lm_client: Any,
-    ) -> tuple[List[Dict[str, Any]], Any]:
-        # Build a prompt that preserves literal newlines
-        prompt = tokenizer.apply_chat_template(
-            messages,
-            tokenize=False, # Don't tokenize yet, we need raw text
-            add_generation_prompt=False, # No generation prompt needed
-        )
-
-        # Request token logprobs
-        response = await lm_client.client.completions.create(
-            model=lm_client.model, # type: ignore
-            prompt=prompt,
-            max_tokens=1,
-            logprobs=1,
-            extra_body={"prompt_logprobs": 0},
-        )
-        token_logprob_dicts = response.choices[0].prompt_logprobs # type: ignore
-
-        # Override first token to known start marker
-        start_id = tokenizer.encode("<|im_start|>")[0]
-        token_logprob_dicts[0] = {
-            str(start_id): {
-                "logprob": -1,
-                "rank": 1,
-                "decoded_token": "<|im_start|>",
-            }
-        }
-
-        # Flatten tokens
-        tokens: List[Dict[str, Any]] = [
-            {"id": int(tid), **tdata}
-            for td in token_logprob_dicts
-            for tid, tdata in td.items()
-        ]
-
-        # Validate tokenization
-        tokenized = tokenizer.tokenize(prompt)
-        if len(tokenized) != len(tokens):
-            raise ValueError(f"Token count mismatch: {len(tokenized)} vs {len(tokens)}")
-        for idx, tok in enumerate(tokens):
-            if tokenized[idx] != tok["decoded_token"]:
-                raise AssertionError(
-                    f"Token mismatch at {idx}: "
-                    f"{tokenized[idx]} != {tok['decoded_token']}"
-                )
-
-        # Split on newline sentinel
-        split_prompt = prompt.replace("\n", " <NL> ")
-        words = split_prompt.split()
-
-        word_log_probs: List[Dict[str, Any]] = []
-        token_idx = 0
-
-        for word in words:
-            # Map sentinel back to actual newline for encoding
-            target = "\n" if word == "<NL>" else word
-            sub_ids = tokenizer.encode(target, add_special_tokens=False)
-            count = len(sub_ids)
-            if count == 0:
-                continue
-
-            subs = tokens[token_idx : token_idx + count]
-            avg_logprob = sum(s["logprob"] for s in subs) / count
-            prob = float(np.exp(avg_logprob))
-            word_log_probs.append({"word": target, "probability": prob})
-            token_idx += count
-
-        return word_log_probs, token_logprob_dicts # type: ignore
-
-    def render_by_logprob(word_log_probs: List[Dict[str, Any]]) -> str:
-        """
-        Return an ANSI-colored string for word probabilities (red → green).
-        """
-        if not word_log_probs:
-            return ""
-
-        probs = [entry["probability"] for entry in word_log_probs]
-        min_p, max_p = min(probs), max(probs)
-        parts: List[str] = []
-
-        for entry in word_log_probs:
-            word = entry["word"]
-            # Preserve actual line breaks
-            if word == "\n":
-                parts.append("\n")
-                continue
-
-            p = entry["probability"]
-            norm = (p - min_p) / (max_p - min_p or 1.0)
-            r = int(255 * (1 - norm)) # red component (high when prob is low)
-            g = int(255 * norm) # green component (high when prob is high)
-            b = 0 # no blue for red-green gradient
-            colored = f"\x1b[38;2;{r};{g};{b}m{word}\x1b[0m"
-            parts.append(colored + " ")
-
-        return "".join(parts).rstrip()
-
-    word_probs, token_logprob_dicts = await compute_word_log_probs(tokenizer, lm)
-    return word_probs, token_logprob_dicts, render_by_logprob(word_probs)
-
-
-# --------------------------------------------------------------------------- #
-# Async LLMTask class
-# --------------------------------------------------------------------------- #
-
-InputModelType = TypeVar("InputModelType", bound=BaseModel)
-OutputModelType = TypeVar("OutputModelType", bound=BaseModel)
-
-
-class AsyncLLMTask(ABC, Generic[InputModelType, OutputModelType]):
-    """
-    Async callable wrapper around an AsyncLM endpoint.
-
-    Sub-classes must set:
-        • lm – the async language-model instance
-        • InputModel – a Pydantic input class
-        • OutputModel – a Pydantic output class
-
-    Optional flags:
-        • temperature – float (default 0.6)
-        • think – bool (if the backend supports "chain-of-thought")
-        • add_json_schema – bool (include schema in the instruction)
-
-    The **docstring** of each sub-class is sent as the LM instruction.
-    Example
-    ```python
-    class DemoTask(AsyncLLMTask):
-        "TODO: SYSTEM_PROMPT_INSTURCTION HERE"
-
-        lm = AsyncLM(port=8130, cache=False, model="gpt-3.5-turbo")
-
-        class InputModel(BaseModel):
-            text_to_translate:str
-
-        class OutputModel(BaseModel):
-            translation:str
-            glossary_use:str
-
-        temperature = 0.6
-        think=False
-
-    demo_task = DemoTask()
-    result = await demo_task({'text_to_translate': 'Translate from english to vietnamese: Hello how are you'})
-    ```
-    """
-
-    lm: "AsyncLM"
-    InputModel: InputModelType
-    OutputModel: OutputModelType
-
-    temperature: float = 0.6
-    think: bool = False
-    add_json_schema: bool = False
-    cache: bool = False
-
-    async def __call__(
+    async def _call_and_parse_completion(
         self,
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+        messages: list[dict],
+        response_model: Type[TParsed],
+        json_schema: dict,
+        use_beta: bool,
+        model_kwargs: dict,
+    ) -> tuple[dict, dict, TParsed]:
+        """Call vLLM or OpenAI-compatible endpoint and parse JSON response consistently."""
+        await self._set_model() # Ensure model is set before making the call
+        # Convert messages to proper type
+        converted_messages = self._convert_messages(messages) # type: ignore
+
+        if use_beta:
+            # Use guided JSON for structure enforcement
+            try:
+                completion = await self.client.chat.completions.create(
+                    model=str(self.model), # type: ignore
+                    messages=converted_messages,
+                    extra_body={"guided_json": json_schema}, # type: ignore
+                    **model_kwargs,
+                ) # type: ignore
+            except Exception:
+                # Fallback if extra_body is not supported
+                completion = await self.client.chat.completions.create(
+                    model=str(self.model), # type: ignore
+                    messages=converted_messages,
+                    response_format={"type": "json_object"},
+                    **model_kwargs,
                 )
-        input_model = self.InputModel
-        output_model = self.OutputModel
-
-        # Ensure input_model is a class before calling
-        if isinstance(data, BaseModel):
-            item = data
-        elif isinstance(input_model, type) and issubclass(input_model, BaseModel):
-            item = input_model(**data)
         else:
-
-
-
-
-
+            # Use OpenAI-style structured output
+            completion = await self.client.chat.completions.create(
+                model=str(self.model), # type: ignore
+                messages=converted_messages,
+                response_format={"type": "json_object"},
+                **model_kwargs,
+            )

-
-
-            instruction=self.__doc__ or "",
-            response_model=output_model,
-            temperature=temperature or self.temperature,
-            think=think if think is not None else self.think,
-            add_json_schema_to_instruction=self.add_json_schema,
-            cache=self.cache or cache,
-        )
+        if hasattr(completion, "model_dump"):
+            completion = completion.model_dump()

-
-            cast(OutputModelType, result["parsed"]),  # type: ignore
-            cast(List[dict], result["messages"]),  # type: ignore
-        )
+        choice = completion["choices"][0]["message"]

-
-
-
-
-
-
-
-
-
-
-            return {"messages": messages}
+        try:
+            parsed = (
+                self._parse_complete_output(completion, response_model)
+                if use_beta
+                else response_model.model_validate(jloads(choice.get("content")))
+            )
+        except Exception as e:
+            raise ValueError(
+                f"Failed to parse model response: {e}\nRaw: {choice.get('content')}"
+            ) from e

-
+        return completion, choice, parsed # type: ignore