speedy-utils 1.1.18__py3-none-any.whl → 1.1.20__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
llm_utils/__init__.py CHANGED
@@ -4,7 +4,7 @@ from llm_utils.vector_cache import VectorCache
 from llm_utils.lm.lm_base import get_model_name
 from llm_utils.lm.base_prompt_builder import BasePromptBuilder
 
-
+LLM = LLMTask
 
 from .chat_format import (
     build_chatml_input,
@@ -34,5 +34,6 @@ __all__ = [
     "MOpenAI",
     "get_model_name",
     "VectorCache",
-    "BasePromptBuilder"
+    "BasePromptBuilder",
+    "LLM"
 ]
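
The new top-level binding and export mean `LLM` can be imported as a shorter name for `LLMTask`. A minimal sketch of what that gives callers (nothing beyond the name; both refer to the same class):

```python
from llm_utils import LLM, LLMTask

# The alias added in this release points at the same class object.
assert LLM is LLMTask
```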
@@ -1,3 +1,4 @@
+# type: ignore
 """
 Async LLM Task module for handling language model interactions with structured input/output.
 """
llm_utils/lm/llm_task.py CHANGED
@@ -4,10 +4,12 @@
 Simplified LLM Task module for handling language model interactions with structured input/output.
 """
 
+import os
 from typing import Any, Dict, List, Optional, Type, Union, cast
 
+import requests
 from loguru import logger
-from openai import OpenAI
+from openai import OpenAI, AuthenticationError, BadRequestError, RateLimitError
 from openai.types.chat import ChatCompletionMessageParam
 from pydantic import BaseModel
 
@@ -38,6 +40,90 @@ def get_base_client(
 )
 
 
+def _is_lora_path(path: str) -> bool:
+    """Check if the given path is a LoRA adapter directory.
+
+    Args:
+        path: Path to check
+
+    Returns:
+        True if the path contains adapter_config.json, False otherwise
+    """
+    if not os.path.isdir(path):
+        return False
+    adapter_config_path = os.path.join(path, 'adapter_config.json')
+    return os.path.isfile(adapter_config_path)
+
+
+def _get_port_from_client(client: OpenAI) -> Optional[int]:
+    """Extract port number from OpenAI client base_url.
+
+    Args:
+        client: OpenAI client instance
+
+    Returns:
+        Port number if found, None otherwise
+    """
+    if hasattr(client, 'base_url') and client.base_url:
+        base_url = str(client.base_url)
+        if 'localhost:' in base_url:
+            try:
+                # Extract port from localhost:PORT/v1 format
+                port_part = base_url.split('localhost:')[1].split('/')[0]
+                return int(port_part)
+            except (IndexError, ValueError):
+                pass
+    return None
+
+
+def _load_lora_adapter(lora_path: str, port: int) -> str:
+    """Load a LoRA adapter from the specified path.
+
+    Args:
+        lora_path: Path to the LoRA adapter directory
+        port: Port number for the API endpoint
+
+    Returns:
+        Name of the loaded LoRA adapter
+
+    Raises:
+        requests.RequestException: If the API call fails
+    """
+    lora_name = os.path.basename(lora_path.rstrip('/\\'))
+    if not lora_name:  # Handle edge case of empty basename
+        lora_name = os.path.basename(os.path.dirname(lora_path))
+
+    response = requests.post(
+        f'http://localhost:{port}/v1/load_lora_adapter',
+        headers={'accept': 'application/json', 'Content-Type': 'application/json'},
+        json={"lora_name": lora_name, "lora_path": os.path.abspath(lora_path)}
+    )
+    response.raise_for_status()
+    return lora_name
+
+
+def _unload_lora_adapter(lora_path: str, port: int) -> None:
+    """Unload the current LoRA adapter.
+
+    Args:
+        lora_path: Path to the LoRA adapter directory
+        port: Port number for the API endpoint
+    """
+    try:
+        lora_name = os.path.basename(lora_path.rstrip('/\\'))
+        if not lora_name:  # Handle edge case of empty basename
+            lora_name = os.path.basename(os.path.dirname(lora_path))
+
+        response = requests.post(
+            f'http://localhost:{port}/v1/unload_lora_adapter',
+            headers={'accept': 'application/json', 'Content-Type': 'application/json'},
+            json={"lora_name": lora_name, "lora_int_id": 0}
+        )
+        response.raise_for_status()
+    except requests.RequestException as e:
+        logger.warning(f"Error unloading LoRA adapter: {str(e)[:100]}")
+
+
 class LLMTask:
     """
     Language model task with structured input/output and optional system instruction.
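
These helpers post to `/v1/load_lora_adapter` and `/v1/unload_lora_adapter` on localhost, which matches the dynamic-LoRA endpoints exposed by vLLM-style OpenAI-compatible servers when runtime LoRA updating is enabled. A hedged sketch of the raw request the load helper issues (the port and adapter directory are placeholders):

```python
import os
import requests

lora_path = "/path/to/my-adapter"        # directory that contains adapter_config.json
lora_name = os.path.basename(lora_path)  # this becomes the model name to request later

resp = requests.post(
    "http://localhost:8000/v1/load_lora_adapter",
    headers={"accept": "application/json", "Content-Type": "application/json"},
    json={"lora_name": lora_name, "lora_path": os.path.abspath(lora_path)},
)
resp.raise_for_status()  # the helper surfaces HTTP errors the same way
```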
@@ -106,6 +192,9 @@ class LLMTask:
         output_model: Type[BaseModel] | Type[str] = None,
         client: Union[OpenAI, int, str, None] = None,
         cache=True,
+        is_reasoning_model: bool = False,
+        force_lora_unload: bool = False,
+        lora_path: Optional[str] = None,
         **model_kwargs,
     ):
         """
@@ -117,6 +206,12 @@ class LLMTask:
             output_model: Output BaseModel type
             client: OpenAI client, port number, or base_url string
             cache: Whether to use cached responses (default True)
+            is_reasoning_model: Whether the model is a reasoning model (o1-preview, o1-mini, etc.)
+                that outputs reasoning_content separately from content (default False)
+            force_lora_unload: If True, forces unloading of any existing LoRA adapter before loading
+                a new one when lora_path is provided (default False)
+            lora_path: Optional path to LoRA adapter directory. If provided, will load the LoRA
+                and use it as the model. Takes precedence over model parameter.
             **model_kwargs: Additional model parameters including:
                 - temperature: Controls randomness (0.0 to 2.0)
                 - n: Number of responses to generate (when n > 1, returns list)
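
A hedged constructor sketch using only the keyword arguments visible in this hunk; the server port and adapter directory are placeholders, and any positional arguments of `LLMTask` not shown here are omitted:

```python
from llm_utils import LLMTask

task = LLMTask(
    client=8000,                       # port of a local OpenAI-compatible server
    cache=True,
    is_reasoning_model=False,
    force_lora_unload=False,
    lora_path="/path/to/my-adapter",   # loaded at init; the model name becomes the adapter name
    temperature=0.2,                   # forwarded through **model_kwargs
)
```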
@@ -127,6 +222,10 @@ class LLMTask:
         self.input_model = input_model
         self.output_model = output_model
         self.model_kwargs = model_kwargs
+        self.is_reasoning_model = is_reasoning_model
+        self.force_lora_unload = force_lora_unload
+        self.lora_path = lora_path
+        self.last_ai_response = None  # Store raw response from client
 
         # if cache:
         #     print("Caching is enabled will use llm_utils.MOpenAI")
@@ -135,11 +234,152 @@ class LLMTask:
         # else:
         #     self.client = OpenAI(base_url=base_url, api_key=api_key)
         self.client = get_base_client(client, cache=cache)
+        # check connection of client
+        try:
+            self.client.models.list()
+        except Exception as e:
+            logger.error(f"Failed to connect to OpenAI client: {str(e)}, base_url={self.client.base_url}")
+            raise e
 
         if not self.model_kwargs.get("model", ""):
             self.model_kwargs["model"] = self.client.models.list().data[0].id
+
+        # Handle LoRA loading if lora_path is provided
+        if self.lora_path:
+            self._load_lora_adapter()
+
         print(self.model_kwargs)
 
+    def _load_lora_adapter(self) -> None:
+        """
+        Load LoRA adapter from the specified lora_path.
+
+        This method:
+        1. Validates that lora_path is a valid LoRA directory
+        2. Checks if LoRA is already loaded (unless force_lora_unload is True)
+        3. Loads the LoRA adapter and updates the model name
+        """
+        if not self.lora_path:
+            return
+
+        if not _is_lora_path(self.lora_path):
+            raise ValueError(
+                f"Invalid LoRA path '{self.lora_path}': "
+                "Directory must contain 'adapter_config.json'"
+            )
+
+        logger.info(f"Loading LoRA adapter from: {self.lora_path}")
+
+        # Get the expected LoRA name (basename of the path)
+        lora_name = os.path.basename(self.lora_path.rstrip('/\\'))
+        if not lora_name:  # Handle edge case of empty basename
+            lora_name = os.path.basename(os.path.dirname(self.lora_path))
+
+        # Get list of available models to check if LoRA is already loaded
+        try:
+            available_models = [m.id for m in self.client.models.list().data]
+        except Exception as e:
+            logger.warning(f"Failed to list models, proceeding with LoRA load: {str(e)[:100]}")
+            available_models = []
+
+        # Check if LoRA is already loaded
+        if lora_name in available_models and not self.force_lora_unload:
+            logger.info(f"LoRA adapter '{lora_name}' is already loaded, using existing model")
+            self.model_kwargs["model"] = lora_name
+            return
+
+        # Force unload if requested
+        if self.force_lora_unload and lora_name in available_models:
+            logger.info(f"Force unloading LoRA adapter '{lora_name}' before reloading")
+            port = _get_port_from_client(self.client)
+            if port is not None:
+                try:
+                    LLMTask.unload_lora(port, lora_name)
+                    logger.info(f"Successfully unloaded LoRA adapter: {lora_name}")
+                except Exception as e:
+                    logger.warning(f"Failed to unload LoRA adapter: {str(e)[:100]}")
+
+        # Get port from client for API calls
+        port = _get_port_from_client(self.client)
+        if port is None:
+            raise ValueError(
+                f"Cannot load LoRA adapter '{self.lora_path}': "
+                "Unable to determine port from client base_url. "
+                "LoRA loading requires a client initialized with port number."
+            )
+
+        try:
+            # Load the LoRA adapter
+            loaded_lora_name = _load_lora_adapter(self.lora_path, port)
+            logger.info(f"Successfully loaded LoRA adapter: {loaded_lora_name}")
+
+            # Update model name to the loaded LoRA name
+            self.model_kwargs["model"] = loaded_lora_name
+
+        except requests.RequestException as e:
+            # Check if the error is due to LoRA already being loaded
+            error_msg = str(e)
+            if "400" in error_msg or "Bad Request" in error_msg:
+                logger.info(f"LoRA adapter may already be loaded, attempting to use '{lora_name}'")
+                # Refresh the model list to check if it's now available
+                try:
+                    updated_models = [m.id for m in self.client.models.list().data]
+                    if lora_name in updated_models:
+                        logger.info(f"Found LoRA adapter '{lora_name}' in updated model list")
+                        self.model_kwargs["model"] = lora_name
+                        return
+                except Exception:
+                    pass  # Fall through to original error
+
+            raise ValueError(
+                f"Failed to load LoRA adapter from '{self.lora_path}': {error_msg[:100]}"
+            )
+
+    def unload_lora_adapter(self, lora_path: str) -> None:
+        """
+        Unload a LoRA adapter.
+
+        Args:
+            lora_path: Path to the LoRA adapter directory to unload
+
+        Raises:
+            ValueError: If unable to determine port from client
+        """
+        port = _get_port_from_client(self.client)
+        if port is None:
+            raise ValueError(
+                "Cannot unload LoRA adapter: "
+                "Unable to determine port from client base_url. "
+                "LoRA operations require a client initialized with port number."
+            )
+
+        _unload_lora_adapter(lora_path, port)
+        lora_name = os.path.basename(lora_path.rstrip('/\\'))
+        logger.info(f"Unloaded LoRA adapter: {lora_name}")
+
+    @staticmethod
+    def unload_lora(port: int, lora_name: str) -> None:
+        """Static method to unload a LoRA adapter by name.
+
+        Args:
+            port: Port number for the API endpoint
+            lora_name: Name of the LoRA adapter to unload
+
+        Raises:
+            requests.RequestException: If the API call fails
+        """
+        try:
+            response = requests.post(
+                f'http://localhost:{port}/v1/unload_lora_adapter',
+                headers={'accept': 'application/json', 'Content-Type': 'application/json'},
+                json={"lora_name": lora_name, "lora_int_id": 0}
+            )
+            response.raise_for_status()
+            logger.info(f"Successfully unloaded LoRA adapter: {lora_name}")
+        except requests.RequestException as e:
+            logger.error(f"Error unloading LoRA adapter '{lora_name}': {str(e)[:100]}")
+            raise
+
     def _prepare_input(self, input_data: Union[str, BaseModel, List[Dict]]) -> Messages:
         """Convert input to messages format."""
         if isinstance(input_data, list):
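
Adapters can be unloaded either through the new instance method (which derives the adapter name from the directory path and the port from the client's base_url) or through the static helper (which only needs the port and the adapter name). A hedged sketch, with the port and path as placeholders:

```python
from llm_utils import LLMTask

# Static helper: no LLMTask instance required.
LLMTask.unload_lora(8000, "my-adapter")

# Instance method: requires a client that was created from a port number,
# since the port is recovered from base_url.
task = LLMTask(client=8000, lora_path="/path/to/my-adapter")
task.unload_lora_adapter("/path/to/my-adapter")
```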
@@ -200,9 +440,24 @@ class LLMTask:
         # Extract model name from kwargs for API call
         api_kwargs = {k: v for k, v in effective_kwargs.items() if k != "model"}
 
-        completion = self.client.chat.completions.create(
-            model=model_name, messages=messages, **api_kwargs
-        )
+        try:
+            completion = self.client.chat.completions.create(
+                model=model_name, messages=messages, **api_kwargs
+            )
+            # Store raw response from client
+            self.last_ai_response = completion
+        except (AuthenticationError, RateLimitError, BadRequestError) as exc:
+            error_msg = f"OpenAI API error ({type(exc).__name__}): {exc}"
+            logger.error(error_msg)
+            raise
+        except Exception as e:
+            is_length_error = "Length" in str(e) or "maximum context length" in str(e)
+            if is_length_error:
+                raise ValueError(
+                    f"Input too long for model {model_name}. Error: {str(e)[:100]}..."
+                )
+            # Re-raise all other exceptions
+            raise
         # print(completion)
 
         results: List[Dict[str, Any]] = []
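
The completion call is now wrapped so that authentication, rate-limit, and bad-request errors are logged and re-raised, context-length failures become a ValueError, and the raw response is kept on `last_ai_response`. A hedged sketch of what calling code can rely on (the input string is a placeholder and `task` is an already-constructed LLMTask):

```python
from openai import AuthenticationError, BadRequestError, RateLimitError

try:
    results = task("Summarize the release notes in one sentence.")
except (AuthenticationError, RateLimitError, BadRequestError) as exc:
    print(f"API rejected the request: {type(exc).__name__}")
except ValueError as exc:
    print(f"Likely a context-length problem: {exc}")
else:
    raw = task.last_ai_response  # full ChatCompletion object from the last call
```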
@@ -211,9 +466,13 @@ class LLMTask:
                 Messages,
                 messages + [{"role": "assistant", "content": choice.message.content}],
             )
-            results.append(
-                {"parsed": choice.message.content, "messages": choice_messages}
-            )
+            result_dict = {"parsed": choice.message.content, "messages": choice_messages}
+
+            # Add reasoning content if this is a reasoning model
+            if self.is_reasoning_model and hasattr(choice.message, 'reasoning_content'):
+                result_dict["reasoning_content"] = choice.message.reasoning_content
+
+            results.append(result_dict)
         return results
 
     def pydantic_parse(
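
With `is_reasoning_model=True`, each result dict built in the hunk above may carry a `reasoning_content` key next to `parsed` and `messages`, when the serving backend exposes it on the message. A hedged sketch of reading it (continuing the placeholder `task` from the earlier sketches):

```python
results = task("Explain why the sky is blue.")

for result in results:
    answer = result["parsed"]
    thinking = result.get("reasoning_content")  # absent for non-reasoning models
    if thinking:
        print("model reasoning:", thinking[:200])
    print("answer:", answer)
```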
@@ -239,6 +498,11 @@ class LLMTask:
             List of dicts [{'parsed': parsed_model, 'messages': messages}, ...]
             When n=1: List contains one dict
             When n>1: List contains multiple dicts
+
+        Note:
+            This method ensures consistent Pydantic model output for both fresh and cached responses.
+            When responses are cached and loaded back, the parsed content is re-validated to maintain
+            type consistency between first-time and subsequent calls.
         """
         # Prepare messages
         messages = self._prepare_input(input_data)
@@ -265,12 +529,20 @@ class LLMTask:
                 response_format=pydantic_model_to_use,
                 **api_kwargs,
             )
+            # Store raw response from client
+            self.last_ai_response = completion
+        except (AuthenticationError, RateLimitError, BadRequestError) as exc:
+            error_msg = f"OpenAI API error ({type(exc).__name__}): {exc}"
+            logger.error(error_msg)
+            raise
         except Exception as e:
             is_length_error = "Length" in str(e) or "maximum context length" in str(e)
             if is_length_error:
                 raise ValueError(
                     f"Input too long for model {model_name}. Error: {str(e)[:100]}..."
                 )
+            # Re-raise all other exceptions
+            raise
 
         results: List[Dict[str, Any]] = []
         for choice in completion.choices:  # type: ignore[attr-defined]
@@ -278,9 +550,23 @@ class LLMTask:
                 Messages,
                 messages + [{"role": "assistant", "content": choice.message.content}],
             )
-            results.append(
-                {"parsed": choice.message.parsed, "messages": choice_messages}
-            )  # type: ignore[attr-defined]
+
+            # Ensure consistent Pydantic model output for both fresh and cached responses
+            parsed_content = choice.message.parsed  # type: ignore[attr-defined]
+            if isinstance(parsed_content, dict):
+                # Cached response: validate dict back to Pydantic model
+                parsed_content = pydantic_model_to_use.model_validate(parsed_content)
+            elif not isinstance(parsed_content, pydantic_model_to_use):
+                # Fallback: ensure it's the correct type
+                parsed_content = pydantic_model_to_use.model_validate(parsed_content)
+
+            result_dict = {"parsed": parsed_content, "messages": choice_messages}
+
+            # Add reasoning content if this is a reasoning model
+            if self.is_reasoning_model and hasattr(choice.message, 'reasoning_content'):
+                result_dict["reasoning_content"] = choice.message.reasoning_content
+
+            results.append(result_dict)
         return results
 
     def __call__(
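
Because cached responses are now re-validated, `results[i]["parsed"]` is an instance of the output model on both fresh and cache-served calls. A hedged sketch with a hypothetical output model (the client port is a placeholder, and the call signature of `pydantic_parse` beyond the input is not shown in this diff):

```python
from pydantic import BaseModel
from llm_utils import LLMTask


class Sentiment(BaseModel):
    label: str
    confidence: float


task = LLMTask(client=8000, output_model=Sentiment)
results = task.pydantic_parse("I really enjoyed this film.")
assert isinstance(results[0]["parsed"], Sentiment)  # holds for cached calls too
```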
@@ -364,6 +650,8 @@ class LLMTask:
         builder: BasePromptBuilder,
         client: Union[OpenAI, int, str, None] = None,
         cache=True,
+        is_reasoning_model: bool = False,
+        lora_path: Optional[str] = None,
         **model_kwargs,
     ) -> "LLMTask":
         """
@@ -382,6 +670,10 @@ class LLMTask:
             input_model=input_model,
             output_model=output_model,
             client=client,
+            cache=cache,
+            is_reasoning_model=is_reasoning_model,
+            lora_path=lora_path,
+            **model_kwargs,
         )
 
     @staticmethod
@@ -398,3 +690,4 @@ class LLMTask:
         client = get_base_client(client, cache=False)
         models = client.models.list().data
         return [m.id for m in models]
+
@@ -1,4 +1,5 @@
 from openai import OpenAI, AsyncOpenAI
+from typing import Any, Callable
 
 from speedy_utils.common.utils_cache import memoize
 
@@ -30,6 +31,8 @@ class MOpenAI(OpenAI):
     - If you need a shared cache across instances, or more advanced cache controls,
       modify `memoize` or wrap at a class/static level instead of assigning to the
       bound method.
+    - Type information is now fully preserved by the memoize decorator, eliminating
+      the need for type casting.
 
     Example
         m = MOpenAI(api_key="...", model="gpt-4")
@@ -40,7 +43,12 @@ class MOpenAI(OpenAI):
     def __init__(self, *args, cache=True, **kwargs):
         super().__init__(*args, **kwargs)
         if cache:
-            self.post = memoize(self.post)
+            # Create a memoized wrapper for the instance's post method.
+            # The memoize decorator now preserves exact type information,
+            # so no casting is needed.
+            orig_post = self.post
+            memoized = memoize(orig_post)
+            self.post = memoized
 
 
 class MAsyncOpenAI(AsyncOpenAI):
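
When `cache=True`, `MOpenAI` replaces the bound `post` method with a memoized wrapper, so repeated identical requests can be answered from cache. A hedged usage sketch (base_url, api_key, and model name are placeholders):

```python
from llm_utils import MOpenAI

client = MOpenAI(base_url="http://localhost:8000/v1", api_key="dummy", cache=True)
messages = [{"role": "user", "content": "Say hi."}]

first = client.chat.completions.create(model="my-model", messages=messages)
second = client.chat.completions.create(model="my-model", messages=messages)  # may be served from cache
```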
@@ -69,4 +77,4 @@ class MAsyncOpenAI(AsyncOpenAI):
     def __init__(self, *args, cache=True, **kwargs):
         super().__init__(*args, **kwargs)
         if cache:
-            self.post = memoize(self.post)
+            self.post = memoize(self.post)  # type: ignore