llm-ie 1.0.0__py3-none-any.whl → 1.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
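The headline change in 1.1.0, as the diff below shows, is a new LLMConfig hierarchy: sampling parameters such as max_new_tokens and temperature move from per-call chat() arguments into a config object passed to the engine constructor. The following is a minimal usage sketch inferred from the diff; the Ollama model name and the sample messages are placeholders, not taken from the package.

    # llm-ie 1.0.0: sampling parameters were passed to chat() on every call, e.g.
    #   engine = OllamaInferenceEngine(model_name="llama3.1:8b")
    #   answer = engine.chat(messages, max_new_tokens=2048, temperature=0.0)

    # llm-ie 1.1.0: parameters live in an LLMConfig given to the engine constructor.
    from llm_ie.engines import OllamaInferenceEngine, BasicLLMConfig

    config = BasicLLMConfig(max_new_tokens=2048, temperature=0.0)
    engine = OllamaInferenceEngine(model_name="llama3.1:8b", config=config)  # placeholder model name

    messages = [
        {"role": "system", "content": "You are an information extraction assistant."},
        {"role": "user", "content": "List the medications mentioned in the note."},
    ]
    # config.preprocess_messages() and config.postprocess_response() run inside chat()
    answer = engine.chat(messages)
    print(answer)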
llm_ie/engines.py CHANGED
@@ -1,22 +1,290 @@
  import abc
+ import re
  import warnings
- import importlib
- from typing import List, Dict, Union, Generator
+ import importlib.util
+ from typing import Any, Tuple, List, Dict, Union, Generator
+
+
+ class LLMConfig(abc.ABC):
+ def __init__(self, **kwargs):
+ """
+ This is an abstract class to provide interfaces for LLM configuration.
+ Children classes that inherts this class can be used in extrators and prompt editor.
+ Common LLM parameters: max_new_tokens, temperature, top_p, top_k, min_p.
+ """
+ self.params = kwargs.copy()
+
+
+ @abc.abstractmethod
+ def preprocess_messages(self, messages:List[Dict[str,str]]) -> List[Dict[str,str]]:
+ """
+ This method preprocesses the input messages before passing them to the LLM.
+
+ Parameters:
+ ----------
+ messages : List[Dict[str,str]]
+ a list of dict with role and content. role must be one of {"system", "user", "assistant"}
+
+ Returns:
+ -------
+ messages : List[Dict[str,str]]
+ a list of dict with role and content. role must be one of {"system", "user", "assistant"}
+ """
+ return NotImplemented
+
+ @abc.abstractmethod
+ def postprocess_response(self, response:Union[str, Generator[str, None, None]]) -> Union[str, Generator[str, None, None]]:
+ """
+ This method postprocesses the LLM response after it is generated.
+
+ Parameters:
+ ----------
+ response : Union[str, Generator[str, None, None]]
+ the LLM response. Can be a string or a generator.
+
+ Returns:
+ -------
+ response : str
+ the postprocessed LLM response
+ """
+ return NotImplemented
+
+
+ class BasicLLMConfig(LLMConfig):
+ def __init__(self, max_new_tokens:int=2048, temperature:float=0.0, **kwargs):
+ """
+ The basic LLM configuration for most non-reasoning models.
+ """
+ super().__init__(**kwargs)
+ self.max_new_tokens = max_new_tokens
+ self.temperature = temperature
+ self.params["max_new_tokens"] = self.max_new_tokens
+ self.params["temperature"] = self.temperature
+
+ def preprocess_messages(self, messages:List[Dict[str,str]]) -> List[Dict[str,str]]:
+ """
+ This method preprocesses the input messages before passing them to the LLM.
+
+ Parameters:
+ ----------
+ messages : List[Dict[str,str]]
+ a list of dict with role and content. role must be one of {"system", "user", "assistant"}
+
+ Returns:
+ -------
+ messages : List[Dict[str,str]]
+ a list of dict with role and content. role must be one of {"system", "user", "assistant"}
+ """
+ return messages
+
+ def postprocess_response(self, response:Union[str, Generator[str, None, None]]) -> Union[str, Generator[Dict[str, str], None, None]]:
+ """
+ This method postprocesses the LLM response after it is generated.
+
+ Parameters:
+ ----------
+ response : Union[str, Generator[str, None, None]]
+ the LLM response. Can be a string or a generator.
+
+ Returns: Union[str, Generator[Dict[str, str], None, None]]
+ the postprocessed LLM response.
+ if input is a generator, the output will be a generator {"data": <content>}.
+ """
+ if isinstance(response, str):
+ return response
+
+ def _process_stream():
+ for chunk in response:
+ yield {"type": "response", "data": chunk}
+
+ return _process_stream()
+
+ class Qwen3LLMConfig(LLMConfig):
+ def __init__(self, thinking_mode:bool=True, **kwargs):
+ """
+ The Qwen3 LLM configuration for reasoning models.
+
+ Parameters:
+ ----------
+ thinking_mode : bool, Optional
+ if True, a special token "/think" will be placed after each system and user prompt. Otherwise, "/no_think" will be placed.
+ """
+ super().__init__(**kwargs)
+ self.thinking_mode = thinking_mode
+
+ def preprocess_messages(self, messages:List[Dict[str,str]]) -> List[Dict[str,str]]:
+ """
+ Append a special token to the system and user prompts.
+ The token is "/think" if thinking_mode is True, otherwise "/no_think".
+
+ Parameters:
+ ----------
+ messages : List[Dict[str,str]]
+ a list of dict with role and content. role must be one of {"system", "user", "assistant"}
+
+ Returns:
+ -------
+ messages : List[Dict[str,str]]
+ a list of dict with role and content. role must be one of {"system", "user", "assistant"}
+ """
+ thinking_token = "/think" if self.thinking_mode else "/no_think"
+ new_messages = []
+ for message in messages:
+ if message['role'] in ['system', 'user']:
+ new_message = {'role': message['role'], 'content': f"{message['content']} {thinking_token}"}
+ else:
+ new_message = {'role': message['role'], 'content': message['content']}
+
+ new_messages.append(new_message)
+
+ return new_messages
+
+ def postprocess_response(self, response:Union[str, Generator[str, None, None]]) -> Union[str, Generator[Dict[str,str], None, None]]:
+ """
+ If input is a generator, tag contents in <think> and </think> as {"type": "reasoning", "data": <content>},
+ and the rest as {"type": "response", "data": <content>}.
+ If input is a string, drop contents in <think> and </think>.
+
+ Parameters:
+ ----------
+ response : Union[str, Generator[str, None, None]]
+ the LLM response. Can be a string or a generator.
+
+ Returns:
+ -------
+ response : Union[str, Generator[str, None, None]]
+ the postprocessed LLM response.
+ if input is a generator, the output will be a generator {"type": <reasoning or response>, "data": <content>}.
+ """
+ if isinstance(response, str):
+ return re.sub(r"<think>.*?</think>\s*", "", response, flags=re.DOTALL).strip()
+
+ if isinstance(response, Generator):
+ def _process_stream():
+ think_flag = False
+ buffer = ""
+ for chunk in response:
+ if isinstance(chunk, str):
+ buffer += chunk
+ # switch between reasoning and response
+ if "<think>" in buffer:
+ think_flag = True
+ buffer = buffer.replace("<think>", "")
+ elif "</think>" in buffer:
+ think_flag = False
+ buffer = buffer.replace("</think>", "")
+
+ # if chunk is in thinking block, tag it as reasoning; else tag it as response
+ if chunk not in ["<think>", "</think>"]:
+ if think_flag:
+ yield {"type": "reasoning", "data": chunk}
+ else:
+ yield {"type": "response", "data": chunk}
+
+ return _process_stream()
+
+
+ class OpenAIReasoningLLMConfig(LLMConfig):
+ def __init__(self, reasoning_effort:str="low", **kwargs):
+ """
+ The OpenAI "o" series configuration.
+ 1. The reasoning effort is set to "low" by default.
+ 2. The temperature parameter is not supported and will be ignored.
+ 3. The system prompt is not supported and will be concatenated to the next user prompt.
+
+ Parameters:
+ ----------
+ reasoning_effort : str, Optional
+ the reasoning effort. Must be one of {"low", "medium", "high"}. Default is "low".
+ """
+ super().__init__(**kwargs)
+ if reasoning_effort not in ["low", "medium", "high"]:
+ raise ValueError("reasoning_effort must be one of {'low', 'medium', 'high'}.")
+
+ self.reasoning_effort = reasoning_effort
+ self.params["reasoning_effort"] = self.reasoning_effort
+
+ if "temperature" in self.params:
+ warnings.warn("Reasoning models do not support temperature parameter. Will be ignored.", UserWarning)
+ self.params.pop("temperature")
+
+ def preprocess_messages(self, messages:List[Dict[str,str]]) -> List[Dict[str,str]]:
+ """
+ Concatenate system prompts to the next user prompt.
+
+ Parameters:
+ ----------
+ messages : List[Dict[str,str]]
+ a list of dict with role and content. role must be one of {"system", "user", "assistant"}
+
+ Returns:
+ -------
+ messages : List[Dict[str,str]]
+ a list of dict with role and content. role must be one of {"system", "user", "assistant"}
+ """
+ system_prompt_holder = ""
+ new_messages = []
+ for i, message in enumerate(messages):
+ # if system prompt, store it in system_prompt_holder
+ if message['role'] == 'system':
+ system_prompt_holder = message['content']
+ # if user prompt, concatenate it with system_prompt_holder
+ elif message['role'] == 'user':
+ if system_prompt_holder:
+ new_message = {'role': message['role'], 'content': f"{system_prompt_holder} {message['content']}"}
+ system_prompt_holder = ""
+ else:
+ new_message = {'role': message['role'], 'content': message['content']}
+
+ new_messages.append(new_message)
+ # if assistant/other prompt, do nothing
+ else:
+ new_message = {'role': message['role'], 'content': message['content']}
+ new_messages.append(new_message)
+
+ return new_messages
+
+ def postprocess_response(self, response:Union[str, Generator[str, None, None]]) -> Union[str, Generator[Dict[str, str], None, None]]:
+ """
+ This method postprocesses the LLM response after it is generated.
+
+ Parameters:
+ ----------
+ response : Union[str, Generator[str, None, None]]
+ the LLM response. Can be a string or a generator.
+
+ Returns: Union[str, Generator[Dict[str, str], None, None]]
+ the postprocessed LLM response.
+ if input is a generator, the output will be a generator {"type": "response", "data": <content>}.
+ """
+ if isinstance(response, str):
+ return response
+
+ def _process_stream():
+ for chunk in response:
+ yield {"type": "response", "data": chunk}
+
+ return _process_stream()


  class InferenceEngine:
  @abc.abstractmethod
- def __init__(self):
+ def __init__(self, config:LLMConfig, **kwrs):
  """
  This is an abstract class to provide interfaces for LLM inference engines.
  Children classes that inherts this class can be used in extrators. Must implement chat() method.
+
+ Parameters:
+ ----------
+ config : LLMConfig
+ the LLM configuration. Must be a child class of LLMConfig.
  """
  return NotImplemented


  @abc.abstractmethod
- def chat(self, messages:List[Dict[str,str]], max_new_tokens:int=2048, temperature:float=0.0,
- verbose:bool=False, stream:bool=False, **kwrs) -> Union[str, Generator[str, None, None]]:
+ def chat(self, messages:List[Dict[str,str]],
+ verbose:bool=False, stream:bool=False) -> Union[str, Generator[Dict[str, str], None, None]]:
  """
  This method inputs chat messages and outputs LLM generated text.

@@ -24,10 +292,6 @@ class InferenceEngine:
  ----------
  messages : List[Dict[str,str]]
  a list of dict with role and content. role must be one of {"system", "user", "assistant"}
- max_new_tokens : str, Optional
- the max number of new tokens LLM can generate.
- temperature : float, Optional
- the temperature for token sampling.
  verbose : bool, Optional
  if True, LLM generated text will be printed in terminal in real-time.
  stream : bool, Optional
@@ -35,9 +299,18 @@ class InferenceEngine:
  """
  return NotImplemented

+ def _format_config(self) -> Dict[str, Any]:
+ """
+ This method format the LLM configuration with the correct key for the inference engine.
+
+ Return : Dict[str, Any]
+ the config parameters.
+ """
+ return NotImplemented
+

  class LlamaCppInferenceEngine(InferenceEngine):
- def __init__(self, repo_id:str, gguf_filename:str, n_ctx:int=4096, n_gpu_layers:int=-1, **kwrs):
+ def __init__(self, repo_id:str, gguf_filename:str, n_ctx:int=4096, n_gpu_layers:int=-1, config:LLMConfig=None, **kwrs):
  """
  The Llama.cpp inference engine.

@@ -52,12 +325,16 @@ class LlamaCppInferenceEngine(InferenceEngine):
  context length that LLM will evaluate.
  n_gpu_layers : int, Optional
  number of layers to offload to GPU. Default is all layers (-1).
+ config : LLMConfig
+ the LLM configuration.
  """
  from llama_cpp import Llama
  self.repo_id = repo_id
  self.gguf_filename = gguf_filename
  self.n_ctx = n_ctx
  self.n_gpu_layers = n_gpu_layers
+ self.config = config if config else BasicLLMConfig()
+ self.formatted_params = self._format_config()

  self.model = Llama.from_pretrained(
  repo_id=self.repo_id,
@@ -73,8 +350,18 @@ class LlamaCppInferenceEngine(InferenceEngine):
  """
  del self.model

+ def _format_config(self) -> Dict[str, Any]:
+ """
+ This method format the LLM configuration with the correct key for the inference engine.
+ """
+ formatted_params = self.config.params.copy()
+ if "max_new_tokens" in formatted_params:
+ formatted_params["max_tokens"] = formatted_params["max_new_tokens"]
+ formatted_params.pop("max_new_tokens")
+
+ return formatted_params

- def chat(self, messages:List[Dict[str,str]], max_new_tokens:int=2048, temperature:float=0.0, verbose:bool=False, **kwrs) -> str:
+ def chat(self, messages:List[Dict[str,str]], verbose:bool=False) -> str:
  """
  This method inputs chat messages and outputs LLM generated text.

@@ -82,19 +369,15 @@ class LlamaCppInferenceEngine(InferenceEngine):
  ----------
  messages : List[Dict[str,str]]
  a list of dict with role and content. role must be one of {"system", "user", "assistant"}
- max_new_tokens : str, Optional
- the max number of new tokens LLM can generate.
- temperature : float, Optional
- the temperature for token sampling.
  verbose : bool, Optional
  if True, LLM generated text will be printed in terminal in real-time.
  """
+ processed_messages = self.config.preprocess_messages(messages)
+
  response = self.model.create_chat_completion(
- messages=messages,
- max_tokens=max_new_tokens,
- temperature=temperature,
+ messages=processed_messages,
  stream=verbose,
- **kwrs
+ **self.formatted_params
  )

  if verbose:
@@ -105,13 +388,14 @@ class LlamaCppInferenceEngine(InferenceEngine):
  res += out_dict['content']
  print(out_dict['content'], end='', flush=True)
  print('\n')
- return res
+ return self.config.postprocess_response(res)

- return response['choices'][0]['message']['content']
+ res = response['choices'][0]['message']['content']
+ return self.config.postprocess_response(res)


  class OllamaInferenceEngine(InferenceEngine):
- def __init__(self, model_name:str, num_ctx:int=4096, keep_alive:int=300, **kwrs):
+ def __init__(self, model_name:str, num_ctx:int=4096, keep_alive:int=300, config:LLMConfig=None, **kwrs):
  """
  The Ollama inference engine.

@@ -123,6 +407,8 @@ class OllamaInferenceEngine(InferenceEngine):
  context length that LLM will evaluate.
  keep_alive : int, Optional
  seconds to hold the LLM after the last API call.
+ config : LLMConfig
+ the LLM configuration.
  """
  if importlib.util.find_spec("ollama") is None:
  raise ImportError("ollama-python not found. Please install ollama-python (```pip install ollama```).")
@@ -133,9 +419,22 @@ class OllamaInferenceEngine(InferenceEngine):
  self.model_name = model_name
  self.num_ctx = num_ctx
  self.keep_alive = keep_alive
+ self.config = config if config else BasicLLMConfig()
+ self.formatted_params = self._format_config()

- def chat(self, messages:List[Dict[str,str]], max_new_tokens:int=2048, temperature:float=0.0,
- verbose:bool=False, stream:bool=False, **kwrs) -> Union[str, Generator[str, None, None]]:
+ def _format_config(self) -> Dict[str, Any]:
+ """
+ This method format the LLM configuration with the correct key for the inference engine.
+ """
+ formatted_params = self.config.params.copy()
+ if "max_new_tokens" in formatted_params:
+ formatted_params["num_predict"] = formatted_params["max_new_tokens"]
+ formatted_params.pop("max_new_tokens")
+
+ return formatted_params
+
+ def chat(self, messages:List[Dict[str,str]],
+ verbose:bool=False, stream:bool=False) -> Union[str, Generator[Dict[str, str], None, None]]:
  """
  This method inputs chat messages and outputs VLM generated text.

@@ -143,21 +442,19 @@ class OllamaInferenceEngine(InferenceEngine):
  ----------
  messages : List[Dict[str,str]]
  a list of dict with role and content. role must be one of {"system", "user", "assistant"}
- max_new_tokens : str, Optional
- the max number of new tokens VLM can generate.
- temperature : float, Optional
- the temperature for token sampling.
  verbose : bool, Optional
  if True, VLM generated text will be printed in terminal in real-time.
  stream : bool, Optional
  if True, returns a generator that yields the output in real-time.
  """
- options={'temperature':temperature, 'num_ctx': self.num_ctx, 'num_predict': max_new_tokens, **kwrs}
+ processed_messages = self.config.preprocess_messages(messages)
+
+ options={'num_ctx': self.num_ctx, **self.formatted_params}
  if stream:
  def _stream_generator():
  response_stream = self.client.chat(
  model=self.model_name,
- messages=messages,
+ messages=processed_messages,
  options=options,
  stream=True,
  keep_alive=self.keep_alive
@@ -167,12 +464,12 @@ class OllamaInferenceEngine(InferenceEngine):
  if content_chunk:
  yield content_chunk

- return _stream_generator()
+ return self.config.postprocess_response(_stream_generator())

  elif verbose:
  response = self.client.chat(
  model=self.model_name,
- messages=messages,
+ messages=processed_messages,
  options=options,
  stream=True,
  keep_alive=self.keep_alive
@@ -184,48 +481,82 @@ class OllamaInferenceEngine(InferenceEngine):
  print(content_chunk, end='', flush=True)
  res += content_chunk
  print('\n')
- return res
+ return self.config.postprocess_response(res)

  else:
  response = self.client.chat(
  model=self.model_name,
- messages=messages,
+ messages=processed_messages,
  options=options,
  stream=False,
  keep_alive=self.keep_alive
  )
- return response.get('message', {}).get('content')
+ res = response.get('message', {}).get('content')
+ return self.config.postprocess_response(res)
+

- async def chat_async(self, messages:List[Dict[str,str]], max_new_tokens:int=2048, temperature:float=0.0, **kwrs) -> str:
+ async def chat_async(self, messages:List[Dict[str,str]]) -> str:
  """
  Async version of chat method. Streaming is not supported.
  """
+ processed_messages = self.config.preprocess_messages(messages)
+
  response = await self.async_client.chat(
  model=self.model_name,
- messages=messages,
- options={'temperature':temperature, 'num_ctx': self.num_ctx, 'num_predict': max_new_tokens, **kwrs},
+ messages=processed_messages,
+ options={'num_ctx': self.num_ctx, **self.formatted_params},
  stream=False,
  keep_alive=self.keep_alive
  )

- return response['message']['content']
+ res = response['message']['content']
+ return self.config.postprocess_response(res)


  class HuggingFaceHubInferenceEngine(InferenceEngine):
- def __init__(self, model:str=None, token:Union[str, bool]=None, base_url:str=None, api_key:str=None, **kwrs):
+ def __init__(self, model:str=None, token:Union[str, bool]=None, base_url:str=None, api_key:str=None, config:LLMConfig=None, **kwrs):
  """
  The Huggingface_hub InferenceClient inference engine.
  For parameters and documentation, refer to https://huggingface.co/docs/huggingface_hub/en/package_reference/inference_client
+
+ Parameters:
+ ----------
+ model : str
+ the model name exactly as shown in Huggingface repo
+ token : str, Optional
+ the Huggingface token. If None, will use the token in os.environ['HF_TOKEN'].
+ base_url : str, Optional
+ the base url for the LLM server. If None, will use the default Huggingface Hub URL.
+ api_key : str, Optional
+ the API key for the LLM server.
+ config : LLMConfig
+ the LLM configuration.
  """
  if importlib.util.find_spec("huggingface_hub") is None:
  raise ImportError("huggingface-hub not found. Please install huggingface-hub (```pip install huggingface-hub```).")

  from huggingface_hub import InferenceClient, AsyncInferenceClient
+ self.model = model
+ self.base_url = base_url
  self.client = InferenceClient(model=model, token=token, base_url=base_url, api_key=api_key, **kwrs)
  self.client_async = AsyncInferenceClient(model=model, token=token, base_url=base_url, api_key=api_key, **kwrs)
+ self.config = config if config else BasicLLMConfig()
+ self.formatted_params = self._format_config()

- def chat(self, messages:List[Dict[str,str]], max_new_tokens:int=2048, temperature:float=0.0,
- verbose:bool=False, stream:bool=False, **kwrs) -> Union[str, Generator[str, None, None]]:
+ def _format_config(self) -> Dict[str, Any]:
+ """
+ This method format the LLM configuration with the correct key for the inference engine.
+ """
+ formatted_params = self.config.params.copy()
+ if "max_new_tokens" in formatted_params:
+ formatted_params["max_tokens"] = formatted_params["max_new_tokens"]
+ formatted_params.pop("max_new_tokens")
+
+ return formatted_params
+
+
+ def chat(self, messages:List[Dict[str,str]],
+ verbose:bool=False, stream:bool=False) -> Union[str, Generator[Dict[str, str], None, None]]:
  """
  This method inputs chat messages and outputs LLM generated text.

@@ -233,38 +564,32 @@ class HuggingFaceHubInferenceEngine(InferenceEngine):
  ----------
  messages : List[Dict[str,str]]
  a list of dict with role and content. role must be one of {"system", "user", "assistant"}
- max_new_tokens : str, Optional
- the max number of new tokens LLM can generate.
- temperature : float, Optional
- the temperature for token sampling.
  verbose : bool, Optional
  if True, VLM generated text will be printed in terminal in real-time.
  stream : bool, Optional
  if True, returns a generator that yields the output in real-time.
  """
+ processed_messages = self.config.preprocess_messages(messages)
+
  if stream:
  def _stream_generator():
  response_stream = self.client.chat.completions.create(
- messages=messages,
- max_tokens=max_new_tokens,
- temperature=temperature,
+ messages=processed_messages,
  stream=True,
- **kwrs
+ **self.formatted_params
  )
  for chunk in response_stream:
  content_chunk = chunk.get('choices')[0].get('delta').get('content')
  if content_chunk:
  yield content_chunk

- return _stream_generator()
+ return self.config.postprocess_response(_stream_generator())

  elif verbose:
  response = self.client.chat.completions.create(
- messages=messages,
- max_tokens=max_new_tokens,
- temperature=temperature,
+ messages=processed_messages,
  stream=True,
- **kwrs
+ **self.formatted_params
  )

  res = ''
@@ -273,35 +598,35 @@ class HuggingFaceHubInferenceEngine(InferenceEngine):
  if content_chunk:
  res += content_chunk
  print(content_chunk, end='', flush=True)
- return res
+ return self.config.postprocess_response(res)

  else:
  response = self.client.chat.completions.create(
- messages=messages,
- max_tokens=max_new_tokens,
- temperature=temperature,
+ messages=processed_messages,
  stream=False,
- **kwrs
+ **self.formatted_params
  )
- return response.choices[0].message.content
+ res = response.choices[0].message.content
+ return self.config.postprocess_response(res)

- async def chat_async(self, messages:List[Dict[str,str]], max_new_tokens:int=2048, temperature:float=0.0, **kwrs) -> str:
+ async def chat_async(self, messages:List[Dict[str,str]]) -> str:
  """
  Async version of chat method. Streaming is not supported.
  """
+ processed_messages = self.config.preprocess_messages(messages)
+
  response = await self.client_async.chat.completions.create(
- messages=messages,
- max_tokens=max_new_tokens,
- temperature=temperature,
+ messages=processed_messages,
  stream=False,
- **kwrs
+ **self.formatted_params
  )

- return response.choices[0].message.content
+ res = response.choices[0].message.content
+ return self.config.postprocess_response(res)


  class OpenAIInferenceEngine(InferenceEngine):
- def __init__(self, model:str, reasoning_model:bool=False, **kwrs):
+ def __init__(self, model:str, config:LLMConfig=None, **kwrs):
  """
  The OpenAI API inference engine. Supports OpenAI models and OpenAI compatible servers:
  - vLLM OpenAI compatible server (https://docs.vllm.ai/en/latest/serving/openai_compatible_server.html)
@@ -313,8 +638,6 @@ class OpenAIInferenceEngine(InferenceEngine):
  ----------
  model_name : str
  model name as described in https://platform.openai.com/docs/models
- reasoning_model : bool, Optional
- indicator for OpenAI reasoning models ("o" series).
  """
  if importlib.util.find_spec("openai") is None:
  raise ImportError("OpenAI Python API library not found. Please install OpanAI (```pip install openai```).")
@@ -323,10 +646,21 @@ class OpenAIInferenceEngine(InferenceEngine):
  self.client = OpenAI(**kwrs)
  self.async_client = AsyncOpenAI(**kwrs)
  self.model = model
- self.reasoning_model = reasoning_model
+ self.config = config if config else BasicLLMConfig()
+ self.formatted_params = self._format_config()
+
+ def _format_config(self) -> Dict[str, Any]:
+ """
+ This method format the LLM configuration with the correct key for the inference engine.
+ """
+ formatted_params = self.config.params.copy()
+ if "max_new_tokens" in formatted_params:
+ formatted_params["max_completion_tokens"] = formatted_params["max_new_tokens"]
+ formatted_params.pop("max_new_tokens")
+
+ return formatted_params

- def chat(self, messages:List[Dict[str,str]], max_new_tokens:int=2048, temperature:float=0.0,
- verbose:bool=False, stream:bool=False, **kwrs) -> Union[str, Generator[str, None, None]]:
+ def chat(self, messages:List[Dict[str,str]], verbose:bool=False, stream:bool=False) -> Union[str, Generator[Dict[str, str], None, None]]:
  """
  This method inputs chat messages and outputs LLM generated text.

@@ -334,177 +668,81 @@ class OpenAIInferenceEngine(InferenceEngine):
  ----------
  messages : List[Dict[str,str]]
  a list of dict with role and content. role must be one of {"system", "user", "assistant"}
- max_new_tokens : str, Optional
- the max number of new tokens LLM can generate.
- temperature : float, Optional
- the temperature for token sampling.
  verbose : bool, Optional
  if True, VLM generated text will be printed in terminal in real-time.
  stream : bool, Optional
  if True, returns a generator that yields the output in real-time.
  """
- # For reasoning models
- if self.reasoning_model:
- # Reasoning models do not support temperature parameter
- if temperature != 0.0:
- warnings.warn("Reasoning models do not support temperature parameter. Will be ignored.", UserWarning)
-
- # Reasoning models do not support system prompts
- if any(msg['role'] == 'system' for msg in messages):
- warnings.warn("Reasoning models do not support system prompts. Will be ignored.", UserWarning)
- messages = [msg for msg in messages if msg['role'] != 'system']
-
-
- if stream:
- def _stream_generator():
- response_stream = self.client.chat.completions.create(
- model=self.model,
- messages=messages,
- max_completion_tokens=max_new_tokens,
- stream=True,
- **kwrs
- )
- for chunk in response_stream:
- if len(chunk.choices) > 0:
- if chunk.choices[0].delta.content is not None:
- yield chunk.choices[0].delta.content
- if chunk.choices[0].finish_reason == "length":
- warnings.warn("Model stopped generating due to context length limit.", RuntimeWarning)
- if self.reasoning_model:
- warnings.warn("max_new_tokens includes reasoning tokens and output tokens.", UserWarning)
- return _stream_generator()
-
- elif verbose:
- response = self.client.chat.completions.create(
- model=self.model,
- messages=messages,
- max_completion_tokens=max_new_tokens,
- stream=True,
- **kwrs
- )
- res = ''
- for chunk in response:
- if len(chunk.choices) > 0:
- if chunk.choices[0].delta.content is not None:
- res += chunk.choices[0].delta.content
- print(chunk.choices[0].delta.content, end="", flush=True)
- if chunk.choices[0].finish_reason == "length":
- warnings.warn("Model stopped generating due to context length limit.", RuntimeWarning)
- if self.reasoning_model:
- warnings.warn("max_new_tokens includes reasoning tokens and output tokens.", UserWarning)
+ processed_messages = self.config.preprocess_messages(messages)

- print('\n')
- return res
- else:
- response = self.client.chat.completions.create(
- model=self.model,
- messages=messages,
- max_completion_tokens=max_new_tokens,
- stream=False,
- **kwrs
- )
- return response.choices[0].message.content
-
- # For non-reasoning models
- else:
- if stream:
- def _stream_generator():
- response_stream = self.client.chat.completions.create(
- model=self.model,
- messages=messages,
- max_tokens=max_new_tokens,
- temperature=temperature,
- stream=True,
- **kwrs
- )
- for chunk in response_stream:
- if len(chunk.choices) > 0:
- if chunk.choices[0].delta.content is not None:
- yield chunk.choices[0].delta.content
- if chunk.choices[0].finish_reason == "length":
- warnings.warn("Model stopped generating due to context length limit.", RuntimeWarning)
- if self.reasoning_model:
- warnings.warn("max_new_tokens includes reasoning tokens and output tokens.", UserWarning)
- return _stream_generator()
-
- elif verbose:
- response = self.client.chat.completions.create(
- model=self.model,
- messages=messages,
- max_tokens=max_new_tokens,
- temperature=temperature,
- stream=True,
- **kwrs
- )
- res = ''
- for chunk in response:
+ if stream:
+ def _stream_generator():
+ response_stream = self.client.chat.completions.create(
+ model=self.model,
+ messages=processed_messages,
+ stream=True,
+ **self.formatted_params
+ )
+ for chunk in response_stream:
  if len(chunk.choices) > 0:
  if chunk.choices[0].delta.content is not None:
- res += chunk.choices[0].delta.content
- print(chunk.choices[0].delta.content, end="", flush=True)
+ yield chunk.choices[0].delta.content
  if chunk.choices[0].finish_reason == "length":
  warnings.warn("Model stopped generating due to context length limit.", RuntimeWarning)
- if self.reasoning_model:
- warnings.warn("max_new_tokens includes reasoning tokens and output tokens.", UserWarning)
-
- print('\n')
- return res
-
- else:
- response = self.client.chat.completions.create(
- model=self.model,
- messages=messages,
- max_tokens=max_new_tokens,
- temperature=temperature,
- stream=False,
- **kwrs
- )
-
- return response.choices[0].message.content
-
-
- async def chat_async(self, messages:List[Dict[str,str]], max_new_tokens:int=4096, temperature:float=0.0, **kwrs) -> str:
- """
- Async version of chat method. Streaming is not supported.
- """
- if self.reasoning_model:
- # Reasoning models do not support temperature parameter
- if temperature != 0.0:
- warnings.warn("Reasoning models do not support temperature parameter. Will be ignored.", UserWarning)

- # Reasoning models do not support system prompts
- if any(msg['role'] == 'system' for msg in messages):
- warnings.warn("Reasoning models do not support system prompts. Will be ignored.", UserWarning)
- messages = [msg for msg in messages if msg['role'] != 'system']
+ return self.config.postprocess_response(_stream_generator())

- response = await self.async_client.chat.completions.create(
+ elif verbose:
+ response = self.client.chat.completions.create(
  model=self.model,
- messages=messages,
- max_completion_tokens=max_new_tokens,
- stream=False,
- **kwrs
+ messages=processed_messages,
+ stream=True,
+ **self.formatted_params
  )
+ res = ''
+ for chunk in response:
+ if len(chunk.choices) > 0:
+ if chunk.choices[0].delta.content is not None:
+ res += chunk.choices[0].delta.content
+ print(chunk.choices[0].delta.content, end="", flush=True)
+ if chunk.choices[0].finish_reason == "length":
+ warnings.warn("Model stopped generating due to context length limit.", RuntimeWarning)

+ print('\n')
+ return self.config.postprocess_response(res)
  else:
- response = await self.async_client.chat.completions.create(
+ response = self.client.chat.completions.create(
  model=self.model,
- messages=messages,
- max_tokens=max_new_tokens,
- temperature=temperature,
+ messages=processed_messages,
  stream=False,
- **kwrs
+ **self.formatted_params
  )
+ res = response.choices[0].message.content
+ return self.config.postprocess_response(res)
+
+
+ async def chat_async(self, messages:List[Dict[str,str]]) -> str:
+ """
+ Async version of chat method. Streaming is not supported.
+ """
+ processed_messages = self.config.preprocess_messages(messages)
+
+ response = await self.async_client.chat.completions.create(
+ model=self.model,
+ messages=processed_messages,
+ stream=False,
+ **self.formatted_params
+ )

  if response.choices[0].finish_reason == "length":
  warnings.warn("Model stopped generating due to context length limit.", RuntimeWarning)
- if self.reasoning_model:
- warnings.warn("max_new_tokens includes reasoning tokens and output tokens.", UserWarning)

- return response.choices[0].message.content
+ res = response.choices[0].message.content
+ return self.config.postprocess_response(res)


  class AzureOpenAIInferenceEngine(OpenAIInferenceEngine):
- def __init__(self, model:str, api_version:str, reasoning_model:bool=False, **kwrs):
+ def __init__(self, model:str, api_version:str, config:LLMConfig=None, **kwrs):
  """
  The Azure OpenAI API inference engine.
  For parameters and documentation, refer to
@@ -517,8 +755,8 @@ class AzureOpenAIInferenceEngine(OpenAIInferenceEngine):
  model name as described in https://platform.openai.com/docs/models
  api_version : str
  the Azure OpenAI API version
- reasoning_model : bool, Optional
- indicator for OpenAI reasoning models ("o" series).
+ config : LLMConfig
+ the LLM configuration.
  """
  if importlib.util.find_spec("openai") is None:
  raise ImportError("OpenAI Python API library not found. Please install OpanAI (```pip install openai```).")
@@ -530,11 +768,12 @@ class AzureOpenAIInferenceEngine(OpenAIInferenceEngine):
  **kwrs)
  self.async_client = AsyncAzureOpenAI(api_version=self.api_version,
  **kwrs)
- self.reasoning_model = reasoning_model
+ self.config = config if config else BasicLLMConfig()
+ self.formatted_params = self._format_config()


  class LiteLLMInferenceEngine(InferenceEngine):
- def __init__(self, model:str=None, base_url:str=None, api_key:str=None):
+ def __init__(self, model:str=None, base_url:str=None, api_key:str=None, config:LLMConfig=None):
  """
  The LiteLLM inference engine.
  For parameters and documentation, refer to https://github.com/BerriAI/litellm?tab=readme-ov-file
@@ -547,6 +786,8 @@ class LiteLLMInferenceEngine(InferenceEngine):
  the base url for the LLM server
  api_key : str, Optional
  the API key for the LLM server
+ config : LLMConfig
+ the LLM configuration.
  """
  if importlib.util.find_spec("litellm") is None:
  raise ImportError("litellm not found. Please install litellm (```pip install litellm```).")
@@ -556,36 +797,44 @@ class LiteLLMInferenceEngine(InferenceEngine):
  self.model = model
  self.base_url = base_url
  self.api_key = api_key
+ self.config = config if config else BasicLLMConfig()
+ self.formatted_params = self._format_config()

- def chat(self, messages:List[Dict[str,str]], max_new_tokens:int=2048, temperature:float=0.0,
- verbose:bool=False, stream:bool=False, **kwrs) -> Union[str, Generator[str, None, None]]:
+ def _format_config(self) -> Dict[str, Any]:
+ """
+ This method format the LLM configuration with the correct key for the inference engine.
+ """
+ formatted_params = self.config.params.copy()
+ if "max_new_tokens" in formatted_params:
+ formatted_params["max_tokens"] = formatted_params["max_new_tokens"]
+ formatted_params.pop("max_new_tokens")
+
+ return formatted_params
+
+ def chat(self, messages:List[Dict[str,str]], verbose:bool=False, stream:bool=False) -> Union[str, Generator[Dict[str, str], None, None]]:
  """
  This method inputs chat messages and outputs LLM generated text.

  Parameters:
  ----------
  messages : List[Dict[str,str]]
- a list of dict with role and content. role must be one of {"system", "user", "assistant"}
- max_new_tokens : str, Optional
- the max number of new tokens LLM can generate.
- temperature : float, Optional
- the temperature for token sampling.
+ a list of dict with role and content. role must be one of {"system", "user", "assistant"}
  verbose : bool, Optional
  if True, VLM generated text will be printed in terminal in real-time.
  stream : bool, Optional
  if True, returns a generator that yields the output in real-time.
  """
+ processed_messages = self.config.preprocess_messages(messages)
+
  if stream:
  def _stream_generator():
  response_stream = self.litellm.completion(
  model=self.model,
- messages=messages,
- max_tokens=max_new_tokens,
- temperature=temperature,
+ messages=processed_messages,
  stream=True,
  base_url=self.base_url,
  api_key=self.api_key,
- **kwrs
+ **self.formatted_params
  )

  for chunk in response_stream:
@@ -593,18 +842,16 @@ class LiteLLMInferenceEngine(InferenceEngine):
  if chunk_content:
  yield chunk_content

- return _stream_generator()
+ return self.config.postprocess_response(_stream_generator())

  elif verbose:
  response = self.litellm.completion(
  model=self.model,
- messages=messages,
- max_tokens=max_new_tokens,
- temperature=temperature,
+ messages=processed_messages,
  stream=True,
  base_url=self.base_url,
  api_key=self.api_key,
- **kwrs
+ **self.formatted_params
  )

  res = ''
@@ -614,34 +861,34 @@ class LiteLLMInferenceEngine(InferenceEngine):
  res += chunk_content
  print(chunk_content, end='', flush=True)

- return res
+ return self.config.postprocess_response(res)

  else:
  response = self.litellm.completion(
  model=self.model,
- messages=messages,
- max_tokens=max_new_tokens,
- temperature=temperature,
+ messages=processed_messages,
  stream=False,
  base_url=self.base_url,
  api_key=self.api_key,
- **kwrs
+ **self.formatted_params
  )
- return response.choices[0].message.content
+ res = response.choices[0].message.content
+ return self.config.postprocess_response(res)

- async def chat_async(self, messages:List[Dict[str,str]], max_new_tokens:int=2048, temperature:float=0.0, **kwrs) -> str:
+ async def chat_async(self, messages:List[Dict[str,str]]) -> str:
  """
  Async version of chat method. Streaming is not supported.
  """
+ processed_messages = self.config.preprocess_messages(messages)
+
  response = await self.litellm.acompletion(
  model=self.model,
- messages=messages,
- max_tokens=max_new_tokens,
- temperature=temperature,
+ messages=processed_messages,
  stream=False,
  base_url=self.base_url,
  api_key=self.api_key,
- **kwrs
+ **self.formatted_params
  )

- return response.get('choices')[0].get('message').get('content')
+ res = response.get('choices')[0].get('message').get('content')
+ return self.config.postprocess_response(res)
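
The diff also replaces the old reasoning_model flag with dedicated reasoning configs. The sketch below shows how the new classes fit together as inferred from the code above; the model names ("qwen3:8b", "o3-mini") are placeholders and not taken from the package.

    from llm_ie.engines import (OllamaInferenceEngine, OpenAIInferenceEngine,
                                Qwen3LLMConfig, OpenAIReasoningLLMConfig)

    messages = [{"role": "user", "content": "Extract all diagnoses from the note."}]

    # Qwen3: "/think" (or "/no_think") is appended to system/user prompts; streamed
    # chunks arrive as {"type": "reasoning" | "response", "data": ...} dicts.
    qwen = OllamaInferenceEngine(model_name="qwen3:8b",  # placeholder model name
                                 config=Qwen3LLMConfig(thinking_mode=True, max_new_tokens=1024))
    for chunk in qwen.chat(messages, stream=True):
        if chunk["type"] == "response":
            print(chunk["data"], end="", flush=True)

    # OpenAI "o" series: temperature is dropped with a warning, system prompts are
    # concatenated into the next user prompt, and reasoning_effort is forwarded.
    o_series = OpenAIInferenceEngine(model="o3-mini",  # placeholder model name
                                     config=OpenAIReasoningLLMConfig(reasoning_effort="medium"))
    print(o_series.chat(messages))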