llm-ie 1.2.2__py3-none-any.whl → 1.2.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
llm_ie/engines.py CHANGED
@@ -1,4 +1,5 @@
  import abc
+ import os
  import re
  import warnings
  import importlib.util
@@ -33,13 +34,13 @@ class LLMConfig(abc.ABC):
  return NotImplemented

  @abc.abstractmethod
- def postprocess_response(self, response:Union[str, Generator[str, None, None]]) -> Union[Dict[str,str], Generator[Dict[str, str], None, None]]:
+ def postprocess_response(self, response:Union[str, Dict[str, str], Generator[str, None, None]]) -> Union[Dict[str,str], Generator[Dict[str, str], None, None]]:
  """
  This method postprocesses the LLM response after it is generated.

  Parameters:
  ----------
- response : Union[str, Generator[Dict[str, str], None, None]]
+ response : Union[str, Dict[str, str], Generator[Dict[str, str], None, None]]
  the LLM response. Can be a dict or a generator.

  Returns:
@@ -75,15 +76,15 @@ class BasicLLMConfig(LLMConfig):
  messages : List[Dict[str,str]]
  a list of dict with role and content. role must be one of {"system", "user", "assistant"}
  """
- return messages
+ return messages.copy()

- def postprocess_response(self, response:Union[str, Generator[str, None, None]]) -> Union[Dict[str,str], Generator[Dict[str, str], None, None]]:
+ def postprocess_response(self, response:Union[str, Dict[str, str], Generator[str, None, None]]) -> Union[Dict[str,str], Generator[Dict[str, str], None, None]]:
  """
  This method postprocesses the LLM response after it is generated.

  Parameters:
  ----------
- response : Union[str, Generator[str, None, None]]
+ response : Union[str, Dict[str, str], Generator[str, None, None]]
  the LLM response. Can be a string or a generator.

  Returns: Union[Dict[str,str], Generator[Dict[str, str], None, None]]
@@ -93,13 +94,27 @@ class BasicLLMConfig(LLMConfig):
  """
  if isinstance(response, str):
  return {"response": response}
+
+ elif isinstance(response, dict):
+ if "response" in response:
+ return response
+ else:
+ warnings.warn(f"Invalid response dict keys: {response.keys()}. Returning default empty dict.", UserWarning)
+ return {"response": ""}

- def _process_stream():
- for chunk in response:
- yield {"type": "response", "data": chunk}
+ elif isinstance(response, Generator):
+ def _process_stream():
+ for chunk in response:
+ if isinstance(chunk, dict):
+ yield chunk
+ elif isinstance(chunk, str):
+ yield {"type": "response", "data": chunk}

- return _process_stream()
+ return _process_stream()

+ else:
+ warnings.warn(f"Invalid response type: {type(response)}. Returning default empty dict.", UserWarning)
+ return {"response": ""}

  class ReasoningLLMConfig(LLMConfig):
  def __init__(self, thinking_token_start="<think>", thinking_token_end="</think>", **kwargs):
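A minimal sketch of the updated `BasicLLMConfig.postprocess_response` behavior shown in the hunk above (illustrative values; assumes the class is imported from `llm_ie.engines`):

```python
# Illustrative sketch of BasicLLMConfig.postprocess_response in 1.2.4 (values are made up).
from llm_ie.engines import BasicLLMConfig

config = BasicLLMConfig()
config.postprocess_response("Hello")                   # {"response": "Hello"}
config.postprocess_response({"response": "Hello"})     # dict with a "response" key is passed through
config.postprocess_response({"unexpected": "Hello"})   # warns, returns {"response": ""}
```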
@@ -124,11 +139,16 @@ class ReasoningLLMConfig(LLMConfig):
  messages : List[Dict[str,str]]
  a list of dict with role and content. role must be one of {"system", "user", "assistant"}
  """
- return messages
+ return messages.copy()

- def postprocess_response(self, response:Union[str, Generator[str, None, None]]) -> Union[Dict[str,str], Generator[Dict[str,str], None, None]]:
+ def postprocess_response(self, response:Union[str, Dict[str, str], Generator[str, None, None]]) -> Union[Dict[str,str], Generator[Dict[str,str], None, None]]:
  """
  This method postprocesses the LLM response after it is generated.
+ 1. If input is a string, it will extract the reasoning and response based on the thinking tokens.
+ 2. If input is a dict, it should contain keys "reasoning" and "response". This is for inference engines that already parse reasoning and response.
+ 3. If input is a generator,
+ a. if the chunk is a dict, it should contain keys "type" and "data". This is for inference engines that already parse reasoning and response.
+ b. if the chunk is a string, it will yield dicts with keys "type" and "data" based on the thinking tokens.

  Parameters:
  ----------
@@ -143,18 +163,29 @@ class ReasoningLLMConfig(LLMConfig):
  """
  if isinstance(response, str):
  # get contents between thinking_token_start and thinking_token_end
- match = re.search(f"{self.thinking_token_start}.*?{self.thinking_token_end}", response, re.DOTALL)
- reasoning = match.group(0) if match else ""
+ pattern = f"{re.escape(self.thinking_token_start)}(.*?){re.escape(self.thinking_token_end)}"
+ match = re.search(pattern, response, re.DOTALL)
+ reasoning = match.group(1) if match else ""
  # get response AFTER thinking_token_end
  response = re.sub(f".*?{self.thinking_token_end}", "", response, flags=re.DOTALL).strip()
  return {"reasoning": reasoning, "response": response}

- if isinstance(response, Generator):
+ elif isinstance(response, dict):
+ if "reasoning" in response and "response" in response:
+ return response
+ else:
+ warnings.warn(f"Invalid response dict keys: {response.keys()}. Returning default empty dict.", UserWarning)
+ return {"reasoning": "", "response": ""}
+
+ elif isinstance(response, Generator):
  def _process_stream():
  think_flag = False
  buffer = ""
  for chunk in response:
- if isinstance(chunk, str):
+ if isinstance(chunk, dict):
+ yield chunk
+
+ elif isinstance(chunk, str):
  buffer += chunk
  # switch between reasoning and response
  if self.thinking_token_start in buffer:
@@ -173,6 +204,9 @@ class ReasoningLLMConfig(LLMConfig):

  return _process_stream()

+ else:
+ warnings.warn(f"Invalid response type: {type(response)}. Returning default empty dict.", UserWarning)
+ return {"reasoning": "", "response": ""}

  class Qwen3LLMConfig(ReasoningLLMConfig):
  def __init__(self, thinking_mode:bool=True, **kwargs):
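For the string case above, the escaped pattern now captures only the text between the thinking tokens. A minimal sketch with the default `<think>`/`</think>` tokens (illustrative text):

```python
# Illustrative sketch of ReasoningLLMConfig.postprocess_response on a plain string in 1.2.4.
from llm_ie.engines import ReasoningLLMConfig

config = ReasoningLLMConfig()  # defaults: thinking_token_start="<think>", thinking_token_end="</think>"
out = config.postprocess_response("<think>Check the context first.</think>The answer is 42.")
# out == {"reasoning": "Check the context first.", "response": "The answer is 42."}
```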
@@ -279,6 +313,32 @@ class OpenAIReasoningLLMConfig(ReasoningLLMConfig):
  return new_messages


+ class MessagesLogger:
+ def __init__(self):
+ """
+ This class is used to log the messages for InferenceEngine.chat().
+ """
+ self.messages_log = []
+
+ def log_messages(self, messages : List[Dict[str,str]]):
+ """
+ This method logs the messages to a list.
+ """
+ self.messages_log.append(messages)
+
+ def get_messages_log(self) -> List[List[Dict[str,str]]]:
+ """
+ This method returns a copy of the current messages log
+ """
+ return self.messages_log.copy()
+
+ def clear_messages_log(self):
+ """
+ This method clears the current messages log
+ """
+ self.messages_log.clear()
+
+
  class InferenceEngine:
  @abc.abstractmethod
  def __init__(self, config:LLMConfig, **kwrs):
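A minimal sketch of the new `MessagesLogger` added above, wired into `chat()` (the engine constructor arguments are illustrative; any `InferenceEngine` subclass in 1.2.4 accepts `messages_logger`):

```python
# Illustrative sketch: logging chat turns with the new MessagesLogger (1.2.4).
from llm_ie.engines import OllamaInferenceEngine, BasicLLMConfig, MessagesLogger

logger = MessagesLogger()
engine = OllamaInferenceEngine(model_name="llama3.1:8b", config=BasicLLMConfig())  # model name is a placeholder

engine.chat([{"role": "user", "content": "Hi"}], messages_logger=logger)

# Each logged conversation ends with the assistant turn, including parsed reasoning where available.
print(logger.get_messages_log())
logger.clear_messages_log()
```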
@@ -293,10 +353,16 @@ class InferenceEngine:
  """
  return NotImplemented

+ def get_messages_log(self) -> List[List[Dict[str,str]]]:
+ return self.messages_log.copy()
+
+ def clear_messages_log(self):
+ self.messages_log = []
+

  @abc.abstractmethod
- def chat(self, messages:List[Dict[str,str]],
- verbose:bool=False, stream:bool=False) -> Union[Dict[str,str], Generator[Dict[str, str], None, None]]:
+ def chat(self, messages:List[Dict[str,str]], verbose:bool=False, stream:bool=False,
+ messages_logger:MessagesLogger=None) -> Union[Dict[str,str], Generator[Dict[str, str], None, None]]:
  """
  This method inputs chat messages and outputs LLM generated text.

@@ -308,6 +374,8 @@ class InferenceEngine:
  if True, LLM generated text will be printed in terminal in real-time.
  stream : bool, Optional
  if True, returns a generator that yields the output in real-time.
+ Messages_logger : MessagesLogger, Optional
+ the message logger that logs the chat messages.

  Returns:
  -------
@@ -346,6 +414,7 @@ class LlamaCppInferenceEngine(InferenceEngine):
  the LLM configuration.
  """
  from llama_cpp import Llama
+ super().__init__(config)
  self.repo_id = repo_id
  self.gguf_filename = gguf_filename
  self.n_ctx = n_ctx
@@ -378,7 +447,7 @@ class LlamaCppInferenceEngine(InferenceEngine):

  return formatted_params

- def chat(self, messages:List[Dict[str,str]], verbose:bool=False) -> Dict[str,str]:
+ def chat(self, messages:List[Dict[str,str]], verbose:bool=False, messages_logger:MessagesLogger=None) -> Dict[str,str]:
  """
  This method inputs chat messages and outputs LLM generated text.

@@ -388,15 +457,18 @@ class LlamaCppInferenceEngine(InferenceEngine):
  a list of dict with role and content. role must be one of {"system", "user", "assistant"}
  verbose : bool, Optional
  if True, LLM generated text will be printed in terminal in real-time.
+ messages_logger : MessagesLogger, Optional
+ the message logger that logs the chat messages.
  """
+ # Preprocess messages
  processed_messages = self.config.preprocess_messages(messages)
-
+ # Generate response
  response = self.model.create_chat_completion(
  messages=processed_messages,
  stream=verbose,
  **self.formatted_params
  )
-
+
  if verbose:
  res = ''
  for chunk in response:
@@ -408,7 +480,16 @@ class LlamaCppInferenceEngine(InferenceEngine):
  return self.config.postprocess_response(res)

  res = response['choices'][0]['message']['content']
- return self.config.postprocess_response(res)
+ # Postprocess response
+ res_dict = self.config.postprocess_response(res)
+ # Write to messages log
+ if messages_logger:
+ processed_messages.append({"role": "assistant",
+ "content": res_dict.get("response", ""),
+ "reasoning": res_dict.get("reasoning", "")})
+ messages_logger.log_messages(processed_messages)
+
+ return res_dict


  class OllamaInferenceEngine(InferenceEngine):
@@ -431,6 +512,7 @@ class OllamaInferenceEngine(InferenceEngine):
  raise ImportError("ollama-python not found. Please install ollama-python (```pip install ollama```).")

  from ollama import Client, AsyncClient
+ super().__init__(config)
  self.client = Client(**kwrs)
  self.async_client = AsyncClient(**kwrs)
  self.model_name = model_name
@@ -450,8 +532,8 @@ class OllamaInferenceEngine(InferenceEngine):

  return formatted_params

- def chat(self, messages:List[Dict[str,str]],
- verbose:bool=False, stream:bool=False) -> Union[Dict[str,str], Generator[Dict[str, str], None, None]]:
+ def chat(self, messages:List[Dict[str,str]], verbose:bool=False, stream:bool=False,
+ messages_logger:MessagesLogger=None) -> Union[Dict[str,str], Generator[Dict[str, str], None, None]]:
  """
  This method inputs chat messages and outputs VLM generated text.

@@ -463,6 +545,8 @@ class OllamaInferenceEngine(InferenceEngine):
  if True, VLM generated text will be printed in terminal in real-time.
  stream : bool, Optional
  if True, returns a generator that yields the output in real-time.
+ Messages_logger : MessagesLogger, Optional
+ the message logger that logs the chat messages.

  Returns:
  -------
@@ -481,10 +565,28 @@ class OllamaInferenceEngine(InferenceEngine):
  stream=True,
  keep_alive=self.keep_alive
  )
+ res = {"reasoning": "", "response": ""}
  for chunk in response_stream:
- content_chunk = chunk.get('message', {}).get('content')
- if content_chunk:
- yield content_chunk
+ if hasattr(chunk.message, 'thinking') and chunk.message.thinking:
+ content_chunk = getattr(getattr(chunk, 'message', {}), 'thinking', '')
+ res["reasoning"] += content_chunk
+ yield {"type": "reasoning", "data": content_chunk}
+ else:
+ content_chunk = getattr(getattr(chunk, 'message', {}), 'content', '')
+ res["response"] += content_chunk
+ yield {"type": "response", "data": content_chunk}
+
+ if chunk.done_reason == "length":
+ warnings.warn("Model stopped generating due to context length limit.", RuntimeWarning)
+
+ # Postprocess response
+ res_dict = self.config.postprocess_response(res)
+ # Write to messages log
+ if messages_logger:
+ processed_messages.append({"role": "assistant",
+ "content": res_dict.get("response", ""),
+ "reasoning": res_dict.get("reasoning", "")})
+ messages_logger.log_messages(processed_messages)

  return self.config.postprocess_response(_stream_generator())
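With `stream=True`, `OllamaInferenceEngine.chat` now yields dicts with `type` and `data` keys instead of raw text chunks. A minimal consumption sketch, assuming `engine` is built as in the earlier sketch:

```python
# Illustrative sketch: consuming the dict chunks yielded by chat(stream=True) in 1.2.4.
messages = [{"role": "user", "content": "Summarize this note."}]
for chunk in engine.chat(messages, stream=True):
    if chunk["type"] == "reasoning":
        pass  # thinking tokens, when the model/config produces them
    elif chunk["type"] == "response":
        print(chunk["data"], end="", flush=True)
```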
 
@@ -497,14 +599,29 @@ class OllamaInferenceEngine(InferenceEngine):
  keep_alive=self.keep_alive
  )

- res = ''
+ res = {"reasoning": "", "response": ""}
+ phase = ""
  for chunk in response:
- content_chunk = chunk.get('message', {}).get('content')
+ if hasattr(chunk.message, 'thinking') and chunk.message.thinking:
+ if phase != "reasoning":
+ print("\n--- Reasoning ---")
+ phase = "reasoning"
+
+ content_chunk = getattr(getattr(chunk, 'message', {}), 'thinking', '')
+ res["reasoning"] += content_chunk
+ else:
+ if phase != "response":
+ print("\n--- Response ---")
+ phase = "response"
+ content_chunk = getattr(getattr(chunk, 'message', {}), 'content', '')
+ res["response"] += content_chunk
+
  print(content_chunk, end='', flush=True)
- res += content_chunk
+
+ if chunk.done_reason == "length":
+ warnings.warn("Model stopped generating due to context length limit.", RuntimeWarning)
  print('\n')
- return self.config.postprocess_response(res)
-
+
  else:
  response = self.client.chat(
  model=self.model_name,
@@ -513,11 +630,25 @@ class OllamaInferenceEngine(InferenceEngine):
  stream=False,
  keep_alive=self.keep_alive
  )
- res = response.get('message', {}).get('content')
- return self.config.postprocess_response(res)
+ res = {"reasoning": getattr(getattr(response, 'message', {}), 'thinking', ''),
+ "response": getattr(getattr(response, 'message', {}), 'content', '')}
+
+ if response.done_reason == "length":
+ warnings.warn("Model stopped generating due to context length limit.", RuntimeWarning)
+
+ # Postprocess response
+ res_dict = self.config.postprocess_response(res)
+ # Write to messages log
+ if messages_logger:
+ processed_messages.append({"role": "assistant",
+ "content": res_dict.get("response", ""),
+ "reasoning": res_dict.get("reasoning", "")})
+ messages_logger.log_messages(processed_messages)
+
+ return res_dict


- async def chat_async(self, messages:List[Dict[str,str]]) -> Dict[str,str]:
+ async def chat_async(self, messages:List[Dict[str,str]], messages_logger:MessagesLogger=None) -> Dict[str,str]:
  """
  Async version of chat method. Streaming is not supported.
  """
@@ -531,8 +662,21 @@ class OllamaInferenceEngine(InferenceEngine):
  keep_alive=self.keep_alive
  )

- res = response['message']['content']
- return self.config.postprocess_response(res)
+ res = {"reasoning": getattr(getattr(response, 'message', {}), 'thinking', ''),
+ "response": getattr(getattr(response, 'message', {}), 'content', '')}
+
+ if response.done_reason == "length":
+ warnings.warn("Model stopped generating due to context length limit.", RuntimeWarning)
+ # Postprocess response
+ res_dict = self.config.postprocess_response(res)
+ # Write to messages log
+ if messages_logger:
+ processed_messages.append({"role": "assistant",
+ "content": res_dict.get("response", ""),
+ "reasoning": res_dict.get("reasoning", "")})
+ messages_logger.log_messages(processed_messages)
+
+ return res_dict


  class HuggingFaceHubInferenceEngine(InferenceEngine):
@@ -558,6 +702,7 @@ class HuggingFaceHubInferenceEngine(InferenceEngine):
  raise ImportError("huggingface-hub not found. Please install huggingface-hub (```pip install huggingface-hub```).")

  from huggingface_hub import InferenceClient, AsyncInferenceClient
+ super().__init__(config)
  self.model = model
  self.base_url = base_url
  self.client = InferenceClient(model=model, token=token, base_url=base_url, api_key=api_key, **kwrs)
@@ -577,8 +722,8 @@ class HuggingFaceHubInferenceEngine(InferenceEngine):
  return formatted_params


- def chat(self, messages:List[Dict[str,str]],
- verbose:bool=False, stream:bool=False) -> Union[Dict[str,str], Generator[Dict[str, str], None, None]]:
+ def chat(self, messages:List[Dict[str,str]], verbose:bool=False, stream:bool=False,
+ messages_logger:MessagesLogger=None) -> Union[Dict[str,str], Generator[Dict[str, str], None, None]]:
  """
  This method inputs chat messages and outputs LLM generated text.

@@ -590,7 +735,9 @@ class HuggingFaceHubInferenceEngine(InferenceEngine):
  if True, VLM generated text will be printed in terminal in real-time.
  stream : bool, Optional
  if True, returns a generator that yields the output in real-time.
-
+ messages_logger : MessagesLogger, Optional
+ the message logger that logs the chat messages.
+
  Returns:
  -------
  response : Union[Dict[str,str], Generator[Dict[str, str], None, None]]
@@ -605,11 +752,22 @@ class HuggingFaceHubInferenceEngine(InferenceEngine):
  stream=True,
  **self.formatted_params
  )
+ res_text = ""
  for chunk in response_stream:
  content_chunk = chunk.get('choices')[0].get('delta').get('content')
  if content_chunk:
+ res_text += content_chunk
  yield content_chunk

+ # Postprocess response
+ res_dict = self.config.postprocess_response(res_text)
+ # Write to messages log
+ if messages_logger:
+ processed_messages.append({"role": "assistant",
+ "content": res_dict.get("response", ""),
+ "reasoning": res_dict.get("reasoning", "")})
+ messages_logger.log_messages(processed_messages)
+
  return self.config.postprocess_response(_stream_generator())

  elif verbose:
@@ -625,7 +783,7 @@ class HuggingFaceHubInferenceEngine(InferenceEngine):
  if content_chunk:
  res += content_chunk
  print(content_chunk, end='', flush=True)
- return self.config.postprocess_response(res)
+

  else:
  response = self.client.chat.completions.create(
@@ -634,9 +792,20 @@ class HuggingFaceHubInferenceEngine(InferenceEngine):
  **self.formatted_params
  )
  res = response.choices[0].message.content
- return self.config.postprocess_response(res)
+
+ # Postprocess response
+ res_dict = self.config.postprocess_response(res)
+ # Write to messages log
+ if messages_logger:
+ processed_messages.append({"role": "assistant",
+ "content": res_dict.get("response", ""),
+ "reasoning": res_dict.get("reasoning", "")})
+ messages_logger.log_messages(processed_messages)
+
+ return res_dict
+

- async def chat_async(self, messages:List[Dict[str,str]]) -> Dict[str,str]:
+ async def chat_async(self, messages:List[Dict[str,str]], messages_logger:MessagesLogger=None) -> Dict[str,str]:
  """
  Async version of chat method. Streaming is not supported.
  """
@@ -649,16 +818,343 @@ class HuggingFaceHubInferenceEngine(InferenceEngine):
  )

  res = response.choices[0].message.content
- return self.config.postprocess_response(res)
+ # Postprocess response
+ res_dict = self.config.postprocess_response(res)
+ # Write to messages log
+ if messages_logger:
+ processed_messages.append({"role": "assistant",
+ "content": res_dict.get("response", ""),
+ "reasoning": res_dict.get("reasoning", "")})
+ messages_logger.log_messages(processed_messages)
+
+ return res_dict
+
+
+ class OpenAICompatibleInferenceEngine(InferenceEngine):
+ def __init__(self, model:str, api_key:str, base_url:str, config:LLMConfig=None, **kwrs):
+ """
+ General OpenAI-compatible server inference engine.
+ https://docs.vllm.ai/en/latest/serving/openai_compatible_server.html
+
+ For parameters and documentation, refer to https://platform.openai.com/docs/api-reference/introduction
+
+ Parameters:
+ ----------
+ model_name : str
+ model name as shown in the vLLM server
+ api_key : str
+ the API key for the vLLM server.
+ base_url : str
+ the base url for the vLLM server.
+ config : LLMConfig
+ the LLM configuration.
+ """
+ if importlib.util.find_spec("openai") is None:
+ raise ImportError("OpenAI Python API library not found. Please install OpanAI (```pip install openai```).")

+ from openai import OpenAI, AsyncOpenAI
+ from openai.types.chat import ChatCompletionChunk
+ self.ChatCompletionChunk = ChatCompletionChunk
+ super().__init__(config)
+ self.client = OpenAI(api_key=api_key, base_url=base_url, **kwrs)
+ self.async_client = AsyncOpenAI(api_key=api_key, base_url=base_url, **kwrs)
+ self.model = model
+ self.config = config if config else BasicLLMConfig()
+ self.formatted_params = self._format_config()
+
+ def _format_config(self) -> Dict[str, Any]:
+ """
+ This method format the LLM configuration with the correct key for the inference engine.
+ """
+ formatted_params = self.config.params.copy()
+ if "max_new_tokens" in formatted_params:
+ formatted_params["max_completion_tokens"] = formatted_params["max_new_tokens"]
+ formatted_params.pop("max_new_tokens")
+
+ return formatted_params
+
+ @abc.abstractmethod
+ def _format_response(self, response: Any) -> Dict[str, str]:
+ """
+ This method format the response from OpenAI API to a dict with keys "type" and "data".
+
+ Parameters:
+ ----------
+ response : Any
+ the response from OpenAI-compatible API. Could be a dict, generator, or object.
+ """
+ return NotImplemented
+
+ def chat(self, messages:List[Dict[str,str]], verbose:bool=False, stream:bool=False, messages_logger:MessagesLogger=None) -> Union[Dict[str, str], Generator[Dict[str, str], None, None]]:
+ """
+ This method inputs chat messages and outputs LLM generated text.
+
+ Parameters:
+ ----------
+ messages : List[Dict[str,str]]
+ a list of dict with role and content. role must be one of {"system", "user", "assistant"}
+ verbose : bool, Optional
+ if True, VLM generated text will be printed in terminal in real-time.
+ stream : bool, Optional
+ if True, returns a generator that yields the output in real-time.
+ messages_logger : MessagesLogger, Optional
+ the message logger that logs the chat messages.
+
+ Returns:
+ -------
+ response : Union[Dict[str,str], Generator[Dict[str, str], None, None]]
+ a dict {"reasoning": <reasoning>, "response": <response>} or Generator {"type": <reasoning or response>, "data": <content>}
+ """
+ processed_messages = self.config.preprocess_messages(messages)
+
+ if stream:
+ def _stream_generator():
+ response_stream = self.client.chat.completions.create(
+ model=self.model,
+ messages=processed_messages,
+ stream=True,
+ **self.formatted_params
+ )
+ res_text = ""
+ for chunk in response_stream:
+ if len(chunk.choices) > 0:
+ chunk_dict = self._format_response(chunk)
+ yield chunk_dict
+
+ res_text += chunk_dict["data"]
+ if chunk.choices[0].finish_reason == "length":
+ warnings.warn("Model stopped generating due to context length limit.", RuntimeWarning)
+
+ # Postprocess response
+ res_dict = self.config.postprocess_response(res_text)
+ # Write to messages log
+ if messages_logger:
+ processed_messages.append({"role": "assistant",
+ "content": res_dict.get("response", ""),
+ "reasoning": res_dict.get("reasoning", "")})
+ messages_logger.log_messages(processed_messages)
+
+ return self.config.postprocess_response(_stream_generator())
+
+ elif verbose:
+ response = self.client.chat.completions.create(
+ model=self.model,
+ messages=processed_messages,
+ stream=True,
+ **self.formatted_params
+ )
+ res = {"reasoning": "", "response": ""}
+ phase = ""
+ for chunk in response:
+ if len(chunk.choices) > 0:
+ chunk_dict = self._format_response(chunk)
+ chunk_text = chunk_dict["data"]
+ res[chunk_dict["type"]] += chunk_text
+ if phase != chunk_dict["type"] and chunk_text != "":
+ print(f"\n--- {chunk_dict['type'].capitalize()} ---")
+ phase = chunk_dict["type"]
+
+ print(chunk_text, end="", flush=True)
+ if chunk.choices[0].finish_reason == "length":
+ warnings.warn("Model stopped generating due to context length limit.", RuntimeWarning)
+
+ print('\n')
+
+ else:
+ response = self.client.chat.completions.create(
+ model=self.model,
+ messages=processed_messages,
+ stream=False,
+ **self.formatted_params
+ )
+ res = self._format_response(response)
+
+ if response.choices[0].finish_reason == "length":
+ warnings.warn("Model stopped generating due to context length limit.", RuntimeWarning)
+
+ # Postprocess response
+ res_dict = self.config.postprocess_response(res)
+ # Write to messages log
+ if messages_logger:
+ processed_messages.append({"role": "assistant",
+ "content": res_dict.get("response", ""),
+ "reasoning": res_dict.get("reasoning", "")})
+ messages_logger.log_messages(processed_messages)
+
+ return res_dict
+
+
+ async def chat_async(self, messages:List[Dict[str,str]], messages_logger:MessagesLogger=None) -> Dict[str,str]:
+ """
+ Async version of chat method. Streaming is not supported.
+ """
+ processed_messages = self.config.preprocess_messages(messages)
+
+ response = await self.async_client.chat.completions.create(
+ model=self.model,
+ messages=processed_messages,
+ stream=False,
+ **self.formatted_params
+ )
+
+ if response.choices[0].finish_reason == "length":
+ warnings.warn("Model stopped generating due to context length limit.", RuntimeWarning)
+
+ res = self._format_response(response)
+
+ # Postprocess response
+ res_dict = self.config.postprocess_response(res)
+ # Write to messages log
+ if messages_logger:
+ processed_messages.append({"role": "assistant",
+ "content": res_dict.get("response", ""),
+ "reasoning": res_dict.get("reasoning", "")})
+ messages_logger.log_messages(processed_messages)
+
+ return res_dict
+
+
+ class VLLMInferenceEngine(OpenAICompatibleInferenceEngine):
+ def __init__(self, model:str, api_key:str="", base_url:str="http://localhost:8000/v1", config:LLMConfig=None, **kwrs):
+ """
+ vLLM OpenAI compatible server inference engine.
+ https://docs.vllm.ai/en/latest/serving/openai_compatible_server.html
+
+ For parameters and documentation, refer to https://platform.openai.com/docs/api-reference/introduction
+
+ Parameters:
+ ----------
+ model_name : str
+ model name as shown in the vLLM server
+ api_key : str, Optional
+ the API key for the vLLM server.
+ base_url : str, Optional
+ the base url for the vLLM server.
+ config : LLMConfig
+ the LLM configuration.
+ """
+ super().__init__(model, api_key, base_url, config, **kwrs)
+
+
+ def _format_response(self, response: Any) -> Dict[str, str]:
+ """
+ This method format the response from OpenAI API to a dict with keys "type" and "data".
+
+ Parameters:
+ ----------
+ response : Any
+ the response from OpenAI-compatible API. Could be a dict, generator, or object.
+ """
+ if isinstance(response, self.ChatCompletionChunk):
+ if hasattr(response.choices[0].delta, "reasoning_content") and getattr(response.choices[0].delta, "reasoning_content") is not None:
+ chunk_text = getattr(response.choices[0].delta, "reasoning_content", "")
+ if chunk_text is None:
+ chunk_text = ""
+ return {"type": "reasoning", "data": chunk_text}
+ else:
+ chunk_text = getattr(response.choices[0].delta, "content", "")
+ if chunk_text is None:
+ chunk_text = ""
+ return {"type": "response", "data": chunk_text}
+
+ return {"reasoning": getattr(response.choices[0].message, "reasoning_content", ""),
+ "response": getattr(response.choices[0].message, "content", "")}
+
+ class SGLangInferenceEngine(OpenAICompatibleInferenceEngine):
+ def __init__(self, model:str, api_key:str="", base_url:str="http://localhost:30000/v1", config:LLMConfig=None, **kwrs):
+ """
+ SGLang OpenAI compatible API inference engine.
+ https://docs.sglang.ai/basic_usage/openai_api.html
+
+ Parameters:
+ ----------
+ model_name : str
+ model name as shown in the vLLM server
+ api_key : str, Optional
+ the API key for the vLLM server.
+ base_url : str, Optional
+ the base url for the vLLM server.
+ config : LLMConfig
+ the LLM configuration.
+ """
+ super().__init__(model, api_key, base_url, config, **kwrs)
+
+
+ def _format_response(self, response: Any) -> Dict[str, str]:
+ """
+ This method format the response from OpenAI API to a dict with keys "type" and "data".
+
+ Parameters:
+ ----------
+ response : Any
+ the response from OpenAI-compatible API. Could be a dict, generator, or object.
+ """
+ if isinstance(response, self.ChatCompletionChunk):
+ if hasattr(response.choices[0].delta, "reasoning_content") and getattr(response.choices[0].delta, "reasoning_content") is not None:
+ chunk_text = getattr(response.choices[0].delta, "reasoning_content", "")
+ if chunk_text is None:
+ chunk_text = ""
+ return {"type": "reasoning", "data": chunk_text}
+ else:
+ chunk_text = getattr(response.choices[0].delta, "content", "")
+ if chunk_text is None:
+ chunk_text = ""
+ return {"type": "response", "data": chunk_text}
+
+ return {"reasoning": getattr(response.choices[0].message, "reasoning_content", ""),
+ "response": getattr(response.choices[0].message, "content", "")}
+
+
+ class OpenRouterInferenceEngine(OpenAICompatibleInferenceEngine):
+ def __init__(self, model:str, api_key:str=None, base_url:str="https://openrouter.ai/api/v1", config:LLMConfig=None, **kwrs):
+ """
+ OpenRouter OpenAI-compatible server inference engine.
+
+ Parameters:
+ ----------
+ model_name : str
+ model name as shown in the vLLM server
+ api_key : str, Optional
+ the API key for the vLLM server. If None, will use the key in os.environ['OPENROUTER_API_KEY'].
+ base_url : str, Optional
+ the base url for the vLLM server.
+ config : LLMConfig
+ the LLM configuration.
+ """
+ self.api_key = api_key
+ if self.api_key is None:
+ self.api_key = os.getenv("OPENROUTER_API_KEY")
+ super().__init__(model, self.api_key, base_url, config, **kwrs)
+
+ def _format_response(self, response: Any) -> Dict[str, str]:
+ """
+ This method format the response from OpenAI API to a dict with keys "type" and "data".
+
+ Parameters:
+ ----------
+ response : Any
+ the response from OpenAI-compatible API. Could be a dict, generator, or object.
+ """
+ if isinstance(response, self.ChatCompletionChunk):
+ if hasattr(response.choices[0].delta, "reasoning") and getattr(response.choices[0].delta, "reasoning") is not None:
+ chunk_text = getattr(response.choices[0].delta, "reasoning", "")
+ if chunk_text is None:
+ chunk_text = ""
+ return {"type": "reasoning", "data": chunk_text}
+ else:
+ chunk_text = getattr(response.choices[0].delta, "content", "")
+ if chunk_text is None:
+ chunk_text = ""
+ return {"type": "response", "data": chunk_text}
+
+ return {"reasoning": getattr(response.choices[0].message, "reasoning", ""),
+ "response": getattr(response.choices[0].message, "content", "")}
+

  class OpenAIInferenceEngine(InferenceEngine):
  def __init__(self, model:str, config:LLMConfig=None, **kwrs):
  """
- The OpenAI API inference engine. Supports OpenAI models and OpenAI compatible servers:
- - vLLM OpenAI compatible server (https://docs.vllm.ai/en/latest/serving/openai_compatible_server.html)
- - Llama.cpp OpenAI compatible server (https://llama-cpp-python.readthedocs.io/en/latest/server/)
-
+ The OpenAI API inference engine.
  For parameters and documentation, refer to https://platform.openai.com/docs/api-reference/introduction

  Parameters:
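A minimal sketch of the new OpenAI-compatible engines introduced above (model names are placeholders; `OpenRouterInferenceEngine` falls back to `os.environ['OPENROUTER_API_KEY']` when `api_key` is None):

```python
# Illustrative sketch: the OpenAI-compatible engines added in 1.2.4 (model names are placeholders).
from llm_ie.engines import VLLMInferenceEngine, OpenRouterInferenceEngine, ReasoningLLMConfig

vllm_engine = VLLMInferenceEngine(model="Qwen/Qwen3-8B",              # assumed model name
                                  base_url="http://localhost:8000/v1",
                                  config=ReasoningLLMConfig())
result = vllm_engine.chat([{"role": "user", "content": "Extract the diagnosis."}])
# result == {"reasoning": ..., "response": ...}

openrouter_engine = OpenRouterInferenceEngine(model="openai/gpt-4o-mini",  # assumed model name
                                              config=None)  # defaults to BasicLLMConfig
```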
@@ -670,6 +1166,7 @@ class OpenAIInferenceEngine(InferenceEngine):
  raise ImportError("OpenAI Python API library not found. Please install OpanAI (```pip install openai```).")

  from openai import OpenAI, AsyncOpenAI
+ super().__init__(config)
  self.client = OpenAI(**kwrs)
  self.async_client = AsyncOpenAI(**kwrs)
  self.model = model
@@ -687,7 +1184,7 @@ class OpenAIInferenceEngine(InferenceEngine):

  return formatted_params

- def chat(self, messages:List[Dict[str,str]], verbose:bool=False, stream:bool=False) -> Union[Dict[str, str], Generator[Dict[str, str], None, None]]:
+ def chat(self, messages:List[Dict[str,str]], verbose:bool=False, stream:bool=False, messages_logger:MessagesLogger=None) -> Union[Dict[str, str], Generator[Dict[str, str], None, None]]:
  """
  This method inputs chat messages and outputs LLM generated text.

@@ -699,6 +1196,8 @@ class OpenAIInferenceEngine(InferenceEngine):
  if True, VLM generated text will be printed in terminal in real-time.
  stream : bool, Optional
  if True, returns a generator that yields the output in real-time.
+ messages_logger : MessagesLogger, Optional
+ the message logger that logs the chat messages.

  Returns:
  -------
@@ -715,13 +1214,25 @@ class OpenAIInferenceEngine(InferenceEngine):
  stream=True,
  **self.formatted_params
  )
+ res_text = ""
  for chunk in response_stream:
  if len(chunk.choices) > 0:
- if chunk.choices[0].delta.content is not None:
- yield chunk.choices[0].delta.content
+ chunk_text = chunk.choices[0].delta.content
+ if chunk_text is not None:
+ res_text += chunk_text
+ yield chunk_text
  if chunk.choices[0].finish_reason == "length":
  warnings.warn("Model stopped generating due to context length limit.", RuntimeWarning)

+ # Postprocess response
+ res_dict = self.config.postprocess_response(res_text)
+ # Write to messages log
+ if messages_logger:
+ processed_messages.append({"role": "assistant",
+ "content": res_dict.get("response", ""),
+ "reasoning": res_dict.get("reasoning", "")})
+ messages_logger.log_messages(processed_messages)
+
  return self.config.postprocess_response(_stream_generator())

  elif verbose:
@@ -741,7 +1252,7 @@ class OpenAIInferenceEngine(InferenceEngine):
  warnings.warn("Model stopped generating due to context length limit.", RuntimeWarning)

  print('\n')
- return self.config.postprocess_response(res)
+
  else:
  response = self.client.chat.completions.create(
  model=self.model,
@@ -750,10 +1261,20 @@ class OpenAIInferenceEngine(InferenceEngine):
  **self.formatted_params
  )
  res = response.choices[0].message.content
- return self.config.postprocess_response(res)
+
+ # Postprocess response
+ res_dict = self.config.postprocess_response(res)
+ # Write to messages log
+ if messages_logger:
+ processed_messages.append({"role": "assistant",
+ "content": res_dict.get("response", ""),
+ "reasoning": res_dict.get("reasoning", "")})
+ messages_logger.log_messages(processed_messages)
+
+ return res_dict


- async def chat_async(self, messages:List[Dict[str,str]]) -> Dict[str,str]:
+ async def chat_async(self, messages:List[Dict[str,str]], messages_logger:MessagesLogger=None) -> Dict[str,str]:
  """
  Async version of chat method. Streaming is not supported.
  """
@@ -770,7 +1291,16 @@ class OpenAIInferenceEngine(InferenceEngine):
  warnings.warn("Model stopped generating due to context length limit.", RuntimeWarning)

  res = response.choices[0].message.content
- return self.config.postprocess_response(res)
+ # Postprocess response
+ res_dict = self.config.postprocess_response(res)
+ # Write to messages log
+ if messages_logger:
+ processed_messages.append({"role": "assistant",
+ "content": res_dict.get("response", ""),
+ "reasoning": res_dict.get("reasoning", "")})
+ messages_logger.log_messages(processed_messages)
+
+ return res_dict


  class AzureOpenAIInferenceEngine(OpenAIInferenceEngine):
@@ -825,6 +1355,7 @@ class LiteLLMInferenceEngine(InferenceEngine):
  raise ImportError("litellm not found. Please install litellm (```pip install litellm```).")

  import litellm
+ super().__init__(config)
  self.litellm = litellm
  self.model = model
  self.base_url = base_url
@@ -843,7 +1374,7 @@ class LiteLLMInferenceEngine(InferenceEngine):

  return formatted_params

- def chat(self, messages:List[Dict[str,str]], verbose:bool=False, stream:bool=False) -> Union[Dict[str,str], Generator[Dict[str, str], None, None]]:
+ def chat(self, messages:List[Dict[str,str]], verbose:bool=False, stream:bool=False, messages_logger:MessagesLogger=None) -> Union[Dict[str,str], Generator[Dict[str, str], None, None]]:
  """
  This method inputs chat messages and outputs LLM generated text.

@@ -855,6 +1386,8 @@ class LiteLLMInferenceEngine(InferenceEngine):
  if True, VLM generated text will be printed in terminal in real-time.
  stream : bool, Optional
  if True, returns a generator that yields the output in real-time.
+ messages_logger: MessagesLogger, Optional
+ a messages logger that logs the messages.

  Returns:
  -------
@@ -873,12 +1406,22 @@ class LiteLLMInferenceEngine(InferenceEngine):
  api_key=self.api_key,
  **self.formatted_params
  )
-
+ res_text = ""
  for chunk in response_stream:
  chunk_content = chunk.get('choices')[0].get('delta').get('content')
  if chunk_content:
+ res_text += chunk_content
  yield chunk_content

+ # Postprocess response
+ res_dict = self.config.postprocess_response(res_text)
+ # Write to messages log
+ if messages_logger:
+ processed_messages.append({"role": "assistant",
+ "content": res_dict.get("response", ""),
+ "reasoning": res_dict.get("reasoning", "")})
+ messages_logger.log_messages(processed_messages)
+
  return self.config.postprocess_response(_stream_generator())

  elif verbose:
@@ -897,8 +1440,6 @@ class LiteLLMInferenceEngine(InferenceEngine):
  if chunk_content:
  res += chunk_content
  print(chunk_content, end='', flush=True)
-
- return self.config.postprocess_response(res)

  else:
  response = self.litellm.completion(
@@ -910,9 +1451,19 @@ class LiteLLMInferenceEngine(InferenceEngine):
  **self.formatted_params
  )
  res = response.choices[0].message.content
- return self.config.postprocess_response(res)
+
+ # Postprocess response
+ res_dict = self.config.postprocess_response(res)
+ # Write to messages log
+ if messages_logger:
+ processed_messages.append({"role": "assistant",
+ "content": res_dict.get("response", ""),
+ "reasoning": res_dict.get("reasoning", "")})
+ messages_logger.log_messages(processed_messages)
+
+ return res_dict

- async def chat_async(self, messages:List[Dict[str,str]]) -> Dict[str,str]:
+ async def chat_async(self, messages:List[Dict[str,str]], messages_logger:MessagesLogger=None) -> Dict[str,str]:
  """
  Async version of chat method. Streaming is not supported.
  """
@@ -928,4 +1479,13 @@ class LiteLLMInferenceEngine(InferenceEngine):
  )

  res = response.get('choices')[0].get('message').get('content')
- return self.config.postprocess_response(res)
+
+ # Postprocess response
+ res_dict = self.config.postprocess_response(res)
+ # Write to messages log
+ if messages_logger:
+ processed_messages.append({"role": "assistant",
+ "content": res_dict.get("response", ""),
+ "reasoning": res_dict.get("reasoning", "")})
+ messages_logger.log_messages(processed_messages)
+ return res_dict