vlm4ocr 0.3.0__py3-none-any.whl → 0.3.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- vlm4ocr/__init__.py +5 -1
- vlm4ocr/cli.py +4 -13
- vlm4ocr/data_types.py +17 -5
- vlm4ocr/ocr_engines.py +30 -13
- vlm4ocr/vlm_engines.py +639 -46
- {vlm4ocr-0.3.0.dist-info → vlm4ocr-0.3.1.dist-info}/METADATA +1 -1
- {vlm4ocr-0.3.0.dist-info → vlm4ocr-0.3.1.dist-info}/RECORD +9 -9
- {vlm4ocr-0.3.0.dist-info → vlm4ocr-0.3.1.dist-info}/WHEEL +0 -0
- {vlm4ocr-0.3.0.dist-info → vlm4ocr-0.3.1.dist-info}/entry_points.txt +0 -0
vlm4ocr/__init__.py
CHANGED
@@ -1,11 +1,15 @@
 from .ocr_engines import OCREngine
-from .vlm_engines import BasicVLMConfig, OpenAIReasoningVLMConfig, OllamaVLMEngine, OpenAIVLMEngine, AzureOpenAIVLMEngine
+from .vlm_engines import BasicVLMConfig, ReasoningVLMConfig, OpenAIReasoningVLMConfig, OllamaVLMEngine, OpenAICompatibleVLMEngine, VLLMVLMEngine, OpenRouterVLMEngine, OpenAIVLMEngine, AzureOpenAIVLMEngine

 __all__ = [
     "BasicVLMConfig",
+    "ReasoningVLMConfig",
     "OpenAIReasoningVLMConfig",
     "OCREngine",
     "OllamaVLMEngine",
+    "OpenAICompatibleVLMEngine",
+    "VLLMVLMEngine",
+    "OpenRouterVLMEngine",
     "OpenAIVLMEngine",
     "AzureOpenAIVLMEngine"
 ]
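For orientation, the sketch below (not part of the package) shows how the enlarged 0.3.1 public API might be wired together. Only the engine and config names come from the exports above; the OCREngine keyword arguments are assumptions.

# Hypothetical usage sketch; OCREngine's constructor arguments are assumed, not shown in this diff.
from vlm4ocr import OCREngine, OpenAICompatibleVLMEngine, ReasoningVLMConfig

config = ReasoningVLMConfig()  # splits <think>...</think> reasoning from the OCR text
vlm = OpenAICompatibleVLMEngine(model="my-vlm", api_key="EMPTY",
                                base_url="http://localhost:8000/v1", config=config)
ocr = OCREngine(vlm_engine=vlm, output_mode="markdown")  # assumed signature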
vlm4ocr/cli.py
CHANGED
@@ -4,18 +4,9 @@ import sys
 import logging
 import asyncio
 import time
-
-
-try:
-    from .ocr_engines import OCREngine
-    from .vlm_engines import OpenAIVLMEngine, AzureOpenAIVLMEngine, OllamaVLMEngine, BasicVLMConfig
-    from .data_types import OCRResult
-except ImportError:
-    # Fallback for when the package is installed
-    from vlm4ocr.ocr_engines import OCREngine
-    from vlm4ocr.vlm_engines import OpenAIVLMEngine, AzureOpenAIVLMEngine, OllamaVLMEngine, BasicVLMConfig
-    from vlm4ocr.data_types import OCRResult
-
+from .ocr_engines import OCREngine
+from .vlm_engines import OpenAICompatibleVLMEngine, OpenAIVLMEngine, AzureOpenAIVLMEngine, OllamaVLMEngine, BasicVLMConfig
+from .data_types import OCRResult
 import tqdm.asyncio

 # --- Global logger setup (console) ---
@@ -208,7 +199,7 @@ def main():
         vlm_engine_instance = OpenAIVLMEngine(model=args.model, api_key=args.api_key, config=config)
     elif args.vlm_engine == "openai_compatible":
         if not args.base_url: parser.error("--base_url is required for openai_compatible.")
-        vlm_engine_instance =
+        vlm_engine_instance = OpenAICompatibleVLMEngine(model=args.model, api_key=args.api_key, base_url=args.base_url, config=config)
     elif args.vlm_engine == "azure_openai":
         if not args.azure_api_key: parser.error("--azure_api_key (or AZURE_OPENAI_API_KEY) is required.")
         if not args.azure_endpoint: parser.error("--azure_endpoint (or AZURE_OPENAI_ENDPOINT) is required.")
vlm4ocr/data_types.py
CHANGED
@@ -1,5 +1,5 @@
 import os
-from typing import List, Literal
+from typing import List, Dict, Literal
 from dataclasses import dataclass, field
 from vlm4ocr.utils import get_default_page_delimiter

@@ -24,6 +24,7 @@ class OCRResult:
     pages: List[dict] = field(default_factory=list)
     filename: str = field(init=False)
     status: str = field(init=False, default="processing")
+    messages_log: List[List[Dict[str,str]]] = field(default_factory=list)

     def __post_init__(self):
         """
@@ -67,10 +68,6 @@ class OCRResult:
         }
         self.pages.append(page)

-
-    def __len__(self):
-        return len(self.pages)
-
     def get_page(self, idx):
         if not isinstance(idx, int):
             raise ValueError("Index must be an integer")
@@ -78,6 +75,21 @@ class OCRResult:
             raise IndexError(f"Index out of range. The OCRResult has {len(self.pages)} pages, but index {idx} was requested.")

         return self.pages[idx]
+
+    def clear_messages_log(self):
+        self.messages_log = []
+
+    def add_messages_to_log(self, messages: List[Dict[str,str]]):
+        if not isinstance(messages, list):
+            raise ValueError("messages must be a list of dict")
+
+        self.messages_log.extend(messages)
+
+    def get_messages_log(self) -> List[List[Dict[str,str]]]:
+        return self.messages_log.copy()
+
+    def __len__(self):
+        return len(self.pages)

     def __iter__(self):
         return iter(self.pages)
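The new messages_log field and its accessors above make it possible to inspect the exact chat exchanges behind an OCR run. A small hypothetical sketch, assuming `result` is an OCRResult produced by OCREngine in 0.3.1:

# Hypothetical: each entry in the log is one conversation (a list of role/content dicts).
for conversation in result.get_messages_log():
    for message in conversation:
        # assistant entries may also carry a "reasoning" field (see vlm_engines.py below)
        print(message["role"], str(message.get("content", ""))[:80])

result.clear_messages_log()  # drop the log once it has been inspected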
vlm4ocr/ocr_engines.py
CHANGED
@@ -6,7 +6,7 @@ from colorama import Fore, Style
 import json
 from vlm4ocr.utils import DataLoader, PDFDataLoader, TIFFDataLoader, ImageDataLoader, ImageProcessor, clean_markdown, extract_json, get_default_page_delimiter
 from vlm4ocr.data_types import OCRResult
-from vlm4ocr.vlm_engines import VLMEngine
+from vlm4ocr.vlm_engines import VLMEngine, MessagesLogger

 SUPPORTED_IMAGE_EXTS = ['.pdf', '.tif', '.tiff', '.png', '.jpg', '.jpeg', '.bmp', '.gif', '.webp']

@@ -126,7 +126,8 @@ class OCREngine:
                 stream=True
             )
             for chunk in response_stream:
-
+                if chunk["type"] == "response":
+                    yield {"type": "ocr_chunk", "data": chunk["data"]}

             if i < len(images) - 1:
                 yield {"type": "page_delimiter", "data": get_default_page_delimiter(self.output_mode)}
@@ -157,7 +158,8 @@ class OCREngine:
                 stream=True
             )
             for chunk in response_stream:
-
+                if chunk["type"] == "response":
+                    yield {"type": "ocr_chunk", "data": chunk["data"]}


     def sequential_ocr(self, file_paths: Union[str, Iterable[str]], rotate_correction:bool=False,
@@ -271,24 +273,32 @@ class OCREngine:

             try:
                 messages = self.vlm_engine.get_ocr_messages(self.system_prompt, self.user_prompt, image)
+                # Define a messages logger to capture messages
+                messages_logger = MessagesLogger()
+                # Generate response
                 response = self.vlm_engine.chat(
                     messages,
                     verbose=verbose,
-                    stream=False
+                    stream=False,
+                    messages_logger=messages_logger
                 )
+                ocr_text = response["response"]
                 # Clean the response if output mode is markdown
                 if self.output_mode == "markdown":
-
+                    ocr_text = clean_markdown(ocr_text)

                 # Parse the response if output mode is JSON
-
-                    json_list = extract_json(
+                elif self.output_mode == "JSON":
+                    json_list = extract_json(ocr_text)
                     # Serialize the JSON list to a string
-
+                    ocr_text = json.dumps(json_list, indent=4)

                 # Add the page to the OCR result
-                ocr_result.add_page(text=
+                ocr_result.add_page(text=ocr_text,
                                     image_processing_status=image_processing_status)
+
+                # Add messages log to the OCR result
+                ocr_result.add_messages_to_log(messages_logger.get_messages_log())

             except Exception as page_e:
                 ocr_result.status = "error"
@@ -387,6 +397,7 @@ class OCREngine:
         filename = os.path.basename(file_path)
         file_ext = os.path.splitext(file_path)[1].lower()
         result = OCRResult(input_dir=file_path, output_mode=self.output_mode)
+        messages_logger = MessagesLogger()
        # check file extension
         if file_ext not in SUPPORTED_IMAGE_EXTS:
             result.status = "error"
@@ -416,7 +427,8 @@ class OCREngine:
                     data_loader=data_loader,
                     page_index=page_index,
                     rotate_correction=rotate_correction,
-                    max_dimension_pixels=max_dimension_pixels
+                    max_dimension_pixels=max_dimension_pixels,
+                    messages_logger=messages_logger
                 )
                 page_processing_tasks.append(task)

@@ -428,14 +440,17 @@ class OCREngine:
         except Exception as e:
             result.status = "error"
             result.add_page(text=f"Error during OCR for {filename}: {str(e)}", image_processing_status={})
+            result.add_messages_to_log(messages_logger.get_messages_log())
             return result

         # Set status to success if no errors occurred
         result.status = "success"
+        result.add_messages_to_log(messages_logger.get_messages_log())
         return result

     async def _ocr_page_with_semaphore(self, vlm_call_semaphore: asyncio.Semaphore, data_loader: DataLoader,
-                                       page_index:int, rotate_correction:bool=False, max_dimension_pixels:int=None
+                                       page_index:int, rotate_correction:bool=False, max_dimension_pixels:int=None,
+                                       messages_logger:MessagesLogger=None) -> Tuple[str, Dict[str, str]]:
         """
         This internal method takes a semaphore and OCR a single image/page using the VLM inference engine.

@@ -476,15 +491,17 @@ class OCREngine:
                 }

             messages = self.vlm_engine.get_ocr_messages(self.system_prompt, self.user_prompt, image)
-
+            response = await self.vlm_engine.chat_async(
                 messages,
+                messages_logger=messages_logger
             )
+            ocr_text = response["response"]
             # Clean the OCR text if output mode is markdown
             if self.output_mode == "markdown":
                 ocr_text = clean_markdown(ocr_text)

             # Parse the response if output mode is JSON
-
+            elif self.output_mode == "JSON":
                 json_list = extract_json(ocr_text)
                 # Serialize the JSON list to a string
                 ocr_text = json.dumps(json_list, indent=4)
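To make the flow above concrete, here is a hypothetical end-to-end sketch (not from the package) of the path a MessagesLogger travels from a chat call into an OCRResult, using only interfaces visible in this diff (OpenAICompatibleVLMEngine is defined in vlm_engines.py below). The model name, base URL, prompt strings, and file name are placeholders.

from PIL import Image
from vlm4ocr.vlm_engines import OpenAICompatibleVLMEngine, MessagesLogger
from vlm4ocr.data_types import OCRResult

vlm = OpenAICompatibleVLMEngine(model="my-vlm", api_key="EMPTY",
                                base_url="http://localhost:8000/v1")
image = Image.open("page_1.png")

logger = MessagesLogger()
messages = vlm.get_ocr_messages("You are an OCR engine.", "Transcribe this page.", image)
reply = vlm.chat(messages, verbose=False, stream=False, messages_logger=logger)

result = OCRResult(input_dir="page_1.png", output_mode="markdown")
result.add_page(text=reply["response"], image_processing_status={})
result.add_messages_to_log(logger.get_messages_log())  # same call OCREngine now makes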
vlm4ocr/vlm_engines.py
CHANGED
@@ -2,6 +2,8 @@ import abc
 import importlib.util
 from typing import Any, List, Dict, Union, Generator
 import warnings
+import os
+import re
 from PIL import Image
 from vlm4ocr.utils import image_to_base64

@@ -33,7 +35,7 @@ class VLMConfig(abc.ABC):
         return NotImplemented

     @abc.abstractmethod
-    def postprocess_response(self, response:Union[str, Generator[str, None, None]]) -> Union[str, Generator[str, None, None]]:
+    def postprocess_response(self, response:Union[str, Dict[str, str], Generator[str, None, None]]) -> Union[str, Generator[str, None, None]]:
        """
         This method postprocesses the VLM response after it is generated.

@@ -77,7 +79,7 @@ class BasicVLMConfig(VLMConfig):
         """
         return messages

-    def postprocess_response(self, response:Union[str, Generator[str, None, None]]) -> Union[str, Generator[str, None, None]]:
+    def postprocess_response(self, response:Union[str, Dict[str, str], Generator[str, None, None]]) -> Union[Dict[str, str], Generator[Dict[str, str], None, None]]:
         """
         This method postprocesses the VLM response after it is generated.

@@ -88,19 +90,121 @@ class BasicVLMConfig(VLMConfig):

         Returns: Union[str, Generator[Dict[str, str], None, None]]
             the postprocessed VLM response.
-            if input is a generator, the output will be a generator {"data": <content>}.
+            if input is a generator, the output will be a generator {"type": "response", "data": <content>}.
         """
         if isinstance(response, str):
-            return response
+            return {"response": response}
+
+        elif isinstance(response, dict):
+            if "response" in response:
+                return response
+            else:
+                warnings.warn(f"Invalid response dict keys: {response.keys()}. Returning default empty dict.", UserWarning)
+                return {"response": ""}

         def _process_stream():
             for chunk in response:
-
+                if isinstance(chunk, dict):
+                    yield chunk
+                elif isinstance(chunk, str):
+                    yield {"type": "response", "data": chunk}

         return _process_stream()
+
+class ReasoningVLMConfig(VLMConfig):
+    def __init__(self, thinking_token_start="<think>", thinking_token_end="</think>", **kwargs):
+        """
+        The general configuration for reasoning vision models.
+        """
+        super().__init__(**kwargs)
+        self.thinking_token_start = thinking_token_start
+        self.thinking_token_end = thinking_token_end
+
+    def preprocess_messages(self, messages:List[Dict[str,str]]) -> List[Dict[str,str]]:
+        """
+        This method preprocesses the input messages before passing them to the VLM.
+
+        Parameters:
+        ----------
+        messages : List[Dict[str,str]]
+            a list of dict with role and content. role must be one of {"system", "user", "assistant"}
+
+        Returns:
+        -------
+        messages : List[Dict[str,str]]
+            a list of dict with role and content. role must be one of {"system", "user", "assistant"}
+        """
+        return messages.copy()

+    def postprocess_response(self, response:Union[str, Dict[str, str], Generator[str, None, None]]) -> Union[Dict[str,str], Generator[Dict[str,str], None, None]]:
+        """
+        This method postprocesses the VLM response after it is generated.
+        1. If input is a string, it will extract the reasoning and response based on the thinking tokens.
+        2. If input is a dict, it should contain keys "reasoning" and "response". This is for inference engines that already parse reasoning and response.
+        3. If input is a generator,
+            a. if the chunk is a dict, it should contain keys "type" and "data". This is for inference engines that already parse reasoning and response.
+            b. if the chunk is a string, it will yield dicts with keys "type" and "data" based on the thinking tokens.

-class OpenAIReasoningVLMConfig(VLMConfig):
+        Parameters:
+        ----------
+        response : Union[str, Generator[str, None, None]]
+            the VLM response. Can be a string or a generator.
+
+        Returns:
+        -------
+        response : Union[str, Generator[str, None, None]]
+            the postprocessed LLM response as a dict {"reasoning": <reasoning>, "response": <content>}
+            if input is a generator, the output will be a generator {"type": <reasoning or response>, "data": <content>}.
+        """
+        if isinstance(response, str):
+            # get contents between thinking_token_start and thinking_token_end
+            pattern = f"{re.escape(self.thinking_token_start)}(.*?){re.escape(self.thinking_token_end)}"
+            match = re.search(pattern, response, re.DOTALL)
+            reasoning = match.group(1) if match else ""
+            # get response AFTER thinking_token_end
+            response = re.sub(f".*?{self.thinking_token_end}", "", response, flags=re.DOTALL).strip()
+            return {"reasoning": reasoning, "response": response}
+
+        elif isinstance(response, dict):
+            if "reasoning" in response and "response" in response:
+                return response
+            else:
+                warnings.warn(f"Invalid response dict keys: {response.keys()}. Returning default empty dict.", UserWarning)
+                return {"reasoning": "", "response": ""}
+
+        elif isinstance(response, Generator):
+            def _process_stream():
+                think_flag = False
+                buffer = ""
+                for chunk in response:
+                    if isinstance(chunk, dict):
+                        yield chunk
+
+                    elif isinstance(chunk, str):
+                        buffer += chunk
+                        # switch between reasoning and response
+                        if self.thinking_token_start in buffer:
+                            think_flag = True
+                            buffer = buffer.replace(self.thinking_token_start, "")
+                        elif self.thinking_token_end in buffer:
+                            think_flag = False
+                            buffer = buffer.replace(self.thinking_token_end, "")
+
+                        # if chunk is in thinking block, tag it as reasoning; else tag it as response
+                        if chunk not in [self.thinking_token_start, self.thinking_token_end]:
+                            if think_flag:
+                                yield {"type": "reasoning", "data": chunk}
+                            else:
+                                yield {"type": "response", "data": chunk}
+
+            return _process_stream()
+
+        else:
+            warnings.warn(f"Invalid response type: {type(response)}. Returning default empty dict.", UserWarning)
+            return {"reasoning": "", "response": ""}
+
+
+class OpenAIReasoningVLMConfig(ReasoningVLMConfig):
     def __init__(self, reasoning_effort:str="low", **kwargs):
         """
         The OpenAI "o" series configuration.
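As a quick, hypothetical check of the behaviour added above (not part of the package), ReasoningVLMConfig.postprocess_response is expected to split a <think>-tagged reply into reasoning and response; the sample text is made up.

from vlm4ocr.vlm_engines import ReasoningVLMConfig

config = ReasoningVLMConfig()  # defaults: "<think>" / "</think>"
raw = "<think>The page looks like a two-column table.</think>| Name | Qty |"
out = config.postprocess_response(raw)
# expected: {"reasoning": "The page looks like a two-column table.",
#            "response": "| Name | Qty |"}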
@@ -160,27 +264,31 @@ class OpenAIReasoningVLMConfig(VLMConfig)

         return new_messages

-    def postprocess_response(self, response:Union[str, Generator[str, None, None]]) -> Union[str, Generator[Dict[str, str], None, None]]:
-        """
-        This method postprocesses the VLM response after it is generated.

-
-
-        response : Union[str, Generator[str, None, None]]
-            the VLM response. Can be a string or a generator.
-
-        Returns: Union[str, Generator[Dict[str, str], None, None]]
-            the postprocessed VLM response.
-            if input is a generator, the output will be a generator {"type": "response", "data": <content>}.
+class MessagesLogger:
+    def __init__(self):
         """
-
-
+        This class is used to log the messages for InferenceEngine.chat().
+        """
+        self.messages_log = []

-
-
-
+    def log_messages(self, messages : List[Dict[str,str]]):
+        """
+        This method logs the messages to a list.
+        """
+        self.messages_log.append(messages)

-
+    def get_messages_log(self) -> List[List[Dict[str,str]]]:
+        """
+        This method returns a copy of the current messages log
+        """
+        return self.messages_log.copy()
+
+    def clear_messages_log(self):
+        """
+        This method clears the current messages log
+        """
+        self.messages_log.clear()


 class VLMEngine:
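Based on the logging calls added to the engines below, one logged conversation is expected to look roughly like the following; the prompt strings are placeholders, and image payloads are replaced with "[image]" before logging.

# Hypothetical shape of one entry returned by MessagesLogger.get_messages_log()
logged_conversation = [
    {"role": "system", "content": "You are an OCR engine."},
    {"role": "user", "content": [
        {"type": "image_url", "image_url": {"url": "[image]", "detail": "high"}},
        {"type": "text", "text": "Transcribe this page."},
    ]},
    {"role": "assistant", "content": "OCR text ...", "reasoning": ""},
]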
@@ -198,7 +306,8 @@ class VLMEngine:
         return NotImplemented

     @abc.abstractmethod
-    def chat(self, messages:List[Dict[str,str]], verbose:bool=False, stream:bool=False
+    def chat(self, messages:List[Dict[str,str]], verbose:bool=False, stream:bool=False,
+             messages_logger:MessagesLogger=None) -> Union[Dict[str, str], Generator[Dict[str, str], None, None]]:
         """
         This method inputs chat messages and outputs VLM generated text.

@@ -210,11 +319,13 @@ class VLMEngine:
             if True, VLM generated text will be printed in terminal in real-time.
         stream : bool, Optional
             if True, returns a generator that yields the output in real-time.
+        Messages_logger : MessagesLogger, Optional
+            the message logger that logs the chat messages.
         """
         return NotImplemented

     @abc.abstractmethod
-    def chat_async(self, messages:List[Dict[str,str]]) -> str:
+    def chat_async(self, messages:List[Dict[str,str]], messages_logger:MessagesLogger=None) -> Dict[str, str]:
         """
         The async version of chat method. Streaming is not supported.
         """
@@ -285,7 +396,8 @@ class OllamaVLMEngine(VLMEngine):

         return formatted_params

-    def chat(self, messages:List[Dict[str,str]], verbose:bool=False, stream:bool=False
+    def chat(self, messages:List[Dict[str,str]], verbose:bool=False, stream:bool=False,
+             messages_logger:MessagesLogger=None) -> Union[Dict[str,str], Generator[Dict[str, str], None, None]]:
         """
         This method inputs chat messages and outputs VLM generated text.

@@ -297,6 +409,13 @@ class OllamaVLMEngine(VLMEngine):
             if True, VLM generated text will be printed in terminal in real-time.
         stream : bool, Optional
             if True, returns a generator that yields the output in real-time.
+        Messages_logger : MessagesLogger, Optional
+            the message logger that logs the chat messages.
+
+        Returns:
+        -------
+        response : Union[Dict[str,str], Generator[Dict[str, str], None, None]]
+            a dict {"reasoning": <reasoning>, "response": <response>} or Generator {"type": <reasoning or response>, "data": <content>}
         """
         processed_messages = self.config.preprocess_messages(messages)

@@ -310,10 +429,33 @@ class OllamaVLMEngine(VLMEngine):
                     stream=True,
                     keep_alive=self.keep_alive
                 )
+                res = {"reasoning": "", "response": ""}
                 for chunk in response_stream:
-
-
-
+                    if hasattr(chunk.message, 'thinking') and chunk.message.thinking:
+                        content_chunk = getattr(getattr(chunk, 'message', {}), 'thinking', '')
+                        res["reasoning"] += content_chunk
+                        yield {"type": "reasoning", "data": content_chunk}
+                    else:
+                        content_chunk = getattr(getattr(chunk, 'message', {}), 'content', '')
+                        res["response"] += content_chunk
+                        yield {"type": "response", "data": content_chunk}
+
+                    if chunk.done_reason == "length":
+                        warnings.warn("Model stopped generating due to context length limit.", RuntimeWarning)
+
+                # Postprocess response
+                res_dict = self.config.postprocess_response(res)
+                # Write to messages log
+                if messages_logger:
+                    # replace images content with a placeholder "[image]" to save space
+                    for messages in processed_messages:
+                        if "images" in messages:
+                            messages["images"] = ["[image]" for _ in messages["images"]]
+
+                    processed_messages.append({"role": "assistant",
+                                               "content": res_dict.get("response", ""),
+                                               "reasoning": res_dict.get("reasoning", "")})
+                    messages_logger.log_messages(processed_messages)

             return self.config.postprocess_response(_stream_generator())

@@ -326,14 +468,29 @@ class OllamaVLMEngine(VLMEngine):
                 keep_alive=self.keep_alive
             )

-            res =
+            res = {"reasoning": "", "response": ""}
+            phase = ""
             for chunk in response:
-
+                if hasattr(chunk.message, 'thinking') and chunk.message.thinking:
+                    if phase != "reasoning":
+                        print("\n--- Reasoning ---")
+                        phase = "reasoning"
+
+                    content_chunk = getattr(getattr(chunk, 'message', {}), 'thinking', '')
+                    res["reasoning"] += content_chunk
+                else:
+                    if phase != "response":
+                        print("\n--- Response ---")
+                        phase = "response"
+                    content_chunk = getattr(getattr(chunk, 'message', {}), 'content', '')
+                    res["response"] += content_chunk
+
                 print(content_chunk, end='', flush=True)
-
+
+                if chunk.done_reason == "length":
+                    warnings.warn("Model stopped generating due to context length limit.", RuntimeWarning)
             print('\n')
-
-
+
         else:
             response = self.client.chat(
                 model=self.model_name,
@@ -342,11 +499,30 @@ class OllamaVLMEngine(VLMEngine):
                 stream=False,
                 keep_alive=self.keep_alive
             )
-            res = response
-
+            res = {"reasoning": getattr(getattr(response, 'message', {}), 'thinking', ''),
+                   "response": getattr(getattr(response, 'message', {}), 'content', '')}
+
+            if response.done_reason == "length":
+                warnings.warn("Model stopped generating due to context length limit.", RuntimeWarning)
+
+        # Postprocess response
+        res_dict = self.config.postprocess_response(res)
+        # Write to messages log
+        if messages_logger:
+            # replace images content with a placeholder "[image]" to save space
+            for messages in processed_messages:
+                if "images" in messages:
+                    messages["images"] = ["[image]" for _ in messages["images"]]
+
+            processed_messages.append({"role": "assistant",
+                                       "content": res_dict.get("response", ""),
+                                       "reasoning": res_dict.get("reasoning", "")})
+            messages_logger.log_messages(processed_messages)
+
+        return res_dict


-    async def chat_async(self, messages:List[Dict[str,str]]) -> str:
+    async def chat_async(self, messages:List[Dict[str,str]], messages_logger:MessagesLogger=None) -> Dict[str,str]:
         """
         Async version of chat method. Streaming is not supported.
         """
@@ -360,8 +536,26 @@ class OllamaVLMEngine(VLMEngine):
             keep_alive=self.keep_alive
         )

-        res = response
-
+        res = {"reasoning": getattr(getattr(response, 'message', {}), 'thinking', ''),
+               "response": getattr(getattr(response, 'message', {}), 'content', '')}
+
+        if response.done_reason == "length":
+            warnings.warn("Model stopped generating due to context length limit.", RuntimeWarning)
+        # Postprocess response
+        res_dict = self.config.postprocess_response(res)
+        # Write to messages log
+        if messages_logger:
+            # replace images content with a placeholder "[image]" to save space
+            for messages in processed_messages:
+                if "images" in messages:
+                    messages["images"] = ["[image]" for _ in messages["images"]]
+
+            processed_messages.append({"role": "assistant",
+                                       "content": res_dict.get("response", ""),
+                                       "reasoning": res_dict.get("reasoning", "")})
+            messages_logger.log_messages(processed_messages)
+
+        return res_dict

     def get_ocr_messages(self, system_prompt:str, user_prompt:str, image:Image.Image) -> List[Dict[str,str]]:
         """
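A brief, hypothetical consumer of the streaming protocol above (not from the package): chat(stream=True) now yields dicts tagged as reasoning or response, so a caller can route the two separately. Here `vlm` is assumed to be an already constructed OllamaVLMEngine and `messages` a list produced by get_ocr_messages().

for chunk in vlm.chat(messages, stream=True):
    if chunk["type"] == "reasoning":
        pass                                      # e.g. hide or log the chain of thought
    elif chunk["type"] == "response":
        print(chunk["data"], end="", flush=True)  # the OCR text itself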
@@ -387,6 +581,346 @@ class OllamaVLMEngine(VLMEngine):
         ]


+class OpenAICompatibleVLMEngine(VLMEngine):
+    def __init__(self, model:str, api_key:str, base_url:str, config:VLMConfig=None, **kwrs):
+        """
+        General OpenAI-compatible server inference engine.
+        https://docs.vllm.ai/en/latest/serving/openai_compatible_server.html
+
+        For parameters and documentation, refer to https://platform.openai.com/docs/api-reference/introduction
+
+        Parameters:
+        ----------
+        model_name : str
+            model name as shown in the vLLM server
+        api_key : str
+            the API key for the vLLM server.
+        base_url : str
+            the base url for the vLLM server.
+        config : LLMConfig
+            the LLM configuration.
+        """
+        if importlib.util.find_spec("openai") is None:
+            raise ImportError("OpenAI Python API library not found. Please install OpanAI (```pip install openai```).")
+
+        from openai import OpenAI, AsyncOpenAI
+        from openai.types.chat import ChatCompletionChunk
+        self.ChatCompletionChunk = ChatCompletionChunk
+        super().__init__(config)
+        self.client = OpenAI(api_key=api_key, base_url=base_url, **kwrs)
+        self.async_client = AsyncOpenAI(api_key=api_key, base_url=base_url, **kwrs)
+        self.model = model
+        self.config = config if config else BasicVLMConfig()
+        self.formatted_params = self._format_config()
+
+    def _format_config(self) -> Dict[str, Any]:
+        """
+        This method format the VLM configuration with the correct key for the inference engine.
+        """
+        formatted_params = self.config.params.copy()
+        if "max_new_tokens" in formatted_params:
+            formatted_params["max_completion_tokens"] = formatted_params["max_new_tokens"]
+            formatted_params.pop("max_new_tokens")
+
+        return formatted_params
+
+
+    def _format_response(self, response: Any) -> Dict[str, str]:
+        """
+        This method format the response from OpenAI API to a dict with keys "type" and "data".
+
+        Parameters:
+        ----------
+        response : Any
+            the response from OpenAI-compatible API. Could be a dict, generator, or object.
+        """
+        if isinstance(response, self.ChatCompletionChunk):
+            chunk_text = getattr(response.choices[0].delta, "content", "")
+            if chunk_text is None:
+                chunk_text = ""
+            return {"type": "response", "data": chunk_text}
+
+        return {"response": getattr(response.choices[0].message, "content", "")}
+
+    def chat(self, messages:List[Dict[str,str]], verbose:bool=False, stream:bool=False,
+             messages_logger:MessagesLogger=None) -> Union[Dict[str, str], Generator[Dict[str, str], None, None]]:
+        """
+        This method inputs chat messages and outputs LLM generated text.
+
+        Parameters:
+        ----------
+        messages : List[Dict[str,str]]
+            a list of dict with role and content. role must be one of {"system", "user", "assistant"}
+        verbose : bool, Optional
+            if True, VLM generated text will be printed in terminal in real-time.
+        stream : bool, Optional
+            if True, returns a generator that yields the output in real-time.
+        messages_logger : MessagesLogger, Optional
+            the message logger that logs the chat messages.
+
+        Returns:
+        -------
+        response : Union[Dict[str,str], Generator[Dict[str, str], None, None]]
+            a dict {"reasoning": <reasoning>, "response": <response>} or Generator {"type": <reasoning or response>, "data": <content>}
+        """
+        processed_messages = self.config.preprocess_messages(messages)
+
+        if stream:
+            def _stream_generator():
+                response_stream = self.client.chat.completions.create(
+                    model=self.model,
+                    messages=processed_messages,
+                    stream=True,
+                    **self.formatted_params
+                )
+                res_text = ""
+                for chunk in response_stream:
+                    if len(chunk.choices) > 0:
+                        chunk_dict = self._format_response(chunk)
+                        yield chunk_dict
+
+                        res_text += chunk_dict["data"]
+                        if chunk.choices[0].finish_reason == "length":
+                            warnings.warn("Model stopped generating due to context length limit.", RuntimeWarning)
+
+                # Postprocess response
+                res_dict = self.config.postprocess_response(res_text)
+                # Write to messages log
+                if messages_logger:
+                    # replace images content with a placeholder "[image]" to save space
+                    for messages in processed_messages:
+                        if "content" in messages and isinstance(messages["content"], list):
+                            for content in messages["content"]:
+                                if isinstance(content, dict) and content.get("type") == "image_url":
+                                    content["image_url"]["url"] = "[image]"
+
+                    processed_messages.append({"role": "assistant",
+                                               "content": res_dict.get("response", ""),
+                                               "reasoning": res_dict.get("reasoning", "")})
+                    messages_logger.log_messages(processed_messages)
+
+            return self.config.postprocess_response(_stream_generator())
+
+        elif verbose:
+            response = self.client.chat.completions.create(
+                model=self.model,
+                messages=processed_messages,
+                stream=True,
+                **self.formatted_params
+            )
+            res = {"reasoning": "", "response": ""}
+            phase = ""
+            for chunk in response:
+                if len(chunk.choices) > 0:
+                    chunk_dict = self._format_response(chunk)
+                    chunk_text = chunk_dict["data"]
+                    res[chunk_dict["type"]] += chunk_text
+                    if phase != chunk_dict["type"] and chunk_text != "":
+                        print(f"\n--- {chunk_dict['type'].capitalize()} ---")
+                        phase = chunk_dict["type"]
+
+                    print(chunk_text, end="", flush=True)
+                    if chunk.choices[0].finish_reason == "length":
+                        warnings.warn("Model stopped generating due to context length limit.", RuntimeWarning)
+
+            print('\n')
+
+        else:
+            response = self.client.chat.completions.create(
+                model=self.model,
+                messages=processed_messages,
+                stream=False,
+                **self.formatted_params
+            )
+            res = self._format_response(response)
+
+            if response.choices[0].finish_reason == "length":
+                warnings.warn("Model stopped generating due to context length limit.", RuntimeWarning)
+
+        # Postprocess response
+        res_dict = self.config.postprocess_response(res)
+        # Write to messages log
+        if messages_logger:
+            # replace images content with a placeholder "[image]" to save space
+            for messages in processed_messages:
+                if "content" in messages and isinstance(messages["content"], list):
+                    for content in messages["content"]:
+                        if isinstance(content, dict) and content.get("type") == "image_url":
+                            content["image_url"]["url"] = "[image]"
+
+            processed_messages.append({"role": "assistant",
+                                       "content": res_dict.get("response", ""),
+                                       "reasoning": res_dict.get("reasoning", "")})
+            messages_logger.log_messages(processed_messages)
+
+        return res_dict
+
+
+    async def chat_async(self, messages:List[Dict[str,str]], messages_logger:MessagesLogger=None) -> Dict[str,str]:
+        """
+        Async version of chat method. Streaming is not supported.
+        """
+        processed_messages = self.config.preprocess_messages(messages)
+
+        response = await self.async_client.chat.completions.create(
+            model=self.model,
+            messages=processed_messages,
+            stream=False,
+            **self.formatted_params
+        )
+
+        if response.choices[0].finish_reason == "length":
+            warnings.warn("Model stopped generating due to context length limit.", RuntimeWarning)
+
+        res = self._format_response(response)
+
+        # Postprocess response
+        res_dict = self.config.postprocess_response(res)
+        # Write to messages log
+        if messages_logger:
+            # replace images content with a placeholder "[image]" to save space
+            for messages in processed_messages:
+                if "content" in messages and isinstance(messages["content"], list):
+                    for content in messages["content"]:
+                        if isinstance(content, dict) and content.get("type") == "image_url":
+                            content["image_url"]["url"] = "[image]"
+
+            processed_messages.append({"role": "assistant",
+                                       "content": res_dict.get("response", ""),
+                                       "reasoning": res_dict.get("reasoning", "")})
+            messages_logger.log_messages(processed_messages)
+
+        return res_dict
+
+    def get_ocr_messages(self, system_prompt:str, user_prompt:str, image:Image.Image, format:str='png', detail:str="high") -> List[Dict[str,str]]:
+        """
+        This method inputs an image and returns the correesponding chat messages for the inference engine.
+
+        Parameters:
+        ----------
+        system_prompt : str
+            the system prompt.
+        user_prompt : str
+            the user prompt.
+        image : Image.Image
+            the image for OCR.
+        format : str, Optional
+            the image format.
+        detail : str, Optional
+            the detail level of the image. Default is "high".
+        """
+        base64_str = image_to_base64(image)
+        return [
+            {"role": "system", "content": system_prompt},
+            {
+                "role": "user",
+                "content": [
+                    {
+                        "type": "image_url",
+                        "image_url": {
+                            "url": f"data:image/{format};base64,{base64_str}",
+                            "detail": detail
+                        },
+                    },
+                    {"type": "text", "text": user_prompt},
+                ],
+            },
+        ]
+
+
+class VLLMVLMEngine(OpenAICompatibleVLMEngine):
+    def __init__(self, model:str, api_key:str="", base_url:str="http://localhost:8000/v1", config:VLMConfig=None, **kwrs):
+        """
+        vLLM OpenAI compatible server inference engine.
+        https://docs.vllm.ai/en/latest/serving/openai_compatible_server.html
+
+        For parameters and documentation, refer to https://platform.openai.com/docs/api-reference/introduction
+
+        Parameters:
+        ----------
+        model_name : str
+            model name as shown in the vLLM server
+        api_key : str, Optional
+            the API key for the vLLM server.
+        base_url : str, Optional
+            the base url for the vLLM server.
+        config : LLMConfig
+            the LLM configuration.
+        """
+        super().__init__(model, api_key, base_url, config, **kwrs)
+
+
+    def _format_response(self, response: Any) -> Dict[str, str]:
+        """
+        This method format the response from OpenAI API to a dict with keys "type" and "data".
+
+        Parameters:
+        ----------
+        response : Any
+            the response from OpenAI-compatible API. Could be a dict, generator, or object.
+        """
+        if isinstance(response, self.ChatCompletionChunk):
+            if hasattr(response.choices[0].delta, "reasoning_content") and getattr(response.choices[0].delta, "reasoning_content") is not None:
+                chunk_text = getattr(response.choices[0].delta, "reasoning_content", "")
+                if chunk_text is None:
+                    chunk_text = ""
+                return {"type": "reasoning", "data": chunk_text}
+            else:
+                chunk_text = getattr(response.choices[0].delta, "content", "")
+                if chunk_text is None:
+                    chunk_text = ""
+                return {"type": "response", "data": chunk_text}
+
+        return {"reasoning": getattr(response.choices[0].message, "reasoning_content", ""),
+                "response": getattr(response.choices[0].message, "content", "")}
+
+
+class OpenRouterVLMEngine(OpenAICompatibleVLMEngine):
+    def __init__(self, model:str, api_key:str=None, base_url:str="https://openrouter.ai/api/v1", config:VLMConfig=None, **kwrs):
+        """
+        OpenRouter OpenAI-compatible server inference engine.
+
+        Parameters:
+        ----------
+        model_name : str
+            model name as shown in the vLLM server
+        api_key : str, Optional
+            the API key for the vLLM server. If None, will use the key in os.environ['OPENROUTER_API_KEY'].
+        base_url : str, Optional
+            the base url for the vLLM server.
+        config : LLMConfig
+            the LLM configuration.
+        """
+        self.api_key = api_key
+        if self.api_key is None:
+            self.api_key = os.getenv("OPENROUTER_API_KEY")
+        super().__init__(model, self.api_key, base_url, config, **kwrs)
+
+    def _format_response(self, response: Any) -> Dict[str, str]:
+        """
+        This method format the response from OpenAI API to a dict with keys "type" and "data".
+
+        Parameters:
+        ----------
+        response : Any
+            the response from OpenAI-compatible API. Could be a dict, generator, or object.
+        """
+        if isinstance(response, self.ChatCompletionChunk):
+            if hasattr(response.choices[0].delta, "reasoning") and getattr(response.choices[0].delta, "reasoning") is not None:
+                chunk_text = getattr(response.choices[0].delta, "reasoning", "")
+                if chunk_text is None:
+                    chunk_text = ""
+                return {"type": "reasoning", "data": chunk_text}
+            else:
+                chunk_text = getattr(response.choices[0].delta, "content", "")
+                if chunk_text is None:
+                    chunk_text = ""
+                return {"type": "response", "data": chunk_text}
+
+        return {"reasoning": getattr(response.choices[0].message, "reasoning", ""),
+                "response": getattr(response.choices[0].message, "content", "")}
+
+
 class OpenAIVLMEngine(VLMEngine):
     def __init__(self, model:str, config:VLMConfig=None, **kwrs):
         """
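For context, a hypothetical sketch (not from the package) of constructing the three engines added above; the model names are placeholders.

from vlm4ocr.vlm_engines import (OpenAICompatibleVLMEngine, VLLMVLMEngine,
                                 OpenRouterVLMEngine, ReasoningVLMConfig)

# Any server exposing the OpenAI chat-completions API:
generic = OpenAICompatibleVLMEngine(model="my-vlm", api_key="EMPTY",
                                    base_url="http://my-server:8080/v1")

# vLLM: base_url defaults to http://localhost:8000/v1 and reasoning_content fields are parsed.
local = VLLMVLMEngine(model="Qwen/Qwen2.5-VL-7B-Instruct")

# OpenRouter: falls back to os.environ["OPENROUTER_API_KEY"] when api_key is None.
routed = OpenRouterVLMEngine(model="qwen/qwen2.5-vl-72b-instruct", config=ReasoningVLMConfig())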
@@ -423,7 +957,7 @@ class OpenAIVLMEngine(VLMEngine):

         return formatted_params

-    def chat(self, messages:List[Dict[str,str]], verbose:bool=False, stream:bool=False) -> Union[str, Generator[str, None, None]]:
+    def chat(self, messages:List[Dict[str,str]], verbose:bool=False, stream:bool=False, messages_logger:MessagesLogger=None) -> Union[Dict[str, str], Generator[Dict[str, str], None, None]]:
         """
         This method inputs chat messages and outputs LLM generated text.

@@ -435,6 +969,13 @@ class OpenAIVLMEngine(VLMEngine):
             if True, VLM generated text will be printed in terminal in real-time.
         stream : bool, Optional
             if True, returns a generator that yields the output in real-time.
+        messages_logger : MessagesLogger, Optional
+            the message logger that logs the chat messages.
+
+        Returns:
+        -------
+        response : Union[Dict[str,str], Generator[Dict[str, str], None, None]]
+            a dict {"reasoning": <reasoning>, "response": <response>} or Generator {"type": <reasoning or response>, "data": <content>}
         """
         processed_messages = self.config.preprocess_messages(messages)

@@ -446,13 +987,32 @@ class OpenAIVLMEngine(VLMEngine):
                     stream=True,
                     **self.formatted_params
                 )
+                res_text = ""
                 for chunk in response_stream:
                     if len(chunk.choices) > 0:
-
-
+                        chunk_text = chunk.choices[0].delta.content
+                        if chunk_text is not None:
+                            res_text += chunk_text
+                            yield chunk_text
                        if chunk.choices[0].finish_reason == "length":
                            warnings.warn("Model stopped generating due to context length limit.", RuntimeWarning)

+                # Postprocess response
+                res_dict = self.config.postprocess_response(res_text)
+                # Write to messages log
+                if messages_logger:
+                    # replace images content with a placeholder "[image]" to save space
+                    for messages in processed_messages:
+                        if "content" in messages and isinstance(messages["content"], list):
+                            for content in messages["content"]:
+                                if isinstance(content, dict) and content.get("type") == "image_url":
+                                    content["image_url"]["url"] = "[image]"
+
+                    processed_messages.append({"role": "assistant",
+                                               "content": res_dict.get("response", ""),
+                                               "reasoning": res_dict.get("reasoning", "")})
+                    messages_logger.log_messages(processed_messages)
+
             return self.config.postprocess_response(_stream_generator())

         elif verbose:
@@ -472,7 +1032,7 @@ class OpenAIVLMEngine(VLMEngine):
                         warnings.warn("Model stopped generating due to context length limit.", RuntimeWarning)

             print('\n')
-
+
         else:
             response = self.client.chat.completions.create(
                 model=self.model,
@@ -481,10 +1041,27 @@ class OpenAIVLMEngine(VLMEngine):
                 **self.formatted_params
             )
             res = response.choices[0].message.content
-
+
+        # Postprocess response
+        res_dict = self.config.postprocess_response(res)
+        # Write to messages log
+        if messages_logger:
+            # replace images content with a placeholder "[image]" to save space
+            for messages in processed_messages:
+                if "content" in messages and isinstance(messages["content"], list):
+                    for content in messages["content"]:
+                        if isinstance(content, dict) and content.get("type") == "image_url":
+                            content["image_url"]["url"] = "[image]"
+
+            processed_messages.append({"role": "assistant",
+                                       "content": res_dict.get("response", ""),
+                                       "reasoning": res_dict.get("reasoning", "")})
+            messages_logger.log_messages(processed_messages)
+
+        return res_dict


-    async def chat_async(self, messages:List[Dict[str,str]]) -> str:
+    async def chat_async(self, messages:List[Dict[str,str]], messages_logger:MessagesLogger=None) -> Dict[str,str]:
         """
         Async version of chat method. Streaming is not supported.
         """
@@ -501,7 +1078,23 @@ class OpenAIVLMEngine(VLMEngine):
            warnings.warn("Model stopped generating due to context length limit.", RuntimeWarning)

        res = response.choices[0].message.content
-
+        # Postprocess response
+        res_dict = self.config.postprocess_response(res)
+        # Write to messages log
+        if messages_logger:
+            # replace images content with a placeholder "[image]" to save space
+            for messages in processed_messages:
+                if "content" in messages and isinstance(messages["content"], list):
+                    for content in messages["content"]:
+                        if isinstance(content, dict) and content.get("type") == "image_url":
+                            content["image_url"]["url"] = "[image]"
+
+            processed_messages.append({"role": "assistant",
+                                       "content": res_dict.get("response", ""),
+                                       "reasoning": res_dict.get("reasoning", "")})
+            messages_logger.log_messages(processed_messages)
+
+        return res_dict

     def get_ocr_messages(self, system_prompt:str, user_prompt:str, image:Image.Image, format:str='png', detail:str="high") -> List[Dict[str,str]]:
         """
{vlm4ocr-0.3.0.dist-info → vlm4ocr-0.3.1.dist-info}/RECORD
CHANGED
@@ -1,4 +1,4 @@
-vlm4ocr/__init__.py,sha256=
+vlm4ocr/__init__.py,sha256=NpJ-jquqaXo-uHPcMOYUtqToLWLxixftPQn7epD2XbY,506
 vlm4ocr/assets/default_prompt_templates/ocr_HTML_system_prompt.txt,sha256=igPOntiLDZXTB71-QrTmMJveb6XC1TgArg1serPc9V8,547
 vlm4ocr/assets/default_prompt_templates/ocr_HTML_user_prompt.txt,sha256=cVn538JojZfCtIhfrcOPWt0dO7dtDqgB9xdS_5VvAqo,41
 vlm4ocr/assets/default_prompt_templates/ocr_JSON_system_prompt.txt,sha256=v-fUw53gkngc_dz9TMH2abALDsAEZfe-zJ2u3-SO4ck,417
@@ -6,12 +6,12 @@ vlm4ocr/assets/default_prompt_templates/ocr_markdown_system_prompt.txt,sha256=pI
 vlm4ocr/assets/default_prompt_templates/ocr_markdown_user_prompt.txt,sha256=61EJv8POsQGIIUVwCjDU73lMXJE7F3qhPIYl6zSbl1Q,45
 vlm4ocr/assets/default_prompt_templates/ocr_text_system_prompt.txt,sha256=WbLSOerqFjlYGaGWJ-w2enhky1WhnPl011s0fgRPgnQ,398
 vlm4ocr/assets/default_prompt_templates/ocr_text_user_prompt.txt,sha256=ftgNAIPy_UlrcY6m7-IkH2ApHkCzRnymra1w2wg60Ks,47
-vlm4ocr/cli.py,sha256=
-vlm4ocr/data_types.py,sha256=
-vlm4ocr/ocr_engines.py,sha256=
+vlm4ocr/cli.py,sha256=mq5fbJQvgUm89Vd9v2SIW9ARsGex-8V46-r3-evjYrs,19966
+vlm4ocr/data_types.py,sha256=BOcq5KsZFJ_-Fxb9A4IJfOd0x5u-1tUQkYbWAJayuPM,4416
+vlm4ocr/ocr_engines.py,sha256=up7p9xGIeBdwQgqChlr7lsTMWTVFtSWzwlFZp2wKAxk,25431
 vlm4ocr/utils.py,sha256=nQhUskOze99wCVMKmvsen0dhq-9NdN4EPC_bdYfkjgA,13611
-vlm4ocr/vlm_engines.py,sha256=
-vlm4ocr-0.3.
-vlm4ocr-0.3.
-vlm4ocr-0.3.
-vlm4ocr-0.3.
+vlm4ocr/vlm_engines.py,sha256=rfb4P1fhpY6ClC27FMhYCWOaIjCipZCx3gPrNnDbF0w,50209
+vlm4ocr-0.3.1.dist-info/METADATA,sha256=_l03maaznCHetgYPATohqd_yFJenWE57sdw_JLaVmc0,710
+vlm4ocr-0.3.1.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
+vlm4ocr-0.3.1.dist-info/entry_points.txt,sha256=qzWUk_QTZ12cH4DLjjfqce89EAlOydD85dreRRZF3K4,44
+vlm4ocr-0.3.1.dist-info/RECORD,,
{vlm4ocr-0.3.0.dist-info → vlm4ocr-0.3.1.dist-info}/WHEEL
File without changes
{vlm4ocr-0.3.0.dist-info → vlm4ocr-0.3.1.dist-info}/entry_points.txt
File without changes