vlm4ocr 0.3.1__tar.gz → 0.4.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: vlm4ocr
-Version: 0.3.1
+Version: 0.4.0
 Summary: Python package and Web App for OCR with vision language models.
 License: MIT
 Author: Enshuo (David) Hsu
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "vlm4ocr"
-version = "0.3.1"
+version = "0.4.0"
 description = "Python package and Web App for OCR with vision language models."
 authors = ["Enshuo (David) Hsu"]
 license = "MIT"
@@ -1,7 +1,9 @@
+from .data_types import FewShotExample
 from .ocr_engines import OCREngine
 from .vlm_engines import BasicVLMConfig, ReasoningVLMConfig, OpenAIReasoningVLMConfig, OllamaVLMEngine, OpenAICompatibleVLMEngine, VLLMVLMEngine, OpenRouterVLMEngine, OpenAIVLMEngine, AzureOpenAIVLMEngine
 
 __all__ = [
+    "FewShotExample",
     "BasicVLMConfig",
     "ReasoningVLMConfig",
     "OpenAIReasoningVLMConfig",
@@ -15,7 +15,12 @@ logging.basicConfig(
     format='%(asctime)s - %(levelname)s: %(message)s',
     datefmt='%Y-%m-%d %H:%M:%S'
 )
+# Get our specific logger for CLI messages.
 logger = logging.getLogger("vlm4ocr_cli")
+# Get the logger that receives captured warnings. By default, captured
+# warnings are logged to a logger named 'py.warnings'.
+warnings_logger = logging.getLogger('py.warnings')
+
 
 SUPPORTED_IMAGE_EXTS_CLI = ['.pdf', '.tif', '.tiff', '.png', '.jpg', '.jpeg', '.bmp', '.gif', '.webp']
 OUTPUT_EXTENSIONS = {'markdown': '.md', 'HTML':'.html', 'text':'.txt'}
@@ -56,17 +61,26 @@ def setup_file_logger(log_dir, timestamp_str, debug_mode):
     log_file_path = os.path.join(log_dir, log_file_name)
 
     file_handler = logging.FileHandler(log_file_path, mode='a')
-    formatter = logging.Formatter('%(asctime)s - %(levelname)s - [%(filename)s:%(lineno)d] - %(message)s', datefmt='%Y-%m-%d %H:%M:%S')
+    formatter = logging.Formatter('%(asctime)s - %(levelname)s - [%(name)s:%(filename)s:%(lineno)d] - %(message)s', datefmt='%Y-%m-%d %H:%M:%S')
     file_handler.setFormatter(formatter)
 
     log_level = logging.DEBUG if debug_mode else logging.INFO
     file_handler.setLevel(log_level)
 
-    logger.addHandler(file_handler)
+    # Add the handler to the root logger so it captures all logs (from our
+    # logger and from the warnings logger 'py.warnings').
+    root_logger = logging.getLogger()
+    root_logger.addHandler(file_handler)
+
+    # We still configure our specific logger's level for console output.
     logger.info(f"Logging to file: {log_file_path}")
 
 
 def main():
+    # Capture warnings from the 'warnings' module (like RuntimeWarning)
+    # and redirect them to the 'logging' system.
+    logging.captureWarnings(True)
+
     parser = argparse.ArgumentParser(
         description="VLM4OCR: Perform OCR on images, PDFs, or TIFF files using Vision Language Models. Processing is concurrent by default.",
         formatter_class=argparse.ArgumentDefaultsHelpFormatter
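
For context, `logging.captureWarnings(True)` is standard-library behavior: warnings raised through the `warnings` module are rerouted to the 'py.warnings' logger, so any handler attached to the root logger (such as the file handler installed by `setup_file_logger` above) records them. A minimal self-contained sketch:

    import logging
    import warnings

    logging.basicConfig(level=logging.INFO)
    logging.captureWarnings(True)  # reroute warnings.warn(...) into logging

    # No longer printed to stderr directly; emitted as a WARNING record
    # on the 'py.warnings' logger and handled by root-logger handlers.
    warnings.warn("deprecated code path", RuntimeWarning)
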
@@ -94,7 +108,8 @@ def main():
     vlm_engine_group.add_argument("--vlm_engine", choices=["openai", "azure_openai", "ollama", "openai_compatible"], required=True, help="VLM engine.")
     vlm_engine_group.add_argument("--model", required=True, help="Model identifier for the VLM engine.")
     vlm_engine_group.add_argument("--max_new_tokens", type=int, default=4096, help="Max new tokens for VLM.")
-    vlm_engine_group.add_argument("--temperature", type=float, default=0.0, help="Sampling temperature.")
+    vlm_engine_group.add_argument("--temperature", type=float, default=None, help="Sampling temperature.")
+    vlm_engine_group.add_argument("--top_p", type=float, default=None, help="Sampling top-p.")
 
     openai_group = parser.add_argument_group("OpenAI & OpenAI-Compatible Options")
     openai_group.add_argument("--api_key", default=os.environ.get("OPENAI_API_KEY"), help="API key.")
@@ -135,16 +150,23 @@ def main():
     current_timestamp_str = time.strftime("%Y%m%d_%H%M%S")
 
     # --- Configure Logger Level based on args ---
+    # Get the root logger to control the global level for libraries.
+    root_logger = logging.getLogger()
+
     if args.debug:
-        logger.setLevel(logging.DEBUG)
-        # Set root logger to DEBUG only if our specific logger is DEBUG, to avoid overly verbose library logs unless intended.
-        if logger.getEffectiveLevel() <= logging.DEBUG:
-            logging.getLogger().setLevel(logging.DEBUG)
+        logger.setLevel(logging.DEBUG)           # our logger to DEBUG
+        warnings_logger.setLevel(logging.DEBUG)  # warnings logger to DEBUG
+        root_logger.setLevel(logging.DEBUG)      # root to DEBUG
         logger.debug("Debug mode enabled for console.")
     else:
-        logger.setLevel(logging.INFO) # Default for our CLI's own messages
-        logging.getLogger().setLevel(logging.WARNING) # Keep external libraries quieter by default
-
+        logger.setLevel(logging.INFO)            # our logger to INFO
+        warnings_logger.setLevel(logging.INFO)   # warnings logger to INFO
+        root_logger.setLevel(logging.WARNING)    # root to WARNING (quieter libraries)
+        # The console handler (from basicConfig) lives on the root logger, so
+        # setting root to WARNING keeps the console quiet. Our logger
+        # (vlm4ocr_cli) stays at INFO, so a file handler added later still
+        # receives INFO messages from it.
+
     if args.concurrent_batch_size < 1:
         parser.error("--concurrent_batch_size must be 1 or greater.")
 
@@ -183,6 +205,15 @@ def main():
     # --- Setup File Logger (if --log is specified) ---
     if args.log:
         setup_file_logger(effective_output_dir, current_timestamp_str, args.debug)
+        # When logging to a file and not in debug mode, keep the console less
+        # verbose by raising the console handler's level.
+        if not args.debug:
+            # Find the console handler (from basicConfig) and set its level.
+            for handler in root_logger.handlers:
+                if isinstance(handler, logging.StreamHandler) and handler.stream == sys.stderr:
+                    handler.setLevel(logging.WARNING)
+                    logger.debug("Set console handler level to WARNING.")
+                    break
 
     logger.debug(f"Parsed arguments: {args}")
 
@@ -190,9 +221,11 @@ def main():
     vlm_engine_instance = None
     try:
         logger.info(f"Initializing VLM engine: {args.vlm_engine} with model: {args.model}")
+        logger.info(f"max_new_tokens: {args.max_new_tokens}, temperature: {args.temperature}, top_p: {args.top_p}")
         config = BasicVLMConfig(
             max_new_tokens=args.max_new_tokens,
-            temperature=args.temperature
+            temperature=args.temperature,
+            top_p=args.top_p
         )
         if args.vlm_engine == "openai":
             if not args.api_key: parser.error("--api_key (or OPENAI_API_KEY) is required for OpenAI.")
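
A short sketch of the new sampling defaults, assuming `BasicVLMConfig` treats `None` as "defer to the backend's default" (the `None` handling inside the config class is not shown in this diff):

    from vlm4ocr import BasicVLMConfig

    # 0.4.0 defaults both sampling knobs to None (the CLI previously forced
    # temperature=0.0), leaving sampling behavior to the VLM backend.
    config = BasicVLMConfig(max_new_tokens=4096, temperature=None, top_p=None)

    # Explicit values still work as before:
    greedy = BasicVLMConfig(max_new_tokens=4096, temperature=0.0, top_p=1.0)
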
@@ -286,16 +319,34 @@ def main():
     # console verbosity controlled by logger level.
     show_progress_bar = (num_actual_files > 0)
 
+    # Show the progress bar only when there are files to process and we are
+    # not in debug mode (debug logs would interfere with the bar). When
+    # logging to a file the console can stay quiet; when NOT logging to a
+    # file the console handler must be at INFO for the bar to be useful.
+    disable_bar = args.debug or not show_progress_bar
+
+    # If not logging to a file AND not in debug mode, bring the console to INFO.
+    if not args.log and not args.debug:
+        for handler in logging.getLogger().handlers:
+            if isinstance(handler, logging.StreamHandler) and handler.stream == sys.stderr:
+                handler.setLevel(logging.INFO)
+                logger.debug("Set console handler level to INFO for progress bar.")
+                break
+
     iterator_wrapper = tqdm.asyncio.tqdm(
         ocr_task_generator,
         total=num_actual_files,
         desc="Processing files",
         unit="file",
-        disable=not show_progress_bar # disable if no files, or can remove this disable if tqdm handles total=0
+        disable=disable_bar
     )
 
     async for result_object in iterator_wrapper:
         if not isinstance(result_object, OCRResult):
+            # This warning is now captured by the file log as well.
             logger.warning(f"Received unexpected data type: {type(result_object)}")
             continue
 
@@ -314,9 +365,12 @@ def main():
                 content_to_write = result_object.to_string()
                 with open(current_ocr_output_file_path, "w", encoding="utf-8") as f:
                     f.write(content_to_write)
-                # Log less verbosely to console if progress bar is active
-                if not show_progress_bar or logger.getEffectiveLevel() <= logging.DEBUG:
-                    logger.info(f"OCR result for '{input_file_path_from_result}' saved to: {current_ocr_output_file_path}")
+
+                # Always log success info: it goes to the file log when active,
+                # and stays off the console when the console level is WARNING.
+                logger.info(f"OCR result for '{input_file_path_from_result}' saved to: {current_ocr_output_file_path}")
+
             except Exception as e:
                 logger.error(f"Error writing output for '{input_file_path_from_result}' to '{current_ocr_output_file_path}': {e}")
 
@@ -1,7 +1,8 @@
 import os
 from typing import List, Dict, Literal
+from PIL import Image
 from dataclasses import dataclass, field
-from vlm4ocr.utils import get_default_page_delimiter
+from vlm4ocr.utils import get_default_page_delimiter, ImageProcessor
 
 OutputMode = Literal["markdown", "HTML", "text", "JSON"]
 
@@ -118,4 +119,41 @@ class OCRResult:
         else:
             self.page_delimiter = page_delimiter
 
-        return self.page_delimiter.join([page.get("text", "") for page in self.pages])
+        return self.page_delimiter.join([page.get("text", "") for page in self.pages])
+
+
+@dataclass
+class FewShotExample:
+    """
+    This class represents a few-shot example for OCR tasks.
+
+    Parameters:
+    ----------
+    image : PIL.Image.Image
+        The image associated with the example.
+    text : str
+        The expected OCR result text for the image.
+    rotate_correction : bool, Optional
+        If True, applies rotate correction to the image using pytesseract.
+    max_dimension_pixels : int, Optional
+        The maximum dimension of the image in pixels. The original image is resized to fit. If None, no resizing is applied.
+    """
+    image: Image.Image
+    text: str
+    rotate_correction: bool = False
+    max_dimension_pixels: int = None
+
+    def __post_init__(self):
+        if not isinstance(self.image, Image.Image):
+            raise ValueError("image must be a PIL.Image.Image object")
+        if not isinstance(self.text, str):
+            raise ValueError("text must be a string")
+
+        if self.rotate_correction or self.max_dimension_pixels is not None:
+            self.image_processor = ImageProcessor()
+
+        # Rotate correction if specified
+        if self.rotate_correction:
+            self.image, _ = self.image_processor.rotate_correction(self.image)
+
+        # Resize the image if max_dimension_pixels is specified
+        if self.max_dimension_pixels is not None:
+            self.image, _ = self.image_processor.resize(image=self.image, max_dimension_pixels=self.max_dimension_pixels)
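
A usage sketch for the new `FewShotExample` class; the image path and transcription below are placeholders, and `rotate_correction=True` additionally requires pytesseract:

    from PIL import Image
    from vlm4ocr import FewShotExample

    # Pair a sample page with its expected transcription. Preprocessing
    # (rotation fix, resizing) runs once in __post_init__.
    example = FewShotExample(
        image=Image.open("example_invoice.png"),            # placeholder path
        text="ACME Corp.\nInvoice #0001\nTotal: $120.00",   # placeholder transcription
        rotate_correction=False,
        max_dimension_pixels=2048,
    )
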
@@ -1,11 +1,11 @@
 import os
-from typing import Tuple, List, Dict, Union, Generator, AsyncGenerator, Iterable
+from typing import Any, Tuple, List, Dict, Union, Generator, AsyncGenerator, Iterable
 import importlib
 import asyncio
 from colorama import Fore, Style
 import json
 from vlm4ocr.utils import DataLoader, PDFDataLoader, TIFFDataLoader, ImageDataLoader, ImageProcessor, clean_markdown, extract_json, get_default_page_delimiter
-from vlm4ocr.data_types import OCRResult
+from vlm4ocr.data_types import OCRResult, FewShotExample
 from vlm4ocr.vlm_engines import VLMEngine, MessagesLogger
 
 SUPPORTED_IMAGE_EXTS = ['.pdf', '.tif', '.tiff', '.png', '.jpg', '.jpeg', '.bmp', '.gif', '.webp']
@@ -60,7 +60,8 @@ class OCREngine:
         self.image_processor = ImageProcessor()
 
 
-    def stream_ocr(self, file_path: str, rotate_correction:bool=False, max_dimension_pixels:int=None) -> Generator[Dict[str, str], None, None]:
+    def stream_ocr(self, file_path: str, rotate_correction:bool=False, max_dimension_pixels:int=None,
+                   few_shot_examples:List[FewShotExample]=None) -> Generator[Dict[str, str], None, None]:
         """
         This method inputs a file path (image or PDF) and streams OCR results in real time. This is useful for frontend applications.
         Yields dictionaries with 'type' ('ocr_chunk' or 'page_delimiter') and 'data'.
@@ -73,6 +74,8 @@ class OCREngine:
             If True, applies rotate correction to the images using pytesseract.
         max_dimension_pixels : int, Optional
             The maximum dimension of the image in pixels. The original image is resized to fit. If None, no resizing is applied.
+        few_shot_examples : List[FewShotExample], Optional
+            List of few-shot examples.
 
         Returns:
         --------
@@ -90,10 +93,6 @@ class OCREngine:
         file_ext = os.path.splitext(file_path)[1].lower()
         if file_ext not in SUPPORTED_IMAGE_EXTS:
             raise ValueError(f"Unsupported file type: {file_ext}. Supported types are: {SUPPORTED_IMAGE_EXTS}")
-
-        # Check if image preprocessing can be applied
-        if self.image_processor.has_tesseract==False and rotate_correction:
-            raise ImportError("pytesseract is not installed. Please install it to use rotate correction.")
 
         # PDF or TIFF
         if file_ext in ['.pdf', '.tif', '.tiff']:
@@ -105,8 +104,8 @@ class OCREngine:
 
             # OCR each image
             for i, image in enumerate(images):
-                # Apply rotate correction if specified and tesseract is available
-                if rotate_correction and self.image_processor.has_tesseract:
+                # Apply rotate correction if specified
+                if rotate_correction:
                     try:
                         image, _ = self.image_processor.rotate_correction(image)
 
@@ -120,7 +119,13 @@ class OCREngine:
                 except Exception as e:
                     yield {"type": "info", "data": f"Error resizing image: {str(e)}"}
 
-                messages = self.vlm_engine.get_ocr_messages(self.system_prompt, self.user_prompt, image)
+                # Get OCR messages
+                messages = self.vlm_engine.get_ocr_messages(system_prompt=self.system_prompt,
+                                                            user_prompt=self.user_prompt,
+                                                            image=image,
+                                                            few_shot_examples=few_shot_examples)
+
+                # Stream response
                 response_stream = self.vlm_engine.chat(
                     messages,
                     stream=True
@@ -137,8 +142,8 @@ class OCREngine:
             data_loader = ImageDataLoader(file_path)
             image = data_loader.get_page(0)
 
-            # Apply rotate correction if specified and tesseract is available
-            if rotate_correction and self.image_processor.has_tesseract:
+            # Apply rotate correction if specified
+            if rotate_correction:
                 try:
                     image, _ = self.image_processor.rotate_correction(image)
 
@@ -152,7 +157,12 @@ class OCREngine:
             except Exception as e:
                 yield {"type": "info", "data": f"Error resizing image: {str(e)}"}
 
-            messages = self.vlm_engine.get_ocr_messages(self.system_prompt, self.user_prompt, image)
+            # Get OCR messages
+            messages = self.vlm_engine.get_ocr_messages(system_prompt=self.system_prompt,
+                                                        user_prompt=self.user_prompt,
+                                                        image=image,
+                                                        few_shot_examples=few_shot_examples)
+            # Stream response
             response_stream = self.vlm_engine.chat(
                 messages,
                 stream=True
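
Wiring examples into `stream_ocr` might look like the sketch below; the `OllamaVLMEngine` and `OCREngine` constructor arguments shown are illustrative assumptions, not taken from this diff:

    from vlm4ocr import OCREngine, OllamaVLMEngine

    vlm = OllamaVLMEngine(model="llama3.2-vision")              # assumed signature
    engine = OCREngine(vlm_engine=vlm, output_mode="markdown")  # assumed signature

    for chunk in engine.stream_ocr("scan.pdf", few_shot_examples=[example]):
        if chunk["type"] == "ocr_chunk":
            print(chunk["data"], end="")
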
@@ -163,7 +173,7 @@ class OCREngine:
 
 
     def sequential_ocr(self, file_paths: Union[str, Iterable[str]], rotate_correction:bool=False,
-                       max_dimension_pixels:int=None, verbose:bool=False) -> List[OCRResult]:
+                       max_dimension_pixels:int=None, verbose:bool=False, few_shot_examples:List[FewShotExample]=None) -> List[OCRResult]:
         """
         This method inputs a file path or a list of file paths (image, PDF, TIFF) and performs OCR using the VLM inference engine.
 
@@ -177,6 +187,8 @@ class OCREngine:
             The maximum dimension of the image in pixels. The original image is resized to fit. If None, no resizing is applied.
         verbose : bool, Optional
             If True, the function will print the output in the terminal.
+        few_shot_examples : List[FewShotExample], Optional
+            List of FewShotExample objects.
 
         Returns:
         --------
@@ -186,6 +198,7 @@ class OCREngine:
         if isinstance(file_paths, str):
             file_paths = [file_paths]
 
+        # Iterate through file paths
         ocr_results = []
         for file_path in file_paths:
             # Define OCRResult object
@@ -235,8 +248,8 @@ class OCREngine:
             # OCR images
             for i, image in enumerate(images):
                 image_processing_status = {}
-                # Apply rotate correction if specified and tesseract is available
-                if rotate_correction and self.image_processor.has_tesseract:
+                # Apply rotate correction if specified
+                if rotate_correction:
                     try:
                         image, rotation_angle = self.image_processor.rotate_correction(image)
                         image_processing_status["rotate_correction"] = {
@@ -272,7 +285,10 @@ class OCREngine:
                     print(f"{Fore.RED}Error resizing image for {filename}:{Style.RESET_ALL} {resized['error']}. OCR continues without resizing.")
 
                 try:
-                    messages = self.vlm_engine.get_ocr_messages(self.system_prompt, self.user_prompt, image)
+                    messages = self.vlm_engine.get_ocr_messages(system_prompt=self.system_prompt,
+                                                                user_prompt=self.user_prompt,
+                                                                image=image,
+                                                                few_shot_examples=few_shot_examples)
                     # Define a messages logger to capture messages
                     messages_logger = MessagesLogger()
                     # Generate response
@@ -308,11 +324,12 @@ class OCREngine:
                     print(f"{Fore.RED}Error during OCR for a page in {filename}:{Style.RESET_ALL} {page_e}")
 
             # Add the OCR result to the list
-            ocr_result.status = "success"
+            if ocr_result.status != "error":
+                ocr_result.status = "success"
             ocr_results.append(ocr_result)
 
             if verbose:
-                print(f"{Fore.BLUE}Successfully processed {filename} with {len(ocr_result)} pages.{Style.RESET_ALL}")
+                print(f"{Fore.BLUE}Processed {filename} with {len(ocr_result)} pages.{Style.RESET_ALL}")
                 for page in ocr_result:
                     print(page)
                     print("-" * 80)
@@ -321,7 +338,8 @@ class OCREngine:
 
 
     def concurrent_ocr(self, file_paths: Union[str, Iterable[str]], rotate_correction:bool=False,
-                       max_dimension_pixels:int=None, concurrent_batch_size: int=32, max_file_load: int=None) -> AsyncGenerator[OCRResult, None]:
+                       max_dimension_pixels:int=None, few_shot_examples:List[FewShotExample]=None,
+                       concurrent_batch_size: int=32, max_file_load: int=None) -> AsyncGenerator[OCRResult, None]:
         """
         First completed, first out: input and output order are not guaranteed.
         This method inputs a file path or a list of file paths (image, PDF, TIFF) and performs OCR using the VLM inference engine.
@@ -335,6 +353,8 @@ class OCREngine:
             If True, applies rotate correction to the images using pytesseract.
         max_dimension_pixels : int, Optional
             The maximum dimension of the image in pixels. The original image is resized to fit. If None, no resizing is applied.
+        few_shot_examples : List[FewShotExample], Optional
+            List of FewShotExample objects.
         concurrent_batch_size : int, Optional
             The number of concurrent VLM calls to make.
         max_file_load : int, Optional
@@ -353,18 +373,17 @@ class OCREngine:
 
         if not isinstance(max_file_load, int) or max_file_load <= 0:
             raise ValueError("max_file_load must be a positive integer")
-
-        if self.image_processor.has_tesseract==False and rotate_correction:
-            raise ImportError("pytesseract is not installed. Please install it to use rotate correction.")
 
         return self._ocr_async(file_paths=file_paths,
                                rotate_correction=rotate_correction,
                                max_dimension_pixels=max_dimension_pixels,
+                               few_shot_examples=few_shot_examples,
                                concurrent_batch_size=concurrent_batch_size,
                                max_file_load=max_file_load)
 
 
     async def _ocr_async(self, file_paths: Iterable[str], rotate_correction:bool=False, max_dimension_pixels:int=None,
+                         few_shot_examples:List[FewShotExample]=None,
                          concurrent_batch_size: int=32, max_file_load: int=None) -> AsyncGenerator[OCRResult, None]:
         """
         Internal method to asynchronously process an iterable of file paths.
@@ -380,7 +399,8 @@ class OCREngine:
                 vlm_call_semaphore=vlm_call_semaphore,
                 file_path=file_path,
                 rotate_correction=rotate_correction,
-                max_dimension_pixels=max_dimension_pixels)
+                max_dimension_pixels=max_dimension_pixels,
+                few_shot_examples=few_shot_examples)
             tasks.append(task)
 
 
@@ -389,7 +409,8 @@ class OCREngine:
             yield result
 
     async def _ocr_file_with_semaphore(self, file_load_semaphore:asyncio.Semaphore, vlm_call_semaphore:asyncio.Semaphore,
-                                       file_path:str, rotate_correction:bool=False, max_dimension_pixels:int=None) -> OCRResult:
+                                       file_path:str, rotate_correction:bool=False, max_dimension_pixels:int=None,
+                                       few_shot_examples:List[FewShotExample]=None) -> OCRResult:
         """
         This internal method takes a semaphore and OCRs a single file using the VLM inference engine.
         """
@@ -428,6 +449,7 @@ class OCREngine:
                     page_index=page_index,
                     rotate_correction=rotate_correction,
                     max_dimension_pixels=max_dimension_pixels,
+                    few_shot_examples=few_shot_examples,
                     messages_logger=messages_logger
                 )
                 page_processing_tasks.append(task)
@@ -444,13 +466,14 @@ class OCREngine:
                 return result
 
         # Set status to success if no errors occurred
-        result.status = "success"
+        if result.status != "error":
+            result.status = "success"
         result.add_messages_to_log(messages_logger.get_messages_log())
         return result
 
     async def _ocr_page_with_semaphore(self, vlm_call_semaphore: asyncio.Semaphore, data_loader: DataLoader,
                                        page_index:int, rotate_correction:bool=False, max_dimension_pixels:int=None,
-                                       messages_logger:MessagesLogger=None) -> Tuple[str, Dict[str, str]]:
+                                       few_shot_examples:List[FewShotExample]=None, messages_logger:MessagesLogger=None) -> Tuple[str, Dict[str, str]]:
         """
         This internal method takes a semaphore and OCRs a single image/page using the VLM inference engine.
 
@@ -462,8 +485,8 @@ class OCREngine:
         async with vlm_call_semaphore:
             image = await data_loader.get_page_async(page_index)
             image_processing_status = {}
-            # Apply rotate correction if specified and tesseract is available
-            if rotate_correction and self.image_processor.has_tesseract:
+            # Apply rotate correction if specified
+            if rotate_correction:
                 try:
                     image, rotation_angle = await self.image_processor.rotate_correction_async(image)
                     image_processing_status["rotate_correction"] = {
@@ -490,7 +513,10 @@ class OCREngine:
                         "error": str(e)
                     }
 
-            messages = self.vlm_engine.get_ocr_messages(self.system_prompt, self.user_prompt, image)
+            messages = self.vlm_engine.get_ocr_messages(system_prompt=self.system_prompt,
+                                                        user_prompt=self.user_prompt,
+                                                        image=image,
+                                                        few_shot_examples=few_shot_examples)
             response = await self.vlm_engine.chat_async(
                 messages,
                 messages_logger=messages_logger
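
And the concurrent path, which now threads `few_shot_examples` down to every page-level VLM call and yields results first-completed-first-out:

    import asyncio

    async def run_batch(paths):
        async for result in engine.concurrent_ocr(paths,
                                                  few_shot_examples=[example],
                                                  concurrent_batch_size=8):
            print(result.status)

    asyncio.run(run_batch(["a.pdf", "b.pdf"]))  # placeholder paths
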
@@ -6,6 +6,7 @@ import os
 import re
 from PIL import Image
 from vlm4ocr.utils import image_to_base64
+from vlm4ocr.data_types import FewShotExample
 
 
 class VLMConfig(abc.ABC):
@@ -332,7 +333,7 @@ class VLMEngine:
         return NotImplemented
 
     @abc.abstractmethod
-    def get_ocr_messages(self, system_prompt:str, user_prompt:str, image:Image.Image) -> List[Dict[str,str]]:
+    def get_ocr_messages(self, system_prompt:str, user_prompt:str, image:Image.Image, few_shot_examples:List[FewShotExample]=None) -> List[Dict[str,str]]:
         """
         This method inputs an image and returns the corresponding chat messages for the inference engine.
 
@@ -344,6 +345,8 @@ class VLMEngine:
             the user prompt.
         image : Image.Image
             the image for OCR.
+        few_shot_examples : List[FewShotExample], Optional
+            list of few-shot examples.
         """
         return NotImplemented
 
@@ -557,7 +560,7 @@ class OllamaVLMEngine(VLMEngine):
 
         return res_dict
 
-    def get_ocr_messages(self, system_prompt:str, user_prompt:str, image:Image.Image) -> List[Dict[str,str]]:
+    def get_ocr_messages(self, system_prompt:str, user_prompt:str, image:Image.Image, few_shot_examples:List[FewShotExample]=None) -> List[Dict[str,str]]:
         """
         This method inputs an image and returns the corresponding chat messages for the inference engine.
@@ -569,16 +572,32 @@ class OllamaVLMEngine(VLMEngine):
             the user prompt.
         image : Image.Image
             the image for OCR.
+        few_shot_examples : List[FewShotExample], Optional
+            list of few-shot examples.
         """
         base64_str = image_to_base64(image)
-        return [
-            {"role": "system", "content": system_prompt},
-            {
-                "role": "user",
-                "content": user_prompt,
-                "images": [base64_str]
-            }
-        ]
+        output_messages = []
+        # system message
+        system_message = {"role": "system", "content": system_prompt}
+        output_messages.append(system_message)
+
+        # few-shot examples
+        if few_shot_examples is not None:
+            for example in few_shot_examples:
+                if not isinstance(example, FewShotExample):
+                    raise ValueError("Few-shot example must be a FewShotExample object.")
+
+                example_image_b64 = image_to_base64(example.image)
+                example_user_message = {"role": "user", "content": user_prompt, "images": [example_image_b64]}
+                example_agent_message = {"role": "assistant", "content": example.text}
+                output_messages.append(example_user_message)
+                output_messages.append(example_agent_message)
+
+        # user message
+        user_message = {"role": "user", "content": user_prompt, "images": [base64_str]}
+        output_messages.append(user_message)
+
+        return output_messages
 
 
 class OpenAICompatibleVLMEngine(VLMEngine):
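
With one few-shot example, the Ollama engine now emits an interleaved conversation shaped roughly like this (base64 payloads abbreviated):

    messages = [
        {"role": "system", "content": system_prompt},
        # one user/assistant pair per few-shot example
        {"role": "user", "content": user_prompt, "images": ["<example base64>"]},
        {"role": "assistant", "content": example.text},
        # the actual page to transcribe comes last
        {"role": "user", "content": user_prompt, "images": ["<target base64>"]},
    ]
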
@@ -792,7 +811,8 @@ class OpenAICompatibleVLMEngine(VLMEngine):
 
         return res_dict
 
-    def get_ocr_messages(self, system_prompt:str, user_prompt:str, image:Image.Image, format:str='png', detail:str="high") -> List[Dict[str,str]]:
+    def get_ocr_messages(self, system_prompt:str, user_prompt:str, image:Image.Image, format:str='png',
+                         detail:str="high", few_shot_examples:List[FewShotExample]=None) -> List[Dict[str,str]]:
         """
         This method inputs an image and returns the corresponding chat messages for the inference engine.
 
@@ -808,24 +828,55 @@ class OpenAICompatibleVLMEngine(VLMEngine):
             the image format.
         detail : str, Optional
             the detail level of the image. Default is "high".
+        few_shot_examples : List[FewShotExample], Optional
+            list of few-shot examples.
         """
         base64_str = image_to_base64(image)
-        return [
-            {"role": "system", "content": system_prompt},
-            {
-                "role": "user",
-                "content": [
-                    {
-                        "type": "image_url",
-                        "image_url": {
-                            "url": f"data:image/{format};base64,{base64_str}",
-                            "detail": detail
-                        },
-                    },
-                    {"type": "text", "text": user_prompt},
-                ],
-            },
-        ]
+        output_messages = []
+        # system message
+        system_message = {"role": "system", "content": system_prompt}
+        output_messages.append(system_message)
+
+        # few-shot examples
+        if few_shot_examples is not None:
+            for example in few_shot_examples:
+                if not isinstance(example, FewShotExample):
+                    raise ValueError("Few-shot example must be a FewShotExample object.")
+
+                example_image_b64 = image_to_base64(example.image)
+                example_user_message = {
+                    "role": "user",
+                    "content": [
+                        {
+                            "type": "image_url",
+                            "image_url": {
+                                "url": f"data:image/{format};base64,{example_image_b64}",
+                                "detail": detail
+                            },
+                        },
+                        {"type": "text", "text": user_prompt},
+                    ],
+                }
+                example_agent_message = {"role": "assistant", "content": example.text}
+                output_messages.append(example_user_message)
+                output_messages.append(example_agent_message)
+
+        # user message
+        user_message = {
+            "role": "user",
+            "content": [
+                {
+                    "type": "image_url",
+                    "image_url": {
+                        "url": f"data:image/{format};base64,{base64_str}",
+                        "detail": detail
+                    },
+                },
+                {"type": "text", "text": user_prompt},
+            ],
+        }
+        output_messages.append(user_message)
+        return output_messages
 
 
 class VLLMVLMEngine(OpenAICompatibleVLMEngine):
@@ -1096,7 +1147,8 @@ class OpenAIVLMEngine(VLMEngine):
 
         return res_dict
 
-    def get_ocr_messages(self, system_prompt:str, user_prompt:str, image:Image.Image, format:str='png', detail:str="high") -> List[Dict[str,str]]:
+    def get_ocr_messages(self, system_prompt:str, user_prompt:str, image:Image.Image, format:str='png',
+                         detail:str="high", few_shot_examples:List[FewShotExample]=None) -> List[Dict[str,str]]:
         """
         This method inputs an image and returns the corresponding chat messages for the inference engine.
 
@@ -1112,24 +1164,55 @@ class OpenAIVLMEngine(VLMEngine):
             the image format.
         detail : str, Optional
             the detail level of the image. Default is "high".
+        few_shot_examples : List[FewShotExample], Optional
+            list of few-shot examples.
         """
         base64_str = image_to_base64(image)
-        return [
-            {"role": "system", "content": system_prompt},
-            {
-                "role": "user",
-                "content": [
-                    {
-                        "type": "image_url",
-                        "image_url": {
-                            "url": f"data:image/{format};base64,{base64_str}",
-                            "detail": detail
-                        },
-                    },
-                    {"type": "text", "text": user_prompt},
-                ],
-            },
-        ]
+        output_messages = []
+        # system message
+        system_message = {"role": "system", "content": system_prompt}
+        output_messages.append(system_message)
+
+        # few-shot examples
+        if few_shot_examples is not None:
+            for example in few_shot_examples:
+                if not isinstance(example, FewShotExample):
+                    raise ValueError("Few-shot example must be a FewShotExample object.")
+
+                example_image_b64 = image_to_base64(example.image)
+                example_user_message = {
+                    "role": "user",
+                    "content": [
+                        {
+                            "type": "image_url",
+                            "image_url": {
+                                "url": f"data:image/{format};base64,{example_image_b64}",
+                                "detail": detail
+                            },
+                        },
+                        {"type": "text", "text": user_prompt},
+                    ],
+                }
+                example_agent_message = {"role": "assistant", "content": example.text}
+                output_messages.append(example_user_message)
+                output_messages.append(example_agent_message)
+
+        # user message
+        user_message = {
+            "role": "user",
+            "content": [
+                {
+                    "type": "image_url",
+                    "image_url": {
+                        "url": f"data:image/{format};base64,{base64_str}",
+                        "detail": detail
+                    },
+                },
+                {"type": "text", "text": user_prompt},
+            ],
+        }
+        output_messages.append(user_message)
+        return output_messages
 
 
 class AzureOpenAIVLMEngine(OpenAIVLMEngine):