PyPI - vlm4ocr - Versions diffs - 0.3.0__tar.gz → 0.4.0__tar.gz - Mend

vlm4ocr 0.3.0__tar.gz → 0.4.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (18) hide show

{vlm4ocr-0.3.0 → vlm4ocr-0.4.0}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: vlm4ocr
-Version: 0.3.0
+Version: 0.4.0
 Summary: Python package and Web App for OCR with vision language models.
 License: MIT
 Author: Enshuo (David) Hsu

{vlm4ocr-0.3.0 → vlm4ocr-0.4.0}/pyproject.toml RENAMED Viewed

@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "vlm4ocr"
-version = "0.3.0"
+version = "0.4.0"
 description = "Python package and Web App for OCR with vision language models."
 authors = ["Enshuo (David) Hsu"]
 license = "MIT"

vlm4ocr-0.4.0/vlm4ocr/__init__.py ADDED Viewed

@@ -0,0 +1,17 @@
+from .data_types import FewShotExample
+from .ocr_engines import OCREngine
+from .vlm_engines import BasicVLMConfig, ReasoningVLMConfig, OpenAIReasoningVLMConfig, OllamaVLMEngine, OpenAICompatibleVLMEngine, VLLMVLMEngine, OpenRouterVLMEngine, OpenAIVLMEngine, AzureOpenAIVLMEngine
+__all__ = [
+    "FewShotExample",
+    "BasicVLMConfig",
+    "ReasoningVLMConfig",
+    "OpenAIReasoningVLMConfig",
+    "OCREngine",
+    "OllamaVLMEngine",
+    "OpenAICompatibleVLMEngine",
+    "VLLMVLMEngine",
+    "OpenRouterVLMEngine",
+    "OpenAIVLMEngine",
+    "AzureOpenAIVLMEngine"
+]

{vlm4ocr-0.3.0 → vlm4ocr-0.4.0}/vlm4ocr/cli.py RENAMED Viewed

@@ -4,18 +4,9 @@ import sys
 import logging
 import asyncio
 import time
-# Attempt to import from the local package structure
-try:
-    from .ocr_engines import OCREngine
-    from .vlm_engines import OpenAIVLMEngine, AzureOpenAIVLMEngine, OllamaVLMEngine, BasicVLMConfig
-    from .data_types import OCRResult
-except ImportError:
-    # Fallback for when the package is installed
-    from vlm4ocr.ocr_engines import OCREngine
-    from vlm4ocr.vlm_engines import OpenAIVLMEngine, AzureOpenAIVLMEngine, OllamaVLMEngine, BasicVLMConfig
-    from vlm4ocr.data_types import OCRResult
+from .ocr_engines import OCREngine
+from .vlm_engines import OpenAICompatibleVLMEngine, OpenAIVLMEngine, AzureOpenAIVLMEngine, OllamaVLMEngine, BasicVLMConfig
+from .data_types import OCRResult
 import tqdm.asyncio
 # --- Global logger setup (console) ---
@@ -24,7 +15,12 @@ logging.basicConfig(
     format='%(asctime)s - %(levelname)s: %(message)s',
     datefmt='%Y-%m-%d %H:%M:%S'
 )
+# Get our specific logger for CLI messages
 logger = logging.getLogger("vlm4ocr_cli")
+# Get the logger that will receive captured warnings
+# By default, warnings are logged to a logger named 'py.warnings'
+warnings_logger = logging.getLogger('py.warnings')
 SUPPORTED_IMAGE_EXTS_CLI = ['.pdf', '.tif', '.tiff', '.png', '.jpg', '.jpeg', '.bmp', '.gif', '.webp']
 OUTPUT_EXTENSIONS = {'markdown': '.md', 'HTML':'.html', 'text':'.txt'}
@@ -65,17 +61,26 @@ def setup_file_logger(log_dir, timestamp_str, debug_mode):
     log_file_path = os.path.join(log_dir, log_file_name)
     file_handler = logging.FileHandler(log_file_path, mode='a')
-    formatter = logging.Formatter('%(asctime)s - %(levelname)s - [%(filename)s:%(lineno)d] - %(message)s', datefmt='%Y-%m-%d %H:%M:%S')
+    formatter = logging.Formatter('%(asctime)s - %(levelname)s - [%(name)s:%(filename)s:%(lineno)d] - %(message)s', datefmt='%Y-%m-%d %H:%M:%S')
     file_handler.setFormatter(formatter)
     log_level = logging.DEBUG if debug_mode else logging.INFO
     file_handler.setLevel(log_level)
-    logger.addHandler(file_handler)
+    # Add handler to the root logger to capture all logs (from our logger,
+    # and from the warnings logger 'py.warnings')
+    root_logger = logging.getLogger()
+    root_logger.addHandler(file_handler)
+    # We still configure our specific logger's level for console output
     logger.info(f"Logging to file: {log_file_path}")
 def main():
+    # Capture warnings from the 'warnings' module (like RuntimeWarning)
+    # and redirect them to the 'logging' system.
+    logging.captureWarnings(True)
     parser = argparse.ArgumentParser(
         description="VLM4OCR: Perform OCR on images, PDFs, or TIFF files using Vision Language Models. Processing is concurrent by default.",
         formatter_class=argparse.ArgumentDefaultsHelpFormatter
@@ -103,7 +108,8 @@ def main():
     vlm_engine_group.add_argument("--vlm_engine", choices=["openai", "azure_openai", "ollama", "openai_compatible"], required=True, help="VLM engine.")
     vlm_engine_group.add_argument("--model", required=True, help="Model identifier for the VLM engine.")
     vlm_engine_group.add_argument("--max_new_tokens", type=int, default=4096, help="Max new tokens for VLM.")
-    vlm_engine_group.add_argument("--temperature", type=float, default=0.0, help="Sampling temperature.")
+    vlm_engine_group.add_argument("--temperature", type=float, default=None, help="Sampling temperature.")
+    vlm_engine_group.add_argument("--top_p", type=float, default=None, help="Sampling top p.")
     openai_group = parser.add_argument_group("OpenAI & OpenAI-Compatible Options")
     openai_group.add_argument("--api_key", default=os.environ.get("OPENAI_API_KEY"), help="API key.")
@@ -144,16 +150,23 @@ def main():
     current_timestamp_str = time.strftime("%Y%m%d_%H%M%S")
     # --- Configure Logger Level based on args ---
+    # Get root logger to control global level for libraries
+    root_logger = logging.getLogger()
     if args.debug:
-        logger.setLevel(logging.DEBUG)
-        # Set root logger to DEBUG only if our specific logger is DEBUG, to avoid overly verbose library logs unless intended.
-        if logger.getEffectiveLevel() <= logging.DEBUG:
-            logging.getLogger().setLevel(logging.DEBUG)
+        logger.setLevel(logging.DEBUG) # Our logger to DEBUG
+        warnings_logger.setLevel(logging.DEBUG) # Warnings logger to DEBUG
+        root_logger.setLevel(logging.DEBUG) # Root to DEBUG
         logger.debug("Debug mode enabled for console.")
     else:
-        logger.setLevel(logging.INFO) # Default for our CLI's own messages
-        logging.getLogger().setLevel(logging.WARNING) # Keep external libraries quieter by default
+        logger.setLevel(logging.INFO) # Our logger to INFO
+        warnings_logger.setLevel(logging.INFO) # Warnings logger to INFO
+        root_logger.setLevel(logging.WARNING) # Root to WARNING (quieter libraries)
+        # Our console handler (from basicConfig) is on the root logger,
+        # so setting root to WARNING makes console quiet
+        # But our logger (vlm4ocr_cli) is INFO, so if a file handler
+        # is added, it will get INFO messages from 'logger'
     if args.concurrent_batch_size < 1:
         parser.error("--concurrent_batch_size must be 1 or greater.")
@@ -192,6 +205,15 @@ def main():
     # --- Setup File Logger (if --log is specified) ---
     if args.log:
         setup_file_logger(effective_output_dir, current_timestamp_str, args.debug)
+        # If logging to file, we want our console to be less verbose
+        # if not in debug mode, so we set the console handler's level higher.
+        if not args.debug:
+            # Find the console handler (from basicConfig) and set its level
+            for handler in root_logger.handlers:
+                if isinstance(handler, logging.StreamHandler) and handler.stream == sys.stderr:
+                     handler.setLevel(logging.WARNING)
+                     logger.debug("Set console handler level to WARNING.")
+                     break
     logger.debug(f"Parsed arguments: {args}")
@@ -199,16 +221,18 @@ def main():
     vlm_engine_instance = None
     try:
         logger.info(f"Initializing VLM engine: {args.vlm_engine} with model: {args.model}")
+        logger.info(f"max_new_tokens: {args.max_new_tokens}, temperature: {args.temperature}, top_p: {args.top_p}")
         config = BasicVLMConfig(
             max_new_tokens=args.max_new_tokens,
-            temperature=args.temperature
+            temperature=args.temperature,
+            top_p=args.top_p
         )
         if args.vlm_engine == "openai":
             if not args.api_key: parser.error("--api_key (or OPENAI_API_KEY) is required for OpenAI.")
             vlm_engine_instance = OpenAIVLMEngine(model=args.model, api_key=args.api_key, config=config)
         elif args.vlm_engine == "openai_compatible":
             if not args.base_url: parser.error("--base_url is required for openai_compatible.")
-            vlm_engine_instance = OpenAIVLMEngine(model=args.model, api_key=args.api_key, base_url=args.base_url, config=config)
+            vlm_engine_instance = OpenAICompatibleVLMEngine(model=args.model, api_key=args.api_key, base_url=args.base_url, config=config)
         elif args.vlm_engine == "azure_openai":
             if not args.azure_api_key: parser.error("--azure_api_key (or AZURE_OPENAI_API_KEY) is required.")
             if not args.azure_endpoint: parser.error("--azure_endpoint (or AZURE_OPENAI_ENDPOINT) is required.")
@@ -295,16 +319,34 @@ def main():
             # console verbosity controlled by logger level.
             show_progress_bar = (num_actual_files > 0)
+            # Only show progress bar if not in debug mode (debug logs would interfere)
+            # and if there are files to process.
+            # If logging to file, console can be quiet (INFO level).
+            # If NOT logging to file, console must be INFO level to show bar.
+            # Determine if progress bar should be active (not disabled)
+            # Disable bar if in debug mode (logs interfere) or no files
+            disable_bar = args.debug or not show_progress_bar
+            # If not logging to file AND not debug, we need console at INFO
+            if not args.log and not args.debug:
+                 for handler in logging.getLogger().handlers:
+                    if isinstance(handler, logging.StreamHandler) and handler.stream == sys.stderr:
+                         handler.setLevel(logging.INFO)
+                         logger.debug("Set console handler level to INFO for progress bar.")
+                         break
             iterator_wrapper = tqdm.asyncio.tqdm(
                 ocr_task_generator,
                 total=num_actual_files,
                 desc="Processing files",
                 unit="file",
-                disable=not show_progress_bar # disable if no files, or can remove this disable if tqdm handles total=0
+                disable=disable_bar
             )
             async for result_object in iterator_wrapper:
                 if not isinstance(result_object, OCRResult):
+                    # This warning *will* now be captured by the file log
                     logger.warning(f"Received unexpected data type: {type(result_object)}")
                     continue
@@ -323,9 +365,12 @@ def main():
                         content_to_write = result_object.to_string()
                         with open(current_ocr_output_file_path, "w", encoding="utf-8") as f:
                             f.write(content_to_write)
-                        # Log less verbosely to console if progress bar is active
-                        if not show_progress_bar or logger.getEffectiveLevel() <= logging.DEBUG:
-                           logger.info(f"OCR result for '{input_file_path_from_result}' saved to: {current_ocr_output_file_path}")
+                        # MODIFIED: Always log success info.
+                        # This will go to the file log if active.
+                        # It will NOT go to console if console level is WARNING.
+                        logger.info(f"OCR result for '{input_file_path_from_result}' saved to: {current_ocr_output_file_path}")
                     except Exception as e:
                         logger.error(f"Error writing output for '{input_file_path_from_result}' to '{current_ocr_output_file_path}': {e}")

{vlm4ocr-0.3.0 → vlm4ocr-0.4.0}/vlm4ocr/data_types.py RENAMED Viewed

@@ -1,7 +1,8 @@
 import os
-from typing import List, Literal
+from typing import List, Dict, Literal
+from PIL import Image
 from dataclasses import dataclass, field
-from vlm4ocr.utils import get_default_page_delimiter
+from vlm4ocr.utils import get_default_page_delimiter, ImageProcessor
 OutputMode = Literal["markdown", "HTML", "text", "JSON"]
@@ -24,6 +25,7 @@ class OCRResult:
     pages: List[dict] = field(default_factory=list)
     filename: str = field(init=False)
     status: str = field(init=False, default="processing")
+    messages_log: List[List[Dict[str,str]]] = field(default_factory=list)
     def __post_init__(self):
         """
@@ -67,10 +69,6 @@ class OCRResult:
         }
         self.pages.append(page)
-    def __len__(self):
-        return len(self.pages)
     def get_page(self, idx):
         if not isinstance(idx, int):
             raise ValueError("Index must be an integer")
@@ -78,6 +76,21 @@ class OCRResult:
             raise IndexError(f"Index out of range. The OCRResult has {len(self.pages)} pages, but index {idx} was requested.")
         return self.pages[idx]
+    def clear_messages_log(self):
+        self.messages_log = []
+    def add_messages_to_log(self, messages: List[Dict[str,str]]):
+        if not isinstance(messages, list):
+            raise ValueError("messages must be a list of dict")
+        self.messages_log.extend(messages)
+    def get_messages_log(self) -> List[List[Dict[str,str]]]:
+        return self.messages_log.copy()
+    def __len__(self):
+        return len(self.pages)
     def __iter__(self):
         return iter(self.pages)
@@ -106,4 +119,41 @@ class OCRResult:
         else:
             self.page_delimiter = page_delimiter
-        return self.page_delimiter.join([page.get("text", "") for page in self.pages])
+        return self.page_delimiter.join([page.get("text", "") for page in self.pages])
+@dataclass
+class FewShotExample:
+    """
+    This class represents a few-shot example for OCR tasks.
+    Parameters:
+    ----------
+    image : PIL.Image.Image
+        The image associated with the example.
+    text : str
+        The expected OCR result text for the image.
+    rotate_correction : bool, Optional
+        If True, applies rotate correction to the images using pytesseract.
+    max_dimension_pixels : int, Optional
+        The maximum dimension of the image in pixels. Original dimensions will be resized to fit in. If None, no resizing is applied.
+    """
+    image: Image.Image
+    text: str
+    rotate_correction: bool = False
+    max_dimension_pixels: int = None
+    def __post_init__(self):
+        if not isinstance(self.image, Image.Image):
+            raise ValueError("image must be a PIL.Image.Image object")
+        if not isinstance(self.text, str):
+            raise ValueError("text must be a string")
+        if self.rotate_correction or self.max_dimension_pixels is not None:
+            self.image_processor = ImageProcessor()
+        # Rotate correction if specified
+        if self.rotate_correction:
+            self.image, _ = self.image_processor.rotate_correction(self.image)
+        # Resize image if max_dimension_pixels is specified
+        if self.max_dimension_pixels is not None:
+            self.image, _ = self.image_processor.resize(image=self.image, max_dimension_pixels=self.max_dimension_pixels)

{vlm4ocr-0.3.0 → vlm4ocr-0.4.0}/vlm4ocr/ocr_engines.py RENAMED Viewed

@@ -1,12 +1,12 @@
 import os
-from typing import Tuple, List, Dict, Union, Generator, AsyncGenerator, Iterable
+from typing import Any, Tuple, List, Dict, Union, Generator, AsyncGenerator, Iterable
 import importlib
 import asyncio
 from colorama import Fore, Style
 import json
 from vlm4ocr.utils import DataLoader, PDFDataLoader, TIFFDataLoader, ImageDataLoader, ImageProcessor, clean_markdown, extract_json, get_default_page_delimiter
-from vlm4ocr.data_types import OCRResult
-from vlm4ocr.vlm_engines import VLMEngine
+from vlm4ocr.data_types import OCRResult, FewShotExample
+from vlm4ocr.vlm_engines import VLMEngine, MessagesLogger
 SUPPORTED_IMAGE_EXTS = ['.pdf', '.tif', '.tiff', '.png', '.jpg', '.jpeg', '.bmp', '.gif', '.webp']
@@ -60,7 +60,8 @@ class OCREngine:
         self.image_processor = ImageProcessor()
-    def stream_ocr(self, file_path: str, rotate_correction:bool=False, max_dimension_pixels:int=None) -> Generator[Dict[str, str], None, None]:
+    def stream_ocr(self, file_path: str, rotate_correction:bool=False, max_dimension_pixels:int=None,
+                   few_shot_examples:List[FewShotExample]=None) -> Generator[Dict[str, str], None, None]:
         """
         This method inputs a file path (image or PDF) and stream OCR results in real-time. This is useful for frontend applications.
         Yields dictionaries with 'type' ('ocr_chunk' or 'page_delimiter') and 'data'.
@@ -73,6 +74,8 @@ class OCREngine:
             If True, applies rotate correction to the images using pytesseract.
         max_dimension_pixels : int, Optional
             The maximum dimension of the image in pixels. Original dimensions will be resized to fit in. If None, no resizing is applied.
+        few_shot_examples : List[FewShotExample], Optional
+            list of few-shot examples.
         Returns:
         --------
@@ -90,10 +93,6 @@ class OCREngine:
         file_ext = os.path.splitext(file_path)[1].lower()
         if file_ext not in SUPPORTED_IMAGE_EXTS:
             raise ValueError(f"Unsupported file type: {file_ext}. Supported types are: {SUPPORTED_IMAGE_EXTS}")
-        # Check if image preprocessing can be applied
-        if self.image_processor.has_tesseract==False and rotate_correction:
-            raise ImportError("pytesseract is not installed. Please install it to use rotate correction.")
         # PDF or TIFF
         if file_ext in ['.pdf', '.tif', '.tiff']:
@@ -105,8 +104,8 @@ class OCREngine:
             # OCR each image
             for i, image in enumerate(images):
-                # Apply rotate correction if specified and tesseract is available
-                if rotate_correction and self.image_processor.has_tesseract:
+                # Apply rotate correction if specified
+                if rotate_correction:
                     try:
                         image, _ = self.image_processor.rotate_correction(image)
@@ -120,13 +119,20 @@ class OCREngine:
                     except Exception as e:
                         yield {"type": "info", "data": f"Error resizing image: {str(e)}"}
-                messages = self.vlm_engine.get_ocr_messages(self.system_prompt, self.user_prompt, image)
+                # Get OCR messages
+                messages = self.vlm_engine.get_ocr_messages(system_prompt=self.system_prompt,
+                                                            user_prompt=self.user_prompt,
+                                                            image=image,
+                                                            few_shot_examples=few_shot_examples)
+                # Stream response
                 response_stream = self.vlm_engine.chat(
                     messages,
                     stream=True
                 )
                 for chunk in response_stream:
-                    yield {"type": "ocr_chunk", "data": chunk}
+                    if chunk["type"] == "response":
+                        yield {"type": "ocr_chunk", "data": chunk["data"]}
                 if i < len(images) - 1:
                     yield {"type": "page_delimiter", "data": get_default_page_delimiter(self.output_mode)}
@@ -136,8 +142,8 @@ class OCREngine:
             data_loader = ImageDataLoader(file_path)
             image = data_loader.get_page(0)
-            # Apply rotate correction if specified and tesseract is available
-            if rotate_correction and self.image_processor.has_tesseract:
+            # Apply rotate correction if specified
+            if rotate_correction:
                 try:
                     image, _ = self.image_processor.rotate_correction(image)
@@ -151,17 +157,23 @@ class OCREngine:
                 except Exception as e:
                     yield {"type": "info", "data": f"Error resizing image: {str(e)}"}
-            messages = self.vlm_engine.get_ocr_messages(self.system_prompt, self.user_prompt, image)
+            # Get OCR messages
+            messages = self.vlm_engine.get_ocr_messages(system_prompt=self.system_prompt,
+                                                        user_prompt=self.user_prompt,
+                                                        image=image,
+                                                        few_shot_examples=few_shot_examples)
+            # Stream response
             response_stream = self.vlm_engine.chat(
                     messages,
                     stream=True
                 )
             for chunk in response_stream:
-                yield {"type": "ocr_chunk", "data": chunk}
+                if chunk["type"] == "response":
+                    yield {"type": "ocr_chunk", "data": chunk["data"]}
     def sequential_ocr(self, file_paths: Union[str, Iterable[str]], rotate_correction:bool=False,
-                       max_dimension_pixels:int=None, verbose:bool=False) -> List[OCRResult]:
+                       max_dimension_pixels:int=None, verbose:bool=False, few_shot_examples:List[FewShotExample]=None) -> List[OCRResult]:
         """
         This method inputs a file path or a list of file paths (image, PDF, TIFF) and performs OCR using the VLM inference engine.
@@ -175,6 +187,8 @@ class OCREngine:
             The maximum dimension of the image in pixels. Original dimensions will be resized to fit in. If None, no resizing is applied.
         verbose : bool, Optional
             If True, the function will print the output in terminal.
+        few_shot_examples : List[FewShotExample], Optional
+            list of few-shot examples. Each example is a dict with keys "image" (PIL.Image.Image) and "text" (str).
         Returns:
         --------
@@ -184,6 +198,7 @@ class OCREngine:
         if isinstance(file_paths, str):
             file_paths = [file_paths]
+        # Iterate through file paths
         ocr_results = []
         for file_path in file_paths:
             # Define OCRResult object
@@ -233,8 +248,8 @@ class OCREngine:
             # OCR images
             for i, image in enumerate(images):
                 image_processing_status = {}
-                # Apply rotate correction if specified and tesseract is available
-                if rotate_correction and self.image_processor.has_tesseract:
+                # Apply rotate correction if specified
+                if rotate_correction:
                     try:
                         image, rotation_angle = self.image_processor.rotate_correction(image)
                         image_processing_status["rotate_correction"] = {
@@ -270,25 +285,36 @@ class OCREngine:
                             print(f"{Fore.RED}Error resizing image for {filename}:{Style.RESET_ALL} {resized['error']}. OCR continues without resizing.")
                 try:
-                    messages = self.vlm_engine.get_ocr_messages(self.system_prompt, self.user_prompt, image)
+                    messages = self.vlm_engine.get_ocr_messages(system_prompt=self.system_prompt,
+                                                                user_prompt=self.user_prompt,
+                                                                image=image,
+                                                                few_shot_examples=few_shot_examples)
+                    # Define a messages logger to capture messages
+                    messages_logger = MessagesLogger()
+                    # Generate response
                     response = self.vlm_engine.chat(
                         messages,
                         verbose=verbose,
-                        stream=False
+                        stream=False,
+                        messages_logger=messages_logger
                     )
+                    ocr_text = response["response"]
                     # Clean the response if output mode is markdown
                     if self.output_mode == "markdown":
-                        response = clean_markdown(response)
+                        ocr_text = clean_markdown(ocr_text)
                     # Parse the response if output mode is JSON
-                    if self.output_mode == "JSON":
-                        json_list = extract_json(response)
+                    elif self.output_mode == "JSON":
+                        json_list = extract_json(ocr_text)
                         # Serialize the JSON list to a string
-                        response = json.dumps(json_list, indent=4)
+                        ocr_text = json.dumps(json_list, indent=4)
                     # Add the page to the OCR result
-                    ocr_result.add_page(text=response,
+                    ocr_result.add_page(text=ocr_text,
                                         image_processing_status=image_processing_status)
+                    # Add messages log to the OCR result
+                    ocr_result.add_messages_to_log(messages_logger.get_messages_log())
                 except Exception as page_e:
                     ocr_result.status = "error"
@@ -298,11 +324,12 @@ class OCREngine:
                         print(f"{Fore.RED}Error during OCR for a page in {filename}:{Style.RESET_ALL} {page_e}")
             # Add the OCR result to the list
-            ocr_result.status = "success"
+            if ocr_result.status != "error":
+                ocr_result.status = "success"
             ocr_results.append(ocr_result)
             if verbose:
-                print(f"{Fore.BLUE}Successfully processed {filename} with {len(ocr_result)} pages.{Style.RESET_ALL}")
+                print(f"{Fore.BLUE}Processed {filename} with {len(ocr_result)} pages.{Style.RESET_ALL}")
                 for page in ocr_result:
                     print(page)
                     print("-" * 80)
@@ -311,7 +338,8 @@ class OCREngine:
     def concurrent_ocr(self, file_paths: Union[str, Iterable[str]], rotate_correction:bool=False,
-                       max_dimension_pixels:int=None, concurrent_batch_size: int=32, max_file_load: int=None) -> AsyncGenerator[OCRResult, None]:
+                       max_dimension_pixels:int=None, few_shot_examples:List[FewShotExample]=None,
+                       concurrent_batch_size: int=32, max_file_load: int=None) -> AsyncGenerator[OCRResult, None]:
         """
         First complete first out. Input and output order not guaranteed.
         This method inputs a file path or a list of file paths (image, PDF, TIFF) and performs OCR using the VLM inference engine.
@@ -325,6 +353,8 @@ class OCREngine:
             If True, applies rotate correction to the images using pytesseract.
         max_dimension_pixels : int, Optional
             The maximum dimension of the image in pixels. Origianl dimensions will be resized to fit in. If None, no resizing is applied.
+        few_shot_examples : List[FewShotExample], Optional
+            list of few-shot examples. Each example is a dict with keys "image" (PIL.Image.Image) and "text" (str).
         concurrent_batch_size : int, Optional
             The number of concurrent VLM calls to make.
         max_file_load : int, Optional
@@ -343,18 +373,17 @@ class OCREngine:
         if not isinstance(max_file_load, int) or max_file_load <= 0:
             raise ValueError("max_file_load must be a positive integer")
-        if self.image_processor.has_tesseract==False and rotate_correction:
-            raise ImportError("pytesseract is not installed. Please install it to use rotate correction.")
         return self._ocr_async(file_paths=file_paths,
                                rotate_correction=rotate_correction,
                                max_dimension_pixels=max_dimension_pixels,
+                               few_shot_examples=few_shot_examples,
                                concurrent_batch_size=concurrent_batch_size,
                                max_file_load=max_file_load)
     async def _ocr_async(self, file_paths: Iterable[str], rotate_correction:bool=False, max_dimension_pixels:int=None,
+                         few_shot_examples:List[FewShotExample]=None,
                          concurrent_batch_size: int=32, max_file_load: int=None) -> AsyncGenerator[OCRResult, None]:
         """
         Internal method to asynchronously process an iterable of file paths.
@@ -370,7 +399,8 @@ class OCREngine:
                                                  vlm_call_semaphore=vlm_call_semaphore,
                                                  file_path=file_path,
                                                  rotate_correction=rotate_correction,
-                                                 max_dimension_pixels=max_dimension_pixels)
+                                                 max_dimension_pixels=max_dimension_pixels,
+                                                 few_shot_examples=few_shot_examples)
             tasks.append(task)
@@ -379,7 +409,8 @@ class OCREngine:
             yield result
     async def _ocr_file_with_semaphore(self, file_load_semaphore:asyncio.Semaphore, vlm_call_semaphore:asyncio.Semaphore,
-                                       file_path:str, rotate_correction:bool=False, max_dimension_pixels:int=None) -> OCRResult:
+                                       file_path:str, rotate_correction:bool=False, max_dimension_pixels:int=None,
+                                       few_shot_examples:List[FewShotExample]=None) -> OCRResult:
         """
         This internal method takes a semaphore and OCR a single file using the VLM inference engine.
         """
@@ -387,6 +418,7 @@ class OCREngine:
             filename = os.path.basename(file_path)
             file_ext = os.path.splitext(file_path)[1].lower()
             result = OCRResult(input_dir=file_path, output_mode=self.output_mode)
+            messages_logger = MessagesLogger()
             # check file extension
             if file_ext not in SUPPORTED_IMAGE_EXTS:
                 result.status = "error"
@@ -416,7 +448,9 @@ class OCREngine:
                         data_loader=data_loader,
                         page_index=page_index,
                         rotate_correction=rotate_correction,
-                        max_dimension_pixels=max_dimension_pixels
+                        max_dimension_pixels=max_dimension_pixels,
+                        few_shot_examples=few_shot_examples,
+                        messages_logger=messages_logger
                     )
                     page_processing_tasks.append(task)
@@ -428,14 +462,18 @@ class OCREngine:
             except Exception as e:
                 result.status = "error"
                 result.add_page(text=f"Error during OCR for {filename}: {str(e)}", image_processing_status={})
+                result.add_messages_to_log(messages_logger.get_messages_log())
                 return result
         # Set status to success if no errors occurred
-        result.status = "success"
+        if result.status != "error":
+            result.status = "success"
+        result.add_messages_to_log(messages_logger.get_messages_log())
         return result
     async def _ocr_page_with_semaphore(self, vlm_call_semaphore: asyncio.Semaphore, data_loader: DataLoader,
-                                       page_index:int, rotate_correction:bool=False, max_dimension_pixels:int=None) -> Tuple[str, Dict[str, str]]:
+                                       page_index:int, rotate_correction:bool=False, max_dimension_pixels:int=None,
+                                       few_shot_examples:List[FewShotExample]=None, messages_logger:MessagesLogger=None) -> Tuple[str, Dict[str, str]]:
         """
         This internal method takes a semaphore and OCR a single image/page using the VLM inference engine.
@@ -447,8 +485,8 @@ class OCREngine:
         async with vlm_call_semaphore:
             image = await data_loader.get_page_async(page_index)
             image_processing_status = {}
-            # Apply rotate correction if specified and tesseract is available
-            if rotate_correction and self.image_processor.has_tesseract:
+            # Apply rotate correction if specified
+            if rotate_correction:
                 try:
                     image, rotation_angle = await self.image_processor.rotate_correction_async(image)
                     image_processing_status["rotate_correction"] = {
@@ -475,16 +513,21 @@ class OCREngine:
                         "error": str(e)
                     }
-            messages = self.vlm_engine.get_ocr_messages(self.system_prompt, self.user_prompt, image)
-            ocr_text = await self.vlm_engine.chat_async(
+            messages = self.vlm_engine.get_ocr_messages(system_prompt=self.system_prompt,
+                                                        user_prompt=self.user_prompt,
+                                                        image=image,
+                                                        few_shot_examples=few_shot_examples)
+            response = await self.vlm_engine.chat_async(
                 messages,
+                messages_logger=messages_logger
             )
+            ocr_text = response["response"]
             # Clean the OCR text if output mode is markdown
             if self.output_mode == "markdown":
                 ocr_text = clean_markdown(ocr_text)
             # Parse the response if output mode is JSON
-            if self.output_mode == "JSON":
+            elif self.output_mode == "JSON":
                 json_list = extract_json(ocr_text)
                 # Serialize the JSON list to a string
                 ocr_text = json.dumps(json_list, indent=4)

vlm4ocr 0.3.0__tar.gz → 0.4.0__tar.gz

vlm4ocr 0.3.0__tar.gz → 0.4.0__tar.gz