vlm4ocr 0.1.0__py3-none-any.whl → 0.3.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- vlm4ocr/__init__.py +3 -1
- vlm4ocr/assets/default_prompt_templates/ocr_JSON_system_prompt.txt +1 -0
- vlm4ocr/cli.py +276 -287
- vlm4ocr/data_types.py +109 -0
- vlm4ocr/ocr_engines.py +363 -195
- vlm4ocr/utils.py +386 -39
- vlm4ocr/vlm_engines.py +316 -190
- {vlm4ocr-0.1.0.dist-info → vlm4ocr-0.3.0.dist-info}/METADATA +5 -1
- vlm4ocr-0.3.0.dist-info/RECORD +17 -0
- vlm4ocr-0.1.0.dist-info/RECORD +0 -15
- {vlm4ocr-0.1.0.dist-info → vlm4ocr-0.3.0.dist-info}/WHEEL +0 -0
- {vlm4ocr-0.1.0.dist-info → vlm4ocr-0.3.0.dist-info}/entry_points.txt +0 -0
vlm4ocr/ocr_engines.py
CHANGED
@@ -1,15 +1,18 @@
 import os
-from typing import List, Dict, Union, Generator, Iterable
+from typing import Tuple, List, Dict, Union, Generator, AsyncGenerator, Iterable
 import importlib
 import asyncio
-from
+from colorama import Fore, Style
+import json
+from vlm4ocr.utils import DataLoader, PDFDataLoader, TIFFDataLoader, ImageDataLoader, ImageProcessor, clean_markdown, extract_json, get_default_page_delimiter
+from vlm4ocr.data_types import OCRResult
 from vlm4ocr.vlm_engines import VLMEngine

 SUPPORTED_IMAGE_EXTS = ['.pdf', '.tif', '.tiff', '.png', '.jpg', '.jpeg', '.bmp', '.gif', '.webp']


 class OCREngine:
-    def __init__(self, vlm_engine:VLMEngine, output_mode:str="markdown", system_prompt:str=None, user_prompt:str=None
+    def __init__(self, vlm_engine:VLMEngine, output_mode:str="markdown", system_prompt:str=None, user_prompt:str=None):
         """
         This class inputs a image or PDF file path and processes them using a VLM inference engine. Outputs plain text or markdown.

@@ -18,17 +21,11 @@ class OCREngine:
         inference_engine : InferenceEngine
             The inference engine to use for OCR.
         output_mode : str, Optional
-            The output format. Must be 'markdown', 'HTML', or '
+            The output format. Must be 'markdown', 'HTML', 'text', or 'JSON'.
         system_prompt : str, Optional
             Custom system prompt. We recommend use a default system prompt by leaving this blank.
         user_prompt : str, Optional
             Custom user prompt. It is good to include some information regarding the document. If not specified, a default will be used.
-        page_delimiter : str, Optional
-            The delimiter to use between PDF pages.
-            if 'auto', it will be set to the default page delimiter for the output mode:
-            'markdown' -> '\n\n---\n\n'
-            'HTML' -> '<br><br>'
-            'text' -> '\n\n---\n\n'
         """
         # Check inference engine
         if not isinstance(vlm_engine, VLMEngine):
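
The page_delimiter argument documented in the removed lines above has no replacement parameter in 0.3.0; per-page delimiters now come from get_default_page_delimiter(output_mode), which is imported from vlm4ocr.utils in the first hunk and used by stream_ocr further down. A minimal sketch of joining page texts yourself, assuming the helper returns the same per-mode defaults the removed docstring listed ('\n\n---\n\n' for markdown and text, '<br><br>' for HTML):

    # Sketch only: join per-page OCR text with the default delimiter for a mode.
    # The page strings below are placeholders.
    from vlm4ocr.utils import get_default_page_delimiter

    pages = ["# Page 1 ...", "# Page 2 ..."]
    document_text = get_default_page_delimiter("markdown").join(pages)
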
@@ -36,42 +33,34 @@ class OCREngine:
         self.vlm_engine = vlm_engine

         # Check output mode
-        if output_mode not in ["markdown", "HTML", "text"]:
-            raise ValueError("output_mode must be 'markdown', 'HTML', or '
+        if output_mode not in ["markdown", "HTML", "text", "JSON"]:
+            raise ValueError("output_mode must be 'markdown', 'HTML', 'text', or 'JSON'.")
         self.output_mode = output_mode

         # System prompt
         if isinstance(system_prompt, str) and system_prompt:
             self.system_prompt = system_prompt
         else:
-
-            with open(
+            prompt_template_path = importlib.resources.files('vlm4ocr.assets.default_prompt_templates').joinpath(f'ocr_{self.output_mode}_system_prompt.txt')
+            with prompt_template_path.open('r', encoding='utf-8') as f:
                 self.system_prompt = f.read()

         # User prompt
         if isinstance(user_prompt, str) and user_prompt:
             self.user_prompt = user_prompt
         else:
-
-
+            if self.output_mode == "JSON":
+                raise ValueError("user_prompt must be provided when output_mode is 'JSON' to define the JSON structure.")
+
+            prompt_template_path = importlib.resources.files('vlm4ocr.assets.default_prompt_templates').joinpath(f'ocr_{self.output_mode}_user_prompt.txt')
+            with prompt_template_path.open('r', encoding='utf-8') as f:
                 self.user_prompt = f.read()

-        #
-
-            if page_delimiter == "auto":
-                if self.output_mode == "markdown":
-                    self.page_delimiter = "\n\n---\n\n"
-                elif self.output_mode == "HTML":
-                    self.page_delimiter = "<br><br>"
-                else:
-                    self.page_delimiter = "\n\n---\n\n"
-            else:
-                self.page_delimiter = page_delimiter
-        else:
-            raise ValueError("page_delimiter must be a string")
-
+        # Image processor
+        self.image_processor = ImageProcessor()

-
+
+    def stream_ocr(self, file_path: str, rotate_correction:bool=False, max_dimension_pixels:int=None) -> Generator[Dict[str, str], None, None]:
         """
         This method inputs a file path (image or PDF) and stream OCR results in real-time. This is useful for frontend applications.
         Yields dictionaries with 'type' ('ocr_chunk' or 'page_delimiter') and 'data'.
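
With this hunk, the constructor accepts a 'JSON' output mode, loads default prompts from the packaged templates (ocr_{output_mode}_system_prompt.txt and ocr_{output_mode}_user_prompt.txt), and raises a ValueError when output_mode is 'JSON' but no user_prompt is given. A minimal usage sketch; my_vlm_engine is a stand-in for a VLMEngine instance, since no concrete engine classes appear in this diff:

    # Sketch only: my_vlm_engine is a placeholder for any VLMEngine instance.
    from vlm4ocr.ocr_engines import OCREngine

    # Markdown mode: default system/user prompts are read from the packaged templates.
    md_engine = OCREngine(vlm_engine=my_vlm_engine, output_mode="markdown")

    # JSON mode: a user_prompt describing the expected JSON structure is required,
    # otherwise __init__ raises ValueError.
    json_engine = OCREngine(
        vlm_engine=my_vlm_engine,
        output_mode="JSON",
        user_prompt="Extract invoice_number, date, and total as a JSON object.",
    )
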
@@ -80,15 +69,18 @@ class OCREngine:
         -----------
         file_path : str
             The path to the image or PDF file. Must be one of '.pdf', '.tiff', '.png', '.jpg', '.jpeg', '.bmp', '.gif', '.webp'
-
-
-
-            The
+        rotate_correction : bool, Optional
+            If True, applies rotate correction to the images using pytesseract.
+        max_dimension_pixels : int, Optional
+            The maximum dimension of the image in pixels. Original dimensions will be resized to fit in. If None, no resizing is applied.

         Returns:
         --------
-        Generator[str, None, None]
-            A generator that yields the output
+        Generator[Dict[str, str], None, None]
+            A generator that yields the output:
+            {"type": "info", "data": msg}
+            {"type": "ocr_chunk", "data": chunk}
+            {"type": "page_delimiter", "data": page_delimiter}
         """
         # Check file path
         if not isinstance(file_path, str):
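
stream_ocr stays a synchronous generator, so a caller can forward chunks as they arrive and switch on the event types documented above. A minimal consumption sketch, reusing the hypothetical md_engine from the previous example and a placeholder file path:

    # Sketch only: route streamed events by their "type" field.
    for event in md_engine.stream_ocr("scan.pdf", rotate_correction=False, max_dimension_pixels=2048):
        if event["type"] == "ocr_chunk":
            print(event["data"], end="")        # partial OCR text as it streams
        elif event["type"] == "page_delimiter":
            print(event["data"], end="")        # separator between PDF/TIFF pages
        elif event["type"] == "info":
            print(f"\n[info] {event['data']}")  # e.g. preprocessing errors
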
@@ -98,227 +90,403 @@ class OCREngine:
         file_ext = os.path.splitext(file_path)[1].lower()
         if file_ext not in SUPPORTED_IMAGE_EXTS:
             raise ValueError(f"Unsupported file type: {file_ext}. Supported types are: {SUPPORTED_IMAGE_EXTS}")
+
+        # Check if image preprocessing can be applied
+        if self.image_processor.has_tesseract==False and rotate_correction:
+            raise ImportError("pytesseract is not installed. Please install it to use rotate correction.")

         # PDF or TIFF
         if file_ext in ['.pdf', '.tif', '.tiff']:
-
+            data_loader = PDFDataLoader(file_path) if file_ext == '.pdf' else TIFFDataLoader(file_path)
+            images = data_loader.get_all_pages()
+            # Check if images were extracted
             if not images:
                 raise ValueError(f"No images extracted from file: {file_path}")
+
+            # OCR each image
             for i, image in enumerate(images):
+                # Apply rotate correction if specified and tesseract is available
+                if rotate_correction and self.image_processor.has_tesseract:
+                    try:
+                        image, _ = self.image_processor.rotate_correction(image)
+
+                    except Exception as e:
+                        yield {"type": "info", "data": f"Error during rotate correction: {str(e)}"}
+
+                # Resize the image if max_dimension_pixels is specified
+                if max_dimension_pixels is not None:
+                    try:
+                        image, _ = self.image_processor.resize(image, max_dimension_pixels=max_dimension_pixels)
+                    except Exception as e:
+                        yield {"type": "info", "data": f"Error resizing image: {str(e)}"}
+
                 messages = self.vlm_engine.get_ocr_messages(self.system_prompt, self.user_prompt, image)
                 response_stream = self.vlm_engine.chat(
                     messages,
-
-                    temperature=temperature,
-                    stream=True,
-                    **kwrs
+                    stream=True
                 )
                 for chunk in response_stream:
                     yield {"type": "ocr_chunk", "data": chunk}

                 if i < len(images) - 1:
-                    yield {"type": "page_delimiter", "data": self.
+                    yield {"type": "page_delimiter", "data": get_default_page_delimiter(self.output_mode)}

         # Image
         else:
-
+            data_loader = ImageDataLoader(file_path)
+            image = data_loader.get_page(0)
+
+            # Apply rotate correction if specified and tesseract is available
+            if rotate_correction and self.image_processor.has_tesseract:
+                try:
+                    image, _ = self.image_processor.rotate_correction(image)
+
+                except Exception as e:
+                    yield {"type": "info", "data": f"Error during rotate correction: {str(e)}"}
+
+            # Resize the image if max_dimension_pixels is specified
+            if max_dimension_pixels is not None:
+                try:
+                    image, _ = self.image_processor.resize(image, max_dimension_pixels=max_dimension_pixels)
+                except Exception as e:
+                    yield {"type": "info", "data": f"Error resizing image: {str(e)}"}
+
             messages = self.vlm_engine.get_ocr_messages(self.system_prompt, self.user_prompt, image)
             response_stream = self.vlm_engine.chat(
                 messages,
-
-                temperature=temperature,
-                stream=True,
-                **kwrs
+                stream=True
             )
             for chunk in response_stream:
                 yield {"type": "ocr_chunk", "data": chunk}


-    def
-
+    def sequential_ocr(self, file_paths: Union[str, Iterable[str]], rotate_correction:bool=False,
+                       max_dimension_pixels:int=None, verbose:bool=False) -> List[OCRResult]:
         """
-        This method
+        This method inputs a file path or a list of file paths (image, PDF, TIFF) and performs OCR using the VLM inference engine.

         Parameters:
         -----------
         file_paths : Union[str, Iterable[str]]
-            A file path or a list of file paths to process. Must be one of '.pdf', '.tif', '.tiff, '.png', '.jpg', '.jpeg', '.bmp', '.gif', '.webp'
-
-
-
-            The
+            A file path or a list of file paths to process. Must be one of '.pdf', '.tif', '.tiff', '.png', '.jpg', '.jpeg', '.bmp', '.gif', '.webp'
+        rotate_correction : bool, Optional
+            If True, applies rotate correction to the images using pytesseract.
+        max_dimension_pixels : int, Optional
+            The maximum dimension of the image in pixels. Original dimensions will be resized to fit in. If None, no resizing is applied.
         verbose : bool, Optional
-            If True, the function will print the output in terminal.
-
-
-
-
+            If True, the function will print the output in terminal.
+
+        Returns:
+        --------
+        List[OCRResult]
+            A list of OCR result objects.
         """
-        # if file_paths is a string, convert it to a list
         if isinstance(file_paths, str):
             file_paths = [file_paths]
-
-
-            raise TypeError("file_paths must be a string or an iterable of strings")
-
-        # check if all file paths are valid
+
+        ocr_results = []
         for file_path in file_paths:
-
-
+            # Define OCRResult object
+            ocr_result = OCRResult(input_dir=file_path, output_mode=self.output_mode)
+            # get file extension
             file_ext = os.path.splitext(file_path)[1].lower()
+            # Check file extension
             if file_ext not in SUPPORTED_IMAGE_EXTS:
-
+                if verbose:
+                    print(f"{Fore.RED}Unsupported file type:{Style.RESET_ALL} {file_ext}. Supported types are: {SUPPORTED_IMAGE_EXTS}")
+                ocr_result.status = "error"
+                ocr_result.add_page(text=f"Unsupported file type: {file_ext}. Supported types are: {SUPPORTED_IMAGE_EXTS}",
+                                    image_processing_status={})
+                ocr_results.append(ocr_result)
+                continue
+
+            filename = os.path.basename(file_path)
+
+            try:
+                # Load images from file
+                if file_ext == '.pdf':
+                    data_loader = PDFDataLoader(file_path)
+                elif file_ext in ['.tif', '.tiff']:
+                    data_loader = TIFFDataLoader(file_path)
+                else:
+                    data_loader = ImageDataLoader(file_path)
+
+                images = data_loader.get_all_pages()
+            except Exception as e:
+                if verbose:
+                    print(f"{Fore.RED}Error processing file {filename}:{Style.RESET_ALL} {str(e)}")
+                ocr_result.status = "error"
+                ocr_result.add_page(text=f"Error processing file {filename}: {str(e)}", image_processing_status={})
+                ocr_results.append(ocr_result)
+                continue
+
+            # Check if images were extracted
+            if not images:
+                if verbose:
+                    print(f"{Fore.RED}No images extracted from file:{Style.RESET_ALL} {filename}. It might be empty or corrupted.")
+                ocr_result.status = "error"
+                ocr_result.add_page(text=f"No images extracted from file: {filename}. It might be empty or corrupted.",
+                                    image_processing_status={})
+                ocr_results.append(ocr_result)
+                continue
+
+            # OCR images
+            for i, image in enumerate(images):
+                image_processing_status = {}
+                # Apply rotate correction if specified and tesseract is available
+                if rotate_correction and self.image_processor.has_tesseract:
+                    try:
+                        image, rotation_angle = self.image_processor.rotate_correction(image)
+                        image_processing_status["rotate_correction"] = {
+                            "status": "success",
+                            "rotation_angle": rotation_angle
+                        }
+                        if verbose:
+                            print(f"{Fore.GREEN}Rotate correction applied for {filename} page {i} with angle {rotation_angle} degrees.{Style.RESET_ALL}")
+                    except Exception as e:
+                        image_processing_status["rotate_correction"] = {
+                            "status": "error",
+                            "error": str(e)
+                        }
+                        if verbose:
+                            print(f"{Fore.RED}Error during rotate correction for {filename}:{Style.RESET_ALL} {rotation_angle['error']}. OCR continues without rotate correction.")
+
+                # Resize the image if max_dimension_pixels is specified
+                if max_dimension_pixels is not None:
+                    try:
+                        image, resized = self.image_processor.resize(image, max_dimension_pixels=max_dimension_pixels)
+                        image_processing_status["resize"] = {
+                            "status": "success",
+                            "resized": resized
+                        }
+                        if verbose and resized:
+                            print(f"{Fore.GREEN}Image resized for {filename} page {i} to fit within {max_dimension_pixels} pixels.{Style.RESET_ALL}")
+                    except Exception as e:
+                        image_processing_status["resize"] = {
+                            "status": "error",
+                            "error": str(e)
+                        }
+                        if verbose:
+                            print(f"{Fore.RED}Error resizing image for {filename}:{Style.RESET_ALL} {resized['error']}. OCR continues without resizing.")
+
+                try:
+                    messages = self.vlm_engine.get_ocr_messages(self.system_prompt, self.user_prompt, image)
+                    response = self.vlm_engine.chat(
+                        messages,
+                        verbose=verbose,
+                        stream=False
+                    )
+                    # Clean the response if output mode is markdown
+                    if self.output_mode == "markdown":
+                        response = clean_markdown(response)

-
-
-
-
-
+                    # Parse the response if output mode is JSON
+                    if self.output_mode == "JSON":
+                        json_list = extract_json(response)
+                        # Serialize the JSON list to a string
+                        response = json.dumps(json_list, indent=4)
+
+                    # Add the page to the OCR result
+                    ocr_result.add_page(text=response,
+                                        image_processing_status=image_processing_status)
+
+                except Exception as page_e:
+                    ocr_result.status = "error"
+                    ocr_result.add_page(text=f"Error during OCR for a page in {filename}: {str(page_e)}",
+                                        image_processing_status={})
+                    if verbose:
+                        print(f"{Fore.RED}Error during OCR for a page in {filename}:{Style.RESET_ALL} {page_e}")
+
+            # Add the OCR result to the list
+            ocr_result.status = "success"
+            ocr_results.append(ocr_result)

             if verbose:
-
+                print(f"{Fore.BLUE}Successfully processed {filename} with {len(ocr_result)} pages.{Style.RESET_ALL}")
+                for page in ocr_result:
+                    print(page)
+                    print("-" * 80)
+
+        return ocr_results

-            return asyncio.run(self._run_ocr_async(file_paths,
-                                                   max_new_tokens=max_new_tokens,
-                                                   temperature=temperature,
-                                                   concurrent_batch_size=concurrent_batch_size,
-                                                   **kwrs))
-
-        # Sync processing
-        return self._run_ocr(file_paths, max_new_tokens=max_new_tokens, temperature=temperature, verbose=verbose, **kwrs)
-

-    def
-
+    def concurrent_ocr(self, file_paths: Union[str, Iterable[str]], rotate_correction:bool=False,
+                       max_dimension_pixels:int=None, concurrent_batch_size: int=32, max_file_load: int=None) -> AsyncGenerator[OCRResult, None]:
         """
-
+        First complete first out. Input and output order not guaranteed.
+        This method inputs a file path or a list of file paths (image, PDF, TIFF) and performs OCR using the VLM inference engine.
+        Results are processed concurrently using asyncio.

         Parameters:
         -----------
         file_paths : Union[str, Iterable[str]]
             A file path or a list of file paths to process. Must be one of '.pdf', '.tif', '.tiff', '.png', '.jpg', '.jpeg', '.bmp', '.gif', '.webp'
-
-
-
-            The
-
-
+        rotate_correction : bool, Optional
+            If True, applies rotate correction to the images using pytesseract.
+        max_dimension_pixels : int, Optional
+            The maximum dimension of the image in pixels. Origianl dimensions will be resized to fit in. If None, no resizing is applied.
+        concurrent_batch_size : int, Optional
+            The number of concurrent VLM calls to make.
+        max_file_load : int, Optional
+            The maximum number of files to load concurrently. If None, defaults to 2 times of concurrent_batch_size.

         Returns:
         --------
-
-            A
+        AsyncGenerator[OCRResult, None]
+            A generator that yields OCR result objects as they complete.
         """
-
-
-
-
-
-            images = get_images_from_pdf(file_path) if file_ext == '.pdf' else get_images_from_tiff(file_path)
-            if not images:
-                raise ValueError(f"No images extracted from file: {file_path}")
-            results = []
-            for image in images:
-                messages = self.vlm_engine.get_ocr_messages(self.system_prompt, self.user_prompt, image)
-                response = self.vlm_engine.chat(
-                    messages,
-                    max_new_tokens=max_new_tokens,
-                    temperature=temperature,
-                    verbose=verbose,
-                    stream=False,
-                    **kwrs
-                )
-                results.append(response)
-
-            ocr_text = self.page_delimiter.join(results)
-        # Image
-        else:
-            image = get_image_from_file(file_path)
-            messages = self.vlm_engine.get_ocr_messages(self.system_prompt, self.user_prompt, image)
-            ocr_text = self.vlm_engine.chat(
-                messages,
-                max_new_tokens=max_new_tokens,
-                temperature=temperature,
-                verbose=verbose,
-                stream=False,
-                **kwrs
-            )
-
-            # Clean markdown
-            if self.output_mode == "markdown":
-                ocr_text = clean_markdown(ocr_text)
-            ocr_results.append(ocr_text)
+        if isinstance(file_paths, str):
+            file_paths = [file_paths]
+
+        if max_file_load is None:
+            max_file_load = concurrent_batch_size * 2

-
+        if not isinstance(max_file_load, int) or max_file_load <= 0:
+            raise ValueError("max_file_load must be a positive integer")
+
+        if self.image_processor.has_tesseract==False and rotate_correction:
+            raise ImportError("pytesseract is not installed. Please install it to use rotate correction.")

+        return self._ocr_async(file_paths=file_paths,
+                               rotate_correction=rotate_correction,
+                               max_dimension_pixels=max_dimension_pixels,
+                               concurrent_batch_size=concurrent_batch_size,
+                               max_file_load=max_file_load)
+

-    async def
-
+    async def _ocr_async(self, file_paths: Iterable[str], rotate_correction:bool=False, max_dimension_pixels:int=None,
+                         concurrent_batch_size: int=32, max_file_load: int=None) -> AsyncGenerator[OCRResult, None]:
         """
-
+        Internal method to asynchronously process an iterable of file paths.
+        Yields OCRResult objects as they complete. Order not guaranteed.
+        concurrent_batch_size controls how many VLM calls are made concurrently.
         """
-
-
+        vlm_call_semaphore = asyncio.Semaphore(concurrent_batch_size)
+        file_load_semaphore = asyncio.Semaphore(max_file_load)
+
+        tasks = []
         for file_path in file_paths:
+            task = self._ocr_file_with_semaphore(file_load_semaphore=file_load_semaphore,
+                                                 vlm_call_semaphore=vlm_call_semaphore,
+                                                 file_path=file_path,
+                                                 rotate_correction=rotate_correction,
+                                                 max_dimension_pixels=max_dimension_pixels)
+            tasks.append(task)
+
+
+        for future in asyncio.as_completed(tasks):
+            result: OCRResult = await future
+            yield result
+
+    async def _ocr_file_with_semaphore(self, file_load_semaphore:asyncio.Semaphore, vlm_call_semaphore:asyncio.Semaphore,
+                                       file_path:str, rotate_correction:bool=False, max_dimension_pixels:int=None) -> OCRResult:
+        """
+        This internal method takes a semaphore and OCR a single file using the VLM inference engine.
+        """
+        async with file_load_semaphore:
+            filename = os.path.basename(file_path)
             file_ext = os.path.splitext(file_path)[1].lower()
-
-
-
-
-
-
-
-
-            else:
-                image = get_image_from_file(file_path)
-                flat_page_list.append({'file_path': file_path, 'file_type': "image", "image": image})
-
-        # Process images with asyncio.Semaphore
-        semaphore = asyncio.Semaphore(concurrent_batch_size)
-        async def semaphore_helper(page:List[Dict[str,str]], max_new_tokens:int, temperature:float, **kwrs):
+            result = OCRResult(input_dir=file_path, output_mode=self.output_mode)
+            # check file extension
+            if file_ext not in SUPPORTED_IMAGE_EXTS:
+                result.status = "error"
+                result.add_page(text=f"Unsupported file type: {file_ext}. Supported types are: {SUPPORTED_IMAGE_EXTS}",
+                                image_processing_status={})
+                return result
+
             try:
-
-
-
-
-
-
-
+                # Load images from file
+                if file_ext == '.pdf':
+                    data_loader = PDFDataLoader(file_path)
+                elif file_ext in ['.tif', '.tiff']:
+                    data_loader = TIFFDataLoader(file_path)
+                else:
+                    data_loader = ImageDataLoader(file_path)
+
+            except Exception as e:
+                result.status = "error"
+                result.add_page(text=f"Error processing file {filename}: {str(e)}", image_processing_status={})
+                return result
+
+            try:
+                page_processing_tasks = []
+                for page_index in range(data_loader.get_page_count()):
+                    task = self._ocr_page_with_semaphore(
+                        vlm_call_semaphore=vlm_call_semaphore,
+                        data_loader=data_loader,
+                        page_index=page_index,
+                        rotate_correction=rotate_correction,
+                        max_dimension_pixels=max_dimension_pixels
                     )
-
+                    page_processing_tasks.append(task)
+
+                if page_processing_tasks:
+                    processed_page_results = await asyncio.gather(*page_processing_tasks)
+                    for text, image_processing_status in processed_page_results:
+                        result.add_page(text=text, image_processing_status=image_processing_status)
+
             except Exception as e:
-
-
+                result.status = "error"
+                result.add_page(text=f"Error during OCR for {filename}: {str(e)}", image_processing_status={})
+                return result

-
-
-
-                page,
-                max_new_tokens=max_new_tokens,
-                temperature=temperature,
-                **kwrs
-            )
-            tasks.append(asyncio.create_task(async_task))
+            # Set status to success if no errors occurred
+            result.status = "success"
+            return result

-
+    async def _ocr_page_with_semaphore(self, vlm_call_semaphore: asyncio.Semaphore, data_loader: DataLoader,
+                                       page_index:int, rotate_correction:bool=False, max_dimension_pixels:int=None) -> Tuple[str, Dict[str, str]]:
+        """
+        This internal method takes a semaphore and OCR a single image/page using the VLM inference engine.

-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+        Returns:
+        -------
+        Tuple[str, Dict[str, str]]
+            A tuple containing the OCR text and a dictionary with image processing status.
+        """
+        async with vlm_call_semaphore:
+            image = await data_loader.get_page_async(page_index)
+            image_processing_status = {}
+            # Apply rotate correction if specified and tesseract is available
+            if rotate_correction and self.image_processor.has_tesseract:
+                try:
+                    image, rotation_angle = await self.image_processor.rotate_correction_async(image)
+                    image_processing_status["rotate_correction"] = {
+                        "status": "success",
+                        "rotation_angle": rotation_angle
+                    }
+                except Exception as e:
+                    image_processing_status["rotate_correction"] = {
+                        "status": "error",
+                        "error": str(e)
+                    }
+
+            # Resize the image if max_dimension_pixels is specified
+            if max_dimension_pixels is not None:
+                try:
+                    image, resized = await self.image_processor.resize_async(image, max_dimension_pixels=max_dimension_pixels)
+                    image_processing_status["resize"] = {
+                        "status": "success",
+                        "resized": resized
+                    }
+                except Exception as e:
+                    image_processing_status["resize"] = {
+                        "status": "error",
+                        "error": str(e)
+                    }
+
+            messages = self.vlm_engine.get_ocr_messages(self.system_prompt, self.user_prompt, image)
+            ocr_text = await self.vlm_engine.chat_async(
+                messages,
+            )
+            # Clean the OCR text if output mode is markdown
+            if self.output_mode == "markdown":
+                ocr_text = clean_markdown(ocr_text)
+
+            # Parse the response if output mode is JSON
+            if self.output_mode == "JSON":
+                json_list = extract_json(ocr_text)
+                # Serialize the JSON list to a string
+                ocr_text = json.dumps(json_list, indent=4)

+            return ocr_text, image_processing_status
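
The batch APIs added in this hunk return OCRResult objects rather than plain strings: sequential_ocr returns a list, one result per input file, while concurrent_ocr hands back an async generator that yields results in completion order, throttled by the two semaphores shown above. A minimal sketch with placeholder paths and the hypothetical md_engine from the earlier examples; only the OCRResult behaviour visible in this diff (status, len(), iteration over pages) is assumed:

    # Sketch only: file paths and md_engine are placeholders.
    import asyncio

    paths = ["report.pdf", "scan.tiff", "receipt.png"]

    # Synchronous batch: a list of OCRResult, one per input file.
    results = md_engine.sequential_ocr(paths, max_dimension_pixels=2048, verbose=False)
    for result in results:
        if result.status == "success":
            for page in result:   # pages as stored via add_page; data_types.py is not shown here
                print(page)

    # Concurrent batch: results arrive as soon as each file finishes.
    async def run():
        async for result in md_engine.concurrent_ocr(paths, concurrent_batch_size=8):
            print(result.status)  # "success" or "error", as set in the methods above

    asyncio.run(run())
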