vlm4ocr 0.0.1__tar.gz → 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {vlm4ocr-0.0.1 → vlm4ocr-0.1.0}/PKG-INFO +2 -2
- {vlm4ocr-0.0.1 → vlm4ocr-0.1.0}/pyproject.toml +4 -2
- vlm4ocr-0.1.0/vlm4ocr/assets/default_prompt_templates/ocr_HTML_system_prompt.txt +1 -0
- vlm4ocr-0.1.0/vlm4ocr/assets/default_prompt_templates/ocr_HTML_user_prompt.txt +1 -0
- vlm4ocr-0.1.0/vlm4ocr/assets/default_prompt_templates/ocr_text_user_prompt.txt +1 -0
- vlm4ocr-0.1.0/vlm4ocr/cli.py +378 -0
- {vlm4ocr-0.0.1 → vlm4ocr-0.1.0}/vlm4ocr/ocr_engines.py +58 -45
- {vlm4ocr-0.0.1 → vlm4ocr-0.1.0}/vlm4ocr/utils.py +26 -9
- {vlm4ocr-0.0.1 → vlm4ocr-0.1.0}/vlm4ocr/vlm_engines.py +1 -1
- {vlm4ocr-0.0.1 → vlm4ocr-0.1.0}/README.md +0 -0
- {vlm4ocr-0.0.1 → vlm4ocr-0.1.0}/vlm4ocr/__init__.py +0 -0
- {vlm4ocr-0.0.1 → vlm4ocr-0.1.0}/vlm4ocr/assets/default_prompt_templates/ocr_markdown_system_prompt.txt +0 -0
- /vlm4ocr-0.0.1/vlm4ocr/assets/default_prompt_templates/ocr_user_prompt.txt → /vlm4ocr-0.1.0/vlm4ocr/assets/default_prompt_templates/ocr_markdown_user_prompt.txt +0 -0
- {vlm4ocr-0.0.1 → vlm4ocr-0.1.0}/vlm4ocr/assets/default_prompt_templates/ocr_text_system_prompt.txt +0 -0

{vlm4ocr-0.0.1 → vlm4ocr-0.1.0}/PKG-INFO

@@ -1,7 +1,7 @@
 Metadata-Version: 2.1
 Name: vlm4ocr
-Version: 0.0.1
-Summary: OCR with vision language models.
+Version: 0.1.0
+Summary: Python package and Web App for OCR with vision language models.
 License: MIT
 Author: Enshuo (David) Hsu
 Requires-Python: >=3.11,<4.0

{vlm4ocr-0.0.1 → vlm4ocr-0.1.0}/pyproject.toml

@@ -1,7 +1,7 @@
 [tool.poetry]
 name = "vlm4ocr"
-version = "0.0.1"
-description = "OCR with vision language models."
+version = "0.1.0"
+description = "Python package and Web App for OCR with vision language models."
 authors = ["Enshuo (David) Hsu"]
 license = "MIT"
 readme = "README.md"
@@ -17,6 +17,8 @@ python = "^3.11"
 pdf2image = ">=1.16.0"
 pillow = ">=10.0.0"
 
+[tool.poetry.scripts]
+vlm4ocr = "vlm4ocr.cli:main"
 
 [build-system]
 requires = ["poetry-core"]
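The new [tool.poetry.scripts] table above registers a vlm4ocr console command that Poetry wires to vlm4ocr.cli:main. As a minimal sketch, the same entry point can be driven from Python; the input file name here is a hypothetical placeholder, while the flags come from the cli.py added below:

    import sys
    from vlm4ocr.cli import main

    # Equivalent to: vlm4ocr --input_path scan.pdf --vlm_engine ollama --model llava:latest
    sys.argv = [
        "vlm4ocr",
        "--input_path", "scan.pdf",   # hypothetical input file
        "--vlm_engine", "ollama",
        "--model", "llava:latest",
        "--output_mode", "markdown",
    ]
    main()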
vlm4ocr-0.1.0/vlm4ocr/assets/default_prompt_templates/ocr_HTML_system_prompt.txt (new file)

@@ -0,0 +1 @@
+You are a helpful assistant that can convert scanned documents into functional HTML. Your output is accurate and well-formatted, starting with <html> and ending with </html>. You will only output the HTML without any additional explanations or comments. The HTML should include all text, tables, and lists with appropriate tags (e.g., "table", "tbody", "tr", "li") and styles (e.g., "font-family", "color", "font-size") that represent the text contents in the input. You will ignore images, icons, or anything that cannot be converted into text.
vlm4ocr-0.1.0/vlm4ocr/assets/default_prompt_templates/ocr_HTML_user_prompt.txt (new file)

@@ -0,0 +1 @@
+Convert contents in this image into HTML.
vlm4ocr-0.1.0/vlm4ocr/assets/default_prompt_templates/ocr_text_user_prompt.txt (new file)

@@ -0,0 +1 @@
+Convert contents in this image into plain text.
vlm4ocr-0.1.0/vlm4ocr/cli.py (new file)

@@ -0,0 +1,378 @@
+# vlm4ocr/cli.py
+
+import argparse
+import os
+import sys
+import logging
+
+# Attempt to import from the local package structure
+# This allows running the script directly for development,
+# assuming the script is in vlm4ocr/vlm4ocr/cli.py and the package root is vlm4ocr/vlm4ocr
+try:
+    from .ocr_engines import OCREngine
+    from .vlm_engines import OpenAIVLMEngine, AzureOpenAIVLMEngine, OllamaVLMEngine
+except ImportError:
+    # Fallback for when the package is installed and cli.py is run as part of it
+    from vlm4ocr.ocr_engines import OCREngine
+    from vlm4ocr.vlm_engines import OpenAIVLMEngine, AzureOpenAIVLMEngine, OllamaVLMEngine
+
+# Configure basic logging
+logging.basicConfig(level=logging.INFO, format='%(levelname)s: %(message)s')
+logger = logging.getLogger(__name__)
+
+# Define supported extensions here; ideally this should be sourced from ocr_engines.py
+SUPPORTED_IMAGE_EXTS_CLI = ['.pdf', '.tif', '.tiff', '.png', '.jpg', '.jpeg', '.bmp', '.gif', '.webp']
+OUTPUT_EXTENSIONS = {'markdown': '.md', 'HTML': '.html', 'text': '.txt'}
+
+def main():
+    """
+    Main function for the vlm4ocr CLI.
+    Parses arguments, initializes engines, runs OCR, and handles output.
+    """
+    parser = argparse.ArgumentParser(
+        description="VLM4OCR: Perform OCR on images, PDFs, or TIFF files using Vision Language Models.",
+        formatter_class=argparse.ArgumentDefaultsHelpFormatter
+    )
+
+    # --- Input/Output Arguments ---
+    io_group = parser.add_argument_group("Input/Output Options")
+    io_group.add_argument(
+        "--input_path",
+        required=True,
+        help="Path to the input image, PDF, or TIFF file, or a directory containing these files. "
+             "If a directory is provided, all supported files within will be processed."
+    )
+    io_group.add_argument(
+        "--output_mode",
+        choices=["markdown", "HTML", "text"],
+        default="markdown",
+        help="Desired output format for the OCR results."
+    )
+    io_group.add_argument(
+        "--output_file",
+        help="Optional: Path to a file to save the output. "
+             "If input_path is a directory, this should be a directory where results will be saved "
+             "(one file per input, with original name and new extension). "
+             "If not provided, output is written to files in the current working directory "
+             "(e.g., 'input_name_ocr.output_mode')."
+    )
+
+    # --- VLM Engine Selection ---
+    vlm_engine_group = parser.add_argument_group("VLM Engine Selection")
+    vlm_engine_group.add_argument(
+        "--vlm_engine",
+        choices=["openai", "azure_openai", "ollama", "openai_compatible"],
+        required=True,
+        help="Specify the VLM engine to use."
+    )
+    vlm_engine_group.add_argument(
+        "--model",
+        required=True,
+        help="The specific model identifier for the chosen VLM engine. "
+             "E.g., 'gpt-4o' for OpenAI, 'deployment-name' for Azure, "
+             "'Qwen/Qwen2.5-VL-7B-Instruct' for OpenAI-compatible, "
+             "or 'llava:latest' for Ollama."
+    )
+
+    # --- OpenAI Engine Arguments ---
+    openai_group = parser.add_argument_group("OpenAI & OpenAI-Compatible Options")
+    openai_group.add_argument(
+        "--api_key",
+        default=os.environ.get("OPENAI_API_KEY"),
+        help="API key for OpenAI or OpenAI-compatible service. "
+             "Can also be set via OPENAI_API_KEY environment variable."
+    )
+    openai_group.add_argument(
+        "--base_url",
+        help="Base URL for OpenAI-compatible services (e.g., vLLM endpoint like 'http://localhost:8000/v1'). "
+             "Not used for official OpenAI API."
+    )
+
+    # --- Azure OpenAI Engine Arguments ---
+    azure_group = parser.add_argument_group("Azure OpenAI Options")
+    azure_group.add_argument(
+        "--azure_api_key",
+        default=os.environ.get("AZURE_OPENAI_API_KEY"),
+        help="API key for Azure OpenAI service. "
+             "Can also be set via AZURE_OPENAI_API_KEY environment variable."
+    )
+    azure_group.add_argument(
+        "--azure_endpoint",
+        default=os.environ.get("AZURE_OPENAI_ENDPOINT"),
+        help="Endpoint URL for Azure OpenAI service. "
+             "Can also be set via AZURE_OPENAI_ENDPOINT environment variable."
+    )
+    azure_group.add_argument(
+        "--azure_api_version",
+        default=os.environ.get("AZURE_OPENAI_API_VERSION"),
+        help="API version for Azure OpenAI service (e.g., '2024-02-01'). "
+             "Can also be set via AZURE_OPENAI_API_VERSION environment variable."
+    )
+
+    # --- Ollama Engine Arguments ---
+    ollama_group = parser.add_argument_group("Ollama Options")
+    ollama_group.add_argument(
+        "--ollama_host",
+        default="http://localhost:11434",
+        help="Host URL for the Ollama server."
+    )
+    ollama_group.add_argument(
+        "--ollama_num_ctx",
+        type=int,
+        default=4096,
+        help="Context length for Ollama models."
+    )
+    ollama_group.add_argument(
+        "--ollama_keep_alive",
+        type=int,
+        default=300,  # Default from OllamaVLMEngine
+        help="Seconds to keep the Ollama model loaded after the last call."
+    )
+
+
+    # --- OCR Engine Parameters ---
+    ocr_params_group = parser.add_argument_group("OCR Engine Parameters")
+    ocr_params_group.add_argument(
+        "--user_prompt",
+        help="Optional: Custom user prompt to provide context about the image/PDF/TIFF."
+    )
+    # REMOVED --system_prompt argument
+    ocr_params_group.add_argument(
+        "--max_new_tokens",
+        type=int,
+        default=4096,  # Default from OCREngine
+        help="Maximum number of new tokens the VLM can generate."
+    )
+    ocr_params_group.add_argument(
+        "--temperature",
+        type=float,
+        default=0.0,  # Default from OCREngine
+        help="Temperature for token sampling (0.0 for deterministic output)."
+    )
+
+    # --- Processing Options ---
+    processing_group = parser.add_argument_group("Processing Options")
+    processing_group.add_argument(
+        "--concurrent",
+        action="store_true",
+        help="Enable concurrent processing for multiple files or PDF/TIFF pages."
+    )
+    processing_group.add_argument(
+        "--concurrent_batch_size",
+        type=int,
+        default=32,
+        help="Batch size for concurrent processing."
+    )
+    processing_group.add_argument(
+        "--verbose",
+        action="store_true",
+        help="Enable verbose output from the OCR engine during processing. CLI will also log more info."
+    )
+    processing_group.add_argument(
+        "--debug",
+        action="store_true",
+        help="Enable debug level logging for more detailed information."
+    )
+
+    args = parser.parse_args()
+
+    if args.debug:
+        logging.getLogger().setLevel(logging.DEBUG)
+        logger.setLevel(logging.DEBUG)
+        logger.debug("Debug mode enabled.")
+        logger.debug(f"Parsed arguments: {args}")
+    elif args.verbose:
+        logger.setLevel(logging.INFO)  # Ensure logger level is at least INFO for verbose CLI output
+
+    # --- Validate Arguments ---
+    # verbose is not supported with concurrent processing
+    if args.verbose and args.concurrent:
+        logger.warning("Verbose output is not supported with concurrent processing. "
+                       "Verbose mode will be ignored.")
+        args.verbose = False
+
+    # --- Initialize VLM Engine ---
+    vlm_engine_instance = None
+    try:
+        logger.info(f"Initializing VLM engine: {args.vlm_engine} with model: {args.model}")
+        if args.vlm_engine == "openai":
+            if not args.api_key:
+                parser.error("--api_key (or OPENAI_API_KEY env var) is required for OpenAI engine.")
+            vlm_engine_instance = OpenAIVLMEngine(
+                model=args.model,
+                api_key=args.api_key
+                # reasoning_model removed
+            )
+        elif args.vlm_engine == "openai_compatible":
+            if not args.api_key:
+                logger.warning("API key not provided or empty for openai_compatible. This might be acceptable for some servers (e.g. if 'EMPTY' is expected).")
+            if not args.base_url:
+                parser.error("--base_url is required for openai_compatible engine.")
+            vlm_engine_instance = OpenAIVLMEngine(
+                model=args.model,
+                api_key=args.api_key,
+                base_url=args.base_url
+                # reasoning_model removed
+            )
+        elif args.vlm_engine == "azure_openai":
+            if not args.azure_api_key:
+                parser.error("--azure_api_key (or AZURE_OPENAI_API_KEY env var) is required for Azure OpenAI engine.")
+            if not args.azure_endpoint:
+                parser.error("--azure_endpoint (or AZURE_OPENAI_ENDPOINT env var) is required for Azure OpenAI engine.")
+            if not args.azure_api_version:
+                parser.error("--azure_api_version (or AZURE_OPENAI_API_VERSION env var) is required for Azure OpenAI engine.")
+            vlm_engine_instance = AzureOpenAIVLMEngine(
+                model=args.model,
+                api_key=args.azure_api_key,
+                azure_endpoint=args.azure_endpoint,
+                api_version=args.azure_api_version
+                # reasoning_model removed
+            )
+        elif args.vlm_engine == "ollama":
+            vlm_engine_instance = OllamaVLMEngine(
+                model_name=args.model,  # OllamaVLMEngine expects model_name
+                host=args.ollama_host,
+                num_ctx=args.ollama_num_ctx,
+                keep_alive=args.ollama_keep_alive
+            )
+        else:
+            # This case should be caught by argparse choices, but as a safeguard:
+            logger.error(f"Invalid VLM engine specified: {args.vlm_engine}")
+            sys.exit(1)
+        logger.info("VLM engine initialized successfully.")
+
+    except ImportError as e:
+        logger.error(f"Failed to import a required library for {args.vlm_engine}: {e}. "
+                     "Please ensure the necessary dependencies (e.g., 'openai', 'ollama') are installed.")
+        sys.exit(1)
+    except Exception as e:
+        logger.error(f"Error initializing VLM engine '{args.vlm_engine}': {e}")
+        if args.debug:
+            logger.exception("Traceback for VLM engine initialization error:")
+        sys.exit(1)
+
+    # --- Initialize OCR Engine ---
+    try:
+        logger.info(f"Initializing OCR engine with output mode: {args.output_mode}")
+        ocr_engine_instance = OCREngine(
+            vlm_engine=vlm_engine_instance,
+            output_mode=args.output_mode,
+            # system_prompt removed, OCREngine will use its default
+            user_prompt=args.user_prompt
+        )
+        logger.info("OCR engine initialized successfully.")
+    except Exception as e:
+        logger.error(f"Error initializing OCR engine: {e}")
+        if args.debug:
+            logger.exception("Traceback for OCR engine initialization error:")
+        sys.exit(1)
+
+    # --- Prepare input file paths ---
+    input_files_to_process = []
+    if os.path.isdir(args.input_path):
+        logger.info(f"Input path is a directory: {args.input_path}. Scanning for supported files...")
+        for item in os.listdir(args.input_path):
+            item_path = os.path.join(args.input_path, item)
+            if os.path.isfile(item_path):
+                file_ext = os.path.splitext(item)[1].lower()
+                if file_ext in SUPPORTED_IMAGE_EXTS_CLI:
+                    input_files_to_process.append(item_path)
+        if not input_files_to_process:
+            logger.error(f"No supported files (PDF, TIFF, PNG, JPG, etc.) found in directory: {args.input_path}")
+            sys.exit(1)
+        logger.info(f"Found {len(input_files_to_process)} supported files to process.")
+    elif os.path.isfile(args.input_path):
+        file_ext = os.path.splitext(args.input_path)[1].lower()
+        if file_ext not in SUPPORTED_IMAGE_EXTS_CLI:
+            logger.error(f"Input file '{args.input_path}' is not a supported file type. Supported: {SUPPORTED_IMAGE_EXTS_CLI}")
+            sys.exit(1)
+        input_files_to_process = [args.input_path]
+        logger.info(f"Processing single input file: {args.input_path}")
+    else:
+        logger.error(f"Input path is not a valid file or directory: {args.input_path}")
+        sys.exit(1)
+
+
+    # --- Run OCR ---
+    try:
+        logger.info("Starting OCR processing...")
+        ocr_results_list = ocr_engine_instance.run_ocr(
+            file_paths=input_files_to_process,
+            max_new_tokens=args.max_new_tokens,
+            temperature=args.temperature,
+            verbose=args.verbose,
+            concurrent=args.concurrent,
+            concurrent_batch_size=args.concurrent_batch_size
+        )
+        logger.info("OCR processing completed.")
+
+        # --- Handle Output ---
+        if args.output_file:
+            if os.path.isdir(args.input_path) and len(input_files_to_process) > 1:
+                if not os.path.exists(args.output_file):
+                    logger.info(f"Creating output directory: {args.output_file}")
+                    os.makedirs(args.output_file, exist_ok=True)
+                elif not os.path.isdir(args.output_file):
+                    logger.error(f"Output path '{args.output_file}' exists and is not a directory, "
+                                 "but multiple input files were processed. Please specify a directory for --output_file.")
+                    sys.exit(1)
+
+                output_target_dir = args.output_file
+            elif not (os.path.isdir(args.input_path) and len(input_files_to_process) > 1):
+                # Single input file, or directory with one file. output_file is a direct file path.
+                # Ensure its directory exists.
+                output_target_dir = os.path.dirname(args.output_file)
+                if output_target_dir and not os.path.exists(output_target_dir):
+                    logger.info(f"Creating output directory: {output_target_dir}")
+                    os.makedirs(output_target_dir, exist_ok=True)
+            else:  # Should not happen if logic above is correct
+                output_target_dir = os.getcwd()
+
+
+            for i, input_file_path in enumerate(input_files_to_process):
+                if os.path.isdir(args.input_path) and len(input_files_to_process) > 1:
+                    # Multiple inputs, save into the directory specified by args.output_file
+                    base_name = os.path.basename(input_file_path)
+                    name_part, _ = os.path.splitext(base_name)
+                    output_filename = f"{name_part}_ocr{OUTPUT_EXTENSIONS[args.output_mode]}"
+                    full_output_path = os.path.join(args.output_file, output_filename)
+                else:
+                    # Single input, args.output_file is the exact path
+                    full_output_path = args.output_file
+
+                try:
+                    with open(full_output_path, "w", encoding="utf-8") as f:
+                        f.write(ocr_results_list[i])
+                    logger.info(f"OCR result for '{input_file_path}' saved to: {full_output_path}")
+                except Exception as e:
+                    logger.error(f"Error writing output for '{input_file_path}' to '{full_output_path}': {e}")
+        else:
+            # No --output_file specified, save to current working directory
+            current_dir = os.getcwd()
+            logger.info(f"No --output_file specified. Results will be saved to the current working directory: {current_dir}")
+            for i, input_file_path in enumerate(input_files_to_process):
+                base_name = os.path.basename(input_file_path)
+                name_part, _ = os.path.splitext(base_name)
+                output_filename = f"{name_part}_ocr{OUTPUT_EXTENSIONS[args.output_mode]}"
+                full_output_path = os.path.join(current_dir, output_filename)
+                try:
+                    with open(full_output_path, "w", encoding="utf-8") as f:
+                        f.write(ocr_results_list[i])
+                    logger.info(f"OCR result for '{input_file_path}' saved to: {full_output_path}")
+                except Exception as e:
+                    logger.error(f"Error writing output for '{input_file_path}' to '{full_output_path}': {e}")

+    except FileNotFoundError as e:
+        logger.error(f"File not found during OCR processing: {e}")
+        sys.exit(1)
+    except ValueError as e:
+        logger.error(f"Input Error or Value Error during processing: {e}")
+        sys.exit(1)
+    except Exception as e:
+        logger.error(f"An unexpected error occurred during OCR processing: {e}")
+        if args.debug:
+            logger.exception("Traceback for OCR processing error:")
+        sys.exit(1)
+
+if __name__ == "__main__":
+    main()
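For reference, the pieces cli.py wires together can also be used as a library. A minimal sketch based only on the signatures visible in this diff; the file names are hypothetical placeholders and the keyword values mirror the CLI defaults:

    from vlm4ocr.ocr_engines import OCREngine
    from vlm4ocr.vlm_engines import OllamaVLMEngine

    # Build a VLM engine, then an OCR engine, the same way main() does
    vlm = OllamaVLMEngine(model_name="llava:latest",
                          host="http://localhost:11434",
                          num_ctx=4096, keep_alive=300)
    ocr = OCREngine(vlm_engine=vlm, output_mode="markdown")

    # run_ocr accepts a single path or an iterable of paths
    results = ocr.run_ocr(file_paths=["scan_1.png", "report.pdf"],
                          max_new_tokens=4096, temperature=0.0)
    for text in results:
        print(text)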
{vlm4ocr-0.0.1 → vlm4ocr-0.1.0}/vlm4ocr/ocr_engines.py

@@ -2,13 +2,14 @@ import os
 from typing import List, Dict, Union, Generator, Iterable
 import importlib
 import asyncio
-from vlm4ocr.utils import get_images_from_pdf, get_image_from_file, clean_markdown
+from vlm4ocr.utils import get_images_from_pdf, get_images_from_tiff, get_image_from_file, clean_markdown
 from vlm4ocr.vlm_engines import VLMEngine
 
-SUPPORTED_IMAGE_EXTS = ['.png', '.jpg', '.jpeg', '.bmp', '.gif', '.webp']
+SUPPORTED_IMAGE_EXTS = ['.pdf', '.tif', '.tiff', '.png', '.jpg', '.jpeg', '.bmp', '.gif', '.webp']
+
 
 class OCREngine:
-    def __init__(self, vlm_engine:VLMEngine, output_mode:str="markdown", system_prompt:str=None, user_prompt:str=None, page_delimiter:str="\n\n---\n\n"):
+    def __init__(self, vlm_engine:VLMEngine, output_mode:str="markdown", system_prompt:str=None, user_prompt:str=None, page_delimiter:str="auto"):
         """
         This class inputs an image or PDF file path and processes them using a VLM inference engine. Outputs plain text or markdown.
 
@@ -17,13 +18,17 @@ class OCREngine:
         inference_engine : InferenceEngine
             The inference engine to use for OCR.
         output_mode : str, Optional
-            The output format.
+            The output format. Must be 'markdown', 'HTML', or 'text'.
         system_prompt : str, Optional
             Custom system prompt. We recommend using the default system prompt by leaving this blank.
         user_prompt : str, Optional
             Custom user prompt. It is good to include some information regarding the document. If not specified, a default will be used.
         page_delimiter : str, Optional
             The delimiter to use between PDF pages.
+            If 'auto', it will be set to the default page delimiter for the output mode:
+                'markdown' -> '\n\n---\n\n'
+                'HTML' -> '<br><br>'
+                'text' -> '\n\n---\n\n'
         """
         # Check inference engine
         if not isinstance(vlm_engine, VLMEngine):
@@ -31,8 +36,8 @@ class OCREngine:
         self.vlm_engine = vlm_engine
 
         # Check output mode
-        if output_mode not in ["markdown", "text"]:
-            raise ValueError("output_mode must be 'markdown' or 'text'")
+        if output_mode not in ["markdown", "HTML", "text"]:
+            raise ValueError("output_mode must be 'markdown', 'HTML', or 'text'")
         self.output_mode = output_mode
 
         # System prompt
@@ -47,13 +52,21 @@ class OCREngine:
         if isinstance(user_prompt, str) and user_prompt:
             self.user_prompt = user_prompt
         else:
-            file_path = importlib.resources.files('vlm4ocr.assets.default_prompt_templates').joinpath('ocr_user_prompt.txt')
+            file_path = importlib.resources.files('vlm4ocr.assets.default_prompt_templates').joinpath(f'ocr_{self.output_mode}_user_prompt.txt')
             with open(file_path, 'r', encoding='utf-8') as f:
                 self.user_prompt = f.read()
 
         # Page delimiter
         if isinstance(page_delimiter, str):
-            self.page_delimiter = page_delimiter
+            if page_delimiter == "auto":
+                if self.output_mode == "markdown":
+                    self.page_delimiter = "\n\n---\n\n"
+                elif self.output_mode == "HTML":
+                    self.page_delimiter = "<br><br>"
+                else:
+                    self.page_delimiter = "\n\n---\n\n"
+            else:
+                self.page_delimiter = page_delimiter
         else:
             raise ValueError("page_delimiter must be a string")
 
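With the new 'auto' default, the page delimiter follows the output mode, while any other string is still used verbatim. A brief sketch, reusing the vlm engine from the sketch above; the form-feed delimiter is a hypothetical choice:

    # 'auto' resolves to '\n\n---\n\n' for markdown and text, '<br><br>' for HTML
    ocr_html = OCREngine(vlm_engine=vlm, output_mode="HTML")
    # An explicit string keeps the old behavior of being used as-is
    ocr_custom = OCREngine(vlm_engine=vlm, output_mode="text", page_delimiter="\f")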
@@ -61,16 +74,17 @@ class OCREngine:
     def stream_ocr(self, file_path: str, max_new_tokens:int=4096, temperature:float=0.0, **kwrs) -> Generator[str, None, None]:
         """
         This method inputs a file path (image or PDF) and streams OCR results in real-time. This is useful for frontend applications.
+        Yields dictionaries with 'type' ('ocr_chunk' or 'page_delimiter') and 'data'.
 
         Parameters:
         -----------
         file_path : str
-            The path to the image or PDF file. Must be one of '.pdf', '.png', '.jpg', '.jpeg', '.bmp', '.gif', '.webp'
+            The path to the image or PDF file. Must be one of '.pdf', '.tiff', '.png', '.jpg', '.jpeg', '.bmp', '.gif', '.webp'
         max_new_tokens : int, Optional
             The maximum number of tokens to generate.
         temperature : float, Optional
             The temperature to use for sampling.
-
+
         Returns:
         --------
         Generator[str, None, None]
@@ -82,14 +96,14 @@ class OCREngine:
 
         # Check file extension
         file_ext = os.path.splitext(file_path)[1].lower()
-        if file_ext not in SUPPORTED_IMAGE_EXTS + ['.pdf']:
-            raise ValueError(f"Unsupported file type: {file_ext}. Supported types are: {SUPPORTED_IMAGE_EXTS + ['.pdf']}")
+        if file_ext not in SUPPORTED_IMAGE_EXTS:
+            raise ValueError(f"Unsupported file type: {file_ext}. Supported types are: {SUPPORTED_IMAGE_EXTS}")
 
-        # PDF
-        if file_ext == '.pdf':
-            images = get_images_from_pdf(file_path)
+        # PDF or TIFF
+        if file_ext in ['.pdf', '.tif', '.tiff']:
+            images = get_images_from_pdf(file_path) if file_ext == '.pdf' else get_images_from_tiff(file_path)
             if not images:
-                raise ValueError(f"No images extracted from PDF: {file_path}")
+                raise ValueError(f"No images extracted from file: {file_path}")
             for i, image in enumerate(images):
                 messages = self.vlm_engine.get_ocr_messages(self.system_prompt, self.user_prompt, image)
                 response_stream = self.vlm_engine.chat(
@@ -100,10 +114,10 @@ class OCREngine:
                     **kwrs
                 )
                 for chunk in response_stream:
-                    yield chunk
+                    yield {"type": "ocr_chunk", "data": chunk}
 
                 if i < len(images) - 1:
-                    yield self.page_delimiter
+                    yield {"type": "page_delimiter", "data": self.page_delimiter}
 
         # Image
         else:
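Because stream_ocr now yields typed dictionaries rather than bare strings, a consumer can render page boundaries separately from OCR text. A small consumption sketch, assuming an OCREngine instance ocr as above; the input path is hypothetical:

    for event in ocr.stream_ocr("report.pdf"):
        if event["type"] == "ocr_chunk":
            print(event["data"], end="", flush=True)  # OCR text as it streams in
        elif event["type"] == "page_delimiter":
            print(event["data"])                      # marks a page boundary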
@@ -117,18 +131,18 @@ class OCREngine:
                 **kwrs
             )
             for chunk in response_stream:
-                yield chunk
+                yield {"type": "ocr_chunk", "data": chunk}
 
 
     def run_ocr(self, file_paths: Union[str, Iterable[str]], max_new_tokens:int=4096, temperature:float=0.0,
                 verbose:bool=False, concurrent:bool=False, concurrent_batch_size:int=32, **kwrs) -> Union[str, Generator[str, None, None]]:
         """
-        This method takes a list of file paths (image or PDF) and performs OCR using the VLM inference engine.
+        This method takes a list of file paths (image, PDF, TIFF) and performs OCR using the VLM inference engine.
 
         Parameters:
         -----------
         file_paths : Union[str, Iterable[str]]
-            A file path or a list of file paths to process. Must be one of '.pdf', '.png', '.jpg', '.jpeg', '.bmp', '.gif', '.webp'
+            A file path or a list of file paths to process. Must be one of '.pdf', '.tif', '.tiff', '.png', '.jpg', '.jpeg', '.bmp', '.gif', '.webp'
         max_new_tokens : int, Optional
             The maximum number of tokens to generate.
         temperature : float, Optional
@@ -152,9 +166,8 @@ class OCREngine:
             if not isinstance(file_path, str):
                 raise TypeError("file_paths must be a string or an iterable of strings")
             file_ext = os.path.splitext(file_path)[1].lower()
-            if file_ext not in SUPPORTED_IMAGE_EXTS + ['.pdf']:
-                raise ValueError(f"Unsupported file type: {file_ext}. Supported types are: {SUPPORTED_IMAGE_EXTS + ['.pdf']}")
-
+            if file_ext not in SUPPORTED_IMAGE_EXTS:
+                raise ValueError(f"Unsupported file type: {file_ext}. Supported types are: {SUPPORTED_IMAGE_EXTS}")
 
         # Concurrent processing
         if concurrent:
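When concurrent=True, run_ocr takes the concurrent path, which flattens every PDF/TIFF page into a single work list (see the @@ -248,13 +261,13 @@ hunk below) and processes it in batches. A sketch of enabling it, with hypothetical file names:

    texts = ocr.run_ocr(
        file_paths=["a.pdf", "b.tiff", "c.png"],
        concurrent=True,
        concurrent_batch_size=8,  # pages processed per batch
    )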
@@ -178,12 +191,12 @@ class OCREngine:
     def _run_ocr(self, file_paths: Union[str, Iterable[str]], max_new_tokens:int=4096,
                  temperature:float=0.0, verbose:bool=False, **kwrs) -> Iterable[str]:
         """
-        This method inputs a file path or a list of file paths (image or PDF) and performs OCR using the VLM inference engine.
+        This method inputs a file path or a list of file paths (image, PDF, TIFF) and performs OCR using the VLM inference engine.
 
         Parameters:
         -----------
         file_paths : Union[str, Iterable[str]]
-            A file path or a list of file paths to process. Must be one of '.pdf', '.png', '.jpg', '.jpeg', '.bmp', '.gif', '.webp'
+            A file path or a list of file paths to process. Must be one of '.pdf', '.tif', '.tiff', '.png', '.jpg', '.jpeg', '.bmp', '.gif', '.webp'
         max_new_tokens : int, Optional
             The maximum number of tokens to generate.
         temperature : float, Optional
@@ -199,12 +212,12 @@ class OCREngine:
         ocr_results = []
         for file_path in file_paths:
             file_ext = os.path.splitext(file_path)[1].lower()
-            # PDF
-            if file_ext == '.pdf':
-                images = get_images_from_pdf(file_path)
+            # PDF or TIFF
+            if file_ext in ['.pdf', '.tif', '.tiff']:
+                images = get_images_from_pdf(file_path) if file_ext == '.pdf' else get_images_from_tiff(file_path)
                 if not images:
-                    raise ValueError(f"No images extracted from PDF: {file_path}")
-
+                    raise ValueError(f"No images extracted from file: {file_path}")
+                results = []
                 for image in images:
                     messages = self.vlm_engine.get_ocr_messages(self.system_prompt, self.user_prompt, image)
                     response = self.vlm_engine.chat(
@@ -215,9 +228,9 @@ class OCREngine:
                         stream=False,
                         **kwrs
                     )
-
+                    results.append(response)
 
-                ocr_text = self.page_delimiter.join(
+                ocr_text = self.page_delimiter.join(results)
             # Image
             else:
                 image = get_image_from_file(file_path)
@@ -248,13 +261,13 @@ class OCREngine:
         flat_page_list = []
         for file_path in file_paths:
             file_ext = os.path.splitext(file_path)[1].lower()
-            # PDF
-            if file_ext == '.pdf':
-                images = get_images_from_pdf(file_path)
+            # PDF or TIFF
+            if file_ext in ['.pdf', '.tif', '.tiff']:
+                images = get_images_from_pdf(file_path) if file_ext == '.pdf' else get_images_from_tiff(file_path)
                 if not images:
-                    flat_page_list.append({'file_path': file_path, 'file_type': "PDF", "image": image, "page_num": 0, "total_page_count": 0})
+                    flat_page_list.append({'file_path': file_path, 'file_type': "PDF/TIFF", "image": image, "page_num": 0, "total_page_count": 0})
                 for page_num, image in enumerate(images):
-                    flat_page_list.append({'file_path': file_path, 'file_type': "PDF", "image": image, "page_num": page_num, "total_page_count": len(images)})
+                    flat_page_list.append({'file_path': file_path, 'file_type': "PDF/TIFF", "image": image, "page_num": page_num, "total_page_count": len(images)})
             # Image
             else:
                 image = get_image_from_file(file_path)
@@ -291,16 +304,16 @@ class OCREngine:
 
         # Restructure the results
         ocr_results = []
-
+        page_text_buffer = ""
         for page, ocr_text in zip(flat_page_list, responses):
-            # PDF
-            if page['file_type'] == "PDF":
-
+            # PDF or TIFF
+            if page['file_type'] == "PDF/TIFF":
+                page_text_buffer += ocr_text + self.page_delimiter
                 if page['page_num'] == page['total_page_count'] - 1:
                     if self.output_mode == "markdown":
-
-                        ocr_results.append(
-
+                        page_text_buffer = clean_markdown(page_text_buffer)
+                        ocr_results.append(page_text_buffer)
+                        page_text_buffer = ""
             # Image
             if page['file_type'] == "image":
                 if self.output_mode == "markdown":
{vlm4ocr-0.0.1 → vlm4ocr-0.1.0}/vlm4ocr/utils.py

@@ -16,17 +16,34 @@ def get_images_from_pdf(file_path: str) -> List[Image.Image]:
         print(f"Error converting PDF to images: {e}")
         raise ValueError(f"Failed to process PDF file '{os.path.basename(file_path)}'. Ensure poppler is installed and the file is valid.") from e
 
+def get_images_from_tiff(file_path: str) -> List[Image.Image]:
+    """ Extracts images from a TIFF file. """
+    images = []
+    try:
+        img = Image.open(file_path)
+        for i in range(img.n_frames):
+            img.seek(i)
+            images.append(img.copy())
+        if not images:
+            print(f"Warning: No images extracted from TIFF: {file_path}")
+        return images
+    except FileNotFoundError:
+        raise FileNotFoundError(f"TIFF file not found: {file_path}")
+    except Exception as e:
+        print(f"Error processing TIFF file: {e}")
+        raise ValueError(f"Failed to process TIFF file '{os.path.basename(file_path)}'. Ensure the file is a valid TIFF.") from e
+
 
 def get_image_from_file(file_path: str) -> Image.Image:
-
-
-
-
-
-
-
-
-
+    """ Loads a single image file. """
+    try:
+        image = Image.open(file_path)
+        image.load()
+        return image
+    except FileNotFoundError:
+        raise FileNotFoundError(f"Image file not found: {file_path}")
+    except Exception as e:
+        raise ValueError(f"Failed to load image file '{os.path.basename(file_path)}': {e}") from e
 
 
 def image_to_base64(image:Image.Image, format:str="png") -> str:
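The new get_images_from_tiff walks a multi-frame TIFF with Image.seek and Image.copy, returning one PIL image per page. A quick usage sketch; the file name is a hypothetical placeholder:

    from vlm4ocr.utils import get_images_from_tiff

    pages = get_images_from_tiff("multipage_scan.tiff")
    print(f"Extracted {len(pages)} page(s)")
    for i, page in enumerate(pages):
        page.save(f"page_{i}.png")  # e.g., inspect the extracted pages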
{vlm4ocr-0.0.1 → vlm4ocr-0.1.0}/README.md
File without changes

{vlm4ocr-0.0.1 → vlm4ocr-0.1.0}/vlm4ocr/__init__.py
File without changes

{vlm4ocr-0.0.1 → vlm4ocr-0.1.0}/vlm4ocr/assets/default_prompt_templates/ocr_markdown_system_prompt.txt
File without changes

/vlm4ocr-0.0.1/vlm4ocr/assets/default_prompt_templates/ocr_user_prompt.txt → /vlm4ocr-0.1.0/vlm4ocr/assets/default_prompt_templates/ocr_markdown_user_prompt.txt
RENAMED
File without changes

{vlm4ocr-0.0.1 → vlm4ocr-0.1.0}/vlm4ocr/assets/default_prompt_templates/ocr_text_system_prompt.txt
RENAMED
File without changes