xgen-doc2chunk 0.1.0__py3-none-any.whl → 0.1.1__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as published to their public registry. It is provided for informational purposes only.
Files changed (162)
  1. xgen_doc2chunk/__init__.py +42 -0
  2. xgen_doc2chunk/chunking/__init__.py +168 -0
  3. xgen_doc2chunk/chunking/chunking.py +786 -0
  4. xgen_doc2chunk/chunking/constants.py +134 -0
  5. xgen_doc2chunk/chunking/page_chunker.py +248 -0
  6. xgen_doc2chunk/chunking/protected_regions.py +715 -0
  7. xgen_doc2chunk/chunking/sheet_processor.py +406 -0
  8. xgen_doc2chunk/chunking/table_chunker.py +832 -0
  9. xgen_doc2chunk/chunking/table_parser.py +172 -0
  10. xgen_doc2chunk/chunking/text_chunker.py +443 -0
  11. xgen_doc2chunk/core/__init__.py +64 -0
  12. xgen_doc2chunk/core/document_processor.py +1307 -0
  13. xgen_doc2chunk/core/functions/__init__.py +85 -0
  14. xgen_doc2chunk/core/functions/chart_extractor.py +144 -0
  15. xgen_doc2chunk/core/functions/chart_processor.py +534 -0
  16. xgen_doc2chunk/core/functions/file_converter.py +220 -0
  17. xgen_doc2chunk/core/functions/img_processor.py +649 -0
  18. xgen_doc2chunk/core/functions/metadata_extractor.py +542 -0
  19. xgen_doc2chunk/core/functions/page_tag_processor.py +393 -0
  20. xgen_doc2chunk/core/functions/preprocessor.py +162 -0
  21. xgen_doc2chunk/core/functions/storage_backend.py +381 -0
  22. xgen_doc2chunk/core/functions/table_extractor.py +468 -0
  23. xgen_doc2chunk/core/functions/table_processor.py +299 -0
  24. xgen_doc2chunk/core/functions/utils.py +159 -0
  25. xgen_doc2chunk/core/processor/__init__.py +96 -0
  26. xgen_doc2chunk/core/processor/base_handler.py +544 -0
  27. xgen_doc2chunk/core/processor/csv_handler.py +135 -0
  28. xgen_doc2chunk/core/processor/csv_helper/__init__.py +89 -0
  29. xgen_doc2chunk/core/processor/csv_helper/csv_constants.py +63 -0
  30. xgen_doc2chunk/core/processor/csv_helper/csv_encoding.py +104 -0
  31. xgen_doc2chunk/core/processor/csv_helper/csv_file_converter.py +78 -0
  32. xgen_doc2chunk/core/processor/csv_helper/csv_image_processor.py +75 -0
  33. xgen_doc2chunk/core/processor/csv_helper/csv_metadata.py +168 -0
  34. xgen_doc2chunk/core/processor/csv_helper/csv_parser.py +225 -0
  35. xgen_doc2chunk/core/processor/csv_helper/csv_preprocessor.py +86 -0
  36. xgen_doc2chunk/core/processor/csv_helper/csv_table.py +266 -0
  37. xgen_doc2chunk/core/processor/doc_handler.py +579 -0
  38. xgen_doc2chunk/core/processor/doc_helpers/__init__.py +25 -0
  39. xgen_doc2chunk/core/processor/doc_helpers/doc_file_converter.py +160 -0
  40. xgen_doc2chunk/core/processor/doc_helpers/doc_image_processor.py +179 -0
  41. xgen_doc2chunk/core/processor/doc_helpers/doc_preprocessor.py +83 -0
  42. xgen_doc2chunk/core/processor/docx_handler.py +376 -0
  43. xgen_doc2chunk/core/processor/docx_helper/__init__.py +84 -0
  44. xgen_doc2chunk/core/processor/docx_helper/docx_chart_extractor.py +436 -0
  45. xgen_doc2chunk/core/processor/docx_helper/docx_constants.py +75 -0
  46. xgen_doc2chunk/core/processor/docx_helper/docx_file_converter.py +76 -0
  47. xgen_doc2chunk/core/processor/docx_helper/docx_image.py +145 -0
  48. xgen_doc2chunk/core/processor/docx_helper/docx_image_processor.py +410 -0
  49. xgen_doc2chunk/core/processor/docx_helper/docx_metadata.py +71 -0
  50. xgen_doc2chunk/core/processor/docx_helper/docx_paragraph.py +126 -0
  51. xgen_doc2chunk/core/processor/docx_helper/docx_preprocessor.py +82 -0
  52. xgen_doc2chunk/core/processor/docx_helper/docx_table_extractor.py +527 -0
  53. xgen_doc2chunk/core/processor/docx_helper/docx_table_processor.py +220 -0
  54. xgen_doc2chunk/core/processor/excel_handler.py +353 -0
  55. xgen_doc2chunk/core/processor/excel_helper/__init__.py +97 -0
  56. xgen_doc2chunk/core/processor/excel_helper/excel_chart_extractor.py +498 -0
  57. xgen_doc2chunk/core/processor/excel_helper/excel_file_converter.py +157 -0
  58. xgen_doc2chunk/core/processor/excel_helper/excel_image_processor.py +316 -0
  59. xgen_doc2chunk/core/processor/excel_helper/excel_layout_detector.py +739 -0
  60. xgen_doc2chunk/core/processor/excel_helper/excel_metadata.py +145 -0
  61. xgen_doc2chunk/core/processor/excel_helper/excel_preprocessor.py +83 -0
  62. xgen_doc2chunk/core/processor/excel_helper/excel_table_xls.py +357 -0
  63. xgen_doc2chunk/core/processor/excel_helper/excel_table_xlsx.py +361 -0
  64. xgen_doc2chunk/core/processor/excel_helper/excel_textbox.py +266 -0
  65. xgen_doc2chunk/core/processor/html_helper/__init__.py +7 -0
  66. xgen_doc2chunk/core/processor/html_helper/html_file_converter.py +92 -0
  67. xgen_doc2chunk/core/processor/html_helper/html_preprocessor.py +74 -0
  68. xgen_doc2chunk/core/processor/html_reprocessor.py +140 -0
  69. xgen_doc2chunk/core/processor/hwp_handler.py +401 -0
  70. xgen_doc2chunk/core/processor/hwp_helper/__init__.py +120 -0
  71. xgen_doc2chunk/core/processor/hwp_helper/hwp_chart_extractor.py +373 -0
  72. xgen_doc2chunk/core/processor/hwp_helper/hwp_constants.py +78 -0
  73. xgen_doc2chunk/core/processor/hwp_helper/hwp_decoder.py +106 -0
  74. xgen_doc2chunk/core/processor/hwp_helper/hwp_docinfo.py +174 -0
  75. xgen_doc2chunk/core/processor/hwp_helper/hwp_file_converter.py +60 -0
  76. xgen_doc2chunk/core/processor/hwp_helper/hwp_image_processor.py +413 -0
  77. xgen_doc2chunk/core/processor/hwp_helper/hwp_metadata.py +236 -0
  78. xgen_doc2chunk/core/processor/hwp_helper/hwp_preprocessor.py +82 -0
  79. xgen_doc2chunk/core/processor/hwp_helper/hwp_record.py +149 -0
  80. xgen_doc2chunk/core/processor/hwp_helper/hwp_recovery.py +217 -0
  81. xgen_doc2chunk/core/processor/hwp_helper/hwp_table.py +205 -0
  82. xgen_doc2chunk/core/processor/hwpx_handler.py +191 -0
  83. xgen_doc2chunk/core/processor/hwpx_helper/__init__.py +85 -0
  84. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_chart_extractor.py +464 -0
  85. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_constants.py +30 -0
  86. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_file_converter.py +70 -0
  87. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_image_processor.py +258 -0
  88. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_metadata.py +163 -0
  89. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_preprocessor.py +80 -0
  90. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_section.py +242 -0
  91. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_table_extractor.py +462 -0
  92. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_table_processor.py +220 -0
  93. xgen_doc2chunk/core/processor/image_file_handler.py +212 -0
  94. xgen_doc2chunk/core/processor/image_file_helper/__init__.py +17 -0
  95. xgen_doc2chunk/core/processor/image_file_helper/image_file_converter.py +69 -0
  96. xgen_doc2chunk/core/processor/image_file_helper/image_file_image_processor.py +123 -0
  97. xgen_doc2chunk/core/processor/image_file_helper/image_file_preprocessor.py +84 -0
  98. xgen_doc2chunk/core/processor/pdf_handler.py +597 -0
  99. xgen_doc2chunk/core/processor/pdf_helpers/__init__.py +229 -0
  100. xgen_doc2chunk/core/processor/pdf_helpers/pdf_block_image_engine.py +667 -0
  101. xgen_doc2chunk/core/processor/pdf_helpers/pdf_cell_analysis.py +493 -0
  102. xgen_doc2chunk/core/processor/pdf_helpers/pdf_complexity_analyzer.py +598 -0
  103. xgen_doc2chunk/core/processor/pdf_helpers/pdf_element_merger.py +46 -0
  104. xgen_doc2chunk/core/processor/pdf_helpers/pdf_file_converter.py +72 -0
  105. xgen_doc2chunk/core/processor/pdf_helpers/pdf_graphic_detector.py +332 -0
  106. xgen_doc2chunk/core/processor/pdf_helpers/pdf_image_processor.py +321 -0
  107. xgen_doc2chunk/core/processor/pdf_helpers/pdf_layout_block_detector.py +1244 -0
  108. xgen_doc2chunk/core/processor/pdf_helpers/pdf_line_analysis.py +420 -0
  109. xgen_doc2chunk/core/processor/pdf_helpers/pdf_metadata.py +101 -0
  110. xgen_doc2chunk/core/processor/pdf_helpers/pdf_page_analyzer.py +114 -0
  111. xgen_doc2chunk/core/processor/pdf_helpers/pdf_preprocessor.py +106 -0
  112. xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_detection.py +1346 -0
  113. xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_processor.py +897 -0
  114. xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_quality_analyzer.py +750 -0
  115. xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_validator.py +401 -0
  116. xgen_doc2chunk/core/processor/pdf_helpers/pdf_text_extractor.py +155 -0
  117. xgen_doc2chunk/core/processor/pdf_helpers/pdf_text_quality_analyzer.py +655 -0
  118. xgen_doc2chunk/core/processor/pdf_helpers/pdf_utils.py +183 -0
  119. xgen_doc2chunk/core/processor/pdf_helpers/pdf_vector_text_ocr.py +302 -0
  120. xgen_doc2chunk/core/processor/pdf_helpers/types.py +278 -0
  121. xgen_doc2chunk/core/processor/ppt_handler.py +288 -0
  122. xgen_doc2chunk/core/processor/ppt_helper/__init__.py +96 -0
  123. xgen_doc2chunk/core/processor/ppt_helper/ppt_bullet.py +332 -0
  124. xgen_doc2chunk/core/processor/ppt_helper/ppt_chart_extractor.py +182 -0
  125. xgen_doc2chunk/core/processor/ppt_helper/ppt_constants.py +119 -0
  126. xgen_doc2chunk/core/processor/ppt_helper/ppt_file_converter.py +55 -0
  127. xgen_doc2chunk/core/processor/ppt_helper/ppt_image_processor.py +196 -0
  128. xgen_doc2chunk/core/processor/ppt_helper/ppt_metadata.py +71 -0
  129. xgen_doc2chunk/core/processor/ppt_helper/ppt_preprocessor.py +77 -0
  130. xgen_doc2chunk/core/processor/ppt_helper/ppt_shape.py +189 -0
  131. xgen_doc2chunk/core/processor/ppt_helper/ppt_slide.py +69 -0
  132. xgen_doc2chunk/core/processor/ppt_helper/ppt_table.py +386 -0
  133. xgen_doc2chunk/core/processor/rtf_handler.py +290 -0
  134. xgen_doc2chunk/core/processor/rtf_helper/__init__.py +128 -0
  135. xgen_doc2chunk/core/processor/rtf_helper/rtf_constants.py +94 -0
  136. xgen_doc2chunk/core/processor/rtf_helper/rtf_content_extractor.py +211 -0
  137. xgen_doc2chunk/core/processor/rtf_helper/rtf_decoder.py +141 -0
  138. xgen_doc2chunk/core/processor/rtf_helper/rtf_file_converter.py +87 -0
  139. xgen_doc2chunk/core/processor/rtf_helper/rtf_metadata_extractor.py +179 -0
  140. xgen_doc2chunk/core/processor/rtf_helper/rtf_preprocessor.py +426 -0
  141. xgen_doc2chunk/core/processor/rtf_helper/rtf_region_finder.py +91 -0
  142. xgen_doc2chunk/core/processor/rtf_helper/rtf_table_extractor.py +482 -0
  143. xgen_doc2chunk/core/processor/rtf_helper/rtf_text_cleaner.py +389 -0
  144. xgen_doc2chunk/core/processor/text_handler.py +95 -0
  145. xgen_doc2chunk/core/processor/text_helper/__init__.py +17 -0
  146. xgen_doc2chunk/core/processor/text_helper/text_file_converter.py +28 -0
  147. xgen_doc2chunk/core/processor/text_helper/text_image_processor.py +75 -0
  148. xgen_doc2chunk/core/processor/text_helper/text_preprocessor.py +82 -0
  149. xgen_doc2chunk/ocr/__init__.py +67 -0
  150. xgen_doc2chunk/ocr/base.py +209 -0
  151. xgen_doc2chunk/ocr/ocr_engine/__init__.py +22 -0
  152. xgen_doc2chunk/ocr/ocr_engine/anthropic_ocr.py +91 -0
  153. xgen_doc2chunk/ocr/ocr_engine/bedrock_ocr.py +172 -0
  154. xgen_doc2chunk/ocr/ocr_engine/gemini_ocr.py +91 -0
  155. xgen_doc2chunk/ocr/ocr_engine/openai_ocr.py +100 -0
  156. xgen_doc2chunk/ocr/ocr_engine/vllm_ocr.py +116 -0
  157. xgen_doc2chunk/ocr/ocr_processor.py +387 -0
  158. {xgen_doc2chunk-0.1.0.dist-info → xgen_doc2chunk-0.1.1.dist-info}/METADATA +1 -1
  159. xgen_doc2chunk-0.1.1.dist-info/RECORD +161 -0
  160. xgen_doc2chunk-0.1.0.dist-info/RECORD +0 -4
  161. {xgen_doc2chunk-0.1.0.dist-info → xgen_doc2chunk-0.1.1.dist-info}/WHEEL +0 -0
  162. {xgen_doc2chunk-0.1.0.dist-info → xgen_doc2chunk-0.1.1.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,67 @@
+ # xgen_doc2chunk/ocr/__init__.py
+ # OCR module package initialization
+ """
+ OCR Processing Module
+
+ This module provides OCR functionality to extract text from images
+ using various LLM Vision models.
+
+ Usage Examples:
+ ```python
+ from xgen_doc2chunk.ocr.ocr_engine import OpenAIOCR, AnthropicOCR, GeminiOCR, VllmOCR
+
+ # OCR processing with OpenAI Vision model
+ ocr = OpenAIOCR(api_key="sk-...", model="gpt-4o")
+ result = ocr.convert_image_to_text("/path/to/image.png")
+
+ # OCR processing with Anthropic Claude Vision model
+ ocr = AnthropicOCR(api_key="sk-ant-...", model="claude-sonnet-4-20250514")
+ result = ocr.convert_image_to_text("/path/to/image.png")
+
+ # OCR processing with Google Gemini Vision model
+ ocr = GeminiOCR(api_key="...", model="gemini-2.0-flash")
+ result = ocr.convert_image_to_text("/path/to/image.png")
+
+ # OCR processing with vLLM-based Vision model
+ ocr = VllmOCR(base_url="http://localhost:8000/v1", model="Qwen/Qwen2-VL-7B-Instruct")
+ result = ocr.convert_image_to_text("/path/to/image.png")
+ ```
+
+ Classes:
+     - BaseOCR: Abstract base class for OCR processing
+     - OpenAIOCR: OpenAI Vision model based OCR (ocr_engine module)
+     - AnthropicOCR: Anthropic Claude Vision model based OCR (ocr_engine module)
+     - GeminiOCR: Google Gemini Vision model based OCR (ocr_engine module)
+     - VllmOCR: vLLM-based Vision model OCR (ocr_engine module)
+ """
+
+ from xgen_doc2chunk.ocr.base import BaseOCR
+ from xgen_doc2chunk.ocr.ocr_engine import OpenAIOCR, AnthropicOCR, GeminiOCR, VllmOCR
+ from xgen_doc2chunk.ocr.ocr_processor import (
+     IMAGE_TAG_PATTERN,
+     extract_image_tags,
+     load_image_from_path,
+     convert_image_to_text_with_llm,
+     process_text_with_ocr,
+     process_text_with_ocr_progress,
+     _b64_from_file,
+     _get_mime_type,
+ )
+
+ __all__ = [
+     # Base Class
+     "BaseOCR",
+     # OCR Engines
+     "OpenAIOCR",
+     "AnthropicOCR",
+     "GeminiOCR",
+     "VllmOCR",
+     # Functions
+     "IMAGE_TAG_PATTERN",
+     "extract_image_tags",
+     "load_image_from_path",
+     "convert_image_to_text_with_llm",
+     "process_text_with_ocr",
+     "process_text_with_ocr_progress",
+ ]
+
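The package exposes both a function-style API (`process_text_with_ocr` and friends) and the engine classes. A minimal end-to-end sketch of the class-based API, assuming the default `[Image:{path}]` tag format documented in `base.py` below; the API key, model name, and image path are placeholders:

```python
# Sketch of the class-based API exported above; key, model, and path
# are placeholders, not values from this package.
from xgen_doc2chunk.ocr import OpenAIOCR

ocr = OpenAIOCR(api_key="sk-...", model="gpt-4o")

# process_text() (inherited from BaseOCR, defined below) finds image
# tags -- by default the [Image:{path}] format -- and replaces each
# tag with the OCR result for that image.
document = "Quarterly report\n\n[Image:/tmp/revenue_table.png]\n"
print(ocr.process_text(document))
```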
@@ -0,0 +1,209 @@
+ # xgen_doc2chunk/ocr/base.py
+ # Abstract base class for OCR models
+ import logging
+ import re
+ from abc import ABC, abstractmethod
+ from typing import Any, Optional, Pattern
+
+ logger = logging.getLogger("ocr-base")
+
+
+ class BaseOCR(ABC):
+     """
+     Abstract base class for OCR processing.
+
+     All OCR model implementations must inherit from this class.
+     """
+
+     # Default prompt (can be overridden in subclasses)
+     DEFAULT_PROMPT = (
+         "Extract meaningful information from this image.\n\n"
+         "**If the image contains a TABLE:**\n"
+         "- Convert to HTML table format (<table>, <tr>, <td>, <th>)\n"
+         "- Use 'rowspan' and 'colspan' attributes for merged cells\n"
+         "- Preserve all cell content exactly as shown\n"
+         "- Example:\n"
+         " <table>\n"
+         " <tr><th colspan=\"2\">Header</th></tr>\n"
+         " <tr><td rowspan=\"2\">Merged</td><td>A</td></tr>\n"
+         " <tr><td>B</td></tr>\n"
+         " </table>\n\n"
+         "**If the image contains TEXT (non-table):**\n"
+         "- Extract all text exactly as shown\n"
+         "- Keep layout, hierarchy, and structure\n\n"
+         "**If the image contains DATA (charts, graphs, diagrams):**\n"
+         "- Extract the data and its meaning\n"
+         "- Describe trends, relationships, or key insights\n\n"
+         "**If the image is decorative or has no semantic meaning:**\n"
+         "- Simply state what it is in one short sentence\n"
+         "- Example: 'A decorative geometric shape' or 'Company logo'\n"
+         "- Do NOT over-analyze decorative elements\n\n"
+         "**Rules:**\n"
+         "- Output in Korean (except HTML tags)\n"
+         "- Tables MUST use HTML format with proper rowspan/colspan\n"
+         "- Be concise - only include what is semantically meaningful\n"
+         "- No filler words or unnecessary descriptions"
+     )
+
+     # Simple prompt (used for vllm, etc.)
+     SIMPLE_PROMPT = "Describe the contents of this image."
+
+     def __init__(self, llm_client: Any, prompt: Optional[str] = None):
+         """
+         Initialize OCR model.
+
+         Args:
+             llm_client: LangChain LLM client (must support Vision models)
+             prompt: Custom prompt (uses default prompt if None)
+         """
+         self.llm_client = llm_client
+         self.prompt = prompt if prompt is not None else self.DEFAULT_PROMPT
+         self._image_pattern: Optional[Pattern[str]] = None
+
+     @property
+     @abstractmethod
+     def provider(self) -> str:
+         """Return OCR provider name (e.g., 'openai', 'anthropic')"""
+         pass
+
+     @abstractmethod
+     def build_message_content(self, b64_image: str, mime_type: str) -> list:
+         """
+         Build message content for LLM.
+
+         Args:
+             b64_image: Base64 encoded image
+             mime_type: Image MIME type
+
+         Returns:
+             Content list for LangChain HumanMessage
+         """
+         pass
+
+     def convert_image_to_text(self, image_path: str) -> Optional[str]:
+         """
+         Convert image to text.
+
+         Args:
+             image_path: Local image file path
+
+         Returns:
+             Extracted text from image or None (on failure)
+         """
+         from xgen_doc2chunk.ocr.ocr_processor import (
+             _b64_from_file,
+             _get_mime_type,
+         )
+         from langchain_core.messages import HumanMessage
+
+         try:
+             b64_image = _b64_from_file(image_path)
+             mime_type = _get_mime_type(image_path)
+
+             content = self.build_message_content(b64_image, mime_type)
+             message = HumanMessage(content=content)
+
+             response = self.llm_client.invoke([message])
+             result = response.content.strip()
+
+             # Wrap result in [Figure:...] format
+             result = f"[Figure:{result}]"
+
+             logger.info(f"[{self.provider.upper()}] Image to text conversion completed")
+             return result
+
+         except Exception as e:
+             logger.error(f"[{self.provider.upper()}] Image to text conversion failed: {e}")
+             return f"[Image conversion error: {str(e)}]"
+
+     def set_image_pattern(self, pattern: Optional[Pattern[str]] = None) -> None:
+         """
+         Set custom image pattern for tag detection.
+
+         Args:
+             pattern: Compiled regex pattern with capture group for image path.
+                 If None, uses default [Image:{path}] pattern.
+
+         Examples:
+             >>> import re
+             >>> ocr.set_image_pattern(re.compile(r"<img src='([^']+)'/>"))
+         """
+         self._image_pattern = pattern
+
+     def set_image_pattern_from_string(self, pattern_string: str) -> None:
+         """
+         Set custom image pattern from pattern string.
+
+         Args:
+             pattern_string: Regex pattern string with capture group for image path.
+
+         Examples:
+             >>> ocr.set_image_pattern_from_string(r"<img src='([^']+)'/>")
+         """
+         self._image_pattern = re.compile(pattern_string)
+
+     def process_text(self, text: str, image_pattern: Optional[Pattern[str]] = None) -> str:
+         """
+         Detect image tags in text and replace with OCR results.
+
+         Args:
+             text: Text containing image tags
+             image_pattern: Custom regex pattern for image tags.
+                 If None, uses instance pattern or default [Image:{path}] pattern.
+
+         Returns:
+             Text with image tags replaced by OCR results
+         """
+         from xgen_doc2chunk.ocr.ocr_processor import (
+             extract_image_tags,
+             load_image_from_path,
+             DEFAULT_IMAGE_TAG_PATTERN,
+         )
+
+         if not self.llm_client:
+             logger.warning(f"[{self.provider.upper()}] Skipping OCR processing: no LLM client")
+             return text
+
+         # Determine which pattern to use: parameter > instance > default
+         pattern = image_pattern or self._image_pattern or DEFAULT_IMAGE_TAG_PATTERN
+
+         image_paths = extract_image_tags(text, pattern)
+
+         if not image_paths:
+             logger.debug(f"[{self.provider.upper()}] No image tags found in text")
+             return text
+
+         logger.info(f"[{self.provider.upper()}] Detected {len(image_paths)} image tags")
+
+         result_text = text
+
+         for img_path in image_paths:
+             # Build replacement pattern using the same pattern structure
+             # Escape the path and create a pattern that matches the full tag
+             escaped_path = re.escape(img_path)
+             # Get the pattern string and replace capture group with escaped path
+             pattern_str = pattern.pattern
+             # Replace the capture group (.*), ([^...]+), etc. with the escaped path
+             tag_pattern_str = re.sub(r'\([^)]+\)', escaped_path, pattern_str, count=1)
+             tag_pattern = re.compile(tag_pattern_str)
+
+             local_path = load_image_from_path(img_path)
+
+             if local_path is None:
+                 logger.warning(f"[{self.provider.upper()}] Image load failed, keeping original tag: {img_path}")
+                 continue
+
+             ocr_result = self.convert_image_to_text(local_path)
+
+             if ocr_result is None or ocr_result.startswith("[Image conversion error:"):
+                 logger.warning(f"[{self.provider.upper()}] Image conversion failed, keeping original tag: {img_path}")
+                 continue
+
+             result_text = tag_pattern.sub(ocr_result, result_text)
+             logger.info(f"[{self.provider.upper()}] Tag replacement completed: {img_path[:50]}...")
+
+         return result_text
+
+     def __repr__(self) -> str:
+         return f"{self.__class__.__name__}(provider='{self.provider}')"
+
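For a concrete sense of the contract `BaseOCR` imposes, here is a hedged sketch of a custom engine: only the `provider` property and `build_message_content()` are abstract, and everything else (prompting, tag scanning, replacement) is inherited. The content layout below simply mirrors the OpenAI-style `image_url` format used by the Gemini engine later in this diff; the class and provider name are hypothetical.

```python
# Sketch of the two members a concrete engine must supply; llm_client
# is any LangChain chat model with vision support.
from xgen_doc2chunk.ocr.base import BaseOCR

class MyVisionOCR(BaseOCR):
    @property
    def provider(self) -> str:
        return "my_vision"  # hypothetical provider name

    def build_message_content(self, b64_image: str, mime_type: str) -> list:
        # OpenAI-style layout, as used by GeminiOCR below; other
        # providers need their own content structure.
        return [
            {"type": "text", "text": self.prompt},
            {
                "type": "image_url",
                "image_url": {"url": f"data:{mime_type};base64,{b64_image}"},
            },
        ]

# ocr = MyVisionOCR(llm_client=some_vision_chat_model)
```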
@@ -0,0 +1,22 @@
+ # xgen_doc2chunk/ocr/ocr_engine/__init__.py
+ # OCR engine module initialization
+ """
+ OCR Engine Module
+
+ Provides OCR engine classes for each LLM provider.
+ """
+
+ from xgen_doc2chunk.ocr.ocr_engine.openai_ocr import OpenAIOCR
+ from xgen_doc2chunk.ocr.ocr_engine.anthropic_ocr import AnthropicOCR
+ from xgen_doc2chunk.ocr.ocr_engine.gemini_ocr import GeminiOCR
+ from xgen_doc2chunk.ocr.ocr_engine.vllm_ocr import VllmOCR
+ from xgen_doc2chunk.ocr.ocr_engine.bedrock_ocr import BedrockOCR
+
+ __all__ = [
+     "OpenAIOCR",
+     "AnthropicOCR",
+     "GeminiOCR",
+     "VllmOCR",
+     "BedrockOCR",
+ ]
+
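One detail worth noting: `BedrockOCR` is exported here but not re-exported from the top-level `xgen_doc2chunk.ocr` package (its `__init__.py` above imports only the other four engines), so in 0.1.1 it has to be imported from the submodule:

```python
# BedrockOCR is only reachable via the ocr_engine submodule in 0.1.1;
# the top-level xgen_doc2chunk.ocr package does not re-export it.
from xgen_doc2chunk.ocr.ocr_engine import BedrockOCR
```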
@@ -0,0 +1,91 @@
+ # xgen_doc2chunk/ocr/ocr_engine/anthropic_ocr.py
+ # OCR class using Anthropic Claude Vision model
+ import logging
+ from typing import Any, Optional
+
+ from xgen_doc2chunk.ocr.base import BaseOCR
+
+ logger = logging.getLogger("ocr-anthropic")
+
+ # Default model
+ DEFAULT_ANTHROPIC_MODEL = "claude-sonnet-4-20250514"
+
+
+ class AnthropicOCR(BaseOCR):
+     """
+     OCR processing class using Anthropic Claude Vision model.
+
+     Supported models: claude-3-opus, claude-3-sonnet, claude-3-haiku, claude-sonnet-4, etc.
+
+     Example:
+         ```python
+         from xgen_doc2chunk.ocr.ocr_engine import AnthropicOCR
+
+         # Method 1: Initialize with api_key and model
+         ocr = AnthropicOCR(api_key="sk-ant-...", model="claude-sonnet-4-20250514")
+
+         # Method 2: Use existing LLM client
+         from langchain_anthropic import ChatAnthropic
+         llm = ChatAnthropic(model="claude-sonnet-4-20250514", temperature=0, api_key="sk-ant-...")
+         ocr = AnthropicOCR(llm_client=llm)
+
+         # Single image conversion
+         result = ocr.convert_image_to_text("/path/to/image.png")
+         ```
+     """
+
+     def __init__(
+         self,
+         api_key: Optional[str] = None,
+         model: str = DEFAULT_ANTHROPIC_MODEL,
+         llm_client: Optional[Any] = None,
+         prompt: Optional[str] = None,
+         temperature: float = 0.0,
+         max_tokens: int = 4096,
+     ):
+         """
+         Initialize Anthropic OCR.
+
+         Args:
+             api_key: Anthropic API key (required if llm_client is not provided)
+             model: Model name to use (default: claude-sonnet-4-20250514)
+             llm_client: Existing LangChain Anthropic client (if provided, api_key and model are ignored)
+             prompt: Custom prompt (if None, default prompt is used)
+             temperature: Generation temperature (default: 0.0)
+             max_tokens: Maximum number of tokens (default: 4096)
+         """
+         if llm_client is None:
+             if api_key is None:
+                 raise ValueError("Either api_key or llm_client is required.")
+
+             from langchain_anthropic import ChatAnthropic
+
+             llm_client = ChatAnthropic(
+                 model=model,
+                 api_key=api_key,
+                 temperature=temperature,
+                 max_tokens=max_tokens,
+             )
+             logger.info(f"[Anthropic OCR] Client created: model={model}")
+
+         super().__init__(llm_client=llm_client, prompt=prompt)
+         self.model = model
+         logger.info("[Anthropic OCR] Initialization completed")
+
+     @property
+     def provider(self) -> str:
+         return "anthropic"
+
+     def build_message_content(self, b64_image: str, mime_type: str) -> list:
+         return [
+             {
+                 "type": "image",
+                 "source": {
+                     "type": "base64",
+                     "media_type": mime_type,
+                     "data": b64_image
+                 }
+             },
+             {"type": "text", "text": self.prompt}
+         ]
+
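A small usage sketch beyond the docstring: the `prompt` parameter replaces `BaseOCR.DEFAULT_PROMPT`, which matters when the default's HTML-table and Korean-output rules are not wanted. The key, prompt text, and image path below are placeholders.

```python
# Sketch: overriding the default prompt (which requests HTML tables
# and Korean output) with a plain transcription prompt.
from xgen_doc2chunk.ocr.ocr_engine import AnthropicOCR

ocr = AnthropicOCR(
    api_key="sk-ant-...",
    prompt="Transcribe all visible text verbatim. Output plain text only.",
)
result = ocr.convert_image_to_text("/path/to/scan.png")  # "[Figure:...]" on success
```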
@@ -0,0 +1,172 @@
+ # xgen_doc2chunk/ocr/ocr_engine/bedrock_ocr.py
+ # OCR class using AWS Bedrock Vision model
+ import logging
+ import os
+ from typing import Any, Optional
+
+ from xgen_doc2chunk.ocr.base import BaseOCR
+
+ logger = logging.getLogger("ocr-bedrock")
+
+ # Default model
+ DEFAULT_BEDROCK_MODEL = "anthropic.claude-3-5-sonnet-20241022-v2:0"
+
+
+ class BedrockOCR(BaseOCR):
+     """
+     OCR processing class using AWS Bedrock Vision model.
+
+     Supports Claude and other vision-capable models available on AWS Bedrock.
+
+     Example:
+         ```python
+         from xgen_doc2chunk.ocr.ocr_engine import BedrockOCR
+
+         # Method 1: Initialize with AWS credentials
+         ocr = BedrockOCR(
+             aws_access_key_id="AKIA...",
+             aws_secret_access_key="...",
+             aws_region="us-east-1",
+             model="anthropic.claude-3-5-sonnet-20241022-v2:0"
+         )
+
+         # Method 2: Use existing AWS credentials from environment
+         # (AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY, AWS_REGION)
+         ocr = BedrockOCR(model="anthropic.claude-3-5-sonnet-20241022-v2:0")
+
+         # Method 3: Use with session token (temporary credentials)
+         ocr = BedrockOCR(
+             aws_access_key_id="ASIA...",
+             aws_secret_access_key="...",
+             aws_session_token="...",
+             aws_region="ap-northeast-2"
+         )
+
+         # Method 4: Use existing LLM client
+         from langchain_aws import ChatBedrockConverse
+         llm = ChatBedrockConverse(model="anthropic.claude-3-5-sonnet-20241022-v2:0")
+         ocr = BedrockOCR(llm_client=llm)
+
+         # Single image conversion
+         result = ocr.convert_image_to_text("/path/to/image.png")
+         ```
+     """
+
+     def __init__(
+         self,
+         aws_access_key_id: Optional[str] = None,
+         aws_secret_access_key: Optional[str] = None,
+         aws_session_token: Optional[str] = None,
+         aws_region: Optional[str] = None,
+         endpoint_url: Optional[str] = None,
+         model: str = DEFAULT_BEDROCK_MODEL,
+         llm_client: Optional[Any] = None,
+         prompt: Optional[str] = None,
+         temperature: float = 0.0,
+         max_tokens: int = 4096,
+         connect_timeout: int = 60,
+         read_timeout: int = 120,
+         max_retries: int = 10,
+     ):
+         """
+         Initialize AWS Bedrock OCR.
+
+         Args:
+             aws_access_key_id: AWS access key ID (if not provided, uses environment variable)
+             aws_secret_access_key: AWS secret access key (if not provided, uses environment variable)
+             aws_session_token: AWS session token for temporary credentials (optional)
+             aws_region: AWS region (default: from environment or "ap-northeast-2")
+             endpoint_url: Custom endpoint URL (for VPC endpoints, etc.)
+             model: Model ID to use (default: anthropic.claude-3-5-sonnet-20241022-v2:0)
+             llm_client: Existing LangChain Bedrock client (if provided, other params are ignored)
+             prompt: Custom prompt (if None, default prompt is used)
+             temperature: Generation temperature (default: 0.0)
+             max_tokens: Maximum number of tokens (default: 4096)
+             connect_timeout: Connection timeout in seconds (default: 60)
+             read_timeout: Read timeout in seconds (default: 120)
+             max_retries: Maximum retry attempts (default: 10)
+         """
+         if llm_client is None:
+             from langchain_aws import ChatBedrockConverse
+             from botocore.config import Config as BotocoreConfig
+
+             # Set environment variables for boto3 auto-discovery
+             if aws_access_key_id:
+                 os.environ["AWS_ACCESS_KEY_ID"] = aws_access_key_id
+             if aws_secret_access_key:
+                 os.environ["AWS_SECRET_ACCESS_KEY"] = aws_secret_access_key
+             if aws_session_token:
+                 os.environ["AWS_SESSION_TOKEN"] = aws_session_token
+
+             # Determine region
+             if not aws_region:
+                 aws_region = os.environ.get(
+                     "AWS_REGION",
+                     os.environ.get("AWS_DEFAULT_REGION", "ap-northeast-2")
+                 )
+
+             logger.info(f"[Bedrock OCR] Using: model={model}, region={aws_region}")
+
+             # Configure botocore with retry settings
+             bedrock_config = BotocoreConfig(
+                 retries={
+                     "max_attempts": max_retries,
+                     "mode": "adaptive",
+                 },
+                 connect_timeout=connect_timeout,
+                 read_timeout=read_timeout,
+             )
+
+             # Build kwargs for ChatBedrockConverse
+             llm_kwargs = {
+                 "model": model,
+                 "temperature": temperature,
+                 "max_tokens": max_tokens,
+                 "disable_streaming": False,
+                 "config": bedrock_config,
+             }
+
+             if aws_region:
+                 llm_kwargs["region_name"] = aws_region
+             if aws_access_key_id:
+                 llm_kwargs["aws_access_key_id"] = aws_access_key_id
+             if aws_secret_access_key:
+                 llm_kwargs["aws_secret_access_key"] = aws_secret_access_key
+             if aws_session_token:
+                 llm_kwargs["aws_session_token"] = aws_session_token
+             if endpoint_url:
+                 llm_kwargs["endpoint_url"] = endpoint_url
+
+             llm_client = ChatBedrockConverse(**llm_kwargs)
+             logger.info(f"[Bedrock OCR] Client created: model={model}, region={aws_region}")
+
+         super().__init__(llm_client=llm_client, prompt=prompt)
+         self.model = model
+         self.aws_region = aws_region
+         logger.info("[Bedrock OCR] Initialization completed")
+
+     @property
+     def provider(self) -> str:
+         return "aws_bedrock"
+
+     def build_message_content(self, b64_image: str, mime_type: str) -> list:
+         """
+         Build message content for AWS Bedrock.
+
+         AWS Bedrock uses the same format as Anthropic Claude models.
+         """
+         return [
+             {
+                 "type": "image",
+                 "source": {
+                     "type": "base64",
+                     "media_type": mime_type,
+                     "data": b64_image
+                 }
+             },
+             {"type": "text", "text": self.prompt}
+         ]
+
+
+ __all__ = ["BedrockOCR", "DEFAULT_BEDROCK_MODEL"]
+
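One behavior worth flagging in the constructor above: explicitly passed credentials are also written into `os.environ` for boto3 auto-discovery, a process-wide side effect. A minimal sketch relying instead on ambient credentials (Method 2 in the docstring); the tag path is a placeholder.

```python
# Sketch: ambient-credentials path (env vars, ~/.aws, or an instance
# role). BedrockOCR() falls back to DEFAULT_BEDROCK_MODEL and, absent
# AWS_REGION/AWS_DEFAULT_REGION, to the "ap-northeast-2" region.
from xgen_doc2chunk.ocr.ocr_engine import BedrockOCR

ocr = BedrockOCR()
result = ocr.process_text("Report\n\n[Image:/tmp/chart.png]\n")
```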
@@ -0,0 +1,91 @@
+ # xgen_doc2chunk/ocr/ocr_engine/gemini_ocr.py
+ # OCR class using Google Gemini Vision model
+ import logging
+ from typing import Any, Optional
+
+ from xgen_doc2chunk.ocr.base import BaseOCR
+
+ logger = logging.getLogger("ocr-gemini")
+
+ # Default model
+ DEFAULT_GEMINI_MODEL = "gemini-2.0-flash"
+
+
+ class GeminiOCR(BaseOCR):
+     """
+     OCR processing class using Google Gemini Vision model.
+
+     Supported models: gemini-pro-vision, gemini-1.5-pro, gemini-2.0-flash, etc.
+
+     Example:
+         ```python
+         from xgen_doc2chunk.ocr.ocr_engine import GeminiOCR
+
+         # Method 1: Initialize with api_key and model
+         ocr = GeminiOCR(api_key="...", model="gemini-2.0-flash")
+
+         # Method 2: Use existing LLM client
+         from langchain_google_genai import ChatGoogleGenerativeAI
+         llm = ChatGoogleGenerativeAI(model="gemini-2.0-flash", google_api_key="...")
+         ocr = GeminiOCR(llm_client=llm)
+
+         # Single image conversion
+         result = ocr.convert_image_to_text("/path/to/image.png")
+         ```
+     """
+
+     def __init__(
+         self,
+         api_key: Optional[str] = None,
+         model: str = DEFAULT_GEMINI_MODEL,
+         llm_client: Optional[Any] = None,
+         prompt: Optional[str] = None,
+         temperature: float = 0.0,
+         max_tokens: Optional[int] = None,
+     ):
+         """
+         Initialize Gemini OCR.
+
+         Args:
+             api_key: Google API key (required if llm_client is not provided)
+             model: Model name to use (default: gemini-2.0-flash)
+             llm_client: Existing LangChain Gemini client (if provided, api_key and model are ignored)
+             prompt: Custom prompt (if None, default prompt is used)
+             temperature: Generation temperature (default: 0.0)
+             max_tokens: Maximum number of tokens (if None, model default is used)
+         """
+         if llm_client is None:
+             if api_key is None:
+                 raise ValueError("Either api_key or llm_client is required.")
+
+             from langchain_google_genai import ChatGoogleGenerativeAI
+
+             client_kwargs = {
+                 "model": model,
+                 "google_api_key": api_key,
+                 "temperature": temperature,
+             }
+
+             if max_tokens is not None:
+                 client_kwargs["max_output_tokens"] = max_tokens
+
+             llm_client = ChatGoogleGenerativeAI(**client_kwargs)
+             logger.info(f"[Gemini OCR] Client created: model={model}")
+
+         super().__init__(llm_client=llm_client, prompt=prompt)
+         self.model = model
+         logger.info("[Gemini OCR] Initialization completed")
+
+     @property
+     def provider(self) -> str:
+         return "gemini"
+
+     def build_message_content(self, b64_image: str, mime_type: str) -> list:
+         return [
+             {"type": "text", "text": self.prompt},
+             {
+                 "type": "image_url",
+                 "image_url": {"url": f"data:{mime_type};base64,{b64_image}"}
+             }
+         ]
+
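One parameter nuance from the constructor above: `max_tokens` is forwarded to `ChatGoogleGenerativeAI` as `max_output_tokens`, and only when it is not `None`. A short sketch with placeholder key and path:

```python
# Sketch: capping output length; max_tokens becomes the client's
# max_output_tokens only when set (see __init__ above).
from xgen_doc2chunk.ocr.ocr_engine import GeminiOCR

ocr = GeminiOCR(api_key="...", model="gemini-2.0-flash", max_tokens=2048)
result = ocr.convert_image_to_text("/path/to/diagram.png")
```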