PyPI - chatterer - Versions diffs - 0.1.13__py3-none-any.whl → 0.1.16__py3-none-any.whl - Mend

chatterer 0.1.13py3-none-any.whl → 0.1.16py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (22) hide show

chatterer/__init__.py +36 -5
chatterer/interactive.py +692 -0
chatterer/language_model.py +217 -261
chatterer/messages.py +13 -1
chatterer/tools/__init__.py +26 -15
chatterer/tools/{webpage_to_markdown/utils.py → caption_markdown_images.py} +158 -108
chatterer/tools/convert_pdf_to_markdown.py +302 -0
chatterer/tools/convert_to_text.py +45 -16
chatterer/tools/upstage_document_parser.py +481 -214
chatterer/tools/{webpage_to_markdown/playwright_bot.py → webpage_to_markdown.py} +197 -107
chatterer/tools/youtube.py +2 -1
chatterer/utils/__init__.py +1 -1
chatterer/utils/{image.py → base64_image.py} +56 -62
chatterer/utils/code_agent.py +137 -38
chatterer/utils/imghdr.py +148 -0
chatterer-0.1.16.dist-info/METADATA +392 -0
chatterer-0.1.16.dist-info/RECORD +33 -0
{chatterer-0.1.13.dist-info → chatterer-0.1.16.dist-info}/WHEEL +1 -1
chatterer/tools/webpage_to_markdown/__init__.py +0 -4
chatterer-0.1.13.dist-info/METADATA +0 -171
chatterer-0.1.13.dist-info/RECORD +0 -31
{chatterer-0.1.13.dist-info → chatterer-0.1.16.dist-info}/top_level.txt +0 -0

chatterer/tools/upstage_document_parser.py CHANGED Viewed

@@ -1,28 +1,38 @@
-"""Adopted from`langchain_upstage.document_parse"""
+# -*- coding: utf-8 -*-
+"""Adopted from `langchain_upstage.document_parse`"""
+from __future__ import annotations
+import base64
+import binascii
 import io
 import json
 import logging
 import os
-from typing import Iterator, Literal, Optional, cast
+import uuid
+from typing import TYPE_CHECKING, Dict, Iterator, Literal, Optional, TypedDict, cast
 import requests
 from langchain_core.document_loaders import BaseBlobParser, Blob
 from langchain_core.documents import Document
 from pydantic import BaseModel, Field
-from pypdf import PdfReader, PdfWriter
-from pypdf.errors import PdfReadError
 from ..common_types.io import BytesReadable
 from ..language_model import DEFAULT_IMAGE_DESCRIPTION_INSTRUCTION, Chatterer
-from ..utils.image import Base64Image
+from ..utils.base64_image import Base64Image
+from ..utils.imghdr import what
+if TYPE_CHECKING:
+    from pypdf import PdfReader
 logger = logging.getLogger("pypdf")
 logger.setLevel(logging.ERROR)
+parser_logger = logging.getLogger(__name__)  # Added logger for this module
 DOCUMENT_PARSE_BASE_URL = "https://api.upstage.ai/v1/document-ai/document-parse"
 DEFAULT_NUM_PAGES = 10
 DOCUMENT_PARSE_DEFAULT_MODEL = "document-parse"
+DEFAULT_IMAGE_DIR = "images"  # Added default image directory
 OutputFormat = Literal["text", "html", "markdown"]
 OCR = Literal["auto", "force"]
@@ -63,34 +73,124 @@ class Element(BaseModel):
     page: int
     def parse_text(self, parser: "UpstageDocumentParseParser") -> str:
+        """
+        Generates the text representation of the element.
+        If the element is a figure with base64 encoding and no chatterer is provided,
+        it generates a markdown link to a uniquely named image file and stores the
+        image data in the parser's image_data dictionary. Otherwise, it uses the
+        chatterer for description or returns the standard text/html/markdown.
+        """
         output_format: OutputFormat = parser.output_format
         chatterer: Optional[Chatterer] = parser.chatterer
         image_description_instruction: str = parser.image_description_instruction
         output: Optional[str] = None
         if output_format == "text":
             output = self.content.text
         elif output_format == "html":
             output = self.content.html
         elif output_format == "markdown":
             output = self.content.markdown
         if output is None:
-            raise ValueError(f"Invalid output format: {output_format}")
-        if chatterer is not None and self.category == "figure" and self.base64_encoding:
-            image = Base64Image.from_string(f"data:image/jpeg;base64,{self.base64_encoding}")
-            if image is None:
-                raise ValueError(f"Invalid base64 encoding for image: {self.base64_encoding}")
-            ocr_content = output.removeprefix("![image](/image/placeholder)\n")
-            image_description = chatterer.describe_image(
-                image.data_uri,
-                image_description_instruction
-                + f"\nHint: The OCR detected the following text:\n```\n{ocr_content}\n```",
-            )
-            output = f"\n\n<details>\n{image_description}\n</details>\n\n"
+            # Fallback or raise error if needed, here using text as fallback
+            output = self.content.text or ""
+            # Or raise ValueError(f"Invalid output format or missing content: {output_format}")
+        # --- Logic modification starts here ---
+        if self.category == "figure" and self.base64_encoding:
+            # Case 1: Chatterer is available - Generate description
+            if chatterer is not None:
+                # Check if base64 encoding is valid
+                try:
+                    # Decode base64 to check if valid
+                    img_type = what(self.base64_encoding)
+                    if not img_type:
+                        parser_logger.warning(
+                            f"Could not determine image type for figure element {self.id} (page {self.page})."
+                        )
+                        return output
+                    image = Base64Image.from_string(f"data:image/{img_type};base64,{self.base64_encoding}")
+                except (binascii.Error, ValueError) as e:
+                    parser_logger.warning(
+                        f"Could not decode base64 for figure element {self.id} (page {self.page}): {e}. Falling back to original output."
+                    )
+                    return output
+                if image is None:
+                    parser_logger.warning(
+                        f"Invalid base64 encoding format for image element {self.id}, cannot create Base64Image object."
+                    )
+                    # Fallback to original output (placeholder/OCR)
+                    return output
+                ocr_content = ""
+                if output_format == "markdown":
+                    ocr_content = output.removeprefix("![image](/image/placeholder)\n")
+                elif output_format == "text":
+                    ocr_content = output
+                image_description = chatterer.describe_image(
+                    image.data_uri,
+                    image_description_instruction
+                    + f"\nHint: The OCR detected the following text:\n```\n{ocr_content}\n```",
+                )
+                # Return description within details tag (as original)
+                output = f"\n\n<details>\n<summary>Image Description</summary>\n{image_description}\n</details>\n\n"
+            # Case 2: Chatterer is NOT available - Generate file path and store data
+            elif parser.image_dir is not None:
+                try:
+                    img_type = what(self.base64_encoding)
+                    if not img_type:
+                        parser_logger.warning(
+                            f"Could not determine image type for figure element {self.id} (page {self.page})."
+                        )
+                        return output
+                    image_bytes = base64.b64decode(self.base64_encoding)
+                    # Generate unique filename and path
+                    filename = f"{uuid.uuid4().hex}.{img_type}"  # Use default format
+                    # Create relative path for markdown link, ensuring forward slashes
+                    relative_path = os.path.join(parser.image_dir, filename).replace("\\", "/")
+                    # Store the image data for the user to save later
+                    parser.image_data[relative_path] = image_bytes
+                    # Extract OCR content if present
+                    ocr_content = ""
+                    if output_format == "markdown" and output.startswith("![image]"):
+                        ocr_content = output.split("\n", 1)[1] if "\n" in output else ""
+                    elif output_format == "text":
+                        ocr_content = output  # Assume text output is OCR for images
+                    # Update output to be the markdown link + OCR
+                    output = f"![image]({relative_path})\n{ocr_content}".strip()
+                except (binascii.Error, ValueError) as e:
+                    # Handle potential base64 decoding errors gracefully
+                    parser_logger.warning(
+                        f"Could not decode base64 for figure element {self.id} (page {self.page}): {e}. Falling back to original output."
+                    )
+                    # Keep the original 'output' value (placeholder or OCR)
+                    pass
         return output
+class Coordinates(TypedDict):
+    id: int
+    category: Category
+    coordinates: list[Coordinate]
+class PageCoordinates(Coordinates):
+    page: int
 def get_from_param_or_env(
     key: str,
     param: Optional[str] = None,
@@ -108,13 +208,30 @@ def get_from_param_or_env(
         raise ValueError(
             f"Did not find {key}, please add an environment variable"
             f" `{env_key}` which contains it, or pass"
-            f"  `{key}` as a named parameter."
+            f" `{key}` as a named parameter."
         )
 class UpstageDocumentParseParser(BaseBlobParser):
     """Upstage Document Parse Parser.
+    Parses documents using the Upstage Document AI API. Can optionally extract
+    images and return their data alongside the parsed documents.
+    If a `chatterer` is provided, it will be used to generate descriptions for
+    images (figures with base64 encoding).
+    If `chatterer` is NOT provided, for figure elements with `base64_encoding`,
+    this parser will:
+    1. Generate a unique relative file path (e.g., "images/uuid.jpeg").
+       The base directory can be configured with `image_dir`.
+    2. Replace the element's content with a markdown image link pointing to this path.
+    3. Store the actual image bytes in the `image_data` attribute dictionary,
+       mapping the generated relative path to the bytes.
+    The user is responsible for saving the files from the `image_data` dictionary
+    after processing the documents yielded by `lazy_parse`.
     To use, you should have the environment variable `UPSTAGE_API_KEY`
     set with your API key or pass it as a named parameter to the constructor.
@@ -122,8 +239,58 @@ class UpstageDocumentParseParser(BaseBlobParser):
         .. code-block:: python
             from langchain_upstage import UpstageDocumentParseParser
+            from langchain_core.documents import Blob
+            import os
+            # --- Setup ---
+            # Ensure UPSTAGE_API_KEY is set in environment or passed as api_key
+            # Create a dummy PDF or image file 'my_document.pdf' / 'my_image.png'
+            # --- Parsing without chatterer (extracts images) ---
+            parser = UpstageDocumentParseParser(
+                split="page",
+                output_format="markdown",
+                base64_encoding=["figure"], # Important: Request base64 for figures
+                image_dir="extracted_images" # Optional: specify image dir
+            )
+            blob = Blob.from_path("my_document.pdf") # Or your image file path
+            documents = []
+            for doc in parser.lazy_parse(blob):
+                print("--- Document ---")
+                print(f"Page: {get_metadata_from_document(doc).get('page')}")
+                print(doc.page_content)
+                documents.append(doc)
+            print("\\n--- Extracted Image Data ---")
+            if parser.image_data:
+                # User saves the images
+                for img_path, img_bytes in parser.image_data.items():
+                    # Create directories if they don't exist
+                    os.makedirs(os.path.dirname(img_path), exist_ok=True)
+                    try:
+                        with open(img_path, "wb") as f:
+                            f.write(img_bytes)
+                        print(f"Saved image: {img_path}")
+                    except IOError as e:
+                        print(f"Error saving image {img_path}: {e}")
+            else:
+                print("No images extracted.")
+            # --- Parsing with chatterer (generates descriptions) ---
+            # from langchain_upstage import UpstageChatter # Assuming this exists
+            # chatterer = UpstageChatter() # Initialize your chatterer
+            # parser_with_desc = UpstageDocumentParseParser(
+            #     split="page",
+            #     output_format="markdown",
+            #     base64_encoding=["figure"], # Still need base64 for description
+            #     chatterer=chatterer
+            # )
+            # documents_with_desc = list(parser_with_desc.lazy_parse(blob))
+            # print("\\n--- Documents with Descriptions ---")
+            # for doc in documents_with_desc:
+            #     print(f"Page: {get_metadata_from_document(doc).get('page')}")
+            #     print(doc.page_content)
-            loader = UpstageDocumentParseParser(split="page", output_format="text")
     """
     def __init__(
@@ -138,36 +305,34 @@ class UpstageDocumentParseParser(BaseBlobParser):
         base64_encoding: list[Category] = [],
         chatterer: Optional[Chatterer] = None,
         image_description_instruction: str = DEFAULT_IMAGE_DESCRIPTION_INSTRUCTION,
+        image_dir: Optional[str] = None,  # Added: Directory for image paths
     ) -> None:
         """
-        Initializes an instance of the Upstage class.
+        Initializes an instance of the UpstageDocumentParseParser.
         Args:
-            api_key (str, optional): The API key for accessing the Upstage API.
-                                     Defaults to None, in which case it will be
-                                     fetched from the environment variable
-                                     `UPSTAGE_API_KEY`.
-            base_url (str, optional): The base URL for accessing the Upstage API.
-            model (str): The model to be used for the document parse.
-                         Defaults to "document-parse".
-            split (SplitType, optional): The type of splitting to be applied.
-                                         Defaults to "none" (no splitting).
-            ocr (OCRMode, optional): Extract text from images in the document using OCR.
-                                     If the value is "force", OCR is used to extract
-                                     text from an image. If the value is "auto", text is
-                                     extracted from a PDF. (An error will occur if the
-                                     value is "auto" and the input is NOT in PDF format)
-            output_format (OutputFormat, optional): Format of the inference results.
-            coordinates (bool, optional): Whether to include the coordinates of the
-                                          OCR in the output.
-            base64_encoding (List[Category], optional): The category of the elements to
-                                                        be encoded in base64.
-            chatterer (Chatterer, optional): The Chatterer instance to use for image
-                                             description.
-            image_description_instruction (str, optional): The instruction to use for
-                                                           image description.
+            api_key (str, optional): Upstage API key. Defaults to env `UPSTAGE_API_KEY`.
+            base_url (str, optional): Base URL for the Upstage API.
+            model (str): Model for document parse. Defaults to "document-parse".
+            split (SplitType, optional): Splitting type ("none", "page", "element").
+                                          Defaults to "none".
+            ocr (OCR, optional): OCR mode ("auto", "force"). Defaults to "auto".
+            output_format (OutputFormat, optional): Output format ("text", "html", "markdown").
+                                                     Defaults to "markdown".
+            coordinates (bool, optional): Include coordinates in metadata. Defaults to True.
+            base64_encoding (List[Category], optional): Categories to return as base64.
+                                                       Crucial for image extraction/description.
+                                                       Set to `["figure"]` to process images.
+                                                       Defaults to [].
+            chatterer (Chatterer, optional): Chatterer instance for image description.
+                                             If None, images will be extracted to files.
+                                             Defaults to None.
+            image_description_instruction (str, optional): Instruction for image description.
+                                                            Defaults to a standard instruction.
+            image_dir (str, optional): The directory name to use when constructing
+                                        relative paths for extracted images.
+                                        Defaults to "images". This directory
+                                        is NOT created by the parser.
         """
         self.api_key = get_from_param_or_env(
             "UPSTAGE_API_KEY",
@@ -181,28 +346,30 @@ class UpstageDocumentParseParser(BaseBlobParser):
         self.ocr: OCR = ocr
         self.output_format: OutputFormat = output_format
         self.coordinates = coordinates
+        # Ensure 'figure' is requested if chatterer is None and user wants extraction implicitly
+        # However, it's better to require the user to explicitly set base64_encoding=["figure"]
         self.base64_encoding: list[Category] = base64_encoding
         self.chatterer = chatterer
         self.image_description_instruction = image_description_instruction
+        self.image_dir = image_dir  # Store output directory name
+        # Initialize dictionary to store image data (path -> bytes)
+        self.image_data: Dict[str, bytes] = {}
-    def _get_response(self, files: dict[str, BytesReadable]) -> list[Element]:
+    def _get_response(self, files: dict[str, tuple[str, BytesReadable]]) -> list[Element]:
         """
         Sends a POST request to the API endpoint with the provided files and
-        returns the response.
-        Args:
-            files (dict): A dictionary containing the files to be sent in the request.
-        Returns:
-            dict: The JSON response from the API.
-        Raises:
-            ValueError: If there is an error in the API call.
+        returns the parsed elements.
         """
+        response: Optional[requests.Response] = None
         try:
             headers = {
                 "Authorization": f"Bearer {self.api_key}",
             }
+            # Convert list to string representation required by the API
+            base64_encoding_str = str(self.base64_encoding) if self.base64_encoding else "[]"
+            output_formats_str = f"['{self.output_format}']"
             response = requests.post(
                 self.base_url,
                 headers=headers,
@@ -210,104 +377,152 @@ class UpstageDocumentParseParser(BaseBlobParser):
                 data={
                     "ocr": self.ocr,
                     "model": self.model,
-                    "output_formats": f"['{self.output_format}']",
-                    "coordinates": self.coordinates,
-                    "base64_encoding": f"{self.base64_encoding}",
+                    "output_formats": output_formats_str,
+                    "coordinates": str(self.coordinates).lower(),  # API might expect 'true'/'false'
+                    "base64_encoding": base64_encoding_str,
                 },
             )
-            response.raise_for_status()
-            result: object = response.json().get("elements", [])
+            response.raise_for_status()  # Raises HTTPError for bad responses (4xx or 5xx)
+            # Check content type before parsing JSON
+            content_type = response.headers.get("Content-Type", "")
+            if "application/json" not in content_type:
+                raise ValueError(f"Unexpected content type: {content_type}. Response body: {response.text}")
+            response_data = response.json()
+            result: object = response_data.get("elements", [])
             if not isinstance(result, list):
-                raise ValueError(f"Failed to parse JSON data: {result}")
-            result = cast(list[object], result)
-            return [Element.model_validate(element) for element in result]
+                raise ValueError(f"API response 'elements' is not a list: {result}")
+            result = cast(list[object], result)  # Cast to list of objects
+            # Validate each element using Pydantic
+            validated_elements: list[Element] = []
+            for i, element_data in enumerate(result):
+                try:
+                    validated_elements.append(Element.model_validate(element_data))
+                except Exception as e:  # Catch Pydantic validation errors etc.
+                    parser_logger.error(f"Failed to validate element {i}: {element_data}. Error: {e}")
+                    # Decide whether to skip the element or raise the error
+                    # continue # Option: skip problematic element
+                    raise ValueError(f"Failed to validate element {i}: {e}") from e  # Option: fail fast
+            return validated_elements
         except requests.HTTPError as e:
-            raise ValueError(f"HTTP error: {e.response.text}")
+            # Log more details from the response if available
+            error_message = f"HTTP error: {e.response.status_code} {e.response.reason}"
+            try:
+                error_details = e.response.json()  # Try to get JSON error details
+                error_message += f" - {error_details}"
+            except json.JSONDecodeError:
+                error_message += f" - Response body: {e.response.text}"
+            raise ValueError(error_message) from e
         except requests.RequestException as e:
-            # Handle any request-related exceptions
-            raise ValueError(f"Failed to send request: {e}")
+            raise ValueError(f"Failed to send request: {e}") from e
         except json.JSONDecodeError as e:
-            # Handle JSON decode errors
-            raise ValueError(f"Failed to decode JSON response: {e}")
-        except Exception as e:
-            # Handle any other exceptions
-            raise ValueError(f"An error occurred: {e}")
+            # Include part of the response text that failed to parse
+            raise ValueError(
+                f"Failed to decode JSON response: {e}. Response text starts with: {response.text[:200] if response else 'No response'}"
+            ) from e
+        except Exception as e:  # Catch-all for other unexpected errors
+            raise ValueError(f"An unexpected error occurred during API call: {e}") from e
     def _split_and_request(
         self, full_docs: PdfReader, start_page: int, num_pages: int = DEFAULT_NUM_PAGES
     ) -> list[Element]:
         """
-        Splits the full pdf document into partial pages and sends a request to the
-        server.
-        Args:
-            full_docs (PdfReader): The full document to be split and requested.
-            start_page (int): The starting page number for splitting the document.
-            num_pages (int, optional): The number of pages to split the document
-                                       into.
-                                       Defaults to DEFAULT_NUMBER_OF_PAGE.
-        Returns:
-            response: The response from the server.
+        Splits the full pdf document into partial pages and sends a request.
         """
+        # Need to import here if not globally available
+        try:
+            from pypdf import PdfWriter
+        except ImportError:
+            raise ImportError("pypdf is required for PDF splitting. Please install it with `pip install pypdf`.")
         merger = PdfWriter()
-        merger.append(
-            full_docs,
-            pages=(start_page, min(start_page + num_pages, full_docs.get_num_pages())),
-        )
+        total_pages = len(full_docs.pages)  # Use len(reader.pages) instead of get_num_pages()
+        end_page = min(start_page + num_pages, total_pages)
+        # Check if start_page is valid
+        if start_page >= total_pages:
+            parser_logger.warning(f"Start page {start_page} is out of bounds for document with {total_pages} pages.")
+            return []
+        # pypdf page indices are 0-based, slicing is exclusive of the end index
+        # PdfWriter.append() expects pages=(start, stop) where stop is exclusive.
+        # However, the example used pages=(start, end) which might behave differently depending on version?
+        # Let's stick to add_page for clarity if possible, or ensure append range is correct.
+        # merger.append(full_docs, pages=(start_page, end_page)) # This selects pages start_page..end_page-1
+        # Alternative using add_page loop (more explicit)
+        for i in range(start_page, end_page):
+            merger.add_page(full_docs.pages[i])
         with io.BytesIO() as buffer:
             merger.write(buffer)
             buffer.seek(0)
-            return self._get_response({"document": buffer})
+            # Need to provide a filename for the 'files' dict
+            return self._get_response({"document": ("partial_doc.pdf", buffer)})  # Provide a dummy filename
     def _element_document(self, element: Element, start_page: int = 0) -> Document:
-        """
-        Converts an elements into a Document object.
-        Args:
-            elements (Dict) : The elements to convert.
-            start_page (int): The starting page number for splitting the document.
-                              This number starts from zero.
+        """Converts an element into a Document object."""
+        # parse_text now handles image path generation and data storage if needed
+        page_content = element.parse_text(self)
+        metadata: dict[str, object] = element.model_dump(
+            exclude={"content", "base64_encoding"}, exclude_none=True
+        )  # Exclude raw content/base64
+        metadata["page"] = element.page + start_page  # Adjust page number
+        # Base64 encoding is not added to metadata if it was processed into image_data
+        # Coordinates are kept if requested
+        if not self.coordinates:
+            metadata.pop("coordinates", None)
-        Returns:
-            A list containing a single Document object.
-        """
-        metadata: dict[str, object] = element.model_dump(exclude_none=True)
-        metadata["page"] = element.page + start_page
         return Document(
-            page_content=element.parse_text(self),
+            page_content=page_content,
             metadata=metadata,
         )
     def _page_document(self, elements: list[Element], start_page: int = 0) -> list[Document]:
-        """
-        Combines elements with the same page number into a single Document object.
-        Args:
-            elements (List): A list of elements containing page numbers.
-            start_page (int): The starting page number for splitting the document.
-                              This number starts from zero.
-        Returns:
-            List[Document]: A list of Document objects, each representing a page
-                            with its content and metadata.
-        """
+        """Combines elements with the same page number into a single Document object."""
         documents: list[Document] = []
-        pages: list[int] = sorted(set(map(lambda x: x.page, elements)))
-        page_group: list[list[Element]] = [[element for element in elements if element.page == x] for x in pages]
-        for group in page_group:
+        if not elements:
+            return documents
+        # Group elements by page (relative to the current batch)
+        pages: list[int] = sorted(list(set(map(lambda x: x.page, elements))))
+        page_groups: Dict[int, list[Element]] = {page: [] for page in pages}
+        for element in elements:
+            page_groups[element.page].append(element)
+        for page_num, group in page_groups.items():
+            actual_page_num = page_num + start_page
+            page_content_parts: list[str] = []
+            page_coordinates: list[Coordinates] = []
+            # Base64 encodings are handled within parse_text now, not collected here
+            for element in sorted(group, key=lambda x: x.id):  # Process elements in order
+                page_content_parts.append(element.parse_text(self))
+                if self.coordinates and element.coordinates:
+                    page_coordinates.append({  # Store coordinates with element id/category for context
+                        "id": element.id,
+                        "category": element.category,
+                        "coordinates": element.coordinates,
+                    })
             metadata: dict[str, object] = {
-                "page": group[0].page + start_page,
+                "page": actual_page_num,
             }
-            if self.base64_encoding:
-                metadata["base64_encodings"] = [element.base64_encoding for element in group if element.base64_encoding]
-            if self.coordinates:
-                metadata["coordinates"] = [element.coordinates for element in group if element.coordinates]
+            if self.coordinates and page_coordinates:
+                metadata["element_coordinates"] = page_coordinates  # Changed key for clarity
+            # Combine content, typically with spaces or newlines
+            # Using newline might be better for readability if elements are paragraphs etc.
+            combined_page_content = "\n\n".join(part for part in page_content_parts if part)  # Join non-empty parts
             documents.append(
                 Document(
-                    page_content=" ".join(element.parse_text(self) for element in group),
+                    page_content=combined_page_content,
                     metadata=metadata,
                 )
             )
@@ -316,123 +531,175 @@ class UpstageDocumentParseParser(BaseBlobParser):
     def lazy_parse(self, blob: Blob, is_batch: bool = False) -> Iterator[Document]:
         """
-        Lazily parses a document and yields Document objects based on the specified
-        split type.
+        Lazily parses a document blob.
+        Yields Document objects based on the specified split type.
+        If images are extracted (chatterer=None, base64_encoding=["figure"]),
+        the image data will be available in `self.image_data` after iteration.
         Args:
-            blob (Blob): The input document blob to parse.
-            is_batch (bool, optional): Whether to parse the document in batches.
-                                       Defaults to False (single page parsing)
+            blob (Blob): The input document blob to parse. Requires `blob.path`.
+            is_batch (bool, optional): Currently affects PDF page batch size.
+                                       Defaults to False (process 1 page batch for PDF).
+                                       *Note: API might have limits regardless.*
         Yields:
-            Document: The parsed document object.
+            Document: The parsed document object(s).
         Raises:
-            ValueError: If an invalid split type is provided.
+            ValueError: If blob.path is not set, API error occurs, or invalid config.
+            ImportError: If pypdf is needed but not installed.
         """
+        # Clear image data at the start of parsing for this specific call
+        self.image_data = {}
-        if is_batch:
-            num_pages = DEFAULT_NUM_PAGES
-        else:
-            num_pages = 1
+        if not blob.path:
+            # Non-PDF files and direct API calls require reading the file,
+            # PDF splitting also requires the path.
+            raise ValueError("Blob path is required for UpstageDocumentParseParser.")
-        full_docs: Optional[PdfReader] = None
+        # Try importing pypdf here, only if needed
+        PdfReader = None
+        PdfReadError = None
         try:
-            full_docs = PdfReader(str(blob.path))
-            number_of_pages = full_docs.get_num_pages()
-        except PdfReadError:
-            number_of_pages = 1
-        except Exception as e:
-            raise ValueError(f"Failed to read PDF file: {e}")
-        if self.split == "none":
-            result = ""
-            base64_encodings: list[str] = []
-            coordinates: list[list[Coordinate]] = []
-            if full_docs is not None:
-                start_page = 0
-                num_pages = DEFAULT_NUM_PAGES
-                for _ in range(number_of_pages):
-                    if start_page >= number_of_pages:
-                        break
+            from pypdf import PdfReader as PyPdfReader
+            from pypdf.errors import PdfReadError as PyPdfReadError
-                    elements = self._split_and_request(full_docs, start_page, num_pages)
-                    for element in elements:
-                        result += element.parse_text(self)
-                        if self.base64_encoding and (base64_encoding := element.base64_encoding):
-                            base64_encodings.append(base64_encoding)
-                        if self.coordinates and (coords := element.coordinates):
-                            coordinates.append(coords)
+            PdfReader = PyPdfReader
+            PdfReadError = PyPdfReadError
+        except ImportError:
+            # We only absolutely need pypdf if the file is a PDF and split is not 'none' maybe?
+            # Let's attempt to read anyway, API might support non-PDFs directly.
+            # We'll check for PdfReader later if we determine it's a PDF.
+            pass
-                    start_page += num_pages
+        full_docs: Optional[PdfReader] = None
+        is_pdf = False
+        number_of_pages = 1  # Default for non-PDF or single-page docs
+        try:
+            # Check if it's a PDF by trying to open it
+            if PdfReader and PdfReadError:
+                try:
+                    # Use strict=False to be more lenient with potentially corrupted PDFs
+                    full_docs = PdfReader(str(blob.path), strict=False)
+                    number_of_pages = len(full_docs.pages)
+                    is_pdf = True
+                except (PdfReadError, FileNotFoundError, IsADirectoryError) as e:
+                    parser_logger.warning(f"Could not read '{blob.path}' as PDF: {e}. Assuming non-PDF format.")
+                except Exception as e:  # Catch other potential pypdf errors
+                    parser_logger.error(f"Unexpected error reading PDF '{blob.path}': {e}")
+                    raise ValueError(f"Failed to process PDF file: {e}") from e
             else:
-                if not blob.path:
-                    raise ValueError("Blob path is required for non-PDF files.")
+                parser_logger.info("pypdf not installed. Treating input as a single non-PDF document for the API.")
-                with open(blob.path, "rb") as f:
-                    elements = self._get_response({"document": f})
+        except Exception as e:
+            raise ValueError(f"Failed to access or identify file type for: {blob.path}. Error: {e}") from e
-                for element in elements:
-                    result += element.parse_text(self)
+        # --- Parsing Logic based on Split Type ---
-                    if self.base64_encoding and (base64_encoding := element.base64_encoding):
-                        base64_encodings.append(base64_encoding)
-                    if self.coordinates and (coords := element.coordinates):
-                        coordinates.append(coords)
-            metadata: dict[str, object] = {"total_pages": number_of_pages}
-            if self.coordinates:
-                metadata["coordinates"] = coordinates
-            if self.base64_encoding:
-                metadata["base64_encodings"] = base64_encodings
+        # Case 1: No Splitting (Combine all content)
+        if self.split == "none":
+            combined_result = ""
+            all_coordinates: list[PageCoordinates] = []
+            # Base64 handled by parse_text, data stored in self.image_data
+            if is_pdf and full_docs and PdfReader:  # Process PDF page by page or in batches
+                start_page = 0
+                # Use a reasonable batch size for 'none' split to avoid huge requests
+                batch_num_pages = DEFAULT_NUM_PAGES
+                while start_page < number_of_pages:
+                    elements = self._split_and_request(full_docs, start_page, batch_num_pages)
+                    for element in sorted(elements, key=lambda x: (x.page, x.id)):
+                        combined_result += element.parse_text(self) + "\n\n"  # Add separator
+                        if self.coordinates and element.coordinates:
+                            # Adjust page number for coordinates metadata
+                            coords_with_page: PageCoordinates = {
+                                "id": element.id,
+                                "category": element.category,
+                                "page": element.page + start_page,  # Actual page
+                                "coordinates": element.coordinates,
+                            }
+                            all_coordinates.append(coords_with_page)
+                    start_page += batch_num_pages
+            else:  # Process non-PDF file as a single unit
+                with open(blob.path, "rb") as f:
+                    # Provide a filename for the 'files' dict
+                    filename = os.path.basename(blob.path)
+                    elements = self._get_response({"document": (filename, f)})
+                for element in sorted(elements, key=lambda x: x.id):
+                    combined_result += element.parse_text(self) + "\n\n"
+                    if self.coordinates and element.coordinates:
+                        all_coordinates.append({
+                            "id": element.id,
+                            "category": element.category,
+                            "page": element.page,  # Page is relative to the single doc (usually 0 or 1)
+                            "coordinates": element.coordinates,
+                        })
+            metadata: dict[str, object] = {"source": blob.path, "total_pages": number_of_pages}
+            if self.coordinates and all_coordinates:
+                metadata["element_coordinates"] = all_coordinates
+            # self.image_data is populated, no need to add base64 to metadata
             yield Document(
-                page_content=result,
+                page_content=combined_result.strip(),
                 metadata=metadata,
             )
+        # Case 2: Split by Element
         elif self.split == "element":
-            if full_docs is not None:
+            if is_pdf and full_docs and PdfReader:
                 start_page = 0
-                for _ in range(number_of_pages):
-                    if start_page >= number_of_pages:
-                        break
-                    elements = self._split_and_request(full_docs, start_page, num_pages)
-                    for element in elements:
-                        yield self._element_document(element, start_page)
-                    start_page += num_pages
-            else:
-                if not blob.path:
-                    raise ValueError("Blob path is required for non-PDF files.")
+                batch_num_pages = DEFAULT_NUM_PAGES if is_batch else 1  # Use smaller batches for element split?
+                while start_page < number_of_pages:
+                    elements = self._split_and_request(full_docs, start_page, batch_num_pages)
+                    for element in sorted(elements, key=lambda x: (x.page, x.id)):
+                        # _element_document handles metadata and adjusts page number
+                        doc = self._element_document(element, start_page)
+                        _get_metadata_from_document(doc)["source"] = blob.path  # Add source
+                        yield doc
+                    start_page += batch_num_pages
+            else:  # Non-PDF
                 with open(blob.path, "rb") as f:
-                    elements = self._get_response({"document": f})
-                for element in elements:
-                    yield self._element_document(element)
+                    filename = os.path.basename(blob.path)
+                    elements = self._get_response({"document": (filename, f)})
+                for element in sorted(elements, key=lambda x: x.id):
+                    doc = self._element_document(element, 0)  # Start page is 0 for single doc
+                    _get_metadata_from_document(doc)["source"] = blob.path  # Add source
+                    yield doc
+        # Case 3: Split by Page
         elif self.split == "page":
-            if full_docs is not None:
+            if is_pdf and full_docs and PdfReader:
                 start_page = 0
-                for _ in range(number_of_pages):
-                    if start_page >= number_of_pages:
-                        break
-                    elements = self._split_and_request(full_docs, start_page, num_pages)
-                    yield from self._page_document(elements, start_page)
-                    start_page += num_pages
-            else:
-                if not blob.path:
-                    raise ValueError("Blob path is required for non-PDF files.")
+                batch_num_pages = DEFAULT_NUM_PAGES if is_batch else 1  # Process page-by-page if not is_batch
+                while start_page < number_of_pages:
+                    elements = self._split_and_request(full_docs, start_page, batch_num_pages)
+                    # _page_document groups elements by page and creates Documents
+                    page_docs = self._page_document(elements, start_page)
+                    for doc in page_docs:
+                        _get_metadata_from_document(doc)["source"] = blob.path  # Add source
+                        yield doc
+                    start_page += batch_num_pages
+            else:  # Non-PDF (treat as single page)
                 with open(blob.path, "rb") as f:
-                    elements = self._get_response({"document": f})
-                yield from self._page_document(elements)
+                    filename = os.path.basename(blob.path)
+                    elements = self._get_response({"document": (filename, f)})
+                page_docs = self._page_document(elements, 0)  # Process elements as page 0
+                for doc in page_docs:
+                    _get_metadata_from_document(doc)["source"] = blob.path  # Add source
+                    yield doc
         else:
             raise ValueError(f"Invalid split type: {self.split}")
+def _get_metadata_from_document(doc: Document) -> dict[object, object]:
+    """
+    Helper function to extract metadata from a Document object.
+    This is a placeholder and should be adjusted based on actual metadata structure.
+    """
+    metadata: dict[object, object] = doc.metadata  # pyright: ignore[reportUnknownMemberType]
+    return metadata

chatterer 0.1.13__py3-none-any.whl → 0.1.16__py3-none-any.whl

chatterer 0.1.13py3-none-any.whl → 0.1.16py3-none-any.whl