chatterer 0.1.13__py3-none-any.whl → 0.1.16__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,28 +1,38 @@
1
- """Adopted from`langchain_upstage.document_parse"""
1
+ # -*- coding: utf-8 -*-
2
+ """Adopted from `langchain_upstage.document_parse`"""
2
3
 
4
+ from __future__ import annotations
5
+
6
+ import base64
7
+ import binascii
3
8
  import io
4
9
  import json
5
10
  import logging
6
11
  import os
7
- from typing import Iterator, Literal, Optional, cast
12
+ import uuid
13
+ from typing import TYPE_CHECKING, Dict, Iterator, Literal, Optional, TypedDict, cast
8
14
 
9
15
  import requests
10
16
  from langchain_core.document_loaders import BaseBlobParser, Blob
11
17
  from langchain_core.documents import Document
12
18
  from pydantic import BaseModel, Field
13
- from pypdf import PdfReader, PdfWriter
14
- from pypdf.errors import PdfReadError
15
19
 
16
20
  from ..common_types.io import BytesReadable
17
21
  from ..language_model import DEFAULT_IMAGE_DESCRIPTION_INSTRUCTION, Chatterer
18
- from ..utils.image import Base64Image
22
+ from ..utils.base64_image import Base64Image
23
+ from ..utils.imghdr import what
24
+
25
+ if TYPE_CHECKING:
26
+ from pypdf import PdfReader
19
27
 
20
28
  logger = logging.getLogger("pypdf")
21
29
  logger.setLevel(logging.ERROR)
30
+ parser_logger = logging.getLogger(__name__) # Added logger for this module
22
31
 
23
32
  DOCUMENT_PARSE_BASE_URL = "https://api.upstage.ai/v1/document-ai/document-parse"
24
33
  DEFAULT_NUM_PAGES = 10
25
34
  DOCUMENT_PARSE_DEFAULT_MODEL = "document-parse"
35
+ DEFAULT_IMAGE_DIR = "images" # Added default image directory
26
36
 
27
37
  OutputFormat = Literal["text", "html", "markdown"]
28
38
  OCR = Literal["auto", "force"]
@@ -63,34 +73,124 @@ class Element(BaseModel):
63
73
  page: int
64
74
 
65
75
  def parse_text(self, parser: "UpstageDocumentParseParser") -> str:
76
+ """
77
+ Generates the text representation of the element.
78
+
79
+ If the element is a figure with base64 encoding and no chatterer is provided,
80
+ it generates a markdown link to a uniquely named image file and stores the
81
+ image data in the parser's image_data dictionary. Otherwise, it uses the
82
+ chatterer for description or returns the standard text/html/markdown.
83
+ """
66
84
  output_format: OutputFormat = parser.output_format
67
85
  chatterer: Optional[Chatterer] = parser.chatterer
68
86
  image_description_instruction: str = parser.image_description_instruction
69
87
  output: Optional[str] = None
88
+
70
89
  if output_format == "text":
71
90
  output = self.content.text
72
91
  elif output_format == "html":
73
92
  output = self.content.html
74
93
  elif output_format == "markdown":
75
94
  output = self.content.markdown
95
+
76
96
  if output is None:
77
- raise ValueError(f"Invalid output format: {output_format}")
78
-
79
- if chatterer is not None and self.category == "figure" and self.base64_encoding:
80
- image = Base64Image.from_string(f"data:image/jpeg;base64,{self.base64_encoding}")
81
- if image is None:
82
- raise ValueError(f"Invalid base64 encoding for image: {self.base64_encoding}")
83
- ocr_content = output.removeprefix("![image](/image/placeholder)\n")
84
- image_description = chatterer.describe_image(
85
- image.data_uri,
86
- image_description_instruction
87
- + f"\nHint: The OCR detected the following text:\n```\n{ocr_content}\n```",
88
- )
89
- output = f"\n\n<details>\n{image_description}\n</details>\n\n"
97
+ # Fallback or raise error if needed, here using text as fallback
98
+ output = self.content.text or ""
99
+ # Or raise ValueError(f"Invalid output format or missing content: {output_format}")
100
+
101
+ # --- Logic modification starts here ---
102
+ if self.category == "figure" and self.base64_encoding:
103
+ # Case 1: Chatterer is available - Generate description
104
+ if chatterer is not None:
105
+ # Check if base64 encoding is valid
106
+ try:
107
+ # Decode base64 to check if valid
108
+ img_type = what(self.base64_encoding)
109
+ if not img_type:
110
+ parser_logger.warning(
111
+ f"Could not determine image type for figure element {self.id} (page {self.page})."
112
+ )
113
+ return output
114
+ image = Base64Image.from_string(f"data:image/{img_type};base64,{self.base64_encoding}")
115
+
116
+ except (binascii.Error, ValueError) as e:
117
+ parser_logger.warning(
118
+ f"Could not decode base64 for figure element {self.id} (page {self.page}): {e}. Falling back to original output."
119
+ )
120
+ return output
121
+
122
+ if image is None:
123
+ parser_logger.warning(
124
+ f"Invalid base64 encoding format for image element {self.id}, cannot create Base64Image object."
125
+ )
126
+ # Fallback to original output (placeholder/OCR)
127
+ return output
128
+
129
+ ocr_content = ""
130
+ if output_format == "markdown":
131
+ ocr_content = output.removeprefix("![image](/image/placeholder)\n")
132
+ elif output_format == "text":
133
+ ocr_content = output
134
+
135
+ image_description = chatterer.describe_image(
136
+ image.data_uri,
137
+ image_description_instruction
138
+ + f"\nHint: The OCR detected the following text:\n```\n{ocr_content}\n```",
139
+ )
140
+ # Return description within details tag (as original)
141
+ output = f"\n\n<details>\n<summary>Image Description</summary>\n{image_description}\n</details>\n\n"
142
+
143
+ # Case 2: Chatterer is NOT available - Generate file path and store data
144
+ elif parser.image_dir is not None:
145
+ try:
146
+ img_type = what(self.base64_encoding)
147
+ if not img_type:
148
+ parser_logger.warning(
149
+ f"Could not determine image type for figure element {self.id} (page {self.page})."
150
+ )
151
+ return output
152
+
153
+ image_bytes = base64.b64decode(self.base64_encoding)
154
+
155
+ # Generate unique filename and path
156
+ filename = f"{uuid.uuid4().hex}.{img_type}" # Use default format
157
+ # Create relative path for markdown link, ensuring forward slashes
158
+ relative_path = os.path.join(parser.image_dir, filename).replace("\\", "/")
159
+
160
+ # Store the image data for the user to save later
161
+ parser.image_data[relative_path] = image_bytes
162
+
163
+ # Extract OCR content if present
164
+ ocr_content = ""
165
+ if output_format == "markdown" and output.startswith("![image]"):
166
+ ocr_content = output.split("\n", 1)[1] if "\n" in output else ""
167
+ elif output_format == "text":
168
+ ocr_content = output # Assume text output is OCR for images
169
+
170
+ # Update output to be the markdown link + OCR
171
+ output = f"![image]({relative_path})\n{ocr_content}".strip()
172
+
173
+ except (binascii.Error, ValueError) as e:
174
+ # Handle potential base64 decoding errors gracefully
175
+ parser_logger.warning(
176
+ f"Could not decode base64 for figure element {self.id} (page {self.page}): {e}. Falling back to original output."
177
+ )
178
+ # Keep the original 'output' value (placeholder or OCR)
179
+ pass
90
180
 
91
181
  return output
92
182
 
93
183
 
184
+ class Coordinates(TypedDict):
185
+ id: int
186
+ category: Category
187
+ coordinates: list[Coordinate]
188
+
189
+
190
+ class PageCoordinates(Coordinates):
191
+ page: int
192
+
193
+
94
194
  def get_from_param_or_env(
95
195
  key: str,
96
196
  param: Optional[str] = None,
@@ -108,13 +208,30 @@ def get_from_param_or_env(
108
208
  raise ValueError(
109
209
  f"Did not find {key}, please add an environment variable"
110
210
  f" `{env_key}` which contains it, or pass"
111
- f" `{key}` as a named parameter."
211
+ f" `{key}` as a named parameter."
112
212
  )
113
213
 
114
214
 
115
215
  class UpstageDocumentParseParser(BaseBlobParser):
116
216
  """Upstage Document Parse Parser.
117
217
 
218
+ Parses documents using the Upstage Document AI API. Can optionally extract
219
+ images and return their data alongside the parsed documents.
220
+
221
+ If a `chatterer` is provided, it will be used to generate descriptions for
222
+ images (figures with base64 encoding).
223
+
224
+ If `chatterer` is NOT provided, for figure elements with `base64_encoding`,
225
+ this parser will:
226
+ 1. Generate a unique relative file path (e.g., "images/uuid.jpeg").
227
+ The base directory can be configured with `image_dir`.
228
+ 2. Replace the element's content with a markdown image link pointing to this path.
229
+ 3. Store the actual image bytes in the `image_data` attribute dictionary,
230
+ mapping the generated relative path to the bytes.
231
+
232
+ The user is responsible for saving the files from the `image_data` dictionary
233
+ after processing the documents yielded by `lazy_parse`.
234
+
118
235
  To use, you should have the environment variable `UPSTAGE_API_KEY`
119
236
  set with your API key or pass it as a named parameter to the constructor.
120
237
 
@@ -122,8 +239,58 @@ class UpstageDocumentParseParser(BaseBlobParser):
122
239
  .. code-block:: python
123
240
 
124
241
  from langchain_upstage import UpstageDocumentParseParser
242
+ from langchain_core.documents import Blob
243
+ import os
244
+
245
+ # --- Setup ---
246
+ # Ensure UPSTAGE_API_KEY is set in environment or passed as api_key
247
+ # Create a dummy PDF or image file 'my_document.pdf' / 'my_image.png'
248
+
249
+ # --- Parsing without chatterer (extracts images) ---
250
+ parser = UpstageDocumentParseParser(
251
+ split="page",
252
+ output_format="markdown",
253
+ base64_encoding=["figure"], # Important: Request base64 for figures
254
+ image_dir="extracted_images" # Optional: specify image dir
255
+ )
256
+ blob = Blob.from_path("my_document.pdf") # Or your image file path
257
+ documents = []
258
+ for doc in parser.lazy_parse(blob):
259
+ print("--- Document ---")
260
+ print(f"Page: {get_metadata_from_document(doc).get('page')}")
261
+ print(doc.page_content)
262
+ documents.append(doc)
263
+
264
+ print("\\n--- Extracted Image Data ---")
265
+ if parser.image_data:
266
+ # User saves the images
267
+ for img_path, img_bytes in parser.image_data.items():
268
+ # Create directories if they don't exist
269
+ os.makedirs(os.path.dirname(img_path), exist_ok=True)
270
+ try:
271
+ with open(img_path, "wb") as f:
272
+ f.write(img_bytes)
273
+ print(f"Saved image: {img_path}")
274
+ except IOError as e:
275
+ print(f"Error saving image {img_path}: {e}")
276
+ else:
277
+ print("No images extracted.")
278
+
279
+ # --- Parsing with chatterer (generates descriptions) ---
280
+ # from langchain_upstage import UpstageChatter # Assuming this exists
281
+ # chatterer = UpstageChatter() # Initialize your chatterer
282
+ # parser_with_desc = UpstageDocumentParseParser(
283
+ # split="page",
284
+ # output_format="markdown",
285
+ # base64_encoding=["figure"], # Still need base64 for description
286
+ # chatterer=chatterer
287
+ # )
288
+ # documents_with_desc = list(parser_with_desc.lazy_parse(blob))
289
+ # print("\\n--- Documents with Descriptions ---")
290
+ # for doc in documents_with_desc:
291
+ # print(f"Page: {get_metadata_from_document(doc).get('page')}")
292
+ # print(doc.page_content)
125
293
 
126
- loader = UpstageDocumentParseParser(split="page", output_format="text")
127
294
  """
128
295
 
129
296
  def __init__(
@@ -138,36 +305,34 @@ class UpstageDocumentParseParser(BaseBlobParser):
138
305
  base64_encoding: list[Category] = [],
139
306
  chatterer: Optional[Chatterer] = None,
140
307
  image_description_instruction: str = DEFAULT_IMAGE_DESCRIPTION_INSTRUCTION,
308
+ image_dir: Optional[str] = None, # Added: Directory for image paths
141
309
  ) -> None:
142
310
  """
143
- Initializes an instance of the Upstage class.
311
+ Initializes an instance of the UpstageDocumentParseParser.
144
312
 
145
313
  Args:
146
- api_key (str, optional): The API key for accessing the Upstage API.
147
- Defaults to None, in which case it will be
148
- fetched from the environment variable
149
- `UPSTAGE_API_KEY`.
150
- base_url (str, optional): The base URL for accessing the Upstage API.
151
- model (str): The model to be used for the document parse.
152
- Defaults to "document-parse".
153
- split (SplitType, optional): The type of splitting to be applied.
154
- Defaults to "none" (no splitting).
155
- ocr (OCRMode, optional): Extract text from images in the document using OCR.
156
- If the value is "force", OCR is used to extract
157
- text from an image. If the value is "auto", text is
158
- extracted from a PDF. (An error will occur if the
159
- value is "auto" and the input is NOT in PDF format)
160
- output_format (OutputFormat, optional): Format of the inference results.
161
- coordinates (bool, optional): Whether to include the coordinates of the
162
- OCR in the output.
163
- base64_encoding (List[Category], optional): The category of the elements to
164
- be encoded in base64.
165
- chatterer (Chatterer, optional): The Chatterer instance to use for image
166
- description.
167
- image_description_instruction (str, optional): The instruction to use for
168
- image description.
169
-
170
-
314
+ api_key (str, optional): Upstage API key. Defaults to env `UPSTAGE_API_KEY`.
315
+ base_url (str, optional): Base URL for the Upstage API.
316
+ model (str): Model for document parse. Defaults to "document-parse".
317
+ split (SplitType, optional): Splitting type ("none", "page", "element").
318
+ Defaults to "none".
319
+ ocr (OCR, optional): OCR mode ("auto", "force"). Defaults to "auto".
320
+ output_format (OutputFormat, optional): Output format ("text", "html", "markdown").
321
+ Defaults to "markdown".
322
+ coordinates (bool, optional): Include coordinates in metadata. Defaults to True.
323
+ base64_encoding (List[Category], optional): Categories to return as base64.
324
+ Crucial for image extraction/description.
325
+ Set to `["figure"]` to process images.
326
+ Defaults to [].
327
+ chatterer (Chatterer, optional): Chatterer instance for image description.
328
+ If None, images will be extracted to files.
329
+ Defaults to None.
330
+ image_description_instruction (str, optional): Instruction for image description.
331
+ Defaults to a standard instruction.
332
+ image_dir (str, optional): The directory name to use when constructing
333
+ relative paths for extracted images.
334
+ Defaults to "images". This directory
335
+ is NOT created by the parser.
171
336
  """
172
337
  self.api_key = get_from_param_or_env(
173
338
  "UPSTAGE_API_KEY",
@@ -181,28 +346,30 @@ class UpstageDocumentParseParser(BaseBlobParser):
181
346
  self.ocr: OCR = ocr
182
347
  self.output_format: OutputFormat = output_format
183
348
  self.coordinates = coordinates
349
+ # Ensure 'figure' is requested if chatterer is None and user wants extraction implicitly
350
+ # However, it's better to require the user to explicitly set base64_encoding=["figure"]
184
351
  self.base64_encoding: list[Category] = base64_encoding
185
352
  self.chatterer = chatterer
186
353
  self.image_description_instruction = image_description_instruction
354
+ self.image_dir = image_dir # Store output directory name
355
+
356
+ # Initialize dictionary to store image data (path -> bytes)
357
+ self.image_data: Dict[str, bytes] = {}
187
358
 
188
- def _get_response(self, files: dict[str, BytesReadable]) -> list[Element]:
359
+ def _get_response(self, files: dict[str, tuple[str, BytesReadable]]) -> list[Element]:
189
360
  """
190
361
  Sends a POST request to the API endpoint with the provided files and
191
- returns the response.
192
-
193
- Args:
194
- files (dict): A dictionary containing the files to be sent in the request.
195
-
196
- Returns:
197
- dict: The JSON response from the API.
198
-
199
- Raises:
200
- ValueError: If there is an error in the API call.
362
+ returns the parsed elements.
201
363
  """
364
+ response: Optional[requests.Response] = None
202
365
  try:
203
366
  headers = {
204
367
  "Authorization": f"Bearer {self.api_key}",
205
368
  }
369
+ # Convert list to string representation required by the API
370
+ base64_encoding_str = str(self.base64_encoding) if self.base64_encoding else "[]"
371
+ output_formats_str = f"['{self.output_format}']"
372
+
206
373
  response = requests.post(
207
374
  self.base_url,
208
375
  headers=headers,
@@ -210,104 +377,152 @@ class UpstageDocumentParseParser(BaseBlobParser):
210
377
  data={
211
378
  "ocr": self.ocr,
212
379
  "model": self.model,
213
- "output_formats": f"['{self.output_format}']",
214
- "coordinates": self.coordinates,
215
- "base64_encoding": f"{self.base64_encoding}",
380
+ "output_formats": output_formats_str,
381
+ "coordinates": str(self.coordinates).lower(), # API might expect 'true'/'false'
382
+ "base64_encoding": base64_encoding_str,
216
383
  },
217
384
  )
218
- response.raise_for_status()
219
- result: object = response.json().get("elements", [])
385
+ response.raise_for_status() # Raises HTTPError for bad responses (4xx or 5xx)
386
+
387
+ # Check content type before parsing JSON
388
+ content_type = response.headers.get("Content-Type", "")
389
+ if "application/json" not in content_type:
390
+ raise ValueError(f"Unexpected content type: {content_type}. Response body: {response.text}")
391
+
392
+ response_data = response.json()
393
+ result: object = response_data.get("elements", [])
394
+
220
395
  if not isinstance(result, list):
221
- raise ValueError(f"Failed to parse JSON data: {result}")
222
- result = cast(list[object], result)
223
- return [Element.model_validate(element) for element in result]
396
+ raise ValueError(f"API response 'elements' is not a list: {result}")
397
+ result = cast(list[object], result) # Cast to list of objects
398
+
399
+ # Validate each element using Pydantic
400
+ validated_elements: list[Element] = []
401
+ for i, element_data in enumerate(result):
402
+ try:
403
+ validated_elements.append(Element.model_validate(element_data))
404
+ except Exception as e: # Catch Pydantic validation errors etc.
405
+ parser_logger.error(f"Failed to validate element {i}: {element_data}. Error: {e}")
406
+ # Decide whether to skip the element or raise the error
407
+ # continue # Option: skip problematic element
408
+ raise ValueError(f"Failed to validate element {i}: {e}") from e # Option: fail fast
409
+
410
+ return validated_elements
411
+
224
412
  except requests.HTTPError as e:
225
- raise ValueError(f"HTTP error: {e.response.text}")
413
+ # Log more details from the response if available
414
+ error_message = f"HTTP error: {e.response.status_code} {e.response.reason}"
415
+ try:
416
+ error_details = e.response.json() # Try to get JSON error details
417
+ error_message += f" - {error_details}"
418
+ except json.JSONDecodeError:
419
+ error_message += f" - Response body: {e.response.text}"
420
+ raise ValueError(error_message) from e
226
421
  except requests.RequestException as e:
227
- # Handle any request-related exceptions
228
- raise ValueError(f"Failed to send request: {e}")
422
+ raise ValueError(f"Failed to send request: {e}") from e
229
423
  except json.JSONDecodeError as e:
230
- # Handle JSON decode errors
231
- raise ValueError(f"Failed to decode JSON response: {e}")
232
- except Exception as e:
233
- # Handle any other exceptions
234
- raise ValueError(f"An error occurred: {e}")
424
+ # Include part of the response text that failed to parse
425
+ raise ValueError(
426
+ f"Failed to decode JSON response: {e}. Response text starts with: {response.text[:200] if response else 'No response'}"
427
+ ) from e
428
+ except Exception as e: # Catch-all for other unexpected errors
429
+ raise ValueError(f"An unexpected error occurred during API call: {e}") from e
235
430
 
236
431
  def _split_and_request(
237
432
  self, full_docs: PdfReader, start_page: int, num_pages: int = DEFAULT_NUM_PAGES
238
433
  ) -> list[Element]:
239
434
  """
240
- Splits the full pdf document into partial pages and sends a request to the
241
- server.
242
-
243
- Args:
244
- full_docs (PdfReader): The full document to be split and requested.
245
- start_page (int): The starting page number for splitting the document.
246
- num_pages (int, optional): The number of pages to split the document
247
- into.
248
- Defaults to DEFAULT_NUMBER_OF_PAGE.
249
-
250
- Returns:
251
- response: The response from the server.
435
+ Splits the full pdf document into partial pages and sends a request.
252
436
  """
437
+ # Need to import here if not globally available
438
+ try:
439
+ from pypdf import PdfWriter
440
+ except ImportError:
441
+ raise ImportError("pypdf is required for PDF splitting. Please install it with `pip install pypdf`.")
442
+
253
443
  merger = PdfWriter()
254
- merger.append(
255
- full_docs,
256
- pages=(start_page, min(start_page + num_pages, full_docs.get_num_pages())),
257
- )
444
+ total_pages = len(full_docs.pages) # Use len(reader.pages) instead of get_num_pages()
445
+ end_page = min(start_page + num_pages, total_pages)
446
+
447
+ # Check if start_page is valid
448
+ if start_page >= total_pages:
449
+ parser_logger.warning(f"Start page {start_page} is out of bounds for document with {total_pages} pages.")
450
+ return []
451
+
452
+ # pypdf page indices are 0-based, slicing is exclusive of the end index
453
+ # PdfWriter.append() expects pages=(start, stop) where stop is exclusive.
454
+ # However, the example used pages=(start, end) which might behave differently depending on version?
455
+ # Let's stick to add_page for clarity if possible, or ensure append range is correct.
456
+ # merger.append(full_docs, pages=(start_page, end_page)) # This selects pages start_page..end_page-1
457
+
458
+ # Alternative using add_page loop (more explicit)
459
+ for i in range(start_page, end_page):
460
+ merger.add_page(full_docs.pages[i])
258
461
 
259
462
  with io.BytesIO() as buffer:
260
463
  merger.write(buffer)
261
464
  buffer.seek(0)
262
- return self._get_response({"document": buffer})
465
+ # Need to provide a filename for the 'files' dict
466
+ return self._get_response({"document": ("partial_doc.pdf", buffer)}) # Provide a dummy filename
263
467
 
264
468
  def _element_document(self, element: Element, start_page: int = 0) -> Document:
265
- """
266
- Converts an elements into a Document object.
267
-
268
- Args:
269
- elements (Dict) : The elements to convert.
270
- start_page (int): The starting page number for splitting the document.
271
- This number starts from zero.
469
+ """Converts an element into a Document object."""
470
+ # parse_text now handles image path generation and data storage if needed
471
+ page_content = element.parse_text(self)
472
+ metadata: dict[str, object] = element.model_dump(
473
+ exclude={"content", "base64_encoding"}, exclude_none=True
474
+ ) # Exclude raw content/base64
475
+ metadata["page"] = element.page + start_page # Adjust page number
476
+ # Base64 encoding is not added to metadata if it was processed into image_data
477
+ # Coordinates are kept if requested
478
+ if not self.coordinates:
479
+ metadata.pop("coordinates", None)
272
480
 
273
- Returns:
274
- A list containing a single Document object.
275
-
276
- """
277
- metadata: dict[str, object] = element.model_dump(exclude_none=True)
278
- metadata["page"] = element.page + start_page
279
481
  return Document(
280
- page_content=element.parse_text(self),
482
+ page_content=page_content,
281
483
  metadata=metadata,
282
484
  )
283
485
 
284
486
  def _page_document(self, elements: list[Element], start_page: int = 0) -> list[Document]:
285
- """
286
- Combines elements with the same page number into a single Document object.
287
-
288
- Args:
289
- elements (List): A list of elements containing page numbers.
290
- start_page (int): The starting page number for splitting the document.
291
- This number starts from zero.
292
-
293
- Returns:
294
- List[Document]: A list of Document objects, each representing a page
295
- with its content and metadata.
296
- """
487
+ """Combines elements with the same page number into a single Document object."""
297
488
  documents: list[Document] = []
298
- pages: list[int] = sorted(set(map(lambda x: x.page, elements)))
299
- page_group: list[list[Element]] = [[element for element in elements if element.page == x] for x in pages]
300
- for group in page_group:
489
+ if not elements:
490
+ return documents
491
+
492
+ # Group elements by page (relative to the current batch)
493
+ pages: list[int] = sorted(list(set(map(lambda x: x.page, elements))))
494
+ page_groups: Dict[int, list[Element]] = {page: [] for page in pages}
495
+ for element in elements:
496
+ page_groups[element.page].append(element)
497
+
498
+ for page_num, group in page_groups.items():
499
+ actual_page_num = page_num + start_page
500
+ page_content_parts: list[str] = []
501
+ page_coordinates: list[Coordinates] = []
502
+ # Base64 encodings are handled within parse_text now, not collected here
503
+
504
+ for element in sorted(group, key=lambda x: x.id): # Process elements in order
505
+ page_content_parts.append(element.parse_text(self))
506
+ if self.coordinates and element.coordinates:
507
+ page_coordinates.append({ # Store coordinates with element id/category for context
508
+ "id": element.id,
509
+ "category": element.category,
510
+ "coordinates": element.coordinates,
511
+ })
512
+
301
513
  metadata: dict[str, object] = {
302
- "page": group[0].page + start_page,
514
+ "page": actual_page_num,
303
515
  }
304
- if self.base64_encoding:
305
- metadata["base64_encodings"] = [element.base64_encoding for element in group if element.base64_encoding]
306
- if self.coordinates:
307
- metadata["coordinates"] = [element.coordinates for element in group if element.coordinates]
516
+ if self.coordinates and page_coordinates:
517
+ metadata["element_coordinates"] = page_coordinates # Changed key for clarity
518
+
519
+ # Combine content, typically with spaces or newlines
520
+ # Using newline might be better for readability if elements are paragraphs etc.
521
+ combined_page_content = "\n\n".join(part for part in page_content_parts if part) # Join non-empty parts
522
+
308
523
  documents.append(
309
524
  Document(
310
- page_content=" ".join(element.parse_text(self) for element in group),
525
+ page_content=combined_page_content,
311
526
  metadata=metadata,
312
527
  )
313
528
  )
@@ -316,123 +531,175 @@ class UpstageDocumentParseParser(BaseBlobParser):
316
531
 
317
532
  def lazy_parse(self, blob: Blob, is_batch: bool = False) -> Iterator[Document]:
318
533
  """
319
- Lazily parses a document and yields Document objects based on the specified
320
- split type.
534
+ Lazily parses a document blob.
535
+
536
+ Yields Document objects based on the specified split type.
537
+ If images are extracted (chatterer=None, base64_encoding=["figure"]),
538
+ the image data will be available in `self.image_data` after iteration.
321
539
 
322
540
  Args:
323
- blob (Blob): The input document blob to parse.
324
- is_batch (bool, optional): Whether to parse the document in batches.
325
- Defaults to False (single page parsing)
541
+ blob (Blob): The input document blob to parse. Requires `blob.path`.
542
+ is_batch (bool, optional): Currently affects PDF page batch size.
543
+ Defaults to False (process 1 page batch for PDF).
544
+ *Note: API might have limits regardless.*
326
545
 
327
546
  Yields:
328
- Document: The parsed document object.
547
+ Document: The parsed document object(s).
329
548
 
330
549
  Raises:
331
- ValueError: If an invalid split type is provided.
332
-
550
+ ValueError: If blob.path is not set, API error occurs, or invalid config.
551
+ ImportError: If pypdf is needed but not installed.
333
552
  """
553
+ # Clear image data at the start of parsing for this specific call
554
+ self.image_data = {}
334
555
 
335
- if is_batch:
336
- num_pages = DEFAULT_NUM_PAGES
337
- else:
338
- num_pages = 1
556
+ if not blob.path:
557
+ # Non-PDF files and direct API calls require reading the file,
558
+ # PDF splitting also requires the path.
559
+ raise ValueError("Blob path is required for UpstageDocumentParseParser.")
339
560
 
340
- full_docs: Optional[PdfReader] = None
561
+ # Try importing pypdf here, only if needed
562
+ PdfReader = None
563
+ PdfReadError = None
341
564
  try:
342
- full_docs = PdfReader(str(blob.path))
343
- number_of_pages = full_docs.get_num_pages()
344
- except PdfReadError:
345
- number_of_pages = 1
346
- except Exception as e:
347
- raise ValueError(f"Failed to read PDF file: {e}")
348
-
349
- if self.split == "none":
350
- result = ""
351
- base64_encodings: list[str] = []
352
- coordinates: list[list[Coordinate]] = []
353
-
354
- if full_docs is not None:
355
- start_page = 0
356
- num_pages = DEFAULT_NUM_PAGES
357
- for _ in range(number_of_pages):
358
- if start_page >= number_of_pages:
359
- break
565
+ from pypdf import PdfReader as PyPdfReader
566
+ from pypdf.errors import PdfReadError as PyPdfReadError
360
567
 
361
- elements = self._split_and_request(full_docs, start_page, num_pages)
362
- for element in elements:
363
- result += element.parse_text(self)
364
- if self.base64_encoding and (base64_encoding := element.base64_encoding):
365
- base64_encodings.append(base64_encoding)
366
- if self.coordinates and (coords := element.coordinates):
367
- coordinates.append(coords)
568
+ PdfReader = PyPdfReader
569
+ PdfReadError = PyPdfReadError
570
+ except ImportError:
571
+ # We only absolutely need pypdf if the file is a PDF and split is not 'none' maybe?
572
+ # Let's attempt to read anyway, API might support non-PDFs directly.
573
+ # We'll check for PdfReader later if we determine it's a PDF.
574
+ pass
368
575
 
369
- start_page += num_pages
576
+ full_docs: Optional[PdfReader] = None
577
+ is_pdf = False
578
+ number_of_pages = 1 # Default for non-PDF or single-page docs
370
579
 
580
+ try:
581
+ # Check if it's a PDF by trying to open it
582
+ if PdfReader and PdfReadError:
583
+ try:
584
+ # Use strict=False to be more lenient with potentially corrupted PDFs
585
+ full_docs = PdfReader(str(blob.path), strict=False)
586
+ number_of_pages = len(full_docs.pages)
587
+ is_pdf = True
588
+ except (PdfReadError, FileNotFoundError, IsADirectoryError) as e:
589
+ parser_logger.warning(f"Could not read '{blob.path}' as PDF: {e}. Assuming non-PDF format.")
590
+ except Exception as e: # Catch other potential pypdf errors
591
+ parser_logger.error(f"Unexpected error reading PDF '{blob.path}': {e}")
592
+ raise ValueError(f"Failed to process PDF file: {e}") from e
371
593
  else:
372
- if not blob.path:
373
- raise ValueError("Blob path is required for non-PDF files.")
594
+ parser_logger.info("pypdf not installed. Treating input as a single non-PDF document for the API.")
374
595
 
375
- with open(blob.path, "rb") as f:
376
- elements = self._get_response({"document": f})
596
+ except Exception as e:
597
+ raise ValueError(f"Failed to access or identify file type for: {blob.path}. Error: {e}") from e
377
598
 
378
- for element in elements:
379
- result += element.parse_text(self)
599
+ # --- Parsing Logic based on Split Type ---
380
600
 
381
- if self.base64_encoding and (base64_encoding := element.base64_encoding):
382
- base64_encodings.append(base64_encoding)
383
- if self.coordinates and (coords := element.coordinates):
384
- coordinates.append(coords)
385
- metadata: dict[str, object] = {"total_pages": number_of_pages}
386
- if self.coordinates:
387
- metadata["coordinates"] = coordinates
388
- if self.base64_encoding:
389
- metadata["base64_encodings"] = base64_encodings
601
+ # Case 1: No Splitting (Combine all content)
602
+ if self.split == "none":
603
+ combined_result = ""
604
+ all_coordinates: list[PageCoordinates] = []
605
+ # Base64 handled by parse_text, data stored in self.image_data
606
+
607
+ if is_pdf and full_docs and PdfReader: # Process PDF page by page or in batches
608
+ start_page = 0
609
+ # Use a reasonable batch size for 'none' split to avoid huge requests
610
+ batch_num_pages = DEFAULT_NUM_PAGES
611
+ while start_page < number_of_pages:
612
+ elements = self._split_and_request(full_docs, start_page, batch_num_pages)
613
+ for element in sorted(elements, key=lambda x: (x.page, x.id)):
614
+ combined_result += element.parse_text(self) + "\n\n" # Add separator
615
+ if self.coordinates and element.coordinates:
616
+ # Adjust page number for coordinates metadata
617
+ coords_with_page: PageCoordinates = {
618
+ "id": element.id,
619
+ "category": element.category,
620
+ "page": element.page + start_page, # Actual page
621
+ "coordinates": element.coordinates,
622
+ }
623
+ all_coordinates.append(coords_with_page)
624
+ start_page += batch_num_pages
625
+ else: # Process non-PDF file as a single unit
626
+ with open(blob.path, "rb") as f:
627
+ # Provide a filename for the 'files' dict
628
+ filename = os.path.basename(blob.path)
629
+ elements = self._get_response({"document": (filename, f)})
630
+
631
+ for element in sorted(elements, key=lambda x: x.id):
632
+ combined_result += element.parse_text(self) + "\n\n"
633
+ if self.coordinates and element.coordinates:
634
+ all_coordinates.append({
635
+ "id": element.id,
636
+ "category": element.category,
637
+ "page": element.page, # Page is relative to the single doc (usually 0 or 1)
638
+ "coordinates": element.coordinates,
639
+ })
640
+
641
+ metadata: dict[str, object] = {"source": blob.path, "total_pages": number_of_pages}
642
+ if self.coordinates and all_coordinates:
643
+ metadata["element_coordinates"] = all_coordinates
644
+ # self.image_data is populated, no need to add base64 to metadata
390
645
 
391
646
  yield Document(
392
- page_content=result,
647
+ page_content=combined_result.strip(),
393
648
  metadata=metadata,
394
649
  )
395
650
 
651
+ # Case 2: Split by Element
396
652
  elif self.split == "element":
397
- if full_docs is not None:
653
+ if is_pdf and full_docs and PdfReader:
398
654
  start_page = 0
399
- for _ in range(number_of_pages):
400
- if start_page >= number_of_pages:
401
- break
402
-
403
- elements = self._split_and_request(full_docs, start_page, num_pages)
404
- for element in elements:
405
- yield self._element_document(element, start_page)
406
-
407
- start_page += num_pages
408
-
409
- else:
410
- if not blob.path:
411
- raise ValueError("Blob path is required for non-PDF files.")
655
+ batch_num_pages = DEFAULT_NUM_PAGES if is_batch else 1 # Use smaller batches for element split?
656
+ while start_page < number_of_pages:
657
+ elements = self._split_and_request(full_docs, start_page, batch_num_pages)
658
+ for element in sorted(elements, key=lambda x: (x.page, x.id)):
659
+ # _element_document handles metadata and adjusts page number
660
+ doc = self._element_document(element, start_page)
661
+ _get_metadata_from_document(doc)["source"] = blob.path # Add source
662
+ yield doc
663
+ start_page += batch_num_pages
664
+ else: # Non-PDF
412
665
  with open(blob.path, "rb") as f:
413
- elements = self._get_response({"document": f})
414
-
415
- for element in elements:
416
- yield self._element_document(element)
417
-
666
+ filename = os.path.basename(blob.path)
667
+ elements = self._get_response({"document": (filename, f)})
668
+ for element in sorted(elements, key=lambda x: x.id):
669
+ doc = self._element_document(element, 0) # Start page is 0 for single doc
670
+ _get_metadata_from_document(doc)["source"] = blob.path # Add source
671
+ yield doc
672
+
673
+ # Case 3: Split by Page
418
674
  elif self.split == "page":
419
- if full_docs is not None:
675
+ if is_pdf and full_docs and PdfReader:
420
676
  start_page = 0
421
- for _ in range(number_of_pages):
422
- if start_page >= number_of_pages:
423
- break
424
-
425
- elements = self._split_and_request(full_docs, start_page, num_pages)
426
- yield from self._page_document(elements, start_page)
427
-
428
- start_page += num_pages
429
- else:
430
- if not blob.path:
431
- raise ValueError("Blob path is required for non-PDF files.")
677
+ batch_num_pages = DEFAULT_NUM_PAGES if is_batch else 1 # Process page-by-page if not is_batch
678
+ while start_page < number_of_pages:
679
+ elements = self._split_and_request(full_docs, start_page, batch_num_pages)
680
+ # _page_document groups elements by page and creates Documents
681
+ page_docs = self._page_document(elements, start_page)
682
+ for doc in page_docs:
683
+ _get_metadata_from_document(doc)["source"] = blob.path # Add source
684
+ yield doc
685
+ start_page += batch_num_pages
686
+ else: # Non-PDF (treat as single page)
432
687
  with open(blob.path, "rb") as f:
433
- elements = self._get_response({"document": f})
434
-
435
- yield from self._page_document(elements)
688
+ filename = os.path.basename(blob.path)
689
+ elements = self._get_response({"document": (filename, f)})
690
+ page_docs = self._page_document(elements, 0) # Process elements as page 0
691
+ for doc in page_docs:
692
+ _get_metadata_from_document(doc)["source"] = blob.path # Add source
693
+ yield doc
436
694
 
437
695
  else:
438
696
  raise ValueError(f"Invalid split type: {self.split}")
697
+
698
+
699
def _get_metadata_from_document(doc: Document) -> dict[object, object]:
    """
    Return the metadata mapping attached to ``doc``.

    The returned object is ``doc.metadata`` itself, not a copy, so writing to
    it (for example adding a ``"source"`` key) mutates the document in place.
    """
    return cast("dict[object, object]", doc.metadata)  # pyright: ignore[reportUnknownMemberType]