chatterer 0.1.17__py3-none-any.whl → 0.1.19__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (44)
  1. chatterer/__init__.py +93 -93
  2. chatterer/common_types/__init__.py +21 -21
  3. chatterer/common_types/io.py +19 -19
  4. chatterer/examples/__init__.py +0 -0
  5. chatterer/examples/anything_to_markdown.py +95 -0
  6. chatterer/examples/get_code_snippets.py +64 -0
  7. chatterer/examples/login_with_playwright.py +171 -0
  8. chatterer/examples/make_ppt.py +499 -0
  9. chatterer/examples/pdf_to_markdown.py +107 -0
  10. chatterer/examples/pdf_to_text.py +60 -0
  11. chatterer/examples/transcription_api.py +127 -0
  12. chatterer/examples/upstage_parser.py +95 -0
  13. chatterer/examples/webpage_to_markdown.py +79 -0
  14. chatterer/interactive.py +354 -354
  15. chatterer/language_model.py +533 -533
  16. chatterer/messages.py +21 -21
  17. chatterer/strategies/__init__.py +13 -13
  18. chatterer/strategies/atom_of_thoughts.py +975 -975
  19. chatterer/strategies/base.py +14 -14
  20. chatterer/tools/__init__.py +46 -46
  21. chatterer/tools/caption_markdown_images.py +384 -384
  22. chatterer/tools/citation_chunking/__init__.py +3 -3
  23. chatterer/tools/citation_chunking/chunks.py +53 -53
  24. chatterer/tools/citation_chunking/citation_chunker.py +118 -118
  25. chatterer/tools/citation_chunking/citations.py +285 -285
  26. chatterer/tools/citation_chunking/prompt.py +157 -157
  27. chatterer/tools/citation_chunking/reference.py +26 -26
  28. chatterer/tools/citation_chunking/utils.py +138 -138
  29. chatterer/tools/convert_pdf_to_markdown.py +302 -302
  30. chatterer/tools/convert_to_text.py +447 -447
  31. chatterer/tools/upstage_document_parser.py +705 -705
  32. chatterer/tools/webpage_to_markdown.py +739 -739
  33. chatterer/tools/youtube.py +146 -146
  34. chatterer/utils/__init__.py +15 -15
  35. chatterer/utils/base64_image.py +285 -285
  36. chatterer/utils/bytesio.py +59 -59
  37. chatterer/utils/code_agent.py +237 -237
  38. chatterer/utils/imghdr.py +148 -148
  39. {chatterer-0.1.17.dist-info → chatterer-0.1.19.dist-info}/METADATA +392 -392
  40. chatterer-0.1.19.dist-info/RECORD +44 -0
  41. {chatterer-0.1.17.dist-info → chatterer-0.1.19.dist-info}/WHEEL +1 -1
  42. chatterer-0.1.19.dist-info/entry_points.txt +10 -0
  43. chatterer-0.1.17.dist-info/RECORD +0 -33
  44. {chatterer-0.1.17.dist-info → chatterer-0.1.19.dist-info}/top_level.txt +0 -0
@@ -1,705 +1,705 @@
1
- # -*- coding: utf-8 -*-
2
- """Adopted from `langchain_upstage.document_parse`"""
3
-
4
- from __future__ import annotations
5
-
6
- import base64
7
- import binascii
8
- import io
9
- import json
10
- import logging
11
- import os
12
- import uuid
13
- from typing import TYPE_CHECKING, Dict, Iterator, Literal, Optional, TypedDict, cast
14
-
15
- import requests
16
- from langchain_core.document_loaders import BaseBlobParser, Blob
17
- from langchain_core.documents import Document
18
- from pydantic import BaseModel, Field
19
-
20
- from ..common_types.io import BytesReadable
21
- from ..language_model import DEFAULT_IMAGE_DESCRIPTION_INSTRUCTION, Chatterer
22
- from ..utils.base64_image import Base64Image
23
- from ..utils.imghdr import what
24
-
25
- if TYPE_CHECKING:
26
- from pypdf import PdfReader
27
-
28
- logger = logging.getLogger("pypdf")
29
- logger.setLevel(logging.ERROR)
30
- parser_logger = logging.getLogger(__name__) # Added logger for this module
31
-
32
- DOCUMENT_PARSE_BASE_URL = "https://api.upstage.ai/v1/document-ai/document-parse"
33
- DEFAULT_NUM_PAGES = 10
34
- DOCUMENT_PARSE_DEFAULT_MODEL = "document-parse"
35
- DEFAULT_IMAGE_DIR = "images" # Added default image directory
36
-
37
- OutputFormat = Literal["text", "html", "markdown"]
38
- OCR = Literal["auto", "force"]
39
- SplitType = Literal["none", "page", "element"]
40
- Category = Literal[
41
- "paragraph",
42
- "table",
43
- "figure",
44
- "header",
45
- "footer",
46
- "caption",
47
- "equation",
48
- "heading1",
49
- "list",
50
- "index",
51
- "footnote",
52
- "chart",
53
- ]
54
-
55
-
56
- class Content(BaseModel):
57
- text: Optional[str] = None
58
- html: Optional[str] = None
59
- markdown: Optional[str] = None
60
-
61
-
62
- class Coordinate(BaseModel):
63
- x: float
64
- y: float
65
-
66
-
67
- class Element(BaseModel):
68
- category: Category
69
- content: Content
70
- coordinates: list[Coordinate] = Field(default_factory=list)
71
- base64_encoding: str = ""
72
- id: int
73
- page: int
74
-
75
- def parse_text(self, parser: "UpstageDocumentParseParser") -> str:
76
- """
77
- Generates the text representation of the element.
78
-
79
- If the element is a figure with base64 encoding and no chatterer is provided,
80
- it generates a markdown link to a uniquely named image file and stores the
81
- image data in the parser's image_data dictionary. Otherwise, it uses the
82
- chatterer for description or returns the standard text/html/markdown.
83
- """
84
- output_format: OutputFormat = parser.output_format
85
- chatterer: Optional[Chatterer] = parser.chatterer
86
- image_description_instruction: str = parser.image_description_instruction
87
- output: Optional[str] = None
88
-
89
- if output_format == "text":
90
- output = self.content.text
91
- elif output_format == "html":
92
- output = self.content.html
93
- elif output_format == "markdown":
94
- output = self.content.markdown
95
-
96
- if output is None:
97
- # Fallback or raise error if needed, here using text as fallback
98
- output = self.content.text or ""
99
- # Or raise ValueError(f"Invalid output format or missing content: {output_format}")
100
-
101
- # --- Logic modification starts here ---
102
- if self.category == "figure" and self.base64_encoding:
103
- # Case 1: Chatterer is available - Generate description
104
- if chatterer is not None:
105
- # Check if base64 encoding is valid
106
- try:
107
- # Decode base64 to check if valid
108
- img_type = what(self.base64_encoding)
109
- if not img_type:
110
- parser_logger.warning(
111
- f"Could not determine image type for figure element {self.id} (page {self.page})."
112
- )
113
- return output
114
- image = Base64Image.from_string(f"data:image/{img_type};base64,{self.base64_encoding}")
115
-
116
- except (binascii.Error, ValueError) as e:
117
- parser_logger.warning(
118
- f"Could not decode base64 for figure element {self.id} (page {self.page}): {e}. Falling back to original output."
119
- )
120
- return output
121
-
122
- if image is None:
123
- parser_logger.warning(
124
- f"Invalid base64 encoding format for image element {self.id}, cannot create Base64Image object."
125
- )
126
- # Fallback to original output (placeholder/OCR)
127
- return output
128
-
129
- ocr_content = ""
130
- if output_format == "markdown":
131
- ocr_content = output.removeprefix("![image](/image/placeholder)\n")
132
- elif output_format == "text":
133
- ocr_content = output
134
-
135
- image_description = chatterer.describe_image(
136
- image.data_uri,
137
- image_description_instruction
138
- + f"\nHint: The OCR detected the following text:\n```\n{ocr_content}\n```",
139
- )
140
- # Return description within details tag (as original)
141
- output = f"\n\n<details>\n<summary>Image Description</summary>\n{image_description}\n</details>\n\n"
142
-
143
- # Case 2: Chatterer is NOT available - Generate file path and store data
144
- elif parser.image_dir is not None:
145
- try:
146
- img_type = what(self.base64_encoding)
147
- if not img_type:
148
- parser_logger.warning(
149
- f"Could not determine image type for figure element {self.id} (page {self.page})."
150
- )
151
- return output
152
-
153
- image_bytes = base64.b64decode(self.base64_encoding)
154
-
155
- # Generate unique filename and path
156
- filename = f"{uuid.uuid4().hex}.{img_type}" # Use default format
157
- # Create relative path for markdown link, ensuring forward slashes
158
- relative_path = os.path.join(parser.image_dir, filename).replace("\\", "/")
159
-
160
- # Store the image data for the user to save later
161
- parser.image_data[relative_path] = image_bytes
162
-
163
- # Extract OCR content if present
164
- ocr_content = ""
165
- if output_format == "markdown" and output.startswith("![image]"):
166
- ocr_content = output.split("\n", 1)[1] if "\n" in output else ""
167
- elif output_format == "text":
168
- ocr_content = output # Assume text output is OCR for images
169
-
170
- # Update output to be the markdown link + OCR
171
- output = f"![image]({relative_path})\n{ocr_content}".strip()
172
-
173
- except (binascii.Error, ValueError) as e:
174
- # Handle potential base64 decoding errors gracefully
175
- parser_logger.warning(
176
- f"Could not decode base64 for figure element {self.id} (page {self.page}): {e}. Falling back to original output."
177
- )
178
- # Keep the original 'output' value (placeholder or OCR)
179
- pass
180
-
181
- return output
182
-
183
-
184
- class Coordinates(TypedDict):
185
- id: int
186
- category: Category
187
- coordinates: list[Coordinate]
188
-
189
-
190
- class PageCoordinates(Coordinates):
191
- page: int
192
-
193
-
194
- def get_from_param_or_env(
195
- key: str,
196
- param: Optional[str] = None,
197
- env_key: Optional[str] = None,
198
- default: Optional[str] = None,
199
- ) -> str:
200
- """Get a value from a param or an environment variable."""
201
- if param is not None:
202
- return param
203
- elif env_key and env_key in os.environ and os.environ[env_key]:
204
- return os.environ[env_key]
205
- elif default is not None:
206
- return default
207
- else:
208
- raise ValueError(
209
- f"Did not find {key}, please add an environment variable"
210
- f" `{env_key}` which contains it, or pass"
211
- f" `{key}` as a named parameter."
212
- )
213
-
214
-
215
- class UpstageDocumentParseParser(BaseBlobParser):
216
- """Upstage Document Parse Parser.
217
-
218
- Parses documents using the Upstage Document AI API. Can optionally extract
219
- images and return their data alongside the parsed documents.
220
-
221
- If a `chatterer` is provided, it will be used to generate descriptions for
222
- images (figures with base64 encoding).
223
-
224
- If `chatterer` is NOT provided, for figure elements with `base64_encoding`,
225
- this parser will:
226
- 1. Generate a unique relative file path (e.g., "images/uuid.jpeg").
227
- The base directory can be configured with `image_dir`.
228
- 2. Replace the element's content with a markdown image link pointing to this path.
229
- 3. Store the actual image bytes in the `image_data` attribute dictionary,
230
- mapping the generated relative path to the bytes.
231
-
232
- The user is responsible for saving the files from the `image_data` dictionary
233
- after processing the documents yielded by `lazy_parse`.
234
-
235
- To use, you should have the environment variable `UPSTAGE_API_KEY`
236
- set with your API key or pass it as a named parameter to the constructor.
237
-
238
- Example:
239
- .. code-block:: python
240
-
241
- from langchain_upstage import UpstageDocumentParseParser
242
- from langchain_core.documents import Blob
243
- import os
244
-
245
- # --- Setup ---
246
- # Ensure UPSTAGE_API_KEY is set in environment or passed as api_key
247
- # Create a dummy PDF or image file 'my_document.pdf' / 'my_image.png'
248
-
249
- # --- Parsing without chatterer (extracts images) ---
250
- parser = UpstageDocumentParseParser(
251
- split="page",
252
- output_format="markdown",
253
- base64_encoding=["figure"], # Important: Request base64 for figures
254
- image_dir="extracted_images" # Optional: specify image dir
255
- )
256
- blob = Blob.from_path("my_document.pdf") # Or your image file path
257
- documents = []
258
- for doc in parser.lazy_parse(blob):
259
- print("--- Document ---")
260
- print(f"Page: {get_metadata_from_document(doc).get('page')}")
261
- print(doc.page_content)
262
- documents.append(doc)
263
-
264
- print("\\n--- Extracted Image Data ---")
265
- if parser.image_data:
266
- # User saves the images
267
- for img_path, img_bytes in parser.image_data.items():
268
- # Create directories if they don't exist
269
- os.makedirs(os.path.dirname(img_path), exist_ok=True)
270
- try:
271
- with open(img_path, "wb") as f:
272
- f.write(img_bytes)
273
- print(f"Saved image: {img_path}")
274
- except IOError as e:
275
- print(f"Error saving image {img_path}: {e}")
276
- else:
277
- print("No images extracted.")
278
-
279
- # --- Parsing with chatterer (generates descriptions) ---
280
- # from langchain_upstage import UpstageChatter # Assuming this exists
281
- # chatterer = UpstageChatter() # Initialize your chatterer
282
- # parser_with_desc = UpstageDocumentParseParser(
283
- # split="page",
284
- # output_format="markdown",
285
- # base64_encoding=["figure"], # Still need base64 for description
286
- # chatterer=chatterer
287
- # )
288
- # documents_with_desc = list(parser_with_desc.lazy_parse(blob))
289
- # print("\\n--- Documents with Descriptions ---")
290
- # for doc in documents_with_desc:
291
- # print(f"Page: {get_metadata_from_document(doc).get('page')}")
292
- # print(doc.page_content)
293
-
294
- """
295
-
296
- def __init__(
297
- self,
298
- api_key: Optional[str] = None,
299
- base_url: str = DOCUMENT_PARSE_BASE_URL,
300
- model: str = DOCUMENT_PARSE_DEFAULT_MODEL,
301
- split: SplitType = "none",
302
- ocr: OCR = "auto",
303
- output_format: OutputFormat = "markdown",
304
- coordinates: bool = True,
305
- base64_encoding: list[Category] = [],
306
- chatterer: Optional[Chatterer] = None,
307
- image_description_instruction: str = DEFAULT_IMAGE_DESCRIPTION_INSTRUCTION,
308
- image_dir: Optional[str] = None, # Added: Directory for image paths
309
- ) -> None:
310
- """
311
- Initializes an instance of the UpstageDocumentParseParser.
312
-
313
- Args:
314
- api_key (str, optional): Upstage API key. Defaults to env `UPSTAGE_API_KEY`.
315
- base_url (str, optional): Base URL for the Upstage API.
316
- model (str): Model for document parse. Defaults to "document-parse".
317
- split (SplitType, optional): Splitting type ("none", "page", "element").
318
- Defaults to "none".
319
- ocr (OCR, optional): OCR mode ("auto", "force"). Defaults to "auto".
320
- output_format (OutputFormat, optional): Output format ("text", "html", "markdown").
321
- Defaults to "markdown".
322
- coordinates (bool, optional): Include coordinates in metadata. Defaults to True.
323
- base64_encoding (List[Category], optional): Categories to return as base64.
324
- Crucial for image extraction/description.
325
- Set to `["figure"]` to process images.
326
- Defaults to [].
327
- chatterer (Chatterer, optional): Chatterer instance for image description.
328
- If None, images will be extracted to files.
329
- Defaults to None.
330
- image_description_instruction (str, optional): Instruction for image description.
331
- Defaults to a standard instruction.
332
- image_dir (str, optional): The directory name to use when constructing
333
- relative paths for extracted images.
334
- Defaults to "images". This directory
335
- is NOT created by the parser.
336
- """
337
- self.api_key = get_from_param_or_env(
338
- "UPSTAGE_API_KEY",
339
- api_key,
340
- "UPSTAGE_API_KEY",
341
- os.environ.get("UPSTAGE_API_KEY"),
342
- )
343
- self.base_url = base_url
344
- self.model = model
345
- self.split: SplitType = split
346
- self.ocr: OCR = ocr
347
- self.output_format: OutputFormat = output_format
348
- self.coordinates = coordinates
349
- # Ensure 'figure' is requested if chatterer is None and user wants extraction implicitly
350
- # However, it's better to require the user to explicitly set base64_encoding=["figure"]
351
- self.base64_encoding: list[Category] = base64_encoding
352
- self.chatterer = chatterer
353
- self.image_description_instruction = image_description_instruction
354
- self.image_dir = image_dir # Store output directory name
355
-
356
- # Initialize dictionary to store image data (path -> bytes)
357
- self.image_data: Dict[str, bytes] = {}
358
-
359
- def _get_response(self, files: dict[str, tuple[str, BytesReadable]]) -> list[Element]:
360
- """
361
- Sends a POST request to the API endpoint with the provided files and
362
- returns the parsed elements.
363
- """
364
- response: Optional[requests.Response] = None
365
- try:
366
- headers = {
367
- "Authorization": f"Bearer {self.api_key}",
368
- }
369
- # Convert list to string representation required by the API
370
- base64_encoding_str = str(self.base64_encoding) if self.base64_encoding else "[]"
371
- output_formats_str = f"['{self.output_format}']"
372
-
373
- response = requests.post(
374
- self.base_url,
375
- headers=headers,
376
- files=files,
377
- data={
378
- "ocr": self.ocr,
379
- "model": self.model,
380
- "output_formats": output_formats_str,
381
- "coordinates": str(self.coordinates).lower(), # API might expect 'true'/'false'
382
- "base64_encoding": base64_encoding_str,
383
- },
384
- )
385
- response.raise_for_status() # Raises HTTPError for bad responses (4xx or 5xx)
386
-
387
- # Check content type before parsing JSON
388
- content_type = response.headers.get("Content-Type", "")
389
- if "application/json" not in content_type:
390
- raise ValueError(f"Unexpected content type: {content_type}. Response body: {response.text}")
391
-
392
- response_data = response.json()
393
- result: object = response_data.get("elements", [])
394
-
395
- if not isinstance(result, list):
396
- raise ValueError(f"API response 'elements' is not a list: {result}")
397
- result = cast(list[object], result) # Cast to list of objects
398
-
399
- # Validate each element using Pydantic
400
- validated_elements: list[Element] = []
401
- for i, element_data in enumerate(result):
402
- try:
403
- validated_elements.append(Element.model_validate(element_data))
404
- except Exception as e: # Catch Pydantic validation errors etc.
405
- parser_logger.error(f"Failed to validate element {i}: {element_data}. Error: {e}")
406
- # Decide whether to skip the element or raise the error
407
- # continue # Option: skip problematic element
408
- raise ValueError(f"Failed to validate element {i}: {e}") from e # Option: fail fast
409
-
410
- return validated_elements
411
-
412
- except requests.HTTPError as e:
413
- # Log more details from the response if available
414
- error_message = f"HTTP error: {e.response.status_code} {e.response.reason}"
415
- try:
416
- error_details = e.response.json() # Try to get JSON error details
417
- error_message += f" - {error_details}"
418
- except json.JSONDecodeError:
419
- error_message += f" - Response body: {e.response.text}"
420
- raise ValueError(error_message) from e
421
- except requests.RequestException as e:
422
- raise ValueError(f"Failed to send request: {e}") from e
423
- except json.JSONDecodeError as e:
424
- # Include part of the response text that failed to parse
425
- raise ValueError(
426
- f"Failed to decode JSON response: {e}. Response text starts with: {response.text[:200] if response else 'No response'}"
427
- ) from e
428
- except Exception as e: # Catch-all for other unexpected errors
429
- raise ValueError(f"An unexpected error occurred during API call: {e}") from e
430
-
431
- def _split_and_request(
432
- self, full_docs: PdfReader, start_page: int, num_pages: int = DEFAULT_NUM_PAGES
433
- ) -> list[Element]:
434
- """
435
- Splits the full pdf document into partial pages and sends a request.
436
- """
437
- # Need to import here if not globally available
438
- try:
439
- from pypdf import PdfWriter
440
- except ImportError:
441
- raise ImportError("pypdf is required for PDF splitting. Please install it with `pip install pypdf`.")
442
-
443
- merger = PdfWriter()
444
- total_pages = len(full_docs.pages) # Use len(reader.pages) instead of get_num_pages()
445
- end_page = min(start_page + num_pages, total_pages)
446
-
447
- # Check if start_page is valid
448
- if start_page >= total_pages:
449
- parser_logger.warning(f"Start page {start_page} is out of bounds for document with {total_pages} pages.")
450
- return []
451
-
452
- # pypdf page indices are 0-based, slicing is exclusive of the end index
453
- # PdfWriter.append() expects pages=(start, stop) where stop is exclusive.
454
- # However, the example used pages=(start, end) which might behave differently depending on version?
455
- # Let's stick to add_page for clarity if possible, or ensure append range is correct.
456
- # merger.append(full_docs, pages=(start_page, end_page)) # This selects pages start_page..end_page-1
457
-
458
- # Alternative using add_page loop (more explicit)
459
- for i in range(start_page, end_page):
460
- merger.add_page(full_docs.pages[i])
461
-
462
- with io.BytesIO() as buffer:
463
- merger.write(buffer)
464
- buffer.seek(0)
465
- # Need to provide a filename for the 'files' dict
466
- return self._get_response({"document": ("partial_doc.pdf", buffer)}) # Provide a dummy filename
467
-
468
- def _element_document(self, element: Element, start_page: int = 0) -> Document:
469
- """Converts an element into a Document object."""
470
- # parse_text now handles image path generation and data storage if needed
471
- page_content = element.parse_text(self)
472
- metadata: dict[str, object] = element.model_dump(
473
- exclude={"content", "base64_encoding"}, exclude_none=True
474
- ) # Exclude raw content/base64
475
- metadata["page"] = element.page + start_page # Adjust page number
476
- # Base64 encoding is not added to metadata if it was processed into image_data
477
- # Coordinates are kept if requested
478
- if not self.coordinates:
479
- metadata.pop("coordinates", None)
480
-
481
- return Document(
482
- page_content=page_content,
483
- metadata=metadata,
484
- )
485
-
486
- def _page_document(self, elements: list[Element], start_page: int = 0) -> list[Document]:
487
- """Combines elements with the same page number into a single Document object."""
488
- documents: list[Document] = []
489
- if not elements:
490
- return documents
491
-
492
- # Group elements by page (relative to the current batch)
493
- pages: list[int] = sorted(list(set(map(lambda x: x.page, elements))))
494
- page_groups: Dict[int, list[Element]] = {page: [] for page in pages}
495
- for element in elements:
496
- page_groups[element.page].append(element)
497
-
498
- for page_num, group in page_groups.items():
499
- actual_page_num = page_num + start_page
500
- page_content_parts: list[str] = []
501
- page_coordinates: list[Coordinates] = []
502
- # Base64 encodings are handled within parse_text now, not collected here
503
-
504
- for element in sorted(group, key=lambda x: x.id): # Process elements in order
505
- page_content_parts.append(element.parse_text(self))
506
- if self.coordinates and element.coordinates:
507
- page_coordinates.append({ # Store coordinates with element id/category for context
508
- "id": element.id,
509
- "category": element.category,
510
- "coordinates": element.coordinates,
511
- })
512
-
513
- metadata: dict[str, object] = {
514
- "page": actual_page_num,
515
- }
516
- if self.coordinates and page_coordinates:
517
- metadata["element_coordinates"] = page_coordinates # Changed key for clarity
518
-
519
- # Combine content, typically with spaces or newlines
520
- # Using newline might be better for readability if elements are paragraphs etc.
521
- combined_page_content = "\n\n".join(part for part in page_content_parts if part) # Join non-empty parts
522
-
523
- documents.append(
524
- Document(
525
- page_content=combined_page_content,
526
- metadata=metadata,
527
- )
528
- )
529
-
530
- return documents
531
-
532
- def lazy_parse(self, blob: Blob, is_batch: bool = False) -> Iterator[Document]:
533
- """
534
- Lazily parses a document blob.
535
-
536
- Yields Document objects based on the specified split type.
537
- If images are extracted (chatterer=None, base64_encoding=["figure"]),
538
- the image data will be available in `self.image_data` after iteration.
539
-
540
- Args:
541
- blob (Blob): The input document blob to parse. Requires `blob.path`.
542
- is_batch (bool, optional): Currently affects PDF page batch size.
543
- Defaults to False (process 1 page batch for PDF).
544
- *Note: API might have limits regardless.*
545
-
546
- Yields:
547
- Document: The parsed document object(s).
548
-
549
- Raises:
550
- ValueError: If blob.path is not set, API error occurs, or invalid config.
551
- ImportError: If pypdf is needed but not installed.
552
- """
553
- # Clear image data at the start of parsing for this specific call
554
- self.image_data = {}
555
-
556
- if not blob.path:
557
- # Non-PDF files and direct API calls require reading the file,
558
- # PDF splitting also requires the path.
559
- raise ValueError("Blob path is required for UpstageDocumentParseParser.")
560
-
561
- # Try importing pypdf here, only if needed
562
- PdfReader = None
563
- PdfReadError = None
564
- try:
565
- from pypdf import PdfReader as PyPdfReader
566
- from pypdf.errors import PdfReadError as PyPdfReadError
567
-
568
- PdfReader = PyPdfReader
569
- PdfReadError = PyPdfReadError
570
- except ImportError:
571
- # We only absolutely need pypdf if the file is a PDF and split is not 'none' maybe?
572
- # Let's attempt to read anyway, API might support non-PDFs directly.
573
- # We'll check for PdfReader later if we determine it's a PDF.
574
- pass
575
-
576
- full_docs: Optional[PdfReader] = None
577
- is_pdf = False
578
- number_of_pages = 1 # Default for non-PDF or single-page docs
579
-
580
- try:
581
- # Check if it's a PDF by trying to open it
582
- if PdfReader and PdfReadError:
583
- try:
584
- # Use strict=False to be more lenient with potentially corrupted PDFs
585
- full_docs = PdfReader(str(blob.path), strict=False)
586
- number_of_pages = len(full_docs.pages)
587
- is_pdf = True
588
- except (PdfReadError, FileNotFoundError, IsADirectoryError) as e:
589
- parser_logger.warning(f"Could not read '{blob.path}' as PDF: {e}. Assuming non-PDF format.")
590
- except Exception as e: # Catch other potential pypdf errors
591
- parser_logger.error(f"Unexpected error reading PDF '{blob.path}': {e}")
592
- raise ValueError(f"Failed to process PDF file: {e}") from e
593
- else:
594
- parser_logger.info("pypdf not installed. Treating input as a single non-PDF document for the API.")
595
-
596
- except Exception as e:
597
- raise ValueError(f"Failed to access or identify file type for: {blob.path}. Error: {e}") from e
598
-
599
- # --- Parsing Logic based on Split Type ---
600
-
601
- # Case 1: No Splitting (Combine all content)
602
- if self.split == "none":
603
- combined_result = ""
604
- all_coordinates: list[PageCoordinates] = []
605
- # Base64 handled by parse_text, data stored in self.image_data
606
-
607
- if is_pdf and full_docs and PdfReader: # Process PDF page by page or in batches
608
- start_page = 0
609
- # Use a reasonable batch size for 'none' split to avoid huge requests
610
- batch_num_pages = DEFAULT_NUM_PAGES
611
- while start_page < number_of_pages:
612
- elements = self._split_and_request(full_docs, start_page, batch_num_pages)
613
- for element in sorted(elements, key=lambda x: (x.page, x.id)):
614
- combined_result += element.parse_text(self) + "\n\n" # Add separator
615
- if self.coordinates and element.coordinates:
616
- # Adjust page number for coordinates metadata
617
- coords_with_page: PageCoordinates = {
618
- "id": element.id,
619
- "category": element.category,
620
- "page": element.page + start_page, # Actual page
621
- "coordinates": element.coordinates,
622
- }
623
- all_coordinates.append(coords_with_page)
624
- start_page += batch_num_pages
625
- else: # Process non-PDF file as a single unit
626
- with open(blob.path, "rb") as f:
627
- # Provide a filename for the 'files' dict
628
- filename = os.path.basename(blob.path)
629
- elements = self._get_response({"document": (filename, f)})
630
-
631
- for element in sorted(elements, key=lambda x: x.id):
632
- combined_result += element.parse_text(self) + "\n\n"
633
- if self.coordinates and element.coordinates:
634
- all_coordinates.append({
635
- "id": element.id,
636
- "category": element.category,
637
- "page": element.page, # Page is relative to the single doc (usually 0 or 1)
638
- "coordinates": element.coordinates,
639
- })
640
-
641
- metadata: dict[str, object] = {"source": blob.path, "total_pages": number_of_pages}
642
- if self.coordinates and all_coordinates:
643
- metadata["element_coordinates"] = all_coordinates
644
- # self.image_data is populated, no need to add base64 to metadata
645
-
646
- yield Document(
647
- page_content=combined_result.strip(),
648
- metadata=metadata,
649
- )
650
-
651
- # Case 2: Split by Element
652
- elif self.split == "element":
653
- if is_pdf and full_docs and PdfReader:
654
- start_page = 0
655
- batch_num_pages = DEFAULT_NUM_PAGES if is_batch else 1 # Use smaller batches for element split?
656
- while start_page < number_of_pages:
657
- elements = self._split_and_request(full_docs, start_page, batch_num_pages)
658
- for element in sorted(elements, key=lambda x: (x.page, x.id)):
659
- # _element_document handles metadata and adjusts page number
660
- doc = self._element_document(element, start_page)
661
- _get_metadata_from_document(doc)["source"] = blob.path # Add source
662
- yield doc
663
- start_page += batch_num_pages
664
- else: # Non-PDF
665
- with open(blob.path, "rb") as f:
666
- filename = os.path.basename(blob.path)
667
- elements = self._get_response({"document": (filename, f)})
668
- for element in sorted(elements, key=lambda x: x.id):
669
- doc = self._element_document(element, 0) # Start page is 0 for single doc
670
- _get_metadata_from_document(doc)["source"] = blob.path # Add source
671
- yield doc
672
-
673
- # Case 3: Split by Page
674
- elif self.split == "page":
675
- if is_pdf and full_docs and PdfReader:
676
- start_page = 0
677
- batch_num_pages = DEFAULT_NUM_PAGES if is_batch else 1 # Process page-by-page if not is_batch
678
- while start_page < number_of_pages:
679
- elements = self._split_and_request(full_docs, start_page, batch_num_pages)
680
- # _page_document groups elements by page and creates Documents
681
- page_docs = self._page_document(elements, start_page)
682
- for doc in page_docs:
683
- _get_metadata_from_document(doc)["source"] = blob.path # Add source
684
- yield doc
685
- start_page += batch_num_pages
686
- else: # Non-PDF (treat as single page)
687
- with open(blob.path, "rb") as f:
688
- filename = os.path.basename(blob.path)
689
- elements = self._get_response({"document": (filename, f)})
690
- page_docs = self._page_document(elements, 0) # Process elements as page 0
691
- for doc in page_docs:
692
- _get_metadata_from_document(doc)["source"] = blob.path # Add source
693
- yield doc
694
-
695
- else:
696
- raise ValueError(f"Invalid split type: {self.split}")
697
-
698
-
699
- def _get_metadata_from_document(doc: Document) -> dict[object, object]:
700
- """
701
- Helper function to extract metadata from a Document object.
702
- This is a placeholder and should be adjusted based on actual metadata structure.
703
- """
704
- metadata: dict[object, object] = doc.metadata # pyright: ignore[reportUnknownMemberType]
705
- return metadata
1
+ # -*- coding: utf-8 -*-
2
+ """Adopted from `langchain_upstage.document_parse`"""
3
+
4
+ from __future__ import annotations
5
+
6
+ import base64
7
+ import binascii
8
+ import io
9
+ import json
10
+ import logging
11
+ import os
12
+ import uuid
13
+ from typing import TYPE_CHECKING, Dict, Iterator, Literal, Optional, TypedDict, cast
14
+
15
+ import requests
16
+ from langchain_core.document_loaders import BaseBlobParser, Blob
17
+ from langchain_core.documents import Document
18
+ from pydantic import BaseModel, Field
19
+
20
+ from ..common_types.io import BytesReadable
21
+ from ..language_model import DEFAULT_IMAGE_DESCRIPTION_INSTRUCTION, Chatterer
22
+ from ..utils.base64_image import Base64Image
23
+ from ..utils.imghdr import what
24
+
25
+ if TYPE_CHECKING:
26
+ from pypdf import PdfReader
27
+
28
+ logger = logging.getLogger("pypdf")
29
+ logger.setLevel(logging.ERROR)
30
+ parser_logger = logging.getLogger(__name__) # Added logger for this module
31
+
32
+ DOCUMENT_PARSE_BASE_URL = "https://api.upstage.ai/v1/document-ai/document-parse"
33
+ DEFAULT_NUM_PAGES = 10
34
+ DOCUMENT_PARSE_DEFAULT_MODEL = "document-parse"
35
+ DEFAULT_IMAGE_DIR = "images" # Added default image directory
36
+
37
+ OutputFormat = Literal["text", "html", "markdown"]
38
+ OCR = Literal["auto", "force"]
39
+ SplitType = Literal["none", "page", "element"]
40
+ Category = Literal[
41
+ "paragraph",
42
+ "table",
43
+ "figure",
44
+ "header",
45
+ "footer",
46
+ "caption",
47
+ "equation",
48
+ "heading1",
49
+ "list",
50
+ "index",
51
+ "footnote",
52
+ "chart",
53
+ ]
54
+
55
+
56
+ class Content(BaseModel):
57
+ text: Optional[str] = None
58
+ html: Optional[str] = None
59
+ markdown: Optional[str] = None
60
+
61
+
62
+ class Coordinate(BaseModel):
63
+ x: float
64
+ y: float
65
+
66
+
67
+ class Element(BaseModel):
68
+ category: Category
69
+ content: Content
70
+ coordinates: list[Coordinate] = Field(default_factory=list)
71
+ base64_encoding: str = ""
72
+ id: int
73
+ page: int
74
+
75
+ def parse_text(self, parser: "UpstageDocumentParseParser") -> str:
76
+ """
77
+ Generates the text representation of the element.
78
+
79
+ If the element is a figure with base64 encoding and no chatterer is provided,
80
+ it generates a markdown link to a uniquely named image file and stores the
81
+ image data in the parser's image_data dictionary. Otherwise, it uses the
82
+ chatterer for description or returns the standard text/html/markdown.
83
+ """
84
+ output_format: OutputFormat = parser.output_format
85
+ chatterer: Optional[Chatterer] = parser.chatterer
86
+ image_description_instruction: str = parser.image_description_instruction
87
+ output: Optional[str] = None
88
+
89
+ if output_format == "text":
90
+ output = self.content.text
91
+ elif output_format == "html":
92
+ output = self.content.html
93
+ elif output_format == "markdown":
94
+ output = self.content.markdown
95
+
96
+ if output is None:
97
+ # Fallback or raise error if needed, here using text as fallback
98
+ output = self.content.text or ""
99
+ # Or raise ValueError(f"Invalid output format or missing content: {output_format}")
100
+
101
+ # --- Logic modification starts here ---
102
+ if self.category == "figure" and self.base64_encoding:
103
+ # Case 1: Chatterer is available - Generate description
104
+ if chatterer is not None:
105
+ # Check if base64 encoding is valid
106
+ try:
107
+ # Decode base64 to check if valid
108
+ img_type = what(self.base64_encoding)
109
+ if not img_type:
110
+ parser_logger.warning(
111
+ f"Could not determine image type for figure element {self.id} (page {self.page})."
112
+ )
113
+ return output
114
+ image = Base64Image.from_string(f"data:image/{img_type};base64,{self.base64_encoding}")
115
+
116
+ except (binascii.Error, ValueError) as e:
117
+ parser_logger.warning(
118
+ f"Could not decode base64 for figure element {self.id} (page {self.page}): {e}. Falling back to original output."
119
+ )
120
+ return output
121
+
122
+ if image is None:
123
+ parser_logger.warning(
124
+ f"Invalid base64 encoding format for image element {self.id}, cannot create Base64Image object."
125
+ )
126
+ # Fallback to original output (placeholder/OCR)
127
+ return output
128
+
129
+ ocr_content = ""
130
+ if output_format == "markdown":
131
+ ocr_content = output.removeprefix("![image](/image/placeholder)\n")
132
+ elif output_format == "text":
133
+ ocr_content = output
134
+
135
+ image_description = chatterer.describe_image(
136
+ image.data_uri,
137
+ image_description_instruction
138
+ + f"\nHint: The OCR detected the following text:\n```\n{ocr_content}\n```",
139
+ )
140
+ # Return description within details tag (as original)
141
+ output = f"\n\n<details>\n<summary>Image Description</summary>\n{image_description}\n</details>\n\n"
142
+
143
+ # Case 2: Chatterer is NOT available - Generate file path and store data
144
+ elif parser.image_dir is not None:
145
+ try:
146
+ img_type = what(self.base64_encoding)
147
+ if not img_type:
148
+ parser_logger.warning(
149
+ f"Could not determine image type for figure element {self.id} (page {self.page})."
150
+ )
151
+ return output
152
+
153
+ image_bytes = base64.b64decode(self.base64_encoding)
154
+
155
+ # Generate unique filename and path
156
+ filename = f"{uuid.uuid4().hex}.{img_type}" # Use default format
157
+ # Create relative path for markdown link, ensuring forward slashes
158
+ relative_path = os.path.join(parser.image_dir, filename).replace("\\", "/")
159
+
160
+ # Store the image data for the user to save later
161
+ parser.image_data[relative_path] = image_bytes
162
+
163
+ # Extract OCR content if present
164
+ ocr_content = ""
165
+ if output_format == "markdown" and output.startswith("![image]"):
166
+ ocr_content = output.split("\n", 1)[1] if "\n" in output else ""
167
+ elif output_format == "text":
168
+ ocr_content = output # Assume text output is OCR for images
169
+
170
+ # Update output to be the markdown link + OCR
171
+ output = f"![image]({relative_path})\n{ocr_content}".strip()
172
+
173
+ except (binascii.Error, ValueError) as e:
174
+ # Handle potential base64 decoding errors gracefully
175
+ parser_logger.warning(
176
+ f"Could not decode base64 for figure element {self.id} (page {self.page}): {e}. Falling back to original output."
177
+ )
178
+ # Keep the original 'output' value (placeholder or OCR)
179
+ pass
180
+
181
+ return output
182
+
183
+
184
+ class Coordinates(TypedDict):
185
+ id: int
186
+ category: Category
187
+ coordinates: list[Coordinate]
188
+
189
+
190
+ class PageCoordinates(Coordinates):
191
+ page: int
192
+
193
+
194
+ def get_from_param_or_env(
195
+ key: str,
196
+ param: Optional[str] = None,
197
+ env_key: Optional[str] = None,
198
+ default: Optional[str] = None,
199
+ ) -> str:
200
+ """Get a value from a param or an environment variable."""
201
+ if param is not None:
202
+ return param
203
+ elif env_key and env_key in os.environ and os.environ[env_key]:
204
+ return os.environ[env_key]
205
+ elif default is not None:
206
+ return default
207
+ else:
208
+ raise ValueError(
209
+ f"Did not find {key}, please add an environment variable"
210
+ f" `{env_key}` which contains it, or pass"
211
+ f" `{key}` as a named parameter."
212
+ )
213
+
214
+
215
+ class UpstageDocumentParseParser(BaseBlobParser):
216
+ """Upstage Document Parse Parser.
217
+
218
+ Parses documents using the Upstage Document AI API. Can optionally extract
219
+ images and return their data alongside the parsed documents.
220
+
221
+ If a `chatterer` is provided, it will be used to generate descriptions for
222
+ images (figures with base64 encoding).
223
+
224
+ If `chatterer` is NOT provided, for figure elements with `base64_encoding`,
225
+ this parser will:
226
+ 1. Generate a unique relative file path (e.g., "images/uuid.jpeg").
227
+ The base directory can be configured with `image_dir`.
228
+ 2. Replace the element's content with a markdown image link pointing to this path.
229
+ 3. Store the actual image bytes in the `image_data` attribute dictionary,
230
+ mapping the generated relative path to the bytes.
231
+
232
+ The user is responsible for saving the files from the `image_data` dictionary
233
+ after processing the documents yielded by `lazy_parse`.
234
+
235
+ To use, you should have the environment variable `UPSTAGE_API_KEY`
236
+ set with your API key or pass it as a named parameter to the constructor.
237
+
238
+ Example:
239
+ .. code-block:: python
240
+
241
+ from langchain_upstage import UpstageDocumentParseParser
242
+ from langchain_core.documents import Blob
243
+ import os
244
+
245
+ # --- Setup ---
246
+ # Ensure UPSTAGE_API_KEY is set in environment or passed as api_key
247
+ # Create a dummy PDF or image file 'my_document.pdf' / 'my_image.png'
248
+
249
+ # --- Parsing without chatterer (extracts images) ---
250
+ parser = UpstageDocumentParseParser(
251
+ split="page",
252
+ output_format="markdown",
253
+ base64_encoding=["figure"], # Important: Request base64 for figures
254
+ image_dir="extracted_images" # Optional: specify image dir
255
+ )
256
+ blob = Blob.from_path("my_document.pdf") # Or your image file path
257
+ documents = []
258
+ for doc in parser.lazy_parse(blob):
259
+ print("--- Document ---")
260
+ print(f"Page: {get_metadata_from_document(doc).get('page')}")
261
+ print(doc.page_content)
262
+ documents.append(doc)
263
+
264
+ print("\\n--- Extracted Image Data ---")
265
+ if parser.image_data:
266
+ # User saves the images
267
+ for img_path, img_bytes in parser.image_data.items():
268
+ # Create directories if they don't exist
269
+ os.makedirs(os.path.dirname(img_path), exist_ok=True)
270
+ try:
271
+ with open(img_path, "wb") as f:
272
+ f.write(img_bytes)
273
+ print(f"Saved image: {img_path}")
274
+ except IOError as e:
275
+ print(f"Error saving image {img_path}: {e}")
276
+ else:
277
+ print("No images extracted.")
278
+
279
+ # --- Parsing with chatterer (generates descriptions) ---
280
+ # from langchain_upstage import UpstageChatter # Assuming this exists
281
+ # chatterer = UpstageChatter() # Initialize your chatterer
282
+ # parser_with_desc = UpstageDocumentParseParser(
283
+ # split="page",
284
+ # output_format="markdown",
285
+ # base64_encoding=["figure"], # Still need base64 for description
286
+ # chatterer=chatterer
287
+ # )
288
+ # documents_with_desc = list(parser_with_desc.lazy_parse(blob))
289
+ # print("\\n--- Documents with Descriptions ---")
290
+ # for doc in documents_with_desc:
291
+ # print(f"Page: {get_metadata_from_document(doc).get('page')}")
292
+ # print(doc.page_content)
293
+
294
+ """
295
+
296
+ def __init__(
297
+ self,
298
+ api_key: Optional[str] = None,
299
+ base_url: str = DOCUMENT_PARSE_BASE_URL,
300
+ model: str = DOCUMENT_PARSE_DEFAULT_MODEL,
301
+ split: SplitType = "none",
302
+ ocr: OCR = "auto",
303
+ output_format: OutputFormat = "markdown",
304
+ coordinates: bool = True,
305
+ base64_encoding: list[Category] = [],
306
+ chatterer: Optional[Chatterer] = None,
307
+ image_description_instruction: str = DEFAULT_IMAGE_DESCRIPTION_INSTRUCTION,
308
+ image_dir: Optional[str] = None, # Added: Directory for image paths
309
+ ) -> None:
310
+ """
311
+ Initializes an instance of the UpstageDocumentParseParser.
312
+
313
+ Args:
314
+ api_key (str, optional): Upstage API key. Defaults to env `UPSTAGE_API_KEY`.
315
+ base_url (str, optional): Base URL for the Upstage API.
316
+ model (str): Model for document parse. Defaults to "document-parse".
317
+ split (SplitType, optional): Splitting type ("none", "page", "element").
318
+ Defaults to "none".
319
+ ocr (OCR, optional): OCR mode ("auto", "force"). Defaults to "auto".
320
+ output_format (OutputFormat, optional): Output format ("text", "html", "markdown").
321
+ Defaults to "markdown".
322
+ coordinates (bool, optional): Include coordinates in metadata. Defaults to True.
323
+ base64_encoding (List[Category], optional): Categories to return as base64.
324
+ Crucial for image extraction/description.
325
+ Set to `["figure"]` to process images.
326
+ Defaults to [].
327
+ chatterer (Chatterer, optional): Chatterer instance for image description.
328
+ If None, images will be extracted to files.
329
+ Defaults to None.
330
+ image_description_instruction (str, optional): Instruction for image description.
331
+ Defaults to a standard instruction.
332
+ image_dir (str, optional): The directory name to use when constructing
333
+ relative paths for extracted images.
334
+ Defaults to "images". This directory
335
+ is NOT created by the parser.
336
+ """
337
+ self.api_key = get_from_param_or_env(
338
+ "UPSTAGE_API_KEY",
339
+ api_key,
340
+ "UPSTAGE_API_KEY",
341
+ os.environ.get("UPSTAGE_API_KEY"),
342
+ )
343
+ self.base_url = base_url
344
+ self.model = model
345
+ self.split: SplitType = split
346
+ self.ocr: OCR = ocr
347
+ self.output_format: OutputFormat = output_format
348
+ self.coordinates = coordinates
349
+ # Ensure 'figure' is requested if chatterer is None and user wants extraction implicitly
350
+ # However, it's better to require the user to explicitly set base64_encoding=["figure"]
351
+ self.base64_encoding: list[Category] = base64_encoding
352
+ self.chatterer = chatterer
353
+ self.image_description_instruction = image_description_instruction
354
+ self.image_dir = image_dir # Store output directory name
355
+
356
+ # Initialize dictionary to store image data (path -> bytes)
357
+ self.image_data: Dict[str, bytes] = {}
358
+
359
+ def _get_response(self, files: dict[str, tuple[str, BytesReadable]]) -> list[Element]:
360
+ """
361
+ Sends a POST request to the API endpoint with the provided files and
362
+ returns the parsed elements.
363
+ """
364
+ response: Optional[requests.Response] = None
365
+ try:
366
+ headers = {
367
+ "Authorization": f"Bearer {self.api_key}",
368
+ }
369
+ # Convert list to string representation required by the API
370
+ base64_encoding_str = str(self.base64_encoding) if self.base64_encoding else "[]"
371
+ output_formats_str = f"['{self.output_format}']"
372
+
373
+ response = requests.post(
374
+ self.base_url,
375
+ headers=headers,
376
+ files=files,
377
+ data={
378
+ "ocr": self.ocr,
379
+ "model": self.model,
380
+ "output_formats": output_formats_str,
381
+ "coordinates": str(self.coordinates).lower(), # API might expect 'true'/'false'
382
+ "base64_encoding": base64_encoding_str,
383
+ },
384
+ )
385
+ response.raise_for_status() # Raises HTTPError for bad responses (4xx or 5xx)
386
+
387
+ # Check content type before parsing JSON
388
+ content_type = response.headers.get("Content-Type", "")
389
+ if "application/json" not in content_type:
390
+ raise ValueError(f"Unexpected content type: {content_type}. Response body: {response.text}")
391
+
392
+ response_data = response.json()
393
+ result: object = response_data.get("elements", [])
394
+
395
+ if not isinstance(result, list):
396
+ raise ValueError(f"API response 'elements' is not a list: {result}")
397
+ result = cast(list[object], result) # Cast to list of objects
398
+
399
+ # Validate each element using Pydantic
400
+ validated_elements: list[Element] = []
401
+ for i, element_data in enumerate(result):
402
+ try:
403
+ validated_elements.append(Element.model_validate(element_data))
404
+ except Exception as e: # Catch Pydantic validation errors etc.
405
+ parser_logger.error(f"Failed to validate element {i}: {element_data}. Error: {e}")
406
+ # Decide whether to skip the element or raise the error
407
+ # continue # Option: skip problematic element
408
+ raise ValueError(f"Failed to validate element {i}: {e}") from e # Option: fail fast
409
+
410
+ return validated_elements
411
+
412
+ except requests.HTTPError as e:
413
+ # Log more details from the response if available
414
+ error_message = f"HTTP error: {e.response.status_code} {e.response.reason}"
415
+ try:
416
+ error_details = e.response.json() # Try to get JSON error details
417
+ error_message += f" - {error_details}"
418
+ except json.JSONDecodeError:
419
+ error_message += f" - Response body: {e.response.text}"
420
+ raise ValueError(error_message) from e
421
+ except requests.RequestException as e:
422
+ raise ValueError(f"Failed to send request: {e}") from e
423
+ except json.JSONDecodeError as e:
424
+ # Include part of the response text that failed to parse
425
+ raise ValueError(
426
+ f"Failed to decode JSON response: {e}. Response text starts with: {response.text[:200] if response else 'No response'}"
427
+ ) from e
428
+ except Exception as e: # Catch-all for other unexpected errors
429
+ raise ValueError(f"An unexpected error occurred during API call: {e}") from e
430
+
431
+ def _split_and_request(
432
+ self, full_docs: PdfReader, start_page: int, num_pages: int = DEFAULT_NUM_PAGES
433
+ ) -> list[Element]:
434
+ """
435
+ Splits the full pdf document into partial pages and sends a request.
436
+ """
437
+ # Need to import here if not globally available
438
+ try:
439
+ from pypdf import PdfWriter
440
+ except ImportError:
441
+ raise ImportError("pypdf is required for PDF splitting. Please install it with `pip install pypdf`.")
442
+
443
+ merger = PdfWriter()
444
+ total_pages = len(full_docs.pages) # Use len(reader.pages) instead of get_num_pages()
445
+ end_page = min(start_page + num_pages, total_pages)
446
+
447
+ # Check if start_page is valid
448
+ if start_page >= total_pages:
449
+ parser_logger.warning(f"Start page {start_page} is out of bounds for document with {total_pages} pages.")
450
+ return []
451
+
452
+ # pypdf page indices are 0-based, slicing is exclusive of the end index
453
+ # PdfWriter.append() expects pages=(start, stop) where stop is exclusive.
454
+ # However, the example used pages=(start, end) which might behave differently depending on version?
455
+ # Let's stick to add_page for clarity if possible, or ensure append range is correct.
456
+ # merger.append(full_docs, pages=(start_page, end_page)) # This selects pages start_page..end_page-1
457
+
458
+ # Alternative using add_page loop (more explicit)
459
+ for i in range(start_page, end_page):
460
+ merger.add_page(full_docs.pages[i])
461
+
462
+ with io.BytesIO() as buffer:
463
+ merger.write(buffer)
464
+ buffer.seek(0)
465
+ # Need to provide a filename for the 'files' dict
466
+ return self._get_response({"document": ("partial_doc.pdf", buffer)}) # Provide a dummy filename
467
+
468
+ def _element_document(self, element: Element, start_page: int = 0) -> Document:
469
+ """Converts an element into a Document object."""
470
+ # parse_text now handles image path generation and data storage if needed
471
+ page_content = element.parse_text(self)
472
+ metadata: dict[str, object] = element.model_dump(
473
+ exclude={"content", "base64_encoding"}, exclude_none=True
474
+ ) # Exclude raw content/base64
475
+ metadata["page"] = element.page + start_page # Adjust page number
476
+ # Base64 encoding is not added to metadata if it was processed into image_data
477
+ # Coordinates are kept if requested
478
+ if not self.coordinates:
479
+ metadata.pop("coordinates", None)
480
+
481
+ return Document(
482
+ page_content=page_content,
483
+ metadata=metadata,
484
+ )
485
+
486
+ def _page_document(self, elements: list[Element], start_page: int = 0) -> list[Document]:
487
+ """Combines elements with the same page number into a single Document object."""
488
+ documents: list[Document] = []
489
+ if not elements:
490
+ return documents
491
+
492
+ # Group elements by page (relative to the current batch)
493
+ pages: list[int] = sorted(list(set(map(lambda x: x.page, elements))))
494
+ page_groups: Dict[int, list[Element]] = {page: [] for page in pages}
495
+ for element in elements:
496
+ page_groups[element.page].append(element)
497
+
498
+ for page_num, group in page_groups.items():
499
+ actual_page_num = page_num + start_page
500
+ page_content_parts: list[str] = []
501
+ page_coordinates: list[Coordinates] = []
502
+ # Base64 encodings are handled within parse_text now, not collected here
503
+
504
+ for element in sorted(group, key=lambda x: x.id): # Process elements in order
505
+ page_content_parts.append(element.parse_text(self))
506
+ if self.coordinates and element.coordinates:
507
+ page_coordinates.append({ # Store coordinates with element id/category for context
508
+ "id": element.id,
509
+ "category": element.category,
510
+ "coordinates": element.coordinates,
511
+ })
512
+
513
+ metadata: dict[str, object] = {
514
+ "page": actual_page_num,
515
+ }
516
+ if self.coordinates and page_coordinates:
517
+ metadata["element_coordinates"] = page_coordinates # Changed key for clarity
518
+
519
+ # Combine content, typically with spaces or newlines
520
+ # Using newline might be better for readability if elements are paragraphs etc.
521
+ combined_page_content = "\n\n".join(part for part in page_content_parts if part) # Join non-empty parts
522
+
523
+ documents.append(
524
+ Document(
525
+ page_content=combined_page_content,
526
+ metadata=metadata,
527
+ )
528
+ )
529
+
530
+ return documents
531
+
532
+ def lazy_parse(self, blob: Blob, is_batch: bool = False) -> Iterator[Document]:
533
+ """
534
+ Lazily parses a document blob.
535
+
536
+ Yields Document objects based on the specified split type.
537
+ If images are extracted (chatterer=None, base64_encoding=["figure"]),
538
+ the image data will be available in `self.image_data` after iteration.
539
+
540
+ Args:
541
+ blob (Blob): The input document blob to parse. Requires `blob.path`.
542
+ is_batch (bool, optional): Currently affects PDF page batch size.
543
+ Defaults to False (process 1 page batch for PDF).
544
+ *Note: API might have limits regardless.*
545
+
546
+ Yields:
547
+ Document: The parsed document object(s).
548
+
549
+ Raises:
550
+ ValueError: If blob.path is not set, API error occurs, or invalid config.
551
+ ImportError: If pypdf is needed but not installed.
552
+ """
553
+ # Clear image data at the start of parsing for this specific call
554
+ self.image_data = {}
555
+
556
+ if not blob.path:
557
+ # Non-PDF files and direct API calls require reading the file,
558
+ # PDF splitting also requires the path.
559
+ raise ValueError("Blob path is required for UpstageDocumentParseParser.")
560
+
561
+ # Try importing pypdf here, only if needed
562
+ PdfReader = None
563
+ PdfReadError = None
564
+ try:
565
+ from pypdf import PdfReader as PyPdfReader
566
+ from pypdf.errors import PdfReadError as PyPdfReadError
567
+
568
+ PdfReader = PyPdfReader
569
+ PdfReadError = PyPdfReadError
570
+ except ImportError:
571
+ # We only absolutely need pypdf if the file is a PDF and split is not 'none' maybe?
572
+ # Let's attempt to read anyway, API might support non-PDFs directly.
573
+ # We'll check for PdfReader later if we determine it's a PDF.
574
+ pass
575
+
576
+ full_docs: Optional[PdfReader] = None
577
+ is_pdf = False
578
+ number_of_pages = 1 # Default for non-PDF or single-page docs
579
+
580
+ try:
581
+ # Check if it's a PDF by trying to open it
582
+ if PdfReader and PdfReadError:
583
+ try:
584
+ # Use strict=False to be more lenient with potentially corrupted PDFs
585
+ full_docs = PdfReader(str(blob.path), strict=False)
586
+ number_of_pages = len(full_docs.pages)
587
+ is_pdf = True
588
+ except (PdfReadError, FileNotFoundError, IsADirectoryError) as e:
589
+ parser_logger.warning(f"Could not read '{blob.path}' as PDF: {e}. Assuming non-PDF format.")
590
+ except Exception as e: # Catch other potential pypdf errors
591
+ parser_logger.error(f"Unexpected error reading PDF '{blob.path}': {e}")
592
+ raise ValueError(f"Failed to process PDF file: {e}") from e
593
+ else:
594
+ parser_logger.info("pypdf not installed. Treating input as a single non-PDF document for the API.")
595
+
596
+ except Exception as e:
597
+ raise ValueError(f"Failed to access or identify file type for: {blob.path}. Error: {e}") from e
598
+
599
+ # --- Parsing Logic based on Split Type ---
600
+
601
+ # Case 1: No Splitting (Combine all content)
602
+ if self.split == "none":
603
+ combined_result = ""
604
+ all_coordinates: list[PageCoordinates] = []
605
+ # Base64 handled by parse_text, data stored in self.image_data
606
+
607
+ if is_pdf and full_docs and PdfReader: # Process PDF page by page or in batches
608
+ start_page = 0
609
+ # Use a reasonable batch size for 'none' split to avoid huge requests
610
+ batch_num_pages = DEFAULT_NUM_PAGES
611
+ while start_page < number_of_pages:
612
+ elements = self._split_and_request(full_docs, start_page, batch_num_pages)
613
+ for element in sorted(elements, key=lambda x: (x.page, x.id)):
614
+ combined_result += element.parse_text(self) + "\n\n" # Add separator
615
+ if self.coordinates and element.coordinates:
616
+ # Adjust page number for coordinates metadata
617
+ coords_with_page: PageCoordinates = {
618
+ "id": element.id,
619
+ "category": element.category,
620
+ "page": element.page + start_page, # Actual page
621
+ "coordinates": element.coordinates,
622
+ }
623
+ all_coordinates.append(coords_with_page)
624
+ start_page += batch_num_pages
625
+ else: # Process non-PDF file as a single unit
626
+ with open(blob.path, "rb") as f:
627
+ # Provide a filename for the 'files' dict
628
+ filename = os.path.basename(blob.path)
629
+ elements = self._get_response({"document": (filename, f)})
630
+
631
+ for element in sorted(elements, key=lambda x: x.id):
632
+ combined_result += element.parse_text(self) + "\n\n"
633
+ if self.coordinates and element.coordinates:
634
+ all_coordinates.append({
635
+ "id": element.id,
636
+ "category": element.category,
637
+ "page": element.page, # Page is relative to the single doc (usually 0 or 1)
638
+ "coordinates": element.coordinates,
639
+ })
640
+
641
+ metadata: dict[str, object] = {"source": blob.path, "total_pages": number_of_pages}
642
+ if self.coordinates and all_coordinates:
643
+ metadata["element_coordinates"] = all_coordinates
644
+ # self.image_data is populated, no need to add base64 to metadata
645
+
646
+ yield Document(
647
+ page_content=combined_result.strip(),
648
+ metadata=metadata,
649
+ )
650
+
651
+ # Case 2: Split by Element
652
+ elif self.split == "element":
653
+ if is_pdf and full_docs and PdfReader:
654
+ start_page = 0
655
+ batch_num_pages = DEFAULT_NUM_PAGES if is_batch else 1 # Use smaller batches for element split?
656
+ while start_page < number_of_pages:
657
+ elements = self._split_and_request(full_docs, start_page, batch_num_pages)
658
+ for element in sorted(elements, key=lambda x: (x.page, x.id)):
659
+ # _element_document handles metadata and adjusts page number
660
+ doc = self._element_document(element, start_page)
661
+ _get_metadata_from_document(doc)["source"] = blob.path # Add source
662
+ yield doc
663
+ start_page += batch_num_pages
664
+ else: # Non-PDF
665
+ with open(blob.path, "rb") as f:
666
+ filename = os.path.basename(blob.path)
667
+ elements = self._get_response({"document": (filename, f)})
668
+ for element in sorted(elements, key=lambda x: x.id):
669
+ doc = self._element_document(element, 0) # Start page is 0 for single doc
670
+ _get_metadata_from_document(doc)["source"] = blob.path # Add source
671
+ yield doc
672
+
673
+ # Case 3: Split by Page
674
+ elif self.split == "page":
675
+ if is_pdf and full_docs and PdfReader:
676
+ start_page = 0
677
+ batch_num_pages = DEFAULT_NUM_PAGES if is_batch else 1 # Process page-by-page if not is_batch
678
+ while start_page < number_of_pages:
679
+ elements = self._split_and_request(full_docs, start_page, batch_num_pages)
680
+ # _page_document groups elements by page and creates Documents
681
+ page_docs = self._page_document(elements, start_page)
682
+ for doc in page_docs:
683
+ _get_metadata_from_document(doc)["source"] = blob.path # Add source
684
+ yield doc
685
+ start_page += batch_num_pages
686
+ else: # Non-PDF (treat as single page)
687
+ with open(blob.path, "rb") as f:
688
+ filename = os.path.basename(blob.path)
689
+ elements = self._get_response({"document": (filename, f)})
690
+ page_docs = self._page_document(elements, 0) # Process elements as page 0
691
+ for doc in page_docs:
692
+ _get_metadata_from_document(doc)["source"] = blob.path # Add source
693
+ yield doc
694
+
695
+ else:
696
+ raise ValueError(f"Invalid split type: {self.split}")
697
+
698
+
699
+ def _get_metadata_from_document(doc: Document) -> dict[object, object]:
700
+ """
701
+ Helper function to extract metadata from a Document object.
702
+ This is a placeholder and should be adjusted based on actual metadata structure.
703
+ """
704
+ metadata: dict[object, object] = doc.metadata # pyright: ignore[reportUnknownMemberType]
705
+ return metadata