chatterer-0.1.18-py3-none-any.whl → chatterer-0.1.19-py3-none-any.whl
This diff compares the contents of two publicly released versions of the package, exactly as they appear in their respective public registries. It is provided for informational purposes only.
- chatterer/__init__.py +93 -93
- chatterer/common_types/__init__.py +21 -21
- chatterer/common_types/io.py +19 -19
- chatterer/examples/__init__.py +0 -0
- chatterer/examples/anything_to_markdown.py +95 -91
- chatterer/examples/get_code_snippets.py +64 -62
- chatterer/examples/login_with_playwright.py +171 -167
- chatterer/examples/make_ppt.py +499 -497
- chatterer/examples/pdf_to_markdown.py +107 -107
- chatterer/examples/pdf_to_text.py +60 -56
- chatterer/examples/transcription_api.py +127 -123
- chatterer/examples/upstage_parser.py +95 -100
- chatterer/examples/webpage_to_markdown.py +79 -79
- chatterer/interactive.py +354 -354
- chatterer/language_model.py +533 -533
- chatterer/messages.py +21 -21
- chatterer/strategies/__init__.py +13 -13
- chatterer/strategies/atom_of_thoughts.py +975 -975
- chatterer/strategies/base.py +14 -14
- chatterer/tools/__init__.py +46 -46
- chatterer/tools/caption_markdown_images.py +384 -384
- chatterer/tools/citation_chunking/__init__.py +3 -3
- chatterer/tools/citation_chunking/chunks.py +53 -53
- chatterer/tools/citation_chunking/citation_chunker.py +118 -118
- chatterer/tools/citation_chunking/citations.py +285 -285
- chatterer/tools/citation_chunking/prompt.py +157 -157
- chatterer/tools/citation_chunking/reference.py +26 -26
- chatterer/tools/citation_chunking/utils.py +138 -138
- chatterer/tools/convert_pdf_to_markdown.py +302 -302
- chatterer/tools/convert_to_text.py +447 -447
- chatterer/tools/upstage_document_parser.py +705 -705
- chatterer/tools/webpage_to_markdown.py +739 -739
- chatterer/tools/youtube.py +146 -146
- chatterer/utils/__init__.py +15 -15
- chatterer/utils/base64_image.py +285 -285
- chatterer/utils/bytesio.py +59 -59
- chatterer/utils/code_agent.py +237 -237
- chatterer/utils/imghdr.py +148 -148
- {chatterer-0.1.18.dist-info → chatterer-0.1.19.dist-info}/METADATA +392 -392
- chatterer-0.1.19.dist-info/RECORD +44 -0
- {chatterer-0.1.18.dist-info → chatterer-0.1.19.dist-info}/WHEEL +1 -1
- chatterer-0.1.19.dist-info/entry_points.txt +10 -0
- chatterer-0.1.18.dist-info/RECORD +0 -42
- {chatterer-0.1.18.dist-info → chatterer-0.1.19.dist-info}/top_level.txt +0 -0
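The full-file hunk below corresponds to chatterer/tools/upstage_document_parser.py, the only entry above whose +705 -705 line counts match the hunk header. To reproduce this kind of comparison locally, a minimal sketch follows; it assumes `pip` is on PATH, that both wheels are published under the standard `chatterer-<version>-py3-none-any.whl` filenames, and the `wheels/` download directory is illustrative.

import difflib
import subprocess
import zipfile

# Fetch both released wheels from the package index (no dependencies needed).
for version in ("0.1.18", "0.1.19"):
    subprocess.run(
        ["pip", "download", f"chatterer=={version}", "--no-deps", "--dest", "wheels"],
        check=True,
    )

def wheel_member_lines(wheel: str, member: str) -> list[str]:
    """Read one file out of a wheel (a zip archive) as a list of text lines."""
    with zipfile.ZipFile(wheel) as zf:
        return zf.read(member).decode("utf-8").splitlines(keepends=True)

member = "chatterer/tools/upstage_document_parser.py"
old = wheel_member_lines("wheels/chatterer-0.1.18-py3-none-any.whl", member)
new = wheel_member_lines("wheels/chatterer-0.1.19-py3-none-any.whl", member)

# Emit a unified diff comparable to the hunk rendered below.
print("".join(difflib.unified_diff(old, new, fromfile="0.1.18", tofile="0.1.19")))

Note that `difflib.unified_diff` expects lines that keep their trailing newlines, hence `splitlines(keepends=True)`.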
--- a/chatterer/tools/upstage_document_parser.py
+++ b/chatterer/tools/upstage_document_parser.py
@@ -1,705 +1,705 @@
-# -*- coding: utf-8 -*-
-"""Adopted from `langchain_upstage.document_parse`"""
-
-from __future__ import annotations
-
-import base64
-import binascii
-import io
-import json
-import logging
-import os
-import uuid
-from typing import TYPE_CHECKING, Dict, Iterator, Literal, Optional, TypedDict, cast
-
-import requests
-from langchain_core.document_loaders import BaseBlobParser, Blob
-from langchain_core.documents import Document
-from pydantic import BaseModel, Field
-
-from ..common_types.io import BytesReadable
-from ..language_model import DEFAULT_IMAGE_DESCRIPTION_INSTRUCTION, Chatterer
-from ..utils.base64_image import Base64Image
-from ..utils.imghdr import what
-
-if TYPE_CHECKING:
-    from pypdf import PdfReader
-
-logger = logging.getLogger("pypdf")
-logger.setLevel(logging.ERROR)
-parser_logger = logging.getLogger(__name__) # Added logger for this module
-
-DOCUMENT_PARSE_BASE_URL = "https://api.upstage.ai/v1/document-ai/document-parse"
-DEFAULT_NUM_PAGES = 10
-DOCUMENT_PARSE_DEFAULT_MODEL = "document-parse"
-DEFAULT_IMAGE_DIR = "images" # Added default image directory
-
-OutputFormat = Literal["text", "html", "markdown"]
-OCR = Literal["auto", "force"]
-SplitType = Literal["none", "page", "element"]
-Category = Literal[
-    "paragraph",
-    "table",
-    "figure",
-    "header",
-    "footer",
-    "caption",
-    "equation",
-    "heading1",
-    "list",
-    "index",
-    "footnote",
-    "chart",
-]
-
-
-class Content(BaseModel):
-    text: Optional[str] = None
-    html: Optional[str] = None
-    markdown: Optional[str] = None
-
-
-class Coordinate(BaseModel):
-    x: float
-    y: float
-
-
-class Element(BaseModel):
-    category: Category
-    content: Content
-    coordinates: list[Coordinate] = Field(default_factory=list)
-    base64_encoding: str = ""
-    id: int
-    page: int
-
-    def parse_text(self, parser: "UpstageDocumentParseParser") -> str:
-        """
-        Generates the text representation of the element.
-
-        If the element is a figure with base64 encoding and no chatterer is provided,
-        it generates a markdown link to a uniquely named image file and stores the
-        image data in the parser's image_data dictionary. Otherwise, it uses the
-        chatterer for description or returns the standard text/html/markdown.
-        """
-        output_format: OutputFormat = parser.output_format
-        chatterer: Optional[Chatterer] = parser.chatterer
-        image_description_instruction: str = parser.image_description_instruction
-        output: Optional[str] = None
-
-        if output_format == "text":
-            output = self.content.text
-        elif output_format == "html":
-            output = self.content.html
-        elif output_format == "markdown":
-            output = self.content.markdown
-
-        if output is None:
-            # Fallback or raise error if needed, here using text as fallback
-            output = self.content.text or ""
-            # Or raise ValueError(f"Invalid output format or missing content: {output_format}")
-
-        # --- Logic modification starts here ---
-        if self.category == "figure" and self.base64_encoding:
-            # Case 1: Chatterer is available - Generate description
-            if chatterer is not None:
-                # Check if base64 encoding is valid
-                try:
-                    # Decode base64 to check if valid
-                    img_type = what(self.base64_encoding)
-                    if not img_type:
-                        parser_logger.warning(
-                            f"Could not determine image type for figure element {self.id} (page {self.page})."
-                        )
-                        return output
-                    image = Base64Image.from_string(f"data:image/{img_type};base64,{self.base64_encoding}")
-
-                except (binascii.Error, ValueError) as e:
-                    parser_logger.warning(
-                        f"Could not decode base64 for figure element {self.id} (page {self.page}): {e}. Falling back to original output."
-                    )
-                    return output
-
-                if image is None:
-                    parser_logger.warning(
-                        f"Invalid base64 encoding format for image element {self.id}, cannot create Base64Image object."
-                    )
-                    # Fallback to original output (placeholder/OCR)
-                    return output
-
-                ocr_content = ""
-                if output_format == "markdown":
-                    ocr_content = output.removeprefix("\n")
-                elif output_format == "text":
-                    ocr_content = output
-
-                image_description = chatterer.describe_image(
-                    image.data_uri,
-                    image_description_instruction
-                    + f"\nHint: The OCR detected the following text:\n```\n{ocr_content}\n```",
-                )
-                # Return description within details tag (as original)
-                output = f"\n\n<details>\n<summary>Image Description</summary>\n{image_description}\n</details>\n\n"
-
-            # Case 2: Chatterer is NOT available - Generate file path and store data
-            elif parser.image_dir is not None:
-                try:
-                    img_type = what(self.base64_encoding)
-                    if not img_type:
-                        parser_logger.warning(
-                            f"Could not determine image type for figure element {self.id} (page {self.page})."
-                        )
-                        return output
-
-                    image_bytes = base64.b64decode(self.base64_encoding)
-
-                    # Generate unique filename and path
-                    filename = f"{uuid.uuid4().hex}.{img_type}" # Use default format
-                    # Create relative path for markdown link, ensuring forward slashes
-                    relative_path = os.path.join(parser.image_dir, filename).replace("\\", "/")
-
-                    # Store the image data for the user to save later
-                    parser.image_data[relative_path] = image_bytes
-
-                    # Extract OCR content if present
-                    ocr_content = ""
-                    if output_format == "markdown" and output.startswith("![image]"):
-                        ocr_content = output.split("\n", 1)[1] if "\n" in output else ""
-                    elif output_format == "text":
-                        ocr_content = output # Assume text output is OCR for images
-
-                    # Update output to be the markdown link + OCR
-                    output = f"\n{ocr_content}".strip()
-
-                except (binascii.Error, ValueError) as e:
-                    # Handle potential base64 decoding errors gracefully
-                    parser_logger.warning(
-                        f"Could not decode base64 for figure element {self.id} (page {self.page}): {e}. Falling back to original output."
-                    )
-                    # Keep the original 'output' value (placeholder or OCR)
-                    pass
-
-        return output
-
-
-class Coordinates(TypedDict):
-    id: int
-    category: Category
-    coordinates: list[Coordinate]
-
-
-class PageCoordinates(Coordinates):
-    page: int
-
-
-def get_from_param_or_env(
-    key: str,
-    param: Optional[str] = None,
-    env_key: Optional[str] = None,
-    default: Optional[str] = None,
-) -> str:
-    """Get a value from a param or an environment variable."""
-    if param is not None:
-        return param
-    elif env_key and env_key in os.environ and os.environ[env_key]:
-        return os.environ[env_key]
-    elif default is not None:
-        return default
-    else:
-        raise ValueError(
-            f"Did not find {key}, please add an environment variable"
-            f" `{env_key}` which contains it, or pass"
-            f" `{key}` as a named parameter."
-        )
-
-
-class UpstageDocumentParseParser(BaseBlobParser):
-    """Upstage Document Parse Parser.
-
-    Parses documents using the Upstage Document AI API. Can optionally extract
-    images and return their data alongside the parsed documents.
-
-    If a `chatterer` is provided, it will be used to generate descriptions for
-    images (figures with base64 encoding).
-
-    If `chatterer` is NOT provided, for figure elements with `base64_encoding`,
-    this parser will:
-    1. Generate a unique relative file path (e.g., "images/uuid.jpeg").
-       The base directory can be configured with `image_dir`.
-    2. Replace the element's content with a markdown image link pointing to this path.
-    3. Store the actual image bytes in the `image_data` attribute dictionary,
-       mapping the generated relative path to the bytes.
-
-    The user is responsible for saving the files from the `image_data` dictionary
-    after processing the documents yielded by `lazy_parse`.
-
-    To use, you should have the environment variable `UPSTAGE_API_KEY`
-    set with your API key or pass it as a named parameter to the constructor.
-
-    Example:
-        .. code-block:: python
-
-            from langchain_upstage import UpstageDocumentParseParser
-            from langchain_core.documents import Blob
-            import os
-
-            # --- Setup ---
-            # Ensure UPSTAGE_API_KEY is set in environment or passed as api_key
-            # Create a dummy PDF or image file 'my_document.pdf' / 'my_image.png'
-
-            # --- Parsing without chatterer (extracts images) ---
-            parser = UpstageDocumentParseParser(
-                split="page",
-                output_format="markdown",
-                base64_encoding=["figure"], # Important: Request base64 for figures
-                image_dir="extracted_images" # Optional: specify image dir
-            )
-            blob = Blob.from_path("my_document.pdf") # Or your image file path
-            documents = []
-            for doc in parser.lazy_parse(blob):
-                print("--- Document ---")
-                print(f"Page: {get_metadata_from_document(doc).get('page')}")
-                print(doc.page_content)
-                documents.append(doc)
-
-            print("\\n--- Extracted Image Data ---")
-            if parser.image_data:
-                # User saves the images
-                for img_path, img_bytes in parser.image_data.items():
-                    # Create directories if they don't exist
-                    os.makedirs(os.path.dirname(img_path), exist_ok=True)
-                    try:
-                        with open(img_path, "wb") as f:
-                            f.write(img_bytes)
-                        print(f"Saved image: {img_path}")
-                    except IOError as e:
-                        print(f"Error saving image {img_path}: {e}")
-            else:
-                print("No images extracted.")
-
-            # --- Parsing with chatterer (generates descriptions) ---
-            # from langchain_upstage import UpstageChatter # Assuming this exists
-            # chatterer = UpstageChatter() # Initialize your chatterer
-            # parser_with_desc = UpstageDocumentParseParser(
-            #     split="page",
-            #     output_format="markdown",
-            #     base64_encoding=["figure"], # Still need base64 for description
-            #     chatterer=chatterer
-            # )
-            # documents_with_desc = list(parser_with_desc.lazy_parse(blob))
-            # print("\\n--- Documents with Descriptions ---")
-            # for doc in documents_with_desc:
-            #     print(f"Page: {get_metadata_from_document(doc).get('page')}")
-            #     print(doc.page_content)
-
-    """
-
-    def __init__(
-        self,
-        api_key: Optional[str] = None,
-        base_url: str = DOCUMENT_PARSE_BASE_URL,
-        model: str = DOCUMENT_PARSE_DEFAULT_MODEL,
-        split: SplitType = "none",
-        ocr: OCR = "auto",
-        output_format: OutputFormat = "markdown",
-        coordinates: bool = True,
-        base64_encoding: list[Category] = [],
-        chatterer: Optional[Chatterer] = None,
-        image_description_instruction: str = DEFAULT_IMAGE_DESCRIPTION_INSTRUCTION,
-        image_dir: Optional[str] = None, # Added: Directory for image paths
-    ) -> None:
-        """
-        Initializes an instance of the UpstageDocumentParseParser.
-
-        Args:
-            api_key (str, optional): Upstage API key. Defaults to env `UPSTAGE_API_KEY`.
-            base_url (str, optional): Base URL for the Upstage API.
-            model (str): Model for document parse. Defaults to "document-parse".
-            split (SplitType, optional): Splitting type ("none", "page", "element").
-                Defaults to "none".
-            ocr (OCR, optional): OCR mode ("auto", "force"). Defaults to "auto".
-            output_format (OutputFormat, optional): Output format ("text", "html", "markdown").
-                Defaults to "markdown".
-            coordinates (bool, optional): Include coordinates in metadata. Defaults to True.
-            base64_encoding (List[Category], optional): Categories to return as base64.
-                Crucial for image extraction/description.
-                Set to `["figure"]` to process images.
-                Defaults to [].
-            chatterer (Chatterer, optional): Chatterer instance for image description.
-                If None, images will be extracted to files.
-                Defaults to None.
-            image_description_instruction (str, optional): Instruction for image description.
-                Defaults to a standard instruction.
-            image_dir (str, optional): The directory name to use when constructing
-                relative paths for extracted images.
-                Defaults to "images". This directory
-                is NOT created by the parser.
-        """
-        self.api_key = get_from_param_or_env(
-            "UPSTAGE_API_KEY",
-            api_key,
-            "UPSTAGE_API_KEY",
-            os.environ.get("UPSTAGE_API_KEY"),
-        )
-        self.base_url = base_url
-        self.model = model
-        self.split: SplitType = split
-        self.ocr: OCR = ocr
-        self.output_format: OutputFormat = output_format
-        self.coordinates = coordinates
-        # Ensure 'figure' is requested if chatterer is None and user wants extraction implicitly
-        # However, it's better to require the user to explicitly set base64_encoding=["figure"]
-        self.base64_encoding: list[Category] = base64_encoding
-        self.chatterer = chatterer
-        self.image_description_instruction = image_description_instruction
-        self.image_dir = image_dir # Store output directory name
-
-        # Initialize dictionary to store image data (path -> bytes)
-        self.image_data: Dict[str, bytes] = {}
-
-    def _get_response(self, files: dict[str, tuple[str, BytesReadable]]) -> list[Element]:
-        """
-        Sends a POST request to the API endpoint with the provided files and
-        returns the parsed elements.
-        """
-        response: Optional[requests.Response] = None
-        try:
-            headers = {
-                "Authorization": f"Bearer {self.api_key}",
-            }
-            # Convert list to string representation required by the API
-            base64_encoding_str = str(self.base64_encoding) if self.base64_encoding else "[]"
-            output_formats_str = f"['{self.output_format}']"
-
-            response = requests.post(
-                self.base_url,
-                headers=headers,
-                files=files,
-                data={
-                    "ocr": self.ocr,
-                    "model": self.model,
-                    "output_formats": output_formats_str,
-                    "coordinates": str(self.coordinates).lower(), # API might expect 'true'/'false'
-                    "base64_encoding": base64_encoding_str,
-                },
-            )
-            response.raise_for_status() # Raises HTTPError for bad responses (4xx or 5xx)
-
-            # Check content type before parsing JSON
-            content_type = response.headers.get("Content-Type", "")
-            if "application/json" not in content_type:
-                raise ValueError(f"Unexpected content type: {content_type}. Response body: {response.text}")
-
-            response_data = response.json()
-            result: object = response_data.get("elements", [])
-
-            if not isinstance(result, list):
-                raise ValueError(f"API response 'elements' is not a list: {result}")
-            result = cast(list[object], result) # Cast to list of objects
-
-            # Validate each element using Pydantic
-            validated_elements: list[Element] = []
-            for i, element_data in enumerate(result):
-                try:
-                    validated_elements.append(Element.model_validate(element_data))
-                except Exception as e: # Catch Pydantic validation errors etc.
-                    parser_logger.error(f"Failed to validate element {i}: {element_data}. Error: {e}")
-                    # Decide whether to skip the element or raise the error
-                    # continue # Option: skip problematic element
-                    raise ValueError(f"Failed to validate element {i}: {e}") from e # Option: fail fast
-
-            return validated_elements
-
-        except requests.HTTPError as e:
-            # Log more details from the response if available
-            error_message = f"HTTP error: {e.response.status_code} {e.response.reason}"
-            try:
-                error_details = e.response.json() # Try to get JSON error details
-                error_message += f" - {error_details}"
-            except json.JSONDecodeError:
-                error_message += f" - Response body: {e.response.text}"
-            raise ValueError(error_message) from e
-        except requests.RequestException as e:
-            raise ValueError(f"Failed to send request: {e}") from e
-        except json.JSONDecodeError as e:
-            # Include part of the response text that failed to parse
-            raise ValueError(
-                f"Failed to decode JSON response: {e}. Response text starts with: {response.text[:200] if response else 'No response'}"
-            ) from e
-        except Exception as e: # Catch-all for other unexpected errors
-            raise ValueError(f"An unexpected error occurred during API call: {e}") from e
-
-    def _split_and_request(
-        self, full_docs: PdfReader, start_page: int, num_pages: int = DEFAULT_NUM_PAGES
-    ) -> list[Element]:
-        """
-        Splits the full pdf document into partial pages and sends a request.
-        """
-        # Need to import here if not globally available
-        try:
-            from pypdf import PdfWriter
-        except ImportError:
-            raise ImportError("pypdf is required for PDF splitting. Please install it with `pip install pypdf`.")
-
-        merger = PdfWriter()
-        total_pages = len(full_docs.pages) # Use len(reader.pages) instead of get_num_pages()
-        end_page = min(start_page + num_pages, total_pages)
-
-        # Check if start_page is valid
-        if start_page >= total_pages:
-            parser_logger.warning(f"Start page {start_page} is out of bounds for document with {total_pages} pages.")
-            return []
-
-        # pypdf page indices are 0-based, slicing is exclusive of the end index
-        # PdfWriter.append() expects pages=(start, stop) where stop is exclusive.
-        # However, the example used pages=(start, end) which might behave differently depending on version?
-        # Let's stick to add_page for clarity if possible, or ensure append range is correct.
-        # merger.append(full_docs, pages=(start_page, end_page)) # This selects pages start_page..end_page-1
-
-        # Alternative using add_page loop (more explicit)
-        for i in range(start_page, end_page):
-            merger.add_page(full_docs.pages[i])
-
-        with io.BytesIO() as buffer:
-            merger.write(buffer)
-            buffer.seek(0)
-            # Need to provide a filename for the 'files' dict
-            return self._get_response({"document": ("partial_doc.pdf", buffer)}) # Provide a dummy filename
-
-    def _element_document(self, element: Element, start_page: int = 0) -> Document:
-        """Converts an element into a Document object."""
-        # parse_text now handles image path generation and data storage if needed
-        page_content = element.parse_text(self)
-        metadata: dict[str, object] = element.model_dump(
-            exclude={"content", "base64_encoding"}, exclude_none=True
-        ) # Exclude raw content/base64
-        metadata["page"] = element.page + start_page # Adjust page number
-        # Base64 encoding is not added to metadata if it was processed into image_data
-        # Coordinates are kept if requested
-        if not self.coordinates:
-            metadata.pop("coordinates", None)
-
-        return Document(
-            page_content=page_content,
-            metadata=metadata,
-        )
-
-    def _page_document(self, elements: list[Element], start_page: int = 0) -> list[Document]:
-        """Combines elements with the same page number into a single Document object."""
-        documents: list[Document] = []
-        if not elements:
-            return documents
-
-        # Group elements by page (relative to the current batch)
-        pages: list[int] = sorted(list(set(map(lambda x: x.page, elements))))
-        page_groups: Dict[int, list[Element]] = {page: [] for page in pages}
-        for element in elements:
-            page_groups[element.page].append(element)
-
-        for page_num, group in page_groups.items():
-            actual_page_num = page_num + start_page
-            page_content_parts: list[str] = []
-            page_coordinates: list[Coordinates] = []
-            # Base64 encodings are handled within parse_text now, not collected here
-
-            for element in sorted(group, key=lambda x: x.id): # Process elements in order
-                page_content_parts.append(element.parse_text(self))
-                if self.coordinates and element.coordinates:
-                    page_coordinates.append({ # Store coordinates with element id/category for context
-                        "id": element.id,
-                        "category": element.category,
-                        "coordinates": element.coordinates,
-                    })
-
-            metadata: dict[str, object] = {
-                "page": actual_page_num,
-            }
-            if self.coordinates and page_coordinates:
-                metadata["element_coordinates"] = page_coordinates # Changed key for clarity
-
-            # Combine content, typically with spaces or newlines
-            # Using newline might be better for readability if elements are paragraphs etc.
-            combined_page_content = "\n\n".join(part for part in page_content_parts if part) # Join non-empty parts
-
-            documents.append(
-                Document(
-                    page_content=combined_page_content,
-                    metadata=metadata,
-                )
-            )
-
-        return documents
-
-    def lazy_parse(self, blob: Blob, is_batch: bool = False) -> Iterator[Document]:
-        """
-        Lazily parses a document blob.
-
-        Yields Document objects based on the specified split type.
-        If images are extracted (chatterer=None, base64_encoding=["figure"]),
-        the image data will be available in `self.image_data` after iteration.
-
-        Args:
-            blob (Blob): The input document blob to parse. Requires `blob.path`.
-            is_batch (bool, optional): Currently affects PDF page batch size.
-                Defaults to False (process 1 page batch for PDF).
-                *Note: API might have limits regardless.*
-
-        Yields:
-            Document: The parsed document object(s).
-
-        Raises:
-            ValueError: If blob.path is not set, API error occurs, or invalid config.
-            ImportError: If pypdf is needed but not installed.
-        """
-        # Clear image data at the start of parsing for this specific call
-        self.image_data = {}
-
-        if not blob.path:
-            # Non-PDF files and direct API calls require reading the file,
-            # PDF splitting also requires the path.
-            raise ValueError("Blob path is required for UpstageDocumentParseParser.")
-
-        # Try importing pypdf here, only if needed
-        PdfReader = None
-        PdfReadError = None
-        try:
-            from pypdf import PdfReader as PyPdfReader
-            from pypdf.errors import PdfReadError as PyPdfReadError
-
-            PdfReader = PyPdfReader
-            PdfReadError = PyPdfReadError
-        except ImportError:
-            # We only absolutely need pypdf if the file is a PDF and split is not 'none' maybe?
-            # Let's attempt to read anyway, API might support non-PDFs directly.
-            # We'll check for PdfReader later if we determine it's a PDF.
-            pass
-
-        full_docs: Optional[PdfReader] = None
-        is_pdf = False
-        number_of_pages = 1 # Default for non-PDF or single-page docs
-
-        try:
-            # Check if it's a PDF by trying to open it
-            if PdfReader and PdfReadError:
-                try:
-                    # Use strict=False to be more lenient with potentially corrupted PDFs
-                    full_docs = PdfReader(str(blob.path), strict=False)
-                    number_of_pages = len(full_docs.pages)
-                    is_pdf = True
-                except (PdfReadError, FileNotFoundError, IsADirectoryError) as e:
-                    parser_logger.warning(f"Could not read '{blob.path}' as PDF: {e}. Assuming non-PDF format.")
-                except Exception as e: # Catch other potential pypdf errors
-                    parser_logger.error(f"Unexpected error reading PDF '{blob.path}': {e}")
-                    raise ValueError(f"Failed to process PDF file: {e}") from e
-            else:
-                parser_logger.info("pypdf not installed. Treating input as a single non-PDF document for the API.")
-
-        except Exception as e:
-            raise ValueError(f"Failed to access or identify file type for: {blob.path}. Error: {e}") from e
-
-        # --- Parsing Logic based on Split Type ---
-
-        # Case 1: No Splitting (Combine all content)
-        if self.split == "none":
-            combined_result = ""
-            all_coordinates: list[PageCoordinates] = []
-            # Base64 handled by parse_text, data stored in self.image_data
-
-            if is_pdf and full_docs and PdfReader: # Process PDF page by page or in batches
-                start_page = 0
-                # Use a reasonable batch size for 'none' split to avoid huge requests
-                batch_num_pages = DEFAULT_NUM_PAGES
-                while start_page < number_of_pages:
-                    elements = self._split_and_request(full_docs, start_page, batch_num_pages)
-                    for element in sorted(elements, key=lambda x: (x.page, x.id)):
-                        combined_result += element.parse_text(self) + "\n\n" # Add separator
-                        if self.coordinates and element.coordinates:
-                            # Adjust page number for coordinates metadata
-                            coords_with_page: PageCoordinates = {
-                                "id": element.id,
-                                "category": element.category,
-                                "page": element.page + start_page, # Actual page
-                                "coordinates": element.coordinates,
-                            }
-                            all_coordinates.append(coords_with_page)
-                    start_page += batch_num_pages
-            else: # Process non-PDF file as a single unit
-                with open(blob.path, "rb") as f:
-                    # Provide a filename for the 'files' dict
-                    filename = os.path.basename(blob.path)
-                    elements = self._get_response({"document": (filename, f)})
-
-                for element in sorted(elements, key=lambda x: x.id):
-                    combined_result += element.parse_text(self) + "\n\n"
-                    if self.coordinates and element.coordinates:
-                        all_coordinates.append({
-                            "id": element.id,
-                            "category": element.category,
-                            "page": element.page, # Page is relative to the single doc (usually 0 or 1)
-                            "coordinates": element.coordinates,
-                        })
-
-            metadata: dict[str, object] = {"source": blob.path, "total_pages": number_of_pages}
-            if self.coordinates and all_coordinates:
-                metadata["element_coordinates"] = all_coordinates
-            # self.image_data is populated, no need to add base64 to metadata
-
-            yield Document(
-                page_content=combined_result.strip(),
-                metadata=metadata,
-            )
-
-        # Case 2: Split by Element
-        elif self.split == "element":
-            if is_pdf and full_docs and PdfReader:
-                start_page = 0
-                batch_num_pages = DEFAULT_NUM_PAGES if is_batch else 1 # Use smaller batches for element split?
-                while start_page < number_of_pages:
-                    elements = self._split_and_request(full_docs, start_page, batch_num_pages)
-                    for element in sorted(elements, key=lambda x: (x.page, x.id)):
-                        # _element_document handles metadata and adjusts page number
-                        doc = self._element_document(element, start_page)
-                        _get_metadata_from_document(doc)["source"] = blob.path # Add source
-                        yield doc
-                    start_page += batch_num_pages
-            else: # Non-PDF
-                with open(blob.path, "rb") as f:
-                    filename = os.path.basename(blob.path)
-                    elements = self._get_response({"document": (filename, f)})
-                for element in sorted(elements, key=lambda x: x.id):
-                    doc = self._element_document(element, 0) # Start page is 0 for single doc
-                    _get_metadata_from_document(doc)["source"] = blob.path # Add source
-                    yield doc
-
-        # Case 3: Split by Page
-        elif self.split == "page":
-            if is_pdf and full_docs and PdfReader:
-                start_page = 0
-                batch_num_pages = DEFAULT_NUM_PAGES if is_batch else 1 # Process page-by-page if not is_batch
-                while start_page < number_of_pages:
-                    elements = self._split_and_request(full_docs, start_page, batch_num_pages)
-                    # _page_document groups elements by page and creates Documents
-                    page_docs = self._page_document(elements, start_page)
-                    for doc in page_docs:
-                        _get_metadata_from_document(doc)["source"] = blob.path # Add source
-                        yield doc
-                    start_page += batch_num_pages
-            else: # Non-PDF (treat as single page)
-                with open(blob.path, "rb") as f:
-                    filename = os.path.basename(blob.path)
-                    elements = self._get_response({"document": (filename, f)})
-                page_docs = self._page_document(elements, 0) # Process elements as page 0
-                for doc in page_docs:
-                    _get_metadata_from_document(doc)["source"] = blob.path # Add source
-                    yield doc
-
-        else:
-            raise ValueError(f"Invalid split type: {self.split}")
-
-
-def _get_metadata_from_document(doc: Document) -> dict[object, object]:
-    """
-    Helper function to extract metadata from a Document object.
-    This is a placeholder and should be adjusted based on actual metadata structure.
-    """
-    metadata: dict[object, object] = doc.metadata # pyright: ignore[reportUnknownMemberType]
-    return metadata
1
|
+
# -*- coding: utf-8 -*-
|
2
|
+
"""Adopted from `langchain_upstage.document_parse`"""
|
3
|
+
|
4
|
+
from __future__ import annotations
|
5
|
+
|
6
|
+
import base64
|
7
|
+
import binascii
|
8
|
+
import io
|
9
|
+
import json
|
10
|
+
import logging
|
11
|
+
import os
|
12
|
+
import uuid
|
13
|
+
from typing import TYPE_CHECKING, Dict, Iterator, Literal, Optional, TypedDict, cast
|
14
|
+
|
15
|
+
import requests
|
16
|
+
from langchain_core.document_loaders import BaseBlobParser, Blob
|
17
|
+
from langchain_core.documents import Document
|
18
|
+
from pydantic import BaseModel, Field
|
19
|
+
|
20
|
+
from ..common_types.io import BytesReadable
|
21
|
+
from ..language_model import DEFAULT_IMAGE_DESCRIPTION_INSTRUCTION, Chatterer
|
22
|
+
from ..utils.base64_image import Base64Image
|
23
|
+
from ..utils.imghdr import what
|
24
|
+
|
25
|
+
if TYPE_CHECKING:
|
26
|
+
from pypdf import PdfReader
|
27
|
+
|
28
|
+
logger = logging.getLogger("pypdf")
|
29
|
+
logger.setLevel(logging.ERROR)
|
30
|
+
parser_logger = logging.getLogger(__name__) # Added logger for this module
|
31
|
+
|
32
|
+
DOCUMENT_PARSE_BASE_URL = "https://api.upstage.ai/v1/document-ai/document-parse"
|
33
|
+
DEFAULT_NUM_PAGES = 10
|
34
|
+
DOCUMENT_PARSE_DEFAULT_MODEL = "document-parse"
|
35
|
+
DEFAULT_IMAGE_DIR = "images" # Added default image directory
|
36
|
+
|
37
|
+
OutputFormat = Literal["text", "html", "markdown"]
|
38
|
+
OCR = Literal["auto", "force"]
|
39
|
+
SplitType = Literal["none", "page", "element"]
|
40
|
+
Category = Literal[
|
41
|
+
"paragraph",
|
42
|
+
"table",
|
43
|
+
"figure",
|
44
|
+
"header",
|
45
|
+
"footer",
|
46
|
+
"caption",
|
47
|
+
"equation",
|
48
|
+
"heading1",
|
49
|
+
"list",
|
50
|
+
"index",
|
51
|
+
"footnote",
|
52
|
+
"chart",
|
53
|
+
]
|
54
|
+
|
55
|
+
|
56
|
+
class Content(BaseModel):
|
57
|
+
text: Optional[str] = None
|
58
|
+
html: Optional[str] = None
|
59
|
+
markdown: Optional[str] = None
|
60
|
+
|
61
|
+
|
62
|
+
class Coordinate(BaseModel):
|
63
|
+
x: float
|
64
|
+
y: float
|
65
|
+
|
66
|
+
|
67
|
+
class Element(BaseModel):
|
68
|
+
category: Category
|
69
|
+
content: Content
|
70
|
+
coordinates: list[Coordinate] = Field(default_factory=list)
|
71
|
+
base64_encoding: str = ""
|
72
|
+
id: int
|
73
|
+
page: int
|
74
|
+
|
75
|
+
def parse_text(self, parser: "UpstageDocumentParseParser") -> str:
|
76
|
+
"""
|
77
|
+
Generates the text representation of the element.
|
78
|
+
|
79
|
+
If the element is a figure with base64 encoding and no chatterer is provided,
|
80
|
+
it generates a markdown link to a uniquely named image file and stores the
|
81
|
+
image data in the parser's image_data dictionary. Otherwise, it uses the
|
82
|
+
chatterer for description or returns the standard text/html/markdown.
|
83
|
+
"""
|
84
|
+
output_format: OutputFormat = parser.output_format
|
85
|
+
chatterer: Optional[Chatterer] = parser.chatterer
|
86
|
+
image_description_instruction: str = parser.image_description_instruction
|
87
|
+
output: Optional[str] = None
|
88
|
+
|
89
|
+
if output_format == "text":
|
90
|
+
output = self.content.text
|
91
|
+
elif output_format == "html":
|
92
|
+
output = self.content.html
|
93
|
+
elif output_format == "markdown":
|
94
|
+
output = self.content.markdown
|
95
|
+
|
96
|
+
if output is None:
|
97
|
+
# Fallback or raise error if needed, here using text as fallback
|
98
|
+
output = self.content.text or ""
|
99
|
+
# Or raise ValueError(f"Invalid output format or missing content: {output_format}")
|
100
|
+
|
101
|
+
# --- Logic modification starts here ---
|
102
|
+
if self.category == "figure" and self.base64_encoding:
|
103
|
+
# Case 1: Chatterer is available - Generate description
|
104
|
+
if chatterer is not None:
|
105
|
+
# Check if base64 encoding is valid
|
106
|
+
try:
|
107
|
+
# Decode base64 to check if valid
|
108
|
+
img_type = what(self.base64_encoding)
|
109
|
+
if not img_type:
|
110
|
+
parser_logger.warning(
|
111
|
+
f"Could not determine image type for figure element {self.id} (page {self.page})."
|
112
|
+
)
|
113
|
+
return output
|
114
|
+
image = Base64Image.from_string(f"data:image/{img_type};base64,{self.base64_encoding}")
|
115
|
+
|
116
|
+
except (binascii.Error, ValueError) as e:
|
117
|
+
parser_logger.warning(
|
118
|
+
f"Could not decode base64 for figure element {self.id} (page {self.page}): {e}. Falling back to original output."
|
119
|
+
)
|
120
|
+
return output
|
121
|
+
|
122
|
+
if image is None:
|
123
|
+
parser_logger.warning(
|
124
|
+
f"Invalid base64 encoding format for image element {self.id}, cannot create Base64Image object."
|
125
|
+
)
|
126
|
+
# Fallback to original output (placeholder/OCR)
|
127
|
+
return output
|
128
|
+
|
129
|
+
ocr_content = ""
|
130
|
+
if output_format == "markdown":
|
131
|
+
ocr_content = output.removeprefix("\n")
|
132
|
+
elif output_format == "text":
|
133
|
+
ocr_content = output
|
134
|
+
|
135
|
+
image_description = chatterer.describe_image(
|
136
|
+
image.data_uri,
|
137
|
+
image_description_instruction
|
138
|
+
+ f"\nHint: The OCR detected the following text:\n```\n{ocr_content}\n```",
|
139
|
+
)
|
140
|
+
# Return description within details tag (as original)
|
141
|
+
output = f"\n\n<details>\n<summary>Image Description</summary>\n{image_description}\n</details>\n\n"
|
142
|
+
|
143
|
+
# Case 2: Chatterer is NOT available - Generate file path and store data
|
144
|
+
elif parser.image_dir is not None:
|
145
|
+
try:
|
146
|
+
img_type = what(self.base64_encoding)
|
147
|
+
if not img_type:
|
148
|
+
parser_logger.warning(
|
149
|
+
f"Could not determine image type for figure element {self.id} (page {self.page})."
|
150
|
+
)
|
151
|
+
return output
|
152
|
+
|
153
|
+
image_bytes = base64.b64decode(self.base64_encoding)
|
154
|
+
|
155
|
+
# Generate unique filename and path
|
156
|
+
filename = f"{uuid.uuid4().hex}.{img_type}" # Use default format
|
157
|
+
# Create relative path for markdown link, ensuring forward slashes
|
158
|
+
relative_path = os.path.join(parser.image_dir, filename).replace("\\", "/")
|
159
|
+
|
160
|
+
# Store the image data for the user to save later
|
161
|
+
parser.image_data[relative_path] = image_bytes
|
162
|
+
|
163
|
+
# Extract OCR content if present
|
164
|
+
ocr_content = ""
|
165
|
+
if output_format == "markdown" and output.startswith("![image]"):
|
166
|
+
ocr_content = output.split("\n", 1)[1] if "\n" in output else ""
|
167
|
+
elif output_format == "text":
|
168
|
+
ocr_content = output # Assume text output is OCR for images
|
169
|
+
|
170
|
+
# Update output to be the markdown link + OCR
|
171
|
+
output = f"\n{ocr_content}".strip()
|
172
|
+
|
173
|
+
except (binascii.Error, ValueError) as e:
|
174
|
+
# Handle potential base64 decoding errors gracefully
|
175
|
+
parser_logger.warning(
|
176
|
+
f"Could not decode base64 for figure element {self.id} (page {self.page}): {e}. Falling back to original output."
|
177
|
+
)
|
178
|
+
# Keep the original 'output' value (placeholder or OCR)
|
179
|
+
pass
|
180
|
+
|
181
|
+
return output
|
182
|
+
|
183
|
+
|
184
|
+
class Coordinates(TypedDict):
|
185
|
+
id: int
|
186
|
+
category: Category
|
187
|
+
coordinates: list[Coordinate]
|
188
|
+
|
189
|
+
|
190
|
+
class PageCoordinates(Coordinates):
|
191
|
+
page: int
|
192
|
+
|
193
|
+
|
194
|
+
def get_from_param_or_env(
|
195
|
+
key: str,
|
196
|
+
param: Optional[str] = None,
|
197
|
+
env_key: Optional[str] = None,
|
198
|
+
default: Optional[str] = None,
|
199
|
+
) -> str:
|
200
|
+
"""Get a value from a param or an environment variable."""
|
201
|
+
if param is not None:
|
202
|
+
return param
|
203
|
+
elif env_key and env_key in os.environ and os.environ[env_key]:
|
204
|
+
return os.environ[env_key]
|
205
|
+
elif default is not None:
|
206
|
+
return default
|
207
|
+
else:
|
208
|
+
raise ValueError(
|
209
|
+
f"Did not find {key}, please add an environment variable"
|
210
|
+
f" `{env_key}` which contains it, or pass"
|
211
|
+
f" `{key}` as a named parameter."
|
212
|
+
)
|
213
|
+
|
214
|
+
|
215
|
+
class UpstageDocumentParseParser(BaseBlobParser):
|
216
|
+
"""Upstage Document Parse Parser.
|
217
|
+
|
218
|
+
Parses documents using the Upstage Document AI API. Can optionally extract
|
219
|
+
images and return their data alongside the parsed documents.
|
220
|
+
|
221
|
+
If a `chatterer` is provided, it will be used to generate descriptions for
|
222
|
+
images (figures with base64 encoding).
|
223
|
+
|
224
|
+
If `chatterer` is NOT provided, for figure elements with `base64_encoding`,
|
225
|
+
this parser will:
|
226
|
+
1. Generate a unique relative file path (e.g., "images/uuid.jpeg").
|
227
|
+
The base directory can be configured with `image_dir`.
|
228
|
+
2. Replace the element's content with a markdown image link pointing to this path.
|
229
|
+
3. Store the actual image bytes in the `image_data` attribute dictionary,
|
230
|
+
mapping the generated relative path to the bytes.
|
231
|
+
|
232
|
+
The user is responsible for saving the files from the `image_data` dictionary
|
233
|
+
after processing the documents yielded by `lazy_parse`.
|
234
|
+
|
235
|
+
To use, you should have the environment variable `UPSTAGE_API_KEY`
|
236
|
+
set with your API key or pass it as a named parameter to the constructor.
|
237
|
+
|
238
|
+
Example:
|
239
|
+
.. code-block:: python
|
240
|
+
|
241
|
+
from langchain_upstage import UpstageDocumentParseParser
|
242
|
+
from langchain_core.documents import Blob
|
243
|
+
import os
|
244
|
+
|
245
|
+
# --- Setup ---
|
246
|
+
# Ensure UPSTAGE_API_KEY is set in environment or passed as api_key
|
247
|
+
# Create a dummy PDF or image file 'my_document.pdf' / 'my_image.png'
|
248
|
+
|
249
|
+
# --- Parsing without chatterer (extracts images) ---
|
250
|
+
parser = UpstageDocumentParseParser(
|
251
|
+
split="page",
|
252
|
+
output_format="markdown",
|
253
|
+
base64_encoding=["figure"], # Important: Request base64 for figures
|
254
|
+
image_dir="extracted_images" # Optional: specify image dir
|
255
|
+
)
|
256
|
+
blob = Blob.from_path("my_document.pdf") # Or your image file path
|
257
|
+
documents = []
|
258
|
+
for doc in parser.lazy_parse(blob):
|
259
|
+
print("--- Document ---")
|
260
|
+
print(f"Page: {get_metadata_from_document(doc).get('page')}")
|
261
|
+
print(doc.page_content)
|
262
|
+
documents.append(doc)
|
263
|
+
|
264
|
+
print("\\n--- Extracted Image Data ---")
|
265
|
+
if parser.image_data:
|
266
|
+
# User saves the images
|
267
|
+
for img_path, img_bytes in parser.image_data.items():
|
268
|
+
# Create directories if they don't exist
|
269
|
+
os.makedirs(os.path.dirname(img_path), exist_ok=True)
|
270
|
+
try:
|
271
|
+
with open(img_path, "wb") as f:
|
272
|
+
f.write(img_bytes)
|
273
|
+
print(f"Saved image: {img_path}")
|
274
|
+
except IOError as e:
|
275
|
+
print(f"Error saving image {img_path}: {e}")
|
276
|
+
else:
|
277
|
+
print("No images extracted.")
|
278
|
+
|
279
|
+
# --- Parsing with chatterer (generates descriptions) ---
|
280
|
+
# from langchain_upstage import UpstageChatter # Assuming this exists
|
281
|
+
# chatterer = UpstageChatter() # Initialize your chatterer
|
282
|
+
# parser_with_desc = UpstageDocumentParseParser(
|
283
|
+
# split="page",
|
284
|
+
# output_format="markdown",
|
285
|
+
# base64_encoding=["figure"], # Still need base64 for description
|
286
|
+
# chatterer=chatterer
|
287
|
+
# )
|
288
|
+
# documents_with_desc = list(parser_with_desc.lazy_parse(blob))
|
289
|
+
# print("\\n--- Documents with Descriptions ---")
|
290
|
+
# for doc in documents_with_desc:
|
291
|
+
# print(f"Page: {get_metadata_from_document(doc).get('page')}")
|
292
|
+
# print(doc.page_content)
|
293
|
+
|
294
|
+
"""
|
295
|
+
|
296
|
+
def __init__(
|
297
|
+
self,
|
298
|
+
api_key: Optional[str] = None,
|
299
|
+
base_url: str = DOCUMENT_PARSE_BASE_URL,
|
300
|
+
model: str = DOCUMENT_PARSE_DEFAULT_MODEL,
|
301
|
+
split: SplitType = "none",
|
302
|
+
ocr: OCR = "auto",
|
303
|
+
output_format: OutputFormat = "markdown",
|
304
|
+
coordinates: bool = True,
|
305
|
+
base64_encoding: list[Category] = [],
|
306
|
+
chatterer: Optional[Chatterer] = None,
|
307
|
+
image_description_instruction: str = DEFAULT_IMAGE_DESCRIPTION_INSTRUCTION,
|
308
|
+
image_dir: Optional[str] = None, # Added: Directory for image paths
|
309
|
+
) -> None:
|
310
|
+
"""
|
311
|
+
Initializes an instance of the UpstageDocumentParseParser.
|
312
|
+
|
313
|
+
Args:
|
314
|
+
api_key (str, optional): Upstage API key. Defaults to env `UPSTAGE_API_KEY`.
|
315
|
+
base_url (str, optional): Base URL for the Upstage API.
|
316
|
+
model (str): Model for document parse. Defaults to "document-parse".
|
317
|
+
split (SplitType, optional): Splitting type ("none", "page", "element").
|
318
|
+
Defaults to "none".
|
319
|
+
ocr (OCR, optional): OCR mode ("auto", "force"). Defaults to "auto".
|
320
|
+
output_format (OutputFormat, optional): Output format ("text", "html", "markdown").
|
321
|
+
Defaults to "markdown".
|
322
|
+
coordinates (bool, optional): Include coordinates in metadata. Defaults to True.
|
323
|
+
base64_encoding (List[Category], optional): Categories to return as base64.
|
324
|
+
Crucial for image extraction/description.
|
325
|
+
Set to `["figure"]` to process images.
|
326
|
+
Defaults to [].
|
327
|
+
chatterer (Chatterer, optional): Chatterer instance for image description.
|
328
|
+
If None, images will be extracted to files.
|
329
|
+
Defaults to None.
|
330
|
+
image_description_instruction (str, optional): Instruction for image description.
|
331
|
+
Defaults to a standard instruction.
|
332
|
+
image_dir (str, optional): The directory name to use when constructing
|
333
|
+
relative paths for extracted images.
|
334
|
+
Defaults to "images". This directory
|
335
|
+
is NOT created by the parser.
|
336
|
+
"""
|
337
|
+
self.api_key = get_from_param_or_env(
|
338
|
+
"UPSTAGE_API_KEY",
|
339
|
+
api_key,
|
340
|
+
"UPSTAGE_API_KEY",
|
341
|
+
os.environ.get("UPSTAGE_API_KEY"),
|
342
|
+
)
|
343
|
+
self.base_url = base_url
|
344
|
+
self.model = model
|
345
|
+
self.split: SplitType = split
|
346
|
+
self.ocr: OCR = ocr
|
347
|
+
self.output_format: OutputFormat = output_format
|
348
|
+
self.coordinates = coordinates
|
349
|
+
# Ensure 'figure' is requested if chatterer is None and user wants extraction implicitly
|
350
|
+
# However, it's better to require the user to explicitly set base64_encoding=["figure"]
|
351
|
+
self.base64_encoding: list[Category] = base64_encoding
|
352
|
+
self.chatterer = chatterer
|
353
|
+
self.image_description_instruction = image_description_instruction
|
354
|
+
self.image_dir = image_dir # Store output directory name
|
355
|
+
|
356
|
+
# Initialize dictionary to store image data (path -> bytes)
|
357
|
+
self.image_data: Dict[str, bytes] = {}
|
358
|
+
|
359
|
+
def _get_response(self, files: dict[str, tuple[str, BytesReadable]]) -> list[Element]:
|
360
|
+
"""
|
361
|
+
Sends a POST request to the API endpoint with the provided files and
|
362
|
+
returns the parsed elements.
|
363
|
+
"""
|
364
|
+
response: Optional[requests.Response] = None
|
365
|
+
try:
|
366
|
+
headers = {
|
367
|
+
"Authorization": f"Bearer {self.api_key}",
|
368
|
+
}
|
369
|
+
# Convert list to string representation required by the API
|
370
|
+
base64_encoding_str = str(self.base64_encoding) if self.base64_encoding else "[]"
|
371
|
+
output_formats_str = f"['{self.output_format}']"
|
372
|
+
|
373
|
+
response = requests.post(
|
374
|
+
self.base_url,
|
375
|
+
headers=headers,
|
376
|
+
files=files,
|
377
|
+
data={
|
378
|
+
"ocr": self.ocr,
|
379
|
+
"model": self.model,
|
380
|
+
"output_formats": output_formats_str,
|
381
|
+
"coordinates": str(self.coordinates).lower(), # API might expect 'true'/'false'
|
382
|
+
"base64_encoding": base64_encoding_str,
|
383
|
+
},
|
384
|
+
)
|
385
|
+
response.raise_for_status() # Raises HTTPError for bad responses (4xx or 5xx)
|
386
|
+
|
387
|
+
# Check content type before parsing JSON
|
388
|
+
content_type = response.headers.get("Content-Type", "")
|
389
|
+
if "application/json" not in content_type:
|
390
|
+
raise ValueError(f"Unexpected content type: {content_type}. Response body: {response.text}")
|
391
|
+
|
392
|
+
response_data = response.json()
|
393
|
+
result: object = response_data.get("elements", [])
|
394
|
+
|
395
|
+
if not isinstance(result, list):
|
396
|
+
raise ValueError(f"API response 'elements' is not a list: {result}")
|
397
|
+
result = cast(list[object], result) # Cast to list of objects
|
398
|
+
|
399
|
+
# Validate each element using Pydantic
|
400
|
+
validated_elements: list[Element] = []
|
401
|
+
for i, element_data in enumerate(result):
|
402
|
+
try:
|
403
|
+
validated_elements.append(Element.model_validate(element_data))
|
404
|
+
except Exception as e: # Catch Pydantic validation errors etc.
|
405
|
+
parser_logger.error(f"Failed to validate element {i}: {element_data}. Error: {e}")
|
406
|
+
# Decide whether to skip the element or raise the error
|
407
|
+
# continue # Option: skip problematic element
|
408
|
+
raise ValueError(f"Failed to validate element {i}: {e}") from e # Option: fail fast
|
409
|
+
|
410
|
+
return validated_elements
|
411
|
+
|
412
|
+
except requests.HTTPError as e:
|
413
|
+
# Log more details from the response if available
|
414
|
+
error_message = f"HTTP error: {e.response.status_code} {e.response.reason}"
|
415
|
+
try:
|
416
|
+
error_details = e.response.json() # Try to get JSON error details
|
417
|
+
error_message += f" - {error_details}"
|
418
|
+
except json.JSONDecodeError:
|
419
|
+
error_message += f" - Response body: {e.response.text}"
|
420
|
+
raise ValueError(error_message) from e
|
421
|
+
except requests.RequestException as e:
|
422
|
+
raise ValueError(f"Failed to send request: {e}") from e
|
423
|
+
except json.JSONDecodeError as e:
|
424
|
+
# Include part of the response text that failed to parse
|
425
|
+
raise ValueError(
|
426
|
+
f"Failed to decode JSON response: {e}. Response text starts with: {response.text[:200] if response else 'No response'}"
|
427
|
+
) from e
|
428
|
+
except Exception as e: # Catch-all for other unexpected errors
|
429
|
+
raise ValueError(f"An unexpected error occurred during API call: {e}") from e
|
430
|
+
|
431
|
+
def _split_and_request(
|
432
|
+
self, full_docs: PdfReader, start_page: int, num_pages: int = DEFAULT_NUM_PAGES
|
433
|
+
) -> list[Element]:
|
434
|
+
"""
|
435
|
+
Splits the full pdf document into partial pages and sends a request.
|
436
|
+
"""
|
437
|
+
# Need to import here if not globally available
|
438
|
+
try:
|
439
|
+
from pypdf import PdfWriter
|
440
|
+
except ImportError:
|
441
|
+
raise ImportError("pypdf is required for PDF splitting. Please install it with `pip install pypdf`.")
|
442
|
+
|
443
|
+
merger = PdfWriter()
|
444
|
+
total_pages = len(full_docs.pages) # Use len(reader.pages) instead of get_num_pages()
|
445
|
+
end_page = min(start_page + num_pages, total_pages)
|
446
|
+
|
447
|
+
# Check if start_page is valid
|
448
|
+
if start_page >= total_pages:
|
449
|
+
parser_logger.warning(f"Start page {start_page} is out of bounds for document with {total_pages} pages.")
|
450
|
+
return []
|
451
|
+
|
452
|
+
# pypdf page indices are 0-based, slicing is exclusive of the end index
|
453
|
+
# PdfWriter.append() expects pages=(start, stop) where stop is exclusive.
|
454
|
+
# However, the example used pages=(start, end) which might behave differently depending on version?
|
455
|
+
# Let's stick to add_page for clarity if possible, or ensure append range is correct.
|
456
|
+
# merger.append(full_docs, pages=(start_page, end_page)) # This selects pages start_page..end_page-1
|
457
|
+
|
458
|
+
# Alternative using add_page loop (more explicit)
|
459
|
+
for i in range(start_page, end_page):
|
460
|
+
merger.add_page(full_docs.pages[i])
|
461
|
+
|
462
|
+
with io.BytesIO() as buffer:
|
463
|
+
merger.write(buffer)
|
464
|
+
buffer.seek(0)
|
465
|
+
# Need to provide a filename for the 'files' dict
|
466
|
+
return self._get_response({"document": ("partial_doc.pdf", buffer)}) # Provide a dummy filename
|
467
|
+
|
468
|
+
def _element_document(self, element: Element, start_page: int = 0) -> Document:
|
469
|
+
"""Converts an element into a Document object."""
|
470
|
+
# parse_text now handles image path generation and data storage if needed
|
471
|
+
page_content = element.parse_text(self)
|
472
|
+
metadata: dict[str, object] = element.model_dump(
|
473
|
+
exclude={"content", "base64_encoding"}, exclude_none=True
|
474
|
+
) # Exclude raw content/base64
|
475
|
+
metadata["page"] = element.page + start_page # Adjust page number
|
476
|
+
# Base64 encoding is not added to metadata if it was processed into image_data
|
477
|
+
# Coordinates are kept if requested
|
478
|
+
if not self.coordinates:
|
479
|
+
metadata.pop("coordinates", None)
|
480
|
+
|
481
|
+
return Document(
|
482
|
+
page_content=page_content,
|
483
|
+
metadata=metadata,
|
484
|
+
)
|
485
|
+

    def _page_document(self, elements: list[Element], start_page: int = 0) -> list[Document]:
        """Combines elements that share a page number into a single Document object."""
        documents: list[Document] = []
        if not elements:
            return documents

        # Group elements by page (relative to the current batch).
        pages: list[int] = sorted({element.page for element in elements})
        page_groups: Dict[int, list[Element]] = {page: [] for page in pages}
        for element in elements:
            page_groups[element.page].append(element)

        for page_num, group in page_groups.items():
            actual_page_num = page_num + start_page
            page_content_parts: list[str] = []
            page_coordinates: list[Coordinates] = []
            # Base64 encodings are handled within parse_text, not collected here.

            for element in sorted(group, key=lambda x: x.id):  # Process elements in order.
                page_content_parts.append(element.parse_text(self))
                if self.coordinates and element.coordinates:
                    # Store coordinates with the element id/category for context.
                    page_coordinates.append({
                        "id": element.id,
                        "category": element.category,
                        "coordinates": element.coordinates,
                    })

            metadata: dict[str, object] = {"page": actual_page_num}
            if self.coordinates and page_coordinates:
                metadata["element_coordinates"] = page_coordinates

            # Join non-empty parts with blank lines; a newline separator keeps
            # paragraph-like elements readable.
            combined_page_content = "\n\n".join(part for part in page_content_parts if part)

            documents.append(
                Document(
                    page_content=combined_page_content,
                    metadata=metadata,
                )
            )

        return documents
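
    # Illustrative sketch of the grouping above with toy data; the tuples are
    # stand-ins for real Element instances:
    #
    #     elements_by_page = [(0, "title"), (0, "paragraph"), (1, "figure")]
    #     pages = sorted({page for page, _ in elements_by_page})  # [0, 1]
    #     groups = {page: [] for page in pages}
    #     for page, category in elements_by_page:
    #         groups[page].append(category)
    #     # groups == {0: ["title", "paragraph"], 1: ["figure"]}
    #
    # With start_page=10, page 0 of the batch becomes absolute page 10, and so on.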

    def lazy_parse(self, blob: Blob, is_batch: bool = False) -> Iterator[Document]:
        """
        Lazily parses a document blob.

        Yields Document objects based on the specified split type.
        If images are extracted (chatterer=None, base64_encoding=["figure"]),
        the image data is available in `self.image_data` after iteration.

        Args:
            blob (Blob): The input document blob to parse. Requires `blob.path`.
            is_batch (bool, optional): Affects the PDF page batch size.
                Defaults to False (process one page per request for PDFs).
                Note: the API may enforce its own limits regardless.

        Yields:
            Document: The parsed document object(s).

        Raises:
            ValueError: If blob.path is not set, an API error occurs, or the
                configuration is invalid.
        """
        # Clear image data at the start of this parsing call.
        self.image_data = {}

        if not blob.path:
            # Non-PDF files and direct API calls require reading the file,
            # and PDF splitting also requires the path.
            raise ValueError("Blob path is required for UpstageDocumentParseParser.")

        # Import pypdf lazily; it is only required for PDF inputs.
        PdfReader = None
        PdfReadError = None
        try:
            from pypdf import PdfReader as PyPdfReader
            from pypdf.errors import PdfReadError as PyPdfReadError

            PdfReader = PyPdfReader
            PdfReadError = PyPdfReadError
        except ImportError:
            # pypdf is only strictly required for PDF inputs; the API may accept
            # other formats directly, so defer the check until the input is
            # known to be a PDF.
            pass

        full_docs: Optional[PdfReader] = None
        is_pdf = False
        number_of_pages = 1  # Default for non-PDF or single-page docs.

        try:
            # Check whether the input is a PDF by trying to open it.
            if PdfReader and PdfReadError:
                try:
                    # strict=False is more lenient with partially corrupted PDFs.
                    full_docs = PdfReader(str(blob.path), strict=False)
                    number_of_pages = len(full_docs.pages)
                    is_pdf = True
                except (PdfReadError, FileNotFoundError, IsADirectoryError) as e:
                    parser_logger.warning(f"Could not read '{blob.path}' as PDF: {e}. Assuming non-PDF format.")
                except Exception as e:  # Catch other potential pypdf errors.
                    parser_logger.error(f"Unexpected error reading PDF '{blob.path}': {e}")
                    raise ValueError(f"Failed to process PDF file: {e}") from e
            else:
                parser_logger.info("pypdf not installed. Treating input as a single non-PDF document for the API.")

        except Exception as e:
            raise ValueError(f"Failed to access or identify file type for: {blob.path}. Error: {e}") from e

        # --- Parsing Logic based on Split Type ---

        # Case 1: No Splitting (combine all content into one Document)
        if self.split == "none":
            combined_result = ""
            all_coordinates: list[PageCoordinates] = []
            # Base64 data is handled by parse_text and stored in self.image_data.

            if is_pdf and full_docs and PdfReader:  # Process the PDF in page batches.
                start_page = 0
                # Use a reasonable batch size for 'none' split to avoid huge requests.
                batch_num_pages = DEFAULT_NUM_PAGES
                while start_page < number_of_pages:
                    elements = self._split_and_request(full_docs, start_page, batch_num_pages)
                    for element in sorted(elements, key=lambda x: (x.page, x.id)):
                        combined_result += element.parse_text(self) + "\n\n"  # Add separator.
                        if self.coordinates and element.coordinates:
                            # Record the absolute page number with the coordinates.
                            coords_with_page: PageCoordinates = {
                                "id": element.id,
                                "category": element.category,
                                "page": element.page + start_page,  # Actual page.
                                "coordinates": element.coordinates,
                            }
                            all_coordinates.append(coords_with_page)
                    start_page += batch_num_pages
            else:  # Process a non-PDF file as a single unit.
                with open(blob.path, "rb") as f:
                    filename = os.path.basename(blob.path)  # The 'files' dict needs a filename.
                    elements = self._get_response({"document": (filename, f)})

                for element in sorted(elements, key=lambda x: x.id):
                    combined_result += element.parse_text(self) + "\n\n"
                    if self.coordinates and element.coordinates:
                        all_coordinates.append({
                            "id": element.id,
                            "category": element.category,
                            "page": element.page,  # Relative to the single doc (usually 0 or 1).
                            "coordinates": element.coordinates,
                        })

            metadata: dict[str, object] = {"source": blob.path, "total_pages": number_of_pages}
            if self.coordinates and all_coordinates:
                metadata["element_coordinates"] = all_coordinates
            # self.image_data is already populated; base64 is not added to metadata.

            yield Document(
                page_content=combined_result.strip(),
                metadata=metadata,
            )
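
        # Illustrative sketch of the page-batching loop above; the batch size 2
        # is a made-up value (the real default is DEFAULT_NUM_PAGES):
        #
        #     number_of_pages, batch = 5, 2
        #     start = 0
        #     while start < number_of_pages:
        #         end = min(start + batch, number_of_pages)
        #         print((start, end))  # (0, 2), (2, 4), (4, 5)
        #         start += batch
        #
        # Each request covers a half-open page range, and element.page is offset
        # by the batch's start page to recover the absolute page number.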

        # Case 2: Split by Element
        elif self.split == "element":
            if is_pdf and full_docs and PdfReader:
                start_page = 0
                batch_num_pages = DEFAULT_NUM_PAGES if is_batch else 1  # One page per request unless batching.
                while start_page < number_of_pages:
                    elements = self._split_and_request(full_docs, start_page, batch_num_pages)
                    for element in sorted(elements, key=lambda x: (x.page, x.id)):
                        # _element_document builds the metadata and adjusts the page number.
                        doc = self._element_document(element, start_page)
                        _get_metadata_from_document(doc)["source"] = blob.path  # Add source.
                        yield doc
                    start_page += batch_num_pages
            else:  # Non-PDF.
                with open(blob.path, "rb") as f:
                    filename = os.path.basename(blob.path)
                    elements = self._get_response({"document": (filename, f)})
                for element in sorted(elements, key=lambda x: x.id):
                    doc = self._element_document(element, 0)  # A single doc starts at page 0.
                    _get_metadata_from_document(doc)["source"] = blob.path  # Add source.
                    yield doc

        # Case 3: Split by Page
        elif self.split == "page":
            if is_pdf and full_docs and PdfReader:
                start_page = 0
                batch_num_pages = DEFAULT_NUM_PAGES if is_batch else 1  # Page-by-page unless batching.
                while start_page < number_of_pages:
                    elements = self._split_and_request(full_docs, start_page, batch_num_pages)
                    # _page_document groups elements by page and creates Documents.
                    page_docs = self._page_document(elements, start_page)
                    for doc in page_docs:
                        _get_metadata_from_document(doc)["source"] = blob.path  # Add source.
                        yield doc
                    start_page += batch_num_pages
            else:  # Non-PDF: treat as a single page.
                with open(blob.path, "rb") as f:
                    filename = os.path.basename(blob.path)
                    elements = self._get_response({"document": (filename, f)})
                page_docs = self._page_document(elements, 0)  # Elements belong to page 0.
                for doc in page_docs:
                    _get_metadata_from_document(doc)["source"] = blob.path  # Add source.
                    yield doc

        else:
            raise ValueError(f"Invalid split type: {self.split}")


def _get_metadata_from_document(doc: Document) -> dict[object, object]:
    """Returns a Document's metadata with an explicit dict type so static checkers accept key assignment."""
    metadata: dict[object, object] = doc.metadata  # pyright: ignore[reportUnknownMemberType]
    return metadata
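
# Illustrative usage sketch (not part of this module). The constructor
# arguments are assumptions inferred from the attributes used above (split,
# coordinates), not a confirmed signature:
#
#     from langchain_core.document_loaders import Blob
#
#     parser = UpstageDocumentParseParser(split="page", coordinates=True)
#     for doc in parser.lazy_parse(Blob.from_path("report.pdf")):
#         print(doc.metadata.get("page"), len(doc.page_content))
#
# Any extracted figure data ends up in parser.image_data after iteration.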