chatterer 0.1.13__py3-none-any.whl → 0.1.16__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- chatterer/__init__.py +36 -5
- chatterer/interactive.py +692 -0
- chatterer/language_model.py +217 -261
- chatterer/messages.py +13 -1
- chatterer/tools/__init__.py +26 -15
- chatterer/tools/{webpage_to_markdown/utils.py → caption_markdown_images.py} +158 -108
- chatterer/tools/convert_pdf_to_markdown.py +302 -0
- chatterer/tools/convert_to_text.py +45 -16
- chatterer/tools/upstage_document_parser.py +481 -214
- chatterer/tools/{webpage_to_markdown/playwright_bot.py → webpage_to_markdown.py} +197 -107
- chatterer/tools/youtube.py +2 -1
- chatterer/utils/__init__.py +1 -1
- chatterer/utils/{image.py → base64_image.py} +56 -62
- chatterer/utils/code_agent.py +137 -38
- chatterer/utils/imghdr.py +148 -0
- chatterer-0.1.16.dist-info/METADATA +392 -0
- chatterer-0.1.16.dist-info/RECORD +33 -0
- {chatterer-0.1.13.dist-info → chatterer-0.1.16.dist-info}/WHEEL +1 -1
- chatterer/tools/webpage_to_markdown/__init__.py +0 -4
- chatterer-0.1.13.dist-info/METADATA +0 -171
- chatterer-0.1.13.dist-info/RECORD +0 -31
- {chatterer-0.1.13.dist-info → chatterer-0.1.16.dist-info}/top_level.txt +0 -0
@@ -1,28 +1,38 @@
|
|
1
|
-
|
1
|
+
# -*- coding: utf-8 -*-
|
2
|
+
"""Adopted from `langchain_upstage.document_parse`"""
|
2
3
|
|
4
|
+
from __future__ import annotations
|
5
|
+
|
6
|
+
import base64
|
7
|
+
import binascii
|
3
8
|
import io
|
4
9
|
import json
|
5
10
|
import logging
|
6
11
|
import os
|
7
|
-
|
12
|
+
import uuid
|
13
|
+
from typing import TYPE_CHECKING, Dict, Iterator, Literal, Optional, TypedDict, cast
|
8
14
|
|
9
15
|
import requests
|
10
16
|
from langchain_core.document_loaders import BaseBlobParser, Blob
|
11
17
|
from langchain_core.documents import Document
|
12
18
|
from pydantic import BaseModel, Field
|
13
|
-
from pypdf import PdfReader, PdfWriter
|
14
|
-
from pypdf.errors import PdfReadError
|
15
19
|
|
16
20
|
from ..common_types.io import BytesReadable
|
17
21
|
from ..language_model import DEFAULT_IMAGE_DESCRIPTION_INSTRUCTION, Chatterer
|
18
|
-
from ..utils.
|
22
|
+
from ..utils.base64_image import Base64Image
|
23
|
+
from ..utils.imghdr import what
|
24
|
+
|
25
|
+
if TYPE_CHECKING:
|
26
|
+
from pypdf import PdfReader
|
19
27
|
|
20
28
|
logger = logging.getLogger("pypdf")
|
21
29
|
logger.setLevel(logging.ERROR)
|
30
|
+
parser_logger = logging.getLogger(__name__) # Added logger for this module
|
22
31
|
|
23
32
|
DOCUMENT_PARSE_BASE_URL = "https://api.upstage.ai/v1/document-ai/document-parse"
|
24
33
|
DEFAULT_NUM_PAGES = 10
|
25
34
|
DOCUMENT_PARSE_DEFAULT_MODEL = "document-parse"
|
35
|
+
DEFAULT_IMAGE_DIR = "images" # Added default image directory
|
26
36
|
|
27
37
|
OutputFormat = Literal["text", "html", "markdown"]
|
28
38
|
OCR = Literal["auto", "force"]
|
@@ -63,34 +73,124 @@ class Element(BaseModel):
|
|
63
73
|
page: int
|
64
74
|
|
65
75
|
def parse_text(self, parser: "UpstageDocumentParseParser") -> str:
|
76
|
+
"""
|
77
|
+
Generates the text representation of the element.
|
78
|
+
|
79
|
+
If the element is a figure with base64 encoding and no chatterer is provided,
|
80
|
+
it generates a markdown link to a uniquely named image file and stores the
|
81
|
+
image data in the parser's image_data dictionary. Otherwise, it uses the
|
82
|
+
chatterer for description or returns the standard text/html/markdown.
|
83
|
+
"""
|
66
84
|
output_format: OutputFormat = parser.output_format
|
67
85
|
chatterer: Optional[Chatterer] = parser.chatterer
|
68
86
|
image_description_instruction: str = parser.image_description_instruction
|
69
87
|
output: Optional[str] = None
|
88
|
+
|
70
89
|
if output_format == "text":
|
71
90
|
output = self.content.text
|
72
91
|
elif output_format == "html":
|
73
92
|
output = self.content.html
|
74
93
|
elif output_format == "markdown":
|
75
94
|
output = self.content.markdown
|
95
|
+
|
76
96
|
if output is None:
|
77
|
-
raise
|
78
|
-
|
79
|
-
|
80
|
-
|
81
|
-
|
82
|
-
|
83
|
-
|
84
|
-
|
85
|
-
|
86
|
-
|
87
|
-
|
88
|
-
|
89
|
-
|
97
|
+
# Fallback or raise error if needed, here using text as fallback
|
98
|
+
output = self.content.text or ""
|
99
|
+
# Or raise ValueError(f"Invalid output format or missing content: {output_format}")
|
100
|
+
|
101
|
+
# --- Logic modification starts here ---
|
102
|
+
if self.category == "figure" and self.base64_encoding:
|
103
|
+
# Case 1: Chatterer is available - Generate description
|
104
|
+
if chatterer is not None:
|
105
|
+
# Check if base64 encoding is valid
|
106
|
+
try:
|
107
|
+
# Decode base64 to check if valid
|
108
|
+
img_type = what(self.base64_encoding)
|
109
|
+
if not img_type:
|
110
|
+
parser_logger.warning(
|
111
|
+
f"Could not determine image type for figure element {self.id} (page {self.page})."
|
112
|
+
)
|
113
|
+
return output
|
114
|
+
image = Base64Image.from_string(f"data:image/{img_type};base64,{self.base64_encoding}")
|
115
|
+
|
116
|
+
except (binascii.Error, ValueError) as e:
|
117
|
+
parser_logger.warning(
|
118
|
+
f"Could not decode base64 for figure element {self.id} (page {self.page}): {e}. Falling back to original output."
|
119
|
+
)
|
120
|
+
return output
|
121
|
+
|
122
|
+
if image is None:
|
123
|
+
parser_logger.warning(
|
124
|
+
f"Invalid base64 encoding format for image element {self.id}, cannot create Base64Image object."
|
125
|
+
)
|
126
|
+
# Fallback to original output (placeholder/OCR)
|
127
|
+
return output
|
128
|
+
|
129
|
+
ocr_content = ""
|
130
|
+
if output_format == "markdown":
|
131
|
+
ocr_content = output.removeprefix("![image](/image/placeholder)\n")
|
132
|
+
elif output_format == "text":
|
133
|
+
ocr_content = output
|
134
|
+
|
135
|
+
image_description = chatterer.describe_image(
|
136
|
+
image.data_uri,
|
137
|
+
image_description_instruction
|
138
|
+
+ f"\nHint: The OCR detected the following text:\n```\n{ocr_content}\n```",
|
139
|
+
)
|
140
|
+
# Return description within details tag (as original)
|
141
|
+
output = f"\n\n<details>\n<summary>Image Description</summary>\n{image_description}\n</details>\n\n"
|
142
|
+
|
143
|
+
# Case 2: Chatterer is NOT available - Generate file path and store data
|
144
|
+
elif parser.image_dir is not None:
|
145
|
+
try:
|
146
|
+
img_type = what(self.base64_encoding)
|
147
|
+
if not img_type:
|
148
|
+
parser_logger.warning(
|
149
|
+
f"Could not determine image type for figure element {self.id} (page {self.page})."
|
150
|
+
)
|
151
|
+
return output
|
152
|
+
|
153
|
+
image_bytes = base64.b64decode(self.base64_encoding)
|
154
|
+
|
155
|
+
# Generate unique filename and path
|
156
|
+
filename = f"{uuid.uuid4().hex}.{img_type}" # Use default format
|
157
|
+
# Create relative path for markdown link, ensuring forward slashes
|
158
|
+
relative_path = os.path.join(parser.image_dir, filename).replace("\\", "/")
|
159
|
+
|
160
|
+
# Store the image data for the user to save later
|
161
|
+
parser.image_data[relative_path] = image_bytes
|
162
|
+
|
163
|
+
# Extract OCR content if present
|
164
|
+
ocr_content = ""
|
165
|
+
if output_format == "markdown" and output.startswith("![image]"):
|
166
|
+
ocr_content = output.split("\n", 1)[1] if "\n" in output else ""
|
167
|
+
elif output_format == "text":
|
168
|
+
ocr_content = output # Assume text output is OCR for images
|
169
|
+
|
170
|
+
# Update output to be the markdown link + OCR
|
171
|
+
output = f"![image]({relative_path})\n{ocr_content}".strip()
|
172
|
+
|
173
|
+
except (binascii.Error, ValueError) as e:
|
174
|
+
# Handle potential base64 decoding errors gracefully
|
175
|
+
parser_logger.warning(
|
176
|
+
f"Could not decode base64 for figure element {self.id} (page {self.page}): {e}. Falling back to original output."
|
177
|
+
)
|
178
|
+
# Keep the original 'output' value (placeholder or OCR)
|
179
|
+
pass
|
90
180
|
|
91
181
|
return output
|
92
182
|
|
93
183
|
|
184
|
+
class Coordinates(TypedDict):
|
185
|
+
id: int
|
186
|
+
category: Category
|
187
|
+
coordinates: list[Coordinate]
|
188
|
+
|
189
|
+
|
190
|
+
class PageCoordinates(Coordinates):
|
191
|
+
page: int
|
192
|
+
|
193
|
+
|
94
194
|
def get_from_param_or_env(
|
95
195
|
key: str,
|
96
196
|
param: Optional[str] = None,
|
@@ -108,13 +208,30 @@ def get_from_param_or_env(
|
|
108
208
|
raise ValueError(
|
109
209
|
f"Did not find {key}, please add an environment variable"
|
110
210
|
f" `{env_key}` which contains it, or pass"
|
111
|
-
f"
|
211
|
+
f" `{key}` as a named parameter."
|
112
212
|
)
|
113
213
|
|
114
214
|
|
115
215
|
class UpstageDocumentParseParser(BaseBlobParser):
|
116
216
|
"""Upstage Document Parse Parser.
|
117
217
|
|
218
|
+
Parses documents using the Upstage Document AI API. Can optionally extract
|
219
|
+
images and return their data alongside the parsed documents.
|
220
|
+
|
221
|
+
If a `chatterer` is provided, it will be used to generate descriptions for
|
222
|
+
images (figures with base64 encoding).
|
223
|
+
|
224
|
+
If `chatterer` is NOT provided, for figure elements with `base64_encoding`,
|
225
|
+
this parser will:
|
226
|
+
1. Generate a unique relative file path (e.g., "images/uuid.jpeg").
|
227
|
+
The base directory can be configured with `image_dir`.
|
228
|
+
2. Replace the element's content with a markdown image link pointing to this path.
|
229
|
+
3. Store the actual image bytes in the `image_data` attribute dictionary,
|
230
|
+
mapping the generated relative path to the bytes.
|
231
|
+
|
232
|
+
The user is responsible for saving the files from the `image_data` dictionary
|
233
|
+
after processing the documents yielded by `lazy_parse`.
|
234
|
+
|
118
235
|
To use, you should have the environment variable `UPSTAGE_API_KEY`
|
119
236
|
set with your API key or pass it as a named parameter to the constructor.
|
120
237
|
|
@@ -122,8 +239,58 @@ class UpstageDocumentParseParser(BaseBlobParser):
|
|
122
239
|
.. code-block:: python
|
123
240
|
|
124
241
|
from langchain_upstage import UpstageDocumentParseParser
|
242
|
+
from langchain_core.documents import Blob
|
243
|
+
import os
|
244
|
+
|
245
|
+
# --- Setup ---
|
246
|
+
# Ensure UPSTAGE_API_KEY is set in environment or passed as api_key
|
247
|
+
# Create a dummy PDF or image file 'my_document.pdf' / 'my_image.png'
|
248
|
+
|
249
|
+
# --- Parsing without chatterer (extracts images) ---
|
250
|
+
parser = UpstageDocumentParseParser(
|
251
|
+
split="page",
|
252
|
+
output_format="markdown",
|
253
|
+
base64_encoding=["figure"], # Important: Request base64 for figures
|
254
|
+
image_dir="extracted_images" # Optional: specify image dir
|
255
|
+
)
|
256
|
+
blob = Blob.from_path("my_document.pdf") # Or your image file path
|
257
|
+
documents = []
|
258
|
+
for doc in parser.lazy_parse(blob):
|
259
|
+
print("--- Document ---")
|
260
|
+
print(f"Page: {get_metadata_from_document(doc).get('page')}")
|
261
|
+
print(doc.page_content)
|
262
|
+
documents.append(doc)
|
263
|
+
|
264
|
+
print("\\n--- Extracted Image Data ---")
|
265
|
+
if parser.image_data:
|
266
|
+
# User saves the images
|
267
|
+
for img_path, img_bytes in parser.image_data.items():
|
268
|
+
# Create directories if they don't exist
|
269
|
+
os.makedirs(os.path.dirname(img_path), exist_ok=True)
|
270
|
+
try:
|
271
|
+
with open(img_path, "wb") as f:
|
272
|
+
f.write(img_bytes)
|
273
|
+
print(f"Saved image: {img_path}")
|
274
|
+
except IOError as e:
|
275
|
+
print(f"Error saving image {img_path}: {e}")
|
276
|
+
else:
|
277
|
+
print("No images extracted.")
|
278
|
+
|
279
|
+
# --- Parsing with chatterer (generates descriptions) ---
|
280
|
+
# from langchain_upstage import UpstageChatter # Assuming this exists
|
281
|
+
# chatterer = UpstageChatter() # Initialize your chatterer
|
282
|
+
# parser_with_desc = UpstageDocumentParseParser(
|
283
|
+
# split="page",
|
284
|
+
# output_format="markdown",
|
285
|
+
# base64_encoding=["figure"], # Still need base64 for description
|
286
|
+
# chatterer=chatterer
|
287
|
+
# )
|
288
|
+
# documents_with_desc = list(parser_with_desc.lazy_parse(blob))
|
289
|
+
# print("\\n--- Documents with Descriptions ---")
|
290
|
+
# for doc in documents_with_desc:
|
291
|
+
# print(f"Page: {get_metadata_from_document(doc).get('page')}")
|
292
|
+
# print(doc.page_content)
|
125
293
|
|
126
|
-
loader = UpstageDocumentParseParser(split="page", output_format="text")
|
127
294
|
"""
|
128
295
|
|
129
296
|
def __init__(
|
@@ -138,36 +305,34 @@ class UpstageDocumentParseParser(BaseBlobParser):
|
|
138
305
|
base64_encoding: list[Category] = [],
|
139
306
|
chatterer: Optional[Chatterer] = None,
|
140
307
|
image_description_instruction: str = DEFAULT_IMAGE_DESCRIPTION_INSTRUCTION,
|
308
|
+
image_dir: Optional[str] = None, # Added: Directory for image paths
|
141
309
|
) -> None:
|
142
310
|
"""
|
143
|
-
Initializes an instance of the
|
311
|
+
Initializes an instance of the UpstageDocumentParseParser.
|
144
312
|
|
145
313
|
Args:
|
146
|
-
api_key (str, optional):
|
147
|
-
|
148
|
-
|
149
|
-
|
150
|
-
|
151
|
-
|
152
|
-
|
153
|
-
|
154
|
-
|
155
|
-
|
156
|
-
|
157
|
-
|
158
|
-
|
159
|
-
|
160
|
-
|
161
|
-
|
162
|
-
|
163
|
-
|
164
|
-
|
165
|
-
|
166
|
-
|
167
|
-
|
168
|
-
image description.
|
169
|
-
|
170
|
-
|
314
|
+
api_key (str, optional): Upstage API key. Defaults to env `UPSTAGE_API_KEY`.
|
315
|
+
base_url (str, optional): Base URL for the Upstage API.
|
316
|
+
model (str): Model for document parse. Defaults to "document-parse".
|
317
|
+
split (SplitType, optional): Splitting type ("none", "page", "element").
|
318
|
+
Defaults to "none".
|
319
|
+
ocr (OCR, optional): OCR mode ("auto", "force"). Defaults to "auto".
|
320
|
+
output_format (OutputFormat, optional): Output format ("text", "html", "markdown").
|
321
|
+
Defaults to "markdown".
|
322
|
+
coordinates (bool, optional): Include coordinates in metadata. Defaults to True.
|
323
|
+
base64_encoding (List[Category], optional): Categories to return as base64.
|
324
|
+
Crucial for image extraction/description.
|
325
|
+
Set to `["figure"]` to process images.
|
326
|
+
Defaults to [].
|
327
|
+
chatterer (Chatterer, optional): Chatterer instance for image description.
|
328
|
+
If None, images will be extracted to files.
|
329
|
+
Defaults to None.
|
330
|
+
image_description_instruction (str, optional): Instruction for image description.
|
331
|
+
Defaults to a standard instruction.
|
332
|
+
image_dir (str, optional): The directory name to use when constructing
|
333
|
+
relative paths for extracted images.
|
334
|
+
Defaults to "images". This directory
|
335
|
+
is NOT created by the parser.
|
171
336
|
"""
|
172
337
|
self.api_key = get_from_param_or_env(
|
173
338
|
"UPSTAGE_API_KEY",
|
@@ -181,28 +346,30 @@ class UpstageDocumentParseParser(BaseBlobParser):
|
|
181
346
|
self.ocr: OCR = ocr
|
182
347
|
self.output_format: OutputFormat = output_format
|
183
348
|
self.coordinates = coordinates
|
349
|
+
# Ensure 'figure' is requested if chatterer is None and user wants extraction implicitly
|
350
|
+
# However, it's better to require the user to explicitly set base64_encoding=["figure"]
|
184
351
|
self.base64_encoding: list[Category] = base64_encoding
|
185
352
|
self.chatterer = chatterer
|
186
353
|
self.image_description_instruction = image_description_instruction
|
354
|
+
self.image_dir = image_dir # Store output directory name
|
355
|
+
|
356
|
+
# Initialize dictionary to store image data (path -> bytes)
|
357
|
+
self.image_data: Dict[str, bytes] = {}
|
187
358
|
|
188
|
-
def _get_response(self, files: dict[str, BytesReadable]) -> list[Element]:
|
359
|
+
def _get_response(self, files: dict[str, tuple[str, BytesReadable]]) -> list[Element]:
|
189
360
|
"""
|
190
361
|
Sends a POST request to the API endpoint with the provided files and
|
191
|
-
returns the
|
192
|
-
|
193
|
-
Args:
|
194
|
-
files (dict): A dictionary containing the files to be sent in the request.
|
195
|
-
|
196
|
-
Returns:
|
197
|
-
dict: The JSON response from the API.
|
198
|
-
|
199
|
-
Raises:
|
200
|
-
ValueError: If there is an error in the API call.
|
362
|
+
returns the parsed elements.
|
201
363
|
"""
|
364
|
+
response: Optional[requests.Response] = None
|
202
365
|
try:
|
203
366
|
headers = {
|
204
367
|
"Authorization": f"Bearer {self.api_key}",
|
205
368
|
}
|
369
|
+
# Convert list to string representation required by the API
|
370
|
+
base64_encoding_str = str(self.base64_encoding) if self.base64_encoding else "[]"
|
371
|
+
output_formats_str = f"['{self.output_format}']"
|
372
|
+
|
206
373
|
response = requests.post(
|
207
374
|
self.base_url,
|
208
375
|
headers=headers,
|
@@ -210,104 +377,152 @@ class UpstageDocumentParseParser(BaseBlobParser):
|
|
210
377
|
data={
|
211
378
|
"ocr": self.ocr,
|
212
379
|
"model": self.model,
|
213
|
-
"output_formats":
|
214
|
-
"coordinates": self.coordinates,
|
215
|
-
"base64_encoding":
|
380
|
+
"output_formats": output_formats_str,
|
381
|
+
"coordinates": str(self.coordinates).lower(), # API might expect 'true'/'false'
|
382
|
+
"base64_encoding": base64_encoding_str,
|
216
383
|
},
|
217
384
|
)
|
218
|
-
response.raise_for_status()
|
219
|
-
|
385
|
+
response.raise_for_status() # Raises HTTPError for bad responses (4xx or 5xx)
|
386
|
+
|
387
|
+
# Check content type before parsing JSON
|
388
|
+
content_type = response.headers.get("Content-Type", "")
|
389
|
+
if "application/json" not in content_type:
|
390
|
+
raise ValueError(f"Unexpected content type: {content_type}. Response body: {response.text}")
|
391
|
+
|
392
|
+
response_data = response.json()
|
393
|
+
result: object = response_data.get("elements", [])
|
394
|
+
|
220
395
|
if not isinstance(result, list):
|
221
|
-
raise ValueError(f"
|
222
|
-
result = cast(list[object], result)
|
223
|
-
|
396
|
+
raise ValueError(f"API response 'elements' is not a list: {result}")
|
397
|
+
result = cast(list[object], result) # Cast to list of objects
|
398
|
+
|
399
|
+
# Validate each element using Pydantic
|
400
|
+
validated_elements: list[Element] = []
|
401
|
+
for i, element_data in enumerate(result):
|
402
|
+
try:
|
403
|
+
validated_elements.append(Element.model_validate(element_data))
|
404
|
+
except Exception as e: # Catch Pydantic validation errors etc.
|
405
|
+
parser_logger.error(f"Failed to validate element {i}: {element_data}. Error: {e}")
|
406
|
+
# Decide whether to skip the element or raise the error
|
407
|
+
# continue # Option: skip problematic element
|
408
|
+
raise ValueError(f"Failed to validate element {i}: {e}") from e # Option: fail fast
|
409
|
+
|
410
|
+
return validated_elements
|
411
|
+
|
224
412
|
except requests.HTTPError as e:
|
225
|
-
|
413
|
+
# Log more details from the response if available
|
414
|
+
error_message = f"HTTP error: {e.response.status_code} {e.response.reason}"
|
415
|
+
try:
|
416
|
+
error_details = e.response.json() # Try to get JSON error details
|
417
|
+
error_message += f" - {error_details}"
|
418
|
+
except json.JSONDecodeError:
|
419
|
+
error_message += f" - Response body: {e.response.text}"
|
420
|
+
raise ValueError(error_message) from e
|
226
421
|
except requests.RequestException as e:
|
227
|
-
|
228
|
-
raise ValueError(f"Failed to send request: {e}")
|
422
|
+
raise ValueError(f"Failed to send request: {e}") from e
|
229
423
|
except json.JSONDecodeError as e:
|
230
|
-
#
|
231
|
-
raise ValueError(
|
232
|
-
|
233
|
-
|
234
|
-
|
424
|
+
# Include part of the response text that failed to parse
|
425
|
+
raise ValueError(
|
426
|
+
f"Failed to decode JSON response: {e}. Response text starts with: {response.text[:200] if response else 'No response'}"
|
427
|
+
) from e
|
428
|
+
except Exception as e: # Catch-all for other unexpected errors
|
429
|
+
raise ValueError(f"An unexpected error occurred during API call: {e}") from e
|
235
430
|
|
236
431
|
def _split_and_request(
|
237
432
|
self, full_docs: PdfReader, start_page: int, num_pages: int = DEFAULT_NUM_PAGES
|
238
433
|
) -> list[Element]:
|
239
434
|
"""
|
240
|
-
Splits the full pdf document into partial pages and sends a request
|
241
|
-
server.
|
242
|
-
|
243
|
-
Args:
|
244
|
-
full_docs (PdfReader): The full document to be split and requested.
|
245
|
-
start_page (int): The starting page number for splitting the document.
|
246
|
-
num_pages (int, optional): The number of pages to split the document
|
247
|
-
into.
|
248
|
-
Defaults to DEFAULT_NUMBER_OF_PAGE.
|
249
|
-
|
250
|
-
Returns:
|
251
|
-
response: The response from the server.
|
435
|
+
Splits the full pdf document into partial pages and sends a request.
|
252
436
|
"""
|
437
|
+
# Need to import here if not globally available
|
438
|
+
try:
|
439
|
+
from pypdf import PdfWriter
|
440
|
+
except ImportError:
|
441
|
+
raise ImportError("pypdf is required for PDF splitting. Please install it with `pip install pypdf`.")
|
442
|
+
|
253
443
|
merger = PdfWriter()
|
254
|
-
|
255
|
-
|
256
|
-
|
257
|
-
|
444
|
+
total_pages = len(full_docs.pages) # Use len(reader.pages) instead of get_num_pages()
|
445
|
+
end_page = min(start_page + num_pages, total_pages)
|
446
|
+
|
447
|
+
# Check if start_page is valid
|
448
|
+
if start_page >= total_pages:
|
449
|
+
parser_logger.warning(f"Start page {start_page} is out of bounds for document with {total_pages} pages.")
|
450
|
+
return []
|
451
|
+
|
452
|
+
# pypdf page indices are 0-based, slicing is exclusive of the end index
|
453
|
+
# PdfWriter.append() expects pages=(start, stop) where stop is exclusive.
|
454
|
+
# However, the example used pages=(start, end) which might behave differently depending on version?
|
455
|
+
# Let's stick to add_page for clarity if possible, or ensure append range is correct.
|
456
|
+
# merger.append(full_docs, pages=(start_page, end_page)) # This selects pages start_page..end_page-1
|
457
|
+
|
458
|
+
# Alternative using add_page loop (more explicit)
|
459
|
+
for i in range(start_page, end_page):
|
460
|
+
merger.add_page(full_docs.pages[i])
|
258
461
|
|
259
462
|
with io.BytesIO() as buffer:
|
260
463
|
merger.write(buffer)
|
261
464
|
buffer.seek(0)
|
262
|
-
|
465
|
+
# Need to provide a filename for the 'files' dict
|
466
|
+
return self._get_response({"document": ("partial_doc.pdf", buffer)}) # Provide a dummy filename
|
263
467
|
|
264
468
|
def _element_document(self, element: Element, start_page: int = 0) -> Document:
|
265
|
-
"""
|
266
|
-
|
267
|
-
|
268
|
-
|
269
|
-
|
270
|
-
|
271
|
-
|
469
|
+
"""Converts an element into a Document object."""
|
470
|
+
# parse_text now handles image path generation and data storage if needed
|
471
|
+
page_content = element.parse_text(self)
|
472
|
+
metadata: dict[str, object] = element.model_dump(
|
473
|
+
exclude={"content", "base64_encoding"}, exclude_none=True
|
474
|
+
) # Exclude raw content/base64
|
475
|
+
metadata["page"] = element.page + start_page # Adjust page number
|
476
|
+
# Base64 encoding is not added to metadata if it was processed into image_data
|
477
|
+
# Coordinates are kept if requested
|
478
|
+
if not self.coordinates:
|
479
|
+
metadata.pop("coordinates", None)
|
272
480
|
|
273
|
-
Returns:
|
274
|
-
A list containing a single Document object.
|
275
|
-
|
276
|
-
"""
|
277
|
-
metadata: dict[str, object] = element.model_dump(exclude_none=True)
|
278
|
-
metadata["page"] = element.page + start_page
|
279
481
|
return Document(
|
280
|
-
page_content=
|
482
|
+
page_content=page_content,
|
281
483
|
metadata=metadata,
|
282
484
|
)
|
283
485
|
|
284
486
|
def _page_document(self, elements: list[Element], start_page: int = 0) -> list[Document]:
|
285
|
-
"""
|
286
|
-
Combines elements with the same page number into a single Document object.
|
287
|
-
|
288
|
-
Args:
|
289
|
-
elements (List): A list of elements containing page numbers.
|
290
|
-
start_page (int): The starting page number for splitting the document.
|
291
|
-
This number starts from zero.
|
292
|
-
|
293
|
-
Returns:
|
294
|
-
List[Document]: A list of Document objects, each representing a page
|
295
|
-
with its content and metadata.
|
296
|
-
"""
|
487
|
+
"""Combines elements with the same page number into a single Document object."""
|
297
488
|
documents: list[Document] = []
|
298
|
-
|
299
|
-
|
300
|
-
|
489
|
+
if not elements:
|
490
|
+
return documents
|
491
|
+
|
492
|
+
# Group elements by page (relative to the current batch)
|
493
|
+
pages: list[int] = sorted(list(set(map(lambda x: x.page, elements))))
|
494
|
+
page_groups: Dict[int, list[Element]] = {page: [] for page in pages}
|
495
|
+
for element in elements:
|
496
|
+
page_groups[element.page].append(element)
|
497
|
+
|
498
|
+
for page_num, group in page_groups.items():
|
499
|
+
actual_page_num = page_num + start_page
|
500
|
+
page_content_parts: list[str] = []
|
501
|
+
page_coordinates: list[Coordinates] = []
|
502
|
+
# Base64 encodings are handled within parse_text now, not collected here
|
503
|
+
|
504
|
+
for element in sorted(group, key=lambda x: x.id): # Process elements in order
|
505
|
+
page_content_parts.append(element.parse_text(self))
|
506
|
+
if self.coordinates and element.coordinates:
|
507
|
+
page_coordinates.append({ # Store coordinates with element id/category for context
|
508
|
+
"id": element.id,
|
509
|
+
"category": element.category,
|
510
|
+
"coordinates": element.coordinates,
|
511
|
+
})
|
512
|
+
|
301
513
|
metadata: dict[str, object] = {
|
302
|
-
"page":
|
514
|
+
"page": actual_page_num,
|
303
515
|
}
|
304
|
-
if self.
|
305
|
-
metadata["
|
306
|
-
|
307
|
-
|
516
|
+
if self.coordinates and page_coordinates:
|
517
|
+
metadata["element_coordinates"] = page_coordinates # Changed key for clarity
|
518
|
+
|
519
|
+
# Combine content, typically with spaces or newlines
|
520
|
+
# Using newline might be better for readability if elements are paragraphs etc.
|
521
|
+
combined_page_content = "\n\n".join(part for part in page_content_parts if part) # Join non-empty parts
|
522
|
+
|
308
523
|
documents.append(
|
309
524
|
Document(
|
310
|
-
page_content=
|
525
|
+
page_content=combined_page_content,
|
311
526
|
metadata=metadata,
|
312
527
|
)
|
313
528
|
)
|
@@ -316,123 +531,175 @@ class UpstageDocumentParseParser(BaseBlobParser):
|
|
316
531
|
|
317
532
|
def lazy_parse(self, blob: Blob, is_batch: bool = False) -> Iterator[Document]:
|
318
533
|
"""
|
319
|
-
Lazily parses a document
|
320
|
-
|
534
|
+
Lazily parses a document blob.
|
535
|
+
|
536
|
+
Yields Document objects based on the specified split type.
|
537
|
+
If images are extracted (chatterer=None, base64_encoding=["figure"]),
|
538
|
+
the image data will be available in `self.image_data` after iteration.
|
321
539
|
|
322
540
|
Args:
|
323
|
-
blob (Blob): The input document blob to parse.
|
324
|
-
is_batch (bool, optional):
|
325
|
-
Defaults to False (
|
541
|
+
blob (Blob): The input document blob to parse. Requires `blob.path`.
|
542
|
+
is_batch (bool, optional): Currently affects PDF page batch size.
|
543
|
+
Defaults to False (process 1 page batch for PDF).
|
544
|
+
*Note: API might have limits regardless.*
|
326
545
|
|
327
546
|
Yields:
|
328
|
-
Document: The parsed document object.
|
547
|
+
Document: The parsed document object(s).
|
329
548
|
|
330
549
|
Raises:
|
331
|
-
ValueError: If
|
332
|
-
|
550
|
+
ValueError: If blob.path is not set, API error occurs, or invalid config.
|
551
|
+
ImportError: If pypdf is needed but not installed.
|
333
552
|
"""
|
553
|
+
# Clear image data at the start of parsing for this specific call
|
554
|
+
self.image_data = {}
|
334
555
|
|
335
|
-
if
|
336
|
-
|
337
|
-
|
338
|
-
|
556
|
+
if not blob.path:
|
557
|
+
# Non-PDF files and direct API calls require reading the file,
|
558
|
+
# PDF splitting also requires the path.
|
559
|
+
raise ValueError("Blob path is required for UpstageDocumentParseParser.")
|
339
560
|
|
340
|
-
|
561
|
+
# Try importing pypdf here, only if needed
|
562
|
+
PdfReader = None
|
563
|
+
PdfReadError = None
|
341
564
|
try:
|
342
|
-
|
343
|
-
|
344
|
-
except PdfReadError:
|
345
|
-
number_of_pages = 1
|
346
|
-
except Exception as e:
|
347
|
-
raise ValueError(f"Failed to read PDF file: {e}")
|
348
|
-
|
349
|
-
if self.split == "none":
|
350
|
-
result = ""
|
351
|
-
base64_encodings: list[str] = []
|
352
|
-
coordinates: list[list[Coordinate]] = []
|
353
|
-
|
354
|
-
if full_docs is not None:
|
355
|
-
start_page = 0
|
356
|
-
num_pages = DEFAULT_NUM_PAGES
|
357
|
-
for _ in range(number_of_pages):
|
358
|
-
if start_page >= number_of_pages:
|
359
|
-
break
|
565
|
+
from pypdf import PdfReader as PyPdfReader
|
566
|
+
from pypdf.errors import PdfReadError as PyPdfReadError
|
360
567
|
|
361
|
-
|
362
|
-
|
363
|
-
|
364
|
-
|
365
|
-
|
366
|
-
|
367
|
-
|
568
|
+
PdfReader = PyPdfReader
|
569
|
+
PdfReadError = PyPdfReadError
|
570
|
+
except ImportError:
|
571
|
+
# pypdf is optional here: it is only used to detect PDFs and split them into page batches.
|
572
|
+
# Let's attempt to read anyway, API might support non-PDFs directly.
|
573
|
+
# We'll check for PdfReader later if we determine it's a PDF.
|
574
|
+
pass
|
368
575
|
|
369
|
-
|
576
|
+
full_docs: Optional[PdfReader] = None
|
577
|
+
is_pdf = False
|
578
|
+
number_of_pages = 1 # Default for non-PDF or single-page docs
|
370
579
|
|
580
|
+
try:
|
581
|
+
# Check if it's a PDF by trying to open it
|
582
|
+
if PdfReader and PdfReadError:
|
583
|
+
try:
|
584
|
+
# Use strict=False to be more lenient with potentially corrupted PDFs
|
585
|
+
full_docs = PdfReader(str(blob.path), strict=False)
|
586
|
+
number_of_pages = len(full_docs.pages)
|
587
|
+
is_pdf = True
|
588
|
+
except (PdfReadError, FileNotFoundError, IsADirectoryError) as e:
|
589
|
+
parser_logger.warning(f"Could not read '{blob.path}' as PDF: {e}. Assuming non-PDF format.")
|
590
|
+
except Exception as e: # Catch other potential pypdf errors
|
591
|
+
parser_logger.error(f"Unexpected error reading PDF '{blob.path}': {e}")
|
592
|
+
raise ValueError(f"Failed to process PDF file: {e}") from e
|
371
593
|
else:
|
372
|
-
|
373
|
-
raise ValueError("Blob path is required for non-PDF files.")
|
594
|
+
parser_logger.info("pypdf not installed. Treating input as a single non-PDF document for the API.")
|
374
595
|
|
375
|
-
|
376
|
-
|
596
|
+
except Exception as e:
|
597
|
+
raise ValueError(f"Failed to access or identify file type for: {blob.path}. Error: {e}") from e
|
377
598
|
|
378
|
-
|
379
|
-
result += element.parse_text(self)
|
599
|
+
# --- Parsing Logic based on Split Type ---
|
380
600
|
|
381
|
-
|
382
|
-
|
383
|
-
|
384
|
-
|
385
|
-
|
386
|
-
|
387
|
-
|
388
|
-
|
389
|
-
|
601
|
+
# Case 1: No Splitting (Combine all content)
|
602
|
+
if self.split == "none":
|
603
|
+
combined_result = ""
|
604
|
+
all_coordinates: list[PageCoordinates] = []
|
605
|
+
# Base64 handled by parse_text, data stored in self.image_data
|
606
|
+
|
607
|
+
if is_pdf and full_docs and PdfReader: # Process PDF page by page or in batches
|
608
|
+
start_page = 0
|
609
|
+
# Use a reasonable batch size for 'none' split to avoid huge requests
|
610
|
+
batch_num_pages = DEFAULT_NUM_PAGES
|
611
|
+
while start_page < number_of_pages:
|
612
|
+
elements = self._split_and_request(full_docs, start_page, batch_num_pages)
|
613
|
+
for element in sorted(elements, key=lambda x: (x.page, x.id)):
|
614
|
+
combined_result += element.parse_text(self) + "\n\n" # Add separator
|
615
|
+
if self.coordinates and element.coordinates:
|
616
|
+
# Adjust page number for coordinates metadata
|
617
|
+
coords_with_page: PageCoordinates = {
|
618
|
+
"id": element.id,
|
619
|
+
"category": element.category,
|
620
|
+
"page": element.page + start_page, # Actual page
|
621
|
+
"coordinates": element.coordinates,
|
622
|
+
}
|
623
|
+
all_coordinates.append(coords_with_page)
|
624
|
+
start_page += batch_num_pages
|
625
|
+
else: # Process non-PDF file as a single unit
|
626
|
+
with open(blob.path, "rb") as f:
|
627
|
+
# Provide a filename for the 'files' dict
|
628
|
+
filename = os.path.basename(blob.path)
|
629
|
+
elements = self._get_response({"document": (filename, f)})
|
630
|
+
|
631
|
+
for element in sorted(elements, key=lambda x: x.id):
|
632
|
+
combined_result += element.parse_text(self) + "\n\n"
|
633
|
+
if self.coordinates and element.coordinates:
|
634
|
+
all_coordinates.append({
|
635
|
+
"id": element.id,
|
636
|
+
"category": element.category,
|
637
|
+
"page": element.page, # Page is relative to the single doc (usually 0 or 1)
|
638
|
+
"coordinates": element.coordinates,
|
639
|
+
})
|
640
|
+
|
641
|
+
metadata: dict[str, object] = {"source": blob.path, "total_pages": number_of_pages}
|
642
|
+
if self.coordinates and all_coordinates:
|
643
|
+
metadata["element_coordinates"] = all_coordinates
|
644
|
+
# self.image_data is populated, no need to add base64 to metadata
|
390
645
|
|
391
646
|
yield Document(
|
392
|
-
page_content=
|
647
|
+
page_content=combined_result.strip(),
|
393
648
|
metadata=metadata,
|
394
649
|
)
|
395
650
|
|
651
|
+
# Case 2: Split by Element
|
396
652
|
elif self.split == "element":
|
397
|
-
if full_docs
|
653
|
+
if is_pdf and full_docs and PdfReader:
|
398
654
|
start_page = 0
|
399
|
-
for
|
400
|
-
|
401
|
-
|
402
|
-
|
403
|
-
|
404
|
-
|
405
|
-
|
406
|
-
|
407
|
-
start_page +=
|
408
|
-
|
409
|
-
else:
|
410
|
-
if not blob.path:
|
411
|
-
raise ValueError("Blob path is required for non-PDF files.")
|
655
|
+
batch_num_pages = DEFAULT_NUM_PAGES if is_batch else 1 # Use smaller batches for element split?
|
656
|
+
while start_page < number_of_pages:
|
657
|
+
elements = self._split_and_request(full_docs, start_page, batch_num_pages)
|
658
|
+
for element in sorted(elements, key=lambda x: (x.page, x.id)):
|
659
|
+
# _element_document handles metadata and adjusts page number
|
660
|
+
doc = self._element_document(element, start_page)
|
661
|
+
_get_metadata_from_document(doc)["source"] = blob.path # Add source
|
662
|
+
yield doc
|
663
|
+
start_page += batch_num_pages
|
664
|
+
else: # Non-PDF
|
412
665
|
with open(blob.path, "rb") as f:
|
413
|
-
|
414
|
-
|
415
|
-
for element in elements:
|
416
|
-
|
417
|
-
|
666
|
+
filename = os.path.basename(blob.path)
|
667
|
+
elements = self._get_response({"document": (filename, f)})
|
668
|
+
for element in sorted(elements, key=lambda x: x.id):
|
669
|
+
doc = self._element_document(element, 0) # Start page is 0 for single doc
|
670
|
+
_get_metadata_from_document(doc)["source"] = blob.path # Add source
|
671
|
+
yield doc
|
672
|
+
|
673
|
+
# Case 3: Split by Page
|
418
674
|
elif self.split == "page":
|
419
|
-
if full_docs
|
675
|
+
if is_pdf and full_docs and PdfReader:
|
420
676
|
start_page = 0
|
421
|
-
|
422
|
-
|
423
|
-
|
424
|
-
|
425
|
-
|
426
|
-
|
427
|
-
|
428
|
-
|
429
|
-
|
430
|
-
|
431
|
-
raise ValueError("Blob path is required for non-PDF files.")
|
677
|
+
batch_num_pages = DEFAULT_NUM_PAGES if is_batch else 1 # Process page-by-page if not is_batch
|
678
|
+
while start_page < number_of_pages:
|
679
|
+
elements = self._split_and_request(full_docs, start_page, batch_num_pages)
|
680
|
+
# _page_document groups elements by page and creates Documents
|
681
|
+
page_docs = self._page_document(elements, start_page)
|
682
|
+
for doc in page_docs:
|
683
|
+
_get_metadata_from_document(doc)["source"] = blob.path # Add source
|
684
|
+
yield doc
|
685
|
+
start_page += batch_num_pages
|
686
|
+
else: # Non-PDF (treat as single page)
|
432
687
|
with open(blob.path, "rb") as f:
|
433
|
-
|
434
|
-
|
435
|
-
|
688
|
+
filename = os.path.basename(blob.path)
|
689
|
+
elements = self._get_response({"document": (filename, f)})
|
690
|
+
page_docs = self._page_document(elements, 0) # Process elements as page 0
|
691
|
+
for doc in page_docs:
|
692
|
+
_get_metadata_from_document(doc)["source"] = blob.path # Add source
|
693
|
+
yield doc
|
436
694
|
|
437
695
|
else:
|
438
696
|
raise ValueError(f"Invalid split type: {self.split}")
|
697
|
+
|
698
|
+
|
699
|
+
def _get_metadata_from_document(doc: Document) -> dict[object, object]:
|
700
|
+
"""
|
701
|
+
Helper that returns the metadata dict of a Document (the same object, so callers can update it in place).
|
702
|
+
This is a placeholder and should be adjusted based on actual metadata structure.
|
703
|
+
"""
|
704
|
+
metadata: dict[object, object] = doc.metadata # pyright: ignore[reportUnknownMemberType]
|
705
|
+
return metadata
|