agno 1.7.8__py3-none-any.whl → 1.7.10__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,12 +1,13 @@
1
1
  import asyncio
2
+ import re
2
3
  from pathlib import Path
3
- from typing import IO, Any, List, Optional, Union
4
+ from typing import IO, Any, List, Optional, Tuple, Union
4
5
  from uuid import uuid4
5
6
 
6
7
  from agno.document.base import Document
7
8
  from agno.document.reader.base import Reader
8
9
  from agno.utils.http import async_fetch_with_retry, fetch_with_retry
9
- from agno.utils.log import log_info, logger
10
+ from agno.utils.log import log_error, log_info, logger
10
11
 
11
12
  try:
12
13
  from pypdf import PdfReader as DocumentReader # noqa: F401
@@ -15,7 +16,13 @@ except ImportError:
15
16
  raise ImportError("`pypdf` not installed. Please install it via `pip install pypdf`.")
16
17
 
17
18
 
18
- def process_image_page(doc_name: str, page_number: int, page: Any) -> Document:
19
# Default marker placed before a page's text; `{page_nr}` is filled in via str.format.
PAGE_START_NUMBERING_FORMAT_DEFAULT = "<start page {page_nr}>"
# Default marker placed after a page's text; `{page_nr}` is filled in via str.format.
PAGE_END_NUMBERING_FORMAT_DEFAULT = "<end page {page_nr}>"
# Minimum fraction of pages whose detected number must fit the best candidate
# sequence before page numbers are stripped and the start/end markers are added.
PAGE_NUMBERING_CORRECTNESS_RATIO_FOR_REMOVAL = 0.4
22
+
23
+
24
+ def _ocr_reader(page: Any) -> str:
25
+ """A single PDF page object."""
19
26
  try:
20
27
  import rapidocr_onnxruntime as rapidocr
21
28
  except ImportError:
@@ -23,7 +30,6 @@ def process_image_page(doc_name: str, page_number: int, page: Any) -> Document:
23
30
  "`rapidocr_onnxruntime` not installed. Please install it via `pip install rapidocr_onnxruntime`."
24
31
  )
25
32
  ocr = rapidocr.RapidOCR()
26
- page_text = page.extract_text() or ""
27
33
  images_text_list = []
28
34
 
29
35
  # Extract and process images
@@ -34,22 +40,13 @@ def process_image_page(doc_name: str, page_number: int, page: Any) -> Document:
34
40
  ocr_result, elapse = ocr(image_data)
35
41
 
36
42
  # Extract text from OCR result
37
- if ocr_result:
38
- images_text_list += [item[1] for item in ocr_result]
43
+ images_text_list += [item[1] for item in ocr_result] if ocr_result else []
39
44
 
40
- images_text = "\n".join(images_text_list)
41
- content = page_text + "\n" + images_text
42
-
43
- # Append the document
44
- return Document(
45
- name=doc_name,
46
- id=str(uuid4()),
47
- meta_data={"page": page_number},
48
- content=content,
49
- )
45
+ return "\n".join(images_text_list)
50
46
 
51
47
 
52
- async def async_process_image_page(doc_name: str, page_number: int, page: Any) -> Document:
48
+ async def _async_ocr_reader(page: Any) -> str:
49
+ """page: A single PDF page object."""
53
50
  try:
54
51
  import rapidocr_onnxruntime as rapidocr
55
52
  except ImportError:
@@ -58,9 +55,6 @@ async def async_process_image_page(doc_name: str, page_number: int, page: Any) -
58
55
  )
59
56
  ocr = rapidocr.RapidOCR()
60
57
 
61
- page_text = page.extract_text() or ""
62
- images_text_list: List = []
63
-
64
58
  # Process images in parallel
65
59
  async def process_image(image_data: bytes) -> List[str]:
66
60
  ocr_result, _ = ocr(image_data)
@@ -69,32 +63,250 @@ async def async_process_image_page(doc_name: str, page_number: int, page: Any) -
69
63
  image_tasks = [process_image(image.data) for image in page.images]
70
64
  images_results = await asyncio.gather(*image_tasks)
71
65
 
66
+ images_text_list: List = []
72
67
  for result in images_results:
73
68
  images_text_list.extend(result)
74
69
 
75
70
  images_text = "\n".join(images_text_list)
76
- content = page_text + "\n" + images_text
77
-
78
- return Document(
79
- name=doc_name,
80
- id=str(uuid4()),
81
- meta_data={"page": page_number},
82
- content=content,
71
+ return images_text
72
+
73
+
74
def _clean_page_numbers(
    page_content_list: List[str],
    extra_content: Optional[List[str]] = None,
    page_start_numbering_format: str = PAGE_START_NUMBERING_FORMAT_DEFAULT,
    page_end_numbering_format: str = PAGE_END_NUMBERING_FORMAT_DEFAULT,
) -> Tuple[List[str], Optional[int]]:
    """
    Identifies and removes or reformats page numbers from a list of PDF page contents, based on the most
    consistent sequential numbering.

    Args:
        page_content_list (List[str]): A list of strings where each string represents the content of a PDF page.
            The list is updated in place and also returned.
        extra_content (Optional[List[str]]): Per-page strings appended after the main content (for example text
            extracted from images). Must be empty/None or have the same length as `page_content_list`.
            Empty entries are skipped, so no stray newline is added for pages without extra text.
        page_start_numbering_format (str): A format string to prepend to the page content, with `{page_nr}` as a
            placeholder for the page number. Defaults to `PAGE_START_NUMBERING_FORMAT_DEFAULT`.
            Make it an empty string to omit the start marker.
        page_end_numbering_format (str): A format string to append to the page content, with `{page_nr}` as a
            placeholder for the page number. Defaults to `PAGE_END_NUMBERING_FORMAT_DEFAULT`.
            Make it an empty string to omit the end marker.

    Returns:
        List[str]: The page contents. When a numbering sequence is detected, the page numbers are removed and the
            configured markers are added. Extra content is appended in every case, including when no numbering
            is detected.
        Optional[int]: The shift for the page numbering, one of (-2, -1, 0, 1, 2), or None when no sufficiently
            consistent numbering was found.

    Notes:
        - The function scans for page numbers using a regular expression that matches digits at the start or end
          of a string.
        - It evaluates several potential starting points for numbering (-2, -1, 0, 1, 2 shifts) to determine the
          most consistent sequence.
        - Page numbers are only processed if at least `PAGE_NUMBERING_CORRECTNESS_RATIO_FOR_REMOVAL` of the pages
          fit the best sequence.
    """
    # A None default avoids sharing one mutable list between calls.
    extras: List[str] = [] if extra_content is None else extra_content
    assert len(extras) == 0 or len(extras) == len(page_content_list), (
        "Please provide an equally sized list of extra content if provided."
    )

    # Regex to match potential page numbers at the start or end of a string
    page_number_regex = re.compile(r"^\s*(\d+)\s*|\s*(\d+)\s*$")

    def find_page_number(content: str) -> Optional[int]:
        match = page_number_regex.search(content)
        if match:
            return int(match.group(1) or match.group(2))
        return None

    def append_extra(index: int, text: str) -> str:
        # Only add the separator when there is actual extra text for this page.
        extra = extras[index] if extras else ""
        return f"{text}\n{extra}" if extra else text

    def append_extra_to_all_pages() -> None:
        # Used on the "no numbering detected" paths so extra content (e.g. OCR text) is never lost.
        if extras:
            for index, text in enumerate(page_content_list):
                page_content_list[index] = append_extra(index, text)

    page_numbers = [find_page_number(content) for content in page_content_list]
    if all(x is None or x > 5 for x in page_numbers):
        # This approach won't work reliably for higher page numbers.
        append_extra_to_all_pages()
        return page_content_list, None

    # Possible range shifts to detect page numbering
    range_shifts = [-2, -1, 0, 1, 2]
    best_match, best_correct_count, best_shift = _identify_best_page_sequence(page_numbers, range_shifts)

    # Require at least ..% of the pages to have correct sequential numbering
    if not best_match or best_correct_count / len(page_numbers) < PAGE_NUMBERING_CORRECTNESS_RATIO_FOR_REMOVAL:
        append_extra_to_all_pages()
        return page_content_list, None

    for i, expected_number in enumerate(best_match):
        # Remove the page number from the start and/or end of the content.
        number_pattern = re.escape(str(expected_number))
        page_text = re.sub(rf"^\s*{number_pattern}\s*|\s*{number_pattern}\s*$", "", page_content_list[i])

        page_start = (
            page_start_numbering_format.format(page_nr=expected_number) + "\n"
            if page_start_numbering_format
            else ""
        )
        page_end = (
            "\n" + page_end_numbering_format.format(page_nr=expected_number) if page_end_numbering_format else ""
        )

        # Add formatted page numbering if configured.
        page_content_list[i] = page_start + append_extra(i, page_text) + page_end

    return page_content_list, best_shift
149
+
150
+
151
+ def _identify_best_page_sequence(page_numbers, range_shifts):
152
+ best_match = None
153
+ best_shift: Optional[int] = None
154
+ best_correct_count = 0
155
+
156
+ for shift in range_shifts:
157
+ expected_numbers = [i + shift for i in range(len(page_numbers))]
158
+ # Check if expected number occurs (or that the expected "2" occurs in an incorrectly merged number like 25,
159
+ # where 2 is the page number and 5 is part of the PDF content).
160
+ correct_count = sum(
161
+ 1
162
+ for actual, expected in zip(page_numbers, expected_numbers)
163
+ if actual == expected or str(actual).startswith(str(expected)) or str(actual).endswith(str(expected))
164
+ )
165
+
166
+ if correct_count > best_correct_count:
167
+ best_correct_count = correct_count
168
+ best_match = expected_numbers
169
+ best_shift = shift
170
+
171
+ return best_match, best_correct_count, best_shift
172
+
85
173
 
86
174
class BasePDFReader(Reader):
    """Shared base for the PDF readers.

    Stores the page-splitting, page-marker and password options, and turns an opened
    pypdf reader into `Document` objects (optionally chunked).
    """

    def __init__(
        self,
        split_on_pages: bool = True,
        page_start_numbering_format: Optional[str] = None,
        page_end_numbering_format: Optional[str] = None,
        password: Optional[str] = None,
        **kwargs,
    ):
        """
        Args:
            split_on_pages: When True one Document is created per page; otherwise all pages are joined
                into a single Document.
            page_start_numbering_format: Marker format placed before each page. None selects
                `PAGE_START_NUMBERING_FORMAT_DEFAULT`.
            page_end_numbering_format: Marker format placed after each page. None selects
                `PAGE_END_NUMBERING_FORMAT_DEFAULT`.
            password: Default password used for encrypted PDFs when none is passed per call.
            **kwargs: Forwarded to `Reader.__init__`.
        """
        if page_start_numbering_format is None:
            page_start_numbering_format = PAGE_START_NUMBERING_FORMAT_DEFAULT
        if page_end_numbering_format is None:
            page_end_numbering_format = PAGE_END_NUMBERING_FORMAT_DEFAULT

        self.split_on_pages = split_on_pages
        self.page_start_numbering_format = page_start_numbering_format
        self.page_end_numbering_format = page_end_numbering_format
        self.password = password

        super().__init__(**kwargs)

    def _build_chunked_documents(self, documents: List[Document]) -> List[Document]:
        """Run `self.chunk_document` on every document and return the flattened result."""
        chunked_documents: List[Document] = []
        for document in documents:
            chunked_documents.extend(self.chunk_document(document))
        return chunked_documents

    def _decrypt_pdf(self, doc_reader: DocumentReader, doc_name: str, password: Optional[str] = None) -> bool:
        """Decrypt `doc_reader` if it is encrypted.

        Returns:
            True when the PDF is not encrypted or was decrypted successfully; False when no password
            is available, the password is rejected, or decryption raises.
        """
        if not doc_reader.is_encrypted:
            return True

        # Use provided password or fall back to instance password
        pdf_password = password or self.password
        if not pdf_password:
            log_error(f"PDF {doc_name} is password protected but no password provided")
            return False

        try:
            decrypted_pdf = doc_reader.decrypt(pdf_password)
            if decrypted_pdf:
                log_info(f"Successfully decrypted PDF {doc_name} with user password")
                return True
            else:
                log_error(f"Failed to decrypt PDF {doc_name}: incorrect password")
                return False
        except Exception as e:
            # Best-effort: any decryption failure is reported and treated as "could not decrypt".
            log_error(f"Error decrypting PDF {doc_name}: {e}")
            return False

    def _create_documents(
        self,
        pdf_content: List[str],
        doc_name: str,
        use_uuid_for_id: bool,
        page_number_shift: Optional[int],
    ) -> List[Document]:
        """Build Documents from per-page text, then chunk them if `self.chunk` is set.

        Args:
            pdf_content: Text of each page, in order.
            doc_name: Name stored on every Document and used to derive non-UUID ids.
            use_uuid_for_id: When True ids are random UUIDs; otherwise they derive from `doc_name`.
            page_number_shift: Number assigned to the first page; None means pages start at 1.
        """
        if self.split_on_pages:
            shift = page_number_shift if page_number_shift is not None else 1
            documents: List[Document] = []
            for page_number, page_content in enumerate(pdf_content, start=shift):
                documents.append(
                    Document(
                        name=doc_name,
                        id=(str(uuid4()) if use_uuid_for_id else f"{doc_name}_{page_number}"),
                        meta_data={"page": page_number},
                        content=page_content,
                    )
                )
        else:
            pdf_content_str = "\n".join(pdf_content)
            document = Document(
                name=doc_name,
                id=str(uuid4()) if use_uuid_for_id else doc_name,
                meta_data={},
                content=pdf_content_str,
            )
            documents = [document]

        if self.chunk:
            return self._build_chunked_documents(documents)

        return documents

    def _pdf_reader_to_documents(
        self,
        doc_reader: DocumentReader,
        doc_name: str,
        read_images: bool = False,
        use_uuid_for_id: bool = False,
    ) -> List[Document]:
        """Extract text (and, when `read_images` is set, OCR text) from every page and build Documents."""
        pdf_content: List[str] = []
        pdf_images_text: List[str] = []
        for page in doc_reader.pages:
            # Guard against a missing extraction result so the regex-based cleaning always gets a str.
            pdf_content.append(page.extract_text() or "")
            if read_images:
                pdf_images_text.append(_ocr_reader(page))

        pdf_content, shift = _clean_page_numbers(
            page_content_list=pdf_content,
            extra_content=pdf_images_text,
            page_start_numbering_format=self.page_start_numbering_format,
            page_end_numbering_format=self.page_end_numbering_format,
        )
        return self._create_documents(pdf_content, doc_name, use_uuid_for_id, shift)

    async def _async_pdf_reader_to_documents(
        self,
        doc_reader: DocumentReader,
        doc_name: str,
        read_images: bool = False,
        use_uuid_for_id: bool = False,
    ) -> List[Document]:
        """Async counterpart of `_pdf_reader_to_documents`; image OCR is awaited per page."""

        async def _read_pdf_page(page: Any, read_images: bool) -> Tuple[str, str]:
            # We tried "asyncio.to_thread(page.extract_text)", but it maintains state internally, which leads to issues.
            # Guard against a missing extraction result so the regex-based cleaning always gets a str.
            page_text = page.extract_text() or ""

            if read_images:
                pdf_images_text = await _async_ocr_reader(page)
            else:
                pdf_images_text = ""

            return page_text, pdf_images_text

        # Process pages using asyncio.gather; results keep the page order.
        pdf_content: List[Tuple[str, str]] = await asyncio.gather(
            *[_read_pdf_page(page, read_images) for page in doc_reader.pages]
        )

        pdf_content_clean, shift = _clean_page_numbers(
            page_content_list=[x[0] for x in pdf_content],
            extra_content=[x[1] for x in pdf_content],
            page_start_numbering_format=self.page_start_numbering_format,
            page_end_numbering_format=self.page_end_numbering_format,
        )

        return self._create_documents(pdf_content_clean, doc_name, use_uuid_for_id, shift)
304
+
93
305
 
94
306
  class PDFReader(BasePDFReader):
95
307
  """Reader for PDF files"""
96
308
 
97
- def read(self, pdf: Union[str, Path, IO[Any]]) -> List[Document]:
309
+ def read(self, pdf: Union[str, Path, IO[Any]], password: Optional[str] = None) -> List[Document]:
98
310
  try:
99
311
  if isinstance(pdf, str):
100
312
  doc_name = pdf.split("/")[-1].split(".")[0].replace(" ", "_")
@@ -106,26 +318,19 @@ class PDFReader(BasePDFReader):
106
318
  log_info(f"Reading: {doc_name}")
107
319
 
108
320
  try:
109
- doc_reader = DocumentReader(pdf)
321
+ pdf_reader = DocumentReader(pdf)
110
322
  except PdfStreamError as e:
111
323
  logger.error(f"Error reading PDF: {e}")
112
324
  return []
113
325
 
114
- documents = []
115
- for page_number, page in enumerate(doc_reader.pages, start=1):
116
- documents.append(
117
- Document(
118
- name=doc_name,
119
- id=str(uuid4()),
120
- meta_data={"page": page_number},
121
- content=page.extract_text(),
122
- )
123
- )
124
- if self.chunk:
125
- return self._build_chunked_documents(documents)
126
- return documents
326
+ # Handle PDF decryption
327
+ if not self._decrypt_pdf(pdf_reader, doc_name, password):
328
+ return []
127
329
 
128
- async def async_read(self, pdf: Union[str, Path, IO[Any]]) -> List[Document]:
330
+ # Read and chunk.
331
+ return self._pdf_reader_to_documents(pdf_reader, doc_name, use_uuid_for_id=True)
332
+
333
+ async def async_read(self, pdf: Union[str, Path, IO[Any]], password: Optional[str] = None) -> List[Document]:
129
334
  try:
130
335
  if isinstance(pdf, str):
131
336
  doc_name = pdf.split("/")[-1].split(".")[0].replace(" ", "_")
@@ -137,40 +342,27 @@ class PDFReader(BasePDFReader):
137
342
  log_info(f"Reading: {doc_name}")
138
343
 
139
344
  try:
140
- doc_reader = DocumentReader(pdf)
345
+ pdf_reader = DocumentReader(pdf)
141
346
  except PdfStreamError as e:
142
347
  logger.error(f"Error reading PDF: {e}")
143
348
  return []
144
349
 
145
- async def _process_document(doc_name: str, page_number: int, page: Any) -> Document:
146
- return Document(
147
- name=doc_name,
148
- id=str(uuid4()),
149
- meta_data={"page": page_number},
150
- content=page.extract_text(),
151
- )
152
-
153
- # Process pages in parallel using asyncio.gather
154
- documents = await asyncio.gather(
155
- *[
156
- _process_document(doc_name, page_number, page)
157
- for page_number, page in enumerate(doc_reader.pages, start=1)
158
- ]
159
- )
350
+ # Handle PDF decryption
351
+ if not self._decrypt_pdf(pdf_reader, doc_name, password):
352
+ return []
160
353
 
161
- if self.chunk:
162
- return self._build_chunked_documents(documents)
163
- return documents
354
+ # Read and chunk.
355
+ return await self._async_pdf_reader_to_documents(pdf_reader, doc_name, use_uuid_for_id=True)
164
356
 
165
357
 
166
358
  class PDFUrlReader(BasePDFReader):
167
359
  """Reader for PDF files from URL"""
168
360
 
169
- def __init__(self, proxy: Optional[str] = None, **kwargs):
170
- super().__init__(**kwargs)
361
+ def __init__(self, proxy: Optional[str] = None, password: Optional[str] = None, **kwargs):
362
+ super().__init__(password=password, **kwargs)
171
363
  self.proxy = proxy
172
364
 
173
- def read(self, url: str) -> List[Document]:
365
+ def read(self, url: str, password: Optional[str] = None) -> List[Document]:
174
366
  if not url:
175
367
  raise ValueError("No url provided")
176
368
 
@@ -182,23 +374,16 @@ class PDFUrlReader(BasePDFReader):
182
374
  response = fetch_with_retry(url, proxy=self.proxy)
183
375
 
184
376
  doc_name = url.split("/")[-1].split(".")[0].replace("/", "_").replace(" ", "_")
185
- doc_reader = DocumentReader(BytesIO(response.content))
186
-
187
- documents = []
188
- for page_number, page in enumerate(doc_reader.pages, start=1):
189
- documents.append(
190
- Document(
191
- name=doc_name,
192
- id=f"{doc_name}_{page_number}",
193
- meta_data={"page": page_number},
194
- content=page.extract_text(),
195
- )
196
- )
197
- if self.chunk:
198
- return self._build_chunked_documents(documents)
199
- return documents
377
+ pdf_reader = DocumentReader(BytesIO(response.content))
378
+
379
+ # Handle PDF decryption
380
+ if not self._decrypt_pdf(pdf_reader, doc_name, password):
381
+ return []
200
382
 
201
- async def async_read(self, url: str) -> List[Document]:
383
+ # Read and chunk.
384
+ return self._pdf_reader_to_documents(pdf_reader, doc_name, use_uuid_for_id=False)
385
+
386
+ async def async_read(self, url: str, password: Optional[str] = None) -> List[Document]:
202
387
  if not url:
203
388
  raise ValueError("No url provided")
204
389
 
@@ -213,33 +398,20 @@ class PDFUrlReader(BasePDFReader):
213
398
  response = await async_fetch_with_retry(url, client=client)
214
399
 
215
400
  doc_name = url.split("/")[-1].split(".")[0].replace("/", "_").replace(" ", "_")
216
- doc_reader = DocumentReader(BytesIO(response.content))
217
-
218
- async def _process_document(doc_name: str, page_number: int, page: Any) -> Document:
219
- return Document(
220
- name=doc_name,
221
- id=f"{doc_name}_{page_number}",
222
- meta_data={"page": page_number},
223
- content=page.extract_text(),
224
- )
401
+ pdf_reader = DocumentReader(BytesIO(response.content))
225
402
 
226
- # Process pages in parallel using asyncio.gather
227
- documents = await asyncio.gather(
228
- *[
229
- _process_document(doc_name, page_number, page)
230
- for page_number, page in enumerate(doc_reader.pages, start=1)
231
- ]
232
- )
403
+ # Handle PDF decryption
404
+ if not self._decrypt_pdf(pdf_reader, doc_name, password):
405
+ return []
233
406
 
234
- if self.chunk:
235
- return self._build_chunked_documents(documents)
236
- return documents
407
+ # Read and chunk.
408
+ return await self._async_pdf_reader_to_documents(pdf_reader, doc_name, use_uuid_for_id=False)
237
409
 
238
410
 
239
411
  class PDFImageReader(BasePDFReader):
240
412
  """Reader for PDF files with text and images extraction"""
241
413
 
242
- def read(self, pdf: Union[str, Path, IO[Any]]) -> List[Document]:
414
+ def read(self, pdf: Union[str, Path, IO[Any]], password: Optional[str] = None) -> List[Document]:
243
415
  if not pdf:
244
416
  raise ValueError("No pdf provided")
245
417
 
@@ -252,18 +424,16 @@ class PDFImageReader(BasePDFReader):
252
424
  doc_name = "pdf"
253
425
 
254
426
  log_info(f"Reading: {doc_name}")
255
- doc_reader = DocumentReader(pdf)
427
+ pdf_reader = DocumentReader(pdf)
256
428
 
257
- documents = []
258
- for page_number, page in enumerate(doc_reader.pages, start=1):
259
- documents.append(process_image_page(doc_name, page_number, page))
260
-
261
- if self.chunk:
262
- return self._build_chunked_documents(documents)
429
+ # Handle PDF decryption
430
+ if not self._decrypt_pdf(pdf_reader, doc_name, password):
431
+ return []
263
432
 
264
- return documents
433
+ # Read and chunk.
434
+ return self._pdf_reader_to_documents(pdf_reader, doc_name, read_images=True, use_uuid_for_id=False)
265
435
 
266
- async def async_read(self, pdf: Union[str, Path, IO[Any]]) -> List[Document]:
436
+ async def async_read(self, pdf: Union[str, Path, IO[Any]], password: Optional[str] = None) -> List[Document]:
267
437
  if not pdf:
268
438
  raise ValueError("No pdf provided")
269
439
 
@@ -276,28 +446,24 @@ class PDFImageReader(BasePDFReader):
276
446
  doc_name = "pdf"
277
447
 
278
448
  log_info(f"Reading: {doc_name}")
279
- doc_reader = DocumentReader(pdf)
449
+ pdf_reader = DocumentReader(pdf)
280
450
 
281
- documents = await asyncio.gather(
282
- *[
283
- async_process_image_page(doc_name, page_number, page)
284
- for page_number, page in enumerate(doc_reader.pages, start=1)
285
- ]
286
- )
451
+ # Handle PDF decryption
452
+ if not self._decrypt_pdf(pdf_reader, doc_name, password):
453
+ return []
287
454
 
288
- if self.chunk:
289
- return self._build_chunked_documents(documents)
290
- return documents
455
+ # Read and chunk.
456
+ return await self._async_pdf_reader_to_documents(pdf_reader, doc_name, read_images=True, use_uuid_for_id=False)
291
457
 
292
458
 
293
459
  class PDFUrlImageReader(BasePDFReader):
294
460
  """Reader for PDF files from URL with text and images extraction"""
295
461
 
296
- def __init__(self, proxy: Optional[str] = None, **kwargs):
297
- super().__init__(**kwargs)
462
+ def __init__(self, proxy: Optional[str] = None, password: Optional[str] = None, **kwargs):
463
+ super().__init__(password=password, **kwargs)
298
464
  self.proxy = proxy
299
465
 
300
- def read(self, url: str) -> List[Document]:
466
+ def read(self, url: str, password: Optional[str] = None) -> List[Document]:
301
467
  if not url:
302
468
  raise ValueError("No url provided")
303
469
 
@@ -310,19 +476,16 @@ class PDFUrlImageReader(BasePDFReader):
310
476
  response = httpx.get(url, proxy=self.proxy) if self.proxy else httpx.get(url)
311
477
 
312
478
  doc_name = url.split("/")[-1].split(".")[0].replace(" ", "_")
313
- doc_reader = DocumentReader(BytesIO(response.content))
314
-
315
- documents = []
316
- for page_number, page in enumerate(doc_reader.pages, start=1):
317
- documents.append(process_image_page(doc_name, page_number, page))
479
+ pdf_reader = DocumentReader(BytesIO(response.content))
318
480
 
319
- # Optionally chunk documents
320
- if self.chunk:
321
- return self._build_chunked_documents(documents)
481
+ # Handle PDF decryption
482
+ if not self._decrypt_pdf(pdf_reader, doc_name, password):
483
+ return []
322
484
 
323
- return documents
485
+ # Read and chunk.
486
+ return self._pdf_reader_to_documents(pdf_reader, doc_name, read_images=True, use_uuid_for_id=False)
324
487
 
325
- async def async_read(self, url: str) -> List[Document]:
488
+ async def async_read(self, url: str, password: Optional[str] = None) -> List[Document]:
326
489
  if not url:
327
490
  raise ValueError("No url provided")
328
491
 
@@ -338,15 +501,11 @@ class PDFUrlImageReader(BasePDFReader):
338
501
  response.raise_for_status()
339
502
 
340
503
  doc_name = url.split("/")[-1].split(".")[0].replace(" ", "_")
341
- doc_reader = DocumentReader(BytesIO(response.content))
504
+ pdf_reader = DocumentReader(BytesIO(response.content))
342
505
 
343
- documents = await asyncio.gather(
344
- *[
345
- async_process_image_page(doc_name, page_number, page)
346
- for page_number, page in enumerate(doc_reader.pages, start=1)
347
- ]
348
- )
506
+ # Handle PDF decryption
507
+ if not self._decrypt_pdf(pdf_reader, doc_name, password):
508
+ return []
349
509
 
350
- if self.chunk:
351
- return self._build_chunked_documents(documents)
352
- return documents
510
+ # Read and chunk.
511
+ return await self._async_pdf_reader_to_documents(pdf_reader, doc_name, read_images=True, use_uuid_for_id=False)