agno 1.7.8__py3-none-any.whl → 1.7.10__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- agno/agent/agent.py +33 -27
- agno/document/reader/pdf_reader.py +302 -143
- agno/knowledge/agent.py +68 -72
- agno/knowledge/pdf.py +32 -8
- agno/knowledge/pdf_url.py +13 -5
- agno/models/openai/responses.py +30 -1
- agno/run/response.py +10 -0
- agno/run/team.py +10 -0
- agno/team/team.py +39 -20
- agno/tools/aws_lambda.py +10 -0
- agno/tools/github.py +54 -18
- agno/vectordb/lancedb/lance_db.py +10 -2
- agno/vectordb/pgvector/pgvector.py +3 -0
- agno/vectordb/weaviate/weaviate.py +84 -18
- {agno-1.7.8.dist-info → agno-1.7.10.dist-info}/METADATA +2 -1
- {agno-1.7.8.dist-info → agno-1.7.10.dist-info}/RECORD +20 -20
- {agno-1.7.8.dist-info → agno-1.7.10.dist-info}/WHEEL +0 -0
- {agno-1.7.8.dist-info → agno-1.7.10.dist-info}/entry_points.txt +0 -0
- {agno-1.7.8.dist-info → agno-1.7.10.dist-info}/licenses/LICENSE +0 -0
- {agno-1.7.8.dist-info → agno-1.7.10.dist-info}/top_level.txt +0 -0
agno/document/reader/pdf_reader.py

```diff
@@ -1,12 +1,13 @@
 import asyncio
+import re
 from pathlib import Path
-from typing import IO, Any, List, Optional, Union
+from typing import IO, Any, List, Optional, Tuple, Union
 from uuid import uuid4

 from agno.document.base import Document
 from agno.document.reader.base import Reader
 from agno.utils.http import async_fetch_with_retry, fetch_with_retry
-from agno.utils.log import log_info, logger
+from agno.utils.log import log_error, log_info, logger

 try:
     from pypdf import PdfReader as DocumentReader  # noqa: F401
```
```diff
@@ -15,7 +16,13 @@ except ImportError:
     raise ImportError("`pypdf` not installed. Please install it via `pip install pypdf`.")


-def process_image_page(doc_name: str, page_number: int, page: Any) -> Document:
+PAGE_START_NUMBERING_FORMAT_DEFAULT = "<start page {page_nr}>"
+PAGE_END_NUMBERING_FORMAT_DEFAULT = "<end page {page_nr}>"
+PAGE_NUMBERING_CORRECTNESS_RATIO_FOR_REMOVAL = 0.4
+
+
+def _ocr_reader(page: Any) -> str:
+    """A single PDF page object."""
     try:
         import rapidocr_onnxruntime as rapidocr
     except ImportError:
```
```diff
@@ -23,7 +30,6 @@ def process_image_page(doc_name: str, page_number: int, page: Any) -> Document:
             "`rapidocr_onnxruntime` not installed. Please install it via `pip install rapidocr_onnxruntime`."
         )
     ocr = rapidocr.RapidOCR()
-    page_text = page.extract_text() or ""
     images_text_list = []

     # Extract and process images
```
```diff
@@ -34,22 +40,13 @@ def process_image_page(doc_name: str, page_number: int, page: Any) -> Document:
         ocr_result, elapse = ocr(image_data)

         # Extract text from OCR result
-        if ocr_result:
-            images_text_list += [item[1] for item in ocr_result]
+        images_text_list += [item[1] for item in ocr_result] if ocr_result else []

-    images_text = "\n".join(images_text_list)
-    content = page_text + "\n" + images_text
-
-    # Append the document
-    return Document(
-        name=doc_name,
-        id=str(uuid4()),
-        meta_data={"page": page_number},
-        content=content,
-    )
+    return "\n".join(images_text_list)


-async def async_process_image_page(doc_name: str, page_number: int, page: Any) -> Document:
+async def _async_ocr_reader(page: Any) -> str:
+    """page: A single PDF page object."""
     try:
         import rapidocr_onnxruntime as rapidocr
     except ImportError:
```
```diff
@@ -58,9 +55,6 @@ async def async_process_image_page(doc_name: str, page_number: int, page: Any) -> Document:
         )
     ocr = rapidocr.RapidOCR()

-    page_text = page.extract_text() or ""
-    images_text_list: List = []
-
     # Process images in parallel
     async def process_image(image_data: bytes) -> List[str]:
         ocr_result, _ = ocr(image_data)
```
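The OCR helpers above were refactored from `process_image_page`/`async_process_image_page`, which built one `Document` per page, into `_ocr_reader`/`_async_ocr_reader`, which return only the OCR text of a page's embedded images. A minimal, hedged sketch of calling the sync helper directly; it is a private helper, and the file name here is hypothetical:

```python
from pypdf import PdfReader

from agno.document.reader.pdf_reader import _ocr_reader

# Requires `pypdf` and `rapidocr_onnxruntime` to be installed.
reader = PdfReader("scanned.pdf")  # hypothetical file with embedded images

# Returns the OCR text of the page's images joined by newlines ("" if none);
# the caller now combines it with page.extract_text() and builds Documents.
image_text = _ocr_reader(reader.pages[0])
print(image_text)
```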
```diff
@@ -69,32 +63,250 @@ async def async_process_image_page(doc_name: str, page_number: int, page: Any) -> Document:
     image_tasks = [process_image(image.data) for image in page.images]
     images_results = await asyncio.gather(*image_tasks)

+    images_text_list: List = []
     for result in images_results:
         images_text_list.extend(result)

     images_text = "\n".join(images_text_list)
-
-
-
-
-
-
-
+    return images_text
+
+
+def _clean_page_numbers(
+    page_content_list: List[str],
+    extra_content: List[str] = [],
+    page_start_numbering_format: str = PAGE_START_NUMBERING_FORMAT_DEFAULT,
+    page_end_numbering_format: str = PAGE_END_NUMBERING_FORMAT_DEFAULT,
+) -> Tuple[List[str], Optional[int]]:
+    f"""
+    Identifies and removes or reformats page numbers from a list of PDF page contents, based on the most consistent sequential numbering.
+
+    Args:
+        page_content_list (List[str]): A list of strings where each string represents the content of a PDF page.
+        extra_content (List[str]): A list of strings where each string will be appended after the main content. Can be used for appending image information.
+        page_start_numbering_format (str): A format string to prepend to the page content, with `{{page_nr}}` as a placeholder for the page number.
+            Defaults to {PAGE_START_NUMBERING_FORMAT_DEFAULT}. Make it an empty string to remove the page number.
+        page_end_numbering_format (str): A format string to append to the page content, with `{{page_nr}}` as a placeholder for the page number.
+            Defaults to {PAGE_END_NUMBERING_FORMAT_DEFAULT}. Make it an empty string to remove the page number.
+
+    Returns:
+        List[str]: The list of page contents with page numbers removed or reformatted based on the detected sequence.
+        Optional[Int]: The shift for the page numbering. Can be (-2, -1, 0, 1, 2).
+
+    Notes:
+        - The function scans for page numbers using a regular expression that matches digits at the start or end of a string.
+        - It evaluates several potential starting points for numbering (-2, -1, 0, 1, 2 shifts) to determine the most consistent sequence.
+        - If at least a specified ratio of pages (defined by `PAGE_NUMBERING_CORRECTNESS_RATIO_FOR_REMOVAL`) has correct sequential numbering,
+          the page numbers are processed.
+        - If page numbers are found, the function will add formatted page numbers to each page's content if `page_start_numbering_format` or
+          `page_end_numbering_format` is provided.
+    """
+    assert len(extra_content) == 0 or len(extra_content) == len(page_content_list), (
+        "Please provide an equally sized list of extra content if provided."
     )

+    # Regex to match potential page numbers at the start or end of a string
+    page_number_regex = re.compile(r"^\s*(\d+)\s*|\s*(\d+)\s*$")
+
+    def find_page_number(content):
+        match = page_number_regex.search(content)
+        if match:
+            return int(match.group(1) or match.group(2))
+        return None
+
+    page_numbers = [find_page_number(content) for content in page_content_list]
+    if all(x is None or x > 5 for x in page_numbers):
+        # This approach won't work reliably for higher page numbers.
+        return page_content_list, None
+
+    # Possible range shifts to detect page numbering
+    range_shifts = [-2, -1, 0, 1, 2]
+    best_match, best_correct_count, best_shift = _identify_best_page_sequence(page_numbers, range_shifts)
+
+    # Check if at least ..% of the pages have correct sequential numbering
+    if best_match and best_correct_count / len(page_numbers) >= PAGE_NUMBERING_CORRECTNESS_RATIO_FOR_REMOVAL:
+        # Remove the page numbers from the content
+        for i, expected_number in enumerate(best_match):
+            page_content_list[i] = re.sub(
+                rf"^\s*{expected_number}\s*|\s*{expected_number}\s*$", "", page_content_list[i]
+            )
+
+            page_start = (
+                page_start_numbering_format.format(page_nr=expected_number) + "\n"
+                if page_start_numbering_format
+                else ""
+            )
+            page_end = (
+                "\n" + page_end_numbering_format.format(page_nr=expected_number) if page_end_numbering_format else ""
+            )
+            extra_info = "\n" + extra_content[i] if extra_content else ""
+
+            # Add formatted page numbering if configured.
+            page_content_list[i] = page_start + page_content_list[i] + extra_info + page_end
+    else:
+        best_shift = None
+
+    return page_content_list, best_shift
+
+
+def _identify_best_page_sequence(page_numbers, range_shifts):
+    best_match = None
+    best_shift: Optional[int] = None
+    best_correct_count = 0
+
+    for shift in range_shifts:
+        expected_numbers = [i + shift for i in range(len(page_numbers))]
+        # Check if expected number occurs (or that the expected "2" occurs in an incorrectly merged number like 25,
+        # where 2 is the page number and 5 is part of the PDF content).
+        correct_count = sum(
+            1
+            for actual, expected in zip(page_numbers, expected_numbers)
+            if actual == expected or str(actual).startswith(str(expected)) or str(actual).endswith(str(expected))
+        )
+
+        if correct_count > best_correct_count:
+            best_correct_count = correct_count
+            best_match = expected_numbers
+            best_shift = shift
+
+    return best_match, best_correct_count, best_shift
+
```
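`_clean_page_numbers` looks for a consistent printed page-number sequence across pages (trying shifts from -2 to 2), strips the numbers when at least 40% of pages match, and wraps each page in the start/end markers. A minimal sketch against the helper above; it is private, so this is illustration only, and the sample pages are made up:

```python
from agno.document.reader.pdf_reader import _clean_page_numbers

# Hypothetical extracted page texts, each starting with its printed page number.
pages = [
    "1\nIntroduction to the topic...",
    "2\nMore details follow here...",
    "3\nConclusion and summary...",
]

cleaned, shift = _clean_page_numbers(page_content_list=pages)

# shift is the detected starting number (1 here); each page is now wrapped as
# "<start page N>\n...page text...\n<end page N>" with the leading digit removed.
print(shift)
print(cleaned[0])
```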
```diff

 class BasePDFReader(Reader):
+    def __init__(
+        self,
+        split_on_pages: bool = True,
+        page_start_numbering_format: Optional[str] = None,
+        page_end_numbering_format: Optional[str] = None,
+        password: Optional[str] = None,
+        **kwargs,
+    ):
+        if page_start_numbering_format is None:
+            page_start_numbering_format = PAGE_START_NUMBERING_FORMAT_DEFAULT
+        if page_end_numbering_format is None:
+            page_end_numbering_format = PAGE_END_NUMBERING_FORMAT_DEFAULT
+
+        self.split_on_pages = split_on_pages
+        self.page_start_numbering_format = page_start_numbering_format
+        self.page_end_numbering_format = page_end_numbering_format
+        self.password = password
+
+        super().__init__(**kwargs)
+
```
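All PDF readers inherit this constructor, so page splitting and the page-number markers are configurable per reader; remaining kwargs are passed through to the `Reader` base class, which is not shown in this diff. A hedged sketch of the new options:

```python
from agno.document.reader.pdf_reader import PDFReader

# One Document per page (the default), with custom page markers.
reader = PDFReader(
    split_on_pages=True,
    page_start_numbering_format="[page {page_nr}]",
    page_end_numbering_format="[/page {page_nr}]",
)

# Or a single Document for the whole file, with page markers disabled
# (empty strings are kept as-is; only None falls back to the defaults).
whole_file_reader = PDFReader(
    split_on_pages=False,
    page_start_numbering_format="",
    page_end_numbering_format="",
)
```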
```diff
     def _build_chunked_documents(self, documents: List[Document]) -> List[Document]:
         chunked_documents: List[Document] = []
         for document in documents:
             chunked_documents.extend(self.chunk_document(document))
         return chunked_documents

+    def _decrypt_pdf(self, doc_reader: DocumentReader, doc_name: str, password: Optional[str] = None) -> bool:
+        if not doc_reader.is_encrypted:
+            return True
+
+        # Use provided password or fall back to instance password
+        pdf_password = password or self.password
+        if not pdf_password:
+            logger.error(f"PDF {doc_name} is password protected but no password provided")
+            return False
+
+        try:
+            decrypted_pdf = doc_reader.decrypt(pdf_password)
+            if decrypted_pdf:
+                log_info(f"Successfully decrypted PDF {doc_name} with user password")
+                return True
+            else:
+                log_error(f"Failed to decrypt PDF {doc_name}: incorrect password")
+                return False
+        except Exception as e:
+            log_error(f"Error decrypting PDF {doc_name}: {e}")
+            return False
+
+    def _create_documents(self, pdf_content: List[str], doc_name: str, use_uuid_for_id: bool, page_number_shift):
+        if self.split_on_pages:
+            shift = page_number_shift if page_number_shift is not None else 1
+            documents: List[Document] = []
+            for page_number, page_content in enumerate(pdf_content, start=shift):
+                documents.append(
+                    Document(
+                        name=doc_name,
+                        id=(str(uuid4()) if use_uuid_for_id else f"{doc_name}_{page_number}"),
+                        meta_data={"page": page_number},
+                        content=page_content,
+                    )
+                )
+        else:
+            pdf_content_str = "\n".join(pdf_content)
+            document = Document(
+                name=doc_name,
+                id=str(uuid4()) if use_uuid_for_id else doc_name,
+                meta_data={},
+                content=pdf_content_str,
+            )
+            documents = [document]
+
+        if self.chunk:
+            return self._build_chunked_documents(documents)
+
+        return documents
+
+    def _pdf_reader_to_documents(
+        self,
+        doc_reader: DocumentReader,
+        doc_name,
+        read_images=False,
+        use_uuid_for_id=False,
+    ):
+        pdf_content = []
+        pdf_images_text = []
+        for page in doc_reader.pages:
+            pdf_content.append(page.extract_text())
+            if read_images:
+                pdf_images_text.append(_ocr_reader(page))
+
+        pdf_content, shift = _clean_page_numbers(
+            page_content_list=pdf_content,
+            extra_content=pdf_images_text,
+            page_start_numbering_format=self.page_start_numbering_format,
+            page_end_numbering_format=self.page_end_numbering_format,
+        )
+        return self._create_documents(pdf_content, doc_name, use_uuid_for_id, shift)
+
+    async def _async_pdf_reader_to_documents(
+        self,
+        doc_reader: DocumentReader,
+        doc_name: str,
+        read_images=False,
+        use_uuid_for_id=False,
+    ):
+        async def _read_pdf_page(page, read_images) -> Tuple[str, str]:
+            # We tried "asyncio.to_thread(page.extract_text)", but it maintains state internally, which leads to issues.
+            page_text = page.extract_text()
+
+            if read_images:
+                pdf_images_text = await _async_ocr_reader(page)
+            else:
+                pdf_images_text = ""
+
+            return page_text, pdf_images_text
+
+        # Process pages in parallel using asyncio.gather
+        pdf_content: List[Tuple[str, str]] = await asyncio.gather(
+            *[_read_pdf_page(page, read_images) for page in doc_reader.pages]
+        )
+
+        pdf_content_clean, shift = _clean_page_numbers(
+            page_content_list=[x[0] for x in pdf_content],
+            extra_content=[x[1] for x in pdf_content],
+            page_start_numbering_format=self.page_start_numbering_format,
+            page_end_numbering_format=self.page_end_numbering_format,
+        )
+
+        return self._create_documents(pdf_content_clean, doc_name, use_uuid_for_id, shift)
+
```
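`_decrypt_pdf` tries the per-call password first and falls back to the password stored on the reader; when decryption fails, the public `read`/`async_read` methods return an empty list rather than raising. A sketch based on the code above (file name and passwords hypothetical):

```python
from agno.document.reader.pdf_reader import PDFReader

# The constructor password acts as a fallback for every call...
reader = PDFReader(password="instance-secret")

# ...and a per-call password, when given, takes precedence over it.
docs = reader.read("encrypted.pdf", password="call-secret")  # hypothetical file

# For encrypted PDFs with a wrong or missing password, read() logs the problem
# and returns an empty list instead of raising.
if not docs:
    print("PDF could not be read")
```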
```diff

 class PDFReader(BasePDFReader):
     """Reader for PDF files"""

-    def read(self, pdf: Union[str, Path, IO[Any]]) -> List[Document]:
+    def read(self, pdf: Union[str, Path, IO[Any]], password: Optional[str] = None) -> List[Document]:
         try:
             if isinstance(pdf, str):
                 doc_name = pdf.split("/")[-1].split(".")[0].replace(" ", "_")
```
```diff
@@ -106,26 +318,19 @@ class PDFReader(BasePDFReader):
             log_info(f"Reading: {doc_name}")

             try:
-
+                pdf_reader = DocumentReader(pdf)
             except PdfStreamError as e:
                 logger.error(f"Error reading PDF: {e}")
                 return []

-
-
-
-                    Document(
-                        name=doc_name,
-                        id=str(uuid4()),
-                        meta_data={"page": page_number},
-                        content=page.extract_text(),
-                    )
-                )
-            if self.chunk:
-                return self._build_chunked_documents(documents)
-            return documents
+            # Handle PDF decryption
+            if not self._decrypt_pdf(pdf_reader, doc_name, password):
+                return []

-
+            # Read and chunk.
+            return self._pdf_reader_to_documents(pdf_reader, doc_name, use_uuid_for_id=True)
+
+    async def async_read(self, pdf: Union[str, Path, IO[Any]], password: Optional[str] = None) -> List[Document]:
         try:
             if isinstance(pdf, str):
                 doc_name = pdf.split("/")[-1].split(".")[0].replace(" ", "_")
```
```diff
@@ -137,40 +342,27 @@ class PDFReader(BasePDFReader):
             log_info(f"Reading: {doc_name}")

             try:
-
+                pdf_reader = DocumentReader(pdf)
             except PdfStreamError as e:
                 logger.error(f"Error reading PDF: {e}")
                 return []

-
-
-
-                id=str(uuid4()),
-                meta_data={"page": page_number},
-                content=page.extract_text(),
-            )
-
-            # Process pages in parallel using asyncio.gather
-            documents = await asyncio.gather(
-                *[
-                    _process_document(doc_name, page_number, page)
-                    for page_number, page in enumerate(doc_reader.pages, start=1)
-                ]
-            )
+            # Handle PDF decryption
+            if not self._decrypt_pdf(pdf_reader, doc_name, password):
+                return []

-
-
-            return documents
+            # Read and chunk.
+            return await self._async_pdf_reader_to_documents(pdf_reader, doc_name, use_uuid_for_id=True)


 class PDFUrlReader(BasePDFReader):
     """Reader for PDF files from URL"""

-    def __init__(self, proxy: Optional[str] = None, **kwargs):
-        super().__init__(**kwargs)
+    def __init__(self, proxy: Optional[str] = None, password: Optional[str] = None, **kwargs):
+        super().__init__(password=password, **kwargs)
         self.proxy = proxy

-    def read(self, url: str) -> List[Document]:
+    def read(self, url: str, password: Optional[str] = None) -> List[Document]:
         if not url:
             raise ValueError("No url provided")

```
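`PDFReader.async_read` gains the same `password` parameter and delegates to `_async_pdf_reader_to_documents`, which extracts pages concurrently. A minimal async usage sketch (file name hypothetical):

```python
import asyncio

from agno.document.reader.pdf_reader import PDFReader


async def main() -> None:
    reader = PDFReader()
    # Pages are extracted concurrently; the password is only used if the
    # PDF turns out to be encrypted.
    documents = await reader.async_read("report.pdf", password="secret")  # hypothetical file
    for doc in documents:
        print(doc.meta_data.get("page"), len(doc.content))


asyncio.run(main())
```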
```diff
@@ -182,23 +374,16 @@ class PDFUrlReader(BasePDFReader):
         response = fetch_with_retry(url, proxy=self.proxy)

         doc_name = url.split("/")[-1].split(".")[0].replace("/", "_").replace(" ", "_")
-
-
-
-
-
-            Document(
-                name=doc_name,
-                id=f"{doc_name}_{page_number}",
-                meta_data={"page": page_number},
-                content=page.extract_text(),
-            )
-        )
-        if self.chunk:
-            return self._build_chunked_documents(documents)
-        return documents
+        pdf_reader = DocumentReader(BytesIO(response.content))
+
+        # Handle PDF decryption
+        if not self._decrypt_pdf(pdf_reader, doc_name, password):
+            return []

-
+        # Read and chunk.
+        return self._pdf_reader_to_documents(pdf_reader, doc_name, use_uuid_for_id=False)
+
+    async def async_read(self, url: str, password: Optional[str] = None) -> List[Document]:
         if not url:
             raise ValueError("No url provided")

```
```diff
@@ -213,33 +398,20 @@ class PDFUrlReader(BasePDFReader):
             response = await async_fetch_with_retry(url, client=client)

         doc_name = url.split("/")[-1].split(".")[0].replace("/", "_").replace(" ", "_")
-
-
-        async def _process_document(doc_name: str, page_number: int, page: Any) -> Document:
-            return Document(
-                name=doc_name,
-                id=f"{doc_name}_{page_number}",
-                meta_data={"page": page_number},
-                content=page.extract_text(),
-            )
+        pdf_reader = DocumentReader(BytesIO(response.content))

-        # Process pages in parallel using asyncio.gather
-
-
-                _process_document(doc_name, page_number, page)
-                for page_number, page in enumerate(doc_reader.pages, start=1)
-            ]
-        )
+        # Handle PDF decryption
+        if not self._decrypt_pdf(pdf_reader, doc_name, password):
+            return []

-
-
-        return documents
+        # Read and chunk.
+        return await self._async_pdf_reader_to_documents(pdf_reader, doc_name, use_uuid_for_id=False)


 class PDFImageReader(BasePDFReader):
     """Reader for PDF files with text and images extraction"""

-    def read(self, pdf: Union[str, Path, IO[Any]]) -> List[Document]:
+    def read(self, pdf: Union[str, Path, IO[Any]], password: Optional[str] = None) -> List[Document]:
         if not pdf:
             raise ValueError("No pdf provided")

```
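`PDFUrlReader` forwards the new `password` argument through `BasePDFReader` while keeping its existing `proxy` option, and URL-based readers keep deterministic IDs (`use_uuid_for_id=False`). A hedged sketch (URL and proxy hypothetical):

```python
from agno.document.reader.pdf_reader import PDFUrlReader

# proxy stays optional; password is stored on the reader via BasePDFReader and
# used as a fallback if the downloaded PDF is encrypted.
reader = PDFUrlReader(proxy="http://localhost:8080", password="secret")
documents = reader.read("https://example.com/reports/q1.pdf")

# URL readers keep deterministic IDs such as "q1_1", "q1_2", ... (the starting
# number follows the detected page-number shift), rather than random UUIDs.
print([doc.id for doc in documents])
```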
```diff
@@ -252,18 +424,16 @@ class PDFImageReader(BasePDFReader):
             doc_name = "pdf"

         log_info(f"Reading: {doc_name}")
-
+        pdf_reader = DocumentReader(pdf)

-
-
-
-
-        if self.chunk:
-            return self._build_chunked_documents(documents)
+        # Handle PDF decryption
+        if not self._decrypt_pdf(pdf_reader, doc_name, password):
+            return []

-
+        # Read and chunk.
+        return self._pdf_reader_to_documents(pdf_reader, doc_name, read_images=True, use_uuid_for_id=False)

-    async def async_read(self, pdf: Union[str, Path, IO[Any]]) -> List[Document]:
+    async def async_read(self, pdf: Union[str, Path, IO[Any]], password: Optional[str] = None) -> List[Document]:
         if not pdf:
             raise ValueError("No pdf provided")

```
```diff
@@ -276,28 +446,24 @@ class PDFImageReader(BasePDFReader):
             doc_name = "pdf"

         log_info(f"Reading: {doc_name}")
-
+        pdf_reader = DocumentReader(pdf)

-
-
-
-                for page_number, page in enumerate(doc_reader.pages, start=1)
-            ]
-        )
+        # Handle PDF decryption
+        if not self._decrypt_pdf(pdf_reader, doc_name, password):
+            return []

-
-
-        return documents
+        # Read and chunk.
+        return await self._async_pdf_reader_to_documents(pdf_reader, doc_name, read_images=True, use_uuid_for_id=False)


 class PDFUrlImageReader(BasePDFReader):
     """Reader for PDF files from URL with text and images extraction"""

-    def __init__(self, proxy: Optional[str] = None, **kwargs):
-        super().__init__(**kwargs)
+    def __init__(self, proxy: Optional[str] = None, password: Optional[str] = None, **kwargs):
+        super().__init__(password=password, **kwargs)
         self.proxy = proxy

-    def read(self, url: str) -> List[Document]:
+    def read(self, url: str, password: Optional[str] = None) -> List[Document]:
         if not url:
             raise ValueError("No url provided")

```
```diff
@@ -310,19 +476,16 @@ class PDFUrlImageReader(BasePDFReader):
         response = httpx.get(url, proxy=self.proxy) if self.proxy else httpx.get(url)

         doc_name = url.split("/")[-1].split(".")[0].replace(" ", "_")
-
-
-        documents = []
-        for page_number, page in enumerate(doc_reader.pages, start=1):
-            documents.append(process_image_page(doc_name, page_number, page))
+        pdf_reader = DocumentReader(BytesIO(response.content))

-        #
-        if self.chunk:
-            return self._build_chunked_documents(documents)
+        # Handle PDF decryption
+        if not self._decrypt_pdf(pdf_reader, doc_name, password):
+            return []

-
+        # Read and chunk.
+        return self._pdf_reader_to_documents(pdf_reader, doc_name, read_images=True, use_uuid_for_id=False)

-    async def async_read(self, url: str) -> List[Document]:
+    async def async_read(self, url: str, password: Optional[str] = None) -> List[Document]:
         if not url:
             raise ValueError("No url provided")

```
```diff
@@ -338,15 +501,11 @@ class PDFUrlImageReader(BasePDFReader):
             response.raise_for_status()

         doc_name = url.split("/")[-1].split(".")[0].replace(" ", "_")
-
+        pdf_reader = DocumentReader(BytesIO(response.content))

-
-
-
-                for page_number, page in enumerate(doc_reader.pages, start=1)
-            ]
-        )
+        # Handle PDF decryption
+        if not self._decrypt_pdf(pdf_reader, doc_name, password):
+            return []

-
-
-        return documents
+        # Read and chunk.
+        return await self._async_pdf_reader_to_documents(pdf_reader, doc_name, read_images=True, use_uuid_for_id=False)
```
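The image readers follow the same pattern: OCR output from `_ocr_reader`/`_async_ocr_reader` is passed to `_clean_page_numbers` as `extra_content`, so it is appended after each page's extracted text, and they additionally require `rapidocr_onnxruntime`. A hedged usage sketch (file and URL hypothetical):

```python
from agno.document.reader.pdf_reader import PDFImageReader, PDFUrlImageReader

# Requires: pip install pypdf rapidocr_onnxruntime
image_reader = PDFImageReader(password="secret")  # password is optional
docs = image_reader.read("scanned_invoice.pdf")  # hypothetical file

url_reader = PDFUrlImageReader(proxy=None)
url_docs = url_reader.read(
    "https://example.com/scanned.pdf",  # hypothetical URL
    password="secret",  # per-call override
)

# OCR text from embedded images is appended after each page's extracted text.
if docs:
    print(docs[0].content)
```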