agno 1.7.7__py3-none-any.whl → 1.7.9__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- agno/agent/agent.py +247 -36
- agno/document/reader/pdf_reader.py +239 -136
- agno/document/reader/youtube_reader.py +8 -4
- agno/models/anthropic/claude.py +1 -1
- agno/models/base.py +4 -0
- agno/models/message.py +6 -2
- agno/models/openai/chat.py +3 -0
- agno/models/openai/responses.py +6 -5
- agno/run/response.py +41 -0
- agno/run/team.py +27 -0
- agno/storage/gcs_json.py +1 -1
- agno/storage/json.py +2 -1
- agno/storage/redis.py +1 -1
- agno/storage/yaml.py +1 -1
- agno/team/team.py +443 -225
- agno/tools/aws_lambda.py +10 -0
- agno/tools/function.py +21 -11
- agno/tools/googlecalendar.py +567 -121
- agno/tools/googlesheets.py +6 -1
- agno/tools/mcp.py +19 -1
- agno/utils/events.py +50 -0
- agno/utils/response.py +3 -1
- agno/vectordb/lancedb/lance_db.py +10 -2
- agno/vectordb/pgvector/pgvector.py +3 -0
- {agno-1.7.7.dist-info → agno-1.7.9.dist-info}/METADATA +1 -1
- {agno-1.7.7.dist-info → agno-1.7.9.dist-info}/RECORD +30 -30
- {agno-1.7.7.dist-info → agno-1.7.9.dist-info}/WHEEL +0 -0
- {agno-1.7.7.dist-info → agno-1.7.9.dist-info}/entry_points.txt +0 -0
- {agno-1.7.7.dist-info → agno-1.7.9.dist-info}/licenses/LICENSE +0 -0
- {agno-1.7.7.dist-info → agno-1.7.9.dist-info}/top_level.txt +0 -0

agno/document/reader/pdf_reader.py
CHANGED

@@ -1,6 +1,7 @@
 import asyncio
+import re
 from pathlib import Path
-from typing import IO, Any, List, Optional, Union
+from typing import IO, Any, List, Optional, Tuple, Union
 from uuid import uuid4

 from agno.document.base import Document
@@ -15,7 +16,13 @@ except ImportError:
     raise ImportError("`pypdf` not installed. Please install it via `pip install pypdf`.")


-def process_image_page(doc_name: str, page_number: int, page: Any) -> Document:
+PAGE_START_NUMBERING_FORMAT_DEFAULT = "<start page {page_nr}>"
+PAGE_END_NUMBERING_FORMAT_DEFAULT = "<end page {page_nr}>"
+PAGE_NUMBERING_CORRECTNESS_RATIO_FOR_REMOVAL = 0.4
+
+
+def _ocr_reader(page: Any) -> str:
+    """A single PDF page object."""
     try:
         import rapidocr_onnxruntime as rapidocr
     except ImportError:
@@ -23,7 +30,6 @@ def process_image_page(doc_name: str, page_number: int, page: Any) -> Document:
             "`rapidocr_onnxruntime` not installed. Please install it via `pip install rapidocr_onnxruntime`."
         )
     ocr = rapidocr.RapidOCR()
-    page_text = page.extract_text() or ""
    images_text_list = []

    # Extract and process images
@@ -34,22 +40,13 @@ def process_image_page(doc_name: str, page_number: int, page: Any) -> Document:
             ocr_result, elapse = ocr(image_data)

             # Extract text from OCR result
-            if ocr_result:
-                images_text_list += [item[1] for item in ocr_result]
+            images_text_list += [item[1] for item in ocr_result] if ocr_result else []

-    images_text = "\n".join(images_text_list)
-    content = page_text + "\n" + images_text
-
-    # Append the document
-    return Document(
-        name=doc_name,
-        id=str(uuid4()),
-        meta_data={"page": page_number},
-        content=content,
-    )
+    return "\n".join(images_text_list)


-async def async_process_image_page(doc_name: str, page_number: int, page: Any) -> Document:
+async def _async_ocr_reader(page: Any) -> str:
+    """page: A single PDF page object."""
     try:
         import rapidocr_onnxruntime as rapidocr
     except ImportError:
@@ -58,9 +55,6 @@ async def async_process_image_page(doc_name: str, page_number: int, page: Any) -> Document:
         )
     ocr = rapidocr.RapidOCR()

-    page_text = page.extract_text() or ""
-    images_text_list: List = []
-
     # Process images in parallel
     async def process_image(image_data: bytes) -> List[str]:
         ocr_result, _ = ocr(image_data)
@@ -69,27 +63,221 @@ async def async_process_image_page(doc_name: str, page_number: int, page: Any) -> Document:
     image_tasks = [process_image(image.data) for image in page.images]
     images_results = await asyncio.gather(*image_tasks)

+    images_text_list: List = []
     for result in images_results:
         images_text_list.extend(result)

     images_text = "\n".join(images_text_list)
-    content = page_text + "\n" + images_text
-
-    return Document(
-        name=doc_name,
-        id=str(uuid4()),
-        meta_data={"page": page_number},
-        content=content,
+    return images_text
+
+
+def _clean_page_numbers(
+    page_content_list: List[str],
+    extra_content: List[str] = [],
+    page_start_numbering_format: str = PAGE_START_NUMBERING_FORMAT_DEFAULT,
+    page_end_numbering_format: str = PAGE_END_NUMBERING_FORMAT_DEFAULT,
+) -> Tuple[List[str], Optional[int]]:
+    f"""
+    Identifies and removes or reformats page numbers from a list of PDF page contents, based on the most consistent sequential numbering.
+
+    Args:
+        page_content_list (List[str]): A list of strings where each string represents the content of a PDF page.
+        extra_content (List[str]): A list of strings where each string will be appended after the main content. Can be used for appending image information.
+        page_start_numbering_format (str): A format string to prepend to the page content, with `{{page_nr}}` as a placeholder for the page number.
+            Defaults to {PAGE_START_NUMBERING_FORMAT_DEFAULT}. Make it an empty string to remove the page number.
+        page_end_numbering_format (str): A format string to append to the page content, with `{{page_nr}}` as a placeholder for the page number.
+            Defaults to {PAGE_END_NUMBERING_FORMAT_DEFAULT}. Make it an empty string to remove the page number.
+
+    Returns:
+        List[str]: The list of page contents with page numbers removed or reformatted based on the detected sequence.
+        Optional[Int]: The shift for the page numbering. Can be (-2, -1, 0, 1, 2).
+
+    Notes:
+        - The function scans for page numbers using a regular expression that matches digits at the start or end of a string.
+        - It evaluates several potential starting points for numbering (-2, -1, 0, 1, 2 shifts) to determine the most consistent sequence.
+        - If at least a specified ratio of pages (defined by `PAGE_NUMBERING_CORRECTNESS_RATIO_FOR_REMOVAL`) has correct sequential numbering,
+          the page numbers are processed.
+        - If page numbers are found, the function will add formatted page numbers to each page's content if `page_start_numbering_format` or
+          `page_end_numbering_format` is provided.
+    """
+    assert len(extra_content) == 0 or len(extra_content) == len(page_content_list), (
+        "Please provide an equally sized list of extra content if provided."
     )

+    # Regex to match potential page numbers at the start or end of a string
+    page_number_regex = re.compile(r"^\s*(\d+)\s*|\s*(\d+)\s*$")
+
+    def find_page_number(content):
+        match = page_number_regex.search(content)
+        if match:
+            return int(match.group(1) or match.group(2))
+        return None
+
+    page_numbers = [find_page_number(content) for content in page_content_list]
+    if all(x is None or x > 5 for x in page_numbers):
+        # This approach won't work reliably for higher page numbers.
+        return page_content_list, None
+
+    # Possible range shifts to detect page numbering
+    range_shifts = [-2, -1, 0, 1, 2]
+    best_match, best_correct_count, best_shift = _identify_best_page_sequence(page_numbers, range_shifts)
+
+    # Check if at least ..% of the pages have correct sequential numbering
+    if best_match and best_correct_count / len(page_numbers) >= PAGE_NUMBERING_CORRECTNESS_RATIO_FOR_REMOVAL:
+        # Remove the page numbers from the content
+        for i, expected_number in enumerate(best_match):
+            page_content_list[i] = re.sub(
+                rf"^\s*{expected_number}\s*|\s*{expected_number}\s*$", "", page_content_list[i]
+            )
+
+            page_start = (
+                page_start_numbering_format.format(page_nr=expected_number) + "\n"
+                if page_start_numbering_format
+                else ""
+            )
+            page_end = (
+                "\n" + page_end_numbering_format.format(page_nr=expected_number) if page_end_numbering_format else ""
+            )
+            extra_info = "\n" + extra_content[i] if extra_content else ""
+
+            # Add formatted page numbering if configured.
+            page_content_list[i] = page_start + page_content_list[i] + extra_info + page_end
+    else:
+        best_shift = None
+
+    return page_content_list, best_shift
+
+
+def _identify_best_page_sequence(page_numbers, range_shifts):
+    best_match = None
+    best_shift: Optional[int] = None
+    best_correct_count = 0
+
+    for shift in range_shifts:
+        expected_numbers = [i + shift for i in range(len(page_numbers))]
+        # Check if expected number occurs (or that the expected "2" occurs in an incorrectly merged number like 25,
+        # where 2 is the page number and 5 is part of the PDF content).
+        correct_count = sum(
+            1
+            for actual, expected in zip(page_numbers, expected_numbers)
+            if actual == expected or str(actual).startswith(str(expected)) or str(actual).endswith(str(expected))
+        )
+
+        if correct_count > best_correct_count:
+            best_correct_count = correct_count
+            best_match = expected_numbers
+            best_shift = shift
+
+    return best_match, best_correct_count, best_shift
+

 class BasePDFReader(Reader):
+    def __init__(
+        self,
+        split_on_pages: bool = True,
+        page_start_numbering_format: Optional[str] = None,
+        page_end_numbering_format: Optional[str] = None,
+        **kwargs,
+    ):
+        if page_start_numbering_format is None:
+            page_start_numbering_format = PAGE_START_NUMBERING_FORMAT_DEFAULT
+        if page_end_numbering_format is None:
+            page_end_numbering_format = PAGE_END_NUMBERING_FORMAT_DEFAULT
+
+        self.split_on_pages = split_on_pages
+        self.page_start_numbering_format = page_start_numbering_format
+        self.page_end_numbering_format = page_end_numbering_format
+
+        super().__init__(**kwargs)
+
     def _build_chunked_documents(self, documents: List[Document]) -> List[Document]:
         chunked_documents: List[Document] = []
         for document in documents:
             chunked_documents.extend(self.chunk_document(document))
         return chunked_documents

+    def _create_documents(self, pdf_content: List[str], doc_name: str, use_uuid_for_id: bool, page_number_shift):
+        if self.split_on_pages:
+            shift = page_number_shift if page_number_shift is not None else 1
+            documents: List[Document] = []
+            for page_number, page_content in enumerate(pdf_content, start=shift):
+                documents.append(
+                    Document(
+                        name=doc_name,
+                        id=(str(uuid4()) if use_uuid_for_id else f"{doc_name}_{page_number}"),
+                        meta_data={"page": page_number},
+                        content=page_content,
+                    )
+                )
+        else:
+            pdf_content_str = "\n".join(pdf_content)
+            document = Document(
+                name=doc_name,
+                id=str(uuid4()) if use_uuid_for_id else doc_name,
+                meta_data={},
+                content=pdf_content_str,
+            )
+            documents = [document]
+
+        if self.chunk:
+            return self._build_chunked_documents(documents)
+
+        return documents
+
+    def _pdf_reader_to_documents(
+        self,
+        doc_reader: DocumentReader,
+        doc_name,
+        read_images=False,
+        use_uuid_for_id=False,
+    ):
+        pdf_content = []
+        pdf_images_text = []
+        for page in doc_reader.pages:
+            pdf_content.append(page.extract_text())
+            if read_images:
+                pdf_images_text.append(_ocr_reader(page))
+
+        pdf_content, shift = _clean_page_numbers(
+            page_content_list=pdf_content,
+            extra_content=pdf_images_text,
+            page_start_numbering_format=self.page_start_numbering_format,
+            page_end_numbering_format=self.page_end_numbering_format,
+        )
+        return self._create_documents(pdf_content, doc_name, use_uuid_for_id, shift)
+
+    async def _async_pdf_reader_to_documents(
+        self,
+        doc_reader: DocumentReader,
+        doc_name: str,
+        read_images=False,
+        use_uuid_for_id=False,
+    ):
+        async def _read_pdf_page(page, read_images) -> Tuple[str, str]:
+            # We tried "asyncio.to_thread(page.extract_text)", but it maintains state internally, which leads to issues.
+            page_text = page.extract_text()
+
+            if read_images:
+                pdf_images_text = await _async_ocr_reader(page)
+            else:
+                pdf_images_text = ""
+
+            return page_text, pdf_images_text
+
+        # Process pages in parallel using asyncio.gather
+        pdf_content: List[Tuple[str, str]] = await asyncio.gather(
+            *[_read_pdf_page(page, read_images) for page in doc_reader.pages]
+        )
+
+        pdf_content_clean, shift = _clean_page_numbers(
+            page_content_list=[x[0] for x in pdf_content],
+            extra_content=[x[1] for x in pdf_content],
+            page_start_numbering_format=self.page_start_numbering_format,
+            page_end_numbering_format=self.page_end_numbering_format,
+        )
+
+        return self._create_documents(pdf_content_clean, doc_name, use_uuid_for_id, shift)
+

 class PDFReader(BasePDFReader):
     """Reader for PDF files"""
@@ -106,24 +294,13 @@ class PDFReader(BasePDFReader):
         log_info(f"Reading: {doc_name}")

         try:
-            doc_reader = DocumentReader(pdf)
+            pdf_reader = DocumentReader(pdf)
         except PdfStreamError as e:
             logger.error(f"Error reading PDF: {e}")
             return []

-        documents: List[Document] = []
-        for page_number, page in enumerate(doc_reader.pages, start=1):
-            documents.append(
-                Document(
-                    name=doc_name,
-                    id=str(uuid4()),
-                    meta_data={"page": page_number},
-                    content=page.extract_text(),
-                )
-            )
-        if self.chunk:
-            return self._build_chunked_documents(documents)
-        return documents
+        # Read and chunk.
+        return self._pdf_reader_to_documents(pdf_reader, doc_name, use_uuid_for_id=True)

     async def async_read(self, pdf: Union[str, Path, IO[Any]]) -> List[Document]:
         try:
@@ -137,30 +314,13 @@ class PDFReader(BasePDFReader):
         log_info(f"Reading: {doc_name}")

         try:
-            doc_reader = DocumentReader(pdf)
+            pdf_reader = DocumentReader(pdf)
         except PdfStreamError as e:
             logger.error(f"Error reading PDF: {e}")
             return []

-        async def _process_document(doc_name: str, page_number: int, page: Any) -> Document:
-            return Document(
-                name=doc_name,
-                id=str(uuid4()),
-                meta_data={"page": page_number},
-                content=page.extract_text(),
-            )
-
-        # Process pages in parallel using asyncio.gather
-        documents = await asyncio.gather(
-            *[
-                _process_document(doc_name, page_number, page)
-                for page_number, page in enumerate(doc_reader.pages, start=1)
-            ]
-        )
-
-        if self.chunk:
-            return self._build_chunked_documents(documents)
-        return documents
+        # Read and chunk.
+        return await self._async_pdf_reader_to_documents(pdf_reader, doc_name, use_uuid_for_id=True)


 class PDFUrlReader(BasePDFReader):
@@ -182,21 +342,10 @@ class PDFUrlReader(BasePDFReader):
         response = fetch_with_retry(url, proxy=self.proxy)

         doc_name = url.split("/")[-1].split(".")[0].replace("/", "_").replace(" ", "_")
-
-        doc_reader = DocumentReader(BytesIO(response.content))
-        documents = []
-        for page_number, page in enumerate(doc_reader.pages, start=1):
-            documents.append(
-                Document(
-                    name=doc_name,
-                    id=f"{doc_name}_{page_number}",
-                    meta_data={"page": page_number},
-                    content=page.extract_text(),
-                )
-            )
-        if self.chunk:
-            return self._build_chunked_documents(documents)
-        return documents
+        pdf_reader = DocumentReader(BytesIO(response.content))
+
+        # Read and chunk.
+        return self._pdf_reader_to_documents(pdf_reader, doc_name, use_uuid_for_id=False)

     async def async_read(self, url: str) -> List[Document]:
         if not url:
@@ -213,27 +362,10 @@ class PDFUrlReader(BasePDFReader):
         response = await async_fetch_with_retry(url, client=client)

         doc_name = url.split("/")[-1].split(".")[0].replace("/", "_").replace(" ", "_")
-
-        doc_reader = DocumentReader(BytesIO(response.content))
-        async def _process_document(doc_name: str, page_number: int, page: Any) -> Document:
-            return Document(
-                name=doc_name,
-                id=f"{doc_name}_{page_number}",
-                meta_data={"page": page_number},
-                content=page.extract_text(),
-            )
-
-        # Process pages in parallel using asyncio.gather
-        documents = await asyncio.gather(
-            *[
-                _process_document(doc_name, page_number, page)
-                for page_number, page in enumerate(doc_reader.pages, start=1)
-            ]
-        )
+        pdf_reader = DocumentReader(BytesIO(response.content))

-        if self.chunk:
-            return self._build_chunked_documents(documents)
-        return documents
+        # Read and chunk.
+        return await self._async_pdf_reader_to_documents(pdf_reader, doc_name, use_uuid_for_id=False)


 class PDFImageReader(BasePDFReader):
@@ -252,16 +384,10 @@ class PDFImageReader(BasePDFReader):
         doc_name = "pdf"

         log_info(f"Reading: {doc_name}")
-
-        doc_reader = DocumentReader(pdf)
-        documents = []
-        for page_number, page in enumerate(doc_reader.pages, start=1):
-            documents.append(process_image_page(doc_name, page_number, page))
-
-        if self.chunk:
-            return self._build_chunked_documents(documents)
+        pdf_reader = DocumentReader(pdf)

-        return documents
+        # Read and chunk.
+        return self._pdf_reader_to_documents(pdf_reader, doc_name, read_images=True, use_uuid_for_id=False)

     async def async_read(self, pdf: Union[str, Path, IO[Any]]) -> List[Document]:
         if not pdf:
@@ -276,18 +402,10 @@ class PDFImageReader(BasePDFReader):
         doc_name = "pdf"

         log_info(f"Reading: {doc_name}")
-        doc_reader = DocumentReader(pdf)
+        pdf_reader = DocumentReader(pdf)

-        documents = await asyncio.gather(
-            *[
-                async_process_image_page(doc_name, page_number, page)
-                for page_number, page in enumerate(doc_reader.pages, start=1)
-            ]
-        )
-
-        if self.chunk:
-            return self._build_chunked_documents(documents)
-        return documents
+        # Read and chunk.
+        return await self._async_pdf_reader_to_documents(pdf_reader, doc_name, read_images=True, use_uuid_for_id=False)


 class PDFUrlImageReader(BasePDFReader):
@@ -310,17 +428,10 @@ class PDFUrlImageReader(BasePDFReader):
         response = httpx.get(url, proxy=self.proxy) if self.proxy else httpx.get(url)

         doc_name = url.split("/")[-1].split(".")[0].replace(" ", "_")
-
-        doc_reader = DocumentReader(BytesIO(response.content))
-        documents = []
-        for page_number, page in enumerate(doc_reader.pages, start=1):
-            documents.append(process_image_page(doc_name, page_number, page))
-
-        # Optionally chunk documents
-        if self.chunk:
-            return self._build_chunked_documents(documents)
+        pdf_reader = DocumentReader(BytesIO(response.content))

-        return documents
+        # Read and chunk.
+        return self._pdf_reader_to_documents(pdf_reader, doc_name, read_images=True, use_uuid_for_id=False)

     async def async_read(self, url: str) -> List[Document]:
         if not url:
@@ -338,15 +449,7 @@ class PDFUrlImageReader(BasePDFReader):
         response.raise_for_status()

         doc_name = url.split("/")[-1].split(".")[0].replace(" ", "_")
-        doc_reader = DocumentReader(BytesIO(response.content))
+        pdf_reader = DocumentReader(BytesIO(response.content))

-        documents = await asyncio.gather(
-            *[
-                async_process_image_page(doc_name, page_number, page)
-                for page_number, page in enumerate(doc_reader.pages, start=1)
-            ]
-        )
-
-        if self.chunk:
-            return self._build_chunked_documents(documents)
-        return documents
+        # Read and chunk.
+        return await self._async_pdf_reader_to_documents(pdf_reader, doc_name, read_images=True, use_uuid_for_id=False)
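
The heart of this pdf_reader.py change is the page-number detection above. A minimal sketch of driving _clean_page_numbers directly; the page texts are invented, and only the function and its default formats come from this diff:

    from agno.document.reader.pdf_reader import _clean_page_numbers

    # Three fabricated pages whose leading digits form the sequence 1, 2, 3.
    pages = [
        "1 Introduction to the manual",
        "2 Installation steps",
        "3 Troubleshooting",
    ]

    cleaned, shift = _clean_page_numbers(page_content_list=pages)

    print(shift)       # 1 -- the detected starting page number
    print(cleaned[0])  # "<start page 1>\nIntroduction to the manual\n<end page 1>"

The same behaviour is configurable on every reader through the new BasePDFReader arguments, e.g. PDFReader(split_on_pages=False, page_start_numbering_format="") to disable per-page splitting and the start markers.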

agno/document/reader/youtube_reader.py
CHANGED

@@ -3,7 +3,7 @@ from typing import List

 from agno.document.base import Document
 from agno.document.reader.base import Reader
-from agno.utils.log import log_info, logger
+from agno.utils.log import log_debug, log_info, logger

 try:
     from youtube_transcript_api import YouTubeTranscriptApi
@@ -23,12 +23,16 @@ class YouTubeReader(Reader):
             log_info(f"Reading transcript for video: {video_id}")

             # Get transcript
-            transcript_data = YouTubeTranscriptApi.get_transcript(video_id)
+            log_debug(f"Fetching transcript for video: {video_id}")
+            # Create an instance of YouTubeTranscriptApi
+            ytt_api = YouTubeTranscriptApi()
+            transcript_data = ytt_api.fetch(video_id)

             # Combine transcript segments into full text
             transcript_text = ""
-            for segment in transcript_data:
-                transcript_text += f"{segment['text']} "
+
+            for segment in transcript_data:
+                transcript_text += f"{segment.text} "

             documents = [
                 Document(
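
This tracks the instance-based API of youtube-transcript-api 1.x, where fetch() returns snippet objects instead of dicts. A minimal sketch of the call pattern the reader now uses (the video id is a placeholder):

    from youtube_transcript_api import YouTubeTranscriptApi

    ytt_api = YouTubeTranscriptApi()
    transcript_data = ytt_api.fetch("dQw4w9WgXcQ")  # placeholder video id

    # Snippets expose .text attributes, hence segment.text rather than
    # the old segment["text"] dict access.
    transcript_text = " ".join(segment.text for segment in transcript_data)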

agno/models/anthropic/claude.py
CHANGED

@@ -449,7 +449,7 @@ class Claude(Model):

     def get_system_message_for_model(self, tools: Optional[List[Any]] = None) -> Optional[str]:
         if tools is not None and len(tools) > 0:
-            tool_call_prompt = "Do not reflect on the quality of the returned search results in your response"
+            tool_call_prompt = "Do not reflect on the quality of the returned search results in your response\n\n"
             return tool_call_prompt
         return None


agno/models/base.py
CHANGED

@@ -1025,6 +1025,10 @@ class Model(ABC):
                 stream_data.response_thinking += model_response_delta.thinking
                 should_yield = True

+            if model_response_delta.reasoning_content is not None:
+                stream_data.response_thinking += model_response_delta.reasoning_content
+                should_yield = True
+
             if model_response_delta.redacted_thinking is not None:
                 stream_data.response_redacted_thinking += model_response_delta.redacted_thinking
                 should_yield = True
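
With this, streaming deltas that carry reasoning_content are folded into the same accumulated thinking buffer as thinking deltas. A rough sketch of the accumulation, assuming ModelResponse exposes reasoning_content as an optional field (which the guard above implies):

    from agno.models.response import ModelResponse

    # Two fabricated streaming deltas.
    deltas = [
        ModelResponse(reasoning_content="First, restate the problem. "),
        ModelResponse(reasoning_content="Then check the edge cases."),
    ]

    response_thinking = ""
    for delta in deltas:
        if delta.reasoning_content is not None:
            response_thinking += delta.reasoning_content
    # response_thinking now holds the concatenated reasoning stream.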

agno/models/message.py
CHANGED

@@ -338,8 +338,12 @@ class Message(BaseModel):
                         if isinstance(tool_call_arguments, dict)
                         else json.loads(tool_call_arguments)
                     )
-                    arguments = ", ".join(f"{k}: {v}" for k, v in tool_call_args.items())
-                    tool_calls_list.append(f"  Arguments: '{arguments}'")
+                    # Ensure tool_call_args is a dictionary before calling .items()
+                    if isinstance(tool_call_args, dict):
+                        arguments = ", ".join(f"{k}: {v}" for k, v in tool_call_args.items())
+                        tool_calls_list.append(f"  Arguments: '{arguments}'")
+                    else:
+                        tool_calls_list.append(f"  Arguments: '{tool_call_args}'")
                 except json.JSONDecodeError:
                     tool_calls_list.append("  Arguments: 'Invalid JSON format'")
         tool_calls_str = "\n".join(tool_calls_list)
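
The guard matters because tool arguments are not guaranteed to decode to a JSON object; an array payload would previously crash on .items(). A small standalone sketch of the two paths, with invented payloads:

    import json

    for tool_call_arguments in ('{"city": "Paris"}', '["query one", "query two"]'):
        tool_call_args = json.loads(tool_call_arguments)
        if isinstance(tool_call_args, dict):
            arguments = ", ".join(f"{k}: {v}" for k, v in tool_call_args.items())
        else:
            arguments = tool_call_args
        print(f"  Arguments: '{arguments}'")
    # ->   Arguments: 'city: Paris'
    # ->   Arguments: '['query one', 'query two']'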

agno/models/openai/chat.py
CHANGED

@@ -62,6 +62,7 @@ class OpenAIChat(Model):
     temperature: Optional[float] = None
     user: Optional[str] = None
     top_p: Optional[float] = None
+    service_tier: Optional[str] = None  # "auto" | "default" | "flex" | "priority", defaults to "auto" when not set
     extra_headers: Optional[Any] = None
     extra_query: Optional[Any] = None
     request_params: Optional[Dict[str, Any]] = None
@@ -175,6 +176,7 @@ class OpenAIChat(Model):
             "extra_headers": self.extra_headers,
             "extra_query": self.extra_query,
             "metadata": self.metadata,
+            "service_tier": self.service_tier,
         }

         # Handle response format - always use JSON schema approach
@@ -241,6 +243,7 @@ class OpenAIChat(Model):
                 "user": self.user,
                 "extra_headers": self.extra_headers,
                 "extra_query": self.extra_query,
+                "service_tier": self.service_tier,
             }
         )
         cleaned_dict = {k: v for k, v in model_dict.items() if v is not None}
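
Usage sketch for the new parameter: the value is passed through to the OpenAI client unchanged, so any tier the API accepts ("auto", "default", "flex", "priority") can be set at construction time. The model id here is just an example:

    from agno.models.openai import OpenAIChat

    # Request OpenAI's flex tier for cheaper, queue-tolerant requests.
    model = OpenAIChat(id="gpt-4o", service_tier="flex")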

agno/models/openai/responses.py
CHANGED

@@ -47,7 +47,7 @@ class OpenAIResponses(Model):
     top_p: Optional[float] = None
     truncation: Optional[Literal["auto", "disabled"]] = None
     user: Optional[str] = None
-
+    service_tier: Optional[Literal["auto", "default", "flex", "priority"]] = None
     request_params: Optional[Dict[str, Any]] = None

     # Client parameters
@@ -178,6 +178,7 @@ class OpenAIResponses(Model):
             "top_p": self.top_p,
             "truncation": self.truncation,
             "user": self.user,
+            "service_tier": self.service_tier,
         }
         # Set the response format
         if response_format is not None:
@@ -310,11 +311,11 @@ class OpenAIResponses(Model):
         formatted_tools = []
         if tools:
             for _tool in tools:
-                if _tool["type"] == "function":
-                    _tool_dict = _tool["function"]
+                if _tool.get("type") == "function":
+                    _tool_dict = _tool.get("function", {})
                 _tool_dict["type"] = "function"
-                for prop in _tool_dict["parameters"]["properties"].values():
-                    if isinstance(prop["type"], list):
+                for prop in _tool_dict.get("parameters", {}).get("properties", {}).values():
+                    if isinstance(prop.get("type", ""), list):
                         prop["type"] = prop["type"][0]

                 formatted_tools.append(_tool_dict)
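
The tool-formatting fix above converts Chat Completions-style tool definitions (nested under a "function" key) into the flat shape the Responses API expects, and collapses list-valued property types to a single type. A standalone sketch of that transformation, with an invented tool definition:

    chat_style_tool = {
        "type": "function",
        "function": {
            "name": "get_weather",
            "description": "Get the weather for a city.",
            "parameters": {
                "type": "object",
                "properties": {"city": {"type": ["string", "null"]}},
            },
        },
    }

    # Mirror of the loop body above.
    _tool_dict = chat_style_tool.get("function", {})
    _tool_dict["type"] = "function"
    for prop in _tool_dict.get("parameters", {}).get("properties", {}).values():
        if isinstance(prop.get("type", ""), list):
            prop["type"] = prop["type"][0]  # keep the first declared type, "string"

    # _tool_dict now carries name, description, parameters and type="function",
    # with the "city" property typed as plain "string".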