agno 2.0.0rc1__py3-none-any.whl → 2.0.0rc2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- agno/agent/agent.py +32 -14
- agno/db/mongo/mongo.py +8 -3
- agno/eval/accuracy.py +12 -5
- agno/knowledge/chunking/strategy.py +14 -14
- agno/knowledge/knowledge.py +156 -120
- agno/knowledge/reader/arxiv_reader.py +5 -5
- agno/knowledge/reader/csv_reader.py +6 -77
- agno/knowledge/reader/docx_reader.py +5 -5
- agno/knowledge/reader/firecrawl_reader.py +5 -5
- agno/knowledge/reader/json_reader.py +5 -5
- agno/knowledge/reader/markdown_reader.py +31 -9
- agno/knowledge/reader/pdf_reader.py +10 -123
- agno/knowledge/reader/reader_factory.py +65 -72
- agno/knowledge/reader/s3_reader.py +44 -114
- agno/knowledge/reader/text_reader.py +5 -5
- agno/knowledge/reader/url_reader.py +75 -31
- agno/knowledge/reader/web_search_reader.py +6 -29
- agno/knowledge/reader/website_reader.py +5 -5
- agno/knowledge/reader/wikipedia_reader.py +5 -5
- agno/knowledge/reader/youtube_reader.py +6 -6
- agno/knowledge/utils.py +10 -10
- agno/models/aws/bedrock.py +3 -7
- agno/models/base.py +37 -6
- agno/os/app.py +32 -24
- agno/os/mcp.py +39 -59
- agno/os/router.py +547 -16
- agno/os/routers/evals/evals.py +197 -12
- agno/os/routers/knowledge/knowledge.py +428 -14
- agno/os/routers/memory/memory.py +250 -28
- agno/os/routers/metrics/metrics.py +125 -7
- agno/os/routers/session/session.py +393 -25
- agno/os/schema.py +55 -2
- agno/run/agent.py +9 -0
- agno/run/team.py +93 -2
- agno/run/workflow.py +25 -12
- agno/team/team.py +861 -1051
- agno/tools/mcp.py +1 -2
- agno/utils/log.py +52 -2
- agno/utils/mcp.py +55 -3
- agno/utils/models/claude.py +0 -8
- agno/utils/print_response/team.py +177 -73
- agno/utils/streamlit.py +27 -0
- agno/workflow/workflow.py +9 -0
- {agno-2.0.0rc1.dist-info → agno-2.0.0rc2.dist-info}/METADATA +1 -1
- {agno-2.0.0rc1.dist-info → agno-2.0.0rc2.dist-info}/RECORD +48 -49
- agno/knowledge/reader/gcs_reader.py +0 -67
- {agno-2.0.0rc1.dist-info → agno-2.0.0rc2.dist-info}/WHEEL +0 -0
- {agno-2.0.0rc1.dist-info → agno-2.0.0rc2.dist-info}/licenses/LICENSE +0 -0
- {agno-2.0.0rc1.dist-info → agno-2.0.0rc2.dist-info}/top_level.txt +0 -0
|
@@ -1,14 +1,10 @@
|
|
|
1
1
|
import asyncio
|
|
2
2
|
import csv
|
|
3
3
|
import io
|
|
4
|
-
import os
|
|
5
4
|
from pathlib import Path
|
|
6
5
|
from typing import IO, Any, List, Optional, Union
|
|
7
|
-
from urllib.parse import urlparse
|
|
8
6
|
from uuid import uuid4
|
|
9
7
|
|
|
10
|
-
from agno.utils.http import async_fetch_with_retry, fetch_with_retry
|
|
11
|
-
|
|
12
8
|
try:
|
|
13
9
|
import aiofiles
|
|
14
10
|
except ImportError:
|
|
@@ -32,16 +28,16 @@ class CSVReader(Reader):
|
|
|
32
28
|
def get_supported_chunking_strategies(self) -> List[ChunkingStrategyType]:
|
|
33
29
|
"""Get the list of supported chunking strategies for CSV readers."""
|
|
34
30
|
return [
|
|
35
|
-
ChunkingStrategyType.
|
|
36
|
-
ChunkingStrategyType.
|
|
37
|
-
ChunkingStrategyType.
|
|
38
|
-
ChunkingStrategyType.
|
|
39
|
-
ChunkingStrategyType.
|
|
31
|
+
ChunkingStrategyType.ROW_CHUNKER,
|
|
32
|
+
ChunkingStrategyType.FIXED_SIZE_CHUNKER,
|
|
33
|
+
ChunkingStrategyType.AGENTIC_CHUNKER,
|
|
34
|
+
ChunkingStrategyType.DOCUMENT_CHUNKER,
|
|
35
|
+
ChunkingStrategyType.RECURSIVE_CHUNKER,
|
|
40
36
|
]
|
|
41
37
|
|
|
42
38
|
@classmethod
|
|
43
39
|
def get_supported_content_types(self) -> List[ContentType]:
|
|
44
|
-
return [ContentType.
|
|
40
|
+
return [ContentType.CSV, ContentType.XLSX, ContentType.XLS]
|
|
45
41
|
|
|
46
42
|
def read(
|
|
47
43
|
self, file: Union[Path, IO[Any]], delimiter: str = ",", quotechar: str = '"', name: Optional[str] = None
|
|
@@ -168,70 +164,3 @@ class CSVReader(Reader):
|
|
|
168
164
|
f"Error reading async: {getattr(file, 'name', str(file)) if isinstance(file, IO) else file}: {e}"
|
|
169
165
|
)
|
|
170
166
|
return []
|
|
171
|
-
|
|
172
|
-
|
|
173
|
-
class CSVUrlReader(Reader):
|
|
174
|
-
"""Reader for CSV files"""
|
|
175
|
-
|
|
176
|
-
def __init__(
|
|
177
|
-
self, chunking_strategy: Optional[ChunkingStrategy] = RowChunking(), proxy: Optional[str] = None, **kwargs
|
|
178
|
-
):
|
|
179
|
-
super().__init__(chunking_strategy=chunking_strategy, **kwargs)
|
|
180
|
-
self.proxy = proxy
|
|
181
|
-
|
|
182
|
-
def get_supported_chunking_strategies(self) -> List[ChunkingStrategyType]:
|
|
183
|
-
"""Get the list of supported chunking strategies for CSV URL readers."""
|
|
184
|
-
return [
|
|
185
|
-
ChunkingStrategyType.ROW_CHUNKING,
|
|
186
|
-
ChunkingStrategyType.SEMANTIC_CHUNKING,
|
|
187
|
-
ChunkingStrategyType.FIXED_SIZE_CHUNKING,
|
|
188
|
-
ChunkingStrategyType.AGENTIC_CHUNKING,
|
|
189
|
-
ChunkingStrategyType.DOCUMENT_CHUNKING,
|
|
190
|
-
]
|
|
191
|
-
|
|
192
|
-
def get_supported_content_types(self) -> List[ContentType]:
|
|
193
|
-
return [ContentType.URL]
|
|
194
|
-
|
|
195
|
-
def read(self, url: str, name: Optional[str] = None) -> List[Document]:
|
|
196
|
-
if not url:
|
|
197
|
-
raise ValueError("No URL provided")
|
|
198
|
-
|
|
199
|
-
logger.info(f"Reading: {url}")
|
|
200
|
-
# Retry the request up to 3 times with exponential backoff
|
|
201
|
-
response = fetch_with_retry(url, proxy=self.proxy)
|
|
202
|
-
|
|
203
|
-
parsed_url = urlparse(url)
|
|
204
|
-
filename = os.path.basename(parsed_url.path) or "data.csv"
|
|
205
|
-
|
|
206
|
-
file_obj = io.BytesIO(response.content)
|
|
207
|
-
file_obj.name = filename
|
|
208
|
-
documents = CSVReader().read(file=file_obj, name=name)
|
|
209
|
-
|
|
210
|
-
file_obj.close()
|
|
211
|
-
|
|
212
|
-
return documents
|
|
213
|
-
|
|
214
|
-
async def async_read(self, url: str, name: Optional[str] = None) -> List[Document]:
|
|
215
|
-
if not url:
|
|
216
|
-
raise ValueError("No URL provided")
|
|
217
|
-
|
|
218
|
-
import httpx
|
|
219
|
-
|
|
220
|
-
logger.info(f"Reading async: {url}")
|
|
221
|
-
|
|
222
|
-
client_args = {"proxy": self.proxy} if self.proxy else {}
|
|
223
|
-
async with httpx.AsyncClient(**client_args) as client: # type: ignore
|
|
224
|
-
response = await async_fetch_with_retry(url, client=client)
|
|
225
|
-
|
|
226
|
-
parsed_url = urlparse(url)
|
|
227
|
-
filename = os.path.basename(parsed_url.path) or "data.csv"
|
|
228
|
-
|
|
229
|
-
file_obj = io.BytesIO(response.content)
|
|
230
|
-
file_obj.name = filename
|
|
231
|
-
|
|
232
|
-
# Use the async version of CSVReader
|
|
233
|
-
documents = await CSVReader().async_read(file=file_obj, name=name)
|
|
234
|
-
|
|
235
|
-
file_obj.close()
|
|
236
|
-
|
|
237
|
-
return documents
|
|
@@ -26,11 +26,11 @@ class DocxReader(Reader):
|
|
|
26
26
|
def get_supported_chunking_strategies(self) -> List[ChunkingStrategyType]:
|
|
27
27
|
"""Get the list of supported chunking strategies for DOCX readers."""
|
|
28
28
|
return [
|
|
29
|
-
ChunkingStrategyType.
|
|
30
|
-
ChunkingStrategyType.
|
|
31
|
-
ChunkingStrategyType.
|
|
32
|
-
ChunkingStrategyType.
|
|
33
|
-
ChunkingStrategyType.
|
|
29
|
+
ChunkingStrategyType.DOCUMENT_CHUNKER,
|
|
30
|
+
ChunkingStrategyType.FIXED_SIZE_CHUNKER,
|
|
31
|
+
ChunkingStrategyType.SEMANTIC_CHUNKER,
|
|
32
|
+
ChunkingStrategyType.AGENTIC_CHUNKER,
|
|
33
|
+
ChunkingStrategyType.RECURSIVE_CHUNKER,
|
|
34
34
|
]
|
|
35
35
|
|
|
36
36
|
@classmethod
|
|
@@ -46,11 +46,11 @@ class FirecrawlReader(Reader):
|
|
|
46
46
|
def get_supported_chunking_strategies(self) -> List[ChunkingStrategyType]:
|
|
47
47
|
"""Get the list of supported chunking strategies for Firecrawl readers."""
|
|
48
48
|
return [
|
|
49
|
-
ChunkingStrategyType.
|
|
50
|
-
ChunkingStrategyType.
|
|
51
|
-
ChunkingStrategyType.
|
|
52
|
-
ChunkingStrategyType.
|
|
53
|
-
ChunkingStrategyType.
|
|
49
|
+
ChunkingStrategyType.SEMANTIC_CHUNKER,
|
|
50
|
+
ChunkingStrategyType.FIXED_SIZE_CHUNKER,
|
|
51
|
+
ChunkingStrategyType.AGENTIC_CHUNKER,
|
|
52
|
+
ChunkingStrategyType.DOCUMENT_CHUNKER,
|
|
53
|
+
ChunkingStrategyType.RECURSIVE_CHUNKER,
|
|
54
54
|
]
|
|
55
55
|
|
|
56
56
|
@classmethod
|
|
@@ -25,11 +25,11 @@ class JSONReader(Reader):
|
|
|
25
25
|
def get_supported_chunking_strategies(self) -> List[ChunkingStrategyType]:
|
|
26
26
|
"""Get the list of supported chunking strategies for JSON readers."""
|
|
27
27
|
return [
|
|
28
|
-
ChunkingStrategyType.
|
|
29
|
-
ChunkingStrategyType.
|
|
30
|
-
ChunkingStrategyType.
|
|
31
|
-
ChunkingStrategyType.
|
|
32
|
-
ChunkingStrategyType.
|
|
28
|
+
ChunkingStrategyType.FIXED_SIZE_CHUNKER,
|
|
29
|
+
ChunkingStrategyType.AGENTIC_CHUNKER,
|
|
30
|
+
ChunkingStrategyType.DOCUMENT_CHUNKER,
|
|
31
|
+
ChunkingStrategyType.RECURSIVE_CHUNKER,
|
|
32
|
+
ChunkingStrategyType.SEMANTIC_CHUNKER,
|
|
33
33
|
]
|
|
34
34
|
|
|
35
35
|
@classmethod
|
|
@@ -3,13 +3,26 @@ import uuid
|
|
|
3
3
|
from pathlib import Path
|
|
4
4
|
from typing import IO, Any, List, Optional, Union
|
|
5
5
|
|
|
6
|
-
from agno.knowledge.chunking.markdown import MarkdownChunking
|
|
7
6
|
from agno.knowledge.chunking.strategy import ChunkingStrategy, ChunkingStrategyType
|
|
8
7
|
from agno.knowledge.document.base import Document
|
|
9
8
|
from agno.knowledge.reader.base import Reader
|
|
10
9
|
from agno.knowledge.types import ContentType
|
|
11
10
|
from agno.utils.log import log_info, logger
|
|
12
11
|
|
|
12
|
+
DEFAULT_CHUNKER_STRATEGY: ChunkingStrategy
|
|
13
|
+
|
|
14
|
+
# Try to import MarkdownChunking, fallback to FixedSizeChunking if not available
|
|
15
|
+
try:
|
|
16
|
+
from agno.knowledge.chunking.markdown import MarkdownChunking
|
|
17
|
+
|
|
18
|
+
DEFAULT_CHUNKER_STRATEGY = MarkdownChunking()
|
|
19
|
+
MARKDOWN_CHUNKER_AVAILABLE = True
|
|
20
|
+
except ImportError:
|
|
21
|
+
from agno.knowledge.chunking.fixed import FixedSizeChunking
|
|
22
|
+
|
|
23
|
+
DEFAULT_CHUNKER_STRATEGY = FixedSizeChunking()
|
|
24
|
+
MARKDOWN_CHUNKER_AVAILABLE = False
|
|
25
|
+
|
|
13
26
|
|
|
14
27
|
class MarkdownReader(Reader):
|
|
15
28
|
"""Reader for Markdown files"""
|
|
@@ -17,25 +30,34 @@ class MarkdownReader(Reader):
|
|
|
17
30
|
@classmethod
|
|
18
31
|
def get_supported_chunking_strategies(self) -> List[ChunkingStrategyType]:
|
|
19
32
|
"""Get the list of supported chunking strategies for Markdown readers."""
|
|
20
|
-
|
|
21
|
-
ChunkingStrategyType.
|
|
22
|
-
ChunkingStrategyType.
|
|
23
|
-
ChunkingStrategyType.
|
|
24
|
-
ChunkingStrategyType.
|
|
25
|
-
ChunkingStrategyType.
|
|
26
|
-
ChunkingStrategyType.FIXED_SIZE_CHUNKING,
|
|
33
|
+
strategies = [
|
|
34
|
+
ChunkingStrategyType.DOCUMENT_CHUNKER,
|
|
35
|
+
ChunkingStrategyType.AGENTIC_CHUNKER,
|
|
36
|
+
ChunkingStrategyType.RECURSIVE_CHUNKER,
|
|
37
|
+
ChunkingStrategyType.SEMANTIC_CHUNKER,
|
|
38
|
+
ChunkingStrategyType.FIXED_SIZE_CHUNKER,
|
|
27
39
|
]
|
|
28
40
|
|
|
41
|
+
# Only include MarkdownChunking if it's available
|
|
42
|
+
if MARKDOWN_CHUNKER_AVAILABLE:
|
|
43
|
+
strategies.insert(0, ChunkingStrategyType.MARKDOWN_CHUNKER)
|
|
44
|
+
|
|
45
|
+
return strategies
|
|
46
|
+
|
|
29
47
|
@classmethod
|
|
30
48
|
def get_supported_content_types(self) -> List[ContentType]:
|
|
31
49
|
return [ContentType.MARKDOWN]
|
|
32
50
|
|
|
33
51
|
def __init__(
|
|
34
52
|
self,
|
|
35
|
-
chunking_strategy: Optional[ChunkingStrategy] =
|
|
53
|
+
chunking_strategy: Optional[ChunkingStrategy] = None,
|
|
36
54
|
name: Optional[str] = None,
|
|
37
55
|
description: Optional[str] = None,
|
|
38
56
|
) -> None:
|
|
57
|
+
# Use the default chunking strategy if none provided
|
|
58
|
+
if chunking_strategy is None:
|
|
59
|
+
chunking_strategy = DEFAULT_CHUNKER_STRATEGY
|
|
60
|
+
|
|
39
61
|
super().__init__(chunking_strategy=chunking_strategy, name=name, description=description)
|
|
40
62
|
|
|
41
63
|
def read(self, file: Union[Path, IO[Any]], name: Optional[str] = None) -> List[Document]:
|
|
@@ -8,7 +8,6 @@ from agno.knowledge.chunking.strategy import ChunkingStrategyType
|
|
|
8
8
|
from agno.knowledge.document.base import Document
|
|
9
9
|
from agno.knowledge.reader.base import Reader
|
|
10
10
|
from agno.knowledge.types import ContentType
|
|
11
|
-
from agno.utils.http import async_fetch_with_retry, fetch_with_retry
|
|
12
11
|
from agno.utils.log import log_error, log_info, logger
|
|
13
12
|
|
|
14
13
|
try:
|
|
@@ -202,11 +201,11 @@ class BasePDFReader(Reader):
|
|
|
202
201
|
def get_supported_chunking_strategies(self) -> List[ChunkingStrategyType]:
|
|
203
202
|
"""Get the list of supported chunking strategies for PDF readers."""
|
|
204
203
|
return [
|
|
205
|
-
ChunkingStrategyType.
|
|
206
|
-
ChunkingStrategyType.
|
|
207
|
-
ChunkingStrategyType.
|
|
208
|
-
ChunkingStrategyType.
|
|
209
|
-
ChunkingStrategyType.
|
|
204
|
+
ChunkingStrategyType.DOCUMENT_CHUNKER,
|
|
205
|
+
ChunkingStrategyType.FIXED_SIZE_CHUNKER,
|
|
206
|
+
ChunkingStrategyType.AGENTIC_CHUNKER,
|
|
207
|
+
ChunkingStrategyType.SEMANTIC_CHUNKER,
|
|
208
|
+
ChunkingStrategyType.RECURSIVE_CHUNKER,
|
|
210
209
|
]
|
|
211
210
|
|
|
212
211
|
def _build_chunked_documents(self, documents: List[Document]) -> List[Document]:
|
|
@@ -222,19 +221,19 @@ class BasePDFReader(Reader):
|
|
|
222
221
|
# Use provided password or fall back to instance password
|
|
223
222
|
pdf_password = password or self.password
|
|
224
223
|
if not pdf_password:
|
|
225
|
-
logger.error(f
|
|
224
|
+
logger.error(f'PDF file "{doc_name}" is password protected but no password provided')
|
|
226
225
|
return False
|
|
227
226
|
|
|
228
227
|
try:
|
|
229
228
|
decrypted_pdf = doc_reader.decrypt(pdf_password)
|
|
230
229
|
if decrypted_pdf:
|
|
231
|
-
log_info(f
|
|
230
|
+
log_info(f'Successfully decrypted PDF file "{doc_name}" with user password')
|
|
232
231
|
return True
|
|
233
232
|
else:
|
|
234
|
-
log_error(f
|
|
233
|
+
log_error(f'Failed to decrypt PDF file "{doc_name}": incorrect password')
|
|
235
234
|
return False
|
|
236
235
|
except Exception as e:
|
|
237
|
-
log_error(f
|
|
236
|
+
log_error(f'Error decrypting PDF file "{doc_name}": {e}')
|
|
238
237
|
return False
|
|
239
238
|
|
|
240
239
|
def _create_documents(self, pdf_content: List[str], doc_name: str, use_uuid_for_id: bool, page_number_shift):
|
|
@@ -368,7 +367,7 @@ class PDFReader(BasePDFReader):
|
|
|
368
367
|
if not self._decrypt_pdf(pdf_reader, doc_name, password):
|
|
369
368
|
return []
|
|
370
369
|
|
|
371
|
-
# Read and chunk
|
|
370
|
+
# Read and chunk
|
|
372
371
|
return self._pdf_reader_to_documents(pdf_reader, doc_name, use_uuid_for_id=True)
|
|
373
372
|
|
|
374
373
|
async def async_read(
|
|
@@ -405,63 +404,6 @@ class PDFReader(BasePDFReader):
|
|
|
405
404
|
return await self._async_pdf_reader_to_documents(pdf_reader, doc_name, use_uuid_for_id=True)
|
|
406
405
|
|
|
407
406
|
|
|
408
|
-
class PDFUrlReader(BasePDFReader):
|
|
409
|
-
"""Reader for PDF files from URL"""
|
|
410
|
-
|
|
411
|
-
def __init__(self, proxy: Optional[str] = None, password: Optional[str] = None, **kwargs):
|
|
412
|
-
super().__init__(password=password, **kwargs)
|
|
413
|
-
self.proxy = proxy
|
|
414
|
-
|
|
415
|
-
@classmethod
|
|
416
|
-
def get_supported_content_types(self) -> List[ContentType]:
|
|
417
|
-
return [ContentType.URL]
|
|
418
|
-
|
|
419
|
-
def read(self, url: str, name: Optional[str] = None, password: Optional[str] = None) -> List[Document]:
|
|
420
|
-
if not url:
|
|
421
|
-
raise ValueError("No url provided")
|
|
422
|
-
|
|
423
|
-
from io import BytesIO
|
|
424
|
-
|
|
425
|
-
log_info(f"Reading: {url}")
|
|
426
|
-
|
|
427
|
-
# Retry the request up to 3 times with exponential backoff
|
|
428
|
-
response = fetch_with_retry(url, proxy=self.proxy)
|
|
429
|
-
|
|
430
|
-
doc_name = name or url.split("/")[-1].split(".")[0].replace("/", "_").replace(" ", "_")
|
|
431
|
-
pdf_reader = DocumentReader(BytesIO(response.content))
|
|
432
|
-
|
|
433
|
-
# Handle PDF decryption
|
|
434
|
-
if not self._decrypt_pdf(pdf_reader, doc_name, password):
|
|
435
|
-
return []
|
|
436
|
-
|
|
437
|
-
# Read and chunk.
|
|
438
|
-
return self._pdf_reader_to_documents(pdf_reader, doc_name, use_uuid_for_id=False)
|
|
439
|
-
|
|
440
|
-
async def async_read(self, url: str, name: Optional[str] = None, password: Optional[str] = None) -> List[Document]:
|
|
441
|
-
if not url:
|
|
442
|
-
raise ValueError("No url provided")
|
|
443
|
-
|
|
444
|
-
from io import BytesIO
|
|
445
|
-
|
|
446
|
-
import httpx
|
|
447
|
-
|
|
448
|
-
log_info(f"Reading: {url}")
|
|
449
|
-
|
|
450
|
-
client_args = {"proxy": self.proxy} if self.proxy else {}
|
|
451
|
-
async with httpx.AsyncClient(**client_args) as client: # type: ignore
|
|
452
|
-
response = await async_fetch_with_retry(url, client=client)
|
|
453
|
-
|
|
454
|
-
doc_name = name or url.split("/")[-1].split(".")[0].replace("/", "_").replace(" ", "_")
|
|
455
|
-
pdf_reader = DocumentReader(BytesIO(response.content))
|
|
456
|
-
|
|
457
|
-
# Handle PDF decryption
|
|
458
|
-
if not self._decrypt_pdf(pdf_reader, doc_name, password):
|
|
459
|
-
return []
|
|
460
|
-
|
|
461
|
-
# Read and chunk.
|
|
462
|
-
return await self._async_pdf_reader_to_documents(pdf_reader, doc_name, use_uuid_for_id=False)
|
|
463
|
-
|
|
464
|
-
|
|
465
407
|
class PDFImageReader(BasePDFReader):
|
|
466
408
|
"""Reader for PDF files with text and images extraction"""
|
|
467
409
|
|
|
@@ -512,58 +454,3 @@ class PDFImageReader(BasePDFReader):
|
|
|
512
454
|
|
|
513
455
|
# Read and chunk.
|
|
514
456
|
return await self._async_pdf_reader_to_documents(pdf_reader, doc_name, read_images=True, use_uuid_for_id=False)
|
|
515
|
-
|
|
516
|
-
|
|
517
|
-
class PDFUrlImageReader(BasePDFReader):
|
|
518
|
-
"""Reader for PDF files from URL with text and images extraction"""
|
|
519
|
-
|
|
520
|
-
def __init__(self, proxy: Optional[str] = None, password: Optional[str] = None, **kwargs):
|
|
521
|
-
super().__init__(password=password, **kwargs)
|
|
522
|
-
self.proxy = proxy
|
|
523
|
-
|
|
524
|
-
def read(self, url: str, name: Optional[str] = None, password: Optional[str] = None) -> List[Document]:
|
|
525
|
-
if not url:
|
|
526
|
-
raise ValueError("No url provided")
|
|
527
|
-
|
|
528
|
-
from io import BytesIO
|
|
529
|
-
|
|
530
|
-
import httpx
|
|
531
|
-
|
|
532
|
-
# Read the PDF from the URL
|
|
533
|
-
log_info(f"Reading: {url}")
|
|
534
|
-
response = httpx.get(url, proxy=self.proxy) if self.proxy else httpx.get(url)
|
|
535
|
-
|
|
536
|
-
doc_name = name or url.split("/")[-1].split(".")[0].replace(" ", "_")
|
|
537
|
-
pdf_reader = DocumentReader(BytesIO(response.content))
|
|
538
|
-
|
|
539
|
-
# Handle PDF decryption
|
|
540
|
-
if not self._decrypt_pdf(pdf_reader, doc_name, password):
|
|
541
|
-
return []
|
|
542
|
-
|
|
543
|
-
# Read and chunk.
|
|
544
|
-
return self._pdf_reader_to_documents(pdf_reader, doc_name, read_images=True, use_uuid_for_id=False)
|
|
545
|
-
|
|
546
|
-
async def async_read(self, url: str, name: Optional[str] = None, password: Optional[str] = None) -> List[Document]:
|
|
547
|
-
if not url:
|
|
548
|
-
raise ValueError("No url provided")
|
|
549
|
-
|
|
550
|
-
from io import BytesIO
|
|
551
|
-
|
|
552
|
-
import httpx
|
|
553
|
-
|
|
554
|
-
log_info(f"Reading: {url}")
|
|
555
|
-
|
|
556
|
-
client_args = {"proxy": self.proxy} if self.proxy else {}
|
|
557
|
-
async with httpx.AsyncClient(**client_args) as client: # type: ignore
|
|
558
|
-
response = await client.get(url)
|
|
559
|
-
response.raise_for_status()
|
|
560
|
-
|
|
561
|
-
doc_name = name or url.split("/")[-1].split(".")[0].replace(" ", "_")
|
|
562
|
-
pdf_reader = DocumentReader(BytesIO(response.content))
|
|
563
|
-
|
|
564
|
-
# Handle PDF decryption
|
|
565
|
-
if not self._decrypt_pdf(pdf_reader, doc_name, password):
|
|
566
|
-
return []
|
|
567
|
-
|
|
568
|
-
# Read and chunk.
|
|
569
|
-
return await self._async_pdf_reader_to_documents(pdf_reader, doc_name, read_images=True, use_uuid_for_id=False)
|
|
@@ -15,7 +15,11 @@ class ReaderFactory:
|
|
|
15
15
|
"""Get PDF reader instance."""
|
|
16
16
|
from agno.knowledge.reader.pdf_reader import PDFReader
|
|
17
17
|
|
|
18
|
-
config: Dict[str, Any] = {
|
|
18
|
+
config: Dict[str, Any] = {
|
|
19
|
+
"chunk": True,
|
|
20
|
+
"chunk_size": 100,
|
|
21
|
+
"description": "Processes PDF documents with OCR support for images and text extraction",
|
|
22
|
+
}
|
|
19
23
|
config.update(kwargs)
|
|
20
24
|
return PDFReader(**config)
|
|
21
25
|
|
|
@@ -24,7 +28,10 @@ class ReaderFactory:
|
|
|
24
28
|
"""Get CSV reader instance."""
|
|
25
29
|
from agno.knowledge.reader.csv_reader import CSVReader
|
|
26
30
|
|
|
27
|
-
config: Dict[str, Any] = {
|
|
31
|
+
config: Dict[str, Any] = {
|
|
32
|
+
"name": "CSV Reader",
|
|
33
|
+
"description": "Parses CSV, XLSX, and XLS files with custom delimiter support",
|
|
34
|
+
}
|
|
28
35
|
config.update(kwargs)
|
|
29
36
|
return CSVReader(**config)
|
|
30
37
|
|
|
@@ -33,7 +40,10 @@ class ReaderFactory:
|
|
|
33
40
|
"""Get Docx reader instance."""
|
|
34
41
|
from agno.knowledge.reader.docx_reader import DocxReader
|
|
35
42
|
|
|
36
|
-
config: Dict[str, Any] = {
|
|
43
|
+
config: Dict[str, Any] = {
|
|
44
|
+
"name": "Docx Reader",
|
|
45
|
+
"description": "Extracts text content from Microsoft Word documents (.docx and .doc formats)",
|
|
46
|
+
}
|
|
37
47
|
config.update(kwargs)
|
|
38
48
|
return DocxReader(**config)
|
|
39
49
|
|
|
@@ -42,7 +52,10 @@ class ReaderFactory:
|
|
|
42
52
|
"""Get JSON reader instance."""
|
|
43
53
|
from agno.knowledge.reader.json_reader import JSONReader
|
|
44
54
|
|
|
45
|
-
config: Dict[str, Any] = {
|
|
55
|
+
config: Dict[str, Any] = {
|
|
56
|
+
"name": "JSON Reader",
|
|
57
|
+
"description": "Processes JSON data structures and API responses with nested object handling",
|
|
58
|
+
}
|
|
46
59
|
config.update(kwargs)
|
|
47
60
|
return JSONReader(**config)
|
|
48
61
|
|
|
@@ -51,7 +64,10 @@ class ReaderFactory:
|
|
|
51
64
|
"""Get Markdown reader instance."""
|
|
52
65
|
from agno.knowledge.reader.markdown_reader import MarkdownReader
|
|
53
66
|
|
|
54
|
-
config: Dict[str, Any] = {
|
|
67
|
+
config: Dict[str, Any] = {
|
|
68
|
+
"name": "Markdown Reader",
|
|
69
|
+
"description": "Processes Markdown documentation with header-aware chunking and formatting preservation",
|
|
70
|
+
}
|
|
55
71
|
config.update(kwargs)
|
|
56
72
|
return MarkdownReader(**config)
|
|
57
73
|
|
|
@@ -60,25 +76,22 @@ class ReaderFactory:
|
|
|
60
76
|
"""Get Text reader instance."""
|
|
61
77
|
from agno.knowledge.reader.text_reader import TextReader
|
|
62
78
|
|
|
63
|
-
config: Dict[str, Any] = {
|
|
79
|
+
config: Dict[str, Any] = {
|
|
80
|
+
"name": "Text Reader",
|
|
81
|
+
"description": "Handles plain text files with customizable chunking strategies and encoding detection",
|
|
82
|
+
}
|
|
64
83
|
config.update(kwargs)
|
|
65
84
|
return TextReader(**config)
|
|
66
85
|
|
|
67
|
-
@classmethod
|
|
68
|
-
def _get_url_reader(cls, **kwargs) -> Reader:
|
|
69
|
-
"""Get URL reader instance."""
|
|
70
|
-
from agno.knowledge.reader.url_reader import URLReader
|
|
71
|
-
|
|
72
|
-
config: Dict[str, Any] = {"name": "URL Reader", "description": "Reads URLs"}
|
|
73
|
-
config.update(kwargs)
|
|
74
|
-
return URLReader(**config)
|
|
75
|
-
|
|
76
86
|
@classmethod
|
|
77
87
|
def _get_website_reader(cls, **kwargs) -> Reader:
|
|
78
88
|
"""Get Website reader instance."""
|
|
79
89
|
from agno.knowledge.reader.website_reader import WebsiteReader
|
|
80
90
|
|
|
81
|
-
config: Dict[str, Any] = {
|
|
91
|
+
config: Dict[str, Any] = {
|
|
92
|
+
"name": "Website Reader",
|
|
93
|
+
"description": "Scrapes and extracts content from web pages with HTML parsing and text cleaning",
|
|
94
|
+
}
|
|
82
95
|
config.update(kwargs)
|
|
83
96
|
return WebsiteReader(**config)
|
|
84
97
|
|
|
@@ -91,7 +104,7 @@ class ReaderFactory:
|
|
|
91
104
|
"api_key": kwargs.get("api_key") or os.getenv("FIRECRAWL_API_KEY"),
|
|
92
105
|
"mode": "crawl",
|
|
93
106
|
"name": "Firecrawl Reader",
|
|
94
|
-
"description": "
|
|
107
|
+
"description": "Advanced web scraping and crawling with JavaScript rendering and structured data extraction",
|
|
95
108
|
}
|
|
96
109
|
config.update(kwargs)
|
|
97
110
|
return FirecrawlReader(**config)
|
|
@@ -101,52 +114,22 @@ class ReaderFactory:
|
|
|
101
114
|
"""Get YouTube reader instance."""
|
|
102
115
|
from agno.knowledge.reader.youtube_reader import YouTubeReader
|
|
103
116
|
|
|
104
|
-
config: Dict[str, Any] = {
|
|
117
|
+
config: Dict[str, Any] = {
|
|
118
|
+
"name": "YouTube Reader",
|
|
119
|
+
"description": "Extracts transcripts and metadata from YouTube videos and playlists",
|
|
120
|
+
}
|
|
105
121
|
config.update(kwargs)
|
|
106
122
|
return YouTubeReader(**config)
|
|
107
123
|
|
|
108
|
-
@classmethod
|
|
109
|
-
def _get_pdf_url_reader(cls, **kwargs) -> Reader:
|
|
110
|
-
"""Get PDF URL reader instance."""
|
|
111
|
-
from agno.knowledge.reader.pdf_reader import PDFUrlReader
|
|
112
|
-
|
|
113
|
-
config: Dict[str, Any] = {"name": "PDF URL Reader", "description": "Reads PDF URLs"}
|
|
114
|
-
config.update(kwargs)
|
|
115
|
-
return PDFUrlReader(**config)
|
|
116
|
-
|
|
117
|
-
@classmethod
|
|
118
|
-
def _get_csv_url_reader(cls, **kwargs) -> Reader:
|
|
119
|
-
"""Get CSV URL reader instance."""
|
|
120
|
-
from agno.knowledge.reader.csv_reader import CSVUrlReader
|
|
121
|
-
|
|
122
|
-
config: Dict[str, Any] = {"name": "CSV URL Reader", "description": "Reads CSV URLs"}
|
|
123
|
-
config.update(kwargs)
|
|
124
|
-
return CSVUrlReader(**config)
|
|
125
|
-
|
|
126
|
-
@classmethod
|
|
127
|
-
def _get_s3_reader(cls, **kwargs) -> Reader:
|
|
128
|
-
"""Get S3 reader instance."""
|
|
129
|
-
from agno.knowledge.reader.s3_reader import S3Reader
|
|
130
|
-
|
|
131
|
-
config: Dict[str, Any] = {"name": "S3 Reader", "description": "Reads S3 files"}
|
|
132
|
-
config.update(kwargs)
|
|
133
|
-
return S3Reader(**config)
|
|
134
|
-
|
|
135
|
-
@classmethod
|
|
136
|
-
def _get_gcs_reader(cls, **kwargs) -> Reader:
|
|
137
|
-
"""Get GCS reader instance."""
|
|
138
|
-
from agno.knowledge.reader.gcs_reader import GCSReader
|
|
139
|
-
|
|
140
|
-
config: Dict[str, Any] = {"name": "GCS Reader", "description": "Reads GCS files"}
|
|
141
|
-
config.update(kwargs)
|
|
142
|
-
return GCSReader(**config)
|
|
143
|
-
|
|
144
124
|
@classmethod
|
|
145
125
|
def _get_arxiv_reader(cls, **kwargs) -> Reader:
|
|
146
126
|
"""Get Arxiv reader instance."""
|
|
147
127
|
from agno.knowledge.reader.arxiv_reader import ArxivReader
|
|
148
128
|
|
|
149
|
-
config: Dict[str, Any] = {
|
|
129
|
+
config: Dict[str, Any] = {
|
|
130
|
+
"name": "Arxiv Reader",
|
|
131
|
+
"description": "Downloads and processes academic papers from ArXiv with PDF parsing and metadata extraction",
|
|
132
|
+
}
|
|
150
133
|
config.update(kwargs)
|
|
151
134
|
return ArxivReader(**config)
|
|
152
135
|
|
|
@@ -155,7 +138,10 @@ class ReaderFactory:
|
|
|
155
138
|
"""Get Wikipedia reader instance."""
|
|
156
139
|
from agno.knowledge.reader.wikipedia_reader import WikipediaReader
|
|
157
140
|
|
|
158
|
-
config: Dict[str, Any] = {
|
|
141
|
+
config: Dict[str, Any] = {
|
|
142
|
+
"name": "Wikipedia Reader",
|
|
143
|
+
"description": "Fetches and processes Wikipedia articles with section-aware chunking and link resolution",
|
|
144
|
+
}
|
|
159
145
|
config.update(kwargs)
|
|
160
146
|
return WikipediaReader(**config)
|
|
161
147
|
|
|
@@ -164,7 +150,10 @@ class ReaderFactory:
|
|
|
164
150
|
"""Get Web Search reader instance."""
|
|
165
151
|
from agno.knowledge.reader.web_search_reader import WebSearchReader
|
|
166
152
|
|
|
167
|
-
config: Dict[str, Any] = {
|
|
153
|
+
config: Dict[str, Any] = {
|
|
154
|
+
"name": "Web Search Reader",
|
|
155
|
+
"description": "Executes web searches and processes results with relevance ranking and content extraction",
|
|
156
|
+
}
|
|
168
157
|
config.update(kwargs)
|
|
169
158
|
return WebSearchReader(**config)
|
|
170
159
|
|
|
@@ -224,27 +213,31 @@ class ReaderFactory:
|
|
|
224
213
|
# Default to URL reader
|
|
225
214
|
return cls.create_reader("url")
|
|
226
215
|
|
|
227
|
-
@classmethod
|
|
228
|
-
def get_reader_for_url_file(cls, extension: str) -> Reader:
|
|
229
|
-
"""Get the appropriate reader for a URL file extension."""
|
|
230
|
-
extension = extension.lower()
|
|
231
|
-
|
|
232
|
-
if extension == ".pdf":
|
|
233
|
-
return cls.create_reader("pdf_url")
|
|
234
|
-
elif extension == ".csv":
|
|
235
|
-
return cls.create_reader("csv_url")
|
|
236
|
-
else:
|
|
237
|
-
return cls.create_reader("url")
|
|
238
|
-
|
|
239
216
|
@classmethod
|
|
240
217
|
def get_all_reader_keys(cls) -> List[str]:
|
|
241
|
-
"""Get all available reader keys."""
|
|
218
|
+
"""Get all available reader keys in priority order."""
|
|
242
219
|
# Extract reader keys from method names
|
|
220
|
+
|
|
221
|
+
PREFIX = "_get_"
|
|
222
|
+
SUFFIX = "_reader"
|
|
223
|
+
|
|
243
224
|
reader_keys = []
|
|
244
225
|
for attr_name in dir(cls):
|
|
245
|
-
if attr_name.startswith(
|
|
246
|
-
reader_key = attr_name[
|
|
226
|
+
if attr_name.startswith(PREFIX) and attr_name.endswith(SUFFIX):
|
|
227
|
+
reader_key = attr_name[len(PREFIX) : -len(SUFFIX)] # Remove "_get_" prefix and "_reader" suffix
|
|
247
228
|
reader_keys.append(reader_key)
|
|
229
|
+
|
|
230
|
+
# Define priority order for URL readers
|
|
231
|
+
url_reader_priority = ["url", "website", "firecrawl", "pdf_url", "csv_url", "youtube", "web_search"]
|
|
232
|
+
|
|
233
|
+
# Sort with URL readers in priority order, others alphabetically
|
|
234
|
+
def sort_key(reader_key):
|
|
235
|
+
if reader_key in url_reader_priority:
|
|
236
|
+
return (0, url_reader_priority.index(reader_key))
|
|
237
|
+
else:
|
|
238
|
+
return (1, reader_key)
|
|
239
|
+
|
|
240
|
+
reader_keys.sort(key=sort_key)
|
|
248
241
|
return reader_keys
|
|
249
242
|
|
|
250
243
|
@classmethod
|