agno 2.0.0a1__py3-none-any.whl → 2.0.0rc2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- agno/agent/agent.py +416 -41
- agno/api/agent.py +2 -2
- agno/api/evals.py +2 -2
- agno/api/os.py +1 -1
- agno/api/settings.py +2 -2
- agno/api/team.py +2 -2
- agno/db/dynamo/dynamo.py +0 -6
- agno/db/firestore/firestore.py +0 -6
- agno/db/in_memory/in_memory_db.py +0 -6
- agno/db/json/json_db.py +0 -6
- agno/db/mongo/mongo.py +8 -9
- agno/db/mysql/utils.py +0 -1
- agno/db/postgres/postgres.py +0 -10
- agno/db/postgres/utils.py +0 -1
- agno/db/redis/redis.py +0 -4
- agno/db/singlestore/singlestore.py +0 -10
- agno/db/singlestore/utils.py +0 -1
- agno/db/sqlite/sqlite.py +0 -4
- agno/db/sqlite/utils.py +0 -1
- agno/eval/accuracy.py +12 -5
- agno/integrations/discord/client.py +5 -1
- agno/knowledge/chunking/strategy.py +14 -14
- agno/knowledge/embedder/aws_bedrock.py +2 -2
- agno/knowledge/knowledge.py +156 -120
- agno/knowledge/reader/arxiv_reader.py +5 -5
- agno/knowledge/reader/csv_reader.py +6 -77
- agno/knowledge/reader/docx_reader.py +5 -5
- agno/knowledge/reader/firecrawl_reader.py +5 -5
- agno/knowledge/reader/json_reader.py +5 -5
- agno/knowledge/reader/markdown_reader.py +31 -9
- agno/knowledge/reader/pdf_reader.py +10 -123
- agno/knowledge/reader/reader_factory.py +65 -72
- agno/knowledge/reader/s3_reader.py +44 -114
- agno/knowledge/reader/text_reader.py +5 -5
- agno/knowledge/reader/url_reader.py +75 -31
- agno/knowledge/reader/web_search_reader.py +6 -29
- agno/knowledge/reader/website_reader.py +5 -5
- agno/knowledge/reader/wikipedia_reader.py +5 -5
- agno/knowledge/reader/youtube_reader.py +6 -6
- agno/knowledge/utils.py +10 -10
- agno/models/anthropic/claude.py +2 -49
- agno/models/aws/bedrock.py +3 -7
- agno/models/base.py +37 -6
- agno/models/message.py +7 -6
- agno/os/app.py +168 -64
- agno/os/interfaces/agui/agui.py +1 -1
- agno/os/interfaces/agui/utils.py +16 -9
- agno/os/interfaces/slack/slack.py +2 -3
- agno/os/interfaces/whatsapp/whatsapp.py +2 -3
- agno/os/mcp.py +235 -0
- agno/os/router.py +576 -19
- agno/os/routers/evals/evals.py +201 -12
- agno/os/routers/knowledge/knowledge.py +455 -18
- agno/os/routers/memory/memory.py +260 -29
- agno/os/routers/metrics/metrics.py +127 -7
- agno/os/routers/session/session.py +398 -25
- agno/os/schema.py +55 -2
- agno/os/settings.py +0 -1
- agno/run/agent.py +96 -2
- agno/run/cancel.py +0 -2
- agno/run/team.py +93 -2
- agno/run/workflow.py +25 -12
- agno/team/team.py +863 -1053
- agno/tools/function.py +65 -7
- agno/tools/linear.py +1 -1
- agno/tools/mcp.py +1 -2
- agno/utils/gemini.py +31 -1
- agno/utils/log.py +52 -2
- agno/utils/mcp.py +55 -3
- agno/utils/models/claude.py +41 -0
- agno/utils/print_response/team.py +177 -73
- agno/utils/streamlit.py +481 -0
- agno/workflow/workflow.py +17 -1
- {agno-2.0.0a1.dist-info → agno-2.0.0rc2.dist-info}/METADATA +1 -1
- {agno-2.0.0a1.dist-info → agno-2.0.0rc2.dist-info}/RECORD +78 -77
- agno/knowledge/reader/gcs_reader.py +0 -67
- {agno-2.0.0a1.dist-info → agno-2.0.0rc2.dist-info}/WHEEL +0 -0
- {agno-2.0.0a1.dist-info → agno-2.0.0rc2.dist-info}/licenses/LICENSE +0 -0
- {agno-2.0.0a1.dist-info → agno-2.0.0rc2.dist-info}/top_level.txt +0 -0
|
@@ -3,13 +3,26 @@ import uuid
|
|
|
3
3
|
from pathlib import Path
|
|
4
4
|
from typing import IO, Any, List, Optional, Union
|
|
5
5
|
|
|
6
|
-
from agno.knowledge.chunking.markdown import MarkdownChunking
|
|
7
6
|
from agno.knowledge.chunking.strategy import ChunkingStrategy, ChunkingStrategyType
|
|
8
7
|
from agno.knowledge.document.base import Document
|
|
9
8
|
from agno.knowledge.reader.base import Reader
|
|
10
9
|
from agno.knowledge.types import ContentType
|
|
11
10
|
from agno.utils.log import log_info, logger
|
|
12
11
|
|
|
12
|
+
DEFAULT_CHUNKER_STRATEGY: ChunkingStrategy
|
|
13
|
+
|
|
14
|
+
# Try to import MarkdownChunking, fallback to FixedSizeChunking if not available
|
|
15
|
+
try:
|
|
16
|
+
from agno.knowledge.chunking.markdown import MarkdownChunking
|
|
17
|
+
|
|
18
|
+
DEFAULT_CHUNKER_STRATEGY = MarkdownChunking()
|
|
19
|
+
MARKDOWN_CHUNKER_AVAILABLE = True
|
|
20
|
+
except ImportError:
|
|
21
|
+
from agno.knowledge.chunking.fixed import FixedSizeChunking
|
|
22
|
+
|
|
23
|
+
DEFAULT_CHUNKER_STRATEGY = FixedSizeChunking()
|
|
24
|
+
MARKDOWN_CHUNKER_AVAILABLE = False
|
|
25
|
+
|
|
13
26
|
|
|
14
27
|
class MarkdownReader(Reader):
|
|
15
28
|
"""Reader for Markdown files"""
|
|
@@ -17,25 +30,34 @@ class MarkdownReader(Reader):
|
|
|
17
30
|
@classmethod
|
|
18
31
|
def get_supported_chunking_strategies(self) -> List[ChunkingStrategyType]:
|
|
19
32
|
"""Get the list of supported chunking strategies for Markdown readers."""
|
|
20
|
-
|
|
21
|
-
ChunkingStrategyType.
|
|
22
|
-
ChunkingStrategyType.
|
|
23
|
-
ChunkingStrategyType.
|
|
24
|
-
ChunkingStrategyType.
|
|
25
|
-
ChunkingStrategyType.
|
|
26
|
-
ChunkingStrategyType.FIXED_SIZE_CHUNKING,
|
|
33
|
+
strategies = [
|
|
34
|
+
ChunkingStrategyType.DOCUMENT_CHUNKER,
|
|
35
|
+
ChunkingStrategyType.AGENTIC_CHUNKER,
|
|
36
|
+
ChunkingStrategyType.RECURSIVE_CHUNKER,
|
|
37
|
+
ChunkingStrategyType.SEMANTIC_CHUNKER,
|
|
38
|
+
ChunkingStrategyType.FIXED_SIZE_CHUNKER,
|
|
27
39
|
]
|
|
28
40
|
|
|
41
|
+
# Only include MarkdownChunking if it's available
|
|
42
|
+
if MARKDOWN_CHUNKER_AVAILABLE:
|
|
43
|
+
strategies.insert(0, ChunkingStrategyType.MARKDOWN_CHUNKER)
|
|
44
|
+
|
|
45
|
+
return strategies
|
|
46
|
+
|
|
29
47
|
@classmethod
|
|
30
48
|
def get_supported_content_types(self) -> List[ContentType]:
|
|
31
49
|
return [ContentType.MARKDOWN]
|
|
32
50
|
|
|
33
51
|
def __init__(
|
|
34
52
|
self,
|
|
35
|
-
chunking_strategy: Optional[ChunkingStrategy] =
|
|
53
|
+
chunking_strategy: Optional[ChunkingStrategy] = None,
|
|
36
54
|
name: Optional[str] = None,
|
|
37
55
|
description: Optional[str] = None,
|
|
38
56
|
) -> None:
|
|
57
|
+
# Use the default chunking strategy if none provided
|
|
58
|
+
if chunking_strategy is None:
|
|
59
|
+
chunking_strategy = DEFAULT_CHUNKER_STRATEGY
|
|
60
|
+
|
|
39
61
|
super().__init__(chunking_strategy=chunking_strategy, name=name, description=description)
|
|
40
62
|
|
|
41
63
|
def read(self, file: Union[Path, IO[Any]], name: Optional[str] = None) -> List[Document]:
|
|
@@ -8,7 +8,6 @@ from agno.knowledge.chunking.strategy import ChunkingStrategyType
|
|
|
8
8
|
from agno.knowledge.document.base import Document
|
|
9
9
|
from agno.knowledge.reader.base import Reader
|
|
10
10
|
from agno.knowledge.types import ContentType
|
|
11
|
-
from agno.utils.http import async_fetch_with_retry, fetch_with_retry
|
|
12
11
|
from agno.utils.log import log_error, log_info, logger
|
|
13
12
|
|
|
14
13
|
try:
|
|
@@ -202,11 +201,11 @@ class BasePDFReader(Reader):
|
|
|
202
201
|
def get_supported_chunking_strategies(self) -> List[ChunkingStrategyType]:
|
|
203
202
|
"""Get the list of supported chunking strategies for PDF readers."""
|
|
204
203
|
return [
|
|
205
|
-
ChunkingStrategyType.
|
|
206
|
-
ChunkingStrategyType.
|
|
207
|
-
ChunkingStrategyType.
|
|
208
|
-
ChunkingStrategyType.
|
|
209
|
-
ChunkingStrategyType.
|
|
204
|
+
ChunkingStrategyType.DOCUMENT_CHUNKER,
|
|
205
|
+
ChunkingStrategyType.FIXED_SIZE_CHUNKER,
|
|
206
|
+
ChunkingStrategyType.AGENTIC_CHUNKER,
|
|
207
|
+
ChunkingStrategyType.SEMANTIC_CHUNKER,
|
|
208
|
+
ChunkingStrategyType.RECURSIVE_CHUNKER,
|
|
210
209
|
]
|
|
211
210
|
|
|
212
211
|
def _build_chunked_documents(self, documents: List[Document]) -> List[Document]:
|
|
@@ -222,19 +221,19 @@ class BasePDFReader(Reader):
|
|
|
222
221
|
# Use provided password or fall back to instance password
|
|
223
222
|
pdf_password = password or self.password
|
|
224
223
|
if not pdf_password:
|
|
225
|
-
logger.error(f
|
|
224
|
+
logger.error(f'PDF file "{doc_name}" is password protected but no password provided')
|
|
226
225
|
return False
|
|
227
226
|
|
|
228
227
|
try:
|
|
229
228
|
decrypted_pdf = doc_reader.decrypt(pdf_password)
|
|
230
229
|
if decrypted_pdf:
|
|
231
|
-
log_info(f
|
|
230
|
+
log_info(f'Successfully decrypted PDF file "{doc_name}" with user password')
|
|
232
231
|
return True
|
|
233
232
|
else:
|
|
234
|
-
log_error(f
|
|
233
|
+
log_error(f'Failed to decrypt PDF file "{doc_name}": incorrect password')
|
|
235
234
|
return False
|
|
236
235
|
except Exception as e:
|
|
237
|
-
log_error(f
|
|
236
|
+
log_error(f'Error decrypting PDF file "{doc_name}": {e}')
|
|
238
237
|
return False
|
|
239
238
|
|
|
240
239
|
def _create_documents(self, pdf_content: List[str], doc_name: str, use_uuid_for_id: bool, page_number_shift):
|
|
@@ -368,7 +367,7 @@ class PDFReader(BasePDFReader):
|
|
|
368
367
|
if not self._decrypt_pdf(pdf_reader, doc_name, password):
|
|
369
368
|
return []
|
|
370
369
|
|
|
371
|
-
# Read and chunk
|
|
370
|
+
# Read and chunk
|
|
372
371
|
return self._pdf_reader_to_documents(pdf_reader, doc_name, use_uuid_for_id=True)
|
|
373
372
|
|
|
374
373
|
async def async_read(
|
|
@@ -405,63 +404,6 @@ class PDFReader(BasePDFReader):
|
|
|
405
404
|
return await self._async_pdf_reader_to_documents(pdf_reader, doc_name, use_uuid_for_id=True)
|
|
406
405
|
|
|
407
406
|
|
|
408
|
-
class PDFUrlReader(BasePDFReader):
|
|
409
|
-
"""Reader for PDF files from URL"""
|
|
410
|
-
|
|
411
|
-
def __init__(self, proxy: Optional[str] = None, password: Optional[str] = None, **kwargs):
|
|
412
|
-
super().__init__(password=password, **kwargs)
|
|
413
|
-
self.proxy = proxy
|
|
414
|
-
|
|
415
|
-
@classmethod
|
|
416
|
-
def get_supported_content_types(self) -> List[ContentType]:
|
|
417
|
-
return [ContentType.URL]
|
|
418
|
-
|
|
419
|
-
def read(self, url: str, name: Optional[str] = None, password: Optional[str] = None) -> List[Document]:
|
|
420
|
-
if not url:
|
|
421
|
-
raise ValueError("No url provided")
|
|
422
|
-
|
|
423
|
-
from io import BytesIO
|
|
424
|
-
|
|
425
|
-
log_info(f"Reading: {url}")
|
|
426
|
-
|
|
427
|
-
# Retry the request up to 3 times with exponential backoff
|
|
428
|
-
response = fetch_with_retry(url, proxy=self.proxy)
|
|
429
|
-
|
|
430
|
-
doc_name = name or url.split("/")[-1].split(".")[0].replace("/", "_").replace(" ", "_")
|
|
431
|
-
pdf_reader = DocumentReader(BytesIO(response.content))
|
|
432
|
-
|
|
433
|
-
# Handle PDF decryption
|
|
434
|
-
if not self._decrypt_pdf(pdf_reader, doc_name, password):
|
|
435
|
-
return []
|
|
436
|
-
|
|
437
|
-
# Read and chunk.
|
|
438
|
-
return self._pdf_reader_to_documents(pdf_reader, doc_name, use_uuid_for_id=False)
|
|
439
|
-
|
|
440
|
-
async def async_read(self, url: str, name: Optional[str] = None, password: Optional[str] = None) -> List[Document]:
|
|
441
|
-
if not url:
|
|
442
|
-
raise ValueError("No url provided")
|
|
443
|
-
|
|
444
|
-
from io import BytesIO
|
|
445
|
-
|
|
446
|
-
import httpx
|
|
447
|
-
|
|
448
|
-
log_info(f"Reading: {url}")
|
|
449
|
-
|
|
450
|
-
client_args = {"proxy": self.proxy} if self.proxy else {}
|
|
451
|
-
async with httpx.AsyncClient(**client_args) as client: # type: ignore
|
|
452
|
-
response = await async_fetch_with_retry(url, client=client)
|
|
453
|
-
|
|
454
|
-
doc_name = name or url.split("/")[-1].split(".")[0].replace("/", "_").replace(" ", "_")
|
|
455
|
-
pdf_reader = DocumentReader(BytesIO(response.content))
|
|
456
|
-
|
|
457
|
-
# Handle PDF decryption
|
|
458
|
-
if not self._decrypt_pdf(pdf_reader, doc_name, password):
|
|
459
|
-
return []
|
|
460
|
-
|
|
461
|
-
# Read and chunk.
|
|
462
|
-
return await self._async_pdf_reader_to_documents(pdf_reader, doc_name, use_uuid_for_id=False)
|
|
463
|
-
|
|
464
|
-
|
|
465
407
|
class PDFImageReader(BasePDFReader):
|
|
466
408
|
"""Reader for PDF files with text and images extraction"""
|
|
467
409
|
|
|
@@ -512,58 +454,3 @@ class PDFImageReader(BasePDFReader):
|
|
|
512
454
|
|
|
513
455
|
# Read and chunk.
|
|
514
456
|
return await self._async_pdf_reader_to_documents(pdf_reader, doc_name, read_images=True, use_uuid_for_id=False)
|
|
515
|
-
|
|
516
|
-
|
|
517
|
-
class PDFUrlImageReader(BasePDFReader):
|
|
518
|
-
"""Reader for PDF files from URL with text and images extraction"""
|
|
519
|
-
|
|
520
|
-
def __init__(self, proxy: Optional[str] = None, password: Optional[str] = None, **kwargs):
|
|
521
|
-
super().__init__(password=password, **kwargs)
|
|
522
|
-
self.proxy = proxy
|
|
523
|
-
|
|
524
|
-
def read(self, url: str, name: Optional[str] = None, password: Optional[str] = None) -> List[Document]:
|
|
525
|
-
if not url:
|
|
526
|
-
raise ValueError("No url provided")
|
|
527
|
-
|
|
528
|
-
from io import BytesIO
|
|
529
|
-
|
|
530
|
-
import httpx
|
|
531
|
-
|
|
532
|
-
# Read the PDF from the URL
|
|
533
|
-
log_info(f"Reading: {url}")
|
|
534
|
-
response = httpx.get(url, proxy=self.proxy) if self.proxy else httpx.get(url)
|
|
535
|
-
|
|
536
|
-
doc_name = name or url.split("/")[-1].split(".")[0].replace(" ", "_")
|
|
537
|
-
pdf_reader = DocumentReader(BytesIO(response.content))
|
|
538
|
-
|
|
539
|
-
# Handle PDF decryption
|
|
540
|
-
if not self._decrypt_pdf(pdf_reader, doc_name, password):
|
|
541
|
-
return []
|
|
542
|
-
|
|
543
|
-
# Read and chunk.
|
|
544
|
-
return self._pdf_reader_to_documents(pdf_reader, doc_name, read_images=True, use_uuid_for_id=False)
|
|
545
|
-
|
|
546
|
-
async def async_read(self, url: str, name: Optional[str] = None, password: Optional[str] = None) -> List[Document]:
|
|
547
|
-
if not url:
|
|
548
|
-
raise ValueError("No url provided")
|
|
549
|
-
|
|
550
|
-
from io import BytesIO
|
|
551
|
-
|
|
552
|
-
import httpx
|
|
553
|
-
|
|
554
|
-
log_info(f"Reading: {url}")
|
|
555
|
-
|
|
556
|
-
client_args = {"proxy": self.proxy} if self.proxy else {}
|
|
557
|
-
async with httpx.AsyncClient(**client_args) as client: # type: ignore
|
|
558
|
-
response = await client.get(url)
|
|
559
|
-
response.raise_for_status()
|
|
560
|
-
|
|
561
|
-
doc_name = name or url.split("/")[-1].split(".")[0].replace(" ", "_")
|
|
562
|
-
pdf_reader = DocumentReader(BytesIO(response.content))
|
|
563
|
-
|
|
564
|
-
# Handle PDF decryption
|
|
565
|
-
if not self._decrypt_pdf(pdf_reader, doc_name, password):
|
|
566
|
-
return []
|
|
567
|
-
|
|
568
|
-
# Read and chunk.
|
|
569
|
-
return await self._async_pdf_reader_to_documents(pdf_reader, doc_name, read_images=True, use_uuid_for_id=False)
|
|
@@ -15,7 +15,11 @@ class ReaderFactory:
|
|
|
15
15
|
"""Get PDF reader instance."""
|
|
16
16
|
from agno.knowledge.reader.pdf_reader import PDFReader
|
|
17
17
|
|
|
18
|
-
config: Dict[str, Any] = {
|
|
18
|
+
config: Dict[str, Any] = {
|
|
19
|
+
"chunk": True,
|
|
20
|
+
"chunk_size": 100,
|
|
21
|
+
"description": "Processes PDF documents with OCR support for images and text extraction",
|
|
22
|
+
}
|
|
19
23
|
config.update(kwargs)
|
|
20
24
|
return PDFReader(**config)
|
|
21
25
|
|
|
@@ -24,7 +28,10 @@ class ReaderFactory:
|
|
|
24
28
|
"""Get CSV reader instance."""
|
|
25
29
|
from agno.knowledge.reader.csv_reader import CSVReader
|
|
26
30
|
|
|
27
|
-
config: Dict[str, Any] = {
|
|
31
|
+
config: Dict[str, Any] = {
|
|
32
|
+
"name": "CSV Reader",
|
|
33
|
+
"description": "Parses CSV, XLSX, and XLS files with custom delimiter support",
|
|
34
|
+
}
|
|
28
35
|
config.update(kwargs)
|
|
29
36
|
return CSVReader(**config)
|
|
30
37
|
|
|
@@ -33,7 +40,10 @@ class ReaderFactory:
|
|
|
33
40
|
"""Get Docx reader instance."""
|
|
34
41
|
from agno.knowledge.reader.docx_reader import DocxReader
|
|
35
42
|
|
|
36
|
-
config: Dict[str, Any] = {
|
|
43
|
+
config: Dict[str, Any] = {
|
|
44
|
+
"name": "Docx Reader",
|
|
45
|
+
"description": "Extracts text content from Microsoft Word documents (.docx and .doc formats)",
|
|
46
|
+
}
|
|
37
47
|
config.update(kwargs)
|
|
38
48
|
return DocxReader(**config)
|
|
39
49
|
|
|
@@ -42,7 +52,10 @@ class ReaderFactory:
|
|
|
42
52
|
"""Get JSON reader instance."""
|
|
43
53
|
from agno.knowledge.reader.json_reader import JSONReader
|
|
44
54
|
|
|
45
|
-
config: Dict[str, Any] = {
|
|
55
|
+
config: Dict[str, Any] = {
|
|
56
|
+
"name": "JSON Reader",
|
|
57
|
+
"description": "Processes JSON data structures and API responses with nested object handling",
|
|
58
|
+
}
|
|
46
59
|
config.update(kwargs)
|
|
47
60
|
return JSONReader(**config)
|
|
48
61
|
|
|
@@ -51,7 +64,10 @@ class ReaderFactory:
|
|
|
51
64
|
"""Get Markdown reader instance."""
|
|
52
65
|
from agno.knowledge.reader.markdown_reader import MarkdownReader
|
|
53
66
|
|
|
54
|
-
config: Dict[str, Any] = {
|
|
67
|
+
config: Dict[str, Any] = {
|
|
68
|
+
"name": "Markdown Reader",
|
|
69
|
+
"description": "Processes Markdown documentation with header-aware chunking and formatting preservation",
|
|
70
|
+
}
|
|
55
71
|
config.update(kwargs)
|
|
56
72
|
return MarkdownReader(**config)
|
|
57
73
|
|
|
@@ -60,25 +76,22 @@ class ReaderFactory:
|
|
|
60
76
|
"""Get Text reader instance."""
|
|
61
77
|
from agno.knowledge.reader.text_reader import TextReader
|
|
62
78
|
|
|
63
|
-
config: Dict[str, Any] = {
|
|
79
|
+
config: Dict[str, Any] = {
|
|
80
|
+
"name": "Text Reader",
|
|
81
|
+
"description": "Handles plain text files with customizable chunking strategies and encoding detection",
|
|
82
|
+
}
|
|
64
83
|
config.update(kwargs)
|
|
65
84
|
return TextReader(**config)
|
|
66
85
|
|
|
67
|
-
@classmethod
|
|
68
|
-
def _get_url_reader(cls, **kwargs) -> Reader:
|
|
69
|
-
"""Get URL reader instance."""
|
|
70
|
-
from agno.knowledge.reader.url_reader import URLReader
|
|
71
|
-
|
|
72
|
-
config: Dict[str, Any] = {"name": "URL Reader", "description": "Reads URLs"}
|
|
73
|
-
config.update(kwargs)
|
|
74
|
-
return URLReader(**config)
|
|
75
|
-
|
|
76
86
|
@classmethod
|
|
77
87
|
def _get_website_reader(cls, **kwargs) -> Reader:
|
|
78
88
|
"""Get Website reader instance."""
|
|
79
89
|
from agno.knowledge.reader.website_reader import WebsiteReader
|
|
80
90
|
|
|
81
|
-
config: Dict[str, Any] = {
|
|
91
|
+
config: Dict[str, Any] = {
|
|
92
|
+
"name": "Website Reader",
|
|
93
|
+
"description": "Scrapes and extracts content from web pages with HTML parsing and text cleaning",
|
|
94
|
+
}
|
|
82
95
|
config.update(kwargs)
|
|
83
96
|
return WebsiteReader(**config)
|
|
84
97
|
|
|
@@ -91,7 +104,7 @@ class ReaderFactory:
|
|
|
91
104
|
"api_key": kwargs.get("api_key") or os.getenv("FIRECRAWL_API_KEY"),
|
|
92
105
|
"mode": "crawl",
|
|
93
106
|
"name": "Firecrawl Reader",
|
|
94
|
-
"description": "
|
|
107
|
+
"description": "Advanced web scraping and crawling with JavaScript rendering and structured data extraction",
|
|
95
108
|
}
|
|
96
109
|
config.update(kwargs)
|
|
97
110
|
return FirecrawlReader(**config)
|
|
@@ -101,52 +114,22 @@ class ReaderFactory:
|
|
|
101
114
|
"""Get YouTube reader instance."""
|
|
102
115
|
from agno.knowledge.reader.youtube_reader import YouTubeReader
|
|
103
116
|
|
|
104
|
-
config: Dict[str, Any] = {
|
|
117
|
+
config: Dict[str, Any] = {
|
|
118
|
+
"name": "YouTube Reader",
|
|
119
|
+
"description": "Extracts transcripts and metadata from YouTube videos and playlists",
|
|
120
|
+
}
|
|
105
121
|
config.update(kwargs)
|
|
106
122
|
return YouTubeReader(**config)
|
|
107
123
|
|
|
108
|
-
@classmethod
|
|
109
|
-
def _get_pdf_url_reader(cls, **kwargs) -> Reader:
|
|
110
|
-
"""Get PDF URL reader instance."""
|
|
111
|
-
from agno.knowledge.reader.pdf_reader import PDFUrlReader
|
|
112
|
-
|
|
113
|
-
config: Dict[str, Any] = {"name": "PDF URL Reader", "description": "Reads PDF URLs"}
|
|
114
|
-
config.update(kwargs)
|
|
115
|
-
return PDFUrlReader(**config)
|
|
116
|
-
|
|
117
|
-
@classmethod
|
|
118
|
-
def _get_csv_url_reader(cls, **kwargs) -> Reader:
|
|
119
|
-
"""Get CSV URL reader instance."""
|
|
120
|
-
from agno.knowledge.reader.csv_reader import CSVUrlReader
|
|
121
|
-
|
|
122
|
-
config: Dict[str, Any] = {"name": "CSV URL Reader", "description": "Reads CSV URLs"}
|
|
123
|
-
config.update(kwargs)
|
|
124
|
-
return CSVUrlReader(**config)
|
|
125
|
-
|
|
126
|
-
@classmethod
|
|
127
|
-
def _get_s3_reader(cls, **kwargs) -> Reader:
|
|
128
|
-
"""Get S3 reader instance."""
|
|
129
|
-
from agno.knowledge.reader.s3_reader import S3Reader
|
|
130
|
-
|
|
131
|
-
config: Dict[str, Any] = {"name": "S3 Reader", "description": "Reads S3 files"}
|
|
132
|
-
config.update(kwargs)
|
|
133
|
-
return S3Reader(**config)
|
|
134
|
-
|
|
135
|
-
@classmethod
|
|
136
|
-
def _get_gcs_reader(cls, **kwargs) -> Reader:
|
|
137
|
-
"""Get GCS reader instance."""
|
|
138
|
-
from agno.knowledge.reader.gcs_reader import GCSReader
|
|
139
|
-
|
|
140
|
-
config: Dict[str, Any] = {"name": "GCS Reader", "description": "Reads GCS files"}
|
|
141
|
-
config.update(kwargs)
|
|
142
|
-
return GCSReader(**config)
|
|
143
|
-
|
|
144
124
|
@classmethod
|
|
145
125
|
def _get_arxiv_reader(cls, **kwargs) -> Reader:
|
|
146
126
|
"""Get Arxiv reader instance."""
|
|
147
127
|
from agno.knowledge.reader.arxiv_reader import ArxivReader
|
|
148
128
|
|
|
149
|
-
config: Dict[str, Any] = {
|
|
129
|
+
config: Dict[str, Any] = {
|
|
130
|
+
"name": "Arxiv Reader",
|
|
131
|
+
"description": "Downloads and processes academic papers from ArXiv with PDF parsing and metadata extraction",
|
|
132
|
+
}
|
|
150
133
|
config.update(kwargs)
|
|
151
134
|
return ArxivReader(**config)
|
|
152
135
|
|
|
@@ -155,7 +138,10 @@ class ReaderFactory:
|
|
|
155
138
|
"""Get Wikipedia reader instance."""
|
|
156
139
|
from agno.knowledge.reader.wikipedia_reader import WikipediaReader
|
|
157
140
|
|
|
158
|
-
config: Dict[str, Any] = {
|
|
141
|
+
config: Dict[str, Any] = {
|
|
142
|
+
"name": "Wikipedia Reader",
|
|
143
|
+
"description": "Fetches and processes Wikipedia articles with section-aware chunking and link resolution",
|
|
144
|
+
}
|
|
159
145
|
config.update(kwargs)
|
|
160
146
|
return WikipediaReader(**config)
|
|
161
147
|
|
|
@@ -164,7 +150,10 @@ class ReaderFactory:
|
|
|
164
150
|
"""Get Web Search reader instance."""
|
|
165
151
|
from agno.knowledge.reader.web_search_reader import WebSearchReader
|
|
166
152
|
|
|
167
|
-
config: Dict[str, Any] = {
|
|
153
|
+
config: Dict[str, Any] = {
|
|
154
|
+
"name": "Web Search Reader",
|
|
155
|
+
"description": "Executes web searches and processes results with relevance ranking and content extraction",
|
|
156
|
+
}
|
|
168
157
|
config.update(kwargs)
|
|
169
158
|
return WebSearchReader(**config)
|
|
170
159
|
|
|
@@ -224,27 +213,31 @@ class ReaderFactory:
|
|
|
224
213
|
# Default to URL reader
|
|
225
214
|
return cls.create_reader("url")
|
|
226
215
|
|
|
227
|
-
@classmethod
|
|
228
|
-
def get_reader_for_url_file(cls, extension: str) -> Reader:
|
|
229
|
-
"""Get the appropriate reader for a URL file extension."""
|
|
230
|
-
extension = extension.lower()
|
|
231
|
-
|
|
232
|
-
if extension == ".pdf":
|
|
233
|
-
return cls.create_reader("pdf_url")
|
|
234
|
-
elif extension == ".csv":
|
|
235
|
-
return cls.create_reader("csv_url")
|
|
236
|
-
else:
|
|
237
|
-
return cls.create_reader("url")
|
|
238
|
-
|
|
239
216
|
@classmethod
|
|
240
217
|
def get_all_reader_keys(cls) -> List[str]:
|
|
241
|
-
"""Get all available reader keys."""
|
|
218
|
+
"""Get all available reader keys in priority order."""
|
|
242
219
|
# Extract reader keys from method names
|
|
220
|
+
|
|
221
|
+
PREFIX = "_get_"
|
|
222
|
+
SUFFIX = "_reader"
|
|
223
|
+
|
|
243
224
|
reader_keys = []
|
|
244
225
|
for attr_name in dir(cls):
|
|
245
|
-
if attr_name.startswith(
|
|
246
|
-
reader_key = attr_name[
|
|
226
|
+
if attr_name.startswith(PREFIX) and attr_name.endswith(SUFFIX):
|
|
227
|
+
reader_key = attr_name[len(PREFIX) : -len(SUFFIX)] # Remove "_get_" prefix and "_reader" suffix
|
|
247
228
|
reader_keys.append(reader_key)
|
|
229
|
+
|
|
230
|
+
# Define priority order for URL readers
|
|
231
|
+
url_reader_priority = ["url", "website", "firecrawl", "pdf_url", "csv_url", "youtube", "web_search"]
|
|
232
|
+
|
|
233
|
+
# Sort with URL readers in priority order, others alphabetically
|
|
234
|
+
def sort_key(reader_key):
|
|
235
|
+
if reader_key in url_reader_priority:
|
|
236
|
+
return (0, url_reader_priority.index(reader_key))
|
|
237
|
+
else:
|
|
238
|
+
return (1, reader_key)
|
|
239
|
+
|
|
240
|
+
reader_keys.sort(key=sort_key)
|
|
248
241
|
return reader_keys
|
|
249
242
|
|
|
250
243
|
@classmethod
|
|
@@ -2,14 +2,15 @@ import asyncio
|
|
|
2
2
|
from io import BytesIO
|
|
3
3
|
from pathlib import Path
|
|
4
4
|
from typing import List, Optional
|
|
5
|
-
from uuid import uuid4
|
|
6
5
|
|
|
7
6
|
from agno.knowledge.chunking.fixed import FixedSizeChunking
|
|
8
7
|
from agno.knowledge.chunking.strategy import ChunkingStrategy, ChunkingStrategyType
|
|
9
8
|
from agno.knowledge.document.base import Document
|
|
10
9
|
from agno.knowledge.reader.base import Reader
|
|
10
|
+
from agno.knowledge.reader.pdf_reader import PDFReader
|
|
11
|
+
from agno.knowledge.reader.text_reader import TextReader
|
|
11
12
|
from agno.knowledge.types import ContentType
|
|
12
|
-
from agno.utils.log import
|
|
13
|
+
from agno.utils.log import log_info, logger
|
|
13
14
|
|
|
14
15
|
try:
|
|
15
16
|
from agno.aws.resource.s3.object import S3Object # type: ignore
|
|
@@ -37,11 +38,11 @@ class S3Reader(Reader):
|
|
|
37
38
|
def get_supported_chunking_strategies(self) -> List[ChunkingStrategyType]:
|
|
38
39
|
"""Get the list of supported chunking strategies for S3 readers."""
|
|
39
40
|
return [
|
|
40
|
-
ChunkingStrategyType.
|
|
41
|
-
ChunkingStrategyType.
|
|
42
|
-
ChunkingStrategyType.
|
|
43
|
-
ChunkingStrategyType.
|
|
44
|
-
ChunkingStrategyType.
|
|
41
|
+
ChunkingStrategyType.FIXED_SIZE_CHUNKER,
|
|
42
|
+
ChunkingStrategyType.AGENTIC_CHUNKER,
|
|
43
|
+
ChunkingStrategyType.DOCUMENT_CHUNKER,
|
|
44
|
+
ChunkingStrategyType.RECURSIVE_CHUNKER,
|
|
45
|
+
ChunkingStrategyType.SEMANTIC_CHUNKER,
|
|
45
46
|
]
|
|
46
47
|
|
|
47
48
|
@classmethod
|
|
@@ -52,120 +53,49 @@ class S3Reader(Reader):
|
|
|
52
53
|
try:
|
|
53
54
|
log_info(f"Reading S3 file: {s3_object.uri}")
|
|
54
55
|
|
|
56
|
+
# Read PDF files
|
|
55
57
|
if s3_object.uri.endswith(".pdf"):
|
|
56
|
-
|
|
58
|
+
object_resource = s3_object.get_resource()
|
|
59
|
+
object_body = object_resource.get()["Body"]
|
|
60
|
+
doc_name = (
|
|
61
|
+
s3_object.name.split("/")[-1].split(".")[0].replace("/", "_").replace(" ", "_")
|
|
62
|
+
if name is None
|
|
63
|
+
else name
|
|
64
|
+
)
|
|
65
|
+
return PDFReader().read(pdf=BytesIO(object_body.read()), name=doc_name)
|
|
66
|
+
|
|
67
|
+
# Read text files
|
|
57
68
|
else:
|
|
58
|
-
|
|
69
|
+
doc_name = (
|
|
70
|
+
s3_object.name.split("/")[-1].split(".")[0].replace("/", "_").replace(" ", "_")
|
|
71
|
+
if name is None
|
|
72
|
+
else name
|
|
73
|
+
)
|
|
74
|
+
obj_name = s3_object.name.split("/")[-1]
|
|
75
|
+
temporary_file = Path("storage").joinpath(obj_name)
|
|
76
|
+
s3_object.download(temporary_file)
|
|
77
|
+
|
|
78
|
+
# TODO: Before we were using textract here. Needed?
|
|
79
|
+
# s3_object.download(temporary_file)
|
|
80
|
+
# doc_content = textract.process(temporary_file)
|
|
81
|
+
# documents = [
|
|
82
|
+
# Document(
|
|
83
|
+
# name=doc_name,
|
|
84
|
+
# id=doc_name,
|
|
85
|
+
# content=doc_content.decode("utf-8"),
|
|
86
|
+
# )
|
|
87
|
+
# ]
|
|
88
|
+
|
|
89
|
+
documents = TextReader().read(file=temporary_file, name=doc_name)
|
|
90
|
+
|
|
91
|
+
temporary_file.unlink()
|
|
92
|
+
return documents
|
|
59
93
|
|
|
60
94
|
except Exception as e:
|
|
61
95
|
logger.error(f"Error reading: {s3_object.uri}: {e}")
|
|
62
|
-
return []
|
|
63
|
-
|
|
64
|
-
async def async_read(self, name: Optional[str], s3_object: S3Object) -> List[Document]:
|
|
65
|
-
"""Asynchronously read S3 files by running the synchronous read operation in a thread."""
|
|
66
|
-
return await asyncio.to_thread(self.read, name, s3_object)
|
|
67
|
-
|
|
68
|
-
|
|
69
|
-
class S3TextReader(Reader):
|
|
70
|
-
"""Reader for text files on S3"""
|
|
71
96
|
|
|
72
|
-
def get_supported_chunking_strategies(self) -> List[ChunkingStrategyType]:
|
|
73
|
-
"""Get the list of supported chunking strategies for S3 text readers."""
|
|
74
|
-
return [
|
|
75
|
-
ChunkingStrategyType.AGENTIC_CHUNKING,
|
|
76
|
-
ChunkingStrategyType.DOCUMENT_CHUNKING,
|
|
77
|
-
ChunkingStrategyType.RECURSIVE_CHUNKING,
|
|
78
|
-
]
|
|
79
|
-
|
|
80
|
-
def get_supported_content_types(self) -> List[ContentType]:
|
|
81
|
-
return [ContentType.TEXT]
|
|
82
|
-
|
|
83
|
-
def read(self, name: Optional[str], s3_object: S3Object) -> List[Document]:
|
|
84
|
-
try:
|
|
85
|
-
log_info(f"Reading text file: {s3_object.uri}")
|
|
86
|
-
|
|
87
|
-
doc_name = s3_object.name.split("/")[-1].split(".")[0].replace("/", "_").replace(" ", "_")
|
|
88
|
-
if name is not None:
|
|
89
|
-
doc_name = name
|
|
90
|
-
obj_name = s3_object.name.split("/")[-1]
|
|
91
|
-
temporary_file = Path("storage").joinpath(obj_name)
|
|
92
|
-
s3_object.download(temporary_file)
|
|
93
|
-
|
|
94
|
-
log_info(f"Parsing: {temporary_file}")
|
|
95
|
-
doc_content = textract.process(temporary_file)
|
|
96
|
-
documents = [
|
|
97
|
-
Document(
|
|
98
|
-
name=doc_name,
|
|
99
|
-
id=doc_name,
|
|
100
|
-
content=doc_content.decode("utf-8"),
|
|
101
|
-
)
|
|
102
|
-
]
|
|
103
|
-
if self.chunk:
|
|
104
|
-
chunked_documents = []
|
|
105
|
-
for document in documents:
|
|
106
|
-
chunked_documents.extend(self.chunk_document(document))
|
|
107
|
-
return chunked_documents
|
|
108
|
-
|
|
109
|
-
log_debug(f"Deleting: {temporary_file}")
|
|
110
|
-
temporary_file.unlink()
|
|
111
|
-
return documents
|
|
112
|
-
except Exception as e:
|
|
113
|
-
logger.error(f"Error reading: {s3_object.uri}: {e}")
|
|
114
97
|
return []
|
|
115
98
|
|
|
116
99
|
async def async_read(self, name: Optional[str], s3_object: S3Object) -> List[Document]:
|
|
117
|
-
"""Asynchronously read
|
|
118
|
-
|
|
119
|
-
Args:
|
|
120
|
-
s3_object (S3Object): The S3 object to read
|
|
121
|
-
|
|
122
|
-
Returns:
|
|
123
|
-
List[Document]: List of documents from the text file
|
|
124
|
-
"""
|
|
125
|
-
return await asyncio.to_thread(self.read, name, s3_object)
|
|
126
|
-
|
|
127
|
-
|
|
128
|
-
class S3PDFReader(Reader):
|
|
129
|
-
"""Reader for PDF files on S3"""
|
|
130
|
-
|
|
131
|
-
def get_supported_content_types(self) -> List[ContentType]:
|
|
132
|
-
return [ContentType.FILE]
|
|
133
|
-
|
|
134
|
-
def read(self, name: Optional[str], s3_object: S3Object) -> List[Document]:
|
|
135
|
-
try:
|
|
136
|
-
log_info(f"Reading PDF file: {s3_object.uri}")
|
|
137
|
-
|
|
138
|
-
object_resource = s3_object.get_resource()
|
|
139
|
-
object_body = object_resource.get()["Body"]
|
|
140
|
-
doc_name = s3_object.name.split("/")[-1].split(".")[0].replace("/", "_").replace(" ", "_")
|
|
141
|
-
if name is not None:
|
|
142
|
-
doc_name = name
|
|
143
|
-
doc_reader = DocumentReader(BytesIO(object_body.read()))
|
|
144
|
-
documents = [
|
|
145
|
-
Document(
|
|
146
|
-
name=doc_name,
|
|
147
|
-
id=str(uuid4()),
|
|
148
|
-
meta_data={"page": page_number},
|
|
149
|
-
content=page.extract_text(),
|
|
150
|
-
)
|
|
151
|
-
for page_number, page in enumerate(doc_reader.pages, start=1)
|
|
152
|
-
]
|
|
153
|
-
if self.chunk:
|
|
154
|
-
chunked_documents = []
|
|
155
|
-
for document in documents:
|
|
156
|
-
chunked_documents.extend(self.chunk_document(document))
|
|
157
|
-
return chunked_documents
|
|
158
|
-
return documents
|
|
159
|
-
except Exception:
|
|
160
|
-
raise
|
|
161
|
-
|
|
162
|
-
async def async_read(self, name: Optional[str], s3_object: S3Object) -> List[Document]:
|
|
163
|
-
"""Asynchronously read PDF files from S3 by running the synchronous read operation in a thread.
|
|
164
|
-
|
|
165
|
-
Args:
|
|
166
|
-
s3_object (S3Object): The S3 object to read
|
|
167
|
-
|
|
168
|
-
Returns:
|
|
169
|
-
List[Document]: List of documents from the PDF file
|
|
170
|
-
"""
|
|
100
|
+
"""Asynchronously read S3 files by running the synchronous read operation in a thread."""
|
|
171
101
|
return await asyncio.to_thread(self.read, name, s3_object)
|