agno-2.2.0-py3-none-any.whl → agno-2.2.2-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- agno/agent/agent.py +751 -575
- agno/culture/manager.py +22 -24
- agno/db/async_postgres/__init__.py +1 -1
- agno/db/dynamo/dynamo.py +0 -2
- agno/db/firestore/firestore.py +0 -2
- agno/db/gcs_json/gcs_json_db.py +0 -4
- agno/db/gcs_json/utils.py +0 -24
- agno/db/in_memory/in_memory_db.py +0 -3
- agno/db/json/json_db.py +4 -10
- agno/db/json/utils.py +0 -24
- agno/db/mongo/mongo.py +0 -2
- agno/db/mysql/mysql.py +0 -3
- agno/db/postgres/__init__.py +1 -1
- agno/db/{async_postgres → postgres}/async_postgres.py +19 -22
- agno/db/postgres/postgres.py +7 -10
- agno/db/postgres/utils.py +106 -2
- agno/db/redis/redis.py +0 -2
- agno/db/singlestore/singlestore.py +0 -3
- agno/db/sqlite/__init__.py +2 -1
- agno/db/sqlite/async_sqlite.py +2269 -0
- agno/db/sqlite/sqlite.py +0 -2
- agno/db/sqlite/utils.py +96 -0
- agno/db/surrealdb/surrealdb.py +0 -6
- agno/knowledge/knowledge.py +14 -3
- agno/knowledge/reader/pptx_reader.py +101 -0
- agno/knowledge/reader/reader_factory.py +30 -0
- agno/knowledge/reader/tavily_reader.py +194 -0
- agno/knowledge/types.py +1 -0
- agno/memory/manager.py +28 -25
- agno/models/anthropic/claude.py +63 -6
- agno/models/base.py +255 -36
- agno/models/response.py +69 -0
- agno/os/router.py +7 -5
- agno/os/routers/memory/memory.py +2 -1
- agno/os/routers/memory/schemas.py +5 -2
- agno/os/schema.py +26 -20
- agno/os/utils.py +9 -2
- agno/run/agent.py +28 -30
- agno/run/base.py +17 -1
- agno/run/team.py +28 -29
- agno/run/workflow.py +32 -17
- agno/session/agent.py +3 -0
- agno/session/summary.py +4 -1
- agno/session/team.py +1 -1
- agno/team/team.py +620 -374
- agno/tools/dalle.py +2 -4
- agno/tools/eleven_labs.py +23 -25
- agno/tools/function.py +40 -0
- agno/tools/mcp/__init__.py +10 -0
- agno/tools/mcp/mcp.py +324 -0
- agno/tools/mcp/multi_mcp.py +347 -0
- agno/tools/mcp/params.py +24 -0
- agno/tools/slack.py +18 -3
- agno/tools/tavily.py +146 -0
- agno/utils/agent.py +366 -1
- agno/utils/mcp.py +92 -2
- agno/utils/media.py +166 -1
- agno/utils/message.py +60 -0
- agno/utils/print_response/workflow.py +17 -1
- agno/utils/team.py +89 -1
- agno/workflow/step.py +0 -1
- agno/workflow/types.py +10 -15
- agno/workflow/workflow.py +86 -1
- {agno-2.2.0.dist-info → agno-2.2.2.dist-info}/METADATA +31 -25
- {agno-2.2.0.dist-info → agno-2.2.2.dist-info}/RECORD +68 -64
- agno/db/async_postgres/schemas.py +0 -139
- agno/db/async_postgres/utils.py +0 -347
- agno/tools/mcp.py +0 -679
- {agno-2.2.0.dist-info → agno-2.2.2.dist-info}/WHEEL +0 -0
- {agno-2.2.0.dist-info → agno-2.2.2.dist-info}/licenses/LICENSE +0 -0
- {agno-2.2.0.dist-info → agno-2.2.2.dist-info}/top_level.txt +0 -0
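
Note on the layout moves above: `agno/tools/mcp.py` is replaced by an `agno/tools/mcp/` package (`mcp.py`, `multi_mcp.py`, `params.py`, plus a ten-line `__init__.py`), and `async_postgres.py` moves from `agno/db/async_postgres/` into `agno/db/postgres/`. The sketch below shows what imports presumably look like after the upgrade; the exported names are assumptions inferred from the file names, not confirmed by this diff.

    # Hypothetical import smoke test after upgrading to 2.2.2. The names below
    # are assumed from the new file layout shown in this diff, not confirmed by it.
    from agno.tools.mcp import MCPTools, MultiMCPTools  # mcp is now a package, not a module

    # async_postgres.py now lives under agno.db.postgres; the old package keeps a
    # one-line __init__.py (+1 -1 above), presumably re-exporting for compatibility.
    from agno.db.postgres import AsyncPostgresDb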
agno/db/sqlite/sqlite.py
CHANGED

@@ -374,8 +374,6 @@ class SqliteDb(BaseDb):
             # Filtering
             if user_id is not None:
                 stmt = stmt.where(table.c.user_id == user_id)
-            if session_type is not None:
-                stmt = stmt.where(table.c.session_type == session_type)
 
             result = sess.execute(stmt).fetchone()
             if result is None:
agno/db/sqlite/utils.py
CHANGED

@@ -4,6 +4,8 @@ from datetime import date, datetime, timedelta, timezone
 from typing import Any, Dict, List, Optional
 from uuid import uuid4
 
+from sqlalchemy.ext.asyncio import AsyncEngine, AsyncSession
+
 from agno.db.schemas.culture import CulturalKnowledge
 from agno.db.sqlite.schemas import get_table_schema_definition
 from agno.utils.log import log_debug, log_error, log_warning

@@ -50,6 +52,7 @@ def is_table_available(session: Session, table_name: str, db_schema: Optional[st
     """
     Check if a table with the given name exists.
     Note: db_schema parameter is ignored in SQLite but kept for API compatibility.
+
     Returns:
         bool: True if the table exists, False otherwise.
     """

@@ -65,6 +68,25 @@ def is_table_available(session: Session, table_name: str, db_schema: Optional[st
         return False
 
 
+async def ais_table_available(session: AsyncSession, table_name: str, db_schema: Optional[str] = None) -> bool:
+    """
+    Check if a table with the given name exists.
+    Note: db_schema parameter is ignored in SQLite but kept for API compatibility.
+
+    Returns:
+        bool: True if the table exists, False otherwise.
+    """
+    try:
+        exists_query = text("SELECT 1 FROM sqlite_master WHERE type = 'table' AND name = :table")
+        exists = (await session.execute(exists_query, {"table": table_name})).scalar() is not None
+        if not exists:
+            log_debug(f"Table {table_name} {'exists' if exists else 'does not exist'}")
+        return exists
+    except Exception as e:
+        log_error(f"Error checking if table exists: {e}")
+        return False
+
+
 def is_valid_table(db_engine: Engine, table_name: str, table_type: str, db_schema: Optional[str] = None) -> bool:
     """
     Check if the existing table has the expected column names.

@@ -98,6 +120,47 @@ def is_valid_table(db_engine: Engine, table_name: str, table_type: str, db_schem
         return False
 
 
+async def ais_valid_table(
+    db_engine: AsyncEngine, table_name: str, table_type: str, db_schema: Optional[str] = None
+) -> bool:
+    """
+    Check if the existing table has the expected column names.
+    Note: db_schema parameter is ignored in SQLite but kept for API compatibility.
+    Args:
+        db_engine (Engine): Database engine
+        table_name (str): Name of the table to validate
+        table_type (str): Type of table to get expected schema
+        db_schema (Optional[str]): Database schema name (ignored in SQLite)
+    Returns:
+        bool: True if table has all expected columns, False otherwise
+    """
+    try:
+        expected_table_schema = get_table_schema_definition(table_type)
+        expected_columns = {col_name for col_name in expected_table_schema.keys() if not col_name.startswith("_")}
+
+        # Get existing columns from the async engine
+        async with db_engine.connect() as conn:
+            existing_columns = await conn.run_sync(_get_table_columns, table_name)
+
+        missing_columns = expected_columns - existing_columns
+        if missing_columns:
+            log_warning(f"Missing columns {missing_columns} in table {table_name}")
+            return False
+
+        return True
+
+    except Exception as e:
+        log_error(f"Error validating table schema for {table_name}: {e}")
+        return False
+
+
+def _get_table_columns(conn, table_name: str) -> set[str]:
+    """Helper function to get table columns using sync inspector."""
+    inspector = inspect(conn)
+    columns_info = inspector.get_columns(table_name)
+    return {col["name"] for col in columns_info}
+
+
 # -- Metrics util methods --
 
 

@@ -134,6 +197,39 @@ def bulk_upsert_metrics(session: Session, table: Table, metrics_records: list[di
     return results  # type: ignore
 
 
+async def abulk_upsert_metrics(session: AsyncSession, table: Table, metrics_records: list[dict]) -> list[dict]:
+    """Bulk upsert metrics into the database.
+
+    Args:
+        table (Table): The table to upsert into.
+        metrics_records (list[dict]): The metrics records to upsert.
+
+    Returns:
+        list[dict]: The upserted metrics records.
+    """
+    if not metrics_records:
+        return []
+
+    results = []
+    stmt = sqlite.insert(table)
+
+    # Columns to update in case of conflict
+    update_columns = {
+        col.name: stmt.excluded[col.name]
+        for col in table.columns
+        if col.name not in ["id", "date", "created_at", "aggregation_period"]
+    }
+
+    stmt = stmt.on_conflict_do_update(index_elements=["date", "aggregation_period"], set_=update_columns).returning(  # type: ignore
+        table
+    )
+    result = await session.execute(stmt, metrics_records)
+    results = [dict(row._mapping) for row in result.fetchall()]
+    await session.commit()
+
+    return results  # type: ignore
+
+
 def calculate_date_metrics(date_to_process: date, sessions_data: dict) -> dict:
     """Calculate metrics for the given single date.
 
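
The async helpers above mirror their sync counterparts one-for-one. A minimal usage sketch follows; the aiosqlite driver, the "agno_sessions" table name, and the "sessions" type key are assumptions — only the helper signatures come from the diff.

    # Sketch: exercising the new async helpers with SQLAlchemy's async API.
    import asyncio

    from sqlalchemy.ext.asyncio import AsyncSession, create_async_engine

    from agno.db.sqlite.utils import ais_table_available, ais_valid_table

    async def main() -> None:
        engine = create_async_engine("sqlite+aiosqlite:///agno.db")  # assumed driver
        async with AsyncSession(engine) as session:
            exists = await ais_table_available(session, "agno_sessions")  # assumed table name
        # ais_valid_table takes the engine itself: it opens a connection and runs
        # the sync inspector through conn.run_sync(), as shown above.
        valid = await ais_valid_table(engine, "agno_sessions", table_type="sessions")  # assumed type key
        print(exists, valid)

    asyncio.run(main())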
agno/db/surrealdb/surrealdb.py
CHANGED

@@ -238,12 +238,6 @@ class SurrealDb(BaseDb):
         where = WhereClause()
         if user_id is not None:
             where = where.and_("user_id", user_id)
-        if session_type == SessionType.AGENT:
-            where = where.and_("agent", None, "!=")
-        elif session_type == SessionType.TEAM:
-            where = where.and_("team", None, "!=")
-        elif session_type == SessionType.WORKFLOW:
-            where = where.and_("workflow", None, "!=")
         where_clause, where_vars = where.build()
         query = dedent(f"""
             SELECT *
agno/knowledge/knowledge.py
CHANGED

@@ -501,7 +501,7 @@ class Knowledge:
         await self._add_to_contents_db(content)
         if self._should_skip(content.content_hash, skip_if_exists):  # type: ignore[arg-type]
             content.status = ContentStatus.COMPLETED
-            self.
+            await self._aupdate_content(content)
             return
 
         if self.vector_db.__class__.__name__ == "LightRag":

@@ -547,6 +547,8 @@ class Knowledge:
             reader = self.pdf_reader
         elif file_extension == ".docx":
             reader = self.docx_reader
+        elif file_extension == ".pptx":
+            reader = self.pptx_reader
         elif file_extension == ".json":
            reader = self.json_reader
         elif file_extension == ".markdown":

@@ -723,7 +725,7 @@ class Knowledge:
         await self._add_to_contents_db(content)
         if self._should_skip(content.content_hash, skip_if_exists):
             content.status = ContentStatus.COMPLETED
-            self.
+            await self._aupdate_content(content)
            return
 
         if self.vector_db.__class__.__name__ == "LightRag":

@@ -739,7 +741,7 @@ class Knowledge:
                log_error(f"No reader available for topic: {topic}")
                content.status = ContentStatus.FAILED
                content.status_message = "No reader available for topic"
-               self.
+               await self._aupdate_content(content)
                continue
 
            read_documents = content.reader.read(topic)

@@ -835,6 +837,8 @@ class Knowledge:
             reader = self.csv_reader
         elif s3_object.uri.endswith(".docx"):
             reader = self.docx_reader
+        elif s3_object.uri.endswith(".pptx"):
+            reader = self.pptx_reader
         elif s3_object.uri.endswith(".json"):
             reader = self.json_reader
         elif s3_object.uri.endswith(".markdown"):

@@ -917,6 +921,8 @@ class Knowledge:
             reader = self.csv_reader
         elif gcs_object.name.endswith(".docx"):
             reader = self.docx_reader
+        elif gcs_object.name.endswith(".pptx"):
+            reader = self.pptx_reader
         elif gcs_object.name.endswith(".json"):
             reader = self.json_reader
         elif gcs_object.name.endswith(".markdown"):

@@ -1893,6 +1899,11 @@ class Knowledge:
         """Docx reader - lazy loaded via factory."""
         return self._get_reader("docx")
 
+    @property
+    def pptx_reader(self) -> Optional[Reader]:
+        """PPTX reader - lazy loaded via factory."""
+        return self._get_reader("pptx")
+
     @property
     def json_reader(self) -> Optional[Reader]:
         """JSON reader - lazy loaded via factory."""
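
Together with the extension branches above, the new pptx_reader property makes .pptx a first-class content type in Knowledge. A sketch of the property in isolation — constructing Knowledge with no arguments is an assumption; the property itself and its Optional[Reader] return type come from the diff:

    # Sketch: pptx_reader lazy-loads the reader via self._get_reader("pptx").
    from pathlib import Path

    from agno.knowledge.knowledge import Knowledge

    knowledge = Knowledge()  # assumed: constructor args are all optional
    reader = knowledge.pptx_reader
    if reader is not None:  # the property returns Optional[Reader]
        documents = reader.read(Path("quarterly_review.pptx"))  # hypothetical file
        print(f"Extracted {len(documents)} document chunks")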
agno/knowledge/reader/pptx_reader.py
ADDED

@@ -0,0 +1,101 @@
+import asyncio
+from pathlib import Path
+from typing import IO, Any, List, Optional, Union
+from uuid import uuid4
+
+from agno.knowledge.chunking.document import DocumentChunking
+from agno.knowledge.chunking.strategy import ChunkingStrategy, ChunkingStrategyType
+from agno.knowledge.document.base import Document
+from agno.knowledge.reader.base import Reader
+from agno.knowledge.types import ContentType
+from agno.utils.log import log_info, logger
+
+try:
+    from pptx import Presentation  # type: ignore
+except ImportError:
+    raise ImportError("The `python-pptx` package is not installed. Please install it via `pip install python-pptx`.")
+
+
+class PPTXReader(Reader):
+    """Reader for PPTX files"""
+
+    def __init__(self, chunking_strategy: Optional[ChunkingStrategy] = DocumentChunking(), **kwargs):
+        super().__init__(chunking_strategy=chunking_strategy, **kwargs)
+
+    @classmethod
+    def get_supported_chunking_strategies(self) -> List[ChunkingStrategyType]:
+        """Get the list of supported chunking strategies for PPTX readers."""
+        return [
+            ChunkingStrategyType.DOCUMENT_CHUNKER,
+            ChunkingStrategyType.FIXED_SIZE_CHUNKER,
+            ChunkingStrategyType.SEMANTIC_CHUNKER,
+            ChunkingStrategyType.AGENTIC_CHUNKER,
+            ChunkingStrategyType.RECURSIVE_CHUNKER,
+        ]
+
+    @classmethod
+    def get_supported_content_types(self) -> List[ContentType]:
+        return [ContentType.PPTX]
+
+    def read(self, file: Union[Path, IO[Any]], name: Optional[str] = None) -> List[Document]:
+        """Read a pptx file and return a list of documents"""
+        try:
+            if isinstance(file, Path):
+                if not file.exists():
+                    raise FileNotFoundError(f"Could not find file: {file}")
+                log_info(f"Reading: {file}")
+                presentation = Presentation(str(file))
+                doc_name = name or file.stem
+            else:
+                log_info(f"Reading uploaded file: {getattr(file, 'name', 'pptx_file')}")
+                presentation = Presentation(file)
+                doc_name = name or (
+                    getattr(file, "name", "pptx_file").split(".")[0] if hasattr(file, "name") else "pptx_file"
+                )
+
+            # Extract text from all slides
+            slide_texts = []
+            for slide_number, slide in enumerate(presentation.slides, 1):
+                slide_text = f"Slide {slide_number}:\n"
+
+                # Extract text from shapes that contain text
+                text_content = []
+                for shape in slide.shapes:
+                    if hasattr(shape, "text") and shape.text.strip():
+                        text_content.append(shape.text.strip())
+
+                if text_content:
+                    slide_text += "\n".join(text_content)
+                else:
+                    slide_text += "(No text content)"
+
+                slide_texts.append(slide_text)
+
+            doc_content = "\n\n".join(slide_texts)
+
+            documents = [
+                Document(
+                    name=doc_name,
+                    id=str(uuid4()),
+                    content=doc_content,
+                )
+            ]
+
+            if self.chunk:
+                chunked_documents = []
+                for document in documents:
+                    chunked_documents.extend(self.chunk_document(document))
+                return chunked_documents
+            return documents
+
+        except Exception as e:
+            logger.error(f"Error reading file: {e}")
+            return []
+
+    async def async_read(self, file: Union[Path, IO[Any]], name: Optional[str] = None) -> List[Document]:
+        """Asynchronously read a pptx file and return a list of documents"""
+        try:
+            return await asyncio.to_thread(self.read, file, name)
+        except Exception as e:
+            logger.error(f"Error reading file asynchronously: {e}")
+            return []
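
Standalone use of the new reader is grounded entirely in the file above: read accepts a Path or file-like object, and async_read offloads the same work to a thread. Only the file path in this sketch is hypothetical.

    # Sketch: direct PPTXReader usage. Output is one "Slide N:" section per slide,
    # chunked with DocumentChunking by default (per __init__ above).
    import asyncio
    from pathlib import Path

    from agno.knowledge.reader.pptx_reader import PPTXReader

    reader = PPTXReader()
    docs = reader.read(Path("deck.pptx"))  # hypothetical file

    # async_read wraps read() in asyncio.to_thread
    docs = asyncio.run(reader.async_read(Path("deck.pptx")))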
agno/knowledge/reader/reader_factory.py
CHANGED

@@ -58,6 +58,18 @@ class ReaderFactory:
         config.update(kwargs)
         return DocxReader(**config)
 
+    @classmethod
+    def _get_pptx_reader(cls, **kwargs) -> Reader:
+        """Get PPTX reader instance."""
+        from agno.knowledge.reader.pptx_reader import PPTXReader
+
+        config: Dict[str, Any] = {
+            "name": "PPTX Reader",
+            "description": "Extracts text content from Microsoft PowerPoint presentations (.pptx format)",
+        }
+        config.update(kwargs)
+        return PPTXReader(**config)
+
     @classmethod
     def _get_json_reader(cls, **kwargs) -> Reader:
         """Get JSON reader instance."""

@@ -120,6 +132,21 @@ class ReaderFactory:
         config.update(kwargs)
         return FirecrawlReader(**config)
 
+    @classmethod
+    def _get_tavily_reader(cls, **kwargs) -> Reader:
+        """Get Tavily reader instance."""
+        from agno.knowledge.reader.tavily_reader import TavilyReader
+
+        config: Dict[str, Any] = {
+            "api_key": kwargs.get("api_key") or os.getenv("TAVILY_API_KEY"),
+            "extract_format": "markdown",
+            "extract_depth": "basic",
+            "name": "Tavily Reader",
+            "description": "Extracts content from URLs using Tavily's Extract API with markdown or text output",
+        }
+        config.update(kwargs)
+        return TavilyReader(**config)
+
     @classmethod
     def _get_youtube_reader(cls, **kwargs) -> Reader:
         """Get YouTube reader instance."""

@@ -202,6 +229,8 @@ class ReaderFactory:
             return cls.create_reader("csv")
         elif extension in [".docx", ".doc", "application/vnd.openxmlformats-officedocument.wordprocessingml.document"]:
             return cls.create_reader("docx")
+        elif extension == ".pptx":
+            return cls.create_reader("pptx")
         elif extension == ".json":
             return cls.create_reader("json")
         elif extension in [".md", ".markdown"]:

@@ -242,6 +271,7 @@ class ReaderFactory:
         url_reader_priority = [
             "website",
             "firecrawl",
+            "tavily",
             "youtube",
         ]
 
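
The factory hunks above register the new readers under the "pptx" and "tavily" keys (via _get_pptx_reader / _get_tavily_reader) and slot "tavily" into the URL reader priority list between "firecrawl" and "youtube". A sketch of the resulting dispatch; create_reader taking a string key is inferred from the cls.create_reader("pptx") call in the diff:

    # Sketch: key-based dispatch through the factory, inferred from the hunks above.
    from agno.knowledge.reader.reader_factory import ReaderFactory

    pptx_reader = ReaderFactory.create_reader("pptx")      # -> PPTXReader
    tavily_reader = ReaderFactory.create_reader("tavily")  # -> TavilyReader; reads TAVILY_API_KEY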
agno/knowledge/reader/tavily_reader.py
ADDED

@@ -0,0 +1,194 @@
+import asyncio
+from dataclasses import dataclass
+from typing import Dict, List, Literal, Optional
+
+from agno.knowledge.chunking.semantic import SemanticChunking
+from agno.knowledge.chunking.strategy import ChunkingStrategy, ChunkingStrategyType
+from agno.knowledge.document.base import Document
+from agno.knowledge.reader.base import Reader
+from agno.knowledge.types import ContentType
+from agno.utils.log import log_debug, logger
+
+try:
+    from tavily import TavilyClient  # type: ignore[attr-defined]
+except ImportError:
+    raise ImportError(
+        "The `tavily-python` package is not installed. Please install it via `pip install tavily-python`."
+    )
+
+
+@dataclass
+class TavilyReader(Reader):
+    api_key: Optional[str] = None
+    params: Optional[Dict] = None
+    extract_format: Literal["markdown", "text"] = "markdown"
+    extract_depth: Literal["basic", "advanced"] = "basic"
+
+    def __init__(
+        self,
+        api_key: Optional[str] = None,
+        params: Optional[Dict] = None,
+        extract_format: Literal["markdown", "text"] = "markdown",
+        extract_depth: Literal["basic", "advanced"] = "basic",
+        chunk: bool = True,
+        chunk_size: int = 5000,
+        chunking_strategy: Optional[ChunkingStrategy] = SemanticChunking(),
+        name: Optional[str] = None,
+        description: Optional[str] = None,
+    ) -> None:
+        """
+        Initialize TavilyReader for extracting content from URLs using Tavily's Extract API.
+
+        Args:
+            api_key: Tavily API key (or use TAVILY_API_KEY env var)
+            params: Additional parameters to pass to the extract API
+            extract_format: Output format - "markdown" or "text"
+            extract_depth: Extraction depth - "basic" (1 credit/5 URLs) or "advanced" (2 credits/5 URLs)
+            chunk: Whether to chunk the extracted content
+            chunk_size: Size of chunks when chunking is enabled
+            chunking_strategy: Strategy to use for chunking
+            name: Name of the reader
+            description: Description of the reader
+        """
+        # Initialize base Reader (handles chunk_size / strategy)
+        super().__init__(
+            chunk=chunk, chunk_size=chunk_size, chunking_strategy=chunking_strategy, name=name, description=description
+        )
+
+        # Tavily-specific attributes
+        self.api_key = api_key
+        self.params = params or {}
+        self.extract_format = extract_format
+        self.extract_depth = extract_depth
+
+    @classmethod
+    def get_supported_chunking_strategies(self) -> List[ChunkingStrategyType]:
+        """Get the list of supported chunking strategies for Tavily readers."""
+        return [
+            ChunkingStrategyType.SEMANTIC_CHUNKER,
+            ChunkingStrategyType.FIXED_SIZE_CHUNKER,
+            ChunkingStrategyType.AGENTIC_CHUNKER,
+            ChunkingStrategyType.DOCUMENT_CHUNKER,
+            ChunkingStrategyType.RECURSIVE_CHUNKER,
+        ]
+
+    @classmethod
+    def get_supported_content_types(self) -> List[ContentType]:
+        return [ContentType.URL]
+
+    def _extract(self, url: str, name: Optional[str] = None) -> List[Document]:
+        """
+        Internal method to extract content from a URL using Tavily's Extract API.
+
+        Args:
+            url: The URL to extract content from
+            name: Optional name for the document (defaults to URL)
+
+        Returns:
+            A list of documents containing the extracted content
+        """
+        log_debug(f"Extracting content from: {url}")
+
+        client = TavilyClient(api_key=self.api_key)
+
+        # Prepare extract parameters
+        extract_params = {
+            "urls": [url],
+            "depth": self.extract_depth,
+        }
+
+        # Add optional params if provided
+        if self.params:
+            extract_params.update(self.params)
+
+        try:
+            # Call Tavily Extract API
+            response = client.extract(**extract_params)
+
+            # Extract content from response
+            if not response or "results" not in response:
+                logger.warning(f"No results received for URL: {url}")
+                return [Document(name=name or url, id=url, content="")]
+
+            results = response.get("results", [])
+            if not results:
+                logger.warning(f"Empty results for URL: {url}")
+                return [Document(name=name or url, id=url, content="")]
+
+            # Get the first result (since we're extracting a single URL)
+            result = results[0]
+
+            # Check if extraction failed
+            if "failed_reason" in result:
+                logger.warning(f"Extraction failed for {url}: {result['failed_reason']}")
+                return [Document(name=name or url, id=url, content="")]
+
+            # Get raw content
+            content = result.get("raw_content", "")
+
+            if content is None:
+                content = ""
+                logger.warning(f"No content received for URL: {url}")
+
+            # Debug logging
+            log_debug(f"Received content type: {type(content)}")
+            log_debug(f"Content length: {len(content) if content else 0}")
+
+            # Create documents
+            documents = []
+            if self.chunk and content:
+                documents.extend(self.chunk_document(Document(name=name or url, id=url, content=content)))
+            else:
+                documents.append(Document(name=name or url, id=url, content=content))
+
+            return documents
+
+        except Exception as e:
+            logger.error(f"Error extracting content from {url}: {e}")
+            return [Document(name=name or url, id=url, content="")]
+
+    async def _async_extract(self, url: str, name: Optional[str] = None) -> List[Document]:
+        """
+        Internal async method to extract content from a URL.
+
+        Args:
+            url: The URL to extract content from
+            name: Optional name for the document
+
+        Returns:
+            A list of documents containing the extracted content
+        """
+        log_debug(f"Async extracting content from: {url}")
+
+        # Use asyncio.to_thread to run the synchronous extract in a thread
+        return await asyncio.to_thread(self._extract, url, name)
+
+    def read(self, url: str, name: Optional[str] = None) -> List[Document]:
+        """
+        Reads content from a URL using Tavily Extract API.
+
+        This is the public API method that users should call.
+
+        Args:
+            url: The URL to extract content from
+            name: Optional name for the document
+
+        Returns:
+            A list of documents containing the extracted content
+        """
+        return self._extract(url, name)
+
+    async def async_read(self, url: str, name: Optional[str] = None) -> List[Document]:
+        """
+        Asynchronously reads content from a URL using Tavily Extract API.
+
+        This is the public API method that users should call for async operations.
+
+        Args:
+            url: The URL to extract content from
+            name: Optional name for the document
+
+        Returns:
+            A list of documents containing the extracted content
+        """
+        return await self._async_extract(url, name)
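
Usage mirrors the other URL readers: read wraps _extract, and failures come back as a single empty Document rather than an exception. A sketch with placeholder key and URL:

    # Sketch: extracting a single URL with TavilyReader. API key and URL are
    # placeholders; extract_depth="advanced" costs 2 credits per 5 URLs per the
    # docstring above.
    import asyncio

    from agno.knowledge.reader.tavily_reader import TavilyReader

    reader = TavilyReader(api_key="tvly-...", extract_depth="basic")
    docs = reader.read("https://example.com", name="example")
    print(docs[0].content[:200] if docs else "no content")

    # Async variant offloads the same sync extraction to a thread
    docs = asyncio.run(reader.async_read("https://example.com"))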