agno 2.0.1__py3-none-any.whl → 2.3.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- agno/agent/agent.py +6015 -2823
- agno/api/api.py +2 -0
- agno/api/os.py +1 -1
- agno/culture/__init__.py +3 -0
- agno/culture/manager.py +956 -0
- agno/db/async_postgres/__init__.py +3 -0
- agno/db/base.py +385 -6
- agno/db/dynamo/dynamo.py +388 -81
- agno/db/dynamo/schemas.py +47 -10
- agno/db/dynamo/utils.py +63 -4
- agno/db/firestore/firestore.py +435 -64
- agno/db/firestore/schemas.py +11 -0
- agno/db/firestore/utils.py +102 -4
- agno/db/gcs_json/gcs_json_db.py +384 -42
- agno/db/gcs_json/utils.py +60 -26
- agno/db/in_memory/in_memory_db.py +351 -66
- agno/db/in_memory/utils.py +60 -2
- agno/db/json/json_db.py +339 -48
- agno/db/json/utils.py +60 -26
- agno/db/migrations/manager.py +199 -0
- agno/db/migrations/v1_to_v2.py +510 -37
- agno/db/migrations/versions/__init__.py +0 -0
- agno/db/migrations/versions/v2_3_0.py +938 -0
- agno/db/mongo/__init__.py +15 -1
- agno/db/mongo/async_mongo.py +2036 -0
- agno/db/mongo/mongo.py +653 -76
- agno/db/mongo/schemas.py +13 -0
- agno/db/mongo/utils.py +80 -8
- agno/db/mysql/mysql.py +687 -25
- agno/db/mysql/schemas.py +61 -37
- agno/db/mysql/utils.py +60 -2
- agno/db/postgres/__init__.py +2 -1
- agno/db/postgres/async_postgres.py +2001 -0
- agno/db/postgres/postgres.py +676 -57
- agno/db/postgres/schemas.py +43 -18
- agno/db/postgres/utils.py +164 -2
- agno/db/redis/redis.py +344 -38
- agno/db/redis/schemas.py +18 -0
- agno/db/redis/utils.py +60 -2
- agno/db/schemas/__init__.py +2 -1
- agno/db/schemas/culture.py +120 -0
- agno/db/schemas/memory.py +13 -0
- agno/db/singlestore/schemas.py +26 -1
- agno/db/singlestore/singlestore.py +687 -53
- agno/db/singlestore/utils.py +60 -2
- agno/db/sqlite/__init__.py +2 -1
- agno/db/sqlite/async_sqlite.py +2371 -0
- agno/db/sqlite/schemas.py +24 -0
- agno/db/sqlite/sqlite.py +774 -85
- agno/db/sqlite/utils.py +168 -5
- agno/db/surrealdb/__init__.py +3 -0
- agno/db/surrealdb/metrics.py +292 -0
- agno/db/surrealdb/models.py +309 -0
- agno/db/surrealdb/queries.py +71 -0
- agno/db/surrealdb/surrealdb.py +1361 -0
- agno/db/surrealdb/utils.py +147 -0
- agno/db/utils.py +50 -22
- agno/eval/accuracy.py +50 -43
- agno/eval/performance.py +6 -3
- agno/eval/reliability.py +6 -3
- agno/eval/utils.py +33 -16
- agno/exceptions.py +68 -1
- agno/filters.py +354 -0
- agno/guardrails/__init__.py +6 -0
- agno/guardrails/base.py +19 -0
- agno/guardrails/openai.py +144 -0
- agno/guardrails/pii.py +94 -0
- agno/guardrails/prompt_injection.py +52 -0
- agno/integrations/discord/client.py +1 -0
- agno/knowledge/chunking/agentic.py +13 -10
- agno/knowledge/chunking/fixed.py +1 -1
- agno/knowledge/chunking/semantic.py +40 -8
- agno/knowledge/chunking/strategy.py +59 -15
- agno/knowledge/embedder/aws_bedrock.py +9 -4
- agno/knowledge/embedder/azure_openai.py +54 -0
- agno/knowledge/embedder/base.py +2 -0
- agno/knowledge/embedder/cohere.py +184 -5
- agno/knowledge/embedder/fastembed.py +1 -1
- agno/knowledge/embedder/google.py +79 -1
- agno/knowledge/embedder/huggingface.py +9 -4
- agno/knowledge/embedder/jina.py +63 -0
- agno/knowledge/embedder/mistral.py +78 -11
- agno/knowledge/embedder/nebius.py +1 -1
- agno/knowledge/embedder/ollama.py +13 -0
- agno/knowledge/embedder/openai.py +37 -65
- agno/knowledge/embedder/sentence_transformer.py +8 -4
- agno/knowledge/embedder/vllm.py +262 -0
- agno/knowledge/embedder/voyageai.py +69 -16
- agno/knowledge/knowledge.py +594 -186
- agno/knowledge/reader/base.py +9 -2
- agno/knowledge/reader/csv_reader.py +8 -10
- agno/knowledge/reader/docx_reader.py +5 -6
- agno/knowledge/reader/field_labeled_csv_reader.py +290 -0
- agno/knowledge/reader/json_reader.py +6 -5
- agno/knowledge/reader/markdown_reader.py +13 -13
- agno/knowledge/reader/pdf_reader.py +43 -68
- agno/knowledge/reader/pptx_reader.py +101 -0
- agno/knowledge/reader/reader_factory.py +51 -6
- agno/knowledge/reader/s3_reader.py +3 -15
- agno/knowledge/reader/tavily_reader.py +194 -0
- agno/knowledge/reader/text_reader.py +13 -13
- agno/knowledge/reader/web_search_reader.py +2 -43
- agno/knowledge/reader/website_reader.py +43 -25
- agno/knowledge/reranker/__init__.py +2 -8
- agno/knowledge/types.py +9 -0
- agno/knowledge/utils.py +20 -0
- agno/media.py +72 -0
- agno/memory/manager.py +336 -82
- agno/models/aimlapi/aimlapi.py +2 -2
- agno/models/anthropic/claude.py +183 -37
- agno/models/aws/bedrock.py +52 -112
- agno/models/aws/claude.py +33 -1
- agno/models/azure/ai_foundry.py +33 -15
- agno/models/azure/openai_chat.py +25 -8
- agno/models/base.py +999 -519
- agno/models/cerebras/cerebras.py +19 -13
- agno/models/cerebras/cerebras_openai.py +8 -5
- agno/models/cohere/chat.py +27 -1
- agno/models/cometapi/__init__.py +5 -0
- agno/models/cometapi/cometapi.py +57 -0
- agno/models/dashscope/dashscope.py +1 -0
- agno/models/deepinfra/deepinfra.py +2 -2
- agno/models/deepseek/deepseek.py +2 -2
- agno/models/fireworks/fireworks.py +2 -2
- agno/models/google/gemini.py +103 -31
- agno/models/groq/groq.py +28 -11
- agno/models/huggingface/huggingface.py +2 -1
- agno/models/internlm/internlm.py +2 -2
- agno/models/langdb/langdb.py +4 -4
- agno/models/litellm/chat.py +18 -1
- agno/models/litellm/litellm_openai.py +2 -2
- agno/models/llama_cpp/__init__.py +5 -0
- agno/models/llama_cpp/llama_cpp.py +22 -0
- agno/models/message.py +139 -0
- agno/models/meta/llama.py +27 -10
- agno/models/meta/llama_openai.py +5 -17
- agno/models/nebius/nebius.py +6 -6
- agno/models/nexus/__init__.py +3 -0
- agno/models/nexus/nexus.py +22 -0
- agno/models/nvidia/nvidia.py +2 -2
- agno/models/ollama/chat.py +59 -5
- agno/models/openai/chat.py +69 -29
- agno/models/openai/responses.py +103 -106
- agno/models/openrouter/openrouter.py +41 -3
- agno/models/perplexity/perplexity.py +4 -5
- agno/models/portkey/portkey.py +3 -3
- agno/models/requesty/__init__.py +5 -0
- agno/models/requesty/requesty.py +52 -0
- agno/models/response.py +77 -1
- agno/models/sambanova/sambanova.py +2 -2
- agno/models/siliconflow/__init__.py +5 -0
- agno/models/siliconflow/siliconflow.py +25 -0
- agno/models/together/together.py +2 -2
- agno/models/utils.py +254 -8
- agno/models/vercel/v0.py +2 -2
- agno/models/vertexai/__init__.py +0 -0
- agno/models/vertexai/claude.py +96 -0
- agno/models/vllm/vllm.py +1 -0
- agno/models/xai/xai.py +3 -2
- agno/os/app.py +543 -178
- agno/os/auth.py +24 -14
- agno/os/config.py +1 -0
- agno/os/interfaces/__init__.py +1 -0
- agno/os/interfaces/a2a/__init__.py +3 -0
- agno/os/interfaces/a2a/a2a.py +42 -0
- agno/os/interfaces/a2a/router.py +250 -0
- agno/os/interfaces/a2a/utils.py +924 -0
- agno/os/interfaces/agui/agui.py +23 -7
- agno/os/interfaces/agui/router.py +27 -3
- agno/os/interfaces/agui/utils.py +242 -142
- agno/os/interfaces/base.py +6 -2
- agno/os/interfaces/slack/router.py +81 -23
- agno/os/interfaces/slack/slack.py +29 -14
- agno/os/interfaces/whatsapp/router.py +11 -4
- agno/os/interfaces/whatsapp/whatsapp.py +14 -7
- agno/os/mcp.py +111 -54
- agno/os/middleware/__init__.py +7 -0
- agno/os/middleware/jwt.py +233 -0
- agno/os/router.py +556 -139
- agno/os/routers/evals/evals.py +71 -34
- agno/os/routers/evals/schemas.py +31 -31
- agno/os/routers/evals/utils.py +6 -5
- agno/os/routers/health.py +31 -0
- agno/os/routers/home.py +52 -0
- agno/os/routers/knowledge/knowledge.py +185 -38
- agno/os/routers/knowledge/schemas.py +82 -22
- agno/os/routers/memory/memory.py +158 -53
- agno/os/routers/memory/schemas.py +20 -16
- agno/os/routers/metrics/metrics.py +20 -8
- agno/os/routers/metrics/schemas.py +16 -16
- agno/os/routers/session/session.py +499 -38
- agno/os/schema.py +308 -198
- agno/os/utils.py +401 -41
- agno/reasoning/anthropic.py +80 -0
- agno/reasoning/azure_ai_foundry.py +2 -2
- agno/reasoning/deepseek.py +2 -2
- agno/reasoning/default.py +3 -1
- agno/reasoning/gemini.py +73 -0
- agno/reasoning/groq.py +2 -2
- agno/reasoning/ollama.py +2 -2
- agno/reasoning/openai.py +7 -2
- agno/reasoning/vertexai.py +76 -0
- agno/run/__init__.py +6 -0
- agno/run/agent.py +248 -94
- agno/run/base.py +44 -5
- agno/run/team.py +238 -97
- agno/run/workflow.py +144 -33
- agno/session/agent.py +105 -89
- agno/session/summary.py +65 -25
- agno/session/team.py +176 -96
- agno/session/workflow.py +406 -40
- agno/team/team.py +3854 -1610
- agno/tools/dalle.py +2 -4
- agno/tools/decorator.py +4 -2
- agno/tools/duckduckgo.py +15 -11
- agno/tools/e2b.py +14 -7
- agno/tools/eleven_labs.py +23 -25
- agno/tools/exa.py +21 -16
- agno/tools/file.py +153 -23
- agno/tools/file_generation.py +350 -0
- agno/tools/firecrawl.py +4 -4
- agno/tools/function.py +250 -30
- agno/tools/gmail.py +238 -14
- agno/tools/google_drive.py +270 -0
- agno/tools/googlecalendar.py +36 -8
- agno/tools/googlesheets.py +20 -5
- agno/tools/jira.py +20 -0
- agno/tools/knowledge.py +3 -3
- agno/tools/mcp/__init__.py +10 -0
- agno/tools/mcp/mcp.py +331 -0
- agno/tools/mcp/multi_mcp.py +347 -0
- agno/tools/mcp/params.py +24 -0
- agno/tools/mcp_toolbox.py +284 -0
- agno/tools/mem0.py +11 -17
- agno/tools/memori.py +1 -53
- agno/tools/memory.py +419 -0
- agno/tools/models/nebius.py +5 -5
- agno/tools/models_labs.py +20 -10
- agno/tools/notion.py +204 -0
- agno/tools/parallel.py +314 -0
- agno/tools/scrapegraph.py +58 -31
- agno/tools/searxng.py +2 -2
- agno/tools/serper.py +2 -2
- agno/tools/slack.py +18 -3
- agno/tools/spider.py +2 -2
- agno/tools/tavily.py +146 -0
- agno/tools/whatsapp.py +1 -1
- agno/tools/workflow.py +278 -0
- agno/tools/yfinance.py +12 -11
- agno/utils/agent.py +820 -0
- agno/utils/audio.py +27 -0
- agno/utils/common.py +90 -1
- agno/utils/events.py +217 -2
- agno/utils/gemini.py +180 -22
- agno/utils/hooks.py +57 -0
- agno/utils/http.py +111 -0
- agno/utils/knowledge.py +12 -5
- agno/utils/log.py +1 -0
- agno/utils/mcp.py +92 -2
- agno/utils/media.py +188 -10
- agno/utils/merge_dict.py +22 -1
- agno/utils/message.py +60 -0
- agno/utils/models/claude.py +40 -11
- agno/utils/print_response/agent.py +105 -21
- agno/utils/print_response/team.py +103 -38
- agno/utils/print_response/workflow.py +251 -34
- agno/utils/reasoning.py +22 -1
- agno/utils/serialize.py +32 -0
- agno/utils/streamlit.py +16 -10
- agno/utils/string.py +41 -0
- agno/utils/team.py +98 -9
- agno/utils/tools.py +1 -1
- agno/vectordb/base.py +23 -4
- agno/vectordb/cassandra/cassandra.py +65 -9
- agno/vectordb/chroma/chromadb.py +182 -38
- agno/vectordb/clickhouse/clickhousedb.py +64 -11
- agno/vectordb/couchbase/couchbase.py +105 -10
- agno/vectordb/lancedb/lance_db.py +124 -133
- agno/vectordb/langchaindb/langchaindb.py +25 -7
- agno/vectordb/lightrag/lightrag.py +17 -3
- agno/vectordb/llamaindex/__init__.py +3 -0
- agno/vectordb/llamaindex/llamaindexdb.py +46 -7
- agno/vectordb/milvus/milvus.py +126 -9
- agno/vectordb/mongodb/__init__.py +7 -1
- agno/vectordb/mongodb/mongodb.py +112 -7
- agno/vectordb/pgvector/pgvector.py +142 -21
- agno/vectordb/pineconedb/pineconedb.py +80 -8
- agno/vectordb/qdrant/qdrant.py +125 -39
- agno/vectordb/redis/__init__.py +9 -0
- agno/vectordb/redis/redisdb.py +694 -0
- agno/vectordb/singlestore/singlestore.py +111 -25
- agno/vectordb/surrealdb/surrealdb.py +31 -5
- agno/vectordb/upstashdb/upstashdb.py +76 -8
- agno/vectordb/weaviate/weaviate.py +86 -15
- agno/workflow/__init__.py +2 -0
- agno/workflow/agent.py +299 -0
- agno/workflow/condition.py +112 -18
- agno/workflow/loop.py +69 -10
- agno/workflow/parallel.py +266 -118
- agno/workflow/router.py +110 -17
- agno/workflow/step.py +638 -129
- agno/workflow/steps.py +65 -6
- agno/workflow/types.py +61 -23
- agno/workflow/workflow.py +2085 -272
- {agno-2.0.1.dist-info → agno-2.3.0.dist-info}/METADATA +182 -58
- agno-2.3.0.dist-info/RECORD +577 -0
- agno/knowledge/reader/url_reader.py +0 -128
- agno/tools/googlesearch.py +0 -98
- agno/tools/mcp.py +0 -610
- agno/utils/models/aws_claude.py +0 -170
- agno-2.0.1.dist-info/RECORD +0 -515
- {agno-2.0.1.dist-info → agno-2.3.0.dist-info}/WHEEL +0 -0
- {agno-2.0.1.dist-info → agno-2.3.0.dist-info}/licenses/LICENSE +0 -0
- {agno-2.0.1.dist-info → agno-2.3.0.dist-info}/top_level.txt +0 -0
|
@@ -4,11 +4,12 @@ from pathlib import Path
|
|
|
4
4
|
from typing import IO, Any, List, Optional, Tuple, Union
|
|
5
5
|
from uuid import uuid4
|
|
6
6
|
|
|
7
|
-
from agno.knowledge.chunking.
|
|
7
|
+
from agno.knowledge.chunking.document import DocumentChunking
|
|
8
|
+
from agno.knowledge.chunking.strategy import ChunkingStrategy, ChunkingStrategyType
|
|
8
9
|
from agno.knowledge.document.base import Document
|
|
9
10
|
from agno.knowledge.reader.base import Reader
|
|
10
11
|
from agno.knowledge.types import ContentType
|
|
11
|
-
from agno.utils.log import
|
|
12
|
+
from agno.utils.log import log_debug, log_error
|
|
12
13
|
|
|
13
14
|
try:
|
|
14
15
|
from pypdf import PdfReader as DocumentReader # noqa: F401
|
|
@@ -117,6 +118,10 @@ def _clean_page_numbers(
|
|
|
117
118
|
page_numbers = [find_page_number(content) for content in page_content_list]
|
|
118
119
|
if all(x is None or x > 5 for x in page_numbers):
|
|
119
120
|
# This approach won't work reliably for higher page numbers.
|
|
121
|
+
page_content_list = [
|
|
122
|
+
f"\n{page_content_list[i]}\n{extra_content[i]}" if extra_content else page_content_list[i]
|
|
123
|
+
for i in range(len(page_content_list))
|
|
124
|
+
]
|
|
120
125
|
return page_content_list, None
|
|
121
126
|
|
|
122
127
|
# Possible range shifts to detect page numbering
|
|
@@ -179,6 +184,7 @@ class BasePDFReader(Reader):
|
|
|
179
184
|
page_start_numbering_format: Optional[str] = None,
|
|
180
185
|
page_end_numbering_format: Optional[str] = None,
|
|
181
186
|
password: Optional[str] = None,
|
|
187
|
+
chunking_strategy: Optional[ChunkingStrategy] = DocumentChunking(chunk_size=5000),
|
|
182
188
|
**kwargs,
|
|
183
189
|
):
|
|
184
190
|
if page_start_numbering_format is None:
|
|
@@ -191,11 +197,7 @@ class BasePDFReader(Reader):
|
|
|
191
197
|
self.page_end_numbering_format = page_end_numbering_format
|
|
192
198
|
self.password = password
|
|
193
199
|
|
|
194
|
-
|
|
195
|
-
from agno.knowledge.chunking.document import DocumentChunking
|
|
196
|
-
|
|
197
|
-
self.chunking_strategy = DocumentChunking(chunk_size=5000)
|
|
198
|
-
super().__init__(**kwargs)
|
|
200
|
+
super().__init__(chunking_strategy=chunking_strategy, **kwargs)
|
|
199
201
|
|
|
200
202
|
@classmethod
|
|
201
203
|
def get_supported_chunking_strategies(self) -> List[ChunkingStrategyType]:
|
|
@@ -214,6 +216,19 @@ class BasePDFReader(Reader):
|
|
|
214
216
|
chunked_documents.extend(self.chunk_document(document))
|
|
215
217
|
return chunked_documents
|
|
216
218
|
|
|
219
|
+
def _get_doc_name(self, pdf_source: Union[str, Path, IO[Any]], name: Optional[str] = None) -> str:
|
|
220
|
+
"""Determines the document name from the source or a provided name."""
|
|
221
|
+
try:
|
|
222
|
+
if name:
|
|
223
|
+
return name
|
|
224
|
+
if isinstance(pdf_source, str):
|
|
225
|
+
return pdf_source.split("/")[-1].split(".")[0].replace(" ", "_")
|
|
226
|
+
# Assumes a file-like object with a .name attribute
|
|
227
|
+
return pdf_source.name.split(".")[0]
|
|
228
|
+
except Exception:
|
|
229
|
+
# The original code had a bug here, it should check `name` first.
|
|
230
|
+
return name or "pdf"
|
|
231
|
+
|
|
217
232
|
def _decrypt_pdf(self, doc_reader: DocumentReader, doc_name: str, password: Optional[str] = None) -> bool:
|
|
218
233
|
if not doc_reader.is_encrypted:
|
|
219
234
|
return True
|
|
@@ -221,13 +236,13 @@ class BasePDFReader(Reader):
|
|
|
221
236
|
# Use provided password or fall back to instance password
|
|
222
237
|
pdf_password = password or self.password
|
|
223
238
|
if not pdf_password:
|
|
224
|
-
|
|
239
|
+
log_error(f'PDF file "{doc_name}" is password protected but no password provided')
|
|
225
240
|
return False
|
|
226
241
|
|
|
227
242
|
try:
|
|
228
243
|
decrypted_pdf = doc_reader.decrypt(pdf_password)
|
|
229
244
|
if decrypted_pdf:
|
|
230
|
-
|
|
245
|
+
log_debug(f'Successfully decrypted PDF file "{doc_name}" with user password')
|
|
231
246
|
return True
|
|
232
247
|
else:
|
|
233
248
|
log_error(f'Failed to decrypt PDF file "{doc_name}": incorrect password')
|
|
@@ -261,7 +276,6 @@ class BasePDFReader(Reader):
|
|
|
261
276
|
|
|
262
277
|
if self.chunk:
|
|
263
278
|
return self._build_chunked_documents(documents)
|
|
264
|
-
|
|
265
279
|
return documents
|
|
266
280
|
|
|
267
281
|
def _pdf_reader_to_documents(
|
|
@@ -329,40 +343,14 @@ class PDFReader(BasePDFReader):
|
|
|
329
343
|
def read(
|
|
330
344
|
self, pdf: Union[str, Path, IO[Any]], name: Optional[str] = None, password: Optional[str] = None
|
|
331
345
|
) -> List[Document]:
|
|
332
|
-
|
|
333
|
-
|
|
334
|
-
doc_name = name
|
|
335
|
-
elif isinstance(pdf, str):
|
|
336
|
-
doc_name = pdf.split("/")[-1].split(".")[0].replace(" ", "_")
|
|
337
|
-
else:
|
|
338
|
-
doc_name = pdf.name.split(".")[0]
|
|
339
|
-
except Exception:
|
|
340
|
-
doc_name = "pdf"
|
|
341
|
-
|
|
342
|
-
log_info(f"Reading: {doc_name}")
|
|
343
|
-
|
|
344
|
-
try:
|
|
345
|
-
DocumentReader(pdf)
|
|
346
|
-
except PdfStreamError as e:
|
|
347
|
-
logger.error(f"Error reading PDF: {e}")
|
|
348
|
-
return []
|
|
349
|
-
|
|
350
|
-
try:
|
|
351
|
-
if isinstance(pdf, str):
|
|
352
|
-
doc_name = name or pdf.split("/")[-1].split(".")[0].replace(" ", "_")
|
|
353
|
-
else:
|
|
354
|
-
doc_name = name or pdf.name.split(".")[0]
|
|
355
|
-
except Exception:
|
|
356
|
-
doc_name = name or "pdf"
|
|
357
|
-
|
|
358
|
-
log_info(f"Reading: {doc_name}")
|
|
346
|
+
doc_name = self._get_doc_name(pdf, name)
|
|
347
|
+
log_debug(f"Reading: {doc_name}")
|
|
359
348
|
|
|
360
349
|
try:
|
|
361
350
|
pdf_reader = DocumentReader(pdf)
|
|
362
351
|
except PdfStreamError as e:
|
|
363
|
-
|
|
352
|
+
log_error(f"Error reading PDF: {e}")
|
|
364
353
|
return []
|
|
365
|
-
|
|
366
354
|
# Handle PDF decryption
|
|
367
355
|
if not self._decrypt_pdf(pdf_reader, doc_name, password):
|
|
368
356
|
return []
|
|
@@ -379,21 +367,13 @@ class PDFReader(BasePDFReader):
|
|
|
379
367
|
if pdf is None:
|
|
380
368
|
log_error("No pdf provided")
|
|
381
369
|
return []
|
|
382
|
-
|
|
383
|
-
|
|
384
|
-
if isinstance(pdf, str):
|
|
385
|
-
doc_name = name or pdf.split("/")[-1].split(".")[0].replace(" ", "_")
|
|
386
|
-
else:
|
|
387
|
-
doc_name = pdf.name.split(".")[0]
|
|
388
|
-
except Exception:
|
|
389
|
-
doc_name = name or "pdf"
|
|
390
|
-
|
|
391
|
-
log_info(f"Reading: {doc_name}")
|
|
370
|
+
doc_name = self._get_doc_name(pdf, name)
|
|
371
|
+
log_debug(f"Reading: {doc_name}")
|
|
392
372
|
|
|
393
373
|
try:
|
|
394
374
|
pdf_reader = DocumentReader(pdf)
|
|
395
375
|
except PdfStreamError as e:
|
|
396
|
-
|
|
376
|
+
log_error(f"Error reading PDF: {e}")
|
|
397
377
|
return []
|
|
398
378
|
|
|
399
379
|
# Handle PDF decryption
|
|
@@ -413,16 +393,13 @@ class PDFImageReader(BasePDFReader):
|
|
|
413
393
|
if not pdf:
|
|
414
394
|
raise ValueError("No pdf provided")
|
|
415
395
|
|
|
396
|
+
doc_name = self._get_doc_name(pdf, name)
|
|
397
|
+
log_debug(f"Reading: {doc_name}")
|
|
416
398
|
try:
|
|
417
|
-
|
|
418
|
-
|
|
419
|
-
|
|
420
|
-
|
|
421
|
-
except Exception:
|
|
422
|
-
doc_name = "pdf"
|
|
423
|
-
|
|
424
|
-
log_info(f"Reading: {doc_name}")
|
|
425
|
-
pdf_reader = DocumentReader(pdf)
|
|
399
|
+
pdf_reader = DocumentReader(pdf)
|
|
400
|
+
except PdfStreamError as e:
|
|
401
|
+
log_error(f"Error reading PDF: {e}")
|
|
402
|
+
return []
|
|
426
403
|
|
|
427
404
|
# Handle PDF decryption
|
|
428
405
|
if not self._decrypt_pdf(pdf_reader, doc_name, password):
|
|
@@ -437,16 +414,14 @@ class PDFImageReader(BasePDFReader):
|
|
|
437
414
|
if not pdf:
|
|
438
415
|
raise ValueError("No pdf provided")
|
|
439
416
|
|
|
440
|
-
|
|
441
|
-
|
|
442
|
-
doc_name = name or pdf.split("/")[-1].split(".")[0].replace(" ", "_")
|
|
443
|
-
else:
|
|
444
|
-
doc_name = pdf.name.split(".")[0]
|
|
445
|
-
except Exception:
|
|
446
|
-
doc_name = "pdf"
|
|
417
|
+
doc_name = self._get_doc_name(pdf, name)
|
|
418
|
+
log_debug(f"Reading: {doc_name}")
|
|
447
419
|
|
|
448
|
-
|
|
449
|
-
|
|
420
|
+
try:
|
|
421
|
+
pdf_reader = DocumentReader(pdf)
|
|
422
|
+
except PdfStreamError as e:
|
|
423
|
+
log_error(f"Error reading PDF: {e}")
|
|
424
|
+
return []
|
|
450
425
|
|
|
451
426
|
# Handle PDF decryption
|
|
452
427
|
if not self._decrypt_pdf(pdf_reader, doc_name, password):
|
|
@@ -0,0 +1,101 @@
|
|
|
1
|
+
import asyncio
|
|
2
|
+
from pathlib import Path
|
|
3
|
+
from typing import IO, Any, List, Optional, Union
|
|
4
|
+
from uuid import uuid4
|
|
5
|
+
|
|
6
|
+
from agno.knowledge.chunking.document import DocumentChunking
|
|
7
|
+
from agno.knowledge.chunking.strategy import ChunkingStrategy, ChunkingStrategyType
|
|
8
|
+
from agno.knowledge.document.base import Document
|
|
9
|
+
from agno.knowledge.reader.base import Reader
|
|
10
|
+
from agno.knowledge.types import ContentType
|
|
11
|
+
from agno.utils.log import log_debug, log_error
|
|
12
|
+
|
|
13
|
+
try:
|
|
14
|
+
from pptx import Presentation # type: ignore
|
|
15
|
+
except ImportError:
|
|
16
|
+
raise ImportError("The `python-pptx` package is not installed. Please install it via `pip install python-pptx`.")
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
class PPTXReader(Reader):
|
|
20
|
+
"""Reader for PPTX files"""
|
|
21
|
+
|
|
22
|
+
def __init__(self, chunking_strategy: Optional[ChunkingStrategy] = DocumentChunking(), **kwargs):
|
|
23
|
+
super().__init__(chunking_strategy=chunking_strategy, **kwargs)
|
|
24
|
+
|
|
25
|
+
@classmethod
|
|
26
|
+
def get_supported_chunking_strategies(self) -> List[ChunkingStrategyType]:
|
|
27
|
+
"""Get the list of supported chunking strategies for PPTX readers."""
|
|
28
|
+
return [
|
|
29
|
+
ChunkingStrategyType.DOCUMENT_CHUNKER,
|
|
30
|
+
ChunkingStrategyType.FIXED_SIZE_CHUNKER,
|
|
31
|
+
ChunkingStrategyType.SEMANTIC_CHUNKER,
|
|
32
|
+
ChunkingStrategyType.AGENTIC_CHUNKER,
|
|
33
|
+
ChunkingStrategyType.RECURSIVE_CHUNKER,
|
|
34
|
+
]
|
|
35
|
+
|
|
36
|
+
@classmethod
|
|
37
|
+
def get_supported_content_types(self) -> List[ContentType]:
|
|
38
|
+
return [ContentType.PPTX]
|
|
39
|
+
|
|
40
|
+
def read(self, file: Union[Path, IO[Any]], name: Optional[str] = None) -> List[Document]:
|
|
41
|
+
"""Read a pptx file and return a list of documents"""
|
|
42
|
+
try:
|
|
43
|
+
if isinstance(file, Path):
|
|
44
|
+
if not file.exists():
|
|
45
|
+
raise FileNotFoundError(f"Could not find file: {file}")
|
|
46
|
+
log_debug(f"Reading: {file}")
|
|
47
|
+
presentation = Presentation(str(file))
|
|
48
|
+
doc_name = name or file.stem
|
|
49
|
+
else:
|
|
50
|
+
log_debug(f"Reading uploaded file: {getattr(file, 'name', 'pptx_file')}")
|
|
51
|
+
presentation = Presentation(file)
|
|
52
|
+
doc_name = name or (
|
|
53
|
+
getattr(file, "name", "pptx_file").split(".")[0] if hasattr(file, "name") else "pptx_file"
|
|
54
|
+
)
|
|
55
|
+
|
|
56
|
+
# Extract text from all slides
|
|
57
|
+
slide_texts = []
|
|
58
|
+
for slide_number, slide in enumerate(presentation.slides, 1):
|
|
59
|
+
slide_text = f"Slide {slide_number}:\n"
|
|
60
|
+
|
|
61
|
+
# Extract text from shapes that contain text
|
|
62
|
+
text_content = []
|
|
63
|
+
for shape in slide.shapes:
|
|
64
|
+
if hasattr(shape, "text") and shape.text.strip():
|
|
65
|
+
text_content.append(shape.text.strip())
|
|
66
|
+
|
|
67
|
+
if text_content:
|
|
68
|
+
slide_text += "\n".join(text_content)
|
|
69
|
+
else:
|
|
70
|
+
slide_text += "(No text content)"
|
|
71
|
+
|
|
72
|
+
slide_texts.append(slide_text)
|
|
73
|
+
|
|
74
|
+
doc_content = "\n\n".join(slide_texts)
|
|
75
|
+
|
|
76
|
+
documents = [
|
|
77
|
+
Document(
|
|
78
|
+
name=doc_name,
|
|
79
|
+
id=str(uuid4()),
|
|
80
|
+
content=doc_content,
|
|
81
|
+
)
|
|
82
|
+
]
|
|
83
|
+
|
|
84
|
+
if self.chunk:
|
|
85
|
+
chunked_documents = []
|
|
86
|
+
for document in documents:
|
|
87
|
+
chunked_documents.extend(self.chunk_document(document))
|
|
88
|
+
return chunked_documents
|
|
89
|
+
return documents
|
|
90
|
+
|
|
91
|
+
except Exception as e:
|
|
92
|
+
log_error(f"Error reading file: {e}")
|
|
93
|
+
return []
|
|
94
|
+
|
|
95
|
+
async def async_read(self, file: Union[Path, IO[Any]], name: Optional[str] = None) -> List[Document]:
|
|
96
|
+
"""Asynchronously read a pptx file and return a list of documents"""
|
|
97
|
+
try:
|
|
98
|
+
return await asyncio.to_thread(self.read, file, name)
|
|
99
|
+
except Exception as e:
|
|
100
|
+
log_error(f"Error reading file asynchronously: {e}")
|
|
101
|
+
return []
|
|
@@ -16,8 +16,7 @@ class ReaderFactory:
|
|
|
16
16
|
from agno.knowledge.reader.pdf_reader import PDFReader
|
|
17
17
|
|
|
18
18
|
config: Dict[str, Any] = {
|
|
19
|
-
"
|
|
20
|
-
"chunk_size": 100,
|
|
19
|
+
"name": "PDF Reader",
|
|
21
20
|
"description": "Processes PDF documents with OCR support for images and text extraction",
|
|
22
21
|
}
|
|
23
22
|
config.update(kwargs)
|
|
@@ -35,6 +34,18 @@ class ReaderFactory:
|
|
|
35
34
|
config.update(kwargs)
|
|
36
35
|
return CSVReader(**config)
|
|
37
36
|
|
|
37
|
+
@classmethod
|
|
38
|
+
def _get_field_labeled_csv_reader(cls, **kwargs) -> Reader:
|
|
39
|
+
"""Get Field Labeled CSV reader instance."""
|
|
40
|
+
from agno.knowledge.reader.field_labeled_csv_reader import FieldLabeledCSVReader
|
|
41
|
+
|
|
42
|
+
config: Dict[str, Any] = {
|
|
43
|
+
"name": "Field Labeled CSV Reader",
|
|
44
|
+
"description": "Converts CSV rows to field-labeled text format for enhanced readability and context",
|
|
45
|
+
}
|
|
46
|
+
config.update(kwargs)
|
|
47
|
+
return FieldLabeledCSVReader(**config)
|
|
48
|
+
|
|
38
49
|
@classmethod
|
|
39
50
|
def _get_docx_reader(cls, **kwargs) -> Reader:
|
|
40
51
|
"""Get Docx reader instance."""
|
|
@@ -47,6 +58,18 @@ class ReaderFactory:
|
|
|
47
58
|
config.update(kwargs)
|
|
48
59
|
return DocxReader(**config)
|
|
49
60
|
|
|
61
|
+
@classmethod
|
|
62
|
+
def _get_pptx_reader(cls, **kwargs) -> Reader:
|
|
63
|
+
"""Get PPTX reader instance."""
|
|
64
|
+
from agno.knowledge.reader.pptx_reader import PPTXReader
|
|
65
|
+
|
|
66
|
+
config: Dict[str, Any] = {
|
|
67
|
+
"name": "PPTX Reader",
|
|
68
|
+
"description": "Extracts text content from Microsoft PowerPoint presentations (.pptx format)",
|
|
69
|
+
}
|
|
70
|
+
config.update(kwargs)
|
|
71
|
+
return PPTXReader(**config)
|
|
72
|
+
|
|
50
73
|
@classmethod
|
|
51
74
|
def _get_json_reader(cls, **kwargs) -> Reader:
|
|
52
75
|
"""Get JSON reader instance."""
|
|
@@ -109,6 +132,21 @@ class ReaderFactory:
|
|
|
109
132
|
config.update(kwargs)
|
|
110
133
|
return FirecrawlReader(**config)
|
|
111
134
|
|
|
135
|
+
@classmethod
|
|
136
|
+
def _get_tavily_reader(cls, **kwargs) -> Reader:
|
|
137
|
+
"""Get Tavily reader instance."""
|
|
138
|
+
from agno.knowledge.reader.tavily_reader import TavilyReader
|
|
139
|
+
|
|
140
|
+
config: Dict[str, Any] = {
|
|
141
|
+
"api_key": kwargs.get("api_key") or os.getenv("TAVILY_API_KEY"),
|
|
142
|
+
"extract_format": "markdown",
|
|
143
|
+
"extract_depth": "basic",
|
|
144
|
+
"name": "Tavily Reader",
|
|
145
|
+
"description": "Extracts content from URLs using Tavily's Extract API with markdown or text output",
|
|
146
|
+
}
|
|
147
|
+
config.update(kwargs)
|
|
148
|
+
return TavilyReader(**config)
|
|
149
|
+
|
|
112
150
|
@classmethod
|
|
113
151
|
def _get_youtube_reader(cls, **kwargs) -> Reader:
|
|
114
152
|
"""Get YouTube reader instance."""
|
|
@@ -189,8 +227,10 @@ class ReaderFactory:
|
|
|
189
227
|
return cls.create_reader("pdf")
|
|
190
228
|
elif extension in [".csv", "text/csv"]:
|
|
191
229
|
return cls.create_reader("csv")
|
|
192
|
-
elif extension in [".docx", ".doc"]:
|
|
230
|
+
elif extension in [".docx", ".doc", "application/vnd.openxmlformats-officedocument.wordprocessingml.document"]:
|
|
193
231
|
return cls.create_reader("docx")
|
|
232
|
+
elif extension == ".pptx":
|
|
233
|
+
return cls.create_reader("pptx")
|
|
194
234
|
elif extension == ".json":
|
|
195
235
|
return cls.create_reader("json")
|
|
196
236
|
elif extension in [".md", ".markdown"]:
|
|
@@ -210,8 +250,8 @@ class ReaderFactory:
|
|
|
210
250
|
if any(domain in url_lower for domain in ["youtube.com", "youtu.be"]):
|
|
211
251
|
return cls.create_reader("youtube")
|
|
212
252
|
|
|
213
|
-
# Default to
|
|
214
|
-
return cls.create_reader("
|
|
253
|
+
# Default to website reader
|
|
254
|
+
return cls.create_reader("website")
|
|
215
255
|
|
|
216
256
|
@classmethod
|
|
217
257
|
def get_all_reader_keys(cls) -> List[str]:
|
|
@@ -228,7 +268,12 @@ class ReaderFactory:
|
|
|
228
268
|
reader_keys.append(reader_key)
|
|
229
269
|
|
|
230
270
|
# Define priority order for URL readers
|
|
231
|
-
url_reader_priority = [
|
|
271
|
+
url_reader_priority = [
|
|
272
|
+
"website",
|
|
273
|
+
"firecrawl",
|
|
274
|
+
"tavily",
|
|
275
|
+
"youtube",
|
|
276
|
+
]
|
|
232
277
|
|
|
233
278
|
# Sort with URL readers in priority order, others alphabetically
|
|
234
279
|
def sort_key(reader_key):
|
|
@@ -10,7 +10,7 @@ from agno.knowledge.reader.base import Reader
|
|
|
10
10
|
from agno.knowledge.reader.pdf_reader import PDFReader
|
|
11
11
|
from agno.knowledge.reader.text_reader import TextReader
|
|
12
12
|
from agno.knowledge.types import ContentType
|
|
13
|
-
from agno.utils.log import
|
|
13
|
+
from agno.utils.log import log_debug, log_error
|
|
14
14
|
|
|
15
15
|
try:
|
|
16
16
|
from agno.aws.resource.s3.object import S3Object # type: ignore
|
|
@@ -51,7 +51,7 @@ class S3Reader(Reader):
|
|
|
51
51
|
|
|
52
52
|
def read(self, name: Optional[str], s3_object: S3Object) -> List[Document]:
|
|
53
53
|
try:
|
|
54
|
-
|
|
54
|
+
log_debug(f"Reading S3 file: {s3_object.uri}")
|
|
55
55
|
|
|
56
56
|
# Read PDF files
|
|
57
57
|
if s3_object.uri.endswith(".pdf"):
|
|
@@ -74,25 +74,13 @@ class S3Reader(Reader):
|
|
|
74
74
|
obj_name = s3_object.name.split("/")[-1]
|
|
75
75
|
temporary_file = Path("storage").joinpath(obj_name)
|
|
76
76
|
s3_object.download(temporary_file)
|
|
77
|
-
|
|
78
|
-
# TODO: Before we were using textract here. Needed?
|
|
79
|
-
# s3_object.download(temporary_file)
|
|
80
|
-
# doc_content = textract.process(temporary_file)
|
|
81
|
-
# documents = [
|
|
82
|
-
# Document(
|
|
83
|
-
# name=doc_name,
|
|
84
|
-
# id=doc_name,
|
|
85
|
-
# content=doc_content.decode("utf-8"),
|
|
86
|
-
# )
|
|
87
|
-
# ]
|
|
88
|
-
|
|
89
77
|
documents = TextReader().read(file=temporary_file, name=doc_name)
|
|
90
78
|
|
|
91
79
|
temporary_file.unlink()
|
|
92
80
|
return documents
|
|
93
81
|
|
|
94
82
|
except Exception as e:
|
|
95
|
-
|
|
83
|
+
log_error(f"Error reading: {s3_object.uri}: {e}")
|
|
96
84
|
|
|
97
85
|
return []
|
|
98
86
|
|
|
@@ -0,0 +1,194 @@
|
|
|
1
|
+
import asyncio
|
|
2
|
+
from dataclasses import dataclass
|
|
3
|
+
from typing import Dict, List, Literal, Optional
|
|
4
|
+
|
|
5
|
+
from agno.knowledge.chunking.semantic import SemanticChunking
|
|
6
|
+
from agno.knowledge.chunking.strategy import ChunkingStrategy, ChunkingStrategyType
|
|
7
|
+
from agno.knowledge.document.base import Document
|
|
8
|
+
from agno.knowledge.reader.base import Reader
|
|
9
|
+
from agno.knowledge.types import ContentType
|
|
10
|
+
from agno.utils.log import log_debug, logger
|
|
11
|
+
|
|
12
|
+
try:
|
|
13
|
+
from tavily import TavilyClient # type: ignore[attr-defined]
|
|
14
|
+
except ImportError:
|
|
15
|
+
raise ImportError(
|
|
16
|
+
"The `tavily-python` package is not installed. Please install it via `pip install tavily-python`."
|
|
17
|
+
)
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
@dataclass
class TavilyReader(Reader):
    """Reader that extracts web page content from URLs via Tavily's Extract API.

    Wraps ``tavily.TavilyClient.extract`` and converts the extracted raw
    content into Agno ``Document`` objects, optionally chunked.
    """

    # Tavily API key; falls back to the TAVILY_API_KEY env var inside TavilyClient.
    api_key: Optional[str] = None
    # Extra keyword arguments merged into every extract() call.
    params: Optional[Dict] = None
    # Output format requested from the Extract API.
    extract_format: Literal["markdown", "text"] = "markdown"
    # Extraction depth: "basic" (1 credit/5 URLs) or "advanced" (2 credits/5 URLs).
    extract_depth: Literal["basic", "advanced"] = "basic"

    def __init__(
        self,
        api_key: Optional[str] = None,
        params: Optional[Dict] = None,
        extract_format: Literal["markdown", "text"] = "markdown",
        extract_depth: Literal["basic", "advanced"] = "basic",
        chunk: bool = True,
        chunk_size: int = 5000,
        chunking_strategy: Optional[ChunkingStrategy] = None,
        name: Optional[str] = None,
        description: Optional[str] = None,
    ) -> None:
        """
        Initialize TavilyReader for extracting content from URLs using Tavily's Extract API.

        Args:
            api_key: Tavily API key (or use TAVILY_API_KEY env var)
            params: Additional parameters to pass to the extract API
            extract_format: Output format - "markdown" or "text"
            extract_depth: Extraction depth - "basic" (1 credit/5 URLs) or "advanced" (2 credits/5 URLs)
            chunk: Whether to chunk the extracted content
            chunk_size: Size of chunks when chunking is enabled
            chunking_strategy: Strategy to use for chunking (defaults to SemanticChunking)
            name: Name of the reader
            description: Description of the reader
        """
        # Build a fresh default strategy per instance instead of sharing a single
        # SemanticChunking() object created at class-definition time (mutable-default pitfall).
        if chunking_strategy is None:
            chunking_strategy = SemanticChunking()

        # Initialize base Reader (handles chunk_size / strategy)
        super().__init__(
            chunk=chunk, chunk_size=chunk_size, chunking_strategy=chunking_strategy, name=name, description=description
        )

        # Tavily-specific attributes
        self.api_key = api_key
        self.params = params or {}
        self.extract_format = extract_format
        self.extract_depth = extract_depth

    @classmethod
    def get_supported_chunking_strategies(cls) -> List[ChunkingStrategyType]:
        """Get the list of supported chunking strategies for Tavily readers."""
        return [
            ChunkingStrategyType.SEMANTIC_CHUNKER,
            ChunkingStrategyType.FIXED_SIZE_CHUNKER,
            ChunkingStrategyType.AGENTIC_CHUNKER,
            ChunkingStrategyType.DOCUMENT_CHUNKER,
            ChunkingStrategyType.RECURSIVE_CHUNKER,
        ]

    @classmethod
    def get_supported_content_types(cls) -> List[ContentType]:
        """Tavily readers handle URL content only."""
        return [ContentType.URL]

    def _extract(self, url: str, name: Optional[str] = None) -> List[Document]:
        """
        Internal method to extract content from a URL using Tavily's Extract API.

        Args:
            url: The URL to extract content from
            name: Optional name for the document (defaults to URL)

        Returns:
            A list of documents containing the extracted content
        """
        log_debug(f"Extracting content from: {url}")

        client = TavilyClient(api_key=self.api_key)

        # Prepare extract parameters.
        # Bug fix: the tavily-python client's keyword is `extract_depth`, not `depth`
        # (the latter raised TypeError, which was swallowed below and produced empty
        # documents). Also forward the configured output format, which was previously
        # set on the reader but never passed to the API.
        extract_params = {
            "urls": [url],
            "extract_depth": self.extract_depth,
            "format": self.extract_format,
        }

        # Add optional params if provided (may deliberately override the above)
        if self.params:
            extract_params.update(self.params)

        try:
            # Call Tavily Extract API
            response = client.extract(**extract_params)

            # Extract content from response
            if not response or "results" not in response:
                logger.warning(f"No results received for URL: {url}")
                return [Document(name=name or url, id=url, content="")]

            results = response.get("results", [])
            if not results:
                logger.warning(f"Empty results for URL: {url}")
                return [Document(name=name or url, id=url, content="")]

            # Get the first result (since we're extracting a single URL)
            result = results[0]

            # Check if extraction failed
            if "failed_reason" in result:
                logger.warning(f"Extraction failed for {url}: {result['failed_reason']}")
                return [Document(name=name or url, id=url, content="")]

            # Get raw content; normalize a missing/None payload to an empty string
            content = result.get("raw_content", "")
            if content is None:
                content = ""
                logger.warning(f"No content received for URL: {url}")

            # Debug logging
            log_debug(f"Received content type: {type(content)}")
            log_debug(f"Content length: {len(content) if content else 0}")

            # Create documents (chunked only when enabled and content is non-empty)
            documents = []
            if self.chunk and content:
                documents.extend(self.chunk_document(Document(name=name or url, id=url, content=content)))
            else:
                documents.append(Document(name=name or url, id=url, content=content))

            return documents

        except Exception as e:
            logger.error(f"Error extracting content from {url}: {e}")
            return [Document(name=name or url, id=url, content="")]

    async def _async_extract(self, url: str, name: Optional[str] = None) -> List[Document]:
        """
        Internal async method to extract content from a URL.

        Args:
            url: The URL to extract content from
            name: Optional name for the document

        Returns:
            A list of documents containing the extracted content
        """
        log_debug(f"Async extracting content from: {url}")

        # Use asyncio.to_thread to run the synchronous extract in a thread
        # (TavilyClient is a blocking HTTP client).
        return await asyncio.to_thread(self._extract, url, name)

    def read(self, url: str, name: Optional[str] = None) -> List[Document]:
        """
        Reads content from a URL using Tavily Extract API.

        This is the public API method that users should call.

        Args:
            url: The URL to extract content from
            name: Optional name for the document

        Returns:
            A list of documents containing the extracted content
        """
        return self._extract(url, name)

    async def async_read(self, url: str, name: Optional[str] = None) -> List[Document]:
        """
        Asynchronously reads content from a URL using Tavily Extract API.

        This is the public API method that users should call for async operations.

        Args:
            url: The URL to extract content from
            name: Optional name for the document

        Returns:
            A list of documents containing the extracted content
        """
        return await self._async_extract(url, name)