agno 2.0.1__py3-none-any.whl → 2.3.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- agno/agent/agent.py +6015 -2823
- agno/api/api.py +2 -0
- agno/api/os.py +1 -1
- agno/culture/__init__.py +3 -0
- agno/culture/manager.py +956 -0
- agno/db/async_postgres/__init__.py +3 -0
- agno/db/base.py +385 -6
- agno/db/dynamo/dynamo.py +388 -81
- agno/db/dynamo/schemas.py +47 -10
- agno/db/dynamo/utils.py +63 -4
- agno/db/firestore/firestore.py +435 -64
- agno/db/firestore/schemas.py +11 -0
- agno/db/firestore/utils.py +102 -4
- agno/db/gcs_json/gcs_json_db.py +384 -42
- agno/db/gcs_json/utils.py +60 -26
- agno/db/in_memory/in_memory_db.py +351 -66
- agno/db/in_memory/utils.py +60 -2
- agno/db/json/json_db.py +339 -48
- agno/db/json/utils.py +60 -26
- agno/db/migrations/manager.py +199 -0
- agno/db/migrations/v1_to_v2.py +510 -37
- agno/db/migrations/versions/__init__.py +0 -0
- agno/db/migrations/versions/v2_3_0.py +938 -0
- agno/db/mongo/__init__.py +15 -1
- agno/db/mongo/async_mongo.py +2036 -0
- agno/db/mongo/mongo.py +653 -76
- agno/db/mongo/schemas.py +13 -0
- agno/db/mongo/utils.py +80 -8
- agno/db/mysql/mysql.py +687 -25
- agno/db/mysql/schemas.py +61 -37
- agno/db/mysql/utils.py +60 -2
- agno/db/postgres/__init__.py +2 -1
- agno/db/postgres/async_postgres.py +2001 -0
- agno/db/postgres/postgres.py +676 -57
- agno/db/postgres/schemas.py +43 -18
- agno/db/postgres/utils.py +164 -2
- agno/db/redis/redis.py +344 -38
- agno/db/redis/schemas.py +18 -0
- agno/db/redis/utils.py +60 -2
- agno/db/schemas/__init__.py +2 -1
- agno/db/schemas/culture.py +120 -0
- agno/db/schemas/memory.py +13 -0
- agno/db/singlestore/schemas.py +26 -1
- agno/db/singlestore/singlestore.py +687 -53
- agno/db/singlestore/utils.py +60 -2
- agno/db/sqlite/__init__.py +2 -1
- agno/db/sqlite/async_sqlite.py +2371 -0
- agno/db/sqlite/schemas.py +24 -0
- agno/db/sqlite/sqlite.py +774 -85
- agno/db/sqlite/utils.py +168 -5
- agno/db/surrealdb/__init__.py +3 -0
- agno/db/surrealdb/metrics.py +292 -0
- agno/db/surrealdb/models.py +309 -0
- agno/db/surrealdb/queries.py +71 -0
- agno/db/surrealdb/surrealdb.py +1361 -0
- agno/db/surrealdb/utils.py +147 -0
- agno/db/utils.py +50 -22
- agno/eval/accuracy.py +50 -43
- agno/eval/performance.py +6 -3
- agno/eval/reliability.py +6 -3
- agno/eval/utils.py +33 -16
- agno/exceptions.py +68 -1
- agno/filters.py +354 -0
- agno/guardrails/__init__.py +6 -0
- agno/guardrails/base.py +19 -0
- agno/guardrails/openai.py +144 -0
- agno/guardrails/pii.py +94 -0
- agno/guardrails/prompt_injection.py +52 -0
- agno/integrations/discord/client.py +1 -0
- agno/knowledge/chunking/agentic.py +13 -10
- agno/knowledge/chunking/fixed.py +1 -1
- agno/knowledge/chunking/semantic.py +40 -8
- agno/knowledge/chunking/strategy.py +59 -15
- agno/knowledge/embedder/aws_bedrock.py +9 -4
- agno/knowledge/embedder/azure_openai.py +54 -0
- agno/knowledge/embedder/base.py +2 -0
- agno/knowledge/embedder/cohere.py +184 -5
- agno/knowledge/embedder/fastembed.py +1 -1
- agno/knowledge/embedder/google.py +79 -1
- agno/knowledge/embedder/huggingface.py +9 -4
- agno/knowledge/embedder/jina.py +63 -0
- agno/knowledge/embedder/mistral.py +78 -11
- agno/knowledge/embedder/nebius.py +1 -1
- agno/knowledge/embedder/ollama.py +13 -0
- agno/knowledge/embedder/openai.py +37 -65
- agno/knowledge/embedder/sentence_transformer.py +8 -4
- agno/knowledge/embedder/vllm.py +262 -0
- agno/knowledge/embedder/voyageai.py +69 -16
- agno/knowledge/knowledge.py +594 -186
- agno/knowledge/reader/base.py +9 -2
- agno/knowledge/reader/csv_reader.py +8 -10
- agno/knowledge/reader/docx_reader.py +5 -6
- agno/knowledge/reader/field_labeled_csv_reader.py +290 -0
- agno/knowledge/reader/json_reader.py +6 -5
- agno/knowledge/reader/markdown_reader.py +13 -13
- agno/knowledge/reader/pdf_reader.py +43 -68
- agno/knowledge/reader/pptx_reader.py +101 -0
- agno/knowledge/reader/reader_factory.py +51 -6
- agno/knowledge/reader/s3_reader.py +3 -15
- agno/knowledge/reader/tavily_reader.py +194 -0
- agno/knowledge/reader/text_reader.py +13 -13
- agno/knowledge/reader/web_search_reader.py +2 -43
- agno/knowledge/reader/website_reader.py +43 -25
- agno/knowledge/reranker/__init__.py +2 -8
- agno/knowledge/types.py +9 -0
- agno/knowledge/utils.py +20 -0
- agno/media.py +72 -0
- agno/memory/manager.py +336 -82
- agno/models/aimlapi/aimlapi.py +2 -2
- agno/models/anthropic/claude.py +183 -37
- agno/models/aws/bedrock.py +52 -112
- agno/models/aws/claude.py +33 -1
- agno/models/azure/ai_foundry.py +33 -15
- agno/models/azure/openai_chat.py +25 -8
- agno/models/base.py +999 -519
- agno/models/cerebras/cerebras.py +19 -13
- agno/models/cerebras/cerebras_openai.py +8 -5
- agno/models/cohere/chat.py +27 -1
- agno/models/cometapi/__init__.py +5 -0
- agno/models/cometapi/cometapi.py +57 -0
- agno/models/dashscope/dashscope.py +1 -0
- agno/models/deepinfra/deepinfra.py +2 -2
- agno/models/deepseek/deepseek.py +2 -2
- agno/models/fireworks/fireworks.py +2 -2
- agno/models/google/gemini.py +103 -31
- agno/models/groq/groq.py +28 -11
- agno/models/huggingface/huggingface.py +2 -1
- agno/models/internlm/internlm.py +2 -2
- agno/models/langdb/langdb.py +4 -4
- agno/models/litellm/chat.py +18 -1
- agno/models/litellm/litellm_openai.py +2 -2
- agno/models/llama_cpp/__init__.py +5 -0
- agno/models/llama_cpp/llama_cpp.py +22 -0
- agno/models/message.py +139 -0
- agno/models/meta/llama.py +27 -10
- agno/models/meta/llama_openai.py +5 -17
- agno/models/nebius/nebius.py +6 -6
- agno/models/nexus/__init__.py +3 -0
- agno/models/nexus/nexus.py +22 -0
- agno/models/nvidia/nvidia.py +2 -2
- agno/models/ollama/chat.py +59 -5
- agno/models/openai/chat.py +69 -29
- agno/models/openai/responses.py +103 -106
- agno/models/openrouter/openrouter.py +41 -3
- agno/models/perplexity/perplexity.py +4 -5
- agno/models/portkey/portkey.py +3 -3
- agno/models/requesty/__init__.py +5 -0
- agno/models/requesty/requesty.py +52 -0
- agno/models/response.py +77 -1
- agno/models/sambanova/sambanova.py +2 -2
- agno/models/siliconflow/__init__.py +5 -0
- agno/models/siliconflow/siliconflow.py +25 -0
- agno/models/together/together.py +2 -2
- agno/models/utils.py +254 -8
- agno/models/vercel/v0.py +2 -2
- agno/models/vertexai/__init__.py +0 -0
- agno/models/vertexai/claude.py +96 -0
- agno/models/vllm/vllm.py +1 -0
- agno/models/xai/xai.py +3 -2
- agno/os/app.py +543 -178
- agno/os/auth.py +24 -14
- agno/os/config.py +1 -0
- agno/os/interfaces/__init__.py +1 -0
- agno/os/interfaces/a2a/__init__.py +3 -0
- agno/os/interfaces/a2a/a2a.py +42 -0
- agno/os/interfaces/a2a/router.py +250 -0
- agno/os/interfaces/a2a/utils.py +924 -0
- agno/os/interfaces/agui/agui.py +23 -7
- agno/os/interfaces/agui/router.py +27 -3
- agno/os/interfaces/agui/utils.py +242 -142
- agno/os/interfaces/base.py +6 -2
- agno/os/interfaces/slack/router.py +81 -23
- agno/os/interfaces/slack/slack.py +29 -14
- agno/os/interfaces/whatsapp/router.py +11 -4
- agno/os/interfaces/whatsapp/whatsapp.py +14 -7
- agno/os/mcp.py +111 -54
- agno/os/middleware/__init__.py +7 -0
- agno/os/middleware/jwt.py +233 -0
- agno/os/router.py +556 -139
- agno/os/routers/evals/evals.py +71 -34
- agno/os/routers/evals/schemas.py +31 -31
- agno/os/routers/evals/utils.py +6 -5
- agno/os/routers/health.py +31 -0
- agno/os/routers/home.py +52 -0
- agno/os/routers/knowledge/knowledge.py +185 -38
- agno/os/routers/knowledge/schemas.py +82 -22
- agno/os/routers/memory/memory.py +158 -53
- agno/os/routers/memory/schemas.py +20 -16
- agno/os/routers/metrics/metrics.py +20 -8
- agno/os/routers/metrics/schemas.py +16 -16
- agno/os/routers/session/session.py +499 -38
- agno/os/schema.py +308 -198
- agno/os/utils.py +401 -41
- agno/reasoning/anthropic.py +80 -0
- agno/reasoning/azure_ai_foundry.py +2 -2
- agno/reasoning/deepseek.py +2 -2
- agno/reasoning/default.py +3 -1
- agno/reasoning/gemini.py +73 -0
- agno/reasoning/groq.py +2 -2
- agno/reasoning/ollama.py +2 -2
- agno/reasoning/openai.py +7 -2
- agno/reasoning/vertexai.py +76 -0
- agno/run/__init__.py +6 -0
- agno/run/agent.py +248 -94
- agno/run/base.py +44 -5
- agno/run/team.py +238 -97
- agno/run/workflow.py +144 -33
- agno/session/agent.py +105 -89
- agno/session/summary.py +65 -25
- agno/session/team.py +176 -96
- agno/session/workflow.py +406 -40
- agno/team/team.py +3854 -1610
- agno/tools/dalle.py +2 -4
- agno/tools/decorator.py +4 -2
- agno/tools/duckduckgo.py +15 -11
- agno/tools/e2b.py +14 -7
- agno/tools/eleven_labs.py +23 -25
- agno/tools/exa.py +21 -16
- agno/tools/file.py +153 -23
- agno/tools/file_generation.py +350 -0
- agno/tools/firecrawl.py +4 -4
- agno/tools/function.py +250 -30
- agno/tools/gmail.py +238 -14
- agno/tools/google_drive.py +270 -0
- agno/tools/googlecalendar.py +36 -8
- agno/tools/googlesheets.py +20 -5
- agno/tools/jira.py +20 -0
- agno/tools/knowledge.py +3 -3
- agno/tools/mcp/__init__.py +10 -0
- agno/tools/mcp/mcp.py +331 -0
- agno/tools/mcp/multi_mcp.py +347 -0
- agno/tools/mcp/params.py +24 -0
- agno/tools/mcp_toolbox.py +284 -0
- agno/tools/mem0.py +11 -17
- agno/tools/memori.py +1 -53
- agno/tools/memory.py +419 -0
- agno/tools/models/nebius.py +5 -5
- agno/tools/models_labs.py +20 -10
- agno/tools/notion.py +204 -0
- agno/tools/parallel.py +314 -0
- agno/tools/scrapegraph.py +58 -31
- agno/tools/searxng.py +2 -2
- agno/tools/serper.py +2 -2
- agno/tools/slack.py +18 -3
- agno/tools/spider.py +2 -2
- agno/tools/tavily.py +146 -0
- agno/tools/whatsapp.py +1 -1
- agno/tools/workflow.py +278 -0
- agno/tools/yfinance.py +12 -11
- agno/utils/agent.py +820 -0
- agno/utils/audio.py +27 -0
- agno/utils/common.py +90 -1
- agno/utils/events.py +217 -2
- agno/utils/gemini.py +180 -22
- agno/utils/hooks.py +57 -0
- agno/utils/http.py +111 -0
- agno/utils/knowledge.py +12 -5
- agno/utils/log.py +1 -0
- agno/utils/mcp.py +92 -2
- agno/utils/media.py +188 -10
- agno/utils/merge_dict.py +22 -1
- agno/utils/message.py +60 -0
- agno/utils/models/claude.py +40 -11
- agno/utils/print_response/agent.py +105 -21
- agno/utils/print_response/team.py +103 -38
- agno/utils/print_response/workflow.py +251 -34
- agno/utils/reasoning.py +22 -1
- agno/utils/serialize.py +32 -0
- agno/utils/streamlit.py +16 -10
- agno/utils/string.py +41 -0
- agno/utils/team.py +98 -9
- agno/utils/tools.py +1 -1
- agno/vectordb/base.py +23 -4
- agno/vectordb/cassandra/cassandra.py +65 -9
- agno/vectordb/chroma/chromadb.py +182 -38
- agno/vectordb/clickhouse/clickhousedb.py +64 -11
- agno/vectordb/couchbase/couchbase.py +105 -10
- agno/vectordb/lancedb/lance_db.py +124 -133
- agno/vectordb/langchaindb/langchaindb.py +25 -7
- agno/vectordb/lightrag/lightrag.py +17 -3
- agno/vectordb/llamaindex/__init__.py +3 -0
- agno/vectordb/llamaindex/llamaindexdb.py +46 -7
- agno/vectordb/milvus/milvus.py +126 -9
- agno/vectordb/mongodb/__init__.py +7 -1
- agno/vectordb/mongodb/mongodb.py +112 -7
- agno/vectordb/pgvector/pgvector.py +142 -21
- agno/vectordb/pineconedb/pineconedb.py +80 -8
- agno/vectordb/qdrant/qdrant.py +125 -39
- agno/vectordb/redis/__init__.py +9 -0
- agno/vectordb/redis/redisdb.py +694 -0
- agno/vectordb/singlestore/singlestore.py +111 -25
- agno/vectordb/surrealdb/surrealdb.py +31 -5
- agno/vectordb/upstashdb/upstashdb.py +76 -8
- agno/vectordb/weaviate/weaviate.py +86 -15
- agno/workflow/__init__.py +2 -0
- agno/workflow/agent.py +299 -0
- agno/workflow/condition.py +112 -18
- agno/workflow/loop.py +69 -10
- agno/workflow/parallel.py +266 -118
- agno/workflow/router.py +110 -17
- agno/workflow/step.py +638 -129
- agno/workflow/steps.py +65 -6
- agno/workflow/types.py +61 -23
- agno/workflow/workflow.py +2085 -272
- {agno-2.0.1.dist-info → agno-2.3.0.dist-info}/METADATA +182 -58
- agno-2.3.0.dist-info/RECORD +577 -0
- agno/knowledge/reader/url_reader.py +0 -128
- agno/tools/googlesearch.py +0 -98
- agno/tools/mcp.py +0 -610
- agno/utils/models/aws_claude.py +0 -170
- agno-2.0.1.dist-info/RECORD +0 -515
- {agno-2.0.1.dist-info → agno-2.3.0.dist-info}/WHEEL +0 -0
- {agno-2.0.1.dist-info → agno-2.3.0.dist-info}/licenses/LICENSE +0 -0
- {agno-2.0.1.dist-info → agno-2.3.0.dist-info}/top_level.txt +0 -0
agno/knowledge/knowledge.py
CHANGED
|
@@ -4,24 +4,23 @@ import io
|
|
|
4
4
|
import time
|
|
5
5
|
from dataclasses import dataclass
|
|
6
6
|
from enum import Enum
|
|
7
|
-
from functools import cached_property
|
|
8
7
|
from io import BytesIO
|
|
9
8
|
from os.path import basename
|
|
10
9
|
from pathlib import Path
|
|
11
10
|
from typing import Any, Dict, List, Optional, Set, Tuple, Union, cast, overload
|
|
12
|
-
from uuid import uuid4
|
|
13
11
|
|
|
14
12
|
from httpx import AsyncClient
|
|
15
13
|
|
|
16
|
-
from agno.db.base import BaseDb
|
|
14
|
+
from agno.db.base import AsyncBaseDb, BaseDb
|
|
17
15
|
from agno.db.schemas.knowledge import KnowledgeRow
|
|
16
|
+
from agno.filters import FilterExpr
|
|
18
17
|
from agno.knowledge.content import Content, ContentAuth, ContentStatus, FileData
|
|
19
18
|
from agno.knowledge.document import Document
|
|
20
19
|
from agno.knowledge.reader import Reader, ReaderFactory
|
|
21
20
|
from agno.knowledge.remote_content.remote_content import GCSContent, RemoteContent, S3Content
|
|
22
21
|
from agno.utils.http import async_fetch_with_retry
|
|
23
22
|
from agno.utils.log import log_debug, log_error, log_info, log_warning
|
|
24
|
-
from agno.
|
|
23
|
+
from agno.utils.string import generate_id
|
|
25
24
|
|
|
26
25
|
ContentDict = Dict[str, Union[str, Dict[str, str]]]
|
|
27
26
|
|
|
@@ -39,19 +38,19 @@ class Knowledge:
|
|
|
39
38
|
|
|
40
39
|
name: Optional[str] = None
|
|
41
40
|
description: Optional[str] = None
|
|
42
|
-
vector_db: Optional[
|
|
43
|
-
contents_db: Optional[BaseDb] = None
|
|
41
|
+
vector_db: Optional[Any] = None
|
|
42
|
+
contents_db: Optional[Union[BaseDb, AsyncBaseDb]] = None
|
|
44
43
|
max_results: int = 10
|
|
45
44
|
readers: Optional[Dict[str, Reader]] = None
|
|
46
45
|
|
|
47
46
|
def __post_init__(self):
|
|
47
|
+
from agno.vectordb import VectorDb
|
|
48
|
+
|
|
49
|
+
self.vector_db = cast(VectorDb, self.vector_db)
|
|
48
50
|
if self.vector_db and not self.vector_db.exists():
|
|
49
51
|
self.vector_db.create()
|
|
50
52
|
|
|
51
53
|
self.construct_readers()
|
|
52
|
-
self.valid_metadata_filters = set()
|
|
53
|
-
|
|
54
|
-
# --- SDK Specific Methods ---
|
|
55
54
|
|
|
56
55
|
# --- Add Contents ---
|
|
57
56
|
@overload
|
|
@@ -64,9 +63,12 @@ class Knowledge:
|
|
|
64
63
|
paths: Optional[List[str]] = None,
|
|
65
64
|
urls: Optional[List[str]] = None,
|
|
66
65
|
metadata: Optional[Dict[str, str]] = None,
|
|
66
|
+
topics: Optional[List[str]] = None,
|
|
67
|
+
text_contents: Optional[List[str]] = None,
|
|
68
|
+
reader: Optional[Reader] = None,
|
|
67
69
|
include: Optional[List[str]] = None,
|
|
68
70
|
exclude: Optional[List[str]] = None,
|
|
69
|
-
upsert: bool =
|
|
71
|
+
upsert: bool = True,
|
|
70
72
|
skip_if_exists: bool = False,
|
|
71
73
|
remote_content: Optional[RemoteContent] = None,
|
|
72
74
|
) -> None: ...
|
|
@@ -74,6 +76,8 @@ class Knowledge:
|
|
|
74
76
|
async def add_contents_async(self, *args, **kwargs) -> None:
|
|
75
77
|
if args and isinstance(args[0], list):
|
|
76
78
|
arguments = args[0]
|
|
79
|
+
upsert = kwargs.get("upsert", True)
|
|
80
|
+
skip_if_exists = kwargs.get("skip_if_exists", False)
|
|
77
81
|
for argument in arguments:
|
|
78
82
|
await self.add_content_async(
|
|
79
83
|
name=argument.get("name"),
|
|
@@ -82,11 +86,12 @@ class Knowledge:
|
|
|
82
86
|
url=argument.get("url"),
|
|
83
87
|
metadata=argument.get("metadata"),
|
|
84
88
|
topics=argument.get("topics"),
|
|
89
|
+
text_content=argument.get("text_content"),
|
|
85
90
|
reader=argument.get("reader"),
|
|
86
91
|
include=argument.get("include"),
|
|
87
92
|
exclude=argument.get("exclude"),
|
|
88
|
-
upsert=argument.get("upsert",
|
|
89
|
-
skip_if_exists=argument.get("skip_if_exists",
|
|
93
|
+
upsert=argument.get("upsert", upsert),
|
|
94
|
+
skip_if_exists=argument.get("skip_if_exists", skip_if_exists),
|
|
90
95
|
remote_content=argument.get("remote_content", None),
|
|
91
96
|
)
|
|
92
97
|
|
|
@@ -95,14 +100,15 @@ class Knowledge:
|
|
|
95
100
|
metadata = kwargs.get("metadata", {})
|
|
96
101
|
description = kwargs.get("description", [])
|
|
97
102
|
topics = kwargs.get("topics", [])
|
|
103
|
+
reader = kwargs.get("reader", None)
|
|
98
104
|
paths = kwargs.get("paths", [])
|
|
99
105
|
urls = kwargs.get("urls", [])
|
|
106
|
+
text_contents = kwargs.get("text_contents", [])
|
|
100
107
|
include = kwargs.get("include")
|
|
101
108
|
exclude = kwargs.get("exclude")
|
|
102
|
-
upsert = kwargs.get("upsert",
|
|
109
|
+
upsert = kwargs.get("upsert", True)
|
|
103
110
|
skip_if_exists = kwargs.get("skip_if_exists", False)
|
|
104
111
|
remote_content = kwargs.get("remote_content", None)
|
|
105
|
-
|
|
106
112
|
for path in paths:
|
|
107
113
|
await self.add_content_async(
|
|
108
114
|
name=name,
|
|
@@ -113,6 +119,7 @@ class Knowledge:
|
|
|
113
119
|
exclude=exclude,
|
|
114
120
|
upsert=upsert,
|
|
115
121
|
skip_if_exists=skip_if_exists,
|
|
122
|
+
reader=reader,
|
|
116
123
|
)
|
|
117
124
|
for url in urls:
|
|
118
125
|
await self.add_content_async(
|
|
@@ -124,6 +131,21 @@ class Knowledge:
|
|
|
124
131
|
exclude=exclude,
|
|
125
132
|
upsert=upsert,
|
|
126
133
|
skip_if_exists=skip_if_exists,
|
|
134
|
+
reader=reader,
|
|
135
|
+
)
|
|
136
|
+
for i, text_content in enumerate(text_contents):
|
|
137
|
+
content_name = f"{name}_{i}" if name else f"text_content_{i}"
|
|
138
|
+
log_debug(f"Adding text content: {content_name}")
|
|
139
|
+
await self.add_content_async(
|
|
140
|
+
name=content_name,
|
|
141
|
+
description=description,
|
|
142
|
+
text_content=text_content,
|
|
143
|
+
metadata=metadata,
|
|
144
|
+
include=include,
|
|
145
|
+
exclude=exclude,
|
|
146
|
+
upsert=upsert,
|
|
147
|
+
skip_if_exists=skip_if_exists,
|
|
148
|
+
reader=reader,
|
|
127
149
|
)
|
|
128
150
|
if topics:
|
|
129
151
|
await self.add_content_async(
|
|
@@ -135,6 +157,7 @@ class Knowledge:
|
|
|
135
157
|
exclude=exclude,
|
|
136
158
|
upsert=upsert,
|
|
137
159
|
skip_if_exists=skip_if_exists,
|
|
160
|
+
reader=reader,
|
|
138
161
|
)
|
|
139
162
|
|
|
140
163
|
if remote_content:
|
|
@@ -145,6 +168,7 @@ class Knowledge:
|
|
|
145
168
|
remote_content=remote_content,
|
|
146
169
|
upsert=upsert,
|
|
147
170
|
skip_if_exists=skip_if_exists,
|
|
171
|
+
reader=reader,
|
|
148
172
|
)
|
|
149
173
|
|
|
150
174
|
else:
|
|
@@ -160,10 +184,14 @@ class Knowledge:
|
|
|
160
184
|
paths: Optional[List[str]] = None,
|
|
161
185
|
urls: Optional[List[str]] = None,
|
|
162
186
|
metadata: Optional[Dict[str, str]] = None,
|
|
187
|
+
topics: Optional[List[str]] = None,
|
|
188
|
+
text_contents: Optional[List[str]] = None,
|
|
189
|
+
reader: Optional[Reader] = None,
|
|
163
190
|
include: Optional[List[str]] = None,
|
|
164
191
|
exclude: Optional[List[str]] = None,
|
|
165
|
-
upsert: bool =
|
|
192
|
+
upsert: bool = True,
|
|
166
193
|
skip_if_exists: bool = False,
|
|
194
|
+
remote_content: Optional[RemoteContent] = None,
|
|
167
195
|
) -> None: ...
|
|
168
196
|
|
|
169
197
|
def add_contents(self, *args, **kwargs) -> None:
|
|
@@ -181,10 +209,14 @@ class Knowledge:
|
|
|
181
209
|
paths: Optional list of file paths to load content from
|
|
182
210
|
urls: Optional list of URLs to load content from
|
|
183
211
|
metadata: Optional metadata dictionary to apply to all content
|
|
212
|
+
topics: Optional list of topics to add
|
|
213
|
+
text_contents: Optional list of text content strings to add
|
|
214
|
+
reader: Optional reader to use for processing content
|
|
184
215
|
include: Optional list of file patterns to include
|
|
185
216
|
exclude: Optional list of file patterns to exclude
|
|
186
217
|
upsert: Whether to update existing content if it already exists
|
|
187
218
|
skip_if_exists: Whether to skip adding content if it already exists
|
|
219
|
+
remote_content: Optional remote content (S3, GCS, etc.) to add
|
|
188
220
|
"""
|
|
189
221
|
asyncio.run(self.add_contents_async(*args, **kwargs))
|
|
190
222
|
|
|
@@ -200,7 +232,7 @@ class Knowledge:
|
|
|
200
232
|
metadata: Optional[Dict[str, str]] = None,
|
|
201
233
|
include: Optional[List[str]] = None,
|
|
202
234
|
exclude: Optional[List[str]] = None,
|
|
203
|
-
upsert: bool =
|
|
235
|
+
upsert: bool = True,
|
|
204
236
|
skip_if_exists: bool = False,
|
|
205
237
|
reader: Optional[Reader] = None,
|
|
206
238
|
auth: Optional[ContentAuth] = None,
|
|
@@ -228,11 +260,13 @@ class Knowledge:
|
|
|
228
260
|
) -> None:
|
|
229
261
|
# Validation: At least one of the parameters must be provided
|
|
230
262
|
if all(argument is None for argument in [path, url, text_content, topics, remote_content]):
|
|
231
|
-
|
|
263
|
+
log_warning(
|
|
264
|
+
"At least one of 'path', 'url', 'text_content', 'topics', or 'remote_content' must be provided."
|
|
265
|
+
)
|
|
232
266
|
return
|
|
233
267
|
|
|
234
268
|
if not skip_if_exists:
|
|
235
|
-
|
|
269
|
+
log_debug("skip_if_exists is disabled, disabling upsert")
|
|
236
270
|
upsert = False
|
|
237
271
|
|
|
238
272
|
content = None
|
|
@@ -241,7 +275,6 @@ class Knowledge:
|
|
|
241
275
|
file_data = FileData(content=text_content, type="Text")
|
|
242
276
|
|
|
243
277
|
content = Content(
|
|
244
|
-
id=str(uuid4()),
|
|
245
278
|
name=name,
|
|
246
279
|
description=description,
|
|
247
280
|
path=path,
|
|
@@ -253,6 +286,8 @@ class Knowledge:
|
|
|
253
286
|
reader=reader,
|
|
254
287
|
auth=auth,
|
|
255
288
|
)
|
|
289
|
+
content.content_hash = self._build_content_hash(content)
|
|
290
|
+
content.id = generate_id(content.content_hash)
|
|
256
291
|
|
|
257
292
|
await self._load_content(content, upsert, skip_if_exists, include, exclude)
|
|
258
293
|
|
|
@@ -266,7 +301,7 @@ class Knowledge:
|
|
|
266
301
|
metadata: Optional[Dict[str, str]] = None,
|
|
267
302
|
include: Optional[List[str]] = None,
|
|
268
303
|
exclude: Optional[List[str]] = None,
|
|
269
|
-
upsert: bool =
|
|
304
|
+
upsert: bool = True,
|
|
270
305
|
skip_if_exists: bool = False,
|
|
271
306
|
reader: Optional[Reader] = None,
|
|
272
307
|
auth: Optional[ContentAuth] = None,
|
|
@@ -289,7 +324,7 @@ class Knowledge:
|
|
|
289
324
|
include: Optional[List[str]] = None,
|
|
290
325
|
exclude: Optional[List[str]] = None,
|
|
291
326
|
upsert: bool = True,
|
|
292
|
-
skip_if_exists: bool =
|
|
327
|
+
skip_if_exists: bool = False,
|
|
293
328
|
auth: Optional[ContentAuth] = None,
|
|
294
329
|
) -> None:
|
|
295
330
|
"""
|
|
@@ -303,7 +338,7 @@ class Knowledge:
|
|
|
303
338
|
text_content: Optional text content to add directly
|
|
304
339
|
metadata: Optional metadata dictionary
|
|
305
340
|
topics: Optional list of topics
|
|
306
|
-
|
|
341
|
+
remote_content: Optional cloud storage configuration
|
|
307
342
|
reader: Optional custom reader for processing the content
|
|
308
343
|
include: Optional list of file patterns to include
|
|
309
344
|
exclude: Optional list of file patterns to exclude
|
|
@@ -329,6 +364,26 @@ class Knowledge:
|
|
|
329
364
|
)
|
|
330
365
|
)
|
|
331
366
|
|
|
367
|
+
def _should_skip(self, content_hash: str, skip_if_exists: bool) -> bool:
|
|
368
|
+
"""
|
|
369
|
+
Handle the skip_if_exists logic for content that already exists in the vector database.
|
|
370
|
+
|
|
371
|
+
Args:
|
|
372
|
+
content_hash: The content hash string to check for existence
|
|
373
|
+
skip_if_exists: Whether to skip if content already exists
|
|
374
|
+
|
|
375
|
+
Returns:
|
|
376
|
+
bool: True if should skip processing, False if should continue
|
|
377
|
+
"""
|
|
378
|
+
from agno.vectordb import VectorDb
|
|
379
|
+
|
|
380
|
+
self.vector_db = cast(VectorDb, self.vector_db)
|
|
381
|
+
if self.vector_db and self.vector_db.content_hash_exists(content_hash) and skip_if_exists:
|
|
382
|
+
log_debug(f"Content already exists: {content_hash}, skipping...")
|
|
383
|
+
return True
|
|
384
|
+
|
|
385
|
+
return False
|
|
386
|
+
|
|
332
387
|
async def _load_from_path(
|
|
333
388
|
self,
|
|
334
389
|
content: Content,
|
|
@@ -337,25 +392,28 @@ class Knowledge:
|
|
|
337
392
|
include: Optional[List[str]] = None,
|
|
338
393
|
exclude: Optional[List[str]] = None,
|
|
339
394
|
):
|
|
395
|
+
from agno.vectordb import VectorDb
|
|
396
|
+
|
|
397
|
+
self.vector_db = cast(VectorDb, self.vector_db)
|
|
398
|
+
|
|
340
399
|
log_info(f"Adding content from path, {content.id}, {content.name}, {content.path}, {content.description}")
|
|
341
400
|
path = Path(content.path) # type: ignore
|
|
342
401
|
|
|
343
402
|
if path.is_file():
|
|
344
403
|
if self._should_include_file(str(path), include, exclude):
|
|
345
|
-
|
|
404
|
+
log_debug(f"Adding file {path} due to include/exclude filters")
|
|
405
|
+
|
|
406
|
+
await self._add_to_contents_db(content)
|
|
407
|
+
if self._should_skip(content.content_hash, skip_if_exists): # type: ignore[arg-type]
|
|
408
|
+
content.status = ContentStatus.COMPLETED
|
|
409
|
+
await self._aupdate_content(content)
|
|
410
|
+
return
|
|
346
411
|
|
|
347
412
|
# Handle LightRAG special case - read file and upload directly
|
|
348
413
|
if self.vector_db.__class__.__name__ == "LightRag":
|
|
349
414
|
await self._process_lightrag_content(content, KnowledgeContentOrigin.PATH)
|
|
350
415
|
return
|
|
351
416
|
|
|
352
|
-
content.content_hash = self._build_content_hash(content)
|
|
353
|
-
if self.vector_db and self.vector_db.content_hash_exists(content.content_hash) and skip_if_exists:
|
|
354
|
-
log_info(f"Content {content.content_hash} already exists, skipping")
|
|
355
|
-
return
|
|
356
|
-
|
|
357
|
-
self._add_to_contents_db(content)
|
|
358
|
-
|
|
359
417
|
if content.reader:
|
|
360
418
|
# TODO: We will refactor this to eventually pass authorization to all readers
|
|
361
419
|
import inspect
|
|
@@ -370,7 +428,7 @@ class Knowledge:
|
|
|
370
428
|
|
|
371
429
|
else:
|
|
372
430
|
reader = ReaderFactory.get_reader_for_extension(path.suffix)
|
|
373
|
-
|
|
431
|
+
log_debug(f"Using Reader: {reader.__class__.__name__}")
|
|
374
432
|
if reader:
|
|
375
433
|
# TODO: We will refactor this to eventually pass authorization to all readers
|
|
376
434
|
import inspect
|
|
@@ -407,15 +465,16 @@ class Knowledge:
|
|
|
407
465
|
log_debug(f"Skipping file {file_path} due to include/exclude filters")
|
|
408
466
|
continue
|
|
409
467
|
|
|
410
|
-
id = str(uuid4())
|
|
411
468
|
file_content = Content(
|
|
412
|
-
id=id,
|
|
413
469
|
name=content.name,
|
|
414
470
|
path=str(file_path),
|
|
415
471
|
metadata=content.metadata,
|
|
416
472
|
description=content.description,
|
|
417
473
|
reader=content.reader,
|
|
418
474
|
)
|
|
475
|
+
file_content.content_hash = self._build_content_hash(file_content)
|
|
476
|
+
file_content.id = generate_id(file_content.content_hash)
|
|
477
|
+
|
|
419
478
|
await self._load_from_path(file_content, upsert, skip_if_exists, include, exclude)
|
|
420
479
|
else:
|
|
421
480
|
log_warning(f"Invalid path: {path}")
|
|
@@ -433,22 +492,26 @@ class Knowledge:
|
|
|
433
492
|
3. Read the content
|
|
434
493
|
4. Prepare and insert the content in the vector database
|
|
435
494
|
"""
|
|
495
|
+
from agno.vectordb import VectorDb
|
|
496
|
+
|
|
497
|
+
self.vector_db = cast(VectorDb, self.vector_db)
|
|
498
|
+
|
|
436
499
|
log_info(f"Adding content from URL {content.url}")
|
|
437
500
|
content.file_type = "url"
|
|
438
501
|
|
|
439
502
|
if not content.url:
|
|
440
503
|
raise ValueError("No url provided")
|
|
441
504
|
|
|
442
|
-
|
|
443
|
-
|
|
505
|
+
# 1. Add content to contents database
|
|
506
|
+
await self._add_to_contents_db(content)
|
|
507
|
+
if self._should_skip(content.content_hash, skip_if_exists): # type: ignore[arg-type]
|
|
508
|
+
content.status = ContentStatus.COMPLETED
|
|
509
|
+
await self._aupdate_content(content)
|
|
444
510
|
return
|
|
445
511
|
|
|
446
|
-
|
|
447
|
-
|
|
448
|
-
if self.vector_db and self.vector_db.content_hash_exists(content.content_hash) and skip_if_exists:
|
|
449
|
-
log_info(f"Content {content.content_hash} already exists, skipping")
|
|
512
|
+
if self.vector_db.__class__.__name__ == "LightRag":
|
|
513
|
+
await self._process_lightrag_content(content, KnowledgeContentOrigin.URL)
|
|
450
514
|
return
|
|
451
|
-
self._add_to_contents_db(content)
|
|
452
515
|
|
|
453
516
|
# 2. Validate URL
|
|
454
517
|
try:
|
|
@@ -458,18 +521,23 @@ class Knowledge:
|
|
|
458
521
|
if not all([parsed_url.scheme, parsed_url.netloc]):
|
|
459
522
|
content.status = ContentStatus.FAILED
|
|
460
523
|
content.status_message = f"Invalid URL format: {content.url}"
|
|
461
|
-
self.
|
|
524
|
+
await self._aupdate_content(content)
|
|
462
525
|
log_warning(f"Invalid URL format: {content.url}")
|
|
463
526
|
except Exception as e:
|
|
464
527
|
content.status = ContentStatus.FAILED
|
|
465
528
|
content.status_message = f"Invalid URL: {content.url} - {str(e)}"
|
|
466
|
-
self.
|
|
529
|
+
await self._aupdate_content(content)
|
|
467
530
|
log_warning(f"Invalid URL: {content.url} - {str(e)}")
|
|
468
531
|
|
|
469
|
-
# 3. Fetch and load content
|
|
470
|
-
|
|
471
|
-
|
|
472
|
-
|
|
532
|
+
# 3. Fetch and load content if file has an extension
|
|
533
|
+
url_path = Path(parsed_url.path)
|
|
534
|
+
file_extension = url_path.suffix.lower()
|
|
535
|
+
|
|
536
|
+
bytes_content = None
|
|
537
|
+
if file_extension:
|
|
538
|
+
async with AsyncClient() as client:
|
|
539
|
+
response = await async_fetch_with_retry(content.url, client=client)
|
|
540
|
+
bytes_content = BytesIO(response.content)
|
|
473
541
|
|
|
474
542
|
# 4. Select reader
|
|
475
543
|
# If a reader was provided by the user, use it
|
|
@@ -477,8 +545,6 @@ class Knowledge:
|
|
|
477
545
|
name = content.name if content.name else content.url
|
|
478
546
|
# Else select based on file extension
|
|
479
547
|
if reader is None:
|
|
480
|
-
url_path = Path(parsed_url.path)
|
|
481
|
-
file_extension = url_path.suffix.lower()
|
|
482
548
|
if file_extension == ".csv":
|
|
483
549
|
name = basename(parsed_url.path) or "data.csv"
|
|
484
550
|
reader = self.csv_reader
|
|
@@ -486,6 +552,8 @@ class Knowledge:
|
|
|
486
552
|
reader = self.pdf_reader
|
|
487
553
|
elif file_extension == ".docx":
|
|
488
554
|
reader = self.docx_reader
|
|
555
|
+
elif file_extension == ".pptx":
|
|
556
|
+
reader = self.pptx_reader
|
|
489
557
|
elif file_extension == ".json":
|
|
490
558
|
reader = self.json_reader
|
|
491
559
|
elif file_extension == ".markdown":
|
|
@@ -504,20 +572,26 @@ class Knowledge:
|
|
|
504
572
|
if reader.__class__.__name__ == "YouTubeReader":
|
|
505
573
|
read_documents = reader.read(content.url, name=name)
|
|
506
574
|
elif "password" in read_signature.parameters and content.auth and content.auth.password:
|
|
507
|
-
|
|
575
|
+
if bytes_content:
|
|
576
|
+
read_documents = reader.read(bytes_content, name=name, password=content.auth.password)
|
|
577
|
+
else:
|
|
578
|
+
read_documents = reader.read(content.url, name=name, password=content.auth.password)
|
|
508
579
|
else:
|
|
509
|
-
|
|
580
|
+
if bytes_content:
|
|
581
|
+
read_documents = reader.read(bytes_content, name=name)
|
|
582
|
+
else:
|
|
583
|
+
read_documents = reader.read(content.url, name=name)
|
|
584
|
+
|
|
510
585
|
except Exception as e:
|
|
511
586
|
log_error(f"Error reading URL: {content.url} - {str(e)}")
|
|
512
587
|
content.status = ContentStatus.FAILED
|
|
513
588
|
content.status_message = f"Error reading URL: {content.url} - {str(e)}"
|
|
514
|
-
self.
|
|
589
|
+
await self._aupdate_content(content)
|
|
515
590
|
return
|
|
516
591
|
|
|
517
592
|
# 6. Chunk documents if needed
|
|
518
593
|
if reader and not reader.chunk:
|
|
519
594
|
read_documents = await reader.chunk_documents_async(read_documents)
|
|
520
|
-
|
|
521
595
|
# 7. Prepare and insert the content in the vector database
|
|
522
596
|
file_size = 0
|
|
523
597
|
if read_documents:
|
|
@@ -531,8 +605,12 @@ class Knowledge:
|
|
|
531
605
|
self,
|
|
532
606
|
content: Content,
|
|
533
607
|
upsert: bool = True,
|
|
534
|
-
skip_if_exists: bool =
|
|
608
|
+
skip_if_exists: bool = False,
|
|
535
609
|
):
|
|
610
|
+
from agno.vectordb import VectorDb
|
|
611
|
+
|
|
612
|
+
self.vector_db = cast(VectorDb, self.vector_db)
|
|
613
|
+
|
|
536
614
|
if content.name:
|
|
537
615
|
name = content.name
|
|
538
616
|
elif content.file_data and content.file_data.content:
|
|
@@ -554,28 +632,24 @@ class Knowledge:
|
|
|
554
632
|
|
|
555
633
|
log_info(f"Adding content from {content.name}")
|
|
556
634
|
|
|
557
|
-
|
|
558
|
-
|
|
635
|
+
await self._add_to_contents_db(content)
|
|
636
|
+
if self._should_skip(content.content_hash, skip_if_exists): # type: ignore[arg-type]
|
|
637
|
+
content.status = ContentStatus.COMPLETED
|
|
638
|
+
await self._aupdate_content(content)
|
|
559
639
|
return
|
|
560
640
|
|
|
561
|
-
content.
|
|
562
|
-
|
|
563
|
-
log_info(f"Content {content.content_hash} already exists, skipping")
|
|
564
|
-
|
|
641
|
+
if content.file_data and self.vector_db.__class__.__name__ == "LightRag":
|
|
642
|
+
await self._process_lightrag_content(content, KnowledgeContentOrigin.CONTENT)
|
|
565
643
|
return
|
|
566
|
-
self._add_to_contents_db(content)
|
|
567
644
|
|
|
568
645
|
read_documents = []
|
|
569
646
|
|
|
570
647
|
if isinstance(content.file_data, str):
|
|
571
|
-
|
|
572
|
-
content_bytes = content.file_data.encode("utf-8")
|
|
573
|
-
except UnicodeEncodeError:
|
|
574
|
-
content_bytes = content.file_data.encode("latin-1")
|
|
648
|
+
content_bytes = content.file_data.encode("utf-8", errors="replace")
|
|
575
649
|
content_io = io.BytesIO(content_bytes)
|
|
576
650
|
|
|
577
651
|
if content.reader:
|
|
578
|
-
|
|
652
|
+
log_debug(f"Using reader: {content.reader.__class__.__name__} to read content")
|
|
579
653
|
read_documents = content.reader.read(content_io, name=name)
|
|
580
654
|
else:
|
|
581
655
|
text_reader = self.text_reader
|
|
@@ -584,7 +658,7 @@ class Knowledge:
|
|
|
584
658
|
else:
|
|
585
659
|
content.status = ContentStatus.FAILED
|
|
586
660
|
content.status_message = "Text reader not available"
|
|
587
|
-
self.
|
|
661
|
+
await self._aupdate_content(content)
|
|
588
662
|
return
|
|
589
663
|
|
|
590
664
|
elif isinstance(content.file_data, FileData):
|
|
@@ -592,27 +666,19 @@ class Knowledge:
|
|
|
592
666
|
if isinstance(content.file_data.content, bytes):
|
|
593
667
|
content_io = io.BytesIO(content.file_data.content)
|
|
594
668
|
elif isinstance(content.file_data.content, str):
|
|
595
|
-
|
|
596
|
-
try:
|
|
597
|
-
content_bytes = content.file_data.content.encode("utf-8")
|
|
598
|
-
except UnicodeEncodeError:
|
|
599
|
-
log_debug(f"UTF-8 encoding failed for {content.file_data.type}, using latin-1")
|
|
600
|
-
content_bytes = content.file_data.content.encode("latin-1")
|
|
601
|
-
else:
|
|
602
|
-
content_bytes = content.file_data.content.encode("latin-1")
|
|
669
|
+
content_bytes = content.file_data.content.encode("utf-8", errors="replace")
|
|
603
670
|
content_io = io.BytesIO(content_bytes)
|
|
604
671
|
else:
|
|
605
672
|
content_io = content.file_data.content # type: ignore
|
|
606
673
|
|
|
607
674
|
# Respect an explicitly provided reader; otherwise select based on file type
|
|
608
675
|
if content.reader:
|
|
609
|
-
|
|
676
|
+
log_debug(f"Using reader: {content.reader.__class__.__name__} to read content")
|
|
610
677
|
reader = content.reader
|
|
611
678
|
else:
|
|
612
679
|
reader = self._select_reader(content.file_data.type)
|
|
613
680
|
name = content.name if content.name else f"content_{content.file_data.type}"
|
|
614
681
|
read_documents = reader.read(content_io, name=name)
|
|
615
|
-
|
|
616
682
|
for read_document in read_documents:
|
|
617
683
|
if content.metadata:
|
|
618
684
|
read_document.meta_data.update(content.metadata)
|
|
@@ -621,12 +687,13 @@ class Knowledge:
|
|
|
621
687
|
if len(read_documents) == 0:
|
|
622
688
|
content.status = ContentStatus.FAILED
|
|
623
689
|
content.status_message = "Content could not be read"
|
|
624
|
-
self.
|
|
690
|
+
await self._aupdate_content(content)
|
|
691
|
+
return
|
|
625
692
|
|
|
626
693
|
else:
|
|
627
694
|
content.status = ContentStatus.FAILED
|
|
628
695
|
content.status_message = "No content provided"
|
|
629
|
-
self.
|
|
696
|
+
await self._aupdate_content(content)
|
|
630
697
|
return
|
|
631
698
|
|
|
632
699
|
await self._handle_vector_db_insert(content, read_documents, upsert)
|
|
@@ -637,6 +704,9 @@ class Knowledge:
|
|
|
637
704
|
upsert: bool,
|
|
638
705
|
skip_if_exists: bool,
|
|
639
706
|
):
|
|
707
|
+
from agno.vectordb import VectorDb
|
|
708
|
+
|
|
709
|
+
self.vector_db = cast(VectorDb, self.vector_db)
|
|
640
710
|
log_info(f"Adding content from topics: {content.topics}")
|
|
641
711
|
|
|
642
712
|
if content.topics is None:
|
|
@@ -644,9 +714,7 @@ class Knowledge:
|
|
|
644
714
|
return
|
|
645
715
|
|
|
646
716
|
for topic in content.topics:
|
|
647
|
-
id = str(uuid4())
|
|
648
717
|
content = Content(
|
|
649
|
-
id=id,
|
|
650
718
|
name=topic,
|
|
651
719
|
metadata=content.metadata,
|
|
652
720
|
reader=content.reader,
|
|
@@ -656,30 +724,41 @@ class Knowledge:
|
|
|
656
724
|
),
|
|
657
725
|
topics=[topic],
|
|
658
726
|
)
|
|
727
|
+
content.content_hash = self._build_content_hash(content)
|
|
728
|
+
content.id = generate_id(content.content_hash)
|
|
729
|
+
|
|
730
|
+
await self._add_to_contents_db(content)
|
|
731
|
+
if self._should_skip(content.content_hash, skip_if_exists):
|
|
732
|
+
content.status = ContentStatus.COMPLETED
|
|
733
|
+
await self._aupdate_content(content)
|
|
734
|
+
return
|
|
659
735
|
|
|
660
736
|
if self.vector_db.__class__.__name__ == "LightRag":
|
|
661
737
|
await self._process_lightrag_content(content, KnowledgeContentOrigin.TOPIC)
|
|
662
738
|
return
|
|
663
739
|
|
|
664
|
-
content.content_hash = self._build_content_hash(content)
|
|
665
740
|
if self.vector_db and self.vector_db.content_hash_exists(content.content_hash) and skip_if_exists:
|
|
666
741
|
log_info(f"Content {content.content_hash} already exists, skipping")
|
|
667
742
|
continue
|
|
668
743
|
|
|
669
|
-
self._add_to_contents_db(content)
|
|
744
|
+
await self._add_to_contents_db(content)
|
|
670
745
|
if content.reader is None:
|
|
671
746
|
log_error(f"No reader available for topic: {topic}")
|
|
747
|
+
content.status = ContentStatus.FAILED
|
|
748
|
+
content.status_message = "No reader available for topic"
|
|
749
|
+
await self._aupdate_content(content)
|
|
672
750
|
continue
|
|
751
|
+
|
|
673
752
|
read_documents = content.reader.read(topic)
|
|
674
753
|
if len(read_documents) > 0:
|
|
675
754
|
for read_document in read_documents:
|
|
676
|
-
read_document.content_id = id
|
|
755
|
+
read_document.content_id = content.id
|
|
677
756
|
if read_document.content:
|
|
678
757
|
read_document.size = len(read_document.content.encode("utf-8"))
|
|
679
758
|
else:
|
|
680
759
|
content.status = ContentStatus.FAILED
|
|
681
760
|
content.status_message = "No content found for topic"
|
|
682
|
-
self.
|
|
761
|
+
await self._aupdate_content(content)
|
|
683
762
|
|
|
684
763
|
await self._handle_vector_db_insert(content, read_documents, upsert)
|
|
685
764
|
|
|
@@ -735,11 +814,9 @@ class Knowledge:
|
|
|
735
814
|
|
|
736
815
|
for s3_object in objects_to_read:
|
|
737
816
|
# 2. Setup Content object
|
|
738
|
-
id = str(uuid4())
|
|
739
817
|
content_name = content.name or ""
|
|
740
818
|
content_name += "_" + (s3_object.name or "")
|
|
741
819
|
content_entry = Content(
|
|
742
|
-
id=id,
|
|
743
820
|
name=content_name,
|
|
744
821
|
description=content.description,
|
|
745
822
|
status=ContentStatus.PROCESSING,
|
|
@@ -748,11 +825,13 @@ class Knowledge:
|
|
|
748
825
|
)
|
|
749
826
|
|
|
750
827
|
# 3. Hash content and add it to the contents database
|
|
751
|
-
content_hash = self._build_content_hash(content_entry)
|
|
752
|
-
|
|
753
|
-
|
|
754
|
-
|
|
755
|
-
|
|
828
|
+
content_entry.content_hash = self._build_content_hash(content_entry)
|
|
829
|
+
content_entry.id = generate_id(content_entry.content_hash)
|
|
830
|
+
await self._add_to_contents_db(content_entry)
|
|
831
|
+
if self._should_skip(content_entry.content_hash, skip_if_exists):
|
|
832
|
+
content_entry.status = ContentStatus.COMPLETED
|
|
833
|
+
await self._aupdate_content(content_entry)
|
|
834
|
+
return
|
|
756
835
|
|
|
757
836
|
# 4. Select reader
|
|
758
837
|
reader = content.reader
|
|
@@ -763,6 +842,8 @@ class Knowledge:
|
|
|
763
842
|
reader = self.csv_reader
|
|
764
843
|
elif s3_object.uri.endswith(".docx"):
|
|
765
844
|
reader = self.docx_reader
|
|
845
|
+
elif s3_object.uri.endswith(".pptx"):
|
|
846
|
+
reader = self.pptx_reader
|
|
766
847
|
elif s3_object.uri.endswith(".json"):
|
|
767
848
|
reader = self.json_reader
|
|
768
849
|
elif s3_object.uri.endswith(".markdown"):
|
|
@@ -818,10 +899,8 @@ class Knowledge:
|
|
|
818
899
|
|
|
819
900
|
for gcs_object in objects_to_read:
|
|
820
901
|
# 2. Setup Content object
|
|
821
|
-
id = str(uuid4())
|
|
822
902
|
name = (content.name or "content") + "_" + gcs_object.name
|
|
823
903
|
content_entry = Content(
|
|
824
|
-
id=id,
|
|
825
904
|
name=name,
|
|
826
905
|
description=content.description,
|
|
827
906
|
status=ContentStatus.PROCESSING,
|
|
@@ -830,15 +909,15 @@ class Knowledge:
|
|
|
830
909
|
)
|
|
831
910
|
|
|
832
911
|
# 3. Hash content and add it to the contents database
|
|
833
|
-
content_hash = self._build_content_hash(content_entry)
|
|
834
|
-
|
|
835
|
-
|
|
836
|
-
|
|
837
|
-
|
|
838
|
-
|
|
839
|
-
|
|
912
|
+
content_entry.content_hash = self._build_content_hash(content_entry)
|
|
913
|
+
content_entry.id = generate_id(content_entry.content_hash)
|
|
914
|
+
await self._add_to_contents_db(content_entry)
|
|
915
|
+
if self._should_skip(content_entry.content_hash, skip_if_exists):
|
|
916
|
+
content_entry.status = ContentStatus.COMPLETED
|
|
917
|
+
await self._aupdate_content(content_entry)
|
|
918
|
+
return
|
|
840
919
|
|
|
841
|
-
#
|
|
920
|
+
# 4. Select reader
|
|
842
921
|
reader = content.reader
|
|
843
922
|
if reader is None:
|
|
844
923
|
if gcs_object.name.endswith(".pdf"):
|
|
@@ -847,6 +926,8 @@ class Knowledge:
|
|
|
847
926
|
reader = self.csv_reader
|
|
848
927
|
elif gcs_object.name.endswith(".docx"):
|
|
849
928
|
reader = self.docx_reader
|
|
929
|
+
elif gcs_object.name.endswith(".pptx"):
|
|
930
|
+
reader = self.pptx_reader
|
|
850
931
|
elif gcs_object.name.endswith(".json"):
|
|
851
932
|
reader = self.json_reader
|
|
852
933
|
elif gcs_object.name.endswith(".markdown"):
|
|
@@ -866,37 +947,43 @@ class Knowledge:
|
|
|
866
947
|
read_document.content_id = content.id
|
|
867
948
|
await self._handle_vector_db_insert(content_entry, read_documents, upsert)
|
|
868
949
|
|
|
869
|
-
async def _handle_vector_db_insert(self, content, read_documents, upsert):
|
|
950
|
+
async def _handle_vector_db_insert(self, content: Content, read_documents, upsert):
|
|
951
|
+
from agno.vectordb import VectorDb
|
|
952
|
+
|
|
953
|
+
self.vector_db = cast(VectorDb, self.vector_db)
|
|
954
|
+
|
|
870
955
|
if not self.vector_db:
|
|
871
956
|
log_error("No vector database configured")
|
|
872
957
|
content.status = ContentStatus.FAILED
|
|
873
958
|
content.status_message = "No vector database configured"
|
|
874
|
-
self.
|
|
959
|
+
await self._aupdate_content(content)
|
|
875
960
|
return
|
|
876
961
|
|
|
877
962
|
if self.vector_db.upsert_available() and upsert:
|
|
878
963
|
try:
|
|
879
|
-
await self.vector_db.async_upsert(content.content_hash, read_documents, content.metadata)
|
|
964
|
+
await self.vector_db.async_upsert(content.content_hash, read_documents, content.metadata) # type: ignore[arg-type]
|
|
880
965
|
except Exception as e:
|
|
881
966
|
log_error(f"Error upserting document: {e}")
|
|
882
967
|
content.status = ContentStatus.FAILED
|
|
883
968
|
content.status_message = "Could not upsert embedding"
|
|
884
|
-
self.
|
|
969
|
+
await self._aupdate_content(content)
|
|
885
970
|
return
|
|
886
971
|
else:
|
|
887
972
|
try:
|
|
888
973
|
await self.vector_db.async_insert(
|
|
889
|
-
content.content_hash,
|
|
974
|
+
content.content_hash, # type: ignore[arg-type]
|
|
975
|
+
documents=read_documents,
|
|
976
|
+
filters=content.metadata, # type: ignore[arg-type]
|
|
890
977
|
)
|
|
891
978
|
except Exception as e:
|
|
892
979
|
log_error(f"Error inserting document: {e}")
|
|
893
980
|
content.status = ContentStatus.FAILED
|
|
894
981
|
content.status_message = "Could not insert embedding"
|
|
895
|
-
self.
|
|
982
|
+
await self._aupdate_content(content)
|
|
896
983
|
return
|
|
897
984
|
|
|
898
985
|
content.status = ContentStatus.COMPLETED
|
|
899
|
-
self.
|
|
986
|
+
await self._aupdate_content(content)
|
|
900
987
|
|
|
901
988
|
async def _load_content(
|
|
902
989
|
self,
|
|
@@ -906,11 +993,6 @@ class Knowledge:
|
|
|
906
993
|
include: Optional[List[str]] = None,
|
|
907
994
|
exclude: Optional[List[str]] = None,
|
|
908
995
|
) -> None:
|
|
909
|
-
log_info(f"Loading content: {content.id}")
|
|
910
|
-
|
|
911
|
-
if content.metadata:
|
|
912
|
-
self.add_filters(content.metadata)
|
|
913
|
-
|
|
914
996
|
if content.path:
|
|
915
997
|
await self._load_from_path(content, upsert, skip_if_exists, include, exclude)
|
|
916
998
|
|
|
@@ -954,7 +1036,49 @@ class Knowledge:
|
|
|
954
1036
|
)
|
|
955
1037
|
return hashlib.sha256(fallback.encode()).hexdigest()
|
|
956
1038
|
|
|
957
|
-
def
|
|
1039
|
+
def _ensure_string_field(self, value: Any, field_name: str, default: str = "") -> str:
|
|
1040
|
+
"""
|
|
1041
|
+
Safely ensure a field is a string, handling various edge cases.
|
|
1042
|
+
|
|
1043
|
+
Args:
|
|
1044
|
+
value: The value to convert to string
|
|
1045
|
+
field_name: Name of the field for logging purposes
|
|
1046
|
+
default: Default string value if conversion fails
|
|
1047
|
+
|
|
1048
|
+
Returns:
|
|
1049
|
+
str: A safe string value
|
|
1050
|
+
"""
|
|
1051
|
+
# Handle None/falsy values
|
|
1052
|
+
if value is None or value == "":
|
|
1053
|
+
return default
|
|
1054
|
+
|
|
1055
|
+
# Handle unexpected list types (the root cause of our Pydantic warning)
|
|
1056
|
+
if isinstance(value, list):
|
|
1057
|
+
if len(value) == 0:
|
|
1058
|
+
log_debug(f"Empty list found for {field_name}, using default: '{default}'")
|
|
1059
|
+
return default
|
|
1060
|
+
elif len(value) == 1:
|
|
1061
|
+
# Single item list, extract the item
|
|
1062
|
+
log_debug(f"Single-item list found for {field_name}, extracting: '{value[0]}'")
|
|
1063
|
+
return str(value[0]) if value[0] is not None else default
|
|
1064
|
+
else:
|
|
1065
|
+
# Multiple items, join them
|
|
1066
|
+
log_debug(f"Multi-item list found for {field_name}, joining: {value}")
|
|
1067
|
+
return " | ".join(str(item) for item in value if item is not None)
|
|
1068
|
+
|
|
1069
|
+
# Handle other unexpected types
|
|
1070
|
+
if not isinstance(value, str):
|
|
1071
|
+
log_debug(f"Non-string type {type(value)} found for {field_name}, converting: '{value}'")
|
|
1072
|
+
try:
|
|
1073
|
+
return str(value)
|
|
1074
|
+
except Exception as e:
|
|
1075
|
+
log_warning(f"Failed to convert {field_name} to string: {e}, using default")
|
|
1076
|
+
return default
|
|
1077
|
+
|
|
1078
|
+
# Already a string, return as-is
|
|
1079
|
+
return value
|
|
1080
|
+
|
|
1081
|
+
async def _add_to_contents_db(self, content: Content):
|
|
958
1082
|
if self.contents_db:
|
|
959
1083
|
created_at = content.created_at if content.created_at else int(time.time())
|
|
960
1084
|
updated_at = content.updated_at if content.updated_at else int(time.time())
|
|
@@ -966,10 +1090,18 @@ class Knowledge:
|
|
|
966
1090
|
if content.file_data and content.file_data.type
|
|
967
1091
|
else None
|
|
968
1092
|
)
|
|
1093
|
+
# Safely handle string fields with proper type checking
|
|
1094
|
+
safe_name = self._ensure_string_field(content.name, "content.name", default="")
|
|
1095
|
+
safe_description = self._ensure_string_field(content.description, "content.description", default="")
|
|
1096
|
+
safe_linked_to = self._ensure_string_field(self.name, "knowledge.name", default="")
|
|
1097
|
+
safe_status_message = self._ensure_string_field(
|
|
1098
|
+
content.status_message, "content.status_message", default=""
|
|
1099
|
+
)
|
|
1100
|
+
|
|
969
1101
|
content_row = KnowledgeRow(
|
|
970
1102
|
id=content.id,
|
|
971
|
-
name=
|
|
972
|
-
description=
|
|
1103
|
+
name=safe_name,
|
|
1104
|
+
description=safe_description,
|
|
973
1105
|
metadata=content.metadata,
|
|
974
1106
|
type=file_type,
|
|
975
1107
|
size=content.size
|
|
@@ -977,17 +1109,28 @@ class Knowledge:
|
|
|
977
1109
|
else len(content.file_data.content)
|
|
978
1110
|
if content.file_data and content.file_data.content
|
|
979
1111
|
else None,
|
|
980
|
-
linked_to=
|
|
1112
|
+
linked_to=safe_linked_to,
|
|
981
1113
|
access_count=0,
|
|
982
1114
|
status=content.status if content.status else ContentStatus.PROCESSING,
|
|
983
|
-
status_message=
|
|
1115
|
+
status_message=safe_status_message,
|
|
984
1116
|
created_at=created_at,
|
|
985
1117
|
updated_at=updated_at,
|
|
986
1118
|
)
|
|
987
|
-
self.contents_db
|
|
1119
|
+
if isinstance(self.contents_db, AsyncBaseDb):
|
|
1120
|
+
await self.contents_db.upsert_knowledge_content(knowledge_row=content_row)
|
|
1121
|
+
else:
|
|
1122
|
+
self.contents_db.upsert_knowledge_content(knowledge_row=content_row)
|
|
988
1123
|
|
|
989
1124
|
def _update_content(self, content: Content) -> Optional[Dict[str, Any]]:
|
|
1125
|
+
from agno.vectordb import VectorDb
|
|
1126
|
+
|
|
1127
|
+
self.vector_db = cast(VectorDb, self.vector_db)
|
|
990
1128
|
if self.contents_db:
|
|
1129
|
+
if isinstance(self.contents_db, AsyncBaseDb):
|
|
1130
|
+
raise ValueError(
|
|
1131
|
+
"update_content() is not supported with an async DB. Please use aupdate_content() instead."
|
|
1132
|
+
)
|
|
1133
|
+
|
|
991
1134
|
if not content.id:
|
|
992
1135
|
log_warning("Content id is required to update Knowledge content")
|
|
993
1136
|
return None
|
|
@@ -998,6 +1141,55 @@ class Knowledge:
|
|
|
998
1141
|
log_warning(f"Content row not found for id: {content.id}, cannot update status")
|
|
999
1142
|
return None
|
|
1000
1143
|
|
|
1144
|
+
# Apply safe string handling for updates as well
|
|
1145
|
+
if content.name is not None:
|
|
1146
|
+
content_row.name = self._ensure_string_field(content.name, "content.name", default="")
|
|
1147
|
+
if content.description is not None:
|
|
1148
|
+
content_row.description = self._ensure_string_field(
|
|
1149
|
+
content.description, "content.description", default=""
|
|
1150
|
+
)
|
|
1151
|
+
if content.metadata is not None:
|
|
1152
|
+
content_row.metadata = content.metadata
|
|
1153
|
+
if content.status is not None:
|
|
1154
|
+
content_row.status = content.status
|
|
1155
|
+
if content.status_message is not None:
|
|
1156
|
+
content_row.status_message = self._ensure_string_field(
|
|
1157
|
+
content.status_message, "content.status_message", default=""
|
|
1158
|
+
)
|
|
1159
|
+
if content.external_id is not None:
|
|
1160
|
+
content_row.external_id = self._ensure_string_field(
|
|
1161
|
+
content.external_id, "content.external_id", default=""
|
|
1162
|
+
)
|
|
1163
|
+
content_row.updated_at = int(time.time())
|
|
1164
|
+
self.contents_db.upsert_knowledge_content(knowledge_row=content_row)
|
|
1165
|
+
|
|
1166
|
+
if self.vector_db and content.metadata:
|
|
1167
|
+
self.vector_db.update_metadata(content_id=content.id, metadata=content.metadata)
|
|
1168
|
+
|
|
1169
|
+
return content_row.to_dict()
|
|
1170
|
+
|
|
1171
|
+
else:
|
|
1172
|
+
if self.name:
|
|
1173
|
+
log_warning(f"Contents DB not found for knowledge base: {self.name}")
|
|
1174
|
+
else:
|
|
1175
|
+
log_warning("Contents DB not found for knowledge base")
|
|
1176
|
+
return None
|
|
1177
|
+
|
|
1178
|
+
async def _aupdate_content(self, content: Content) -> Optional[Dict[str, Any]]:
|
|
1179
|
+
if self.contents_db:
|
|
1180
|
+
if not content.id:
|
|
1181
|
+
log_warning("Content id is required to update Knowledge content")
|
|
1182
|
+
return None
|
|
1183
|
+
|
|
1184
|
+
# TODO: we shouldn't check for content here, we should trust the upsert method to handle conflicts
|
|
1185
|
+
if isinstance(self.contents_db, AsyncBaseDb):
|
|
1186
|
+
content_row = await self.contents_db.get_knowledge_content(content.id)
|
|
1187
|
+
else:
|
|
1188
|
+
content_row = self.contents_db.get_knowledge_content(content.id)
|
|
1189
|
+
if content_row is None:
|
|
1190
|
+
log_warning(f"Content row not found for id: {content.id}, cannot update status")
|
|
1191
|
+
return None
|
|
1192
|
+
|
|
1001
1193
|
if content.name is not None:
|
|
1002
1194
|
content_row.name = content.name
|
|
1003
1195
|
if content.description is not None:
|
|
@@ -1012,22 +1204,29 @@ class Knowledge:
|
|
|
1012
1204
|
content_row.external_id = content.external_id
|
|
1013
1205
|
|
|
1014
1206
|
content_row.updated_at = int(time.time())
|
|
1015
|
-
self.contents_db
|
|
1207
|
+
if isinstance(self.contents_db, AsyncBaseDb):
|
|
1208
|
+
await self.contents_db.upsert_knowledge_content(knowledge_row=content_row)
|
|
1209
|
+
else:
|
|
1210
|
+
self.contents_db.upsert_knowledge_content(knowledge_row=content_row)
|
|
1016
1211
|
|
|
1017
1212
|
if self.vector_db and content.metadata:
|
|
1018
1213
|
self.vector_db.update_metadata(content_id=content.id, metadata=content.metadata)
|
|
1019
1214
|
|
|
1020
|
-
if content.metadata:
|
|
1021
|
-
self.add_filters(content.metadata)
|
|
1022
|
-
|
|
1023
1215
|
return content_row.to_dict()
|
|
1024
1216
|
|
|
1025
1217
|
else:
|
|
1026
|
-
|
|
1218
|
+
if self.name:
|
|
1219
|
+
log_warning(f"Contents DB not found for knowledge base: {self.name}")
|
|
1220
|
+
else:
|
|
1221
|
+
log_warning("Contents DB not found for knowledge base")
|
|
1027
1222
|
return None
|
|
1028
1223
|
|
|
1029
1224
|
async def _process_lightrag_content(self, content: Content, content_type: KnowledgeContentOrigin) -> None:
|
|
1030
|
-
|
|
1225
|
+
from agno.vectordb import VectorDb
|
|
1226
|
+
|
|
1227
|
+
self.vector_db = cast(VectorDb, self.vector_db)
|
|
1228
|
+
|
|
1229
|
+
await self._add_to_contents_db(content)
|
|
1031
1230
|
if content_type == KnowledgeContentOrigin.PATH:
|
|
1032
1231
|
if content.file_data is None:
|
|
1033
1232
|
log_warning("No file data provided")
|
|
@@ -1058,18 +1257,18 @@ class Knowledge:
|
|
|
1058
1257
|
else:
|
|
1059
1258
|
log_error("Vector database does not support file insertion")
|
|
1060
1259
|
content.status = ContentStatus.FAILED
|
|
1061
|
-
self.
|
|
1260
|
+
await self._aupdate_content(content)
|
|
1062
1261
|
return
|
|
1063
1262
|
content.external_id = result
|
|
1064
1263
|
content.status = ContentStatus.COMPLETED
|
|
1065
|
-
self.
|
|
1264
|
+
await self._aupdate_content(content)
|
|
1066
1265
|
return
|
|
1067
1266
|
|
|
1068
1267
|
except Exception as e:
|
|
1069
1268
|
log_error(f"Error uploading file to LightRAG: {e}")
|
|
1070
1269
|
content.status = ContentStatus.FAILED
|
|
1071
1270
|
content.status_message = f"Could not upload to LightRAG: {str(e)}"
|
|
1072
|
-
self.
|
|
1271
|
+
await self._aupdate_content(content)
|
|
1073
1272
|
return
|
|
1074
1273
|
|
|
1075
1274
|
elif content_type == KnowledgeContentOrigin.URL:
|
|
@@ -1079,7 +1278,7 @@ class Knowledge:
|
|
|
1079
1278
|
if reader is None:
|
|
1080
1279
|
log_error("No URL reader available")
|
|
1081
1280
|
content.status = ContentStatus.FAILED
|
|
1082
|
-
self.
|
|
1281
|
+
await self._aupdate_content(content)
|
|
1083
1282
|
return
|
|
1084
1283
|
|
|
1085
1284
|
reader.chunk = False
|
|
@@ -1091,7 +1290,7 @@ class Knowledge:
|
|
|
1091
1290
|
if not read_documents:
|
|
1092
1291
|
log_error("No documents read from URL")
|
|
1093
1292
|
content.status = ContentStatus.FAILED
|
|
1094
|
-
self.
|
|
1293
|
+
await self._aupdate_content(content)
|
|
1095
1294
|
return
|
|
1096
1295
|
|
|
1097
1296
|
if self.vector_db and hasattr(self.vector_db, "insert_text"):
|
|
@@ -1102,19 +1301,19 @@ class Knowledge:
|
|
|
1102
1301
|
else:
|
|
1103
1302
|
log_error("Vector database does not support text insertion")
|
|
1104
1303
|
content.status = ContentStatus.FAILED
|
|
1105
|
-
self.
|
|
1304
|
+
await self._aupdate_content(content)
|
|
1106
1305
|
return
|
|
1107
1306
|
|
|
1108
1307
|
content.external_id = result
|
|
1109
1308
|
content.status = ContentStatus.COMPLETED
|
|
1110
|
-
self.
|
|
1309
|
+
await self._aupdate_content(content)
|
|
1111
1310
|
return
|
|
1112
1311
|
|
|
1113
1312
|
except Exception as e:
|
|
1114
1313
|
log_error(f"Error uploading file to LightRAG: {e}")
|
|
1115
1314
|
content.status = ContentStatus.FAILED
|
|
1116
1315
|
content.status_message = f"Could not upload to LightRAG: {str(e)}"
|
|
1117
|
-
self.
|
|
1316
|
+
await self._aupdate_content(content)
|
|
1118
1317
|
return
|
|
1119
1318
|
|
|
1120
1319
|
elif content_type == KnowledgeContentOrigin.CONTENT:
|
|
@@ -1135,11 +1334,11 @@ class Knowledge:
|
|
|
1135
1334
|
else:
|
|
1136
1335
|
log_error("Vector database does not support file insertion")
|
|
1137
1336
|
content.status = ContentStatus.FAILED
|
|
1138
|
-
self.
|
|
1337
|
+
await self._aupdate_content(content)
|
|
1139
1338
|
return
|
|
1140
1339
|
content.external_id = result
|
|
1141
1340
|
content.status = ContentStatus.COMPLETED
|
|
1142
|
-
self.
|
|
1341
|
+
await self._aupdate_content(content)
|
|
1143
1342
|
else:
|
|
1144
1343
|
log_warning(f"No file data available for LightRAG upload: {content.name}")
|
|
1145
1344
|
return
|
|
@@ -1150,20 +1349,17 @@ class Knowledge:
|
|
|
1150
1349
|
if content.reader is None:
|
|
1151
1350
|
log_error("No reader available for topic content")
|
|
1152
1351
|
content.status = ContentStatus.FAILED
|
|
1153
|
-
self.
|
|
1352
|
+
await self._aupdate_content(content)
|
|
1154
1353
|
return
|
|
1155
1354
|
|
|
1156
1355
|
if not content.topics:
|
|
1157
1356
|
log_error("No topics available for content")
|
|
1158
1357
|
content.status = ContentStatus.FAILED
|
|
1159
|
-
self.
|
|
1358
|
+
await self._aupdate_content(content)
|
|
1160
1359
|
return
|
|
1161
1360
|
|
|
1162
1361
|
read_documents = content.reader.read(content.topics)
|
|
1163
1362
|
if len(read_documents) > 0:
|
|
1164
|
-
print("READ DOCUMENTS: ", len(read_documents))
|
|
1165
|
-
print("READ DOCUMENTS: ", read_documents[0])
|
|
1166
|
-
|
|
1167
1363
|
if self.vector_db and hasattr(self.vector_db, "insert_text"):
|
|
1168
1364
|
result = await self.vector_db.insert_text(
|
|
1169
1365
|
file_source=content.topics[0],
|
|
@@ -1172,21 +1368,35 @@ class Knowledge:
|
|
|
1172
1368
|
else:
|
|
1173
1369
|
log_error("Vector database does not support text insertion")
|
|
1174
1370
|
content.status = ContentStatus.FAILED
|
|
1175
|
-
self.
|
|
1371
|
+
await self._aupdate_content(content)
|
|
1176
1372
|
return
|
|
1177
1373
|
content.external_id = result
|
|
1178
1374
|
content.status = ContentStatus.COMPLETED
|
|
1179
|
-
self.
|
|
1375
|
+
await self._aupdate_content(content)
|
|
1180
1376
|
return
|
|
1181
1377
|
else:
|
|
1182
1378
|
log_warning(f"No documents found for LightRAG upload: {content.name}")
|
|
1183
1379
|
return
|
|
1184
1380
|
|
|
1185
1381
|
def search(
|
|
1186
|
-
self,
|
|
1382
|
+
self,
|
|
1383
|
+
query: str,
|
|
1384
|
+
max_results: Optional[int] = None,
|
|
1385
|
+
filters: Optional[Union[Dict[str, Any], List[FilterExpr]]] = None,
|
|
1386
|
+
search_type: Optional[str] = None,
|
|
1187
1387
|
) -> List[Document]:
|
|
1188
1388
|
"""Returns relevant documents matching a query"""
|
|
1389
|
+
from agno.vectordb import VectorDb
|
|
1390
|
+
from agno.vectordb.search import SearchType
|
|
1391
|
+
|
|
1392
|
+
self.vector_db = cast(VectorDb, self.vector_db)
|
|
1189
1393
|
|
|
1394
|
+
if (
|
|
1395
|
+
hasattr(self.vector_db, "search_type")
|
|
1396
|
+
and isinstance(self.vector_db.search_type, SearchType)
|
|
1397
|
+
and search_type
|
|
1398
|
+
):
|
|
1399
|
+
self.vector_db.search_type = SearchType(search_type)
|
|
1190
1400
|
try:
|
|
1191
1401
|
if self.vector_db is None:
|
|
1192
1402
|
log_warning("No vector db provided")
|
|
@@ -1200,10 +1410,23 @@ class Knowledge:
|
|
|
1200
1410
|
return []
|
|
1201
1411
|
|
|
1202
1412
|
async def async_search(
|
|
1203
|
-
self,
|
|
1413
|
+
self,
|
|
1414
|
+
query: str,
|
|
1415
|
+
max_results: Optional[int] = None,
|
|
1416
|
+
filters: Optional[Union[Dict[str, Any], List[FilterExpr]]] = None,
|
|
1417
|
+
search_type: Optional[str] = None,
|
|
1204
1418
|
) -> List[Document]:
|
|
1205
1419
|
"""Returns relevant documents matching a query"""
|
|
1206
|
-
|
|
1420
|
+
from agno.vectordb import VectorDb
|
|
1421
|
+
from agno.vectordb.search import SearchType
|
|
1422
|
+
|
|
1423
|
+
self.vector_db = cast(VectorDb, self.vector_db)
|
|
1424
|
+
if (
|
|
1425
|
+
hasattr(self.vector_db, "search_type")
|
|
1426
|
+
and isinstance(self.vector_db.search_type, SearchType)
|
|
1427
|
+
and search_type
|
|
1428
|
+
):
|
|
1429
|
+
self.vector_db.search_type = SearchType(search_type)
|
|
1207
1430
|
try:
|
|
1208
1431
|
if self.vector_db is None:
|
|
1209
1432
|
log_warning("No vector db provided")
|
|
@@ -1220,66 +1443,114 @@ class Knowledge:
|
|
|
1220
1443
|
log_error(f"Error searching for documents: {e}")
|
|
1221
1444
|
return []
|
|
1222
1445
|
|
|
1223
|
-
def
|
|
1224
|
-
if self.
|
|
1225
|
-
|
|
1226
|
-
|
|
1446
|
+
def get_valid_filters(self) -> Set[str]:
|
|
1447
|
+
if self.contents_db is None:
|
|
1448
|
+
log_warning("No contents db provided. This is required for filtering.")
|
|
1449
|
+
return set()
|
|
1450
|
+
contents, _ = self.get_content()
|
|
1451
|
+
valid_filters: Set[str] = set()
|
|
1452
|
+
for content in contents:
|
|
1453
|
+
if content.metadata:
|
|
1454
|
+
valid_filters.update(content.metadata.keys())
|
|
1455
|
+
|
|
1456
|
+
return valid_filters
|
|
1457
|
+
|
|
1458
|
+
async def async_get_valid_filters(self) -> Set[str]:
|
|
1459
|
+
if self.contents_db is None:
|
|
1460
|
+
log_warning("No contents db provided. This is required for filtering.")
|
|
1461
|
+
return set()
|
|
1462
|
+
contents, _ = await self.aget_content()
|
|
1463
|
+
valid_filters: Set[str] = set()
|
|
1464
|
+
for content in contents:
|
|
1465
|
+
if content.metadata:
|
|
1466
|
+
valid_filters.update(content.metadata.keys())
|
|
1467
|
+
|
|
1468
|
+
return valid_filters
|
|
1227
1469
|
|
|
1470
|
+
def _validate_filters(
|
|
1471
|
+
self, filters: Union[Dict[str, Any], List[FilterExpr]], valid_metadata_filters: Set[str]
|
|
1472
|
+
) -> Tuple[Union[Dict[str, Any], List[FilterExpr]], List[str]]:
|
|
1228
1473
|
if not filters:
|
|
1229
1474
|
return {}, []
|
|
1230
1475
|
|
|
1231
|
-
valid_filters: Dict[str, Any] = {}
|
|
1476
|
+
valid_filters: Union[Dict[str, Any], List[FilterExpr]] = {}
|
|
1232
1477
|
invalid_keys = []
|
|
1233
1478
|
|
|
1234
|
-
|
|
1235
|
-
|
|
1236
|
-
|
|
1237
|
-
|
|
1238
|
-
|
|
1239
|
-
|
|
1240
|
-
|
|
1241
|
-
|
|
1242
|
-
|
|
1243
|
-
|
|
1244
|
-
|
|
1245
|
-
|
|
1246
|
-
|
|
1247
|
-
|
|
1479
|
+
if isinstance(filters, dict):
|
|
1480
|
+
# If no metadata filters tracked yet, all keys are considered invalid
|
|
1481
|
+
if valid_metadata_filters is None or not valid_metadata_filters:
|
|
1482
|
+
invalid_keys = list(filters.keys())
|
|
1483
|
+
log_warning(
|
|
1484
|
+
f"No valid metadata filters tracked yet. All filter keys considered invalid: {invalid_keys}"
|
|
1485
|
+
)
|
|
1486
|
+
return {}, invalid_keys
|
|
1487
|
+
|
|
1488
|
+
for key, value in filters.items():
|
|
1489
|
+
# Handle both normal keys and prefixed keys like meta_data.key
|
|
1490
|
+
base_key = key.split(".")[-1] if "." in key else key
|
|
1491
|
+
if base_key in valid_metadata_filters or key in valid_metadata_filters:
|
|
1492
|
+
valid_filters[key] = value # type: ignore
|
|
1493
|
+
else:
|
|
1494
|
+
invalid_keys.append(key)
|
|
1495
|
+
log_warning(f"Invalid filter key: {key} - not present in knowledge base")
|
|
1496
|
+
|
|
1497
|
+
elif isinstance(filters, List):
|
|
1498
|
+
# Validate that list contains FilterExpr instances
|
|
1499
|
+
for i, filter_item in enumerate(filters):
|
|
1500
|
+
if not isinstance(filter_item, FilterExpr):
|
|
1501
|
+
log_warning(
|
|
1502
|
+
f"Invalid filter at index {i}: expected FilterExpr instance, "
|
|
1503
|
+
f"got {type(filter_item).__name__}. "
|
|
1504
|
+
f"Use filter expressions like EQ('key', 'value'), IN('key', [values]), "
|
|
1505
|
+
f"AND(...), OR(...), NOT(...) from agno.filters"
|
|
1506
|
+
)
|
|
1507
|
+
# Filter expressions are already validated, return empty dict/list
|
|
1508
|
+
# The actual filtering happens in the vector_db layer
|
|
1509
|
+
return filters, []
|
|
1248
1510
|
|
|
1249
1511
|
return valid_filters, invalid_keys
|
|
1250
1512
|
|
|
1251
|
-
def
|
|
1252
|
-
|
|
1253
|
-
|
|
1513
|
+
def validate_filters(
    self, filters: Union[Dict[str, Any], List[FilterExpr]]
) -> Tuple[Union[Dict[str, Any], List[FilterExpr]], List[str]]:
    """Return a tuple containing a dict with all valid filters and a list of invalid filter keys."""
    known_filters = self.get_valid_filters()
    return self._validate_filters(filters, known_filters)
|
|
1521
|
+
|
|
1522
|
+
async def async_validate_filters(
    self, filters: Union[Dict[str, Any], List[FilterExpr]]
) -> Tuple[Union[Dict[str, Any], List[FilterExpr]], List[str]]:
    """Return a tuple containing a dict with all valid filters and a list of invalid filter keys."""
    known_filters = await self.async_get_valid_filters()
    return self._validate_filters(filters, known_filters)
|
|
1269
1531
|
|
|
1270
1532
|
def remove_vector_by_id(self, id: str) -> bool:
    """Delete a single vector by id; returns False when no vector db is configured."""
    from agno.vectordb import VectorDb

    self.vector_db = cast(VectorDb, self.vector_db)
    if self.vector_db is not None:
        return self.vector_db.delete_by_id(id)
    log_warning("No vector DB provided")
    return False
|
|
1275
1540
|
|
|
1276
1541
|
def remove_vectors_by_name(self, name: str) -> bool:
    """Delete all vectors matching a name; returns False when no vector db is configured."""
    from agno.vectordb import VectorDb

    self.vector_db = cast(VectorDb, self.vector_db)
    if self.vector_db is not None:
        return self.vector_db.delete_by_name(name)
    log_warning("No vector DB provided")
    return False
|
|
1281
1549
|
|
|
1282
1550
|
def remove_vectors_by_metadata(self, metadata: Dict[str, Any]) -> bool:
|
|
1551
|
+
from agno.vectordb import VectorDb
|
|
1552
|
+
|
|
1553
|
+
self.vector_db = cast(VectorDb, self.vector_db)
|
|
1283
1554
|
if self.vector_db is None:
|
|
1284
1555
|
log_warning("No vector DB provided")
|
|
1285
1556
|
return False
|
|
@@ -1290,10 +1561,46 @@ class Knowledge:
|
|
|
1290
1561
|
def patch_content(self, content: Content) -> Optional[Dict[str, Any]]:
    """Apply a partial update to an existing content entry and return the update result."""
    updated = self._update_content(content)
    return updated
|
|
1292
1563
|
|
|
1564
|
+
async def apatch_content(self, content: Content) -> Optional[Dict[str, Any]]:
    """Async variant of patch_content: apply a partial update to an existing content entry."""
    updated = await self._aupdate_content(content)
    return updated
|
|
1566
|
+
|
|
1293
1567
|
def get_content_by_id(self, content_id: str) -> Optional[Content]:
    """Fetch one content row from the (sync) contents db and map it to a Content object.

    Raises ValueError when no contents db is configured or when the configured
    contents db is async (use aget_content_by_id in that case).
    """
    if self.contents_db is None:
        raise ValueError("No contents db provided")

    if isinstance(self.contents_db, AsyncBaseDb):
        raise ValueError(
            "get_content_by_id() is not supported for async databases. Please use aget_content_by_id() instead."
        )

    row = self.contents_db.get_knowledge_content(content_id)
    if row is None:
        return None

    # Map the raw database row onto the public Content model
    return Content(
        id=row.id,
        name=row.name,
        description=row.description,
        metadata=row.metadata,
        file_type=row.type,
        size=row.size,
        status=ContentStatus(row.status) if row.status else None,
        status_message=row.status_message,
        created_at=row.created_at,
        updated_at=row.updated_at if row.updated_at else row.created_at,
        external_id=row.external_id,
    )
|
|
1594
|
+
|
|
1595
|
+
async def aget_content_by_id(self, content_id: str) -> Optional[Content]:
|
|
1596
|
+
if self.contents_db is None:
|
|
1597
|
+
raise ValueError("No contents db provided")
|
|
1598
|
+
|
|
1599
|
+
if isinstance(self.contents_db, AsyncBaseDb):
|
|
1600
|
+
content_row = await self.contents_db.get_knowledge_content(content_id)
|
|
1601
|
+
else:
|
|
1602
|
+
content_row = self.contents_db.get_knowledge_content(content_id)
|
|
1603
|
+
|
|
1297
1604
|
if content_row is None:
|
|
1298
1605
|
return None
|
|
1299
1606
|
content = Content(
|
|
@@ -1320,6 +1627,10 @@ class Knowledge:
|
|
|
1320
1627
|
) -> Tuple[List[Content], int]:
|
|
1321
1628
|
if self.contents_db is None:
|
|
1322
1629
|
raise ValueError("No contents db provided")
|
|
1630
|
+
|
|
1631
|
+
if isinstance(self.contents_db, AsyncBaseDb):
|
|
1632
|
+
raise ValueError("get_content() is not supported for async databases. Please use aget_content() instead.")
|
|
1633
|
+
|
|
1323
1634
|
contents, count = self.contents_db.get_knowledge_contents(
|
|
1324
1635
|
limit=limit, page=page, sort_by=sort_by, sort_order=sort_order
|
|
1325
1636
|
)
|
|
@@ -1343,9 +1654,53 @@ class Knowledge:
|
|
|
1343
1654
|
result.append(content)
|
|
1344
1655
|
return result, count
|
|
1345
1656
|
|
|
1657
|
+
async def aget_content(
    self,
    limit: Optional[int] = None,
    page: Optional[int] = None,
    sort_by: Optional[str] = None,
    sort_order: Optional[str] = None,
) -> Tuple[List[Content], int]:
    """Async listing of content entries, supporting both sync and async contents dbs.

    Returns the mapped Content objects for the requested page together with the
    total row count reported by the db.
    """
    if self.contents_db is None:
        raise ValueError("No contents db provided")

    # Async dbs must be awaited; sync dbs are called directly
    if isinstance(self.contents_db, AsyncBaseDb):
        rows, count = await self.contents_db.get_knowledge_contents(
            limit=limit, page=page, sort_by=sort_by, sort_order=sort_order
        )
    else:
        rows, count = self.contents_db.get_knowledge_contents(
            limit=limit, page=page, sort_by=sort_by, sort_order=sort_order
        )

    # Map each database row onto the public Content model
    result = [
        Content(
            id=row.id,
            name=row.name,
            description=row.description,
            metadata=row.metadata,
            size=row.size,
            file_type=row.type,
            status=ContentStatus(row.status) if row.status else None,
            status_message=row.status_message,
            created_at=row.created_at,
            updated_at=row.updated_at if row.updated_at else row.created_at,
            external_id=row.external_id,
        )
        for row in rows
    ]
    return result, count
|
|
1694
|
+
|
|
1346
1695
|
def get_content_status(self, content_id: str) -> Tuple[Optional[ContentStatus], Optional[str]]:
|
|
1347
1696
|
if self.contents_db is None:
|
|
1348
1697
|
raise ValueError("No contents db provided")
|
|
1698
|
+
|
|
1699
|
+
if isinstance(self.contents_db, AsyncBaseDb):
|
|
1700
|
+
raise ValueError(
|
|
1701
|
+
"get_content_status() is not supported for async databases. Please use aget_content_status() instead."
|
|
1702
|
+
)
|
|
1703
|
+
|
|
1349
1704
|
content_row = self.contents_db.get_knowledge_content(content_id)
|
|
1350
1705
|
if content_row is None:
|
|
1351
1706
|
return None, "Content not found"
|
|
@@ -1365,7 +1720,37 @@ class Knowledge:
|
|
|
1365
1720
|
|
|
1366
1721
|
return status, content_row.status_message
|
|
1367
1722
|
|
|
1723
|
+
async def aget_content_status(self, content_id: str) -> Tuple[Optional[ContentStatus], Optional[str]]:
    """Resolve the processing status (and status message) of a content entry.

    Supports both sync and async contents dbs. Unknown or legacy status strings
    are mapped on a best-effort basis, defaulting to PROCESSING.
    """
    if self.contents_db is None:
        raise ValueError("No contents db provided")

    if isinstance(self.contents_db, AsyncBaseDb):
        row = await self.contents_db.get_knowledge_content(content_id)
    else:
        row = self.contents_db.get_knowledge_content(content_id)

    if row is None:
        return None, "Content not found"

    raw_status = row.status
    if not raw_status:
        # No status recorded yet: treat as still processing
        return ContentStatus.PROCESSING, row.status_message

    try:
        status = ContentStatus(raw_status.lower())
    except ValueError:
        # Legacy or unknown status strings: best-effort keyword mapping
        lowered = raw_status.lower()
        if "failed" in lowered:
            status = ContentStatus.FAILED
        elif "completed" in lowered:
            status = ContentStatus.COMPLETED
        else:
            status = ContentStatus.PROCESSING

    return status, row.status_message
|
|
1749
|
+
|
|
1368
1750
|
def remove_content_by_id(self, content_id: str):
|
|
1751
|
+
from agno.vectordb import VectorDb
|
|
1752
|
+
|
|
1753
|
+
self.vector_db = cast(VectorDb, self.vector_db)
|
|
1369
1754
|
if self.vector_db is not None:
|
|
1370
1755
|
if self.vector_db.__class__.__name__ == "LightRag":
|
|
1371
1756
|
# For LightRAG, get the content first to find the external_id
|
|
@@ -1380,12 +1765,36 @@ class Knowledge:
|
|
|
1380
1765
|
if self.contents_db is not None:
|
|
1381
1766
|
self.contents_db.delete_knowledge_content(content_id)
|
|
1382
1767
|
|
|
1768
|
+
async def aremove_content_by_id(self, content_id: str):
    """Delete a content entry from both the vector db and the contents db (async)."""
    if self.vector_db is not None:
        is_light_rag = self.vector_db.__class__.__name__ == "LightRag"
        if is_light_rag:
            # LightRAG deletes by external_id, so look the content up first
            content = await self.aget_content_by_id(content_id)
            if content and content.external_id:
                self.vector_db.delete_by_external_id(content.external_id)  # type: ignore
            else:
                log_warning(f"No external_id found for content {content_id}, cannot delete from LightRAG")
        else:
            self.vector_db.delete_by_content_id(content_id)

    if self.contents_db is not None:
        # Async contents dbs must be awaited; sync ones are called directly
        if isinstance(self.contents_db, AsyncBaseDb):
            await self.contents_db.delete_knowledge_content(content_id)
        else:
            self.contents_db.delete_knowledge_content(content_id)
|
|
1785
|
+
|
|
1383
1786
|
def remove_all_content(self):
    """Remove every content entry currently tracked by this knowledge base."""
    contents, _count = self.get_content()
    for item in contents:
        if item.id is not None:
            self.remove_content_by_id(item.id)
|
|
1388
1791
|
|
|
1792
|
+
async def aremove_all_content(self):
    """Async counterpart of remove_all_content: delete every tracked content entry."""
    contents, _count = await self.aget_content()
    for item in contents:
        if item.id is not None:
            await self.aremove_content_by_id(item.id)
|
|
1797
|
+
|
|
1389
1798
|
# --- Reader Factory Integration ---
|
|
1390
1799
|
|
|
1391
1800
|
def construct_readers(self):
|
|
@@ -1423,12 +1832,6 @@ class Knowledge:
|
|
|
1423
1832
|
log_info(f"Selecting reader for extension: {extension}")
|
|
1424
1833
|
return ReaderFactory.get_reader_for_extension(extension)
|
|
1425
1834
|
|
|
1426
|
-
def get_filters(self) -> List[str]:
|
|
1427
|
-
return [
|
|
1428
|
-
"filter_tag_1",
|
|
1429
|
-
"filter_tag2",
|
|
1430
|
-
]
|
|
1431
|
-
|
|
1432
1835
|
# --- Convenience Properties for Backward Compatibility ---
|
|
1433
1836
|
|
|
1434
1837
|
def _is_text_mime_type(self, mime_type: str) -> bool:
|
|
@@ -1520,6 +1923,11 @@ class Knowledge:
|
|
|
1520
1923
|
"""Docx reader - lazy loaded via factory."""
|
|
1521
1924
|
return self._get_reader("docx")
|
|
1522
1925
|
|
|
1926
|
+
@property
def pptx_reader(self) -> Optional[Reader]:
    """Reader for PowerPoint (.pptx) files, lazily resolved through the reader factory."""
    reader = self._get_reader("pptx")
    return reader
|
|
1930
|
+
|
|
1523
1931
|
@property
|
|
1524
1932
|
def json_reader(self) -> Optional[Reader]:
|
|
1525
1933
|
"""JSON reader - lazy loaded via factory."""
|