agno 2.1.2__py3-none-any.whl → 2.3.13__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- agno/agent/agent.py +5540 -2273
- agno/api/api.py +2 -0
- agno/api/os.py +1 -1
- agno/compression/__init__.py +3 -0
- agno/compression/manager.py +247 -0
- agno/culture/__init__.py +3 -0
- agno/culture/manager.py +956 -0
- agno/db/async_postgres/__init__.py +3 -0
- agno/db/base.py +689 -6
- agno/db/dynamo/dynamo.py +933 -37
- agno/db/dynamo/schemas.py +174 -10
- agno/db/dynamo/utils.py +63 -4
- agno/db/firestore/firestore.py +831 -9
- agno/db/firestore/schemas.py +51 -0
- agno/db/firestore/utils.py +102 -4
- agno/db/gcs_json/gcs_json_db.py +660 -12
- agno/db/gcs_json/utils.py +60 -26
- agno/db/in_memory/in_memory_db.py +287 -14
- agno/db/in_memory/utils.py +60 -2
- agno/db/json/json_db.py +590 -14
- agno/db/json/utils.py +60 -26
- agno/db/migrations/manager.py +199 -0
- agno/db/migrations/v1_to_v2.py +43 -13
- agno/db/migrations/versions/__init__.py +0 -0
- agno/db/migrations/versions/v2_3_0.py +938 -0
- agno/db/mongo/__init__.py +15 -1
- agno/db/mongo/async_mongo.py +2760 -0
- agno/db/mongo/mongo.py +879 -11
- agno/db/mongo/schemas.py +42 -0
- agno/db/mongo/utils.py +80 -8
- agno/db/mysql/__init__.py +2 -1
- agno/db/mysql/async_mysql.py +2912 -0
- agno/db/mysql/mysql.py +946 -68
- agno/db/mysql/schemas.py +72 -10
- agno/db/mysql/utils.py +198 -7
- agno/db/postgres/__init__.py +2 -1
- agno/db/postgres/async_postgres.py +2579 -0
- agno/db/postgres/postgres.py +942 -57
- agno/db/postgres/schemas.py +81 -18
- agno/db/postgres/utils.py +164 -2
- agno/db/redis/redis.py +671 -7
- agno/db/redis/schemas.py +50 -0
- agno/db/redis/utils.py +65 -7
- agno/db/schemas/__init__.py +2 -1
- agno/db/schemas/culture.py +120 -0
- agno/db/schemas/evals.py +1 -0
- agno/db/schemas/memory.py +17 -2
- agno/db/singlestore/schemas.py +63 -0
- agno/db/singlestore/singlestore.py +949 -83
- agno/db/singlestore/utils.py +60 -2
- agno/db/sqlite/__init__.py +2 -1
- agno/db/sqlite/async_sqlite.py +2911 -0
- agno/db/sqlite/schemas.py +62 -0
- agno/db/sqlite/sqlite.py +965 -46
- agno/db/sqlite/utils.py +169 -8
- agno/db/surrealdb/__init__.py +3 -0
- agno/db/surrealdb/metrics.py +292 -0
- agno/db/surrealdb/models.py +334 -0
- agno/db/surrealdb/queries.py +71 -0
- agno/db/surrealdb/surrealdb.py +1908 -0
- agno/db/surrealdb/utils.py +147 -0
- agno/db/utils.py +2 -0
- agno/eval/__init__.py +10 -0
- agno/eval/accuracy.py +75 -55
- agno/eval/agent_as_judge.py +861 -0
- agno/eval/base.py +29 -0
- agno/eval/performance.py +16 -7
- agno/eval/reliability.py +28 -16
- agno/eval/utils.py +35 -17
- agno/exceptions.py +27 -2
- agno/filters.py +354 -0
- agno/guardrails/prompt_injection.py +1 -0
- agno/hooks/__init__.py +3 -0
- agno/hooks/decorator.py +164 -0
- agno/integrations/discord/client.py +1 -1
- agno/knowledge/chunking/agentic.py +13 -10
- agno/knowledge/chunking/fixed.py +4 -1
- agno/knowledge/chunking/semantic.py +9 -4
- agno/knowledge/chunking/strategy.py +59 -15
- agno/knowledge/embedder/fastembed.py +1 -1
- agno/knowledge/embedder/nebius.py +1 -1
- agno/knowledge/embedder/ollama.py +8 -0
- agno/knowledge/embedder/openai.py +8 -8
- agno/knowledge/embedder/sentence_transformer.py +6 -2
- agno/knowledge/embedder/vllm.py +262 -0
- agno/knowledge/knowledge.py +1618 -318
- agno/knowledge/reader/base.py +6 -2
- agno/knowledge/reader/csv_reader.py +8 -10
- agno/knowledge/reader/docx_reader.py +5 -6
- agno/knowledge/reader/field_labeled_csv_reader.py +16 -20
- agno/knowledge/reader/json_reader.py +5 -4
- agno/knowledge/reader/markdown_reader.py +8 -8
- agno/knowledge/reader/pdf_reader.py +17 -19
- agno/knowledge/reader/pptx_reader.py +101 -0
- agno/knowledge/reader/reader_factory.py +32 -3
- agno/knowledge/reader/s3_reader.py +3 -3
- agno/knowledge/reader/tavily_reader.py +193 -0
- agno/knowledge/reader/text_reader.py +22 -10
- agno/knowledge/reader/web_search_reader.py +1 -48
- agno/knowledge/reader/website_reader.py +10 -10
- agno/knowledge/reader/wikipedia_reader.py +33 -1
- agno/knowledge/types.py +1 -0
- agno/knowledge/utils.py +72 -7
- agno/media.py +22 -6
- agno/memory/__init__.py +14 -1
- agno/memory/manager.py +544 -83
- agno/memory/strategies/__init__.py +15 -0
- agno/memory/strategies/base.py +66 -0
- agno/memory/strategies/summarize.py +196 -0
- agno/memory/strategies/types.py +37 -0
- agno/models/aimlapi/aimlapi.py +17 -0
- agno/models/anthropic/claude.py +515 -40
- agno/models/aws/bedrock.py +102 -21
- agno/models/aws/claude.py +131 -274
- agno/models/azure/ai_foundry.py +41 -19
- agno/models/azure/openai_chat.py +39 -8
- agno/models/base.py +1249 -525
- agno/models/cerebras/cerebras.py +91 -21
- agno/models/cerebras/cerebras_openai.py +21 -2
- agno/models/cohere/chat.py +40 -6
- agno/models/cometapi/cometapi.py +18 -1
- agno/models/dashscope/dashscope.py +2 -3
- agno/models/deepinfra/deepinfra.py +18 -1
- agno/models/deepseek/deepseek.py +69 -3
- agno/models/fireworks/fireworks.py +18 -1
- agno/models/google/gemini.py +877 -80
- agno/models/google/utils.py +22 -0
- agno/models/groq/groq.py +51 -18
- agno/models/huggingface/huggingface.py +17 -6
- agno/models/ibm/watsonx.py +16 -6
- agno/models/internlm/internlm.py +18 -1
- agno/models/langdb/langdb.py +13 -1
- agno/models/litellm/chat.py +44 -9
- agno/models/litellm/litellm_openai.py +18 -1
- agno/models/message.py +28 -5
- agno/models/meta/llama.py +47 -14
- agno/models/meta/llama_openai.py +22 -17
- agno/models/mistral/mistral.py +8 -4
- agno/models/nebius/nebius.py +6 -7
- agno/models/nvidia/nvidia.py +20 -3
- agno/models/ollama/chat.py +24 -8
- agno/models/openai/chat.py +104 -29
- agno/models/openai/responses.py +101 -81
- agno/models/openrouter/openrouter.py +60 -3
- agno/models/perplexity/perplexity.py +17 -1
- agno/models/portkey/portkey.py +7 -6
- agno/models/requesty/requesty.py +24 -4
- agno/models/response.py +73 -2
- agno/models/sambanova/sambanova.py +20 -3
- agno/models/siliconflow/siliconflow.py +19 -2
- agno/models/together/together.py +20 -3
- agno/models/utils.py +254 -8
- agno/models/vercel/v0.py +20 -3
- agno/models/vertexai/__init__.py +0 -0
- agno/models/vertexai/claude.py +190 -0
- agno/models/vllm/vllm.py +19 -14
- agno/models/xai/xai.py +19 -2
- agno/os/app.py +549 -152
- agno/os/auth.py +190 -3
- agno/os/config.py +23 -0
- agno/os/interfaces/a2a/router.py +8 -11
- agno/os/interfaces/a2a/utils.py +1 -1
- agno/os/interfaces/agui/router.py +18 -3
- agno/os/interfaces/agui/utils.py +152 -39
- agno/os/interfaces/slack/router.py +55 -37
- agno/os/interfaces/slack/slack.py +9 -1
- agno/os/interfaces/whatsapp/router.py +0 -1
- agno/os/interfaces/whatsapp/security.py +3 -1
- agno/os/mcp.py +110 -52
- agno/os/middleware/__init__.py +2 -0
- agno/os/middleware/jwt.py +676 -112
- agno/os/router.py +40 -1478
- agno/os/routers/agents/__init__.py +3 -0
- agno/os/routers/agents/router.py +599 -0
- agno/os/routers/agents/schema.py +261 -0
- agno/os/routers/evals/evals.py +96 -39
- agno/os/routers/evals/schemas.py +65 -33
- agno/os/routers/evals/utils.py +80 -10
- agno/os/routers/health.py +10 -4
- agno/os/routers/knowledge/knowledge.py +196 -38
- agno/os/routers/knowledge/schemas.py +82 -22
- agno/os/routers/memory/memory.py +279 -52
- agno/os/routers/memory/schemas.py +46 -17
- agno/os/routers/metrics/metrics.py +20 -8
- agno/os/routers/metrics/schemas.py +16 -16
- agno/os/routers/session/session.py +462 -34
- agno/os/routers/teams/__init__.py +3 -0
- agno/os/routers/teams/router.py +512 -0
- agno/os/routers/teams/schema.py +257 -0
- agno/os/routers/traces/__init__.py +3 -0
- agno/os/routers/traces/schemas.py +414 -0
- agno/os/routers/traces/traces.py +499 -0
- agno/os/routers/workflows/__init__.py +3 -0
- agno/os/routers/workflows/router.py +624 -0
- agno/os/routers/workflows/schema.py +75 -0
- agno/os/schema.py +256 -693
- agno/os/scopes.py +469 -0
- agno/os/utils.py +514 -36
- agno/reasoning/anthropic.py +80 -0
- agno/reasoning/gemini.py +73 -0
- agno/reasoning/openai.py +5 -0
- agno/reasoning/vertexai.py +76 -0
- agno/run/__init__.py +6 -0
- agno/run/agent.py +155 -32
- agno/run/base.py +55 -3
- agno/run/requirement.py +181 -0
- agno/run/team.py +125 -38
- agno/run/workflow.py +72 -18
- agno/session/agent.py +102 -89
- agno/session/summary.py +56 -15
- agno/session/team.py +164 -90
- agno/session/workflow.py +405 -40
- agno/table.py +10 -0
- agno/team/team.py +3974 -1903
- agno/tools/dalle.py +2 -4
- agno/tools/eleven_labs.py +23 -25
- agno/tools/exa.py +21 -16
- agno/tools/file.py +153 -23
- agno/tools/file_generation.py +16 -10
- agno/tools/firecrawl.py +15 -7
- agno/tools/function.py +193 -38
- agno/tools/gmail.py +238 -14
- agno/tools/google_drive.py +271 -0
- agno/tools/googlecalendar.py +36 -8
- agno/tools/googlesheets.py +20 -5
- agno/tools/jira.py +20 -0
- agno/tools/mcp/__init__.py +10 -0
- agno/tools/mcp/mcp.py +331 -0
- agno/tools/mcp/multi_mcp.py +347 -0
- agno/tools/mcp/params.py +24 -0
- agno/tools/mcp_toolbox.py +3 -3
- agno/tools/models/nebius.py +5 -5
- agno/tools/models_labs.py +20 -10
- agno/tools/nano_banana.py +151 -0
- agno/tools/notion.py +204 -0
- agno/tools/parallel.py +314 -0
- agno/tools/postgres.py +76 -36
- agno/tools/redshift.py +406 -0
- agno/tools/scrapegraph.py +1 -1
- agno/tools/shopify.py +1519 -0
- agno/tools/slack.py +18 -3
- agno/tools/spotify.py +919 -0
- agno/tools/tavily.py +146 -0
- agno/tools/toolkit.py +25 -0
- agno/tools/workflow.py +8 -1
- agno/tools/yfinance.py +12 -11
- agno/tracing/__init__.py +12 -0
- agno/tracing/exporter.py +157 -0
- agno/tracing/schemas.py +276 -0
- agno/tracing/setup.py +111 -0
- agno/utils/agent.py +938 -0
- agno/utils/cryptography.py +22 -0
- agno/utils/dttm.py +33 -0
- agno/utils/events.py +151 -3
- agno/utils/gemini.py +15 -5
- agno/utils/hooks.py +118 -4
- agno/utils/http.py +113 -2
- agno/utils/knowledge.py +12 -5
- agno/utils/log.py +1 -0
- agno/utils/mcp.py +92 -2
- agno/utils/media.py +187 -1
- agno/utils/merge_dict.py +3 -3
- agno/utils/message.py +60 -0
- agno/utils/models/ai_foundry.py +9 -2
- agno/utils/models/claude.py +49 -14
- agno/utils/models/cohere.py +9 -2
- agno/utils/models/llama.py +9 -2
- agno/utils/models/mistral.py +4 -2
- agno/utils/print_response/agent.py +109 -16
- agno/utils/print_response/team.py +223 -30
- agno/utils/print_response/workflow.py +251 -34
- agno/utils/streamlit.py +1 -1
- agno/utils/team.py +98 -9
- agno/utils/tokens.py +657 -0
- agno/vectordb/base.py +39 -7
- agno/vectordb/cassandra/cassandra.py +21 -5
- agno/vectordb/chroma/chromadb.py +43 -12
- agno/vectordb/clickhouse/clickhousedb.py +21 -5
- agno/vectordb/couchbase/couchbase.py +29 -5
- agno/vectordb/lancedb/lance_db.py +92 -181
- agno/vectordb/langchaindb/langchaindb.py +24 -4
- agno/vectordb/lightrag/lightrag.py +17 -3
- agno/vectordb/llamaindex/llamaindexdb.py +25 -5
- agno/vectordb/milvus/milvus.py +50 -37
- agno/vectordb/mongodb/__init__.py +7 -1
- agno/vectordb/mongodb/mongodb.py +36 -30
- agno/vectordb/pgvector/pgvector.py +201 -77
- agno/vectordb/pineconedb/pineconedb.py +41 -23
- agno/vectordb/qdrant/qdrant.py +67 -54
- agno/vectordb/redis/__init__.py +9 -0
- agno/vectordb/redis/redisdb.py +682 -0
- agno/vectordb/singlestore/singlestore.py +50 -29
- agno/vectordb/surrealdb/surrealdb.py +31 -41
- agno/vectordb/upstashdb/upstashdb.py +34 -6
- agno/vectordb/weaviate/weaviate.py +53 -14
- agno/workflow/__init__.py +2 -0
- agno/workflow/agent.py +299 -0
- agno/workflow/condition.py +120 -18
- agno/workflow/loop.py +77 -10
- agno/workflow/parallel.py +231 -143
- agno/workflow/router.py +118 -17
- agno/workflow/step.py +609 -170
- agno/workflow/steps.py +73 -6
- agno/workflow/types.py +96 -21
- agno/workflow/workflow.py +2039 -262
- {agno-2.1.2.dist-info → agno-2.3.13.dist-info}/METADATA +201 -66
- agno-2.3.13.dist-info/RECORD +613 -0
- agno/tools/googlesearch.py +0 -98
- agno/tools/mcp.py +0 -679
- agno/tools/memori.py +0 -339
- agno-2.1.2.dist-info/RECORD +0 -543
- {agno-2.1.2.dist-info → agno-2.3.13.dist-info}/WHEEL +0 -0
- {agno-2.1.2.dist-info → agno-2.3.13.dist-info}/licenses/LICENSE +0 -0
- {agno-2.1.2.dist-info → agno-2.3.13.dist-info}/top_level.txt +0 -0
agno/knowledge/knowledge.py
CHANGED
@@ -4,7 +4,6 @@ import io
 import time
 from dataclasses import dataclass
 from enum import Enum
-from functools import cached_property
 from io import BytesIO
 from os.path import basename
 from pathlib import Path
@@ -12,8 +11,9 @@ from typing import Any, Dict, List, Optional, Set, Tuple, Union, cast, overload
 
 from httpx import AsyncClient
 
-from agno.db.base import BaseDb
+from agno.db.base import AsyncBaseDb, BaseDb
 from agno.db.schemas.knowledge import KnowledgeRow
+from agno.filters import FilterExpr
 from agno.knowledge.content import Content, ContentAuth, ContentStatus, FileData
 from agno.knowledge.document import Document
 from agno.knowledge.reader import Reader, ReaderFactory
@@ -39,7 +39,7 @@ class Knowledge:
     name: Optional[str] = None
     description: Optional[str] = None
     vector_db: Optional[Any] = None
-    contents_db: Optional[BaseDb] = None
+    contents_db: Optional[Union[BaseDb, AsyncBaseDb]] = None
     max_results: int = 10
     readers: Optional[Dict[str, Reader]] = None
 
@@ -51,9 +51,6 @@ class Knowledge:
             self.vector_db.create()
 
         self.construct_readers()
-        self.valid_metadata_filters = set()
-
-    # --- SDK Specific Methods ---
 
     # --- Add Contents ---
     @overload
@@ -122,6 +119,7 @@ class Knowledge:
                     exclude=exclude,
                     upsert=upsert,
                     skip_if_exists=skip_if_exists,
+                    reader=reader,
                 )
             for url in urls:
                 await self.add_content_async(
@@ -133,6 +131,7 @@ class Knowledge:
                     exclude=exclude,
                     upsert=upsert,
                     skip_if_exists=skip_if_exists,
+                    reader=reader,
                 )
             for i, text_content in enumerate(text_contents):
                 content_name = f"{name}_{i}" if name else f"text_content_{i}"
@@ -146,6 +145,7 @@ class Knowledge:
                     exclude=exclude,
                     upsert=upsert,
                     skip_if_exists=skip_if_exists,
+                    reader=reader,
                 )
             if topics:
                 await self.add_content_async(
@@ -168,6 +168,7 @@ class Knowledge:
                     remote_content=remote_content,
                     upsert=upsert,
                     skip_if_exists=skip_if_exists,
+                    reader=reader,
                 )
 
         else:
@@ -183,18 +184,20 @@ class Knowledge:
         paths: Optional[List[str]] = None,
         urls: Optional[List[str]] = None,
         metadata: Optional[Dict[str, str]] = None,
+        topics: Optional[List[str]] = None,
+        text_contents: Optional[List[str]] = None,
+        reader: Optional[Reader] = None,
         include: Optional[List[str]] = None,
         exclude: Optional[List[str]] = None,
         upsert: bool = True,
         skip_if_exists: bool = False,
+        remote_content: Optional[RemoteContent] = None,
     ) -> None: ...
 
     def add_contents(self, *args, **kwargs) -> None:
         """
         Synchronously add multiple content items to the knowledge base.
 
-        This method wraps the asynchronous add_contents method
-
         Supports two usage patterns:
         1. Pass a list of content dictionaries as first argument
         2. Pass keyword arguments with paths, urls, metadata, etc.
@@ -204,12 +207,114 @@ class Knowledge:
             paths: Optional list of file paths to load content from
             urls: Optional list of URLs to load content from
             metadata: Optional metadata dictionary to apply to all content
+            topics: Optional list of topics to add
+            text_contents: Optional list of text content strings to add
+            reader: Optional reader to use for processing content
             include: Optional list of file patterns to include
             exclude: Optional list of file patterns to exclude
-            upsert: Whether to update existing content if it already exists
-            skip_if_exists: Whether to skip adding content if it already exists
+            upsert: Whether to update existing content if it already exists (only used when skip_if_exists=False)
+            skip_if_exists: Whether to skip adding content if it already exists (default: True)
+            remote_content: Optional remote content (S3, GCS, etc.) to add
         """
-
+        if args and isinstance(args[0], list):
+            arguments = args[0]
+            upsert = kwargs.get("upsert", True)
+            skip_if_exists = kwargs.get("skip_if_exists", False)
+            for argument in arguments:
+                self.add_content(
+                    name=argument.get("name"),
+                    description=argument.get("description"),
+                    path=argument.get("path"),
+                    url=argument.get("url"),
+                    metadata=argument.get("metadata"),
+                    topics=argument.get("topics"),
+                    text_content=argument.get("text_content"),
+                    reader=argument.get("reader"),
+                    include=argument.get("include"),
+                    exclude=argument.get("exclude"),
+                    upsert=argument.get("upsert", upsert),
+                    skip_if_exists=argument.get("skip_if_exists", skip_if_exists),
+                    remote_content=argument.get("remote_content", None),
+                )
+
+        elif kwargs:
+            name = kwargs.get("name", [])
+            metadata = kwargs.get("metadata", {})
+            description = kwargs.get("description", [])
+            topics = kwargs.get("topics", [])
+            reader = kwargs.get("reader", None)
+            paths = kwargs.get("paths", [])
+            urls = kwargs.get("urls", [])
+            text_contents = kwargs.get("text_contents", [])
+            include = kwargs.get("include")
+            exclude = kwargs.get("exclude")
+            upsert = kwargs.get("upsert", True)
+            skip_if_exists = kwargs.get("skip_if_exists", False)
+            remote_content = kwargs.get("remote_content", None)
+            for path in paths:
+                self.add_content(
+                    name=name,
+                    description=description,
+                    path=path,
+                    metadata=metadata,
+                    include=include,
+                    exclude=exclude,
+                    upsert=upsert,
+                    skip_if_exists=skip_if_exists,
+                    reader=reader,
+                )
+            for url in urls:
+                self.add_content(
+                    name=name,
+                    description=description,
+                    url=url,
+                    metadata=metadata,
+                    include=include,
+                    exclude=exclude,
+                    upsert=upsert,
+                    skip_if_exists=skip_if_exists,
+                    reader=reader,
+                )
+            for i, text_content in enumerate(text_contents):
+                content_name = f"{name}_{i}" if name else f"text_content_{i}"
+                log_debug(f"Adding text content: {content_name}")
+                self.add_content(
+                    name=content_name,
+                    description=description,
+                    text_content=text_content,
+                    metadata=metadata,
+                    include=include,
+                    exclude=exclude,
+                    upsert=upsert,
+                    skip_if_exists=skip_if_exists,
+                    reader=reader,
+                )
+            if topics:
+                self.add_content(
+                    name=name,
+                    description=description,
+                    topics=topics,
+                    metadata=metadata,
+                    include=include,
+                    exclude=exclude,
+                    upsert=upsert,
+                    skip_if_exists=skip_if_exists,
+                    reader=reader,
+                )
+
+            if remote_content:
+                self.add_content(
+                    name=name,
+                    metadata=metadata,
+                    description=description,
+                    remote_content=remote_content,
+                    upsert=upsert,
+                    skip_if_exists=skip_if_exists,
+                    reader=reader,
+                )
+
+        else:
+            raise ValueError("Invalid usage of add_contents.")
 
     # --- Add Content ---
 
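For orientation, a minimal usage sketch of the two calling conventions the new synchronous add_contents implementation accepts. The field and parameter names come from this diff; my_vector_db and the file names are illustrative placeholders only:

from agno.knowledge.knowledge import Knowledge

# my_vector_db: any supported vector db instance (placeholder, not defined here)
knowledge = Knowledge(name="docs", vector_db=my_vector_db)

# Pattern 1: a list of content dictionaries as the first positional argument
knowledge.add_contents(
    [
        {"path": "guides/intro.pdf", "metadata": {"team": "docs"}},
        {"url": "https://example.com/faq.html"},
    ],
    skip_if_exists=True,
)

# Pattern 2: keyword arguments; each path/url/text fans out to add_content()
knowledge.add_contents(
    paths=["guides/intro.pdf"],
    urls=["https://example.com/faq.html"],
    metadata={"team": "docs"},
    upsert=True,
)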
@@ -246,7 +351,7 @@ class Knowledge:
         include: Optional[List[str]] = None,
         exclude: Optional[List[str]] = None,
         upsert: bool = True,
-        skip_if_exists: bool =
+        skip_if_exists: bool = False,
         auth: Optional[ContentAuth] = None,
     ) -> None:
         # Validation: At least one of the parameters must be provided
@@ -256,10 +361,6 @@ class Knowledge:
             )
             return
 
-        if not skip_if_exists:
-            log_info("skip_if_exists is disabled, disabling upsert")
-            upsert = False
-
         content = None
         file_data = None
         if text_content:
@@ -280,7 +381,7 @@ class Knowledge:
         content.content_hash = self._build_content_hash(content)
         content.id = generate_id(content.content_hash)
 
-        await self.
+        await self._load_content_async(content, upsert, skip_if_exists, include, exclude)
 
     @overload
     def add_content(
@@ -333,27 +434,37 @@ class Knowledge:
             reader: Optional custom reader for processing the content
             include: Optional list of file patterns to include
             exclude: Optional list of file patterns to exclude
-            upsert: Whether to update existing content if it already exists
-            skip_if_exists: Whether to skip adding content if it already exists
+            upsert: Whether to update existing content if it already exists (only used when skip_if_exists=False)
+            skip_if_exists: Whether to skip adding content if it already exists (default: False)
         """
-
-
-
-
-            path=path,
-            url=url,
-            text_content=text_content,
-            metadata=metadata,
-            topics=topics,
-            remote_content=remote_content,
-            reader=reader,
-            include=include,
-            exclude=exclude,
-            upsert=upsert,
-            skip_if_exists=skip_if_exists,
-            auth=auth,
+        # Validation: At least one of the parameters must be provided
+        if all(argument is None for argument in [path, url, text_content, topics, remote_content]):
+            log_warning(
+                "At least one of 'path', 'url', 'text_content', 'topics', or 'remote_content' must be provided."
             )
+            return
+
+        content = None
+        file_data = None
+        if text_content:
+            file_data = FileData(content=text_content, type="Text")
+
+        content = Content(
+            name=name,
+            description=description,
+            path=path,
+            url=url,
+            file_data=file_data if file_data else None,
+            metadata=metadata,
+            topics=topics,
+            remote_content=remote_content,
+            reader=reader,
+            auth=auth,
         )
+        content.content_hash = self._build_content_hash(content)
+        content.id = generate_id(content.content_hash)
+
+        self._load_content(content, upsert, skip_if_exists, include, exclude)
 
     def _should_skip(self, content_hash: str, skip_if_exists: bool) -> bool:
         """
@@ -375,7 +486,178 @@ class Knowledge:
 
         return False
 
-
+    def _select_reader_by_extension(
+        self, file_extension: str, provided_reader: Optional[Reader] = None
+    ) -> Tuple[Optional[Reader], str]:
+        """
+        Select a reader based on file extension.
+
+        Args:
+            file_extension: File extension (e.g., '.pdf', '.csv')
+            provided_reader: Optional reader already provided
+
+        Returns:
+            Tuple of (reader, name) where name may be adjusted based on extension
+        """
+        if provided_reader:
+            return provided_reader, ""
+
+        file_extension = file_extension.lower()
+        if file_extension == ".csv":
+            return self.csv_reader, "data.csv"
+        elif file_extension == ".pdf":
+            return self.pdf_reader, ""
+        elif file_extension == ".docx":
+            return self.docx_reader, ""
+        elif file_extension == ".pptx":
+            return self.pptx_reader, ""
+        elif file_extension == ".json":
+            return self.json_reader, ""
+        elif file_extension == ".markdown":
+            return self.markdown_reader, ""
+        else:
+            return self.text_reader, ""
+
+    def _select_reader_by_uri(self, uri: str, provided_reader: Optional[Reader] = None) -> Optional[Reader]:
+        """
+        Select a reader based on URI/file path extension.
+
+        Args:
+            uri: URI or file path
+            provided_reader: Optional reader already provided
+
+        Returns:
+            Selected reader or None
+        """
+        if provided_reader:
+            return provided_reader
+
+        uri_lower = uri.lower()
+        if uri_lower.endswith(".pdf"):
+            return self.pdf_reader
+        elif uri_lower.endswith(".csv"):
+            return self.csv_reader
+        elif uri_lower.endswith(".docx"):
+            return self.docx_reader
+        elif uri_lower.endswith(".pptx"):
+            return self.pptx_reader
+        elif uri_lower.endswith(".json"):
+            return self.json_reader
+        elif uri_lower.endswith(".markdown"):
+            return self.markdown_reader
+        else:
+            return self.text_reader
+
+    def _read(
+        self,
+        reader: Reader,
+        source: Union[Path, str, BytesIO],
+        name: Optional[str] = None,
+        password: Optional[str] = None,
+    ) -> List[Document]:
+        """
+        Read content using a reader with optional password handling.
+
+        Args:
+            reader: Reader to use
+            source: Source to read from (Path, URL string, or BytesIO)
+            name: Optional name for the document
+            password: Optional password for protected files
+
+        Returns:
+            List of documents read
+        """
+        import inspect
+
+        read_signature = inspect.signature(reader.read)
+        if password and "password" in read_signature.parameters:
+            if isinstance(source, BytesIO):
+                return reader.read(source, name=name, password=password)
+            else:
+                return reader.read(source, name=name, password=password)
+        else:
+            if isinstance(source, BytesIO):
+                return reader.read(source, name=name)
+            else:
+                return reader.read(source, name=name)
+
+    async def _read_async(
+        self,
+        reader: Reader,
+        source: Union[Path, str, BytesIO],
+        name: Optional[str] = None,
+        password: Optional[str] = None,
+    ) -> List[Document]:
+        """
+        Read content using a reader's async_read method with optional password handling.
+
+        Args:
+            reader: Reader to use
+            source: Source to read from (Path, URL string, or BytesIO)
+            name: Optional name for the document
+            password: Optional password for protected files
+
+        Returns:
+            List of documents read
+        """
+        import inspect
+
+        read_signature = inspect.signature(reader.async_read)
+        if password and "password" in read_signature.parameters:
+            return await reader.async_read(source, name=name, password=password)
+        else:
+            if isinstance(source, BytesIO):
+                return await reader.async_read(source, name=name)
+            else:
+                return await reader.async_read(source, name=name)
+
+    def _prepare_documents_for_insert(
+        self,
+        documents: List[Document],
+        content_id: str,
+        calculate_sizes: bool = False,
+        metadata: Optional[Dict[str, Any]] = None,
+    ) -> List[Document]:
+        """
+        Prepare documents for insertion by assigning content_id and optionally calculating sizes and updating metadata.
+
+        Args:
+            documents: List of documents to prepare
+            content_id: Content ID to assign to documents
+            calculate_sizes: Whether to calculate document sizes
+            metadata: Optional metadata to merge into document metadata
+
+        Returns:
+            List of prepared documents
+        """
+        for document in documents:
+            document.content_id = content_id
+            if calculate_sizes and document.content and not document.size:
+                document.size = len(document.content.encode("utf-8"))
+            if metadata:
+                document.meta_data.update(metadata)
+        return documents
+
+    def _chunk_documents_sync(self, reader: Reader, documents: List[Document]) -> List[Document]:
+        """
+        Chunk documents synchronously.
+
+        Args:
+            reader: Reader with chunking strategy
+            documents: Documents to chunk
+
+        Returns:
+            List of chunked documents
+        """
+        if not reader or reader.chunk:
+            return documents
+
+        chunked_documents = []
+        for doc in documents:
+            chunked_documents.extend(reader.chunk_document(doc))
+        return chunked_documents
+
+    async def _load_from_path_async(
         self,
         content: Content,
         upsert: bool,
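The _read/_read_async helpers added above centralize the repeated inline inspect checks: a reader only receives a password keyword if its read signature actually declares one. A standalone sketch of the same dispatch pattern (the helper name call_with_optional_password is hypothetical, for illustration only):

import inspect

def call_with_optional_password(reader, source, name=None, password=None):
    # Pass password through only when the reader's read() declares it;
    # readers without password support never see an unexpected keyword.
    if password and "password" in inspect.signature(reader.read).parameters:
        return reader.read(source, name=name, password=password)
    return reader.read(source, name=name)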
@@ -392,45 +674,32 @@ class Knowledge:
 
         if path.is_file():
             if self._should_include_file(str(path), include, exclude):
-
+                log_debug(f"Adding file {path} due to include/exclude filters")
 
-                self.
+                await self._add_to_contents_db_async(content)
                 if self._should_skip(content.content_hash, skip_if_exists):  # type: ignore[arg-type]
                     content.status = ContentStatus.COMPLETED
-                    self.
+                    await self._aupdate_content(content)
                     return
 
                 # Handle LightRAG special case - read file and upload directly
                 if self.vector_db.__class__.__name__ == "LightRag":
-                    await self.
+                    await self._process_lightrag_content_async(content, KnowledgeContentOrigin.PATH)
                     return
 
                 if content.reader:
-
-                    import inspect
-
-                    read_signature = inspect.signature(content.reader.read)
-                    if "password" in read_signature.parameters and content.auth and content.auth.password:
-                        read_documents = content.reader.read(
-                            path, name=content.name or path.name, password=content.auth.password
-                        )
-                    else:
-                        read_documents = content.reader.read(path, name=content.name or path.name)
-
+                    reader = content.reader
                 else:
                     reader = ReaderFactory.get_reader_for_extension(path.suffix)
-
-                if reader:
-                    # TODO: We will refactor this to eventually pass authorization to all readers
-                    import inspect
+                log_debug(f"Using Reader: {reader.__class__.__name__}")
 
-
-
-
-
-
-
-
+                if reader:
+                    password = content.auth.password if content.auth and content.auth.password else None
+                    read_documents = await self._read_async(
+                        reader, path, name=content.name or path.name, password=password
+                    )
+                else:
+                    read_documents = []
 
                 if not content.file_type:
                     content.file_type = path.suffix
@@ -444,10 +713,11 @@ class Knowledge:
                         log_warning(f"Could not get file size for {path}: {e}")
                         content.size = 0
 
-
-
+                if not content.id:
+                    content.id = generate_id(content.content_hash or "")
+                self._prepare_documents_for_insert(read_documents, content.id)
 
-                await self.
+                await self._handle_vector_db_insert_async(content, read_documents, upsert)
 
         elif path.is_dir():
             for file_path in path.iterdir():
@@ -466,48 +736,240 @@ class Knowledge:
                 file_content.content_hash = self._build_content_hash(file_content)
                 file_content.id = generate_id(file_content.content_hash)
 
-                await self.
+                await self._load_from_path_async(file_content, upsert, skip_if_exists, include, exclude)
         else:
             log_warning(f"Invalid path: {path}")
 
-
+    def _load_from_path(
         self,
         content: Content,
         upsert: bool,
         skip_if_exists: bool,
+        include: Optional[List[str]] = None,
+        exclude: Optional[List[str]] = None,
     ):
-        """Load the content in the contextual URL
-
-        1. Set content hash
-        2. Validate the URL
-        3. Read the content
-        4. Prepare and insert the content in the vector database
-        """
-
         from agno.vectordb import VectorDb
 
         self.vector_db = cast(VectorDb, self.vector_db)
 
-        log_info(f"Adding content from
-        content.
-
-        if not content.url:
-            raise ValueError("No url provided")
+        log_info(f"Adding content from path, {content.id}, {content.name}, {content.path}, {content.description}")
+        path = Path(content.path)  # type: ignore
 
-
-
-
-            content.status = ContentStatus.COMPLETED
-            self._update_content(content)
-            return
+        if path.is_file():
+            if self._should_include_file(str(path), include, exclude):
+                log_debug(f"Adding file {path} due to include/exclude filters")
 
-
-
-
+                self._add_to_contents_db(content)
+                if self._should_skip(content.content_hash, skip_if_exists):  # type: ignore[arg-type]
+                    content.status = ContentStatus.COMPLETED
+                    self._update_content(content)
+                    return
 
-
-
-
+                # Handle LightRAG special case - read file and upload directly
+                if self.vector_db.__class__.__name__ == "LightRag":
+                    self._process_lightrag_content(content, KnowledgeContentOrigin.PATH)
+                    return
+
+                if content.reader:
+                    # TODO: We will refactor this to eventually pass authorization to all readers
+                    import inspect
+
+                    read_signature = inspect.signature(content.reader.read)
+                    if "password" in read_signature.parameters and content.auth and content.auth.password:
+                        read_documents = content.reader.read(
+                            path, name=content.name or path.name, password=content.auth.password
+                        )
+                    else:
+                        read_documents = content.reader.read(path, name=content.name or path.name)
+
+                else:
+                    reader = ReaderFactory.get_reader_for_extension(path.suffix)
+                    log_debug(f"Using Reader: {reader.__class__.__name__}")
+                    if reader:
+                        # TODO: We will refactor this to eventually pass authorization to all readers
+                        import inspect
+
+                        read_signature = inspect.signature(reader.read)
+                        if "password" in read_signature.parameters and content.auth and content.auth.password:
+                            read_documents = reader.read(
+                                path, name=content.name or path.name, password=content.auth.password
+                            )
+                        else:
+                            read_documents = reader.read(path, name=content.name or path.name)
+
+                if not content.file_type:
+                    content.file_type = path.suffix
+
+                if not content.size and content.file_data:
+                    content.size = len(content.file_data.content)  # type: ignore
+                if not content.size:
+                    try:
+                        content.size = path.stat().st_size
+                    except (OSError, IOError) as e:
+                        log_warning(f"Could not get file size for {path}: {e}")
+                        content.size = 0
+
+                if not content.id:
+                    content.id = generate_id(content.content_hash or "")
+                self._prepare_documents_for_insert(read_documents, content.id)
+
+                self._handle_vector_db_insert(content, read_documents, upsert)
+
+        elif path.is_dir():
+            for file_path in path.iterdir():
+                # Apply include/exclude filtering
+                if not self._should_include_file(str(file_path), include, exclude):
+                    log_debug(f"Skipping file {file_path} due to include/exclude filters")
+                    continue
+
+                file_content = Content(
+                    name=content.name,
+                    path=str(file_path),
+                    metadata=content.metadata,
+                    description=content.description,
+                    reader=content.reader,
+                )
+                file_content.content_hash = self._build_content_hash(file_content)
+                file_content.id = generate_id(file_content.content_hash)
+
+                self._load_from_path(file_content, upsert, skip_if_exists, include, exclude)
+        else:
+            log_warning(f"Invalid path: {path}")
+
+    async def _load_from_url_async(
+        self,
+        content: Content,
+        upsert: bool,
+        skip_if_exists: bool,
+    ):
+        """Load the content in the contextual URL
+
+        1. Set content hash
+        2. Validate the URL
+        3. Read the content
+        4. Prepare and insert the content in the vector database
+        """
+        from agno.vectordb import VectorDb
+
+        self.vector_db = cast(VectorDb, self.vector_db)
+
+        log_info(f"Adding content from URL {content.url}")
+        content.file_type = "url"
+
+        if not content.url:
+            raise ValueError("No url provided")
+
+        # 1. Add content to contents database
+        await self._add_to_contents_db_async(content)
+        if self._should_skip(content.content_hash, skip_if_exists):  # type: ignore[arg-type]
+            content.status = ContentStatus.COMPLETED
+            await self._aupdate_content(content)
+            return
+
+        if self.vector_db.__class__.__name__ == "LightRag":
+            await self._process_lightrag_content_async(content, KnowledgeContentOrigin.URL)
+            return
+
+        # 2. Validate URL
+        try:
+            from urllib.parse import urlparse
+
+            parsed_url = urlparse(content.url)
+            if not all([parsed_url.scheme, parsed_url.netloc]):
+                content.status = ContentStatus.FAILED
+                content.status_message = f"Invalid URL format: {content.url}"
+                await self._aupdate_content(content)
+                log_warning(f"Invalid URL format: {content.url}")
+        except Exception as e:
+            content.status = ContentStatus.FAILED
+            content.status_message = f"Invalid URL: {content.url} - {str(e)}"
+            await self._aupdate_content(content)
+            log_warning(f"Invalid URL: {content.url} - {str(e)}")
+        # 3. Fetch and load content if file has an extension
+        url_path = Path(parsed_url.path)
+        file_extension = url_path.suffix.lower()
+
+        bytes_content = None
+        if file_extension:
+            async with AsyncClient() as client:
+                response = await async_fetch_with_retry(content.url, client=client)
+                bytes_content = BytesIO(response.content)
+
+        # 4. Select reader
+        name = content.name if content.name else content.url
+        if file_extension:
+            reader, default_name = self._select_reader_by_extension(file_extension, content.reader)
+            if default_name and file_extension == ".csv":
+                name = basename(parsed_url.path) or default_name
+        else:
+            reader = content.reader or self.website_reader
+        # 5. Read content
+        try:
+            read_documents = []
+            if reader is not None:
+                # Special handling for YouTubeReader
+                if reader.__class__.__name__ == "YouTubeReader":
+                    read_documents = await reader.async_read(content.url, name=name)
+                else:
+                    password = content.auth.password if content.auth and content.auth.password else None
+                    source = bytes_content if bytes_content else content.url
+                    read_documents = await self._read_async(reader, source, name=name, password=password)
+
+        except Exception as e:
+            log_error(f"Error reading URL: {content.url} - {str(e)}")
+            content.status = ContentStatus.FAILED
+            content.status_message = f"Error reading URL: {content.url} - {str(e)}"
+            await self._aupdate_content(content)
+            return
+
+        # 6. Chunk documents if needed
+        if reader and not reader.chunk:
+            read_documents = await reader.chunk_documents_async(read_documents)
+        # 7. Prepare and insert the content in the vector database
+        if not content.id:
+            content.id = generate_id(content.content_hash or "")
+        self._prepare_documents_for_insert(read_documents, content.id, calculate_sizes=True)
+        await self._handle_vector_db_insert_async(content, read_documents, upsert)
+
+    def _load_from_url(
+        self,
+        content: Content,
+        upsert: bool,
+        skip_if_exists: bool,
+    ):
+        """Synchronous version of _load_from_url.
+
+        Load the content from a URL:
+        1. Set content hash
+        2. Validate the URL
+        3. Read the content
+        4. Prepare and insert the content in the vector database
+        """
+        from agno.utils.http import fetch_with_retry
+        from agno.vectordb import VectorDb
+
+        self.vector_db = cast(VectorDb, self.vector_db)
+
+        log_info(f"Adding content from URL {content.url}")
+        content.file_type = "url"
+
+        if not content.url:
+            raise ValueError("No url provided")
+
+        # 1. Add content to contents database
+        self._add_to_contents_db(content)
+        if self._should_skip(content.content_hash, skip_if_exists):  # type: ignore[arg-type]
+            content.status = ContentStatus.COMPLETED
+            self._update_content(content)
+            return
+
+        if self.vector_db.__class__.__name__ == "LightRag":
+            self._process_lightrag_content(content, KnowledgeContentOrigin.URL)
+            return
+
+        # 2. Validate URL
+        try:
+            from urllib.parse import urlparse
 
             parsed_url = urlparse(content.url)
             if not all([parsed_url.scheme, parsed_url.netloc]):
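Taken together, the async URL path above validates the URL, fetches the bytes only when the URL ends in a known file extension, selects a reader by that extension (falling back to the website reader), then chunks and inserts. A hedged end-to-end sketch, assuming the knowledge instance from the earlier example (the URL is illustrative):

import asyncio

async def main():
    # A .pdf suffix routes to the PDF reader via _select_reader_by_extension;
    # an extensionless URL would fall back to the website reader instead.
    await knowledge.add_content_async(url="https://example.com/handbook.pdf")

asyncio.run(main())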
@@ -527,50 +989,29 @@ class Knowledge:
 
         bytes_content = None
         if file_extension:
-
-                response = await async_fetch_with_retry(content.url, client=client)
+            response = fetch_with_retry(content.url)
             bytes_content = BytesIO(response.content)
 
         # 4. Select reader
-        # If a reader was provided by the user, use it
-        reader = content.reader
         name = content.name if content.name else content.url
-
-
-        if file_extension == ".csv":
-            name = basename(parsed_url.path) or
-
-
-            reader = self.pdf_reader
-        elif file_extension == ".docx":
-            reader = self.docx_reader
-        elif file_extension == ".json":
-            reader = self.json_reader
-        elif file_extension == ".markdown":
-            reader = self.markdown_reader
-        else:
-            reader = self.text_reader
+        if file_extension:
+            reader, default_name = self._select_reader_by_extension(file_extension, content.reader)
+            if default_name and file_extension == ".csv":
+                name = basename(parsed_url.path) or default_name
+        else:
+            reader = content.reader or self.website_reader
 
         # 5. Read content
         try:
             read_documents = []
             if reader is not None:
-                #
-                import inspect
-
-                read_signature = inspect.signature(reader.read)
+                # Special handling for YouTubeReader
                 if reader.__class__.__name__ == "YouTubeReader":
                     read_documents = reader.read(content.url, name=name)
-                elif "password" in read_signature.parameters and content.auth and content.auth.password:
-                    if bytes_content:
-                        read_documents = reader.read(bytes_content, name=name, password=content.auth.password)
-                    else:
-                        read_documents = reader.read(content.url, name=name, password=content.auth.password)
                 else:
-                    if
-
-
-                    read_documents = reader.read(content.url, name=name)
+                    password = content.auth.password if content.auth and content.auth.password else None
+                    source = bytes_content if bytes_content else content.url
+                    read_documents = self._read(reader, source, name=name, password=password)
 
         except Exception as e:
             log_error(f"Error reading URL: {content.url} - {str(e)}")
@@ -579,19 +1020,17 @@ class Knowledge:
             self._update_content(content)
             return
 
-        # 6. Chunk documents if needed
-        if reader
-            read_documents =
+        # 6. Chunk documents if needed (sync version)
+        if reader:
+            read_documents = self._chunk_documents_sync(reader, read_documents)
+
         # 7. Prepare and insert the content in the vector database
-
-
-
-
-
-
-        await self._handle_vector_db_insert(content, read_documents, upsert)
-
-    async def _load_from_content(
+        if not content.id:
+            content.id = generate_id(content.content_hash or "")
+        self._prepare_documents_for_insert(read_documents, content.id, calculate_sizes=True)
+        self._handle_vector_db_insert(content, read_documents, upsert)
+
+    async def _load_from_content_async(
         self,
         content: Content,
         upsert: bool = True,
@@ -622,6 +1061,103 @@ class Knowledge:
 
         log_info(f"Adding content from {content.name}")
 
+        await self._add_to_contents_db_async(content)
+        if self._should_skip(content.content_hash, skip_if_exists):  # type: ignore[arg-type]
+            content.status = ContentStatus.COMPLETED
+            await self._aupdate_content(content)
+            return
+
+        if content.file_data and self.vector_db.__class__.__name__ == "LightRag":
+            await self._process_lightrag_content_async(content, KnowledgeContentOrigin.CONTENT)
+            return
+
+        read_documents = []
+
+        if isinstance(content.file_data, str):
+            content_bytes = content.file_data.encode("utf-8", errors="replace")
+            content_io = io.BytesIO(content_bytes)
+
+            if content.reader:
+                log_debug(f"Using reader: {content.reader.__class__.__name__} to read content")
+                read_documents = await content.reader.async_read(content_io, name=name)
+            else:
+                text_reader = self.text_reader
+                if text_reader:
+                    read_documents = await text_reader.async_read(content_io, name=name)
+                else:
+                    content.status = ContentStatus.FAILED
+                    content.status_message = "Text reader not available"
+                    await self._aupdate_content(content)
+                    return
+
+        elif isinstance(content.file_data, FileData):
+            if content.file_data.type:
+                if isinstance(content.file_data.content, bytes):
+                    content_io = io.BytesIO(content.file_data.content)
+                elif isinstance(content.file_data.content, str):
+                    content_bytes = content.file_data.content.encode("utf-8", errors="replace")
+                    content_io = io.BytesIO(content_bytes)
+                else:
+                    content_io = content.file_data.content  # type: ignore
+
+                # Respect an explicitly provided reader; otherwise select based on file type
+                if content.reader:
+                    log_debug(f"Using reader: {content.reader.__class__.__name__} to read content")
+                    reader = content.reader
+                else:
+                    reader = self._select_reader(content.file_data.type)
+                name = content.name if content.name else f"content_{content.file_data.type}"
+                read_documents = await reader.async_read(content_io, name=name)
+                if not content.id:
+                    content.id = generate_id(content.content_hash or "")
+                self._prepare_documents_for_insert(read_documents, content.id, metadata=content.metadata)
+
+                if len(read_documents) == 0:
+                    content.status = ContentStatus.FAILED
+                    content.status_message = "Content could not be read"
+                    await self._aupdate_content(content)
+                    return
+
+        else:
+            content.status = ContentStatus.FAILED
+            content.status_message = "No content provided"
+            await self._aupdate_content(content)
+            return
+
+        await self._handle_vector_db_insert_async(content, read_documents, upsert)
+
+    def _load_from_content(
+        self,
+        content: Content,
+        upsert: bool = True,
+        skip_if_exists: bool = False,
+    ):
+        """Synchronous version of _load_from_content."""
+        from agno.vectordb import VectorDb
+
+        self.vector_db = cast(VectorDb, self.vector_db)
+
+        if content.name:
+            name = content.name
+        elif content.file_data and content.file_data.content:
+            if isinstance(content.file_data.content, bytes):
+                name = content.file_data.content[:10].decode("utf-8", errors="ignore")
+            elif isinstance(content.file_data.content, str):
+                name = (
+                    content.file_data.content[:10]
+                    if len(content.file_data.content) >= 10
+                    else content.file_data.content
+                )
+            else:
+                name = str(content.file_data.content)[:10]
+        else:
+            name = None
+
+        if name is not None:
+            content.name = name
+
+        log_info(f"Adding content from {content.name}")
+
         self._add_to_contents_db(content)
         if self._should_skip(content.content_hash, skip_if_exists):  # type: ignore[arg-type]
             content.status = ContentStatus.COMPLETED
@@ -629,7 +1165,7 @@ class Knowledge:
             return
 
         if content.file_data and self.vector_db.__class__.__name__ == "LightRag":
-
+            self._process_lightrag_content(content, KnowledgeContentOrigin.CONTENT)
             return
 
         read_documents = []
@@ -639,7 +1175,7 @@ class Knowledge:
             content_io = io.BytesIO(content_bytes)
 
             if content.reader:
-
+                log_debug(f"Using reader: {content.reader.__class__.__name__} to read content")
                 read_documents = content.reader.read(content_io, name=name)
             else:
                 text_reader = self.text_reader
@@ -663,21 +1199,21 @@ class Knowledge:
 
                 # Respect an explicitly provided reader; otherwise select based on file type
                 if content.reader:
-
+                    log_debug(f"Using reader: {content.reader.__class__.__name__} to read content")
                     reader = content.reader
                 else:
                     reader = self._select_reader(content.file_data.type)
                 name = content.name if content.name else f"content_{content.file_data.type}"
                 read_documents = reader.read(content_io, name=name)
-
-
-
-                    read_document.content_id = content.id
+                if not content.id:
+                    content.id = generate_id(content.content_hash or "")
+                self._prepare_documents_for_insert(read_documents, content.id, metadata=content.metadata)
 
                 if len(read_documents) == 0:
                     content.status = ContentStatus.FAILED
                     content.status_message = "Content could not be read"
                     self._update_content(content)
+                    return
 
         else:
             content.status = ContentStatus.FAILED
@@ -685,14 +1221,76 @@ class Knowledge:
             self._update_content(content)
             return
 
-
+        self._handle_vector_db_insert(content, read_documents, upsert)
+
+    async def _load_from_topics_async(
+        self,
+        content: Content,
+        upsert: bool,
+        skip_if_exists: bool,
+    ):
+        from agno.vectordb import VectorDb
+
+        self.vector_db = cast(VectorDb, self.vector_db)
+        log_info(f"Adding content from topics: {content.topics}")
+
+        if content.topics is None:
+            log_warning("No topics provided for content")
+            return
+
+        for topic in content.topics:
+            content = Content(
+                name=topic,
+                metadata=content.metadata,
+                reader=content.reader,
+                status=ContentStatus.PROCESSING if content.reader else ContentStatus.FAILED,
+                file_data=FileData(
+                    type="Topic",
+                ),
+                topics=[topic],
+            )
+            content.content_hash = self._build_content_hash(content)
+            content.id = generate_id(content.content_hash)
+
+            await self._add_to_contents_db_async(content)
+            if self._should_skip(content.content_hash, skip_if_exists):
+                content.status = ContentStatus.COMPLETED
+                await self._aupdate_content(content)
+                return
 
-
+            if self.vector_db.__class__.__name__ == "LightRag":
+                await self._process_lightrag_content_async(content, KnowledgeContentOrigin.TOPIC)
+                return
+
+            if self.vector_db and self.vector_db.content_hash_exists(content.content_hash) and skip_if_exists:
+                log_info(f"Content {content.content_hash} already exists, skipping")
+                continue
+
+            await self._add_to_contents_db_async(content)
+            if content.reader is None:
+                log_error(f"No reader available for topic: {topic}")
+                content.status = ContentStatus.FAILED
+                content.status_message = "No reader available for topic"
+                await self._aupdate_content(content)
+                continue
+
+            read_documents = await content.reader.async_read(topic)
+            if len(read_documents) > 0:
+                self._prepare_documents_for_insert(read_documents, content.id, calculate_sizes=True)
+            else:
+                content.status = ContentStatus.FAILED
+                content.status_message = "No content found for topic"
+                await self._aupdate_content(content)
+
+            await self._handle_vector_db_insert_async(content, read_documents, upsert)
+
+    def _load_from_topics(
         self,
         content: Content,
         upsert: bool,
         skip_if_exists: bool,
     ):
+        """Synchronous version of _load_from_topics."""
        from agno.vectordb import VectorDb
 
        self.vector_db = cast(VectorDb, self.vector_db)
@@ -723,9 +1321,14 @@ class Knowledge:
                 return

             if self.vector_db.__class__.__name__ == "LightRag":
-
+                self._process_lightrag_content(content, KnowledgeContentOrigin.TOPIC)
                 return

+            if self.vector_db and self.vector_db.content_hash_exists(content.content_hash) and skip_if_exists:
+                log_info(f"Content {content.content_hash} already exists, skipping")
+                continue
+
+            self._add_to_contents_db(content)
             if content.reader is None:
                 log_error(f"No reader available for topic: {topic}")
                 content.status = ContentStatus.FAILED
@@ -735,24 +1338,178 @@ class Knowledge:

             read_documents = content.reader.read(topic)
             if len(read_documents) > 0:
-
-                read_document.content_id = content.id
-                if read_document.content:
-                    read_document.size = len(read_document.content.encode("utf-8"))
+                self._prepare_documents_for_insert(read_documents, content.id, calculate_sizes=True)
             else:
                 content.status = ContentStatus.FAILED
                 content.status_message = "No content found for topic"
                 self._update_content(content)
-                continue

-
+            self._handle_vector_db_insert(content, read_documents, upsert)
+
+    async def _load_from_remote_content_async(
+        self,
+        content: Content,
+        upsert: bool,
+        skip_if_exists: bool,
+    ):
+        if content.remote_content is None:
+            log_warning("No remote content provided for content")
+            return
+
+        remote_content = content.remote_content
+
+        if isinstance(remote_content, S3Content):
+            await self._load_from_s3_async(content, upsert, skip_if_exists)
+
+        elif isinstance(remote_content, GCSContent):
+            await self._load_from_gcs_async(content, upsert, skip_if_exists)
+
+        else:
+            log_warning(f"Unsupported remote content type: {type(remote_content)}")
+
+    async def _load_from_s3_async(self, content: Content, upsert: bool, skip_if_exists: bool):
+        """Load the contextual S3 content.
+
+        1. Identify objects to read
+        2. Setup Content object
+        3. Hash content and add it to the contents database
+        4. Select reader
+        5. Fetch and load the content
+        6. Read the content
+        7. Prepare and insert the content in the vector database
+        8. Remove temporary file if needed
+        """
+        from agno.cloud.aws.s3.object import S3Object
+
+        remote_content: S3Content = cast(S3Content, content.remote_content)
+
+        # 1. Identify objects to read
+        objects_to_read: List[S3Object] = []
+        if remote_content.bucket is not None:
+            if remote_content.key is not None:
+                _object = S3Object(bucket_name=remote_content.bucket.name, name=remote_content.key)
+                objects_to_read.append(_object)
+            elif remote_content.object is not None:
+                objects_to_read.append(remote_content.object)
+            elif remote_content.prefix is not None:
+                objects_to_read.extend(remote_content.bucket.get_objects(prefix=remote_content.prefix))
+            else:
+                objects_to_read.extend(remote_content.bucket.get_objects())
+
+        for s3_object in objects_to_read:
+            # 2. Setup Content object
+            content_name = content.name or ""
+            content_name += "_" + (s3_object.name or "")
+            content_entry = Content(
+                name=content_name,
+                description=content.description,
+                status=ContentStatus.PROCESSING,
+                metadata=content.metadata,
+                file_type="s3",
+            )
+
+            # 3. Hash content and add it to the contents database
+            content_entry.content_hash = self._build_content_hash(content_entry)
+            content_entry.id = generate_id(content_entry.content_hash)
+            await self._add_to_contents_db_async(content_entry)
+            if self._should_skip(content_entry.content_hash, skip_if_exists):
+                content_entry.status = ContentStatus.COMPLETED
+                await self._aupdate_content(content_entry)
+                return
+
+            # 4. Select reader
+            reader = self._select_reader_by_uri(s3_object.uri, content.reader)
+            reader = cast(Reader, reader)
+
+            # 5. Fetch and load the content
+            temporary_file = None
+            obj_name = content_name or s3_object.name.split("/")[-1]
+            readable_content: Optional[Union[BytesIO, Path]] = None
+            if s3_object.uri.endswith(".pdf"):
+                readable_content = BytesIO(s3_object.get_resource().get()["Body"].read())
+            else:
+                temporary_file = Path("storage").joinpath(obj_name)
+                readable_content = temporary_file
+                s3_object.download(readable_content)  # type: ignore
+
+            # 6. Read the content
+            read_documents = await reader.async_read(readable_content, name=obj_name)
+
+            # 7. Prepare and insert the content in the vector database
+            if not content.id:
+                content.id = generate_id(content.content_hash or "")
+            self._prepare_documents_for_insert(read_documents, content.id)
+            await self._handle_vector_db_insert_async(content_entry, read_documents, upsert)
+
+            # 8. Remove temporary file if needed
+            if temporary_file:
+                temporary_file.unlink()
+
+    async def _load_from_gcs_async(self, content: Content, upsert: bool, skip_if_exists: bool):
+        """Load the contextual GCS content.
+
+        1. Identify objects to read
+        2. Setup Content object
+        3. Hash content and add it to the contents database
+        4. Select reader
+        5. Fetch and load the content
+        6. Read the content
+        7. Prepare and insert the content in the vector database
+        """
+        remote_content: GCSContent = cast(GCSContent, content.remote_content)
+
+        # 1. Identify objects to read
+        objects_to_read = []
+        if remote_content.blob_name is not None:
+            objects_to_read.append(remote_content.bucket.blob(remote_content.blob_name))  # type: ignore
+        elif remote_content.prefix is not None:
+            objects_to_read.extend(remote_content.bucket.list_blobs(prefix=remote_content.prefix))  # type: ignore
+        else:
+            objects_to_read.extend(remote_content.bucket.list_blobs())  # type: ignore
+
+        for gcs_object in objects_to_read:
+            # 2. Setup Content object
+            name = (content.name or "content") + "_" + gcs_object.name
+            content_entry = Content(
+                name=name,
+                description=content.description,
+                status=ContentStatus.PROCESSING,
+                metadata=content.metadata,
+                file_type="gcs",
+            )
+
+            # 3. Hash content and add it to the contents database
+            content_entry.content_hash = self._build_content_hash(content_entry)
+            content_entry.id = generate_id(content_entry.content_hash)
+            await self._add_to_contents_db_async(content_entry)
+            if self._should_skip(content_entry.content_hash, skip_if_exists):
+                content_entry.status = ContentStatus.COMPLETED
+                await self._aupdate_content(content_entry)
+                return
+
+            # 4. Select reader
+            reader = self._select_reader_by_uri(gcs_object.name, content.reader)
+            reader = cast(Reader, reader)
+
+            # 5. Fetch and load the content
+            readable_content = BytesIO(gcs_object.download_as_bytes())
+
+            # 6. Read the content
+            read_documents = await reader.async_read(readable_content, name=name)
+
+            # 7. Prepare and insert the content in the vector database
+            if not content.id:
+                content.id = generate_id(content.content_hash or "")
+            self._prepare_documents_for_insert(read_documents, content.id)
+            await self._handle_vector_db_insert_async(content_entry, read_documents, upsert)

-
+    def _load_from_remote_content(
         self,
         content: Content,
         upsert: bool,
         skip_if_exists: bool,
     ):
+        """Synchronous version of _load_from_remote_content."""
         if content.remote_content is None:
             log_warning("No remote content provided for content")
             return
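In both S3 loaders above, step 5 branches on the object type: `.pdf` objects are streamed straight into a `BytesIO`, while everything else is downloaded to a temporary file under `storage/` that the caller unlinks after reading. A condensed, standalone sketch of that branch — the `get_bytes` callable is a hypothetical stand-in for `S3Object.download`/`get_resource()`:

```python
from io import BytesIO
from pathlib import Path
from typing import Callable, Optional, Tuple, Union


def fetch_readable(
    uri: str, obj_name: str, get_bytes: Callable[[], bytes]
) -> Tuple[Union[BytesIO, Path], Optional[Path]]:
    """Return (readable_content, temporary_file) the way step 5 above does."""
    if uri.endswith(".pdf"):
        # PDFs are read fully in memory; nothing to clean up afterwards.
        return BytesIO(get_bytes()), None
    temporary_file = Path("storage").joinpath(obj_name)
    temporary_file.parent.mkdir(exist_ok=True)
    temporary_file.write_bytes(get_bytes())  # stand-in for s3_object.download(...)
    return temporary_file, temporary_file


# After reading, the loader unlinks the temp file:
# if temporary_file: temporary_file.unlink()
```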
@@ -760,17 +1517,18 @@ class Knowledge:
         remote_content = content.remote_content

         if isinstance(remote_content, S3Content):
-
+            self._load_from_s3(content, upsert, skip_if_exists)

         elif isinstance(remote_content, GCSContent):
-
+            self._load_from_gcs(content, upsert, skip_if_exists)

         else:
             log_warning(f"Unsupported remote content type: {type(remote_content)}")

-
-        """
+    def _load_from_s3(self, content: Content, upsert: bool, skip_if_exists: bool):
+        """Synchronous version of _load_from_s3.

+        Load the contextual S3 content:
         1. Identify objects to read
         2. Setup Content object
         3. Hash content and add it to the contents database
@@ -819,20 +1577,7 @@ class Knowledge:
                 return

             # 4. Select reader
-            reader = content.reader
-            if reader is None:
-                if s3_object.uri.endswith(".pdf"):
-                    reader = self.pdf_reader
-                elif s3_object.uri.endswith(".csv"):
-                    reader = self.csv_reader
-                elif s3_object.uri.endswith(".docx"):
-                    reader = self.docx_reader
-                elif s3_object.uri.endswith(".json"):
-                    reader = self.json_reader
-                elif s3_object.uri.endswith(".markdown"):
-                    reader = self.markdown_reader
-                else:
-                    reader = self.text_reader
+            reader = self._select_reader_by_uri(s3_object.uri, content.reader)
             reader = cast(Reader, reader)

             # 5. Fetch and load the content
@@ -850,17 +1595,19 @@ class Knowledge:
             read_documents = reader.read(readable_content, name=obj_name)

             # 7. Prepare and insert the content in the vector database
-
-
-
+            if not content.id:
+                content.id = generate_id(content.content_hash or "")
+            self._prepare_documents_for_insert(read_documents, content.id)
+            self._handle_vector_db_insert(content_entry, read_documents, upsert)

             # 8. Remove temporary file if needed
             if temporary_file:
                 temporary_file.unlink()

-
-        """
+    def _load_from_gcs(self, content: Content, upsert: bool, skip_if_exists: bool):
+        """Synchronous version of _load_from_gcs.

+        Load the contextual GCS content:
         1. Identify objects to read
         2. Setup Content object
         3. Hash content and add it to the contents database
@@ -901,20 +1648,7 @@ class Knowledge:
                 return

             # 4. Select reader
-            reader = content.reader
-            if reader is None:
-                if gcs_object.name.endswith(".pdf"):
-                    reader = self.pdf_reader
-                elif gcs_object.name.endswith(".csv"):
-                    reader = self.csv_reader
-                elif gcs_object.name.endswith(".docx"):
-                    reader = self.docx_reader
-                elif gcs_object.name.endswith(".json"):
-                    reader = self.json_reader
-                elif gcs_object.name.endswith(".markdown"):
-                    reader = self.markdown_reader
-                else:
-                    reader = self.text_reader
+            reader = self._select_reader_by_uri(gcs_object.name, content.reader)
             reader = cast(Reader, reader)

             # 5. Fetch and load the content
@@ -924,11 +1658,12 @@ class Knowledge:
             read_documents = reader.read(readable_content, name=name)

             # 7. Prepare and insert the content in the vector database
-
-
-
+            if not content.id:
+                content.id = generate_id(content.content_hash or "")
+            self._prepare_documents_for_insert(read_documents, content.id)
+            self._handle_vector_db_insert(content_entry, read_documents, upsert)

-    async def
+    async def _handle_vector_db_insert_async(self, content: Content, read_documents, upsert):
         from agno.vectordb import VectorDb

         self.vector_db = cast(VectorDb, self.vector_db)
@@ -937,7 +1672,7 @@ class Knowledge:
             log_error("No vector database configured")
             content.status = ContentStatus.FAILED
             content.status_message = "No vector database configured"
-            self.
+            await self._aupdate_content(content)
             return

         if self.vector_db.upsert_available() and upsert:
@@ -947,7 +1682,7 @@ class Knowledge:
                 log_error(f"Error upserting document: {e}")
                 content.status = ContentStatus.FAILED
                 content.status_message = "Could not upsert embedding"
-                self.
+                await self._aupdate_content(content)
                 return
         else:
             try:
@@ -956,6 +1691,45 @@ class Knowledge:
                     documents=read_documents,
                     filters=content.metadata,  # type: ignore[arg-type]
                 )
+            except Exception as e:
+                log_error(f"Error inserting document: {e}")
+                content.status = ContentStatus.FAILED
+                content.status_message = "Could not insert embedding"
+                await self._aupdate_content(content)
+                return
+
+        content.status = ContentStatus.COMPLETED
+        await self._aupdate_content(content)
+
+    def _handle_vector_db_insert(self, content: Content, read_documents, upsert):
+        """Synchronously handle vector database insertion."""
+        from agno.vectordb import VectorDb
+
+        self.vector_db = cast(VectorDb, self.vector_db)
+
+        if not self.vector_db:
+            log_error("No vector database configured")
+            content.status = ContentStatus.FAILED
+            content.status_message = "No vector database configured"
+            self._update_content(content)
+            return
+
+        if self.vector_db.upsert_available() and upsert:
+            try:
+                self.vector_db.upsert(content.content_hash, read_documents, content.metadata)  # type: ignore[arg-type]
+            except Exception as e:
+                log_error(f"Error upserting document: {e}")
+                content.status = ContentStatus.FAILED
+                content.status_message = "Could not upsert embedding"
+                self._update_content(content)
+                return
+        else:
+            try:
+                self.vector_db.insert(
+                    content.content_hash,  # type: ignore[arg-type]
+                    documents=read_documents,
+                    filters=content.metadata,  # type: ignore[arg-type]
+                )
             except Exception as e:
                 log_error(f"Error inserting document: {e}")
                 content.status = ContentStatus.FAILED
@@ -966,7 +1740,7 @@ class Knowledge:
         content.status = ContentStatus.COMPLETED
         self._update_content(content)

-
+    def _load_content(
         self,
         content: Content,
         upsert: bool,
@@ -974,42 +1748,93 @@ class Knowledge:
         include: Optional[List[str]] = None,
         exclude: Optional[List[str]] = None,
     ) -> None:
-
+        """Synchronously load content."""
+        if content.path:
+            self._load_from_path(content, upsert, skip_if_exists, include, exclude)
+
+        if content.url:
+            self._load_from_url(content, upsert, skip_if_exists)
+
+        if content.file_data:
+            self._load_from_content(content, upsert, skip_if_exists)
+
+        if content.topics:
+            self._load_from_topics(content, upsert, skip_if_exists)

-        if content.
-            self.
+        if content.remote_content:
+            self._load_from_remote_content(content, upsert, skip_if_exists)

+    async def _load_content_async(
+        self,
+        content: Content,
+        upsert: bool,
+        skip_if_exists: bool,
+        include: Optional[List[str]] = None,
+        exclude: Optional[List[str]] = None,
+    ) -> None:
         if content.path:
-            await self.
+            await self._load_from_path_async(content, upsert, skip_if_exists, include, exclude)

         if content.url:
-            await self.
+            await self._load_from_url_async(content, upsert, skip_if_exists)

         if content.file_data:
-            await self.
+            await self._load_from_content_async(content, upsert, skip_if_exists)

         if content.topics:
-            await self.
+            await self._load_from_topics_async(content, upsert, skip_if_exists)

         if content.remote_content:
-            await self.
+            await self._load_from_remote_content_async(content, upsert, skip_if_exists)

     def _build_content_hash(self, content: Content) -> str:
         """
         Build the content hash from the content.
+
+        For URLs and paths, includes the name and description in the hash if provided
+        to ensure unique content with the same URL/path but different names/descriptions
+        get different hashes.
+
+        Hash format:
+        - URL with name and description: hash("{name}:{description}:{url}")
+        - URL with name only: hash("{name}:{url}")
+        - URL with description only: hash("{description}:{url}")
+        - URL without name/description: hash("{url}") (backward compatible)
+        - Same logic applies to paths
         """
+        hash_parts = []
+        if content.name:
+            hash_parts.append(content.name)
+        if content.description:
+            hash_parts.append(content.description)
+
         if content.path:
-
+            hash_parts.append(str(content.path))
         elif content.url:
-
-            return hash
+            hash_parts.append(content.url)
         elif content.file_data and content.file_data.content:
-
-
+            # For file_data, always add filename, type, size, or content for uniqueness
+            if content.file_data.filename:
+                hash_parts.append(content.file_data.filename)
+            elif content.file_data.type:
+                hash_parts.append(content.file_data.type)
+            elif content.file_data.size is not None:
+                hash_parts.append(str(content.file_data.size))
+            else:
+                # Fallback: use the content for uniqueness
+                # Include type information to distinguish str vs bytes
+                content_type = "str" if isinstance(content.file_data.content, str) else "bytes"
+                content_bytes = (
+                    content.file_data.content.encode()
+                    if isinstance(content.file_data.content, str)
+                    else content.file_data.content
+                )
+                content_hash = hashlib.sha256(content_bytes).hexdigest()[:16]  # Use first 16 chars
+                hash_parts.append(f"{content_type}:{content_hash}")
         elif content.topics and len(content.topics) > 0:
             topic = content.topics[0]
             reader = type(content.reader).__name__ if content.reader else "unknown"
-
+            hash_parts.append(f"{topic}-{reader}")
         else:
             # Fallback for edge cases
             import random
@@ -1020,7 +1845,10 @@ class Knowledge:
                 or content.id
                 or ("unknown_content" + "".join(random.choices(string.ascii_lowercase + string.digits, k=6)))
             )
-
+            hash_parts.append(fallback)
+
+        hash_input = ":".join(hash_parts)
+        return hashlib.sha256(hash_input.encode()).hexdigest()

     def _ensure_string_field(self, value: Any, field_name: str, default: str = "") -> str:
         """
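The new `_build_content_hash` composes its input from the optional name and description plus the source identifier, joined with `:` before hashing. A minimal standalone sketch of the documented format, using nothing but `hashlib`:

```python
import hashlib
from typing import Optional


def build_content_hash(url: str, name: Optional[str] = None, description: Optional[str] = None) -> str:
    # Mirrors the documented format: optional name and description are
    # joined with the URL by ":" before the sha256 digest is taken.
    parts = [p for p in (name, description, url) if p]
    return hashlib.sha256(":".join(parts).encode()).hexdigest()


# hash("{name}:{url}") when only a name is provided:
assert build_content_hash("https://example.com", name="docs") == hashlib.sha256(
    b"docs:https://example.com"
).hexdigest()
```

This is why two Content entries pointing at the same URL but carrying different names now land in separate rows, while name-less entries keep their pre-2.3 hashes.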
@@ -1064,8 +1892,57 @@ class Knowledge:
             # Already a string, return as-is
             return value

+    async def _add_to_contents_db_async(self, content: Content):
+        if self.contents_db:
+            created_at = content.created_at if content.created_at else int(time.time())
+            updated_at = content.updated_at if content.updated_at else int(time.time())
+
+            file_type = (
+                content.file_type
+                if content.file_type
+                else content.file_data.type
+                if content.file_data and content.file_data.type
+                else None
+            )
+            # Safely handle string fields with proper type checking
+            safe_name = self._ensure_string_field(content.name, "content.name", default="")
+            safe_description = self._ensure_string_field(content.description, "content.description", default="")
+            safe_linked_to = self._ensure_string_field(self.name, "knowledge.name", default="")
+            safe_status_message = self._ensure_string_field(
+                content.status_message, "content.status_message", default=""
+            )
+
+            content_row = KnowledgeRow(
+                id=content.id,
+                name=safe_name,
+                description=safe_description,
+                metadata=content.metadata,
+                type=file_type,
+                size=content.size
+                if content.size
+                else len(content.file_data.content)
+                if content.file_data and content.file_data.content
+                else None,
+                linked_to=safe_linked_to,
+                access_count=0,
+                status=content.status if content.status else ContentStatus.PROCESSING,
+                status_message=safe_status_message,
+                created_at=created_at,
+                updated_at=updated_at,
+            )
+            if isinstance(self.contents_db, AsyncBaseDb):
+                await self.contents_db.upsert_knowledge_content(knowledge_row=content_row)
+            else:
+                self.contents_db.upsert_knowledge_content(knowledge_row=content_row)
+
     def _add_to_contents_db(self, content: Content):
+        """Synchronously add content to contents database."""
         if self.contents_db:
+            if isinstance(self.contents_db, AsyncBaseDb):
+                raise ValueError(
+                    "_add_to_contents_db() is not supported with an async DB. Please use add_content_async with AsyncDb."
+                )
+
             created_at = content.created_at if content.created_at else int(time.time())
             updated_at = content.updated_at if content.updated_at else int(time.time())

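Both `_add_to_contents_db*` variants route the final upsert on the DB flavor: `AsyncBaseDb` implementations are awaited, classic ones are called directly, and the sync entry point refuses async DBs with a `ValueError`. A condensed sketch of that dispatch pattern — the `AsyncBaseDb` import path is an assumption, since the diff only shows the class name:

```python
from agno.db.base import AsyncBaseDb  # assumed import path for the base class


async def upsert_row(contents_db, content_row) -> None:
    # Dual-path dispatch mirroring _add_to_contents_db_async above: await
    # async DBs, call sync DBs inline, one code path for both flavors.
    if isinstance(contents_db, AsyncBaseDb):
        await contents_db.upsert_knowledge_content(knowledge_row=content_row)
    else:
        contents_db.upsert_knowledge_content(knowledge_row=content_row)
```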
@@ -1108,55 +1985,261 @@ class Knowledge:
         from agno.vectordb import VectorDb

         self.vector_db = cast(VectorDb, self.vector_db)
+        if self.contents_db:
+            if isinstance(self.contents_db, AsyncBaseDb):
+                raise ValueError(
+                    "update_content() is not supported with an async DB. Please use aupdate_content() instead."
+                )
+
+            if not content.id:
+                log_warning("Content id is required to update Knowledge content")
+                return None
+
+            # TODO: we shouldn't check for content here, we should trust the upsert method to handle conflicts
+            content_row = self.contents_db.get_knowledge_content(content.id)
+            if content_row is None:
+                log_warning(f"Content row not found for id: {content.id}, cannot update status")
+                return None
+
+            # Apply safe string handling for updates as well
+            if content.name is not None:
+                content_row.name = self._ensure_string_field(content.name, "content.name", default="")
+            if content.description is not None:
+                content_row.description = self._ensure_string_field(
+                    content.description, "content.description", default=""
+                )
+            if content.metadata is not None:
+                content_row.metadata = content.metadata
+            if content.status is not None:
+                content_row.status = content.status
+            if content.status_message is not None:
+                content_row.status_message = self._ensure_string_field(
+                    content.status_message, "content.status_message", default=""
+                )
+            if content.external_id is not None:
+                content_row.external_id = self._ensure_string_field(
+                    content.external_id, "content.external_id", default=""
+                )
+            content_row.updated_at = int(time.time())
+            self.contents_db.upsert_knowledge_content(knowledge_row=content_row)
+
+            if self.vector_db:
+                self.vector_db.update_metadata(content_id=content.id, metadata=content.metadata or {})
+
+            return content_row.to_dict()
+
+        else:
+            if self.name:
+                log_warning(f"Contents DB not found for knowledge base: {self.name}")
+            else:
+                log_warning("Contents DB not found for knowledge base")
+            return None
+
+    async def _aupdate_content(self, content: Content) -> Optional[Dict[str, Any]]:
         if self.contents_db:
             if not content.id:
                 log_warning("Content id is required to update Knowledge content")
                 return None

-            # TODO: we shouldn't check for content here, we should trust the upsert method to handle conflicts
-
-
-
-
+            # TODO: we shouldn't check for content here, we should trust the upsert method to handle conflicts
+            if isinstance(self.contents_db, AsyncBaseDb):
+                content_row = await self.contents_db.get_knowledge_content(content.id)
+            else:
+                content_row = self.contents_db.get_knowledge_content(content.id)
+            if content_row is None:
+                log_warning(f"Content row not found for id: {content.id}, cannot update status")
+                return None
+
+            if content.name is not None:
+                content_row.name = content.name
+            if content.description is not None:
+                content_row.description = content.description
+            if content.metadata is not None:
+                content_row.metadata = content.metadata
+            if content.status is not None:
+                content_row.status = content.status
+            if content.status_message is not None:
+                content_row.status_message = content.status_message if content.status_message else ""
+            if content.external_id is not None:
+                content_row.external_id = content.external_id
+
+            content_row.updated_at = int(time.time())
+            if isinstance(self.contents_db, AsyncBaseDb):
+                await self.contents_db.upsert_knowledge_content(knowledge_row=content_row)
+            else:
+                self.contents_db.upsert_knowledge_content(knowledge_row=content_row)
+
+            if self.vector_db:
+                self.vector_db.update_metadata(content_id=content.id, metadata=content.metadata or {})
+
+            return content_row.to_dict()
+
+        else:
+            if self.name:
+                log_warning(f"Contents DB not found for knowledge base: {self.name}")
+            else:
+                log_warning("Contents DB not found for knowledge base")
+            return None
+
+    async def _process_lightrag_content_async(self, content: Content, content_type: KnowledgeContentOrigin) -> None:
+        from agno.vectordb import VectorDb
+
+        self.vector_db = cast(VectorDb, self.vector_db)
+
+        await self._add_to_contents_db_async(content)
+        if content_type == KnowledgeContentOrigin.PATH:
+            if content.file_data is None:
+                log_warning("No file data provided")
+
+            if content.path is None:
+                log_error("No path provided for content")
+                return
+
+            path = Path(content.path)
+
+            log_info(f"Uploading file to LightRAG from path: {path}")
+            try:
+                # Read the file content from path
+                with open(path, "rb") as f:
+                    file_content = f.read()
+
+                # Get file type from extension or content.file_type
+                file_type = content.file_type or path.suffix
+
+                if self.vector_db and hasattr(self.vector_db, "insert_file_bytes"):
+                    result = await self.vector_db.insert_file_bytes(
+                        file_content=file_content,
+                        filename=path.name,  # Use the original filename with extension
+                        content_type=file_type,
+                        send_metadata=True,  # Enable metadata so server knows the file type
+                    )
+
+                else:
+                    log_error("Vector database does not support file insertion")
+                    content.status = ContentStatus.FAILED
+                    await self._aupdate_content(content)
+                    return
+                content.external_id = result
+                content.status = ContentStatus.COMPLETED
+                await self._aupdate_content(content)
+                return
+
+            except Exception as e:
+                log_error(f"Error uploading file to LightRAG: {e}")
+                content.status = ContentStatus.FAILED
+                content.status_message = f"Could not upload to LightRAG: {str(e)}"
+                await self._aupdate_content(content)
+                return
+
+        elif content_type == KnowledgeContentOrigin.URL:
+            log_info(f"Uploading file to LightRAG from URL: {content.url}")
+            try:
+                reader = content.reader or self.website_reader
+                if reader is None:
+                    log_error("No URL reader available")
+                    content.status = ContentStatus.FAILED
+                    await self._aupdate_content(content)
+                    return
+
+                reader.chunk = False
+                read_documents = reader.read(content.url, name=content.name)
+                if not content.id:
+                    content.id = generate_id(content.content_hash or "")
+                self._prepare_documents_for_insert(read_documents, content.id)
+
+                if not read_documents:
+                    log_error("No documents read from URL")
+                    content.status = ContentStatus.FAILED
+                    await self._aupdate_content(content)
+                    return
+
+                if self.vector_db and hasattr(self.vector_db, "insert_text"):
+                    result = await self.vector_db.insert_text(
+                        file_source=content.url,
+                        text=read_documents[0].content,
+                    )
+                else:
+                    log_error("Vector database does not support text insertion")
+                    content.status = ContentStatus.FAILED
+                    await self._aupdate_content(content)
+                    return
+
+                content.external_id = result
+                content.status = ContentStatus.COMPLETED
+                await self._aupdate_content(content)
+                return
+
+            except Exception as e:
+                log_error(f"Error uploading file to LightRAG: {e}")
+                content.status = ContentStatus.FAILED
+                content.status_message = f"Could not upload to LightRAG: {str(e)}"
+                await self._aupdate_content(content)
+                return
+
+        elif content_type == KnowledgeContentOrigin.CONTENT:
+            filename = (
+                content.file_data.filename if content.file_data and content.file_data.filename else "uploaded_file"
+            )
+            log_info(f"Uploading file to LightRAG: {filename}")

-            #
-            if content.
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-            )
-
-            self.contents_db.upsert_knowledge_content(knowledge_row=content_row)
+            # Use the content from file_data
+            if content.file_data and content.file_data.content:
+                if self.vector_db and hasattr(self.vector_db, "insert_file_bytes"):
+                    result = await self.vector_db.insert_file_bytes(
+                        file_content=content.file_data.content,
+                        filename=filename,
+                        content_type=content.file_data.type,
+                        send_metadata=True,  # Enable metadata so server knows the file type
+                    )
+                else:
+                    log_error("Vector database does not support file insertion")
+                    content.status = ContentStatus.FAILED
+                    await self._aupdate_content(content)
+                    return
+                content.external_id = result
+                content.status = ContentStatus.COMPLETED
+                await self._aupdate_content(content)
+            else:
+                log_warning(f"No file data available for LightRAG upload: {content.name}")
+                return

-
-
+        elif content_type == KnowledgeContentOrigin.TOPIC:
+            log_info(f"Uploading file to LightRAG: {content.name}")

-            if content.
-
+            if content.reader is None:
+                log_error("No reader available for topic content")
+                content.status = ContentStatus.FAILED
+                await self._aupdate_content(content)
+                return

-
+            if not content.topics:
+                log_error("No topics available for content")
+                content.status = ContentStatus.FAILED
+                await self._aupdate_content(content)
+                return

-
-            if
-
+            read_documents = content.reader.read(content.topics)
+            if len(read_documents) > 0:
+                if self.vector_db and hasattr(self.vector_db, "insert_text"):
+                    result = await self.vector_db.insert_text(
+                        file_source=content.topics[0],
+                        text=read_documents[0].content,
+                    )
+                else:
+                    log_error("Vector database does not support text insertion")
+                    content.status = ContentStatus.FAILED
+                    await self._aupdate_content(content)
+                    return
+                content.external_id = result
+                content.status = ContentStatus.COMPLETED
+                await self._aupdate_content(content)
+                return
             else:
-                log_warning("
-
+                log_warning(f"No documents found for LightRAG upload: {content.name}")
+                return

-
+    def _process_lightrag_content(self, content: Content, content_type: KnowledgeContentOrigin) -> None:
+        """Synchronously process LightRAG content. Uses asyncio.run() only for LightRAG-specific async methods."""
         from agno.vectordb import VectorDb

         self.vector_db = cast(VectorDb, self.vector_db)
@@ -1182,13 +2265,15 @@ class Knowledge:
                 file_type = content.file_type or path.suffix

                 if self.vector_db and hasattr(self.vector_db, "insert_file_bytes"):
-
-
-
-
-
+                    # LightRAG only has async methods, use asyncio.run() here
+                    result = asyncio.run(
+                        self.vector_db.insert_file_bytes(
+                            file_content=file_content,
+                            filename=path.name,
+                            content_type=file_type,
+                            send_metadata=True,
+                        )
                     )
-
                 else:
                     log_error("Vector database does not support file insertion")
                     content.status = ContentStatus.FAILED
@@ -1218,9 +2303,9 @@ class Knowledge:

                 reader.chunk = False
                 read_documents = reader.read(content.url, name=content.name)
-
-
-
+                if not content.id:
+                    content.id = generate_id(content.content_hash or "")
+                self._prepare_documents_for_insert(read_documents, content.id)

                 if not read_documents:
                     log_error("No documents read from URL")
@@ -1229,9 +2314,12 @@ class Knowledge:
                     return

                 if self.vector_db and hasattr(self.vector_db, "insert_text"):
-
-
-
+                    # LightRAG only has async methods, use asyncio.run() here
+                    result = asyncio.run(
+                        self.vector_db.insert_text(
+                            file_source=content.url,
+                            text=read_documents[0].content,
+                        )
                     )
                 else:
                     log_error("Vector database does not support text insertion")
@@ -1260,11 +2348,14 @@ class Knowledge:
             # Use the content from file_data
             if content.file_data and content.file_data.content:
                 if self.vector_db and hasattr(self.vector_db, "insert_file_bytes"):
-
-
-
-
-
+                    # LightRAG only has async methods, use asyncio.run() here
+                    result = asyncio.run(
+                        self.vector_db.insert_file_bytes(
+                            file_content=content.file_data.content,
+                            filename=filename,
+                            content_type=content.file_data.type,
+                            send_metadata=True,
+                        )
                     )
                 else:
                     log_error("Vector database does not support file insertion")
@@ -1296,9 +2387,12 @@ class Knowledge:
             read_documents = content.reader.read(content.topics)
             if len(read_documents) > 0:
                 if self.vector_db and hasattr(self.vector_db, "insert_text"):
-
-
-
+                    # LightRAG only has async methods, use asyncio.run() here
+                    result = asyncio.run(
+                        self.vector_db.insert_text(
+                            file_source=content.topics[0],
+                            text=read_documents[0].content,
+                        )
                     )
                 else:
                     log_error("Vector database does not support text insertion")
@@ -1314,13 +2408,24 @@ class Knowledge:
             return

     def search(
-        self,
+        self,
+        query: str,
+        max_results: Optional[int] = None,
+        filters: Optional[Union[Dict[str, Any], List[FilterExpr]]] = None,
+        search_type: Optional[str] = None,
     ) -> List[Document]:
         """Returns relevant documents matching a query"""
-
         from agno.vectordb import VectorDb
+        from agno.vectordb.search import SearchType

         self.vector_db = cast(VectorDb, self.vector_db)
+
+        if (
+            hasattr(self.vector_db, "search_type")
+            and isinstance(self.vector_db.search_type, SearchType)
+            and search_type
+        ):
+            self.vector_db.search_type = SearchType(search_type)
         try:
             if self.vector_db is None:
                 log_warning("No vector db provided")
@@ -1334,13 +2439,23 @@ class Knowledge:
         return []

     async def async_search(
-        self,
+        self,
+        query: str,
+        max_results: Optional[int] = None,
+        filters: Optional[Union[Dict[str, Any], List[FilterExpr]]] = None,
+        search_type: Optional[str] = None,
     ) -> List[Document]:
         """Returns relevant documents matching a query"""
-
         from agno.vectordb import VectorDb
+        from agno.vectordb.search import SearchType

         self.vector_db = cast(VectorDb, self.vector_db)
+        if (
+            hasattr(self.vector_db, "search_type")
+            and isinstance(self.vector_db.search_type, SearchType)
+            and search_type
+        ):
+            self.vector_db.search_type = SearchType(search_type)
         try:
             if self.vector_db is None:
                 log_warning("No vector db provided")
@@ -1358,57 +2473,90 @@ class Knowledge:
             return []

     def get_valid_filters(self) -> Set[str]:
-        if self.
-
-
-
+        if self.contents_db is None:
+            log_warning("No contents db provided. This is required for filtering.")
+            return set()
+        contents, _ = self.get_content()
+        valid_filters: Set[str] = set()
+        for content in contents:
+            if content.metadata:
+                valid_filters.update(content.metadata.keys())
+
+        return valid_filters
+
+    async def async_get_valid_filters(self) -> Set[str]:
+        if self.contents_db is None:
+            log_warning("No contents db provided. This is required for filtering.")
+            return set()
+        contents, _ = await self.aget_content()
+        valid_filters: Set[str] = set()
+        for content in contents:
+            if content.metadata:
+                valid_filters.update(content.metadata.keys())

-
-        if self.valid_metadata_filters is None:
-            self.valid_metadata_filters = set()
-        self.valid_metadata_filters.update(self._get_filters_from_db)
+        return valid_filters

+    def _validate_filters(
+        self, filters: Union[Dict[str, Any], List[FilterExpr]], valid_metadata_filters: Set[str]
+    ) -> Tuple[Union[Dict[str, Any], List[FilterExpr]], List[str]]:
         if not filters:
             return {}, []

-        valid_filters: Dict[str, Any] = {}
+        valid_filters: Union[Dict[str, Any], List[FilterExpr]] = {}
         invalid_keys = []

-
-
-
-
-
-
-
-
-
-
-
-
-
-
+        if isinstance(filters, dict):
+            # If no metadata filters tracked yet, all keys are considered invalid
+            if valid_metadata_filters is None or not valid_metadata_filters:
+                invalid_keys = list(filters.keys())
+                log_warning(
+                    f"No valid metadata filters tracked yet. All filter keys considered invalid: {invalid_keys}"
+                )
+                return {}, invalid_keys
+
+            for key, value in filters.items():
+                # Handle both normal keys and prefixed keys like meta_data.key
+                base_key = key.split(".")[-1] if "." in key else key
+                if base_key in valid_metadata_filters or key in valid_metadata_filters:
+                    valid_filters[key] = value  # type: ignore
+                else:
+                    invalid_keys.append(key)
+                    log_warning(f"Invalid filter key: {key} - not present in knowledge base")
+
+        elif isinstance(filters, List):
+            # Validate that list contains FilterExpr instances
+            for i, filter_item in enumerate(filters):
+                if not isinstance(filter_item, FilterExpr):
+                    log_warning(
+                        f"Invalid filter at index {i}: expected FilterExpr instance, "
+                        f"got {type(filter_item).__name__}. "
+                        f"Use filter expressions like EQ('key', 'value'), IN('key', [values]), "
+                        f"AND(...), OR(...), NOT(...) from agno.filters"
+                    )
+            # Filter expressions are already validated, return empty dict/list
+            # The actual filtering happens in the vector_db layer
+            return filters, []

         return valid_filters, invalid_keys

-    def
-
-
+    def validate_filters(
+        self, filters: Union[Dict[str, Any], List[FilterExpr]]
+    ) -> Tuple[Union[Dict[str, Any], List[FilterExpr]], List[str]]:
+        valid_filters_from_db = self.get_valid_filters()

-
-        for key in metadata.keys():
-            self.valid_metadata_filters.add(key)
+        valid_filters, invalid_keys = self._validate_filters(filters, valid_filters_from_db)

-
-
-
-
-
-
-
-
-
-
+        return valid_filters, invalid_keys
+
+    async def async_validate_filters(
+        self, filters: Union[Dict[str, Any], List[FilterExpr]]
+    ) -> Tuple[Union[Dict[str, Any], List[FilterExpr]], List[str]]:
+        """Return a tuple containing a dict with all valid filters and a list of invalid filter keys"""
+        valid_filters_from_db = await self.async_get_valid_filters()
+
+        valid_filters, invalid_keys = self._validate_filters(filters, valid_filters_from_db)
+
+        return valid_filters, invalid_keys

     def remove_vector_by_id(self, id: str) -> bool:
         from agno.vectordb import VectorDb
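`validate_filters()` now accepts either a plain dict, validated against the metadata keys recorded in the contents db, or a list of `FilterExpr` objects, which pass through untouched and are evaluated inside the vector db layer. A hedged sketch using the expression helpers the warning message above points to (exact constructor signatures are assumptions based on that message):

```python
from agno.filters import AND, EQ, IN
from agno.knowledge.knowledge import Knowledge


def demo_filters(knowledge: Knowledge):
    # Dict form: unknown keys come back in invalid_keys, with a warning logged.
    valid, invalid_keys = knowledge.validate_filters({"author": "jane", "not_a_key": 1})

    # Expression form: returned as-is; the vector db applies them at query time.
    exprs = [AND(EQ("author", "jane"), IN("year", [2024, 2025]))]
    valid_exprs, _ = knowledge.validate_filters(exprs)
    return valid, invalid_keys, valid_exprs
```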
@@ -1442,10 +2590,46 @@ class Knowledge:
     def patch_content(self, content: Content) -> Optional[Dict[str, Any]]:
         return self._update_content(content)

+    async def apatch_content(self, content: Content) -> Optional[Dict[str, Any]]:
+        return await self._aupdate_content(content)
+
     def get_content_by_id(self, content_id: str) -> Optional[Content]:
         if self.contents_db is None:
             raise ValueError("No contents db provided")
+
+        if isinstance(self.contents_db, AsyncBaseDb):
+            raise ValueError(
+                "get_content_by_id() is not supported for async databases. Please use aget_content_by_id() instead."
+            )
+
         content_row = self.contents_db.get_knowledge_content(content_id)
+
+        if content_row is None:
+            return None
+        content = Content(
+            id=content_row.id,
+            name=content_row.name,
+            description=content_row.description,
+            metadata=content_row.metadata,
+            file_type=content_row.type,
+            size=content_row.size,
+            status=ContentStatus(content_row.status) if content_row.status else None,
+            status_message=content_row.status_message,
+            created_at=content_row.created_at,
+            updated_at=content_row.updated_at if content_row.updated_at else content_row.created_at,
+            external_id=content_row.external_id,
+        )
+        return content
+
+    async def aget_content_by_id(self, content_id: str) -> Optional[Content]:
+        if self.contents_db is None:
+            raise ValueError("No contents db provided")
+
+        if isinstance(self.contents_db, AsyncBaseDb):
+            content_row = await self.contents_db.get_knowledge_content(content_id)
+        else:
+            content_row = self.contents_db.get_knowledge_content(content_id)
+
         if content_row is None:
             return None
         content = Content(
@@ -1472,6 +2656,10 @@ class Knowledge:
     ) -> Tuple[List[Content], int]:
         if self.contents_db is None:
             raise ValueError("No contents db provided")
+
+        if isinstance(self.contents_db, AsyncBaseDb):
+            raise ValueError("get_content() is not supported for async databases. Please use aget_content() instead.")
+
         contents, count = self.contents_db.get_knowledge_contents(
             limit=limit, page=page, sort_by=sort_by, sort_order=sort_order
         )
@@ -1495,9 +2683,53 @@ class Knowledge:
             result.append(content)
         return result, count

+    async def aget_content(
+        self,
+        limit: Optional[int] = None,
+        page: Optional[int] = None,
+        sort_by: Optional[str] = None,
+        sort_order: Optional[str] = None,
+    ) -> Tuple[List[Content], int]:
+        if self.contents_db is None:
+            raise ValueError("No contents db provided")
+
+        if isinstance(self.contents_db, AsyncBaseDb):
+            contents, count = await self.contents_db.get_knowledge_contents(
+                limit=limit, page=page, sort_by=sort_by, sort_order=sort_order
+            )
+        else:
+            contents, count = self.contents_db.get_knowledge_contents(
+                limit=limit, page=page, sort_by=sort_by, sort_order=sort_order
+            )
+
+        result = []
+        for content_row in contents:
+            # Create Content from database row
+            content = Content(
+                id=content_row.id,
+                name=content_row.name,
+                description=content_row.description,
+                metadata=content_row.metadata,
+                size=content_row.size,
+                file_type=content_row.type,
+                status=ContentStatus(content_row.status) if content_row.status else None,
+                status_message=content_row.status_message,
+                created_at=content_row.created_at,
+                updated_at=content_row.updated_at if content_row.updated_at else content_row.created_at,
+                external_id=content_row.external_id,
+            )
+            result.append(content)
+        return result, count
+
     def get_content_status(self, content_id: str) -> Tuple[Optional[ContentStatus], Optional[str]]:
         if self.contents_db is None:
             raise ValueError("No contents db provided")
+
+        if isinstance(self.contents_db, AsyncBaseDb):
+            raise ValueError(
+                "get_content_status() is not supported for async databases. Please use aget_content_status() instead."
+            )
+
         content_row = self.contents_db.get_knowledge_content(content_id)
         if content_row is None:
             return None, "Content not found"
@@ -1517,6 +2749,33 @@ class Knowledge:

         return status, content_row.status_message

+    async def aget_content_status(self, content_id: str) -> Tuple[Optional[ContentStatus], Optional[str]]:
+        if self.contents_db is None:
+            raise ValueError("No contents db provided")
+
+        if isinstance(self.contents_db, AsyncBaseDb):
+            content_row = await self.contents_db.get_knowledge_content(content_id)
+        else:
+            content_row = self.contents_db.get_knowledge_content(content_id)
+
+        if content_row is None:
+            return None, "Content not found"
+
+        # Convert string status to enum, defaulting to PROCESSING if unknown
+        status_str = content_row.status
+        try:
+            status = ContentStatus(status_str.lower()) if status_str else ContentStatus.PROCESSING
+        except ValueError:
+            # Handle legacy or unknown statuses
+            if status_str and "failed" in status_str.lower():
+                status = ContentStatus.FAILED
+            elif status_str and "completed" in status_str.lower():
+                status = ContentStatus.COMPLETED
+            else:
+                status = ContentStatus.PROCESSING
+
+        return status, content_row.status_message
+
     def remove_content_by_id(self, content_id: str):
         from agno.vectordb import VectorDb

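`aget_content_status()` normalizes legacy status strings instead of failing on unknown values. A standalone sketch of that fallback — the enum below is a local stand-in for illustration; the real `ContentStatus` lives in agno's db schemas:

```python
from enum import Enum


class Status(str, Enum):  # local stand-in for agno's ContentStatus
    PROCESSING = "processing"
    COMPLETED = "completed"
    FAILED = "failed"


def normalize_status(status_str):
    try:
        return Status(status_str.lower()) if status_str else Status.PROCESSING
    except ValueError:
        # Legacy or unknown statuses fall back to substring matching.
        if status_str and "failed" in status_str.lower():
            return Status.FAILED
        if status_str and "completed" in status_str.lower():
            return Status.COMPLETED
        return Status.PROCESSING


assert normalize_status("COMPLETED_WITH_WARNINGS") is Status.COMPLETED
```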
@@ -1535,12 +2794,36 @@ class Knowledge:
         if self.contents_db is not None:
             self.contents_db.delete_knowledge_content(content_id)

+    async def aremove_content_by_id(self, content_id: str):
+        if self.vector_db is not None:
+            if self.vector_db.__class__.__name__ == "LightRag":
+                # For LightRAG, get the content first to find the external_id
+                content = await self.aget_content_by_id(content_id)
+                if content and content.external_id:
+                    self.vector_db.delete_by_external_id(content.external_id)  # type: ignore
+                else:
+                    log_warning(f"No external_id found for content {content_id}, cannot delete from LightRAG")
+            else:
+                self.vector_db.delete_by_content_id(content_id)
+
+        if self.contents_db is not None:
+            if isinstance(self.contents_db, AsyncBaseDb):
+                await self.contents_db.delete_knowledge_content(content_id)
+            else:
+                self.contents_db.delete_knowledge_content(content_id)
+
     def remove_all_content(self):
         contents, _ = self.get_content()
         for content in contents:
             if content.id is not None:
                 self.remove_content_by_id(content.id)

+    async def aremove_all_content(self):
+        contents, _ = await self.aget_content()
+        for content in contents:
+            if content.id is not None:
+                await self.aremove_content_by_id(content.id)
+
     # --- Reader Factory Integration ---

     def construct_readers(self):
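With the `a`-prefixed methods above, the content-management surface is usable end to end from async code. A hedged sketch tying them together — the lowercase status comparison relies on the normalization shown earlier and is an assumption about the enum's values:

```python
import asyncio

from agno.knowledge.knowledge import Knowledge


async def prune_failed(knowledge: Knowledge) -> None:
    contents, total = await knowledge.aget_content(limit=50, page=1)
    for content in contents:
        if content.id is None:
            continue
        status, message = await knowledge.aget_content_status(content.id)
        # ContentStatus values appear to be lowercase strings (see normalization above).
        if status is not None and status.value == "failed":
            await knowledge.aremove_content_by_id(content.id)


# asyncio.run(prune_failed(knowledge))  # with a configured Knowledge instance
```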
@@ -1563,6 +2846,24 @@ class Knowledge:
         """Get all currently loaded readers (only returns readers that have been used)."""
         if self.readers is None:
             self.readers = {}
+        elif not isinstance(self.readers, dict):
+            # Defensive check: if readers is not a dict (e.g., was set to a list), convert it
+            if isinstance(self.readers, list):
+                readers_dict: Dict[str, Reader] = {}
+                for reader in self.readers:
+                    if isinstance(reader, Reader):
+                        reader_key = self._generate_reader_key(reader)
+                        # Handle potential duplicate keys by appending index if needed
+                        original_key = reader_key
+                        counter = 1
+                        while reader_key in readers_dict:
+                            reader_key = f"{original_key}_{counter}"
+                            counter += 1
+                        readers_dict[reader_key] = reader
+                self.readers = readers_dict
+            else:
+                # For any other unexpected type, reset to empty dict
+                self.readers = {}

         return self.readers

@@ -1578,12 +2879,6 @@ class Knowledge:
         log_info(f"Selecting reader for extension: {extension}")
         return ReaderFactory.get_reader_for_extension(extension)

-    def get_filters(self) -> List[str]:
-        return [
-            "filter_tag_1",
-            "filter_tag2",
-        ]
-
     # --- Convenience Properties for Backward Compatibility ---

     def _is_text_mime_type(self, mime_type: str) -> bool:
@@ -1675,6 +2970,11 @@ class Knowledge:
         """Docx reader - lazy loaded via factory."""
         return self._get_reader("docx")

+    @property
+    def pptx_reader(self) -> Optional[Reader]:
+        """PPTX reader - lazy loaded via factory."""
+        return self._get_reader("pptx")
+
     @property
     def json_reader(self) -> Optional[Reader]:
         """JSON reader - lazy loaded via factory."""