agno 2.1.2__py3-none-any.whl → 2.3.13__py3-none-any.whl

This diff shows the changes between two publicly released versions of this package, as published to a supported registry. It is provided for informational purposes only.
Files changed (314)
  1. agno/agent/agent.py +5540 -2273
  2. agno/api/api.py +2 -0
  3. agno/api/os.py +1 -1
  4. agno/compression/__init__.py +3 -0
  5. agno/compression/manager.py +247 -0
  6. agno/culture/__init__.py +3 -0
  7. agno/culture/manager.py +956 -0
  8. agno/db/async_postgres/__init__.py +3 -0
  9. agno/db/base.py +689 -6
  10. agno/db/dynamo/dynamo.py +933 -37
  11. agno/db/dynamo/schemas.py +174 -10
  12. agno/db/dynamo/utils.py +63 -4
  13. agno/db/firestore/firestore.py +831 -9
  14. agno/db/firestore/schemas.py +51 -0
  15. agno/db/firestore/utils.py +102 -4
  16. agno/db/gcs_json/gcs_json_db.py +660 -12
  17. agno/db/gcs_json/utils.py +60 -26
  18. agno/db/in_memory/in_memory_db.py +287 -14
  19. agno/db/in_memory/utils.py +60 -2
  20. agno/db/json/json_db.py +590 -14
  21. agno/db/json/utils.py +60 -26
  22. agno/db/migrations/manager.py +199 -0
  23. agno/db/migrations/v1_to_v2.py +43 -13
  24. agno/db/migrations/versions/__init__.py +0 -0
  25. agno/db/migrations/versions/v2_3_0.py +938 -0
  26. agno/db/mongo/__init__.py +15 -1
  27. agno/db/mongo/async_mongo.py +2760 -0
  28. agno/db/mongo/mongo.py +879 -11
  29. agno/db/mongo/schemas.py +42 -0
  30. agno/db/mongo/utils.py +80 -8
  31. agno/db/mysql/__init__.py +2 -1
  32. agno/db/mysql/async_mysql.py +2912 -0
  33. agno/db/mysql/mysql.py +946 -68
  34. agno/db/mysql/schemas.py +72 -10
  35. agno/db/mysql/utils.py +198 -7
  36. agno/db/postgres/__init__.py +2 -1
  37. agno/db/postgres/async_postgres.py +2579 -0
  38. agno/db/postgres/postgres.py +942 -57
  39. agno/db/postgres/schemas.py +81 -18
  40. agno/db/postgres/utils.py +164 -2
  41. agno/db/redis/redis.py +671 -7
  42. agno/db/redis/schemas.py +50 -0
  43. agno/db/redis/utils.py +65 -7
  44. agno/db/schemas/__init__.py +2 -1
  45. agno/db/schemas/culture.py +120 -0
  46. agno/db/schemas/evals.py +1 -0
  47. agno/db/schemas/memory.py +17 -2
  48. agno/db/singlestore/schemas.py +63 -0
  49. agno/db/singlestore/singlestore.py +949 -83
  50. agno/db/singlestore/utils.py +60 -2
  51. agno/db/sqlite/__init__.py +2 -1
  52. agno/db/sqlite/async_sqlite.py +2911 -0
  53. agno/db/sqlite/schemas.py +62 -0
  54. agno/db/sqlite/sqlite.py +965 -46
  55. agno/db/sqlite/utils.py +169 -8
  56. agno/db/surrealdb/__init__.py +3 -0
  57. agno/db/surrealdb/metrics.py +292 -0
  58. agno/db/surrealdb/models.py +334 -0
  59. agno/db/surrealdb/queries.py +71 -0
  60. agno/db/surrealdb/surrealdb.py +1908 -0
  61. agno/db/surrealdb/utils.py +147 -0
  62. agno/db/utils.py +2 -0
  63. agno/eval/__init__.py +10 -0
  64. agno/eval/accuracy.py +75 -55
  65. agno/eval/agent_as_judge.py +861 -0
  66. agno/eval/base.py +29 -0
  67. agno/eval/performance.py +16 -7
  68. agno/eval/reliability.py +28 -16
  69. agno/eval/utils.py +35 -17
  70. agno/exceptions.py +27 -2
  71. agno/filters.py +354 -0
  72. agno/guardrails/prompt_injection.py +1 -0
  73. agno/hooks/__init__.py +3 -0
  74. agno/hooks/decorator.py +164 -0
  75. agno/integrations/discord/client.py +1 -1
  76. agno/knowledge/chunking/agentic.py +13 -10
  77. agno/knowledge/chunking/fixed.py +4 -1
  78. agno/knowledge/chunking/semantic.py +9 -4
  79. agno/knowledge/chunking/strategy.py +59 -15
  80. agno/knowledge/embedder/fastembed.py +1 -1
  81. agno/knowledge/embedder/nebius.py +1 -1
  82. agno/knowledge/embedder/ollama.py +8 -0
  83. agno/knowledge/embedder/openai.py +8 -8
  84. agno/knowledge/embedder/sentence_transformer.py +6 -2
  85. agno/knowledge/embedder/vllm.py +262 -0
  86. agno/knowledge/knowledge.py +1618 -318
  87. agno/knowledge/reader/base.py +6 -2
  88. agno/knowledge/reader/csv_reader.py +8 -10
  89. agno/knowledge/reader/docx_reader.py +5 -6
  90. agno/knowledge/reader/field_labeled_csv_reader.py +16 -20
  91. agno/knowledge/reader/json_reader.py +5 -4
  92. agno/knowledge/reader/markdown_reader.py +8 -8
  93. agno/knowledge/reader/pdf_reader.py +17 -19
  94. agno/knowledge/reader/pptx_reader.py +101 -0
  95. agno/knowledge/reader/reader_factory.py +32 -3
  96. agno/knowledge/reader/s3_reader.py +3 -3
  97. agno/knowledge/reader/tavily_reader.py +193 -0
  98. agno/knowledge/reader/text_reader.py +22 -10
  99. agno/knowledge/reader/web_search_reader.py +1 -48
  100. agno/knowledge/reader/website_reader.py +10 -10
  101. agno/knowledge/reader/wikipedia_reader.py +33 -1
  102. agno/knowledge/types.py +1 -0
  103. agno/knowledge/utils.py +72 -7
  104. agno/media.py +22 -6
  105. agno/memory/__init__.py +14 -1
  106. agno/memory/manager.py +544 -83
  107. agno/memory/strategies/__init__.py +15 -0
  108. agno/memory/strategies/base.py +66 -0
  109. agno/memory/strategies/summarize.py +196 -0
  110. agno/memory/strategies/types.py +37 -0
  111. agno/models/aimlapi/aimlapi.py +17 -0
  112. agno/models/anthropic/claude.py +515 -40
  113. agno/models/aws/bedrock.py +102 -21
  114. agno/models/aws/claude.py +131 -274
  115. agno/models/azure/ai_foundry.py +41 -19
  116. agno/models/azure/openai_chat.py +39 -8
  117. agno/models/base.py +1249 -525
  118. agno/models/cerebras/cerebras.py +91 -21
  119. agno/models/cerebras/cerebras_openai.py +21 -2
  120. agno/models/cohere/chat.py +40 -6
  121. agno/models/cometapi/cometapi.py +18 -1
  122. agno/models/dashscope/dashscope.py +2 -3
  123. agno/models/deepinfra/deepinfra.py +18 -1
  124. agno/models/deepseek/deepseek.py +69 -3
  125. agno/models/fireworks/fireworks.py +18 -1
  126. agno/models/google/gemini.py +877 -80
  127. agno/models/google/utils.py +22 -0
  128. agno/models/groq/groq.py +51 -18
  129. agno/models/huggingface/huggingface.py +17 -6
  130. agno/models/ibm/watsonx.py +16 -6
  131. agno/models/internlm/internlm.py +18 -1
  132. agno/models/langdb/langdb.py +13 -1
  133. agno/models/litellm/chat.py +44 -9
  134. agno/models/litellm/litellm_openai.py +18 -1
  135. agno/models/message.py +28 -5
  136. agno/models/meta/llama.py +47 -14
  137. agno/models/meta/llama_openai.py +22 -17
  138. agno/models/mistral/mistral.py +8 -4
  139. agno/models/nebius/nebius.py +6 -7
  140. agno/models/nvidia/nvidia.py +20 -3
  141. agno/models/ollama/chat.py +24 -8
  142. agno/models/openai/chat.py +104 -29
  143. agno/models/openai/responses.py +101 -81
  144. agno/models/openrouter/openrouter.py +60 -3
  145. agno/models/perplexity/perplexity.py +17 -1
  146. agno/models/portkey/portkey.py +7 -6
  147. agno/models/requesty/requesty.py +24 -4
  148. agno/models/response.py +73 -2
  149. agno/models/sambanova/sambanova.py +20 -3
  150. agno/models/siliconflow/siliconflow.py +19 -2
  151. agno/models/together/together.py +20 -3
  152. agno/models/utils.py +254 -8
  153. agno/models/vercel/v0.py +20 -3
  154. agno/models/vertexai/__init__.py +0 -0
  155. agno/models/vertexai/claude.py +190 -0
  156. agno/models/vllm/vllm.py +19 -14
  157. agno/models/xai/xai.py +19 -2
  158. agno/os/app.py +549 -152
  159. agno/os/auth.py +190 -3
  160. agno/os/config.py +23 -0
  161. agno/os/interfaces/a2a/router.py +8 -11
  162. agno/os/interfaces/a2a/utils.py +1 -1
  163. agno/os/interfaces/agui/router.py +18 -3
  164. agno/os/interfaces/agui/utils.py +152 -39
  165. agno/os/interfaces/slack/router.py +55 -37
  166. agno/os/interfaces/slack/slack.py +9 -1
  167. agno/os/interfaces/whatsapp/router.py +0 -1
  168. agno/os/interfaces/whatsapp/security.py +3 -1
  169. agno/os/mcp.py +110 -52
  170. agno/os/middleware/__init__.py +2 -0
  171. agno/os/middleware/jwt.py +676 -112
  172. agno/os/router.py +40 -1478
  173. agno/os/routers/agents/__init__.py +3 -0
  174. agno/os/routers/agents/router.py +599 -0
  175. agno/os/routers/agents/schema.py +261 -0
  176. agno/os/routers/evals/evals.py +96 -39
  177. agno/os/routers/evals/schemas.py +65 -33
  178. agno/os/routers/evals/utils.py +80 -10
  179. agno/os/routers/health.py +10 -4
  180. agno/os/routers/knowledge/knowledge.py +196 -38
  181. agno/os/routers/knowledge/schemas.py +82 -22
  182. agno/os/routers/memory/memory.py +279 -52
  183. agno/os/routers/memory/schemas.py +46 -17
  184. agno/os/routers/metrics/metrics.py +20 -8
  185. agno/os/routers/metrics/schemas.py +16 -16
  186. agno/os/routers/session/session.py +462 -34
  187. agno/os/routers/teams/__init__.py +3 -0
  188. agno/os/routers/teams/router.py +512 -0
  189. agno/os/routers/teams/schema.py +257 -0
  190. agno/os/routers/traces/__init__.py +3 -0
  191. agno/os/routers/traces/schemas.py +414 -0
  192. agno/os/routers/traces/traces.py +499 -0
  193. agno/os/routers/workflows/__init__.py +3 -0
  194. agno/os/routers/workflows/router.py +624 -0
  195. agno/os/routers/workflows/schema.py +75 -0
  196. agno/os/schema.py +256 -693
  197. agno/os/scopes.py +469 -0
  198. agno/os/utils.py +514 -36
  199. agno/reasoning/anthropic.py +80 -0
  200. agno/reasoning/gemini.py +73 -0
  201. agno/reasoning/openai.py +5 -0
  202. agno/reasoning/vertexai.py +76 -0
  203. agno/run/__init__.py +6 -0
  204. agno/run/agent.py +155 -32
  205. agno/run/base.py +55 -3
  206. agno/run/requirement.py +181 -0
  207. agno/run/team.py +125 -38
  208. agno/run/workflow.py +72 -18
  209. agno/session/agent.py +102 -89
  210. agno/session/summary.py +56 -15
  211. agno/session/team.py +164 -90
  212. agno/session/workflow.py +405 -40
  213. agno/table.py +10 -0
  214. agno/team/team.py +3974 -1903
  215. agno/tools/dalle.py +2 -4
  216. agno/tools/eleven_labs.py +23 -25
  217. agno/tools/exa.py +21 -16
  218. agno/tools/file.py +153 -23
  219. agno/tools/file_generation.py +16 -10
  220. agno/tools/firecrawl.py +15 -7
  221. agno/tools/function.py +193 -38
  222. agno/tools/gmail.py +238 -14
  223. agno/tools/google_drive.py +271 -0
  224. agno/tools/googlecalendar.py +36 -8
  225. agno/tools/googlesheets.py +20 -5
  226. agno/tools/jira.py +20 -0
  227. agno/tools/mcp/__init__.py +10 -0
  228. agno/tools/mcp/mcp.py +331 -0
  229. agno/tools/mcp/multi_mcp.py +347 -0
  230. agno/tools/mcp/params.py +24 -0
  231. agno/tools/mcp_toolbox.py +3 -3
  232. agno/tools/models/nebius.py +5 -5
  233. agno/tools/models_labs.py +20 -10
  234. agno/tools/nano_banana.py +151 -0
  235. agno/tools/notion.py +204 -0
  236. agno/tools/parallel.py +314 -0
  237. agno/tools/postgres.py +76 -36
  238. agno/tools/redshift.py +406 -0
  239. agno/tools/scrapegraph.py +1 -1
  240. agno/tools/shopify.py +1519 -0
  241. agno/tools/slack.py +18 -3
  242. agno/tools/spotify.py +919 -0
  243. agno/tools/tavily.py +146 -0
  244. agno/tools/toolkit.py +25 -0
  245. agno/tools/workflow.py +8 -1
  246. agno/tools/yfinance.py +12 -11
  247. agno/tracing/__init__.py +12 -0
  248. agno/tracing/exporter.py +157 -0
  249. agno/tracing/schemas.py +276 -0
  250. agno/tracing/setup.py +111 -0
  251. agno/utils/agent.py +938 -0
  252. agno/utils/cryptography.py +22 -0
  253. agno/utils/dttm.py +33 -0
  254. agno/utils/events.py +151 -3
  255. agno/utils/gemini.py +15 -5
  256. agno/utils/hooks.py +118 -4
  257. agno/utils/http.py +113 -2
  258. agno/utils/knowledge.py +12 -5
  259. agno/utils/log.py +1 -0
  260. agno/utils/mcp.py +92 -2
  261. agno/utils/media.py +187 -1
  262. agno/utils/merge_dict.py +3 -3
  263. agno/utils/message.py +60 -0
  264. agno/utils/models/ai_foundry.py +9 -2
  265. agno/utils/models/claude.py +49 -14
  266. agno/utils/models/cohere.py +9 -2
  267. agno/utils/models/llama.py +9 -2
  268. agno/utils/models/mistral.py +4 -2
  269. agno/utils/print_response/agent.py +109 -16
  270. agno/utils/print_response/team.py +223 -30
  271. agno/utils/print_response/workflow.py +251 -34
  272. agno/utils/streamlit.py +1 -1
  273. agno/utils/team.py +98 -9
  274. agno/utils/tokens.py +657 -0
  275. agno/vectordb/base.py +39 -7
  276. agno/vectordb/cassandra/cassandra.py +21 -5
  277. agno/vectordb/chroma/chromadb.py +43 -12
  278. agno/vectordb/clickhouse/clickhousedb.py +21 -5
  279. agno/vectordb/couchbase/couchbase.py +29 -5
  280. agno/vectordb/lancedb/lance_db.py +92 -181
  281. agno/vectordb/langchaindb/langchaindb.py +24 -4
  282. agno/vectordb/lightrag/lightrag.py +17 -3
  283. agno/vectordb/llamaindex/llamaindexdb.py +25 -5
  284. agno/vectordb/milvus/milvus.py +50 -37
  285. agno/vectordb/mongodb/__init__.py +7 -1
  286. agno/vectordb/mongodb/mongodb.py +36 -30
  287. agno/vectordb/pgvector/pgvector.py +201 -77
  288. agno/vectordb/pineconedb/pineconedb.py +41 -23
  289. agno/vectordb/qdrant/qdrant.py +67 -54
  290. agno/vectordb/redis/__init__.py +9 -0
  291. agno/vectordb/redis/redisdb.py +682 -0
  292. agno/vectordb/singlestore/singlestore.py +50 -29
  293. agno/vectordb/surrealdb/surrealdb.py +31 -41
  294. agno/vectordb/upstashdb/upstashdb.py +34 -6
  295. agno/vectordb/weaviate/weaviate.py +53 -14
  296. agno/workflow/__init__.py +2 -0
  297. agno/workflow/agent.py +299 -0
  298. agno/workflow/condition.py +120 -18
  299. agno/workflow/loop.py +77 -10
  300. agno/workflow/parallel.py +231 -143
  301. agno/workflow/router.py +118 -17
  302. agno/workflow/step.py +609 -170
  303. agno/workflow/steps.py +73 -6
  304. agno/workflow/types.py +96 -21
  305. agno/workflow/workflow.py +2039 -262
  306. {agno-2.1.2.dist-info → agno-2.3.13.dist-info}/METADATA +201 -66
  307. agno-2.3.13.dist-info/RECORD +613 -0
  308. agno/tools/googlesearch.py +0 -98
  309. agno/tools/mcp.py +0 -679
  310. agno/tools/memori.py +0 -339
  311. agno-2.1.2.dist-info/RECORD +0 -543
  312. {agno-2.1.2.dist-info → agno-2.3.13.dist-info}/WHEEL +0 -0
  313. {agno-2.1.2.dist-info → agno-2.3.13.dist-info}/licenses/LICENSE +0 -0
  314. {agno-2.1.2.dist-info → agno-2.3.13.dist-info}/top_level.txt +0 -0
agno/knowledge/knowledge.py
@@ -4,7 +4,6 @@ import io
 import time
 from dataclasses import dataclass
 from enum import Enum
-from functools import cached_property
 from io import BytesIO
 from os.path import basename
 from pathlib import Path
@@ -12,8 +11,9 @@ from typing import Any, Dict, List, Optional, Set, Tuple, Union, cast, overload
 
 from httpx import AsyncClient
 
-from agno.db.base import BaseDb
+from agno.db.base import AsyncBaseDb, BaseDb
 from agno.db.schemas.knowledge import KnowledgeRow
+from agno.filters import FilterExpr
 from agno.knowledge.content import Content, ContentAuth, ContentStatus, FileData
 from agno.knowledge.document import Document
 from agno.knowledge.reader import Reader, ReaderFactory
@@ -39,7 +39,7 @@ class Knowledge:
     name: Optional[str] = None
     description: Optional[str] = None
     vector_db: Optional[Any] = None
-    contents_db: Optional[BaseDb] = None
+    contents_db: Optional[Union[BaseDb, AsyncBaseDb]] = None
     max_results: int = 10
     readers: Optional[Dict[str, Reader]] = None
 
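The `contents_db` field now accepts either a synchronous or an asynchronous contents database. A minimal sketch of what this allows, assuming the async Postgres backend added in this release (`agno/db/postgres/async_postgres.py`) exposes a class named `AsyncPostgresDb` with a `db_url` argument; the class name and constructor shape are assumptions, not confirmed by this diff:

    from agno.knowledge.knowledge import Knowledge
    from agno.db.postgres import PostgresDb, AsyncPostgresDb  # AsyncPostgresDb: assumed name

    # Both backends satisfy contents_db: Optional[Union[BaseDb, AsyncBaseDb]]
    sync_kb = Knowledge(name="docs", contents_db=PostgresDb(db_url="postgresql+psycopg://localhost/agno"))
    async_kb = Knowledge(name="docs", contents_db=AsyncPostgresDb(db_url="postgresql+asyncpg://localhost/agno"))
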
@@ -51,9 +51,6 @@ class Knowledge:
         self.vector_db.create()
 
         self.construct_readers()
-        self.valid_metadata_filters = set()
-
-    # --- SDK Specific Methods ---
 
     # --- Add Contents ---
     @overload
@@ -122,6 +119,7 @@ class Knowledge:
                 exclude=exclude,
                 upsert=upsert,
                 skip_if_exists=skip_if_exists,
+                reader=reader,
             )
         for url in urls:
             await self.add_content_async(
@@ -133,6 +131,7 @@ class Knowledge:
                 exclude=exclude,
                 upsert=upsert,
                 skip_if_exists=skip_if_exists,
+                reader=reader,
             )
         for i, text_content in enumerate(text_contents):
             content_name = f"{name}_{i}" if name else f"text_content_{i}"
@@ -146,6 +145,7 @@ class Knowledge:
                 exclude=exclude,
                 upsert=upsert,
                 skip_if_exists=skip_if_exists,
+                reader=reader,
             )
         if topics:
             await self.add_content_async(
@@ -168,6 +168,7 @@ class Knowledge:
                 remote_content=remote_content,
                 upsert=upsert,
                 skip_if_exists=skip_if_exists,
+                reader=reader,
             )
 
         else:
@@ -183,18 +184,20 @@ class Knowledge:
         paths: Optional[List[str]] = None,
         urls: Optional[List[str]] = None,
         metadata: Optional[Dict[str, str]] = None,
+        topics: Optional[List[str]] = None,
+        text_contents: Optional[List[str]] = None,
+        reader: Optional[Reader] = None,
         include: Optional[List[str]] = None,
         exclude: Optional[List[str]] = None,
         upsert: bool = True,
         skip_if_exists: bool = False,
+        remote_content: Optional[RemoteContent] = None,
     ) -> None: ...
 
     def add_contents(self, *args, **kwargs) -> None:
         """
         Synchronously add multiple content items to the knowledge base.
 
-        This method wraps the asynchronous add_contents method
-
         Supports two usage patterns:
         1. Pass a list of content dictionaries as first argument
         2. Pass keyword arguments with paths, urls, metadata, etc.
@@ -204,12 +207,114 @@ class Knowledge:
             paths: Optional list of file paths to load content from
             urls: Optional list of URLs to load content from
             metadata: Optional metadata dictionary to apply to all content
+            topics: Optional list of topics to add
+            text_contents: Optional list of text content strings to add
+            reader: Optional reader to use for processing content
             include: Optional list of file patterns to include
             exclude: Optional list of file patterns to exclude
-            upsert: Whether to update existing content if it already exists
-            skip_if_exists: Whether to skip adding content if it already exists
+            upsert: Whether to update existing content if it already exists (only used when skip_if_exists=False)
+            skip_if_exists: Whether to skip adding content if it already exists (default: True)
+            remote_content: Optional remote content (S3, GCS, etc.) to add
         """
-        asyncio.run(self.add_contents_async(*args, **kwargs))
+        if args and isinstance(args[0], list):
+            arguments = args[0]
+            upsert = kwargs.get("upsert", True)
+            skip_if_exists = kwargs.get("skip_if_exists", False)
+            for argument in arguments:
+                self.add_content(
+                    name=argument.get("name"),
+                    description=argument.get("description"),
+                    path=argument.get("path"),
+                    url=argument.get("url"),
+                    metadata=argument.get("metadata"),
+                    topics=argument.get("topics"),
+                    text_content=argument.get("text_content"),
+                    reader=argument.get("reader"),
+                    include=argument.get("include"),
+                    exclude=argument.get("exclude"),
+                    upsert=argument.get("upsert", upsert),
+                    skip_if_exists=argument.get("skip_if_exists", skip_if_exists),
+                    remote_content=argument.get("remote_content", None),
+                )
+
+        elif kwargs:
+            name = kwargs.get("name", [])
+            metadata = kwargs.get("metadata", {})
+            description = kwargs.get("description", [])
+            topics = kwargs.get("topics", [])
+            reader = kwargs.get("reader", None)
+            paths = kwargs.get("paths", [])
+            urls = kwargs.get("urls", [])
+            text_contents = kwargs.get("text_contents", [])
+            include = kwargs.get("include")
+            exclude = kwargs.get("exclude")
+            upsert = kwargs.get("upsert", True)
+            skip_if_exists = kwargs.get("skip_if_exists", False)
+            remote_content = kwargs.get("remote_content", None)
+            for path in paths:
+                self.add_content(
+                    name=name,
+                    description=description,
+                    path=path,
+                    metadata=metadata,
+                    include=include,
+                    exclude=exclude,
+                    upsert=upsert,
+                    skip_if_exists=skip_if_exists,
+                    reader=reader,
+                )
+            for url in urls:
+                self.add_content(
+                    name=name,
+                    description=description,
+                    url=url,
+                    metadata=metadata,
+                    include=include,
+                    exclude=exclude,
+                    upsert=upsert,
+                    skip_if_exists=skip_if_exists,
+                    reader=reader,
+                )
+            for i, text_content in enumerate(text_contents):
+                content_name = f"{name}_{i}" if name else f"text_content_{i}"
+                log_debug(f"Adding text content: {content_name}")
+                self.add_content(
+                    name=content_name,
+                    description=description,
+                    text_content=text_content,
+                    metadata=metadata,
+                    include=include,
+                    exclude=exclude,
+                    upsert=upsert,
+                    skip_if_exists=skip_if_exists,
+                    reader=reader,
+                )
+            if topics:
+                self.add_content(
+                    name=name,
+                    description=description,
+                    topics=topics,
+                    metadata=metadata,
+                    include=include,
+                    exclude=exclude,
+                    upsert=upsert,
+                    skip_if_exists=skip_if_exists,
+                    reader=reader,
+                )
+
+            if remote_content:
+                self.add_content(
+                    name=name,
+                    metadata=metadata,
+                    description=description,
+                    remote_content=remote_content,
+                    upsert=upsert,
+                    skip_if_exists=skip_if_exists,
+                    reader=reader,
+                )
+
+        else:
+            raise ValueError("Invalid usage of add_contents.")
 
     # --- Add Content ---
 
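`add_contents` no longer defers to `asyncio.run(self.add_contents_async(...))`; it now fans out to the synchronous `add_content` itself. A sketch of the two usage patterns named in the docstring, given a configured `Knowledge` instance named `knowledge` (paths and URLs below are placeholders):

    # Pattern 1: a list of content dictionaries as the first argument
    knowledge.add_contents(
        [
            {"name": "guide", "path": "docs/guide.pdf"},
            {"name": "homepage", "url": "https://example.com/index.html"},
        ],
        skip_if_exists=True,  # individual dicts may still override this
    )

    # Pattern 2: keyword arguments
    knowledge.add_contents(
        paths=["docs/a.md", "docs/b.md"],
        urls=["https://example.com/c.pdf"],
        metadata={"team": "platform"},
        upsert=True,
    )
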
@@ -246,7 +351,7 @@ class Knowledge:
         include: Optional[List[str]] = None,
         exclude: Optional[List[str]] = None,
         upsert: bool = True,
-        skip_if_exists: bool = True,
+        skip_if_exists: bool = False,
         auth: Optional[ContentAuth] = None,
     ) -> None:
         # Validation: At least one of the parameters must be provided
@@ -256,10 +361,6 @@ class Knowledge:
             )
             return
 
-        if not skip_if_exists:
-            log_info("skip_if_exists is disabled, disabling upsert")
-            upsert = False
-
         content = None
         file_data = None
         if text_content:
@@ -280,7 +381,7 @@ class Knowledge:
         content.content_hash = self._build_content_hash(content)
         content.id = generate_id(content.content_hash)
 
-        await self._load_content(content, upsert, skip_if_exists, include, exclude)
+        await self._load_content_async(content, upsert, skip_if_exists, include, exclude)
 
     @overload
     def add_content(
@@ -333,27 +434,37 @@ class Knowledge:
             reader: Optional custom reader for processing the content
             include: Optional list of file patterns to include
             exclude: Optional list of file patterns to exclude
-            upsert: Whether to update existing content if it already exists
-            skip_if_exists: Whether to skip adding content if it already exists
+            upsert: Whether to update existing content if it already exists (only used when skip_if_exists=False)
+            skip_if_exists: Whether to skip adding content if it already exists (default: False)
         """
-        asyncio.run(
-            self.add_content_async(
-                name=name,
-                description=description,
-                path=path,
-                url=url,
-                text_content=text_content,
-                metadata=metadata,
-                topics=topics,
-                remote_content=remote_content,
-                reader=reader,
-                include=include,
-                exclude=exclude,
-                upsert=upsert,
-                skip_if_exists=skip_if_exists,
-                auth=auth,
+        # Validation: At least one of the parameters must be provided
+        if all(argument is None for argument in [path, url, text_content, topics, remote_content]):
+            log_warning(
+                "At least one of 'path', 'url', 'text_content', 'topics', or 'remote_content' must be provided."
             )
+            return
+
+        content = None
+        file_data = None
+        if text_content:
+            file_data = FileData(content=text_content, type="Text")
+
+        content = Content(
+            name=name,
+            description=description,
+            path=path,
+            url=url,
+            file_data=file_data if file_data else None,
+            metadata=metadata,
+            topics=topics,
+            remote_content=remote_content,
+            reader=reader,
+            auth=auth,
         )
+        content.content_hash = self._build_content_hash(content)
+        content.id = generate_id(content.content_hash)
+
+        self._load_content(content, upsert, skip_if_exists, include, exclude)
 
     def _should_skip(self, content_hash: str, skip_if_exists: bool) -> bool:
         """
@@ -375,7 +486,178 @@ class Knowledge:
 
         return False
 
-    async def _load_from_path(
+    def _select_reader_by_extension(
+        self, file_extension: str, provided_reader: Optional[Reader] = None
+    ) -> Tuple[Optional[Reader], str]:
+        """
+        Select a reader based on file extension.
+
+        Args:
+            file_extension: File extension (e.g., '.pdf', '.csv')
+            provided_reader: Optional reader already provided
+
+        Returns:
+            Tuple of (reader, name) where name may be adjusted based on extension
+        """
+        if provided_reader:
+            return provided_reader, ""
+
+        file_extension = file_extension.lower()
+        if file_extension == ".csv":
+            return self.csv_reader, "data.csv"
+        elif file_extension == ".pdf":
+            return self.pdf_reader, ""
+        elif file_extension == ".docx":
+            return self.docx_reader, ""
+        elif file_extension == ".pptx":
+            return self.pptx_reader, ""
+        elif file_extension == ".json":
+            return self.json_reader, ""
+        elif file_extension == ".markdown":
+            return self.markdown_reader, ""
+        else:
+            return self.text_reader, ""
+
+    def _select_reader_by_uri(self, uri: str, provided_reader: Optional[Reader] = None) -> Optional[Reader]:
+        """
+        Select a reader based on URI/file path extension.
+
+        Args:
+            uri: URI or file path
+            provided_reader: Optional reader already provided
+
+        Returns:
+            Selected reader or None
+        """
+        if provided_reader:
+            return provided_reader
+
+        uri_lower = uri.lower()
+        if uri_lower.endswith(".pdf"):
+            return self.pdf_reader
+        elif uri_lower.endswith(".csv"):
+            return self.csv_reader
+        elif uri_lower.endswith(".docx"):
+            return self.docx_reader
+        elif uri_lower.endswith(".pptx"):
+            return self.pptx_reader
+        elif uri_lower.endswith(".json"):
+            return self.json_reader
+        elif uri_lower.endswith(".markdown"):
+            return self.markdown_reader
+        else:
+            return self.text_reader
+
+    def _read(
+        self,
+        reader: Reader,
+        source: Union[Path, str, BytesIO],
+        name: Optional[str] = None,
+        password: Optional[str] = None,
+    ) -> List[Document]:
+        """
+        Read content using a reader with optional password handling.
+
+        Args:
+            reader: Reader to use
+            source: Source to read from (Path, URL string, or BytesIO)
+            name: Optional name for the document
+            password: Optional password for protected files
+
+        Returns:
+            List of documents read
+        """
+        import inspect
+
+        read_signature = inspect.signature(reader.read)
+        if password and "password" in read_signature.parameters:
+            if isinstance(source, BytesIO):
+                return reader.read(source, name=name, password=password)
+            else:
+                return reader.read(source, name=name, password=password)
+        else:
+            if isinstance(source, BytesIO):
+                return reader.read(source, name=name)
+            else:
+                return reader.read(source, name=name)
+
+    async def _read_async(
+        self,
+        reader: Reader,
+        source: Union[Path, str, BytesIO],
+        name: Optional[str] = None,
+        password: Optional[str] = None,
+    ) -> List[Document]:
+        """
+        Read content using a reader's async_read method with optional password handling.
+
+        Args:
+            reader: Reader to use
+            source: Source to read from (Path, URL string, or BytesIO)
+            name: Optional name for the document
+            password: Optional password for protected files
+
+        Returns:
+            List of documents read
+        """
+        import inspect
+
+        read_signature = inspect.signature(reader.async_read)
+        if password and "password" in read_signature.parameters:
+            return await reader.async_read(source, name=name, password=password)
+        else:
+            if isinstance(source, BytesIO):
+                return await reader.async_read(source, name=name)
+            else:
+                return await reader.async_read(source, name=name)
+
+    def _prepare_documents_for_insert(
+        self,
+        documents: List[Document],
+        content_id: str,
+        calculate_sizes: bool = False,
+        metadata: Optional[Dict[str, Any]] = None,
+    ) -> List[Document]:
+        """
+        Prepare documents for insertion by assigning content_id and optionally calculating sizes and updating metadata.
+
+        Args:
+            documents: List of documents to prepare
+            content_id: Content ID to assign to documents
+            calculate_sizes: Whether to calculate document sizes
+            metadata: Optional metadata to merge into document metadata
+
+        Returns:
+            List of prepared documents
+        """
+        for document in documents:
+            document.content_id = content_id
+            if calculate_sizes and document.content and not document.size:
+                document.size = len(document.content.encode("utf-8"))
+            if metadata:
+                document.meta_data.update(metadata)
+        return documents
+
+    def _chunk_documents_sync(self, reader: Reader, documents: List[Document]) -> List[Document]:
+        """
+        Chunk documents synchronously.
+
+        Args:
+            reader: Reader with chunking strategy
+            documents: Documents to chunk
+
+        Returns:
+            List of chunked documents
+        """
+        if not reader or reader.chunk:
+            return documents
+
+        chunked_documents = []
+        for doc in documents:
+            chunked_documents.extend(reader.chunk_document(doc))
+        return chunked_documents
+
+    async def _load_from_path_async(
         self,
         content: Content,
         upsert: bool,
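Reader dispatch is now centralized in the `_select_reader_by_extension` and `_select_reader_by_uri` helpers instead of being repeated at each call site. A sketch of the effective mapping (these are private helpers; the `.xlsx` case just illustrates the text-reader fallback for unknown extensions):

    reader, default_name = knowledge._select_reader_by_extension(".csv")  # csv_reader, "data.csv"
    reader, _ = knowledge._select_reader_by_extension(".PDF")             # lower-cased first, so pdf_reader
    reader, _ = knowledge._select_reader_by_extension(".xlsx")            # unknown: falls back to text_reader
    reader = knowledge._select_reader_by_uri("s3://bucket/report.docx")   # docx_reader
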
@@ -392,45 +674,32 @@ class Knowledge:
 
         if path.is_file():
             if self._should_include_file(str(path), include, exclude):
-                log_info(f"Adding file {path} due to include/exclude filters")
+                log_debug(f"Adding file {path} due to include/exclude filters")
 
-                self._add_to_contents_db(content)
+                await self._add_to_contents_db_async(content)
                 if self._should_skip(content.content_hash, skip_if_exists):  # type: ignore[arg-type]
                     content.status = ContentStatus.COMPLETED
-                    self._update_content(content)
+                    await self._aupdate_content(content)
                     return
 
                 # Handle LightRAG special case - read file and upload directly
                 if self.vector_db.__class__.__name__ == "LightRag":
-                    await self._process_lightrag_content(content, KnowledgeContentOrigin.PATH)
+                    await self._process_lightrag_content_async(content, KnowledgeContentOrigin.PATH)
                     return
 
                 if content.reader:
-                    # TODO: We will refactor this to eventually pass authorization to all readers
-                    import inspect
-
-                    read_signature = inspect.signature(content.reader.read)
-                    if "password" in read_signature.parameters and content.auth and content.auth.password:
-                        read_documents = content.reader.read(
-                            path, name=content.name or path.name, password=content.auth.password
-                        )
-                    else:
-                        read_documents = content.reader.read(path, name=content.name or path.name)
-
+                    reader = content.reader
                 else:
                     reader = ReaderFactory.get_reader_for_extension(path.suffix)
-                    log_info(f"Using Reader: {reader.__class__.__name__}")
-                    if reader:
-                        # TODO: We will refactor this to eventually pass authorization to all readers
-                        import inspect
+                    log_debug(f"Using Reader: {reader.__class__.__name__}")
 
-                        read_signature = inspect.signature(reader.read)
-                        if "password" in read_signature.parameters and content.auth and content.auth.password:
-                            read_documents = reader.read(
-                                path, name=content.name or path.name, password=content.auth.password
-                            )
-                        else:
-                            read_documents = reader.read(path, name=content.name or path.name)
+                if reader:
+                    password = content.auth.password if content.auth and content.auth.password else None
+                    read_documents = await self._read_async(
+                        reader, path, name=content.name or path.name, password=password
+                    )
+                else:
+                    read_documents = []
 
             if not content.file_type:
                 content.file_type = path.suffix
@@ -444,10 +713,11 @@ class Knowledge:
                     log_warning(f"Could not get file size for {path}: {e}")
                     content.size = 0
 
-            for read_document in read_documents:
-                read_document.content_id = content.id
+            if not content.id:
+                content.id = generate_id(content.content_hash or "")
+            self._prepare_documents_for_insert(read_documents, content.id)
 
-            await self._handle_vector_db_insert(content, read_documents, upsert)
+            await self._handle_vector_db_insert_async(content, read_documents, upsert)
 
         elif path.is_dir():
             for file_path in path.iterdir():
@@ -466,48 +736,240 @@ class Knowledge:
                 file_content.content_hash = self._build_content_hash(file_content)
                 file_content.id = generate_id(file_content.content_hash)
 
-                await self._load_from_path(file_content, upsert, skip_if_exists, include, exclude)
+                await self._load_from_path_async(file_content, upsert, skip_if_exists, include, exclude)
         else:
             log_warning(f"Invalid path: {path}")
 
-    async def _load_from_url(
+    def _load_from_path(
         self,
         content: Content,
         upsert: bool,
         skip_if_exists: bool,
+        include: Optional[List[str]] = None,
+        exclude: Optional[List[str]] = None,
     ):
-        """Load the content in the contextual URL
-
-        1. Set content hash
-        2. Validate the URL
-        3. Read the content
-        4. Prepare and insert the content in the vector database
-        """
-
         from agno.vectordb import VectorDb
 
         self.vector_db = cast(VectorDb, self.vector_db)
 
-        log_info(f"Adding content from URL {content.url}")
-        content.file_type = "url"
-
-        if not content.url:
-            raise ValueError("No url provided")
+        log_info(f"Adding content from path, {content.id}, {content.name}, {content.path}, {content.description}")
+        path = Path(content.path)  # type: ignore
 
-        # 1. Add content to contents database
-        self._add_to_contents_db(content)
-        if self._should_skip(content.content_hash, skip_if_exists):  # type: ignore[arg-type]
-            content.status = ContentStatus.COMPLETED
-            self._update_content(content)
-            return
+        if path.is_file():
+            if self._should_include_file(str(path), include, exclude):
+                log_debug(f"Adding file {path} due to include/exclude filters")
 
-        if self.vector_db.__class__.__name__ == "LightRag":
-            await self._process_lightrag_content(content, KnowledgeContentOrigin.URL)
-            return
+                self._add_to_contents_db(content)
+                if self._should_skip(content.content_hash, skip_if_exists):  # type: ignore[arg-type]
+                    content.status = ContentStatus.COMPLETED
+                    self._update_content(content)
+                    return
 
-        # 2. Validate URL
-        try:
-            from urllib.parse import urlparse
+                # Handle LightRAG special case - read file and upload directly
+                if self.vector_db.__class__.__name__ == "LightRag":
+                    self._process_lightrag_content(content, KnowledgeContentOrigin.PATH)
+                    return
+
+                if content.reader:
+                    # TODO: We will refactor this to eventually pass authorization to all readers
+                    import inspect
+
+                    read_signature = inspect.signature(content.reader.read)
+                    if "password" in read_signature.parameters and content.auth and content.auth.password:
+                        read_documents = content.reader.read(
+                            path, name=content.name or path.name, password=content.auth.password
+                        )
+                    else:
+                        read_documents = content.reader.read(path, name=content.name or path.name)
+
+                else:
+                    reader = ReaderFactory.get_reader_for_extension(path.suffix)
+                    log_debug(f"Using Reader: {reader.__class__.__name__}")
+                    if reader:
+                        # TODO: We will refactor this to eventually pass authorization to all readers
+                        import inspect
+
+                        read_signature = inspect.signature(reader.read)
+                        if "password" in read_signature.parameters and content.auth and content.auth.password:
+                            read_documents = reader.read(
+                                path, name=content.name or path.name, password=content.auth.password
+                            )
+                        else:
+                            read_documents = reader.read(path, name=content.name or path.name)
+
+            if not content.file_type:
+                content.file_type = path.suffix
+
+            if not content.size and content.file_data:
+                content.size = len(content.file_data.content)  # type: ignore
+            if not content.size:
+                try:
+                    content.size = path.stat().st_size
+                except (OSError, IOError) as e:
+                    log_warning(f"Could not get file size for {path}: {e}")
+                    content.size = 0
+
+            if not content.id:
+                content.id = generate_id(content.content_hash or "")
+            self._prepare_documents_for_insert(read_documents, content.id)
+
+            self._handle_vector_db_insert(content, read_documents, upsert)
+
+        elif path.is_dir():
+            for file_path in path.iterdir():
+                # Apply include/exclude filtering
+                if not self._should_include_file(str(file_path), include, exclude):
+                    log_debug(f"Skipping file {file_path} due to include/exclude filters")
+                    continue
+
+                file_content = Content(
+                    name=content.name,
+                    path=str(file_path),
+                    metadata=content.metadata,
+                    description=content.description,
+                    reader=content.reader,
+                )
+                file_content.content_hash = self._build_content_hash(file_content)
+                file_content.id = generate_id(file_content.content_hash)
+
+                self._load_from_path(file_content, upsert, skip_if_exists, include, exclude)
+        else:
+            log_warning(f"Invalid path: {path}")
+
+    async def _load_from_url_async(
+        self,
+        content: Content,
+        upsert: bool,
+        skip_if_exists: bool,
+    ):
+        """Load the content in the contextual URL
+
+        1. Set content hash
+        2. Validate the URL
+        3. Read the content
+        4. Prepare and insert the content in the vector database
+        """
+        from agno.vectordb import VectorDb
+
+        self.vector_db = cast(VectorDb, self.vector_db)
+
+        log_info(f"Adding content from URL {content.url}")
+        content.file_type = "url"
+
+        if not content.url:
+            raise ValueError("No url provided")
+
+        # 1. Add content to contents database
+        await self._add_to_contents_db_async(content)
+        if self._should_skip(content.content_hash, skip_if_exists):  # type: ignore[arg-type]
+            content.status = ContentStatus.COMPLETED
+            await self._aupdate_content(content)
+            return
+
+        if self.vector_db.__class__.__name__ == "LightRag":
+            await self._process_lightrag_content_async(content, KnowledgeContentOrigin.URL)
+            return
+
+        # 2. Validate URL
+        try:
+            from urllib.parse import urlparse
+
+            parsed_url = urlparse(content.url)
+            if not all([parsed_url.scheme, parsed_url.netloc]):
+                content.status = ContentStatus.FAILED
+                content.status_message = f"Invalid URL format: {content.url}"
+                await self._aupdate_content(content)
+                log_warning(f"Invalid URL format: {content.url}")
+        except Exception as e:
+            content.status = ContentStatus.FAILED
+            content.status_message = f"Invalid URL: {content.url} - {str(e)}"
+            await self._aupdate_content(content)
+            log_warning(f"Invalid URL: {content.url} - {str(e)}")
+        # 3. Fetch and load content if file has an extension
+        url_path = Path(parsed_url.path)
+        file_extension = url_path.suffix.lower()
+
+        bytes_content = None
+        if file_extension:
+            async with AsyncClient() as client:
+                response = await async_fetch_with_retry(content.url, client=client)
+            bytes_content = BytesIO(response.content)
+
+        # 4. Select reader
+        name = content.name if content.name else content.url
+        if file_extension:
+            reader, default_name = self._select_reader_by_extension(file_extension, content.reader)
+            if default_name and file_extension == ".csv":
+                name = basename(parsed_url.path) or default_name
+        else:
+            reader = content.reader or self.website_reader
+        # 5. Read content
+        try:
+            read_documents = []
+            if reader is not None:
+                # Special handling for YouTubeReader
+                if reader.__class__.__name__ == "YouTubeReader":
+                    read_documents = await reader.async_read(content.url, name=name)
+                else:
+                    password = content.auth.password if content.auth and content.auth.password else None
+                    source = bytes_content if bytes_content else content.url
+                    read_documents = await self._read_async(reader, source, name=name, password=password)
+
+        except Exception as e:
+            log_error(f"Error reading URL: {content.url} - {str(e)}")
+            content.status = ContentStatus.FAILED
+            content.status_message = f"Error reading URL: {content.url} - {str(e)}"
+            await self._aupdate_content(content)
+            return
+
+        # 6. Chunk documents if needed
+        if reader and not reader.chunk:
+            read_documents = await reader.chunk_documents_async(read_documents)
+        # 7. Prepare and insert the content in the vector database
+        if not content.id:
+            content.id = generate_id(content.content_hash or "")
+        self._prepare_documents_for_insert(read_documents, content.id, calculate_sizes=True)
+        await self._handle_vector_db_insert_async(content, read_documents, upsert)
+
+    def _load_from_url(
+        self,
+        content: Content,
+        upsert: bool,
+        skip_if_exists: bool,
+    ):
+        """Synchronous version of _load_from_url.
+
+        Load the content from a URL:
+        1. Set content hash
+        2. Validate the URL
+        3. Read the content
+        4. Prepare and insert the content in the vector database
+        """
+        from agno.utils.http import fetch_with_retry
+        from agno.vectordb import VectorDb
+
+        self.vector_db = cast(VectorDb, self.vector_db)
+
+        log_info(f"Adding content from URL {content.url}")
+        content.file_type = "url"
+
+        if not content.url:
+            raise ValueError("No url provided")
+
+        # 1. Add content to contents database
+        self._add_to_contents_db(content)
+        if self._should_skip(content.content_hash, skip_if_exists):  # type: ignore[arg-type]
+            content.status = ContentStatus.COMPLETED
+            self._update_content(content)
+            return
+
+        if self.vector_db.__class__.__name__ == "LightRag":
+            self._process_lightrag_content(content, KnowledgeContentOrigin.URL)
+            return
+
+        # 2. Validate URL
+        try:
+            from urllib.parse import urlparse
 
             parsed_url = urlparse(content.url)
             if not all([parsed_url.scheme, parsed_url.netloc]):
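`_load_from_url` now has a real synchronous implementation that fetches with `fetch_with_retry` instead of `httpx.AsyncClient`, so sync callers no longer have to bounce through `asyncio.run` (which raises when an event loop is already running). A sketch of both entry points:

    # Synchronous: blocking fetch, sync readers, sync vector-db insert
    knowledge.add_content(url="https://example.com/report.pdf")

    # The asynchronous path is still available and mirrors the same steps
    import asyncio
    asyncio.run(knowledge.add_content_async(url="https://example.com/report.pdf"))
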
@@ -527,50 +989,29 @@ class Knowledge:
 
         bytes_content = None
         if file_extension:
-            async with AsyncClient() as client:
-                response = await async_fetch_with_retry(content.url, client=client)
+            response = fetch_with_retry(content.url)
             bytes_content = BytesIO(response.content)
 
         # 4. Select reader
-        # If a reader was provided by the user, use it
-        reader = content.reader
         name = content.name if content.name else content.url
-        # Else select based on file extension
-        if reader is None:
-            if file_extension == ".csv":
-                name = basename(parsed_url.path) or "data.csv"
-                reader = self.csv_reader
-            elif file_extension == ".pdf":
-                reader = self.pdf_reader
-            elif file_extension == ".docx":
-                reader = self.docx_reader
-            elif file_extension == ".json":
-                reader = self.json_reader
-            elif file_extension == ".markdown":
-                reader = self.markdown_reader
-            else:
-                reader = self.text_reader
+        if file_extension:
+            reader, default_name = self._select_reader_by_extension(file_extension, content.reader)
+            if default_name and file_extension == ".csv":
+                name = basename(parsed_url.path) or default_name
+        else:
+            reader = content.reader or self.website_reader
 
         # 5. Read content
         try:
             read_documents = []
             if reader is not None:
-                # TODO: We will refactor this to eventually pass authorization to all readers
-                import inspect
-
-                read_signature = inspect.signature(reader.read)
+                # Special handling for YouTubeReader
                 if reader.__class__.__name__ == "YouTubeReader":
                     read_documents = reader.read(content.url, name=name)
-                elif "password" in read_signature.parameters and content.auth and content.auth.password:
-                    if bytes_content:
-                        read_documents = reader.read(bytes_content, name=name, password=content.auth.password)
-                    else:
-                        read_documents = reader.read(content.url, name=name, password=content.auth.password)
                 else:
-                    if bytes_content:
-                        read_documents = reader.read(bytes_content, name=name)
-                    else:
-                        read_documents = reader.read(content.url, name=name)
+                    password = content.auth.password if content.auth and content.auth.password else None
+                    source = bytes_content if bytes_content else content.url
+                    read_documents = self._read(reader, source, name=name, password=password)
 
         except Exception as e:
             log_error(f"Error reading URL: {content.url} - {str(e)}")
@@ -579,19 +1020,17 @@ class Knowledge:
             self._update_content(content)
             return
 
-        # 6. Chunk documents if needed
-        if reader and not reader.chunk:
-            read_documents = await reader.chunk_documents_async(read_documents)
+        # 6. Chunk documents if needed (sync version)
+        if reader:
+            read_documents = self._chunk_documents_sync(reader, read_documents)
+
         # 7. Prepare and insert the content in the vector database
-        file_size = 0
-        if read_documents:
-            for read_document in read_documents:
-                if read_document.size:
-                    file_size += read_document.size
-                read_document.content_id = content.id
-        await self._handle_vector_db_insert(content, read_documents, upsert)
-
-    async def _load_from_content(
+        if not content.id:
+            content.id = generate_id(content.content_hash or "")
+        self._prepare_documents_for_insert(read_documents, content.id, calculate_sizes=True)
+        self._handle_vector_db_insert(content, read_documents, upsert)
+
+    async def _load_from_content_async(
         self,
         content: Content,
         upsert: bool = True,
@@ -622,6 +1061,103 @@ class Knowledge:
 
         log_info(f"Adding content from {content.name}")
 
+        await self._add_to_contents_db_async(content)
+        if self._should_skip(content.content_hash, skip_if_exists):  # type: ignore[arg-type]
+            content.status = ContentStatus.COMPLETED
+            await self._aupdate_content(content)
+            return
+
+        if content.file_data and self.vector_db.__class__.__name__ == "LightRag":
+            await self._process_lightrag_content_async(content, KnowledgeContentOrigin.CONTENT)
+            return
+
+        read_documents = []
+
+        if isinstance(content.file_data, str):
+            content_bytes = content.file_data.encode("utf-8", errors="replace")
+            content_io = io.BytesIO(content_bytes)
+
+            if content.reader:
+                log_debug(f"Using reader: {content.reader.__class__.__name__} to read content")
+                read_documents = await content.reader.async_read(content_io, name=name)
+            else:
+                text_reader = self.text_reader
+                if text_reader:
+                    read_documents = await text_reader.async_read(content_io, name=name)
+                else:
+                    content.status = ContentStatus.FAILED
+                    content.status_message = "Text reader not available"
+                    await self._aupdate_content(content)
+                    return
+
+        elif isinstance(content.file_data, FileData):
+            if content.file_data.type:
+                if isinstance(content.file_data.content, bytes):
+                    content_io = io.BytesIO(content.file_data.content)
+                elif isinstance(content.file_data.content, str):
+                    content_bytes = content.file_data.content.encode("utf-8", errors="replace")
+                    content_io = io.BytesIO(content_bytes)
+                else:
+                    content_io = content.file_data.content  # type: ignore
+
+                # Respect an explicitly provided reader; otherwise select based on file type
+                if content.reader:
+                    log_debug(f"Using reader: {content.reader.__class__.__name__} to read content")
+                    reader = content.reader
+                else:
+                    reader = self._select_reader(content.file_data.type)
+                    name = content.name if content.name else f"content_{content.file_data.type}"
+                read_documents = await reader.async_read(content_io, name=name)
+                if not content.id:
+                    content.id = generate_id(content.content_hash or "")
+                self._prepare_documents_for_insert(read_documents, content.id, metadata=content.metadata)
+
+                if len(read_documents) == 0:
+                    content.status = ContentStatus.FAILED
+                    content.status_message = "Content could not be read"
+                    await self._aupdate_content(content)
+                    return
+
+        else:
+            content.status = ContentStatus.FAILED
+            content.status_message = "No content provided"
+            await self._aupdate_content(content)
+            return
+
+        await self._handle_vector_db_insert_async(content, read_documents, upsert)
+
+    def _load_from_content(
+        self,
+        content: Content,
+        upsert: bool = True,
+        skip_if_exists: bool = False,
+    ):
+        """Synchronous version of _load_from_content."""
+        from agno.vectordb import VectorDb
+
+        self.vector_db = cast(VectorDb, self.vector_db)
+
+        if content.name:
+            name = content.name
+        elif content.file_data and content.file_data.content:
+            if isinstance(content.file_data.content, bytes):
+                name = content.file_data.content[:10].decode("utf-8", errors="ignore")
+            elif isinstance(content.file_data.content, str):
+                name = (
+                    content.file_data.content[:10]
+                    if len(content.file_data.content) >= 10
+                    else content.file_data.content
+                )
+            else:
+                name = str(content.file_data.content)[:10]
+        else:
+            name = None
+
+        if name is not None:
+            content.name = name
+
+        log_info(f"Adding content from {content.name}")
+
         self._add_to_contents_db(content)
         if self._should_skip(content.content_hash, skip_if_exists):  # type: ignore[arg-type]
             content.status = ContentStatus.COMPLETED
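For raw strings, `add_content` wraps the text in `FileData(content=..., type="Text")` and routes it through the text reader unless an explicit reader is supplied; `content.metadata` is merged into every chunk via `_prepare_documents_for_insert`. A sketch:

    knowledge.add_content(
        name="release-notes",
        text_content="Agno 2.3 adds async DB backends and a native sync loading path.",
        metadata={"source": "changelog"},  # merged into each chunk's meta_data
    )
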
@@ -629,7 +1165,7 @@ class Knowledge:
             return
 
         if content.file_data and self.vector_db.__class__.__name__ == "LightRag":
-            await self._process_lightrag_content(content, KnowledgeContentOrigin.CONTENT)
+            self._process_lightrag_content(content, KnowledgeContentOrigin.CONTENT)
             return
 
         read_documents = []
@@ -639,7 +1175,7 @@ class Knowledge:
             content_io = io.BytesIO(content_bytes)
 
             if content.reader:
-                log_info(f"Using reader: {content.reader.__class__.__name__} to read content")
+                log_debug(f"Using reader: {content.reader.__class__.__name__} to read content")
                 read_documents = content.reader.read(content_io, name=name)
             else:
                 text_reader = self.text_reader
@@ -663,21 +1199,21 @@ class Knowledge:
 
                 # Respect an explicitly provided reader; otherwise select based on file type
                 if content.reader:
-                    log_info(f"Using reader: {content.reader.__class__.__name__} to read content")
+                    log_debug(f"Using reader: {content.reader.__class__.__name__} to read content")
                     reader = content.reader
                 else:
                     reader = self._select_reader(content.file_data.type)
                     name = content.name if content.name else f"content_{content.file_data.type}"
                 read_documents = reader.read(content_io, name=name)
-                for read_document in read_documents:
-                    if content.metadata:
-                        read_document.meta_data.update(content.metadata)
-                    read_document.content_id = content.id
+                if not content.id:
+                    content.id = generate_id(content.content_hash or "")
+                self._prepare_documents_for_insert(read_documents, content.id, metadata=content.metadata)
 
                 if len(read_documents) == 0:
                     content.status = ContentStatus.FAILED
                     content.status_message = "Content could not be read"
                     self._update_content(content)
+                    return
 
         else:
             content.status = ContentStatus.FAILED
@@ -685,14 +1221,76 @@ class Knowledge:
             self._update_content(content)
             return
 
-        await self._handle_vector_db_insert(content, read_documents, upsert)
+        self._handle_vector_db_insert(content, read_documents, upsert)
+
+    async def _load_from_topics_async(
+        self,
+        content: Content,
+        upsert: bool,
+        skip_if_exists: bool,
+    ):
+        from agno.vectordb import VectorDb
+
+        self.vector_db = cast(VectorDb, self.vector_db)
+        log_info(f"Adding content from topics: {content.topics}")
+
+        if content.topics is None:
+            log_warning("No topics provided for content")
+            return
+
+        for topic in content.topics:
+            content = Content(
+                name=topic,
+                metadata=content.metadata,
+                reader=content.reader,
+                status=ContentStatus.PROCESSING if content.reader else ContentStatus.FAILED,
+                file_data=FileData(
+                    type="Topic",
+                ),
+                topics=[topic],
+            )
+            content.content_hash = self._build_content_hash(content)
+            content.id = generate_id(content.content_hash)
+
+            await self._add_to_contents_db_async(content)
+            if self._should_skip(content.content_hash, skip_if_exists):
+                content.status = ContentStatus.COMPLETED
+                await self._aupdate_content(content)
+                return
 
-    async def _load_from_topics(
+            if self.vector_db.__class__.__name__ == "LightRag":
+                await self._process_lightrag_content_async(content, KnowledgeContentOrigin.TOPIC)
+                return
+
+            if self.vector_db and self.vector_db.content_hash_exists(content.content_hash) and skip_if_exists:
+                log_info(f"Content {content.content_hash} already exists, skipping")
+                continue
+
+            await self._add_to_contents_db_async(content)
+            if content.reader is None:
+                log_error(f"No reader available for topic: {topic}")
+                content.status = ContentStatus.FAILED
+                content.status_message = "No reader available for topic"
+                await self._aupdate_content(content)
+                continue
+
+            read_documents = await content.reader.async_read(topic)
+            if len(read_documents) > 0:
+                self._prepare_documents_for_insert(read_documents, content.id, calculate_sizes=True)
+            else:
+                content.status = ContentStatus.FAILED
+                content.status_message = "No content found for topic"
+                await self._aupdate_content(content)
+
+            await self._handle_vector_db_insert_async(content, read_documents, upsert)
+
+    def _load_from_topics(
         self,
         content: Content,
         upsert: bool,
         skip_if_exists: bool,
     ):
+        """Synchronous version of _load_from_topics."""
        from agno.vectordb import VectorDb
 
         self.vector_db = cast(VectorDb, self.vector_db)
@@ -723,9 +1321,14 @@ class Knowledge:
                 return
 
             if self.vector_db.__class__.__name__ == "LightRag":
-                await self._process_lightrag_content(content, KnowledgeContentOrigin.TOPIC)
+                self._process_lightrag_content(content, KnowledgeContentOrigin.TOPIC)
                 return
 
+            if self.vector_db and self.vector_db.content_hash_exists(content.content_hash) and skip_if_exists:
+                log_info(f"Content {content.content_hash} already exists, skipping")
+                continue
+
+            self._add_to_contents_db(content)
             if content.reader is None:
                 log_error(f"No reader available for topic: {topic}")
                 content.status = ContentStatus.FAILED
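Topic ingestion creates one `Content` row per topic and immediately marks it FAILED when no reader is attached, so a reader must be passed explicitly. A sketch, assuming the reader added in `agno/knowledge/reader/wikipedia_reader.py` is exposed as `WikipediaReader` (class name inferred from the file list, not confirmed by this diff):

    from agno.knowledge.reader.wikipedia_reader import WikipediaReader  # assumed class name

    knowledge.add_content(
        topics=["vector databases", "retrieval augmented generation"],
        reader=WikipediaReader(),
    )
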
@@ -735,24 +1338,178 @@ class Knowledge:
735
1338
 
736
1339
  read_documents = content.reader.read(topic)
737
1340
  if len(read_documents) > 0:
738
- for read_document in read_documents:
739
- read_document.content_id = content.id
740
- if read_document.content:
741
- read_document.size = len(read_document.content.encode("utf-8"))
1341
+ self._prepare_documents_for_insert(read_documents, content.id, calculate_sizes=True)
742
1342
  else:
743
1343
  content.status = ContentStatus.FAILED
744
1344
  content.status_message = "No content found for topic"
745
1345
  self._update_content(content)
746
- continue
747
1346
 
748
- await self._handle_vector_db_insert(content, read_documents, upsert)
1347
+ self._handle_vector_db_insert(content, read_documents, upsert)
1348
+
1349
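Note on the topic pipeline above: each entry in `content.topics` is expanded into its own `Content` row, and the row id is derived deterministically from the content hash, so re-adding the same topic with the same reader resolves to the same entry. A minimal sketch of that derivation; the exact behavior of `generate_id` is an assumption, only the hash recipe is documented in `_build_content_hash` further below:

```python
# Sketch only: assumed id derivation for a topic-based Content entry.
import hashlib
import uuid
from typing import Optional

def topic_content_hash(topic: str, reader_class: str, name: Optional[str] = None) -> str:
    # _build_content_hash joins optional name/description parts and the
    # "{topic}-{reader}" token with ":" before hashing.
    parts = [name] if name else []
    parts.append(f"{topic}-{reader_class}")
    return hashlib.sha256(":".join(parts).encode()).hexdigest()

def generate_id_from_hash(content_hash: str) -> str:
    # Assumption: generate_id maps the hash onto a stable UUID.
    return str(uuid.uuid5(uuid.NAMESPACE_OID, content_hash))

print(generate_id_from_hash(topic_content_hash("quantum entanglement", "ArxivReader")))
```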
+    async def _load_from_remote_content_async(
+        self,
+        content: Content,
+        upsert: bool,
+        skip_if_exists: bool,
+    ):
+        if content.remote_content is None:
+            log_warning("No remote content provided for content")
+            return
+
+        remote_content = content.remote_content
+
+        if isinstance(remote_content, S3Content):
+            await self._load_from_s3_async(content, upsert, skip_if_exists)
+
+        elif isinstance(remote_content, GCSContent):
+            await self._load_from_gcs_async(content, upsert, skip_if_exists)
+
+        else:
+            log_warning(f"Unsupported remote content type: {type(remote_content)}")
+
+    async def _load_from_s3_async(self, content: Content, upsert: bool, skip_if_exists: bool):
+        """Load the contextual S3 content.
+
+        1. Identify objects to read
+        2. Setup Content object
+        3. Hash content and add it to the contents database
+        4. Select reader
+        5. Fetch and load the content
+        6. Read the content
+        7. Prepare and insert the content in the vector database
+        8. Remove temporary file if needed
+        """
+        from agno.cloud.aws.s3.object import S3Object
+
+        remote_content: S3Content = cast(S3Content, content.remote_content)
+
+        # 1. Identify objects to read
+        objects_to_read: List[S3Object] = []
+        if remote_content.bucket is not None:
+            if remote_content.key is not None:
+                _object = S3Object(bucket_name=remote_content.bucket.name, name=remote_content.key)
+                objects_to_read.append(_object)
+            elif remote_content.object is not None:
+                objects_to_read.append(remote_content.object)
+            elif remote_content.prefix is not None:
+                objects_to_read.extend(remote_content.bucket.get_objects(prefix=remote_content.prefix))
+            else:
+                objects_to_read.extend(remote_content.bucket.get_objects())
+
+        for s3_object in objects_to_read:
+            # 2. Setup Content object
+            content_name = content.name or ""
+            content_name += "_" + (s3_object.name or "")
+            content_entry = Content(
+                name=content_name,
+                description=content.description,
+                status=ContentStatus.PROCESSING,
+                metadata=content.metadata,
+                file_type="s3",
+            )
+
+            # 3. Hash content and add it to the contents database
+            content_entry.content_hash = self._build_content_hash(content_entry)
+            content_entry.id = generate_id(content_entry.content_hash)
+            await self._add_to_contents_db_async(content_entry)
+            if self._should_skip(content_entry.content_hash, skip_if_exists):
+                content_entry.status = ContentStatus.COMPLETED
+                await self._aupdate_content(content_entry)
+                return
+
+            # 4. Select reader
+            reader = self._select_reader_by_uri(s3_object.uri, content.reader)
+            reader = cast(Reader, reader)
+
+            # 5. Fetch and load the content
+            temporary_file = None
+            obj_name = content_name or s3_object.name.split("/")[-1]
+            readable_content: Optional[Union[BytesIO, Path]] = None
+            if s3_object.uri.endswith(".pdf"):
+                readable_content = BytesIO(s3_object.get_resource().get()["Body"].read())
+            else:
+                temporary_file = Path("storage").joinpath(obj_name)
+                readable_content = temporary_file
+                s3_object.download(readable_content)  # type: ignore
+
+            # 6. Read the content
+            read_documents = await reader.async_read(readable_content, name=obj_name)
+
+            # 7. Prepare and insert the content in the vector database
+            if not content.id:
+                content.id = generate_id(content.content_hash or "")
+            self._prepare_documents_for_insert(read_documents, content.id)
+            await self._handle_vector_db_insert_async(content_entry, read_documents, upsert)
+
+            # 8. Remove temporary file if needed
+            if temporary_file:
+                temporary_file.unlink()
+
+    async def _load_from_gcs_async(self, content: Content, upsert: bool, skip_if_exists: bool):
+        """Load the contextual GCS content.
+
+        1. Identify objects to read
+        2. Setup Content object
+        3. Hash content and add it to the contents database
+        4. Select reader
+        5. Fetch and load the content
+        6. Read the content
+        7. Prepare and insert the content in the vector database
+        """
+        remote_content: GCSContent = cast(GCSContent, content.remote_content)
+
+        # 1. Identify objects to read
+        objects_to_read = []
+        if remote_content.blob_name is not None:
+            objects_to_read.append(remote_content.bucket.blob(remote_content.blob_name))  # type: ignore
+        elif remote_content.prefix is not None:
+            objects_to_read.extend(remote_content.bucket.list_blobs(prefix=remote_content.prefix))  # type: ignore
+        else:
+            objects_to_read.extend(remote_content.bucket.list_blobs())  # type: ignore
+
+        for gcs_object in objects_to_read:
+            # 2. Setup Content object
+            name = (content.name or "content") + "_" + gcs_object.name
+            content_entry = Content(
+                name=name,
+                description=content.description,
+                status=ContentStatus.PROCESSING,
+                metadata=content.metadata,
+                file_type="gcs",
+            )
+
+            # 3. Hash content and add it to the contents database
+            content_entry.content_hash = self._build_content_hash(content_entry)
+            content_entry.id = generate_id(content_entry.content_hash)
+            await self._add_to_contents_db_async(content_entry)
+            if self._should_skip(content_entry.content_hash, skip_if_exists):
+                content_entry.status = ContentStatus.COMPLETED
+                await self._aupdate_content(content_entry)
+                return
+
+            # 4. Select reader
+            reader = self._select_reader_by_uri(gcs_object.name, content.reader)
+            reader = cast(Reader, reader)
+
+            # 5. Fetch and load the content
+            readable_content = BytesIO(gcs_object.download_as_bytes())
+
+            # 6. Read the content
+            read_documents = await reader.async_read(readable_content, name=name)
+
+            # 7. Prepare and insert the content in the vector database
+            if not content.id:
+                content.id = generate_id(content.content_hash or "")
+            self._prepare_documents_for_insert(read_documents, content.id)
+            await self._handle_vector_db_insert_async(content_entry, read_documents, upsert)
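A rough usage sketch for the remote-content path added above. The `S3Content`/`GCSContent` keyword arguments are inferred from the attributes the loaders read (`bucket`, `key`, `object`, `prefix`, `blob_name`) and may not match the real constructors exactly:

```python
# Hypothetical usage, inferred from the loader code above; import path assumed.
from agno.knowledge.remote_content import GCSContent, S3Content

def build_remote_sources(s3_bucket, gcs_bucket):
    # S3: a bucket plus one of key / object / prefix; with none of them,
    # every object in the bucket is read.
    s3_source = S3Content(bucket=s3_bucket, prefix="reports/2024/")
    # GCS: a bucket plus blob_name or prefix; otherwise all blobs are listed.
    gcs_source = GCSContent(bucket=gcs_bucket, blob_name="notes.pdf")
    # Each matched object becomes its own Content entry named
    # "{content.name}_{object name}" with file_type "s3" or "gcs".
    return s3_source, gcs_source
```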
 
-    async def _load_from_remote_content(
+    def _load_from_remote_content(
         self,
         content: Content,
         upsert: bool,
         skip_if_exists: bool,
     ):
+        """Synchronous version of _load_from_remote_content."""
         if content.remote_content is None:
             log_warning("No remote content provided for content")
             return
@@ -760,17 +1517,18 @@ class Knowledge:
         remote_content = content.remote_content
 
         if isinstance(remote_content, S3Content):
-            await self._load_from_s3(content, upsert, skip_if_exists)
+            self._load_from_s3(content, upsert, skip_if_exists)
 
         elif isinstance(remote_content, GCSContent):
-            await self._load_from_gcs(content, upsert, skip_if_exists)
+            self._load_from_gcs(content, upsert, skip_if_exists)
 
         else:
             log_warning(f"Unsupported remote content type: {type(remote_content)}")
 
-    async def _load_from_s3(self, content: Content, upsert: bool, skip_if_exists: bool):
-        """Load the contextual S3 content.
+    def _load_from_s3(self, content: Content, upsert: bool, skip_if_exists: bool):
+        """Synchronous version of _load_from_s3.
 
+        Load the contextual S3 content:
         1. Identify objects to read
         2. Setup Content object
         3. Hash content and add it to the contents database
@@ -819,20 +1577,7 @@ class Knowledge:
                 return
 
            # 4. Select reader
-            reader = content.reader
-            if reader is None:
-                if s3_object.uri.endswith(".pdf"):
-                    reader = self.pdf_reader
-                elif s3_object.uri.endswith(".csv"):
-                    reader = self.csv_reader
-                elif s3_object.uri.endswith(".docx"):
-                    reader = self.docx_reader
-                elif s3_object.uri.endswith(".json"):
-                    reader = self.json_reader
-                elif s3_object.uri.endswith(".markdown"):
-                    reader = self.markdown_reader
-                else:
-                    reader = self.text_reader
+            reader = self._select_reader_by_uri(s3_object.uri, content.reader)
            reader = cast(Reader, reader)
 
            # 5. Fetch and load the content
@@ -850,17 +1595,19 @@ class Knowledge:
            read_documents = reader.read(readable_content, name=obj_name)
 
            # 7. Prepare and insert the content in the vector database
-            for read_document in read_documents:
-                read_document.content_id = content.id
-            await self._handle_vector_db_insert(content_entry, read_documents, upsert)
+            if not content.id:
+                content.id = generate_id(content.content_hash or "")
+            self._prepare_documents_for_insert(read_documents, content.id)
+            self._handle_vector_db_insert(content_entry, read_documents, upsert)
 
            # 8. Remove temporary file if needed
            if temporary_file:
                temporary_file.unlink()
 
-    async def _load_from_gcs(self, content: Content, upsert: bool, skip_if_exists: bool):
-        """Load the contextual GCS content.
+    def _load_from_gcs(self, content: Content, upsert: bool, skip_if_exists: bool):
+        """Synchronous version of _load_from_gcs.
 
+        Load the contextual GCS content:
         1. Identify objects to read
         2. Setup Content object
         3. Hash content and add it to the contents database
@@ -901,20 +1648,7 @@ class Knowledge:
                return
 
            # 4. Select reader
-            reader = content.reader
-            if reader is None:
-                if gcs_object.name.endswith(".pdf"):
-                    reader = self.pdf_reader
-                elif gcs_object.name.endswith(".csv"):
-                    reader = self.csv_reader
-                elif gcs_object.name.endswith(".docx"):
-                    reader = self.docx_reader
-                elif gcs_object.name.endswith(".json"):
-                    reader = self.json_reader
-                elif gcs_object.name.endswith(".markdown"):
-                    reader = self.markdown_reader
-                else:
-                    reader = self.text_reader
+            reader = self._select_reader_by_uri(gcs_object.name, content.reader)
            reader = cast(Reader, reader)
 
            # 5. Fetch and load the content
@@ -924,11 +1658,12 @@ class Knowledge:
            read_documents = reader.read(readable_content, name=name)
 
            # 7. Prepare and insert the content in the vector database
-            for read_document in read_documents:
-                read_document.content_id = content.id
-            await self._handle_vector_db_insert(content_entry, read_documents, upsert)
+            if not content.id:
+                content.id = generate_id(content.content_hash or "")
+            self._prepare_documents_for_insert(read_documents, content.id)
+            self._handle_vector_db_insert(content_entry, read_documents, upsert)
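Both the sync and async S3/GCS paths now delegate the extension dispatch that was previously inlined (the removed `-` block above) to a shared `_select_reader_by_uri` helper. The helper itself is outside this excerpt; reconstructed from the removed chain, it plausibly looks like this:

```python
# Sketch reconstructed from the removed if/elif chain; the real helper may
# differ (it could, for instance, route through ReaderFactory instead).
def _select_reader_by_uri(self, uri: str, reader: Optional[Reader]) -> Optional[Reader]:
    if reader is not None:  # an explicitly provided reader always wins
        return reader
    if uri.endswith(".pdf"):
        return self.pdf_reader
    if uri.endswith(".csv"):
        return self.csv_reader
    if uri.endswith(".docx"):
        return self.docx_reader
    if uri.endswith(".json"):
        return self.json_reader
    if uri.endswith(".markdown"):
        return self.markdown_reader
    return self.text_reader  # fallback for everything else
```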
 
-    async def _handle_vector_db_insert(self, content: Content, read_documents, upsert):
+    async def _handle_vector_db_insert_async(self, content: Content, read_documents, upsert):
         from agno.vectordb import VectorDb
 
         self.vector_db = cast(VectorDb, self.vector_db)
@@ -937,7 +1672,7 @@ class Knowledge:
             log_error("No vector database configured")
             content.status = ContentStatus.FAILED
             content.status_message = "No vector database configured"
-            self._update_content(content)
+            await self._aupdate_content(content)
             return
 
         if self.vector_db.upsert_available() and upsert:
@@ -947,7 +1682,7 @@ class Knowledge:
                 log_error(f"Error upserting document: {e}")
                 content.status = ContentStatus.FAILED
                 content.status_message = "Could not upsert embedding"
-                self._update_content(content)
+                await self._aupdate_content(content)
                 return
         else:
             try:
@@ -956,6 +1691,45 @@ class Knowledge:
                     documents=read_documents,
                     filters=content.metadata,  # type: ignore[arg-type]
                 )
+            except Exception as e:
+                log_error(f"Error inserting document: {e}")
+                content.status = ContentStatus.FAILED
+                content.status_message = "Could not insert embedding"
+                await self._aupdate_content(content)
+                return
+
+        content.status = ContentStatus.COMPLETED
+        await self._aupdate_content(content)
+
+    def _handle_vector_db_insert(self, content: Content, read_documents, upsert):
+        """Synchronously handle vector database insertion."""
+        from agno.vectordb import VectorDb
+
+        self.vector_db = cast(VectorDb, self.vector_db)
+
+        if not self.vector_db:
+            log_error("No vector database configured")
+            content.status = ContentStatus.FAILED
+            content.status_message = "No vector database configured"
+            self._update_content(content)
+            return
+
+        if self.vector_db.upsert_available() and upsert:
+            try:
+                self.vector_db.upsert(content.content_hash, read_documents, content.metadata)  # type: ignore[arg-type]
+            except Exception as e:
+                log_error(f"Error upserting document: {e}")
+                content.status = ContentStatus.FAILED
+                content.status_message = "Could not upsert embedding"
+                self._update_content(content)
+                return
+        else:
+            try:
+                self.vector_db.insert(
+                    content.content_hash,  # type: ignore[arg-type]
+                    documents=read_documents,
+                    filters=content.metadata,  # type: ignore[arg-type]
+                )
             except Exception as e:
                 log_error(f"Error inserting document: {e}")
                 content.status = ContentStatus.FAILED
@@ -966,7 +1740,7 @@ class Knowledge:
         content.status = ContentStatus.COMPLETED
         self._update_content(content)
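The two insert handlers above share one decision: upsert only when the backend supports it and the caller requested it, otherwise plain insert, and any exception marks the content FAILED rather than propagating. Condensed into a standalone sketch:

```python
# Condensed control flow of _handle_vector_db_insert (names from the diff).
def write_documents(vector_db, content, read_documents, upsert: bool) -> bool:
    try:
        if vector_db.upsert_available() and upsert:
            vector_db.upsert(content.content_hash, read_documents, content.metadata)
        else:
            vector_db.insert(content.content_hash, documents=read_documents, filters=content.metadata)
    except Exception:
        return False  # caller sets content.status = ContentStatus.FAILED
    return True       # caller sets content.status = ContentStatus.COMPLETED
```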
 
-    async def _load_content(
+    def _load_content(
         self,
         content: Content,
         upsert: bool,
@@ -974,42 +1748,93 @@ class Knowledge:
         include: Optional[List[str]] = None,
         exclude: Optional[List[str]] = None,
     ) -> None:
-        log_info(f"Loading content: {content.id}")
+        """Synchronously load content."""
+        if content.path:
+            self._load_from_path(content, upsert, skip_if_exists, include, exclude)
+
+        if content.url:
+            self._load_from_url(content, upsert, skip_if_exists)
+
+        if content.file_data:
+            self._load_from_content(content, upsert, skip_if_exists)
+
+        if content.topics:
+            self._load_from_topics(content, upsert, skip_if_exists)
 
-        if content.metadata:
-            self.add_filters(content.metadata)
+        if content.remote_content:
+            self._load_from_remote_content(content, upsert, skip_if_exists)
 
+    async def _load_content_async(
+        self,
+        content: Content,
+        upsert: bool,
+        skip_if_exists: bool,
+        include: Optional[List[str]] = None,
+        exclude: Optional[List[str]] = None,
+    ) -> None:
         if content.path:
-            await self._load_from_path(content, upsert, skip_if_exists, include, exclude)
+            await self._load_from_path_async(content, upsert, skip_if_exists, include, exclude)
 
         if content.url:
-            await self._load_from_url(content, upsert, skip_if_exists)
+            await self._load_from_url_async(content, upsert, skip_if_exists)
 
         if content.file_data:
-            await self._load_from_content(content, upsert, skip_if_exists)
+            await self._load_from_content_async(content, upsert, skip_if_exists)
 
         if content.topics:
-            await self._load_from_topics(content, upsert, skip_if_exists)
+            await self._load_from_topics_async(content, upsert, skip_if_exists)
 
         if content.remote_content:
-            await self._load_from_remote_content(content, upsert, skip_if_exists)
+            await self._load_from_remote_content_async(content, upsert, skip_if_exists)
 
     def _build_content_hash(self, content: Content) -> str:
         """
         Build the content hash from the content.
+
+        For URLs and paths, includes the name and description in the hash if provided
+        to ensure unique content with the same URL/path but different names/descriptions
+        get different hashes.
+
+        Hash format:
+        - URL with name and description: hash("{name}:{description}:{url}")
+        - URL with name only: hash("{name}:{url}")
+        - URL with description only: hash("{description}:{url}")
+        - URL without name/description: hash("{url}") (backward compatible)
+        - Same logic applies to paths
         """
+        hash_parts = []
+        if content.name:
+            hash_parts.append(content.name)
+        if content.description:
+            hash_parts.append(content.description)
+
         if content.path:
-            return hashlib.sha256(str(content.path).encode()).hexdigest()
+            hash_parts.append(str(content.path))
         elif content.url:
-            hash = hashlib.sha256(content.url.encode()).hexdigest()
-            return hash
+            hash_parts.append(content.url)
         elif content.file_data and content.file_data.content:
-            name = content.name or "content"
-            return hashlib.sha256(name.encode()).hexdigest()
+            # For file_data, always add filename, type, size, or content for uniqueness
+            if content.file_data.filename:
+                hash_parts.append(content.file_data.filename)
+            elif content.file_data.type:
+                hash_parts.append(content.file_data.type)
+            elif content.file_data.size is not None:
+                hash_parts.append(str(content.file_data.size))
+            else:
+                # Fallback: use the content for uniqueness
+                # Include type information to distinguish str vs bytes
+                content_type = "str" if isinstance(content.file_data.content, str) else "bytes"
+                content_bytes = (
+                    content.file_data.content.encode()
+                    if isinstance(content.file_data.content, str)
+                    else content.file_data.content
+                )
+                content_hash = hashlib.sha256(content_bytes).hexdigest()[:16]  # Use first 16 chars
+                hash_parts.append(f"{content_type}:{content_hash}")
         elif content.topics and len(content.topics) > 0:
             topic = content.topics[0]
             reader = type(content.reader).__name__ if content.reader else "unknown"
-            return hashlib.sha256(f"{topic}-{reader}".encode()).hexdigest()
+            hash_parts.append(f"{topic}-{reader}")
         else:
             # Fallback for edge cases
             import random
@@ -1020,7 +1845,10 @@ class Knowledge:
                 or content.id
                 or ("unknown_content" + "".join(random.choices(string.ascii_lowercase + string.digits, k=6)))
             )
-            return hashlib.sha256(fallback.encode()).hexdigest()
+            hash_parts.append(fallback)
+
+        hash_input = ":".join(hash_parts)
+        return hashlib.sha256(hash_input.encode()).hexdigest()
1852
 
1025
1853
  def _ensure_string_field(self, value: Any, field_name: str, default: str = "") -> str:
1026
1854
  """
@@ -1064,8 +1892,57 @@ class Knowledge:
1064
1892
  # Already a string, return as-is
1065
1893
  return value
1066
1894
 
1895
+ async def _add_to_contents_db_async(self, content: Content):
1896
+ if self.contents_db:
1897
+ created_at = content.created_at if content.created_at else int(time.time())
1898
+ updated_at = content.updated_at if content.updated_at else int(time.time())
1899
+
1900
+ file_type = (
1901
+ content.file_type
1902
+ if content.file_type
1903
+ else content.file_data.type
1904
+ if content.file_data and content.file_data.type
1905
+ else None
1906
+ )
1907
+ # Safely handle string fields with proper type checking
1908
+ safe_name = self._ensure_string_field(content.name, "content.name", default="")
1909
+ safe_description = self._ensure_string_field(content.description, "content.description", default="")
1910
+ safe_linked_to = self._ensure_string_field(self.name, "knowledge.name", default="")
1911
+ safe_status_message = self._ensure_string_field(
1912
+ content.status_message, "content.status_message", default=""
1913
+ )
1914
+
1915
+ content_row = KnowledgeRow(
1916
+ id=content.id,
1917
+ name=safe_name,
1918
+ description=safe_description,
1919
+ metadata=content.metadata,
1920
+ type=file_type,
1921
+ size=content.size
1922
+ if content.size
1923
+ else len(content.file_data.content)
1924
+ if content.file_data and content.file_data.content
1925
+ else None,
1926
+ linked_to=safe_linked_to,
1927
+ access_count=0,
1928
+ status=content.status if content.status else ContentStatus.PROCESSING,
1929
+ status_message=safe_status_message,
1930
+ created_at=created_at,
1931
+ updated_at=updated_at,
1932
+ )
1933
+ if isinstance(self.contents_db, AsyncBaseDb):
1934
+ await self.contents_db.upsert_knowledge_content(knowledge_row=content_row)
1935
+ else:
1936
+ self.contents_db.upsert_knowledge_content(knowledge_row=content_row)
1937
+
1067
1938
  def _add_to_contents_db(self, content: Content):
1939
+ """Synchronously add content to contents database."""
1068
1940
  if self.contents_db:
1941
+ if isinstance(self.contents_db, AsyncBaseDb):
1942
+ raise ValueError(
1943
+ "_add_to_contents_db() is not supported with an async DB. Please use add_content_async with AsyncDb."
1944
+ )
1945
+
1069
1946
  created_at = content.created_at if content.created_at else int(time.time())
1070
1947
  updated_at = content.updated_at if content.updated_at else int(time.time())
1071
1948
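`_add_to_contents_db_async` serves both backends by branching on `isinstance(self.contents_db, AsyncBaseDb)`, while the sync `_add_to_contents_db` rejects async backends outright, since it has no event loop to await on. The same pattern repeats throughout the methods below; in miniature:

```python
# Miniature of the dual-backend dispatch used throughout this file.
from agno.db.base import AsyncBaseDb  # assumed location of AsyncBaseDb

async def save_row(db, row):
    if isinstance(db, AsyncBaseDb):
        await db.upsert_knowledge_content(knowledge_row=row)  # async driver
    else:
        db.upsert_knowledge_content(knowledge_row=row)        # blocking driver

def save_row_sync(db, row):
    if isinstance(db, AsyncBaseDb):
        # No loop to await on here, so fail loudly instead of blocking.
        raise ValueError("sync API is not supported with an async DB")
    db.upsert_knowledge_content(knowledge_row=row)
```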
 
@@ -1108,55 +1985,261 @@ class Knowledge:
         from agno.vectordb import VectorDb
 
         self.vector_db = cast(VectorDb, self.vector_db)
+        if self.contents_db:
+            if isinstance(self.contents_db, AsyncBaseDb):
+                raise ValueError(
+                    "update_content() is not supported with an async DB. Please use aupdate_content() instead."
+                )
+
+            if not content.id:
+                log_warning("Content id is required to update Knowledge content")
+                return None
+
+            # TODO: we shouldn't check for content here, we should trust the upsert method to handle conflicts
+            content_row = self.contents_db.get_knowledge_content(content.id)
+            if content_row is None:
+                log_warning(f"Content row not found for id: {content.id}, cannot update status")
+                return None
+
+            # Apply safe string handling for updates as well
+            if content.name is not None:
+                content_row.name = self._ensure_string_field(content.name, "content.name", default="")
+            if content.description is not None:
+                content_row.description = self._ensure_string_field(
+                    content.description, "content.description", default=""
+                )
+            if content.metadata is not None:
+                content_row.metadata = content.metadata
+            if content.status is not None:
+                content_row.status = content.status
+            if content.status_message is not None:
+                content_row.status_message = self._ensure_string_field(
+                    content.status_message, "content.status_message", default=""
+                )
+            if content.external_id is not None:
+                content_row.external_id = self._ensure_string_field(
+                    content.external_id, "content.external_id", default=""
+                )
+            content_row.updated_at = int(time.time())
+            self.contents_db.upsert_knowledge_content(knowledge_row=content_row)
+
+            if self.vector_db:
+                self.vector_db.update_metadata(content_id=content.id, metadata=content.metadata or {})
+
+            return content_row.to_dict()
+
+        else:
+            if self.name:
+                log_warning(f"Contents DB not found for knowledge base: {self.name}")
+            else:
+                log_warning("Contents DB not found for knowledge base")
+            return None
+
+    async def _aupdate_content(self, content: Content) -> Optional[Dict[str, Any]]:
         if self.contents_db:
             if not content.id:
                 log_warning("Content id is required to update Knowledge content")
                 return None
 
-            # TODO: we shouldn't check for content here, we should trust the upsert method to handle conflicts
-            content_row = self.contents_db.get_knowledge_content(content.id)
-            if content_row is None:
-                log_warning(f"Content row not found for id: {content.id}, cannot update status")
-                return None
+            # TODO: we shouldn't check for content here, we should trust the upsert method to handle conflicts
+            if isinstance(self.contents_db, AsyncBaseDb):
+                content_row = await self.contents_db.get_knowledge_content(content.id)
+            else:
+                content_row = self.contents_db.get_knowledge_content(content.id)
+            if content_row is None:
+                log_warning(f"Content row not found for id: {content.id}, cannot update status")
+                return None
+
+            if content.name is not None:
+                content_row.name = content.name
+            if content.description is not None:
+                content_row.description = content.description
+            if content.metadata is not None:
+                content_row.metadata = content.metadata
+            if content.status is not None:
+                content_row.status = content.status
+            if content.status_message is not None:
+                content_row.status_message = content.status_message if content.status_message else ""
+            if content.external_id is not None:
+                content_row.external_id = content.external_id
+
+            content_row.updated_at = int(time.time())
+            if isinstance(self.contents_db, AsyncBaseDb):
+                await self.contents_db.upsert_knowledge_content(knowledge_row=content_row)
+            else:
+                self.contents_db.upsert_knowledge_content(knowledge_row=content_row)
+
+            if self.vector_db:
+                self.vector_db.update_metadata(content_id=content.id, metadata=content.metadata or {})
+
+            return content_row.to_dict()
+
+        else:
+            if self.name:
+                log_warning(f"Contents DB not found for knowledge base: {self.name}")
+            else:
+                log_warning("Contents DB not found for knowledge base")
+            return None
+
+    async def _process_lightrag_content_async(self, content: Content, content_type: KnowledgeContentOrigin) -> None:
+        from agno.vectordb import VectorDb
+
+        self.vector_db = cast(VectorDb, self.vector_db)
+
+        await self._add_to_contents_db_async(content)
+        if content_type == KnowledgeContentOrigin.PATH:
+            if content.file_data is None:
+                log_warning("No file data provided")
+
+            if content.path is None:
+                log_error("No path provided for content")
+                return
+
+            path = Path(content.path)
+
+            log_info(f"Uploading file to LightRAG from path: {path}")
+            try:
+                # Read the file content from path
+                with open(path, "rb") as f:
+                    file_content = f.read()
+
+                # Get file type from extension or content.file_type
+                file_type = content.file_type or path.suffix
+
+                if self.vector_db and hasattr(self.vector_db, "insert_file_bytes"):
+                    result = await self.vector_db.insert_file_bytes(
+                        file_content=file_content,
+                        filename=path.name,  # Use the original filename with extension
+                        content_type=file_type,
+                        send_metadata=True,  # Enable metadata so server knows the file type
+                    )
+
+                else:
+                    log_error("Vector database does not support file insertion")
+                    content.status = ContentStatus.FAILED
+                    await self._aupdate_content(content)
+                    return
+                content.external_id = result
+                content.status = ContentStatus.COMPLETED
+                await self._aupdate_content(content)
+                return
+
+            except Exception as e:
+                log_error(f"Error uploading file to LightRAG: {e}")
+                content.status = ContentStatus.FAILED
+                content.status_message = f"Could not upload to LightRAG: {str(e)}"
+                await self._aupdate_content(content)
+                return
+
+        elif content_type == KnowledgeContentOrigin.URL:
+            log_info(f"Uploading file to LightRAG from URL: {content.url}")
+            try:
+                reader = content.reader or self.website_reader
+                if reader is None:
+                    log_error("No URL reader available")
+                    content.status = ContentStatus.FAILED
+                    await self._aupdate_content(content)
+                    return
+
+                reader.chunk = False
+                read_documents = reader.read(content.url, name=content.name)
+                if not content.id:
+                    content.id = generate_id(content.content_hash or "")
+                self._prepare_documents_for_insert(read_documents, content.id)
+
+                if not read_documents:
+                    log_error("No documents read from URL")
+                    content.status = ContentStatus.FAILED
+                    await self._aupdate_content(content)
+                    return
+
+                if self.vector_db and hasattr(self.vector_db, "insert_text"):
+                    result = await self.vector_db.insert_text(
+                        file_source=content.url,
+                        text=read_documents[0].content,
+                    )
+                else:
+                    log_error("Vector database does not support text insertion")
+                    content.status = ContentStatus.FAILED
+                    await self._aupdate_content(content)
+                    return
+
+                content.external_id = result
+                content.status = ContentStatus.COMPLETED
+                await self._aupdate_content(content)
+                return
+
+            except Exception as e:
+                log_error(f"Error uploading file to LightRAG: {e}")
+                content.status = ContentStatus.FAILED
+                content.status_message = f"Could not upload to LightRAG: {str(e)}"
+                await self._aupdate_content(content)
+                return
+
+        elif content_type == KnowledgeContentOrigin.CONTENT:
+            filename = (
+                content.file_data.filename if content.file_data and content.file_data.filename else "uploaded_file"
+            )
+            log_info(f"Uploading file to LightRAG: {filename}")
 
-            # Apply safe string handling for updates as well
-            if content.name is not None:
-                content_row.name = self._ensure_string_field(content.name, "content.name", default="")
-            if content.description is not None:
-                content_row.description = self._ensure_string_field(
-                    content.description, "content.description", default=""
-                )
-            if content.metadata is not None:
-                content_row.metadata = content.metadata
-            if content.status is not None:
-                content_row.status = content.status
-            if content.status_message is not None:
-                content_row.status_message = self._ensure_string_field(
-                    content.status_message, "content.status_message", default=""
-                )
-            if content.external_id is not None:
-                content_row.external_id = self._ensure_string_field(
-                    content.external_id, "content.external_id", default=""
-                )
-            content_row.updated_at = int(time.time())
-            self.contents_db.upsert_knowledge_content(knowledge_row=content_row)
+            # Use the content from file_data
+            if content.file_data and content.file_data.content:
+                if self.vector_db and hasattr(self.vector_db, "insert_file_bytes"):
+                    result = await self.vector_db.insert_file_bytes(
+                        file_content=content.file_data.content,
+                        filename=filename,
+                        content_type=content.file_data.type,
+                        send_metadata=True,  # Enable metadata so server knows the file type
+                    )
+                else:
+                    log_error("Vector database does not support file insertion")
+                    content.status = ContentStatus.FAILED
+                    await self._aupdate_content(content)
+                    return
+                content.external_id = result
+                content.status = ContentStatus.COMPLETED
+                await self._aupdate_content(content)
+            else:
+                log_warning(f"No file data available for LightRAG upload: {content.name}")
+                return
 
-            if self.vector_db and content.metadata:
-                self.vector_db.update_metadata(content_id=content.id, metadata=content.metadata)
+        elif content_type == KnowledgeContentOrigin.TOPIC:
+            log_info(f"Uploading file to LightRAG: {content.name}")
 
-            if content.metadata:
-                self.add_filters(content.metadata)
+            if content.reader is None:
+                log_error("No reader available for topic content")
+                content.status = ContentStatus.FAILED
+                await self._aupdate_content(content)
+                return
 
-            return content_row.to_dict()
+            if not content.topics:
+                log_error("No topics available for content")
+                content.status = ContentStatus.FAILED
+                await self._aupdate_content(content)
+                return
 
-        else:
-            if self.name:
-                log_warning(f"Contents DB not found for knowledge base: {self.name}")
+            read_documents = content.reader.read(content.topics)
+            if len(read_documents) > 0:
+                if self.vector_db and hasattr(self.vector_db, "insert_text"):
+                    result = await self.vector_db.insert_text(
+                        file_source=content.topics[0],
+                        text=read_documents[0].content,
+                    )
+                else:
+                    log_error("Vector database does not support text insertion")
+                    content.status = ContentStatus.FAILED
+                    await self._aupdate_content(content)
+                    return
+                content.external_id = result
+                content.status = ContentStatus.COMPLETED
+                await self._aupdate_content(content)
+                return
             else:
-                log_warning("Contents DB not found for knowledge base")
-                return None
+                log_warning(f"No documents found for LightRAG upload: {content.name}")
+                return
 
-    async def _process_lightrag_content(self, content: Content, content_type: KnowledgeContentOrigin) -> None:
+    def _process_lightrag_content(self, content: Content, content_type: KnowledgeContentOrigin) -> None:
+        """Synchronously process LightRAG content. Uses asyncio.run() only for LightRAG-specific async methods."""
         from agno.vectordb import VectorDb
 
         self.vector_db = cast(VectorDb, self.vector_db)
@@ -1182,13 +2265,15 @@ class Knowledge:
                 file_type = content.file_type or path.suffix
 
                 if self.vector_db and hasattr(self.vector_db, "insert_file_bytes"):
-                    result = await self.vector_db.insert_file_bytes(
-                        file_content=file_content,
-                        filename=path.name,  # Use the original filename with extension
-                        content_type=file_type,
-                        send_metadata=True,  # Enable metadata so server knows the file type
+                    # LightRAG only has async methods, use asyncio.run() here
+                    result = asyncio.run(
+                        self.vector_db.insert_file_bytes(
+                            file_content=file_content,
+                            filename=path.name,
+                            content_type=file_type,
+                            send_metadata=True,
+                        )
                     )
-
                 else:
                     log_error("Vector database does not support file insertion")
                     content.status = ContentStatus.FAILED
@@ -1218,9 +2303,9 @@ class Knowledge:
 
                reader.chunk = False
                read_documents = reader.read(content.url, name=content.name)
-
-                for read_document in read_documents:
-                    read_document.content_id = content.id
+                if not content.id:
+                    content.id = generate_id(content.content_hash or "")
+                self._prepare_documents_for_insert(read_documents, content.id)
 
                if not read_documents:
                    log_error("No documents read from URL")
@@ -1229,9 +2314,12 @@ class Knowledge:
                    return
 
                if self.vector_db and hasattr(self.vector_db, "insert_text"):
-                    result = await self.vector_db.insert_text(
-                        file_source=content.url,
-                        text=read_documents[0].content,
+                    # LightRAG only has async methods, use asyncio.run() here
+                    result = asyncio.run(
+                        self.vector_db.insert_text(
+                            file_source=content.url,
+                            text=read_documents[0].content,
+                        )
                    )
                else:
                    log_error("Vector database does not support text insertion")
@@ -1260,11 +2348,14 @@ class Knowledge:
            # Use the content from file_data
            if content.file_data and content.file_data.content:
                if self.vector_db and hasattr(self.vector_db, "insert_file_bytes"):
-                    result = await self.vector_db.insert_file_bytes(
-                        file_content=content.file_data.content,
-                        filename=filename,
-                        content_type=content.file_data.type,
-                        send_metadata=True,  # Enable metadata so server knows the file type
+                    # LightRAG only has async methods, use asyncio.run() here
+                    result = asyncio.run(
+                        self.vector_db.insert_file_bytes(
+                            file_content=content.file_data.content,
+                            filename=filename,
+                            content_type=content.file_data.type,
+                            send_metadata=True,
+                        )
                    )
                else:
                    log_error("Vector database does not support file insertion")
@@ -1296,9 +2387,12 @@ class Knowledge:
            read_documents = content.reader.read(content.topics)
            if len(read_documents) > 0:
                if self.vector_db and hasattr(self.vector_db, "insert_text"):
-                    result = await self.vector_db.insert_text(
-                        file_source=content.topics[0],
-                        text=read_documents[0].content,
+                    # LightRAG only has async methods, use asyncio.run() here
+                    result = asyncio.run(
+                        self.vector_db.insert_text(
+                            file_source=content.topics[0],
+                            text=read_documents[0].content,
+                        )
                    )
                else:
                    log_error("Vector database does not support text insertion")
@@ -1314,13 +2408,24 @@ class Knowledge:
                 return
 
     def search(
-        self, query: str, max_results: Optional[int] = None, filters: Optional[Dict[str, Any]] = None
+        self,
+        query: str,
+        max_results: Optional[int] = None,
+        filters: Optional[Union[Dict[str, Any], List[FilterExpr]]] = None,
+        search_type: Optional[str] = None,
     ) -> List[Document]:
         """Returns relevant documents matching a query"""
-
         from agno.vectordb import VectorDb
+        from agno.vectordb.search import SearchType
 
         self.vector_db = cast(VectorDb, self.vector_db)
+
+        if (
+            hasattr(self.vector_db, "search_type")
+            and isinstance(self.vector_db.search_type, SearchType)
+            and search_type
+        ):
+            self.vector_db.search_type = SearchType(search_type)
         try:
             if self.vector_db is None:
                 log_warning("No vector db provided")
@@ -1334,13 +2439,23 @@ class Knowledge:
             return []
 
     async def async_search(
-        self, query: str, max_results: Optional[int] = None, filters: Optional[Dict[str, Any]] = None
+        self,
+        query: str,
+        max_results: Optional[int] = None,
+        filters: Optional[Union[Dict[str, Any], List[FilterExpr]]] = None,
+        search_type: Optional[str] = None,
     ) -> List[Document]:
         """Returns relevant documents matching a query"""
-
         from agno.vectordb import VectorDb
+        from agno.vectordb.search import SearchType
 
         self.vector_db = cast(VectorDb, self.vector_db)
+        if (
+            hasattr(self.vector_db, "search_type")
+            and isinstance(self.vector_db.search_type, SearchType)
+            and search_type
+        ):
+            self.vector_db.search_type = SearchType(search_type)
         try:
             if self.vector_db is None:
                 log_warning("No vector db provided")
@@ -1358,57 +2473,90 @@ class Knowledge:
             return []
 
     def get_valid_filters(self) -> Set[str]:
-        if self.valid_metadata_filters is None:
-            self.valid_metadata_filters = set()
-        self.valid_metadata_filters.update(self._get_filters_from_db)
-        return self.valid_metadata_filters
+        if self.contents_db is None:
+            log_warning("No contents db provided. This is required for filtering.")
+            return set()
+        contents, _ = self.get_content()
+        valid_filters: Set[str] = set()
+        for content in contents:
+            if content.metadata:
+                valid_filters.update(content.metadata.keys())
+
+        return valid_filters
+
+    async def async_get_valid_filters(self) -> Set[str]:
+        if self.contents_db is None:
+            log_warning("No contents db provided. This is required for filtering.")
+            return set()
+        contents, _ = await self.aget_content()
+        valid_filters: Set[str] = set()
+        for content in contents:
+            if content.metadata:
+                valid_filters.update(content.metadata.keys())
 
-    def validate_filters(self, filters: Optional[Dict[str, Any]]) -> Tuple[Dict[str, Any], List[str]]:
-        if self.valid_metadata_filters is None:
-            self.valid_metadata_filters = set()
-            self.valid_metadata_filters.update(self._get_filters_from_db)
+        return valid_filters
 
+    def _validate_filters(
+        self, filters: Union[Dict[str, Any], List[FilterExpr]], valid_metadata_filters: Set[str]
+    ) -> Tuple[Union[Dict[str, Any], List[FilterExpr]], List[str]]:
         if not filters:
             return {}, []
 
-        valid_filters: Dict[str, Any] = {}
+        valid_filters: Union[Dict[str, Any], List[FilterExpr]] = {}
         invalid_keys = []
 
-        # If no metadata filters tracked yet, all keys are considered invalid
-        if self.valid_metadata_filters is None:
-            invalid_keys = list(filters.keys())
-            log_debug(f"No valid metadata filters tracked yet. All filter keys considered invalid: {invalid_keys}")
-            return {}, invalid_keys
-
-        for key, value in filters.items():
-            # Handle both normal keys and prefixed keys like meta_data.key
-            base_key = key.split(".")[-1] if "." in key else key
-            if base_key in self.valid_metadata_filters or key in self.valid_metadata_filters:
-                valid_filters[key] = value
-            else:
-                invalid_keys.append(key)
-                log_debug(f"Invalid filter key: {key} - not present in knowledge base")
+        if isinstance(filters, dict):
+            # If no metadata filters tracked yet, all keys are considered invalid
+            if valid_metadata_filters is None or not valid_metadata_filters:
+                invalid_keys = list(filters.keys())
+                log_warning(
+                    f"No valid metadata filters tracked yet. All filter keys considered invalid: {invalid_keys}"
+                )
+                return {}, invalid_keys
+
+            for key, value in filters.items():
+                # Handle both normal keys and prefixed keys like meta_data.key
+                base_key = key.split(".")[-1] if "." in key else key
+                if base_key in valid_metadata_filters or key in valid_metadata_filters:
+                    valid_filters[key] = value  # type: ignore
+                else:
+                    invalid_keys.append(key)
+                    log_warning(f"Invalid filter key: {key} - not present in knowledge base")
+
+        elif isinstance(filters, List):
+            # Validate that list contains FilterExpr instances
+            for i, filter_item in enumerate(filters):
+                if not isinstance(filter_item, FilterExpr):
+                    log_warning(
+                        f"Invalid filter at index {i}: expected FilterExpr instance, "
+                        f"got {type(filter_item).__name__}. "
+                        f"Use filter expressions like EQ('key', 'value'), IN('key', [values]), "
+                        f"AND(...), OR(...), NOT(...) from agno.filters"
+                    )
+            # Filter expressions are already validated, return empty dict/list
+            # The actual filtering happens in the vector_db layer
+            return filters, []
 
         return valid_filters, invalid_keys
 
-    def add_filters(self, metadata: Dict[str, Any]) -> None:
-        if self.valid_metadata_filters is None:
-            self.valid_metadata_filters = set()
+    def validate_filters(
+        self, filters: Union[Dict[str, Any], List[FilterExpr]]
+    ) -> Tuple[Union[Dict[str, Any], List[FilterExpr]], List[str]]:
+        valid_filters_from_db = self.get_valid_filters()
 
-        if metadata is not None:
-            for key in metadata.keys():
-                self.valid_metadata_filters.add(key)
+        valid_filters, invalid_keys = self._validate_filters(filters, valid_filters_from_db)
 
-    @cached_property
-    def _get_filters_from_db(self) -> Set[str]:
-        if self.contents_db is None:
-            return set()
-        contents, _ = self.get_content()
-        valid_filters: Set[str] = set()
-        for content in contents:
-            if content.metadata:
-                valid_filters.update(content.metadata.keys())
-        return valid_filters
+        return valid_filters, invalid_keys
+
+    async def async_validate_filters(
+        self, filters: Union[Dict[str, Any], List[FilterExpr]]
+    ) -> Tuple[Union[Dict[str, Any], List[FilterExpr]], List[str]]:
+        """Return a tuple containing a dict with all valid filters and a list of invalid filter keys"""
+        valid_filters_from_db = await self.async_get_valid_filters()
+
+        valid_filters, invalid_keys = self._validate_filters(filters, valid_filters_from_db)
+
+        return valid_filters, invalid_keys
2560
 
1413
2561
  def remove_vector_by_id(self, id: str) -> bool:
1414
2562
  from agno.vectordb import VectorDb
@@ -1442,10 +2590,46 @@ class Knowledge:
1442
2590
  def patch_content(self, content: Content) -> Optional[Dict[str, Any]]:
1443
2591
  return self._update_content(content)
1444
2592
 
2593
+ async def apatch_content(self, content: Content) -> Optional[Dict[str, Any]]:
2594
+ return await self._aupdate_content(content)
2595
+
1445
2596
  def get_content_by_id(self, content_id: str) -> Optional[Content]:
1446
2597
  if self.contents_db is None:
1447
2598
  raise ValueError("No contents db provided")
2599
+
2600
+ if isinstance(self.contents_db, AsyncBaseDb):
2601
+ raise ValueError(
2602
+ "get_content_by_id() is not supported for async databases. Please use aget_content_by_id() instead."
2603
+ )
2604
+
1448
2605
  content_row = self.contents_db.get_knowledge_content(content_id)
2606
+
2607
+ if content_row is None:
2608
+ return None
2609
+ content = Content(
2610
+ id=content_row.id,
2611
+ name=content_row.name,
2612
+ description=content_row.description,
2613
+ metadata=content_row.metadata,
2614
+ file_type=content_row.type,
2615
+ size=content_row.size,
2616
+ status=ContentStatus(content_row.status) if content_row.status else None,
2617
+ status_message=content_row.status_message,
2618
+ created_at=content_row.created_at,
2619
+ updated_at=content_row.updated_at if content_row.updated_at else content_row.created_at,
2620
+ external_id=content_row.external_id,
2621
+ )
2622
+ return content
2623
+
2624
+ async def aget_content_by_id(self, content_id: str) -> Optional[Content]:
2625
+ if self.contents_db is None:
2626
+ raise ValueError("No contents db provided")
2627
+
2628
+ if isinstance(self.contents_db, AsyncBaseDb):
2629
+ content_row = await self.contents_db.get_knowledge_content(content_id)
2630
+ else:
2631
+ content_row = self.contents_db.get_knowledge_content(content_id)
2632
+
1449
2633
  if content_row is None:
1450
2634
  return None
1451
2635
  content = Content(
@@ -1472,6 +2656,10 @@ class Knowledge:
1472
2656
  ) -> Tuple[List[Content], int]:
1473
2657
  if self.contents_db is None:
1474
2658
  raise ValueError("No contents db provided")
2659
+
2660
+ if isinstance(self.contents_db, AsyncBaseDb):
2661
+ raise ValueError("get_content() is not supported for async databases. Please use aget_content() instead.")
2662
+
1475
2663
  contents, count = self.contents_db.get_knowledge_contents(
1476
2664
  limit=limit, page=page, sort_by=sort_by, sort_order=sort_order
1477
2665
  )
@@ -1495,9 +2683,53 @@ class Knowledge:
1495
2683
  result.append(content)
1496
2684
  return result, count
1497
2685
 
2686
+ async def aget_content(
2687
+ self,
2688
+ limit: Optional[int] = None,
2689
+ page: Optional[int] = None,
2690
+ sort_by: Optional[str] = None,
2691
+ sort_order: Optional[str] = None,
2692
+ ) -> Tuple[List[Content], int]:
2693
+ if self.contents_db is None:
2694
+ raise ValueError("No contents db provided")
2695
+
2696
+ if isinstance(self.contents_db, AsyncBaseDb):
2697
+ contents, count = await self.contents_db.get_knowledge_contents(
2698
+ limit=limit, page=page, sort_by=sort_by, sort_order=sort_order
2699
+ )
2700
+ else:
2701
+ contents, count = self.contents_db.get_knowledge_contents(
2702
+ limit=limit, page=page, sort_by=sort_by, sort_order=sort_order
2703
+ )
2704
+
2705
+ result = []
2706
+ for content_row in contents:
2707
+ # Create Content from database row
2708
+ content = Content(
2709
+ id=content_row.id,
2710
+ name=content_row.name,
2711
+ description=content_row.description,
2712
+ metadata=content_row.metadata,
2713
+ size=content_row.size,
2714
+ file_type=content_row.type,
2715
+ status=ContentStatus(content_row.status) if content_row.status else None,
2716
+ status_message=content_row.status_message,
2717
+ created_at=content_row.created_at,
2718
+ updated_at=content_row.updated_at if content_row.updated_at else content_row.created_at,
2719
+ external_id=content_row.external_id,
2720
+ )
2721
+ result.append(content)
2722
+ return result, count
2723
+
1498
2724
  def get_content_status(self, content_id: str) -> Tuple[Optional[ContentStatus], Optional[str]]:
1499
2725
  if self.contents_db is None:
1500
2726
  raise ValueError("No contents db provided")
2727
+
2728
+ if isinstance(self.contents_db, AsyncBaseDb):
2729
+ raise ValueError(
2730
+ "get_content_status() is not supported for async databases. Please use aget_content_status() instead."
2731
+ )
2732
+
1501
2733
  content_row = self.contents_db.get_knowledge_content(content_id)
1502
2734
  if content_row is None:
1503
2735
  return None, "Content not found"
@@ -1517,6 +2749,33 @@ class Knowledge:
1517
2749
 
1518
2750
  return status, content_row.status_message
1519
2751
 
2752
+ async def aget_content_status(self, content_id: str) -> Tuple[Optional[ContentStatus], Optional[str]]:
2753
+ if self.contents_db is None:
2754
+ raise ValueError("No contents db provided")
2755
+
2756
+ if isinstance(self.contents_db, AsyncBaseDb):
2757
+ content_row = await self.contents_db.get_knowledge_content(content_id)
2758
+ else:
2759
+ content_row = self.contents_db.get_knowledge_content(content_id)
2760
+
2761
+ if content_row is None:
2762
+ return None, "Content not found"
2763
+
2764
+ # Convert string status to enum, defaulting to PROCESSING if unknown
2765
+ status_str = content_row.status
2766
+ try:
2767
+ status = ContentStatus(status_str.lower()) if status_str else ContentStatus.PROCESSING
2768
+ except ValueError:
2769
+ # Handle legacy or unknown statuses
2770
+ if status_str and "failed" in status_str.lower():
2771
+ status = ContentStatus.FAILED
2772
+ elif status_str and "completed" in status_str.lower():
2773
+ status = ContentStatus.COMPLETED
2774
+ else:
2775
+ status = ContentStatus.PROCESSING
2776
+
2777
+ return status, content_row.status_message
2778
+
1520
2779
  def remove_content_by_id(self, content_id: str):
1521
2780
  from agno.vectordb import VectorDb
1522
2781
 
@@ -1535,12 +2794,36 @@ class Knowledge:
1535
2794
  if self.contents_db is not None:
1536
2795
  self.contents_db.delete_knowledge_content(content_id)
1537
2796
 
2797
+ async def aremove_content_by_id(self, content_id: str):
2798
+ if self.vector_db is not None:
2799
+ if self.vector_db.__class__.__name__ == "LightRag":
2800
+ # For LightRAG, get the content first to find the external_id
2801
+ content = await self.aget_content_by_id(content_id)
2802
+ if content and content.external_id:
2803
+ self.vector_db.delete_by_external_id(content.external_id) # type: ignore
2804
+ else:
2805
+ log_warning(f"No external_id found for content {content_id}, cannot delete from LightRAG")
2806
+ else:
2807
+ self.vector_db.delete_by_content_id(content_id)
2808
+
2809
+ if self.contents_db is not None:
2810
+ if isinstance(self.contents_db, AsyncBaseDb):
2811
+ await self.contents_db.delete_knowledge_content(content_id)
2812
+ else:
2813
+ self.contents_db.delete_knowledge_content(content_id)
2814
+
1538
2815
  def remove_all_content(self):
1539
2816
  contents, _ = self.get_content()
1540
2817
  for content in contents:
1541
2818
  if content.id is not None:
1542
2819
  self.remove_content_by_id(content.id)
1543
2820
 
2821
+ async def aremove_all_content(self):
2822
+ contents, _ = await self.aget_content()
2823
+ for content in contents:
2824
+ if content.id is not None:
2825
+ await self.aremove_content_by_id(content.id)
2826
+
1544
2827
  # --- Reader Factory Integration ---
1545
2828
 
1546
2829
  def construct_readers(self):
@@ -1563,6 +2846,24 @@ class Knowledge:
1563
2846
  """Get all currently loaded readers (only returns readers that have been used)."""
1564
2847
  if self.readers is None:
1565
2848
  self.readers = {}
2849
+ elif not isinstance(self.readers, dict):
2850
+ # Defensive check: if readers is not a dict (e.g., was set to a list), convert it
2851
+ if isinstance(self.readers, list):
2852
+ readers_dict: Dict[str, Reader] = {}
2853
+ for reader in self.readers:
2854
+ if isinstance(reader, Reader):
2855
+ reader_key = self._generate_reader_key(reader)
2856
+ # Handle potential duplicate keys by appending index if needed
2857
+ original_key = reader_key
2858
+ counter = 1
2859
+ while reader_key in readers_dict:
2860
+ reader_key = f"{original_key}_{counter}"
2861
+ counter += 1
2862
+ readers_dict[reader_key] = reader
2863
+ self.readers = readers_dict
2864
+ else:
2865
+ # For any other unexpected type, reset to empty dict
2866
+ self.readers = {}
1566
2867
 
1567
2868
  return self.readers
1568
2869
 
@@ -1578,12 +2879,6 @@ class Knowledge:
1578
2879
  log_info(f"Selecting reader for extension: {extension}")
1579
2880
  return ReaderFactory.get_reader_for_extension(extension)
1580
2881
 
1581
- def get_filters(self) -> List[str]:
1582
- return [
1583
- "filter_tag_1",
1584
- "filter_tag2",
1585
- ]
1586
-
1587
2882
  # --- Convenience Properties for Backward Compatibility ---
1588
2883
 
1589
2884
  def _is_text_mime_type(self, mime_type: str) -> bool:
@@ -1675,6 +2970,11 @@ class Knowledge:
1675
2970
  """Docx reader - lazy loaded via factory."""
1676
2971
  return self._get_reader("docx")
1677
2972
 
2973
+ @property
2974
+ def pptx_reader(self) -> Optional[Reader]:
2975
+ """PPTX reader - lazy loaded via factory."""
2976
+ return self._get_reader("pptx")
2977
+
1678
2978
  @property
1679
2979
  def json_reader(self) -> Optional[Reader]:
1680
2980
  """JSON reader - lazy loaded via factory."""