agno 0.1.2__py3-none-any.whl → 2.3.13__py3-none-any.whl

This diff shows the changes between two publicly available versions of a package, each of which has been released to one of the supported registries. The information in this diff is provided for informational purposes only and reflects the package versions as they appear in their respective public registries.
Files changed (723)
  1. agno/__init__.py +8 -0
  2. agno/agent/__init__.py +44 -5
  3. agno/agent/agent.py +10531 -2975
  4. agno/api/agent.py +14 -53
  5. agno/api/api.py +7 -46
  6. agno/api/evals.py +22 -0
  7. agno/api/os.py +17 -0
  8. agno/api/routes.py +6 -25
  9. agno/api/schemas/__init__.py +9 -0
  10. agno/api/schemas/agent.py +6 -9
  11. agno/api/schemas/evals.py +16 -0
  12. agno/api/schemas/os.py +14 -0
  13. agno/api/schemas/team.py +10 -10
  14. agno/api/schemas/utils.py +21 -0
  15. agno/api/schemas/workflows.py +16 -0
  16. agno/api/settings.py +53 -0
  17. agno/api/team.py +22 -26
  18. agno/api/workflow.py +28 -0
  19. agno/cloud/aws/base.py +214 -0
  20. agno/cloud/aws/s3/__init__.py +2 -0
  21. agno/cloud/aws/s3/api_client.py +43 -0
  22. agno/cloud/aws/s3/bucket.py +195 -0
  23. agno/cloud/aws/s3/object.py +57 -0
  24. agno/compression/__init__.py +3 -0
  25. agno/compression/manager.py +247 -0
  26. agno/culture/__init__.py +3 -0
  27. agno/culture/manager.py +956 -0
  28. agno/db/__init__.py +24 -0
  29. agno/db/async_postgres/__init__.py +3 -0
  30. agno/db/base.py +946 -0
  31. agno/db/dynamo/__init__.py +3 -0
  32. agno/db/dynamo/dynamo.py +2781 -0
  33. agno/db/dynamo/schemas.py +442 -0
  34. agno/db/dynamo/utils.py +743 -0
  35. agno/db/firestore/__init__.py +3 -0
  36. agno/db/firestore/firestore.py +2379 -0
  37. agno/db/firestore/schemas.py +181 -0
  38. agno/db/firestore/utils.py +376 -0
  39. agno/db/gcs_json/__init__.py +3 -0
  40. agno/db/gcs_json/gcs_json_db.py +1791 -0
  41. agno/db/gcs_json/utils.py +228 -0
  42. agno/db/in_memory/__init__.py +3 -0
  43. agno/db/in_memory/in_memory_db.py +1312 -0
  44. agno/db/in_memory/utils.py +230 -0
  45. agno/db/json/__init__.py +3 -0
  46. agno/db/json/json_db.py +1777 -0
  47. agno/db/json/utils.py +230 -0
  48. agno/db/migrations/manager.py +199 -0
  49. agno/db/migrations/v1_to_v2.py +635 -0
  50. agno/db/migrations/versions/v2_3_0.py +938 -0
  51. agno/db/mongo/__init__.py +17 -0
  52. agno/db/mongo/async_mongo.py +2760 -0
  53. agno/db/mongo/mongo.py +2597 -0
  54. agno/db/mongo/schemas.py +119 -0
  55. agno/db/mongo/utils.py +276 -0
  56. agno/db/mysql/__init__.py +4 -0
  57. agno/db/mysql/async_mysql.py +2912 -0
  58. agno/db/mysql/mysql.py +2923 -0
  59. agno/db/mysql/schemas.py +186 -0
  60. agno/db/mysql/utils.py +488 -0
  61. agno/db/postgres/__init__.py +4 -0
  62. agno/db/postgres/async_postgres.py +2579 -0
  63. agno/db/postgres/postgres.py +2870 -0
  64. agno/db/postgres/schemas.py +187 -0
  65. agno/db/postgres/utils.py +442 -0
  66. agno/db/redis/__init__.py +3 -0
  67. agno/db/redis/redis.py +2141 -0
  68. agno/db/redis/schemas.py +159 -0
  69. agno/db/redis/utils.py +346 -0
  70. agno/db/schemas/__init__.py +4 -0
  71. agno/db/schemas/culture.py +120 -0
  72. agno/db/schemas/evals.py +34 -0
  73. agno/db/schemas/knowledge.py +40 -0
  74. agno/db/schemas/memory.py +61 -0
  75. agno/db/singlestore/__init__.py +3 -0
  76. agno/db/singlestore/schemas.py +179 -0
  77. agno/db/singlestore/singlestore.py +2877 -0
  78. agno/db/singlestore/utils.py +384 -0
  79. agno/db/sqlite/__init__.py +4 -0
  80. agno/db/sqlite/async_sqlite.py +2911 -0
  81. agno/db/sqlite/schemas.py +181 -0
  82. agno/db/sqlite/sqlite.py +2908 -0
  83. agno/db/sqlite/utils.py +429 -0
  84. agno/db/surrealdb/__init__.py +3 -0
  85. agno/db/surrealdb/metrics.py +292 -0
  86. agno/db/surrealdb/models.py +334 -0
  87. agno/db/surrealdb/queries.py +71 -0
  88. agno/db/surrealdb/surrealdb.py +1908 -0
  89. agno/db/surrealdb/utils.py +147 -0
  90. agno/db/utils.py +118 -0
  91. agno/eval/__init__.py +24 -0
  92. agno/eval/accuracy.py +666 -276
  93. agno/eval/agent_as_judge.py +861 -0
  94. agno/eval/base.py +29 -0
  95. agno/eval/performance.py +779 -0
  96. agno/eval/reliability.py +241 -62
  97. agno/eval/utils.py +120 -0
  98. agno/exceptions.py +143 -1
  99. agno/filters.py +354 -0
  100. agno/guardrails/__init__.py +6 -0
  101. agno/guardrails/base.py +19 -0
  102. agno/guardrails/openai.py +144 -0
  103. agno/guardrails/pii.py +94 -0
  104. agno/guardrails/prompt_injection.py +52 -0
  105. agno/hooks/__init__.py +3 -0
  106. agno/hooks/decorator.py +164 -0
  107. agno/integrations/discord/__init__.py +3 -0
  108. agno/integrations/discord/client.py +203 -0
  109. agno/knowledge/__init__.py +5 -1
  110. agno/{document → knowledge}/chunking/agentic.py +22 -14
  111. agno/{document → knowledge}/chunking/document.py +2 -2
  112. agno/{document → knowledge}/chunking/fixed.py +7 -6
  113. agno/knowledge/chunking/markdown.py +151 -0
  114. agno/{document → knowledge}/chunking/recursive.py +15 -3
  115. agno/knowledge/chunking/row.py +39 -0
  116. agno/knowledge/chunking/semantic.py +91 -0
  117. agno/knowledge/chunking/strategy.py +165 -0
  118. agno/knowledge/content.py +74 -0
  119. agno/knowledge/document/__init__.py +5 -0
  120. agno/{document → knowledge/document}/base.py +12 -2
  121. agno/knowledge/embedder/__init__.py +5 -0
  122. agno/knowledge/embedder/aws_bedrock.py +343 -0
  123. agno/knowledge/embedder/azure_openai.py +210 -0
  124. agno/{embedder → knowledge/embedder}/base.py +8 -0
  125. agno/knowledge/embedder/cohere.py +323 -0
  126. agno/knowledge/embedder/fastembed.py +62 -0
  127. agno/{embedder → knowledge/embedder}/fireworks.py +1 -1
  128. agno/knowledge/embedder/google.py +258 -0
  129. agno/knowledge/embedder/huggingface.py +94 -0
  130. agno/knowledge/embedder/jina.py +182 -0
  131. agno/knowledge/embedder/langdb.py +22 -0
  132. agno/knowledge/embedder/mistral.py +206 -0
  133. agno/knowledge/embedder/nebius.py +13 -0
  134. agno/knowledge/embedder/ollama.py +154 -0
  135. agno/knowledge/embedder/openai.py +195 -0
  136. agno/knowledge/embedder/sentence_transformer.py +63 -0
  137. agno/{embedder → knowledge/embedder}/together.py +1 -1
  138. agno/knowledge/embedder/vllm.py +262 -0
  139. agno/knowledge/embedder/voyageai.py +165 -0
  140. agno/knowledge/knowledge.py +3006 -0
  141. agno/knowledge/reader/__init__.py +7 -0
  142. agno/knowledge/reader/arxiv_reader.py +81 -0
  143. agno/knowledge/reader/base.py +95 -0
  144. agno/knowledge/reader/csv_reader.py +164 -0
  145. agno/knowledge/reader/docx_reader.py +82 -0
  146. agno/knowledge/reader/field_labeled_csv_reader.py +290 -0
  147. agno/knowledge/reader/firecrawl_reader.py +201 -0
  148. agno/knowledge/reader/json_reader.py +88 -0
  149. agno/knowledge/reader/markdown_reader.py +137 -0
  150. agno/knowledge/reader/pdf_reader.py +431 -0
  151. agno/knowledge/reader/pptx_reader.py +101 -0
  152. agno/knowledge/reader/reader_factory.py +313 -0
  153. agno/knowledge/reader/s3_reader.py +89 -0
  154. agno/knowledge/reader/tavily_reader.py +193 -0
  155. agno/knowledge/reader/text_reader.py +127 -0
  156. agno/knowledge/reader/web_search_reader.py +325 -0
  157. agno/knowledge/reader/website_reader.py +455 -0
  158. agno/knowledge/reader/wikipedia_reader.py +91 -0
  159. agno/knowledge/reader/youtube_reader.py +78 -0
  160. agno/knowledge/remote_content/remote_content.py +88 -0
  161. agno/knowledge/reranker/__init__.py +3 -0
  162. agno/{reranker → knowledge/reranker}/base.py +1 -1
  163. agno/{reranker → knowledge/reranker}/cohere.py +2 -2
  164. agno/knowledge/reranker/infinity.py +195 -0
  165. agno/knowledge/reranker/sentence_transformer.py +54 -0
  166. agno/knowledge/types.py +39 -0
  167. agno/knowledge/utils.py +234 -0
  168. agno/media.py +439 -95
  169. agno/memory/__init__.py +16 -3
  170. agno/memory/manager.py +1474 -123
  171. agno/memory/strategies/__init__.py +15 -0
  172. agno/memory/strategies/base.py +66 -0
  173. agno/memory/strategies/summarize.py +196 -0
  174. agno/memory/strategies/types.py +37 -0
  175. agno/models/aimlapi/__init__.py +5 -0
  176. agno/models/aimlapi/aimlapi.py +62 -0
  177. agno/models/anthropic/__init__.py +4 -0
  178. agno/models/anthropic/claude.py +960 -496
  179. agno/models/aws/__init__.py +15 -0
  180. agno/models/aws/bedrock.py +686 -451
  181. agno/models/aws/claude.py +190 -183
  182. agno/models/azure/__init__.py +18 -1
  183. agno/models/azure/ai_foundry.py +489 -0
  184. agno/models/azure/openai_chat.py +89 -40
  185. agno/models/base.py +2477 -550
  186. agno/models/cerebras/__init__.py +12 -0
  187. agno/models/cerebras/cerebras.py +565 -0
  188. agno/models/cerebras/cerebras_openai.py +131 -0
  189. agno/models/cohere/__init__.py +4 -0
  190. agno/models/cohere/chat.py +306 -492
  191. agno/models/cometapi/__init__.py +5 -0
  192. agno/models/cometapi/cometapi.py +74 -0
  193. agno/models/dashscope/__init__.py +5 -0
  194. agno/models/dashscope/dashscope.py +90 -0
  195. agno/models/deepinfra/__init__.py +5 -0
  196. agno/models/deepinfra/deepinfra.py +45 -0
  197. agno/models/deepseek/__init__.py +4 -0
  198. agno/models/deepseek/deepseek.py +110 -9
  199. agno/models/fireworks/__init__.py +4 -0
  200. agno/models/fireworks/fireworks.py +19 -22
  201. agno/models/google/__init__.py +3 -7
  202. agno/models/google/gemini.py +1717 -662
  203. agno/models/google/utils.py +22 -0
  204. agno/models/groq/__init__.py +4 -0
  205. agno/models/groq/groq.py +391 -666
  206. agno/models/huggingface/__init__.py +4 -0
  207. agno/models/huggingface/huggingface.py +266 -538
  208. agno/models/ibm/__init__.py +5 -0
  209. agno/models/ibm/watsonx.py +432 -0
  210. agno/models/internlm/__init__.py +3 -0
  211. agno/models/internlm/internlm.py +20 -3
  212. agno/models/langdb/__init__.py +1 -0
  213. agno/models/langdb/langdb.py +60 -0
  214. agno/models/litellm/__init__.py +14 -0
  215. agno/models/litellm/chat.py +503 -0
  216. agno/models/litellm/litellm_openai.py +42 -0
  217. agno/models/llama_cpp/__init__.py +5 -0
  218. agno/models/llama_cpp/llama_cpp.py +22 -0
  219. agno/models/lmstudio/__init__.py +5 -0
  220. agno/models/lmstudio/lmstudio.py +25 -0
  221. agno/models/message.py +361 -39
  222. agno/models/meta/__init__.py +12 -0
  223. agno/models/meta/llama.py +502 -0
  224. agno/models/meta/llama_openai.py +79 -0
  225. agno/models/metrics.py +120 -0
  226. agno/models/mistral/__init__.py +4 -0
  227. agno/models/mistral/mistral.py +293 -393
  228. agno/models/nebius/__init__.py +3 -0
  229. agno/models/nebius/nebius.py +53 -0
  230. agno/models/nexus/__init__.py +3 -0
  231. agno/models/nexus/nexus.py +22 -0
  232. agno/models/nvidia/__init__.py +4 -0
  233. agno/models/nvidia/nvidia.py +22 -3
  234. agno/models/ollama/__init__.py +4 -2
  235. agno/models/ollama/chat.py +257 -492
  236. agno/models/openai/__init__.py +7 -0
  237. agno/models/openai/chat.py +725 -770
  238. agno/models/openai/like.py +16 -2
  239. agno/models/openai/responses.py +1121 -0
  240. agno/models/openrouter/__init__.py +4 -0
  241. agno/models/openrouter/openrouter.py +62 -5
  242. agno/models/perplexity/__init__.py +5 -0
  243. agno/models/perplexity/perplexity.py +203 -0
  244. agno/models/portkey/__init__.py +3 -0
  245. agno/models/portkey/portkey.py +82 -0
  246. agno/models/requesty/__init__.py +5 -0
  247. agno/models/requesty/requesty.py +69 -0
  248. agno/models/response.py +177 -7
  249. agno/models/sambanova/__init__.py +4 -0
  250. agno/models/sambanova/sambanova.py +23 -4
  251. agno/models/siliconflow/__init__.py +5 -0
  252. agno/models/siliconflow/siliconflow.py +42 -0
  253. agno/models/together/__init__.py +4 -0
  254. agno/models/together/together.py +21 -164
  255. agno/models/utils.py +266 -0
  256. agno/models/vercel/__init__.py +3 -0
  257. agno/models/vercel/v0.py +43 -0
  258. agno/models/vertexai/__init__.py +0 -1
  259. agno/models/vertexai/claude.py +190 -0
  260. agno/models/vllm/__init__.py +3 -0
  261. agno/models/vllm/vllm.py +83 -0
  262. agno/models/xai/__init__.py +2 -0
  263. agno/models/xai/xai.py +111 -7
  264. agno/os/__init__.py +3 -0
  265. agno/os/app.py +1027 -0
  266. agno/os/auth.py +244 -0
  267. agno/os/config.py +126 -0
  268. agno/os/interfaces/__init__.py +1 -0
  269. agno/os/interfaces/a2a/__init__.py +3 -0
  270. agno/os/interfaces/a2a/a2a.py +42 -0
  271. agno/os/interfaces/a2a/router.py +249 -0
  272. agno/os/interfaces/a2a/utils.py +924 -0
  273. agno/os/interfaces/agui/__init__.py +3 -0
  274. agno/os/interfaces/agui/agui.py +47 -0
  275. agno/os/interfaces/agui/router.py +147 -0
  276. agno/os/interfaces/agui/utils.py +574 -0
  277. agno/os/interfaces/base.py +25 -0
  278. agno/os/interfaces/slack/__init__.py +3 -0
  279. agno/os/interfaces/slack/router.py +148 -0
  280. agno/os/interfaces/slack/security.py +30 -0
  281. agno/os/interfaces/slack/slack.py +47 -0
  282. agno/os/interfaces/whatsapp/__init__.py +3 -0
  283. agno/os/interfaces/whatsapp/router.py +210 -0
  284. agno/os/interfaces/whatsapp/security.py +55 -0
  285. agno/os/interfaces/whatsapp/whatsapp.py +36 -0
  286. agno/os/mcp.py +293 -0
  287. agno/os/middleware/__init__.py +9 -0
  288. agno/os/middleware/jwt.py +797 -0
  289. agno/os/router.py +258 -0
  290. agno/os/routers/__init__.py +3 -0
  291. agno/os/routers/agents/__init__.py +3 -0
  292. agno/os/routers/agents/router.py +599 -0
  293. agno/os/routers/agents/schema.py +261 -0
  294. agno/os/routers/evals/__init__.py +3 -0
  295. agno/os/routers/evals/evals.py +450 -0
  296. agno/os/routers/evals/schemas.py +174 -0
  297. agno/os/routers/evals/utils.py +231 -0
  298. agno/os/routers/health.py +31 -0
  299. agno/os/routers/home.py +52 -0
  300. agno/os/routers/knowledge/__init__.py +3 -0
  301. agno/os/routers/knowledge/knowledge.py +1008 -0
  302. agno/os/routers/knowledge/schemas.py +178 -0
  303. agno/os/routers/memory/__init__.py +3 -0
  304. agno/os/routers/memory/memory.py +661 -0
  305. agno/os/routers/memory/schemas.py +88 -0
  306. agno/os/routers/metrics/__init__.py +3 -0
  307. agno/os/routers/metrics/metrics.py +190 -0
  308. agno/os/routers/metrics/schemas.py +47 -0
  309. agno/os/routers/session/__init__.py +3 -0
  310. agno/os/routers/session/session.py +997 -0
  311. agno/os/routers/teams/__init__.py +3 -0
  312. agno/os/routers/teams/router.py +512 -0
  313. agno/os/routers/teams/schema.py +257 -0
  314. agno/os/routers/traces/__init__.py +3 -0
  315. agno/os/routers/traces/schemas.py +414 -0
  316. agno/os/routers/traces/traces.py +499 -0
  317. agno/os/routers/workflows/__init__.py +3 -0
  318. agno/os/routers/workflows/router.py +624 -0
  319. agno/os/routers/workflows/schema.py +75 -0
  320. agno/os/schema.py +534 -0
  321. agno/os/scopes.py +469 -0
  322. agno/{playground → os}/settings.py +7 -15
  323. agno/os/utils.py +973 -0
  324. agno/reasoning/anthropic.py +80 -0
  325. agno/reasoning/azure_ai_foundry.py +67 -0
  326. agno/reasoning/deepseek.py +63 -0
  327. agno/reasoning/default.py +97 -0
  328. agno/reasoning/gemini.py +73 -0
  329. agno/reasoning/groq.py +71 -0
  330. agno/reasoning/helpers.py +24 -1
  331. agno/reasoning/ollama.py +67 -0
  332. agno/reasoning/openai.py +86 -0
  333. agno/reasoning/step.py +2 -1
  334. agno/reasoning/vertexai.py +76 -0
  335. agno/run/__init__.py +6 -0
  336. agno/run/agent.py +822 -0
  337. agno/run/base.py +247 -0
  338. agno/run/cancel.py +81 -0
  339. agno/run/requirement.py +181 -0
  340. agno/run/team.py +767 -0
  341. agno/run/workflow.py +708 -0
  342. agno/session/__init__.py +10 -0
  343. agno/session/agent.py +260 -0
  344. agno/session/summary.py +265 -0
  345. agno/session/team.py +342 -0
  346. agno/session/workflow.py +501 -0
  347. agno/table.py +10 -0
  348. agno/team/__init__.py +37 -0
  349. agno/team/team.py +9536 -0
  350. agno/tools/__init__.py +7 -0
  351. agno/tools/agentql.py +120 -0
  352. agno/tools/airflow.py +22 -12
  353. agno/tools/api.py +122 -0
  354. agno/tools/apify.py +276 -83
  355. agno/tools/{arxiv_toolkit.py → arxiv.py} +20 -12
  356. agno/tools/aws_lambda.py +28 -7
  357. agno/tools/aws_ses.py +66 -0
  358. agno/tools/baidusearch.py +11 -4
  359. agno/tools/bitbucket.py +292 -0
  360. agno/tools/brandfetch.py +213 -0
  361. agno/tools/bravesearch.py +106 -0
  362. agno/tools/brightdata.py +367 -0
  363. agno/tools/browserbase.py +209 -0
  364. agno/tools/calcom.py +32 -23
  365. agno/tools/calculator.py +24 -37
  366. agno/tools/cartesia.py +187 -0
  367. agno/tools/{clickup_tool.py → clickup.py} +17 -28
  368. agno/tools/confluence.py +91 -26
  369. agno/tools/crawl4ai.py +139 -43
  370. agno/tools/csv_toolkit.py +28 -22
  371. agno/tools/dalle.py +36 -22
  372. agno/tools/daytona.py +475 -0
  373. agno/tools/decorator.py +169 -14
  374. agno/tools/desi_vocal.py +23 -11
  375. agno/tools/discord.py +32 -29
  376. agno/tools/docker.py +716 -0
  377. agno/tools/duckdb.py +76 -81
  378. agno/tools/duckduckgo.py +43 -40
  379. agno/tools/e2b.py +703 -0
  380. agno/tools/eleven_labs.py +65 -54
  381. agno/tools/email.py +13 -5
  382. agno/tools/evm.py +129 -0
  383. agno/tools/exa.py +324 -42
  384. agno/tools/fal.py +39 -35
  385. agno/tools/file.py +196 -30
  386. agno/tools/file_generation.py +356 -0
  387. agno/tools/financial_datasets.py +288 -0
  388. agno/tools/firecrawl.py +108 -33
  389. agno/tools/function.py +960 -122
  390. agno/tools/giphy.py +34 -12
  391. agno/tools/github.py +1294 -97
  392. agno/tools/gmail.py +922 -0
  393. agno/tools/google_bigquery.py +117 -0
  394. agno/tools/google_drive.py +271 -0
  395. agno/tools/google_maps.py +253 -0
  396. agno/tools/googlecalendar.py +607 -107
  397. agno/tools/googlesheets.py +377 -0
  398. agno/tools/hackernews.py +20 -12
  399. agno/tools/jina.py +24 -14
  400. agno/tools/jira.py +48 -19
  401. agno/tools/knowledge.py +218 -0
  402. agno/tools/linear.py +82 -43
  403. agno/tools/linkup.py +58 -0
  404. agno/tools/local_file_system.py +15 -7
  405. agno/tools/lumalab.py +41 -26
  406. agno/tools/mcp/__init__.py +10 -0
  407. agno/tools/mcp/mcp.py +331 -0
  408. agno/tools/mcp/multi_mcp.py +347 -0
  409. agno/tools/mcp/params.py +24 -0
  410. agno/tools/mcp_toolbox.py +284 -0
  411. agno/tools/mem0.py +193 -0
  412. agno/tools/memory.py +419 -0
  413. agno/tools/mlx_transcribe.py +11 -9
  414. agno/tools/models/azure_openai.py +190 -0
  415. agno/tools/models/gemini.py +203 -0
  416. agno/tools/models/groq.py +158 -0
  417. agno/tools/models/morph.py +186 -0
  418. agno/tools/models/nebius.py +124 -0
  419. agno/tools/models_labs.py +163 -82
  420. agno/tools/moviepy_video.py +18 -13
  421. agno/tools/nano_banana.py +151 -0
  422. agno/tools/neo4j.py +134 -0
  423. agno/tools/newspaper.py +15 -4
  424. agno/tools/newspaper4k.py +19 -6
  425. agno/tools/notion.py +204 -0
  426. agno/tools/openai.py +181 -17
  427. agno/tools/openbb.py +27 -20
  428. agno/tools/opencv.py +321 -0
  429. agno/tools/openweather.py +233 -0
  430. agno/tools/oxylabs.py +385 -0
  431. agno/tools/pandas.py +25 -15
  432. agno/tools/parallel.py +314 -0
  433. agno/tools/postgres.py +238 -185
  434. agno/tools/pubmed.py +125 -13
  435. agno/tools/python.py +48 -35
  436. agno/tools/reasoning.py +283 -0
  437. agno/tools/reddit.py +207 -29
  438. agno/tools/redshift.py +406 -0
  439. agno/tools/replicate.py +69 -26
  440. agno/tools/resend.py +11 -6
  441. agno/tools/scrapegraph.py +179 -19
  442. agno/tools/searxng.py +23 -31
  443. agno/tools/serpapi.py +15 -10
  444. agno/tools/serper.py +255 -0
  445. agno/tools/shell.py +23 -12
  446. agno/tools/shopify.py +1519 -0
  447. agno/tools/slack.py +56 -14
  448. agno/tools/sleep.py +8 -6
  449. agno/tools/spider.py +35 -11
  450. agno/tools/spotify.py +919 -0
  451. agno/tools/sql.py +34 -19
  452. agno/tools/tavily.py +158 -8
  453. agno/tools/telegram.py +18 -8
  454. agno/tools/todoist.py +218 -0
  455. agno/tools/toolkit.py +134 -9
  456. agno/tools/trafilatura.py +388 -0
  457. agno/tools/trello.py +25 -28
  458. agno/tools/twilio.py +18 -9
  459. agno/tools/user_control_flow.py +78 -0
  460. agno/tools/valyu.py +228 -0
  461. agno/tools/visualization.py +467 -0
  462. agno/tools/webbrowser.py +28 -0
  463. agno/tools/webex.py +76 -0
  464. agno/tools/website.py +23 -19
  465. agno/tools/webtools.py +45 -0
  466. agno/tools/whatsapp.py +286 -0
  467. agno/tools/wikipedia.py +28 -19
  468. agno/tools/workflow.py +285 -0
  469. agno/tools/{twitter.py → x.py} +142 -46
  470. agno/tools/yfinance.py +41 -39
  471. agno/tools/youtube.py +34 -17
  472. agno/tools/zendesk.py +15 -5
  473. agno/tools/zep.py +454 -0
  474. agno/tools/zoom.py +86 -37
  475. agno/tracing/__init__.py +12 -0
  476. agno/tracing/exporter.py +157 -0
  477. agno/tracing/schemas.py +276 -0
  478. agno/tracing/setup.py +111 -0
  479. agno/utils/agent.py +938 -0
  480. agno/utils/audio.py +37 -1
  481. agno/utils/certs.py +27 -0
  482. agno/utils/code_execution.py +11 -0
  483. agno/utils/common.py +103 -20
  484. agno/utils/cryptography.py +22 -0
  485. agno/utils/dttm.py +33 -0
  486. agno/utils/events.py +700 -0
  487. agno/utils/functions.py +107 -37
  488. agno/utils/gemini.py +426 -0
  489. agno/utils/hooks.py +171 -0
  490. agno/utils/http.py +185 -0
  491. agno/utils/json_schema.py +159 -37
  492. agno/utils/knowledge.py +36 -0
  493. agno/utils/location.py +19 -0
  494. agno/utils/log.py +221 -8
  495. agno/utils/mcp.py +214 -0
  496. agno/utils/media.py +335 -14
  497. agno/utils/merge_dict.py +22 -1
  498. agno/utils/message.py +77 -2
  499. agno/utils/models/ai_foundry.py +50 -0
  500. agno/utils/models/claude.py +373 -0
  501. agno/utils/models/cohere.py +94 -0
  502. agno/utils/models/llama.py +85 -0
  503. agno/utils/models/mistral.py +100 -0
  504. agno/utils/models/openai_responses.py +140 -0
  505. agno/utils/models/schema_utils.py +153 -0
  506. agno/utils/models/watsonx.py +41 -0
  507. agno/utils/openai.py +257 -0
  508. agno/utils/pickle.py +1 -1
  509. agno/utils/pprint.py +124 -8
  510. agno/utils/print_response/agent.py +930 -0
  511. agno/utils/print_response/team.py +1914 -0
  512. agno/utils/print_response/workflow.py +1668 -0
  513. agno/utils/prompts.py +111 -0
  514. agno/utils/reasoning.py +108 -0
  515. agno/utils/response.py +163 -0
  516. agno/utils/serialize.py +32 -0
  517. agno/utils/shell.py +4 -4
  518. agno/utils/streamlit.py +487 -0
  519. agno/utils/string.py +204 -51
  520. agno/utils/team.py +139 -0
  521. agno/utils/timer.py +9 -2
  522. agno/utils/tokens.py +657 -0
  523. agno/utils/tools.py +19 -1
  524. agno/utils/whatsapp.py +305 -0
  525. agno/utils/yaml_io.py +3 -3
  526. agno/vectordb/__init__.py +2 -0
  527. agno/vectordb/base.py +87 -9
  528. agno/vectordb/cassandra/__init__.py +5 -1
  529. agno/vectordb/cassandra/cassandra.py +383 -27
  530. agno/vectordb/chroma/__init__.py +4 -0
  531. agno/vectordb/chroma/chromadb.py +748 -83
  532. agno/vectordb/clickhouse/__init__.py +7 -1
  533. agno/vectordb/clickhouse/clickhousedb.py +554 -53
  534. agno/vectordb/couchbase/__init__.py +3 -0
  535. agno/vectordb/couchbase/couchbase.py +1446 -0
  536. agno/vectordb/lancedb/__init__.py +5 -0
  537. agno/vectordb/lancedb/lance_db.py +730 -98
  538. agno/vectordb/langchaindb/__init__.py +5 -0
  539. agno/vectordb/langchaindb/langchaindb.py +163 -0
  540. agno/vectordb/lightrag/__init__.py +5 -0
  541. agno/vectordb/lightrag/lightrag.py +388 -0
  542. agno/vectordb/llamaindex/__init__.py +3 -0
  543. agno/vectordb/llamaindex/llamaindexdb.py +166 -0
  544. agno/vectordb/milvus/__init__.py +3 -0
  545. agno/vectordb/milvus/milvus.py +966 -78
  546. agno/vectordb/mongodb/__init__.py +9 -1
  547. agno/vectordb/mongodb/mongodb.py +1175 -172
  548. agno/vectordb/pgvector/__init__.py +8 -0
  549. agno/vectordb/pgvector/pgvector.py +599 -115
  550. agno/vectordb/pineconedb/__init__.py +5 -1
  551. agno/vectordb/pineconedb/pineconedb.py +406 -43
  552. agno/vectordb/qdrant/__init__.py +4 -0
  553. agno/vectordb/qdrant/qdrant.py +914 -61
  554. agno/vectordb/redis/__init__.py +9 -0
  555. agno/vectordb/redis/redisdb.py +682 -0
  556. agno/vectordb/singlestore/__init__.py +8 -1
  557. agno/vectordb/singlestore/singlestore.py +771 -0
  558. agno/vectordb/surrealdb/__init__.py +3 -0
  559. agno/vectordb/surrealdb/surrealdb.py +663 -0
  560. agno/vectordb/upstashdb/__init__.py +5 -0
  561. agno/vectordb/upstashdb/upstashdb.py +718 -0
  562. agno/vectordb/weaviate/__init__.py +8 -0
  563. agno/vectordb/weaviate/index.py +15 -0
  564. agno/vectordb/weaviate/weaviate.py +1009 -0
  565. agno/workflow/__init__.py +23 -1
  566. agno/workflow/agent.py +299 -0
  567. agno/workflow/condition.py +759 -0
  568. agno/workflow/loop.py +756 -0
  569. agno/workflow/parallel.py +853 -0
  570. agno/workflow/router.py +723 -0
  571. agno/workflow/step.py +1564 -0
  572. agno/workflow/steps.py +613 -0
  573. agno/workflow/types.py +556 -0
  574. agno/workflow/workflow.py +4327 -514
  575. agno-2.3.13.dist-info/METADATA +639 -0
  576. agno-2.3.13.dist-info/RECORD +613 -0
  577. {agno-0.1.2.dist-info → agno-2.3.13.dist-info}/WHEEL +1 -1
  578. agno-2.3.13.dist-info/licenses/LICENSE +201 -0
  579. agno/api/playground.py +0 -91
  580. agno/api/schemas/playground.py +0 -22
  581. agno/api/schemas/user.py +0 -22
  582. agno/api/schemas/workspace.py +0 -46
  583. agno/api/user.py +0 -160
  584. agno/api/workspace.py +0 -151
  585. agno/cli/auth_server.py +0 -118
  586. agno/cli/config.py +0 -275
  587. agno/cli/console.py +0 -88
  588. agno/cli/credentials.py +0 -23
  589. agno/cli/entrypoint.py +0 -571
  590. agno/cli/operator.py +0 -355
  591. agno/cli/settings.py +0 -85
  592. agno/cli/ws/ws_cli.py +0 -817
  593. agno/constants.py +0 -13
  594. agno/document/__init__.py +0 -1
  595. agno/document/chunking/semantic.py +0 -47
  596. agno/document/chunking/strategy.py +0 -31
  597. agno/document/reader/__init__.py +0 -1
  598. agno/document/reader/arxiv_reader.py +0 -41
  599. agno/document/reader/base.py +0 -22
  600. agno/document/reader/csv_reader.py +0 -84
  601. agno/document/reader/docx_reader.py +0 -46
  602. agno/document/reader/firecrawl_reader.py +0 -99
  603. agno/document/reader/json_reader.py +0 -43
  604. agno/document/reader/pdf_reader.py +0 -219
  605. agno/document/reader/s3/pdf_reader.py +0 -46
  606. agno/document/reader/s3/text_reader.py +0 -51
  607. agno/document/reader/text_reader.py +0 -41
  608. agno/document/reader/website_reader.py +0 -175
  609. agno/document/reader/youtube_reader.py +0 -50
  610. agno/embedder/__init__.py +0 -1
  611. agno/embedder/azure_openai.py +0 -86
  612. agno/embedder/cohere.py +0 -72
  613. agno/embedder/fastembed.py +0 -37
  614. agno/embedder/google.py +0 -73
  615. agno/embedder/huggingface.py +0 -54
  616. agno/embedder/mistral.py +0 -80
  617. agno/embedder/ollama.py +0 -57
  618. agno/embedder/openai.py +0 -74
  619. agno/embedder/sentence_transformer.py +0 -38
  620. agno/embedder/voyageai.py +0 -64
  621. agno/eval/perf.py +0 -201
  622. agno/file/__init__.py +0 -1
  623. agno/file/file.py +0 -16
  624. agno/file/local/csv.py +0 -32
  625. agno/file/local/txt.py +0 -19
  626. agno/infra/app.py +0 -240
  627. agno/infra/base.py +0 -144
  628. agno/infra/context.py +0 -20
  629. agno/infra/db_app.py +0 -52
  630. agno/infra/resource.py +0 -205
  631. agno/infra/resources.py +0 -55
  632. agno/knowledge/agent.py +0 -230
  633. agno/knowledge/arxiv.py +0 -22
  634. agno/knowledge/combined.py +0 -22
  635. agno/knowledge/csv.py +0 -28
  636. agno/knowledge/csv_url.py +0 -19
  637. agno/knowledge/document.py +0 -20
  638. agno/knowledge/docx.py +0 -30
  639. agno/knowledge/json.py +0 -28
  640. agno/knowledge/langchain.py +0 -71
  641. agno/knowledge/llamaindex.py +0 -66
  642. agno/knowledge/pdf.py +0 -28
  643. agno/knowledge/pdf_url.py +0 -26
  644. agno/knowledge/s3/base.py +0 -60
  645. agno/knowledge/s3/pdf.py +0 -21
  646. agno/knowledge/s3/text.py +0 -23
  647. agno/knowledge/text.py +0 -30
  648. agno/knowledge/website.py +0 -88
  649. agno/knowledge/wikipedia.py +0 -31
  650. agno/knowledge/youtube.py +0 -22
  651. agno/memory/agent.py +0 -392
  652. agno/memory/classifier.py +0 -104
  653. agno/memory/db/__init__.py +0 -1
  654. agno/memory/db/base.py +0 -42
  655. agno/memory/db/mongodb.py +0 -189
  656. agno/memory/db/postgres.py +0 -203
  657. agno/memory/db/sqlite.py +0 -193
  658. agno/memory/memory.py +0 -15
  659. agno/memory/row.py +0 -36
  660. agno/memory/summarizer.py +0 -192
  661. agno/memory/summary.py +0 -19
  662. agno/memory/workflow.py +0 -38
  663. agno/models/google/gemini_openai.py +0 -26
  664. agno/models/ollama/hermes.py +0 -221
  665. agno/models/ollama/tools.py +0 -362
  666. agno/models/vertexai/gemini.py +0 -595
  667. agno/playground/__init__.py +0 -3
  668. agno/playground/async_router.py +0 -421
  669. agno/playground/deploy.py +0 -249
  670. agno/playground/operator.py +0 -92
  671. agno/playground/playground.py +0 -91
  672. agno/playground/schemas.py +0 -76
  673. agno/playground/serve.py +0 -55
  674. agno/playground/sync_router.py +0 -405
  675. agno/reasoning/agent.py +0 -68
  676. agno/run/response.py +0 -112
  677. agno/storage/agent/__init__.py +0 -0
  678. agno/storage/agent/base.py +0 -38
  679. agno/storage/agent/dynamodb.py +0 -350
  680. agno/storage/agent/json.py +0 -92
  681. agno/storage/agent/mongodb.py +0 -228
  682. agno/storage/agent/postgres.py +0 -367
  683. agno/storage/agent/session.py +0 -79
  684. agno/storage/agent/singlestore.py +0 -303
  685. agno/storage/agent/sqlite.py +0 -357
  686. agno/storage/agent/yaml.py +0 -93
  687. agno/storage/workflow/__init__.py +0 -0
  688. agno/storage/workflow/base.py +0 -40
  689. agno/storage/workflow/mongodb.py +0 -233
  690. agno/storage/workflow/postgres.py +0 -366
  691. agno/storage/workflow/session.py +0 -60
  692. agno/storage/workflow/sqlite.py +0 -359
  693. agno/tools/googlesearch.py +0 -88
  694. agno/utils/defaults.py +0 -57
  695. agno/utils/filesystem.py +0 -39
  696. agno/utils/git.py +0 -52
  697. agno/utils/json_io.py +0 -30
  698. agno/utils/load_env.py +0 -19
  699. agno/utils/py_io.py +0 -19
  700. agno/utils/pyproject.py +0 -18
  701. agno/utils/resource_filter.py +0 -31
  702. agno/vectordb/singlestore/s2vectordb.py +0 -390
  703. agno/vectordb/singlestore/s2vectordb2.py +0 -355
  704. agno/workspace/__init__.py +0 -0
  705. agno/workspace/config.py +0 -325
  706. agno/workspace/enums.py +0 -6
  707. agno/workspace/helpers.py +0 -48
  708. agno/workspace/operator.py +0 -758
  709. agno/workspace/settings.py +0 -63
  710. agno-0.1.2.dist-info/LICENSE +0 -375
  711. agno-0.1.2.dist-info/METADATA +0 -502
  712. agno-0.1.2.dist-info/RECORD +0 -352
  713. agno-0.1.2.dist-info/entry_points.txt +0 -3
  714. /agno/{cli → db/migrations}/__init__.py +0 -0
  715. /agno/{cli/ws → db/migrations/versions}/__init__.py +0 -0
  716. /agno/{document/chunking/__init__.py → db/schemas/metrics.py} +0 -0
  717. /agno/{document/reader/s3 → integrations}/__init__.py +0 -0
  718. /agno/{file/local → knowledge/chunking}/__init__.py +0 -0
  719. /agno/{infra → knowledge/remote_content}/__init__.py +0 -0
  720. /agno/{knowledge/s3 → tools/models}/__init__.py +0 -0
  721. /agno/{reranker → utils/models}/__init__.py +0 -0
  722. /agno/{storage → utils/print_response}/__init__.py +0 -0
  723. {agno-0.1.2.dist-info → agno-2.3.13.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,3006 @@
1
+ import asyncio
2
+ import hashlib
3
+ import io
4
+ import time
5
+ from dataclasses import dataclass
6
+ from enum import Enum
7
+ from io import BytesIO
8
+ from os.path import basename
9
+ from pathlib import Path
10
+ from typing import Any, Dict, List, Optional, Set, Tuple, Union, cast, overload
11
+
12
+ from httpx import AsyncClient
13
+
14
+ from agno.db.base import AsyncBaseDb, BaseDb
15
+ from agno.db.schemas.knowledge import KnowledgeRow
16
+ from agno.filters import FilterExpr
17
+ from agno.knowledge.content import Content, ContentAuth, ContentStatus, FileData
18
+ from agno.knowledge.document import Document
19
+ from agno.knowledge.reader import Reader, ReaderFactory
20
+ from agno.knowledge.remote_content.remote_content import GCSContent, RemoteContent, S3Content
21
+ from agno.utils.http import async_fetch_with_retry
22
+ from agno.utils.log import log_debug, log_error, log_info, log_warning
23
+ from agno.utils.string import generate_id
24
+
25
+ ContentDict = Dict[str, Union[str, Dict[str, str]]]
26
+
27
+
28
class KnowledgeContentOrigin(Enum):
    """Kind of source a piece of knowledge content is loaded from."""

    # Content read from a local file-system path.
    PATH = "path"
    # Content fetched from a URL.
    URL = "url"
    TOPIC = "topic"
    CONTENT = "content"
33
+
34
+
35
+ @dataclass
36
+ class Knowledge:
37
+ """Knowledge class"""
38
+
39
+ name: Optional[str] = None
40
+ description: Optional[str] = None
41
+ vector_db: Optional[Any] = None
42
+ contents_db: Optional[Union[BaseDb, AsyncBaseDb]] = None
43
+ max_results: int = 10
44
+ readers: Optional[Dict[str, Reader]] = None
45
+
46
def __post_init__(self):
    """Finish dataclass initialisation.

    Creates the vector database when one is configured and it does not exist yet,
    then builds the readers via ``construct_readers``.
    """
    from agno.vectordb import VectorDb

    # cast() does nothing at runtime; it only narrows the attribute for type checkers.
    self.vector_db = cast(VectorDb, self.vector_db)

    store = self.vector_db
    if store:
        if not store.exists():
            store.create()

    self.construct_readers()
54
+
55
+ # --- Add Contents ---
56
@overload
async def add_contents_async(self, contents: List[ContentDict]) -> None: ...

@overload
async def add_contents_async(
    self,
    *,
    paths: Optional[List[str]] = None,
    urls: Optional[List[str]] = None,
    metadata: Optional[Dict[str, str]] = None,
    topics: Optional[List[str]] = None,
    text_contents: Optional[List[str]] = None,
    reader: Optional[Reader] = None,
    include: Optional[List[str]] = None,
    exclude: Optional[List[str]] = None,
    upsert: bool = True,
    skip_if_exists: bool = False,
    remote_content: Optional[RemoteContent] = None,
) -> None: ...

async def add_contents_async(self, *args, **kwargs) -> None:
    """
    Asynchronously add multiple content items to the knowledge base.

    Supports two usage patterns:
    1. Pass a list of content dictionaries (positionally, or as ``contents=``)
    2. Pass keyword arguments with paths, urls, metadata, etc.

    Args:
        contents: List of content dictionaries (when used as first overload)
        paths: Optional list of file paths to load content from
        urls: Optional list of URLs to load content from
        metadata: Optional metadata dictionary to apply to all content
        topics: Optional list of topics to add
        text_contents: Optional list of text content strings to add
        reader: Optional reader to use for processing content
        include: Optional list of file patterns to include
        exclude: Optional list of file patterns to exclude
        upsert: Whether to update existing content if it already exists (default: True)
        skip_if_exists: Whether to skip adding content if it already exists (default: False)
        remote_content: Optional remote content to add

    Raises:
        ValueError: If neither a content list nor any keyword argument is given.
    """
    # The first overload names its parameter `contents`, so accept it by keyword as well.
    contents = args[0] if args else kwargs.get("contents")
    if isinstance(contents, list):
        default_upsert = kwargs.get("upsert", True)
        default_skip = kwargs.get("skip_if_exists", False)
        for item in contents:
            await self.add_content_async(
                name=item.get("name"),
                description=item.get("description"),
                path=item.get("path"),
                url=item.get("url"),
                metadata=item.get("metadata"),
                topics=item.get("topics"),
                text_content=item.get("text_content"),
                reader=item.get("reader"),
                include=item.get("include"),
                exclude=item.get("exclude"),
                # Per-item settings win over the call-level defaults.
                upsert=item.get("upsert", default_upsert),
                skip_if_exists=item.get("skip_if_exists", default_skip),
                remote_content=item.get("remote_content", None),
            )
        return

    if not kwargs:
        raise ValueError("Invalid usage of add_contents.")

    name = kwargs.get("name")
    description = kwargs.get("description")
    metadata = kwargs.get("metadata", {})
    topics = kwargs.get("topics")
    reader = kwargs.get("reader")
    # `or []` also covers an explicit None, which is the default the overload advertises.
    paths = kwargs.get("paths") or []
    urls = kwargs.get("urls") or []
    text_contents = kwargs.get("text_contents") or []
    include = kwargs.get("include")
    exclude = kwargs.get("exclude")
    upsert = kwargs.get("upsert", True)
    skip_if_exists = kwargs.get("skip_if_exists", False)
    remote_content = kwargs.get("remote_content")

    # Settings forwarded unchanged to every path/url/text/topics item.
    shared = {
        "description": description,
        "metadata": metadata,
        "include": include,
        "exclude": exclude,
        "upsert": upsert,
        "skip_if_exists": skip_if_exists,
        "reader": reader,
    }

    for path in paths:
        await self.add_content_async(name=name, path=path, **shared)

    for url in urls:
        await self.add_content_async(name=name, url=url, **shared)

    for index, text_content in enumerate(text_contents):
        content_name = f"{name}_{index}" if name else f"text_content_{index}"
        log_debug(f"Adding text content: {content_name}")
        await self.add_content_async(name=content_name, text_content=text_content, **shared)

    if topics:
        await self.add_content_async(name=name, topics=topics, **shared)

    if remote_content:
        # include/exclude are not forwarded for remote content.
        await self.add_content_async(
            name=name,
            metadata=metadata,
            description=description,
            remote_content=remote_content,
            upsert=upsert,
            skip_if_exists=skip_if_exists,
            reader=reader,
        )
176
+
177
@overload
def add_contents(self, contents: List[ContentDict]) -> None: ...

@overload
def add_contents(
    self,
    *,
    paths: Optional[List[str]] = None,
    urls: Optional[List[str]] = None,
    metadata: Optional[Dict[str, str]] = None,
    topics: Optional[List[str]] = None,
    text_contents: Optional[List[str]] = None,
    reader: Optional[Reader] = None,
    include: Optional[List[str]] = None,
    exclude: Optional[List[str]] = None,
    upsert: bool = True,
    skip_if_exists: bool = False,
    remote_content: Optional[RemoteContent] = None,
) -> None: ...

def add_contents(self, *args, **kwargs) -> None:
    """
    Synchronously add multiple content items to the knowledge base.

    Supports two usage patterns:
    1. Pass a list of content dictionaries (positionally, or as ``contents=``)
    2. Pass keyword arguments with paths, urls, metadata, etc.

    Args:
        contents: List of content dictionaries (when used as first overload)
        paths: Optional list of file paths to load content from
        urls: Optional list of URLs to load content from
        metadata: Optional metadata dictionary to apply to all content
        topics: Optional list of topics to add
        text_contents: Optional list of text content strings to add
        reader: Optional reader to use for processing content
        include: Optional list of file patterns to include
        exclude: Optional list of file patterns to exclude
        upsert: Whether to update existing content if it already exists (only used when skip_if_exists=False)
        skip_if_exists: Whether to skip adding content if it already exists (default: False)
        remote_content: Optional remote content (S3, GCS, etc.) to add

    Raises:
        ValueError: If neither a content list nor any keyword argument is given.
    """
    # The first overload names its parameter `contents`, so accept it by keyword as well.
    contents = args[0] if args else kwargs.get("contents")
    if isinstance(contents, list):
        default_upsert = kwargs.get("upsert", True)
        default_skip = kwargs.get("skip_if_exists", False)
        for item in contents:
            self.add_content(
                name=item.get("name"),
                description=item.get("description"),
                path=item.get("path"),
                url=item.get("url"),
                metadata=item.get("metadata"),
                topics=item.get("topics"),
                text_content=item.get("text_content"),
                reader=item.get("reader"),
                include=item.get("include"),
                exclude=item.get("exclude"),
                # Per-item settings win over the call-level defaults.
                upsert=item.get("upsert", default_upsert),
                skip_if_exists=item.get("skip_if_exists", default_skip),
                remote_content=item.get("remote_content", None),
            )
        return

    if not kwargs:
        raise ValueError("Invalid usage of add_contents.")

    name = kwargs.get("name")
    description = kwargs.get("description")
    metadata = kwargs.get("metadata", {})
    topics = kwargs.get("topics")
    reader = kwargs.get("reader")
    # `or []` also covers an explicit None, which is the default the overload advertises.
    paths = kwargs.get("paths") or []
    urls = kwargs.get("urls") or []
    text_contents = kwargs.get("text_contents") or []
    include = kwargs.get("include")
    exclude = kwargs.get("exclude")
    upsert = kwargs.get("upsert", True)
    skip_if_exists = kwargs.get("skip_if_exists", False)
    remote_content = kwargs.get("remote_content")

    # Settings forwarded unchanged to every path/url/text/topics item.
    shared = {
        "description": description,
        "metadata": metadata,
        "include": include,
        "exclude": exclude,
        "upsert": upsert,
        "skip_if_exists": skip_if_exists,
        "reader": reader,
    }

    for path in paths:
        self.add_content(name=name, path=path, **shared)

    for url in urls:
        self.add_content(name=name, url=url, **shared)

    for index, text_content in enumerate(text_contents):
        content_name = f"{name}_{index}" if name else f"text_content_{index}"
        log_debug(f"Adding text content: {content_name}")
        self.add_content(name=content_name, text_content=text_content, **shared)

    if topics:
        self.add_content(name=name, topics=topics, **shared)

    if remote_content:
        # include/exclude are not forwarded for remote content.
        self.add_content(
            name=name,
            metadata=metadata,
            description=description,
            remote_content=remote_content,
            upsert=upsert,
            skip_if_exists=skip_if_exists,
            reader=reader,
        )
318
+
319
+ # --- Add Content ---
320
+
321
@overload
async def add_content_async(
    self,
    *,
    path: Optional[str] = None,
    url: Optional[str] = None,
    text_content: Optional[str] = None,
    metadata: Optional[Dict[str, str]] = None,
    include: Optional[List[str]] = None,
    exclude: Optional[List[str]] = None,
    upsert: bool = True,
    skip_if_exists: bool = False,
    reader: Optional[Reader] = None,
    auth: Optional[ContentAuth] = None,
) -> None: ...

@overload
async def add_content_async(self, *args, **kwargs) -> None: ...

async def add_content_async(
    self,
    name: Optional[str] = None,
    description: Optional[str] = None,
    path: Optional[str] = None,
    url: Optional[str] = None,
    text_content: Optional[str] = None,
    metadata: Optional[Dict[str, Any]] = None,
    topics: Optional[List[str]] = None,
    remote_content: Optional[RemoteContent] = None,
    reader: Optional[Reader] = None,
    include: Optional[List[str]] = None,
    exclude: Optional[List[str]] = None,
    upsert: bool = True,
    skip_if_exists: bool = False,
    auth: Optional[ContentAuth] = None,
) -> None:
    """
    Asynchronously add one content item to the knowledge base.

    Builds a ``Content`` from the arguments, assigns its content hash and id, and
    passes it to ``_load_content_async``. When none of ``path``, ``url``,
    ``text_content``, ``topics`` or ``remote_content`` is given, a warning is logged
    and nothing is added.

    Args:
        name: Optional name for the content
        description: Optional description for the content
        path: Optional file path to load content from
        url: Optional URL to load content from
        text_content: Optional text content to add directly
        metadata: Optional metadata dictionary
        topics: Optional list of topics
        remote_content: Optional remote content configuration
        reader: Optional custom reader for processing the content
        include: Optional list of file patterns to include
        exclude: Optional list of file patterns to exclude
        upsert: Forwarded to the loader
        skip_if_exists: Forwarded to the loader
        auth: Optional auth information attached to the content
    """
    sources = (path, url, text_content, topics, remote_content)
    if not any(source is not None for source in sources):
        log_warning(
            "At least one of 'path', 'url', 'text_content', 'topics', or 'remote_content' must be provided."
        )
        return

    # Only non-empty text is wrapped in FileData.
    text_payload = None
    if text_content:
        text_payload = FileData(content=text_content, type="Text")

    new_content = Content(
        name=name,
        description=description,
        path=path,
        url=url,
        file_data=text_payload or None,
        metadata=metadata,
        topics=topics,
        remote_content=remote_content,
        reader=reader,
        auth=auth,
    )
    # The id is derived from the content hash.
    new_content.content_hash = self._build_content_hash(new_content)
    new_content.id = generate_id(new_content.content_hash)

    await self._load_content_async(new_content, upsert, skip_if_exists, include, exclude)
385
+
386
@overload
def add_content(
    self,
    *,
    path: Optional[str] = None,
    url: Optional[str] = None,
    text_content: Optional[str] = None,
    metadata: Optional[Dict[str, str]] = None,
    include: Optional[List[str]] = None,
    exclude: Optional[List[str]] = None,
    upsert: bool = True,
    skip_if_exists: bool = False,
    reader: Optional[Reader] = None,
    auth: Optional[ContentAuth] = None,
) -> None: ...

@overload
def add_content(self, *args, **kwargs) -> None: ...

def add_content(
    self,
    name: Optional[str] = None,
    description: Optional[str] = None,
    path: Optional[str] = None,
    url: Optional[str] = None,
    text_content: Optional[str] = None,
    metadata: Optional[Dict[str, Any]] = None,
    topics: Optional[List[str]] = None,
    remote_content: Optional[RemoteContent] = None,
    reader: Optional[Reader] = None,
    include: Optional[List[str]] = None,
    exclude: Optional[List[str]] = None,
    upsert: bool = True,
    skip_if_exists: bool = False,
    auth: Optional[ContentAuth] = None,
) -> None:
    """
    Synchronously add one content item to the knowledge base.

    Builds a ``Content`` from the arguments, assigns its content hash and id, and
    passes it to ``_load_content``. When none of ``path``, ``url``, ``text_content``,
    ``topics`` or ``remote_content`` is given, a warning is logged and nothing is added.

    Args:
        name: Optional name for the content
        description: Optional description for the content
        path: Optional file path to load content from
        url: Optional URL to load content from
        text_content: Optional text content to add directly
        metadata: Optional metadata dictionary
        topics: Optional list of topics
        remote_content: Optional cloud storage configuration
        reader: Optional custom reader for processing the content
        include: Optional list of file patterns to include
        exclude: Optional list of file patterns to exclude
        upsert: Whether to update existing content if it already exists (only used when skip_if_exists=False)
        skip_if_exists: Whether to skip adding content if it already exists (default: False)
        auth: Optional auth information attached to the content
    """
    sources = (path, url, text_content, topics, remote_content)
    if not any(source is not None for source in sources):
        log_warning(
            "At least one of 'path', 'url', 'text_content', 'topics', or 'remote_content' must be provided."
        )
        return

    # Only non-empty text is wrapped in FileData.
    text_payload = None
    if text_content:
        text_payload = FileData(content=text_content, type="Text")

    new_content = Content(
        name=name,
        description=description,
        path=path,
        url=url,
        file_data=text_payload or None,
        metadata=metadata,
        topics=topics,
        remote_content=remote_content,
        reader=reader,
        auth=auth,
    )
    # The id is derived from the content hash.
    new_content.content_hash = self._build_content_hash(new_content)
    new_content.id = generate_id(new_content.content_hash)

    self._load_content(new_content, upsert, skip_if_exists, include, exclude)
468
+
469
def _should_skip(self, content_hash: str, skip_if_exists: bool) -> bool:
    """
    Handle the skip_if_exists logic for content that already exists in the vector database.

    Args:
        content_hash: The content hash string to check for existence
        skip_if_exists: Whether to skip if content already exists

    Returns:
        bool: True if should skip processing, False if should continue
    """
    from agno.vectordb import VectorDb

    self.vector_db = cast(VectorDb, self.vector_db)

    # Test the cheap conditions first: the vector database is only queried when
    # its answer can actually change the result.
    if not skip_if_exists or not self.vector_db:
        return False

    if self.vector_db.content_hash_exists(content_hash):
        log_debug(f"Content already exists: {content_hash}, skipping...")
        return True

    return False
488
+
489
def _select_reader_by_extension(
    self, file_extension: str, provided_reader: Optional[Reader] = None
) -> Tuple[Optional[Reader], str]:
    """
    Pick the reader that matches a file extension.

    Args:
        file_extension: File extension including the dot (e.g., '.pdf', '.csv'); matched case-insensitively
        provided_reader: Reader supplied by the caller; when truthy it is returned as-is

    Returns:
        Tuple of (reader, default_name). The default name is "data.csv" for CSV files
        and an empty string otherwise. Unknown extensions get the text reader.
    """
    if provided_reader:
        return provided_reader, ""

    # Map to attribute *names* so that only the selected reader attribute is accessed.
    readers_by_suffix = {
        ".csv": ("csv_reader", "data.csv"),
        ".pdf": ("pdf_reader", ""),
        ".docx": ("docx_reader", ""),
        ".pptx": ("pptx_reader", ""),
        ".json": ("json_reader", ""),
        ".markdown": ("markdown_reader", ""),
    }
    attribute, default_name = readers_by_suffix.get(file_extension.lower(), ("text_reader", ""))
    return getattr(self, attribute), default_name
520
+
521
def _select_reader_by_uri(self, uri: str, provided_reader: Optional[Reader] = None) -> Optional[Reader]:
    """
    Pick the reader that matches the suffix of a URI or file path.

    Args:
        uri: URI or file path; the suffix is matched case-insensitively
        provided_reader: Reader supplied by the caller; when truthy it is returned as-is

    Returns:
        The matching reader, or the text reader when no known suffix matches
    """
    if provided_reader:
        return provided_reader

    # Attribute names are looked up lazily so only the selected reader is accessed.
    suffix_to_attribute = (
        (".pdf", "pdf_reader"),
        (".csv", "csv_reader"),
        (".docx", "docx_reader"),
        (".pptx", "pptx_reader"),
        (".json", "json_reader"),
        (".markdown", "markdown_reader"),
    )
    lowered = uri.lower()
    for suffix, attribute in suffix_to_attribute:
        if lowered.endswith(suffix):
            return getattr(self, attribute)
    return self.text_reader
550
+
551
def _read(
    self,
    reader: Reader,
    source: Union[Path, str, BytesIO],
    name: Optional[str] = None,
    password: Optional[str] = None,
) -> List[Document]:
    """
    Read content using a reader with optional password handling.

    The password is forwarded only when one is given and the reader's ``read``
    method declares a ``password`` parameter.

    Args:
        reader: Reader to use
        source: Source to read from (Path, URL string, or BytesIO)
        name: Optional name for the document
        password: Optional password for protected files

    Returns:
        List of documents read
    """
    import inspect

    extra_kwargs = {}
    # Introspect the reader only when there is a password to forward.
    if password and "password" in inspect.signature(reader.read).parameters:
        extra_kwargs["password"] = password

    # Every source type is passed to the reader the same way.
    return reader.read(source, name=name, **extra_kwargs)
583
+
584
async def _read_async(
    self,
    reader: Reader,
    source: Union[Path, str, BytesIO],
    name: Optional[str] = None,
    password: Optional[str] = None,
) -> List[Document]:
    """
    Read content using a reader's async_read method with optional password handling.

    The password is forwarded only when one is given and the reader's
    ``async_read`` method declares a ``password`` parameter.

    Args:
        reader: Reader to use
        source: Source to read from (Path, URL string, or BytesIO)
        name: Optional name for the document
        password: Optional password for protected files

    Returns:
        List of documents read
    """
    import inspect

    extra_kwargs = {}
    # Introspect the reader only when there is a password to forward.
    if password and "password" in inspect.signature(reader.async_read).parameters:
        extra_kwargs["password"] = password

    # Every source type is passed to the reader the same way.
    return await reader.async_read(source, name=name, **extra_kwargs)
613
+
614
def _prepare_documents_for_insert(
    self,
    documents: List[Document],
    content_id: str,
    calculate_sizes: bool = False,
    metadata: Optional[Dict[str, Any]] = None,
) -> List[Document]:
    """
    Stamp documents with their content id before insertion.

    Documents are modified in place; the same list is returned.

    Args:
        documents: Documents to prepare
        content_id: Content ID written to every document
        calculate_sizes: When True, documents that have content but no size get
            their size set to the UTF-8 byte length of the content
        metadata: Optional metadata merged into each document's ``meta_data``

    Returns:
        The list of prepared documents
    """
    for doc in documents:
        doc.content_id = content_id

        size_missing = calculate_sizes and doc.content and not doc.size
        if size_missing:
            doc.size = len(doc.content.encode("utf-8"))

        if metadata:
            doc.meta_data.update(metadata)

    return documents
640
+
641
def _chunk_documents_sync(self, reader: Reader, documents: List[Document]) -> List[Document]:
    """
    Chunk documents synchronously.

    Chunking happens here only when a reader is given and its ``chunk`` flag is
    falsy; otherwise the input list is returned unchanged.
    NOTE(review): presumably a reader with ``chunk`` set has already chunked while
    reading - confirm against the Reader implementation.

    Args:
        reader: Reader whose ``chunk_document`` is applied to each document
        documents: Documents to chunk

    Returns:
        List of chunked documents
    """
    if reader and not reader.chunk:
        return [piece for doc in documents for piece in reader.chunk_document(doc)]
    return documents
659
+
660
async def _load_from_path_async(
    self,
    content: Content,
    upsert: bool,
    skip_if_exists: bool,
    include: Optional[List[str]] = None,
    exclude: Optional[List[str]] = None,
):
    """
    Load content from ``content.path`` (a file or a directory) into the vector database.

    A file is registered in the contents database, read with the content's reader
    (or one chosen by file extension) and inserted. A directory is walked entry by
    entry, recursing into this method for every entry that passes the
    include/exclude filters. Any other path only logs a warning.

    Args:
        content: Content describing the path to load
        upsert: Forwarded to the vector database insert step
        skip_if_exists: When True, content whose hash already exists is marked completed and not re-read
        include: Optional list of file patterns to include
        exclude: Optional list of file patterns to exclude
    """
    from agno.vectordb import VectorDb

    self.vector_db = cast(VectorDb, self.vector_db)

    log_info(f"Adding content from path, {content.id}, {content.name}, {content.path}, {content.description}")
    path = Path(content.path)  # type: ignore

    if not path.is_file():
        if not path.is_dir():
            log_warning(f"Invalid path: {path}")
            return

        for entry in path.iterdir():
            # Apply include/exclude filtering
            if not self._should_include_file(str(entry), include, exclude):
                log_debug(f"Skipping file {entry} due to include/exclude filters")
                continue

            # Each directory entry becomes its own Content with its own hash and id.
            entry_content = Content(
                name=content.name,
                path=str(entry),
                metadata=content.metadata,
                description=content.description,
                reader=content.reader,
            )
            entry_content.content_hash = self._build_content_hash(entry_content)
            entry_content.id = generate_id(entry_content.content_hash)

            await self._load_from_path_async(entry_content, upsert, skip_if_exists, include, exclude)
        return

    if not self._should_include_file(str(path), include, exclude):
        return
    log_debug(f"Adding file {path} due to include/exclude filters")

    await self._add_to_contents_db_async(content)
    if self._should_skip(content.content_hash, skip_if_exists):  # type: ignore[arg-type]
        content.status = ContentStatus.COMPLETED
        await self._aupdate_content(content)
        return

    # Handle LightRAG special case - read file and upload directly
    if self.vector_db.__class__.__name__ == "LightRag":
        await self._process_lightrag_content_async(content, KnowledgeContentOrigin.PATH)
        return

    reader = content.reader
    if not reader:
        reader = ReaderFactory.get_reader_for_extension(path.suffix)
        log_debug(f"Using Reader: {reader.__class__.__name__}")

    read_documents = []
    if reader:
        auth = content.auth
        password = auth.password if auth and auth.password else None
        read_documents = await self._read_async(reader, path, name=content.name or path.name, password=password)

    if not content.file_type:
        content.file_type = path.suffix

    # Size comes from in-memory file data when present, otherwise from the file system.
    if not content.size and content.file_data:
        content.size = len(content.file_data.content)  # type: ignore
    if not content.size:
        try:
            content.size = path.stat().st_size
        except (OSError, IOError) as e:
            log_warning(f"Could not get file size for {path}: {e}")
            content.size = 0

    if not content.id:
        content.id = generate_id(content.content_hash or "")
    self._prepare_documents_for_insert(read_documents, content.id)

    await self._handle_vector_db_insert_async(content, read_documents, upsert)
742
+
743
def _load_from_path(
    self,
    content: Content,
    upsert: bool,
    skip_if_exists: bool,
    include: Optional[List[str]] = None,
    exclude: Optional[List[str]] = None,
):
    """
    Synchronously load content from ``content.path`` (a file or a directory) into the vector database.

    A file is registered in the contents database, read with the content's reader
    (or one chosen by file extension) and inserted. A directory is walked entry by
    entry, recursing into this method for every entry that passes the
    include/exclude filters. Any other path only logs a warning.

    Args:
        content: Content describing the path to load
        upsert: Forwarded to the vector database insert step
        skip_if_exists: When True, content whose hash already exists is marked completed and not re-read
        include: Optional list of file patterns to include
        exclude: Optional list of file patterns to exclude
    """
    from agno.vectordb import VectorDb

    self.vector_db = cast(VectorDb, self.vector_db)

    log_info(f"Adding content from path, {content.id}, {content.name}, {content.path}, {content.description}")
    path = Path(content.path)  # type: ignore

    if path.is_file():
        if self._should_include_file(str(path), include, exclude):
            log_debug(f"Adding file {path} due to include/exclude filters")

            self._add_to_contents_db(content)
            if self._should_skip(content.content_hash, skip_if_exists):  # type: ignore[arg-type]
                content.status = ContentStatus.COMPLETED
                self._update_content(content)
                return

            # Handle LightRAG special case - read file and upload directly
            if self.vector_db.__class__.__name__ == "LightRag":
                self._process_lightrag_content(content, KnowledgeContentOrigin.PATH)
                return

            reader = content.reader
            if not reader:
                reader = ReaderFactory.get_reader_for_extension(path.suffix)
                log_debug(f"Using Reader: {reader.__class__.__name__}")

            # Start from an empty list so a missing reader leads to an empty insert
            # rather than an unbound variable further down.
            read_documents: List[Document] = []
            if reader:
                # TODO: We will refactor this to eventually pass authorization to all readers
                password = content.auth.password if content.auth and content.auth.password else None
                read_documents = self._read(reader, path, name=content.name or path.name, password=password)

            if not content.file_type:
                content.file_type = path.suffix

            # Size comes from in-memory file data when present, otherwise from the file system.
            if not content.size and content.file_data:
                content.size = len(content.file_data.content)  # type: ignore
            if not content.size:
                try:
                    content.size = path.stat().st_size
                except (OSError, IOError) as e:
                    log_warning(f"Could not get file size for {path}: {e}")
                    content.size = 0

            if not content.id:
                content.id = generate_id(content.content_hash or "")
            self._prepare_documents_for_insert(read_documents, content.id)

            self._handle_vector_db_insert(content, read_documents, upsert)

    elif path.is_dir():
        for file_path in path.iterdir():
            # Apply include/exclude filtering
            if not self._should_include_file(str(file_path), include, exclude):
                log_debug(f"Skipping file {file_path} due to include/exclude filters")
                continue

            # Each directory entry becomes its own Content with its own hash and id.
            file_content = Content(
                name=content.name,
                path=str(file_path),
                metadata=content.metadata,
                description=content.description,
                reader=content.reader,
            )
            file_content.content_hash = self._build_content_hash(file_content)
            file_content.id = generate_id(file_content.content_hash)

            self._load_from_path(file_content, upsert, skip_if_exists, include, exclude)
    else:
        log_warning(f"Invalid path: {path}")
838
+
839
async def _load_from_url_async(
    self,
    content: Content,
    upsert: bool,
    skip_if_exists: bool,
):
    """Load the content behind ``content.url`` into the vector database.

    1. Add the content to the contents database (stop early if it should be skipped)
    2. Validate the URL (stop and mark the content as failed if it is invalid)
    3. Download the payload when the URL path has a file extension
    4. Select a reader
    5. Read the content
    6. Chunk the documents when the reader's ``chunk`` flag is falsy
    7. Prepare and insert the documents in the vector database

    Args:
        content: Content describing the URL to load
        upsert: Forwarded to the vector database insert step
        skip_if_exists: When True, content whose hash already exists is marked completed and not re-read

    Raises:
        ValueError: If ``content.url`` is empty.
    """
    from urllib.parse import urlparse

    from agno.vectordb import VectorDb

    self.vector_db = cast(VectorDb, self.vector_db)

    log_info(f"Adding content from URL {content.url}")
    content.file_type = "url"

    if not content.url:
        raise ValueError("No url provided")

    # 1. Add content to contents database
    await self._add_to_contents_db_async(content)
    if self._should_skip(content.content_hash, skip_if_exists):  # type: ignore[arg-type]
        content.status = ContentStatus.COMPLETED
        await self._aupdate_content(content)
        return

    if self.vector_db.__class__.__name__ == "LightRag":
        await self._process_lightrag_content_async(content, KnowledgeContentOrigin.URL)
        return

    # 2. Validate URL
    try:
        parsed_url = urlparse(content.url)
    except Exception as e:
        content.status = ContentStatus.FAILED
        content.status_message = f"Invalid URL: {content.url} - {str(e)}"
        await self._aupdate_content(content)
        log_warning(f"Invalid URL: {content.url} - {str(e)}")
        # Nothing was parsed, so there is nothing to fetch.
        return

    if not all([parsed_url.scheme, parsed_url.netloc]):
        content.status = ContentStatus.FAILED
        content.status_message = f"Invalid URL format: {content.url}"
        await self._aupdate_content(content)
        log_warning(f"Invalid URL format: {content.url}")
        # The content is already marked as failed; do not try to fetch it.
        return

    # 3. Fetch and load content if file has an extension
    url_path = Path(parsed_url.path)
    file_extension = url_path.suffix.lower()

    bytes_content = None
    if file_extension:
        async with AsyncClient() as client:
            response = await async_fetch_with_retry(content.url, client=client)
        bytes_content = BytesIO(response.content)

    # 4. Select reader
    name = content.name if content.name else content.url
    if file_extension:
        reader, default_name = self._select_reader_by_extension(file_extension, content.reader)
        if default_name and file_extension == ".csv":
            name = basename(parsed_url.path) or default_name
    else:
        # URLs without a file extension are read as web pages unless a reader was supplied.
        reader = content.reader or self.website_reader

    # 5. Read content
    try:
        read_documents = []
        if reader is not None:
            # Special handling for YouTubeReader
            if reader.__class__.__name__ == "YouTubeReader":
                read_documents = await reader.async_read(content.url, name=name)
            else:
                password = content.auth.password if content.auth and content.auth.password else None
                source = bytes_content if bytes_content else content.url
                read_documents = await self._read_async(reader, source, name=name, password=password)

    except Exception as e:
        log_error(f"Error reading URL: {content.url} - {str(e)}")
        content.status = ContentStatus.FAILED
        content.status_message = f"Error reading URL: {content.url} - {str(e)}"
        await self._aupdate_content(content)
        return

    # 6. Chunk documents if needed
    if reader and not reader.chunk:
        read_documents = await reader.chunk_documents_async(read_documents)

    # 7. Prepare and insert the content in the vector database
    if not content.id:
        content.id = generate_id(content.content_hash or "")
    self._prepare_documents_for_insert(read_documents, content.id, calculate_sizes=True)
    await self._handle_vector_db_insert_async(content, read_documents, upsert)
933
+
934
def _load_from_url(
    self,
    content: Content,
    upsert: bool,
    skip_if_exists: bool,
):
    """Synchronously load the content referenced by ``content.url``.

    Steps:
    1. Add the content to the contents database (and stop early when
       ``_should_skip`` says so, or hand off to the LightRag path)
    2. Validate the URL; a malformed URL marks the content FAILED and stops
    3. Download the payload when the URL path has a file extension
    4. Select a reader
    5. Read the content
    6. Chunk the documents
    7. Prepare and insert the documents in the vector database

    Args:
        content: The content entry to load; its status fields are updated in place.
        upsert: Forwarded to the vector database insert step.
        skip_if_exists: Forwarded to ``_should_skip``.

    Raises:
        ValueError: If ``content.url`` is empty.
    """
    from urllib.parse import urlparse

    from agno.utils.http import fetch_with_retry
    from agno.vectordb import VectorDb

    self.vector_db = cast(VectorDb, self.vector_db)

    log_info(f"Adding content from URL {content.url}")
    content.file_type = "url"

    if not content.url:
        raise ValueError("No url provided")

    # 1. Add content to contents database
    self._add_to_contents_db(content)
    if self._should_skip(content.content_hash, skip_if_exists):  # type: ignore[arg-type]
        content.status = ContentStatus.COMPLETED
        self._update_content(content)
        return

    if self.vector_db.__class__.__name__ == "LightRag":
        self._process_lightrag_content(content, KnowledgeContentOrigin.URL)
        return

    # 2. Validate URL
    try:
        parsed_url = urlparse(content.url)
    except Exception as e:
        content.status = ContentStatus.FAILED
        content.status_message = f"Invalid URL: {content.url} - {str(e)}"
        self._update_content(content)
        log_warning(f"Invalid URL: {content.url} - {str(e)}")
        # Nothing can be fetched without a parsed URL; without this return
        # the code below would hit an unbound ``parsed_url``.
        return

    if not all([parsed_url.scheme, parsed_url.netloc]):
        content.status = ContentStatus.FAILED
        content.status_message = f"Invalid URL format: {content.url}"
        self._update_content(content)
        log_warning(f"Invalid URL format: {content.url}")
        # Stop here so the FAILED status is not followed by a fetch/insert
        # that could overwrite it.
        return

    # 3. Fetch and load content if file has an extension
    url_path = Path(parsed_url.path)
    file_extension = url_path.suffix.lower()

    bytes_content = None
    if file_extension:
        response = fetch_with_retry(content.url)
        bytes_content = BytesIO(response.content)

    # 4. Select reader
    name = content.name if content.name else content.url
    if file_extension:
        reader, default_name = self._select_reader_by_extension(file_extension, content.reader)
        if default_name and file_extension == ".csv":
            name = basename(parsed_url.path) or default_name
    else:
        reader = content.reader or self.website_reader

    # 5. Read content
    try:
        read_documents = []
        if reader is not None:
            # Special handling for YouTubeReader: it is given the URL itself,
            # never the downloaded bytes.
            if reader.__class__.__name__ == "YouTubeReader":
                read_documents = reader.read(content.url, name=name)
            else:
                password = content.auth.password if content.auth and content.auth.password else None
                source = bytes_content if bytes_content else content.url
                read_documents = self._read(reader, source, name=name, password=password)

    except Exception as e:
        log_error(f"Error reading URL: {content.url} - {str(e)}")
        content.status = ContentStatus.FAILED
        content.status_message = f"Error reading URL: {content.url} - {str(e)}"
        self._update_content(content)
        return

    # 6. Chunk documents if needed (sync version)
    if reader:
        read_documents = self._chunk_documents_sync(reader, read_documents)

    # 7. Prepare and insert the content in the vector database
    if not content.id:
        content.id = generate_id(content.content_hash or "")
    self._prepare_documents_for_insert(read_documents, content.id, calculate_sizes=True)
    self._handle_vector_db_insert(content, read_documents, upsert)
1032
+
1033
async def _load_from_content_async(
    self,
    content: Content,
    upsert: bool = True,
    skip_if_exists: bool = False,
):
    """Asynchronously load in-memory content carried by ``content.file_data``.

    ``content.file_data`` may be a plain ``str`` or a ``FileData`` object.
    When the content has no name, one is derived from the first 10 characters
    of the payload. The content is registered in the contents database, read
    into documents with the explicit or selected reader, and then inserted
    into the vector database.

    Args:
        content: The content entry to load; its name, id and status fields
            may be updated in place.
        upsert: Forwarded to the vector database insert step.
        skip_if_exists: Forwarded to ``_should_skip``.
    """
    from agno.vectordb import VectorDb

    self.vector_db = cast(VectorDb, self.vector_db)

    # Derive a display name from the payload when none was given.
    file_data = content.file_data
    if content.name:
        name = content.name
    elif isinstance(file_data, str):
        # A bare string has no ``.content`` attribute; slice it directly.
        name = file_data[:10] or None
    elif file_data and file_data.content:
        payload = file_data.content
        if isinstance(payload, bytes):
            name = payload[:10].decode("utf-8", errors="ignore")
        elif isinstance(payload, str):
            name = payload[:10]
        else:
            name = str(payload)[:10]
    else:
        name = None

    if name is not None:
        content.name = name

    log_info(f"Adding content from {content.name}")

    await self._add_to_contents_db_async(content)
    if self._should_skip(content.content_hash, skip_if_exists):  # type: ignore[arg-type]
        content.status = ContentStatus.COMPLETED
        await self._aupdate_content(content)
        return

    if content.file_data and self.vector_db.__class__.__name__ == "LightRag":
        await self._process_lightrag_content_async(content, KnowledgeContentOrigin.CONTENT)
        return

    read_documents = []

    if isinstance(content.file_data, str):
        content_bytes = content.file_data.encode("utf-8", errors="replace")
        content_io = io.BytesIO(content_bytes)

        if content.reader:
            log_debug(f"Using reader: {content.reader.__class__.__name__} to read content")
            read_documents = await content.reader.async_read(content_io, name=name)
        else:
            text_reader = self.text_reader
            if text_reader:
                read_documents = await text_reader.async_read(content_io, name=name)
            else:
                content.status = ContentStatus.FAILED
                content.status_message = "Text reader not available"
                await self._aupdate_content(content)
                return

    elif isinstance(content.file_data, FileData):
        if content.file_data.type:
            if isinstance(content.file_data.content, bytes):
                content_io = io.BytesIO(content.file_data.content)
            elif isinstance(content.file_data.content, str):
                content_bytes = content.file_data.content.encode("utf-8", errors="replace")
                content_io = io.BytesIO(content_bytes)
            else:
                content_io = content.file_data.content  # type: ignore

            # Respect an explicitly provided reader; otherwise select based on file type
            if content.reader:
                log_debug(f"Using reader: {content.reader.__class__.__name__} to read content")
                reader = content.reader
            else:
                reader = self._select_reader(content.file_data.type)
            name = content.name if content.name else f"content_{content.file_data.type}"
            read_documents = await reader.async_read(content_io, name=name)
            if not content.id:
                content.id = generate_id(content.content_hash or "")
            self._prepare_documents_for_insert(read_documents, content.id, metadata=content.metadata)

        # Also reached when ``file_data.type`` is empty: nothing was read, so
        # report the failure instead of inserting zero documents.
        if not read_documents:
            content.status = ContentStatus.FAILED
            content.status_message = "Content could not be read"
            await self._aupdate_content(content)
            return

    else:
        content.status = ContentStatus.FAILED
        content.status_message = "No content provided"
        await self._aupdate_content(content)
        return

    await self._handle_vector_db_insert_async(content, read_documents, upsert)
1128
+
1129
def _load_from_content(
    self,
    content: Content,
    upsert: bool = True,
    skip_if_exists: bool = False,
):
    """Synchronously load in-memory content carried by ``content.file_data``.

    ``content.file_data`` may be a plain ``str`` or a ``FileData`` object.
    When the content has no name, one is derived from the first 10 characters
    of the payload. The content is registered in the contents database, read
    into documents with the explicit or selected reader, and then inserted
    into the vector database.

    Args:
        content: The content entry to load; its name, id and status fields
            may be updated in place.
        upsert: Forwarded to the vector database insert step.
        skip_if_exists: Forwarded to ``_should_skip``.
    """
    from agno.vectordb import VectorDb

    self.vector_db = cast(VectorDb, self.vector_db)

    # Derive a display name from the payload when none was given.
    file_data = content.file_data
    if content.name:
        name = content.name
    elif isinstance(file_data, str):
        # A bare string has no ``.content`` attribute; slice it directly.
        name = file_data[:10] or None
    elif file_data and file_data.content:
        payload = file_data.content
        if isinstance(payload, bytes):
            name = payload[:10].decode("utf-8", errors="ignore")
        elif isinstance(payload, str):
            name = payload[:10]
        else:
            name = str(payload)[:10]
    else:
        name = None

    if name is not None:
        content.name = name

    log_info(f"Adding content from {content.name}")

    self._add_to_contents_db(content)
    if self._should_skip(content.content_hash, skip_if_exists):  # type: ignore[arg-type]
        content.status = ContentStatus.COMPLETED
        self._update_content(content)
        return

    if content.file_data and self.vector_db.__class__.__name__ == "LightRag":
        self._process_lightrag_content(content, KnowledgeContentOrigin.CONTENT)
        return

    read_documents = []

    if isinstance(content.file_data, str):
        content_bytes = content.file_data.encode("utf-8", errors="replace")
        content_io = io.BytesIO(content_bytes)

        if content.reader:
            log_debug(f"Using reader: {content.reader.__class__.__name__} to read content")
            read_documents = content.reader.read(content_io, name=name)
        else:
            text_reader = self.text_reader
            if text_reader:
                read_documents = text_reader.read(content_io, name=name)
            else:
                content.status = ContentStatus.FAILED
                content.status_message = "Text reader not available"
                self._update_content(content)
                return

    elif isinstance(content.file_data, FileData):
        if content.file_data.type:
            if isinstance(content.file_data.content, bytes):
                content_io = io.BytesIO(content.file_data.content)
            elif isinstance(content.file_data.content, str):
                content_bytes = content.file_data.content.encode("utf-8", errors="replace")
                content_io = io.BytesIO(content_bytes)
            else:
                content_io = content.file_data.content  # type: ignore

            # Respect an explicitly provided reader; otherwise select based on file type
            if content.reader:
                log_debug(f"Using reader: {content.reader.__class__.__name__} to read content")
                reader = content.reader
            else:
                reader = self._select_reader(content.file_data.type)
            name = content.name if content.name else f"content_{content.file_data.type}"
            read_documents = reader.read(content_io, name=name)
            if not content.id:
                content.id = generate_id(content.content_hash or "")
            self._prepare_documents_for_insert(read_documents, content.id, metadata=content.metadata)

        # Also reached when ``file_data.type`` is empty: nothing was read, so
        # report the failure instead of inserting zero documents.
        if not read_documents:
            content.status = ContentStatus.FAILED
            content.status_message = "Content could not be read"
            self._update_content(content)
            return

    else:
        content.status = ContentStatus.FAILED
        content.status_message = "No content provided"
        self._update_content(content)
        return

    self._handle_vector_db_insert(content, read_documents, upsert)
1225
+
1226
async def _load_from_topics_async(
    self,
    content: Content,
    upsert: bool,
    skip_if_exists: bool,
):
    """Asynchronously load one content entry per topic in ``content.topics``.

    For every topic a dedicated ``Content`` is built (inheriting the parent's
    metadata and reader), hashed, registered in the contents database, read
    with ``content.reader`` and inserted into the vector database. A problem
    with one topic (already present, no reader, nothing found) only affects
    that topic; the remaining topics are still processed.

    Args:
        content: Parent content whose ``topics``, ``metadata`` and ``reader``
            are used as the template for each per-topic entry.
        upsert: Forwarded to the vector database insert step.
        skip_if_exists: Forwarded to ``_should_skip`` and used for the
            vector-database hash check.
    """
    from agno.vectordb import VectorDb

    self.vector_db = cast(VectorDb, self.vector_db)
    log_info(f"Adding content from topics: {content.topics}")

    if content.topics is None:
        log_warning("No topics provided for content")
        return

    for topic in content.topics:
        # Use a separate name so the ``content`` parameter is never rebound.
        topic_content = Content(
            name=topic,
            metadata=content.metadata,
            reader=content.reader,
            status=ContentStatus.PROCESSING if content.reader else ContentStatus.FAILED,
            file_data=FileData(
                type="Topic",
            ),
            topics=[topic],
        )
        topic_content.content_hash = self._build_content_hash(topic_content)
        topic_content.id = generate_id(topic_content.content_hash)

        await self._add_to_contents_db_async(topic_content)
        if self._should_skip(topic_content.content_hash, skip_if_exists):
            topic_content.status = ContentStatus.COMPLETED
            await self._aupdate_content(topic_content)
            # Skip only this topic; the others still need loading.
            continue

        if self.vector_db.__class__.__name__ == "LightRag":
            await self._process_lightrag_content_async(topic_content, KnowledgeContentOrigin.TOPIC)
            continue

        if self.vector_db and self.vector_db.content_hash_exists(topic_content.content_hash) and skip_if_exists:
            log_info(f"Content {topic_content.content_hash} already exists, skipping")
            continue

        if topic_content.reader is None:
            log_error(f"No reader available for topic: {topic}")
            topic_content.status = ContentStatus.FAILED
            topic_content.status_message = "No reader available for topic"
            await self._aupdate_content(topic_content)
            continue

        read_documents = await topic_content.reader.async_read(topic)
        if not read_documents:
            topic_content.status = ContentStatus.FAILED
            topic_content.status_message = "No content found for topic"
            await self._aupdate_content(topic_content)
            # Do not fall through to the insert step: it would overwrite the
            # FAILED status after inserting nothing.
            continue

        self._prepare_documents_for_insert(read_documents, topic_content.id, calculate_sizes=True)
        await self._handle_vector_db_insert_async(topic_content, read_documents, upsert)
1286
+
1287
def _load_from_topics(
    self,
    content: Content,
    upsert: bool,
    skip_if_exists: bool,
):
    """Synchronously load one content entry per topic in ``content.topics``.

    For every topic a dedicated ``Content`` is built (inheriting the parent's
    metadata and reader), hashed, registered in the contents database, read
    with ``content.reader`` and inserted into the vector database. A problem
    with one topic (already present, no reader, nothing found) only affects
    that topic; the remaining topics are still processed.

    Args:
        content: Parent content whose ``topics``, ``metadata`` and ``reader``
            are used as the template for each per-topic entry.
        upsert: Forwarded to the vector database insert step.
        skip_if_exists: Forwarded to ``_should_skip`` and used for the
            vector-database hash check.
    """
    from agno.vectordb import VectorDb

    self.vector_db = cast(VectorDb, self.vector_db)
    log_info(f"Adding content from topics: {content.topics}")

    if content.topics is None:
        log_warning("No topics provided for content")
        return

    for topic in content.topics:
        # Use a separate name so the ``content`` parameter is never rebound.
        topic_content = Content(
            name=topic,
            metadata=content.metadata,
            reader=content.reader,
            status=ContentStatus.PROCESSING if content.reader else ContentStatus.FAILED,
            file_data=FileData(
                type="Topic",
            ),
            topics=[topic],
        )
        topic_content.content_hash = self._build_content_hash(topic_content)
        topic_content.id = generate_id(topic_content.content_hash)

        self._add_to_contents_db(topic_content)
        if self._should_skip(topic_content.content_hash, skip_if_exists):
            topic_content.status = ContentStatus.COMPLETED
            self._update_content(topic_content)
            # Skip only this topic; the others still need loading.
            continue

        if self.vector_db.__class__.__name__ == "LightRag":
            self._process_lightrag_content(topic_content, KnowledgeContentOrigin.TOPIC)
            continue

        if self.vector_db and self.vector_db.content_hash_exists(topic_content.content_hash) and skip_if_exists:
            log_info(f"Content {topic_content.content_hash} already exists, skipping")
            continue

        if topic_content.reader is None:
            log_error(f"No reader available for topic: {topic}")
            topic_content.status = ContentStatus.FAILED
            topic_content.status_message = "No reader available for topic"
            self._update_content(topic_content)
            continue

        read_documents = topic_content.reader.read(topic)
        if not read_documents:
            topic_content.status = ContentStatus.FAILED
            topic_content.status_message = "No content found for topic"
            self._update_content(topic_content)
            # Do not fall through to the insert step: it would overwrite the
            # FAILED status after inserting nothing.
            continue

        self._prepare_documents_for_insert(read_documents, topic_content.id, calculate_sizes=True)
        self._handle_vector_db_insert(topic_content, read_documents, upsert)
1348
+
1349
async def _load_from_remote_content_async(
    self,
    content: Content,
    upsert: bool,
    skip_if_exists: bool,
):
    """Route ``content.remote_content`` to the matching async remote loader.

    ``S3Content`` goes to ``_load_from_s3_async`` and ``GCSContent`` goes to
    ``_load_from_gcs_async``. A missing or unsupported remote content only
    produces a warning.
    """
    source = content.remote_content

    if source is None:
        log_warning("No remote content provided for content")
        return

    if isinstance(source, S3Content):
        await self._load_from_s3_async(content, upsert, skip_if_exists)
        return

    if isinstance(source, GCSContent):
        await self._load_from_gcs_async(content, upsert, skip_if_exists)
        return

    log_warning(f"Unsupported remote content type: {type(source)}")
1369
+
1370
async def _load_from_s3_async(self, content: Content, upsert: bool, skip_if_exists: bool):
    """Load the contextual S3 content.

    1. Identify objects to read
    2. Setup Content object
    3. Hash content and add it to the contents database
    4. Select reader
    5. Fetch and load the content
    6. Read the content
    7. Prepare and insert the content in the vector database
    8. Remove temporary file if needed

    Steps 2-8 run once per S3 object. An object that ``_should_skip``
    reports as already present is skipped on its own; the remaining
    objects are still processed.

    Args:
        content: Parent content; ``content.remote_content`` is treated as an
            ``S3Content`` and the parent's name, description, metadata and
            reader are applied to every per-object entry.
        upsert: Forwarded to the vector database insert step.
        skip_if_exists: Forwarded to ``_should_skip``.
    """
    from agno.cloud.aws.s3.object import S3Object

    remote_content: S3Content = cast(S3Content, content.remote_content)

    # 1. Identify objects to read
    objects_to_read: List[S3Object] = []
    if remote_content.bucket is not None:
        if remote_content.key is not None:
            _object = S3Object(bucket_name=remote_content.bucket.name, name=remote_content.key)
            objects_to_read.append(_object)
        elif remote_content.object is not None:
            objects_to_read.append(remote_content.object)
        elif remote_content.prefix is not None:
            objects_to_read.extend(remote_content.bucket.get_objects(prefix=remote_content.prefix))
        else:
            objects_to_read.extend(remote_content.bucket.get_objects())

    for s3_object in objects_to_read:
        # 2. Setup Content object
        content_name = content.name or ""
        content_name += "_" + (s3_object.name or "")
        content_entry = Content(
            name=content_name,
            description=content.description,
            status=ContentStatus.PROCESSING,
            metadata=content.metadata,
            file_type="s3",
        )

        # 3. Hash content and add it to the contents database
        content_entry.content_hash = self._build_content_hash(content_entry)
        content_entry.id = generate_id(content_entry.content_hash)
        await self._add_to_contents_db_async(content_entry)
        if self._should_skip(content_entry.content_hash, skip_if_exists):
            content_entry.status = ContentStatus.COMPLETED
            await self._aupdate_content(content_entry)
            # Skip only this object; the others still need loading.
            continue

        # 4. Select reader
        reader = self._select_reader_by_uri(s3_object.uri, content.reader)
        reader = cast(Reader, reader)

        # 5. Fetch and load the content
        temporary_file: Optional[Path] = None
        obj_name = content_name or s3_object.name.split("/")[-1]
        readable_content: Optional[Union[BytesIO, Path]] = None
        try:
            if s3_object.uri.endswith(".pdf"):
                readable_content = BytesIO(s3_object.get_resource().get()["Body"].read())
            else:
                temporary_file = Path("storage").joinpath(obj_name)
                readable_content = temporary_file
                s3_object.download(readable_content)  # type: ignore

            # 6. Read the content
            read_documents = await reader.async_read(readable_content, name=obj_name)

            # 7. Prepare and insert the content in the vector database.
            # Documents are tagged with the per-object entry id so they match
            # the row stored in the contents database.
            self._prepare_documents_for_insert(read_documents, content_entry.id)
            await self._handle_vector_db_insert_async(content_entry, read_documents, upsert)
        finally:
            # 8. Remove temporary file if needed, even when reading or
            # inserting raised.
            if temporary_file is not None and temporary_file.exists():
                temporary_file.unlink()
1447
+
1448
async def _load_from_gcs_async(self, content: Content, upsert: bool, skip_if_exists: bool):
    """Load the contextual GCS content.

    1. Identify objects to read
    2. Setup Content object
    3. Hash content and add it to the contents database
    4. Select reader
    5. Fetch and load the content
    6. Read the content
    7. Prepare and insert the content in the vector database

    Steps 2-7 run once per blob. A blob that ``_should_skip`` reports as
    already present is skipped on its own; the remaining blobs are still
    processed.

    Args:
        content: Parent content; ``content.remote_content`` is treated as a
            ``GCSContent`` and the parent's name, description, metadata and
            reader are applied to every per-blob entry.
        upsert: Forwarded to the vector database insert step.
        skip_if_exists: Forwarded to ``_should_skip``.
    """
    remote_content: GCSContent = cast(GCSContent, content.remote_content)

    # 1. Identify objects to read
    objects_to_read = []
    if remote_content.blob_name is not None:
        objects_to_read.append(remote_content.bucket.blob(remote_content.blob_name))  # type: ignore
    elif remote_content.prefix is not None:
        objects_to_read.extend(remote_content.bucket.list_blobs(prefix=remote_content.prefix))  # type: ignore
    else:
        objects_to_read.extend(remote_content.bucket.list_blobs())  # type: ignore

    for gcs_object in objects_to_read:
        # 2. Setup Content object
        name = (content.name or "content") + "_" + gcs_object.name
        content_entry = Content(
            name=name,
            description=content.description,
            status=ContentStatus.PROCESSING,
            metadata=content.metadata,
            file_type="gcs",
        )

        # 3. Hash content and add it to the contents database
        content_entry.content_hash = self._build_content_hash(content_entry)
        content_entry.id = generate_id(content_entry.content_hash)
        await self._add_to_contents_db_async(content_entry)
        if self._should_skip(content_entry.content_hash, skip_if_exists):
            content_entry.status = ContentStatus.COMPLETED
            await self._aupdate_content(content_entry)
            # Skip only this blob; the others still need loading.
            continue

        # 4. Select reader
        reader = self._select_reader_by_uri(gcs_object.name, content.reader)
        reader = cast(Reader, reader)

        # 5. Fetch and load the content
        readable_content = BytesIO(gcs_object.download_as_bytes())

        # 6. Read the content
        read_documents = await reader.async_read(readable_content, name=name)

        # 7. Prepare and insert the content in the vector database.
        # Documents are tagged with the per-blob entry id so they match the
        # row stored in the contents database.
        self._prepare_documents_for_insert(read_documents, content_entry.id)
        await self._handle_vector_db_insert_async(content_entry, read_documents, upsert)
1505
+
1506
def _load_from_remote_content(
    self,
    content: Content,
    upsert: bool,
    skip_if_exists: bool,
):
    """Route ``content.remote_content`` to the matching synchronous remote loader.

    ``S3Content`` goes to ``_load_from_s3`` and ``GCSContent`` goes to
    ``_load_from_gcs``. A missing or unsupported remote content only
    produces a warning.
    """
    source = content.remote_content

    if source is None:
        log_warning("No remote content provided for content")
        return

    if isinstance(source, S3Content):
        self._load_from_s3(content, upsert, skip_if_exists)
        return

    if isinstance(source, GCSContent):
        self._load_from_gcs(content, upsert, skip_if_exists)
        return

    log_warning(f"Unsupported remote content type: {type(source)}")
1527
+
1528
def _load_from_s3(self, content: Content, upsert: bool, skip_if_exists: bool):
    """Synchronous version of _load_from_s3.

    Load the contextual S3 content:
    1. Identify objects to read
    2. Setup Content object
    3. Hash content and add it to the contents database
    4. Select reader
    5. Fetch and load the content
    6. Read the content
    7. Prepare and insert the content in the vector database
    8. Remove temporary file if needed

    Steps 2-8 run once per S3 object. An object that ``_should_skip``
    reports as already present is skipped on its own; the remaining
    objects are still processed.

    Args:
        content: Parent content; ``content.remote_content`` is treated as an
            ``S3Content`` and the parent's name, description, metadata and
            reader are applied to every per-object entry.
        upsert: Forwarded to the vector database insert step.
        skip_if_exists: Forwarded to ``_should_skip``.
    """
    from agno.cloud.aws.s3.object import S3Object

    remote_content: S3Content = cast(S3Content, content.remote_content)

    # 1. Identify objects to read
    objects_to_read: List[S3Object] = []
    if remote_content.bucket is not None:
        if remote_content.key is not None:
            _object = S3Object(bucket_name=remote_content.bucket.name, name=remote_content.key)
            objects_to_read.append(_object)
        elif remote_content.object is not None:
            objects_to_read.append(remote_content.object)
        elif remote_content.prefix is not None:
            objects_to_read.extend(remote_content.bucket.get_objects(prefix=remote_content.prefix))
        else:
            objects_to_read.extend(remote_content.bucket.get_objects())

    for s3_object in objects_to_read:
        # 2. Setup Content object
        content_name = content.name or ""
        content_name += "_" + (s3_object.name or "")
        content_entry = Content(
            name=content_name,
            description=content.description,
            status=ContentStatus.PROCESSING,
            metadata=content.metadata,
            file_type="s3",
        )

        # 3. Hash content and add it to the contents database
        content_entry.content_hash = self._build_content_hash(content_entry)
        content_entry.id = generate_id(content_entry.content_hash)
        self._add_to_contents_db(content_entry)
        if self._should_skip(content_entry.content_hash, skip_if_exists):
            content_entry.status = ContentStatus.COMPLETED
            self._update_content(content_entry)
            # Skip only this object; the others still need loading.
            continue

        # 4. Select reader
        reader = self._select_reader_by_uri(s3_object.uri, content.reader)
        reader = cast(Reader, reader)

        # 5. Fetch and load the content
        temporary_file: Optional[Path] = None
        obj_name = content_name or s3_object.name.split("/")[-1]
        readable_content: Optional[Union[BytesIO, Path]] = None
        try:
            if s3_object.uri.endswith(".pdf"):
                readable_content = BytesIO(s3_object.get_resource().get()["Body"].read())
            else:
                temporary_file = Path("storage").joinpath(obj_name)
                readable_content = temporary_file
                s3_object.download(readable_content)  # type: ignore

            # 6. Read the content
            read_documents = reader.read(readable_content, name=obj_name)

            # 7. Prepare and insert the content in the vector database.
            # Documents are tagged with the per-object entry id so they match
            # the row stored in the contents database.
            self._prepare_documents_for_insert(read_documents, content_entry.id)
            self._handle_vector_db_insert(content_entry, read_documents, upsert)
        finally:
            # 8. Remove temporary file if needed, even when reading or
            # inserting raised.
            if temporary_file is not None and temporary_file.exists():
                temporary_file.unlink()
1606
+
1607
def _load_from_gcs(self, content: Content, upsert: bool, skip_if_exists: bool):
    """Synchronous version of _load_from_gcs.

    Load the contextual GCS content:
    1. Identify objects to read
    2. Setup Content object
    3. Hash content and add it to the contents database
    4. Select reader
    5. Fetch and load the content
    6. Read the content
    7. Prepare and insert the content in the vector database

    Steps 2-7 run once per blob. A blob that ``_should_skip`` reports as
    already present is skipped on its own; the remaining blobs are still
    processed.

    Args:
        content: Parent content; ``content.remote_content`` is treated as a
            ``GCSContent`` and the parent's name, description, metadata and
            reader are applied to every per-blob entry.
        upsert: Forwarded to the vector database insert step.
        skip_if_exists: Forwarded to ``_should_skip``.
    """
    remote_content: GCSContent = cast(GCSContent, content.remote_content)

    # 1. Identify objects to read
    objects_to_read = []
    if remote_content.blob_name is not None:
        objects_to_read.append(remote_content.bucket.blob(remote_content.blob_name))  # type: ignore
    elif remote_content.prefix is not None:
        objects_to_read.extend(remote_content.bucket.list_blobs(prefix=remote_content.prefix))  # type: ignore
    else:
        objects_to_read.extend(remote_content.bucket.list_blobs())  # type: ignore

    for gcs_object in objects_to_read:
        # 2. Setup Content object
        name = (content.name or "content") + "_" + gcs_object.name
        content_entry = Content(
            name=name,
            description=content.description,
            status=ContentStatus.PROCESSING,
            metadata=content.metadata,
            file_type="gcs",
        )

        # 3. Hash content and add it to the contents database
        content_entry.content_hash = self._build_content_hash(content_entry)
        content_entry.id = generate_id(content_entry.content_hash)
        self._add_to_contents_db(content_entry)
        if self._should_skip(content_entry.content_hash, skip_if_exists):
            content_entry.status = ContentStatus.COMPLETED
            self._update_content(content_entry)
            # Skip only this blob; the others still need loading.
            continue

        # 4. Select reader
        reader = self._select_reader_by_uri(gcs_object.name, content.reader)
        reader = cast(Reader, reader)

        # 5. Fetch and load the content
        readable_content = BytesIO(gcs_object.download_as_bytes())

        # 6. Read the content
        read_documents = reader.read(readable_content, name=name)

        # 7. Prepare and insert the content in the vector database.
        # Documents are tagged with the per-blob entry id so they match the
        # row stored in the contents database.
        self._prepare_documents_for_insert(read_documents, content_entry.id)
        self._handle_vector_db_insert(content_entry, read_documents, upsert)
1665
+
1666
async def _handle_vector_db_insert_async(self, content: Content, read_documents, upsert):
    """Write ``read_documents`` to the vector database and record the outcome.

    Upserts when ``upsert`` is requested and the vector database reports
    upsert support; otherwise inserts. On success the content is marked
    COMPLETED. On any failure (including a missing vector database) it is
    marked FAILED with a status message. The content record is persisted
    through ``_aupdate_content`` in every case.
    """
    from agno.vectordb import VectorDb

    self.vector_db = cast(VectorDb, self.vector_db)

    async def _mark_failed(reason: str) -> None:
        # Shared failure bookkeeping for all error exits below.
        content.status = ContentStatus.FAILED
        content.status_message = reason
        await self._aupdate_content(content)

    if not self.vector_db:
        log_error("No vector database configured")
        await _mark_failed("No vector database configured")
        return

    should_upsert = self.vector_db.upsert_available() and upsert
    if should_upsert:
        try:
            await self.vector_db.async_upsert(content.content_hash, read_documents, content.metadata)  # type: ignore[arg-type]
        except Exception as e:
            log_error(f"Error upserting document: {e}")
            await _mark_failed("Could not upsert embedding")
            return
    else:
        try:
            await self.vector_db.async_insert(
                content.content_hash,  # type: ignore[arg-type]
                documents=read_documents,
                filters=content.metadata,  # type: ignore[arg-type]
            )
        except Exception as e:
            log_error(f"Error inserting document: {e}")
            await _mark_failed("Could not insert embedding")
            return

    content.status = ContentStatus.COMPLETED
    await self._aupdate_content(content)
1703
+
1704
def _handle_vector_db_insert(self, content: Content, read_documents, upsert):
    """Write ``read_documents`` to the vector database and record the outcome.

    Upserts when ``upsert`` is requested and the vector database reports
    upsert support; otherwise inserts. On success the content is marked
    COMPLETED. On any failure (including a missing vector database) it is
    marked FAILED with a status message. The content record is persisted
    through ``_update_content`` in every case.
    """
    from agno.vectordb import VectorDb

    self.vector_db = cast(VectorDb, self.vector_db)

    def _mark_failed(reason: str) -> None:
        # Shared failure bookkeeping for all error exits below.
        content.status = ContentStatus.FAILED
        content.status_message = reason
        self._update_content(content)

    if not self.vector_db:
        log_error("No vector database configured")
        _mark_failed("No vector database configured")
        return

    should_upsert = self.vector_db.upsert_available() and upsert
    if should_upsert:
        try:
            self.vector_db.upsert(content.content_hash, read_documents, content.metadata)  # type: ignore[arg-type]
        except Exception as e:
            log_error(f"Error upserting document: {e}")
            _mark_failed("Could not upsert embedding")
            return
    else:
        try:
            self.vector_db.insert(
                content.content_hash,  # type: ignore[arg-type]
                documents=read_documents,
                filters=content.metadata,  # type: ignore[arg-type]
            )
        except Exception as e:
            log_error(f"Error inserting document: {e}")
            _mark_failed("Could not insert embedding")
            return

    content.status = ContentStatus.COMPLETED
    self._update_content(content)
1742
+
1743
def _load_content(
    self,
    content: Content,
    upsert: bool,
    skip_if_exists: bool,
    include: Optional[List[str]] = None,
    exclude: Optional[List[str]] = None,
) -> None:
    """Synchronously run every loader whose source is set on ``content``.

    The sources are checked in a fixed order: path, url, file_data, topics,
    remote_content. The checks are independent, so more than one loader may
    run for the same content. ``include`` and ``exclude`` are only passed to
    the path loader.
    """
    if content.path:
        self._load_from_path(content, upsert, skip_if_exists, include, exclude)

    # The remaining loaders share one signature, so walk them in order and
    # test each source attribute right before its loader would run.
    for source_attr, loader_name in (
        ("url", "_load_from_url"),
        ("file_data", "_load_from_content"),
        ("topics", "_load_from_topics"),
        ("remote_content", "_load_from_remote_content"),
    ):
        if getattr(content, source_attr):
            getattr(self, loader_name)(content, upsert, skip_if_exists)
1766
+
1767
+ async def _load_content_async(
1768
+ self,
1769
+ content: Content,
1770
+ upsert: bool,
1771
+ skip_if_exists: bool,
1772
+ include: Optional[List[str]] = None,
1773
+ exclude: Optional[List[str]] = None,
1774
+ ) -> None:
1775
+ if content.path:
1776
+ await self._load_from_path_async(content, upsert, skip_if_exists, include, exclude)
1777
+
1778
+ if content.url:
1779
+ await self._load_from_url_async(content, upsert, skip_if_exists)
1780
+
1781
+ if content.file_data:
1782
+ await self._load_from_content_async(content, upsert, skip_if_exists)
1783
+
1784
+ if content.topics:
1785
+ await self._load_from_topics_async(content, upsert, skip_if_exists)
1786
+
1787
+ if content.remote_content:
1788
+ await self._load_from_remote_content_async(content, upsert, skip_if_exists)
1789
+
1790
+ def _build_content_hash(self, content: Content) -> str:
1791
+ """
1792
+ Build the content hash from the content.
1793
+
1794
+ For URLs and paths, includes the name and description in the hash if provided
1795
+ to ensure unique content with the same URL/path but different names/descriptions
1796
+ get different hashes.
1797
+
1798
+ Hash format:
1799
+ - URL with name and description: hash("{name}:{description}:{url}")
1800
+ - URL with name only: hash("{name}:{url}")
1801
+ - URL with description only: hash("{description}:{url}")
1802
+ - URL without name/description: hash("{url}") (backward compatible)
1803
+ - Same logic applies to paths
1804
+ """
1805
+ hash_parts = []
1806
+ if content.name:
1807
+ hash_parts.append(content.name)
1808
+ if content.description:
1809
+ hash_parts.append(content.description)
1810
+
1811
+ if content.path:
1812
+ hash_parts.append(str(content.path))
1813
+ elif content.url:
1814
+ hash_parts.append(content.url)
1815
+ elif content.file_data and content.file_data.content:
1816
+ # For file_data, always add filename, type, size, or content for uniqueness
1817
+ if content.file_data.filename:
1818
+ hash_parts.append(content.file_data.filename)
1819
+ elif content.file_data.type:
1820
+ hash_parts.append(content.file_data.type)
1821
+ elif content.file_data.size is not None:
1822
+ hash_parts.append(str(content.file_data.size))
1823
+ else:
1824
+ # Fallback: use the content for uniqueness
1825
+ # Include type information to distinguish str vs bytes
1826
+ content_type = "str" if isinstance(content.file_data.content, str) else "bytes"
1827
+ content_bytes = (
1828
+ content.file_data.content.encode()
1829
+ if isinstance(content.file_data.content, str)
1830
+ else content.file_data.content
1831
+ )
1832
+ content_hash = hashlib.sha256(content_bytes).hexdigest()[:16] # Use first 16 chars
1833
+ hash_parts.append(f"{content_type}:{content_hash}")
1834
+ elif content.topics and len(content.topics) > 0:
1835
+ topic = content.topics[0]
1836
+ reader = type(content.reader).__name__ if content.reader else "unknown"
1837
+ hash_parts.append(f"{topic}-{reader}")
1838
+ else:
1839
+ # Fallback for edge cases
1840
+ import random
1841
+ import string
1842
+
1843
+ fallback = (
1844
+ content.name
1845
+ or content.id
1846
+ or ("unknown_content" + "".join(random.choices(string.ascii_lowercase + string.digits, k=6)))
1847
+ )
1848
+ hash_parts.append(fallback)
1849
+
1850
+ hash_input = ":".join(hash_parts)
1851
+ return hashlib.sha256(hash_input.encode()).hexdigest()
1852
+
1853
+ def _ensure_string_field(self, value: Any, field_name: str, default: str = "") -> str:
1854
+ """
1855
+ Safely ensure a field is a string, handling various edge cases.
1856
+
1857
+ Args:
1858
+ value: The value to convert to string
1859
+ field_name: Name of the field for logging purposes
1860
+ default: Default string value if conversion fails
1861
+
1862
+ Returns:
1863
+ str: A safe string value
1864
+ """
1865
+ # Handle None/falsy values
1866
+ if value is None or value == "":
1867
+ return default
1868
+
1869
+ # Handle unexpected list types (the root cause of our Pydantic warning)
1870
+ if isinstance(value, list):
1871
+ if len(value) == 0:
1872
+ log_debug(f"Empty list found for {field_name}, using default: '{default}'")
1873
+ return default
1874
+ elif len(value) == 1:
1875
+ # Single item list, extract the item
1876
+ log_debug(f"Single-item list found for {field_name}, extracting: '{value[0]}'")
1877
+ return str(value[0]) if value[0] is not None else default
1878
+ else:
1879
+ # Multiple items, join them
1880
+ log_debug(f"Multi-item list found for {field_name}, joining: {value}")
1881
+ return " | ".join(str(item) for item in value if item is not None)
1882
+
1883
+ # Handle other unexpected types
1884
+ if not isinstance(value, str):
1885
+ log_debug(f"Non-string type {type(value)} found for {field_name}, converting: '{value}'")
1886
+ try:
1887
+ return str(value)
1888
+ except Exception as e:
1889
+ log_warning(f"Failed to convert {field_name} to string: {e}, using default")
1890
+ return default
1891
+
1892
+ # Already a string, return as-is
1893
+ return value
1894
+
1895
async def _add_to_contents_db_async(self, content: Content):
    """Upsert a row describing ``content`` into the contents database.

    Does nothing when no contents database is configured. Works with both
    async and sync database implementations: the upsert is awaited only
    when the database is an ``AsyncBaseDb``.
    """
    if not self.contents_db:
        return

    # Missing timestamps default to "now" (each computed independently).
    created_at = content.created_at or int(time.time())
    updated_at = content.updated_at or int(time.time())

    # Prefer the explicit file type, then the type carried by the file data.
    if content.file_type:
        file_type = content.file_type
    elif content.file_data and content.file_data.type:
        file_type = content.file_data.type
    else:
        file_type = None

    # Safely handle string fields with proper type checking
    clean = self._ensure_string_field
    safe_name = clean(content.name, "content.name", default="")
    safe_description = clean(content.description, "content.description", default="")
    safe_linked_to = clean(self.name, "knowledge.name", default="")
    safe_status_message = clean(content.status_message, "content.status_message", default="")

    # Prefer the declared size, then the length of the in-memory payload.
    if content.size:
        size = content.size
    elif content.file_data and content.file_data.content:
        size = len(content.file_data.content)
    else:
        size = None

    content_row = KnowledgeRow(
        id=content.id,
        name=safe_name,
        description=safe_description,
        metadata=content.metadata,
        type=file_type,
        size=size,
        linked_to=safe_linked_to,
        access_count=0,
        status=content.status if content.status else ContentStatus.PROCESSING,
        status_message=safe_status_message,
        created_at=created_at,
        updated_at=updated_at,
    )

    if isinstance(self.contents_db, AsyncBaseDb):
        await self.contents_db.upsert_knowledge_content(knowledge_row=content_row)
    else:
        self.contents_db.upsert_knowledge_content(knowledge_row=content_row)
1937
+
1938
def _add_to_contents_db(self, content: Content):
    """Synchronously upsert a row describing ``content`` into the contents database.

    Does nothing when no contents database is configured.

    Raises:
        ValueError: If the configured contents database is an ``AsyncBaseDb``.
    """
    if not self.contents_db:
        return

    if isinstance(self.contents_db, AsyncBaseDb):
        raise ValueError(
            "_add_to_contents_db() is not supported with an async DB. Please use add_content_async with AsyncDb."
        )

    # Missing timestamps default to "now" (each computed independently).
    created_at = content.created_at or int(time.time())
    updated_at = content.updated_at or int(time.time())

    # Prefer the explicit file type, then the type carried by the file data.
    if content.file_type:
        file_type = content.file_type
    elif content.file_data and content.file_data.type:
        file_type = content.file_data.type
    else:
        file_type = None

    # Safely handle string fields with proper type checking
    clean = self._ensure_string_field
    safe_name = clean(content.name, "content.name", default="")
    safe_description = clean(content.description, "content.description", default="")
    safe_linked_to = clean(self.name, "knowledge.name", default="")
    safe_status_message = clean(content.status_message, "content.status_message", default="")

    # Prefer the declared size, then the length of the in-memory payload.
    if content.size:
        size = content.size
    elif content.file_data and content.file_data.content:
        size = len(content.file_data.content)
    else:
        size = None

    content_row = KnowledgeRow(
        id=content.id,
        name=safe_name,
        description=safe_description,
        metadata=content.metadata,
        type=file_type,
        size=size,
        linked_to=safe_linked_to,
        access_count=0,
        status=content.status if content.status else ContentStatus.PROCESSING,
        status_message=safe_status_message,
        created_at=created_at,
        updated_at=updated_at,
    )
    self.contents_db.upsert_knowledge_content(knowledge_row=content_row)
1983
+
1984
def _update_content(self, content: Content) -> Optional[Dict[str, Any]]:
    """Synchronously copy the set fields of ``content`` onto its stored row.

    Only fields that are not None on ``content`` overwrite the stored row;
    ``updated_at`` is always refreshed. When a vector database is configured,
    its metadata for this content id is updated as well.

    Returns:
        The updated row as a dict, or None when there is no contents database,
        the content has no id, or no stored row exists for that id.

    Raises:
        ValueError: If the configured contents database is an ``AsyncBaseDb``.
    """
    from agno.vectordb import VectorDb

    self.vector_db = cast(VectorDb, self.vector_db)

    if not self.contents_db:
        if self.name:
            log_warning(f"Contents DB not found for knowledge base: {self.name}")
        else:
            log_warning("Contents DB not found for knowledge base")
        return None

    if isinstance(self.contents_db, AsyncBaseDb):
        raise ValueError("update_content() is not supported with an async DB. Please use aupdate_content() instead.")

    if not content.id:
        log_warning("Content id is required to update Knowledge content")
        return None

    # TODO: we shouldn't check for content here, we should trust the upsert method to handle conflicts
    stored_row = self.contents_db.get_knowledge_content(content.id)
    if stored_row is None:
        log_warning(f"Content row not found for id: {content.id}, cannot update status")
        return None

    # Apply safe string handling for updates as well
    clean = self._ensure_string_field
    if content.name is not None:
        stored_row.name = clean(content.name, "content.name", default="")
    if content.description is not None:
        stored_row.description = clean(content.description, "content.description", default="")
    if content.metadata is not None:
        stored_row.metadata = content.metadata
    if content.status is not None:
        stored_row.status = content.status
    if content.status_message is not None:
        stored_row.status_message = clean(content.status_message, "content.status_message", default="")
    if content.external_id is not None:
        stored_row.external_id = clean(content.external_id, "content.external_id", default="")

    stored_row.updated_at = int(time.time())
    self.contents_db.upsert_knowledge_content(knowledge_row=stored_row)

    if self.vector_db:
        self.vector_db.update_metadata(content_id=content.id, metadata=content.metadata or {})

    return stored_row.to_dict()
2037
+
2038
async def _aupdate_content(self, content: Content) -> Optional[Dict[str, Any]]:
    """Asynchronously copy the set fields of ``content`` onto its stored row.

    Works with both async and sync contents databases: database calls are
    awaited only when the database is an ``AsyncBaseDb``. Only fields that are
    not None on ``content`` overwrite the stored row; ``updated_at`` is always
    refreshed. String fields go through ``_ensure_string_field``, matching the
    synchronous ``_update_content``. When a vector database is configured, its
    metadata for this content id is updated as well.

    Returns:
        The updated row as a dict, or None when there is no contents database,
        the content has no id, or no stored row exists for that id.
    """
    if not self.contents_db:
        if self.name:
            log_warning(f"Contents DB not found for knowledge base: {self.name}")
        else:
            log_warning("Contents DB not found for knowledge base")
        return None

    if not content.id:
        log_warning("Content id is required to update Knowledge content")
        return None

    is_async_db = isinstance(self.contents_db, AsyncBaseDb)

    # TODO: we shouldn't check for content here, we should trust the upsert method to handle conflicts
    if is_async_db:
        content_row = await self.contents_db.get_knowledge_content(content.id)
    else:
        content_row = self.contents_db.get_knowledge_content(content.id)
    if content_row is None:
        log_warning(f"Content row not found for id: {content.id}, cannot update status")
        return None

    # Apply the same safe string handling as the synchronous update path, so that a
    # non-string value (e.g. a list) is never written into a string column.
    if content.name is not None:
        content_row.name = self._ensure_string_field(content.name, "content.name", default="")
    if content.description is not None:
        content_row.description = self._ensure_string_field(
            content.description, "content.description", default=""
        )
    if content.metadata is not None:
        content_row.metadata = content.metadata
    if content.status is not None:
        content_row.status = content.status
    if content.status_message is not None:
        content_row.status_message = self._ensure_string_field(
            content.status_message, "content.status_message", default=""
        )
    if content.external_id is not None:
        content_row.external_id = self._ensure_string_field(
            content.external_id, "content.external_id", default=""
        )

    content_row.updated_at = int(time.time())
    if is_async_db:
        await self.contents_db.upsert_knowledge_content(knowledge_row=content_row)
    else:
        self.contents_db.upsert_knowledge_content(knowledge_row=content_row)

    if self.vector_db:
        self.vector_db.update_metadata(content_id=content.id, metadata=content.metadata or {})

    return content_row.to_dict()
2083
+
2084
async def _process_lightrag_content_async(self, content: Content, content_type: KnowledgeContentOrigin) -> None:
    """Upload ``content`` to a LightRAG-style vector database and persist the outcome.

    The content row is first written to the contents database. The payload is then
    sent through ``insert_file_bytes`` (PATH and CONTENT origins) or ``insert_text``
    (URL and TOPIC origins) when the configured vector database exposes that method.
    On success the value returned by the vector database is stored as
    ``content.external_id`` and the status becomes COMPLETED. Every handled failure
    path stores FAILED together with a status message, so the row is not left in
    its initial state when the upload cannot proceed.

    Exceptions raised while uploading PATH or URL content are caught and recorded
    as FAILED. Exceptions raised while uploading CONTENT or TOPIC content propagate
    to the caller. Any other ``content_type`` only results in the initial row write.

    Args:
        content: The content to upload; its status fields are updated in place.
        content_type: Origin of the content, which selects the upload strategy.
    """
    from agno.vectordb import VectorDb

    self.vector_db = cast(VectorDb, self.vector_db)

    async def _fail(message: str, status_message: Optional[str] = None, log=log_error) -> None:
        # Log the problem and persist FAILED with an explanation.
        log(message)
        content.status = ContentStatus.FAILED
        content.status_message = status_message or message
        await self._aupdate_content(content)

    async def _complete(external_id: Any) -> None:
        # Record the identifier returned by the vector database and persist COMPLETED.
        content.external_id = external_id
        content.status = ContentStatus.COMPLETED
        await self._aupdate_content(content)

    await self._add_to_contents_db_async(content)

    if content_type == KnowledgeContentOrigin.PATH:
        if content.file_data is None:
            log_warning("No file data provided")

        if content.path is None:
            await _fail("No path provided for content")
            return

        path = Path(content.path)

        log_info(f"Uploading file to LightRAG from path: {path}")
        try:
            # Read the file content from path
            with open(path, "rb") as f:
                file_content = f.read()

            # Get file type from extension or content.file_type
            file_type = content.file_type or path.suffix

            if not (self.vector_db and hasattr(self.vector_db, "insert_file_bytes")):
                await _fail("Vector database does not support file insertion")
                return

            result = await self.vector_db.insert_file_bytes(
                file_content=file_content,
                filename=path.name,  # Use the original filename with extension
                content_type=file_type,
                send_metadata=True,  # Enable metadata so server knows the file type
            )
            await _complete(result)
        except Exception as e:
            await _fail(f"Error uploading file to LightRAG: {e}", f"Could not upload to LightRAG: {str(e)}")
        return

    elif content_type == KnowledgeContentOrigin.URL:
        log_info(f"Uploading file to LightRAG from URL: {content.url}")
        try:
            reader = content.reader or self.website_reader
            if reader is None:
                await _fail("No URL reader available")
                return

            # Chunking is switched off on the reader; only the first returned
            # document's content is uploaded below.
            reader.chunk = False
            read_documents = reader.read(content.url, name=content.name)
            if not content.id:
                content.id = generate_id(content.content_hash or "")
            self._prepare_documents_for_insert(read_documents, content.id)

            if not read_documents:
                await _fail("No documents read from URL")
                return

            if not (self.vector_db and hasattr(self.vector_db, "insert_text")):
                await _fail("Vector database does not support text insertion")
                return

            result = await self.vector_db.insert_text(
                file_source=content.url,
                text=read_documents[0].content,
            )
            await _complete(result)
        except Exception as e:
            await _fail(f"Error uploading file to LightRAG: {e}", f"Could not upload to LightRAG: {str(e)}")
        return

    elif content_type == KnowledgeContentOrigin.CONTENT:
        filename = (
            content.file_data.filename if content.file_data and content.file_data.filename else "uploaded_file"
        )
        log_info(f"Uploading file to LightRAG: {filename}")

        # Use the content from file_data
        if not (content.file_data and content.file_data.content):
            await _fail(f"No file data available for LightRAG upload: {content.name}", log=log_warning)
            return

        if not (self.vector_db and hasattr(self.vector_db, "insert_file_bytes")):
            await _fail("Vector database does not support file insertion")
            return

        result = await self.vector_db.insert_file_bytes(
            file_content=content.file_data.content,
            filename=filename,
            content_type=content.file_data.type,
            send_metadata=True,  # Enable metadata so server knows the file type
        )
        await _complete(result)
        return

    elif content_type == KnowledgeContentOrigin.TOPIC:
        log_info(f"Uploading file to LightRAG: {content.name}")

        if content.reader is None:
            await _fail("No reader available for topic content")
            return

        if not content.topics:
            await _fail("No topics available for content")
            return

        read_documents = content.reader.read(content.topics)
        if len(read_documents) == 0:
            await _fail(f"No documents found for LightRAG upload: {content.name}", log=log_warning)
            return

        if not (self.vector_db and hasattr(self.vector_db, "insert_text")):
            await _fail("Vector database does not support text insertion")
            return

        result = await self.vector_db.insert_text(
            file_source=content.topics[0],
            text=read_documents[0].content,
        )
        await _complete(result)
        return
2240
+
2241
def _process_lightrag_content(self, content: Content, content_type: KnowledgeContentOrigin) -> None:
    """Synchronously upload ``content`` to a LightRAG-style vector database and persist the outcome.

    The content row is first written to the contents database. The payload is then
    sent through ``insert_file_bytes`` (PATH and CONTENT origins) or ``insert_text``
    (URL and TOPIC origins) when the configured vector database exposes that method.
    Those methods are coroutines, so each call is driven with ``asyncio.run()``;
    NOTE(review): ``asyncio.run()`` cannot be used from a thread that already has a
    running event loop, so this method is presumably meant for plain synchronous callers.

    On success the value returned by the vector database is stored as
    ``content.external_id`` and the status becomes COMPLETED. Every handled failure
    path stores FAILED together with a status message, so the row is not left in
    its initial state when the upload cannot proceed.

    Exceptions raised while uploading PATH or URL content are caught and recorded
    as FAILED. Exceptions raised while uploading CONTENT or TOPIC content propagate
    to the caller. Any other ``content_type`` only results in the initial row write.

    Args:
        content: The content to upload; its status fields are updated in place.
        content_type: Origin of the content, which selects the upload strategy.
    """
    from agno.vectordb import VectorDb

    self.vector_db = cast(VectorDb, self.vector_db)

    def _fail(message: str, status_message: Optional[str] = None, log=log_error) -> None:
        # Log the problem and persist FAILED with an explanation.
        log(message)
        content.status = ContentStatus.FAILED
        content.status_message = status_message or message
        self._update_content(content)

    def _complete(external_id: Any) -> None:
        # Record the identifier returned by the vector database and persist COMPLETED.
        content.external_id = external_id
        content.status = ContentStatus.COMPLETED
        self._update_content(content)

    self._add_to_contents_db(content)

    if content_type == KnowledgeContentOrigin.PATH:
        if content.file_data is None:
            log_warning("No file data provided")

        if content.path is None:
            _fail("No path provided for content")
            return

        path = Path(content.path)

        log_info(f"Uploading file to LightRAG from path: {path}")
        try:
            # Read the file content from path
            with open(path, "rb") as f:
                file_content = f.read()

            # Get file type from extension or content.file_type
            file_type = content.file_type or path.suffix

            if not (self.vector_db and hasattr(self.vector_db, "insert_file_bytes")):
                _fail("Vector database does not support file insertion")
                return

            # LightRAG only has async methods, use asyncio.run() here
            result = asyncio.run(
                self.vector_db.insert_file_bytes(
                    file_content=file_content,
                    filename=path.name,
                    content_type=file_type,
                    send_metadata=True,
                )
            )
            _complete(result)
        except Exception as e:
            _fail(f"Error uploading file to LightRAG: {e}", f"Could not upload to LightRAG: {str(e)}")
        return

    elif content_type == KnowledgeContentOrigin.URL:
        log_info(f"Uploading file to LightRAG from URL: {content.url}")
        try:
            reader = content.reader or self.website_reader
            if reader is None:
                _fail("No URL reader available")
                return

            # Chunking is switched off on the reader; only the first returned
            # document's content is uploaded below.
            reader.chunk = False
            read_documents = reader.read(content.url, name=content.name)
            if not content.id:
                content.id = generate_id(content.content_hash or "")
            self._prepare_documents_for_insert(read_documents, content.id)

            if not read_documents:
                _fail("No documents read from URL")
                return

            if not (self.vector_db and hasattr(self.vector_db, "insert_text")):
                _fail("Vector database does not support text insertion")
                return

            # LightRAG only has async methods, use asyncio.run() here
            result = asyncio.run(
                self.vector_db.insert_text(
                    file_source=content.url,
                    text=read_documents[0].content,
                )
            )
            _complete(result)
        except Exception as e:
            _fail(f"Error uploading file to LightRAG: {e}", f"Could not upload to LightRAG: {str(e)}")
        return

    elif content_type == KnowledgeContentOrigin.CONTENT:
        filename = (
            content.file_data.filename if content.file_data and content.file_data.filename else "uploaded_file"
        )
        log_info(f"Uploading file to LightRAG: {filename}")

        # Use the content from file_data
        if not (content.file_data and content.file_data.content):
            _fail(f"No file data available for LightRAG upload: {content.name}", log=log_warning)
            return

        if not (self.vector_db and hasattr(self.vector_db, "insert_file_bytes")):
            _fail("Vector database does not support file insertion")
            return

        # LightRAG only has async methods, use asyncio.run() here
        result = asyncio.run(
            self.vector_db.insert_file_bytes(
                file_content=content.file_data.content,
                filename=filename,
                content_type=content.file_data.type,
                send_metadata=True,
            )
        )
        _complete(result)
        return

    elif content_type == KnowledgeContentOrigin.TOPIC:
        log_info(f"Uploading file to LightRAG: {content.name}")

        if content.reader is None:
            _fail("No reader available for topic content")
            return

        if not content.topics:
            _fail("No topics available for content")
            return

        read_documents = content.reader.read(content.topics)
        if len(read_documents) == 0:
            _fail(f"No documents found for LightRAG upload: {content.name}", log=log_warning)
            return

        if not (self.vector_db and hasattr(self.vector_db, "insert_text")):
            _fail("Vector database does not support text insertion")
            return

        # LightRAG only has async methods, use asyncio.run() here
        result = asyncio.run(
            self.vector_db.insert_text(
                file_source=content.topics[0],
                text=read_documents[0].content,
            )
        )
        _complete(result)
        return
2409
+
2410
def search(
    self,
    query: str,
    max_results: Optional[int] = None,
    filters: Optional[Union[Dict[str, Any], List[FilterExpr]]] = None,
    search_type: Optional[str] = None,
) -> List[Document]:
    """Returns relevant documents matching a query.

    Args:
        query: The search query.
        max_results: Maximum number of documents to return; falls back to
            ``self.max_results`` when not given (or 0).
        filters: Filters forwarded to the vector database.
        search_type: Optional search type name. It is applied only when the vector
            database has a ``SearchType``-valued ``search_type`` attribute, and only
            for the duration of this call: the previous value is restored afterwards
            so one call's override does not leak into later searches.

    Returns:
        The matching documents, or an empty list when no vector database is
        configured or the search raises.

    Raises:
        ValueError: If ``search_type`` is applied and is not a valid ``SearchType`` value.
    """
    from agno.vectordb import VectorDb
    from agno.vectordb.search import SearchType

    self.vector_db = cast(VectorDb, self.vector_db)

    override_requested = bool(
        hasattr(self.vector_db, "search_type")
        and isinstance(self.vector_db.search_type, SearchType)
        and search_type
    )
    previous_search_type = self.vector_db.search_type if override_requested else None
    if override_requested:
        # Deliberately outside the try block: an unknown search type name raises to the caller.
        self.vector_db.search_type = SearchType(search_type)

    try:
        if self.vector_db is None:
            log_warning("No vector db provided")
            return []

        _max_results = max_results or self.max_results
        log_debug(f"Getting {_max_results} relevant documents for query: {query}")
        return self.vector_db.search(query=query, limit=_max_results, filters=filters)
    except Exception as e:
        log_error(f"Error searching for documents: {e}")
        return []
    finally:
        if override_requested:
            # Put the vector database back the way we found it.
            self.vector_db.search_type = previous_search_type
2440
+
2441
async def async_search(
    self,
    query: str,
    max_results: Optional[int] = None,
    filters: Optional[Union[Dict[str, Any], List[FilterExpr]]] = None,
    search_type: Optional[str] = None,
) -> List[Document]:
    """Returns relevant documents matching a query.

    Uses the vector database's ``async_search``; when that raises
    ``NotImplementedError`` the synchronous ``search`` is used instead.

    Args:
        query: The search query.
        max_results: Maximum number of documents to return; falls back to
            ``self.max_results`` when not given (or 0).
        filters: Filters forwarded to the vector database.
        search_type: Optional search type name. It is applied only when the vector
            database has a ``SearchType``-valued ``search_type`` attribute, and only
            for the duration of this call (including the synchronous fallback): the
            previous value is restored afterwards so one call's override does not
            leak into later searches.

    Returns:
        The matching documents, or an empty list when no vector database is
        configured or the search raises.

    Raises:
        ValueError: If ``search_type`` is applied and is not a valid ``SearchType`` value.
    """
    from agno.vectordb import VectorDb
    from agno.vectordb.search import SearchType

    self.vector_db = cast(VectorDb, self.vector_db)

    override_requested = bool(
        hasattr(self.vector_db, "search_type")
        and isinstance(self.vector_db.search_type, SearchType)
        and search_type
    )
    previous_search_type = self.vector_db.search_type if override_requested else None
    if override_requested:
        # Deliberately outside the try block: an unknown search type name raises to the caller.
        self.vector_db.search_type = SearchType(search_type)

    try:
        if self.vector_db is None:
            log_warning("No vector db provided")
            return []

        _max_results = max_results or self.max_results
        log_debug(f"Getting {_max_results} relevant documents for query: {query}")
        try:
            return await self.vector_db.async_search(query=query, limit=_max_results, filters=filters)
        except NotImplementedError:
            log_info("Vector db does not support async search")
            # The override (if any) is still in place here, so the fallback honours it.
            return self.search(query=query, max_results=_max_results, filters=filters)
    except Exception as e:
        log_error(f"Error searching for documents: {e}")
        return []
    finally:
        if override_requested:
            # Put the vector database back the way we found it.
            self.vector_db.search_type = previous_search_type
2474
+
2475
def get_valid_filters(self) -> Set[str]:
    """Return every metadata key used by the stored contents.

    Returns:
        The union of the metadata keys of all contents returned by
        ``get_content()``; an empty set (after a warning) when no contents
        database is configured.
    """
    if self.contents_db is None:
        log_warning("No contents db provided. This is required for filtering.")
        return set()

    stored_contents, _ = self.get_content()
    # Contents without metadata (None or empty) contribute nothing.
    return {key for item in stored_contents if item.metadata for key in item.metadata.keys()}
2486
+
2487
async def async_get_valid_filters(self) -> Set[str]:
    """Return every metadata key used by the stored contents (async variant).

    Returns:
        The union of the metadata keys of all contents returned by
        ``aget_content()``; an empty set (after a warning) when no contents
        database is configured.
    """
    if self.contents_db is None:
        log_warning("No contents db provided. This is required for filtering.")
        return set()

    stored_contents, _ = await self.aget_content()
    # Contents without metadata (None or empty) contribute nothing.
    return {key for item in stored_contents if item.metadata for key in item.metadata.keys()}
2498
+
2499
+ def _validate_filters(
2500
+ self, filters: Union[Dict[str, Any], List[FilterExpr]], valid_metadata_filters: Set[str]
2501
+ ) -> Tuple[Union[Dict[str, Any], List[FilterExpr]], List[str]]:
2502
+ if not filters:
2503
+ return {}, []
2504
+
2505
+ valid_filters: Union[Dict[str, Any], List[FilterExpr]] = {}
2506
+ invalid_keys = []
2507
+
2508
+ if isinstance(filters, dict):
2509
+ # If no metadata filters tracked yet, all keys are considered invalid
2510
+ if valid_metadata_filters is None or not valid_metadata_filters:
2511
+ invalid_keys = list(filters.keys())
2512
+ log_warning(
2513
+ f"No valid metadata filters tracked yet. All filter keys considered invalid: {invalid_keys}"
2514
+ )
2515
+ return {}, invalid_keys
2516
+
2517
+ for key, value in filters.items():
2518
+ # Handle both normal keys and prefixed keys like meta_data.key
2519
+ base_key = key.split(".")[-1] if "." in key else key
2520
+ if base_key in valid_metadata_filters or key in valid_metadata_filters:
2521
+ valid_filters[key] = value # type: ignore
2522
+ else:
2523
+ invalid_keys.append(key)
2524
+ log_warning(f"Invalid filter key: {key} - not present in knowledge base")
2525
+
2526
+ elif isinstance(filters, List):
2527
+ # Validate that list contains FilterExpr instances
2528
+ for i, filter_item in enumerate(filters):
2529
+ if not isinstance(filter_item, FilterExpr):
2530
+ log_warning(
2531
+ f"Invalid filter at index {i}: expected FilterExpr instance, "
2532
+ f"got {type(filter_item).__name__}. "
2533
+ f"Use filter expressions like EQ('key', 'value'), IN('key', [values]), "
2534
+ f"AND(...), OR(...), NOT(...) from agno.filters"
2535
+ )
2536
+ # Filter expressions are already validated, return empty dict/list
2537
+ # The actual filtering happens in the vector_db layer
2538
+ return filters, []
2539
+
2540
+ return valid_filters, invalid_keys
2541
+
2542
def validate_filters(
    self, filters: Union[Dict[str, Any], List[FilterExpr]]
) -> Tuple[Union[Dict[str, Any], List[FilterExpr]], List[str]]:
    """Return a tuple containing all valid filters and a list of invalid filter keys.

    The known metadata keys are taken from ``get_valid_filters()`` and the
    actual check is delegated to ``_validate_filters``.
    """
    known_keys = self.get_valid_filters()
    accepted, rejected = self._validate_filters(filters, known_keys)
    return accepted, rejected
2550
+
2551
async def async_validate_filters(
    self, filters: Union[Dict[str, Any], List[FilterExpr]]
) -> Tuple[Union[Dict[str, Any], List[FilterExpr]], List[str]]:
    """Return a tuple containing all valid filters and a list of invalid filter keys.

    The known metadata keys are taken from ``async_get_valid_filters()`` and
    the actual check is delegated to ``_validate_filters``.
    """
    known_keys = await self.async_get_valid_filters()
    accepted, rejected = self._validate_filters(filters, known_keys)
    return accepted, rejected
2560
+
2561
def remove_vector_by_id(self, id: str) -> bool:
    """Delete the vector with the given id from the vector database.

    Returns:
        The result of the vector database's ``delete_by_id``; False (after a
        warning) when no vector database is configured.
    """
    from agno.vectordb import VectorDb

    self.vector_db = cast(VectorDb, self.vector_db)
    if self.vector_db is not None:
        return self.vector_db.delete_by_id(id)

    log_warning("No vector DB provided")
    return False
2569
+
2570
def remove_vectors_by_name(self, name: str) -> bool:
    """Delete the vectors with the given name from the vector database.

    Returns:
        The result of the vector database's ``delete_by_name``; False (after a
        warning) when no vector database is configured.
    """
    from agno.vectordb import VectorDb

    self.vector_db = cast(VectorDb, self.vector_db)
    if self.vector_db is not None:
        return self.vector_db.delete_by_name(name)

    log_warning("No vector DB provided")
    return False
2578
+
2579
def remove_vectors_by_metadata(self, metadata: Dict[str, Any]) -> bool:
    """Delete the vectors matching the given metadata from the vector database.

    Returns:
        The result of the vector database's ``delete_by_metadata``; False
        (after a warning) when no vector database is configured.
    """
    from agno.vectordb import VectorDb

    self.vector_db = cast(VectorDb, self.vector_db)
    if self.vector_db is not None:
        return self.vector_db.delete_by_metadata(metadata)

    log_warning("No vector DB provided")
    return False
2587
+
2588
+ # --- API Only Methods ---
2589
+
2590
def patch_content(self, content: Content) -> Optional[Dict[str, Any]]:
    """Update stored content; a public entry point that delegates to ``_update_content``."""
    updated_row = self._update_content(content)
    return updated_row
2592
+
2593
async def apatch_content(self, content: Content) -> Optional[Dict[str, Any]]:
    """Update stored content; a public async entry point that delegates to ``_aupdate_content``."""
    updated_row = await self._aupdate_content(content)
    return updated_row
2595
+
2596
def get_content_by_id(self, content_id: str) -> Optional[Content]:
    """Fetch one content row from the contents DB and wrap it in a ``Content``.

    Args:
        content_id: Identifier passed to ``contents_db.get_knowledge_content()``.

    Returns:
        The matching ``Content``, or ``None`` when the DB returns no row.

    Raises:
        ValueError: If no contents DB is configured, or if the contents DB is
            an ``AsyncBaseDb`` (use ``aget_content_by_id()`` in that case).
    """
    db = self.contents_db
    if db is None:
        raise ValueError("No contents db provided")
    if isinstance(db, AsyncBaseDb):
        raise ValueError(
            "get_content_by_id() is not supported for async databases. Please use aget_content_by_id() instead."
        )

    row = db.get_knowledge_content(content_id)
    if row is None:
        return None

    return Content(
        id=row.id,
        name=row.name,
        description=row.description,
        metadata=row.metadata,
        file_type=row.type,
        size=row.size,
        # An empty/missing status stays None instead of being parsed
        status=None if not row.status else ContentStatus(row.status),
        status_message=row.status_message,
        created_at=row.created_at,
        # Rows that were never updated report their creation time
        updated_at=row.updated_at or row.created_at,
        external_id=row.external_id,
    )
2623
+
2624
async def aget_content_by_id(self, content_id: str) -> Optional[Content]:
    """Fetch one content row and wrap it in a ``Content``, for sync or async DBs.

    The lookup is awaited when the contents DB is an ``AsyncBaseDb`` and called
    directly otherwise.

    Args:
        content_id: Identifier passed to ``contents_db.get_knowledge_content()``.

    Returns:
        The matching ``Content``, or ``None`` when the DB returns no row.

    Raises:
        ValueError: If no contents DB is configured.
    """
    db = self.contents_db
    if db is None:
        raise ValueError("No contents db provided")

    if isinstance(db, AsyncBaseDb):
        row = await db.get_knowledge_content(content_id)
    else:
        row = db.get_knowledge_content(content_id)

    if row is None:
        return None

    content_fields = {
        "id": row.id,
        "name": row.name,
        "description": row.description,
        "metadata": row.metadata,
        "file_type": row.type,
        "size": row.size,
        # An empty/missing status stays None instead of being parsed
        "status": ContentStatus(row.status) if row.status else None,
        "status_message": row.status_message,
        "created_at": row.created_at,
        # Rows that were never updated report their creation time
        "updated_at": row.updated_at or row.created_at,
        "external_id": row.external_id,
    }
    return Content(**content_fields)
2649
+
2650
def get_content(
    self,
    limit: Optional[int] = None,
    page: Optional[int] = None,
    sort_by: Optional[str] = None,
    sort_order: Optional[str] = None,
) -> Tuple[List[Content], int]:
    """List content entries from the contents DB as ``Content`` objects.

    All arguments are forwarded unchanged to
    ``contents_db.get_knowledge_contents()``.

    Returns:
        A ``(contents, count)`` tuple, where ``count`` is the second value
        returned by ``get_knowledge_contents()``.

    Raises:
        ValueError: If no contents DB is configured, or if the contents DB is
            an ``AsyncBaseDb`` (use ``aget_content()`` in that case).
    """
    db = self.contents_db
    if db is None:
        raise ValueError("No contents db provided")
    if isinstance(db, AsyncBaseDb):
        raise ValueError("get_content() is not supported for async databases. Please use aget_content() instead.")

    rows, total = db.get_knowledge_contents(limit=limit, page=page, sort_by=sort_by, sort_order=sort_order)

    # Each database row becomes one Content object
    items = [
        Content(
            id=row.id,
            name=row.name,
            description=row.description,
            metadata=row.metadata,
            size=row.size,
            file_type=row.type,
            status=None if not row.status else ContentStatus(row.status),
            status_message=row.status_message,
            created_at=row.created_at,
            # Rows that were never updated report their creation time
            updated_at=row.updated_at or row.created_at,
            external_id=row.external_id,
        )
        for row in rows
    ]
    return items, total
2685
+
2686
async def aget_content(
    self,
    limit: Optional[int] = None,
    page: Optional[int] = None,
    sort_by: Optional[str] = None,
    sort_order: Optional[str] = None,
) -> Tuple[List[Content], int]:
    """List content entries as ``Content`` objects, for sync or async DBs.

    All arguments are forwarded unchanged to
    ``contents_db.get_knowledge_contents()``, which is awaited when the
    contents DB is an ``AsyncBaseDb`` and called directly otherwise.

    Returns:
        A ``(contents, count)`` tuple, where ``count`` is the second value
        returned by ``get_knowledge_contents()``.

    Raises:
        ValueError: If no contents DB is configured.
    """
    db = self.contents_db
    if db is None:
        raise ValueError("No contents db provided")

    if isinstance(db, AsyncBaseDb):
        rows, total = await db.get_knowledge_contents(limit=limit, page=page, sort_by=sort_by, sort_order=sort_order)
    else:
        rows, total = db.get_knowledge_contents(limit=limit, page=page, sort_by=sort_by, sort_order=sort_order)

    def _as_content(row):
        # Create Content from database row
        return Content(
            id=row.id,
            name=row.name,
            description=row.description,
            metadata=row.metadata,
            size=row.size,
            file_type=row.type,
            status=ContentStatus(row.status) if row.status else None,
            status_message=row.status_message,
            created_at=row.created_at,
            # Rows that were never updated report their creation time
            updated_at=row.updated_at or row.created_at,
            external_id=row.external_id,
        )

    return [_as_content(row) for row in rows], total
2723
+
2724
def get_content_status(self, content_id: str) -> Tuple[Optional[ContentStatus], Optional[str]]:
    """Look up the processing status of one content entry.

    Args:
        content_id: Identifier passed to ``contents_db.get_knowledge_content()``.

    Returns:
        ``(status, status_message)``. When the DB returns no row the result is
        ``(None, "Content not found")``. An empty status maps to ``PROCESSING``.
        A status string that is not a valid ``ContentStatus`` value maps to
        ``FAILED`` or ``COMPLETED`` when it contains that word, and to
        ``PROCESSING`` otherwise.

    Raises:
        ValueError: If no contents DB is configured, or if the contents DB is
            an ``AsyncBaseDb`` (use ``aget_content_status()`` in that case).
    """
    db = self.contents_db
    if db is None:
        raise ValueError("No contents db provided")
    if isinstance(db, AsyncBaseDb):
        raise ValueError(
            "get_content_status() is not supported for async databases. Please use aget_content_status() instead."
        )

    row = db.get_knowledge_content(content_id)
    if row is None:
        return None, "Content not found"

    raw_status = row.status
    if not raw_status:
        # No status recorded: default to PROCESSING
        status = ContentStatus.PROCESSING
    else:
        try:
            status = ContentStatus(raw_status.lower())
        except ValueError:
            # Handle legacy or unknown statuses by keyword
            lowered = raw_status.lower()
            if "failed" in lowered:
                status = ContentStatus.FAILED
            elif "completed" in lowered:
                status = ContentStatus.COMPLETED
            else:
                status = ContentStatus.PROCESSING

    return status, row.status_message
2751
+
2752
async def aget_content_status(self, content_id: str) -> Tuple[Optional[ContentStatus], Optional[str]]:
    """Look up the processing status of one content entry, for sync or async DBs.

    The row lookup is awaited when the contents DB is an ``AsyncBaseDb`` and
    called directly otherwise.

    Args:
        content_id: Identifier passed to ``contents_db.get_knowledge_content()``.

    Returns:
        ``(status, status_message)``. When the DB returns no row the result is
        ``(None, "Content not found")``. An empty status maps to ``PROCESSING``.
        A status string that is not a valid ``ContentStatus`` value maps to
        ``FAILED`` or ``COMPLETED`` when it contains that word, and to
        ``PROCESSING`` otherwise.

    Raises:
        ValueError: If no contents DB is configured.
    """
    db = self.contents_db
    if db is None:
        raise ValueError("No contents db provided")

    if isinstance(db, AsyncBaseDb):
        row = await db.get_knowledge_content(content_id)
    else:
        row = db.get_knowledge_content(content_id)

    if row is None:
        return None, "Content not found"

    raw_status = row.status
    message = row.status_message

    if not raw_status:
        # No status recorded: default to PROCESSING
        return ContentStatus.PROCESSING, message

    try:
        return ContentStatus(raw_status.lower()), message
    except ValueError:
        # Handle legacy or unknown statuses by keyword
        pass

    lowered = raw_status.lower()
    if "failed" in lowered:
        return ContentStatus.FAILED, message
    if "completed" in lowered:
        return ContentStatus.COMPLETED, message
    return ContentStatus.PROCESSING, message
2778
+
2779
def remove_content_by_id(self, content_id: str):
    """Remove one content entry from the vector DB and from the contents DB.

    For a vector DB whose class is named ``LightRag`` the content is looked up
    first and deleted by its ``external_id``; any other vector DB is asked to
    delete by ``content_id``. Afterwards the row is deleted from the contents
    DB, if one is configured.

    Args:
        content_id: Identifier of the content to remove.

    Raises:
        ValueError: If the contents DB is an ``AsyncBaseDb`` (use
            ``aremove_content_by_id()`` in that case). Errors raised by
            ``get_content_by_id()`` on the LightRag path also propagate.
    """
    from agno.vectordb import VectorDb

    # Fail before touching the vector DB so nothing is partially deleted. With an
    # async contents DB, delete_knowledge_content() must be awaited; calling it here
    # would leave the row in place while the vectors are already gone.
    if isinstance(self.contents_db, AsyncBaseDb):
        raise ValueError(
            "remove_content_by_id() is not supported for async databases. Please use aremove_content_by_id() instead."
        )

    # cast() only informs the type checker; the stored value is unchanged.
    self.vector_db = cast(VectorDb, self.vector_db)
    if self.vector_db is not None:
        if self.vector_db.__class__.__name__ == "LightRag":
            # For LightRAG, get the content first to find the external_id
            content = self.get_content_by_id(content_id)
            if content and content.external_id:
                self.vector_db.delete_by_external_id(content.external_id)  # type: ignore
            else:
                log_warning(f"No external_id found for content {content_id}, cannot delete from LightRAG")
        else:
            self.vector_db.delete_by_content_id(content_id)

    if self.contents_db is not None:
        self.contents_db.delete_knowledge_content(content_id)
2796
+
2797
async def aremove_content_by_id(self, content_id: str):
    """Remove one content entry from the vector DB and the contents DB (async).

    For a vector DB whose class is named ``LightRag`` the content is looked up
    first and deleted by its ``external_id``; any other vector DB is asked to
    delete by ``content_id``. The contents-DB deletion is awaited when the
    contents DB is an ``AsyncBaseDb`` and called directly otherwise.

    Args:
        content_id: Identifier of the content to remove.
    """
    vector_store = self.vector_db
    if vector_store is not None:
        if vector_store.__class__.__name__ != "LightRag":
            vector_store.delete_by_content_id(content_id)
        else:
            # For LightRAG, get the content first to find the external_id
            content = await self.aget_content_by_id(content_id)
            if not (content and content.external_id):
                log_warning(f"No external_id found for content {content_id}, cannot delete from LightRAG")
            else:
                vector_store.delete_by_external_id(content.external_id)  # type: ignore

    db = self.contents_db
    if db is None:
        return

    if isinstance(db, AsyncBaseDb):
        await db.delete_knowledge_content(content_id)
    else:
        db.delete_knowledge_content(content_id)
2814
+
2815
def remove_all_content(self):
    """Remove every content entry returned by ``get_content()``.

    Entries whose ``id`` is ``None`` are skipped; each remaining id is passed
    to ``remove_content_by_id()``.
    """
    contents, _ = self.get_content()
    for content_id in (entry.id for entry in contents if entry.id is not None):
        self.remove_content_by_id(content_id)
2820
+
2821
async def aremove_all_content(self):
    """Remove every content entry returned by ``aget_content()``.

    Entries whose ``id`` is ``None`` are skipped; each remaining id is passed
    to ``aremove_content_by_id()``, one after another.
    """
    contents, _ = await self.aget_content()
    for entry in contents:
        if entry.id is None:
            continue
        await self.aremove_content_by_id(entry.id)
2826
+
2827
+ # --- Reader Factory Integration ---
2828
+
2829
def construct_readers(self):
    """Make sure ``self.readers`` exists so readers can be added on demand."""
    if self.readers is not None:
        return
    # Start empty; individual readers are created when first requested
    self.readers = {}
2834
+
2835
def add_reader(self, reader: Reader):
    """Register a custom reader under the key from ``_generate_reader_key()``.

    An existing reader stored under the same key is replaced.

    Returns:
        The reader that was passed in.
    """
    if self.readers is None:
        self.readers = {}

    # The key is derived from the reader itself
    key = self._generate_reader_key(reader)
    self.readers[key] = reader
    return reader
2844
+
2845
def get_readers(self) -> Dict[str, Reader]:
    """Return ``self.readers`` as a dict, normalizing it first if necessary.

    ``None`` becomes an empty dict. A list is converted to a dict keyed by
    ``_generate_reader_key()``; non-``Reader`` items are dropped and clashing
    keys get a ``_<n>`` suffix. Any other non-dict value is reset to an empty
    dict.
    """
    if self.readers is None:
        self.readers = {}
        return self.readers

    if isinstance(self.readers, dict):
        return self.readers

    if not isinstance(self.readers, list):
        # For any other unexpected type, reset to empty dict
        self.readers = {}
        return self.readers

    # Defensive path: readers was set to a list, so key each reader
    keyed: Dict[str, Reader] = {}
    for candidate in self.readers:
        if not isinstance(candidate, Reader):
            continue
        base_key = self._generate_reader_key(candidate)
        # Handle potential duplicate keys by appending index if needed
        key = base_key
        suffix = 1
        while key in keyed:
            key = f"{base_key}_{suffix}"
            suffix += 1
        keyed[key] = candidate

    self.readers = keyed
    return self.readers
2869
+
2870
+ def _generate_reader_key(self, reader: Reader) -> str:
2871
+ """Generate a key for a reader instance."""
2872
+ if reader.name:
2873
+ return f"{reader.name.lower().replace(' ', '_')}"
2874
+ else:
2875
+ return f"{reader.__class__.__name__.lower().replace(' ', '_')}"
2876
+
2877
def _select_reader(self, extension: str) -> Reader:
    """Return the reader ``ReaderFactory`` provides for ``extension``.

    The requested extension is logged before the factory is queried.
    """
    log_info(f"Selecting reader for extension: {extension}")
    chosen_reader = ReaderFactory.get_reader_for_extension(extension)
    return chosen_reader
2881
+
2882
+ # --- Convenience Properties for Backward Compatibility ---
2883
+
2884
+ def _is_text_mime_type(self, mime_type: str) -> bool:
2885
+ """
2886
+ Check if a MIME type represents text content that can be safely encoded as UTF-8.
2887
+
2888
+ Args:
2889
+ mime_type: The MIME type to check
2890
+
2891
+ Returns:
2892
+ bool: True if it's a text type, False if binary
2893
+ """
2894
+ if not mime_type:
2895
+ return False
2896
+
2897
+ text_types = [
2898
+ "text/",
2899
+ "application/json",
2900
+ "application/xml",
2901
+ "application/javascript",
2902
+ "application/csv",
2903
+ "application/sql",
2904
+ ]
2905
+
2906
+ return any(mime_type.startswith(t) for t in text_types)
2907
+
2908
+ def _should_include_file(self, file_path: str, include: Optional[List[str]], exclude: Optional[List[str]]) -> bool:
2909
+ """
2910
+ Determine if a file should be included based on include/exclude patterns.
2911
+
2912
+ Logic:
2913
+ 1. If include is specified, file must match at least one include pattern
2914
+ 2. If exclude is specified, file must not match any exclude pattern
2915
+ 3. If neither specified, include all files
2916
+
2917
+ Args:
2918
+ file_path: Path to the file to check
2919
+ include: Optional list of include patterns (glob-style)
2920
+ exclude: Optional list of exclude patterns (glob-style)
2921
+
2922
+ Returns:
2923
+ bool: True if file should be included, False otherwise
2924
+ """
2925
+ import fnmatch
2926
+
2927
+ # If include patterns specified, file must match at least one
2928
+ if include:
2929
+ if not any(fnmatch.fnmatch(file_path, pattern) for pattern in include):
2930
+ return False
2931
+
2932
+ # If exclude patterns specified, file must not match any
2933
+ if exclude:
2934
+ if any(fnmatch.fnmatch(file_path, pattern) for pattern in exclude):
2935
+ return False
2936
+
2937
+ return True
2938
+
2939
+ def _get_reader(self, reader_type: str) -> Optional[Reader]:
2940
+ """Get a cached reader or create it if not cached, handling missing dependencies gracefully."""
2941
+ if self.readers is None:
2942
+ self.readers = {}
2943
+
2944
+ if reader_type not in self.readers:
2945
+ try:
2946
+ reader = ReaderFactory.create_reader(reader_type)
2947
+ if reader:
2948
+ self.readers[reader_type] = reader
2949
+ else:
2950
+ return None
2951
+
2952
+ except Exception as e:
2953
+ log_warning(f"Cannot create {reader_type} reader {e}")
2954
+ return None
2955
+
2956
+ return self.readers.get(reader_type)
2957
+
2958
@property
def pdf_reader(self) -> Optional[Reader]:
    """Reader registered under the 'pdf' type, obtained via ``_get_reader()``."""
    reader_type = "pdf"
    return self._get_reader(reader_type)
2962
+
2963
@property
def csv_reader(self) -> Optional[Reader]:
    """Reader registered under the 'csv' type, obtained via ``_get_reader()``."""
    reader_type = "csv"
    return self._get_reader(reader_type)
2967
+
2968
@property
def docx_reader(self) -> Optional[Reader]:
    """Reader registered under the 'docx' type, obtained via ``_get_reader()``."""
    reader_type = "docx"
    return self._get_reader(reader_type)
2972
+
2973
@property
def pptx_reader(self) -> Optional[Reader]:
    """Reader registered under the 'pptx' type, obtained via ``_get_reader()``."""
    reader_type = "pptx"
    return self._get_reader(reader_type)
2977
+
2978
@property
def json_reader(self) -> Optional[Reader]:
    """Reader registered under the 'json' type, obtained via ``_get_reader()``."""
    reader_type = "json"
    return self._get_reader(reader_type)
2982
+
2983
@property
def markdown_reader(self) -> Optional[Reader]:
    """Reader registered under the 'markdown' type, obtained via ``_get_reader()``."""
    reader_type = "markdown"
    return self._get_reader(reader_type)
2987
+
2988
@property
def text_reader(self) -> Optional[Reader]:
    """Reader registered under the 'text' type, obtained via ``_get_reader()``."""
    reader_type = "text"
    return self._get_reader(reader_type)
2992
+
2993
@property
def website_reader(self) -> Optional[Reader]:
    """Reader registered under the 'website' type, obtained via ``_get_reader()``."""
    reader_type = "website"
    return self._get_reader(reader_type)
2997
+
2998
@property
def firecrawl_reader(self) -> Optional[Reader]:
    """Reader registered under the 'firecrawl' type, obtained via ``_get_reader()``."""
    reader_type = "firecrawl"
    return self._get_reader(reader_type)
3002
+
3003
@property
def youtube_reader(self) -> Optional[Reader]:
    """Reader registered under the 'youtube' type, obtained via ``_get_reader()``."""
    reader_type = "youtube"
    return self._get_reader(reader_type)