agno 1.8.2__py3-none-any.whl → 2.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (589)
  1. agno/agent/__init__.py +19 -27
  2. agno/agent/agent.py +3143 -4170
  3. agno/api/agent.py +11 -67
  4. agno/api/api.py +5 -46
  5. agno/api/evals.py +8 -19
  6. agno/api/os.py +17 -0
  7. agno/api/routes.py +6 -41
  8. agno/api/schemas/__init__.py +9 -0
  9. agno/api/schemas/agent.py +5 -21
  10. agno/api/schemas/evals.py +7 -16
  11. agno/api/schemas/os.py +14 -0
  12. agno/api/schemas/team.py +5 -21
  13. agno/api/schemas/utils.py +21 -0
  14. agno/api/schemas/workflows.py +11 -7
  15. agno/api/settings.py +53 -0
  16. agno/api/team.py +11 -66
  17. agno/api/workflow.py +28 -0
  18. agno/cloud/aws/base.py +214 -0
  19. agno/cloud/aws/s3/__init__.py +2 -0
  20. agno/cloud/aws/s3/api_client.py +43 -0
  21. agno/cloud/aws/s3/bucket.py +195 -0
  22. agno/cloud/aws/s3/object.py +57 -0
  23. agno/db/__init__.py +24 -0
  24. agno/db/base.py +245 -0
  25. agno/db/dynamo/__init__.py +3 -0
  26. agno/db/dynamo/dynamo.py +1743 -0
  27. agno/db/dynamo/schemas.py +278 -0
  28. agno/db/dynamo/utils.py +684 -0
  29. agno/db/firestore/__init__.py +3 -0
  30. agno/db/firestore/firestore.py +1432 -0
  31. agno/db/firestore/schemas.py +130 -0
  32. agno/db/firestore/utils.py +278 -0
  33. agno/db/gcs_json/__init__.py +3 -0
  34. agno/db/gcs_json/gcs_json_db.py +1001 -0
  35. agno/db/gcs_json/utils.py +194 -0
  36. agno/db/in_memory/__init__.py +3 -0
  37. agno/db/in_memory/in_memory_db.py +882 -0
  38. agno/db/in_memory/utils.py +172 -0
  39. agno/db/json/__init__.py +3 -0
  40. agno/db/json/json_db.py +1045 -0
  41. agno/db/json/utils.py +196 -0
  42. agno/db/migrations/v1_to_v2.py +162 -0
  43. agno/db/mongo/__init__.py +3 -0
  44. agno/db/mongo/mongo.py +1416 -0
  45. agno/db/mongo/schemas.py +77 -0
  46. agno/db/mongo/utils.py +204 -0
  47. agno/db/mysql/__init__.py +3 -0
  48. agno/db/mysql/mysql.py +1719 -0
  49. agno/db/mysql/schemas.py +124 -0
  50. agno/db/mysql/utils.py +297 -0
  51. agno/db/postgres/__init__.py +3 -0
  52. agno/db/postgres/postgres.py +1710 -0
  53. agno/db/postgres/schemas.py +124 -0
  54. agno/db/postgres/utils.py +280 -0
  55. agno/db/redis/__init__.py +3 -0
  56. agno/db/redis/redis.py +1367 -0
  57. agno/db/redis/schemas.py +109 -0
  58. agno/db/redis/utils.py +288 -0
  59. agno/db/schemas/__init__.py +3 -0
  60. agno/db/schemas/evals.py +33 -0
  61. agno/db/schemas/knowledge.py +40 -0
  62. agno/db/schemas/memory.py +46 -0
  63. agno/db/singlestore/__init__.py +3 -0
  64. agno/db/singlestore/schemas.py +116 -0
  65. agno/db/singlestore/singlestore.py +1712 -0
  66. agno/db/singlestore/utils.py +326 -0
  67. agno/db/sqlite/__init__.py +3 -0
  68. agno/db/sqlite/schemas.py +119 -0
  69. agno/db/sqlite/sqlite.py +1676 -0
  70. agno/db/sqlite/utils.py +268 -0
  71. agno/db/utils.py +88 -0
  72. agno/eval/__init__.py +14 -0
  73. agno/eval/accuracy.py +154 -48
  74. agno/eval/performance.py +88 -23
  75. agno/eval/reliability.py +73 -20
  76. agno/eval/utils.py +23 -13
  77. agno/integrations/discord/__init__.py +3 -0
  78. agno/{app → integrations}/discord/client.py +10 -10
  79. agno/knowledge/__init__.py +2 -2
  80. agno/{document → knowledge}/chunking/agentic.py +2 -2
  81. agno/{document → knowledge}/chunking/document.py +2 -2
  82. agno/{document → knowledge}/chunking/fixed.py +3 -3
  83. agno/{document → knowledge}/chunking/markdown.py +2 -2
  84. agno/{document → knowledge}/chunking/recursive.py +2 -2
  85. agno/{document → knowledge}/chunking/row.py +2 -2
  86. agno/knowledge/chunking/semantic.py +59 -0
  87. agno/knowledge/chunking/strategy.py +121 -0
  88. agno/knowledge/content.py +74 -0
  89. agno/knowledge/document/__init__.py +5 -0
  90. agno/{document → knowledge/document}/base.py +12 -2
  91. agno/knowledge/embedder/__init__.py +5 -0
  92. agno/{embedder → knowledge/embedder}/aws_bedrock.py +127 -1
  93. agno/{embedder → knowledge/embedder}/azure_openai.py +65 -1
  94. agno/{embedder → knowledge/embedder}/base.py +6 -0
  95. agno/{embedder → knowledge/embedder}/cohere.py +72 -1
  96. agno/{embedder → knowledge/embedder}/fastembed.py +17 -1
  97. agno/{embedder → knowledge/embedder}/fireworks.py +1 -1
  98. agno/{embedder → knowledge/embedder}/google.py +74 -1
  99. agno/{embedder → knowledge/embedder}/huggingface.py +36 -2
  100. agno/{embedder → knowledge/embedder}/jina.py +48 -2
  101. agno/knowledge/embedder/langdb.py +22 -0
  102. agno/knowledge/embedder/mistral.py +139 -0
  103. agno/{embedder → knowledge/embedder}/nebius.py +1 -1
  104. agno/{embedder → knowledge/embedder}/ollama.py +54 -3
  105. agno/knowledge/embedder/openai.py +223 -0
  106. agno/{embedder → knowledge/embedder}/sentence_transformer.py +16 -1
  107. agno/{embedder → knowledge/embedder}/together.py +1 -1
  108. agno/{embedder → knowledge/embedder}/voyageai.py +49 -1
  109. agno/knowledge/knowledge.py +1551 -0
  110. agno/knowledge/reader/__init__.py +7 -0
  111. agno/{document → knowledge}/reader/arxiv_reader.py +32 -4
  112. agno/knowledge/reader/base.py +88 -0
  113. agno/{document → knowledge}/reader/csv_reader.py +47 -65
  114. agno/knowledge/reader/docx_reader.py +83 -0
  115. agno/{document → knowledge}/reader/firecrawl_reader.py +42 -21
  116. agno/{document → knowledge}/reader/json_reader.py +30 -9
  117. agno/{document → knowledge}/reader/markdown_reader.py +58 -9
  118. agno/{document → knowledge}/reader/pdf_reader.py +71 -126
  119. agno/knowledge/reader/reader_factory.py +268 -0
  120. agno/knowledge/reader/s3_reader.py +101 -0
  121. agno/{document → knowledge}/reader/text_reader.py +31 -10
  122. agno/knowledge/reader/url_reader.py +128 -0
  123. agno/knowledge/reader/web_search_reader.py +366 -0
  124. agno/{document → knowledge}/reader/website_reader.py +37 -10
  125. agno/knowledge/reader/wikipedia_reader.py +59 -0
  126. agno/knowledge/reader/youtube_reader.py +78 -0
  127. agno/knowledge/remote_content/remote_content.py +88 -0
  128. agno/{reranker → knowledge/reranker}/base.py +1 -1
  129. agno/{reranker → knowledge/reranker}/cohere.py +2 -2
  130. agno/{reranker → knowledge/reranker}/infinity.py +2 -2
  131. agno/{reranker → knowledge/reranker}/sentence_transformer.py +2 -2
  132. agno/knowledge/types.py +30 -0
  133. agno/knowledge/utils.py +169 -0
  134. agno/media.py +269 -268
  135. agno/memory/__init__.py +2 -10
  136. agno/memory/manager.py +1003 -148
  137. agno/models/aimlapi/__init__.py +2 -2
  138. agno/models/aimlapi/aimlapi.py +6 -6
  139. agno/models/anthropic/claude.py +128 -72
  140. agno/models/aws/bedrock.py +107 -175
  141. agno/models/aws/claude.py +64 -18
  142. agno/models/azure/ai_foundry.py +73 -23
  143. agno/models/base.py +346 -290
  144. agno/models/cerebras/cerebras.py +84 -27
  145. agno/models/cohere/chat.py +106 -98
  146. agno/models/google/gemini.py +105 -46
  147. agno/models/groq/groq.py +97 -35
  148. agno/models/huggingface/huggingface.py +92 -27
  149. agno/models/ibm/watsonx.py +72 -13
  150. agno/models/litellm/chat.py +85 -13
  151. agno/models/message.py +46 -151
  152. agno/models/meta/llama.py +85 -49
  153. agno/models/metrics.py +120 -0
  154. agno/models/mistral/mistral.py +90 -21
  155. agno/models/ollama/__init__.py +0 -2
  156. agno/models/ollama/chat.py +85 -47
  157. agno/models/openai/chat.py +154 -37
  158. agno/models/openai/responses.py +178 -105
  159. agno/models/perplexity/perplexity.py +26 -2
  160. agno/models/portkey/portkey.py +0 -7
  161. agno/models/response.py +15 -9
  162. agno/models/utils.py +20 -0
  163. agno/models/vercel/__init__.py +2 -2
  164. agno/models/vercel/v0.py +1 -1
  165. agno/models/vllm/__init__.py +2 -2
  166. agno/models/vllm/vllm.py +3 -3
  167. agno/models/xai/xai.py +10 -10
  168. agno/os/__init__.py +3 -0
  169. agno/os/app.py +497 -0
  170. agno/os/auth.py +47 -0
  171. agno/os/config.py +103 -0
  172. agno/os/interfaces/agui/__init__.py +3 -0
  173. agno/os/interfaces/agui/agui.py +31 -0
  174. agno/{app/agui/async_router.py → os/interfaces/agui/router.py} +16 -16
  175. agno/{app → os/interfaces}/agui/utils.py +65 -28
  176. agno/os/interfaces/base.py +21 -0
  177. agno/os/interfaces/slack/__init__.py +3 -0
  178. agno/{app/slack/async_router.py → os/interfaces/slack/router.py} +3 -5
  179. agno/os/interfaces/slack/slack.py +32 -0
  180. agno/os/interfaces/whatsapp/__init__.py +3 -0
  181. agno/{app/whatsapp/async_router.py → os/interfaces/whatsapp/router.py} +4 -7
  182. agno/os/interfaces/whatsapp/whatsapp.py +29 -0
  183. agno/os/mcp.py +235 -0
  184. agno/os/router.py +1400 -0
  185. agno/os/routers/__init__.py +3 -0
  186. agno/os/routers/evals/__init__.py +3 -0
  187. agno/os/routers/evals/evals.py +393 -0
  188. agno/os/routers/evals/schemas.py +142 -0
  189. agno/os/routers/evals/utils.py +161 -0
  190. agno/os/routers/knowledge/__init__.py +3 -0
  191. agno/os/routers/knowledge/knowledge.py +850 -0
  192. agno/os/routers/knowledge/schemas.py +118 -0
  193. agno/os/routers/memory/__init__.py +3 -0
  194. agno/os/routers/memory/memory.py +410 -0
  195. agno/os/routers/memory/schemas.py +58 -0
  196. agno/os/routers/metrics/__init__.py +3 -0
  197. agno/os/routers/metrics/metrics.py +178 -0
  198. agno/os/routers/metrics/schemas.py +47 -0
  199. agno/os/routers/session/__init__.py +3 -0
  200. agno/os/routers/session/session.py +536 -0
  201. agno/os/schema.py +945 -0
  202. agno/{app/playground → os}/settings.py +7 -15
  203. agno/os/utils.py +270 -0
  204. agno/reasoning/azure_ai_foundry.py +4 -4
  205. agno/reasoning/deepseek.py +4 -4
  206. agno/reasoning/default.py +6 -11
  207. agno/reasoning/groq.py +4 -4
  208. agno/reasoning/helpers.py +4 -6
  209. agno/reasoning/ollama.py +4 -4
  210. agno/reasoning/openai.py +4 -4
  211. agno/run/agent.py +633 -0
  212. agno/run/base.py +53 -77
  213. agno/run/cancel.py +81 -0
  214. agno/run/team.py +243 -96
  215. agno/run/workflow.py +550 -12
  216. agno/session/__init__.py +10 -0
  217. agno/session/agent.py +244 -0
  218. agno/session/summary.py +225 -0
  219. agno/session/team.py +262 -0
  220. agno/{storage/session/v2 → session}/workflow.py +47 -24
  221. agno/team/__init__.py +15 -16
  222. agno/team/team.py +3260 -4824
  223. agno/tools/agentql.py +14 -5
  224. agno/tools/airflow.py +9 -4
  225. agno/tools/api.py +7 -3
  226. agno/tools/apify.py +2 -46
  227. agno/tools/arxiv.py +8 -3
  228. agno/tools/aws_lambda.py +7 -5
  229. agno/tools/aws_ses.py +7 -1
  230. agno/tools/baidusearch.py +4 -1
  231. agno/tools/bitbucket.py +4 -4
  232. agno/tools/brandfetch.py +14 -11
  233. agno/tools/bravesearch.py +4 -1
  234. agno/tools/brightdata.py +43 -23
  235. agno/tools/browserbase.py +13 -4
  236. agno/tools/calcom.py +12 -10
  237. agno/tools/calculator.py +10 -27
  238. agno/tools/cartesia.py +20 -17
  239. agno/tools/{clickup_tool.py → clickup.py} +12 -25
  240. agno/tools/confluence.py +8 -8
  241. agno/tools/crawl4ai.py +7 -1
  242. agno/tools/csv_toolkit.py +9 -8
  243. agno/tools/dalle.py +22 -12
  244. agno/tools/daytona.py +13 -16
  245. agno/tools/decorator.py +6 -3
  246. agno/tools/desi_vocal.py +17 -8
  247. agno/tools/discord.py +11 -8
  248. agno/tools/docker.py +30 -42
  249. agno/tools/duckdb.py +34 -53
  250. agno/tools/duckduckgo.py +8 -7
  251. agno/tools/e2b.py +62 -62
  252. agno/tools/eleven_labs.py +36 -29
  253. agno/tools/email.py +4 -1
  254. agno/tools/evm.py +7 -1
  255. agno/tools/exa.py +19 -14
  256. agno/tools/fal.py +30 -30
  257. agno/tools/file.py +9 -8
  258. agno/tools/financial_datasets.py +25 -44
  259. agno/tools/firecrawl.py +17 -18
  260. agno/tools/function.py +127 -18
  261. agno/tools/giphy.py +23 -11
  262. agno/tools/github.py +48 -126
  263. agno/tools/gmail.py +45 -61
  264. agno/tools/google_bigquery.py +7 -6
  265. agno/tools/google_maps.py +11 -26
  266. agno/tools/googlesearch.py +7 -2
  267. agno/tools/googlesheets.py +21 -17
  268. agno/tools/hackernews.py +9 -5
  269. agno/tools/jina.py +5 -4
  270. agno/tools/jira.py +18 -9
  271. agno/tools/knowledge.py +31 -32
  272. agno/tools/linear.py +18 -33
  273. agno/tools/linkup.py +5 -1
  274. agno/tools/local_file_system.py +8 -5
  275. agno/tools/lumalab.py +32 -20
  276. agno/tools/mcp.py +1 -2
  277. agno/tools/mem0.py +18 -12
  278. agno/tools/memori.py +14 -10
  279. agno/tools/mlx_transcribe.py +3 -2
  280. agno/tools/models/azure_openai.py +33 -15
  281. agno/tools/models/gemini.py +59 -32
  282. agno/tools/models/groq.py +30 -23
  283. agno/tools/models/nebius.py +28 -12
  284. agno/tools/models_labs.py +40 -16
  285. agno/tools/moviepy_video.py +7 -6
  286. agno/tools/neo4j.py +10 -8
  287. agno/tools/newspaper.py +7 -2
  288. agno/tools/newspaper4k.py +8 -3
  289. agno/tools/openai.py +58 -32
  290. agno/tools/openbb.py +12 -11
  291. agno/tools/opencv.py +63 -47
  292. agno/tools/openweather.py +14 -12
  293. agno/tools/pandas.py +11 -3
  294. agno/tools/postgres.py +4 -12
  295. agno/tools/pubmed.py +4 -1
  296. agno/tools/python.py +9 -22
  297. agno/tools/reasoning.py +35 -27
  298. agno/tools/reddit.py +11 -26
  299. agno/tools/replicate.py +55 -42
  300. agno/tools/resend.py +4 -1
  301. agno/tools/scrapegraph.py +15 -14
  302. agno/tools/searxng.py +10 -23
  303. agno/tools/serpapi.py +6 -3
  304. agno/tools/serper.py +13 -4
  305. agno/tools/shell.py +9 -2
  306. agno/tools/slack.py +12 -11
  307. agno/tools/sleep.py +3 -2
  308. agno/tools/spider.py +24 -4
  309. agno/tools/sql.py +7 -6
  310. agno/tools/tavily.py +6 -4
  311. agno/tools/telegram.py +12 -4
  312. agno/tools/todoist.py +11 -31
  313. agno/tools/toolkit.py +1 -1
  314. agno/tools/trafilatura.py +22 -6
  315. agno/tools/trello.py +9 -22
  316. agno/tools/twilio.py +10 -3
  317. agno/tools/user_control_flow.py +6 -1
  318. agno/tools/valyu.py +34 -5
  319. agno/tools/visualization.py +19 -28
  320. agno/tools/webbrowser.py +4 -3
  321. agno/tools/webex.py +11 -7
  322. agno/tools/website.py +15 -46
  323. agno/tools/webtools.py +12 -4
  324. agno/tools/whatsapp.py +5 -9
  325. agno/tools/wikipedia.py +20 -13
  326. agno/tools/x.py +14 -13
  327. agno/tools/yfinance.py +13 -40
  328. agno/tools/youtube.py +26 -20
  329. agno/tools/zendesk.py +7 -2
  330. agno/tools/zep.py +10 -7
  331. agno/tools/zoom.py +10 -9
  332. agno/utils/common.py +1 -19
  333. agno/utils/events.py +100 -123
  334. agno/utils/gemini.py +1 -1
  335. agno/utils/knowledge.py +29 -0
  336. agno/utils/log.py +54 -4
  337. agno/utils/mcp.py +68 -10
  338. agno/utils/media.py +39 -0
  339. agno/utils/message.py +12 -1
  340. agno/utils/models/aws_claude.py +1 -1
  341. agno/utils/models/claude.py +6 -12
  342. agno/utils/models/cohere.py +1 -1
  343. agno/utils/models/mistral.py +8 -7
  344. agno/utils/models/schema_utils.py +3 -3
  345. agno/utils/models/watsonx.py +1 -1
  346. agno/utils/openai.py +1 -1
  347. agno/utils/pprint.py +33 -32
  348. agno/utils/print_response/agent.py +779 -0
  349. agno/utils/print_response/team.py +1669 -0
  350. agno/utils/print_response/workflow.py +1451 -0
  351. agno/utils/prompts.py +14 -14
  352. agno/utils/reasoning.py +87 -0
  353. agno/utils/response.py +42 -42
  354. agno/utils/streamlit.py +481 -0
  355. agno/utils/string.py +8 -22
  356. agno/utils/team.py +50 -0
  357. agno/utils/timer.py +2 -2
  358. agno/vectordb/base.py +33 -21
  359. agno/vectordb/cassandra/cassandra.py +287 -23
  360. agno/vectordb/chroma/chromadb.py +482 -59
  361. agno/vectordb/clickhouse/clickhousedb.py +270 -63
  362. agno/vectordb/couchbase/couchbase.py +309 -29
  363. agno/vectordb/lancedb/lance_db.py +360 -21
  364. agno/vectordb/langchaindb/__init__.py +5 -0
  365. agno/vectordb/langchaindb/langchaindb.py +145 -0
  366. agno/vectordb/lightrag/__init__.py +5 -0
  367. agno/vectordb/lightrag/lightrag.py +374 -0
  368. agno/vectordb/llamaindex/llamaindexdb.py +127 -0
  369. agno/vectordb/milvus/milvus.py +242 -32
  370. agno/vectordb/mongodb/mongodb.py +200 -24
  371. agno/vectordb/pgvector/pgvector.py +319 -37
  372. agno/vectordb/pineconedb/pineconedb.py +221 -27
  373. agno/vectordb/qdrant/qdrant.py +334 -14
  374. agno/vectordb/singlestore/singlestore.py +286 -29
  375. agno/vectordb/surrealdb/surrealdb.py +187 -7
  376. agno/vectordb/upstashdb/upstashdb.py +342 -26
  377. agno/vectordb/weaviate/weaviate.py +227 -165
  378. agno/workflow/__init__.py +17 -13
  379. agno/workflow/{v2/condition.py → condition.py} +135 -32
  380. agno/workflow/{v2/loop.py → loop.py} +115 -28
  381. agno/workflow/{v2/parallel.py → parallel.py} +138 -108
  382. agno/workflow/{v2/router.py → router.py} +133 -32
  383. agno/workflow/{v2/step.py → step.py} +207 -49
  384. agno/workflow/{v2/steps.py → steps.py} +147 -66
  385. agno/workflow/types.py +482 -0
  386. agno/workflow/workflow.py +2410 -696
  387. agno-2.0.0.dist-info/METADATA +494 -0
  388. agno-2.0.0.dist-info/RECORD +515 -0
  389. agno-2.0.0.dist-info/licenses/LICENSE +201 -0
  390. agno/agent/metrics.py +0 -110
  391. agno/api/app.py +0 -35
  392. agno/api/playground.py +0 -92
  393. agno/api/schemas/app.py +0 -12
  394. agno/api/schemas/playground.py +0 -22
  395. agno/api/schemas/user.py +0 -35
  396. agno/api/schemas/workspace.py +0 -46
  397. agno/api/user.py +0 -160
  398. agno/api/workflows.py +0 -33
  399. agno/api/workspace.py +0 -175
  400. agno/app/agui/__init__.py +0 -3
  401. agno/app/agui/app.py +0 -17
  402. agno/app/agui/sync_router.py +0 -120
  403. agno/app/base.py +0 -186
  404. agno/app/discord/__init__.py +0 -3
  405. agno/app/fastapi/__init__.py +0 -3
  406. agno/app/fastapi/app.py +0 -107
  407. agno/app/fastapi/async_router.py +0 -457
  408. agno/app/fastapi/sync_router.py +0 -448
  409. agno/app/playground/app.py +0 -228
  410. agno/app/playground/async_router.py +0 -1053
  411. agno/app/playground/deploy.py +0 -249
  412. agno/app/playground/operator.py +0 -183
  413. agno/app/playground/schemas.py +0 -223
  414. agno/app/playground/serve.py +0 -55
  415. agno/app/playground/sync_router.py +0 -1045
  416. agno/app/playground/utils.py +0 -46
  417. agno/app/settings.py +0 -15
  418. agno/app/slack/__init__.py +0 -3
  419. agno/app/slack/app.py +0 -19
  420. agno/app/slack/sync_router.py +0 -92
  421. agno/app/utils.py +0 -54
  422. agno/app/whatsapp/__init__.py +0 -3
  423. agno/app/whatsapp/app.py +0 -15
  424. agno/app/whatsapp/sync_router.py +0 -197
  425. agno/cli/auth_server.py +0 -249
  426. agno/cli/config.py +0 -274
  427. agno/cli/console.py +0 -88
  428. agno/cli/credentials.py +0 -23
  429. agno/cli/entrypoint.py +0 -571
  430. agno/cli/operator.py +0 -357
  431. agno/cli/settings.py +0 -96
  432. agno/cli/ws/ws_cli.py +0 -817
  433. agno/constants.py +0 -13
  434. agno/document/__init__.py +0 -5
  435. agno/document/chunking/semantic.py +0 -45
  436. agno/document/chunking/strategy.py +0 -31
  437. agno/document/reader/__init__.py +0 -5
  438. agno/document/reader/base.py +0 -47
  439. agno/document/reader/docx_reader.py +0 -60
  440. agno/document/reader/gcs/pdf_reader.py +0 -44
  441. agno/document/reader/s3/pdf_reader.py +0 -59
  442. agno/document/reader/s3/text_reader.py +0 -63
  443. agno/document/reader/url_reader.py +0 -59
  444. agno/document/reader/youtube_reader.py +0 -58
  445. agno/embedder/__init__.py +0 -5
  446. agno/embedder/langdb.py +0 -80
  447. agno/embedder/mistral.py +0 -82
  448. agno/embedder/openai.py +0 -78
  449. agno/file/__init__.py +0 -5
  450. agno/file/file.py +0 -16
  451. agno/file/local/csv.py +0 -32
  452. agno/file/local/txt.py +0 -19
  453. agno/infra/app.py +0 -240
  454. agno/infra/base.py +0 -144
  455. agno/infra/context.py +0 -20
  456. agno/infra/db_app.py +0 -52
  457. agno/infra/resource.py +0 -205
  458. agno/infra/resources.py +0 -55
  459. agno/knowledge/agent.py +0 -702
  460. agno/knowledge/arxiv.py +0 -33
  461. agno/knowledge/combined.py +0 -36
  462. agno/knowledge/csv.py +0 -144
  463. agno/knowledge/csv_url.py +0 -124
  464. agno/knowledge/document.py +0 -223
  465. agno/knowledge/docx.py +0 -137
  466. agno/knowledge/firecrawl.py +0 -34
  467. agno/knowledge/gcs/__init__.py +0 -0
  468. agno/knowledge/gcs/base.py +0 -39
  469. agno/knowledge/gcs/pdf.py +0 -125
  470. agno/knowledge/json.py +0 -137
  471. agno/knowledge/langchain.py +0 -71
  472. agno/knowledge/light_rag.py +0 -273
  473. agno/knowledge/llamaindex.py +0 -66
  474. agno/knowledge/markdown.py +0 -154
  475. agno/knowledge/pdf.py +0 -164
  476. agno/knowledge/pdf_bytes.py +0 -42
  477. agno/knowledge/pdf_url.py +0 -148
  478. agno/knowledge/s3/__init__.py +0 -0
  479. agno/knowledge/s3/base.py +0 -64
  480. agno/knowledge/s3/pdf.py +0 -33
  481. agno/knowledge/s3/text.py +0 -34
  482. agno/knowledge/text.py +0 -141
  483. agno/knowledge/url.py +0 -46
  484. agno/knowledge/website.py +0 -179
  485. agno/knowledge/wikipedia.py +0 -32
  486. agno/knowledge/youtube.py +0 -35
  487. agno/memory/agent.py +0 -423
  488. agno/memory/classifier.py +0 -104
  489. agno/memory/db/__init__.py +0 -5
  490. agno/memory/db/base.py +0 -42
  491. agno/memory/db/mongodb.py +0 -189
  492. agno/memory/db/postgres.py +0 -203
  493. agno/memory/db/sqlite.py +0 -193
  494. agno/memory/memory.py +0 -22
  495. agno/memory/row.py +0 -36
  496. agno/memory/summarizer.py +0 -201
  497. agno/memory/summary.py +0 -19
  498. agno/memory/team.py +0 -415
  499. agno/memory/v2/__init__.py +0 -2
  500. agno/memory/v2/db/__init__.py +0 -1
  501. agno/memory/v2/db/base.py +0 -42
  502. agno/memory/v2/db/firestore.py +0 -339
  503. agno/memory/v2/db/mongodb.py +0 -196
  504. agno/memory/v2/db/postgres.py +0 -214
  505. agno/memory/v2/db/redis.py +0 -187
  506. agno/memory/v2/db/schema.py +0 -54
  507. agno/memory/v2/db/sqlite.py +0 -209
  508. agno/memory/v2/manager.py +0 -437
  509. agno/memory/v2/memory.py +0 -1097
  510. agno/memory/v2/schema.py +0 -55
  511. agno/memory/v2/summarizer.py +0 -215
  512. agno/memory/workflow.py +0 -38
  513. agno/models/ollama/tools.py +0 -430
  514. agno/models/qwen/__init__.py +0 -5
  515. agno/playground/__init__.py +0 -10
  516. agno/playground/deploy.py +0 -3
  517. agno/playground/playground.py +0 -3
  518. agno/playground/serve.py +0 -3
  519. agno/playground/settings.py +0 -3
  520. agno/reranker/__init__.py +0 -0
  521. agno/run/response.py +0 -467
  522. agno/run/v2/__init__.py +0 -0
  523. agno/run/v2/workflow.py +0 -567
  524. agno/storage/__init__.py +0 -0
  525. agno/storage/agent/__init__.py +0 -0
  526. agno/storage/agent/dynamodb.py +0 -1
  527. agno/storage/agent/json.py +0 -1
  528. agno/storage/agent/mongodb.py +0 -1
  529. agno/storage/agent/postgres.py +0 -1
  530. agno/storage/agent/singlestore.py +0 -1
  531. agno/storage/agent/sqlite.py +0 -1
  532. agno/storage/agent/yaml.py +0 -1
  533. agno/storage/base.py +0 -60
  534. agno/storage/dynamodb.py +0 -673
  535. agno/storage/firestore.py +0 -297
  536. agno/storage/gcs_json.py +0 -261
  537. agno/storage/in_memory.py +0 -234
  538. agno/storage/json.py +0 -237
  539. agno/storage/mongodb.py +0 -328
  540. agno/storage/mysql.py +0 -685
  541. agno/storage/postgres.py +0 -682
  542. agno/storage/redis.py +0 -336
  543. agno/storage/session/__init__.py +0 -16
  544. agno/storage/session/agent.py +0 -64
  545. agno/storage/session/team.py +0 -63
  546. agno/storage/session/v2/__init__.py +0 -5
  547. agno/storage/session/workflow.py +0 -61
  548. agno/storage/singlestore.py +0 -606
  549. agno/storage/sqlite.py +0 -646
  550. agno/storage/workflow/__init__.py +0 -0
  551. agno/storage/workflow/mongodb.py +0 -1
  552. agno/storage/workflow/postgres.py +0 -1
  553. agno/storage/workflow/sqlite.py +0 -1
  554. agno/storage/yaml.py +0 -241
  555. agno/tools/thinking.py +0 -73
  556. agno/utils/defaults.py +0 -57
  557. agno/utils/filesystem.py +0 -39
  558. agno/utils/git.py +0 -52
  559. agno/utils/json_io.py +0 -30
  560. agno/utils/load_env.py +0 -19
  561. agno/utils/py_io.py +0 -19
  562. agno/utils/pyproject.py +0 -18
  563. agno/utils/resource_filter.py +0 -31
  564. agno/workflow/v2/__init__.py +0 -21
  565. agno/workflow/v2/types.py +0 -357
  566. agno/workflow/v2/workflow.py +0 -3313
  567. agno/workspace/__init__.py +0 -0
  568. agno/workspace/config.py +0 -325
  569. agno/workspace/enums.py +0 -6
  570. agno/workspace/helpers.py +0 -52
  571. agno/workspace/operator.py +0 -757
  572. agno/workspace/settings.py +0 -158
  573. agno-1.8.2.dist-info/METADATA +0 -982
  574. agno-1.8.2.dist-info/RECORD +0 -566
  575. agno-1.8.2.dist-info/entry_points.txt +0 -3
  576. agno-1.8.2.dist-info/licenses/LICENSE +0 -375
  577. /agno/{app → db/migrations}/__init__.py +0 -0
  578. /agno/{app/playground/__init__.py → db/schemas/metrics.py} +0 -0
  579. /agno/{cli → integrations}/__init__.py +0 -0
  580. /agno/{cli/ws → knowledge/chunking}/__init__.py +0 -0
  581. /agno/{document/chunking → knowledge/remote_content}/__init__.py +0 -0
  582. /agno/{document/reader/gcs → knowledge/reranker}/__init__.py +0 -0
  583. /agno/{document/reader/s3 → os/interfaces}/__init__.py +0 -0
  584. /agno/{app → os/interfaces}/slack/security.py +0 -0
  585. /agno/{app → os/interfaces}/whatsapp/security.py +0 -0
  586. /agno/{file/local → utils/print_response}/__init__.py +0 -0
  587. /agno/{infra → vectordb/llamaindex}/__init__.py +0 -0
  588. {agno-1.8.2.dist-info → agno-2.0.0.dist-info}/WHEEL +0 -0
  589. {agno-1.8.2.dist-info → agno-2.0.0.dist-info}/top_level.txt +0 -0
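
The hunk below is from agno/knowledge/knowledge.py, the new Knowledge class introduced in 2.0.0 (listed above with +1551 lines; the diff shown here is truncated partway through the file).

For orientation, a minimal usage sketch of the API visible in this hunk. The PgVector and PostgresDb names and constructor arguments are assumptions based only on the file paths in the list above (agno/vectordb/pgvector/pgvector.py, agno/db/postgres/postgres.py); they are not shown in this diff.

    from agno.knowledge.knowledge import Knowledge
    from agno.vectordb.pgvector import PgVector  # assumed import path, see file list above
    from agno.db.postgres import PostgresDb      # assumed import path, see file list above

    # Knowledge is a dataclass: a vector db stores embeddings, an optional
    # contents db tracks KnowledgeRow entries (see the fields in the hunk below).
    knowledge = Knowledge(
        name="docs",
        vector_db=PgVector(table_name="docs", db_url="postgresql+psycopg://..."),  # hypothetical arguments
        contents_db=PostgresDb(db_url="postgresql+psycopg://..."),                 # hypothetical arguments
    )

    # add_content / add_contents wrap their async counterparts with asyncio.run().
    knowledge.add_content(path="data/manual.pdf", metadata={"team": "support"})
    knowledge.add_contents(urls=["https://example.com/faq.html"], skip_if_exists=True)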
@@ -0,0 +1,1551 @@
1
+ import asyncio
2
+ import hashlib
3
+ import io
4
+ import time
5
+ from dataclasses import dataclass
6
+ from enum import Enum
7
+ from functools import cached_property
8
+ from io import BytesIO
9
+ from os.path import basename
10
+ from pathlib import Path
11
+ from typing import Any, Dict, List, Optional, Set, Tuple, Union, cast, overload
12
+ from uuid import uuid4
13
+
14
+ from httpx import AsyncClient
15
+
16
+ from agno.db.base import BaseDb
17
+ from agno.db.schemas.knowledge import KnowledgeRow
18
+ from agno.knowledge.content import Content, ContentAuth, ContentStatus, FileData
19
+ from agno.knowledge.document import Document
20
+ from agno.knowledge.reader import Reader, ReaderFactory
21
+ from agno.knowledge.remote_content.remote_content import GCSContent, RemoteContent, S3Content
22
+ from agno.utils.http import async_fetch_with_retry
23
+ from agno.utils.log import log_debug, log_error, log_info, log_warning
24
+ from agno.vectordb import VectorDb
25
+
26
+ ContentDict = Dict[str, Union[str, Dict[str, str]]]
27
+
28
+
29
+ class KnowledgeContentOrigin(Enum):
30
+ PATH = "path"
31
+ URL = "url"
32
+ TOPIC = "topic"
33
+ CONTENT = "content"
34
+
35
+
36
+ @dataclass
37
+ class Knowledge:
38
+ """Knowledge class"""
39
+
40
+ name: Optional[str] = None
41
+ description: Optional[str] = None
42
+ vector_db: Optional[VectorDb] = None
43
+ contents_db: Optional[BaseDb] = None
44
+ max_results: int = 10
45
+ readers: Optional[Dict[str, Reader]] = None
46
+
47
+ def __post_init__(self):
48
+ if self.vector_db and not self.vector_db.exists():
49
+ self.vector_db.create()
50
+
51
+ self.construct_readers()
52
+ self.valid_metadata_filters = set()
53
+
54
+ # --- SDK Specific Methods ---
55
+
56
+ # --- Add Contents ---
57
+ @overload
58
+ async def add_contents_async(self, contents: List[ContentDict]) -> None: ...
59
+
60
+ @overload
61
+ async def add_contents_async(
62
+ self,
63
+ *,
64
+ paths: Optional[List[str]] = None,
65
+ urls: Optional[List[str]] = None,
66
+ metadata: Optional[Dict[str, str]] = None,
67
+ include: Optional[List[str]] = None,
68
+ exclude: Optional[List[str]] = None,
69
+ upsert: bool = False,
70
+ skip_if_exists: bool = False,
71
+ remote_content: Optional[RemoteContent] = None,
72
+ ) -> None: ...
73
+
74
+ async def add_contents_async(self, *args, **kwargs) -> None:
75
+ if args and isinstance(args[0], list):
76
+ arguments = args[0]
77
+ for argument in arguments:
78
+ await self.add_content_async(
79
+ name=argument.get("name"),
80
+ description=argument.get("description"),
81
+ path=argument.get("path"),
82
+ url=argument.get("url"),
83
+ metadata=argument.get("metadata"),
84
+ topics=argument.get("topics"),
85
+ reader=argument.get("reader"),
86
+ include=argument.get("include"),
87
+ exclude=argument.get("exclude"),
88
+ upsert=argument.get("upsert", False),
89
+ skip_if_exists=argument.get("skip_if_exists", False),
90
+ remote_content=argument.get("remote_content", None),
91
+ )
92
+
93
+ elif kwargs:
94
+ name = kwargs.get("name", [])
95
+ metadata = kwargs.get("metadata", {})
96
+ description = kwargs.get("description", [])
97
+ topics = kwargs.get("topics", [])
98
+ paths = kwargs.get("paths", [])
99
+ urls = kwargs.get("urls", [])
100
+ include = kwargs.get("include")
101
+ exclude = kwargs.get("exclude")
102
+ upsert = kwargs.get("upsert", False)
103
+ skip_if_exists = kwargs.get("skip_if_exists", False)
104
+ remote_content = kwargs.get("remote_content", None)
105
+
106
+ for path in paths:
107
+ await self.add_content_async(
108
+ name=name,
109
+ description=description,
110
+ path=path,
111
+ metadata=metadata,
112
+ include=include,
113
+ exclude=exclude,
114
+ upsert=upsert,
115
+ skip_if_exists=skip_if_exists,
116
+ )
117
+ for url in urls:
118
+ await self.add_content_async(
119
+ name=name,
120
+ description=description,
121
+ url=url,
122
+ metadata=metadata,
123
+ include=include,
124
+ exclude=exclude,
125
+ upsert=upsert,
126
+ skip_if_exists=skip_if_exists,
127
+ )
128
+ if topics:
129
+ await self.add_content_async(
130
+ name=name,
131
+ description=description,
132
+ topics=topics,
133
+ metadata=metadata,
134
+ include=include,
135
+ exclude=exclude,
136
+ upsert=upsert,
137
+ skip_if_exists=skip_if_exists,
138
+ )
139
+
140
+ if remote_content:
141
+ await self.add_content_async(
142
+ name=name,
143
+ metadata=metadata,
144
+ description=description,
145
+ remote_content=remote_content,
146
+ upsert=upsert,
147
+ skip_if_exists=skip_if_exists,
148
+ )
149
+
150
+ else:
151
+ raise ValueError("Invalid usage of add_contents.")
152
+
153
+ @overload
154
+ def add_contents(self, contents: List[ContentDict]) -> None: ...
155
+
156
+ @overload
157
+ def add_contents(
158
+ self,
159
+ *,
160
+ paths: Optional[List[str]] = None,
161
+ urls: Optional[List[str]] = None,
162
+ metadata: Optional[Dict[str, str]] = None,
163
+ include: Optional[List[str]] = None,
164
+ exclude: Optional[List[str]] = None,
165
+ upsert: bool = False,
166
+ skip_if_exists: bool = False,
167
+ ) -> None: ...
168
+
169
+ def add_contents(self, *args, **kwargs) -> None:
170
+ """
171
+ Synchronously add multiple content items to the knowledge base.
172
+
173
+ This method wraps the asynchronous add_contents method
174
+
175
+ Supports two usage patterns:
176
+ 1. Pass a list of content dictionaries as first argument
177
+ 2. Pass keyword arguments with paths, urls, metadata, etc.
178
+
179
+ Args:
180
+ contents: List of content dictionaries (when used as first overload)
181
+ paths: Optional list of file paths to load content from
182
+ urls: Optional list of URLs to load content from
183
+ metadata: Optional metadata dictionary to apply to all content
184
+ include: Optional list of file patterns to include
185
+ exclude: Optional list of file patterns to exclude
186
+ upsert: Whether to update existing content if it already exists
187
+ skip_if_exists: Whether to skip adding content if it already exists
188
+ """
189
+ asyncio.run(self.add_contents_async(*args, **kwargs))
190
+
191
+ # --- Add Content ---
192
+
193
+ @overload
194
+ async def add_content_async(
195
+ self,
196
+ *,
197
+ path: Optional[str] = None,
198
+ url: Optional[str] = None,
199
+ text_content: Optional[str] = None,
200
+ metadata: Optional[Dict[str, str]] = None,
201
+ include: Optional[List[str]] = None,
202
+ exclude: Optional[List[str]] = None,
203
+ upsert: bool = False,
204
+ skip_if_exists: bool = False,
205
+ reader: Optional[Reader] = None,
206
+ auth: Optional[ContentAuth] = None,
207
+ ) -> None: ...
208
+
209
+ @overload
210
+ async def add_content_async(self, *args, **kwargs) -> None: ...
211
+
212
+ async def add_content_async(
213
+ self,
214
+ name: Optional[str] = None,
215
+ description: Optional[str] = None,
216
+ path: Optional[str] = None,
217
+ url: Optional[str] = None,
218
+ text_content: Optional[str] = None,
219
+ metadata: Optional[Dict[str, Any]] = None,
220
+ topics: Optional[List[str]] = None,
221
+ remote_content: Optional[RemoteContent] = None,
222
+ reader: Optional[Reader] = None,
223
+ include: Optional[List[str]] = None,
224
+ exclude: Optional[List[str]] = None,
225
+ upsert: bool = True,
226
+ skip_if_exists: bool = True,
227
+ auth: Optional[ContentAuth] = None,
228
+ ) -> None:
229
+ # Validation: At least one of the parameters must be provided
230
+ if all(argument is None for argument in [path, url, text_content, topics, remote_content]):
231
+ log_info("At least one of 'path', 'url', 'text_content', 'topics', or 'remote_content' must be provided.")
232
+ return
233
+
234
+ if not skip_if_exists:
235
+ log_info("skip_if_exists is disabled, disabling upsert")
236
+ upsert = False
237
+
238
+ content = None
239
+ file_data = None
240
+ if text_content:
241
+ file_data = FileData(content=text_content, type="Text")
242
+
243
+ content = Content(
244
+ id=str(uuid4()),
245
+ name=name,
246
+ description=description,
247
+ path=path,
248
+ url=url,
249
+ file_data=file_data if file_data else None,
250
+ metadata=metadata,
251
+ topics=topics,
252
+ remote_content=remote_content,
253
+ reader=reader,
254
+ auth=auth,
255
+ )
256
+
257
+ await self._load_content(content, upsert, skip_if_exists, include, exclude)
258
+
259
+ @overload
260
+ def add_content(
261
+ self,
262
+ *,
263
+ path: Optional[str] = None,
264
+ url: Optional[str] = None,
265
+ text_content: Optional[str] = None,
266
+ metadata: Optional[Dict[str, str]] = None,
267
+ include: Optional[List[str]] = None,
268
+ exclude: Optional[List[str]] = None,
269
+ upsert: bool = False,
270
+ skip_if_exists: bool = False,
271
+ reader: Optional[Reader] = None,
272
+ auth: Optional[ContentAuth] = None,
273
+ ) -> None: ...
274
+
275
+ @overload
276
+ def add_content(self, *args, **kwargs) -> None: ...
277
+
278
+ def add_content(
279
+ self,
280
+ name: Optional[str] = None,
281
+ description: Optional[str] = None,
282
+ path: Optional[str] = None,
283
+ url: Optional[str] = None,
284
+ text_content: Optional[str] = None,
285
+ metadata: Optional[Dict[str, Any]] = None,
286
+ topics: Optional[List[str]] = None,
287
+ remote_content: Optional[RemoteContent] = None,
288
+ reader: Optional[Reader] = None,
289
+ include: Optional[List[str]] = None,
290
+ exclude: Optional[List[str]] = None,
291
+ upsert: bool = True,
292
+ skip_if_exists: bool = True,
293
+ auth: Optional[ContentAuth] = None,
294
+ ) -> None:
295
+ """
296
+ Synchronously add content to the knowledge base.
297
+
298
+ Args:
299
+ name: Optional name for the content
300
+ description: Optional description for the content
301
+ path: Optional file path to load content from
302
+ url: Optional URL to load content from
303
+ text_content: Optional text content to add directly
304
+ metadata: Optional metadata dictionary
305
+ topics: Optional list of topics
306
+ config: Optional cloud storage configuration
307
+ reader: Optional custom reader for processing the content
308
+ include: Optional list of file patterns to include
309
+ exclude: Optional list of file patterns to exclude
310
+ upsert: Whether to update existing content if it already exists
311
+ skip_if_exists: Whether to skip adding content if it already exists
312
+ """
313
+ asyncio.run(
314
+ self.add_content_async(
315
+ name=name,
316
+ description=description,
317
+ path=path,
318
+ url=url,
319
+ text_content=text_content,
320
+ metadata=metadata,
321
+ topics=topics,
322
+ remote_content=remote_content,
323
+ reader=reader,
324
+ include=include,
325
+ exclude=exclude,
326
+ upsert=upsert,
327
+ skip_if_exists=skip_if_exists,
328
+ auth=auth,
329
+ )
330
+ )
331
+
332
+ async def _load_from_path(
333
+ self,
334
+ content: Content,
335
+ upsert: bool,
336
+ skip_if_exists: bool,
337
+ include: Optional[List[str]] = None,
338
+ exclude: Optional[List[str]] = None,
339
+ ):
340
+ log_info(f"Adding content from path, {content.id}, {content.name}, {content.path}, {content.description}")
341
+ path = Path(content.path) # type: ignore
342
+
343
+ if path.is_file():
344
+ if self._should_include_file(str(path), include, exclude):
345
+ log_info(f"Adding file {path} due to include/exclude filters")
346
+
347
+ # Handle LightRAG special case - read file and upload directly
348
+ if self.vector_db.__class__.__name__ == "LightRag":
349
+ await self._process_lightrag_content(content, KnowledgeContentOrigin.PATH)
350
+ return
351
+
352
+ content.content_hash = self._build_content_hash(content)
353
+ if self.vector_db and self.vector_db.content_hash_exists(content.content_hash) and skip_if_exists:
354
+ log_info(f"Content {content.content_hash} already exists, skipping")
355
+ return
356
+
357
+ self._add_to_contents_db(content)
358
+
359
+ if content.reader:
360
+ # TODO: We will refactor this to eventually pass authorization to all readers
361
+ import inspect
362
+
363
+ read_signature = inspect.signature(content.reader.read)
364
+ if "password" in read_signature.parameters and content.auth and content.auth.password:
365
+ read_documents = content.reader.read(
366
+ path, name=content.name or path.name, password=content.auth.password
367
+ )
368
+ else:
369
+ read_documents = content.reader.read(path, name=content.name or path.name)
370
+
371
+ else:
372
+ reader = ReaderFactory.get_reader_for_extension(path.suffix)
373
+ log_info(f"Using Reader: {reader.__class__.__name__}")
374
+ if reader:
375
+ # TODO: We will refactor this to eventually pass authorization to all readers
376
+ import inspect
377
+
378
+ read_signature = inspect.signature(reader.read)
379
+ if "password" in read_signature.parameters and content.auth and content.auth.password:
380
+ read_documents = reader.read(
381
+ path, name=content.name or path.name, password=content.auth.password
382
+ )
383
+ else:
384
+ read_documents = reader.read(path, name=content.name or path.name)
385
+
386
+ if not content.file_type:
387
+ content.file_type = path.suffix
388
+
389
+ if not content.size and content.file_data:
390
+ content.size = len(content.file_data.content) # type: ignore
391
+ if not content.size:
392
+ try:
393
+ content.size = path.stat().st_size
394
+ except (OSError, IOError) as e:
395
+ log_warning(f"Could not get file size for {path}: {e}")
396
+ content.size = 0
397
+
398
+ for read_document in read_documents:
399
+ read_document.content_id = content.id
400
+
401
+ await self._handle_vector_db_insert(content, read_documents, upsert)
402
+
403
+ elif path.is_dir():
404
+ for file_path in path.iterdir():
405
+ # Apply include/exclude filtering
406
+ if not self._should_include_file(str(file_path), include, exclude):
407
+ log_debug(f"Skipping file {file_path} due to include/exclude filters")
408
+ continue
409
+
410
+ id = str(uuid4())
411
+ file_content = Content(
412
+ id=id,
413
+ name=content.name,
414
+ path=str(file_path),
415
+ metadata=content.metadata,
416
+ description=content.description,
417
+ reader=content.reader,
418
+ )
419
+ await self._load_from_path(file_content, upsert, skip_if_exists, include, exclude)
420
+ else:
421
+ log_warning(f"Invalid path: {path}")
422
+
423
+ async def _load_from_url(
424
+ self,
425
+ content: Content,
426
+ upsert: bool,
427
+ skip_if_exists: bool,
428
+ ):
429
+ """Load the content in the contextual URL
430
+
431
+ 1. Set content hash
432
+ 2. Validate the URL
433
+ 3. Read the content
434
+ 4. Prepare and insert the content in the vector database
435
+ """
436
+ log_info(f"Adding content from URL {content.url}")
437
+ content.file_type = "url"
438
+
439
+ if not content.url:
440
+ raise ValueError("No url provided")
441
+
442
+ if self.vector_db.__class__.__name__ == "LightRag":
443
+ await self._process_lightrag_content(content, KnowledgeContentOrigin.URL)
444
+ return
445
+
446
+ # 1. Set content hash
447
+ content.content_hash = self._build_content_hash(content)
448
+ if self.vector_db and self.vector_db.content_hash_exists(content.content_hash) and skip_if_exists:
449
+ log_info(f"Content {content.content_hash} already exists, skipping")
450
+ return
451
+ self._add_to_contents_db(content)
452
+
453
+ # 2. Validate URL
454
+ try:
455
+ from urllib.parse import urlparse
456
+
457
+ parsed_url = urlparse(content.url)
458
+ if not all([parsed_url.scheme, parsed_url.netloc]):
459
+ content.status = ContentStatus.FAILED
460
+ content.status_message = f"Invalid URL format: {content.url}"
461
+ self._update_content(content)
462
+ log_warning(f"Invalid URL format: {content.url}")
463
+ except Exception as e:
464
+ content.status = ContentStatus.FAILED
465
+ content.status_message = f"Invalid URL: {content.url} - {str(e)}"
466
+ self._update_content(content)
467
+ log_warning(f"Invalid URL: {content.url} - {str(e)}")
468
+
469
+ # 3. Fetch and load content
470
+ async with AsyncClient() as client:
471
+ response = await async_fetch_with_retry(content.url, client=client)
472
+ bytes_content = BytesIO(response.content)
473
+
474
+ # 4. Select reader
475
+ # If a reader was provided by the user, use it
476
+ reader = content.reader
477
+ name = content.name
478
+ # Else select based on file extension
479
+ if reader is None:
480
+ url_path = Path(parsed_url.path)
481
+ file_extension = url_path.suffix.lower()
482
+ if file_extension == ".csv":
483
+ name = basename(parsed_url.path) or "data.csv"
484
+ reader = self.csv_reader
485
+ elif file_extension == ".pdf":
486
+ reader = self.pdf_reader
487
+ elif file_extension == ".docx":
488
+ reader = self.docx_reader
489
+ elif file_extension == ".json":
490
+ reader = self.json_reader
491
+ elif file_extension == ".markdown":
492
+ reader = self.markdown_reader
493
+ else:
494
+ reader = self.text_reader
495
+
496
+ # 5. Read content
497
+ try:
498
+ read_documents = []
499
+ if reader is not None:
500
+ # TODO: We will refactor this to eventually pass authorization to all readers
501
+ import inspect
502
+
503
+ read_signature = inspect.signature(reader.read)
504
+ if reader.__class__.__name__ == "YouTubeReader":
505
+ read_documents = reader.read(content.url, name=name)
506
+ elif "password" in read_signature.parameters and content.auth and content.auth.password:
507
+ read_documents = reader.read(bytes_content, name=name, password=content.auth.password)
508
+ else:
509
+ read_documents = reader.read(bytes_content, name=name)
510
+ except Exception as e:
511
+ log_error(f"Error reading URL: {content.url} - {str(e)}")
512
+ content.status = ContentStatus.FAILED
513
+ content.status_message = f"Error reading URL: {content.url} - {str(e)}"
514
+ self._update_content(content)
515
+ return
516
+
517
+ # 6. Chunk documents if needed
518
+ if reader and not reader.chunk:
519
+ read_documents = await reader.chunk_documents_async(read_documents)
520
+
521
+ # 7. Prepare and insert the content in the vector database
522
+ file_size = 0
523
+ if read_documents:
524
+ for read_document in read_documents:
525
+ if read_document.size:
526
+ file_size += read_document.size
527
+ read_document.content_id = content.id
528
+ await self._handle_vector_db_insert(content, read_documents, upsert)
529
+
530
+ async def _load_from_content(
531
+ self,
532
+ content: Content,
533
+ upsert: bool = True,
534
+ skip_if_exists: bool = True,
535
+ ):
536
+ if content.name:
537
+ name = content.name
538
+ elif content.file_data and content.file_data.content:
539
+ if isinstance(content.file_data.content, bytes):
540
+ name = content.file_data.content[:10].decode("utf-8", errors="ignore")
541
+ elif isinstance(content.file_data.content, str):
542
+ name = (
543
+ content.file_data.content[:10]
544
+ if len(content.file_data.content) >= 10
545
+ else content.file_data.content
546
+ )
547
+ else:
548
+ name = str(content.file_data.content)[:10]
549
+ else:
550
+ name = None
551
+
552
+ if name is not None:
553
+ content.name = name
554
+
555
+ log_info(f"Adding content from {content.name}")
556
+
557
+ if content.file_data and self.vector_db.__class__.__name__ == "LightRag":
558
+ await self._process_lightrag_content(content, KnowledgeContentOrigin.CONTENT)
559
+ return
560
+
561
+ content.content_hash = self._build_content_hash(content)
562
+ if self.vector_db and self.vector_db.content_hash_exists(content.content_hash) and skip_if_exists:
563
+ log_info(f"Content {content.content_hash} already exists, skipping")
564
+
565
+ return
566
+ self._add_to_contents_db(content)
567
+
568
+ read_documents = []
569
+
570
+ if isinstance(content.file_data, str):
571
+ try:
572
+ content_bytes = content.file_data.encode("utf-8")
573
+ except UnicodeEncodeError:
574
+ content_bytes = content.file_data.encode("latin-1")
575
+ content_io = io.BytesIO(content_bytes)
576
+
577
+ if content.reader:
578
+ log_info(f"Using reader: {content.reader.__class__.__name__} to read content")
579
+ read_documents = content.reader.read(content_io, name=name)
580
+ else:
581
+ text_reader = self.text_reader
582
+ if text_reader:
583
+ read_documents = text_reader.read(content_io, name=name)
584
+ else:
585
+ content.status = ContentStatus.FAILED
586
+ content.status_message = "Text reader not available"
587
+ self._update_content(content)
588
+ return
589
+
590
+ elif isinstance(content.file_data, FileData):
591
+ if content.file_data.type:
592
+ if isinstance(content.file_data.content, bytes):
593
+ content_io = io.BytesIO(content.file_data.content)
594
+ elif isinstance(content.file_data.content, str):
595
+ if self._is_text_mime_type(content.file_data.type):
596
+ try:
597
+ content_bytes = content.file_data.content.encode("utf-8")
598
+ except UnicodeEncodeError:
599
+ log_debug(f"UTF-8 encoding failed for {content.file_data.type}, using latin-1")
600
+ content_bytes = content.file_data.content.encode("latin-1")
601
+ else:
602
+ content_bytes = content.file_data.content.encode("latin-1")
603
+ content_io = io.BytesIO(content_bytes)
604
+ else:
605
+ content_io = content.file_data.content # type: ignore
606
+
607
+ # Respect an explicitly provided reader; otherwise select based on file type
608
+ if content.reader:
609
+ log_info(f"Using reader: {content.reader.__class__.__name__} to read content")
610
+ reader = content.reader
611
+ else:
612
+ reader = self._select_reader(content.file_data.type)
613
+ name = content.name if content.name else f"content_{content.file_data.type}"
614
+ read_documents = reader.read(content_io, name=name)
615
+
616
+ for read_document in read_documents:
617
+ if content.metadata:
618
+ read_document.meta_data.update(content.metadata)
619
+ read_document.content_id = content.id
620
+
621
+ if len(read_documents) == 0:
622
+ content.status = ContentStatus.FAILED
623
+ content.status_message = "Content could not be read"
624
+ self._update_content(content)
625
+
626
+ else:
627
+ content.status = ContentStatus.FAILED
628
+ content.status_message = "No content provided"
629
+ self._update_content(content)
630
+ return
631
+
632
+ await self._handle_vector_db_insert(content, read_documents, upsert)
633
+
634
+ async def _load_from_topics(
635
+ self,
636
+ content: Content,
637
+ upsert: bool,
638
+ skip_if_exists: bool,
639
+ ):
640
+ log_info(f"Adding content from topics: {content.topics}")
641
+
642
+ if content.topics is None:
643
+ log_warning("No topics provided for content")
644
+ return
645
+
646
+ for topic in content.topics:
647
+ id = str(uuid4())
648
+ content = Content(
649
+ id=id,
650
+ name=topic,
651
+ metadata=content.metadata,
652
+ reader=content.reader,
653
+ status=ContentStatus.PROCESSING if content.reader else ContentStatus.FAILED,
654
+ file_data=FileData(
655
+ type="Topic",
656
+ ),
657
+ topics=[topic],
658
+ )
659
+
660
+ if self.vector_db.__class__.__name__ == "LightRag":
661
+ await self._process_lightrag_content(content, KnowledgeContentOrigin.TOPIC)
662
+ return
663
+
664
+ content.content_hash = self._build_content_hash(content)
665
+ if self.vector_db and self.vector_db.content_hash_exists(content.content_hash) and skip_if_exists:
666
+ log_info(f"Content {content.content_hash} already exists, skipping")
667
+ continue
668
+
669
+ self._add_to_contents_db(content)
670
+ if content.reader is None:
671
+ log_error(f"No reader available for topic: {topic}")
672
+ continue
673
+ read_documents = content.reader.read(topic)
674
+ if len(read_documents) > 0:
675
+ for read_document in read_documents:
676
+ read_document.content_id = id
677
+ if read_document.content:
678
+ read_document.size = len(read_document.content.encode("utf-8"))
679
+ else:
680
+ content.status = ContentStatus.FAILED
681
+ content.status_message = "No content found for topic"
682
+ self._update_content(content)
683
+
684
+ await self._handle_vector_db_insert(content, read_documents, upsert)
685
+
686
+ async def _load_from_remote_content(
687
+ self,
688
+ content: Content,
689
+ upsert: bool,
690
+ skip_if_exists: bool,
691
+ ):
692
+ if content.remote_content is None:
693
+ log_warning("No remote content provided for content")
694
+ return
695
+
696
+ remote_content = content.remote_content
697
+
698
+ if isinstance(remote_content, S3Content):
699
+ await self._load_from_s3(content, upsert, skip_if_exists)
700
+
701
+ elif isinstance(remote_content, GCSContent):
702
+ await self._load_from_gcs(content, upsert, skip_if_exists)
703
+
704
+ else:
705
+ log_warning(f"Unsupported remote content type: {type(remote_content)}")
706
+
707
+ async def _load_from_s3(self, content: Content, upsert: bool, skip_if_exists: bool):
708
+ """Load the contextual S3 content.
709
+
710
+ 1. Identify objects to read
711
+ 2. Setup Content object
712
+ 3. Hash content and add it to the contents database
713
+ 4. Select reader
714
+ 5. Fetch and load the content
715
+ 6. Read the content
716
+ 7. Prepare and insert the content in the vector database
717
+ 8. Remove temporary file if needed
718
+ """
719
+ from agno.cloud.aws.s3.object import S3Object
720
+
721
+ remote_content: S3Content = cast(S3Content, content.remote_content)
722
+
723
+ # 1. Identify objects to read
724
+ objects_to_read: List[S3Object] = []
725
+ if remote_content.bucket is not None:
726
+ if remote_content.key is not None:
727
+ _object = S3Object(bucket_name=remote_content.bucket.name, name=remote_content.key)
728
+ objects_to_read.append(_object)
729
+ elif remote_content.object is not None:
730
+ objects_to_read.append(remote_content.object)
731
+ elif remote_content.prefix is not None:
732
+ objects_to_read.extend(remote_content.bucket.get_objects(prefix=remote_content.prefix))
733
+ else:
734
+ objects_to_read.extend(remote_content.bucket.get_objects())
735
+
736
+ for s3_object in objects_to_read:
737
+ # 2. Setup Content object
738
+ id = str(uuid4())
739
+ content_name = content.name or ""
740
+ content_name += "_" + (s3_object.name or "")
741
+ content_entry = Content(
742
+ id=id,
743
+ name=content_name,
744
+ description=content.description,
745
+ status=ContentStatus.PROCESSING,
746
+ metadata=content.metadata,
747
+ file_type="s3",
748
+ )
749
+
750
+ # 3. Hash content and add it to the contents database
751
+ content_hash = self._build_content_hash(content_entry)
752
+ if self.vector_db and self.vector_db.content_hash_exists(content_hash) and skip_if_exists:
753
+ log_info(f"Content {content_hash} already exists, skipping")
754
+ continue
755
+ self._add_to_contents_db(content_entry)
756
+
757
+ # 4. Select reader
758
+ reader = content.reader
759
+ if reader is None:
760
+ if s3_object.uri.endswith(".pdf"):
761
+ reader = self.pdf_reader
762
+ elif s3_object.uri.endswith(".csv"):
763
+ reader = self.csv_reader
764
+ elif s3_object.uri.endswith(".docx"):
765
+ reader = self.docx_reader
766
+ elif s3_object.uri.endswith(".json"):
767
+ reader = self.json_reader
768
+ elif s3_object.uri.endswith(".markdown"):
769
+ reader = self.markdown_reader
770
+ else:
771
+ reader = self.text_reader
772
+ reader = cast(Reader, reader)
773
+
774
+ # 5. Fetch and load the content
775
+ temporary_file = None
776
+ obj_name = content_name or s3_object.name.split("/")[-1]
777
+ readable_content: Optional[Union[BytesIO, Path]] = None
778
+ if s3_object.uri.endswith(".pdf"):
779
+ readable_content = BytesIO(s3_object.get_resource().get()["Body"].read())
780
+ else:
781
+ temporary_file = Path("storage").joinpath(obj_name)
782
+ readable_content = temporary_file
783
+ s3_object.download(readable_content) # type: ignore
784
+
785
+ # 6. Read the content
786
+ read_documents = reader.read(readable_content, name=obj_name)
787
+
788
+ # 7. Prepare and insert the content in the vector database
789
+ for read_document in read_documents:
790
+ read_document.content_id = content.id
791
+ await self._handle_vector_db_insert(content_entry, read_documents, upsert)
792
+
793
+ # 8. Remove temporary file if needed
794
+ if temporary_file:
795
+ temporary_file.unlink()
796
+
797
+ async def _load_from_gcs(self, content: Content, upsert: bool, skip_if_exists: bool):
798
+ """Load the contextual GCS content.
799
+
800
+ 1. Identify objects to read
801
+ 2. Setup Content object
802
+ 3. Hash content and add it to the contents database
803
+ 4. Select reader
804
+ 5. Fetch and load the content
805
+ 6. Read the content
806
+ 7. Prepare and insert the content in the vector database
807
+ """
808
+ remote_content: GCSContent = cast(GCSContent, content.remote_content)
809
+
810
+ # 1. Identify objects to read
811
+ objects_to_read = []
812
+ if remote_content.blob_name is not None:
813
+ objects_to_read.append(remote_content.bucket.blob(remote_content.blob_name)) # type: ignore
814
+ elif remote_content.prefix is not None:
815
+ objects_to_read.extend(remote_content.bucket.list_blobs(prefix=remote_content.prefix)) # type: ignore
816
+ else:
817
+ objects_to_read.extend(remote_content.bucket.list_blobs()) # type: ignore
818
+
819
+ for gcs_object in objects_to_read:
820
+ # 2. Setup Content object
821
+ id = str(uuid4())
822
+ name = (content.name or "content") + "_" + gcs_object.name
823
+ content_entry = Content(
824
+ id=id,
825
+ name=name,
826
+ description=content.description,
827
+ status=ContentStatus.PROCESSING,
828
+ metadata=content.metadata,
829
+ file_type="gcs",
830
+ )
831
+
832
+ # 3. Hash content and add it to the contents database
833
+ content_hash = self._build_content_hash(content_entry)
834
+ if self.vector_db and self.vector_db.content_hash_exists(content_hash) and skip_if_exists:
835
+ log_info(f"Content {content_hash} already exists, skipping")
836
+ continue
837
+
838
+ # 4. Add it to the contents database
839
+ self._add_to_contents_db(content_entry)
840
+
841
+ # 5. Select reader
842
+ reader = content.reader
843
+ if reader is None:
844
+ if gcs_object.name.endswith(".pdf"):
845
+ reader = self.pdf_reader
846
+ elif gcs_object.name.endswith(".csv"):
847
+ reader = self.csv_reader
848
+ elif gcs_object.name.endswith(".docx"):
849
+ reader = self.docx_reader
850
+ elif gcs_object.name.endswith(".json"):
851
+ reader = self.json_reader
852
+ elif gcs_object.name.endswith(".markdown"):
853
+ reader = self.markdown_reader
854
+ else:
855
+ reader = self.text_reader
856
+ reader = cast(Reader, reader)
857
+
858
+ # 5. Fetch and load the content
859
+ readable_content = BytesIO(gcs_object.download_as_bytes())
860
+
861
+ # 6. Read the content
862
+ read_documents = reader.read(readable_content, name=name)
863
+
864
+ # 7. Prepare and insert the content in the vector database
865
+ for read_document in read_documents:
866
+ read_document.content_id = content.id
867
+ await self._handle_vector_db_insert(content_entry, read_documents, upsert)
868
+
869
+    async def _handle_vector_db_insert(self, content: Content, read_documents: List[Document], upsert: bool) -> None:
+        if not self.vector_db:
+            log_error("No vector database configured")
+            content.status = ContentStatus.FAILED
+            content.status_message = "No vector database configured"
+            self._update_content(content)
+            return
+
+        if self.vector_db.upsert_available() and upsert:
+            try:
+                await self.vector_db.async_upsert(content.content_hash, read_documents, content.metadata)
+            except Exception as e:
+                log_error(f"Error upserting document: {e}")
+                content.status = ContentStatus.FAILED
+                content.status_message = "Could not upsert embedding"
+                self._update_content(content)
+                return
+        else:
+            try:
+                await self.vector_db.async_insert(
+                    content.content_hash, documents=read_documents, filters=content.metadata
+                )
+            except Exception as e:
+                log_error(f"Error inserting document: {e}")
+                content.status = ContentStatus.FAILED
+                content.status_message = "Could not insert embedding"
+                self._update_content(content)
+                return
+
+        content.status = ContentStatus.COMPLETED
+        self._update_content(content)
+
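To make the branching above concrete, a small stand-in vector db is sketched below; the class is purely illustrative and not part of agno. With upsert=True but upsert_available() returning False, _handle_vector_db_insert falls through to async_insert; async_upsert is only used when both are true.

    class StubVectorDb:
        """Illustrative stand-in for a vector db without upsert support."""

        def upsert_available(self) -> bool:
            return False  # forces the async_insert branch even when upsert=True

        async def async_insert(self, content_hash, documents, filters=None):
            ...  # embed and store `documents`, keyed by content_hash

        async def async_upsert(self, content_hash, documents, filters=None):
            ...  # only reached when upsert_available() is True and upsert=True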
+    async def _load_content(
+        self,
+        content: Content,
+        upsert: bool,
+        skip_if_exists: bool,
+        include: Optional[List[str]] = None,
+        exclude: Optional[List[str]] = None,
+    ) -> None:
+        log_info(f"Loading content: {content.id}")
+
+        if content.metadata:
+            self.add_filters(content.metadata)
+
+        if content.path:
+            await self._load_from_path(content, upsert, skip_if_exists, include, exclude)
+
+        if content.url:
+            await self._load_from_url(content, upsert, skip_if_exists)
+
+        if content.file_data:
+            await self._load_from_content(content, upsert, skip_if_exists)
+
+        if content.topics:
+            await self._load_from_topics(content, upsert, skip_if_exists)
+
+        if content.remote_content:
+            await self._load_from_remote_content(content, upsert, skip_if_exists)
+
+    def _build_content_hash(self, content: Content) -> str:
+        """
+        Build the content hash from the content.
+        """
+        if content.path:
+            return hashlib.sha256(str(content.path).encode()).hexdigest()
+        elif content.url:
+            return hashlib.sha256(content.url.encode()).hexdigest()
+        elif content.file_data and content.file_data.content:
+            name = content.name or "content"
+            return hashlib.sha256(name.encode()).hexdigest()
+        elif content.topics and len(content.topics) > 0:
+            topic = content.topics[0]
+            reader = type(content.reader).__name__ if content.reader else "unknown"
+            return hashlib.sha256(f"{topic}-{reader}".encode()).hexdigest()
+        else:
+            # Fallback for edge cases
+            import random
+            import string
+
+            fallback = (
+                content.name
+                or content.id
+                or ("unknown_content" + "".join(random.choices(string.ascii_lowercase + string.digits, k=6)))
+            )
+            return hashlib.sha256(fallback.encode()).hexdigest()
+
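A quick worked example of the hashing above: the dedup key for URL-based content is just the SHA-256 of the URL string, so re-adding the same URL with skip_if_exists=True is skipped. The URL is illustrative.

    import hashlib

    url = "https://example.com/doc.pdf"
    content_hash = hashlib.sha256(url.encode()).hexdigest()
    # stable key: identical URLs always produce the same hash, on purpose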
+    def _add_to_contents_db(self, content: Content):
+        if self.contents_db:
+            created_at = content.created_at if content.created_at else int(time.time())
+            updated_at = content.updated_at if content.updated_at else int(time.time())
+
+            file_type = (
+                content.file_type
+                if content.file_type
+                else content.file_data.type
+                if content.file_data and content.file_data.type
+                else None
+            )
+            content_row = KnowledgeRow(
+                id=content.id,
+                name=content.name if content.name else "",
+                description=content.description if content.description else "",
+                metadata=content.metadata,
+                type=file_type,
+                size=content.size
+                if content.size
+                else len(content.file_data.content)
+                if content.file_data and content.file_data.content
+                else None,
+                linked_to=self.name,
+                access_count=0,
+                status=content.status if content.status else ContentStatus.PROCESSING,
+                status_message="",
+                created_at=created_at,
+                updated_at=updated_at,
+            )
+            self.contents_db.upsert_knowledge_content(knowledge_row=content_row)
+
+    def _update_content(self, content: Content) -> Optional[Dict[str, Any]]:
+        if self.contents_db:
+            if not content.id:
+                log_warning("Content id is required to update Knowledge content")
+                return None
+
+            # TODO: we shouldn't check for content here, we should trust the upsert method to handle conflicts
+            content_row = self.contents_db.get_knowledge_content(content.id)
+            if content_row is None:
+                log_warning(f"Content row not found for id: {content.id}, cannot update status")
+                return None
+
+            if content.name is not None:
+                content_row.name = content.name
+            if content.description is not None:
+                content_row.description = content.description
+            if content.metadata is not None:
+                content_row.metadata = content.metadata
+            if content.status is not None:
+                content_row.status = content.status
+            if content.status_message is not None:
+                content_row.status_message = content.status_message
+            if content.external_id is not None:
+                content_row.external_id = content.external_id
+
+            content_row.updated_at = int(time.time())
+            self.contents_db.upsert_knowledge_content(knowledge_row=content_row)
+
+            if self.vector_db and content.metadata:
+                self.vector_db.update_metadata(content_id=content.id, metadata=content.metadata)
+
+            if content.metadata:
+                self.add_filters(content.metadata)
+
+            return content_row.to_dict()
+
+        else:
+            log_warning(f"Contents DB not found for knowledge base: {self.name}")
+            return None
+
+    async def _process_lightrag_content(self, content: Content, content_type: KnowledgeContentOrigin) -> None:
+        self._add_to_contents_db(content)
+        if content_type == KnowledgeContentOrigin.PATH:
+            if content.file_data is None:
+                log_warning("No file data provided")
+
+            if content.path is None:
+                log_error("No path provided for content")
+                return
+
+            path = Path(content.path)
+
+            log_info(f"Uploading file to LightRAG from path: {path}")
+            try:
+                # Read the file content from path
+                with open(path, "rb") as f:
+                    file_content = f.read()
+
+                # Get file type from extension or content.file_type
+                file_type = content.file_type or path.suffix
+
+                if self.vector_db and hasattr(self.vector_db, "insert_file_bytes"):
+                    result = await self.vector_db.insert_file_bytes(
+                        file_content=file_content,
+                        filename=path.name,  # Use the original filename with extension
+                        content_type=file_type,
+                        send_metadata=True,  # Enable metadata so server knows the file type
+                    )
+
+                else:
+                    log_error("Vector database does not support file insertion")
+                    content.status = ContentStatus.FAILED
+                    self._update_content(content)
+                    return
+                content.external_id = result
+                content.status = ContentStatus.COMPLETED
+                self._update_content(content)
+                return
+
+            except Exception as e:
+                log_error(f"Error uploading file to LightRAG: {e}")
+                content.status = ContentStatus.FAILED
+                content.status_message = f"Could not upload to LightRAG: {str(e)}"
+                self._update_content(content)
+                return
+
+        elif content_type == KnowledgeContentOrigin.URL:
+            log_info(f"Uploading file to LightRAG from URL: {content.url}")
+            try:
+                reader = content.reader or self.website_reader
+                if reader is None:
+                    log_error("No URL reader available")
+                    content.status = ContentStatus.FAILED
+                    self._update_content(content)
+                    return
+
+                reader.chunk = False
+                read_documents = reader.read(content.url, name=content.name)
+
+                for read_document in read_documents:
+                    read_document.content_id = content.id
+
+                if not read_documents:
+                    log_error("No documents read from URL")
+                    content.status = ContentStatus.FAILED
+                    self._update_content(content)
+                    return
+
+                if self.vector_db and hasattr(self.vector_db, "insert_text"):
+                    result = await self.vector_db.insert_text(
+                        file_source=content.url,
+                        text=read_documents[0].content,
+                    )
+                else:
+                    log_error("Vector database does not support text insertion")
+                    content.status = ContentStatus.FAILED
+                    self._update_content(content)
+                    return
+
+                content.external_id = result
+                content.status = ContentStatus.COMPLETED
+                self._update_content(content)
+                return
+
+            except Exception as e:
+                log_error(f"Error uploading file to LightRAG: {e}")
+                content.status = ContentStatus.FAILED
+                content.status_message = f"Could not upload to LightRAG: {str(e)}"
+                self._update_content(content)
+                return
+
+        elif content_type == KnowledgeContentOrigin.CONTENT:
+            filename = (
+                content.file_data.filename if content.file_data and content.file_data.filename else "uploaded_file"
+            )
+            log_info(f"Uploading file to LightRAG: {filename}")
+
+            # Use the content from file_data
+            if content.file_data and content.file_data.content:
+                if self.vector_db and hasattr(self.vector_db, "insert_file_bytes"):
+                    result = await self.vector_db.insert_file_bytes(
+                        file_content=content.file_data.content,
+                        filename=filename,
+                        content_type=content.file_data.type,
+                        send_metadata=True,  # Enable metadata so server knows the file type
+                    )
+                else:
+                    log_error("Vector database does not support file insertion")
+                    content.status = ContentStatus.FAILED
+                    self._update_content(content)
+                    return
+                content.external_id = result
+                content.status = ContentStatus.COMPLETED
+                self._update_content(content)
+            else:
+                log_warning(f"No file data available for LightRAG upload: {content.name}")
+                return
+
+        elif content_type == KnowledgeContentOrigin.TOPIC:
+            log_info(f"Uploading file to LightRAG: {content.name}")
+
+            if content.reader is None:
+                log_error("No reader available for topic content")
+                content.status = ContentStatus.FAILED
+                self._update_content(content)
+                return
+
+            if not content.topics:
+                log_error("No topics available for content")
+                content.status = ContentStatus.FAILED
+                self._update_content(content)
+                return
+
+            read_documents = content.reader.read(content.topics)
+            if len(read_documents) > 0:
+                log_debug(f"Read {len(read_documents)} documents for topics: {content.topics}")
+
+                if self.vector_db and hasattr(self.vector_db, "insert_text"):
+                    result = await self.vector_db.insert_text(
+                        file_source=content.topics[0],
+                        text=read_documents[0].content,
+                    )
+                else:
+                    log_error("Vector database does not support text insertion")
+                    content.status = ContentStatus.FAILED
+                    self._update_content(content)
+                    return
+                content.external_id = result
+                content.status = ContentStatus.COMPLETED
+                self._update_content(content)
+                return
+            else:
+                log_warning(f"No documents found for LightRAG upload: {content.name}")
+                return
+
+    def search(
+        self, query: str, max_results: Optional[int] = None, filters: Optional[Dict[str, Any]] = None
+    ) -> List[Document]:
+        """Returns relevant documents matching a query"""
+
+        try:
+            if self.vector_db is None:
+                log_warning("No vector db provided")
+                return []
+
+            _max_results = max_results or self.max_results
+            log_debug(f"Getting {_max_results} relevant documents for query: {query}")
+            return self.vector_db.search(query=query, limit=_max_results, filters=filters)
+        except Exception as e:
+            log_error(f"Error searching for documents: {e}")
+            return []
+
+    async def async_search(
+        self, query: str, max_results: Optional[int] = None, filters: Optional[Dict[str, Any]] = None
+    ) -> List[Document]:
+        """Returns relevant documents matching a query"""
+
+        try:
+            if self.vector_db is None:
+                log_warning("No vector db provided")
+                return []
+
+            _max_results = max_results or self.max_results
+            log_debug(f"Getting {_max_results} relevant documents for query: {query}")
+            try:
+                return await self.vector_db.async_search(query=query, limit=_max_results, filters=filters)
+            except NotImplementedError:
+                log_info("Vector db does not support async search")
+                return self.search(query=query, max_results=_max_results, filters=filters)
+        except Exception as e:
+            log_error(f"Error searching for documents: {e}")
+            return []
+
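A short usage sketch of the two search entry points above; the `knowledge` instance, its vector db wiring, and the filter key are assumed to exist and are illustrative only.

    import asyncio

    # Synchronous lookup, capped at 5 results and filtered on a metadata key
    docs = knowledge.search("quarterly revenue", max_results=5, filters={"team": "finance"})

    # Async lookup; falls back to the sync path if the vector db raises NotImplementedError
    docs = asyncio.run(knowledge.async_search("quarterly revenue", max_results=5))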
+    def validate_filters(self, filters: Optional[Dict[str, Any]]) -> Tuple[Dict[str, Any], List[str]]:
+        if self.valid_metadata_filters is None:
+            self.valid_metadata_filters = set()
+            self.valid_metadata_filters.update(self._get_filters_from_db)
+
+        if not filters:
+            return {}, []
+
+        valid_filters: Dict[str, Any] = {}
+        invalid_keys = []
+
+        # If no metadata filters tracked yet, all keys are considered invalid
+        if self.valid_metadata_filters is None:
+            invalid_keys = list(filters.keys())
+            log_debug(f"No valid metadata filters tracked yet. All filter keys considered invalid: {invalid_keys}")
+            return {}, invalid_keys
+
+        for key, value in filters.items():
+            # Handle both normal keys and prefixed keys like meta_data.key
+            base_key = key.split(".")[-1] if "." in key else key
+            if base_key in self.valid_metadata_filters or key in self.valid_metadata_filters:
+                valid_filters[key] = value
+            else:
+                invalid_keys.append(key)
+                log_debug(f"Invalid filter key: {key} - not present in knowledge base")
+
+        return valid_filters, invalid_keys
+
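For example, assuming the knowledge base has only tracked the metadata key "team" so far, validate_filters splits a mixed filter dict like this (keys and values are illustrative):

    valid, invalid = knowledge.validate_filters(
        {"team": "finance", "meta_data.team": "ops", "region": "emea"}
    )
    # valid   -> {"team": "finance", "meta_data.team": "ops"}  (prefixed keys match on their last segment)
    # invalid -> ["region"]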
+    def add_filters(self, metadata: Dict[str, Any]) -> None:
+        if self.valid_metadata_filters is None:
+            self.valid_metadata_filters = set()
+
+        if metadata is not None:
+            for key in metadata.keys():
+                self.valid_metadata_filters.add(key)
+
+    @cached_property
+    def _get_filters_from_db(self) -> Set[str]:
+        if self.contents_db is None:
+            return set()
+        contents, _ = self.get_content()
+        valid_filters: Set[str] = set()
+        for content in contents:
+            if content.metadata:
+                valid_filters.update(content.metadata.keys())
+        return valid_filters
+
+    def remove_vector_by_id(self, id: str) -> bool:
+        if self.vector_db is None:
+            log_warning("No vector DB provided")
+            return False
+        return self.vector_db.delete_by_id(id)
+
+    def remove_vectors_by_name(self, name: str) -> bool:
+        if self.vector_db is None:
+            log_warning("No vector DB provided")
+            return False
+        return self.vector_db.delete_by_name(name)
+
+    def remove_vectors_by_metadata(self, metadata: Dict[str, Any]) -> bool:
+        if self.vector_db is None:
+            log_warning("No vector DB provided")
+            return False
+        return self.vector_db.delete_by_metadata(metadata)
+
+    # --- API Only Methods ---
+
+    def patch_content(self, content: Content) -> Optional[Dict[str, Any]]:
+        return self._update_content(content)
+
+    def get_content_by_id(self, content_id: str) -> Optional[Content]:
+        if self.contents_db is None:
+            raise ValueError("No contents db provided")
+        content_row = self.contents_db.get_knowledge_content(content_id)
+        if content_row is None:
+            return None
+        content = Content(
+            id=content_row.id,
+            name=content_row.name,
+            description=content_row.description,
+            metadata=content_row.metadata,
+            file_type=content_row.type,
+            size=content_row.size,
+            status=ContentStatus(content_row.status) if content_row.status else None,
+            status_message=content_row.status_message,
+            created_at=content_row.created_at,
+            updated_at=content_row.updated_at if content_row.updated_at else content_row.created_at,
+            external_id=content_row.external_id,
+        )
+        return content
+
+    def get_content(
+        self,
+        limit: Optional[int] = None,
+        page: Optional[int] = None,
+        sort_by: Optional[str] = None,
+        sort_order: Optional[str] = None,
+    ) -> Tuple[List[Content], int]:
+        if self.contents_db is None:
+            raise ValueError("No contents db provided")
+        contents, count = self.contents_db.get_knowledge_contents(
+            limit=limit, page=page, sort_by=sort_by, sort_order=sort_order
+        )
+
+        result = []
+        for content_row in contents:
+            # Create Content from database row
+            content = Content(
+                id=content_row.id,
+                name=content_row.name,
+                description=content_row.description,
+                metadata=content_row.metadata,
+                size=content_row.size,
+                file_type=content_row.type,
+                status=ContentStatus(content_row.status) if content_row.status else None,
+                status_message=content_row.status_message,
+                created_at=content_row.created_at,
+                updated_at=content_row.updated_at if content_row.updated_at else content_row.created_at,
+                external_id=content_row.external_id,
+            )
+            result.append(content)
+        return result, count
+
+    def get_content_status(self, content_id: str) -> Tuple[Optional[ContentStatus], Optional[str]]:
+        if self.contents_db is None:
+            raise ValueError("No contents db provided")
+        content_row = self.contents_db.get_knowledge_content(content_id)
+        if content_row is None:
+            return None, "Content not found"
+
+        # Convert string status to enum, defaulting to PROCESSING if unknown
+        status_str = content_row.status
+        try:
+            status = ContentStatus(status_str.lower()) if status_str else ContentStatus.PROCESSING
+        except ValueError:
+            # Handle legacy or unknown statuses
+            if status_str and "failed" in status_str.lower():
+                status = ContentStatus.FAILED
+            elif status_str and "completed" in status_str.lower():
+                status = ContentStatus.COMPLETED
+            else:
+                status = ContentStatus.PROCESSING
+
+        return status, content_row.status_message
+
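As a worked example of the legacy-status handling above (the content id and stored status string are illustrative, and the enum values are those used elsewhere in this file):

    status, message = knowledge.get_content_status("content-123")
    # A stored status of "COMPLETED_WITH_WARNINGS" is not a ContentStatus member, so the
    # ValueError branch maps it to ContentStatus.COMPLETED ("completed" appears in the string);
    # anything unrecognised defaults to ContentStatus.PROCESSING.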
+    def remove_content_by_id(self, content_id: str):
+        if self.vector_db is not None:
+            if self.vector_db.__class__.__name__ == "LightRag":
+                # For LightRAG, get the content first to find the external_id
+                content = self.get_content_by_id(content_id)
+                if content and content.external_id:
+                    self.vector_db.delete_by_external_id(content.external_id)  # type: ignore
+                else:
+                    log_warning(f"No external_id found for content {content_id}, cannot delete from LightRAG")
+            else:
+                self.vector_db.delete_by_content_id(content_id)
+
+        if self.contents_db is not None:
+            self.contents_db.delete_knowledge_content(content_id)
+
+    def remove_all_content(self):
+        contents, _ = self.get_content()
+        for content in contents:
+            if content.id is not None:
+                self.remove_content_by_id(content.id)
+
+    # --- Reader Factory Integration ---
+
+    def construct_readers(self):
+        """Initialize readers dictionary for lazy loading."""
+        # Initialize empty readers dict - readers will be created on-demand
+        if self.readers is None:
+            self.readers = {}
+
+    def add_reader(self, reader: Reader):
+        """Add a custom reader to the knowledge base."""
+        if self.readers is None:
+            self.readers = {}
+
+        # Generate a key for the reader
+        reader_key = self._generate_reader_key(reader)
+        self.readers[reader_key] = reader
+        return reader
+
+    def get_readers(self) -> Dict[str, Reader]:
+        """Get all currently loaded readers (only returns readers that have been used)."""
+        if self.readers is None:
+            self.readers = {}
+
+        return self.readers
+
+    def _generate_reader_key(self, reader: Reader) -> str:
+        """Generate a key for a reader instance."""
+        if reader.name:
+            return reader.name.lower().replace(" ", "_")
+        else:
+            return reader.__class__.__name__.lower().replace(" ", "_")
+
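A small sketch of registering a custom reader with the key generation above; the reader subclass, its attributes, and the `knowledge` instance are hypothetical and for illustration only.

    class ChangelogReader(Reader):  # hypothetical Reader subclass
        name = "Changelog Reader"

        def read(self, source, name=None):
            return []  # would return parsed Document objects

    knowledge.add_reader(ChangelogReader())
    # stored under the key "changelog_reader" (name lower-cased, spaces replaced with underscores)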
+    def _select_reader(self, extension: str) -> Reader:
+        """Select the appropriate reader for a file extension."""
+        log_info(f"Selecting reader for extension: {extension}")
+        return ReaderFactory.get_reader_for_extension(extension)
+
+    def get_filters(self) -> List[str]:
+        return [
+            "filter_tag_1",
+            "filter_tag2",
+        ]
+
+    # --- Convenience Properties for Backward Compatibility ---
+
+    def _is_text_mime_type(self, mime_type: str) -> bool:
+        """
+        Check if a MIME type represents text content that can be safely encoded as UTF-8.
+
+        Args:
+            mime_type: The MIME type to check
+
+        Returns:
+            bool: True if it's a text type, False if binary
+        """
+        if not mime_type:
+            return False
+
+        text_types = [
+            "text/",
+            "application/json",
+            "application/xml",
+            "application/javascript",
+            "application/csv",
+            "application/sql",
+        ]
+
+        return any(mime_type.startswith(t) for t in text_types)
+
+    def _should_include_file(self, file_path: str, include: Optional[List[str]], exclude: Optional[List[str]]) -> bool:
+        """
+        Determine if a file should be included based on include/exclude patterns.
+
+        Logic:
+        1. If include is specified, file must match at least one include pattern
+        2. If exclude is specified, file must not match any exclude pattern
+        3. If neither specified, include all files
+
+        Args:
+            file_path: Path to the file to check
+            include: Optional list of include patterns (glob-style)
+            exclude: Optional list of exclude patterns (glob-style)
+
+        Returns:
+            bool: True if file should be included, False otherwise
+        """
+        import fnmatch
+
+        # If include patterns specified, file must match at least one
+        if include:
+            if not any(fnmatch.fnmatch(file_path, pattern) for pattern in include):
+                return False
+
+        # If exclude patterns specified, file must not match any
+        if exclude:
+            if any(fnmatch.fnmatch(file_path, pattern) for pattern in exclude):
+                return False
+
+        return True
+
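A quick worked example of the include/exclude semantics above, using fnmatch-style globs; the paths and patterns are illustrative, and note that fnmatch's "*" also matches across "/".

    knowledge._should_include_file("guides/setup.md", ["*.md"], None)        # True: "*" crosses "/"
    knowledge._should_include_file("drafts/old.md", ["*.md"], ["drafts/*"])  # False: hit by the exclude pattern
    knowledge._should_include_file("notes.txt", ["*.md"], None)              # False: no include pattern matches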
+    def _get_reader(self, reader_type: str) -> Optional[Reader]:
+        """Get a cached reader or create it if not cached, handling missing dependencies gracefully."""
+        if self.readers is None:
+            self.readers = {}
+
+        if reader_type not in self.readers:
+            try:
+                reader = ReaderFactory.create_reader(reader_type)
+                if reader:
+                    self.readers[reader_type] = reader
+                else:
+                    return None
+
+            except Exception as e:
+                log_warning(f"Cannot create {reader_type} reader: {e}")
+                return None
+
+        return self.readers.get(reader_type)
+
+    @property
+    def pdf_reader(self) -> Optional[Reader]:
+        """PDF reader - lazy loaded via factory."""
+        return self._get_reader("pdf")
+
+    @property
+    def csv_reader(self) -> Optional[Reader]:
+        """CSV reader - lazy loaded via factory."""
+        return self._get_reader("csv")
+
+    @property
+    def docx_reader(self) -> Optional[Reader]:
+        """Docx reader - lazy loaded via factory."""
+        return self._get_reader("docx")
+
+    @property
+    def json_reader(self) -> Optional[Reader]:
+        """JSON reader - lazy loaded via factory."""
+        return self._get_reader("json")
+
+    @property
+    def markdown_reader(self) -> Optional[Reader]:
+        """Markdown reader - lazy loaded via factory."""
+        return self._get_reader("markdown")
+
+    @property
+    def text_reader(self) -> Optional[Reader]:
+        """Text reader - lazy loaded via factory."""
+        return self._get_reader("text")
+
+    @property
+    def website_reader(self) -> Optional[Reader]:
+        """Website reader - lazy loaded via factory."""
+        return self._get_reader("website")
+
+    @property
+    def firecrawl_reader(self) -> Optional[Reader]:
+        """Firecrawl reader - lazy loaded via factory."""
+        return self._get_reader("firecrawl")
+
+    @property
+    def youtube_reader(self) -> Optional[Reader]:
+        """YouTube reader - lazy loaded via factory."""
+        return self._get_reader("youtube")