agno 1.8.1__py3-none-any.whl → 2.0.0a1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (580)
  1. agno/__init__.py +8 -0
  2. agno/agent/__init__.py +19 -27
  3. agno/agent/agent.py +2778 -4123
  4. agno/api/agent.py +9 -65
  5. agno/api/api.py +5 -46
  6. agno/api/evals.py +6 -17
  7. agno/api/os.py +17 -0
  8. agno/api/routes.py +6 -41
  9. agno/api/schemas/__init__.py +9 -0
  10. agno/api/schemas/agent.py +5 -21
  11. agno/api/schemas/evals.py +7 -16
  12. agno/api/schemas/os.py +14 -0
  13. agno/api/schemas/team.py +5 -21
  14. agno/api/schemas/utils.py +21 -0
  15. agno/api/schemas/workflows.py +11 -7
  16. agno/api/settings.py +53 -0
  17. agno/api/team.py +9 -64
  18. agno/api/workflow.py +28 -0
  19. agno/cloud/aws/base.py +214 -0
  20. agno/cloud/aws/s3/__init__.py +2 -0
  21. agno/cloud/aws/s3/api_client.py +43 -0
  22. agno/cloud/aws/s3/bucket.py +195 -0
  23. agno/cloud/aws/s3/object.py +57 -0
  24. agno/db/__init__.py +24 -0
  25. agno/db/base.py +245 -0
  26. agno/db/dynamo/__init__.py +3 -0
  27. agno/db/dynamo/dynamo.py +1749 -0
  28. agno/db/dynamo/schemas.py +278 -0
  29. agno/db/dynamo/utils.py +684 -0
  30. agno/db/firestore/__init__.py +3 -0
  31. agno/db/firestore/firestore.py +1438 -0
  32. agno/db/firestore/schemas.py +130 -0
  33. agno/db/firestore/utils.py +278 -0
  34. agno/db/gcs_json/__init__.py +3 -0
  35. agno/db/gcs_json/gcs_json_db.py +1001 -0
  36. agno/db/gcs_json/utils.py +194 -0
  37. agno/db/in_memory/__init__.py +3 -0
  38. agno/db/in_memory/in_memory_db.py +888 -0
  39. agno/db/in_memory/utils.py +172 -0
  40. agno/db/json/__init__.py +3 -0
  41. agno/db/json/json_db.py +1051 -0
  42. agno/db/json/utils.py +196 -0
  43. agno/db/migrations/v1_to_v2.py +162 -0
  44. agno/db/mongo/__init__.py +3 -0
  45. agno/db/mongo/mongo.py +1417 -0
  46. agno/db/mongo/schemas.py +77 -0
  47. agno/db/mongo/utils.py +204 -0
  48. agno/db/mysql/__init__.py +3 -0
  49. agno/db/mysql/mysql.py +1719 -0
  50. agno/db/mysql/schemas.py +124 -0
  51. agno/db/mysql/utils.py +298 -0
  52. agno/db/postgres/__init__.py +3 -0
  53. agno/db/postgres/postgres.py +1720 -0
  54. agno/db/postgres/schemas.py +124 -0
  55. agno/db/postgres/utils.py +281 -0
  56. agno/db/redis/__init__.py +3 -0
  57. agno/db/redis/redis.py +1371 -0
  58. agno/db/redis/schemas.py +109 -0
  59. agno/db/redis/utils.py +288 -0
  60. agno/db/schemas/__init__.py +3 -0
  61. agno/db/schemas/evals.py +33 -0
  62. agno/db/schemas/knowledge.py +40 -0
  63. agno/db/schemas/memory.py +46 -0
  64. agno/db/singlestore/__init__.py +3 -0
  65. agno/db/singlestore/schemas.py +116 -0
  66. agno/db/singlestore/singlestore.py +1722 -0
  67. agno/db/singlestore/utils.py +327 -0
  68. agno/db/sqlite/__init__.py +3 -0
  69. agno/db/sqlite/schemas.py +119 -0
  70. agno/db/sqlite/sqlite.py +1680 -0
  71. agno/db/sqlite/utils.py +269 -0
  72. agno/db/utils.py +88 -0
  73. agno/eval/__init__.py +14 -0
  74. agno/eval/accuracy.py +142 -43
  75. agno/eval/performance.py +88 -23
  76. agno/eval/reliability.py +73 -20
  77. agno/eval/utils.py +23 -13
  78. agno/integrations/discord/__init__.py +3 -0
  79. agno/{app → integrations}/discord/client.py +10 -10
  80. agno/knowledge/__init__.py +2 -2
  81. agno/{document → knowledge}/chunking/agentic.py +2 -2
  82. agno/{document → knowledge}/chunking/document.py +2 -2
  83. agno/{document → knowledge}/chunking/fixed.py +3 -3
  84. agno/{document → knowledge}/chunking/markdown.py +2 -2
  85. agno/{document → knowledge}/chunking/recursive.py +2 -2
  86. agno/{document → knowledge}/chunking/row.py +2 -2
  87. agno/knowledge/chunking/semantic.py +59 -0
  88. agno/knowledge/chunking/strategy.py +121 -0
  89. agno/knowledge/content.py +74 -0
  90. agno/knowledge/document/__init__.py +5 -0
  91. agno/{document → knowledge/document}/base.py +12 -2
  92. agno/knowledge/embedder/__init__.py +5 -0
  93. agno/{embedder → knowledge/embedder}/aws_bedrock.py +127 -1
  94. agno/{embedder → knowledge/embedder}/azure_openai.py +65 -1
  95. agno/{embedder → knowledge/embedder}/base.py +6 -0
  96. agno/{embedder → knowledge/embedder}/cohere.py +72 -1
  97. agno/{embedder → knowledge/embedder}/fastembed.py +17 -1
  98. agno/{embedder → knowledge/embedder}/fireworks.py +1 -1
  99. agno/{embedder → knowledge/embedder}/google.py +74 -1
  100. agno/{embedder → knowledge/embedder}/huggingface.py +36 -2
  101. agno/{embedder → knowledge/embedder}/jina.py +48 -2
  102. agno/knowledge/embedder/langdb.py +22 -0
  103. agno/knowledge/embedder/mistral.py +139 -0
  104. agno/{embedder → knowledge/embedder}/nebius.py +1 -1
  105. agno/{embedder → knowledge/embedder}/ollama.py +54 -3
  106. agno/knowledge/embedder/openai.py +223 -0
  107. agno/{embedder → knowledge/embedder}/sentence_transformer.py +16 -1
  108. agno/{embedder → knowledge/embedder}/together.py +1 -1
  109. agno/{embedder → knowledge/embedder}/voyageai.py +49 -1
  110. agno/knowledge/knowledge.py +1515 -0
  111. agno/knowledge/reader/__init__.py +7 -0
  112. agno/{document → knowledge}/reader/arxiv_reader.py +32 -4
  113. agno/knowledge/reader/base.py +88 -0
  114. agno/{document → knowledge}/reader/csv_reader.py +68 -15
  115. agno/knowledge/reader/docx_reader.py +83 -0
  116. agno/{document → knowledge}/reader/firecrawl_reader.py +42 -21
  117. agno/knowledge/reader/gcs_reader.py +67 -0
  118. agno/{document → knowledge}/reader/json_reader.py +30 -9
  119. agno/{document → knowledge}/reader/markdown_reader.py +36 -9
  120. agno/{document → knowledge}/reader/pdf_reader.py +79 -21
  121. agno/knowledge/reader/reader_factory.py +275 -0
  122. agno/knowledge/reader/s3_reader.py +171 -0
  123. agno/{document → knowledge}/reader/text_reader.py +31 -10
  124. agno/knowledge/reader/url_reader.py +84 -0
  125. agno/knowledge/reader/web_search_reader.py +389 -0
  126. agno/{document → knowledge}/reader/website_reader.py +37 -10
  127. agno/knowledge/reader/wikipedia_reader.py +59 -0
  128. agno/knowledge/reader/youtube_reader.py +78 -0
  129. agno/knowledge/remote_content/remote_content.py +88 -0
  130. agno/{reranker → knowledge/reranker}/base.py +1 -1
  131. agno/{reranker → knowledge/reranker}/cohere.py +2 -2
  132. agno/{reranker → knowledge/reranker}/infinity.py +2 -2
  133. agno/{reranker → knowledge/reranker}/sentence_transformer.py +2 -2
  134. agno/knowledge/types.py +30 -0
  135. agno/knowledge/utils.py +169 -0
  136. agno/memory/__init__.py +2 -10
  137. agno/memory/manager.py +1003 -148
  138. agno/models/aimlapi/__init__.py +2 -2
  139. agno/models/aimlapi/aimlapi.py +6 -6
  140. agno/models/anthropic/claude.py +129 -82
  141. agno/models/aws/bedrock.py +107 -175
  142. agno/models/aws/claude.py +64 -18
  143. agno/models/azure/ai_foundry.py +73 -23
  144. agno/models/base.py +347 -287
  145. agno/models/cerebras/cerebras.py +84 -27
  146. agno/models/cohere/chat.py +106 -98
  147. agno/models/google/gemini.py +100 -42
  148. agno/models/groq/groq.py +97 -35
  149. agno/models/huggingface/huggingface.py +92 -27
  150. agno/models/ibm/watsonx.py +72 -13
  151. agno/models/litellm/chat.py +85 -13
  152. agno/models/message.py +38 -144
  153. agno/models/meta/llama.py +85 -49
  154. agno/models/metrics.py +120 -0
  155. agno/models/mistral/mistral.py +90 -21
  156. agno/models/ollama/__init__.py +0 -2
  157. agno/models/ollama/chat.py +84 -46
  158. agno/models/openai/chat.py +121 -23
  159. agno/models/openai/responses.py +178 -105
  160. agno/models/perplexity/perplexity.py +26 -2
  161. agno/models/portkey/portkey.py +0 -7
  162. agno/models/response.py +14 -8
  163. agno/models/utils.py +20 -0
  164. agno/models/vercel/__init__.py +2 -2
  165. agno/models/vercel/v0.py +1 -1
  166. agno/models/vllm/__init__.py +2 -2
  167. agno/models/vllm/vllm.py +3 -3
  168. agno/models/xai/xai.py +10 -10
  169. agno/os/__init__.py +3 -0
  170. agno/os/app.py +393 -0
  171. agno/os/auth.py +47 -0
  172. agno/os/config.py +103 -0
  173. agno/os/interfaces/agui/__init__.py +3 -0
  174. agno/os/interfaces/agui/agui.py +31 -0
  175. agno/{app/agui/async_router.py → os/interfaces/agui/router.py} +16 -16
  176. agno/{app → os/interfaces}/agui/utils.py +65 -28
  177. agno/os/interfaces/base.py +21 -0
  178. agno/os/interfaces/slack/__init__.py +3 -0
  179. agno/{app/slack/async_router.py → os/interfaces/slack/router.py} +3 -5
  180. agno/os/interfaces/slack/slack.py +33 -0
  181. agno/os/interfaces/whatsapp/__init__.py +3 -0
  182. agno/{app/whatsapp/async_router.py → os/interfaces/whatsapp/router.py} +4 -7
  183. agno/os/interfaces/whatsapp/whatsapp.py +30 -0
  184. agno/os/router.py +843 -0
  185. agno/os/routers/__init__.py +3 -0
  186. agno/os/routers/evals/__init__.py +3 -0
  187. agno/os/routers/evals/evals.py +204 -0
  188. agno/os/routers/evals/schemas.py +142 -0
  189. agno/os/routers/evals/utils.py +161 -0
  190. agno/os/routers/knowledge/__init__.py +3 -0
  191. agno/os/routers/knowledge/knowledge.py +413 -0
  192. agno/os/routers/knowledge/schemas.py +118 -0
  193. agno/os/routers/memory/__init__.py +3 -0
  194. agno/os/routers/memory/memory.py +179 -0
  195. agno/os/routers/memory/schemas.py +58 -0
  196. agno/os/routers/metrics/__init__.py +3 -0
  197. agno/os/routers/metrics/metrics.py +58 -0
  198. agno/os/routers/metrics/schemas.py +47 -0
  199. agno/os/routers/session/__init__.py +3 -0
  200. agno/os/routers/session/session.py +163 -0
  201. agno/os/schema.py +892 -0
  202. agno/{app/playground → os}/settings.py +8 -15
  203. agno/os/utils.py +270 -0
  204. agno/reasoning/azure_ai_foundry.py +4 -4
  205. agno/reasoning/deepseek.py +4 -4
  206. agno/reasoning/default.py +6 -11
  207. agno/reasoning/groq.py +4 -4
  208. agno/reasoning/helpers.py +4 -6
  209. agno/reasoning/ollama.py +4 -4
  210. agno/reasoning/openai.py +4 -4
  211. agno/run/{response.py → agent.py} +144 -72
  212. agno/run/base.py +44 -58
  213. agno/run/cancel.py +83 -0
  214. agno/run/team.py +133 -77
  215. agno/run/workflow.py +537 -12
  216. agno/session/__init__.py +10 -0
  217. agno/session/agent.py +244 -0
  218. agno/session/summary.py +225 -0
  219. agno/session/team.py +262 -0
  220. agno/{storage/session/v2 → session}/workflow.py +47 -24
  221. agno/team/__init__.py +15 -16
  222. agno/team/team.py +2961 -4253
  223. agno/tools/agentql.py +14 -5
  224. agno/tools/airflow.py +9 -4
  225. agno/tools/api.py +7 -3
  226. agno/tools/apify.py +2 -46
  227. agno/tools/arxiv.py +8 -3
  228. agno/tools/aws_lambda.py +7 -5
  229. agno/tools/aws_ses.py +7 -1
  230. agno/tools/baidusearch.py +4 -1
  231. agno/tools/bitbucket.py +4 -4
  232. agno/tools/brandfetch.py +14 -11
  233. agno/tools/bravesearch.py +4 -1
  234. agno/tools/brightdata.py +42 -22
  235. agno/tools/browserbase.py +13 -4
  236. agno/tools/calcom.py +12 -10
  237. agno/tools/calculator.py +10 -27
  238. agno/tools/cartesia.py +18 -13
  239. agno/tools/{clickup_tool.py → clickup.py} +12 -25
  240. agno/tools/confluence.py +8 -8
  241. agno/tools/crawl4ai.py +7 -1
  242. agno/tools/csv_toolkit.py +9 -8
  243. agno/tools/dalle.py +18 -11
  244. agno/tools/daytona.py +13 -16
  245. agno/tools/decorator.py +6 -3
  246. agno/tools/desi_vocal.py +16 -7
  247. agno/tools/discord.py +11 -8
  248. agno/tools/docker.py +30 -42
  249. agno/tools/duckdb.py +34 -53
  250. agno/tools/duckduckgo.py +8 -7
  251. agno/tools/e2b.py +61 -61
  252. agno/tools/eleven_labs.py +35 -28
  253. agno/tools/email.py +4 -1
  254. agno/tools/evm.py +7 -1
  255. agno/tools/exa.py +19 -14
  256. agno/tools/fal.py +29 -29
  257. agno/tools/file.py +9 -8
  258. agno/tools/financial_datasets.py +25 -44
  259. agno/tools/firecrawl.py +22 -22
  260. agno/tools/function.py +68 -17
  261. agno/tools/giphy.py +22 -10
  262. agno/tools/github.py +48 -126
  263. agno/tools/gmail.py +45 -61
  264. agno/tools/google_bigquery.py +7 -6
  265. agno/tools/google_maps.py +11 -26
  266. agno/tools/googlesearch.py +7 -2
  267. agno/tools/googlesheets.py +21 -17
  268. agno/tools/hackernews.py +9 -5
  269. agno/tools/jina.py +5 -4
  270. agno/tools/jira.py +18 -9
  271. agno/tools/knowledge.py +31 -32
  272. agno/tools/linear.py +18 -33
  273. agno/tools/linkup.py +5 -1
  274. agno/tools/local_file_system.py +8 -5
  275. agno/tools/lumalab.py +31 -19
  276. agno/tools/mem0.py +18 -12
  277. agno/tools/memori.py +14 -10
  278. agno/tools/mlx_transcribe.py +3 -2
  279. agno/tools/models/azure_openai.py +32 -14
  280. agno/tools/models/gemini.py +58 -31
  281. agno/tools/models/groq.py +29 -20
  282. agno/tools/models/nebius.py +27 -11
  283. agno/tools/models_labs.py +39 -15
  284. agno/tools/moviepy_video.py +7 -6
  285. agno/tools/neo4j.py +10 -8
  286. agno/tools/newspaper.py +7 -2
  287. agno/tools/newspaper4k.py +8 -3
  288. agno/tools/openai.py +57 -26
  289. agno/tools/openbb.py +12 -11
  290. agno/tools/opencv.py +62 -46
  291. agno/tools/openweather.py +14 -12
  292. agno/tools/pandas.py +11 -3
  293. agno/tools/postgres.py +4 -12
  294. agno/tools/pubmed.py +4 -1
  295. agno/tools/python.py +9 -22
  296. agno/tools/reasoning.py +35 -27
  297. agno/tools/reddit.py +11 -26
  298. agno/tools/replicate.py +54 -41
  299. agno/tools/resend.py +4 -1
  300. agno/tools/scrapegraph.py +15 -14
  301. agno/tools/searxng.py +10 -23
  302. agno/tools/serpapi.py +6 -3
  303. agno/tools/serper.py +13 -4
  304. agno/tools/shell.py +9 -2
  305. agno/tools/slack.py +12 -11
  306. agno/tools/sleep.py +3 -2
  307. agno/tools/spider.py +24 -4
  308. agno/tools/sql.py +7 -6
  309. agno/tools/tavily.py +6 -4
  310. agno/tools/telegram.py +12 -4
  311. agno/tools/todoist.py +11 -31
  312. agno/tools/toolkit.py +1 -1
  313. agno/tools/trafilatura.py +22 -6
  314. agno/tools/trello.py +9 -22
  315. agno/tools/twilio.py +10 -3
  316. agno/tools/user_control_flow.py +6 -1
  317. agno/tools/valyu.py +34 -5
  318. agno/tools/visualization.py +19 -28
  319. agno/tools/webbrowser.py +4 -3
  320. agno/tools/webex.py +11 -7
  321. agno/tools/website.py +15 -46
  322. agno/tools/webtools.py +12 -4
  323. agno/tools/whatsapp.py +5 -9
  324. agno/tools/wikipedia.py +20 -13
  325. agno/tools/x.py +14 -13
  326. agno/tools/yfinance.py +13 -40
  327. agno/tools/youtube.py +26 -20
  328. agno/tools/zendesk.py +7 -2
  329. agno/tools/zep.py +10 -7
  330. agno/tools/zoom.py +10 -9
  331. agno/utils/common.py +1 -19
  332. agno/utils/events.py +95 -118
  333. agno/utils/knowledge.py +29 -0
  334. agno/utils/log.py +2 -2
  335. agno/utils/mcp.py +11 -5
  336. agno/utils/media.py +39 -0
  337. agno/utils/message.py +12 -1
  338. agno/utils/models/claude.py +6 -4
  339. agno/utils/models/mistral.py +8 -7
  340. agno/utils/models/schema_utils.py +3 -3
  341. agno/utils/pprint.py +33 -32
  342. agno/utils/print_response/agent.py +779 -0
  343. agno/utils/print_response/team.py +1565 -0
  344. agno/utils/print_response/workflow.py +1451 -0
  345. agno/utils/prompts.py +14 -14
  346. agno/utils/reasoning.py +87 -0
  347. agno/utils/response.py +42 -42
  348. agno/utils/string.py +8 -22
  349. agno/utils/team.py +50 -0
  350. agno/utils/timer.py +2 -2
  351. agno/vectordb/base.py +33 -21
  352. agno/vectordb/cassandra/cassandra.py +287 -23
  353. agno/vectordb/chroma/chromadb.py +482 -59
  354. agno/vectordb/clickhouse/clickhousedb.py +270 -63
  355. agno/vectordb/couchbase/couchbase.py +309 -29
  356. agno/vectordb/lancedb/lance_db.py +360 -21
  357. agno/vectordb/langchaindb/__init__.py +5 -0
  358. agno/vectordb/langchaindb/langchaindb.py +145 -0
  359. agno/vectordb/lightrag/__init__.py +5 -0
  360. agno/vectordb/lightrag/lightrag.py +374 -0
  361. agno/vectordb/llamaindex/llamaindexdb.py +127 -0
  362. agno/vectordb/milvus/milvus.py +242 -32
  363. agno/vectordb/mongodb/mongodb.py +200 -24
  364. agno/vectordb/pgvector/pgvector.py +319 -37
  365. agno/vectordb/pineconedb/pineconedb.py +221 -27
  366. agno/vectordb/qdrant/qdrant.py +334 -14
  367. agno/vectordb/singlestore/singlestore.py +286 -29
  368. agno/vectordb/surrealdb/surrealdb.py +187 -7
  369. agno/vectordb/upstashdb/upstashdb.py +342 -26
  370. agno/vectordb/weaviate/weaviate.py +227 -165
  371. agno/workflow/__init__.py +17 -13
  372. agno/workflow/{v2/condition.py → condition.py} +135 -32
  373. agno/workflow/{v2/loop.py → loop.py} +115 -28
  374. agno/workflow/{v2/parallel.py → parallel.py} +138 -108
  375. agno/workflow/{v2/router.py → router.py} +133 -32
  376. agno/workflow/{v2/step.py → step.py} +200 -42
  377. agno/workflow/{v2/steps.py → steps.py} +147 -66
  378. agno/workflow/types.py +482 -0
  379. agno/workflow/workflow.py +2394 -696
  380. agno-2.0.0a1.dist-info/METADATA +355 -0
  381. agno-2.0.0a1.dist-info/RECORD +514 -0
  382. agno/agent/metrics.py +0 -107
  383. agno/api/app.py +0 -35
  384. agno/api/playground.py +0 -92
  385. agno/api/schemas/app.py +0 -12
  386. agno/api/schemas/playground.py +0 -22
  387. agno/api/schemas/user.py +0 -35
  388. agno/api/schemas/workspace.py +0 -46
  389. agno/api/user.py +0 -160
  390. agno/api/workflows.py +0 -33
  391. agno/api/workspace.py +0 -175
  392. agno/app/agui/__init__.py +0 -3
  393. agno/app/agui/app.py +0 -17
  394. agno/app/agui/sync_router.py +0 -120
  395. agno/app/base.py +0 -186
  396. agno/app/discord/__init__.py +0 -3
  397. agno/app/fastapi/__init__.py +0 -3
  398. agno/app/fastapi/app.py +0 -107
  399. agno/app/fastapi/async_router.py +0 -457
  400. agno/app/fastapi/sync_router.py +0 -448
  401. agno/app/playground/app.py +0 -228
  402. agno/app/playground/async_router.py +0 -1050
  403. agno/app/playground/deploy.py +0 -249
  404. agno/app/playground/operator.py +0 -183
  405. agno/app/playground/schemas.py +0 -220
  406. agno/app/playground/serve.py +0 -55
  407. agno/app/playground/sync_router.py +0 -1042
  408. agno/app/playground/utils.py +0 -46
  409. agno/app/settings.py +0 -15
  410. agno/app/slack/__init__.py +0 -3
  411. agno/app/slack/app.py +0 -19
  412. agno/app/slack/sync_router.py +0 -92
  413. agno/app/utils.py +0 -54
  414. agno/app/whatsapp/__init__.py +0 -3
  415. agno/app/whatsapp/app.py +0 -15
  416. agno/app/whatsapp/sync_router.py +0 -197
  417. agno/cli/auth_server.py +0 -249
  418. agno/cli/config.py +0 -274
  419. agno/cli/console.py +0 -88
  420. agno/cli/credentials.py +0 -23
  421. agno/cli/entrypoint.py +0 -571
  422. agno/cli/operator.py +0 -357
  423. agno/cli/settings.py +0 -96
  424. agno/cli/ws/ws_cli.py +0 -817
  425. agno/constants.py +0 -13
  426. agno/document/__init__.py +0 -5
  427. agno/document/chunking/semantic.py +0 -45
  428. agno/document/chunking/strategy.py +0 -31
  429. agno/document/reader/__init__.py +0 -5
  430. agno/document/reader/base.py +0 -47
  431. agno/document/reader/docx_reader.py +0 -60
  432. agno/document/reader/gcs/pdf_reader.py +0 -44
  433. agno/document/reader/s3/pdf_reader.py +0 -59
  434. agno/document/reader/s3/text_reader.py +0 -63
  435. agno/document/reader/url_reader.py +0 -59
  436. agno/document/reader/youtube_reader.py +0 -58
  437. agno/embedder/__init__.py +0 -5
  438. agno/embedder/langdb.py +0 -80
  439. agno/embedder/mistral.py +0 -82
  440. agno/embedder/openai.py +0 -78
  441. agno/file/__init__.py +0 -5
  442. agno/file/file.py +0 -16
  443. agno/file/local/csv.py +0 -32
  444. agno/file/local/txt.py +0 -19
  445. agno/infra/app.py +0 -240
  446. agno/infra/base.py +0 -144
  447. agno/infra/context.py +0 -20
  448. agno/infra/db_app.py +0 -52
  449. agno/infra/resource.py +0 -205
  450. agno/infra/resources.py +0 -55
  451. agno/knowledge/agent.py +0 -702
  452. agno/knowledge/arxiv.py +0 -33
  453. agno/knowledge/combined.py +0 -36
  454. agno/knowledge/csv.py +0 -144
  455. agno/knowledge/csv_url.py +0 -124
  456. agno/knowledge/document.py +0 -223
  457. agno/knowledge/docx.py +0 -137
  458. agno/knowledge/firecrawl.py +0 -34
  459. agno/knowledge/gcs/__init__.py +0 -0
  460. agno/knowledge/gcs/base.py +0 -39
  461. agno/knowledge/gcs/pdf.py +0 -125
  462. agno/knowledge/json.py +0 -137
  463. agno/knowledge/langchain.py +0 -71
  464. agno/knowledge/light_rag.py +0 -273
  465. agno/knowledge/llamaindex.py +0 -66
  466. agno/knowledge/markdown.py +0 -154
  467. agno/knowledge/pdf.py +0 -164
  468. agno/knowledge/pdf_bytes.py +0 -42
  469. agno/knowledge/pdf_url.py +0 -148
  470. agno/knowledge/s3/__init__.py +0 -0
  471. agno/knowledge/s3/base.py +0 -64
  472. agno/knowledge/s3/pdf.py +0 -33
  473. agno/knowledge/s3/text.py +0 -34
  474. agno/knowledge/text.py +0 -141
  475. agno/knowledge/url.py +0 -46
  476. agno/knowledge/website.py +0 -179
  477. agno/knowledge/wikipedia.py +0 -32
  478. agno/knowledge/youtube.py +0 -35
  479. agno/memory/agent.py +0 -423
  480. agno/memory/classifier.py +0 -104
  481. agno/memory/db/__init__.py +0 -5
  482. agno/memory/db/base.py +0 -42
  483. agno/memory/db/mongodb.py +0 -189
  484. agno/memory/db/postgres.py +0 -203
  485. agno/memory/db/sqlite.py +0 -193
  486. agno/memory/memory.py +0 -22
  487. agno/memory/row.py +0 -36
  488. agno/memory/summarizer.py +0 -201
  489. agno/memory/summary.py +0 -19
  490. agno/memory/team.py +0 -415
  491. agno/memory/v2/__init__.py +0 -2
  492. agno/memory/v2/db/__init__.py +0 -1
  493. agno/memory/v2/db/base.py +0 -42
  494. agno/memory/v2/db/firestore.py +0 -339
  495. agno/memory/v2/db/mongodb.py +0 -196
  496. agno/memory/v2/db/postgres.py +0 -214
  497. agno/memory/v2/db/redis.py +0 -187
  498. agno/memory/v2/db/schema.py +0 -54
  499. agno/memory/v2/db/sqlite.py +0 -209
  500. agno/memory/v2/manager.py +0 -437
  501. agno/memory/v2/memory.py +0 -1097
  502. agno/memory/v2/schema.py +0 -55
  503. agno/memory/v2/summarizer.py +0 -215
  504. agno/memory/workflow.py +0 -38
  505. agno/models/ollama/tools.py +0 -430
  506. agno/models/qwen/__init__.py +0 -5
  507. agno/playground/__init__.py +0 -10
  508. agno/playground/deploy.py +0 -3
  509. agno/playground/playground.py +0 -3
  510. agno/playground/serve.py +0 -3
  511. agno/playground/settings.py +0 -3
  512. agno/reranker/__init__.py +0 -0
  513. agno/run/v2/__init__.py +0 -0
  514. agno/run/v2/workflow.py +0 -567
  515. agno/storage/__init__.py +0 -0
  516. agno/storage/agent/__init__.py +0 -0
  517. agno/storage/agent/dynamodb.py +0 -1
  518. agno/storage/agent/json.py +0 -1
  519. agno/storage/agent/mongodb.py +0 -1
  520. agno/storage/agent/postgres.py +0 -1
  521. agno/storage/agent/singlestore.py +0 -1
  522. agno/storage/agent/sqlite.py +0 -1
  523. agno/storage/agent/yaml.py +0 -1
  524. agno/storage/base.py +0 -60
  525. agno/storage/dynamodb.py +0 -673
  526. agno/storage/firestore.py +0 -297
  527. agno/storage/gcs_json.py +0 -261
  528. agno/storage/in_memory.py +0 -234
  529. agno/storage/json.py +0 -237
  530. agno/storage/mongodb.py +0 -328
  531. agno/storage/mysql.py +0 -685
  532. agno/storage/postgres.py +0 -682
  533. agno/storage/redis.py +0 -336
  534. agno/storage/session/__init__.py +0 -16
  535. agno/storage/session/agent.py +0 -64
  536. agno/storage/session/team.py +0 -63
  537. agno/storage/session/v2/__init__.py +0 -5
  538. agno/storage/session/workflow.py +0 -61
  539. agno/storage/singlestore.py +0 -606
  540. agno/storage/sqlite.py +0 -646
  541. agno/storage/workflow/__init__.py +0 -0
  542. agno/storage/workflow/mongodb.py +0 -1
  543. agno/storage/workflow/postgres.py +0 -1
  544. agno/storage/workflow/sqlite.py +0 -1
  545. agno/storage/yaml.py +0 -241
  546. agno/tools/thinking.py +0 -73
  547. agno/utils/defaults.py +0 -57
  548. agno/utils/filesystem.py +0 -39
  549. agno/utils/git.py +0 -52
  550. agno/utils/json_io.py +0 -30
  551. agno/utils/load_env.py +0 -19
  552. agno/utils/py_io.py +0 -19
  553. agno/utils/pyproject.py +0 -18
  554. agno/utils/resource_filter.py +0 -31
  555. agno/workflow/v2/__init__.py +0 -21
  556. agno/workflow/v2/types.py +0 -357
  557. agno/workflow/v2/workflow.py +0 -3312
  558. agno/workspace/__init__.py +0 -0
  559. agno/workspace/config.py +0 -325
  560. agno/workspace/enums.py +0 -6
  561. agno/workspace/helpers.py +0 -52
  562. agno/workspace/operator.py +0 -757
  563. agno/workspace/settings.py +0 -158
  564. agno-1.8.1.dist-info/METADATA +0 -982
  565. agno-1.8.1.dist-info/RECORD +0 -566
  566. agno-1.8.1.dist-info/entry_points.txt +0 -3
  567. /agno/{app → db/migrations}/__init__.py +0 -0
  568. /agno/{app/playground/__init__.py → db/schemas/metrics.py} +0 -0
  569. /agno/{cli → integrations}/__init__.py +0 -0
  570. /agno/{cli/ws → knowledge/chunking}/__init__.py +0 -0
  571. /agno/{document/chunking → knowledge/remote_content}/__init__.py +0 -0
  572. /agno/{document/reader/gcs → knowledge/reranker}/__init__.py +0 -0
  573. /agno/{document/reader/s3 → os/interfaces}/__init__.py +0 -0
  574. /agno/{app → os/interfaces}/slack/security.py +0 -0
  575. /agno/{app → os/interfaces}/whatsapp/security.py +0 -0
  576. /agno/{file/local → utils/print_response}/__init__.py +0 -0
  577. /agno/{infra → vectordb/llamaindex}/__init__.py +0 -0
  578. {agno-1.8.1.dist-info → agno-2.0.0a1.dist-info}/WHEEL +0 -0
  579. {agno-1.8.1.dist-info → agno-2.0.0a1.dist-info}/licenses/LICENSE +0 -0
  580. {agno-1.8.1.dist-info → agno-2.0.0a1.dist-info}/top_level.txt +0 -0
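Most of the reorganization above is module moves rather than rewrites: the brace entries show the old and new locations (for example, agno/{document → knowledge}/reader/pdf_reader.py or agno/workflow/{v2/step.py → step.py}). A rough, hypothetical mapping of the import paths implied by those rename entries is sketched below; only the module paths come from the list above (MODULE_RENAMES itself is illustrative, not part of either package), and the symbols inside those modules may have changed as well.

# Hypothetical helper data, not part of either package: old-to-new module paths
# taken from the rename entries in the file list above.
MODULE_RENAMES = {
    "agno.document.reader.pdf_reader": "agno.knowledge.reader.pdf_reader",
    "agno.embedder.cohere": "agno.knowledge.embedder.cohere",
    "agno.reranker.cohere": "agno.knowledge.reranker.cohere",
    "agno.workflow.v2.step": "agno.workflow.step",
    "agno.run.response": "agno.run.agent",
    "agno.storage.session.v2.workflow": "agno.session.workflow",
}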
agno/knowledge/knowledge.py (new file)
@@ -0,0 +1,1515 @@
1
+ import asyncio
2
+ import hashlib
3
+ import io
4
+ import time
5
+ from dataclasses import dataclass
6
+ from enum import Enum
7
+ from functools import cached_property
8
+ from pathlib import Path
9
+ from typing import Any, Dict, List, Optional, Set, Tuple, Union, cast, overload
10
+ from uuid import uuid4
11
+
12
+ from agno.db.base import BaseDb
13
+ from agno.db.schemas.knowledge import KnowledgeRow
14
+ from agno.knowledge.content import Content, ContentAuth, ContentStatus, FileData
15
+ from agno.knowledge.document import Document
16
+ from agno.knowledge.reader import Reader, ReaderFactory
17
+ from agno.knowledge.remote_content.remote_content import GCSContent, RemoteContent, S3Content
18
+ from agno.utils.log import log_debug, log_error, log_info, log_warning
19
+ from agno.vectordb import VectorDb
20
+
21
+ ContentDict = Dict[str, Union[str, Dict[str, str]]]
22
+
23
+
24
+ class KnowledgeContentOrigin(Enum):
25
+ PATH = "path"
26
+ URL = "url"
27
+ TOPIC = "topic"
28
+ CONTENT = "content"
29
+
30
+
31
+ @dataclass
32
+ class Knowledge:
33
+ """Knowledge class"""
34
+
35
+ name: Optional[str] = None
36
+ description: Optional[str] = None
37
+ vector_db: Optional[VectorDb] = None
38
+ contents_db: Optional[BaseDb] = None
39
+ max_results: int = 10
40
+ readers: Optional[Dict[str, Reader]] = None
41
+
42
+ def __post_init__(self):
43
+ if self.vector_db and not self.vector_db.exists():
44
+ self.vector_db.create()
45
+
46
+ self.construct_readers()
47
+ self.valid_metadata_filters = set()
48
+
49
+ # --- SDK Specific Methods ---
50
+
51
+ # --- Add Contents ---
52
+ @overload
53
+ async def add_contents_async(self, contents: List[ContentDict]) -> None: ...
54
+
55
+ @overload
56
+ async def add_contents_async(
57
+ self,
58
+ *,
59
+ paths: Optional[List[str]] = None,
60
+ urls: Optional[List[str]] = None,
61
+ metadata: Optional[Dict[str, str]] = None,
62
+ include: Optional[List[str]] = None,
63
+ exclude: Optional[List[str]] = None,
64
+ upsert: bool = False,
65
+ skip_if_exists: bool = False,
66
+ remote_content: Optional[RemoteContent] = None,
67
+ ) -> None: ...
68
+
69
+ async def add_contents_async(self, *args, **kwargs) -> None:
70
+ if args and isinstance(args[0], list):
71
+ arguments = args[0]
72
+ for argument in arguments:
73
+ await self.add_content_async(
74
+ name=argument.get("name"),
75
+ description=argument.get("description"),
76
+ path=argument.get("path"),
77
+ url=argument.get("url"),
78
+ metadata=argument.get("metadata"),
79
+ topics=argument.get("topics"),
80
+ reader=argument.get("reader"),
81
+ include=argument.get("include"),
82
+ exclude=argument.get("exclude"),
83
+ upsert=argument.get("upsert", False),
84
+ skip_if_exists=argument.get("skip_if_exists", False),
85
+ remote_content=argument.get("remote_content", None),
86
+ )
87
+
88
+ elif kwargs:
89
+ name = kwargs.get("name", [])
90
+ metadata = kwargs.get("metadata", {})
91
+ description = kwargs.get("description", [])
92
+ topics = kwargs.get("topics", [])
93
+ paths = kwargs.get("paths", [])
94
+ urls = kwargs.get("urls", [])
95
+ include = kwargs.get("include")
96
+ exclude = kwargs.get("exclude")
97
+ upsert = kwargs.get("upsert", False)
98
+ skip_if_exists = kwargs.get("skip_if_exists", False)
99
+ remote_content = kwargs.get("remote_content", None)
100
+
101
+ for path in paths:
102
+ await self.add_content_async(
103
+ name=name,
104
+ description=description,
105
+ path=path,
106
+ metadata=metadata,
107
+ include=include,
108
+ exclude=exclude,
109
+ upsert=upsert,
110
+ skip_if_exists=skip_if_exists,
111
+ )
112
+ for url in urls:
113
+ await self.add_content_async(
114
+ name=name,
115
+ description=description,
116
+ url=url,
117
+ metadata=metadata,
118
+ include=include,
119
+ exclude=exclude,
120
+ upsert=upsert,
121
+ skip_if_exists=skip_if_exists,
122
+ )
123
+ if topics:
124
+ await self.add_content_async(
125
+ name=name,
126
+ description=description,
127
+ topics=topics,
128
+ metadata=metadata,
129
+ include=include,
130
+ exclude=exclude,
131
+ upsert=upsert,
132
+ skip_if_exists=skip_if_exists,
133
+ )
134
+
135
+ if remote_content:
136
+ await self.add_content_async(
137
+ name=name,
138
+ metadata=metadata,
139
+ description=description,
140
+ remote_content=remote_content,
141
+ upsert=upsert,
142
+ skip_if_exists=skip_if_exists,
143
+ )
144
+
145
+ else:
146
+ raise ValueError("Invalid usage of add_contents.")
147
+
148
+ @overload
149
+ def add_contents(self, contents: List[ContentDict]) -> None: ...
150
+
151
+ @overload
152
+ def add_contents(
153
+ self,
154
+ *,
155
+ paths: Optional[List[str]] = None,
156
+ urls: Optional[List[str]] = None,
157
+ metadata: Optional[Dict[str, str]] = None,
158
+ include: Optional[List[str]] = None,
159
+ exclude: Optional[List[str]] = None,
160
+ upsert: bool = False,
161
+ skip_if_exists: bool = False,
162
+ ) -> None: ...
163
+
164
+ def add_contents(self, *args, **kwargs) -> None:
165
+ """
166
+ Synchronously add multiple content items to the knowledge base.
167
+
168
+ This method wraps the asynchronous add_contents method
169
+
170
+ Supports two usage patterns:
171
+ 1. Pass a list of content dictionaries as first argument
172
+ 2. Pass keyword arguments with paths, urls, metadata, etc.
173
+
174
+ Args:
175
+ contents: List of content dictionaries (when used as first overload)
176
+ paths: Optional list of file paths to load content from
177
+ urls: Optional list of URLs to load content from
178
+ metadata: Optional metadata dictionary to apply to all content
179
+ include: Optional list of file patterns to include
180
+ exclude: Optional list of file patterns to exclude
181
+ upsert: Whether to update existing content if it already exists
182
+ skip_if_exists: Whether to skip adding content if it already exists
183
+ """
184
+ asyncio.run(self.add_contents_async(*args, **kwargs))
185
+
186
+ # --- Add Content ---
187
+
188
+ @overload
189
+ async def add_content_async(
190
+ self,
191
+ *,
192
+ path: Optional[str] = None,
193
+ url: Optional[str] = None,
194
+ text_content: Optional[str] = None,
195
+ metadata: Optional[Dict[str, str]] = None,
196
+ include: Optional[List[str]] = None,
197
+ exclude: Optional[List[str]] = None,
198
+ upsert: bool = False,
199
+ skip_if_exists: bool = False,
200
+ reader: Optional[Reader] = None,
201
+ auth: Optional[ContentAuth] = None,
202
+ ) -> None: ...
203
+
204
+ @overload
205
+ async def add_content_async(self, *args, **kwargs) -> None: ...
206
+
207
+ async def add_content_async(
208
+ self,
209
+ name: Optional[str] = None,
210
+ description: Optional[str] = None,
211
+ path: Optional[str] = None,
212
+ url: Optional[str] = None,
213
+ text_content: Optional[str] = None,
214
+ metadata: Optional[Dict[str, Any]] = None,
215
+ topics: Optional[List[str]] = None,
216
+ remote_content: Optional[RemoteContent] = None,
217
+ reader: Optional[Reader] = None,
218
+ include: Optional[List[str]] = None,
219
+ exclude: Optional[List[str]] = None,
220
+ upsert: bool = True,
221
+ skip_if_exists: bool = True,
222
+ auth: Optional[ContentAuth] = None,
223
+ ) -> None:
224
+ # Validation: At least one of the parameters must be provided
225
+ if all(argument is None for argument in [path, url, text_content, topics, remote_content]):
226
+ log_info("At least one of 'path', 'url', 'text_content', 'topics', or 'remote_content' must be provided.")
227
+ return
228
+
229
+ if not skip_if_exists:
230
+ log_info("skip_if_exists is disabled, disabling upsert")
231
+ upsert = False
232
+
233
+ content = None
234
+ file_data = None
235
+ if text_content:
236
+ file_data = FileData(content=text_content, type="Text")
237
+
238
+ content = Content(
239
+ id=str(uuid4()),
240
+ name=name,
241
+ description=description,
242
+ path=path,
243
+ url=url,
244
+ file_data=file_data if file_data else None,
245
+ metadata=metadata,
246
+ topics=topics,
247
+ remote_content=remote_content,
248
+ reader=reader,
249
+ auth=auth,
250
+ )
251
+
252
+ await self._load_content(content, upsert, skip_if_exists, include, exclude)
253
+
254
+ @overload
255
+ def add_content(
256
+ self,
257
+ *,
258
+ path: Optional[str] = None,
259
+ url: Optional[str] = None,
260
+ text_content: Optional[str] = None,
261
+ metadata: Optional[Dict[str, str]] = None,
262
+ include: Optional[List[str]] = None,
263
+ exclude: Optional[List[str]] = None,
264
+ upsert: bool = False,
265
+ skip_if_exists: bool = False,
266
+ reader: Optional[Reader] = None,
267
+ auth: Optional[ContentAuth] = None,
268
+ ) -> None: ...
269
+
270
+ @overload
271
+ def add_content(self, *args, **kwargs) -> None: ...
272
+
273
+ def add_content(
274
+ self,
275
+ name: Optional[str] = None,
276
+ description: Optional[str] = None,
277
+ path: Optional[str] = None,
278
+ url: Optional[str] = None,
279
+ text_content: Optional[str] = None,
280
+ metadata: Optional[Dict[str, Any]] = None,
281
+ topics: Optional[List[str]] = None,
282
+ remote_content: Optional[RemoteContent] = None,
283
+ reader: Optional[Reader] = None,
284
+ include: Optional[List[str]] = None,
285
+ exclude: Optional[List[str]] = None,
286
+ upsert: bool = True,
287
+ skip_if_exists: bool = True,
288
+ auth: Optional[ContentAuth] = None,
289
+ ) -> None:
290
+ """
291
+ Synchronously add content to the knowledge base.
292
+
293
+ Args:
294
+ name: Optional name for the content
295
+ description: Optional description for the content
296
+ path: Optional file path to load content from
297
+ url: Optional URL to load content from
298
+ text_content: Optional text content to add directly
299
+ metadata: Optional metadata dictionary
300
+ topics: Optional list of topics
301
+ config: Optional cloud storage configuration
302
+ reader: Optional custom reader for processing the content
303
+ include: Optional list of file patterns to include
304
+ exclude: Optional list of file patterns to exclude
305
+ upsert: Whether to update existing content if it already exists
306
+ skip_if_exists: Whether to skip adding content if it already exists
307
+ """
308
+ asyncio.run(
309
+ self.add_content_async(
310
+ name=name,
311
+ description=description,
312
+ path=path,
313
+ url=url,
314
+ text_content=text_content,
315
+ metadata=metadata,
316
+ topics=topics,
317
+ remote_content=remote_content,
318
+ reader=reader,
319
+ include=include,
320
+ exclude=exclude,
321
+ upsert=upsert,
322
+ skip_if_exists=skip_if_exists,
323
+ auth=auth,
324
+ )
325
+ )
326
+
327
+ async def _load_from_path(
328
+ self,
329
+ content: Content,
330
+ upsert: bool,
331
+ skip_if_exists: bool,
332
+ include: Optional[List[str]] = None,
333
+ exclude: Optional[List[str]] = None,
334
+ ):
335
+ log_info(f"Adding content from path, {content.id}, {content.name}, {content.path}, {content.description}")
336
+ path = Path(content.path) # type: ignore
337
+
338
+ if path.is_file():
339
+ if self._should_include_file(str(path), include, exclude):
340
+ log_info(f"Adding file {path} due to include/exclude filters")
341
+
342
+ # Handle LightRAG special case - read file and upload directly
343
+ if self.vector_db.__class__.__name__ == "LightRag":
344
+ await self._process_lightrag_content(content, KnowledgeContentOrigin.PATH)
345
+ return
346
+
347
+ content.content_hash = self._build_content_hash(content)
348
+ if self.vector_db and self.vector_db.content_hash_exists(content.content_hash) and skip_if_exists:
349
+ log_info(f"Content {content.content_hash} already exists, skipping")
350
+ return
351
+
352
+ self._add_to_contents_db(content)
353
+
354
+ if content.reader:
355
+ # TODO: We will refactor this to eventually pass authorization to all readers
356
+ import inspect
357
+
358
+ read_signature = inspect.signature(content.reader.read)
359
+ if "password" in read_signature.parameters and content.auth and content.auth.password:
360
+ read_documents = content.reader.read(
361
+ path, name=content.name or path.name, password=content.auth.password
362
+ )
363
+ else:
364
+ read_documents = content.reader.read(path, name=content.name or path.name)
365
+
366
+ else:
367
+ reader = ReaderFactory.get_reader_for_extension(path.suffix)
368
+ log_info(f"Using Reader: {reader.__class__.__name__}")
369
+ if reader:
370
+ # TODO: We will refactor this to eventually pass authorization to all readers
371
+ import inspect
372
+
373
+ read_signature = inspect.signature(reader.read)
374
+ if "password" in read_signature.parameters and content.auth and content.auth.password:
375
+ read_documents = reader.read(
376
+ path, name=content.name or path.name, password=content.auth.password
377
+ )
378
+ else:
379
+ read_documents = reader.read(path, name=content.name or path.name)
380
+
381
+ if not content.file_type:
382
+ content.file_type = path.suffix
383
+
384
+ if not content.size and content.file_data:
385
+ content.size = len(content.file_data.content) # type: ignore
386
+ if not content.size:
387
+ try:
388
+ content.size = path.stat().st_size
389
+ except (OSError, IOError) as e:
390
+ log_warning(f"Could not get file size for {path}: {e}")
391
+ content.size = 0
392
+
393
+ for read_document in read_documents:
394
+ read_document.content_id = content.id
395
+
396
+ await self._handle_vector_db_insert(content, read_documents, upsert)
397
+
398
+ elif path.is_dir():
399
+ for file_path in path.iterdir():
400
+ # Apply include/exclude filtering
401
+ if not self._should_include_file(str(file_path), include, exclude):
402
+ log_debug(f"Skipping file {file_path} due to include/exclude filters")
403
+ continue
404
+
405
+ id = str(uuid4())
406
+ file_content = Content(
407
+ id=id,
408
+ name=content.name,
409
+ path=str(file_path),
410
+ metadata=content.metadata,
411
+ description=content.description,
412
+ reader=content.reader,
413
+ )
414
+ await self._load_from_path(file_content, upsert, skip_if_exists, include, exclude)
415
+ else:
416
+ log_warning(f"Invalid path: {path}")
417
+
418
+ async def _load_from_url(
419
+ self,
420
+ content: Content,
421
+ upsert: bool,
422
+ skip_if_exists: bool,
423
+ ):
424
+ log_info(f"Adding content from URL {content.url}")
425
+ content.file_type = "url"
426
+
427
+ if self.vector_db.__class__.__name__ == "LightRag":
428
+ await self._process_lightrag_content(content, KnowledgeContentOrigin.URL)
429
+ return
430
+
431
+ content.content_hash = self._build_content_hash(content)
432
+ if self.vector_db and self.vector_db.content_hash_exists(content.content_hash) and skip_if_exists:
433
+ log_info(f"Content {content.content_hash} already exists, skipping")
434
+ return
435
+ self._add_to_contents_db(content)
436
+
437
+ # Validate URL
438
+ try:
439
+ from urllib.parse import urlparse
440
+
441
+ parsed_url = urlparse(content.url)
442
+ if not all([parsed_url.scheme, parsed_url.netloc]):
443
+ content.status = ContentStatus.FAILED
444
+ content.status_message = f"Invalid URL format: {content.url}"
445
+ self._update_content(content)
446
+ log_warning(f"Invalid URL format: {content.url}")
447
+ except Exception as e:
448
+ content.status = ContentStatus.FAILED
449
+ content.status_message = f"Invalid URL: {content.url} - {str(e)}"
450
+ self._update_content(content)
451
+ log_warning(f"Invalid URL: {content.url} - {str(e)}")
452
+
453
+ # Determine file type from URL
454
+ url_path = Path(parsed_url.path) # type: ignore
455
+ file_extension = url_path.suffix.lower()
456
+ read_documents = []
457
+ try:
458
+ if content.url.endswith("llms-full.txt") or content.url.endswith("llms.txt"): # type: ignore
459
+ log_info("Detected llms, using url reader")
460
+ reader = content.reader or self.url_reader
461
+ if reader is not None:
462
+ # TODO: We will refactor this to eventually pass authorization to all readers
463
+ import inspect
464
+
465
+ read_signature = inspect.signature(reader.read)
466
+ if "password" in read_signature.parameters and content.auth and content.auth.password:
467
+ read_documents = reader.read(content.url, name=content.name, password=content.auth.password)
468
+ else:
469
+ read_documents = reader.read(content.url, name=content.name)
470
+
471
+ elif file_extension and file_extension is not None:
472
+ log_info(f"Detected file type: {file_extension} from URL: {content.url}")
473
+ if content.reader:
474
+ reader = content.reader
475
+ else:
476
+ reader = self._select_url_file_reader(file_extension)
477
+ if reader is not None:
478
+ log_info(f"Selected reader: {reader.__class__.__name__}")
479
+ # TODO: We will refactor this to eventually pass authorization to all readers
480
+ import inspect
481
+
482
+ read_signature = inspect.signature(reader.read)
483
+ if "password" in read_signature.parameters and content.auth and content.auth.password:
484
+ read_documents = reader.read(content.url, name=content.name, password=content.auth.password)
485
+ else:
486
+ read_documents = reader.read(content.url, name=content.name)
487
+ else:
488
+ log_info(f"No reader found for file extension: {file_extension}")
489
+ else:
490
+ log_info(f"No file extension found for URL: {content.url}, determining website type")
491
+ if content.reader:
492
+ reader = content.reader
493
+ else:
494
+ reader = self._select_url_reader(content.url) # type: ignore
495
+ if reader is not None:
496
+ log_info(f"Selected reader: {reader.__class__.__name__}")
497
+ # TODO: We will refactor this to eventually pass authorization to all readers
498
+ import inspect
499
+
500
+ read_signature = inspect.signature(reader.read)
501
+ if "password" in read_signature.parameters and content.auth and content.auth.password:
502
+ read_documents = reader.read(content.url, name=content.name, password=content.auth.password)
503
+ else:
504
+ read_documents = reader.read(content.url, name=content.name)
505
+ else:
506
+ log_info(f"No reader found for URL: {content.url}")
507
+
508
+ except Exception as e:
509
+ log_error(f"Error reading URL: {content.url} - {str(e)}")
510
+ content.status = ContentStatus.FAILED
511
+ content.status_message = f"Error reading URL: {content.url} - {str(e)}"
512
+ self._update_content(content)
513
+ return
514
+
515
+ file_size = 0
516
+ if read_documents:
517
+ for read_document in read_documents:
518
+ if read_document.size:
519
+ file_size += read_document.size
520
+ read_document.content_id = content.id
521
+
522
+ await self._handle_vector_db_insert(content, read_documents, upsert)
523
+
524
+ async def _load_from_content(
525
+ self,
526
+ content: Content,
527
+ upsert: bool = True,
528
+ skip_if_exists: bool = True,
529
+ ):
530
+ if content.name:
531
+ name = content.name
532
+ elif content.file_data and content.file_data.content:
533
+ if isinstance(content.file_data.content, bytes):
534
+ name = content.file_data.content[:10].decode("utf-8", errors="ignore")
535
+ elif isinstance(content.file_data.content, str):
536
+ name = (
537
+ content.file_data.content[:10]
538
+ if len(content.file_data.content) >= 10
539
+ else content.file_data.content
540
+ )
541
+ else:
542
+ name = str(content.file_data.content)[:10]
543
+ else:
544
+ name = None
545
+
546
+ if name is not None:
547
+ content.name = name
548
+
549
+ log_info(f"Adding content from {content.name}")
550
+
551
+ if content.file_data and self.vector_db.__class__.__name__ == "LightRag":
552
+ await self._process_lightrag_content(content, KnowledgeContentOrigin.CONTENT)
553
+ return
554
+
555
+ content.content_hash = self._build_content_hash(content)
556
+ if self.vector_db and self.vector_db.content_hash_exists(content.content_hash) and skip_if_exists:
557
+ log_info(f"Content {content.content_hash} already exists, skipping")
558
+
559
+ return
560
+ self._add_to_contents_db(content)
561
+
562
+ read_documents = []
563
+
564
+ if isinstance(content.file_data, str):
565
+ try:
566
+ content_bytes = content.file_data.encode("utf-8")
567
+ except UnicodeEncodeError:
568
+ content_bytes = content.file_data.encode("latin-1")
569
+ content_io = io.BytesIO(content_bytes)
570
+
571
+ if content.reader:
572
+ log_info(f"Using reader: {content.reader.__class__.__name__} to read content")
573
+ read_documents = content.reader.read(content_io, name=name)
574
+ else:
575
+ text_reader = self.text_reader
576
+ if text_reader:
577
+ read_documents = text_reader.read(content_io, name=name)
578
+ else:
579
+ content.status = ContentStatus.FAILED
580
+ content.status_message = "Text reader not available"
581
+ self._update_content(content)
582
+ return
583
+
584
+ elif isinstance(content.file_data, FileData):
585
+ if content.file_data.type:
586
+ if isinstance(content.file_data.content, bytes):
587
+ content_io = io.BytesIO(content.file_data.content)
588
+ elif isinstance(content.file_data.content, str):
589
+ if self._is_text_mime_type(content.file_data.type):
590
+ try:
591
+ content_bytes = content.file_data.content.encode("utf-8")
592
+ except UnicodeEncodeError:
593
+ log_debug(f"UTF-8 encoding failed for {content.file_data.type}, using latin-1")
594
+ content_bytes = content.file_data.content.encode("latin-1")
595
+ else:
596
+ content_bytes = content.file_data.content.encode("latin-1")
597
+ content_io = io.BytesIO(content_bytes)
598
+ else:
599
+ content_io = content.file_data.content # type: ignore
600
+
601
+ # Respect an explicitly provided reader; otherwise select based on file type
602
+ if content.reader:
603
+ log_info(f"Using reader: {content.reader.__class__.__name__} to read content")
604
+ reader = content.reader
605
+ else:
606
+ reader = self._select_reader(content.file_data.type)
607
+ name = content.name if content.name else f"content_{content.file_data.type}"
608
+ read_documents = reader.read(content_io, name=name)
609
+
610
+ for read_document in read_documents:
611
+ if content.metadata:
612
+ read_document.meta_data.update(content.metadata)
613
+ read_document.content_id = content.id
614
+
615
+ if len(read_documents) == 0:
616
+ content.status = ContentStatus.FAILED
617
+ content.status_message = "Content could not be read"
618
+ self._update_content(content)
619
+
620
+ else:
621
+ content.status = ContentStatus.FAILED
622
+ content.status_message = "No content provided"
623
+ self._update_content(content)
624
+ return
625
+
626
+ await self._handle_vector_db_insert(content, read_documents, upsert)
627
+
628
+ async def _load_from_topics(
629
+ self,
630
+ content: Content,
631
+ upsert: bool,
632
+ skip_if_exists: bool,
633
+ ):
634
+ log_info(f"Adding content from topics: {content.topics}")
635
+
636
+ if content.topics is None:
637
+ log_warning("No topics provided for content")
638
+ return
639
+
640
+ for topic in content.topics:
641
+ id = str(uuid4())
642
+ content = Content(
643
+ id=id,
644
+ name=topic,
645
+ metadata=content.metadata,
646
+ reader=content.reader,
647
+ status=ContentStatus.PROCESSING if content.reader else ContentStatus.FAILED,
648
+ file_data=FileData(
649
+ type="Topic",
650
+ ),
651
+ topics=[topic],
652
+ )
653
+
654
+ if self.vector_db.__class__.__name__ == "LightRag":
655
+ await self._process_lightrag_content(content, KnowledgeContentOrigin.TOPIC)
656
+ return
657
+
658
+ content.content_hash = self._build_content_hash(content)
659
+ if self.vector_db and self.vector_db.content_hash_exists(content.content_hash) and skip_if_exists:
660
+ log_info(f"Content {content.content_hash} already exists, skipping")
661
+ continue
662
+
663
+ self._add_to_contents_db(content)
664
+ if content.reader is None:
665
+ log_error(f"No reader available for topic: {topic}")
666
+ continue
667
+ read_documents = content.reader.read(topic)
668
+ if len(read_documents) > 0:
669
+ for read_document in read_documents:
670
+ read_document.content_id = id
671
+ if read_document.content:
672
+ read_document.size = len(read_document.content.encode("utf-8"))
673
+ else:
674
+ content.status = ContentStatus.FAILED
675
+ content.status_message = "No content found for topic"
676
+ self._update_content(content)
677
+
678
+ await self._handle_vector_db_insert(content, read_documents, upsert)
679
+
680
+ async def _load_from_remote_content(
681
+ self,
682
+ content: Content,
683
+ upsert: bool,
684
+ skip_if_exists: bool,
685
+ ):
686
+ if content.remote_content is None:
687
+ log_warning("No remote content provided for content")
688
+ return
689
+
690
+ remote_content = content.remote_content
691
+
692
+ if isinstance(remote_content, S3Content):
693
+ await self._load_from_s3(content, upsert, skip_if_exists)
694
+
695
+ elif isinstance(remote_content, GCSContent):
696
+ await self._load_from_gcs(content, upsert, skip_if_exists)
697
+
698
+ else:
699
+ log_warning(f"Unsupported remote content type: {type(remote_content)}")
700
+
701
+ async def _load_from_s3(self, content: Content, upsert: bool, skip_if_exists: bool):
702
+ from agno.aws.resource.s3.object import S3Object # type: ignore
703
+
704
+ if content.reader is None:
705
+ reader = self.s3_reader
706
+ else:
707
+ reader = content.reader
708
+
709
+ if reader is None:
710
+ log_warning("No reader provided for content")
711
+ return
712
+
713
+ remote_content: S3Content = cast(S3Content, content.remote_content)
714
+
715
+ objects_to_read: List[S3Object] = []
716
+
717
+ if remote_content.bucket is not None:
718
+ if remote_content.key is not None:
719
+ _object = S3Object(bucket_name=remote_content.bucket.name, name=remote_content.key)
720
+ objects_to_read.append(_object)
721
+ elif remote_content.object is not None:
722
+ objects_to_read.append(remote_content.object)
723
+ elif remote_content.prefix is not None:
724
+ objects_to_read.extend(remote_content.bucket.get_objects(prefix=remote_content.prefix))
725
+ else:
726
+ objects_to_read.extend(remote_content.bucket.get_objects())
727
+
728
+ for object in objects_to_read:
729
+ id = str(uuid4())
730
+ content_name = content.name or ""
731
+ content_name += "_" + (object.name or "")
732
+ content_entry = Content(
733
+ id=id,
734
+ name=content_name,
735
+ description=content.description,
736
+ status=ContentStatus.PROCESSING,
737
+ metadata=content.metadata,
738
+ file_type="s3",
739
+ )
740
+
741
+ content_hash = self._build_content_hash(content_entry)
742
+ if self.vector_db and self.vector_db.content_hash_exists(content_hash) and skip_if_exists:
743
+ log_info(f"Content {content_hash} already exists, skipping")
744
+ continue
745
+
746
+ self._add_to_contents_db(content_entry)
747
+
748
+ read_documents = reader.read(content_entry.name, object)
749
+
750
+ for read_document in read_documents:
751
+ read_document.content_id = content.id
752
+
753
+ await self._handle_vector_db_insert(content_entry, read_documents, upsert)
754
+
755
+ async def _load_from_gcs(self, content: Content, upsert: bool, skip_if_exists: bool):
756
+ if content.reader is None:
757
+ reader = self.gcs_reader
758
+ else:
759
+ reader = content.reader
760
+
761
+ if reader is None:
762
+ log_warning("No reader provided for content")
763
+ return
764
+
765
+ remote_content: GCSContent = cast(GCSContent, content.remote_content)
766
+ objects_to_read = []
767
+
768
+ if remote_content.blob_name is not None:
769
+ objects_to_read.append(remote_content.bucket.blob(remote_content.blob_name))
770
+ elif remote_content.prefix is not None:
771
+ objects_to_read.extend(remote_content.bucket.list_blobs(prefix=remote_content.prefix))
772
+ else:
773
+ objects_to_read.extend(remote_content.bucket.list_blobs())
774
+
775
+ for object in objects_to_read:
776
+ id = str(uuid4())
777
+ content_entry = Content(
778
+ id=id,
779
+ name=(content.name or "content") + "_" + object.name,
780
+ description=content.description,
781
+ status=ContentStatus.PROCESSING,
782
+ metadata=content.metadata,
783
+ file_type="gcs",
784
+ )
785
+
786
+ content_hash = self._build_content_hash(content_entry)
787
+ if self.vector_db and self.vector_db.content_hash_exists(content_hash) and skip_if_exists:
788
+ log_info(f"Content {content_hash} already exists, skipping")
789
+ continue
790
+
791
+ self._add_to_contents_db(content_entry)
792
+
793
+ read_documents = reader.read(content_entry.name, object)
794
+
795
+ for read_document in read_documents:
796
+ read_document.content_id = content.id
797
+
798
+ await self._handle_vector_db_insert(content_entry, read_documents, upsert)
799
+
800
+ async def _handle_vector_db_insert(self, content, read_documents, upsert):
801
+ if not self.vector_db:
802
+ log_error("No vector database configured")
803
+ content.status = ContentStatus.FAILED
804
+ content.status_message = "No vector database configured"
805
+ self._update_content(content)
806
+ return
807
+
808
+ if self.vector_db.upsert_available() and upsert:
809
+ try:
810
+ await self.vector_db.async_upsert(content.content_hash, read_documents, content.metadata)
811
+ except Exception as e:
812
+ log_error(f"Error upserting document: {e}")
813
+ content.status = ContentStatus.FAILED
814
+ content.status_message = "Could not upsert embedding"
815
+ self._update_content(content)
816
+ return
817
+ else:
818
+ try:
819
+ await self.vector_db.async_insert(
820
+ content.content_hash, documents=read_documents, filters=content.metadata
821
+ )
822
+ except Exception as e:
823
+ log_error(f"Error inserting document: {e}")
824
+ content.status = ContentStatus.FAILED
825
+ content.status_message = "Could not insert embedding"
826
+ self._update_content(content)
827
+ return
828
+
829
+ content.status = ContentStatus.COMPLETED
830
+ self._update_content(content)
831
+
832
+ async def _load_content(
833
+ self,
834
+ content: Content,
835
+ upsert: bool,
836
+ skip_if_exists: bool,
837
+ include: Optional[List[str]] = None,
838
+ exclude: Optional[List[str]] = None,
839
+ ) -> None:
840
+ log_info(f"Loading content: {content.id}")
841
+
842
+ if content.metadata:
843
+ self.add_filters(content.metadata)
844
+
845
+ if content.path:
846
+ await self._load_from_path(content, upsert, skip_if_exists, include, exclude)
847
+
848
+ if content.url:
849
+ await self._load_from_url(content, upsert, skip_if_exists)
850
+
851
+ if content.file_data:
852
+ await self._load_from_content(content, upsert, skip_if_exists)
853
+
854
+ if content.topics:
855
+ await self._load_from_topics(content, upsert, skip_if_exists)
856
+
857
+ if content.remote_content:
858
+ await self._load_from_remote_content(content, upsert, skip_if_exists)
859
+
860
+ def _build_content_hash(self, content: Content) -> str:
861
+ """
862
+ Build the content hash from the content.
863
+ """
864
+ if content.path:
865
+ return hashlib.sha256(str(content.path).encode()).hexdigest()
866
+ elif content.url:
867
+ hash = hashlib.sha256(content.url.encode()).hexdigest()
868
+ return hash
869
+ elif content.file_data and content.file_data.content:
870
+ name = content.name or "content"
871
+ return hashlib.sha256(name.encode()).hexdigest()
872
+ elif content.topics and len(content.topics) > 0:
873
+ topic = content.topics[0]
874
+ reader = type(content.reader).__name__ if content.reader else "unknown"
875
+ return hashlib.sha256(f"{topic}-{reader}".encode()).hexdigest()
876
+ else:
877
+ # Fallback for edge cases
878
+ import random
879
+ import string
880
+
881
+ fallback = (
882
+ content.name
883
+ or content.id
884
+ or ("unknown_content" + "".join(random.choices(string.ascii_lowercase + string.digits, k=6)))
885
+ )
886
+ return hashlib.sha256(fallback.encode()).hexdigest()
887
+
888
+ def _add_to_contents_db(self, content: Content):
889
+ if self.contents_db:
890
+ created_at = content.created_at if content.created_at else int(time.time())
891
+ updated_at = content.updated_at if content.updated_at else int(time.time())
892
+
893
+ file_type = (
894
+ content.file_type
895
+ if content.file_type
896
+ else content.file_data.type
897
+ if content.file_data and content.file_data.type
898
+ else None
899
+ )
900
+ content_row = KnowledgeRow(
901
+ id=content.id,
902
+ name=content.name if content.name else "",
903
+ description=content.description if content.description else "",
904
+ metadata=content.metadata,
905
+ type=file_type,
906
+ size=content.size
907
+ if content.size
908
+ else len(content.file_data.content)
909
+ if content.file_data and content.file_data.content
910
+ else None,
911
+ linked_to=self.name,
912
+ access_count=0,
913
+ status=content.status if content.status else ContentStatus.PROCESSING,
914
+ status_message="",
915
+ created_at=created_at,
916
+ updated_at=updated_at,
917
+ )
918
+ self.contents_db.upsert_knowledge_content(knowledge_row=content_row)
919
+
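When the row is written, type and size fall back to the attached file_data if the Content itself does not carry them. The nested conditional expressions above are roughly equivalent to this flattened helper (the helper name is ours, not part of the module):

    def resolve_type_and_size(content):
        file_type = content.file_type
        if not file_type and content.file_data and content.file_data.type:
            file_type = content.file_data.type

        size = content.size
        if not size and content.file_data and content.file_data.content:
            size = len(content.file_data.content)
        return file_type, size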
920
+ def _update_content(self, content: Content) -> Optional[Dict[str, Any]]:
921
+ if self.contents_db:
922
+ if not content.id:
923
+ log_warning("Content id is required to update Knowledge content")
924
+ return None
925
+
926
+ # TODO: we shouldn't check for content here, we should trust the upsert method to handle conflicts
927
+ content_row = self.contents_db.get_knowledge_content(content.id)
928
+ if content_row is None:
929
+ log_warning(f"Content row not found for id: {content.id}, cannot update status")
930
+ return None
931
+
932
+ if content.name is not None:
933
+ content_row.name = content.name
934
+ if content.description is not None:
935
+ content_row.description = content.description
936
+ if content.metadata is not None:
937
+ content_row.metadata = content.metadata
938
+ if content.status is not None:
939
+ content_row.status = content.status
940
+ if content.status_message is not None:
941
+ content_row.status_message = content.status_message if content.status_message else ""
942
+ if content.external_id is not None:
943
+ content_row.external_id = content.external_id
944
+
945
+ content_row.updated_at = int(time.time())
946
+ self.contents_db.upsert_knowledge_content(knowledge_row=content_row)
947
+
948
+ if self.vector_db and content.metadata:
949
+ self.vector_db.update_metadata(content_id=content.id, metadata=content.metadata)
950
+
951
+ if content.metadata:
952
+ self.add_filters(content.metadata)
953
+
954
+ return content_row.to_dict()
955
+
956
+ else:
957
+ log_warning(f"Contents DB not found for knowledge base: {self.name}")
958
+ return None
959
+
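_update_content performs a partial update: only fields that are not None on the incoming Content overwrite the stored row, updated_at is refreshed, and any metadata is pushed to the vector store and registered as valid filter keys. A hedged call sketch, assuming the id already exists in contents_db and that Content accepts these keyword arguments:

    patched = knowledge._update_content(
        Content(id="abc123", description="Q3 handbook", metadata={"team": "platform"})
    )
    if patched is None:
        # Either no contents_db is configured or the id was not found.
        ...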
960
+ async def _process_lightrag_content(self, content: Content, content_type: KnowledgeContentOrigin) -> None:
961
+ self._add_to_contents_db(content)
962
+ if content_type == KnowledgeContentOrigin.PATH:
963
+ if content.file_data is None:
964
+ log_warning("No file data provided")
965
+
966
+ if content.path is None:
967
+ log_error("No path provided for content")
968
+ return
969
+
970
+ path = Path(content.path)
971
+
972
+ log_info(f"Uploading file to LightRAG from path: {path}")
973
+ try:
974
+ # Read the file content from path
975
+ with open(path, "rb") as f:
976
+ file_content = f.read()
977
+
978
+ # Get file type from extension or content.file_type
979
+ file_type = content.file_type or path.suffix
980
+
981
+ if self.vector_db and hasattr(self.vector_db, "insert_file_bytes"):
982
+ result = await self.vector_db.insert_file_bytes(
983
+ file_content=file_content,
984
+ filename=path.name, # Use the original filename with extension
985
+ content_type=file_type,
986
+ send_metadata=True, # Enable metadata so server knows the file type
987
+ )
988
+
989
+ else:
990
+ log_error("Vector database does not support file insertion")
991
+ content.status = ContentStatus.FAILED
992
+ self._update_content(content)
993
+ return
994
+ content.external_id = result
995
+ content.status = ContentStatus.COMPLETED
996
+ self._update_content(content)
997
+ return
998
+
999
+ except Exception as e:
1000
+ log_error(f"Error uploading file to LightRAG: {e}")
1001
+ content.status = ContentStatus.FAILED
1002
+ content.status_message = f"Could not upload to LightRAG: {str(e)}"
1003
+ self._update_content(content)
1004
+ return
1005
+
1006
+ elif content_type == KnowledgeContentOrigin.URL:
1007
+ log_info(f"Uploading file to LightRAG from URL: {content.url}")
1008
+ try:
1009
+ reader = self.url_reader
1010
+ if reader is None:
1011
+ log_error("No URL reader available")
1012
+ content.status = ContentStatus.FAILED
1013
+ self._update_content(content)
1014
+ return
1015
+
1016
+ reader.chunk = False
1017
+ read_documents = reader.read(content.url, name=content.name)
1018
+
1019
+ for read_document in read_documents:
1020
+ read_document.content_id = content.id
1021
+
1022
+ if not read_documents:
1023
+ log_error("No documents read from URL")
1024
+ content.status = ContentStatus.FAILED
1025
+ self._update_content(content)
1026
+ return
1027
+
1028
+ if self.vector_db and hasattr(self.vector_db, "insert_text"):
1029
+ result = await self.vector_db.insert_text(
1030
+ file_source=content.url,
1031
+ text=read_documents[0].content,
1032
+ )
1033
+ else:
1034
+ log_error("Vector database does not support text insertion")
1035
+ content.status = ContentStatus.FAILED
1036
+ self._update_content(content)
1037
+ return
1038
+
1039
+ content.external_id = result
1040
+ content.status = ContentStatus.COMPLETED
1041
+ self._update_content(content)
1042
+ return
1043
+
1044
+ except Exception as e:
1045
+ log_error(f"Error uploading file to LightRAG: {e}")
1046
+ content.status = ContentStatus.FAILED
1047
+ content.status_message = f"Could not upload to LightRAG: {str(e)}"
1048
+ self._update_content(content)
1049
+ return
1050
+
1051
+ elif content_type == KnowledgeContentOrigin.CONTENT:
1052
+ filename = (
1053
+ content.file_data.filename if content.file_data and content.file_data.filename else "uploaded_file"
1054
+ )
1055
+ log_info(f"Uploading file to LightRAG: {filename}")
1056
+
1057
+ # Use the content from file_data
1058
+ if content.file_data and content.file_data.content:
1059
+ if self.vector_db and hasattr(self.vector_db, "insert_file_bytes"):
1060
+ result = await self.vector_db.insert_file_bytes(
1061
+ file_content=content.file_data.content,
1062
+ filename=filename,
1063
+ content_type=content.file_data.type,
1064
+ send_metadata=True, # Enable metadata so server knows the file type
1065
+ )
1066
+ else:
1067
+ log_error("Vector database does not support file insertion")
1068
+ content.status = ContentStatus.FAILED
1069
+ self._update_content(content)
1070
+ return
1071
+ content.external_id = result
1072
+ content.status = ContentStatus.COMPLETED
1073
+ self._update_content(content)
1074
+ else:
1075
+ log_warning(f"No file data available for LightRAG upload: {content.name}")
1076
+ return
1077
+
1078
+ elif content_type == KnowledgeContentOrigin.TOPIC:
1079
+ log_info(f"Uploading file to LightRAG: {content.name}")
1080
+
1081
+ if content.reader is None:
1082
+ log_error("No reader available for topic content")
1083
+ content.status = ContentStatus.FAILED
1084
+ self._update_content(content)
1085
+ return
1086
+
1087
+ if not content.topics:
1088
+ log_error("No topics available for content")
1089
+ content.status = ContentStatus.FAILED
1090
+ self._update_content(content)
1091
+ return
1092
+
1093
+ read_documents = content.reader.read(content.topics)
1094
+ if len(read_documents) > 0:
1095
+ print("READ DOCUMENTS: ", len(read_documents))
1096
+ print("READ DOCUMENTS: ", read_documents[0])
1097
+
1098
+ if self.vector_db and hasattr(self.vector_db, "insert_text"):
1099
+ result = await self.vector_db.insert_text(
1100
+ file_source=content.topics[0],
1101
+ text=read_documents[0].content,
1102
+ )
1103
+ else:
1104
+ log_error("Vector database does not support text insertion")
1105
+ content.status = ContentStatus.FAILED
1106
+ self._update_content(content)
1107
+ return
1108
+ content.external_id = result
1109
+ content.status = ContentStatus.COMPLETED
1110
+ self._update_content(content)
1111
+ return
1112
+ else:
1113
+ log_warning(f"No documents found for LightRAG upload: {content.name}")
1114
+ return
1115
+
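The LightRAG path never embeds locally: it ships the raw file or extracted text to the server through insert_file_bytes / insert_text and stores the returned identifier as external_id, guarding each call with hasattr because those methods only exist on the LightRAG-backed vector store. The capability-gated upload reduces to a pattern like this (the function name is illustrative; the vector-store method names match the calls above):

    async def push_to_lightrag(vector_db, filename: str, payload: bytes, file_type):
        if not hasattr(vector_db, "insert_file_bytes"):
            raise RuntimeError("Vector database does not support file insertion")
        # send_metadata=True lets the server infer how to parse the uploaded file.
        return await vector_db.insert_file_bytes(
            file_content=payload, filename=filename, content_type=file_type, send_metadata=True
        )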
1116
+ def search(
1117
+ self, query: str, max_results: Optional[int] = None, filters: Optional[Dict[str, Any]] = None
1118
+ ) -> List[Document]:
1119
+ """Returns relevant documents matching a query"""
1120
+
1121
+ try:
1122
+ if self.vector_db is None:
1123
+ log_warning("No vector db provided")
1124
+ return []
1125
+
1126
+ _max_results = max_results or self.max_results
1127
+ log_debug(f"Getting {_max_results} relevant documents for query: {query}")
1128
+ return self.vector_db.search(query=query, limit=_max_results, filters=filters)
1129
+ except Exception as e:
1130
+ log_error(f"Error searching for documents: {e}")
1131
+ return []
1132
+
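search is the synchronous retrieval entry point: it caps results at max_results (falling back to the knowledge base default) and returns an empty list rather than raising when no vector store is configured or the query fails. A usage sketch with illustrative query and filter values:

    docs = knowledge.search("how do I rotate API keys?", max_results=5, filters={"team": "platform"})
    for doc in docs:
        print(doc.content[:120])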
1133
+ async def async_search(
1134
+ self, query: str, max_results: Optional[int] = None, filters: Optional[Dict[str, Any]] = None
1135
+ ) -> List[Document]:
1136
+ """Returns relevant documents matching a query"""
1137
+
1138
+ try:
1139
+ if self.vector_db is None:
1140
+ log_warning("No vector db provided")
1141
+ return []
1142
+
1143
+ _max_results = max_results or self.max_results
1144
+ log_debug(f"Getting {_max_results} relevant documents for query: {query}")
1145
+ try:
1146
+ return await self.vector_db.async_search(query=query, limit=_max_results, filters=filters)
1147
+ except NotImplementedError:
1148
+ log_info("Vector db does not support async search")
1149
+ return self.search(query=query, max_results=_max_results, filters=filters)
1150
+ except Exception as e:
1151
+ log_error(f"Error searching for documents: {e}")
1152
+ return []
1153
+
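async_search prefers the store's native async path and quietly degrades to the blocking search when the backend raises NotImplementedError, so callers can await it regardless of the vector store in use. For example:

    import asyncio

    async def retrieve(knowledge, query: str):
        # Falls back to the synchronous search() if the backend has no async implementation.
        return await knowledge.async_search(query, max_results=3)

    # asyncio.run(retrieve(knowledge, "deployment checklist"))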
1154
+ def validate_filters(self, filters: Optional[Dict[str, Any]]) -> Tuple[Dict[str, Any], List[str]]:
1155
+ if self.valid_metadata_filters is None:
1156
+ self.valid_metadata_filters = set()
1157
+ self.valid_metadata_filters.update(self._get_filters_from_db)
1158
+
1159
+ if not filters:
1160
+ return {}, []
1161
+
1162
+ valid_filters: Dict[str, Any] = {}
1163
+ invalid_keys = []
1164
+
1165
+ # If no metadata filters tracked yet, all keys are considered invalid
1166
+ if self.valid_metadata_filters is None:
1167
+ invalid_keys = list(filters.keys())
1168
+ log_debug(f"No valid metadata filters tracked yet. All filter keys considered invalid: {invalid_keys}")
1169
+ return {}, invalid_keys
1170
+
1171
+ for key, value in filters.items():
1172
+ # Handle both normal keys and prefixed keys like meta_data.key
1173
+ base_key = key.split(".")[-1] if "." in key else key
1174
+ if base_key in self.valid_metadata_filters or key in self.valid_metadata_filters:
1175
+ valid_filters[key] = value
1176
+ else:
1177
+ invalid_keys.append(key)
1178
+ log_debug(f"Invalid filter key: {key} - not present in knowledge base")
1179
+
1180
+ return valid_filters, invalid_keys
1181
+
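validate_filters splits an incoming filter dict into keys the knowledge base has actually seen (matched either as plain keys or as the last segment of dotted keys such as meta_data.team) and everything else, which callers can report as invalid. The key matching can be illustrated standalone, assuming "team" and "year" have already been tracked:

    valid_keys = {"team", "year"}
    filters = {"team": "platform", "meta_data.year": 2024, "region": "eu"}

    valid, invalid = {}, []
    for key, value in filters.items():
        base_key = key.split(".")[-1] if "." in key else key
        if base_key in valid_keys or key in valid_keys:
            valid[key] = value
        else:
            invalid.append(key)

    print(valid)    # {'team': 'platform', 'meta_data.year': 2024}
    print(invalid)  # ['region']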
1182
+ def add_filters(self, metadata: Dict[str, Any]) -> None:
1183
+ if self.valid_metadata_filters is None:
1184
+ self.valid_metadata_filters = set()
1185
+
1186
+ if metadata is not None:
1187
+ for key in metadata.keys():
1188
+ self.valid_metadata_filters.add(key)
1189
+
1190
+ @cached_property
1191
+ def _get_filters_from_db(self) -> Set[str]:
1192
+ if self.contents_db is None:
1193
+ return set()
1194
+ contents, _ = self.get_content()
1195
+ valid_filters: Set[str] = set()
1196
+ for content in contents:
1197
+ if content.metadata:
1198
+ valid_filters.update(content.metadata.keys())
1199
+ return valid_filters
1200
+
1201
+ def remove_vector_by_id(self, id: str) -> bool:
1202
+ if self.vector_db is None:
1203
+ log_warning("No vector DB provided")
1204
+ return False
1205
+ return self.vector_db.delete_by_id(id)
1206
+
1207
+ def remove_vectors_by_name(self, name: str) -> bool:
1208
+ if self.vector_db is None:
1209
+ log_warning("No vector DB provided")
1210
+ return False
1211
+ return self.vector_db.delete_by_name(name)
1212
+
1213
+ def remove_vectors_by_metadata(self, metadata: Dict[str, Any]) -> bool:
1214
+ if self.vector_db is None:
1215
+ log_warning("No vector DB provided")
1216
+ return False
1217
+ return self.vector_db.delete_by_metadata(metadata)
1218
+
1219
+ # --- API Only Methods ---
1220
+
1221
+ def patch_content(self, content: Content) -> Optional[Dict[str, Any]]:
1222
+ return self._update_content(content)
1223
+
1224
+ def get_content_by_id(self, content_id: str) -> Optional[Content]:
1225
+ if self.contents_db is None:
1226
+ raise ValueError("No contents db provided")
1227
+ content_row = self.contents_db.get_knowledge_content(content_id)
1228
+ if content_row is None:
1229
+ return None
1230
+ content = Content(
1231
+ id=content_row.id,
1232
+ name=content_row.name,
1233
+ description=content_row.description,
1234
+ metadata=content_row.metadata,
1235
+ file_type=content_row.type,
1236
+ size=content_row.size,
1237
+ status=ContentStatus(content_row.status) if content_row.status else None,
1238
+ status_message=content_row.status_message,
1239
+ created_at=content_row.created_at,
1240
+ updated_at=content_row.updated_at if content_row.updated_at else content_row.created_at,
1241
+ external_id=content_row.external_id,
1242
+ )
1243
+ return content
1244
+
1245
+ def get_content(
1246
+ self,
1247
+ limit: Optional[int] = None,
1248
+ page: Optional[int] = None,
1249
+ sort_by: Optional[str] = None,
1250
+ sort_order: Optional[str] = None,
1251
+ ) -> Tuple[List[Content], int]:
1252
+ if self.contents_db is None:
1253
+ raise ValueError("No contents db provided")
1254
+ contents, count = self.contents_db.get_knowledge_contents(
1255
+ limit=limit, page=page, sort_by=sort_by, sort_order=sort_order
1256
+ )
1257
+
1258
+ result = []
1259
+ for content_row in contents:
1260
+ # Create Content from database row
1261
+ content = Content(
1262
+ id=content_row.id,
1263
+ name=content_row.name,
1264
+ description=content_row.description,
1265
+ metadata=content_row.metadata,
1266
+ size=content_row.size,
1267
+ file_type=content_row.type,
1268
+ status=ContentStatus(content_row.status) if content_row.status else None,
1269
+ status_message=content_row.status_message,
1270
+ created_at=content_row.created_at,
1271
+ updated_at=content_row.updated_at if content_row.updated_at else content_row.created_at,
1272
+ external_id=content_row.external_id,
1273
+ )
1274
+ result.append(content)
1275
+ return result, count
1276
+
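get_content is the listing call behind the API surface; it returns both the requested page of Content objects and the total row count so callers can paginate. A usage sketch (the sort field name is an assumption):

    page_items, total = knowledge.get_content(limit=20, page=1, sort_by="created_at", sort_order="desc")
    print(f"{len(page_items)} of {total} content entries")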
1277
+ def get_content_status(self, content_id: str) -> Tuple[Optional[ContentStatus], Optional[str]]:
1278
+ if self.contents_db is None:
1279
+ raise ValueError("No contents db provided")
1280
+ content_row = self.contents_db.get_knowledge_content(content_id)
1281
+ if content_row is None:
1282
+ return None, "Content not found"
1283
+
1284
+ # Convert string status to enum, defaulting to PROCESSING if unknown
1285
+ status_str = content_row.status
1286
+ try:
1287
+ status = ContentStatus(status_str.lower()) if status_str else ContentStatus.PROCESSING
1288
+ except ValueError:
1289
+ # Handle legacy or unknown statuses
1290
+ if status_str and "failed" in status_str.lower():
1291
+ status = ContentStatus.FAILED
1292
+ elif status_str and "completed" in status_str.lower():
1293
+ status = ContentStatus.COMPLETED
1294
+ else:
1295
+ status = ContentStatus.PROCESSING
1296
+
1297
+ return status, content_row.status_message
1298
+
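Status strings from older rows may not map cleanly onto the ContentStatus enum, so the lookup above falls back to substring matching on "failed" and "completed" and otherwise reports PROCESSING. That coercion, pulled out as a standalone helper (the helper name is ours):

    def coerce_status(status_str):
        try:
            return ContentStatus(status_str.lower()) if status_str else ContentStatus.PROCESSING
        except ValueError:
            # Legacy or unknown values: classify by substring, default to PROCESSING.
            if status_str and "failed" in status_str.lower():
                return ContentStatus.FAILED
            if status_str and "completed" in status_str.lower():
                return ContentStatus.COMPLETED
            return ContentStatus.PROCESSING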
1299
+ def remove_content_by_id(self, content_id: str):
1300
+ if self.vector_db is not None:
1301
+ if self.vector_db.__class__.__name__ == "LightRag":
1302
+ # For LightRAG, get the content first to find the external_id
1303
+ content = self.get_content_by_id(content_id)
1304
+ if content and content.external_id:
1305
+ self.vector_db.delete_by_external_id(content.external_id) # type: ignore
1306
+ else:
1307
+ log_warning(f"No external_id found for content {content_id}, cannot delete from LightRAG")
1308
+ else:
1309
+ self.vector_db.delete_by_content_id(content_id)
1310
+
1311
+ if self.contents_db is not None:
1312
+ self.contents_db.delete_knowledge_content(content_id)
1313
+
1314
+ def remove_all_content(self):
1315
+ contents, _ = self.get_content()
1316
+ for content in contents:
1317
+ if content.id is not None:
1318
+ self.remove_content_by_id(content.id)
1319
+
1320
+ # --- Reader Factory Integration ---
1321
+
1322
+ def construct_readers(self):
1323
+ """Initialize readers dictionary for lazy loading."""
1324
+ # Initialize empty readers dict - readers will be created on-demand
1325
+ if self.readers is None:
1326
+ self.readers = {}
1327
+
1328
+ def add_reader(self, reader: Reader):
1329
+ """Add a custom reader to the knowledge base."""
1330
+ if self.readers is None:
1331
+ self.readers = {}
1332
+
1333
+ # Generate a key for the reader
1334
+ reader_key = self._generate_reader_key(reader)
1335
+ self.readers[reader_key] = reader
1336
+ return reader
1337
+
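Custom readers register under a key derived from the reader's name (or its class name when unnamed), lower-cased with spaces replaced by underscores, so a reader named "Changelog Reader" is stored as changelog_reader. A hedged registration sketch; ChangelogReader is an assumed Reader subclass, not part of the package:

    class ChangelogReader(Reader):
        name = "Changelog Reader"

        def read(self, source, name=None):
            ...

    knowledge.add_reader(ChangelogReader())
    print(list(knowledge.get_readers()))  # ['changelog_reader', ...]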
1338
+ def get_readers(self) -> Dict[str, Reader]:
1339
+ """Get all currently loaded readers (only returns readers that have been used)."""
1340
+ if self.readers is None:
1341
+ self.readers = {}
1342
+
1343
+ return self.readers
1344
+
1345
+ def _generate_reader_key(self, reader: Reader) -> str:
1346
+ """Generate a key for a reader instance."""
1347
+ if reader.name:
1348
+ return f"{reader.name.lower().replace(' ', '_')}"
1349
+ else:
1350
+ return f"{reader.__class__.__name__.lower().replace(' ', '_')}"
1351
+
1352
+ def _select_reader(self, extension: str) -> Reader:
1353
+ """Select the appropriate reader for a file extension."""
1354
+ log_info(f"Selecting reader for extension: {extension}")
1355
+ return ReaderFactory.get_reader_for_extension(extension)
1356
+
1357
+ def _select_url_reader(self, url: str) -> Reader:
1358
+ """Select the appropriate reader for a URL."""
1359
+ return ReaderFactory.get_reader_for_url(url)
1360
+
1361
+ def _select_url_file_reader(self, extension: str) -> Reader:
1362
+ """Select the appropriate reader for a URL file extension."""
1363
+ return ReaderFactory.get_reader_for_url_file(extension)
1364
+
1365
+ def get_filters(self) -> List[str]:
1366
+ return [
1367
+ "filter_tag_1",
1368
+ "filter_tag2",
1369
+ ]
1370
+
1371
+ # --- Convenience Properties for Backward Compatibility ---
1372
+
1373
+ def _is_text_mime_type(self, mime_type: str) -> bool:
1374
+ """
1375
+ Check if a MIME type represents text content that can be safely encoded as UTF-8.
1376
+
1377
+ Args:
1378
+ mime_type: The MIME type to check
1379
+
1380
+ Returns:
1381
+ bool: True if it's a text type, False if binary
1382
+ """
1383
+ if not mime_type:
1384
+ return False
1385
+
1386
+ text_types = [
1387
+ "text/",
1388
+ "application/json",
1389
+ "application/xml",
1390
+ "application/javascript",
1391
+ "application/csv",
1392
+ "application/sql",
1393
+ ]
1394
+
1395
+ return any(mime_type.startswith(t) for t in text_types)
1396
+
1397
+ def _should_include_file(self, file_path: str, include: Optional[List[str]], exclude: Optional[List[str]]) -> bool:
1398
+ """
1399
+ Determine if a file should be included based on include/exclude patterns.
1400
+
1401
+ Logic:
1402
+ 1. If include is specified, file must match at least one include pattern
1403
+ 2. If exclude is specified, file must not match any exclude pattern
1404
+ 3. If neither specified, include all files
1405
+
1406
+ Args:
1407
+ file_path: Path to the file to check
1408
+ include: Optional list of include patterns (glob-style)
1409
+ exclude: Optional list of exclude patterns (glob-style)
1410
+
1411
+ Returns:
1412
+ bool: True if file should be included, False otherwise
1413
+ """
1414
+ import fnmatch
1415
+
1416
+ # If include patterns specified, file must match at least one
1417
+ if include:
1418
+ if not any(fnmatch.fnmatch(file_path, pattern) for pattern in include):
1419
+ return False
1420
+
1421
+ # If exclude patterns specified, file must not match any
1422
+ if exclude:
1423
+ if any(fnmatch.fnmatch(file_path, pattern) for pattern in exclude):
1424
+ return False
1425
+
1426
+ return True
1427
+
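Include patterns act as an allow-list and exclude patterns as a deny-list, both matched with fnmatch-style globs against the file path, and exclusion wins when both match. A self-contained illustration of the same rule:

    import fnmatch

    def should_include(path, include=None, exclude=None):
        if include and not any(fnmatch.fnmatch(path, p) for p in include):
            return False
        if exclude and any(fnmatch.fnmatch(path, p) for p in exclude):
            return False
        return True

    print(should_include("docs/guide.md", include=["*.md"]))                       # True
    print(should_include("docs/draft.md", include=["*.md"], exclude=["*draft*"]))  # False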
1428
+ def _get_reader(self, reader_type: str) -> Optional[Reader]:
1429
+ """Get a cached reader or create it if not cached, handling missing dependencies gracefully."""
1430
+ if self.readers is None:
1431
+ self.readers = {}
1432
+
1433
+ if reader_type not in self.readers:
1434
+ try:
1435
+ reader = ReaderFactory.create_reader(reader_type)
1436
+ if reader:
1437
+ self.readers[reader_type] = reader
1438
+ else:
1439
+ return None
1440
+
1441
+ except Exception as e:
1442
+ log_warning(f"Cannot create {reader_type} reader {e}")
1443
+ return None
1444
+
1445
+ return self.readers.get(reader_type)
1446
+
1447
+ @property
1448
+ def pdf_reader(self) -> Optional[Reader]:
1449
+ """PDF reader - lazy loaded via factory."""
1450
+ return self._get_reader("pdf")
1451
+
1452
+ @property
1453
+ def csv_reader(self) -> Optional[Reader]:
1454
+ """CSV reader - lazy loaded via factory."""
1455
+ return self._get_reader("csv")
1456
+
1457
+ @property
1458
+ def docx_reader(self) -> Optional[Reader]:
1459
+ """Docx reader - lazy loaded via factory."""
1460
+ return self._get_reader("docx")
1461
+
1462
+ @property
1463
+ def json_reader(self) -> Optional[Reader]:
1464
+ """JSON reader - lazy loaded via factory."""
1465
+ return self._get_reader("json")
1466
+
1467
+ @property
1468
+ def markdown_reader(self) -> Optional[Reader]:
1469
+ """Markdown reader - lazy loaded via factory."""
1470
+ return self._get_reader("markdown")
1471
+
1472
+ @property
1473
+ def text_reader(self) -> Optional[Reader]:
1474
+ """Text reader - lazy loaded via factory."""
1475
+ return self._get_reader("text")
1476
+
1477
+ @property
1478
+ def website_reader(self) -> Optional[Reader]:
1479
+ """Website reader - lazy loaded via factory."""
1480
+ return self._get_reader("website")
1481
+
1482
+ @property
1483
+ def firecrawl_reader(self) -> Optional[Reader]:
1484
+ """Firecrawl reader - lazy loaded via factory."""
1485
+ return self._get_reader("firecrawl")
1486
+
1487
+ @property
1488
+ def url_reader(self) -> Optional[Reader]:
1489
+ """URL reader - lazy loaded via factory."""
1490
+ return self._get_reader("url")
1491
+
1492
+ @property
1493
+ def pdf_url_reader(self) -> Optional[Reader]:
1494
+ """PDF URL reader - lazy loaded via factory."""
1495
+ return self._get_reader("pdf_url")
1496
+
1497
+ @property
1498
+ def youtube_reader(self) -> Optional[Reader]:
1499
+ """YouTube reader - lazy loaded via factory."""
1500
+ return self._get_reader("youtube")
1501
+
1502
+ @property
1503
+ def csv_url_reader(self) -> Optional[Reader]:
1504
+ """CSV URL reader - lazy loaded via factory."""
1505
+ return self._get_reader("csv_url")
1506
+
1507
+ @property
1508
+ def s3_reader(self) -> Optional[Reader]:
1509
+ """S3 reader - lazy loaded via factory."""
1510
+ return self._get_reader("s3")
1511
+
1512
+ @property
1513
+ def gcs_reader(self) -> Optional[Reader]:
1514
+ """GCS reader - lazy loaded via factory."""
1515
+ return self._get_reader("gcs")
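The reader properties are thin wrappers over _get_reader: the first access builds the reader through ReaderFactory and caches it, later accesses reuse the same instance, and a missing optional dependency yields None instead of raising. A usage sketch against an assumed knowledge instance:

    reader = knowledge.pdf_reader              # built on first access via ReaderFactory
    if reader is None:
        print("PDF reading unavailable (optional dependency missing)")
    else:
        assert knowledge.pdf_reader is reader  # cached: same instance on later access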