agno 0.1.2__py3-none-any.whl → 2.3.13__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (723)
  1. agno/__init__.py +8 -0
  2. agno/agent/__init__.py +44 -5
  3. agno/agent/agent.py +10531 -2975
  4. agno/api/agent.py +14 -53
  5. agno/api/api.py +7 -46
  6. agno/api/evals.py +22 -0
  7. agno/api/os.py +17 -0
  8. agno/api/routes.py +6 -25
  9. agno/api/schemas/__init__.py +9 -0
  10. agno/api/schemas/agent.py +6 -9
  11. agno/api/schemas/evals.py +16 -0
  12. agno/api/schemas/os.py +14 -0
  13. agno/api/schemas/team.py +10 -10
  14. agno/api/schemas/utils.py +21 -0
  15. agno/api/schemas/workflows.py +16 -0
  16. agno/api/settings.py +53 -0
  17. agno/api/team.py +22 -26
  18. agno/api/workflow.py +28 -0
  19. agno/cloud/aws/base.py +214 -0
  20. agno/cloud/aws/s3/__init__.py +2 -0
  21. agno/cloud/aws/s3/api_client.py +43 -0
  22. agno/cloud/aws/s3/bucket.py +195 -0
  23. agno/cloud/aws/s3/object.py +57 -0
  24. agno/compression/__init__.py +3 -0
  25. agno/compression/manager.py +247 -0
  26. agno/culture/__init__.py +3 -0
  27. agno/culture/manager.py +956 -0
  28. agno/db/__init__.py +24 -0
  29. agno/db/async_postgres/__init__.py +3 -0
  30. agno/db/base.py +946 -0
  31. agno/db/dynamo/__init__.py +3 -0
  32. agno/db/dynamo/dynamo.py +2781 -0
  33. agno/db/dynamo/schemas.py +442 -0
  34. agno/db/dynamo/utils.py +743 -0
  35. agno/db/firestore/__init__.py +3 -0
  36. agno/db/firestore/firestore.py +2379 -0
  37. agno/db/firestore/schemas.py +181 -0
  38. agno/db/firestore/utils.py +376 -0
  39. agno/db/gcs_json/__init__.py +3 -0
  40. agno/db/gcs_json/gcs_json_db.py +1791 -0
  41. agno/db/gcs_json/utils.py +228 -0
  42. agno/db/in_memory/__init__.py +3 -0
  43. agno/db/in_memory/in_memory_db.py +1312 -0
  44. agno/db/in_memory/utils.py +230 -0
  45. agno/db/json/__init__.py +3 -0
  46. agno/db/json/json_db.py +1777 -0
  47. agno/db/json/utils.py +230 -0
  48. agno/db/migrations/manager.py +199 -0
  49. agno/db/migrations/v1_to_v2.py +635 -0
  50. agno/db/migrations/versions/v2_3_0.py +938 -0
  51. agno/db/mongo/__init__.py +17 -0
  52. agno/db/mongo/async_mongo.py +2760 -0
  53. agno/db/mongo/mongo.py +2597 -0
  54. agno/db/mongo/schemas.py +119 -0
  55. agno/db/mongo/utils.py +276 -0
  56. agno/db/mysql/__init__.py +4 -0
  57. agno/db/mysql/async_mysql.py +2912 -0
  58. agno/db/mysql/mysql.py +2923 -0
  59. agno/db/mysql/schemas.py +186 -0
  60. agno/db/mysql/utils.py +488 -0
  61. agno/db/postgres/__init__.py +4 -0
  62. agno/db/postgres/async_postgres.py +2579 -0
  63. agno/db/postgres/postgres.py +2870 -0
  64. agno/db/postgres/schemas.py +187 -0
  65. agno/db/postgres/utils.py +442 -0
  66. agno/db/redis/__init__.py +3 -0
  67. agno/db/redis/redis.py +2141 -0
  68. agno/db/redis/schemas.py +159 -0
  69. agno/db/redis/utils.py +346 -0
  70. agno/db/schemas/__init__.py +4 -0
  71. agno/db/schemas/culture.py +120 -0
  72. agno/db/schemas/evals.py +34 -0
  73. agno/db/schemas/knowledge.py +40 -0
  74. agno/db/schemas/memory.py +61 -0
  75. agno/db/singlestore/__init__.py +3 -0
  76. agno/db/singlestore/schemas.py +179 -0
  77. agno/db/singlestore/singlestore.py +2877 -0
  78. agno/db/singlestore/utils.py +384 -0
  79. agno/db/sqlite/__init__.py +4 -0
  80. agno/db/sqlite/async_sqlite.py +2911 -0
  81. agno/db/sqlite/schemas.py +181 -0
  82. agno/db/sqlite/sqlite.py +2908 -0
  83. agno/db/sqlite/utils.py +429 -0
  84. agno/db/surrealdb/__init__.py +3 -0
  85. agno/db/surrealdb/metrics.py +292 -0
  86. agno/db/surrealdb/models.py +334 -0
  87. agno/db/surrealdb/queries.py +71 -0
  88. agno/db/surrealdb/surrealdb.py +1908 -0
  89. agno/db/surrealdb/utils.py +147 -0
  90. agno/db/utils.py +118 -0
  91. agno/eval/__init__.py +24 -0
  92. agno/eval/accuracy.py +666 -276
  93. agno/eval/agent_as_judge.py +861 -0
  94. agno/eval/base.py +29 -0
  95. agno/eval/performance.py +779 -0
  96. agno/eval/reliability.py +241 -62
  97. agno/eval/utils.py +120 -0
  98. agno/exceptions.py +143 -1
  99. agno/filters.py +354 -0
  100. agno/guardrails/__init__.py +6 -0
  101. agno/guardrails/base.py +19 -0
  102. agno/guardrails/openai.py +144 -0
  103. agno/guardrails/pii.py +94 -0
  104. agno/guardrails/prompt_injection.py +52 -0
  105. agno/hooks/__init__.py +3 -0
  106. agno/hooks/decorator.py +164 -0
  107. agno/integrations/discord/__init__.py +3 -0
  108. agno/integrations/discord/client.py +203 -0
  109. agno/knowledge/__init__.py +5 -1
  110. agno/{document → knowledge}/chunking/agentic.py +22 -14
  111. agno/{document → knowledge}/chunking/document.py +2 -2
  112. agno/{document → knowledge}/chunking/fixed.py +7 -6
  113. agno/knowledge/chunking/markdown.py +151 -0
  114. agno/{document → knowledge}/chunking/recursive.py +15 -3
  115. agno/knowledge/chunking/row.py +39 -0
  116. agno/knowledge/chunking/semantic.py +91 -0
  117. agno/knowledge/chunking/strategy.py +165 -0
  118. agno/knowledge/content.py +74 -0
  119. agno/knowledge/document/__init__.py +5 -0
  120. agno/{document → knowledge/document}/base.py +12 -2
  121. agno/knowledge/embedder/__init__.py +5 -0
  122. agno/knowledge/embedder/aws_bedrock.py +343 -0
  123. agno/knowledge/embedder/azure_openai.py +210 -0
  124. agno/{embedder → knowledge/embedder}/base.py +8 -0
  125. agno/knowledge/embedder/cohere.py +323 -0
  126. agno/knowledge/embedder/fastembed.py +62 -0
  127. agno/{embedder → knowledge/embedder}/fireworks.py +1 -1
  128. agno/knowledge/embedder/google.py +258 -0
  129. agno/knowledge/embedder/huggingface.py +94 -0
  130. agno/knowledge/embedder/jina.py +182 -0
  131. agno/knowledge/embedder/langdb.py +22 -0
  132. agno/knowledge/embedder/mistral.py +206 -0
  133. agno/knowledge/embedder/nebius.py +13 -0
  134. agno/knowledge/embedder/ollama.py +154 -0
  135. agno/knowledge/embedder/openai.py +195 -0
  136. agno/knowledge/embedder/sentence_transformer.py +63 -0
  137. agno/{embedder → knowledge/embedder}/together.py +1 -1
  138. agno/knowledge/embedder/vllm.py +262 -0
  139. agno/knowledge/embedder/voyageai.py +165 -0
  140. agno/knowledge/knowledge.py +3006 -0
  141. agno/knowledge/reader/__init__.py +7 -0
  142. agno/knowledge/reader/arxiv_reader.py +81 -0
  143. agno/knowledge/reader/base.py +95 -0
  144. agno/knowledge/reader/csv_reader.py +164 -0
  145. agno/knowledge/reader/docx_reader.py +82 -0
  146. agno/knowledge/reader/field_labeled_csv_reader.py +290 -0
  147. agno/knowledge/reader/firecrawl_reader.py +201 -0
  148. agno/knowledge/reader/json_reader.py +88 -0
  149. agno/knowledge/reader/markdown_reader.py +137 -0
  150. agno/knowledge/reader/pdf_reader.py +431 -0
  151. agno/knowledge/reader/pptx_reader.py +101 -0
  152. agno/knowledge/reader/reader_factory.py +313 -0
  153. agno/knowledge/reader/s3_reader.py +89 -0
  154. agno/knowledge/reader/tavily_reader.py +193 -0
  155. agno/knowledge/reader/text_reader.py +127 -0
  156. agno/knowledge/reader/web_search_reader.py +325 -0
  157. agno/knowledge/reader/website_reader.py +455 -0
  158. agno/knowledge/reader/wikipedia_reader.py +91 -0
  159. agno/knowledge/reader/youtube_reader.py +78 -0
  160. agno/knowledge/remote_content/remote_content.py +88 -0
  161. agno/knowledge/reranker/__init__.py +3 -0
  162. agno/{reranker → knowledge/reranker}/base.py +1 -1
  163. agno/{reranker → knowledge/reranker}/cohere.py +2 -2
  164. agno/knowledge/reranker/infinity.py +195 -0
  165. agno/knowledge/reranker/sentence_transformer.py +54 -0
  166. agno/knowledge/types.py +39 -0
  167. agno/knowledge/utils.py +234 -0
  168. agno/media.py +439 -95
  169. agno/memory/__init__.py +16 -3
  170. agno/memory/manager.py +1474 -123
  171. agno/memory/strategies/__init__.py +15 -0
  172. agno/memory/strategies/base.py +66 -0
  173. agno/memory/strategies/summarize.py +196 -0
  174. agno/memory/strategies/types.py +37 -0
  175. agno/models/aimlapi/__init__.py +5 -0
  176. agno/models/aimlapi/aimlapi.py +62 -0
  177. agno/models/anthropic/__init__.py +4 -0
  178. agno/models/anthropic/claude.py +960 -496
  179. agno/models/aws/__init__.py +15 -0
  180. agno/models/aws/bedrock.py +686 -451
  181. agno/models/aws/claude.py +190 -183
  182. agno/models/azure/__init__.py +18 -1
  183. agno/models/azure/ai_foundry.py +489 -0
  184. agno/models/azure/openai_chat.py +89 -40
  185. agno/models/base.py +2477 -550
  186. agno/models/cerebras/__init__.py +12 -0
  187. agno/models/cerebras/cerebras.py +565 -0
  188. agno/models/cerebras/cerebras_openai.py +131 -0
  189. agno/models/cohere/__init__.py +4 -0
  190. agno/models/cohere/chat.py +306 -492
  191. agno/models/cometapi/__init__.py +5 -0
  192. agno/models/cometapi/cometapi.py +74 -0
  193. agno/models/dashscope/__init__.py +5 -0
  194. agno/models/dashscope/dashscope.py +90 -0
  195. agno/models/deepinfra/__init__.py +5 -0
  196. agno/models/deepinfra/deepinfra.py +45 -0
  197. agno/models/deepseek/__init__.py +4 -0
  198. agno/models/deepseek/deepseek.py +110 -9
  199. agno/models/fireworks/__init__.py +4 -0
  200. agno/models/fireworks/fireworks.py +19 -22
  201. agno/models/google/__init__.py +3 -7
  202. agno/models/google/gemini.py +1717 -662
  203. agno/models/google/utils.py +22 -0
  204. agno/models/groq/__init__.py +4 -0
  205. agno/models/groq/groq.py +391 -666
  206. agno/models/huggingface/__init__.py +4 -0
  207. agno/models/huggingface/huggingface.py +266 -538
  208. agno/models/ibm/__init__.py +5 -0
  209. agno/models/ibm/watsonx.py +432 -0
  210. agno/models/internlm/__init__.py +3 -0
  211. agno/models/internlm/internlm.py +20 -3
  212. agno/models/langdb/__init__.py +1 -0
  213. agno/models/langdb/langdb.py +60 -0
  214. agno/models/litellm/__init__.py +14 -0
  215. agno/models/litellm/chat.py +503 -0
  216. agno/models/litellm/litellm_openai.py +42 -0
  217. agno/models/llama_cpp/__init__.py +5 -0
  218. agno/models/llama_cpp/llama_cpp.py +22 -0
  219. agno/models/lmstudio/__init__.py +5 -0
  220. agno/models/lmstudio/lmstudio.py +25 -0
  221. agno/models/message.py +361 -39
  222. agno/models/meta/__init__.py +12 -0
  223. agno/models/meta/llama.py +502 -0
  224. agno/models/meta/llama_openai.py +79 -0
  225. agno/models/metrics.py +120 -0
  226. agno/models/mistral/__init__.py +4 -0
  227. agno/models/mistral/mistral.py +293 -393
  228. agno/models/nebius/__init__.py +3 -0
  229. agno/models/nebius/nebius.py +53 -0
  230. agno/models/nexus/__init__.py +3 -0
  231. agno/models/nexus/nexus.py +22 -0
  232. agno/models/nvidia/__init__.py +4 -0
  233. agno/models/nvidia/nvidia.py +22 -3
  234. agno/models/ollama/__init__.py +4 -2
  235. agno/models/ollama/chat.py +257 -492
  236. agno/models/openai/__init__.py +7 -0
  237. agno/models/openai/chat.py +725 -770
  238. agno/models/openai/like.py +16 -2
  239. agno/models/openai/responses.py +1121 -0
  240. agno/models/openrouter/__init__.py +4 -0
  241. agno/models/openrouter/openrouter.py +62 -5
  242. agno/models/perplexity/__init__.py +5 -0
  243. agno/models/perplexity/perplexity.py +203 -0
  244. agno/models/portkey/__init__.py +3 -0
  245. agno/models/portkey/portkey.py +82 -0
  246. agno/models/requesty/__init__.py +5 -0
  247. agno/models/requesty/requesty.py +69 -0
  248. agno/models/response.py +177 -7
  249. agno/models/sambanova/__init__.py +4 -0
  250. agno/models/sambanova/sambanova.py +23 -4
  251. agno/models/siliconflow/__init__.py +5 -0
  252. agno/models/siliconflow/siliconflow.py +42 -0
  253. agno/models/together/__init__.py +4 -0
  254. agno/models/together/together.py +21 -164
  255. agno/models/utils.py +266 -0
  256. agno/models/vercel/__init__.py +3 -0
  257. agno/models/vercel/v0.py +43 -0
  258. agno/models/vertexai/__init__.py +0 -1
  259. agno/models/vertexai/claude.py +190 -0
  260. agno/models/vllm/__init__.py +3 -0
  261. agno/models/vllm/vllm.py +83 -0
  262. agno/models/xai/__init__.py +2 -0
  263. agno/models/xai/xai.py +111 -7
  264. agno/os/__init__.py +3 -0
  265. agno/os/app.py +1027 -0
  266. agno/os/auth.py +244 -0
  267. agno/os/config.py +126 -0
  268. agno/os/interfaces/__init__.py +1 -0
  269. agno/os/interfaces/a2a/__init__.py +3 -0
  270. agno/os/interfaces/a2a/a2a.py +42 -0
  271. agno/os/interfaces/a2a/router.py +249 -0
  272. agno/os/interfaces/a2a/utils.py +924 -0
  273. agno/os/interfaces/agui/__init__.py +3 -0
  274. agno/os/interfaces/agui/agui.py +47 -0
  275. agno/os/interfaces/agui/router.py +147 -0
  276. agno/os/interfaces/agui/utils.py +574 -0
  277. agno/os/interfaces/base.py +25 -0
  278. agno/os/interfaces/slack/__init__.py +3 -0
  279. agno/os/interfaces/slack/router.py +148 -0
  280. agno/os/interfaces/slack/security.py +30 -0
  281. agno/os/interfaces/slack/slack.py +47 -0
  282. agno/os/interfaces/whatsapp/__init__.py +3 -0
  283. agno/os/interfaces/whatsapp/router.py +210 -0
  284. agno/os/interfaces/whatsapp/security.py +55 -0
  285. agno/os/interfaces/whatsapp/whatsapp.py +36 -0
  286. agno/os/mcp.py +293 -0
  287. agno/os/middleware/__init__.py +9 -0
  288. agno/os/middleware/jwt.py +797 -0
  289. agno/os/router.py +258 -0
  290. agno/os/routers/__init__.py +3 -0
  291. agno/os/routers/agents/__init__.py +3 -0
  292. agno/os/routers/agents/router.py +599 -0
  293. agno/os/routers/agents/schema.py +261 -0
  294. agno/os/routers/evals/__init__.py +3 -0
  295. agno/os/routers/evals/evals.py +450 -0
  296. agno/os/routers/evals/schemas.py +174 -0
  297. agno/os/routers/evals/utils.py +231 -0
  298. agno/os/routers/health.py +31 -0
  299. agno/os/routers/home.py +52 -0
  300. agno/os/routers/knowledge/__init__.py +3 -0
  301. agno/os/routers/knowledge/knowledge.py +1008 -0
  302. agno/os/routers/knowledge/schemas.py +178 -0
  303. agno/os/routers/memory/__init__.py +3 -0
  304. agno/os/routers/memory/memory.py +661 -0
  305. agno/os/routers/memory/schemas.py +88 -0
  306. agno/os/routers/metrics/__init__.py +3 -0
  307. agno/os/routers/metrics/metrics.py +190 -0
  308. agno/os/routers/metrics/schemas.py +47 -0
  309. agno/os/routers/session/__init__.py +3 -0
  310. agno/os/routers/session/session.py +997 -0
  311. agno/os/routers/teams/__init__.py +3 -0
  312. agno/os/routers/teams/router.py +512 -0
  313. agno/os/routers/teams/schema.py +257 -0
  314. agno/os/routers/traces/__init__.py +3 -0
  315. agno/os/routers/traces/schemas.py +414 -0
  316. agno/os/routers/traces/traces.py +499 -0
  317. agno/os/routers/workflows/__init__.py +3 -0
  318. agno/os/routers/workflows/router.py +624 -0
  319. agno/os/routers/workflows/schema.py +75 -0
  320. agno/os/schema.py +534 -0
  321. agno/os/scopes.py +469 -0
  322. agno/{playground → os}/settings.py +7 -15
  323. agno/os/utils.py +973 -0
  324. agno/reasoning/anthropic.py +80 -0
  325. agno/reasoning/azure_ai_foundry.py +67 -0
  326. agno/reasoning/deepseek.py +63 -0
  327. agno/reasoning/default.py +97 -0
  328. agno/reasoning/gemini.py +73 -0
  329. agno/reasoning/groq.py +71 -0
  330. agno/reasoning/helpers.py +24 -1
  331. agno/reasoning/ollama.py +67 -0
  332. agno/reasoning/openai.py +86 -0
  333. agno/reasoning/step.py +2 -1
  334. agno/reasoning/vertexai.py +76 -0
  335. agno/run/__init__.py +6 -0
  336. agno/run/agent.py +822 -0
  337. agno/run/base.py +247 -0
  338. agno/run/cancel.py +81 -0
  339. agno/run/requirement.py +181 -0
  340. agno/run/team.py +767 -0
  341. agno/run/workflow.py +708 -0
  342. agno/session/__init__.py +10 -0
  343. agno/session/agent.py +260 -0
  344. agno/session/summary.py +265 -0
  345. agno/session/team.py +342 -0
  346. agno/session/workflow.py +501 -0
  347. agno/table.py +10 -0
  348. agno/team/__init__.py +37 -0
  349. agno/team/team.py +9536 -0
  350. agno/tools/__init__.py +7 -0
  351. agno/tools/agentql.py +120 -0
  352. agno/tools/airflow.py +22 -12
  353. agno/tools/api.py +122 -0
  354. agno/tools/apify.py +276 -83
  355. agno/tools/{arxiv_toolkit.py → arxiv.py} +20 -12
  356. agno/tools/aws_lambda.py +28 -7
  357. agno/tools/aws_ses.py +66 -0
  358. agno/tools/baidusearch.py +11 -4
  359. agno/tools/bitbucket.py +292 -0
  360. agno/tools/brandfetch.py +213 -0
  361. agno/tools/bravesearch.py +106 -0
  362. agno/tools/brightdata.py +367 -0
  363. agno/tools/browserbase.py +209 -0
  364. agno/tools/calcom.py +32 -23
  365. agno/tools/calculator.py +24 -37
  366. agno/tools/cartesia.py +187 -0
  367. agno/tools/{clickup_tool.py → clickup.py} +17 -28
  368. agno/tools/confluence.py +91 -26
  369. agno/tools/crawl4ai.py +139 -43
  370. agno/tools/csv_toolkit.py +28 -22
  371. agno/tools/dalle.py +36 -22
  372. agno/tools/daytona.py +475 -0
  373. agno/tools/decorator.py +169 -14
  374. agno/tools/desi_vocal.py +23 -11
  375. agno/tools/discord.py +32 -29
  376. agno/tools/docker.py +716 -0
  377. agno/tools/duckdb.py +76 -81
  378. agno/tools/duckduckgo.py +43 -40
  379. agno/tools/e2b.py +703 -0
  380. agno/tools/eleven_labs.py +65 -54
  381. agno/tools/email.py +13 -5
  382. agno/tools/evm.py +129 -0
  383. agno/tools/exa.py +324 -42
  384. agno/tools/fal.py +39 -35
  385. agno/tools/file.py +196 -30
  386. agno/tools/file_generation.py +356 -0
  387. agno/tools/financial_datasets.py +288 -0
  388. agno/tools/firecrawl.py +108 -33
  389. agno/tools/function.py +960 -122
  390. agno/tools/giphy.py +34 -12
  391. agno/tools/github.py +1294 -97
  392. agno/tools/gmail.py +922 -0
  393. agno/tools/google_bigquery.py +117 -0
  394. agno/tools/google_drive.py +271 -0
  395. agno/tools/google_maps.py +253 -0
  396. agno/tools/googlecalendar.py +607 -107
  397. agno/tools/googlesheets.py +377 -0
  398. agno/tools/hackernews.py +20 -12
  399. agno/tools/jina.py +24 -14
  400. agno/tools/jira.py +48 -19
  401. agno/tools/knowledge.py +218 -0
  402. agno/tools/linear.py +82 -43
  403. agno/tools/linkup.py +58 -0
  404. agno/tools/local_file_system.py +15 -7
  405. agno/tools/lumalab.py +41 -26
  406. agno/tools/mcp/__init__.py +10 -0
  407. agno/tools/mcp/mcp.py +331 -0
  408. agno/tools/mcp/multi_mcp.py +347 -0
  409. agno/tools/mcp/params.py +24 -0
  410. agno/tools/mcp_toolbox.py +284 -0
  411. agno/tools/mem0.py +193 -0
  412. agno/tools/memory.py +419 -0
  413. agno/tools/mlx_transcribe.py +11 -9
  414. agno/tools/models/azure_openai.py +190 -0
  415. agno/tools/models/gemini.py +203 -0
  416. agno/tools/models/groq.py +158 -0
  417. agno/tools/models/morph.py +186 -0
  418. agno/tools/models/nebius.py +124 -0
  419. agno/tools/models_labs.py +163 -82
  420. agno/tools/moviepy_video.py +18 -13
  421. agno/tools/nano_banana.py +151 -0
  422. agno/tools/neo4j.py +134 -0
  423. agno/tools/newspaper.py +15 -4
  424. agno/tools/newspaper4k.py +19 -6
  425. agno/tools/notion.py +204 -0
  426. agno/tools/openai.py +181 -17
  427. agno/tools/openbb.py +27 -20
  428. agno/tools/opencv.py +321 -0
  429. agno/tools/openweather.py +233 -0
  430. agno/tools/oxylabs.py +385 -0
  431. agno/tools/pandas.py +25 -15
  432. agno/tools/parallel.py +314 -0
  433. agno/tools/postgres.py +238 -185
  434. agno/tools/pubmed.py +125 -13
  435. agno/tools/python.py +48 -35
  436. agno/tools/reasoning.py +283 -0
  437. agno/tools/reddit.py +207 -29
  438. agno/tools/redshift.py +406 -0
  439. agno/tools/replicate.py +69 -26
  440. agno/tools/resend.py +11 -6
  441. agno/tools/scrapegraph.py +179 -19
  442. agno/tools/searxng.py +23 -31
  443. agno/tools/serpapi.py +15 -10
  444. agno/tools/serper.py +255 -0
  445. agno/tools/shell.py +23 -12
  446. agno/tools/shopify.py +1519 -0
  447. agno/tools/slack.py +56 -14
  448. agno/tools/sleep.py +8 -6
  449. agno/tools/spider.py +35 -11
  450. agno/tools/spotify.py +919 -0
  451. agno/tools/sql.py +34 -19
  452. agno/tools/tavily.py +158 -8
  453. agno/tools/telegram.py +18 -8
  454. agno/tools/todoist.py +218 -0
  455. agno/tools/toolkit.py +134 -9
  456. agno/tools/trafilatura.py +388 -0
  457. agno/tools/trello.py +25 -28
  458. agno/tools/twilio.py +18 -9
  459. agno/tools/user_control_flow.py +78 -0
  460. agno/tools/valyu.py +228 -0
  461. agno/tools/visualization.py +467 -0
  462. agno/tools/webbrowser.py +28 -0
  463. agno/tools/webex.py +76 -0
  464. agno/tools/website.py +23 -19
  465. agno/tools/webtools.py +45 -0
  466. agno/tools/whatsapp.py +286 -0
  467. agno/tools/wikipedia.py +28 -19
  468. agno/tools/workflow.py +285 -0
  469. agno/tools/{twitter.py → x.py} +142 -46
  470. agno/tools/yfinance.py +41 -39
  471. agno/tools/youtube.py +34 -17
  472. agno/tools/zendesk.py +15 -5
  473. agno/tools/zep.py +454 -0
  474. agno/tools/zoom.py +86 -37
  475. agno/tracing/__init__.py +12 -0
  476. agno/tracing/exporter.py +157 -0
  477. agno/tracing/schemas.py +276 -0
  478. agno/tracing/setup.py +111 -0
  479. agno/utils/agent.py +938 -0
  480. agno/utils/audio.py +37 -1
  481. agno/utils/certs.py +27 -0
  482. agno/utils/code_execution.py +11 -0
  483. agno/utils/common.py +103 -20
  484. agno/utils/cryptography.py +22 -0
  485. agno/utils/dttm.py +33 -0
  486. agno/utils/events.py +700 -0
  487. agno/utils/functions.py +107 -37
  488. agno/utils/gemini.py +426 -0
  489. agno/utils/hooks.py +171 -0
  490. agno/utils/http.py +185 -0
  491. agno/utils/json_schema.py +159 -37
  492. agno/utils/knowledge.py +36 -0
  493. agno/utils/location.py +19 -0
  494. agno/utils/log.py +221 -8
  495. agno/utils/mcp.py +214 -0
  496. agno/utils/media.py +335 -14
  497. agno/utils/merge_dict.py +22 -1
  498. agno/utils/message.py +77 -2
  499. agno/utils/models/ai_foundry.py +50 -0
  500. agno/utils/models/claude.py +373 -0
  501. agno/utils/models/cohere.py +94 -0
  502. agno/utils/models/llama.py +85 -0
  503. agno/utils/models/mistral.py +100 -0
  504. agno/utils/models/openai_responses.py +140 -0
  505. agno/utils/models/schema_utils.py +153 -0
  506. agno/utils/models/watsonx.py +41 -0
  507. agno/utils/openai.py +257 -0
  508. agno/utils/pickle.py +1 -1
  509. agno/utils/pprint.py +124 -8
  510. agno/utils/print_response/agent.py +930 -0
  511. agno/utils/print_response/team.py +1914 -0
  512. agno/utils/print_response/workflow.py +1668 -0
  513. agno/utils/prompts.py +111 -0
  514. agno/utils/reasoning.py +108 -0
  515. agno/utils/response.py +163 -0
  516. agno/utils/serialize.py +32 -0
  517. agno/utils/shell.py +4 -4
  518. agno/utils/streamlit.py +487 -0
  519. agno/utils/string.py +204 -51
  520. agno/utils/team.py +139 -0
  521. agno/utils/timer.py +9 -2
  522. agno/utils/tokens.py +657 -0
  523. agno/utils/tools.py +19 -1
  524. agno/utils/whatsapp.py +305 -0
  525. agno/utils/yaml_io.py +3 -3
  526. agno/vectordb/__init__.py +2 -0
  527. agno/vectordb/base.py +87 -9
  528. agno/vectordb/cassandra/__init__.py +5 -1
  529. agno/vectordb/cassandra/cassandra.py +383 -27
  530. agno/vectordb/chroma/__init__.py +4 -0
  531. agno/vectordb/chroma/chromadb.py +748 -83
  532. agno/vectordb/clickhouse/__init__.py +7 -1
  533. agno/vectordb/clickhouse/clickhousedb.py +554 -53
  534. agno/vectordb/couchbase/__init__.py +3 -0
  535. agno/vectordb/couchbase/couchbase.py +1446 -0
  536. agno/vectordb/lancedb/__init__.py +5 -0
  537. agno/vectordb/lancedb/lance_db.py +730 -98
  538. agno/vectordb/langchaindb/__init__.py +5 -0
  539. agno/vectordb/langchaindb/langchaindb.py +163 -0
  540. agno/vectordb/lightrag/__init__.py +5 -0
  541. agno/vectordb/lightrag/lightrag.py +388 -0
  542. agno/vectordb/llamaindex/__init__.py +3 -0
  543. agno/vectordb/llamaindex/llamaindexdb.py +166 -0
  544. agno/vectordb/milvus/__init__.py +3 -0
  545. agno/vectordb/milvus/milvus.py +966 -78
  546. agno/vectordb/mongodb/__init__.py +9 -1
  547. agno/vectordb/mongodb/mongodb.py +1175 -172
  548. agno/vectordb/pgvector/__init__.py +8 -0
  549. agno/vectordb/pgvector/pgvector.py +599 -115
  550. agno/vectordb/pineconedb/__init__.py +5 -1
  551. agno/vectordb/pineconedb/pineconedb.py +406 -43
  552. agno/vectordb/qdrant/__init__.py +4 -0
  553. agno/vectordb/qdrant/qdrant.py +914 -61
  554. agno/vectordb/redis/__init__.py +9 -0
  555. agno/vectordb/redis/redisdb.py +682 -0
  556. agno/vectordb/singlestore/__init__.py +8 -1
  557. agno/vectordb/singlestore/singlestore.py +771 -0
  558. agno/vectordb/surrealdb/__init__.py +3 -0
  559. agno/vectordb/surrealdb/surrealdb.py +663 -0
  560. agno/vectordb/upstashdb/__init__.py +5 -0
  561. agno/vectordb/upstashdb/upstashdb.py +718 -0
  562. agno/vectordb/weaviate/__init__.py +8 -0
  563. agno/vectordb/weaviate/index.py +15 -0
  564. agno/vectordb/weaviate/weaviate.py +1009 -0
  565. agno/workflow/__init__.py +23 -1
  566. agno/workflow/agent.py +299 -0
  567. agno/workflow/condition.py +759 -0
  568. agno/workflow/loop.py +756 -0
  569. agno/workflow/parallel.py +853 -0
  570. agno/workflow/router.py +723 -0
  571. agno/workflow/step.py +1564 -0
  572. agno/workflow/steps.py +613 -0
  573. agno/workflow/types.py +556 -0
  574. agno/workflow/workflow.py +4327 -514
  575. agno-2.3.13.dist-info/METADATA +639 -0
  576. agno-2.3.13.dist-info/RECORD +613 -0
  577. {agno-0.1.2.dist-info → agno-2.3.13.dist-info}/WHEEL +1 -1
  578. agno-2.3.13.dist-info/licenses/LICENSE +201 -0
  579. agno/api/playground.py +0 -91
  580. agno/api/schemas/playground.py +0 -22
  581. agno/api/schemas/user.py +0 -22
  582. agno/api/schemas/workspace.py +0 -46
  583. agno/api/user.py +0 -160
  584. agno/api/workspace.py +0 -151
  585. agno/cli/auth_server.py +0 -118
  586. agno/cli/config.py +0 -275
  587. agno/cli/console.py +0 -88
  588. agno/cli/credentials.py +0 -23
  589. agno/cli/entrypoint.py +0 -571
  590. agno/cli/operator.py +0 -355
  591. agno/cli/settings.py +0 -85
  592. agno/cli/ws/ws_cli.py +0 -817
  593. agno/constants.py +0 -13
  594. agno/document/__init__.py +0 -1
  595. agno/document/chunking/semantic.py +0 -47
  596. agno/document/chunking/strategy.py +0 -31
  597. agno/document/reader/__init__.py +0 -1
  598. agno/document/reader/arxiv_reader.py +0 -41
  599. agno/document/reader/base.py +0 -22
  600. agno/document/reader/csv_reader.py +0 -84
  601. agno/document/reader/docx_reader.py +0 -46
  602. agno/document/reader/firecrawl_reader.py +0 -99
  603. agno/document/reader/json_reader.py +0 -43
  604. agno/document/reader/pdf_reader.py +0 -219
  605. agno/document/reader/s3/pdf_reader.py +0 -46
  606. agno/document/reader/s3/text_reader.py +0 -51
  607. agno/document/reader/text_reader.py +0 -41
  608. agno/document/reader/website_reader.py +0 -175
  609. agno/document/reader/youtube_reader.py +0 -50
  610. agno/embedder/__init__.py +0 -1
  611. agno/embedder/azure_openai.py +0 -86
  612. agno/embedder/cohere.py +0 -72
  613. agno/embedder/fastembed.py +0 -37
  614. agno/embedder/google.py +0 -73
  615. agno/embedder/huggingface.py +0 -54
  616. agno/embedder/mistral.py +0 -80
  617. agno/embedder/ollama.py +0 -57
  618. agno/embedder/openai.py +0 -74
  619. agno/embedder/sentence_transformer.py +0 -38
  620. agno/embedder/voyageai.py +0 -64
  621. agno/eval/perf.py +0 -201
  622. agno/file/__init__.py +0 -1
  623. agno/file/file.py +0 -16
  624. agno/file/local/csv.py +0 -32
  625. agno/file/local/txt.py +0 -19
  626. agno/infra/app.py +0 -240
  627. agno/infra/base.py +0 -144
  628. agno/infra/context.py +0 -20
  629. agno/infra/db_app.py +0 -52
  630. agno/infra/resource.py +0 -205
  631. agno/infra/resources.py +0 -55
  632. agno/knowledge/agent.py +0 -230
  633. agno/knowledge/arxiv.py +0 -22
  634. agno/knowledge/combined.py +0 -22
  635. agno/knowledge/csv.py +0 -28
  636. agno/knowledge/csv_url.py +0 -19
  637. agno/knowledge/document.py +0 -20
  638. agno/knowledge/docx.py +0 -30
  639. agno/knowledge/json.py +0 -28
  640. agno/knowledge/langchain.py +0 -71
  641. agno/knowledge/llamaindex.py +0 -66
  642. agno/knowledge/pdf.py +0 -28
  643. agno/knowledge/pdf_url.py +0 -26
  644. agno/knowledge/s3/base.py +0 -60
  645. agno/knowledge/s3/pdf.py +0 -21
  646. agno/knowledge/s3/text.py +0 -23
  647. agno/knowledge/text.py +0 -30
  648. agno/knowledge/website.py +0 -88
  649. agno/knowledge/wikipedia.py +0 -31
  650. agno/knowledge/youtube.py +0 -22
  651. agno/memory/agent.py +0 -392
  652. agno/memory/classifier.py +0 -104
  653. agno/memory/db/__init__.py +0 -1
  654. agno/memory/db/base.py +0 -42
  655. agno/memory/db/mongodb.py +0 -189
  656. agno/memory/db/postgres.py +0 -203
  657. agno/memory/db/sqlite.py +0 -193
  658. agno/memory/memory.py +0 -15
  659. agno/memory/row.py +0 -36
  660. agno/memory/summarizer.py +0 -192
  661. agno/memory/summary.py +0 -19
  662. agno/memory/workflow.py +0 -38
  663. agno/models/google/gemini_openai.py +0 -26
  664. agno/models/ollama/hermes.py +0 -221
  665. agno/models/ollama/tools.py +0 -362
  666. agno/models/vertexai/gemini.py +0 -595
  667. agno/playground/__init__.py +0 -3
  668. agno/playground/async_router.py +0 -421
  669. agno/playground/deploy.py +0 -249
  670. agno/playground/operator.py +0 -92
  671. agno/playground/playground.py +0 -91
  672. agno/playground/schemas.py +0 -76
  673. agno/playground/serve.py +0 -55
  674. agno/playground/sync_router.py +0 -405
  675. agno/reasoning/agent.py +0 -68
  676. agno/run/response.py +0 -112
  677. agno/storage/agent/__init__.py +0 -0
  678. agno/storage/agent/base.py +0 -38
  679. agno/storage/agent/dynamodb.py +0 -350
  680. agno/storage/agent/json.py +0 -92
  681. agno/storage/agent/mongodb.py +0 -228
  682. agno/storage/agent/postgres.py +0 -367
  683. agno/storage/agent/session.py +0 -79
  684. agno/storage/agent/singlestore.py +0 -303
  685. agno/storage/agent/sqlite.py +0 -357
  686. agno/storage/agent/yaml.py +0 -93
  687. agno/storage/workflow/__init__.py +0 -0
  688. agno/storage/workflow/base.py +0 -40
  689. agno/storage/workflow/mongodb.py +0 -233
  690. agno/storage/workflow/postgres.py +0 -366
  691. agno/storage/workflow/session.py +0 -60
  692. agno/storage/workflow/sqlite.py +0 -359
  693. agno/tools/googlesearch.py +0 -88
  694. agno/utils/defaults.py +0 -57
  695. agno/utils/filesystem.py +0 -39
  696. agno/utils/git.py +0 -52
  697. agno/utils/json_io.py +0 -30
  698. agno/utils/load_env.py +0 -19
  699. agno/utils/py_io.py +0 -19
  700. agno/utils/pyproject.py +0 -18
  701. agno/utils/resource_filter.py +0 -31
  702. agno/vectordb/singlestore/s2vectordb.py +0 -390
  703. agno/vectordb/singlestore/s2vectordb2.py +0 -355
  704. agno/workspace/__init__.py +0 -0
  705. agno/workspace/config.py +0 -325
  706. agno/workspace/enums.py +0 -6
  707. agno/workspace/helpers.py +0 -48
  708. agno/workspace/operator.py +0 -758
  709. agno/workspace/settings.py +0 -63
  710. agno-0.1.2.dist-info/LICENSE +0 -375
  711. agno-0.1.2.dist-info/METADATA +0 -502
  712. agno-0.1.2.dist-info/RECORD +0 -352
  713. agno-0.1.2.dist-info/entry_points.txt +0 -3
  714. /agno/{cli → db/migrations}/__init__.py +0 -0
  715. /agno/{cli/ws → db/migrations/versions}/__init__.py +0 -0
  716. /agno/{document/chunking/__init__.py → db/schemas/metrics.py} +0 -0
  717. /agno/{document/reader/s3 → integrations}/__init__.py +0 -0
  718. /agno/{file/local → knowledge/chunking}/__init__.py +0 -0
  719. /agno/{infra → knowledge/remote_content}/__init__.py +0 -0
  720. /agno/{knowledge/s3 → tools/models}/__init__.py +0 -0
  721. /agno/{reranker → utils/models}/__init__.py +0 -0
  722. /agno/{storage → utils/print_response}/__init__.py +0 -0
  723. {agno-0.1.2.dist-info → agno-2.3.13.dist-info}/top_level.txt +0 -0
@@ -1,12 +1,16 @@
1
+ import asyncio
1
2
  import time
2
- from typing import Any, Dict, List, Optional
3
+ from typing import Any, Dict, List, Optional, Union
3
4
 
4
- from agno.document import Document
5
- from agno.embedder import Embedder
6
- from agno.embedder.openai import OpenAIEmbedder
7
- from agno.utils.log import logger
5
+ from bson import ObjectId
6
+
7
+ from agno.filters import FilterExpr
8
+ from agno.knowledge.document import Document
9
+ from agno.knowledge.embedder import Embedder
10
+ from agno.utils.log import log_debug, log_info, log_warning, logger
8
11
  from agno.vectordb.base import VectorDb
9
12
  from agno.vectordb.distance import Distance
13
+ from agno.vectordb.search import SearchType
10
14
 
11
15
  try:
12
16
  from hashlib import md5
@@ -14,7 +18,7 @@ try:
14
18
  except ImportError:
15
19
  raise ImportError("`hashlib` not installed. Please install using `pip install hashlib`")
16
20
  try:
17
- from pymongo import MongoClient, errors
21
+ from pymongo import AsyncMongoClient, MongoClient, errors
18
22
  from pymongo.collection import Collection
19
23
  from pymongo.operations import SearchIndexModel
20
24
 
@@ -22,7 +26,7 @@ except ImportError:
22
26
  raise ImportError("`pymongo` not installed. Please install using `pip install pymongo`")
23
27
 
24
28
 
25
- class MongoDBVector(VectorDb):
29
+ class MongoDb(VectorDb):
26
30
  """
27
31
  MongoDB Vector Database implementation with elegant handling of Atlas Search index creation.
28
32
  """
@@ -30,215 +34,536 @@ class MongoDBVector(VectorDb):
30
34
  def __init__(
31
35
  self,
32
36
  collection_name: str,
37
+ name: Optional[str] = None,
38
+ description: Optional[str] = None,
39
+ id: Optional[str] = None,
33
40
  db_url: Optional[str] = "mongodb://localhost:27017/",
34
- database: str = "ai",
35
- embedder: Embedder = OpenAIEmbedder(),
41
+ database: str = "agno",
42
+ embedder: Optional[Embedder] = None,
36
43
  distance_metric: str = Distance.cosine,
37
44
  overwrite: bool = False,
38
- wait_until_index_ready: Optional[float] = None,
39
- wait_after_insert: Optional[float] = None,
45
+ wait_until_index_ready_in_seconds: Optional[float] = 3,
46
+ wait_after_insert_in_seconds: Optional[float] = 3,
47
+ max_pool_size: int = 100,
48
+ retry_writes: bool = True,
49
+ client: Optional[MongoClient] = None,
50
+ search_index_name: Optional[str] = "vector_index_1",
51
+ cosmos_compatibility: Optional[bool] = False,
52
+ search_type: SearchType = SearchType.vector,
53
+ hybrid_vector_weight: float = 0.5,
54
+ hybrid_keyword_weight: float = 0.5,
55
+ hybrid_rank_constant: int = 60,
40
56
  **kwargs,
41
57
  ):
42
58
  """
43
- Initialize the MongoDBVector with MongoDB collection details.
59
+ Initialize the MongoDb with MongoDB collection details.
44
60
 
45
61
  Args:
46
62
  collection_name (str): Name of the MongoDB collection.
63
+ name (Optional[str]): Name of the vector database.
64
+ description (Optional[str]): Description of the vector database.
47
65
  db_url (Optional[str]): MongoDB connection string.
48
66
  database (str): Database name.
49
67
  embedder (Embedder): Embedder instance for generating embeddings.
50
68
  distance_metric (str): Distance metric for similarity.
51
69
  overwrite (bool): Overwrite existing collection and index if True.
52
- wait_until_index_ready (float): Time in seconds to wait until the index is ready.
70
+ wait_until_index_ready_in_seconds (float): Time in seconds to wait until the index is ready.
71
+ wait_after_insert_in_seconds (float): Time in seconds to wait after inserting documents.
72
+ max_pool_size (int): Maximum number of connections in the connection pool
73
+ retry_writes (bool): Whether to retry write operations
74
+ client (Optional[MongoClient]): An existing MongoClient instance.
75
+ search_index_name (str): Name of the search index (default: "vector_index_1")
76
+ cosmos_compatibility (bool): Whether to use Azure Cosmos DB Mongovcore compatibility mode.
77
+ search_type: The search type to use when searching for documents.
78
+ hybrid_vector_weight (float): Default weight for vector search results in hybrid search.
79
+ hybrid_keyword_weight (float): Default weight for keyword search results in hybrid search.
80
+ hybrid_rank_constant (int): Default rank constant (k) for Reciprocal Rank Fusion in hybrid search. This constant is added to the rank before taking the reciprocal, helping to smooth scores. A common value is 60.
53
81
  **kwargs: Additional arguments for MongoClient.
54
82
  """
83
+ # Validate required parameters
55
84
  if not collection_name:
56
85
  raise ValueError("Collection name must not be empty.")
86
+ if not database:
87
+ raise ValueError("Database name must not be empty.")
88
+
89
+ # Dynamic ID generation based on unique identifiers
90
+ if id is None:
91
+ from agno.utils.string import generate_id
92
+
93
+ connection_identifier = db_url or "mongodb://localhost:27017/"
94
+ seed = f"{connection_identifier}#{database}#{collection_name}"
95
+ id = generate_id(seed)
96
+
57
97
  self.collection_name = collection_name
98
+ # Initialize base class with name, description, and generated ID
99
+ super().__init__(id=id, name=name, description=description)
100
+
58
101
  self.database = database
102
+ self.search_index_name = search_index_name
103
+ self.cosmos_compatibility = cosmos_compatibility
104
+ self.search_type = search_type
105
+ self.hybrid_vector_weight = hybrid_vector_weight
106
+ self.hybrid_keyword_weight = hybrid_keyword_weight
107
+ self.hybrid_rank_constant = hybrid_rank_constant
108
+
109
+ if embedder is None:
110
+ from agno.knowledge.embedder.openai import OpenAIEmbedder
111
+
112
+ embedder = OpenAIEmbedder()
113
+ log_info("Embedder not provided, using OpenAIEmbedder as default.")
59
114
  self.embedder = embedder
115
+
60
116
  self.distance_metric = distance_metric
61
117
  self.connection_string = db_url
62
118
  self.overwrite = overwrite
63
- self.wait_until_index_ready = wait_until_index_ready
64
- self.wait_after_insert = wait_after_insert
119
+ self.wait_until_index_ready_in_seconds = wait_until_index_ready_in_seconds
120
+ self.wait_after_insert_in_seconds = wait_after_insert_in_seconds
65
121
  self.kwargs = kwargs
122
+ self.kwargs.update(
123
+ {
124
+ "maxPoolSize": max_pool_size,
125
+ "retryWrites": retry_writes,
126
+ "serverSelectionTimeoutMS": 5000, # 5 second timeout
127
+ }
128
+ )
129
+
130
+ self._client = client
131
+ self._db = None
132
+ self._collection: Optional[Collection] = None
66
133
 
67
- self._client = self._get_client()
68
- self._db = self._client[self.database]
69
- self._collection = self._get_or_create_collection()
134
+ self._async_client: Optional[AsyncMongoClient] = None
135
+ self._async_db = None
136
+ self._async_collection: Optional[Collection] = None
70
137
 
71
138
  def _get_client(self) -> MongoClient:
72
139
  """Create or retrieve the MongoDB client."""
73
- try:
74
- logger.debug("Creating MongoDB Client")
75
- client: MongoClient = MongoClient(self.connection_string, **self.kwargs)
76
- # Trigger a connection to verify the client
77
- client.admin.command("ping")
78
- logger.info("Connected to MongoDB successfully.")
79
- return client
80
- except errors.ConnectionFailure as e:
81
- logger.error(f"Failed to connect to MongoDB: {e}")
82
- raise ConnectionError(f"Failed to connect to MongoDB: {e}")
83
- except Exception as e:
84
- logger.error(f"An error occurred while connecting to MongoDB: {e}")
85
- raise
140
+ if self._client is None:
141
+ if self.cosmos_compatibility:
142
+ try:
143
+ log_debug("Creating MongoDB Client for Azure Cosmos DB")
144
+ # Cosmos DB specific settings
145
+ cosmos_kwargs = {
146
+ "retryWrites": False,
147
+ "ssl": True,
148
+ "tlsAllowInvalidCertificates": True,
149
+ "maxPoolSize": 100,
150
+ "maxIdleTimeMS": 30000,
151
+ }
152
+
153
+ # Suppress UserWarning about CosmosDB
154
+ import warnings
155
+
156
+ with warnings.catch_warnings():
157
+ warnings.filterwarnings(
158
+ "ignore", category=UserWarning, message=".*connected to a CosmosDB cluster.*"
159
+ )
160
+ self._client = MongoClient(self.connection_string, **cosmos_kwargs) # type: ignore
161
+
162
+ self._client.admin.command("ping")
163
+
164
+ log_info("Connected to Azure Cosmos DB successfully.")
165
+ self._db = self._client.get_database(self.database) # type: ignore
166
+ log_info(f"Using database: {self.database}")
167
+
168
+ except errors.ConnectionFailure as e:
169
+ raise ConnectionError(f"Failed to connect to Azure Cosmos DB: {e}")
170
+ except Exception as e:
171
+ logger.error(f"An error occurred while connecting to Azure Cosmos DB: {e}")
172
+ raise
173
+ else:
174
+ try:
175
+ log_debug("Creating MongoDB Client")
176
+ self._client = MongoClient(self.connection_string, **self.kwargs)
177
+ # Trigger a connection to verify the client
178
+ self._client.admin.command("ping")
179
+ log_info("Connected to MongoDB successfully.")
180
+ self._db = self._client[self.database] # type: ignore
181
+ except errors.ConnectionFailure as e:
182
+ logger.error(f"Failed to connect to MongoDB: {e}")
183
+ raise ConnectionError(f"Failed to connect to MongoDB: {e}")
184
+ except Exception as e:
185
+ logger.error(f"An error occurred while connecting to MongoDB: {e}")
186
+ raise
187
+ return self._client
188
+
189
+ async def _get_async_client(self) -> AsyncMongoClient:
190
+ """Create or retrieve the async MongoDB client."""
191
+ if self._async_client is None:
192
+ log_debug("Creating Async MongoDB Client")
193
+ self._async_client = AsyncMongoClient(
194
+ self.connection_string,
195
+ maxPoolSize=self.kwargs.get("maxPoolSize", 100),
196
+ retryWrites=self.kwargs.get("retryWrites", True),
197
+ serverSelectionTimeoutMS=5000,
198
+ )
199
+ # Verify connection
200
+ try:
201
+ await self._async_client.admin.command("ping")
202
+ log_info("Connected to MongoDB asynchronously.")
203
+ except Exception as e:
204
+ logger.error(f"Failed to connect to MongoDB asynchronously: {e}")
205
+ raise
206
+ return self._async_client
86
207
 
87
208
  def _get_or_create_collection(self) -> Collection:
88
209
  """Get or create the MongoDB collection, handling Atlas Search index creation."""
89
-
90
- self._collection = self._db[self.collection_name]
210
+ self._collection = self._db[self.collection_name] # type: ignore
91
211
 
92
212
  if not self.collection_exists():
93
- logger.info(f"Creating collection '{self.collection_name}'.")
94
- self._db.create_collection(self.collection_name)
213
+ log_info(f"Creating collection '{self.collection_name}'.")
214
+ self._db.create_collection(self.collection_name) # type: ignore
95
215
  self._create_search_index()
96
216
  else:
97
- logger.info(f"Using existing collection '{self.collection_name}'.")
217
+ log_info(f"Using existing collection '{self.collection_name}'.")
98
218
  # check if index exists
99
- logger.info(f"Checking if search index '{self.collection_name}' exists.")
219
+ log_info(f"Checking if search index '{self.collection_name}' exists.")
100
220
  if not self._search_index_exists():
101
- logger.info(f"Search index '{self.collection_name}' does not exist. Creating it.")
221
+ log_info(f"Search index '{self.collection_name}' does not exist. Creating it.")
102
222
  self._create_search_index()
103
- if self.wait_until_index_ready:
223
+ if self.wait_until_index_ready_in_seconds and not self.cosmos_compatibility:
104
224
  self._wait_for_index_ready()
225
+ else:
226
+ log_info("Using existing vector search index.")
227
+ return self._collection # type: ignore
228
+
229
+ def _get_collection(self) -> Collection:
230
+ """Get or create the MongoDB collection."""
231
+ if self._collection is None:
232
+ if self._client is None:
233
+ self._get_client()
234
+ self._collection = self._db[self.collection_name] # type: ignore
235
+ log_info(f"Using collection: {self.collection_name}")
105
236
  return self._collection
106
237
 
238
+ async def _get_async_collection(self):
239
+ """Get or create the async MongoDB collection."""
240
+ if self._async_collection is None:
241
+ client = await self._get_async_client()
242
+ self._async_db = client[self.database] # type: ignore
243
+ self._async_collection = self._async_db[self.collection_name] # type: ignore
244
+ return self._async_collection
245
+
107
246
  def _create_search_index(self, overwrite: bool = True) -> None:
108
- """Create or overwrite the Atlas Search index."""
109
- index_name = "vector_index_1"
110
- try:
111
- if overwrite and self._search_index_exists():
112
- logger.info(f"Dropping existing search index '{index_name}'.")
113
- self._collection.drop_search_index(index_name)
247
+ """Create or overwrite the Atlas Search index with proper error handling."""
248
+ index_name = self.search_index_name or "vector_index_1"
249
+ max_retries = 3
250
+ retry_delay = 5
114
251
 
115
- logger.info(f"Creating search index '{index_name}'.")
252
+ if self.cosmos_compatibility:
253
+ try:
254
+ collection = self._get_collection()
116
255
 
117
- search_index_model = SearchIndexModel(
118
- definition={
119
- "fields": [
120
- {
121
- "type": "vector",
122
- "numDimensions": 1536,
123
- "path": "embedding",
124
- "similarity": self.distance_metric, # cosine
256
+ # Handle overwrite if requested
257
+ if overwrite and index_name in collection.index_information():
258
+ log_info(f"Dropping existing index '{index_name}'")
259
+ collection.drop_index(index_name)
260
+
261
+ embedding_dim = getattr(self.embedder, "dimensions", 1536)
262
+ log_info(f"Creating vector search index '{index_name}'")
263
+
264
+ # Create vector search index using Cosmos DB IVF format
265
+ collection.create_index(
266
+ [("embedding", "cosmosSearch")],
267
+ name=index_name,
268
+ cosmosSearchOptions={
269
+ "kind": "vector-ivf",
270
+ "numLists": 1,
271
+ "dimensions": embedding_dim,
272
+ "similarity": self._get_cosmos_similarity_metric(),
273
+ },
274
+ )
275
+
276
+ log_info(f"Created vector search index '{index_name}' successfully")
277
+
278
+ except Exception as e:
279
+ logger.error(f"Error creating vector search index: {e}")
280
+ raise
281
+ else:
282
+ for attempt in range(max_retries):
283
+ try:
284
+ if overwrite and self._search_index_exists():
285
+ log_info(f"Dropping existing search index '{index_name}'.")
286
+ try:
287
+ collection = self._get_collection()
288
+ collection.drop_search_index(index_name)
289
+ # Wait longer after index deletion
290
+ time.sleep(retry_delay * 2)
291
+ except errors.OperationFailure as e:
292
+ if "Index already requested to be deleted" in str(e):
293
+ log_info("Index is already being deleted, waiting...")
294
+ time.sleep(retry_delay * 2) # Wait longer for deletion to complete
295
+ else:
296
+ raise
297
+
298
+ # Verify index is gone before creating new one
299
+ retries = 3
300
+ while retries > 0 and self._search_index_exists():
301
+ log_info("Waiting for index deletion to complete...")
302
+ time.sleep(retry_delay)
303
+ retries -= 1
304
+
305
+ log_info(f"Creating search index '{index_name}'.")
306
+
307
+ # Get embedding dimension from embedder
308
+ embedding_dim = getattr(self.embedder, "dimensions", 1536)
309
+
310
+ search_index_model = SearchIndexModel(
311
+ definition={
312
+ "fields": [
313
+ {
314
+ "type": "vector",
315
+ "numDimensions": embedding_dim,
316
+ "path": "embedding",
317
+ "similarity": self.distance_metric,
318
+ },
319
+ ]
125
320
  },
126
- ]
127
- },
128
- name=index_name,
129
- type="vectorSearch",
130
- )
321
+ name=index_name,
322
+ type="vectorSearch",
323
+ )
131
324
 
132
- # Create the Atlas Search index
133
- self._collection.create_search_index(model=search_index_model)
134
- logger.info(f"Search index '{index_name}' created successfully.")
135
- except errors.OperationFailure as e:
136
- logger.error(f"Failed to create search index: {e}")
137
- raise
325
+ collection = self._get_collection()
326
+ collection.create_search_index(model=search_index_model)
327
+
328
+ if self.wait_until_index_ready_in_seconds:
329
+ self._wait_for_index_ready()
330
+
331
+ log_info(f"Search index '{index_name}' created successfully.")
332
+ return
333
+
334
+ except errors.OperationFailure as e:
335
+ if "Duplicate Index" in str(e) and attempt < max_retries - 1:
336
+ logger.warning(f"Index already exists, retrying... (attempt {attempt + 1})")
337
+ time.sleep(retry_delay * (attempt + 1))
338
+ continue
339
+ logger.error(f"Failed to create search index: {e}")
340
+ raise
341
+ except Exception as e:
342
+ logger.error(f"Unexpected error creating search index: {e}")
343
+ raise
344
+
345
+ async def _create_search_index_async(self) -> None:
346
+ """Create the Atlas Search index asynchronously."""
347
+ index_name = self.search_index_name
348
+ max_retries = 3
349
+ retry_delay = 5
350
+
351
+ for attempt in range(max_retries):
352
+ try:
353
+ collection = await self._get_async_collection()
354
+
355
+ # Get embedding dimension from embedder
356
+ embedding_dim = getattr(self.embedder, "dimensions", 1536)
357
+
358
+ search_index_model = SearchIndexModel(
359
+ definition={
360
+ "fields": [
361
+ {
362
+ "type": "vector",
363
+ "numDimensions": embedding_dim,
364
+ "path": "embedding",
365
+ "similarity": self.distance_metric,
366
+ },
367
+ ]
368
+ },
369
+ name=index_name,
370
+ type="vectorSearch",
371
+ )
372
+
373
+ await collection.create_search_index(model=search_index_model)
374
+ log_info(f"Search index '{index_name}' created successfully.")
375
+ return
376
+
377
+ except Exception as e:
378
+ if attempt < max_retries - 1:
379
+ await asyncio.sleep(retry_delay * (attempt + 1))
380
+ continue
381
+ logger.error(f"Failed to create search index: {e}")
382
+ raise
138
383
 
139
384
  def _search_index_exists(self) -> bool:
140
385
  """Check if the search index exists."""
141
- index_name = "vector_index_1"
142
- try:
143
- indexes = list(self._collection.list_search_indexes())
144
- exists = any(index["name"] == index_name for index in indexes)
145
- return exists
146
- except Exception as e:
147
- logger.error(f"Error checking search index existence: {e}")
148
- return False
386
+ index_name = self.search_index_name
387
+ if self.cosmos_compatibility:
388
+ index_name = self.search_index_name or "vector_index_1"
389
+ try:
390
+ collection = self._get_collection()
391
+ indexes = collection.index_information()
392
+
393
+ for idx_name, idx_info in indexes.items():
394
+ if idx_name == index_name:
395
+ key_info = idx_info.get("key", [])
396
+ for key_value_pair in key_info:
397
+ # Ensure we have a tuple/list with exactly 2 elements
398
+ if isinstance(key_value_pair, (tuple, list)) and len(key_value_pair) == 2:
399
+ key, value = key_value_pair
400
+ if key == "embedding" and value == "cosmosSearch":
401
+ log_debug(f"Found existing vector search index: {index_name}")
402
+ return True
403
+
404
+ log_debug(f"Vector search index '{index_name}' not found")
405
+ return False
406
+ except Exception as e:
407
+ logger.error(f"Error checking search index existence: {e}")
408
+ return False
409
+ else:
410
+ try:
411
+ collection = self._get_collection()
412
+ indexes = list(collection.list_search_indexes()) # type: ignore
413
+ exists = any(index["name"] == index_name for index in indexes) # type: ignore
414
+ return exists
415
+ except Exception as e:
416
+ logger.error(f"Error checking search index existence: {e}")
417
+ return False
149
418
 
150
419
  def _wait_for_index_ready(self) -> None:
151
420
  """Wait until the Atlas Search index is ready."""
152
- start_time = time.time()
153
- index_name = "vector_index_1"
421
+ index_name = self.search_index_name
154
422
  while True:
155
423
  try:
156
424
  if self._search_index_exists():
157
- logger.info(f"Search index '{index_name}' is ready.")
425
+ log_info(f"Search index '{index_name}' is ready.")
158
426
  break
159
427
  except Exception as e:
160
428
  logger.error(f"Error checking index status: {e}")
161
- if time.time() - start_time > self.wait_until_index_ready: # type: ignore
162
429
  raise TimeoutError("Timeout waiting for search index to become ready.")
163
430
  time.sleep(1)
164
431
 
432
+ async def _wait_for_index_ready_async(self) -> None:
433
+ """Wait until the Atlas Search index is ready asynchronously."""
434
+ start_time = time.time()
435
+ index_name = self.search_index_name
436
+ while True:
437
+ try:
438
+ collection = await self._get_async_collection()
439
+ indexes = await collection.list_search_indexes()
440
+ if any(index["name"] == index_name for index in indexes):
441
+ log_info(f"Search index '{index_name}' is ready.")
442
+ break
443
+ except Exception as e:
444
+ logger.error(f"Error checking index status asynchronously: {e}")
445
+ import traceback
446
+
447
+ logger.error(f"Traceback: {traceback.format_exc()}")
448
+
449
+ if time.time() - start_time > self.wait_until_index_ready_in_seconds: # type: ignore
450
+ raise TimeoutError("Timeout waiting for search index to become ready.")
451
+ await asyncio.sleep(1)
452
+
165
453
  def collection_exists(self) -> bool:
166
454
  """Check if the collection exists in the database."""
167
- return self.collection_name in self._db.list_collection_names()
455
+ if self._db is None:
456
+ self._get_client()
457
+ return self.collection_name in self._db.list_collection_names() # type: ignore
168
458
 
169
459
  def create(self) -> None:
170
460
  """Create the MongoDB collection and indexes if they do not exist."""
171
461
  self._get_or_create_collection()
172
462
 
173
- def doc_exists(self, document: Document) -> bool:
174
- """Check if a document exists in the MongoDB collection based on its content."""
175
- doc_id = md5(document.content.encode("utf-8")).hexdigest()
176
- try:
177
- exists = self._collection.find_one({"_id": doc_id}) is not None
178
- logger.debug(f"Document {'exists' if exists else 'does not exist'}: {doc_id}")
179
- return exists
180
- except Exception as e:
181
- logger.error(f"Error checking document existence: {e}")
182
- return False
463
+ async def async_create(self) -> None:
464
+ """Create the MongoDB collection and indexes asynchronously."""
465
+ await self._get_async_collection()
466
+
467
+ if not await self.async_exists():
468
+ log_info(f"Creating collection '{self.collection_name}' asynchronously.")
469
+ await self._async_db.create_collection(self.collection_name) # type: ignore
470
+ await self._create_search_index_async()
471
+ if self.wait_until_index_ready_in_seconds:
472
+ await self._wait_for_index_ready_async()
183
473
 
184
474
  def name_exists(self, name: str) -> bool:
185
475
  """Check if a document with a given name exists in the collection."""
186
476
  try:
187
- exists = self._collection.find_one({"name": name}) is not None
188
- logger.debug(f"Document with name '{name}' {'exists' if exists else 'does not exist'}")
477
+ collection = self._get_collection()
478
+ exists = collection.find_one({"name": name}) is not None
479
+ log_debug(f"Document with name '{name}' {'exists' if exists else 'does not exist'}")
189
480
  return exists
190
481
  except Exception as e:
191
482
  logger.error(f"Error checking document name existence: {e}")
192
483
  return False
193
484
 
194
485
  def id_exists(self, id: str) -> bool:
195
- """Check if a document with a given ID exists in the collection."""
486
+ """Check if a document with the given ID exists in the collection.
487
+
488
+ Args:
489
+ id (str): The document ID to check.
490
+
491
+ Returns:
492
+ bool: True if the document exists, False otherwise.
493
+ """
196
494
  try:
197
- exists = self._collection.find_one({"_id": id}) is not None
198
- logger.debug(f"Document with ID '{id}' {'exists' if exists else 'does not exist'}")
495
+ collection = self._get_collection()
496
+ result = collection.find_one({"_id": id})
497
+ exists = result is not None
498
+ log_debug(f"Document with ID '{id}' {'exists' if exists else 'does not exist'}")
199
499
  return exists
200
500
  except Exception as e:
201
501
  logger.error(f"Error checking document ID existence: {e}")
202
502
  return False
203
503
 
204
- def insert(self, documents: List[Document], filters: Optional[Dict[str, Any]] = None) -> None:
504
+ def content_hash_exists(self, content_hash: str) -> bool:
505
+ """Check if documents with the given content hash exist in the collection.
506
+
507
+ Args:
508
+ content_hash (str): The content hash to check.
509
+
510
+ Returns:
511
+ bool: True if documents with the content hash exist, False otherwise.
512
+ """
513
+ try:
514
+ collection = self._get_collection()
515
+ result = collection.find_one({"content_hash": content_hash})
516
+ exists = result is not None
517
+ log_debug(f"Document with content_hash '{content_hash}' {'exists' if exists else 'does not exist'}")
518
+ return exists
519
+ except Exception as e:
520
+ logger.error(f"Error checking content_hash existence: {e}")
521
+ return False
522
+
523
+ def insert(self, content_hash: str, documents: List[Document], filters: Optional[Dict[str, Any]] = None) -> None:
205
524
  """Insert documents into the MongoDB collection."""
206
- logger.info(f"Inserting {len(documents)} documents")
525
+ log_debug(f"Inserting {len(documents)} documents")
526
+ collection = self._get_collection()
207
527
 
208
528
  prepared_docs = []
209
529
  for document in documents:
210
530
  try:
211
- doc_data = self.prepare_doc(document)
531
+ document.embed(embedder=self.embedder)
532
+ if document.embedding is None:
533
+ raise ValueError(f"Failed to generate embedding for document: {document.id}")
534
+ doc_data = self.prepare_doc(content_hash, document, filters)
212
535
  prepared_docs.append(doc_data)
213
536
  except ValueError as e:
214
537
  logger.error(f"Error preparing document '{document.name}': {e}")
215
538
 
216
539
  if prepared_docs:
217
540
  try:
218
- self._collection.insert_many(prepared_docs, ordered=False)
219
- logger.info(f"Inserted {len(prepared_docs)} documents successfully.")
220
- # lets wait for 5 minutes.... just in case
221
- # feel free to 'optimize'... :)
222
- if self.wait_after_insert and self.wait_after_insert > 0:
223
- time.sleep(self.wait_after_insert)
541
+ collection.insert_many(prepared_docs, ordered=False)
542
+ log_info(f"Inserted {len(prepared_docs)} documents successfully.")
543
+ if self.wait_after_insert_in_seconds and self.wait_after_insert_in_seconds > 0:
544
+ time.sleep(self.wait_after_insert_in_seconds)
224
545
  except errors.BulkWriteError as e:
225
546
  logger.warning(f"Bulk write error while inserting documents: {e.details}")
226
547
  except Exception as e:
227
548
  logger.error(f"Error inserting documents: {e}")
228
549
 
229
- def upsert(self, documents: List[Document], filters: Optional[Dict[str, Any]] = None) -> None:
550
+ def upsert(self, content_hash: str, documents: List[Document], filters: Optional[Dict[str, Any]] = None) -> None:
230
551
  """Upsert documents into the MongoDB collection."""
231
- logger.info(f"Upserting {len(documents)} documents")
552
+ log_info(f"Upserting {len(documents)} documents")
553
+ collection = self._get_collection()
232
554
 
233
555
  for document in documents:
234
556
  try:
235
- doc_data = self.prepare_doc(document)
236
- self._collection.update_one(
557
+ document.embed(embedder=self.embedder)
558
+ if document.embedding is None:
559
+ raise ValueError(f"Failed to generate embedding for document: {document.id}")
560
+ doc_data = self.prepare_doc(content_hash, document, filters)
561
+ collection.update_one(
237
562
  {"_id": doc_data["_id"]},
238
563
  {"$set": doc_data},
239
564
  upsert=True,
240
565
  )
241
- logger.info(f"Upserted document: {doc_data['_id']}")
566
+ log_info(f"Upserted document: {doc_data['_id']}")
242
567
  except Exception as e:
243
568
  logger.error(f"Error upserting document '{document.name}': {e}")
244
569
 
@@ -246,55 +571,142 @@ class MongoDBVector(VectorDb):
246
571
  """Indicate that upsert functionality is available."""
247
572
  return True
248
573
 
249
- def search(self, query: str, limit: int = 5, filters: Optional[Dict[str, Any]] = None) -> List[Document]:
250
- """Search the MongoDB collection for documents relevant to the query."""
574
+ def search(
575
+ self,
576
+ query: str,
577
+ limit: int = 5,
578
+ filters: Optional[Union[Dict[str, Any], List[FilterExpr]]] = None,
579
+ min_score: float = 0.0,
580
+ ) -> List[Document]:
581
+ """Search for documents using vector similarity."""
582
+ if isinstance(filters, List):
583
+ log_warning("Filters Expressions are not supported in MongoDB. No filters will be applied.")
584
+ filters = None
585
+ if self.search_type == SearchType.hybrid:
586
+ return self.hybrid_search(query, limit=limit, filters=filters)
587
+
251
588
  query_embedding = self.embedder.get_embedding(query)
252
589
  if query_embedding is None:
253
590
  logger.error(f"Failed to generate embedding for query: {query}")
254
591
  return []
255
592
 
256
- try:
257
- pipeline = [
258
- {
259
- "$vectorSearch": {
260
- "index": "vector_index_1",
261
- "limit": 10,
262
- "numCandidates": 10,
263
- "queryVector": self.embedder.get_embedding(query),
264
- "path": "embedding",
593
+ if self.cosmos_compatibility:
594
+ # Azure Cosmos DB Mongo Vcore compatibility mode
595
+ try:
596
+ collection = self._get_collection()
597
+
598
+ # Construct the search pipeline
599
+ search_stage = {
600
+ "$search": {
601
+ "cosmosSearch": {"vector": query_embedding, "path": "embedding", "k": limit, "nProbes": 2},
602
+ "returnStoredSource": True,
265
603
  }
266
- },
267
- {"$set": {"score": {"$meta": "vectorSearchScore"}}},
268
- ]
269
- pipeline.append({"$project": {"embedding": 0}})
270
- agg = list(self._collection.aggregate(pipeline)) # type: ignore
271
- docs = []
272
- for doc in agg:
273
- docs.append(
604
+ }
605
+
606
+ pipeline = [
607
+ search_stage,
608
+ {
609
+ "$project": {
610
+ "similarityScore": {"$meta": "searchScore"},
611
+ "_id": 1,
612
+ "name": 1,
613
+ "content": 1,
614
+ "meta_data": 1,
615
+ }
616
+ },
617
+ ]
618
+
619
+ results = list(collection.aggregate(pipeline))
620
+ docs = [
274
621
  Document(
275
622
  id=str(doc["_id"]),
276
623
  name=doc.get("name"),
277
624
  content=doc["content"],
278
- meta_data=doc.get("meta_data", {}),
625
+ meta_data={**doc.get("meta_data", {}), "score": doc.get("similarityScore", 0.0)},
626
+ content_id=doc.get("content_id"),
279
627
  )
280
- )
281
- logger.info(f"Search completed. Found {len(docs)} documents.")
282
- return docs
283
- except Exception as e:
284
- logger.error(f"Error during search: {e}")
285
- return []
628
+ for doc in results
629
+ ]
630
+
631
+ log_info(f"Search completed. Found {len(docs)} documents.")
632
+ return docs
633
+
634
+ except Exception as e:
635
+ logger.error(f"Error during vector search: {e}")
636
+ return []
637
+ else:
638
+ # MongoDB Atlas Search
639
+ try:
640
+ collection = self._get_collection()
641
+ pipeline = [
642
+ {
643
+ "$vectorSearch": {
644
+ "index": self.search_index_name,
645
+ "limit": limit,
646
+ "numCandidates": min(limit * 4, 100),
647
+ "queryVector": query_embedding,
648
+ "path": "embedding",
649
+ }
650
+ },
651
+ {"$set": {"score": {"$meta": "vectorSearchScore"}}},
652
+ ]
653
+
654
+ match_filters = {}
655
+ if min_score > 0:
656
+ match_filters["score"] = {"$gte": min_score}
657
+
658
+ # Handle filters if provided
659
+ if filters:
660
+ # MongoDB uses dot notation for nested fields, so we need to prepend meta_data. if needed
661
+ mongo_filters = {}
662
+ for key, value in filters.items():
663
+ # If the key doesn't already include a dot notation for meta_data
664
+ if not key.startswith("meta_data.") and "." not in key:
665
+ mongo_filters[f"meta_data.{key}"] = value
666
+ else:
667
+ mongo_filters[key] = value
668
+
669
+ match_filters.update(mongo_filters)
670
+
671
+ if match_filters:
672
+ pipeline.append({"$match": match_filters}) # type: ignore
673
+
674
+ pipeline.append({"$project": {"embedding": 0}})
675
+
676
+ results = list(collection.aggregate(pipeline)) # type: ignore
677
+
678
+ docs = []
679
+ for doc in results:
680
+ # Convert ObjectIds to strings before creating Document
681
+ clean_doc = self._convert_objectids_to_strings(doc)
682
+ document = Document(
683
+ id=str(clean_doc["_id"]),
684
+ name=clean_doc.get("name"),
685
+ content=clean_doc["content"],
686
+ meta_data={**clean_doc.get("meta_data", {}), "score": clean_doc.get("score", 0.0)},
687
+ content_id=clean_doc.get("content_id"),
688
+ )
689
+ docs.append(document)
690
+
691
+ log_info(f"Search completed. Found {len(docs)} documents.")
692
+ return docs
693
+
694
+ except Exception as e:
695
+ logger.error(f"Error during search: {e}")
696
+ raise
286
697
 
287
698
  def vector_search(self, query: str, limit: int = 5) -> List[Document]:
288
699
  """Perform a vector-based search."""
289
- logger.debug("Performing vector search.")
700
+ log_debug("Performing vector search.")
290
701
  return self.search(query, limit=limit)
291
702
 
292
703
  def keyword_search(self, query: str, limit: int = 5) -> List[Document]:
293
704
  """Perform a keyword-based search."""
294
705
  try:
295
- cursor = self._collection.find(
706
+ collection = self._get_collection()
707
+ cursor = collection.find(
296
708
  {"content": {"$regex": query, "$options": "i"}},
297
- {"_id": 1, "name": 1, "content": 1, "meta_data": 1},
709
+ {"_id": 1, "name": 1, "content": 1, "meta_data": 1, "content_id": 1},
298
710
  ).limit(limit)
299
711
  results = [
300
712
  Document(
@@ -302,42 +714,244 @@ class MongoDBVector(VectorDb):
302
714
  name=doc.get("name"),
303
715
  content=doc["content"],
304
716
  meta_data=doc.get("meta_data", {}),
717
+ content_id=doc.get("content_id"),
305
718
  )
306
719
  for doc in cursor
307
720
  ]
308
- logger.debug(f"Keyword search completed. Found {len(results)} documents.")
721
+ log_debug(f"Keyword search completed. Found {len(results)} documents.")
309
722
  return results
310
723
  except Exception as e:
311
724
  logger.error(f"Error during keyword search: {e}")
312
725
  return []
313
726
 
314
def hybrid_search(
    self,
    query: str,
    limit: int = 5,
    filters: Optional[Dict[str, Any]] = None,
) -> List[Document]:
    """
    Perform a hybrid search combining vector and keyword-based searches using Reciprocal Rank Fusion.

    Each branch (``$vectorSearch`` and Atlas ``$search`` text search) contributes up to ``limit * 2``
    candidates. A candidate at 0-based position ``rank`` in its branch scores
    ``weight / (rank + k + 1)``, where ``weight`` is ``hybrid_vector_weight`` or
    ``hybrid_keyword_weight`` and ``k`` is ``hybrid_rank_constant`` (all configured on the instance).
    Scores of a document that appears in both branches are summed.

    The text branch queries the Atlas Search index named ``"default"`` on the ``content`` field.

    Reference: https://www.mongodb.com/docs/atlas/atlas-vector-search/tutorials/reciprocal-rank-fusion

    Args:
        query: Text to search for.
        limit: Maximum number of documents to return.
        filters: Optional equality filters. Keys without a dot are treated as metadata
            fields and prefixed with ``meta_data.``; dotted keys are used as given.

    Returns:
        Up to ``limit`` documents ordered by fused score (stored under ``meta_data["score"]``).
        An empty list is returned in Cosmos DB compatibility mode, when the query embedding
        cannot be generated, or when the aggregation fails.
    """

    if self.cosmos_compatibility:
        log_warning("Hybrid search is not implemented for Cosmos DB compatibility mode. Returning empty list.")
        return []

    log_debug(f"Performing hybrid search for query: '{query}' with limit: {limit}")

    query_embedding = self.embedder.get_embedding(query)
    if query_embedding is None:
        logger.error(f"Failed to generate embedding for query: {query}")
        return []

    collection = self._get_collection()

    k = self.hybrid_rank_constant

    # Un-dotted keys address metadata fields; dotted keys are assumed to be full paths already.
    mongo_filters: Dict[str, Any] = {}
    for key, value in (filters or {}).items():
        if "." in key:
            mongo_filters[key] = value
        else:
            mongo_filters[f"meta_data.{key}"] = value

    candidate_limit = limit * 2
    # $vectorSearch rejects numCandidates < limit, so never let the 200 cap undercut the stage limit.
    num_candidates = max(candidate_limit, min(limit * 10, 200))

    def rank_stages(score_field: str, weight: float) -> List[Dict[str, Any]]:
        """Turn a branch's ordered hits into documents carrying an RRF score in ``score_field``."""
        scores: Dict[str, Any] = {"vs_score": 0.0, "fts_score": 0.0}
        # "$rank" is the 0-based array index from $unwind, hence the extra + 1.
        scores[score_field] = {"$divide": [weight, {"$add": ["$rank", k, 1]}]}
        return [
            # Collapse the branch into one array so $unwind can expose each hit's position.
            {"$group": {"_id": None, "docs": {"$push": "$$ROOT"}}},
            {"$unwind": {"path": "$docs", "includeArrayIndex": "rank"}},
            {
                "$addFields": {
                    "_id": "$docs._id",
                    "name": "$docs.name",
                    "content": "$docs.content",
                    "meta_data": "$docs.meta_data",
                    "content_id": "$docs.content_id",
                    **scores,
                }
            },
            {
                "$project": {
                    "_id": 1,
                    "name": 1,
                    "content": 1,
                    "meta_data": 1,
                    "content_id": 1,
                    "vs_score": 1,
                    "fts_score": 1,
                }
            },
        ]

    pipeline: List[Dict[str, Any]] = [
        # Vector Search Branch
        {
            "$vectorSearch": {
                "index": self.search_index_name,
                "path": "embedding",
                "queryVector": query_embedding,
                "numCandidates": num_candidates,
                "limit": candidate_limit,
            }
        },
        *rank_stages("vs_score", self.hybrid_vector_weight),
        # Union with Keyword Search Branch
        {
            "$unionWith": {
                "coll": self.collection_name,
                "pipeline": [
                    {
                        "$search": {
                            "index": "default",
                            "text": {"query": query, "path": "content"},
                        }
                    },
                    {"$limit": candidate_limit},
                    *rank_stages("fts_score", self.hybrid_keyword_weight),
                ],
            }
        },
        # Combine and Rank
        {
            "$group": {
                "_id": "$_id",
                "name": {"$first": "$name"},
                "content": {"$first": "$content"},
                "meta_data": {"$first": "$meta_data"},
                "content_id": {"$first": "$content_id"},
                "vs_score": {"$sum": "$vs_score"},
                "fts_score": {"$sum": "$fts_score"},
            }
        },
        {
            "$project": {
                "_id": 1,
                "name": 1,
                "content": 1,
                "meta_data": 1,
                "content_id": 1,
                "score": {"$add": ["$vs_score", "$fts_score"]},
            }
        },
    ]

    # Filter BEFORE sorting/limiting so non-matching documents cannot consume result slots.
    if mongo_filters:
        pipeline.append({"$match": mongo_filters})

    pipeline.append({"$sort": {"score": -1}})
    pipeline.append({"$limit": limit})

    try:
        results = list(collection.aggregate(pipeline))

        docs = []
        for doc in results:
            # Convert ObjectIds to strings before creating Document
            clean_doc = self._convert_objectids_to_strings(doc)
            document = Document(
                id=str(clean_doc["_id"]),
                name=clean_doc.get("name"),
                content=clean_doc["content"],
                meta_data={**clean_doc.get("meta_data", {}), "score": clean_doc.get("score", 0.0)},
                content_id=clean_doc.get("content_id"),
            )
            docs.append(document)

        log_info(f"Hybrid search completed. Found {len(docs)} documents.")
        return docs
    except errors.OperationFailure as e:
        logger.error(
            f"Error during hybrid search, potentially due to missing or misconfigured Atlas Search index for text search: {e}"
        )
        logger.error(f"Details: {e.details}")
        return []
    except Exception as e:
        logger.error(f"Error during hybrid search: {e}")
        import traceback

        logger.error(f"Traceback: {traceback.format_exc()}")
        return []
318
913
 
319
914
  def drop(self) -> None:
320
- """Drop the collection from the database."""
915
+ """Drop the collection and clean up indexes."""
916
+ collection = self._get_collection()
917
+ index_name = self.search_index_name or "vector_index_1"
918
+
321
919
  if self.exists():
322
- try:
323
- logger.debug(f"Dropping collection '{self.collection_name}'.")
324
- self._collection.drop()
325
- logger.info(f"Collection '{self.collection_name}' dropped successfully.")
326
- # Add delay to allow lucene index to be deleted
327
- time.sleep(50)
328
- """
329
- pymongo.errors.OperationFailure: Duplicate Index, full error: {'ok': 0.0, 'errmsg': 'Duplicate Index', 'code': 68, 'codeName': 'IndexAlreadyExists', '$clusterTime': {'clusterTime': Timestamp(1733205025, 28), 'signature': {'hash': b'', 'keyId': 7394931654956941332}}, 'operationTime': Timestamp(1733205025, 28)}
330
- """
331
- except Exception as e:
332
- logger.error(f"Error dropping collection '{self.collection_name}': {e}")
333
- raise
334
- else:
335
- logger.info(f"Collection '{self.collection_name}' does not exist.")
920
+ if self.cosmos_compatibility:
921
+ # Cosmos DB specific handling
922
+ try:
923
+ # Drop the index if it exists
924
+ if self._search_index_exists():
925
+ log_info(f"Dropping index '{index_name}'")
926
+ try:
927
+ collection.drop_index(index_name)
928
+ except Exception as e:
929
+ logger.error(f"Error dropping index: {e}")
930
+
931
+ except Exception as e:
932
+ logger.error(f"Error dropping collection: {e}")
933
+ raise
934
+ else:
935
+ # MongoDB Atlas specific handling
936
+ try:
937
+ if self._search_index_exists():
938
+ collection.drop_search_index(index_name)
939
+ time.sleep(2)
940
+
941
+ except Exception as e:
942
+ logger.error(f"Error dropping collection: {e}")
943
+ raise
944
+
945
+ # Drop the collection
946
+ collection.drop()
947
+ time.sleep(2)
948
+
949
+ log_info(f"Collection '{self.collection_name}' dropped successfully")
336
950
 
337
951
  def exists(self) -> bool:
338
952
  """Check if the MongoDB collection exists."""
339
953
  exists = self.collection_exists()
340
- logger.debug(f"Collection '{self.collection_name}' existence: {exists}")
954
+ log_debug(f"Collection '{self.collection_name}' existence: {exists}")
341
955
  return exists
342
956
 
343
957
  def optimize(self) -> None:
@@ -345,24 +959,31 @@ class MongoDBVector(VectorDb):
345
959
  pass
346
960
 
347
961
  def delete(self) -> bool:
348
- """Delete the entire collection from the database."""
962
+ """Delete all documents from the collection."""
349
963
  if self.exists():
350
964
  try:
351
- self._collection.drop()
352
- logger.info(f"Collection '{self.collection_name}' deleted successfully.")
353
- return True
965
+ collection = self._get_collection()
966
+ result = collection.delete_many({})
967
+ # Consider any deletion (even 0) as success
968
+ success = result.deleted_count >= 0
969
+ log_info(f"Deleted {result.deleted_count} documents from collection.")
970
+ return success
354
971
  except Exception as e:
355
- logger.error(f"Error deleting collection '{self.collection_name}': {e}")
972
+ logger.error(f"Error deleting documents: {e}")
356
973
  return False
357
- else:
358
- logger.warning(f"Collection '{self.collection_name}' does not exist.")
359
- return False
974
+ # Return True if collection doesn't exist (nothing to delete)
975
+ return True
360
976
 
361
- def prepare_doc(self, document: Document) -> Dict[str, Any]:
977
+ def prepare_doc(
978
+ self, content_hash: str, document: Document, filters: Optional[Dict[str, Any]] = None
979
+ ) -> Dict[str, Any]:
362
980
  """Prepare a document for insertion or upsertion into MongoDB."""
363
- document.embed(embedder=self.embedder)
364
- if document.embedding is None:
365
- raise ValueError(f"Failed to generate embedding for document: {document.id}")
981
+
982
+ # Add filters to document metadata if provided
983
+ if filters:
984
+ meta_data = document.meta_data.copy() if document.meta_data else {}
985
+ meta_data.update(filters)
986
+ document.meta_data = meta_data
366
987
 
367
988
  cleaned_content = document.content.replace("\x00", "\ufffd")
368
989
  doc_id = md5(cleaned_content.encode("utf-8")).hexdigest()
@@ -372,16 +993,398 @@ class MongoDBVector(VectorDb):
372
993
  "content": cleaned_content,
373
994
  "meta_data": document.meta_data,
374
995
  "embedding": document.embedding,
996
+ "content_id": document.content_id,
997
+ "content_hash": content_hash,
375
998
  }
376
- logger.debug(f"Prepared document: {doc_data['_id']}")
999
+ log_debug(f"Prepared document: {doc_data['_id']}")
377
1000
  return doc_data
378
1001
 
379
1002
  def get_count(self) -> int:
380
1003
  """Get the count of documents in the MongoDB collection."""
381
1004
  try:
382
- count = self._collection.count_documents({})
383
- logger.debug(f"Collection '{self.collection_name}' has {count} documents.")
1005
+ collection = self._get_collection()
1006
+ count = collection.count_documents({})
1007
+ log_debug(f"Collection '{self.collection_name}' has {count} documents.")
384
1008
  return count
385
1009
  except Exception as e:
386
1010
  logger.error(f"Error getting document count: {e}")
387
1011
  return 0
1012
+
1013
async def async_insert(
    self, content_hash: str, documents: List[Document], filters: Optional[Dict[str, Any]] = None
) -> None:
    """Embed ``documents`` and insert them into the collection asynchronously.

    Embeddings are computed in one batch call when the embedder has
    ``enable_batch`` set and provides ``async_get_embeddings_batch_and_usage``.
    If the batch call fails for a reason other than rate limiting, or batch mode
    is not available, each document is embedded individually via
    ``Document.async_embed``. Individual embedding failures are logged and do not
    abort the insert.

    Args:
        content_hash: Hash stored with every inserted document (passed to ``prepare_doc``).
        documents: Documents to embed and insert.
        filters: Optional metadata merged into each document by ``prepare_doc``.

    Raises:
        Exception: Re-raised when the batch embedding call fails with what looks
            like a rate-limit error. Insert errors are logged, not raised.
    """
    log_debug(f"Inserting {len(documents)} documents asynchronously")
    collection = await self._get_async_collection()

    async def embed_individually() -> None:
        """Embed every document concurrently and log the ones that failed."""
        outcomes = await asyncio.gather(
            *(doc.async_embed(embedder=self.embedder) for doc in documents), return_exceptions=True
        )
        # return_exceptions=True hands failures back as values; without this loop they vanish silently.
        for doc, outcome in zip(documents, outcomes):
            if isinstance(outcome, Exception):
                logger.error(f"Error embedding document '{doc.name}': {outcome}")

    if self.embedder.enable_batch and hasattr(self.embedder, "async_get_embeddings_batch_and_usage"):
        # Use batch embedding when enabled and supported
        try:
            doc_contents = [doc.content for doc in documents]
            embeddings, usages = await self.embedder.async_get_embeddings_batch_and_usage(doc_contents)

            # Attach the pre-computed embeddings; tolerate a result list shorter than the input.
            for j, doc in enumerate(documents):
                try:
                    if j < len(embeddings):
                        doc.embedding = embeddings[j]
                        doc.usage = usages[j] if j < len(usages) else None
                except Exception as e:
                    logger.error(f"Error assigning batch embedding to document '{doc.name}': {e}")

        except Exception as e:
            # Check if this is a rate limit error - don't fall back as it would make things worse
            error_str = str(e).lower()
            is_rate_limit = any(
                phrase in error_str
                for phrase in ["rate limit", "too many requests", "429", "trial key", "api calls / minute"]
            )

            if is_rate_limit:
                logger.error(f"Rate limit detected during batch embedding. {e}")
                raise

            logger.warning(f"Async batch embedding failed, falling back to individual embeddings: {e}")
            await embed_individually()
    else:
        await embed_individually()

    prepared_docs = []
    for document in documents:
        try:
            prepared_docs.append(self.prepare_doc(content_hash, document, filters))
        except ValueError as e:
            logger.error(f"Error preparing document '{document.name}': {e}")

    if not prepared_docs:
        return

    try:
        # ordered=False lets the remaining documents go through when one of them fails.
        await collection.insert_many(prepared_docs, ordered=False)
        log_info(f"Inserted {len(prepared_docs)} documents successfully.")
        if self.wait_after_insert_in_seconds and self.wait_after_insert_in_seconds > 0:
            await asyncio.sleep(self.wait_after_insert_in_seconds)
    except errors.BulkWriteError as e:
        logger.warning(f"Bulk write error while inserting documents: {e.details}")
    except Exception as e:
        logger.error(f"Error inserting documents asynchronously: {e}")
1077
+
1078
async def async_upsert(
    self, content_hash: str, documents: List[Document], filters: Optional[Dict[str, Any]] = None
) -> None:
    """Embed ``documents`` and upsert them into the collection asynchronously.

    Embeddings are computed in one batch call when the embedder has
    ``enable_batch`` set and provides ``async_get_embeddings_batch_and_usage``.
    If the batch call fails for a reason other than rate limiting, or batch mode
    is not available, each document is embedded individually via
    ``Document.async_embed``. Individual embedding failures are logged and do not
    abort the upsert.

    Each document is then written with ``update_one(..., upsert=True)`` keyed on
    the ``_id`` produced by ``prepare_doc``; a failure for one document is logged
    and the remaining documents are still processed.

    Args:
        content_hash: Hash stored with every upserted document (passed to ``prepare_doc``).
        documents: Documents to embed and upsert.
        filters: Optional metadata merged into each document by ``prepare_doc``.

    Raises:
        Exception: Re-raised when the batch embedding call fails with what looks
            like a rate-limit error.
    """
    log_info(f"Upserting {len(documents)} documents asynchronously")
    collection = await self._get_async_collection()

    async def embed_individually() -> None:
        """Embed every document concurrently and log the ones that failed."""
        outcomes = await asyncio.gather(
            *(doc.async_embed(embedder=self.embedder) for doc in documents), return_exceptions=True
        )
        # return_exceptions=True hands failures back as values; without this loop they vanish silently.
        for doc, outcome in zip(documents, outcomes):
            if isinstance(outcome, Exception):
                logger.error(f"Error embedding document '{doc.name}': {outcome}")

    if self.embedder.enable_batch and hasattr(self.embedder, "async_get_embeddings_batch_and_usage"):
        # Use batch embedding when enabled and supported
        try:
            doc_contents = [doc.content for doc in documents]
            embeddings, usages = await self.embedder.async_get_embeddings_batch_and_usage(doc_contents)

            # Attach the pre-computed embeddings; tolerate a result list shorter than the input.
            for j, doc in enumerate(documents):
                try:
                    if j < len(embeddings):
                        doc.embedding = embeddings[j]
                        doc.usage = usages[j] if j < len(usages) else None
                except Exception as e:
                    logger.error(f"Error assigning batch embedding to document '{doc.name}': {e}")

        except Exception as e:
            # Check if this is a rate limit error - don't fall back as it would make things worse
            error_str = str(e).lower()
            is_rate_limit = any(
                phrase in error_str
                for phrase in ["rate limit", "too many requests", "429", "trial key", "api calls / minute"]
            )

            if is_rate_limit:
                logger.error(f"Rate limit detected during batch embedding. {e}")
                raise

            logger.warning(f"Async batch embedding failed, falling back to individual embeddings: {e}")
            await embed_individually()
    else:
        await embed_individually()

    for document in documents:
        try:
            doc_data = self.prepare_doc(content_hash, document, filters)
            await collection.update_one(
                {"_id": doc_data["_id"]},
                {"$set": doc_data},
                upsert=True,
            )
            log_info(f"Upserted document: {doc_data['_id']}")
        except Exception as e:
            logger.error(f"Error upserting document '{document.name}' asynchronously: {e}")
1135
+
1136
async def async_search(
    self, query: str, limit: int = 5, filters: Optional[Union[Dict[str, Any], List[FilterExpr]]] = None
) -> List[Document]:
    """Search for documents asynchronously using ``$vectorSearch``.

    Args:
        query: Text to embed and search for.
        limit: Maximum number of documents to return.
        filters: Optional equality filters as a dict. Keys without a dot are
            treated as metadata fields and prefixed with ``meta_data.``; dotted
            keys are used as given. A list of filter expressions is not
            supported: a warning is logged and no filter is applied.

    Returns:
        Matching documents with the vector search score stored under
        ``meta_data["score"]``. An empty list is returned when the query
        embedding cannot be generated.

    Raises:
        Exception: Any error raised while running the aggregation is logged
            (with traceback) and re-raised.
    """
    if isinstance(filters, list):
        log_warning("Filters Expressions are not supported in MongoDB. No filters will be applied.")
        filters = None
    query_embedding = self.embedder.get_embedding(query)
    if query_embedding is None:
        logger.error(f"Failed to generate embedding for query: {query}")
        return []

    try:
        collection = await self._get_async_collection()
        pipeline = [
            {
                "$vectorSearch": {
                    "index": self.search_index_name,
                    "limit": limit,
                    # $vectorSearch rejects numCandidates < limit, so the 100 cap must not undercut limit.
                    "numCandidates": max(limit, min(limit * 4, 100)),
                    "queryVector": query_embedding,
                    "path": "embedding",
                }
            },
            {"$set": {"score": {"$meta": "vectorSearchScore"}}},
        ]

        # Handle filters if provided
        if filters:
            # MongoDB uses dot notation for nested fields, so we need to prepend meta_data. if needed
            mongo_filters = {}
            for key, value in filters.items():
                if "." in key:
                    mongo_filters[key] = value
                else:
                    mongo_filters[f"meta_data.{key}"] = value

            pipeline.append({"$match": mongo_filters})

        # The embedding vector is not needed by callers; keep it out of the result payload.
        pipeline.append({"$project": {"embedding": 0}})

        # With AsyncMongoClient, aggregate() returns a coroutine that resolves to a cursor
        cursor = await collection.aggregate(pipeline)

        docs = []
        async for doc in cursor:
            # Convert ObjectIds to strings, matching what the synchronous search paths return.
            clean_doc = self._convert_objectids_to_strings(doc)
            docs.append(
                Document(
                    id=str(clean_doc["_id"]),
                    name=clean_doc.get("name"),
                    content=clean_doc["content"],
                    meta_data={**clean_doc.get("meta_data", {}), "score": clean_doc.get("score", 0.0)},
                    content_id=clean_doc.get("content_id"),
                )
            )
            if len(docs) >= limit:
                break

        log_info(f"Async search completed. Found {len(docs)} documents.")
        return docs

    except Exception as e:
        logger.error(f"Error during async search: {e}")
        # Include traceback for better debugging
        import traceback

        logger.error(f"Traceback: {traceback.format_exc()}")
        raise
1210
+
1211
async def async_drop(self) -> None:
    """Drop the collection asynchronously.

    Does nothing when ``async_exists()`` reports that the collection is absent.

    Raises:
        Exception: Any error raised while obtaining or dropping the collection
            is logged and re-raised.
    """
    if not await self.async_exists():
        return

    try:
        target = await self._get_async_collection()
        await target.drop()
        log_info(f"Collection '{self.collection_name}' dropped asynchronously")
    except Exception as e:
        logger.error(f"Error dropping collection asynchronously: {e}")
        raise
1221
+
1222
async def async_exists(self) -> bool:
    """Check asynchronously whether the collection exists.

    Returns:
        ``True`` if ``self.collection_name`` is among the collection names of
        database ``self.database``; ``False`` if it is not, or if the lookup
        raised an exception (the error is logged).
    """
    try:
        async_client = await self._get_async_client()
        database = async_client[self.database]
        known_collections = await database.list_collection_names()
        found = self.collection_name in known_collections
        log_debug(f"Collection '{self.collection_name}' existence (async): {found}")
        return found
    except Exception as e:
        logger.error(f"Error checking collection existence asynchronously: {e}")
        return False
1233
+
1234
async def async_name_exists(self, name: str) -> bool:
    """Check asynchronously whether a document with the given name exists.

    Args:
        name: Value to match against the ``name`` field.

    Returns:
        ``True`` if ``find_one({"name": name})`` returns a document; ``False``
        if it returns nothing or the lookup raised an exception (the error is
        logged).
    """
    try:
        async_collection = await self._get_async_collection()
        match = await async_collection.find_one({"name": name})
        found = match is not None
        log_debug(f"Document with name '{name}' {'exists' if found else 'does not exist'} (async)")
        return found
    except Exception as e:
        logger.error(f"Error checking document name existence asynchronously: {e}")
        return False
1244
+
1245
+ def _get_cosmos_similarity_metric(self) -> str:
1246
+ """Convert MongoDB distance metric to Cosmos DB format."""
1247
+ # Cosmos DB supports: COS (cosine), L2 (Euclidean), IP (inner product)
1248
+ metric_mapping = {"cosine": "COS", "euclidean": "L2", "dotProduct": "IP"}
1249
+ return metric_mapping.get(self.distance_metric, "COS")
1250
+
1251
def _convert_objectids_to_strings(self, obj: Any) -> Any:
    """
    Recursively replace MongoDB ObjectIds with their string form.

    Dicts, lists and tuples are walked and rebuilt as new plain ``dict``,
    ``list`` and ``tuple`` objects; dict keys are left untouched. Every other
    value is returned unchanged.

    Args:
        obj: Any object that might contain ObjectIds

    Returns:
        The same structure with every ObjectId converted to ``str``
    """
    convert = self._convert_objectids_to_strings

    if isinstance(obj, ObjectId):
        return str(obj)
    if isinstance(obj, dict):
        return {key: convert(value) for key, value in obj.items()}
    if isinstance(obj, list):
        return [convert(item) for item in obj]
    if isinstance(obj, tuple):
        return tuple(convert(item) for item in obj)
    return obj
1271
+
1272
def delete_by_id(self, id: str) -> bool:
    """Delete the document whose ``_id`` equals ``id``.

    Args:
        id: Identifier to match against the ``_id`` field.

    Returns:
        ``True`` when the delete ran, whether or not a document matched;
        ``False`` when it raised an exception (the error is logged).
    """
    try:
        outcome = self._get_collection().delete_one({"_id": id})

        if outcome.deleted_count > 0:
            message = (
                f"Deleted {outcome.deleted_count} document(s) with ID '{id}' from collection '{self.collection_name}'."
            )
        else:
            message = f"No documents found with ID '{id}' to delete."
        log_info(message)
        # A missing document is not treated as a failure.
        return True
    except Exception as e:
        logger.error(f"Error deleting document with ID '{id}': {e}")
        return False
1289
+
1290
def delete_by_name(self, name: str) -> bool:
    """Delete every document whose ``name`` field equals ``name``.

    Args:
        name: Value to match against the ``name`` field.

    Returns:
        ``True`` when the delete ran (even if nothing matched); ``False`` when
        it raised an exception (the error is logged).
    """
    try:
        outcome = self._get_collection().delete_many({"name": name})
        removed = outcome.deleted_count
        log_info(f"Deleted {removed} document(s) with name '{name}' from collection '{self.collection_name}'.")
        return True
    except Exception as e:
        logger.error(f"Error deleting documents with name '{name}': {e}")
        return False
1303
+
1304
def delete_by_metadata(self, metadata: Dict[str, Any]) -> bool:
    """Delete every document whose metadata matches all given key/value pairs.

    Each key is matched against ``meta_data.<key>``.

    Args:
        metadata: Metadata key/value pairs that a document must match to be
            deleted. Must not be empty.

    Returns:
        ``True`` when the delete ran (even if nothing matched); ``False`` when
        ``metadata`` is empty or the delete raised an exception.
    """
    # An empty filter would make delete_many({}) wipe the whole collection; refuse instead.
    if not metadata:
        logger.error("Refusing to delete by metadata: no metadata criteria were provided.")
        return False

    try:
        collection = self._get_collection()

        # Use dot notation for nested metadata fields
        mongo_filters = {f"meta_data.{key}": value for key, value in metadata.items()}

        result = collection.delete_many(mongo_filters)

        log_info(
            f"Deleted {result.deleted_count} document(s) with metadata '{metadata}' from collection '{self.collection_name}'."
        )
        return True
    except Exception as e:
        logger.error(f"Error deleting documents with metadata '{metadata}': {e}")
        return False
1324
+
1325
def _delete_by_content_hash(self, content_hash: str) -> bool:
    """Delete every document whose ``content_hash`` field equals ``content_hash``.

    Args:
        content_hash (str): The content hash to delete.

    Returns:
        bool: True when the delete ran (even if nothing matched), False when
        it raised an exception (the error is logged).
    """
    try:
        outcome = self._get_collection().delete_many({"content_hash": content_hash})
        removed = outcome.deleted_count
        log_info(f"Deleted {removed} documents with content_hash '{content_hash}'")
        return True
    except Exception as e:
        logger.error(f"Error deleting documents by content_hash '{content_hash}': {e}")
        return False
1342
+
1343
def delete_by_content_id(self, content_id: str) -> bool:
    """Delete every document whose ``content_id`` field equals ``content_id``.

    Args:
        content_id: Value to match against the ``content_id`` field.

    Returns:
        ``True`` when the delete ran (even if nothing matched); ``False`` when
        it raised an exception (the error is logged).
    """
    try:
        outcome = self._get_collection().delete_many({"content_id": content_id})
        removed = outcome.deleted_count
        log_info(
            f"Deleted {removed} document(s) with content_id '{content_id}' from collection '{self.collection_name}'."
        )
        return True
    except Exception as e:
        logger.error(f"Error deleting documents with content_id '{content_id}': {e}")
        return False
1356
+
1357
def update_metadata(self, content_id: str, metadata: Dict[str, Any]) -> None:
    """
    Update the metadata for documents with the given content_id.

    Every key in ``metadata`` is written to both ``meta_data.<key>`` and
    ``filters.<key>`` on each matching document; other fields are untouched.
    Nothing is sent to the database when ``metadata`` is empty.

    Args:
        content_id (str): The content ID to update
        metadata (Dict[str, Any]): The metadata to update

    Raises:
        Exception: Any error raised while obtaining the collection or running
            the update is logged and re-raised.
    """
    # An empty "$set" document carries no change (and is rejected by some servers); skip the round trip.
    if not metadata:
        logger.debug(f"No metadata provided for content_id: {content_id}")
        return

    try:
        # Go through the shared accessor, like the other methods, instead of indexing self._client directly.
        collection = self._get_collection()

        # Create query filter for content_id
        filter_query = {"content_id": content_id}

        update_operations: Dict[str, Any] = {}
        for key, value in metadata.items():
            update_operations[f"meta_data.{key}"] = value
            update_operations[f"filters.{key}"] = value

        # Update documents
        result = collection.update_many(filter_query, {"$set": update_operations})

        if result.matched_count == 0:
            logger.debug(f"No documents found with content_id: {content_id}")
        else:
            logger.debug(f"Updated metadata for {result.matched_count} documents with content_id: {content_id}")

    except Exception as e:
        logger.error(f"Error updating metadata for content_id '{content_id}': {e}")
        raise
1387
+
1388
def get_supported_search_types(self) -> List[str]:
    """Return the search types this vector database supports.

    Returns:
        A new list containing ``SearchType.vector`` and ``SearchType.hybrid``.
    """
    supported = [SearchType.vector, SearchType.hybrid]
    return supported