agno-2.2.13-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- agno/__init__.py +8 -0
- agno/agent/__init__.py +51 -0
- agno/agent/agent.py +10405 -0
- agno/api/__init__.py +0 -0
- agno/api/agent.py +28 -0
- agno/api/api.py +40 -0
- agno/api/evals.py +22 -0
- agno/api/os.py +17 -0
- agno/api/routes.py +13 -0
- agno/api/schemas/__init__.py +9 -0
- agno/api/schemas/agent.py +16 -0
- agno/api/schemas/evals.py +16 -0
- agno/api/schemas/os.py +14 -0
- agno/api/schemas/response.py +6 -0
- agno/api/schemas/team.py +16 -0
- agno/api/schemas/utils.py +21 -0
- agno/api/schemas/workflows.py +16 -0
- agno/api/settings.py +53 -0
- agno/api/team.py +30 -0
- agno/api/workflow.py +28 -0
- agno/cloud/aws/base.py +214 -0
- agno/cloud/aws/s3/__init__.py +2 -0
- agno/cloud/aws/s3/api_client.py +43 -0
- agno/cloud/aws/s3/bucket.py +195 -0
- agno/cloud/aws/s3/object.py +57 -0
- agno/culture/__init__.py +3 -0
- agno/culture/manager.py +956 -0
- agno/db/__init__.py +24 -0
- agno/db/async_postgres/__init__.py +3 -0
- agno/db/base.py +598 -0
- agno/db/dynamo/__init__.py +3 -0
- agno/db/dynamo/dynamo.py +2042 -0
- agno/db/dynamo/schemas.py +314 -0
- agno/db/dynamo/utils.py +743 -0
- agno/db/firestore/__init__.py +3 -0
- agno/db/firestore/firestore.py +1795 -0
- agno/db/firestore/schemas.py +140 -0
- agno/db/firestore/utils.py +376 -0
- agno/db/gcs_json/__init__.py +3 -0
- agno/db/gcs_json/gcs_json_db.py +1335 -0
- agno/db/gcs_json/utils.py +228 -0
- agno/db/in_memory/__init__.py +3 -0
- agno/db/in_memory/in_memory_db.py +1160 -0
- agno/db/in_memory/utils.py +230 -0
- agno/db/json/__init__.py +3 -0
- agno/db/json/json_db.py +1328 -0
- agno/db/json/utils.py +230 -0
- agno/db/migrations/__init__.py +0 -0
- agno/db/migrations/v1_to_v2.py +635 -0
- agno/db/mongo/__init__.py +17 -0
- agno/db/mongo/async_mongo.py +2026 -0
- agno/db/mongo/mongo.py +1982 -0
- agno/db/mongo/schemas.py +87 -0
- agno/db/mongo/utils.py +259 -0
- agno/db/mysql/__init__.py +3 -0
- agno/db/mysql/mysql.py +2308 -0
- agno/db/mysql/schemas.py +138 -0
- agno/db/mysql/utils.py +355 -0
- agno/db/postgres/__init__.py +4 -0
- agno/db/postgres/async_postgres.py +1927 -0
- agno/db/postgres/postgres.py +2260 -0
- agno/db/postgres/schemas.py +139 -0
- agno/db/postgres/utils.py +442 -0
- agno/db/redis/__init__.py +3 -0
- agno/db/redis/redis.py +1660 -0
- agno/db/redis/schemas.py +123 -0
- agno/db/redis/utils.py +346 -0
- agno/db/schemas/__init__.py +4 -0
- agno/db/schemas/culture.py +120 -0
- agno/db/schemas/evals.py +33 -0
- agno/db/schemas/knowledge.py +40 -0
- agno/db/schemas/memory.py +46 -0
- agno/db/schemas/metrics.py +0 -0
- agno/db/singlestore/__init__.py +3 -0
- agno/db/singlestore/schemas.py +130 -0
- agno/db/singlestore/singlestore.py +2272 -0
- agno/db/singlestore/utils.py +384 -0
- agno/db/sqlite/__init__.py +4 -0
- agno/db/sqlite/async_sqlite.py +2293 -0
- agno/db/sqlite/schemas.py +133 -0
- agno/db/sqlite/sqlite.py +2288 -0
- agno/db/sqlite/utils.py +431 -0
- agno/db/surrealdb/__init__.py +3 -0
- agno/db/surrealdb/metrics.py +292 -0
- agno/db/surrealdb/models.py +309 -0
- agno/db/surrealdb/queries.py +71 -0
- agno/db/surrealdb/surrealdb.py +1353 -0
- agno/db/surrealdb/utils.py +147 -0
- agno/db/utils.py +116 -0
- agno/debug.py +18 -0
- agno/eval/__init__.py +14 -0
- agno/eval/accuracy.py +834 -0
- agno/eval/performance.py +773 -0
- agno/eval/reliability.py +306 -0
- agno/eval/utils.py +119 -0
- agno/exceptions.py +161 -0
- agno/filters.py +354 -0
- agno/guardrails/__init__.py +6 -0
- agno/guardrails/base.py +19 -0
- agno/guardrails/openai.py +144 -0
- agno/guardrails/pii.py +94 -0
- agno/guardrails/prompt_injection.py +52 -0
- agno/integrations/__init__.py +0 -0
- agno/integrations/discord/__init__.py +3 -0
- agno/integrations/discord/client.py +203 -0
- agno/knowledge/__init__.py +5 -0
- agno/knowledge/chunking/__init__.py +0 -0
- agno/knowledge/chunking/agentic.py +79 -0
- agno/knowledge/chunking/document.py +91 -0
- agno/knowledge/chunking/fixed.py +57 -0
- agno/knowledge/chunking/markdown.py +151 -0
- agno/knowledge/chunking/recursive.py +63 -0
- agno/knowledge/chunking/row.py +39 -0
- agno/knowledge/chunking/semantic.py +86 -0
- agno/knowledge/chunking/strategy.py +165 -0
- agno/knowledge/content.py +74 -0
- agno/knowledge/document/__init__.py +5 -0
- agno/knowledge/document/base.py +58 -0
- agno/knowledge/embedder/__init__.py +5 -0
- agno/knowledge/embedder/aws_bedrock.py +343 -0
- agno/knowledge/embedder/azure_openai.py +210 -0
- agno/knowledge/embedder/base.py +23 -0
- agno/knowledge/embedder/cohere.py +323 -0
- agno/knowledge/embedder/fastembed.py +62 -0
- agno/knowledge/embedder/fireworks.py +13 -0
- agno/knowledge/embedder/google.py +258 -0
- agno/knowledge/embedder/huggingface.py +94 -0
- agno/knowledge/embedder/jina.py +182 -0
- agno/knowledge/embedder/langdb.py +22 -0
- agno/knowledge/embedder/mistral.py +206 -0
- agno/knowledge/embedder/nebius.py +13 -0
- agno/knowledge/embedder/ollama.py +154 -0
- agno/knowledge/embedder/openai.py +195 -0
- agno/knowledge/embedder/sentence_transformer.py +63 -0
- agno/knowledge/embedder/together.py +13 -0
- agno/knowledge/embedder/vllm.py +262 -0
- agno/knowledge/embedder/voyageai.py +165 -0
- agno/knowledge/knowledge.py +1988 -0
- agno/knowledge/reader/__init__.py +7 -0
- agno/knowledge/reader/arxiv_reader.py +81 -0
- agno/knowledge/reader/base.py +95 -0
- agno/knowledge/reader/csv_reader.py +166 -0
- agno/knowledge/reader/docx_reader.py +82 -0
- agno/knowledge/reader/field_labeled_csv_reader.py +292 -0
- agno/knowledge/reader/firecrawl_reader.py +201 -0
- agno/knowledge/reader/json_reader.py +87 -0
- agno/knowledge/reader/markdown_reader.py +137 -0
- agno/knowledge/reader/pdf_reader.py +431 -0
- agno/knowledge/reader/pptx_reader.py +101 -0
- agno/knowledge/reader/reader_factory.py +313 -0
- agno/knowledge/reader/s3_reader.py +89 -0
- agno/knowledge/reader/tavily_reader.py +194 -0
- agno/knowledge/reader/text_reader.py +115 -0
- agno/knowledge/reader/web_search_reader.py +372 -0
- agno/knowledge/reader/website_reader.py +455 -0
- agno/knowledge/reader/wikipedia_reader.py +59 -0
- agno/knowledge/reader/youtube_reader.py +78 -0
- agno/knowledge/remote_content/__init__.py +0 -0
- agno/knowledge/remote_content/remote_content.py +88 -0
- agno/knowledge/reranker/__init__.py +3 -0
- agno/knowledge/reranker/base.py +14 -0
- agno/knowledge/reranker/cohere.py +64 -0
- agno/knowledge/reranker/infinity.py +195 -0
- agno/knowledge/reranker/sentence_transformer.py +54 -0
- agno/knowledge/types.py +39 -0
- agno/knowledge/utils.py +189 -0
- agno/media.py +462 -0
- agno/memory/__init__.py +3 -0
- agno/memory/manager.py +1327 -0
- agno/models/__init__.py +0 -0
- agno/models/aimlapi/__init__.py +5 -0
- agno/models/aimlapi/aimlapi.py +45 -0
- agno/models/anthropic/__init__.py +5 -0
- agno/models/anthropic/claude.py +757 -0
- agno/models/aws/__init__.py +15 -0
- agno/models/aws/bedrock.py +701 -0
- agno/models/aws/claude.py +378 -0
- agno/models/azure/__init__.py +18 -0
- agno/models/azure/ai_foundry.py +485 -0
- agno/models/azure/openai_chat.py +131 -0
- agno/models/base.py +2175 -0
- agno/models/cerebras/__init__.py +12 -0
- agno/models/cerebras/cerebras.py +501 -0
- agno/models/cerebras/cerebras_openai.py +112 -0
- agno/models/cohere/__init__.py +5 -0
- agno/models/cohere/chat.py +389 -0
- agno/models/cometapi/__init__.py +5 -0
- agno/models/cometapi/cometapi.py +57 -0
- agno/models/dashscope/__init__.py +5 -0
- agno/models/dashscope/dashscope.py +91 -0
- agno/models/deepinfra/__init__.py +5 -0
- agno/models/deepinfra/deepinfra.py +28 -0
- agno/models/deepseek/__init__.py +5 -0
- agno/models/deepseek/deepseek.py +61 -0
- agno/models/defaults.py +1 -0
- agno/models/fireworks/__init__.py +5 -0
- agno/models/fireworks/fireworks.py +26 -0
- agno/models/google/__init__.py +5 -0
- agno/models/google/gemini.py +1085 -0
- agno/models/groq/__init__.py +5 -0
- agno/models/groq/groq.py +556 -0
- agno/models/huggingface/__init__.py +5 -0
- agno/models/huggingface/huggingface.py +491 -0
- agno/models/ibm/__init__.py +5 -0
- agno/models/ibm/watsonx.py +422 -0
- agno/models/internlm/__init__.py +3 -0
- agno/models/internlm/internlm.py +26 -0
- agno/models/langdb/__init__.py +1 -0
- agno/models/langdb/langdb.py +48 -0
- agno/models/litellm/__init__.py +14 -0
- agno/models/litellm/chat.py +468 -0
- agno/models/litellm/litellm_openai.py +25 -0
- agno/models/llama_cpp/__init__.py +5 -0
- agno/models/llama_cpp/llama_cpp.py +22 -0
- agno/models/lmstudio/__init__.py +5 -0
- agno/models/lmstudio/lmstudio.py +25 -0
- agno/models/message.py +434 -0
- agno/models/meta/__init__.py +12 -0
- agno/models/meta/llama.py +475 -0
- agno/models/meta/llama_openai.py +78 -0
- agno/models/metrics.py +120 -0
- agno/models/mistral/__init__.py +5 -0
- agno/models/mistral/mistral.py +432 -0
- agno/models/nebius/__init__.py +3 -0
- agno/models/nebius/nebius.py +54 -0
- agno/models/nexus/__init__.py +3 -0
- agno/models/nexus/nexus.py +22 -0
- agno/models/nvidia/__init__.py +5 -0
- agno/models/nvidia/nvidia.py +28 -0
- agno/models/ollama/__init__.py +5 -0
- agno/models/ollama/chat.py +441 -0
- agno/models/openai/__init__.py +9 -0
- agno/models/openai/chat.py +883 -0
- agno/models/openai/like.py +27 -0
- agno/models/openai/responses.py +1050 -0
- agno/models/openrouter/__init__.py +5 -0
- agno/models/openrouter/openrouter.py +66 -0
- agno/models/perplexity/__init__.py +5 -0
- agno/models/perplexity/perplexity.py +187 -0
- agno/models/portkey/__init__.py +3 -0
- agno/models/portkey/portkey.py +81 -0
- agno/models/requesty/__init__.py +5 -0
- agno/models/requesty/requesty.py +52 -0
- agno/models/response.py +199 -0
- agno/models/sambanova/__init__.py +5 -0
- agno/models/sambanova/sambanova.py +28 -0
- agno/models/siliconflow/__init__.py +5 -0
- agno/models/siliconflow/siliconflow.py +25 -0
- agno/models/together/__init__.py +5 -0
- agno/models/together/together.py +25 -0
- agno/models/utils.py +266 -0
- agno/models/vercel/__init__.py +3 -0
- agno/models/vercel/v0.py +26 -0
- agno/models/vertexai/__init__.py +0 -0
- agno/models/vertexai/claude.py +70 -0
- agno/models/vllm/__init__.py +3 -0
- agno/models/vllm/vllm.py +78 -0
- agno/models/xai/__init__.py +3 -0
- agno/models/xai/xai.py +113 -0
- agno/os/__init__.py +3 -0
- agno/os/app.py +876 -0
- agno/os/auth.py +57 -0
- agno/os/config.py +104 -0
- agno/os/interfaces/__init__.py +1 -0
- agno/os/interfaces/a2a/__init__.py +3 -0
- agno/os/interfaces/a2a/a2a.py +42 -0
- agno/os/interfaces/a2a/router.py +250 -0
- agno/os/interfaces/a2a/utils.py +924 -0
- agno/os/interfaces/agui/__init__.py +3 -0
- agno/os/interfaces/agui/agui.py +47 -0
- agno/os/interfaces/agui/router.py +144 -0
- agno/os/interfaces/agui/utils.py +534 -0
- agno/os/interfaces/base.py +25 -0
- agno/os/interfaces/slack/__init__.py +3 -0
- agno/os/interfaces/slack/router.py +148 -0
- agno/os/interfaces/slack/security.py +30 -0
- agno/os/interfaces/slack/slack.py +47 -0
- agno/os/interfaces/whatsapp/__init__.py +3 -0
- agno/os/interfaces/whatsapp/router.py +211 -0
- agno/os/interfaces/whatsapp/security.py +53 -0
- agno/os/interfaces/whatsapp/whatsapp.py +36 -0
- agno/os/mcp.py +292 -0
- agno/os/middleware/__init__.py +7 -0
- agno/os/middleware/jwt.py +233 -0
- agno/os/router.py +1763 -0
- agno/os/routers/__init__.py +3 -0
- agno/os/routers/evals/__init__.py +3 -0
- agno/os/routers/evals/evals.py +430 -0
- agno/os/routers/evals/schemas.py +142 -0
- agno/os/routers/evals/utils.py +162 -0
- agno/os/routers/health.py +31 -0
- agno/os/routers/home.py +52 -0
- agno/os/routers/knowledge/__init__.py +3 -0
- agno/os/routers/knowledge/knowledge.py +997 -0
- agno/os/routers/knowledge/schemas.py +178 -0
- agno/os/routers/memory/__init__.py +3 -0
- agno/os/routers/memory/memory.py +515 -0
- agno/os/routers/memory/schemas.py +62 -0
- agno/os/routers/metrics/__init__.py +3 -0
- agno/os/routers/metrics/metrics.py +190 -0
- agno/os/routers/metrics/schemas.py +47 -0
- agno/os/routers/session/__init__.py +3 -0
- agno/os/routers/session/session.py +997 -0
- agno/os/schema.py +1055 -0
- agno/os/settings.py +43 -0
- agno/os/utils.py +630 -0
- agno/py.typed +0 -0
- agno/reasoning/__init__.py +0 -0
- agno/reasoning/anthropic.py +80 -0
- agno/reasoning/azure_ai_foundry.py +67 -0
- agno/reasoning/deepseek.py +63 -0
- agno/reasoning/default.py +97 -0
- agno/reasoning/gemini.py +73 -0
- agno/reasoning/groq.py +71 -0
- agno/reasoning/helpers.py +63 -0
- agno/reasoning/ollama.py +67 -0
- agno/reasoning/openai.py +86 -0
- agno/reasoning/step.py +31 -0
- agno/reasoning/vertexai.py +76 -0
- agno/run/__init__.py +6 -0
- agno/run/agent.py +787 -0
- agno/run/base.py +229 -0
- agno/run/cancel.py +81 -0
- agno/run/messages.py +32 -0
- agno/run/team.py +753 -0
- agno/run/workflow.py +708 -0
- agno/session/__init__.py +10 -0
- agno/session/agent.py +295 -0
- agno/session/summary.py +265 -0
- agno/session/team.py +392 -0
- agno/session/workflow.py +205 -0
- agno/team/__init__.py +37 -0
- agno/team/team.py +8793 -0
- agno/tools/__init__.py +10 -0
- agno/tools/agentql.py +120 -0
- agno/tools/airflow.py +69 -0
- agno/tools/api.py +122 -0
- agno/tools/apify.py +314 -0
- agno/tools/arxiv.py +127 -0
- agno/tools/aws_lambda.py +53 -0
- agno/tools/aws_ses.py +66 -0
- agno/tools/baidusearch.py +89 -0
- agno/tools/bitbucket.py +292 -0
- agno/tools/brandfetch.py +213 -0
- agno/tools/bravesearch.py +106 -0
- agno/tools/brightdata.py +367 -0
- agno/tools/browserbase.py +209 -0
- agno/tools/calcom.py +255 -0
- agno/tools/calculator.py +151 -0
- agno/tools/cartesia.py +187 -0
- agno/tools/clickup.py +244 -0
- agno/tools/confluence.py +240 -0
- agno/tools/crawl4ai.py +158 -0
- agno/tools/csv_toolkit.py +185 -0
- agno/tools/dalle.py +110 -0
- agno/tools/daytona.py +475 -0
- agno/tools/decorator.py +262 -0
- agno/tools/desi_vocal.py +108 -0
- agno/tools/discord.py +161 -0
- agno/tools/docker.py +716 -0
- agno/tools/duckdb.py +379 -0
- agno/tools/duckduckgo.py +91 -0
- agno/tools/e2b.py +703 -0
- agno/tools/eleven_labs.py +196 -0
- agno/tools/email.py +67 -0
- agno/tools/evm.py +129 -0
- agno/tools/exa.py +396 -0
- agno/tools/fal.py +127 -0
- agno/tools/file.py +240 -0
- agno/tools/file_generation.py +350 -0
- agno/tools/financial_datasets.py +288 -0
- agno/tools/firecrawl.py +143 -0
- agno/tools/function.py +1187 -0
- agno/tools/giphy.py +93 -0
- agno/tools/github.py +1760 -0
- agno/tools/gmail.py +922 -0
- agno/tools/google_bigquery.py +117 -0
- agno/tools/google_drive.py +270 -0
- agno/tools/google_maps.py +253 -0
- agno/tools/googlecalendar.py +674 -0
- agno/tools/googlesearch.py +98 -0
- agno/tools/googlesheets.py +377 -0
- agno/tools/hackernews.py +77 -0
- agno/tools/jina.py +101 -0
- agno/tools/jira.py +170 -0
- agno/tools/knowledge.py +218 -0
- agno/tools/linear.py +426 -0
- agno/tools/linkup.py +58 -0
- agno/tools/local_file_system.py +90 -0
- agno/tools/lumalab.py +183 -0
- agno/tools/mcp/__init__.py +10 -0
- agno/tools/mcp/mcp.py +331 -0
- agno/tools/mcp/multi_mcp.py +347 -0
- agno/tools/mcp/params.py +24 -0
- agno/tools/mcp_toolbox.py +284 -0
- agno/tools/mem0.py +193 -0
- agno/tools/memori.py +339 -0
- agno/tools/memory.py +419 -0
- agno/tools/mlx_transcribe.py +139 -0
- agno/tools/models/__init__.py +0 -0
- agno/tools/models/azure_openai.py +190 -0
- agno/tools/models/gemini.py +203 -0
- agno/tools/models/groq.py +158 -0
- agno/tools/models/morph.py +186 -0
- agno/tools/models/nebius.py +124 -0
- agno/tools/models_labs.py +195 -0
- agno/tools/moviepy_video.py +349 -0
- agno/tools/neo4j.py +134 -0
- agno/tools/newspaper.py +46 -0
- agno/tools/newspaper4k.py +93 -0
- agno/tools/notion.py +204 -0
- agno/tools/openai.py +202 -0
- agno/tools/openbb.py +160 -0
- agno/tools/opencv.py +321 -0
- agno/tools/openweather.py +233 -0
- agno/tools/oxylabs.py +385 -0
- agno/tools/pandas.py +102 -0
- agno/tools/parallel.py +314 -0
- agno/tools/postgres.py +257 -0
- agno/tools/pubmed.py +188 -0
- agno/tools/python.py +205 -0
- agno/tools/reasoning.py +283 -0
- agno/tools/reddit.py +467 -0
- agno/tools/replicate.py +117 -0
- agno/tools/resend.py +62 -0
- agno/tools/scrapegraph.py +222 -0
- agno/tools/searxng.py +152 -0
- agno/tools/serpapi.py +116 -0
- agno/tools/serper.py +255 -0
- agno/tools/shell.py +53 -0
- agno/tools/slack.py +136 -0
- agno/tools/sleep.py +20 -0
- agno/tools/spider.py +116 -0
- agno/tools/sql.py +154 -0
- agno/tools/streamlit/__init__.py +0 -0
- agno/tools/streamlit/components.py +113 -0
- agno/tools/tavily.py +254 -0
- agno/tools/telegram.py +48 -0
- agno/tools/todoist.py +218 -0
- agno/tools/tool_registry.py +1 -0
- agno/tools/toolkit.py +146 -0
- agno/tools/trafilatura.py +388 -0
- agno/tools/trello.py +274 -0
- agno/tools/twilio.py +186 -0
- agno/tools/user_control_flow.py +78 -0
- agno/tools/valyu.py +228 -0
- agno/tools/visualization.py +467 -0
- agno/tools/webbrowser.py +28 -0
- agno/tools/webex.py +76 -0
- agno/tools/website.py +54 -0
- agno/tools/webtools.py +45 -0
- agno/tools/whatsapp.py +286 -0
- agno/tools/wikipedia.py +63 -0
- agno/tools/workflow.py +278 -0
- agno/tools/x.py +335 -0
- agno/tools/yfinance.py +257 -0
- agno/tools/youtube.py +184 -0
- agno/tools/zendesk.py +82 -0
- agno/tools/zep.py +454 -0
- agno/tools/zoom.py +382 -0
- agno/utils/__init__.py +0 -0
- agno/utils/agent.py +820 -0
- agno/utils/audio.py +49 -0
- agno/utils/certs.py +27 -0
- agno/utils/code_execution.py +11 -0
- agno/utils/common.py +132 -0
- agno/utils/dttm.py +13 -0
- agno/utils/enum.py +22 -0
- agno/utils/env.py +11 -0
- agno/utils/events.py +696 -0
- agno/utils/format_str.py +16 -0
- agno/utils/functions.py +166 -0
- agno/utils/gemini.py +426 -0
- agno/utils/hooks.py +57 -0
- agno/utils/http.py +74 -0
- agno/utils/json_schema.py +234 -0
- agno/utils/knowledge.py +36 -0
- agno/utils/location.py +19 -0
- agno/utils/log.py +255 -0
- agno/utils/mcp.py +214 -0
- agno/utils/media.py +352 -0
- agno/utils/merge_dict.py +41 -0
- agno/utils/message.py +118 -0
- agno/utils/models/__init__.py +0 -0
- agno/utils/models/ai_foundry.py +43 -0
- agno/utils/models/claude.py +358 -0
- agno/utils/models/cohere.py +87 -0
- agno/utils/models/llama.py +78 -0
- agno/utils/models/mistral.py +98 -0
- agno/utils/models/openai_responses.py +140 -0
- agno/utils/models/schema_utils.py +153 -0
- agno/utils/models/watsonx.py +41 -0
- agno/utils/openai.py +257 -0
- agno/utils/pickle.py +32 -0
- agno/utils/pprint.py +178 -0
- agno/utils/print_response/__init__.py +0 -0
- agno/utils/print_response/agent.py +842 -0
- agno/utils/print_response/team.py +1724 -0
- agno/utils/print_response/workflow.py +1668 -0
- agno/utils/prompts.py +111 -0
- agno/utils/reasoning.py +108 -0
- agno/utils/response.py +163 -0
- agno/utils/response_iterator.py +17 -0
- agno/utils/safe_formatter.py +24 -0
- agno/utils/serialize.py +32 -0
- agno/utils/shell.py +22 -0
- agno/utils/streamlit.py +487 -0
- agno/utils/string.py +231 -0
- agno/utils/team.py +139 -0
- agno/utils/timer.py +41 -0
- agno/utils/tools.py +102 -0
- agno/utils/web.py +23 -0
- agno/utils/whatsapp.py +305 -0
- agno/utils/yaml_io.py +25 -0
- agno/vectordb/__init__.py +3 -0
- agno/vectordb/base.py +127 -0
- agno/vectordb/cassandra/__init__.py +5 -0
- agno/vectordb/cassandra/cassandra.py +501 -0
- agno/vectordb/cassandra/extra_param_mixin.py +11 -0
- agno/vectordb/cassandra/index.py +13 -0
- agno/vectordb/chroma/__init__.py +5 -0
- agno/vectordb/chroma/chromadb.py +929 -0
- agno/vectordb/clickhouse/__init__.py +9 -0
- agno/vectordb/clickhouse/clickhousedb.py +835 -0
- agno/vectordb/clickhouse/index.py +9 -0
- agno/vectordb/couchbase/__init__.py +3 -0
- agno/vectordb/couchbase/couchbase.py +1442 -0
- agno/vectordb/distance.py +7 -0
- agno/vectordb/lancedb/__init__.py +6 -0
- agno/vectordb/lancedb/lance_db.py +995 -0
- agno/vectordb/langchaindb/__init__.py +5 -0
- agno/vectordb/langchaindb/langchaindb.py +163 -0
- agno/vectordb/lightrag/__init__.py +5 -0
- agno/vectordb/lightrag/lightrag.py +388 -0
- agno/vectordb/llamaindex/__init__.py +3 -0
- agno/vectordb/llamaindex/llamaindexdb.py +166 -0
- agno/vectordb/milvus/__init__.py +4 -0
- agno/vectordb/milvus/milvus.py +1182 -0
- agno/vectordb/mongodb/__init__.py +9 -0
- agno/vectordb/mongodb/mongodb.py +1417 -0
- agno/vectordb/pgvector/__init__.py +12 -0
- agno/vectordb/pgvector/index.py +23 -0
- agno/vectordb/pgvector/pgvector.py +1462 -0
- agno/vectordb/pineconedb/__init__.py +5 -0
- agno/vectordb/pineconedb/pineconedb.py +747 -0
- agno/vectordb/qdrant/__init__.py +5 -0
- agno/vectordb/qdrant/qdrant.py +1134 -0
- agno/vectordb/redis/__init__.py +9 -0
- agno/vectordb/redis/redisdb.py +694 -0
- agno/vectordb/search.py +7 -0
- agno/vectordb/singlestore/__init__.py +10 -0
- agno/vectordb/singlestore/index.py +41 -0
- agno/vectordb/singlestore/singlestore.py +763 -0
- agno/vectordb/surrealdb/__init__.py +3 -0
- agno/vectordb/surrealdb/surrealdb.py +699 -0
- agno/vectordb/upstashdb/__init__.py +5 -0
- agno/vectordb/upstashdb/upstashdb.py +718 -0
- agno/vectordb/weaviate/__init__.py +8 -0
- agno/vectordb/weaviate/index.py +15 -0
- agno/vectordb/weaviate/weaviate.py +1005 -0
- agno/workflow/__init__.py +23 -0
- agno/workflow/agent.py +299 -0
- agno/workflow/condition.py +738 -0
- agno/workflow/loop.py +735 -0
- agno/workflow/parallel.py +824 -0
- agno/workflow/router.py +702 -0
- agno/workflow/step.py +1432 -0
- agno/workflow/steps.py +592 -0
- agno/workflow/types.py +520 -0
- agno/workflow/workflow.py +4321 -0
- agno-2.2.13.dist-info/METADATA +614 -0
- agno-2.2.13.dist-info/RECORD +575 -0
- agno-2.2.13.dist-info/WHEEL +5 -0
- agno-2.2.13.dist-info/licenses/LICENSE +201 -0
- agno-2.2.13.dist-info/top_level.txt +1 -0
agno/vectordb/mongodb/mongodb.py
@@ -0,0 +1,1417 @@
import asyncio
import time
from typing import Any, Dict, List, Optional, Union

from bson import ObjectId

from agno.filters import FilterExpr
from agno.knowledge.document import Document
from agno.knowledge.embedder import Embedder
from agno.utils.log import log_debug, log_info, log_warning, logger
from agno.vectordb.base import VectorDb
from agno.vectordb.distance import Distance
from agno.vectordb.search import SearchType

try:
    from hashlib import md5

except ImportError:
    raise ImportError("`hashlib` not installed. Please install using `pip install hashlib`")
try:
    from pymongo import AsyncMongoClient, MongoClient, errors
    from pymongo.collection import Collection
    from pymongo.operations import SearchIndexModel

except ImportError:
    raise ImportError("`pymongo` not installed. Please install using `pip install pymongo`")


class MongoDb(VectorDb):
    """
    MongoDB Vector Database implementation with elegant handling of Atlas Search index creation.
    """

    def __init__(
        self,
        collection_name: str,
        name: Optional[str] = None,
        description: Optional[str] = None,
        id: Optional[str] = None,
        db_url: Optional[str] = "mongodb://localhost:27017/",
        database: str = "agno",
        embedder: Optional[Embedder] = None,
        distance_metric: str = Distance.cosine,
        overwrite: bool = False,
        wait_until_index_ready_in_seconds: Optional[float] = 3,
        wait_after_insert_in_seconds: Optional[float] = 3,
        max_pool_size: int = 100,
        retry_writes: bool = True,
        client: Optional[MongoClient] = None,
        search_index_name: Optional[str] = "vector_index_1",
        cosmos_compatibility: Optional[bool] = False,
        search_type: SearchType = SearchType.vector,
        hybrid_vector_weight: float = 0.5,
        hybrid_keyword_weight: float = 0.5,
        hybrid_rank_constant: int = 60,
        **kwargs,
    ):
        """
        Initialize the MongoDb with MongoDB collection details.

        Args:
            collection_name (str): Name of the MongoDB collection.
            name (Optional[str]): Name of the vector database.
            description (Optional[str]): Description of the vector database.
            db_url (Optional[str]): MongoDB connection string.
            database (str): Database name.
            embedder (Embedder): Embedder instance for generating embeddings.
            distance_metric (str): Distance metric for similarity.
            overwrite (bool): Overwrite existing collection and index if True.
            wait_until_index_ready_in_seconds (float): Time in seconds to wait until the index is ready.
            wait_after_insert_in_seconds (float): Time in seconds to wait after inserting documents.
            max_pool_size (int): Maximum number of connections in the connection pool.
            retry_writes (bool): Whether to retry write operations.
            client (Optional[MongoClient]): An existing MongoClient instance.
            search_index_name (str): Name of the search index (default: "vector_index_1").
            cosmos_compatibility (bool): Whether to use Azure Cosmos DB Mongo vCore compatibility mode.
            search_type: The search type to use when searching for documents.
            hybrid_vector_weight (float): Default weight for vector search results in hybrid search.
            hybrid_keyword_weight (float): Default weight for keyword search results in hybrid search.
            hybrid_rank_constant (int): Default rank constant (k) for Reciprocal Rank Fusion in hybrid search. This constant is added to the rank before taking the reciprocal, helping to smooth scores. A common value is 60.
            **kwargs: Additional arguments for MongoClient.
        """
        # Validate required parameters
        if not collection_name:
            raise ValueError("Collection name must not be empty.")
        if not database:
            raise ValueError("Database name must not be empty.")

        # Dynamic ID generation based on unique identifiers
        if id is None:
            from agno.utils.string import generate_id

            connection_identifier = db_url or "mongodb://localhost:27017/"
            seed = f"{connection_identifier}#{database}#{collection_name}"
            id = generate_id(seed)

        self.collection_name = collection_name
        # Initialize base class with name, description, and generated ID
        super().__init__(id=id, name=name, description=description)

        self.database = database
        self.search_index_name = search_index_name
        self.cosmos_compatibility = cosmos_compatibility
        self.search_type = search_type
        self.hybrid_vector_weight = hybrid_vector_weight
        self.hybrid_keyword_weight = hybrid_keyword_weight
        self.hybrid_rank_constant = hybrid_rank_constant

        if embedder is None:
            from agno.knowledge.embedder.openai import OpenAIEmbedder

            embedder = OpenAIEmbedder()
            log_info("Embedder not provided, using OpenAIEmbedder as default.")
        self.embedder = embedder

        self.distance_metric = distance_metric
        self.connection_string = db_url
        self.overwrite = overwrite
        self.wait_until_index_ready_in_seconds = wait_until_index_ready_in_seconds
        self.wait_after_insert_in_seconds = wait_after_insert_in_seconds
        self.kwargs = kwargs
        self.kwargs.update(
            {
                "maxPoolSize": max_pool_size,
                "retryWrites": retry_writes,
                "serverSelectionTimeoutMS": 5000,  # 5 second timeout
            }
        )

        self._client = client
        self._db = None
        self._collection: Optional[Collection] = None

        self._async_client: Optional[AsyncMongoClient] = None
        self._async_db = None
        self._async_collection: Optional[Collection] = None

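    # A minimal construction sketch (illustrative, not part of the shipped file),
    # assuming the package re-exports MongoDb from agno.vectordb.mongodb; the
    # connection string and collection name below are placeholders:
    #
    #     from agno.vectordb.mongodb import MongoDb
    #     from agno.vectordb.search import SearchType
    #
    #     vector_db = MongoDb(
    #         collection_name="recipes",
    #         db_url="mongodb+srv://<user>:<password>@<cluster>/",
    #         search_type=SearchType.hybrid,
    #     )
    #     vector_db.create()  # creates the collection and the vector search index
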
    def _get_client(self) -> MongoClient:
        """Create or retrieve the MongoDB client."""
        if self._client is None:
            if self.cosmos_compatibility:
                try:
                    log_debug("Creating MongoDB Client for Azure Cosmos DB")
                    # Cosmos DB specific settings
                    cosmos_kwargs = {
                        "retryWrites": False,
                        "ssl": True,
                        "tlsAllowInvalidCertificates": True,
                        "maxPoolSize": 100,
                        "maxIdleTimeMS": 30000,
                    }

                    # Suppress UserWarning about CosmosDB
                    import warnings

                    with warnings.catch_warnings():
                        warnings.filterwarnings(
                            "ignore", category=UserWarning, message=".*connected to a CosmosDB cluster.*"
                        )
                        self._client = MongoClient(self.connection_string, **cosmos_kwargs)  # type: ignore

                        self._client.admin.command("ping")

                    log_info("Connected to Azure Cosmos DB successfully.")
                    self._db = self._client.get_database(self.database)  # type: ignore
                    log_info(f"Using database: {self.database}")

                except errors.ConnectionFailure as e:
                    raise ConnectionError(f"Failed to connect to Azure Cosmos DB: {e}")
                except Exception as e:
                    logger.error(f"An error occurred while connecting to Azure Cosmos DB: {e}")
                    raise
            else:
                try:
                    log_debug("Creating MongoDB Client")
                    self._client = MongoClient(self.connection_string, **self.kwargs)
                    # Trigger a connection to verify the client
                    self._client.admin.command("ping")
                    log_info("Connected to MongoDB successfully.")
                    self._db = self._client[self.database]  # type: ignore
                except errors.ConnectionFailure as e:
                    logger.error(f"Failed to connect to MongoDB: {e}")
                    raise ConnectionError(f"Failed to connect to MongoDB: {e}")
                except Exception as e:
                    logger.error(f"An error occurred while connecting to MongoDB: {e}")
                    raise
        return self._client

    async def _get_async_client(self) -> AsyncMongoClient:
        """Create or retrieve the async MongoDB client."""
        if self._async_client is None:
            log_debug("Creating Async MongoDB Client")
            self._async_client = AsyncMongoClient(
                self.connection_string,
                maxPoolSize=self.kwargs.get("maxPoolSize", 100),
                retryWrites=self.kwargs.get("retryWrites", True),
                serverSelectionTimeoutMS=5000,
            )
            # Verify connection
            try:
                await self._async_client.admin.command("ping")
                log_info("Connected to MongoDB asynchronously.")
            except Exception as e:
                logger.error(f"Failed to connect to MongoDB asynchronously: {e}")
                raise
        return self._async_client

    def _get_or_create_collection(self) -> Collection:
        """Get or create the MongoDB collection, handling Atlas Search index creation."""
        self._collection = self._db[self.collection_name]  # type: ignore

        if not self.collection_exists():
            log_info(f"Creating collection '{self.collection_name}'.")
            self._db.create_collection(self.collection_name)  # type: ignore
            self._create_search_index()
        else:
            log_info(f"Using existing collection '{self.collection_name}'.")
            # check if index exists
            log_info(f"Checking if search index '{self.collection_name}' exists.")
            if not self._search_index_exists():
                log_info(f"Search index '{self.collection_name}' does not exist. Creating it.")
                self._create_search_index()
                if self.wait_until_index_ready_in_seconds and not self.cosmos_compatibility:
                    self._wait_for_index_ready()
            else:
                log_info("Using existing vector search index.")
        return self._collection  # type: ignore

    def _get_collection(self) -> Collection:
        """Get or create the MongoDB collection."""
        if self._collection is None:
            if self._client is None:
                self._get_client()
            self._collection = self._db[self.collection_name]  # type: ignore
            log_info(f"Using collection: {self.collection_name}")
        return self._collection

    async def _get_async_collection(self):
        """Get or create the async MongoDB collection."""
        if self._async_collection is None:
            client = await self._get_async_client()
            self._async_db = client[self.database]  # type: ignore
            self._async_collection = self._async_db[self.collection_name]  # type: ignore
        return self._async_collection

    def _create_search_index(self, overwrite: bool = True) -> None:
        """Create or overwrite the Atlas Search index with proper error handling."""
        index_name = self.search_index_name or "vector_index_1"
        max_retries = 3
        retry_delay = 5

        if self.cosmos_compatibility:
            try:
                collection = self._get_collection()

                # Handle overwrite if requested
                if overwrite and index_name in collection.index_information():
                    log_info(f"Dropping existing index '{index_name}'")
                    collection.drop_index(index_name)

                embedding_dim = getattr(self.embedder, "dimensions", 1536)
                log_info(f"Creating vector search index '{index_name}'")

                # Create vector search index using Cosmos DB IVF format
                collection.create_index(
                    [("embedding", "cosmosSearch")],
                    name=index_name,
                    cosmosSearchOptions={
                        "kind": "vector-ivf",
                        "numLists": 1,
                        "dimensions": embedding_dim,
                        "similarity": self._get_cosmos_similarity_metric(),
                    },
                )

                log_info(f"Created vector search index '{index_name}' successfully")

            except Exception as e:
                logger.error(f"Error creating vector search index: {e}")
                raise
        else:
            for attempt in range(max_retries):
                try:
                    if overwrite and self._search_index_exists():
                        log_info(f"Dropping existing search index '{index_name}'.")
                        try:
                            collection = self._get_collection()
                            collection.drop_search_index(index_name)
                            # Wait longer after index deletion
                            time.sleep(retry_delay * 2)
                        except errors.OperationFailure as e:
                            if "Index already requested to be deleted" in str(e):
                                log_info("Index is already being deleted, waiting...")
                                time.sleep(retry_delay * 2)  # Wait longer for deletion to complete
                            else:
                                raise

                        # Verify index is gone before creating new one
                        retries = 3
                        while retries > 0 and self._search_index_exists():
                            log_info("Waiting for index deletion to complete...")
                            time.sleep(retry_delay)
                            retries -= 1

                    log_info(f"Creating search index '{index_name}'.")

                    # Get embedding dimension from embedder
                    embedding_dim = getattr(self.embedder, "dimensions", 1536)

                    search_index_model = SearchIndexModel(
                        definition={
                            "fields": [
                                {
                                    "type": "vector",
                                    "numDimensions": embedding_dim,
                                    "path": "embedding",
                                    "similarity": self.distance_metric,
                                },
                            ]
                        },
                        name=index_name,
                        type="vectorSearch",
                    )

                    collection = self._get_collection()
                    collection.create_search_index(model=search_index_model)

                    if self.wait_until_index_ready_in_seconds:
                        self._wait_for_index_ready()

                    log_info(f"Search index '{index_name}' created successfully.")
                    return

                except errors.OperationFailure as e:
                    if "Duplicate Index" in str(e) and attempt < max_retries - 1:
                        logger.warning(f"Index already exists, retrying... (attempt {attempt + 1})")
                        time.sleep(retry_delay * (attempt + 1))
                        continue
                    logger.error(f"Failed to create search index: {e}")
                    raise
                except Exception as e:
                    logger.error(f"Unexpected error creating search index: {e}")
                    raise

    async def _create_search_index_async(self) -> None:
        """Create the Atlas Search index asynchronously."""
        index_name = self.search_index_name
        max_retries = 3
        retry_delay = 5

        for attempt in range(max_retries):
            try:
                collection = await self._get_async_collection()

                # Get embedding dimension from embedder
                embedding_dim = getattr(self.embedder, "dimensions", 1536)

                search_index_model = SearchIndexModel(
                    definition={
                        "fields": [
                            {
                                "type": "vector",
                                "numDimensions": embedding_dim,
                                "path": "embedding",
                                "similarity": self.distance_metric,
                            },
                        ]
                    },
                    name=index_name,
                    type="vectorSearch",
                )

                await collection.create_search_index(model=search_index_model)
                log_info(f"Search index '{index_name}' created successfully.")
                return

            except Exception as e:
                if attempt < max_retries - 1:
                    await asyncio.sleep(retry_delay * (attempt + 1))
                    continue
                logger.error(f"Failed to create search index: {e}")
                raise

    def _search_index_exists(self) -> bool:
        """Check if the search index exists."""
        index_name = self.search_index_name
        if self.cosmos_compatibility:
            index_name = self.search_index_name or "vector_index_1"
            try:
                collection = self._get_collection()
                indexes = collection.index_information()

                for idx_name, idx_info in indexes.items():
                    if idx_name == index_name:
                        key_info = idx_info.get("key", [])
                        for key_value_pair in key_info:
                            # Ensure we have a tuple/list with exactly 2 elements
                            if isinstance(key_value_pair, (tuple, list)) and len(key_value_pair) == 2:
                                key, value = key_value_pair
                                if key == "embedding" and value == "cosmosSearch":
                                    log_debug(f"Found existing vector search index: {index_name}")
                                    return True

                log_debug(f"Vector search index '{index_name}' not found")
                return False
            except Exception as e:
                logger.error(f"Error checking search index existence: {e}")
                return False
        else:
            try:
                collection = self._get_collection()
                indexes = list(collection.list_search_indexes())  # type: ignore
                exists = any(index["name"] == index_name for index in indexes)  # type: ignore
                return exists
            except Exception as e:
                logger.error(f"Error checking search index existence: {e}")
                return False

    def _wait_for_index_ready(self) -> None:
        """Wait until the Atlas Search index is ready."""
        index_name = self.search_index_name
        while True:
            try:
                if self._search_index_exists():
                    log_info(f"Search index '{index_name}' is ready.")
                    break
            except Exception as e:
                logger.error(f"Error checking index status: {e}")
                raise TimeoutError("Timeout waiting for search index to become ready.")
            time.sleep(1)

    async def _wait_for_index_ready_async(self) -> None:
        """Wait until the Atlas Search index is ready asynchronously."""
        start_time = time.time()
        index_name = self.search_index_name
        while True:
            try:
                collection = await self._get_async_collection()
                indexes = await collection.list_search_indexes()
                if any(index["name"] == index_name for index in indexes):
                    log_info(f"Search index '{index_name}' is ready.")
                    break
            except Exception as e:
                logger.error(f"Error checking index status asynchronously: {e}")
                import traceback

                logger.error(f"Traceback: {traceback.format_exc()}")

            if time.time() - start_time > self.wait_until_index_ready_in_seconds:  # type: ignore
                raise TimeoutError("Timeout waiting for search index to become ready.")
            await asyncio.sleep(1)

    def collection_exists(self) -> bool:
        """Check if the collection exists in the database."""
        if self._db is None:
            self._get_client()
        return self.collection_name in self._db.list_collection_names()  # type: ignore

    def create(self) -> None:
        """Create the MongoDB collection and indexes if they do not exist."""
        self._get_or_create_collection()

    async def async_create(self) -> None:
        """Create the MongoDB collection and indexes asynchronously."""
        await self._get_async_collection()

        if not await self.async_exists():
            log_info(f"Creating collection '{self.collection_name}' asynchronously.")
            await self._async_db.create_collection(self.collection_name)  # type: ignore
            await self._create_search_index_async()
            if self.wait_until_index_ready_in_seconds:
                await self._wait_for_index_ready_async()

    def doc_exists(self, document: Document) -> bool:
        """Check if a document exists in the MongoDB collection based on its content."""
        try:
            collection = self._get_collection()
            # Use content hash as document ID
            doc_id = md5(document.content.encode("utf-8")).hexdigest()
            result = collection.find_one({"_id": doc_id})
            exists = result is not None
            log_debug(f"Document {'exists' if exists else 'does not exist'}: {doc_id}")
            return exists
        except Exception as e:
            logger.error(f"Error checking document existence: {e}")
            return False

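    # Note on the ID scheme above (illustrative): the _id is the hex MD5 digest of
    # the raw content, so identical content always maps to the same document, e.g.
    # md5("hello world".encode("utf-8")).hexdigest() == "5eb63bbbe01eeed093cb22bb8f5acdc3".
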
    def name_exists(self, name: str) -> bool:
        """Check if a document with a given name exists in the collection."""
        try:
            collection = self._get_collection()
            exists = collection.find_one({"name": name}) is not None
            log_debug(f"Document with name '{name}' {'exists' if exists else 'does not exist'}")
            return exists
        except Exception as e:
            logger.error(f"Error checking document name existence: {e}")
            return False

    def id_exists(self, id: str) -> bool:
        """Check if a document with the given ID exists in the collection.

        Args:
            id (str): The document ID to check.

        Returns:
            bool: True if the document exists, False otherwise.
        """
        try:
            collection = self._get_collection()
            result = collection.find_one({"_id": id})
            exists = result is not None
            log_debug(f"Document with ID '{id}' {'exists' if exists else 'does not exist'}")
            return exists
        except Exception as e:
            logger.error(f"Error checking document ID existence: {e}")
            return False

    def content_hash_exists(self, content_hash: str) -> bool:
        """Check if documents with the given content hash exist in the collection.

        Args:
            content_hash (str): The content hash to check.

        Returns:
            bool: True if documents with the content hash exist, False otherwise.
        """
        try:
            collection = self._get_collection()
            result = collection.find_one({"content_hash": content_hash})
            exists = result is not None
            log_debug(f"Document with content_hash '{content_hash}' {'exists' if exists else 'does not exist'}")
            return exists
        except Exception as e:
            logger.error(f"Error checking content_hash existence: {e}")
            return False

    def insert(self, content_hash: str, documents: List[Document], filters: Optional[Dict[str, Any]] = None) -> None:
        """Insert documents into the MongoDB collection."""
        log_debug(f"Inserting {len(documents)} documents")
        collection = self._get_collection()

        prepared_docs = []
        for document in documents:
            try:
                document.embed(embedder=self.embedder)
                if document.embedding is None:
                    raise ValueError(f"Failed to generate embedding for document: {document.id}")
                doc_data = self.prepare_doc(content_hash, document, filters)
                prepared_docs.append(doc_data)
            except ValueError as e:
                logger.error(f"Error preparing document '{document.name}': {e}")

        if prepared_docs:
            try:
                collection.insert_many(prepared_docs, ordered=False)
                log_info(f"Inserted {len(prepared_docs)} documents successfully.")
                if self.wait_after_insert_in_seconds and self.wait_after_insert_in_seconds > 0:
                    time.sleep(self.wait_after_insert_in_seconds)
            except errors.BulkWriteError as e:
                logger.warning(f"Bulk write error while inserting documents: {e.details}")
            except Exception as e:
                logger.error(f"Error inserting documents: {e}")

    def upsert(self, content_hash: str, documents: List[Document], filters: Optional[Dict[str, Any]] = None) -> None:
        """Upsert documents into the MongoDB collection."""
        log_info(f"Upserting {len(documents)} documents")
        collection = self._get_collection()

        for document in documents:
            try:
                document.embed(embedder=self.embedder)
                if document.embedding is None:
                    raise ValueError(f"Failed to generate embedding for document: {document.id}")
                doc_data = self.prepare_doc(content_hash, document, filters)
                collection.update_one(
                    {"_id": doc_data["_id"]},
                    {"$set": doc_data},
                    upsert=True,
                )
                log_info(f"Upserted document: {doc_data['_id']}")
            except Exception as e:
                logger.error(f"Error upserting document '{document.name}': {e}")

    def upsert_available(self) -> bool:
        """Indicate that upsert functionality is available."""
        return True

    def search(
        self,
        query: str,
        limit: int = 5,
        filters: Optional[Union[Dict[str, Any], List[FilterExpr]]] = None,
        min_score: float = 0.0,
    ) -> List[Document]:
        """Search for documents using vector similarity."""
        if isinstance(filters, List):
            log_warning("Filters Expressions are not supported in MongoDB. No filters will be applied.")
            filters = None
        if self.search_type == SearchType.hybrid:
            return self.hybrid_search(query, limit=limit, filters=filters)

        query_embedding = self.embedder.get_embedding(query)
        if query_embedding is None:
            logger.error(f"Failed to generate embedding for query: {query}")
            return []

        if self.cosmos_compatibility:
            # Azure Cosmos DB Mongo vCore compatibility mode
            try:
                collection = self._get_collection()

                # Construct the search pipeline
                search_stage = {
                    "$search": {
                        "cosmosSearch": {"vector": query_embedding, "path": "embedding", "k": limit, "nProbes": 2},
                        "returnStoredSource": True,
                    }
                }

                pipeline = [
                    search_stage,
                    {
                        "$project": {
                            "similarityScore": {"$meta": "searchScore"},
                            "_id": 1,
                            "name": 1,
                            "content": 1,
                            "meta_data": 1,
                        }
                    },
                ]

                results = list(collection.aggregate(pipeline))
                docs = [
                    Document(
                        id=str(doc["_id"]),
                        name=doc.get("name"),
                        content=doc["content"],
                        meta_data={**doc.get("meta_data", {}), "score": doc.get("similarityScore", 0.0)},
                        content_id=doc.get("content_id"),
                    )
                    for doc in results
                ]

                log_info(f"Search completed. Found {len(docs)} documents.")
                return docs

            except Exception as e:
                logger.error(f"Error during vector search: {e}")
                return []
        else:
            # MongoDB Atlas Search
            try:
                collection = self._get_collection()
                pipeline = [
                    {
                        "$vectorSearch": {
                            "index": self.search_index_name,
                            "limit": limit,
                            "numCandidates": min(limit * 4, 100),
                            "queryVector": query_embedding,
                            "path": "embedding",
                        }
                    },
                    {"$set": {"score": {"$meta": "vectorSearchScore"}}},
                ]

                match_filters = {}
                if min_score > 0:
                    match_filters["score"] = {"$gte": min_score}

                # Handle filters if provided
                if filters:
                    # MongoDB uses dot notation for nested fields, so we need to prepend meta_data. if needed
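                    # Illustrative mapping (hypothetical filter values): a caller-supplied
                    #   {"cuisine": "thai", "meta_data.source": "web"}
                    # becomes
                    #   {"meta_data.cuisine": "thai", "meta_data.source": "web"}
                    # since keys that already use dot notation pass through unchanged.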
                    mongo_filters = {}
                    for key, value in filters.items():
                        # If the key doesn't already include a dot notation for meta_data
                        if not key.startswith("meta_data.") and "." not in key:
                            mongo_filters[f"meta_data.{key}"] = value
                        else:
                            mongo_filters[key] = value

                    match_filters.update(mongo_filters)

                if match_filters:
                    pipeline.append({"$match": match_filters})  # type: ignore

                pipeline.append({"$project": {"embedding": 0}})

                results = list(collection.aggregate(pipeline))  # type: ignore

                docs = []
                for doc in results:
                    # Convert ObjectIds to strings before creating Document
                    clean_doc = self._convert_objectids_to_strings(doc)
                    document = Document(
                        id=str(clean_doc["_id"]),
                        name=clean_doc.get("name"),
                        content=clean_doc["content"],
                        meta_data={**clean_doc.get("meta_data", {}), "score": clean_doc.get("score", 0.0)},
                        content_id=clean_doc.get("content_id"),
                    )
                    docs.append(document)

                log_info(f"Search completed. Found {len(docs)} documents.")
                return docs

            except Exception as e:
                logger.error(f"Error during search: {e}")
                raise

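    # Illustrative call with placeholder values: combined with the metadata
    # prefixing above,
    #
    #     vector_db.search("tom yum soup", limit=3, filters={"cuisine": "thai"})
    #
    # runs $vectorSearch and keeps only results whose meta_data.cuisine == "thai".
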
    def vector_search(self, query: str, limit: int = 5) -> List[Document]:
        """Perform a vector-based search."""
        log_debug("Performing vector search.")
        return self.search(query, limit=limit)

    def keyword_search(self, query: str, limit: int = 5) -> List[Document]:
        """Perform a keyword-based search."""
        try:
            collection = self._get_collection()
            cursor = collection.find(
                {"content": {"$regex": query, "$options": "i"}},
                {"_id": 1, "name": 1, "content": 1, "meta_data": 1, "content_id": 1},
            ).limit(limit)
            results = [
                Document(
                    id=str(doc["_id"]),
                    name=doc.get("name"),
                    content=doc["content"],
                    meta_data=doc.get("meta_data", {}),
                    content_id=doc.get("content_id"),
                )
                for doc in cursor
            ]
            log_debug(f"Keyword search completed. Found {len(results)} documents.")
            return results
        except Exception as e:
            logger.error(f"Error during keyword search: {e}")
            return []

    def hybrid_search(
        self,
        query: str,
        limit: int = 5,
        filters: Optional[Dict[str, Any]] = None,
    ) -> List[Document]:
        """
        Perform a hybrid search combining vector and keyword-based searches using Reciprocal Rank Fusion.

        Weights for vector and keyword search are configured at the instance level (hybrid_vector_weight, hybrid_keyword_weight).
        The rank constant k is used in the RRF formula `1 / (rank + k)` to smooth scores.

        Reference: https://www.mongodb.com/docs/atlas/atlas-vector-search/tutorials/reciprocal-rank-fusion
        """

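        # Worked example of the fused score with the defaults (weights 0.5 and 0.5,
        # k = 60): a document ranked first (rank 0) by vector search and third
        # (rank 2) by keyword search scores 0.5/(0+60+1) + 0.5/(2+60+1)
        # = 0.5/61 + 0.5/63 ~= 0.016133, narrowly ahead of a document ranked
        # second (rank 1) in both branches: 2 * 0.5/(1+60+1) ~= 0.016129.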
756
|
+
if self.cosmos_compatibility:
|
|
757
|
+
log_warning("Hybrid search is not implemented for Cosmos DB compatibility mode. Returning empty list.")
|
|
758
|
+
return []
|
|
759
|
+
|
|
760
|
+
log_debug(f"Performing hybrid search for query: '{query}' with limit: {limit}")
|
|
761
|
+
|
|
762
|
+
query_embedding = self.embedder.get_embedding(query)
|
|
763
|
+
if query_embedding is None:
|
|
764
|
+
logger.error(f"Failed to generate embedding for query: {query}")
|
|
765
|
+
return []
|
|
766
|
+
|
|
767
|
+
collection = self._get_collection()
|
|
768
|
+
|
|
769
|
+
k = self.hybrid_rank_constant
|
|
770
|
+
|
|
771
|
+
mongo_filters = {}
|
|
772
|
+
if filters:
|
|
773
|
+
for key, value in filters.items():
|
|
774
|
+
# If the key doesn't already include a dot notation for meta_data
|
|
775
|
+
if not key.startswith("meta_data.") and "." not in key:
|
|
776
|
+
mongo_filters[f"meta_data.{key}"] = value
|
|
777
|
+
else:
|
|
778
|
+
mongo_filters[key] = value
|
|
779
|
+
|
|
780
|
+
pipeline = [
|
|
781
|
+
# Vector Search Branch
|
|
782
|
+
{
|
|
783
|
+
"$vectorSearch": {
|
|
784
|
+
"index": self.search_index_name,
|
|
785
|
+
"path": "embedding",
|
|
786
|
+
"queryVector": query_embedding,
|
|
787
|
+
"numCandidates": min(limit * 10, 200),
|
|
788
|
+
"limit": limit * 2,
|
|
789
|
+
}
|
|
790
|
+
},
|
|
791
|
+
{"$group": {"_id": None, "docs": {"$push": "$$ROOT"}}},
|
|
792
|
+
{"$unwind": {"path": "$docs", "includeArrayIndex": "rank"}},
|
|
793
|
+
{
|
|
794
|
+
"$addFields": {
|
|
795
|
+
"_id": "$docs._id",
|
|
796
|
+
"name": "$docs.name",
|
|
797
|
+
"content": "$docs.content",
|
|
798
|
+
"meta_data": "$docs.meta_data",
|
|
799
|
+
"content_id": "$docs.content_id",
|
|
800
|
+
"vs_score": {
|
|
801
|
+
"$divide": [
|
|
802
|
+
self.hybrid_vector_weight,
|
|
803
|
+
{"$add": ["$rank", k, 1]},
|
|
804
|
+
]
|
|
805
|
+
},
|
|
806
|
+
"fts_score": 0.0, # Ensure fts_score exists with a default value
|
|
807
|
+
}
|
|
808
|
+
},
|
|
809
|
+
{
|
|
810
|
+
"$project": {
|
|
811
|
+
"_id": 1,
|
|
812
|
+
"name": 1,
|
|
813
|
+
"content": 1,
|
|
814
|
+
"meta_data": 1,
|
|
815
|
+
"content_id": 1,
|
|
816
|
+
"vs_score": 1,
|
|
817
|
+
# Now fts_score is included with its value (0.0 here)
|
|
818
|
+
"fts_score": 1,
|
|
819
|
+
}
|
|
820
|
+
},
|
|
821
|
+
# Union with Keyword Search Branch
|
|
822
|
+
{
|
|
823
|
+
"$unionWith": {
|
|
824
|
+
"coll": self.collection_name,
|
|
825
|
+
"pipeline": [
|
|
826
|
+
{
|
|
827
|
+
"$search": {
|
|
828
|
+
"index": "default",
|
|
829
|
+
"text": {"query": query, "path": "content"},
|
|
830
|
+
}
|
|
831
|
+
},
|
|
832
|
+
{"$limit": limit * 2},
|
|
833
|
+
{"$group": {"_id": None, "docs": {"$push": "$$ROOT"}}},
|
|
834
|
+
{"$unwind": {"path": "$docs", "includeArrayIndex": "rank"}},
|
|
835
|
+
{
|
|
836
|
+
"$addFields": {
|
|
837
|
+
"_id": "$docs._id",
|
|
838
|
+
"name": "$docs.name",
|
|
839
|
+
"content": "$docs.content",
|
|
840
|
+
"meta_data": "$docs.meta_data",
|
|
841
|
+
"content_id": "$docs.content_id",
|
|
842
|
+
"vs_score": 0.0,
|
|
843
|
+
"fts_score": {
|
|
844
|
+
"$divide": [
|
|
845
|
+
self.hybrid_keyword_weight,
|
|
846
|
+
{"$add": ["$rank", k, 1]},
|
|
847
|
+
]
|
|
848
|
+
},
|
|
849
|
+
}
|
|
850
|
+
},
|
|
851
|
+
{
|
|
852
|
+
"$project": {
|
|
853
|
+
"_id": 1,
|
|
854
|
+
"name": 1,
|
|
855
|
+
"content": 1,
|
|
856
|
+
"meta_data": 1,
|
|
857
|
+
"content_id": 1,
|
|
858
|
+
"vs_score": 1,
|
|
859
|
+
"fts_score": 1,
|
|
860
|
+
}
|
|
861
|
+
},
|
|
862
|
+
],
|
|
863
|
+
}
|
|
864
|
+
},
|
|
865
|
+
# Combine and Rank
|
|
866
|
+
{
|
|
867
|
+
"$group": {
|
|
868
|
+
"_id": "$_id",
|
|
869
|
+
"name": {"$first": "$name"},
|
|
870
|
+
"content": {"$first": "$content"},
|
|
871
|
+
"meta_data": {"$first": "$meta_data"},
|
|
872
|
+
"content_id": {"$first": "$content_id"},
|
|
873
|
+
"vs_score": {"$sum": "$vs_score"},
|
|
874
|
+
"fts_score": {"$sum": "$fts_score"},
|
|
875
|
+
}
|
|
876
|
+
},
|
|
877
|
+
{
|
|
878
|
+
"$project": {
|
|
879
|
+
"_id": 1,
|
|
880
|
+
"name": 1,
|
|
881
|
+
"content": 1,
|
|
882
|
+
"meta_data": 1,
|
|
883
|
+
"content_id": 1,
|
|
884
|
+
"score": {"$add": ["$vs_score", "$fts_score"]},
|
|
885
|
+
}
|
|
886
|
+
},
|
|
887
|
+
{"$sort": {"score": -1}},
|
|
888
|
+
{"$limit": limit},
|
|
889
|
+
]
|
|
890
|
+
|
|
891
|
+
# Apply filters if provided
|
|
892
|
+
if mongo_filters:
|
|
893
|
+
pipeline.append({"$match": mongo_filters})
|
|
894
|
+
|
|
895
|
+
try:
|
|
896
|
+
from typing import Mapping, Sequence, cast
|
|
897
|
+
|
|
898
|
+
results = list(collection.aggregate(cast(Sequence[Mapping[str, Any]], pipeline)))
|
|
899
|
+
|
|
900
|
+
docs = []
|
|
901
|
+
for doc in results:
|
|
902
|
+
# Convert ObjectIds to strings before creating Document
|
|
903
|
+
clean_doc = self._convert_objectids_to_strings(doc)
|
|
904
|
+
document = Document(
|
|
905
|
+
id=str(clean_doc["_id"]),
|
|
906
|
+
name=clean_doc.get("name"),
|
|
907
|
+
content=clean_doc["content"],
|
|
908
|
+
meta_data={**clean_doc.get("meta_data", {}), "score": clean_doc.get("score", 0.0)},
|
|
909
|
+
content_id=clean_doc.get("content_id"),
|
|
910
|
+
)
|
|
911
|
+
docs.append(document)
|
|
912
|
+
|
|
913
|
+
log_info(f"Hybrid search completed. Found {len(docs)} documents.")
|
|
914
|
+
return docs
|
|
915
|
+
except errors.OperationFailure as e:
|
|
916
|
+
logger.error(
|
|
917
|
+
f"Error during hybrid search, potentially due to missing or misconfigured Atlas Search index for text search: {e}"
|
|
918
|
+
)
|
|
919
|
+
logger.error(f"Details: {e.details}")
|
|
920
|
+
return []
|
|
921
|
+
except Exception as e:
|
|
922
|
+
logger.error(f"Error during hybrid search: {e}")
|
|
923
|
+
import traceback
|
|
924
|
+
|
|
925
|
+
logger.error(f"Traceback: {traceback.format_exc()}")
|
|
926
|
+
return []
|
|
927
|
+
|
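To make the fusion step concrete: each branch of the pipeline above scores a document as weight / (rank + k + 1), and the final `$group` sums the two contributions. The sketch below reproduces that computation in plain Python; the function name `rrf` and the weight/constant values are illustrative stand-ins for the instance-level hybrid_vector_weight, hybrid_keyword_weight, and hybrid_rank_constant, not package defaults.

```python
# Minimal sketch of the Reciprocal Rank Fusion computed by the pipeline above.
from collections import defaultdict
from typing import Dict, List


def rrf(
    vector_ranking: List[str],
    keyword_ranking: List[str],
    vector_weight: float = 0.5,
    keyword_weight: float = 0.5,
    k: int = 60,
) -> Dict[str, float]:
    """Each branch contributes weight / (rank + k + 1); contributions are summed per doc."""
    scores: Dict[str, float] = defaultdict(float)
    for rank, doc_id in enumerate(vector_ranking):  # 0-based rank, like $includeArrayIndex
        scores[doc_id] += vector_weight / (rank + k + 1)
    for rank, doc_id in enumerate(keyword_ranking):
        scores[doc_id] += keyword_weight / (rank + k + 1)
    return dict(sorted(scores.items(), key=lambda kv: kv[1], reverse=True))


# "a" is ranked first by both branches and therefore outscores every other document:
print(rrf(["a", "b", "c"], ["a", "c", "d"]))
```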
+    def drop(self) -> None:
+        """Drop the collection and clean up indexes."""
+        collection = self._get_collection()
+        index_name = self.search_index_name or "vector_index_1"
+
+        if self.exists():
+            if self.cosmos_compatibility:
+                # Cosmos DB specific handling
+                try:
+                    # Drop the index if it exists
+                    if self._search_index_exists():
+                        log_info(f"Dropping index '{index_name}'")
+                        try:
+                            collection.drop_index(index_name)
+                        except Exception as e:
+                            logger.error(f"Error dropping index: {e}")
+
+                except Exception as e:
+                    logger.error(f"Error dropping collection: {e}")
+                    raise
+            else:
+                # MongoDB Atlas specific handling
+                try:
+                    if self._search_index_exists():
+                        collection.drop_search_index(index_name)
+                        time.sleep(2)
+
+                except Exception as e:
+                    logger.error(f"Error dropping collection: {e}")
+                    raise
+
+            # Drop the collection
+            collection.drop()
+            time.sleep(2)
+
+            log_info(f"Collection '{self.collection_name}' dropped successfully")
+
+    def exists(self) -> bool:
+        """Check if the MongoDB collection exists."""
+        exists = self.collection_exists()
+        log_debug(f"Collection '{self.collection_name}' existence: {exists}")
+        return exists
+
+    def optimize(self) -> None:
+        """TODO: not implemented"""
+        pass
+
+    def delete(self) -> bool:
+        """Delete all documents from the collection."""
+        if self.exists():
+            try:
+                collection = self._get_collection()
+                result = collection.delete_many({})
+                # Consider any deletion (even 0) as success
+                success = result.deleted_count >= 0
+                log_info(f"Deleted {result.deleted_count} documents from collection.")
+                return success
+            except Exception as e:
+                logger.error(f"Error deleting documents: {e}")
+                return False
+        # Return True if collection doesn't exist (nothing to delete)
+        return True
+
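Note the division of labor between the two cleanup methods above: delete() only empties the collection via delete_many({}), leaving the collection and its search index in place, while drop() removes the search index and then the collection itself. A usage sketch, where `vector_db` is a hypothetical configured instance of this class:

```python
# Hypothetical usage; `vector_db` stands in for a configured instance of this class.
vector_db.delete()  # empty the collection, keep the collection and its search index
vector_db.drop()    # drop the search index (if present), then the collection itself
```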
+    def prepare_doc(
+        self, content_hash: str, document: Document, filters: Optional[Dict[str, Any]] = None
+    ) -> Dict[str, Any]:
+        """Prepare a document for insertion or upsertion into MongoDB."""
+
+        # Add filters to document metadata if provided
+        if filters:
+            meta_data = document.meta_data.copy() if document.meta_data else {}
+            meta_data.update(filters)
+            document.meta_data = meta_data
+
+        cleaned_content = document.content.replace("\x00", "\ufffd")
+        doc_id = md5(cleaned_content.encode("utf-8")).hexdigest()
+        doc_data = {
+            "_id": doc_id,
+            "name": document.name,
+            "content": cleaned_content,
+            "meta_data": document.meta_data,
+            "embedding": document.embedding,
+            "content_id": document.content_id,
+            "content_hash": content_hash,
+        }
+        log_debug(f"Prepared document: {doc_data['_id']}")
+        return doc_data
+
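prepare_doc derives the Mongo `_id` from an MD5 hash of the sanitized content, so identical text always maps to the same document and re-ingestion overwrites rather than duplicates. A standalone sketch of that scheme, mirroring the two relevant lines above:

```python
# Standalone sketch of the content-addressed _id scheme in prepare_doc above.
from hashlib import md5

content = "MongoDB is a document database.\x00"
cleaned = content.replace("\x00", "\ufffd")  # NUL bytes are replaced, as above
doc_id = md5(cleaned.encode("utf-8")).hexdigest()

# The id is deterministic: re-ingesting identical content yields the same _id,
# so an upsert overwrites the existing document instead of duplicating it.
assert doc_id == md5(cleaned.encode("utf-8")).hexdigest()
print(doc_id)
```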
+    def get_count(self) -> int:
+        """Get the count of documents in the MongoDB collection."""
+        try:
+            collection = self._get_collection()
+            count = collection.count_documents({})
+            log_debug(f"Collection '{self.collection_name}' has {count} documents.")
+            return count
+        except Exception as e:
+            logger.error(f"Error getting document count: {e}")
+            return 0
+
+    async def async_doc_exists(self, document: Document) -> bool:
+        """Check if a document exists asynchronously."""
+        try:
+            collection = await self._get_async_collection()
+            doc_id = md5(document.content.encode("utf-8")).hexdigest()
+            result = await collection.find_one({"_id": doc_id})
+            exists = result is not None
+            log_debug(f"Document {'exists' if exists else 'does not exist'}: {doc_id}")
+            return exists
+        except Exception as e:
+            logger.error(f"Error checking document existence asynchronously: {e}")
+            return False
+
+    async def async_insert(
+        self, content_hash: str, documents: List[Document], filters: Optional[Dict[str, Any]] = None
+    ) -> None:
+        """Insert documents asynchronously."""
+        log_debug(f"Inserting {len(documents)} documents asynchronously")
+        collection = await self._get_async_collection()
+
+        if self.embedder.enable_batch and hasattr(self.embedder, "async_get_embeddings_batch_and_usage"):
+            # Use batch embedding when enabled and supported
+            try:
+                # Extract content from all documents
+                doc_contents = [doc.content for doc in documents]
+
+                # Get batch embeddings and usage
+                embeddings, usages = await self.embedder.async_get_embeddings_batch_and_usage(doc_contents)
+
+                # Process documents with pre-computed embeddings
+                for j, doc in enumerate(documents):
+                    try:
+                        if j < len(embeddings):
+                            doc.embedding = embeddings[j]
+                            doc.usage = usages[j] if j < len(usages) else None
+                    except Exception as e:
+                        logger.error(f"Error assigning batch embedding to document '{doc.name}': {e}")
+
+            except Exception as e:
+                # Check if this is a rate limit error - don't fall back as it would make things worse
+                error_str = str(e).lower()
+                is_rate_limit = any(
+                    phrase in error_str
+                    for phrase in ["rate limit", "too many requests", "429", "trial key", "api calls / minute"]
+                )
+
+                if is_rate_limit:
+                    logger.error(f"Rate limit detected during batch embedding. {e}")
+                    raise e
+                else:
+                    logger.warning(f"Async batch embedding failed, falling back to individual embeddings: {e}")
+                    # Fall back to individual embedding
+                    embed_tasks = [doc.async_embed(embedder=self.embedder) for doc in documents]
+                    await asyncio.gather(*embed_tasks, return_exceptions=True)
+        else:
+            # Use individual embedding
+            embed_tasks = [document.async_embed(embedder=self.embedder) for document in documents]
+            await asyncio.gather(*embed_tasks, return_exceptions=True)
+
+        prepared_docs = []
+        for document in documents:
+            try:
+                doc_data = self.prepare_doc(content_hash, document, filters)
+                prepared_docs.append(doc_data)
+            except ValueError as e:
+                logger.error(f"Error preparing document '{document.name}': {e}")
+
+        if prepared_docs:
+            try:
+                await collection.insert_many(prepared_docs, ordered=False)
+                log_info(f"Inserted {len(prepared_docs)} documents successfully.")
+                if self.wait_after_insert_in_seconds and self.wait_after_insert_in_seconds > 0:
+                    await asyncio.sleep(self.wait_after_insert_in_seconds)
+            except errors.BulkWriteError as e:
+                logger.warning(f"Bulk write error while inserting documents: {e.details}")
+            except Exception as e:
+                logger.error(f"Error inserting documents asynchronously: {e}")
+
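The batch-embedding branch above follows a deliberate pattern: attempt one batched call, refuse to fall back when the failure looks like a rate limit (per-document retries would only burn more quota), and otherwise degrade to individual embedding. A condensed sketch of that control flow, with `embedder` and `documents` as hypothetical stand-ins for the attributes used above:

```python
import asyncio


async def embed_all(embedder, documents) -> None:
    """Condensed sketch of the embedding fallback used by async_insert/async_upsert."""
    try:
        embeddings, usages = await embedder.async_get_embeddings_batch_and_usage(
            [doc.content for doc in documents]
        )
        for j, doc in enumerate(documents):
            if j < len(embeddings):
                doc.embedding = embeddings[j]
                doc.usage = usages[j] if j < len(usages) else None
    except Exception as e:
        if any(p in str(e).lower() for p in ("rate limit", "too many requests", "429")):
            raise  # retrying per-document would only burn more quota
        # Otherwise degrade gracefully: embed each document individually.
        await asyncio.gather(
            *(doc.async_embed(embedder=embedder) for doc in documents),
            return_exceptions=True,
        )
```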
+    async def async_upsert(
+        self, content_hash: str, documents: List[Document], filters: Optional[Dict[str, Any]] = None
+    ) -> None:
+        """Upsert documents asynchronously."""
+        log_info(f"Upserting {len(documents)} documents asynchronously")
+        collection = await self._get_async_collection()
+
+        if self.embedder.enable_batch and hasattr(self.embedder, "async_get_embeddings_batch_and_usage"):
+            # Use batch embedding when enabled and supported
+            try:
+                # Extract content from all documents
+                doc_contents = [doc.content for doc in documents]
+
+                # Get batch embeddings and usage
+                embeddings, usages = await self.embedder.async_get_embeddings_batch_and_usage(doc_contents)
+
+                # Process documents with pre-computed embeddings
+                for j, doc in enumerate(documents):
+                    try:
+                        if j < len(embeddings):
+                            doc.embedding = embeddings[j]
+                            doc.usage = usages[j] if j < len(usages) else None
+                    except Exception as e:
+                        logger.error(f"Error assigning batch embedding to document '{doc.name}': {e}")
+
+            except Exception as e:
+                # Check if this is a rate limit error - don't fall back as it would make things worse
+                error_str = str(e).lower()
+                is_rate_limit = any(
+                    phrase in error_str
+                    for phrase in ["rate limit", "too many requests", "429", "trial key", "api calls / minute"]
+                )
+
+                if is_rate_limit:
+                    logger.error(f"Rate limit detected during batch embedding. {e}")
+                    raise e
+                else:
+                    logger.warning(f"Async batch embedding failed, falling back to individual embeddings: {e}")
+                    # Fall back to individual embedding
+                    embed_tasks = [doc.async_embed(embedder=self.embedder) for doc in documents]
+                    await asyncio.gather(*embed_tasks, return_exceptions=True)
+        else:
+            # Use individual embedding
+            embed_tasks = [document.async_embed(embedder=self.embedder) for document in documents]
+            await asyncio.gather(*embed_tasks, return_exceptions=True)
+
+        for document in documents:
+            try:
+                doc_data = self.prepare_doc(content_hash, document, filters)
+                await collection.update_one(
+                    {"_id": doc_data["_id"]},
+                    {"$set": doc_data},
+                    upsert=True,
+                )
+                log_info(f"Upserted document: {doc_data['_id']}")
+            except Exception as e:
+                logger.error(f"Error upserting document '{document.name}' asynchronously: {e}")
+
+    async def async_search(
+        self, query: str, limit: int = 5, filters: Optional[Union[Dict[str, Any], List[FilterExpr]]] = None
+    ) -> List[Document]:
+        """Search for documents asynchronously."""
+        if isinstance(filters, List):
+            log_warning("Filters Expressions are not supported in MongoDB. No filters will be applied.")
+            filters = None
+        query_embedding = self.embedder.get_embedding(query)
+        if query_embedding is None:
+            logger.error(f"Failed to generate embedding for query: {query}")
+            return []
+
+        try:
+            collection = await self._get_async_collection()
+            pipeline = [
+                {
+                    "$vectorSearch": {
+                        "index": self.search_index_name,
+                        "limit": limit,
+                        "numCandidates": min(limit * 4, 100),
+                        "queryVector": query_embedding,
+                        "path": "embedding",
+                    }
+                },
+                {"$set": {"score": {"$meta": "vectorSearchScore"}}},
+            ]
+
+            # Handle filters if provided
+            if filters:
+                # MongoDB uses dot notation for nested fields, so we need to prepend meta_data. if needed
+                mongo_filters = {}
+                for key, value in filters.items():
+                    # If the key doesn't already include a dot notation for meta_data
+                    if not key.startswith("meta_data.") and "." not in key:
+                        mongo_filters[f"meta_data.{key}"] = value
+                    else:
+                        mongo_filters[key] = value
+
+                pipeline.append({"$match": mongo_filters})
+
+            pipeline.append({"$project": {"embedding": 0}})
+
+            # With AsyncMongoClient, aggregate() returns a coroutine that resolves to a cursor
+            # We need to await it first to get the cursor
+            cursor = await collection.aggregate(pipeline)
+
+            # Now we can iterate over the cursor to get results
+            results = []
+            async for doc in cursor:
+                results.append(doc)
+                if len(results) >= limit:
+                    break
+
+            docs = [
+                Document(
+                    id=str(doc["_id"]),
+                    name=doc.get("name"),
+                    content=doc["content"],
+                    meta_data={**doc.get("meta_data", {}), "score": doc.get("score", 0.0)},
+                    content_id=doc.get("content_id"),
+                )
+                for doc in results
+            ]
+
+            log_info(f"Async search completed. Found {len(docs)} documents.")
+            return docs
+
+        except Exception as e:
+            logger.error(f"Error during async search: {e}")
+            # Include traceback for better debugging
+            import traceback
+
+            logger.error(f"Traceback: {traceback.format_exc()}")
+            raise
+
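async_search and the other async_* methods are coroutines, so they need an event loop to run. A minimal usage sketch, where `vector_db` is a hypothetical configured instance of this class:

```python
import asyncio

# Hypothetical usage; `vector_db` stands in for a configured instance of this class.
async def main() -> None:
    docs = await vector_db.async_search("what is reciprocal rank fusion?", limit=3)
    for doc in docs:
        # async_search copies the vectorSearchScore into meta_data["score"]
        print(doc.meta_data.get("score"), doc.name)

asyncio.run(main())
```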
+    async def async_drop(self) -> None:
+        """Drop the collection asynchronously."""
+        if await self.async_exists():
+            try:
+                collection = await self._get_async_collection()
+                await collection.drop()
+                log_info(f"Collection '{self.collection_name}' dropped asynchronously")
+            except Exception as e:
+                logger.error(f"Error dropping collection asynchronously: {e}")
+                raise
+
+    async def async_exists(self) -> bool:
+        """Check if the collection exists asynchronously."""
+        try:
+            client = await self._get_async_client()
+            collection_names = await client[self.database].list_collection_names()
+            exists = self.collection_name in collection_names
+            log_debug(f"Collection '{self.collection_name}' existence (async): {exists}")
+            return exists
+        except Exception as e:
+            logger.error(f"Error checking collection existence asynchronously: {e}")
+            return False
+
+    async def async_name_exists(self, name: str) -> bool:
+        """Check if a document with a given name exists asynchronously."""
+        try:
+            collection = await self._get_async_collection()
+            exists = await collection.find_one({"name": name}) is not None
+            log_debug(f"Document with name '{name}' {'exists' if exists else 'does not exist'} (async)")
+            return exists
+        except Exception as e:
+            logger.error(f"Error checking document name existence asynchronously: {e}")
+            return False
+
+    def _get_cosmos_similarity_metric(self) -> str:
+        """Convert MongoDB distance metric to Cosmos DB format."""
+        # Cosmos DB supports: COS (cosine), L2 (Euclidean), IP (inner product)
+        metric_mapping = {"cosine": "COS", "euclidean": "L2", "dotProduct": "IP"}
+        return metric_mapping.get(self.distance_metric, "COS")
+
+    def _convert_objectids_to_strings(self, obj: Any) -> Any:
+        """
+        Recursively convert MongoDB ObjectIds to strings in any data structure.
+
+        Args:
+            obj: Any object that might contain ObjectIds
+
+        Returns:
+            The same object with ObjectIds converted to strings
+        """
+        if isinstance(obj, ObjectId):
+            return str(obj)
+        elif isinstance(obj, dict):
+            return {key: self._convert_objectids_to_strings(value) for key, value in obj.items()}
+        elif isinstance(obj, list):
+            return [self._convert_objectids_to_strings(item) for item in obj]
+        elif isinstance(obj, tuple):
+            return tuple(self._convert_objectids_to_strings(item) for item in obj)
+        else:
+            return obj
+
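The recursive traversal above exists because Atlas results can carry bson.ObjectId values nested arbitrarily deep inside meta_data, while Document fields expect plain, JSON-serializable types. A standalone copy of the same traversal, runnable for illustration:

```python
from bson import ObjectId  # ships with pymongo


def convert(obj):
    """Standalone copy of the traversal in _convert_objectids_to_strings, for illustration."""
    if isinstance(obj, ObjectId):
        return str(obj)
    if isinstance(obj, dict):
        return {key: convert(value) for key, value in obj.items()}
    if isinstance(obj, (list, tuple)):
        return type(obj)(convert(item) for item in obj)
    return obj


nested = {"_id": ObjectId(), "meta_data": {"refs": (ObjectId(), "plain-string")}}
print(convert(nested))  # every ObjectId is now its 24-character hex string
```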
+    def delete_by_id(self, id: str) -> bool:
+        """Delete document by ID."""
+        try:
+            collection = self._get_collection()
+            result = collection.delete_one({"_id": id})
+
+            if result.deleted_count > 0:
+                log_info(
+                    f"Deleted {result.deleted_count} document(s) with ID '{id}' from collection '{self.collection_name}'."
+                )
+                return True
+            else:
+                log_info(f"No documents found with ID '{id}' to delete.")
+                return True
+        except Exception as e:
+            logger.error(f"Error deleting document with ID '{id}': {e}")
+            return False
+
+    def delete_by_name(self, name: str) -> bool:
+        """Delete documents by name."""
+        try:
+            collection = self._get_collection()
+            result = collection.delete_many({"name": name})
+
+            log_info(
+                f"Deleted {result.deleted_count} document(s) with name '{name}' from collection '{self.collection_name}'."
+            )
+            return True
+        except Exception as e:
+            logger.error(f"Error deleting documents with name '{name}': {e}")
+            return False
+
+    def delete_by_metadata(self, metadata: Dict[str, Any]) -> bool:
+        """Delete documents by metadata."""
+        try:
+            collection = self._get_collection()
+
+            # Build MongoDB query for metadata matching
+            mongo_filters = {}
+            for key, value in metadata.items():
+                # Use dot notation for nested metadata fields
+                mongo_filters[f"meta_data.{key}"] = value
+
+            result = collection.delete_many(mongo_filters)
+
+            log_info(
+                f"Deleted {result.deleted_count} document(s) with metadata '{metadata}' from collection '{self.collection_name}'."
+            )
+            return True
+        except Exception as e:
+            logger.error(f"Error deleting documents with metadata '{metadata}': {e}")
+            return False
+
+    def _delete_by_content_hash(self, content_hash: str) -> bool:
+        """Delete documents by content hash.
+
+        Args:
+            content_hash (str): The content hash to delete.
+
+        Returns:
+            bool: True if documents were deleted successfully, False otherwise.
+        """
+        try:
+            collection = self._get_collection()
+            result = collection.delete_many({"content_hash": content_hash})
+            log_info(f"Deleted {result.deleted_count} documents with content_hash '{content_hash}'")
+            return True
+        except Exception as e:
+            logger.error(f"Error deleting documents by content_hash '{content_hash}': {e}")
+            return False
+
+    def delete_by_content_id(self, content_id: str) -> bool:
+        """Delete documents by content ID."""
+        try:
+            collection = self._get_collection()
+            result = collection.delete_many({"content_id": content_id})
+
+            log_info(
+                f"Deleted {result.deleted_count} document(s) with content_id '{content_id}' from collection '{self.collection_name}'."
+            )
+            return True
+        except Exception as e:
+            logger.error(f"Error deleting documents with content_id '{content_id}': {e}")
+            return False
+
+    def update_metadata(self, content_id: str, metadata: Dict[str, Any]) -> None:
+        """
+        Update the metadata for documents with the given content_id.
+
+        Args:
+            content_id (str): The content ID to update
+            metadata (Dict[str, Any]): The metadata to update
+        """
+        try:
+            collection = self._client[self.database][self.collection_name]  # type: ignore
+
+            # Create query filter for content_id
+            filter_query = {"content_id": content_id}
+
+            update_operations = {}
+            for key, value in metadata.items():
+                update_operations[f"meta_data.{key}"] = value
+                update_operations[f"filters.{key}"] = value
+
+            # Update documents
+            result = collection.update_many(filter_query, {"$set": update_operations})
+
+            if result.matched_count == 0:
+                logger.debug(f"No documents found with content_id: {content_id}")
+            else:
+                logger.debug(f"Updated metadata for {result.matched_count} documents with content_id: {content_id}")
+
+        except Exception as e:
+            logger.error(f"Error updating metadata for content_id '{content_id}': {e}")
+            raise
+
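update_metadata leans on MongoDB dot notation: setting "meta_data.key" rewrites a single nested field while leaving the rest of the subdocument intact, which a whole-document $set would not. A minimal pymongo illustration, where `coll` is a hypothetical Collection:

```python
# Minimal pymongo illustration; `coll` is a hypothetical Collection holding
# {"content_id": "c1", "meta_data": {"lang": "en", "tag": "old"}}.

# Dot notation: only meta_data.tag changes; meta_data.lang is preserved.
coll.update_many({"content_id": "c1"}, {"$set": {"meta_data.tag": "new"}})

# Without dot notation the whole subdocument is replaced and meta_data.lang is lost.
coll.update_many({"content_id": "c1"}, {"$set": {"meta_data": {"tag": "new"}}})
```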
+    def get_supported_search_types(self) -> List[str]:
+        """Get the supported search types for this vector database."""
+        return [SearchType.vector, SearchType.hybrid]