agno 0.1.2__py3-none-any.whl → 2.3.13__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- agno/__init__.py +8 -0
- agno/agent/__init__.py +44 -5
- agno/agent/agent.py +10531 -2975
- agno/api/agent.py +14 -53
- agno/api/api.py +7 -46
- agno/api/evals.py +22 -0
- agno/api/os.py +17 -0
- agno/api/routes.py +6 -25
- agno/api/schemas/__init__.py +9 -0
- agno/api/schemas/agent.py +6 -9
- agno/api/schemas/evals.py +16 -0
- agno/api/schemas/os.py +14 -0
- agno/api/schemas/team.py +10 -10
- agno/api/schemas/utils.py +21 -0
- agno/api/schemas/workflows.py +16 -0
- agno/api/settings.py +53 -0
- agno/api/team.py +22 -26
- agno/api/workflow.py +28 -0
- agno/cloud/aws/base.py +214 -0
- agno/cloud/aws/s3/__init__.py +2 -0
- agno/cloud/aws/s3/api_client.py +43 -0
- agno/cloud/aws/s3/bucket.py +195 -0
- agno/cloud/aws/s3/object.py +57 -0
- agno/compression/__init__.py +3 -0
- agno/compression/manager.py +247 -0
- agno/culture/__init__.py +3 -0
- agno/culture/manager.py +956 -0
- agno/db/__init__.py +24 -0
- agno/db/async_postgres/__init__.py +3 -0
- agno/db/base.py +946 -0
- agno/db/dynamo/__init__.py +3 -0
- agno/db/dynamo/dynamo.py +2781 -0
- agno/db/dynamo/schemas.py +442 -0
- agno/db/dynamo/utils.py +743 -0
- agno/db/firestore/__init__.py +3 -0
- agno/db/firestore/firestore.py +2379 -0
- agno/db/firestore/schemas.py +181 -0
- agno/db/firestore/utils.py +376 -0
- agno/db/gcs_json/__init__.py +3 -0
- agno/db/gcs_json/gcs_json_db.py +1791 -0
- agno/db/gcs_json/utils.py +228 -0
- agno/db/in_memory/__init__.py +3 -0
- agno/db/in_memory/in_memory_db.py +1312 -0
- agno/db/in_memory/utils.py +230 -0
- agno/db/json/__init__.py +3 -0
- agno/db/json/json_db.py +1777 -0
- agno/db/json/utils.py +230 -0
- agno/db/migrations/manager.py +199 -0
- agno/db/migrations/v1_to_v2.py +635 -0
- agno/db/migrations/versions/v2_3_0.py +938 -0
- agno/db/mongo/__init__.py +17 -0
- agno/db/mongo/async_mongo.py +2760 -0
- agno/db/mongo/mongo.py +2597 -0
- agno/db/mongo/schemas.py +119 -0
- agno/db/mongo/utils.py +276 -0
- agno/db/mysql/__init__.py +4 -0
- agno/db/mysql/async_mysql.py +2912 -0
- agno/db/mysql/mysql.py +2923 -0
- agno/db/mysql/schemas.py +186 -0
- agno/db/mysql/utils.py +488 -0
- agno/db/postgres/__init__.py +4 -0
- agno/db/postgres/async_postgres.py +2579 -0
- agno/db/postgres/postgres.py +2870 -0
- agno/db/postgres/schemas.py +187 -0
- agno/db/postgres/utils.py +442 -0
- agno/db/redis/__init__.py +3 -0
- agno/db/redis/redis.py +2141 -0
- agno/db/redis/schemas.py +159 -0
- agno/db/redis/utils.py +346 -0
- agno/db/schemas/__init__.py +4 -0
- agno/db/schemas/culture.py +120 -0
- agno/db/schemas/evals.py +34 -0
- agno/db/schemas/knowledge.py +40 -0
- agno/db/schemas/memory.py +61 -0
- agno/db/singlestore/__init__.py +3 -0
- agno/db/singlestore/schemas.py +179 -0
- agno/db/singlestore/singlestore.py +2877 -0
- agno/db/singlestore/utils.py +384 -0
- agno/db/sqlite/__init__.py +4 -0
- agno/db/sqlite/async_sqlite.py +2911 -0
- agno/db/sqlite/schemas.py +181 -0
- agno/db/sqlite/sqlite.py +2908 -0
- agno/db/sqlite/utils.py +429 -0
- agno/db/surrealdb/__init__.py +3 -0
- agno/db/surrealdb/metrics.py +292 -0
- agno/db/surrealdb/models.py +334 -0
- agno/db/surrealdb/queries.py +71 -0
- agno/db/surrealdb/surrealdb.py +1908 -0
- agno/db/surrealdb/utils.py +147 -0
- agno/db/utils.py +118 -0
- agno/eval/__init__.py +24 -0
- agno/eval/accuracy.py +666 -276
- agno/eval/agent_as_judge.py +861 -0
- agno/eval/base.py +29 -0
- agno/eval/performance.py +779 -0
- agno/eval/reliability.py +241 -62
- agno/eval/utils.py +120 -0
- agno/exceptions.py +143 -1
- agno/filters.py +354 -0
- agno/guardrails/__init__.py +6 -0
- agno/guardrails/base.py +19 -0
- agno/guardrails/openai.py +144 -0
- agno/guardrails/pii.py +94 -0
- agno/guardrails/prompt_injection.py +52 -0
- agno/hooks/__init__.py +3 -0
- agno/hooks/decorator.py +164 -0
- agno/integrations/discord/__init__.py +3 -0
- agno/integrations/discord/client.py +203 -0
- agno/knowledge/__init__.py +5 -1
- agno/{document → knowledge}/chunking/agentic.py +22 -14
- agno/{document → knowledge}/chunking/document.py +2 -2
- agno/{document → knowledge}/chunking/fixed.py +7 -6
- agno/knowledge/chunking/markdown.py +151 -0
- agno/{document → knowledge}/chunking/recursive.py +15 -3
- agno/knowledge/chunking/row.py +39 -0
- agno/knowledge/chunking/semantic.py +91 -0
- agno/knowledge/chunking/strategy.py +165 -0
- agno/knowledge/content.py +74 -0
- agno/knowledge/document/__init__.py +5 -0
- agno/{document → knowledge/document}/base.py +12 -2
- agno/knowledge/embedder/__init__.py +5 -0
- agno/knowledge/embedder/aws_bedrock.py +343 -0
- agno/knowledge/embedder/azure_openai.py +210 -0
- agno/{embedder → knowledge/embedder}/base.py +8 -0
- agno/knowledge/embedder/cohere.py +323 -0
- agno/knowledge/embedder/fastembed.py +62 -0
- agno/{embedder → knowledge/embedder}/fireworks.py +1 -1
- agno/knowledge/embedder/google.py +258 -0
- agno/knowledge/embedder/huggingface.py +94 -0
- agno/knowledge/embedder/jina.py +182 -0
- agno/knowledge/embedder/langdb.py +22 -0
- agno/knowledge/embedder/mistral.py +206 -0
- agno/knowledge/embedder/nebius.py +13 -0
- agno/knowledge/embedder/ollama.py +154 -0
- agno/knowledge/embedder/openai.py +195 -0
- agno/knowledge/embedder/sentence_transformer.py +63 -0
- agno/{embedder → knowledge/embedder}/together.py +1 -1
- agno/knowledge/embedder/vllm.py +262 -0
- agno/knowledge/embedder/voyageai.py +165 -0
- agno/knowledge/knowledge.py +3006 -0
- agno/knowledge/reader/__init__.py +7 -0
- agno/knowledge/reader/arxiv_reader.py +81 -0
- agno/knowledge/reader/base.py +95 -0
- agno/knowledge/reader/csv_reader.py +164 -0
- agno/knowledge/reader/docx_reader.py +82 -0
- agno/knowledge/reader/field_labeled_csv_reader.py +290 -0
- agno/knowledge/reader/firecrawl_reader.py +201 -0
- agno/knowledge/reader/json_reader.py +88 -0
- agno/knowledge/reader/markdown_reader.py +137 -0
- agno/knowledge/reader/pdf_reader.py +431 -0
- agno/knowledge/reader/pptx_reader.py +101 -0
- agno/knowledge/reader/reader_factory.py +313 -0
- agno/knowledge/reader/s3_reader.py +89 -0
- agno/knowledge/reader/tavily_reader.py +193 -0
- agno/knowledge/reader/text_reader.py +127 -0
- agno/knowledge/reader/web_search_reader.py +325 -0
- agno/knowledge/reader/website_reader.py +455 -0
- agno/knowledge/reader/wikipedia_reader.py +91 -0
- agno/knowledge/reader/youtube_reader.py +78 -0
- agno/knowledge/remote_content/remote_content.py +88 -0
- agno/knowledge/reranker/__init__.py +3 -0
- agno/{reranker → knowledge/reranker}/base.py +1 -1
- agno/{reranker → knowledge/reranker}/cohere.py +2 -2
- agno/knowledge/reranker/infinity.py +195 -0
- agno/knowledge/reranker/sentence_transformer.py +54 -0
- agno/knowledge/types.py +39 -0
- agno/knowledge/utils.py +234 -0
- agno/media.py +439 -95
- agno/memory/__init__.py +16 -3
- agno/memory/manager.py +1474 -123
- agno/memory/strategies/__init__.py +15 -0
- agno/memory/strategies/base.py +66 -0
- agno/memory/strategies/summarize.py +196 -0
- agno/memory/strategies/types.py +37 -0
- agno/models/aimlapi/__init__.py +5 -0
- agno/models/aimlapi/aimlapi.py +62 -0
- agno/models/anthropic/__init__.py +4 -0
- agno/models/anthropic/claude.py +960 -496
- agno/models/aws/__init__.py +15 -0
- agno/models/aws/bedrock.py +686 -451
- agno/models/aws/claude.py +190 -183
- agno/models/azure/__init__.py +18 -1
- agno/models/azure/ai_foundry.py +489 -0
- agno/models/azure/openai_chat.py +89 -40
- agno/models/base.py +2477 -550
- agno/models/cerebras/__init__.py +12 -0
- agno/models/cerebras/cerebras.py +565 -0
- agno/models/cerebras/cerebras_openai.py +131 -0
- agno/models/cohere/__init__.py +4 -0
- agno/models/cohere/chat.py +306 -492
- agno/models/cometapi/__init__.py +5 -0
- agno/models/cometapi/cometapi.py +74 -0
- agno/models/dashscope/__init__.py +5 -0
- agno/models/dashscope/dashscope.py +90 -0
- agno/models/deepinfra/__init__.py +5 -0
- agno/models/deepinfra/deepinfra.py +45 -0
- agno/models/deepseek/__init__.py +4 -0
- agno/models/deepseek/deepseek.py +110 -9
- agno/models/fireworks/__init__.py +4 -0
- agno/models/fireworks/fireworks.py +19 -22
- agno/models/google/__init__.py +3 -7
- agno/models/google/gemini.py +1717 -662
- agno/models/google/utils.py +22 -0
- agno/models/groq/__init__.py +4 -0
- agno/models/groq/groq.py +391 -666
- agno/models/huggingface/__init__.py +4 -0
- agno/models/huggingface/huggingface.py +266 -538
- agno/models/ibm/__init__.py +5 -0
- agno/models/ibm/watsonx.py +432 -0
- agno/models/internlm/__init__.py +3 -0
- agno/models/internlm/internlm.py +20 -3
- agno/models/langdb/__init__.py +1 -0
- agno/models/langdb/langdb.py +60 -0
- agno/models/litellm/__init__.py +14 -0
- agno/models/litellm/chat.py +503 -0
- agno/models/litellm/litellm_openai.py +42 -0
- agno/models/llama_cpp/__init__.py +5 -0
- agno/models/llama_cpp/llama_cpp.py +22 -0
- agno/models/lmstudio/__init__.py +5 -0
- agno/models/lmstudio/lmstudio.py +25 -0
- agno/models/message.py +361 -39
- agno/models/meta/__init__.py +12 -0
- agno/models/meta/llama.py +502 -0
- agno/models/meta/llama_openai.py +79 -0
- agno/models/metrics.py +120 -0
- agno/models/mistral/__init__.py +4 -0
- agno/models/mistral/mistral.py +293 -393
- agno/models/nebius/__init__.py +3 -0
- agno/models/nebius/nebius.py +53 -0
- agno/models/nexus/__init__.py +3 -0
- agno/models/nexus/nexus.py +22 -0
- agno/models/nvidia/__init__.py +4 -0
- agno/models/nvidia/nvidia.py +22 -3
- agno/models/ollama/__init__.py +4 -2
- agno/models/ollama/chat.py +257 -492
- agno/models/openai/__init__.py +7 -0
- agno/models/openai/chat.py +725 -770
- agno/models/openai/like.py +16 -2
- agno/models/openai/responses.py +1121 -0
- agno/models/openrouter/__init__.py +4 -0
- agno/models/openrouter/openrouter.py +62 -5
- agno/models/perplexity/__init__.py +5 -0
- agno/models/perplexity/perplexity.py +203 -0
- agno/models/portkey/__init__.py +3 -0
- agno/models/portkey/portkey.py +82 -0
- agno/models/requesty/__init__.py +5 -0
- agno/models/requesty/requesty.py +69 -0
- agno/models/response.py +177 -7
- agno/models/sambanova/__init__.py +4 -0
- agno/models/sambanova/sambanova.py +23 -4
- agno/models/siliconflow/__init__.py +5 -0
- agno/models/siliconflow/siliconflow.py +42 -0
- agno/models/together/__init__.py +4 -0
- agno/models/together/together.py +21 -164
- agno/models/utils.py +266 -0
- agno/models/vercel/__init__.py +3 -0
- agno/models/vercel/v0.py +43 -0
- agno/models/vertexai/__init__.py +0 -1
- agno/models/vertexai/claude.py +190 -0
- agno/models/vllm/__init__.py +3 -0
- agno/models/vllm/vllm.py +83 -0
- agno/models/xai/__init__.py +2 -0
- agno/models/xai/xai.py +111 -7
- agno/os/__init__.py +3 -0
- agno/os/app.py +1027 -0
- agno/os/auth.py +244 -0
- agno/os/config.py +126 -0
- agno/os/interfaces/__init__.py +1 -0
- agno/os/interfaces/a2a/__init__.py +3 -0
- agno/os/interfaces/a2a/a2a.py +42 -0
- agno/os/interfaces/a2a/router.py +249 -0
- agno/os/interfaces/a2a/utils.py +924 -0
- agno/os/interfaces/agui/__init__.py +3 -0
- agno/os/interfaces/agui/agui.py +47 -0
- agno/os/interfaces/agui/router.py +147 -0
- agno/os/interfaces/agui/utils.py +574 -0
- agno/os/interfaces/base.py +25 -0
- agno/os/interfaces/slack/__init__.py +3 -0
- agno/os/interfaces/slack/router.py +148 -0
- agno/os/interfaces/slack/security.py +30 -0
- agno/os/interfaces/slack/slack.py +47 -0
- agno/os/interfaces/whatsapp/__init__.py +3 -0
- agno/os/interfaces/whatsapp/router.py +210 -0
- agno/os/interfaces/whatsapp/security.py +55 -0
- agno/os/interfaces/whatsapp/whatsapp.py +36 -0
- agno/os/mcp.py +293 -0
- agno/os/middleware/__init__.py +9 -0
- agno/os/middleware/jwt.py +797 -0
- agno/os/router.py +258 -0
- agno/os/routers/__init__.py +3 -0
- agno/os/routers/agents/__init__.py +3 -0
- agno/os/routers/agents/router.py +599 -0
- agno/os/routers/agents/schema.py +261 -0
- agno/os/routers/evals/__init__.py +3 -0
- agno/os/routers/evals/evals.py +450 -0
- agno/os/routers/evals/schemas.py +174 -0
- agno/os/routers/evals/utils.py +231 -0
- agno/os/routers/health.py +31 -0
- agno/os/routers/home.py +52 -0
- agno/os/routers/knowledge/__init__.py +3 -0
- agno/os/routers/knowledge/knowledge.py +1008 -0
- agno/os/routers/knowledge/schemas.py +178 -0
- agno/os/routers/memory/__init__.py +3 -0
- agno/os/routers/memory/memory.py +661 -0
- agno/os/routers/memory/schemas.py +88 -0
- agno/os/routers/metrics/__init__.py +3 -0
- agno/os/routers/metrics/metrics.py +190 -0
- agno/os/routers/metrics/schemas.py +47 -0
- agno/os/routers/session/__init__.py +3 -0
- agno/os/routers/session/session.py +997 -0
- agno/os/routers/teams/__init__.py +3 -0
- agno/os/routers/teams/router.py +512 -0
- agno/os/routers/teams/schema.py +257 -0
- agno/os/routers/traces/__init__.py +3 -0
- agno/os/routers/traces/schemas.py +414 -0
- agno/os/routers/traces/traces.py +499 -0
- agno/os/routers/workflows/__init__.py +3 -0
- agno/os/routers/workflows/router.py +624 -0
- agno/os/routers/workflows/schema.py +75 -0
- agno/os/schema.py +534 -0
- agno/os/scopes.py +469 -0
- agno/{playground → os}/settings.py +7 -15
- agno/os/utils.py +973 -0
- agno/reasoning/anthropic.py +80 -0
- agno/reasoning/azure_ai_foundry.py +67 -0
- agno/reasoning/deepseek.py +63 -0
- agno/reasoning/default.py +97 -0
- agno/reasoning/gemini.py +73 -0
- agno/reasoning/groq.py +71 -0
- agno/reasoning/helpers.py +24 -1
- agno/reasoning/ollama.py +67 -0
- agno/reasoning/openai.py +86 -0
- agno/reasoning/step.py +2 -1
- agno/reasoning/vertexai.py +76 -0
- agno/run/__init__.py +6 -0
- agno/run/agent.py +822 -0
- agno/run/base.py +247 -0
- agno/run/cancel.py +81 -0
- agno/run/requirement.py +181 -0
- agno/run/team.py +767 -0
- agno/run/workflow.py +708 -0
- agno/session/__init__.py +10 -0
- agno/session/agent.py +260 -0
- agno/session/summary.py +265 -0
- agno/session/team.py +342 -0
- agno/session/workflow.py +501 -0
- agno/table.py +10 -0
- agno/team/__init__.py +37 -0
- agno/team/team.py +9536 -0
- agno/tools/__init__.py +7 -0
- agno/tools/agentql.py +120 -0
- agno/tools/airflow.py +22 -12
- agno/tools/api.py +122 -0
- agno/tools/apify.py +276 -83
- agno/tools/{arxiv_toolkit.py → arxiv.py} +20 -12
- agno/tools/aws_lambda.py +28 -7
- agno/tools/aws_ses.py +66 -0
- agno/tools/baidusearch.py +11 -4
- agno/tools/bitbucket.py +292 -0
- agno/tools/brandfetch.py +213 -0
- agno/tools/bravesearch.py +106 -0
- agno/tools/brightdata.py +367 -0
- agno/tools/browserbase.py +209 -0
- agno/tools/calcom.py +32 -23
- agno/tools/calculator.py +24 -37
- agno/tools/cartesia.py +187 -0
- agno/tools/{clickup_tool.py → clickup.py} +17 -28
- agno/tools/confluence.py +91 -26
- agno/tools/crawl4ai.py +139 -43
- agno/tools/csv_toolkit.py +28 -22
- agno/tools/dalle.py +36 -22
- agno/tools/daytona.py +475 -0
- agno/tools/decorator.py +169 -14
- agno/tools/desi_vocal.py +23 -11
- agno/tools/discord.py +32 -29
- agno/tools/docker.py +716 -0
- agno/tools/duckdb.py +76 -81
- agno/tools/duckduckgo.py +43 -40
- agno/tools/e2b.py +703 -0
- agno/tools/eleven_labs.py +65 -54
- agno/tools/email.py +13 -5
- agno/tools/evm.py +129 -0
- agno/tools/exa.py +324 -42
- agno/tools/fal.py +39 -35
- agno/tools/file.py +196 -30
- agno/tools/file_generation.py +356 -0
- agno/tools/financial_datasets.py +288 -0
- agno/tools/firecrawl.py +108 -33
- agno/tools/function.py +960 -122
- agno/tools/giphy.py +34 -12
- agno/tools/github.py +1294 -97
- agno/tools/gmail.py +922 -0
- agno/tools/google_bigquery.py +117 -0
- agno/tools/google_drive.py +271 -0
- agno/tools/google_maps.py +253 -0
- agno/tools/googlecalendar.py +607 -107
- agno/tools/googlesheets.py +377 -0
- agno/tools/hackernews.py +20 -12
- agno/tools/jina.py +24 -14
- agno/tools/jira.py +48 -19
- agno/tools/knowledge.py +218 -0
- agno/tools/linear.py +82 -43
- agno/tools/linkup.py +58 -0
- agno/tools/local_file_system.py +15 -7
- agno/tools/lumalab.py +41 -26
- agno/tools/mcp/__init__.py +10 -0
- agno/tools/mcp/mcp.py +331 -0
- agno/tools/mcp/multi_mcp.py +347 -0
- agno/tools/mcp/params.py +24 -0
- agno/tools/mcp_toolbox.py +284 -0
- agno/tools/mem0.py +193 -0
- agno/tools/memory.py +419 -0
- agno/tools/mlx_transcribe.py +11 -9
- agno/tools/models/azure_openai.py +190 -0
- agno/tools/models/gemini.py +203 -0
- agno/tools/models/groq.py +158 -0
- agno/tools/models/morph.py +186 -0
- agno/tools/models/nebius.py +124 -0
- agno/tools/models_labs.py +163 -82
- agno/tools/moviepy_video.py +18 -13
- agno/tools/nano_banana.py +151 -0
- agno/tools/neo4j.py +134 -0
- agno/tools/newspaper.py +15 -4
- agno/tools/newspaper4k.py +19 -6
- agno/tools/notion.py +204 -0
- agno/tools/openai.py +181 -17
- agno/tools/openbb.py +27 -20
- agno/tools/opencv.py +321 -0
- agno/tools/openweather.py +233 -0
- agno/tools/oxylabs.py +385 -0
- agno/tools/pandas.py +25 -15
- agno/tools/parallel.py +314 -0
- agno/tools/postgres.py +238 -185
- agno/tools/pubmed.py +125 -13
- agno/tools/python.py +48 -35
- agno/tools/reasoning.py +283 -0
- agno/tools/reddit.py +207 -29
- agno/tools/redshift.py +406 -0
- agno/tools/replicate.py +69 -26
- agno/tools/resend.py +11 -6
- agno/tools/scrapegraph.py +179 -19
- agno/tools/searxng.py +23 -31
- agno/tools/serpapi.py +15 -10
- agno/tools/serper.py +255 -0
- agno/tools/shell.py +23 -12
- agno/tools/shopify.py +1519 -0
- agno/tools/slack.py +56 -14
- agno/tools/sleep.py +8 -6
- agno/tools/spider.py +35 -11
- agno/tools/spotify.py +919 -0
- agno/tools/sql.py +34 -19
- agno/tools/tavily.py +158 -8
- agno/tools/telegram.py +18 -8
- agno/tools/todoist.py +218 -0
- agno/tools/toolkit.py +134 -9
- agno/tools/trafilatura.py +388 -0
- agno/tools/trello.py +25 -28
- agno/tools/twilio.py +18 -9
- agno/tools/user_control_flow.py +78 -0
- agno/tools/valyu.py +228 -0
- agno/tools/visualization.py +467 -0
- agno/tools/webbrowser.py +28 -0
- agno/tools/webex.py +76 -0
- agno/tools/website.py +23 -19
- agno/tools/webtools.py +45 -0
- agno/tools/whatsapp.py +286 -0
- agno/tools/wikipedia.py +28 -19
- agno/tools/workflow.py +285 -0
- agno/tools/{twitter.py → x.py} +142 -46
- agno/tools/yfinance.py +41 -39
- agno/tools/youtube.py +34 -17
- agno/tools/zendesk.py +15 -5
- agno/tools/zep.py +454 -0
- agno/tools/zoom.py +86 -37
- agno/tracing/__init__.py +12 -0
- agno/tracing/exporter.py +157 -0
- agno/tracing/schemas.py +276 -0
- agno/tracing/setup.py +111 -0
- agno/utils/agent.py +938 -0
- agno/utils/audio.py +37 -1
- agno/utils/certs.py +27 -0
- agno/utils/code_execution.py +11 -0
- agno/utils/common.py +103 -20
- agno/utils/cryptography.py +22 -0
- agno/utils/dttm.py +33 -0
- agno/utils/events.py +700 -0
- agno/utils/functions.py +107 -37
- agno/utils/gemini.py +426 -0
- agno/utils/hooks.py +171 -0
- agno/utils/http.py +185 -0
- agno/utils/json_schema.py +159 -37
- agno/utils/knowledge.py +36 -0
- agno/utils/location.py +19 -0
- agno/utils/log.py +221 -8
- agno/utils/mcp.py +214 -0
- agno/utils/media.py +335 -14
- agno/utils/merge_dict.py +22 -1
- agno/utils/message.py +77 -2
- agno/utils/models/ai_foundry.py +50 -0
- agno/utils/models/claude.py +373 -0
- agno/utils/models/cohere.py +94 -0
- agno/utils/models/llama.py +85 -0
- agno/utils/models/mistral.py +100 -0
- agno/utils/models/openai_responses.py +140 -0
- agno/utils/models/schema_utils.py +153 -0
- agno/utils/models/watsonx.py +41 -0
- agno/utils/openai.py +257 -0
- agno/utils/pickle.py +1 -1
- agno/utils/pprint.py +124 -8
- agno/utils/print_response/agent.py +930 -0
- agno/utils/print_response/team.py +1914 -0
- agno/utils/print_response/workflow.py +1668 -0
- agno/utils/prompts.py +111 -0
- agno/utils/reasoning.py +108 -0
- agno/utils/response.py +163 -0
- agno/utils/serialize.py +32 -0
- agno/utils/shell.py +4 -4
- agno/utils/streamlit.py +487 -0
- agno/utils/string.py +204 -51
- agno/utils/team.py +139 -0
- agno/utils/timer.py +9 -2
- agno/utils/tokens.py +657 -0
- agno/utils/tools.py +19 -1
- agno/utils/whatsapp.py +305 -0
- agno/utils/yaml_io.py +3 -3
- agno/vectordb/__init__.py +2 -0
- agno/vectordb/base.py +87 -9
- agno/vectordb/cassandra/__init__.py +5 -1
- agno/vectordb/cassandra/cassandra.py +383 -27
- agno/vectordb/chroma/__init__.py +4 -0
- agno/vectordb/chroma/chromadb.py +748 -83
- agno/vectordb/clickhouse/__init__.py +7 -1
- agno/vectordb/clickhouse/clickhousedb.py +554 -53
- agno/vectordb/couchbase/__init__.py +3 -0
- agno/vectordb/couchbase/couchbase.py +1446 -0
- agno/vectordb/lancedb/__init__.py +5 -0
- agno/vectordb/lancedb/lance_db.py +730 -98
- agno/vectordb/langchaindb/__init__.py +5 -0
- agno/vectordb/langchaindb/langchaindb.py +163 -0
- agno/vectordb/lightrag/__init__.py +5 -0
- agno/vectordb/lightrag/lightrag.py +388 -0
- agno/vectordb/llamaindex/__init__.py +3 -0
- agno/vectordb/llamaindex/llamaindexdb.py +166 -0
- agno/vectordb/milvus/__init__.py +3 -0
- agno/vectordb/milvus/milvus.py +966 -78
- agno/vectordb/mongodb/__init__.py +9 -1
- agno/vectordb/mongodb/mongodb.py +1175 -172
- agno/vectordb/pgvector/__init__.py +8 -0
- agno/vectordb/pgvector/pgvector.py +599 -115
- agno/vectordb/pineconedb/__init__.py +5 -1
- agno/vectordb/pineconedb/pineconedb.py +406 -43
- agno/vectordb/qdrant/__init__.py +4 -0
- agno/vectordb/qdrant/qdrant.py +914 -61
- agno/vectordb/redis/__init__.py +9 -0
- agno/vectordb/redis/redisdb.py +682 -0
- agno/vectordb/singlestore/__init__.py +8 -1
- agno/vectordb/singlestore/singlestore.py +771 -0
- agno/vectordb/surrealdb/__init__.py +3 -0
- agno/vectordb/surrealdb/surrealdb.py +663 -0
- agno/vectordb/upstashdb/__init__.py +5 -0
- agno/vectordb/upstashdb/upstashdb.py +718 -0
- agno/vectordb/weaviate/__init__.py +8 -0
- agno/vectordb/weaviate/index.py +15 -0
- agno/vectordb/weaviate/weaviate.py +1009 -0
- agno/workflow/__init__.py +23 -1
- agno/workflow/agent.py +299 -0
- agno/workflow/condition.py +759 -0
- agno/workflow/loop.py +756 -0
- agno/workflow/parallel.py +853 -0
- agno/workflow/router.py +723 -0
- agno/workflow/step.py +1564 -0
- agno/workflow/steps.py +613 -0
- agno/workflow/types.py +556 -0
- agno/workflow/workflow.py +4327 -514
- agno-2.3.13.dist-info/METADATA +639 -0
- agno-2.3.13.dist-info/RECORD +613 -0
- {agno-0.1.2.dist-info → agno-2.3.13.dist-info}/WHEEL +1 -1
- agno-2.3.13.dist-info/licenses/LICENSE +201 -0
- agno/api/playground.py +0 -91
- agno/api/schemas/playground.py +0 -22
- agno/api/schemas/user.py +0 -22
- agno/api/schemas/workspace.py +0 -46
- agno/api/user.py +0 -160
- agno/api/workspace.py +0 -151
- agno/cli/auth_server.py +0 -118
- agno/cli/config.py +0 -275
- agno/cli/console.py +0 -88
- agno/cli/credentials.py +0 -23
- agno/cli/entrypoint.py +0 -571
- agno/cli/operator.py +0 -355
- agno/cli/settings.py +0 -85
- agno/cli/ws/ws_cli.py +0 -817
- agno/constants.py +0 -13
- agno/document/__init__.py +0 -1
- agno/document/chunking/semantic.py +0 -47
- agno/document/chunking/strategy.py +0 -31
- agno/document/reader/__init__.py +0 -1
- agno/document/reader/arxiv_reader.py +0 -41
- agno/document/reader/base.py +0 -22
- agno/document/reader/csv_reader.py +0 -84
- agno/document/reader/docx_reader.py +0 -46
- agno/document/reader/firecrawl_reader.py +0 -99
- agno/document/reader/json_reader.py +0 -43
- agno/document/reader/pdf_reader.py +0 -219
- agno/document/reader/s3/pdf_reader.py +0 -46
- agno/document/reader/s3/text_reader.py +0 -51
- agno/document/reader/text_reader.py +0 -41
- agno/document/reader/website_reader.py +0 -175
- agno/document/reader/youtube_reader.py +0 -50
- agno/embedder/__init__.py +0 -1
- agno/embedder/azure_openai.py +0 -86
- agno/embedder/cohere.py +0 -72
- agno/embedder/fastembed.py +0 -37
- agno/embedder/google.py +0 -73
- agno/embedder/huggingface.py +0 -54
- agno/embedder/mistral.py +0 -80
- agno/embedder/ollama.py +0 -57
- agno/embedder/openai.py +0 -74
- agno/embedder/sentence_transformer.py +0 -38
- agno/embedder/voyageai.py +0 -64
- agno/eval/perf.py +0 -201
- agno/file/__init__.py +0 -1
- agno/file/file.py +0 -16
- agno/file/local/csv.py +0 -32
- agno/file/local/txt.py +0 -19
- agno/infra/app.py +0 -240
- agno/infra/base.py +0 -144
- agno/infra/context.py +0 -20
- agno/infra/db_app.py +0 -52
- agno/infra/resource.py +0 -205
- agno/infra/resources.py +0 -55
- agno/knowledge/agent.py +0 -230
- agno/knowledge/arxiv.py +0 -22
- agno/knowledge/combined.py +0 -22
- agno/knowledge/csv.py +0 -28
- agno/knowledge/csv_url.py +0 -19
- agno/knowledge/document.py +0 -20
- agno/knowledge/docx.py +0 -30
- agno/knowledge/json.py +0 -28
- agno/knowledge/langchain.py +0 -71
- agno/knowledge/llamaindex.py +0 -66
- agno/knowledge/pdf.py +0 -28
- agno/knowledge/pdf_url.py +0 -26
- agno/knowledge/s3/base.py +0 -60
- agno/knowledge/s3/pdf.py +0 -21
- agno/knowledge/s3/text.py +0 -23
- agno/knowledge/text.py +0 -30
- agno/knowledge/website.py +0 -88
- agno/knowledge/wikipedia.py +0 -31
- agno/knowledge/youtube.py +0 -22
- agno/memory/agent.py +0 -392
- agno/memory/classifier.py +0 -104
- agno/memory/db/__init__.py +0 -1
- agno/memory/db/base.py +0 -42
- agno/memory/db/mongodb.py +0 -189
- agno/memory/db/postgres.py +0 -203
- agno/memory/db/sqlite.py +0 -193
- agno/memory/memory.py +0 -15
- agno/memory/row.py +0 -36
- agno/memory/summarizer.py +0 -192
- agno/memory/summary.py +0 -19
- agno/memory/workflow.py +0 -38
- agno/models/google/gemini_openai.py +0 -26
- agno/models/ollama/hermes.py +0 -221
- agno/models/ollama/tools.py +0 -362
- agno/models/vertexai/gemini.py +0 -595
- agno/playground/__init__.py +0 -3
- agno/playground/async_router.py +0 -421
- agno/playground/deploy.py +0 -249
- agno/playground/operator.py +0 -92
- agno/playground/playground.py +0 -91
- agno/playground/schemas.py +0 -76
- agno/playground/serve.py +0 -55
- agno/playground/sync_router.py +0 -405
- agno/reasoning/agent.py +0 -68
- agno/run/response.py +0 -112
- agno/storage/agent/__init__.py +0 -0
- agno/storage/agent/base.py +0 -38
- agno/storage/agent/dynamodb.py +0 -350
- agno/storage/agent/json.py +0 -92
- agno/storage/agent/mongodb.py +0 -228
- agno/storage/agent/postgres.py +0 -367
- agno/storage/agent/session.py +0 -79
- agno/storage/agent/singlestore.py +0 -303
- agno/storage/agent/sqlite.py +0 -357
- agno/storage/agent/yaml.py +0 -93
- agno/storage/workflow/__init__.py +0 -0
- agno/storage/workflow/base.py +0 -40
- agno/storage/workflow/mongodb.py +0 -233
- agno/storage/workflow/postgres.py +0 -366
- agno/storage/workflow/session.py +0 -60
- agno/storage/workflow/sqlite.py +0 -359
- agno/tools/googlesearch.py +0 -88
- agno/utils/defaults.py +0 -57
- agno/utils/filesystem.py +0 -39
- agno/utils/git.py +0 -52
- agno/utils/json_io.py +0 -30
- agno/utils/load_env.py +0 -19
- agno/utils/py_io.py +0 -19
- agno/utils/pyproject.py +0 -18
- agno/utils/resource_filter.py +0 -31
- agno/vectordb/singlestore/s2vectordb.py +0 -390
- agno/vectordb/singlestore/s2vectordb2.py +0 -355
- agno/workspace/__init__.py +0 -0
- agno/workspace/config.py +0 -325
- agno/workspace/enums.py +0 -6
- agno/workspace/helpers.py +0 -48
- agno/workspace/operator.py +0 -758
- agno/workspace/settings.py +0 -63
- agno-0.1.2.dist-info/LICENSE +0 -375
- agno-0.1.2.dist-info/METADATA +0 -502
- agno-0.1.2.dist-info/RECORD +0 -352
- agno-0.1.2.dist-info/entry_points.txt +0 -3
- /agno/{cli → db/migrations}/__init__.py +0 -0
- /agno/{cli/ws → db/migrations/versions}/__init__.py +0 -0
- /agno/{document/chunking/__init__.py → db/schemas/metrics.py} +0 -0
- /agno/{document/reader/s3 → integrations}/__init__.py +0 -0
- /agno/{file/local → knowledge/chunking}/__init__.py +0 -0
- /agno/{infra → knowledge/remote_content}/__init__.py +0 -0
- /agno/{knowledge/s3 → tools/models}/__init__.py +0 -0
- /agno/{reranker → utils/models}/__init__.py +0 -0
- /agno/{storage → utils/print_response}/__init__.py +0 -0
- {agno-0.1.2.dist-info → agno-2.3.13.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,1446 @@
|
|
|
1
|
+
import asyncio
|
|
2
|
+
import time
|
|
3
|
+
from datetime import timedelta
|
|
4
|
+
from typing import Any, Dict, List, Optional, Union
|
|
5
|
+
|
|
6
|
+
from agno.filters import FilterExpr
|
|
7
|
+
from agno.knowledge.document import Document
|
|
8
|
+
from agno.knowledge.embedder import Embedder
|
|
9
|
+
from agno.utils.log import log_debug, log_info, log_warning, logger
|
|
10
|
+
from agno.vectordb.base import VectorDb
|
|
11
|
+
|
|
12
|
+
try:
|
|
13
|
+
from hashlib import md5
|
|
14
|
+
|
|
15
|
+
except ImportError:
|
|
16
|
+
raise ImportError("`hashlib` not installed. Please install using `pip install hashlib`")
|
|
17
|
+
try:
|
|
18
|
+
from acouchbase.bucket import AsyncBucket
|
|
19
|
+
from acouchbase.cluster import AsyncCluster
|
|
20
|
+
from acouchbase.collection import AsyncCollection
|
|
21
|
+
from acouchbase.management.search import (
|
|
22
|
+
ScopeSearchIndexManager as AsyncScopeSearchIndexManager,
|
|
23
|
+
)
|
|
24
|
+
from acouchbase.management.search import (
|
|
25
|
+
SearchIndex as AsyncSearchIndex,
|
|
26
|
+
)
|
|
27
|
+
from acouchbase.management.search import (
|
|
28
|
+
SearchIndexManager as AsyncSearchIndexManager,
|
|
29
|
+
)
|
|
30
|
+
from acouchbase.scope import AsyncScope
|
|
31
|
+
from couchbase.bucket import Bucket
|
|
32
|
+
from couchbase.cluster import Cluster
|
|
33
|
+
from couchbase.collection import Collection
|
|
34
|
+
from couchbase.exceptions import (
|
|
35
|
+
CollectionAlreadyExistsException,
|
|
36
|
+
CollectionNotFoundException,
|
|
37
|
+
ScopeAlreadyExistsException,
|
|
38
|
+
SearchIndexNotFoundException,
|
|
39
|
+
)
|
|
40
|
+
from couchbase.management.search import ScopeSearchIndexManager, SearchIndex, SearchIndexManager
|
|
41
|
+
from couchbase.n1ql import QueryScanConsistency
|
|
42
|
+
from couchbase.options import ClusterOptions, QueryOptions, SearchOptions
|
|
43
|
+
from couchbase.result import SearchResult
|
|
44
|
+
from couchbase.scope import Scope
|
|
45
|
+
from couchbase.search import SearchRequest
|
|
46
|
+
from couchbase.vector_search import VectorQuery, VectorSearch
|
|
47
|
+
except ImportError:
|
|
48
|
+
raise ImportError("`couchbase` not installed. Please install using `pip install couchbase`")
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
class CouchbaseSearch(VectorDb):
|
|
52
|
+
"""
|
|
53
|
+
Couchbase Vector Database implementation with FTS (Full Text Search) index support.
|
|
54
|
+
"""
|
|
55
|
+
|
|
56
|
+
def __init__(
|
|
57
|
+
self,
|
|
58
|
+
bucket_name: str,
|
|
59
|
+
scope_name: str,
|
|
60
|
+
collection_name: str,
|
|
61
|
+
couchbase_connection_string: str,
|
|
62
|
+
cluster_options: ClusterOptions,
|
|
63
|
+
search_index: Union[str, SearchIndex],
|
|
64
|
+
embedder: Optional[Embedder] = None,
|
|
65
|
+
overwrite: bool = False,
|
|
66
|
+
is_global_level_index: bool = False,
|
|
67
|
+
wait_until_index_ready: float = 0,
|
|
68
|
+
batch_limit: int = 500,
|
|
69
|
+
name: Optional[str] = None,
|
|
70
|
+
description: Optional[str] = None,
|
|
71
|
+
**kwargs,
|
|
72
|
+
):
|
|
73
|
+
"""
|
|
74
|
+
Initialize the CouchbaseSearch with Couchbase connection details.
|
|
75
|
+
|
|
76
|
+
Args:
|
|
77
|
+
bucket_name (str): Name of the Couchbase bucket.
|
|
78
|
+
scope_name (str): Name of the scope within the bucket.
|
|
79
|
+
collection_name (str): Name of the collection within the scope.
|
|
80
|
+
name (Optional[str]): Name of the vector database.
|
|
81
|
+
description (Optional[str]): Description of the vector database.
|
|
82
|
+
couchbase_connection_string (str): Couchbase connection string.
|
|
83
|
+
cluster_options (ClusterOptions): Options for configuring the Couchbase cluster connection.
|
|
84
|
+
search_index (Union[str, SearchIndex], optional): Search index configuration, either as index name or SearchIndex definition.
|
|
85
|
+
embedder (Embedder): Embedder instance for generating embeddings. Defaults to OpenAIEmbedder.
|
|
86
|
+
overwrite (bool): Whether to overwrite existing collection. Defaults to False.
|
|
87
|
+
wait_until_index_ready (float, optional): Time in seconds to wait until the index is ready. Defaults to 0.
|
|
88
|
+
batch_limit (int, optional): Maximum number of documents to process in a single batch (applies to both sync and async operations). Defaults to 500.
|
|
89
|
+
**kwargs: Additional arguments for Couchbase connection.
|
|
90
|
+
"""
|
|
91
|
+
if not bucket_name:
|
|
92
|
+
raise ValueError("Bucket name must not be empty.")
|
|
93
|
+
|
|
94
|
+
self.bucket_name = bucket_name
|
|
95
|
+
self.scope_name = scope_name
|
|
96
|
+
self.collection_name = collection_name
|
|
97
|
+
self.connection_string = couchbase_connection_string
|
|
98
|
+
self.cluster_options = cluster_options
|
|
99
|
+
if embedder is None:
|
|
100
|
+
from agno.knowledge.embedder.openai import OpenAIEmbedder
|
|
101
|
+
|
|
102
|
+
embedder = OpenAIEmbedder()
|
|
103
|
+
log_info("Embedder not provided, using OpenAIEmbedder as default.")
|
|
104
|
+
self.embedder = embedder
|
|
105
|
+
self.overwrite = overwrite
|
|
106
|
+
self.is_global_level_index = is_global_level_index
|
|
107
|
+
self.wait_until_index_ready = wait_until_index_ready
|
|
108
|
+
# Initialize base class with name and description
|
|
109
|
+
super().__init__(name=name, description=description)
|
|
110
|
+
|
|
111
|
+
self.kwargs = kwargs
|
|
112
|
+
self.batch_limit = batch_limit
|
|
113
|
+
if isinstance(search_index, str):
|
|
114
|
+
self.search_index_name = search_index
|
|
115
|
+
self.search_index_definition = None
|
|
116
|
+
else:
|
|
117
|
+
self.search_index_name = search_index.name
|
|
118
|
+
self.search_index_definition = search_index
|
|
119
|
+
|
|
120
|
+
self._cluster: Optional[Cluster] = None
|
|
121
|
+
self._bucket: Optional[Bucket] = None
|
|
122
|
+
self._scope: Optional[Scope] = None
|
|
123
|
+
self._collection: Optional[Collection] = None
|
|
124
|
+
|
|
125
|
+
self._async_cluster: Optional[AsyncCluster] = None
|
|
126
|
+
self._async_bucket: Optional[AsyncBucket] = None
|
|
127
|
+
self._async_scope: Optional[AsyncScope] = None
|
|
128
|
+
self._async_collection: Optional[AsyncCollection] = None
|
|
129
|
+
|
|
130
|
+
@property
|
|
131
|
+
def cluster(self) -> Cluster:
|
|
132
|
+
"""Create or retrieve the Couchbase cluster connection."""
|
|
133
|
+
if self._cluster is None:
|
|
134
|
+
try:
|
|
135
|
+
logger.debug("Creating Couchbase Cluster connection")
|
|
136
|
+
cluster = Cluster(self.connection_string, self.cluster_options)
|
|
137
|
+
# Verify connection
|
|
138
|
+
cluster.wait_until_ready(timeout=timedelta(seconds=60))
|
|
139
|
+
logger.info("Connected to Couchbase successfully.")
|
|
140
|
+
self._cluster = cluster
|
|
141
|
+
except Exception as e:
|
|
142
|
+
logger.error(f"Failed to connect to Couchbase: {e}")
|
|
143
|
+
raise ConnectionError(f"Failed to connect to Couchbase: {e}")
|
|
144
|
+
return self._cluster
|
|
145
|
+
|
|
146
|
+
@property
|
|
147
|
+
def bucket(self) -> Bucket:
|
|
148
|
+
"""Get the Couchbase bucket."""
|
|
149
|
+
if self._bucket is None:
|
|
150
|
+
self._bucket = self.cluster.bucket(self.bucket_name)
|
|
151
|
+
return self._bucket
|
|
152
|
+
|
|
153
|
+
@property
|
|
154
|
+
def scope(self) -> Scope:
|
|
155
|
+
"""Get the Couchbase scope."""
|
|
156
|
+
if self._scope is None:
|
|
157
|
+
self._scope = self.bucket.scope(self.scope_name)
|
|
158
|
+
return self._scope
|
|
159
|
+
|
|
160
|
+
@property
|
|
161
|
+
def collection(self) -> Collection:
|
|
162
|
+
"""Get the Couchbase collection."""
|
|
163
|
+
if self._collection is None:
|
|
164
|
+
self._collection = self.scope.collection(self.collection_name)
|
|
165
|
+
return self._collection
|
|
166
|
+
|
|
167
|
+
def _create_collection_and_scope(self):
|
|
168
|
+
"""
|
|
169
|
+
Get or create the scope and collection within the bucket.
|
|
170
|
+
|
|
171
|
+
Uses EAFP principle: attempts to create scope/collection and handles
|
|
172
|
+
specific exceptions if they already exist or (for collections with overwrite=True)
|
|
173
|
+
if they are not found for dropping.
|
|
174
|
+
|
|
175
|
+
Raises:
|
|
176
|
+
Exception: If scope or collection creation/manipulation fails unexpectedly.
|
|
177
|
+
"""
|
|
178
|
+
# 1. Ensure Scope Exists
|
|
179
|
+
try:
|
|
180
|
+
self.bucket.collections().create_scope(scope_name=self.scope_name)
|
|
181
|
+
logger.info(f"Created new scope '{self.scope_name}'")
|
|
182
|
+
except ScopeAlreadyExistsException:
|
|
183
|
+
logger.info(f"Scope '{self.scope_name}' already exists. Using existing scope.")
|
|
184
|
+
except Exception as e:
|
|
185
|
+
logger.error(f"Failed to create or ensure scope '{self.scope_name}' exists: {e}")
|
|
186
|
+
raise
|
|
187
|
+
|
|
188
|
+
collection_manager = self.bucket.collections()
|
|
189
|
+
|
|
190
|
+
# 2. Handle Collection
|
|
191
|
+
if self.overwrite:
|
|
192
|
+
# Attempt to drop the collection first since overwrite is True
|
|
193
|
+
try:
|
|
194
|
+
logger.info(
|
|
195
|
+
f"Overwrite is True. Attempting to drop collection '{self.collection_name}' in scope '{self.scope_name}'."
|
|
196
|
+
)
|
|
197
|
+
collection_manager.drop_collection(collection_name=self.collection_name, scope_name=self.scope_name)
|
|
198
|
+
logger.info(f"Successfully dropped collection '{self.collection_name}'.")
|
|
199
|
+
time.sleep(1) # Brief wait after drop, as in original code
|
|
200
|
+
except CollectionNotFoundException:
|
|
201
|
+
logger.info(
|
|
202
|
+
f"Collection '{self.collection_name}' not found in scope '{self.scope_name}'. No need to drop."
|
|
203
|
+
)
|
|
204
|
+
except Exception as e:
|
|
205
|
+
logger.error(f"Error dropping collection '{self.collection_name}' during overwrite: {e}")
|
|
206
|
+
raise
|
|
207
|
+
|
|
208
|
+
# Proceed to create the collection
|
|
209
|
+
try:
|
|
210
|
+
logger.info(f"Creating collection '{self.collection_name}' in scope '{self.scope_name}'.")
|
|
211
|
+
collection_manager.create_collection(scope_name=self.scope_name, collection_name=self.collection_name)
|
|
212
|
+
logger.info(
|
|
213
|
+
f"Successfully created collection '{self.collection_name}' after drop attempt (overwrite=True)."
|
|
214
|
+
)
|
|
215
|
+
except CollectionAlreadyExistsException:
|
|
216
|
+
# This is an unexpected state if overwrite=True and drop was supposed to clear the way.
|
|
217
|
+
logger.error(
|
|
218
|
+
f"Failed to create collection '{self.collection_name}' as it already exists, "
|
|
219
|
+
f"even after drop attempt for overwrite. Overwrite operation may not have completed as intended."
|
|
220
|
+
)
|
|
221
|
+
raise # Re-raise as the overwrite intent failed
|
|
222
|
+
except Exception as e:
|
|
223
|
+
logger.error(
|
|
224
|
+
f"Error creating collection '{self.collection_name}' after drop attempt (overwrite=True): {e}"
|
|
225
|
+
)
|
|
226
|
+
raise
|
|
227
|
+
else: # self.overwrite is False
|
|
228
|
+
try:
|
|
229
|
+
logger.info(
|
|
230
|
+
f"Overwrite is False. Attempting to create collection '{self.collection_name}' in scope '{self.scope_name}'."
|
|
231
|
+
)
|
|
232
|
+
collection_manager.create_collection(scope_name=self.scope_name, collection_name=self.collection_name)
|
|
233
|
+
logger.info(f"Successfully created new collection '{self.collection_name}'.")
|
|
234
|
+
except CollectionAlreadyExistsException:
|
|
235
|
+
logger.info(
|
|
236
|
+
f"Collection '{self.collection_name}' already exists in scope '{self.scope_name}'. Using existing collection."
|
|
237
|
+
)
|
|
238
|
+
except Exception as e:
|
|
239
|
+
logger.error(f"Error creating collection '{self.collection_name}': {e}")
|
|
240
|
+
raise
|
|
241
|
+
|
|
242
|
+
def _search_indexes_mng(self) -> Union[SearchIndexManager, ScopeSearchIndexManager]:
|
|
243
|
+
"""Get the search indexes manager."""
|
|
244
|
+
if self.is_global_level_index:
|
|
245
|
+
return self.cluster.search_indexes()
|
|
246
|
+
else:
|
|
247
|
+
return self.scope.search_indexes()
|
|
248
|
+
|
|
249
|
+
def _create_fts_index(self):
|
|
250
|
+
"""Create a FTS index on the collection if it doesn't exist."""
|
|
251
|
+
try:
|
|
252
|
+
# Check if index exists and handle string index name
|
|
253
|
+
self._search_indexes_mng().get_index(self.search_index_name)
|
|
254
|
+
if not self.overwrite:
|
|
255
|
+
return
|
|
256
|
+
except Exception:
|
|
257
|
+
if self.search_index_definition is None:
|
|
258
|
+
raise ValueError(f"Index '{self.search_index_name}' does not exist")
|
|
259
|
+
|
|
260
|
+
# Create or update index
|
|
261
|
+
try:
|
|
262
|
+
if self.overwrite:
|
|
263
|
+
try:
|
|
264
|
+
logger.info(f"Dropping existing FTS index '{self.search_index_name}'")
|
|
265
|
+
self._search_indexes_mng().drop_index(self.search_index_name)
|
|
266
|
+
except SearchIndexNotFoundException:
|
|
267
|
+
logger.warning(f"Index '{self.search_index_name}' does not exist")
|
|
268
|
+
except Exception as e:
|
|
269
|
+
logger.warning(f"Error dropping index (may not exist): {e}")
|
|
270
|
+
|
|
271
|
+
self._search_indexes_mng().upsert_index(self.search_index_definition)
|
|
272
|
+
logger.info(f"Created FTS index '{self.search_index_name}'")
|
|
273
|
+
|
|
274
|
+
if self.wait_until_index_ready:
|
|
275
|
+
self._wait_for_index_ready()
|
|
276
|
+
|
|
277
|
+
except Exception as e:
|
|
278
|
+
logger.error(f"Error creating FTS index '{self.search_index_name}': {e}")
|
|
279
|
+
raise
|
|
280
|
+
|
|
281
|
+
def _wait_for_index_ready(self):
|
|
282
|
+
"""Wait until the FTS index is ready."""
|
|
283
|
+
start_time = time.time()
|
|
284
|
+
while True:
|
|
285
|
+
try:
|
|
286
|
+
count = self._search_indexes_mng().get_indexed_documents_count(self.search_index_name)
|
|
287
|
+
if count > -1:
|
|
288
|
+
logger.info(f"FTS index '{self.search_index_name}' is ready")
|
|
289
|
+
break
|
|
290
|
+
# logger.info(f"FTS index '{self.search_index_name}' is not ready yet status: {index['status']}")
|
|
291
|
+
except Exception as e:
|
|
292
|
+
if time.time() - start_time > self.wait_until_index_ready:
|
|
293
|
+
logger.error(f"Error checking index status: {e}")
|
|
294
|
+
raise TimeoutError("Timeout waiting for FTS index to become ready")
|
|
295
|
+
time.sleep(1)
|
|
296
|
+
|
|
297
|
+
def create(self) -> None:
|
|
298
|
+
"""Create the collection and FTS index if they don't exist."""
|
|
299
|
+
self._create_collection_and_scope()
|
|
300
|
+
self._create_fts_index()
|
|
301
|
+
|
|
302
|
+
def insert(self, content_hash: str, documents: List[Document], filters: Optional[Dict[str, Any]] = None) -> None:
|
|
303
|
+
"""
|
|
304
|
+
Insert documents into the Couchbase bucket. Fails if any document already exists.
|
|
305
|
+
|
|
306
|
+
Args:
|
|
307
|
+
documents: List of documents to insert
|
|
308
|
+
filters: Optional filters to apply to the documents
|
|
309
|
+
"""
|
|
310
|
+
log_debug(f"Inserting {len(documents)} documents")
|
|
311
|
+
|
|
312
|
+
docs_to_insert: Dict[str, Any] = {}
|
|
313
|
+
for document in documents:
|
|
314
|
+
if document.embedding is None:
|
|
315
|
+
document.embed(embedder=self.embedder)
|
|
316
|
+
|
|
317
|
+
if document.embedding is None:
|
|
318
|
+
raise ValueError(f"Failed to generate embedding for document: {document.name}")
|
|
319
|
+
try:
|
|
320
|
+
doc_data = self.prepare_doc(content_hash, document)
|
|
321
|
+
if filters:
|
|
322
|
+
doc_data["filters"] = filters
|
|
323
|
+
# For insert_multi, the key of the dict is the document ID,
|
|
324
|
+
# and the value is the document content itself.
|
|
325
|
+
doc_id = doc_data.pop("_id")
|
|
326
|
+
docs_to_insert[doc_id] = doc_data
|
|
327
|
+
except Exception as e:
|
|
328
|
+
logger.error(f"Error preparing document '{document.name}': {e}")
|
|
329
|
+
|
|
330
|
+
if not docs_to_insert:
|
|
331
|
+
logger.info("No documents prepared for insertion.")
|
|
332
|
+
return
|
|
333
|
+
|
|
334
|
+
doc_ids = list(docs_to_insert.keys())
|
|
335
|
+
total_inserted_count = 0
|
|
336
|
+
total_processed_count = len(doc_ids)
|
|
337
|
+
errors_occurred = False
|
|
338
|
+
|
|
339
|
+
for i in range(0, len(doc_ids), self.batch_limit):
|
|
340
|
+
batch_doc_ids = doc_ids[i : i + self.batch_limit]
|
|
341
|
+
batch_docs_to_insert = {doc_id: docs_to_insert[doc_id] for doc_id in batch_doc_ids}
|
|
342
|
+
|
|
343
|
+
if not batch_docs_to_insert:
|
|
344
|
+
continue
|
|
345
|
+
|
|
346
|
+
log_debug(f"Inserting batch of {len(batch_docs_to_insert)} documents.")
|
|
347
|
+
try:
|
|
348
|
+
result = self.collection.insert_multi(batch_docs_to_insert)
|
|
349
|
+
# Check for errors in the batch result
|
|
350
|
+
# The actual way to count successes/failures might depend on the SDK version
|
|
351
|
+
# For Couchbase SDK 3.x/4.x, result.all_ok is a good indicator for the whole batch.
|
|
352
|
+
# If not all_ok, result.exceptions (dict) contains errors for specific keys.
|
|
353
|
+
|
|
354
|
+
# Simplistic success counting for this example, assuming partial success is possible
|
|
355
|
+
# and we want to count how many actually made it.
|
|
356
|
+
if result.all_ok:
|
|
357
|
+
batch_inserted_count = len(batch_docs_to_insert)
|
|
358
|
+
logger.info(f"Batch of {batch_inserted_count} documents inserted successfully.")
|
|
359
|
+
else:
|
|
360
|
+
# If not all_ok, count successes by checking which keys are NOT in exceptions
|
|
361
|
+
# This is a more robust way than just len(batch) - len(exceptions)
|
|
362
|
+
# as some items might succeed even if others fail.
|
|
363
|
+
succeeded_ids = set(batch_docs_to_insert.keys()) - set(
|
|
364
|
+
result.exceptions.keys() if result.exceptions else []
|
|
365
|
+
)
|
|
366
|
+
batch_inserted_count = len(succeeded_ids)
|
|
367
|
+
if batch_inserted_count > 0:
|
|
368
|
+
logger.info(f"Partially inserted {batch_inserted_count} documents in batch.")
|
|
369
|
+
logger.warning(f"Bulk write error during batch insert: {result.exceptions}")
|
|
370
|
+
errors_occurred = True
|
|
371
|
+
total_inserted_count += batch_inserted_count
|
|
372
|
+
|
|
373
|
+
except Exception as e:
|
|
374
|
+
logger.error(f"Error during batch bulk insert for {len(batch_docs_to_insert)} documents: {e}")
|
|
375
|
+
errors_occurred = True # Mark that an error occurred in this batch
|
|
376
|
+
|
|
377
|
+
logger.info(f"Finished processing {total_processed_count} documents for insertion.")
|
|
378
|
+
logger.info(f"Total successfully inserted: {total_inserted_count}.")
|
|
379
|
+
if errors_occurred:
|
|
380
|
+
logger.warning("Some errors occurred during the insert operation. Please check logs for details.")
|
|
381
|
+
|
|
382
|
+
def upsert_available(self) -> bool:
|
|
383
|
+
"""Check if upsert is available in Couchbase."""
|
|
384
|
+
return True
|
|
385
|
+
|
|
386
|
+
def _upsert(self, content_hash: str, documents: List[Document], filters: Optional[Dict[str, Any]] = None) -> None:
|
|
387
|
+
"""
|
|
388
|
+
Update existing documents or insert new ones into the Couchbase bucket.
|
|
389
|
+
"""
|
|
390
|
+
if self.content_hash_exists(content_hash):
|
|
391
|
+
self._delete_by_content_hash(content_hash)
|
|
392
|
+
self.insert(content_hash=content_hash, documents=documents, filters=filters)
|
|
393
|
+
|
|
394
|
+
def upsert(self, content_hash: str, documents: List[Document], filters: Optional[Dict[str, Any]] = None) -> None:
|
|
395
|
+
"""
|
|
396
|
+
Update existing documents or insert new ones into the Couchbase bucket.
|
|
397
|
+
|
|
398
|
+
Args:
|
|
399
|
+
documents: List of documents to upsert
|
|
400
|
+
filters: Optional filters to apply to the documents
|
|
401
|
+
"""
|
|
402
|
+
logger.info(f"Upserting {len(documents)} documents")
|
|
403
|
+
|
|
404
|
+
docs_to_upsert: Dict[str, Any] = {}
|
|
405
|
+
for document in documents:
|
|
406
|
+
try:
|
|
407
|
+
if document.embedding is None:
|
|
408
|
+
document.embed(embedder=self.embedder)
|
|
409
|
+
|
|
410
|
+
if document.embedding is None:
|
|
411
|
+
raise ValueError(f"Failed to generate embedding for document: {document.name}")
|
|
412
|
+
|
|
413
|
+
doc_data = self.prepare_doc(content_hash, document)
|
|
414
|
+
if filters:
|
|
415
|
+
doc_data["filters"] = filters
|
|
416
|
+
# For upsert_multi, the key of the dict is the document ID,
|
|
417
|
+
# and the value is the document content itself.
|
|
418
|
+
doc_id = doc_data.pop("_id")
|
|
419
|
+
docs_to_upsert[doc_id] = doc_data
|
|
420
|
+
except Exception as e:
|
|
421
|
+
logger.error(f"Error preparing document '{document.name}': {e}")
|
|
422
|
+
|
|
423
|
+
if not docs_to_upsert:
|
|
424
|
+
logger.info("No documents prepared for upsert.")
|
|
425
|
+
return
|
|
426
|
+
|
|
427
|
+
doc_ids = list(docs_to_upsert.keys())
|
|
428
|
+
total_upserted_count = 0
|
|
429
|
+
total_processed_count = len(doc_ids)
|
|
430
|
+
errors_occurred = False
|
|
431
|
+
|
|
432
|
+
for i in range(0, len(doc_ids), self.batch_limit):
|
|
433
|
+
batch_doc_ids = doc_ids[i : i + self.batch_limit]
|
|
434
|
+
batch_docs_to_upsert = {doc_id: docs_to_upsert[doc_id] for doc_id in batch_doc_ids}
|
|
435
|
+
|
|
436
|
+
if not batch_docs_to_upsert:
|
|
437
|
+
continue
|
|
438
|
+
|
|
439
|
+
logger.info(f"Upserting batch of {len(batch_docs_to_upsert)} documents.")
|
|
440
|
+
try:
|
|
441
|
+
result = self.collection.upsert_multi(batch_docs_to_upsert)
|
|
442
|
+
# Similar to insert_multi, check for errors in the batch result.
|
|
443
|
+
if result.all_ok:
|
|
444
|
+
batch_upserted_count = len(batch_docs_to_upsert)
|
|
445
|
+
logger.info(f"Batch of {batch_upserted_count} documents upserted successfully.")
|
|
446
|
+
else:
|
|
447
|
+
succeeded_ids = set(batch_docs_to_upsert.keys()) - set(
|
|
448
|
+
result.exceptions.keys() if result.exceptions else []
|
|
449
|
+
)
|
|
450
|
+
batch_upserted_count = len(succeeded_ids)
|
|
451
|
+
if batch_upserted_count > 0:
|
|
452
|
+
logger.info(f"Partially upserted {batch_upserted_count} documents in batch.")
|
|
453
|
+
logger.warning(f"Bulk write error during batch upsert: {result.exceptions}")
|
|
454
|
+
errors_occurred = True
|
|
455
|
+
total_upserted_count += batch_upserted_count
|
|
456
|
+
|
|
457
|
+
except Exception as e:
|
|
458
|
+
logger.error(f"Error during batch bulk upsert for {len(batch_docs_to_upsert)} documents: {e}")
|
|
459
|
+
errors_occurred = True
|
|
460
|
+
|
|
461
|
+
logger.info(f"Finished processing {total_processed_count} documents for upsert.")
|
|
462
|
+
logger.info(f"Total successfully upserted: {total_upserted_count}.")
|
|
463
|
+
if errors_occurred:
|
|
464
|
+
logger.warning("Some errors occurred during the upsert operation. Please check logs for details.")
|
|
465
|
+
|
|
466
|
+
def search(
|
|
467
|
+
self, query: str, limit: int = 5, filters: Optional[Union[Dict[str, Any], List[FilterExpr]]] = None
|
|
468
|
+
) -> List[Document]:
|
|
469
|
+
if isinstance(filters, List):
|
|
470
|
+
log_warning("Filter Expressions are not yet supported in Couchbase. No filters will be applied.")
|
|
471
|
+
filters = None
|
|
472
|
+
"""Search the Couchbase bucket for documents relevant to the query."""
|
|
473
|
+
query_embedding = self.embedder.get_embedding(query)
|
|
474
|
+
if query_embedding is None:
|
|
475
|
+
logger.error(f"Failed to generate embedding for query: {query}")
|
|
476
|
+
return []
|
|
477
|
+
|
|
478
|
+
try:
|
|
479
|
+
# Implement vector search using Couchbase FTS
|
|
480
|
+
vector_search = VectorSearch.from_vector_query(
|
|
481
|
+
VectorQuery(field_name="embedding", vector=query_embedding, num_candidates=limit)
|
|
482
|
+
)
|
|
483
|
+
request = SearchRequest.create(vector_search)
|
|
484
|
+
|
|
485
|
+
# Prepare the options dictionary
|
|
486
|
+
options_dict = {"limit": limit, "fields": ["*"]}
|
|
487
|
+
if filters:
|
|
488
|
+
options_dict["raw"] = filters
|
|
489
|
+
|
|
490
|
+
search_args = {
|
|
491
|
+
"index": self.search_index_name,
|
|
492
|
+
"request": request,
|
|
493
|
+
"options": SearchOptions(**options_dict), # Construct SearchOptions with the dictionary
|
|
494
|
+
}
|
|
495
|
+
|
|
496
|
+
if self.is_global_level_index:
|
|
497
|
+
results = self.cluster.search(**search_args)
|
|
498
|
+
else:
|
|
499
|
+
results = self.scope.search(**search_args)
|
|
500
|
+
|
|
501
|
+
return self.__get_doc_from_kv(results)
|
|
502
|
+
except Exception as e:
|
|
503
|
+
logger.error(f"Error during search: {e}")
|
|
504
|
+
raise
|
|
505
|
+
|
|
506
|
+
def __get_doc_from_kv(self, response: SearchResult) -> List[Document]:
|
|
507
|
+
"""
|
|
508
|
+
Convert search results to Document objects by fetching full documents from KV store.
|
|
509
|
+
|
|
510
|
+
Args:
|
|
511
|
+
response: SearchResult from Couchbase search query
|
|
512
|
+
|
|
513
|
+
Returns:
|
|
514
|
+
List of Document objects
|
|
515
|
+
"""
|
|
516
|
+
documents: List[Document] = []
|
|
517
|
+
search_hits = [(doc.id, doc.score) for doc in response.rows()]
|
|
518
|
+
|
|
519
|
+
if not search_hits:
|
|
520
|
+
return documents
|
|
521
|
+
|
|
522
|
+
# Fetch documents from KV store
|
|
523
|
+
ids = [hit[0] for hit in search_hits]
|
|
524
|
+
kv_response = self.collection.get_multi(keys=ids)
|
|
525
|
+
|
|
526
|
+
if not kv_response.all_ok:
|
|
527
|
+
raise Exception(f"Failed to get documents from KV store: {kv_response.exceptions}")
|
|
528
|
+
|
|
529
|
+
# Convert results to Documents
|
|
530
|
+
for doc_id, score in search_hits:
|
|
531
|
+
get_result = kv_response.results.get(doc_id)
|
|
532
|
+
if get_result is None or not get_result.success:
|
|
533
|
+
logger.warning(f"Document {doc_id} not found in KV store")
|
|
534
|
+
continue
|
|
535
|
+
|
|
536
|
+
value = get_result.value
|
|
537
|
+
documents.append(
|
|
538
|
+
Document(
|
|
539
|
+
id=doc_id,
|
|
540
|
+
name=value["name"],
|
|
541
|
+
content=value["content"],
|
|
542
|
+
meta_data=value["meta_data"],
|
|
543
|
+
embedding=value["embedding"],
|
|
544
|
+
content_id=value.get("content_id"),
|
|
545
|
+
)
|
|
546
|
+
)
|
|
547
|
+
|
|
548
|
+
return documents
|
|
549
|
+
|
|
550
|
+
def drop(self) -> None:
|
|
551
|
+
"""Delete the collection from the scope."""
|
|
552
|
+
if self.exists():
|
|
553
|
+
try:
|
|
554
|
+
self.bucket.collections().drop_collection(
|
|
555
|
+
collection_name=self.collection_name, scope_name=self.scope_name
|
|
556
|
+
)
|
|
557
|
+
logger.info(f"Collection '{self.collection_name}' dropped successfully.")
|
|
558
|
+
except Exception as e:
|
|
559
|
+
logger.error(f"Error dropping collection '{self.collection_name}': {e}")
|
|
560
|
+
raise
|
|
561
|
+
|
|
562
|
+
def delete(self) -> bool:
|
|
563
|
+
"""Delete the collection from the scope."""
|
|
564
|
+
if self.exists():
|
|
565
|
+
self.drop()
|
|
566
|
+
return True
|
|
567
|
+
return False
|
|
568
|
+
|
|
569
|
+
def exists(self) -> bool:
|
|
570
|
+
"""Check if the collection exists."""
|
|
571
|
+
try:
|
|
572
|
+
scopes = self.bucket.collections().get_all_scopes()
|
|
573
|
+
for scope in scopes:
|
|
574
|
+
if scope.name == self.scope_name:
|
|
575
|
+
for collection in scope.collections:
|
|
576
|
+
if collection.name == self.collection_name:
|
|
577
|
+
return True
|
|
578
|
+
return False
|
|
579
|
+
except Exception:
|
|
580
|
+
return False
|
|
581
|
+
|
|
582
|
+
def prepare_doc(self, content_hash: str, document: Document) -> Dict[str, Any]:
|
|
583
|
+
"""
|
|
584
|
+
Prepare a document for insertion into Couchbase.
|
|
585
|
+
|
|
586
|
+
Args:
|
|
587
|
+
document: Document to prepare
|
|
588
|
+
|
|
589
|
+
Returns:
|
|
590
|
+
Dictionary containing document data ready for insertion
|
|
591
|
+
|
|
592
|
+
Raises:
|
|
593
|
+
ValueError: If embedding generation fails
|
|
594
|
+
"""
|
|
595
|
+
if not document.content:
|
|
596
|
+
raise ValueError(f"Document {document.name} has no content")
|
|
597
|
+
|
|
598
|
+
logger.debug(f"Preparing document: {document.name}")
|
|
599
|
+
|
|
600
|
+
# Clean content and generate ID
|
|
601
|
+
cleaned_content = document.content.replace("\x00", "\ufffd")
|
|
602
|
+
doc_id = md5(cleaned_content.encode("utf-8")).hexdigest()
|
|
603
|
+
|
|
604
|
+
return {
|
|
605
|
+
"_id": doc_id,
|
|
606
|
+
"name": document.name,
|
|
607
|
+
"content": cleaned_content,
|
|
608
|
+
"meta_data": document.meta_data, # Ensure meta_data is never None
|
|
609
|
+
"embedding": document.embedding,
|
|
610
|
+
"content_id": document.content_id,
|
|
611
|
+
"content_hash": content_hash,
|
|
612
|
+
}
|
|
613
|
+
|
|
+    def get_count(self) -> int:
+        """Get the count of documents in the Couchbase bucket."""
+        try:
+            search_indexes = self.cluster.search_indexes()
+            if not self.is_global_level_index:
+                search_indexes = self.scope.search_indexes()
+            return search_indexes.get_indexed_documents_count(self.search_index_name)
+        except Exception as e:
+            logger.error(f"Error getting document count: {e}")
+            return 0
+
+    def name_exists(self, name: str) -> bool:
+        """Check if a document exists in the bucket based on its name."""
+        try:
+            # Use N1QL query to check if document with given name exists
+            query = f"SELECT name FROM {self.bucket_name}.{self.scope_name}.{self.collection_name} WHERE name = $name LIMIT 1"
+            result = self.scope.query(
+                query, QueryOptions(named_parameters={"name": name}, scan_consistency=QueryScanConsistency.REQUEST_PLUS)
+            )
+            for row in result.rows():
+                return True
+            return False
+        except Exception as e:
+            logger.error(f"Error checking document name existence: {e}")
+            return False
+
+    def id_exists(self, id: str) -> bool:
+        """Check if a document exists in the bucket based on its ID."""
+        try:
+            result = self.collection.exists(id)
+            if not result.exists:
+                logger.debug(f"Document does not exist: {id}")
+            return result.exists
+        except Exception as e:
+            logger.error(f"Error checking document existence: {e}")
+            return False
+
+    def content_hash_exists(self, content_hash: str) -> bool:
+        """Check if a document exists in the bucket based on its content hash."""
+        try:
+            # Use N1QL query to check if document with given content_hash exists
+            query = f"SELECT content_hash FROM {self.bucket_name}.{self.scope_name}.{self.collection_name} WHERE content_hash = $content_hash LIMIT 1"
+            result = self.scope.query(
+                query,
+                QueryOptions(
+                    named_parameters={"content_hash": content_hash}, scan_consistency=QueryScanConsistency.REQUEST_PLUS
+                ),
+            )
+            for row in result.rows():
+                return True
+            return False
+        except Exception as e:
+            logger.error(f"Error checking document content_hash existence: {e}")
+            return False
+
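For context, a hedged standalone sketch of the same parameterized N1QL existence check issued directly with the Couchbase Python SDK; the connection string, credentials and bucket/scope/collection names below are placeholders for illustration, not values from this package.

```python
from couchbase.auth import PasswordAuthenticator
from couchbase.cluster import Cluster
from couchbase.n1ql import QueryScanConsistency
from couchbase.options import ClusterOptions, QueryOptions

# Placeholder connection details for illustration only
cluster = Cluster("couchbase://localhost", ClusterOptions(PasswordAuthenticator("user", "password")))
scope = cluster.bucket("vectors").scope("agno")

query = "SELECT content_hash FROM vectors.agno.documents WHERE content_hash = $content_hash LIMIT 1"
result = scope.query(
    query,
    QueryOptions(named_parameters={"content_hash": "abc123"}, scan_consistency=QueryScanConsistency.REQUEST_PLUS),
)
exists = any(True for _ in result.rows())  # any returned row means a match
print(exists)
```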
+    # === ASYNC SUPPORT USING acouchbase ===
+
+    async def _create_async_cluster_instance(self) -> AsyncCluster:
+        """Helper method to create and connect an AsyncCluster instance."""
+        logger.debug("Creating and connecting new AsyncCluster instance.")
+        cluster = await AsyncCluster.connect(self.connection_string, self.cluster_options)
+        # AsyncCluster.connect ensures the cluster is ready upon successful await.
+        # No explicit wait_until_ready is needed here for AsyncCluster.
+        logger.info("AsyncCluster connected successfully.")
+        return cluster
+
+    async def get_async_cluster(self) -> AsyncCluster:
+        """Gets or creates the cached AsyncCluster instance."""
+        if self._async_cluster is None:
+            logger.debug("AsyncCluster instance not cached, creating new one.")
+            self._async_cluster = await self._create_async_cluster_instance()
+        return self._async_cluster
+
+    async def get_async_bucket(self) -> AsyncBucket:
+        """Gets or creates the cached AsyncBucket instance."""
+        if self._async_bucket is None:
+            logger.debug("AsyncBucket instance not cached, creating new one.")
+            cluster = await self.get_async_cluster()
+            self._async_bucket = cluster.bucket(self.bucket_name)
+        return self._async_bucket
+
+    async def get_async_scope(self) -> AsyncScope:
+        """Gets or creates the cached AsyncScope instance."""
+        if self._async_scope is None:
+            logger.debug("AsyncScope instance not cached, creating new one.")
+            bucket = await self.get_async_bucket()
+            self._async_scope = bucket.scope(self.scope_name)
+        return self._async_scope
+
+    async def get_async_collection(self) -> AsyncCollection:
+        """Gets or creates the cached AsyncCollection instance."""
+        if self._async_collection is None:
+            logger.debug("AsyncCollection instance not cached, creating new one.")
+            scope = await self.get_async_scope()
+            self._async_collection = scope.collection(self.collection_name)
+        return self._async_collection
+
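The four accessors above build the connection chain lazily and cache each handle, so a single call is enough to materialise the whole cluster → bucket → scope → collection path. A hedged usage sketch, assuming `vector_db` is an already-constructed CouchbaseSearch instance:

```python
import asyncio

async def warm_up(vector_db):
    # First call connects the cluster and caches the bucket, scope and collection handles
    collection = await vector_db.get_async_collection()
    # Subsequent calls reuse the cached handles without reconnecting
    same_collection = await vector_db.get_async_collection()
    return collection is same_collection  # True once the cache is populated

# asyncio.run(warm_up(vector_db))
```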
+    async def async_create(self) -> None:
+        # FTS index creation is not supported in acouchbase as of now, so fallback to sync for index creation
+        # This is a limitation of the SDK. You may want to document this.
+        await self._async_create_collection_and_scope()
+        await self._async_create_fts_index()
+
+    async def _async_create_collection_and_scope(self):
+        """
+        Get or create the scope and collection within the bucket.
+
+        Uses EAFP principle: attempts to create scope/collection and handles
+        specific exceptions if they already exist or (for collections with overwrite=True)
+        if they are not found for dropping.
+
+        Raises:
+            Exception: If scope or collection creation/manipulation fails unexpectedly.
+        """
+        # 1. Ensure Scope Exists
+        async_bucket_instance = await self.get_async_bucket()
+        try:
+            await async_bucket_instance.collections().create_scope(self.scope_name)
+            logger.info(f"Created new scope '{self.scope_name}'")
+        except ScopeAlreadyExistsException:
+            logger.info(f"Scope '{self.scope_name}' already exists. Using existing scope.")
+        except Exception as e:
+            logger.error(f"Failed to create or ensure scope '{self.scope_name}' exists: {e}")
+            raise
+
+        collection_manager = async_bucket_instance.collections()
+
+        # 2. Handle Collection
+        if self.overwrite:
+            # Attempt to drop the collection first since overwrite is True
+            try:
+                logger.info(
+                    f"Overwrite is True. Attempting to drop collection '{self.collection_name}' in scope '{self.scope_name}'."
+                )
+                await collection_manager.drop_collection(
+                    collection_name=self.collection_name, scope_name=self.scope_name
+                )
+                logger.info(f"Successfully dropped collection '{self.collection_name}'.")
+                time.sleep(1)  # Brief wait after drop, as in original code
+            except CollectionNotFoundException:
+                logger.info(
+                    f"Collection '{self.collection_name}' not found in scope '{self.scope_name}'. No need to drop."
+                )
+            except Exception as e:
+                logger.error(f"Error dropping collection '{self.collection_name}' during overwrite: {e}")
+                raise
+
+            # Proceed to create the collection
+            try:
+                logger.info(f"Creating collection '{self.collection_name}' in scope '{self.scope_name}'.")
+                await collection_manager.create_collection(
+                    scope_name=self.scope_name, collection_name=self.collection_name
+                )
+                logger.info(
+                    f"Successfully created collection '{self.collection_name}' after drop attempt (overwrite=True)."
+                )
+            except CollectionAlreadyExistsException:
+                # This is an unexpected state if overwrite=True and drop was supposed to clear the way.
+                logger.error(
+                    f"Failed to create collection '{self.collection_name}' as it already exists, "
+                    f"even after drop attempt for overwrite. Overwrite operation may not have completed as intended."
+                )
+                raise  # Re-raise as the overwrite intent failed
+            except Exception as e:
+                logger.error(
+                    f"Error creating collection '{self.collection_name}' after drop attempt (overwrite=True): {e}"
+                )
+                raise
+        else:  # self.overwrite is False
+            try:
+                logger.info(
+                    f"Overwrite is False. Attempting to create collection '{self.collection_name}' in scope '{self.scope_name}'."
+                )
+                await collection_manager.create_collection(
+                    scope_name=self.scope_name, collection_name=self.collection_name
+                )
+                logger.info(f"Successfully created new collection '{self.collection_name}'.")
+            except CollectionAlreadyExistsException:
+                logger.info(
+                    f"Collection '{self.collection_name}' already exists in scope '{self.scope_name}'. Using existing collection."
+                )
+            except Exception as e:
+                logger.error(f"Error creating collection '{self.collection_name}': {e}")
+                raise
+
+    async def _get_async_search_indexes_mng(self) -> Union[AsyncSearchIndexManager, AsyncScopeSearchIndexManager]:
+        """Get the async search indexes manager."""
+        if self.is_global_level_index:
+            cluster = await self.get_async_cluster()
+            return cluster.search_indexes()
+        else:
+            scope = await self.get_async_scope()
+            return scope.search_indexes()
+
+    async def _async_create_fts_index(self):
+        """Create a FTS index on the collection if it doesn't exist."""
+        async_search_mng = await self._get_async_search_indexes_mng()
+        try:
+            # Check if index exists and handle string index name
+            await async_search_mng.get_index(self.search_index_name)
+            if not self.overwrite:
+                return
+        except Exception:
+            if self.search_index_definition is None:
+                raise ValueError(f"Index '{self.search_index_name}' does not exist")
+
+        # Create or update index
+        try:
+            if self.overwrite:
+                try:
+                    logger.info(f"Dropping existing FTS index '{self.search_index_name}'")
+                    await async_search_mng.drop_index(self.search_index_name)
+                except SearchIndexNotFoundException:
+                    logger.warning(f"Index '{self.search_index_name}' does not exist")
+                except Exception as e:
+                    logger.warning(f"Error dropping index (may not exist): {e}")
+
+            await async_search_mng.upsert_index(self.search_index_definition)
+            logger.info(f"Created FTS index '{self.search_index_name}'")
+
+            if self.wait_until_index_ready:
+                await self._async_wait_for_index_ready()
+
+        except Exception as e:
+            logger.error(f"Error creating FTS index '{self.search_index_name}': {e}")
+            raise
+
+    async def _async_wait_for_index_ready(self):
+        """Wait until the FTS index is ready."""
+        start_time = time.time()
+        async_search_mng = await self._get_async_search_indexes_mng()
+        while True:
+            try:
+                count = await async_search_mng.get_indexed_documents_count(self.search_index_name)
+                if count > -1:
+                    logger.info(f"FTS index '{self.search_index_name}' is ready")
+                    break
+                # logger.info(f"FTS index '{self.search_index_name}' is not ready yet status: {index['status']}")
+            except Exception as e:
+                if time.time() - start_time > self.wait_until_index_ready:
+                    logger.error(f"Error checking index status: {e}")
+                    raise TimeoutError("Timeout waiting for FTS index to become ready")
+            await asyncio.sleep(1)
+
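A hedged usage sketch of the provisioning path above: `async_create` ensures the scope and collection exist and then creates (or, with overwrite enabled, recreates) the FTS index, optionally waiting until it reports an indexed-document count. `vector_db` is assumed to be a configured CouchbaseSearch instance.

```python
import asyncio

async def provision(vector_db):
    # Creates scope + collection (EAFP style) and then the FTS index;
    # honours the instance's overwrite and wait_until_index_ready settings.
    await vector_db.async_create()

# asyncio.run(provision(vector_db))
```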
+    async def async_id_exists(self, id: str) -> bool:
+        try:
+            async_collection_instance = await self.get_async_collection()
+            result = await async_collection_instance.exists(id)
+            if not result.exists:
+                logger.debug(f"[async] Document does not exist: {id}")
+            return result.exists
+        except Exception as e:
+            logger.error(f"[async] Error checking document existence: {e}")
+            return False
+
+    async def async_name_exists(self, name: str) -> bool:
+        try:
+            query = f"SELECT name FROM {self.bucket_name}.{self.scope_name}.{self.collection_name} WHERE name = $name LIMIT 1"
+            async_scope_instance = await self.get_async_scope()
+            result = async_scope_instance.query(
+                query, QueryOptions(named_parameters={"name": name}, scan_consistency=QueryScanConsistency.REQUEST_PLUS)
+            )
+            async for row in result.rows():
+                return True
+            return False
+        except Exception as e:
+            logger.error(f"[async] Error checking document name existence: {e}")
+            return False
+
+    async def async_insert(
+        self, content_hash: str, documents: List[Document], filters: Optional[Dict[str, Any]] = None
+    ) -> None:
+        logger.info(f"[async] Inserting {len(documents)} documents")
+
+        async_collection_instance = await self.get_async_collection()
+        all_docs_to_insert: Dict[str, Any] = {}
+
+        if self.embedder.enable_batch and hasattr(self.embedder, "async_get_embeddings_batch_and_usage"):
+            # Use batch embedding when enabled and supported
+            try:
+                # Extract content from all documents
+                doc_contents = [doc.content for doc in documents]
+
+                # Get batch embeddings and usage
+                embeddings, usages = await self.embedder.async_get_embeddings_batch_and_usage(doc_contents)
+
+                # Process documents with pre-computed embeddings
+                for j, doc in enumerate(documents):
+                    try:
+                        if j < len(embeddings):
+                            doc.embedding = embeddings[j]
+                            doc.usage = usages[j] if j < len(usages) else None
+                    except Exception as e:
+                        logger.error(f"Error assigning batch embedding to document '{doc.name}': {e}")
+
+            except Exception as e:
+                # Check if this is a rate limit error - don't fall back as it would make things worse
+                error_str = str(e).lower()
+                is_rate_limit = any(
+                    phrase in error_str
+                    for phrase in ["rate limit", "too many requests", "429", "trial key", "api calls / minute"]
+                )
+
+                if is_rate_limit:
+                    logger.error(f"Rate limit detected during batch embedding. {e}")
+                    raise e
+                else:
+                    logger.warning(f"Async batch embedding failed, falling back to individual embeddings: {e}")
+                    # Fall back to individual embedding
+                    embed_tasks = [doc.async_embed(embedder=self.embedder) for doc in documents]
+                    await asyncio.gather(*embed_tasks, return_exceptions=True)
+        else:
+            # Use individual embedding
+            embed_tasks = [document.async_embed(embedder=self.embedder) for document in documents]
+            await asyncio.gather(*embed_tasks, return_exceptions=True)
+
+        for document in documents:
+            try:
+                # prepare_doc is synchronous and is called directly here (not offloaded with asyncio.to_thread)
+                doc_data = self.prepare_doc(content_hash, document)
+                if filters:
+                    doc_data["filters"] = filters
+                doc_id = doc_data.pop("_id")  # Remove _id as it's used as key
+                all_docs_to_insert[doc_id] = doc_data
+            except Exception as e:
+                logger.error(f"[async] Error preparing document '{document.name}': {e}")
+
+        if not all_docs_to_insert:
+            logger.info("[async] No documents prepared for insertion.")
+            return
+
+        doc_ids = list(all_docs_to_insert.keys())
+        total_inserted_count = 0
+        total_failed_count = 0
+        processed_doc_count = len(all_docs_to_insert)
+
+        for i in range(0, len(doc_ids), self.batch_limit):
+            batch_doc_ids = doc_ids[i : i + self.batch_limit]
+
+            logger.info(f"[async] Processing batch of {len(batch_doc_ids)} documents for concurrent insertion.")
+
+            insert_tasks = []
+            for doc_id in batch_doc_ids:
+                doc_content = all_docs_to_insert[doc_id]
+                insert_tasks.append(async_collection_instance.insert(doc_id, doc_content))
+
+            if insert_tasks:
+                results = await asyncio.gather(*insert_tasks, return_exceptions=True)
+                for idx, result in enumerate(results):
+                    # Get the original doc_id for logging, corresponding to the task order
+                    current_doc_id = batch_doc_ids[idx]
+                    if isinstance(result, Exception):
+                        total_failed_count += 1
+                        logger.error(f"[async] Error inserting document '{current_doc_id}': {result}")
+                    else:
+                        # A successful insert does not return a value we need to inspect;
+                        # the absence of an exception means success.
+                        total_inserted_count += 1
+                        logger.debug(f"[async] Successfully inserted document '{current_doc_id}'.")
+
+        logger.info(f"[async] Finished processing {processed_doc_count} documents.")
+        logger.info(f"[async] Total successfully inserted: {total_inserted_count}, Total failed: {total_failed_count}.")
+
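A hedged usage sketch for `async_insert`. The Document import path and the way the content hash is derived are assumptions made for this illustration; `vector_db` is assumed to be an existing CouchbaseSearch instance with an embedder configured.

```python
import asyncio
from hashlib import md5

from agno.knowledge.document import Document  # import path assumed for this sketch

async def load(vector_db):
    docs = [Document(name="intro", content="Couchbase stores JSON documents in collections.")]
    content_hash = md5(docs[0].content.encode("utf-8")).hexdigest()  # illustrative hash, not the library's own scheme
    await vector_db.async_insert(content_hash=content_hash, documents=docs, filters={"source": "docs"})

# asyncio.run(load(vector_db))
```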
+    async def async_upsert(
+        self, content_hash: str, documents: List[Document], filters: Optional[Dict[str, Any]] = None
+    ) -> None:
+        """Upsert documents asynchronously."""
+        if self.content_hash_exists(content_hash):
+            self._delete_by_content_hash(content_hash)
+        await self._async_upsert(content_hash=content_hash, documents=documents, filters=filters)
+
+    async def _async_upsert(
+        self, content_hash: str, documents: List[Document], filters: Optional[Dict[str, Any]] = None
+    ) -> None:
+        logger.info(f"[async] Upserting {len(documents)} documents")
+
+        async_collection_instance = await self.get_async_collection()
+        all_docs_to_upsert: Dict[str, Any] = {}
+
+        if self.embedder.enable_batch and hasattr(self.embedder, "async_get_embeddings_batch_and_usage"):
+            # Use batch embedding when enabled and supported
+            try:
+                # Extract content from all documents
+                doc_contents = [doc.content for doc in documents]
+
+                # Get batch embeddings and usage
+                embeddings, usages = await self.embedder.async_get_embeddings_batch_and_usage(doc_contents)
+
+                # Process documents with pre-computed embeddings
+                for j, doc in enumerate(documents):
+                    try:
+                        if j < len(embeddings):
+                            doc.embedding = embeddings[j]
+                            doc.usage = usages[j] if j < len(usages) else None
+                    except Exception as e:
+                        logger.error(f"Error assigning batch embedding to document '{doc.name}': {e}")
+
+            except Exception as e:
+                # Check if this is a rate limit error - don't fall back as it would make things worse
+                error_str = str(e).lower()
+                is_rate_limit = any(
+                    phrase in error_str
+                    for phrase in ["rate limit", "too many requests", "429", "trial key", "api calls / minute"]
+                )
+
+                if is_rate_limit:
+                    logger.error(f"Rate limit detected during batch embedding. {e}")
+                    raise e
+                else:
+                    logger.warning(f"Async batch embedding failed, falling back to individual embeddings: {e}")
+                    # Fall back to individual embedding
+                    embed_tasks = [doc.async_embed(embedder=self.embedder) for doc in documents]
+                    await asyncio.gather(*embed_tasks, return_exceptions=True)
+        else:
+            # Use individual embedding
+            embed_tasks = [document.async_embed(embedder=self.embedder) for document in documents]
+            await asyncio.gather(*embed_tasks, return_exceptions=True)
+
+        for document in documents:
+            try:
+                # Consistent with async_insert, prepare_doc is called directly here (not offloaded with asyncio.to_thread)
+                doc_data = self.prepare_doc(content_hash, document)
+                if filters:
+                    doc_data["filters"] = filters
+                doc_id = doc_data.pop("_id")  # _id is used as key for upsert
+                all_docs_to_upsert[doc_id] = doc_data
+            except Exception as e:
+                logger.error(f"[async] Error preparing document '{document.name}' for upsert: {e}")
+
+        if not all_docs_to_upsert:
+            logger.info("[async] No documents prepared for upsert.")
+            return
+
+        doc_ids = list(all_docs_to_upsert.keys())
+        total_upserted_count = 0
+        total_failed_count = 0
+        processed_doc_count = len(all_docs_to_upsert)
+
+        logger.info(f"[async] Prepared {processed_doc_count} documents for upsert.")
+
+        for i in range(0, len(doc_ids), self.batch_limit):
+            batch_doc_ids = doc_ids[i : i + self.batch_limit]
+
+            logger.info(f"[async] Processing batch of {len(batch_doc_ids)} documents for concurrent upsert.")
+
+            upsert_tasks = []
+            for doc_id in batch_doc_ids:
+                doc_content = all_docs_to_upsert[doc_id]
+                upsert_tasks.append(async_collection_instance.upsert(doc_id, doc_content))
+
+            if upsert_tasks:
+                results = await asyncio.gather(*upsert_tasks, return_exceptions=True)
+                for idx, result in enumerate(results):
+                    current_doc_id = batch_doc_ids[idx]
+                    if isinstance(result, Exception):
+                        total_failed_count += 1
+                        logger.error(f"[async] Error upserting document '{current_doc_id}': {result}")
+                    else:
+                        # A successful upsert does not return a value we need to inspect;
+                        # the absence of an exception means success.
+                        total_upserted_count += 1
+                        logger.debug(f"[async] Successfully upserted document '{current_doc_id}'.")
+
+        logger.info(f"[async] Finished processing {processed_doc_count} documents for upsert.")
+        logger.info(f"[async] Total successfully upserted: {total_upserted_count}, Total failed: {total_failed_count}.")
+
+    async def async_search(
+        self, query: str, limit: int = 5, filters: Optional[Union[Dict[str, Any], List[FilterExpr]]] = None
+    ) -> List[Document]:
+        if isinstance(filters, List):
+            log_warning("Filter Expressions are not yet supported in Couchbase. No filters will be applied.")
+            filters = None
+        query_embedding = self.embedder.get_embedding(query)
+        if query_embedding is None:
+            logger.error(f"[async] Failed to generate embedding for query: {query}")
+            return []
+        try:
+            # Implement vector search using Couchbase FTS
+            vector_search = VectorSearch.from_vector_query(
+                VectorQuery(field_name="embedding", vector=query_embedding, num_candidates=limit)
+            )
+            request = SearchRequest.create(vector_search)
+
+            # Prepare the options dictionary
+            options_dict = {"limit": limit, "fields": ["*"]}
+            if filters:
+                options_dict["raw"] = filters
+
+            search_args = {
+                "index": self.search_index_name,
+                "request": request,
+                "options": SearchOptions(**options_dict),  # Construct SearchOptions with the dictionary
+            }
+
+            if self.is_global_level_index:
+                async_cluster_instance = await self.get_async_cluster()
+                results = async_cluster_instance.search(**search_args)
+            else:
+                async_scope_instance = await self.get_async_scope()
+                results = async_scope_instance.search(**search_args)
+
+            return await self.__async_get_doc_from_kv(results)
+        except Exception as e:
+            logger.error(f"[async] Error during search: {e}")
+            raise
+
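A hedged usage sketch for `async_search`, assuming `vector_db` is a configured CouchbaseSearch instance whose FTS index already exists (see async_create above). Dictionary filters are passed through to the search options as a raw payload, while filter expressions are not yet supported.

```python
import asyncio

async def find(vector_db):
    matches = await vector_db.async_search("distributed JSON database", limit=3)
    for doc in matches:
        print(doc.id, doc.name, doc.meta_data)

# asyncio.run(find(vector_db))
```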
+    async def async_drop(self) -> None:
+        if await self.async_exists():
+            try:
+                async_bucket_instance = await self.get_async_bucket()
+                await async_bucket_instance.collections().drop_collection(
+                    collection_name=self.collection_name, scope_name=self.scope_name
+                )
+                logger.info(f"[async] Collection '{self.collection_name}' dropped successfully.")
+            except Exception as e:
+                logger.error(f"[async] Error dropping collection '{self.collection_name}': {e}")
+                raise
+
+    async def async_exists(self) -> bool:
+        try:
+            async_bucket_instance = await self.get_async_bucket()
+            scopes = await async_bucket_instance.collections().get_all_scopes()
+            for scope in scopes:
+                if scope.name == self.scope_name:
+                    for collection in scope.collections:
+                        if collection.name == self.collection_name:
+                            return True
+            return False
+        except Exception:
+            return False
+
+    async def __async_get_doc_from_kv(self, response: AsyncSearchIndex) -> List[Document]:
+        """
+        Convert search results to Document objects by fetching full documents from KV store concurrently.
+
+        Args:
+            response: SearchResult from Couchbase search query
+
+        Returns:
+            List of Document objects
+        """
+        documents: List[Document] = []
+        # Assuming search_hits map directly to the order of documents we want to fetch and reconstruct
+        search_hits_map = {doc.id: doc.score async for doc in response.rows()}
+        doc_ids_to_fetch = list(search_hits_map.keys())
+
+        if not doc_ids_to_fetch:
+            return documents
+
+        async_collection_instance = await self.get_async_collection()
+
+        # Process in batches
+        for i in range(0, len(doc_ids_to_fetch), self.batch_limit):
+            batch_doc_ids = doc_ids_to_fetch[i : i + self.batch_limit]
+            if not batch_doc_ids:
+                continue
+
+            logger.debug(f"[async] Fetching batch of {len(batch_doc_ids)} documents from KV.")
+            get_tasks = [async_collection_instance.get(doc_id) for doc_id in batch_doc_ids]
+
+            # Fetch documents from KV store concurrently for the current batch
+            results_from_kv_batch = await asyncio.gather(*get_tasks, return_exceptions=True)
+
+            for batch_idx, get_result in enumerate(results_from_kv_batch):
+                # Original doc_id corresponding to this result within the batch
+                doc_id = batch_doc_ids[batch_idx]
+                # score = search_hits_map[doc_id]  # Retrieve the original score
+
+                if isinstance(get_result, BaseException) or isinstance(get_result, Exception) or get_result is None:
+                    logger.warning(f"[async] Document {doc_id} not found or error fetching from KV store: {get_result}")
+                    continue
+
+                try:
+                    value = get_result.content_as[dict]
+                    if not isinstance(value, dict):
+                        logger.warning(
+                            f"[async] Document {doc_id} content from KV is not a dict: {type(value)}. Skipping."
+                        )
+                        continue
+
+                    documents.append(
+                        Document(
+                            id=doc_id,
+                            name=value.get("name"),
+                            content=value.get("content", ""),
+                            meta_data=value.get("meta_data", {}),
+                            embedding=value.get("embedding", []),
+                        )
+                    )
+                except Exception as e:
+                    logger.warning(
+                        f"[async] Error processing document {doc_id} from KV store: {e}. Value: {getattr(get_result, 'content_as', 'N/A')}"
+                    )
+                    continue
+
+        return documents
+
+    def delete_by_id(self, id: str) -> bool:
+        """
+        Delete a document by its ID.
+
+        Args:
+            id (str): The document ID to delete
+
+        Returns:
+            bool: True if document was deleted, False otherwise
+        """
+        try:
+            log_debug(f"Couchbase VectorDB : Deleting document with ID {id}")
+            if not self.id_exists(id):
+                return False
+
+            # Delete by ID using Couchbase collection.remove()
+            self.collection.remove(id)
+            log_info(f"Successfully deleted document with ID {id}")
+            return True
+        except Exception as e:
+            log_info(f"Error deleting document with ID {id}: {e}")
+            return False
+
+    def delete_by_name(self, name: str) -> bool:
+        """
+        Delete documents by name.
+
+        Args:
+            name (str): The document name to delete
+
+        Returns:
+            bool: True if documents were deleted, False otherwise
+        """
+        try:
+            log_debug(f"Couchbase VectorDB : Deleting documents with name {name}")
+
+            query = f"SELECT META().id as doc_id, * FROM {self.bucket_name}.{self.scope_name}.{self.collection_name} WHERE name = $name"
+            result = self.scope.query(
+                query, QueryOptions(named_parameters={"name": name}, scan_consistency=QueryScanConsistency.REQUEST_PLUS)
+            )
+            rows = list(result.rows())  # Collect once
+
+            for row in rows:
+                self.collection.remove(row.get("doc_id"))
+            log_info(f"Deleted {len(rows)} documents with name {name}")
+            return True
+
+        except Exception as e:
+            log_info(f"Error deleting documents with name {name}: {e}")
+            return False
+
+    def delete_by_metadata(self, metadata: Dict[str, Any]) -> bool:
+        """
+        Delete documents by metadata.
+
+        Args:
+            metadata (Dict[str, Any]): The metadata to match for deletion
+
+        Returns:
+            bool: True if documents were deleted, False otherwise
+        """
+        try:
+            log_debug(f"Couchbase VectorDB : Deleting documents with metadata {metadata}")
+
+            if not metadata:
+                log_info("No metadata provided for deletion")
+                return False
+
+            # Build WHERE clause for metadata matching
+            where_conditions = []
+            named_parameters: Dict[str, Any] = {}
+
+            for key, value in metadata.items():
+                if isinstance(value, (list, tuple)):
+                    # For array values, use ARRAY_CONTAINS
+                    where_conditions.append(
+                        f"(ARRAY_CONTAINS(filters.{key}, $value_{key}) OR ARRAY_CONTAINS(recipes.filters.{key}, $value_{key}))"
+                    )
+                    named_parameters[f"value_{key}"] = value
+                elif isinstance(value, str):
+                    where_conditions.append(f"(filters.{key} = $value_{key} OR recipes.filters.{key} = $value_{key})")
+                    named_parameters[f"value_{key}"] = value
+                elif isinstance(value, bool):
+                    where_conditions.append(f"(filters.{key} = $value_{key} OR recipes.filters.{key} = $value_{key})")
+                    named_parameters[f"value_{key}"] = value
+                elif isinstance(value, (int, float)):
+                    where_conditions.append(f"(filters.{key} = $value_{key} OR recipes.filters.{key} = $value_{key})")
+                    named_parameters[f"value_{key}"] = value
+                elif value is None:
+                    where_conditions.append(f"(filters.{key} IS NULL OR recipes.filters.{key} IS NULL)")
+                else:
+                    # For other types, convert to string
+                    where_conditions.append(f"(filters.{key} = $value_{key} OR recipes.filters.{key} = $value_{key})")
+                    named_parameters[f"value_{key}"] = str(value)
+
+            if not where_conditions:
+                log_info("No valid metadata conditions for deletion")
+                return False
+
+            where_clause = " AND ".join(where_conditions)
+            query = f"SELECT META().id as doc_id, * FROM {self.bucket_name}.{self.scope_name}.{self.collection_name} WHERE {where_clause}"
+
+            result = self.scope.query(
+                query,
+                QueryOptions(named_parameters=named_parameters, scan_consistency=QueryScanConsistency.REQUEST_PLUS),
+            )
+            rows = list(result.rows())  # Collect once
+
+            for row in rows:
+                self.collection.remove(row.get("doc_id"))
+            log_info(f"Deleted {len(rows)} documents with metadata {metadata}")
+            return True
+
+        except Exception as e:
+            log_info(f"Error deleting documents with metadata {metadata}: {e}")
+            return False
+
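A minimal standalone sketch of the WHERE clause that `delete_by_metadata` builds for a simple filter, mirroring the branches above; the metadata values are made up for illustration.

```python
metadata = {"user_id": "u42", "tags": ["beta"]}
where_conditions = []
named_parameters = {}
for key, value in metadata.items():
    if isinstance(value, (list, tuple)):
        where_conditions.append(
            f"(ARRAY_CONTAINS(filters.{key}, $value_{key}) OR ARRAY_CONTAINS(recipes.filters.{key}, $value_{key}))"
        )
    else:
        where_conditions.append(f"(filters.{key} = $value_{key} OR recipes.filters.{key} = $value_{key})")
    named_parameters[f"value_{key}"] = value

print(" AND ".join(where_conditions))  # one parameterised condition per key, joined with AND
```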
+    def delete_by_content_id(self, content_id: str) -> bool:
+        """
+        Delete documents by content ID.
+
+        Args:
+            content_id (str): The content ID to delete
+
+        Returns:
+            bool: True if documents were deleted, False otherwise
+        """
+        try:
+            log_debug(f"Couchbase VectorDB : Deleting documents with content_id {content_id}")
+
+            query = f"SELECT META().id as doc_id, * FROM {self.bucket_name}.{self.scope_name}.{self.collection_name} WHERE content_id = $content_id OR recipes.content_id = $content_id"
+            result = self.scope.query(
+                query,
+                QueryOptions(
+                    named_parameters={"content_id": content_id}, scan_consistency=QueryScanConsistency.REQUEST_PLUS
+                ),
+            )
+            rows = list(result.rows())  # Collect once
+
+            for row in rows:
+                self.collection.remove(row.get("doc_id"))
+            log_info(f"Deleted {len(rows)} documents with content_id {content_id}")
+            return True
+
+        except Exception as e:
+            log_info(f"Error deleting documents with content_id {content_id}: {e}")
+            return False
+
+    def _delete_by_content_hash(self, content_hash: str) -> bool:
+        """
+        Delete documents by content hash.
+
+        Args:
+            content_hash (str): The content hash to delete
+
+        Returns:
+            bool: True if documents were deleted, False otherwise
+        """
+        try:
+            log_debug(f"Couchbase VectorDB : Deleting documents with content_hash {content_hash}")
+
+            query = f"SELECT META().id as doc_id, * FROM {self.bucket_name}.{self.scope_name}.{self.collection_name} WHERE content_hash = $content_hash"
+            result = self.scope.query(
+                query,
+                QueryOptions(
+                    named_parameters={"content_hash": content_hash}, scan_consistency=QueryScanConsistency.REQUEST_PLUS
+                ),
+            )
+            rows = list(result.rows())  # Collect once
+
+            for row in rows:
+                self.collection.remove(row.get("doc_id"))
+            log_info(f"Deleted {len(rows)} documents with content_hash {content_hash}")
+            return True
+
+        except Exception as e:
+            log_info(f"Error deleting documents with content_hash {content_hash}: {e}")
+            return False
+
+    def update_metadata(self, content_id: str, metadata: Dict[str, Any]) -> None:
+        """
+        Update the metadata for documents with the given content_id.
+
+        Args:
+            content_id (str): The content ID to update
+            metadata (Dict[str, Any]): The metadata to update
+        """
+        try:
+            # Query for documents with the given content_id
+            query = f"SELECT META().id as doc_id, meta_data, filters FROM `{self.bucket_name}` WHERE content_id = $content_id"
+            result = self.cluster.query(query, content_id=content_id)
+
+            updated_count = 0
+            for row in result:
+                doc_id = row.get("doc_id")
+                current_metadata = row.get("meta_data", {})
+                current_filters = row.get("filters", {})
+
+                # Merge existing metadata with new metadata
+                if isinstance(current_metadata, dict):
+                    updated_metadata = current_metadata.copy()
+                    updated_metadata.update(metadata)
+                else:
+                    updated_metadata = metadata
+
+                # Merge existing filters with new metadata
+                if isinstance(current_filters, dict):
+                    updated_filters = current_filters.copy()
+                    updated_filters.update(metadata)
+                else:
+                    updated_filters = metadata
+
+                # Update the document
+                try:
+                    doc = self.collection.get(doc_id)
+                    doc_content = doc.content_as[dict]
+                    doc_content["meta_data"] = updated_metadata
+                    doc_content["filters"] = updated_filters
+
+                    self.collection.upsert(doc_id, doc_content)
+                    updated_count += 1
+                except Exception as doc_error:
+                    logger.warning(f"Failed to update document {doc_id}: {doc_error}")
+
+            if updated_count == 0:
+                logger.debug(f"No documents found with content_id: {content_id}")
+            else:
+                logger.debug(f"Updated metadata for {updated_count} documents with content_id: {content_id}")
+
+        except Exception as e:
+            logger.error(f"Error updating metadata for content_id '{content_id}': {e}")
+            raise
+
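A minimal sketch of the merge step inside `update_metadata`: the existing metadata is copied and then updated with the new values, so keys that are not being changed survive the update. The values below are illustrative.

```python
current_metadata = {"source": "web", "lang": "en"}
new_metadata = {"lang": "de", "reviewed": True}

updated_metadata = current_metadata.copy()
updated_metadata.update(new_metadata)

print(updated_metadata)  # {'source': 'web', 'lang': 'de', 'reviewed': True}
```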
+    def get_supported_search_types(self) -> List[str]:
+        """Get the supported search types for this vector database."""
+        return []  # CouchbaseSearch doesn't use SearchType enum