agno 2.2.13__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- agno/__init__.py +8 -0
- agno/agent/__init__.py +51 -0
- agno/agent/agent.py +10405 -0
- agno/api/__init__.py +0 -0
- agno/api/agent.py +28 -0
- agno/api/api.py +40 -0
- agno/api/evals.py +22 -0
- agno/api/os.py +17 -0
- agno/api/routes.py +13 -0
- agno/api/schemas/__init__.py +9 -0
- agno/api/schemas/agent.py +16 -0
- agno/api/schemas/evals.py +16 -0
- agno/api/schemas/os.py +14 -0
- agno/api/schemas/response.py +6 -0
- agno/api/schemas/team.py +16 -0
- agno/api/schemas/utils.py +21 -0
- agno/api/schemas/workflows.py +16 -0
- agno/api/settings.py +53 -0
- agno/api/team.py +30 -0
- agno/api/workflow.py +28 -0
- agno/cloud/aws/base.py +214 -0
- agno/cloud/aws/s3/__init__.py +2 -0
- agno/cloud/aws/s3/api_client.py +43 -0
- agno/cloud/aws/s3/bucket.py +195 -0
- agno/cloud/aws/s3/object.py +57 -0
- agno/culture/__init__.py +3 -0
- agno/culture/manager.py +956 -0
- agno/db/__init__.py +24 -0
- agno/db/async_postgres/__init__.py +3 -0
- agno/db/base.py +598 -0
- agno/db/dynamo/__init__.py +3 -0
- agno/db/dynamo/dynamo.py +2042 -0
- agno/db/dynamo/schemas.py +314 -0
- agno/db/dynamo/utils.py +743 -0
- agno/db/firestore/__init__.py +3 -0
- agno/db/firestore/firestore.py +1795 -0
- agno/db/firestore/schemas.py +140 -0
- agno/db/firestore/utils.py +376 -0
- agno/db/gcs_json/__init__.py +3 -0
- agno/db/gcs_json/gcs_json_db.py +1335 -0
- agno/db/gcs_json/utils.py +228 -0
- agno/db/in_memory/__init__.py +3 -0
- agno/db/in_memory/in_memory_db.py +1160 -0
- agno/db/in_memory/utils.py +230 -0
- agno/db/json/__init__.py +3 -0
- agno/db/json/json_db.py +1328 -0
- agno/db/json/utils.py +230 -0
- agno/db/migrations/__init__.py +0 -0
- agno/db/migrations/v1_to_v2.py +635 -0
- agno/db/mongo/__init__.py +17 -0
- agno/db/mongo/async_mongo.py +2026 -0
- agno/db/mongo/mongo.py +1982 -0
- agno/db/mongo/schemas.py +87 -0
- agno/db/mongo/utils.py +259 -0
- agno/db/mysql/__init__.py +3 -0
- agno/db/mysql/mysql.py +2308 -0
- agno/db/mysql/schemas.py +138 -0
- agno/db/mysql/utils.py +355 -0
- agno/db/postgres/__init__.py +4 -0
- agno/db/postgres/async_postgres.py +1927 -0
- agno/db/postgres/postgres.py +2260 -0
- agno/db/postgres/schemas.py +139 -0
- agno/db/postgres/utils.py +442 -0
- agno/db/redis/__init__.py +3 -0
- agno/db/redis/redis.py +1660 -0
- agno/db/redis/schemas.py +123 -0
- agno/db/redis/utils.py +346 -0
- agno/db/schemas/__init__.py +4 -0
- agno/db/schemas/culture.py +120 -0
- agno/db/schemas/evals.py +33 -0
- agno/db/schemas/knowledge.py +40 -0
- agno/db/schemas/memory.py +46 -0
- agno/db/schemas/metrics.py +0 -0
- agno/db/singlestore/__init__.py +3 -0
- agno/db/singlestore/schemas.py +130 -0
- agno/db/singlestore/singlestore.py +2272 -0
- agno/db/singlestore/utils.py +384 -0
- agno/db/sqlite/__init__.py +4 -0
- agno/db/sqlite/async_sqlite.py +2293 -0
- agno/db/sqlite/schemas.py +133 -0
- agno/db/sqlite/sqlite.py +2288 -0
- agno/db/sqlite/utils.py +431 -0
- agno/db/surrealdb/__init__.py +3 -0
- agno/db/surrealdb/metrics.py +292 -0
- agno/db/surrealdb/models.py +309 -0
- agno/db/surrealdb/queries.py +71 -0
- agno/db/surrealdb/surrealdb.py +1353 -0
- agno/db/surrealdb/utils.py +147 -0
- agno/db/utils.py +116 -0
- agno/debug.py +18 -0
- agno/eval/__init__.py +14 -0
- agno/eval/accuracy.py +834 -0
- agno/eval/performance.py +773 -0
- agno/eval/reliability.py +306 -0
- agno/eval/utils.py +119 -0
- agno/exceptions.py +161 -0
- agno/filters.py +354 -0
- agno/guardrails/__init__.py +6 -0
- agno/guardrails/base.py +19 -0
- agno/guardrails/openai.py +144 -0
- agno/guardrails/pii.py +94 -0
- agno/guardrails/prompt_injection.py +52 -0
- agno/integrations/__init__.py +0 -0
- agno/integrations/discord/__init__.py +3 -0
- agno/integrations/discord/client.py +203 -0
- agno/knowledge/__init__.py +5 -0
- agno/knowledge/chunking/__init__.py +0 -0
- agno/knowledge/chunking/agentic.py +79 -0
- agno/knowledge/chunking/document.py +91 -0
- agno/knowledge/chunking/fixed.py +57 -0
- agno/knowledge/chunking/markdown.py +151 -0
- agno/knowledge/chunking/recursive.py +63 -0
- agno/knowledge/chunking/row.py +39 -0
- agno/knowledge/chunking/semantic.py +86 -0
- agno/knowledge/chunking/strategy.py +165 -0
- agno/knowledge/content.py +74 -0
- agno/knowledge/document/__init__.py +5 -0
- agno/knowledge/document/base.py +58 -0
- agno/knowledge/embedder/__init__.py +5 -0
- agno/knowledge/embedder/aws_bedrock.py +343 -0
- agno/knowledge/embedder/azure_openai.py +210 -0
- agno/knowledge/embedder/base.py +23 -0
- agno/knowledge/embedder/cohere.py +323 -0
- agno/knowledge/embedder/fastembed.py +62 -0
- agno/knowledge/embedder/fireworks.py +13 -0
- agno/knowledge/embedder/google.py +258 -0
- agno/knowledge/embedder/huggingface.py +94 -0
- agno/knowledge/embedder/jina.py +182 -0
- agno/knowledge/embedder/langdb.py +22 -0
- agno/knowledge/embedder/mistral.py +206 -0
- agno/knowledge/embedder/nebius.py +13 -0
- agno/knowledge/embedder/ollama.py +154 -0
- agno/knowledge/embedder/openai.py +195 -0
- agno/knowledge/embedder/sentence_transformer.py +63 -0
- agno/knowledge/embedder/together.py +13 -0
- agno/knowledge/embedder/vllm.py +262 -0
- agno/knowledge/embedder/voyageai.py +165 -0
- agno/knowledge/knowledge.py +1988 -0
- agno/knowledge/reader/__init__.py +7 -0
- agno/knowledge/reader/arxiv_reader.py +81 -0
- agno/knowledge/reader/base.py +95 -0
- agno/knowledge/reader/csv_reader.py +166 -0
- agno/knowledge/reader/docx_reader.py +82 -0
- agno/knowledge/reader/field_labeled_csv_reader.py +292 -0
- agno/knowledge/reader/firecrawl_reader.py +201 -0
- agno/knowledge/reader/json_reader.py +87 -0
- agno/knowledge/reader/markdown_reader.py +137 -0
- agno/knowledge/reader/pdf_reader.py +431 -0
- agno/knowledge/reader/pptx_reader.py +101 -0
- agno/knowledge/reader/reader_factory.py +313 -0
- agno/knowledge/reader/s3_reader.py +89 -0
- agno/knowledge/reader/tavily_reader.py +194 -0
- agno/knowledge/reader/text_reader.py +115 -0
- agno/knowledge/reader/web_search_reader.py +372 -0
- agno/knowledge/reader/website_reader.py +455 -0
- agno/knowledge/reader/wikipedia_reader.py +59 -0
- agno/knowledge/reader/youtube_reader.py +78 -0
- agno/knowledge/remote_content/__init__.py +0 -0
- agno/knowledge/remote_content/remote_content.py +88 -0
- agno/knowledge/reranker/__init__.py +3 -0
- agno/knowledge/reranker/base.py +14 -0
- agno/knowledge/reranker/cohere.py +64 -0
- agno/knowledge/reranker/infinity.py +195 -0
- agno/knowledge/reranker/sentence_transformer.py +54 -0
- agno/knowledge/types.py +39 -0
- agno/knowledge/utils.py +189 -0
- agno/media.py +462 -0
- agno/memory/__init__.py +3 -0
- agno/memory/manager.py +1327 -0
- agno/models/__init__.py +0 -0
- agno/models/aimlapi/__init__.py +5 -0
- agno/models/aimlapi/aimlapi.py +45 -0
- agno/models/anthropic/__init__.py +5 -0
- agno/models/anthropic/claude.py +757 -0
- agno/models/aws/__init__.py +15 -0
- agno/models/aws/bedrock.py +701 -0
- agno/models/aws/claude.py +378 -0
- agno/models/azure/__init__.py +18 -0
- agno/models/azure/ai_foundry.py +485 -0
- agno/models/azure/openai_chat.py +131 -0
- agno/models/base.py +2175 -0
- agno/models/cerebras/__init__.py +12 -0
- agno/models/cerebras/cerebras.py +501 -0
- agno/models/cerebras/cerebras_openai.py +112 -0
- agno/models/cohere/__init__.py +5 -0
- agno/models/cohere/chat.py +389 -0
- agno/models/cometapi/__init__.py +5 -0
- agno/models/cometapi/cometapi.py +57 -0
- agno/models/dashscope/__init__.py +5 -0
- agno/models/dashscope/dashscope.py +91 -0
- agno/models/deepinfra/__init__.py +5 -0
- agno/models/deepinfra/deepinfra.py +28 -0
- agno/models/deepseek/__init__.py +5 -0
- agno/models/deepseek/deepseek.py +61 -0
- agno/models/defaults.py +1 -0
- agno/models/fireworks/__init__.py +5 -0
- agno/models/fireworks/fireworks.py +26 -0
- agno/models/google/__init__.py +5 -0
- agno/models/google/gemini.py +1085 -0
- agno/models/groq/__init__.py +5 -0
- agno/models/groq/groq.py +556 -0
- agno/models/huggingface/__init__.py +5 -0
- agno/models/huggingface/huggingface.py +491 -0
- agno/models/ibm/__init__.py +5 -0
- agno/models/ibm/watsonx.py +422 -0
- agno/models/internlm/__init__.py +3 -0
- agno/models/internlm/internlm.py +26 -0
- agno/models/langdb/__init__.py +1 -0
- agno/models/langdb/langdb.py +48 -0
- agno/models/litellm/__init__.py +14 -0
- agno/models/litellm/chat.py +468 -0
- agno/models/litellm/litellm_openai.py +25 -0
- agno/models/llama_cpp/__init__.py +5 -0
- agno/models/llama_cpp/llama_cpp.py +22 -0
- agno/models/lmstudio/__init__.py +5 -0
- agno/models/lmstudio/lmstudio.py +25 -0
- agno/models/message.py +434 -0
- agno/models/meta/__init__.py +12 -0
- agno/models/meta/llama.py +475 -0
- agno/models/meta/llama_openai.py +78 -0
- agno/models/metrics.py +120 -0
- agno/models/mistral/__init__.py +5 -0
- agno/models/mistral/mistral.py +432 -0
- agno/models/nebius/__init__.py +3 -0
- agno/models/nebius/nebius.py +54 -0
- agno/models/nexus/__init__.py +3 -0
- agno/models/nexus/nexus.py +22 -0
- agno/models/nvidia/__init__.py +5 -0
- agno/models/nvidia/nvidia.py +28 -0
- agno/models/ollama/__init__.py +5 -0
- agno/models/ollama/chat.py +441 -0
- agno/models/openai/__init__.py +9 -0
- agno/models/openai/chat.py +883 -0
- agno/models/openai/like.py +27 -0
- agno/models/openai/responses.py +1050 -0
- agno/models/openrouter/__init__.py +5 -0
- agno/models/openrouter/openrouter.py +66 -0
- agno/models/perplexity/__init__.py +5 -0
- agno/models/perplexity/perplexity.py +187 -0
- agno/models/portkey/__init__.py +3 -0
- agno/models/portkey/portkey.py +81 -0
- agno/models/requesty/__init__.py +5 -0
- agno/models/requesty/requesty.py +52 -0
- agno/models/response.py +199 -0
- agno/models/sambanova/__init__.py +5 -0
- agno/models/sambanova/sambanova.py +28 -0
- agno/models/siliconflow/__init__.py +5 -0
- agno/models/siliconflow/siliconflow.py +25 -0
- agno/models/together/__init__.py +5 -0
- agno/models/together/together.py +25 -0
- agno/models/utils.py +266 -0
- agno/models/vercel/__init__.py +3 -0
- agno/models/vercel/v0.py +26 -0
- agno/models/vertexai/__init__.py +0 -0
- agno/models/vertexai/claude.py +70 -0
- agno/models/vllm/__init__.py +3 -0
- agno/models/vllm/vllm.py +78 -0
- agno/models/xai/__init__.py +3 -0
- agno/models/xai/xai.py +113 -0
- agno/os/__init__.py +3 -0
- agno/os/app.py +876 -0
- agno/os/auth.py +57 -0
- agno/os/config.py +104 -0
- agno/os/interfaces/__init__.py +1 -0
- agno/os/interfaces/a2a/__init__.py +3 -0
- agno/os/interfaces/a2a/a2a.py +42 -0
- agno/os/interfaces/a2a/router.py +250 -0
- agno/os/interfaces/a2a/utils.py +924 -0
- agno/os/interfaces/agui/__init__.py +3 -0
- agno/os/interfaces/agui/agui.py +47 -0
- agno/os/interfaces/agui/router.py +144 -0
- agno/os/interfaces/agui/utils.py +534 -0
- agno/os/interfaces/base.py +25 -0
- agno/os/interfaces/slack/__init__.py +3 -0
- agno/os/interfaces/slack/router.py +148 -0
- agno/os/interfaces/slack/security.py +30 -0
- agno/os/interfaces/slack/slack.py +47 -0
- agno/os/interfaces/whatsapp/__init__.py +3 -0
- agno/os/interfaces/whatsapp/router.py +211 -0
- agno/os/interfaces/whatsapp/security.py +53 -0
- agno/os/interfaces/whatsapp/whatsapp.py +36 -0
- agno/os/mcp.py +292 -0
- agno/os/middleware/__init__.py +7 -0
- agno/os/middleware/jwt.py +233 -0
- agno/os/router.py +1763 -0
- agno/os/routers/__init__.py +3 -0
- agno/os/routers/evals/__init__.py +3 -0
- agno/os/routers/evals/evals.py +430 -0
- agno/os/routers/evals/schemas.py +142 -0
- agno/os/routers/evals/utils.py +162 -0
- agno/os/routers/health.py +31 -0
- agno/os/routers/home.py +52 -0
- agno/os/routers/knowledge/__init__.py +3 -0
- agno/os/routers/knowledge/knowledge.py +997 -0
- agno/os/routers/knowledge/schemas.py +178 -0
- agno/os/routers/memory/__init__.py +3 -0
- agno/os/routers/memory/memory.py +515 -0
- agno/os/routers/memory/schemas.py +62 -0
- agno/os/routers/metrics/__init__.py +3 -0
- agno/os/routers/metrics/metrics.py +190 -0
- agno/os/routers/metrics/schemas.py +47 -0
- agno/os/routers/session/__init__.py +3 -0
- agno/os/routers/session/session.py +997 -0
- agno/os/schema.py +1055 -0
- agno/os/settings.py +43 -0
- agno/os/utils.py +630 -0
- agno/py.typed +0 -0
- agno/reasoning/__init__.py +0 -0
- agno/reasoning/anthropic.py +80 -0
- agno/reasoning/azure_ai_foundry.py +67 -0
- agno/reasoning/deepseek.py +63 -0
- agno/reasoning/default.py +97 -0
- agno/reasoning/gemini.py +73 -0
- agno/reasoning/groq.py +71 -0
- agno/reasoning/helpers.py +63 -0
- agno/reasoning/ollama.py +67 -0
- agno/reasoning/openai.py +86 -0
- agno/reasoning/step.py +31 -0
- agno/reasoning/vertexai.py +76 -0
- agno/run/__init__.py +6 -0
- agno/run/agent.py +787 -0
- agno/run/base.py +229 -0
- agno/run/cancel.py +81 -0
- agno/run/messages.py +32 -0
- agno/run/team.py +753 -0
- agno/run/workflow.py +708 -0
- agno/session/__init__.py +10 -0
- agno/session/agent.py +295 -0
- agno/session/summary.py +265 -0
- agno/session/team.py +392 -0
- agno/session/workflow.py +205 -0
- agno/team/__init__.py +37 -0
- agno/team/team.py +8793 -0
- agno/tools/__init__.py +10 -0
- agno/tools/agentql.py +120 -0
- agno/tools/airflow.py +69 -0
- agno/tools/api.py +122 -0
- agno/tools/apify.py +314 -0
- agno/tools/arxiv.py +127 -0
- agno/tools/aws_lambda.py +53 -0
- agno/tools/aws_ses.py +66 -0
- agno/tools/baidusearch.py +89 -0
- agno/tools/bitbucket.py +292 -0
- agno/tools/brandfetch.py +213 -0
- agno/tools/bravesearch.py +106 -0
- agno/tools/brightdata.py +367 -0
- agno/tools/browserbase.py +209 -0
- agno/tools/calcom.py +255 -0
- agno/tools/calculator.py +151 -0
- agno/tools/cartesia.py +187 -0
- agno/tools/clickup.py +244 -0
- agno/tools/confluence.py +240 -0
- agno/tools/crawl4ai.py +158 -0
- agno/tools/csv_toolkit.py +185 -0
- agno/tools/dalle.py +110 -0
- agno/tools/daytona.py +475 -0
- agno/tools/decorator.py +262 -0
- agno/tools/desi_vocal.py +108 -0
- agno/tools/discord.py +161 -0
- agno/tools/docker.py +716 -0
- agno/tools/duckdb.py +379 -0
- agno/tools/duckduckgo.py +91 -0
- agno/tools/e2b.py +703 -0
- agno/tools/eleven_labs.py +196 -0
- agno/tools/email.py +67 -0
- agno/tools/evm.py +129 -0
- agno/tools/exa.py +396 -0
- agno/tools/fal.py +127 -0
- agno/tools/file.py +240 -0
- agno/tools/file_generation.py +350 -0
- agno/tools/financial_datasets.py +288 -0
- agno/tools/firecrawl.py +143 -0
- agno/tools/function.py +1187 -0
- agno/tools/giphy.py +93 -0
- agno/tools/github.py +1760 -0
- agno/tools/gmail.py +922 -0
- agno/tools/google_bigquery.py +117 -0
- agno/tools/google_drive.py +270 -0
- agno/tools/google_maps.py +253 -0
- agno/tools/googlecalendar.py +674 -0
- agno/tools/googlesearch.py +98 -0
- agno/tools/googlesheets.py +377 -0
- agno/tools/hackernews.py +77 -0
- agno/tools/jina.py +101 -0
- agno/tools/jira.py +170 -0
- agno/tools/knowledge.py +218 -0
- agno/tools/linear.py +426 -0
- agno/tools/linkup.py +58 -0
- agno/tools/local_file_system.py +90 -0
- agno/tools/lumalab.py +183 -0
- agno/tools/mcp/__init__.py +10 -0
- agno/tools/mcp/mcp.py +331 -0
- agno/tools/mcp/multi_mcp.py +347 -0
- agno/tools/mcp/params.py +24 -0
- agno/tools/mcp_toolbox.py +284 -0
- agno/tools/mem0.py +193 -0
- agno/tools/memori.py +339 -0
- agno/tools/memory.py +419 -0
- agno/tools/mlx_transcribe.py +139 -0
- agno/tools/models/__init__.py +0 -0
- agno/tools/models/azure_openai.py +190 -0
- agno/tools/models/gemini.py +203 -0
- agno/tools/models/groq.py +158 -0
- agno/tools/models/morph.py +186 -0
- agno/tools/models/nebius.py +124 -0
- agno/tools/models_labs.py +195 -0
- agno/tools/moviepy_video.py +349 -0
- agno/tools/neo4j.py +134 -0
- agno/tools/newspaper.py +46 -0
- agno/tools/newspaper4k.py +93 -0
- agno/tools/notion.py +204 -0
- agno/tools/openai.py +202 -0
- agno/tools/openbb.py +160 -0
- agno/tools/opencv.py +321 -0
- agno/tools/openweather.py +233 -0
- agno/tools/oxylabs.py +385 -0
- agno/tools/pandas.py +102 -0
- agno/tools/parallel.py +314 -0
- agno/tools/postgres.py +257 -0
- agno/tools/pubmed.py +188 -0
- agno/tools/python.py +205 -0
- agno/tools/reasoning.py +283 -0
- agno/tools/reddit.py +467 -0
- agno/tools/replicate.py +117 -0
- agno/tools/resend.py +62 -0
- agno/tools/scrapegraph.py +222 -0
- agno/tools/searxng.py +152 -0
- agno/tools/serpapi.py +116 -0
- agno/tools/serper.py +255 -0
- agno/tools/shell.py +53 -0
- agno/tools/slack.py +136 -0
- agno/tools/sleep.py +20 -0
- agno/tools/spider.py +116 -0
- agno/tools/sql.py +154 -0
- agno/tools/streamlit/__init__.py +0 -0
- agno/tools/streamlit/components.py +113 -0
- agno/tools/tavily.py +254 -0
- agno/tools/telegram.py +48 -0
- agno/tools/todoist.py +218 -0
- agno/tools/tool_registry.py +1 -0
- agno/tools/toolkit.py +146 -0
- agno/tools/trafilatura.py +388 -0
- agno/tools/trello.py +274 -0
- agno/tools/twilio.py +186 -0
- agno/tools/user_control_flow.py +78 -0
- agno/tools/valyu.py +228 -0
- agno/tools/visualization.py +467 -0
- agno/tools/webbrowser.py +28 -0
- agno/tools/webex.py +76 -0
- agno/tools/website.py +54 -0
- agno/tools/webtools.py +45 -0
- agno/tools/whatsapp.py +286 -0
- agno/tools/wikipedia.py +63 -0
- agno/tools/workflow.py +278 -0
- agno/tools/x.py +335 -0
- agno/tools/yfinance.py +257 -0
- agno/tools/youtube.py +184 -0
- agno/tools/zendesk.py +82 -0
- agno/tools/zep.py +454 -0
- agno/tools/zoom.py +382 -0
- agno/utils/__init__.py +0 -0
- agno/utils/agent.py +820 -0
- agno/utils/audio.py +49 -0
- agno/utils/certs.py +27 -0
- agno/utils/code_execution.py +11 -0
- agno/utils/common.py +132 -0
- agno/utils/dttm.py +13 -0
- agno/utils/enum.py +22 -0
- agno/utils/env.py +11 -0
- agno/utils/events.py +696 -0
- agno/utils/format_str.py +16 -0
- agno/utils/functions.py +166 -0
- agno/utils/gemini.py +426 -0
- agno/utils/hooks.py +57 -0
- agno/utils/http.py +74 -0
- agno/utils/json_schema.py +234 -0
- agno/utils/knowledge.py +36 -0
- agno/utils/location.py +19 -0
- agno/utils/log.py +255 -0
- agno/utils/mcp.py +214 -0
- agno/utils/media.py +352 -0
- agno/utils/merge_dict.py +41 -0
- agno/utils/message.py +118 -0
- agno/utils/models/__init__.py +0 -0
- agno/utils/models/ai_foundry.py +43 -0
- agno/utils/models/claude.py +358 -0
- agno/utils/models/cohere.py +87 -0
- agno/utils/models/llama.py +78 -0
- agno/utils/models/mistral.py +98 -0
- agno/utils/models/openai_responses.py +140 -0
- agno/utils/models/schema_utils.py +153 -0
- agno/utils/models/watsonx.py +41 -0
- agno/utils/openai.py +257 -0
- agno/utils/pickle.py +32 -0
- agno/utils/pprint.py +178 -0
- agno/utils/print_response/__init__.py +0 -0
- agno/utils/print_response/agent.py +842 -0
- agno/utils/print_response/team.py +1724 -0
- agno/utils/print_response/workflow.py +1668 -0
- agno/utils/prompts.py +111 -0
- agno/utils/reasoning.py +108 -0
- agno/utils/response.py +163 -0
- agno/utils/response_iterator.py +17 -0
- agno/utils/safe_formatter.py +24 -0
- agno/utils/serialize.py +32 -0
- agno/utils/shell.py +22 -0
- agno/utils/streamlit.py +487 -0
- agno/utils/string.py +231 -0
- agno/utils/team.py +139 -0
- agno/utils/timer.py +41 -0
- agno/utils/tools.py +102 -0
- agno/utils/web.py +23 -0
- agno/utils/whatsapp.py +305 -0
- agno/utils/yaml_io.py +25 -0
- agno/vectordb/__init__.py +3 -0
- agno/vectordb/base.py +127 -0
- agno/vectordb/cassandra/__init__.py +5 -0
- agno/vectordb/cassandra/cassandra.py +501 -0
- agno/vectordb/cassandra/extra_param_mixin.py +11 -0
- agno/vectordb/cassandra/index.py +13 -0
- agno/vectordb/chroma/__init__.py +5 -0
- agno/vectordb/chroma/chromadb.py +929 -0
- agno/vectordb/clickhouse/__init__.py +9 -0
- agno/vectordb/clickhouse/clickhousedb.py +835 -0
- agno/vectordb/clickhouse/index.py +9 -0
- agno/vectordb/couchbase/__init__.py +3 -0
- agno/vectordb/couchbase/couchbase.py +1442 -0
- agno/vectordb/distance.py +7 -0
- agno/vectordb/lancedb/__init__.py +6 -0
- agno/vectordb/lancedb/lance_db.py +995 -0
- agno/vectordb/langchaindb/__init__.py +5 -0
- agno/vectordb/langchaindb/langchaindb.py +163 -0
- agno/vectordb/lightrag/__init__.py +5 -0
- agno/vectordb/lightrag/lightrag.py +388 -0
- agno/vectordb/llamaindex/__init__.py +3 -0
- agno/vectordb/llamaindex/llamaindexdb.py +166 -0
- agno/vectordb/milvus/__init__.py +4 -0
- agno/vectordb/milvus/milvus.py +1182 -0
- agno/vectordb/mongodb/__init__.py +9 -0
- agno/vectordb/mongodb/mongodb.py +1417 -0
- agno/vectordb/pgvector/__init__.py +12 -0
- agno/vectordb/pgvector/index.py +23 -0
- agno/vectordb/pgvector/pgvector.py +1462 -0
- agno/vectordb/pineconedb/__init__.py +5 -0
- agno/vectordb/pineconedb/pineconedb.py +747 -0
- agno/vectordb/qdrant/__init__.py +5 -0
- agno/vectordb/qdrant/qdrant.py +1134 -0
- agno/vectordb/redis/__init__.py +9 -0
- agno/vectordb/redis/redisdb.py +694 -0
- agno/vectordb/search.py +7 -0
- agno/vectordb/singlestore/__init__.py +10 -0
- agno/vectordb/singlestore/index.py +41 -0
- agno/vectordb/singlestore/singlestore.py +763 -0
- agno/vectordb/surrealdb/__init__.py +3 -0
- agno/vectordb/surrealdb/surrealdb.py +699 -0
- agno/vectordb/upstashdb/__init__.py +5 -0
- agno/vectordb/upstashdb/upstashdb.py +718 -0
- agno/vectordb/weaviate/__init__.py +8 -0
- agno/vectordb/weaviate/index.py +15 -0
- agno/vectordb/weaviate/weaviate.py +1005 -0
- agno/workflow/__init__.py +23 -0
- agno/workflow/agent.py +299 -0
- agno/workflow/condition.py +738 -0
- agno/workflow/loop.py +735 -0
- agno/workflow/parallel.py +824 -0
- agno/workflow/router.py +702 -0
- agno/workflow/step.py +1432 -0
- agno/workflow/steps.py +592 -0
- agno/workflow/types.py +520 -0
- agno/workflow/workflow.py +4321 -0
- agno-2.2.13.dist-info/METADATA +614 -0
- agno-2.2.13.dist-info/RECORD +575 -0
- agno-2.2.13.dist-info/WHEEL +5 -0
- agno-2.2.13.dist-info/licenses/LICENSE +201 -0
- agno-2.2.13.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,929 @@
|
|
|
1
|
+
import asyncio
|
|
2
|
+
import json
|
|
3
|
+
from hashlib import md5
|
|
4
|
+
from typing import Any, Dict, List, Mapping, Optional, Union, cast
|
|
5
|
+
|
|
6
|
+
try:
|
|
7
|
+
from chromadb import Client as ChromaDbClient
|
|
8
|
+
from chromadb import PersistentClient as PersistentChromaDbClient
|
|
9
|
+
from chromadb.api.client import ClientAPI
|
|
10
|
+
from chromadb.api.models.Collection import Collection
|
|
11
|
+
from chromadb.api.types import QueryResult
|
|
12
|
+
|
|
13
|
+
except ImportError:
|
|
14
|
+
raise ImportError("The `chromadb` package is not installed. Please install it via `pip install chromadb`.")
|
|
15
|
+
|
|
16
|
+
from agno.filters import FilterExpr
|
|
17
|
+
from agno.knowledge.document import Document
|
|
18
|
+
from agno.knowledge.embedder import Embedder
|
|
19
|
+
from agno.knowledge.reranker.base import Reranker
|
|
20
|
+
from agno.utils.log import log_debug, log_error, log_info, log_warning, logger
|
|
21
|
+
from agno.vectordb.base import VectorDb
|
|
22
|
+
from agno.vectordb.distance import Distance
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
class ChromaDb(VectorDb):
|
|
26
|
+
def __init__(
    self,
    collection: str,
    name: Optional[str] = None,
    description: Optional[str] = None,
    id: Optional[str] = None,
    embedder: Optional[Embedder] = None,
    distance: Distance = Distance.cosine,
    path: str = "tmp/chromadb",
    persistent_client: bool = False,
    reranker: Optional[Reranker] = None,
    **kwargs,
):
    """Initialize a ChromaDb vector store.

    Args:
        collection: Name of the ChromaDB collection (required).
        name: Optional human-readable name for this vector db.
        description: Optional description for this vector db.
        id: Optional stable identifier; derived from path and collection when omitted.
        embedder: Embedder used for document contents; defaults to OpenAIEmbedder.
        distance: Distance metric for similarity search.
        path: Filesystem path used by the persistent client.
        persistent_client: When True, use a disk-backed ChromaDB client.
        reranker: Optional reranker applied to search results.
        **kwargs: Extra keyword arguments forwarded to the ChromaDB client.

    Raises:
        ValueError: If no collection name is given.
    """
    # A collection name is mandatory — fail fast if it is missing.
    if not collection:
        raise ValueError("Collection name must be provided.")

    # Derive a deterministic id from the storage path and collection name
    # when the caller does not supply one.
    if id is None:
        from agno.utils.string import generate_id

        id = generate_id(f"{path}#{collection}")

    # Initialize the base class with name, description, and the resolved id.
    super().__init__(id=id, name=name, description=description)

    self.collection_name: str = collection

    # Fall back to OpenAI embeddings when no embedder is configured.
    if embedder is None:
        from agno.knowledge.embedder.openai import OpenAIEmbedder

        embedder = OpenAIEmbedder()
        log_info("Embedder not provided, using OpenAIEmbedder as default.")
    self.embedder: Embedder = embedder

    # Distance metric used when creating the collection.
    self.distance: Distance = distance

    # Lazily-created ChromaDB client and collection handles.
    self._client: Optional[ClientAPI] = None
    self._collection: Optional[Collection] = None

    # Whether to persist data to disk, and where.
    self.persistent_client: bool = persistent_client
    self.path: str = path

    # Optional reranker instance.
    self.reranker: Optional[Reranker] = reranker

    # Extra keyword arguments forwarded to the ChromaDB client constructor.
    self.kwargs = kwargs
def _flatten_metadata(self, metadata: Dict[str, Any]) -> Dict[str, Union[str, int, float, bool]]:
|
|
82
|
+
"""
|
|
83
|
+
Flatten nested metadata to ChromaDB-compatible format.
|
|
84
|
+
|
|
85
|
+
Args:
|
|
86
|
+
metadata: Dictionary that may contain nested structures
|
|
87
|
+
|
|
88
|
+
Returns:
|
|
89
|
+
Flattened dictionary with only primitive values
|
|
90
|
+
"""
|
|
91
|
+
flattened: Dict[str, Any] = {}
|
|
92
|
+
|
|
93
|
+
def _flatten_recursive(obj: Any, prefix: str = "") -> None:
|
|
94
|
+
if isinstance(obj, dict):
|
|
95
|
+
if len(obj) == 0:
|
|
96
|
+
# Handle empty dictionaries by converting to JSON string
|
|
97
|
+
flattened[prefix] = json.dumps(obj)
|
|
98
|
+
else:
|
|
99
|
+
for key, value in obj.items():
|
|
100
|
+
new_key = f"{prefix}.{key}" if prefix else key
|
|
101
|
+
_flatten_recursive(value, new_key)
|
|
102
|
+
elif isinstance(obj, (list, tuple)):
|
|
103
|
+
# Convert lists/tuples to JSON strings
|
|
104
|
+
flattened[prefix] = json.dumps(obj)
|
|
105
|
+
elif isinstance(obj, (str, int, float, bool)) or obj is None:
|
|
106
|
+
if obj is not None: # ChromaDB doesn't accept None values
|
|
107
|
+
flattened[prefix] = obj
|
|
108
|
+
else:
|
|
109
|
+
# Convert other complex types to JSON strings
|
|
110
|
+
try:
|
|
111
|
+
flattened[prefix] = json.dumps(obj)
|
|
112
|
+
except (TypeError, ValueError):
|
|
113
|
+
# If it can't be serialized, convert to string
|
|
114
|
+
flattened[prefix] = str(obj)
|
|
115
|
+
|
|
116
|
+
_flatten_recursive(metadata)
|
|
117
|
+
return flattened
|
|
118
|
+
|
|
119
|
+
@property
def client(self) -> ClientAPI:
    """Lazily build and cache the ChromaDB client (persistent or in-memory)."""
    if self._client is None:
        if self.persistent_client:
            log_debug("Creating Persistent Chroma Client")
            self._client = PersistentChromaDbClient(
                path=self.path,
                **self.kwargs,
            )
        else:
            log_debug("Creating Chroma Client")
            self._client = ChromaDbClient(
                **self.kwargs,
            )
    return self._client
def create(self) -> None:
    """Create the ChromaDB collection, or attach to it if it already exists."""
    if not self.exists():
        log_debug(f"Creating collection: {self.collection_name}")
        self._collection = self.client.create_collection(
            name=self.collection_name, metadata={"hnsw:space": self.distance.value}
        )
    else:
        log_debug(f"Collection already exists: {self.collection_name}")
        self._collection = self.client.get_collection(name=self.collection_name)
async def async_create(self) -> None:
    """Asynchronous variant of create(); delegates to a worker thread."""
    await asyncio.to_thread(self.create)
def name_exists(self, name: str) -> bool:
    """Check if a document with a given name exists in the collection.

    Args:
        name (str): Name of the document to check.

    Returns:
        bool: True if document exists, False otherwise.
    """
    if not self.client:
        logger.warning("Client not initialized")
        return False

    try:
        # Query the collection for at most one document carrying this name.
        target: Collection = self.client.get_collection(name=self.collection_name)
        hits = target.get(where=cast(Any, {"name": {"$eq": name}}), limit=1)
        return bool(hits.get("ids", []))
    except Exception as e:
        logger.error(f"Error checking name existence: {e}")
        return False
async def async_name_exists(self, name: str) -> bool:
    """Asynchronous variant of name_exists(); runs the blocking check in a thread."""
    return await asyncio.to_thread(self.name_exists, name)
def insert(self, content_hash: str, documents: List[Document], filters: Optional[Dict[str, Any]] = None) -> None:
    """Insert documents into the collection.

    Args:
        content_hash (str): Hash of the source content, recorded on each
            document's metadata.
        documents (List[Document]): List of documents to insert
        filters (Optional[Dict[str, Any]]): Filters to merge with document metadata
    """
    log_info(f"Inserting {len(documents)} documents")
    ids: List = []
    docs: List = []
    docs_embeddings: List = []
    docs_metadata: List = []

    if not self._collection:
        self._collection = self.client.get_collection(name=self.collection_name)

    for document in documents:
        document.embed(embedder=self.embedder)
        # Replace NUL bytes, which the storage layer cannot handle.
        cleaned_content = document.content.replace("\x00", "\ufffd")
        # Content-derived id: identical content de-duplicates to one entry.
        doc_id = md5(cleaned_content.encode()).hexdigest()

        # Handle metadata and filters. Copy the document's metadata before
        # enriching it: the previous code aliased document.meta_data and
        # leaked filters/name/content_hash back into the caller's Document.
        metadata = dict(document.meta_data) if document.meta_data else {}
        if filters:
            metadata.update(filters)

        # Add name, content_id to metadata
        if document.name is not None:
            metadata["name"] = document.name
        if document.content_id is not None:
            metadata["content_id"] = document.content_id

        metadata["content_hash"] = content_hash

        # Flatten metadata for ChromaDB compatibility
        flattened_metadata = self._flatten_metadata(metadata)

        docs_embeddings.append(document.embedding)
        docs.append(cleaned_content)
        ids.append(doc_id)
        docs_metadata.append(flattened_metadata)
        log_debug(f"Prepared document: {document.id} | {document.name} | {flattened_metadata}")

    if self._collection is None:
        logger.warning("Collection does not exist")
    else:
        if len(docs) > 0:
            self._collection.add(ids=ids, embeddings=docs_embeddings, documents=docs, metadatas=docs_metadata)
            log_debug(f"Committed {len(docs)} documents")
    async def async_insert(
        self, content_hash: str, documents: List[Document], filters: Optional[Dict[str, Any]] = None
    ) -> None:
        """Insert documents into the collection asynchronously.

        Embeddings are computed with the embedder's async batch API when the
        embedder supports it (falling back to per-document async embedding on
        non-rate-limit failures); the final write to ChromaDB is synchronous.

        Args:
            content_hash (str): Hash stored in each document's metadata under
                ``content_hash``, used later for hash-based lookups/deletes.
            documents (List[Document]): List of documents to insert.
            filters (Optional[Dict[str, Any]]): Filters merged into each
                document's metadata before insertion.
        """
        log_info(f"Async Inserting {len(documents)} documents")
        ids: List = []
        docs: List = []
        docs_embeddings: List = []
        docs_metadata: List = []

        # Lazily resolve the collection handle if it has not been fetched yet.
        if not self._collection:
            self._collection = self.client.get_collection(name=self.collection_name)

        if self.embedder.enable_batch and hasattr(self.embedder, "async_get_embeddings_batch_and_usage"):
            # Use batch embedding when enabled and supported
            try:
                # Extract content from all documents
                doc_contents = [doc.content for doc in documents]

                # Get batch embeddings and usage
                embeddings, usages = await self.embedder.async_get_embeddings_batch_and_usage(doc_contents)

                # Process documents with pre-computed embeddings
                for j, doc in enumerate(documents):
                    try:
                        if j < len(embeddings):
                            doc.embedding = embeddings[j]
                            doc.usage = usages[j] if j < len(usages) else None
                    except Exception as e:
                        logger.error(f"Error assigning batch embedding to document '{doc.name}': {e}")

            except Exception as e:
                # Check if this is a rate limit error - don't fall back as it would make things worse
                error_str = str(e).lower()
                is_rate_limit = any(
                    phrase in error_str
                    for phrase in ["rate limit", "too many requests", "429", "trial key", "api calls / minute"]
                )

                if is_rate_limit:
                    logger.error(f"Rate limit detected during batch embedding. {e}")
                    raise e
                else:
                    logger.warning(f"Async batch embedding failed, falling back to individual embeddings: {e}")
                    # Fall back to individual embedding
                    embed_tasks = [doc.async_embed(embedder=self.embedder) for doc in documents]
                    # return_exceptions=True: a failed embed leaves that
                    # document's embedding unset rather than aborting the batch.
                    await asyncio.gather(*embed_tasks, return_exceptions=True)
        else:
            # Use individual embedding
            try:
                embed_tasks = [document.async_embed(embedder=self.embedder) for document in documents]
                await asyncio.gather(*embed_tasks, return_exceptions=True)
            except Exception as e:
                log_error(f"Error processing document: {e}")

        for document in documents:
            # Replace NUL characters (cannot be stored) and derive a stable
            # document id from the md5 of the cleaned content.
            cleaned_content = document.content.replace("\x00", "\ufffd")
            doc_id = md5(cleaned_content.encode()).hexdigest()

            # Handle metadata and filters
            metadata = document.meta_data or {}
            if filters:
                metadata.update(filters)

            # Add name, content_id to metadata
            if document.name is not None:
                metadata["name"] = document.name
            if document.content_id is not None:
                metadata["content_id"] = document.content_id

            metadata["content_hash"] = content_hash

            # Flatten metadata for ChromaDB compatibility
            flattened_metadata = self._flatten_metadata(metadata)

            docs_embeddings.append(document.embedding)
            docs.append(cleaned_content)
            ids.append(doc_id)
            docs_metadata.append(flattened_metadata)
            log_debug(f"Prepared document: {document.id} | {document.name} | {flattened_metadata}")

        if self._collection is None:
            logger.warning("Collection does not exist")
        else:
            if len(docs) > 0:
                self._collection.add(ids=ids, embeddings=docs_embeddings, documents=docs, metadatas=docs_metadata)
                log_debug(f"Committed {len(docs)} documents")
|
|
309
|
+
|
|
310
|
+
def upsert_available(self) -> bool:
|
|
311
|
+
"""Check if upsert is available in ChromaDB."""
|
|
312
|
+
return True
|
|
313
|
+
|
|
314
|
+
def upsert(self, content_hash: str, documents: List[Document], filters: Optional[Dict[str, Any]] = None) -> None:
|
|
315
|
+
"""Upsert documents into the collection.
|
|
316
|
+
|
|
317
|
+
Args:
|
|
318
|
+
documents (List[Document]): List of documents to upsert
|
|
319
|
+
filters (Optional[Dict[str, Any]]): Filters to apply while upserting
|
|
320
|
+
"""
|
|
321
|
+
try:
|
|
322
|
+
if self.content_hash_exists(content_hash):
|
|
323
|
+
self._delete_by_content_hash(content_hash)
|
|
324
|
+
self._upsert(content_hash, documents, filters)
|
|
325
|
+
except Exception as e:
|
|
326
|
+
logger.error(f"Error upserting documents by content hash: {e}")
|
|
327
|
+
raise
|
|
328
|
+
|
|
329
|
+
    def _upsert(self, content_hash: str, documents: List[Document], filters: Optional[Dict[str, Any]] = None) -> None:
        """Upsert documents into the collection.

        Embeds each document synchronously, derives an md5-based id from its
        cleaned content, and writes the batch via the collection's native
        ``upsert``.

        Args:
            content_hash (str): Hash stored in each document's metadata under
                ``content_hash``.
            documents (List[Document]): List of documents to upsert
            filters (Optional[Dict[str, Any]]): Filters to apply while upserting
        """
        log_info(f"Upserting {len(documents)} documents")
        ids: List = []
        docs: List = []
        docs_embeddings: List = []
        docs_metadata: List = []

        # Lazily resolve the collection handle if it has not been fetched yet.
        if not self._collection:
            self._collection = self.client.get_collection(name=self.collection_name)

        for document in documents:
            document.embed(embedder=self.embedder)
            # Replace NUL characters; the id is the md5 of the cleaned content,
            # so identical content maps to the same record.
            cleaned_content = document.content.replace("\x00", "\ufffd")
            doc_id = md5(cleaned_content.encode()).hexdigest()

            # Handle metadata and filters
            metadata = document.meta_data or {}
            if filters:
                metadata.update(filters)

            # Add name, content_id to metadata
            if document.name is not None:
                metadata["name"] = document.name
            if document.content_id is not None:
                metadata["content_id"] = document.content_id

            metadata["content_hash"] = content_hash

            # Flatten metadata for ChromaDB compatibility
            flattened_metadata = self._flatten_metadata(metadata)

            docs_embeddings.append(document.embedding)
            docs.append(cleaned_content)
            ids.append(doc_id)
            docs_metadata.append(flattened_metadata)
            log_debug(f"Upserted document: {document.id} | {document.name} | {flattened_metadata}")

        if self._collection is None:
            logger.warning("Collection does not exist")
        else:
            if len(docs) > 0:
                self._collection.upsert(ids=ids, embeddings=docs_embeddings, documents=docs, metadatas=docs_metadata)
                log_debug(f"Committed {len(docs)} documents")
|
|
378
|
+
|
|
379
|
+
    async def _async_upsert(
        self, content_hash: str, documents: List[Document], filters: Optional[Dict[str, Any]] = None
    ) -> None:
        """Upsert documents into the collection.

        Embeddings are computed with the embedder's async batch API when the
        embedder supports it (falling back to per-document async embedding on
        non-rate-limit failures); the final ``upsert`` call is synchronous.

        Args:
            content_hash (str): Hash stored in each document's metadata under
                ``content_hash``.
            documents (List[Document]): List of documents to upsert
            filters (Optional[Dict[str, Any]]): Filters to apply while upserting
        """
        log_info(f"Async Upserting {len(documents)} documents")
        ids: List = []
        docs: List = []
        docs_embeddings: List = []
        docs_metadata: List = []

        # Lazily resolve the collection handle if it has not been fetched yet.
        if not self._collection:
            self._collection = self.client.get_collection(name=self.collection_name)

        if self.embedder.enable_batch and hasattr(self.embedder, "async_get_embeddings_batch_and_usage"):
            # Use batch embedding when enabled and supported
            try:
                # Extract content from all documents
                doc_contents = [doc.content for doc in documents]

                # Get batch embeddings and usage
                embeddings, usages = await self.embedder.async_get_embeddings_batch_and_usage(doc_contents)

                # Process documents with pre-computed embeddings
                for j, doc in enumerate(documents):
                    try:
                        if j < len(embeddings):
                            doc.embedding = embeddings[j]
                            doc.usage = usages[j] if j < len(usages) else None
                    except Exception as e:
                        logger.error(f"Error assigning batch embedding to document '{doc.name}': {e}")

            except Exception as e:
                # Check if this is a rate limit error - don't fall back as it would make things worse
                error_str = str(e).lower()
                is_rate_limit = any(
                    phrase in error_str
                    for phrase in ["rate limit", "too many requests", "429", "trial key", "api calls / minute"]
                )

                if is_rate_limit:
                    logger.error(f"Rate limit detected during batch embedding. {e}")
                    raise e
                else:
                    logger.warning(f"Async batch embedding failed, falling back to individual embeddings: {e}")
                    # Fall back to individual embedding
                    embed_tasks = [doc.async_embed(embedder=self.embedder) for doc in documents]
                    # return_exceptions=True: a failed embed leaves that
                    # document's embedding unset rather than aborting the batch.
                    await asyncio.gather(*embed_tasks, return_exceptions=True)
        else:
            # Use individual embedding
            embed_tasks = [document.async_embed(embedder=self.embedder) for document in documents]
            await asyncio.gather(*embed_tasks, return_exceptions=True)

        for document in documents:
            # Replace NUL characters; the id is the md5 of the cleaned content.
            cleaned_content = document.content.replace("\x00", "\ufffd")
            doc_id = md5(cleaned_content.encode()).hexdigest()

            # Handle metadata and filters
            metadata = document.meta_data or {}
            if filters:
                metadata.update(filters)

            # Add name, content_id to metadata
            if document.name is not None:
                metadata["name"] = document.name
            if document.content_id is not None:
                metadata["content_id"] = document.content_id

            metadata["content_hash"] = content_hash

            # Flatten metadata for ChromaDB compatibility
            flattened_metadata = self._flatten_metadata(metadata)

            docs_embeddings.append(document.embedding)
            docs.append(cleaned_content)
            ids.append(doc_id)
            docs_metadata.append(flattened_metadata)
            log_debug(f"Upserted document: {document.id} | {document.name} | {flattened_metadata}")

        if self._collection is None:
            logger.warning("Collection does not exist")
        else:
            if len(docs) > 0:
                self._collection.upsert(ids=ids, embeddings=docs_embeddings, documents=docs, metadatas=docs_metadata)
                log_debug(f"Committed {len(docs)} documents")
|
|
468
|
+
|
|
469
|
+
async def async_upsert(
|
|
470
|
+
self, content_hash: str, documents: List[Document], filters: Optional[Dict[str, Any]] = None
|
|
471
|
+
) -> None:
|
|
472
|
+
"""Upsert documents asynchronously by running in a thread."""
|
|
473
|
+
try:
|
|
474
|
+
if self.content_hash_exists(content_hash):
|
|
475
|
+
self._delete_by_content_hash(content_hash)
|
|
476
|
+
await self._async_upsert(content_hash, documents, filters)
|
|
477
|
+
except Exception as e:
|
|
478
|
+
logger.error(f"Error upserting documents by content hash: {e}")
|
|
479
|
+
raise
|
|
480
|
+
|
|
481
|
+
    def search(
        self, query: str, limit: int = 5, filters: Optional[Union[Dict[str, Any], List[FilterExpr]]] = None
    ) -> List[Document]:
        """Search the collection for a query.

        Args:
            query (str): Query to search for.
            limit (int): Number of results to return.
            filters (Optional[Union[Dict[str, Any], List[FilterExpr]]]): Filters to apply while searching.
                Supports ChromaDB's filtering operators:
                - $eq, $ne: Equality/Inequality
                - $gt, $gte, $lt, $lte: Numeric comparisons
                - $in, $nin: List inclusion/exclusion
                - $and, $or: Logical operators
        Returns:
            List[Document]: List of search results.
        """
        # FilterExpr lists are not supported by this backend; fall back to
        # an unfiltered search with a warning.
        if isinstance(filters, list):
            log_warning("Filter Expressions are not yet supported in ChromaDB. No filters will be applied.")
            filters = None
        query_embedding = self.embedder.get_embedding(query)
        if query_embedding is None:
            logger.error(f"Error getting embedding for Query: {query}")
            return []

        # Lazily resolve the collection handle if it has not been fetched yet.
        if not self._collection:
            self._collection = self.client.get_collection(name=self.collection_name)

        # Convert simple filters to ChromaDB's format if needed
        where_filter = self._convert_filters(filters) if filters else None

        result: QueryResult = self._collection.query(
            query_embeddings=query_embedding,
            n_results=limit,
            where=where_filter,  # Add where filter
            include=["metadatas", "documents", "embeddings", "distances", "uris"],
        )

        # Build search results
        search_results: List[Document] = []

        # ChromaDB returns each field as a list of per-query lists; with a
        # single query embedding only index [0] of each is relevant.
        ids_list = result.get("ids", [[]])  # type: ignore
        metadata_list = result.get("metadatas", [[{}]])  # type: ignore
        documents_list = result.get("documents", [[]])  # type: ignore
        embeddings_list = result.get("embeddings")  # type: ignore
        distances_list = result.get("distances", [[]])  # type: ignore

        if not ids_list or not metadata_list or not documents_list or embeddings_list is None or not distances_list:
            return search_results

        ids = ids_list[0]
        metadata = [dict(m) if m else {} for m in metadata_list[0]]  # Convert to mutable dicts
        documents = documents_list[0]
        embeddings_raw = embeddings_list[0] if embeddings_list else []
        # Normalize each embedding to a plain list of floats: handles numpy
        # arrays (tolist), lists/tuples, bare scalars, and anything else
        # (which degrades to an empty list).
        embeddings = []
        for e in embeddings_raw:
            if hasattr(e, "tolist") and callable(getattr(e, "tolist", None)):
                try:
                    embeddings.append(list(cast(Any, e).tolist()))
                except (AttributeError, TypeError):
                    embeddings.append(list(e) if isinstance(e, (list, tuple)) else [])
            elif isinstance(e, (list, tuple)):
                embeddings.append([float(x) for x in e if isinstance(x, (int, float))])
            elif isinstance(e, (int, float)):
                embeddings.append([float(e)])
            else:
                embeddings.append([])
        distances = distances_list[0]

        # Surface each hit's distance through its metadata.
        for idx, distance in enumerate(distances):
            if idx < len(metadata):
                metadata[idx]["distances"] = distance

        try:
            for idx, (id_, doc_metadata, document) in enumerate(zip(ids, metadata, documents)):
                # Extract the fields we added to metadata
                name_val = doc_metadata.pop("name", None)
                content_id_val = doc_metadata.pop("content_id", None)

                # Convert types to match Document constructor expectations
                name = str(name_val) if name_val is not None and not isinstance(name_val, str) else name_val
                content_id = (
                    str(content_id_val)
                    if content_id_val is not None and not isinstance(content_id_val, str)
                    else content_id_val
                )
                content = str(document) if document is not None else ""
                embedding = embeddings[idx] if idx < len(embeddings) else None

                search_results.append(
                    Document(
                        id=id_,
                        name=name,
                        meta_data=doc_metadata,
                        content=content,
                        embedding=embedding,
                        content_id=content_id,
                    )
                )
        except Exception as e:
            logger.error(f"Error building search results: {e}")

        # Optionally re-rank results with the configured reranker.
        if self.reranker:
            search_results = self.reranker.rerank(query=query, documents=search_results)

        log_info(f"Found {len(search_results)} documents")
        return search_results
|
|
588
|
+
|
|
589
|
+
def _convert_filters(self, filters: Dict[str, Any]) -> Dict[str, Any]:
|
|
590
|
+
"""Convert simple filters to ChromaDB's filter format.
|
|
591
|
+
|
|
592
|
+
Handles conversion of simple key-value filters to ChromaDB's operator format
|
|
593
|
+
when needed.
|
|
594
|
+
"""
|
|
595
|
+
if not filters:
|
|
596
|
+
return {}
|
|
597
|
+
|
|
598
|
+
# If filters already use ChromaDB operators ($eq, $ne, etc.), return as is
|
|
599
|
+
if any(key.startswith("$") for key in filters.keys()):
|
|
600
|
+
return filters
|
|
601
|
+
|
|
602
|
+
# Convert simple key-value pairs to ChromaDB's format
|
|
603
|
+
converted = {}
|
|
604
|
+
for key, value in filters.items():
|
|
605
|
+
if isinstance(value, (list, tuple)):
|
|
606
|
+
# Convert lists to $in operator
|
|
607
|
+
converted[key] = {"$in": list(value)}
|
|
608
|
+
else:
|
|
609
|
+
# Convert simple equality to $eq
|
|
610
|
+
converted[key] = {"$eq": value}
|
|
611
|
+
|
|
612
|
+
return converted
|
|
613
|
+
|
|
614
|
+
async def async_search(
|
|
615
|
+
self, query: str, limit: int = 5, filters: Optional[Union[Dict[str, Any], List[FilterExpr]]] = None
|
|
616
|
+
) -> List[Document]:
|
|
617
|
+
"""Search asynchronously by running in a thread."""
|
|
618
|
+
return await asyncio.to_thread(self.search, query, limit, filters)
|
|
619
|
+
|
|
620
|
+
def drop(self) -> None:
|
|
621
|
+
"""Delete the collection."""
|
|
622
|
+
if self.exists():
|
|
623
|
+
log_debug(f"Deleting collection: {self.collection_name}")
|
|
624
|
+
self.client.delete_collection(name=self.collection_name)
|
|
625
|
+
|
|
626
|
+
async def async_drop(self) -> None:
|
|
627
|
+
"""Drop the collection asynchronously by running in a thread."""
|
|
628
|
+
await asyncio.to_thread(self.drop)
|
|
629
|
+
|
|
630
|
+
def exists(self) -> bool:
|
|
631
|
+
"""Check if the collection exists."""
|
|
632
|
+
try:
|
|
633
|
+
self.client.get_collection(name=self.collection_name)
|
|
634
|
+
return True
|
|
635
|
+
except Exception as e:
|
|
636
|
+
log_debug(f"Collection does not exist: {e}")
|
|
637
|
+
return False
|
|
638
|
+
|
|
639
|
+
async def async_exists(self) -> bool:
|
|
640
|
+
"""Check if collection exists asynchronously by running in a thread."""
|
|
641
|
+
return await asyncio.to_thread(self.exists)
|
|
642
|
+
|
|
643
|
+
def get_count(self) -> int:
|
|
644
|
+
"""Get the count of documents in the collection."""
|
|
645
|
+
if self.exists():
|
|
646
|
+
try:
|
|
647
|
+
collection: Collection = self.client.get_collection(name=self.collection_name)
|
|
648
|
+
return collection.count()
|
|
649
|
+
except Exception as e:
|
|
650
|
+
logger.error(f"Error getting count: {e}")
|
|
651
|
+
return 0
|
|
652
|
+
|
|
653
|
+
    def optimize(self) -> None:
        """Optimize the collection's storage.

        Not supported for ChromaDB; always raises NotImplementedError.
        """
        raise NotImplementedError
|
|
655
|
+
|
|
656
|
+
def delete(self) -> bool:
|
|
657
|
+
try:
|
|
658
|
+
self.client.delete_collection(name=self.collection_name)
|
|
659
|
+
return True
|
|
660
|
+
except Exception as e:
|
|
661
|
+
logger.error(f"Error clearing collection: {e}")
|
|
662
|
+
return False
|
|
663
|
+
|
|
664
|
+
def delete_by_id(self, id: str) -> bool:
|
|
665
|
+
"""Delete document by ID."""
|
|
666
|
+
if not self.client:
|
|
667
|
+
logger.error("Client not initialized")
|
|
668
|
+
return False
|
|
669
|
+
|
|
670
|
+
try:
|
|
671
|
+
collection: Collection = self.client.get_collection(name=self.collection_name)
|
|
672
|
+
|
|
673
|
+
# Check if document exists
|
|
674
|
+
if not self.id_exists(id):
|
|
675
|
+
log_info(f"Document with ID '{id}' not found")
|
|
676
|
+
return False
|
|
677
|
+
|
|
678
|
+
# Delete the document
|
|
679
|
+
collection.delete(ids=[id])
|
|
680
|
+
log_info(f"Deleted document with ID '{id}'")
|
|
681
|
+
return True
|
|
682
|
+
except Exception as e:
|
|
683
|
+
logger.error(f"Error deleting document by ID '{id}': {e}")
|
|
684
|
+
return False
|
|
685
|
+
|
|
686
|
+
def delete_by_name(self, name: str) -> bool:
|
|
687
|
+
"""Delete documents by name."""
|
|
688
|
+
if not self.client:
|
|
689
|
+
logger.error("Client not initialized")
|
|
690
|
+
return False
|
|
691
|
+
|
|
692
|
+
try:
|
|
693
|
+
collection: Collection = self.client.get_collection(name=self.collection_name)
|
|
694
|
+
|
|
695
|
+
# Find all documents with the given name
|
|
696
|
+
result = collection.get(where=cast(Any, {"name": {"$eq": name}}))
|
|
697
|
+
ids_to_delete = result.get("ids", [])
|
|
698
|
+
|
|
699
|
+
if not ids_to_delete:
|
|
700
|
+
log_info(f"No documents found with name '{name}'")
|
|
701
|
+
return False
|
|
702
|
+
|
|
703
|
+
# Delete all matching documents
|
|
704
|
+
collection.delete(ids=ids_to_delete)
|
|
705
|
+
log_info(f"Deleted {len(ids_to_delete)} documents with name '{name}'")
|
|
706
|
+
return True
|
|
707
|
+
except Exception as e:
|
|
708
|
+
logger.error(f"Error deleting documents by name '{name}': {e}")
|
|
709
|
+
return False
|
|
710
|
+
|
|
711
|
+
def delete_by_metadata(self, metadata: Dict[str, Any]) -> bool:
|
|
712
|
+
"""Delete documents by metadata."""
|
|
713
|
+
if not self.client:
|
|
714
|
+
logger.error("Client not initialized")
|
|
715
|
+
return False
|
|
716
|
+
|
|
717
|
+
try:
|
|
718
|
+
collection: Collection = self.client.get_collection(name=self.collection_name)
|
|
719
|
+
|
|
720
|
+
# Build where clause for metadata filtering
|
|
721
|
+
where_clause = {}
|
|
722
|
+
for key, value in metadata.items():
|
|
723
|
+
where_clause[key] = {"$eq": value}
|
|
724
|
+
|
|
725
|
+
# Find all documents with the matching metadata
|
|
726
|
+
result = collection.get(where=cast(Any, where_clause))
|
|
727
|
+
ids_to_delete = result.get("ids", [])
|
|
728
|
+
|
|
729
|
+
if not ids_to_delete:
|
|
730
|
+
log_info(f"No documents found with metadata '{metadata}'")
|
|
731
|
+
return False
|
|
732
|
+
|
|
733
|
+
# Delete all matching documents
|
|
734
|
+
collection.delete(ids=ids_to_delete)
|
|
735
|
+
log_info(f"Deleted {len(ids_to_delete)} documents with metadata '{metadata}'")
|
|
736
|
+
return True
|
|
737
|
+
except Exception as e:
|
|
738
|
+
logger.error(f"Error deleting documents by metadata '{metadata}': {e}")
|
|
739
|
+
return False
|
|
740
|
+
|
|
741
|
+
def delete_by_content_id(self, content_id: str) -> bool:
|
|
742
|
+
"""Delete documents by content ID."""
|
|
743
|
+
if not self.client:
|
|
744
|
+
logger.error("Client not initialized")
|
|
745
|
+
return False
|
|
746
|
+
|
|
747
|
+
try:
|
|
748
|
+
collection: Collection = self.client.get_collection(name=self.collection_name)
|
|
749
|
+
|
|
750
|
+
# Find all documents with the given content_id
|
|
751
|
+
result = collection.get(where=cast(Any, {"content_id": {"$eq": content_id}}))
|
|
752
|
+
ids_to_delete = result.get("ids", [])
|
|
753
|
+
|
|
754
|
+
if not ids_to_delete:
|
|
755
|
+
log_info(f"No documents found with content_id '{content_id}'")
|
|
756
|
+
return False
|
|
757
|
+
|
|
758
|
+
# Delete all matching documents
|
|
759
|
+
collection.delete(ids=ids_to_delete)
|
|
760
|
+
log_info(f"Deleted {len(ids_to_delete)} documents with content_id '{content_id}'")
|
|
761
|
+
return True
|
|
762
|
+
except Exception as e:
|
|
763
|
+
logger.error(f"Error deleting documents by content_id '{content_id}': {e}")
|
|
764
|
+
return False
|
|
765
|
+
|
|
766
|
+
def _delete_by_content_hash(self, content_hash: str) -> bool:
|
|
767
|
+
"""Delete documents by content hash."""
|
|
768
|
+
if not self.client:
|
|
769
|
+
logger.error("Client not initialized")
|
|
770
|
+
return False
|
|
771
|
+
|
|
772
|
+
try:
|
|
773
|
+
collection: Collection = self.client.get_collection(name=self.collection_name)
|
|
774
|
+
|
|
775
|
+
# Find all documents with the given content_hash
|
|
776
|
+
result = collection.get(where=cast(Any, {"content_hash": {"$eq": content_hash}}))
|
|
777
|
+
ids_to_delete = result.get("ids", [])
|
|
778
|
+
|
|
779
|
+
if not ids_to_delete:
|
|
780
|
+
log_info(f"No documents found with content_hash '{content_hash}'")
|
|
781
|
+
return False
|
|
782
|
+
|
|
783
|
+
# Delete all matching documents
|
|
784
|
+
collection.delete(ids=ids_to_delete)
|
|
785
|
+
log_info(f"Deleted {len(ids_to_delete)} documents with content_hash '{content_hash}'")
|
|
786
|
+
return True
|
|
787
|
+
except Exception as e:
|
|
788
|
+
logger.error(f"Error deleting documents by content_hash '{content_hash}': {e}")
|
|
789
|
+
return False
|
|
790
|
+
|
|
791
|
+
def id_exists(self, id: str) -> bool:
|
|
792
|
+
"""Check if a document with the given ID exists in the collection.
|
|
793
|
+
|
|
794
|
+
Args:
|
|
795
|
+
id (str): The document ID to check.
|
|
796
|
+
|
|
797
|
+
Returns:
|
|
798
|
+
bool: True if the document exists, False otherwise.
|
|
799
|
+
"""
|
|
800
|
+
if not self.client:
|
|
801
|
+
logger.error("Client not initialized")
|
|
802
|
+
return False
|
|
803
|
+
|
|
804
|
+
try:
|
|
805
|
+
collection: Collection = self.client.get_collection(name=self.collection_name)
|
|
806
|
+
# Try to get the document by ID
|
|
807
|
+
result = collection.get(ids=[id])
|
|
808
|
+
found_ids = result.get("ids", [])
|
|
809
|
+
|
|
810
|
+
# Return True if the document was found
|
|
811
|
+
return len(found_ids) > 0
|
|
812
|
+
except Exception as e:
|
|
813
|
+
logger.error(f"Error checking if ID '{id}' exists: {e}")
|
|
814
|
+
return False
|
|
815
|
+
|
|
816
|
+
    def content_hash_exists(self, content_hash: str) -> bool:
        """Check if documents with the given content hash exist.

        Args:
            content_hash (str): The ``content_hash`` metadata value to look for.

        Returns:
            bool: True if at least one matching document exists; False on any
            failure (errors are treated as "does not exist").
        """
        if not self.client:
            logger.error("Client not initialized")
            return False

        try:
            collection: Collection = self.client.get_collection(name=self.collection_name)

            # Try to query for documents with the given content_hash
            try:
                result = collection.get(where=cast(Any, {"content_hash": {"$eq": content_hash}}))
                # Safely extract ids from result
                # (result may be a dict-like with .get, a mapping with only
                # __getitem__, or something else entirely across versions).
                if hasattr(result, "get") and callable(result.get):
                    found_ids = result.get("ids", [])
                elif hasattr(result, "__getitem__") and "ids" in result:
                    found_ids = result["ids"]
                else:
                    found_ids = []

                # Return True if any documents were found
                if isinstance(found_ids, (list, tuple)):
                    return len(found_ids) > 0
                elif isinstance(found_ids, int):
                    # Some ChromaDB versions might return a count instead of a list
                    return found_ids > 0
                else:
                    return False

            except TypeError as te:
                if "object of type 'int' has no len()" in str(te):
                    # Known issue with ChromaDB 0.5.0 - internal bug
                    # As a workaround, assume content doesn't exist to allow processing to continue
                    logger.warning(
                        f"ChromaDB internal error (version 0.5.0 bug): {te}. Assuming content_hash '{content_hash}' does not exist."
                    )
                    return False
                else:
                    raise te

        except Exception as e:
            logger.error(f"Error checking if content_hash '{content_hash}' exists: {e}")
            return False
|
|
859
|
+
|
|
860
|
+
def update_metadata(self, content_id: str, metadata: Dict[str, Any]) -> None:
|
|
861
|
+
"""
|
|
862
|
+
Update the metadata for documents with the given content_id.
|
|
863
|
+
|
|
864
|
+
Args:
|
|
865
|
+
content_id (str): The content ID to update
|
|
866
|
+
metadata (Dict[str, Any]): The metadata to update
|
|
867
|
+
"""
|
|
868
|
+
try:
|
|
869
|
+
if not self.client:
|
|
870
|
+
logger.error("Client not initialized")
|
|
871
|
+
return
|
|
872
|
+
|
|
873
|
+
collection: Collection = self.client.get_collection(name=self.collection_name)
|
|
874
|
+
|
|
875
|
+
# Find documents with the given content_id
|
|
876
|
+
try:
|
|
877
|
+
result = collection.get(where=cast(Any, {"content_id": {"$eq": content_id}}))
|
|
878
|
+
|
|
879
|
+
# Extract IDs and current metadata
|
|
880
|
+
if hasattr(result, "get") and callable(result.get):
|
|
881
|
+
ids = result.get("ids", [])
|
|
882
|
+
current_metadatas = result.get("metadatas", [])
|
|
883
|
+
elif hasattr(result, "__getitem__"):
|
|
884
|
+
ids = result.get("ids", []) if "ids" in result else []
|
|
885
|
+
current_metadatas = result.get("metadatas", []) if "metadatas" in result else []
|
|
886
|
+
else:
|
|
887
|
+
ids = []
|
|
888
|
+
current_metadatas = []
|
|
889
|
+
|
|
890
|
+
if not ids:
|
|
891
|
+
logger.debug(f"No documents found with content_id: {content_id}")
|
|
892
|
+
return
|
|
893
|
+
|
|
894
|
+
# Flatten the new metadata first
|
|
895
|
+
flattened_new_metadata = self._flatten_metadata(metadata)
|
|
896
|
+
|
|
897
|
+
# Merge metadata for each document
|
|
898
|
+
updated_metadatas = []
|
|
899
|
+
for i, current_meta in enumerate(current_metadatas or []):
|
|
900
|
+
if current_meta is None:
|
|
901
|
+
meta_dict: Dict[str, Any] = {}
|
|
902
|
+
else:
|
|
903
|
+
meta_dict = dict(current_meta) # Convert Mapping to dict
|
|
904
|
+
|
|
905
|
+
# Update with flattened metadata
|
|
906
|
+
meta_dict.update(flattened_new_metadata)
|
|
907
|
+
updated_metadatas.append(meta_dict)
|
|
908
|
+
|
|
909
|
+
# Convert to the expected type for ChromaDB
|
|
910
|
+
chroma_metadatas = cast(List[Mapping[str, Union[str, int, float, bool]]], updated_metadatas)
|
|
911
|
+
collection.update(ids=ids, metadatas=chroma_metadatas) # type: ignore
|
|
912
|
+
logger.debug(f"Updated metadata for {len(ids)} documents with content_id: {content_id}")
|
|
913
|
+
|
|
914
|
+
except TypeError as te:
|
|
915
|
+
if "object of type 'int' has no len()" in str(te):
|
|
916
|
+
logger.warning(
|
|
917
|
+
f"ChromaDB internal error (version 0.5.0 bug): {te}. Cannot update metadata for content_id '{content_id}'."
|
|
918
|
+
)
|
|
919
|
+
return
|
|
920
|
+
else:
|
|
921
|
+
raise te
|
|
922
|
+
|
|
923
|
+
except Exception as e:
|
|
924
|
+
logger.error(f"Error updating metadata for content_id '{content_id}': {e}")
|
|
925
|
+
raise
|
|
926
|
+
|
|
927
|
+
def get_supported_search_types(self) -> List[str]:
|
|
928
|
+
"""Get the supported search types for this vector database."""
|
|
929
|
+
return [] # ChromaDb doesn't use SearchType enum
|