agno 0.1.2__py3-none-any.whl → 2.3.13__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- agno/__init__.py +8 -0
- agno/agent/__init__.py +44 -5
- agno/agent/agent.py +10531 -2975
- agno/api/agent.py +14 -53
- agno/api/api.py +7 -46
- agno/api/evals.py +22 -0
- agno/api/os.py +17 -0
- agno/api/routes.py +6 -25
- agno/api/schemas/__init__.py +9 -0
- agno/api/schemas/agent.py +6 -9
- agno/api/schemas/evals.py +16 -0
- agno/api/schemas/os.py +14 -0
- agno/api/schemas/team.py +10 -10
- agno/api/schemas/utils.py +21 -0
- agno/api/schemas/workflows.py +16 -0
- agno/api/settings.py +53 -0
- agno/api/team.py +22 -26
- agno/api/workflow.py +28 -0
- agno/cloud/aws/base.py +214 -0
- agno/cloud/aws/s3/__init__.py +2 -0
- agno/cloud/aws/s3/api_client.py +43 -0
- agno/cloud/aws/s3/bucket.py +195 -0
- agno/cloud/aws/s3/object.py +57 -0
- agno/compression/__init__.py +3 -0
- agno/compression/manager.py +247 -0
- agno/culture/__init__.py +3 -0
- agno/culture/manager.py +956 -0
- agno/db/__init__.py +24 -0
- agno/db/async_postgres/__init__.py +3 -0
- agno/db/base.py +946 -0
- agno/db/dynamo/__init__.py +3 -0
- agno/db/dynamo/dynamo.py +2781 -0
- agno/db/dynamo/schemas.py +442 -0
- agno/db/dynamo/utils.py +743 -0
- agno/db/firestore/__init__.py +3 -0
- agno/db/firestore/firestore.py +2379 -0
- agno/db/firestore/schemas.py +181 -0
- agno/db/firestore/utils.py +376 -0
- agno/db/gcs_json/__init__.py +3 -0
- agno/db/gcs_json/gcs_json_db.py +1791 -0
- agno/db/gcs_json/utils.py +228 -0
- agno/db/in_memory/__init__.py +3 -0
- agno/db/in_memory/in_memory_db.py +1312 -0
- agno/db/in_memory/utils.py +230 -0
- agno/db/json/__init__.py +3 -0
- agno/db/json/json_db.py +1777 -0
- agno/db/json/utils.py +230 -0
- agno/db/migrations/manager.py +199 -0
- agno/db/migrations/v1_to_v2.py +635 -0
- agno/db/migrations/versions/v2_3_0.py +938 -0
- agno/db/mongo/__init__.py +17 -0
- agno/db/mongo/async_mongo.py +2760 -0
- agno/db/mongo/mongo.py +2597 -0
- agno/db/mongo/schemas.py +119 -0
- agno/db/mongo/utils.py +276 -0
- agno/db/mysql/__init__.py +4 -0
- agno/db/mysql/async_mysql.py +2912 -0
- agno/db/mysql/mysql.py +2923 -0
- agno/db/mysql/schemas.py +186 -0
- agno/db/mysql/utils.py +488 -0
- agno/db/postgres/__init__.py +4 -0
- agno/db/postgres/async_postgres.py +2579 -0
- agno/db/postgres/postgres.py +2870 -0
- agno/db/postgres/schemas.py +187 -0
- agno/db/postgres/utils.py +442 -0
- agno/db/redis/__init__.py +3 -0
- agno/db/redis/redis.py +2141 -0
- agno/db/redis/schemas.py +159 -0
- agno/db/redis/utils.py +346 -0
- agno/db/schemas/__init__.py +4 -0
- agno/db/schemas/culture.py +120 -0
- agno/db/schemas/evals.py +34 -0
- agno/db/schemas/knowledge.py +40 -0
- agno/db/schemas/memory.py +61 -0
- agno/db/singlestore/__init__.py +3 -0
- agno/db/singlestore/schemas.py +179 -0
- agno/db/singlestore/singlestore.py +2877 -0
- agno/db/singlestore/utils.py +384 -0
- agno/db/sqlite/__init__.py +4 -0
- agno/db/sqlite/async_sqlite.py +2911 -0
- agno/db/sqlite/schemas.py +181 -0
- agno/db/sqlite/sqlite.py +2908 -0
- agno/db/sqlite/utils.py +429 -0
- agno/db/surrealdb/__init__.py +3 -0
- agno/db/surrealdb/metrics.py +292 -0
- agno/db/surrealdb/models.py +334 -0
- agno/db/surrealdb/queries.py +71 -0
- agno/db/surrealdb/surrealdb.py +1908 -0
- agno/db/surrealdb/utils.py +147 -0
- agno/db/utils.py +118 -0
- agno/eval/__init__.py +24 -0
- agno/eval/accuracy.py +666 -276
- agno/eval/agent_as_judge.py +861 -0
- agno/eval/base.py +29 -0
- agno/eval/performance.py +779 -0
- agno/eval/reliability.py +241 -62
- agno/eval/utils.py +120 -0
- agno/exceptions.py +143 -1
- agno/filters.py +354 -0
- agno/guardrails/__init__.py +6 -0
- agno/guardrails/base.py +19 -0
- agno/guardrails/openai.py +144 -0
- agno/guardrails/pii.py +94 -0
- agno/guardrails/prompt_injection.py +52 -0
- agno/hooks/__init__.py +3 -0
- agno/hooks/decorator.py +164 -0
- agno/integrations/discord/__init__.py +3 -0
- agno/integrations/discord/client.py +203 -0
- agno/knowledge/__init__.py +5 -1
- agno/{document → knowledge}/chunking/agentic.py +22 -14
- agno/{document → knowledge}/chunking/document.py +2 -2
- agno/{document → knowledge}/chunking/fixed.py +7 -6
- agno/knowledge/chunking/markdown.py +151 -0
- agno/{document → knowledge}/chunking/recursive.py +15 -3
- agno/knowledge/chunking/row.py +39 -0
- agno/knowledge/chunking/semantic.py +91 -0
- agno/knowledge/chunking/strategy.py +165 -0
- agno/knowledge/content.py +74 -0
- agno/knowledge/document/__init__.py +5 -0
- agno/{document → knowledge/document}/base.py +12 -2
- agno/knowledge/embedder/__init__.py +5 -0
- agno/knowledge/embedder/aws_bedrock.py +343 -0
- agno/knowledge/embedder/azure_openai.py +210 -0
- agno/{embedder → knowledge/embedder}/base.py +8 -0
- agno/knowledge/embedder/cohere.py +323 -0
- agno/knowledge/embedder/fastembed.py +62 -0
- agno/{embedder → knowledge/embedder}/fireworks.py +1 -1
- agno/knowledge/embedder/google.py +258 -0
- agno/knowledge/embedder/huggingface.py +94 -0
- agno/knowledge/embedder/jina.py +182 -0
- agno/knowledge/embedder/langdb.py +22 -0
- agno/knowledge/embedder/mistral.py +206 -0
- agno/knowledge/embedder/nebius.py +13 -0
- agno/knowledge/embedder/ollama.py +154 -0
- agno/knowledge/embedder/openai.py +195 -0
- agno/knowledge/embedder/sentence_transformer.py +63 -0
- agno/{embedder → knowledge/embedder}/together.py +1 -1
- agno/knowledge/embedder/vllm.py +262 -0
- agno/knowledge/embedder/voyageai.py +165 -0
- agno/knowledge/knowledge.py +3006 -0
- agno/knowledge/reader/__init__.py +7 -0
- agno/knowledge/reader/arxiv_reader.py +81 -0
- agno/knowledge/reader/base.py +95 -0
- agno/knowledge/reader/csv_reader.py +164 -0
- agno/knowledge/reader/docx_reader.py +82 -0
- agno/knowledge/reader/field_labeled_csv_reader.py +290 -0
- agno/knowledge/reader/firecrawl_reader.py +201 -0
- agno/knowledge/reader/json_reader.py +88 -0
- agno/knowledge/reader/markdown_reader.py +137 -0
- agno/knowledge/reader/pdf_reader.py +431 -0
- agno/knowledge/reader/pptx_reader.py +101 -0
- agno/knowledge/reader/reader_factory.py +313 -0
- agno/knowledge/reader/s3_reader.py +89 -0
- agno/knowledge/reader/tavily_reader.py +193 -0
- agno/knowledge/reader/text_reader.py +127 -0
- agno/knowledge/reader/web_search_reader.py +325 -0
- agno/knowledge/reader/website_reader.py +455 -0
- agno/knowledge/reader/wikipedia_reader.py +91 -0
- agno/knowledge/reader/youtube_reader.py +78 -0
- agno/knowledge/remote_content/remote_content.py +88 -0
- agno/knowledge/reranker/__init__.py +3 -0
- agno/{reranker → knowledge/reranker}/base.py +1 -1
- agno/{reranker → knowledge/reranker}/cohere.py +2 -2
- agno/knowledge/reranker/infinity.py +195 -0
- agno/knowledge/reranker/sentence_transformer.py +54 -0
- agno/knowledge/types.py +39 -0
- agno/knowledge/utils.py +234 -0
- agno/media.py +439 -95
- agno/memory/__init__.py +16 -3
- agno/memory/manager.py +1474 -123
- agno/memory/strategies/__init__.py +15 -0
- agno/memory/strategies/base.py +66 -0
- agno/memory/strategies/summarize.py +196 -0
- agno/memory/strategies/types.py +37 -0
- agno/models/aimlapi/__init__.py +5 -0
- agno/models/aimlapi/aimlapi.py +62 -0
- agno/models/anthropic/__init__.py +4 -0
- agno/models/anthropic/claude.py +960 -496
- agno/models/aws/__init__.py +15 -0
- agno/models/aws/bedrock.py +686 -451
- agno/models/aws/claude.py +190 -183
- agno/models/azure/__init__.py +18 -1
- agno/models/azure/ai_foundry.py +489 -0
- agno/models/azure/openai_chat.py +89 -40
- agno/models/base.py +2477 -550
- agno/models/cerebras/__init__.py +12 -0
- agno/models/cerebras/cerebras.py +565 -0
- agno/models/cerebras/cerebras_openai.py +131 -0
- agno/models/cohere/__init__.py +4 -0
- agno/models/cohere/chat.py +306 -492
- agno/models/cometapi/__init__.py +5 -0
- agno/models/cometapi/cometapi.py +74 -0
- agno/models/dashscope/__init__.py +5 -0
- agno/models/dashscope/dashscope.py +90 -0
- agno/models/deepinfra/__init__.py +5 -0
- agno/models/deepinfra/deepinfra.py +45 -0
- agno/models/deepseek/__init__.py +4 -0
- agno/models/deepseek/deepseek.py +110 -9
- agno/models/fireworks/__init__.py +4 -0
- agno/models/fireworks/fireworks.py +19 -22
- agno/models/google/__init__.py +3 -7
- agno/models/google/gemini.py +1717 -662
- agno/models/google/utils.py +22 -0
- agno/models/groq/__init__.py +4 -0
- agno/models/groq/groq.py +391 -666
- agno/models/huggingface/__init__.py +4 -0
- agno/models/huggingface/huggingface.py +266 -538
- agno/models/ibm/__init__.py +5 -0
- agno/models/ibm/watsonx.py +432 -0
- agno/models/internlm/__init__.py +3 -0
- agno/models/internlm/internlm.py +20 -3
- agno/models/langdb/__init__.py +1 -0
- agno/models/langdb/langdb.py +60 -0
- agno/models/litellm/__init__.py +14 -0
- agno/models/litellm/chat.py +503 -0
- agno/models/litellm/litellm_openai.py +42 -0
- agno/models/llama_cpp/__init__.py +5 -0
- agno/models/llama_cpp/llama_cpp.py +22 -0
- agno/models/lmstudio/__init__.py +5 -0
- agno/models/lmstudio/lmstudio.py +25 -0
- agno/models/message.py +361 -39
- agno/models/meta/__init__.py +12 -0
- agno/models/meta/llama.py +502 -0
- agno/models/meta/llama_openai.py +79 -0
- agno/models/metrics.py +120 -0
- agno/models/mistral/__init__.py +4 -0
- agno/models/mistral/mistral.py +293 -393
- agno/models/nebius/__init__.py +3 -0
- agno/models/nebius/nebius.py +53 -0
- agno/models/nexus/__init__.py +3 -0
- agno/models/nexus/nexus.py +22 -0
- agno/models/nvidia/__init__.py +4 -0
- agno/models/nvidia/nvidia.py +22 -3
- agno/models/ollama/__init__.py +4 -2
- agno/models/ollama/chat.py +257 -492
- agno/models/openai/__init__.py +7 -0
- agno/models/openai/chat.py +725 -770
- agno/models/openai/like.py +16 -2
- agno/models/openai/responses.py +1121 -0
- agno/models/openrouter/__init__.py +4 -0
- agno/models/openrouter/openrouter.py +62 -5
- agno/models/perplexity/__init__.py +5 -0
- agno/models/perplexity/perplexity.py +203 -0
- agno/models/portkey/__init__.py +3 -0
- agno/models/portkey/portkey.py +82 -0
- agno/models/requesty/__init__.py +5 -0
- agno/models/requesty/requesty.py +69 -0
- agno/models/response.py +177 -7
- agno/models/sambanova/__init__.py +4 -0
- agno/models/sambanova/sambanova.py +23 -4
- agno/models/siliconflow/__init__.py +5 -0
- agno/models/siliconflow/siliconflow.py +42 -0
- agno/models/together/__init__.py +4 -0
- agno/models/together/together.py +21 -164
- agno/models/utils.py +266 -0
- agno/models/vercel/__init__.py +3 -0
- agno/models/vercel/v0.py +43 -0
- agno/models/vertexai/__init__.py +0 -1
- agno/models/vertexai/claude.py +190 -0
- agno/models/vllm/__init__.py +3 -0
- agno/models/vllm/vllm.py +83 -0
- agno/models/xai/__init__.py +2 -0
- agno/models/xai/xai.py +111 -7
- agno/os/__init__.py +3 -0
- agno/os/app.py +1027 -0
- agno/os/auth.py +244 -0
- agno/os/config.py +126 -0
- agno/os/interfaces/__init__.py +1 -0
- agno/os/interfaces/a2a/__init__.py +3 -0
- agno/os/interfaces/a2a/a2a.py +42 -0
- agno/os/interfaces/a2a/router.py +249 -0
- agno/os/interfaces/a2a/utils.py +924 -0
- agno/os/interfaces/agui/__init__.py +3 -0
- agno/os/interfaces/agui/agui.py +47 -0
- agno/os/interfaces/agui/router.py +147 -0
- agno/os/interfaces/agui/utils.py +574 -0
- agno/os/interfaces/base.py +25 -0
- agno/os/interfaces/slack/__init__.py +3 -0
- agno/os/interfaces/slack/router.py +148 -0
- agno/os/interfaces/slack/security.py +30 -0
- agno/os/interfaces/slack/slack.py +47 -0
- agno/os/interfaces/whatsapp/__init__.py +3 -0
- agno/os/interfaces/whatsapp/router.py +210 -0
- agno/os/interfaces/whatsapp/security.py +55 -0
- agno/os/interfaces/whatsapp/whatsapp.py +36 -0
- agno/os/mcp.py +293 -0
- agno/os/middleware/__init__.py +9 -0
- agno/os/middleware/jwt.py +797 -0
- agno/os/router.py +258 -0
- agno/os/routers/__init__.py +3 -0
- agno/os/routers/agents/__init__.py +3 -0
- agno/os/routers/agents/router.py +599 -0
- agno/os/routers/agents/schema.py +261 -0
- agno/os/routers/evals/__init__.py +3 -0
- agno/os/routers/evals/evals.py +450 -0
- agno/os/routers/evals/schemas.py +174 -0
- agno/os/routers/evals/utils.py +231 -0
- agno/os/routers/health.py +31 -0
- agno/os/routers/home.py +52 -0
- agno/os/routers/knowledge/__init__.py +3 -0
- agno/os/routers/knowledge/knowledge.py +1008 -0
- agno/os/routers/knowledge/schemas.py +178 -0
- agno/os/routers/memory/__init__.py +3 -0
- agno/os/routers/memory/memory.py +661 -0
- agno/os/routers/memory/schemas.py +88 -0
- agno/os/routers/metrics/__init__.py +3 -0
- agno/os/routers/metrics/metrics.py +190 -0
- agno/os/routers/metrics/schemas.py +47 -0
- agno/os/routers/session/__init__.py +3 -0
- agno/os/routers/session/session.py +997 -0
- agno/os/routers/teams/__init__.py +3 -0
- agno/os/routers/teams/router.py +512 -0
- agno/os/routers/teams/schema.py +257 -0
- agno/os/routers/traces/__init__.py +3 -0
- agno/os/routers/traces/schemas.py +414 -0
- agno/os/routers/traces/traces.py +499 -0
- agno/os/routers/workflows/__init__.py +3 -0
- agno/os/routers/workflows/router.py +624 -0
- agno/os/routers/workflows/schema.py +75 -0
- agno/os/schema.py +534 -0
- agno/os/scopes.py +469 -0
- agno/{playground → os}/settings.py +7 -15
- agno/os/utils.py +973 -0
- agno/reasoning/anthropic.py +80 -0
- agno/reasoning/azure_ai_foundry.py +67 -0
- agno/reasoning/deepseek.py +63 -0
- agno/reasoning/default.py +97 -0
- agno/reasoning/gemini.py +73 -0
- agno/reasoning/groq.py +71 -0
- agno/reasoning/helpers.py +24 -1
- agno/reasoning/ollama.py +67 -0
- agno/reasoning/openai.py +86 -0
- agno/reasoning/step.py +2 -1
- agno/reasoning/vertexai.py +76 -0
- agno/run/__init__.py +6 -0
- agno/run/agent.py +822 -0
- agno/run/base.py +247 -0
- agno/run/cancel.py +81 -0
- agno/run/requirement.py +181 -0
- agno/run/team.py +767 -0
- agno/run/workflow.py +708 -0
- agno/session/__init__.py +10 -0
- agno/session/agent.py +260 -0
- agno/session/summary.py +265 -0
- agno/session/team.py +342 -0
- agno/session/workflow.py +501 -0
- agno/table.py +10 -0
- agno/team/__init__.py +37 -0
- agno/team/team.py +9536 -0
- agno/tools/__init__.py +7 -0
- agno/tools/agentql.py +120 -0
- agno/tools/airflow.py +22 -12
- agno/tools/api.py +122 -0
- agno/tools/apify.py +276 -83
- agno/tools/{arxiv_toolkit.py → arxiv.py} +20 -12
- agno/tools/aws_lambda.py +28 -7
- agno/tools/aws_ses.py +66 -0
- agno/tools/baidusearch.py +11 -4
- agno/tools/bitbucket.py +292 -0
- agno/tools/brandfetch.py +213 -0
- agno/tools/bravesearch.py +106 -0
- agno/tools/brightdata.py +367 -0
- agno/tools/browserbase.py +209 -0
- agno/tools/calcom.py +32 -23
- agno/tools/calculator.py +24 -37
- agno/tools/cartesia.py +187 -0
- agno/tools/{clickup_tool.py → clickup.py} +17 -28
- agno/tools/confluence.py +91 -26
- agno/tools/crawl4ai.py +139 -43
- agno/tools/csv_toolkit.py +28 -22
- agno/tools/dalle.py +36 -22
- agno/tools/daytona.py +475 -0
- agno/tools/decorator.py +169 -14
- agno/tools/desi_vocal.py +23 -11
- agno/tools/discord.py +32 -29
- agno/tools/docker.py +716 -0
- agno/tools/duckdb.py +76 -81
- agno/tools/duckduckgo.py +43 -40
- agno/tools/e2b.py +703 -0
- agno/tools/eleven_labs.py +65 -54
- agno/tools/email.py +13 -5
- agno/tools/evm.py +129 -0
- agno/tools/exa.py +324 -42
- agno/tools/fal.py +39 -35
- agno/tools/file.py +196 -30
- agno/tools/file_generation.py +356 -0
- agno/tools/financial_datasets.py +288 -0
- agno/tools/firecrawl.py +108 -33
- agno/tools/function.py +960 -122
- agno/tools/giphy.py +34 -12
- agno/tools/github.py +1294 -97
- agno/tools/gmail.py +922 -0
- agno/tools/google_bigquery.py +117 -0
- agno/tools/google_drive.py +271 -0
- agno/tools/google_maps.py +253 -0
- agno/tools/googlecalendar.py +607 -107
- agno/tools/googlesheets.py +377 -0
- agno/tools/hackernews.py +20 -12
- agno/tools/jina.py +24 -14
- agno/tools/jira.py +48 -19
- agno/tools/knowledge.py +218 -0
- agno/tools/linear.py +82 -43
- agno/tools/linkup.py +58 -0
- agno/tools/local_file_system.py +15 -7
- agno/tools/lumalab.py +41 -26
- agno/tools/mcp/__init__.py +10 -0
- agno/tools/mcp/mcp.py +331 -0
- agno/tools/mcp/multi_mcp.py +347 -0
- agno/tools/mcp/params.py +24 -0
- agno/tools/mcp_toolbox.py +284 -0
- agno/tools/mem0.py +193 -0
- agno/tools/memory.py +419 -0
- agno/tools/mlx_transcribe.py +11 -9
- agno/tools/models/azure_openai.py +190 -0
- agno/tools/models/gemini.py +203 -0
- agno/tools/models/groq.py +158 -0
- agno/tools/models/morph.py +186 -0
- agno/tools/models/nebius.py +124 -0
- agno/tools/models_labs.py +163 -82
- agno/tools/moviepy_video.py +18 -13
- agno/tools/nano_banana.py +151 -0
- agno/tools/neo4j.py +134 -0
- agno/tools/newspaper.py +15 -4
- agno/tools/newspaper4k.py +19 -6
- agno/tools/notion.py +204 -0
- agno/tools/openai.py +181 -17
- agno/tools/openbb.py +27 -20
- agno/tools/opencv.py +321 -0
- agno/tools/openweather.py +233 -0
- agno/tools/oxylabs.py +385 -0
- agno/tools/pandas.py +25 -15
- agno/tools/parallel.py +314 -0
- agno/tools/postgres.py +238 -185
- agno/tools/pubmed.py +125 -13
- agno/tools/python.py +48 -35
- agno/tools/reasoning.py +283 -0
- agno/tools/reddit.py +207 -29
- agno/tools/redshift.py +406 -0
- agno/tools/replicate.py +69 -26
- agno/tools/resend.py +11 -6
- agno/tools/scrapegraph.py +179 -19
- agno/tools/searxng.py +23 -31
- agno/tools/serpapi.py +15 -10
- agno/tools/serper.py +255 -0
- agno/tools/shell.py +23 -12
- agno/tools/shopify.py +1519 -0
- agno/tools/slack.py +56 -14
- agno/tools/sleep.py +8 -6
- agno/tools/spider.py +35 -11
- agno/tools/spotify.py +919 -0
- agno/tools/sql.py +34 -19
- agno/tools/tavily.py +158 -8
- agno/tools/telegram.py +18 -8
- agno/tools/todoist.py +218 -0
- agno/tools/toolkit.py +134 -9
- agno/tools/trafilatura.py +388 -0
- agno/tools/trello.py +25 -28
- agno/tools/twilio.py +18 -9
- agno/tools/user_control_flow.py +78 -0
- agno/tools/valyu.py +228 -0
- agno/tools/visualization.py +467 -0
- agno/tools/webbrowser.py +28 -0
- agno/tools/webex.py +76 -0
- agno/tools/website.py +23 -19
- agno/tools/webtools.py +45 -0
- agno/tools/whatsapp.py +286 -0
- agno/tools/wikipedia.py +28 -19
- agno/tools/workflow.py +285 -0
- agno/tools/{twitter.py → x.py} +142 -46
- agno/tools/yfinance.py +41 -39
- agno/tools/youtube.py +34 -17
- agno/tools/zendesk.py +15 -5
- agno/tools/zep.py +454 -0
- agno/tools/zoom.py +86 -37
- agno/tracing/__init__.py +12 -0
- agno/tracing/exporter.py +157 -0
- agno/tracing/schemas.py +276 -0
- agno/tracing/setup.py +111 -0
- agno/utils/agent.py +938 -0
- agno/utils/audio.py +37 -1
- agno/utils/certs.py +27 -0
- agno/utils/code_execution.py +11 -0
- agno/utils/common.py +103 -20
- agno/utils/cryptography.py +22 -0
- agno/utils/dttm.py +33 -0
- agno/utils/events.py +700 -0
- agno/utils/functions.py +107 -37
- agno/utils/gemini.py +426 -0
- agno/utils/hooks.py +171 -0
- agno/utils/http.py +185 -0
- agno/utils/json_schema.py +159 -37
- agno/utils/knowledge.py +36 -0
- agno/utils/location.py +19 -0
- agno/utils/log.py +221 -8
- agno/utils/mcp.py +214 -0
- agno/utils/media.py +335 -14
- agno/utils/merge_dict.py +22 -1
- agno/utils/message.py +77 -2
- agno/utils/models/ai_foundry.py +50 -0
- agno/utils/models/claude.py +373 -0
- agno/utils/models/cohere.py +94 -0
- agno/utils/models/llama.py +85 -0
- agno/utils/models/mistral.py +100 -0
- agno/utils/models/openai_responses.py +140 -0
- agno/utils/models/schema_utils.py +153 -0
- agno/utils/models/watsonx.py +41 -0
- agno/utils/openai.py +257 -0
- agno/utils/pickle.py +1 -1
- agno/utils/pprint.py +124 -8
- agno/utils/print_response/agent.py +930 -0
- agno/utils/print_response/team.py +1914 -0
- agno/utils/print_response/workflow.py +1668 -0
- agno/utils/prompts.py +111 -0
- agno/utils/reasoning.py +108 -0
- agno/utils/response.py +163 -0
- agno/utils/serialize.py +32 -0
- agno/utils/shell.py +4 -4
- agno/utils/streamlit.py +487 -0
- agno/utils/string.py +204 -51
- agno/utils/team.py +139 -0
- agno/utils/timer.py +9 -2
- agno/utils/tokens.py +657 -0
- agno/utils/tools.py +19 -1
- agno/utils/whatsapp.py +305 -0
- agno/utils/yaml_io.py +3 -3
- agno/vectordb/__init__.py +2 -0
- agno/vectordb/base.py +87 -9
- agno/vectordb/cassandra/__init__.py +5 -1
- agno/vectordb/cassandra/cassandra.py +383 -27
- agno/vectordb/chroma/__init__.py +4 -0
- agno/vectordb/chroma/chromadb.py +748 -83
- agno/vectordb/clickhouse/__init__.py +7 -1
- agno/vectordb/clickhouse/clickhousedb.py +554 -53
- agno/vectordb/couchbase/__init__.py +3 -0
- agno/vectordb/couchbase/couchbase.py +1446 -0
- agno/vectordb/lancedb/__init__.py +5 -0
- agno/vectordb/lancedb/lance_db.py +730 -98
- agno/vectordb/langchaindb/__init__.py +5 -0
- agno/vectordb/langchaindb/langchaindb.py +163 -0
- agno/vectordb/lightrag/__init__.py +5 -0
- agno/vectordb/lightrag/lightrag.py +388 -0
- agno/vectordb/llamaindex/__init__.py +3 -0
- agno/vectordb/llamaindex/llamaindexdb.py +166 -0
- agno/vectordb/milvus/__init__.py +3 -0
- agno/vectordb/milvus/milvus.py +966 -78
- agno/vectordb/mongodb/__init__.py +9 -1
- agno/vectordb/mongodb/mongodb.py +1175 -172
- agno/vectordb/pgvector/__init__.py +8 -0
- agno/vectordb/pgvector/pgvector.py +599 -115
- agno/vectordb/pineconedb/__init__.py +5 -1
- agno/vectordb/pineconedb/pineconedb.py +406 -43
- agno/vectordb/qdrant/__init__.py +4 -0
- agno/vectordb/qdrant/qdrant.py +914 -61
- agno/vectordb/redis/__init__.py +9 -0
- agno/vectordb/redis/redisdb.py +682 -0
- agno/vectordb/singlestore/__init__.py +8 -1
- agno/vectordb/singlestore/singlestore.py +771 -0
- agno/vectordb/surrealdb/__init__.py +3 -0
- agno/vectordb/surrealdb/surrealdb.py +663 -0
- agno/vectordb/upstashdb/__init__.py +5 -0
- agno/vectordb/upstashdb/upstashdb.py +718 -0
- agno/vectordb/weaviate/__init__.py +8 -0
- agno/vectordb/weaviate/index.py +15 -0
- agno/vectordb/weaviate/weaviate.py +1009 -0
- agno/workflow/__init__.py +23 -1
- agno/workflow/agent.py +299 -0
- agno/workflow/condition.py +759 -0
- agno/workflow/loop.py +756 -0
- agno/workflow/parallel.py +853 -0
- agno/workflow/router.py +723 -0
- agno/workflow/step.py +1564 -0
- agno/workflow/steps.py +613 -0
- agno/workflow/types.py +556 -0
- agno/workflow/workflow.py +4327 -514
- agno-2.3.13.dist-info/METADATA +639 -0
- agno-2.3.13.dist-info/RECORD +613 -0
- {agno-0.1.2.dist-info → agno-2.3.13.dist-info}/WHEEL +1 -1
- agno-2.3.13.dist-info/licenses/LICENSE +201 -0
- agno/api/playground.py +0 -91
- agno/api/schemas/playground.py +0 -22
- agno/api/schemas/user.py +0 -22
- agno/api/schemas/workspace.py +0 -46
- agno/api/user.py +0 -160
- agno/api/workspace.py +0 -151
- agno/cli/auth_server.py +0 -118
- agno/cli/config.py +0 -275
- agno/cli/console.py +0 -88
- agno/cli/credentials.py +0 -23
- agno/cli/entrypoint.py +0 -571
- agno/cli/operator.py +0 -355
- agno/cli/settings.py +0 -85
- agno/cli/ws/ws_cli.py +0 -817
- agno/constants.py +0 -13
- agno/document/__init__.py +0 -1
- agno/document/chunking/semantic.py +0 -47
- agno/document/chunking/strategy.py +0 -31
- agno/document/reader/__init__.py +0 -1
- agno/document/reader/arxiv_reader.py +0 -41
- agno/document/reader/base.py +0 -22
- agno/document/reader/csv_reader.py +0 -84
- agno/document/reader/docx_reader.py +0 -46
- agno/document/reader/firecrawl_reader.py +0 -99
- agno/document/reader/json_reader.py +0 -43
- agno/document/reader/pdf_reader.py +0 -219
- agno/document/reader/s3/pdf_reader.py +0 -46
- agno/document/reader/s3/text_reader.py +0 -51
- agno/document/reader/text_reader.py +0 -41
- agno/document/reader/website_reader.py +0 -175
- agno/document/reader/youtube_reader.py +0 -50
- agno/embedder/__init__.py +0 -1
- agno/embedder/azure_openai.py +0 -86
- agno/embedder/cohere.py +0 -72
- agno/embedder/fastembed.py +0 -37
- agno/embedder/google.py +0 -73
- agno/embedder/huggingface.py +0 -54
- agno/embedder/mistral.py +0 -80
- agno/embedder/ollama.py +0 -57
- agno/embedder/openai.py +0 -74
- agno/embedder/sentence_transformer.py +0 -38
- agno/embedder/voyageai.py +0 -64
- agno/eval/perf.py +0 -201
- agno/file/__init__.py +0 -1
- agno/file/file.py +0 -16
- agno/file/local/csv.py +0 -32
- agno/file/local/txt.py +0 -19
- agno/infra/app.py +0 -240
- agno/infra/base.py +0 -144
- agno/infra/context.py +0 -20
- agno/infra/db_app.py +0 -52
- agno/infra/resource.py +0 -205
- agno/infra/resources.py +0 -55
- agno/knowledge/agent.py +0 -230
- agno/knowledge/arxiv.py +0 -22
- agno/knowledge/combined.py +0 -22
- agno/knowledge/csv.py +0 -28
- agno/knowledge/csv_url.py +0 -19
- agno/knowledge/document.py +0 -20
- agno/knowledge/docx.py +0 -30
- agno/knowledge/json.py +0 -28
- agno/knowledge/langchain.py +0 -71
- agno/knowledge/llamaindex.py +0 -66
- agno/knowledge/pdf.py +0 -28
- agno/knowledge/pdf_url.py +0 -26
- agno/knowledge/s3/base.py +0 -60
- agno/knowledge/s3/pdf.py +0 -21
- agno/knowledge/s3/text.py +0 -23
- agno/knowledge/text.py +0 -30
- agno/knowledge/website.py +0 -88
- agno/knowledge/wikipedia.py +0 -31
- agno/knowledge/youtube.py +0 -22
- agno/memory/agent.py +0 -392
- agno/memory/classifier.py +0 -104
- agno/memory/db/__init__.py +0 -1
- agno/memory/db/base.py +0 -42
- agno/memory/db/mongodb.py +0 -189
- agno/memory/db/postgres.py +0 -203
- agno/memory/db/sqlite.py +0 -193
- agno/memory/memory.py +0 -15
- agno/memory/row.py +0 -36
- agno/memory/summarizer.py +0 -192
- agno/memory/summary.py +0 -19
- agno/memory/workflow.py +0 -38
- agno/models/google/gemini_openai.py +0 -26
- agno/models/ollama/hermes.py +0 -221
- agno/models/ollama/tools.py +0 -362
- agno/models/vertexai/gemini.py +0 -595
- agno/playground/__init__.py +0 -3
- agno/playground/async_router.py +0 -421
- agno/playground/deploy.py +0 -249
- agno/playground/operator.py +0 -92
- agno/playground/playground.py +0 -91
- agno/playground/schemas.py +0 -76
- agno/playground/serve.py +0 -55
- agno/playground/sync_router.py +0 -405
- agno/reasoning/agent.py +0 -68
- agno/run/response.py +0 -112
- agno/storage/agent/__init__.py +0 -0
- agno/storage/agent/base.py +0 -38
- agno/storage/agent/dynamodb.py +0 -350
- agno/storage/agent/json.py +0 -92
- agno/storage/agent/mongodb.py +0 -228
- agno/storage/agent/postgres.py +0 -367
- agno/storage/agent/session.py +0 -79
- agno/storage/agent/singlestore.py +0 -303
- agno/storage/agent/sqlite.py +0 -357
- agno/storage/agent/yaml.py +0 -93
- agno/storage/workflow/__init__.py +0 -0
- agno/storage/workflow/base.py +0 -40
- agno/storage/workflow/mongodb.py +0 -233
- agno/storage/workflow/postgres.py +0 -366
- agno/storage/workflow/session.py +0 -60
- agno/storage/workflow/sqlite.py +0 -359
- agno/tools/googlesearch.py +0 -88
- agno/utils/defaults.py +0 -57
- agno/utils/filesystem.py +0 -39
- agno/utils/git.py +0 -52
- agno/utils/json_io.py +0 -30
- agno/utils/load_env.py +0 -19
- agno/utils/py_io.py +0 -19
- agno/utils/pyproject.py +0 -18
- agno/utils/resource_filter.py +0 -31
- agno/vectordb/singlestore/s2vectordb.py +0 -390
- agno/vectordb/singlestore/s2vectordb2.py +0 -355
- agno/workspace/__init__.py +0 -0
- agno/workspace/config.py +0 -325
- agno/workspace/enums.py +0 -6
- agno/workspace/helpers.py +0 -48
- agno/workspace/operator.py +0 -758
- agno/workspace/settings.py +0 -63
- agno-0.1.2.dist-info/LICENSE +0 -375
- agno-0.1.2.dist-info/METADATA +0 -502
- agno-0.1.2.dist-info/RECORD +0 -352
- agno-0.1.2.dist-info/entry_points.txt +0 -3
- /agno/{cli → db/migrations}/__init__.py +0 -0
- /agno/{cli/ws → db/migrations/versions}/__init__.py +0 -0
- /agno/{document/chunking/__init__.py → db/schemas/metrics.py} +0 -0
- /agno/{document/reader/s3 → integrations}/__init__.py +0 -0
- /agno/{file/local → knowledge/chunking}/__init__.py +0 -0
- /agno/{infra → knowledge/remote_content}/__init__.py +0 -0
- /agno/{knowledge/s3 → tools/models}/__init__.py +0 -0
- /agno/{reranker → utils/models}/__init__.py +0 -0
- /agno/{storage → utils/print_response}/__init__.py +0 -0
- {agno-0.1.2.dist-info → agno-2.3.13.dist-info}/top_level.txt +0 -0
agno/vectordb/chroma/chromadb.py
CHANGED
|
@@ -1,21 +1,23 @@
|
|
|
1
|
+
import asyncio
|
|
2
|
+
import json
|
|
1
3
|
from hashlib import md5
|
|
2
|
-
from typing import Any, Dict, List, Optional
|
|
4
|
+
from typing import Any, Dict, List, Mapping, Optional, Union, cast
|
|
3
5
|
|
|
4
6
|
try:
|
|
5
7
|
from chromadb import Client as ChromaDbClient
|
|
6
8
|
from chromadb import PersistentClient as PersistentChromaDbClient
|
|
7
9
|
from chromadb.api.client import ClientAPI
|
|
8
10
|
from chromadb.api.models.Collection import Collection
|
|
9
|
-
from chromadb.api.types import
|
|
11
|
+
from chromadb.api.types import QueryResult
|
|
10
12
|
|
|
11
13
|
except ImportError:
|
|
12
14
|
raise ImportError("The `chromadb` package is not installed. Please install it via `pip install chromadb`.")
|
|
13
15
|
|
|
14
|
-
from agno.
|
|
15
|
-
from agno.
|
|
16
|
-
from agno.embedder
|
|
17
|
-
from agno.reranker.base import Reranker
|
|
18
|
-
from agno.utils.log import logger
|
|
16
|
+
from agno.filters import FilterExpr
|
|
17
|
+
from agno.knowledge.document import Document
|
|
18
|
+
from agno.knowledge.embedder import Embedder
|
|
19
|
+
from agno.knowledge.reranker.base import Reranker
|
|
20
|
+
from agno.utils.log import log_debug, log_error, log_info, log_warning, logger
|
|
19
21
|
from agno.vectordb.base import VectorDb
|
|
20
22
|
from agno.vectordb.distance import Distance
|
|
21
23
|
|
|
@@ -24,19 +26,39 @@ class ChromaDb(VectorDb):
|
|
|
24
26
|
def __init__(
|
|
25
27
|
self,
|
|
26
28
|
collection: str,
|
|
27
|
-
|
|
29
|
+
name: Optional[str] = None,
|
|
30
|
+
description: Optional[str] = None,
|
|
31
|
+
id: Optional[str] = None,
|
|
32
|
+
embedder: Optional[Embedder] = None,
|
|
28
33
|
distance: Distance = Distance.cosine,
|
|
29
34
|
path: str = "tmp/chromadb",
|
|
30
35
|
persistent_client: bool = False,
|
|
31
36
|
reranker: Optional[Reranker] = None,
|
|
32
37
|
**kwargs,
|
|
33
38
|
):
|
|
34
|
-
#
|
|
35
|
-
|
|
39
|
+
# Validate required parameters
|
|
40
|
+
if not collection:
|
|
41
|
+
raise ValueError("Collection name must be provided.")
|
|
42
|
+
|
|
43
|
+
# Dynamic ID generation based on unique identifiers
|
|
44
|
+
if id is None:
|
|
45
|
+
from agno.utils.string import generate_id
|
|
46
|
+
|
|
47
|
+
seed = f"{path}#{collection}"
|
|
48
|
+
id = generate_id(seed)
|
|
49
|
+
|
|
50
|
+
# Initialize base class with name, description, and generated ID
|
|
51
|
+
super().__init__(id=id, name=name, description=description)
|
|
36
52
|
|
|
53
|
+
# Collection attributes
|
|
54
|
+
self.collection_name: str = collection
|
|
37
55
|
# Embedder for embedding the document contents
|
|
38
|
-
|
|
56
|
+
if embedder is None:
|
|
57
|
+
from agno.knowledge.embedder.openai import OpenAIEmbedder
|
|
39
58
|
|
|
59
|
+
embedder = OpenAIEmbedder()
|
|
60
|
+
log_info("Embedder not provided, using OpenAIEmbedder as default.")
|
|
61
|
+
self.embedder: Embedder = embedder
|
|
40
62
|
# Distance metric
|
|
41
63
|
self.distance: Distance = distance
|
|
42
64
|
|
|
@@ -56,16 +78,54 @@ class ChromaDb(VectorDb):
|
|
|
56
78
|
# Chroma client kwargs
|
|
57
79
|
self.kwargs = kwargs
|
|
58
80
|
|
|
81
|
+
def _flatten_metadata(self, metadata: Dict[str, Any]) -> Dict[str, Union[str, int, float, bool]]:
|
|
82
|
+
"""
|
|
83
|
+
Flatten nested metadata to ChromaDB-compatible format.
|
|
84
|
+
|
|
85
|
+
Args:
|
|
86
|
+
metadata: Dictionary that may contain nested structures
|
|
87
|
+
|
|
88
|
+
Returns:
|
|
89
|
+
Flattened dictionary with only primitive values
|
|
90
|
+
"""
|
|
91
|
+
flattened: Dict[str, Any] = {}
|
|
92
|
+
|
|
93
|
+
def _flatten_recursive(obj: Any, prefix: str = "") -> None:
|
|
94
|
+
if isinstance(obj, dict):
|
|
95
|
+
if len(obj) == 0:
|
|
96
|
+
# Handle empty dictionaries by converting to JSON string
|
|
97
|
+
flattened[prefix] = json.dumps(obj)
|
|
98
|
+
else:
|
|
99
|
+
for key, value in obj.items():
|
|
100
|
+
new_key = f"{prefix}.{key}" if prefix else key
|
|
101
|
+
_flatten_recursive(value, new_key)
|
|
102
|
+
elif isinstance(obj, (list, tuple)):
|
|
103
|
+
# Convert lists/tuples to JSON strings
|
|
104
|
+
flattened[prefix] = json.dumps(obj)
|
|
105
|
+
elif isinstance(obj, (str, int, float, bool)) or obj is None:
|
|
106
|
+
if obj is not None: # ChromaDB doesn't accept None values
|
|
107
|
+
flattened[prefix] = obj
|
|
108
|
+
else:
|
|
109
|
+
# Convert other complex types to JSON strings
|
|
110
|
+
try:
|
|
111
|
+
flattened[prefix] = json.dumps(obj)
|
|
112
|
+
except (TypeError, ValueError):
|
|
113
|
+
# If it can't be serialized, convert to string
|
|
114
|
+
flattened[prefix] = str(obj)
|
|
115
|
+
|
|
116
|
+
_flatten_recursive(metadata)
|
|
117
|
+
return flattened
|
|
118
|
+
|
|
59
119
|
@property
|
|
60
120
|
def client(self) -> ClientAPI:
|
|
61
121
|
if self._client is None:
|
|
62
122
|
if not self.persistent_client:
|
|
63
|
-
|
|
123
|
+
log_debug("Creating Chroma Client")
|
|
64
124
|
self._client = ChromaDbClient(
|
|
65
125
|
**self.kwargs,
|
|
66
126
|
)
|
|
67
127
|
elif self.persistent_client:
|
|
68
|
-
|
|
128
|
+
log_debug("Creating Persistent Chroma Client")
|
|
69
129
|
self._client = PersistentChromaDbClient(
|
|
70
130
|
path=self.path,
|
|
71
131
|
**self.kwargs,
|
|
@@ -74,32 +134,18 @@ class ChromaDb(VectorDb):
|
|
|
74
134
|
|
|
75
135
|
def create(self) -> None:
|
|
76
136
|
"""Create the collection in ChromaDb."""
|
|
77
|
-
if
|
|
78
|
-
|
|
137
|
+
if self.exists():
|
|
138
|
+
log_debug(f"Collection already exists: {self.collection_name}")
|
|
139
|
+
self._collection = self.client.get_collection(name=self.collection_name)
|
|
140
|
+
else:
|
|
141
|
+
log_debug(f"Creating collection: {self.collection_name}")
|
|
79
142
|
self._collection = self.client.create_collection(
|
|
80
|
-
name=self.
|
|
143
|
+
name=self.collection_name, metadata={"hnsw:space": self.distance.value}
|
|
81
144
|
)
|
|
82
145
|
|
|
83
|
-
|
|
84
|
-
|
|
85
|
-
|
|
86
|
-
|
|
87
|
-
def doc_exists(self, document: Document) -> bool:
|
|
88
|
-
"""Check if a document exists in the collection.
|
|
89
|
-
Args:
|
|
90
|
-
document (Document): Document to check.
|
|
91
|
-
Returns:
|
|
92
|
-
bool: True if document exists, False otherwise.
|
|
93
|
-
"""
|
|
94
|
-
if self.client:
|
|
95
|
-
try:
|
|
96
|
-
collection: Collection = self.client.get_collection(name=self.collection)
|
|
97
|
-
collection_data: GetResult = collection.get(include=[IncludeEnum.documents])
|
|
98
|
-
if collection_data.get("documents") != []:
|
|
99
|
-
return True
|
|
100
|
-
except Exception as e:
|
|
101
|
-
logger.error(f"Document does not exist: {e}")
|
|
102
|
-
return False
|
|
146
|
+
async def async_create(self) -> None:
|
|
147
|
+
"""Create the collection asynchronously by running in a thread."""
|
|
148
|
+
await asyncio.to_thread(self.create)
|
|
103
149
|
|
|
104
150
|
def name_exists(self, name: str) -> bool:
|
|
105
151
|
"""Check if a document with a given name exists in the collection.
|
|
@@ -107,121 +153,432 @@ class ChromaDb(VectorDb):
|
|
|
107
153
|
name (str): Name of the document to check.
|
|
108
154
|
Returns:
|
|
109
155
|
bool: True if document exists, False otherwise."""
|
|
110
|
-
if self.client:
|
|
111
|
-
|
|
112
|
-
|
|
113
|
-
|
|
114
|
-
|
|
115
|
-
|
|
116
|
-
|
|
117
|
-
|
|
156
|
+
if not self.client:
|
|
157
|
+
logger.warning("Client not initialized")
|
|
158
|
+
return False
|
|
159
|
+
|
|
160
|
+
try:
|
|
161
|
+
collection: Collection = self.client.get_collection(name=self.collection_name)
|
|
162
|
+
result = collection.get(where=cast(Any, {"name": {"$eq": name}}), limit=1)
|
|
163
|
+
return len(result.get("ids", [])) > 0
|
|
164
|
+
except Exception as e:
|
|
165
|
+
logger.error(f"Error checking name existence: {e}")
|
|
118
166
|
return False
|
|
119
167
|
|
|
120
|
-
def
|
|
168
|
+
async def async_name_exists(self, name: str) -> bool:
|
|
169
|
+
"""Check if a document with given name exists asynchronously."""
|
|
170
|
+
return await asyncio.to_thread(self.name_exists, name)
|
|
171
|
+
|
|
172
|
+
def insert(self, content_hash: str, documents: List[Document], filters: Optional[Dict[str, Any]] = None) -> None:
|
|
121
173
|
"""Insert documents into the collection.
|
|
122
174
|
|
|
123
175
|
Args:
|
|
124
176
|
documents (List[Document]): List of documents to insert
|
|
125
|
-
filters (Optional[Dict[str, Any]]): Filters to
|
|
177
|
+
filters (Optional[Dict[str, Any]]): Filters to merge with document metadata
|
|
126
178
|
"""
|
|
127
|
-
|
|
179
|
+
log_info(f"Inserting {len(documents)} documents")
|
|
128
180
|
ids: List = []
|
|
129
181
|
docs: List = []
|
|
130
182
|
docs_embeddings: List = []
|
|
183
|
+
docs_metadata: List = []
|
|
184
|
+
|
|
185
|
+
if not self._collection:
|
|
186
|
+
self._collection = self.client.get_collection(name=self.collection_name)
|
|
131
187
|
|
|
132
188
|
for document in documents:
|
|
133
189
|
document.embed(embedder=self.embedder)
|
|
134
190
|
cleaned_content = document.content.replace("\x00", "\ufffd")
|
|
135
191
|
doc_id = md5(cleaned_content.encode()).hexdigest()
|
|
192
|
+
|
|
193
|
+
# Handle metadata and filters
|
|
194
|
+
metadata = document.meta_data or {}
|
|
195
|
+
if filters:
|
|
196
|
+
metadata.update(filters)
|
|
197
|
+
|
|
198
|
+
# Add name, content_id to metadata
|
|
199
|
+
if document.name is not None:
|
|
200
|
+
metadata["name"] = document.name
|
|
201
|
+
if document.content_id is not None:
|
|
202
|
+
metadata["content_id"] = document.content_id
|
|
203
|
+
|
|
204
|
+
metadata["content_hash"] = content_hash
|
|
205
|
+
|
|
206
|
+
# Flatten metadata for ChromaDB compatibility
|
|
207
|
+
flattened_metadata = self._flatten_metadata(metadata)
|
|
208
|
+
|
|
209
|
+
docs_embeddings.append(document.embedding)
|
|
210
|
+
docs.append(cleaned_content)
|
|
211
|
+
ids.append(doc_id)
|
|
212
|
+
docs_metadata.append(flattened_metadata)
|
|
213
|
+
log_debug(f"Prepared document: {document.id} | {document.name} | {flattened_metadata}")
|
|
214
|
+
|
|
215
|
+
if self._collection is None:
|
|
216
|
+
logger.warning("Collection does not exist")
|
|
217
|
+
else:
|
|
218
|
+
if len(docs) > 0:
|
|
219
|
+
self._collection.add(ids=ids, embeddings=docs_embeddings, documents=docs, metadatas=docs_metadata)
|
|
220
|
+
log_debug(f"Committed {len(docs)} documents")
|
|
221
|
+
|
|
222
|
+
async def async_insert(
|
|
223
|
+
self, content_hash: str, documents: List[Document], filters: Optional[Dict[str, Any]] = None
|
|
224
|
+
) -> None:
|
|
225
|
+
"""Insert documents asynchronously by running in a thread."""
|
|
226
|
+
log_info(f"Async Inserting {len(documents)} documents")
|
|
227
|
+
ids: List = []
|
|
228
|
+
docs: List = []
|
|
229
|
+
docs_embeddings: List = []
|
|
230
|
+
docs_metadata: List = []
|
|
231
|
+
|
|
232
|
+
if not self._collection:
|
|
233
|
+
self._collection = self.client.get_collection(name=self.collection_name)
|
|
234
|
+
|
|
235
|
+
if self.embedder.enable_batch and hasattr(self.embedder, "async_get_embeddings_batch_and_usage"):
|
|
236
|
+
# Use batch embedding when enabled and supported
|
|
237
|
+
try:
|
|
238
|
+
# Extract content from all documents
|
|
239
|
+
doc_contents = [doc.content for doc in documents]
|
|
240
|
+
|
|
241
|
+
# Get batch embeddings and usage
|
|
242
|
+
embeddings, usages = await self.embedder.async_get_embeddings_batch_and_usage(doc_contents)
|
|
243
|
+
|
|
244
|
+
# Process documents with pre-computed embeddings
|
|
245
|
+
for j, doc in enumerate(documents):
|
|
246
|
+
try:
|
|
247
|
+
if j < len(embeddings):
|
|
248
|
+
doc.embedding = embeddings[j]
|
|
249
|
+
doc.usage = usages[j] if j < len(usages) else None
|
|
250
|
+
except Exception as e:
|
|
251
|
+
logger.error(f"Error assigning batch embedding to document '{doc.name}': {e}")
|
|
252
|
+
|
|
253
|
+
except Exception as e:
|
|
254
|
+
# Check if this is a rate limit error - don't fall back as it would make things worse
|
|
255
|
+
error_str = str(e).lower()
|
|
256
|
+
is_rate_limit = any(
|
|
257
|
+
phrase in error_str
|
|
258
|
+
for phrase in ["rate limit", "too many requests", "429", "trial key", "api calls / minute"]
|
|
259
|
+
)
|
|
260
|
+
|
|
261
|
+
if is_rate_limit:
|
|
262
|
+
logger.error(f"Rate limit detected during batch embedding. {e}")
|
|
263
|
+
raise e
|
|
264
|
+
else:
|
|
265
|
+
logger.warning(f"Async batch embedding failed, falling back to individual embeddings: {e}")
|
|
266
|
+
# Fall back to individual embedding
|
|
267
|
+
embed_tasks = [doc.async_embed(embedder=self.embedder) for doc in documents]
|
|
268
|
+
await asyncio.gather(*embed_tasks, return_exceptions=True)
|
|
269
|
+
else:
|
|
270
|
+
# Use individual embedding
|
|
271
|
+
try:
|
|
272
|
+
embed_tasks = [document.async_embed(embedder=self.embedder) for document in documents]
|
|
273
|
+
await asyncio.gather(*embed_tasks, return_exceptions=True)
|
|
274
|
+
except Exception as e:
|
|
275
|
+
log_error(f"Error processing document: {e}")
|
|
276
|
+
|
|
277
|
+
for document in documents:
|
|
278
|
+
cleaned_content = document.content.replace("\x00", "\ufffd")
|
|
279
|
+
# Include content_hash in ID to ensure uniqueness across different content hashes
|
|
280
|
+
base_id = document.id or md5(cleaned_content.encode()).hexdigest()
|
|
281
|
+
doc_id = md5(f"{base_id}_{content_hash}".encode()).hexdigest()
|
|
282
|
+
|
|
283
|
+
# Handle metadata and filters
|
|
284
|
+
metadata = document.meta_data or {}
|
|
285
|
+
if filters:
|
|
286
|
+
metadata.update(filters)
|
|
287
|
+
|
|
288
|
+
# Add name, content_id to metadata
|
|
289
|
+
if document.name is not None:
|
|
290
|
+
metadata["name"] = document.name
|
|
291
|
+
if document.content_id is not None:
|
|
292
|
+
metadata["content_id"] = document.content_id
|
|
293
|
+
|
|
294
|
+
metadata["content_hash"] = content_hash
|
|
295
|
+
|
|
296
|
+
# Flatten metadata for ChromaDB compatibility
|
|
297
|
+
flattened_metadata = self._flatten_metadata(metadata)
|
|
298
|
+
|
|
136
299
|
docs_embeddings.append(document.embedding)
|
|
137
300
|
docs.append(cleaned_content)
|
|
138
301
|
ids.append(doc_id)
|
|
139
|
-
|
|
302
|
+
docs_metadata.append(flattened_metadata)
|
|
303
|
+
log_debug(f"Prepared document: {document.id} | {document.name} | {flattened_metadata}")
|
|
140
304
|
|
|
141
|
-
if
|
|
142
|
-
|
|
143
|
-
logger.debug(f"Committed {len(docs)} documents")
|
|
305
|
+
if self._collection is None:
|
|
306
|
+
logger.warning("Collection does not exist")
|
|
144
307
|
else:
|
|
145
|
-
|
|
308
|
+
if len(docs) > 0:
|
|
309
|
+
self._collection.add(ids=ids, embeddings=docs_embeddings, documents=docs, metadatas=docs_metadata)
|
|
310
|
+
log_debug(f"Committed {len(docs)} documents")
|
|
146
311
|
|
|
147
312
|
def upsert_available(self) -> bool:
|
|
148
313
|
"""Check if upsert is available in ChromaDB."""
|
|
149
314
|
return True
|
|
150
315
|
|
|
151
|
-
def upsert(self, documents: List[Document], filters: Optional[Dict[str, Any]] = None) -> None:
|
|
316
|
+
def upsert(self, content_hash: str, documents: List[Document], filters: Optional[Dict[str, Any]] = None) -> None:
|
|
152
317
|
"""Upsert documents into the collection.
|
|
153
318
|
|
|
154
319
|
Args:
|
|
155
320
|
documents (List[Document]): List of documents to upsert
|
|
156
321
|
filters (Optional[Dict[str, Any]]): Filters to apply while upserting
|
|
157
322
|
"""
|
|
158
|
-
|
|
323
|
+
try:
|
|
324
|
+
if self.content_hash_exists(content_hash):
|
|
325
|
+
self._delete_by_content_hash(content_hash)
|
|
326
|
+
self._upsert(content_hash, documents, filters)
|
|
327
|
+
except Exception as e:
|
|
328
|
+
logger.error(f"Error upserting documents by content hash: {e}")
|
|
329
|
+
raise
|
|
330
|
+
|
|
331
|
+
def _upsert(self, content_hash: str, documents: List[Document], filters: Optional[Dict[str, Any]] = None) -> None:
|
|
332
|
+
"""Upsert documents into the collection.
|
|
333
|
+
|
|
334
|
+
Args:
|
|
335
|
+
documents (List[Document]): List of documents to upsert
|
|
336
|
+
filters (Optional[Dict[str, Any]]): Filters to apply while upserting
|
|
337
|
+
"""
|
|
338
|
+
log_info(f"Upserting {len(documents)} documents")
|
|
159
339
|
ids: List = []
|
|
160
340
|
docs: List = []
|
|
161
341
|
docs_embeddings: List = []
|
|
342
|
+
docs_metadata: List = []
|
|
343
|
+
|
|
344
|
+
if not self._collection:
|
|
345
|
+
self._collection = self.client.get_collection(name=self.collection_name)
|
|
162
346
|
|
|
163
347
|
for document in documents:
|
|
164
348
|
document.embed(embedder=self.embedder)
|
|
165
349
|
cleaned_content = document.content.replace("\x00", "\ufffd")
|
|
166
350
|
doc_id = md5(cleaned_content.encode()).hexdigest()
|
|
351
|
+
|
|
352
|
+
# Handle metadata and filters
|
|
353
|
+
metadata = document.meta_data or {}
|
|
354
|
+
if filters:
|
|
355
|
+
metadata.update(filters)
|
|
356
|
+
|
|
357
|
+
# Add name, content_id to metadata
|
|
358
|
+
if document.name is not None:
|
|
359
|
+
metadata["name"] = document.name
|
|
360
|
+
if document.content_id is not None:
|
|
361
|
+
metadata["content_id"] = document.content_id
|
|
362
|
+
|
|
363
|
+
metadata["content_hash"] = content_hash
|
|
364
|
+
|
|
365
|
+
# Flatten metadata for ChromaDB compatibility
|
|
366
|
+
flattened_metadata = self._flatten_metadata(metadata)
|
|
367
|
+
|
|
167
368
|
docs_embeddings.append(document.embedding)
|
|
168
369
|
docs.append(cleaned_content)
|
|
169
370
|
ids.append(doc_id)
|
|
170
|
-
|
|
371
|
+
docs_metadata.append(flattened_metadata)
|
|
372
|
+
log_debug(f"Upserted document: {document.id} | {document.name} | {flattened_metadata}")
|
|
373
|
+
|
|
374
|
+
if self._collection is None:
|
|
375
|
+
logger.warning("Collection does not exist")
|
|
376
|
+
else:
|
|
377
|
+
if len(docs) > 0:
|
|
378
|
+
self._collection.upsert(ids=ids, embeddings=docs_embeddings, documents=docs, metadatas=docs_metadata)
|
|
379
|
+
log_debug(f"Committed {len(docs)} documents")
|
|
380
|
+
|
|
381
|
+
async def _async_upsert(
|
|
382
|
+
self, content_hash: str, documents: List[Document], filters: Optional[Dict[str, Any]] = None
|
|
383
|
+
) -> None:
|
|
384
|
+
"""Upsert documents into the collection.
|
|
385
|
+
|
|
386
|
+
Args:
|
|
387
|
+
documents (List[Document]): List of documents to upsert
|
|
388
|
+
filters (Optional[Dict[str, Any]]): Filters to apply while upserting
|
|
389
|
+
"""
|
|
390
|
+
log_info(f"Async Upserting {len(documents)} documents")
|
|
391
|
+
ids: List = []
|
|
392
|
+
docs: List = []
|
|
393
|
+
docs_embeddings: List = []
|
|
394
|
+
docs_metadata: List = []
|
|
395
|
+
|
|
396
|
+
if not self._collection:
|
|
397
|
+
self._collection = self.client.get_collection(name=self.collection_name)
|
|
398
|
+
|
|
399
|
+
if self.embedder.enable_batch and hasattr(self.embedder, "async_get_embeddings_batch_and_usage"):
|
|
400
|
+
# Use batch embedding when enabled and supported
|
|
401
|
+
try:
|
|
402
|
+
# Extract content from all documents
|
|
403
|
+
doc_contents = [doc.content for doc in documents]
|
|
404
|
+
|
|
405
|
+
# Get batch embeddings and usage
|
|
406
|
+
embeddings, usages = await self.embedder.async_get_embeddings_batch_and_usage(doc_contents)
|
|
171
407
|
|
|
172
|
-
|
|
173
|
-
|
|
174
|
-
|
|
408
|
+
# Process documents with pre-computed embeddings
|
|
409
|
+
for j, doc in enumerate(documents):
|
|
410
|
+
try:
|
|
411
|
+
if j < len(embeddings):
|
|
412
|
+
doc.embedding = embeddings[j]
|
|
413
|
+
doc.usage = usages[j] if j < len(usages) else None
|
|
414
|
+
except Exception as e:
|
|
415
|
+
logger.error(f"Error assigning batch embedding to document '{doc.name}': {e}")
|
|
175
416
|
|
|
417
|
+
except Exception as e:
|
|
418
|
+
# Check if this is a rate limit error - don't fall back as it would make things worse
|
|
419
|
+
error_str = str(e).lower()
|
|
420
|
+
is_rate_limit = any(
|
|
421
|
+
phrase in error_str
|
|
422
|
+
for phrase in ["rate limit", "too many requests", "429", "trial key", "api calls / minute"]
|
|
423
|
+
)
|
|
424
|
+
|
|
425
|
+
if is_rate_limit:
|
|
426
|
+
logger.error(f"Rate limit detected during batch embedding. {e}")
|
|
427
|
+
raise e
|
|
428
|
+
else:
|
|
429
|
+
logger.warning(f"Async batch embedding failed, falling back to individual embeddings: {e}")
|
|
430
|
+
# Fall back to individual embedding
|
|
431
|
+
embed_tasks = [doc.async_embed(embedder=self.embedder) for doc in documents]
|
|
432
|
+
await asyncio.gather(*embed_tasks, return_exceptions=True)
|
|
176
433
|
else:
|
|
177
|
-
|
|
434
|
+
# Use individual embedding
|
|
435
|
+
embed_tasks = [document.async_embed(embedder=self.embedder) for document in documents]
|
|
436
|
+
await asyncio.gather(*embed_tasks, return_exceptions=True)
|
|
437
|
+
|
|
438
|
+
for document in documents:
|
|
439
|
+
cleaned_content = document.content.replace("\x00", "\ufffd")
|
|
440
|
+
# Include content_hash in ID to ensure uniqueness across different content hashes
|
|
441
|
+
base_id = document.id or md5(cleaned_content.encode()).hexdigest()
|
|
442
|
+
doc_id = md5(f"{base_id}_{content_hash}".encode()).hexdigest()
|
|
178
443
|
|
|
179
|
-
|
|
444
|
+
# Handle metadata and filters
|
|
445
|
+
metadata = document.meta_data or {}
|
|
446
|
+
if filters:
|
|
447
|
+
metadata.update(filters)
|
|
448
|
+
|
|
449
|
+
# Add name, content_id to metadata
|
|
450
|
+
if document.name is not None:
|
|
451
|
+
metadata["name"] = document.name
|
|
452
|
+
if document.content_id is not None:
|
|
453
|
+
metadata["content_id"] = document.content_id
|
|
454
|
+
|
|
455
|
+
metadata["content_hash"] = content_hash
|
|
456
|
+
|
|
457
|
+
# Flatten metadata for ChromaDB compatibility
|
|
458
|
+
flattened_metadata = self._flatten_metadata(metadata)
|
|
459
|
+
|
|
460
|
+
docs_embeddings.append(document.embedding)
|
|
461
|
+
docs.append(cleaned_content)
|
|
462
|
+
ids.append(doc_id)
|
|
463
|
+
docs_metadata.append(flattened_metadata)
|
|
464
|
+
log_debug(f"Upserted document: {document.id} | {document.name} | {flattened_metadata}")
|
|
465
|
+
|
|
466
|
+
if self._collection is None:
|
|
467
|
+
logger.warning("Collection does not exist")
|
|
468
|
+
else:
|
|
469
|
+
if len(docs) > 0:
|
|
470
|
+
self._collection.upsert(ids=ids, embeddings=docs_embeddings, documents=docs, metadatas=docs_metadata)
|
|
471
|
+
log_debug(f"Committed {len(docs)} documents")
|
|
472
|
+
|
|
473
|
+
async def async_upsert(
|
|
474
|
+
self, content_hash: str, documents: List[Document], filters: Optional[Dict[str, Any]] = None
|
|
475
|
+
) -> None:
|
|
476
|
+
"""Upsert documents asynchronously by running in a thread."""
|
|
477
|
+
try:
|
|
478
|
+
if self.content_hash_exists(content_hash):
|
|
479
|
+
self._delete_by_content_hash(content_hash)
|
|
480
|
+
await self._async_upsert(content_hash, documents, filters)
|
|
481
|
+
except Exception as e:
|
|
482
|
+
logger.error(f"Error upserting documents by content hash: {e}")
|
|
483
|
+
raise
|
|
484
|
+
|
|
485
|
+
def search(
|
|
486
|
+
self, query: str, limit: int = 5, filters: Optional[Union[Dict[str, Any], List[FilterExpr]]] = None
|
|
487
|
+
) -> List[Document]:
|
|
180
488
|
"""Search the collection for a query.
|
|
181
489
|
|
|
182
490
|
Args:
|
|
183
491
|
query (str): Query to search for.
|
|
184
492
|
limit (int): Number of results to return.
|
|
185
|
-
filters (Optional[Dict[str, Any]]): Filters to apply while searching.
|
|
493
|
+
filters (Optional[Union[Dict[str, Any], List[FilterExpr]]]): Filters to apply while searching.
|
|
494
|
+
Supports ChromaDB's filtering operators:
|
|
495
|
+
- $eq, $ne: Equality/Inequality
|
|
496
|
+
- $gt, $gte, $lt, $lte: Numeric comparisons
|
|
497
|
+
- $in, $nin: List inclusion/exclusion
|
|
498
|
+
- $and, $or: Logical operators
|
|
186
499
|
Returns:
|
|
187
500
|
List[Document]: List of search results.
|
|
188
501
|
"""
|
|
502
|
+
if isinstance(filters, list):
|
|
503
|
+
log_warning("Filter Expressions are not yet supported in ChromaDB. No filters will be applied.")
|
|
504
|
+
filters = None
|
|
189
505
|
query_embedding = self.embedder.get_embedding(query)
|
|
190
506
|
if query_embedding is None:
|
|
191
507
|
logger.error(f"Error getting embedding for Query: {query}")
|
|
192
508
|
return []
|
|
193
509
|
|
|
194
510
|
if not self._collection:
|
|
195
|
-
self._collection = self.client.get_collection(name=self.
|
|
511
|
+
self._collection = self.client.get_collection(name=self.collection_name)
|
|
512
|
+
|
|
513
|
+
# Convert simple filters to ChromaDB's format if needed
|
|
514
|
+
where_filter = self._convert_filters(filters) if filters else None
|
|
196
515
|
|
|
197
516
|
result: QueryResult = self._collection.query(
|
|
198
517
|
query_embeddings=query_embedding,
|
|
199
518
|
n_results=limit,
|
|
519
|
+
where=where_filter, # Add where filter
|
|
520
|
+
include=["metadatas", "documents", "embeddings", "distances", "uris"],
|
|
200
521
|
)
|
|
201
522
|
|
|
202
523
|
# Build search results
|
|
203
524
|
search_results: List[Document] = []
|
|
204
525
|
|
|
205
|
-
|
|
206
|
-
|
|
207
|
-
|
|
208
|
-
|
|
209
|
-
|
|
210
|
-
|
|
211
|
-
|
|
212
|
-
|
|
213
|
-
|
|
214
|
-
|
|
526
|
+
ids_list = result.get("ids", [[]]) # type: ignore
|
|
527
|
+
metadata_list = result.get("metadatas", [[{}]]) # type: ignore
|
|
528
|
+
documents_list = result.get("documents", [[]]) # type: ignore
|
|
529
|
+
embeddings_list = result.get("embeddings") # type: ignore
|
|
530
|
+
distances_list = result.get("distances", [[]]) # type: ignore
|
|
531
|
+
|
|
532
|
+
if not ids_list or not metadata_list or not documents_list or embeddings_list is None or not distances_list:
|
|
533
|
+
return search_results
|
|
534
|
+
|
|
535
|
+
ids = ids_list[0]
|
|
536
|
+
metadata = [dict(m) if m else {} for m in metadata_list[0]] # Convert to mutable dicts
|
|
537
|
+
documents = documents_list[0]
|
|
538
|
+
embeddings_raw = embeddings_list[0] if embeddings_list else []
|
|
539
|
+
embeddings = []
|
|
540
|
+
for e in embeddings_raw:
|
|
541
|
+
if hasattr(e, "tolist") and callable(getattr(e, "tolist", None)):
|
|
542
|
+
try:
|
|
543
|
+
embeddings.append(list(cast(Any, e).tolist()))
|
|
544
|
+
except (AttributeError, TypeError):
|
|
545
|
+
embeddings.append(list(e) if isinstance(e, (list, tuple)) else [])
|
|
546
|
+
elif isinstance(e, (list, tuple)):
|
|
547
|
+
embeddings.append([float(x) for x in e if isinstance(x, (int, float))])
|
|
548
|
+
elif isinstance(e, (int, float)):
|
|
549
|
+
embeddings.append([float(e)])
|
|
550
|
+
else:
|
|
551
|
+
embeddings.append([])
|
|
552
|
+
distances = distances_list[0]
|
|
553
|
+
|
|
554
|
+
for idx, distance in enumerate(distances):
|
|
555
|
+
if idx < len(metadata):
|
|
556
|
+
metadata[idx]["distances"] = distance
|
|
215
557
|
|
|
216
558
|
try:
|
|
217
|
-
|
|
218
|
-
|
|
559
|
+
for idx, (id_, doc_metadata, document) in enumerate(zip(ids, metadata, documents)):
|
|
560
|
+
# Extract the fields we added to metadata
|
|
561
|
+
name_val = doc_metadata.pop("name", None)
|
|
562
|
+
content_id_val = doc_metadata.pop("content_id", None)
|
|
563
|
+
|
|
564
|
+
# Convert types to match Document constructor expectations
|
|
565
|
+
name = str(name_val) if name_val is not None and not isinstance(name_val, str) else name_val
|
|
566
|
+
content_id = (
|
|
567
|
+
str(content_id_val)
|
|
568
|
+
if content_id_val is not None and not isinstance(content_id_val, str)
|
|
569
|
+
else content_id_val
|
|
570
|
+
)
|
|
571
|
+
content = str(document) if document is not None else ""
|
|
572
|
+
embedding = embeddings[idx] if idx < len(embeddings) else None
|
|
573
|
+
|
|
219
574
|
search_results.append(
|
|
220
575
|
Document(
|
|
221
576
|
id=id_,
|
|
222
|
-
|
|
223
|
-
|
|
224
|
-
|
|
577
|
+
name=name,
|
|
578
|
+
meta_data=doc_metadata,
|
|
579
|
+
content=content,
|
|
580
|
+
embedding=embedding,
|
|
581
|
+
content_id=content_id,
|
|
225
582
|
)
|
|
226
583
|
)
|
|
227
584
|
except Exception as e:
|
|
@@ -230,28 +587,68 @@ class ChromaDb(VectorDb):
|
|
|
230
587
|
if self.reranker:
|
|
231
588
|
search_results = self.reranker.rerank(query=query, documents=search_results)
|
|
232
589
|
|
|
590
|
+
log_info(f"Found {len(search_results)} documents")
|
|
233
591
|
return search_results
|
|
234
592
|
|
|
593
|
+
def _convert_filters(self, filters: Dict[str, Any]) -> Dict[str, Any]:
|
|
594
|
+
"""Convert simple filters to ChromaDB's filter format.
|
|
595
|
+
|
|
596
|
+
Handles conversion of simple key-value filters to ChromaDB's operator format
|
|
597
|
+
when needed.
|
|
598
|
+
"""
|
|
599
|
+
if not filters:
|
|
600
|
+
return {}
|
|
601
|
+
|
|
602
|
+
# If filters already use ChromaDB operators ($eq, $ne, etc.), return as is
|
|
603
|
+
if any(key.startswith("$") for key in filters.keys()):
|
|
604
|
+
return filters
|
|
605
|
+
|
|
606
|
+
# Convert simple key-value pairs to ChromaDB's format
|
|
607
|
+
converted = {}
|
|
608
|
+
for key, value in filters.items():
|
|
609
|
+
if isinstance(value, (list, tuple)):
|
|
610
|
+
# Convert lists to $in operator
|
|
611
|
+
converted[key] = {"$in": list(value)}
|
|
612
|
+
else:
|
|
613
|
+
# Convert simple equality to $eq
|
|
614
|
+
converted[key] = {"$eq": value}
|
|
615
|
+
|
|
616
|
+
return converted
|
|
617
|
+
|
|
618
|
+
async def async_search(
|
|
619
|
+
self, query: str, limit: int = 5, filters: Optional[Union[Dict[str, Any], List[FilterExpr]]] = None
|
|
620
|
+
) -> List[Document]:
|
|
621
|
+
"""Search asynchronously by running in a thread."""
|
|
622
|
+
return await asyncio.to_thread(self.search, query, limit, filters)
|
|
623
|
+
|
|
235
624
|
def drop(self) -> None:
|
|
236
625
|
"""Delete the collection."""
|
|
237
626
|
if self.exists():
|
|
238
|
-
|
|
239
|
-
self.client.delete_collection(name=self.
|
|
627
|
+
log_debug(f"Deleting collection: {self.collection_name}")
|
|
628
|
+
self.client.delete_collection(name=self.collection_name)
|
|
629
|
+
|
|
630
|
+
async def async_drop(self) -> None:
|
|
631
|
+
"""Drop the collection asynchronously by running in a thread."""
|
|
632
|
+
await asyncio.to_thread(self.drop)
|
|
240
633
|
|
|
241
634
|
def exists(self) -> bool:
|
|
242
635
|
"""Check if the collection exists."""
|
|
243
636
|
try:
|
|
244
|
-
self.client.get_collection(name=self.
|
|
637
|
+
self.client.get_collection(name=self.collection_name)
|
|
245
638
|
return True
|
|
246
639
|
except Exception as e:
|
|
247
|
-
|
|
640
|
+
log_debug(f"Collection does not exist: {e}")
|
|
248
641
|
return False
|
|
249
642
|
|
|
643
|
+
async def async_exists(self) -> bool:
|
|
644
|
+
"""Check if collection exists asynchronously by running in a thread."""
|
|
645
|
+
return await asyncio.to_thread(self.exists)
|
|
646
|
+
|
|
250
647
|
def get_count(self) -> int:
|
|
251
648
|
"""Get the count of documents in the collection."""
|
|
252
649
|
if self.exists():
|
|
253
650
|
try:
|
|
254
|
-
collection: Collection = self.client.get_collection(name=self.
|
|
651
|
+
collection: Collection = self.client.get_collection(name=self.collection_name)
|
|
255
652
|
return collection.count()
|
|
256
653
|
except Exception as e:
|
|
257
654
|
logger.error(f"Error getting count: {e}")
|
|
@@ -262,8 +659,276 @@ class ChromaDb(VectorDb):
|
|
|
262
659
|
|
|
263
660
|
def delete(self) -> bool:
|
|
264
661
|
try:
|
|
265
|
-
self.client.delete_collection(name=self.
|
|
662
|
+
self.client.delete_collection(name=self.collection_name)
|
|
266
663
|
return True
|
|
267
664
|
except Exception as e:
|
|
268
665
|
logger.error(f"Error clearing collection: {e}")
|
|
269
666
|
return False
|
|
667
|
+
|
|
668
|
+
def delete_by_id(self, id: str) -> bool:
|
|
669
|
+
"""Delete document by ID."""
|
|
670
|
+
if not self.client:
|
|
671
|
+
logger.error("Client not initialized")
|
|
672
|
+
return False
|
|
673
|
+
|
|
674
|
+
try:
|
|
675
|
+
collection: Collection = self.client.get_collection(name=self.collection_name)
|
|
676
|
+
|
|
677
|
+
# Check if document exists
|
|
678
|
+
if not self.id_exists(id):
|
|
679
|
+
log_info(f"Document with ID '{id}' not found")
|
|
680
|
+
return False
|
|
681
|
+
|
|
682
|
+
# Delete the document
|
|
683
|
+
collection.delete(ids=[id])
|
|
684
|
+
log_info(f"Deleted document with ID '{id}'")
|
|
685
|
+
return True
|
|
686
|
+
except Exception as e:
|
|
687
|
+
logger.error(f"Error deleting document by ID '{id}': {e}")
|
|
688
|
+
return False
|
|
689
|
+
|
|
690
|
+
def delete_by_name(self, name: str) -> bool:
|
|
691
|
+
"""Delete documents by name."""
|
|
692
|
+
if not self.client:
|
|
693
|
+
logger.error("Client not initialized")
|
|
694
|
+
return False
|
|
695
|
+
|
|
696
|
+
try:
|
|
697
|
+
collection: Collection = self.client.get_collection(name=self.collection_name)
|
|
698
|
+
|
|
699
|
+
# Find all documents with the given name
|
|
700
|
+
result = collection.get(where=cast(Any, {"name": {"$eq": name}}))
|
|
701
|
+
ids_to_delete = result.get("ids", [])
|
|
702
|
+
|
|
703
|
+
if not ids_to_delete:
|
|
704
|
+
log_info(f"No documents found with name '{name}'")
|
|
705
|
+
return False
|
|
706
|
+
|
|
707
|
+
# Delete all matching documents
|
|
708
|
+
collection.delete(ids=ids_to_delete)
|
|
709
|
+
log_info(f"Deleted {len(ids_to_delete)} documents with name '{name}'")
|
|
710
|
+
return True
|
|
711
|
+
except Exception as e:
|
|
712
|
+
logger.error(f"Error deleting documents by name '{name}': {e}")
|
|
713
|
+
return False
|
|
714
|
+
|
|
715
|
+
def delete_by_metadata(self, metadata: Dict[str, Any]) -> bool:
|
|
716
|
+
"""Delete documents by metadata."""
|
|
717
|
+
if not self.client:
|
|
718
|
+
logger.error("Client not initialized")
|
|
719
|
+
return False
|
|
720
|
+
|
|
721
|
+
try:
|
|
722
|
+
collection: Collection = self.client.get_collection(name=self.collection_name)
|
|
723
|
+
|
|
724
|
+
# Build where clause for metadata filtering
|
|
725
|
+
where_clause = {}
|
|
726
|
+
for key, value in metadata.items():
|
|
727
|
+
where_clause[key] = {"$eq": value}
|
|
728
|
+
|
|
729
|
+
# Find all documents with the matching metadata
|
|
730
|
+
result = collection.get(where=cast(Any, where_clause))
|
|
731
|
+
ids_to_delete = result.get("ids", [])
|
|
732
|
+
|
|
733
|
+
if not ids_to_delete:
|
|
734
|
+
log_info(f"No documents found with metadata '{metadata}'")
|
|
735
|
+
return False
|
|
736
|
+
|
|
737
|
+
# Delete all matching documents
|
|
738
|
+
collection.delete(ids=ids_to_delete)
|
|
739
|
+
log_info(f"Deleted {len(ids_to_delete)} documents with metadata '{metadata}'")
|
|
740
|
+
return True
|
|
741
|
+
except Exception as e:
|
|
742
|
+
logger.error(f"Error deleting documents by metadata '{metadata}': {e}")
|
|
743
|
+
return False
|
|
744
|
+
|
|
745
|
+
def delete_by_content_id(self, content_id: str) -> bool:
|
|
746
|
+
"""Delete documents by content ID."""
|
|
747
|
+
if not self.client:
|
|
748
|
+
logger.error("Client not initialized")
|
|
749
|
+
return False
|
|
750
|
+
|
|
751
|
+
try:
|
|
752
|
+
collection: Collection = self.client.get_collection(name=self.collection_name)
|
|
753
|
+
|
|
754
|
+
# Find all documents with the given content_id
|
|
755
|
+
result = collection.get(where=cast(Any, {"content_id": {"$eq": content_id}}))
|
|
756
|
+
ids_to_delete = result.get("ids", [])
|
|
757
|
+
|
|
758
|
+
if not ids_to_delete:
|
|
759
|
+
log_info(f"No documents found with content_id '{content_id}'")
|
|
760
|
+
return False
|
|
761
|
+
|
|
762
|
+
# Delete all matching documents
|
|
763
|
+
collection.delete(ids=ids_to_delete)
|
|
764
|
+
log_info(f"Deleted {len(ids_to_delete)} documents with content_id '{content_id}'")
|
|
765
|
+
return True
|
|
766
|
+
except Exception as e:
|
|
767
|
+
logger.error(f"Error deleting documents by content_id '{content_id}': {e}")
|
|
768
|
+
return False
|
|
769
|
+
|
|
770
|
+
def _delete_by_content_hash(self, content_hash: str) -> bool:
    """Remove every document whose ``content_hash`` metadata equals the given value.

    Args:
        content_hash (str): The content hash whose documents should be removed.

    Returns:
        bool: True when at least one document matched and the delete call
            completed. False when the client is not set, nothing matched,
            or an exception was raised (the exception is logged, not re-raised).
    """
    if not self.client:
        logger.error("Client not initialized")
        return False

    try:
        target: Collection = self.client.get_collection(name=self.collection_name)

        # Look up the IDs of all documents tagged with this content_hash
        hash_filter = cast(Any, {"content_hash": {"$eq": content_hash}})
        matching_ids = target.get(where=hash_filter).get("ids", [])

        if matching_ids:
            # Remove every match in a single call
            target.delete(ids=matching_ids)
            log_info(f"Deleted {len(matching_ids)} documents with content_hash '{content_hash}'")
            return True

        log_info(f"No documents found with content_hash '{content_hash}'")
        return False
    except Exception as e:
        logger.error(f"Error deleting documents by content_hash '{content_hash}': {e}")
        return False
|
|
794
|
+
|
|
795
|
+
def id_exists(self, id: str) -> bool:
    """Tell whether the collection holds a document with the given ID.

    Args:
        id (str): The document ID to look up.

    Returns:
        bool: True if the lookup returned at least one ID. False if it
            returned none, the client is not set, or an exception was
            raised (the exception is logged, not re-raised).
    """
    if not self.client:
        logger.error("Client not initialized")
        return False

    try:
        target: Collection = self.client.get_collection(name=self.collection_name)
        # Fetch by ID; a non-empty "ids" entry means the document is present
        lookup = target.get(ids=[id])
        return len(lookup.get("ids", [])) > 0
    except Exception as e:
        logger.error(f"Error checking if ID '{id}' exists: {e}")
        return False
|
|
819
|
+
|
|
820
|
+
def content_hash_exists(self, content_hash: str) -> bool:
|
|
821
|
+
"""Check if documents with the given content hash exist."""
|
|
822
|
+
if not self.client:
|
|
823
|
+
logger.error("Client not initialized")
|
|
824
|
+
return False
|
|
825
|
+
|
|
826
|
+
try:
|
|
827
|
+
collection: Collection = self.client.get_collection(name=self.collection_name)
|
|
828
|
+
|
|
829
|
+
# Try to query for documents with the given content_hash
|
|
830
|
+
try:
|
|
831
|
+
result = collection.get(where=cast(Any, {"content_hash": {"$eq": content_hash}}))
|
|
832
|
+
# Safely extract ids from result
|
|
833
|
+
if hasattr(result, "get") and callable(result.get):
|
|
834
|
+
found_ids = result.get("ids", [])
|
|
835
|
+
elif hasattr(result, "__getitem__") and "ids" in result:
|
|
836
|
+
found_ids = result["ids"]
|
|
837
|
+
else:
|
|
838
|
+
found_ids = []
|
|
839
|
+
|
|
840
|
+
# Return True if any documents were found
|
|
841
|
+
if isinstance(found_ids, (list, tuple)):
|
|
842
|
+
return len(found_ids) > 0
|
|
843
|
+
elif isinstance(found_ids, int):
|
|
844
|
+
# Some ChromaDB versions might return a count instead of a list
|
|
845
|
+
return found_ids > 0
|
|
846
|
+
else:
|
|
847
|
+
return False
|
|
848
|
+
|
|
849
|
+
except TypeError as te:
|
|
850
|
+
if "object of type 'int' has no len()" in str(te):
|
|
851
|
+
# Known issue with ChromaDB 0.5.0 - internal bug
|
|
852
|
+
# As a workaround, assume content doesn't exist to allow processing to continue
|
|
853
|
+
logger.warning(
|
|
854
|
+
f"ChromaDB internal error (version 0.5.0 bug): {te}. Assuming content_hash '{content_hash}' does not exist."
|
|
855
|
+
)
|
|
856
|
+
return False
|
|
857
|
+
else:
|
|
858
|
+
raise te
|
|
859
|
+
|
|
860
|
+
except Exception as e:
|
|
861
|
+
logger.error(f"Error checking if content_hash '{content_hash}' exists: {e}")
|
|
862
|
+
return False
|
|
863
|
+
|
|
864
|
+
def update_metadata(self, content_id: str, metadata: Dict[str, Any]) -> None:
    """
    Update the metadata for documents with the given content_id.

    The new metadata is flattened with ``self._flatten_metadata`` and merged
    over each matching document's current metadata (new keys win). The merged
    result is then written back with ``collection.update``.

    Args:
        content_id (str): The content ID to update
        metadata (Dict[str, Any]): The metadata to update

    Raises:
        Exception: Any error raised during lookup or update is logged and
            re-raised. The one exception is a TypeError whose message
            contains "object of type 'int' has no len()", which is logged
            as a warning and swallowed.
    """
    try:
        if not self.client:
            logger.error("Client not initialized")
            return

        collection: Collection = self.client.get_collection(name=self.collection_name)

        # Find documents with the given content_id
        try:
            result = collection.get(where=cast(Any, {"content_id": {"$eq": content_id}}))

            # Extract IDs and current metadata
            if hasattr(result, "get") and callable(result.get):
                ids = result.get("ids", [])
                current_metadatas = result.get("metadatas", [])
            elif hasattr(result, "__getitem__"):
                # This branch is only reached when there is no callable
                # .get(), so the values must be read by indexing.
                ids = result["ids"] if "ids" in result else []
                current_metadatas = result["metadatas"] if "metadatas" in result else []
            else:
                ids = []
                current_metadatas = []

            if not ids:
                logger.debug(f"No documents found with content_id: {content_id}")
                return

            # Flatten the new metadata first
            flattened_new_metadata = self._flatten_metadata(metadata)

            # Build exactly one merged metadata dict per ID so the update call
            # always receives lists of equal length, even when the lookup
            # returned no (or fewer) metadata entries.
            existing_metadatas = list(current_metadatas or [])
            updated_metadatas: List[Dict[str, Any]] = []
            for position in range(len(ids)):
                current_meta = existing_metadatas[position] if position < len(existing_metadatas) else None
                meta_dict: Dict[str, Any] = {} if current_meta is None else dict(current_meta)

                # Update with flattened metadata
                meta_dict.update(flattened_new_metadata)

                # Drop empty keys and empty values (None, "", empty containers),
                # but keep falsy booleans and numbers: False / 0 / 0.0 are real
                # values and must not be silently discarded.
                updated_metadatas.append(
                    {
                        key: value
                        for key, value in meta_dict.items()
                        if key and (value or isinstance(value, (bool, int, float)))
                    }
                )

            # Convert to the expected type for ChromaDB
            chroma_metadatas = cast(List[Mapping[str, Union[str, int, float, bool]]], updated_metadatas)
            collection.update(ids=ids, metadatas=chroma_metadatas)  # type: ignore
            logger.debug(f"Updated metadata for {len(ids)} documents with content_id: {content_id}")

        except TypeError as te:
            if "object of type 'int' has no len()" in str(te):
                logger.warning(
                    f"ChromaDB internal error (version 0.5.0 bug): {te}. Cannot update metadata for content_id '{content_id}'."
                )
                return
            # Any other TypeError goes to the outer handler, which logs and re-raises
            raise

    except Exception as e:
        logger.error(f"Error updating metadata for content_id '{content_id}': {e}")
        raise
|
|
931
|
+
|
|
932
|
+
def get_supported_search_types(self) -> List[str]:
    """Return the search types this vector database supports (always an empty list)."""
    supported: List[str] = []  # ChromaDb doesn't use SearchType enum
    return supported
|