agno 2.2.13__py3-none-any.whl → 2.4.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- agno/agent/__init__.py +6 -0
- agno/agent/agent.py +5252 -3145
- agno/agent/remote.py +525 -0
- agno/api/api.py +2 -0
- agno/client/__init__.py +3 -0
- agno/client/a2a/__init__.py +10 -0
- agno/client/a2a/client.py +554 -0
- agno/client/a2a/schemas.py +112 -0
- agno/client/a2a/utils.py +369 -0
- agno/client/os.py +2669 -0
- agno/compression/__init__.py +3 -0
- agno/compression/manager.py +247 -0
- agno/culture/manager.py +2 -2
- agno/db/base.py +927 -6
- agno/db/dynamo/dynamo.py +788 -2
- agno/db/dynamo/schemas.py +128 -0
- agno/db/dynamo/utils.py +26 -3
- agno/db/firestore/firestore.py +674 -50
- agno/db/firestore/schemas.py +41 -0
- agno/db/firestore/utils.py +25 -10
- agno/db/gcs_json/gcs_json_db.py +506 -3
- agno/db/gcs_json/utils.py +14 -2
- agno/db/in_memory/in_memory_db.py +203 -4
- agno/db/in_memory/utils.py +14 -2
- agno/db/json/json_db.py +498 -2
- agno/db/json/utils.py +14 -2
- agno/db/migrations/manager.py +199 -0
- agno/db/migrations/utils.py +19 -0
- agno/db/migrations/v1_to_v2.py +54 -16
- agno/db/migrations/versions/__init__.py +0 -0
- agno/db/migrations/versions/v2_3_0.py +977 -0
- agno/db/mongo/async_mongo.py +1013 -39
- agno/db/mongo/mongo.py +684 -4
- agno/db/mongo/schemas.py +48 -0
- agno/db/mongo/utils.py +17 -0
- agno/db/mysql/__init__.py +2 -1
- agno/db/mysql/async_mysql.py +2958 -0
- agno/db/mysql/mysql.py +722 -53
- agno/db/mysql/schemas.py +77 -11
- agno/db/mysql/utils.py +151 -8
- agno/db/postgres/async_postgres.py +1254 -137
- agno/db/postgres/postgres.py +2316 -93
- agno/db/postgres/schemas.py +153 -21
- agno/db/postgres/utils.py +22 -7
- agno/db/redis/redis.py +531 -3
- agno/db/redis/schemas.py +36 -0
- agno/db/redis/utils.py +31 -15
- agno/db/schemas/evals.py +1 -0
- agno/db/schemas/memory.py +20 -9
- agno/db/singlestore/schemas.py +70 -1
- agno/db/singlestore/singlestore.py +737 -74
- agno/db/singlestore/utils.py +13 -3
- agno/db/sqlite/async_sqlite.py +1069 -89
- agno/db/sqlite/schemas.py +133 -1
- agno/db/sqlite/sqlite.py +2203 -165
- agno/db/sqlite/utils.py +21 -11
- agno/db/surrealdb/models.py +25 -0
- agno/db/surrealdb/surrealdb.py +603 -1
- agno/db/utils.py +60 -0
- agno/eval/__init__.py +26 -3
- agno/eval/accuracy.py +25 -12
- agno/eval/agent_as_judge.py +871 -0
- agno/eval/base.py +29 -0
- agno/eval/performance.py +10 -4
- agno/eval/reliability.py +22 -13
- agno/eval/utils.py +2 -1
- agno/exceptions.py +42 -0
- agno/hooks/__init__.py +3 -0
- agno/hooks/decorator.py +164 -0
- agno/integrations/discord/client.py +13 -2
- agno/knowledge/__init__.py +4 -0
- agno/knowledge/chunking/code.py +90 -0
- agno/knowledge/chunking/document.py +65 -4
- agno/knowledge/chunking/fixed.py +4 -1
- agno/knowledge/chunking/markdown.py +102 -11
- agno/knowledge/chunking/recursive.py +2 -2
- agno/knowledge/chunking/semantic.py +130 -48
- agno/knowledge/chunking/strategy.py +18 -0
- agno/knowledge/embedder/azure_openai.py +0 -1
- agno/knowledge/embedder/google.py +1 -1
- agno/knowledge/embedder/mistral.py +1 -1
- agno/knowledge/embedder/nebius.py +1 -1
- agno/knowledge/embedder/openai.py +16 -12
- agno/knowledge/filesystem.py +412 -0
- agno/knowledge/knowledge.py +4261 -1199
- agno/knowledge/protocol.py +134 -0
- agno/knowledge/reader/arxiv_reader.py +3 -2
- agno/knowledge/reader/base.py +9 -7
- agno/knowledge/reader/csv_reader.py +91 -42
- agno/knowledge/reader/docx_reader.py +9 -10
- agno/knowledge/reader/excel_reader.py +225 -0
- agno/knowledge/reader/field_labeled_csv_reader.py +38 -48
- agno/knowledge/reader/firecrawl_reader.py +3 -2
- agno/knowledge/reader/json_reader.py +16 -22
- agno/knowledge/reader/markdown_reader.py +15 -14
- agno/knowledge/reader/pdf_reader.py +33 -28
- agno/knowledge/reader/pptx_reader.py +9 -10
- agno/knowledge/reader/reader_factory.py +135 -1
- agno/knowledge/reader/s3_reader.py +8 -16
- agno/knowledge/reader/tavily_reader.py +3 -3
- agno/knowledge/reader/text_reader.py +15 -14
- agno/knowledge/reader/utils/__init__.py +17 -0
- agno/knowledge/reader/utils/spreadsheet.py +114 -0
- agno/knowledge/reader/web_search_reader.py +8 -65
- agno/knowledge/reader/website_reader.py +16 -13
- agno/knowledge/reader/wikipedia_reader.py +36 -3
- agno/knowledge/reader/youtube_reader.py +3 -2
- agno/knowledge/remote_content/__init__.py +33 -0
- agno/knowledge/remote_content/config.py +266 -0
- agno/knowledge/remote_content/remote_content.py +105 -17
- agno/knowledge/utils.py +76 -22
- agno/learn/__init__.py +71 -0
- agno/learn/config.py +463 -0
- agno/learn/curate.py +185 -0
- agno/learn/machine.py +725 -0
- agno/learn/schemas.py +1114 -0
- agno/learn/stores/__init__.py +38 -0
- agno/learn/stores/decision_log.py +1156 -0
- agno/learn/stores/entity_memory.py +3275 -0
- agno/learn/stores/learned_knowledge.py +1583 -0
- agno/learn/stores/protocol.py +117 -0
- agno/learn/stores/session_context.py +1217 -0
- agno/learn/stores/user_memory.py +1495 -0
- agno/learn/stores/user_profile.py +1220 -0
- agno/learn/utils.py +209 -0
- agno/media.py +22 -6
- agno/memory/__init__.py +14 -1
- agno/memory/manager.py +223 -8
- agno/memory/strategies/__init__.py +15 -0
- agno/memory/strategies/base.py +66 -0
- agno/memory/strategies/summarize.py +196 -0
- agno/memory/strategies/types.py +37 -0
- agno/models/aimlapi/aimlapi.py +17 -0
- agno/models/anthropic/claude.py +434 -59
- agno/models/aws/bedrock.py +121 -20
- agno/models/aws/claude.py +131 -274
- agno/models/azure/ai_foundry.py +10 -6
- agno/models/azure/openai_chat.py +33 -10
- agno/models/base.py +1162 -561
- agno/models/cerebras/cerebras.py +120 -24
- agno/models/cerebras/cerebras_openai.py +21 -2
- agno/models/cohere/chat.py +65 -6
- agno/models/cometapi/cometapi.py +18 -1
- agno/models/dashscope/dashscope.py +2 -3
- agno/models/deepinfra/deepinfra.py +18 -1
- agno/models/deepseek/deepseek.py +69 -3
- agno/models/fireworks/fireworks.py +18 -1
- agno/models/google/gemini.py +959 -89
- agno/models/google/utils.py +22 -0
- agno/models/groq/groq.py +48 -18
- agno/models/huggingface/huggingface.py +17 -6
- agno/models/ibm/watsonx.py +16 -6
- agno/models/internlm/internlm.py +18 -1
- agno/models/langdb/langdb.py +13 -1
- agno/models/litellm/chat.py +88 -9
- agno/models/litellm/litellm_openai.py +18 -1
- agno/models/message.py +24 -5
- agno/models/meta/llama.py +40 -13
- agno/models/meta/llama_openai.py +22 -21
- agno/models/metrics.py +12 -0
- agno/models/mistral/mistral.py +8 -4
- agno/models/n1n/__init__.py +3 -0
- agno/models/n1n/n1n.py +57 -0
- agno/models/nebius/nebius.py +6 -7
- agno/models/nvidia/nvidia.py +20 -3
- agno/models/ollama/__init__.py +2 -0
- agno/models/ollama/chat.py +17 -6
- agno/models/ollama/responses.py +100 -0
- agno/models/openai/__init__.py +2 -0
- agno/models/openai/chat.py +117 -26
- agno/models/openai/open_responses.py +46 -0
- agno/models/openai/responses.py +110 -32
- agno/models/openrouter/__init__.py +2 -0
- agno/models/openrouter/openrouter.py +67 -2
- agno/models/openrouter/responses.py +146 -0
- agno/models/perplexity/perplexity.py +19 -1
- agno/models/portkey/portkey.py +7 -6
- agno/models/requesty/requesty.py +19 -2
- agno/models/response.py +20 -2
- agno/models/sambanova/sambanova.py +20 -3
- agno/models/siliconflow/siliconflow.py +19 -2
- agno/models/together/together.py +20 -3
- agno/models/vercel/v0.py +20 -3
- agno/models/vertexai/claude.py +124 -4
- agno/models/vllm/vllm.py +19 -14
- agno/models/xai/xai.py +19 -2
- agno/os/app.py +467 -137
- agno/os/auth.py +253 -5
- agno/os/config.py +22 -0
- agno/os/interfaces/a2a/a2a.py +7 -6
- agno/os/interfaces/a2a/router.py +635 -26
- agno/os/interfaces/a2a/utils.py +32 -33
- agno/os/interfaces/agui/agui.py +5 -3
- agno/os/interfaces/agui/router.py +26 -16
- agno/os/interfaces/agui/utils.py +97 -57
- agno/os/interfaces/base.py +7 -7
- agno/os/interfaces/slack/router.py +16 -7
- agno/os/interfaces/slack/slack.py +7 -7
- agno/os/interfaces/whatsapp/router.py +35 -7
- agno/os/interfaces/whatsapp/security.py +3 -1
- agno/os/interfaces/whatsapp/whatsapp.py +11 -8
- agno/os/managers.py +326 -0
- agno/os/mcp.py +652 -79
- agno/os/middleware/__init__.py +4 -0
- agno/os/middleware/jwt.py +718 -115
- agno/os/middleware/trailing_slash.py +27 -0
- agno/os/router.py +105 -1558
- agno/os/routers/agents/__init__.py +3 -0
- agno/os/routers/agents/router.py +655 -0
- agno/os/routers/agents/schema.py +288 -0
- agno/os/routers/components/__init__.py +3 -0
- agno/os/routers/components/components.py +475 -0
- agno/os/routers/database.py +155 -0
- agno/os/routers/evals/evals.py +111 -18
- agno/os/routers/evals/schemas.py +38 -5
- agno/os/routers/evals/utils.py +80 -11
- agno/os/routers/health.py +3 -3
- agno/os/routers/knowledge/knowledge.py +284 -35
- agno/os/routers/knowledge/schemas.py +14 -2
- agno/os/routers/memory/memory.py +274 -11
- agno/os/routers/memory/schemas.py +44 -3
- agno/os/routers/metrics/metrics.py +30 -15
- agno/os/routers/metrics/schemas.py +10 -6
- agno/os/routers/registry/__init__.py +3 -0
- agno/os/routers/registry/registry.py +337 -0
- agno/os/routers/session/session.py +143 -14
- agno/os/routers/teams/__init__.py +3 -0
- agno/os/routers/teams/router.py +550 -0
- agno/os/routers/teams/schema.py +280 -0
- agno/os/routers/traces/__init__.py +3 -0
- agno/os/routers/traces/schemas.py +414 -0
- agno/os/routers/traces/traces.py +549 -0
- agno/os/routers/workflows/__init__.py +3 -0
- agno/os/routers/workflows/router.py +757 -0
- agno/os/routers/workflows/schema.py +139 -0
- agno/os/schema.py +157 -584
- agno/os/scopes.py +469 -0
- agno/os/settings.py +3 -0
- agno/os/utils.py +574 -185
- agno/reasoning/anthropic.py +85 -1
- agno/reasoning/azure_ai_foundry.py +93 -1
- agno/reasoning/deepseek.py +102 -2
- agno/reasoning/default.py +6 -7
- agno/reasoning/gemini.py +87 -3
- agno/reasoning/groq.py +109 -2
- agno/reasoning/helpers.py +6 -7
- agno/reasoning/manager.py +1238 -0
- agno/reasoning/ollama.py +93 -1
- agno/reasoning/openai.py +115 -1
- agno/reasoning/vertexai.py +85 -1
- agno/registry/__init__.py +3 -0
- agno/registry/registry.py +68 -0
- agno/remote/__init__.py +3 -0
- agno/remote/base.py +581 -0
- agno/run/__init__.py +2 -4
- agno/run/agent.py +134 -19
- agno/run/base.py +49 -1
- agno/run/cancel.py +65 -52
- agno/run/cancellation_management/__init__.py +9 -0
- agno/run/cancellation_management/base.py +78 -0
- agno/run/cancellation_management/in_memory_cancellation_manager.py +100 -0
- agno/run/cancellation_management/redis_cancellation_manager.py +236 -0
- agno/run/requirement.py +181 -0
- agno/run/team.py +111 -19
- agno/run/workflow.py +2 -1
- agno/session/agent.py +57 -92
- agno/session/summary.py +1 -1
- agno/session/team.py +62 -115
- agno/session/workflow.py +353 -57
- agno/skills/__init__.py +17 -0
- agno/skills/agent_skills.py +377 -0
- agno/skills/errors.py +32 -0
- agno/skills/loaders/__init__.py +4 -0
- agno/skills/loaders/base.py +27 -0
- agno/skills/loaders/local.py +216 -0
- agno/skills/skill.py +65 -0
- agno/skills/utils.py +107 -0
- agno/skills/validator.py +277 -0
- agno/table.py +10 -0
- agno/team/__init__.py +5 -1
- agno/team/remote.py +447 -0
- agno/team/team.py +3769 -2202
- agno/tools/brandfetch.py +27 -18
- agno/tools/browserbase.py +225 -16
- agno/tools/crawl4ai.py +3 -0
- agno/tools/duckduckgo.py +25 -71
- agno/tools/exa.py +0 -21
- agno/tools/file.py +14 -13
- agno/tools/file_generation.py +12 -6
- agno/tools/firecrawl.py +15 -7
- agno/tools/function.py +94 -113
- agno/tools/google_bigquery.py +11 -2
- agno/tools/google_drive.py +4 -3
- agno/tools/knowledge.py +9 -4
- agno/tools/mcp/mcp.py +301 -18
- agno/tools/mcp/multi_mcp.py +269 -14
- agno/tools/mem0.py +11 -10
- agno/tools/memory.py +47 -46
- agno/tools/mlx_transcribe.py +10 -7
- agno/tools/models/nebius.py +5 -5
- agno/tools/models_labs.py +20 -10
- agno/tools/nano_banana.py +151 -0
- agno/tools/parallel.py +0 -7
- agno/tools/postgres.py +76 -36
- agno/tools/python.py +14 -6
- agno/tools/reasoning.py +30 -23
- agno/tools/redshift.py +406 -0
- agno/tools/shopify.py +1519 -0
- agno/tools/spotify.py +919 -0
- agno/tools/tavily.py +4 -1
- agno/tools/toolkit.py +253 -18
- agno/tools/websearch.py +93 -0
- agno/tools/website.py +1 -1
- agno/tools/wikipedia.py +1 -1
- agno/tools/workflow.py +56 -48
- agno/tools/yfinance.py +12 -11
- agno/tracing/__init__.py +12 -0
- agno/tracing/exporter.py +161 -0
- agno/tracing/schemas.py +276 -0
- agno/tracing/setup.py +112 -0
- agno/utils/agent.py +251 -10
- agno/utils/cryptography.py +22 -0
- agno/utils/dttm.py +33 -0
- agno/utils/events.py +264 -7
- agno/utils/hooks.py +111 -3
- agno/utils/http.py +161 -2
- agno/utils/mcp.py +49 -8
- agno/utils/media.py +22 -1
- agno/utils/models/ai_foundry.py +9 -2
- agno/utils/models/claude.py +20 -5
- agno/utils/models/cohere.py +9 -2
- agno/utils/models/llama.py +9 -2
- agno/utils/models/mistral.py +4 -2
- agno/utils/os.py +0 -0
- agno/utils/print_response/agent.py +99 -16
- agno/utils/print_response/team.py +223 -24
- agno/utils/print_response/workflow.py +0 -2
- agno/utils/prompts.py +8 -6
- agno/utils/remote.py +23 -0
- agno/utils/response.py +1 -13
- agno/utils/string.py +91 -2
- agno/utils/team.py +62 -12
- agno/utils/tokens.py +657 -0
- agno/vectordb/base.py +15 -2
- agno/vectordb/cassandra/cassandra.py +1 -1
- agno/vectordb/chroma/__init__.py +2 -1
- agno/vectordb/chroma/chromadb.py +468 -23
- agno/vectordb/clickhouse/clickhousedb.py +1 -1
- agno/vectordb/couchbase/couchbase.py +6 -2
- agno/vectordb/lancedb/lance_db.py +7 -38
- agno/vectordb/lightrag/lightrag.py +7 -6
- agno/vectordb/milvus/milvus.py +118 -84
- agno/vectordb/mongodb/__init__.py +2 -1
- agno/vectordb/mongodb/mongodb.py +14 -31
- agno/vectordb/pgvector/pgvector.py +120 -66
- agno/vectordb/pineconedb/pineconedb.py +2 -19
- agno/vectordb/qdrant/__init__.py +2 -1
- agno/vectordb/qdrant/qdrant.py +33 -56
- agno/vectordb/redis/__init__.py +2 -1
- agno/vectordb/redis/redisdb.py +19 -31
- agno/vectordb/singlestore/singlestore.py +17 -9
- agno/vectordb/surrealdb/surrealdb.py +2 -38
- agno/vectordb/weaviate/__init__.py +2 -1
- agno/vectordb/weaviate/weaviate.py +7 -3
- agno/workflow/__init__.py +5 -1
- agno/workflow/agent.py +2 -2
- agno/workflow/condition.py +12 -10
- agno/workflow/loop.py +28 -9
- agno/workflow/parallel.py +21 -13
- agno/workflow/remote.py +362 -0
- agno/workflow/router.py +12 -9
- agno/workflow/step.py +261 -36
- agno/workflow/steps.py +12 -8
- agno/workflow/types.py +40 -77
- agno/workflow/workflow.py +939 -213
- {agno-2.2.13.dist-info → agno-2.4.3.dist-info}/METADATA +134 -181
- agno-2.4.3.dist-info/RECORD +677 -0
- {agno-2.2.13.dist-info → agno-2.4.3.dist-info}/WHEEL +1 -1
- agno/tools/googlesearch.py +0 -98
- agno/tools/memori.py +0 -339
- agno-2.2.13.dist-info/RECORD +0 -575
- {agno-2.2.13.dist-info → agno-2.4.3.dist-info}/licenses/LICENSE +0 -0
- {agno-2.2.13.dist-info → agno-2.4.3.dist-info}/top_level.txt +0 -0
agno/vectordb/chroma/__init__.py
CHANGED
agno/vectordb/chroma/chromadb.py
CHANGED
@@ -1,7 +1,9 @@
 import asyncio
 import json
+from collections import defaultdict
+from concurrent.futures import ThreadPoolExecutor
 from hashlib import md5
-from typing import Any, Dict, List, Mapping, Optional, Union, cast
+from typing import Any, Dict, List, Mapping, Optional, Tuple, Union, cast

 try:
     from chromadb import Client as ChromaDbClient
@@ -20,12 +22,64 @@ from agno.knowledge.reranker.base import Reranker
 from agno.utils.log import log_debug, log_error, log_info, log_warning, logger
 from agno.vectordb.base import VectorDb
 from agno.vectordb.distance import Distance
+from agno.vectordb.search import SearchType
+
+
+def reciprocal_rank_fusion(
+    ranked_lists: List[List[Tuple[str, float]]],
+    k: int = 60,
+) -> List[Tuple[str, float]]:
+    """
+    Combine multiple ranked lists using Reciprocal Rank Fusion (RRF).
+
+    RRF is a simple yet effective method for combining multiple rankings.
+    The formula is: RRF(d) = sum(1 / (k + rank_i(d))) for each ranking i
+
+    Args:
+        ranked_lists: List of ranked results, each as [(doc_id, score), ...]
+        k: RRF constant (default 60, as per original paper by Cormack et al.)
+
+    Returns:
+        Fused ranking as [(doc_id, rrf_score), ...] sorted by score descending
+    """
+    rrf_scores: Dict[str, float] = defaultdict(float)
+
+    for ranked_list in ranked_lists:
+        for rank, (doc_id, _) in enumerate(ranked_list, start=1):
+            rrf_scores[doc_id] += 1.0 / (k + rank)
+
+    sorted_results = sorted(rrf_scores.items(), key=lambda x: x[1], reverse=True)
+    return sorted_results


 class ChromaDb(VectorDb):
+    """
+    ChromaDb class for managing vector operations with ChromaDB.
+
+    Args:
+        collection: The name of the ChromaDB collection. If not provided, derived from 'name'.
+        name: Name of the vector database. Also used as collection name if 'collection' is not provided.
+        description: Description of the vector database.
+        id: Unique identifier for this vector database instance.
+        embedder: The embedder to use when embedding the document contents.
+        distance: The distance metric to use when searching for documents.
+        path: The path to store the ChromaDB data (for persistent client).
+        persistent_client: Whether to use a persistent client.
+        search_type: The search type to use when searching for documents.
+            - SearchType.vector: Pure vector similarity search (default)
+            - SearchType.keyword: Keyword-based search using document content
+            - SearchType.hybrid: Combines vector + FTS with Reciprocal Rank Fusion
+        hybrid_rrf_k: RRF (Reciprocal Rank Fusion) constant for hybrid search.
+            Controls ranking smoothness - higher values give more weight to lower-ranked
+            results, lower values make top results more dominant. Default is 60
+            (per original RRF paper by Cormack et al.).
+        reranker: The reranker to use when reranking documents.
+        **kwargs: Additional arguments to pass to the ChromaDB client.
+    """
+
     def __init__(
         self,
-        collection: str,
+        collection: Optional[str] = None,
         name: Optional[str] = None,
         description: Optional[str] = None,
         id: Optional[str] = None,
@@ -33,12 +87,18 @@ class ChromaDb(VectorDb):
         distance: Distance = Distance.cosine,
         path: str = "tmp/chromadb",
         persistent_client: bool = False,
+        search_type: SearchType = SearchType.vector,
+        hybrid_rrf_k: int = 60,
         reranker: Optional[Reranker] = None,
         **kwargs,
     ):
-        #
-        if
-
+        # Derive collection from name if not provided
+        if collection is None:
+            if name is not None:
+                # Sanitize name: lowercase and replace spaces with underscores
+                collection = name.lower().replace(" ", "_")
+            else:
+                raise ValueError("Either 'collection' or 'name' must be provided.")

         # Dynamic ID generation based on unique identifiers
         if id is None:
@@ -57,7 +117,7 @@ class ChromaDb(VectorDb):
             from agno.knowledge.embedder.openai import OpenAIEmbedder

             embedder = OpenAIEmbedder()
-
+            log_debug("Embedder not provided, using OpenAIEmbedder as default.")
         self.embedder: Embedder = embedder
         # Distance metric
         self.distance: Distance = distance
@@ -72,6 +132,10 @@ class ChromaDb(VectorDb):
         self.persistent_client: bool = persistent_client
         self.path: str = path

+        # Search type configuration
+        self.search_type: SearchType = search_type
+        self.hybrid_rrf_k: int = hybrid_rrf_k
+
         # Reranker instance
         self.reranker: Optional[Reranker] = reranker

@@ -272,11 +336,13 @@ class ChromaDb(VectorDb):
             embed_tasks = [document.async_embed(embedder=self.embedder) for document in documents]
             await asyncio.gather(*embed_tasks, return_exceptions=True)
         except Exception as e:
-
+            logger.error(f"Error processing document: {e}")

         for document in documents:
             cleaned_content = document.content.replace("\x00", "\ufffd")
-
+            # Include content_hash in ID to ensure uniqueness across different content hashes
+            base_id = document.id or md5(cleaned_content.encode()).hexdigest()
+            doc_id = md5(f"{base_id}_{content_hash}".encode()).hexdigest()

             # Handle metadata and filters
             metadata = document.meta_data or {}
@@ -435,7 +501,9 @@ class ChromaDb(VectorDb):

         for document in documents:
             cleaned_content = document.content.replace("\x00", "\ufffd")
-
+            # Include content_hash in ID to ensure uniqueness across different content hashes
+            base_id = document.id or md5(cleaned_content.encode()).hexdigest()
+            doc_id = md5(f"{base_id}_{content_hash}".encode()).hexdigest()

             # Handle metadata and filters
             metadata = document.meta_data or {}
@@ -498,6 +566,41 @@ class ChromaDb(VectorDb):
         if isinstance(filters, list):
             log_warning("Filter Expressions are not yet supported in ChromaDB. No filters will be applied.")
             filters = None
+
+        if not self._collection:
+            self._collection = self.client.get_collection(name=self.collection_name)
+
+        # Route to appropriate search method based on search_type
+        if self.search_type == SearchType.vector:
+            search_results = self._vector_search(query, limit, filters)
+        elif self.search_type == SearchType.keyword:
+            search_results = self._keyword_search(query, limit, filters)
+        elif self.search_type == SearchType.hybrid:
+            search_results = self._hybrid_search(query, limit, filters)
+        else:
+            logger.error(f"Invalid search type '{self.search_type}'.")
+            return []
+
+        if self.reranker and search_results:
+            try:
+                search_results = self.reranker.rerank(query=query, documents=search_results)
+            except Exception as e:
+                log_warning(f"Reranker failed, returning unranked results: {e}")
+
+        log_info(f"Found {len(search_results)} documents")
+        return search_results
+
+    def _vector_search(self, query: str, limit: int = 5, filters: Optional[Dict[str, Any]] = None) -> List[Document]:
+        """Perform pure vector similarity search.
+
+        Args:
+            query (str): Query to search for.
+            limit (int): Number of results to return.
+            filters (Optional[Dict[str, Any]]): Metadata filters to apply.
+
+        Returns:
+            List[Document]: List of search results.
+        """
         query_embedding = self.embedder.get_embedding(query)
         if query_embedding is None:
             logger.error(f"Error getting embedding for Query: {query}")
@@ -512,11 +615,248 @@ class ChromaDb(VectorDb):
         result: QueryResult = self._collection.query(
             query_embeddings=query_embedding,
             n_results=limit,
-            where=where_filter,
+            where=where_filter,
             include=["metadatas", "documents", "embeddings", "distances", "uris"],
         )

-
+        return self._build_search_results(result)
+
+    def _keyword_search(self, query: str, limit: int = 5, filters: Optional[Dict[str, Any]] = None) -> List[Document]:
+        """Perform keyword-based search using document content filtering.
+
+        This uses ChromaDB's where_document filter with $contains operator
+        for basic full-text search functionality.
+
+        Args:
+            query (str): Query to search for (keywords to match in document content).
+            limit (int): Number of results to return.
+            filters (Optional[Dict[str, Any]]): Metadata filters to apply.
+
+        Returns:
+            List[Document]: List of search results.
+        """
+        if not self._collection:
+            self._collection = self.client.get_collection(name=self.collection_name)
+
+        # Convert simple filters to ChromaDB's format if needed
+        where_filter = self._convert_filters(filters) if filters else None
+
+        # Get first significant word for $contains filter
+        query_words = query.split()
+        if not query_words:
+            return []
+
+        # Use where_document to filter by document content
+        where_document: Dict[str, Any] = {"$contains": query_words[0]}
+
+        try:
+            # Get documents matching the keyword filter
+            result = self._collection.get(
+                where=where_filter,
+                where_document=cast(Any, where_document),
+                limit=limit,
+                include=["metadatas", "documents", "embeddings"],
+            )
+
+            return self._build_get_results(cast(Dict[str, Any], result), query)
+        except Exception as e:
+            logger.error(f"Error in keyword search: {e}")
+            return []
+
+    def _hybrid_search(self, query: str, limit: int = 5, filters: Optional[Dict[str, Any]] = None) -> List[Document]:
+        """Perform hybrid search combining vector similarity with full-text search using RRF.
+
+        This method combines:
+        1. Dense vector similarity search (semantic search)
+        2. Full-text search (keyword/lexical search)
+
+        Results are fused using Reciprocal Rank Fusion (RRF) for optimal ranking.
+
+        Args:
+            query (str): Query to search for.
+            limit (int): Number of results to return.
+            filters (Optional[Dict[str, Any]]): Metadata filters to apply.
+
+        Returns:
+            List[Document]: List of search results with RRF-fused ranking.
+        """
+        query_embedding = self.embedder.get_embedding(query)
+        if query_embedding is None:
+            logger.error(f"Error getting embedding for Query: {query}")
+            return []
+
+        if not self._collection:
+            self._collection = self.client.get_collection(name=self.collection_name)
+
+        # Convert simple filters to ChromaDB's format if needed
+        where_filter = self._convert_filters(filters) if filters else None
+
+        # Fetch more candidates than needed for better fusion
+        fetch_k = min(limit * 3, 100)
+
+        def dense_vector_similarity_search() -> List[Tuple[str, float]]:
+            """Dense vector similarity search."""
+            try:
+                results = self._collection.query(  # type: ignore
+                    query_embeddings=query_embedding,
+                    n_results=fetch_k,
+                    where=where_filter,
+                    include=["documents", "metadatas", "distances"],
+                )
+
+                ranked: List[Tuple[str, float]] = []
+                if results.get("ids") and results["ids"][0]:
+                    for i, doc_id in enumerate(results["ids"][0]):
+                        distance = results["distances"][0][i] if results.get("distances") else 0  # type: ignore
+                        # Convert distance to similarity score (lower distance = higher score)
+                        score = 1.0 / (1.0 + distance)
+                        ranked.append((doc_id, score))
+                return ranked
+            except Exception as e:
+                log_error(f"Error in vector search component: {e}")
+                return []
+
+        def fts_search() -> List[Tuple[str, float]]:
+            """Full-text search using ChromaDB's where_document filter."""
+            try:
+                query_words = query.split()
+                if not query_words:
+                    return []
+
+                # Use first word for $contains filter
+                fts_where_document: Dict[str, Any] = {"$contains": query_words[0]}
+
+                results = self._collection.query(  # type: ignore
+                    query_embeddings=query_embedding,
+                    n_results=fetch_k,
+                    where=where_filter,
+                    where_document=cast(Any, fts_where_document),
+                    include=["documents", "metadatas", "distances"],
+                )
+
+                ranked: List[Tuple[str, float]] = []
+                if results.get("ids") and results["ids"][0]:
+                    for i, doc_id in enumerate(results["ids"][0]):
+                        # Score based on term overlap (simple BM25-like scoring)
+                        doc = results["documents"][0][i] if results.get("documents") else ""  # type: ignore
+                        query_terms = set(query.lower().split())
+                        doc_terms = set(doc.lower().split()) if doc else set()
+                        overlap = len(query_terms & doc_terms)
+                        score = overlap / max(len(query_terms), 1)
+                        ranked.append((doc_id, score))
+
+                # Sort by score descending
+                ranked.sort(key=lambda x: x[1], reverse=True)
+                return ranked
+            except Exception as e:
+                log_error(f"Error in FTS search component: {e}")
+                return []
+
+        # Execute searches in parallel for better performance
+        with ThreadPoolExecutor(max_workers=2) as executor:
+            vector_future = executor.submit(dense_vector_similarity_search)
+            fts_future = executor.submit(fts_search)
+
+            vector_results = vector_future.result()
+            fts_results = fts_future.result()
+
+        # Apply RRF fusion
+        fused_ranking = reciprocal_rank_fusion(
+            [vector_results, fts_results],
+            k=self.hybrid_rrf_k,
+        )
+
+        # Get top IDs from fused ranking
+        top_ids = [doc_id for doc_id, _ in fused_ranking[:limit]]
+
+        if not top_ids:
+            return []
+
+        # Fetch full document data for top results
+        try:
+            full_results = self._collection.get(
+                ids=top_ids,
+                include=["documents", "metadatas", "embeddings"],
+            )
+        except Exception as e:
+            log_error(f"Error fetching full results: {e}")
+            return []
+
+        # Build lookup dict for results
+        doc_lookup: Dict[str, Dict[str, Any]] = {}
+        result_ids = full_results.get("ids", [])
+        result_docs = full_results.get("documents")
+        result_metas = full_results.get("metadatas")
+        result_embeds = full_results.get("embeddings")
+
+        for i, doc_id in enumerate(result_ids if result_ids is not None else []):
+            doc_lookup[doc_id] = {
+                "document": result_docs[i] if result_docs is not None and i < len(result_docs) else None,
+                "metadata": result_metas[i] if result_metas is not None and i < len(result_metas) else None,
+                "embedding": result_embeds[i] if result_embeds is not None and i < len(result_embeds) else None,
+            }
+
+        # Build final results in fused ranking order
+        search_results: List[Document] = []
+        rrf_scores = dict(fused_ranking)
+
+        for doc_id in top_ids:
+            if doc_id not in doc_lookup:
+                continue
+
+            doc_data = doc_lookup[doc_id]
+            doc_metadata = dict(doc_data["metadata"]) if doc_data["metadata"] else {}
+
+            # Add RRF score to metadata
+            doc_metadata["rrf_score"] = rrf_scores.get(doc_id, 0.0)
+
+            # Extract the fields we added to metadata
+            name_val = doc_metadata.pop("name", None)
+            content_id_val = doc_metadata.pop("content_id", None)
+
+            # Convert types to match Document constructor expectations
+            name = str(name_val) if name_val is not None and not isinstance(name_val, str) else name_val
+            content_id = (
+                str(content_id_val)
+                if content_id_val is not None and not isinstance(content_id_val, str)
+                else content_id_val
+            )
+            content = str(doc_data["document"]) if doc_data["document"] is not None else ""
+
+            # Process embedding
+            embedding = None
+            if doc_data["embedding"] is not None:
+                embed_data = doc_data["embedding"]
+                if hasattr(embed_data, "tolist") and callable(getattr(embed_data, "tolist", None)):
+                    try:
+                        embedding = list(cast(Any, embed_data).tolist())
+                    except (AttributeError, TypeError):
+                        embedding = list(embed_data) if isinstance(embed_data, (list, tuple)) else None
+                elif isinstance(embed_data, (list, tuple)):
+                    embedding = [float(x) for x in embed_data if isinstance(x, (int, float))]
+
+            search_results.append(
+                Document(
+                    id=doc_id,
+                    name=name,
+                    meta_data=doc_metadata,
+                    content=content,
+                    embedding=embedding,
+                    content_id=content_id,
+                )
+            )
+
+        return search_results
+
+    def _build_search_results(self, result: QueryResult) -> List[Document]:
+        """Build Document list from ChromaDB QueryResult.
+
+        Args:
+            result: The QueryResult from ChromaDB query.
+
+        Returns:
+            List[Document]: List of Document objects.
+        """
         search_results: List[Document] = []

         ids_list = result.get("ids", [[]])  # type: ignore
@@ -525,13 +865,33 @@ class ChromaDb(VectorDb):
         embeddings_list = result.get("embeddings")  # type: ignore
         distances_list = result.get("distances", [[]])  # type: ignore

-
+        # Check if we have valid results - handle numpy arrays carefully
+        if ids_list is None or len(ids_list) == 0:
+            return search_results
+        if metadata_list is None or len(metadata_list) == 0:
+            return search_results
+        if documents_list is None or len(documents_list) == 0:
+            return search_results
+        if distances_list is None or len(distances_list) == 0:
            return search_results

         ids = ids_list[0]
         metadata = [dict(m) if m else {} for m in metadata_list[0]]  # Convert to mutable dicts
         documents = documents_list[0]
-
+
+        # Handle embeddings - may be None or numpy array
+        embeddings_raw: Any = []
+        if embeddings_list is not None:
+            try:
+                if len(embeddings_list) > 0:
+                    embeddings_raw = embeddings_list[0]
+            except (TypeError, ValueError):
+                # numpy array truth value issue - try direct access
+                try:
+                    embeddings_raw = embeddings_list[0]
+                except Exception:
+                    embeddings_raw = []
+
         embeddings = []
         for e in embeddings_raw:
             if hasattr(e, "tolist") and callable(getattr(e, "tolist", None)):
@@ -545,7 +905,8 @@ class ChromaDb(VectorDb):
                 embeddings.append([float(e)])
             else:
                 embeddings.append([])
-
+
+        distances = distances_list[0] if len(distances_list) > 0 else []

         for idx, distance in enumerate(distances):
             if idx < len(metadata):
@@ -578,12 +939,95 @@ class ChromaDb(VectorDb):
                    )
                )
            except Exception as e:
-
+                log_error(f"Error building search results: {e}")

-
-
+        return search_results
+
+    def _build_get_results(self, result: Dict[str, Any], query: str = "") -> List[Document]:
+        """Build Document list from ChromaDB GetResult.
+
+        Args:
+            result: The GetResult from ChromaDB get.
+            query: The original query for scoring.
+
+        Returns:
+            List[Document]: List of Document objects.
+        """
+        search_results: List[Document] = []
+
+        ids = result.get("ids", [])
+        metadatas = result.get("metadatas", [])
+        documents = result.get("documents", [])
+        embeddings_raw = result.get("embeddings")
+
+        # Check ids safely (may be numpy array)
+        if ids is None:
+            return search_results
+        try:
+            if len(ids) == 0:
+                return search_results
+        except (TypeError, ValueError):
+            return search_results
+
+        embeddings = []
+        # Handle embeddings - may be None or numpy array
+        if embeddings_raw is not None:
+            try:
+                for e in embeddings_raw:
+                    if hasattr(e, "tolist") and callable(getattr(e, "tolist", None)):
+                        try:
+                            embeddings.append(list(cast(Any, e).tolist()))
+                        except (AttributeError, TypeError):
+                            embeddings.append(list(e) if isinstance(e, (list, tuple)) else [])
+                    elif isinstance(e, (list, tuple)):
+                        embeddings.append([float(x) for x in e if isinstance(x, (int, float))])
+                    elif isinstance(e, (int, float)):
+                        embeddings.append([float(e)])
+                    else:
+                        embeddings.append([])
+            except (TypeError, ValueError):
+                # numpy array iteration issue
+                embeddings = []
+
+        try:
+            for idx, id_ in enumerate(ids):
+                doc_metadata = dict(metadatas[idx]) if metadatas and idx < len(metadatas) and metadatas[idx] else {}
+                document = documents[idx] if documents and idx < len(documents) else ""
+
+                # Calculate simple keyword score if query provided
+                if query and document:
+                    query_terms = set(query.lower().split())
+                    doc_terms = set(document.lower().split())
+                    overlap = len(query_terms & doc_terms)
+                    doc_metadata["keyword_score"] = overlap / max(len(query_terms), 1)
+
+                # Extract the fields we added to metadata
+                name_val = doc_metadata.pop("name", None)
+                content_id_val = doc_metadata.pop("content_id", None)
+
+                # Convert types to match Document constructor expectations
+                name = str(name_val) if name_val is not None and not isinstance(name_val, str) else name_val
+                content_id = (
+                    str(content_id_val)
+                    if content_id_val is not None and not isinstance(content_id_val, str)
+                    else content_id_val
+                )
+                content = str(document) if document is not None else ""
+                embedding = embeddings[idx] if idx < len(embeddings) else None
+
+                search_results.append(
+                    Document(
+                        id=id_,
+                        name=name,
+                        meta_data=doc_metadata,
+                        content=content,
+                        embedding=embedding,
+                        content_id=content_id,
+                    )
+                )
+        except Exception as e:
+            log_error(f"Error building get results: {e}")

-        log_info(f"Found {len(search_results)} documents")
         return search_results

     def _convert_filters(self, filters: Dict[str, Any]) -> Dict[str, Any]:
@@ -888,7 +1332,7 @@ class ChromaDb(VectorDb):
            current_metadatas = []

            if not ids:
-
+                log_debug(f"No documents found with content_id: {content_id}")
                return

            # Flatten the new metadata first
@@ -908,12 +1352,13 @@ class ChromaDb(VectorDb):

            # Convert to the expected type for ChromaDB
            chroma_metadatas = cast(List[Mapping[str, Union[str, int, float, bool]]], updated_metadatas)
+            chroma_metadatas = [{k: v for k, v in m.items() if k and v} for m in chroma_metadatas]
            collection.update(ids=ids, metadatas=chroma_metadatas)  # type: ignore
-
+            log_debug(f"Updated metadata for {len(ids)} documents with content_id: {content_id}")

        except TypeError as te:
            if "object of type 'int' has no len()" in str(te):
-
+                log_warning(
                    f"ChromaDB internal error (version 0.5.0 bug): {te}. Cannot update metadata for content_id '{content_id}'."
                )
                return
@@ -921,9 +1366,9 @@ class ChromaDb(VectorDb):
            raise te

        except Exception as e:
-
+            log_error(f"Error updating metadata for content_id '{content_id}': {e}")
            raise

     def get_supported_search_types(self) -> List[str]:
         """Get the supported search types for this vector database."""
-        return [
+        return [SearchType.vector, SearchType.keyword, SearchType.hybrid]

agno/vectordb/clickhouse/clickhousedb.py
CHANGED
@@ -71,7 +71,7 @@ class Clickhouse(VectorDb):
            from agno.knowledge.embedder.openai import OpenAIEmbedder

            _embedder = OpenAIEmbedder()
-
+            log_debug("Embedder not provided, using OpenAIEmbedder as default.")
        self.embedder: Embedder = _embedder
        self.dimensions: Optional[int] = self.embedder.dimensions

agno/vectordb/couchbase/couchbase.py
CHANGED
@@ -6,7 +6,6 @@ from typing import Any, Dict, List, Optional, Union
 from agno.filters import FilterExpr
 from agno.knowledge.document import Document
 from agno.knowledge.embedder import Embedder
-from agno.knowledge.embedder.openai import OpenAIEmbedder
 from agno.utils.log import log_debug, log_info, log_warning, logger
 from agno.vectordb.base import VectorDb

@@ -62,7 +61,7 @@ class CouchbaseSearch(VectorDb):
        couchbase_connection_string: str,
        cluster_options: ClusterOptions,
        search_index: Union[str, SearchIndex],
-        embedder: Embedder =
+        embedder: Optional[Embedder] = None,
        overwrite: bool = False,
        is_global_level_index: bool = False,
        wait_until_index_ready: float = 0,
@@ -97,6 +96,11 @@ class CouchbaseSearch(VectorDb):
        self.collection_name = collection_name
        self.connection_string = couchbase_connection_string
        self.cluster_options = cluster_options
+        if embedder is None:
+            from agno.knowledge.embedder.openai import OpenAIEmbedder
+
+            embedder = OpenAIEmbedder()
+            log_debug("Embedder not provided, using OpenAIEmbedder as default.")
        self.embedder = embedder
        self.overwrite = overwrite
        self.is_global_level_index = is_global_level_index
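For illustration only (not part of the diff above): a small sketch of how the new ChromaDb hybrid-search pieces fit together. It only uses names visible in the hunks above — reciprocal_rank_fusion and the ChromaDb constructor parameters from agno/vectordb/chroma/chromadb.py, and SearchType from agno/vectordb/search — with hypothetical document ids and scores; the `from agno.vectordb.chroma import ChromaDb` import path is assumed from the package layout in the file list, and with no embedder passed the constructor falls back to OpenAIEmbedder as shown in the hunks.

    from agno.vectordb.chroma import ChromaDb  # assumed export; the class lives in agno/vectordb/chroma/chromadb.py
    from agno.vectordb.chroma.chromadb import reciprocal_rank_fusion
    from agno.vectordb.search import SearchType

    # 'collection' may now be omitted: "Recipe Knowledge" is sanitized to "recipe_knowledge".
    vector_db = ChromaDb(
        name="Recipe Knowledge",
        search_type=SearchType.hybrid,  # hybrid searches go through _hybrid_search
        hybrid_rrf_k=60,                # RRF constant used when fusing the two rankings
        persistent_client=True,
        path="tmp/chromadb",
    )

    # What _hybrid_search does with its two component rankings, on hypothetical ids/scores:
    vector_ranked = [("doc-a", 0.91), ("doc-b", 0.72), ("doc-c", 0.55)]  # dense vector component
    keyword_ranked = [("doc-b", 0.80), ("doc-d", 0.40)]                  # $contains / term-overlap component
    fused = reciprocal_rank_fusion([vector_ranked, keyword_ranked], k=60)
    # doc-b appears in both lists: 1/(60+2) + 1/(60+1) ≈ 0.0325, so it ranks first.
    # doc-a ≈ 0.0164, doc-d ≈ 0.0161 and doc-c ≈ 0.0159 follow. The per-list scores are
    # ignored by RRF; only each document's position within its own list matters.

The keyword_score that _build_get_results writes into result metadata uses the same term-overlap ratio as fts_search: len(query_terms & doc_terms) / max(len(query_terms), 1), so a three-word query matching two words in a document scores roughly 0.67.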