hexdag 0.5.0.dev1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- hexdag/__init__.py +116 -0
- hexdag/__main__.py +30 -0
- hexdag/adapters/executors/__init__.py +5 -0
- hexdag/adapters/executors/local_executor.py +316 -0
- hexdag/builtin/__init__.py +6 -0
- hexdag/builtin/adapters/__init__.py +51 -0
- hexdag/builtin/adapters/anthropic/__init__.py +5 -0
- hexdag/builtin/adapters/anthropic/anthropic_adapter.py +151 -0
- hexdag/builtin/adapters/database/__init__.py +6 -0
- hexdag/builtin/adapters/database/csv/csv_adapter.py +249 -0
- hexdag/builtin/adapters/database/pgvector/__init__.py +5 -0
- hexdag/builtin/adapters/database/pgvector/pgvector_adapter.py +478 -0
- hexdag/builtin/adapters/database/sqlalchemy/sqlalchemy_adapter.py +252 -0
- hexdag/builtin/adapters/database/sqlite/__init__.py +5 -0
- hexdag/builtin/adapters/database/sqlite/sqlite_adapter.py +410 -0
- hexdag/builtin/adapters/local/README.md +59 -0
- hexdag/builtin/adapters/local/__init__.py +7 -0
- hexdag/builtin/adapters/local/local_observer_manager.py +696 -0
- hexdag/builtin/adapters/memory/__init__.py +47 -0
- hexdag/builtin/adapters/memory/file_memory_adapter.py +297 -0
- hexdag/builtin/adapters/memory/in_memory_memory.py +216 -0
- hexdag/builtin/adapters/memory/schemas.py +57 -0
- hexdag/builtin/adapters/memory/session_memory.py +178 -0
- hexdag/builtin/adapters/memory/sqlite_memory_adapter.py +215 -0
- hexdag/builtin/adapters/memory/state_memory.py +280 -0
- hexdag/builtin/adapters/mock/README.md +89 -0
- hexdag/builtin/adapters/mock/__init__.py +15 -0
- hexdag/builtin/adapters/mock/hexdag.toml +50 -0
- hexdag/builtin/adapters/mock/mock_database.py +225 -0
- hexdag/builtin/adapters/mock/mock_embedding.py +223 -0
- hexdag/builtin/adapters/mock/mock_llm.py +177 -0
- hexdag/builtin/adapters/mock/mock_tool_adapter.py +192 -0
- hexdag/builtin/adapters/mock/mock_tool_router.py +232 -0
- hexdag/builtin/adapters/openai/__init__.py +5 -0
- hexdag/builtin/adapters/openai/openai_adapter.py +634 -0
- hexdag/builtin/adapters/secret/__init__.py +7 -0
- hexdag/builtin/adapters/secret/local_secret_adapter.py +248 -0
- hexdag/builtin/adapters/unified_tool_router.py +280 -0
- hexdag/builtin/macros/__init__.py +17 -0
- hexdag/builtin/macros/conversation_agent.py +390 -0
- hexdag/builtin/macros/llm_macro.py +151 -0
- hexdag/builtin/macros/reasoning_agent.py +423 -0
- hexdag/builtin/macros/tool_macro.py +380 -0
- hexdag/builtin/nodes/__init__.py +38 -0
- hexdag/builtin/nodes/_discovery.py +123 -0
- hexdag/builtin/nodes/agent_node.py +696 -0
- hexdag/builtin/nodes/base_node_factory.py +242 -0
- hexdag/builtin/nodes/composite_node.py +926 -0
- hexdag/builtin/nodes/data_node.py +201 -0
- hexdag/builtin/nodes/expression_node.py +487 -0
- hexdag/builtin/nodes/function_node.py +454 -0
- hexdag/builtin/nodes/llm_node.py +491 -0
- hexdag/builtin/nodes/loop_node.py +920 -0
- hexdag/builtin/nodes/mapped_input.py +518 -0
- hexdag/builtin/nodes/port_call_node.py +269 -0
- hexdag/builtin/nodes/tool_call_node.py +195 -0
- hexdag/builtin/nodes/tool_utils.py +390 -0
- hexdag/builtin/prompts/__init__.py +68 -0
- hexdag/builtin/prompts/base.py +422 -0
- hexdag/builtin/prompts/chat_prompts.py +303 -0
- hexdag/builtin/prompts/error_correction_prompts.py +320 -0
- hexdag/builtin/prompts/tool_prompts.py +160 -0
- hexdag/builtin/tools/builtin_tools.py +84 -0
- hexdag/builtin/tools/database_tools.py +164 -0
- hexdag/cli/__init__.py +17 -0
- hexdag/cli/__main__.py +7 -0
- hexdag/cli/commands/__init__.py +27 -0
- hexdag/cli/commands/build_cmd.py +812 -0
- hexdag/cli/commands/create_cmd.py +208 -0
- hexdag/cli/commands/docs_cmd.py +293 -0
- hexdag/cli/commands/generate_types_cmd.py +252 -0
- hexdag/cli/commands/init_cmd.py +188 -0
- hexdag/cli/commands/pipeline_cmd.py +494 -0
- hexdag/cli/commands/plugin_dev_cmd.py +529 -0
- hexdag/cli/commands/plugins_cmd.py +441 -0
- hexdag/cli/commands/studio_cmd.py +101 -0
- hexdag/cli/commands/validate_cmd.py +221 -0
- hexdag/cli/main.py +84 -0
- hexdag/core/__init__.py +83 -0
- hexdag/core/config/__init__.py +20 -0
- hexdag/core/config/loader.py +479 -0
- hexdag/core/config/models.py +150 -0
- hexdag/core/configurable.py +294 -0
- hexdag/core/context/__init__.py +37 -0
- hexdag/core/context/execution_context.py +378 -0
- hexdag/core/docs/__init__.py +26 -0
- hexdag/core/docs/extractors.py +678 -0
- hexdag/core/docs/generators.py +890 -0
- hexdag/core/docs/models.py +120 -0
- hexdag/core/domain/__init__.py +10 -0
- hexdag/core/domain/dag.py +1225 -0
- hexdag/core/exceptions.py +234 -0
- hexdag/core/expression_parser.py +569 -0
- hexdag/core/logging.py +449 -0
- hexdag/core/models/__init__.py +17 -0
- hexdag/core/models/base.py +138 -0
- hexdag/core/orchestration/__init__.py +46 -0
- hexdag/core/orchestration/body_executor.py +481 -0
- hexdag/core/orchestration/components/__init__.py +97 -0
- hexdag/core/orchestration/components/adapter_lifecycle_manager.py +113 -0
- hexdag/core/orchestration/components/checkpoint_manager.py +134 -0
- hexdag/core/orchestration/components/execution_coordinator.py +360 -0
- hexdag/core/orchestration/components/health_check_manager.py +176 -0
- hexdag/core/orchestration/components/input_mapper.py +143 -0
- hexdag/core/orchestration/components/lifecycle_manager.py +583 -0
- hexdag/core/orchestration/components/node_executor.py +377 -0
- hexdag/core/orchestration/components/secret_manager.py +202 -0
- hexdag/core/orchestration/components/wave_executor.py +158 -0
- hexdag/core/orchestration/constants.py +17 -0
- hexdag/core/orchestration/events/README.md +312 -0
- hexdag/core/orchestration/events/__init__.py +104 -0
- hexdag/core/orchestration/events/batching.py +330 -0
- hexdag/core/orchestration/events/decorators.py +139 -0
- hexdag/core/orchestration/events/events.py +573 -0
- hexdag/core/orchestration/events/observers/__init__.py +30 -0
- hexdag/core/orchestration/events/observers/core_observers.py +690 -0
- hexdag/core/orchestration/events/observers/models.py +111 -0
- hexdag/core/orchestration/events/taxonomy.py +269 -0
- hexdag/core/orchestration/hook_context.py +237 -0
- hexdag/core/orchestration/hooks.py +437 -0
- hexdag/core/orchestration/models.py +418 -0
- hexdag/core/orchestration/orchestrator.py +910 -0
- hexdag/core/orchestration/orchestrator_factory.py +275 -0
- hexdag/core/orchestration/port_wrappers.py +327 -0
- hexdag/core/orchestration/prompt/__init__.py +32 -0
- hexdag/core/orchestration/prompt/template.py +332 -0
- hexdag/core/pipeline_builder/__init__.py +21 -0
- hexdag/core/pipeline_builder/component_instantiator.py +386 -0
- hexdag/core/pipeline_builder/include_tag.py +265 -0
- hexdag/core/pipeline_builder/pipeline_config.py +133 -0
- hexdag/core/pipeline_builder/py_tag.py +223 -0
- hexdag/core/pipeline_builder/tag_discovery.py +268 -0
- hexdag/core/pipeline_builder/yaml_builder.py +1196 -0
- hexdag/core/pipeline_builder/yaml_validator.py +569 -0
- hexdag/core/ports/__init__.py +65 -0
- hexdag/core/ports/api_call.py +133 -0
- hexdag/core/ports/database.py +489 -0
- hexdag/core/ports/embedding.py +215 -0
- hexdag/core/ports/executor.py +237 -0
- hexdag/core/ports/file_storage.py +117 -0
- hexdag/core/ports/healthcheck.py +87 -0
- hexdag/core/ports/llm.py +551 -0
- hexdag/core/ports/memory.py +70 -0
- hexdag/core/ports/observer_manager.py +130 -0
- hexdag/core/ports/secret.py +145 -0
- hexdag/core/ports/tool_router.py +94 -0
- hexdag/core/ports_builder.py +623 -0
- hexdag/core/protocols.py +273 -0
- hexdag/core/resolver.py +304 -0
- hexdag/core/schema/__init__.py +9 -0
- hexdag/core/schema/generator.py +742 -0
- hexdag/core/secrets.py +242 -0
- hexdag/core/types.py +413 -0
- hexdag/core/utils/async_warnings.py +206 -0
- hexdag/core/utils/schema_conversion.py +78 -0
- hexdag/core/utils/sql_validation.py +86 -0
- hexdag/core/validation/secure_json.py +148 -0
- hexdag/core/yaml_macro.py +517 -0
- hexdag/mcp_server.py +3120 -0
- hexdag/studio/__init__.py +10 -0
- hexdag/studio/build_ui.py +92 -0
- hexdag/studio/server/__init__.py +1 -0
- hexdag/studio/server/main.py +100 -0
- hexdag/studio/server/routes/__init__.py +9 -0
- hexdag/studio/server/routes/execute.py +208 -0
- hexdag/studio/server/routes/export.py +558 -0
- hexdag/studio/server/routes/files.py +207 -0
- hexdag/studio/server/routes/plugins.py +419 -0
- hexdag/studio/server/routes/validate.py +220 -0
- hexdag/studio/ui/index.html +13 -0
- hexdag/studio/ui/package-lock.json +2992 -0
- hexdag/studio/ui/package.json +31 -0
- hexdag/studio/ui/postcss.config.js +6 -0
- hexdag/studio/ui/public/hexdag.svg +5 -0
- hexdag/studio/ui/src/App.tsx +251 -0
- hexdag/studio/ui/src/components/Canvas.tsx +408 -0
- hexdag/studio/ui/src/components/ContextMenu.tsx +187 -0
- hexdag/studio/ui/src/components/FileBrowser.tsx +123 -0
- hexdag/studio/ui/src/components/Header.tsx +181 -0
- hexdag/studio/ui/src/components/HexdagNode.tsx +193 -0
- hexdag/studio/ui/src/components/NodeInspector.tsx +512 -0
- hexdag/studio/ui/src/components/NodePalette.tsx +262 -0
- hexdag/studio/ui/src/components/NodePortsSection.tsx +403 -0
- hexdag/studio/ui/src/components/PluginManager.tsx +347 -0
- hexdag/studio/ui/src/components/PortsEditor.tsx +481 -0
- hexdag/studio/ui/src/components/PythonEditor.tsx +195 -0
- hexdag/studio/ui/src/components/ValidationPanel.tsx +105 -0
- hexdag/studio/ui/src/components/YamlEditor.tsx +196 -0
- hexdag/studio/ui/src/components/index.ts +8 -0
- hexdag/studio/ui/src/index.css +92 -0
- hexdag/studio/ui/src/main.tsx +10 -0
- hexdag/studio/ui/src/types/index.ts +123 -0
- hexdag/studio/ui/src/vite-env.d.ts +1 -0
- hexdag/studio/ui/tailwind.config.js +29 -0
- hexdag/studio/ui/tsconfig.json +37 -0
- hexdag/studio/ui/tsconfig.node.json +13 -0
- hexdag/studio/ui/vite.config.ts +35 -0
- hexdag/visualization/__init__.py +69 -0
- hexdag/visualization/dag_visualizer.py +1020 -0
- hexdag-0.5.0.dev1.dist-info/METADATA +369 -0
- hexdag-0.5.0.dev1.dist-info/RECORD +261 -0
- hexdag-0.5.0.dev1.dist-info/WHEEL +4 -0
- hexdag-0.5.0.dev1.dist-info/entry_points.txt +4 -0
- hexdag-0.5.0.dev1.dist-info/licenses/LICENSE +190 -0
- hexdag_plugins/.gitignore +43 -0
- hexdag_plugins/README.md +73 -0
- hexdag_plugins/__init__.py +1 -0
- hexdag_plugins/azure/LICENSE +21 -0
- hexdag_plugins/azure/README.md +414 -0
- hexdag_plugins/azure/__init__.py +21 -0
- hexdag_plugins/azure/azure_blob_adapter.py +450 -0
- hexdag_plugins/azure/azure_cosmos_adapter.py +383 -0
- hexdag_plugins/azure/azure_keyvault_adapter.py +314 -0
- hexdag_plugins/azure/azure_openai_adapter.py +415 -0
- hexdag_plugins/azure/pyproject.toml +107 -0
- hexdag_plugins/azure/tests/__init__.py +1 -0
- hexdag_plugins/azure/tests/test_azure_blob_adapter.py +350 -0
- hexdag_plugins/azure/tests/test_azure_cosmos_adapter.py +323 -0
- hexdag_plugins/azure/tests/test_azure_keyvault_adapter.py +330 -0
- hexdag_plugins/azure/tests/test_azure_openai_adapter.py +329 -0
- hexdag_plugins/hexdag_etl/README.md +168 -0
- hexdag_plugins/hexdag_etl/__init__.py +53 -0
- hexdag_plugins/hexdag_etl/examples/01_simple_pandas_transform.py +270 -0
- hexdag_plugins/hexdag_etl/examples/02_simple_pandas_only.py +149 -0
- hexdag_plugins/hexdag_etl/examples/03_file_io_pipeline.py +109 -0
- hexdag_plugins/hexdag_etl/examples/test_pandas_transform.py +84 -0
- hexdag_plugins/hexdag_etl/hexdag.toml +25 -0
- hexdag_plugins/hexdag_etl/hexdag_etl/__init__.py +48 -0
- hexdag_plugins/hexdag_etl/hexdag_etl/nodes/__init__.py +13 -0
- hexdag_plugins/hexdag_etl/hexdag_etl/nodes/api_extract.py +230 -0
- hexdag_plugins/hexdag_etl/hexdag_etl/nodes/base_node_factory.py +181 -0
- hexdag_plugins/hexdag_etl/hexdag_etl/nodes/file_io.py +415 -0
- hexdag_plugins/hexdag_etl/hexdag_etl/nodes/outlook.py +492 -0
- hexdag_plugins/hexdag_etl/hexdag_etl/nodes/pandas_transform.py +563 -0
- hexdag_plugins/hexdag_etl/hexdag_etl/nodes/sql_extract_load.py +112 -0
- hexdag_plugins/hexdag_etl/pyproject.toml +82 -0
- hexdag_plugins/hexdag_etl/test_transform.py +54 -0
- hexdag_plugins/hexdag_etl/tests/test_plugin_integration.py +62 -0
- hexdag_plugins/mysql_adapter/LICENSE +21 -0
- hexdag_plugins/mysql_adapter/README.md +224 -0
- hexdag_plugins/mysql_adapter/__init__.py +6 -0
- hexdag_plugins/mysql_adapter/mysql_adapter.py +408 -0
- hexdag_plugins/mysql_adapter/pyproject.toml +93 -0
- hexdag_plugins/mysql_adapter/tests/test_mysql_adapter.py +259 -0
- hexdag_plugins/storage/README.md +184 -0
- hexdag_plugins/storage/__init__.py +19 -0
- hexdag_plugins/storage/file/__init__.py +5 -0
- hexdag_plugins/storage/file/local.py +325 -0
- hexdag_plugins/storage/ports/__init__.py +5 -0
- hexdag_plugins/storage/ports/vector_store.py +236 -0
- hexdag_plugins/storage/sql/__init__.py +7 -0
- hexdag_plugins/storage/sql/base.py +187 -0
- hexdag_plugins/storage/sql/mysql.py +27 -0
- hexdag_plugins/storage/sql/postgresql.py +27 -0
- hexdag_plugins/storage/tests/__init__.py +1 -0
- hexdag_plugins/storage/tests/test_local_file_storage.py +161 -0
- hexdag_plugins/storage/tests/test_sql_adapters.py +212 -0
- hexdag_plugins/storage/vector/__init__.py +7 -0
- hexdag_plugins/storage/vector/chromadb.py +223 -0
- hexdag_plugins/storage/vector/in_memory.py +285 -0
- hexdag_plugins/storage/vector/pgvector.py +502 -0
|
@@ -0,0 +1,223 @@
|
|
|
1
|
+
"""ChromaDB vector store adapter for RAG plugin.
|
|
2
|
+
|
|
3
|
+
ChromaDB is an open-source embedding database that provides:
|
|
4
|
+
- Easy local development and deployment
|
|
5
|
+
- Built-in embedding models
|
|
6
|
+
- Persistent storage
|
|
7
|
+
- Cloud deployment option
|
|
8
|
+
|
|
9
|
+
Installation:
|
|
10
|
+
pip install chromadb
|
|
11
|
+
"""
|
|
12
|
+
|
|
13
|
+
from typing import Any
|
|
14
|
+
|
|
15
|
+
from pydantic import Field
|
|
16
|
+
|
|
17
|
+
from hexdag.core import AdapterConfig, ConfigurableAdapter
|
|
18
|
+
from hexdag.core.registry.decorators import adapter
|
|
19
|
+
from hexdag_plugins.storage.ports import VectorStorePort
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
class ChromaDBConfig(AdapterConfig):
    """Settings accepted by the ChromaDB vector store adapter.

    Attributes
    ----------
    collection_name : str
        Collection the adapter opens or creates (default: "hexdag_documents")
    persist_directory : str | None
        On-disk location for the database; ``None`` selects an in-memory
        client (default: None)
    embedding_function : str
        Name of the embedding function (default: "default").
        Documented options: "default", "sentence-transformers", "openai".
        NOTE(review): the adapter code in this module does not read this
        value -- confirm where it is meant to be applied.
    distance_metric : str
        Similarity metric, forwarded to the collection's metadata when the
        collection is created (default: "cosine").
        Must be one of "cosine", "l2" or "ip" (inner product).
    """

    collection_name: str = Field(default="hexdag_documents")
    persist_directory: str | None = Field(default=None)
    embedding_function: str = Field(default="default")
    distance_metric: str = Field(default="cosine", pattern="^(cosine|l2|ip)$")
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
@adapter("vector_store", name="chromadb", namespace="plugin")
class ChromaDBAdapter(ConfigurableAdapter, VectorStorePort):
    """ChromaDB vector store adapter.

    Provides vector storage backed by a ChromaDB collection. The client and
    collection are created lazily on first use (or explicitly via
    :meth:`asetup`). ChromaDB generates embeddings itself whenever the caller
    does not supply them.

    Examples
    --------
    >>> # In-memory ChromaDB
    >>> store = ChromaDBAdapter(collection_name="docs")
    >>> await store.aadd_documents([{"text": "Python programming"}])
    >>> results = await store.asearch("Python", top_k=5)

    >>> # Persistent ChromaDB
    >>> store = ChromaDBAdapter(
    ...     collection_name="docs",
    ...     persist_directory="./chroma_db"
    ... )
    """

    Config = ChromaDBConfig

    def __init__(self, **kwargs):
        """Initialize ChromaDB adapter.

        No connection is opened here; ``_client`` and ``_collection`` stay
        ``None`` until :meth:`asetup` runs.
        """
        super().__init__(**kwargs)
        self._client = None
        self._collection = None

    def _collection_metadata(self) -> dict[str, Any]:
        """Build the metadata used when creating the collection.

        ``hnsw:space`` is the key ChromaDB reads to pick the distance
        function of the index. The plain ``distance_metric`` entry is kept so
        code that inspects the collection metadata under the previous key
        still finds it.
        """
        metric = self.config.distance_metric
        return {"hnsw:space": metric, "distance_metric": metric}

    async def _ensure_collection(self):
        """Return the collection, running :meth:`asetup` first if needed."""
        # Compare against None explicitly: the truthiness of a collection
        # object is not part of ChromaDB's documented contract.
        if self._collection is None:
            await self.asetup()
        return self._collection

    async def asetup(self):
        """Initialize ChromaDB client and collection.

        Raises:
            ImportError: If the ``chromadb`` package is not installed.
        """
        try:
            import chromadb
            from chromadb.config import Settings
        except ImportError as e:
            msg = "ChromaDB not installed. Install with: pip install chromadb"
            raise ImportError(msg) from e

        # Create client
        persist_directory = self.config.persist_directory
        if persist_directory:
            # PersistentClient is the documented way to get on-disk storage;
            # passing persist_directory through Settings to chromadb.Client
            # does not by itself enable persistence on current releases.
            persistent_client = getattr(chromadb, "PersistentClient", None)
            if persistent_client is not None:
                self._client = persistent_client(
                    path=persist_directory,
                    settings=Settings(anonymized_telemetry=False),
                )
            else:
                # Older chromadb without PersistentClient: keep the previous
                # construction unchanged.
                settings = Settings(
                    persist_directory=persist_directory,
                    anonymized_telemetry=False,
                )
                self._client = chromadb.Client(settings)
        else:
            self._client = chromadb.Client()

        # Get or create collection
        self._collection = self._client.get_or_create_collection(
            name=self.config.collection_name,
            metadata=self._collection_metadata(),
        )

    async def aadd_documents(
        self,
        documents: list[dict[str, Any]],
        embeddings: list[list[float]] | None = None,
    ) -> None:
        """Add documents to ChromaDB.

        ChromaDB can generate embeddings automatically if not provided.

        Args:
            documents: List of documents with 'text' and optional 'metadata'
                and 'id'. Documents without an 'id' receive a generated,
                unique ``doc_<hex>`` identifier.
            embeddings: Optional pre-computed embeddings (if None or empty,
                ChromaDB generates them)

        Raises:
            ValueError: If embeddings are given but their count differs from
                the number of documents.
            KeyError: If a document has no 'text' field.
        """
        import uuid

        collection = await self._ensure_collection()

        if not documents:
            # Nothing to store; avoid handing ChromaDB empty lists.
            return

        if embeddings and len(embeddings) != len(documents):
            msg = "Number of documents must match number of embeddings"
            raise ValueError(msg)

        texts = [doc["text"] for doc in documents]
        # ChromaDB rejects empty metadata dicts, so "no metadata" is None.
        metadatas = [doc.get("metadata") or None for doc in documents]
        # Positional defaults (doc_0, doc_1, ...) would repeat on every call
        # and collide with ids stored by earlier calls, so generate unique ids.
        ids = [
            doc["id"] if "id" in doc else f"doc_{uuid.uuid4().hex}"
            for doc in documents
        ]

        payload: dict[str, Any] = {
            "documents": texts,
            "metadatas": metadatas if any(m is not None for m in metadatas) else None,
            "ids": ids,
        }
        if embeddings:
            # Use provided embeddings; otherwise ChromaDB embeds the texts.
            payload["embeddings"] = embeddings

        collection.add(**payload)

    async def asearch(
        self,
        query: str,
        query_embedding: list[float] | None = None,
        top_k: int | None = None,
        filter_metadata: dict[str, Any] | None = None,
    ) -> list[dict[str, Any]]:
        """Search for similar documents in ChromaDB.

        Args:
            query: Search query text
            query_embedding: Optional pre-computed query embedding
            top_k: Number of results to return (default: 5). A value of 0 or
                less yields an empty list.
            filter_metadata: Optional metadata filters (ChromaDB where clause)

        Returns:
            List of dicts with 'id', 'text', 'score' and 'metadata' keys.
            'score' is ``1.0 - distance`` as reported by ChromaDB.
        """
        collection = await self._ensure_collection()

        k = 5 if top_k is None else top_k
        if k <= 0:
            # An explicit zero means "no results", not "use the default".
            return []

        # Build where clause from filter_metadata (empty dict means no filter)
        where = filter_metadata or None

        if query_embedding:
            # Use provided embedding
            results = collection.query(
                query_embeddings=[query_embedding],
                n_results=k,
                where=where,
            )
        else:
            # Let ChromaDB embed the query
            results = collection.query(
                query_texts=[query],
                n_results=k,
                where=where,
            )

        # Results are nested one list per query; only one query was sent.
        ids = results["ids"][0]

        def first_row(key: str) -> list[Any]:
            rows = results.get(key)
            return rows[0] if rows else [None] * len(ids)

        texts = first_row("documents")
        distances = first_row("distances")
        metadatas = first_row("metadatas")

        # Format results
        return [
            {
                "id": doc_id,
                "text": text,
                # Convert distance to similarity. NOTE(review): this is a true
                # similarity for cosine distance; for "l2" the value may be
                # negative -- confirm the intended scale for other metrics.
                "score": None if distance is None else 1.0 - distance,
                "metadata": metadata or {},
            }
            for doc_id, text, distance, metadata in zip(
                ids, texts, distances, metadatas, strict=False
            )
        ]

    async def aclear(self) -> None:
        """Clear all documents from the collection.

        The collection is deleted and immediately recreated with the same
        name and metadata.
        """
        await self._ensure_collection()

        # Delete and recreate collection
        self._client.delete_collection(name=self.config.collection_name)
        self._collection = self._client.get_or_create_collection(
            name=self.config.collection_name,
            metadata=self._collection_metadata(),
        )

    async def acount(self) -> int:
        """Get the number of documents in the collection."""
        collection = await self._ensure_collection()
        return collection.count()

    async def adelete(self, ids: list[str]) -> None:
        """Delete documents by ID.

        Args:
            ids: List of document IDs to delete. An empty list is a no-op.
        """
        collection = await self._ensure_collection()

        if not ids:
            # Never forward an empty id list to ChromaDB.
            return

        collection.delete(ids=ids)

    def __repr__(self) -> str:
        """String representation."""
        return (
            f"ChromaDBAdapter(collection={self.config.collection_name}, "
            f"persist={self.config.persist_directory is not None})"
        )
|
|
@@ -0,0 +1,285 @@
|
|
|
1
|
+
"""In-memory vector store for RAG operations."""
|
|
2
|
+
|
|
3
|
+
import hashlib
|
|
4
|
+
import math
|
|
5
|
+
from typing import Any
|
|
6
|
+
|
|
7
|
+
from hexdag.core.configurable import AdapterConfig, ConfigurableAdapter
|
|
8
|
+
from hexdag.core.registry.decorators import adapter
|
|
9
|
+
from hexdag_plugins.storage.ports import VectorStorePort
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
class VectorStoreConfig(AdapterConfig):
    """Settings accepted by the in-memory vector store.

    Attributes
    ----------
    embedding_dim : int
        Length of the vectors produced by the store's built-in hash-based
        embedding (default: 384, the size used by sentence-transformers)
    max_results : int
        Number of search hits returned when the caller does not pass
        ``top_k`` (default: 5)
    """

    embedding_dim: int = 384
    max_results: int = 5
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
@adapter("vector_store", name="in_memory_vector", namespace="plugin")
class InMemoryVectorStore(ConfigurableAdapter, VectorStorePort):
    """In-memory vector store for RAG operations.

    Stores text chunks with embeddings and provides similarity search.
    Uses simple cosine similarity for retrieval. Documents and their
    embeddings are kept in two parallel lists, so index ``i`` in one always
    belongs to index ``i`` in the other.

    Examples
    --------
    Store and search documents::

        from hexdag.core.registry import registry

        vector_store = registry.get("in_memory_vector", namespace="plugin")

        # Add documents
        await vector_store.aadd_documents([
            {"text": "Python is a programming language", "id": "doc1"},
            {"text": "Machine learning uses algorithms", "id": "doc2"},
        ])

        # Search
        results = await vector_store.asearch("programming", top_k=2)
    """

    Config = VectorStoreConfig

    def __init__(self, **kwargs: Any) -> None:
        """Initialize vector store with empty document and embedding lists."""
        super().__init__(**kwargs)
        self._documents: list[dict[str, Any]] = []
        self._embeddings: list[list[float]] = []

    async def aadd_documents(
        self,
        documents: list[dict[str, Any]],
        embeddings: list[list[float]] | None = None,
    ) -> dict[str, Any]:
        """Add documents to the vector store.

        Parameters
        ----------
        documents : list[dict[str, Any]]
            Documents to add (must have 'text' field when no embeddings are
            supplied)
        embeddings : list[list[float]] | None
            Pre-computed embeddings (if None, uses simple hash-based embedding)

        Returns
        -------
        dict[str, Any]
            ``{"added": <number added>, "total": <documents now stored>}``

        Raises
        ------
        ValueError
            If the number of embeddings differs from the number of documents
        """
        if embeddings is None:
            # Generate simple embeddings if none provided
            embeddings = [self._simple_embedding(doc["text"]) for doc in documents]

        # Validate before mutating so a mismatch leaves the store untouched.
        if len(documents) != len(embeddings):
            msg = "Number of documents must match number of embeddings"
            raise ValueError(msg)

        self._documents.extend(documents)
        self._embeddings.extend(embeddings)

        return {"added": len(documents), "total": len(self._documents)}

    async def asearch(
        self,
        query: str,
        query_embedding: list[float] | None = None,
        top_k: int | None = None,
    ) -> list[dict[str, Any]]:
        """Search for similar documents.

        Parameters
        ----------
        query : str
            Query text
        query_embedding : list[float] | None
            Pre-computed query embedding (if None, generates from query text)
        top_k : int | None
            Number of results to return (uses config.max_results if None).
            Zero or a negative value yields an empty list.

        Returns
        -------
        list[dict[str, Any]]
            Shallow copies of the top-k most similar documents, each with an
            added 'similarity_score' key, best match first

        Raises
        ------
        ValueError
            If the query embedding and a stored embedding differ in length
        """
        if not self._documents:
            return []

        k = self.config.max_results if top_k is None else top_k

        # A non-positive k means "no results". Without this guard a negative
        # k would slice from the end and return all but the last |k| hits.
        if k <= 0:
            return []

        if query_embedding is None:
            query_embedding = self._simple_embedding(query)

        # Calculate similarities
        scores = [
            self._cosine_similarity(query_embedding, stored)
            for stored in self._embeddings
        ]

        # Sort by similarity (stable, so ties keep insertion order) and
        # return top-k
        ranked = sorted(range(len(scores)), key=scores.__getitem__, reverse=True)

        results = []
        for idx in ranked[:k]:
            hit = self._documents[idx].copy()
            hit["similarity_score"] = scores[idx]
            results.append(hit)

        return results

    async def aclear(self) -> dict[str, Any]:
        """Clear all documents from the store.

        Returns
        -------
        dict[str, Any]
            ``{"removed": <number of documents that were stored>}``
        """
        count = len(self._documents)
        self._documents.clear()
        self._embeddings.clear()
        return {"removed": count}

    async def adelete(self, ids: list[str]) -> dict[str, Any]:
        """Delete documents by ID.

        Parameters
        ----------
        ids : list[str]
            List of document IDs to delete; matched against each stored
            document's 'id' field

        Returns
        -------
        dict[str, Any]
            ``{"deleted": <number of documents removed>}``
        """
        # Set membership keeps the scan linear in the number of documents.
        targets = set(ids)

        kept = [
            (doc, embedding)
            for doc, embedding in zip(self._documents, self._embeddings, strict=True)
            if doc.get("id") not in targets
        ]
        deleted_count = len(self._documents) - len(kept)

        # Rebuild both lists in place so they stay aligned index-for-index.
        self._documents[:] = [doc for doc, _ in kept]
        self._embeddings[:] = [embedding for _, embedding in kept]

        return {"deleted": deleted_count}

    async def acount(self) -> int:
        """Get the number of documents in the vector store.

        Returns
        -------
        int
            Number of documents currently stored
        """
        return len(self._documents)

    async def aget_stats(self) -> dict[str, Any]:
        """Get vector store statistics.

        Returns
        -------
        dict[str, Any]
            Same mapping as :meth:`get_stats`
        """
        return self.get_stats()

    def get_stats(self) -> dict[str, Any]:
        """Get vector store statistics (sync version for backwards compatibility).

        Returns
        -------
        dict[str, Any]
            'document_count', plus the configured 'embedding_dim' and
            'max_results'
        """
        return {
            "document_count": len(self._documents),
            "embedding_dim": self.config.embedding_dim,
            "max_results": self.config.max_results,
        }

    def _simple_embedding(self, text: str) -> list[float]:
        """Generate a simple hash-based embedding.

        This is a placeholder for production embedding models like
        sentence-transformers or OpenAI embeddings. The vector is
        deterministic for a given text but carries no semantic meaning.

        Parameters
        ----------
        text : str
            Text to embed (lower-cased and stripped before hashing)

        Returns
        -------
        list[float]
            Embedding vector of length ``config.embedding_dim`` with
            components in [-1, 1)
        """
        dim = self.config.embedding_dim

        # Normalize text
        normalized = text.lower().strip()

        vector = []
        for i in range(dim):
            # A different seed per component gives each one its own hash.
            seed = f"{normalized}_{i}"
            # MD5 is used only as a cheap deterministic mixer, not for
            # security; saying so keeps it usable on FIPS-restricted builds.
            digest = hashlib.md5(seed.encode(), usedforsecurity=False).hexdigest()
            # Map the hash onto [-1, 1)
            vector.append((int(digest, 16) % 1000) / 500 - 1)

        return vector

    def _cosine_similarity(self, vec1: list[float], vec2: list[float]) -> float:
        """Calculate cosine similarity between two vectors.

        Parameters
        ----------
        vec1 : list[float]
            First vector
        vec2 : list[float]
            Second vector

        Returns
        -------
        float
            Cosine similarity score [-1, 1]; 0.0 if either vector has zero
            magnitude

        Raises
        ------
        ValueError
            If the vectors differ in length
        """
        if len(vec1) != len(vec2):
            msg = "Vectors must have same dimension"
            raise ValueError(msg)

        dot_product = sum(a * b for a, b in zip(vec1, vec2, strict=False))
        mag1 = math.sqrt(sum(a * a for a in vec1))
        mag2 = math.sqrt(sum(b * b for b in vec2))

        # Avoid dividing by zero for all-zero vectors.
        if mag1 == 0 or mag2 == 0:
            return 0.0

        return dot_product / (mag1 * mag2)

    def __repr__(self) -> str:
        """String representation."""
        return f"InMemoryVectorStore(documents={len(self._documents)})"
|