hexdag-0.5.0.dev1-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- hexdag/__init__.py +116 -0
- hexdag/__main__.py +30 -0
- hexdag/adapters/executors/__init__.py +5 -0
- hexdag/adapters/executors/local_executor.py +316 -0
- hexdag/builtin/__init__.py +6 -0
- hexdag/builtin/adapters/__init__.py +51 -0
- hexdag/builtin/adapters/anthropic/__init__.py +5 -0
- hexdag/builtin/adapters/anthropic/anthropic_adapter.py +151 -0
- hexdag/builtin/adapters/database/__init__.py +6 -0
- hexdag/builtin/adapters/database/csv/csv_adapter.py +249 -0
- hexdag/builtin/adapters/database/pgvector/__init__.py +5 -0
- hexdag/builtin/adapters/database/pgvector/pgvector_adapter.py +478 -0
- hexdag/builtin/adapters/database/sqlalchemy/sqlalchemy_adapter.py +252 -0
- hexdag/builtin/adapters/database/sqlite/__init__.py +5 -0
- hexdag/builtin/adapters/database/sqlite/sqlite_adapter.py +410 -0
- hexdag/builtin/adapters/local/README.md +59 -0
- hexdag/builtin/adapters/local/__init__.py +7 -0
- hexdag/builtin/adapters/local/local_observer_manager.py +696 -0
- hexdag/builtin/adapters/memory/__init__.py +47 -0
- hexdag/builtin/adapters/memory/file_memory_adapter.py +297 -0
- hexdag/builtin/adapters/memory/in_memory_memory.py +216 -0
- hexdag/builtin/adapters/memory/schemas.py +57 -0
- hexdag/builtin/adapters/memory/session_memory.py +178 -0
- hexdag/builtin/adapters/memory/sqlite_memory_adapter.py +215 -0
- hexdag/builtin/adapters/memory/state_memory.py +280 -0
- hexdag/builtin/adapters/mock/README.md +89 -0
- hexdag/builtin/adapters/mock/__init__.py +15 -0
- hexdag/builtin/adapters/mock/hexdag.toml +50 -0
- hexdag/builtin/adapters/mock/mock_database.py +225 -0
- hexdag/builtin/adapters/mock/mock_embedding.py +223 -0
- hexdag/builtin/adapters/mock/mock_llm.py +177 -0
- hexdag/builtin/adapters/mock/mock_tool_adapter.py +192 -0
- hexdag/builtin/adapters/mock/mock_tool_router.py +232 -0
- hexdag/builtin/adapters/openai/__init__.py +5 -0
- hexdag/builtin/adapters/openai/openai_adapter.py +634 -0
- hexdag/builtin/adapters/secret/__init__.py +7 -0
- hexdag/builtin/adapters/secret/local_secret_adapter.py +248 -0
- hexdag/builtin/adapters/unified_tool_router.py +280 -0
- hexdag/builtin/macros/__init__.py +17 -0
- hexdag/builtin/macros/conversation_agent.py +390 -0
- hexdag/builtin/macros/llm_macro.py +151 -0
- hexdag/builtin/macros/reasoning_agent.py +423 -0
- hexdag/builtin/macros/tool_macro.py +380 -0
- hexdag/builtin/nodes/__init__.py +38 -0
- hexdag/builtin/nodes/_discovery.py +123 -0
- hexdag/builtin/nodes/agent_node.py +696 -0
- hexdag/builtin/nodes/base_node_factory.py +242 -0
- hexdag/builtin/nodes/composite_node.py +926 -0
- hexdag/builtin/nodes/data_node.py +201 -0
- hexdag/builtin/nodes/expression_node.py +487 -0
- hexdag/builtin/nodes/function_node.py +454 -0
- hexdag/builtin/nodes/llm_node.py +491 -0
- hexdag/builtin/nodes/loop_node.py +920 -0
- hexdag/builtin/nodes/mapped_input.py +518 -0
- hexdag/builtin/nodes/port_call_node.py +269 -0
- hexdag/builtin/nodes/tool_call_node.py +195 -0
- hexdag/builtin/nodes/tool_utils.py +390 -0
- hexdag/builtin/prompts/__init__.py +68 -0
- hexdag/builtin/prompts/base.py +422 -0
- hexdag/builtin/prompts/chat_prompts.py +303 -0
- hexdag/builtin/prompts/error_correction_prompts.py +320 -0
- hexdag/builtin/prompts/tool_prompts.py +160 -0
- hexdag/builtin/tools/builtin_tools.py +84 -0
- hexdag/builtin/tools/database_tools.py +164 -0
- hexdag/cli/__init__.py +17 -0
- hexdag/cli/__main__.py +7 -0
- hexdag/cli/commands/__init__.py +27 -0
- hexdag/cli/commands/build_cmd.py +812 -0
- hexdag/cli/commands/create_cmd.py +208 -0
- hexdag/cli/commands/docs_cmd.py +293 -0
- hexdag/cli/commands/generate_types_cmd.py +252 -0
- hexdag/cli/commands/init_cmd.py +188 -0
- hexdag/cli/commands/pipeline_cmd.py +494 -0
- hexdag/cli/commands/plugin_dev_cmd.py +529 -0
- hexdag/cli/commands/plugins_cmd.py +441 -0
- hexdag/cli/commands/studio_cmd.py +101 -0
- hexdag/cli/commands/validate_cmd.py +221 -0
- hexdag/cli/main.py +84 -0
- hexdag/core/__init__.py +83 -0
- hexdag/core/config/__init__.py +20 -0
- hexdag/core/config/loader.py +479 -0
- hexdag/core/config/models.py +150 -0
- hexdag/core/configurable.py +294 -0
- hexdag/core/context/__init__.py +37 -0
- hexdag/core/context/execution_context.py +378 -0
- hexdag/core/docs/__init__.py +26 -0
- hexdag/core/docs/extractors.py +678 -0
- hexdag/core/docs/generators.py +890 -0
- hexdag/core/docs/models.py +120 -0
- hexdag/core/domain/__init__.py +10 -0
- hexdag/core/domain/dag.py +1225 -0
- hexdag/core/exceptions.py +234 -0
- hexdag/core/expression_parser.py +569 -0
- hexdag/core/logging.py +449 -0
- hexdag/core/models/__init__.py +17 -0
- hexdag/core/models/base.py +138 -0
- hexdag/core/orchestration/__init__.py +46 -0
- hexdag/core/orchestration/body_executor.py +481 -0
- hexdag/core/orchestration/components/__init__.py +97 -0
- hexdag/core/orchestration/components/adapter_lifecycle_manager.py +113 -0
- hexdag/core/orchestration/components/checkpoint_manager.py +134 -0
- hexdag/core/orchestration/components/execution_coordinator.py +360 -0
- hexdag/core/orchestration/components/health_check_manager.py +176 -0
- hexdag/core/orchestration/components/input_mapper.py +143 -0
- hexdag/core/orchestration/components/lifecycle_manager.py +583 -0
- hexdag/core/orchestration/components/node_executor.py +377 -0
- hexdag/core/orchestration/components/secret_manager.py +202 -0
- hexdag/core/orchestration/components/wave_executor.py +158 -0
- hexdag/core/orchestration/constants.py +17 -0
- hexdag/core/orchestration/events/README.md +312 -0
- hexdag/core/orchestration/events/__init__.py +104 -0
- hexdag/core/orchestration/events/batching.py +330 -0
- hexdag/core/orchestration/events/decorators.py +139 -0
- hexdag/core/orchestration/events/events.py +573 -0
- hexdag/core/orchestration/events/observers/__init__.py +30 -0
- hexdag/core/orchestration/events/observers/core_observers.py +690 -0
- hexdag/core/orchestration/events/observers/models.py +111 -0
- hexdag/core/orchestration/events/taxonomy.py +269 -0
- hexdag/core/orchestration/hook_context.py +237 -0
- hexdag/core/orchestration/hooks.py +437 -0
- hexdag/core/orchestration/models.py +418 -0
- hexdag/core/orchestration/orchestrator.py +910 -0
- hexdag/core/orchestration/orchestrator_factory.py +275 -0
- hexdag/core/orchestration/port_wrappers.py +327 -0
- hexdag/core/orchestration/prompt/__init__.py +32 -0
- hexdag/core/orchestration/prompt/template.py +332 -0
- hexdag/core/pipeline_builder/__init__.py +21 -0
- hexdag/core/pipeline_builder/component_instantiator.py +386 -0
- hexdag/core/pipeline_builder/include_tag.py +265 -0
- hexdag/core/pipeline_builder/pipeline_config.py +133 -0
- hexdag/core/pipeline_builder/py_tag.py +223 -0
- hexdag/core/pipeline_builder/tag_discovery.py +268 -0
- hexdag/core/pipeline_builder/yaml_builder.py +1196 -0
- hexdag/core/pipeline_builder/yaml_validator.py +569 -0
- hexdag/core/ports/__init__.py +65 -0
- hexdag/core/ports/api_call.py +133 -0
- hexdag/core/ports/database.py +489 -0
- hexdag/core/ports/embedding.py +215 -0
- hexdag/core/ports/executor.py +237 -0
- hexdag/core/ports/file_storage.py +117 -0
- hexdag/core/ports/healthcheck.py +87 -0
- hexdag/core/ports/llm.py +551 -0
- hexdag/core/ports/memory.py +70 -0
- hexdag/core/ports/observer_manager.py +130 -0
- hexdag/core/ports/secret.py +145 -0
- hexdag/core/ports/tool_router.py +94 -0
- hexdag/core/ports_builder.py +623 -0
- hexdag/core/protocols.py +273 -0
- hexdag/core/resolver.py +304 -0
- hexdag/core/schema/__init__.py +9 -0
- hexdag/core/schema/generator.py +742 -0
- hexdag/core/secrets.py +242 -0
- hexdag/core/types.py +413 -0
- hexdag/core/utils/async_warnings.py +206 -0
- hexdag/core/utils/schema_conversion.py +78 -0
- hexdag/core/utils/sql_validation.py +86 -0
- hexdag/core/validation/secure_json.py +148 -0
- hexdag/core/yaml_macro.py +517 -0
- hexdag/mcp_server.py +3120 -0
- hexdag/studio/__init__.py +10 -0
- hexdag/studio/build_ui.py +92 -0
- hexdag/studio/server/__init__.py +1 -0
- hexdag/studio/server/main.py +100 -0
- hexdag/studio/server/routes/__init__.py +9 -0
- hexdag/studio/server/routes/execute.py +208 -0
- hexdag/studio/server/routes/export.py +558 -0
- hexdag/studio/server/routes/files.py +207 -0
- hexdag/studio/server/routes/plugins.py +419 -0
- hexdag/studio/server/routes/validate.py +220 -0
- hexdag/studio/ui/index.html +13 -0
- hexdag/studio/ui/package-lock.json +2992 -0
- hexdag/studio/ui/package.json +31 -0
- hexdag/studio/ui/postcss.config.js +6 -0
- hexdag/studio/ui/public/hexdag.svg +5 -0
- hexdag/studio/ui/src/App.tsx +251 -0
- hexdag/studio/ui/src/components/Canvas.tsx +408 -0
- hexdag/studio/ui/src/components/ContextMenu.tsx +187 -0
- hexdag/studio/ui/src/components/FileBrowser.tsx +123 -0
- hexdag/studio/ui/src/components/Header.tsx +181 -0
- hexdag/studio/ui/src/components/HexdagNode.tsx +193 -0
- hexdag/studio/ui/src/components/NodeInspector.tsx +512 -0
- hexdag/studio/ui/src/components/NodePalette.tsx +262 -0
- hexdag/studio/ui/src/components/NodePortsSection.tsx +403 -0
- hexdag/studio/ui/src/components/PluginManager.tsx +347 -0
- hexdag/studio/ui/src/components/PortsEditor.tsx +481 -0
- hexdag/studio/ui/src/components/PythonEditor.tsx +195 -0
- hexdag/studio/ui/src/components/ValidationPanel.tsx +105 -0
- hexdag/studio/ui/src/components/YamlEditor.tsx +196 -0
- hexdag/studio/ui/src/components/index.ts +8 -0
- hexdag/studio/ui/src/index.css +92 -0
- hexdag/studio/ui/src/main.tsx +10 -0
- hexdag/studio/ui/src/types/index.ts +123 -0
- hexdag/studio/ui/src/vite-env.d.ts +1 -0
- hexdag/studio/ui/tailwind.config.js +29 -0
- hexdag/studio/ui/tsconfig.json +37 -0
- hexdag/studio/ui/tsconfig.node.json +13 -0
- hexdag/studio/ui/vite.config.ts +35 -0
- hexdag/visualization/__init__.py +69 -0
- hexdag/visualization/dag_visualizer.py +1020 -0
- hexdag-0.5.0.dev1.dist-info/METADATA +369 -0
- hexdag-0.5.0.dev1.dist-info/RECORD +261 -0
- hexdag-0.5.0.dev1.dist-info/WHEEL +4 -0
- hexdag-0.5.0.dev1.dist-info/entry_points.txt +4 -0
- hexdag-0.5.0.dev1.dist-info/licenses/LICENSE +190 -0
- hexdag_plugins/.gitignore +43 -0
- hexdag_plugins/README.md +73 -0
- hexdag_plugins/__init__.py +1 -0
- hexdag_plugins/azure/LICENSE +21 -0
- hexdag_plugins/azure/README.md +414 -0
- hexdag_plugins/azure/__init__.py +21 -0
- hexdag_plugins/azure/azure_blob_adapter.py +450 -0
- hexdag_plugins/azure/azure_cosmos_adapter.py +383 -0
- hexdag_plugins/azure/azure_keyvault_adapter.py +314 -0
- hexdag_plugins/azure/azure_openai_adapter.py +415 -0
- hexdag_plugins/azure/pyproject.toml +107 -0
- hexdag_plugins/azure/tests/__init__.py +1 -0
- hexdag_plugins/azure/tests/test_azure_blob_adapter.py +350 -0
- hexdag_plugins/azure/tests/test_azure_cosmos_adapter.py +323 -0
- hexdag_plugins/azure/tests/test_azure_keyvault_adapter.py +330 -0
- hexdag_plugins/azure/tests/test_azure_openai_adapter.py +329 -0
- hexdag_plugins/hexdag_etl/README.md +168 -0
- hexdag_plugins/hexdag_etl/__init__.py +53 -0
- hexdag_plugins/hexdag_etl/examples/01_simple_pandas_transform.py +270 -0
- hexdag_plugins/hexdag_etl/examples/02_simple_pandas_only.py +149 -0
- hexdag_plugins/hexdag_etl/examples/03_file_io_pipeline.py +109 -0
- hexdag_plugins/hexdag_etl/examples/test_pandas_transform.py +84 -0
- hexdag_plugins/hexdag_etl/hexdag.toml +25 -0
- hexdag_plugins/hexdag_etl/hexdag_etl/__init__.py +48 -0
- hexdag_plugins/hexdag_etl/hexdag_etl/nodes/__init__.py +13 -0
- hexdag_plugins/hexdag_etl/hexdag_etl/nodes/api_extract.py +230 -0
- hexdag_plugins/hexdag_etl/hexdag_etl/nodes/base_node_factory.py +181 -0
- hexdag_plugins/hexdag_etl/hexdag_etl/nodes/file_io.py +415 -0
- hexdag_plugins/hexdag_etl/hexdag_etl/nodes/outlook.py +492 -0
- hexdag_plugins/hexdag_etl/hexdag_etl/nodes/pandas_transform.py +563 -0
- hexdag_plugins/hexdag_etl/hexdag_etl/nodes/sql_extract_load.py +112 -0
- hexdag_plugins/hexdag_etl/pyproject.toml +82 -0
- hexdag_plugins/hexdag_etl/test_transform.py +54 -0
- hexdag_plugins/hexdag_etl/tests/test_plugin_integration.py +62 -0
- hexdag_plugins/mysql_adapter/LICENSE +21 -0
- hexdag_plugins/mysql_adapter/README.md +224 -0
- hexdag_plugins/mysql_adapter/__init__.py +6 -0
- hexdag_plugins/mysql_adapter/mysql_adapter.py +408 -0
- hexdag_plugins/mysql_adapter/pyproject.toml +93 -0
- hexdag_plugins/mysql_adapter/tests/test_mysql_adapter.py +259 -0
- hexdag_plugins/storage/README.md +184 -0
- hexdag_plugins/storage/__init__.py +19 -0
- hexdag_plugins/storage/file/__init__.py +5 -0
- hexdag_plugins/storage/file/local.py +325 -0
- hexdag_plugins/storage/ports/__init__.py +5 -0
- hexdag_plugins/storage/ports/vector_store.py +236 -0
- hexdag_plugins/storage/sql/__init__.py +7 -0
- hexdag_plugins/storage/sql/base.py +187 -0
- hexdag_plugins/storage/sql/mysql.py +27 -0
- hexdag_plugins/storage/sql/postgresql.py +27 -0
- hexdag_plugins/storage/tests/__init__.py +1 -0
- hexdag_plugins/storage/tests/test_local_file_storage.py +161 -0
- hexdag_plugins/storage/tests/test_sql_adapters.py +212 -0
- hexdag_plugins/storage/vector/__init__.py +7 -0
- hexdag_plugins/storage/vector/chromadb.py +223 -0
- hexdag_plugins/storage/vector/in_memory.py +285 -0
- hexdag_plugins/storage/vector/pgvector.py +502 -0
hexdag_plugins/storage/vector/pgvector.py
@@ -0,0 +1,502 @@
"""PostgreSQL pgvector adapter using SQLAlchemy.

Production-ready vector store built on SQLAlchemy's battle-tested connection pool
and the official pgvector Python library.
"""

import time
from typing import Any

from pydantic import ConfigDict, SecretStr, field_validator
from sqlalchemy import Column, Integer, Text, text
from sqlalchemy.dialects.postgresql import JSONB
from sqlalchemy.ext.asyncio import AsyncEngine, AsyncSession, create_async_engine
from sqlalchemy.orm import declarative_base

from hexdag.core.configurable import AdapterConfig, ConfigurableAdapter, SecretField
from hexdag.core.ports.healthcheck import HealthStatus
from hexdag.core.registry.decorators import adapter
from hexdag.core.utils.sql_validation import validate_sql_identifier

try:
    from pgvector.sqlalchemy import Vector
except ImportError:
    Vector = None  # type: ignore[assignment,misc]

from hexdag_plugins.storage.ports import VectorStorePort

Base = declarative_base()


class PgVectorConfig(AdapterConfig):
    """Configuration for PgVector adapter with SQLAlchemy.

    Attributes
    ----------
    connection_string : SecretStr | None
        PostgreSQL connection string with asyncpg driver
        (e.g., "postgresql+asyncpg://user:pass@localhost/db")
    table_name : str
        Name of the vector table (default: "document_embeddings")
    embedding_dim : int
        Dimension of embedding vectors (default: 384)
    max_results : int
        Maximum number of search results (default: 5)
    distance_metric : str
        Distance metric: "cosine", "l2", or "inner_product" (default: "cosine")
    pool_size : int
        SQLAlchemy connection pool size (default: 5)
    max_overflow : int
        Maximum overflow connections beyond pool_size (default: 10)
    pool_timeout : float
        Timeout for getting connection from pool (default: 30.0)
    pool_recycle : int
        Recycle connections after N seconds (default: 3600 - 1 hour)
    pool_pre_ping : bool
        Test connections before using them (default: True)
    """

    model_config = ConfigDict(arbitrary_types_allowed=True)

    connection_string: SecretStr | None = SecretField(
        env_var="PGVECTOR_CONNECTION_STRING",
        description="PostgreSQL connection string (postgresql+asyncpg://...)",
    )
    table_name: str = "document_embeddings"
    embedding_dim: int = 384
    max_results: int = 5
    distance_metric: str = "cosine"

    # SQLAlchemy pool configuration
    pool_size: int = 5
    max_overflow: int = 10
    pool_timeout: float = 30.0
    pool_recycle: int = 3600
    pool_pre_ping: bool = True

    @field_validator("table_name")
    @classmethod
    def validate_table_name(cls, v: str) -> str:
        """Validate table name to prevent SQL injection."""
        validate_sql_identifier(v, identifier_type="table", raise_on_invalid=True)
        if len(v) > 63:
            msg = f"Table name '{v}' exceeds PostgreSQL limit of 63 characters"
            raise ValueError(msg)
        return v

    @field_validator("distance_metric")
    @classmethod
    def validate_distance_metric(cls, v: str) -> str:
        """Validate distance metric."""
        valid_metrics = {"cosine", "l2", "inner_product"}
        if v not in valid_metrics:
            msg = f"Invalid distance metric: '{v}'. Must be one of: {', '.join(valid_metrics)}"
            raise ValueError(msg)
        return v


@adapter("vector_store", name="pgvector", namespace="plugin")
class PgVectorAdapter(ConfigurableAdapter, VectorStorePort):
    """PostgreSQL pgvector adapter using SQLAlchemy.

    Built on SQLAlchemy's async engine with connection pooling for production use.
    Uses the official pgvector library for vector operations.

    Benefits
    --------
    - SQLAlchemy's mature connection pooling (5-20 connections)
    - Automatic connection health checks (pool_pre_ping)
    - Connection recycling to prevent stale connections
    - Type-safe ORM with pgvector support
    - Production-ready error handling

    Examples
    --------
    Basic usage with connection pooling::

        from hexdag.core.registry import registry

        pgvector = registry.get("pgvector", namespace="plugin")(
            connection_string="postgresql+asyncpg://localhost/mydb",
            pool_size=10,
            max_overflow=20
        )
        await pgvector.asetup()

        # Add documents
        await pgvector.aadd_documents(documents, embeddings)

        # Search with vector similarity
        results = await pgvector.asearch(
            "query text",
            query_embedding=embedding,
            top_k=5
        )

    Configuration options::

        pgvector = registry.get("pgvector", namespace="plugin")(
            connection_string="postgresql+asyncpg://localhost/mydb",
            table_name="embeddings",
            embedding_dim=1536,
            pool_size=20,        # Connection pool size
            max_overflow=10,     # Extra connections if needed
            pool_recycle=3600,   # Recycle after 1 hour
            pool_pre_ping=True,  # Check connection health
            distance_metric="cosine"
        )
    """

    Config = PgVectorConfig

    def __init__(self, **kwargs: Any) -> None:
        """Initialize PgVector adapter with SQLAlchemy engine."""
        if Vector is None:
            msg = (
                "pgvector is required for PgVector adapter. "
                "Install it with: uv pip install pgvector"
            )
            raise ImportError(msg)

        super().__init__(**kwargs)
        self._engine: AsyncEngine | None = None
        self._model_class = None
        self._initialized = False

    def _create_model(self):
        """Create SQLAlchemy model dynamically based on config."""
        table_name = self.config.table_name
        embedding_dim = self.config.embedding_dim

        class EmbeddingModel(Base):
            __tablename__ = table_name
            __table_args__ = {"extend_existing": True}

            id = Column(Integer, primary_key=True)
            text = Column(Text, nullable=False)
            embedding = Column(Vector(embedding_dim))
            metadata = Column(JSONB, default={})

        return EmbeddingModel

    async def asetup(self) -> None:
        """Initialize SQLAlchemy engine and create tables."""
        if self.config.connection_string is None:
            msg = "connection_string is required for PgVector adapter"
            raise ValueError(msg)

        # Create async engine with connection pool
        self._engine = create_async_engine(
            self.config.connection_string.get_secret_value(),
            pool_size=self.config.pool_size,
            max_overflow=self.config.max_overflow,
            pool_timeout=self.config.pool_timeout,
            pool_recycle=self.config.pool_recycle,
            pool_pre_ping=self.config.pool_pre_ping,
            echo=False,  # Set to True for SQL debugging
        )

        # Create dynamic model
        self._model_class = self._create_model()

        # Enable pgvector extension and create tables
        async with self._engine.begin() as conn:
            await conn.execute(text("CREATE EXTENSION IF NOT EXISTS vector"))
            await conn.run_sync(Base.metadata.create_all)

            # Create vector similarity index
            distance_op = self._get_distance_operator()
            index_name = f"{self.config.table_name}_embedding_idx"

            # Check if index exists
            index_exists = await conn.scalar(
                text("SELECT EXISTS (SELECT 1 FROM pg_indexes WHERE indexname = :index_name)"),
                {"index_name": index_name},
            )

            if not index_exists:
                await conn.execute(
                    text(f"""
                        CREATE INDEX {index_name}
                        ON {self.config.table_name}
                        USING ivfflat (embedding {distance_op})
                        WITH (lists = 100)
                    """)
                )

        self._initialized = True

    async def aclose(self) -> None:
        """Close SQLAlchemy engine and connection pool."""
        if self._engine:
            await self._engine.dispose()
            self._engine = None
        self._initialized = False

    async def aadd_documents(
        self,
        documents: list[dict[str, Any]],
        embeddings: list[list[float]] | None = None,
    ) -> dict[str, Any]:
        """Add documents with embeddings to the vector store."""
        if not self._initialized:
            await self.asetup()

        if embeddings is None:
            msg = "Embeddings are required for PgVector adapter. Example: embeddings = \
await embedding_adapter.aembed_texts([doc['text'] for doc in documents])"
            raise ValueError(msg)

        if len(documents) != len(embeddings):
            msg = f"Mismatch: {len(documents)} documents but {len(embeddings)} embeddings"
            raise ValueError(msg)

        inserted_ids = []
        async with AsyncSession(self._engine) as session:
            for doc, embedding in zip(documents, embeddings, strict=False):
                record = self._model_class(
                    text=doc.get("text", ""),
                    embedding=embedding,
                    metadata=doc.get("metadata", {}),
                )
                session.add(record)

            await session.commit()

            # Get IDs of inserted records (need to refresh to get IDs)
            inserted_ids.extend(record.id for record in session.new if hasattr(record, "id"))

        return {
            "added": len(documents),
            "ids": inserted_ids,
        }

    async def asearch(
        self,
        query: str,
        query_embedding: list[float] | None = None,
        top_k: int | None = None,
        filter_metadata: dict[str, Any] | None = None,
    ) -> list[dict[str, Any]]:
        """Search for similar documents using vector similarity."""
        if not self._initialized:
            await self.asetup()

        if query_embedding is None:
            msg = (
                "Query embedding is required for PgVector search. "
                "Example: query_embedding = await embedding_adapter.aembed_text(query)"
            )
            raise ValueError(msg)

        k = top_k or self.config.max_results

        # Build query with distance function
        distance_func = self._get_distance_function()

        async with AsyncSession(self._engine) as session:
            # Build base query
            query_obj = session.query(self._model_class).order_by(
                distance_func(self._model_class.embedding, query_embedding)
            )

            # Add metadata filtering if provided
            if filter_metadata:
                for key, value in filter_metadata.items():
                    query_obj = query_obj.filter(
                        self._model_class.metadata[key].astext == str(value)
                    )

            # Execute query
            query_obj = query_obj.limit(k)
            results = (await session.execute(query_obj)).scalars().all()

            # Format results
            formatted_results = []
            for record in results:
                # Calculate similarity score
                distance = distance_func(record.embedding, query_embedding)
                similarity = (
                    1 - distance if self.config.distance_metric in ["cosine", "l2"] else distance
                )

                formatted_results.append({
                    "id": record.id,
                    "text": record.text,
                    "metadata": record.metadata or {},
                    "similarity_score": float(similarity),
                })

            return formatted_results

    async def aclear(self) -> dict[str, Any]:
        """Clear all documents from the table."""
        if not self._initialized:
            await self.asetup()

        async with AsyncSession(self._engine) as session:
            # Count before deletion
            count = await session.scalar(text(f"SELECT COUNT(*) FROM {self.config.table_name}"))

            # Truncate table
            await session.execute(text(f"TRUNCATE TABLE {self.config.table_name}"))
            await session.commit()

        return {"removed": count}

    async def acount(self) -> int:
        """Get the number of documents in the vector store."""
        if not self._initialized:
            await self.asetup()

        async with AsyncSession(self._engine) as session:
            count = await session.scalar(text(f"SELECT COUNT(*) FROM {self.config.table_name}"))
            return count or 0

    async def aget_stats(self) -> dict[str, Any]:
        """Get statistics about the vector store and connection pool."""
        if not self._initialized:
            await self.asetup()

        async with AsyncSession(self._engine) as session:
            # Get document count
            count = await session.scalar(text(f"SELECT COUNT(*) FROM {self.config.table_name}"))

            # Get table size
            size = await session.scalar(
                text(f"SELECT pg_size_pretty(pg_total_relation_size('{self.config.table_name}'))")
            )

            # Get pool statistics
            pool_stats = {}
            if self._engine and self._engine.pool:
                pool = self._engine.pool
                pool_stats = {
                    "pool_size": pool.size(),
                    "checked_in": pool.checkedin(),
                    "checked_out": pool.checkedout(),
                    "overflow": pool.overflow(),
                    "max_overflow": self.config.max_overflow,
                }

            return {
                "document_count": count,
                "table_name": self.config.table_name,
                "table_size": size,
                "embedding_dim": self.config.embedding_dim,
                "distance_metric": self.config.distance_metric,
                "connection_pool": pool_stats,
            }

    def _get_distance_operator(self) -> str:
        """Get PostgreSQL distance operator for the configured metric."""
        operators = {
            "cosine": "<=>",
            "l2": "<->",
            "inner_product": "<#>",
        }
        return operators.get(self.config.distance_metric, "<=>")

    def _get_distance_function(self):
        """Get pgvector distance function for SQLAlchemy."""
        from pgvector.sqlalchemy import Vector

        metric_map = {
            "cosine": "cosine_distance",
            "l2": "l2_distance",
            "inner_product": "max_inner_product",
        }
        func_name = metric_map.get(self.config.distance_metric, "cosine_distance")
        return getattr(Vector, func_name, Vector.cosine_distance)

    async def ahealth_check(self) -> HealthStatus:
        """Check PostgreSQL, pgvector, and connection pool health."""
        start_time = time.time()

        try:
            if not self._initialized:
                await self.asetup()

            async with AsyncSession(self._engine) as session:
                # Test basic connectivity
                await session.execute(text("SELECT 1"))

                # Test pgvector extension
                await session.execute(text("SELECT vector_dims(vector('[1,2,3]'))"))

                # Test table exists
                table_exists = await session.scalar(
                    text(
                        """
                        SELECT EXISTS (
                            SELECT FROM information_schema.tables
                            WHERE table_name = :table_name
                        )
                        """
                    ),
                    {"table_name": self.config.table_name},
                )

                latency_ms = (time.time() - start_time) * 1000

                if not table_exists:
                    return HealthStatus(
                        status="degraded",
                        adapter_name="pgvector",
                        port_name="vector_store",
                        latency_ms=latency_ms,
                        details={
                            "table": self.config.table_name,
                            "message": "Table does not exist (not yet initialized)",
                        },
                    )

                # Get document count
                count = await self.acount()

                # Get pool statistics
                pool_stats = {}
                if self._engine and self._engine.pool:
                    pool = self._engine.pool
                    pool_stats = {
                        "pool_size": pool.size(),
                        "checked_in": pool.checkedin(),
                        "checked_out": pool.checkedout(),
                    }

                return HealthStatus(
                    status="healthy",
                    adapter_name="pgvector",
                    port_name="vector_store",
                    latency_ms=latency_ms,
                    details={
                        "table": self.config.table_name,
                        "document_count": count,
                        "embedding_dim": self.config.embedding_dim,
                        "distance_metric": self.config.distance_metric,
                        "connection_pool": pool_stats,
                        "sqlalchemy_version": "async",
                    },
                )

        except Exception as e:
            latency_ms = (time.time() - start_time) * 1000
            return HealthStatus(
                status="unhealthy",
                adapter_name="pgvector",
                port_name="vector_store",
                error=e,
                latency_ms=latency_ms,
                details={
                    "table": self.config.table_name,
                    "error_type": type(e).__name__,
                },
            )

    def __repr__(self) -> str:
        """String representation."""
        pool_info = ""
        if self._engine and self._engine.pool:
            pool = self._engine.pool
            pool_info = f", pool={pool.checkedout()}/{pool.size()}"
        return f"PgVectorAdapter(table={self.config.table_name}{pool_info})"
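For orientation, a minimal end-to-end sketch of driving this adapter, assembled from its class docstring and error messages; it is not part of the package. It assumes a reachable PostgreSQL instance with the pgvector extension, and it assumes an embedding adapter registered under the hypothetical name "mock_embedding" that exposes aembed_texts()/aembed_text() (those method names come from the error messages in aadd_documents()/asearch(); the registry name and namespace are guesses).

    # Hypothetical usage sketch, not shipped with hexdag 0.5.0.dev1.
    import asyncio

    from hexdag.core.registry import registry  # import shown in the adapter docstring


    async def main() -> None:
        # Construct the adapter via the registry, as the docstring demonstrates.
        store = registry.get("pgvector", namespace="plugin")(
            connection_string="postgresql+asyncpg://user:pass@localhost/mydb",
            table_name="document_embeddings",
            embedding_dim=384,
        )
        await store.asetup()

        documents = [
            {"text": "hexdag is a DAG orchestration framework", "metadata": {"source": "readme"}},
            {"text": "pgvector stores embeddings in PostgreSQL", "metadata": {"source": "docs"}},
        ]

        # Embeddings come from a separate embedding adapter; the name below is an assumption.
        embedder = registry.get("mock_embedding", namespace="plugin")()
        embeddings = await embedder.aembed_texts([doc["text"] for doc in documents])
        await store.aadd_documents(documents, embeddings)

        # Search requires the query embedding to be supplied by the caller.
        query = "Where are embeddings stored?"
        results = await store.asearch(
            query,
            query_embedding=await embedder.aembed_text(query),
            top_k=3,
        )
        for hit in results:
            print(hit["similarity_score"], hit["text"])

        await store.aclose()


    asyncio.run(main())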