astrocyte-pgvector 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,5 @@
1
+ """PostgreSQL + pgvector adapter for Astrocyte Tier 1."""
2
+
3
+ from astrocyte_pgvector.store import PgVectorStore
4
+
5
+ __all__ = ["PgVectorStore"]
@@ -0,0 +1,280 @@
1
+ """VectorStore backed by PostgreSQL with the pgvector extension."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import asyncio
6
+ import json
7
+ import os
8
+ import re
9
+ from datetime import datetime
10
+ from typing import Any, ClassVar
11
+
12
+ import psycopg
13
+ from pgvector.psycopg import register_vector_async
14
+ from psycopg.rows import dict_row
15
+ from psycopg.types.json import Json
16
+ from psycopg_pool import AsyncConnectionPool
17
+
18
+ from astrocyte.types import HealthStatus, VectorFilters, VectorHit, VectorItem
19
+
20
# Identifier whitelist: an ASCII letter or underscore, followed by ASCII
# letters, digits, or underscores.
_TABLE_SAFE = re.compile(r"^[a-zA-Z_][a-zA-Z0-9_]*$")


def _sanitize_table(name: str) -> str:
    """Return *name* unchanged if it is a safe SQL identifier, else raise.

    The table name is interpolated directly into SQL text (identifiers cannot
    be passed as bound parameters), so it is restricted to the ``_TABLE_SAFE``
    whitelist.

    Args:
        name: Candidate table name.

    Returns:
        The same string, once validated.

    Raises:
        ValueError: If *name* contains anything outside the whitelist.
    """
    # `fullmatch` rather than `match`: with `match`, the `$` anchor also
    # matches just before a trailing newline, so "name\n" would be accepted.
    if not _TABLE_SAFE.fullmatch(name):
        raise ValueError(f"Invalid table name: {name!r}")
    return name
27
+
28
+
29
class PgVectorStore:
    """Tier 1 vector store using `pgvector` cosine distance search.

    Rows are kept in a single table (``table_name``) keyed by ``id`` and
    scoped by ``bank_id``. The connection pool is opened lazily on first use.
    When ``bootstrap_schema`` is true, the extension, table and ``bank_id``
    index are also created lazily on the first data operation.
    """

    SPI_VERSION: ClassVar[int] = 1

    def __init__(
        self,
        dsn: str | None = None,
        table_name: str = "astrocyte_vectors",
        embedding_dimensions: int = 128,
        bootstrap_schema: bool = True,
        **kwargs: Any,
    ) -> None:
        """Validate configuration; no database connection is made here.

        Args:
            dsn: PostgreSQL connection string. Falls back to the
                ``DATABASE_URL`` and then ``ASTROCYTES_PG_DSN`` environment
                variables when omitted or empty.
            table_name: Table to read and write; must pass
                ``_sanitize_table``.
            embedding_dimensions: Width ``N`` of the ``vector(N)`` column.
                Every stored or queried vector must have this length.
            bootstrap_schema: When true, run ``CREATE EXTENSION`` /
                ``CREATE TABLE`` / ``CREATE INDEX`` on first use. When false,
                the schema is assumed to exist already.
            **kwargs: Accepted and ignored.

        Raises:
            ValueError: If no DSN can be resolved, the table name is invalid,
                or ``embedding_dimensions`` is below 1.
        """
        self._dsn = dsn or os.environ.get("DATABASE_URL") or os.environ.get("ASTROCYTES_PG_DSN")
        if not self._dsn:
            raise ValueError(
                "PgVectorStore requires `dsn` in vector_store_config or DATABASE_URL / ASTROCYTES_PG_DSN",
            )
        self._table = _sanitize_table(table_name)
        self._dim = int(embedding_dimensions)
        if self._dim < 1:
            raise ValueError("embedding_dimensions must be >= 1")
        self._bootstrap_schema = bool(bootstrap_schema)
        self._pool: AsyncConnectionPool | None = None
        self._pool_lock = asyncio.Lock()
        # When migrations own DDL, skip in-app CREATE TABLE / indexes.
        self._schema_ready = not self._bootstrap_schema
        self._schema_lock = asyncio.Lock()

    # ------------------------------------------------------------------
    # Connection / schema management
    # ------------------------------------------------------------------

    @staticmethod
    async def _configure_connection(conn: psycopg.AsyncConnection) -> None:
        """Per-connection setup hook passed to the pool as ``configure``.

        ``register_vector_async`` needs the ``vector`` type to exist, so it is
        skipped until the pgvector extension is installed (quick path:
        ``health()`` can run before in-app DDL; runbook path: migrations have
        already created the extension).
        """
        async with conn.cursor() as cur:
            await cur.execute("SELECT 1 FROM pg_extension WHERE extname = 'vector'")
            ext_present = await cur.fetchone()
        if ext_present:
            await register_vector_async(conn)
        # End the transaction opened by the query above before the pool takes
        # the connection.
        await conn.commit()

    async def _ensure_pool(self) -> AsyncConnectionPool:
        """Return the shared pool, creating and opening it on first call."""
        async with self._pool_lock:
            if self._pool is None:
                pool = AsyncConnectionPool(
                    conninfo=self._dsn,
                    configure=self._configure_connection,
                    open=False,
                    min_size=1,
                    max_size=10,
                    kwargs={"connect_timeout": 10},
                )
                await pool.open()
                # Cache only after `open()` succeeded so that a failed open is
                # retried on the next call instead of handing out a pool that
                # never opened.
                self._pool = pool
            return self._pool

    async def _ensure_schema(self, pool: AsyncConnectionPool) -> None:
        """Create the extension, table and ``bank_id`` index once per instance.

        Does nothing when ``bootstrap_schema`` was false or when a previous
        call already completed.
        """
        if self._schema_ready:
            return
        async with self._schema_lock:
            # Re-check: another task may have finished while we were waiting.
            if self._schema_ready:
                return
            async with pool.connection() as conn:
                await conn.execute("CREATE EXTENSION IF NOT EXISTS vector")
                await conn.execute(
                    f"""
                    CREATE TABLE IF NOT EXISTS {self._table} (
                    id TEXT PRIMARY KEY,
                    bank_id TEXT NOT NULL,
                    embedding vector({self._dim}) NOT NULL,
                    text TEXT NOT NULL,
                    metadata JSONB,
                    tags TEXT[],
                    fact_type TEXT,
                    occurred_at TIMESTAMPTZ,
                    memory_layer TEXT
                    )
                    """
                )
                await conn.execute(
                    f"CREATE INDEX IF NOT EXISTS {self._table}_bank_idx ON {self._table} (bank_id)"
                )
                # Only this connection is registered here. Connections that
                # were configured before the extension existed stay
                # unregistered, which is why `_coerce_vector` also accepts
                # the text form of a vector.
                await register_vector_async(conn)
                await conn.commit()
            self._schema_ready = True

    async def close(self) -> None:
        """Close the connection pool, if one was opened.

        Safe to call more than once. A later operation opens a fresh pool.
        """
        async with self._pool_lock:
            pool, self._pool = self._pool, None
        if pool is not None:
            await pool.close()

    # ------------------------------------------------------------------
    # Row decoding helpers
    # ------------------------------------------------------------------

    @staticmethod
    def _decode_metadata(raw: Any) -> Any:
        """Return JSONB metadata as a Python object, parsing it if it is text."""
        return json.loads(raw) if isinstance(raw, str) else raw

    @staticmethod
    def _decode_tags(raw: Any) -> list[str] | None:
        """Return tags as a list, or ``None`` for NULL / empty arrays."""
        return list(raw) if raw else None

    @staticmethod
    def _coerce_vector(raw: Any) -> list[float]:
        """Convert a fetched ``embedding`` value into a list of Python floats.

        A connection without the vector type registered may return the value
        in its text form (``"[1,2,3]"``). Calling ``list()`` on that string
        would produce single characters, so text is parsed first.
        """
        if isinstance(raw, str):
            raw = json.loads(raw)
        return [float(component) for component in raw]

    @staticmethod
    def _clamp_score(raw: Any) -> float:
        """Clamp a ``1 - cosine_distance`` value into ``[0.0, 1.0]``.

        NaN is mapped to ``0.0``. Presumably it can arise when a distance is
        computed against an all-zero vector - verify against pgvector.
        """
        score = float(raw)
        # `not (score > 0.0)` is true for negatives, zero and NaN alike.
        if not score > 0.0:
            return 0.0
        return min(score, 1.0)

    # ------------------------------------------------------------------
    # VectorStore operations
    # ------------------------------------------------------------------

    async def store_vectors(self, items: list[VectorItem]) -> list[str]:
        """Insert or update *items* (upsert on ``id``) and return their ids.

        All vectors are validated before any database work, so a malformed
        batch fails without opening a connection. An empty batch returns
        ``[]`` immediately.

        Raises:
            ValueError: If any vector's length differs from
                ``embedding_dimensions``.
        """
        if not items:
            return []
        for item in items:
            if len(item.vector) != self._dim:
                raise ValueError(
                    f"Vector length {len(item.vector)} != embedding_dimensions {self._dim}",
                )

        pool = await self._ensure_pool()
        await self._ensure_schema(pool)

        upsert_sql = f"""
            INSERT INTO {self._table}
            (id, bank_id, embedding, text, metadata, tags, fact_type, occurred_at, memory_layer)
            VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s)
            ON CONFLICT (id) DO UPDATE SET
            bank_id = EXCLUDED.bank_id,
            embedding = EXCLUDED.embedding,
            text = EXCLUDED.text,
            metadata = EXCLUDED.metadata,
            tags = EXCLUDED.tags,
            fact_type = EXCLUDED.fact_type,
            occurred_at = EXCLUDED.occurred_at,
            memory_layer = EXCLUDED.memory_layer
            """
        rows = [
            (
                item.id,
                item.bank_id,
                item.vector,
                item.text,
                Json(item.metadata) if item.metadata is not None else None,
                item.tags,
                item.fact_type,
                item.occurred_at,
                item.memory_layer,
            )
            for item in items
        ]
        # No explicit commit: this relies on the `pool.connection()` block
        # committing on a clean exit and rolling back on error.
        async with pool.connection() as conn:
            async with conn.cursor() as cur:
                await cur.executemany(upsert_sql, rows)
        return [item.id for item in items]

    async def search_similar(
        self,
        query_vector: list[float],
        bank_id: str,
        limit: int = 10,
        filters: VectorFilters | None = None,
    ) -> list[VectorHit]:
        """Return up to *limit* rows of *bank_id* closest to *query_vector*.

        Rows are ordered by cosine distance (``<=>``), nearest first. Each
        hit's score is ``1 - distance`` clamped to ``[0.0, 1.0]``.

        Args:
            query_vector: Must have exactly ``embedding_dimensions`` entries.
            bank_id: Only rows with this ``bank_id`` are considered.
            limit: Maximum number of hits.
            filters: Optional. ``tags`` keeps rows whose tags overlap the
                given list; ``fact_types`` keeps rows whose ``fact_type`` is
                one of the given values. Empty lists are ignored.

        Raises:
            ValueError: If *query_vector* has the wrong length.
        """
        if len(query_vector) != self._dim:
            raise ValueError(
                f"Query vector length {len(query_vector)} != embedding_dimensions {self._dim}",
            )
        pool = await self._ensure_pool()
        await self._ensure_schema(pool)

        # `params` must follow placeholder order in the SQL below: score
        # expression, WHERE clauses, ORDER BY, LIMIT.
        where = ["bank_id = %s"]
        params: list[Any] = [query_vector, bank_id]
        if filters and filters.tags:
            where.append("tags && %s::text[]")
            params.append(filters.tags)
        if filters and filters.fact_types:
            where.append("fact_type = ANY(%s::text[])")
            params.append(filters.fact_types)
        params.extend([query_vector, limit])

        where_sql = " AND ".join(where)
        # Cosine distance `<=>`; map to a 0-1 score via (1 - distance).
        sql = f"""
            SELECT id, text, metadata, tags, fact_type, occurred_at, memory_layer,
            (1 - (embedding <=> %s::vector))::float AS score
            FROM {self._table}
            WHERE {where_sql}
            ORDER BY embedding <=> %s::vector
            LIMIT %s
            """

        async with pool.connection() as conn:
            async with conn.cursor(row_factory=dict_row) as cur:
                await cur.execute(sql, params)
                rows = await cur.fetchall()

        return [
            VectorHit(
                id=row["id"],
                text=row["text"],
                score=self._clamp_score(row["score"]),
                metadata=self._decode_metadata(row["metadata"]),
                tags=self._decode_tags(row["tags"]),
                fact_type=row["fact_type"],
                occurred_at=row["occurred_at"],
                memory_layer=row.get("memory_layer"),
            )
            for row in rows
        ]

    async def list_vectors(
        self,
        bank_id: str,
        offset: int = 0,
        limit: int = 100,
    ) -> list[VectorItem]:
        """Return one page of the rows in *bank_id*, ordered by ``id``.

        Args:
            bank_id: Only rows with this ``bank_id`` are returned.
            offset: Number of rows to skip.
            limit: Maximum number of rows to return.
        """
        pool = await self._ensure_pool()
        await self._ensure_schema(pool)
        async with pool.connection() as conn:
            async with conn.cursor(row_factory=dict_row) as cur:
                await cur.execute(
                    f"""
                    SELECT id, bank_id, embedding, text, metadata, tags, fact_type,
                    occurred_at, memory_layer
                    FROM {self._table}
                    WHERE bank_id = %s
                    ORDER BY id
                    OFFSET %s LIMIT %s
                    """,
                    (bank_id, offset, limit),
                )
                rows = await cur.fetchall()

        return [
            VectorItem(
                id=row["id"],
                bank_id=row["bank_id"],
                vector=self._coerce_vector(row["embedding"]),
                text=row["text"],
                metadata=self._decode_metadata(row["metadata"]),
                tags=self._decode_tags(row["tags"]),
                fact_type=row["fact_type"],
                occurred_at=row["occurred_at"],
                memory_layer=row.get("memory_layer"),
            )
            for row in rows
        ]

    async def delete(self, ids: list[str], bank_id: str) -> int:
        """Delete the rows of *bank_id* whose id is in *ids*.

        Returns:
            The number of rows deleted (``0`` for an empty *ids*, without
            touching the database).
        """
        if not ids:
            return 0
        pool = await self._ensure_pool()
        await self._ensure_schema(pool)
        async with pool.connection() as conn:
            async with conn.cursor() as cur:
                await cur.execute(
                    f"DELETE FROM {self._table} WHERE bank_id = %s AND id = ANY(%s::text[])",
                    (bank_id, ids),
                )
                return cur.rowcount or 0

    async def health(self) -> HealthStatus:
        """Report whether a trivial query succeeds; never raises.

        The schema is deliberately not bootstrapped here, so the check works
        before any table exists.
        """
        try:
            pool = await self._ensure_pool()
            async with pool.connection() as conn:
                async with conn.cursor() as cur:
                    await cur.execute("SELECT 1")
            return HealthStatus(healthy=True, message="pgvector connected")
        except Exception as e:
            # Boundary handler: any failure becomes an unhealthy status.
            return HealthStatus(healthy=False, message=f"pgvector unhealthy: {e!s}")
@@ -0,0 +1,119 @@
1
+ Metadata-Version: 2.4
2
+ Name: astrocyte-pgvector
3
+ Version: 0.1.0
4
+ Summary: PostgreSQL pgvector VectorStore adapter for Astrocyte
5
+ License-Expression: Apache-2.0
6
+ Requires-Python: >=3.11
7
+ Requires-Dist: astrocyte<0.2,>=0.1.0
8
+ Requires-Dist: pgvector>=0.4
9
+ Requires-Dist: psycopg-pool>=3.2
10
+ Requires-Dist: psycopg[binary]>=3.1
11
+ Description-Content-Type: text/markdown
12
+
13
+ # astrocyte-pgvector
14
+
15
+ **PostgreSQL + [pgvector](https://github.com/pgvector/pgvector)** implementation of the Astrocyte **`VectorStore`** SPI ([`provider-spi.md`](../../docs/_plugins/provider-spi.md)).
16
+
17
+ ## Install
18
+
19
+ From the monorepo (with `astrocyte` available):
20
+
21
+ ```bash
22
+ cd astrocyte-services-py/astrocyte-pgvector
23
+ uv sync
24
+ # or: pip install -e ../../astrocyte-py && pip install -e .
25
+ ```
26
+
27
+ Entry point name: **`pgvector`** (group `astrocyte.vector_stores`).
28
+
29
+ ## PostgreSQL with Docker
30
+
31
+ Use the **combined** Compose stack in **[`../docker-compose.yml`](../docker-compose.yml)** (directory **`astrocyte-services-py/`**) to run **Postgres (pgvector) + the reference REST service** together:
32
+
33
+ ```bash
34
+ cd astrocyte-services-py
35
+ docker compose up -d
36
+ ```
37
+
38
+ For **Postgres only** (no HTTP), start only `postgres`:
39
+
40
+ ```bash
41
+ cd astrocyte-services-py
42
+ docker compose up -d postgres
43
+ ```
44
+
45
+ Default DSN from your host (port **5433** maps to Postgres in the compose file):
46
+
47
+ ```text
48
+ postgresql://astrocyte:astrocyte@127.0.0.1:5433/astrocyte
49
+ ```
50
+
51
+ ## Schema migrations (production)
52
+
53
+ DDL is shipped as **plain SQL** under [`migrations/`](migrations/) and applied with **`psql`** via [`scripts/migrate.sh`](scripts/migrate.sh) (no Python migration framework).
54
+
55
+ ```bash
56
+ export DATABASE_URL='postgresql://astrocyte:astrocyte@127.0.0.1:5433/astrocyte'
57
+ cd astrocyte-services-py/astrocyte-pgvector
58
+ ./scripts/migrate.sh
59
+ ```
60
+
61
+ Requirements: **PostgreSQL 15+** (for `CREATE INDEX CONCURRENTLY IF NOT EXISTS`), **psql** on `PATH`.
62
+
63
+ After migrations are applied, set **`bootstrap_schema: false`** in `vector_store_config` so the app does not run `CREATE TABLE` / indexes at runtime (see configuration table below). For a **single command** that starts Postgres, runs migrations, then starts the stack with runbook config, use **[`scripts/runbook-up.sh`](../scripts/runbook-up.sh)** (see **[Runbook](../README.md#runbook)**).
64
+
65
+ **Embedding width:** [`migrations/002_astrocyte_vectors.sql`](migrations/002_astrocyte_vectors.sql) defines `vector(128)`. That must match **`embedding_dimensions`** in config. For another width, add a new migration (or edit before first deploy) and keep the Python config aligned.
66
+
67
+ **Custom `table_name`:** The shipped SQL targets **`astrocyte_vectors`**. If you use another table name, copy and adjust the migration files accordingly.
68
+
69
+ ## Configuration
70
+
71
+ | Constructor / YAML `vector_store_config` | Meaning |
72
+ |--------------------------------------------|---------|
73
+ | `dsn` | PostgreSQL connection URI (or set `DATABASE_URL` / `ASTROCYTES_PG_DSN`) |
74
+ | `table_name` | Table name (default `astrocyte_vectors`; ASCII letters, digits, and underscores only, and it must not start with a digit) |
75
+ | `embedding_dimensions` | Fixed `vector(N)` width; must match your embedding model and the **`vector(N)`** in SQL migrations (default **128**) |
76
+ | `bootstrap_schema` | If **`true`** (default), create extension / table / btree index on first use (dev-friendly; no HNSW). If **`false`**, assume **`migrate.sh`** already applied [`migrations/`](migrations/) (production). |
77
+
78
+ ## How this fits `astrocyte_rest`
79
+
80
+ 1. **`astrocyte-py`** defines the **`VectorStore`** protocol and discovers adapters by **entry point** (`astrocyte.vector_stores`).
81
+ 2. **`astrocyte-pgvector`** registers **`pgvector` → `PgVectorStore`**. Installing this package makes the name **`pgvector`** available to **`resolve_provider()`**.
82
+ 3. **`astrocyte_rest/wiring.py`** calls **`resolve_vector_store(config)`**, which loads the class from the entry point and passes **`vector_store_config`** from YAML (or env-only defaults).
83
+ 4. **`astrocyte_rest/brain.py`** builds **`Astrocyte`** + **`PipelineOrchestrator`** with that store and your chosen **`llm_provider`** (still **`mock`** unless you configure a real LLM).
84
+
85
+ Example **`ASTROCYTES_CONFIG_PATH`** snippet:
86
+
87
+ ```yaml
88
+ provider_tier: storage
89
+ vector_store: pgvector
90
+ llm_provider: mock
91
+ vector_store_config:
92
+ dsn: postgresql://astrocyte:astrocyte@127.0.0.1:5433/astrocyte
93
+ embedding_dimensions: 128
94
+ bootstrap_schema: false
95
+ ```
96
+
97
+ Then run the REST service (from repo layout):
98
+
99
+ ```bash
100
+ export ASTROCYTES_CONFIG_PATH=/path/to/that.yaml
101
+ cd astrocyte-services-py/astrocyte-rest && uv run astrocyte-rest
102
+ ```
103
+
104
+ Or set only env (no YAML file):
105
+
106
+ ```bash
107
+ export ASTROCYTES_VECTOR_STORE=pgvector
108
+ export DATABASE_URL=postgresql://astrocyte:astrocyte@127.0.0.1:5433/astrocyte
109
+ # embedding_dimensions default 128 — override via YAML if you add a file
110
+ cd astrocyte-services-py/astrocyte-rest && uv sync --extra pgvector
111
+ ```
112
+
113
+ **Note:** `embedding_dimensions` in `vector_store_config` is currently read only from YAML. In env-only mode the default (**128**) applies; to change it, add a small YAML file or extend `brain.py` to pass `ASTROCYTES_EMBEDDING_DIMENSIONS` (future improvement).
114
+
115
+ ## Production notes
116
+
117
+ - **HNSW** parameters (`m`, `ef_construction`) live in [`migrations/003_indexes.sql`](migrations/003_indexes.sql); tune with DBA guidance as load grows.
118
+ - **Embedding dimension** must match the **`LLMProvider.embed()`** output used by the pipeline.
119
+ - Use **secrets** for `dsn`, not committed YAML.
@@ -0,0 +1,6 @@
1
+ astrocyte_pgvector/__init__.py,sha256=RFaVN_DmwTwQg_dSlQor5JbT8GSDyuheJaN4zXKMLhc,139
2
+ astrocyte_pgvector/store.py,sha256=qDd65lCsneDAAKy-iogZZVLM2R8UzOkdkeRT6vgUEgs,10874
3
+ astrocyte_pgvector-0.1.0.dist-info/METADATA,sha256=wIqS_pfn-fBXWpEegOknKZwi60SboFdyIONBTaQMiSo,5342
4
+ astrocyte_pgvector-0.1.0.dist-info/WHEEL,sha256=QccIxa26bgl1E6uMy58deGWi-0aeIkkangHcxk2kWfw,87
5
+ astrocyte_pgvector-0.1.0.dist-info/entry_points.txt,sha256=1ddqMVkzitWuDUYGl_RtvQLeHPlagUY7yXpU0wL0AVU,76
6
+ astrocyte_pgvector-0.1.0.dist-info/RECORD,,
@@ -0,0 +1,4 @@
1
+ Wheel-Version: 1.0
2
+ Generator: hatchling 1.29.0
3
+ Root-Is-Purelib: true
4
+ Tag: py3-none-any
@@ -0,0 +1,2 @@
1
+ [astrocyte.vector_stores]
2
+ pgvector = astrocyte_pgvector.store:PgVectorStore