pgstream-0.1.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pgstream/__init__.py +5 -0
- pgstream/decoder.py +248 -0
- pgstream/events.py +42 -0
- pgstream/py.typed +0 -0
- pgstream/replication.py +225 -0
- pgstream/sinks/__init__.py +5 -0
- pgstream/sinks/base.py +53 -0
- pgstream/sinks/pgvector.py +91 -0
- pgstream/sinks/qdrant.py +114 -0
- pgstream/stream.py +277 -0
- pgstream-0.1.0.dist-info/METADATA +296 -0
- pgstream-0.1.0.dist-info/RECORD +13 -0
- pgstream-0.1.0.dist-info/WHEEL +4 -0
pgstream/__init__.py
ADDED
pgstream/decoder.py
ADDED
@@ -0,0 +1,248 @@
from __future__ import annotations

import struct
from dataclasses import dataclass, field
from datetime import datetime, timezone, timedelta


_PG_EPOCH = datetime(2000, 1, 1, tzinfo=timezone.utc)


def _pg_ts_to_datetime(microseconds: int) -> datetime:
    return _PG_EPOCH + timedelta(microseconds=microseconds)


def _lsn_to_str(lsn: int) -> str:
    high = lsn >> 32
    low = lsn & 0xFFFFFFFF
    return f"{high:X}/{low:X}"


@dataclass
class ColumnInfo:
    name: str
    type_oid: int
    is_key: bool


@dataclass
class RelationInfo:
    oid: int
    schema: str
    table: str
    columns: list[ColumnInfo] = field(default_factory=list)


class PgOutputDecoder:
    """Stateful parser for the pgoutput logical replication protocol (v1).

    Caches ``Relation`` messages keyed by OID so that subsequent DML messages
    can be decoded into named columns. Transaction context (LSN, timestamp,
    XID) is tracked from ``Begin`` messages and attached to every emitted event.

    Usage::

        decoder = PgOutputDecoder()
        for raw_msg in replication_cursor:
            event = decoder.decode(raw_msg.payload)
            if event is not None:
                yield event
    """

    def __init__(self) -> None:
        self._relations: dict[int, RelationInfo] = {}
        self._current_lsn: str = "0/0"
        self._current_commit_time: datetime = _PG_EPOCH
        self._current_xid: int = 0

    def decode(self, payload: bytes) -> dict | None:
        """Decode one raw pgoutput message.

        Returns a dict for Insert/Update/Delete/Truncate messages, or ``None``
        for Begin/Commit/Relation and other metadata messages.
        """
        if not payload:
            return None

        msg_type = chr(payload[0])
        data = payload[1:]

        if msg_type == "B":
            self._handle_begin(data)
        elif msg_type == "C":
            self._handle_commit(data)
        elif msg_type == "R":
            self._handle_relation(data)
        elif msg_type == "I":
            return self._handle_insert(data)
        elif msg_type == "U":
            return self._handle_update(data)
        elif msg_type == "D":
            return self._handle_delete(data)
        elif msg_type == "T":
            return self._handle_truncate(data)

        return None

    def _handle_begin(self, data: bytes) -> None:
        final_lsn, commit_ts, xid = struct.unpack_from(">qqi", data, 0)
        self._current_lsn = _lsn_to_str(final_lsn)
        self._current_commit_time = _pg_ts_to_datetime(commit_ts)
        self._current_xid = xid

    def _handle_commit(self, data: bytes) -> None:
        # Offset 1 skips the leading flags byte of the Commit message.
        commit_lsn, end_lsn, commit_ts = struct.unpack_from(">qqq", data, 1)
        self._current_lsn = _lsn_to_str(commit_lsn)
        self._current_commit_time = _pg_ts_to_datetime(commit_ts)

    def _handle_relation(self, data: bytes) -> None:
        offset = 0
        oid, = struct.unpack_from(">I", data, offset)
        offset += 4
        schema, offset = _read_cstring(data, offset)
        table, offset = _read_cstring(data, offset)
        offset += 1  # replica identity byte
        num_cols, = struct.unpack_from(">H", data, offset)
        offset += 2

        columns: list[ColumnInfo] = []
        for _ in range(num_cols):
            col_flags = data[offset]
            offset += 1
            col_name, offset = _read_cstring(data, offset)
            type_oid, type_mod = struct.unpack_from(">Ii", data, offset)
            offset += 8
            columns.append(ColumnInfo(
                name=col_name,
                type_oid=type_oid,
                is_key=bool(col_flags & 0x01),
            ))

        self._relations[oid] = RelationInfo(
            oid=oid, schema=schema, table=table, columns=columns
        )

    def _handle_insert(self, data: bytes) -> dict | None:
        offset = 0
        oid, = struct.unpack_from(">I", data, offset)
        offset += 4
        relation = self._get_relation(oid)
        if relation is None:
            return None
        offset += 1  # skip 'N' byte
        row, offset = self._decode_tuple(data, offset, relation)
        return self._make_event("insert", relation, row, None)

    def _handle_update(self, data: bytes) -> dict | None:
        offset = 0
        oid, = struct.unpack_from(">I", data, offset)
        offset += 4
        relation = self._get_relation(oid)
        if relation is None:
            return None

        old_row: dict | None = None
        marker = chr(data[offset])
        if marker in ("K", "O"):
            # 'K' carries only the key columns; 'O' carries the full old row
            # (REPLICA IDENTITY FULL).
            offset += 1
            old_row, offset = self._decode_tuple(data, offset, relation)
            marker = chr(data[offset])

        assert marker == "N", f"Expected 'N' in UPDATE, got {marker!r}"
        offset += 1
        new_row, offset = self._decode_tuple(data, offset, relation)
        return self._make_event("update", relation, new_row, old_row)

    def _handle_delete(self, data: bytes) -> dict | None:
        offset = 0
        oid, = struct.unpack_from(">I", data, offset)
        offset += 4
        relation = self._get_relation(oid)
        if relation is None:
            return None
        offset += 1  # skip 'K' or 'O' byte
        old_row, offset = self._decode_tuple(data, offset, relation)
        return self._make_event("delete", relation, old_row, None)

    def _handle_truncate(self, data: bytes) -> dict | None:
        offset = 0
        num_relations, = struct.unpack_from(">I", data, offset)
        offset += 4
        offset += 1  # flags byte
        if num_relations == 0:
            return None
        # Only the first truncated relation is reported; a multi-table
        # TRUNCATE therefore yields a single event.
        oid, = struct.unpack_from(">I", data, offset)
        relation = self._get_relation(oid)
        if relation is None:
            return None
        return self._make_event("truncate", relation, {}, None)

    def _get_relation(self, oid: int) -> RelationInfo | None:
        rel = self._relations.get(oid)
        if rel is None:
            import warnings
            warnings.warn(
                f"pgstream: received DML for unknown relation OID {oid}. "
                "The Relation message may have been missed. Skipping event."
            )
        return rel

    def _decode_tuple(
        self, data: bytes, offset: int, relation: RelationInfo
    ) -> tuple[dict[str, str | None], int]:
        num_cols, = struct.unpack_from(">H", data, offset)
        offset += 2
        row: dict[str, str | None] = {}

        for i in range(num_cols):
            col_name = relation.columns[i].name if i < len(relation.columns) else f"col_{i}"
            col_type = chr(data[offset])
            offset += 1

            if col_type == "n":
                row[col_name] = None
            elif col_type == "u":
                # Unchanged TOASTed value — not sent by Postgres; emit None.
                row[col_name] = None
            elif col_type == "t":
                val_len, = struct.unpack_from(">I", data, offset)
                offset += 4
                row[col_name] = data[offset : offset + val_len].decode("utf-8")
                offset += val_len
            elif col_type == "b":
                val_len, = struct.unpack_from(">I", data, offset)
                offset += 4
                row[col_name] = data[offset : offset + val_len].hex()
                offset += val_len
            else:
                raise ValueError(
                    f"pgstream decoder: unknown column type byte {col_type!r} "
                    f"for column {col_name!r} in {relation.schema}.{relation.table}"
                )

        return row, offset

    def _make_event(
        self,
        operation: str,
        relation: RelationInfo,
        row: dict,
        old_row: dict | None,
    ) -> dict:
        return {
            "operation": operation,
            "schema": relation.schema,
            "table": relation.table,
            "row": row,
            "old_row": old_row,
            "lsn": self._current_lsn,
            "commit_time": self._current_commit_time,
            "xid": self._current_xid,
        }


def _read_cstring(data: bytes, offset: int) -> tuple[str, int]:
    end = data.index(b"\x00", offset)
    value = data[offset:end].decode("utf-8")
    return value, end + 1
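For reference, a minimal round trip through the decoder with hand-built pgoutput frames; the byte layouts mirror the parser above, and the OID, xid, and values are arbitrary:

    import struct
    from pgstream.decoder import PgOutputDecoder

    decoder = PgOutputDecoder()

    # Begin: Int64 final LSN, Int64 commit timestamp (µs since 2000-01-01), Int32 xid.
    decoder.decode(b"B" + struct.pack(">qqi", 0x16B3748, 0, 742))

    # Relation: Int32 OID, schema\0, table\0, replica identity byte,
    # Int16 column count, then per column: flags, name\0, type OID, type mod.
    rel = struct.pack(">I", 16402) + b"public\x00documents\x00d"
    rel += struct.pack(">H", 1) + b"\x01id\x00" + struct.pack(">Ii", 23, -1)
    decoder.decode(b"R" + rel)

    # Insert: Int32 OID, 'N', then TupleData: Int16 count, 't', Int32 length, text.
    ins = struct.pack(">I", 16402) + b"N" + struct.pack(">H", 1)
    ins += b"t" + struct.pack(">I", 2) + b"42"
    event = decoder.decode(b"I" + ins)
    assert event["row"] == {"id": "42"} and event["lsn"] == "0/16B3748"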
pgstream/events.py
ADDED
@@ -0,0 +1,42 @@
from __future__ import annotations

from dataclasses import dataclass
from datetime import datetime
from typing import Literal


@dataclass
class ChangeEvent:
    """A single committed row-level change decoded from the Postgres WAL.

    Attributes:
        operation: One of ``"insert"``, ``"update"``, ``"delete"``, ``"truncate"``.
        schema: Postgres schema name (e.g. ``"public"``).
        table: Table name (e.g. ``"documents"``).
        row: New row as ``{column: value}``. Values are always strings or
            ``None``; cast them yourself (e.g. ``int(event.row["id"])``).
            For DELETE this contains the old/key row. For TRUNCATE it is
            an empty dict.
        old_row: Previous row on UPDATE when the old tuple is sent (key
            columns by default, the full row with ``REPLICA IDENTITY FULL``).
            ``None`` otherwise; for DELETE the old/key row is in ``row``.
        lsn: WAL Log Sequence Number at commit (e.g. ``"0/1A3F28"``).
        commit_time: UTC datetime of the transaction commit.
        xid: Postgres transaction ID.
    """

    operation: Literal["insert", "update", "delete", "truncate"]
    schema: str
    table: str
    row: dict[str, str | None]
    old_row: dict[str, str | None] | None
    lsn: str
    commit_time: datetime
    xid: int

    def __repr__(self) -> str:
        row_preview = {k: v for k, v in list(self.row.items())[:3]}
        suffix = "..." if len(self.row) > 3 else ""
        return (
            f"ChangeEvent({self.operation} {self.schema}.{self.table} "
            f"lsn={self.lsn} row={row_preview}{suffix})"
        )
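A sketch of a consumer that branches on ``operation``; the ``route`` name and the print targets are illustrative, not part of the package:

    from pgstream.events import ChangeEvent

    def route(event: ChangeEvent) -> None:
        # Row values arrive as Postgres text; cast at the edge.
        doc_id = event.row.get("id")
        if event.operation in ("insert", "update"):
            print(f"upsert {event.schema}.{event.table}/{doc_id} at lsn={event.lsn}")
        elif event.operation == "delete":
            print(f"delete {event.schema}.{event.table}/{doc_id}")
        else:  # truncate: row is an empty dict
            print(f"reset {event.schema}.{event.table}")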
pgstream/py.typed
ADDED
File without changes
pgstream/replication.py
ADDED
@@ -0,0 +1,225 @@
from __future__ import annotations

import logging
import select
import threading
import time
from typing import Callable

import psycopg2
from psycopg2.extras import (
    LogicalReplicationConnection,
    ReplicationCursor,
    ReplicationMessage,
)

from .decoder import PgOutputDecoder
from .events import ChangeEvent

logger = logging.getLogger("pgstream.replication")


class SlotManager:
    """Creates and drops the replication slot and publication.

    Uses a normal (non-replication) psycopg2 connection for all DDL.
    All methods are safe to call from a thread executor.
    """

    def __init__(self, dsn: str, slot_name: str, publication_name: str) -> None:
        self._dsn = dsn
        self.slot_name = slot_name
        self.publication_name = publication_name

    def setup(self, tables: list[str]) -> None:
        """Create the publication and replication slot if they don't exist.

        Idempotent — safe to call on every startup.

        Args:
            tables: Unqualified table names to watch, e.g. ``["documents"]``.
        """
        conn = psycopg2.connect(self._dsn)
        conn.autocommit = True
        try:
            cur = conn.cursor()

            cur.execute(
                "SELECT 1 FROM pg_publication WHERE pubname = %s",
                (self.publication_name,),
            )
            if cur.fetchone() is None:
                table_list = ", ".join(f'"{t}"' for t in tables)
                cur.execute(
                    f"CREATE PUBLICATION {self.publication_name} FOR TABLE {table_list}"
                )
                logger.info("Created publication %r for tables: %s", self.publication_name, tables)
            else:
                logger.info("Publication %r already exists — skipping.", self.publication_name)

            cur.execute(
                "SELECT 1 FROM pg_replication_slots WHERE slot_name = %s",
                (self.slot_name,),
            )
            if cur.fetchone() is None:
                cur.execute(
                    "SELECT pg_create_logical_replication_slot(%s, 'pgoutput')",
                    (self.slot_name,),
                )
                logger.info("Created replication slot %r.", self.slot_name)
            else:
                logger.info("Replication slot %r already exists — skipping.", self.slot_name)
        finally:
            conn.close()

    def teardown(self, drop_slot: bool = True, drop_publication: bool = True) -> None:
        """Drop the replication slot and/or publication.

        Args:
            drop_slot: Drop the replication slot (default ``True``).
            drop_publication: Drop the publication (default ``True``).

        Warning:
            Dropping the slot causes Postgres to stop retaining WAL. Any events
            that occur while the slot is absent will be permanently lost.
        """
        conn = psycopg2.connect(self._dsn)
        conn.autocommit = True
        try:
            cur = conn.cursor()
            if drop_slot:
                cur.execute(
                    "SELECT pg_drop_replication_slot(%s) "
                    "WHERE EXISTS (SELECT 1 FROM pg_replication_slots WHERE slot_name = %s)",
                    (self.slot_name, self.slot_name),
                )
                logger.info("Dropped replication slot %r.", self.slot_name)
            if drop_publication:
                cur.execute(f"DROP PUBLICATION IF EXISTS {self.publication_name}")
                logger.info("Dropped publication %r.", self.publication_name)
        finally:
            conn.close()


class ReplicationStream:
    """Opens a replication connection and streams :class:`ChangeEvent` objects.

    Blocking — runs the Postgres replication protocol loop in the calling
    thread. :class:`~pgstream.stream.PGStream` wraps this in a background
    thread so the asyncio event loop is not blocked.

    Call :meth:`stop` from any thread to exit the loop cleanly.
    """

    KEEPALIVE_INTERVAL = 10.0

    def __init__(self, dsn: str, slot_name: str, publication_name: str) -> None:
        self._dsn = dsn
        self._slot_name = slot_name
        self._publication_name = publication_name
        self._stop_event = threading.Event()
        self._decoder = PgOutputDecoder()
        self._conn: psycopg2.extensions.connection | None = None

    def stop(self) -> None:
        """Signal the streaming loop to stop after the current message."""
        self._stop_event.set()
        if self._conn is not None:
            try:
                self._conn.close()
            except Exception:
                pass

    def stream(
        self,
        tables: list[str],
        on_event: Callable[[ChangeEvent], None],
    ) -> None:
        """Stream WAL events from Postgres, calling *on_event* for each.

        The LSN is ACKed to Postgres only after *on_event* returns without
        raising, guaranteeing at-least-once delivery.

        Blocks until :meth:`stop` is called or a fatal error occurs.

        Args:
            tables: Table names being watched. The loop itself does not filter
                by table; filtering is done by the publication.
            on_event: Synchronous callback invoked for each :class:`ChangeEvent`.
        """
        self._stop_event.clear()

        self._conn = psycopg2.connect(
            self._dsn,
            connection_factory=LogicalReplicationConnection,
        )

        try:
            cur: ReplicationCursor = self._conn.cursor()
            cur.start_replication(
                slot_name=self._slot_name,
                decode=False,
                options={
                    "proto_version": "1",
                    "publication_names": self._publication_name,
                },
            )
            logger.info(
                "Started replication from slot %r / publication %r",
                self._slot_name,
                self._publication_name,
            )

            last_keepalive = time.monotonic()

            while not self._stop_event.is_set():
                try:
                    msg: ReplicationMessage | None = cur.read_message()
                except psycopg2.OperationalError:
                    break

                if msg is not None:
                    raw = self._decoder.decode(bytes(msg.payload))

                    if raw is not None:
                        event = ChangeEvent(
                            operation=raw["operation"],
                            schema=raw["schema"],
                            table=raw["table"],
                            row=raw["row"],
                            old_row=raw["old_row"],
                            lsn=raw["lsn"],
                            commit_time=raw["commit_time"],
                            xid=raw["xid"],
                        )
                        on_event(event)

                    try:
                        cur.send_feedback(flush_lsn=msg.data_start)
                    except (psycopg2.InterfaceError, psycopg2.OperationalError):
                        break
                    last_keepalive = time.monotonic()

                else:
                    # No message pending: wait for the socket to become
                    # readable, sending periodic status updates so the server
                    # does not time the connection out.
                    try:
                        fd = self._conn.fileno()
                        select.select([fd], [], [], 1.0)
                    except OSError:
                        break

                    now = time.monotonic()
                    if now - last_keepalive > self.KEEPALIVE_INTERVAL:
                        try:
                            cur.send_feedback()
                        except (psycopg2.InterfaceError, psycopg2.OperationalError):
                            break
                        last_keepalive = now

        finally:
            try:
                if self._conn and not self._conn.closed:
                    self._conn.close()
            except Exception:
                pass
            self._conn = None
            logger.info("Replication stream stopped.")
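Putting the two classes together; a minimal sketch assuming a local Postgres with ``wal_level=logical`` (the DSN, slot, and publication names are placeholders):

    import threading
    from pgstream.replication import ReplicationStream, SlotManager

    DSN = "postgresql://postgres:postgres@localhost:5432/app"

    manager = SlotManager(DSN, slot_name="pgstream_slot", publication_name="pgstream_pub")
    manager.setup(tables=["documents"])

    stream = ReplicationStream(DSN, "pgstream_slot", "pgstream_pub")
    # stream() blocks, so run it in a worker thread; print stands in for a real callback.
    worker = threading.Thread(target=stream.stream, args=(["documents"], print), daemon=True)
    worker.start()

    # ... later, from any thread:
    stream.stop()
    worker.join(timeout=5)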
pgstream/sinks/base.py
ADDED
@@ -0,0 +1,53 @@
from __future__ import annotations

from abc import ABC, abstractmethod


class Sink(ABC):
    """Abstract base class for all pgstream vector store sinks.

    Implement this interface to add support for a new vector store.
    All I/O methods are async.

    Example::

        class MyVectorDB(Sink):
            async def upsert(self, id: str, vector: list[float], payload: dict | None = None) -> None:
                await self._client.upsert(id, vector, metadata=payload)

            async def delete(self, id: str) -> None:
                await self._client.delete(id)
    """

    @abstractmethod
    async def upsert(
        self,
        id: str,
        vector: list[float],
        payload: dict | None = None,
    ) -> None:
        """Insert or update a vector in the store.

        Args:
            id: Unique identifier for this document (stringified PK).
            vector: Dense float embedding, e.g. ``[0.12, -0.45, ...]``.
            payload: Optional metadata stored alongside the vector.
        """
        ...

    @abstractmethod
    async def delete(self, id: str) -> None:
        """Remove a vector from the store by its ID.

        Implementations should be idempotent — deleting a non-existent ID
        must not raise.
        """
        ...

    async def close(self) -> None:
        """Release resources held by this sink (connections, HTTP sessions, etc.).

        Called automatically by :meth:`~pgstream.stream.PGStream.stop`.
        Override if your sink holds a long-lived connection.
        """
        pass
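A hypothetical in-memory implementation of this interface, useful as a test double (not part of the package):

    from pgstream.sinks.base import Sink

    class MemorySink(Sink):
        def __init__(self) -> None:
            self.store: dict[str, tuple[list[float], dict | None]] = {}

        async def upsert(self, id: str, vector: list[float], payload: dict | None = None) -> None:
            self.store[id] = (vector, payload)

        async def delete(self, id: str) -> None:
            # Idempotent per the base-class contract: missing IDs are ignored.
            self.store.pop(id, None)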
pgstream/sinks/pgvector.py
ADDED
@@ -0,0 +1,91 @@
from __future__ import annotations

import json
import logging

import asyncpg

from .base import Sink

logger = logging.getLogger("pgstream.sinks.pgvector")


class PgVectorSink(Sink):
    """Writes vectors to a `pgvector <https://github.com/pgvector/pgvector>`_-enabled Postgres table.

    The target table must have this schema::

        CREATE EXTENSION IF NOT EXISTS vector;
        CREATE TABLE embeddings (
            id TEXT PRIMARY KEY,
            vector VECTOR(1536),  -- match your model's output dimension
            payload JSONB
        );

    Args:
        dsn: Postgres connection string.
        table: Target table name (default ``"embeddings"``).
        dimension: Embedding dimension — informational only.
    """

    def __init__(
        self,
        dsn: str,
        table: str = "embeddings",
        dimension: int | None = None,
    ) -> None:
        self._dsn = dsn
        self._table = table
        self._dimension = dimension
        self._pool: asyncpg.Pool | None = None

    async def _get_pool(self) -> asyncpg.Pool:
        if self._pool is None:
            self._pool = await asyncpg.create_pool(
                self._dsn,
                min_size=1,
                max_size=5,
                init=self._init_connection,
            )
        return self._pool

    @staticmethod
    async def _init_connection(conn: asyncpg.Connection) -> None:
        await conn.execute("SET search_path TO public")

    async def upsert(
        self,
        id: str,
        vector: list[float],
        payload: dict | None = None,
    ) -> None:
        """Insert or update a vector row. Uses ``INSERT ... ON CONFLICT DO UPDATE``."""
        pool = await self._get_pool()
        vector_str = "[" + ",".join(str(v) for v in vector) + "]"
        payload_json = json.dumps(payload) if payload is not None else "{}"

        # The table name is interpolated directly (identifiers cannot be
        # bound parameters); it must come from trusted configuration.
        await pool.execute(
            f"""
            INSERT INTO {self._table} (id, vector, payload)
            VALUES ($1, $2::vector, $3::jsonb)
            ON CONFLICT (id) DO UPDATE
            SET vector = EXCLUDED.vector,
                payload = EXCLUDED.payload
            """,
            id,
            vector_str,
            payload_json,
        )
        logger.debug("Upserted id=%s into %s", id, self._table)

    async def delete(self, id: str) -> None:
        """Delete a row by ``id``. No-op if the row does not exist."""
        pool = await self._get_pool()
        await pool.execute(f"DELETE FROM {self._table} WHERE id = $1", id)
        logger.debug("Deleted id=%s from %s", id, self._table)

    async def close(self) -> None:
        if self._pool is not None:
            await self._pool.close()
            self._pool = None
            logger.info("PgVectorSink pool closed.")
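Example usage; assumes the ``embeddings`` table from the docstring was created with ``VECTOR(3)`` to match the toy vector here (a real deployment would use the model's dimension), and a placeholder DSN:

    import asyncio
    from pgstream.sinks.pgvector import PgVectorSink

    async def main() -> None:
        sink = PgVectorSink("postgresql://localhost:5432/app", table="embeddings")
        await sink.upsert("doc-1", [0.12, -0.45, 0.33], payload={"source": "documents"})
        await sink.delete("doc-1")  # idempotent, per the Sink contract
        await sink.close()

    asyncio.run(main())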