pgstream 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pgstream-0.1.0/PKG-INFO +296 -0
- pgstream-0.1.0/README.md +279 -0
- pgstream-0.1.0/pyproject.toml +41 -0
- pgstream-0.1.0/src/pgstream/__init__.py +5 -0
- pgstream-0.1.0/src/pgstream/decoder.py +248 -0
- pgstream-0.1.0/src/pgstream/events.py +42 -0
- pgstream-0.1.0/src/pgstream/py.typed +0 -0
- pgstream-0.1.0/src/pgstream/replication.py +225 -0
- pgstream-0.1.0/src/pgstream/sinks/__init__.py +5 -0
- pgstream-0.1.0/src/pgstream/sinks/base.py +53 -0
- pgstream-0.1.0/src/pgstream/sinks/pgvector.py +91 -0
- pgstream-0.1.0/src/pgstream/sinks/qdrant.py +114 -0
- pgstream-0.1.0/src/pgstream/stream.py +277 -0
pgstream-0.1.0/PKG-INFO
ADDED
|
@@ -0,0 +1,296 @@
|
|
|
1
|
+
Metadata-Version: 2.3
|
|
2
|
+
Name: pgstream
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Production-grade Python SDK for Postgres CDC: watch tables via logical replication and sync changes to vector stores (pgvector, Qdrant).
|
|
5
|
+
Author: Kamalesh-Kavin
|
|
6
|
+
Author-email: Kamalesh-Kavin <kamalesh.s7316@gmail.com>
|
|
7
|
+
Requires-Dist: psycopg2-binary>=2.9.11
|
|
8
|
+
Requires-Dist: asyncpg>=0.30.0 ; extra == 'all'
|
|
9
|
+
Requires-Dist: qdrant-client>=1.14.0 ; extra == 'all'
|
|
10
|
+
Requires-Dist: asyncpg>=0.30.0 ; extra == 'pgvector'
|
|
11
|
+
Requires-Dist: qdrant-client>=1.14.0 ; extra == 'qdrant'
|
|
12
|
+
Requires-Python: >=3.13
|
|
13
|
+
Provides-Extra: all
|
|
14
|
+
Provides-Extra: pgvector
|
|
15
|
+
Provides-Extra: qdrant
|
|
16
|
+
Description-Content-Type: text/markdown
|
|
17
|
+
|
|
18
|
+
# pgstream
|
|
19
|
+
|
|
20
|
+
A production-grade Python SDK that watches Postgres tables via **logical replication (CDC)** and syncs row changes to **vector stores** in real time.
|
|
21
|
+
|
|
22
|
+
```bash
|
|
23
|
+
pip install pgstream
|
|
24
|
+
```
|
|
25
|
+
|
|
26
|
+
---
|
|
27
|
+
|
|
28
|
+
## What it does
|
|
29
|
+
|
|
30
|
+
pgstream connects to your Postgres database using the **logical replication protocol** (the same protocol Debezium and pglogical use). Whenever you INSERT, UPDATE, or DELETE a row in a watched table, pgstream decodes the WAL change into a structured `ChangeEvent` Python object and calls your async handler with it.
|
|
31
|
+
|
|
32
|
+
Your handler is where the application logic lives — typically:
|
|
33
|
+
- Embed the new row content with your model of choice
|
|
34
|
+
- Upsert the resulting vector into a vector store (pgvector, Qdrant, etc.)
|
|
35
|
+
- Delete vectors for deleted rows
|
|
36
|
+
|
|
37
|
+
pgstream handles all the low-level plumbing: slot management, keepalive ACKs, at-least-once delivery, graceful shutdown.
|
|
38
|
+
|
|
39
|
+
---
|
|
40
|
+
|
|
41
|
+
## Quick start
|
|
42
|
+
|
|
43
|
+
```python
|
|
44
|
+
import asyncio
|
|
45
|
+
from pgstream import PGStream, ChangeEvent
|
|
46
|
+
from pgstream.sinks import QdrantSink
|
|
47
|
+
|
|
48
|
+
stream = PGStream(dsn="postgresql://user:pass@localhost/db")
|
|
49
|
+
stream.watch("documents")
|
|
50
|
+
stream.sink(QdrantSink(url="http://localhost:6333", collection_name="docs"))
|
|
51
|
+
|
|
52
|
+
@stream.on_change
|
|
53
|
+
async def handle(event: ChangeEvent, sink):
|
|
54
|
+
if event.operation in ("insert", "update"):
|
|
55
|
+
vector = await my_embed_function(event.row["content"])
|
|
56
|
+
await sink.upsert(
|
|
57
|
+
id=event.row["id"],
|
|
58
|
+
vector=vector,
|
|
59
|
+
payload={"content": event.row["content"]},
|
|
60
|
+
)
|
|
61
|
+
elif event.operation == "delete":
|
|
62
|
+
await sink.delete(event.row["id"])
|
|
63
|
+
|
|
64
|
+
async def main():
|
|
65
|
+
await stream.setup() # idempotent — creates slot + publication once
|
|
66
|
+
await stream.start() # blocks; press Ctrl+C to stop
|
|
67
|
+
|
|
68
|
+
asyncio.run(main())
|
|
69
|
+
```
|
|
70
|
+
|
|
71
|
+
---
|
|
72
|
+
|
|
73
|
+
## Installation
|
|
74
|
+
|
|
75
|
+
```bash
|
|
76
|
+
# Core only (replication + event decoding)
|
|
77
|
+
pip install pgstream
|
|
78
|
+
|
|
79
|
+
# With pgvector sink
|
|
80
|
+
pip install "pgstream[pgvector]"
|
|
81
|
+
|
|
82
|
+
# With Qdrant sink
|
|
83
|
+
pip install "pgstream[qdrant]"
|
|
84
|
+
|
|
85
|
+
# Both
|
|
86
|
+
pip install "pgstream[all]"
|
|
87
|
+
```
|
|
88
|
+
|
|
89
|
+
**Requirements:**
|
|
90
|
+
- Python 3.13+
|
|
91
|
+
- Postgres 10+ with `wal_level = logical`
|
|
92
|
+
- The user in the DSN must have the `REPLICATION` privilege
|
|
93
|
+
|
|
94
|
+
---
|
|
95
|
+
|
|
96
|
+
## Architecture
|
|
97
|
+
|
|
98
|
+
```
|
|
99
|
+
┌────────────────────────────────────────────────────────────────────┐
|
|
100
|
+
│ Postgres │
|
|
101
|
+
│ │
|
|
102
|
+
│ WAL ──► pgoutput plugin ──► replication slot ──► client │
|
|
103
|
+
└────────────────────────────────────────────────────────────────────┘
|
|
104
|
+
│
|
|
105
|
+
psycopg2 replication protocol
|
|
106
|
+
│
|
|
107
|
+
┌─────────────────────────────────────────────────────────┼──────────┐
|
|
108
|
+
│ pgstream │ │
|
|
109
|
+
│ ▼ │
|
|
110
|
+
│ ┌──────────────────────────┐ ┌────────────────────────────┐ │
|
|
111
|
+
│ │ ReplicationStream │ │ PgOutputDecoder │ │
|
|
112
|
+
│ │ (background thread) │───►│ (binary wire format │ │
|
|
113
|
+
│ │ │ │ parser, pure Python) │ │
|
|
114
|
+
│ │ · psycopg2 blocking │ └────────────┬───────────────┘ │
|
|
115
|
+
│ │ replication loop │ │ ChangeEvent │
|
|
116
|
+
│ │ · read_message() │ ▼ │
|
|
117
|
+
│ │ · send_feedback (ACK) │ ┌────────────────────────────┐ │
|
|
118
|
+
│ │ · keepalive every 10s │ │ on_change handler │ │
|
|
119
|
+
│ └──────────────────────────┘ │ (user's async function) │ │
|
|
120
|
+
│ │ │ │ │
|
|
121
|
+
│ asyncio.run_coroutine_ │ runs in main event loop │ │
|
|
122
|
+
│ threadsafe() ─────────────────►│ via run_coroutine_ │ │
|
|
123
|
+
│ │ threadsafe() │ │
|
|
124
|
+
│ └────────────┬───────────────┘ │
|
|
125
|
+
│ │ │
|
|
126
|
+
│ ▼ │
|
|
127
|
+
│ ┌────────────────────────────┐ │
|
|
128
|
+
│ │ Sink │ │
|
|
129
|
+
│ │ (PgVectorSink / │ │
|
|
130
|
+
│ │ QdrantSink / custom) │ │
|
|
131
|
+
└──────────────────────────────────────────────────────────────────┘
|
|
132
|
+
```
|
|
133
|
+
|
|
134
|
+
### Two connections, two purposes
|
|
135
|
+
|
|
136
|
+
| Connection | Library | Purpose |
|
|
137
|
+
|---|---|---|
|
|
138
|
+
| Replication connection | psycopg2 | Streams WAL bytes using the logical replication protocol |
|
|
139
|
+
| Normal query connection | asyncpg (in sinks) | Regular SQL: CREATE PUBLICATION, slot management, vector writes |
|
|
140
|
+
|
|
141
|
+
**Why two libraries?** asyncpg does not implement the logical replication protocol — it is a query client only. psycopg2 is the only Python library with full replication support (`LogicalReplicationConnection`, `start_replication()`, `read_message()`, `send_feedback()`).
|
|
142
|
+
|
|
143
|
+
### Threading model
|
|
144
|
+
|
|
145
|
+
The psycopg2 replication loop is blocking. Running it directly in the asyncio event loop would stall all other coroutines. pgstream runs it in a **background daemon thread**. When an event is decoded, `asyncio.run_coroutine_threadsafe()` submits the user's async handler to the main event loop and **blocks the thread** until it completes before ACKing the LSN. This preserves at-least-once delivery.
|
|
146
|
+
|
|
147
|
+
```
|
|
148
|
+
[background thread] [main event loop]
|
|
149
|
+
│ │
|
|
150
|
+
decode WAL bytes │
|
|
151
|
+
│ │
|
|
152
|
+
run_coroutine_threadsafe ────────────►│ await handler(event, sink)
|
|
153
|
+
│ │ │
|
|
154
|
+
future.result() ◄───────────────────┤ return │
|
|
155
|
+
(blocks thread) │
|
|
156
|
+
│
|
|
157
|
+
send_feedback (ACK)
|
|
158
|
+
```
|
|
159
|
+
|
|
160
|
+
---
|
|
161
|
+
|
|
162
|
+
## Module breakdown
|
|
163
|
+
|
|
164
|
+
| File | Responsibility |
|
|
165
|
+
|---|---|
|
|
166
|
+
| `events.py` | `ChangeEvent` dataclass — the single object flowing through the pipeline |
|
|
167
|
+
| `decoder.py` | `PgOutputDecoder` — pure Python binary parser for the pgoutput protocol |
|
|
168
|
+
| `replication.py` | `SlotManager` (setup/teardown) + `ReplicationStream` (streaming loop) |
|
|
169
|
+
| `stream.py` | `PGStream` — top-level user API, threading bridge, lifecycle management |
|
|
170
|
+
| `sinks/base.py` | `Sink` abstract base class |
|
|
171
|
+
| `sinks/pgvector.py` | `PgVectorSink` — asyncpg-based pgvector reference implementation |
|
|
172
|
+
| `sinks/qdrant.py` | `QdrantSink` — qdrant-client-based Qdrant reference implementation |
|
|
173
|
+
|
|
174
|
+
---
|
|
175
|
+
|
|
176
|
+
## The `ChangeEvent` object
|
|
177
|
+
|
|
178
|
+
```python
|
|
179
|
+
@dataclass
|
|
180
|
+
class ChangeEvent:
|
|
181
|
+
operation: Literal["insert", "update", "delete", "truncate"]
|
|
182
|
+
schema: str # e.g. "public"
|
|
183
|
+
table: str # e.g. "documents"
|
|
184
|
+
row: dict[str, str | None] # new row (text-encoded values)
|
|
185
|
+
old_row: dict[str, str | None] | None # old row (only with REPLICA IDENTITY FULL)
|
|
186
|
+
lsn: str # WAL position, e.g. "0/1A3F28"
|
|
187
|
+
commit_time: datetime # UTC datetime of the transaction
|
|
188
|
+
xid: int # Postgres transaction ID
|
|
189
|
+
```
|
|
190
|
+
|
|
191
|
+
**Column values are always strings (or `None` for SQL NULL).** pgoutput sends all column data text-encoded. pgstream does not coerce types — `event.row["price"]` is `"9.99"`, not `9.99`. Cast in your handler.
|
|
192
|
+
|
|
193
|
+
---
|
|
194
|
+
|
|
195
|
+
## Delivery guarantee
|
|
196
|
+
|
|
197
|
+
pgstream provides **at-least-once delivery**:
|
|
198
|
+
- Each LSN is ACKed (via `send_feedback()`) only **after** your handler returns successfully.
|
|
199
|
+
- If your handler raises, the LSN is not ACKed. On the next restart, Postgres replays from the last confirmed position.
|
|
200
|
+
- This means your handler may be called more than once for the same event — for example, if it fails part-way through and the event is replayed after a restart. Design your sink writes to be **idempotent** (both `PgVectorSink` and `QdrantSink` use upsert semantics by default).
|
|
201
|
+
|
|
202
|
+
---
|
|
203
|
+
|
|
204
|
+
## Running the example
|
|
205
|
+
|
|
206
|
+
```bash
|
|
207
|
+
# Postgres must have wal_level = logical
|
|
208
|
+
export PGSTREAM_DSN=postgresql://user:pass@localhost:5432/db
|
|
209
|
+
|
|
210
|
+
uv run python examples/basic_watch.py
|
|
211
|
+
```
|
|
212
|
+
|
|
213
|
+
In a second terminal:
|
|
214
|
+
```sql
|
|
215
|
+
INSERT INTO documents (content) VALUES ('Hello, pgstream!');
|
|
216
|
+
UPDATE documents SET content = 'Updated' WHERE id = 1;
|
|
217
|
+
DELETE FROM documents WHERE id = 1;
|
|
218
|
+
```
|
|
219
|
+
|
|
220
|
+
---
|
|
221
|
+
|
|
222
|
+
## Running tests
|
|
223
|
+
|
|
224
|
+
```bash
|
|
225
|
+
# Unit tests only (no DB required)
|
|
226
|
+
uv run pytest tests/test_decoder.py -v
|
|
227
|
+
|
|
228
|
+
# Integration tests (requires Postgres with wal_level = logical)
|
|
229
|
+
export PGSTREAM_DSN=postgresql://user:pass@localhost:5432/db
|
|
230
|
+
uv run pytest tests/ -v
|
|
231
|
+
|
|
232
|
+
# Skip integration tests
|
|
233
|
+
uv run pytest tests/ -v -m "not integration"
|
|
234
|
+
```
|
|
235
|
+
|
|
236
|
+
---
|
|
237
|
+
|
|
238
|
+
## Implementing a custom Sink
|
|
239
|
+
|
|
240
|
+
```python
|
|
241
|
+
from pgstream.sinks import Sink
|
|
242
|
+
|
|
243
|
+
class MyPineconeSink(Sink):
|
|
244
|
+
async def upsert(self, id: str, vector: list[float], payload: dict | None = None) -> None:
|
|
245
|
+
# write to Pinecone, Weaviate, Milvus, etc.
|
|
246
|
+
...
|
|
247
|
+
|
|
248
|
+
async def delete(self, id: str) -> None:
|
|
249
|
+
...
|
|
250
|
+
|
|
251
|
+
async def close(self) -> None:
|
|
252
|
+
# optional: release HTTP sessions, pools, etc.
|
|
253
|
+
...
|
|
254
|
+
```
|
|
255
|
+
|
|
256
|
+
---
|
|
257
|
+
|
|
258
|
+
## Postgres setup
|
|
259
|
+
|
|
260
|
+
```sql
|
|
261
|
+
-- 1. Enable logical replication (a postgresql.conf setting, not a SQL statement; restart Postgres after changing it)
|
|
262
|
+
wal_level = logical
|
|
263
|
+
|
|
264
|
+
-- 2. Grant replication privilege to your user
|
|
265
|
+
ALTER USER myuser REPLICATION;
|
|
266
|
+
|
|
267
|
+
-- 3. pgstream handles the rest (CREATE PUBLICATION, slot creation) via setup()
|
|
268
|
+
```
|
|
269
|
+
|
|
270
|
+
---
|
|
271
|
+
|
|
272
|
+
## Key decisions and what I learned
|
|
273
|
+
|
|
274
|
+
### Why psycopg2 for replication?
|
|
275
|
+
|
|
276
|
+
asyncpg is widely used for async Postgres in Python, but it only implements the regular query protocol. The logical replication protocol requires a separate connection type (`LogicalReplicationConnection`) and special commands (`start_replication`, `read_message`, `send_feedback`). Only psycopg2 exposes these in Python. psycopg3 does not yet have a stable public API for replication. This was a surprising discovery that shaped the entire architecture.
|
|
277
|
+
|
|
278
|
+
### Why a background thread?
|
|
279
|
+
|
|
280
|
+
The psycopg2 replication loop is fundamentally blocking — `select()` and `read_message()` block the calling thread. Running this in the asyncio event loop would freeze all coroutines. The solution is a daemon thread for the replication loop, with `asyncio.run_coroutine_threadsafe()` to bridge back to the event loop for the user's handler. The thread blocks on `future.result()` until the handler completes, preserving the at-least-once delivery guarantee.
|
|
281
|
+
|
|
282
|
+
### Why text encoding?
|
|
283
|
+
|
|
284
|
+
pgoutput protocol v1 sends column values as text strings (e.g. `"42"` for an integer column). pgstream deliberately does not coerce types — doing so would require knowing the Postgres OID → Python type mapping, which is complex (timezone-aware datetimes, Decimals, custom enum types, arrays, JSONB...). Keeping values as strings keeps the library simple and lets the user decide on casting.
|
|
285
|
+
|
|
286
|
+
### The Relation message cache
|
|
287
|
+
|
|
288
|
+
Before any INSERT/UPDATE/DELETE, Postgres sends a Relation (R) message with the table's OID and column schema. This must be cached — DML messages only contain the OID, not column names. If the cache is missing an OID (edge case: consumer joined mid-stream), we emit a warning and skip the event rather than crashing. The cache is per-connection (OIDs are session-stable).
|
|
289
|
+
|
|
290
|
+
### ACK timing
|
|
291
|
+
|
|
292
|
+
`send_feedback(flush_lsn=...)` tells Postgres "I have processed everything up to this LSN, you can discard WAL". We ACK **after** the handler returns, never before. This is the crucial point for at-least-once delivery. We also send keepalive feedback every 10 seconds even when idle — Postgres will kill the replication connection after `wal_sender_timeout` (default 60s) if it hears nothing.
|
|
293
|
+
|
|
294
|
+
### Idempotent setup
|
|
295
|
+
|
|
296
|
+
`setup()` checks `pg_replication_slots` and `pg_publication` before creating anything. It is safe to call on every startup. This simplifies deployment — you don't need a migration step to create the slot; just call `setup()` at boot.
|
pgstream-0.1.0/README.md
ADDED
|
@@ -0,0 +1,279 @@
|
|
|
1
|
+
# pgstream
|
|
2
|
+
|
|
3
|
+
A production-grade Python SDK that watches Postgres tables via **logical replication (CDC)** and syncs row changes to **vector stores** in real time.
|
|
4
|
+
|
|
5
|
+
```bash
|
|
6
|
+
pip install pgstream
|
|
7
|
+
```
|
|
8
|
+
|
|
9
|
+
---
|
|
10
|
+
|
|
11
|
+
## What it does
|
|
12
|
+
|
|
13
|
+
pgstream connects to your Postgres database using the **logical replication protocol** (the same protocol Debezium and pglogical use). Whenever you INSERT, UPDATE, or DELETE a row in a watched table, pgstream decodes the WAL change into a structured `ChangeEvent` Python object and calls your async handler with it.
|
|
14
|
+
|
|
15
|
+
Your handler is where the application logic lives — typically:
|
|
16
|
+
- Embed the new row content with your model of choice
|
|
17
|
+
- Upsert the resulting vector into a vector store (pgvector, Qdrant, etc.)
|
|
18
|
+
- Delete vectors for deleted rows
|
|
19
|
+
|
|
20
|
+
pgstream handles all the low-level plumbing: slot management, keepalive ACKs, at-least-once delivery, graceful shutdown.
|
|
21
|
+
|
|
22
|
+
---
|
|
23
|
+
|
|
24
|
+
## Quick start
|
|
25
|
+
|
|
26
|
+
```python
|
|
27
|
+
import asyncio
|
|
28
|
+
from pgstream import PGStream, ChangeEvent
|
|
29
|
+
from pgstream.sinks import QdrantSink
|
|
30
|
+
|
|
31
|
+
stream = PGStream(dsn="postgresql://user:pass@localhost/db")
|
|
32
|
+
stream.watch("documents")
|
|
33
|
+
stream.sink(QdrantSink(url="http://localhost:6333", collection_name="docs"))
|
|
34
|
+
|
|
35
|
+
@stream.on_change
|
|
36
|
+
async def handle(event: ChangeEvent, sink):
|
|
37
|
+
if event.operation in ("insert", "update"):
|
|
38
|
+
vector = await my_embed_function(event.row["content"])
|
|
39
|
+
await sink.upsert(
|
|
40
|
+
id=event.row["id"],
|
|
41
|
+
vector=vector,
|
|
42
|
+
payload={"content": event.row["content"]},
|
|
43
|
+
)
|
|
44
|
+
elif event.operation == "delete":
|
|
45
|
+
await sink.delete(event.row["id"])
|
|
46
|
+
|
|
47
|
+
async def main():
|
|
48
|
+
await stream.setup() # idempotent — creates slot + publication once
|
|
49
|
+
await stream.start() # blocks; press Ctrl+C to stop
|
|
50
|
+
|
|
51
|
+
asyncio.run(main())
|
|
52
|
+
```
|
|
53
|
+
|
|
54
|
+
---
|
|
55
|
+
|
|
56
|
+
## Installation
|
|
57
|
+
|
|
58
|
+
```bash
|
|
59
|
+
# Core only (replication + event decoding)
|
|
60
|
+
pip install pgstream
|
|
61
|
+
|
|
62
|
+
# With pgvector sink
|
|
63
|
+
pip install "pgstream[pgvector]"
|
|
64
|
+
|
|
65
|
+
# With Qdrant sink
|
|
66
|
+
pip install "pgstream[qdrant]"
|
|
67
|
+
|
|
68
|
+
# Both
|
|
69
|
+
pip install "pgstream[all]"
|
|
70
|
+
```
|
|
71
|
+
|
|
72
|
+
**Requirements:**
|
|
73
|
+
- Python 3.13+
|
|
74
|
+
- Postgres 10+ with `wal_level = logical`
|
|
75
|
+
- The user in the DSN must have the `REPLICATION` privilege
|
|
76
|
+
|
|
77
|
+
---
|
|
78
|
+
|
|
79
|
+
## Architecture
|
|
80
|
+
|
|
81
|
+
```
|
|
82
|
+
┌────────────────────────────────────────────────────────────────────┐
|
|
83
|
+
│ Postgres │
|
|
84
|
+
│ │
|
|
85
|
+
│ WAL ──► pgoutput plugin ──► replication slot ──► client │
|
|
86
|
+
└────────────────────────────────────────────────────────────────────┘
|
|
87
|
+
│
|
|
88
|
+
psycopg2 replication protocol
|
|
89
|
+
│
|
|
90
|
+
┌─────────────────────────────────────────────────────────┼──────────┐
|
|
91
|
+
│ pgstream │ │
|
|
92
|
+
│ ▼ │
|
|
93
|
+
│ ┌──────────────────────────┐ ┌────────────────────────────┐ │
|
|
94
|
+
│ │ ReplicationStream │ │ PgOutputDecoder │ │
|
|
95
|
+
│ │ (background thread) │───►│ (binary wire format │ │
|
|
96
|
+
│ │ │ │ parser, pure Python) │ │
|
|
97
|
+
│ │ · psycopg2 blocking │ └────────────┬───────────────┘ │
|
|
98
|
+
│ │ replication loop │ │ ChangeEvent │
|
|
99
|
+
│ │ · read_message() │ ▼ │
|
|
100
|
+
│ │ · send_feedback (ACK) │ ┌────────────────────────────┐ │
|
|
101
|
+
│ │ · keepalive every 10s │ │ on_change handler │ │
|
|
102
|
+
│ └──────────────────────────┘ │ (user's async function) │ │
|
|
103
|
+
│ │ │ │ │
|
|
104
|
+
│ asyncio.run_coroutine_ │ runs in main event loop │ │
|
|
105
|
+
│ threadsafe() ─────────────────►│ via run_coroutine_ │ │
|
|
106
|
+
│ │ threadsafe() │ │
|
|
107
|
+
│ └────────────┬───────────────┘ │
|
|
108
|
+
│ │ │
|
|
109
|
+
│ ▼ │
|
|
110
|
+
│ ┌────────────────────────────┐ │
|
|
111
|
+
│ │ Sink │ │
|
|
112
|
+
│ │ (PgVectorSink / │ │
|
|
113
|
+
│ │ QdrantSink / custom) │ │
|
|
114
|
+
└──────────────────────────────────────────────────────────────────┘
|
|
115
|
+
```
|
|
116
|
+
|
|
117
|
+
### Two connections, two purposes
|
|
118
|
+
|
|
119
|
+
| Connection | Library | Purpose |
|
|
120
|
+
|---|---|---|
|
|
121
|
+
| Replication connection | psycopg2 | Streams WAL bytes using the logical replication protocol |
|
|
122
|
+
| Normal query connection | asyncpg (in sinks) | Regular SQL: CREATE PUBLICATION, slot management, vector writes |
|
|
123
|
+
|
|
124
|
+
**Why two libraries?** asyncpg does not implement the logical replication protocol — it is a query client only. psycopg2 is the only Python library with full replication support (`LogicalReplicationConnection`, `start_replication()`, `read_message()`, `send_feedback()`).
|
|
125
|
+
|
|
126
|
+
### Threading model
|
|
127
|
+
|
|
128
|
+
The psycopg2 replication loop is blocking. Running it directly in the asyncio event loop would stall all other coroutines. pgstream runs it in a **background daemon thread**. When an event is decoded, `asyncio.run_coroutine_threadsafe()` submits the user's async handler to the main event loop and **blocks the thread** until it completes before ACKing the LSN. This preserves at-least-once delivery.
|
|
129
|
+
|
|
130
|
+
```
|
|
131
|
+
[background thread] [main event loop]
|
|
132
|
+
│ │
|
|
133
|
+
decode WAL bytes │
|
|
134
|
+
│ │
|
|
135
|
+
run_coroutine_threadsafe ────────────►│ await handler(event, sink)
|
|
136
|
+
│ │ │
|
|
137
|
+
future.result() ◄───────────────────┤ return │
|
|
138
|
+
(blocks thread) │
|
|
139
|
+
│
|
|
140
|
+
send_feedback (ACK)
|
|
141
|
+
```
|
|
142
|
+
|
|
143
|
+
---
|
|
144
|
+
|
|
145
|
+
## Module breakdown
|
|
146
|
+
|
|
147
|
+
| File | Responsibility |
|
|
148
|
+
|---|---|
|
|
149
|
+
| `events.py` | `ChangeEvent` dataclass — the single object flowing through the pipeline |
|
|
150
|
+
| `decoder.py` | `PgOutputDecoder` — pure Python binary parser for the pgoutput protocol |
|
|
151
|
+
| `replication.py` | `SlotManager` (setup/teardown) + `ReplicationStream` (streaming loop) |
|
|
152
|
+
| `stream.py` | `PGStream` — top-level user API, threading bridge, lifecycle management |
|
|
153
|
+
| `sinks/base.py` | `Sink` abstract base class |
|
|
154
|
+
| `sinks/pgvector.py` | `PgVectorSink` — asyncpg-based pgvector reference implementation |
|
|
155
|
+
| `sinks/qdrant.py` | `QdrantSink` — qdrant-client-based Qdrant reference implementation |
|
|
156
|
+
|
|
157
|
+
---
|
|
158
|
+
|
|
159
|
+
## The `ChangeEvent` object
|
|
160
|
+
|
|
161
|
+
```python
|
|
162
|
+
@dataclass
|
|
163
|
+
class ChangeEvent:
|
|
164
|
+
operation: Literal["insert", "update", "delete", "truncate"]
|
|
165
|
+
schema: str # e.g. "public"
|
|
166
|
+
table: str # e.g. "documents"
|
|
167
|
+
row: dict[str, str | None] # new row (text-encoded values)
|
|
168
|
+
old_row: dict[str, str | None] | None # old row (only with REPLICA IDENTITY FULL)
|
|
169
|
+
lsn: str # WAL position, e.g. "0/1A3F28"
|
|
170
|
+
commit_time: datetime # UTC datetime of the transaction
|
|
171
|
+
xid: int # Postgres transaction ID
|
|
172
|
+
```
|
|
173
|
+
|
|
174
|
+
**Column values are always strings (or `None` for SQL NULL).** pgoutput sends all column data text-encoded. pgstream does not coerce types — `event.row["price"]` is `"9.99"`, not `9.99`. Cast in your handler.
|
|
175
|
+
|
|
176
|
+
---
|
|
177
|
+
|
|
178
|
+
## Delivery guarantee
|
|
179
|
+
|
|
180
|
+
pgstream provides **at-least-once delivery**:
|
|
181
|
+
- Each LSN is ACKed (via `send_feedback()`) only **after** your handler returns successfully.
|
|
182
|
+
- If your handler raises, the LSN is not ACKed. On the next restart, Postgres replays from the last confirmed position.
|
|
183
|
+
- This means your handler may be called more than once for the same event — for example, if it fails part-way through and the event is replayed after a restart. Design your sink writes to be **idempotent** (both `PgVectorSink` and `QdrantSink` use upsert semantics by default).
|
|
184
|
+
|
|
185
|
+
---
|
|
186
|
+
|
|
187
|
+
## Running the example
|
|
188
|
+
|
|
189
|
+
```bash
|
|
190
|
+
# Postgres must have wal_level = logical
|
|
191
|
+
export PGSTREAM_DSN=postgresql://user:pass@localhost:5432/db
|
|
192
|
+
|
|
193
|
+
uv run python examples/basic_watch.py
|
|
194
|
+
```
|
|
195
|
+
|
|
196
|
+
In a second terminal:
|
|
197
|
+
```sql
|
|
198
|
+
INSERT INTO documents (content) VALUES ('Hello, pgstream!');
|
|
199
|
+
UPDATE documents SET content = 'Updated' WHERE id = 1;
|
|
200
|
+
DELETE FROM documents WHERE id = 1;
|
|
201
|
+
```
|
|
202
|
+
|
|
203
|
+
---
|
|
204
|
+
|
|
205
|
+
## Running tests
|
|
206
|
+
|
|
207
|
+
```bash
|
|
208
|
+
# Unit tests only (no DB required)
|
|
209
|
+
uv run pytest tests/test_decoder.py -v
|
|
210
|
+
|
|
211
|
+
# Integration tests (requires Postgres with wal_level = logical)
|
|
212
|
+
export PGSTREAM_DSN=postgresql://user:pass@localhost:5432/db
|
|
213
|
+
uv run pytest tests/ -v
|
|
214
|
+
|
|
215
|
+
# Skip integration tests
|
|
216
|
+
uv run pytest tests/ -v -m "not integration"
|
|
217
|
+
```
|
|
218
|
+
|
|
219
|
+
---
|
|
220
|
+
|
|
221
|
+
## Implementing a custom Sink
|
|
222
|
+
|
|
223
|
+
```python
|
|
224
|
+
from pgstream.sinks import Sink
|
|
225
|
+
|
|
226
|
+
class MyPineconeSink(Sink):
|
|
227
|
+
async def upsert(self, id: str, vector: list[float], payload: dict | None = None) -> None:
|
|
228
|
+
# write to Pinecone, Weaviate, Milvus, etc.
|
|
229
|
+
...
|
|
230
|
+
|
|
231
|
+
async def delete(self, id: str) -> None:
|
|
232
|
+
...
|
|
233
|
+
|
|
234
|
+
async def close(self) -> None:
|
|
235
|
+
# optional: release HTTP sessions, pools, etc.
|
|
236
|
+
...
|
|
237
|
+
```
|
|
238
|
+
|
|
239
|
+
---
|
|
240
|
+
|
|
241
|
+
## Postgres setup
|
|
242
|
+
|
|
243
|
+
```sql
|
|
244
|
+
-- 1. Enable logical replication (a postgresql.conf setting, not a SQL statement; restart Postgres after changing it)
|
|
245
|
+
wal_level = logical
|
|
246
|
+
|
|
247
|
+
-- 2. Grant replication privilege to your user
|
|
248
|
+
ALTER USER myuser REPLICATION;
|
|
249
|
+
|
|
250
|
+
-- 3. pgstream handles the rest (CREATE PUBLICATION, slot creation) via setup()
|
|
251
|
+
```
|
|
252
|
+
|
|
253
|
+
---
|
|
254
|
+
|
|
255
|
+
## Key decisions and what I learned
|
|
256
|
+
|
|
257
|
+
### Why psycopg2 for replication?
|
|
258
|
+
|
|
259
|
+
asyncpg is widely used for async Postgres in Python, but it only implements the regular query protocol. The logical replication protocol requires a separate connection type (`LogicalReplicationConnection`) and special commands (`start_replication`, `read_message`, `send_feedback`). Only psycopg2 exposes these in Python. psycopg3 does not yet have a stable public API for replication. This was a surprising discovery that shaped the entire architecture.
|
|
260
|
+
|
|
261
|
+
### Why a background thread?
|
|
262
|
+
|
|
263
|
+
The psycopg2 replication loop is fundamentally blocking — `select()` and `read_message()` block the calling thread. Running this in the asyncio event loop would freeze all coroutines. The solution is a daemon thread for the replication loop, with `asyncio.run_coroutine_threadsafe()` to bridge back to the event loop for the user's handler. The thread blocks on `future.result()` until the handler completes, preserving the at-least-once delivery guarantee.
|
|
264
|
+
|
|
265
|
+
### Why text encoding?
|
|
266
|
+
|
|
267
|
+
pgoutput protocol v1 sends column values as text strings (e.g. `"42"` for an integer column). pgstream deliberately does not coerce types — doing so would require knowing the Postgres OID → Python type mapping, which is complex (timezone-aware datetimes, Decimals, custom enum types, arrays, JSONB...). Keeping values as strings keeps the library simple and lets the user decide on casting.
|
|
268
|
+
|
|
269
|
+
### The Relation message cache
|
|
270
|
+
|
|
271
|
+
Before any INSERT/UPDATE/DELETE, Postgres sends a Relation (R) message with the table's OID and column schema. This must be cached — DML messages only contain the OID, not column names. If the cache is missing an OID (edge case: consumer joined mid-stream), we emit a warning and skip the event rather than crashing. The cache is per-connection (OIDs are session-stable).
|
|
272
|
+
|
|
273
|
+
### ACK timing
|
|
274
|
+
|
|
275
|
+
`send_feedback(flush_lsn=...)` tells Postgres "I have processed everything up to this LSN, you can discard WAL". We ACK **after** the handler returns, never before. This is the crucial point for at-least-once delivery. We also send keepalive feedback every 10 seconds even when idle — Postgres will kill the replication connection after `wal_sender_timeout` (default 60s) if it hears nothing.
|
|
276
|
+
|
|
277
|
+
### Idempotent setup
|
|
278
|
+
|
|
279
|
+
`setup()` checks `pg_replication_slots` and `pg_publication` before creating anything. It is safe to call on every startup. This simplifies deployment — you don't need a migration step to create the slot; just call `setup()` at boot.
|
|
@@ -0,0 +1,41 @@
|
|
|
1
|
+
[project]
|
|
2
|
+
name = "pgstream"
|
|
3
|
+
version = "0.1.0"
|
|
4
|
+
description = "Production-grade Python SDK for Postgres CDC: watch tables via logical replication and sync changes to vector stores (pgvector, Qdrant)."
|
|
5
|
+
readme = "README.md"
|
|
6
|
+
authors = [
|
|
7
|
+
{ name = "Kamalesh-Kavin", email = "kamalesh.s7316@gmail.com" }
|
|
8
|
+
]
|
|
9
|
+
requires-python = ">=3.13"
|
|
10
|
+
dependencies = [
|
|
11
|
+
"psycopg2-binary>=2.9.11",
|
|
12
|
+
]
|
|
13
|
+
|
|
14
|
+
[project.optional-dependencies]
|
|
15
|
+
pgvector = [
|
|
16
|
+
"asyncpg>=0.30.0",
|
|
17
|
+
]
|
|
18
|
+
qdrant = [
|
|
19
|
+
"qdrant-client>=1.14.0",
|
|
20
|
+
]
|
|
21
|
+
all = [
|
|
22
|
+
"asyncpg>=0.30.0",
|
|
23
|
+
"qdrant-client>=1.14.0",
|
|
24
|
+
]
|
|
25
|
+
|
|
26
|
+
[build-system]
|
|
27
|
+
requires = ["uv_build>=0.10.10,<0.11.0"]
|
|
28
|
+
build-backend = "uv_build"
|
|
29
|
+
|
|
30
|
+
[dependency-groups]
|
|
31
|
+
dev = [
|
|
32
|
+
"pytest>=9.0.2",
|
|
33
|
+
"pytest-asyncio>=1.3.0",
|
|
34
|
+
"asyncpg>=0.30.0",
|
|
35
|
+
]
|
|
36
|
+
|
|
37
|
+
[tool.pytest.ini_options]
|
|
38
|
+
asyncio_mode = "auto"
|
|
39
|
+
markers = [
|
|
40
|
+
"integration: marks tests that require a live Postgres instance (deselect with -m 'not integration')",
|
|
41
|
+
]
|