sql-code-graph 1.2.2__py3-none-any.whl → 1.4.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
sqlcg/server/server.py CHANGED
@@ -79,37 +79,51 @@ async def _control_socket_task(
79
79
  stop_event: "anyio.Event",
80
80
  backend_lock: "anyio.Lock",
81
81
  start_time: float,
82
+ writer_queue: "WriterQueue",
82
83
  ) -> None:
83
84
  """Accept control connections on ``<db>.sock`` and dispatch ops.
84
85
 
85
86
  Supported ops:
86
87
 
87
- - ``{"op": "status"}`` → running state, pid, db_path, freshness, uptime.
88
- Unframed (legacy single-recv protocol).
88
+ - ``{"op": "status"}`` → running state, pid, db_path, freshness, uptime,
89
+ writer_queue block. **Length-prefixed framing** (v1.3.0, B3): the
90
+ response uses ``<decimal-byte-length>\\n<json-body>`` so large queue
91
+ payloads are read in full by the recv-exactly client.
89
92
  - ``{"op": "stop"}`` → sends ``{"ok": true}`` then signals stop via
90
- *stop_event*. Unframed.
91
- - ``{"op": "reindex", "root", "from", "to", "dialect"}`` → runs
92
- ``Indexer.resync_changed`` off the event-loop thread via
93
- ``anyio.to_thread.run_sync``, serialised behind *backend_lock* (R1, R2).
94
- Unframed.
93
+ *stop_event*. Unframed (mcp_stop uses s.recv(128) — do NOT change).
94
+ - ``{"op": "index", "root", "dialect", "wait"}`` → enqueues a full index
95
+ onto *writer_queue* (rule 1 supersedes all pending). Supports
96
+ ``wait=true`` (stream progress frames + terminal ``done:true``) and
97
+ ``wait=false`` (immediate ``{ok, queued, position}``).
98
+ - ``{"op": "reindex", "root", "from", "to", "dialect", "wait"}`` →
99
+ enqueues an incremental resync (coalescing rules 2–3). ``from`` may be
100
+ ``null``/omitted to resolve at drain start (W3). Same ``wait`` semantics
101
+ as ``index``. The handler enqueues only — it never touches the backend
102
+ (B1 invariant: only the drain task resolves a backend, under backend_lock).
95
103
  - ``{"op": "query", "cypher": ..., "params": ...}`` → executes a
96
- read-only Cypher query on the single backend connection, serialised
97
- behind *backend_lock*. **Length-prefixed framing** (v1.2.0):
104
+ read-only SQL query on the single backend connection, serialised
105
+ behind *backend_lock*. (The ``cypher`` field name is a legacy wire-key
106
+ retained for protocol compatibility; the value is SQL.)
107
+ **Length-prefixed framing** (v1.2.0):
98
108
  ``<decimal-byte-length>\\n<json-body>`` on both request and response.
99
109
 
100
- Framing protocol (v1.2.0, ``query`` op only):
101
- Request: ``b"<len>\\n" + json_body`` — server detects by sniffing the
102
- first line; a bare decimal integer means framed. Unframed requests always
103
- start with ``{`` (never a digit), so the sniff is unambiguous.
104
- Response: same ``<len>\\n<body>`` format for framed requests; unframed
105
- response for unframed requests. Old clients that use the unframed
106
- ``s.recv(65536)`` + ``json.loads`` pattern will get a loud
107
- ``json.JSONDecodeError`` if they accidentally receive a framed response —
108
- NOT silent truncation. Only the new ``read_client`` sends framed
109
- requests, so this does not affect existing callers.
110
+ Framing protocol:
111
+ Requests: a bare decimal integer on the first line → framed. Unframed
112
+ JSON always starts with ``{``, so the sniff is unambiguous.
113
+ Responses: framed (``<len>\\n<body>``) for ``query`` and ``status``;
114
+ unframed for ``stop``/``reindex``/``index`` (unless ``wait=true`` which
115
+ uses the multi-frame streaming protocol).
116
+
117
+ Multi-frame streaming protocol (``index``/``reindex`` with ``wait=true``):
118
+ The server sends a sequence of length-prefixed frames on the same
119
+ connection. Progress frames carry ``{done: false, files_done, files_total}``.
120
+ The terminal frame carries ``{ok: true, done: true, summary: {...}}`` on
121
+ success or ``{ok: false, done: true, error: ...}`` on failure (W7).
122
+ The client reads frames in a loop and stops when it sees ``done == true``
123
+ — it does NOT rely on EOF as the terminator.
110
124
 
111
125
  R2 (single connection): all backend operations go through ``backend_lock``
112
- so concurrent calls never touch the single Kuzu connection simultaneously.
126
+ so concurrent calls never touch the single DuckDB connection simultaneously.
113
127
 
114
128
  R8 teardown ordering: the caller must cancel this task BEFORE calling
115
129
  ``shutdown_backend()``. This is guaranteed by the ``anyio.CancelScope``
@@ -126,18 +140,19 @@ async def _control_socket_task(
126
140
  from anyio.streams.buffered import BufferedByteReceiveStream
127
141
 
128
142
  from sqlcg.core.config import get_db_path as _get_db_path
143
+ from sqlcg.server.writer import WriterRequest
129
144
 
130
- # Read-only keyword allow-list for the ``query`` op. Only these leading
131
- # keywords are permitted — anything that starts with a write keyword is
132
- # rejected before execution. This is a guard against accidental mutation,
145
+ # Read-only keyword allow-list for the ``query`` op. Only SELECT, WITH
146
+ # (CTE preamble), VALUES and TABLE are permitted — anything that starts with a write keyword
147
+ # is rejected before execution. This is a guard against accidental mutation,
133
148
  # not a security boundary (the socket is already 0o600 / owner-only).
134
- _QUERY_ALLOWED_KEYWORDS = frozenset({"MATCH", "RETURN", "WITH", "CALL", "UNWIND", "OPTIONAL"})
149
+ _QUERY_ALLOWED_KEYWORDS = frozenset({"SELECT", "WITH", "VALUES", "TABLE"})
135
150
 
136
- def _is_read_only_cypher(cypher: str) -> bool:
151
+ def _is_read_only_sql(sql: str) -> bool:
137
152
  """Return True iff the leading keyword is in the read-only allow-list."""
138
153
  import re
139
154
 
140
- m = re.match(r"\s*(?:--[^\n]*)?\s*(\w+)", cypher, re.IGNORECASE)
155
+ m = re.match(r"\s*(?:--[^\n]*)?\s*(\w+)", sql, re.IGNORECASE)
141
156
  if not m:
142
157
  return False
143
158
  return m.group(1).upper() in _QUERY_ALLOWED_KEYWORDS
@@ -192,7 +207,7 @@ async def _control_socket_task(
192
207
  if indexed_sha is not None:
193
208
  try:
194
209
  rows = db.run_read(
195
- "MATCH (r:Repo) RETURN r.path AS path LIMIT 1",
210
+ 'SELECT path FROM "Repo" LIMIT 1',
196
211
  {},
197
212
  )
198
213
  if rows:
@@ -211,7 +226,13 @@ async def _control_socket_task(
211
226
  "stale_by_commits": stale,
212
227
  "connected_clients": 1, # stdio transport = 1 by design
213
228
  "uptime": time.time() - start_time,
229
+ "writer_queue": writer_queue.coalesce_view(),
214
230
  }
231
+ # status response is framed (B3, v1.3.0) — same framing as query
232
+ # so recv-exactly clients read it in full regardless of payload size.
233
+ resp_bytes = json.dumps(resp).encode()
234
+ await stream.send(f"{len(resp_bytes)}\n".encode() + resp_bytes)
235
+ return
215
236
 
216
237
  elif op == "stop":
217
238
  resp = {"ok": True}
@@ -222,42 +243,99 @@ async def _control_socket_task(
222
243
  stop_event.set()
223
244
  return
224
245
 
246
+ elif op == "index":
247
+ # Step 3.1 — enqueue a full index; never touches the backend here (B1).
248
+ root = req.get("root")
249
+ dialect = req.get("dialect")
250
+ wait = req.get("wait", False)
251
+ requested_by = req.get("requested_by", "cli")
252
+ if not root:
253
+ resp = {"error": "index op requires root"}
254
+ await stream.send(json.dumps(resp).encode() + b"\n")
255
+ return
256
+
257
+ writer_req = WriterRequest(
258
+ op="index",
259
+ root=root,
260
+ dialect=dialect,
261
+ from_sha=None,
262
+ to_sha=None,
263
+ requested_by=requested_by,
264
+ )
265
+
266
+ if wait:
267
+ # Attach-and-wait: register a memory channel then stream frames.
268
+ send_ch, recv_ch = anyio.create_memory_object_stream(max_buffer_size=64)
269
+ writer_req._waiters.append(send_ch)
270
+ position = await writer_queue.enqueue(writer_req)
271
+ # Send the queued acknowledgement frame first.
272
+ queued_frame = json.dumps(
273
+ {"ok": True, "done": False, "queued": True, "position": position}
274
+ ).encode()
275
+ await stream.send(f"{len(queued_frame)}\n".encode() + queued_frame)
276
+ # Stream progress frames until done:true terminal frame.
277
+ async with recv_ch:
278
+ async for terminal in recv_ch:
279
+ frame_bytes = json.dumps(terminal).encode()
280
+ await stream.send(f"{len(frame_bytes)}\n".encode() + frame_bytes)
281
+ if terminal.get("done"):
282
+ break
283
+ else:
284
+ position = await writer_queue.enqueue(writer_req)
285
+ resp = {"ok": True, "queued": True, "position": position}
286
+ await stream.send(json.dumps(resp).encode() + b"\n")
287
+ return
288
+
225
289
  elif op == "reindex":
290
+ # Step 2.3 (B1) — enqueue; the drain is the only backend consumer.
291
+ # The handler NEVER calls backend_ref() (B1 invariant).
226
292
  root = req.get("root")
227
- from_sha = req.get("from")
293
+ from_sha = req.get("from") # may be None (W3 — server resolves at drain)
228
294
  to_sha = req.get("to")
229
295
  dialect = req.get("dialect")
230
- if not root or not from_sha or not to_sha:
231
- resp = {"error": "reindex op requires root, from, to"}
296
+ wait = req.get("wait", False)
297
+ requested_by = req.get("requested_by", "cli")
298
+ if not root:
299
+ resp = {"error": "reindex op requires root"}
300
+ await stream.send(json.dumps(resp).encode() + b"\n")
301
+ return
302
+
303
+ writer_req = WriterRequest(
304
+ op="reindex",
305
+ root=root,
306
+ dialect=dialect,
307
+ from_sha=from_sha,
308
+ to_sha=to_sha,
309
+ requested_by=requested_by,
310
+ )
311
+
312
+ if wait:
313
+ send_ch, recv_ch = anyio.create_memory_object_stream(max_buffer_size=64)
314
+ writer_req._waiters.append(send_ch)
315
+ position = await writer_queue.enqueue(writer_req)
316
+ queued_frame = json.dumps(
317
+ {"ok": True, "done": False, "queued": True, "position": position}
318
+ ).encode()
319
+ await stream.send(f"{len(queued_frame)}\n".encode() + queued_frame)
320
+ async with recv_ch:
321
+ async for terminal in recv_ch:
322
+ frame_bytes = json.dumps(terminal).encode()
323
+ await stream.send(f"{len(frame_bytes)}\n".encode() + frame_bytes)
324
+ if terminal.get("done"):
325
+ break
232
326
  else:
233
- from sqlcg.indexer.indexer import Indexer
234
-
235
- db = backend_ref()
236
- if db is None:
237
- resp = {"error": "backend not available"}
238
- else:
239
- indexer = Indexer()
240
-
241
- def _do_reindex() -> dict:
242
- return indexer.resync_changed(
243
- _Path(root),
244
- from_sha,
245
- to_sha,
246
- db,
247
- dialect,
248
- )
249
-
250
- async with backend_lock:
251
- # R1: run off event-loop thread; R2: lock serialises
252
- summary = await _to_thread.run_sync(_do_reindex)
253
- resp = {"ok": True, "summary": summary}
327
+ position = await writer_queue.enqueue(writer_req)
328
+ resp = {"ok": True, "queued": True, "position": position}
329
+ await stream.send(json.dumps(resp).encode() + b"\n")
330
+ return
254
331
 
255
332
  elif op == "query":
256
- # Framed op (v1.2.0): read-only Cypher query over the socket.
333
+ # Framed op (v1.2.0): read-only SQL query over the socket.
257
334
  # Must only be called with a framed request (sniff above sets framed=True).
258
- cypher = req.get("cypher", "")
335
+ # Accept both "cypher" (legacy field name) and "sql" keys.
336
+ sql = req.get("sql") or req.get("cypher", "")
259
337
  params = req.get("params") or {}
260
- if not _is_read_only_cypher(cypher):
338
+ if not _is_read_only_sql(sql):
261
339
  resp = {"error": "query op is read-only"}
262
340
  else:
263
341
  db = backend_ref()
@@ -266,11 +344,11 @@ async def _control_socket_task(
266
344
  else:
267
345
 
268
346
  def _do_query() -> list:
269
- return db.run_read(cypher, params)
347
+ return db.run_read(sql, params)
270
348
 
271
349
  async with backend_lock:
272
350
  # R1: run off event-loop thread; R2: lock serialises
273
- # reads and writes on the single Kuzu connection.
351
+ # reads and writes on the single DuckDB connection.
274
352
  rows = await _to_thread.run_sync(_do_query)
275
353
  resp = {"ok": True, "rows": rows}
276
354
 
@@ -297,36 +375,37 @@ async def _control_socket_task(
297
375
  async def _stop_watcher(
298
376
  stop_event: "anyio.Event",
299
377
  db_path: "Path",
378
+ backend_lock: "anyio.Lock",
379
+ shutdown_requested: "anyio.Event",
300
380
  ) -> None:
301
381
  """Wait for stop_event then perform graceful shutdown.
302
382
 
303
- When the control socket ``stop`` op fires ``stop_event``, this task:
304
- 1. Shuts down the backend (flush pending writes).
305
- 2. Removes the control files (.sock, .pid).
306
- 3. Calls ``os._exit(0)`` to terminate the process immediately.
307
-
308
- We use ``os._exit(0)`` rather than a cancel-scope approach because the
309
- MCP ``stdio_server`` runs stdin readline in a thread via
310
- ``anyio.to_thread.run_sync`` with ``abandon_on_cancel=False`` (the
311
- default). When stdin is a pipe, the read-end cannot be closed from within
312
- the subprocess to interrupt the blocking readline — the parent holds the
313
- write end open. ``os._exit`` bypasses the thread-drain wait entirely and
314
- exits the process without calling ``atexit`` handlers or running
315
- ``finally`` blocks in other tasks.
316
-
317
- R8 ordering: backend is shut down HERE (before os._exit), not in main().
318
- The ``finally`` block in ``main()`` will also try to shutdown/cleanup but
319
- ``os._exit`` prevents it from running — so we do it explicitly here.
383
+ Shutdown ordering:
384
+ 1. Set shutdown_requested so the drain loop exits cleanly after its
385
+ current drain completes (no new drains start once this is set).
386
+ 2. Acquire backend_lock — this waits until any active drain has finished
387
+ (committed its transaction).
388
+ 3. Call shutdown_backend() under the lock.
389
+ 4. Release backend_lock.
390
+ 5. Remove control files.
391
+ 6. Call os._exit(0).
392
+
393
+ We use ``os._exit(0)`` because the MCP ``stdio_server`` blocks on a pipe
394
+ read (``anyio.to_thread.run_sync`` with ``abandon_on_cancel=False``).
395
+ We cannot interrupt it without killing the process.
320
396
  """
321
397
  import sqlcg.server.tools as _tools
322
398
  from sqlcg.server.control import cleanup_control_files
323
399
 
324
400
  await stop_event.wait()
325
- # Graceful teardown before hard exit
326
- try:
327
- _tools.shutdown_backend()
328
- except Exception:
329
- pass
401
+ # Signal drain loop to stop after current drain completes.
402
+ shutdown_requested.set()
403
+ # Wait for any active drain to finish before closing the backend.
404
+ async with backend_lock:
405
+ try:
406
+ _tools.shutdown_backend()
407
+ except Exception:
408
+ pass
330
409
  try:
331
410
  cleanup_control_files(db_path)
332
411
  except Exception:
@@ -355,9 +434,10 @@ async def _sigterm_watcher(
355
434
  async def _run_with_control(db_path: "Path", start_time: float) -> None:
356
435
  """Run the stdio MCP loop and the control-socket task in a shared TaskGroup.
357
436
 
358
- Stop mechanism (R8 teardown ordering):
437
+ Stop mechanism (B2 teardown ordering):
359
438
  - Control socket ``stop`` op → ``stop_event.set()`` → ``_stop_watcher``
360
- shuts down backend + removes control files + calls ``os._exit(0)``.
439
+ sets shutdown_requested, acquires backend_lock (waits for active drain),
440
+ shuts down backend, removes control files, calls ``os._exit(0)``.
361
441
  - External SIGTERM → ``_sigterm_watcher`` → same path via ``stop_event``.
362
442
  - Normal EOF on stdin (editor closes connection) → stdio loop returns →
363
443
  ``tg.cancel_scope.cancel()`` → tasks cancelled → ``main()`` finally
@@ -368,19 +448,35 @@ async def _run_with_control(db_path: "Path", start_time: float) -> None:
368
448
  with ``abandon_on_cancel=False``). We cannot interrupt it without
369
449
  killing the process; ``_stop_watcher`` does cleanup first.
370
450
 
371
- ``backend_lock`` is created once here and passed into
372
- ``_control_socket_task`` so concurrent control ops (reindex, query) are
373
- serialised behind a single lock on the Kuzu connection (R2).
451
+ ``backend_lock`` is created once here and passed into both
452
+ ``_control_socket_task`` and the ``drain_loop`` task so that:
453
+ - concurrent control ops (reindex, query) are serialised (R2), and
454
+ - _stop_watcher can acquire the lock to wait for an active drain (B2).
374
455
  """
375
456
  import anyio
376
457
 
377
458
  import sqlcg.server.tools as _tools
459
+ from sqlcg.server.writer import WriterQueue, drain_loop
378
460
 
379
461
  stop_event = anyio.Event()
380
- backend_lock = anyio.Lock() # R2: serialise all backend ops (Kuzu not thread-safe)
462
+ shutdown_requested = anyio.Event()
463
+ backend_lock = anyio.Lock() # R2 + B2: serialise all backend ops
464
+
465
+ # Inject metrics into the queue so coalesce/drain events are persisted.
466
+ writer_queue = WriterQueue(metrics=_tools._metrics)
467
+
468
+ db_path_str = str(db_path)
381
469
 
382
470
  async with anyio.create_task_group() as tg:
383
471
  if sys.platform != "win32":
472
+ # Drain task: consumes WriterQueue; sole backend consumer (B1).
473
+ tg.start_soon(
474
+ drain_loop,
475
+ writer_queue,
476
+ db_path_str,
477
+ backend_lock,
478
+ shutdown_requested,
479
+ )
384
480
  # Spawn control socket alongside the stdio loop.
385
481
  tg.start_soon(
386
482
  _control_socket_task,
@@ -389,9 +485,10 @@ async def _run_with_control(db_path: "Path", start_time: float) -> None:
389
485
  stop_event,
390
486
  backend_lock,
391
487
  start_time,
488
+ writer_queue,
392
489
  )
393
490
  # Watch stop_event; shuts down and calls os._exit(0).
394
- tg.start_soon(_stop_watcher, stop_event, db_path)
491
+ tg.start_soon(_stop_watcher, stop_event, db_path, backend_lock, shutdown_requested)
395
492
  # Watch for SIGTERM; fires stop_event for same clean path.
396
493
  tg.start_soon(_sigterm_watcher, stop_event)
397
494
 
@@ -404,7 +501,7 @@ def main(db_path: str | None = None) -> None:
404
501
  """Start the MCP server.
405
502
 
406
503
  Args:
407
- db_path: Path to KùzuDB database. If None, uses SQLCG_DB_PATH env var
504
+ db_path: Path to DuckDB database. If None, uses SQLCG_DB_PATH env var
408
505
  or ~/.sqlcg/graph.db (via get_db_path in tools module).
409
506
  """
410
507
  import time
@@ -457,5 +554,6 @@ if TYPE_CHECKING:
457
554
  import anyio
458
555
 
459
556
  from sqlcg.core.graph_db import GraphBackend
557
+ from sqlcg.server.writer import WriterQueue
460
558
 
461
559
  from sqlcg.server.control import sock_path # noqa: E402 (used in _control_socket_task)
sqlcg/server/skill.py CHANGED
@@ -82,7 +82,7 @@ _WORKFLOWS = """\
82
82
  TOOL_RETURN_MODELS: dict[str, type[BaseModel]] = {
83
83
  # Operational tools — return plain dict, no Judgement
84
84
  "index_repo": BaseModel,
85
- "execute_cypher": BaseModel,
85
+ "execute_sql": BaseModel,
86
86
  "submit_feedback": BaseModel,
87
87
  # Lineage / dependency facts
88
88
  "trace_column_lineage": LineageResult,
@@ -109,7 +109,7 @@ TOOL_RETURN_MODELS: dict[str, type[BaseModel]] = {
109
109
 
110
110
  _TOOL_PURPOSE: dict[str, str] = {
111
111
  "index_repo": "Index a SQL repository into the graph",
112
- "execute_cypher": "Execute a read-only Cypher query against the graph",
112
+ "execute_sql": "Execute a read-only SQL query against the graph (DuckDB)",
113
113
  "submit_feedback": "Submit feedback (TP/FP/FN) on a tool result",
114
114
  "trace_column_lineage": "Trace upstream column lineage to its source",
115
115
  "find_table_usages": "Find all queries that consume a given table",