sql-code-graph 1.1.3__py3-none-any.whl → 1.2.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -22,17 +22,33 @@ class CrossFileAggregator:
22
22
  # Maps lowercased table name (bare name) -> exp.Select body for CTAS statements.
23
23
  # Populated during register_pass1 and used to seed sources_map in pass 2.
24
24
  self.cross_file_sources: dict[str, Any] = {}
25
+ # #44 canonical-name index: bare name (lowercased) -> sole DDL-defined full_id.
26
+ # Built from DDL-defined tables only (defined_tables, not CTAS bodies).
27
+ # Used by _build_file_rows to rewrite an unqualified INSERT-target to the
28
+ # canonical full_id so INSERT-target nodes share identity with the DDL node.
29
+ self.canonical_by_bare: dict[str, str] = {}
30
+ # Bare names defined by >1 schema — do NOT rewrite (ambiguous).
31
+ self._ambiguous_bare: set[str] = set()
25
32
 
26
33
  def register_pass1(self, parsed: ParsedFile) -> None:
27
34
  """Register a pass-1 result and build view/table source map.
28
35
 
29
- Also harvests CTAS bodies from statements for cross-file temp-table resolution.
36
+ Also harvests CTAS bodies from statements for cross-file temp-table resolution,
37
+ and builds the bare-name → canonical-full_id index (#44) from DDL tables.
30
38
 
31
39
  Args:
32
40
  parsed: ParsedFile from pass 1
33
41
  """
34
42
  for table in parsed.defined_tables:
35
43
  self.sources[table.full_id] = parsed
44
+ # #44: build canonical_by_bare index from DDL-defined tables
45
+ bare = (table.name or "").lower()
46
+ if bare:
47
+ if bare in self.canonical_by_bare and self.canonical_by_bare[bare] != table.full_id:
48
+ # Same bare name defined in multiple schemas → ambiguous, never rewrite
49
+ self._ambiguous_bare.add(bare)
50
+ else:
51
+ self.canonical_by_bare[bare] = table.full_id
36
52
 
37
53
  # Harvest CTAS bodies from statements for cross-file resolution.
38
54
  # Key convention matches AnsiParser.parse_file line 109: lowercased bare name.
@@ -68,47 +84,3 @@ class CrossFileAggregator:
68
84
  if bare and bare in self.cross_file_sources and bare not in same_file_bare_names:
69
85
  return True
70
86
  return False
71
-
72
- def resolve_pass2(self, parser, parsed: ParsedFile) -> ParsedFile:
73
- """Re-parse with cross-file schema context.
74
-
75
- Args:
76
- parser: SqlParser instance
77
- parsed: ParsedFile from pass 1
78
-
79
- Returns:
80
- ParsedFile from pass 2 with resolved cross-file references,
81
- or the pass-1 result if skip predicate determines no re-parse is needed
82
- or if the file cannot be re-read.
83
-
84
- Raises:
85
- No exceptions are raised; file read errors are logged as WARNING
86
- and the pass-1 result is returned unchanged.
87
-
88
- Note:
89
- This method returns the exact same ParsedFile object (via `return parsed`)
90
- on the skip path. These identity semantics are used by callers to track
91
- which files were skipped (resolved is parsed). Do not introduce a .copy()
92
- on the skip path — that would break the identity check.
93
- """
94
- # Register view sources for schema resolution
95
- parser._schema.add_view_sources(self.sources)
96
-
97
- if not self._needs_pass2(parsed):
98
- # File has no cross-file dependencies — pass-1 result is already final.
99
- return parsed
100
-
101
- try:
102
- sql = parsed.path.read_text(encoding="utf-8")
103
- except (FileNotFoundError, OSError) as exc:
104
- logger.warning(
105
- "resolve_pass2: cannot re-read %s (%s) — returning pass-1 result",
106
- parsed.path,
107
- exc,
108
- )
109
- return parsed
110
-
111
- # Filter cross-file CTAS bodies to what this file actually references —
112
- # keeps exp.expand bounded by referenced_tables, not by corpus size.
113
- ref_names = {(t.name or "").lower() for t in parsed.referenced_tables if t.name}
114
- return parser.parse_file(parsed.path, sql, dependency_filter=ref_names)
@@ -85,8 +85,8 @@ class AnsiParser(SqlParser):
85
85
  the cross-file sources seeded into `sources_map` are filtered to only those
86
86
  whose name is in the set. Pass-1 callers (and direct test callers) pass
87
87
  `None` to disable filtering; pass-2 callers
88
- (`CrossFileAggregator.resolve_pass2`) compute this from the pass-1
89
- `ParsedFile.referenced_tables`.
88
+ (the `index_repo` pass-2 dispatch in `indexer.py`) compute this from
89
+ the pass-1 `ParsedFile.referenced_tables`.
90
90
  _precomputed_start_lines: optional list of 1-based start lines, one per
91
91
  statement. When provided (e.g. by SnowflakeParser which computes the map
92
92
  from the preprocessed SQL after ``_preprocess_snowflake_sql`` — which
@@ -0,0 +1,192 @@
1
+ """Client helper for routing CLI read commands through the live MCP server.
2
+
3
+ When a server is live on the target DB, CLI read commands route their
4
+ ``run_read(cypher, params)`` calls over the Unix control socket instead of
5
+ opening the DB directly. This avoids "Database is locked" errors when the
6
+ server holds KuzuDB's process-level write lock.
7
+
8
+ With no server running (``query_via_server`` returns ``None``), the fallback
9
+ opens the DB with ``get_backend(read_only=True)`` — zero-config small-repo
10
+ invariant preserved.
11
+
12
+ Framing protocol (v1.2.0):
13
+ Request: ``<decimal-byte-length>\\n<json-body>``
14
+ Response: ``<decimal-byte-length>\\n<json-body>``
15
+ Only the ``query`` op uses this framing; legacy ops (status/stop/reindex)
16
+ keep their unframed ``{...}\\n`` protocol.
17
+
18
+ Client receive strategy: after sending the framed request, read the length
19
+ line with ``f.readline()`` (blocking; returns a partial line only at EOF) then
20
+ read exactly that many bytes with ``f.read(n)``. This is the recv-exactly
21
+ pattern required by BLOCKER 2 — a single ``s.recv(65536)`` would silently
22
+ truncate large result sets. Do NOT copy reindex.py's single-recv pattern
23
+ here.
24
+
25
+ Server-busy behaviour (v1.1.0 F1 parity):
26
+ If the server is alive but the lock is held (timeout waiting for the
27
+ response), raise ``typer.Exit`` — a plain ``Exception`` subclass, NOT
28
+ ``SystemExit`` / ``BaseException``. This ensures gain.py's
29
+ ``except Exception: pass`` handler catches it and degrades gracefully
30
+ (skips the parse-quality section) instead of crashing. Other read
31
+ commands let the ``typer.Exit`` propagate to a clean non-zero CLI exit.
32
+ Do NOT fall back to a direct open on timeout — the server is alive and
33
+ holds the lock, so falling back would reproduce the "Database is locked"
34
+ error (mirrors the F1 fix in reindex.py:127–142).
35
+ """
36
+
37
+ from __future__ import annotations
38
+
39
+ import json
40
+ import socket as _socket
41
+ import sys
42
+ from pathlib import Path
43
+
44
+ import typer
45
+
46
+ # Client-side socket timeout for the query control-socket path.
47
+ # Sized to cover the longest in-flight reindex (~89 s DWH resync_changed)
48
+ # with headroom. This is a CLI transport constant, NOT a KuzuConfig value —
49
+ # same convention as _NOTIFY_SOCKET_TIMEOUT_S in reindex.py.
50
+ _QUERY_SOCKET_TIMEOUT_S = 300
51
+
52
+
53
+ def query_via_server(
54
+ cypher: str,
55
+ params: dict,
56
+ db_path: Path | None = None,
57
+ timeout_s: float = _QUERY_SOCKET_TIMEOUT_S,
58
+ ) -> list[dict] | None:
59
+ """Send a read query over the control socket.
60
+
61
+ Uses length-prefixed framing (v1.2.0): ``<len>\\n<json-body>`` for both
62
+ request and response. Reads the response with ``makefile`` + ``readline``
63
+ + ``read(n)`` — NOT a single ``recv`` — so arbitrarily large result sets
64
+ are returned in full without truncation (BLOCKER 2).
65
+
66
+ Args:
67
+ cypher: Cypher query string (must be read-only; server enforces).
68
+ params: Query parameter dict.
69
+ db_path: Explicit database path. Defaults to ``get_db_path()``.
70
+ timeout_s: Socket timeout in seconds. On timeout the server is alive
71
+ and holds the lock — raises ``typer.Exit``, does NOT fall back to
72
+ a direct open (which would reproduce the lock error).
73
+
74
+ Returns:
75
+ Row list (list[dict]) on success.
76
+ None when NO server is live (caller should fall back to direct open).
77
+
78
+ Raises:
79
+ typer.Exit: Server is alive but busy (timeout waiting for response).
80
+ Exception-derived, NOT SystemExit — caught by gain.py's
81
+ ``except Exception: pass`` so parse-quality section degrades
82
+ gracefully (WARNING 3).
83
+ typer.Exit: Server returned ``{"error": ...}`` response.
84
+ """
85
+ from sqlcg.server.control import sock_path
86
+
87
+ if sys.platform == "win32":
88
+ # No Unix domain socket on Windows — fall through to direct open.
89
+ return None
90
+
91
+ sp = sock_path(db_path)
92
+ if not sp.exists():
93
+ return None
94
+
95
+ req = {"op": "query", "cypher": cypher, "params": params}
96
+ req_bytes = json.dumps(req).encode()
97
+ frame = f"{len(req_bytes)}\n".encode() + req_bytes
98
+
99
+ try:
100
+ with _socket.socket(_socket.AF_UNIX, _socket.SOCK_STREAM) as s:
101
+ s.settimeout(timeout_s)
102
+ s.connect(str(sp))
103
+ s.sendall(frame)
104
+
105
+ # Recv-exactly via makefile:
106
+ # - f.readline() reads the length line (``<int>\n``) — will not
107
+ # return a partial line (except at EOF) because makefile buffers internally.
108
+ # - f.read(n) reads exactly n bytes (fewer only at EOF) — accumulates until complete.
109
+ # A single s.recv(65536) would silently truncate large bodies
110
+ # (BLOCKER 2 guard: this is the recv-exactly implementation).
111
+ f = s.makefile("rb")
112
+ length_line = f.readline()
113
+ if not length_line:
114
+ return None # server closed connection unexpectedly
115
+ try:
116
+ body_len = int(length_line.strip())
117
+ except ValueError:
118
+ # Server sent an unframed response — protocol mismatch.
119
+ return None
120
+ body = f.read(body_len)
121
+
122
+ except TimeoutError:
123
+ # Server is alive and holding the lock. Do NOT fall back to a direct
124
+ # open — that would hit the held lock and produce "Database is locked"
125
+ # (mirrors v1.1.0 F1 fix in reindex.py:127–142).
126
+ from rich.console import Console
127
+
128
+ Console(stderr=True).print(
129
+ f"[red]Server is busy (reindex in progress); timed out after "
130
+ f"{timeout_s:.0f}s. The graph will update when it finishes — "
131
+ "check 'sqlcg mcp status'.[/red]"
132
+ )
133
+ raise typer.Exit(1) from None
134
+ except (FileNotFoundError, ConnectionRefusedError, OSError):
135
+ # Socket absent or refused — no live server; caller falls back to
136
+ # direct open.
137
+ return None
138
+
139
+ try:
140
+ resp = json.loads(body)
141
+ except (json.JSONDecodeError, ValueError):
142
+ return None # malformed response; treat as no-server
143
+
144
+ if "error" in resp:
145
+ from rich.console import Console
146
+
147
+ Console(stderr=True).print(f"[red]Server query error: {resp['error']}[/red]")
148
+ raise typer.Exit(1)
149
+
150
+ return resp.get("rows", [])
151
+
152
+
153
+ def run_read_routed(
154
+ cypher: str,
155
+ params: dict,
156
+ db_path: Path | None = None,
157
+ ) -> list[dict]:
158
+ """Route through a live server if present, else direct read-only open.
159
+
160
+ This is the single seam every CLI read command calls instead of building
161
+ its own backend. Centralises the fallback semantics:
162
+
163
+ - ``query_via_server`` returns a list → server is live, use rows.
164
+ - ``query_via_server`` returns None → no server, open DB directly with
165
+ ``get_backend(read_only=True)`` (BLOCKER 1 — must pass read_only=True
166
+ or the fallback opens read-write and reproduces lock contention).
167
+ - ``query_via_server`` raises ``typer.Exit`` → server busy/error; let it
168
+ propagate (do NOT fall back — lock is held).
169
+
170
+ Args:
171
+ cypher: Cypher query string.
172
+ params: Query parameter dict.
173
+ db_path: Explicit database path. Defaults to ``get_db_path()``.
174
+
175
+ Returns:
176
+ Row list from the server or from a direct read-only DB open.
177
+
178
+ Raises:
179
+ typer.Exit: Server busy or server error (propagated from
180
+ ``query_via_server``).
181
+ """
182
+ rows = query_via_server(cypher, params, db_path=db_path)
183
+ if rows is not None:
184
+ return rows
185
+
186
+ # No server live — fall back to a direct read-only open.
187
+ # read_only=True is required: without it the fallback opens read-write
188
+ # and any concurrent writer will produce "Database is locked" (BLOCKER 1).
189
+ from sqlcg.core.config import get_backend
190
+
191
+ with get_backend(read_only=True) as backend:
192
+ return backend.run_read(cypher, params)
sqlcg/server/server.py CHANGED
@@ -77,24 +77,39 @@ async def _control_socket_task(
77
77
  db_path: "Path",
78
78
  backend_ref: "Callable[[], GraphBackend | None]",
79
79
  stop_event: "anyio.Event",
80
- reindex_lock: "anyio.Lock",
80
+ backend_lock: "anyio.Lock",
81
81
  start_time: float,
82
82
  ) -> None:
83
83
  """Accept control connections on ``<db>.sock`` and dispatch ops.
84
84
 
85
- Supported ops (newline-delimited JSON request → response):
85
+ Supported ops:
86
86
 
87
87
  - ``{"op": "status"}`` → running state, pid, db_path, freshness, uptime.
88
+ Unframed (legacy single-recv protocol).
88
89
  - ``{"op": "stop"}`` → sends ``{"ok": true}`` then signals stop via
89
- *stop_event*. The ``_run_with_control`` coroutine watches this event
90
- and closes stdin to trigger EOF in the MCP stdio loop.
90
+ *stop_event*. Unframed.
91
91
  - ``{"op": "reindex", "root", "from", "to", "dialect"}`` → runs
92
92
  ``Indexer.resync_changed`` off the event-loop thread via
93
- ``anyio.to_thread.run_sync``, serialised behind *reindex_lock* (R1, R2).
94
-
95
- R2 (single connection): all backend mutations go through ``reindex_lock``
96
- so concurrent notify calls never touch the single Kuzu connection
97
- simultaneously.
93
+ ``anyio.to_thread.run_sync``, serialised behind *backend_lock* (R1, R2).
94
+ Unframed.
95
+ - ``{"op": "query", "cypher": ..., "params": ...}`` → executes a
96
+ read-only Cypher query on the single backend connection, serialised
97
+ behind *backend_lock*. **Length-prefixed framing** (v1.2.0):
98
+ ``<decimal-byte-length>\\n<json-body>`` on both request and response.
99
+
100
+ Framing protocol (v1.2.0, ``query`` op only):
101
+ Request: ``b"<len>\\n" + json_body`` — server detects by sniffing the
102
+ first line; a bare decimal integer → framed. Unframed requests always
103
+ start with ``{`` (never a digit), so the sniff is unambiguous.
104
+ Response: same ``<len>\\n<body>`` format for framed requests; unframed
105
+ response for unframed requests. Old clients that use the unframed
106
+ ``s.recv(65536)`` + ``json.loads`` pattern will get a loud
107
+ ``json.JSONDecodeError`` if they accidentally receive a framed response —
108
+ NOT silent truncation. Only the new ``read_client`` sends framed
109
+ requests, so this does not affect existing callers.
110
+
111
+ R2 (single connection): all backend operations go through ``backend_lock``
112
+ so concurrent calls never touch the single Kuzu connection simultaneously.
98
113
 
99
114
  R8 teardown ordering: the caller must cancel this task BEFORE calling
100
115
  ``shutdown_backend()``. This is guaranteed by the ``anyio.CancelScope``
@@ -108,9 +123,25 @@ async def _control_socket_task(
108
123
  import anyio
109
124
  import anyio.abc as _anyio_abc
110
125
  import anyio.to_thread as _to_thread
126
+ from anyio.streams.buffered import BufferedByteReceiveStream
111
127
 
112
128
  from sqlcg.core.config import get_db_path as _get_db_path
113
129
 
130
+ # Read-only keyword allow-list for the ``query`` op. Only these leading
131
+ # keywords are permitted — anything that starts with any other keyword is
132
+ # rejected before execution. This is a guard against accidental mutation,
133
+ # not a security boundary (the socket is already 0o600 / owner-only).
134
+ _QUERY_ALLOWED_KEYWORDS = frozenset({"MATCH", "RETURN", "WITH", "CALL", "UNWIND", "OPTIONAL"})
135
+
136
+ def _is_read_only_cypher(cypher: str) -> bool:
137
+ """Return True iff the leading keyword is in the read-only allow-list."""
138
+ import re
139
+
140
+ m = re.match(r"\s*(?:--[^\n]*)?\s*(\w+)", cypher, re.IGNORECASE)
141
+ if not m:
142
+ return False
143
+ return m.group(1).upper() in _QUERY_ALLOWED_KEYWORDS
144
+
114
145
  sp = sock_path(db_path)
115
146
 
116
147
  listener = await anyio.create_unix_listener(str(sp))
@@ -119,8 +150,29 @@ async def _control_socket_task(
119
150
  async def _handle_connection(stream: _anyio_abc.SocketStream) -> None:
120
151
  async with stream:
121
152
  try:
122
- raw = await stream.receive(4096)
123
- req = json.loads(raw)
153
+ # Sniff for framed vs unframed request.
154
+ # Framed (query op, v1.2.0): ``<decimal-len>\n<json-body>``
155
+ # Unframed (legacy status/stop/reindex): JSON object starting with ``{``
156
+ # The sniff is unambiguous: unframed JSON always starts with ``{``,
157
+ # never a bare decimal digit.
158
+ buf = BufferedByteReceiveStream(stream)
159
+ first_line = await buf.receive_until(b"\n", max_bytes=64)
160
+
161
+ try:
162
+ body_len = int(first_line.strip())
163
+ framed = True
164
+ except ValueError:
165
+ framed = False
166
+
167
+ if framed:
168
+ # Framed request: read exactly body_len bytes then parse.
169
+ raw_body = await buf.receive_exactly(body_len)
170
+ req = json.loads(raw_body)
171
+ else:
172
+ # Unframed request (legacy ops): first_line IS the JSON
173
+ # (terminated by \n as sent by the client).
174
+ req = json.loads(first_line)
175
+
124
176
  op = req.get("op")
125
177
 
126
178
  if op == "status":
@@ -195,15 +247,42 @@ async def _control_socket_task(
195
247
  dialect,
196
248
  )
197
249
 
198
- async with reindex_lock:
250
+ async with backend_lock:
199
251
  # R1: run off event-loop thread; R2: lock serialises
200
252
  summary = await _to_thread.run_sync(_do_reindex)
201
253
  resp = {"ok": True, "summary": summary}
202
254
 
255
+ elif op == "query":
256
+ # Framed op (v1.2.0): read-only Cypher query over the socket.
257
+ # Must only be called with a framed request (sniff above sets framed=True).
258
+ cypher = req.get("cypher", "")
259
+ params = req.get("params") or {}
260
+ if not _is_read_only_cypher(cypher):
261
+ resp = {"error": "query op is read-only"}
262
+ else:
263
+ db = backend_ref()
264
+ if db is None:
265
+ resp = {"error": "backend not available"}
266
+ else:
267
+
268
+ def _do_query() -> list:
269
+ return db.run_read(cypher, params)
270
+
271
+ async with backend_lock:
272
+ # R1: run off event-loop thread; R2: lock serialises
273
+ # reads and writes on the single Kuzu connection.
274
+ rows = await _to_thread.run_sync(_do_query)
275
+ resp = {"ok": True, "rows": rows}
276
+
203
277
  else:
204
278
  resp = {"error": f"unknown op: {op!r}"}
205
279
 
206
- await stream.send(json.dumps(resp).encode() + b"\n")
280
+ # Send response: framed for framed requests, unframed for legacy ops.
281
+ resp_bytes = json.dumps(resp).encode()
282
+ if framed:
283
+ await stream.send(f"{len(resp_bytes)}\n".encode() + resp_bytes)
284
+ else:
285
+ await stream.send(resp_bytes + b"\n")
207
286
 
208
287
  except Exception as exc:
209
288
  try:
@@ -289,16 +368,16 @@ async def _run_with_control(db_path: "Path", start_time: float) -> None:
289
368
  with ``abandon_on_cancel=False``). We cannot interrupt it without
290
369
  killing the process; ``_stop_watcher`` does cleanup first.
291
370
 
292
- ``reindex_lock`` is created once here and passed into
293
- ``_control_socket_task`` so concurrent notify calls are serialised
294
- behind a single lock (R2).
371
+ ``backend_lock`` is created once here and passed into
372
+ ``_control_socket_task`` so concurrent control ops (reindex, query) are
373
+ serialised behind a single lock on the Kuzu connection (R2).
295
374
  """
296
375
  import anyio
297
376
 
298
377
  import sqlcg.server.tools as _tools
299
378
 
300
379
  stop_event = anyio.Event()
301
- reindex_lock = anyio.Lock() # R2: serialise reindex ops (Kuzu not thread-safe)
380
+ backend_lock = anyio.Lock() # R2: serialise all backend ops (Kuzu not thread-safe)
302
381
 
303
382
  async with anyio.create_task_group() as tg:
304
383
  if sys.platform != "win32":
@@ -308,7 +387,7 @@ async def _run_with_control(db_path: "Path", start_time: float) -> None:
308
387
  db_path,
309
388
  lambda: _tools._backend,
310
389
  stop_event,
311
- reindex_lock,
390
+ backend_lock,
312
391
  start_time,
313
392
  )
314
393
  # Watch stop_event; shuts down and calls os._exit(0).