sql-code-graph 1.0.0__py3-none-any.whl → 1.0.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -51,6 +51,7 @@ def git_name_status_delta(root: Path, old_sha: str, new_sha: str) -> Delta | Non
51
51
  unknown SHA, shallow clone, or git not available). Callers MUST fall
52
52
  back to a full index_repo when None is returned.
53
53
  """
54
+ root = root.resolve() # guard: caller may pass a relative path (e.g. Path("."))
54
55
  try:
55
56
  result = subprocess.run(
56
57
  ["git", "diff", "--name-status", old_sha, new_sha],
sqlcg/indexer/indexer.py CHANGED
@@ -251,9 +251,11 @@ class Indexer:
251
251
  _t_pass2_end = time.perf_counter()
252
252
 
253
253
  except KeyboardInterrupt:
254
- logger.info("SIGINT received flushing pass-1 progress")
255
- # pass1_results may be partial; upsert what we have
256
- self._upsert_all(pass1_results, db)
254
+ # Kill workers and abort immediately. A partial pass-1-only result is
255
+ # an incomplete graph (no cross-file resolution, no star expansion);
256
+ # writing it would leave a misleading half-index. Re-run `sqlcg index`
257
+ # to index — re-indexing is the migration path.
258
+ logger.warning("Interrupted — workers killed; no partial graph written.")
257
259
  raise
258
260
 
259
261
  # Assemble final pass-2 results: start from pass-1, overlay pass-2 where available
@@ -1104,16 +1106,6 @@ class Indexer:
1104
1106
 
1105
1107
  return counts
1106
1108
 
1107
- def _upsert_all(self, results: list[ParsedFile], db: GraphBackend) -> None:
1108
- """Upsert all parsed files.
1109
-
1110
- Args:
1111
- results: List of ParsedFile objects
1112
- db: GraphBackend instance
1113
- """
1114
- for parsed in results:
1115
- self._upsert_parsed_file(parsed, db)
1116
-
1117
1109
  def _expand_star_sources(self, db: GraphBackend) -> int:
1118
1110
  """Run the post-ingestion star expansion query.
1119
1111
 
sqlcg/indexer/pool.py CHANGED
@@ -194,7 +194,14 @@ class HardKillPool:
194
194
  ) -> None:
195
195
  self._dialect = dialect
196
196
  self._schema_aliases: dict[str, str] = schema_aliases or {}
197
- self._n = n_workers or os.cpu_count() or 4
197
+ # Leave 2 logical cores of headroom rather than spawning one worker per
198
+ # logical core. Parsing is CPU-bound, and the main process also does work
199
+ # between passes (closure resolution, batched upserts); saturating every
200
+ # core makes the largest files miss the per-file wall-clock timeout.
201
+ # Measured on the 1,453-file DWH corpus (after the once-per-statement parser
202
+ # fixes): cpu_count → 2 timeouts / 186s; cpu_count-2 → 0 timeouts / 131s
203
+ # (fewer timeouts AND faster, since timed-out files waste work + respawn churn).
204
+ self._n = n_workers or max(1, (os.cpu_count() or 4) - 2)
198
205
  self._ctx = mp.get_context("spawn")
199
206
  self._workers: list[_WorkerState] = []
200
207
 
@@ -278,7 +285,9 @@ class HardKillPool:
278
285
  tidx = queue.pop(0)
279
286
  path = tasks[tidx].get("path", "")
280
287
  if kill_counts.get(path, 0) >= poison_retries:
281
- results[tidx] = _timeout_file(path, self._dialect, poison=True)
288
+ results[tidx] = _timeout_file(
289
+ path, self._dialect, timeout_s=per_task_timeout, poison=True
290
+ )
282
291
  logger.warning("Skipping %s — poisoned after %d kills", path, poison_retries)
283
292
  if on_result is not None:
284
293
  on_result()
@@ -322,6 +331,28 @@ class HardKillPool:
322
331
  w.task_start = time.monotonic()
323
332
  busy.add(slot)
324
333
 
334
+ try:
335
+ return self._run_map_loop(
336
+ tasks, results, busy, kill_counts, _assign, per_task_timeout, on_result, n_tasks
337
+ )
338
+ except KeyboardInterrupt:
339
+ # Workers ignore SIGINT and are CPU-bound, so they will not notice a
340
+ # graceful SHUTDOWN sentinel until their current parse finishes. On
341
+ # interrupt the user wants the process gone now — hard-kill outright.
342
+ self.terminate()
343
+ raise
344
+
345
+ def _run_map_loop(
346
+ self,
347
+ tasks: list[dict],
348
+ results: list[ParsedFile | None],
349
+ busy: set[int],
350
+ kill_counts: dict[str, int],
351
+ _assign: Callable[[int], None],
352
+ per_task_timeout: float,
353
+ on_result: Callable[[], None] | None,
354
+ n_tasks: int,
355
+ ) -> list[ParsedFile | None]:
325
356
  # Initial dispatch: fill all worker slots
326
357
  for i in range(min(self._n, n_tasks)):
327
358
  _assign(i)
@@ -346,7 +377,7 @@ class HardKillPool:
346
377
  slot,
347
378
  kill_counts[path],
348
379
  )
349
- results[tidx] = _timeout_file(path, self._dialect)
380
+ results[tidx] = _timeout_file(path, self._dialect, timeout_s=per_task_timeout)
350
381
  if on_result is not None:
351
382
  on_result()
352
383
  self._respawn(w)
@@ -405,6 +436,31 @@ class HardKillPool:
405
436
  # Shutdown
406
437
  # ------------------------------------------------------------------
407
438
 
439
def terminate(self) -> None:
    """Tear down every worker immediately, skipping the graceful handshake.

    For each worker the pipe is closed and, if the process is still alive,
    ``kill()`` is called on it. Only after every worker has been dealt with
    does the method wait (at most one second per worker) for the processes
    to exit, and finally it empties ``self._workers``.

    Every individual step is best-effort: an ``Exception`` raised while
    closing, killing or joining one worker is ignored so the remaining
    workers are still torn down.
    """

    def _best_effort(step, worker) -> None:
        # A failure on one worker must never stop teardown of the others.
        try:
            step(worker)
        except Exception:
            pass

    def _close_pipe(worker) -> None:
        worker.conn.close()

    def _kill_if_alive(worker) -> None:
        if worker.process.is_alive():
            worker.process.kill()

    def _reap(worker) -> None:
        # Bounded wait so a stuck process cannot hang the interrupt path.
        worker.process.join(timeout=1)

    # Pass 1: close pipes and kill everything before waiting on anything.
    for worker in self._workers:
        _best_effort(_close_pipe, worker)
        _best_effort(_kill_if_alive, worker)

    # Pass 2: collect the killed processes.
    for worker in self._workers:
        _best_effort(_reap, worker)

    self._workers.clear()
463
+
408
464
  def shutdown(self) -> None:
409
465
  """Gracefully stop all workers, then force-kill any that linger."""
410
466
  for w in self._workers:
@@ -432,11 +488,14 @@ class HardKillPool:
432
488
  def _timeout_file(
433
489
  path: str,
434
490
  dialect: str | None,
491
+ timeout_s: float = 0.0,
435
492
  poison: bool = False,
436
493
  ) -> ParsedFile:
437
494
  pf = ParsedFile(path=Path(path), dialect=dialect)
438
- msg = "skipped:poison" if poison else "timeout"
439
- pf.errors.append(f"{msg} file={Path(path).name}")
495
+ if poison:
496
+ pf.errors.append(f"skipped:poison file={Path(path).name}")
497
+ else:
498
+ pf.errors.append(f"timeout:{timeout_s:.0f}s file={Path(path).name}")
440
499
  return pf
441
500
 
442
501
 
sqlcg/parsers/base.py CHANGED
@@ -619,10 +619,6 @@ class SqlParser(ABC):
619
619
  else:
620
620
  return LineageExtraction(edges=edges, star_sources=star_sources)
621
621
 
622
- # Build scope once from the body for all-column reuse (T-05 optimization)
623
- # Defer scope building to just before the column loop to ensure sources
624
- # are expanded first (avoid rebuilding for each column, but only build
625
- # after sources are known)
626
622
  body_scope = None
627
623
  combined_sources = {**(sources or {})}
628
624
 
@@ -644,8 +640,174 @@ class SqlParser(ABC):
644
640
  key = cte_alias.lower()
645
641
  combined_sources[key] = cte.this
646
642
 
647
- # Extract output columns
648
- for col_expr in col_expressions:
643
+ # Build body_scope ONCE per statement, before the column loop, and reuse
644
+ # it for every column (CLAUDE.md invariant: "body_scope built once per
645
+ # statement"). If schema-qualify fails, retry schema-free so we STILL get
646
+ # a scope for the copy=False fast path; only if both fail do we fall back
647
+ # to the per-column sources= path. Building this lazily inside the loop
648
+ # (regressed in 4234e5d) meant a single qualify failure re-ran
649
+ # expand+qualify+build_scope for EVERY column → O(N_cols) full-body
650
+ # deepcopies per statement (measured: 229 qualify calls on one 460-line file).
651
+ if scope is None:
652
+ expanded_body = body
653
+ expand_sources = {
654
+ k: v for k, v in (sources or {}).items() if isinstance(v, exp.Query)
655
+ }
656
+ if expand_sources:
657
+ try:
658
+ expanded_body = exp.expand(
659
+ body,
660
+ expand_sources, # type: ignore
661
+ dialect=self.DIALECT,
662
+ copy=True,
663
+ )
664
+ except Exception:
665
+ expanded_body = body
666
+ try:
667
+ qualified_body = qualify(
668
+ expanded_body,
669
+ dialect=self.DIALECT,
670
+ schema=schema,
671
+ validate_qualify_columns=False,
672
+ identify=False,
673
+ )
674
+ body_scope = build_scope(qualified_body)
675
+ except Exception as _qualify_exc:
676
+ out.errors.append(
677
+ f"col_lineage_skip:qualify_failed:{type(_qualify_exc).__name__}"
678
+ )
679
+ # Schema-free retry: still yields a scope for the copy=False path.
680
+ try:
681
+ qualified_body = qualify(
682
+ expanded_body,
683
+ dialect=self.DIALECT,
684
+ validate_qualify_columns=False,
685
+ identify=False,
686
+ )
687
+ body_scope = build_scope(qualified_body)
688
+ except Exception:
689
+ body_scope = None
690
+
691
+ # INSERT positional column-list mapping (#25 fix).
692
+ # Compute the positional_col_names skip-set BEFORE the main column loop
693
+ # so the main loop can skip positions already handled here.
694
+ #
695
+ # When an INSERT has an explicit column list (INSERT INTO t (c1, c2) SELECT ...),
696
+ # the target column name at position idx is authoritative — the SELECT alias is
697
+ # cosmetic for the SELECT and meaningless to the INSERT target. This block
698
+ # overrides alias attribution for ALL positions (aliased or not).
699
+ #
700
+ # Guards applied here mirror the main column loop to preserve skip markers:
701
+ # - Star expressions → emit col_lineage_skip:star, register pos, skip sg_lineage
702
+ # - Pure-literal (no Column descendant) → register pos, skip sg_lineage (silent)
703
+ # - Unaliased non-Column (func/arith/CASE) → emit col_lineage_skip:func_fallback,
704
+ # register pos, skip sg_lineage
705
+ # - Plain Column / aliased expression → call sg_lineage (the #25 happy path)
706
+ #
707
+ # CLAUDE.md invariant: body_no_with = body.copy() + strip-WITH happens ONCE
708
+ # before the inner loop; only the single projection is swapped per column.
709
+ positional_col_names: dict[int, str] = {} # idx → insert_col_name
710
+ if isinstance(stmt, exp.Insert) and isinstance(stmt.this, exp.Schema):
711
+ insert_cols_list = [c.name for c in stmt.this.expressions]
712
+ # Build the WITH-stripped body ONCE here, before any per-column loop.
713
+ # Only the single projection is swapped per column below.
714
+ body_no_with = body.copy()
715
+ body_no_with.set("with_", None)
716
+ for _ins_idx, _col_expr in enumerate(col_expressions):
717
+ if _ins_idx >= len(insert_cols_list):
718
+ break
719
+ _insert_col = insert_cols_list[_ins_idx]
720
+ if not _insert_col:
721
+ continue
722
+ # Register position first so the main loop always skips it,
723
+ # regardless of which guard fires below.
724
+ positional_col_names[_ins_idx] = _insert_col
725
+
726
+ # Guard 1: Star projection — emit skip marker (same as main loop).
727
+ _inner_for_guard = (
728
+ _col_expr.this if isinstance(_col_expr, exp.Alias) else _col_expr
729
+ )
730
+ if isinstance(_inner_for_guard, exp.Star) or (
731
+ isinstance(_inner_for_guard, exp.Column)
732
+ and isinstance(_inner_for_guard.this, exp.Star)
733
+ ):
734
+ _qualifier = (
735
+ _inner_for_guard.table
736
+ if isinstance(_inner_for_guard, exp.Column)
737
+ else None
738
+ )
739
+ out.errors.append(f"col_lineage_skip:star:{_qualifier or '<unqualified>'}")
740
+ continue # no sg_lineage for star
741
+
742
+ # Guard 2: Pure-literal — no Column descendants, nothing to trace.
743
+ if not list(_col_expr.find_all(exp.Column)):
744
+ continue # silent skip, no sg_lineage
745
+
746
+ # NOTE: do NOT emit func_fallback here for unaliased non-Column
747
+ # expressions (functions, arithmetic, CASE …). The main loop emits
748
+ # func_fallback for such expressions because a plain SELECT/CREATE VIEW
749
+ # gives them no output column name. The positional INSERT column list
750
+ # DOES supply that name (_insert_col): below we wrap the expression as
751
+ # Alias(expr, _insert_col) and let sg_lineage trace through it — exactly
752
+ # as the aliased form (e.g. `DATE(col) AS a`) already resolves. Guard 2
753
+ # (above) already dropped genuinely-untraceable pure-literal expressions
754
+ # (no Column descendant). Skipping column-containing expressions here would
755
+ # make the #25 positional feature do its work and then discard the result,
756
+ # dropping real lineage edges (regressed by eb19f29; broke COALESCE).
757
+
758
+ # Positional mapping always wins — replace (or add) the alias with the
759
+ # INSERT target column name regardless of SELECT alias.
760
+ if _col_expr.alias and _col_expr.alias != _insert_col:
761
+ self._log.debug(
762
+ "INSERT positional override: SELECT alias %r → INSERT col %r"
763
+ " at position %d",
764
+ _col_expr.alias,
765
+ _insert_col,
766
+ _ins_idx,
767
+ )
768
+ # If the expression is already an Alias(inner, old_alias), unwrap it
769
+ # before re-wrapping — otherwise we produce Alias(Alias(inner, x), c1)
770
+ # which serialises as "inner AS x AS c1" (syntax error).
771
+ _inner = _col_expr.this if isinstance(_col_expr, exp.Alias) else _col_expr
772
+ _aliased = exp.Alias(this=_inner.copy(), alias=_insert_col)
773
+ body_no_with.set("expressions", [_aliased])
774
+ _patched_sql = body_no_with.sql(dialect=self.DIALECT)
775
+ # Pass sources= (not scope=) here: the patched SQL is a freshly
776
+ # serialised string — the scope was built from the original body AST
777
+ # and does not correspond to this new string.
778
+ #
779
+ # Use `sources` (the cross-statement temp/CTAS map), NOT
780
+ # `combined_sources`. combined_sources additionally carries the
781
+ # SAME-STATEMENT CTE bodies (added above). Since body_no_with strips
782
+ # the WITH clause from the patched SQL, those CTE names become opaque
783
+ # source relations — passing their bodies as sources= would expand them
784
+ # away, collapsing intermediate CTE→target hops into the deepest source
785
+ # (regressed by eb19f29; broke the MA_AANTAL_OP_ORDER anchor link 5).
786
+ # Cross-statement temps (e.g. CREATE TEMP TABLE t) live in `sources`
787
+ # and SHOULD still expand (E36 multi-temp: t → src).
788
+ try:
789
+ _root = sg_lineage(
790
+ _insert_col,
791
+ _patched_sql,
792
+ dialect=self.DIALECT,
793
+ sources=sources or {},
794
+ )
795
+ if _root:
796
+ _new_edges = self._lineage_node_to_edges(
797
+ _root,
798
+ dst_col_name=_insert_col,
799
+ dst_table=dst_table,
800
+ path=path,
801
+ out=out,
802
+ )
803
+ edges.extend(_new_edges)
804
+ except Exception:
805
+ pass
806
+
807
+ # Extract output columns — skip positions handled by the positional INSERT block
808
+ for loop_idx, col_expr in enumerate(col_expressions):
809
+ if loop_idx in positional_col_names:
810
+ continue # positional INSERT block already emitted this column
649
811
  # Skip star projections — sg_lineage requires a concrete column name.
650
812
  if isinstance(col_expr, exp.Star) or (
651
813
  isinstance(col_expr, exp.Column) and isinstance(col_expr.this, exp.Star)
@@ -723,53 +885,23 @@ class SqlParser(ABC):
723
885
  continue
724
886
 
725
887
  try:
726
- # Build scope on first column for reuse across all columns (T-05 optimization).
727
- # NOTE: We build body_scope locally from the extracted body rather than
728
- # using a pre-built scope from the statement, because CREATE/INSERT statements
729
- # have their scope rooted at the outer statement, but the body passed here
730
- # is the inner SELECT. Reusing the outer scope would produce incorrect
731
- # qualification. The pre-built scope from parse_file would only be useful
732
- # if we had a mechanism to extract the matching inner scope, which is
733
- # complex and not yet implemented (see sprint_06 T-05 deviation for details).
734
- if body_scope is None and scope is None:
735
- try:
736
- # Expand only file-level sources (CTEs, temp tables, CTAS bodies).
737
- expanded_body = body
738
- expand_sources = {
739
- k: v for k, v in (sources or {}).items() if isinstance(v, exp.Query)
740
- }
741
- if expand_sources:
742
- expanded_body = exp.expand(
743
- body,
744
- expand_sources, # type: ignore
745
- dialect=self.DIALECT,
746
- copy=True,
747
- )
748
-
749
- # Qualify the expanded body to prepare for scope building
750
- qualified_body = qualify(
751
- expanded_body,
752
- dialect=self.DIALECT,
753
- schema=schema,
754
- validate_qualify_columns=False,
755
- identify=False,
756
- )
757
- body_scope = build_scope(qualified_body)
758
- except Exception as _qualify_exc:
759
- # qualify() failure is non-fatal: sg_lineage falls back to
760
- # its own qualification. Record for observability.
761
- out.errors.append(
762
- f"col_lineage_skip:qualify_failed:{type(_qualify_exc).__name__}:{_qualify_exc}"
763
- )
764
- body_scope = None
765
-
766
888
  # When a scope is available it embeds full column→table resolution.
767
889
  # On the qualify-failed fallback path (no scope), pass only the small
768
890
  # set of file-level sources so sg_lineage can resolve CTEs/CTAS bodies.
769
891
  active_scope = scope if scope is not None else body_scope
770
892
  sg_kwargs: dict = {"dialect": self.DIALECT}
771
893
  if active_scope is not None:
894
+ # scope= path: the pre-built scope already embeds full
895
+ # column→table resolution. copy=False + trim_selects=False
896
+ # suppress sqlglot's per-call AST deepcopy and per-column
897
+ # trim — neither is needed when the scope is built once and
898
+ # reused across all columns. Dropping these (regressed in
899
+ # 4234e5d) makes lineage() deepcopy the whole scope per
900
+ # column → O(columns × scope_size) (measured: 3.2M deepcopy
901
+ # calls / ~3.8s on a 359-line file).
772
902
  sg_kwargs["scope"] = active_scope
903
+ sg_kwargs["copy"] = False
904
+ sg_kwargs["trim_selects"] = False
773
905
  else:
774
906
  sg_kwargs["sources"] = sources or {}
775
907
  root = sg_lineage(col_name, body, **sg_kwargs)
@@ -905,42 +1037,6 @@ class SqlParser(ABC):
905
1037
  cte_col_name,
906
1038
  )
907
1039
 
908
- # INSERT column-list aliasing (T-07-02 link 5).
909
- # When an INSERT has an explicit column list and the SELECT expression has
910
- # no alias (e.g. SELECT SUM(x) FROM cte), the INSERT column at the same
911
- # position provides the destination col name. Stripping the WITH clause
912
- # stops sg_lineage at the CTE name boundary (doesn't expand into bodies).
913
- if isinstance(stmt, exp.Insert) and isinstance(stmt.this, exp.Schema):
914
- insert_cols = [c.name for c in stmt.this.expressions]
915
- for idx, col_expr in enumerate(col_expressions):
916
- if idx >= len(insert_cols):
917
- break
918
- if col_expr.alias:
919
- continue # already handled by the main col loop
920
- insert_col = insert_cols[idx]
921
- if not insert_col:
922
- continue
923
- # Build a patched SELECT: strip WITH, alias the expression with the
924
- # INSERT column name so sg_lineage can trace it.
925
- body_no_with = body.copy()
926
- body_no_with.set("with_", None)
927
- aliased = exp.Alias(this=col_expr.copy(), alias=insert_col)
928
- body_no_with.set("expressions", [aliased])
929
- patched_sql = body_no_with.sql(dialect=self.DIALECT)
930
- try:
931
- root = sg_lineage(insert_col, patched_sql, dialect=self.DIALECT)
932
- if root:
933
- new_edges = self._lineage_node_to_edges(
934
- root,
935
- dst_col_name=insert_col,
936
- dst_table=dst_table,
937
- path=path,
938
- out=out,
939
- )
940
- edges.extend(new_edges)
941
- except Exception:
942
- pass
943
-
944
1040
  except Exception as exc:
945
1041
  self._log.debug(
946
1042
  "column lineage extraction failed for entire statement: file=%s error=%s",
sqlcg/server/server.py CHANGED
@@ -1,35 +1,76 @@
1
1
  """MCP server for SQL Code Graph.
2
2
 
3
3
  Exposes FastMCP tools for lineage queries, pattern search, and indexing.
4
- MCP protocol uses stdout for message transport, so this module redirects
5
- stdout to stderr to prevent user logs from corrupting the protocol stream.
4
+ MCP protocol uses stdout (fd 1) for JSON-RPC message transport. This module
5
+ captures fd 1 as a raw binary buffer BEFORE any logging redirection so that
6
+ the captured buffer can be passed explicitly to stdio_server(). This ensures
7
+ JSON-RPC frames always go to fd 1 regardless of what sys.stdout points to
8
+ at call time.
9
+
10
+ Ordering invariant (must not change):
11
+ 1. os.dup(1) → _real_stdout_buffer (first — before everything)
12
+ 2. from mcp.server import FastMCP (module-level import)
13
+ 3. mcp = FastMCP("SQL Code Graph") (module-level; tools.py registers here)
14
+ 4. main() calls _configure_mcp_logging() (not at module scope)
6
15
  """
7
16
 
17
+ import os
8
18
  import sys
9
19
 
10
- from dotenv import load_dotenv
11
- from mcp.server import FastMCP
20
+ # Capture the real fd 1 binary stream FIRST — before _configure_mcp_logging()
21
+ # (which replaces sys.stdout) AND before FastMCP("SQL Code Graph") construction.
22
+ # stdio_server() receives this explicitly so JSON-RPC frames go to fd 1
23
+ # regardless of what sys.stdout points to afterward.
24
+ # Guards against the v1.0.0/v1.0.1 regression where frames went to fd 2.
25
+ _real_stdout_buffer = os.fdopen(os.dup(1), "wb", buffering=0)
12
26
 
13
- from sqlcg.utils.logging import getLogger
27
+ from dotenv import load_dotenv # noqa: E402
28
+ from mcp.server import FastMCP # noqa: E402
29
+
30
+ from sqlcg.utils.logging import getLogger # noqa: E402
14
31
 
15
32
  logger = getLogger(__name__)
16
33
 
34
+ # Create FastMCP instance at module scope so tools.py can import and register with it.
35
+ # This is safe because _real_stdout_buffer has already captured fd 1 above.
36
+ mcp = FastMCP("SQL Code Graph")
37
+
17
38
 
18
39
  def _configure_mcp_logging() -> None:
19
- """Redirect sys.stdout to sys.stderr to protect MCP protocol.
40
+ """Redirect sys.stdout to sys.stderr and configure logging to stderr.
20
41
 
21
- MCP uses stdout for JSON-RPC messages. Any user print() or log output
22
- to stdout would corrupt the protocol. This function must be called before
23
- mcp.run() and before any code that might print to stdout.
42
+ sys.stdout is replaced with sys.stderr so that any stray print() call
43
+ does not pollute fd 1 (reserved for MCP JSON-RPC frames).
44
+ The real fd 1 binary stream is captured in _real_stdout_buffer at module
45
+ top before this replacement and passed explicitly to stdio_server().
46
+
47
+ Must be called inside main(), not at module scope, so that
48
+ _real_stdout_buffer captures fd 1 before the redirect.
24
49
  """
50
+ import logging
51
+
25
52
  sys.stdout = sys.stderr
53
+ logging.basicConfig(stream=sys.stderr, level=logging.WARNING)
26
54
 
27
55
 
28
- # Protect stdout before importing FastMCP (which may emit output during import)
29
- _configure_mcp_logging()
56
+ async def _run_stdio_async_with_real_stdout() -> None:
57
+ """Run the MCP server loop with JSON-RPC frames explicitly on fd 1.
30
58
 
31
- # Create FastMCP instance at module scope so tools.py can import and register with it
32
- mcp = FastMCP("SQL Code Graph")
59
+ Bypasses FastMCP.run_stdio_async() (which uses sys.stdout at call time)
60
+ and drives the server loop directly with the captured _real_stdout_buffer.
61
+ """
62
+ from io import TextIOWrapper
63
+
64
+ import anyio
65
+ from mcp.server.stdio import stdio_server
66
+
67
+ stdout_text = TextIOWrapper(_real_stdout_buffer, encoding="utf-8", line_buffering=False)
68
+ async with stdio_server(stdout=anyio.wrap_file(stdout_text)) as (read_stream, write_stream):
69
+ await mcp._mcp_server.run(
70
+ read_stream,
71
+ write_stream,
72
+ mcp._mcp_server.create_initialization_options(),
73
+ )
33
74
 
34
75
 
35
76
  def main(db_path: str | None = None) -> None:
@@ -38,10 +79,13 @@ def main(db_path: str | None = None) -> None:
38
79
  Args:
39
80
  db_path: Path to KùzuDB database. If None, uses SQLCG_DB_PATH env var
40
81
  or ~/.sqlcg/graph.db (via get_db_path in tools module).
41
-
42
- Raises:
43
- RuntimeError: If tools fail to initialize or FastMCP server fails.
44
82
  """
83
+ import anyio
84
+
85
+ # Must be first — redirects sys.stdout → sys.stderr so stray prints don't
86
+ # corrupt fd 1. _real_stdout_buffer was already captured at module top.
87
+ _configure_mcp_logging()
88
+
45
89
  load_dotenv()
46
90
 
47
91
  # Import tools module to trigger tool registration via @mcp.tool() decorators
@@ -50,8 +94,7 @@ def main(db_path: str | None = None) -> None:
50
94
  # Initialize the backend singleton used by all tools
51
95
  sqlcg.server.tools.init_backend(db_path)
52
96
 
53
- # Run the MCP server event loop, ensuring backend is closed on shutdown
54
97
  try:
55
- mcp.run()
98
+ anyio.run(_run_stdio_async_with_real_stdout)
56
99
  finally:
57
100
  sqlcg.server.tools.shutdown_backend()
sqlcg/server/tools.py CHANGED
@@ -183,6 +183,19 @@ def _assert_indexed(db: GraphBackend) -> None:
183
183
  )
184
184
 
185
185
 
186
+ def _bare_ref(ref: str) -> str:
187
+ """Strip schema prefix from a ref string, keeping table.column.
188
+
189
+ For a 3-part ref ("mart.fact_t.amount") this returns "fact_t.amount".
190
+ For a 2-part ref ("fact_t.amount") this returns the ref unchanged.
191
+ Never uses rsplit — that would yield only the column name for 3-part refs.
192
+ """
193
+ parts = ref.split(".")
194
+ if len(parts) >= 3:
195
+ return ".".join(parts[1:]) # drop schema, keep table.column
196
+ return ref # already bare (no schema prefix)
197
+
198
+
186
199
  def _parse_column_ref(col_ref: str) -> tuple[str, str]:
187
200
  """Parse column reference "table.column" or "catalog.db.table.column".
188
201
 
@@ -554,9 +567,54 @@ def trace_column_lineage(table_col: str, max_depth: int | None = None) -> Lineag
554
567
 
555
568
  mermaid = _build_mermaid(col_id, edges) if edges else None
556
569
 
570
+ # Bare-name fallback: when the primary query returns empty and the ref has a
571
+ # schema component (3+ parts), retry with the schema prefix stripped.
572
+ # This handles unqualified INSERT targets indexed without a schema prefix.
573
+ bare_fallback_used = False
574
+ if not lineage and len(table_col.split(".")) >= 3:
575
+ bare = _bare_ref(table_col)
576
+ bare_queue: deque[tuple[str, int]] = deque([(bare, 0)])
577
+ bare_visited: set[str] = set()
578
+ bare_emitted: set[str] = set()
579
+ while bare_queue:
580
+ current_id, depth = bare_queue.popleft()
581
+ if current_id in bare_visited or (max_depth is not None and depth > max_depth):
582
+ continue
583
+ if len(bare_visited) >= max_nodes:
584
+ break
585
+ bare_visited.add(current_id)
586
+ rows = db.run_read(TRACE_COLUMN_LINEAGE_QUERY, {"id": current_id})
587
+ for row in rows:
588
+ node_id = row["id"]
589
+ edges.append((node_id, current_id, row.get("transform") or "SELECT"))
590
+ if node_id not in bare_visited and node_id not in bare_emitted:
591
+ bare_emitted.add(node_id)
592
+ lineage.append(
593
+ LineageNode(
594
+ name=row.get("col_name", ""),
595
+ kind="column",
596
+ table=row.get("table_qualified"),
597
+ file=None,
598
+ confidence=row.get("confidence"),
599
+ )
600
+ )
601
+ if node_id not in bare_visited:
602
+ bare_queue.append((node_id, depth + 1))
603
+ if lineage:
604
+ bare_fallback_used = True
605
+ mermaid = _build_mermaid(bare, edges) if edges else None
606
+
557
607
  # Populate hint if result is empty (Step 4.1)
558
608
  hint = None
559
- if not lineage:
609
+ if bare_fallback_used:
610
+ bare = _bare_ref(table_col)
611
+ hint = (
612
+ f"No results for '{table_col}'. Found lineage under bare name '{bare}'. "
613
+ "The INSERT target may have been indexed without a schema prefix. "
614
+ "Multiple tables with the same unqualified name in different schemas "
615
+ "would all match — re-index with an explicit schema for precise results."
616
+ )
617
+ elif not lineage:
560
618
  hint = (
561
619
  "No lineage found. Ensure the column reference includes the schema prefix "
562
620
  "(e.g., ba.table_name.column_name). Check that 'sqlcg db info' shows "
sqlcg/utils/ignore.py CHANGED
@@ -14,6 +14,7 @@ def load_ignore_spec(root: Path) -> pathspec.PathSpec:
14
14
  Returns:
15
15
  PathSpec object for matching ignore patterns
16
16
  """
17
+ root = Path(root).resolve() # guard: caller may pass a relative path (e.g. Path("."))
17
18
  ignore_file = root / ".sqlcgignore"
18
19
  if ignore_file.exists():
19
20
  patterns = ignore_file.read_text().splitlines()
@@ -33,4 +34,5 @@ def is_ignored(path: Path, root: Path, spec: pathspec.PathSpec) -> bool:
33
34
  Returns:
34
35
  True if the path matches any ignore pattern
35
36
  """
37
+ root = Path(root).resolve() # guard: ensure root is absolute before relative_to()
36
38
  return spec.match_file(str(path.relative_to(root)))