sql-code-graph 1.0.0__py3-none-any.whl → 1.0.2__py3-none-any.whl
This diff compares the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those package versions as they appear in their respective public registries.
- {sql_code_graph-1.0.0.dist-info → sql_code_graph-1.0.2.dist-info}/METADATA +1 -1
- {sql_code_graph-1.0.0.dist-info → sql_code_graph-1.0.2.dist-info}/RECORD +20 -20
- sqlcg/__init__.py +1 -1
- sqlcg/cli/commands/analyze.py +90 -0
- sqlcg/cli/commands/find.py +11 -0
- sqlcg/cli/commands/index.py +72 -1
- sqlcg/cli/commands/install.py +83 -46
- sqlcg/cli/commands/mcp.py +18 -12
- sqlcg/cli/commands/reindex.py +3 -0
- sqlcg/core/config.py +7 -0
- sqlcg/indexer/error_classify.py +5 -1
- sqlcg/indexer/git_delta.py +1 -0
- sqlcg/indexer/indexer.py +5 -13
- sqlcg/indexer/pool.py +64 -5
- sqlcg/parsers/base.py +178 -82
- sqlcg/server/server.py +61 -18
- sqlcg/server/tools.py +59 -1
- sqlcg/utils/ignore.py +2 -0
- {sql_code_graph-1.0.0.dist-info → sql_code_graph-1.0.2.dist-info}/WHEEL +0 -0
- {sql_code_graph-1.0.0.dist-info → sql_code_graph-1.0.2.dist-info}/entry_points.txt +0 -0
sqlcg/indexer/git_delta.py
CHANGED
|
@@ -51,6 +51,7 @@ def git_name_status_delta(root: Path, old_sha: str, new_sha: str) -> Delta | Non
|
|
|
51
51
|
unknown SHA, shallow clone, or git not available). Callers MUST fall
|
|
52
52
|
back to a full index_repo when None is returned.
|
|
53
53
|
"""
|
|
54
|
+
root = root.resolve() # guard: caller may pass a relative path (e.g. Path("."))
|
|
54
55
|
try:
|
|
55
56
|
result = subprocess.run(
|
|
56
57
|
["git", "diff", "--name-status", old_sha, new_sha],
|
sqlcg/indexer/indexer.py
CHANGED
|
@@ -251,9 +251,11 @@ class Indexer:
|
|
|
251
251
|
_t_pass2_end = time.perf_counter()
|
|
252
252
|
|
|
253
253
|
except KeyboardInterrupt:
|
|
254
|
-
|
|
255
|
-
#
|
|
256
|
-
|
|
254
|
+
# Kill workers and abort immediately. A partial pass-1-only result is
|
|
255
|
+
# an incomplete graph (no cross-file resolution, no star expansion);
|
|
256
|
+
# writing it would leave a misleading half-index. Re-run `sqlcg index`
|
|
257
|
+
# to index — re-indexing is the migration path.
|
|
258
|
+
logger.warning("Interrupted — workers killed; no partial graph written.")
|
|
257
259
|
raise
|
|
258
260
|
|
|
259
261
|
# Assemble final pass-2 results: start from pass-1, overlay pass-2 where available
|
|
@@ -1104,16 +1106,6 @@ class Indexer:
|
|
|
1104
1106
|
|
|
1105
1107
|
return counts
|
|
1106
1108
|
|
|
1107
|
-
def _upsert_all(self, results: list[ParsedFile], db: GraphBackend) -> None:
|
|
1108
|
-
"""Upsert all parsed files.
|
|
1109
|
-
|
|
1110
|
-
Args:
|
|
1111
|
-
results: List of ParsedFile objects
|
|
1112
|
-
db: GraphBackend instance
|
|
1113
|
-
"""
|
|
1114
|
-
for parsed in results:
|
|
1115
|
-
self._upsert_parsed_file(parsed, db)
|
|
1116
|
-
|
|
1117
1109
|
def _expand_star_sources(self, db: GraphBackend) -> int:
|
|
1118
1110
|
"""Run the post-ingestion star expansion query.
|
|
1119
1111
|
|
sqlcg/indexer/pool.py
CHANGED
|
@@ -194,7 +194,14 @@ class HardKillPool:
|
|
|
194
194
|
) -> None:
|
|
195
195
|
self._dialect = dialect
|
|
196
196
|
self._schema_aliases: dict[str, str] = schema_aliases or {}
|
|
197
|
-
|
|
197
|
+
# Leave 2 logical cores of headroom rather than spawning one worker per
|
|
198
|
+
# logical core. Parsing is CPU-bound, and the main process also does work
|
|
199
|
+
# between passes (closure resolution, batched upserts); saturating every
|
|
200
|
+
# core makes the largest files miss the per-file wall-clock timeout.
|
|
201
|
+
# Measured on the 1,453-file DWH corpus (after the once-per-statement parser
|
|
202
|
+
# fixes): cpu_count → 2 timeouts / 186s; cpu_count-2 → 0 timeouts / 131s
|
|
203
|
+
# (fewer timeouts AND faster, since timed-out files waste work + respawn churn).
|
|
204
|
+
self._n = n_workers or max(1, (os.cpu_count() or 4) - 2)
|
|
198
205
|
self._ctx = mp.get_context("spawn")
|
|
199
206
|
self._workers: list[_WorkerState] = []
|
|
200
207
|
|
|
@@ -278,7 +285,9 @@ class HardKillPool:
|
|
|
278
285
|
tidx = queue.pop(0)
|
|
279
286
|
path = tasks[tidx].get("path", "")
|
|
280
287
|
if kill_counts.get(path, 0) >= poison_retries:
|
|
281
|
-
results[tidx] = _timeout_file(
|
|
288
|
+
results[tidx] = _timeout_file(
|
|
289
|
+
path, self._dialect, timeout_s=per_task_timeout, poison=True
|
|
290
|
+
)
|
|
282
291
|
logger.warning("Skipping %s — poisoned after %d kills", path, poison_retries)
|
|
283
292
|
if on_result is not None:
|
|
284
293
|
on_result()
|
|
@@ -322,6 +331,28 @@ class HardKillPool:
|
|
|
322
331
|
w.task_start = time.monotonic()
|
|
323
332
|
busy.add(slot)
|
|
324
333
|
|
|
334
|
+
try:
|
|
335
|
+
return self._run_map_loop(
|
|
336
|
+
tasks, results, busy, kill_counts, _assign, per_task_timeout, on_result, n_tasks
|
|
337
|
+
)
|
|
338
|
+
except KeyboardInterrupt:
|
|
339
|
+
# Workers ignore SIGINT and are CPU-bound, so they will not notice a
|
|
340
|
+
# graceful SHUTDOWN sentinel until their current parse finishes. On
|
|
341
|
+
# interrupt the user wants the process gone now — hard-kill outright.
|
|
342
|
+
self.terminate()
|
|
343
|
+
raise
|
|
344
|
+
|
|
345
|
+
def _run_map_loop(
|
|
346
|
+
self,
|
|
347
|
+
tasks: list[dict],
|
|
348
|
+
results: list[ParsedFile | None],
|
|
349
|
+
busy: set[int],
|
|
350
|
+
kill_counts: dict[str, int],
|
|
351
|
+
_assign: Callable[[int], None],
|
|
352
|
+
per_task_timeout: float,
|
|
353
|
+
on_result: Callable[[], None] | None,
|
|
354
|
+
n_tasks: int,
|
|
355
|
+
) -> list[ParsedFile | None]:
|
|
325
356
|
# Initial dispatch: fill all worker slots
|
|
326
357
|
for i in range(min(self._n, n_tasks)):
|
|
327
358
|
_assign(i)
|
|
@@ -346,7 +377,7 @@ class HardKillPool:
|
|
|
346
377
|
slot,
|
|
347
378
|
kill_counts[path],
|
|
348
379
|
)
|
|
349
|
-
results[tidx] = _timeout_file(path, self._dialect)
|
|
380
|
+
results[tidx] = _timeout_file(path, self._dialect, timeout_s=per_task_timeout)
|
|
350
381
|
if on_result is not None:
|
|
351
382
|
on_result()
|
|
352
383
|
self._respawn(w)
|
|
@@ -405,6 +436,31 @@ class HardKillPool:
|
|
|
405
436
|
# Shutdown
|
|
406
437
|
# ------------------------------------------------------------------
|
|
407
438
|
|
|
439
|
+
def terminate(self) -> None:
|
|
440
|
+
"""Immediately SIGKILL every worker without a graceful handshake.
|
|
441
|
+
|
|
442
|
+
Unlike :meth:`shutdown`, this sends no ``_SHUTDOWN`` sentinel and does
|
|
443
|
+
not wait for in-flight parses. Workers ignore SIGINT and are CPU-bound,
|
|
444
|
+
so a graceful stop would block on the longest running parse; on
|
|
445
|
+
interrupt we kill outright so the process dies promptly.
|
|
446
|
+
"""
|
|
447
|
+
for w in self._workers:
|
|
448
|
+
try:
|
|
449
|
+
w.conn.close()
|
|
450
|
+
except Exception:
|
|
451
|
+
pass
|
|
452
|
+
try:
|
|
453
|
+
if w.process.is_alive():
|
|
454
|
+
w.process.kill()
|
|
455
|
+
except Exception:
|
|
456
|
+
pass
|
|
457
|
+
for w in self._workers:
|
|
458
|
+
try:
|
|
459
|
+
w.process.join(timeout=1)
|
|
460
|
+
except Exception:
|
|
461
|
+
pass
|
|
462
|
+
self._workers.clear()
|
|
463
|
+
|
|
408
464
|
def shutdown(self) -> None:
|
|
409
465
|
"""Gracefully stop all workers, then force-kill any that linger."""
|
|
410
466
|
for w in self._workers:
|
|
@@ -432,11 +488,14 @@ class HardKillPool:
|
|
|
432
488
|
def _timeout_file(
|
|
433
489
|
path: str,
|
|
434
490
|
dialect: str | None,
|
|
491
|
+
timeout_s: float = 0.0,
|
|
435
492
|
poison: bool = False,
|
|
436
493
|
) -> ParsedFile:
|
|
437
494
|
pf = ParsedFile(path=Path(path), dialect=dialect)
|
|
438
|
-
|
|
439
|
-
|
|
495
|
+
if poison:
|
|
496
|
+
pf.errors.append(f"skipped:poison file={Path(path).name}")
|
|
497
|
+
else:
|
|
498
|
+
pf.errors.append(f"timeout:{timeout_s:.0f}s file={Path(path).name}")
|
|
440
499
|
return pf
|
|
441
500
|
|
|
442
501
|
|
sqlcg/parsers/base.py
CHANGED
|
@@ -619,10 +619,6 @@ class SqlParser(ABC):
|
|
|
619
619
|
else:
|
|
620
620
|
return LineageExtraction(edges=edges, star_sources=star_sources)
|
|
621
621
|
|
|
622
|
-
# Build scope once from the body for all-column reuse (T-05 optimization)
|
|
623
|
-
# Defer scope building to just before the column loop to ensure sources
|
|
624
|
-
# are expanded first (avoid rebuilding for each column, but only build
|
|
625
|
-
# after sources are known)
|
|
626
622
|
body_scope = None
|
|
627
623
|
combined_sources = {**(sources or {})}
|
|
628
624
|
|
|
@@ -644,8 +640,174 @@ class SqlParser(ABC):
|
|
|
644
640
|
key = cte_alias.lower()
|
|
645
641
|
combined_sources[key] = cte.this
|
|
646
642
|
|
|
647
|
-
#
|
|
648
|
-
for
|
|
643
|
+
# Build body_scope ONCE per statement, before the column loop, and reuse
|
|
644
|
+
# it for every column (CLAUDE.md invariant: "body_scope built once per
|
|
645
|
+
# statement"). If schema-qualify fails, retry schema-free so we STILL get
|
|
646
|
+
# a scope for the copy=False fast path; only if both fail do we fall back
|
|
647
|
+
# to the per-column sources= path. Building this lazily inside the loop
|
|
648
|
+
# (regressed in 4234e5d) meant a single qualify failure re-ran
|
|
649
|
+
# expand+qualify+build_scope for EVERY column → O(N_cols) full-body
|
|
650
|
+
# deepcopies per statement (measured: 229 qualify calls on one 460-line file).
|
|
651
|
+
if scope is None:
|
|
652
|
+
expanded_body = body
|
|
653
|
+
expand_sources = {
|
|
654
|
+
k: v for k, v in (sources or {}).items() if isinstance(v, exp.Query)
|
|
655
|
+
}
|
|
656
|
+
if expand_sources:
|
|
657
|
+
try:
|
|
658
|
+
expanded_body = exp.expand(
|
|
659
|
+
body,
|
|
660
|
+
expand_sources, # type: ignore
|
|
661
|
+
dialect=self.DIALECT,
|
|
662
|
+
copy=True,
|
|
663
|
+
)
|
|
664
|
+
except Exception:
|
|
665
|
+
expanded_body = body
|
|
666
|
+
try:
|
|
667
|
+
qualified_body = qualify(
|
|
668
|
+
expanded_body,
|
|
669
|
+
dialect=self.DIALECT,
|
|
670
|
+
schema=schema,
|
|
671
|
+
validate_qualify_columns=False,
|
|
672
|
+
identify=False,
|
|
673
|
+
)
|
|
674
|
+
body_scope = build_scope(qualified_body)
|
|
675
|
+
except Exception as _qualify_exc:
|
|
676
|
+
out.errors.append(
|
|
677
|
+
f"col_lineage_skip:qualify_failed:{type(_qualify_exc).__name__}"
|
|
678
|
+
)
|
|
679
|
+
# Schema-free retry: still yields a scope for the copy=False path.
|
|
680
|
+
try:
|
|
681
|
+
qualified_body = qualify(
|
|
682
|
+
expanded_body,
|
|
683
|
+
dialect=self.DIALECT,
|
|
684
|
+
validate_qualify_columns=False,
|
|
685
|
+
identify=False,
|
|
686
|
+
)
|
|
687
|
+
body_scope = build_scope(qualified_body)
|
|
688
|
+
except Exception:
|
|
689
|
+
body_scope = None
|
|
690
|
+
|
|
691
|
+
# INSERT positional column-list mapping (#25 fix).
|
|
692
|
+
# Compute the positional_col_names skip-set BEFORE the main column loop
|
|
693
|
+
# so the main loop can skip positions already handled here.
|
|
694
|
+
#
|
|
695
|
+
# When an INSERT has an explicit column list (INSERT INTO t (c1, c2) SELECT ...),
|
|
696
|
+
# the target column name at position idx is authoritative — the SELECT alias is
|
|
697
|
+
# cosmetic for the SELECT and meaningless to the INSERT target. This block
|
|
698
|
+
# overrides alias attribution for ALL positions (aliased or not).
|
|
699
|
+
#
|
|
700
|
+
# Guards applied here mirror the main column loop to preserve skip markers:
|
|
701
|
+
# - Star expressions → emit col_lineage_skip:star, register pos, skip sg_lineage
|
|
702
|
+
# - Pure-literal (no Column descendant) → register pos, skip sg_lineage (silent)
|
|
703
|
+
# - Unaliased non-Column (func/arith/CASE) → emit col_lineage_skip:func_fallback,
|
|
704
|
+
# register pos, skip sg_lineage
|
|
705
|
+
# - Plain Column / aliased expression → call sg_lineage (the #25 happy path)
|
|
706
|
+
#
|
|
707
|
+
# CLAUDE.md invariant: body_no_with = body.copy() + strip-WITH happens ONCE
|
|
708
|
+
# before the inner loop; only the single projection is swapped per column.
|
|
709
|
+
positional_col_names: dict[int, str] = {} # idx → insert_col_name
|
|
710
|
+
if isinstance(stmt, exp.Insert) and isinstance(stmt.this, exp.Schema):
|
|
711
|
+
insert_cols_list = [c.name for c in stmt.this.expressions]
|
|
712
|
+
# Build the WITH-stripped body ONCE here, before any per-column loop.
|
|
713
|
+
# Only the single projection is swapped per column below.
|
|
714
|
+
body_no_with = body.copy()
|
|
715
|
+
body_no_with.set("with_", None)
|
|
716
|
+
for _ins_idx, _col_expr in enumerate(col_expressions):
|
|
717
|
+
if _ins_idx >= len(insert_cols_list):
|
|
718
|
+
break
|
|
719
|
+
_insert_col = insert_cols_list[_ins_idx]
|
|
720
|
+
if not _insert_col:
|
|
721
|
+
continue
|
|
722
|
+
# Register position first so the main loop always skips it,
|
|
723
|
+
# regardless of which guard fires below.
|
|
724
|
+
positional_col_names[_ins_idx] = _insert_col
|
|
725
|
+
|
|
726
|
+
# Guard 1: Star projection — emit skip marker (same as main loop).
|
|
727
|
+
_inner_for_guard = (
|
|
728
|
+
_col_expr.this if isinstance(_col_expr, exp.Alias) else _col_expr
|
|
729
|
+
)
|
|
730
|
+
if isinstance(_inner_for_guard, exp.Star) or (
|
|
731
|
+
isinstance(_inner_for_guard, exp.Column)
|
|
732
|
+
and isinstance(_inner_for_guard.this, exp.Star)
|
|
733
|
+
):
|
|
734
|
+
_qualifier = (
|
|
735
|
+
_inner_for_guard.table
|
|
736
|
+
if isinstance(_inner_for_guard, exp.Column)
|
|
737
|
+
else None
|
|
738
|
+
)
|
|
739
|
+
out.errors.append(f"col_lineage_skip:star:{_qualifier or '<unqualified>'}")
|
|
740
|
+
continue # no sg_lineage for star
|
|
741
|
+
|
|
742
|
+
# Guard 2: Pure-literal — no Column descendants, nothing to trace.
|
|
743
|
+
if not list(_col_expr.find_all(exp.Column)):
|
|
744
|
+
continue # silent skip, no sg_lineage
|
|
745
|
+
|
|
746
|
+
# NOTE: do NOT emit func_fallback here for unaliased non-Column
|
|
747
|
+
# expressions (functions, arithmetic, CASE …). The main loop emits
|
|
748
|
+
# func_fallback for such expressions because a plain SELECT/CREATE VIEW
|
|
749
|
+
# gives them no output column name. The positional INSERT column list
|
|
750
|
+
# DOES supply that name (_insert_col): below we wrap the expression as
|
|
751
|
+
# Alias(expr, _insert_col) and let sg_lineage trace through it — exactly
|
|
752
|
+
# as the aliased form (e.g. `DATE(col) AS a`) already resolves. Guard 2
|
|
753
|
+
# (above) already dropped genuinely-untraceable pure-literal expressions
|
|
754
|
+
# (no Column descendant). Skipping column-containing expressions here would
|
|
755
|
+
# make the #25 positional feature do its work and then discard the result,
|
|
756
|
+
# dropping real lineage edges (regressed by eb19f29; broke COALESCE).
|
|
757
|
+
|
|
758
|
+
# Positional mapping always wins — replace (or add) the alias with the
|
|
759
|
+
# INSERT target column name regardless of SELECT alias.
|
|
760
|
+
if _col_expr.alias and _col_expr.alias != _insert_col:
|
|
761
|
+
self._log.debug(
|
|
762
|
+
"INSERT positional override: SELECT alias %r → INSERT col %r"
|
|
763
|
+
" at position %d",
|
|
764
|
+
_col_expr.alias,
|
|
765
|
+
_insert_col,
|
|
766
|
+
_ins_idx,
|
|
767
|
+
)
|
|
768
|
+
# If the expression is already an Alias(inner, old_alias), unwrap it
|
|
769
|
+
# before re-wrapping — otherwise we produce Alias(Alias(inner, x), c1)
|
|
770
|
+
# which serialises as "inner AS x AS c1" (syntax error).
|
|
771
|
+
_inner = _col_expr.this if isinstance(_col_expr, exp.Alias) else _col_expr
|
|
772
|
+
_aliased = exp.Alias(this=_inner.copy(), alias=_insert_col)
|
|
773
|
+
body_no_with.set("expressions", [_aliased])
|
|
774
|
+
_patched_sql = body_no_with.sql(dialect=self.DIALECT)
|
|
775
|
+
# Pass sources= (not scope=) here: the patched SQL is a freshly
|
|
776
|
+
# serialised string — the scope was built from the original body AST
|
|
777
|
+
# and does not correspond to this new string.
|
|
778
|
+
#
|
|
779
|
+
# Use `sources` (the cross-statement temp/CTAS map), NOT
|
|
780
|
+
# `combined_sources`. combined_sources additionally carries the
|
|
781
|
+
# SAME-STATEMENT CTE bodies (added above). Since body_no_with strips
|
|
782
|
+
# the WITH clause from the patched SQL, those CTE names become opaque
|
|
783
|
+
# source relations — passing their bodies as sources= would expand them
|
|
784
|
+
# away, collapsing intermediate CTE→target hops into the deepest source
|
|
785
|
+
# (regressed by eb19f29; broke the MA_AANTAL_OP_ORDER anchor link 5).
|
|
786
|
+
# Cross-statement temps (e.g. CREATE TEMP TABLE t) live in `sources`
|
|
787
|
+
# and SHOULD still expand (E36 multi-temp: t → src).
|
|
788
|
+
try:
|
|
789
|
+
_root = sg_lineage(
|
|
790
|
+
_insert_col,
|
|
791
|
+
_patched_sql,
|
|
792
|
+
dialect=self.DIALECT,
|
|
793
|
+
sources=sources or {},
|
|
794
|
+
)
|
|
795
|
+
if _root:
|
|
796
|
+
_new_edges = self._lineage_node_to_edges(
|
|
797
|
+
_root,
|
|
798
|
+
dst_col_name=_insert_col,
|
|
799
|
+
dst_table=dst_table,
|
|
800
|
+
path=path,
|
|
801
|
+
out=out,
|
|
802
|
+
)
|
|
803
|
+
edges.extend(_new_edges)
|
|
804
|
+
except Exception:
|
|
805
|
+
pass
|
|
806
|
+
|
|
807
|
+
# Extract output columns — skip positions handled by the positional INSERT block
|
|
808
|
+
for loop_idx, col_expr in enumerate(col_expressions):
|
|
809
|
+
if loop_idx in positional_col_names:
|
|
810
|
+
continue # positional INSERT block already emitted this column
|
|
649
811
|
# Skip star projections — sg_lineage requires a concrete column name.
|
|
650
812
|
if isinstance(col_expr, exp.Star) or (
|
|
651
813
|
isinstance(col_expr, exp.Column) and isinstance(col_expr.this, exp.Star)
|
|
@@ -723,53 +885,23 @@ class SqlParser(ABC):
|
|
|
723
885
|
continue
|
|
724
886
|
|
|
725
887
|
try:
|
|
726
|
-
# Build scope on first column for reuse across all columns (T-05 optimization).
|
|
727
|
-
# NOTE: We build body_scope locally from the extracted body rather than
|
|
728
|
-
# using a pre-built scope from the statement, because CREATE/INSERT statements
|
|
729
|
-
# have their scope rooted at the outer statement, but the body passed here
|
|
730
|
-
# is the inner SELECT. Reusing the outer scope would produce incorrect
|
|
731
|
-
# qualification. The pre-built scope from parse_file would only be useful
|
|
732
|
-
# if we had a mechanism to extract the matching inner scope, which is
|
|
733
|
-
# complex and not yet implemented (see sprint_06 T-05 deviation for details).
|
|
734
|
-
if body_scope is None and scope is None:
|
|
735
|
-
try:
|
|
736
|
-
# Expand only file-level sources (CTEs, temp tables, CTAS bodies).
|
|
737
|
-
expanded_body = body
|
|
738
|
-
expand_sources = {
|
|
739
|
-
k: v for k, v in (sources or {}).items() if isinstance(v, exp.Query)
|
|
740
|
-
}
|
|
741
|
-
if expand_sources:
|
|
742
|
-
expanded_body = exp.expand(
|
|
743
|
-
body,
|
|
744
|
-
expand_sources, # type: ignore
|
|
745
|
-
dialect=self.DIALECT,
|
|
746
|
-
copy=True,
|
|
747
|
-
)
|
|
748
|
-
|
|
749
|
-
# Qualify the expanded body to prepare for scope building
|
|
750
|
-
qualified_body = qualify(
|
|
751
|
-
expanded_body,
|
|
752
|
-
dialect=self.DIALECT,
|
|
753
|
-
schema=schema,
|
|
754
|
-
validate_qualify_columns=False,
|
|
755
|
-
identify=False,
|
|
756
|
-
)
|
|
757
|
-
body_scope = build_scope(qualified_body)
|
|
758
|
-
except Exception as _qualify_exc:
|
|
759
|
-
# qualify() failure is non-fatal: sg_lineage falls back to
|
|
760
|
-
# its own qualification. Record for observability.
|
|
761
|
-
out.errors.append(
|
|
762
|
-
f"col_lineage_skip:qualify_failed:{type(_qualify_exc).__name__}:{_qualify_exc}"
|
|
763
|
-
)
|
|
764
|
-
body_scope = None
|
|
765
|
-
|
|
766
888
|
# When a scope is available it embeds full column→table resolution.
|
|
767
889
|
# On the qualify-failed fallback path (no scope), pass only the small
|
|
768
890
|
# set of file-level sources so sg_lineage can resolve CTEs/CTAS bodies.
|
|
769
891
|
active_scope = scope if scope is not None else body_scope
|
|
770
892
|
sg_kwargs: dict = {"dialect": self.DIALECT}
|
|
771
893
|
if active_scope is not None:
|
|
894
|
+
# scope= path: the pre-built scope already embeds full
|
|
895
|
+
# column→table resolution. copy=False + trim_selects=False
|
|
896
|
+
# suppress sqlglot's per-call AST deepcopy and per-column
|
|
897
|
+
# trim — neither is needed when the scope is built once and
|
|
898
|
+
# reused across all columns. Dropping these (regressed in
|
|
899
|
+
# 4234e5d) makes lineage() deepcopy the whole scope per
|
|
900
|
+
# column → O(columns × scope_size) (measured: 3.2M deepcopy
|
|
901
|
+
# calls / ~3.8s on a 359-line file).
|
|
772
902
|
sg_kwargs["scope"] = active_scope
|
|
903
|
+
sg_kwargs["copy"] = False
|
|
904
|
+
sg_kwargs["trim_selects"] = False
|
|
773
905
|
else:
|
|
774
906
|
sg_kwargs["sources"] = sources or {}
|
|
775
907
|
root = sg_lineage(col_name, body, **sg_kwargs)
|
|
@@ -905,42 +1037,6 @@ class SqlParser(ABC):
|
|
|
905
1037
|
cte_col_name,
|
|
906
1038
|
)
|
|
907
1039
|
|
|
908
|
-
# INSERT column-list aliasing (T-07-02 link 5).
|
|
909
|
-
# When an INSERT has an explicit column list and the SELECT expression has
|
|
910
|
-
# no alias (e.g. SELECT SUM(x) FROM cte), the INSERT column at the same
|
|
911
|
-
# position provides the destination col name. Stripping the WITH clause
|
|
912
|
-
# stops sg_lineage at the CTE name boundary (doesn't expand into bodies).
|
|
913
|
-
if isinstance(stmt, exp.Insert) and isinstance(stmt.this, exp.Schema):
|
|
914
|
-
insert_cols = [c.name for c in stmt.this.expressions]
|
|
915
|
-
for idx, col_expr in enumerate(col_expressions):
|
|
916
|
-
if idx >= len(insert_cols):
|
|
917
|
-
break
|
|
918
|
-
if col_expr.alias:
|
|
919
|
-
continue # already handled by the main col loop
|
|
920
|
-
insert_col = insert_cols[idx]
|
|
921
|
-
if not insert_col:
|
|
922
|
-
continue
|
|
923
|
-
# Build a patched SELECT: strip WITH, alias the expression with the
|
|
924
|
-
# INSERT column name so sg_lineage can trace it.
|
|
925
|
-
body_no_with = body.copy()
|
|
926
|
-
body_no_with.set("with_", None)
|
|
927
|
-
aliased = exp.Alias(this=col_expr.copy(), alias=insert_col)
|
|
928
|
-
body_no_with.set("expressions", [aliased])
|
|
929
|
-
patched_sql = body_no_with.sql(dialect=self.DIALECT)
|
|
930
|
-
try:
|
|
931
|
-
root = sg_lineage(insert_col, patched_sql, dialect=self.DIALECT)
|
|
932
|
-
if root:
|
|
933
|
-
new_edges = self._lineage_node_to_edges(
|
|
934
|
-
root,
|
|
935
|
-
dst_col_name=insert_col,
|
|
936
|
-
dst_table=dst_table,
|
|
937
|
-
path=path,
|
|
938
|
-
out=out,
|
|
939
|
-
)
|
|
940
|
-
edges.extend(new_edges)
|
|
941
|
-
except Exception:
|
|
942
|
-
pass
|
|
943
|
-
|
|
944
1040
|
except Exception as exc:
|
|
945
1041
|
self._log.debug(
|
|
946
1042
|
"column lineage extraction failed for entire statement: file=%s error=%s",
|
sqlcg/server/server.py
CHANGED
|
@@ -1,35 +1,76 @@
|
|
|
1
1
|
"""MCP server for SQL Code Graph.
|
|
2
2
|
|
|
3
3
|
Exposes FastMCP tools for lineage queries, pattern search, and indexing.
|
|
4
|
-
MCP protocol uses stdout for message transport
|
|
5
|
-
|
|
4
|
+
MCP protocol uses stdout (fd 1) for JSON-RPC message transport. This module
|
|
5
|
+
captures fd 1 as a raw binary buffer BEFORE any logging redirection so that
|
|
6
|
+
the captured buffer can be passed explicitly to stdio_server(). This ensures
|
|
7
|
+
JSON-RPC frames always go to fd 1 regardless of what sys.stdout points to
|
|
8
|
+
at call time.
|
|
9
|
+
|
|
10
|
+
Ordering invariant (must not change):
|
|
11
|
+
1. os.dup(1) → _real_stdout_buffer (first — before everything)
|
|
12
|
+
2. from mcp.server import FastMCP (module-level import)
|
|
13
|
+
3. mcp = FastMCP("SQL Code Graph") (module-level; tools.py registers here)
|
|
14
|
+
4. main() calls _configure_mcp_logging() (not at module scope)
|
|
6
15
|
"""
|
|
7
16
|
|
|
17
|
+
import os
|
|
8
18
|
import sys
|
|
9
19
|
|
|
10
|
-
|
|
11
|
-
|
|
20
|
+
# Capture the real fd 1 binary stream FIRST — before _configure_mcp_logging()
|
|
21
|
+
# (which replaces sys.stdout) AND before FastMCP("SQL Code Graph") construction.
|
|
22
|
+
# stdio_server() receives this explicitly so JSON-RPC frames go to fd 1
|
|
23
|
+
# regardless of what sys.stdout points to afterward.
|
|
24
|
+
# Guards against the v1.0.0/v1.0.1 regression where frames went to fd 2.
|
|
25
|
+
_real_stdout_buffer = os.fdopen(os.dup(1), "wb", buffering=0)
|
|
12
26
|
|
|
13
|
-
from
|
|
27
|
+
from dotenv import load_dotenv # noqa: E402
|
|
28
|
+
from mcp.server import FastMCP # noqa: E402
|
|
29
|
+
|
|
30
|
+
from sqlcg.utils.logging import getLogger # noqa: E402
|
|
14
31
|
|
|
15
32
|
logger = getLogger(__name__)
|
|
16
33
|
|
|
34
|
+
# Create FastMCP instance at module scope so tools.py can import and register with it.
|
|
35
|
+
# This is safe because _real_stdout_buffer has already captured fd 1 above.
|
|
36
|
+
mcp = FastMCP("SQL Code Graph")
|
|
37
|
+
|
|
17
38
|
|
|
18
39
|
def _configure_mcp_logging() -> None:
|
|
19
|
-
"""Redirect sys.stdout to sys.stderr
|
|
40
|
+
"""Redirect sys.stdout to sys.stderr and configure logging to stderr.
|
|
20
41
|
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
|
|
42
|
+
sys.stdout is replaced with sys.stderr so that any stray print() call
|
|
43
|
+
does not pollute fd 1 (reserved for MCP JSON-RPC frames).
|
|
44
|
+
The real fd 1 binary stream is captured in _real_stdout_buffer at module
|
|
45
|
+
top before this replacement and passed explicitly to stdio_server().
|
|
46
|
+
|
|
47
|
+
Must be called inside main(), not at module scope, so that
|
|
48
|
+
_real_stdout_buffer captures fd 1 before the redirect.
|
|
24
49
|
"""
|
|
50
|
+
import logging
|
|
51
|
+
|
|
25
52
|
sys.stdout = sys.stderr
|
|
53
|
+
logging.basicConfig(stream=sys.stderr, level=logging.WARNING)
|
|
26
54
|
|
|
27
55
|
|
|
28
|
-
|
|
29
|
-
|
|
56
|
+
async def _run_stdio_async_with_real_stdout() -> None:
|
|
57
|
+
"""Run the MCP server loop with JSON-RPC frames explicitly on fd 1.
|
|
30
58
|
|
|
31
|
-
|
|
32
|
-
|
|
59
|
+
Bypasses FastMCP.run_stdio_async() (which uses sys.stdout at call time)
|
|
60
|
+
and drives the server loop directly with the captured _real_stdout_buffer.
|
|
61
|
+
"""
|
|
62
|
+
from io import TextIOWrapper
|
|
63
|
+
|
|
64
|
+
import anyio
|
|
65
|
+
from mcp.server.stdio import stdio_server
|
|
66
|
+
|
|
67
|
+
stdout_text = TextIOWrapper(_real_stdout_buffer, encoding="utf-8", line_buffering=False)
|
|
68
|
+
async with stdio_server(stdout=anyio.wrap_file(stdout_text)) as (read_stream, write_stream):
|
|
69
|
+
await mcp._mcp_server.run(
|
|
70
|
+
read_stream,
|
|
71
|
+
write_stream,
|
|
72
|
+
mcp._mcp_server.create_initialization_options(),
|
|
73
|
+
)
|
|
33
74
|
|
|
34
75
|
|
|
35
76
|
def main(db_path: str | None = None) -> None:
|
|
@@ -38,10 +79,13 @@ def main(db_path: str | None = None) -> None:
|
|
|
38
79
|
Args:
|
|
39
80
|
db_path: Path to KùzuDB database. If None, uses SQLCG_DB_PATH env var
|
|
40
81
|
or ~/.sqlcg/graph.db (via get_db_path in tools module).
|
|
41
|
-
|
|
42
|
-
Raises:
|
|
43
|
-
RuntimeError: If tools fail to initialize or FastMCP server fails.
|
|
44
82
|
"""
|
|
83
|
+
import anyio
|
|
84
|
+
|
|
85
|
+
# Must be first — redirects sys.stdout → sys.stderr so stray prints don't
|
|
86
|
+
# corrupt fd 1. _real_stdout_buffer was already captured at module top.
|
|
87
|
+
_configure_mcp_logging()
|
|
88
|
+
|
|
45
89
|
load_dotenv()
|
|
46
90
|
|
|
47
91
|
# Import tools module to trigger tool registration via @mcp.tool() decorators
|
|
@@ -50,8 +94,7 @@ def main(db_path: str | None = None) -> None:
|
|
|
50
94
|
# Initialize the backend singleton used by all tools
|
|
51
95
|
sqlcg.server.tools.init_backend(db_path)
|
|
52
96
|
|
|
53
|
-
# Run the MCP server event loop, ensuring backend is closed on shutdown
|
|
54
97
|
try:
|
|
55
|
-
|
|
98
|
+
anyio.run(_run_stdio_async_with_real_stdout)
|
|
56
99
|
finally:
|
|
57
100
|
sqlcg.server.tools.shutdown_backend()
|
sqlcg/server/tools.py
CHANGED
|
@@ -183,6 +183,19 @@ def _assert_indexed(db: GraphBackend) -> None:
|
|
|
183
183
|
)
|
|
184
184
|
|
|
185
185
|
|
|
186
|
+
def _bare_ref(ref: str) -> str:
|
|
187
|
+
"""Strip schema prefix from a ref string, keeping table.column.
|
|
188
|
+
|
|
189
|
+
For a 3-part ref ("mart.fact_t.amount") this returns "fact_t.amount".
|
|
190
|
+
For a 2-part ref ("fact_t.amount") this returns the ref unchanged.
|
|
191
|
+
Never uses rsplit — that would yield only the column name for 3-part refs.
|
|
192
|
+
"""
|
|
193
|
+
parts = ref.split(".")
|
|
194
|
+
if len(parts) >= 3:
|
|
195
|
+
return ".".join(parts[1:]) # drop schema, keep table.column
|
|
196
|
+
return ref # already bare (no schema prefix)
|
|
197
|
+
|
|
198
|
+
|
|
186
199
|
def _parse_column_ref(col_ref: str) -> tuple[str, str]:
|
|
187
200
|
"""Parse column reference "table.column" or "catalog.db.table.column".
|
|
188
201
|
|
|
@@ -554,9 +567,54 @@ def trace_column_lineage(table_col: str, max_depth: int | None = None) -> Lineag
|
|
|
554
567
|
|
|
555
568
|
mermaid = _build_mermaid(col_id, edges) if edges else None
|
|
556
569
|
|
|
570
|
+
# Bare-name fallback: when the primary query returns empty and the ref has a
|
|
571
|
+
# schema component (3+ parts), retry with the schema prefix stripped.
|
|
572
|
+
# This handles unqualified INSERT targets indexed without a schema prefix.
|
|
573
|
+
bare_fallback_used = False
|
|
574
|
+
if not lineage and len(table_col.split(".")) >= 3:
|
|
575
|
+
bare = _bare_ref(table_col)
|
|
576
|
+
bare_queue: deque[tuple[str, int]] = deque([(bare, 0)])
|
|
577
|
+
bare_visited: set[str] = set()
|
|
578
|
+
bare_emitted: set[str] = set()
|
|
579
|
+
while bare_queue:
|
|
580
|
+
current_id, depth = bare_queue.popleft()
|
|
581
|
+
if current_id in bare_visited or (max_depth is not None and depth > max_depth):
|
|
582
|
+
continue
|
|
583
|
+
if len(bare_visited) >= max_nodes:
|
|
584
|
+
break
|
|
585
|
+
bare_visited.add(current_id)
|
|
586
|
+
rows = db.run_read(TRACE_COLUMN_LINEAGE_QUERY, {"id": current_id})
|
|
587
|
+
for row in rows:
|
|
588
|
+
node_id = row["id"]
|
|
589
|
+
edges.append((node_id, current_id, row.get("transform") or "SELECT"))
|
|
590
|
+
if node_id not in bare_visited and node_id not in bare_emitted:
|
|
591
|
+
bare_emitted.add(node_id)
|
|
592
|
+
lineage.append(
|
|
593
|
+
LineageNode(
|
|
594
|
+
name=row.get("col_name", ""),
|
|
595
|
+
kind="column",
|
|
596
|
+
table=row.get("table_qualified"),
|
|
597
|
+
file=None,
|
|
598
|
+
confidence=row.get("confidence"),
|
|
599
|
+
)
|
|
600
|
+
)
|
|
601
|
+
if node_id not in bare_visited:
|
|
602
|
+
bare_queue.append((node_id, depth + 1))
|
|
603
|
+
if lineage:
|
|
604
|
+
bare_fallback_used = True
|
|
605
|
+
mermaid = _build_mermaid(bare, edges) if edges else None
|
|
606
|
+
|
|
557
607
|
# Populate hint if result is empty (Step 4.1)
|
|
558
608
|
hint = None
|
|
559
|
-
if
|
|
609
|
+
if bare_fallback_used:
|
|
610
|
+
bare = _bare_ref(table_col)
|
|
611
|
+
hint = (
|
|
612
|
+
f"No results for '{table_col}'. Found lineage under bare name '{bare}'. "
|
|
613
|
+
"The INSERT target may have been indexed without a schema prefix. "
|
|
614
|
+
"Multiple tables with the same unqualified name in different schemas "
|
|
615
|
+
"would all match — re-index with an explicit schema for precise results."
|
|
616
|
+
)
|
|
617
|
+
elif not lineage:
|
|
560
618
|
hint = (
|
|
561
619
|
"No lineage found. Ensure the column reference includes the schema prefix "
|
|
562
620
|
"(e.g., ba.table_name.column_name). Check that 'sqlcg db info' shows "
|
sqlcg/utils/ignore.py
CHANGED
|
@@ -14,6 +14,7 @@ def load_ignore_spec(root: Path) -> pathspec.PathSpec:
|
|
|
14
14
|
Returns:
|
|
15
15
|
PathSpec object for matching ignore patterns
|
|
16
16
|
"""
|
|
17
|
+
root = Path(root).resolve() # guard: caller may pass a relative path (e.g. Path("."))
|
|
17
18
|
ignore_file = root / ".sqlcgignore"
|
|
18
19
|
if ignore_file.exists():
|
|
19
20
|
patterns = ignore_file.read_text().splitlines()
|
|
@@ -33,4 +34,5 @@ def is_ignored(path: Path, root: Path, spec: pathspec.PathSpec) -> bool:
|
|
|
33
34
|
Returns:
|
|
34
35
|
True if the path matches any ignore pattern
|
|
35
36
|
"""
|
|
37
|
+
root = Path(root).resolve() # guard: ensure root is absolute before relative_to()
|
|
36
38
|
return spec.match_file(str(path.relative_to(root)))
|
|
File without changes
|
|
File without changes
|