sql-code-graph 1.1.0__py3-none-any.whl → 1.2.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {sql_code_graph-1.1.0.dist-info → sql_code_graph-1.2.2.dist-info}/METADATA +11 -1
- {sql_code_graph-1.1.0.dist-info → sql_code_graph-1.2.2.dist-info}/RECORD +19 -18
- {sql_code_graph-1.1.0.dist-info → sql_code_graph-1.2.2.dist-info}/WHEEL +1 -1
- sqlcg/__init__.py +1 -1
- sqlcg/cli/commands/analyze.py +156 -134
- sqlcg/cli/commands/db.py +92 -86
- sqlcg/cli/commands/find.py +30 -33
- sqlcg/cli/commands/gain.py +13 -11
- sqlcg/core/config.py +35 -5
- sqlcg/core/kuzu_backend.py +4 -1
- sqlcg/core/queries.cypher +0 -6
- sqlcg/core/queries.py +0 -1
- sqlcg/indexer/indexer.py +109 -11
- sqlcg/lineage/aggregator.py +17 -45
- sqlcg/parsers/ansi_parser.py +2 -2
- sqlcg/parsers/base.py +7 -1
- sqlcg/server/read_client.py +192 -0
- sqlcg/server/server.py +97 -18
- {sql_code_graph-1.1.0.dist-info → sql_code_graph-1.2.2.dist-info}/entry_points.txt +0 -0
sqlcg/cli/commands/db.py
CHANGED
|
@@ -10,6 +10,7 @@ from rich.console import Console
|
|
|
10
10
|
from sqlcg.core.config import get_backend, get_db_path
|
|
11
11
|
from sqlcg.core.freshness import compute_freshness, render_freshness_line
|
|
12
12
|
from sqlcg.core.schema import NodeLabel
|
|
13
|
+
from sqlcg.server.read_client import run_read_routed
|
|
13
14
|
from sqlcg.utils.logging import getLogger
|
|
14
15
|
|
|
15
16
|
logger = getLogger(__name__)
|
|
@@ -75,107 +76,112 @@ def db_reset( # noqa: B008
|
|
|
75
76
|
@app.command("info")
|
|
76
77
|
def db_info() -> None:
|
|
77
78
|
"""Show database stats."""
|
|
78
|
-
|
|
79
|
-
|
|
80
|
-
|
|
81
|
-
|
|
82
|
-
|
|
79
|
+
# db info is a read-only command. All Cypher reads route through the live
|
|
80
|
+
# server (run_read_routed) to avoid "Database is locked" while the MCP server
|
|
81
|
+
# holds the write lock. get_schema_version / get_indexed_sha are inlined as
|
|
82
|
+
# run_read_routed calls using their known Cypher so they too route through the
|
|
83
|
+
# socket when a server is live; this avoids a direct-open that would hit the lock.
|
|
84
|
+
|
|
85
|
+
# Schema version
|
|
86
|
+
schema_rows = run_read_routed("MATCH (v:SchemaVersion) RETURN v.version AS version LIMIT 1", {})
|
|
87
|
+
version = (schema_rows[0]["version"] if schema_rows else None) or "unknown"
|
|
88
|
+
console.print(f"Schema version: {version}")
|
|
89
|
+
|
|
90
|
+
# Freshness block — only shown when the DB has been indexed from a git repo
|
|
91
|
+
try:
|
|
92
|
+
sha_rows = run_read_routed(
|
|
93
|
+
"MATCH (v:SchemaVersion) RETURN v.indexed_sha AS sha LIMIT 1", {}
|
|
94
|
+
)
|
|
95
|
+
indexed_sha = sha_rows[0]["sha"] if sha_rows else None
|
|
96
|
+
repo_rows = run_read_routed("MATCH (r:Repo) RETURN r.path AS path LIMIT 1", {})
|
|
97
|
+
if repo_rows and indexed_sha is not None and repo_rows[0].get("path"):
|
|
98
|
+
repo_root = Path(repo_rows[0]["path"])
|
|
99
|
+
f = compute_freshness(repo_root, indexed_sha)
|
|
100
|
+
console.print(render_freshness_line(f))
|
|
101
|
+
except NotImplementedError:
|
|
102
|
+
# Neo4j backend raises NotImplementedError for get_indexed_sha — skip silently
|
|
103
|
+
pass
|
|
104
|
+
except Exception as e:
|
|
105
|
+
# Any unexpected error in the freshness block must not crash db info
|
|
106
|
+
logger.debug(f"Freshness check skipped: {e}")
|
|
107
|
+
|
|
108
|
+
# Show node counts for all labels
|
|
109
|
+
for label in NodeLabel:
|
|
83
110
|
try:
|
|
84
|
-
|
|
85
|
-
|
|
86
|
-
|
|
87
|
-
repo_root = Path(repo_rows[0]["path"])
|
|
88
|
-
f = compute_freshness(repo_root, indexed_sha)
|
|
89
|
-
console.print(render_freshness_line(f))
|
|
90
|
-
except NotImplementedError:
|
|
91
|
-
# Neo4j backend raises NotImplementedError for get_indexed_sha — skip silently
|
|
92
|
-
pass
|
|
111
|
+
result = run_read_routed(f"MATCH (n:{label}) RETURN COUNT(*) AS count", {})
|
|
112
|
+
count = result[0]["count"] if result else 0
|
|
113
|
+
console.print(f" {label}: {count}")
|
|
93
114
|
except Exception as e:
|
|
94
|
-
#
|
|
95
|
-
logger.
|
|
96
|
-
|
|
97
|
-
|
|
98
|
-
|
|
99
|
-
|
|
100
|
-
|
|
101
|
-
|
|
102
|
-
|
|
103
|
-
|
|
104
|
-
|
|
105
|
-
|
|
106
|
-
|
|
107
|
-
|
|
108
|
-
|
|
109
|
-
|
|
110
|
-
|
|
111
|
-
|
|
112
|
-
|
|
113
|
-
|
|
114
|
-
"[red]Database is empty. Run 'sqlcg db init' and 'sqlcg index <path>' first.[/red]"
|
|
115
|
+
# Log unexpected exceptions instead of silently skipping
|
|
116
|
+
logger.error(f"Error getting count for {label}: {e}")
|
|
117
|
+
console.print(f" [red]{label}: error[/red]")
|
|
118
|
+
|
|
119
|
+
# Health check section
|
|
120
|
+
repo_count_result = run_read_routed("MATCH (n:Repo) RETURN COUNT(n) AS count", {})
|
|
121
|
+
repo_count = repo_count_result[0]["count"] if repo_count_result else 0
|
|
122
|
+
|
|
123
|
+
if repo_count == 0:
|
|
124
|
+
console.print( # noqa: E501
|
|
125
|
+
"[red]Database is empty. Run 'sqlcg db init' and 'sqlcg index <path>' first.[/red]"
|
|
126
|
+
)
|
|
127
|
+
else:
|
|
128
|
+
query_count_result = run_read_routed("MATCH (n:SqlQuery) RETURN COUNT(n) AS count", {})
|
|
129
|
+
query_count = query_count_result[0]["count"] if query_count_result else 0
|
|
130
|
+
|
|
131
|
+
if query_count == 0:
|
|
132
|
+
console.print(
|
|
133
|
+
"[yellow]No queries indexed. Run 'sqlcg index <path>' to populate "
|
|
134
|
+
"the graph.[/yellow]"
|
|
115
135
|
)
|
|
116
136
|
else:
|
|
117
|
-
|
|
118
|
-
|
|
137
|
+
col_count_result = run_read_routed("MATCH (n:SqlColumn) RETURN COUNT(n) AS count", {})
|
|
138
|
+
col_count = col_count_result[0]["count"] if col_count_result else 0
|
|
119
139
|
|
|
120
|
-
if
|
|
140
|
+
if col_count == 0:
|
|
121
141
|
console.print(
|
|
122
|
-
"[yellow]
|
|
123
|
-
"
|
|
142
|
+
"[yellow]Column lineage not available. Tools trace_column_lineage, "
|
|
143
|
+
"get_downstream_dependencies, and get_upstream_dependencies "
|
|
144
|
+
"will return empty results.[/yellow]"
|
|
124
145
|
)
|
|
125
|
-
else:
|
|
126
|
-
col_count_result = backend.run_read(
|
|
127
|
-
"MATCH (n:SqlColumn) RETURN COUNT(n) AS count", {}
|
|
128
|
-
)
|
|
129
|
-
col_count = col_count_result[0]["count"] if col_count_result else 0
|
|
130
|
-
|
|
131
|
-
if col_count == 0:
|
|
132
|
-
console.print(
|
|
133
|
-
"[yellow]Column lineage not available. Tools trace_column_lineage, "
|
|
134
|
-
"get_downstream_dependencies, and get_upstream_dependencies "
|
|
135
|
-
"will return empty results.[/yellow]"
|
|
136
|
-
)
|
|
137
|
-
|
|
138
|
-
# Print COLUMN_LINEAGE edges count
|
|
139
|
-
edges_result = backend.run_read(
|
|
140
|
-
"MATCH ()-[r:COLUMN_LINEAGE]->() RETURN COUNT(r) AS count", {}
|
|
141
|
-
)
|
|
142
|
-
edges_count = edges_result[0]["count"] if edges_result else 0
|
|
143
|
-
console.print(f" COLUMN_LINEAGE edges: {edges_count}")
|
|
144
146
|
|
|
145
|
-
|
|
146
|
-
|
|
147
|
+
# Print COLUMN_LINEAGE edges count
|
|
148
|
+
edges_result = run_read_routed("MATCH ()-[r:COLUMN_LINEAGE]->() RETURN COUNT(r) AS count", {})
|
|
149
|
+
edges_count = edges_result[0]["count"] if edges_result else 0
|
|
150
|
+
console.print(f" COLUMN_LINEAGE edges: {edges_count}")
|
|
147
151
|
|
|
148
|
-
|
|
149
|
-
|
|
150
|
-
console.print(f" STAR_SOURCE edges: {star_source_count}")
|
|
152
|
+
# Print star resolution metrics (T-07)
|
|
153
|
+
from sqlcg.core.queries import COUNT_STAR_EXPANSIONS_QUERY, COUNT_STAR_SOURCES_QUERY
|
|
151
154
|
|
|
152
|
-
|
|
153
|
-
|
|
154
|
-
|
|
155
|
+
star_source_result = run_read_routed(COUNT_STAR_SOURCES_QUERY, {})
|
|
156
|
+
star_source_count = star_source_result[0]["n"] if star_source_result else 0
|
|
157
|
+
console.print(f" STAR_SOURCE edges: {star_source_count}")
|
|
155
158
|
|
|
156
|
-
|
|
157
|
-
|
|
158
|
-
|
|
159
|
-
|
|
160
|
-
|
|
161
|
-
|
|
162
|
-
|
|
163
|
-
|
|
164
|
-
|
|
159
|
+
star_expansion_result = run_read_routed(COUNT_STAR_EXPANSIONS_QUERY, {})
|
|
160
|
+
star_expansion_count = star_expansion_result[0]["n"] if star_expansion_result else 0
|
|
161
|
+
console.print(f" STAR_EXPANSION lineage edges: {star_expansion_count}")
|
|
162
|
+
|
|
163
|
+
# Print parsing mode distribution
|
|
164
|
+
mode_query = (
|
|
165
|
+
"MATCH (q:SqlQuery) RETURN q.parsing_mode AS mode, COUNT(q) AS cnt ORDER BY cnt DESC"
|
|
166
|
+
)
|
|
167
|
+
mode_rows = run_read_routed(mode_query, {})
|
|
168
|
+
if mode_rows and "mode" in mode_rows[0]:
|
|
169
|
+
console.print("\n Parsing mode distribution:")
|
|
170
|
+
for row in mode_rows:
|
|
171
|
+
console.print(f" {row['mode']}: {row['cnt']}")
|
|
165
172
|
|
|
166
173
|
|
|
167
174
|
@app.command("list-repos")
|
|
168
175
|
def list_repos() -> None:
|
|
169
176
|
"""List all indexed repositories."""
|
|
170
|
-
|
|
171
|
-
result = backend.run_read("MATCH (r:Repo) RETURN r.path AS path, r.name AS name", {})
|
|
177
|
+
result = run_read_routed("MATCH (r:Repo) RETURN r.path AS path, r.name AS name", {})
|
|
172
178
|
|
|
173
|
-
|
|
174
|
-
|
|
175
|
-
|
|
176
|
-
|
|
179
|
+
if not result:
|
|
180
|
+
console.print("[yellow]No repositories indexed[/yellow]")
|
|
181
|
+
else:
|
|
182
|
+
from rich.table import Table
|
|
177
183
|
|
|
178
|
-
|
|
179
|
-
|
|
180
|
-
|
|
181
|
-
|
|
184
|
+
table = Table("Path", "Name")
|
|
185
|
+
for row in result:
|
|
186
|
+
table.add_row(str(row.get("path", "")), str(row.get("name", "")))
|
|
187
|
+
console.print(table)
|
sqlcg/cli/commands/find.py
CHANGED
|
@@ -4,8 +4,8 @@ import typer
|
|
|
4
4
|
from rich.console import Console
|
|
5
5
|
from rich.table import Table
|
|
6
6
|
|
|
7
|
-
from sqlcg.core.config import get_backend
|
|
8
7
|
from sqlcg.core.schema import NodeLabel
|
|
8
|
+
from sqlcg.server.read_client import run_read_routed
|
|
9
9
|
|
|
10
10
|
app = typer.Typer(help="Search the graph")
|
|
11
11
|
console = Console()
|
|
@@ -18,21 +18,20 @@ def find_table( # noqa: B008
|
|
|
18
18
|
) -> None:
|
|
19
19
|
"""Find a table by name."""
|
|
20
20
|
name = name.lower() # graph keys are lowercased at index time (C2 normalization)
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
from sqlcg.server.noise_filter import NoiseFilter
|
|
21
|
+
results = run_read_routed(
|
|
22
|
+
f"MATCH (t:{NodeLabel.TABLE}) WHERE t.qualified CONTAINS $name "
|
|
23
|
+
"RETURN t.qualified AS qualified, t.kind AS kind LIMIT 50",
|
|
24
|
+
{"name": name},
|
|
25
|
+
)
|
|
26
|
+
if not raw:
|
|
27
|
+
from sqlcg.server.noise_filter import NoiseFilter
|
|
29
28
|
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
|
|
29
|
+
nf = NoiseFilter.from_config() # repo_root=None → falls back to Path.cwd()
|
|
30
|
+
ids = [r["qualified"] for r in results]
|
|
31
|
+
kept, _ = nf.filter_nodes(ids)
|
|
32
|
+
kept_set = set(kept)
|
|
33
|
+
results = [r for r in results if r["qualified"] in kept_set]
|
|
34
|
+
_print_table(results, ["qualified", "kind"])
|
|
36
35
|
|
|
37
36
|
|
|
38
37
|
@app.command("column")
|
|
@@ -42,18 +41,17 @@ def find_column( # noqa: B008
|
|
|
42
41
|
) -> None:
|
|
43
42
|
"""Find a column by table.column reference."""
|
|
44
43
|
ref = ref.lower() # graph keys are lowercased at index time (C2 normalization)
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
from sqlcg.server.noise_filter import NoiseFilter
|
|
44
|
+
results = run_read_routed(
|
|
45
|
+
f"MATCH (c:{NodeLabel.COLUMN}) WHERE c.id CONTAINS $ref RETURN c.id AS id LIMIT 50",
|
|
46
|
+
{"ref": ref},
|
|
47
|
+
)
|
|
48
|
+
if not raw:
|
|
49
|
+
from sqlcg.server.noise_filter import NoiseFilter
|
|
52
50
|
|
|
53
|
-
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
|
|
51
|
+
nf = NoiseFilter.from_config() # repo_root=None → falls back to Path.cwd()
|
|
52
|
+
# Filter on the schema.table portion of each column id (schema.table.column)
|
|
53
|
+
results = [r for r in results if not nf.is_noise(r["id"].rsplit(".", 1)[0])]
|
|
54
|
+
_print_table(results, ["id"])
|
|
57
55
|
|
|
58
56
|
|
|
59
57
|
@app.command("pattern")
|
|
@@ -61,13 +59,12 @@ def find_pattern( # noqa: B008
|
|
|
61
59
|
pattern: str = typer.Argument(..., help="SQL pattern to search for"), # noqa: B008
|
|
62
60
|
) -> None:
|
|
63
61
|
"""Find queries containing a SQL pattern."""
|
|
64
|
-
|
|
65
|
-
|
|
66
|
-
|
|
67
|
-
|
|
68
|
-
|
|
69
|
-
|
|
70
|
-
_print_table(results, ["id", "kind"])
|
|
62
|
+
results = run_read_routed(
|
|
63
|
+
f"MATCH (q:{NodeLabel.QUERY}) WHERE q.sql CONTAINS $pattern "
|
|
64
|
+
"RETURN q.id AS id, q.kind AS kind LIMIT 50",
|
|
65
|
+
{"pattern": pattern},
|
|
66
|
+
)
|
|
67
|
+
_print_table(results, ["id", "kind"])
|
|
71
68
|
|
|
72
69
|
|
|
73
70
|
def _print_table(rows: list[dict], columns: list[str]) -> None:
|
sqlcg/cli/commands/gain.py
CHANGED
|
@@ -7,8 +7,8 @@ from pathlib import Path
|
|
|
7
7
|
import typer
|
|
8
8
|
from rich.console import Console
|
|
9
9
|
|
|
10
|
-
from sqlcg.core.config import get_backend
|
|
11
10
|
from sqlcg.metrics import store as metrics_module
|
|
11
|
+
from sqlcg.server.read_client import run_read_routed
|
|
12
12
|
from sqlcg.utils.logging import getLogger
|
|
13
13
|
|
|
14
14
|
logger = getLogger(__name__)
|
|
@@ -120,19 +120,21 @@ def gain_cmd(
|
|
|
120
120
|
)
|
|
121
121
|
execute_cypher_ratio = execute_cypher_count / total_calls if total_calls > 0 else 0
|
|
122
122
|
|
|
123
|
-
# Section F: parse quality from graph
|
|
123
|
+
# Section F: parse quality from graph.
|
|
124
|
+
# run_read_routed raises typer.Exit (Exception-derived, NOT SystemExit) on
|
|
125
|
+
# server-busy timeout, so the except-Exception block degrades gracefully
|
|
126
|
+
# (skips the parse-quality section) instead of crashing gain (WARNING 3).
|
|
124
127
|
parse_quality: dict[str, int] | None = None
|
|
125
128
|
try:
|
|
126
|
-
|
|
127
|
-
|
|
128
|
-
|
|
129
|
-
|
|
130
|
-
|
|
131
|
-
|
|
132
|
-
|
|
133
|
-
parse_quality = {str(r["mode"]): int(r["cnt"]) for r in mode_rows}
|
|
129
|
+
mode_rows = run_read_routed(
|
|
130
|
+
"MATCH (q:SqlQuery) RETURN q.parsing_mode AS mode,"
|
|
131
|
+
" COUNT(q) AS cnt ORDER BY cnt DESC",
|
|
132
|
+
{},
|
|
133
|
+
)
|
|
134
|
+
if mode_rows and "mode" in mode_rows[0]:
|
|
135
|
+
parse_quality = {str(r["mode"]): int(r["cnt"]) for r in mode_rows}
|
|
134
136
|
except Exception:
|
|
135
|
-
pass # graph not available — skip quality section
|
|
137
|
+
pass # graph not available or server busy — skip quality section
|
|
136
138
|
|
|
137
139
|
if json_output:
|
|
138
140
|
payload: dict = {
|
sqlcg/core/config.py
CHANGED
|
@@ -346,14 +346,32 @@ def get_external_consumers(path: Path) -> list[ExternalConsumerSpec]:
|
|
|
346
346
|
return []
|
|
347
347
|
|
|
348
348
|
|
|
349
|
-
def get_backend() -> "GraphBackend":
|
|
349
|
+
def get_backend(read_only: bool = False) -> "GraphBackend":
|
|
350
350
|
"""Get a graph backend instance respecting the SQLCG_BACKEND env var.
|
|
351
351
|
|
|
352
|
+
Args:
|
|
353
|
+
read_only: Open the database in read-only mode. For KuzuBackend this
|
|
354
|
+
enables multiple concurrent read-only opens (reader/reader
|
|
355
|
+
concurrency), but does NOT allow reads while a read-write writer
|
|
356
|
+
holds the exclusive process lock — that requires routing through the
|
|
357
|
+
live MCP server via ``read_client.run_read_routed`` (v1.2.0).
|
|
358
|
+
Ignored for Neo4jBackend (Neo4j has no single-writer process lock;
|
|
359
|
+
the flag is a no-op and the normal connection is opened).
|
|
360
|
+
All writer call sites (index, reindex, db init/reset, server
|
|
361
|
+
init_backend) use the default ``False``.
|
|
362
|
+
|
|
352
363
|
Returns:
|
|
353
364
|
A GraphBackend instance (KuzuBackend by default, or Neo4jBackend)
|
|
354
365
|
|
|
355
366
|
Raises:
|
|
356
367
|
ValueError: If backend type is not recognized
|
|
368
|
+
|
|
369
|
+
Note:
|
|
370
|
+
CLI read commands (find, analyze, db info, gain) route through a live
|
|
371
|
+
MCP server via ``read_client.run_read_routed`` (v1.2.0) when a server
|
|
372
|
+
is live, falling back to ``get_backend(read_only=True)`` when no server
|
|
373
|
+
is present. The fallback path still contends for the process lock under
|
|
374
|
+
an active writer (Windows / no-server fallback only).
|
|
357
375
|
"""
|
|
358
376
|
backend_type = os.getenv("SQLCG_BACKEND", "kuzu")
|
|
359
377
|
|
|
@@ -361,14 +379,26 @@ def get_backend() -> "GraphBackend":
|
|
|
361
379
|
from sqlcg.core.kuzu_backend import KuzuBackend
|
|
362
380
|
|
|
363
381
|
kuzu_cfg = KuzuConfig.from_env()
|
|
364
|
-
|
|
365
|
-
|
|
366
|
-
|
|
367
|
-
|
|
382
|
+
try:
|
|
383
|
+
return KuzuBackend(
|
|
384
|
+
str(kuzu_cfg.db_path),
|
|
385
|
+
buffer_pool_size_mb=kuzu_cfg.buffer_pool_size_mb,
|
|
386
|
+
read_only=read_only,
|
|
387
|
+
)
|
|
388
|
+
except RuntimeError as exc:
|
|
389
|
+
if read_only and "READ ONLY" in str(exc):
|
|
390
|
+
# KùzuDB refuses to open a non-existent or empty DB in read-only
|
|
391
|
+
# mode ("Cannot create an empty database under READ ONLY mode").
|
|
392
|
+
# Surface the same empty-DB guidance the user sees from `db info`.
|
|
393
|
+
raise RuntimeError(
|
|
394
|
+
"Database not initialised — run 'sqlcg db init' and 'sqlcg index <path>' first."
|
|
395
|
+
) from exc
|
|
396
|
+
raise
|
|
368
397
|
elif backend_type == "neo4j":
|
|
369
398
|
from sqlcg.core.neo4j_backend import Neo4jBackend
|
|
370
399
|
|
|
371
400
|
neo4j_cfg = Neo4jConfig.from_env()
|
|
401
|
+
# read_only is ignored for Neo4j — no single-writer process lock.
|
|
372
402
|
return Neo4jBackend(neo4j_cfg.uri, neo4j_cfg.user, neo4j_cfg.password)
|
|
373
403
|
else:
|
|
374
404
|
raise ValueError(f"Unknown backend type: {backend_type}")
|
sqlcg/core/kuzu_backend.py
CHANGED
|
@@ -58,7 +58,10 @@ class KuzuBackend(GraphBackend):
|
|
|
58
58
|
Args:
|
|
59
59
|
db_path: Path to the KùzuDB database file (or ':memory:' for in-memory)
|
|
60
60
|
buffer_pool_size_mb: Buffer pool size in MB (0 = use KuzuDB default)
|
|
61
|
-
read_only: Open in read-only mode
|
|
61
|
+
read_only: Open in read-only mode. Enables concurrent read-only
|
|
62
|
+
opens (reader/reader concurrency) by not taking the exclusive
|
|
63
|
+
write lock. Does NOT allow reads while a read-write writer
|
|
64
|
+
holds the lock — KùzuDB's exclusive lock is process-level.
|
|
62
65
|
|
|
63
66
|
Raises:
|
|
64
67
|
RuntimeError: If the database is locked or cannot be opened.
|
sqlcg/core/queries.cypher
CHANGED
|
@@ -38,12 +38,6 @@ RETURN dst.id AS id, dst.col_name AS col_name, dst.table_qualified AS table_qual
|
|
|
38
38
|
MATCH (dst:SqlColumn {id: $id})<-[:COLUMN_LINEAGE]-(src:SqlColumn)
|
|
39
39
|
RETURN src.id AS id, src.col_name AS col_name, src.table_qualified AS table_qualified
|
|
40
40
|
|
|
41
|
-
-- GET_UPSTREAM_DEPENDENCIES_FILTERED
|
|
42
|
-
MATCH (dst:SqlColumn {id: $id})<-[:COLUMN_LINEAGE]-(src:SqlColumn)
|
|
43
|
-
MATCH (t:SqlTable {qualified: src.table_qualified})
|
|
44
|
-
WHERE t.kind IN ['table', 'external']
|
|
45
|
-
RETURN src.id AS id, src.col_name AS col_name, src.table_qualified AS table_qualified
|
|
46
|
-
|
|
47
41
|
-- SEARCH_SQL_PATTERN
|
|
48
42
|
MATCH (q:SqlQuery)-[:QUERY_DEFINED_IN]->(f:File)
|
|
49
43
|
WHERE contains(q.sql, $query)
|
sqlcg/core/queries.py
CHANGED
|
@@ -28,7 +28,6 @@ TRACE_COLUMN_LINEAGE_QUERY = _Q["TRACE_COLUMN_LINEAGE"]
|
|
|
28
28
|
FIND_TABLE_USAGES_QUERY = _Q["FIND_TABLE_USAGES"]
|
|
29
29
|
GET_DOWNSTREAM_DEPENDENCIES_QUERY = _Q["GET_DOWNSTREAM_DEPENDENCIES"]
|
|
30
30
|
GET_UPSTREAM_DEPENDENCIES_QUERY = _Q["GET_UPSTREAM_DEPENDENCIES"]
|
|
31
|
-
GET_UPSTREAM_DEPENDENCIES_FILTERED_QUERY = _Q["GET_UPSTREAM_DEPENDENCIES_FILTERED"]
|
|
32
31
|
SEARCH_SQL_PATTERN_QUERY = _Q["SEARCH_SQL_PATTERN"]
|
|
33
32
|
LIST_DIALECTS_AND_REPOS_QUERY = _Q["LIST_DIALECTS_AND_REPOS"]
|
|
34
33
|
EXPAND_STAR_SOURCES_QUERY = _Q["EXPAND_STAR_SOURCES"]
|
sqlcg/indexer/indexer.py
CHANGED
|
@@ -93,8 +93,11 @@ def _flush_row_batch(db: GraphBackend, buf: BatchRowBuffer) -> None:
|
|
|
93
93
|
This is the v1.1.1 batch-flush core: called once per batch (not once per file).
|
|
94
94
|
Dedup keys mirror the graph's MERGE cardinality:
|
|
95
95
|
- file_rows: path (primary key)
|
|
96
|
-
- table_rows: qualified (primary key); prefers row with non-empty
|
|
97
|
-
defined_in_file so DEFINED_IN provenance is preserved
|
|
96
|
+
- table_rows: qualified (primary key); prefers (1) row with non-empty
|
|
97
|
+
defined_in_file so DEFINED_IN provenance is preserved;
|
|
98
|
+
(2) structural kind ('cte','derived','external') over
|
|
99
|
+
default 'table' so CTE aliases keep kind='cte' even when
|
|
100
|
+
also seen as source references with the default kind.
|
|
98
101
|
- column_rows: id (primary key)
|
|
99
102
|
- query_rows: id (primary key, globally unique path:index)
|
|
100
103
|
- edge rows: (src_key, dst_key) only — matches MERGE (src)-[r]->(dst)
|
|
@@ -108,12 +111,27 @@ def _flush_row_batch(db: GraphBackend, buf: BatchRowBuffer) -> None:
|
|
|
108
111
|
# --- Phase B: batch-scoped dedup ---
|
|
109
112
|
# For table_rows, prefer defined rows (non-empty defined_in_file) so provenance
|
|
110
113
|
# is not lost when a shared table is referenced by multiple files.
|
|
114
|
+
# Also prefer structurally-assigned kinds ('cte', 'derived', 'external') over the
|
|
115
|
+
# default 'table' kind: a CTE alias emitted first as a source reference (kind='table'
|
|
116
|
+
# default) and later confirmed as a CTE destination (kind='cte') must keep 'cte' so
|
|
117
|
+
# the kind filter correctly excludes it from default filtered output.
|
|
118
|
+
_structural_kinds = {"cte", "derived", "external"}
|
|
111
119
|
table_dedup: dict[str, dict] = {}
|
|
112
120
|
for r in buf.table_rows:
|
|
113
121
|
key = r["qualified"]
|
|
114
122
|
existing = table_dedup.get(key)
|
|
115
|
-
if existing is None
|
|
123
|
+
if existing is None:
|
|
116
124
|
table_dedup[key] = r
|
|
125
|
+
else:
|
|
126
|
+
# Rule 1: prefer rows with defined_in_file (DDL provenance)
|
|
127
|
+
if not existing.get("defined_in_file") and r.get("defined_in_file"):
|
|
128
|
+
table_dedup[key] = r
|
|
129
|
+
# Rule 2: prefer structural kind over default 'table'
|
|
130
|
+
elif (
|
|
131
|
+
existing.get("kind", "table") not in _structural_kinds
|
|
132
|
+
and r.get("kind", "table") in _structural_kinds
|
|
133
|
+
):
|
|
134
|
+
table_dedup[key] = r
|
|
117
135
|
table_rows = list(table_dedup.values())
|
|
118
136
|
|
|
119
137
|
column_rows = list({r["id"]: r for r in buf.column_rows}.values())
|
|
@@ -432,7 +450,14 @@ class Indexer:
|
|
|
432
450
|
Note: sqlcg watch's reindex_file uses a separate code path with
|
|
433
451
|
its own short per-file transaction. PERF-BATCH only affects index_repo.
|
|
434
452
|
"""
|
|
435
|
-
self._upsert_file_batch(
|
|
453
|
+
self._upsert_file_batch(
|
|
454
|
+
batch,
|
|
455
|
+
db,
|
|
456
|
+
defined_table_registry,
|
|
457
|
+
nonlocal_counts,
|
|
458
|
+
canonical_by_bare=aggregator.canonical_by_bare,
|
|
459
|
+
ambiguous_bare=aggregator._ambiguous_bare,
|
|
460
|
+
)
|
|
436
461
|
|
|
437
462
|
if profile:
|
|
438
463
|
_t_upsert_start = time.perf_counter()
|
|
@@ -975,6 +1000,8 @@ class Indexer:
|
|
|
975
1000
|
self,
|
|
976
1001
|
parsed: ParsedFile,
|
|
977
1002
|
defined_table_registry: dict[str, str] | None = None,
|
|
1003
|
+
canonical_by_bare: dict[str, str] | None = None,
|
|
1004
|
+
ambiguous_bare: set[str] | None = None,
|
|
978
1005
|
) -> FileRowSet:
|
|
979
1006
|
"""Build all row dicts for one ParsedFile — Phase A (pure, no db access).
|
|
980
1007
|
|
|
@@ -986,6 +1013,13 @@ class Indexer:
|
|
|
986
1013
|
Args:
|
|
987
1014
|
parsed: ParsedFile to build rows for
|
|
988
1015
|
defined_table_registry: Optional cross-file DDL dedup registry
|
|
1016
|
+
canonical_by_bare: Optional #44 bare-name → canonical full_id index
|
|
1017
|
+
(populated by CrossFileAggregator.register_pass1 from DDL tables).
|
|
1018
|
+
When provided, unqualified INSERT targets whose bare name maps to
|
|
1019
|
+
exactly one DDL canonical are rewritten to use the canonical full_id.
|
|
1020
|
+
When None, the rewrite is skipped (single-file / reindex_file path).
|
|
1021
|
+
ambiguous_bare: Optional set of bare names defined in >1 schema.
|
|
1022
|
+
These are never rewritten — the existing _bare_ref CLI hint handles them.
|
|
989
1023
|
|
|
990
1024
|
Returns:
|
|
991
1025
|
FileRowSet with all row lists and per-file counts/quality key
|
|
@@ -1126,6 +1160,24 @@ class Indexer:
|
|
|
1126
1160
|
"table_name": edge.src.table.name,
|
|
1127
1161
|
}
|
|
1128
1162
|
)
|
|
1163
|
+
# Half A (#39): emit a SqlTable node for the source table.
|
|
1164
|
+
# CTE-body-only sources are not in stmt.sources (which only covers
|
|
1165
|
+
# tables reachable via the parser's top-level FROM list), so they were
|
|
1166
|
+
# previously missing from the graph. edge.src.table is a frozen
|
|
1167
|
+
# TableRef with schema-aliasing already applied at parse time — the
|
|
1168
|
+
# qualified value is guaranteed to match edge.src.table_qualified.
|
|
1169
|
+
# key set is identical to other table_rows entries → upsert_nodes_bulk
|
|
1170
|
+
# homogeneity preserved; MERGE on primary key deduplicates re-emits.
|
|
1171
|
+
rows.table_rows.append(
|
|
1172
|
+
{
|
|
1173
|
+
"qualified": edge.src.table.full_id,
|
|
1174
|
+
"name": edge.src.table.name,
|
|
1175
|
+
"catalog": edge.src.table.catalog or "",
|
|
1176
|
+
"db": edge.src.table.db or "",
|
|
1177
|
+
"kind": edge.src.table.role,
|
|
1178
|
+
"defined_in_file": "",
|
|
1179
|
+
}
|
|
1180
|
+
)
|
|
1129
1181
|
rows.column_rows.append(
|
|
1130
1182
|
{
|
|
1131
1183
|
"id": dst_id,
|
|
@@ -1183,15 +1235,53 @@ class Indexer:
|
|
|
1183
1235
|
rows.counts["star_sources"] += 1
|
|
1184
1236
|
|
|
1185
1237
|
# Upsert target table node (if not already a defined_table)
|
|
1186
|
-
# so that star expansion can create destination columns
|
|
1238
|
+
# so that star expansion can create destination columns.
|
|
1239
|
+
# #44: when a canonical_by_bare index is available, attempt to resolve
|
|
1240
|
+
# an unqualified / wrong-schema INSERT target to its DDL-canonical full_id
|
|
1241
|
+
# so that INSERT-target nodes share identity with the DDL node.
|
|
1187
1242
|
if stmt.target and stmt.target.full_id not in defined_table_ids:
|
|
1243
|
+
target_qualified = stmt.target.full_id
|
|
1244
|
+
target_name = stmt.target.name
|
|
1245
|
+
target_db = stmt.target.db or ""
|
|
1246
|
+
target_catalog = stmt.target.catalog or ""
|
|
1247
|
+
target_kind = "table"
|
|
1248
|
+
|
|
1249
|
+
# #44 canonical-name resolution: if bare name maps unambiguously to a
|
|
1250
|
+
# DDL-defined table, use the canonical full_id for the emitted row.
|
|
1251
|
+
# Degrading to no-op when canonical_by_bare is None (single-file path).
|
|
1252
|
+
if canonical_by_bare is not None:
|
|
1253
|
+
bare = (stmt.target.name or "").lower()
|
|
1254
|
+
if bare and bare not in (ambiguous_bare or set()) and bare in canonical_by_bare:
|
|
1255
|
+
# Resolve to the sole DDL canonical — keeps kind='table'
|
|
1256
|
+
canonical_id = canonical_by_bare[bare]
|
|
1257
|
+
target_qualified = canonical_id
|
|
1258
|
+
# Derive name/db/catalog from the canonical full_id parts.
|
|
1259
|
+
# full_id format: "db.name" or "catalog.db.name" or "name"
|
|
1260
|
+
parts = canonical_id.split(".")
|
|
1261
|
+
if len(parts) >= 3:
|
|
1262
|
+
target_name = parts[-1]
|
|
1263
|
+
target_db = parts[-2]
|
|
1264
|
+
target_catalog = parts[-3]
|
|
1265
|
+
elif len(parts) == 2:
|
|
1266
|
+
target_name = parts[-1]
|
|
1267
|
+
target_db = parts[-2]
|
|
1268
|
+
target_catalog = ""
|
|
1269
|
+
else:
|
|
1270
|
+
target_name = canonical_id
|
|
1271
|
+
target_db = ""
|
|
1272
|
+
target_catalog = ""
|
|
1273
|
+
else:
|
|
1274
|
+
# Not resolved to a DDL canonical — mark as derived so the
|
|
1275
|
+
# kind filter excludes it from default (non-raw) output (#45.2)
|
|
1276
|
+
target_kind = "derived"
|
|
1277
|
+
|
|
1188
1278
|
rows.table_rows.append(
|
|
1189
1279
|
{
|
|
1190
|
-
"qualified":
|
|
1191
|
-
"name":
|
|
1192
|
-
"catalog":
|
|
1193
|
-
"db":
|
|
1194
|
-
"kind":
|
|
1280
|
+
"qualified": target_qualified,
|
|
1281
|
+
"name": target_name,
|
|
1282
|
+
"catalog": target_catalog,
|
|
1283
|
+
"db": target_db,
|
|
1284
|
+
"kind": target_kind,
|
|
1195
1285
|
"defined_in_file": "",
|
|
1196
1286
|
}
|
|
1197
1287
|
)
|
|
@@ -1205,6 +1295,8 @@ class Indexer:
|
|
|
1205
1295
|
defined_table_registry: dict[str, str],
|
|
1206
1296
|
nonlocal_counts: dict,
|
|
1207
1297
|
warning_prefix: str = "",
|
|
1298
|
+
canonical_by_bare: dict[str, str] | None = None,
|
|
1299
|
+
ambiguous_bare: set[str] | None = None,
|
|
1208
1300
|
) -> None:
|
|
1209
1301
|
"""Accumulate rows for all files in batch, then flush once in one transaction.
|
|
1210
1302
|
|
|
@@ -1219,13 +1311,19 @@ class Indexer:
|
|
|
1219
1311
|
defined_table_registry: Cross-file DDL dedup registry
|
|
1220
1312
|
nonlocal_counts: Mutable summary dict updated in place (tables/edges/quality/…)
|
|
1221
1313
|
warning_prefix: Optional prefix for warning log messages (e.g. "resync_changed: ")
|
|
1314
|
+
canonical_by_bare: Optional #44 bare-name → canonical full_id index.
|
|
1315
|
+
When provided, unqualified INSERT targets are rewritten to their DDL
|
|
1316
|
+
canonical full_id. None on single-file / reindex_file paths.
|
|
1317
|
+
ambiguous_bare: Optional set of bare names defined in >1 schema.
|
|
1222
1318
|
"""
|
|
1223
1319
|
if not batch:
|
|
1224
1320
|
return
|
|
1225
1321
|
buf = BatchRowBuffer()
|
|
1226
1322
|
for parsed_in_batch in batch:
|
|
1227
1323
|
try:
|
|
1228
|
-
file_rows = self._build_file_rows(
|
|
1324
|
+
file_rows = self._build_file_rows(
|
|
1325
|
+
parsed_in_batch, defined_table_registry, canonical_by_bare, ambiguous_bare
|
|
1326
|
+
)
|
|
1229
1327
|
except Exception as exc:
|
|
1230
1328
|
logger.warning(
|
|
1231
1329
|
"%sFailed to build rows for %s: %s — skipping",
|