sql-code-graph 1.2.2__py3-none-any.whl → 1.4.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {sql_code_graph-1.2.2.dist-info → sql_code_graph-1.4.0.dist-info}/METADATA +2 -4
- {sql_code_graph-1.2.2.dist-info → sql_code_graph-1.4.0.dist-info}/RECORD +31 -30
- sqlcg/__init__.py +1 -1
- sqlcg/cli/commands/analyze.py +138 -127
- sqlcg/cli/commands/db.py +49 -51
- sqlcg/cli/commands/find.py +5 -9
- sqlcg/cli/commands/gain.py +14 -16
- sqlcg/cli/commands/git.py +11 -4
- sqlcg/cli/commands/index.py +173 -21
- sqlcg/cli/commands/mcp.py +70 -3
- sqlcg/cli/commands/reindex.py +147 -77
- sqlcg/cli/commands/uninstall.py +9 -20
- sqlcg/core/__init__.py +1 -3
- sqlcg/core/config.py +25 -81
- sqlcg/core/duckdb_backend.py +764 -0
- sqlcg/core/freshness.py +1 -1
- sqlcg/core/graph_db.py +20 -4
- sqlcg/core/queries.py +26 -7
- sqlcg/core/queries.sql +249 -0
- sqlcg/core/schema.py +1 -1
- sqlcg/indexer/indexer.py +27 -36
- sqlcg/metrics/store.py +49 -1
- sqlcg/server/control.py +1 -1
- sqlcg/server/noise_filter.py +1 -1
- sqlcg/server/read_client.py +2 -2
- sqlcg/server/server.py +184 -86
- sqlcg/server/skill.py +2 -2
- sqlcg/server/tools.py +119 -41
- sqlcg/server/writer.py +459 -0
- sqlcg/core/kuzu_backend.py +0 -445
- sqlcg/core/neo4j_backend.py +0 -233
- {sql_code_graph-1.2.2.dist-info → sql_code_graph-1.4.0.dist-info}/WHEEL +0 -0
- {sql_code_graph-1.2.2.dist-info → sql_code_graph-1.4.0.dist-info}/entry_points.txt +0 -0
sqlcg/cli/commands/db.py
CHANGED
|
@@ -1,6 +1,5 @@
|
|
|
1
1
|
"""Database management commands."""
|
|
2
2
|
|
|
3
|
-
import os
|
|
4
3
|
import shutil
|
|
5
4
|
from pathlib import Path
|
|
6
5
|
|
|
@@ -20,18 +19,8 @@ console = Console()
|
|
|
20
19
|
|
|
21
20
|
|
|
22
21
|
@app.command("init")
|
|
23
|
-
def db_init(
|
|
24
|
-
buffer_pool_size: int = typer.Option(
|
|
25
|
-
0,
|
|
26
|
-
"--buffer-pool-size",
|
|
27
|
-
help="KuzuDB buffer pool size in MB (0 = default). "
|
|
28
|
-
"Set to 256-512 on memory-constrained machines.",
|
|
29
|
-
),
|
|
30
|
-
) -> None:
|
|
22
|
+
def db_init() -> None:
|
|
31
23
|
"""Initialise the graph database (idempotent)."""
|
|
32
|
-
if buffer_pool_size > 0:
|
|
33
|
-
os.environ["SQLCG_BUFFER_POOL_MB"] = str(buffer_pool_size)
|
|
34
|
-
|
|
35
24
|
db_path = get_db_path()
|
|
36
25
|
db_path.parent.mkdir(parents=True, exist_ok=True)
|
|
37
26
|
with get_backend() as backend:
|
|
@@ -45,19 +34,40 @@ def db_reset( # noqa: B008
|
|
|
45
34
|
repo: str | None = typer.Option(None, "--repo", help="Reset only this repo path"), # noqa: B008
|
|
46
35
|
) -> None:
|
|
47
36
|
"""Wipe the database or a single repo's subgraph."""
|
|
37
|
+
import socket as _socket
|
|
38
|
+
|
|
39
|
+
from sqlcg.server.control import sock_path
|
|
40
|
+
|
|
41
|
+
# Refuse cleanly when a server is live.
|
|
42
|
+
sp = sock_path()
|
|
43
|
+
if sp.exists():
|
|
44
|
+
try:
|
|
45
|
+
with _socket.socket(_socket.AF_UNIX, _socket.SOCK_STREAM) as s:
|
|
46
|
+
s.settimeout(1)
|
|
47
|
+
s.connect(str(sp))
|
|
48
|
+
console.print(
|
|
49
|
+
"[red]A server is running on this database; stop it first "
|
|
50
|
+
"('sqlcg mcp stop') before resetting the database.[/red]"
|
|
51
|
+
)
|
|
52
|
+
raise typer.Exit(1)
|
|
53
|
+
except (FileNotFoundError, ConnectionRefusedError, OSError):
|
|
54
|
+
pass
|
|
55
|
+
|
|
48
56
|
if repo:
|
|
49
|
-
# Delete all nodes for this repo (
|
|
57
|
+
# Delete all nodes for this repo: delete File nodes (cascades to all
|
|
58
|
+
# related nodes via delete_nodes_for_file) and the Repo node itself.
|
|
50
59
|
with get_backend() as backend:
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
|
|
60
|
+
# Get all files for this repo
|
|
61
|
+
file_rows = backend.run_read(
|
|
62
|
+
'SELECT path FROM "File" WHERE repo_path = ?',
|
|
63
|
+
{"repo_path": repo},
|
|
54
64
|
)
|
|
65
|
+
for fr in file_rows:
|
|
66
|
+
backend.delete_nodes_for_file(fr["path"])
|
|
67
|
+
backend.run_write('DELETE FROM "Repo" WHERE path = ?', {"p": repo})
|
|
55
68
|
console.print(f"[yellow]Reset repo[/yellow] {repo}")
|
|
56
69
|
else:
|
|
57
|
-
# Full reset — delete the
|
|
58
|
-
# e.g. 0.11.x) or a directory (older versions); also drop the .wal sidecar.
|
|
59
|
-
# shutil.rmtree silently no-ops on a regular file (NotADirectoryError +
|
|
60
|
-
# ignore_errors), so dispatch on the actual filesystem type.
|
|
70
|
+
# Full reset — delete the DuckDB file (single file, not a directory).
|
|
61
71
|
db_path = get_db_path()
|
|
62
72
|
removed = False
|
|
63
73
|
for target in (db_path, db_path.with_name(db_path.name + ".wal")):
|
|
@@ -76,56 +86,46 @@ def db_reset( # noqa: B008
|
|
|
76
86
|
@app.command("info")
|
|
77
87
|
def db_info() -> None:
|
|
78
88
|
"""Show database stats."""
|
|
79
|
-
# db info
|
|
80
|
-
#
|
|
81
|
-
# holds the write lock. get_schema_version / get_indexed_sha are inlined as
|
|
82
|
-
# run_read_routed calls using their known Cypher so they too route through the
|
|
83
|
-
# socket when a server is live; this avoids a direct-open that would hit the lock.
|
|
89
|
+
# db info routes through the live server (run_read_routed) to avoid holding
|
|
90
|
+
# the DuckDB file lock when the MCP server is running.
|
|
84
91
|
|
|
85
92
|
# Schema version
|
|
86
|
-
schema_rows = run_read_routed(
|
|
93
|
+
schema_rows = run_read_routed('SELECT version FROM "SchemaVersion" LIMIT 1', {})
|
|
87
94
|
version = (schema_rows[0]["version"] if schema_rows else None) or "unknown"
|
|
88
95
|
console.print(f"Schema version: {version}")
|
|
89
96
|
|
|
90
|
-
# Freshness block
|
|
97
|
+
# Freshness block
|
|
91
98
|
try:
|
|
92
|
-
sha_rows = run_read_routed(
|
|
93
|
-
"MATCH (v:SchemaVersion) RETURN v.indexed_sha AS sha LIMIT 1", {}
|
|
94
|
-
)
|
|
99
|
+
sha_rows = run_read_routed('SELECT indexed_sha AS sha FROM "SchemaVersion" LIMIT 1', {})
|
|
95
100
|
indexed_sha = sha_rows[0]["sha"] if sha_rows else None
|
|
96
|
-
repo_rows = run_read_routed(
|
|
101
|
+
repo_rows = run_read_routed('SELECT path FROM "Repo" LIMIT 1', {})
|
|
97
102
|
if repo_rows and indexed_sha is not None and repo_rows[0].get("path"):
|
|
98
103
|
repo_root = Path(repo_rows[0]["path"])
|
|
99
104
|
f = compute_freshness(repo_root, indexed_sha)
|
|
100
105
|
console.print(render_freshness_line(f))
|
|
101
|
-
except NotImplementedError:
|
|
102
|
-
# Neo4j backend raises NotImplementedError for get_indexed_sha — skip silently
|
|
103
|
-
pass
|
|
104
106
|
except Exception as e:
|
|
105
|
-
# Any unexpected error in the freshness block must not crash db info
|
|
106
107
|
logger.debug(f"Freshness check skipped: {e}")
|
|
107
108
|
|
|
108
|
-
#
|
|
109
|
+
# Node counts
|
|
109
110
|
for label in NodeLabel:
|
|
110
111
|
try:
|
|
111
|
-
result = run_read_routed(f
|
|
112
|
+
result = run_read_routed(f'SELECT count(*) AS count FROM "{label}"', {})
|
|
112
113
|
count = result[0]["count"] if result else 0
|
|
113
114
|
console.print(f" {label}: {count}")
|
|
114
115
|
except Exception as e:
|
|
115
|
-
# Log unexpected exceptions instead of silently skipping
|
|
116
116
|
logger.error(f"Error getting count for {label}: {e}")
|
|
117
117
|
console.print(f" [red]{label}: error[/red]")
|
|
118
118
|
|
|
119
|
-
# Health check
|
|
120
|
-
repo_count_result = run_read_routed(
|
|
119
|
+
# Health check
|
|
120
|
+
repo_count_result = run_read_routed('SELECT count(*) AS count FROM "Repo"', {})
|
|
121
121
|
repo_count = repo_count_result[0]["count"] if repo_count_result else 0
|
|
122
122
|
|
|
123
123
|
if repo_count == 0:
|
|
124
|
-
console.print(
|
|
124
|
+
console.print(
|
|
125
125
|
"[red]Database is empty. Run 'sqlcg db init' and 'sqlcg index <path>' first.[/red]"
|
|
126
126
|
)
|
|
127
127
|
else:
|
|
128
|
-
query_count_result = run_read_routed(
|
|
128
|
+
query_count_result = run_read_routed('SELECT count(*) AS count FROM "SqlQuery"', {})
|
|
129
129
|
query_count = query_count_result[0]["count"] if query_count_result else 0
|
|
130
130
|
|
|
131
131
|
if query_count == 0:
|
|
@@ -134,7 +134,7 @@ def db_info() -> None:
|
|
|
134
134
|
"the graph.[/yellow]"
|
|
135
135
|
)
|
|
136
136
|
else:
|
|
137
|
-
col_count_result = run_read_routed(
|
|
137
|
+
col_count_result = run_read_routed('SELECT count(*) AS count FROM "SqlColumn"', {})
|
|
138
138
|
col_count = col_count_result[0]["count"] if col_count_result else 0
|
|
139
139
|
|
|
140
140
|
if col_count == 0:
|
|
@@ -144,12 +144,10 @@ def db_info() -> None:
|
|
|
144
144
|
"will return empty results.[/yellow]"
|
|
145
145
|
)
|
|
146
146
|
|
|
147
|
-
|
|
148
|
-
edges_result = run_read_routed("MATCH ()-[r:COLUMN_LINEAGE]->() RETURN COUNT(r) AS count", {})
|
|
147
|
+
edges_result = run_read_routed('SELECT count(*) AS count FROM "COLUMN_LINEAGE"', {})
|
|
149
148
|
edges_count = edges_result[0]["count"] if edges_result else 0
|
|
150
149
|
console.print(f" COLUMN_LINEAGE edges: {edges_count}")
|
|
151
150
|
|
|
152
|
-
# Print star resolution metrics (T-07)
|
|
153
151
|
from sqlcg.core.queries import COUNT_STAR_EXPANSIONS_QUERY, COUNT_STAR_SOURCES_QUERY
|
|
154
152
|
|
|
155
153
|
star_source_result = run_read_routed(COUNT_STAR_SOURCES_QUERY, {})
|
|
@@ -160,11 +158,11 @@ def db_info() -> None:
|
|
|
160
158
|
star_expansion_count = star_expansion_result[0]["n"] if star_expansion_result else 0
|
|
161
159
|
console.print(f" STAR_EXPANSION lineage edges: {star_expansion_count}")
|
|
162
160
|
|
|
163
|
-
|
|
164
|
-
|
|
165
|
-
"
|
|
161
|
+
mode_rows = run_read_routed(
|
|
162
|
+
'SELECT parsing_mode AS mode, count(*) AS cnt FROM "SqlQuery"'
|
|
163
|
+
" GROUP BY parsing_mode ORDER BY cnt DESC",
|
|
164
|
+
{},
|
|
166
165
|
)
|
|
167
|
-
mode_rows = run_read_routed(mode_query, {})
|
|
168
166
|
if mode_rows and "mode" in mode_rows[0]:
|
|
169
167
|
console.print("\n Parsing mode distribution:")
|
|
170
168
|
for row in mode_rows:
|
|
@@ -174,7 +172,7 @@ def db_info() -> None:
|
|
|
174
172
|
@app.command("list-repos")
|
|
175
173
|
def list_repos() -> None:
|
|
176
174
|
"""List all indexed repositories."""
|
|
177
|
-
result = run_read_routed(
|
|
175
|
+
result = run_read_routed('SELECT path, name FROM "Repo"', {})
|
|
178
176
|
|
|
179
177
|
if not result:
|
|
180
178
|
console.print("[yellow]No repositories indexed[/yellow]")
|
sqlcg/cli/commands/find.py
CHANGED
|
@@ -4,7 +4,6 @@ import typer
|
|
|
4
4
|
from rich.console import Console
|
|
5
5
|
from rich.table import Table
|
|
6
6
|
|
|
7
|
-
from sqlcg.core.schema import NodeLabel
|
|
8
7
|
from sqlcg.server.read_client import run_read_routed
|
|
9
8
|
|
|
10
9
|
app = typer.Typer(help="Search the graph")
|
|
@@ -19,14 +18,13 @@ def find_table( # noqa: B008
|
|
|
19
18
|
"""Find a table by name."""
|
|
20
19
|
name = name.lower() # graph keys are lowercased at index time (C2 normalization)
|
|
21
20
|
results = run_read_routed(
|
|
22
|
-
|
|
23
|
-
"RETURN t.qualified AS qualified, t.kind AS kind LIMIT 50",
|
|
21
|
+
"SELECT qualified, kind FROM \"SqlTable\" WHERE qualified LIKE '%' || ? || '%' LIMIT 50",
|
|
24
22
|
{"name": name},
|
|
25
23
|
)
|
|
26
24
|
if not raw:
|
|
27
25
|
from sqlcg.server.noise_filter import NoiseFilter
|
|
28
26
|
|
|
29
|
-
nf = NoiseFilter.from_config()
|
|
27
|
+
nf = NoiseFilter.from_config()
|
|
30
28
|
ids = [r["qualified"] for r in results]
|
|
31
29
|
kept, _ = nf.filter_nodes(ids)
|
|
32
30
|
kept_set = set(kept)
|
|
@@ -42,14 +40,13 @@ def find_column( # noqa: B008
|
|
|
42
40
|
"""Find a column by table.column reference."""
|
|
43
41
|
ref = ref.lower() # graph keys are lowercased at index time (C2 normalization)
|
|
44
42
|
results = run_read_routed(
|
|
45
|
-
|
|
43
|
+
"SELECT id FROM \"SqlColumn\" WHERE id LIKE '%' || ? || '%' LIMIT 50",
|
|
46
44
|
{"ref": ref},
|
|
47
45
|
)
|
|
48
46
|
if not raw:
|
|
49
47
|
from sqlcg.server.noise_filter import NoiseFilter
|
|
50
48
|
|
|
51
|
-
nf = NoiseFilter.from_config()
|
|
52
|
-
# Filter on the schema.table portion of each column id (schema.table.column)
|
|
49
|
+
nf = NoiseFilter.from_config()
|
|
53
50
|
results = [r for r in results if not nf.is_noise(r["id"].rsplit(".", 1)[0])]
|
|
54
51
|
_print_table(results, ["id"])
|
|
55
52
|
|
|
@@ -60,8 +57,7 @@ def find_pattern( # noqa: B008
|
|
|
60
57
|
) -> None:
|
|
61
58
|
"""Find queries containing a SQL pattern."""
|
|
62
59
|
results = run_read_routed(
|
|
63
|
-
|
|
64
|
-
"RETURN q.id AS id, q.kind AS kind LIMIT 50",
|
|
60
|
+
"SELECT id, kind FROM \"SqlQuery\" WHERE sql LIKE '%' || ? || '%' LIMIT 50",
|
|
65
61
|
{"pattern": pattern},
|
|
66
62
|
)
|
|
67
63
|
_print_table(results, ["id", "kind"])
|
sqlcg/cli/commands/gain.py
CHANGED
|
@@ -112,13 +112,11 @@ def gain_cmd(
|
|
|
112
112
|
"""
|
|
113
113
|
)
|
|
114
114
|
|
|
115
|
-
# Section E:
|
|
116
|
-
|
|
117
|
-
|
|
118
|
-
|
|
119
|
-
|
|
120
|
-
)
|
|
121
|
-
execute_cypher_ratio = execute_cypher_count / total_calls if total_calls > 0 else 0
|
|
115
|
+
# Section E: execute_sql ratio
|
|
116
|
+
sql_query = "SELECT COUNT(*) as count FROM tool_calls WHERE tool_name = 'execute_sql'"
|
|
117
|
+
execute_sql_count_result = metrics.execute_query(sql_query)
|
|
118
|
+
execute_sql_count = execute_sql_count_result[0][0] if execute_sql_count_result else 0
|
|
119
|
+
execute_sql_ratio = execute_sql_count / total_calls if total_calls > 0 else 0
|
|
122
120
|
|
|
123
121
|
# Section F: parse quality from graph.
|
|
124
122
|
# run_read_routed raises typer.Exit (Exception-derived, NOT SystemExit) on
|
|
@@ -127,8 +125,8 @@ def gain_cmd(
|
|
|
127
125
|
parse_quality: dict[str, int] | None = None
|
|
128
126
|
try:
|
|
129
127
|
mode_rows = run_read_routed(
|
|
130
|
-
|
|
131
|
-
"
|
|
128
|
+
'SELECT parsing_mode AS mode, count(*) AS cnt FROM "SqlQuery"'
|
|
129
|
+
" GROUP BY parsing_mode ORDER BY cnt DESC",
|
|
132
130
|
{},
|
|
133
131
|
)
|
|
134
132
|
if mode_rows and "mode" in mode_rows[0]:
|
|
@@ -144,7 +142,7 @@ def gain_cmd(
|
|
|
144
142
|
"feedback_tp": tp_count,
|
|
145
143
|
"feedback_total": fb_total,
|
|
146
144
|
"top_tools": [{"name": row[0], "count": row[1]} for row in top_tools],
|
|
147
|
-
"
|
|
145
|
+
"execute_sql_ratio": round(execute_sql_ratio, 2),
|
|
148
146
|
}
|
|
149
147
|
if parse_quality is not None:
|
|
150
148
|
payload["parse_quality"] = parse_quality
|
|
@@ -191,14 +189,14 @@ def gain_cmd(
|
|
|
191
189
|
console.print(f" {i}. {name}: {count}")
|
|
192
190
|
console.print()
|
|
193
191
|
|
|
194
|
-
# Section E:
|
|
195
|
-
console.print("[bold cyan]E. Raw
|
|
196
|
-
ratio_pct =
|
|
197
|
-
if
|
|
198
|
-
msg = f" [yellow]
|
|
192
|
+
# Section E: execute_sql ratio
|
|
193
|
+
console.print("[bold cyan]E. Raw SQL Usage[/bold cyan]")
|
|
194
|
+
ratio_pct = execute_sql_ratio * 100
|
|
195
|
+
if execute_sql_ratio > 0.3:
|
|
196
|
+
msg = f" [yellow]execute_sql: {ratio_pct:.1f}% (high raw-SQL usage)[/yellow]"
|
|
199
197
|
console.print(msg)
|
|
200
198
|
else:
|
|
201
|
-
console.print(f"
|
|
199
|
+
console.print(f" execute_sql: {ratio_pct:.1f}%")
|
|
202
200
|
console.print()
|
|
203
201
|
|
|
204
202
|
# Section F: parse quality from graph
|
sqlcg/cli/commands/git.py
CHANGED
|
@@ -33,7 +33,7 @@ _HOOKS: list[_HookSpec] = [
|
|
|
33
33
|
'[ "$3" = "1" ] || exit 0\n'
|
|
34
34
|
'{sqlcg_bin} reindex --from "$1" --to "$2"'
|
|
35
35
|
' "$(git rev-parse --show-toplevel)" --dialect auto --quiet --notify'
|
|
36
|
-
' || echo "sqlcg: graph not updated (
|
|
36
|
+
' || echo "sqlcg: graph not updated (reindex failed)'
|
|
37
37
|
" -- run 'sqlcg mcp status'\" >&2\n"
|
|
38
38
|
),
|
|
39
39
|
),
|
|
@@ -50,10 +50,10 @@ PREV=$(git rev-parse --verify --quiet ORIG_HEAD)
|
|
|
50
50
|
TOP=$(git rev-parse --show-toplevel)
|
|
51
51
|
if [ -n "$PREV" ]; then
|
|
52
52
|
{sqlcg_bin} reindex --from "$PREV" --to HEAD "$TOP" --dialect auto --quiet --notify \\
|
|
53
|
-
|| echo "sqlcg: graph not updated (
|
|
53
|
+
|| echo "sqlcg: graph not updated (reindex failed) -- run 'sqlcg mcp status'" >&2
|
|
54
54
|
else
|
|
55
55
|
{sqlcg_bin} reindex "$TOP" --dialect auto --quiet --notify \\
|
|
56
|
-
|| echo "sqlcg: graph not updated (
|
|
56
|
+
|| echo "sqlcg: graph not updated (reindex failed) -- run 'sqlcg mcp status'" >&2
|
|
57
57
|
fi
|
|
58
58
|
""",
|
|
59
59
|
),
|
|
@@ -101,7 +101,14 @@ def _install_single_hook(hooks_dir: Path, spec: _HookSpec, sqlcg_bin: str) -> No
|
|
|
101
101
|
if hook_path.exists():
|
|
102
102
|
existing_content = hook_path.read_text()
|
|
103
103
|
if spec.sentinel in existing_content:
|
|
104
|
-
|
|
104
|
+
if existing_content == script:
|
|
105
|
+
# Byte-identical current template — true idempotency, silent skip.
|
|
106
|
+
return
|
|
107
|
+
# Sentinel present but content differs: sqlcg-owned but stale hook.
|
|
108
|
+
# Overwrite with the current rendered template and report the upgrade.
|
|
109
|
+
hook_path.write_text(script)
|
|
110
|
+
hook_path.chmod(0o755)
|
|
111
|
+
console.print(f"[green]Upgraded git hook:[/green] .git/hooks/{spec.filename}")
|
|
105
112
|
return
|
|
106
113
|
else:
|
|
107
114
|
# Foreign hook without sqlcg sentinel
|
sqlcg/cli/commands/index.py
CHANGED
|
@@ -1,6 +1,7 @@
|
|
|
1
1
|
"""Index command for scanning and indexing SQL files."""
|
|
2
2
|
|
|
3
|
-
import
|
|
3
|
+
import json
|
|
4
|
+
import socket as _socket
|
|
4
5
|
from pathlib import Path
|
|
5
6
|
|
|
6
7
|
import typer
|
|
@@ -14,11 +15,15 @@ from rich.progress import (
|
|
|
14
15
|
TimeRemainingColumn,
|
|
15
16
|
)
|
|
16
17
|
|
|
17
|
-
from sqlcg.core.config import
|
|
18
|
+
from sqlcg.core.config import DbConfig, config_file_present, get_backend, get_db_path, get_dialect
|
|
18
19
|
from sqlcg.indexer.indexer import Indexer
|
|
19
20
|
|
|
20
21
|
console = Console()
|
|
21
22
|
|
|
23
|
+
# Socket timeout for the index-via-server path.
|
|
24
|
+
# Generous budget: full index of a large repo can take several minutes.
|
|
25
|
+
_INDEX_SOCKET_TIMEOUT_S = 600
|
|
26
|
+
|
|
22
27
|
|
|
23
28
|
def index_cmd( # noqa: B008
|
|
24
29
|
path: Path = typer.Argument(..., help="Directory to index"), # noqa: B008
|
|
@@ -31,17 +36,11 @@ def index_cmd( # noqa: B008
|
|
|
31
36
|
timeout_per_file: int = typer.Option( # noqa: B008
|
|
32
37
|
10, "--timeout-per-file", help="Timeout per file in seconds"
|
|
33
38
|
),
|
|
34
|
-
buffer_pool_size: int = typer.Option( # noqa: B008
|
|
35
|
-
0,
|
|
36
|
-
"--buffer-pool-size",
|
|
37
|
-
help="KuzuDB buffer pool size in MB (0 = default). "
|
|
38
|
-
"Set to 256-512 on memory-constrained machines.",
|
|
39
|
-
),
|
|
40
39
|
batch_size: int = typer.Option( # noqa: B008
|
|
41
40
|
50,
|
|
42
41
|
"--batch-size",
|
|
43
42
|
help=(
|
|
44
|
-
"Files per
|
|
43
|
+
"Files per DuckDB transaction in the upsert pass. "
|
|
45
44
|
"Default 50 balances commit-overhead reduction (vs. legacy per-file commits) "
|
|
46
45
|
"against per-batch memory cost. Lower values are safer for memory-constrained "
|
|
47
46
|
"machines; higher values give marginal speedup at the cost of larger working sets. "
|
|
@@ -71,9 +70,24 @@ def index_cmd( # noqa: B008
|
|
|
71
70
|
"Marks freshness as 'indexed with working-tree changes'."
|
|
72
71
|
),
|
|
73
72
|
),
|
|
73
|
+
detach: bool = typer.Option( # noqa: B008
|
|
74
|
+
False,
|
|
75
|
+
"--detach",
|
|
76
|
+
help=(
|
|
77
|
+
"When routing through a live server, return immediately after enqueueing "
|
|
78
|
+
"(fire-and-forget). Default is to wait for the index to complete."
|
|
79
|
+
),
|
|
80
|
+
),
|
|
74
81
|
) -> None:
|
|
75
82
|
"""Index SQL files in a directory.
|
|
76
83
|
|
|
84
|
+
When a server is live on this DB, the index is routed through the server's
|
|
85
|
+
control socket so the DB is never opened directly (avoids lock contention).
|
|
86
|
+
Use --detach to enqueue and return immediately (fire-and-forget).
|
|
87
|
+
|
|
88
|
+
With no server live, falls back to the direct-write path unchanged
|
|
89
|
+
(zero-config small-repo invariant).
|
|
90
|
+
|
|
77
91
|
Schema aliases (staging schema → canonical schema) can be configured in
|
|
78
92
|
.sqlcg.toml under sqlcg.schema_aliases, e.g. da_tmp = "da".
|
|
79
93
|
"""
|
|
@@ -85,6 +99,26 @@ def index_cmd( # noqa: B008
|
|
|
85
99
|
logging.getLogger("sqlcg").setLevel(level)
|
|
86
100
|
logging.getLogger("sqlglot").setLevel(level)
|
|
87
101
|
|
|
102
|
+
# Resolve path early so socket routing uses the absolute path.
|
|
103
|
+
path = path.resolve()
|
|
104
|
+
|
|
105
|
+
# Resolve dialect before routing so the WriterRequest always carries a concrete
|
|
106
|
+
# dialect (never the literal sentinel "auto"). Bug A: the route call was before
|
|
107
|
+
# this resolution, causing the server to receive "auto" and fail with
|
|
108
|
+
# "Unknown dialect 'auto'" on every server-routed index.
|
|
109
|
+
if dialect == "auto":
|
|
110
|
+
dialect = get_dialect(path)
|
|
111
|
+
|
|
112
|
+
# Step 3.2 — probe for a live server and route through the socket if present.
|
|
113
|
+
_routed = _try_route_index_via_server(
|
|
114
|
+
path=path,
|
|
115
|
+
dialect=dialect,
|
|
116
|
+
wait=not detach,
|
|
117
|
+
quiet=quiet,
|
|
118
|
+
)
|
|
119
|
+
if _routed:
|
|
120
|
+
return
|
|
121
|
+
|
|
88
122
|
# Route parse warnings to stderr (--verbose) or to the configured log file.
|
|
89
123
|
sqlcg_log = logging.getLogger("sqlcg")
|
|
90
124
|
|
|
@@ -107,20 +141,12 @@ def index_cmd( # noqa: B008
|
|
|
107
141
|
sqlcg_log.addHandler(_warn_handler)
|
|
108
142
|
_warn_log_path = None
|
|
109
143
|
else:
|
|
110
|
-
_warn_log_path =
|
|
144
|
+
_warn_log_path = DbConfig.from_env().log_path
|
|
111
145
|
_warn_log_path.parent.mkdir(parents=True, exist_ok=True)
|
|
112
146
|
_warn_handler = logging.FileHandler(_warn_log_path)
|
|
113
147
|
_warn_handler.setLevel(logging.WARNING)
|
|
114
148
|
sqlcg_log.addHandler(_warn_handler)
|
|
115
149
|
|
|
116
|
-
# Set buffer pool size via env var if specified
|
|
117
|
-
if buffer_pool_size > 0:
|
|
118
|
-
os.environ["SQLCG_BUFFER_POOL_MB"] = str(buffer_pool_size)
|
|
119
|
-
|
|
120
|
-
# Resolve dialect: 'auto' reads from .sqlcg.toml, otherwise use provided value
|
|
121
|
-
if dialect == "auto":
|
|
122
|
-
dialect = get_dialect(path)
|
|
123
|
-
|
|
124
150
|
if not quiet and not config_file_present(path):
|
|
125
151
|
console.print(
|
|
126
152
|
f"[yellow]No .sqlcg.toml found at {path}/.sqlcg.toml — "
|
|
@@ -144,7 +170,7 @@ def index_cmd( # noqa: B008
|
|
|
144
170
|
)
|
|
145
171
|
except KeyboardInterrupt:
|
|
146
172
|
# The backend context manager (inside _run_index) has already closed the
|
|
147
|
-
#
|
|
173
|
+
# DuckDB connection and released the lock by the time we get here.
|
|
148
174
|
console.print("\n[yellow]Interrupted — no partial graph written. Re-run to index.[/yellow]")
|
|
149
175
|
raise typer.Exit(130) from None
|
|
150
176
|
finally:
|
|
@@ -172,6 +198,132 @@ def index_cmd( # noqa: B008
|
|
|
172
198
|
)
|
|
173
199
|
|
|
174
200
|
|
|
201
|
+
def _try_route_index_via_server(
|
|
202
|
+
*,
|
|
203
|
+
path: Path,
|
|
204
|
+
dialect: str | None,
|
|
205
|
+
wait: bool,
|
|
206
|
+
quiet: bool,
|
|
207
|
+
) -> bool:
|
|
208
|
+
"""Probe for a live server and route the index through the socket if found.
|
|
209
|
+
|
|
210
|
+
Returns True if the index was handled via the server (caller should return).
|
|
211
|
+
Returns False if no server is live (caller should fall through to direct path).
|
|
212
|
+
"""
|
|
213
|
+
from sqlcg.server.control import sock_path
|
|
214
|
+
|
|
215
|
+
sp = sock_path()
|
|
216
|
+
if not sp.exists():
|
|
217
|
+
return False
|
|
218
|
+
|
|
219
|
+
payload = {
|
|
220
|
+
"op": "index",
|
|
221
|
+
"root": str(path),
|
|
222
|
+
"dialect": dialect,
|
|
223
|
+
"wait": wait,
|
|
224
|
+
"requested_by": "cli",
|
|
225
|
+
}
|
|
226
|
+
payload_bytes = json.dumps(payload).encode()
|
|
227
|
+
frame = f"{len(payload_bytes)}\n".encode() + payload_bytes
|
|
228
|
+
|
|
229
|
+
try:
|
|
230
|
+
with _socket.socket(_socket.AF_UNIX, _socket.SOCK_STREAM) as s:
|
|
231
|
+
s.settimeout(_INDEX_SOCKET_TIMEOUT_S)
|
|
232
|
+
s.connect(str(sp))
|
|
233
|
+
s.sendall(frame)
|
|
234
|
+
|
|
235
|
+
if not wait:
|
|
236
|
+
# Fire-and-forget: read one framed acknowledgement frame.
|
|
237
|
+
f = s.makefile("rb")
|
|
238
|
+
length_line = f.readline()
|
|
239
|
+
if length_line:
|
|
240
|
+
try:
|
|
241
|
+
body_len = int(length_line.strip())
|
|
242
|
+
resp_bytes = f.read(body_len)
|
|
243
|
+
resp = json.loads(resp_bytes)
|
|
244
|
+
if "error" in resp:
|
|
245
|
+
err = resp["error"]
|
|
246
|
+
if "SQLCG_DB_PATH" in err or "write lock" in err:
|
|
247
|
+
console.print(f"[red]{err}[/red]")
|
|
248
|
+
else:
|
|
249
|
+
console.print(f"[red]Server error: {err}[/red]")
|
|
250
|
+
raise typer.Exit(1)
|
|
251
|
+
if not quiet:
|
|
252
|
+
pos = resp.get("position", "?")
|
|
253
|
+
console.print(f"[green]Queued via server[/green] (position {pos})")
|
|
254
|
+
except (ValueError, json.JSONDecodeError):
|
|
255
|
+
pass
|
|
256
|
+
return True
|
|
257
|
+
|
|
258
|
+
# wait=True: stream framed frames until done:true.
|
|
259
|
+
f = s.makefile("rb")
|
|
260
|
+
with Progress(
|
|
261
|
+
SpinnerColumn(),
|
|
262
|
+
TextColumn("[progress.description]{task.description}"),
|
|
263
|
+
BarColumn(),
|
|
264
|
+
MofNCompleteColumn(),
|
|
265
|
+
TimeRemainingColumn(),
|
|
266
|
+
console=console,
|
|
267
|
+
redirect_stderr=True,
|
|
268
|
+
) as progress:
|
|
269
|
+
task = progress.add_task("Indexing via server", total=None)
|
|
270
|
+
|
|
271
|
+
while True:
|
|
272
|
+
length_line = f.readline()
|
|
273
|
+
if not length_line:
|
|
274
|
+
break
|
|
275
|
+
try:
|
|
276
|
+
body_len = int(length_line.strip())
|
|
277
|
+
except ValueError:
|
|
278
|
+
break
|
|
279
|
+
frame_bytes = f.read(body_len)
|
|
280
|
+
frame_resp = json.loads(frame_bytes)
|
|
281
|
+
|
|
282
|
+
if frame_resp.get("done"):
|
|
283
|
+
if not frame_resp.get("ok"):
|
|
284
|
+
err = frame_resp.get("error", "unknown error")
|
|
285
|
+
if "SQLCG_DB_PATH" in err or "write lock" in err:
|
|
286
|
+
console.print(f"[red]{err}[/red]")
|
|
287
|
+
else:
|
|
288
|
+
console.print(f"[red]Server index error: {err}[/red]")
|
|
289
|
+
raise typer.Exit(1)
|
|
290
|
+
srv_summary = frame_resp.get("summary", {})
|
|
291
|
+
if not quiet:
|
|
292
|
+
console.print(
|
|
293
|
+
f"[green]Indexed via server[/green] "
|
|
294
|
+
f"{srv_summary.get('files_parsed', '?')} files — "
|
|
295
|
+
f"{srv_summary.get('tables_found', '?')} tables, "
|
|
296
|
+
f"{srv_summary.get('lineage_edges_created', '?')} edges"
|
|
297
|
+
)
|
|
298
|
+
break
|
|
299
|
+
# Progress frame
|
|
300
|
+
files_done = frame_resp.get("files_done", 0)
|
|
301
|
+
files_total = frame_resp.get("files_total")
|
|
302
|
+
if files_total:
|
|
303
|
+
progress.update(task, completed=files_done, total=files_total)
|
|
304
|
+
|
|
305
|
+
return True
|
|
306
|
+
|
|
307
|
+
except TimeoutError:
|
|
308
|
+
import sys as _sys
|
|
309
|
+
|
|
310
|
+
print(
|
|
311
|
+
f"Server is still applying the index (timed out waiting after "
|
|
312
|
+
f"{_INDEX_SOCKET_TIMEOUT_S}s); the graph will update when it finishes "
|
|
313
|
+
"— check 'sqlcg mcp status'.",
|
|
314
|
+
file=_sys.stderr,
|
|
315
|
+
)
|
|
316
|
+
raise typer.Exit(0) from None
|
|
317
|
+
except (FileNotFoundError, ConnectionRefusedError, OSError):
|
|
318
|
+
# No live server — fall through to direct path.
|
|
319
|
+
return False
|
|
320
|
+
except typer.Exit:
|
|
321
|
+
raise
|
|
322
|
+
except Exception as exc:
|
|
323
|
+
console.print(f"[red]Socket routing failed: {exc}[/red]")
|
|
324
|
+
raise typer.Exit(1) from exc
|
|
325
|
+
|
|
326
|
+
|
|
175
327
|
def _run_index(
|
|
176
328
|
*,
|
|
177
329
|
path: Path,
|
|
@@ -237,10 +389,10 @@ def _run_index(
|
|
|
237
389
|
)
|
|
238
390
|
|
|
239
391
|
# Connect files to repo
|
|
392
|
+
from sqlcg.core.queries import INDEX_REPO_FILES_QUERY
|
|
240
393
|
from sqlcg.core.schema import RelType
|
|
241
394
|
|
|
242
|
-
|
|
243
|
-
file_rows = backend.run_read(files_query, {"repo_prefix": abs_path})
|
|
395
|
+
file_rows = backend.run_read(INDEX_REPO_FILES_QUERY, {"repo_prefix": abs_path})
|
|
244
396
|
for row in file_rows:
|
|
245
397
|
backend.upsert_edge(
|
|
246
398
|
NodeLabel.FILE,
|