sql-code-graph 0.3.0__py3-none-any.whl → 1.0.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {sql_code_graph-0.3.0.dist-info → sql_code_graph-1.0.1.dist-info}/METADATA +87 -9
- sql_code_graph-1.0.1.dist-info/RECORD +63 -0
- sqlcg/__init__.py +1 -1
- sqlcg/cli/commands/analyze.py +24 -0
- sqlcg/cli/commands/db.py +40 -7
- sqlcg/cli/commands/gain.py +5 -17
- sqlcg/cli/commands/git.py +71 -40
- sqlcg/cli/commands/index.py +151 -17
- sqlcg/cli/commands/install.py +147 -8
- sqlcg/cli/commands/mcp.py +12 -0
- sqlcg/cli/commands/reindex.py +170 -0
- sqlcg/cli/commands/uninstall.py +94 -39
- sqlcg/cli/commands/watch.py +14 -1
- sqlcg/cli/main.py +8 -0
- sqlcg/core/config.py +185 -2
- sqlcg/core/graph_db.py +65 -0
- sqlcg/core/kuzu_backend.py +177 -6
- sqlcg/core/neo4j_backend.py +38 -0
- sqlcg/core/queries.cypher +114 -0
- sqlcg/core/queries.py +44 -82
- sqlcg/core/schema.cypher +15 -3
- sqlcg/core/schema.py +2 -1
- sqlcg/indexer/error_classify.py +140 -0
- sqlcg/indexer/git_delta.py +121 -0
- sqlcg/indexer/indexer.py +951 -132
- sqlcg/indexer/pool.py +500 -0
- sqlcg/indexer/walker.py +1 -3
- sqlcg/indexer/watcher.py +68 -18
- sqlcg/lineage/aggregator.py +58 -2
- sqlcg/lineage/schema_resolver.py +26 -14
- sqlcg/parsers/ansi_parser.py +195 -26
- sqlcg/parsers/base.py +627 -58
- sqlcg/parsers/bigquery_parser.py +7 -2
- sqlcg/parsers/postgres_parser.py +7 -2
- sqlcg/parsers/registry.py +7 -2
- sqlcg/parsers/snowflake_parser.py +170 -8
- sqlcg/parsers/tsql_parser.py +7 -2
- sqlcg/server/models.py +297 -4
- sqlcg/server/noise_filter.py +167 -0
- sqlcg/server/skill.py +256 -0
- sqlcg/server/tools.py +934 -178
- sql_code_graph-0.3.0.dist-info/RECORD +0 -56
- {sql_code_graph-0.3.0.dist-info → sql_code_graph-1.0.1.dist-info}/WHEEL +0 -0
- {sql_code_graph-0.3.0.dist-info → sql_code_graph-1.0.1.dist-info}/entry_points.txt +0 -0
sqlcg/cli/commands/uninstall.py
CHANGED
|
@@ -28,6 +28,8 @@ def uninstall_cmd( # noqa: B008
|
|
|
28
28
|
Step 1: Remove MCP registration from ~/.claude/settings.json
|
|
29
29
|
Step 2: Optionally delete the KùzuDB graph database
|
|
30
30
|
Step 3: Remove git hook sentinel block from .git/hooks/post-checkout
|
|
31
|
+
Step 4: Remove sqlcg skill directory from ~/.claude/skills/sqlcg/ and
|
|
32
|
+
<repo>/.claude/skills/sqlcg/
|
|
31
33
|
"""
|
|
32
34
|
# Step 1: Remove MCP entry from settings.json
|
|
33
35
|
_step1_remove_mcp_entry()
|
|
@@ -44,6 +46,9 @@ def uninstall_cmd( # noqa: B008
|
|
|
44
46
|
repo_path = repo if repo else Path.cwd()
|
|
45
47
|
_step3_remove_git_hook(repo_path)
|
|
46
48
|
|
|
49
|
+
# Step 4: Remove sqlcg skill directory
|
|
50
|
+
_step4_remove_skill(repo_path)
|
|
51
|
+
|
|
47
52
|
|
|
48
53
|
def _step1_remove_mcp_entry() -> None:
|
|
49
54
|
"""Remove the 'sql-code-graph' entry from ~/.claude/settings.json."""
|
|
@@ -130,84 +135,134 @@ def _step2_delete_database(force: bool) -> None:
|
|
|
130
135
|
console.print(f"[yellow]Warning:[/yellow] Failed to delete metrics store: {e}")
|
|
131
136
|
|
|
132
137
|
|
|
133
|
-
def
|
|
134
|
-
"""
|
|
135
|
-
hook_file = repo_path / ".git" / "hooks" / "post-checkout"
|
|
138
|
+
def _strip_sentinel_block(content: str, sentinel: str) -> str:
|
|
139
|
+
"""Strip the block introduced by sentinel from hook file content.
|
|
136
140
|
|
|
137
|
-
|
|
138
|
-
|
|
139
|
-
return
|
|
141
|
+
The block starts at the sentinel line and extends until an empty line that is followed
|
|
142
|
+
by non-empty content (end-of-block), or until the end of file.
|
|
140
143
|
|
|
141
|
-
|
|
142
|
-
|
|
143
|
-
|
|
144
|
-
# Strip the sentinel block: from "# sqlcg post-checkout hook" to the end of the block
|
|
145
|
-
# The block ends when we encounter a line that doesn't start with whitespace/# or is empty
|
|
146
|
-
# followed by non-empty content
|
|
144
|
+
Returns the stripped content (may be empty if the sentinel was the only content).
|
|
145
|
+
"""
|
|
147
146
|
lines = content.split("\n")
|
|
148
147
|
filtered_lines = []
|
|
149
148
|
skip_mode = False
|
|
150
149
|
|
|
151
150
|
for i, line in enumerate(lines):
|
|
152
|
-
if
|
|
151
|
+
if sentinel in line:
|
|
153
152
|
skip_mode = True
|
|
154
153
|
continue
|
|
155
154
|
|
|
156
155
|
if skip_mode:
|
|
157
|
-
# Skip all lines that are part of the hook block
|
|
158
|
-
# The block extends from the sentinel comment until we hit an empty line
|
|
159
|
-
# followed by non-hook content, or until the end of file
|
|
160
156
|
if line.strip() == "":
|
|
161
|
-
# Check if there's content after this blank line that's not the hook
|
|
162
157
|
remaining = "\n".join(lines[i + 1 :]).strip()
|
|
163
158
|
if remaining:
|
|
164
|
-
# There's content after this blank line, so end the skip mode
|
|
165
159
|
skip_mode = False
|
|
166
|
-
filtered_lines.append("") # Preserve
|
|
167
|
-
# else: blank line
|
|
168
|
-
# else: continue skipping
|
|
160
|
+
filtered_lines.append("") # Preserve blank-line separator
|
|
161
|
+
# else: trailing blank line — skip
|
|
162
|
+
# else: continue skipping body lines
|
|
169
163
|
continue
|
|
170
164
|
|
|
171
165
|
filtered_lines.append(line)
|
|
172
166
|
|
|
173
|
-
# Reconstruct the content
|
|
174
167
|
if filtered_lines:
|
|
175
|
-
|
|
176
|
-
|
|
177
|
-
|
|
168
|
+
return "\n".join(filtered_lines).strip() + "\n"
|
|
169
|
+
return ""
|
|
170
|
+
|
|
171
|
+
|
|
172
|
+
def _remove_single_hook(repo_path: Path, filename: str, sentinel: str) -> None:
    """Strip the sqlcg sentinel block from one git hook file.

    The hook lives at ``<repo_path>/.git/hooks/<filename>``. If the file is
    missing, a notice is printed. If it cannot be read, a warning is printed.
    If it does not contain ``sentinel``, nothing happens. Otherwise the
    sentinel block is stripped; the file is deleted when only whitespace
    remains and rewritten with the remaining content otherwise.

    All filesystem failures are reported as warnings rather than raised, so
    one bad hook file does not abort the rest of the uninstall.

    Args:
        repo_path: Repository root containing the ``.git`` directory.
        filename: Hook file name, e.g. ``"post-checkout"``.
        sentinel: Marker text that opens the sqlcg block in the hook.
    """
    hook_file = repo_path / ".git" / "hooks" / filename

    if not hook_file.exists():
        console.print(f"[yellow]No {filename} hook found in {repo_path}[/yellow]")
        return

    # Reading can fail (permissions, undecodable bytes); treat it like the
    # other best-effort steps instead of crashing the command.
    try:
        content = hook_file.read_text()
    except (OSError, UnicodeError) as e:
        console.print(f"[yellow]Warning:[/yellow] Failed to read hook file: {e}")
        return

    if sentinel not in content:
        # Nothing to strip
        return

    new_content = _strip_sentinel_block(content, sentinel)
    delete_file = not new_content.strip()

    try:
        if delete_file:
            hook_file.unlink()
        else:
            hook_file.write_text(new_content)
    except Exception as e:
        action = "delete" if delete_file else "update"
        console.print(f"[yellow]Warning:[/yellow] Failed to {action} hook file: {e}")
        return

    console.print(f"[green]Removed git hook from {repo_path}/.git/hooks/{filename}[/green]")
|
|
197
204
|
|
|
198
205
|
|
|
206
|
+
# (filename, sentinel) pairs for all sqlcg-managed hooks.
# Each tuple names a file under .git/hooks/ and the marker comment that opens
# the sqlcg block inside that file.
_HOOK_SENTINELS: list[tuple[str, str]] = [
    ("post-checkout", "# sqlcg post-checkout hook"),
    ("post-merge", "# sqlcg post-merge hook"),
]
|
|
211
|
+
|
|
212
|
+
|
|
213
|
+
def _step3_remove_git_hook(repo_path: Path) -> None:
    """Strip the sqlcg block from every managed git hook in ``repo_path``.

    Walks the (filename, sentinel) table and hands each entry to
    ``_remove_single_hook``, which deletes a hook file left empty by the
    stripping and keeps any non-sqlcg content otherwise.

    Args:
        repo_path: Repository root containing the ``.git`` directory.
    """
    for hook_spec in _HOOK_SENTINELS:
        hook_name, marker = hook_spec
        _remove_single_hook(repo_path, hook_name, marker)
|
|
221
|
+
|
|
222
|
+
|
|
199
223
|
def _get_db_path() -> str | None:
    """Return the configured database path if it exists on disk.

    The path is taken from ``KuzuConfig.from_env().db_path``.

    Returns:
        The path as a string, or ``None`` when nothing exists at that path.
    """
    from sqlcg.core.config import KuzuConfig

    candidate = str(KuzuConfig.from_env().db_path)
    if not Path(candidate).exists():
        return None
    return candidate
|
|
208
229
|
|
|
209
230
|
|
|
210
231
|
def _is_kuzu_backend(db_path: str) -> bool:
|
|
211
232
|
"""Check if the database is a KùzuDB backend (not Neo4j)."""
|
|
212
233
|
backend = os.getenv("SQLCG_BACKEND", "kuzu").lower()
|
|
213
234
|
return backend in ("kuzu", "") # Default to kuzu if unset
|
|
235
|
+
|
|
236
|
+
|
|
237
|
+
# Candidate skill directory locations to remove (global first, then project-relative).
# Each entry is a callable(repo_path) -> Path resolving to the sqlcg skill dir.
# The global entry ignores repo_path; it accepts the argument only so that both
# entries can be invoked the same way.
_SKILL_DIR_TARGETS = [
    lambda repo_path: Path.home() / ".claude" / "skills" / "sqlcg",
    lambda repo_path: repo_path / ".claude" / "skills" / "sqlcg",
]
|
|
243
|
+
|
|
244
|
+
|
|
245
|
+
def _step4_remove_skill(repo_path: Path) -> None:
    """Remove the sqlcg-owned skill directory at all candidate locations.

    Iterates over the global (~/.claude/skills/sqlcg/) and project-relative
    (<repo>/.claude/skills/sqlcg/) directories. For each:
    - If the path exists, attempts removal with shutil.rmtree (errors are
      ignored), then checks the result: a green "Removed" notice is printed
      only when the path is actually gone, a yellow warning otherwise.
    - If the path does not exist, prints a yellow "not found" notice.

    Only the sqlcg/ subdirectory is ever removed — the parent skills/ dir and
    any sibling skill directories are left untouched.

    Args:
        repo_path: Repository root used to resolve the project-relative target.
    """
    any_found = False
    for target_fn in _SKILL_DIR_TARGETS:
        skill_dir = target_fn(repo_path)
        if not skill_dir.exists():
            console.print(f"[yellow]Skill directory not found:[/yellow] {skill_dir}")
            continue

        any_found = True
        shutil.rmtree(skill_dir, ignore_errors=True)

        # rmtree swallowed any error, so confirm the outcome before claiming
        # success (e.g. permission denied, or the path is a plain file).
        if skill_dir.exists():
            console.print(
                f"[yellow]Warning:[/yellow] Failed to remove skill directory: {skill_dir}"
            )
        else:
            console.print(f"[green]Removed skill directory:[/green] {skill_dir}")

    if not any_found:
        console.print("[yellow]No skill directories found — nothing to remove.[/yellow]")
|
sqlcg/cli/commands/watch.py
CHANGED
|
@@ -29,6 +29,17 @@ def watch_cmd( # noqa: B008
|
|
|
29
29
|
with get_backend() as backend:
|
|
30
30
|
backend.init_schema()
|
|
31
31
|
|
|
32
|
+
# Check schema version — must match current build
|
|
33
|
+
from sqlcg.core.schema import SCHEMA_VERSION
|
|
34
|
+
|
|
35
|
+
stored = backend.get_schema_version()
|
|
36
|
+
if stored != SCHEMA_VERSION:
|
|
37
|
+
console.print(
|
|
38
|
+
f"[red]Database schema is v{stored}; this build requires v{SCHEMA_VERSION}. "
|
|
39
|
+
"Run 'sqlcg db reset && sqlcg db init' to re-initialize.[/red]"
|
|
40
|
+
)
|
|
41
|
+
raise typer.Exit(1)
|
|
42
|
+
|
|
32
43
|
indexer = Indexer()
|
|
33
44
|
|
|
34
45
|
# Initial full index
|
|
@@ -37,7 +48,9 @@ def watch_cmd( # noqa: B008
|
|
|
37
48
|
|
|
38
49
|
spec = load_ignore_spec(path)
|
|
39
50
|
job_manager = WatchJobManager(indexer, backend, dialect)
|
|
40
|
-
handler = SqlFileEventHandler(
|
|
51
|
+
handler = SqlFileEventHandler(
|
|
52
|
+
job_manager, backend, spec, path, indexer=indexer, dialect=dialect
|
|
53
|
+
)
|
|
41
54
|
observer = Observer()
|
|
42
55
|
observer.schedule(handler, str(path), recursive=True)
|
|
43
56
|
observer.start()
|
sqlcg/cli/main.py
CHANGED
|
@@ -12,6 +12,7 @@ from sqlcg.cli.commands import (
|
|
|
12
12
|
index,
|
|
13
13
|
install,
|
|
14
14
|
mcp,
|
|
15
|
+
reindex,
|
|
15
16
|
report,
|
|
16
17
|
uninstall,
|
|
17
18
|
watch,
|
|
@@ -23,6 +24,12 @@ QUICK START:
|
|
|
23
24
|
1. sqlcg db init
|
|
24
25
|
2. sqlcg index <path> --dialect snowflake
|
|
25
26
|
3. sqlcg git install-hooks
|
|
27
|
+
4. sqlcg install --scope project # also provisions a Claude skill (SKILL.md)
|
|
28
|
+
|
|
29
|
+
USING THE MCP TOOLS:
|
|
30
|
+
Read `sqlcg mcp best-practices` first — it explains the fact/heuristic
|
|
31
|
+
boundary so heuristic output (dead-code, risk) is never reported as fact.
|
|
32
|
+
See `sqlcg mcp --help` for all MCP commands.
|
|
26
33
|
|
|
27
34
|
Note: Binary is `sqlcg`; PyPI package is `sql-code-graph`.
|
|
28
35
|
"""
|
|
@@ -38,6 +45,7 @@ app.add_typer(git.app, name="git")
|
|
|
38
45
|
|
|
39
46
|
# Register single commands
|
|
40
47
|
app.command("index")(index.index_cmd)
|
|
48
|
+
app.command("reindex")(reindex.reindex_cmd)
|
|
41
49
|
app.command("watch")(watch.watch_cmd)
|
|
42
50
|
app.command("gain")(gain.gain_cmd)
|
|
43
51
|
app.command("report")(report.report_cmd)
|
sqlcg/core/config.py
CHANGED
|
@@ -15,6 +15,10 @@ class KuzuConfig(BaseModel):
|
|
|
15
15
|
"""Configuration for KùzuDB backend."""
|
|
16
16
|
|
|
17
17
|
db_path: Path = Field(default_factory=lambda: Path.home() / ".sqlcg" / "graph.db")
|
|
18
|
+
buffer_pool_size_mb: int = Field(
|
|
19
|
+
default=0,
|
|
20
|
+
description="KuzuDB buffer pool size in MB (0 = use KuzuDB default)",
|
|
21
|
+
)
|
|
18
22
|
|
|
19
23
|
@classmethod
|
|
20
24
|
def from_env(cls) -> "KuzuConfig":
|
|
@@ -24,7 +28,11 @@ class KuzuConfig(BaseModel):
|
|
|
24
28
|
KuzuConfig instance with environment-overridden values if present.
|
|
25
29
|
"""
|
|
26
30
|
env_path = os.getenv("SQLCG_DB_PATH")
|
|
27
|
-
|
|
31
|
+
env_buf = os.getenv("SQLCG_BUFFER_POOL_MB")
|
|
32
|
+
return cls(
|
|
33
|
+
db_path=Path(env_path) if env_path else Path.home() / ".sqlcg" / "graph.db",
|
|
34
|
+
buffer_pool_size_mb=int(env_buf) if env_buf else 0,
|
|
35
|
+
)
|
|
28
36
|
|
|
29
37
|
|
|
30
38
|
class Neo4jConfig(BaseModel):
|
|
@@ -79,6 +87,178 @@ def get_dialect(path: Path) -> str:
|
|
|
79
87
|
return "snowflake"
|
|
80
88
|
|
|
81
89
|
|
|
90
|
+
def get_schema_aliases(path: Path) -> dict[str, str]:
|
|
91
|
+
"""Get schema alias mappings from .sqlcg.toml.
|
|
92
|
+
|
|
93
|
+
Reads [sqlcg.schema_aliases] and returns a lowercased staging-schema →
|
|
94
|
+
canonical-schema dict. Use this when a staging area sits under a different
|
|
95
|
+
schema but the table names are identical, e.g.::
|
|
96
|
+
|
|
97
|
+
[sqlcg.schema_aliases]
|
|
98
|
+
da_tmp = "da"
|
|
99
|
+
ba_tmp = "ba"
|
|
100
|
+
|
|
101
|
+
Any table reference whose schema part matches a key is rewritten to use the
|
|
102
|
+
canonical schema instead, so ``da_tmp.my_table`` is traced as ``da.my_table``.
|
|
103
|
+
|
|
104
|
+
Args:
|
|
105
|
+
path: Root directory to search for .sqlcg.toml
|
|
106
|
+
|
|
107
|
+
Returns:
|
|
108
|
+
Dict mapping staging schema name (lowercase) to its canonical replacement
|
|
109
|
+
"""
|
|
110
|
+
config_file = Path(path) / ".sqlcg.toml"
|
|
111
|
+
if config_file.exists():
|
|
112
|
+
try:
|
|
113
|
+
with open(config_file, "rb") as f:
|
|
114
|
+
config = tomllib.load(f)
|
|
115
|
+
raw = config.get("sqlcg", {}).get("schema_aliases", {})
|
|
116
|
+
if isinstance(raw, dict):
|
|
117
|
+
return {k.lower(): v for k, v in raw.items() if isinstance(v, str)}
|
|
118
|
+
except Exception:
|
|
119
|
+
pass
|
|
120
|
+
return {}
|
|
121
|
+
|
|
122
|
+
|
|
123
|
+
def get_noise_filter_patterns(path: Path) -> list[str]:
|
|
124
|
+
"""Get backup table ignore patterns from .sqlcg.toml.
|
|
125
|
+
|
|
126
|
+
Reads [sqlcg.noise_filter] -> ignore_table_patterns (a list of glob strings)
|
|
127
|
+
from .sqlcg.toml. Returns the list lowercased. When the key is absent,
|
|
128
|
+
returns a built-in default list::
|
|
129
|
+
|
|
130
|
+
[sqlcg.noise_filter]
|
|
131
|
+
ignore_table_patterns = ["*_bck", "*_bck_us", "*_bck_[0-9]*"]
|
|
132
|
+
|
|
133
|
+
Args:
|
|
134
|
+
path: Root directory to search for .sqlcg.toml
|
|
135
|
+
|
|
136
|
+
Returns:
|
|
137
|
+
List of glob patterns (all lowercased). Defaults to built-in backup patterns.
|
|
138
|
+
"""
|
|
139
|
+
default_patterns = [
|
|
140
|
+
"*_bck",
|
|
141
|
+
"*_bck_us",
|
|
142
|
+
"*_bck_[0-9]*",
|
|
143
|
+
"*_backup",
|
|
144
|
+
"*_backup_[0-9]*",
|
|
145
|
+
]
|
|
146
|
+
config_file = Path(path) / ".sqlcg.toml"
|
|
147
|
+
if config_file.exists():
|
|
148
|
+
try:
|
|
149
|
+
with open(config_file, "rb") as f:
|
|
150
|
+
config = tomllib.load(f)
|
|
151
|
+
raw = config.get("sqlcg", {}).get("noise_filter", {}).get("ignore_table_patterns")
|
|
152
|
+
if isinstance(raw, list):
|
|
153
|
+
return [p.lower() if isinstance(p, str) else p for p in raw]
|
|
154
|
+
except Exception:
|
|
155
|
+
pass
|
|
156
|
+
return default_patterns
|
|
157
|
+
|
|
158
|
+
|
|
159
|
+
def get_ignored_tables(path: Path) -> list[str]:
|
|
160
|
+
"""Get explicitly-ignored qualified table names from .sqlcg.toml.
|
|
161
|
+
|
|
162
|
+
Complements ``get_noise_filter_patterns`` (glob patterns) with an exact
|
|
163
|
+
qualified-name list, for specific tables that do not follow a backup naming
|
|
164
|
+
convention but should still be dropped from tool answers — e.g. a
|
|
165
|
+
load-control / delta-bookkeeping table::
|
|
166
|
+
|
|
167
|
+
[sqlcg.noise_filter]
|
|
168
|
+
ignored_tables = ["ma.rtetl_delta", "ctl.load_log"]
|
|
169
|
+
|
|
170
|
+
Names are matched exactly (case-insensitive) against ``schema.table``. The
|
|
171
|
+
lineage engine still records these as real edges; this only lets a user
|
|
172
|
+
declare them noise in config rather than baking the judgment into code.
|
|
173
|
+
|
|
174
|
+
Args:
|
|
175
|
+
path: Root directory to search for .sqlcg.toml
|
|
176
|
+
|
|
177
|
+
Returns:
|
|
178
|
+
List of qualified table names (all lowercased). Defaults to an empty list.
|
|
179
|
+
"""
|
|
180
|
+
config_file = Path(path) / ".sqlcg.toml"
|
|
181
|
+
if config_file.exists():
|
|
182
|
+
try:
|
|
183
|
+
with open(config_file, "rb") as f:
|
|
184
|
+
config = tomllib.load(f)
|
|
185
|
+
raw = config.get("sqlcg", {}).get("noise_filter", {}).get("ignored_tables")
|
|
186
|
+
if isinstance(raw, list):
|
|
187
|
+
return [t.lower() for t in raw if isinstance(t, str)]
|
|
188
|
+
except Exception:
|
|
189
|
+
pass
|
|
190
|
+
return []
|
|
191
|
+
|
|
192
|
+
|
|
193
|
+
def get_ignore_table_regexes(path: Path) -> list[str]:
|
|
194
|
+
"""Get table-exclusion regexes from .sqlcg.toml.
|
|
195
|
+
|
|
196
|
+
Complements ``get_noise_filter_patterns`` (anchored fnmatch globs) and
|
|
197
|
+
``get_ignored_tables`` (exact names) with full regular expressions, for
|
|
198
|
+
backup conventions the globs cannot express — e.g. a ``_bck`` marker that
|
|
199
|
+
can appear anywhere in the name, not just as a suffix::
|
|
200
|
+
|
|
201
|
+
[sqlcg.noise_filter]
|
|
202
|
+
ignore_table_regexes = ["_bck", "_tmp_[0-9]{8}"]
|
|
203
|
+
|
|
204
|
+
Each pattern is matched (``re.search``, case-insensitive) against the full
|
|
205
|
+
qualified ``schema.table`` name, so an unanchored ``_bck`` excludes
|
|
206
|
+
``ba.foo_bck`` and ``da.bar_bck_archive`` alike (the latter is missed by the
|
|
207
|
+
suffix-anchored ``*_bck`` glob). The
|
|
208
|
+
lineage engine still records these as real edges; this only lets a user
|
|
209
|
+
declare them noise in config rather than baking the judgment into code.
|
|
210
|
+
|
|
211
|
+
Args:
|
|
212
|
+
path: Root directory to search for .sqlcg.toml
|
|
213
|
+
|
|
214
|
+
Returns:
|
|
215
|
+
List of regex strings (kept verbatim — not lowercased, so character
|
|
216
|
+
classes survive). Defaults to an empty list.
|
|
217
|
+
"""
|
|
218
|
+
config_file = Path(path) / ".sqlcg.toml"
|
|
219
|
+
if config_file.exists():
|
|
220
|
+
try:
|
|
221
|
+
with open(config_file, "rb") as f:
|
|
222
|
+
config = tomllib.load(f)
|
|
223
|
+
raw = config.get("sqlcg", {}).get("noise_filter", {}).get("ignore_table_regexes")
|
|
224
|
+
if isinstance(raw, list):
|
|
225
|
+
return [r for r in raw if isinstance(r, str)]
|
|
226
|
+
except Exception:
|
|
227
|
+
pass
|
|
228
|
+
return []
|
|
229
|
+
|
|
230
|
+
|
|
231
|
+
def get_presentation_prefixes(path: Path) -> list[str]:
|
|
232
|
+
"""Get presentation-facing schema prefixes from .sqlcg.toml.
|
|
233
|
+
|
|
234
|
+
Reads [sqlcg.presentation] -> schema_prefixes (a list of strings) from
|
|
235
|
+
.sqlcg.toml. Returns the list lowercased. **Defaults to an empty list** when
|
|
236
|
+
the key is absent — when unset, presentation-facing detection is simply off
|
|
237
|
+
(correct generic behaviour for any user). No schema prefix is hardcoded in
|
|
238
|
+
shipped code; a DWH that wants ``ia_`` flagged must declare it::
|
|
239
|
+
|
|
240
|
+
[sqlcg.presentation]
|
|
241
|
+
schema_prefixes = ["ia_"]
|
|
242
|
+
|
|
243
|
+
Args:
|
|
244
|
+
path: Root directory to search for .sqlcg.toml
|
|
245
|
+
|
|
246
|
+
Returns:
|
|
247
|
+
List of schema prefixes (all lowercased). Defaults to an empty list.
|
|
248
|
+
"""
|
|
249
|
+
config_file = Path(path) / ".sqlcg.toml"
|
|
250
|
+
if config_file.exists():
|
|
251
|
+
try:
|
|
252
|
+
with open(config_file, "rb") as f:
|
|
253
|
+
config = tomllib.load(f)
|
|
254
|
+
raw = config.get("sqlcg", {}).get("presentation", {}).get("schema_prefixes")
|
|
255
|
+
if isinstance(raw, list):
|
|
256
|
+
return [p.lower() for p in raw if isinstance(p, str)]
|
|
257
|
+
except Exception:
|
|
258
|
+
pass
|
|
259
|
+
return []
|
|
260
|
+
|
|
261
|
+
|
|
82
262
|
def get_backend() -> "GraphBackend":
|
|
83
263
|
"""Get a graph backend instance respecting the SQLCG_BACKEND env var.
|
|
84
264
|
|
|
@@ -94,7 +274,10 @@ def get_backend() -> "GraphBackend":
|
|
|
94
274
|
from sqlcg.core.kuzu_backend import KuzuBackend
|
|
95
275
|
|
|
96
276
|
kuzu_cfg = KuzuConfig.from_env()
|
|
97
|
-
return KuzuBackend(
|
|
277
|
+
return KuzuBackend(
|
|
278
|
+
str(kuzu_cfg.db_path),
|
|
279
|
+
buffer_pool_size_mb=kuzu_cfg.buffer_pool_size_mb,
|
|
280
|
+
)
|
|
98
281
|
elif backend_type == "neo4j":
|
|
99
282
|
from sqlcg.core.neo4j_backend import Neo4jBackend
|
|
100
283
|
|
sqlcg/core/graph_db.py
CHANGED
|
@@ -67,6 +67,52 @@ class GraphBackend(ABC):
|
|
|
67
67
|
properties: Dict of properties to set/update on the relationship
|
|
68
68
|
"""
|
|
69
69
|
|
|
70
|
+
@abstractmethod
def upsert_nodes_bulk(
    self,
    label: str,
    rows: list[dict[str, Any]],
) -> None:
    """Bulk-upsert nodes of one label in a single backend round-trip.

    Each row dict must contain the primary-key field for `label` (see _pk_field)
    plus any other properties to SET. All rows must share the same property-key
    set; backends MAY raise if rows are heterogeneous (KuzuBackend does).

    Idempotent MERGE semantics, identical to upsert_node per row.

    Args:
        label: Node label (e.g., NodeLabel.COLUMN)
        rows: List of property dicts, one per node. Empty list is a no-op.
    """
|
|
88
|
+
|
|
89
|
+
@abstractmethod
def upsert_edges_bulk(
    self,
    src_label: str,
    dst_label: str,
    rel_type: str,
    rows: list[dict[str, Any]],
) -> None:
    """Bulk-upsert edges of one (src_label, rel_type, dst_label) triple.

    Each row dict must contain:
    - "src_key": source primary-key value (matches src_label _pk_field)
    - "dst_key": destination primary-key value (matches dst_label _pk_field)
    - Any additional keys are set as edge properties.

    Idempotent MERGE semantics, identical to upsert_edge per row. Rows whose
    src or dst node does not exist are silently skipped by KuzuDB's MERGE
    semantics — callers must ensure node upserts happen first within the same
    transaction (see indexer ordering rules in _upsert_parsed_file).

    Args:
        src_label: Source node label
        dst_label: Destination node label
        rel_type: Relationship type
        rows: List of edge property dicts, one per edge. Empty list is a no-op.
    """
|
|
115
|
+
|
|
70
116
|
@abstractmethod
|
|
71
117
|
def run_read(self, query: str, params: dict[str, Any]) -> list[dict[str, Any]]:
|
|
72
118
|
"""Execute a read-only query and return results.
|
|
@@ -112,6 +158,25 @@ class GraphBackend(ABC):
|
|
|
112
158
|
The schema version string, or None if not set.
|
|
113
159
|
"""
|
|
114
160
|
|
|
161
|
+
@abstractmethod
def set_indexed_sha(self, sha: str) -> None:
    """Persist the git SHA of the last successful index.

    Written by index_repo on success and by resync_changed on success.
    The stored value is read back via get_indexed_sha.

    Args:
        sha: Git commit SHA string (e.g. from git rev-parse HEAD).
    """
|
|
170
|
+
|
|
171
|
+
@abstractmethod
def get_indexed_sha(self) -> str | None:
    """Retrieve the git SHA of the last successful index.

    Counterpart of set_indexed_sha.

    Returns:
        The stored SHA string, or None if never set (repo pre-dates this
        feature, or the DB was freshly initialised).
    """
|
|
179
|
+
|
|
115
180
|
@abstractmethod
|
|
116
181
|
def close(self) -> None:
|
|
117
182
|
"""Close the database connection."""
|