sql-code-graph 0.2.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sql_code_graph-0.2.1.dist-info/METADATA +171 -0
- sql_code_graph-0.2.1.dist-info/RECORD +55 -0
- sql_code_graph-0.2.1.dist-info/WHEEL +4 -0
- sql_code_graph-0.2.1.dist-info/entry_points.txt +2 -0
- sqlcg/__init__.py +5 -0
- sqlcg/__main__.py +6 -0
- sqlcg/cli/__init__.py +1 -0
- sqlcg/cli/commands/__init__.py +1 -0
- sqlcg/cli/commands/analyze.py +93 -0
- sqlcg/cli/commands/db.py +83 -0
- sqlcg/cli/commands/find.py +63 -0
- sqlcg/cli/commands/gain.py +169 -0
- sqlcg/cli/commands/git.py +73 -0
- sqlcg/cli/commands/index.py +92 -0
- sqlcg/cli/commands/install.py +60 -0
- sqlcg/cli/commands/mcp.py +54 -0
- sqlcg/cli/commands/report.py +135 -0
- sqlcg/cli/commands/watch.py +57 -0
- sqlcg/cli/main.py +40 -0
- sqlcg/core/__init__.py +8 -0
- sqlcg/core/config.py +104 -0
- sqlcg/core/graph_db.py +179 -0
- sqlcg/core/jobs.py +105 -0
- sqlcg/core/kuzu_backend.py +269 -0
- sqlcg/core/neo4j_backend.py +195 -0
- sqlcg/core/queries.py +82 -0
- sqlcg/core/schema.cypher +104 -0
- sqlcg/core/schema.py +48 -0
- sqlcg/indexer/__init__.py +1 -0
- sqlcg/indexer/dbt_adapter.py +23 -0
- sqlcg/indexer/indexer.py +317 -0
- sqlcg/indexer/walker.py +55 -0
- sqlcg/indexer/watcher.py +195 -0
- sqlcg/lineage/__init__.py +1 -0
- sqlcg/lineage/aggregator.py +58 -0
- sqlcg/lineage/schema_resolver.py +198 -0
- sqlcg/metrics/__init__.py +5 -0
- sqlcg/metrics/store.py +273 -0
- sqlcg/parsers/__init__.py +30 -0
- sqlcg/parsers/ansi_parser.py +215 -0
- sqlcg/parsers/base.py +414 -0
- sqlcg/parsers/bigquery_parser.py +77 -0
- sqlcg/parsers/postgres_parser.py +27 -0
- sqlcg/parsers/registry.py +46 -0
- sqlcg/parsers/snowflake_parser.py +148 -0
- sqlcg/parsers/tsql_parser.py +27 -0
- sqlcg/server/__init__.py +1 -0
- sqlcg/server/exceptions.py +20 -0
- sqlcg/server/models.py +83 -0
- sqlcg/server/server.py +57 -0
- sqlcg/server/tools.py +663 -0
- sqlcg/utils/__init__.py +6 -0
- sqlcg/utils/hashing.py +18 -0
- sqlcg/utils/ignore.py +36 -0
- sqlcg/utils/logging.py +29 -0
sqlcg/indexer/indexer.py
ADDED
|
@@ -0,0 +1,317 @@
|
|
|
1
|
+
"""Main indexer orchestrating parsing and graph persistence."""
|
|
2
|
+
|
|
3
|
+
from concurrent.futures import ThreadPoolExecutor
|
|
4
|
+
from concurrent.futures import TimeoutError as FuturesTimeout
|
|
5
|
+
from pathlib import Path
|
|
6
|
+
|
|
7
|
+
from sqlcg.core.graph_db import GraphBackend
|
|
8
|
+
from sqlcg.core.queries import STALE_VIEWS_QUERY
|
|
9
|
+
from sqlcg.core.schema import NodeLabel, RelType
|
|
10
|
+
from sqlcg.indexer.walker import walk_sql_files
|
|
11
|
+
from sqlcg.lineage.aggregator import CrossFileAggregator
|
|
12
|
+
from sqlcg.lineage.schema_resolver import SchemaResolver
|
|
13
|
+
from sqlcg.parsers.base import ParsedFile
|
|
14
|
+
from sqlcg.parsers.registry import get_parser
|
|
15
|
+
from sqlcg.utils.ignore import load_ignore_spec
|
|
16
|
+
from sqlcg.utils.logging import getLogger
|
|
17
|
+
|
|
18
|
+
logger = getLogger(__name__)
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
class Indexer:
    """Orchestrates SQL file parsing and graph persistence."""

    def index_repo(
        self,
        path: Path,
        dialect: str | None,
        db: GraphBackend,
        dbt_manifest: Path | None = None,
        timeout_per_file: int = 30,
        use_git: bool = True,
    ) -> dict:
        """Run a full two-pass index of a directory tree.

        Pass 1 parses every discovered file and registers it with a
        CrossFileAggregator; pass 2 re-resolves each pass-1 result through
        the aggregator; finally every result is upserted into the graph.

        Args:
            path: Root directory to index
            dialect: SQL dialect (None for ANSI)
            db: GraphBackend instance
            dbt_manifest: Optional path to dbt manifest.json
            timeout_per_file: Timeout in seconds per file (0 = no timeout)
            use_git: When True (default), use git ls-files to restrict
                indexing to tracked files; falls back to rglob when git
                is unavailable or the directory is not a git repository.

        Returns:
            Dict with keys: files_parsed, parse_errors, tables_found,
            lineage_edges_created. ``files_parsed`` counts files that reached
            pass 2, so files that could not be read or parsed at all are
            counted only in ``parse_errors``.

        Raises:
            KeyboardInterrupt: re-raised after the pass-1 results collected
                so far have been written to the graph.
        """
        spec = load_ignore_spec(path)
        schema_resolver = SchemaResolver(dialect=dialect)
        parser = get_parser(dialect, schema_resolver)
        aggregator = CrossFileAggregator()

        files = list(walk_sql_files(path, spec, use_git=use_git))
        pass1_results: list[ParsedFile] = []
        parse_errors = 0

        # Pass 1: parse all files
        for file_path in files:
            try:
                sql = file_path.read_text(encoding="utf-8")
                parsed = self._index_single_file(parser, file_path, sql, timeout_per_file)
                aggregator.register_pass1(parsed)
                pass1_results.append(parsed)
                parse_errors += len(parsed.errors)
            except KeyboardInterrupt:
                # Persist whatever was parsed so an interrupted run is not lost.
                logger.info("SIGINT received — flushing progress")
                self._upsert_all(pass1_results, db)
                raise
            except Exception as exc:
                # Best effort: one unreadable/unparsable file must not abort the run.
                logger.warning("Failed to parse %s: %s", file_path, exc)
                parse_errors += 1

        # Optional: load dbt manifest
        if dbt_manifest:
            from sqlcg.indexer.dbt_adapter import load_dbt_manifest

            load_dbt_manifest(dbt_manifest, schema_resolver)

        # Pass 2: resolve cross-file references
        pass2_results: list[ParsedFile] = []
        for parsed in pass1_results:
            try:
                resolved = aggregator.resolve_pass2(parser, parsed)
                pass2_results.append(resolved)
            except Exception as exc:
                # Fall back to the pass-1 result rather than dropping the file.
                logger.warning("resolve_pass2 failed for %s: %s", parsed.path, exc)
                pass2_results.append(parsed)

        # Upsert all results
        tables_found = 0
        lineage_edges = 0
        for parsed in pass2_results:
            counts = self._upsert_parsed_file(parsed, db)
            tables_found += counts["tables"]
            lineage_edges += counts["edges"]

        return {
            "files_parsed": len(pass2_results),
            "parse_errors": parse_errors,
            "tables_found": tables_found,
            "lineage_edges_created": lineage_edges,
        }

    def reindex_file(
        self,
        file_path: str,
        db: GraphBackend,
        dialect: str | None,
        _visited: set[str] | None = None,
    ) -> None:
        """Re-index a single file and the files defining its dependent views.

        Args:
            file_path: Path to the file to re-index
            db: GraphBackend instance
            dialect: SQL dialect (None for ANSI)
            _visited: Internal. Paths already re-indexed in the current
                cascade; callers should leave this unset.
        """
        # reindex_file and _reindex_view_definition call each other; without
        # this guard a view whose defining file is (transitively) the file
        # being re-indexed would recurse without bound.
        visited = set() if _visited is None else _visited
        if file_path in visited:
            return
        visited.add(file_path)

        # Must be read BEFORE the file's nodes are deleted below.
        stale_views = db.run_read(STALE_VIEWS_QUERY, {"path": file_path})

        with db.transaction():
            db.delete_nodes_for_file(file_path)
            schema_resolver = SchemaResolver(dialect=dialect)
            parser = get_parser(dialect, schema_resolver)
            sql = Path(file_path).read_text(encoding="utf-8")
            parsed = parser.parse_file(Path(file_path), sql)
            self._upsert_parsed_file(parsed, db)

        for row in stale_views:
            self._reindex_view_definition(row["view_name"], db, dialect, _visited=visited)

    def _index_single_file(self, parser, path: Path, sql: str, timeout: int) -> ParsedFile:
        """Parse one file, with optional timeout.

        Args:
            parser: SqlParser instance
            path: Path to the file
            sql: SQL text
            timeout: Timeout in seconds (0 or negative = no timeout)

        Returns:
            The parser's result, or — when the timeout expires — an otherwise
            empty ParsedFile whose ``errors`` list contains ``timeout:<n>s``.
        """
        if timeout <= 0:
            return parser.parse_file(path, sql)

        # Deliberately not a `with` block: leaving `with ThreadPoolExecutor`
        # calls shutdown(wait=True), which blocks until the parse finishes and
        # therefore defeats the timeout entirely.
        executor = ThreadPoolExecutor(max_workers=1)
        try:
            future = executor.submit(parser.parse_file, path, sql)
            try:
                return future.result(timeout=timeout)
            except FuturesTimeout:
                logger.warning("Timeout parsing %s (>%ds) — skipping", path, timeout)
                out = ParsedFile(path=path, dialect=parser.DIALECT)
                out.errors.append(f"timeout:{timeout}s")
                return out
        finally:
            # A running thread cannot be killed; it is abandoned and finishes
            # in the background.
            # NOTE(review): the abandoned parse may overlap with the next
            # file's parse on the same parser object — confirm the parser
            # tolerates that.
            executor.shutdown(wait=False, cancel_futures=True)

    @staticmethod
    def _table_props(table, defined_in_file: str) -> dict:
        """Build the property dict stored on a Table node."""
        return {
            "qualified": table.full_id,
            "name": table.name,
            "catalog": table.catalog or "",
            "db": table.db or "",
            "kind": "TABLE",
            "defined_in_file": defined_in_file,
        }

    @staticmethod
    def _column_props(column) -> dict:
        """Build the property dict stored on a Column node."""
        return {
            "id": column.full_id,
            "col_name": column.name,
            "table_qualified": column.table.full_id,
            "catalog": column.table.catalog or "",
            "db": column.table.db or "",
            "table_name": column.table.name,
        }

    def _upsert_parsed_file(self, parsed: ParsedFile, db: GraphBackend) -> dict:
        """Map ParsedFile → graph nodes/edges.

        Writes one File node, one Table node (plus DEFINED_IN edge) per
        defined table, one Query node (plus QUERY_DEFINED_IN edge) per
        statement, a Table node and SELECTS_FROM edge per statement source,
        and a pair of Column nodes with a COLUMN_LINEAGE edge per lineage entry.

        Args:
            parsed: ParsedFile to upsert
            db: GraphBackend instance

        Returns:
            Dict with keys: tables (defined tables written) and edges
            (column-lineage edges written)
        """
        counts = {"tables": 0, "edges": 0}
        file_id = parsed.path_str

        # Upsert File node
        db.upsert_node(
            NodeLabel.FILE,
            file_id,
            {"path": file_id, "dialect": parsed.dialect or ""},
        )

        # Upsert defined tables
        for table in parsed.defined_tables:
            db.upsert_node(NodeLabel.TABLE, table.full_id, self._table_props(table, file_id))
            db.upsert_edge(
                NodeLabel.TABLE,
                table.full_id,
                NodeLabel.FILE,
                file_id,
                RelType.DEFINED_IN,
                {},
            )
            counts["tables"] += 1

        # Upsert query nodes
        for i, stmt in enumerate(parsed.statements):
            # Query identity is positional: "<file path>:<statement index>".
            query_id = f"{file_id}:{i}"
            db.upsert_node(
                NodeLabel.QUERY,
                query_id,
                {
                    "id": query_id,
                    "file_path": file_id,
                    "statement_index": i,
                    "sql": stmt.sql[:500],  # only the first 500 characters are stored
                    "kind": stmt.kind,
                    "target_table": stmt.target.full_id if stmt.target else "",
                    "parse_failed": stmt.parse_failed,
                    "confidence": stmt.confidence,
                    "parsing_mode": stmt.parsing_mode,
                },
            )
            db.upsert_edge(
                NodeLabel.QUERY,
                query_id,
                NodeLabel.FILE,
                file_id,
                RelType.QUERY_DEFINED_IN,
                {},
            )

            # Source table edges
            for src_table in stmt.sources:
                # NOTE(review): sources are written with defined_in_file="";
                # if upsert_node overwrites properties this could blank the
                # value set by the defining file — confirm backend semantics.
                db.upsert_node(
                    NodeLabel.TABLE,
                    src_table.full_id,
                    self._table_props(src_table, ""),
                )
                db.upsert_edge(
                    NodeLabel.QUERY,
                    query_id,
                    NodeLabel.TABLE,
                    src_table.full_id,
                    RelType.SELECTS_FROM,
                    {},
                )

            # Column lineage edges
            for edge in stmt.column_lineage:
                src_id = edge.src.full_id
                dst_id = edge.dst.full_id
                db.upsert_node(NodeLabel.COLUMN, src_id, self._column_props(edge.src))
                db.upsert_node(NodeLabel.COLUMN, dst_id, self._column_props(edge.dst))
                db.upsert_edge(
                    NodeLabel.COLUMN,
                    src_id,
                    NodeLabel.COLUMN,
                    dst_id,
                    RelType.COLUMN_LINEAGE,
                    {
                        "transform": edge.transform,
                        "confidence": edge.confidence,
                        "query_id": query_id,
                    },
                )
                counts["edges"] += 1

        return counts

    def _upsert_all(self, results: list[ParsedFile], db: GraphBackend) -> None:
        """Upsert all parsed files.

        Args:
            results: List of ParsedFile objects
            db: GraphBackend instance
        """
        for parsed in results:
            self._upsert_parsed_file(parsed, db)

    def _reindex_view_definition(
        self,
        view_name: str,
        db: GraphBackend,
        dialect: str | None,
        _visited: set[str] | None = None,
    ) -> None:
        """Re-index the file(s) that define a view.

        Args:
            view_name: Qualified view name
            db: GraphBackend instance
            dialect: SQL dialect
            _visited: Internal. Forwarded to reindex_file so a cascade never
                re-indexes the same file twice.
        """
        # NOTE(review): the labels are interpolated via f-string formatting;
        # if NodeLabel/RelType are plain (str, Enum) classes this renders the
        # member name rather than its value on Python 3.11+ — confirm they
        # are StrEnum or define __str__/__format__.
        query = (
            f"MATCH (t:{NodeLabel.TABLE} {{qualified: $name}})"
            f"-[:{RelType.DEFINED_IN}]->(f:{NodeLabel.FILE}) "
            "RETURN f.path AS path"
        )
        result = db.run_read(query, {"name": view_name})
        for row in result:
            self.reindex_file(row["path"], db, dialect, _visited=_visited)
|
sqlcg/indexer/walker.py
ADDED
|
@@ -0,0 +1,55 @@
|
|
|
1
|
+
"""SQL file walker with ignore pattern support."""
|
|
2
|
+
|
|
3
|
+
import subprocess
|
|
4
|
+
from collections.abc import Iterator
|
|
5
|
+
from pathlib import Path
|
|
6
|
+
|
|
7
|
+
import pathspec
|
|
8
|
+
|
|
9
|
+
from sqlcg.utils.ignore import is_ignored
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
def _git_sql_files(root: Path) -> list[Path] | None:
|
|
13
|
+
"""Return tracked .sql files via git ls-files, or None if git unavailable."""
|
|
14
|
+
try:
|
|
15
|
+
result = subprocess.run(
|
|
16
|
+
["git", "ls-files", "--cached"],
|
|
17
|
+
cwd=root,
|
|
18
|
+
capture_output=True,
|
|
19
|
+
text=True,
|
|
20
|
+
check=True,
|
|
21
|
+
)
|
|
22
|
+
return [root / f for f in result.stdout.splitlines() if f.endswith(".sql")]
|
|
23
|
+
except (subprocess.CalledProcessError, OSError):
|
|
24
|
+
return None
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
def walk_sql_files(
    root: Path, spec: pathspec.PathSpec, use_git: bool = True
) -> Iterator[Path]:
    """Yield the .sql files under ``root`` that the ignore spec does not exclude.

    With ``use_git`` enabled and a usable git listing, only tracked files
    that still exist on disk are considered, which keeps untracked
    directories out of the result. Otherwise the tree is searched
    recursively for ``*.sql``.

    Args:
        root: Root directory to walk
        spec: PathSpec object with ignore patterns
        use_git: Use git ls-files instead of rglob (default True)

    Yields:
        Path objects for .sql files not matching ignore patterns
    """
    tracked = _git_sql_files(root) if use_git else None

    if tracked is not None:
        # A tracked file may have been deleted from the working tree.
        yield from (
            candidate
            for candidate in tracked
            if candidate.exists() and not is_ignored(candidate, root, spec)
        )
        return

    # git disabled or unavailable: fall back to a recursive glob.
    for candidate in root.rglob("*.sql"):
        if is_ignored(candidate, root, spec):
            continue
        yield candidate
|
sqlcg/indexer/watcher.py
ADDED
|
@@ -0,0 +1,195 @@
|
|
|
1
|
+
"""File system watcher for SQL file changes."""
|
|
2
|
+
|
|
3
|
+
import subprocess
|
|
4
|
+
import threading
|
|
5
|
+
import time
|
|
6
|
+
from pathlib import Path
|
|
7
|
+
|
|
8
|
+
import pathspec
|
|
9
|
+
from watchdog.events import FileSystemEventHandler
|
|
10
|
+
|
|
11
|
+
from sqlcg.utils.ignore import is_ignored
|
|
12
|
+
from sqlcg.utils.logging import getLogger
|
|
13
|
+
|
|
14
|
+
logger = getLogger(__name__)
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
class SqlFileEventHandler(FileSystemEventHandler):
    """Watchdog event handler for SQL file changes."""

    def __init__(self, job_manager, db, ignore_spec: pathspec.PathSpec, root: Path, indexer=None):
        """Initialize the event handler.

        Args:
            job_manager: WatchJobManager instance
            db: GraphBackend instance
            ignore_spec: PathSpec with ignore patterns
            root: Root directory being watched
            indexer: Indexer instance (used by BranchMonitor). When given, a
                BranchMonitor thread is created and started immediately.
        """
        super().__init__()
        self._jobs = job_manager
        self._db = db
        self._spec = ignore_spec
        self._root = root
        # Create and start BranchMonitor if indexer is provided
        self._branch_monitor: BranchMonitor | None = None
        if indexer is not None:
            self._branch_monitor = BranchMonitor(root, job_manager, indexer, db)
            self._branch_monitor.start()

    def _is_sql(self, path: str | bytes) -> bool:
        """Check if a path is a non-ignored SQL file.

        Args:
            path: File path to check (str or bytes from watchdog)

        Returns:
            True if the path is a .sql file not matching ignore patterns
        """
        if isinstance(path, bytes):
            path = path.decode("utf-8")
        path_obj = Path(path)
        return path_obj.suffix == ".sql" and not is_ignored(path_obj, self._root, self._spec)

    def on_modified(self, event):
        """Schedule a re-index when a SQL file is modified.

        Args:
            event: Watchdog event object
        """
        if not event.is_directory and self._is_sql(event.src_path):
            self._jobs.schedule(event.src_path)

    def on_created(self, event):
        """Schedule an index when a SQL file is created.

        Args:
            event: Watchdog event object
        """
        if not event.is_directory and self._is_sql(event.src_path):
            self._jobs.schedule(event.src_path)

    def on_moved(self, event):
        """Handle a file move: drop the old path's nodes, index the new path.

        Args:
            event: Watchdog event object
        """
        if event.is_directory:
            return
        # The old path no longer exists after a move, so treat it like a
        # deletion; otherwise its nodes stay in the graph indefinitely.
        if self._is_sql(event.src_path):
            self._db.delete_nodes_for_file(event.src_path)
        if self._is_sql(event.dest_path):
            self._jobs.schedule(event.dest_path)

    def on_deleted(self, event):
        """Remove a deleted SQL file's nodes from the graph.

        Args:
            event: Watchdog event object
        """
        if not event.is_directory and self._is_sql(event.src_path):
            self._db.delete_nodes_for_file(event.src_path)
|
|
90
|
+
|
|
91
|
+
|
|
92
|
+
class BranchMonitor(threading.Thread):
|
|
93
|
+
"""Background thread that detects branch changes and triggers full resyncs.
|
|
94
|
+
|
|
95
|
+
Polls `git rev-parse --abbrev-ref HEAD` every 2 seconds. When the branch
|
|
96
|
+
changes, pauses the job manager, runs a full reindex, then resumes and
|
|
97
|
+
drains queued file events.
|
|
98
|
+
"""
|
|
99
|
+
|
|
100
|
+
def __init__(
|
|
101
|
+
self, watched_path: Path, job_manager, indexer, db, _poll_interval: float = 2.0
|
|
102
|
+
):
|
|
103
|
+
"""Initialize the branch monitor.
|
|
104
|
+
|
|
105
|
+
Args:
|
|
106
|
+
watched_path: Path being watched (used to find git root)
|
|
107
|
+
job_manager: WatchJobManager instance
|
|
108
|
+
indexer: Indexer instance
|
|
109
|
+
db: GraphBackend instance
|
|
110
|
+
_poll_interval: Polling interval in seconds (for testing)
|
|
111
|
+
"""
|
|
112
|
+
# daemon=False ensures that if index_repo() is in-flight when shutdown is requested,
|
|
113
|
+
# the process will wait (via join(timeout=5) in watch.py) up to 5 seconds before exiting.
|
|
114
|
+
# This avoids data loss from killing an in-progress resync.
|
|
115
|
+
super().__init__(daemon=False)
|
|
116
|
+
self._watched_path = watched_path
|
|
117
|
+
self._job_manager = job_manager
|
|
118
|
+
self._indexer = indexer
|
|
119
|
+
self._db = db
|
|
120
|
+
self._stop_event = threading.Event()
|
|
121
|
+
self._current_branch: str | None = None
|
|
122
|
+
self._poll_interval = _poll_interval
|
|
123
|
+
|
|
124
|
+
def run(self) -> None:
|
|
125
|
+
"""Poll git branch and trigger resync on change."""
|
|
126
|
+
while not self._stop_event.is_set():
|
|
127
|
+
try:
|
|
128
|
+
branch = self._get_current_branch()
|
|
129
|
+
if branch is not None and branch != self._current_branch:
|
|
130
|
+
logger.debug(
|
|
131
|
+
"Branch change detected: %s -> %s", self._current_branch, branch
|
|
132
|
+
)
|
|
133
|
+
self._current_branch = branch
|
|
134
|
+
self._on_branch_change()
|
|
135
|
+
except subprocess.CalledProcessError:
|
|
136
|
+
# Not a git repo or git not available
|
|
137
|
+
logger.debug("Could not get current branch (not a git repo or git unavailable)")
|
|
138
|
+
self._stop_event.set()
|
|
139
|
+
break
|
|
140
|
+
except Exception as exc:
|
|
141
|
+
logger.debug("BranchMonitor error: %s", exc)
|
|
142
|
+
|
|
143
|
+
# Sleep in small increments to allow quick shutdown
|
|
144
|
+
for _ in range(int(self._poll_interval * 10)):
|
|
145
|
+
if self._stop_event.is_set():
|
|
146
|
+
break
|
|
147
|
+
time.sleep(0.1)
|
|
148
|
+
|
|
149
|
+
def _get_current_branch(self) -> str | None:
|
|
150
|
+
"""Get the current git branch name.
|
|
151
|
+
|
|
152
|
+
Returns:
|
|
153
|
+
Branch name, or None if git command fails
|
|
154
|
+
|
|
155
|
+
Raises:
|
|
156
|
+
subprocess.CalledProcessError if git command fails
|
|
157
|
+
"""
|
|
158
|
+
result = subprocess.run(
|
|
159
|
+
["git", "rev-parse", "--abbrev-ref", "HEAD"],
|
|
160
|
+
cwd=str(self._watched_path),
|
|
161
|
+
capture_output=True,
|
|
162
|
+
text=True,
|
|
163
|
+
check=True,
|
|
164
|
+
)
|
|
165
|
+
return result.stdout.strip()
|
|
166
|
+
|
|
167
|
+
def _on_branch_change(self) -> None:
|
|
168
|
+
"""Handle branch change: pause, resync, resume, drain queue."""
|
|
169
|
+
# Pause new file events
|
|
170
|
+
self._job_manager.set_paused(True)
|
|
171
|
+
|
|
172
|
+
# Cancel pending file timers
|
|
173
|
+
self._job_manager.cancel_all()
|
|
174
|
+
|
|
175
|
+
# Run full resync
|
|
176
|
+
try:
|
|
177
|
+
self._indexer.index_repo(self._watched_path, dialect=None, db=self._db)
|
|
178
|
+
except Exception as exc:
|
|
179
|
+
logger.error("Branch change resync failed: %s", exc)
|
|
180
|
+
finally:
|
|
181
|
+
# Resume and drain queued events
|
|
182
|
+
self._job_manager.set_paused(False)
|
|
183
|
+
self._job_manager.drain_queued()
|
|
184
|
+
|
|
185
|
+
def stop(self) -> None:
|
|
186
|
+
"""Signal the thread to stop."""
|
|
187
|
+
self._stop_event.set()
|
|
188
|
+
|
|
189
|
+
def join(self, timeout: float | None = None) -> None:
|
|
190
|
+
"""Wait for the thread to stop.
|
|
191
|
+
|
|
192
|
+
Args:
|
|
193
|
+
timeout: Maximum time to wait in seconds
|
|
194
|
+
"""
|
|
195
|
+
super().join(timeout=timeout)
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
"""Lineage resolution and analysis module."""
|
|
@@ -0,0 +1,58 @@
|
|
|
1
|
+
"""Cross-file lineage aggregator for two-pass resolution."""
|
|
2
|
+
|
|
3
|
+
from sqlcg.parsers.base import ParsedFile
|
|
4
|
+
from sqlcg.utils.logging import getLogger
|
|
5
|
+
|
|
6
|
+
logger = getLogger(__name__)
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
class CrossFileAggregator:
    """Collects pass-1 parse results so pass 2 can resolve across files.

    Pass 1: every parsed file is registered under the ids of the tables it
    defines.
    Pass 2: each file is parsed again after the collected mapping has been
    handed to the parser's schema object.
    """

    def __init__(self) -> None:
        """Create an aggregator with an empty source map."""
        # table.full_id -> the ParsedFile that defines that table
        self.sources: dict[str, ParsedFile] = {}

    def register_pass1(self, parsed: ParsedFile) -> None:
        """Record ``parsed`` as the defining file of each table it declares.

        A table id registered earlier is overwritten by a later registration.

        Args:
            parsed: ParsedFile from pass 1
        """
        self.sources.update({table.full_id: parsed for table in parsed.defined_tables})

    def resolve_pass2(self, parser, parsed: ParsedFile) -> ParsedFile:
        """Parse a file again with the cross-file source map available.

        Args:
            parser: SqlParser instance
            parsed: ParsedFile from pass 1

        Returns:
            The result of re-parsing the file, or ``parsed`` itself when the
            file can no longer be read from disk.

        Raises:
            OS-level read errors are logged as WARNING and not raised; any
            other error (for example from decoding or from the parser)
            propagates to the caller.
        """
        # Hand the full table -> defining-file map to the parser's schema.
        parser._schema.add_view_sources(self.sources)

        source_path = parsed.path
        try:
            sql_text = source_path.read_text(encoding="utf-8")
        except OSError as err:  # FileNotFoundError is an OSError subclass
            logger.warning(
                "resolve_pass2: cannot re-read %s (%s) — returning pass-1 result",
                source_path,
                err,
            )
            return parsed

        return parser.parse_file(source_path, sql_text)
|