sqlprism 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,677 @@
1
+ """Indexer orchestrator.
2
+
3
+ The only component that connects parsers to storage. It:
4
+ 1. Scans repos for SQL files
5
+ 2. Checksums files and compares against stored checksums
6
+ 3. Determines the dialect per file (repo default or path-based override)
7
+ 4. Calls SqlParser with the appropriate dialect
8
+ 5. Resolves edge references (name/kind → node IDs)
9
+ 6. Inserts results into DuckDB via GraphDB
10
+ """
11
+
12
+ import fnmatch
13
+ import hashlib
14
+ import logging
15
+ import subprocess
16
+ from collections import OrderedDict
17
+ from pathlib import Path
18
+
19
+ from sqlprism.core.graph import GraphDB
20
+ from sqlprism.languages import SQL_EXTENSIONS, is_sql_file
21
+ from sqlprism.languages.sql import SqlParser
22
+ from sqlprism.types import ParseResult
23
+
24
+ logger = logging.getLogger(__name__)
25
+
26
+
27
class Indexer:
    """Orchestrates parsing and indexing across repos.

    Connects language parsers to the ``GraphDB`` storage layer. Handles
    file scanning, checksum diffing, dialect resolution, and batch
    insertion of parse results. Supports plain SQL repos, sqlmesh
    projects, and dbt projects.
    """

    # Maximum number of entries kept in the stat pre-filter cache; once
    # exceeded, the oldest inserted entry is evicted.
    _STAT_CACHE_MAX_ENTRIES = 10_000

    # Path components that mark a file as non-source. Dot-prefixed
    # components are rejected separately in ``_scan_files``.
    _SKIPPED_PATH_PARTS = frozenset(
        {"node_modules", "__pycache__", "venv", ".venv", "target", "build"}
    )

    def __init__(self, graph: GraphDB):
        """Initialise the indexer.

        Args:
            graph: The ``GraphDB`` instance to write parsed data into.
        """
        self.graph = graph
        # One parser per dialect (``None`` = no explicit dialect).
        self._parser_cache: dict[str | None, SqlParser] = {}
        # Renderers are created lazily so their modules are only imported
        # when a sqlmesh/dbt project is actually indexed.
        self._sqlmesh_renderer = None
        self._dbt_renderer = None
        # Stat-based pre-filter cache: abs_path -> (mtime, size, checksum)
        # Avoids re-reading file bytes when mtime+size are unchanged.
        self._file_stat_cache: OrderedDict[str, tuple[float, int, str]] = OrderedDict()

    def get_parser(self, dialect: str | None = None) -> SqlParser:
        """Get or create a SqlParser for the given dialect.

        Args:
            dialect: SQL dialect name, or ``None`` for the parser default.

        Returns:
            The cached ``SqlParser`` for ``dialect``, created on first use.
        """
        if dialect not in self._parser_cache:
            self._parser_cache[dialect] = SqlParser(dialect=dialect)
        return self._parser_cache[dialect]

    def get_sqlmesh_renderer(self, dialect: str | None = None):
        """Get or create a SqlMeshRenderer with the correct dialect parser.

        The renderer is rebuilt when none exists yet, or when a dialect is
        given that differs from the current renderer's parser dialect.
        """
        from sqlprism.languages.sqlmesh import SqlMeshRenderer

        current = self._sqlmesh_renderer
        if current is None or (dialect and current.sql_parser.dialect != dialect):
            self._sqlmesh_renderer = SqlMeshRenderer(sql_parser=self.get_parser(dialect))
        return self._sqlmesh_renderer

    @property
    def dbt_renderer(self):
        """Lazily created ``DbtRenderer`` backed by the default-dialect parser."""
        if self._dbt_renderer is None:
            from sqlprism.languages.dbt import DbtRenderer

            self._dbt_renderer = DbtRenderer(sql_parser=self.get_parser())
        return self._dbt_renderer

    def reindex_repo(
        self,
        name: str,
        path: str | Path,
        dialect: str | None = None,
        dialect_overrides: dict[str, str] | None = None,
    ) -> dict:
        """Reindex a single repo by scanning for SQL files.

        Compares file checksums against the stored index to determine
        added, changed, and deleted files. Only changed files are re-parsed.
        File contents are decoded as UTF-8 with undecodable bytes replaced,
        so the result does not depend on the machine's locale encoding.

        Args:
            name: Repo name in the index.
            path: Absolute path to the repo root.
            dialect: Default SQL dialect (e.g. ``"starrocks"``, ``"athena"``).
            dialect_overrides: Per-path dialect overrides as
                ``{glob_pattern: dialect}``, e.g.
                ``{"athena/": "athena", "starrocks/**": "starrocks"}``.

        Returns:
            Stats dict with keys ``files_scanned``, ``files_added``,
            ``files_changed``, ``files_removed``, ``nodes_added``,
            ``edges_added``, ``column_usage_added``, ``lineage_chains``,
            ``column_usage_dropped``, ``parse_errors``, and
            ``phantoms_cleaned``.
        """
        path = Path(path).resolve()
        repo_id = self.graph.upsert_repo(name, str(path))

        # Checksums currently stored in the index vs. what is on disk now.
        stored_checksums = self.graph.get_file_checksums(repo_id)
        current_files = self._scan_files(path)

        # Classify every on-disk file as added, changed, or untouched.
        added: list[str] = []
        changed: list[str] = []
        for rel_path, checksum in current_files.items():
            if rel_path not in stored_checksums:
                added.append(rel_path)
            elif stored_checksums[rel_path] != checksum:
                changed.append(rel_path)

        deleted = [p for p in stored_checksums if p not in current_files]

        stats = {
            "files_scanned": len(current_files),
            "files_added": len(added),
            "files_changed": len(changed),
            "files_removed": len(deleted),
            "nodes_added": 0,
            "edges_added": 0,
            "column_usage_added": 0,
            "lineage_chains": 0,
            "column_usage_dropped": 0,
            "parse_errors": [],
        }

        # Delete truly removed files in a single transaction.
        if deleted:
            with self.graph.write_transaction():
                for rel_path in deleted:
                    self.graph.delete_file_data(repo_id, rel_path)

        # Build schema catalog from existing index for SELECT * expansion.
        schema_catalog = self.graph.get_table_columns(repo_id) or None

        # Changed + added files: delete old + insert new in the same
        # transaction so a crash never leaves a file in a "deleted but not
        # yet reinserted" state.
        changed_set = set(changed)
        for rel_path in changed + added:
            file_dialect = _resolve_dialect(rel_path, dialect, dialect_overrides)
            parser = self.get_parser(file_dialect)

            full_path = path / rel_path
            try:
                # Explicit UTF-8: the default would be the locale encoding,
                # which differs between machines.
                content = full_path.read_text(encoding="utf-8", errors="replace")
            except OSError:  # includes PermissionError
                logger.warning("Cannot read file %s — skipping", full_path)
                stats["parse_errors"].append(f"{rel_path}: unreadable (OS/permission error)")
                continue
            checksum = current_files[rel_path]

            # Parse — pass schema catalog for SELECT * lineage expansion.
            result = parser.parse(rel_path, content, schema=schema_catalog)
            for err in result.errors or ():
                stats["parse_errors"].append(f"{rel_path}: {err}")

            # Per-file delete + insert is atomic.
            with self.graph.write_transaction():
                if rel_path in changed_set:
                    self.graph.delete_file_data(repo_id, rel_path)
                file_id = self.graph.insert_file(repo_id, rel_path, "sql", checksum)
                self._insert_parse_result(result, file_id, repo_id, stats)

        # Clean up phantom nodes that now have real counterparts.
        stats["phantoms_cleaned"] = self.graph.cleanup_phantoms()

        # Update repo metadata.
        commit, branch = self._get_git_info(path)
        self.graph.update_repo_metadata(repo_id, commit=commit, branch=branch)

        self.graph.clear_snippet_cache()
        return stats

    def reindex_sqlmesh(
        self,
        repo_name: str,
        project_path: str | Path,
        env_file: str | Path | None = None,
        variables: dict[str, str | int] | None = None,
        dialect: str = "athena",
        sqlmesh_command: str = "uv run python",
        venv_dir: str | Path | None = None,
    ) -> dict:
        """Index a sqlmesh project by rendering all models first.

        Uses ``SqlMeshRenderer`` to render every model via subprocess,
        then parses the rendered SQL and inserts results into the graph.

        Args:
            repo_name: Repo name in the index.
            project_path: Path to the sqlmesh project directory
                (containing ``config.yaml``).
            env_file: Optional ``.env`` file to source before rendering.
            variables: Extra sqlmesh variables (e.g. ``{"GRACE_PERIOD": 7}``).
            dialect: SQL dialect for rendering (default ``"athena"``).
            sqlmesh_command: Command to invoke Python in the sqlmesh venv.
            venv_dir: Directory containing ``.venv``. Auto-detected if not set.

        Returns:
            Stats dict with keys ``models_rendered``, ``nodes_added``,
            ``edges_added``, ``column_usage_added``, and ``lineage_chains``.
        """
        project_path = Path(project_path).resolve()
        repo_id = self.graph.upsert_repo(repo_name, str(project_path))

        # Build schema catalog from existing index for SELECT * expansion.
        schema_catalog = self.graph.get_table_columns(repo_id) or None

        rendered = self.get_sqlmesh_renderer(dialect).render_project(
            project_path=project_path,
            env_file=env_file,
            variables=variables,
            dialect=dialect,
            sqlmesh_command=sqlmesh_command,
            venv_dir=venv_dir,
            schema_catalog=schema_catalog,
        )

        stats = {
            "models_rendered": len(rendered),
            "nodes_added": 0,
            "edges_added": 0,
            "column_usage_added": 0,
            "lineage_chains": 0,
        }

        # NOTE(review): only models present in ``rendered`` are touched here;
        # entries for models that no longer exist in the project are not
        # removed by this method — confirm whether that is handled elsewhere.
        for model_name, result in rendered.items():
            # '"schema"."model"' -> 'schema/model.sql'
            clean_name = model_name.strip('"').replace('"."', "/")
            file_path = clean_name + ".sql"

            # Per-model delete + insert is atomic.
            with self.graph.write_transaction():
                self.graph.delete_file_data(repo_id, file_path)
                checksum = _checksum_parse_result(result)
                file_id = self.graph.insert_file(repo_id, file_path, "sql", checksum)
                self._insert_parse_result(result, file_id, repo_id, stats)

        commit, branch = self._get_git_info(project_path)
        self.graph.update_repo_metadata(repo_id, commit=commit, branch=branch)

        self.graph.clear_snippet_cache()
        return stats

    def reindex_dbt(
        self,
        repo_name: str,
        project_path: str | Path,
        profiles_dir: str | Path | None = None,
        env_file: str | Path | None = None,
        target: str | None = None,
        dbt_command: str = "uv run dbt",
        venv_dir: str | Path | None = None,
        dialect: str | None = None,
    ) -> dict:
        """Index a dbt project by compiling all models first.

        Runs ``dbt compile`` via ``DbtRenderer``, then parses each
        compiled SQL file and inserts results into the graph.

        Args:
            repo_name: Repo name in the index.
            project_path: Path to the dbt project directory
                (containing ``dbt_project.yml``).
            profiles_dir: Path to the directory containing ``profiles.yml``.
            env_file: Optional ``.env`` file to source before compilation.
            target: dbt target name override.
            dbt_command: Command to invoke dbt (e.g. ``"uv run dbt"``).
            venv_dir: Directory to run from (where ``.venv`` lives).
            dialect: SQL dialect for parsing compiled output.

        Returns:
            Stats dict with keys ``models_compiled``, ``nodes_added``,
            ``edges_added``, ``column_usage_added``, and ``lineage_chains``.
        """
        project_path = Path(project_path).resolve()
        repo_id = self.graph.upsert_repo(repo_name, str(project_path))

        # Build schema catalog from existing index for SELECT * expansion.
        schema_catalog = self.graph.get_table_columns(repo_id) or None

        rendered = self.dbt_renderer.render_project(
            project_path=project_path,
            profiles_dir=profiles_dir,
            env_file=env_file,
            target=target,
            dbt_command=dbt_command,
            venv_dir=venv_dir,
            dialect=dialect,
            schema_catalog=schema_catalog,
        )

        stats = {
            "models_compiled": len(rendered),
            "nodes_added": 0,
            "edges_added": 0,
            "column_usage_added": 0,
            "lineage_chains": 0,
        }

        # NOTE(review): only models present in ``rendered`` are touched here;
        # entries for models that no longer exist in the project are not
        # removed by this method — confirm whether that is handled elsewhere.
        for model_path, result in rendered.items():
            # Per-model delete + insert is atomic.
            with self.graph.write_transaction():
                self.graph.delete_file_data(repo_id, model_path)
                checksum = _checksum_parse_result(result)
                file_id = self.graph.insert_file(repo_id, model_path, "sql", checksum)
                self._insert_parse_result(result, file_id, repo_id, stats)

        commit, branch = self._get_git_info(project_path)
        self.graph.update_repo_metadata(repo_id, commit=commit, branch=branch)

        self.graph.clear_snippet_cache()
        return stats

    def _insert_parse_result(
        self,
        result: ParseResult,
        file_id: int,
        repo_id: int,
        stats: dict,
    ) -> None:
        """Insert nodes, edges, column usage, and lineage from a ParseResult.

        Shared by reindex_repo, reindex_sqlmesh, and reindex_dbt.
        Uses batch inserts for performance. Updates stats dict in-place.

        Args:
            result: Parse output for one file/model.
            file_id: ID of the file row the data belongs to.
            repo_id: ID of the owning repo, used to resolve cross-file nodes.
            stats: Running counters; ``nodes_added``, ``edges_added``,
                ``column_usage_added`` and ``lineage_chains`` must exist,
                ``column_usage_dropped`` is created on demand.
        """
        import json

        # ── Batch insert nodes ──
        # Key includes schema to avoid collisions between staging.orders
        # and production.orders.
        node_id_map: dict[tuple[str, str, str | None], int] = {}
        if result.nodes:
            node_rows = []
            node_keys = []
            for node in result.nodes:
                metadata = node.metadata
                schema = metadata.get("schema") if metadata else None
                node_rows.append(
                    (
                        file_id,
                        node.kind,
                        node.name,
                        result.language,
                        node.line_start,
                        node.line_end,
                        json.dumps(metadata) if metadata else None,
                        schema,
                    )
                )
                node_keys.append((node.name, node.kind, schema))
            node_ids = self.graph.insert_nodes_batch(node_rows)
            node_id_map.update(zip(node_keys, node_ids))
            stats["nodes_added"] += len(result.nodes)

        # ── Batch insert edges ──
        if result.edges:
            edge_rows = []
            for edge in result.edges:
                metadata = edge.metadata
                source_id = self._resolve_edge_endpoint(
                    edge.source_name,
                    edge.source_kind,
                    node_id_map,
                    repo_id,
                    schema=metadata.get("source_schema") if metadata else None,
                )
                target_id = self._resolve_edge_endpoint(
                    edge.target_name,
                    edge.target_kind,
                    node_id_map,
                    repo_id,
                    schema=metadata.get("target_schema") if metadata else None,
                )
                edge_rows.append(
                    (
                        source_id,
                        target_id,
                        edge.relationship,
                        edge.context,
                        json.dumps(metadata) if metadata else None,
                    )
                )
            self.graph.insert_edges_batch(edge_rows)
            stats["edges_added"] += len(edge_rows)

        # ── Batch insert column usage ──
        if result.column_usage:
            # First node inserted for each (name, kind), whatever its schema.
            # Precomputed once so the fallback below is a dict lookup rather
            # than a scan of node_id_map per column usage.
            first_by_name_kind: dict[tuple[str, str], int] = {}
            for (node_name, node_kind, _schema), nid in node_id_map.items():
                first_by_name_kind.setdefault((node_name, node_kind), nid)

            cu_rows = []
            for cu in result.column_usage:
                # 1) schema-less node from this file, 2) same name/kind under
                # any schema from this file, 3) a node already in the index.
                cu_node_id = node_id_map.get((cu.node_name, cu.node_kind, None))
                if cu_node_id is None:
                    cu_node_id = first_by_name_kind.get((cu.node_name, cu.node_kind))
                if cu_node_id is None:
                    # resolve_node's "not found" value is not visible from
                    # here, so any falsy result is treated as a miss.
                    cu_node_id = self.graph.resolve_node(cu.node_name, cu.node_kind, repo_id) or None

                if cu_node_id is None:
                    stats["column_usage_dropped"] = stats.get("column_usage_dropped", 0) + 1
                    logger.warning(
                        "Dropped column_usage: node %s/%s not found (table=%s col=%s)",
                        cu.node_name,
                        cu.node_kind,
                        cu.table_name,
                        cu.column_name,
                    )
                    continue

                cu_rows.append(
                    (
                        cu_node_id,
                        cu.table_name,
                        cu.column_name,
                        cu.usage_type,
                        file_id,
                        cu.alias,
                        cu.transform,
                    )
                )
            if cu_rows:
                self.graph.insert_column_usage_batch(cu_rows)
                stats["column_usage_added"] += len(cu_rows)

        # ── Batch insert column lineage ──
        if result.column_lineage:
            lineage_rows = []
            # chain_index per (output_node, output_column) disambiguates
            # multiple lineage paths that end in the same output column.
            chain_counters: dict[tuple[str, str], int] = {}
            for cl in result.column_lineage:
                key = (cl.output_node, cl.output_column)
                chain_idx = chain_counters.get(key, 0)
                chain_counters[key] = chain_idx + 1
                for hop_index, hop in enumerate(cl.chain):
                    lineage_rows.append(
                        (
                            file_id,
                            cl.output_node,
                            cl.output_column,
                            chain_idx,
                            hop_index,
                            hop.column,
                            hop.table,
                            hop.expression,
                        )
                    )
                stats["lineage_chains"] += 1
            if lineage_rows:
                self.graph.insert_column_lineage_batch(lineage_rows)

    def _resolve_edge_endpoint(
        self,
        name: str,
        kind: str,
        local_map: dict[tuple[str, str, str | None], int],
        repo_id: int,
        schema: str | None = None,
    ) -> int:
        """Resolve an edge endpoint to a node_id.

        Lookup order: nodes from the current file with the given schema,
        nodes from the current file without a schema, nodes already in the
        index, and finally a phantom node created (or reused) for the name.

        Args:
            name: Node name of the endpoint.
            kind: Node kind of the endpoint.
            local_map: ``(name, kind, schema) -> node_id`` for the nodes just
                inserted from the current file.
            repo_id: Repo to search when the node is not local.
            schema: Optional schema qualifier of the endpoint.

        Returns:
            The resolved (or newly created phantom) node ID.
        """
        # ``is None`` rather than truthiness so a local ID of 0 is a hit.
        node_id = local_map.get((name, kind, schema))
        if node_id is None and schema:
            node_id = local_map.get((name, kind, None))
        if node_id is not None:
            return node_id

        # resolve_node's "not found" value is not visible from here, so any
        # falsy result is treated as a miss.
        node_id = self.graph.resolve_node(name, kind, repo_id, schema=schema)
        if node_id:
            return node_id

        return self.graph.get_or_create_phantom(name, kind, "sql")

    def _scan_files(self, repo_path: Path) -> dict[str, str]:
        """Scan a repo directory for SQL files. Returns {relative_path: sha256}.

        Uses mtime + size as a pre-filter: if both match the cached values
        from a previous scan, the stored checksum is reused without reading
        the file contents. Files that cannot be stat'ed or read are logged
        and left out of the result.
        """
        result: dict[str, str] = {}

        for file_path in repo_path.rglob("*"):
            # Cheap string checks first; is_file() costs a stat call.
            if file_path.suffix not in SQL_EXTENSIONS:
                continue
            relative = file_path.relative_to(repo_path)
            # Skip hidden entries and common non-source directories.
            if any(part.startswith(".") or part in self._SKIPPED_PATH_PARTS for part in relative.parts):
                continue
            if not file_path.is_file():
                continue

            rel_path = str(relative)
            abs_key = str(file_path)

            try:
                st = file_path.stat()
            except OSError:
                logger.warning("Cannot stat file %s — skipping", file_path)
                continue
            mtime = st.st_mtime
            size = st.st_size

            cached = self._file_stat_cache.get(abs_key)
            if cached is not None and cached[0] == mtime and cached[1] == size:
                # Unchanged since the last scan: reuse the checksum.
                checksum = cached[2]
            else:
                try:
                    content = file_path.read_bytes()
                except OSError:
                    logger.warning("Cannot read file %s — skipping", file_path)
                    continue
                checksum = hashlib.sha256(content).hexdigest()
                self._file_stat_cache[abs_key] = (mtime, size, checksum)
                if len(self._file_stat_cache) > self._STAT_CACHE_MAX_ENTRIES:
                    self._file_stat_cache.popitem(last=False)  # evict oldest

            result[rel_path] = checksum

        return result

    def _get_git_info(self, repo_path: Path) -> tuple[str | None, str | None]:
        """Get current git commit hash and branch name.

        Best effort: each value is ``None`` when its git command exits
        non-zero, and both are ``None`` when git cannot be started at all
        (missing binary, unusable working directory) or times out.
        """
        try:
            commit = subprocess.run(
                ["git", "rev-parse", "HEAD"],
                cwd=repo_path,
                capture_output=True,
                text=True,
                encoding="utf-8",
                errors="replace",
                timeout=5,
            )
            branch = subprocess.run(
                ["git", "rev-parse", "--abbrev-ref", "HEAD"],
                cwd=repo_path,
                capture_output=True,
                text=True,
                encoding="utf-8",
                errors="replace",
                timeout=5,
            )
        except (subprocess.TimeoutExpired, OSError):
            # OSError covers FileNotFoundError (no git / no such cwd) as well
            # as NotADirectoryError and PermissionError.
            return None, None

        return (
            commit.stdout.strip() if commit.returncode == 0 else None,
            branch.stdout.strip() if branch.returncode == 0 else None,
        )

    def parse_file(
        self,
        file_path: str,
        content: str,
        dialect: str | None = None,
        schema: dict | None = None,
    ) -> ParseResult:
        """Parse a single SQL file without writing to the database.

        Args:
            file_path: File path (used for naming nodes, not read from disk).
            content: Raw SQL content.
            dialect: Optional SQL dialect override.
            schema: Optional schema catalog for ``SELECT *`` expansion.

        Returns:
            A ``ParseResult`` with extracted nodes, edges, column usage,
            and lineage. Returns an empty result for non-SQL files.
        """
        if not is_sql_file(file_path):
            return ParseResult(language="sql")
        return self.get_parser(dialect).parse(file_path, content, schema=schema)

    def parse_file_at_commit(
        self,
        repo_path: Path,
        file_path: str,
        commit: str,
        dialect: str | None = None,
    ) -> ParseResult | None:
        """Parse a file at a specific git commit.

        Retrieves file content via ``git show`` and parses it without
        writing to the database. Used by pr_impact analysis. The blob is
        decoded as UTF-8 with undecodable bytes replaced, matching how
        ``reindex_repo`` reads files from disk.

        Args:
            repo_path: Absolute path to the git repo root.
            file_path: Relative file path within the repo.
            commit: Git commit hash or ref to read from.
            dialect: Optional SQL dialect override.

        Returns:
            A ``ParseResult``, or ``None`` if the file doesn't exist at
            that commit, is not a SQL file, or git fails / times out.
        """
        if not is_sql_file(file_path):
            return None
        # A ref starting with "-" would be parsed by git as an option.
        if commit.startswith("-"):
            return None
        try:
            result = subprocess.run(
                ["git", "show", f"{commit}:{file_path}"],
                cwd=repo_path,
                capture_output=True,
                text=True,
                encoding="utf-8",
                errors="replace",
                timeout=10,
            )
        except (subprocess.TimeoutExpired, OSError):
            return None
        if result.returncode != 0:
            return None

        return self.get_parser(dialect).parse(file_path, result.stdout)

    def get_changed_files(self, repo_path: Path, base_commit: str) -> list[str]:
        """Get SQL files changed between a base commit and HEAD.

        Args:
            repo_path: Absolute path to the git repo root.
            base_commit: Git commit hash or ref to diff against HEAD.

        Returns:
            List of relative file paths for changed SQL files. Returns
            an empty list on git errors or timeouts.
        """
        # A ref starting with "-" would be parsed by git as an option.
        if base_commit.startswith("-"):
            return []
        try:
            # -z: NUL-separated, unquoted paths. Without it git C-quotes
            # names containing non-ASCII or special characters, and the
            # quoted form would not pass is_sql_file().
            result = subprocess.run(
                ["git", "diff", "--name-only", "-z", f"{base_commit}..HEAD"],
                cwd=repo_path,
                capture_output=True,
                text=True,
                encoding="utf-8",
                errors="replace",
                timeout=10,
            )
        except (subprocess.TimeoutExpired, OSError):
            return []
        if result.returncode != 0:
            return []
        return [changed for changed in result.stdout.split("\0") if changed and is_sql_file(changed)]
641
+
642
+
643
+ def _resolve_dialect(
644
+ file_path: str,
645
+ default_dialect: str | None,
646
+ overrides: dict[str, str] | None,
647
+ ) -> str | None:
648
+ """Determine the SQL dialect for a file path.
649
+
650
+ Checks overrides first (glob patterns), falls back to default.
651
+ """
652
+ if overrides:
653
+ for pattern, dialect in overrides.items():
654
+ # Support both "dir/" prefix matching and full glob
655
+ if file_path.startswith(pattern) or fnmatch.fnmatch(file_path, pattern):
656
+ return dialect
657
+ return default_dialect
658
+
659
+
660
def _checksum_parse_result(result: ParseResult) -> str:
    """Return a SHA-256 hex digest summarising a ParseResult's structure.

    Rendered sqlmesh/dbt models have no raw SQL text to hash, so the
    digest is built from one descriptor line per node, edge, column-usage
    entry and column-lineage chain, in that order, joined by newlines.
    """
    lines = [f"N:{node.kind}:{node.name}" for node in result.nodes]
    lines.extend(
        f"E:{edge.source_name}:{edge.target_name}:{edge.relationship}"
        for edge in result.edges
    )
    lines.extend(
        f"CU:{usage.node_name}:{usage.table_name}:{usage.column_name}:{usage.usage_type}"
        for usage in result.column_usage
    )
    for lineage in result.column_lineage:
        # Each hop is rendered as "table.column:expression"; hops are
        # separated by "|".
        hop_text = "|".join(
            f"{hop.table}.{hop.column}:{hop.expression}" for hop in lineage.chain
        )
        lines.append(f"CL:{lineage.output_node}:{lineage.output_column}:{hop_text}")

    digest = hashlib.sha256()
    digest.update("\n".join(lines).encode())
    return digest.hexdigest()