sql-code-graph 0.2.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (55) hide show
  1. sql_code_graph-0.2.1.dist-info/METADATA +171 -0
  2. sql_code_graph-0.2.1.dist-info/RECORD +55 -0
  3. sql_code_graph-0.2.1.dist-info/WHEEL +4 -0
  4. sql_code_graph-0.2.1.dist-info/entry_points.txt +2 -0
  5. sqlcg/__init__.py +5 -0
  6. sqlcg/__main__.py +6 -0
  7. sqlcg/cli/__init__.py +1 -0
  8. sqlcg/cli/commands/__init__.py +1 -0
  9. sqlcg/cli/commands/analyze.py +93 -0
  10. sqlcg/cli/commands/db.py +83 -0
  11. sqlcg/cli/commands/find.py +63 -0
  12. sqlcg/cli/commands/gain.py +169 -0
  13. sqlcg/cli/commands/git.py +73 -0
  14. sqlcg/cli/commands/index.py +92 -0
  15. sqlcg/cli/commands/install.py +60 -0
  16. sqlcg/cli/commands/mcp.py +54 -0
  17. sqlcg/cli/commands/report.py +135 -0
  18. sqlcg/cli/commands/watch.py +57 -0
  19. sqlcg/cli/main.py +40 -0
  20. sqlcg/core/__init__.py +8 -0
  21. sqlcg/core/config.py +104 -0
  22. sqlcg/core/graph_db.py +179 -0
  23. sqlcg/core/jobs.py +105 -0
  24. sqlcg/core/kuzu_backend.py +269 -0
  25. sqlcg/core/neo4j_backend.py +195 -0
  26. sqlcg/core/queries.py +82 -0
  27. sqlcg/core/schema.cypher +104 -0
  28. sqlcg/core/schema.py +48 -0
  29. sqlcg/indexer/__init__.py +1 -0
  30. sqlcg/indexer/dbt_adapter.py +23 -0
  31. sqlcg/indexer/indexer.py +317 -0
  32. sqlcg/indexer/walker.py +55 -0
  33. sqlcg/indexer/watcher.py +195 -0
  34. sqlcg/lineage/__init__.py +1 -0
  35. sqlcg/lineage/aggregator.py +58 -0
  36. sqlcg/lineage/schema_resolver.py +198 -0
  37. sqlcg/metrics/__init__.py +5 -0
  38. sqlcg/metrics/store.py +273 -0
  39. sqlcg/parsers/__init__.py +30 -0
  40. sqlcg/parsers/ansi_parser.py +215 -0
  41. sqlcg/parsers/base.py +414 -0
  42. sqlcg/parsers/bigquery_parser.py +77 -0
  43. sqlcg/parsers/postgres_parser.py +27 -0
  44. sqlcg/parsers/registry.py +46 -0
  45. sqlcg/parsers/snowflake_parser.py +148 -0
  46. sqlcg/parsers/tsql_parser.py +27 -0
  47. sqlcg/server/__init__.py +1 -0
  48. sqlcg/server/exceptions.py +20 -0
  49. sqlcg/server/models.py +83 -0
  50. sqlcg/server/server.py +57 -0
  51. sqlcg/server/tools.py +663 -0
  52. sqlcg/utils/__init__.py +6 -0
  53. sqlcg/utils/hashing.py +18 -0
  54. sqlcg/utils/ignore.py +36 -0
  55. sqlcg/utils/logging.py +29 -0
@@ -0,0 +1,317 @@
1
+ """Main indexer orchestrating parsing and graph persistence."""
2
+
3
+ from concurrent.futures import ThreadPoolExecutor
4
+ from concurrent.futures import TimeoutError as FuturesTimeout
5
+ from pathlib import Path
6
+
7
+ from sqlcg.core.graph_db import GraphBackend
8
+ from sqlcg.core.queries import STALE_VIEWS_QUERY
9
+ from sqlcg.core.schema import NodeLabel, RelType
10
+ from sqlcg.indexer.walker import walk_sql_files
11
+ from sqlcg.lineage.aggregator import CrossFileAggregator
12
+ from sqlcg.lineage.schema_resolver import SchemaResolver
13
+ from sqlcg.parsers.base import ParsedFile
14
+ from sqlcg.parsers.registry import get_parser
15
+ from sqlcg.utils.ignore import load_ignore_spec
16
+ from sqlcg.utils.logging import getLogger
17
+
18
+ logger = getLogger(__name__)
19
+
20
+
21
+ class Indexer:
22
+ """Orchestrates SQL file parsing and graph persistence."""
23
+
24
+ def index_repo(
25
+ self,
26
+ path: Path,
27
+ dialect: str | None,
28
+ db: GraphBackend,
29
+ dbt_manifest: Path | None = None,
30
+ timeout_per_file: int = 30,
31
+ use_git: bool = True,
32
+ ) -> dict:
33
+ """Full two-pass index. Returns summary dict.
34
+
35
+ Args:
36
+ path: Root directory to index
37
+ dialect: SQL dialect (None for ANSI)
38
+ db: GraphBackend instance
39
+ dbt_manifest: Optional path to dbt manifest.json
40
+ timeout_per_file: Timeout in seconds per file (0 = no timeout)
41
+ use_git: When True (default), use git ls-files to restrict
42
+ indexing to tracked files; falls back to rglob when git
43
+ is unavailable or the directory is not a git repository.
44
+
45
+ Returns:
46
+ Dict with keys: files_parsed, parse_errors, tables_found, lineage_edges_created
47
+ """
48
+ spec = load_ignore_spec(path)
49
+ schema_resolver = SchemaResolver(dialect=dialect)
50
+ parser = get_parser(dialect, schema_resolver)
51
+ aggregator = CrossFileAggregator()
52
+
53
+ files = list(walk_sql_files(path, spec, use_git=use_git))
54
+ pass1_results: list[ParsedFile] = []
55
+ parse_errors = 0
56
+
57
+ # Pass 1: parse all files
58
+ for file_path in files:
59
+ try:
60
+ sql = file_path.read_text(encoding="utf-8")
61
+ parsed = self._index_single_file(parser, file_path, sql, timeout_per_file)
62
+ aggregator.register_pass1(parsed)
63
+ pass1_results.append(parsed)
64
+ parse_errors += len(parsed.errors)
65
+ except KeyboardInterrupt:
66
+ logger.info("SIGINT received — flushing progress")
67
+ self._upsert_all(pass1_results, db)
68
+ raise
69
+ except Exception as exc:
70
+ logger.warning("Failed to parse %s: %s", file_path, exc)
71
+ parse_errors += 1
72
+
73
+ # Optional: load dbt manifest
74
+ if dbt_manifest:
75
+ from sqlcg.indexer.dbt_adapter import load_dbt_manifest
76
+
77
+ load_dbt_manifest(dbt_manifest, schema_resolver)
78
+
79
+ # Pass 2: resolve cross-file references
80
+ pass2_results: list[ParsedFile] = []
81
+ for parsed in pass1_results:
82
+ try:
83
+ resolved = aggregator.resolve_pass2(parser, parsed)
84
+ pass2_results.append(resolved)
85
+ except Exception as exc:
86
+ logger.warning("resolve_pass2 failed for %s: %s", parsed.path, exc)
87
+ pass2_results.append(parsed)
88
+
89
+ # Upsert all results
90
+ tables_found = 0
91
+ lineage_edges = 0
92
+ for parsed in pass2_results:
93
+ counts = self._upsert_parsed_file(parsed, db)
94
+ tables_found += counts["tables"]
95
+ lineage_edges += counts["edges"]
96
+
97
+ return {
98
+ "files_parsed": len(pass2_results),
99
+ "parse_errors": parse_errors,
100
+ "tables_found": tables_found,
101
+ "lineage_edges_created": lineage_edges,
102
+ }
103
+
104
+ def reindex_file(self, file_path: str, db: GraphBackend, dialect: str | None) -> None:
105
+ """Re-index a single file and its dependent views.
106
+
107
+ Args:
108
+ file_path: Path to the file to re-index
109
+ db: GraphBackend instance
110
+ dialect: SQL dialect (None for ANSI)
111
+ """
112
+ stale_views = db.run_read(STALE_VIEWS_QUERY, {"path": file_path})
113
+
114
+ with db.transaction():
115
+ db.delete_nodes_for_file(file_path)
116
+ schema_resolver = SchemaResolver(dialect=dialect)
117
+ parser = get_parser(dialect, schema_resolver)
118
+ sql = Path(file_path).read_text(encoding="utf-8")
119
+ parsed = parser.parse_file(Path(file_path), sql)
120
+ self._upsert_parsed_file(parsed, db)
121
+
122
+ for row in stale_views:
123
+ self._reindex_view_definition(row["view_name"], db, dialect)
124
+
125
+ def _index_single_file(self, parser, path: Path, sql: str, timeout: int) -> ParsedFile:
126
+ """Parse one file, with optional timeout.
127
+
128
+ Args:
129
+ parser: SqlParser instance
130
+ path: Path to the file
131
+ sql: SQL text
132
+ timeout: Timeout in seconds (0 = no timeout)
133
+
134
+ Returns:
135
+ ParsedFile with parse_failed flag set if timeout occurs
136
+ """
137
+ if timeout <= 0:
138
+ return parser.parse_file(path, sql)
139
+
140
+ with ThreadPoolExecutor(max_workers=1) as ex:
141
+ future = ex.submit(parser.parse_file, path, sql)
142
+ try:
143
+ return future.result(timeout=timeout)
144
+ except FuturesTimeout:
145
+ logger.warning("Timeout parsing %s (>%ds) — skipping", path, timeout)
146
+ out = ParsedFile(path=path, dialect=parser.DIALECT)
147
+ out.errors.append(f"timeout:{timeout}s")
148
+ return out
149
+
150
+ def _upsert_parsed_file(self, parsed: ParsedFile, db: GraphBackend) -> dict:
151
+ """Map ParsedFile → graph nodes/edges.
152
+
153
+ Args:
154
+ parsed: ParsedFile to upsert
155
+ db: GraphBackend instance
156
+
157
+ Returns:
158
+ Dict with keys: tables, edges
159
+ """
160
+ counts = {"tables": 0, "edges": 0}
161
+
162
+ # Upsert File node
163
+ db.upsert_node(
164
+ NodeLabel.FILE,
165
+ parsed.path_str,
166
+ {
167
+ "path": parsed.path_str,
168
+ "dialect": parsed.dialect or "",
169
+ },
170
+ )
171
+
172
+ # Upsert defined tables
173
+ for table in parsed.defined_tables:
174
+ db.upsert_node(
175
+ NodeLabel.TABLE,
176
+ table.full_id,
177
+ {
178
+ "qualified": table.full_id,
179
+ "name": table.name,
180
+ "catalog": table.catalog or "",
181
+ "db": table.db or "",
182
+ "kind": "TABLE",
183
+ "defined_in_file": parsed.path_str,
184
+ },
185
+ )
186
+ db.upsert_edge(
187
+ NodeLabel.TABLE,
188
+ table.full_id,
189
+ NodeLabel.FILE,
190
+ parsed.path_str,
191
+ RelType.DEFINED_IN,
192
+ {},
193
+ )
194
+ counts["tables"] += 1
195
+
196
+ # Upsert query nodes
197
+ for i, stmt in enumerate(parsed.statements):
198
+ query_id = f"{parsed.path_str}:{i}"
199
+ db.upsert_node(
200
+ NodeLabel.QUERY,
201
+ query_id,
202
+ {
203
+ "id": query_id,
204
+ "file_path": parsed.path_str,
205
+ "statement_index": i,
206
+ "sql": stmt.sql[:500],
207
+ "kind": stmt.kind,
208
+ "target_table": stmt.target.full_id if stmt.target else "",
209
+ "parse_failed": stmt.parse_failed,
210
+ "confidence": stmt.confidence,
211
+ "parsing_mode": stmt.parsing_mode,
212
+ },
213
+ )
214
+ db.upsert_edge(
215
+ NodeLabel.QUERY,
216
+ query_id,
217
+ NodeLabel.FILE,
218
+ parsed.path_str,
219
+ RelType.QUERY_DEFINED_IN,
220
+ {},
221
+ )
222
+
223
+ # Source table edges
224
+ for src_table in stmt.sources:
225
+ db.upsert_node(
226
+ NodeLabel.TABLE,
227
+ src_table.full_id,
228
+ {
229
+ "qualified": src_table.full_id,
230
+ "name": src_table.name,
231
+ "catalog": src_table.catalog or "",
232
+ "db": src_table.db or "",
233
+ "kind": "TABLE",
234
+ "defined_in_file": "",
235
+ },
236
+ )
237
+ db.upsert_edge(
238
+ NodeLabel.QUERY,
239
+ query_id,
240
+ NodeLabel.TABLE,
241
+ src_table.full_id,
242
+ RelType.SELECTS_FROM,
243
+ {},
244
+ )
245
+
246
+ # Column lineage edges
247
+ for edge in stmt.column_lineage:
248
+ src_id = edge.src.full_id
249
+ dst_id = edge.dst.full_id
250
+ db.upsert_node(
251
+ NodeLabel.COLUMN,
252
+ src_id,
253
+ {
254
+ "id": src_id,
255
+ "col_name": edge.src.name,
256
+ "table_qualified": edge.src.table.full_id,
257
+ "catalog": edge.src.table.catalog or "",
258
+ "db": edge.src.table.db or "",
259
+ "table_name": edge.src.table.name,
260
+ },
261
+ )
262
+ db.upsert_node(
263
+ NodeLabel.COLUMN,
264
+ dst_id,
265
+ {
266
+ "id": dst_id,
267
+ "col_name": edge.dst.name,
268
+ "table_qualified": edge.dst.table.full_id,
269
+ "catalog": edge.dst.table.catalog or "",
270
+ "db": edge.dst.table.db or "",
271
+ "table_name": edge.dst.table.name,
272
+ },
273
+ )
274
+ db.upsert_edge(
275
+ NodeLabel.COLUMN,
276
+ src_id,
277
+ NodeLabel.COLUMN,
278
+ dst_id,
279
+ RelType.COLUMN_LINEAGE,
280
+ {
281
+ "transform": edge.transform,
282
+ "confidence": edge.confidence,
283
+ "query_id": query_id,
284
+ },
285
+ )
286
+ counts["edges"] += 1
287
+
288
+ return counts
289
+
290
+ def _upsert_all(self, results: list[ParsedFile], db: GraphBackend) -> None:
291
+ """Upsert all parsed files.
292
+
293
+ Args:
294
+ results: List of ParsedFile objects
295
+ db: GraphBackend instance
296
+ """
297
+ for parsed in results:
298
+ self._upsert_parsed_file(parsed, db)
299
+
300
+ def _reindex_view_definition(
301
+ self, view_name: str, db: GraphBackend, dialect: str | None
302
+ ) -> None:
303
+ """Re-index the file that defines a view.
304
+
305
+ Args:
306
+ view_name: Qualified view name
307
+ db: GraphBackend instance
308
+ dialect: SQL dialect
309
+ """
310
+ query = (
311
+ f"MATCH (t:{NodeLabel.TABLE} {{qualified: $name}})"
312
+ f"-[:{RelType.DEFINED_IN}]->(f:{NodeLabel.FILE}) "
313
+ "RETURN f.path AS path"
314
+ )
315
+ result = db.run_read(query, {"name": view_name})
316
+ for row in result:
317
+ self.reindex_file(row["path"], db, dialect)
@@ -0,0 +1,55 @@
1
+ """SQL file walker with ignore pattern support."""
2
+
3
+ import subprocess
4
+ from collections.abc import Iterator
5
+ from pathlib import Path
6
+
7
+ import pathspec
8
+
9
+ from sqlcg.utils.ignore import is_ignored
10
+
11
+
12
+ def _git_sql_files(root: Path) -> list[Path] | None:
13
+ """Return tracked .sql files via git ls-files, or None if git unavailable."""
14
+ try:
15
+ result = subprocess.run(
16
+ ["git", "ls-files", "--cached"],
17
+ cwd=root,
18
+ capture_output=True,
19
+ text=True,
20
+ check=True,
21
+ )
22
+ return [root / f for f in result.stdout.splitlines() if f.endswith(".sql")]
23
+ except (subprocess.CalledProcessError, OSError):
24
+ return None
25
+
26
+
27
+ def walk_sql_files(
28
+ root: Path, spec: pathspec.PathSpec, use_git: bool = True
29
+ ) -> Iterator[Path]:
30
+ """Walk directory tree and yield SQL files not matching ignore patterns.
31
+
32
+ When use_git=True (default) and git is available, only tracked files are
33
+ returned — this prevents flooding from build artefacts, node_modules, and
34
+ other untracked directories. Falls back to rglob when git is unavailable
35
+ or the directory is not a git repository.
36
+
37
+ Args:
38
+ root: Root directory to walk
39
+ spec: PathSpec object with ignore patterns
40
+ use_git: Use git ls-files instead of rglob (default True)
41
+
42
+ Yields:
43
+ Path objects for .sql files not matching ignore patterns
44
+ """
45
+ if use_git:
46
+ git_files = _git_sql_files(root)
47
+ if git_files is not None:
48
+ for path in git_files:
49
+ if path.exists() and not is_ignored(path, root, spec):
50
+ yield path
51
+ return
52
+
53
+ for path in root.rglob("*.sql"):
54
+ if not is_ignored(path, root, spec):
55
+ yield path
@@ -0,0 +1,195 @@
1
+ """File system watcher for SQL file changes."""
2
+
3
+ import subprocess
4
+ import threading
5
+ import time
6
+ from pathlib import Path
7
+
8
+ import pathspec
9
+ from watchdog.events import FileSystemEventHandler
10
+
11
+ from sqlcg.utils.ignore import is_ignored
12
+ from sqlcg.utils.logging import getLogger
13
+
14
+ logger = getLogger(__name__)
15
+
16
+
17
+ class SqlFileEventHandler(FileSystemEventHandler):
18
+ """Watchdog event handler for SQL file changes."""
19
+
20
+ def __init__(self, job_manager, db, ignore_spec: pathspec.PathSpec, root: Path, indexer=None):
21
+ """Initialize the event handler.
22
+
23
+ Args:
24
+ job_manager: WatchJobManager instance
25
+ db: GraphBackend instance
26
+ ignore_spec: PathSpec with ignore patterns
27
+ root: Root directory being watched
28
+ indexer: Indexer instance (used by BranchMonitor)
29
+ """
30
+ super().__init__()
31
+ self._jobs = job_manager
32
+ self._db = db
33
+ self._spec = ignore_spec
34
+ self._root = root
35
+ # Create and start BranchMonitor if indexer is provided
36
+ self._branch_monitor: BranchMonitor | None = None
37
+ if indexer is not None:
38
+ self._branch_monitor = BranchMonitor(root, job_manager, indexer, db)
39
+ self._branch_monitor.start()
40
+
41
+ def _is_sql(self, path: str | bytes) -> bool:
42
+ """Check if a path is a non-ignored SQL file.
43
+
44
+ Args:
45
+ path: File path to check (str or bytes from watchdog)
46
+
47
+ Returns:
48
+ True if the path is a .sql file not matching ignore patterns
49
+ """
50
+ if isinstance(path, bytes):
51
+ path = path.decode("utf-8")
52
+ path_obj = Path(path)
53
+ return path_obj.suffix == ".sql" and not is_ignored(path_obj, self._root, self._spec)
54
+
55
+ def on_modified(self, event):
56
+ """Handle file modification events.
57
+
58
+ Args:
59
+ event: Watchdog event object
60
+ """
61
+ if not event.is_directory and self._is_sql(event.src_path):
62
+ self._jobs.schedule(event.src_path)
63
+
64
+ def on_created(self, event):
65
+ """Handle file creation events.
66
+
67
+ Args:
68
+ event: Watchdog event object
69
+ """
70
+ if not event.is_directory and self._is_sql(event.src_path):
71
+ self._jobs.schedule(event.src_path)
72
+
73
+ def on_moved(self, event):
74
+ """Handle file move events.
75
+
76
+ Args:
77
+ event: Watchdog event object
78
+ """
79
+ if not event.is_directory and self._is_sql(event.dest_path):
80
+ self._jobs.schedule(event.dest_path)
81
+
82
+ def on_deleted(self, event):
83
+ """Handle file deletion events.
84
+
85
+ Args:
86
+ event: Watchdog event object
87
+ """
88
+ if not event.is_directory and self._is_sql(event.src_path):
89
+ self._db.delete_nodes_for_file(event.src_path)
90
+
91
+
92
+ class BranchMonitor(threading.Thread):
93
+ """Background thread that detects branch changes and triggers full resyncs.
94
+
95
+ Polls `git rev-parse --abbrev-ref HEAD` every 2 seconds. When the branch
96
+ changes, pauses the job manager, runs a full reindex, then resumes and
97
+ drains queued file events.
98
+ """
99
+
100
+ def __init__(
101
+ self, watched_path: Path, job_manager, indexer, db, _poll_interval: float = 2.0
102
+ ):
103
+ """Initialize the branch monitor.
104
+
105
+ Args:
106
+ watched_path: Path being watched (used to find git root)
107
+ job_manager: WatchJobManager instance
108
+ indexer: Indexer instance
109
+ db: GraphBackend instance
110
+ _poll_interval: Polling interval in seconds (for testing)
111
+ """
112
+ # daemon=False ensures that if index_repo() is in-flight when shutdown is requested,
113
+ # the process will wait (via join(timeout=5) in watch.py) up to 5 seconds before exiting.
114
+ # This avoids data loss from killing an in-progress resync.
115
+ super().__init__(daemon=False)
116
+ self._watched_path = watched_path
117
+ self._job_manager = job_manager
118
+ self._indexer = indexer
119
+ self._db = db
120
+ self._stop_event = threading.Event()
121
+ self._current_branch: str | None = None
122
+ self._poll_interval = _poll_interval
123
+
124
+ def run(self) -> None:
125
+ """Poll git branch and trigger resync on change."""
126
+ while not self._stop_event.is_set():
127
+ try:
128
+ branch = self._get_current_branch()
129
+ if branch is not None and branch != self._current_branch:
130
+ logger.debug(
131
+ "Branch change detected: %s -> %s", self._current_branch, branch
132
+ )
133
+ self._current_branch = branch
134
+ self._on_branch_change()
135
+ except subprocess.CalledProcessError:
136
+ # Not a git repo or git not available
137
+ logger.debug("Could not get current branch (not a git repo or git unavailable)")
138
+ self._stop_event.set()
139
+ break
140
+ except Exception as exc:
141
+ logger.debug("BranchMonitor error: %s", exc)
142
+
143
+ # Sleep in small increments to allow quick shutdown
144
+ for _ in range(int(self._poll_interval * 10)):
145
+ if self._stop_event.is_set():
146
+ break
147
+ time.sleep(0.1)
148
+
149
+ def _get_current_branch(self) -> str | None:
150
+ """Get the current git branch name.
151
+
152
+ Returns:
153
+ Branch name, or None if git command fails
154
+
155
+ Raises:
156
+ subprocess.CalledProcessError if git command fails
157
+ """
158
+ result = subprocess.run(
159
+ ["git", "rev-parse", "--abbrev-ref", "HEAD"],
160
+ cwd=str(self._watched_path),
161
+ capture_output=True,
162
+ text=True,
163
+ check=True,
164
+ )
165
+ return result.stdout.strip()
166
+
167
+ def _on_branch_change(self) -> None:
168
+ """Handle branch change: pause, resync, resume, drain queue."""
169
+ # Pause new file events
170
+ self._job_manager.set_paused(True)
171
+
172
+ # Cancel pending file timers
173
+ self._job_manager.cancel_all()
174
+
175
+ # Run full resync
176
+ try:
177
+ self._indexer.index_repo(self._watched_path, dialect=None, db=self._db)
178
+ except Exception as exc:
179
+ logger.error("Branch change resync failed: %s", exc)
180
+ finally:
181
+ # Resume and drain queued events
182
+ self._job_manager.set_paused(False)
183
+ self._job_manager.drain_queued()
184
+
185
+ def stop(self) -> None:
186
+ """Signal the thread to stop."""
187
+ self._stop_event.set()
188
+
189
+ def join(self, timeout: float | None = None) -> None:
190
+ """Wait for the thread to stop.
191
+
192
+ Args:
193
+ timeout: Maximum time to wait in seconds
194
+ """
195
+ super().join(timeout=timeout)
@@ -0,0 +1 @@
1
+ """Lineage resolution and analysis module."""
@@ -0,0 +1,58 @@
1
+ """Cross-file lineage aggregator for two-pass resolution."""
2
+
3
+ from sqlcg.parsers.base import ParsedFile
4
+ from sqlcg.utils.logging import getLogger
5
+
6
+ logger = getLogger(__name__)
7
+
8
+
9
+ class CrossFileAggregator:
10
+ """Aggregates parsed files for cross-file resolution.
11
+
12
+ Pass 1: register all parsed files and build view/table source mappings.
13
+ Pass 2: re-parse files with cross-file schema context.
14
+ """
15
+
16
+ def __init__(self) -> None:
17
+ """Initialize the aggregator."""
18
+ # Maps table.full_id -> ParsedFile that defines it
19
+ self.sources: dict[str, ParsedFile] = {}
20
+
21
+ def register_pass1(self, parsed: ParsedFile) -> None:
22
+ """Register a pass-1 result and build view/table source map.
23
+
24
+ Args:
25
+ parsed: ParsedFile from pass 1
26
+ """
27
+ for table in parsed.defined_tables:
28
+ self.sources[table.full_id] = parsed
29
+
30
+ def resolve_pass2(self, parser, parsed: ParsedFile) -> ParsedFile:
31
+ """Re-parse with cross-file schema context.
32
+
33
+ Args:
34
+ parser: SqlParser instance
35
+ parsed: ParsedFile from pass 1
36
+
37
+ Returns:
38
+ ParsedFile from pass 2 with resolved cross-file references,
39
+ or the pass-1 result if the file cannot be re-read.
40
+
41
+ Raises:
42
+ No exceptions are raised; file read errors are logged as WARNING
43
+ and the pass-1 result is returned unchanged.
44
+ """
45
+ # Register view sources for schema resolution
46
+ parser._schema.add_view_sources(self.sources)
47
+
48
+ try:
49
+ sql = parsed.path.read_text(encoding="utf-8")
50
+ except (FileNotFoundError, OSError) as exc:
51
+ logger.warning(
52
+ "resolve_pass2: cannot re-read %s (%s) — returning pass-1 result",
53
+ parsed.path,
54
+ exc,
55
+ )
56
+ return parsed
57
+
58
+ return parser.parse_file(parsed.path, sql)