sql-code-graph 0.2.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (55) hide show
  1. sql_code_graph-0.2.1.dist-info/METADATA +171 -0
  2. sql_code_graph-0.2.1.dist-info/RECORD +55 -0
  3. sql_code_graph-0.2.1.dist-info/WHEEL +4 -0
  4. sql_code_graph-0.2.1.dist-info/entry_points.txt +2 -0
  5. sqlcg/__init__.py +5 -0
  6. sqlcg/__main__.py +6 -0
  7. sqlcg/cli/__init__.py +1 -0
  8. sqlcg/cli/commands/__init__.py +1 -0
  9. sqlcg/cli/commands/analyze.py +93 -0
  10. sqlcg/cli/commands/db.py +83 -0
  11. sqlcg/cli/commands/find.py +63 -0
  12. sqlcg/cli/commands/gain.py +169 -0
  13. sqlcg/cli/commands/git.py +73 -0
  14. sqlcg/cli/commands/index.py +92 -0
  15. sqlcg/cli/commands/install.py +60 -0
  16. sqlcg/cli/commands/mcp.py +54 -0
  17. sqlcg/cli/commands/report.py +135 -0
  18. sqlcg/cli/commands/watch.py +57 -0
  19. sqlcg/cli/main.py +40 -0
  20. sqlcg/core/__init__.py +8 -0
  21. sqlcg/core/config.py +104 -0
  22. sqlcg/core/graph_db.py +179 -0
  23. sqlcg/core/jobs.py +105 -0
  24. sqlcg/core/kuzu_backend.py +269 -0
  25. sqlcg/core/neo4j_backend.py +195 -0
  26. sqlcg/core/queries.py +82 -0
  27. sqlcg/core/schema.cypher +104 -0
  28. sqlcg/core/schema.py +48 -0
  29. sqlcg/indexer/__init__.py +1 -0
  30. sqlcg/indexer/dbt_adapter.py +23 -0
  31. sqlcg/indexer/indexer.py +317 -0
  32. sqlcg/indexer/walker.py +55 -0
  33. sqlcg/indexer/watcher.py +195 -0
  34. sqlcg/lineage/__init__.py +1 -0
  35. sqlcg/lineage/aggregator.py +58 -0
  36. sqlcg/lineage/schema_resolver.py +198 -0
  37. sqlcg/metrics/__init__.py +5 -0
  38. sqlcg/metrics/store.py +273 -0
  39. sqlcg/parsers/__init__.py +30 -0
  40. sqlcg/parsers/ansi_parser.py +215 -0
  41. sqlcg/parsers/base.py +414 -0
  42. sqlcg/parsers/bigquery_parser.py +77 -0
  43. sqlcg/parsers/postgres_parser.py +27 -0
  44. sqlcg/parsers/registry.py +46 -0
  45. sqlcg/parsers/snowflake_parser.py +148 -0
  46. sqlcg/parsers/tsql_parser.py +27 -0
  47. sqlcg/server/__init__.py +1 -0
  48. sqlcg/server/exceptions.py +20 -0
  49. sqlcg/server/models.py +83 -0
  50. sqlcg/server/server.py +57 -0
  51. sqlcg/server/tools.py +663 -0
  52. sqlcg/utils/__init__.py +6 -0
  53. sqlcg/utils/hashing.py +18 -0
  54. sqlcg/utils/ignore.py +36 -0
  55. sqlcg/utils/logging.py +29 -0
@@ -0,0 +1,215 @@
1
+ """ANSI SQL parser implementation."""
2
+
3
+ from pathlib import Path
4
+ from typing import Any
5
+
6
+ import sqlglot
7
+ import sqlglot.expressions as exp
8
+
9
+ from sqlcg.lineage.schema_resolver import SchemaResolver
10
+ from sqlcg.parsers.base import ParsedFile, QueryNode, SqlParser, TableRef
11
+ from sqlcg.parsers.registry import register
12
+ from sqlcg.utils.logging import getLogger
13
+
14
+ logger = getLogger(__name__)
15
+
16
+
17
@register(None)  # None = ANSI/default dialect
class AnsiParser(SqlParser):
    """ANSI SQL parser for standard SQL dialects.

    Parses a file with sqlglot, converts every statement into a ``QueryNode``
    and records which tables the file defines and which ones it reads.

    Attributes:
        DIALECT: sqlglot dialect name; ``None`` selects the ANSI/default dialect.
    """

    DIALECT: str | None = None

    def __init__(self, schema_resolver: SchemaResolver):
        """Initialize ANSI parser.

        Args:
            schema_resolver: SchemaResolver instance for table/column lookups
        """
        super().__init__(schema_resolver)

    def parse_file(self, path: Path, sql: str) -> ParsedFile:
        """Parse SQL text and collect per-statement table lineage.

        A failure to parse the whole text is recorded as a ``parse_error:``
        entry in ``ParsedFile.errors``; a failure on one statement is recorded
        as ``statement_error:<index>:`` and the remaining statements are still
        processed. Neither case raises.

        Args:
            path: Path to the source file
            sql: SQL text to parse

        Returns:
            ParsedFile with parsed statements and metadata
        """
        out = ParsedFile(path=path, dialect=self.DIALECT)

        try:
            statements = sqlglot.parse(sql, dialect=self.DIALECT)
        except Exception as exc:
            logger.warning("Failed to parse file %s: %s", path, exc)
            out.errors.append(f"parse_error:{exc}")
            return out

        for stmt_index, stmt in enumerate(statements):
            # sqlglot.parse may yield None entries; nothing to record for them.
            if stmt is None:
                continue

            try:
                query_node = self._parse_statement(stmt, path, stmt_index)
                out.statements.append(query_node)

                # Only CREATE TABLE / CREATE VIEW statements define a table.
                if query_node.kind in ("CREATE_TABLE", "CREATE_VIEW") and query_node.target:
                    out.defined_tables.append(query_node.target)

                out.referenced_tables.extend(query_node.sources)
            except Exception as exc:
                logger.warning("Failed to process statement %d in %s: %s", stmt_index, path, exc)
                out.errors.append(f"statement_error:{stmt_index}:{exc}")

        return out

    def _parse_statement(self, stmt: Any, path: Path, stmt_index: int) -> QueryNode:
        """Parse a single SQL statement into a QueryNode.

        Source tables come from sqlglot scope analysis. When no scope can be
        built (or building it raises), every ``Table`` node in the statement is
        collected instead, the node is flagged ``parse_failed`` and its
        ``parsing_mode`` is reported as ``"fallback"``.

        Args:
            stmt: sqlglot AST node
            path: Path to the source file
            stmt_index: Statement index in the file

        Returns:
            QueryNode with extracted metadata
        """
        kind = self._classify(stmt)
        sql = stmt.sql(dialect=self.DIALECT)

        # Extract target table for CREATE/INSERT statements
        target: TableRef | None = None
        if isinstance(stmt, exp.Create):
            target = self._extract_target_table(stmt)
        elif isinstance(stmt, exp.Insert):
            target = self._extract_insert_target(stmt)

        # AST node of the write target. The fallback scan must skip this exact
        # node, otherwise e.g. ``CREATE TABLE t (...)`` would list ``t`` as one
        # of its own sources. Other occurrences of the same table (such as
        # ``INSERT INTO t SELECT ... FROM t``) are different nodes and are kept.
        target_node = self._target_table_node(stmt) if target is not None else None

        parse_failed = False
        try:
            from sqlglot.optimizer.scope import build_scope

            root_scope = build_scope(stmt)
            if root_scope:
                sources = self._real_tables(root_scope)
            else:
                # No scope available for this statement: basic table extraction
                sources = self._fallback_table_scan(stmt, exclude=target_node)
                parse_failed = True
        except Exception as exc:
            logger.warning(
                "Failed to build scope for statement %d in %s: %s", stmt_index, path, exc
            )
            sources = self._fallback_table_scan(stmt, exclude=target_node)
            parse_failed = True

        return QueryNode(
            file=path,
            statement_index=stmt_index,
            sql=sql,
            kind=kind,
            target=target,
            # Remove duplicates while preserving order
            sources=self._deduplicate_table_refs(sources),
            ctes={},
            # Column lineage is not extracted here yet.
            column_lineage=[],
            parse_failed=parse_failed,
            confidence=1.0,
            parsing_mode="fallback" if parse_failed else "sqlglot",
        )

    @staticmethod
    def _target_table_node(stmt: Any) -> Any:
        """Return the ``Table`` node a CREATE/INSERT writes to, or None.

        Args:
            stmt: CREATE or INSERT expression

        Returns:
            The ``exp.Table`` node under ``stmt.this`` (unwrapping any
            ``exp.Schema`` wrapper), or None when there is no such node
        """
        node = stmt.this
        while isinstance(node, exp.Schema):
            node = node.this
        return node if isinstance(node, exp.Table) else None

    @staticmethod
    def _extract_target_table(create_stmt: exp.Create) -> TableRef | None:
        """Extract the target table from a CREATE statement.

        Args:
            create_stmt: CREATE expression

        Returns:
            TableRef, or None if the statement does not create a table/view
            or has no target expression
        """
        if create_stmt.kind not in ("TABLE", "VIEW") or not create_stmt.this:
            return None
        return AnsiParser._convert_table_expr_to_ref(create_stmt.this)

    @staticmethod
    def _extract_insert_target(insert_stmt: exp.Insert) -> TableRef | None:
        """Extract the target table from an INSERT statement.

        Args:
            insert_stmt: INSERT expression

        Returns:
            TableRef or None
        """
        if not insert_stmt.this:
            return None
        return AnsiParser._convert_table_expr_to_ref(insert_stmt.this)

    @staticmethod
    def _fallback_table_scan(stmt: Any, exclude: Any = None) -> list[TableRef]:
        """Fallback table extraction when scope analysis fails.

        Walks the AST to find all Table nodes.

        Args:
            stmt: SQL expression
            exclude: Optional AST node to skip (compared by identity), used to
                keep a statement's write target out of its source list

        Returns:
            List of distinct TableRef objects found, in first-seen order
        """
        found = (
            AnsiParser._convert_table_expr_to_ref(table_node)
            for table_node in stmt.find_all(exp.Table)
            if table_node is not exclude
        )
        return AnsiParser._deduplicate_table_refs([ref for ref in found if ref])

    @staticmethod
    def _deduplicate_table_refs(tables: list[TableRef]) -> list[TableRef]:
        """Remove duplicate table references while preserving order.

        Two references are duplicates when their ``full_id`` matches; the first
        occurrence is the one that is kept.

        Args:
            tables: List of TableRef objects

        Returns:
            Deduplicated list
        """
        unique: dict[str, TableRef] = {}
        for table in tables:
            unique.setdefault(table.full_id, table)
        return list(unique.values())
sqlcg/parsers/base.py ADDED
@@ -0,0 +1,414 @@
1
+ """Base data models and abstract parser for SQL parsing and lineage extraction."""
2
+
3
+ from abc import ABC, abstractmethod
4
+ from dataclasses import dataclass, field
5
+ from enum import StrEnum
6
+ from pathlib import Path
7
+ from typing import TYPE_CHECKING, Any
8
+
9
+ if TYPE_CHECKING:
10
+ from sqlcg.lineage.schema_resolver import SchemaResolver
11
+
12
+
13
+ class QueryKind(StrEnum):
14
+ """SQL query classification types."""
15
+
16
+ SELECT = "SELECT"
17
+ INSERT = "INSERT"
18
+ UPDATE = "UPDATE"
19
+ DELETE = "DELETE"
20
+ CREATE_TABLE = "CREATE_TABLE"
21
+ CREATE_VIEW = "CREATE_VIEW"
22
+ CREATE_PROC = "CREATE_PROC"
23
+ MERGE = "MERGE"
24
+ OTHER = "OTHER"
25
+
26
+
27
+ @dataclass(frozen=True)
28
+ class TableRef:
29
+ """A reference to a table (immutable).
30
+
31
+ Attributes:
32
+ catalog: Optional catalog name (e.g., "my_warehouse")
33
+ db: Optional database/schema name (e.g., "public")
34
+ name: Table name (always required)
35
+ alias: Optional alias used in the query context
36
+ """
37
+
38
+ catalog: str | None = None
39
+ db: str | None = None
40
+ name: str = ""
41
+ alias: str | None = None
42
+
43
+ @property
44
+ def full_id(self) -> str:
45
+ """Return the fully qualified table identifier.
46
+
47
+ Includes all non-None components: catalog.db.name or db.name or name.
48
+ Used as the Table.qualified key in the graph.
49
+ """
50
+ parts = [p for p in (self.catalog, self.db, self.name) if p]
51
+ return ".".join(parts)
52
+
53
+ @property
54
+ def qualified(self) -> str:
55
+ """Alias for full_id for compatibility."""
56
+ return self.full_id
57
+
58
+
59
@dataclass(frozen=True)
class ColumnRef:
    """Immutable reference to a column of a table.

    Attributes:
        table: TableRef of the table that owns the column
        name: Column name
    """

    table: TableRef
    name: str = ""

    @property
    def full_id(self) -> str:
        """Fully qualified column identifier.

        The owning table's ``full_id`` followed by a dot and the column name.
        Used as the Column.id key in the graph (finding 5.4).
        """
        return "{}.{}".format(self.table.full_id, self.name)
79
+
80
+
81
@dataclass(frozen=True)
class LineageEdge:
    """Immutable column-level lineage edge, usable in sets and as a dict key.

    Attributes:
        src: ColumnRef the data comes from
        dst: ColumnRef the data flows into
        transform: Label for the transformation applied
            (e.g., "PASS_THROUGH", "AGGREGATION")
        confidence: Confidence score (0.0 to 1.0)
        query_id: Identifier of the query that produced the edge, if known
    """

    src: ColumnRef
    dst: ColumnRef
    transform: str = "UNKNOWN"
    confidence: float = 1.0
    query_id: str | None = None

    def __hash__(self) -> int:
        """Hash over all five fields."""
        key = (self.src, self.dst, self.transform, self.confidence, self.query_id)
        return hash(key)

    def __eq__(self, other: object) -> bool:
        """Field-by-field equality with another LineageEdge."""
        if isinstance(other, LineageEdge):
            return all(
                getattr(self, attr) == getattr(other, attr)
                for attr in ("src", "dst", "transform", "confidence", "query_id")
            )
        # Let Python try the reflected comparison for foreign types.
        return NotImplemented
114
+
115
+
116
@dataclass
class QueryNode:
    """One parsed SQL statement together with its lineage metadata.

    The class is deliberately left mutable: resolution runs in two passes,
    where pass 1 extracts the initial lineage and pass 2 patches the fields of
    the same QueryNode in place. Do not freeze this class.

    Note: Do not rely on QueryNode identity after pass 2; use
    dataclasses.replace() to create a new instance if mutation semantics are
    a concern.

    Attributes:
        file: Path to the source file
        statement_index: 0-based position of the statement within the file
        sql: SQL text of the statement
        kind: Statement category (SELECT, INSERT, UPDATE, DELETE,
            CREATE_TABLE, CREATE_VIEW, etc.)
        target: TableRef written by an INSERT/CREATE, if any
        sources: TableRefs the statement reads from
        ctes: Mapping of CTE name to its TableRef definition
        column_lineage: Column-level LineageEdge objects
        parse_failed: True when parsing failed and only table-level lineage
            from the fallback is available
        confidence: Overall confidence score for this statement's lineage
        parsing_mode: How the statement was parsed
            (e.g., "sqlglot", "fallback", "scripting")
    """

    # Where the statement lives
    file: Path
    statement_index: int
    sql: str

    # What the statement is and which tables it touches
    kind: str = ""
    target: TableRef | None = None
    sources: list[TableRef] = field(default_factory=list)
    ctes: dict[str, TableRef] = field(default_factory=dict)
    column_lineage: list[LineageEdge] = field(default_factory=list)

    # How trustworthy the extracted lineage is
    parse_failed: bool = False
    confidence: float = 1.0
    parsing_mode: str = "sqlglot"
152
+
153
+
154
@dataclass
class ParsedFile:
    """Outcome of parsing one SQL file; mutable so results can be accumulated.

    Attributes:
        path: Path to the source file
        dialect: SQL dialect the file was parsed with
        statements: QueryNode objects, one per parsed statement
        defined_tables: TableRefs of tables the file defines
        referenced_tables: TableRefs of tables the file references
        errors: Error messages collected while parsing
    """

    path: Path
    dialect: str | None = None

    # Per-file results, filled in while statements are processed
    statements: list[QueryNode] = field(default_factory=list)
    defined_tables: list[TableRef] = field(default_factory=list)
    referenced_tables: list[TableRef] = field(default_factory=list)
    errors: list[str] = field(default_factory=list)

    @property
    def path_str(self) -> str:
        """String form of ``path``."""
        as_text = str(self.path)
        return as_text
178
+
179
+
180
+ class SqlParser(ABC):
181
+ """Abstract base class for SQL parsers.
182
+
183
+ Attributes:
184
+ DIALECT: SQL dialect identifier (None for ANSI, "snowflake", "bigquery", etc.)
185
+ _schema: SchemaResolver instance for table/column lookups
186
+ _log: Logger instance for this parser
187
+ """
188
+
189
+ DIALECT: str | None = None
190
+
191
+ def __init__(self, schema_resolver: "SchemaResolver"):
192
+ """Initialize parser with schema resolver.
193
+
194
+ Args:
195
+ schema_resolver: SchemaResolver instance for table/column lookups
196
+ """
197
+ from sqlcg.utils.logging import getLogger
198
+
199
+ self._schema = schema_resolver
200
+ self._log = getLogger(f"{__name__}.{self.__class__.__name__}")
201
+
202
+ @abstractmethod
203
+ def parse_file(self, path: Path, sql: str) -> ParsedFile:
204
+ """Parse SQL text and return a ParsedFile with all statements.
205
+
206
+ Args:
207
+ path: Path to the source file
208
+ sql: SQL text to parse
209
+
210
+ Returns:
211
+ ParsedFile containing parsed statements and metadata
212
+ """
213
+ ...
214
+
215
+ def _classify(self, stmt: Any) -> str:
216
+ """Classify a SQL statement into a kind string.
217
+
218
+ Args:
219
+ stmt: sqlglot AST node
220
+
221
+ Returns:
222
+ One of: SELECT, INSERT, UPDATE, DELETE, CREATE_TABLE, CREATE_VIEW,
223
+ CREATE_PROC, MERGE, OTHER
224
+ """
225
+ import sqlglot.expressions as exp
226
+
227
+ match stmt:
228
+ case exp.Select():
229
+ return QueryKind.SELECT
230
+ case exp.Insert():
231
+ return QueryKind.INSERT
232
+ case exp.Update():
233
+ return QueryKind.UPDATE
234
+ case exp.Delete():
235
+ return QueryKind.DELETE
236
+ case exp.Create():
237
+ match stmt.kind:
238
+ case "TABLE":
239
+ return QueryKind.CREATE_TABLE
240
+ case "VIEW":
241
+ return QueryKind.CREATE_VIEW
242
+ case "PROCEDURE" | "FUNCTION":
243
+ return QueryKind.CREATE_PROC
244
+ case _:
245
+ return QueryKind.CREATE_TABLE # Default to table
246
+ case exp.Merge():
247
+ return QueryKind.MERGE
248
+ case _:
249
+ return QueryKind.OTHER
250
+
251
+ def _real_tables(self, scope: Any) -> list[TableRef]:
252
+ """Return real (non-CTE) tables referenced in a scope.
253
+
254
+ Args:
255
+ scope: sqlglot scope object
256
+
257
+ Returns:
258
+ List of TableRef objects for non-CTE tables
259
+ """
260
+ if not scope:
261
+ return []
262
+
263
+ cte_sources = set(scope.cte_sources.keys()) if hasattr(scope, "cte_sources") else set()
264
+ tables = []
265
+
266
+ if hasattr(scope, "tables"):
267
+ # scope.tables is a list of Table expressions
268
+ table_list = scope.tables
269
+ if isinstance(table_list, list):
270
+ for table_expr in table_list:
271
+ # Extract the table name to check if it's a CTE
272
+ table_name = None
273
+ if hasattr(table_expr, "name"):
274
+ table_name = table_expr.name
275
+
276
+ if table_name not in cte_sources:
277
+ # Convert table expression to TableRef
278
+ ref = self._convert_table_expr_to_ref(table_expr)
279
+ if ref:
280
+ tables.append(ref)
281
+
282
+ return tables
283
+
284
+ @staticmethod
285
+ def _convert_table_expr_to_ref(table_expr: Any) -> "TableRef | None":
286
+ """Convert a table expression to a TableRef.
287
+
288
+ Args:
289
+ table_expr: Table expression (e.g., Table, Identifier, Schema)
290
+
291
+ Returns:
292
+ TableRef or None
293
+ """
294
+ import sqlglot.expressions as exp
295
+
296
+ match table_expr:
297
+ case exp.Table():
298
+ return TableRef(
299
+ catalog=table_expr.catalog,
300
+ db=table_expr.db,
301
+ name=table_expr.name,
302
+ )
303
+ case exp.Identifier():
304
+ return TableRef(name=table_expr.name)
305
+ case exp.Schema():
306
+ # Schema wraps the actual table
307
+ return SqlParser._convert_table_expr_to_ref(table_expr.this)
308
+ case _:
309
+ # Try to extract name from string representation
310
+ name = str(table_expr)
311
+ if name:
312
+ return TableRef(name=name)
313
+ return None
314
+
315
+ def _extract_column_lineage(
316
+ self, stmt: Any, path: Path, out: ParsedFile, schema: dict
317
+ ) -> list[LineageEdge]:
318
+ """Extract column-level lineage with structured error recording.
319
+
320
+ On sqlglot.lineage failure: log WARNING, append to ParsedFile.errors,
321
+ emit LineageEdge with confidence=0.0, continue (do NOT raise or skip silently).
322
+
323
+ For columns not found in the schema, emits edges with reduced confidence (0.5)
324
+ rather than skipping silently (only applies when schema is non-empty).
325
+
326
+ Args:
327
+ stmt: sqlglot AST node (Select/Insert/Create)
328
+ path: Path to the source file
329
+ out: ParsedFile object to append errors to
330
+ schema: Schema dict from _schema.as_dict()
331
+
332
+ Returns:
333
+ List of LineageEdge objects
334
+ """
335
+ import sqlglot.expressions as exp
336
+ from sqlglot.lineage import lineage as sg_lineage
337
+
338
+ edges: list[LineageEdge] = []
339
+
340
+ # Only extract column lineage for certain statement types
341
+ if not isinstance(stmt, (exp.Select, exp.Insert, exp.Create)):
342
+ return edges
343
+
344
+ # Extract column references from SELECT list or target
345
+ try:
346
+ # Get the body of the query for lineage extraction
347
+ if isinstance(stmt, exp.Select):
348
+ body = stmt
349
+ # Extract output columns
350
+ for col_expr in stmt.expressions:
351
+ col_name = col_expr.alias if col_expr.alias else str(col_expr)
352
+ # Schema validation: if schema is loaded and column isn't in it,
353
+ # emit a reduced-confidence edge rather than a full-confidence one.
354
+ if schema:
355
+ table_cols: list[str] | None = None
356
+ for _scope_name, cols in schema.items():
357
+ if isinstance(cols, list):
358
+ table_cols = cols
359
+ break
360
+ elif isinstance(cols, dict):
361
+ for _db, tables in cols.items():
362
+ if isinstance(tables, dict):
363
+ for _tbl, tcols in tables.items():
364
+ if col_name in tcols:
365
+ table_cols = tcols
366
+ break
367
+ if table_cols is not None and col_name not in table_cols:
368
+ self._log.warning(
369
+ "column %s not found in schema, emitting low-confidence edge",
370
+ col_name,
371
+ )
372
+ edges.append(
373
+ LineageEdge(
374
+ src=ColumnRef(TableRef(None, None, "<unknown>"), col_name),
375
+ dst=ColumnRef(TableRef(None, None, "<unknown>"), col_name),
376
+ transform="UNKNOWN",
377
+ confidence=0.5,
378
+ )
379
+ )
380
+ continue
381
+
382
+ try:
383
+ root = sg_lineage(col_name, body, schema=schema, dialect=self.DIALECT)
384
+ if root:
385
+ # Successfully extracted lineage
386
+ # TODO: convert root to LineageEdge(s)
387
+ pass
388
+ except Exception as exc:
389
+ self._log.warning(
390
+ "column lineage extraction failed: file=%s col=%s error=%s",
391
+ path,
392
+ col_name,
393
+ exc,
394
+ )
395
+ out.errors.append(f"col_lineage:{col_name}:{exc}")
396
+ # Emit a zero-confidence placeholder edge
397
+ edges.append(
398
+ LineageEdge(
399
+ src=ColumnRef(TableRef(None, None, "<unknown>"), col_name),
400
+ dst=ColumnRef(TableRef(None, None, "<unknown>"), col_name),
401
+ transform="UNKNOWN",
402
+ confidence=0.0,
403
+ )
404
+ )
405
+
406
+ except Exception as exc:
407
+ self._log.warning(
408
+ "column lineage extraction failed for entire statement: file=%s error=%s",
409
+ path,
410
+ exc,
411
+ )
412
+ out.errors.append(f"col_lineage:statement:{exc}")
413
+
414
+ return edges
@@ -0,0 +1,77 @@
1
+ """BigQuery SQL parser with scripting block detection."""
2
+
3
+ from pathlib import Path
4
+
5
+ from sqlcg.lineage.schema_resolver import SchemaResolver
6
+ from sqlcg.parsers.ansi_parser import AnsiParser
7
+ from sqlcg.parsers.base import ParsedFile
8
+ from sqlcg.parsers.registry import register
9
+ from sqlcg.utils.logging import getLogger
10
+
11
+ logger = getLogger(__name__)
12
+
13
+
14
@register("bigquery")
class BigQueryParser(AnsiParser):
    """BigQuery SQL parser.

    Handles BigQuery-specific features:
    - Scripting block detection (DECLARE, BEGIN)
    - Standard table/column lineage extraction
    """

    DIALECT: str | None = "bigquery"

    def __init__(self, schema_resolver: SchemaResolver):
        """Initialize BigQuery parser.

        Args:
            schema_resolver: SchemaResolver instance for table/column lookups
        """
        super().__init__(schema_resolver)

    def parse_file(self, path: Path, sql: str) -> ParsedFile:
        """Parse BigQuery SQL file with scripting block detection.

        When a scripting block is detected the file is not parsed at all: the
        returned ParsedFile has no statements and carries the single error
        entry ``parse_mode:scripting_block``.

        Args:
            path: Path to the source file
            sql: SQL text to parse

        Returns:
            ParsedFile with parsed statements and metadata
        """
        if self._has_scripting_block(sql):
            logger.info("BigQuery scripting block detected in %s, marking as parse_failed", path)
            # Scripting blocks are not fully parseable; return an empty result
            # tagged with the reason instead of attempting to parse.
            out = ParsedFile(path=path, dialect=self.DIALECT)
            out.errors.append("parse_mode:scripting_block")
            return out

        # Otherwise use standard ANSI parsing with BigQuery dialect
        return super().parse_file(path, sql)

    def _has_scripting_block(self, sql: str) -> bool:
        """Token-aware scripting block detection for BigQuery.

        Looks for DECLARE or BEGIN tokens. If tokenizing fails for any reason,
        falls back to a case-insensitive whole-word search for the same two
        keywords in the raw text.

        Args:
            sql: SQL text to check

        Returns:
            True if a scripting block is detected
        """
        import re

        try:
            from sqlglot.tokens import Tokenizer, TokenType  # type: ignore

            # NOTE(review): confirm Tokenizer.from_dialect and TokenType.DECLARE
            # exist in the pinned sqlglot version; if either is missing this
            # always drops into the keyword fallback below.
            toks = Tokenizer.from_dialect("bigquery").tokenize(sql)  # type: ignore
            scripting_tokens = {
                TokenType.DECLARE,  # type: ignore
                TokenType.BEGIN,  # type: ignore
            }
            return any(t.token_type in scripting_tokens for t in toks)
        except Exception:
            # Fallback: whole-word keyword match. Word boundaries catch
            # "BEGIN\n" / "BEGIN;" (missed by a trailing-space check) and avoid
            # hits inside longer identifiers such as "my_begin ".
            return re.search(r"\b(?:DECLARE|BEGIN)\b", sql, re.IGNORECASE) is not None