sql-code-graph 0.2.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sql_code_graph-0.2.1.dist-info/METADATA +171 -0
- sql_code_graph-0.2.1.dist-info/RECORD +55 -0
- sql_code_graph-0.2.1.dist-info/WHEEL +4 -0
- sql_code_graph-0.2.1.dist-info/entry_points.txt +2 -0
- sqlcg/__init__.py +5 -0
- sqlcg/__main__.py +6 -0
- sqlcg/cli/__init__.py +1 -0
- sqlcg/cli/commands/__init__.py +1 -0
- sqlcg/cli/commands/analyze.py +93 -0
- sqlcg/cli/commands/db.py +83 -0
- sqlcg/cli/commands/find.py +63 -0
- sqlcg/cli/commands/gain.py +169 -0
- sqlcg/cli/commands/git.py +73 -0
- sqlcg/cli/commands/index.py +92 -0
- sqlcg/cli/commands/install.py +60 -0
- sqlcg/cli/commands/mcp.py +54 -0
- sqlcg/cli/commands/report.py +135 -0
- sqlcg/cli/commands/watch.py +57 -0
- sqlcg/cli/main.py +40 -0
- sqlcg/core/__init__.py +8 -0
- sqlcg/core/config.py +104 -0
- sqlcg/core/graph_db.py +179 -0
- sqlcg/core/jobs.py +105 -0
- sqlcg/core/kuzu_backend.py +269 -0
- sqlcg/core/neo4j_backend.py +195 -0
- sqlcg/core/queries.py +82 -0
- sqlcg/core/schema.cypher +104 -0
- sqlcg/core/schema.py +48 -0
- sqlcg/indexer/__init__.py +1 -0
- sqlcg/indexer/dbt_adapter.py +23 -0
- sqlcg/indexer/indexer.py +317 -0
- sqlcg/indexer/walker.py +55 -0
- sqlcg/indexer/watcher.py +195 -0
- sqlcg/lineage/__init__.py +1 -0
- sqlcg/lineage/aggregator.py +58 -0
- sqlcg/lineage/schema_resolver.py +198 -0
- sqlcg/metrics/__init__.py +5 -0
- sqlcg/metrics/store.py +273 -0
- sqlcg/parsers/__init__.py +30 -0
- sqlcg/parsers/ansi_parser.py +215 -0
- sqlcg/parsers/base.py +414 -0
- sqlcg/parsers/bigquery_parser.py +77 -0
- sqlcg/parsers/postgres_parser.py +27 -0
- sqlcg/parsers/registry.py +46 -0
- sqlcg/parsers/snowflake_parser.py +148 -0
- sqlcg/parsers/tsql_parser.py +27 -0
- sqlcg/server/__init__.py +1 -0
- sqlcg/server/exceptions.py +20 -0
- sqlcg/server/models.py +83 -0
- sqlcg/server/server.py +57 -0
- sqlcg/server/tools.py +663 -0
- sqlcg/utils/__init__.py +6 -0
- sqlcg/utils/hashing.py +18 -0
- sqlcg/utils/ignore.py +36 -0
- sqlcg/utils/logging.py +29 -0
|
@@ -0,0 +1,215 @@
|
|
|
1
|
+
"""ANSI SQL parser implementation."""
|
|
2
|
+
|
|
3
|
+
from pathlib import Path
|
|
4
|
+
from typing import Any
|
|
5
|
+
|
|
6
|
+
import sqlglot
|
|
7
|
+
import sqlglot.expressions as exp
|
|
8
|
+
|
|
9
|
+
from sqlcg.lineage.schema_resolver import SchemaResolver
|
|
10
|
+
from sqlcg.parsers.base import ParsedFile, QueryNode, SqlParser, TableRef
|
|
11
|
+
from sqlcg.parsers.registry import register
|
|
12
|
+
from sqlcg.utils.logging import getLogger
|
|
13
|
+
|
|
14
|
+
logger = getLogger(__name__)
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
@register(None)  # None = ANSI/default dialect
class AnsiParser(SqlParser):
    """ANSI SQL parser for standard SQL dialects.

    Parses a file with sqlglot, converts every statement into a ``QueryNode``
    and records which tables the file defines and which ones it references.

    Attributes:
        DIALECT: Set to None for ANSI/default dialect
    """

    DIALECT: str | None = None

    def __init__(self, schema_resolver: SchemaResolver):
        """Initialize ANSI parser.

        Args:
            schema_resolver: SchemaResolver instance for table/column lookups
        """
        super().__init__(schema_resolver)

    def parse_file(self, path: Path, sql: str) -> ParsedFile:
        """Parse SQL file and extract table-level lineage.

        A failure to parse the file as a whole, or to process one statement,
        is logged and recorded in ``ParsedFile.errors``; it is never raised.

        Args:
            path: Path to the source file
            sql: SQL text to parse

        Returns:
            ParsedFile with parsed statements and metadata
        """
        out = ParsedFile(path=path, dialect=self.DIALECT)

        # Parse all statements in the file; a parse error aborts the whole file.
        try:
            statements = sqlglot.parse(sql, dialect=self.DIALECT)
        except Exception as exc:
            logger.warning("Failed to parse file %s: %s", path, exc)
            out.errors.append(f"parse_error:{exc}")
            return out

        for stmt_index, stmt in enumerate(statements):
            # sqlglot.parse may yield None entries; they carry nothing to record.
            if stmt is None:
                continue

            try:
                query_node = self._parse_statement(stmt, path, stmt_index)
                out.statements.append(query_node)

                # Only CREATE TABLE / CREATE VIEW statements define a table.
                if query_node.kind in ("CREATE_TABLE", "CREATE_VIEW"):
                    if query_node.target:
                        out.defined_tables.append(query_node.target)

                out.referenced_tables.extend(query_node.sources)

            except Exception as exc:
                # One bad statement must not discard the rest of the file.
                logger.warning("Failed to process statement %d in %s: %s", stmt_index, path, exc)
                out.errors.append(f"statement_error:{stmt_index}:{exc}")

        return out

    def _parse_statement(self, stmt: Any, path: Path, stmt_index: int) -> QueryNode:
        """Parse a single SQL statement into a QueryNode.

        Source tables come from sqlglot scope analysis. When no scope can be
        built, every Table node in the statement is collected instead and the
        node is flagged ``parse_failed``; in that mode the table being written
        to is removed from the sources unless the statement also reads it.

        Args:
            stmt: sqlglot AST node
            path: Path to the source file
            stmt_index: Statement index in the file

        Returns:
            QueryNode with extracted metadata
        """
        kind = self._classify(stmt)
        sql = stmt.sql(dialect=self.DIALECT)

        # Extract target table for CREATE/INSERT statements
        target = None
        if isinstance(stmt, exp.Create):
            target = self._extract_target_table(stmt)
        elif isinstance(stmt, exp.Insert):
            target = self._extract_insert_target(stmt)

        sources: list[TableRef] = []
        ctes: dict[str, TableRef] = {}  # not populated by this parser yet
        confidence = 1.0
        parse_failed = False

        # Try to extract table references using scope analysis
        try:
            from sqlglot.optimizer.scope import build_scope

            root_scope = build_scope(stmt)
            if root_scope:
                sources = self._real_tables(root_scope)
            else:
                # No scope for this statement type: fall back to an AST walk.
                sources = self._fallback_table_scan(stmt)
                parse_failed = True
        except Exception as exc:
            logger.warning(
                "Failed to build scope for statement %d in %s: %s", stmt_index, path, exc
            )
            sources = self._fallback_table_scan(stmt)
            parse_failed = True

        if parse_failed:
            # The AST walk also visits the table being written to; it is not a source.
            sources = self._drop_write_target(stmt, target, sources)

        # Extract column lineage (currently minimal implementation)
        column_lineage = []

        # Remove duplicates while preserving order
        sources = self._deduplicate_table_refs(sources)

        return QueryNode(
            file=path,
            statement_index=stmt_index,
            sql=sql,
            kind=kind,
            target=target,
            sources=sources,
            ctes=ctes,
            column_lineage=column_lineage,
            parse_failed=parse_failed,
            confidence=confidence,
            parsing_mode="sqlglot",
        )

    @staticmethod
    def _drop_write_target(
        stmt: Any, target: TableRef | None, sources: list[TableRef]
    ) -> list[TableRef]:
        """Remove the written-to table from fallback sources when it is not also read.

        Args:
            stmt: sqlglot AST node the sources were collected from
            target: Target table of the statement, or None
            sources: Table references produced by the fallback scan

        Returns:
            ``sources`` unchanged when there is no target or the target is
            referenced by another Table node; otherwise a new list without it
        """
        if target is None:
            return sources

        # CREATE/INSERT keep the target under ``this``, optionally wrapped in a Schema.
        target_node = getattr(stmt, "this", None)
        while isinstance(target_node, exp.Schema):
            target_node = target_node.this

        for node in stmt.find_all(exp.Table):
            if node is target_node:
                continue
            ref = AnsiParser._convert_table_expr_to_ref(node)
            if ref and ref.full_id == target.full_id:
                # e.g. INSERT INTO t SELECT ... FROM t: t really is a source.
                return sources

        return [ref for ref in sources if ref.full_id != target.full_id]

    @staticmethod
    def _extract_target_table(create_stmt: exp.Create) -> TableRef | None:
        """Extract the target table from a CREATE statement.

        Args:
            create_stmt: CREATE expression

        Returns:
            TableRef, or None when the statement is not CREATE TABLE/VIEW
            or has no target expression
        """
        if create_stmt.kind not in ("TABLE", "VIEW"):
            return None

        if not create_stmt.this:
            return None

        return AnsiParser._convert_table_expr_to_ref(create_stmt.this)

    @staticmethod
    def _extract_insert_target(insert_stmt: exp.Insert) -> TableRef | None:
        """Extract the target table from an INSERT statement.

        Args:
            insert_stmt: INSERT expression

        Returns:
            TableRef, or None when the statement has no target expression
        """
        if not insert_stmt.this:
            return None

        return AnsiParser._convert_table_expr_to_ref(insert_stmt.this)

    @staticmethod
    def _fallback_table_scan(stmt: Any) -> list[TableRef]:
        """Fallback table extraction when scope analysis fails.

        Walks the AST tree to find all Table nodes and keeps the first
        reference seen for each distinct ``full_id``.

        Args:
            stmt: SQL expression

        Returns:
            List of TableRef objects found
        """
        tables = []
        seen = set()

        for table_node in stmt.find_all(exp.Table):
            ref = AnsiParser._convert_table_expr_to_ref(table_node)
            if ref and ref.full_id not in seen:
                tables.append(ref)
                seen.add(ref.full_id)

        return tables

    @staticmethod
    def _deduplicate_table_refs(tables: list[TableRef]) -> list[TableRef]:
        """Remove duplicate table references while preserving order.

        Two references are duplicates when their ``full_id`` is equal; the
        first occurrence wins.

        Args:
            tables: List of TableRef objects

        Returns:
            Deduplicated list
        """
        seen = set()
        result = []
        for table in tables:
            if table.full_id not in seen:
                result.append(table)
                seen.add(table.full_id)
        return result
|
sqlcg/parsers/base.py
ADDED
|
@@ -0,0 +1,414 @@
|
|
|
1
|
+
"""Base data models and abstract parser for SQL parsing and lineage extraction."""
|
|
2
|
+
|
|
3
|
+
from abc import ABC, abstractmethod
|
|
4
|
+
from dataclasses import dataclass, field
|
|
5
|
+
from enum import StrEnum
|
|
6
|
+
from pathlib import Path
|
|
7
|
+
from typing import TYPE_CHECKING, Any
|
|
8
|
+
|
|
9
|
+
if TYPE_CHECKING:
|
|
10
|
+
from sqlcg.lineage.schema_resolver import SchemaResolver
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
class QueryKind(StrEnum):
|
|
14
|
+
"""SQL query classification types."""
|
|
15
|
+
|
|
16
|
+
SELECT = "SELECT"
|
|
17
|
+
INSERT = "INSERT"
|
|
18
|
+
UPDATE = "UPDATE"
|
|
19
|
+
DELETE = "DELETE"
|
|
20
|
+
CREATE_TABLE = "CREATE_TABLE"
|
|
21
|
+
CREATE_VIEW = "CREATE_VIEW"
|
|
22
|
+
CREATE_PROC = "CREATE_PROC"
|
|
23
|
+
MERGE = "MERGE"
|
|
24
|
+
OTHER = "OTHER"
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
@dataclass(frozen=True)
|
|
28
|
+
class TableRef:
|
|
29
|
+
"""A reference to a table (immutable).
|
|
30
|
+
|
|
31
|
+
Attributes:
|
|
32
|
+
catalog: Optional catalog name (e.g., "my_warehouse")
|
|
33
|
+
db: Optional database/schema name (e.g., "public")
|
|
34
|
+
name: Table name (always required)
|
|
35
|
+
alias: Optional alias used in the query context
|
|
36
|
+
"""
|
|
37
|
+
|
|
38
|
+
catalog: str | None = None
|
|
39
|
+
db: str | None = None
|
|
40
|
+
name: str = ""
|
|
41
|
+
alias: str | None = None
|
|
42
|
+
|
|
43
|
+
@property
|
|
44
|
+
def full_id(self) -> str:
|
|
45
|
+
"""Return the fully qualified table identifier.
|
|
46
|
+
|
|
47
|
+
Includes all non-None components: catalog.db.name or db.name or name.
|
|
48
|
+
Used as the Table.qualified key in the graph.
|
|
49
|
+
"""
|
|
50
|
+
parts = [p for p in (self.catalog, self.db, self.name) if p]
|
|
51
|
+
return ".".join(parts)
|
|
52
|
+
|
|
53
|
+
@property
|
|
54
|
+
def qualified(self) -> str:
|
|
55
|
+
"""Alias for full_id for compatibility."""
|
|
56
|
+
return self.full_id
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
@dataclass(frozen=True)
class ColumnRef:
    """Immutable pointer to one column of a table.

    Attributes:
        table: TableRef of the table that owns the column
        name: Name of the column
    """

    table: TableRef
    name: str = ""

    @property
    def full_id(self) -> str:
        """Return the owning table's ``full_id`` followed by ``.`` and the column name.

        Used as the Column.id key in the graph.
        """
        owner = self.table.full_id
        return f"{owner}.{self.name}"
|
|
79
|
+
|
|
80
|
+
|
|
81
|
+
@dataclass(frozen=True)
class LineageEdge:
    """One column-to-column lineage link; frozen so it can live in sets and dicts.

    Attributes:
        src: ColumnRef the data flows from
        dst: ColumnRef the data flows into
        transform: Label for the transformation applied (e.g., "PASS_THROUGH", "AGGREGATION")
        confidence: Confidence score (0.0 to 1.0)
        query_id: Identifier of the query that produced this edge, if known
    """

    src: ColumnRef
    dst: ColumnRef
    transform: str = "UNKNOWN"
    confidence: float = 1.0
    query_id: str | None = None

    def __hash__(self) -> int:
        """Hash all five fields together so equal edges share a hash."""
        key = (self.src, self.dst, self.transform, self.confidence, self.query_id)
        return hash(key)

    def __eq__(self, other: object) -> bool:
        """Compare field by field; defer to the other operand for foreign types."""
        if isinstance(other, LineageEdge):
            return all(
                getattr(self, attr) == getattr(other, attr)
                for attr in ("src", "dst", "transform", "confidence", "query_id")
            )
        return NotImplemented
|
|
114
|
+
|
|
115
|
+
|
|
116
|
+
@dataclass
class QueryNode:
    """One parsed SQL statement together with the lineage extracted from it.

    The class is deliberately left mutable: resolution runs in two passes, the
    first extracting initial lineage and the second patching fields of the
    same QueryNode in place. Do not freeze this class.

    Note: do not rely on QueryNode identity after pass 2; use
    dataclasses.replace() to obtain a new instance if mutation semantics are
    a concern.

    Attributes:
        file: Path to the source file
        statement_index: 0-based position of the statement within the file
        sql: The SQL text of the statement
        kind: Query type (SELECT, INSERT, UPDATE, DELETE, CREATE_TABLE, CREATE_VIEW, etc.)
        target: TableRef written to by INSERT/CREATE statements, if any
        sources: TableRefs the statement reads from
        ctes: Mapping of CTE name to its TableRef definition
        column_lineage: LineageEdge objects for column-level lineage
        parse_failed: True when parsing failed and only table-level lineage is available
        confidence: Overall confidence score for this query's lineage
        parsing_mode: How the query was parsed (e.g., "sqlglot", "fallback", "scripting")
    """

    # Required: where the statement comes from and what it says.
    file: Path
    statement_index: int
    sql: str

    # Classification and table-level lineage.
    kind: str = ""
    target: TableRef | None = None
    sources: list[TableRef] = field(default_factory=list)
    ctes: dict[str, TableRef] = field(default_factory=dict)

    # Column-level lineage and quality indicators.
    column_lineage: list[LineageEdge] = field(default_factory=list)
    parse_failed: bool = False
    confidence: float = 1.0
    parsing_mode: str = "sqlglot"
|
|
152
|
+
|
|
153
|
+
|
|
154
|
+
@dataclass
class ParsedFile:
    """Everything extracted from one SQL file; mutable so results can be accumulated.

    Attributes:
        path: Path to the source file
        dialect: SQL dialect the file was parsed with
        statements: QueryNode objects, one per parsed statement
        defined_tables: TableRefs of tables defined in this file
        referenced_tables: TableRefs of tables referenced in this file
        errors: Error messages collected while parsing
    """

    path: Path
    dialect: str | None = None

    # Each instance gets its own empty containers.
    statements: list[QueryNode] = field(default_factory=list)
    defined_tables: list[TableRef] = field(default_factory=list)
    referenced_tables: list[TableRef] = field(default_factory=list)
    errors: list[str] = field(default_factory=list)

    @property
    def path_str(self) -> str:
        """Return ``path`` converted with ``str()``."""
        return str(self.path)
|
|
178
|
+
|
|
179
|
+
|
|
180
|
+
class SqlParser(ABC):
|
|
181
|
+
"""Abstract base class for SQL parsers.
|
|
182
|
+
|
|
183
|
+
Attributes:
|
|
184
|
+
DIALECT: SQL dialect identifier (None for ANSI, "snowflake", "bigquery", etc.)
|
|
185
|
+
_schema: SchemaResolver instance for table/column lookups
|
|
186
|
+
_log: Logger instance for this parser
|
|
187
|
+
"""
|
|
188
|
+
|
|
189
|
+
DIALECT: str | None = None
|
|
190
|
+
|
|
191
|
+
def __init__(self, schema_resolver: "SchemaResolver"):
|
|
192
|
+
"""Initialize parser with schema resolver.
|
|
193
|
+
|
|
194
|
+
Args:
|
|
195
|
+
schema_resolver: SchemaResolver instance for table/column lookups
|
|
196
|
+
"""
|
|
197
|
+
from sqlcg.utils.logging import getLogger
|
|
198
|
+
|
|
199
|
+
self._schema = schema_resolver
|
|
200
|
+
self._log = getLogger(f"{__name__}.{self.__class__.__name__}")
|
|
201
|
+
|
|
202
|
+
@abstractmethod
|
|
203
|
+
def parse_file(self, path: Path, sql: str) -> ParsedFile:
|
|
204
|
+
"""Parse SQL text and return a ParsedFile with all statements.
|
|
205
|
+
|
|
206
|
+
Args:
|
|
207
|
+
path: Path to the source file
|
|
208
|
+
sql: SQL text to parse
|
|
209
|
+
|
|
210
|
+
Returns:
|
|
211
|
+
ParsedFile containing parsed statements and metadata
|
|
212
|
+
"""
|
|
213
|
+
...
|
|
214
|
+
|
|
215
|
+
def _classify(self, stmt: Any) -> str:
|
|
216
|
+
"""Classify a SQL statement into a kind string.
|
|
217
|
+
|
|
218
|
+
Args:
|
|
219
|
+
stmt: sqlglot AST node
|
|
220
|
+
|
|
221
|
+
Returns:
|
|
222
|
+
One of: SELECT, INSERT, UPDATE, DELETE, CREATE_TABLE, CREATE_VIEW,
|
|
223
|
+
CREATE_PROC, MERGE, OTHER
|
|
224
|
+
"""
|
|
225
|
+
import sqlglot.expressions as exp
|
|
226
|
+
|
|
227
|
+
match stmt:
|
|
228
|
+
case exp.Select():
|
|
229
|
+
return QueryKind.SELECT
|
|
230
|
+
case exp.Insert():
|
|
231
|
+
return QueryKind.INSERT
|
|
232
|
+
case exp.Update():
|
|
233
|
+
return QueryKind.UPDATE
|
|
234
|
+
case exp.Delete():
|
|
235
|
+
return QueryKind.DELETE
|
|
236
|
+
case exp.Create():
|
|
237
|
+
match stmt.kind:
|
|
238
|
+
case "TABLE":
|
|
239
|
+
return QueryKind.CREATE_TABLE
|
|
240
|
+
case "VIEW":
|
|
241
|
+
return QueryKind.CREATE_VIEW
|
|
242
|
+
case "PROCEDURE" | "FUNCTION":
|
|
243
|
+
return QueryKind.CREATE_PROC
|
|
244
|
+
case _:
|
|
245
|
+
return QueryKind.CREATE_TABLE # Default to table
|
|
246
|
+
case exp.Merge():
|
|
247
|
+
return QueryKind.MERGE
|
|
248
|
+
case _:
|
|
249
|
+
return QueryKind.OTHER
|
|
250
|
+
|
|
251
|
+
def _real_tables(self, scope: Any) -> list[TableRef]:
|
|
252
|
+
"""Return real (non-CTE) tables referenced in a scope.
|
|
253
|
+
|
|
254
|
+
Args:
|
|
255
|
+
scope: sqlglot scope object
|
|
256
|
+
|
|
257
|
+
Returns:
|
|
258
|
+
List of TableRef objects for non-CTE tables
|
|
259
|
+
"""
|
|
260
|
+
if not scope:
|
|
261
|
+
return []
|
|
262
|
+
|
|
263
|
+
cte_sources = set(scope.cte_sources.keys()) if hasattr(scope, "cte_sources") else set()
|
|
264
|
+
tables = []
|
|
265
|
+
|
|
266
|
+
if hasattr(scope, "tables"):
|
|
267
|
+
# scope.tables is a list of Table expressions
|
|
268
|
+
table_list = scope.tables
|
|
269
|
+
if isinstance(table_list, list):
|
|
270
|
+
for table_expr in table_list:
|
|
271
|
+
# Extract the table name to check if it's a CTE
|
|
272
|
+
table_name = None
|
|
273
|
+
if hasattr(table_expr, "name"):
|
|
274
|
+
table_name = table_expr.name
|
|
275
|
+
|
|
276
|
+
if table_name not in cte_sources:
|
|
277
|
+
# Convert table expression to TableRef
|
|
278
|
+
ref = self._convert_table_expr_to_ref(table_expr)
|
|
279
|
+
if ref:
|
|
280
|
+
tables.append(ref)
|
|
281
|
+
|
|
282
|
+
return tables
|
|
283
|
+
|
|
284
|
+
@staticmethod
|
|
285
|
+
def _convert_table_expr_to_ref(table_expr: Any) -> "TableRef | None":
|
|
286
|
+
"""Convert a table expression to a TableRef.
|
|
287
|
+
|
|
288
|
+
Args:
|
|
289
|
+
table_expr: Table expression (e.g., Table, Identifier, Schema)
|
|
290
|
+
|
|
291
|
+
Returns:
|
|
292
|
+
TableRef or None
|
|
293
|
+
"""
|
|
294
|
+
import sqlglot.expressions as exp
|
|
295
|
+
|
|
296
|
+
match table_expr:
|
|
297
|
+
case exp.Table():
|
|
298
|
+
return TableRef(
|
|
299
|
+
catalog=table_expr.catalog,
|
|
300
|
+
db=table_expr.db,
|
|
301
|
+
name=table_expr.name,
|
|
302
|
+
)
|
|
303
|
+
case exp.Identifier():
|
|
304
|
+
return TableRef(name=table_expr.name)
|
|
305
|
+
case exp.Schema():
|
|
306
|
+
# Schema wraps the actual table
|
|
307
|
+
return SqlParser._convert_table_expr_to_ref(table_expr.this)
|
|
308
|
+
case _:
|
|
309
|
+
# Try to extract name from string representation
|
|
310
|
+
name = str(table_expr)
|
|
311
|
+
if name:
|
|
312
|
+
return TableRef(name=name)
|
|
313
|
+
return None
|
|
314
|
+
|
|
315
|
+
def _extract_column_lineage(
|
|
316
|
+
self, stmt: Any, path: Path, out: ParsedFile, schema: dict
|
|
317
|
+
) -> list[LineageEdge]:
|
|
318
|
+
"""Extract column-level lineage with structured error recording.
|
|
319
|
+
|
|
320
|
+
On sqlglot.lineage failure: log WARNING, append to ParsedFile.errors,
|
|
321
|
+
emit LineageEdge with confidence=0.0, continue (do NOT raise or skip silently).
|
|
322
|
+
|
|
323
|
+
For columns not found in the schema, emits edges with reduced confidence (0.5)
|
|
324
|
+
rather than skipping silently (only applies when schema is non-empty).
|
|
325
|
+
|
|
326
|
+
Args:
|
|
327
|
+
stmt: sqlglot AST node (Select/Insert/Create)
|
|
328
|
+
path: Path to the source file
|
|
329
|
+
out: ParsedFile object to append errors to
|
|
330
|
+
schema: Schema dict from _schema.as_dict()
|
|
331
|
+
|
|
332
|
+
Returns:
|
|
333
|
+
List of LineageEdge objects
|
|
334
|
+
"""
|
|
335
|
+
import sqlglot.expressions as exp
|
|
336
|
+
from sqlglot.lineage import lineage as sg_lineage
|
|
337
|
+
|
|
338
|
+
edges: list[LineageEdge] = []
|
|
339
|
+
|
|
340
|
+
# Only extract column lineage for certain statement types
|
|
341
|
+
if not isinstance(stmt, (exp.Select, exp.Insert, exp.Create)):
|
|
342
|
+
return edges
|
|
343
|
+
|
|
344
|
+
# Extract column references from SELECT list or target
|
|
345
|
+
try:
|
|
346
|
+
# Get the body of the query for lineage extraction
|
|
347
|
+
if isinstance(stmt, exp.Select):
|
|
348
|
+
body = stmt
|
|
349
|
+
# Extract output columns
|
|
350
|
+
for col_expr in stmt.expressions:
|
|
351
|
+
col_name = col_expr.alias if col_expr.alias else str(col_expr)
|
|
352
|
+
# Schema validation: if schema is loaded and column isn't in it,
|
|
353
|
+
# emit a reduced-confidence edge rather than a full-confidence one.
|
|
354
|
+
if schema:
|
|
355
|
+
table_cols: list[str] | None = None
|
|
356
|
+
for _scope_name, cols in schema.items():
|
|
357
|
+
if isinstance(cols, list):
|
|
358
|
+
table_cols = cols
|
|
359
|
+
break
|
|
360
|
+
elif isinstance(cols, dict):
|
|
361
|
+
for _db, tables in cols.items():
|
|
362
|
+
if isinstance(tables, dict):
|
|
363
|
+
for _tbl, tcols in tables.items():
|
|
364
|
+
if col_name in tcols:
|
|
365
|
+
table_cols = tcols
|
|
366
|
+
break
|
|
367
|
+
if table_cols is not None and col_name not in table_cols:
|
|
368
|
+
self._log.warning(
|
|
369
|
+
"column %s not found in schema, emitting low-confidence edge",
|
|
370
|
+
col_name,
|
|
371
|
+
)
|
|
372
|
+
edges.append(
|
|
373
|
+
LineageEdge(
|
|
374
|
+
src=ColumnRef(TableRef(None, None, "<unknown>"), col_name),
|
|
375
|
+
dst=ColumnRef(TableRef(None, None, "<unknown>"), col_name),
|
|
376
|
+
transform="UNKNOWN",
|
|
377
|
+
confidence=0.5,
|
|
378
|
+
)
|
|
379
|
+
)
|
|
380
|
+
continue
|
|
381
|
+
|
|
382
|
+
try:
|
|
383
|
+
root = sg_lineage(col_name, body, schema=schema, dialect=self.DIALECT)
|
|
384
|
+
if root:
|
|
385
|
+
# Successfully extracted lineage
|
|
386
|
+
# TODO: convert root to LineageEdge(s)
|
|
387
|
+
pass
|
|
388
|
+
except Exception as exc:
|
|
389
|
+
self._log.warning(
|
|
390
|
+
"column lineage extraction failed: file=%s col=%s error=%s",
|
|
391
|
+
path,
|
|
392
|
+
col_name,
|
|
393
|
+
exc,
|
|
394
|
+
)
|
|
395
|
+
out.errors.append(f"col_lineage:{col_name}:{exc}")
|
|
396
|
+
# Emit a zero-confidence placeholder edge
|
|
397
|
+
edges.append(
|
|
398
|
+
LineageEdge(
|
|
399
|
+
src=ColumnRef(TableRef(None, None, "<unknown>"), col_name),
|
|
400
|
+
dst=ColumnRef(TableRef(None, None, "<unknown>"), col_name),
|
|
401
|
+
transform="UNKNOWN",
|
|
402
|
+
confidence=0.0,
|
|
403
|
+
)
|
|
404
|
+
)
|
|
405
|
+
|
|
406
|
+
except Exception as exc:
|
|
407
|
+
self._log.warning(
|
|
408
|
+
"column lineage extraction failed for entire statement: file=%s error=%s",
|
|
409
|
+
path,
|
|
410
|
+
exc,
|
|
411
|
+
)
|
|
412
|
+
out.errors.append(f"col_lineage:statement:{exc}")
|
|
413
|
+
|
|
414
|
+
return edges
|
|
@@ -0,0 +1,77 @@
|
|
|
1
|
+
"""BigQuery SQL parser with scripting block detection."""
|
|
2
|
+
|
|
3
|
+
from pathlib import Path
|
|
4
|
+
|
|
5
|
+
from sqlcg.lineage.schema_resolver import SchemaResolver
|
|
6
|
+
from sqlcg.parsers.ansi_parser import AnsiParser
|
|
7
|
+
from sqlcg.parsers.base import ParsedFile
|
|
8
|
+
from sqlcg.parsers.registry import register
|
|
9
|
+
from sqlcg.utils.logging import getLogger
|
|
10
|
+
|
|
11
|
+
logger = getLogger(__name__)
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
@register("bigquery")
class BigQueryParser(AnsiParser):
    """BigQuery SQL parser.

    Handles BigQuery-specific features:
    - Scripting block detection (DECLARE, BEGIN)
    - Standard table/column lineage extraction
    """

    DIALECT: str | None = "bigquery"

    def __init__(self, schema_resolver: SchemaResolver):
        """Initialize BigQuery parser.

        Args:
            schema_resolver: SchemaResolver instance for table/column lookups
        """
        super().__init__(schema_resolver)

    def parse_file(self, path: Path, sql: str) -> ParsedFile:
        """Parse BigQuery SQL file with scripting block detection.

        A file that contains a scripting block is not parsed at all: the
        result carries no statements and a single
        ``parse_mode:scripting_block`` entry in ``errors``.

        Args:
            path: Path to the source file
            sql: SQL text to parse

        Returns:
            ParsedFile with parsed statements and metadata
        """
        if self._has_scripting_block(sql):
            logger.info("BigQuery scripting block detected in %s, marking as parse_failed", path)
            # Scripting blocks are not fully parseable; report that instead of statements.
            out = ParsedFile(path=path, dialect=self.DIALECT)
            out.errors.append("parse_mode:scripting_block")
            return out

        # Otherwise use standard ANSI parsing with BigQuery dialect
        return AnsiParser.parse_file(self, path, sql)  # type: ignore

    def _has_scripting_block(self, sql: str) -> bool:
        """Token-aware scripting block detection for BigQuery.

        Looks for DECLARE or BEGIN tokens. If tokenizing fails for any reason
        the text is scanned for those keywords as whole words instead, without
        regard to case.

        Args:
            sql: SQL text to check

        Returns:
            True if a scripting block is detected
        """
        try:
            from sqlglot.tokens import Tokenizer, TokenType  # type: ignore

            toks = Tokenizer.from_dialect("bigquery").tokenize(sql)  # type: ignore
            scripting_tokens = {
                TokenType.DECLARE,  # type: ignore
                TokenType.BEGIN,  # type: ignore
            }
            return any(t.token_type in scripting_tokens for t in toks)
        except Exception as exc:
            logger.debug("Token-based scripting detection failed (%s); scanning keywords", exc)

            import re

            # Word boundaries also catch a keyword followed by a newline, a
            # semicolon or the end of the text, which a trailing-space match misses.
            return re.search(r"\b(?:DECLARE|BEGIN)\b", sql, re.IGNORECASE) is not None
|