sql-glider 0.1.2__py3-none-any.whl → 0.1.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,7 +1,7 @@
1
1
  """Core lineage analysis using SQLGlot."""
2
2
 
3
3
  from enum import Enum
4
- from typing import Callable, Iterator, List, Optional, Set, Tuple
4
+ from typing import Callable, Dict, Iterator, List, Optional, Set, Tuple, Union
5
5
 
6
6
  from pydantic import BaseModel, Field
7
7
  from sqlglot import exp, parse
@@ -99,6 +99,9 @@ class LineageAnalyzer:
99
99
  self.sql = sql
100
100
  self.dialect = dialect
101
101
  self._skipped_queries: List[SkippedQuery] = []
102
+ # File-scoped schema context for cross-statement lineage
103
+ # Maps table/view names to their column definitions
104
+ self._file_schema: Dict[str, Dict[str, str]] = {}
102
105
 
103
106
  try:
104
107
  # Parse all statements in the SQL string
@@ -155,7 +158,25 @@ class LineageAnalyzer:
155
158
  if target_table:
156
159
  # DML/DDL: Use target table for output column qualification
157
160
  # The columns are from the SELECT, but qualified with the target table
158
- for projection in select_node.expressions:
161
+ projections = self._get_select_projections(select_node)
162
+ first_select = self._get_first_select(select_node)
163
+
164
+ for projection in projections:
165
+ # Handle SELECT * by resolving from file schema
166
+ if isinstance(projection, exp.Star):
167
+ if first_select:
168
+ star_columns = self._resolve_star_columns(first_select)
169
+ for star_col in star_columns:
170
+ qualified_name = f"{target_table}.{star_col}"
171
+ columns.append(qualified_name)
172
+ self._column_mapping[qualified_name] = star_col
173
+ if not columns:
174
+ # Fallback: can't resolve *, use * as column name
175
+ qualified_name = f"{target_table}.*"
176
+ columns.append(qualified_name)
177
+ self._column_mapping[qualified_name] = "*"
178
+ continue
179
+
159
180
  # Get the underlying expression (unwrap alias if present)
160
181
  if isinstance(projection, exp.Alias):
161
182
  # For aliased columns, use the alias as the column name
@@ -178,7 +199,10 @@ class LineageAnalyzer:
178
199
 
179
200
  else:
180
201
  # DQL (pure SELECT): Use the SELECT columns as output
181
- for projection in select_node.expressions:
202
+ projections = self._get_select_projections(select_node)
203
+ # Get the first SELECT for table resolution (handles UNION case)
204
+ first_select = self._get_first_select(select_node)
205
+ for projection in projections:
182
206
  # Get the underlying expression (unwrap alias if present)
183
207
  if isinstance(projection, exp.Alias):
184
208
  source_expr = projection.this
@@ -195,20 +219,20 @@ class LineageAnalyzer:
195
219
  table_name = source_expr.table
196
220
  col_name = column_name or source_expr.name
197
221
 
198
- if table_name:
222
+ if table_name and first_select:
199
223
  # Resolve table reference (could be table, CTE, or subquery alias)
200
224
  # This works at any nesting level because we're only looking at the immediate context
201
225
  resolved_table = self._resolve_table_reference(
202
- table_name, select_node
226
+ table_name, first_select
203
227
  )
204
228
  qualified_name = f"{resolved_table}.{col_name}"
205
229
  columns.append(qualified_name)
206
230
  # Map qualified name to what lineage expects
207
231
  self._column_mapping[qualified_name] = lineage_name or col_name
208
- else:
232
+ elif first_select:
209
233
  # No table qualifier - try to infer from FROM clause
210
234
  # This handles "SELECT col FROM single_source" cases
211
- inferred_table = self._infer_single_table_source(select_node)
235
+ inferred_table = self._infer_single_table_source(first_select)
212
236
  if inferred_table:
213
237
  qualified_name = f"{inferred_table}.{col_name}"
214
238
  columns.append(qualified_name)
@@ -219,6 +243,10 @@ class LineageAnalyzer:
219
243
  # Can't infer table, just use column name
220
244
  columns.append(col_name)
221
245
  self._column_mapping[col_name] = lineage_name or col_name
246
+ else:
247
+ # No SELECT found, just use column name
248
+ columns.append(col_name)
249
+ self._column_mapping[col_name] = lineage_name or col_name
222
250
  else:
223
251
  # For other expressions (literals, functions, etc.)
224
252
  # Use the alias if available, otherwise the SQL representation
@@ -232,6 +260,46 @@ class LineageAnalyzer:
232
260
 
233
261
  return columns
234
262
 
263
def _get_select_projections(self, node: exp.Expression) -> List[exp.Expression]:
    """
    Return the projection expressions that define the output of ``node``.

    A plain SELECT contributes a copy of its own projection list. A set
    operation (UNION, INTERSECT, EXCEPT) is followed through its left
    operand, repeatedly if the operations are nested, because every branch
    of a set operation must expose the same number of compatible columns.

    Args:
        node: A SELECT or set operation (UNION/INTERSECT/EXCEPT) expression

    Returns:
        The projections of the leftmost SELECT, or an empty list when the
        left spine does not end in a SELECT
    """
    current = node
    # Descend the left spine of (possibly nested) set operations.
    while isinstance(current, (exp.Union, exp.Intersect, exp.Except)):
        current = current.left

    if isinstance(current, exp.Select):
        return list(current.expressions)
    return []
283
+
284
def _get_first_select(self, node: exp.Expression) -> Optional[exp.Select]:
    """
    Locate the leftmost SELECT of a SELECT or set operation expression.

    Set operations (UNION, INTERSECT, EXCEPT) are unwrapped through their
    left operand until something that is not a set operation is reached.

    Args:
        node: A SELECT or set operation (UNION/INTERSECT/EXCEPT) expression

    Returns:
        The leftmost SELECT node, or None when the left spine ends in any
        other kind of expression
    """
    candidate = node
    # Walk down the left operands of nested set operations.
    while isinstance(candidate, (exp.Union, exp.Intersect, exp.Except)):
        candidate = candidate.left

    return candidate if isinstance(candidate, exp.Select) else None
302
+
235
303
  def analyze_queries(
236
304
  self,
237
305
  level: AnalysisLevel = AnalysisLevel.COLUMN,
@@ -276,6 +344,7 @@ class LineageAnalyzer:
276
344
  """
277
345
  results = []
278
346
  self._skipped_queries = [] # Reset skipped queries for this analysis
347
+ self._file_schema = {} # Reset file schema for this analysis run
279
348
 
280
349
  for query_index, expr, preview in self._iterate_queries(table_filter):
281
350
  # Temporarily swap self.expr to analyze this query
@@ -327,6 +396,9 @@ class LineageAnalyzer:
327
396
  )
328
397
  )
329
398
  finally:
399
+ # Extract schema from this statement AFTER analysis
400
+ # This builds up context for subsequent statements to use
401
+ self._extract_schema_from_statement(expr)
330
402
  # Restore original expression
331
403
  self.expr = original_expr
332
404
 
@@ -654,7 +726,13 @@ class LineageAnalyzer:
654
726
  lineage_col = self._column_mapping.get(col, col)
655
727
 
656
728
  # Get lineage tree for this column using current query SQL only
657
- node = lineage(lineage_col, current_query_sql, dialect=self.dialect)
729
+ # Pass file schema to enable SELECT * expansion for known tables/views
730
+ node = lineage(
731
+ lineage_col,
732
+ current_query_sql,
733
+ dialect=self.dialect,
734
+ schema=self._file_schema if self._file_schema else None,
735
+ )
658
736
 
659
737
  # Collect all source columns
660
738
  sources: Set[str] = set()
@@ -795,7 +873,9 @@ class LineageAnalyzer:
795
873
 
796
874
  def _get_target_and_select(
797
875
  self,
798
- ) -> Optional[tuple[Optional[str], exp.Select]]:
876
+ ) -> Optional[
877
+ tuple[Optional[str], Union[exp.Select, exp.Union, exp.Intersect, exp.Except]]
878
+ ]:
799
879
  """
800
880
  Detect if this is a DML/DDL statement and extract the target table and SELECT node.
801
881
 
@@ -817,9 +897,11 @@ class LineageAnalyzer:
817
897
  target = self.expr.this
818
898
  if isinstance(target, exp.Table):
819
899
  target_name = self._get_qualified_table_name(target)
820
- # Find the SELECT within the INSERT
900
+ # Find the SELECT within the INSERT (may be a set operation)
821
901
  select_node = self.expr.expression
822
- if isinstance(select_node, exp.Select):
902
+ if isinstance(
903
+ select_node, (exp.Select, exp.Union, exp.Intersect, exp.Except)
904
+ ):
823
905
  return (target_name, select_node)
824
906
 
825
907
  # Check for CREATE TABLE AS SELECT (CTAS) or CREATE VIEW AS SELECT
@@ -831,9 +913,11 @@ class LineageAnalyzer:
831
913
  target = target.this
832
914
  if isinstance(target, exp.Table):
833
915
  target_name = self._get_qualified_table_name(target)
834
- # Find the SELECT in the expression
916
+ # Find the SELECT in the expression (may be a set operation)
835
917
  select_node = self.expr.expression
836
- if isinstance(select_node, exp.Select):
918
+ if isinstance(
919
+ select_node, (exp.Select, exp.Union, exp.Intersect, exp.Except)
920
+ ):
837
921
  return (target_name, select_node)
838
922
 
839
923
  # Check for MERGE statement
@@ -1181,3 +1265,187 @@ class LineageAnalyzer:
1181
1265
  preview = self._generate_query_preview(expr)
1182
1266
 
1183
1267
  yield idx, expr, preview
1268
+
1269
+ # -------------------------------------------------------------------------
1270
+ # File-scoped schema context methods
1271
+ # -------------------------------------------------------------------------
1272
+
1273
def _extract_schema_from_statement(self, expr: exp.Expression) -> None:
    """
    Record the output columns of a CREATE VIEW / CREATE TABLE ... AS SELECT.

    The discovered columns are stored in ``self._file_schema`` under the
    qualified name of the created object, so that later statements in the
    same file can have SELECT * expanded against them. Statements that are
    not a CREATE of kind VIEW or TABLE, that have no query body, or whose
    query yields no column names leave the schema untouched.

    Args:
        expr: The SQL expression to extract schema from
    """
    # Anything other than CREATE VIEW / CREATE TABLE is ignored.
    if not (isinstance(expr, exp.Create) and expr.kind in ("VIEW", "TABLE")):
        return

    # A Schema node wraps the table when the CREATE carries a column list.
    created = expr.this
    if isinstance(created, exp.Schema):
        created = created.this
    if not isinstance(created, exp.Table):
        return

    target_name = self._get_qualified_table_name(created)

    query = expr.expression
    if query is None:
        return

    # Unwrap a parenthesised body, e.g. CREATE VIEW v AS (SELECT ...).
    if isinstance(query, exp.Subquery):
        query = query.this

    if isinstance(query, (exp.Select, exp.Union, exp.Intersect, exp.Except)):
        names = self._extract_columns_from_select(query)
        if names:
            # Types are recorded as UNKNOWN; only the column names are stored.
            self._file_schema[target_name] = dict.fromkeys(names, "UNKNOWN")
1318
+
1319
def _extract_columns_from_select(
    self, select_node: Union[exp.Select, exp.Union, exp.Intersect, exp.Except]
) -> List[str]:
    """
    Derive the output column names of a SELECT or set operation.

    Aliased projections contribute their alias, plain column references
    contribute the column name, and a bare ``*`` is expanded through
    ``_resolve_star_columns``. Any other projection is represented by its
    SQL text in the analyzer's dialect.

    Args:
        select_node: The SELECT or set operation expression

    Returns:
        List of column names, in projection order
    """
    # For set operations both helpers follow the leftmost branch.
    projections = self._get_select_projections(select_node)
    first_select = self._get_first_select(select_node)

    names: List[str] = []
    for item in projections:
        if isinstance(item, exp.Alias):
            names.append(item.alias)
            continue
        if isinstance(item, exp.Column):
            names.append(item.name)
            continue
        if isinstance(item, exp.Star):
            # A star with no SELECT to resolve against contributes nothing.
            if first_select:
                names.extend(self._resolve_star_columns(first_select))
            continue
        # Unaliased expression: fall back to its SQL rendering.
        names.append(item.sql(dialect=self.dialect))

    return names
1358
+
1359
def _resolve_star_columns(self, select_node: exp.Select) -> List[str]:
    """
    Expand ``SELECT *`` into column names using the file schema or CTEs.

    Only the primary source of the FROM clause is inspected. A table source
    is looked up in ``self._file_schema`` first and, failing that, treated
    as a possible CTE of the same statement. An aliased subquery source is
    resolved only if its alias happens to be a key of the file schema.

    Args:
        select_node: The SELECT node containing the * reference

    Returns:
        List of column names if source is known, empty list otherwise
    """
    from_clause = select_node.args.get("from")
    if not (from_clause and isinstance(from_clause, exp.From)):
        return []

    source = from_clause.this

    if isinstance(source, exp.Table):
        source_name = self._get_qualified_table_name(source)
        # Objects created by earlier statements take precedence over CTEs.
        if source_name in self._file_schema:
            return list(self._file_schema[source_name].keys())
        return list(self._resolve_cte_columns(source_name, select_node))

    # Subqueries are not analysed; only a schema entry under the alias helps.
    if (
        isinstance(source, exp.Subquery)
        and source.alias
        and source.alias in self._file_schema
    ):
        return list(self._file_schema[source.alias].keys())

    return []
1397
+
1398
def _resolve_cte_columns(self, cte_name: str, select_node: exp.Select) -> List[str]:
    """
    Resolve columns from a CTE definition within the same statement.

    Starting at ``select_node`` and moving outwards through its ancestors,
    every WITH clause encountered is searched for a CTE whose alias equals
    ``cte_name``. The CTE body may be a plain SELECT or a set operation
    (UNION/INTERSECT/EXCEPT); for a set operation the leftmost SELECT
    supplies the column names, matching how set operations are treated by
    ``_get_first_select`` elsewhere in this class.

    Args:
        cte_name: Name of the CTE to resolve
        select_node: The SELECT node that references the CTE

    Returns:
        List of column names from the CTE, empty if no matching CTE with a
        resolvable SELECT body is found
    """
    scope = select_node
    while scope is not None:
        with_clause = scope.args.get("with") if hasattr(scope, "args") else None
        if with_clause:
            for cte in with_clause.expressions:
                if not (isinstance(cte, exp.CTE) and cte.alias == cte_name):
                    continue
                # Unwrap set operations so UNION-bodied CTEs resolve too.
                cte_select = self._get_first_select(cte.this)
                if cte_select is not None:
                    return self._extract_cte_select_columns(cte_select)
        # Move outwards to the enclosing expression, if any.
        scope = getattr(scope, "parent", None)

    return []
1423
+
1424
def _extract_cte_select_columns(self, cte_select: exp.Select) -> List[str]:
    """
    List the output column names of a CTE's SELECT.

    Aliases and plain column references contribute their names, a bare
    ``*`` is expanded via ``_resolve_star_columns`` against the same
    SELECT, and any other projection is represented by its SQL text in the
    analyzer's dialect.

    Args:
        cte_select: The SELECT expression within the CTE

    Returns:
        List of column names, in projection order
    """
    names: List[str] = []

    for item in cte_select.expressions:
        if isinstance(item, exp.Alias):
            names.append(item.alias)
            continue
        if isinstance(item, exp.Column):
            names.append(item.name)
            continue
        if isinstance(item, exp.Star):
            # Expand * from whatever the CTE itself selects from.
            names.extend(self._resolve_star_columns(cte_select))
            continue
        # Unaliased expression: use its SQL rendering as the name.
        names.append(item.sql(dialect=self.dialect))

    return names
sqlglider/utils/config.py CHANGED
@@ -23,6 +23,28 @@ class TemplatingConfig(BaseModel):
23
23
  variables: Optional[Dict[str, Any]] = None
24
24
 
25
25
 
26
class DatabricksCatalogConfig(BaseModel):
    """Settings for the Databricks catalog provider.

    Every field defaults to None, so none of them is required here; the
    values can alternatively be supplied through environment variables.
    The SDK supports unified authentication with multiple methods.
    """

    warehouse_id: Optional[str] = None
    # Name of a Databricks CLI profile defined in ~/.databrickscfg
    profile: Optional[str] = None
    host: Optional[str] = None
    # Legacy personal access token; OAuth or a profile is preferred
    token: Optional[str] = None
37
+
38
+
39
class CatalogConfig(BaseModel):
    """Container for catalog provider settings.

    Each provider's configuration lives under its own sub-key.
    """

    # Databricks-specific settings; None when the sub-key is not configured
    databricks: Optional[DatabricksCatalogConfig] = None
46
+
47
+
26
48
  class ConfigSettings(BaseModel):
27
49
  """Configuration settings for SQL Glider.
28
50
 
@@ -35,6 +57,9 @@ class ConfigSettings(BaseModel):
35
57
  output_format: Optional[str] = None
36
58
  templater: Optional[str] = None
37
59
  templating: Optional[TemplatingConfig] = None
60
+ catalog_type: Optional[str] = None
61
+ ddl_folder: Optional[str] = None
62
+ catalog: Optional[CatalogConfig] = None
38
63
 
39
64
 
40
65
  def find_config_file(start_path: Optional[Path] = None) -> Optional[Path]:
@@ -1,26 +0,0 @@
1
- sqlglider/__init__.py,sha256=gDf7s52dMcX7JuCZ1SLawcB1vb3U0yJCohu9RQAATBY,125
2
- sqlglider/_version.py,sha256=Ok5oAXdWgR9aghaFXTafTeDW6sYO3uVe6d2Nket57R4,704
3
- sqlglider/cli.py,sha256=POWIhv0jfvoNtwSoURpxJydco1rvxX9rAvyjuA9FGC8,36445
4
- sqlglider/global_models.py,sha256=2vyJXAuXOsXQpE-D3F0ejj7eR9z0nDWFjTkielhzM8k,356
5
- sqlglider/graph/__init__.py,sha256=4DDdrPM75CmeQWt7wHdBsjCm1s70BHGLYdijIbaUEKY,871
6
- sqlglider/graph/builder.py,sha256=rrcpGAXLz-VHZ1Y73uw6R7kMXHpzBz7tQ2tdV5BY05w,10202
7
- sqlglider/graph/merge.py,sha256=uUZlm4BN3S9gRL66Cc2mzhbtuh4SVAv2n4cN4eUEQBU,4077
8
- sqlglider/graph/models.py,sha256=EYmjv_WzDSNp_WfhJ6H-qBIOkAcoNKS7GRUryfKrHuY,9330
9
- sqlglider/graph/query.py,sha256=LHU8Cvn7ZPPSEnqdDn2pF8f1_LQjIvNIrZqs8cFlb6U,9433
10
- sqlglider/graph/serialization.py,sha256=7JJo31rwSlxnDhdqdTJdK4Dr_ZcSYetXfx3_CmndSac,2662
11
- sqlglider/lineage/__init__.py,sha256=llXMeI5_PIZaiBo8tKk3-wOubF4m_6QBHbn1FtWxT7k,256
12
- sqlglider/lineage/analyzer.py,sha256=58lyrUc0XsCUrYSb23A02OSBmq7eCtJwc477PbjS3c0,45905
13
- sqlglider/lineage/formatters.py,sha256=_Y9wcTX4JXn1vVnZ1xI656g1FF2rMjcAVc-GHjbd9QA,10389
14
- sqlglider/templating/__init__.py,sha256=g3_wb6rSDI0usq2UUMDpn-J5kVwlAw3NtLdwbxL6UHs,1435
15
- sqlglider/templating/base.py,sha256=y5bWAW7qXl_4pPyo5KycfHwNVvt1-7slZ63DAsvTE1s,2902
16
- sqlglider/templating/jinja.py,sha256=o01UG72N4G1-tOT5LKK1Wkccv4nJH2VN4VFaMi5c1-g,5220
17
- sqlglider/templating/registry.py,sha256=BJU3N2qNVMTUtkgbibyqo8Wme_acXQRw5XI-6ZVgyac,3476
18
- sqlglider/templating/variables.py,sha256=5593PtLBcOxsnMCSRm2pGAD5I0Y9f__VV3_J_HfXVlQ,8010
19
- sqlglider/utils/__init__.py,sha256=KGp9-UzKz_OFBOTFoSy-g-NXDZsvyWXG_9-1zcC6ePE,276
20
- sqlglider/utils/config.py,sha256=mkven_CcE_dNfKiHi0h2CsE5TMQDX9XqbU7GGEELwEY,3959
21
- sqlglider/utils/file_utils.py,sha256=5_ff28E0r1R7emZzsOnRuHd-7zIX6873eyr1SuPEr4E,1093
22
- sql_glider-0.1.2.dist-info/METADATA,sha256=JUXRDvhfnJBj2owWMaupDugZj4Y6uDv1R7RCCkaEWlw,22349
23
- sql_glider-0.1.2.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
24
- sql_glider-0.1.2.dist-info/entry_points.txt,sha256=LWVdQEfvDT5uZ2RQ4Rse8m0HxBCOMbbqDkxdwUh9d78,169
25
- sql_glider-0.1.2.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
26
- sql_glider-0.1.2.dist-info/RECORD,,