sql-glider 0.1.4__tar.gz → 0.1.6__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {sql_glider-0.1.4 → sql_glider-0.1.6}/PKG-INFO +1 -1
- sql_glider-0.1.6/plans/2026-01-26-file-scoped-schema-context.md +201 -0
- {sql_glider-0.1.4 → sql_glider-0.1.6}/src/sqlglider/_version.py +2 -2
- {sql_glider-0.1.4 → sql_glider-0.1.6}/src/sqlglider/lineage/analyzer.py +198 -18
- sql_glider-0.1.6/tests/fixtures/original_queries/test_cte_view_star.sql +22 -0
- {sql_glider-0.1.4 → sql_glider-0.1.6}/tests/sqlglider/lineage/test_analyzer.py +271 -0
- {sql_glider-0.1.4 → sql_glider-0.1.6}/.github/workflows/ci.yml +0 -0
- {sql_glider-0.1.4 → sql_glider-0.1.6}/.github/workflows/publish.yml +0 -0
- {sql_glider-0.1.4 → sql_glider-0.1.6}/.gitignore +0 -0
- {sql_glider-0.1.4 → sql_glider-0.1.6}/.python-version +0 -0
- {sql_glider-0.1.4 → sql_glider-0.1.6}/ARCHITECTURE.md +0 -0
- {sql_glider-0.1.4 → sql_glider-0.1.6}/CLAUDE.md +0 -0
- {sql_glider-0.1.4 → sql_glider-0.1.6}/LICENSE +0 -0
- {sql_glider-0.1.4 → sql_glider-0.1.6}/README.md +0 -0
- {sql_glider-0.1.4 → sql_glider-0.1.6}/plans/2025-12-05-column-level-lineage.md +0 -0
- {sql_glider-0.1.4 → sql_glider-0.1.6}/plans/2025-12-05-reverse-lineage.md +0 -0
- {sql_glider-0.1.4 → sql_glider-0.1.6}/plans/2025-12-06-config-file-support.md +0 -0
- {sql_glider-0.1.4 → sql_glider-0.1.6}/plans/2025-12-06-graph-lineage.md +0 -0
- {sql_glider-0.1.4 → sql_glider-0.1.6}/plans/2025-12-06-unify-single-multi-query.md +0 -0
- {sql_glider-0.1.4 → sql_glider-0.1.6}/plans/2025-12-07-sample-data-model.md +0 -0
- {sql_glider-0.1.4 → sql_glider-0.1.6}/plans/2025-12-07-sql-templating.md +0 -0
- {sql_glider-0.1.4 → sql_glider-0.1.6}/plans/2025-12-08-tables-command.md +0 -0
- {sql_glider-0.1.4 → sql_glider-0.1.6}/plans/2025-12-09-graph-query-paths.md +0 -0
- {sql_glider-0.1.4 → sql_glider-0.1.6}/plans/2025-12-13-dissect-command.md +0 -0
- {sql_glider-0.1.4 → sql_glider-0.1.6}/plans/2025-12-14-tables-pull-command.md +0 -0
- {sql_glider-0.1.4 → sql_glider-0.1.6}/plans/2026-01-25-fix-union-lineage-chain.md +0 -0
- {sql_glider-0.1.4 → sql_glider-0.1.6}/pyproject.toml +0 -0
- {sql_glider-0.1.4 → sql_glider-0.1.6}/sample_data_model/README.md +0 -0
- {sql_glider-0.1.4 → sql_glider-0.1.6}/sample_data_model/business/expire_dim_customer.sql +0 -0
- {sql_glider-0.1.4 → sql_glider-0.1.6}/sample_data_model/business/load_fact_orders.sql +0 -0
- {sql_glider-0.1.4 → sql_glider-0.1.6}/sample_data_model/business/load_fact_payments.sql +0 -0
- {sql_glider-0.1.4 → sql_glider-0.1.6}/sample_data_model/business/merge_dim_customer.sql +0 -0
- {sql_glider-0.1.4 → sql_glider-0.1.6}/sample_data_model/business/merge_dim_product.sql +0 -0
- {sql_glider-0.1.4 → sql_glider-0.1.6}/sample_data_model/business/update_dim_customer_metrics.sql +0 -0
- {sql_glider-0.1.4 → sql_glider-0.1.6}/sample_data_model/complex/conditional_merge.sql +0 -0
- {sql_glider-0.1.4 → sql_glider-0.1.6}/sample_data_model/complex/cte_insert.sql +0 -0
- {sql_glider-0.1.4 → sql_glider-0.1.6}/sample_data_model/complex/multi_table_transform.sql +0 -0
- {sql_glider-0.1.4 → sql_glider-0.1.6}/sample_data_model/ddl/dim_customer.sql +0 -0
- {sql_glider-0.1.4 → sql_glider-0.1.6}/sample_data_model/ddl/dim_product.sql +0 -0
- {sql_glider-0.1.4 → sql_glider-0.1.6}/sample_data_model/ddl/fact_orders.sql +0 -0
- {sql_glider-0.1.4 → sql_glider-0.1.6}/sample_data_model/ddl/fact_payments.sql +0 -0
- {sql_glider-0.1.4 → sql_glider-0.1.6}/sample_data_model/ddl/raw_addresses.sql +0 -0
- {sql_glider-0.1.4 → sql_glider-0.1.6}/sample_data_model/ddl/raw_customers.sql +0 -0
- {sql_glider-0.1.4 → sql_glider-0.1.6}/sample_data_model/ddl/raw_order_items.sql +0 -0
- {sql_glider-0.1.4 → sql_glider-0.1.6}/sample_data_model/ddl/raw_orders.sql +0 -0
- {sql_glider-0.1.4 → sql_glider-0.1.6}/sample_data_model/ddl/raw_payments.sql +0 -0
- {sql_glider-0.1.4 → sql_glider-0.1.6}/sample_data_model/ddl/raw_products.sql +0 -0
- {sql_glider-0.1.4 → sql_glider-0.1.6}/sample_data_model/ddl/stg_customers.sql +0 -0
- {sql_glider-0.1.4 → sql_glider-0.1.6}/sample_data_model/ddl/stg_orders.sql +0 -0
- {sql_glider-0.1.4 → sql_glider-0.1.6}/sample_data_model/ddl/stg_payments.sql +0 -0
- {sql_glider-0.1.4 → sql_glider-0.1.6}/sample_data_model/ddl/stg_products.sql +0 -0
- {sql_glider-0.1.4 → sql_glider-0.1.6}/sample_data_model/incremental/incr_fact_orders.sql +0 -0
- {sql_glider-0.1.4 → sql_glider-0.1.6}/sample_data_model/incremental/incr_fact_payments.sql +0 -0
- {sql_glider-0.1.4 → sql_glider-0.1.6}/sample_data_model/incremental/incr_pres_sales_summary.sql +0 -0
- {sql_glider-0.1.4 → sql_glider-0.1.6}/sample_data_model/maintenance/delete_expired_customers.sql +0 -0
- {sql_glider-0.1.4 → sql_glider-0.1.6}/sample_data_model/maintenance/update_product_status.sql +0 -0
- {sql_glider-0.1.4 → sql_glider-0.1.6}/sample_data_model/presentation/load_pres_customer_360.sql +0 -0
- {sql_glider-0.1.4 → sql_glider-0.1.6}/sample_data_model/presentation/load_pres_customer_cohort.sql +0 -0
- {sql_glider-0.1.4 → sql_glider-0.1.6}/sample_data_model/presentation/load_pres_product_performance.sql +0 -0
- {sql_glider-0.1.4 → sql_glider-0.1.6}/sample_data_model/presentation/load_pres_sales_summary.sql +0 -0
- {sql_glider-0.1.4 → sql_glider-0.1.6}/sample_data_model/staging/load_stg_customers.sql +0 -0
- {sql_glider-0.1.4 → sql_glider-0.1.6}/sample_data_model/staging/load_stg_orders.sql +0 -0
- {sql_glider-0.1.4 → sql_glider-0.1.6}/sample_data_model/staging/load_stg_payments.sql +0 -0
- {sql_glider-0.1.4 → sql_glider-0.1.6}/sample_data_model/staging/load_stg_products.sql +0 -0
- {sql_glider-0.1.4 → sql_glider-0.1.6}/sqlglider.toml.example +0 -0
- {sql_glider-0.1.4 → sql_glider-0.1.6}/src/sqlglider/__init__.py +0 -0
- {sql_glider-0.1.4 → sql_glider-0.1.6}/src/sqlglider/catalog/__init__.py +0 -0
- {sql_glider-0.1.4 → sql_glider-0.1.6}/src/sqlglider/catalog/base.py +0 -0
- {sql_glider-0.1.4 → sql_glider-0.1.6}/src/sqlglider/catalog/databricks.py +0 -0
- {sql_glider-0.1.4 → sql_glider-0.1.6}/src/sqlglider/catalog/registry.py +0 -0
- {sql_glider-0.1.4 → sql_glider-0.1.6}/src/sqlglider/cli.py +0 -0
- {sql_glider-0.1.4 → sql_glider-0.1.6}/src/sqlglider/dissection/__init__.py +0 -0
- {sql_glider-0.1.4 → sql_glider-0.1.6}/src/sqlglider/dissection/analyzer.py +0 -0
- {sql_glider-0.1.4 → sql_glider-0.1.6}/src/sqlglider/dissection/formatters.py +0 -0
- {sql_glider-0.1.4 → sql_glider-0.1.6}/src/sqlglider/dissection/models.py +0 -0
- {sql_glider-0.1.4 → sql_glider-0.1.6}/src/sqlglider/global_models.py +0 -0
- {sql_glider-0.1.4 → sql_glider-0.1.6}/src/sqlglider/graph/__init__.py +0 -0
- {sql_glider-0.1.4 → sql_glider-0.1.6}/src/sqlglider/graph/builder.py +0 -0
- {sql_glider-0.1.4 → sql_glider-0.1.6}/src/sqlglider/graph/merge.py +0 -0
- {sql_glider-0.1.4 → sql_glider-0.1.6}/src/sqlglider/graph/models.py +0 -0
- {sql_glider-0.1.4 → sql_glider-0.1.6}/src/sqlglider/graph/query.py +0 -0
- {sql_glider-0.1.4 → sql_glider-0.1.6}/src/sqlglider/graph/serialization.py +0 -0
- {sql_glider-0.1.4 → sql_glider-0.1.6}/src/sqlglider/lineage/__init__.py +0 -0
- {sql_glider-0.1.4 → sql_glider-0.1.6}/src/sqlglider/lineage/formatters.py +0 -0
- {sql_glider-0.1.4 → sql_glider-0.1.6}/src/sqlglider/templating/__init__.py +0 -0
- {sql_glider-0.1.4 → sql_glider-0.1.6}/src/sqlglider/templating/base.py +0 -0
- {sql_glider-0.1.4 → sql_glider-0.1.6}/src/sqlglider/templating/jinja.py +0 -0
- {sql_glider-0.1.4 → sql_glider-0.1.6}/src/sqlglider/templating/registry.py +0 -0
- {sql_glider-0.1.4 → sql_glider-0.1.6}/src/sqlglider/templating/variables.py +0 -0
- {sql_glider-0.1.4 → sql_glider-0.1.6}/src/sqlglider/utils/__init__.py +0 -0
- {sql_glider-0.1.4 → sql_glider-0.1.6}/src/sqlglider/utils/config.py +0 -0
- {sql_glider-0.1.4 → sql_glider-0.1.6}/src/sqlglider/utils/file_utils.py +0 -0
- {sql_glider-0.1.4 → sql_glider-0.1.6}/tests/__init__.py +0 -0
- {sql_glider-0.1.4 → sql_glider-0.1.6}/tests/fixtures/multi_file_queries/analytics_pipeline.sql +0 -0
- {sql_glider-0.1.4 → sql_glider-0.1.6}/tests/fixtures/multi_file_queries/analytics_pipeline_union_merge.sql +0 -0
- {sql_glider-0.1.4 → sql_glider-0.1.6}/tests/fixtures/multi_file_queries/customers.sql +0 -0
- {sql_glider-0.1.4 → sql_glider-0.1.6}/tests/fixtures/multi_file_queries/orders.sql +0 -0
- {sql_glider-0.1.4 → sql_glider-0.1.6}/tests/fixtures/multi_file_queries/reports.sql +0 -0
- {sql_glider-0.1.4 → sql_glider-0.1.6}/tests/fixtures/multi_file_queries/view_based_merge.sql +0 -0
- {sql_glider-0.1.4 → sql_glider-0.1.6}/tests/fixtures/original_queries/test_cte.sql +0 -0
- {sql_glider-0.1.4 → sql_glider-0.1.6}/tests/fixtures/original_queries/test_cte_query.sql +0 -0
- {sql_glider-0.1.4 → sql_glider-0.1.6}/tests/fixtures/original_queries/test_generated_column_query.sql +0 -0
- {sql_glider-0.1.4 → sql_glider-0.1.6}/tests/fixtures/original_queries/test_multi.sql +0 -0
- {sql_glider-0.1.4 → sql_glider-0.1.6}/tests/fixtures/original_queries/test_multi_query.sql +0 -0
- {sql_glider-0.1.4 → sql_glider-0.1.6}/tests/fixtures/original_queries/test_single_query.sql +0 -0
- {sql_glider-0.1.4 → sql_glider-0.1.6}/tests/fixtures/original_queries/test_subquery.sql +0 -0
- {sql_glider-0.1.4 → sql_glider-0.1.6}/tests/fixtures/original_queries/test_tables.sql +0 -0
- {sql_glider-0.1.4 → sql_glider-0.1.6}/tests/fixtures/original_queries/test_view.sql +0 -0
- {sql_glider-0.1.4 → sql_glider-0.1.6}/tests/fixtures/original_queries/test_view_window_cte.sql +0 -0
- {sql_glider-0.1.4 → sql_glider-0.1.6}/tests/fixtures/sample_manifest.csv +0 -0
- {sql_glider-0.1.4 → sql_glider-0.1.6}/tests/sqlglider/__init__.py +0 -0
- {sql_glider-0.1.4 → sql_glider-0.1.6}/tests/sqlglider/catalog/__init__.py +0 -0
- {sql_glider-0.1.4 → sql_glider-0.1.6}/tests/sqlglider/catalog/test_base.py +0 -0
- {sql_glider-0.1.4 → sql_glider-0.1.6}/tests/sqlglider/catalog/test_databricks.py +0 -0
- {sql_glider-0.1.4 → sql_glider-0.1.6}/tests/sqlglider/catalog/test_registry.py +0 -0
- {sql_glider-0.1.4 → sql_glider-0.1.6}/tests/sqlglider/dissection/__init__.py +0 -0
- {sql_glider-0.1.4 → sql_glider-0.1.6}/tests/sqlglider/dissection/test_analyzer.py +0 -0
- {sql_glider-0.1.4 → sql_glider-0.1.6}/tests/sqlglider/dissection/test_formatters.py +0 -0
- {sql_glider-0.1.4 → sql_glider-0.1.6}/tests/sqlglider/dissection/test_models.py +0 -0
- {sql_glider-0.1.4 → sql_glider-0.1.6}/tests/sqlglider/graph/__init__.py +0 -0
- {sql_glider-0.1.4 → sql_glider-0.1.6}/tests/sqlglider/graph/test_builder.py +0 -0
- {sql_glider-0.1.4 → sql_glider-0.1.6}/tests/sqlglider/graph/test_merge.py +0 -0
- {sql_glider-0.1.4 → sql_glider-0.1.6}/tests/sqlglider/graph/test_models.py +0 -0
- {sql_glider-0.1.4 → sql_glider-0.1.6}/tests/sqlglider/graph/test_query.py +0 -0
- {sql_glider-0.1.4 → sql_glider-0.1.6}/tests/sqlglider/graph/test_serialization.py +0 -0
- {sql_glider-0.1.4 → sql_glider-0.1.6}/tests/sqlglider/lineage/__init__.py +0 -0
- {sql_glider-0.1.4 → sql_glider-0.1.6}/tests/sqlglider/lineage/test_formatters.py +0 -0
- {sql_glider-0.1.4 → sql_glider-0.1.6}/tests/sqlglider/templating/__init__.py +0 -0
- {sql_glider-0.1.4 → sql_glider-0.1.6}/tests/sqlglider/templating/test_base.py +0 -0
- {sql_glider-0.1.4 → sql_glider-0.1.6}/tests/sqlglider/templating/test_jinja.py +0 -0
- {sql_glider-0.1.4 → sql_glider-0.1.6}/tests/sqlglider/templating/test_registry.py +0 -0
- {sql_glider-0.1.4 → sql_glider-0.1.6}/tests/sqlglider/templating/test_variables.py +0 -0
- {sql_glider-0.1.4 → sql_glider-0.1.6}/tests/sqlglider/test_cli.py +0 -0
- {sql_glider-0.1.4 → sql_glider-0.1.6}/tests/sqlglider/utils/__init__.py +0 -0
- {sql_glider-0.1.4 → sql_glider-0.1.6}/tests/sqlglider/utils/test_config.py +0 -0
- {sql_glider-0.1.4 → sql_glider-0.1.6}/tests/sqlglider/utils/test_file_utils.py +0 -0
- {sql_glider-0.1.4 → sql_glider-0.1.6}/uv.lock +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: sql-glider
|
|
3
|
-
Version: 0.1.4
|
|
3
|
+
Version: 0.1.6
|
|
4
4
|
Summary: SQL Utility Toolkit for better understanding, use, and governance of your queries in a native environment.
|
|
5
5
|
Project-URL: Homepage, https://github.com/rycowhi/sql-glider/
|
|
6
6
|
Project-URL: Repository, https://github.com/rycowhi/sql-glider/
|
|
@@ -0,0 +1,201 @@
|
|
|
1
|
+
# Plan: File-Scoped Schema Context for SQL Lineage Analyzer
|
|
2
|
+
|
|
3
|
+
**Status:** Completed
|
|
4
|
+
|
|
5
|
+
## Summary
|
|
6
|
+
|
|
7
|
+
Add file-scoped schema context to the SQL Glider lineage analyzer so that SQLGlot can correctly expand `SELECT *` and trace cross-statement references when a file contains multiple related statements.
|
|
8
|
+
|
|
9
|
+
## Problem
|
|
10
|
+
|
|
11
|
+
When analyzing this SQL:
|
|
12
|
+
```sql
|
|
13
|
+
CREATE TEMPORARY VIEW first_view AS (SELECT a, b, c FROM source_table);
|
|
14
|
+
CREATE TEMPORARY VIEW second_view AS
|
|
15
|
+
WITH first_view_cte AS (
|
|
16
|
+
SELECT *, row_number() OVER (PARTITION BY a ORDER BY b DESC) AS row_num
|
|
17
|
+
FROM first_view
|
|
18
|
+
)
|
|
19
|
+
SELECT * FROM first_view_cte WHERE c = 1;
|
|
20
|
+
```
|
|
21
|
+
|
|
22
|
+
**Previous output:** `* -> second_view.*` (useless - no column-level lineage)
|
|
23
|
+
**Expected output:** `first_view.a -> second_view.a`, `first_view.b -> second_view.b`, etc.
|
|
24
|
+
|
|
25
|
+
## Root Cause
|
|
26
|
+
|
|
27
|
+
SQLGlot's `lineage()` function accepts a `schema` parameter that provides table/view column definitions. Without this schema context, SQLGlot cannot expand `SELECT *` to actual column names.
|
|
28
|
+
|
|
29
|
+
## Solution
|
|
30
|
+
|
|
31
|
+
Build up schema context incrementally as CREATE VIEW/TABLE statements are processed, then pass that schema to subsequent `lineage()` calls.
|
|
32
|
+
|
|
33
|
+
---
|
|
34
|
+
|
|
35
|
+
## Implementation Steps
|
|
36
|
+
|
|
37
|
+
### 1. Add Schema Instance Variable
|
|
38
|
+
|
|
39
|
+
- [x] Add `self._file_schema: Dict[str, Dict[str, str]] = {}` to `LineageAnalyzer.__init__()`
|
|
40
|
+
|
|
41
|
+
### 2. Add Schema Extraction Methods
|
|
42
|
+
|
|
43
|
+
- [x] `_extract_schema_from_statement()` - Extract columns from CREATE VIEW/TABLE AS SELECT
|
|
44
|
+
- [x] `_extract_columns_from_select()` - Extract column names from SELECT projections
|
|
45
|
+
- [x] `_resolve_star_columns()` - Resolve SELECT * from file schema or CTEs
|
|
46
|
+
- [x] `_resolve_source_columns()` - Resolve columns from a single source (table, subquery)
|
|
47
|
+
- [x] `_resolve_qualified_star()` - Resolve table-qualified star (e.g., `t.*`)
|
|
48
|
+
- [x] `_extract_subquery_columns()` - Extract columns from subquery's SELECT
|
|
49
|
+
- [x] `_resolve_cte_columns()` - Resolve columns from CTE definitions
|
|
50
|
+
- [x] `_extract_cte_select_columns()` - Extract columns from CTE's SELECT
|
|
51
|
+
|
|
52
|
+
### 3. Integrate Schema Building into Analysis Loop
|
|
53
|
+
|
|
54
|
+
- [x] Reset `_file_schema = {}` at start of `analyze_queries()`
|
|
55
|
+
- [x] Call `_extract_schema_from_statement(expr)` in `finally` block AFTER analysis
|
|
56
|
+
- [x] Critical: Schema must be extracted AFTER analysis to avoid confusing SQLGlot
|
|
57
|
+
|
|
58
|
+
### 4. Pass Schema to lineage() Calls
|
|
59
|
+
|
|
60
|
+
- [x] Modify `_analyze_column_lineage_internal()` to pass schema:
|
|
61
|
+
```python
|
|
62
|
+
node = lineage(
|
|
63
|
+
lineage_col,
|
|
64
|
+
current_query_sql,
|
|
65
|
+
dialect=self.dialect,
|
|
66
|
+
schema=self._file_schema if self._file_schema else None,
|
|
67
|
+
)
|
|
68
|
+
```
|
|
69
|
+
|
|
70
|
+
### 5. Handle SELECT * in get_output_columns()
|
|
71
|
+
|
|
72
|
+
- [x] Handle `exp.Star` projections by resolving from file schema
|
|
73
|
+
- [x] Handle table-qualified stars (`t.*`) represented as `exp.Column` with `exp.Star` as `this`
|
|
74
|
+
|
|
75
|
+
---
|
|
76
|
+
|
|
77
|
+
## Edge Cases Handled
|
|
78
|
+
|
|
79
|
+
| Case | Implementation |
|
|
80
|
+
|------|----------------|
|
|
81
|
+
| `SELECT *` from unknown table | Returns empty columns, falls back to `*` behavior |
|
|
82
|
+
| Nested `SELECT *` through CTEs | Resolves CTE source from schema first |
|
|
83
|
+
| UNION in CREATE VIEW | Uses first branch's columns |
|
|
84
|
+
| Expressions without aliases | Uses SQL representation as column name |
|
|
85
|
+
| TEMPORARY VIEW | Treated same as regular VIEW |
|
|
86
|
+
| Multiple JOINs | Collects columns from all joined tables |
|
|
87
|
+
| LEFT/RIGHT/FULL OUTER JOIN | Same handling as INNER JOIN |
|
|
88
|
+
| CROSS JOIN | Same handling as INNER JOIN |
|
|
89
|
+
| Subquery in FROM clause | Extracts columns from inner SELECT |
|
|
90
|
+
| Table aliases (`v1 AS x`) | Resolves alias to actual table name |
|
|
91
|
+
| Schema-qualified names | Handles `schema.table` correctly |
|
|
92
|
+
| CTE referencing earlier CTE | Recursive CTE column resolution |
|
|
93
|
+
| `SELECT *, extra_col` | Combines * expansion with extra columns |
|
|
94
|
+
| Table-qualified `t.*` | Handles `v1.*` style syntax |
|
|
95
|
+
| LATERAL VIEW explode | Collects generated columns from `laterals` clause |
|
|
96
|
+
| LATERAL VIEW posexplode | Collects both position and element columns |
|
|
97
|
+
| Multiple LATERAL VIEWs | Collects columns from all LATERAL VIEWs |
|
|
98
|
+
| LATERAL VIEW OUTER | Same handling as regular LATERAL VIEW |
|
|
99
|
+
| LEFT SEMI JOIN | Only includes left table columns (right table excluded) |
|
|
100
|
+
| LEFT ANTI JOIN | Only includes left table columns (right table excluded) |
|
|
101
|
+
|
|
102
|
+
---
|
|
103
|
+
|
|
104
|
+
## Files Modified
|
|
105
|
+
|
|
106
|
+
| File | Changes |
|
|
107
|
+
|------|---------|
|
|
108
|
+
| `src/sqlglider/lineage/analyzer.py` | Added `_file_schema` instance variable; Added 9 schema extraction methods (including `_resolve_lateral_columns`); Modified `analyze_queries()` and `_analyze_column_lineage_internal()` and `get_output_columns()`; Added SEMI/ANTI join handling in `_resolve_star_columns()` |
|
|
109
|
+
| `tests/sqlglider/lineage/test_analyzer.py` | Added `TestFileSchemaExtraction` (9 tests), `TestCrossStatementLineage` (12 tests), `TestLateralViewColumnResolution` (5 tests), and `TestSemiAntiJoinColumnResolution` (3 tests) |
|
|
110
|
+
|
|
111
|
+
---
|
|
112
|
+
|
|
113
|
+
## Testing
|
|
114
|
+
|
|
115
|
+
### Test Classes Added
|
|
116
|
+
|
|
117
|
+
**TestFileSchemaExtraction (9 tests):**
|
|
118
|
+
- `test_extract_schema_from_create_view`
|
|
119
|
+
- `test_extract_schema_from_create_temporary_view`
|
|
120
|
+
- `test_extract_schema_from_create_table_as`
|
|
121
|
+
- `test_extract_schema_with_aliases`
|
|
122
|
+
- `test_extract_schema_select_star_from_known_table`
|
|
123
|
+
- `test_extract_schema_select_star_from_unknown_table`
|
|
124
|
+
- `test_schema_not_extracted_from_pure_select`
|
|
125
|
+
- `test_schema_not_extracted_from_insert`
|
|
126
|
+
- `test_schema_reset_between_analysis_calls`
|
|
127
|
+
|
|
128
|
+
**TestCrossStatementLineage (12 tests):**
|
|
129
|
+
- `test_view_referencing_earlier_view`
|
|
130
|
+
- `test_select_star_expansion_through_view`
|
|
131
|
+
- `test_cte_with_select_star_from_view`
|
|
132
|
+
- `test_window_function_with_select_star`
|
|
133
|
+
- `test_insert_from_view_lineage`
|
|
134
|
+
- `test_multi_hop_view_lineage`
|
|
135
|
+
- `test_original_problem_scenario`
|
|
136
|
+
- `test_select_star_from_join`
|
|
137
|
+
- `test_nested_ctes_and_views_with_select_star`
|
|
138
|
+
- `test_select_star_from_subquery`
|
|
139
|
+
- `test_table_qualified_star`
|
|
140
|
+
- `test_table_qualified_star_with_alias`
|
|
141
|
+
|
|
142
|
+
**TestLateralViewColumnResolution (5 tests):**
|
|
143
|
+
- `test_select_star_with_lateral_view_explode`
|
|
144
|
+
- `test_select_star_with_lateral_view_posexplode`
|
|
145
|
+
- `test_select_star_with_multiple_lateral_views`
|
|
146
|
+
- `test_select_star_with_lateral_view_outer`
|
|
147
|
+
- `test_lateral_view_with_join`
|
|
148
|
+
|
|
149
|
+
### Verification Commands
|
|
150
|
+
|
|
151
|
+
```bash
|
|
152
|
+
# Run all tests
|
|
153
|
+
uv run pytest --cov=sqlglider --cov-fail-under=80
|
|
154
|
+
|
|
155
|
+
# Run schema-related tests
|
|
156
|
+
uv run pytest tests/sqlglider/lineage/test_analyzer.py -k "schema or CrossStatement or LateralView or SemiAntiJoin" -v
|
|
157
|
+
|
|
158
|
+
# Test the original problem scenario
|
|
159
|
+
uv run sqlglider graph build test_view_window_cte.sql --dialect spark --output graph.json
|
|
160
|
+
```
|
|
161
|
+
|
|
162
|
+
---
|
|
163
|
+
|
|
164
|
+
## Implementation Notes
|
|
165
|
+
|
|
166
|
+
### Critical Timing Issue
|
|
167
|
+
|
|
168
|
+
Initially, schema extraction was done BEFORE analysis in the loop, which caused SQLGlot to return unqualified column names (e.g., `customer_id` instead of `orders.customer_id`).
|
|
169
|
+
|
|
170
|
+
**Fix:** Move `_extract_schema_from_statement(expr)` to the `finally` block AFTER analysis completes. This ensures:
|
|
171
|
+
1. The current statement is analyzed without its own schema (correct behavior)
|
|
172
|
+
2. The schema is then extracted for use by subsequent statements
|
|
173
|
+
|
|
174
|
+
### Table-Qualified Star Handling
|
|
175
|
+
|
|
176
|
+
Table-qualified stars (`v1.*`) are represented differently than unqualified stars (`*`):
|
|
177
|
+
- `*` is `exp.Star`
|
|
178
|
+
- `v1.*` is `exp.Column` with `this` being `exp.Star` and `table` being `v1`
|
|
179
|
+
|
|
180
|
+
Both cases needed handling in:
|
|
181
|
+
- `_extract_columns_from_select()` for schema extraction
|
|
182
|
+
- `get_output_columns()` for lineage analysis output
|
|
183
|
+
|
|
184
|
+
### Subquery Column Resolution
|
|
185
|
+
|
|
186
|
+
For `SELECT * FROM (SELECT * FROM v1) sub`, the code:
|
|
187
|
+
1. Detects the subquery in `_resolve_source_columns()`
|
|
188
|
+
2. Extracts columns from the inner SELECT via `_extract_subquery_columns()`
|
|
189
|
+
3. Recursively resolves any `SELECT *` in the inner query
|
|
190
|
+
|
|
191
|
+
---
|
|
192
|
+
|
|
193
|
+
## Lessons Learned
|
|
194
|
+
|
|
195
|
+
1. **Timing matters:** Schema context must be built AFTER analyzing a statement, not before, to avoid confusing SQLGlot's lineage tracing.
|
|
196
|
+
|
|
197
|
+
2. **AST structure varies:** Different SQL constructs have different AST representations (e.g., `*` vs `t.*`), requiring multiple code paths.
|
|
198
|
+
|
|
199
|
+
3. **Recursive resolution:** CTEs and subqueries can reference other CTEs/views, requiring recursive column resolution.
|
|
200
|
+
|
|
201
|
+
4. **Edge cases compound:** JOINs + aliases + qualified stars can all combine, requiring careful handling of each case.
|
|
@@ -28,7 +28,7 @@ version_tuple: VERSION_TUPLE
|
|
|
28
28
|
commit_id: COMMIT_ID
|
|
29
29
|
__commit_id__: COMMIT_ID
|
|
30
30
|
|
|
31
|
-
__version__ = version = '0.1.4'
|
|
32
|
-
__version_tuple__ = version_tuple = (0, 1, 4)
|
|
31
|
+
__version__ = version = '0.1.6'
|
|
32
|
+
__version_tuple__ = version_tuple = (0, 1, 6)
|
|
33
33
|
|
|
34
34
|
__commit_id__ = commit_id = None
|
|
@@ -182,20 +182,43 @@ class LineageAnalyzer:
|
|
|
182
182
|
# For aliased columns, use the alias as the column name
|
|
183
183
|
column_name = projection.alias
|
|
184
184
|
lineage_name = column_name # SQLGlot lineage uses the alias
|
|
185
|
-
|
|
186
|
-
|
|
187
|
-
|
|
188
|
-
|
|
189
|
-
|
|
185
|
+
# Qualify with target table
|
|
186
|
+
qualified_name = f"{target_table}.{column_name}"
|
|
187
|
+
columns.append(qualified_name)
|
|
188
|
+
self._column_mapping[qualified_name] = lineage_name
|
|
189
|
+
elif isinstance(projection, exp.Column):
|
|
190
|
+
# Check if this is a table-qualified star (e.g., t.*)
|
|
191
|
+
if isinstance(projection.this, exp.Star):
|
|
192
|
+
source_table = projection.table
|
|
193
|
+
qualified_star_cols: List[str] = []
|
|
194
|
+
if source_table and first_select:
|
|
195
|
+
qualified_star_cols = self._resolve_qualified_star(
|
|
196
|
+
source_table, first_select
|
|
197
|
+
)
|
|
198
|
+
for col in qualified_star_cols:
|
|
199
|
+
qualified_name = f"{target_table}.{col}"
|
|
200
|
+
columns.append(qualified_name)
|
|
201
|
+
self._column_mapping[qualified_name] = col
|
|
202
|
+
if not qualified_star_cols:
|
|
203
|
+
# Fallback: can't resolve t.*, use * as column name
|
|
204
|
+
qualified_name = f"{target_table}.*"
|
|
205
|
+
columns.append(qualified_name)
|
|
206
|
+
self._column_mapping[qualified_name] = "*"
|
|
190
207
|
else:
|
|
191
|
-
|
|
192
|
-
column_name = source_expr.sql(dialect=self.dialect)
|
|
208
|
+
column_name = projection.name
|
|
193
209
|
lineage_name = column_name
|
|
194
|
-
|
|
195
|
-
|
|
196
|
-
|
|
197
|
-
|
|
198
|
-
|
|
210
|
+
# Qualify with target table
|
|
211
|
+
qualified_name = f"{target_table}.{column_name}"
|
|
212
|
+
columns.append(qualified_name)
|
|
213
|
+
self._column_mapping[qualified_name] = lineage_name
|
|
214
|
+
else:
|
|
215
|
+
# For expressions, use the SQL representation
|
|
216
|
+
column_name = projection.sql(dialect=self.dialect)
|
|
217
|
+
lineage_name = column_name
|
|
218
|
+
# Qualify with target table
|
|
219
|
+
qualified_name = f"{target_table}.{column_name}"
|
|
220
|
+
columns.append(qualified_name)
|
|
221
|
+
self._column_mapping[qualified_name] = lineage_name
|
|
199
222
|
|
|
200
223
|
else:
|
|
201
224
|
# DQL (pure SELECT): Use the SELECT columns as output
|
|
@@ -1342,8 +1365,18 @@ class LineageAnalyzer:
|
|
|
1342
1365
|
# Use the alias name as the column name
|
|
1343
1366
|
columns.append(projection.alias)
|
|
1344
1367
|
elif isinstance(projection, exp.Column):
|
|
1345
|
-
#
|
|
1346
|
-
|
|
1368
|
+
# Check if this is a table-qualified star (e.g., t.*)
|
|
1369
|
+
if isinstance(projection.this, exp.Star):
|
|
1370
|
+
# Resolve table-qualified star from known schema
|
|
1371
|
+
table_name = projection.table
|
|
1372
|
+
if table_name and first_select:
|
|
1373
|
+
qualified_star_cols = self._resolve_qualified_star(
|
|
1374
|
+
table_name, first_select
|
|
1375
|
+
)
|
|
1376
|
+
columns.extend(qualified_star_cols)
|
|
1377
|
+
else:
|
|
1378
|
+
# Use the column name
|
|
1379
|
+
columns.append(projection.name)
|
|
1347
1380
|
elif isinstance(projection, exp.Star):
|
|
1348
1381
|
# Resolve SELECT * from known schema
|
|
1349
1382
|
if first_select:
|
|
@@ -1375,6 +1408,64 @@ class LineageAnalyzer:
|
|
|
1375
1408
|
|
|
1376
1409
|
source = from_clause.this
|
|
1377
1410
|
|
|
1411
|
+
# Handle table reference from FROM clause
|
|
1412
|
+
columns.extend(self._resolve_source_columns(source, select_node))
|
|
1413
|
+
|
|
1414
|
+
# Handle JOIN clauses - collect columns from all joined tables
|
|
1415
|
+
# EXCEPT for SEMI and ANTI joins which only return left table columns
|
|
1416
|
+
joins = select_node.args.get("joins")
|
|
1417
|
+
if joins:
|
|
1418
|
+
for join in joins:
|
|
1419
|
+
if isinstance(join, exp.Join):
|
|
1420
|
+
# SEMI and ANTI joins don't include right table columns in SELECT *
|
|
1421
|
+
join_kind = join.kind
|
|
1422
|
+
if join_kind in ("SEMI", "ANTI"):
|
|
1423
|
+
# Skip right table columns for SEMI/ANTI joins
|
|
1424
|
+
continue
|
|
1425
|
+
join_source = join.this
|
|
1426
|
+
columns.extend(
|
|
1427
|
+
self._resolve_source_columns(join_source, select_node)
|
|
1428
|
+
)
|
|
1429
|
+
|
|
1430
|
+
# Handle LATERAL VIEW clauses - collect generated columns
|
|
1431
|
+
laterals = select_node.args.get("laterals")
|
|
1432
|
+
if laterals:
|
|
1433
|
+
for lateral in laterals:
|
|
1434
|
+
if isinstance(lateral, exp.Lateral):
|
|
1435
|
+
lateral_cols = self._resolve_lateral_columns(lateral)
|
|
1436
|
+
columns.extend(lateral_cols)
|
|
1437
|
+
|
|
1438
|
+
return columns
|
|
1439
|
+
|
|
1440
|
+
def _resolve_lateral_columns(self, lateral: exp.Lateral) -> List[str]:
    """
    Return the column names introduced by a LATERAL VIEW clause.

    Args:
        lateral: The Lateral expression node

    Returns:
        The lateral's alias column names (e.g., ['elem'] for explode,
        ['pos', 'elem'] for posexplode), or an empty list if it has none
    """
    # SQLGlot exposes the generated column aliases as a property on the
    # node; a falsy result is normalised to a fresh empty list.
    generated = lateral.alias_column_names
    if not generated:
        return []
    return generated
|
|
1453
|
+
|
|
1454
|
+
def _resolve_source_columns(
|
|
1455
|
+
self, source: exp.Expression, select_node: exp.Select
|
|
1456
|
+
) -> List[str]:
|
|
1457
|
+
"""
|
|
1458
|
+
Resolve columns from a single source (table, subquery, etc.).
|
|
1459
|
+
|
|
1460
|
+
Args:
|
|
1461
|
+
source: The source expression (Table, Subquery, etc.)
|
|
1462
|
+
select_node: The containing SELECT node for CTE resolution
|
|
1463
|
+
|
|
1464
|
+
Returns:
|
|
1465
|
+
List of column names from the source
|
|
1466
|
+
"""
|
|
1467
|
+
columns: List[str] = []
|
|
1468
|
+
|
|
1378
1469
|
# Handle table reference
|
|
1379
1470
|
if isinstance(source, exp.Table):
|
|
1380
1471
|
source_name = self._get_qualified_table_name(source)
|
|
@@ -1387,11 +1478,100 @@ class LineageAnalyzer:
|
|
|
1387
1478
|
cte_columns = self._resolve_cte_columns(source_name, select_node)
|
|
1388
1479
|
columns.extend(cte_columns)
|
|
1389
1480
|
|
|
1390
|
-
# Handle subquery
|
|
1391
|
-
elif isinstance(source, exp.Subquery)
|
|
1392
|
-
#
|
|
1393
|
-
if source.alias in self._file_schema:
|
|
1481
|
+
# Handle subquery with alias
|
|
1482
|
+
elif isinstance(source, exp.Subquery):
|
|
1483
|
+
# First check if this subquery alias is in file schema
|
|
1484
|
+
if source.alias and source.alias in self._file_schema:
|
|
1394
1485
|
columns.extend(self._file_schema[source.alias].keys())
|
|
1486
|
+
else:
|
|
1487
|
+
# Extract columns from the subquery's SELECT
|
|
1488
|
+
inner_select = source.this
|
|
1489
|
+
if isinstance(inner_select, exp.Select):
|
|
1490
|
+
subquery_cols = self._extract_subquery_columns(inner_select)
|
|
1491
|
+
columns.extend(subquery_cols)
|
|
1492
|
+
|
|
1493
|
+
return columns
|
|
1494
|
+
|
|
1495
|
+
def _resolve_qualified_star(
    self, table_name: str, select_node: exp.Select
) -> List[str]:
    """
    Resolve a table-qualified star (e.g., t.*) to actual column names.

    Resolution order:
        1. ``table_name`` looked up directly in the file schema.
        2. ``table_name`` treated as a CTE name.
        3. ``table_name`` treated as an alias of a table in the FROM clause
           or in a JOIN; the aliased table's real name is then looked up in
           the file schema and, failing that, as a CTE.

    Args:
        table_name: The table/alias name qualifying the star
        select_node: The SELECT node for context

    Returns:
        List of column names from the specified table, or an empty list
        when the qualifier cannot be resolved
    """
    # 1. Direct hit in the file schema.
    if table_name in self._file_schema:
        return list(self._file_schema[table_name].keys())

    # 2. The qualifier may name a CTE.
    cte_columns = self._resolve_cte_columns(table_name, select_node)
    if cte_columns:
        return cte_columns

    # 3. The qualifier may be an alias. Gather every source that can carry
    #    one: the FROM source first, then each JOIN source in order.
    candidates = []
    from_clause = select_node.args.get("from")
    if from_clause and isinstance(from_clause, exp.From):
        candidates.append(from_clause.this)
    for join in select_node.args.get("joins") or []:
        if isinstance(join, exp.Join):
            candidates.append(join.this)

    for source in candidates:
        if not (isinstance(source, exp.Table) and source.alias == table_name):
            continue
        actual_name = self._get_qualified_table_name(source)
        if actual_name in self._file_schema:
            return list(self._file_schema[actual_name].keys())
        # The alias may point at a CTE rather than a view/table recorded in
        # the file schema (e.g. ``WITH c AS (...) SELECT t.* FROM c AS t``).
        aliased_cte_columns = self._resolve_cte_columns(actual_name, select_node)
        if aliased_cte_columns:
            return aliased_cte_columns

    return []
|
|
1541
|
+
|
|
1542
|
+
def _extract_subquery_columns(self, subquery_select: exp.Select) -> List[str]:
    """
    Collect the output column names of a subquery's SELECT.

    Args:
        subquery_select: The SELECT expression within the subquery

    Returns:
        List of column names, in projection order
    """
    names: List[str] = []

    for item in subquery_select.expressions:
        # Aliased projection: the alias is the output name.
        if isinstance(item, exp.Alias):
            names.append(item.alias)
            continue

        if isinstance(item, exp.Column):
            if not isinstance(item.this, exp.Star):
                # Plain column reference.
                names.append(item.name)
            else:
                # Table-qualified star (t.*): expand via the qualifier;
                # a star without a qualifier contributes nothing here.
                qualifier = item.table
                if qualifier:
                    names.extend(
                        self._resolve_qualified_star(qualifier, subquery_select)
                    )
            continue

        # Bare SELECT * inside the subquery.
        if isinstance(item, exp.Star):
            names.extend(self._resolve_star_columns(subquery_select))
            continue

        # Any other expression: fall back to its SQL text as the name.
        names.append(item.sql(dialect=self.dialect))

    return names
|
|
1397
1577
|
|
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
CREATE TEMPORARY VIEW first_view AS
|
|
2
|
+
SELECT
|
|
3
|
+
v1,
|
|
4
|
+
v2,
|
|
5
|
+
v3
|
|
6
|
+
FROM source_db.source_table;
|
|
7
|
+
CREATE TEMPORARY VIEW second_view AS WITH cte AS (
|
|
8
|
+
SELECT
|
|
9
|
+
*,
|
|
10
|
+
row_number() OVER (ORDER BY v1) AS row_num
|
|
11
|
+
FROM first_view
|
|
12
|
+
)
|
|
13
|
+
|
|
14
|
+
SELECT * FROM cte
|
|
15
|
+
WHERE row_num = 1;
|
|
16
|
+
|
|
17
|
+
INSERT INTO target_db.target_table
|
|
18
|
+
SELECT
|
|
19
|
+
v1,
|
|
20
|
+
v2,
|
|
21
|
+
v3
|
|
22
|
+
FROM second_view;
|