sql-glider 0.1.3__tar.gz → 0.1.5__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {sql_glider-0.1.3 → sql_glider-0.1.5}/PKG-INFO +1 -1
- sql_glider-0.1.5/plans/2026-01-26-file-scoped-schema-context.md +199 -0
- {sql_glider-0.1.3 → sql_glider-0.1.5}/src/sqlglider/_version.py +2 -2
- {sql_glider-0.1.3 → sql_glider-0.1.5}/src/sqlglider/lineage/analyzer.py +402 -14
- sql_glider-0.1.5/tests/fixtures/original_queries/test_view_window_cte.sql +27 -0
- {sql_glider-0.1.3 → sql_glider-0.1.5}/tests/sqlglider/graph/test_builder.py +150 -0
- {sql_glider-0.1.3 → sql_glider-0.1.5}/tests/sqlglider/lineage/test_analyzer.py +504 -0
- {sql_glider-0.1.3 → sql_glider-0.1.5}/.github/workflows/ci.yml +0 -0
- {sql_glider-0.1.3 → sql_glider-0.1.5}/.github/workflows/publish.yml +0 -0
- {sql_glider-0.1.3 → sql_glider-0.1.5}/.gitignore +0 -0
- {sql_glider-0.1.3 → sql_glider-0.1.5}/.python-version +0 -0
- {sql_glider-0.1.3 → sql_glider-0.1.5}/ARCHITECTURE.md +0 -0
- {sql_glider-0.1.3 → sql_glider-0.1.5}/CLAUDE.md +0 -0
- {sql_glider-0.1.3 → sql_glider-0.1.5}/LICENSE +0 -0
- {sql_glider-0.1.3 → sql_glider-0.1.5}/README.md +0 -0
- {sql_glider-0.1.3 → sql_glider-0.1.5}/plans/2025-12-05-column-level-lineage.md +0 -0
- {sql_glider-0.1.3 → sql_glider-0.1.5}/plans/2025-12-05-reverse-lineage.md +0 -0
- {sql_glider-0.1.3 → sql_glider-0.1.5}/plans/2025-12-06-config-file-support.md +0 -0
- {sql_glider-0.1.3 → sql_glider-0.1.5}/plans/2025-12-06-graph-lineage.md +0 -0
- {sql_glider-0.1.3 → sql_glider-0.1.5}/plans/2025-12-06-unify-single-multi-query.md +0 -0
- {sql_glider-0.1.3 → sql_glider-0.1.5}/plans/2025-12-07-sample-data-model.md +0 -0
- {sql_glider-0.1.3 → sql_glider-0.1.5}/plans/2025-12-07-sql-templating.md +0 -0
- {sql_glider-0.1.3 → sql_glider-0.1.5}/plans/2025-12-08-tables-command.md +0 -0
- {sql_glider-0.1.3 → sql_glider-0.1.5}/plans/2025-12-09-graph-query-paths.md +0 -0
- {sql_glider-0.1.3 → sql_glider-0.1.5}/plans/2025-12-13-dissect-command.md +0 -0
- {sql_glider-0.1.3 → sql_glider-0.1.5}/plans/2025-12-14-tables-pull-command.md +0 -0
- {sql_glider-0.1.3 → sql_glider-0.1.5}/plans/2026-01-25-fix-union-lineage-chain.md +0 -0
- {sql_glider-0.1.3 → sql_glider-0.1.5}/pyproject.toml +0 -0
- {sql_glider-0.1.3 → sql_glider-0.1.5}/sample_data_model/README.md +0 -0
- {sql_glider-0.1.3 → sql_glider-0.1.5}/sample_data_model/business/expire_dim_customer.sql +0 -0
- {sql_glider-0.1.3 → sql_glider-0.1.5}/sample_data_model/business/load_fact_orders.sql +0 -0
- {sql_glider-0.1.3 → sql_glider-0.1.5}/sample_data_model/business/load_fact_payments.sql +0 -0
- {sql_glider-0.1.3 → sql_glider-0.1.5}/sample_data_model/business/merge_dim_customer.sql +0 -0
- {sql_glider-0.1.3 → sql_glider-0.1.5}/sample_data_model/business/merge_dim_product.sql +0 -0
- {sql_glider-0.1.3 → sql_glider-0.1.5}/sample_data_model/business/update_dim_customer_metrics.sql +0 -0
- {sql_glider-0.1.3 → sql_glider-0.1.5}/sample_data_model/complex/conditional_merge.sql +0 -0
- {sql_glider-0.1.3 → sql_glider-0.1.5}/sample_data_model/complex/cte_insert.sql +0 -0
- {sql_glider-0.1.3 → sql_glider-0.1.5}/sample_data_model/complex/multi_table_transform.sql +0 -0
- {sql_glider-0.1.3 → sql_glider-0.1.5}/sample_data_model/ddl/dim_customer.sql +0 -0
- {sql_glider-0.1.3 → sql_glider-0.1.5}/sample_data_model/ddl/dim_product.sql +0 -0
- {sql_glider-0.1.3 → sql_glider-0.1.5}/sample_data_model/ddl/fact_orders.sql +0 -0
- {sql_glider-0.1.3 → sql_glider-0.1.5}/sample_data_model/ddl/fact_payments.sql +0 -0
- {sql_glider-0.1.3 → sql_glider-0.1.5}/sample_data_model/ddl/raw_addresses.sql +0 -0
- {sql_glider-0.1.3 → sql_glider-0.1.5}/sample_data_model/ddl/raw_customers.sql +0 -0
- {sql_glider-0.1.3 → sql_glider-0.1.5}/sample_data_model/ddl/raw_order_items.sql +0 -0
- {sql_glider-0.1.3 → sql_glider-0.1.5}/sample_data_model/ddl/raw_orders.sql +0 -0
- {sql_glider-0.1.3 → sql_glider-0.1.5}/sample_data_model/ddl/raw_payments.sql +0 -0
- {sql_glider-0.1.3 → sql_glider-0.1.5}/sample_data_model/ddl/raw_products.sql +0 -0
- {sql_glider-0.1.3 → sql_glider-0.1.5}/sample_data_model/ddl/stg_customers.sql +0 -0
- {sql_glider-0.1.3 → sql_glider-0.1.5}/sample_data_model/ddl/stg_orders.sql +0 -0
- {sql_glider-0.1.3 → sql_glider-0.1.5}/sample_data_model/ddl/stg_payments.sql +0 -0
- {sql_glider-0.1.3 → sql_glider-0.1.5}/sample_data_model/ddl/stg_products.sql +0 -0
- {sql_glider-0.1.3 → sql_glider-0.1.5}/sample_data_model/incremental/incr_fact_orders.sql +0 -0
- {sql_glider-0.1.3 → sql_glider-0.1.5}/sample_data_model/incremental/incr_fact_payments.sql +0 -0
- {sql_glider-0.1.3 → sql_glider-0.1.5}/sample_data_model/incremental/incr_pres_sales_summary.sql +0 -0
- {sql_glider-0.1.3 → sql_glider-0.1.5}/sample_data_model/maintenance/delete_expired_customers.sql +0 -0
- {sql_glider-0.1.3 → sql_glider-0.1.5}/sample_data_model/maintenance/update_product_status.sql +0 -0
- {sql_glider-0.1.3 → sql_glider-0.1.5}/sample_data_model/presentation/load_pres_customer_360.sql +0 -0
- {sql_glider-0.1.3 → sql_glider-0.1.5}/sample_data_model/presentation/load_pres_customer_cohort.sql +0 -0
- {sql_glider-0.1.3 → sql_glider-0.1.5}/sample_data_model/presentation/load_pres_product_performance.sql +0 -0
- {sql_glider-0.1.3 → sql_glider-0.1.5}/sample_data_model/presentation/load_pres_sales_summary.sql +0 -0
- {sql_glider-0.1.3 → sql_glider-0.1.5}/sample_data_model/staging/load_stg_customers.sql +0 -0
- {sql_glider-0.1.3 → sql_glider-0.1.5}/sample_data_model/staging/load_stg_orders.sql +0 -0
- {sql_glider-0.1.3 → sql_glider-0.1.5}/sample_data_model/staging/load_stg_payments.sql +0 -0
- {sql_glider-0.1.3 → sql_glider-0.1.5}/sample_data_model/staging/load_stg_products.sql +0 -0
- {sql_glider-0.1.3 → sql_glider-0.1.5}/sqlglider.toml.example +0 -0
- {sql_glider-0.1.3 → sql_glider-0.1.5}/src/sqlglider/__init__.py +0 -0
- {sql_glider-0.1.3 → sql_glider-0.1.5}/src/sqlglider/catalog/__init__.py +0 -0
- {sql_glider-0.1.3 → sql_glider-0.1.5}/src/sqlglider/catalog/base.py +0 -0
- {sql_glider-0.1.3 → sql_glider-0.1.5}/src/sqlglider/catalog/databricks.py +0 -0
- {sql_glider-0.1.3 → sql_glider-0.1.5}/src/sqlglider/catalog/registry.py +0 -0
- {sql_glider-0.1.3 → sql_glider-0.1.5}/src/sqlglider/cli.py +0 -0
- {sql_glider-0.1.3 → sql_glider-0.1.5}/src/sqlglider/dissection/__init__.py +0 -0
- {sql_glider-0.1.3 → sql_glider-0.1.5}/src/sqlglider/dissection/analyzer.py +0 -0
- {sql_glider-0.1.3 → sql_glider-0.1.5}/src/sqlglider/dissection/formatters.py +0 -0
- {sql_glider-0.1.3 → sql_glider-0.1.5}/src/sqlglider/dissection/models.py +0 -0
- {sql_glider-0.1.3 → sql_glider-0.1.5}/src/sqlglider/global_models.py +0 -0
- {sql_glider-0.1.3 → sql_glider-0.1.5}/src/sqlglider/graph/__init__.py +0 -0
- {sql_glider-0.1.3 → sql_glider-0.1.5}/src/sqlglider/graph/builder.py +0 -0
- {sql_glider-0.1.3 → sql_glider-0.1.5}/src/sqlglider/graph/merge.py +0 -0
- {sql_glider-0.1.3 → sql_glider-0.1.5}/src/sqlglider/graph/models.py +0 -0
- {sql_glider-0.1.3 → sql_glider-0.1.5}/src/sqlglider/graph/query.py +0 -0
- {sql_glider-0.1.3 → sql_glider-0.1.5}/src/sqlglider/graph/serialization.py +0 -0
- {sql_glider-0.1.3 → sql_glider-0.1.5}/src/sqlglider/lineage/__init__.py +0 -0
- {sql_glider-0.1.3 → sql_glider-0.1.5}/src/sqlglider/lineage/formatters.py +0 -0
- {sql_glider-0.1.3 → sql_glider-0.1.5}/src/sqlglider/templating/__init__.py +0 -0
- {sql_glider-0.1.3 → sql_glider-0.1.5}/src/sqlglider/templating/base.py +0 -0
- {sql_glider-0.1.3 → sql_glider-0.1.5}/src/sqlglider/templating/jinja.py +0 -0
- {sql_glider-0.1.3 → sql_glider-0.1.5}/src/sqlglider/templating/registry.py +0 -0
- {sql_glider-0.1.3 → sql_glider-0.1.5}/src/sqlglider/templating/variables.py +0 -0
- {sql_glider-0.1.3 → sql_glider-0.1.5}/src/sqlglider/utils/__init__.py +0 -0
- {sql_glider-0.1.3 → sql_glider-0.1.5}/src/sqlglider/utils/config.py +0 -0
- {sql_glider-0.1.3 → sql_glider-0.1.5}/src/sqlglider/utils/file_utils.py +0 -0
- {sql_glider-0.1.3 → sql_glider-0.1.5}/tests/__init__.py +0 -0
- {sql_glider-0.1.3 → sql_glider-0.1.5}/tests/fixtures/multi_file_queries/analytics_pipeline.sql +0 -0
- {sql_glider-0.1.3 → sql_glider-0.1.5}/tests/fixtures/multi_file_queries/analytics_pipeline_union_merge.sql +0 -0
- {sql_glider-0.1.3 → sql_glider-0.1.5}/tests/fixtures/multi_file_queries/customers.sql +0 -0
- {sql_glider-0.1.3 → sql_glider-0.1.5}/tests/fixtures/multi_file_queries/orders.sql +0 -0
- {sql_glider-0.1.3 → sql_glider-0.1.5}/tests/fixtures/multi_file_queries/reports.sql +0 -0
- {sql_glider-0.1.3 → sql_glider-0.1.5}/tests/fixtures/multi_file_queries/view_based_merge.sql +0 -0
- {sql_glider-0.1.3 → sql_glider-0.1.5}/tests/fixtures/original_queries/test_cte.sql +0 -0
- {sql_glider-0.1.3 → sql_glider-0.1.5}/tests/fixtures/original_queries/test_cte_query.sql +0 -0
- {sql_glider-0.1.3 → sql_glider-0.1.5}/tests/fixtures/original_queries/test_generated_column_query.sql +0 -0
- {sql_glider-0.1.3 → sql_glider-0.1.5}/tests/fixtures/original_queries/test_multi.sql +0 -0
- {sql_glider-0.1.3 → sql_glider-0.1.5}/tests/fixtures/original_queries/test_multi_query.sql +0 -0
- {sql_glider-0.1.3 → sql_glider-0.1.5}/tests/fixtures/original_queries/test_single_query.sql +0 -0
- {sql_glider-0.1.3 → sql_glider-0.1.5}/tests/fixtures/original_queries/test_subquery.sql +0 -0
- {sql_glider-0.1.3 → sql_glider-0.1.5}/tests/fixtures/original_queries/test_tables.sql +0 -0
- {sql_glider-0.1.3 → sql_glider-0.1.5}/tests/fixtures/original_queries/test_view.sql +0 -0
- {sql_glider-0.1.3 → sql_glider-0.1.5}/tests/fixtures/sample_manifest.csv +0 -0
- {sql_glider-0.1.3 → sql_glider-0.1.5}/tests/sqlglider/__init__.py +0 -0
- {sql_glider-0.1.3 → sql_glider-0.1.5}/tests/sqlglider/catalog/__init__.py +0 -0
- {sql_glider-0.1.3 → sql_glider-0.1.5}/tests/sqlglider/catalog/test_base.py +0 -0
- {sql_glider-0.1.3 → sql_glider-0.1.5}/tests/sqlglider/catalog/test_databricks.py +0 -0
- {sql_glider-0.1.3 → sql_glider-0.1.5}/tests/sqlglider/catalog/test_registry.py +0 -0
- {sql_glider-0.1.3 → sql_glider-0.1.5}/tests/sqlglider/dissection/__init__.py +0 -0
- {sql_glider-0.1.3 → sql_glider-0.1.5}/tests/sqlglider/dissection/test_analyzer.py +0 -0
- {sql_glider-0.1.3 → sql_glider-0.1.5}/tests/sqlglider/dissection/test_formatters.py +0 -0
- {sql_glider-0.1.3 → sql_glider-0.1.5}/tests/sqlglider/dissection/test_models.py +0 -0
- {sql_glider-0.1.3 → sql_glider-0.1.5}/tests/sqlglider/graph/__init__.py +0 -0
- {sql_glider-0.1.3 → sql_glider-0.1.5}/tests/sqlglider/graph/test_merge.py +0 -0
- {sql_glider-0.1.3 → sql_glider-0.1.5}/tests/sqlglider/graph/test_models.py +0 -0
- {sql_glider-0.1.3 → sql_glider-0.1.5}/tests/sqlglider/graph/test_query.py +0 -0
- {sql_glider-0.1.3 → sql_glider-0.1.5}/tests/sqlglider/graph/test_serialization.py +0 -0
- {sql_glider-0.1.3 → sql_glider-0.1.5}/tests/sqlglider/lineage/__init__.py +0 -0
- {sql_glider-0.1.3 → sql_glider-0.1.5}/tests/sqlglider/lineage/test_formatters.py +0 -0
- {sql_glider-0.1.3 → sql_glider-0.1.5}/tests/sqlglider/templating/__init__.py +0 -0
- {sql_glider-0.1.3 → sql_glider-0.1.5}/tests/sqlglider/templating/test_base.py +0 -0
- {sql_glider-0.1.3 → sql_glider-0.1.5}/tests/sqlglider/templating/test_jinja.py +0 -0
- {sql_glider-0.1.3 → sql_glider-0.1.5}/tests/sqlglider/templating/test_registry.py +0 -0
- {sql_glider-0.1.3 → sql_glider-0.1.5}/tests/sqlglider/templating/test_variables.py +0 -0
- {sql_glider-0.1.3 → sql_glider-0.1.5}/tests/sqlglider/test_cli.py +0 -0
- {sql_glider-0.1.3 → sql_glider-0.1.5}/tests/sqlglider/utils/__init__.py +0 -0
- {sql_glider-0.1.3 → sql_glider-0.1.5}/tests/sqlglider/utils/test_config.py +0 -0
- {sql_glider-0.1.3 → sql_glider-0.1.5}/tests/sqlglider/utils/test_file_utils.py +0 -0
- {sql_glider-0.1.3 → sql_glider-0.1.5}/uv.lock +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: sql-glider
|
|
3
|
-
Version: 0.1.
|
|
3
|
+
Version: 0.1.5
|
|
4
4
|
Summary: SQL Utility Toolkit for better understanding, use, and governance of your queries in a native environment.
|
|
5
5
|
Project-URL: Homepage, https://github.com/rycowhi/sql-glider/
|
|
6
6
|
Project-URL: Repository, https://github.com/rycowhi/sql-glider/
|
|
@@ -0,0 +1,199 @@
|
|
|
1
|
+
# Plan: File-Scoped Schema Context for SQL Lineage Analyzer
|
|
2
|
+
|
|
3
|
+
**Status:** Completed
|
|
4
|
+
|
|
5
|
+
## Summary
|
|
6
|
+
|
|
7
|
+
Add file-scoped schema context to the SQL Glider lineage analyzer so that SQLGlot can correctly expand `SELECT *` and trace cross-statement references when a file contains multiple related statements.
|
|
8
|
+
|
|
9
|
+
## Problem
|
|
10
|
+
|
|
11
|
+
When analyzing this SQL:
|
|
12
|
+
```sql
|
|
13
|
+
CREATE TEMPORARY VIEW first_view AS (SELECT a, b, c FROM source_table);
|
|
14
|
+
CREATE TEMPORARY VIEW second_view AS
|
|
15
|
+
WITH first_view_cte AS (
|
|
16
|
+
SELECT *, row_number() OVER (PARTITION BY a ORDER BY b DESC) AS row_num
|
|
17
|
+
FROM first_view
|
|
18
|
+
)
|
|
19
|
+
SELECT * FROM first_view_cte WHERE c = 1;
|
|
20
|
+
```
|
|
21
|
+
|
|
22
|
+
**Previous output:** `* -> second_view.*` (useless - no column-level lineage)
|
|
23
|
+
**Expected output:** `first_view.a -> second_view.a`, `first_view.b -> second_view.b`, etc.
|
|
24
|
+
|
|
25
|
+
## Root Cause
|
|
26
|
+
|
|
27
|
+
SQLGlot's `lineage()` function accepts a `schema` parameter that provides table/view column definitions. Without this schema context, SQLGlot cannot expand `SELECT *` to actual column names.
|
|
28
|
+
|
|
29
|
+
## Solution
|
|
30
|
+
|
|
31
|
+
Build up schema context incrementally as CREATE VIEW/TABLE statements are processed, then pass that schema to subsequent `lineage()` calls.
|
|
32
|
+
|
|
33
|
+
---
|
|
34
|
+
|
|
35
|
+
## Implementation Steps
|
|
36
|
+
|
|
37
|
+
### 1. Add Schema Instance Variable
|
|
38
|
+
|
|
39
|
+
- [x] Add `_file_schema: Dict[str, Dict[str, str]] = {}` to `LineageAnalyzer.__init__()`
|
|
40
|
+
|
|
41
|
+
### 2. Add Schema Extraction Methods
|
|
42
|
+
|
|
43
|
+
- [x] `_extract_schema_from_statement()` - Extract columns from CREATE VIEW/TABLE AS SELECT
|
|
44
|
+
- [x] `_extract_columns_from_select()` - Extract column names from SELECT projections
|
|
45
|
+
- [x] `_resolve_star_columns()` - Resolve SELECT * from file schema or CTEs
|
|
46
|
+
- [x] `_resolve_source_columns()` - Resolve columns from a single source (table, subquery)
|
|
47
|
+
- [x] `_resolve_qualified_star()` - Resolve table-qualified star (e.g., `t.*`)
|
|
48
|
+
- [x] `_extract_subquery_columns()` - Extract columns from subquery's SELECT
|
|
49
|
+
- [x] `_resolve_cte_columns()` - Resolve columns from CTE definitions
|
|
50
|
+
- [x] `_extract_cte_select_columns()` - Extract columns from CTE's SELECT
|
|
51
|
+
|
|
52
|
+
### 3. Integrate Schema Building into Analysis Loop
|
|
53
|
+
|
|
54
|
+
- [x] Reset `_file_schema = {}` at start of `analyze_queries()`
|
|
55
|
+
- [x] Call `_extract_schema_from_statement(expr)` in `finally` block AFTER analysis
|
|
56
|
+
- [x] Critical: Schema must be extracted AFTER analysis to avoid confusing SQLGlot
|
|
57
|
+
|
|
58
|
+
### 4. Pass Schema to lineage() Calls
|
|
59
|
+
|
|
60
|
+
- [x] Modify `_analyze_column_lineage_internal()` to pass schema:
|
|
61
|
+
```python
|
|
62
|
+
node = lineage(
|
|
63
|
+
lineage_col,
|
|
64
|
+
current_query_sql,
|
|
65
|
+
dialect=self.dialect,
|
|
66
|
+
schema=self._file_schema if self._file_schema else None,
|
|
67
|
+
)
|
|
68
|
+
```
|
|
69
|
+
|
|
70
|
+
### 5. Handle SELECT * in get_output_columns()
|
|
71
|
+
|
|
72
|
+
- [x] Handle `exp.Star` projections by resolving from file schema
|
|
73
|
+
- [x] Handle table-qualified stars (`t.*`) represented as `exp.Column` with `exp.Star` as `this`
|
|
74
|
+
|
|
75
|
+
---
|
|
76
|
+
|
|
77
|
+
## Edge Cases Handled
|
|
78
|
+
|
|
79
|
+
| Case | Implementation |
|
|
80
|
+
|------|----------------|
|
|
81
|
+
| `SELECT *` from unknown table | Returns empty columns, falls back to `*` behavior |
|
|
82
|
+
| Nested `SELECT *` through CTEs | Resolves CTE source from schema first |
|
|
83
|
+
| UNION in CREATE VIEW | Uses first branch's columns |
|
|
84
|
+
| Expressions without aliases | Uses SQL representation as column name |
|
|
85
|
+
| TEMPORARY VIEW | Treated same as regular VIEW |
|
|
86
|
+
| Multiple JOINs | Collects columns from all joined tables |
|
|
87
|
+
| LEFT/RIGHT/FULL OUTER JOIN | Same handling as INNER JOIN |
|
|
88
|
+
| CROSS JOIN | Same handling as INNER JOIN |
|
|
89
|
+
| Subquery in FROM clause | Extracts columns from inner SELECT |
|
|
90
|
+
| Table aliases (`v1 AS x`) | Resolves alias to actual table name |
|
|
91
|
+
| Schema-qualified names | Handles `schema.table` correctly |
|
|
92
|
+
| CTE referencing earlier CTE | Recursive CTE column resolution |
|
|
93
|
+
| `SELECT *, extra_col` | Combines * expansion with extra columns |
|
|
94
|
+
| Table-qualified `t.*` | Handles `v1.*` style syntax |
|
|
95
|
+
| LATERAL VIEW explode | Collects generated columns from `laterals` clause |
|
|
96
|
+
| LATERAL VIEW posexplode | Collects both position and element columns |
|
|
97
|
+
| Multiple LATERAL VIEWs | Collects columns from all LATERAL VIEWs |
|
|
98
|
+
| LATERAL VIEW OUTER | Same handling as regular LATERAL VIEW |
|
|
99
|
+
|
|
100
|
+
---
|
|
101
|
+
|
|
102
|
+
## Files Modified
|
|
103
|
+
|
|
104
|
+
| File | Changes |
|
|
105
|
+
|------|---------|
|
|
106
|
+
| `src/sqlglider/lineage/analyzer.py` | Added `_file_schema` instance variable; Added 9 schema extraction methods (including `_resolve_lateral_columns`); Modified `analyze_queries()` and `_analyze_column_lineage_internal()` and `get_output_columns()` |
|
|
107
|
+
| `tests/sqlglider/lineage/test_analyzer.py` | Added `TestFileSchemaExtraction` (9 tests), `TestCrossStatementLineage` (12 tests), and `TestLateralViewColumnResolution` (5 tests) |
|
|
108
|
+
|
|
109
|
+
---
|
|
110
|
+
|
|
111
|
+
## Testing
|
|
112
|
+
|
|
113
|
+
### Test Classes Added
|
|
114
|
+
|
|
115
|
+
**TestFileSchemaExtraction (9 tests):**
|
|
116
|
+
- `test_extract_schema_from_create_view`
|
|
117
|
+
- `test_extract_schema_from_create_temporary_view`
|
|
118
|
+
- `test_extract_schema_from_create_table_as`
|
|
119
|
+
- `test_extract_schema_with_aliases`
|
|
120
|
+
- `test_extract_schema_select_star_from_known_table`
|
|
121
|
+
- `test_extract_schema_select_star_from_unknown_table`
|
|
122
|
+
- `test_schema_not_extracted_from_pure_select`
|
|
123
|
+
- `test_schema_not_extracted_from_insert`
|
|
124
|
+
- `test_schema_reset_between_analysis_calls`
|
|
125
|
+
|
|
126
|
+
**TestCrossStatementLineage (12 tests):**
|
|
127
|
+
- `test_view_referencing_earlier_view`
|
|
128
|
+
- `test_select_star_expansion_through_view`
|
|
129
|
+
- `test_cte_with_select_star_from_view`
|
|
130
|
+
- `test_window_function_with_select_star`
|
|
131
|
+
- `test_insert_from_view_lineage`
|
|
132
|
+
- `test_multi_hop_view_lineage`
|
|
133
|
+
- `test_original_problem_scenario`
|
|
134
|
+
- `test_select_star_from_join`
|
|
135
|
+
- `test_nested_ctes_and_views_with_select_star`
|
|
136
|
+
- `test_select_star_from_subquery`
|
|
137
|
+
- `test_table_qualified_star`
|
|
138
|
+
- `test_table_qualified_star_with_alias`
|
|
139
|
+
|
|
140
|
+
**TestLateralViewColumnResolution (5 tests):**
|
|
141
|
+
- `test_select_star_with_lateral_view_explode`
|
|
142
|
+
- `test_select_star_with_lateral_view_posexplode`
|
|
143
|
+
- `test_select_star_with_multiple_lateral_views`
|
|
144
|
+
- `test_select_star_with_lateral_view_outer`
|
|
145
|
+
- `test_lateral_view_with_join`
|
|
146
|
+
|
|
147
|
+
### Verification Commands
|
|
148
|
+
|
|
149
|
+
```bash
|
|
150
|
+
# Run all tests
|
|
151
|
+
uv run pytest --cov=sqlglider --cov-fail-under=80
|
|
152
|
+
|
|
153
|
+
# Run schema-related tests
|
|
154
|
+
uv run pytest tests/sqlglider/lineage/test_analyzer.py -k "schema or CrossStatement" -v
|
|
155
|
+
|
|
156
|
+
# Test the original problem scenario
|
|
157
|
+
uv run sqlglider graph build test_view_window_cte.sql --dialect spark --output graph.json
|
|
158
|
+
```
|
|
159
|
+
|
|
160
|
+
---
|
|
161
|
+
|
|
162
|
+
## Implementation Notes
|
|
163
|
+
|
|
164
|
+
### Critical Timing Issue
|
|
165
|
+
|
|
166
|
+
Initially, schema extraction was done BEFORE analysis in the loop, which caused SQLGlot to return unqualified column names (e.g., `customer_id` instead of `orders.customer_id`).
|
|
167
|
+
|
|
168
|
+
**Fix:** Move `_extract_schema_from_statement(expr)` to the `finally` block AFTER analysis completes. This ensures:
|
|
169
|
+
1. The current statement is analyzed without its own schema (correct behavior)
|
|
170
|
+
2. The schema is then extracted for use by subsequent statements
|
|
171
|
+
|
|
172
|
+
### Table-Qualified Star Handling
|
|
173
|
+
|
|
174
|
+
Table-qualified stars (`v1.*`) are represented differently than unqualified stars (`*`):
|
|
175
|
+
- `*` is `exp.Star`
|
|
176
|
+
- `v1.*` is `exp.Column` with `this` being `exp.Star` and `table` being `v1`
|
|
177
|
+
|
|
178
|
+
Both cases needed handling in:
|
|
179
|
+
- `_extract_columns_from_select()` for schema extraction
|
|
180
|
+
- `get_output_columns()` for lineage analysis output
|
|
181
|
+
|
|
182
|
+
### Subquery Column Resolution
|
|
183
|
+
|
|
184
|
+
For `SELECT * FROM (SELECT * FROM v1) sub`, the code:
|
|
185
|
+
1. Detects the subquery in `_resolve_source_columns()`
|
|
186
|
+
2. Extracts columns from the inner SELECT via `_extract_subquery_columns()`
|
|
187
|
+
3. Recursively resolves any `SELECT *` in the inner query
|
|
188
|
+
|
|
189
|
+
---
|
|
190
|
+
|
|
191
|
+
## Lessons Learned
|
|
192
|
+
|
|
193
|
+
1. **Timing matters:** Schema context must be built AFTER analyzing a statement, not before, to avoid confusing SQLGlot's lineage tracing.
|
|
194
|
+
|
|
195
|
+
2. **AST structure varies:** Different SQL constructs have different AST representations (e.g., `*` vs `t.*`), requiring multiple code paths.
|
|
196
|
+
|
|
197
|
+
3. **Recursive resolution:** CTEs and subqueries can reference other CTEs/views, requiring recursive column resolution.
|
|
198
|
+
|
|
199
|
+
4. **Edge cases compound:** JOINs + aliases + qualified stars can all combine, requiring careful handling of each case.
|
|
@@ -28,7 +28,7 @@ version_tuple: VERSION_TUPLE
|
|
|
28
28
|
commit_id: COMMIT_ID
|
|
29
29
|
__commit_id__: COMMIT_ID
|
|
30
30
|
|
|
31
|
-
__version__ = version = '0.1.
|
|
32
|
-
__version_tuple__ = version_tuple = (0, 1,
|
|
31
|
+
__version__ = version = '0.1.5'
|
|
32
|
+
__version_tuple__ = version_tuple = (0, 1, 5)
|
|
33
33
|
|
|
34
34
|
__commit_id__ = commit_id = None
|
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
"""Core lineage analysis using SQLGlot."""
|
|
2
2
|
|
|
3
3
|
from enum import Enum
|
|
4
|
-
from typing import Callable, Iterator, List, Optional, Set, Tuple, Union
|
|
4
|
+
from typing import Callable, Dict, Iterator, List, Optional, Set, Tuple, Union
|
|
5
5
|
|
|
6
6
|
from pydantic import BaseModel, Field
|
|
7
7
|
from sqlglot import exp, parse
|
|
@@ -99,6 +99,9 @@ class LineageAnalyzer:
|
|
|
99
99
|
self.sql = sql
|
|
100
100
|
self.dialect = dialect
|
|
101
101
|
self._skipped_queries: List[SkippedQuery] = []
|
|
102
|
+
# File-scoped schema context for cross-statement lineage
|
|
103
|
+
# Maps table/view names to their column definitions
|
|
104
|
+
self._file_schema: Dict[str, Dict[str, str]] = {}
|
|
102
105
|
|
|
103
106
|
try:
|
|
104
107
|
# Parse all statements in the SQL string
|
|
@@ -156,26 +159,66 @@ class LineageAnalyzer:
|
|
|
156
159
|
# DML/DDL: Use target table for output column qualification
|
|
157
160
|
# The columns are from the SELECT, but qualified with the target table
|
|
158
161
|
projections = self._get_select_projections(select_node)
|
|
162
|
+
first_select = self._get_first_select(select_node)
|
|
163
|
+
|
|
159
164
|
for projection in projections:
|
|
165
|
+
# Handle SELECT * by resolving from file schema
|
|
166
|
+
if isinstance(projection, exp.Star):
|
|
167
|
+
if first_select:
|
|
168
|
+
star_columns = self._resolve_star_columns(first_select)
|
|
169
|
+
for star_col in star_columns:
|
|
170
|
+
qualified_name = f"{target_table}.{star_col}"
|
|
171
|
+
columns.append(qualified_name)
|
|
172
|
+
self._column_mapping[qualified_name] = star_col
|
|
173
|
+
if not columns:
|
|
174
|
+
# Fallback: can't resolve *, use * as column name
|
|
175
|
+
qualified_name = f"{target_table}.*"
|
|
176
|
+
columns.append(qualified_name)
|
|
177
|
+
self._column_mapping[qualified_name] = "*"
|
|
178
|
+
continue
|
|
179
|
+
|
|
160
180
|
# Get the underlying expression (unwrap alias if present)
|
|
161
181
|
if isinstance(projection, exp.Alias):
|
|
162
182
|
# For aliased columns, use the alias as the column name
|
|
163
183
|
column_name = projection.alias
|
|
164
184
|
lineage_name = column_name # SQLGlot lineage uses the alias
|
|
165
|
-
|
|
166
|
-
|
|
167
|
-
|
|
168
|
-
|
|
169
|
-
|
|
185
|
+
# Qualify with target table
|
|
186
|
+
qualified_name = f"{target_table}.{column_name}"
|
|
187
|
+
columns.append(qualified_name)
|
|
188
|
+
self._column_mapping[qualified_name] = lineage_name
|
|
189
|
+
elif isinstance(projection, exp.Column):
|
|
190
|
+
# Check if this is a table-qualified star (e.g., t.*)
|
|
191
|
+
if isinstance(projection.this, exp.Star):
|
|
192
|
+
source_table = projection.table
|
|
193
|
+
qualified_star_cols: List[str] = []
|
|
194
|
+
if source_table and first_select:
|
|
195
|
+
qualified_star_cols = self._resolve_qualified_star(
|
|
196
|
+
source_table, first_select
|
|
197
|
+
)
|
|
198
|
+
for col in qualified_star_cols:
|
|
199
|
+
qualified_name = f"{target_table}.{col}"
|
|
200
|
+
columns.append(qualified_name)
|
|
201
|
+
self._column_mapping[qualified_name] = col
|
|
202
|
+
if not qualified_star_cols:
|
|
203
|
+
# Fallback: can't resolve t.*, use * as column name
|
|
204
|
+
qualified_name = f"{target_table}.*"
|
|
205
|
+
columns.append(qualified_name)
|
|
206
|
+
self._column_mapping[qualified_name] = "*"
|
|
170
207
|
else:
|
|
171
|
-
|
|
172
|
-
column_name = source_expr.sql(dialect=self.dialect)
|
|
208
|
+
column_name = projection.name
|
|
173
209
|
lineage_name = column_name
|
|
174
|
-
|
|
175
|
-
|
|
176
|
-
|
|
177
|
-
|
|
178
|
-
|
|
210
|
+
# Qualify with target table
|
|
211
|
+
qualified_name = f"{target_table}.{column_name}"
|
|
212
|
+
columns.append(qualified_name)
|
|
213
|
+
self._column_mapping[qualified_name] = lineage_name
|
|
214
|
+
else:
|
|
215
|
+
# For expressions, use the SQL representation
|
|
216
|
+
column_name = projection.sql(dialect=self.dialect)
|
|
217
|
+
lineage_name = column_name
|
|
218
|
+
# Qualify with target table
|
|
219
|
+
qualified_name = f"{target_table}.{column_name}"
|
|
220
|
+
columns.append(qualified_name)
|
|
221
|
+
self._column_mapping[qualified_name] = lineage_name
|
|
179
222
|
|
|
180
223
|
else:
|
|
181
224
|
# DQL (pure SELECT): Use the SELECT columns as output
|
|
@@ -324,6 +367,7 @@ class LineageAnalyzer:
|
|
|
324
367
|
"""
|
|
325
368
|
results = []
|
|
326
369
|
self._skipped_queries = [] # Reset skipped queries for this analysis
|
|
370
|
+
self._file_schema = {} # Reset file schema for this analysis run
|
|
327
371
|
|
|
328
372
|
for query_index, expr, preview in self._iterate_queries(table_filter):
|
|
329
373
|
# Temporarily swap self.expr to analyze this query
|
|
@@ -375,6 +419,9 @@ class LineageAnalyzer:
|
|
|
375
419
|
)
|
|
376
420
|
)
|
|
377
421
|
finally:
|
|
422
|
+
# Extract schema from this statement AFTER analysis
|
|
423
|
+
# This builds up context for subsequent statements to use
|
|
424
|
+
self._extract_schema_from_statement(expr)
|
|
378
425
|
# Restore original expression
|
|
379
426
|
self.expr = original_expr
|
|
380
427
|
|
|
@@ -702,7 +749,13 @@ class LineageAnalyzer:
|
|
|
702
749
|
lineage_col = self._column_mapping.get(col, col)
|
|
703
750
|
|
|
704
751
|
# Get lineage tree for this column using current query SQL only
|
|
705
|
-
|
|
752
|
+
# Pass file schema to enable SELECT * expansion for known tables/views
|
|
753
|
+
node = lineage(
|
|
754
|
+
lineage_col,
|
|
755
|
+
current_query_sql,
|
|
756
|
+
dialect=self.dialect,
|
|
757
|
+
schema=self._file_schema if self._file_schema else None,
|
|
758
|
+
)
|
|
706
759
|
|
|
707
760
|
# Collect all source columns
|
|
708
761
|
sources: Set[str] = set()
|
|
@@ -1235,3 +1288,338 @@ class LineageAnalyzer:
|
|
|
1235
1288
|
preview = self._generate_query_preview(expr)
|
|
1236
1289
|
|
|
1237
1290
|
yield idx, expr, preview
|
|
1291
|
+
|
|
1292
|
+
# -------------------------------------------------------------------------
|
|
1293
|
+
# File-scoped schema context methods
|
|
1294
|
+
# -------------------------------------------------------------------------
|
|
1295
|
+
|
|
1296
|
+
def _extract_schema_from_statement(self, expr: exp.Expression) -> None:
|
|
1297
|
+
"""
|
|
1298
|
+
Extract column definitions from CREATE VIEW/TABLE AS SELECT statements.
|
|
1299
|
+
|
|
1300
|
+
This method builds up file-scoped schema context as statements are processed,
|
|
1301
|
+
enabling SQLGlot to correctly expand SELECT * and trace cross-statement references.
|
|
1302
|
+
|
|
1303
|
+
Args:
|
|
1304
|
+
expr: The SQL expression to extract schema from
|
|
1305
|
+
"""
|
|
1306
|
+
# Only handle CREATE VIEW or CREATE TABLE (AS SELECT)
|
|
1307
|
+
if not isinstance(expr, exp.Create):
|
|
1308
|
+
return
|
|
1309
|
+
if expr.kind not in ("VIEW", "TABLE"):
|
|
1310
|
+
return
|
|
1311
|
+
|
|
1312
|
+
# Get target table/view name
|
|
1313
|
+
target = expr.this
|
|
1314
|
+
if isinstance(target, exp.Schema):
|
|
1315
|
+
target = target.this
|
|
1316
|
+
if not isinstance(target, exp.Table):
|
|
1317
|
+
return
|
|
1318
|
+
|
|
1319
|
+
target_name = self._get_qualified_table_name(target)
|
|
1320
|
+
|
|
1321
|
+
# Get the SELECT node from the CREATE statement
|
|
1322
|
+
select_node = expr.expression
|
|
1323
|
+
if select_node is None:
|
|
1324
|
+
return
|
|
1325
|
+
|
|
1326
|
+
# Handle Subquery wrapper (e.g., CREATE VIEW AS (SELECT ...))
|
|
1327
|
+
if isinstance(select_node, exp.Subquery):
|
|
1328
|
+
select_node = select_node.this
|
|
1329
|
+
|
|
1330
|
+
if not isinstance(
|
|
1331
|
+
select_node, (exp.Select, exp.Union, exp.Intersect, exp.Except)
|
|
1332
|
+
):
|
|
1333
|
+
return
|
|
1334
|
+
|
|
1335
|
+
# Extract column names from the SELECT
|
|
1336
|
+
columns = self._extract_columns_from_select(select_node)
|
|
1337
|
+
|
|
1338
|
+
if columns:
|
|
1339
|
+
# Store with UNKNOWN type - SQLGlot only needs column names for expansion
|
|
1340
|
+
self._file_schema[target_name] = {col: "UNKNOWN" for col in columns}
|
|
1341
|
+
|
|
1342
|
+
def _extract_columns_from_select(
|
|
1343
|
+
self, select_node: Union[exp.Select, exp.Union, exp.Intersect, exp.Except]
|
|
1344
|
+
) -> List[str]:
|
|
1345
|
+
"""
|
|
1346
|
+
Extract column names from a SELECT statement.
|
|
1347
|
+
|
|
1348
|
+
Handles aliases, direct column references, and SELECT * by resolving
|
|
1349
|
+
against the known file schema.
|
|
1350
|
+
|
|
1351
|
+
Args:
|
|
1352
|
+
select_node: The SELECT or set operation expression
|
|
1353
|
+
|
|
1354
|
+
Returns:
|
|
1355
|
+
List of column names
|
|
1356
|
+
"""
|
|
1357
|
+
columns: List[str] = []
|
|
1358
|
+
|
|
1359
|
+
# Get projections (for UNION, use first branch)
|
|
1360
|
+
projections = self._get_select_projections(select_node)
|
|
1361
|
+
first_select = self._get_first_select(select_node)
|
|
1362
|
+
|
|
1363
|
+
for projection in projections:
|
|
1364
|
+
if isinstance(projection, exp.Alias):
|
|
1365
|
+
# Use the alias name as the column name
|
|
1366
|
+
columns.append(projection.alias)
|
|
1367
|
+
elif isinstance(projection, exp.Column):
|
|
1368
|
+
# Check if this is a table-qualified star (e.g., t.*)
|
|
1369
|
+
if isinstance(projection.this, exp.Star):
|
|
1370
|
+
# Resolve table-qualified star from known schema
|
|
1371
|
+
table_name = projection.table
|
|
1372
|
+
if table_name and first_select:
|
|
1373
|
+
qualified_star_cols = self._resolve_qualified_star(
|
|
1374
|
+
table_name, first_select
|
|
1375
|
+
)
|
|
1376
|
+
columns.extend(qualified_star_cols)
|
|
1377
|
+
else:
|
|
1378
|
+
# Use the column name
|
|
1379
|
+
columns.append(projection.name)
|
|
1380
|
+
elif isinstance(projection, exp.Star):
|
|
1381
|
+
# Resolve SELECT * from known schema
|
|
1382
|
+
if first_select:
|
|
1383
|
+
star_columns = self._resolve_star_columns(first_select)
|
|
1384
|
+
columns.extend(star_columns)
|
|
1385
|
+
else:
|
|
1386
|
+
# For expressions without alias, use SQL representation
|
|
1387
|
+
col_sql = projection.sql(dialect=self.dialect)
|
|
1388
|
+
columns.append(col_sql)
|
|
1389
|
+
|
|
1390
|
+
return columns
|
|
1391
|
+
|
|
1392
|
+
def _resolve_star_columns(self, select_node: exp.Select) -> List[str]:
    """
    Resolve SELECT * to actual column names from known file schema or CTEs.

    Columns are gathered from the FROM source first, then from every
    JOINed source, then from any LATERAL VIEW clauses, preserving that
    order in the result.

    Args:
        select_node: The SELECT node containing the * reference

    Returns:
        List of column names if the source(s) are known, empty list otherwise
    """
    # Without a FROM clause there is nothing to expand the star against.
    from_clause = select_node.args.get("from")
    if not isinstance(from_clause, exp.From):
        return []

    resolved: List[str] = list(
        self._resolve_source_columns(from_clause.this, select_node)
    )

    # Every JOINed source contributes its columns as well.
    for join in select_node.args.get("joins") or []:
        if isinstance(join, exp.Join):
            resolved.extend(self._resolve_source_columns(join.this, select_node))

    # LATERAL VIEW clauses append their generated columns (e.g. explode).
    for lateral in select_node.args.get("laterals") or []:
        if isinstance(lateral, exp.Lateral):
            resolved.extend(self._resolve_lateral_columns(lateral))

    return resolved
|
|
1433
|
+
|
|
1434
|
+
def _resolve_lateral_columns(self, lateral: exp.Lateral) -> List[str]:
    """
    Extract generated column names from a LATERAL VIEW clause.

    Args:
        lateral: The Lateral expression node

    Returns:
        List of generated column names (e.g., ['elem'] for explode,
        ['pos', 'elem'] for posexplode)
    """
    # SQLGlot exposes the alias column list directly; normalize a
    # falsy result (None/empty) to an empty list.
    names = lateral.alias_column_names
    return names if names else []
|
|
1447
|
+
|
|
1448
|
+
def _resolve_source_columns(
    self, source: exp.Expression, select_node: exp.Select
) -> List[str]:
    """
    Resolve columns from a single source (table, subquery, etc.).

    Args:
        source: The source expression (Table, Subquery, etc.)
        select_node: The containing SELECT node for CTE resolution

    Returns:
        List of column names from the source
    """
    if isinstance(source, exp.Table):
        source_name = self._get_qualified_table_name(source)
        # Views/tables registered by earlier statements take precedence.
        schema = self._file_schema.get(source_name)
        if schema is not None:
            return list(schema.keys())
        # Otherwise the name may refer to a CTE in the same statement.
        return self._resolve_cte_columns(source_name, select_node)

    if isinstance(source, exp.Subquery):
        # A subquery alias recorded in the file schema wins.
        if source.alias and source.alias in self._file_schema:
            return list(self._file_schema[source.alias].keys())
        # Fall back to reading the subquery's own projection list.
        inner_select = source.this
        if isinstance(inner_select, exp.Select):
            return self._extract_subquery_columns(inner_select)

    return []
|
|
1488
|
+
|
|
1489
|
+
def _resolve_qualified_star(
    self, table_name: str, select_node: exp.Select
) -> List[str]:
    """
    Resolve a table-qualified star (e.g., t.*) to actual column names.

    Resolution order:
      1. A view/table recorded in the file schema under ``table_name``.
      2. A CTE of the same statement named ``table_name``.
      3. A table in the FROM clause or any JOIN whose alias is
         ``table_name``, resolved by its actual name via the file schema.

    Args:
        table_name: The table/alias name qualifying the star
        select_node: The SELECT node for context

    Returns:
        List of column names from the specified table, empty if unknown
    """
    # 1. Direct hit in the file schema (views/tables from prior statements).
    if table_name in self._file_schema:
        return list(self._file_schema[table_name].keys())

    # 2. CTE defined within the same statement.
    cte_columns = self._resolve_cte_columns(table_name, select_node)
    if cte_columns:
        return cte_columns

    # 3. The name may be a table alias. Previously the alias-resolution
    # logic was duplicated for the FROM source and each JOIN source;
    # collect all candidate sources once and scan them uniformly.
    candidates: List[exp.Expression] = []
    from_clause = select_node.args.get("from")
    if isinstance(from_clause, exp.From):
        candidates.append(from_clause.this)
    for join in select_node.args.get("joins") or []:
        if isinstance(join, exp.Join):
            candidates.append(join.this)

    for candidate in candidates:
        if isinstance(candidate, exp.Table) and candidate.alias == table_name:
            actual_name = self._get_qualified_table_name(candidate)
            if actual_name in self._file_schema:
                return list(self._file_schema[actual_name].keys())

    return []
|
|
1535
|
+
|
|
1536
|
+
def _extract_subquery_columns(self, subquery_select: exp.Select) -> List[str]:
    """
    Extract column names from a subquery's SELECT statement.

    Args:
        subquery_select: The SELECT expression within the subquery

    Returns:
        List of column names
    """
    names: List[str] = []

    for proj in subquery_select.expressions:
        if isinstance(proj, exp.Alias):
            # Aliased expressions expose the alias as the column name.
            names.append(proj.alias)
        elif isinstance(proj, exp.Column):
            if isinstance(proj.this, exp.Star):
                # Table-qualified star (t.*): expand via the known schema.
                qualifier = proj.table
                if qualifier:
                    names.extend(
                        self._resolve_qualified_star(qualifier, subquery_select)
                    )
            else:
                names.append(proj.name)
        elif isinstance(proj, exp.Star):
            # Bare SELECT *: expand against the subquery's own sources.
            names.extend(self._resolve_star_columns(subquery_select))
        else:
            # Unaliased expression: fall back to its SQL text.
            names.append(proj.sql(dialect=self.dialect))

    return names
|
|
1571
|
+
|
|
1572
|
+
def _resolve_cte_columns(self, cte_name: str, select_node: exp.Select) -> List[str]:
    """
    Resolve columns from a CTE definition within the same statement.

    Walks up the expression tree from ``select_node`` looking for a WITH
    clause that defines ``cte_name``. CTEs defined as set operations
    (UNION/INTERSECT/EXCEPT) are resolved against their first SELECT
    branch, which determines the output column names.

    Args:
        cte_name: Name of the CTE to resolve
        select_node: The SELECT node that references the CTE

    Returns:
        List of column names from the CTE, empty if CTE not found
    """
    # Walk up the tree to find the WITH clause containing this CTE.
    parent = select_node
    while parent:
        if hasattr(parent, "args") and parent.args.get("with"):
            with_clause = parent.args["with"]
            for cte in with_clause.expressions:
                if isinstance(cte, exp.CTE) and cte.alias == cte_name:
                    cte_select = cte.this
                    # Fix: a CTE defined as a set operation wraps its
                    # branches in Union/Intersect/Except; descend to the
                    # leftmost SELECT so its columns can be extracted
                    # (previously such CTEs silently resolved to []).
                    while isinstance(
                        cte_select, (exp.Union, exp.Intersect, exp.Except)
                    ):
                        cte_select = cte_select.this
                    if isinstance(cte_select, exp.Select):
                        return self._extract_cte_select_columns(cte_select)
        parent = parent.parent if hasattr(parent, "parent") else None

    return []
|
|
1597
|
+
|
|
1598
|
+
def _extract_cte_select_columns(self, cte_select: exp.Select) -> List[str]:
    """
    Extract column names from a CTE's SELECT statement.

    Handles aliases, plain column references, bare ``SELECT *``, and
    table-qualified stars (``t.*``) by resolving against the known
    file schema.

    Args:
        cte_select: The SELECT expression within the CTE

    Returns:
        List of column names
    """
    columns: List[str] = []

    for projection in cte_select.expressions:
        if isinstance(projection, exp.Alias):
            columns.append(projection.alias)
        elif isinstance(projection, exp.Column):
            # Fix: a table-qualified star (t.*) parses as a Column whose
            # `this` is a Star; expand it via the known schema instead of
            # emitting the star's literal name (mirrors
            # _extract_subquery_columns).
            if isinstance(projection.this, exp.Star):
                table_name = projection.table
                if table_name:
                    columns.extend(
                        self._resolve_qualified_star(table_name, cte_select)
                    )
            else:
                columns.append(projection.name)
        elif isinstance(projection, exp.Star):
            # Resolve SELECT * in CTE from file schema
            star_columns = self._resolve_star_columns(cte_select)
            columns.extend(star_columns)
        else:
            # Unaliased expression: fall back to its SQL text.
            columns.append(projection.sql(dialect=self.dialect))

    return columns
|