sql-glider 0.1.4__tar.gz → 0.1.5__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {sql_glider-0.1.4 → sql_glider-0.1.5}/PKG-INFO +1 -1
- sql_glider-0.1.5/plans/2026-01-26-file-scoped-schema-context.md +199 -0
- {sql_glider-0.1.4 → sql_glider-0.1.5}/src/sqlglider/_version.py +2 -2
- {sql_glider-0.1.4 → sql_glider-0.1.5}/src/sqlglider/lineage/analyzer.py +192 -18
- {sql_glider-0.1.4 → sql_glider-0.1.5}/tests/sqlglider/lineage/test_analyzer.py +217 -0
- {sql_glider-0.1.4 → sql_glider-0.1.5}/.github/workflows/ci.yml +0 -0
- {sql_glider-0.1.4 → sql_glider-0.1.5}/.github/workflows/publish.yml +0 -0
- {sql_glider-0.1.4 → sql_glider-0.1.5}/.gitignore +0 -0
- {sql_glider-0.1.4 → sql_glider-0.1.5}/.python-version +0 -0
- {sql_glider-0.1.4 → sql_glider-0.1.5}/ARCHITECTURE.md +0 -0
- {sql_glider-0.1.4 → sql_glider-0.1.5}/CLAUDE.md +0 -0
- {sql_glider-0.1.4 → sql_glider-0.1.5}/LICENSE +0 -0
- {sql_glider-0.1.4 → sql_glider-0.1.5}/README.md +0 -0
- {sql_glider-0.1.4 → sql_glider-0.1.5}/plans/2025-12-05-column-level-lineage.md +0 -0
- {sql_glider-0.1.4 → sql_glider-0.1.5}/plans/2025-12-05-reverse-lineage.md +0 -0
- {sql_glider-0.1.4 → sql_glider-0.1.5}/plans/2025-12-06-config-file-support.md +0 -0
- {sql_glider-0.1.4 → sql_glider-0.1.5}/plans/2025-12-06-graph-lineage.md +0 -0
- {sql_glider-0.1.4 → sql_glider-0.1.5}/plans/2025-12-06-unify-single-multi-query.md +0 -0
- {sql_glider-0.1.4 → sql_glider-0.1.5}/plans/2025-12-07-sample-data-model.md +0 -0
- {sql_glider-0.1.4 → sql_glider-0.1.5}/plans/2025-12-07-sql-templating.md +0 -0
- {sql_glider-0.1.4 → sql_glider-0.1.5}/plans/2025-12-08-tables-command.md +0 -0
- {sql_glider-0.1.4 → sql_glider-0.1.5}/plans/2025-12-09-graph-query-paths.md +0 -0
- {sql_glider-0.1.4 → sql_glider-0.1.5}/plans/2025-12-13-dissect-command.md +0 -0
- {sql_glider-0.1.4 → sql_glider-0.1.5}/plans/2025-12-14-tables-pull-command.md +0 -0
- {sql_glider-0.1.4 → sql_glider-0.1.5}/plans/2026-01-25-fix-union-lineage-chain.md +0 -0
- {sql_glider-0.1.4 → sql_glider-0.1.5}/pyproject.toml +0 -0
- {sql_glider-0.1.4 → sql_glider-0.1.5}/sample_data_model/README.md +0 -0
- {sql_glider-0.1.4 → sql_glider-0.1.5}/sample_data_model/business/expire_dim_customer.sql +0 -0
- {sql_glider-0.1.4 → sql_glider-0.1.5}/sample_data_model/business/load_fact_orders.sql +0 -0
- {sql_glider-0.1.4 → sql_glider-0.1.5}/sample_data_model/business/load_fact_payments.sql +0 -0
- {sql_glider-0.1.4 → sql_glider-0.1.5}/sample_data_model/business/merge_dim_customer.sql +0 -0
- {sql_glider-0.1.4 → sql_glider-0.1.5}/sample_data_model/business/merge_dim_product.sql +0 -0
- {sql_glider-0.1.4 → sql_glider-0.1.5}/sample_data_model/business/update_dim_customer_metrics.sql +0 -0
- {sql_glider-0.1.4 → sql_glider-0.1.5}/sample_data_model/complex/conditional_merge.sql +0 -0
- {sql_glider-0.1.4 → sql_glider-0.1.5}/sample_data_model/complex/cte_insert.sql +0 -0
- {sql_glider-0.1.4 → sql_glider-0.1.5}/sample_data_model/complex/multi_table_transform.sql +0 -0
- {sql_glider-0.1.4 → sql_glider-0.1.5}/sample_data_model/ddl/dim_customer.sql +0 -0
- {sql_glider-0.1.4 → sql_glider-0.1.5}/sample_data_model/ddl/dim_product.sql +0 -0
- {sql_glider-0.1.4 → sql_glider-0.1.5}/sample_data_model/ddl/fact_orders.sql +0 -0
- {sql_glider-0.1.4 → sql_glider-0.1.5}/sample_data_model/ddl/fact_payments.sql +0 -0
- {sql_glider-0.1.4 → sql_glider-0.1.5}/sample_data_model/ddl/raw_addresses.sql +0 -0
- {sql_glider-0.1.4 → sql_glider-0.1.5}/sample_data_model/ddl/raw_customers.sql +0 -0
- {sql_glider-0.1.4 → sql_glider-0.1.5}/sample_data_model/ddl/raw_order_items.sql +0 -0
- {sql_glider-0.1.4 → sql_glider-0.1.5}/sample_data_model/ddl/raw_orders.sql +0 -0
- {sql_glider-0.1.4 → sql_glider-0.1.5}/sample_data_model/ddl/raw_payments.sql +0 -0
- {sql_glider-0.1.4 → sql_glider-0.1.5}/sample_data_model/ddl/raw_products.sql +0 -0
- {sql_glider-0.1.4 → sql_glider-0.1.5}/sample_data_model/ddl/stg_customers.sql +0 -0
- {sql_glider-0.1.4 → sql_glider-0.1.5}/sample_data_model/ddl/stg_orders.sql +0 -0
- {sql_glider-0.1.4 → sql_glider-0.1.5}/sample_data_model/ddl/stg_payments.sql +0 -0
- {sql_glider-0.1.4 → sql_glider-0.1.5}/sample_data_model/ddl/stg_products.sql +0 -0
- {sql_glider-0.1.4 → sql_glider-0.1.5}/sample_data_model/incremental/incr_fact_orders.sql +0 -0
- {sql_glider-0.1.4 → sql_glider-0.1.5}/sample_data_model/incremental/incr_fact_payments.sql +0 -0
- {sql_glider-0.1.4 → sql_glider-0.1.5}/sample_data_model/incremental/incr_pres_sales_summary.sql +0 -0
- {sql_glider-0.1.4 → sql_glider-0.1.5}/sample_data_model/maintenance/delete_expired_customers.sql +0 -0
- {sql_glider-0.1.4 → sql_glider-0.1.5}/sample_data_model/maintenance/update_product_status.sql +0 -0
- {sql_glider-0.1.4 → sql_glider-0.1.5}/sample_data_model/presentation/load_pres_customer_360.sql +0 -0
- {sql_glider-0.1.4 → sql_glider-0.1.5}/sample_data_model/presentation/load_pres_customer_cohort.sql +0 -0
- {sql_glider-0.1.4 → sql_glider-0.1.5}/sample_data_model/presentation/load_pres_product_performance.sql +0 -0
- {sql_glider-0.1.4 → sql_glider-0.1.5}/sample_data_model/presentation/load_pres_sales_summary.sql +0 -0
- {sql_glider-0.1.4 → sql_glider-0.1.5}/sample_data_model/staging/load_stg_customers.sql +0 -0
- {sql_glider-0.1.4 → sql_glider-0.1.5}/sample_data_model/staging/load_stg_orders.sql +0 -0
- {sql_glider-0.1.4 → sql_glider-0.1.5}/sample_data_model/staging/load_stg_payments.sql +0 -0
- {sql_glider-0.1.4 → sql_glider-0.1.5}/sample_data_model/staging/load_stg_products.sql +0 -0
- {sql_glider-0.1.4 → sql_glider-0.1.5}/sqlglider.toml.example +0 -0
- {sql_glider-0.1.4 → sql_glider-0.1.5}/src/sqlglider/__init__.py +0 -0
- {sql_glider-0.1.4 → sql_glider-0.1.5}/src/sqlglider/catalog/__init__.py +0 -0
- {sql_glider-0.1.4 → sql_glider-0.1.5}/src/sqlglider/catalog/base.py +0 -0
- {sql_glider-0.1.4 → sql_glider-0.1.5}/src/sqlglider/catalog/databricks.py +0 -0
- {sql_glider-0.1.4 → sql_glider-0.1.5}/src/sqlglider/catalog/registry.py +0 -0
- {sql_glider-0.1.4 → sql_glider-0.1.5}/src/sqlglider/cli.py +0 -0
- {sql_glider-0.1.4 → sql_glider-0.1.5}/src/sqlglider/dissection/__init__.py +0 -0
- {sql_glider-0.1.4 → sql_glider-0.1.5}/src/sqlglider/dissection/analyzer.py +0 -0
- {sql_glider-0.1.4 → sql_glider-0.1.5}/src/sqlglider/dissection/formatters.py +0 -0
- {sql_glider-0.1.4 → sql_glider-0.1.5}/src/sqlglider/dissection/models.py +0 -0
- {sql_glider-0.1.4 → sql_glider-0.1.5}/src/sqlglider/global_models.py +0 -0
- {sql_glider-0.1.4 → sql_glider-0.1.5}/src/sqlglider/graph/__init__.py +0 -0
- {sql_glider-0.1.4 → sql_glider-0.1.5}/src/sqlglider/graph/builder.py +0 -0
- {sql_glider-0.1.4 → sql_glider-0.1.5}/src/sqlglider/graph/merge.py +0 -0
- {sql_glider-0.1.4 → sql_glider-0.1.5}/src/sqlglider/graph/models.py +0 -0
- {sql_glider-0.1.4 → sql_glider-0.1.5}/src/sqlglider/graph/query.py +0 -0
- {sql_glider-0.1.4 → sql_glider-0.1.5}/src/sqlglider/graph/serialization.py +0 -0
- {sql_glider-0.1.4 → sql_glider-0.1.5}/src/sqlglider/lineage/__init__.py +0 -0
- {sql_glider-0.1.4 → sql_glider-0.1.5}/src/sqlglider/lineage/formatters.py +0 -0
- {sql_glider-0.1.4 → sql_glider-0.1.5}/src/sqlglider/templating/__init__.py +0 -0
- {sql_glider-0.1.4 → sql_glider-0.1.5}/src/sqlglider/templating/base.py +0 -0
- {sql_glider-0.1.4 → sql_glider-0.1.5}/src/sqlglider/templating/jinja.py +0 -0
- {sql_glider-0.1.4 → sql_glider-0.1.5}/src/sqlglider/templating/registry.py +0 -0
- {sql_glider-0.1.4 → sql_glider-0.1.5}/src/sqlglider/templating/variables.py +0 -0
- {sql_glider-0.1.4 → sql_glider-0.1.5}/src/sqlglider/utils/__init__.py +0 -0
- {sql_glider-0.1.4 → sql_glider-0.1.5}/src/sqlglider/utils/config.py +0 -0
- {sql_glider-0.1.4 → sql_glider-0.1.5}/src/sqlglider/utils/file_utils.py +0 -0
- {sql_glider-0.1.4 → sql_glider-0.1.5}/tests/__init__.py +0 -0
- {sql_glider-0.1.4 → sql_glider-0.1.5}/tests/fixtures/multi_file_queries/analytics_pipeline.sql +0 -0
- {sql_glider-0.1.4 → sql_glider-0.1.5}/tests/fixtures/multi_file_queries/analytics_pipeline_union_merge.sql +0 -0
- {sql_glider-0.1.4 → sql_glider-0.1.5}/tests/fixtures/multi_file_queries/customers.sql +0 -0
- {sql_glider-0.1.4 → sql_glider-0.1.5}/tests/fixtures/multi_file_queries/orders.sql +0 -0
- {sql_glider-0.1.4 → sql_glider-0.1.5}/tests/fixtures/multi_file_queries/reports.sql +0 -0
- {sql_glider-0.1.4 → sql_glider-0.1.5}/tests/fixtures/multi_file_queries/view_based_merge.sql +0 -0
- {sql_glider-0.1.4 → sql_glider-0.1.5}/tests/fixtures/original_queries/test_cte.sql +0 -0
- {sql_glider-0.1.4 → sql_glider-0.1.5}/tests/fixtures/original_queries/test_cte_query.sql +0 -0
- {sql_glider-0.1.4 → sql_glider-0.1.5}/tests/fixtures/original_queries/test_generated_column_query.sql +0 -0
- {sql_glider-0.1.4 → sql_glider-0.1.5}/tests/fixtures/original_queries/test_multi.sql +0 -0
- {sql_glider-0.1.4 → sql_glider-0.1.5}/tests/fixtures/original_queries/test_multi_query.sql +0 -0
- {sql_glider-0.1.4 → sql_glider-0.1.5}/tests/fixtures/original_queries/test_single_query.sql +0 -0
- {sql_glider-0.1.4 → sql_glider-0.1.5}/tests/fixtures/original_queries/test_subquery.sql +0 -0
- {sql_glider-0.1.4 → sql_glider-0.1.5}/tests/fixtures/original_queries/test_tables.sql +0 -0
- {sql_glider-0.1.4 → sql_glider-0.1.5}/tests/fixtures/original_queries/test_view.sql +0 -0
- {sql_glider-0.1.4 → sql_glider-0.1.5}/tests/fixtures/original_queries/test_view_window_cte.sql +0 -0
- {sql_glider-0.1.4 → sql_glider-0.1.5}/tests/fixtures/sample_manifest.csv +0 -0
- {sql_glider-0.1.4 → sql_glider-0.1.5}/tests/sqlglider/__init__.py +0 -0
- {sql_glider-0.1.4 → sql_glider-0.1.5}/tests/sqlglider/catalog/__init__.py +0 -0
- {sql_glider-0.1.4 → sql_glider-0.1.5}/tests/sqlglider/catalog/test_base.py +0 -0
- {sql_glider-0.1.4 → sql_glider-0.1.5}/tests/sqlglider/catalog/test_databricks.py +0 -0
- {sql_glider-0.1.4 → sql_glider-0.1.5}/tests/sqlglider/catalog/test_registry.py +0 -0
- {sql_glider-0.1.4 → sql_glider-0.1.5}/tests/sqlglider/dissection/__init__.py +0 -0
- {sql_glider-0.1.4 → sql_glider-0.1.5}/tests/sqlglider/dissection/test_analyzer.py +0 -0
- {sql_glider-0.1.4 → sql_glider-0.1.5}/tests/sqlglider/dissection/test_formatters.py +0 -0
- {sql_glider-0.1.4 → sql_glider-0.1.5}/tests/sqlglider/dissection/test_models.py +0 -0
- {sql_glider-0.1.4 → sql_glider-0.1.5}/tests/sqlglider/graph/__init__.py +0 -0
- {sql_glider-0.1.4 → sql_glider-0.1.5}/tests/sqlglider/graph/test_builder.py +0 -0
- {sql_glider-0.1.4 → sql_glider-0.1.5}/tests/sqlglider/graph/test_merge.py +0 -0
- {sql_glider-0.1.4 → sql_glider-0.1.5}/tests/sqlglider/graph/test_models.py +0 -0
- {sql_glider-0.1.4 → sql_glider-0.1.5}/tests/sqlglider/graph/test_query.py +0 -0
- {sql_glider-0.1.4 → sql_glider-0.1.5}/tests/sqlglider/graph/test_serialization.py +0 -0
- {sql_glider-0.1.4 → sql_glider-0.1.5}/tests/sqlglider/lineage/__init__.py +0 -0
- {sql_glider-0.1.4 → sql_glider-0.1.5}/tests/sqlglider/lineage/test_formatters.py +0 -0
- {sql_glider-0.1.4 → sql_glider-0.1.5}/tests/sqlglider/templating/__init__.py +0 -0
- {sql_glider-0.1.4 → sql_glider-0.1.5}/tests/sqlglider/templating/test_base.py +0 -0
- {sql_glider-0.1.4 → sql_glider-0.1.5}/tests/sqlglider/templating/test_jinja.py +0 -0
- {sql_glider-0.1.4 → sql_glider-0.1.5}/tests/sqlglider/templating/test_registry.py +0 -0
- {sql_glider-0.1.4 → sql_glider-0.1.5}/tests/sqlglider/templating/test_variables.py +0 -0
- {sql_glider-0.1.4 → sql_glider-0.1.5}/tests/sqlglider/test_cli.py +0 -0
- {sql_glider-0.1.4 → sql_glider-0.1.5}/tests/sqlglider/utils/__init__.py +0 -0
- {sql_glider-0.1.4 → sql_glider-0.1.5}/tests/sqlglider/utils/test_config.py +0 -0
- {sql_glider-0.1.4 → sql_glider-0.1.5}/tests/sqlglider/utils/test_file_utils.py +0 -0
- {sql_glider-0.1.4 → sql_glider-0.1.5}/uv.lock +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: sql-glider
|
|
3
|
-
Version: 0.1.4
|
|
3
|
+
Version: 0.1.5
|
|
4
4
|
Summary: SQL Utility Toolkit for better understanding, use, and governance of your queries in a native environment.
|
|
5
5
|
Project-URL: Homepage, https://github.com/rycowhi/sql-glider/
|
|
6
6
|
Project-URL: Repository, https://github.com/rycowhi/sql-glider/
|
|
@@ -0,0 +1,199 @@
|
|
|
1
|
+
# Plan: File-Scoped Schema Context for SQL Lineage Analyzer
|
|
2
|
+
|
|
3
|
+
**Status:** Completed
|
|
4
|
+
|
|
5
|
+
## Summary
|
|
6
|
+
|
|
7
|
+
Add file-scoped schema context to the SQL Glider lineage analyzer so that SQLGlot can correctly expand `SELECT *` and trace cross-statement references when a file contains multiple related statements.
|
|
8
|
+
|
|
9
|
+
## Problem
|
|
10
|
+
|
|
11
|
+
When analyzing this SQL:
|
|
12
|
+
```sql
|
|
13
|
+
CREATE TEMPORARY VIEW first_view AS (SELECT a, b, c FROM source_table);
|
|
14
|
+
CREATE TEMPORARY VIEW second_view AS
|
|
15
|
+
WITH first_view_cte AS (
|
|
16
|
+
SELECT *, row_number() OVER (PARTITION BY a ORDER BY b DESC) AS row_num
|
|
17
|
+
FROM first_view
|
|
18
|
+
)
|
|
19
|
+
SELECT * FROM first_view_cte WHERE c = 1;
|
|
20
|
+
```
|
|
21
|
+
|
|
22
|
+
**Previous output:** `* -> second_view.*` (useless - no column-level lineage)
|
|
23
|
+
**Expected output:** `first_view.a -> second_view.a`, `first_view.b -> second_view.b`, etc.
|
|
24
|
+
|
|
25
|
+
## Root Cause
|
|
26
|
+
|
|
27
|
+
SQLGlot's `lineage()` function accepts a `schema` parameter that provides table/view column definitions. Without this schema context, SQLGlot cannot expand `SELECT *` to actual column names.
|
|
28
|
+
|
|
29
|
+
## Solution
|
|
30
|
+
|
|
31
|
+
Build up schema context incrementally as CREATE VIEW/TABLE statements are processed, then pass that schema to subsequent `lineage()` calls.
|
|
32
|
+
|
|
33
|
+
---
|
|
34
|
+
|
|
35
|
+
## Implementation Steps
|
|
36
|
+
|
|
37
|
+
### 1. Add Schema Instance Variable
|
|
38
|
+
|
|
39
|
+
- [x] Add `_file_schema: Dict[str, Dict[str, str]] = {}` to `LineageAnalyzer.__init__()`
|
|
40
|
+
|
|
41
|
+
### 2. Add Schema Extraction Methods
|
|
42
|
+
|
|
43
|
+
- [x] `_extract_schema_from_statement()` - Extract columns from CREATE VIEW/TABLE AS SELECT
|
|
44
|
+
- [x] `_extract_columns_from_select()` - Extract column names from SELECT projections
|
|
45
|
+
- [x] `_resolve_star_columns()` - Resolve SELECT * from file schema or CTEs
|
|
46
|
+
- [x] `_resolve_source_columns()` - Resolve columns from a single source (table, subquery)
|
|
47
|
+
- [x] `_resolve_qualified_star()` - Resolve table-qualified star (e.g., `t.*`)
|
|
48
|
+
- [x] `_extract_subquery_columns()` - Extract columns from subquery's SELECT
|
|
49
|
+
- [x] `_resolve_cte_columns()` - Resolve columns from CTE definitions
|
|
50
|
+
- [x] `_extract_cte_select_columns()` - Extract columns from CTE's SELECT
|
|
51
|
+
|
|
52
|
+
### 3. Integrate Schema Building into Analysis Loop
|
|
53
|
+
|
|
54
|
+
- [x] Reset `_file_schema = {}` at start of `analyze_queries()`
|
|
55
|
+
- [x] Call `_extract_schema_from_statement(expr)` in `finally` block AFTER analysis
|
|
56
|
+
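- [x] Critical: Schema must be extracted AFTER the statement is analyzed; extracting it beforehand causes SQLGlot to return unqualified column names (e.g., `customer_id` instead of `orders.customer_id`)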
|
|
57
|
+
|
|
58
|
+
### 4. Pass Schema to lineage() Calls
|
|
59
|
+
|
|
60
|
+
- [x] Modify `_analyze_column_lineage_internal()` to pass schema:
|
|
61
|
+
```python
|
|
62
|
+
node = lineage(
|
|
63
|
+
lineage_col,
|
|
64
|
+
current_query_sql,
|
|
65
|
+
dialect=self.dialect,
|
|
66
|
+
schema=self._file_schema if self._file_schema else None,
|
|
67
|
+
)
|
|
68
|
+
```
|
|
69
|
+
|
|
70
|
+
### 5. Handle SELECT * in get_output_columns()
|
|
71
|
+
|
|
72
|
+
- [x] Handle `exp.Star` projections by resolving from file schema
|
|
73
|
+
- [x] Handle table-qualified stars (`t.*`) represented as `exp.Column` with `exp.Star` as `this`
|
|
74
|
+
|
|
75
|
+
---
|
|
76
|
+
|
|
77
|
+
## Edge Cases Handled
|
|
78
|
+
|
|
79
|
+
| Case | Implementation |
|
|
80
|
+
|------|----------------|
|
|
81
|
+
| `SELECT *` from unknown table | Returns empty columns, falls back to `*` behavior |
|
|
82
|
+
| Nested `SELECT *` through CTEs | Resolves CTE source from schema first |
|
|
83
|
+
| UNION in CREATE VIEW | Uses first branch's columns |
|
|
84
|
+
| Expressions without aliases | Uses SQL representation as column name |
|
|
85
|
+
| TEMPORARY VIEW | Treated same as regular VIEW |
|
|
86
|
+
| Multiple JOINs | Collects columns from all joined tables |
|
|
87
|
+
| LEFT/RIGHT/FULL OUTER JOIN | Same handling as INNER JOIN |
|
|
88
|
+
| CROSS JOIN | Same handling as INNER JOIN |
|
|
89
|
+
| Subquery in FROM clause | Extracts columns from inner SELECT |
|
|
90
|
+
| Table aliases (`v1 AS x`) | Resolves alias to actual table name |
|
|
91
|
+
| Schema-qualified names | Handles `schema.table` correctly |
|
|
92
|
+
| CTE referencing earlier CTE | Recursive CTE column resolution |
|
|
93
|
+
| `SELECT *, extra_col` | Combines * expansion with extra columns |
|
|
94
|
+
| Table-qualified `t.*` | Handles `v1.*` style syntax |
|
|
95
|
+
| LATERAL VIEW explode | Collects generated columns from `laterals` clause |
|
|
96
|
+
| LATERAL VIEW posexplode | Collects both position and element columns |
|
|
97
|
+
| Multiple LATERAL VIEWs | Collects columns from all LATERAL VIEWs |
|
|
98
|
+
| LATERAL VIEW OUTER | Same handling as regular LATERAL VIEW |
|
|
99
|
+
|
|
100
|
+
---
|
|
101
|
+
|
|
102
|
+
## Files Modified
|
|
103
|
+
|
|
104
|
+
| File | Changes |
|
|
105
|
+
|------|---------|
|
|
106
|
+
| `src/sqlglider/lineage/analyzer.py` | Added `_file_schema` instance variable; added 9 schema extraction methods (including `_resolve_lateral_columns`); modified `analyze_queries()`, `_analyze_column_lineage_internal()`, and `get_output_columns()` |
|
|
107
|
+
| `tests/sqlglider/lineage/test_analyzer.py` | Added `TestFileSchemaExtraction` (9 tests), `TestCrossStatementLineage` (12 tests), and `TestLateralViewColumnResolution` (5 tests) |
|
|
108
|
+
|
|
109
|
+
---
|
|
110
|
+
|
|
111
|
+
## Testing
|
|
112
|
+
|
|
113
|
+
### Test Classes Added
|
|
114
|
+
|
|
115
|
+
**TestFileSchemaExtraction (9 tests):**
|
|
116
|
+
- `test_extract_schema_from_create_view`
|
|
117
|
+
- `test_extract_schema_from_create_temporary_view`
|
|
118
|
+
- `test_extract_schema_from_create_table_as`
|
|
119
|
+
- `test_extract_schema_with_aliases`
|
|
120
|
+
- `test_extract_schema_select_star_from_known_table`
|
|
121
|
+
- `test_extract_schema_select_star_from_unknown_table`
|
|
122
|
+
- `test_schema_not_extracted_from_pure_select`
|
|
123
|
+
- `test_schema_not_extracted_from_insert`
|
|
124
|
+
- `test_schema_reset_between_analysis_calls`
|
|
125
|
+
|
|
126
|
+
**TestCrossStatementLineage (12 tests):**
|
|
127
|
+
- `test_view_referencing_earlier_view`
|
|
128
|
+
- `test_select_star_expansion_through_view`
|
|
129
|
+
- `test_cte_with_select_star_from_view`
|
|
130
|
+
- `test_window_function_with_select_star`
|
|
131
|
+
- `test_insert_from_view_lineage`
|
|
132
|
+
- `test_multi_hop_view_lineage`
|
|
133
|
+
- `test_original_problem_scenario`
|
|
134
|
+
- `test_select_star_from_join`
|
|
135
|
+
- `test_nested_ctes_and_views_with_select_star`
|
|
136
|
+
- `test_select_star_from_subquery`
|
|
137
|
+
- `test_table_qualified_star`
|
|
138
|
+
- `test_table_qualified_star_with_alias`
|
|
139
|
+
|
|
140
|
+
**TestLateralViewColumnResolution (5 tests):**
|
|
141
|
+
- `test_select_star_with_lateral_view_explode`
|
|
142
|
+
- `test_select_star_with_lateral_view_posexplode`
|
|
143
|
+
- `test_select_star_with_multiple_lateral_views`
|
|
144
|
+
- `test_select_star_with_lateral_view_outer`
|
|
145
|
+
- `test_lateral_view_with_join`
|
|
146
|
+
|
|
147
|
+
### Verification Commands
|
|
148
|
+
|
|
149
|
+
```bash
|
|
150
|
+
# Run all tests
|
|
151
|
+
uv run pytest --cov=sqlglider --cov-fail-under=80
|
|
152
|
+
|
|
153
|
+
# Run schema-related tests
|
|
154
|
+
uv run pytest tests/sqlglider/lineage/test_analyzer.py -k "schema or CrossStatement or LateralView" -v
|
|
155
|
+
|
|
156
|
+
# Test the original problem scenario
|
|
157
|
+
uv run sqlglider graph build test_view_window_cte.sql --dialect spark --output graph.json
|
|
158
|
+
```
|
|
159
|
+
|
|
160
|
+
---
|
|
161
|
+
|
|
162
|
+
## Implementation Notes
|
|
163
|
+
|
|
164
|
+
### Critical Timing Issue
|
|
165
|
+
|
|
166
|
+
Initially, schema extraction was done BEFORE analysis in the loop, which caused SQLGlot to return unqualified column names (e.g., `customer_id` instead of `orders.customer_id`).
|
|
167
|
+
|
|
168
|
+
**Fix:** Move `_extract_schema_from_statement(expr)` to the `finally` block AFTER analysis completes. This ensures:
|
|
169
|
+
1. The current statement is analyzed without its own schema (correct behavior)
|
|
170
|
+
2. The schema is then extracted for use by subsequent statements
|
|
171
|
+
|
|
172
|
+
### Table-Qualified Star Handling
|
|
173
|
+
|
|
174
|
+
Table-qualified stars (`v1.*`) are represented differently than unqualified stars (`*`):
|
|
175
|
+
- `*` is `exp.Star`
|
|
176
|
+
- `v1.*` is `exp.Column` with `this` being `exp.Star` and `table` being `v1`
|
|
177
|
+
|
|
178
|
+
Both cases needed handling in:
|
|
179
|
+
- `_extract_columns_from_select()` for schema extraction
|
|
180
|
+
- `get_output_columns()` for lineage analysis output
|
|
181
|
+
|
|
182
|
+
### Subquery Column Resolution
|
|
183
|
+
|
|
184
|
+
For `SELECT * FROM (SELECT * FROM v1) sub`, the code:
|
|
185
|
+
1. Detects the subquery in `_resolve_source_columns()`
|
|
186
|
+
2. Extracts columns from the inner SELECT via `_extract_subquery_columns()`
|
|
187
|
+
3. Recursively resolves any `SELECT *` in the inner query
|
|
188
|
+
|
|
189
|
+
---
|
|
190
|
+
|
|
191
|
+
## Lessons Learned
|
|
192
|
+
|
|
193
|
+
1. **Timing matters:** Schema context must be built AFTER analyzing a statement, not before, to avoid confusing SQLGlot's lineage tracing.
|
|
194
|
+
|
|
195
|
+
2. **AST structure varies:** Different SQL constructs have different AST representations (e.g., `*` vs `t.*`), requiring multiple code paths.
|
|
196
|
+
|
|
197
|
+
3. **Recursive resolution:** CTEs and subqueries can reference other CTEs/views, requiring recursive column resolution.
|
|
198
|
+
|
|
199
|
+
4. **Edge cases compound:** JOINs + aliases + qualified stars can all combine, requiring careful handling of each case.
|
|
@@ -28,7 +28,7 @@ version_tuple: VERSION_TUPLE
|
|
|
28
28
|
commit_id: COMMIT_ID
|
|
29
29
|
__commit_id__: COMMIT_ID
|
|
30
30
|
|
|
31
|
-
__version__ = version = '0.1.4'
|
|
32
|
-
__version_tuple__ = version_tuple = (0, 1, 4)
|
|
31
|
+
__version__ = version = '0.1.5'
|
|
32
|
+
__version_tuple__ = version_tuple = (0, 1, 5)
|
|
33
33
|
|
|
34
34
|
__commit_id__ = commit_id = None
|
|
@@ -182,20 +182,43 @@ class LineageAnalyzer:
|
|
|
182
182
|
# For aliased columns, use the alias as the column name
|
|
183
183
|
column_name = projection.alias
|
|
184
184
|
lineage_name = column_name # SQLGlot lineage uses the alias
|
|
185
|
-
|
|
186
|
-
|
|
187
|
-
|
|
188
|
-
|
|
189
|
-
|
|
185
|
+
# Qualify with target table
|
|
186
|
+
qualified_name = f"{target_table}.{column_name}"
|
|
187
|
+
columns.append(qualified_name)
|
|
188
|
+
self._column_mapping[qualified_name] = lineage_name
|
|
189
|
+
elif isinstance(projection, exp.Column):
|
|
190
|
+
# Check if this is a table-qualified star (e.g., t.*)
|
|
191
|
+
if isinstance(projection.this, exp.Star):
|
|
192
|
+
source_table = projection.table
|
|
193
|
+
qualified_star_cols: List[str] = []
|
|
194
|
+
if source_table and first_select:
|
|
195
|
+
qualified_star_cols = self._resolve_qualified_star(
|
|
196
|
+
source_table, first_select
|
|
197
|
+
)
|
|
198
|
+
for col in qualified_star_cols:
|
|
199
|
+
qualified_name = f"{target_table}.{col}"
|
|
200
|
+
columns.append(qualified_name)
|
|
201
|
+
self._column_mapping[qualified_name] = col
|
|
202
|
+
if not qualified_star_cols:
|
|
203
|
+
# Fallback: can't resolve t.*, use * as column name
|
|
204
|
+
qualified_name = f"{target_table}.*"
|
|
205
|
+
columns.append(qualified_name)
|
|
206
|
+
self._column_mapping[qualified_name] = "*"
|
|
190
207
|
else:
|
|
191
|
-
|
|
192
|
-
column_name = source_expr.sql(dialect=self.dialect)
|
|
208
|
+
column_name = projection.name
|
|
193
209
|
lineage_name = column_name
|
|
194
|
-
|
|
195
|
-
|
|
196
|
-
|
|
197
|
-
|
|
198
|
-
|
|
210
|
+
# Qualify with target table
|
|
211
|
+
qualified_name = f"{target_table}.{column_name}"
|
|
212
|
+
columns.append(qualified_name)
|
|
213
|
+
self._column_mapping[qualified_name] = lineage_name
|
|
214
|
+
else:
|
|
215
|
+
# For expressions, use the SQL representation
|
|
216
|
+
column_name = projection.sql(dialect=self.dialect)
|
|
217
|
+
lineage_name = column_name
|
|
218
|
+
# Qualify with target table
|
|
219
|
+
qualified_name = f"{target_table}.{column_name}"
|
|
220
|
+
columns.append(qualified_name)
|
|
221
|
+
self._column_mapping[qualified_name] = lineage_name
|
|
199
222
|
|
|
200
223
|
else:
|
|
201
224
|
# DQL (pure SELECT): Use the SELECT columns as output
|
|
@@ -1342,8 +1365,18 @@ class LineageAnalyzer:
|
|
|
1342
1365
|
# Use the alias name as the column name
|
|
1343
1366
|
columns.append(projection.alias)
|
|
1344
1367
|
elif isinstance(projection, exp.Column):
|
|
1345
|
-
#
|
|
1346
|
-
|
|
1368
|
+
# Check if this is a table-qualified star (e.g., t.*)
|
|
1369
|
+
if isinstance(projection.this, exp.Star):
|
|
1370
|
+
# Resolve table-qualified star from known schema
|
|
1371
|
+
table_name = projection.table
|
|
1372
|
+
if table_name and first_select:
|
|
1373
|
+
qualified_star_cols = self._resolve_qualified_star(
|
|
1374
|
+
table_name, first_select
|
|
1375
|
+
)
|
|
1376
|
+
columns.extend(qualified_star_cols)
|
|
1377
|
+
else:
|
|
1378
|
+
# Use the column name
|
|
1379
|
+
columns.append(projection.name)
|
|
1347
1380
|
elif isinstance(projection, exp.Star):
|
|
1348
1381
|
# Resolve SELECT * from known schema
|
|
1349
1382
|
if first_select:
|
|
@@ -1375,6 +1408,58 @@ class LineageAnalyzer:
|
|
|
1375
1408
|
|
|
1376
1409
|
source = from_clause.this
|
|
1377
1410
|
|
|
1411
|
+
# Handle table reference from FROM clause
|
|
1412
|
+
columns.extend(self._resolve_source_columns(source, select_node))
|
|
1413
|
+
|
|
1414
|
+
# Handle JOIN clauses - collect columns from all joined tables
|
|
1415
|
+
joins = select_node.args.get("joins")
|
|
1416
|
+
if joins:
|
|
1417
|
+
for join in joins:
|
|
1418
|
+
if isinstance(join, exp.Join):
|
|
1419
|
+
join_source = join.this
|
|
1420
|
+
columns.extend(
|
|
1421
|
+
self._resolve_source_columns(join_source, select_node)
|
|
1422
|
+
)
|
|
1423
|
+
|
|
1424
|
+
# Handle LATERAL VIEW clauses - collect generated columns
|
|
1425
|
+
laterals = select_node.args.get("laterals")
|
|
1426
|
+
if laterals:
|
|
1427
|
+
for lateral in laterals:
|
|
1428
|
+
if isinstance(lateral, exp.Lateral):
|
|
1429
|
+
lateral_cols = self._resolve_lateral_columns(lateral)
|
|
1430
|
+
columns.extend(lateral_cols)
|
|
1431
|
+
|
|
1432
|
+
return columns
|
|
1433
|
+
|
|
1434
|
+
def _resolve_lateral_columns(self, lateral: exp.Lateral) -> List[str]:
|
|
1435
|
+
"""
|
|
1436
|
+
Extract generated column names from a LATERAL VIEW clause.
|
|
1437
|
+
|
|
1438
|
+
Args:
|
|
1439
|
+
lateral: The Lateral expression node
|
|
1440
|
+
|
|
1441
|
+
Returns:
|
|
1442
|
+
List of generated column names (e.g., ['elem'] for explode,
|
|
1443
|
+
['pos', 'elem'] for posexplode)
|
|
1444
|
+
"""
|
|
1445
|
+
# Use SQLGlot's built-in property to get alias column names
|
|
1446
|
+
return lateral.alias_column_names or []
|
|
1447
|
+
|
|
1448
|
+
def _resolve_source_columns(
|
|
1449
|
+
self, source: exp.Expression, select_node: exp.Select
|
|
1450
|
+
) -> List[str]:
|
|
1451
|
+
"""
|
|
1452
|
+
Resolve columns from a single source (table, subquery, etc.).
|
|
1453
|
+
|
|
1454
|
+
Args:
|
|
1455
|
+
source: The source expression (Table, Subquery, etc.)
|
|
1456
|
+
select_node: The containing SELECT node for CTE resolution
|
|
1457
|
+
|
|
1458
|
+
Returns:
|
|
1459
|
+
List of column names from the source
|
|
1460
|
+
"""
|
|
1461
|
+
columns: List[str] = []
|
|
1462
|
+
|
|
1378
1463
|
# Handle table reference
|
|
1379
1464
|
if isinstance(source, exp.Table):
|
|
1380
1465
|
source_name = self._get_qualified_table_name(source)
|
|
@@ -1387,11 +1472,100 @@ class LineageAnalyzer:
|
|
|
1387
1472
|
cte_columns = self._resolve_cte_columns(source_name, select_node)
|
|
1388
1473
|
columns.extend(cte_columns)
|
|
1389
1474
|
|
|
1390
|
-
# Handle subquery
|
|
1391
|
-
elif isinstance(source, exp.Subquery)
|
|
1392
|
-
#
|
|
1393
|
-
if source.alias in self._file_schema:
|
|
1475
|
+
# Handle subquery with alias
|
|
1476
|
+
elif isinstance(source, exp.Subquery):
|
|
1477
|
+
# First check if this subquery alias is in file schema
|
|
1478
|
+
if source.alias and source.alias in self._file_schema:
|
|
1394
1479
|
columns.extend(self._file_schema[source.alias].keys())
|
|
1480
|
+
else:
|
|
1481
|
+
# Extract columns from the subquery's SELECT
|
|
1482
|
+
inner_select = source.this
|
|
1483
|
+
if isinstance(inner_select, exp.Select):
|
|
1484
|
+
subquery_cols = self._extract_subquery_columns(inner_select)
|
|
1485
|
+
columns.extend(subquery_cols)
|
|
1486
|
+
|
|
1487
|
+
return columns
|
|
1488
|
+
|
|
1489
|
+
def _resolve_qualified_star(
|
|
1490
|
+
self, table_name: str, select_node: exp.Select
|
|
1491
|
+
) -> List[str]:
|
|
1492
|
+
"""
|
|
1493
|
+
Resolve a table-qualified star (e.g., t.*) to actual column names.
|
|
1494
|
+
|
|
1495
|
+
Args:
|
|
1496
|
+
table_name: The table/alias name qualifying the star
|
|
1497
|
+
select_node: The SELECT node for context
|
|
1498
|
+
|
|
1499
|
+
Returns:
|
|
1500
|
+
List of column names from the specified table
|
|
1501
|
+
"""
|
|
1502
|
+
# First check file schema
|
|
1503
|
+
if table_name in self._file_schema:
|
|
1504
|
+
return list(self._file_schema[table_name].keys())
|
|
1505
|
+
|
|
1506
|
+
# Check if it's a CTE reference
|
|
1507
|
+
cte_columns = self._resolve_cte_columns(table_name, select_node)
|
|
1508
|
+
if cte_columns:
|
|
1509
|
+
return cte_columns
|
|
1510
|
+
|
|
1511
|
+
# Check if the table name is an alias - need to resolve the actual table
|
|
1512
|
+
from_clause = select_node.args.get("from")
|
|
1513
|
+
if from_clause and isinstance(from_clause, exp.From):
|
|
1514
|
+
source = from_clause.this
|
|
1515
|
+
if isinstance(source, exp.Table) and source.alias == table_name:
|
|
1516
|
+
actual_name = self._get_qualified_table_name(source)
|
|
1517
|
+
if actual_name in self._file_schema:
|
|
1518
|
+
return list(self._file_schema[actual_name].keys())
|
|
1519
|
+
|
|
1520
|
+
# Check JOIN clauses for aliased tables
|
|
1521
|
+
joins = select_node.args.get("joins")
|
|
1522
|
+
if joins:
|
|
1523
|
+
for join in joins:
|
|
1524
|
+
if isinstance(join, exp.Join):
|
|
1525
|
+
join_source = join.this
|
|
1526
|
+
if (
|
|
1527
|
+
isinstance(join_source, exp.Table)
|
|
1528
|
+
and join_source.alias == table_name
|
|
1529
|
+
):
|
|
1530
|
+
actual_name = self._get_qualified_table_name(join_source)
|
|
1531
|
+
if actual_name in self._file_schema:
|
|
1532
|
+
return list(self._file_schema[actual_name].keys())
|
|
1533
|
+
|
|
1534
|
+
return []
|
|
1535
|
+
|
|
1536
|
+
def _extract_subquery_columns(self, subquery_select: exp.Select) -> List[str]:
    """
    Collect the output column names produced by a subquery's SELECT.

    Aliased projections contribute their alias, plain columns their name,
    ``t.*`` and ``*`` are expanded through the star resolvers, and any
    other expression contributes its SQL text in the analyzer's dialect.

    Args:
        subquery_select: The SELECT expression within the subquery

    Returns:
        List of column names, in projection order
    """
    names: List[str] = []

    for item in subquery_select.expressions:
        if isinstance(item, exp.Alias):
            names.append(item.alias)
            continue

        if isinstance(item, exp.Column):
            if not isinstance(item.this, exp.Star):
                # Ordinary column reference.
                names.append(item.name)
                continue
            # Table-qualified star (t.*); nothing is added when the
            # qualifier is missing.
            qualifier = item.table
            if qualifier:
                names.extend(
                    self._resolve_qualified_star(qualifier, subquery_select)
                )
            continue

        if isinstance(item, exp.Star):
            # Bare SELECT * inside the subquery.
            names.extend(self._resolve_star_columns(subquery_select))
            continue

        # Any other expression is identified by its rendered SQL.
        names.append(item.sql(dialect=self.dialect))

    return names
|
|
1397
1571
|
|
|
@@ -2549,3 +2549,220 @@ class TestCrossStatementLineage:
|
|
|
2549
2549
|
assert "output_table.b" in third_outputs
|
|
2550
2550
|
assert "output_table.c" in third_outputs
|
|
2551
2551
|
assert "output_table.row_num" in third_outputs
|
|
2552
|
+
|
|
2553
|
+
def test_select_star_from_join(self):
    """A star over a JOIN must expose the columns of every joined view."""
    sql = """
    CREATE VIEW v1 AS SELECT a, b FROM t1;
    CREATE VIEW v2 AS SELECT c, d FROM t2;
    CREATE VIEW v3 AS SELECT * FROM v1 JOIN v2 ON v1.a = v2.c;
    """
    results = LineageAnalyzer(sql, dialect="spark").analyze_queries(
        level=AnalysisLevel.COLUMN
    )

    assert len(results) == 3

    # v3 is the last statement; it must carry every column of v1 and v2.
    v3 = results[2]
    produced = {entry.output_name for entry in v3.lineage_items}
    for expected in ("v3.a", "v3.b", "v3.c", "v3.d"):
        assert expected in produced

    # ...and each of those columns must trace back to its originating view.
    consumed = {entry.source_name for entry in v3.lineage_items}
    for expected in ("v1.a", "v1.b", "v2.c", "v2.d"):
        assert expected in consumed
|
|
2579
|
+
|
|
2580
|
+
def test_nested_ctes_and_views_with_select_star(self):
    """SELECT * must resolve through chained views and nested CTEs."""
    sql = """
    CREATE VIEW v1 AS SELECT a, b FROM t1;
    CREATE VIEW v2 AS SELECT c, d FROM t2;
    CREATE VIEW v3 AS
    WITH cte1 AS (SELECT * FROM v1)
    SELECT * FROM cte1;
    CREATE VIEW v4 AS
    SELECT * FROM v3 JOIN v2 ON v3.a = v2.c;
    CREATE VIEW v5 AS
    WITH
    cte1 AS (SELECT * FROM v4),
    cte2 AS (SELECT * FROM cte1)
    SELECT * FROM cte2;
    """
    analyzer = LineageAnalyzer(sql, dialect="spark")
    results = analyzer.analyze_queries(level=AnalysisLevel.COLUMN)

    assert len(results) == 5

    # Each view registered in the file schema must expose exactly these
    # columns.
    expected_schema = {
        "v1": {"a", "b"},
        "v2": {"c", "d"},
        "v3": {"a", "b"},
        "v4": {"a", "b", "c", "d"},
        "v5": {"a", "b", "c", "d"},
    }
    for view, columns in expected_schema.items():
        assert view in analyzer._file_schema
        assert set(analyzer._file_schema[view].keys()) == columns

    # The last view must surface all four columns in its lineage output.
    v5 = results[4]
    produced = {entry.output_name for entry in v5.lineage_items}
    for expected in ("v5.a", "v5.b", "v5.c", "v5.d"):
        assert expected in produced
|
|
2624
|
+
|
|
2625
|
+
def test_select_star_from_subquery(self):
    """A star over a derived table must take its columns from the inner SELECT."""
    sql = """
    CREATE VIEW v1 AS SELECT a, b FROM t1;
    CREATE VIEW v2 AS SELECT * FROM (SELECT * FROM v1) sub;
    """
    analyzer = LineageAnalyzer(sql, dialect="spark")
    results = analyzer.analyze_queries(level=AnalysisLevel.COLUMN)

    assert len(results) == 2

    # v2 must expose the columns that flow through the subquery.
    v2 = results[1]
    produced = {entry.output_name for entry in v2.lineage_items}
    for expected in ("v2.a", "v2.b"):
        assert expected in produced

    # The recorded schema for v2 must match as well.
    assert set(analyzer._file_schema["v2"].keys()) == {"a", "b"}
|
|
2644
|
+
|
|
2645
|
+
def test_table_qualified_star(self):
    """A qualified star (v1.*) must expand to the columns of that view."""
    sql = """
    CREATE VIEW v1 AS SELECT a, b FROM t1;
    CREATE VIEW v2 AS SELECT c FROM t2;
    CREATE VIEW v3 AS SELECT v1.*, v2.c FROM v1 JOIN v2 ON v1.a = v2.c;
    """
    analyzer = LineageAnalyzer(sql, dialect="spark")
    results = analyzer.analyze_queries(level=AnalysisLevel.COLUMN)

    assert len(results) == 3

    # v3 combines the expanded v1.* with the explicit v2.c column.
    v3 = results[2]
    produced = {entry.output_name for entry in v3.lineage_items}
    for expected in ("v3.a", "v3.b", "v3.c"):
        assert expected in produced

    # The recorded schema for v3 must hold exactly those columns.
    assert set(analyzer._file_schema["v3"].keys()) == {"a", "b", "c"}
|
|
2666
|
+
|
|
2667
|
+
def test_table_qualified_star_with_alias(self):
    """A star qualified by an alias (x.*) must expand via the aliased view."""
    sql = """
    CREATE VIEW v1 AS SELECT a, b FROM t1;
    CREATE VIEW v2 AS SELECT c FROM t2;
    CREATE VIEW v3 AS SELECT x.*, y.c FROM v1 AS x JOIN v2 AS y ON x.a = y.c;
    """
    analyzer = LineageAnalyzer(sql, dialect="spark")
    results = analyzer.analyze_queries(level=AnalysisLevel.COLUMN)

    assert len(results) == 3

    # v3 combines the expanded x.* (v1) with the explicit y.c column (v2).
    v3 = results[2]
    produced = {entry.output_name for entry in v3.lineage_items}
    for expected in ("v3.a", "v3.b", "v3.c"):
        assert expected in produced

    # The recorded schema for v3 must hold exactly those columns.
    assert set(analyzer._file_schema["v3"].keys()) == {"a", "b", "c"}
|
|
2688
|
+
|
|
2689
|
+
|
|
2690
|
+
class TestLateralViewColumnResolution:
    """Checks that SELECT * picks up columns generated by LATERAL VIEW."""

    @staticmethod
    def _analyzed_schema(sql):
        """Run column-level analysis on ``sql`` and return the file schema."""
        analyzer = LineageAnalyzer(sql, dialect="spark")
        analyzer.analyze_queries(level=AnalysisLevel.COLUMN)
        return analyzer._file_schema

    def test_select_star_with_lateral_view_explode(self):
        """explode() adds its generated column to the star expansion."""
        sql = """
        CREATE VIEW v1 AS SELECT arr FROM t1;
        CREATE VIEW v2 AS SELECT * FROM v1 LATERAL VIEW explode(arr) t AS elem;
        """
        schema = self._analyzed_schema(sql)

        # v2 carries the source column plus the exploded element.
        assert "v2" in schema
        assert set(schema["v2"].keys()) == {"arr", "elem"}

    def test_select_star_with_lateral_view_posexplode(self):
        """posexplode() adds both its position and element columns."""
        sql = """
        CREATE VIEW v1 AS SELECT arr FROM t1;
        CREATE VIEW v2 AS SELECT * FROM v1 LATERAL VIEW posexplode(arr) t AS pos, elem;
        """
        schema = self._analyzed_schema(sql)

        # v2 carries the source column plus position and element.
        assert "v2" in schema
        assert set(schema["v2"].keys()) == {"arr", "pos", "elem"}

    def test_select_star_with_multiple_lateral_views(self):
        """Every LATERAL VIEW in the query contributes its columns."""
        sql = """
        CREATE VIEW v1 AS SELECT arr1, arr2 FROM t1;
        CREATE VIEW v2 AS
        SELECT * FROM v1
        LATERAL VIEW explode(arr1) t1 AS elem1
        LATERAL VIEW explode(arr2) t2 AS elem2;
        """
        schema = self._analyzed_schema(sql)

        # v2 carries both source columns and both exploded elements.
        assert "v2" in schema
        assert set(schema["v2"].keys()) == {
            "arr1",
            "arr2",
            "elem1",
            "elem2",
        }

    def test_select_star_with_lateral_view_outer(self):
        """LATERAL VIEW OUTER resolves the same columns as LATERAL VIEW."""
        sql = """
        CREATE VIEW v1 AS SELECT arr FROM t1;
        CREATE VIEW v2 AS SELECT * FROM v1 LATERAL VIEW OUTER explode(arr) t AS elem;
        """
        schema = self._analyzed_schema(sql)

        # v2 carries the source column plus the exploded element.
        assert "v2" in schema
        assert set(schema["v2"].keys()) == {"arr", "elem"}

    def test_lateral_view_with_join(self):
        """A JOIN followed by a LATERAL VIEW resolves columns from all three."""
        sql = """
        CREATE VIEW v1 AS SELECT id, arr FROM t1;
        CREATE VIEW v2 AS SELECT name FROM t2;
        CREATE VIEW v3 AS
        SELECT * FROM v1
        JOIN v2 ON v1.id = v2.name
        LATERAL VIEW explode(arr) t AS elem;
        """
        schema = self._analyzed_schema(sql)

        # v3 carries v1's columns, v2's column and the exploded element.
        assert "v3" in schema
        assert set(schema["v3"].keys()) == {"id", "arr", "name", "elem"}
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{sql_glider-0.1.4 → sql_glider-0.1.5}/sample_data_model/business/update_dim_customer_metrics.sql
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{sql_glider-0.1.4 → sql_glider-0.1.5}/sample_data_model/incremental/incr_pres_sales_summary.sql
RENAMED
|
File without changes
|
{sql_glider-0.1.4 → sql_glider-0.1.5}/sample_data_model/maintenance/delete_expired_customers.sql
RENAMED
|
File without changes
|
{sql_glider-0.1.4 → sql_glider-0.1.5}/sample_data_model/maintenance/update_product_status.sql
RENAMED
|
File without changes
|
{sql_glider-0.1.4 → sql_glider-0.1.5}/sample_data_model/presentation/load_pres_customer_360.sql
RENAMED
|
File without changes
|
{sql_glider-0.1.4 → sql_glider-0.1.5}/sample_data_model/presentation/load_pres_customer_cohort.sql
RENAMED
|
File without changes
|
|
File without changes
|
{sql_glider-0.1.4 → sql_glider-0.1.5}/sample_data_model/presentation/load_pres_sales_summary.sql
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{sql_glider-0.1.4 → sql_glider-0.1.5}/tests/fixtures/multi_file_queries/analytics_pipeline.sql
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{sql_glider-0.1.4 → sql_glider-0.1.5}/tests/fixtures/multi_file_queries/view_based_merge.sql
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{sql_glider-0.1.4 → sql_glider-0.1.5}/tests/fixtures/original_queries/test_view_window_cte.sql
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|