sql-glider 0.1.3__tar.gz → 0.1.4__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {sql_glider-0.1.3 → sql_glider-0.1.4}/PKG-INFO +1 -1
- {sql_glider-0.1.3 → sql_glider-0.1.4}/src/sqlglider/_version.py +2 -2
- {sql_glider-0.1.3 → sql_glider-0.1.4}/src/sqlglider/lineage/analyzer.py +216 -2
- sql_glider-0.1.4/tests/fixtures/original_queries/test_view_window_cte.sql +27 -0
- {sql_glider-0.1.3 → sql_glider-0.1.4}/tests/sqlglider/graph/test_builder.py +150 -0
- {sql_glider-0.1.3 → sql_glider-0.1.4}/tests/sqlglider/lineage/test_analyzer.py +287 -0
- {sql_glider-0.1.3 → sql_glider-0.1.4}/.github/workflows/ci.yml +0 -0
- {sql_glider-0.1.3 → sql_glider-0.1.4}/.github/workflows/publish.yml +0 -0
- {sql_glider-0.1.3 → sql_glider-0.1.4}/.gitignore +0 -0
- {sql_glider-0.1.3 → sql_glider-0.1.4}/.python-version +0 -0
- {sql_glider-0.1.3 → sql_glider-0.1.4}/ARCHITECTURE.md +0 -0
- {sql_glider-0.1.3 → sql_glider-0.1.4}/CLAUDE.md +0 -0
- {sql_glider-0.1.3 → sql_glider-0.1.4}/LICENSE +0 -0
- {sql_glider-0.1.3 → sql_glider-0.1.4}/README.md +0 -0
- {sql_glider-0.1.3 → sql_glider-0.1.4}/plans/2025-12-05-column-level-lineage.md +0 -0
- {sql_glider-0.1.3 → sql_glider-0.1.4}/plans/2025-12-05-reverse-lineage.md +0 -0
- {sql_glider-0.1.3 → sql_glider-0.1.4}/plans/2025-12-06-config-file-support.md +0 -0
- {sql_glider-0.1.3 → sql_glider-0.1.4}/plans/2025-12-06-graph-lineage.md +0 -0
- {sql_glider-0.1.3 → sql_glider-0.1.4}/plans/2025-12-06-unify-single-multi-query.md +0 -0
- {sql_glider-0.1.3 → sql_glider-0.1.4}/plans/2025-12-07-sample-data-model.md +0 -0
- {sql_glider-0.1.3 → sql_glider-0.1.4}/plans/2025-12-07-sql-templating.md +0 -0
- {sql_glider-0.1.3 → sql_glider-0.1.4}/plans/2025-12-08-tables-command.md +0 -0
- {sql_glider-0.1.3 → sql_glider-0.1.4}/plans/2025-12-09-graph-query-paths.md +0 -0
- {sql_glider-0.1.3 → sql_glider-0.1.4}/plans/2025-12-13-dissect-command.md +0 -0
- {sql_glider-0.1.3 → sql_glider-0.1.4}/plans/2025-12-14-tables-pull-command.md +0 -0
- {sql_glider-0.1.3 → sql_glider-0.1.4}/plans/2026-01-25-fix-union-lineage-chain.md +0 -0
- {sql_glider-0.1.3 → sql_glider-0.1.4}/pyproject.toml +0 -0
- {sql_glider-0.1.3 → sql_glider-0.1.4}/sample_data_model/README.md +0 -0
- {sql_glider-0.1.3 → sql_glider-0.1.4}/sample_data_model/business/expire_dim_customer.sql +0 -0
- {sql_glider-0.1.3 → sql_glider-0.1.4}/sample_data_model/business/load_fact_orders.sql +0 -0
- {sql_glider-0.1.3 → sql_glider-0.1.4}/sample_data_model/business/load_fact_payments.sql +0 -0
- {sql_glider-0.1.3 → sql_glider-0.1.4}/sample_data_model/business/merge_dim_customer.sql +0 -0
- {sql_glider-0.1.3 → sql_glider-0.1.4}/sample_data_model/business/merge_dim_product.sql +0 -0
- {sql_glider-0.1.3 → sql_glider-0.1.4}/sample_data_model/business/update_dim_customer_metrics.sql +0 -0
- {sql_glider-0.1.3 → sql_glider-0.1.4}/sample_data_model/complex/conditional_merge.sql +0 -0
- {sql_glider-0.1.3 → sql_glider-0.1.4}/sample_data_model/complex/cte_insert.sql +0 -0
- {sql_glider-0.1.3 → sql_glider-0.1.4}/sample_data_model/complex/multi_table_transform.sql +0 -0
- {sql_glider-0.1.3 → sql_glider-0.1.4}/sample_data_model/ddl/dim_customer.sql +0 -0
- {sql_glider-0.1.3 → sql_glider-0.1.4}/sample_data_model/ddl/dim_product.sql +0 -0
- {sql_glider-0.1.3 → sql_glider-0.1.4}/sample_data_model/ddl/fact_orders.sql +0 -0
- {sql_glider-0.1.3 → sql_glider-0.1.4}/sample_data_model/ddl/fact_payments.sql +0 -0
- {sql_glider-0.1.3 → sql_glider-0.1.4}/sample_data_model/ddl/raw_addresses.sql +0 -0
- {sql_glider-0.1.3 → sql_glider-0.1.4}/sample_data_model/ddl/raw_customers.sql +0 -0
- {sql_glider-0.1.3 → sql_glider-0.1.4}/sample_data_model/ddl/raw_order_items.sql +0 -0
- {sql_glider-0.1.3 → sql_glider-0.1.4}/sample_data_model/ddl/raw_orders.sql +0 -0
- {sql_glider-0.1.3 → sql_glider-0.1.4}/sample_data_model/ddl/raw_payments.sql +0 -0
- {sql_glider-0.1.3 → sql_glider-0.1.4}/sample_data_model/ddl/raw_products.sql +0 -0
- {sql_glider-0.1.3 → sql_glider-0.1.4}/sample_data_model/ddl/stg_customers.sql +0 -0
- {sql_glider-0.1.3 → sql_glider-0.1.4}/sample_data_model/ddl/stg_orders.sql +0 -0
- {sql_glider-0.1.3 → sql_glider-0.1.4}/sample_data_model/ddl/stg_payments.sql +0 -0
- {sql_glider-0.1.3 → sql_glider-0.1.4}/sample_data_model/ddl/stg_products.sql +0 -0
- {sql_glider-0.1.3 → sql_glider-0.1.4}/sample_data_model/incremental/incr_fact_orders.sql +0 -0
- {sql_glider-0.1.3 → sql_glider-0.1.4}/sample_data_model/incremental/incr_fact_payments.sql +0 -0
- {sql_glider-0.1.3 → sql_glider-0.1.4}/sample_data_model/incremental/incr_pres_sales_summary.sql +0 -0
- {sql_glider-0.1.3 → sql_glider-0.1.4}/sample_data_model/maintenance/delete_expired_customers.sql +0 -0
- {sql_glider-0.1.3 → sql_glider-0.1.4}/sample_data_model/maintenance/update_product_status.sql +0 -0
- {sql_glider-0.1.3 → sql_glider-0.1.4}/sample_data_model/presentation/load_pres_customer_360.sql +0 -0
- {sql_glider-0.1.3 → sql_glider-0.1.4}/sample_data_model/presentation/load_pres_customer_cohort.sql +0 -0
- {sql_glider-0.1.3 → sql_glider-0.1.4}/sample_data_model/presentation/load_pres_product_performance.sql +0 -0
- {sql_glider-0.1.3 → sql_glider-0.1.4}/sample_data_model/presentation/load_pres_sales_summary.sql +0 -0
- {sql_glider-0.1.3 → sql_glider-0.1.4}/sample_data_model/staging/load_stg_customers.sql +0 -0
- {sql_glider-0.1.3 → sql_glider-0.1.4}/sample_data_model/staging/load_stg_orders.sql +0 -0
- {sql_glider-0.1.3 → sql_glider-0.1.4}/sample_data_model/staging/load_stg_payments.sql +0 -0
- {sql_glider-0.1.3 → sql_glider-0.1.4}/sample_data_model/staging/load_stg_products.sql +0 -0
- {sql_glider-0.1.3 → sql_glider-0.1.4}/sqlglider.toml.example +0 -0
- {sql_glider-0.1.3 → sql_glider-0.1.4}/src/sqlglider/__init__.py +0 -0
- {sql_glider-0.1.3 → sql_glider-0.1.4}/src/sqlglider/catalog/__init__.py +0 -0
- {sql_glider-0.1.3 → sql_glider-0.1.4}/src/sqlglider/catalog/base.py +0 -0
- {sql_glider-0.1.3 → sql_glider-0.1.4}/src/sqlglider/catalog/databricks.py +0 -0
- {sql_glider-0.1.3 → sql_glider-0.1.4}/src/sqlglider/catalog/registry.py +0 -0
- {sql_glider-0.1.3 → sql_glider-0.1.4}/src/sqlglider/cli.py +0 -0
- {sql_glider-0.1.3 → sql_glider-0.1.4}/src/sqlglider/dissection/__init__.py +0 -0
- {sql_glider-0.1.3 → sql_glider-0.1.4}/src/sqlglider/dissection/analyzer.py +0 -0
- {sql_glider-0.1.3 → sql_glider-0.1.4}/src/sqlglider/dissection/formatters.py +0 -0
- {sql_glider-0.1.3 → sql_glider-0.1.4}/src/sqlglider/dissection/models.py +0 -0
- {sql_glider-0.1.3 → sql_glider-0.1.4}/src/sqlglider/global_models.py +0 -0
- {sql_glider-0.1.3 → sql_glider-0.1.4}/src/sqlglider/graph/__init__.py +0 -0
- {sql_glider-0.1.3 → sql_glider-0.1.4}/src/sqlglider/graph/builder.py +0 -0
- {sql_glider-0.1.3 → sql_glider-0.1.4}/src/sqlglider/graph/merge.py +0 -0
- {sql_glider-0.1.3 → sql_glider-0.1.4}/src/sqlglider/graph/models.py +0 -0
- {sql_glider-0.1.3 → sql_glider-0.1.4}/src/sqlglider/graph/query.py +0 -0
- {sql_glider-0.1.3 → sql_glider-0.1.4}/src/sqlglider/graph/serialization.py +0 -0
- {sql_glider-0.1.3 → sql_glider-0.1.4}/src/sqlglider/lineage/__init__.py +0 -0
- {sql_glider-0.1.3 → sql_glider-0.1.4}/src/sqlglider/lineage/formatters.py +0 -0
- {sql_glider-0.1.3 → sql_glider-0.1.4}/src/sqlglider/templating/__init__.py +0 -0
- {sql_glider-0.1.3 → sql_glider-0.1.4}/src/sqlglider/templating/base.py +0 -0
- {sql_glider-0.1.3 → sql_glider-0.1.4}/src/sqlglider/templating/jinja.py +0 -0
- {sql_glider-0.1.3 → sql_glider-0.1.4}/src/sqlglider/templating/registry.py +0 -0
- {sql_glider-0.1.3 → sql_glider-0.1.4}/src/sqlglider/templating/variables.py +0 -0
- {sql_glider-0.1.3 → sql_glider-0.1.4}/src/sqlglider/utils/__init__.py +0 -0
- {sql_glider-0.1.3 → sql_glider-0.1.4}/src/sqlglider/utils/config.py +0 -0
- {sql_glider-0.1.3 → sql_glider-0.1.4}/src/sqlglider/utils/file_utils.py +0 -0
- {sql_glider-0.1.3 → sql_glider-0.1.4}/tests/__init__.py +0 -0
- {sql_glider-0.1.3 → sql_glider-0.1.4}/tests/fixtures/multi_file_queries/analytics_pipeline.sql +0 -0
- {sql_glider-0.1.3 → sql_glider-0.1.4}/tests/fixtures/multi_file_queries/analytics_pipeline_union_merge.sql +0 -0
- {sql_glider-0.1.3 → sql_glider-0.1.4}/tests/fixtures/multi_file_queries/customers.sql +0 -0
- {sql_glider-0.1.3 → sql_glider-0.1.4}/tests/fixtures/multi_file_queries/orders.sql +0 -0
- {sql_glider-0.1.3 → sql_glider-0.1.4}/tests/fixtures/multi_file_queries/reports.sql +0 -0
- {sql_glider-0.1.3 → sql_glider-0.1.4}/tests/fixtures/multi_file_queries/view_based_merge.sql +0 -0
- {sql_glider-0.1.3 → sql_glider-0.1.4}/tests/fixtures/original_queries/test_cte.sql +0 -0
- {sql_glider-0.1.3 → sql_glider-0.1.4}/tests/fixtures/original_queries/test_cte_query.sql +0 -0
- {sql_glider-0.1.3 → sql_glider-0.1.4}/tests/fixtures/original_queries/test_generated_column_query.sql +0 -0
- {sql_glider-0.1.3 → sql_glider-0.1.4}/tests/fixtures/original_queries/test_multi.sql +0 -0
- {sql_glider-0.1.3 → sql_glider-0.1.4}/tests/fixtures/original_queries/test_multi_query.sql +0 -0
- {sql_glider-0.1.3 → sql_glider-0.1.4}/tests/fixtures/original_queries/test_single_query.sql +0 -0
- {sql_glider-0.1.3 → sql_glider-0.1.4}/tests/fixtures/original_queries/test_subquery.sql +0 -0
- {sql_glider-0.1.3 → sql_glider-0.1.4}/tests/fixtures/original_queries/test_tables.sql +0 -0
- {sql_glider-0.1.3 → sql_glider-0.1.4}/tests/fixtures/original_queries/test_view.sql +0 -0
- {sql_glider-0.1.3 → sql_glider-0.1.4}/tests/fixtures/sample_manifest.csv +0 -0
- {sql_glider-0.1.3 → sql_glider-0.1.4}/tests/sqlglider/__init__.py +0 -0
- {sql_glider-0.1.3 → sql_glider-0.1.4}/tests/sqlglider/catalog/__init__.py +0 -0
- {sql_glider-0.1.3 → sql_glider-0.1.4}/tests/sqlglider/catalog/test_base.py +0 -0
- {sql_glider-0.1.3 → sql_glider-0.1.4}/tests/sqlglider/catalog/test_databricks.py +0 -0
- {sql_glider-0.1.3 → sql_glider-0.1.4}/tests/sqlglider/catalog/test_registry.py +0 -0
- {sql_glider-0.1.3 → sql_glider-0.1.4}/tests/sqlglider/dissection/__init__.py +0 -0
- {sql_glider-0.1.3 → sql_glider-0.1.4}/tests/sqlglider/dissection/test_analyzer.py +0 -0
- {sql_glider-0.1.3 → sql_glider-0.1.4}/tests/sqlglider/dissection/test_formatters.py +0 -0
- {sql_glider-0.1.3 → sql_glider-0.1.4}/tests/sqlglider/dissection/test_models.py +0 -0
- {sql_glider-0.1.3 → sql_glider-0.1.4}/tests/sqlglider/graph/__init__.py +0 -0
- {sql_glider-0.1.3 → sql_glider-0.1.4}/tests/sqlglider/graph/test_merge.py +0 -0
- {sql_glider-0.1.3 → sql_glider-0.1.4}/tests/sqlglider/graph/test_models.py +0 -0
- {sql_glider-0.1.3 → sql_glider-0.1.4}/tests/sqlglider/graph/test_query.py +0 -0
- {sql_glider-0.1.3 → sql_glider-0.1.4}/tests/sqlglider/graph/test_serialization.py +0 -0
- {sql_glider-0.1.3 → sql_glider-0.1.4}/tests/sqlglider/lineage/__init__.py +0 -0
- {sql_glider-0.1.3 → sql_glider-0.1.4}/tests/sqlglider/lineage/test_formatters.py +0 -0
- {sql_glider-0.1.3 → sql_glider-0.1.4}/tests/sqlglider/templating/__init__.py +0 -0
- {sql_glider-0.1.3 → sql_glider-0.1.4}/tests/sqlglider/templating/test_base.py +0 -0
- {sql_glider-0.1.3 → sql_glider-0.1.4}/tests/sqlglider/templating/test_jinja.py +0 -0
- {sql_glider-0.1.3 → sql_glider-0.1.4}/tests/sqlglider/templating/test_registry.py +0 -0
- {sql_glider-0.1.3 → sql_glider-0.1.4}/tests/sqlglider/templating/test_variables.py +0 -0
- {sql_glider-0.1.3 → sql_glider-0.1.4}/tests/sqlglider/test_cli.py +0 -0
- {sql_glider-0.1.3 → sql_glider-0.1.4}/tests/sqlglider/utils/__init__.py +0 -0
- {sql_glider-0.1.3 → sql_glider-0.1.4}/tests/sqlglider/utils/test_config.py +0 -0
- {sql_glider-0.1.3 → sql_glider-0.1.4}/tests/sqlglider/utils/test_file_utils.py +0 -0
- {sql_glider-0.1.3 → sql_glider-0.1.4}/uv.lock +0 -0
{sql_glider-0.1.3 → sql_glider-0.1.4}/PKG-INFO

```diff
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: sql-glider
-Version: 0.1.3
+Version: 0.1.4
 Summary: SQL Utility Toolkit for better understanding, use, and governance of your queries in a native environment.
 Project-URL: Homepage, https://github.com/rycowhi/sql-glider/
 Project-URL: Repository, https://github.com/rycowhi/sql-glider/
```
{sql_glider-0.1.3 → sql_glider-0.1.4}/src/sqlglider/_version.py

```diff
@@ -28,7 +28,7 @@ version_tuple: VERSION_TUPLE
 commit_id: COMMIT_ID
 __commit_id__: COMMIT_ID
 
-__version__ = version = '0.1.3'
-__version_tuple__ = version_tuple = (0, 1, 3)
+__version__ = version = '0.1.4'
+__version_tuple__ = version_tuple = (0, 1, 4)
 
 __commit_id__ = commit_id = None
```
{sql_glider-0.1.3 → sql_glider-0.1.4}/src/sqlglider/lineage/analyzer.py

```diff
@@ -1,7 +1,7 @@
 """Core lineage analysis using SQLGlot."""
 
 from enum import Enum
-from typing import Callable, Iterator, List, Optional, Set, Tuple, Union
+from typing import Callable, Dict, Iterator, List, Optional, Set, Tuple, Union
 
 from pydantic import BaseModel, Field
 from sqlglot import exp, parse
```
```diff
@@ -99,6 +99,9 @@ class LineageAnalyzer:
         self.sql = sql
         self.dialect = dialect
         self._skipped_queries: List[SkippedQuery] = []
+        # File-scoped schema context for cross-statement lineage
+        # Maps table/view names to their column definitions
+        self._file_schema: Dict[str, Dict[str, str]] = {}
 
         try:
             # Parse all statements in the SQL string
```
```diff
@@ -156,7 +159,24 @@
         # DML/DDL: Use target table for output column qualification
         # The columns are from the SELECT, but qualified with the target table
         projections = self._get_select_projections(select_node)
+        first_select = self._get_first_select(select_node)
+
         for projection in projections:
+            # Handle SELECT * by resolving from file schema
+            if isinstance(projection, exp.Star):
+                if first_select:
+                    star_columns = self._resolve_star_columns(first_select)
+                    for star_col in star_columns:
+                        qualified_name = f"{target_table}.{star_col}"
+                        columns.append(qualified_name)
+                        self._column_mapping[qualified_name] = star_col
+                if not columns:
+                    # Fallback: can't resolve *, use * as column name
+                    qualified_name = f"{target_table}.*"
+                    columns.append(qualified_name)
+                    self._column_mapping[qualified_name] = "*"
+                continue
+
             # Get the underlying expression (unwrap alias if present)
             if isinstance(projection, exp.Alias):
                 # For aliased columns, use the alias as the column name
```
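The new `exp.Star` branch above keys off how SQLGlot parses a bare `*` projection in the SELECT that feeds a DML statement. A minimal sketch of what the branch matches (illustrative only; the table names are made up):

```python
from sqlglot import exp, parse_one

# A bare `*` in the SELECT feeding an INSERT parses as exp.Star, which the
# Alias/Column handling further down does not cover.
stmt = parse_one("INSERT INTO target_table SELECT * FROM some_view", dialect="spark")
select_node = stmt.expression  # the SELECT inside the INSERT

for projection in select_node.expressions:
    print(isinstance(projection, exp.Star))  # True
```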
```diff
@@ -324,6 +344,7 @@
         """
         results = []
         self._skipped_queries = []  # Reset skipped queries for this analysis
+        self._file_schema = {}  # Reset file schema for this analysis run
 
         for query_index, expr, preview in self._iterate_queries(table_filter):
             # Temporarily swap self.expr to analyze this query
```
```diff
@@ -375,6 +396,9 @@
                     )
                 )
             finally:
+                # Extract schema from this statement AFTER analysis
+                # This builds up context for subsequent statements to use
+                self._extract_schema_from_statement(expr)
                 # Restore original expression
                 self.expr = original_expr
 
```
```diff
@@ -702,7 +726,13 @@
             lineage_col = self._column_mapping.get(col, col)
 
             # Get lineage tree for this column using current query SQL only
-            node = lineage(lineage_col, current_query_sql, dialect=self.dialect)
+            # Pass file schema to enable SELECT * expansion for known tables/views
+            node = lineage(
+                lineage_col,
+                current_query_sql,
+                dialect=self.dialect,
+                schema=self._file_schema if self._file_schema else None,
+            )
 
             # Collect all source columns
             sources: Set[str] = set()
```
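The `schema=` argument is what lets SQLGlot expand `SELECT *` over tables it would otherwise know nothing about. A standalone sketch of the effect, using the same `table -> {column: type}` mapping shape that `_file_schema` stores (the view and columns are borrowed from the new test fixture below):

```python
from sqlglot.lineage import lineage

# "UNKNOWN" suffices as a type because SQLGlot only needs column names
# to expand the star.
schema = {"first_view": {"a": "UNKNOWN", "b": "UNKNOWN", "c": "UNKNOWN"}}

node = lineage("a", "SELECT * FROM first_view", dialect="spark", schema=schema)
for n in node.walk():
    print(n.name)  # walks from the output column down to first_view.a
```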
```diff
@@ -1235,3 +1265,187 @@
         preview = self._generate_query_preview(expr)
 
         yield idx, expr, preview
+
+    # -------------------------------------------------------------------------
+    # File-scoped schema context methods
+    # -------------------------------------------------------------------------
+
+    def _extract_schema_from_statement(self, expr: exp.Expression) -> None:
+        """
+        Extract column definitions from CREATE VIEW/TABLE AS SELECT statements.
+
+        This method builds up file-scoped schema context as statements are processed,
+        enabling SQLGlot to correctly expand SELECT * and trace cross-statement references.
+
+        Args:
+            expr: The SQL expression to extract schema from
+        """
+        # Only handle CREATE VIEW or CREATE TABLE (AS SELECT)
+        if not isinstance(expr, exp.Create):
+            return
+        if expr.kind not in ("VIEW", "TABLE"):
+            return
+
+        # Get target table/view name
+        target = expr.this
+        if isinstance(target, exp.Schema):
+            target = target.this
+        if not isinstance(target, exp.Table):
+            return
+
+        target_name = self._get_qualified_table_name(target)
+
+        # Get the SELECT node from the CREATE statement
+        select_node = expr.expression
+        if select_node is None:
+            return
+
+        # Handle Subquery wrapper (e.g., CREATE VIEW AS (SELECT ...))
+        if isinstance(select_node, exp.Subquery):
+            select_node = select_node.this
+
+        if not isinstance(
+            select_node, (exp.Select, exp.Union, exp.Intersect, exp.Except)
+        ):
+            return
+
+        # Extract column names from the SELECT
+        columns = self._extract_columns_from_select(select_node)
+
+        if columns:
+            # Store with UNKNOWN type - SQLGlot only needs column names for expansion
+            self._file_schema[target_name] = {col: "UNKNOWN" for col in columns}
+
+    def _extract_columns_from_select(
+        self, select_node: Union[exp.Select, exp.Union, exp.Intersect, exp.Except]
+    ) -> List[str]:
+        """
+        Extract column names from a SELECT statement.
+
+        Handles aliases, direct column references, and SELECT * by resolving
+        against the known file schema.
+
+        Args:
+            select_node: The SELECT or set operation expression
+
+        Returns:
+            List of column names
+        """
+        columns: List[str] = []
+
+        # Get projections (for UNION, use first branch)
+        projections = self._get_select_projections(select_node)
+        first_select = self._get_first_select(select_node)
+
+        for projection in projections:
+            if isinstance(projection, exp.Alias):
+                # Use the alias name as the column name
+                columns.append(projection.alias)
+            elif isinstance(projection, exp.Column):
+                # Use the column name
+                columns.append(projection.name)
+            elif isinstance(projection, exp.Star):
+                # Resolve SELECT * from known schema
+                if first_select:
+                    star_columns = self._resolve_star_columns(first_select)
+                    columns.extend(star_columns)
+            else:
+                # For expressions without alias, use SQL representation
+                col_sql = projection.sql(dialect=self.dialect)
+                columns.append(col_sql)
+
+        return columns
+
+    def _resolve_star_columns(self, select_node: exp.Select) -> List[str]:
+        """
+        Resolve SELECT * to actual column names from known file schema or CTEs.
+
+        Args:
+            select_node: The SELECT node containing the * reference
+
+        Returns:
+            List of column names if source is known, empty list otherwise
+        """
+        columns: List[str] = []
+
+        # Get the source table(s) from FROM clause
+        from_clause = select_node.args.get("from")
+        if not from_clause or not isinstance(from_clause, exp.From):
+            return columns
+
+        source = from_clause.this
+
+        # Handle table reference
+        if isinstance(source, exp.Table):
+            source_name = self._get_qualified_table_name(source)
+
+            # First check file schema (views/tables from previous statements)
+            if source_name in self._file_schema:
+                columns.extend(self._file_schema[source_name].keys())
+            else:
+                # Check if this is a CTE reference within the same statement
+                cte_columns = self._resolve_cte_columns(source_name, select_node)
+                columns.extend(cte_columns)
+
+        # Handle subquery - can't resolve without deeper analysis
+        elif isinstance(source, exp.Subquery) and source.alias:
+            # Check if this subquery alias is in file schema (unlikely)
+            if source.alias in self._file_schema:
+                columns.extend(self._file_schema[source.alias].keys())
+
+        return columns
+
+    def _resolve_cte_columns(self, cte_name: str, select_node: exp.Select) -> List[str]:
+        """
+        Resolve columns from a CTE definition within the same statement.
+
+        Args:
+            cte_name: Name of the CTE to resolve
+            select_node: The SELECT node that references the CTE
+
+        Returns:
+            List of column names from the CTE, empty if CTE not found
+        """
+        # Walk up the tree to find the WITH clause containing this CTE
+        parent = select_node
+        while parent:
+            if hasattr(parent, "args") and parent.args.get("with"):
+                with_clause = parent.args["with"]
+                for cte in with_clause.expressions:
+                    if isinstance(cte, exp.CTE) and cte.alias == cte_name:
+                        # Found the CTE - extract its columns
+                        cte_select = cte.this
+                        if isinstance(cte_select, exp.Select):
+                            return self._extract_cte_select_columns(cte_select)
+            parent = parent.parent if hasattr(parent, "parent") else None
+
+        return []
+
+    def _extract_cte_select_columns(self, cte_select: exp.Select) -> List[str]:
+        """
+        Extract column names from a CTE's SELECT statement.
+
+        This handles SELECT * within the CTE by resolving against file schema.
+
+        Args:
+            cte_select: The SELECT expression within the CTE
+
+        Returns:
+            List of column names
+        """
+        columns: List[str] = []
+
+        for projection in cte_select.expressions:
+            if isinstance(projection, exp.Alias):
+                columns.append(projection.alias)
+            elif isinstance(projection, exp.Column):
+                columns.append(projection.name)
+            elif isinstance(projection, exp.Star):
+                # Resolve SELECT * in CTE from file schema
+                star_columns = self._resolve_star_columns(cte_select)
+                columns.extend(star_columns)
+            else:
+                col_sql = projection.sql(dialect=self.dialect)
+                columns.append(col_sql)
+
+        return columns
```
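Condensed, the schema-extraction path added above amounts to: for each `CREATE ... AS SELECT`, record the projected names under the target name. A self-contained sketch of that flow (illustrative only; it skips the UNION, subquery, and CTE branches the real methods handle):

```python
from sqlglot import exp, parse_one

stmt = parse_one("CREATE VIEW v AS SELECT a, b + 1 AS c FROM t", dialect="spark")

file_schema = {}
if isinstance(stmt, exp.Create) and stmt.kind in ("VIEW", "TABLE"):
    target = stmt.this
    if isinstance(target, exp.Schema):  # CREATE TABLE t (col, ...) AS SELECT
        target = target.this
    select_node = stmt.expression
    if isinstance(select_node, exp.Select):
        # alias_or_name covers both plain columns and aliased expressions
        columns = [p.alias_or_name for p in select_node.expressions]
        file_schema[target.name] = {col: "UNKNOWN" for col in columns}

print(file_schema)  # {'v': {'a': 'UNKNOWN', 'c': 'UNKNOWN'}}
```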
sql_glider-0.1.4/tests/fixtures/original_queries/test_view_window_cte.sql (new file)

```diff
@@ -0,0 +1,27 @@
+CREATE TEMPORARY VIEW first_view AS (
+    SELECT
+        a,
+        b,
+        c
+    FROM source_table
+);
+
+CREATE TEMPORARY VIEW second_view AS
+WITH first_view_cte AS (
+    SELECT
+        *,
+        row_number() OVER (
+            PARTITION BY a ORDER BY b DESC
+        ) AS row_num
+    FROM first_view
+)
+SELECT * FROM first_view_cte
+WHERE c = 1;
+
+INSERT OVERWRITE output_table
+SELECT
+    a,
+    b,
+    c,
+    row_num
+FROM second_view;
```
{sql_glider-0.1.3 → sql_glider-0.1.4}/tests/sqlglider/graph/test_builder.py

```diff
@@ -428,3 +428,153 @@ class TestGraphBuilderInsertWithUnion:
         assert "db.source_a.last" in upstream_ids
         assert "db.source_b.first" in upstream_ids
         assert "db.source_b.last" in upstream_ids
+
+
+class TestGraphBuilderCreateViewWithCTEAndWindowFunction:
+    """Tests for CREATE VIEW statements with CTEs and window functions."""
+
+    def test_create_view_with_cte_and_row_number(self, tmp_path):
+        """CREATE VIEW with CTE and ROW_NUMBER() OVER (PARTITION BY ...) should work."""
+        sql_file = tmp_path / "query.sql"
+        sql_file.write_text("""
+        CREATE VIEW my_view AS
+        WITH ranked_orders AS (
+            SELECT
+                customer_id,
+                order_date,
+                amount,
+                ROW_NUMBER() OVER (PARTITION BY customer_id ORDER BY order_date DESC) as rn
+            FROM orders
+        )
+        SELECT customer_id, order_date, amount
+        FROM ranked_orders
+        WHERE rn = 1
+        """)
+
+        builder = GraphBuilder(dialect="spark")
+        builder.add_file(sql_file)
+        graph = builder.build()
+
+        # Should have nodes created successfully
+        assert graph.metadata.total_nodes > 0
+        assert graph.metadata.total_edges > 0
+
+        # Check that output columns are qualified with the view name
+        node_ids = {node.identifier for node in graph.nodes}
+        assert "my_view.customer_id" in node_ids
+        assert "my_view.order_date" in node_ids
+        assert "my_view.amount" in node_ids
+
+        # Source columns from orders table should exist
+        assert "orders.customer_id" in node_ids
+        assert "orders.order_date" in node_ids
+        assert "orders.amount" in node_ids
+
+    def test_create_view_with_cte_row_number_lineage_tracing(self, tmp_path):
+        """Test that lineage correctly traces through CTE with window function."""
+        from sqlglider.graph.query import GraphQuerier
+
+        sql_file = tmp_path / "query.sql"
+        sql_file.write_text("""
+        CREATE VIEW latest_orders AS
+        WITH ranked AS (
+            SELECT
+                o.customer_id,
+                o.order_date,
+                o.total_amount,
+                ROW_NUMBER() OVER (PARTITION BY o.customer_id ORDER BY o.order_date DESC) as rn
+            FROM sales.orders o
+        )
+        SELECT customer_id, order_date, total_amount
+        FROM ranked
+        WHERE rn = 1
+        """)
+
+        builder = GraphBuilder(dialect="spark")
+        builder.add_file(sql_file)
+        graph = builder.build()
+
+        # Query upstream from output columns
+        querier = GraphQuerier(graph)
+
+        # customer_id should trace back to sales.orders.customer_id
+        upstream_customer = querier.find_upstream("latest_orders.customer_id")
+        upstream_ids = {n.identifier for n in upstream_customer.related_columns}
+        assert "sales.orders.customer_id" in upstream_ids
+
+        # total_amount should trace back to sales.orders.total_amount
+        upstream_amount = querier.find_upstream("latest_orders.total_amount")
+        upstream_ids = {n.identifier for n in upstream_amount.related_columns}
+        assert "sales.orders.total_amount" in upstream_ids
+
+    def test_create_view_multiple_window_functions(self, tmp_path):
+        """Test CREATE VIEW with multiple window functions."""
+        sql_file = tmp_path / "query.sql"
+        sql_file.write_text("""
+        CREATE VIEW customer_rankings AS
+        WITH metrics AS (
+            SELECT
+                customer_id,
+                total_spend,
+                ROW_NUMBER() OVER (ORDER BY total_spend DESC) as spend_rank,
+                RANK() OVER (PARTITION BY region ORDER BY total_spend DESC) as region_rank,
+                LAG(total_spend) OVER (PARTITION BY customer_id ORDER BY order_date) as prev_spend
+            FROM customer_orders
+        )
+        SELECT customer_id, total_spend, spend_rank, region_rank
+        FROM metrics
+        """)
+
+        builder = GraphBuilder(dialect="spark")
+        builder.add_file(sql_file)
+        graph = builder.build()
+
+        # Should process successfully with multiple window functions
+        assert graph.metadata.total_nodes > 0
+
+        node_ids = {node.identifier for node in graph.nodes}
+        assert "customer_rankings.customer_id" in node_ids
+        assert "customer_rankings.total_spend" in node_ids
+        assert "customer_rankings.spend_rank" in node_ids
+        assert "customer_rankings.region_rank" in node_ids
+
+    def test_create_view_nested_ctes_with_window(self, tmp_path):
+        """Test CREATE VIEW with nested CTEs and window functions."""
+        sql_file = tmp_path / "query.sql"
+        sql_file.write_text("""
+        CREATE VIEW final_report AS
+        WITH base_data AS (
+            SELECT customer_id, product_id, quantity, sale_date
+            FROM raw_sales
+        ),
+        ranked_sales AS (
+            SELECT
+                customer_id,
+                product_id,
+                quantity,
+                ROW_NUMBER() OVER (
+                    PARTITION BY customer_id, product_id
+                    ORDER BY sale_date DESC
+                ) as sale_rank
+            FROM base_data
+        )
+        SELECT customer_id, product_id, quantity
+        FROM ranked_sales
+        WHERE sale_rank = 1
+        """)
+
+        builder = GraphBuilder(dialect="spark")
+        builder.add_file(sql_file)
+        graph = builder.build()
+
+        assert graph.metadata.total_nodes > 0
+
+        node_ids = {node.identifier for node in graph.nodes}
+        assert "final_report.customer_id" in node_ids
+        assert "final_report.product_id" in node_ids
+        assert "final_report.quantity" in node_ids
+
+        # Source should trace to raw_sales
+        assert "raw_sales.customer_id" in node_ids
+        assert "raw_sales.product_id" in node_ids
+        assert "raw_sales.quantity" in node_ids
```