sql-glider 0.1.4__tar.gz → 0.1.6__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (137) hide show
  1. {sql_glider-0.1.4 → sql_glider-0.1.6}/PKG-INFO +1 -1
  2. sql_glider-0.1.6/plans/2026-01-26-file-scoped-schema-context.md +201 -0
  3. {sql_glider-0.1.4 → sql_glider-0.1.6}/src/sqlglider/_version.py +2 -2
  4. {sql_glider-0.1.4 → sql_glider-0.1.6}/src/sqlglider/lineage/analyzer.py +198 -18
  5. sql_glider-0.1.6/tests/fixtures/original_queries/test_cte_view_star.sql +22 -0
  6. {sql_glider-0.1.4 → sql_glider-0.1.6}/tests/sqlglider/lineage/test_analyzer.py +271 -0
  7. {sql_glider-0.1.4 → sql_glider-0.1.6}/.github/workflows/ci.yml +0 -0
  8. {sql_glider-0.1.4 → sql_glider-0.1.6}/.github/workflows/publish.yml +0 -0
  9. {sql_glider-0.1.4 → sql_glider-0.1.6}/.gitignore +0 -0
  10. {sql_glider-0.1.4 → sql_glider-0.1.6}/.python-version +0 -0
  11. {sql_glider-0.1.4 → sql_glider-0.1.6}/ARCHITECTURE.md +0 -0
  12. {sql_glider-0.1.4 → sql_glider-0.1.6}/CLAUDE.md +0 -0
  13. {sql_glider-0.1.4 → sql_glider-0.1.6}/LICENSE +0 -0
  14. {sql_glider-0.1.4 → sql_glider-0.1.6}/README.md +0 -0
  15. {sql_glider-0.1.4 → sql_glider-0.1.6}/plans/2025-12-05-column-level-lineage.md +0 -0
  16. {sql_glider-0.1.4 → sql_glider-0.1.6}/plans/2025-12-05-reverse-lineage.md +0 -0
  17. {sql_glider-0.1.4 → sql_glider-0.1.6}/plans/2025-12-06-config-file-support.md +0 -0
  18. {sql_glider-0.1.4 → sql_glider-0.1.6}/plans/2025-12-06-graph-lineage.md +0 -0
  19. {sql_glider-0.1.4 → sql_glider-0.1.6}/plans/2025-12-06-unify-single-multi-query.md +0 -0
  20. {sql_glider-0.1.4 → sql_glider-0.1.6}/plans/2025-12-07-sample-data-model.md +0 -0
  21. {sql_glider-0.1.4 → sql_glider-0.1.6}/plans/2025-12-07-sql-templating.md +0 -0
  22. {sql_glider-0.1.4 → sql_glider-0.1.6}/plans/2025-12-08-tables-command.md +0 -0
  23. {sql_glider-0.1.4 → sql_glider-0.1.6}/plans/2025-12-09-graph-query-paths.md +0 -0
  24. {sql_glider-0.1.4 → sql_glider-0.1.6}/plans/2025-12-13-dissect-command.md +0 -0
  25. {sql_glider-0.1.4 → sql_glider-0.1.6}/plans/2025-12-14-tables-pull-command.md +0 -0
  26. {sql_glider-0.1.4 → sql_glider-0.1.6}/plans/2026-01-25-fix-union-lineage-chain.md +0 -0
  27. {sql_glider-0.1.4 → sql_glider-0.1.6}/pyproject.toml +0 -0
  28. {sql_glider-0.1.4 → sql_glider-0.1.6}/sample_data_model/README.md +0 -0
  29. {sql_glider-0.1.4 → sql_glider-0.1.6}/sample_data_model/business/expire_dim_customer.sql +0 -0
  30. {sql_glider-0.1.4 → sql_glider-0.1.6}/sample_data_model/business/load_fact_orders.sql +0 -0
  31. {sql_glider-0.1.4 → sql_glider-0.1.6}/sample_data_model/business/load_fact_payments.sql +0 -0
  32. {sql_glider-0.1.4 → sql_glider-0.1.6}/sample_data_model/business/merge_dim_customer.sql +0 -0
  33. {sql_glider-0.1.4 → sql_glider-0.1.6}/sample_data_model/business/merge_dim_product.sql +0 -0
  34. {sql_glider-0.1.4 → sql_glider-0.1.6}/sample_data_model/business/update_dim_customer_metrics.sql +0 -0
  35. {sql_glider-0.1.4 → sql_glider-0.1.6}/sample_data_model/complex/conditional_merge.sql +0 -0
  36. {sql_glider-0.1.4 → sql_glider-0.1.6}/sample_data_model/complex/cte_insert.sql +0 -0
  37. {sql_glider-0.1.4 → sql_glider-0.1.6}/sample_data_model/complex/multi_table_transform.sql +0 -0
  38. {sql_glider-0.1.4 → sql_glider-0.1.6}/sample_data_model/ddl/dim_customer.sql +0 -0
  39. {sql_glider-0.1.4 → sql_glider-0.1.6}/sample_data_model/ddl/dim_product.sql +0 -0
  40. {sql_glider-0.1.4 → sql_glider-0.1.6}/sample_data_model/ddl/fact_orders.sql +0 -0
  41. {sql_glider-0.1.4 → sql_glider-0.1.6}/sample_data_model/ddl/fact_payments.sql +0 -0
  42. {sql_glider-0.1.4 → sql_glider-0.1.6}/sample_data_model/ddl/raw_addresses.sql +0 -0
  43. {sql_glider-0.1.4 → sql_glider-0.1.6}/sample_data_model/ddl/raw_customers.sql +0 -0
  44. {sql_glider-0.1.4 → sql_glider-0.1.6}/sample_data_model/ddl/raw_order_items.sql +0 -0
  45. {sql_glider-0.1.4 → sql_glider-0.1.6}/sample_data_model/ddl/raw_orders.sql +0 -0
  46. {sql_glider-0.1.4 → sql_glider-0.1.6}/sample_data_model/ddl/raw_payments.sql +0 -0
  47. {sql_glider-0.1.4 → sql_glider-0.1.6}/sample_data_model/ddl/raw_products.sql +0 -0
  48. {sql_glider-0.1.4 → sql_glider-0.1.6}/sample_data_model/ddl/stg_customers.sql +0 -0
  49. {sql_glider-0.1.4 → sql_glider-0.1.6}/sample_data_model/ddl/stg_orders.sql +0 -0
  50. {sql_glider-0.1.4 → sql_glider-0.1.6}/sample_data_model/ddl/stg_payments.sql +0 -0
  51. {sql_glider-0.1.4 → sql_glider-0.1.6}/sample_data_model/ddl/stg_products.sql +0 -0
  52. {sql_glider-0.1.4 → sql_glider-0.1.6}/sample_data_model/incremental/incr_fact_orders.sql +0 -0
  53. {sql_glider-0.1.4 → sql_glider-0.1.6}/sample_data_model/incremental/incr_fact_payments.sql +0 -0
  54. {sql_glider-0.1.4 → sql_glider-0.1.6}/sample_data_model/incremental/incr_pres_sales_summary.sql +0 -0
  55. {sql_glider-0.1.4 → sql_glider-0.1.6}/sample_data_model/maintenance/delete_expired_customers.sql +0 -0
  56. {sql_glider-0.1.4 → sql_glider-0.1.6}/sample_data_model/maintenance/update_product_status.sql +0 -0
  57. {sql_glider-0.1.4 → sql_glider-0.1.6}/sample_data_model/presentation/load_pres_customer_360.sql +0 -0
  58. {sql_glider-0.1.4 → sql_glider-0.1.6}/sample_data_model/presentation/load_pres_customer_cohort.sql +0 -0
  59. {sql_glider-0.1.4 → sql_glider-0.1.6}/sample_data_model/presentation/load_pres_product_performance.sql +0 -0
  60. {sql_glider-0.1.4 → sql_glider-0.1.6}/sample_data_model/presentation/load_pres_sales_summary.sql +0 -0
  61. {sql_glider-0.1.4 → sql_glider-0.1.6}/sample_data_model/staging/load_stg_customers.sql +0 -0
  62. {sql_glider-0.1.4 → sql_glider-0.1.6}/sample_data_model/staging/load_stg_orders.sql +0 -0
  63. {sql_glider-0.1.4 → sql_glider-0.1.6}/sample_data_model/staging/load_stg_payments.sql +0 -0
  64. {sql_glider-0.1.4 → sql_glider-0.1.6}/sample_data_model/staging/load_stg_products.sql +0 -0
  65. {sql_glider-0.1.4 → sql_glider-0.1.6}/sqlglider.toml.example +0 -0
  66. {sql_glider-0.1.4 → sql_glider-0.1.6}/src/sqlglider/__init__.py +0 -0
  67. {sql_glider-0.1.4 → sql_glider-0.1.6}/src/sqlglider/catalog/__init__.py +0 -0
  68. {sql_glider-0.1.4 → sql_glider-0.1.6}/src/sqlglider/catalog/base.py +0 -0
  69. {sql_glider-0.1.4 → sql_glider-0.1.6}/src/sqlglider/catalog/databricks.py +0 -0
  70. {sql_glider-0.1.4 → sql_glider-0.1.6}/src/sqlglider/catalog/registry.py +0 -0
  71. {sql_glider-0.1.4 → sql_glider-0.1.6}/src/sqlglider/cli.py +0 -0
  72. {sql_glider-0.1.4 → sql_glider-0.1.6}/src/sqlglider/dissection/__init__.py +0 -0
  73. {sql_glider-0.1.4 → sql_glider-0.1.6}/src/sqlglider/dissection/analyzer.py +0 -0
  74. {sql_glider-0.1.4 → sql_glider-0.1.6}/src/sqlglider/dissection/formatters.py +0 -0
  75. {sql_glider-0.1.4 → sql_glider-0.1.6}/src/sqlglider/dissection/models.py +0 -0
  76. {sql_glider-0.1.4 → sql_glider-0.1.6}/src/sqlglider/global_models.py +0 -0
  77. {sql_glider-0.1.4 → sql_glider-0.1.6}/src/sqlglider/graph/__init__.py +0 -0
  78. {sql_glider-0.1.4 → sql_glider-0.1.6}/src/sqlglider/graph/builder.py +0 -0
  79. {sql_glider-0.1.4 → sql_glider-0.1.6}/src/sqlglider/graph/merge.py +0 -0
  80. {sql_glider-0.1.4 → sql_glider-0.1.6}/src/sqlglider/graph/models.py +0 -0
  81. {sql_glider-0.1.4 → sql_glider-0.1.6}/src/sqlglider/graph/query.py +0 -0
  82. {sql_glider-0.1.4 → sql_glider-0.1.6}/src/sqlglider/graph/serialization.py +0 -0
  83. {sql_glider-0.1.4 → sql_glider-0.1.6}/src/sqlglider/lineage/__init__.py +0 -0
  84. {sql_glider-0.1.4 → sql_glider-0.1.6}/src/sqlglider/lineage/formatters.py +0 -0
  85. {sql_glider-0.1.4 → sql_glider-0.1.6}/src/sqlglider/templating/__init__.py +0 -0
  86. {sql_glider-0.1.4 → sql_glider-0.1.6}/src/sqlglider/templating/base.py +0 -0
  87. {sql_glider-0.1.4 → sql_glider-0.1.6}/src/sqlglider/templating/jinja.py +0 -0
  88. {sql_glider-0.1.4 → sql_glider-0.1.6}/src/sqlglider/templating/registry.py +0 -0
  89. {sql_glider-0.1.4 → sql_glider-0.1.6}/src/sqlglider/templating/variables.py +0 -0
  90. {sql_glider-0.1.4 → sql_glider-0.1.6}/src/sqlglider/utils/__init__.py +0 -0
  91. {sql_glider-0.1.4 → sql_glider-0.1.6}/src/sqlglider/utils/config.py +0 -0
  92. {sql_glider-0.1.4 → sql_glider-0.1.6}/src/sqlglider/utils/file_utils.py +0 -0
  93. {sql_glider-0.1.4 → sql_glider-0.1.6}/tests/__init__.py +0 -0
  94. {sql_glider-0.1.4 → sql_glider-0.1.6}/tests/fixtures/multi_file_queries/analytics_pipeline.sql +0 -0
  95. {sql_glider-0.1.4 → sql_glider-0.1.6}/tests/fixtures/multi_file_queries/analytics_pipeline_union_merge.sql +0 -0
  96. {sql_glider-0.1.4 → sql_glider-0.1.6}/tests/fixtures/multi_file_queries/customers.sql +0 -0
  97. {sql_glider-0.1.4 → sql_glider-0.1.6}/tests/fixtures/multi_file_queries/orders.sql +0 -0
  98. {sql_glider-0.1.4 → sql_glider-0.1.6}/tests/fixtures/multi_file_queries/reports.sql +0 -0
  99. {sql_glider-0.1.4 → sql_glider-0.1.6}/tests/fixtures/multi_file_queries/view_based_merge.sql +0 -0
  100. {sql_glider-0.1.4 → sql_glider-0.1.6}/tests/fixtures/original_queries/test_cte.sql +0 -0
  101. {sql_glider-0.1.4 → sql_glider-0.1.6}/tests/fixtures/original_queries/test_cte_query.sql +0 -0
  102. {sql_glider-0.1.4 → sql_glider-0.1.6}/tests/fixtures/original_queries/test_generated_column_query.sql +0 -0
  103. {sql_glider-0.1.4 → sql_glider-0.1.6}/tests/fixtures/original_queries/test_multi.sql +0 -0
  104. {sql_glider-0.1.4 → sql_glider-0.1.6}/tests/fixtures/original_queries/test_multi_query.sql +0 -0
  105. {sql_glider-0.1.4 → sql_glider-0.1.6}/tests/fixtures/original_queries/test_single_query.sql +0 -0
  106. {sql_glider-0.1.4 → sql_glider-0.1.6}/tests/fixtures/original_queries/test_subquery.sql +0 -0
  107. {sql_glider-0.1.4 → sql_glider-0.1.6}/tests/fixtures/original_queries/test_tables.sql +0 -0
  108. {sql_glider-0.1.4 → sql_glider-0.1.6}/tests/fixtures/original_queries/test_view.sql +0 -0
  109. {sql_glider-0.1.4 → sql_glider-0.1.6}/tests/fixtures/original_queries/test_view_window_cte.sql +0 -0
  110. {sql_glider-0.1.4 → sql_glider-0.1.6}/tests/fixtures/sample_manifest.csv +0 -0
  111. {sql_glider-0.1.4 → sql_glider-0.1.6}/tests/sqlglider/__init__.py +0 -0
  112. {sql_glider-0.1.4 → sql_glider-0.1.6}/tests/sqlglider/catalog/__init__.py +0 -0
  113. {sql_glider-0.1.4 → sql_glider-0.1.6}/tests/sqlglider/catalog/test_base.py +0 -0
  114. {sql_glider-0.1.4 → sql_glider-0.1.6}/tests/sqlglider/catalog/test_databricks.py +0 -0
  115. {sql_glider-0.1.4 → sql_glider-0.1.6}/tests/sqlglider/catalog/test_registry.py +0 -0
  116. {sql_glider-0.1.4 → sql_glider-0.1.6}/tests/sqlglider/dissection/__init__.py +0 -0
  117. {sql_glider-0.1.4 → sql_glider-0.1.6}/tests/sqlglider/dissection/test_analyzer.py +0 -0
  118. {sql_glider-0.1.4 → sql_glider-0.1.6}/tests/sqlglider/dissection/test_formatters.py +0 -0
  119. {sql_glider-0.1.4 → sql_glider-0.1.6}/tests/sqlglider/dissection/test_models.py +0 -0
  120. {sql_glider-0.1.4 → sql_glider-0.1.6}/tests/sqlglider/graph/__init__.py +0 -0
  121. {sql_glider-0.1.4 → sql_glider-0.1.6}/tests/sqlglider/graph/test_builder.py +0 -0
  122. {sql_glider-0.1.4 → sql_glider-0.1.6}/tests/sqlglider/graph/test_merge.py +0 -0
  123. {sql_glider-0.1.4 → sql_glider-0.1.6}/tests/sqlglider/graph/test_models.py +0 -0
  124. {sql_glider-0.1.4 → sql_glider-0.1.6}/tests/sqlglider/graph/test_query.py +0 -0
  125. {sql_glider-0.1.4 → sql_glider-0.1.6}/tests/sqlglider/graph/test_serialization.py +0 -0
  126. {sql_glider-0.1.4 → sql_glider-0.1.6}/tests/sqlglider/lineage/__init__.py +0 -0
  127. {sql_glider-0.1.4 → sql_glider-0.1.6}/tests/sqlglider/lineage/test_formatters.py +0 -0
  128. {sql_glider-0.1.4 → sql_glider-0.1.6}/tests/sqlglider/templating/__init__.py +0 -0
  129. {sql_glider-0.1.4 → sql_glider-0.1.6}/tests/sqlglider/templating/test_base.py +0 -0
  130. {sql_glider-0.1.4 → sql_glider-0.1.6}/tests/sqlglider/templating/test_jinja.py +0 -0
  131. {sql_glider-0.1.4 → sql_glider-0.1.6}/tests/sqlglider/templating/test_registry.py +0 -0
  132. {sql_glider-0.1.4 → sql_glider-0.1.6}/tests/sqlglider/templating/test_variables.py +0 -0
  133. {sql_glider-0.1.4 → sql_glider-0.1.6}/tests/sqlglider/test_cli.py +0 -0
  134. {sql_glider-0.1.4 → sql_glider-0.1.6}/tests/sqlglider/utils/__init__.py +0 -0
  135. {sql_glider-0.1.4 → sql_glider-0.1.6}/tests/sqlglider/utils/test_config.py +0 -0
  136. {sql_glider-0.1.4 → sql_glider-0.1.6}/tests/sqlglider/utils/test_file_utils.py +0 -0
  137. {sql_glider-0.1.4 → sql_glider-0.1.6}/uv.lock +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: sql-glider
3
- Version: 0.1.4
3
+ Version: 0.1.6
4
4
  Summary: SQL Utility Toolkit for better understanding, use, and governance of your queries in a native environment.
5
5
  Project-URL: Homepage, https://github.com/rycowhi/sql-glider/
6
6
  Project-URL: Repository, https://github.com/rycowhi/sql-glider/
@@ -0,0 +1,201 @@
1
+ # Plan: File-Scoped Schema Context for SQL Lineage Analyzer
2
+
3
+ **Status:** Completed
4
+
5
+ ## Summary
6
+
7
+ Add file-scoped schema context to the SQL Glider lineage analyzer so that SQLGlot can correctly expand `SELECT *` and trace cross-statement references when a file contains multiple related statements.
8
+
9
+ ## Problem
10
+
11
+ When analyzing this SQL:
12
+ ```sql
13
+ CREATE TEMPORARY VIEW first_view AS (SELECT a, b, c FROM source_table);
14
+ CREATE TEMPORARY VIEW second_view AS
15
+ WITH first_view_cte AS (
16
+ SELECT *, row_number() OVER (PARTITION BY a ORDER BY b DESC) AS row_num
17
+ FROM first_view
18
+ )
19
+ SELECT * FROM first_view_cte WHERE c = 1;
20
+ ```
21
+
22
+ **Previous output:** `* -> second_view.*` (useless - no column-level lineage)
23
+ **Expected output:** `first_view.a -> second_view.a`, `first_view.b -> second_view.b`, etc.
24
+
25
+ ## Root Cause
26
+
27
+ SQLGlot's `lineage()` function accepts a `schema` parameter that provides table/view column definitions. Without this schema context, SQLGlot cannot expand `SELECT *` to actual column names.
28
+
29
+ ## Solution
30
+
31
+ Build up schema context incrementally as CREATE VIEW/TABLE statements are processed, then pass that schema to subsequent `lineage()` calls.
32
+
33
+ ---
34
+
35
+ ## Implementation Steps
36
+
37
+ ### 1. Add Schema Instance Variable
38
+
39
+ - [x] Add the instance attribute `self._file_schema: Dict[str, Dict[str, str]] = {}` in `LineageAnalyzer.__init__()`
40
+
41
+ ### 2. Add Schema Extraction Methods
42
+
43
+ - [x] `_extract_schema_from_statement()` - Extract columns from CREATE VIEW/TABLE AS SELECT
44
+ - [x] `_extract_columns_from_select()` - Extract column names from SELECT projections
45
+ - [x] `_resolve_star_columns()` - Resolve SELECT * from file schema or CTEs
46
+ - [x] `_resolve_source_columns()` - Resolve columns from a single source (table, subquery)
47
+ - [x] `_resolve_qualified_star()` - Resolve table-qualified star (e.g., `t.*`)
48
+ - [x] `_extract_subquery_columns()` - Extract columns from subquery's SELECT
49
+ - [x] `_resolve_cte_columns()` - Resolve columns from CTE definitions
50
+ - [x] `_extract_cte_select_columns()` - Extract columns from CTE's SELECT
51
+
52
+ ### 3. Integrate Schema Building into Analysis Loop
53
+
54
+ - [x] Reset `self._file_schema = {}` at the start of `analyze_queries()`
55
+ - [x] Call `_extract_schema_from_statement(expr)` in `finally` block AFTER analysis
56
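+ - [x] Critical: Schema must be extracted AFTER analysis; extracting it first makes SQLGlot return unqualified column names (e.g., `customer_id` instead of `orders.customer_id`) for the statement being analyzed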
57
+
58
+ ### 4. Pass Schema to lineage() Calls
59
+
60
+ - [x] Modify `_analyze_column_lineage_internal()` to pass schema:
61
+ ```python
62
+ node = lineage(
63
+ lineage_col,
64
+ current_query_sql,
65
+ dialect=self.dialect,
66
+ schema=self._file_schema if self._file_schema else None,
67
+ )
68
+ ```
69
+
70
+ ### 5. Handle SELECT * in get_output_columns()
71
+
72
+ - [x] Handle `exp.Star` projections by resolving from file schema
73
+ - [x] Handle table-qualified stars (`t.*`) represented as `exp.Column` with `exp.Star` as `this`
74
+
75
+ ---
76
+
77
+ ## Edge Cases Handled
78
+
79
+ | Case | Implementation |
80
+ |------|----------------|
81
+ | `SELECT *` from unknown table | Returns empty columns, falls back to `*` behavior |
82
+ | Nested `SELECT *` through CTEs | Resolves CTE source from schema first |
83
+ | UNION in CREATE VIEW | Uses first branch's columns |
84
+ | Expressions without aliases | Uses SQL representation as column name |
85
+ | TEMPORARY VIEW | Treated same as regular VIEW |
86
+ | Multiple JOINs | Collects columns from all joined tables |
87
+ | LEFT/RIGHT/FULL OUTER JOIN | Same handling as INNER JOIN |
88
+ | CROSS JOIN | Same handling as INNER JOIN |
89
+ | Subquery in FROM clause | Extracts columns from inner SELECT |
90
+ | Table aliases (`v1 AS x`) | Resolves alias to actual table name |
91
+ | Schema-qualified names | Handles `schema.table` correctly |
92
+ | CTE referencing earlier CTE | Recursive CTE column resolution |
93
+ | `SELECT *, extra_col` | Combines * expansion with extra columns |
94
+ | Table-qualified `t.*` | Handles `v1.*` style syntax |
95
+ | LATERAL VIEW explode | Collects generated columns from `laterals` clause |
96
+ | LATERAL VIEW posexplode | Collects both position and element columns |
97
+ | Multiple LATERAL VIEWs | Collects columns from all LATERAL VIEWs |
98
+ | LATERAL VIEW OUTER | Same handling as regular LATERAL VIEW |
99
+ | LEFT SEMI JOIN | Only includes left table columns (right table excluded) |
100
+ | LEFT ANTI JOIN | Only includes left table columns (right table excluded) |
101
+
102
+ ---
103
+
104
+ ## Files Modified
105
+
106
+ | File | Changes |
107
+ |------|---------|
108
+ | `src/sqlglider/lineage/analyzer.py` | Added `_file_schema` instance variable; Added 9 schema extraction methods (including `_resolve_lateral_columns`); Modified `analyze_queries()` and `_analyze_column_lineage_internal()` and `get_output_columns()`; Added SEMI/ANTI join handling in `_resolve_star_columns()` |
109
+ | `tests/sqlglider/lineage/test_analyzer.py` | Added `TestFileSchemaExtraction` (9 tests), `TestCrossStatementLineage` (12 tests), `TestLateralViewColumnResolution` (5 tests), and `TestSemiAntiJoinColumnResolution` (3 tests) |
110
+
111
+ ---
112
+
113
+ ## Testing
114
+
115
+ ### Test Classes Added
116
+
117
+ **TestFileSchemaExtraction (9 tests):**
118
+ - `test_extract_schema_from_create_view`
119
+ - `test_extract_schema_from_create_temporary_view`
120
+ - `test_extract_schema_from_create_table_as`
121
+ - `test_extract_schema_with_aliases`
122
+ - `test_extract_schema_select_star_from_known_table`
123
+ - `test_extract_schema_select_star_from_unknown_table`
124
+ - `test_schema_not_extracted_from_pure_select`
125
+ - `test_schema_not_extracted_from_insert`
126
+ - `test_schema_reset_between_analysis_calls`
127
+
128
+ **TestCrossStatementLineage (12 tests):**
129
+ - `test_view_referencing_earlier_view`
130
+ - `test_select_star_expansion_through_view`
131
+ - `test_cte_with_select_star_from_view`
132
+ - `test_window_function_with_select_star`
133
+ - `test_insert_from_view_lineage`
134
+ - `test_multi_hop_view_lineage`
135
+ - `test_original_problem_scenario`
136
+ - `test_select_star_from_join`
137
+ - `test_nested_ctes_and_views_with_select_star`
138
+ - `test_select_star_from_subquery`
139
+ - `test_table_qualified_star`
140
+ - `test_table_qualified_star_with_alias`
141
+
142
+ **TestLateralViewColumnResolution (5 tests):**
143
+ - `test_select_star_with_lateral_view_explode`
144
+ - `test_select_star_with_lateral_view_posexplode`
145
+ - `test_select_star_with_multiple_lateral_views`
146
+ - `test_select_star_with_lateral_view_outer`
147
+ - `test_lateral_view_with_join`
148
+
149
+ ### Verification Commands
150
+
151
+ ```bash
152
+ # Run all tests
153
+ uv run pytest --cov=sqlglider --cov-fail-under=80
154
+
155
+ # Run schema-related tests
156
+ uv run pytest tests/sqlglider/lineage/test_analyzer.py -k "schema or CrossStatement" -v
157
+
158
+ # Test the original problem scenario
159
+ uv run sqlglider graph build test_view_window_cte.sql --dialect spark --output graph.json
160
+ ```
161
+
162
+ ---
163
+
164
+ ## Implementation Notes
165
+
166
+ ### Critical Timing Issue
167
+
168
+ Initially, schema extraction was done BEFORE analysis in the loop, which caused SQLGlot to return unqualified column names (e.g., `customer_id` instead of `orders.customer_id`).
169
+
170
+ **Fix:** Move `_extract_schema_from_statement(expr)` to the `finally` block AFTER analysis completes. This ensures:
171
+ 1. The current statement is analyzed without its own schema (correct behavior)
172
+ 2. The schema is then extracted for use by subsequent statements
173
+
174
+ ### Table-Qualified Star Handling
175
+
176
+ Table-qualified stars (`v1.*`) are represented differently than unqualified stars (`*`):
177
+ - `*` is `exp.Star`
178
+ - `v1.*` is `exp.Column` with `this` being `exp.Star` and `table` being `v1`
179
+
180
+ Both cases needed handling in:
181
+ - `_extract_columns_from_select()` for schema extraction
182
+ - `get_output_columns()` for lineage analysis output
183
+
184
+ ### Subquery Column Resolution
185
+
186
+ For `SELECT * FROM (SELECT * FROM v1) sub`, the code:
187
+ 1. Detects the subquery in `_resolve_source_columns()`
188
+ 2. Extracts columns from the inner SELECT via `_extract_subquery_columns()`
189
+ 3. Recursively resolves any `SELECT *` in the inner query
190
+
191
+ ---
192
+
193
+ ## Lessons Learned
194
+
195
+ 1. **Timing matters:** Schema context must be built AFTER analyzing a statement, not before, to avoid confusing SQLGlot's lineage tracing.
196
+
197
+ 2. **AST structure varies:** Different SQL constructs have different AST representations (e.g., `*` vs `t.*`), requiring multiple code paths.
198
+
199
+ 3. **Recursive resolution:** CTEs and subqueries can reference other CTEs/views, requiring recursive column resolution.
200
+
201
+ 4. **Edge cases compound:** JOINs + aliases + qualified stars can all combine, requiring careful handling of each case.
@@ -28,7 +28,7 @@ version_tuple: VERSION_TUPLE
28
28
  commit_id: COMMIT_ID
29
29
  __commit_id__: COMMIT_ID
30
30
 
31
- __version__ = version = '0.1.4'
32
- __version_tuple__ = version_tuple = (0, 1, 4)
31
+ __version__ = version = '0.1.6'
32
+ __version_tuple__ = version_tuple = (0, 1, 6)
33
33
 
34
34
  __commit_id__ = commit_id = None
@@ -182,20 +182,43 @@ class LineageAnalyzer:
182
182
  # For aliased columns, use the alias as the column name
183
183
  column_name = projection.alias
184
184
  lineage_name = column_name # SQLGlot lineage uses the alias
185
- else:
186
- source_expr = projection
187
- if isinstance(source_expr, exp.Column):
188
- column_name = source_expr.name
189
- lineage_name = column_name
185
+ # Qualify with target table
186
+ qualified_name = f"{target_table}.{column_name}"
187
+ columns.append(qualified_name)
188
+ self._column_mapping[qualified_name] = lineage_name
189
+ elif isinstance(projection, exp.Column):
190
+ # Check if this is a table-qualified star (e.g., t.*)
191
+ if isinstance(projection.this, exp.Star):
192
+ source_table = projection.table
193
+ qualified_star_cols: List[str] = []
194
+ if source_table and first_select:
195
+ qualified_star_cols = self._resolve_qualified_star(
196
+ source_table, first_select
197
+ )
198
+ for col in qualified_star_cols:
199
+ qualified_name = f"{target_table}.{col}"
200
+ columns.append(qualified_name)
201
+ self._column_mapping[qualified_name] = col
202
+ if not qualified_star_cols:
203
+ # Fallback: can't resolve t.*, use * as column name
204
+ qualified_name = f"{target_table}.*"
205
+ columns.append(qualified_name)
206
+ self._column_mapping[qualified_name] = "*"
190
207
  else:
191
- # For expressions, use the SQL representation
192
- column_name = source_expr.sql(dialect=self.dialect)
208
+ column_name = projection.name
193
209
  lineage_name = column_name
194
-
195
- # Qualify with target table
196
- qualified_name = f"{target_table}.{column_name}"
197
- columns.append(qualified_name)
198
- self._column_mapping[qualified_name] = lineage_name
210
+ # Qualify with target table
211
+ qualified_name = f"{target_table}.{column_name}"
212
+ columns.append(qualified_name)
213
+ self._column_mapping[qualified_name] = lineage_name
214
+ else:
215
+ # For expressions, use the SQL representation
216
+ column_name = projection.sql(dialect=self.dialect)
217
+ lineage_name = column_name
218
+ # Qualify with target table
219
+ qualified_name = f"{target_table}.{column_name}"
220
+ columns.append(qualified_name)
221
+ self._column_mapping[qualified_name] = lineage_name
199
222
 
200
223
  else:
201
224
  # DQL (pure SELECT): Use the SELECT columns as output
@@ -1342,8 +1365,18 @@ class LineageAnalyzer:
1342
1365
  # Use the alias name as the column name
1343
1366
  columns.append(projection.alias)
1344
1367
  elif isinstance(projection, exp.Column):
1345
- # Use the column name
1346
- columns.append(projection.name)
1368
+ # Check if this is a table-qualified star (e.g., t.*)
1369
+ if isinstance(projection.this, exp.Star):
1370
+ # Resolve table-qualified star from known schema
1371
+ table_name = projection.table
1372
+ if table_name and first_select:
1373
+ qualified_star_cols = self._resolve_qualified_star(
1374
+ table_name, first_select
1375
+ )
1376
+ columns.extend(qualified_star_cols)
1377
+ else:
1378
+ # Use the column name
1379
+ columns.append(projection.name)
1347
1380
  elif isinstance(projection, exp.Star):
1348
1381
  # Resolve SELECT * from known schema
1349
1382
  if first_select:
@@ -1375,6 +1408,64 @@ class LineageAnalyzer:
1375
1408
 
1376
1409
  source = from_clause.this
1377
1410
 
1411
+ # Handle table reference from FROM clause
1412
+ columns.extend(self._resolve_source_columns(source, select_node))
1413
+
1414
+ # Handle JOIN clauses - collect columns from all joined tables
1415
+ # EXCEPT for SEMI and ANTI joins which only return left table columns
1416
+ joins = select_node.args.get("joins")
1417
+ if joins:
1418
+ for join in joins:
1419
+ if isinstance(join, exp.Join):
1420
+ # SEMI and ANTI joins don't include right table columns in SELECT *
1421
+ join_kind = join.kind
1422
+ if join_kind in ("SEMI", "ANTI"):
1423
+ # Skip right table columns for SEMI/ANTI joins
1424
+ continue
1425
+ join_source = join.this
1426
+ columns.extend(
1427
+ self._resolve_source_columns(join_source, select_node)
1428
+ )
1429
+
1430
+ # Handle LATERAL VIEW clauses - collect generated columns
1431
+ laterals = select_node.args.get("laterals")
1432
+ if laterals:
1433
+ for lateral in laterals:
1434
+ if isinstance(lateral, exp.Lateral):
1435
+ lateral_cols = self._resolve_lateral_columns(lateral)
1436
+ columns.extend(lateral_cols)
1437
+
1438
+ return columns
1439
+
1440
+ def _resolve_lateral_columns(self, lateral: exp.Lateral) -> List[str]:
1441
+ """
1442
+ Extract generated column names from a LATERAL VIEW clause.
1443
+
1444
+ Args:
1445
+ lateral: The Lateral expression node
1446
+
1447
+ Returns:
1448
+ List of generated column names (e.g., ['elem'] for explode,
1449
+ ['pos', 'elem'] for posexplode)
1450
+ """
1451
+ # Use SQLGlot's built-in property to get alias column names
1452
+ return lateral.alias_column_names or []
1453
+
1454
+ def _resolve_source_columns(
1455
+ self, source: exp.Expression, select_node: exp.Select
1456
+ ) -> List[str]:
1457
+ """
1458
+ Resolve columns from a single source (table, subquery, etc.).
1459
+
1460
+ Args:
1461
+ source: The source expression (Table, Subquery, etc.)
1462
+ select_node: The containing SELECT node for CTE resolution
1463
+
1464
+ Returns:
1465
+ List of column names from the source
1466
+ """
1467
+ columns: List[str] = []
1468
+
1378
1469
  # Handle table reference
1379
1470
  if isinstance(source, exp.Table):
1380
1471
  source_name = self._get_qualified_table_name(source)
@@ -1387,11 +1478,100 @@ class LineageAnalyzer:
1387
1478
  cte_columns = self._resolve_cte_columns(source_name, select_node)
1388
1479
  columns.extend(cte_columns)
1389
1480
 
1390
- # Handle subquery - can't resolve without deeper analysis
1391
- elif isinstance(source, exp.Subquery) and source.alias:
1392
- # Check if this subquery alias is in file schema (unlikely)
1393
- if source.alias in self._file_schema:
1481
+ # Handle subquery with alias
1482
+ elif isinstance(source, exp.Subquery):
1483
+ # First check if this subquery alias is in file schema
1484
+ if source.alias and source.alias in self._file_schema:
1394
1485
  columns.extend(self._file_schema[source.alias].keys())
1486
+ else:
1487
+ # Extract columns from the subquery's SELECT
1488
+ inner_select = source.this
1489
+ if isinstance(inner_select, exp.Select):
1490
+ subquery_cols = self._extract_subquery_columns(inner_select)
1491
+ columns.extend(subquery_cols)
1492
+
1493
+ return columns
1494
+
1495
+ def _resolve_qualified_star(
1496
+ self, table_name: str, select_node: exp.Select
1497
+ ) -> List[str]:
1498
+ """
1499
+ Resolve a table-qualified star (e.g., t.*) to actual column names.
1500
+
1501
+ Args:
1502
+ table_name: The table/alias name qualifying the star
1503
+ select_node: The SELECT node for context
1504
+
1505
+ Returns:
1506
+ List of column names from the specified table
1507
+ """
1508
+ # First check file schema
1509
+ if table_name in self._file_schema:
1510
+ return list(self._file_schema[table_name].keys())
1511
+
1512
+ # Check if it's a CTE reference
1513
+ cte_columns = self._resolve_cte_columns(table_name, select_node)
1514
+ if cte_columns:
1515
+ return cte_columns
1516
+
1517
+ # Check if the table name is an alias - need to resolve the actual table
1518
+ from_clause = select_node.args.get("from")
1519
+ if from_clause and isinstance(from_clause, exp.From):
1520
+ source = from_clause.this
1521
+ if isinstance(source, exp.Table) and source.alias == table_name:
1522
+ actual_name = self._get_qualified_table_name(source)
1523
+ if actual_name in self._file_schema:
1524
+ return list(self._file_schema[actual_name].keys())
1525
+
1526
+ # Check JOIN clauses for aliased tables
1527
+ joins = select_node.args.get("joins")
1528
+ if joins:
1529
+ for join in joins:
1530
+ if isinstance(join, exp.Join):
1531
+ join_source = join.this
1532
+ if (
1533
+ isinstance(join_source, exp.Table)
1534
+ and join_source.alias == table_name
1535
+ ):
1536
+ actual_name = self._get_qualified_table_name(join_source)
1537
+ if actual_name in self._file_schema:
1538
+ return list(self._file_schema[actual_name].keys())
1539
+
1540
+ return []
1541
+
1542
+ def _extract_subquery_columns(self, subquery_select: exp.Select) -> List[str]:
1543
+ """
1544
+ Extract column names from a subquery's SELECT statement.
1545
+
1546
+ Args:
1547
+ subquery_select: The SELECT expression within the subquery
1548
+
1549
+ Returns:
1550
+ List of column names
1551
+ """
1552
+ columns: List[str] = []
1553
+
1554
+ for projection in subquery_select.expressions:
1555
+ if isinstance(projection, exp.Alias):
1556
+ columns.append(projection.alias)
1557
+ elif isinstance(projection, exp.Column):
1558
+ # Check for table-qualified star (t.*)
1559
+ if isinstance(projection.this, exp.Star):
1560
+ table_name = projection.table
1561
+ if table_name:
1562
+ qualified_cols = self._resolve_qualified_star(
1563
+ table_name, subquery_select
1564
+ )
1565
+ columns.extend(qualified_cols)
1566
+ else:
1567
+ columns.append(projection.name)
1568
+ elif isinstance(projection, exp.Star):
1569
+ # Resolve SELECT * in subquery
1570
+ star_columns = self._resolve_star_columns(subquery_select)
1571
+ columns.extend(star_columns)
1572
+ else:
1573
+ col_sql = projection.sql(dialect=self.dialect)
1574
+ columns.append(col_sql)
1395
1575
 
1396
1576
  return columns
1397
1577
 
@@ -0,0 +1,22 @@
1
+ CREATE TEMPORARY VIEW first_view AS
2
+ SELECT
3
+ v1,
4
+ v2,
5
+ v3
6
+ FROM source_db.source_table;
7
+ CREATE TEMPORARY VIEW second_view AS WITH cte AS (
8
+ SELECT
9
+ *,
10
+ row_number() OVER (ORDER BY v1) AS row_num
11
+ FROM first_view
12
+ )
13
+
14
+ SELECT * FROM cte
15
+ WHERE row_num = 1;
16
+
17
+ INSERT INTO target_db.target_table
18
+ SELECT
19
+ v1,
20
+ v2,
21
+ v3
22
+ FROM second_view;