sql-glider 0.1.3__tar.gz → 0.1.5__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (136) hide show
  1. {sql_glider-0.1.3 → sql_glider-0.1.5}/PKG-INFO +1 -1
  2. sql_glider-0.1.5/plans/2026-01-26-file-scoped-schema-context.md +199 -0
  3. {sql_glider-0.1.3 → sql_glider-0.1.5}/src/sqlglider/_version.py +2 -2
  4. {sql_glider-0.1.3 → sql_glider-0.1.5}/src/sqlglider/lineage/analyzer.py +402 -14
  5. sql_glider-0.1.5/tests/fixtures/original_queries/test_view_window_cte.sql +27 -0
  6. {sql_glider-0.1.3 → sql_glider-0.1.5}/tests/sqlglider/graph/test_builder.py +150 -0
  7. {sql_glider-0.1.3 → sql_glider-0.1.5}/tests/sqlglider/lineage/test_analyzer.py +504 -0
  8. {sql_glider-0.1.3 → sql_glider-0.1.5}/.github/workflows/ci.yml +0 -0
  9. {sql_glider-0.1.3 → sql_glider-0.1.5}/.github/workflows/publish.yml +0 -0
  10. {sql_glider-0.1.3 → sql_glider-0.1.5}/.gitignore +0 -0
  11. {sql_glider-0.1.3 → sql_glider-0.1.5}/.python-version +0 -0
  12. {sql_glider-0.1.3 → sql_glider-0.1.5}/ARCHITECTURE.md +0 -0
  13. {sql_glider-0.1.3 → sql_glider-0.1.5}/CLAUDE.md +0 -0
  14. {sql_glider-0.1.3 → sql_glider-0.1.5}/LICENSE +0 -0
  15. {sql_glider-0.1.3 → sql_glider-0.1.5}/README.md +0 -0
  16. {sql_glider-0.1.3 → sql_glider-0.1.5}/plans/2025-12-05-column-level-lineage.md +0 -0
  17. {sql_glider-0.1.3 → sql_glider-0.1.5}/plans/2025-12-05-reverse-lineage.md +0 -0
  18. {sql_glider-0.1.3 → sql_glider-0.1.5}/plans/2025-12-06-config-file-support.md +0 -0
  19. {sql_glider-0.1.3 → sql_glider-0.1.5}/plans/2025-12-06-graph-lineage.md +0 -0
  20. {sql_glider-0.1.3 → sql_glider-0.1.5}/plans/2025-12-06-unify-single-multi-query.md +0 -0
  21. {sql_glider-0.1.3 → sql_glider-0.1.5}/plans/2025-12-07-sample-data-model.md +0 -0
  22. {sql_glider-0.1.3 → sql_glider-0.1.5}/plans/2025-12-07-sql-templating.md +0 -0
  23. {sql_glider-0.1.3 → sql_glider-0.1.5}/plans/2025-12-08-tables-command.md +0 -0
  24. {sql_glider-0.1.3 → sql_glider-0.1.5}/plans/2025-12-09-graph-query-paths.md +0 -0
  25. {sql_glider-0.1.3 → sql_glider-0.1.5}/plans/2025-12-13-dissect-command.md +0 -0
  26. {sql_glider-0.1.3 → sql_glider-0.1.5}/plans/2025-12-14-tables-pull-command.md +0 -0
  27. {sql_glider-0.1.3 → sql_glider-0.1.5}/plans/2026-01-25-fix-union-lineage-chain.md +0 -0
  28. {sql_glider-0.1.3 → sql_glider-0.1.5}/pyproject.toml +0 -0
  29. {sql_glider-0.1.3 → sql_glider-0.1.5}/sample_data_model/README.md +0 -0
  30. {sql_glider-0.1.3 → sql_glider-0.1.5}/sample_data_model/business/expire_dim_customer.sql +0 -0
  31. {sql_glider-0.1.3 → sql_glider-0.1.5}/sample_data_model/business/load_fact_orders.sql +0 -0
  32. {sql_glider-0.1.3 → sql_glider-0.1.5}/sample_data_model/business/load_fact_payments.sql +0 -0
  33. {sql_glider-0.1.3 → sql_glider-0.1.5}/sample_data_model/business/merge_dim_customer.sql +0 -0
  34. {sql_glider-0.1.3 → sql_glider-0.1.5}/sample_data_model/business/merge_dim_product.sql +0 -0
  35. {sql_glider-0.1.3 → sql_glider-0.1.5}/sample_data_model/business/update_dim_customer_metrics.sql +0 -0
  36. {sql_glider-0.1.3 → sql_glider-0.1.5}/sample_data_model/complex/conditional_merge.sql +0 -0
  37. {sql_glider-0.1.3 → sql_glider-0.1.5}/sample_data_model/complex/cte_insert.sql +0 -0
  38. {sql_glider-0.1.3 → sql_glider-0.1.5}/sample_data_model/complex/multi_table_transform.sql +0 -0
  39. {sql_glider-0.1.3 → sql_glider-0.1.5}/sample_data_model/ddl/dim_customer.sql +0 -0
  40. {sql_glider-0.1.3 → sql_glider-0.1.5}/sample_data_model/ddl/dim_product.sql +0 -0
  41. {sql_glider-0.1.3 → sql_glider-0.1.5}/sample_data_model/ddl/fact_orders.sql +0 -0
  42. {sql_glider-0.1.3 → sql_glider-0.1.5}/sample_data_model/ddl/fact_payments.sql +0 -0
  43. {sql_glider-0.1.3 → sql_glider-0.1.5}/sample_data_model/ddl/raw_addresses.sql +0 -0
  44. {sql_glider-0.1.3 → sql_glider-0.1.5}/sample_data_model/ddl/raw_customers.sql +0 -0
  45. {sql_glider-0.1.3 → sql_glider-0.1.5}/sample_data_model/ddl/raw_order_items.sql +0 -0
  46. {sql_glider-0.1.3 → sql_glider-0.1.5}/sample_data_model/ddl/raw_orders.sql +0 -0
  47. {sql_glider-0.1.3 → sql_glider-0.1.5}/sample_data_model/ddl/raw_payments.sql +0 -0
  48. {sql_glider-0.1.3 → sql_glider-0.1.5}/sample_data_model/ddl/raw_products.sql +0 -0
  49. {sql_glider-0.1.3 → sql_glider-0.1.5}/sample_data_model/ddl/stg_customers.sql +0 -0
  50. {sql_glider-0.1.3 → sql_glider-0.1.5}/sample_data_model/ddl/stg_orders.sql +0 -0
  51. {sql_glider-0.1.3 → sql_glider-0.1.5}/sample_data_model/ddl/stg_payments.sql +0 -0
  52. {sql_glider-0.1.3 → sql_glider-0.1.5}/sample_data_model/ddl/stg_products.sql +0 -0
  53. {sql_glider-0.1.3 → sql_glider-0.1.5}/sample_data_model/incremental/incr_fact_orders.sql +0 -0
  54. {sql_glider-0.1.3 → sql_glider-0.1.5}/sample_data_model/incremental/incr_fact_payments.sql +0 -0
  55. {sql_glider-0.1.3 → sql_glider-0.1.5}/sample_data_model/incremental/incr_pres_sales_summary.sql +0 -0
  56. {sql_glider-0.1.3 → sql_glider-0.1.5}/sample_data_model/maintenance/delete_expired_customers.sql +0 -0
  57. {sql_glider-0.1.3 → sql_glider-0.1.5}/sample_data_model/maintenance/update_product_status.sql +0 -0
  58. {sql_glider-0.1.3 → sql_glider-0.1.5}/sample_data_model/presentation/load_pres_customer_360.sql +0 -0
  59. {sql_glider-0.1.3 → sql_glider-0.1.5}/sample_data_model/presentation/load_pres_customer_cohort.sql +0 -0
  60. {sql_glider-0.1.3 → sql_glider-0.1.5}/sample_data_model/presentation/load_pres_product_performance.sql +0 -0
  61. {sql_glider-0.1.3 → sql_glider-0.1.5}/sample_data_model/presentation/load_pres_sales_summary.sql +0 -0
  62. {sql_glider-0.1.3 → sql_glider-0.1.5}/sample_data_model/staging/load_stg_customers.sql +0 -0
  63. {sql_glider-0.1.3 → sql_glider-0.1.5}/sample_data_model/staging/load_stg_orders.sql +0 -0
  64. {sql_glider-0.1.3 → sql_glider-0.1.5}/sample_data_model/staging/load_stg_payments.sql +0 -0
  65. {sql_glider-0.1.3 → sql_glider-0.1.5}/sample_data_model/staging/load_stg_products.sql +0 -0
  66. {sql_glider-0.1.3 → sql_glider-0.1.5}/sqlglider.toml.example +0 -0
  67. {sql_glider-0.1.3 → sql_glider-0.1.5}/src/sqlglider/__init__.py +0 -0
  68. {sql_glider-0.1.3 → sql_glider-0.1.5}/src/sqlglider/catalog/__init__.py +0 -0
  69. {sql_glider-0.1.3 → sql_glider-0.1.5}/src/sqlglider/catalog/base.py +0 -0
  70. {sql_glider-0.1.3 → sql_glider-0.1.5}/src/sqlglider/catalog/databricks.py +0 -0
  71. {sql_glider-0.1.3 → sql_glider-0.1.5}/src/sqlglider/catalog/registry.py +0 -0
  72. {sql_glider-0.1.3 → sql_glider-0.1.5}/src/sqlglider/cli.py +0 -0
  73. {sql_glider-0.1.3 → sql_glider-0.1.5}/src/sqlglider/dissection/__init__.py +0 -0
  74. {sql_glider-0.1.3 → sql_glider-0.1.5}/src/sqlglider/dissection/analyzer.py +0 -0
  75. {sql_glider-0.1.3 → sql_glider-0.1.5}/src/sqlglider/dissection/formatters.py +0 -0
  76. {sql_glider-0.1.3 → sql_glider-0.1.5}/src/sqlglider/dissection/models.py +0 -0
  77. {sql_glider-0.1.3 → sql_glider-0.1.5}/src/sqlglider/global_models.py +0 -0
  78. {sql_glider-0.1.3 → sql_glider-0.1.5}/src/sqlglider/graph/__init__.py +0 -0
  79. {sql_glider-0.1.3 → sql_glider-0.1.5}/src/sqlglider/graph/builder.py +0 -0
  80. {sql_glider-0.1.3 → sql_glider-0.1.5}/src/sqlglider/graph/merge.py +0 -0
  81. {sql_glider-0.1.3 → sql_glider-0.1.5}/src/sqlglider/graph/models.py +0 -0
  82. {sql_glider-0.1.3 → sql_glider-0.1.5}/src/sqlglider/graph/query.py +0 -0
  83. {sql_glider-0.1.3 → sql_glider-0.1.5}/src/sqlglider/graph/serialization.py +0 -0
  84. {sql_glider-0.1.3 → sql_glider-0.1.5}/src/sqlglider/lineage/__init__.py +0 -0
  85. {sql_glider-0.1.3 → sql_glider-0.1.5}/src/sqlglider/lineage/formatters.py +0 -0
  86. {sql_glider-0.1.3 → sql_glider-0.1.5}/src/sqlglider/templating/__init__.py +0 -0
  87. {sql_glider-0.1.3 → sql_glider-0.1.5}/src/sqlglider/templating/base.py +0 -0
  88. {sql_glider-0.1.3 → sql_glider-0.1.5}/src/sqlglider/templating/jinja.py +0 -0
  89. {sql_glider-0.1.3 → sql_glider-0.1.5}/src/sqlglider/templating/registry.py +0 -0
  90. {sql_glider-0.1.3 → sql_glider-0.1.5}/src/sqlglider/templating/variables.py +0 -0
  91. {sql_glider-0.1.3 → sql_glider-0.1.5}/src/sqlglider/utils/__init__.py +0 -0
  92. {sql_glider-0.1.3 → sql_glider-0.1.5}/src/sqlglider/utils/config.py +0 -0
  93. {sql_glider-0.1.3 → sql_glider-0.1.5}/src/sqlglider/utils/file_utils.py +0 -0
  94. {sql_glider-0.1.3 → sql_glider-0.1.5}/tests/__init__.py +0 -0
  95. {sql_glider-0.1.3 → sql_glider-0.1.5}/tests/fixtures/multi_file_queries/analytics_pipeline.sql +0 -0
  96. {sql_glider-0.1.3 → sql_glider-0.1.5}/tests/fixtures/multi_file_queries/analytics_pipeline_union_merge.sql +0 -0
  97. {sql_glider-0.1.3 → sql_glider-0.1.5}/tests/fixtures/multi_file_queries/customers.sql +0 -0
  98. {sql_glider-0.1.3 → sql_glider-0.1.5}/tests/fixtures/multi_file_queries/orders.sql +0 -0
  99. {sql_glider-0.1.3 → sql_glider-0.1.5}/tests/fixtures/multi_file_queries/reports.sql +0 -0
  100. {sql_glider-0.1.3 → sql_glider-0.1.5}/tests/fixtures/multi_file_queries/view_based_merge.sql +0 -0
  101. {sql_glider-0.1.3 → sql_glider-0.1.5}/tests/fixtures/original_queries/test_cte.sql +0 -0
  102. {sql_glider-0.1.3 → sql_glider-0.1.5}/tests/fixtures/original_queries/test_cte_query.sql +0 -0
  103. {sql_glider-0.1.3 → sql_glider-0.1.5}/tests/fixtures/original_queries/test_generated_column_query.sql +0 -0
  104. {sql_glider-0.1.3 → sql_glider-0.1.5}/tests/fixtures/original_queries/test_multi.sql +0 -0
  105. {sql_glider-0.1.3 → sql_glider-0.1.5}/tests/fixtures/original_queries/test_multi_query.sql +0 -0
  106. {sql_glider-0.1.3 → sql_glider-0.1.5}/tests/fixtures/original_queries/test_single_query.sql +0 -0
  107. {sql_glider-0.1.3 → sql_glider-0.1.5}/tests/fixtures/original_queries/test_subquery.sql +0 -0
  108. {sql_glider-0.1.3 → sql_glider-0.1.5}/tests/fixtures/original_queries/test_tables.sql +0 -0
  109. {sql_glider-0.1.3 → sql_glider-0.1.5}/tests/fixtures/original_queries/test_view.sql +0 -0
  110. {sql_glider-0.1.3 → sql_glider-0.1.5}/tests/fixtures/sample_manifest.csv +0 -0
  111. {sql_glider-0.1.3 → sql_glider-0.1.5}/tests/sqlglider/__init__.py +0 -0
  112. {sql_glider-0.1.3 → sql_glider-0.1.5}/tests/sqlglider/catalog/__init__.py +0 -0
  113. {sql_glider-0.1.3 → sql_glider-0.1.5}/tests/sqlglider/catalog/test_base.py +0 -0
  114. {sql_glider-0.1.3 → sql_glider-0.1.5}/tests/sqlglider/catalog/test_databricks.py +0 -0
  115. {sql_glider-0.1.3 → sql_glider-0.1.5}/tests/sqlglider/catalog/test_registry.py +0 -0
  116. {sql_glider-0.1.3 → sql_glider-0.1.5}/tests/sqlglider/dissection/__init__.py +0 -0
  117. {sql_glider-0.1.3 → sql_glider-0.1.5}/tests/sqlglider/dissection/test_analyzer.py +0 -0
  118. {sql_glider-0.1.3 → sql_glider-0.1.5}/tests/sqlglider/dissection/test_formatters.py +0 -0
  119. {sql_glider-0.1.3 → sql_glider-0.1.5}/tests/sqlglider/dissection/test_models.py +0 -0
  120. {sql_glider-0.1.3 → sql_glider-0.1.5}/tests/sqlglider/graph/__init__.py +0 -0
  121. {sql_glider-0.1.3 → sql_glider-0.1.5}/tests/sqlglider/graph/test_merge.py +0 -0
  122. {sql_glider-0.1.3 → sql_glider-0.1.5}/tests/sqlglider/graph/test_models.py +0 -0
  123. {sql_glider-0.1.3 → sql_glider-0.1.5}/tests/sqlglider/graph/test_query.py +0 -0
  124. {sql_glider-0.1.3 → sql_glider-0.1.5}/tests/sqlglider/graph/test_serialization.py +0 -0
  125. {sql_glider-0.1.3 → sql_glider-0.1.5}/tests/sqlglider/lineage/__init__.py +0 -0
  126. {sql_glider-0.1.3 → sql_glider-0.1.5}/tests/sqlglider/lineage/test_formatters.py +0 -0
  127. {sql_glider-0.1.3 → sql_glider-0.1.5}/tests/sqlglider/templating/__init__.py +0 -0
  128. {sql_glider-0.1.3 → sql_glider-0.1.5}/tests/sqlglider/templating/test_base.py +0 -0
  129. {sql_glider-0.1.3 → sql_glider-0.1.5}/tests/sqlglider/templating/test_jinja.py +0 -0
  130. {sql_glider-0.1.3 → sql_glider-0.1.5}/tests/sqlglider/templating/test_registry.py +0 -0
  131. {sql_glider-0.1.3 → sql_glider-0.1.5}/tests/sqlglider/templating/test_variables.py +0 -0
  132. {sql_glider-0.1.3 → sql_glider-0.1.5}/tests/sqlglider/test_cli.py +0 -0
  133. {sql_glider-0.1.3 → sql_glider-0.1.5}/tests/sqlglider/utils/__init__.py +0 -0
  134. {sql_glider-0.1.3 → sql_glider-0.1.5}/tests/sqlglider/utils/test_config.py +0 -0
  135. {sql_glider-0.1.3 → sql_glider-0.1.5}/tests/sqlglider/utils/test_file_utils.py +0 -0
  136. {sql_glider-0.1.3 → sql_glider-0.1.5}/uv.lock +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: sql-glider
3
- Version: 0.1.3
3
+ Version: 0.1.5
4
4
  Summary: SQL Utility Toolkit for better understanding, use, and governance of your queries in a native environment.
5
5
  Project-URL: Homepage, https://github.com/rycowhi/sql-glider/
6
6
  Project-URL: Repository, https://github.com/rycowhi/sql-glider/
@@ -0,0 +1,199 @@
1
+ # Plan: File-Scoped Schema Context for SQL Lineage Analyzer
2
+
3
+ **Status:** Completed
4
+
5
+ ## Summary
6
+
7
+ Add file-scoped schema context to the SQL Glider lineage analyzer so that SQLGlot can correctly expand `SELECT *` and trace cross-statement references when a file contains multiple related statements.
8
+
9
+ ## Problem
10
+
11
+ When analyzing this SQL:
12
+ ```sql
13
+ CREATE TEMPORARY VIEW first_view AS (SELECT a, b, c FROM source_table);
14
+ CREATE TEMPORARY VIEW second_view AS
15
+ WITH first_view_cte AS (
16
+ SELECT *, row_number() OVER (PARTITION BY a ORDER BY b DESC) AS row_num
17
+ FROM first_view
18
+ )
19
+ SELECT * FROM first_view_cte WHERE c = 1;
20
+ ```
21
+
22
+ **Previous output:** `* -> second_view.*` (useless - no column-level lineage)
23
+ **Expected output:** `first_view.a -> second_view.a`, `first_view.b -> second_view.b`, etc.
24
+
25
+ ## Root Cause
26
+
27
+ SQLGlot's `lineage()` function accepts a `schema` parameter that provides table/view column definitions. Without this schema context, SQLGlot cannot expand `SELECT *` to actual column names.
28
+
29
+ ## Solution
30
+
31
+ Build up schema context incrementally as CREATE VIEW/TABLE statements are processed, then pass that schema to subsequent `lineage()` calls.
32
+
33
+ ---
34
+
35
+ ## Implementation Steps
36
+
37
+ ### 1. Add Schema Instance Variable
38
+
39
+ - [x] Add `self._file_schema: Dict[str, Dict[str, str]] = {}` to `LineageAnalyzer.__init__()`
40
+
41
+ ### 2. Add Schema Extraction Methods
42
+
43
+ - [x] `_extract_schema_from_statement()` - Extract columns from CREATE VIEW/TABLE AS SELECT
44
+ - [x] `_extract_columns_from_select()` - Extract column names from SELECT projections
45
+ - [x] `_resolve_star_columns()` - Resolve SELECT * from file schema or CTEs
46
+ - [x] `_resolve_source_columns()` - Resolve columns from a single source (table, subquery)
47
+ - [x] `_resolve_qualified_star()` - Resolve table-qualified star (e.g., `t.*`)
48
+ - [x] `_extract_subquery_columns()` - Extract columns from subquery's SELECT
49
+ - [x] `_resolve_cte_columns()` - Resolve columns from CTE definitions
50
+ - [x] `_extract_cte_select_columns()` - Extract columns from CTE's SELECT
51
+
52
+ ### 3. Integrate Schema Building into Analysis Loop
53
+
54
+ - [x] Reset `_file_schema = {}` at start of `analyze_queries()`
55
+ - [x] Call `_extract_schema_from_statement(expr)` in `finally` block AFTER analysis
56
+ - [x] Critical: Schema must be extracted AFTER analysis; extracting it first makes SQLGlot return unqualified column names for the current statement
57
+
58
+ ### 4. Pass Schema to lineage() Calls
59
+
60
+ - [x] Modify `_analyze_column_lineage_internal()` to pass schema:
61
+ ```python
62
+ node = lineage(
63
+ lineage_col,
64
+ current_query_sql,
65
+ dialect=self.dialect,
66
+ schema=self._file_schema if self._file_schema else None,
67
+ )
68
+ ```
69
+
70
+ ### 5. Handle SELECT * in get_output_columns()
71
+
72
+ - [x] Handle `exp.Star` projections by resolving from file schema
73
+ - [x] Handle table-qualified stars (`t.*`) represented as `exp.Column` with `exp.Star` as `this`
74
+
75
+ ---
76
+
77
+ ## Edge Cases Handled
78
+
79
+ | Case | Implementation |
80
+ |------|----------------|
81
+ | `SELECT *` from unknown table | Returns empty columns, falls back to `*` behavior |
82
+ | Nested `SELECT *` through CTEs | Resolves CTE source from schema first |
83
+ | UNION in CREATE VIEW | Uses first branch's columns |
84
+ | Expressions without aliases | Uses SQL representation as column name |
85
+ | TEMPORARY VIEW | Treated same as regular VIEW |
86
+ | Multiple JOINs | Collects columns from all joined tables |
87
+ | LEFT/RIGHT/FULL OUTER JOIN | Same handling as INNER JOIN |
88
+ | CROSS JOIN | Same handling as INNER JOIN |
89
+ | Subquery in FROM clause | Extracts columns from inner SELECT |
90
+ | Table aliases (`v1 AS x`) | Resolves alias to actual table name |
91
+ | Schema-qualified names | Handles `schema.table` correctly |
92
+ | CTE referencing earlier CTE | Recursively resolves the referenced CTE's columns |
93
+ | `SELECT *, extra_col` | Combines * expansion with extra columns |
94
+ | Table-qualified `t.*` | Handles `v1.*` style syntax |
95
+ | LATERAL VIEW explode | Collects generated columns from `laterals` clause |
96
+ | LATERAL VIEW posexplode | Collects both position and element columns |
97
+ | Multiple LATERAL VIEWs | Collects columns from all LATERAL VIEWs |
98
+ | LATERAL VIEW OUTER | Same handling as regular LATERAL VIEW |
99
+
100
+ ---
101
+
102
+ ## Files Modified
103
+
104
+ | File | Changes |
105
+ |------|---------|
106
+ | `src/sqlglider/lineage/analyzer.py` | Added `_file_schema` instance variable; Added 9 schema extraction methods (including `_resolve_lateral_columns`); Modified `analyze_queries()` and `_analyze_column_lineage_internal()` and `get_output_columns()` |
107
+ | `tests/sqlglider/lineage/test_analyzer.py` | Added `TestFileSchemaExtraction` (9 tests), `TestCrossStatementLineage` (12 tests), and `TestLateralViewColumnResolution` (5 tests) |
108
+
109
+ ---
110
+
111
+ ## Testing
112
+
113
+ ### Test Classes Added
114
+
115
+ **TestFileSchemaExtraction (9 tests):**
116
+ - `test_extract_schema_from_create_view`
117
+ - `test_extract_schema_from_create_temporary_view`
118
+ - `test_extract_schema_from_create_table_as`
119
+ - `test_extract_schema_with_aliases`
120
+ - `test_extract_schema_select_star_from_known_table`
121
+ - `test_extract_schema_select_star_from_unknown_table`
122
+ - `test_schema_not_extracted_from_pure_select`
123
+ - `test_schema_not_extracted_from_insert`
124
+ - `test_schema_reset_between_analysis_calls`
125
+
126
+ **TestCrossStatementLineage (12 tests):**
127
+ - `test_view_referencing_earlier_view`
128
+ - `test_select_star_expansion_through_view`
129
+ - `test_cte_with_select_star_from_view`
130
+ - `test_window_function_with_select_star`
131
+ - `test_insert_from_view_lineage`
132
+ - `test_multi_hop_view_lineage`
133
+ - `test_original_problem_scenario`
134
+ - `test_select_star_from_join`
135
+ - `test_nested_ctes_and_views_with_select_star`
136
+ - `test_select_star_from_subquery`
137
+ - `test_table_qualified_star`
138
+ - `test_table_qualified_star_with_alias`
139
+
140
+ **TestLateralViewColumnResolution (5 tests):**
141
+ - `test_select_star_with_lateral_view_explode`
142
+ - `test_select_star_with_lateral_view_posexplode`
143
+ - `test_select_star_with_multiple_lateral_views`
144
+ - `test_select_star_with_lateral_view_outer`
145
+ - `test_lateral_view_with_join`
146
+
147
+ ### Verification Commands
148
+
149
+ ```bash
150
+ # Run all tests
151
+ uv run pytest --cov=sqlglider --cov-fail-under=80
152
+
153
+ # Run schema-related tests
154
+ uv run pytest tests/sqlglider/lineage/test_analyzer.py -k "schema or CrossStatement or LateralView" -v
155
+
156
+ # Test the original problem scenario
157
+ uv run sqlglider graph build tests/fixtures/original_queries/test_view_window_cte.sql --dialect spark --output graph.json
158
+ ```
159
+
160
+ ---
161
+
162
+ ## Implementation Notes
163
+
164
+ ### Critical Timing Issue
165
+
166
+ Initially, schema extraction was done BEFORE analysis in the loop, which caused SQLGlot to return unqualified column names (e.g., `customer_id` instead of `orders.customer_id`).
167
+
168
+ **Fix:** Move `_extract_schema_from_statement(expr)` to the `finally` block AFTER analysis completes. This ensures:
169
+ 1. The current statement is analyzed without its own schema (correct behavior)
170
+ 2. The schema is then extracted for use by subsequent statements
171
+
172
+ ### Table-Qualified Star Handling
173
+
174
+ Table-qualified stars (`v1.*`) are represented differently than unqualified stars (`*`):
175
+ - `*` is `exp.Star`
176
+ - `v1.*` is `exp.Column` with `this` being `exp.Star` and `table` being `v1`
177
+
178
+ Both cases needed handling in:
179
+ - `_extract_columns_from_select()` for schema extraction
180
+ - `get_output_columns()` for lineage analysis output
181
+
182
+ ### Subquery Column Resolution
183
+
184
+ For `SELECT * FROM (SELECT * FROM v1) sub`, the code:
185
+ 1. Detects the subquery in `_resolve_source_columns()`
186
+ 2. Extracts columns from the inner SELECT via `_extract_subquery_columns()`
187
+ 3. Recursively resolves any `SELECT *` in the inner query
188
+
189
+ ---
190
+
191
+ ## Lessons Learned
192
+
193
+ 1. **Timing matters:** Schema context must be built AFTER analyzing a statement, not before; otherwise the statement is analyzed with its own schema already registered and SQLGlot returns unqualified column names.
194
+
195
+ 2. **AST structure varies:** Different SQL constructs have different AST representations (e.g., `*` vs `t.*`), requiring multiple code paths.
196
+
197
+ 3. **Recursive resolution:** CTEs and subqueries can reference other CTEs/views, requiring recursive column resolution.
198
+
199
+ 4. **Edge cases compound:** JOINs + aliases + qualified stars can all combine, requiring careful handling of each case.
@@ -28,7 +28,7 @@ version_tuple: VERSION_TUPLE
28
28
  commit_id: COMMIT_ID
29
29
  __commit_id__: COMMIT_ID
30
30
 
31
- __version__ = version = '0.1.3'
32
- __version_tuple__ = version_tuple = (0, 1, 3)
31
+ __version__ = version = '0.1.5'
32
+ __version_tuple__ = version_tuple = (0, 1, 5)
33
33
 
34
34
  __commit_id__ = commit_id = None
@@ -1,7 +1,7 @@
1
1
  """Core lineage analysis using SQLGlot."""
2
2
 
3
3
  from enum import Enum
4
- from typing import Callable, Iterator, List, Optional, Set, Tuple, Union
4
+ from typing import Callable, Dict, Iterator, List, Optional, Set, Tuple, Union
5
5
 
6
6
  from pydantic import BaseModel, Field
7
7
  from sqlglot import exp, parse
@@ -99,6 +99,9 @@ class LineageAnalyzer:
99
99
  self.sql = sql
100
100
  self.dialect = dialect
101
101
  self._skipped_queries: List[SkippedQuery] = []
102
+ # File-scoped schema context for cross-statement lineage
103
+ # Maps table/view names to their column definitions
104
+ self._file_schema: Dict[str, Dict[str, str]] = {}
102
105
 
103
106
  try:
104
107
  # Parse all statements in the SQL string
@@ -156,26 +159,66 @@ class LineageAnalyzer:
156
159
  # DML/DDL: Use target table for output column qualification
157
160
  # The columns are from the SELECT, but qualified with the target table
158
161
  projections = self._get_select_projections(select_node)
162
+ first_select = self._get_first_select(select_node)
163
+
159
164
  for projection in projections:
165
+ # Handle SELECT * by resolving from file schema
166
+ if isinstance(projection, exp.Star):
167
+ if first_select:
168
+ star_columns = self._resolve_star_columns(first_select)
169
+ for star_col in star_columns:
170
+ qualified_name = f"{target_table}.{star_col}"
171
+ columns.append(qualified_name)
172
+ self._column_mapping[qualified_name] = star_col
173
+ if not columns:
174
+ # Fallback: can't resolve *, use * as column name
175
+ qualified_name = f"{target_table}.*"
176
+ columns.append(qualified_name)
177
+ self._column_mapping[qualified_name] = "*"
178
+ continue
179
+
160
180
  # Get the underlying expression (unwrap alias if present)
161
181
  if isinstance(projection, exp.Alias):
162
182
  # For aliased columns, use the alias as the column name
163
183
  column_name = projection.alias
164
184
  lineage_name = column_name # SQLGlot lineage uses the alias
165
- else:
166
- source_expr = projection
167
- if isinstance(source_expr, exp.Column):
168
- column_name = source_expr.name
169
- lineage_name = column_name
185
+ # Qualify with target table
186
+ qualified_name = f"{target_table}.{column_name}"
187
+ columns.append(qualified_name)
188
+ self._column_mapping[qualified_name] = lineage_name
189
+ elif isinstance(projection, exp.Column):
190
+ # Check if this is a table-qualified star (e.g., t.*)
191
+ if isinstance(projection.this, exp.Star):
192
+ source_table = projection.table
193
+ qualified_star_cols: List[str] = []
194
+ if source_table and first_select:
195
+ qualified_star_cols = self._resolve_qualified_star(
196
+ source_table, first_select
197
+ )
198
+ for col in qualified_star_cols:
199
+ qualified_name = f"{target_table}.{col}"
200
+ columns.append(qualified_name)
201
+ self._column_mapping[qualified_name] = col
202
+ if not qualified_star_cols:
203
+ # Fallback: can't resolve t.*, use * as column name
204
+ qualified_name = f"{target_table}.*"
205
+ columns.append(qualified_name)
206
+ self._column_mapping[qualified_name] = "*"
170
207
  else:
171
- # For expressions, use the SQL representation
172
- column_name = source_expr.sql(dialect=self.dialect)
208
+ column_name = projection.name
173
209
  lineage_name = column_name
174
-
175
- # Qualify with target table
176
- qualified_name = f"{target_table}.{column_name}"
177
- columns.append(qualified_name)
178
- self._column_mapping[qualified_name] = lineage_name
210
+ # Qualify with target table
211
+ qualified_name = f"{target_table}.{column_name}"
212
+ columns.append(qualified_name)
213
+ self._column_mapping[qualified_name] = lineage_name
214
+ else:
215
+ # For expressions, use the SQL representation
216
+ column_name = projection.sql(dialect=self.dialect)
217
+ lineage_name = column_name
218
+ # Qualify with target table
219
+ qualified_name = f"{target_table}.{column_name}"
220
+ columns.append(qualified_name)
221
+ self._column_mapping[qualified_name] = lineage_name
179
222
 
180
223
  else:
181
224
  # DQL (pure SELECT): Use the SELECT columns as output
@@ -324,6 +367,7 @@ class LineageAnalyzer:
324
367
  """
325
368
  results = []
326
369
  self._skipped_queries = [] # Reset skipped queries for this analysis
370
+ self._file_schema = {} # Reset file schema for this analysis run
327
371
 
328
372
  for query_index, expr, preview in self._iterate_queries(table_filter):
329
373
  # Temporarily swap self.expr to analyze this query
@@ -375,6 +419,9 @@ class LineageAnalyzer:
375
419
  )
376
420
  )
377
421
  finally:
422
+ # Extract schema from this statement AFTER analysis
423
+ # This builds up context for subsequent statements to use
424
+ self._extract_schema_from_statement(expr)
378
425
  # Restore original expression
379
426
  self.expr = original_expr
380
427
 
@@ -702,7 +749,13 @@ class LineageAnalyzer:
702
749
  lineage_col = self._column_mapping.get(col, col)
703
750
 
704
751
  # Get lineage tree for this column using current query SQL only
705
- node = lineage(lineage_col, current_query_sql, dialect=self.dialect)
752
+ # Pass file schema to enable SELECT * expansion for known tables/views
753
+ node = lineage(
754
+ lineage_col,
755
+ current_query_sql,
756
+ dialect=self.dialect,
757
+ schema=self._file_schema if self._file_schema else None,
758
+ )
706
759
 
707
760
  # Collect all source columns
708
761
  sources: Set[str] = set()
@@ -1235,3 +1288,338 @@ class LineageAnalyzer:
1235
1288
  preview = self._generate_query_preview(expr)
1236
1289
 
1237
1290
  yield idx, expr, preview
1291
+
1292
+ # -------------------------------------------------------------------------
1293
+ # File-scoped schema context methods
1294
+ # -------------------------------------------------------------------------
1295
+
1296
def _extract_schema_from_statement(self, expr: exp.Expression) -> None:
    """
    Record the output columns of a CREATE VIEW/TABLE ... AS SELECT statement.

    The column names are stored in ``self._file_schema`` under the qualified
    name of the created object so that statements analyzed afterwards can
    expand ``SELECT *`` against it.

    Args:
        expr: The parsed statement to inspect. Anything that is not a
            CREATE VIEW/TABLE with a query body is ignored.
    """
    is_view_or_table = isinstance(expr, exp.Create) and expr.kind in (
        "VIEW",
        "TABLE",
    )
    if not is_view_or_table:
        return

    # The created object may be wrapped in a Schema node; unwrap to the Table
    created = expr.this.this if isinstance(expr.this, exp.Schema) else expr.this
    if not isinstance(created, exp.Table):
        return
    schema_key = self._get_qualified_table_name(created)

    # Query body, optionally parenthesized: CREATE VIEW v AS (SELECT ...)
    body = expr.expression
    if isinstance(body, exp.Subquery):
        body = body.this
    # A missing body (None) fails this check as well, so nothing is recorded
    if not isinstance(body, (exp.Select, exp.Union, exp.Intersect, exp.Except)):
        return

    names = self._extract_columns_from_select(body)
    if names:
        # Every column is typed "UNKNOWN"; only the names are recorded here
        self._file_schema[schema_key] = dict.fromkeys(names, "UNKNOWN")
1341
+
1342
def _extract_columns_from_select(
    self, select_node: Union[exp.Select, exp.Union, exp.Intersect, exp.Except]
) -> List[str]:
    """
    Collect the output column names of a SELECT or set operation.

    Aliased projections contribute their alias, plain column references
    their column name, and any other expression its SQL text. ``*`` and
    ``t.*`` are expanded through the star-resolution helpers; a star that
    cannot be resolved contributes nothing.

    Args:
        select_node: The SELECT or set operation expression

    Returns:
        Column names in projection order
    """
    projected = self._get_select_projections(select_node)
    anchor = self._get_first_select(select_node)

    names: List[str] = []
    for item in projected:
        if isinstance(item, exp.Alias):
            names.append(item.alias)
            continue

        if isinstance(item, exp.Star):
            # Bare SELECT *: expand from the sources of the anchoring SELECT
            if anchor:
                names += self._resolve_star_columns(anchor)
            continue

        if not isinstance(item, exp.Column):
            # Un-aliased expression: fall back to its SQL representation
            names.append(item.sql(dialect=self.dialect))
            continue

        if not isinstance(item.this, exp.Star):
            names.append(item.name)
            continue

        # Table-qualified star (t.*)
        qualifier = item.table
        if qualifier and anchor:
            names += self._resolve_qualified_star(qualifier, anchor)

    return names
1391
+
1392
def _resolve_star_columns(self, select_node: exp.Select) -> List[str]:
    """
    Expand a bare ``SELECT *`` into the column names of its sources.

    Columns are gathered from the FROM source first, then from every JOIN
    source, then from every LATERAL VIEW, in that order.

    Args:
        select_node: The SELECT node containing the * reference

    Returns:
        Column names that could be resolved; empty when the SELECT has no
        FROM clause or none of its sources are known
    """
    from_clause = select_node.args.get("from")
    if not (from_clause and isinstance(from_clause, exp.From)):
        return []

    resolved: List[str] = list(
        self._resolve_source_columns(from_clause.this, select_node)
    )

    # Joined sources contribute their columns after the FROM source
    for join in select_node.args.get("joins") or ():
        if isinstance(join, exp.Join):
            resolved += self._resolve_source_columns(join.this, select_node)

    # LATERAL VIEW clauses contribute their generated columns last
    for lateral in select_node.args.get("laterals") or ():
        if isinstance(lateral, exp.Lateral):
            resolved += self._resolve_lateral_columns(lateral)

    return resolved
1433
+
1434
def _resolve_lateral_columns(self, lateral: exp.Lateral) -> List[str]:
    """
    Return the column names a LATERAL VIEW clause introduces.

    Args:
        lateral: The Lateral expression node

    Returns:
        The alias column names reported by the node (e.g., ['elem'] for
        explode, ['pos', 'elem'] for posexplode), or an empty list when
        there are none
    """
    generated = lateral.alias_column_names
    return generated if generated else []
1447
+
1448
def _resolve_source_columns(
    self, source: exp.Expression, select_node: exp.Select
) -> List[str]:
    """
    Return the column names exposed by one FROM/JOIN source.

    Tables are looked up in the file schema first and, failing that, as a
    CTE of the enclosing statement. Subqueries are looked up in the file
    schema by alias first and otherwise resolved from their own SELECT
    list. Any other kind of source yields no columns.

    Args:
        source: The source expression (Table, Subquery, etc.)
        select_node: The containing SELECT node for CTE resolution

    Returns:
        List of column names from the source, empty if unknown
    """
    known = self._file_schema

    if isinstance(source, exp.Table):
        qualified = self._get_qualified_table_name(source)
        if qualified in known:
            # View/table defined by an earlier statement in this run
            return list(known[qualified].keys())
        # Otherwise it may be a CTE of the current statement
        return list(self._resolve_cte_columns(qualified, select_node))

    if isinstance(source, exp.Subquery):
        alias = source.alias
        if alias and alias in known:
            return list(known[alias].keys())
        nested = source.this
        if isinstance(nested, exp.Select):
            return list(self._extract_subquery_columns(nested))

    return []
1488
+
1489
def _resolve_qualified_star(
    self, table_name: str, select_node: exp.Select
) -> List[str]:
    """
    Resolve a table-qualified star (e.g., t.*) to actual column names.

    Resolution order:
      1. ``table_name`` as a key of the file schema.
      2. ``table_name`` as a CTE visible from ``select_node``.
      3. ``table_name`` as the alias of a FROM/JOIN source. The matching
         source is resolved through ``_resolve_source_columns``, so an
         aliased file-schema table, an aliased CTE (``FROM cte AS x``) and
         an aliased derived table (``FROM (SELECT ...) x``) are all
         handled.

    Args:
        table_name: The table/alias name qualifying the star
        select_node: The SELECT node for context

    Returns:
        List of column names from the specified table, empty if the
        qualifier cannot be resolved
    """
    # 1. Direct hit in the file schema
    if table_name in self._file_schema:
        return list(self._file_schema[table_name].keys())

    # 2. CTE with that name
    cte_columns = self._resolve_cte_columns(table_name, select_node)
    if cte_columns:
        return cte_columns

    # 3. The qualifier is an alias: gather FROM source, then JOIN sources
    candidates = []
    from_clause = select_node.args.get("from")
    if from_clause and isinstance(from_clause, exp.From):
        candidates.append(from_clause.this)
    for join in select_node.args.get("joins") or []:
        if isinstance(join, exp.Join):
            candidates.append(join.this)

    for source in candidates:
        if not isinstance(source, (exp.Table, exp.Subquery)):
            continue
        if source.alias != table_name:
            continue
        resolved = self._resolve_source_columns(source, select_node)
        if resolved:
            return resolved
        # Alias matched but the source is unknown: keep scanning, which
        # mirrors the fall-through of the per-source checks

    return []
1535
+
1536
def _extract_subquery_columns(self, subquery_select: exp.Select) -> List[str]:
    """
    Collect the output column names of a subquery's SELECT list.

    Aliases win over underlying names, ``*`` and ``t.*`` are expanded via
    the star-resolution helpers, and un-aliased expressions are named by
    their SQL text.

    Args:
        subquery_select: The SELECT expression within the subquery

    Returns:
        Column names in projection order
    """
    found: List[str] = []

    for item in subquery_select.expressions:
        if isinstance(item, exp.Alias):
            found.append(item.alias)
        elif isinstance(item, exp.Star):
            # Bare * inside the subquery
            found.extend(self._resolve_star_columns(subquery_select))
        elif not isinstance(item, exp.Column):
            found.append(item.sql(dialect=self.dialect))
        elif isinstance(item.this, exp.Star):
            # Table-qualified star (t.*); skipped when no qualifier is set
            qualifier = item.table
            if qualifier:
                found.extend(
                    self._resolve_qualified_star(qualifier, subquery_select)
                )
        else:
            found.append(item.name)

    return found
1571
+
1572
def _resolve_cte_columns(self, cte_name: str, select_node: exp.Select) -> List[str]:
    """
    Resolve columns from a CTE definition within the same statement.

    Walks from ``select_node`` up through its ancestors and inspects every
    WITH clause found on the way.

    Two rules apply while scanning a WITH clause:

    * If the walk passed through one of that clause's CTEs (i.e. the lookup
      started inside that CTE's body), that CTE and all CTEs declared after
      it are treated as not visible. Without this, a body such as
      ``WITH orders AS (SELECT * FROM orders) ...`` would resolve the inner
      ``orders`` to the CTE itself and recurse until ``RecursionError``.
      NOTE(review): this follows non-recursive WITH scoping; forward
      references inside WITH RECURSIVE are reported as unknown.
    * An explicit column list on the CTE (``WITH c(x, y) AS (...)``) takes
      precedence over the names in the CTE's SELECT list.

    Args:
        cte_name: Name of the CTE to resolve
        select_node: The SELECT node that references the CTE

    Returns:
        List of column names from the CTE, empty if CTE not found
    """
    # CTE nodes passed through on the way up; compared by identity because
    # expression equality is structural
    enclosing_ctes = []

    node = select_node
    while node:
        if isinstance(node, exp.CTE):
            enclosing_ctes.append(node)

        with_clause = node.args.get("with") if hasattr(node, "args") else None
        if with_clause:
            for cte in with_clause.expressions:
                if any(cte is seen for seen in enclosing_ctes):
                    # The lookup started inside this CTE: it and its later
                    # siblings are out of scope
                    break
                if isinstance(cte, exp.CTE) and cte.alias == cte_name:
                    declared = cte.alias_column_names
                    if declared:
                        return list(declared)
                    cte_select = cte.this
                    if isinstance(cte_select, exp.Select):
                        return self._extract_cte_select_columns(cte_select)

        node = node.parent if hasattr(node, "parent") else None

    return []
1597
+
1598
def _extract_cte_select_columns(self, cte_select: exp.Select) -> List[str]:
    """
    Extract column names from a CTE's SELECT statement.

    Handles aliases, plain column references, ``SELECT *`` and
    table-qualified stars (``t.*``). A qualified star is expanded through
    ``_resolve_qualified_star`` instead of being treated as an ordinary
    column, which would record a column literally named ``*``.

    Args:
        cte_select: The SELECT expression within the CTE

    Returns:
        List of column names
    """
    columns: List[str] = []

    for projection in cte_select.expressions:
        if isinstance(projection, exp.Alias):
            columns.append(projection.alias)
        elif isinstance(projection, exp.Column):
            if isinstance(projection.this, exp.Star):
                # Table-qualified star (t.*); contributes nothing when the
                # qualifier cannot be resolved
                table_name = projection.table
                if table_name:
                    columns.extend(
                        self._resolve_qualified_star(table_name, cte_select)
                    )
            else:
                columns.append(projection.name)
        elif isinstance(projection, exp.Star):
            # Resolve SELECT * in the CTE from its sources
            columns.extend(self._resolve_star_columns(cte_select))
        else:
            # Un-aliased expression: use its SQL representation
            columns.append(projection.sql(dialect=self.dialect))

    return columns