sql-glider 0.1.11__tar.gz → 0.1.12__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (144)
  1. {sql_glider-0.1.11 → sql_glider-0.1.12}/PKG-INFO +1 -1
  2. sql_glider-0.1.12/plans/2026-01-29-no-star-flag.md +47 -0
  3. sql_glider-0.1.12/plans/2026-01-29-resolve-schema.md +49 -0
  4. {sql_glider-0.1.11 → sql_glider-0.1.12}/src/sqlglider/_version.py +2 -2
  5. {sql_glider-0.1.11 → sql_glider-0.1.12}/src/sqlglider/cli.py +87 -1
  6. {sql_glider-0.1.11 → sql_glider-0.1.12}/src/sqlglider/graph/builder.py +201 -20
  7. sql_glider-0.1.12/src/sqlglider/graph/formatters.py +98 -0
  8. {sql_glider-0.1.11 → sql_glider-0.1.12}/src/sqlglider/lineage/analyzer.py +156 -3
  9. {sql_glider-0.1.11 → sql_glider-0.1.12}/src/sqlglider/utils/config.py +4 -0
  10. sql_glider-0.1.12/src/sqlglider/utils/schema.py +62 -0
  11. {sql_glider-0.1.11 → sql_glider-0.1.12}/tests/sqlglider/graph/test_builder.py +211 -0
  12. sql_glider-0.1.12/tests/sqlglider/graph/test_formatters.py +86 -0
  13. {sql_glider-0.1.11 → sql_glider-0.1.12}/tests/sqlglider/lineage/test_analyzer.py +134 -0
  14. {sql_glider-0.1.11 → sql_glider-0.1.12}/tests/sqlglider/test_cli.py +172 -0
  15. sql_glider-0.1.12/tests/sqlglider/utils/test_schema.py +55 -0
  16. {sql_glider-0.1.11 → sql_glider-0.1.12}/.github/workflows/ci.yml +0 -0
  17. {sql_glider-0.1.11 → sql_glider-0.1.12}/.github/workflows/publish.yml +0 -0
  18. {sql_glider-0.1.11 → sql_glider-0.1.12}/.gitignore +0 -0
  19. {sql_glider-0.1.11 → sql_glider-0.1.12}/.python-version +0 -0
  20. {sql_glider-0.1.11 → sql_glider-0.1.12}/ARCHITECTURE.md +0 -0
  21. {sql_glider-0.1.11 → sql_glider-0.1.12}/CLAUDE.md +0 -0
  22. {sql_glider-0.1.11 → sql_glider-0.1.12}/LICENSE +0 -0
  23. {sql_glider-0.1.11 → sql_glider-0.1.12}/README.md +0 -0
  24. {sql_glider-0.1.11 → sql_glider-0.1.12}/plans/2025-12-05-column-level-lineage.md +0 -0
  25. {sql_glider-0.1.11 → sql_glider-0.1.12}/plans/2025-12-05-reverse-lineage.md +0 -0
  26. {sql_glider-0.1.11 → sql_glider-0.1.12}/plans/2025-12-06-config-file-support.md +0 -0
  27. {sql_glider-0.1.11 → sql_glider-0.1.12}/plans/2025-12-06-graph-lineage.md +0 -0
  28. {sql_glider-0.1.11 → sql_glider-0.1.12}/plans/2025-12-06-unify-single-multi-query.md +0 -0
  29. {sql_glider-0.1.11 → sql_glider-0.1.12}/plans/2025-12-07-sample-data-model.md +0 -0
  30. {sql_glider-0.1.11 → sql_glider-0.1.12}/plans/2025-12-07-sql-templating.md +0 -0
  31. {sql_glider-0.1.11 → sql_glider-0.1.12}/plans/2025-12-08-tables-command.md +0 -0
  32. {sql_glider-0.1.11 → sql_glider-0.1.12}/plans/2025-12-09-graph-query-paths.md +0 -0
  33. {sql_glider-0.1.11 → sql_glider-0.1.12}/plans/2025-12-13-dissect-command.md +0 -0
  34. {sql_glider-0.1.11 → sql_glider-0.1.12}/plans/2025-12-14-tables-pull-command.md +0 -0
  35. {sql_glider-0.1.11 → sql_glider-0.1.12}/plans/2026-01-25-fix-union-lineage-chain.md +0 -0
  36. {sql_glider-0.1.11 → sql_glider-0.1.12}/plans/2026-01-26-file-scoped-schema-context.md +0 -0
  37. {sql_glider-0.1.11 → sql_glider-0.1.12}/plans/2026-01-28-sparksql-table-extraction.md +0 -0
  38. {sql_glider-0.1.11 → sql_glider-0.1.12}/pyproject.toml +0 -0
  39. {sql_glider-0.1.11 → sql_glider-0.1.12}/sample_data_model/README.md +0 -0
  40. {sql_glider-0.1.11 → sql_glider-0.1.12}/sample_data_model/business/expire_dim_customer.sql +0 -0
  41. {sql_glider-0.1.11 → sql_glider-0.1.12}/sample_data_model/business/load_fact_orders.sql +0 -0
  42. {sql_glider-0.1.11 → sql_glider-0.1.12}/sample_data_model/business/load_fact_payments.sql +0 -0
  43. {sql_glider-0.1.11 → sql_glider-0.1.12}/sample_data_model/business/merge_dim_customer.sql +0 -0
  44. {sql_glider-0.1.11 → sql_glider-0.1.12}/sample_data_model/business/merge_dim_product.sql +0 -0
  45. {sql_glider-0.1.11 → sql_glider-0.1.12}/sample_data_model/business/update_dim_customer_metrics.sql +0 -0
  46. {sql_glider-0.1.11 → sql_glider-0.1.12}/sample_data_model/complex/conditional_merge.sql +0 -0
  47. {sql_glider-0.1.11 → sql_glider-0.1.12}/sample_data_model/complex/cte_insert.sql +0 -0
  48. {sql_glider-0.1.11 → sql_glider-0.1.12}/sample_data_model/complex/multi_table_transform.sql +0 -0
  49. {sql_glider-0.1.11 → sql_glider-0.1.12}/sample_data_model/ddl/dim_customer.sql +0 -0
  50. {sql_glider-0.1.11 → sql_glider-0.1.12}/sample_data_model/ddl/dim_product.sql +0 -0
  51. {sql_glider-0.1.11 → sql_glider-0.1.12}/sample_data_model/ddl/fact_orders.sql +0 -0
  52. {sql_glider-0.1.11 → sql_glider-0.1.12}/sample_data_model/ddl/fact_payments.sql +0 -0
  53. {sql_glider-0.1.11 → sql_glider-0.1.12}/sample_data_model/ddl/raw_addresses.sql +0 -0
  54. {sql_glider-0.1.11 → sql_glider-0.1.12}/sample_data_model/ddl/raw_customers.sql +0 -0
  55. {sql_glider-0.1.11 → sql_glider-0.1.12}/sample_data_model/ddl/raw_order_items.sql +0 -0
  56. {sql_glider-0.1.11 → sql_glider-0.1.12}/sample_data_model/ddl/raw_orders.sql +0 -0
  57. {sql_glider-0.1.11 → sql_glider-0.1.12}/sample_data_model/ddl/raw_payments.sql +0 -0
  58. {sql_glider-0.1.11 → sql_glider-0.1.12}/sample_data_model/ddl/raw_products.sql +0 -0
  59. {sql_glider-0.1.11 → sql_glider-0.1.12}/sample_data_model/ddl/stg_customers.sql +0 -0
  60. {sql_glider-0.1.11 → sql_glider-0.1.12}/sample_data_model/ddl/stg_orders.sql +0 -0
  61. {sql_glider-0.1.11 → sql_glider-0.1.12}/sample_data_model/ddl/stg_payments.sql +0 -0
  62. {sql_glider-0.1.11 → sql_glider-0.1.12}/sample_data_model/ddl/stg_products.sql +0 -0
  63. {sql_glider-0.1.11 → sql_glider-0.1.12}/sample_data_model/incremental/incr_fact_orders.sql +0 -0
  64. {sql_glider-0.1.11 → sql_glider-0.1.12}/sample_data_model/incremental/incr_fact_payments.sql +0 -0
  65. {sql_glider-0.1.11 → sql_glider-0.1.12}/sample_data_model/incremental/incr_pres_sales_summary.sql +0 -0
  66. {sql_glider-0.1.11 → sql_glider-0.1.12}/sample_data_model/maintenance/delete_expired_customers.sql +0 -0
  67. {sql_glider-0.1.11 → sql_glider-0.1.12}/sample_data_model/maintenance/update_product_status.sql +0 -0
  68. {sql_glider-0.1.11 → sql_glider-0.1.12}/sample_data_model/presentation/load_pres_customer_360.sql +0 -0
  69. {sql_glider-0.1.11 → sql_glider-0.1.12}/sample_data_model/presentation/load_pres_customer_cohort.sql +0 -0
  70. {sql_glider-0.1.11 → sql_glider-0.1.12}/sample_data_model/presentation/load_pres_product_performance.sql +0 -0
  71. {sql_glider-0.1.11 → sql_glider-0.1.12}/sample_data_model/presentation/load_pres_sales_summary.sql +0 -0
  72. {sql_glider-0.1.11 → sql_glider-0.1.12}/sample_data_model/staging/load_stg_customers.sql +0 -0
  73. {sql_glider-0.1.11 → sql_glider-0.1.12}/sample_data_model/staging/load_stg_orders.sql +0 -0
  74. {sql_glider-0.1.11 → sql_glider-0.1.12}/sample_data_model/staging/load_stg_payments.sql +0 -0
  75. {sql_glider-0.1.11 → sql_glider-0.1.12}/sample_data_model/staging/load_stg_products.sql +0 -0
  76. {sql_glider-0.1.11 → sql_glider-0.1.12}/sqlglider.toml.example +0 -0
  77. {sql_glider-0.1.11 → sql_glider-0.1.12}/src/sqlglider/__init__.py +0 -0
  78. {sql_glider-0.1.11 → sql_glider-0.1.12}/src/sqlglider/catalog/__init__.py +0 -0
  79. {sql_glider-0.1.11 → sql_glider-0.1.12}/src/sqlglider/catalog/base.py +0 -0
  80. {sql_glider-0.1.11 → sql_glider-0.1.12}/src/sqlglider/catalog/databricks.py +0 -0
  81. {sql_glider-0.1.11 → sql_glider-0.1.12}/src/sqlglider/catalog/registry.py +0 -0
  82. {sql_glider-0.1.11 → sql_glider-0.1.12}/src/sqlglider/dissection/__init__.py +0 -0
  83. {sql_glider-0.1.11 → sql_glider-0.1.12}/src/sqlglider/dissection/analyzer.py +0 -0
  84. {sql_glider-0.1.11 → sql_glider-0.1.12}/src/sqlglider/dissection/formatters.py +0 -0
  85. {sql_glider-0.1.11 → sql_glider-0.1.12}/src/sqlglider/dissection/models.py +0 -0
  86. {sql_glider-0.1.11 → sql_glider-0.1.12}/src/sqlglider/global_models.py +0 -0
  87. {sql_glider-0.1.11 → sql_glider-0.1.12}/src/sqlglider/graph/__init__.py +0 -0
  88. {sql_glider-0.1.11 → sql_glider-0.1.12}/src/sqlglider/graph/merge.py +0 -0
  89. {sql_glider-0.1.11 → sql_glider-0.1.12}/src/sqlglider/graph/models.py +0 -0
  90. {sql_glider-0.1.11 → sql_glider-0.1.12}/src/sqlglider/graph/query.py +0 -0
  91. {sql_glider-0.1.11 → sql_glider-0.1.12}/src/sqlglider/graph/serialization.py +0 -0
  92. {sql_glider-0.1.11 → sql_glider-0.1.12}/src/sqlglider/lineage/__init__.py +0 -0
  93. {sql_glider-0.1.11 → sql_glider-0.1.12}/src/sqlglider/lineage/formatters.py +0 -0
  94. {sql_glider-0.1.11 → sql_glider-0.1.12}/src/sqlglider/templating/__init__.py +0 -0
  95. {sql_glider-0.1.11 → sql_glider-0.1.12}/src/sqlglider/templating/base.py +0 -0
  96. {sql_glider-0.1.11 → sql_glider-0.1.12}/src/sqlglider/templating/jinja.py +0 -0
  97. {sql_glider-0.1.11 → sql_glider-0.1.12}/src/sqlglider/templating/registry.py +0 -0
  98. {sql_glider-0.1.11 → sql_glider-0.1.12}/src/sqlglider/templating/variables.py +0 -0
  99. {sql_glider-0.1.11 → sql_glider-0.1.12}/src/sqlglider/utils/__init__.py +0 -0
  100. {sql_glider-0.1.11 → sql_glider-0.1.12}/src/sqlglider/utils/file_utils.py +0 -0
  101. {sql_glider-0.1.11 → sql_glider-0.1.12}/tests/__init__.py +0 -0
  102. {sql_glider-0.1.11 → sql_glider-0.1.12}/tests/fixtures/multi_file_queries/analytics_pipeline.sql +0 -0
  103. {sql_glider-0.1.11 → sql_glider-0.1.12}/tests/fixtures/multi_file_queries/analytics_pipeline_union_merge.sql +0 -0
  104. {sql_glider-0.1.11 → sql_glider-0.1.12}/tests/fixtures/multi_file_queries/customers.sql +0 -0
  105. {sql_glider-0.1.11 → sql_glider-0.1.12}/tests/fixtures/multi_file_queries/orders.sql +0 -0
  106. {sql_glider-0.1.11 → sql_glider-0.1.12}/tests/fixtures/multi_file_queries/reports.sql +0 -0
  107. {sql_glider-0.1.11 → sql_glider-0.1.12}/tests/fixtures/multi_file_queries/view_based_merge.sql +0 -0
  108. {sql_glider-0.1.11 → sql_glider-0.1.12}/tests/fixtures/original_queries/test_cte.sql +0 -0
  109. {sql_glider-0.1.11 → sql_glider-0.1.12}/tests/fixtures/original_queries/test_cte_query.sql +0 -0
  110. {sql_glider-0.1.11 → sql_glider-0.1.12}/tests/fixtures/original_queries/test_cte_view_star.sql +0 -0
  111. {sql_glider-0.1.11 → sql_glider-0.1.12}/tests/fixtures/original_queries/test_generated_column_query.sql +0 -0
  112. {sql_glider-0.1.11 → sql_glider-0.1.12}/tests/fixtures/original_queries/test_multi.sql +0 -0
  113. {sql_glider-0.1.11 → sql_glider-0.1.12}/tests/fixtures/original_queries/test_multi_query.sql +0 -0
  114. {sql_glider-0.1.11 → sql_glider-0.1.12}/tests/fixtures/original_queries/test_single_query.sql +0 -0
  115. {sql_glider-0.1.11 → sql_glider-0.1.12}/tests/fixtures/original_queries/test_subquery.sql +0 -0
  116. {sql_glider-0.1.11 → sql_glider-0.1.12}/tests/fixtures/original_queries/test_tables.sql +0 -0
  117. {sql_glider-0.1.11 → sql_glider-0.1.12}/tests/fixtures/original_queries/test_view.sql +0 -0
  118. {sql_glider-0.1.11 → sql_glider-0.1.12}/tests/fixtures/original_queries/test_view_window_cte.sql +0 -0
  119. {sql_glider-0.1.11 → sql_glider-0.1.12}/tests/fixtures/sample_manifest.csv +0 -0
  120. {sql_glider-0.1.11 → sql_glider-0.1.12}/tests/sqlglider/__init__.py +0 -0
  121. {sql_glider-0.1.11 → sql_glider-0.1.12}/tests/sqlglider/catalog/__init__.py +0 -0
  122. {sql_glider-0.1.11 → sql_glider-0.1.12}/tests/sqlglider/catalog/test_base.py +0 -0
  123. {sql_glider-0.1.11 → sql_glider-0.1.12}/tests/sqlglider/catalog/test_databricks.py +0 -0
  124. {sql_glider-0.1.11 → sql_glider-0.1.12}/tests/sqlglider/catalog/test_registry.py +0 -0
  125. {sql_glider-0.1.11 → sql_glider-0.1.12}/tests/sqlglider/dissection/__init__.py +0 -0
  126. {sql_glider-0.1.11 → sql_glider-0.1.12}/tests/sqlglider/dissection/test_analyzer.py +0 -0
  127. {sql_glider-0.1.11 → sql_glider-0.1.12}/tests/sqlglider/dissection/test_formatters.py +0 -0
  128. {sql_glider-0.1.11 → sql_glider-0.1.12}/tests/sqlglider/dissection/test_models.py +0 -0
  129. {sql_glider-0.1.11 → sql_glider-0.1.12}/tests/sqlglider/graph/__init__.py +0 -0
  130. {sql_glider-0.1.11 → sql_glider-0.1.12}/tests/sqlglider/graph/test_merge.py +0 -0
  131. {sql_glider-0.1.11 → sql_glider-0.1.12}/tests/sqlglider/graph/test_models.py +0 -0
  132. {sql_glider-0.1.11 → sql_glider-0.1.12}/tests/sqlglider/graph/test_query.py +0 -0
  133. {sql_glider-0.1.11 → sql_glider-0.1.12}/tests/sqlglider/graph/test_serialization.py +0 -0
  134. {sql_glider-0.1.11 → sql_glider-0.1.12}/tests/sqlglider/lineage/__init__.py +0 -0
  135. {sql_glider-0.1.11 → sql_glider-0.1.12}/tests/sqlglider/lineage/test_formatters.py +0 -0
  136. {sql_glider-0.1.11 → sql_glider-0.1.12}/tests/sqlglider/templating/__init__.py +0 -0
  137. {sql_glider-0.1.11 → sql_glider-0.1.12}/tests/sqlglider/templating/test_base.py +0 -0
  138. {sql_glider-0.1.11 → sql_glider-0.1.12}/tests/sqlglider/templating/test_jinja.py +0 -0
  139. {sql_glider-0.1.11 → sql_glider-0.1.12}/tests/sqlglider/templating/test_registry.py +0 -0
  140. {sql_glider-0.1.11 → sql_glider-0.1.12}/tests/sqlglider/templating/test_variables.py +0 -0
  141. {sql_glider-0.1.11 → sql_glider-0.1.12}/tests/sqlglider/utils/__init__.py +0 -0
  142. {sql_glider-0.1.11 → sql_glider-0.1.12}/tests/sqlglider/utils/test_config.py +0 -0
  143. {sql_glider-0.1.11 → sql_glider-0.1.12}/tests/sqlglider/utils/test_file_utils.py +0 -0
  144. {sql_glider-0.1.11 → sql_glider-0.1.12}/uv.lock +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: sql-glider
3
- Version: 0.1.11
3
+ Version: 0.1.12
4
4
  Summary: SQL Utility Toolkit for better understanding, use, and governance of your queries in a native environment.
5
5
  Project-URL: Homepage, https://github.com/rycowhi/sql-glider/
6
6
  Project-URL: Repository, https://github.com/rycowhi/sql-glider/
@@ -0,0 +1,47 @@
1
+ # Add `--no-star` Flag
2
+
3
+ **Status:** Completed
4
+
5
+ ## Overview
6
+ Add `--no-star` flag to `lineage` and `graph build` commands. When set, analysis fails if `SELECT *` or `t.*` cannot be resolved to actual columns.
7
+
8
+ ## Changes
9
+
10
+ ### 1. `src/sqlglider/utils/config.py` — Add to ConfigSettings
11
+ - [x] Add `no_star: Optional[bool] = None`
12
+
13
+ ### 2. `src/sqlglider/lineage/analyzer.py` — Add parameter + enforce
14
+ - [x] Add `no_star: bool = False` to `__init__`, store as `self._no_star`
15
+ - [x] Add `StarResolutionError` exception class (distinct from `ValueError` so that it is not swallowed by the skipped-query handler)
16
+ - [x] DML/DDL path: raise `StarResolutionError` before fallback for bare `*` and `t.*`
17
+ - [x] DQL path: add star handling for both bare `*` and `t.*` with same error behavior
18
+ - [x] Re-raise `StarResolutionError` in `analyze_queries` instead of treating as skipped query
19
+
20
+ ### 3. `src/sqlglider/graph/builder.py` — Pass through
21
+ - [x] Add `no_star: bool = False` to `__init__`, store as `self.no_star`
22
+ - [x] Pass to `LineageAnalyzer(sql_content, dialect=file_dialect, no_star=self.no_star)`
23
+
24
+ ### 4. `src/sqlglider/cli.py` — Add CLI options
25
+ - [x] `lineage` command: Add `no_star: bool = typer.Option(False, "--no-star", ...)`
26
+ - [x] Resolve: `no_star = no_star or config.no_star or False`
27
+ - [x] Pass to `LineageAnalyzer(sql, dialect=dialect, no_star=no_star)`
28
+ - [x] `graph_build` command: same option, passed to `GraphBuilder(..., no_star=no_star)`
29
+
30
+ ### 5. `tests/sqlglider/lineage/test_analyzer.py` — Tests
31
+ - [x] Test bare `SELECT *` with `no_star=True` raises `StarResolutionError`
32
+ - [x] Test `SELECT t.*` with `no_star=True` raises `StarResolutionError`
33
+ - [x] Test resolvable star (via CTE) still works with `no_star=True`
34
+ - [x] Test resolvable qualified star (via CTE) still works with `no_star=True`
35
+ - [x] Test default (`no_star=False`) still falls back to `table.*`
36
+
37
+ ## Implementation Notes
38
+
39
+ ### Deviations from original plan
40
+ - Used `StarResolutionError` instead of `ValueError` because `analyze_queries` catches `ValueError` to handle unsupported statement types (skipped queries). A plain `ValueError` would be silently swallowed.
41
+ - Added star handling in the DQL (plain SELECT) code path in addition to the DML/DDL path. The original plan only addressed the DML/DDL path, but plain `SELECT *` queries go through a different branch in `get_output_columns`.
42
+ - Resolvable star tests use CTEs instead of `CREATE TABLE` with explicit columns, since `_extract_schema_from_statement` only handles `CREATE ... AS SELECT`, not DDL with column definitions.
43
+
44
+ ## Verification
45
+ - `uv run pytest` — 597 passed, 1 skipped, coverage 80.48%
46
+ - `uv run basedpyright src/` — 0 errors
47
+ - `uv run ruff check` — all checks passed
@@ -0,0 +1,49 @@
1
+ # `--resolve-schema` Flag and Catalog Integration
2
+
3
+ **Status:** Completed
4
+
5
+ ## Overview
6
+
7
+ Add `--resolve-schema` flag to `graph build` that runs a two-pass process: first extracting schema from all files, then running lineage analysis with the full schema available. Optionally, `--catalog-type` fills schema gaps by pulling DDL from a remote catalog.
8
+
9
+ ## Design Decisions
10
+
11
+ - **Types are not required** — SQLGlot only needs column names for star expansion; types are stored as `"UNKNOWN"`
12
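+ - **Two-pass approach** — Pass 1 extracts schema from all files, Pass 2 runs lineage with the full schema, so lineage no longer depends on file order (except for the cross-file `SELECT *` chains described under Known Limitations)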
13
+ - **Opt-in via `--resolve-schema`** — default behavior unchanged
14
+ - **Catalog fills gaps only** — file-derived schema always wins over catalog-sourced schema
15
+ - **`--catalog-type` requires `--resolve-schema`** — validated at CLI level
16
+
17
+ ## Implementation
18
+
19
+ - [x] Add `schema` param to `LineageAnalyzer.__init__()` — pre-populates `_file_schema`
20
+ - [x] Add `extract_schema_only()` and `get_extracted_schema()` methods to `LineageAnalyzer`
21
+ - [x] Create `src/sqlglider/utils/schema.py` with `parse_ddl_to_schema()` for DDL column extraction
22
+ - [x] Add `resolve_schema`, `catalog_type`, `catalog_config` to `GraphBuilder`
23
+ - [x] Implement `_extract_schemas()` for pass 1 and `_fill_schema_from_catalog()` for catalog gap-filling
24
+ - [x] Two-pass flow in `add_files()` and `add_manifest()`
25
+ - [x] Add `--resolve-schema` and `--catalog-type` CLI flags to `graph build`
26
+ - [x] Add `resolve_schema` to `ConfigSettings`
27
+ - [x] Tests: 25 new tests (schema parsing, analyzer schema param, cross-file resolution, catalog mocking)
28
+
29
+ ## Files Modified
30
+
31
+ - `src/sqlglider/lineage/analyzer.py` — schema param, extraction methods
32
+ - `src/sqlglider/graph/builder.py` — two-pass processing, catalog integration
33
+ - `src/sqlglider/cli.py` — CLI flags
34
+ - `src/sqlglider/utils/config.py` — config setting
35
+ - `src/sqlglider/utils/schema.py` — **new** DDL parsing utility
36
+ - `tests/sqlglider/utils/test_schema.py` — **new**
37
+ - `tests/sqlglider/graph/test_builder.py` — resolve schema + catalog tests
38
+ - `tests/sqlglider/lineage/test_analyzer.py` — schema param tests
39
+
40
+ ## Verification
41
+
42
+ - 617 passed, 1 skipped
43
+ - Coverage: 80.43%
44
+ - basedpyright: 0 errors
45
+ - ruff: all checks passed
46
+
47
+ ## Known Limitations
48
+
49
+ - Cross-file CTAS chains with `SELECT *` (view B depends on view A via star) may not resolve if both are in separate files and the schema extraction pass processes B before A. This is rare in practice.
@@ -28,7 +28,7 @@ version_tuple: VERSION_TUPLE
28
28
  commit_id: COMMIT_ID
29
29
  __commit_id__: COMMIT_ID
30
30
 
31
- __version__ = version = '0.1.11'
32
- __version_tuple__ = version_tuple = (0, 1, 11)
31
+ __version__ = version = '0.1.12'
32
+ __version_tuple__ = version_tuple = (0, 1, 12)
33
33
 
34
34
  __commit_id__ = commit_id = None
@@ -12,7 +12,7 @@ from sqlglot.errors import ParseError
12
12
  from typing_extensions import Annotated
13
13
 
14
14
  from sqlglider.global_models import AnalysisLevel, NodeFormat
15
- from sqlglider.lineage.analyzer import LineageAnalyzer
15
+ from sqlglider.lineage.analyzer import LineageAnalyzer, SchemaResolutionError
16
16
  from sqlglider.lineage.formatters import (
17
17
  CsvFormatter,
18
18
  JsonFormatter,
@@ -1001,6 +1001,35 @@ def graph_build(
1001
1001
  "--no-star",
1002
1002
  help="Fail if SELECT * cannot be resolved to actual columns",
1003
1003
  ),
1004
+ resolve_schema: bool = typer.Option(
1005
+ False,
1006
+ "--resolve-schema",
1007
+ help="Extract schema from all files before lineage analysis, "
1008
+ "enabling cross-file star resolution",
1009
+ ),
1010
+ catalog_type: Optional[str] = typer.Option(
1011
+ None,
1012
+ "--catalog-type",
1013
+ "-c",
1014
+ help="Catalog provider for pulling DDL of tables not found in files "
1015
+ "(requires --resolve-schema). E.g. 'databricks'",
1016
+ ),
1017
+ dump_schema: Optional[Path] = typer.Option(
1018
+ None,
1019
+ "--dump-schema",
1020
+ help="Dump resolved schema to file (requires --resolve-schema)",
1021
+ ),
1022
+ dump_schema_format: Optional[str] = typer.Option(
1023
+ None,
1024
+ "--dump-schema-format",
1025
+ help="Format for dumped schema: 'text' (default), 'json', or 'csv'",
1026
+ ),
1027
+ strict_schema: bool = typer.Option(
1028
+ False,
1029
+ "--strict-schema",
1030
+ help="Fail if any column's table cannot be identified during schema extraction "
1031
+ "(requires --resolve-schema)",
1032
+ ),
1004
1033
  ) -> None:
1005
1034
  """
1006
1035
  Build a lineage graph from SQL files.
@@ -1036,6 +1065,37 @@ def graph_build(
1036
1065
  dialect = dialect or config.dialect or "spark"
1037
1066
  templater = templater or config.templater # None means no templating
1038
1067
  no_star = no_star or config.no_star or False
1068
+ resolve_schema = resolve_schema or config.resolve_schema or False
1069
+ strict_schema = strict_schema or config.strict_schema or False
1070
+
1071
+ if strict_schema and not resolve_schema:
1072
+ err_console.print("[red]Error:[/red] --strict-schema requires --resolve-schema")
1073
+ raise typer.Exit(1)
1074
+
1075
+ if catalog_type and not resolve_schema:
1076
+ err_console.print("[red]Error:[/red] --catalog-type requires --resolve-schema")
1077
+ raise typer.Exit(1)
1078
+
1079
+ # Resolve dump_schema options from config
1080
+ dump_schema = dump_schema or (
1081
+ Path(config.dump_schema) if config.dump_schema else None
1082
+ )
1083
+ dump_schema_format = dump_schema_format or config.dump_schema_format or "text"
1084
+
1085
+ if dump_schema and not resolve_schema:
1086
+ err_console.print("[red]Error:[/red] --dump-schema requires --resolve-schema")
1087
+ raise typer.Exit(1)
1088
+
1089
+ if dump_schema_format not in ("text", "json", "csv"):
1090
+ err_console.print(
1091
+ f"[red]Error:[/red] Invalid --dump-schema-format '{dump_schema_format}'. "
1092
+ "Use 'text', 'json', or 'csv'."
1093
+ )
1094
+ raise typer.Exit(1)
1095
+
1096
+ # Only inherit catalog_type from config when resolve_schema is active
1097
+ if resolve_schema and not catalog_type:
1098
+ catalog_type = config.catalog_type
1039
1099
 
1040
1100
  # Validate and convert node format to enum
1041
1101
  try:
@@ -1088,11 +1148,22 @@ def graph_build(
1088
1148
  sql_preprocessor = _preprocess
1089
1149
 
1090
1150
  try:
1151
+ # Build catalog config from config file if available
1152
+ catalog_config_dict = None
1153
+ if catalog_type and config.catalog:
1154
+ provider_config = getattr(config.catalog, catalog_type, None)
1155
+ if provider_config:
1156
+ catalog_config_dict = provider_config.model_dump(exclude_none=True)
1157
+
1091
1158
  builder = GraphBuilder(
1092
1159
  node_format=node_format_enum,
1093
1160
  dialect=dialect,
1094
1161
  sql_preprocessor=sql_preprocessor,
1095
1162
  no_star=no_star,
1163
+ resolve_schema=resolve_schema,
1164
+ catalog_type=catalog_type,
1165
+ catalog_config=catalog_config_dict,
1166
+ strict_schema=strict_schema,
1096
1167
  )
1097
1168
 
1098
1169
  # Process manifest if provided
@@ -1115,6 +1186,17 @@ def graph_build(
1115
1186
  raise typer.Exit(1)
1116
1187
  builder.add_files(all_files, dialect=dialect)
1117
1188
 
1189
+ # Dump resolved schema if requested
1190
+ if dump_schema:
1191
+ from sqlglider.graph.formatters import format_schema
1192
+
1193
+ schema_content = format_schema(builder.resolved_schema, dump_schema_format)
1194
+ dump_schema.write_text(schema_content, encoding="utf-8")
1195
+ console.print(
1196
+ f"[green]Schema dumped to {dump_schema} "
1197
+ f"({len(builder.resolved_schema)} table(s))[/green]"
1198
+ )
1199
+
1118
1200
  # Build and save graph
1119
1201
  graph = builder.build()
1120
1202
  save_graph(graph, output)
@@ -1124,6 +1206,10 @@ def graph_build(
1124
1206
  f"({graph.metadata.total_nodes} nodes, {graph.metadata.total_edges} edges)"
1125
1207
  )
1126
1208
 
1209
+ except SchemaResolutionError as e:
1210
+ err_console.print(f"[red]Error:[/red] {e}")
1211
+ raise typer.Exit(1)
1212
+
1127
1213
  except FileNotFoundError as e:
1128
1214
  err_console.print(f"[red]Error:[/red] {e}")
1129
1215
  raise typer.Exit(1)
@@ -16,8 +16,9 @@ from sqlglider.graph.models import (
16
16
  LineageGraph,
17
17
  Manifest,
18
18
  )
19
- from sqlglider.lineage.analyzer import LineageAnalyzer
19
+ from sqlglider.lineage.analyzer import LineageAnalyzer, SchemaResolutionError
20
20
  from sqlglider.utils.file_utils import read_sql_file
21
+ from sqlglider.utils.schema import parse_ddl_to_schema
21
22
 
22
23
  console = Console(stderr=True)
23
24
 
@@ -34,6 +35,10 @@ class GraphBuilder:
34
35
  dialect: str = "spark",
35
36
  sql_preprocessor: Optional[SqlPreprocessor] = None,
36
37
  no_star: bool = False,
38
+ resolve_schema: bool = False,
39
+ catalog_type: Optional[str] = None,
40
+ catalog_config: Optional[Dict[str, object]] = None,
41
+ strict_schema: bool = False,
37
42
  ):
38
43
  """
39
44
  Initialize the graph builder.
@@ -45,16 +50,31 @@ class GraphBuilder:
45
50
  Takes (sql: str, file_path: Path) and returns processed SQL.
46
51
  Useful for templating (e.g., Jinja2 rendering).
47
52
  no_star: If True, fail when SELECT * cannot be resolved to columns
53
+ resolve_schema: If True, run a schema extraction pass across all
54
+ files before lineage analysis so that schema from any file is
55
+ available when analyzing every other file.
56
+ catalog_type: Optional catalog provider name (e.g. "databricks").
57
+ When set together with resolve_schema, DDL is pulled from the
58
+ catalog for tables whose schema could not be inferred from files.
59
+ catalog_config: Optional provider-specific configuration dict
60
+ passed to the catalog's configure() method.
61
+ strict_schema: If True, fail during schema extraction when an
62
+ unqualified column cannot be attributed to a table.
48
63
  """
49
64
  self.node_format = node_format
50
65
  self.dialect = dialect
51
66
  self.sql_preprocessor = sql_preprocessor
52
67
  self.no_star = no_star
68
+ self.resolve_schema = resolve_schema
69
+ self.catalog_type = catalog_type
70
+ self.catalog_config = catalog_config
71
+ self.strict_schema = strict_schema
53
72
  self.graph: rx.PyDiGraph = rx.PyDiGraph()
54
73
  self._node_index_map: Dict[str, int] = {} # identifier -> rustworkx node index
55
74
  self._source_files: Set[str] = set()
56
75
  self._edge_set: Set[tuple] = set() # (source, target) for dedup
57
76
  self._skipped_files: List[tuple[str, str]] = [] # (file_path, reason)
77
+ self._resolved_schema: Dict[str, Dict[str, str]] = {} # accumulated schema
58
78
 
59
79
  def add_file(
60
80
  self,
@@ -86,7 +106,10 @@ class GraphBuilder:
86
106
  sql_content = self.sql_preprocessor(sql_content, file_path)
87
107
 
88
108
  analyzer = LineageAnalyzer(
89
- sql_content, dialect=file_dialect, no_star=self.no_star
109
+ sql_content,
110
+ dialect=file_dialect,
111
+ no_star=self.no_star,
112
+ schema=self._resolved_schema if self._resolved_schema else None,
90
113
  )
91
114
  results = analyzer.analyze_queries(level=AnalysisLevel.COLUMN)
92
115
 
@@ -209,23 +232,37 @@ class GraphBuilder:
209
232
  entry_dialect = entry.dialect or dialect or self.dialect
210
233
  files_with_dialects.append((file_path, entry_dialect))
211
234
 
212
- # Process with progress
213
- if files_with_dialects:
214
- total = len(files_with_dialects)
215
- with Progress(
216
- TextColumn("[progress.description]{task.description}"),
217
- BarColumn(),
218
- TaskProgressColumn(),
219
- console=console,
220
- transient=False,
221
- ) as progress:
222
- task = progress.add_task("Parsing", total=total)
223
- for i, (file_path, file_dialect) in enumerate(
224
- files_with_dialects, start=1
225
- ):
226
- console.print(f"Parsing file {i}/{total}: {file_path.name}")
227
- self.add_file(file_path, file_dialect)
228
- progress.advance(task)
235
+ if not files_with_dialects:
236
+ return self
237
+
238
+ # Two-pass schema resolution
239
+ if self.resolve_schema:
240
+ console.print("[blue]Pass 1: Extracting schema from files[/blue]")
241
+ file_paths_only = [fp for fp, _ in files_with_dialects]
242
+ self._resolved_schema = self._extract_schemas(file_paths_only, dialect)
243
+ if self.catalog_type:
244
+ self._resolved_schema = self._fill_schema_from_catalog(
245
+ self._resolved_schema, file_paths_only, dialect
246
+ )
247
+ console.print(
248
+ f"[blue]Schema resolved for "
249
+ f"{len(self._resolved_schema)} table(s)[/blue]"
250
+ )
251
+
252
+ total = len(files_with_dialects)
253
+ description = "Pass 2: Analyzing lineage" if self.resolve_schema else "Parsing"
254
+ with Progress(
255
+ TextColumn("[progress.description]{task.description}"),
256
+ BarColumn(),
257
+ TaskProgressColumn(),
258
+ console=console,
259
+ transient=False,
260
+ ) as progress:
261
+ task = progress.add_task(description, total=total)
262
+ for i, (file_path, file_dialect) in enumerate(files_with_dialects, start=1):
263
+ console.print(f"Parsing file {i}/{total}: {file_path.name}")
264
+ self.add_file(file_path, file_dialect)
265
+ progress.advance(task)
229
266
 
230
267
  return self
231
268
 
@@ -249,8 +286,24 @@ class GraphBuilder:
249
286
  if not file_paths:
250
287
  return self
251
288
 
289
+ # Two-pass schema resolution: extract schema from all files first
290
+ if self.resolve_schema:
291
+ console.print("[blue]Pass 1: Extracting schema from files[/blue]")
292
+ self._resolved_schema = self._extract_schemas(file_paths, dialect)
293
+ if self.catalog_type:
294
+ self._resolved_schema = self._fill_schema_from_catalog(
295
+ self._resolved_schema, file_paths, dialect
296
+ )
297
+ console.print(
298
+ f"[blue]Schema resolved for "
299
+ f"{len(self._resolved_schema)} table(s)[/blue]"
300
+ )
301
+
252
302
  if show_progress:
253
303
  total = len(file_paths)
304
+ description = (
305
+ "Pass 2: Analyzing lineage" if self.resolve_schema else "Parsing"
306
+ )
254
307
  with Progress(
255
308
  TextColumn("[progress.description]{task.description}"),
256
309
  BarColumn(),
@@ -258,7 +311,7 @@ class GraphBuilder:
258
311
  console=console,
259
312
  transient=False,
260
313
  ) as progress:
261
- task = progress.add_task("Parsing", total=total)
314
+ task = progress.add_task(description, total=total)
262
315
  for i, file_path in enumerate(file_paths, start=1):
263
316
  console.print(f"Parsing file {i}/{total}: {file_path.name}")
264
317
  self.add_file(file_path, dialect)
@@ -268,6 +321,129 @@ class GraphBuilder:
268
321
  self.add_file(file_path, dialect)
269
322
  return self
270
323
 
324
def _extract_schemas(
    self,
    file_paths: List[Path],
    dialect: Optional[str] = None,
) -> Dict[str, Dict[str, str]]:
    """Collect schema from every file in a dedicated first pass.

    Each file is read, optionally run through ``self.sql_preprocessor``,
    and handed to a ``LineageAnalyzer`` whose ``extract_schema_only()``
    result is merged into one accumulating dictionary. The schema
    gathered so far is passed to each analyzer, so later files are
    analyzed with the schema of the earlier ones available.

    Args:
        file_paths: SQL files to extract schema from
        dialect: SQL dialect override; falls back to ``self.dialect``
            when not provided

    Returns:
        Accumulated schema dict from all files

    Raises:
        SchemaResolutionError: Re-raised unchanged when raised while
            processing a file. Every other exception is swallowed.
    """
    accumulated: Dict[str, Dict[str, str]] = {}
    file_count = len(file_paths)
    effective_dialect = dialect or self.dialect
    progress_columns = (
        TextColumn("[progress.description]{task.description}"),
        BarColumn(),
        TaskProgressColumn(),
    )
    with Progress(
        *progress_columns,
        console=console,
        transient=False,
    ) as progress:
        task_id = progress.add_task("Pass 1: Extracting schema", total=file_count)
        for position, sql_path in enumerate(file_paths, start=1):
            console.print(
                f"Extracting schema {position}/{file_count}: {sql_path.name}"
            )
            try:
                sql_text = read_sql_file(sql_path)
                if self.sql_preprocessor:
                    sql_text = self.sql_preprocessor(sql_text, sql_path)
                extracted = LineageAnalyzer(
                    sql_text,
                    dialect=effective_dialect,
                    schema=accumulated,
                    strict_schema=self.strict_schema,
                ).extract_schema_only()
                accumulated.update(extracted)
            except SchemaResolutionError:
                # Schema resolution errors are the one failure that must
                # reach the caller.
                raise
            except Exception:
                # Schema extraction failures are non-fatal; the file
                # will be reported during the lineage pass if it also fails.
                pass
            # Not reached when SchemaResolutionError propagates.
            progress.advance(task_id)
    return accumulated
374
+
375
def _fill_schema_from_catalog(
    self,
    schema: Dict[str, Dict[str, str]],
    file_paths: List[Path],
    dialect: Optional[str] = None,
) -> Dict[str, Dict[str, str]]:
    """Pull DDL from catalog for tables not yet in schema.

    Collects the non-CTE table names reported by
    ``LineageAnalyzer.analyze_tables()`` for every file, determines which
    of them have no entry in ``schema``, requests their DDL from the
    configured catalog in one batch, and adds the parsed columns to
    ``schema``. Entries already present in ``schema`` are never
    overwritten.

    Args:
        schema: Schema dict already populated from file extraction.
            It is updated in place and also returned.
        file_paths: SQL files to scan for table references
        dialect: SQL dialect override; falls back to ``self.dialect``
            when not provided

    Returns:
        The same ``schema`` dict with catalog-sourced entries added
    """
    from sqlglider.catalog import get_catalog
    from sqlglider.lineage.analyzer import ObjectType

    catalog = get_catalog(self.catalog_type)  # type: ignore[arg-type]
    if self.catalog_config:
        catalog.configure(self.catalog_config)

    # The effective dialect is the same for every file and for DDL
    # parsing, so resolve it once.
    file_dialect = dialect or self.dialect

    # Collect all referenced table names across files
    all_tables: Set[str] = set()
    for file_path in file_paths:
        try:
            sql_content = read_sql_file(file_path)
            if self.sql_preprocessor:
                sql_content = self.sql_preprocessor(sql_content, file_path)
            analyzer = LineageAnalyzer(sql_content, dialect=file_dialect)
            for result in analyzer.analyze_tables():
                for table_info in result.tables:
                    # Skip CTEs — they don't exist in catalogs
                    if table_info.object_type != ObjectType.CTE:
                        all_tables.add(table_info.name)
        except Exception:
            # Best effort: a file that cannot be read or analyzed here
            # simply contributes no table names to the catalog lookup.
            continue

    # Find tables missing from schema. Sorting makes the batch request
    # (and the reported count) independent of set iteration order.
    missing = sorted(name for name in all_tables if name not in schema)
    if not missing:
        return schema

    console.print(
        f"[blue]Pulling DDL from {self.catalog_type} "
        f"for {len(missing)} table(s)...[/blue]"
    )

    ddl_results = catalog.get_ddl_batch(missing)
    for table_name, ddl in ddl_results.items():
        # A DDL value starting with "ERROR:" marks a failed lookup.
        if ddl.startswith("ERROR:"):
            console.print(
                f"[yellow]Warning:[/yellow] Could not pull DDL "
                f"for {table_name}: {ddl}"
            )
            continue
        parsed_schema = parse_ddl_to_schema(ddl, dialect=file_dialect)
        for name, cols in parsed_schema.items():
            # Schema extracted from the files takes precedence.
            if name not in schema:
                schema[name] = cols

    return schema
446
+
271
447
  def _ensure_node(
272
448
  self,
273
449
  identifier: str,
@@ -348,6 +524,11 @@ class GraphBuilder:
348
524
  """Get mapping from node identifiers to rustworkx indices."""
349
525
  return self._node_index_map.copy()
350
526
 
527
@property
def resolved_schema(self) -> Dict[str, Dict[str, str]]:
    """Get a copy of the resolved schema from the schema extraction pass.

    Both the outer table mapping and each per-table column mapping are
    copied, so mutating the returned value cannot alter the builder's
    internal ``_resolved_schema``.

    Returns:
        New dictionary mapping each table name to a new dictionary of
        its columns.
    """
    return {
        table_name: dict(columns)
        for table_name, columns in self._resolved_schema.items()
    }
531
+
351
532
  @property
352
533
  def skipped_files(self) -> List[tuple[str, str]]:
353
534
  """Get list of files that were skipped during graph building."""
@@ -0,0 +1,98 @@
1
+ """Output formatters for resolved schema data."""
2
+
3
+ import csv
4
+ import json
5
+ from io import StringIO
6
+ from typing import Dict
7
+
8
+ SchemaDict = Dict[str, Dict[str, str]]
9
+
10
+
11
def format_schema_text(schema: SchemaDict) -> str:
    """Render the resolved schema as plain, human-readable text.

    Tables appear in sorted order. Each table name is on its own line,
    followed by its column names (also sorted), one per line and prefixed
    with a single space. Consecutive tables are separated by one blank
    line, and any non-empty result ends with a newline.

    Output format:
        customers
         id
         name

        schema.orders
         customer_id
         order_id

    Args:
        schema: Resolved schema dictionary mapping table names to column dicts.

    Returns:
        Text-formatted string; empty when ``schema`` has no tables.
    """
    table_blocks = [
        "\n".join([table, *(f" {column}" for column in sorted(schema[table]))])
        for table in sorted(schema)
    ]
    if not table_blocks:
        return ""
    # A blank line between tables is a double newline between blocks.
    return "\n\n".join(table_blocks) + "\n"
37
+
38
+
39
def format_schema_json(schema: SchemaDict) -> str:
    """Serialize the resolved schema to an indented JSON document.

    Tables are emitted in sorted name order. The column mapping of each
    table is passed through as-is, so columns keep the order they have in
    the input.

    Args:
        schema: Resolved schema dictionary mapping table names to column dicts.

    Returns:
        JSON-formatted string using a two-space indent.
    """
    tables_by_name = dict(sorted(schema.items(), key=lambda entry: entry[0]))
    return json.dumps(tables_by_name, indent=2)
50
+
51
+
52
def format_schema_csv(schema: SchemaDict) -> str:
    """Render the resolved schema as CSV, one row per column.

    A ``table,column,type`` header row is always written. Data rows are
    ordered by table name and then by column name.

    Output format:
        table,column,type
        customers,id,UNKNOWN
        customers,name,UNKNOWN

    Args:
        schema: Resolved schema dictionary mapping table names to column dicts.

    Returns:
        CSV-formatted string (header only when ``schema`` is empty).
    """
    buffer = StringIO()
    csv_writer = csv.writer(buffer)
    csv_writer.writerow(["table", "column", "type"])
    csv_writer.writerows(
        [table, column, columns[column]]
        for table, columns in sorted(schema.items(), key=lambda entry: entry[0])
        for column in sorted(columns)
    )
    return buffer.getvalue()
73
+
74
+
75
def format_schema(schema: SchemaDict, output_format: str = "text") -> str:
    """Render the resolved schema using the formatter named by ``output_format``.

    Args:
        schema: Resolved schema dictionary.
        output_format: One of "text", "json", or "csv".

    Returns:
        Formatted string produced by the selected formatter.

    Raises:
        ValueError: If output_format is not recognized.
    """
    dispatch = {
        "text": format_schema_text,
        "json": format_schema_json,
        "csv": format_schema_csv,
    }
    if output_format not in dispatch:
        raise ValueError(
            f"Invalid schema format '{output_format}'. Use 'text', 'json', or 'csv'."
        )
    render = dispatch[output_format]
    return render(schema)