sql-glider 0.1.10__tar.gz → 0.1.12__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {sql_glider-0.1.10 → sql_glider-0.1.12}/PKG-INFO +1 -1
- sql_glider-0.1.12/plans/2026-01-28-sparksql-table-extraction.md +58 -0
- sql_glider-0.1.12/plans/2026-01-29-no-star-flag.md +47 -0
- sql_glider-0.1.12/plans/2026-01-29-resolve-schema.md +49 -0
- {sql_glider-0.1.10 → sql_glider-0.1.12}/src/sqlglider/_version.py +2 -2
- {sql_glider-0.1.10 → sql_glider-0.1.12}/src/sqlglider/cli.py +101 -2
- {sql_glider-0.1.10 → sql_glider-0.1.12}/src/sqlglider/graph/builder.py +206 -20
- sql_glider-0.1.12/src/sqlglider/graph/formatters.py +98 -0
- {sql_glider-0.1.10 → sql_glider-0.1.12}/src/sqlglider/lineage/analyzer.py +217 -3
- {sql_glider-0.1.10 → sql_glider-0.1.12}/src/sqlglider/utils/config.py +5 -0
- sql_glider-0.1.12/src/sqlglider/utils/schema.py +62 -0
- {sql_glider-0.1.10 → sql_glider-0.1.12}/tests/sqlglider/graph/test_builder.py +211 -0
- sql_glider-0.1.12/tests/sqlglider/graph/test_formatters.py +86 -0
- {sql_glider-0.1.10 → sql_glider-0.1.12}/tests/sqlglider/lineage/test_analyzer.py +211 -1
- {sql_glider-0.1.10 → sql_glider-0.1.12}/tests/sqlglider/test_cli.py +172 -0
- sql_glider-0.1.12/tests/sqlglider/utils/test_schema.py +55 -0
- {sql_glider-0.1.10 → sql_glider-0.1.12}/.github/workflows/ci.yml +0 -0
- {sql_glider-0.1.10 → sql_glider-0.1.12}/.github/workflows/publish.yml +0 -0
- {sql_glider-0.1.10 → sql_glider-0.1.12}/.gitignore +0 -0
- {sql_glider-0.1.10 → sql_glider-0.1.12}/.python-version +0 -0
- {sql_glider-0.1.10 → sql_glider-0.1.12}/ARCHITECTURE.md +0 -0
- {sql_glider-0.1.10 → sql_glider-0.1.12}/CLAUDE.md +0 -0
- {sql_glider-0.1.10 → sql_glider-0.1.12}/LICENSE +0 -0
- {sql_glider-0.1.10 → sql_glider-0.1.12}/README.md +0 -0
- {sql_glider-0.1.10 → sql_glider-0.1.12}/plans/2025-12-05-column-level-lineage.md +0 -0
- {sql_glider-0.1.10 → sql_glider-0.1.12}/plans/2025-12-05-reverse-lineage.md +0 -0
- {sql_glider-0.1.10 → sql_glider-0.1.12}/plans/2025-12-06-config-file-support.md +0 -0
- {sql_glider-0.1.10 → sql_glider-0.1.12}/plans/2025-12-06-graph-lineage.md +0 -0
- {sql_glider-0.1.10 → sql_glider-0.1.12}/plans/2025-12-06-unify-single-multi-query.md +0 -0
- {sql_glider-0.1.10 → sql_glider-0.1.12}/plans/2025-12-07-sample-data-model.md +0 -0
- {sql_glider-0.1.10 → sql_glider-0.1.12}/plans/2025-12-07-sql-templating.md +0 -0
- {sql_glider-0.1.10 → sql_glider-0.1.12}/plans/2025-12-08-tables-command.md +0 -0
- {sql_glider-0.1.10 → sql_glider-0.1.12}/plans/2025-12-09-graph-query-paths.md +0 -0
- {sql_glider-0.1.10 → sql_glider-0.1.12}/plans/2025-12-13-dissect-command.md +0 -0
- {sql_glider-0.1.10 → sql_glider-0.1.12}/plans/2025-12-14-tables-pull-command.md +0 -0
- {sql_glider-0.1.10 → sql_glider-0.1.12}/plans/2026-01-25-fix-union-lineage-chain.md +0 -0
- {sql_glider-0.1.10 → sql_glider-0.1.12}/plans/2026-01-26-file-scoped-schema-context.md +0 -0
- {sql_glider-0.1.10 → sql_glider-0.1.12}/pyproject.toml +0 -0
- {sql_glider-0.1.10 → sql_glider-0.1.12}/sample_data_model/README.md +0 -0
- {sql_glider-0.1.10 → sql_glider-0.1.12}/sample_data_model/business/expire_dim_customer.sql +0 -0
- {sql_glider-0.1.10 → sql_glider-0.1.12}/sample_data_model/business/load_fact_orders.sql +0 -0
- {sql_glider-0.1.10 → sql_glider-0.1.12}/sample_data_model/business/load_fact_payments.sql +0 -0
- {sql_glider-0.1.10 → sql_glider-0.1.12}/sample_data_model/business/merge_dim_customer.sql +0 -0
- {sql_glider-0.1.10 → sql_glider-0.1.12}/sample_data_model/business/merge_dim_product.sql +0 -0
- {sql_glider-0.1.10 → sql_glider-0.1.12}/sample_data_model/business/update_dim_customer_metrics.sql +0 -0
- {sql_glider-0.1.10 → sql_glider-0.1.12}/sample_data_model/complex/conditional_merge.sql +0 -0
- {sql_glider-0.1.10 → sql_glider-0.1.12}/sample_data_model/complex/cte_insert.sql +0 -0
- {sql_glider-0.1.10 → sql_glider-0.1.12}/sample_data_model/complex/multi_table_transform.sql +0 -0
- {sql_glider-0.1.10 → sql_glider-0.1.12}/sample_data_model/ddl/dim_customer.sql +0 -0
- {sql_glider-0.1.10 → sql_glider-0.1.12}/sample_data_model/ddl/dim_product.sql +0 -0
- {sql_glider-0.1.10 → sql_glider-0.1.12}/sample_data_model/ddl/fact_orders.sql +0 -0
- {sql_glider-0.1.10 → sql_glider-0.1.12}/sample_data_model/ddl/fact_payments.sql +0 -0
- {sql_glider-0.1.10 → sql_glider-0.1.12}/sample_data_model/ddl/raw_addresses.sql +0 -0
- {sql_glider-0.1.10 → sql_glider-0.1.12}/sample_data_model/ddl/raw_customers.sql +0 -0
- {sql_glider-0.1.10 → sql_glider-0.1.12}/sample_data_model/ddl/raw_order_items.sql +0 -0
- {sql_glider-0.1.10 → sql_glider-0.1.12}/sample_data_model/ddl/raw_orders.sql +0 -0
- {sql_glider-0.1.10 → sql_glider-0.1.12}/sample_data_model/ddl/raw_payments.sql +0 -0
- {sql_glider-0.1.10 → sql_glider-0.1.12}/sample_data_model/ddl/raw_products.sql +0 -0
- {sql_glider-0.1.10 → sql_glider-0.1.12}/sample_data_model/ddl/stg_customers.sql +0 -0
- {sql_glider-0.1.10 → sql_glider-0.1.12}/sample_data_model/ddl/stg_orders.sql +0 -0
- {sql_glider-0.1.10 → sql_glider-0.1.12}/sample_data_model/ddl/stg_payments.sql +0 -0
- {sql_glider-0.1.10 → sql_glider-0.1.12}/sample_data_model/ddl/stg_products.sql +0 -0
- {sql_glider-0.1.10 → sql_glider-0.1.12}/sample_data_model/incremental/incr_fact_orders.sql +0 -0
- {sql_glider-0.1.10 → sql_glider-0.1.12}/sample_data_model/incremental/incr_fact_payments.sql +0 -0
- {sql_glider-0.1.10 → sql_glider-0.1.12}/sample_data_model/incremental/incr_pres_sales_summary.sql +0 -0
- {sql_glider-0.1.10 → sql_glider-0.1.12}/sample_data_model/maintenance/delete_expired_customers.sql +0 -0
- {sql_glider-0.1.10 → sql_glider-0.1.12}/sample_data_model/maintenance/update_product_status.sql +0 -0
- {sql_glider-0.1.10 → sql_glider-0.1.12}/sample_data_model/presentation/load_pres_customer_360.sql +0 -0
- {sql_glider-0.1.10 → sql_glider-0.1.12}/sample_data_model/presentation/load_pres_customer_cohort.sql +0 -0
- {sql_glider-0.1.10 → sql_glider-0.1.12}/sample_data_model/presentation/load_pres_product_performance.sql +0 -0
- {sql_glider-0.1.10 → sql_glider-0.1.12}/sample_data_model/presentation/load_pres_sales_summary.sql +0 -0
- {sql_glider-0.1.10 → sql_glider-0.1.12}/sample_data_model/staging/load_stg_customers.sql +0 -0
- {sql_glider-0.1.10 → sql_glider-0.1.12}/sample_data_model/staging/load_stg_orders.sql +0 -0
- {sql_glider-0.1.10 → sql_glider-0.1.12}/sample_data_model/staging/load_stg_payments.sql +0 -0
- {sql_glider-0.1.10 → sql_glider-0.1.12}/sample_data_model/staging/load_stg_products.sql +0 -0
- {sql_glider-0.1.10 → sql_glider-0.1.12}/sqlglider.toml.example +0 -0
- {sql_glider-0.1.10 → sql_glider-0.1.12}/src/sqlglider/__init__.py +0 -0
- {sql_glider-0.1.10 → sql_glider-0.1.12}/src/sqlglider/catalog/__init__.py +0 -0
- {sql_glider-0.1.10 → sql_glider-0.1.12}/src/sqlglider/catalog/base.py +0 -0
- {sql_glider-0.1.10 → sql_glider-0.1.12}/src/sqlglider/catalog/databricks.py +0 -0
- {sql_glider-0.1.10 → sql_glider-0.1.12}/src/sqlglider/catalog/registry.py +0 -0
- {sql_glider-0.1.10 → sql_glider-0.1.12}/src/sqlglider/dissection/__init__.py +0 -0
- {sql_glider-0.1.10 → sql_glider-0.1.12}/src/sqlglider/dissection/analyzer.py +0 -0
- {sql_glider-0.1.10 → sql_glider-0.1.12}/src/sqlglider/dissection/formatters.py +0 -0
- {sql_glider-0.1.10 → sql_glider-0.1.12}/src/sqlglider/dissection/models.py +0 -0
- {sql_glider-0.1.10 → sql_glider-0.1.12}/src/sqlglider/global_models.py +0 -0
- {sql_glider-0.1.10 → sql_glider-0.1.12}/src/sqlglider/graph/__init__.py +0 -0
- {sql_glider-0.1.10 → sql_glider-0.1.12}/src/sqlglider/graph/merge.py +0 -0
- {sql_glider-0.1.10 → sql_glider-0.1.12}/src/sqlglider/graph/models.py +0 -0
- {sql_glider-0.1.10 → sql_glider-0.1.12}/src/sqlglider/graph/query.py +0 -0
- {sql_glider-0.1.10 → sql_glider-0.1.12}/src/sqlglider/graph/serialization.py +0 -0
- {sql_glider-0.1.10 → sql_glider-0.1.12}/src/sqlglider/lineage/__init__.py +0 -0
- {sql_glider-0.1.10 → sql_glider-0.1.12}/src/sqlglider/lineage/formatters.py +0 -0
- {sql_glider-0.1.10 → sql_glider-0.1.12}/src/sqlglider/templating/__init__.py +0 -0
- {sql_glider-0.1.10 → sql_glider-0.1.12}/src/sqlglider/templating/base.py +0 -0
- {sql_glider-0.1.10 → sql_glider-0.1.12}/src/sqlglider/templating/jinja.py +0 -0
- {sql_glider-0.1.10 → sql_glider-0.1.12}/src/sqlglider/templating/registry.py +0 -0
- {sql_glider-0.1.10 → sql_glider-0.1.12}/src/sqlglider/templating/variables.py +0 -0
- {sql_glider-0.1.10 → sql_glider-0.1.12}/src/sqlglider/utils/__init__.py +0 -0
- {sql_glider-0.1.10 → sql_glider-0.1.12}/src/sqlglider/utils/file_utils.py +0 -0
- {sql_glider-0.1.10 → sql_glider-0.1.12}/tests/__init__.py +0 -0
- {sql_glider-0.1.10 → sql_glider-0.1.12}/tests/fixtures/multi_file_queries/analytics_pipeline.sql +0 -0
- {sql_glider-0.1.10 → sql_glider-0.1.12}/tests/fixtures/multi_file_queries/analytics_pipeline_union_merge.sql +0 -0
- {sql_glider-0.1.10 → sql_glider-0.1.12}/tests/fixtures/multi_file_queries/customers.sql +0 -0
- {sql_glider-0.1.10 → sql_glider-0.1.12}/tests/fixtures/multi_file_queries/orders.sql +0 -0
- {sql_glider-0.1.10 → sql_glider-0.1.12}/tests/fixtures/multi_file_queries/reports.sql +0 -0
- {sql_glider-0.1.10 → sql_glider-0.1.12}/tests/fixtures/multi_file_queries/view_based_merge.sql +0 -0
- {sql_glider-0.1.10 → sql_glider-0.1.12}/tests/fixtures/original_queries/test_cte.sql +0 -0
- {sql_glider-0.1.10 → sql_glider-0.1.12}/tests/fixtures/original_queries/test_cte_query.sql +0 -0
- {sql_glider-0.1.10 → sql_glider-0.1.12}/tests/fixtures/original_queries/test_cte_view_star.sql +0 -0
- {sql_glider-0.1.10 → sql_glider-0.1.12}/tests/fixtures/original_queries/test_generated_column_query.sql +0 -0
- {sql_glider-0.1.10 → sql_glider-0.1.12}/tests/fixtures/original_queries/test_multi.sql +0 -0
- {sql_glider-0.1.10 → sql_glider-0.1.12}/tests/fixtures/original_queries/test_multi_query.sql +0 -0
- {sql_glider-0.1.10 → sql_glider-0.1.12}/tests/fixtures/original_queries/test_single_query.sql +0 -0
- {sql_glider-0.1.10 → sql_glider-0.1.12}/tests/fixtures/original_queries/test_subquery.sql +0 -0
- {sql_glider-0.1.10 → sql_glider-0.1.12}/tests/fixtures/original_queries/test_tables.sql +0 -0
- {sql_glider-0.1.10 → sql_glider-0.1.12}/tests/fixtures/original_queries/test_view.sql +0 -0
- {sql_glider-0.1.10 → sql_glider-0.1.12}/tests/fixtures/original_queries/test_view_window_cte.sql +0 -0
- {sql_glider-0.1.10 → sql_glider-0.1.12}/tests/fixtures/sample_manifest.csv +0 -0
- {sql_glider-0.1.10 → sql_glider-0.1.12}/tests/sqlglider/__init__.py +0 -0
- {sql_glider-0.1.10 → sql_glider-0.1.12}/tests/sqlglider/catalog/__init__.py +0 -0
- {sql_glider-0.1.10 → sql_glider-0.1.12}/tests/sqlglider/catalog/test_base.py +0 -0
- {sql_glider-0.1.10 → sql_glider-0.1.12}/tests/sqlglider/catalog/test_databricks.py +0 -0
- {sql_glider-0.1.10 → sql_glider-0.1.12}/tests/sqlglider/catalog/test_registry.py +0 -0
- {sql_glider-0.1.10 → sql_glider-0.1.12}/tests/sqlglider/dissection/__init__.py +0 -0
- {sql_glider-0.1.10 → sql_glider-0.1.12}/tests/sqlglider/dissection/test_analyzer.py +0 -0
- {sql_glider-0.1.10 → sql_glider-0.1.12}/tests/sqlglider/dissection/test_formatters.py +0 -0
- {sql_glider-0.1.10 → sql_glider-0.1.12}/tests/sqlglider/dissection/test_models.py +0 -0
- {sql_glider-0.1.10 → sql_glider-0.1.12}/tests/sqlglider/graph/__init__.py +0 -0
- {sql_glider-0.1.10 → sql_glider-0.1.12}/tests/sqlglider/graph/test_merge.py +0 -0
- {sql_glider-0.1.10 → sql_glider-0.1.12}/tests/sqlglider/graph/test_models.py +0 -0
- {sql_glider-0.1.10 → sql_glider-0.1.12}/tests/sqlglider/graph/test_query.py +0 -0
- {sql_glider-0.1.10 → sql_glider-0.1.12}/tests/sqlglider/graph/test_serialization.py +0 -0
- {sql_glider-0.1.10 → sql_glider-0.1.12}/tests/sqlglider/lineage/__init__.py +0 -0
- {sql_glider-0.1.10 → sql_glider-0.1.12}/tests/sqlglider/lineage/test_formatters.py +0 -0
- {sql_glider-0.1.10 → sql_glider-0.1.12}/tests/sqlglider/templating/__init__.py +0 -0
- {sql_glider-0.1.10 → sql_glider-0.1.12}/tests/sqlglider/templating/test_base.py +0 -0
- {sql_glider-0.1.10 → sql_glider-0.1.12}/tests/sqlglider/templating/test_jinja.py +0 -0
- {sql_glider-0.1.10 → sql_glider-0.1.12}/tests/sqlglider/templating/test_registry.py +0 -0
- {sql_glider-0.1.10 → sql_glider-0.1.12}/tests/sqlglider/templating/test_variables.py +0 -0
- {sql_glider-0.1.10 → sql_glider-0.1.12}/tests/sqlglider/utils/__init__.py +0 -0
- {sql_glider-0.1.10 → sql_glider-0.1.12}/tests/sqlglider/utils/test_config.py +0 -0
- {sql_glider-0.1.10 → sql_glider-0.1.12}/tests/sqlglider/utils/test_file_utils.py +0 -0
- {sql_glider-0.1.10 → sql_glider-0.1.12}/uv.lock +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: sql-glider
|
|
3
|
-
Version: 0.1.10
|
|
3
|
+
Version: 0.1.12
|
|
4
4
|
Summary: SQL Utility Toolkit for better understanding, use, and governance of your queries in a native environment.
|
|
5
5
|
Project-URL: Homepage, https://github.com/rycowhi/sql-glider/
|
|
6
6
|
Project-URL: Repository, https://github.com/rycowhi/sql-glider/
|
|
@@ -0,0 +1,58 @@
|
|
|
1
|
+
# SparkSQL Table Extraction for Unsupported Statement Types
|
|
2
|
+
|
|
3
|
+
**Status:** Planned
|
|
4
|
+
**Date:** 2026-01-28
|
|
5
|
+
|
|
6
|
+
## Overview
|
|
7
|
+
|
|
8
|
+
The `analyze_tables()` method in `LineageAnalyzer` currently extracts tables from a subset of statement types (SELECT, INSERT, CREATE, DELETE, DROP, TRUNCATE, CACHE). Several SparkSQL-specific statement types reference tables but are not captured during table extraction. This plan adds table extraction support for these missing types.
|
|
9
|
+
|
|
10
|
+
Column lineage is **not affected** — these statements contain no SELECT and cannot produce column-level lineage. The goal is to ensure `sqlglider tables overview` reports all tables referenced in a SQL file.
|
|
11
|
+
|
|
12
|
+
## Statements to Add
|
|
13
|
+
|
|
14
|
+
| Statement | SQLGlot Expression | Table Location | Proposed Usage |
|
|
15
|
+
|-----------|-------------------|----------------|----------------|
|
|
16
|
+
| `UNCACHE TABLE t` | `exp.Uncache` | `expr.this` | `INPUT` |
|
|
17
|
+
| `REFRESH TABLE t` | `exp.Refresh` | `expr.this` | `INPUT` |
|
|
18
|
+
| `LOAD DATA INPATH '...' INTO TABLE t` | `exp.LoadData` | `expr.this` | `OUTPUT` |
|
|
19
|
+
| `ALTER TABLE t ...` | `exp.Alter` | `expr.this` | `OUTPUT` |
|
|
20
|
+
| `ANALYZE TABLE t COMPUTE STATISTICS` | `exp.Analyze` | `expr.this` | `INPUT` |
|
|
21
|
+
|
|
22
|
+
### Usage Rationale
|
|
23
|
+
|
|
24
|
+
- **UNCACHE / REFRESH / ANALYZE**: Read-oriented metadata operations on an existing table → `INPUT`
|
|
25
|
+
- **LOAD DATA**: Writes data into a table → `OUTPUT`
|
|
26
|
+
- **ALTER TABLE**: Modifies table structure → `OUTPUT`
|
|
27
|
+
|
|
28
|
+
## Implementation Steps
|
|
29
|
+
|
|
30
|
+
- [ ] Add extraction logic to `_get_target_table_info()` in [analyzer.py](src/sqlglider/lineage/analyzer.py) for each new expression type
|
|
31
|
+
- [ ] Add each type to the `_get_statement_type()` type_map for readable skip messages
|
|
32
|
+
- [ ] Add entries to `_is_target_table()` where applicable (LOAD DATA, ALTER)
|
|
33
|
+
- [ ] Ensure `_get_target_and_select()` returns `None` gracefully for these types (they have no SELECT)
|
|
34
|
+
- [ ] Add unit tests in [test_analyzer.py](tests/sqlglider/lineage/test_analyzer.py):
|
|
35
|
+
- Table extraction returns correct table name and usage for each type
|
|
36
|
+
- Column lineage correctly skips these with appropriate message
|
|
37
|
+
- Parameterized test covering all five statement types
|
|
38
|
+
- [ ] Verify graph build handles these gracefully (skipped queries warning)
|
|
39
|
+
- [ ] Run full test suite and coverage check
|
|
40
|
+
|
|
41
|
+
## Files to Modify
|
|
42
|
+
|
|
43
|
+
- `src/sqlglider/lineage/analyzer.py` — extraction logic
|
|
44
|
+
- `tests/sqlglider/lineage/test_analyzer.py` — unit tests
|
|
45
|
+
|
|
46
|
+
## Testing Strategy
|
|
47
|
+
|
|
48
|
+
- Parameterized tests with SparkSQL syntax for each statement type
|
|
49
|
+
- Verify `analyze_tables()` returns correct table name, usage, and object type
|
|
50
|
+
- Verify `analyze_queries()` adds these to `skipped_queries` with clear reason
|
|
51
|
+
- Ensure no regressions in existing tests
|
|
52
|
+
- Coverage threshold (80%) maintained
|
|
53
|
+
|
|
54
|
+
## Notes
|
|
55
|
+
|
|
56
|
+
- These are all parsed by sqlglot's Spark dialect parser, so no custom parsing is needed
|
|
57
|
+
- Some of these (SHOW, DESCRIBE, EXPLAIN) parse as `exp.Command` — those are intentionally excluded since they don't reference tables in a structured way
|
|
58
|
+
- INSERT OVERWRITE and multi-INSERT patterns may warrant separate investigation
|
|
@@ -0,0 +1,47 @@
|
|
|
1
|
+
# Add `--no-star` Flag
|
|
2
|
+
|
|
3
|
+
**Status:** Completed
|
|
4
|
+
|
|
5
|
+
## Overview
|
|
6
|
+
Add `--no-star` flag to `lineage` and `graph build` commands. When set, analysis fails if `SELECT *` or `t.*` cannot be resolved to actual columns.
|
|
7
|
+
|
|
8
|
+
## Changes
|
|
9
|
+
|
|
10
|
+
### 1. `src/sqlglider/utils/config.py` — Add to ConfigSettings
|
|
11
|
+
- [x] Add `no_star: Optional[bool] = None`
|
|
12
|
+
|
|
13
|
+
### 2. `src/sqlglider/lineage/analyzer.py` — Add parameter + enforce
|
|
14
|
+
- [x] Add `no_star: bool = False` to `__init__`, store as `self._no_star`
|
|
15
|
+
- [x] Add `StarResolutionError` exception class (distinct from `ValueError` to avoid being swallowed by skipped-query handler)
|
|
16
|
+
- [x] DML/DDL path: raise `StarResolutionError` before fallback for bare `*` and `t.*`
|
|
17
|
+
- [x] DQL path: add star handling for both bare `*` and `t.*` with same error behavior
|
|
18
|
+
- [x] Re-raise `StarResolutionError` in `analyze_queries` instead of treating as skipped query
|
|
19
|
+
|
|
20
|
+
### 3. `src/sqlglider/graph/builder.py` — Pass through
|
|
21
|
+
- [x] Add `no_star: bool = False` to `__init__`, store as `self.no_star`
|
|
22
|
+
- [x] Pass to `LineageAnalyzer(sql_content, dialect=file_dialect, no_star=self.no_star)`
|
|
23
|
+
|
|
24
|
+
### 4. `src/sqlglider/cli.py` — Add CLI options
|
|
25
|
+
- [x] `lineage` command: Add `no_star: bool = typer.Option(False, "--no-star", ...)`
|
|
26
|
+
- [x] Resolve: `no_star = no_star or config.no_star or False`
|
|
27
|
+
- [x] Pass to `LineageAnalyzer(sql, dialect=dialect, no_star=no_star)`
|
|
28
|
+
- [x] `graph_build` command: same option, passed to `GraphBuilder(..., no_star=no_star)`
|
|
29
|
+
|
|
30
|
+
### 5. `tests/sqlglider/lineage/test_analyzer.py` — Tests
|
|
31
|
+
- [x] Test bare `SELECT *` with `no_star=True` raises `StarResolutionError`
|
|
32
|
+
- [x] Test `SELECT t.*` with `no_star=True` raises `StarResolutionError`
|
|
33
|
+
- [x] Test resolvable star (via CTE) still works with `no_star=True`
|
|
34
|
+
- [x] Test resolvable qualified star (via CTE) still works with `no_star=True`
|
|
35
|
+
- [x] Test default (`no_star=False`) still falls back to `table.*`
|
|
36
|
+
|
|
37
|
+
## Implementation Notes
|
|
38
|
+
|
|
39
|
+
### Deviations from original plan
|
|
40
|
+
- Used `StarResolutionError` instead of `ValueError` because `analyze_queries` catches `ValueError` to handle unsupported statement types (skipped queries). A plain `ValueError` would be silently swallowed.
|
|
41
|
+
- Added star handling in the DQL (plain SELECT) code path in addition to the DML/DDL path. The original plan only addressed the DML/DDL path, but plain `SELECT *` queries go through a different branch in `get_output_columns`.
|
|
42
|
+
- Resolvable star tests use CTEs instead of `CREATE TABLE` with explicit columns, since `_extract_schema_from_statement` only handles `CREATE ... AS SELECT`, not DDL with column definitions.
|
|
43
|
+
|
|
44
|
+
## Verification
|
|
45
|
+
- `uv run pytest` — 597 passed, 1 skipped, coverage 80.48%
|
|
46
|
+
- `uv run basedpyright src/` — 0 errors
|
|
47
|
+
- `uv run ruff check` — all checks passed
|
|
@@ -0,0 +1,49 @@
|
|
|
1
|
+
# `--resolve-schema` Flag and Catalog Integration
|
|
2
|
+
|
|
3
|
+
**Status:** Completed
|
|
4
|
+
|
|
5
|
+
## Overview
|
|
6
|
+
|
|
7
|
+
Add `--resolve-schema` flag to `graph build` that runs a two-pass process: first extracting schema from all files, then running lineage analysis with the full schema available. Optionally, `--catalog-type` fills schema gaps by pulling DDL from a remote catalog.
|
|
8
|
+
|
|
9
|
+
## Design Decisions
|
|
10
|
+
|
|
11
|
+
- **Types are not required** — SQLGlot only needs column names for star expansion; types are stored as `"UNKNOWN"`
|
|
12
|
+
- **Two-pass approach** — Pass 1 extracts schema from all files (order-independent), Pass 2 runs lineage with full schema
|
|
13
|
+
- **Opt-in via `--resolve-schema`** — default behavior unchanged
|
|
14
|
+
- **Catalog fills gaps only** — file-derived schema always wins over catalog-sourced schema
|
|
15
|
+
- **`--catalog-type` requires `--resolve-schema`** — validated at CLI level
|
|
16
|
+
|
|
17
|
+
## Implementation
|
|
18
|
+
|
|
19
|
+
- [x] Add `schema` param to `LineageAnalyzer.__init__()` — pre-populates `_file_schema`
|
|
20
|
+
- [x] Add `extract_schema_only()` and `get_extracted_schema()` methods to `LineageAnalyzer`
|
|
21
|
+
- [x] Create `src/sqlglider/utils/schema.py` with `parse_ddl_to_schema()` for DDL column extraction
|
|
22
|
+
- [x] Add `resolve_schema`, `catalog_type`, `catalog_config` to `GraphBuilder`
|
|
23
|
+
- [x] Implement `_extract_schemas()` for pass 1 and `_fill_schema_from_catalog()` for catalog gap-filling
|
|
24
|
+
- [x] Two-pass flow in `add_files()` and `add_manifest()`
|
|
25
|
+
- [x] Add `--resolve-schema` and `--catalog-type` CLI flags to `graph build`
|
|
26
|
+
- [x] Add `resolve_schema` to `ConfigSettings`
|
|
27
|
+
- [x] Tests: 25 new tests (schema parsing, analyzer schema param, cross-file resolution, catalog mocking)
|
|
28
|
+
|
|
29
|
+
## Files Modified
|
|
30
|
+
|
|
31
|
+
- `src/sqlglider/lineage/analyzer.py` — schema param, extraction methods
|
|
32
|
+
- `src/sqlglider/graph/builder.py` — two-pass processing, catalog integration
|
|
33
|
+
- `src/sqlglider/cli.py` — CLI flags
|
|
34
|
+
- `src/sqlglider/utils/config.py` — config setting
|
|
35
|
+
- `src/sqlglider/utils/schema.py` — **new** DDL parsing utility
|
|
36
|
+
- `tests/sqlglider/utils/test_schema.py` — **new**
|
|
37
|
+
- `tests/sqlglider/graph/test_builder.py` — resolve schema + catalog tests
|
|
38
|
+
- `tests/sqlglider/lineage/test_analyzer.py` — schema param tests
|
|
39
|
+
|
|
40
|
+
## Verification
|
|
41
|
+
|
|
42
|
+
- 617 passed, 1 skipped
|
|
43
|
+
- Coverage: 80.43%
|
|
44
|
+
- basedpyright: 0 errors
|
|
45
|
+
- ruff: all checks passed
|
|
46
|
+
|
|
47
|
+
## Known Limitations
|
|
48
|
+
|
|
49
|
+
- Cross-file CTAS chains with `SELECT *` (view B depends on view A via star) may not resolve if both are in separate files and the schema extraction pass processes B before A. This is rare in practice.
|
|
@@ -28,7 +28,7 @@ version_tuple: VERSION_TUPLE
|
|
|
28
28
|
commit_id: COMMIT_ID
|
|
29
29
|
__commit_id__: COMMIT_ID
|
|
30
30
|
|
|
31
|
-
__version__ = version = '0.1.10'
|
|
32
|
-
__version_tuple__ = version_tuple = (0, 1, 10)
|
|
31
|
+
__version__ = version = '0.1.12'
|
|
32
|
+
__version_tuple__ = version_tuple = (0, 1, 12)
|
|
33
33
|
|
|
34
34
|
__commit_id__ = commit_id = None
|
|
@@ -12,7 +12,7 @@ from sqlglot.errors import ParseError
|
|
|
12
12
|
from typing_extensions import Annotated
|
|
13
13
|
|
|
14
14
|
from sqlglider.global_models import AnalysisLevel, NodeFormat
|
|
15
|
-
from sqlglider.lineage.analyzer import LineageAnalyzer
|
|
15
|
+
from sqlglider.lineage.analyzer import LineageAnalyzer, SchemaResolutionError
|
|
16
16
|
from sqlglider.lineage.formatters import (
|
|
17
17
|
CsvFormatter,
|
|
18
18
|
JsonFormatter,
|
|
@@ -166,6 +166,11 @@ def lineage(
|
|
|
166
166
|
exists=True,
|
|
167
167
|
help="Path to variables file (JSON or YAML)",
|
|
168
168
|
),
|
|
169
|
+
no_star: bool = typer.Option(
|
|
170
|
+
False,
|
|
171
|
+
"--no-star",
|
|
172
|
+
help="Fail if SELECT * cannot be resolved to actual columns",
|
|
173
|
+
),
|
|
169
174
|
) -> None:
|
|
170
175
|
"""
|
|
171
176
|
Analyze column or table lineage for a SQL file.
|
|
@@ -207,6 +212,7 @@ def lineage(
|
|
|
207
212
|
level_str = level or config.level or "column"
|
|
208
213
|
output_format = output_format or config.output_format or "text"
|
|
209
214
|
templater = templater or config.templater # None means no templating
|
|
215
|
+
no_star = no_star or config.no_star or False
|
|
210
216
|
# Validate and convert level to enum
|
|
211
217
|
try:
|
|
212
218
|
analysis_level = AnalysisLevel(level_str)
|
|
@@ -261,7 +267,7 @@ def lineage(
|
|
|
261
267
|
)
|
|
262
268
|
|
|
263
269
|
# Create analyzer
|
|
264
|
-
analyzer = LineageAnalyzer(sql, dialect=dialect)
|
|
270
|
+
analyzer = LineageAnalyzer(sql, dialect=dialect, no_star=no_star)
|
|
265
271
|
|
|
266
272
|
# Unified lineage analysis (handles both single and multi-query files)
|
|
267
273
|
results = analyzer.analyze_queries(
|
|
@@ -990,6 +996,40 @@ def graph_build(
|
|
|
990
996
|
exists=True,
|
|
991
997
|
help="Path to variables file (JSON or YAML)",
|
|
992
998
|
),
|
|
999
|
+
no_star: bool = typer.Option(
|
|
1000
|
+
False,
|
|
1001
|
+
"--no-star",
|
|
1002
|
+
help="Fail if SELECT * cannot be resolved to actual columns",
|
|
1003
|
+
),
|
|
1004
|
+
resolve_schema: bool = typer.Option(
|
|
1005
|
+
False,
|
|
1006
|
+
"--resolve-schema",
|
|
1007
|
+
help="Extract schema from all files before lineage analysis, "
|
|
1008
|
+
"enabling cross-file star resolution",
|
|
1009
|
+
),
|
|
1010
|
+
catalog_type: Optional[str] = typer.Option(
|
|
1011
|
+
None,
|
|
1012
|
+
"--catalog-type",
|
|
1013
|
+
"-c",
|
|
1014
|
+
help="Catalog provider for pulling DDL of tables not found in files "
|
|
1015
|
+
"(requires --resolve-schema). E.g. 'databricks'",
|
|
1016
|
+
),
|
|
1017
|
+
dump_schema: Optional[Path] = typer.Option(
|
|
1018
|
+
None,
|
|
1019
|
+
"--dump-schema",
|
|
1020
|
+
help="Dump resolved schema to file (requires --resolve-schema)",
|
|
1021
|
+
),
|
|
1022
|
+
dump_schema_format: Optional[str] = typer.Option(
|
|
1023
|
+
None,
|
|
1024
|
+
"--dump-schema-format",
|
|
1025
|
+
help="Format for dumped schema: 'text' (default), 'json', or 'csv'",
|
|
1026
|
+
),
|
|
1027
|
+
strict_schema: bool = typer.Option(
|
|
1028
|
+
False,
|
|
1029
|
+
"--strict-schema",
|
|
1030
|
+
help="Fail if any column's table cannot be identified during schema extraction "
|
|
1031
|
+
"(requires --resolve-schema)",
|
|
1032
|
+
),
|
|
993
1033
|
) -> None:
|
|
994
1034
|
"""
|
|
995
1035
|
Build a lineage graph from SQL files.
|
|
@@ -1024,6 +1064,38 @@ def graph_build(
|
|
|
1024
1064
|
config = load_config()
|
|
1025
1065
|
dialect = dialect or config.dialect or "spark"
|
|
1026
1066
|
templater = templater or config.templater # None means no templating
|
|
1067
|
+
no_star = no_star or config.no_star or False
|
|
1068
|
+
resolve_schema = resolve_schema or config.resolve_schema or False
|
|
1069
|
+
strict_schema = strict_schema or config.strict_schema or False
|
|
1070
|
+
|
|
1071
|
+
if strict_schema and not resolve_schema:
|
|
1072
|
+
err_console.print("[red]Error:[/red] --strict-schema requires --resolve-schema")
|
|
1073
|
+
raise typer.Exit(1)
|
|
1074
|
+
|
|
1075
|
+
if catalog_type and not resolve_schema:
|
|
1076
|
+
err_console.print("[red]Error:[/red] --catalog-type requires --resolve-schema")
|
|
1077
|
+
raise typer.Exit(1)
|
|
1078
|
+
|
|
1079
|
+
# Resolve dump_schema options from config
|
|
1080
|
+
dump_schema = dump_schema or (
|
|
1081
|
+
Path(config.dump_schema) if config.dump_schema else None
|
|
1082
|
+
)
|
|
1083
|
+
dump_schema_format = dump_schema_format or config.dump_schema_format or "text"
|
|
1084
|
+
|
|
1085
|
+
if dump_schema and not resolve_schema:
|
|
1086
|
+
err_console.print("[red]Error:[/red] --dump-schema requires --resolve-schema")
|
|
1087
|
+
raise typer.Exit(1)
|
|
1088
|
+
|
|
1089
|
+
if dump_schema_format not in ("text", "json", "csv"):
|
|
1090
|
+
err_console.print(
|
|
1091
|
+
f"[red]Error:[/red] Invalid --dump-schema-format '{dump_schema_format}'. "
|
|
1092
|
+
"Use 'text', 'json', or 'csv'."
|
|
1093
|
+
)
|
|
1094
|
+
raise typer.Exit(1)
|
|
1095
|
+
|
|
1096
|
+
# Only inherit catalog_type from config when resolve_schema is active
|
|
1097
|
+
if resolve_schema and not catalog_type:
|
|
1098
|
+
catalog_type = config.catalog_type
|
|
1027
1099
|
|
|
1028
1100
|
# Validate and convert node format to enum
|
|
1029
1101
|
try:
|
|
@@ -1076,10 +1148,22 @@ def graph_build(
|
|
|
1076
1148
|
sql_preprocessor = _preprocess
|
|
1077
1149
|
|
|
1078
1150
|
try:
|
|
1151
|
+
# Build catalog config from config file if available
|
|
1152
|
+
catalog_config_dict = None
|
|
1153
|
+
if catalog_type and config.catalog:
|
|
1154
|
+
provider_config = getattr(config.catalog, catalog_type, None)
|
|
1155
|
+
if provider_config:
|
|
1156
|
+
catalog_config_dict = provider_config.model_dump(exclude_none=True)
|
|
1157
|
+
|
|
1079
1158
|
builder = GraphBuilder(
|
|
1080
1159
|
node_format=node_format_enum,
|
|
1081
1160
|
dialect=dialect,
|
|
1082
1161
|
sql_preprocessor=sql_preprocessor,
|
|
1162
|
+
no_star=no_star,
|
|
1163
|
+
resolve_schema=resolve_schema,
|
|
1164
|
+
catalog_type=catalog_type,
|
|
1165
|
+
catalog_config=catalog_config_dict,
|
|
1166
|
+
strict_schema=strict_schema,
|
|
1083
1167
|
)
|
|
1084
1168
|
|
|
1085
1169
|
# Process manifest if provided
|
|
@@ -1102,6 +1186,17 @@ def graph_build(
|
|
|
1102
1186
|
raise typer.Exit(1)
|
|
1103
1187
|
builder.add_files(all_files, dialect=dialect)
|
|
1104
1188
|
|
|
1189
|
+
# Dump resolved schema if requested
|
|
1190
|
+
if dump_schema:
|
|
1191
|
+
from sqlglider.graph.formatters import format_schema
|
|
1192
|
+
|
|
1193
|
+
schema_content = format_schema(builder.resolved_schema, dump_schema_format)
|
|
1194
|
+
dump_schema.write_text(schema_content, encoding="utf-8")
|
|
1195
|
+
console.print(
|
|
1196
|
+
f"[green]Schema dumped to {dump_schema} "
|
|
1197
|
+
f"({len(builder.resolved_schema)} table(s))[/green]"
|
|
1198
|
+
)
|
|
1199
|
+
|
|
1105
1200
|
# Build and save graph
|
|
1106
1201
|
graph = builder.build()
|
|
1107
1202
|
save_graph(graph, output)
|
|
@@ -1111,6 +1206,10 @@ def graph_build(
|
|
|
1111
1206
|
f"({graph.metadata.total_nodes} nodes, {graph.metadata.total_edges} edges)"
|
|
1112
1207
|
)
|
|
1113
1208
|
|
|
1209
|
+
except SchemaResolutionError as e:
|
|
1210
|
+
err_console.print(f"[red]Error:[/red] {e}")
|
|
1211
|
+
raise typer.Exit(1)
|
|
1212
|
+
|
|
1114
1213
|
except FileNotFoundError as e:
|
|
1115
1214
|
err_console.print(f"[red]Error:[/red] {e}")
|
|
1116
1215
|
raise typer.Exit(1)
|
|
@@ -16,8 +16,9 @@ from sqlglider.graph.models import (
|
|
|
16
16
|
LineageGraph,
|
|
17
17
|
Manifest,
|
|
18
18
|
)
|
|
19
|
-
from sqlglider.lineage.analyzer import LineageAnalyzer
|
|
19
|
+
from sqlglider.lineage.analyzer import LineageAnalyzer, SchemaResolutionError
|
|
20
20
|
from sqlglider.utils.file_utils import read_sql_file
|
|
21
|
+
from sqlglider.utils.schema import parse_ddl_to_schema
|
|
21
22
|
|
|
22
23
|
console = Console(stderr=True)
|
|
23
24
|
|
|
@@ -33,6 +34,11 @@ class GraphBuilder:
|
|
|
33
34
|
node_format: NodeFormat = NodeFormat.QUALIFIED,
|
|
34
35
|
dialect: str = "spark",
|
|
35
36
|
sql_preprocessor: Optional[SqlPreprocessor] = None,
|
|
37
|
+
no_star: bool = False,
|
|
38
|
+
resolve_schema: bool = False,
|
|
39
|
+
catalog_type: Optional[str] = None,
|
|
40
|
+
catalog_config: Optional[Dict[str, object]] = None,
|
|
41
|
+
strict_schema: bool = False,
|
|
36
42
|
):
|
|
37
43
|
"""
|
|
38
44
|
Initialize the graph builder.
|
|
@@ -43,15 +49,32 @@ class GraphBuilder:
|
|
|
43
49
|
sql_preprocessor: Optional function to preprocess SQL before analysis.
|
|
44
50
|
Takes (sql: str, file_path: Path) and returns processed SQL.
|
|
45
51
|
Useful for templating (e.g., Jinja2 rendering).
|
|
52
|
+
no_star: If True, fail when SELECT * cannot be resolved to columns
|
|
53
|
+
resolve_schema: If True, run a schema extraction pass across all
|
|
54
|
+
files before lineage analysis so that schema from any file is
|
|
55
|
+
available when analyzing every other file.
|
|
56
|
+
catalog_type: Optional catalog provider name (e.g. "databricks").
|
|
57
|
+
When set together with resolve_schema, DDL is pulled from the
|
|
58
|
+
catalog for tables whose schema could not be inferred from files.
|
|
59
|
+
catalog_config: Optional provider-specific configuration dict
|
|
60
|
+
passed to the catalog's configure() method.
|
|
61
|
+
strict_schema: If True, fail during schema extraction when an
|
|
62
|
+
unqualified column cannot be attributed to a table.
|
|
46
63
|
"""
|
|
47
64
|
self.node_format = node_format
|
|
48
65
|
self.dialect = dialect
|
|
49
66
|
self.sql_preprocessor = sql_preprocessor
|
|
67
|
+
self.no_star = no_star
|
|
68
|
+
self.resolve_schema = resolve_schema
|
|
69
|
+
self.catalog_type = catalog_type
|
|
70
|
+
self.catalog_config = catalog_config
|
|
71
|
+
self.strict_schema = strict_schema
|
|
50
72
|
self.graph: rx.PyDiGraph = rx.PyDiGraph()
|
|
51
73
|
self._node_index_map: Dict[str, int] = {} # identifier -> rustworkx node index
|
|
52
74
|
self._source_files: Set[str] = set()
|
|
53
75
|
self._edge_set: Set[tuple] = set() # (source, target) for dedup
|
|
54
76
|
self._skipped_files: List[tuple[str, str]] = [] # (file_path, reason)
|
|
77
|
+
self._resolved_schema: Dict[str, Dict[str, str]] = {} # accumulated schema
|
|
55
78
|
|
|
56
79
|
def add_file(
|
|
57
80
|
self,
|
|
@@ -82,7 +105,12 @@ class GraphBuilder:
|
|
|
82
105
|
if self.sql_preprocessor:
|
|
83
106
|
sql_content = self.sql_preprocessor(sql_content, file_path)
|
|
84
107
|
|
|
85
|
-
analyzer = LineageAnalyzer(
|
|
108
|
+
analyzer = LineageAnalyzer(
|
|
109
|
+
sql_content,
|
|
110
|
+
dialect=file_dialect,
|
|
111
|
+
no_star=self.no_star,
|
|
112
|
+
schema=self._resolved_schema if self._resolved_schema else None,
|
|
113
|
+
)
|
|
86
114
|
results = analyzer.analyze_queries(level=AnalysisLevel.COLUMN)
|
|
87
115
|
|
|
88
116
|
# Print warnings for any skipped queries within the file
|
|
@@ -204,23 +232,37 @@ class GraphBuilder:
|
|
|
204
232
|
entry_dialect = entry.dialect or dialect or self.dialect
|
|
205
233
|
files_with_dialects.append((file_path, entry_dialect))
|
|
206
234
|
|
|
207
|
-
|
|
208
|
-
|
|
209
|
-
|
|
210
|
-
|
|
211
|
-
|
|
212
|
-
|
|
213
|
-
|
|
214
|
-
|
|
215
|
-
|
|
216
|
-
|
|
217
|
-
|
|
218
|
-
|
|
219
|
-
|
|
220
|
-
|
|
221
|
-
|
|
222
|
-
|
|
223
|
-
|
|
235
|
+
if not files_with_dialects:
|
|
236
|
+
return self
|
|
237
|
+
|
|
238
|
+
# Two-pass schema resolution
|
|
239
|
+
if self.resolve_schema:
|
|
240
|
+
console.print("[blue]Pass 1: Extracting schema from files[/blue]")
|
|
241
|
+
file_paths_only = [fp for fp, _ in files_with_dialects]
|
|
242
|
+
self._resolved_schema = self._extract_schemas(file_paths_only, dialect)
|
|
243
|
+
if self.catalog_type:
|
|
244
|
+
self._resolved_schema = self._fill_schema_from_catalog(
|
|
245
|
+
self._resolved_schema, file_paths_only, dialect
|
|
246
|
+
)
|
|
247
|
+
console.print(
|
|
248
|
+
f"[blue]Schema resolved for "
|
|
249
|
+
f"{len(self._resolved_schema)} table(s)[/blue]"
|
|
250
|
+
)
|
|
251
|
+
|
|
252
|
+
total = len(files_with_dialects)
|
|
253
|
+
description = "Pass 2: Analyzing lineage" if self.resolve_schema else "Parsing"
|
|
254
|
+
with Progress(
|
|
255
|
+
TextColumn("[progress.description]{task.description}"),
|
|
256
|
+
BarColumn(),
|
|
257
|
+
TaskProgressColumn(),
|
|
258
|
+
console=console,
|
|
259
|
+
transient=False,
|
|
260
|
+
) as progress:
|
|
261
|
+
task = progress.add_task(description, total=total)
|
|
262
|
+
for i, (file_path, file_dialect) in enumerate(files_with_dialects, start=1):
|
|
263
|
+
console.print(f"Parsing file {i}/{total}: {file_path.name}")
|
|
264
|
+
self.add_file(file_path, file_dialect)
|
|
265
|
+
progress.advance(task)
|
|
224
266
|
|
|
225
267
|
return self
|
|
226
268
|
|
|
@@ -244,8 +286,24 @@ class GraphBuilder:
|
|
|
244
286
|
if not file_paths:
|
|
245
287
|
return self
|
|
246
288
|
|
|
289
|
+
# Two-pass schema resolution: extract schema from all files first
|
|
290
|
+
if self.resolve_schema:
|
|
291
|
+
console.print("[blue]Pass 1: Extracting schema from files[/blue]")
|
|
292
|
+
self._resolved_schema = self._extract_schemas(file_paths, dialect)
|
|
293
|
+
if self.catalog_type:
|
|
294
|
+
self._resolved_schema = self._fill_schema_from_catalog(
|
|
295
|
+
self._resolved_schema, file_paths, dialect
|
|
296
|
+
)
|
|
297
|
+
console.print(
|
|
298
|
+
f"[blue]Schema resolved for "
|
|
299
|
+
f"{len(self._resolved_schema)} table(s)[/blue]"
|
|
300
|
+
)
|
|
301
|
+
|
|
247
302
|
if show_progress:
|
|
248
303
|
total = len(file_paths)
|
|
304
|
+
description = (
|
|
305
|
+
"Pass 2: Analyzing lineage" if self.resolve_schema else "Parsing"
|
|
306
|
+
)
|
|
249
307
|
with Progress(
|
|
250
308
|
TextColumn("[progress.description]{task.description}"),
|
|
251
309
|
BarColumn(),
|
|
@@ -253,7 +311,7 @@ class GraphBuilder:
|
|
|
253
311
|
console=console,
|
|
254
312
|
transient=False,
|
|
255
313
|
) as progress:
|
|
256
|
-
task = progress.add_task(
|
|
314
|
+
task = progress.add_task(description, total=total)
|
|
257
315
|
for i, file_path in enumerate(file_paths, start=1):
|
|
258
316
|
console.print(f"Parsing file {i}/{total}: {file_path.name}")
|
|
259
317
|
self.add_file(file_path, dialect)
|
|
@@ -263,6 +321,129 @@ class GraphBuilder:
|
|
|
263
321
|
self.add_file(file_path, dialect)
|
|
264
322
|
return self
|
|
265
323
|
|
|
324
|
+
def _extract_schemas(
|
|
325
|
+
self,
|
|
326
|
+
file_paths: List[Path],
|
|
327
|
+
dialect: Optional[str] = None,
|
|
328
|
+
) -> Dict[str, Dict[str, str]]:
|
|
329
|
+
"""Run schema extraction pass across all files.
|
|
330
|
+
|
|
331
|
+
Parses each file and extracts schema from CREATE TABLE/VIEW
|
|
332
|
+
statements without performing lineage analysis.
|
|
333
|
+
|
|
334
|
+
Args:
|
|
335
|
+
file_paths: SQL files to extract schema from
|
|
336
|
+
dialect: SQL dialect override
|
|
337
|
+
|
|
338
|
+
Returns:
|
|
339
|
+
Accumulated schema dict from all files
|
|
340
|
+
"""
|
|
341
|
+
schema: Dict[str, Dict[str, str]] = {}
|
|
342
|
+
total = len(file_paths)
|
|
343
|
+
with Progress(
|
|
344
|
+
TextColumn("[progress.description]{task.description}"),
|
|
345
|
+
BarColumn(),
|
|
346
|
+
TaskProgressColumn(),
|
|
347
|
+
console=console,
|
|
348
|
+
transient=False,
|
|
349
|
+
) as progress:
|
|
350
|
+
task = progress.add_task("Pass 1: Extracting schema", total=total)
|
|
351
|
+
for i, file_path in enumerate(file_paths, start=1):
|
|
352
|
+
console.print(f"Extracting schema {i}/{total}: {file_path.name}")
|
|
353
|
+
file_dialect = dialect or self.dialect
|
|
354
|
+
try:
|
|
355
|
+
sql_content = read_sql_file(file_path)
|
|
356
|
+
if self.sql_preprocessor:
|
|
357
|
+
sql_content = self.sql_preprocessor(sql_content, file_path)
|
|
358
|
+
analyzer = LineageAnalyzer(
|
|
359
|
+
sql_content,
|
|
360
|
+
dialect=file_dialect,
|
|
361
|
+
schema=schema,
|
|
362
|
+
strict_schema=self.strict_schema,
|
|
363
|
+
)
|
|
364
|
+
file_schema = analyzer.extract_schema_only()
|
|
365
|
+
schema.update(file_schema)
|
|
366
|
+
except SchemaResolutionError:
|
|
367
|
+
raise
|
|
368
|
+
except Exception:
|
|
369
|
+
# Schema extraction failures are non-fatal; the file
|
|
370
|
+
# will be reported during the lineage pass if it also fails.
|
|
371
|
+
pass
|
|
372
|
+
progress.advance(task)
|
|
373
|
+
return schema
|
|
374
|
+
|
|
375
|
+
def _fill_schema_from_catalog(
|
|
376
|
+
self,
|
|
377
|
+
schema: Dict[str, Dict[str, str]],
|
|
378
|
+
file_paths: List[Path],
|
|
379
|
+
dialect: Optional[str] = None,
|
|
380
|
+
) -> Dict[str, Dict[str, str]]:
|
|
381
|
+
"""Pull DDL from catalog for tables not yet in schema.
|
|
382
|
+
|
|
383
|
+
Extracts all table names referenced across the files, identifies
|
|
384
|
+
those missing from the schema, and fetches their DDL from the
|
|
385
|
+
configured catalog provider.
|
|
386
|
+
|
|
387
|
+
Args:
|
|
388
|
+
schema: Schema dict already populated from file extraction
|
|
389
|
+
file_paths: SQL files to scan for table references
|
|
390
|
+
dialect: SQL dialect override
|
|
391
|
+
|
|
392
|
+
Returns:
|
|
393
|
+
Updated schema dict with catalog-sourced entries added
|
|
394
|
+
"""
|
|
395
|
+
from sqlglider.catalog import get_catalog
|
|
396
|
+
|
|
397
|
+
catalog = get_catalog(self.catalog_type) # type: ignore[arg-type]
|
|
398
|
+
if self.catalog_config:
|
|
399
|
+
catalog.configure(self.catalog_config)
|
|
400
|
+
|
|
401
|
+
# Collect all referenced table names across files
|
|
402
|
+
all_tables: Set[str] = set()
|
|
403
|
+
for file_path in file_paths:
|
|
404
|
+
file_dialect = dialect or self.dialect
|
|
405
|
+
try:
|
|
406
|
+
sql_content = read_sql_file(file_path)
|
|
407
|
+
if self.sql_preprocessor:
|
|
408
|
+
sql_content = self.sql_preprocessor(sql_content, file_path)
|
|
409
|
+
analyzer = LineageAnalyzer(sql_content, dialect=file_dialect)
|
|
410
|
+
tables_results = analyzer.analyze_tables()
|
|
411
|
+
for result in tables_results:
|
|
412
|
+
for table_info in result.tables:
|
|
413
|
+
# Skip CTEs — they don't exist in catalogs
|
|
414
|
+
from sqlglider.lineage.analyzer import ObjectType
|
|
415
|
+
|
|
416
|
+
if table_info.object_type != ObjectType.CTE:
|
|
417
|
+
all_tables.add(table_info.name)
|
|
418
|
+
except Exception:
|
|
419
|
+
pass
|
|
420
|
+
|
|
421
|
+
# Find tables missing from schema
|
|
422
|
+
missing = [t for t in all_tables if t not in schema]
|
|
423
|
+
if not missing:
|
|
424
|
+
return schema
|
|
425
|
+
|
|
426
|
+
console.print(
|
|
427
|
+
f"[blue]Pulling DDL from {self.catalog_type} "
|
|
428
|
+
f"for {len(missing)} table(s)...[/blue]"
|
|
429
|
+
)
|
|
430
|
+
|
|
431
|
+
ddl_results = catalog.get_ddl_batch(missing)
|
|
432
|
+
file_dialect = dialect or self.dialect
|
|
433
|
+
for table_name, ddl in ddl_results.items():
|
|
434
|
+
if ddl.startswith("ERROR:"):
|
|
435
|
+
console.print(
|
|
436
|
+
f"[yellow]Warning:[/yellow] Could not pull DDL "
|
|
437
|
+
f"for {table_name}: {ddl}"
|
|
438
|
+
)
|
|
439
|
+
continue
|
|
440
|
+
parsed_schema = parse_ddl_to_schema(ddl, dialect=file_dialect)
|
|
441
|
+
for name, cols in parsed_schema.items():
|
|
442
|
+
if name not in schema:
|
|
443
|
+
schema[name] = cols
|
|
444
|
+
|
|
445
|
+
return schema
|
|
446
|
+
|
|
266
447
|
def _ensure_node(
|
|
267
448
|
self,
|
|
268
449
|
identifier: str,
|
|
@@ -343,6 +524,11 @@ class GraphBuilder:
|
|
|
343
524
|
"""Get mapping from node identifiers to rustworkx indices."""
|
|
344
525
|
return self._node_index_map.copy()
|
|
345
526
|
|
|
527
|
+
@property
|
|
528
|
+
def resolved_schema(self) -> Dict[str, Dict[str, str]]:
|
|
529
|
+
"""Get the resolved schema dictionary from schema extraction pass."""
|
|
530
|
+
return self._resolved_schema.copy()
|
|
531
|
+
|
|
346
532
|
@property
|
|
347
533
|
def skipped_files(self) -> List[tuple[str, str]]:
|
|
348
534
|
"""Get list of files that were skipped during graph building."""
|