sql-glider 0.1.13__tar.gz → 0.1.15__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (150)
  1. {sql_glider-0.1.13 → sql_glider-0.1.15}/PKG-INFO +1 -1
  2. sql_glider-0.1.15/plans/2026-01-29-schema-pruning-optimization.md +63 -0
  3. sql_glider-0.1.15/plans/2026-01-29-tables-scrape-command.md +118 -0
  4. {sql_glider-0.1.13 → sql_glider-0.1.15}/src/sqlglider/_version.py +2 -2
  5. {sql_glider-0.1.13 → sql_glider-0.1.15}/src/sqlglider/cli.py +303 -26
  6. {sql_glider-0.1.13 → sql_glider-0.1.15}/src/sqlglider/graph/builder.py +25 -131
  7. {sql_glider-0.1.13 → sql_glider-0.1.15}/src/sqlglider/graph/formatters.py +92 -1
  8. sql_glider-0.1.15/src/sqlglider/schema/__init__.py +0 -0
  9. sql_glider-0.1.15/src/sqlglider/schema/extractor.py +205 -0
  10. {sql_glider-0.1.13 → sql_glider-0.1.15}/tests/sqlglider/graph/test_formatters.py +62 -0
  11. sql_glider-0.1.15/tests/sqlglider/schema/__init__.py +0 -0
  12. sql_glider-0.1.15/tests/sqlglider/schema/test_extractor.py +163 -0
  13. {sql_glider-0.1.13 → sql_glider-0.1.15}/tests/sqlglider/test_cli.py +299 -0
  14. {sql_glider-0.1.13 → sql_glider-0.1.15}/.github/workflows/ci.yml +0 -0
  15. {sql_glider-0.1.13 → sql_glider-0.1.15}/.github/workflows/publish.yml +0 -0
  16. {sql_glider-0.1.13 → sql_glider-0.1.15}/.gitignore +0 -0
  17. {sql_glider-0.1.13 → sql_glider-0.1.15}/.python-version +0 -0
  18. {sql_glider-0.1.13 → sql_glider-0.1.15}/ARCHITECTURE.md +0 -0
  19. {sql_glider-0.1.13 → sql_glider-0.1.15}/CLAUDE.md +0 -0
  20. {sql_glider-0.1.13 → sql_glider-0.1.15}/LICENSE +0 -0
  21. {sql_glider-0.1.13 → sql_glider-0.1.15}/README.md +0 -0
  22. {sql_glider-0.1.13 → sql_glider-0.1.15}/plans/2025-12-05-column-level-lineage.md +0 -0
  23. {sql_glider-0.1.13 → sql_glider-0.1.15}/plans/2025-12-05-reverse-lineage.md +0 -0
  24. {sql_glider-0.1.13 → sql_glider-0.1.15}/plans/2025-12-06-config-file-support.md +0 -0
  25. {sql_glider-0.1.13 → sql_glider-0.1.15}/plans/2025-12-06-graph-lineage.md +0 -0
  26. {sql_glider-0.1.13 → sql_glider-0.1.15}/plans/2025-12-06-unify-single-multi-query.md +0 -0
  27. {sql_glider-0.1.13 → sql_glider-0.1.15}/plans/2025-12-07-sample-data-model.md +0 -0
  28. {sql_glider-0.1.13 → sql_glider-0.1.15}/plans/2025-12-07-sql-templating.md +0 -0
  29. {sql_glider-0.1.13 → sql_glider-0.1.15}/plans/2025-12-08-tables-command.md +0 -0
  30. {sql_glider-0.1.13 → sql_glider-0.1.15}/plans/2025-12-09-graph-query-paths.md +0 -0
  31. {sql_glider-0.1.13 → sql_glider-0.1.15}/plans/2025-12-13-dissect-command.md +0 -0
  32. {sql_glider-0.1.13 → sql_glider-0.1.15}/plans/2025-12-14-tables-pull-command.md +0 -0
  33. {sql_glider-0.1.13 → sql_glider-0.1.15}/plans/2026-01-25-fix-union-lineage-chain.md +0 -0
  34. {sql_glider-0.1.13 → sql_glider-0.1.15}/plans/2026-01-26-file-scoped-schema-context.md +0 -0
  35. {sql_glider-0.1.13 → sql_glider-0.1.15}/plans/2026-01-28-sparksql-table-extraction.md +0 -0
  36. {sql_glider-0.1.13 → sql_glider-0.1.15}/plans/2026-01-29-no-star-flag.md +0 -0
  37. {sql_glider-0.1.13 → sql_glider-0.1.15}/plans/2026-01-29-resolve-schema.md +0 -0
  38. {sql_glider-0.1.13 → sql_glider-0.1.15}/pyproject.toml +0 -0
  39. {sql_glider-0.1.13 → sql_glider-0.1.15}/sample_data_model/README.md +0 -0
  40. {sql_glider-0.1.13 → sql_glider-0.1.15}/sample_data_model/business/expire_dim_customer.sql +0 -0
  41. {sql_glider-0.1.13 → sql_glider-0.1.15}/sample_data_model/business/load_fact_orders.sql +0 -0
  42. {sql_glider-0.1.13 → sql_glider-0.1.15}/sample_data_model/business/load_fact_payments.sql +0 -0
  43. {sql_glider-0.1.13 → sql_glider-0.1.15}/sample_data_model/business/merge_dim_customer.sql +0 -0
  44. {sql_glider-0.1.13 → sql_glider-0.1.15}/sample_data_model/business/merge_dim_product.sql +0 -0
  45. {sql_glider-0.1.13 → sql_glider-0.1.15}/sample_data_model/business/update_dim_customer_metrics.sql +0 -0
  46. {sql_glider-0.1.13 → sql_glider-0.1.15}/sample_data_model/complex/conditional_merge.sql +0 -0
  47. {sql_glider-0.1.13 → sql_glider-0.1.15}/sample_data_model/complex/cte_insert.sql +0 -0
  48. {sql_glider-0.1.13 → sql_glider-0.1.15}/sample_data_model/complex/multi_table_transform.sql +0 -0
  49. {sql_glider-0.1.13 → sql_glider-0.1.15}/sample_data_model/ddl/dim_customer.sql +0 -0
  50. {sql_glider-0.1.13 → sql_glider-0.1.15}/sample_data_model/ddl/dim_product.sql +0 -0
  51. {sql_glider-0.1.13 → sql_glider-0.1.15}/sample_data_model/ddl/fact_orders.sql +0 -0
  52. {sql_glider-0.1.13 → sql_glider-0.1.15}/sample_data_model/ddl/fact_payments.sql +0 -0
  53. {sql_glider-0.1.13 → sql_glider-0.1.15}/sample_data_model/ddl/raw_addresses.sql +0 -0
  54. {sql_glider-0.1.13 → sql_glider-0.1.15}/sample_data_model/ddl/raw_customers.sql +0 -0
  55. {sql_glider-0.1.13 → sql_glider-0.1.15}/sample_data_model/ddl/raw_order_items.sql +0 -0
  56. {sql_glider-0.1.13 → sql_glider-0.1.15}/sample_data_model/ddl/raw_orders.sql +0 -0
  57. {sql_glider-0.1.13 → sql_glider-0.1.15}/sample_data_model/ddl/raw_payments.sql +0 -0
  58. {sql_glider-0.1.13 → sql_glider-0.1.15}/sample_data_model/ddl/raw_products.sql +0 -0
  59. {sql_glider-0.1.13 → sql_glider-0.1.15}/sample_data_model/ddl/stg_customers.sql +0 -0
  60. {sql_glider-0.1.13 → sql_glider-0.1.15}/sample_data_model/ddl/stg_orders.sql +0 -0
  61. {sql_glider-0.1.13 → sql_glider-0.1.15}/sample_data_model/ddl/stg_payments.sql +0 -0
  62. {sql_glider-0.1.13 → sql_glider-0.1.15}/sample_data_model/ddl/stg_products.sql +0 -0
  63. {sql_glider-0.1.13 → sql_glider-0.1.15}/sample_data_model/incremental/incr_fact_orders.sql +0 -0
  64. {sql_glider-0.1.13 → sql_glider-0.1.15}/sample_data_model/incremental/incr_fact_payments.sql +0 -0
  65. {sql_glider-0.1.13 → sql_glider-0.1.15}/sample_data_model/incremental/incr_pres_sales_summary.sql +0 -0
  66. {sql_glider-0.1.13 → sql_glider-0.1.15}/sample_data_model/maintenance/delete_expired_customers.sql +0 -0
  67. {sql_glider-0.1.13 → sql_glider-0.1.15}/sample_data_model/maintenance/update_product_status.sql +0 -0
  68. {sql_glider-0.1.13 → sql_glider-0.1.15}/sample_data_model/presentation/load_pres_customer_360.sql +0 -0
  69. {sql_glider-0.1.13 → sql_glider-0.1.15}/sample_data_model/presentation/load_pres_customer_cohort.sql +0 -0
  70. {sql_glider-0.1.13 → sql_glider-0.1.15}/sample_data_model/presentation/load_pres_product_performance.sql +0 -0
  71. {sql_glider-0.1.13 → sql_glider-0.1.15}/sample_data_model/presentation/load_pres_sales_summary.sql +0 -0
  72. {sql_glider-0.1.13 → sql_glider-0.1.15}/sample_data_model/staging/load_stg_customers.sql +0 -0
  73. {sql_glider-0.1.13 → sql_glider-0.1.15}/sample_data_model/staging/load_stg_orders.sql +0 -0
  74. {sql_glider-0.1.13 → sql_glider-0.1.15}/sample_data_model/staging/load_stg_payments.sql +0 -0
  75. {sql_glider-0.1.13 → sql_glider-0.1.15}/sample_data_model/staging/load_stg_products.sql +0 -0
  76. {sql_glider-0.1.13 → sql_glider-0.1.15}/sqlglider.toml.example +0 -0
  77. {sql_glider-0.1.13 → sql_glider-0.1.15}/src/sqlglider/__init__.py +0 -0
  78. {sql_glider-0.1.13 → sql_glider-0.1.15}/src/sqlglider/catalog/__init__.py +0 -0
  79. {sql_glider-0.1.13 → sql_glider-0.1.15}/src/sqlglider/catalog/base.py +0 -0
  80. {sql_glider-0.1.13 → sql_glider-0.1.15}/src/sqlglider/catalog/databricks.py +0 -0
  81. {sql_glider-0.1.13 → sql_glider-0.1.15}/src/sqlglider/catalog/registry.py +0 -0
  82. {sql_glider-0.1.13 → sql_glider-0.1.15}/src/sqlglider/dissection/__init__.py +0 -0
  83. {sql_glider-0.1.13 → sql_glider-0.1.15}/src/sqlglider/dissection/analyzer.py +0 -0
  84. {sql_glider-0.1.13 → sql_glider-0.1.15}/src/sqlglider/dissection/formatters.py +0 -0
  85. {sql_glider-0.1.13 → sql_glider-0.1.15}/src/sqlglider/dissection/models.py +0 -0
  86. {sql_glider-0.1.13 → sql_glider-0.1.15}/src/sqlglider/global_models.py +0 -0
  87. {sql_glider-0.1.13 → sql_glider-0.1.15}/src/sqlglider/graph/__init__.py +0 -0
  88. {sql_glider-0.1.13 → sql_glider-0.1.15}/src/sqlglider/graph/merge.py +0 -0
  89. {sql_glider-0.1.13 → sql_glider-0.1.15}/src/sqlglider/graph/models.py +0 -0
  90. {sql_glider-0.1.13 → sql_glider-0.1.15}/src/sqlglider/graph/query.py +0 -0
  91. {sql_glider-0.1.13 → sql_glider-0.1.15}/src/sqlglider/graph/serialization.py +0 -0
  92. {sql_glider-0.1.13 → sql_glider-0.1.15}/src/sqlglider/lineage/__init__.py +0 -0
  93. {sql_glider-0.1.13 → sql_glider-0.1.15}/src/sqlglider/lineage/analyzer.py +0 -0
  94. {sql_glider-0.1.13 → sql_glider-0.1.15}/src/sqlglider/lineage/formatters.py +0 -0
  95. {sql_glider-0.1.13 → sql_glider-0.1.15}/src/sqlglider/templating/__init__.py +0 -0
  96. {sql_glider-0.1.13 → sql_glider-0.1.15}/src/sqlglider/templating/base.py +0 -0
  97. {sql_glider-0.1.13 → sql_glider-0.1.15}/src/sqlglider/templating/jinja.py +0 -0
  98. {sql_glider-0.1.13 → sql_glider-0.1.15}/src/sqlglider/templating/registry.py +0 -0
  99. {sql_glider-0.1.13 → sql_glider-0.1.15}/src/sqlglider/templating/variables.py +0 -0
  100. {sql_glider-0.1.13 → sql_glider-0.1.15}/src/sqlglider/utils/__init__.py +0 -0
  101. {sql_glider-0.1.13 → sql_glider-0.1.15}/src/sqlglider/utils/config.py +0 -0
  102. {sql_glider-0.1.13 → sql_glider-0.1.15}/src/sqlglider/utils/file_utils.py +0 -0
  103. {sql_glider-0.1.13 → sql_glider-0.1.15}/src/sqlglider/utils/schema.py +0 -0
  104. {sql_glider-0.1.13 → sql_glider-0.1.15}/tests/__init__.py +0 -0
  105. {sql_glider-0.1.13 → sql_glider-0.1.15}/tests/fixtures/multi_file_queries/analytics_pipeline.sql +0 -0
  106. {sql_glider-0.1.13 → sql_glider-0.1.15}/tests/fixtures/multi_file_queries/analytics_pipeline_union_merge.sql +0 -0
  107. {sql_glider-0.1.13 → sql_glider-0.1.15}/tests/fixtures/multi_file_queries/customers.sql +0 -0
  108. {sql_glider-0.1.13 → sql_glider-0.1.15}/tests/fixtures/multi_file_queries/orders.sql +0 -0
  109. {sql_glider-0.1.13 → sql_glider-0.1.15}/tests/fixtures/multi_file_queries/reports.sql +0 -0
  110. {sql_glider-0.1.13 → sql_glider-0.1.15}/tests/fixtures/multi_file_queries/view_based_merge.sql +0 -0
  111. {sql_glider-0.1.13 → sql_glider-0.1.15}/tests/fixtures/original_queries/test_cte.sql +0 -0
  112. {sql_glider-0.1.13 → sql_glider-0.1.15}/tests/fixtures/original_queries/test_cte_query.sql +0 -0
  113. {sql_glider-0.1.13 → sql_glider-0.1.15}/tests/fixtures/original_queries/test_cte_view_star.sql +0 -0
  114. {sql_glider-0.1.13 → sql_glider-0.1.15}/tests/fixtures/original_queries/test_generated_column_query.sql +0 -0
  115. {sql_glider-0.1.13 → sql_glider-0.1.15}/tests/fixtures/original_queries/test_multi.sql +0 -0
  116. {sql_glider-0.1.13 → sql_glider-0.1.15}/tests/fixtures/original_queries/test_multi_query.sql +0 -0
  117. {sql_glider-0.1.13 → sql_glider-0.1.15}/tests/fixtures/original_queries/test_single_query.sql +0 -0
  118. {sql_glider-0.1.13 → sql_glider-0.1.15}/tests/fixtures/original_queries/test_subquery.sql +0 -0
  119. {sql_glider-0.1.13 → sql_glider-0.1.15}/tests/fixtures/original_queries/test_tables.sql +0 -0
  120. {sql_glider-0.1.13 → sql_glider-0.1.15}/tests/fixtures/original_queries/test_view.sql +0 -0
  121. {sql_glider-0.1.13 → sql_glider-0.1.15}/tests/fixtures/original_queries/test_view_window_cte.sql +0 -0
  122. {sql_glider-0.1.13 → sql_glider-0.1.15}/tests/fixtures/sample_manifest.csv +0 -0
  123. {sql_glider-0.1.13 → sql_glider-0.1.15}/tests/sqlglider/__init__.py +0 -0
  124. {sql_glider-0.1.13 → sql_glider-0.1.15}/tests/sqlglider/catalog/__init__.py +0 -0
  125. {sql_glider-0.1.13 → sql_glider-0.1.15}/tests/sqlglider/catalog/test_base.py +0 -0
  126. {sql_glider-0.1.13 → sql_glider-0.1.15}/tests/sqlglider/catalog/test_databricks.py +0 -0
  127. {sql_glider-0.1.13 → sql_glider-0.1.15}/tests/sqlglider/catalog/test_registry.py +0 -0
  128. {sql_glider-0.1.13 → sql_glider-0.1.15}/tests/sqlglider/dissection/__init__.py +0 -0
  129. {sql_glider-0.1.13 → sql_glider-0.1.15}/tests/sqlglider/dissection/test_analyzer.py +0 -0
  130. {sql_glider-0.1.13 → sql_glider-0.1.15}/tests/sqlglider/dissection/test_formatters.py +0 -0
  131. {sql_glider-0.1.13 → sql_glider-0.1.15}/tests/sqlglider/dissection/test_models.py +0 -0
  132. {sql_glider-0.1.13 → sql_glider-0.1.15}/tests/sqlglider/graph/__init__.py +0 -0
  133. {sql_glider-0.1.13 → sql_glider-0.1.15}/tests/sqlglider/graph/test_builder.py +0 -0
  134. {sql_glider-0.1.13 → sql_glider-0.1.15}/tests/sqlglider/graph/test_merge.py +0 -0
  135. {sql_glider-0.1.13 → sql_glider-0.1.15}/tests/sqlglider/graph/test_models.py +0 -0
  136. {sql_glider-0.1.13 → sql_glider-0.1.15}/tests/sqlglider/graph/test_query.py +0 -0
  137. {sql_glider-0.1.13 → sql_glider-0.1.15}/tests/sqlglider/graph/test_serialization.py +0 -0
  138. {sql_glider-0.1.13 → sql_glider-0.1.15}/tests/sqlglider/lineage/__init__.py +0 -0
  139. {sql_glider-0.1.13 → sql_glider-0.1.15}/tests/sqlglider/lineage/test_analyzer.py +0 -0
  140. {sql_glider-0.1.13 → sql_glider-0.1.15}/tests/sqlglider/lineage/test_formatters.py +0 -0
  141. {sql_glider-0.1.13 → sql_glider-0.1.15}/tests/sqlglider/templating/__init__.py +0 -0
  142. {sql_glider-0.1.13 → sql_glider-0.1.15}/tests/sqlglider/templating/test_base.py +0 -0
  143. {sql_glider-0.1.13 → sql_glider-0.1.15}/tests/sqlglider/templating/test_jinja.py +0 -0
  144. {sql_glider-0.1.13 → sql_glider-0.1.15}/tests/sqlglider/templating/test_registry.py +0 -0
  145. {sql_glider-0.1.13 → sql_glider-0.1.15}/tests/sqlglider/templating/test_variables.py +0 -0
  146. {sql_glider-0.1.13 → sql_glider-0.1.15}/tests/sqlglider/utils/__init__.py +0 -0
  147. {sql_glider-0.1.13 → sql_glider-0.1.15}/tests/sqlglider/utils/test_config.py +0 -0
  148. {sql_glider-0.1.13 → sql_glider-0.1.15}/tests/sqlglider/utils/test_file_utils.py +0 -0
  149. {sql_glider-0.1.13 → sql_glider-0.1.15}/tests/sqlglider/utils/test_schema.py +0 -0
  150. {sql_glider-0.1.13 → sql_glider-0.1.15}/uv.lock +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: sql-glider
3
- Version: 0.1.13
3
+ Version: 0.1.15
4
4
  Summary: SQL Utility Toolkit for better understanding, use, and governance of your queries in a native environment.
5
5
  Project-URL: Homepage, https://github.com/rycowhi/sql-glider/
6
6
  Project-URL: Repository, https://github.com/rycowhi/sql-glider/
@@ -0,0 +1,63 @@
1
+ # Schema Pruning Optimization for `--resolve-schema`
2
+
3
+ **Status:** Completed
4
+
5
+ ## Overview
6
+
7
+ Optimize `--resolve-schema` graph build performance by pruning the schema dict to only the tables referenced in each query before passing it to `sqlglot.lineage()`. Also move schema dumping (`--dump-schema`) so that it occurs before graph building (between Pass 1 and Pass 2).
8
+
9
+ ## Problem
10
+
11
+ `sqlglot.lineage()` performance degrades dramatically with large schema dicts. Benchmarks showed:
12
+
13
+ | Schema Size | Time (6 columns) |
14
+ |---|---|
15
+ | No schema | 2.6ms |
16
+ | 4 tables | 8.3ms |
17
+ | 204 tables | **1,041ms** |
18
+
19
+ The full accumulated schema from all files was passed to every `lineage()` call, even though each query only references a handful of tables. For projects with hundreds of files/tables, this made `--resolve-schema` unusable.
20
+
21
+ ## Design Decisions
22
+
23
+ - **Prune in the analyzer, not the builder** — keeps the optimization localized and benefits all callers of `LineageAnalyzer`, not just graph builds
24
+ - **Prune once per query, not per column** — `_get_query_tables()` is called once before the column loop, and the pruned schema is reused for all columns in that query
25
+ - **Case-insensitive matching** — table names are lowered for comparison to handle mixed-case schemas
26
+ - **Moved schema dump before graph build** — `_resolved_schema` is fully populated after Pass 1, so dumping between passes is safe and gives users earlier feedback. This required exposing `extract_schemas()` as a public method on `GraphBuilder`
27
+
28
+ ## Implementation
29
+
30
+ - [x] Add schema pruning in `_analyze_column_lineage_internal()` using existing `_get_query_tables()` method
31
+ - [x] Expose `extract_schemas()` as public method on `GraphBuilder`
32
+ - [x] Skip Pass 1 in `add_files()`/`add_manifest()` if `_resolved_schema` is already populated
33
+ - [x] Restructure CLI `graph_build` to call `extract_schemas()` then dump schema before `add_files()`/`add_manifest()`
34
+ - [x] Tests for `extract_schemas()` method and schema pruning correctness
35
+
36
+ ## Files Modified
37
+
38
+ - `src/sqlglider/lineage/analyzer.py` — schema pruning before `lineage()` calls
39
+ - `src/sqlglider/graph/builder.py` — public `extract_schemas()`, skip Pass 1 when already resolved
40
+ - `src/sqlglider/cli.py` — restructured `graph_build` to dump schema before graph building
41
+ - `tests/sqlglider/lineage/test_analyzer.py` — `TestSchemaPruning` (2 tests)
42
+ - `tests/sqlglider/graph/test_builder.py` — `TestExtractSchemas` (3 tests)
43
+
44
+ ## Benchmark Results (After)
45
+
46
+ | Schema Size | Time (6 columns) |
47
+ |---|---|
48
+ | No schema | 2.6ms |
49
+ | 4 tables | 8.3ms |
50
+ | 204 tables | **8.3ms** |
51
+
52
+ Full `analyze_queries` benchmark on complex fixture (analytics_pipeline.sql):
53
+
54
+ | Scenario | Before | After |
55
+ |---|---|---|
56
+ | No schema | ~392ms | ~392ms |
57
+ | Small schema (4 tables) | ~373ms | ~373ms |
58
+ | Big schema (204 tables) | ~1,400ms+ | **~387ms** |
59
+
60
+ ## Lessons Learned
61
+
62
+ - The initial assumption was that double-parsing (Pass 1 + Pass 2 both calling `sqlglot.parse()`) was the bottleneck. Benchmarking showed `parse()` costs ~10ms, while `lineage()` with a 200-table schema costs ~1,000ms. Profiling before optimizing avoided wasted effort on AST caching.
63
+ - `sqlglot.lineage()` appears to scale at least linearly with schema size, even for tables not referenced in the query; the benchmark above (4 → 204 tables, 8.3ms → 1,041ms) suggests worse than linear. Pruning is essential for multi-file workloads.
@@ -0,0 +1,118 @@
1
+ **Status:** Completed
2
+
3
+ # Plan: `tables scrape` Command
4
+
5
+ ## Overview
6
+
7
+ Add a `tables scrape` subcommand that performs schema inference (the same logic `graph build --resolve-schema` uses) but outputs the inferred schema directly instead of building a lineage graph. This makes schema inference a standalone, reusable operation.
8
+
9
+ ## Key Changes
10
+
11
+ ### 1. Refactor schema extraction out of GraphBuilder
12
+
13
+ **File:** `src/sqlglider/graph/builder.py`
14
+
15
+ Extract `_extract_schemas()` and `_fill_schema_from_catalog()` into a standalone module so both `graph build` and `tables scrape` can use them without instantiating a full `GraphBuilder`.
16
+
17
+ **New file:** `src/sqlglider/schema/extractor.py`
18
+ - `extract_schemas_from_files(file_paths, dialect, sql_preprocessor, schema, strict_schema, console) -> SchemaDict` — core extraction loop with Rich progress bar
19
+ - `fill_schema_from_catalog(schema, file_paths, dialect, sql_preprocessor, catalog_type, catalog_config, console) -> SchemaDict` — catalog fill logic
20
+ - `extract_and_resolve_schema(file_paths, dialect, sql_preprocessor, strict_schema, catalog_type, catalog_config, console) -> SchemaDict` — high-level orchestrator (extract + optional catalog fill)
21
+
22
+ **Update GraphBuilder** to delegate to these new functions instead of implementing them inline. `GraphBuilder._extract_schemas` and `_fill_schema_from_catalog` become thin wrappers or are removed, with `extract_schemas()` calling the shared code.
23
+
24
+ ### 2. Add `tables scrape` CLI command
25
+
26
+ **File:** `src/sqlglider/cli.py`
27
+
28
+ Add `@tables_app.command("scrape")` with these parameters (mirroring `graph build`):
29
+
30
+ | Parameter | Source |
31
+ |-----------|--------|
32
+ | `paths` | Same as `graph build` — file(s) or directory(ies) |
33
+ | `--recursive / -r` | Same recursive directory traversal |
34
+ | `--glob / -g` | Same glob pattern (default `*.sql`) |
35
+ | `--manifest / -m` | Same manifest CSV support |
36
+ | `--dialect / -d` | SQL dialect |
37
+ | `--templater / -t` | Templater name |
38
+ | `--var / -v` | Template variables |
39
+ | `--vars-file` | Variables file |
40
+ | `--strict-schema` | Strict schema mode |
41
+ | `--catalog-type / -c` | Catalog provider for remote DDL |
42
+ | `--output-format / -f` | `text` (default), `json`, or `csv` |
43
+ | `--output-file / -o` | Output file path (stdout if omitted) |
44
+
45
+ **Flow:**
46
+ 1. Resolve config defaults (same pattern as `graph build`)
47
+ 2. Set up templating preprocessor (same shared code)
48
+ 3. Collect files from paths/manifest (same logic as `graph build`)
49
+ 4. Call `extract_and_resolve_schema(...)` from the new shared module
50
+ 5. Format output using existing `format_schema()` from `src/sqlglider/graph/formatters.py`
51
+ 6. Write to file or stdout using `OutputWriter`
52
+
53
+ ### 3. Move schema formatters
54
+
55
+ The existing formatters in `src/sqlglider/graph/formatters.py` are already generic (they format `SchemaDict`). **Decision: leave them in place** to minimize churn — they work for both `graph build --dump-schema` and `tables scrape`.
56
+
57
+ ### 4. Refactor shared file-collection logic
58
+
59
+ The file collection code (paths + recursive glob + manifest) would otherwise be duplicated between `graph build` and the new `tables scrape`. Extract a helper function:
60
+
61
+ ```python
62
+ def _collect_sql_files(
63
+ paths: Optional[List[Path]],
64
+ manifest: Optional[Path],
65
+ recursive: bool,
66
+ glob_pattern: str,
67
+ ) -> tuple[list[Path], list[Path]]:
68
+ """Returns (manifest_files, path_files)."""
69
+ ```
70
+
71
+ Place this in `cli.py` as a private helper used by both commands.
72
+
73
+ ## Implementation Steps
74
+
75
+ - [x] Create `src/sqlglider/schema/__init__.py`
76
+ - [x] Create `src/sqlglider/schema/extractor.py` with shared schema extraction logic
77
+ - [x] Update `src/sqlglider/graph/builder.py` to delegate to shared extractor
78
+ - [x] Add `_collect_sql_files` helper to `src/sqlglider/cli.py`
79
+ - [x] Refactor `graph build` to use `_collect_sql_files`
80
+ - [x] Add `tables scrape` command to `src/sqlglider/cli.py`
81
+ - [x] Create `tests/sqlglider/schema/__init__.py`
82
+ - [x] Create `tests/sqlglider/schema/test_extractor.py` (10 tests)
83
+ - [x] Add `TestTablesScrapeCommand` to `tests/sqlglider/test_cli.py` (11 tests)
84
+ - [x] All 672 tests pass, 81.5% coverage, ruff clean
85
+
86
+ ## Files Created/Modified
87
+
88
+ | File | Action |
89
+ |------|--------|
90
+ | `src/sqlglider/schema/__init__.py` | Created — empty |
91
+ | `src/sqlglider/schema/extractor.py` | Created — shared schema extraction logic |
92
+ | `src/sqlglider/graph/builder.py` | Modified — delegate to shared extractor |
93
+ | `src/sqlglider/cli.py` | Modified — add `tables scrape` command + `_collect_sql_files` helper |
94
+ | `tests/sqlglider/schema/__init__.py` | Created — empty |
95
+ | `tests/sqlglider/schema/test_extractor.py` | Created — tests for shared extractor |
96
+ | `tests/sqlglider/test_cli.py` | Modified — add tests for `tables scrape` command |
97
+
98
+ ## Testing Strategy
99
+
100
+ 1. **Unit tests for `schema/extractor.py`**: Test `extract_schemas_from_files` with CREATE VIEW/TABLE AS SELECT and DQL qualified refs
101
+ 2. **CLI tests for `tables scrape`**: Use `CliRunner` to test text/json/csv output, recursive glob, templating, error cases
102
+ 3. **Regression**: Full test suite passes (672 tests), coverage at 81.5%
103
+
104
+ ## Verification
105
+
106
+ ```bash
107
+ # Basic usage
108
+ uv run sqlglider tables scrape ./queries/ -r
109
+
110
+ # With output format
111
+ uv run sqlglider tables scrape ./queries/ -r -f json -o schema.json
112
+
113
+ # With catalog
114
+ uv run sqlglider tables scrape ./queries/ -r -c databricks -f csv
115
+
116
+ # Ensure graph build still works
117
+ uv run sqlglider graph build ./queries/ -r --resolve-schema --dump-schema schema.txt -o graph.json
118
+ ```
@@ -28,7 +28,7 @@ version_tuple: VERSION_TUPLE
28
28
  commit_id: COMMIT_ID
29
29
  __commit_id__: COMMIT_ID
30
30
 
31
- __version__ = version = '0.1.13'
32
- __version_tuple__ = version_tuple = (0, 1, 13)
31
+ __version__ = version = '0.1.15'
32
+ __version_tuple__ = version_tuple = (0, 1, 15)
33
33
 
34
34
  __commit_id__ = commit_id = None
@@ -171,6 +171,12 @@ def lineage(
171
171
  "--no-star",
172
172
  help="Fail if SELECT * cannot be resolved to actual columns",
173
173
  ),
174
+ provide_schema: Optional[Path] = typer.Option(
175
+ None,
176
+ "--provide-schema",
177
+ exists=True,
178
+ help="Path to a schema file (JSON, CSV, or text) for star resolution",
179
+ ),
174
180
  ) -> None:
175
181
  """
176
182
  Analyze column or table lineage for a SQL file.
@@ -266,8 +272,15 @@ def lineage(
266
272
  source_path=source_path,
267
273
  )
268
274
 
275
+ # Load provided schema if specified
276
+ schema = None
277
+ if provide_schema:
278
+ from sqlglider.graph.formatters import load_schema_file
279
+
280
+ schema = load_schema_file(provide_schema)
281
+
269
282
  # Create analyzer
270
- analyzer = LineageAnalyzer(sql, dialect=dialect, no_star=no_star)
283
+ analyzer = LineageAnalyzer(sql, dialect=dialect, no_star=no_star, schema=schema)
271
284
 
272
285
  # Unified lineage analysis (handles both single and multi-query files)
273
286
  results = analyzer.analyze_queries(
@@ -788,6 +801,274 @@ def tables_pull(
788
801
  raise typer.Exit(1)
789
802
 
790
803
 
804
+ def _collect_sql_files(
805
+ paths: Optional[List[Path]],
806
+ manifest: Optional[Path],
807
+ recursive: bool,
808
+ glob_pattern: str,
809
+ ) -> tuple[list[Path], list[Path]]:
810
+ """Collect SQL files from paths and/or manifest.
811
+
812
+ Args:
813
+ paths: File or directory paths to scan.
814
+ manifest: Optional manifest CSV path.
815
+ recursive: Whether to recurse into directories.
816
+ glob_pattern: Glob pattern for directory scanning.
817
+
818
+ Returns:
819
+ Tuple of (manifest_files, path_files).
820
+ """
821
+ path_files: list[Path] = []
822
+ if paths:
823
+ for path in paths:
824
+ if path.is_dir():
825
+ pattern = f"**/{glob_pattern}" if recursive else glob_pattern
826
+ path_files.extend(f for f in sorted(path.glob(pattern)) if f.is_file())
827
+ elif path.is_file():
828
+ path_files.append(path)
829
+ else:
830
+ err_console.print(f"[red]Error:[/red] Path not found: {path}")
831
+ raise typer.Exit(1)
832
+
833
+ manifest_files: list[Path] = []
834
+ if manifest:
835
+ from sqlglider.graph.models import Manifest
836
+
837
+ manifest_data = Manifest.from_csv(manifest)
838
+ base_dir = manifest.parent
839
+ for entry in manifest_data.entries:
840
+ file_path = Path(entry.file_path)
841
+ if not file_path.is_absolute():
842
+ file_path = (base_dir / entry.file_path).resolve()
843
+ manifest_files.append(file_path)
844
+
845
+ return manifest_files, path_files
846
+
847
+
848
+ @tables_app.command("scrape")
849
+ def tables_scrape(
850
+ paths: List[Path] = typer.Argument(
851
+ None,
852
+ help="SQL file(s) or directory path to process",
853
+ ),
854
+ recursive: bool = typer.Option(
855
+ False,
856
+ "--recursive",
857
+ "-r",
858
+ help="Recursively search directories for SQL files",
859
+ ),
860
+ glob_pattern: str = typer.Option(
861
+ "*.sql",
862
+ "--glob",
863
+ "-g",
864
+ help="Glob pattern for matching SQL files in directories",
865
+ ),
866
+ manifest: Optional[Path] = typer.Option(
867
+ None,
868
+ "--manifest",
869
+ "-m",
870
+ exists=True,
871
+ help="Path to manifest CSV file with file_path and optional dialect columns",
872
+ ),
873
+ dialect: Optional[str] = typer.Option(
874
+ None,
875
+ "--dialect",
876
+ "-d",
877
+ help="SQL dialect (default: spark)",
878
+ ),
879
+ templater: Optional[str] = typer.Option(
880
+ None,
881
+ "--templater",
882
+ "-t",
883
+ help="Templater for SQL preprocessing (e.g., 'jinja', 'none')",
884
+ ),
885
+ var: Optional[List[str]] = typer.Option(
886
+ None,
887
+ "--var",
888
+ "-v",
889
+ help="Template variable in key=value format (repeatable)",
890
+ ),
891
+ vars_file: Optional[Path] = typer.Option(
892
+ None,
893
+ "--vars-file",
894
+ exists=True,
895
+ help="Path to variables file (JSON or YAML)",
896
+ ),
897
+ strict_schema: bool = typer.Option(
898
+ False,
899
+ "--strict-schema",
900
+ help="Fail if any column's table cannot be identified during schema extraction",
901
+ ),
902
+ catalog_type: Optional[str] = typer.Option(
903
+ None,
904
+ "--catalog-type",
905
+ "-c",
906
+ help="Catalog provider for pulling DDL of tables not found in files "
907
+ "(e.g. 'databricks')",
908
+ ),
909
+ output_format: Optional[str] = typer.Option(
910
+ None,
911
+ "--output-format",
912
+ "-f",
913
+ help="Output format: 'text' (default), 'json', or 'csv'",
914
+ ),
915
+ output_file: Optional[Path] = typer.Option(
916
+ None,
917
+ "--output-file",
918
+ "-o",
919
+ help="Output file path (prints to stdout if not provided)",
920
+ ),
921
+ ) -> None:
922
+ """
923
+ Scrape schema information from SQL files.
924
+
925
+ Infers table and column schemas from DDL statements and DQL column
926
+ references across one or more SQL files. Supports the same file input
927
+ modes as `graph build` (paths, directories, manifests).
928
+
929
+ Examples:
930
+
931
+ # Scrape schema from a directory
932
+ sqlglider tables scrape ./queries/ -r
933
+
934
+ # Output as JSON
935
+ sqlglider tables scrape ./queries/ -r -f json
936
+
937
+ # Save to file
938
+ sqlglider tables scrape ./queries/ -r -f csv -o schema.csv
939
+
940
+ # With Jinja2 templating
941
+ sqlglider tables scrape ./queries/ -r --templater jinja --var schema=prod
942
+
943
+ # With catalog fallback
944
+ sqlglider tables scrape ./queries/ -r -c databricks
945
+ """
946
+ from sqlglider.graph.formatters import format_schema
947
+ from sqlglider.lineage.analyzer import SchemaResolutionError
948
+ from sqlglider.schema.extractor import extract_and_resolve_schema
949
+
950
+ # Load config for defaults
951
+ config = load_config()
952
+ dialect = dialect or config.dialect or "spark"
953
+ templater = templater or config.templater
954
+ strict_schema = strict_schema or config.strict_schema or False
955
+ output_format = output_format or config.output_format or "text"
956
+
957
+ if output_format not in ("text", "json", "csv"):
958
+ err_console.print(
959
+ f"[red]Error:[/red] Invalid --output-format '{output_format}'. "
960
+ "Use 'text', 'json', or 'csv'."
961
+ )
962
+ raise typer.Exit(1)
963
+
964
+ # Only inherit catalog_type from config when not provided via CLI
965
+ if not catalog_type:
966
+ catalog_type = config.catalog_type
967
+
968
+ # Validate inputs
969
+ if not paths and not manifest:
970
+ err_console.print(
971
+ "[red]Error:[/red] Must provide either file/directory paths or --manifest option."
972
+ )
973
+ raise typer.Exit(1)
974
+
975
+ # Create SQL preprocessor if templating is enabled
976
+ sql_preprocessor: Optional[Callable[[str, Path], str]] = None
977
+ if templater:
978
+ config_vars_file = None
979
+ config_vars = None
980
+ if config.templating:
981
+ if config.templating.variables_file and not vars_file:
982
+ config_vars_file = Path(config.templating.variables_file)
983
+ if not config_vars_file.exists():
984
+ err_console.print(
985
+ f"[yellow]Warning:[/yellow] Variables file from config "
986
+ f"not found: {config_vars_file}"
987
+ )
988
+ config_vars_file = None
989
+ config_vars = config.templating.variables
990
+
991
+ variables = load_all_variables(
992
+ cli_vars=var,
993
+ vars_file=vars_file or config_vars_file,
994
+ config_vars=config_vars,
995
+ use_env=True,
996
+ )
997
+
998
+ templater_instance = get_templater(templater)
999
+
1000
+ def _preprocess(sql: str, file_path: Path) -> str:
1001
+ return templater_instance.render(
1002
+ sql, variables=variables, source_path=file_path
1003
+ )
1004
+
1005
+ sql_preprocessor = _preprocess
1006
+
1007
+ try:
1008
+ # Build catalog config from config file if available
1009
+ catalog_config_dict = None
1010
+ if catalog_type and config.catalog:
1011
+ provider_config = getattr(config.catalog, catalog_type, None)
1012
+ if provider_config:
1013
+ catalog_config_dict = provider_config.model_dump(exclude_none=True)
1014
+
1015
+ # Collect files
1016
+ manifest_files, path_files = _collect_sql_files(
1017
+ paths, manifest, recursive, glob_pattern
1018
+ )
1019
+ all_files = manifest_files + path_files
1020
+
1021
+ if not all_files:
1022
+ err_console.print("[yellow]Warning:[/yellow] No SQL files found.")
1023
+ raise typer.Exit(0)
1024
+
1025
+ # Extract schema
1026
+ schema = extract_and_resolve_schema(
1027
+ all_files,
1028
+ dialect=dialect,
1029
+ sql_preprocessor=sql_preprocessor,
1030
+ strict_schema=strict_schema,
1031
+ catalog_type=catalog_type,
1032
+ catalog_config=catalog_config_dict,
1033
+ console=err_console,
1034
+ )
1035
+
1036
+ if not schema:
1037
+ err_console.print("[yellow]No schema information found.[/yellow]")
1038
+ raise typer.Exit(0)
1039
+
1040
+ # Format and output
1041
+ formatted = format_schema(schema, output_format)
1042
+ if output_file:
1043
+ OutputWriter.write(formatted, output_file)
1044
+ err_console.print(
1045
+ f"[green]Schema written to {output_file} "
1046
+ f"({len(schema)} table(s))[/green]"
1047
+ )
1048
+ else:
1049
+ console.print(formatted, end="")
1050
+
1051
+ except SchemaResolutionError as e:
1052
+ err_console.print(f"[red]Error:[/red] {e}")
1053
+ raise typer.Exit(1)
1054
+
1055
+ except FileNotFoundError as e:
1056
+ err_console.print(f"[red]Error:[/red] {e}")
1057
+ raise typer.Exit(1)
1058
+
1059
+ except TemplaterError as e:
1060
+ err_console.print(f"[red]Error:[/red] {e}")
1061
+ raise typer.Exit(1)
1062
+
1063
+ except ValueError as e:
1064
+ err_console.print(f"[red]Error:[/red] {e}")
1065
+ raise typer.Exit(1)
1066
+
1067
+ except Exception as e:
1068
+ err_console.print(f"[red]Error:[/red] Unexpected error: {e}")
1069
+ raise typer.Exit(1)
1070
+
1071
+
791
1072
  @app.command()
792
1073
  def template(
793
1074
  sql_file: Annotated[
@@ -1024,6 +1305,13 @@ def graph_build(
1024
1305
  "--dump-schema-format",
1025
1306
  help="Format for dumped schema: 'text' (default), 'json', or 'csv'",
1026
1307
  ),
1308
+ provide_schema: Optional[Path] = typer.Option(
1309
+ None,
1310
+ "--provide-schema",
1311
+ exists=True,
1312
+ help="Path to a schema file (JSON, CSV, or text) to use for star resolution. "
1313
+ "Can be combined with --resolve-schema to merge file-extracted schema on top.",
1314
+ ),
1027
1315
  strict_schema: bool = typer.Option(
1028
1316
  False,
1029
1317
  "--strict-schema",
@@ -1166,32 +1454,21 @@ def graph_build(
1166
1454
  strict_schema=strict_schema,
1167
1455
  )
1168
1456
 
1169
- # Collect file paths for schema extraction
1170
- path_files: list[Path] = []
1171
- if paths:
1172
- for path in paths:
1173
- if path.is_dir():
1174
- pattern = f"**/{glob_pattern}" if recursive else glob_pattern
1175
- path_files.extend(
1176
- f for f in sorted(path.glob(pattern)) if f.is_file()
1177
- )
1178
- elif path.is_file():
1179
- path_files.append(path)
1180
- else:
1181
- err_console.print(f"[red]Error:[/red] Path not found: {path}")
1182
- raise typer.Exit(1)
1457
+ # Load provided schema file if specified
1458
+ if provide_schema:
1459
+ from sqlglider.graph.formatters import load_schema_file
1183
1460
 
1184
- manifest_files: list[Path] = []
1185
- if manifest:
1186
- from sqlglider.graph.models import Manifest
1187
-
1188
- manifest_data = Manifest.from_csv(manifest)
1189
- base_dir = manifest.parent
1190
- for entry in manifest_data.entries:
1191
- file_path = Path(entry.file_path)
1192
- if not file_path.is_absolute():
1193
- file_path = (base_dir / entry.file_path).resolve()
1194
- manifest_files.append(file_path)
1461
+ loaded_schema = load_schema_file(provide_schema)
1462
+ builder.set_schema(loaded_schema)
1463
+ console.print(
1464
+ f"[green]Loaded schema from {provide_schema} "
1465
+ f"({len(loaded_schema)} table(s))[/green]"
1466
+ )
1467
+
1468
+ # Collect file paths for schema extraction
1469
+ manifest_files, path_files = _collect_sql_files(
1470
+ paths, manifest, recursive, glob_pattern
1471
+ )
1195
1472
 
1196
1473
  # Extract schema upfront if requested, then dump before graph building
1197
1474
  all_files = manifest_files + path_files