sql-glider 0.1.13__tar.gz → 0.1.14__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (150)
  1. {sql_glider-0.1.13 → sql_glider-0.1.14}/PKG-INFO +1 -1
  2. sql_glider-0.1.14/plans/2026-01-29-schema-pruning-optimization.md +63 -0
  3. sql_glider-0.1.14/plans/2026-01-29-tables-scrape-command.md +118 -0
  4. {sql_glider-0.1.13 → sql_glider-0.1.14}/src/sqlglider/_version.py +2 -2
  5. {sql_glider-0.1.13 → sql_glider-0.1.14}/src/sqlglider/cli.py +271 -25
  6. {sql_glider-0.1.13 → sql_glider-0.1.14}/src/sqlglider/graph/builder.py +11 -133
  7. sql_glider-0.1.14/src/sqlglider/schema/__init__.py +0 -0
  8. sql_glider-0.1.14/src/sqlglider/schema/extractor.py +202 -0
  9. sql_glider-0.1.14/tests/sqlglider/schema/__init__.py +0 -0
  10. sql_glider-0.1.14/tests/sqlglider/schema/test_extractor.py +163 -0
  11. {sql_glider-0.1.13 → sql_glider-0.1.14}/tests/sqlglider/test_cli.py +161 -0
  12. {sql_glider-0.1.13 → sql_glider-0.1.14}/.github/workflows/ci.yml +0 -0
  13. {sql_glider-0.1.13 → sql_glider-0.1.14}/.github/workflows/publish.yml +0 -0
  14. {sql_glider-0.1.13 → sql_glider-0.1.14}/.gitignore +0 -0
  15. {sql_glider-0.1.13 → sql_glider-0.1.14}/.python-version +0 -0
  16. {sql_glider-0.1.13 → sql_glider-0.1.14}/ARCHITECTURE.md +0 -0
  17. {sql_glider-0.1.13 → sql_glider-0.1.14}/CLAUDE.md +0 -0
  18. {sql_glider-0.1.13 → sql_glider-0.1.14}/LICENSE +0 -0
  19. {sql_glider-0.1.13 → sql_glider-0.1.14}/README.md +0 -0
  20. {sql_glider-0.1.13 → sql_glider-0.1.14}/plans/2025-12-05-column-level-lineage.md +0 -0
  21. {sql_glider-0.1.13 → sql_glider-0.1.14}/plans/2025-12-05-reverse-lineage.md +0 -0
  22. {sql_glider-0.1.13 → sql_glider-0.1.14}/plans/2025-12-06-config-file-support.md +0 -0
  23. {sql_glider-0.1.13 → sql_glider-0.1.14}/plans/2025-12-06-graph-lineage.md +0 -0
  24. {sql_glider-0.1.13 → sql_glider-0.1.14}/plans/2025-12-06-unify-single-multi-query.md +0 -0
  25. {sql_glider-0.1.13 → sql_glider-0.1.14}/plans/2025-12-07-sample-data-model.md +0 -0
  26. {sql_glider-0.1.13 → sql_glider-0.1.14}/plans/2025-12-07-sql-templating.md +0 -0
  27. {sql_glider-0.1.13 → sql_glider-0.1.14}/plans/2025-12-08-tables-command.md +0 -0
  28. {sql_glider-0.1.13 → sql_glider-0.1.14}/plans/2025-12-09-graph-query-paths.md +0 -0
  29. {sql_glider-0.1.13 → sql_glider-0.1.14}/plans/2025-12-13-dissect-command.md +0 -0
  30. {sql_glider-0.1.13 → sql_glider-0.1.14}/plans/2025-12-14-tables-pull-command.md +0 -0
  31. {sql_glider-0.1.13 → sql_glider-0.1.14}/plans/2026-01-25-fix-union-lineage-chain.md +0 -0
  32. {sql_glider-0.1.13 → sql_glider-0.1.14}/plans/2026-01-26-file-scoped-schema-context.md +0 -0
  33. {sql_glider-0.1.13 → sql_glider-0.1.14}/plans/2026-01-28-sparksql-table-extraction.md +0 -0
  34. {sql_glider-0.1.13 → sql_glider-0.1.14}/plans/2026-01-29-no-star-flag.md +0 -0
  35. {sql_glider-0.1.13 → sql_glider-0.1.14}/plans/2026-01-29-resolve-schema.md +0 -0
  36. {sql_glider-0.1.13 → sql_glider-0.1.14}/pyproject.toml +0 -0
  37. {sql_glider-0.1.13 → sql_glider-0.1.14}/sample_data_model/README.md +0 -0
  38. {sql_glider-0.1.13 → sql_glider-0.1.14}/sample_data_model/business/expire_dim_customer.sql +0 -0
  39. {sql_glider-0.1.13 → sql_glider-0.1.14}/sample_data_model/business/load_fact_orders.sql +0 -0
  40. {sql_glider-0.1.13 → sql_glider-0.1.14}/sample_data_model/business/load_fact_payments.sql +0 -0
  41. {sql_glider-0.1.13 → sql_glider-0.1.14}/sample_data_model/business/merge_dim_customer.sql +0 -0
  42. {sql_glider-0.1.13 → sql_glider-0.1.14}/sample_data_model/business/merge_dim_product.sql +0 -0
  43. {sql_glider-0.1.13 → sql_glider-0.1.14}/sample_data_model/business/update_dim_customer_metrics.sql +0 -0
  44. {sql_glider-0.1.13 → sql_glider-0.1.14}/sample_data_model/complex/conditional_merge.sql +0 -0
  45. {sql_glider-0.1.13 → sql_glider-0.1.14}/sample_data_model/complex/cte_insert.sql +0 -0
  46. {sql_glider-0.1.13 → sql_glider-0.1.14}/sample_data_model/complex/multi_table_transform.sql +0 -0
  47. {sql_glider-0.1.13 → sql_glider-0.1.14}/sample_data_model/ddl/dim_customer.sql +0 -0
  48. {sql_glider-0.1.13 → sql_glider-0.1.14}/sample_data_model/ddl/dim_product.sql +0 -0
  49. {sql_glider-0.1.13 → sql_glider-0.1.14}/sample_data_model/ddl/fact_orders.sql +0 -0
  50. {sql_glider-0.1.13 → sql_glider-0.1.14}/sample_data_model/ddl/fact_payments.sql +0 -0
  51. {sql_glider-0.1.13 → sql_glider-0.1.14}/sample_data_model/ddl/raw_addresses.sql +0 -0
  52. {sql_glider-0.1.13 → sql_glider-0.1.14}/sample_data_model/ddl/raw_customers.sql +0 -0
  53. {sql_glider-0.1.13 → sql_glider-0.1.14}/sample_data_model/ddl/raw_order_items.sql +0 -0
  54. {sql_glider-0.1.13 → sql_glider-0.1.14}/sample_data_model/ddl/raw_orders.sql +0 -0
  55. {sql_glider-0.1.13 → sql_glider-0.1.14}/sample_data_model/ddl/raw_payments.sql +0 -0
  56. {sql_glider-0.1.13 → sql_glider-0.1.14}/sample_data_model/ddl/raw_products.sql +0 -0
  57. {sql_glider-0.1.13 → sql_glider-0.1.14}/sample_data_model/ddl/stg_customers.sql +0 -0
  58. {sql_glider-0.1.13 → sql_glider-0.1.14}/sample_data_model/ddl/stg_orders.sql +0 -0
  59. {sql_glider-0.1.13 → sql_glider-0.1.14}/sample_data_model/ddl/stg_payments.sql +0 -0
  60. {sql_glider-0.1.13 → sql_glider-0.1.14}/sample_data_model/ddl/stg_products.sql +0 -0
  61. {sql_glider-0.1.13 → sql_glider-0.1.14}/sample_data_model/incremental/incr_fact_orders.sql +0 -0
  62. {sql_glider-0.1.13 → sql_glider-0.1.14}/sample_data_model/incremental/incr_fact_payments.sql +0 -0
  63. {sql_glider-0.1.13 → sql_glider-0.1.14}/sample_data_model/incremental/incr_pres_sales_summary.sql +0 -0
  64. {sql_glider-0.1.13 → sql_glider-0.1.14}/sample_data_model/maintenance/delete_expired_customers.sql +0 -0
  65. {sql_glider-0.1.13 → sql_glider-0.1.14}/sample_data_model/maintenance/update_product_status.sql +0 -0
  66. {sql_glider-0.1.13 → sql_glider-0.1.14}/sample_data_model/presentation/load_pres_customer_360.sql +0 -0
  67. {sql_glider-0.1.13 → sql_glider-0.1.14}/sample_data_model/presentation/load_pres_customer_cohort.sql +0 -0
  68. {sql_glider-0.1.13 → sql_glider-0.1.14}/sample_data_model/presentation/load_pres_product_performance.sql +0 -0
  69. {sql_glider-0.1.13 → sql_glider-0.1.14}/sample_data_model/presentation/load_pres_sales_summary.sql +0 -0
  70. {sql_glider-0.1.13 → sql_glider-0.1.14}/sample_data_model/staging/load_stg_customers.sql +0 -0
  71. {sql_glider-0.1.13 → sql_glider-0.1.14}/sample_data_model/staging/load_stg_orders.sql +0 -0
  72. {sql_glider-0.1.13 → sql_glider-0.1.14}/sample_data_model/staging/load_stg_payments.sql +0 -0
  73. {sql_glider-0.1.13 → sql_glider-0.1.14}/sample_data_model/staging/load_stg_products.sql +0 -0
  74. {sql_glider-0.1.13 → sql_glider-0.1.14}/sqlglider.toml.example +0 -0
  75. {sql_glider-0.1.13 → sql_glider-0.1.14}/src/sqlglider/__init__.py +0 -0
  76. {sql_glider-0.1.13 → sql_glider-0.1.14}/src/sqlglider/catalog/__init__.py +0 -0
  77. {sql_glider-0.1.13 → sql_glider-0.1.14}/src/sqlglider/catalog/base.py +0 -0
  78. {sql_glider-0.1.13 → sql_glider-0.1.14}/src/sqlglider/catalog/databricks.py +0 -0
  79. {sql_glider-0.1.13 → sql_glider-0.1.14}/src/sqlglider/catalog/registry.py +0 -0
  80. {sql_glider-0.1.13 → sql_glider-0.1.14}/src/sqlglider/dissection/__init__.py +0 -0
  81. {sql_glider-0.1.13 → sql_glider-0.1.14}/src/sqlglider/dissection/analyzer.py +0 -0
  82. {sql_glider-0.1.13 → sql_glider-0.1.14}/src/sqlglider/dissection/formatters.py +0 -0
  83. {sql_glider-0.1.13 → sql_glider-0.1.14}/src/sqlglider/dissection/models.py +0 -0
  84. {sql_glider-0.1.13 → sql_glider-0.1.14}/src/sqlglider/global_models.py +0 -0
  85. {sql_glider-0.1.13 → sql_glider-0.1.14}/src/sqlglider/graph/__init__.py +0 -0
  86. {sql_glider-0.1.13 → sql_glider-0.1.14}/src/sqlglider/graph/formatters.py +0 -0
  87. {sql_glider-0.1.13 → sql_glider-0.1.14}/src/sqlglider/graph/merge.py +0 -0
  88. {sql_glider-0.1.13 → sql_glider-0.1.14}/src/sqlglider/graph/models.py +0 -0
  89. {sql_glider-0.1.13 → sql_glider-0.1.14}/src/sqlglider/graph/query.py +0 -0
  90. {sql_glider-0.1.13 → sql_glider-0.1.14}/src/sqlglider/graph/serialization.py +0 -0
  91. {sql_glider-0.1.13 → sql_glider-0.1.14}/src/sqlglider/lineage/__init__.py +0 -0
  92. {sql_glider-0.1.13 → sql_glider-0.1.14}/src/sqlglider/lineage/analyzer.py +0 -0
  93. {sql_glider-0.1.13 → sql_glider-0.1.14}/src/sqlglider/lineage/formatters.py +0 -0
  94. {sql_glider-0.1.13 → sql_glider-0.1.14}/src/sqlglider/templating/__init__.py +0 -0
  95. {sql_glider-0.1.13 → sql_glider-0.1.14}/src/sqlglider/templating/base.py +0 -0
  96. {sql_glider-0.1.13 → sql_glider-0.1.14}/src/sqlglider/templating/jinja.py +0 -0
  97. {sql_glider-0.1.13 → sql_glider-0.1.14}/src/sqlglider/templating/registry.py +0 -0
  98. {sql_glider-0.1.13 → sql_glider-0.1.14}/src/sqlglider/templating/variables.py +0 -0
  99. {sql_glider-0.1.13 → sql_glider-0.1.14}/src/sqlglider/utils/__init__.py +0 -0
  100. {sql_glider-0.1.13 → sql_glider-0.1.14}/src/sqlglider/utils/config.py +0 -0
  101. {sql_glider-0.1.13 → sql_glider-0.1.14}/src/sqlglider/utils/file_utils.py +0 -0
  102. {sql_glider-0.1.13 → sql_glider-0.1.14}/src/sqlglider/utils/schema.py +0 -0
  103. {sql_glider-0.1.13 → sql_glider-0.1.14}/tests/__init__.py +0 -0
  104. {sql_glider-0.1.13 → sql_glider-0.1.14}/tests/fixtures/multi_file_queries/analytics_pipeline.sql +0 -0
  105. {sql_glider-0.1.13 → sql_glider-0.1.14}/tests/fixtures/multi_file_queries/analytics_pipeline_union_merge.sql +0 -0
  106. {sql_glider-0.1.13 → sql_glider-0.1.14}/tests/fixtures/multi_file_queries/customers.sql +0 -0
  107. {sql_glider-0.1.13 → sql_glider-0.1.14}/tests/fixtures/multi_file_queries/orders.sql +0 -0
  108. {sql_glider-0.1.13 → sql_glider-0.1.14}/tests/fixtures/multi_file_queries/reports.sql +0 -0
  109. {sql_glider-0.1.13 → sql_glider-0.1.14}/tests/fixtures/multi_file_queries/view_based_merge.sql +0 -0
  110. {sql_glider-0.1.13 → sql_glider-0.1.14}/tests/fixtures/original_queries/test_cte.sql +0 -0
  111. {sql_glider-0.1.13 → sql_glider-0.1.14}/tests/fixtures/original_queries/test_cte_query.sql +0 -0
  112. {sql_glider-0.1.13 → sql_glider-0.1.14}/tests/fixtures/original_queries/test_cte_view_star.sql +0 -0
  113. {sql_glider-0.1.13 → sql_glider-0.1.14}/tests/fixtures/original_queries/test_generated_column_query.sql +0 -0
  114. {sql_glider-0.1.13 → sql_glider-0.1.14}/tests/fixtures/original_queries/test_multi.sql +0 -0
  115. {sql_glider-0.1.13 → sql_glider-0.1.14}/tests/fixtures/original_queries/test_multi_query.sql +0 -0
  116. {sql_glider-0.1.13 → sql_glider-0.1.14}/tests/fixtures/original_queries/test_single_query.sql +0 -0
  117. {sql_glider-0.1.13 → sql_glider-0.1.14}/tests/fixtures/original_queries/test_subquery.sql +0 -0
  118. {sql_glider-0.1.13 → sql_glider-0.1.14}/tests/fixtures/original_queries/test_tables.sql +0 -0
  119. {sql_glider-0.1.13 → sql_glider-0.1.14}/tests/fixtures/original_queries/test_view.sql +0 -0
  120. {sql_glider-0.1.13 → sql_glider-0.1.14}/tests/fixtures/original_queries/test_view_window_cte.sql +0 -0
  121. {sql_glider-0.1.13 → sql_glider-0.1.14}/tests/fixtures/sample_manifest.csv +0 -0
  122. {sql_glider-0.1.13 → sql_glider-0.1.14}/tests/sqlglider/__init__.py +0 -0
  123. {sql_glider-0.1.13 → sql_glider-0.1.14}/tests/sqlglider/catalog/__init__.py +0 -0
  124. {sql_glider-0.1.13 → sql_glider-0.1.14}/tests/sqlglider/catalog/test_base.py +0 -0
  125. {sql_glider-0.1.13 → sql_glider-0.1.14}/tests/sqlglider/catalog/test_databricks.py +0 -0
  126. {sql_glider-0.1.13 → sql_glider-0.1.14}/tests/sqlglider/catalog/test_registry.py +0 -0
  127. {sql_glider-0.1.13 → sql_glider-0.1.14}/tests/sqlglider/dissection/__init__.py +0 -0
  128. {sql_glider-0.1.13 → sql_glider-0.1.14}/tests/sqlglider/dissection/test_analyzer.py +0 -0
  129. {sql_glider-0.1.13 → sql_glider-0.1.14}/tests/sqlglider/dissection/test_formatters.py +0 -0
  130. {sql_glider-0.1.13 → sql_glider-0.1.14}/tests/sqlglider/dissection/test_models.py +0 -0
  131. {sql_glider-0.1.13 → sql_glider-0.1.14}/tests/sqlglider/graph/__init__.py +0 -0
  132. {sql_glider-0.1.13 → sql_glider-0.1.14}/tests/sqlglider/graph/test_builder.py +0 -0
  133. {sql_glider-0.1.13 → sql_glider-0.1.14}/tests/sqlglider/graph/test_formatters.py +0 -0
  134. {sql_glider-0.1.13 → sql_glider-0.1.14}/tests/sqlglider/graph/test_merge.py +0 -0
  135. {sql_glider-0.1.13 → sql_glider-0.1.14}/tests/sqlglider/graph/test_models.py +0 -0
  136. {sql_glider-0.1.13 → sql_glider-0.1.14}/tests/sqlglider/graph/test_query.py +0 -0
  137. {sql_glider-0.1.13 → sql_glider-0.1.14}/tests/sqlglider/graph/test_serialization.py +0 -0
  138. {sql_glider-0.1.13 → sql_glider-0.1.14}/tests/sqlglider/lineage/__init__.py +0 -0
  139. {sql_glider-0.1.13 → sql_glider-0.1.14}/tests/sqlglider/lineage/test_analyzer.py +0 -0
  140. {sql_glider-0.1.13 → sql_glider-0.1.14}/tests/sqlglider/lineage/test_formatters.py +0 -0
  141. {sql_glider-0.1.13 → sql_glider-0.1.14}/tests/sqlglider/templating/__init__.py +0 -0
  142. {sql_glider-0.1.13 → sql_glider-0.1.14}/tests/sqlglider/templating/test_base.py +0 -0
  143. {sql_glider-0.1.13 → sql_glider-0.1.14}/tests/sqlglider/templating/test_jinja.py +0 -0
  144. {sql_glider-0.1.13 → sql_glider-0.1.14}/tests/sqlglider/templating/test_registry.py +0 -0
  145. {sql_glider-0.1.13 → sql_glider-0.1.14}/tests/sqlglider/templating/test_variables.py +0 -0
  146. {sql_glider-0.1.13 → sql_glider-0.1.14}/tests/sqlglider/utils/__init__.py +0 -0
  147. {sql_glider-0.1.13 → sql_glider-0.1.14}/tests/sqlglider/utils/test_config.py +0 -0
  148. {sql_glider-0.1.13 → sql_glider-0.1.14}/tests/sqlglider/utils/test_file_utils.py +0 -0
  149. {sql_glider-0.1.13 → sql_glider-0.1.14}/tests/sqlglider/utils/test_schema.py +0 -0
  150. {sql_glider-0.1.13 → sql_glider-0.1.14}/uv.lock +0 -0
{sql_glider-0.1.13 → sql_glider-0.1.14}/PKG-INFO
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: sql-glider
- Version: 0.1.13
+ Version: 0.1.14
  Summary: SQL Utility Toolkit for better understanding, use, and governance of your queries in a native environment.
  Project-URL: Homepage, https://github.com/rycowhi/sql-glider/
  Project-URL: Repository, https://github.com/rycowhi/sql-glider/
sql_glider-0.1.14/plans/2026-01-29-schema-pruning-optimization.md
@@ -0,0 +1,63 @@
+ # Schema Pruning Optimization for `--resolve-schema`
+
+ **Status:** Completed
+
+ ## Overview
+
+ Optimize `--resolve-schema` graph build performance by pruning the schema dict to only tables referenced in each query before passing it to `sqlglot.lineage()`. Also moved schema dumping (`--dump-schema`) to occur before graph building (between Pass 1 and Pass 2).
+
+ ## Problem
+
+ `sqlglot.lineage()` performance degrades dramatically with large schema dicts. Benchmarks showed:
+
+ | Schema Size | Time (6 columns) |
+ |---|---|
+ | No schema | 2.6ms |
+ | 4 tables | 8.3ms |
+ | 204 tables | **1,041ms** |
+
+ The full accumulated schema from all files was passed to every `lineage()` call, even though each query only references a handful of tables. For projects with hundreds of files/tables, this made `--resolve-schema` unusable.
+
+ ## Design Decisions
+
+ - **Prune in the analyzer, not the builder** — keeps the optimization localized and benefits all callers of `LineageAnalyzer`, not just graph builds
+ - **Prune once per query, not per column** — `_get_query_tables()` is called once before the column loop, and the pruned schema is reused for all columns in that query
+ - **Case-insensitive matching** — table names are lowered for comparison to handle mixed-case schemas
+ - **Moved schema dump before graph build** — `_resolved_schema` is fully populated after Pass 1, so dumping between passes is safe and gives users earlier feedback. Required exposing `extract_schemas()` as a public method on `GraphBuilder`
+
+ ## Implementation
+
+ - [x] Add schema pruning in `_analyze_column_lineage_internal()` using existing `_get_query_tables()` method
+ - [x] Expose `extract_schemas()` as public method on `GraphBuilder`
+ - [x] Skip Pass 1 in `add_files()`/`add_manifest()` if `_resolved_schema` is already populated
+ - [x] Restructure CLI `graph_build` to call `extract_schemas()` then dump schema before `add_files()`/`add_manifest()`
+ - [x] Tests for `extract_schemas()` method and schema pruning correctness
+
+ ## Files Modified
+
+ - `src/sqlglider/lineage/analyzer.py` — schema pruning before `lineage()` calls
+ - `src/sqlglider/graph/builder.py` — public `extract_schemas()`, skip Pass 1 when already resolved
+ - `src/sqlglider/cli.py` — restructured `graph_build` to dump schema before graph building
+ - `tests/sqlglider/lineage/test_analyzer.py` — `TestSchemaPruning` (2 tests)
+ - `tests/sqlglider/graph/test_builder.py` — `TestExtractSchemas` (3 tests)
+
+ ## Benchmark Results (After)
+
+ | Schema Size | Time (6 columns) |
+ |---|---|
+ | No schema | 2.6ms |
+ | 4 tables | 8.3ms |
+ | 204 tables | **8.3ms** |
+
+ Full `analyze_queries` benchmark on complex fixture (analytics_pipeline.sql):
+
+ | Scenario | Before | After |
+ |---|---|---|
+ | No schema | ~392ms | ~392ms |
+ | Small schema (4 tables) | ~373ms | ~373ms |
+ | Big schema (204 tables) | ~1,400ms+ | **~387ms** |
+
+ ## Lessons Learned
+
+ - The initial assumption was that double-parsing (Pass 1 + Pass 2 both calling `sqlglot.parse()`) was the bottleneck. Benchmarking showed `parse()` costs ~10ms, while `lineage()` with a 200-table schema costs ~1,000ms. Profiling before optimizing avoided wasted effort on AST caching.
+ - `sqlglot.lineage()` appears to have O(n) or worse scaling with schema size, even for tables not referenced in the query. Pruning is essential for multi-file workloads.
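To make the pruning idea concrete, here is a minimal sketch built on sqlglot's public API. It is illustrative only: the helper name `prune_schema_for_query` and the exact matching rules are assumptions, not the package's actual `LineageAnalyzer` implementation.

```python
from typing import Dict

import sqlglot
from sqlglot import exp
from sqlglot.lineage import lineage

SchemaDict = Dict[str, Dict[str, str]]


def prune_schema_for_query(
    sql: str, schema: SchemaDict, dialect: str = "spark"
) -> SchemaDict:
    """Keep only the schema entries for tables the query actually references."""
    parsed = sqlglot.parse_one(sql, read=dialect)
    referenced = set()
    for table in parsed.find_all(exp.Table):
        # Build a dotted name from catalog/db/table parts, ignoring aliases.
        parts = [p for p in (table.catalog, table.db, table.name) if p]
        referenced.add(".".join(parts).lower())
    # Case-insensitive match, mirroring the plan's design decision.
    return {name: cols for name, cols in schema.items() if name.lower() in referenced}


if __name__ == "__main__":
    big_schema: SchemaDict = {
        "orders": {"order_id": "INT", "amount": "DOUBLE"},
        "customers": {"customer_id": "INT", "name": "STRING"},
        # ...imagine hundreds more tables accumulated from other files
    }
    query = "SELECT o.order_id, o.amount FROM orders AS o"
    pruned = prune_schema_for_query(query, big_schema)
    # Only the pruned mapping is handed to sqlglot.lineage().
    node = lineage("amount", query, schema=pruned, dialect="spark")
    print(sorted(pruned), node.name)
```

Pruning keeps the mapping handed to `lineage()` proportional to the query itself, which is what brings the 204-table case back in line with the small-schema timings in the tables above.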
sql_glider-0.1.14/plans/2026-01-29-tables-scrape-command.md
@@ -0,0 +1,118 @@
+ **Status:** Completed
+
+ # Plan: `tables scrape` Command
+
+ ## Overview
+
+ Add a `tables scrape` subcommand that performs schema inference (the same logic `graph build --resolve-schema` uses) but outputs the inferred schema directly instead of building a lineage graph. This makes schema inference a standalone, reusable operation.
+
+ ## Key Changes
+
+ ### 1. Refactor schema extraction out of GraphBuilder
+
+ **File:** `src/sqlglider/graph/builder.py`
+
+ Extract `_extract_schemas()` and `_fill_schema_from_catalog()` into a standalone module so both `graph build` and `tables scrape` can use them without instantiating a full `GraphBuilder`.
+
+ **New file:** `src/sqlglider/schema/extractor.py`
+ - `extract_schemas_from_files(file_paths, dialect, sql_preprocessor, schema, strict_schema, console) -> SchemaDict` — core extraction loop with Rich progress bar
+ - `fill_schema_from_catalog(schema, file_paths, dialect, sql_preprocessor, catalog_type, catalog_config, console) -> SchemaDict` — catalog fill logic
+ - `extract_and_resolve_schema(file_paths, dialect, sql_preprocessor, strict_schema, catalog_type, catalog_config, console) -> SchemaDict` — high-level orchestrator (extract + optional catalog fill)
+
+ **Update GraphBuilder** to delegate to these new functions instead of implementing them inline. `GraphBuilder._extract_schemas` and `_fill_schema_from_catalog` become thin wrappers or are removed, with `extract_schemas()` calling the shared code.
+
+ ### 2. Add `tables scrape` CLI command
+
+ **File:** `src/sqlglider/cli.py`
+
+ Add `@tables_app.command("scrape")` with these parameters (mirroring `graph build`):
+
+ | Parameter | Source |
+ |-----------|--------|
+ | `paths` | Same as `graph build` — file(s) or directory(ies) |
+ | `--recursive / -r` | Same recursive directory traversal |
+ | `--glob / -g` | Same glob pattern (default `*.sql`) |
+ | `--manifest / -m` | Same manifest CSV support |
+ | `--dialect / -d` | SQL dialect |
+ | `--templater / -t` | Templater name |
+ | `--var / -v` | Template variables |
+ | `--vars-file` | Variables file |
+ | `--strict-schema` | Strict schema mode |
+ | `--catalog-type / -c` | Catalog provider for remote DDL |
+ | `--output-format / -f` | `text` (default), `json`, or `csv` |
+ | `--output-file / -o` | Output file path (stdout if omitted) |
+
+ **Flow:**
+ 1. Resolve config defaults (same pattern as `graph build`)
+ 2. Set up templating preprocessor (same shared code)
+ 3. Collect files from paths/manifest (same logic as `graph build`)
+ 4. Call `extract_and_resolve_schema(...)` from the new shared module
+ 5. Format output using existing `format_schema()` from `src/sqlglider/graph/formatters.py`
+ 6. Write to file or stdout using `OutputWriter`
+
+ ### 3. Move schema formatters
+
+ The existing formatters in `src/sqlglider/graph/formatters.py` are already generic (they format `SchemaDict`). **Decision: leave them in place** to minimize churn — they work for both `graph build --dump-schema` and `tables scrape`.
+
+ ### 4. Refactor shared file-collection logic
+
+ The file collection code (paths + recursive glob + manifest) is duplicated between `graph build` and the new `tables scrape`. Extract a helper function:
+
+ ```python
+ def _collect_sql_files(
+     paths: Optional[List[Path]],
+     manifest: Optional[Path],
+     recursive: bool,
+     glob_pattern: str,
+ ) -> tuple[list[Path], list[Path]]:
+     """Returns (manifest_files, path_files)."""
+ ```
+
+ Place this in `cli.py` as a private helper used by both commands.
+
+ ## Implementation Steps
+
+ - [x] Create `src/sqlglider/schema/__init__.py`
+ - [x] Create `src/sqlglider/schema/extractor.py` with shared schema extraction logic
+ - [x] Update `src/sqlglider/graph/builder.py` to delegate to shared extractor
+ - [x] Add `_collect_sql_files` helper to `src/sqlglider/cli.py`
+ - [x] Refactor `graph build` to use `_collect_sql_files`
+ - [x] Add `tables scrape` command to `src/sqlglider/cli.py`
+ - [x] Create `tests/sqlglider/schema/__init__.py`
+ - [x] Create `tests/sqlglider/schema/test_extractor.py` (10 tests)
+ - [x] Add `TestTablesScrapeCommand` to `tests/sqlglider/test_cli.py` (11 tests)
+ - [x] All 672 tests pass, 81.5% coverage, ruff clean
+
+ ## Files Created/Modified
+
+ | File | Action |
+ |------|--------|
+ | `src/sqlglider/schema/__init__.py` | Created — empty |
+ | `src/sqlglider/schema/extractor.py` | Created — shared schema extraction logic |
+ | `src/sqlglider/graph/builder.py` | Modified — delegate to shared extractor |
+ | `src/sqlglider/cli.py` | Modified — add `tables scrape` command + `_collect_sql_files` helper |
+ | `tests/sqlglider/schema/__init__.py` | Created — empty |
+ | `tests/sqlglider/schema/test_extractor.py` | Created — tests for shared extractor |
+ | `tests/sqlglider/test_cli.py` | Modified — add tests for `tables scrape` command |
+
+ ## Testing Strategy
+
+ 1. **Unit tests for `schema/extractor.py`**: Test `extract_schemas_from_files` with CREATE VIEW/TABLE AS SELECT and DQL qualified refs
+ 2. **CLI tests for `tables scrape`**: Use `CliRunner` to test text/json/csv output, recursive glob, templating, error cases
+ 3. **Regression**: Full test suite passes (672 tests), coverage at 81.5%
+
+ ## Verification
+
+ ```bash
+ # Basic usage
+ uv run sqlglider tables scrape ./queries/ -r
+
+ # With output format
+ uv run sqlglider tables scrape ./queries/ -r -f json -o schema.json
+
+ # With catalog
+ uv run sqlglider tables scrape ./queries/ -r -c databricks -f csv
+
+ # Ensure graph build still works
+ uv run sqlglider graph build ./queries/ -r --resolve-schema --dump-schema schema.txt -o graph.json
+ ```
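For orientation, the following is a minimal sketch of the kind of DDL-driven extraction the new `schema/extractor.py` performs. The function name `extract_schema_from_sql` is an illustrative assumption; the real extractor also handles CREATE ... AS SELECT, qualified DQL references, strict mode, progress output, and the catalog fallback described above.

```python
from typing import Dict

import sqlglot
from sqlglot import exp

SchemaDict = Dict[str, Dict[str, str]]


def extract_schema_from_sql(sql: str, dialect: str = "spark") -> SchemaDict:
    """Infer a {table: {column: type}} mapping from CREATE statements."""
    schema: SchemaDict = {}
    for statement in sqlglot.parse(sql, read=dialect):
        if not isinstance(statement, exp.Create):
            continue
        target = statement.this  # exp.Schema when column definitions are present
        if not isinstance(target, exp.Schema):
            continue
        table_name = target.this.sql(dialect=dialect)
        columns: Dict[str, str] = {}
        for col in target.expressions:
            if not isinstance(col, exp.ColumnDef):
                continue  # skip table-level constraints
            kind = col.args.get("kind")
            columns[col.name] = kind.sql(dialect=dialect) if kind else "UNKNOWN"
        if columns:
            schema[table_name] = columns
    return schema


if __name__ == "__main__":
    ddl = """
    CREATE TABLE raw_orders (order_id BIGINT, customer_id BIGINT, amount DECIMAL(10, 2));
    CREATE TABLE raw_customers (customer_id BIGINT, name STRING);
    """
    print(extract_schema_from_sql(ddl))
```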
{sql_glider-0.1.13 → sql_glider-0.1.14}/src/sqlglider/_version.py
@@ -28,7 +28,7 @@ version_tuple: VERSION_TUPLE
  commit_id: COMMIT_ID
  __commit_id__: COMMIT_ID

- __version__ = version = '0.1.13'
- __version_tuple__ = version_tuple = (0, 1, 13)
+ __version__ = version = '0.1.14'
+ __version_tuple__ = version_tuple = (0, 1, 14)

  __commit_id__ = commit_id = None
{sql_glider-0.1.13 → sql_glider-0.1.14}/src/sqlglider/cli.py
@@ -788,6 +788,274 @@ def tables_pull(
          raise typer.Exit(1)


+ def _collect_sql_files(
+     paths: Optional[List[Path]],
+     manifest: Optional[Path],
+     recursive: bool,
+     glob_pattern: str,
+ ) -> tuple[list[Path], list[Path]]:
+     """Collect SQL files from paths and/or manifest.
+
+     Args:
+         paths: File or directory paths to scan.
+         manifest: Optional manifest CSV path.
+         recursive: Whether to recurse into directories.
+         glob_pattern: Glob pattern for directory scanning.
+
+     Returns:
+         Tuple of (manifest_files, path_files).
+     """
+     path_files: list[Path] = []
+     if paths:
+         for path in paths:
+             if path.is_dir():
+                 pattern = f"**/{glob_pattern}" if recursive else glob_pattern
+                 path_files.extend(f for f in sorted(path.glob(pattern)) if f.is_file())
+             elif path.is_file():
+                 path_files.append(path)
+             else:
+                 err_console.print(f"[red]Error:[/red] Path not found: {path}")
+                 raise typer.Exit(1)
+
+     manifest_files: list[Path] = []
+     if manifest:
+         from sqlglider.graph.models import Manifest
+
+         manifest_data = Manifest.from_csv(manifest)
+         base_dir = manifest.parent
+         for entry in manifest_data.entries:
+             file_path = Path(entry.file_path)
+             if not file_path.is_absolute():
+                 file_path = (base_dir / entry.file_path).resolve()
+             manifest_files.append(file_path)
+
+     return manifest_files, path_files
+
+
+ @tables_app.command("scrape")
+ def tables_scrape(
+     paths: List[Path] = typer.Argument(
+         None,
+         help="SQL file(s) or directory path to process",
+     ),
+     recursive: bool = typer.Option(
+         False,
+         "--recursive",
+         "-r",
+         help="Recursively search directories for SQL files",
+     ),
+     glob_pattern: str = typer.Option(
+         "*.sql",
+         "--glob",
+         "-g",
+         help="Glob pattern for matching SQL files in directories",
+     ),
+     manifest: Optional[Path] = typer.Option(
+         None,
+         "--manifest",
+         "-m",
+         exists=True,
+         help="Path to manifest CSV file with file_path and optional dialect columns",
+     ),
+     dialect: Optional[str] = typer.Option(
+         None,
+         "--dialect",
+         "-d",
+         help="SQL dialect (default: spark)",
+     ),
+     templater: Optional[str] = typer.Option(
+         None,
+         "--templater",
+         "-t",
+         help="Templater for SQL preprocessing (e.g., 'jinja', 'none')",
+     ),
+     var: Optional[List[str]] = typer.Option(
+         None,
+         "--var",
+         "-v",
+         help="Template variable in key=value format (repeatable)",
+     ),
+     vars_file: Optional[Path] = typer.Option(
+         None,
+         "--vars-file",
+         exists=True,
+         help="Path to variables file (JSON or YAML)",
+     ),
+     strict_schema: bool = typer.Option(
+         False,
+         "--strict-schema",
+         help="Fail if any column's table cannot be identified during schema extraction",
+     ),
+     catalog_type: Optional[str] = typer.Option(
+         None,
+         "--catalog-type",
+         "-c",
+         help="Catalog provider for pulling DDL of tables not found in files "
+         "(e.g. 'databricks')",
+     ),
+     output_format: Optional[str] = typer.Option(
+         None,
+         "--output-format",
+         "-f",
+         help="Output format: 'text' (default), 'json', or 'csv'",
+     ),
+     output_file: Optional[Path] = typer.Option(
+         None,
+         "--output-file",
+         "-o",
+         help="Output file path (prints to stdout if not provided)",
+     ),
+ ) -> None:
+     """
+     Scrape schema information from SQL files.
+
+     Infers table and column schemas from DDL statements and DQL column
+     references across one or more SQL files. Supports the same file input
+     modes as `graph build` (paths, directories, manifests).
+
+     Examples:
+
+         # Scrape schema from a directory
+         sqlglider tables scrape ./queries/ -r
+
+         # Output as JSON
+         sqlglider tables scrape ./queries/ -r -f json
+
+         # Save to file
+         sqlglider tables scrape ./queries/ -r -f csv -o schema.csv
+
+         # With Jinja2 templating
+         sqlglider tables scrape ./queries/ -r --templater jinja --var schema=prod
+
+         # With catalog fallback
+         sqlglider tables scrape ./queries/ -r -c databricks
+     """
+     from sqlglider.graph.formatters import format_schema
+     from sqlglider.lineage.analyzer import SchemaResolutionError
+     from sqlglider.schema.extractor import extract_and_resolve_schema
+
+     # Load config for defaults
+     config = load_config()
+     dialect = dialect or config.dialect or "spark"
+     templater = templater or config.templater
+     strict_schema = strict_schema or config.strict_schema or False
+     output_format = output_format or config.output_format or "text"
+
+     if output_format not in ("text", "json", "csv"):
+         err_console.print(
+             f"[red]Error:[/red] Invalid --output-format '{output_format}'. "
+             "Use 'text', 'json', or 'csv'."
+         )
+         raise typer.Exit(1)
+
+     # Only inherit catalog_type from config when not provided via CLI
+     if not catalog_type:
+         catalog_type = config.catalog_type
+
+     # Validate inputs
+     if not paths and not manifest:
+         err_console.print(
+             "[red]Error:[/red] Must provide either file/directory paths or --manifest option."
+         )
+         raise typer.Exit(1)
+
+     # Create SQL preprocessor if templating is enabled
+     sql_preprocessor: Optional[Callable[[str, Path], str]] = None
+     if templater:
+         config_vars_file = None
+         config_vars = None
+         if config.templating:
+             if config.templating.variables_file and not vars_file:
+                 config_vars_file = Path(config.templating.variables_file)
+                 if not config_vars_file.exists():
+                     err_console.print(
+                         f"[yellow]Warning:[/yellow] Variables file from config "
+                         f"not found: {config_vars_file}"
+                     )
+                     config_vars_file = None
+             config_vars = config.templating.variables
+
+         variables = load_all_variables(
+             cli_vars=var,
+             vars_file=vars_file or config_vars_file,
+             config_vars=config_vars,
+             use_env=True,
+         )
+
+         templater_instance = get_templater(templater)
+
+         def _preprocess(sql: str, file_path: Path) -> str:
+             return templater_instance.render(
+                 sql, variables=variables, source_path=file_path
+             )
+
+         sql_preprocessor = _preprocess
+
+     try:
+         # Build catalog config from config file if available
+         catalog_config_dict = None
+         if catalog_type and config.catalog:
+             provider_config = getattr(config.catalog, catalog_type, None)
+             if provider_config:
+                 catalog_config_dict = provider_config.model_dump(exclude_none=True)
+
+         # Collect files
+         manifest_files, path_files = _collect_sql_files(
+             paths, manifest, recursive, glob_pattern
+         )
+         all_files = manifest_files + path_files
+
+         if not all_files:
+             err_console.print("[yellow]Warning:[/yellow] No SQL files found.")
+             raise typer.Exit(0)
+
+         # Extract schema
+         schema = extract_and_resolve_schema(
+             all_files,
+             dialect=dialect,
+             sql_preprocessor=sql_preprocessor,
+             strict_schema=strict_schema,
+             catalog_type=catalog_type,
+             catalog_config=catalog_config_dict,
+             console=err_console,
+         )
+
+         if not schema:
+             err_console.print("[yellow]No schema information found.[/yellow]")
+             raise typer.Exit(0)
+
+         # Format and output
+         formatted = format_schema(schema, output_format)
+         if output_file:
+             OutputWriter.write(formatted, output_file)
+             err_console.print(
+                 f"[green]Schema written to {output_file} "
+                 f"({len(schema)} table(s))[/green]"
+             )
+         else:
+             console.print(formatted, end="")
+
+     except SchemaResolutionError as e:
+         err_console.print(f"[red]Error:[/red] {e}")
+         raise typer.Exit(1)
+
+     except FileNotFoundError as e:
+         err_console.print(f"[red]Error:[/red] {e}")
+         raise typer.Exit(1)
+
+     except TemplaterError as e:
+         err_console.print(f"[red]Error:[/red] {e}")
+         raise typer.Exit(1)
+
+     except ValueError as e:
+         err_console.print(f"[red]Error:[/red] {e}")
+         raise typer.Exit(1)
+
+     except Exception as e:
+         err_console.print(f"[red]Error:[/red] Unexpected error: {e}")
+         raise typer.Exit(1)
+
+
  @app.command()
  def template(
      sql_file: Annotated[
@@ -1167,31 +1435,9 @@ def graph_build(
      )

      # Collect file paths for schema extraction
-     path_files: list[Path] = []
-     if paths:
-         for path in paths:
-             if path.is_dir():
-                 pattern = f"**/{glob_pattern}" if recursive else glob_pattern
-                 path_files.extend(
-                     f for f in sorted(path.glob(pattern)) if f.is_file()
-                 )
-             elif path.is_file():
-                 path_files.append(path)
-             else:
-                 err_console.print(f"[red]Error:[/red] Path not found: {path}")
-                 raise typer.Exit(1)
-
-     manifest_files: list[Path] = []
-     if manifest:
-         from sqlglider.graph.models import Manifest
-
-         manifest_data = Manifest.from_csv(manifest)
-         base_dir = manifest.parent
-         for entry in manifest_data.entries:
-             file_path = Path(entry.file_path)
-             if not file_path.is_absolute():
-                 file_path = (base_dir / entry.file_path).resolve()
-             manifest_files.append(file_path)
+     manifest_files, path_files = _collect_sql_files(
+         paths, manifest, recursive, glob_pattern
+     )

      # Extract schema upfront if requested, then dump before graph building
      all_files = manifest_files + path_files
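The tables-scrape plan's testing strategy calls for `CliRunner`-based CLI tests. A hypothetical sketch of one such test follows; it assumes the Typer application object exported from `sqlglider.cli` is named `app` (as the `@app.command()` decorators above suggest) and that the scrape command is registered under the `tables` group, per the plan's verification examples.

```python
from pathlib import Path

from typer.testing import CliRunner

from sqlglider.cli import app  # assumed export name, inferred from the decorators above

runner = CliRunner()


def test_tables_scrape_outputs_json(tmp_path: Path) -> None:
    # A single DDL file is enough for the extractor to infer one table.
    (tmp_path / "orders.sql").write_text(
        "CREATE TABLE orders (order_id BIGINT, amount DECIMAL(10, 2));"
    )

    result = runner.invoke(app, ["tables", "scrape", str(tmp_path), "-r", "-f", "json"])

    assert result.exit_code == 0
    assert "orders" in result.stdout
```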
{sql_glider-0.1.13 → sql_glider-0.1.14}/src/sqlglider/graph/builder.py
@@ -16,9 +16,9 @@ from sqlglider.graph.models import (
      LineageGraph,
      Manifest,
  )
- from sqlglider.lineage.analyzer import LineageAnalyzer, SchemaResolutionError
+ from sqlglider.lineage.analyzer import LineageAnalyzer
+ from sqlglider.schema.extractor import extract_and_resolve_schema
  from sqlglider.utils.file_utils import read_sql_file
- from sqlglider.utils.schema import parse_ddl_to_schema

  console = Console(stderr=True)

@@ -320,139 +320,17 @@ class GraphBuilder:
          Returns:
              Resolved schema dict
          """
-         console.print("[blue]Pass 1: Extracting schema from files[/blue]")
-         self._resolved_schema = self._extract_schemas(file_paths, dialect)
-         if self.catalog_type:
-             self._resolved_schema = self._fill_schema_from_catalog(
-                 self._resolved_schema, file_paths, dialect
-             )
-         console.print(
-             f"[blue]Schema resolved for {len(self._resolved_schema)} table(s)[/blue]"
-         )
-         return self._resolved_schema.copy()
-
-     def _extract_schemas(
-         self,
-         file_paths: List[Path],
-         dialect: Optional[str] = None,
-     ) -> Dict[str, Dict[str, str]]:
-         """Run schema extraction pass across all files.
-
-         Parses each file and extracts schema from CREATE TABLE/VIEW
-         statements without performing lineage analysis.
-
-         Args:
-             file_paths: SQL files to extract schema from
-             dialect: SQL dialect override
-
-         Returns:
-             Accumulated schema dict from all files
-         """
-         schema: Dict[str, Dict[str, str]] = {}
-         total = len(file_paths)
-         with Progress(
-             TextColumn("[progress.description]{task.description}"),
-             BarColumn(),
-             TaskProgressColumn(),
+         file_dialect = dialect or self.dialect
+         self._resolved_schema = extract_and_resolve_schema(
+             file_paths,
+             dialect=file_dialect,
+             sql_preprocessor=self.sql_preprocessor,
+             strict_schema=self.strict_schema,
+             catalog_type=self.catalog_type,
+             catalog_config=self.catalog_config,
              console=console,
-             transient=False,
-         ) as progress:
-             task = progress.add_task("Pass 1: Extracting schema", total=total)
-             for i, file_path in enumerate(file_paths, start=1):
-                 console.print(f"Extracting schema {i}/{total}: {file_path.name}")
-                 file_dialect = dialect or self.dialect
-                 try:
-                     sql_content = read_sql_file(file_path)
-                     if self.sql_preprocessor:
-                         sql_content = self.sql_preprocessor(sql_content, file_path)
-                     analyzer = LineageAnalyzer(
-                         sql_content,
-                         dialect=file_dialect,
-                         schema=schema,
-                         strict_schema=self.strict_schema,
-                     )
-                     file_schema = analyzer.extract_schema_only()
-                     schema.update(file_schema)
-                 except SchemaResolutionError:
-                     raise
-                 except Exception:
-                     # Schema extraction failures are non-fatal; the file
-                     # will be reported during the lineage pass if it also fails.
-                     pass
-                 progress.advance(task)
-         return schema
-
-     def _fill_schema_from_catalog(
-         self,
-         schema: Dict[str, Dict[str, str]],
-         file_paths: List[Path],
-         dialect: Optional[str] = None,
-     ) -> Dict[str, Dict[str, str]]:
-         """Pull DDL from catalog for tables not yet in schema.
-
-         Extracts all table names referenced across the files, identifies
-         those missing from the schema, and fetches their DDL from the
-         configured catalog provider.
-
-         Args:
-             schema: Schema dict already populated from file extraction
-             file_paths: SQL files to scan for table references
-             dialect: SQL dialect override
-
-         Returns:
-             Updated schema dict with catalog-sourced entries added
-         """
-         from sqlglider.catalog import get_catalog
-
-         catalog = get_catalog(self.catalog_type)  # type: ignore[arg-type]
-         if self.catalog_config:
-             catalog.configure(self.catalog_config)
-
-         # Collect all referenced table names across files
-         all_tables: Set[str] = set()
-         for file_path in file_paths:
-             file_dialect = dialect or self.dialect
-             try:
-                 sql_content = read_sql_file(file_path)
-                 if self.sql_preprocessor:
-                     sql_content = self.sql_preprocessor(sql_content, file_path)
-                 analyzer = LineageAnalyzer(sql_content, dialect=file_dialect)
-                 tables_results = analyzer.analyze_tables()
-                 for result in tables_results:
-                     for table_info in result.tables:
-                         # Skip CTEs — they don't exist in catalogs
-                         from sqlglider.lineage.analyzer import ObjectType
-
-                         if table_info.object_type != ObjectType.CTE:
-                             all_tables.add(table_info.name)
-             except Exception:
-                 pass
-
-         # Find tables missing from schema
-         missing = [t for t in all_tables if t not in schema]
-         if not missing:
-             return schema
-
-         console.print(
-             f"[blue]Pulling DDL from {self.catalog_type} "
-             f"for {len(missing)} table(s)...[/blue]"
          )
-
-         ddl_results = catalog.get_ddl_batch(missing)
-         file_dialect = dialect or self.dialect
-         for table_name, ddl in ddl_results.items():
-             if ddl.startswith("ERROR:"):
-                 console.print(
-                     f"[yellow]Warning:[/yellow] Could not pull DDL "
-                     f"for {table_name}: {ddl}"
-                 )
-                 continue
-             parsed_schema = parse_ddl_to_schema(ddl, dialect=file_dialect)
-             for name, cols in parsed_schema.items():
-                 if name not in schema:
-                     schema[name] = cols
-
-         return schema
+         return self._resolved_schema.copy()

      def _ensure_node(
          self,