sql-glider 0.1.12__tar.gz → 0.1.14__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (150) hide show
  1. {sql_glider-0.1.12 → sql_glider-0.1.14}/PKG-INFO +1 -1
  2. sql_glider-0.1.14/plans/2026-01-29-schema-pruning-optimization.md +63 -0
  3. sql_glider-0.1.14/plans/2026-01-29-tables-scrape-command.md +118 -0
  4. {sql_glider-0.1.12 → sql_glider-0.1.14}/src/sqlglider/_version.py +2 -2
  5. {sql_glider-0.1.12 → sql_glider-0.1.14}/src/sqlglider/cli.py +293 -26
  6. {sql_glider-0.1.12 → sql_glider-0.1.14}/src/sqlglider/graph/builder.py +22 -134
  7. {sql_glider-0.1.12 → sql_glider-0.1.14}/src/sqlglider/lineage/analyzer.py +15 -2
  8. sql_glider-0.1.14/src/sqlglider/schema/__init__.py +0 -0
  9. sql_glider-0.1.14/src/sqlglider/schema/extractor.py +202 -0
  10. {sql_glider-0.1.12 → sql_glider-0.1.14}/tests/sqlglider/graph/test_builder.py +42 -0
  11. {sql_glider-0.1.12 → sql_glider-0.1.14}/tests/sqlglider/lineage/test_analyzer.py +48 -0
  12. sql_glider-0.1.14/tests/sqlglider/schema/__init__.py +0 -0
  13. sql_glider-0.1.14/tests/sqlglider/schema/test_extractor.py +163 -0
  14. {sql_glider-0.1.12 → sql_glider-0.1.14}/tests/sqlglider/test_cli.py +161 -0
  15. {sql_glider-0.1.12 → sql_glider-0.1.14}/.github/workflows/ci.yml +0 -0
  16. {sql_glider-0.1.12 → sql_glider-0.1.14}/.github/workflows/publish.yml +0 -0
  17. {sql_glider-0.1.12 → sql_glider-0.1.14}/.gitignore +0 -0
  18. {sql_glider-0.1.12 → sql_glider-0.1.14}/.python-version +0 -0
  19. {sql_glider-0.1.12 → sql_glider-0.1.14}/ARCHITECTURE.md +0 -0
  20. {sql_glider-0.1.12 → sql_glider-0.1.14}/CLAUDE.md +0 -0
  21. {sql_glider-0.1.12 → sql_glider-0.1.14}/LICENSE +0 -0
  22. {sql_glider-0.1.12 → sql_glider-0.1.14}/README.md +0 -0
  23. {sql_glider-0.1.12 → sql_glider-0.1.14}/plans/2025-12-05-column-level-lineage.md +0 -0
  24. {sql_glider-0.1.12 → sql_glider-0.1.14}/plans/2025-12-05-reverse-lineage.md +0 -0
  25. {sql_glider-0.1.12 → sql_glider-0.1.14}/plans/2025-12-06-config-file-support.md +0 -0
  26. {sql_glider-0.1.12 → sql_glider-0.1.14}/plans/2025-12-06-graph-lineage.md +0 -0
  27. {sql_glider-0.1.12 → sql_glider-0.1.14}/plans/2025-12-06-unify-single-multi-query.md +0 -0
  28. {sql_glider-0.1.12 → sql_glider-0.1.14}/plans/2025-12-07-sample-data-model.md +0 -0
  29. {sql_glider-0.1.12 → sql_glider-0.1.14}/plans/2025-12-07-sql-templating.md +0 -0
  30. {sql_glider-0.1.12 → sql_glider-0.1.14}/plans/2025-12-08-tables-command.md +0 -0
  31. {sql_glider-0.1.12 → sql_glider-0.1.14}/plans/2025-12-09-graph-query-paths.md +0 -0
  32. {sql_glider-0.1.12 → sql_glider-0.1.14}/plans/2025-12-13-dissect-command.md +0 -0
  33. {sql_glider-0.1.12 → sql_glider-0.1.14}/plans/2025-12-14-tables-pull-command.md +0 -0
  34. {sql_glider-0.1.12 → sql_glider-0.1.14}/plans/2026-01-25-fix-union-lineage-chain.md +0 -0
  35. {sql_glider-0.1.12 → sql_glider-0.1.14}/plans/2026-01-26-file-scoped-schema-context.md +0 -0
  36. {sql_glider-0.1.12 → sql_glider-0.1.14}/plans/2026-01-28-sparksql-table-extraction.md +0 -0
  37. {sql_glider-0.1.12 → sql_glider-0.1.14}/plans/2026-01-29-no-star-flag.md +0 -0
  38. {sql_glider-0.1.12 → sql_glider-0.1.14}/plans/2026-01-29-resolve-schema.md +0 -0
  39. {sql_glider-0.1.12 → sql_glider-0.1.14}/pyproject.toml +0 -0
  40. {sql_glider-0.1.12 → sql_glider-0.1.14}/sample_data_model/README.md +0 -0
  41. {sql_glider-0.1.12 → sql_glider-0.1.14}/sample_data_model/business/expire_dim_customer.sql +0 -0
  42. {sql_glider-0.1.12 → sql_glider-0.1.14}/sample_data_model/business/load_fact_orders.sql +0 -0
  43. {sql_glider-0.1.12 → sql_glider-0.1.14}/sample_data_model/business/load_fact_payments.sql +0 -0
  44. {sql_glider-0.1.12 → sql_glider-0.1.14}/sample_data_model/business/merge_dim_customer.sql +0 -0
  45. {sql_glider-0.1.12 → sql_glider-0.1.14}/sample_data_model/business/merge_dim_product.sql +0 -0
  46. {sql_glider-0.1.12 → sql_glider-0.1.14}/sample_data_model/business/update_dim_customer_metrics.sql +0 -0
  47. {sql_glider-0.1.12 → sql_glider-0.1.14}/sample_data_model/complex/conditional_merge.sql +0 -0
  48. {sql_glider-0.1.12 → sql_glider-0.1.14}/sample_data_model/complex/cte_insert.sql +0 -0
  49. {sql_glider-0.1.12 → sql_glider-0.1.14}/sample_data_model/complex/multi_table_transform.sql +0 -0
  50. {sql_glider-0.1.12 → sql_glider-0.1.14}/sample_data_model/ddl/dim_customer.sql +0 -0
  51. {sql_glider-0.1.12 → sql_glider-0.1.14}/sample_data_model/ddl/dim_product.sql +0 -0
  52. {sql_glider-0.1.12 → sql_glider-0.1.14}/sample_data_model/ddl/fact_orders.sql +0 -0
  53. {sql_glider-0.1.12 → sql_glider-0.1.14}/sample_data_model/ddl/fact_payments.sql +0 -0
  54. {sql_glider-0.1.12 → sql_glider-0.1.14}/sample_data_model/ddl/raw_addresses.sql +0 -0
  55. {sql_glider-0.1.12 → sql_glider-0.1.14}/sample_data_model/ddl/raw_customers.sql +0 -0
  56. {sql_glider-0.1.12 → sql_glider-0.1.14}/sample_data_model/ddl/raw_order_items.sql +0 -0
  57. {sql_glider-0.1.12 → sql_glider-0.1.14}/sample_data_model/ddl/raw_orders.sql +0 -0
  58. {sql_glider-0.1.12 → sql_glider-0.1.14}/sample_data_model/ddl/raw_payments.sql +0 -0
  59. {sql_glider-0.1.12 → sql_glider-0.1.14}/sample_data_model/ddl/raw_products.sql +0 -0
  60. {sql_glider-0.1.12 → sql_glider-0.1.14}/sample_data_model/ddl/stg_customers.sql +0 -0
  61. {sql_glider-0.1.12 → sql_glider-0.1.14}/sample_data_model/ddl/stg_orders.sql +0 -0
  62. {sql_glider-0.1.12 → sql_glider-0.1.14}/sample_data_model/ddl/stg_payments.sql +0 -0
  63. {sql_glider-0.1.12 → sql_glider-0.1.14}/sample_data_model/ddl/stg_products.sql +0 -0
  64. {sql_glider-0.1.12 → sql_glider-0.1.14}/sample_data_model/incremental/incr_fact_orders.sql +0 -0
  65. {sql_glider-0.1.12 → sql_glider-0.1.14}/sample_data_model/incremental/incr_fact_payments.sql +0 -0
  66. {sql_glider-0.1.12 → sql_glider-0.1.14}/sample_data_model/incremental/incr_pres_sales_summary.sql +0 -0
  67. {sql_glider-0.1.12 → sql_glider-0.1.14}/sample_data_model/maintenance/delete_expired_customers.sql +0 -0
  68. {sql_glider-0.1.12 → sql_glider-0.1.14}/sample_data_model/maintenance/update_product_status.sql +0 -0
  69. {sql_glider-0.1.12 → sql_glider-0.1.14}/sample_data_model/presentation/load_pres_customer_360.sql +0 -0
  70. {sql_glider-0.1.12 → sql_glider-0.1.14}/sample_data_model/presentation/load_pres_customer_cohort.sql +0 -0
  71. {sql_glider-0.1.12 → sql_glider-0.1.14}/sample_data_model/presentation/load_pres_product_performance.sql +0 -0
  72. {sql_glider-0.1.12 → sql_glider-0.1.14}/sample_data_model/presentation/load_pres_sales_summary.sql +0 -0
  73. {sql_glider-0.1.12 → sql_glider-0.1.14}/sample_data_model/staging/load_stg_customers.sql +0 -0
  74. {sql_glider-0.1.12 → sql_glider-0.1.14}/sample_data_model/staging/load_stg_orders.sql +0 -0
  75. {sql_glider-0.1.12 → sql_glider-0.1.14}/sample_data_model/staging/load_stg_payments.sql +0 -0
  76. {sql_glider-0.1.12 → sql_glider-0.1.14}/sample_data_model/staging/load_stg_products.sql +0 -0
  77. {sql_glider-0.1.12 → sql_glider-0.1.14}/sqlglider.toml.example +0 -0
  78. {sql_glider-0.1.12 → sql_glider-0.1.14}/src/sqlglider/__init__.py +0 -0
  79. {sql_glider-0.1.12 → sql_glider-0.1.14}/src/sqlglider/catalog/__init__.py +0 -0
  80. {sql_glider-0.1.12 → sql_glider-0.1.14}/src/sqlglider/catalog/base.py +0 -0
  81. {sql_glider-0.1.12 → sql_glider-0.1.14}/src/sqlglider/catalog/databricks.py +0 -0
  82. {sql_glider-0.1.12 → sql_glider-0.1.14}/src/sqlglider/catalog/registry.py +0 -0
  83. {sql_glider-0.1.12 → sql_glider-0.1.14}/src/sqlglider/dissection/__init__.py +0 -0
  84. {sql_glider-0.1.12 → sql_glider-0.1.14}/src/sqlglider/dissection/analyzer.py +0 -0
  85. {sql_glider-0.1.12 → sql_glider-0.1.14}/src/sqlglider/dissection/formatters.py +0 -0
  86. {sql_glider-0.1.12 → sql_glider-0.1.14}/src/sqlglider/dissection/models.py +0 -0
  87. {sql_glider-0.1.12 → sql_glider-0.1.14}/src/sqlglider/global_models.py +0 -0
  88. {sql_glider-0.1.12 → sql_glider-0.1.14}/src/sqlglider/graph/__init__.py +0 -0
  89. {sql_glider-0.1.12 → sql_glider-0.1.14}/src/sqlglider/graph/formatters.py +0 -0
  90. {sql_glider-0.1.12 → sql_glider-0.1.14}/src/sqlglider/graph/merge.py +0 -0
  91. {sql_glider-0.1.12 → sql_glider-0.1.14}/src/sqlglider/graph/models.py +0 -0
  92. {sql_glider-0.1.12 → sql_glider-0.1.14}/src/sqlglider/graph/query.py +0 -0
  93. {sql_glider-0.1.12 → sql_glider-0.1.14}/src/sqlglider/graph/serialization.py +0 -0
  94. {sql_glider-0.1.12 → sql_glider-0.1.14}/src/sqlglider/lineage/__init__.py +0 -0
  95. {sql_glider-0.1.12 → sql_glider-0.1.14}/src/sqlglider/lineage/formatters.py +0 -0
  96. {sql_glider-0.1.12 → sql_glider-0.1.14}/src/sqlglider/templating/__init__.py +0 -0
  97. {sql_glider-0.1.12 → sql_glider-0.1.14}/src/sqlglider/templating/base.py +0 -0
  98. {sql_glider-0.1.12 → sql_glider-0.1.14}/src/sqlglider/templating/jinja.py +0 -0
  99. {sql_glider-0.1.12 → sql_glider-0.1.14}/src/sqlglider/templating/registry.py +0 -0
  100. {sql_glider-0.1.12 → sql_glider-0.1.14}/src/sqlglider/templating/variables.py +0 -0
  101. {sql_glider-0.1.12 → sql_glider-0.1.14}/src/sqlglider/utils/__init__.py +0 -0
  102. {sql_glider-0.1.12 → sql_glider-0.1.14}/src/sqlglider/utils/config.py +0 -0
  103. {sql_glider-0.1.12 → sql_glider-0.1.14}/src/sqlglider/utils/file_utils.py +0 -0
  104. {sql_glider-0.1.12 → sql_glider-0.1.14}/src/sqlglider/utils/schema.py +0 -0
  105. {sql_glider-0.1.12 → sql_glider-0.1.14}/tests/__init__.py +0 -0
  106. {sql_glider-0.1.12 → sql_glider-0.1.14}/tests/fixtures/multi_file_queries/analytics_pipeline.sql +0 -0
  107. {sql_glider-0.1.12 → sql_glider-0.1.14}/tests/fixtures/multi_file_queries/analytics_pipeline_union_merge.sql +0 -0
  108. {sql_glider-0.1.12 → sql_glider-0.1.14}/tests/fixtures/multi_file_queries/customers.sql +0 -0
  109. {sql_glider-0.1.12 → sql_glider-0.1.14}/tests/fixtures/multi_file_queries/orders.sql +0 -0
  110. {sql_glider-0.1.12 → sql_glider-0.1.14}/tests/fixtures/multi_file_queries/reports.sql +0 -0
  111. {sql_glider-0.1.12 → sql_glider-0.1.14}/tests/fixtures/multi_file_queries/view_based_merge.sql +0 -0
  112. {sql_glider-0.1.12 → sql_glider-0.1.14}/tests/fixtures/original_queries/test_cte.sql +0 -0
  113. {sql_glider-0.1.12 → sql_glider-0.1.14}/tests/fixtures/original_queries/test_cte_query.sql +0 -0
  114. {sql_glider-0.1.12 → sql_glider-0.1.14}/tests/fixtures/original_queries/test_cte_view_star.sql +0 -0
  115. {sql_glider-0.1.12 → sql_glider-0.1.14}/tests/fixtures/original_queries/test_generated_column_query.sql +0 -0
  116. {sql_glider-0.1.12 → sql_glider-0.1.14}/tests/fixtures/original_queries/test_multi.sql +0 -0
  117. {sql_glider-0.1.12 → sql_glider-0.1.14}/tests/fixtures/original_queries/test_multi_query.sql +0 -0
  118. {sql_glider-0.1.12 → sql_glider-0.1.14}/tests/fixtures/original_queries/test_single_query.sql +0 -0
  119. {sql_glider-0.1.12 → sql_glider-0.1.14}/tests/fixtures/original_queries/test_subquery.sql +0 -0
  120. {sql_glider-0.1.12 → sql_glider-0.1.14}/tests/fixtures/original_queries/test_tables.sql +0 -0
  121. {sql_glider-0.1.12 → sql_glider-0.1.14}/tests/fixtures/original_queries/test_view.sql +0 -0
  122. {sql_glider-0.1.12 → sql_glider-0.1.14}/tests/fixtures/original_queries/test_view_window_cte.sql +0 -0
  123. {sql_glider-0.1.12 → sql_glider-0.1.14}/tests/fixtures/sample_manifest.csv +0 -0
  124. {sql_glider-0.1.12 → sql_glider-0.1.14}/tests/sqlglider/__init__.py +0 -0
  125. {sql_glider-0.1.12 → sql_glider-0.1.14}/tests/sqlglider/catalog/__init__.py +0 -0
  126. {sql_glider-0.1.12 → sql_glider-0.1.14}/tests/sqlglider/catalog/test_base.py +0 -0
  127. {sql_glider-0.1.12 → sql_glider-0.1.14}/tests/sqlglider/catalog/test_databricks.py +0 -0
  128. {sql_glider-0.1.12 → sql_glider-0.1.14}/tests/sqlglider/catalog/test_registry.py +0 -0
  129. {sql_glider-0.1.12 → sql_glider-0.1.14}/tests/sqlglider/dissection/__init__.py +0 -0
  130. {sql_glider-0.1.12 → sql_glider-0.1.14}/tests/sqlglider/dissection/test_analyzer.py +0 -0
  131. {sql_glider-0.1.12 → sql_glider-0.1.14}/tests/sqlglider/dissection/test_formatters.py +0 -0
  132. {sql_glider-0.1.12 → sql_glider-0.1.14}/tests/sqlglider/dissection/test_models.py +0 -0
  133. {sql_glider-0.1.12 → sql_glider-0.1.14}/tests/sqlglider/graph/__init__.py +0 -0
  134. {sql_glider-0.1.12 → sql_glider-0.1.14}/tests/sqlglider/graph/test_formatters.py +0 -0
  135. {sql_glider-0.1.12 → sql_glider-0.1.14}/tests/sqlglider/graph/test_merge.py +0 -0
  136. {sql_glider-0.1.12 → sql_glider-0.1.14}/tests/sqlglider/graph/test_models.py +0 -0
  137. {sql_glider-0.1.12 → sql_glider-0.1.14}/tests/sqlglider/graph/test_query.py +0 -0
  138. {sql_glider-0.1.12 → sql_glider-0.1.14}/tests/sqlglider/graph/test_serialization.py +0 -0
  139. {sql_glider-0.1.12 → sql_glider-0.1.14}/tests/sqlglider/lineage/__init__.py +0 -0
  140. {sql_glider-0.1.12 → sql_glider-0.1.14}/tests/sqlglider/lineage/test_formatters.py +0 -0
  141. {sql_glider-0.1.12 → sql_glider-0.1.14}/tests/sqlglider/templating/__init__.py +0 -0
  142. {sql_glider-0.1.12 → sql_glider-0.1.14}/tests/sqlglider/templating/test_base.py +0 -0
  143. {sql_glider-0.1.12 → sql_glider-0.1.14}/tests/sqlglider/templating/test_jinja.py +0 -0
  144. {sql_glider-0.1.12 → sql_glider-0.1.14}/tests/sqlglider/templating/test_registry.py +0 -0
  145. {sql_glider-0.1.12 → sql_glider-0.1.14}/tests/sqlglider/templating/test_variables.py +0 -0
  146. {sql_glider-0.1.12 → sql_glider-0.1.14}/tests/sqlglider/utils/__init__.py +0 -0
  147. {sql_glider-0.1.12 → sql_glider-0.1.14}/tests/sqlglider/utils/test_config.py +0 -0
  148. {sql_glider-0.1.12 → sql_glider-0.1.14}/tests/sqlglider/utils/test_file_utils.py +0 -0
  149. {sql_glider-0.1.12 → sql_glider-0.1.14}/tests/sqlglider/utils/test_schema.py +0 -0
  150. {sql_glider-0.1.12 → sql_glider-0.1.14}/uv.lock +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: sql-glider
3
- Version: 0.1.12
3
+ Version: 0.1.14
4
4
  Summary: SQL Utility Toolkit for better understanding, use, and governance of your queries in a native environment.
5
5
  Project-URL: Homepage, https://github.com/rycowhi/sql-glider/
6
6
  Project-URL: Repository, https://github.com/rycowhi/sql-glider/
@@ -0,0 +1,63 @@
1
+ # Schema Pruning Optimization for `--resolve-schema`
2
+
3
+ **Status:** Completed
4
+
5
+ ## Overview
6
+
7
+ Optimize `--resolve-schema` graph build performance by pruning the schema dict to only the tables referenced in each query before passing it to `sqlglot.lineage()`. Also move schema dumping (`--dump-schema`) so that it occurs before graph building (between Pass 1 and Pass 2).
8
+
9
+ ## Problem
10
+
11
+ `sqlglot.lineage()` performance degrades dramatically with large schema dicts. Benchmarks showed:
12
+
13
+ | Schema Size | Time (6 columns) |
14
+ |---|---|
15
+ | No schema | 2.6ms |
16
+ | 4 tables | 8.3ms |
17
+ | 204 tables | **1,041ms** |
18
+
19
+ The full accumulated schema from all files was passed to every `lineage()` call, even though each query only references a handful of tables. For projects with hundreds of files/tables, this made `--resolve-schema` unusable.
20
+
21
+ ## Design Decisions
22
+
23
+ - **Prune in the analyzer, not the builder** — keeps the optimization localized and benefits all callers of `LineageAnalyzer`, not just graph builds
24
+ - **Prune once per query, not per column** — `_get_query_tables()` is called once before the column loop, and the pruned schema is reused for all columns in that query
25
+ - **Case-insensitive matching** — table names are lowered for comparison to handle mixed-case schemas
26
+ - **Moved schema dump before graph build** — `_resolved_schema` is fully populated after Pass 1, so dumping between passes is safe and gives users earlier feedback. Required exposing `extract_schemas()` as a public method on `GraphBuilder`
27
+
28
+ ## Implementation
29
+
30
+ - [x] Add schema pruning in `_analyze_column_lineage_internal()` using existing `_get_query_tables()` method
31
+ - [x] Expose `extract_schemas()` as public method on `GraphBuilder`
32
+ - [x] Skip Pass 1 in `add_files()`/`add_manifest()` if `_resolved_schema` is already populated
33
+ - [x] Restructure CLI `graph_build` to call `extract_schemas()` then dump schema before `add_files()`/`add_manifest()`
34
+ - [x] Tests for `extract_schemas()` method and schema pruning correctness
35
+
36
+ ## Files Modified
37
+
38
+ - `src/sqlglider/lineage/analyzer.py` — schema pruning before `lineage()` calls
39
+ - `src/sqlglider/graph/builder.py` — public `extract_schemas()`, skip Pass 1 when already resolved
40
+ - `src/sqlglider/cli.py` — restructured `graph_build` to dump schema before graph building
41
+ - `tests/sqlglider/lineage/test_analyzer.py` — `TestSchemaPruning` (2 tests)
42
+ - `tests/sqlglider/graph/test_builder.py` — `TestExtractSchemas` (3 tests)
43
+
44
+ ## Benchmark Results (After)
45
+
46
+ | Schema Size | Time (6 columns) |
47
+ |---|---|
48
+ | No schema | 2.6ms |
49
+ | 4 tables | 8.3ms |
50
+ | 204 tables | **8.3ms** |
51
+
52
+ Full `analyze_queries` benchmark on complex fixture (analytics_pipeline.sql):
53
+
54
+ | Scenario | Before | After |
55
+ |---|---|---|
56
+ | No schema | ~392ms | ~392ms |
57
+ | Small schema (4 tables) | ~373ms | ~373ms |
58
+ | Big schema (204 tables) | ~1,400ms+ | **~387ms** |
59
+
60
+ ## Lessons Learned
61
+
62
+ - The initial assumption was that double-parsing (Pass 1 + Pass 2 both calling `sqlglot.parse()`) was the bottleneck. Benchmarking showed `parse()` costs ~10ms, while `lineage()` with a 200-table schema costs ~1,000ms. Profiling before optimizing avoided wasted effort on AST caching.
63
+ - `sqlglot.lineage()` appears to have O(n) or worse scaling with schema size, even for tables not referenced in the query. Pruning is essential for multi-file workloads.
@@ -0,0 +1,118 @@
1
+ **Status:** Completed
2
+
3
+ # Plan: `tables scrape` Command
4
+
5
+ ## Overview
6
+
7
+ Add a `tables scrape` subcommand that performs schema inference (the same logic `graph build --resolve-schema` uses) but outputs the inferred schema directly instead of building a lineage graph. This makes schema inference a standalone, reusable operation.
8
+
9
+ ## Key Changes
10
+
11
+ ### 1. Refactor schema extraction out of GraphBuilder
12
+
13
+ **File:** `src/sqlglider/graph/builder.py`
14
+
15
+ Extract `_extract_schemas()` and `_fill_schema_from_catalog()` into a standalone module so both `graph build` and `tables scrape` can use them without instantiating a full `GraphBuilder`.
16
+
17
+ **New file:** `src/sqlglider/schema/extractor.py`
18
+ - `extract_schemas_from_files(file_paths, dialect, sql_preprocessor, schema, strict_schema, console) -> SchemaDict` — core extraction loop with Rich progress bar
19
+ - `fill_schema_from_catalog(schema, file_paths, dialect, sql_preprocessor, catalog_type, catalog_config, console) -> SchemaDict` — catalog fill logic
20
+ - `extract_and_resolve_schema(file_paths, dialect, sql_preprocessor, strict_schema, catalog_type, catalog_config, console) -> SchemaDict` — high-level orchestrator (extract + optional catalog fill)
21
+
22
+ **Update GraphBuilder** to delegate to these new functions instead of implementing them inline. `GraphBuilder._extract_schemas` and `_fill_schema_from_catalog` become thin wrappers or are removed, with `extract_schemas()` calling the shared code.
23
+
24
+ ### 2. Add `tables scrape` CLI command
25
+
26
+ **File:** `src/sqlglider/cli.py`
27
+
28
+ Add `@tables_app.command("scrape")` with these parameters (mirroring `graph build`):
29
+
30
+ | Parameter | Source |
31
+ |-----------|--------|
32
+ | `paths` | Same as `graph build` — file(s) or directory(ies) |
33
+ | `--recursive / -r` | Same recursive directory traversal |
34
+ | `--glob / -g` | Same glob pattern (default `*.sql`) |
35
+ | `--manifest / -m` | Same manifest CSV support |
36
+ | `--dialect / -d` | SQL dialect |
37
+ | `--templater / -t` | Templater name |
38
+ | `--var / -v` | Template variables |
39
+ | `--vars-file` | Variables file |
40
+ | `--strict-schema` | Strict schema mode |
41
+ | `--catalog-type / -c` | Catalog provider for remote DDL |
42
+ | `--output-format / -f` | `text` (default), `json`, or `csv` |
43
+ | `--output-file / -o` | Output file path (stdout if omitted) |
44
+
45
+ **Flow:**
46
+ 1. Resolve config defaults (same pattern as `graph build`)
47
+ 2. Set up templating preprocessor (same shared code)
48
+ 3. Collect files from paths/manifest (same logic as `graph build`)
49
+ 4. Call `extract_and_resolve_schema(...)` from the new shared module
50
+ 5. Format output using existing `format_schema()` from `src/sqlglider/graph/formatters.py`
51
+ 6. Write to file or stdout using `OutputWriter`
52
+
53
+ ### 3. Schema formatters (left in place)
54
+
55
+ The existing formatters in `src/sqlglider/graph/formatters.py` are already generic (they format `SchemaDict`). **Decision: leave them in place** to minimize churn — they work for both `graph build --dump-schema` and `tables scrape`.
56
+
57
+ ### 4. Refactor shared file-collection logic
58
+
59
+ The file collection code (paths + recursive glob + manifest) is duplicated between `graph build` and the new `tables scrape`. Extract a helper function:
60
+
61
+ ```python
62
+ def _collect_sql_files(
63
+ paths: Optional[List[Path]],
64
+ manifest: Optional[Path],
65
+ recursive: bool,
66
+ glob_pattern: str,
67
+ ) -> tuple[list[Path], list[Path]]:
68
+ """Returns (manifest_files, path_files)."""
69
+ ```
70
+
71
+ Place this in `cli.py` as a private helper used by both commands.
72
+
73
+ ## Implementation Steps
74
+
75
+ - [x] Create `src/sqlglider/schema/__init__.py`
76
+ - [x] Create `src/sqlglider/schema/extractor.py` with shared schema extraction logic
77
+ - [x] Update `src/sqlglider/graph/builder.py` to delegate to shared extractor
78
+ - [x] Add `_collect_sql_files` helper to `src/sqlglider/cli.py`
79
+ - [x] Refactor `graph build` to use `_collect_sql_files`
80
+ - [x] Add `tables scrape` command to `src/sqlglider/cli.py`
81
+ - [x] Create `tests/sqlglider/schema/__init__.py`
82
+ - [x] Create `tests/sqlglider/schema/test_extractor.py` (10 tests)
83
+ - [x] Add `TestTablesScrapeCommand` to `tests/sqlglider/test_cli.py` (11 tests)
84
+ - [x] All 672 tests pass, 81.5% coverage, ruff clean
85
+
86
+ ## Files Created/Modified
87
+
88
+ | File | Action |
89
+ |------|--------|
90
+ | `src/sqlglider/schema/__init__.py` | Created — empty |
91
+ | `src/sqlglider/schema/extractor.py` | Created — shared schema extraction logic |
92
+ | `src/sqlglider/graph/builder.py` | Modified — delegate to shared extractor |
93
+ | `src/sqlglider/cli.py` | Modified — add `tables scrape` command + `_collect_sql_files` helper |
94
+ | `tests/sqlglider/schema/__init__.py` | Created — empty |
95
+ | `tests/sqlglider/schema/test_extractor.py` | Created — tests for shared extractor |
96
+ | `tests/sqlglider/test_cli.py` | Modified — add tests for `tables scrape` command |
97
+
98
+ ## Testing Strategy
99
+
100
+ 1. **Unit tests for `schema/extractor.py`**: Test `extract_schemas_from_files` with CREATE VIEW/TABLE AS SELECT and DQL qualified refs
101
+ 2. **CLI tests for `tables scrape`**: Use `CliRunner` to test text/json/csv output, recursive glob, templating, error cases
102
+ 3. **Regression**: Full test suite passes (672 tests), coverage at 81.5%
103
+
104
+ ## Verification
105
+
106
+ ```bash
107
+ # Basic usage
108
+ uv run sqlglider tables scrape ./queries/ -r
109
+
110
+ # With output format
111
+ uv run sqlglider tables scrape ./queries/ -r -f json -o schema.json
112
+
113
+ # With catalog
114
+ uv run sqlglider tables scrape ./queries/ -r -c databricks -f csv
115
+
116
+ # Ensure graph build still works
117
+ uv run sqlglider graph build ./queries/ -r --resolve-schema --dump-schema schema.txt -o graph.json
118
+ ```
@@ -28,7 +28,7 @@ version_tuple: VERSION_TUPLE
28
28
  commit_id: COMMIT_ID
29
29
  __commit_id__: COMMIT_ID
30
30
 
31
- __version__ = version = '0.1.12'
32
- __version_tuple__ = version_tuple = (0, 1, 12)
31
+ __version__ = version = '0.1.14'
32
+ __version_tuple__ = version_tuple = (0, 1, 14)
33
33
 
34
34
  __commit_id__ = commit_id = None
@@ -788,6 +788,274 @@ def tables_pull(
788
788
  raise typer.Exit(1)
789
789
 
790
790
 
def _collect_sql_files(
    paths: Optional[List[Path]],
    manifest: Optional[Path],
    recursive: bool,
    glob_pattern: str,
) -> tuple[list[Path], list[Path]]:
    """Gather SQL files named on the command line and/or listed in a manifest.

    Args:
        paths: Files or directories given as positional arguments. A
            directory is globbed with ``glob_pattern``; a file is taken as-is.
        manifest: Optional manifest CSV. Relative entries are resolved
            against the directory that contains the manifest.
        recursive: When True, directories are searched with ``**/<pattern>``.
        glob_pattern: Glob pattern applied to each directory.

    Returns:
        A ``(manifest_files, path_files)`` tuple, in that order.

    Raises:
        typer.Exit: With code 1 when a given path is neither an existing
            directory nor an existing file.
    """
    from_paths: list[Path] = []
    for candidate in paths or []:
        if candidate.is_dir():
            search = f"**/{glob_pattern}" if recursive else glob_pattern
            # Sort for a deterministic order; drop directories that happen
            # to match the pattern.
            matches = sorted(candidate.glob(search))
            from_paths.extend(match for match in matches if match.is_file())
            continue
        if not candidate.is_file():
            err_console.print(f"[red]Error:[/red] Path not found: {candidate}")
            raise typer.Exit(1)
        from_paths.append(candidate)

    from_manifest: list[Path] = []
    if manifest:
        # Imported lazily so the graph models are only loaded when needed.
        from sqlglider.graph.models import Manifest

        parsed = Manifest.from_csv(manifest)
        root = manifest.parent
        for entry in parsed.entries:
            listed = Path(entry.file_path)
            if not listed.is_absolute():
                listed = (root / entry.file_path).resolve()
            from_manifest.append(listed)

    return from_manifest, from_paths
833
+
834
+
@tables_app.command("scrape")
def tables_scrape(
    paths: List[Path] = typer.Argument(
        None,
        help="SQL file(s) or directory path to process",
    ),
    recursive: bool = typer.Option(
        False,
        "--recursive",
        "-r",
        help="Recursively search directories for SQL files",
    ),
    glob_pattern: str = typer.Option(
        "*.sql",
        "--glob",
        "-g",
        help="Glob pattern for matching SQL files in directories",
    ),
    manifest: Optional[Path] = typer.Option(
        None,
        "--manifest",
        "-m",
        exists=True,
        help="Path to manifest CSV file with file_path and optional dialect columns",
    ),
    dialect: Optional[str] = typer.Option(
        None,
        "--dialect",
        "-d",
        help="SQL dialect (default: spark)",
    ),
    templater: Optional[str] = typer.Option(
        None,
        "--templater",
        "-t",
        help="Templater for SQL preprocessing (e.g., 'jinja', 'none')",
    ),
    var: Optional[List[str]] = typer.Option(
        None,
        "--var",
        "-v",
        help="Template variable in key=value format (repeatable)",
    ),
    vars_file: Optional[Path] = typer.Option(
        None,
        "--vars-file",
        exists=True,
        help="Path to variables file (JSON or YAML)",
    ),
    strict_schema: bool = typer.Option(
        False,
        "--strict-schema",
        help="Fail if any column's table cannot be identified during schema extraction",
    ),
    catalog_type: Optional[str] = typer.Option(
        None,
        "--catalog-type",
        "-c",
        help="Catalog provider for pulling DDL of tables not found in files "
        "(e.g. 'databricks')",
    ),
    output_format: Optional[str] = typer.Option(
        None,
        "--output-format",
        "-f",
        help="Output format: 'text' (default), 'json', or 'csv'",
    ),
    output_file: Optional[Path] = typer.Option(
        None,
        "--output-file",
        "-o",
        help="Output file path (prints to stdout if not provided)",
    ),
) -> None:
    """
    Scrape schema information from SQL files.

    Infers table and column schemas from DDL statements and DQL column
    references across one or more SQL files. Supports the same file input
    modes as `graph build` (paths, directories, manifests).

    Examples:

        # Scrape schema from a directory
        sqlglider tables scrape ./queries/ -r

        # Output as JSON
        sqlglider tables scrape ./queries/ -r -f json

        # Save to file
        sqlglider tables scrape ./queries/ -r -f csv -o schema.csv

        # With Jinja2 templating
        sqlglider tables scrape ./queries/ -r --templater jinja --var schema=prod

        # With catalog fallback
        sqlglider tables scrape ./queries/ -r -c databricks
    """
    from sqlglider.graph.formatters import format_schema
    from sqlglider.lineage.analyzer import SchemaResolutionError
    from sqlglider.schema.extractor import extract_and_resolve_schema

    # Explicit CLI values win; otherwise fall back to the config file, then
    # to the hard-coded defaults.
    config = load_config()
    dialect = dialect or config.dialect or "spark"
    templater = templater or config.templater
    strict_schema = strict_schema or config.strict_schema or False
    output_format = output_format or config.output_format or "text"

    if output_format not in ("text", "json", "csv"):
        err_console.print(
            f"[red]Error:[/red] Invalid --output-format '{output_format}'. "
            "Use 'text', 'json', or 'csv'."
        )
        raise typer.Exit(1)

    # Only inherit catalog_type from config when not provided via CLI
    if not catalog_type:
        catalog_type = config.catalog_type

    # At least one input source is required.
    if not paths and not manifest:
        err_console.print(
            "[red]Error:[/red] Must provide either file/directory paths or --manifest option."
        )
        raise typer.Exit(1)

    # Create SQL preprocessor if templating is enabled
    sql_preprocessor: Optional[Callable[[str, Path], str]] = None
    if templater:
        config_vars_file = None
        config_vars = None
        if config.templating:
            # The config-level variables file is only consulted when the
            # user did not pass --vars-file explicitly.
            if config.templating.variables_file and not vars_file:
                config_vars_file = Path(config.templating.variables_file)
                if not config_vars_file.exists():
                    err_console.print(
                        f"[yellow]Warning:[/yellow] Variables file from config "
                        f"not found: {config_vars_file}"
                    )
                    config_vars_file = None
            config_vars = config.templating.variables

        variables = load_all_variables(
            cli_vars=var,
            vars_file=vars_file or config_vars_file,
            config_vars=config_vars,
            use_env=True,
        )

        templater_instance = get_templater(templater)

        def _preprocess(sql: str, file_path: Path) -> str:
            return templater_instance.render(
                sql, variables=variables, source_path=file_path
            )

        sql_preprocessor = _preprocess

    try:
        # Build catalog config from config file if available
        catalog_config_dict = None
        if catalog_type and config.catalog:
            provider_config = getattr(config.catalog, catalog_type, None)
            if provider_config:
                catalog_config_dict = provider_config.model_dump(exclude_none=True)

        # Collect files
        manifest_files, path_files = _collect_sql_files(
            paths, manifest, recursive, glob_pattern
        )
        all_files = manifest_files + path_files

        if not all_files:
            err_console.print("[yellow]Warning:[/yellow] No SQL files found.")
            raise typer.Exit(0)

        # Extract schema
        schema = extract_and_resolve_schema(
            all_files,
            dialect=dialect,
            sql_preprocessor=sql_preprocessor,
            strict_schema=strict_schema,
            catalog_type=catalog_type,
            catalog_config=catalog_config_dict,
            console=err_console,
        )

        if not schema:
            err_console.print("[yellow]No schema information found.[/yellow]")
            raise typer.Exit(0)

        # Format and output
        formatted = format_schema(schema, output_format)
        if output_file:
            OutputWriter.write(formatted, output_file)
            err_console.print(
                f"[green]Schema written to {output_file} "
                f"({len(schema)} table(s))[/green]"
            )
        else:
            console.print(formatted, end="")

    except typer.Exit:
        # typer.Exit is click's Exit, a RuntimeError subclass. Without this
        # clause the generic handler below would swallow the deliberate
        # exits above (including the exit-code-0 "nothing found" cases and
        # the exit raised by _collect_sql_files) and report them as an
        # "Unexpected error" with exit code 1.
        raise

    except (
        SchemaResolutionError,
        FileNotFoundError,
        TemplaterError,
        ValueError,
    ) as e:
        # Expected failure modes: report the message as-is.
        err_console.print(f"[red]Error:[/red] {e}")
        raise typer.Exit(1)

    except Exception as e:
        # Last-resort boundary so the CLI never shows a raw traceback.
        err_console.print(f"[red]Error:[/red] Unexpected error: {e}")
        raise typer.Exit(1)
1057
+
1058
+
791
1059
  @app.command()
792
1060
  def template(
793
1061
  sql_file: Annotated[
@@ -1166,36 +1434,35 @@ def graph_build(
1166
1434
  strict_schema=strict_schema,
1167
1435
  )
1168
1436
 
1437
+ # Collect file paths for schema extraction
1438
+ manifest_files, path_files = _collect_sql_files(
1439
+ paths, manifest, recursive, glob_pattern
1440
+ )
1441
+
1442
+ # Extract schema upfront if requested, then dump before graph building
1443
+ all_files = manifest_files + path_files
1444
+ if resolve_schema and all_files:
1445
+ builder.extract_schemas(all_files, dialect=dialect)
1446
+
1447
+ if dump_schema:
1448
+ from sqlglider.graph.formatters import format_schema
1449
+
1450
+ schema_content = format_schema(
1451
+ builder.resolved_schema, dump_schema_format
1452
+ )
1453
+ dump_schema.write_text(schema_content, encoding="utf-8")
1454
+ console.print(
1455
+ f"[green]Schema dumped to {dump_schema} "
1456
+ f"({len(builder.resolved_schema)} table(s))[/green]"
1457
+ )
1458
+
1169
1459
  # Process manifest if provided
1170
1460
  if manifest:
1171
1461
  builder.add_manifest(manifest, dialect=dialect)
1172
1462
 
1173
- # Process paths - collect all files first for progress tracking
1174
- if paths:
1175
- all_files: list[Path] = []
1176
- for path in paths:
1177
- if path.is_dir():
1178
- pattern = f"**/{glob_pattern}" if recursive else glob_pattern
1179
- all_files.extend(
1180
- f for f in sorted(path.glob(pattern)) if f.is_file()
1181
- )
1182
- elif path.is_file():
1183
- all_files.append(path)
1184
- else:
1185
- err_console.print(f"[red]Error:[/red] Path not found: {path}")
1186
- raise typer.Exit(1)
1187
- builder.add_files(all_files, dialect=dialect)
1188
-
1189
- # Dump resolved schema if requested
1190
- if dump_schema:
1191
- from sqlglider.graph.formatters import format_schema
1192
-
1193
- schema_content = format_schema(builder.resolved_schema, dump_schema_format)
1194
- dump_schema.write_text(schema_content, encoding="utf-8")
1195
- console.print(
1196
- f"[green]Schema dumped to {dump_schema} "
1197
- f"({len(builder.resolved_schema)} table(s))[/green]"
1198
- )
1463
+ # Process path-based files
1464
+ if path_files:
1465
+ builder.add_files(path_files, dialect=dialect)
1199
1466
 
1200
1467
  # Build and save graph
1201
1468
  graph = builder.build()