sql-glider 0.1.12__tar.gz → 0.1.13__tar.gz

This diff shows the differences between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Files changed (144)
  1. {sql_glider-0.1.12 → sql_glider-0.1.13}/PKG-INFO +1 -1
  2. {sql_glider-0.1.12 → sql_glider-0.1.13}/src/sqlglider/_version.py +2 -2
  3. {sql_glider-0.1.12 → sql_glider-0.1.13}/src/sqlglider/cli.py +39 -18
  4. {sql_glider-0.1.12 → sql_glider-0.1.13}/src/sqlglider/graph/builder.py +34 -24
  5. {sql_glider-0.1.12 → sql_glider-0.1.13}/src/sqlglider/lineage/analyzer.py +15 -2
  6. {sql_glider-0.1.12 → sql_glider-0.1.13}/tests/sqlglider/graph/test_builder.py +42 -0
  7. {sql_glider-0.1.12 → sql_glider-0.1.13}/tests/sqlglider/lineage/test_analyzer.py +48 -0
  8. {sql_glider-0.1.12 → sql_glider-0.1.13}/.github/workflows/ci.yml +0 -0
  9. {sql_glider-0.1.12 → sql_glider-0.1.13}/.github/workflows/publish.yml +0 -0
  10. {sql_glider-0.1.12 → sql_glider-0.1.13}/.gitignore +0 -0
  11. {sql_glider-0.1.12 → sql_glider-0.1.13}/.python-version +0 -0
  12. {sql_glider-0.1.12 → sql_glider-0.1.13}/ARCHITECTURE.md +0 -0
  13. {sql_glider-0.1.12 → sql_glider-0.1.13}/CLAUDE.md +0 -0
  14. {sql_glider-0.1.12 → sql_glider-0.1.13}/LICENSE +0 -0
  15. {sql_glider-0.1.12 → sql_glider-0.1.13}/README.md +0 -0
  16. {sql_glider-0.1.12 → sql_glider-0.1.13}/plans/2025-12-05-column-level-lineage.md +0 -0
  17. {sql_glider-0.1.12 → sql_glider-0.1.13}/plans/2025-12-05-reverse-lineage.md +0 -0
  18. {sql_glider-0.1.12 → sql_glider-0.1.13}/plans/2025-12-06-config-file-support.md +0 -0
  19. {sql_glider-0.1.12 → sql_glider-0.1.13}/plans/2025-12-06-graph-lineage.md +0 -0
  20. {sql_glider-0.1.12 → sql_glider-0.1.13}/plans/2025-12-06-unify-single-multi-query.md +0 -0
  21. {sql_glider-0.1.12 → sql_glider-0.1.13}/plans/2025-12-07-sample-data-model.md +0 -0
  22. {sql_glider-0.1.12 → sql_glider-0.1.13}/plans/2025-12-07-sql-templating.md +0 -0
  23. {sql_glider-0.1.12 → sql_glider-0.1.13}/plans/2025-12-08-tables-command.md +0 -0
  24. {sql_glider-0.1.12 → sql_glider-0.1.13}/plans/2025-12-09-graph-query-paths.md +0 -0
  25. {sql_glider-0.1.12 → sql_glider-0.1.13}/plans/2025-12-13-dissect-command.md +0 -0
  26. {sql_glider-0.1.12 → sql_glider-0.1.13}/plans/2025-12-14-tables-pull-command.md +0 -0
  27. {sql_glider-0.1.12 → sql_glider-0.1.13}/plans/2026-01-25-fix-union-lineage-chain.md +0 -0
  28. {sql_glider-0.1.12 → sql_glider-0.1.13}/plans/2026-01-26-file-scoped-schema-context.md +0 -0
  29. {sql_glider-0.1.12 → sql_glider-0.1.13}/plans/2026-01-28-sparksql-table-extraction.md +0 -0
  30. {sql_glider-0.1.12 → sql_glider-0.1.13}/plans/2026-01-29-no-star-flag.md +0 -0
  31. {sql_glider-0.1.12 → sql_glider-0.1.13}/plans/2026-01-29-resolve-schema.md +0 -0
  32. {sql_glider-0.1.12 → sql_glider-0.1.13}/pyproject.toml +0 -0
  33. {sql_glider-0.1.12 → sql_glider-0.1.13}/sample_data_model/README.md +0 -0
  34. {sql_glider-0.1.12 → sql_glider-0.1.13}/sample_data_model/business/expire_dim_customer.sql +0 -0
  35. {sql_glider-0.1.12 → sql_glider-0.1.13}/sample_data_model/business/load_fact_orders.sql +0 -0
  36. {sql_glider-0.1.12 → sql_glider-0.1.13}/sample_data_model/business/load_fact_payments.sql +0 -0
  37. {sql_glider-0.1.12 → sql_glider-0.1.13}/sample_data_model/business/merge_dim_customer.sql +0 -0
  38. {sql_glider-0.1.12 → sql_glider-0.1.13}/sample_data_model/business/merge_dim_product.sql +0 -0
  39. {sql_glider-0.1.12 → sql_glider-0.1.13}/sample_data_model/business/update_dim_customer_metrics.sql +0 -0
  40. {sql_glider-0.1.12 → sql_glider-0.1.13}/sample_data_model/complex/conditional_merge.sql +0 -0
  41. {sql_glider-0.1.12 → sql_glider-0.1.13}/sample_data_model/complex/cte_insert.sql +0 -0
  42. {sql_glider-0.1.12 → sql_glider-0.1.13}/sample_data_model/complex/multi_table_transform.sql +0 -0
  43. {sql_glider-0.1.12 → sql_glider-0.1.13}/sample_data_model/ddl/dim_customer.sql +0 -0
  44. {sql_glider-0.1.12 → sql_glider-0.1.13}/sample_data_model/ddl/dim_product.sql +0 -0
  45. {sql_glider-0.1.12 → sql_glider-0.1.13}/sample_data_model/ddl/fact_orders.sql +0 -0
  46. {sql_glider-0.1.12 → sql_glider-0.1.13}/sample_data_model/ddl/fact_payments.sql +0 -0
  47. {sql_glider-0.1.12 → sql_glider-0.1.13}/sample_data_model/ddl/raw_addresses.sql +0 -0
  48. {sql_glider-0.1.12 → sql_glider-0.1.13}/sample_data_model/ddl/raw_customers.sql +0 -0
  49. {sql_glider-0.1.12 → sql_glider-0.1.13}/sample_data_model/ddl/raw_order_items.sql +0 -0
  50. {sql_glider-0.1.12 → sql_glider-0.1.13}/sample_data_model/ddl/raw_orders.sql +0 -0
  51. {sql_glider-0.1.12 → sql_glider-0.1.13}/sample_data_model/ddl/raw_payments.sql +0 -0
  52. {sql_glider-0.1.12 → sql_glider-0.1.13}/sample_data_model/ddl/raw_products.sql +0 -0
  53. {sql_glider-0.1.12 → sql_glider-0.1.13}/sample_data_model/ddl/stg_customers.sql +0 -0
  54. {sql_glider-0.1.12 → sql_glider-0.1.13}/sample_data_model/ddl/stg_orders.sql +0 -0
  55. {sql_glider-0.1.12 → sql_glider-0.1.13}/sample_data_model/ddl/stg_payments.sql +0 -0
  56. {sql_glider-0.1.12 → sql_glider-0.1.13}/sample_data_model/ddl/stg_products.sql +0 -0
  57. {sql_glider-0.1.12 → sql_glider-0.1.13}/sample_data_model/incremental/incr_fact_orders.sql +0 -0
  58. {sql_glider-0.1.12 → sql_glider-0.1.13}/sample_data_model/incremental/incr_fact_payments.sql +0 -0
  59. {sql_glider-0.1.12 → sql_glider-0.1.13}/sample_data_model/incremental/incr_pres_sales_summary.sql +0 -0
  60. {sql_glider-0.1.12 → sql_glider-0.1.13}/sample_data_model/maintenance/delete_expired_customers.sql +0 -0
  61. {sql_glider-0.1.12 → sql_glider-0.1.13}/sample_data_model/maintenance/update_product_status.sql +0 -0
  62. {sql_glider-0.1.12 → sql_glider-0.1.13}/sample_data_model/presentation/load_pres_customer_360.sql +0 -0
  63. {sql_glider-0.1.12 → sql_glider-0.1.13}/sample_data_model/presentation/load_pres_customer_cohort.sql +0 -0
  64. {sql_glider-0.1.12 → sql_glider-0.1.13}/sample_data_model/presentation/load_pres_product_performance.sql +0 -0
  65. {sql_glider-0.1.12 → sql_glider-0.1.13}/sample_data_model/presentation/load_pres_sales_summary.sql +0 -0
  66. {sql_glider-0.1.12 → sql_glider-0.1.13}/sample_data_model/staging/load_stg_customers.sql +0 -0
  67. {sql_glider-0.1.12 → sql_glider-0.1.13}/sample_data_model/staging/load_stg_orders.sql +0 -0
  68. {sql_glider-0.1.12 → sql_glider-0.1.13}/sample_data_model/staging/load_stg_payments.sql +0 -0
  69. {sql_glider-0.1.12 → sql_glider-0.1.13}/sample_data_model/staging/load_stg_products.sql +0 -0
  70. {sql_glider-0.1.12 → sql_glider-0.1.13}/sqlglider.toml.example +0 -0
  71. {sql_glider-0.1.12 → sql_glider-0.1.13}/src/sqlglider/__init__.py +0 -0
  72. {sql_glider-0.1.12 → sql_glider-0.1.13}/src/sqlglider/catalog/__init__.py +0 -0
  73. {sql_glider-0.1.12 → sql_glider-0.1.13}/src/sqlglider/catalog/base.py +0 -0
  74. {sql_glider-0.1.12 → sql_glider-0.1.13}/src/sqlglider/catalog/databricks.py +0 -0
  75. {sql_glider-0.1.12 → sql_glider-0.1.13}/src/sqlglider/catalog/registry.py +0 -0
  76. {sql_glider-0.1.12 → sql_glider-0.1.13}/src/sqlglider/dissection/__init__.py +0 -0
  77. {sql_glider-0.1.12 → sql_glider-0.1.13}/src/sqlglider/dissection/analyzer.py +0 -0
  78. {sql_glider-0.1.12 → sql_glider-0.1.13}/src/sqlglider/dissection/formatters.py +0 -0
  79. {sql_glider-0.1.12 → sql_glider-0.1.13}/src/sqlglider/dissection/models.py +0 -0
  80. {sql_glider-0.1.12 → sql_glider-0.1.13}/src/sqlglider/global_models.py +0 -0
  81. {sql_glider-0.1.12 → sql_glider-0.1.13}/src/sqlglider/graph/__init__.py +0 -0
  82. {sql_glider-0.1.12 → sql_glider-0.1.13}/src/sqlglider/graph/formatters.py +0 -0
  83. {sql_glider-0.1.12 → sql_glider-0.1.13}/src/sqlglider/graph/merge.py +0 -0
  84. {sql_glider-0.1.12 → sql_glider-0.1.13}/src/sqlglider/graph/models.py +0 -0
  85. {sql_glider-0.1.12 → sql_glider-0.1.13}/src/sqlglider/graph/query.py +0 -0
  86. {sql_glider-0.1.12 → sql_glider-0.1.13}/src/sqlglider/graph/serialization.py +0 -0
  87. {sql_glider-0.1.12 → sql_glider-0.1.13}/src/sqlglider/lineage/__init__.py +0 -0
  88. {sql_glider-0.1.12 → sql_glider-0.1.13}/src/sqlglider/lineage/formatters.py +0 -0
  89. {sql_glider-0.1.12 → sql_glider-0.1.13}/src/sqlglider/templating/__init__.py +0 -0
  90. {sql_glider-0.1.12 → sql_glider-0.1.13}/src/sqlglider/templating/base.py +0 -0
  91. {sql_glider-0.1.12 → sql_glider-0.1.13}/src/sqlglider/templating/jinja.py +0 -0
  92. {sql_glider-0.1.12 → sql_glider-0.1.13}/src/sqlglider/templating/registry.py +0 -0
  93. {sql_glider-0.1.12 → sql_glider-0.1.13}/src/sqlglider/templating/variables.py +0 -0
  94. {sql_glider-0.1.12 → sql_glider-0.1.13}/src/sqlglider/utils/__init__.py +0 -0
  95. {sql_glider-0.1.12 → sql_glider-0.1.13}/src/sqlglider/utils/config.py +0 -0
  96. {sql_glider-0.1.12 → sql_glider-0.1.13}/src/sqlglider/utils/file_utils.py +0 -0
  97. {sql_glider-0.1.12 → sql_glider-0.1.13}/src/sqlglider/utils/schema.py +0 -0
  98. {sql_glider-0.1.12 → sql_glider-0.1.13}/tests/__init__.py +0 -0
  99. {sql_glider-0.1.12 → sql_glider-0.1.13}/tests/fixtures/multi_file_queries/analytics_pipeline.sql +0 -0
  100. {sql_glider-0.1.12 → sql_glider-0.1.13}/tests/fixtures/multi_file_queries/analytics_pipeline_union_merge.sql +0 -0
  101. {sql_glider-0.1.12 → sql_glider-0.1.13}/tests/fixtures/multi_file_queries/customers.sql +0 -0
  102. {sql_glider-0.1.12 → sql_glider-0.1.13}/tests/fixtures/multi_file_queries/orders.sql +0 -0
  103. {sql_glider-0.1.12 → sql_glider-0.1.13}/tests/fixtures/multi_file_queries/reports.sql +0 -0
  104. {sql_glider-0.1.12 → sql_glider-0.1.13}/tests/fixtures/multi_file_queries/view_based_merge.sql +0 -0
  105. {sql_glider-0.1.12 → sql_glider-0.1.13}/tests/fixtures/original_queries/test_cte.sql +0 -0
  106. {sql_glider-0.1.12 → sql_glider-0.1.13}/tests/fixtures/original_queries/test_cte_query.sql +0 -0
  107. {sql_glider-0.1.12 → sql_glider-0.1.13}/tests/fixtures/original_queries/test_cte_view_star.sql +0 -0
  108. {sql_glider-0.1.12 → sql_glider-0.1.13}/tests/fixtures/original_queries/test_generated_column_query.sql +0 -0
  109. {sql_glider-0.1.12 → sql_glider-0.1.13}/tests/fixtures/original_queries/test_multi.sql +0 -0
  110. {sql_glider-0.1.12 → sql_glider-0.1.13}/tests/fixtures/original_queries/test_multi_query.sql +0 -0
  111. {sql_glider-0.1.12 → sql_glider-0.1.13}/tests/fixtures/original_queries/test_single_query.sql +0 -0
  112. {sql_glider-0.1.12 → sql_glider-0.1.13}/tests/fixtures/original_queries/test_subquery.sql +0 -0
  113. {sql_glider-0.1.12 → sql_glider-0.1.13}/tests/fixtures/original_queries/test_tables.sql +0 -0
  114. {sql_glider-0.1.12 → sql_glider-0.1.13}/tests/fixtures/original_queries/test_view.sql +0 -0
  115. {sql_glider-0.1.12 → sql_glider-0.1.13}/tests/fixtures/original_queries/test_view_window_cte.sql +0 -0
  116. {sql_glider-0.1.12 → sql_glider-0.1.13}/tests/fixtures/sample_manifest.csv +0 -0
  117. {sql_glider-0.1.12 → sql_glider-0.1.13}/tests/sqlglider/__init__.py +0 -0
  118. {sql_glider-0.1.12 → sql_glider-0.1.13}/tests/sqlglider/catalog/__init__.py +0 -0
  119. {sql_glider-0.1.12 → sql_glider-0.1.13}/tests/sqlglider/catalog/test_base.py +0 -0
  120. {sql_glider-0.1.12 → sql_glider-0.1.13}/tests/sqlglider/catalog/test_databricks.py +0 -0
  121. {sql_glider-0.1.12 → sql_glider-0.1.13}/tests/sqlglider/catalog/test_registry.py +0 -0
  122. {sql_glider-0.1.12 → sql_glider-0.1.13}/tests/sqlglider/dissection/__init__.py +0 -0
  123. {sql_glider-0.1.12 → sql_glider-0.1.13}/tests/sqlglider/dissection/test_analyzer.py +0 -0
  124. {sql_glider-0.1.12 → sql_glider-0.1.13}/tests/sqlglider/dissection/test_formatters.py +0 -0
  125. {sql_glider-0.1.12 → sql_glider-0.1.13}/tests/sqlglider/dissection/test_models.py +0 -0
  126. {sql_glider-0.1.12 → sql_glider-0.1.13}/tests/sqlglider/graph/__init__.py +0 -0
  127. {sql_glider-0.1.12 → sql_glider-0.1.13}/tests/sqlglider/graph/test_formatters.py +0 -0
  128. {sql_glider-0.1.12 → sql_glider-0.1.13}/tests/sqlglider/graph/test_merge.py +0 -0
  129. {sql_glider-0.1.12 → sql_glider-0.1.13}/tests/sqlglider/graph/test_models.py +0 -0
  130. {sql_glider-0.1.12 → sql_glider-0.1.13}/tests/sqlglider/graph/test_query.py +0 -0
  131. {sql_glider-0.1.12 → sql_glider-0.1.13}/tests/sqlglider/graph/test_serialization.py +0 -0
  132. {sql_glider-0.1.12 → sql_glider-0.1.13}/tests/sqlglider/lineage/__init__.py +0 -0
  133. {sql_glider-0.1.12 → sql_glider-0.1.13}/tests/sqlglider/lineage/test_formatters.py +0 -0
  134. {sql_glider-0.1.12 → sql_glider-0.1.13}/tests/sqlglider/templating/__init__.py +0 -0
  135. {sql_glider-0.1.12 → sql_glider-0.1.13}/tests/sqlglider/templating/test_base.py +0 -0
  136. {sql_glider-0.1.12 → sql_glider-0.1.13}/tests/sqlglider/templating/test_jinja.py +0 -0
  137. {sql_glider-0.1.12 → sql_glider-0.1.13}/tests/sqlglider/templating/test_registry.py +0 -0
  138. {sql_glider-0.1.12 → sql_glider-0.1.13}/tests/sqlglider/templating/test_variables.py +0 -0
  139. {sql_glider-0.1.12 → sql_glider-0.1.13}/tests/sqlglider/test_cli.py +0 -0
  140. {sql_glider-0.1.12 → sql_glider-0.1.13}/tests/sqlglider/utils/__init__.py +0 -0
  141. {sql_glider-0.1.12 → sql_glider-0.1.13}/tests/sqlglider/utils/test_config.py +0 -0
  142. {sql_glider-0.1.12 → sql_glider-0.1.13}/tests/sqlglider/utils/test_file_utils.py +0 -0
  143. {sql_glider-0.1.12 → sql_glider-0.1.13}/tests/sqlglider/utils/test_schema.py +0 -0
  144. {sql_glider-0.1.12 → sql_glider-0.1.13}/uv.lock +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: sql-glider
3
- Version: 0.1.12
3
+ Version: 0.1.13
4
4
  Summary: SQL Utility Toolkit for better understanding, use, and governance of your queries in a native environment.
5
5
  Project-URL: Homepage, https://github.com/rycowhi/sql-glider/
6
6
  Project-URL: Repository, https://github.com/rycowhi/sql-glider/
@@ -28,7 +28,7 @@ version_tuple: VERSION_TUPLE
28
28
  commit_id: COMMIT_ID
29
29
  __commit_id__: COMMIT_ID
30
30
 
31
- __version__ = version = '0.1.12'
32
- __version_tuple__ = version_tuple = (0, 1, 12)
31
+ __version__ = version = '0.1.13'
32
+ __version_tuple__ = version_tuple = (0, 1, 13)
33
33
 
34
34
  __commit_id__ = commit_id = None
@@ -1166,36 +1166,57 @@ def graph_build(
1166
1166
  strict_schema=strict_schema,
1167
1167
  )
1168
1168
 
1169
- # Process manifest if provided
1170
- if manifest:
1171
- builder.add_manifest(manifest, dialect=dialect)
1172
-
1173
- # Process paths - collect all files first for progress tracking
1169
+ # Collect file paths for schema extraction
1170
+ path_files: list[Path] = []
1174
1171
  if paths:
1175
- all_files: list[Path] = []
1176
1172
  for path in paths:
1177
1173
  if path.is_dir():
1178
1174
  pattern = f"**/{glob_pattern}" if recursive else glob_pattern
1179
- all_files.extend(
1175
+ path_files.extend(
1180
1176
  f for f in sorted(path.glob(pattern)) if f.is_file()
1181
1177
  )
1182
1178
  elif path.is_file():
1183
- all_files.append(path)
1179
+ path_files.append(path)
1184
1180
  else:
1185
1181
  err_console.print(f"[red]Error:[/red] Path not found: {path}")
1186
1182
  raise typer.Exit(1)
1187
- builder.add_files(all_files, dialect=dialect)
1188
1183
 
1189
- # Dump resolved schema if requested
1190
- if dump_schema:
1191
- from sqlglider.graph.formatters import format_schema
1184
+ manifest_files: list[Path] = []
1185
+ if manifest:
1186
+ from sqlglider.graph.models import Manifest
1187
+
1188
+ manifest_data = Manifest.from_csv(manifest)
1189
+ base_dir = manifest.parent
1190
+ for entry in manifest_data.entries:
1191
+ file_path = Path(entry.file_path)
1192
+ if not file_path.is_absolute():
1193
+ file_path = (base_dir / entry.file_path).resolve()
1194
+ manifest_files.append(file_path)
1195
+
1196
+ # Extract schema upfront if requested, then dump before graph building
1197
+ all_files = manifest_files + path_files
1198
+ if resolve_schema and all_files:
1199
+ builder.extract_schemas(all_files, dialect=dialect)
1200
+
1201
+ if dump_schema:
1202
+ from sqlglider.graph.formatters import format_schema
1203
+
1204
+ schema_content = format_schema(
1205
+ builder.resolved_schema, dump_schema_format
1206
+ )
1207
+ dump_schema.write_text(schema_content, encoding="utf-8")
1208
+ console.print(
1209
+ f"[green]Schema dumped to {dump_schema} "
1210
+ f"({len(builder.resolved_schema)} table(s))[/green]"
1211
+ )
1212
+
1213
+ # Process manifest if provided
1214
+ if manifest:
1215
+ builder.add_manifest(manifest, dialect=dialect)
1192
1216
 
1193
- schema_content = format_schema(builder.resolved_schema, dump_schema_format)
1194
- dump_schema.write_text(schema_content, encoding="utf-8")
1195
- console.print(
1196
- f"[green]Schema dumped to {dump_schema} "
1197
- f"({len(builder.resolved_schema)} table(s))[/green]"
1198
- )
1217
+ # Process path-based files
1218
+ if path_files:
1219
+ builder.add_files(path_files, dialect=dialect)
1199
1220
 
1200
1221
  # Build and save graph
1201
1222
  graph = builder.build()
@@ -235,19 +235,10 @@ class GraphBuilder:
235
235
  if not files_with_dialects:
236
236
  return self
237
237
 
238
- # Two-pass schema resolution
239
- if self.resolve_schema:
240
- console.print("[blue]Pass 1: Extracting schema from files[/blue]")
238
+ # Two-pass schema resolution (skip if already resolved)
239
+ if self.resolve_schema and not self._resolved_schema:
241
240
  file_paths_only = [fp for fp, _ in files_with_dialects]
242
- self._resolved_schema = self._extract_schemas(file_paths_only, dialect)
243
- if self.catalog_type:
244
- self._resolved_schema = self._fill_schema_from_catalog(
245
- self._resolved_schema, file_paths_only, dialect
246
- )
247
- console.print(
248
- f"[blue]Schema resolved for "
249
- f"{len(self._resolved_schema)} table(s)[/blue]"
250
- )
241
+ self.extract_schemas(file_paths_only, dialect)
251
242
 
252
243
  total = len(files_with_dialects)
253
244
  description = "Pass 2: Analyzing lineage" if self.resolve_schema else "Parsing"
@@ -286,18 +277,9 @@ class GraphBuilder:
286
277
  if not file_paths:
287
278
  return self
288
279
 
289
- # Two-pass schema resolution: extract schema from all files first
290
- if self.resolve_schema:
291
- console.print("[blue]Pass 1: Extracting schema from files[/blue]")
292
- self._resolved_schema = self._extract_schemas(file_paths, dialect)
293
- if self.catalog_type:
294
- self._resolved_schema = self._fill_schema_from_catalog(
295
- self._resolved_schema, file_paths, dialect
296
- )
297
- console.print(
298
- f"[blue]Schema resolved for "
299
- f"{len(self._resolved_schema)} table(s)[/blue]"
300
- )
280
+ # Two-pass schema resolution (skip if already resolved)
281
+ if self.resolve_schema and not self._resolved_schema:
282
+ self.extract_schemas(file_paths, dialect)
301
283
 
302
284
  if show_progress:
303
285
  total = len(file_paths)
@@ -321,6 +303,34 @@ class GraphBuilder:
321
303
  self.add_file(file_path, dialect)
322
304
  return self
323
305
 
306
+ def extract_schemas(
307
+ self,
308
+ file_paths: List[Path],
309
+ dialect: Optional[str] = None,
310
+ ) -> Dict[str, Dict[str, str]]:
311
+ """Run schema extraction pass and optionally fill from catalog.
312
+
313
+ Call this before add_files/add_manifest to resolve schema upfront.
314
+ The resolved schema is stored internally and also returned.
315
+
316
+ Args:
317
+ file_paths: SQL files to extract schema from
318
+ dialect: SQL dialect override
319
+
320
+ Returns:
321
+ Resolved schema dict
322
+ """
323
+ console.print("[blue]Pass 1: Extracting schema from files[/blue]")
324
+ self._resolved_schema = self._extract_schemas(file_paths, dialect)
325
+ if self.catalog_type:
326
+ self._resolved_schema = self._fill_schema_from_catalog(
327
+ self._resolved_schema, file_paths, dialect
328
+ )
329
+ console.print(
330
+ f"[blue]Schema resolved for {len(self._resolved_schema)} table(s)[/blue]"
331
+ )
332
+ return self._resolved_schema.copy()
333
+
324
334
  def _extract_schemas(
325
335
  self,
326
336
  file_paths: List[Path],
@@ -859,18 +859,31 @@ class LineageAnalyzer:
859
859
  else:
860
860
  current_query_sql = self.expr.sql(dialect=self.dialect)
861
861
 
862
+ # Prune schema to only tables referenced in this query to avoid
863
+ # sqlglot.lineage() performance degradation with large schema dicts
864
+ pruned_schema: Optional[Dict[str, Dict[str, str]]] = None
865
+ if self._file_schema:
866
+ referenced = {t.lower() for t in self._get_query_tables()}
867
+ pruned_schema = {
868
+ table: cols
869
+ for table, cols in self._file_schema.items()
870
+ if table.lower() in referenced
871
+ }
872
+ if not pruned_schema:
873
+ pruned_schema = None
874
+
862
875
  for col in columns_to_analyze:
863
876
  try:
864
877
  # Get the column name that lineage expects
865
878
  lineage_col = self._column_mapping.get(col, col)
866
879
 
867
880
  # Get lineage tree for this column using current query SQL only
868
- # Pass file schema to enable SELECT * expansion for known tables/views
881
+ # Pass pruned schema to enable SELECT * expansion for known tables/views
869
882
  node = lineage(
870
883
  lineage_col,
871
884
  current_query_sql,
872
885
  dialect=self.dialect,
873
- schema=self._file_schema if self._file_schema else None,
886
+ schema=pruned_schema,
874
887
  )
875
888
 
876
889
  # Collect all source columns
@@ -842,3 +842,45 @@ class TestResolvedSchemaProperty:
842
842
  schema = builder.resolved_schema
843
843
  schema["injected"] = {"col": "UNKNOWN"}
844
844
  assert "injected" not in builder.resolved_schema
845
+
846
+
847
+ class TestExtractSchemas:
848
+ """Tests for the public extract_schemas method."""
849
+
850
+ def test_extract_schemas_returns_schema(self, tmp_path):
851
+ """extract_schemas returns inferred schema from files."""
852
+ sql_file = tmp_path / "query.sql"
853
+ sql_file.write_text("SELECT c.id, c.name FROM customers c;")
854
+ builder = GraphBuilder(resolve_schema=True)
855
+ schema = builder.extract_schemas([sql_file])
856
+ assert "customers" in schema
857
+ assert "id" in schema["customers"]
858
+ assert "name" in schema["customers"]
859
+
860
+ def test_extract_schemas_before_add_files(self, tmp_path):
861
+ """Calling extract_schemas before add_files avoids duplicate extraction."""
862
+ schema_file = tmp_path / "schema.sql"
863
+ schema_file.write_text("SELECT c.id, c.name FROM customers c;")
864
+ query_file = tmp_path / "query.sql"
865
+ query_file.write_text("SELECT * FROM customers;")
866
+
867
+ builder = GraphBuilder(resolve_schema=True)
868
+ schema = builder.extract_schemas([schema_file, query_file])
869
+ assert "customers" in schema
870
+
871
+ # add_files should skip Pass 1 since schema is already resolved
872
+ builder.add_files([query_file])
873
+ graph = builder.build()
874
+ assert graph is not None
875
+
876
+ # Schema should still be the same
877
+ assert builder.resolved_schema == schema
878
+
879
+ def test_extract_schemas_populates_resolved_schema(self, tmp_path):
880
+ """extract_schemas populates the resolved_schema property."""
881
+ sql_file = tmp_path / "query.sql"
882
+ sql_file.write_text("SELECT u.id, u.email FROM users u;")
883
+ builder = GraphBuilder(resolve_schema=True)
884
+ assert builder.resolved_schema == {}
885
+ builder.extract_schemas([sql_file])
886
+ assert "users" in builder.resolved_schema
@@ -3133,3 +3133,51 @@ class TestSchemaParam:
3133
3133
  schema = analyzer.get_extracted_schema()
3134
3134
  assert "v1" in schema
3135
3135
  assert set(schema["v1"].keys()) == {"id", "name"}
3136
+
3137
+
3138
+ class TestSchemaPruning:
3139
+ """Tests that schema pruning doesn't affect lineage correctness."""
3140
+
3141
+ def test_large_schema_same_results_as_small(self):
3142
+ """Lineage results are identical with a large unreferenced schema."""
3143
+ sql = "SELECT c.id, c.name FROM customers c"
3144
+
3145
+ small_schema = {
3146
+ "customers": {"id": "UNKNOWN", "name": "UNKNOWN"},
3147
+ }
3148
+ big_schema = dict(small_schema)
3149
+ for i in range(200):
3150
+ big_schema[f"unrelated_table_{i}"] = {
3151
+ f"col_{j}": "UNKNOWN" for j in range(20)
3152
+ }
3153
+
3154
+ analyzer_small = LineageAnalyzer(sql, dialect="spark", schema=small_schema)
3155
+ results_small = analyzer_small.analyze_queries(level=AnalysisLevel.COLUMN)
3156
+
3157
+ analyzer_big = LineageAnalyzer(sql, dialect="spark", schema=big_schema)
3158
+ results_big = analyzer_big.analyze_queries(level=AnalysisLevel.COLUMN)
3159
+
3160
+ items_small = [
3161
+ (item.output_name, item.source_name)
3162
+ for r in results_small
3163
+ for item in r.lineage_items
3164
+ ]
3165
+ items_big = [
3166
+ (item.output_name, item.source_name)
3167
+ for r in results_big
3168
+ for item in r.lineage_items
3169
+ ]
3170
+ assert items_small == items_big
3171
+
3172
+ def test_star_expansion_works_with_pruned_schema(self):
3173
+ """SELECT * expansion still works when schema is pruned."""
3174
+ sql = "SELECT * FROM users"
3175
+ schema = {
3176
+ "users": {"id": "UNKNOWN", "email": "UNKNOWN"},
3177
+ "unrelated": {"col": "UNKNOWN"},
3178
+ }
3179
+ analyzer = LineageAnalyzer(sql, dialect="spark", schema=schema)
3180
+ results = analyzer.analyze_queries(level=AnalysisLevel.COLUMN)
3181
+ output_names = {item.output_name for r in results for item in r.lineage_items}
3182
+ assert "id" in output_names
3183
+ assert "email" in output_names
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes