sql-glider 0.1.2__py3-none-any.whl → 0.1.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: sql-glider
- Version: 0.1.2
+ Version: 0.1.4
  Summary: SQL Utility Toolkit for better understanding, use, and governance of your queries in a native environment.
  Project-URL: Homepage, https://github.com/rycowhi/sql-glider/
  Project-URL: Repository, https://github.com/rycowhi/sql-glider/
@@ -26,6 +26,8 @@ Requires-Dist: rich>=13.0.0
  Requires-Dist: rustworkx>=0.15.0
  Requires-Dist: sqlglot[rs]>=25.0.0
  Requires-Dist: typer>=0.9.0
+ Provides-Extra: databricks
+ Requires-Dist: databricks-sdk>=0.20.0; extra == 'databricks'
  Description-Content-Type: text/markdown

  # SQL Glider
@@ -40,6 +42,7 @@ SQL Glider provides powerful column-level and table-level lineage analysis for S

  - **Forward Lineage:** Trace output columns back to their source tables and columns
  - **Reverse Lineage:** Impact analysis - find which output columns are affected by a source column
+ - **Query Dissection:** Decompose SQL into components (CTEs, subqueries, UNION branches) for unit testing
  - **Table Extraction:** List all tables in SQL files with usage type (INPUT/OUTPUT) and object type (TABLE/VIEW/CTE)
  - **Multi-level Tracing:** Automatically handles CTEs, subqueries, and complex expressions
  - **Graph-Based Lineage:** Build and query lineage graphs across thousands of SQL files
@@ -171,15 +174,32 @@ List all tables involved in SQL files with usage and type information:

  ```bash
  # List all tables in a SQL file
- uv run sqlglider tables query.sql
+ uv run sqlglider tables overview query.sql

  # JSON output with detailed table info
- uv run sqlglider tables query.sql --output-format json
+ uv run sqlglider tables overview query.sql --output-format json

  # Export to CSV
- uv run sqlglider tables query.sql --output-format csv --output-file tables.csv
+ uv run sqlglider tables overview query.sql --output-format csv --output-file tables.csv
  ```

+ ### Pull DDL from Remote Catalogs
+
+ Fetch DDL definitions from remote data catalogs (e.g., Databricks Unity Catalog):
+
+ ```bash
+ # Pull DDL for all tables used in a SQL file (outputs to stdout)
+ uv run sqlglider tables pull query.sql --catalog-type databricks
+
+ # Save DDL files to a folder (one file per table)
+ uv run sqlglider tables pull query.sql -c databricks -o ./ddl/
+
+ # List available catalog providers
+ uv run sqlglider tables pull --list
+ ```
+
+ **Note:** Requires optional dependencies. Install with: `pip install sql-glider[databricks]`
+
  **Example Output (JSON):**
  ```json
  {
@@ -204,6 +224,94 @@ uv run sqlglider tables query.sql --output-format csv --output-file tables.csv
  - `CTE`: Common Table Expression (WITH clause)
  - `UNKNOWN`: Cannot determine type from SQL alone

+ ### Query Dissection
+
+ Decompose SQL queries into constituent parts for unit testing and analysis:
+
+ ```bash
+ # Dissect a SQL file (text output)
+ uv run sqlglider dissect query.sql
+
+ # JSON output with full component details
+ uv run sqlglider dissect query.sql --output-format json
+
+ # CSV output for spreadsheet analysis
+ uv run sqlglider dissect query.sql --output-format csv
+
+ # Export to file
+ uv run sqlglider dissect query.sql -f json -o dissected.json
+
+ # With templating support
+ uv run sqlglider dissect query.sql --templater jinja --var schema=analytics
+
+ # From stdin
+ echo "WITH cte AS (SELECT id FROM users) SELECT * FROM cte" | uv run sqlglider dissect
+ ```
+
+ **Example Input:**
+ ```sql
+ WITH order_totals AS (
+     SELECT customer_id, SUM(amount) AS total
+     FROM orders
+     GROUP BY customer_id
+ )
+ INSERT INTO analytics.summary
+ SELECT * FROM order_totals WHERE total > 100
+ ```
+
+ **Example Output (JSON):**
+ ```json
+ {
+   "queries": [{
+     "query_index": 0,
+     "statement_type": "INSERT",
+     "total_components": 3,
+     "components": [
+       {
+         "component_type": "CTE",
+         "component_index": 0,
+         "name": "order_totals",
+         "sql": "SELECT customer_id, SUM(amount) AS total FROM orders GROUP BY customer_id",
+         "is_executable": true,
+         "dependencies": [],
+         "location": "WITH clause"
+       },
+       {
+         "component_type": "TARGET_TABLE",
+         "component_index": 1,
+         "name": "analytics.summary",
+         "sql": "analytics.summary",
+         "is_executable": false,
+         "location": "INSERT INTO target"
+       },
+       {
+         "component_type": "SOURCE_QUERY",
+         "component_index": 2,
+         "sql": "SELECT * FROM order_totals WHERE total > 100",
+         "is_executable": true,
+         "dependencies": ["order_totals"],
+         "location": "INSERT source SELECT"
+       }
+     ]
+   }]
+ }
+ ```
+
+ **Extracted Component Types:**
+ - `CTE`: Common Table Expressions from WITH clause
+ - `MAIN_QUERY`: The primary SELECT statement
+ - `SUBQUERY`: Nested SELECT in FROM clause
+ - `SCALAR_SUBQUERY`: Single-value subquery in SELECT list, WHERE, HAVING
+ - `TARGET_TABLE`: Output table for INSERT/CREATE/MERGE (not executable)
+ - `SOURCE_QUERY`: SELECT within DML/DDL statements
+ - `UNION_BRANCH`: Individual SELECT in UNION/UNION ALL
+
+ **Use Cases:**
+ - Unit test CTEs and subqueries individually (see the sketch after this hunk)
+ - Extract DQL from CTAS, CREATE VIEW, INSERT statements
+ - Analyze query structure and component dependencies
+ - Break apart complex queries for understanding
+
  ### Different SQL Dialects

  ```bash
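The JSON output above lends itself to automated checks. A minimal sketch (not part of sql-glider) that loads the `dissected.json` produced by the export command and smoke-tests every executable component by parsing it with sqlglot, which is already a core dependency of the package; the `spark` dialect mirrors the documented default:

```python
import json

import sqlglot  # already a sql-glider core dependency

with open("dissected.json") as f:
    dissected = json.load(f)

for query in dissected["queries"]:
    for comp in query["components"]:
        if not comp["is_executable"]:
            continue  # e.g., TARGET_TABLE entries are not standalone queries
        # Parsing is a cheap pre-flight check before running the fragment
        # against a real engine in a unit test.
        sqlglot.parse_one(comp["sql"], dialect="spark")
        print(f"{comp['component_type']} #{comp['component_index']} parses OK")
```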
@@ -475,7 +583,7 @@ Options:
  ### Tables Command

  ```
- sqlglider tables <sql_file> [OPTIONS]
+ sqlglider tables overview <sql_file> [OPTIONS]

  Arguments:
    sql_file    Path to SQL file to analyze [required]
@@ -491,6 +599,66 @@ Options:
    --help                Show help message and exit
  ```

+ ```
+ sqlglider tables pull <sql_file> [OPTIONS]
+
+ Arguments:
+   sql_file    Path to SQL file to analyze [optional, reads from stdin if omitted]
+
+ Options:
+   --catalog-type, -c    Catalog provider (e.g., 'databricks') [required if not in config]
+   --ddl-folder, -o      Output folder for DDL files [optional, outputs to stdout if omitted]
+   --dialect, -d         SQL dialect (spark, postgres, snowflake, etc.) [default: spark]
+   --templater, -t       Templater for SQL preprocessing (e.g., 'jinja', 'none') [optional]
+   --var, -v             Template variable in key=value format (repeatable) [optional]
+   --vars-file           Path to variables file (JSON or YAML) [optional]
+   --list, -l            List available catalog providers and exit
+   --help                Show help message and exit
+ ```
+
+ **Databricks Setup:**
+
+ Install the optional Databricks dependency:
+ ```bash
+ pip install sql-glider[databricks]
+ ```
+
+ Configure authentication (via environment variables or `sqlglider.toml`):
+ ```bash
+ export DATABRICKS_HOST="https://your-workspace.cloud.databricks.com"
+ export DATABRICKS_TOKEN="dapi..."
+ export DATABRICKS_WAREHOUSE_ID="abc123..."
+ ```
+
+ ### Dissect Command
634
+
635
+ ```
636
+ sqlglider dissect [sql_file] [OPTIONS]
637
+
638
+ Arguments:
639
+ sql_file Path to SQL file to analyze [optional, reads from stdin if omitted]
640
+
641
+ Options:
642
+ --dialect, -d SQL dialect (spark, postgres, snowflake, etc.) [default: spark]
643
+ --output-format, -f Output format: 'text', 'json', or 'csv' [default: text]
644
+ --output-file, -o Write output to file instead of stdout [optional]
645
+ --templater, -t Templater for SQL preprocessing (e.g., 'jinja', 'none') [optional]
646
+ --var, -v Template variable in key=value format (repeatable) [optional]
647
+ --vars-file Path to variables file (JSON or YAML) [optional]
648
+ --help Show help message and exit
649
+ ```
650
+
651
+ **Output Fields:**
652
+ - `component_type`: Type of component (CTE, MAIN_QUERY, SUBQUERY, etc.)
653
+ - `component_index`: Sequential order within the query (0-based)
654
+ - `name`: CTE name, subquery alias, or target table name
655
+ - `sql`: The extracted SQL for this component
656
+ - `is_executable`: Whether the component can run standalone (TARGET_TABLE is false)
657
+ - `dependencies`: List of CTE names this component references
658
+ - `location`: Human-readable context (e.g., "WITH clause", "FROM clause")
659
+ - `depth`: Nesting level (0 = top-level)
660
+ - `parent_index`: Index of parent component for nested components
661
+
494
662
  ### Graph Commands
495
663
 
496
664
  ```
@@ -612,6 +780,10 @@ See [ARCHITECTURE.md](ARCHITECTURE.md) for detailed technical documentation.
  ```
  src/sqlglider/
  ├── cli.py                 # Typer CLI entry point
+ ├── dissection/
+ │   ├── analyzer.py        # DissectionAnalyzer for query decomposition
+ │   ├── formatters.py      # Output formatters (text, JSON, CSV)
+ │   └── models.py          # ComponentType, SQLComponent, QueryDissectionResult
  ├── graph/
  │   ├── builder.py         # Build graphs from SQL files
  │   ├── merge.py           # Merge multiple graphs
sql_glider-0.1.4.dist-info/RECORD ADDED
@@ -0,0 +1,34 @@
+ sqlglider/__init__.py,sha256=gDf7s52dMcX7JuCZ1SLawcB1vb3U0yJCohu9RQAATBY,125
+ sqlglider/_version.py,sha256=rLCrf4heo25FJtBY-2Ap7ZuWW-5FS7sqTjsolIUuI5c,704
+ sqlglider/cli.py,sha256=9sweHRVLk2iBSzCzT2Gcj8y1g1XKzq26iApQsMaFbx4,51786
+ sqlglider/global_models.py,sha256=2vyJXAuXOsXQpE-D3F0ejj7eR9z0nDWFjTkielhzM8k,356
+ sqlglider/catalog/__init__.py,sha256=2PqFPyzFXJ14FpSUcBmVK2L-a_ypWQHAbHFHxLDk_LE,814
+ sqlglider/catalog/base.py,sha256=R7htHC43InpH4uRjYk33dMYYji6oylHns7Ye_mgfjJE,3116
+ sqlglider/catalog/databricks.py,sha256=Ho1crIKv1bw-fXkWUhQhcKfiYQEGinGSxBS2zoVLB3o,9504
+ sqlglider/catalog/registry.py,sha256=KD1XrvK46xSrK5IikzbdbTSk_-wwRTXvyBxXn3m-Rx0,3391
+ sqlglider/dissection/__init__.py,sha256=ObXM7AXTJZvheIg36ps9KuFsXPV7WmWamaA4xPfxP4s,396
+ sqlglider/dissection/analyzer.py,sha256=-GD3-lTbfBthq1BW6HiDjvJx2y4LDmnUVHIVIb0HqrU,27085
+ sqlglider/dissection/formatters.py,sha256=M7gsmTNljRIeLIRv4D0vHvqJVrTqWSpsg7vem83zSzY,7302
+ sqlglider/dissection/models.py,sha256=RRD3RIteqbUBY6e-74skKDvMH3qeAUaqA2sFcrjP5GQ,3618
+ sqlglider/graph/__init__.py,sha256=4DDdrPM75CmeQWt7wHdBsjCm1s70BHGLYdijIbaUEKY,871
+ sqlglider/graph/builder.py,sha256=WdMUwKlB6UGtr7CA-J5Lj7D2GMQJZzteDetzr7Pe4Kk,11916
+ sqlglider/graph/merge.py,sha256=uUZlm4BN3S9gRL66Cc2mzhbtuh4SVAv2n4cN4eUEQBU,4077
+ sqlglider/graph/models.py,sha256=EYmjv_WzDSNp_WfhJ6H-qBIOkAcoNKS7GRUryfKrHuY,9330
+ sqlglider/graph/query.py,sha256=LHU8Cvn7ZPPSEnqdDn2pF8f1_LQjIvNIrZqs8cFlb6U,9433
+ sqlglider/graph/serialization.py,sha256=7JJo31rwSlxnDhdqdTJdK4Dr_ZcSYetXfx3_CmndSac,2662
+ sqlglider/lineage/__init__.py,sha256=llXMeI5_PIZaiBo8tKk3-wOubF4m_6QBHbn1FtWxT7k,256
+ sqlglider/lineage/analyzer.py,sha256=kRhGcGaiixxtrf9vO8g09omayjB2G3LA9hLCOLaTyPg,56811
+ sqlglider/lineage/formatters.py,sha256=_Y9wcTX4JXn1vVnZ1xI656g1FF2rMjcAVc-GHjbd9QA,10389
+ sqlglider/templating/__init__.py,sha256=g3_wb6rSDI0usq2UUMDpn-J5kVwlAw3NtLdwbxL6UHs,1435
+ sqlglider/templating/base.py,sha256=y5bWAW7qXl_4pPyo5KycfHwNVvt1-7slZ63DAsvTE1s,2902
+ sqlglider/templating/jinja.py,sha256=o01UG72N4G1-tOT5LKK1Wkccv4nJH2VN4VFaMi5c1-g,5220
+ sqlglider/templating/registry.py,sha256=BJU3N2qNVMTUtkgbibyqo8Wme_acXQRw5XI-6ZVgyac,3476
+ sqlglider/templating/variables.py,sha256=5593PtLBcOxsnMCSRm2pGAD5I0Y9f__VV3_J_HfXVlQ,8010
+ sqlglider/utils/__init__.py,sha256=KGp9-UzKz_OFBOTFoSy-g-NXDZsvyWXG_9-1zcC6ePE,276
+ sqlglider/utils/config.py,sha256=iNJgSXFw3pmL2MCdvW3SJp4X2T3AQP2QyQuXIXT-6H0,4761
+ sqlglider/utils/file_utils.py,sha256=5_ff28E0r1R7emZzsOnRuHd-7zIX6873eyr1SuPEr4E,1093
+ sql_glider-0.1.4.dist-info/METADATA,sha256=-gzDzEyZ116YpDBNbIwWMgMO184s-WkDKMxMH92lOqA,28445
+ sql_glider-0.1.4.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
+ sql_glider-0.1.4.dist-info/entry_points.txt,sha256=HDuakHqHS5C0HFKsMIxMYmDU7-BLBGrnIJcYaVRu-s0,251
+ sql_glider-0.1.4.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+ sql_glider-0.1.4.dist-info/RECORD,,
sql_glider-0.1.4.dist-info/entry_points.txt CHANGED
@@ -1,6 +1,9 @@
  [console_scripts]
  sqlglider = sqlglider.cli:app

+ [sqlglider.catalogs]
+ databricks = sqlglider.catalog.databricks:DatabricksCatalog
+
  [sqlglider.templaters]
  jinja = sqlglider.templating.jinja:JinjaTemplater
  none = sqlglider.templating.base:NoOpTemplater
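The new `[sqlglider.catalogs]` entry-point group mirrors the existing `[sqlglider.templaters]` group: a third-party package can ship its own `Catalog` subclass (the abstract base is added in `sqlglider/catalog/base.py` below) and have it discovered without modifying sql-glider. A sketch of the registration in a hypothetical plugin's `pyproject.toml`, reusing the `module:Class` value format shown above; the plugin and class names are placeholders:

```toml
# Hypothetical third-party plugin registering a catalog provider.
[project.entry-points."sqlglider.catalogs"]
my_catalog = "my_glider_plugin.catalog:MyCatalog"
```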
sqlglider/_version.py CHANGED
@@ -28,7 +28,7 @@ version_tuple: VERSION_TUPLE
  commit_id: COMMIT_ID
  __commit_id__: COMMIT_ID

- __version__ = version = '0.1.2'
- __version_tuple__ = version_tuple = (0, 1, 2)
+ __version__ = version = '0.1.4'
+ __version_tuple__ = version_tuple = (0, 1, 4)

  __commit_id__ = commit_id = None
sqlglider/catalog/__init__.py ADDED
@@ -0,0 +1,30 @@
+ """Catalog module for fetching DDL from remote data catalogs.
+
+ This module provides a plugin system for connecting to various data catalogs
+ (e.g., Databricks Unity Catalog) and fetching table DDL definitions.
+
+ Example:
+     >>> from sqlglider.catalog import get_catalog, list_catalogs
+     >>> print(list_catalogs())
+     ['databricks']
+     >>> catalog = get_catalog("databricks")
+     >>> catalog.configure({"warehouse_id": "abc123"})
+     >>> ddl = catalog.get_ddl("my_catalog.my_schema.my_table")
+ """
+
+ from sqlglider.catalog.base import Catalog, CatalogError
+ from sqlglider.catalog.registry import (
+     clear_registry,
+     get_catalog,
+     list_catalogs,
+     register_catalog,
+ )
+
+ __all__ = [
+     "Catalog",
+     "CatalogError",
+     "get_catalog",
+     "list_catalogs",
+     "register_catalog",
+     "clear_registry",
+ ]
sqlglider/catalog/base.py ADDED
@@ -0,0 +1,99 @@
+ """Base classes for catalog system.
+
+ This module defines the abstract interface for catalog providers and provides
+ the exception class for catalog-related errors.
+ """
+
+ from abc import ABC, abstractmethod
+ from typing import Any, Dict, List, Optional
+
+
+ class CatalogError(Exception):
+     """Exception raised when catalog operations fail."""
+
+     pass
+
+
+ class Catalog(ABC):
+     """Abstract base class for catalog providers.
+
+     All catalog implementations must inherit from this class and implement
+     the required methods. Catalogs are discovered via entry points and
+     can be used to fetch DDL definitions from remote data catalogs.
+
+     Example:
+         >>> class MyCatalog(Catalog):
+         ...     @property
+         ...     def name(self) -> str:
+         ...         return "my-catalog"
+         ...
+         ...     def get_ddl(self, table_name: str) -> str:
+         ...         # Fetch DDL from remote catalog
+         ...         return "CREATE TABLE ..."
+         ...
+         ...     def get_ddl_batch(self, table_names: List[str]) -> Dict[str, str]:
+         ...         return {name: self.get_ddl(name) for name in table_names}
+     """
+
+     @property
+     @abstractmethod
+     def name(self) -> str:
+         """Return the catalog provider name.
+
+         This name is used to identify the catalog in configuration
+         and CLI options.
+
+         Returns:
+             The unique name of this catalog provider.
+         """
+         pass
+
+     @abstractmethod
+     def get_ddl(self, table_name: str) -> str:
+         """Fetch DDL for a single table from the remote catalog.
+
+         Args:
+             table_name: The fully qualified table name (e.g., "catalog.schema.table").
+
+         Returns:
+             The DDL statement for creating the table.
+
+         Raises:
+             CatalogError: If the DDL cannot be fetched (table not found,
+                 authentication failure, network error, etc.).
+         """
+         pass
+
+     @abstractmethod
+     def get_ddl_batch(self, table_names: List[str]) -> Dict[str, str]:
+         """Fetch DDL for multiple tables from the remote catalog.
+
+         This method may be more efficient than calling get_ddl() multiple
+         times, as implementations can batch requests where supported.
+
+         Args:
+             table_names: List of fully qualified table names.
+
+         Returns:
+             Dictionary mapping table names to their DDL statements.
+             Tables that couldn't be found will have None as their value.
+
+         Raises:
+             CatalogError: If the batch operation fails entirely.
+         """
+         pass
+
+     def configure(self, config: Optional[Dict[str, Any]] = None) -> None:
+         """Configure the catalog with provider-specific settings.
+
+         This method is called after instantiation to pass configuration
+         from sqlglider.toml or environment variables.
+
+         Args:
+             config: Provider-specific configuration dictionary.
+                 Keys and values depend on the catalog implementation.
+
+         Raises:
+             CatalogError: If required configuration is missing or invalid.
+         """
+         pass
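To make the contract concrete, here is a minimal sketch of an in-memory implementation that could serve as a test double. It relies only on the abstract interface above; the `StaticCatalog` name and the DDL strings are illustrative, not part of the package:

```python
from typing import Any, Dict, List, Optional

from sqlglider.catalog.base import Catalog, CatalogError


class StaticCatalog(Catalog):
    """Illustrative catalog backed by a fixed dict instead of a remote service."""

    def __init__(self) -> None:
        self._ddl: Dict[str, str] = {}

    @property
    def name(self) -> str:
        return "static"

    def configure(self, config: Optional[Dict[str, Any]] = None) -> None:
        # Treat the config dict itself as the table -> DDL mapping.
        self._ddl = dict(config or {})

    def get_ddl(self, table_name: str) -> str:
        try:
            return self._ddl[table_name]
        except KeyError:
            raise CatalogError(f"Table '{table_name}' not found") from None

    def get_ddl_batch(self, table_names: List[str]) -> Dict[str, str]:
        # Naive batch: one lookup per table, matching the base-class contract.
        return {name: self.get_ddl(name) for name in table_names}


catalog = StaticCatalog()
catalog.configure({"main.sales.orders": "CREATE TABLE orders (id INT)"})
print(catalog.get_ddl("main.sales.orders"))
```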
sqlglider/catalog/databricks.py ADDED
@@ -0,0 +1,255 @@
+ """Databricks catalog implementation.
+
+ This module provides integration with Databricks Unity Catalog for fetching
+ table DDL definitions using the Databricks SDK.
+
+ Requires the optional 'databricks' dependency:
+     pip install sql-glider[databricks]
+ """
+
+ import os
+ from typing import Any, Dict, List, Optional
+
+ from sqlglider.catalog.base import Catalog, CatalogError
+
+ # Lazy import to avoid requiring databricks-sdk unless actually used
+ _databricks_sdk_available: Optional[bool] = None
+
+
+ def _check_databricks_sdk() -> None:
+     """Check if databricks-sdk is installed."""
+     global _databricks_sdk_available
+     if _databricks_sdk_available is None:
+         try:
+             import databricks.sdk  # noqa: F401
+
+             _databricks_sdk_available = True
+         except ImportError:
+             _databricks_sdk_available = False
+
+     if not _databricks_sdk_available:
+         raise CatalogError(
+             "The 'databricks-sdk' package is required for Databricks catalog support. "
+             "Install it with: pip install sql-glider[databricks]"
+         )
+
+
+ class DatabricksCatalog(Catalog):
+     """Databricks Unity Catalog provider.
+
+     Fetches table DDL using the Databricks SDK's statement execution API.
+
+     Authentication:
+         Authentication is handled by the Databricks SDK's unified authentication,
+         which automatically tries multiple methods in order:
+
+         1. Direct configuration (host + token in sqlglider.toml)
+         2. Environment variables (DATABRICKS_HOST, DATABRICKS_TOKEN, etc.)
+         3. Databricks CLI profile (~/.databrickscfg) - use 'profile' config option
+         4. Azure CLI authentication (for Azure Databricks)
+         5. Google Cloud authentication (for GCP Databricks)
+         6. OAuth M2M (client credentials) via environment variables:
+            - DATABRICKS_CLIENT_ID
+            - DATABRICKS_CLIENT_SECRET
+
+         For OAuth M2M, set these environment variables:
+             export DATABRICKS_HOST=https://your-workspace.cloud.databricks.com
+             export DATABRICKS_CLIENT_ID=your-client-id
+             export DATABRICKS_CLIENT_SECRET=your-client-secret
+
+         For Databricks CLI profile, either:
+         - Configure DEFAULT profile in ~/.databrickscfg
+         - Set profile name in sqlglider.toml: profile = "my-profile"
+
+     Configuration:
+         - warehouse_id (required): SQL warehouse ID for statement execution
+         - profile (optional): Databricks CLI profile name from ~/.databrickscfg
+         - host (optional): Databricks workspace URL
+         - token (optional): Personal access token (legacy, prefer OAuth)
+
+     Example:
+         >>> # Using environment variables or CLI profile (recommended)
+         >>> catalog = DatabricksCatalog()
+         >>> catalog.configure({"warehouse_id": "abc123def456"})
+         >>> ddl = catalog.get_ddl("my_catalog.my_schema.my_table")
+
+         >>> # Using specific CLI profile
+         >>> catalog.configure({"warehouse_id": "abc123", "profile": "dev-workspace"})
+     """
+
+     def __init__(self) -> None:
+         """Initialize the Databricks catalog."""
+         self._warehouse_id: Optional[str] = None
+         self._profile: Optional[str] = None
+         self._host: Optional[str] = None
+         self._token: Optional[str] = None
+         self._client: Any = None
+
+     @property
+     def name(self) -> str:
+         """Return the catalog provider name."""
+         return "databricks"
+
+     def configure(self, config: Optional[Dict[str, Any]] = None) -> None:
+         """Configure the Databricks catalog.
+
+         Args:
+             config: Configuration dictionary with optional keys:
+                 - warehouse_id: SQL warehouse ID (required, or set DATABRICKS_WAREHOUSE_ID)
+                 - profile: Databricks CLI profile name from ~/.databrickscfg
+                 - host: Databricks workspace URL (only needed if not using profile/env)
+                 - token: Personal access token (legacy, prefer OAuth or profile)
+
+         Raises:
+             CatalogError: If warehouse_id is not provided and not in environment.
+         """
+         config = config or {}
+
+         # Get warehouse_id from config or environment
+         self._warehouse_id = config.get("warehouse_id") or os.environ.get(
+             "DATABRICKS_WAREHOUSE_ID"
+         )
+         if not self._warehouse_id:
+             raise CatalogError(
+                 "Databricks warehouse_id is required. "
+                 "Set it in sqlglider.toml under [sqlglider.catalog.databricks] "
+                 "or via the DATABRICKS_WAREHOUSE_ID environment variable."
+             )
+
+         # Get optional profile for CLI profile-based auth
+         self._profile = config.get("profile")
+
+         # Get optional host and token - only from config, not env vars
+         # Let the SDK handle env var discovery for better unified auth support
+         self._host = config.get("host")
+         self._token = config.get("token")
+
+         # Reset client so it gets recreated with new config
+         self._client = None
+
+     def _get_client(self) -> Any:
+         """Get or create the Databricks WorkspaceClient.
+
+         The SDK uses unified authentication, trying methods in this order:
+         1. Explicit host/token if provided in config
+         2. Profile from ~/.databrickscfg if specified
+         3. Environment variables (DATABRICKS_HOST, DATABRICKS_TOKEN, etc.)
+         4. OAuth M2M via DATABRICKS_CLIENT_ID/DATABRICKS_CLIENT_SECRET
+         5. Azure CLI / Google Cloud auth for cloud-hosted workspaces
+
+         Returns:
+             The WorkspaceClient instance.
+
+         Raises:
+             CatalogError: If the SDK is not installed or authentication fails.
+         """
+         _check_databricks_sdk()
+
+         if self._client is None:
+             from databricks.sdk import WorkspaceClient
+
+             # Build kwargs for WorkspaceClient
+             # Only pass values that are explicitly configured
+             # Let SDK handle env var discovery for unified auth
+             kwargs: Dict[str, Any] = {}
+             if self._profile:
+                 kwargs["profile"] = self._profile
+             if self._host:
+                 kwargs["host"] = self._host
+             if self._token:
+                 kwargs["token"] = self._token
+
+             try:
+                 self._client = WorkspaceClient(**kwargs)
+             except Exception as e:
+                 raise CatalogError(
+                     f"Failed to authenticate with Databricks: {e}"
+                 ) from e
+
+         return self._client
+
+     def get_ddl(self, table_name: str) -> str:
+         """Fetch DDL for a single table from Databricks.
+
+         Uses SHOW CREATE TABLE to get the full DDL statement.
+
+         Args:
+             table_name: The fully qualified table name (catalog.schema.table).
+
+         Returns:
+             The CREATE TABLE DDL statement.
+
+         Raises:
+             CatalogError: If the table is not found or the query fails.
+         """
+         if not self._warehouse_id:
+             raise CatalogError(
+                 "Catalog not configured. Call configure() with warehouse_id first."
+             )
+
+         client = self._get_client()
+
+         try:
+             # Execute SHOW CREATE TABLE statement
+             response = client.statement_execution.execute_statement(
+                 warehouse_id=self._warehouse_id,
+                 statement=f"SHOW CREATE TABLE {table_name}",
+                 wait_timeout="30s",
+             )
+
+             # Check for errors
+             if response.status and response.status.state:
+                 state = response.status.state.value
+                 if state == "FAILED":
+                     error_msg = (
+                         response.status.error.message
+                         if response.status.error
+                         else "Unknown error"
+                     )
+                     raise CatalogError(
+                         f"Failed to get DDL for '{table_name}': {error_msg}"
+                     )
+
+             # Extract DDL from result
+             if response.result and response.result.data_array:
+                 # SHOW CREATE TABLE returns a single row with the DDL
+                 ddl_parts = []
+                 for row in response.result.data_array:
+                     if row:
+                         ddl_parts.append(str(row[0]))
+                 return "\n".join(ddl_parts)
+
+             raise CatalogError(f"No DDL returned for table '{table_name}'")
+
+         except CatalogError:
+             raise
+         except Exception as e:
+             raise CatalogError(f"Failed to fetch DDL for '{table_name}': {e}") from e
+
+     def get_ddl_batch(self, table_names: List[str]) -> Dict[str, str]:
+         """Fetch DDL for multiple tables from Databricks.
+
+         Currently executes individual queries for each table.
+         Future optimization could use parallel execution.
+
+         Args:
+             table_names: List of fully qualified table names.
+
+         Returns:
+             Dictionary mapping table names to their DDL statements.
+             Tables that couldn't be found will have error messages as values
+             prefixed with "ERROR: ".
+
+         Raises:
+             CatalogError: If the batch operation fails entirely.
+         """
+         results: Dict[str, str] = {}
+
+         for table_name in table_names:
+             try:
+                 results[table_name] = self.get_ddl(table_name)
+             except CatalogError as e:
+                 # Store error message for this table but continue with others
+                 results[table_name] = f"ERROR: {e}"
+
+         return results
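Putting the pieces together, an end-to-end sketch of the Python API added in this release. The registry resolves "databricks" through the new entry point; the warehouse ID and table names below are placeholders, and credentials must already be available to the Databricks SDK:

```python
from sqlglider.catalog import get_catalog, list_catalogs

print(list_catalogs())  # ['databricks'] once the extra is installed

catalog = get_catalog("databricks")
catalog.configure({"warehouse_id": "abc123"})  # placeholder warehouse ID

# Batch fetch: per the implementation above, tables that fail come back
# as "ERROR: ..." strings instead of aborting the whole run.
ddls = catalog.get_ddl_batch(["main.sales.orders", "main.sales.customers"])
for table, ddl in ddls.items():
    print(f"-- {table}\n{ddl}\n")
```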