sql-glider 0.1.2__py3-none-any.whl → 0.1.3__py3-none-any.whl
- {sql_glider-0.1.2.dist-info → sql_glider-0.1.3.dist-info}/METADATA +177 -5
- sql_glider-0.1.3.dist-info/RECORD +34 -0
- {sql_glider-0.1.2.dist-info → sql_glider-0.1.3.dist-info}/entry_points.txt +3 -0
- sqlglider/_version.py +2 -2
- sqlglider/catalog/__init__.py +30 -0
- sqlglider/catalog/base.py +99 -0
- sqlglider/catalog/databricks.py +255 -0
- sqlglider/catalog/registry.py +121 -0
- sqlglider/cli.py +467 -15
- sqlglider/dissection/__init__.py +17 -0
- sqlglider/dissection/analyzer.py +767 -0
- sqlglider/dissection/formatters.py +222 -0
- sqlglider/dissection/models.py +112 -0
- sqlglider/graph/builder.py +46 -8
- sqlglider/lineage/analyzer.py +66 -12
- sqlglider/utils/config.py +25 -0
- sql_glider-0.1.2.dist-info/RECORD +0 -26
- {sql_glider-0.1.2.dist-info → sql_glider-0.1.3.dist-info}/WHEEL +0 -0
- {sql_glider-0.1.2.dist-info → sql_glider-0.1.3.dist-info}/licenses/LICENSE +0 -0
{sql_glider-0.1.2.dist-info → sql_glider-0.1.3.dist-info}/METADATA CHANGED

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: sql-glider
-Version: 0.1.2
+Version: 0.1.3
 Summary: SQL Utility Toolkit for better understanding, use, and governance of your queries in a native environment.
 Project-URL: Homepage, https://github.com/rycowhi/sql-glider/
 Project-URL: Repository, https://github.com/rycowhi/sql-glider/
@@ -26,6 +26,8 @@ Requires-Dist: rich>=13.0.0
 Requires-Dist: rustworkx>=0.15.0
 Requires-Dist: sqlglot[rs]>=25.0.0
 Requires-Dist: typer>=0.9.0
+Provides-Extra: databricks
+Requires-Dist: databricks-sdk>=0.20.0; extra == 'databricks'
 Description-Content-Type: text/markdown
 
 # SQL Glider
@@ -40,6 +42,7 @@ SQL Glider provides powerful column-level and table-level lineage analysis for S
 
 - **Forward Lineage:** Trace output columns back to their source tables and columns
 - **Reverse Lineage:** Impact analysis - find which output columns are affected by a source column
+- **Query Dissection:** Decompose SQL into components (CTEs, subqueries, UNION branches) for unit testing
 - **Table Extraction:** List all tables in SQL files with usage type (INPUT/OUTPUT) and object type (TABLE/VIEW/CTE)
 - **Multi-level Tracing:** Automatically handles CTEs, subqueries, and complex expressions
 - **Graph-Based Lineage:** Build and query lineage graphs across thousands of SQL files
@@ -171,15 +174,32 @@ List all tables involved in SQL files with usage and type information:
 
 ```bash
 # List all tables in a SQL file
-uv run sqlglider tables query.sql
+uv run sqlglider tables overview query.sql
 
 # JSON output with detailed table info
-uv run sqlglider tables query.sql --output-format json
+uv run sqlglider tables overview query.sql --output-format json
 
 # Export to CSV
-uv run sqlglider tables query.sql --output-format csv --output-file tables.csv
+uv run sqlglider tables overview query.sql --output-format csv --output-file tables.csv
 ```
 
+### Pull DDL from Remote Catalogs
+
+Fetch DDL definitions from remote data catalogs (e.g., Databricks Unity Catalog):
+
+```bash
+# Pull DDL for all tables used in a SQL file (outputs to stdout)
+uv run sqlglider tables pull query.sql --catalog-type databricks
+
+# Save DDL files to a folder (one file per table)
+uv run sqlglider tables pull query.sql -c databricks -o ./ddl/
+
+# List available catalog providers
+uv run sqlglider tables pull --list
+```
+
+**Note:** Requires optional dependencies. Install with: `pip install sql-glider[databricks]`
+
 **Example Output (JSON):**
 ```json
 {
@@ -204,6 +224,94 @@ uv run sqlglider tables query.sql --output-format csv --output-file tables.csv
 - `CTE`: Common Table Expression (WITH clause)
 - `UNKNOWN`: Cannot determine type from SQL alone
 
+### Query Dissection
+
+Decompose SQL queries into constituent parts for unit testing and analysis:
+
+```bash
+# Dissect a SQL file (text output)
+uv run sqlglider dissect query.sql
+
+# JSON output with full component details
+uv run sqlglider dissect query.sql --output-format json
+
+# CSV output for spreadsheet analysis
+uv run sqlglider dissect query.sql --output-format csv
+
+# Export to file
+uv run sqlglider dissect query.sql -f json -o dissected.json
+
+# With templating support
+uv run sqlglider dissect query.sql --templater jinja --var schema=analytics
+
+# From stdin
+echo "WITH cte AS (SELECT id FROM users) SELECT * FROM cte" | uv run sqlglider dissect
+```
+
+**Example Input:**
+```sql
+WITH order_totals AS (
+    SELECT customer_id, SUM(amount) AS total
+    FROM orders
+    GROUP BY customer_id
+)
+INSERT INTO analytics.summary
+SELECT * FROM order_totals WHERE total > 100
+```
+
+**Example Output (JSON):**
+```json
+{
+  "queries": [{
+    "query_index": 0,
+    "statement_type": "INSERT",
+    "total_components": 3,
+    "components": [
+      {
+        "component_type": "CTE",
+        "component_index": 0,
+        "name": "order_totals",
+        "sql": "SELECT customer_id, SUM(amount) AS total FROM orders GROUP BY customer_id",
+        "is_executable": true,
+        "dependencies": [],
+        "location": "WITH clause"
+      },
+      {
+        "component_type": "TARGET_TABLE",
+        "component_index": 1,
+        "name": "analytics.summary",
+        "sql": "analytics.summary",
+        "is_executable": false,
+        "location": "INSERT INTO target"
+      },
+      {
+        "component_type": "SOURCE_QUERY",
+        "component_index": 2,
+        "sql": "SELECT * FROM order_totals WHERE total > 100",
+        "is_executable": true,
+        "dependencies": ["order_totals"],
+        "location": "INSERT source SELECT"
+      }
+    ]
+  }]
+}
+```
+
+**Extracted Component Types:**
+- `CTE`: Common Table Expressions from WITH clause
+- `MAIN_QUERY`: The primary SELECT statement
+- `SUBQUERY`: Nested SELECT in FROM clause
+- `SCALAR_SUBQUERY`: Single-value subquery in SELECT list, WHERE, HAVING
+- `TARGET_TABLE`: Output table for INSERT/CREATE/MERGE (not executable)
+- `SOURCE_QUERY`: SELECT within DML/DDL statements
+- `UNION_BRANCH`: Individual SELECT in UNION/UNION ALL
+
+**Use Cases:**
+- Unit test CTEs and subqueries individually
+- Extract DQL from CTAS, CREATE VIEW, INSERT statements
+- Analyze query structure and component dependencies
+- Break apart complex queries for understanding
+
 ### Different SQL Dialects
 
 ```bash
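The README section added above advertises unit testing as the main use for dissection. As a minimal sketch (not shipped with the package) of what that could look like, the JSON produced by `uv run sqlglider dissect query.sql -f json -o dissected.json` can drive a pytest check that every executable component is standalone-valid SQL. It assumes only the output schema shown above and uses sqlglot, which sql-glider already depends on:

```python
# Hypothetical test harness over `sqlglider dissect` JSON output.
# Assumes dissected.json was produced as shown in the README above.
import json

import sqlglot


def test_executable_components_parse():
    with open("dissected.json") as f:
        result = json.load(f)
    for query in result["queries"]:
        for component in query["components"]:
            if component["is_executable"]:
                # Each standalone component should parse as valid Spark SQL
                assert sqlglot.parse_one(component["sql"], dialect="spark") is not None
```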
@@ -475,7 +583,7 @@ Options:
 ### Tables Command
 
 ```
-sqlglider tables <sql_file> [OPTIONS]
+sqlglider tables overview <sql_file> [OPTIONS]
 
 Arguments:
   sql_file    Path to SQL file to analyze [required]
@@ -491,6 +599,66 @@ Options:
   --help               Show help message and exit
 ```
 
+```
+sqlglider tables pull <sql_file> [OPTIONS]
+
+Arguments:
+  sql_file    Path to SQL file to analyze [optional, reads from stdin if omitted]
+
+Options:
+  --catalog-type, -c   Catalog provider (e.g., 'databricks') [required if not in config]
+  --ddl-folder, -o     Output folder for DDL files [optional, outputs to stdout if omitted]
+  --dialect, -d        SQL dialect (spark, postgres, snowflake, etc.) [default: spark]
+  --templater, -t      Templater for SQL preprocessing (e.g., 'jinja', 'none') [optional]
+  --var, -v            Template variable in key=value format (repeatable) [optional]
+  --vars-file          Path to variables file (JSON or YAML) [optional]
+  --list, -l           List available catalog providers and exit
+  --help               Show help message and exit
+```
+
+**Databricks Setup:**
+
+Install the optional Databricks dependency:
+```bash
+pip install sql-glider[databricks]
+```
+
+Configure authentication (via environment variables or `sqlglider.toml`):
+```bash
+export DATABRICKS_HOST="https://your-workspace.cloud.databricks.com"
+export DATABRICKS_TOKEN="dapi..."
+export DATABRICKS_WAREHOUSE_ID="abc123..."
+```
+
+### Dissect Command
+
+```
+sqlglider dissect [sql_file] [OPTIONS]
+
+Arguments:
+  sql_file    Path to SQL file to analyze [optional, reads from stdin if omitted]
+
+Options:
+  --dialect, -d        SQL dialect (spark, postgres, snowflake, etc.) [default: spark]
+  --output-format, -f  Output format: 'text', 'json', or 'csv' [default: text]
+  --output-file, -o    Write output to file instead of stdout [optional]
+  --templater, -t      Templater for SQL preprocessing (e.g., 'jinja', 'none') [optional]
+  --var, -v            Template variable in key=value format (repeatable) [optional]
+  --vars-file          Path to variables file (JSON or YAML) [optional]
+  --help               Show help message and exit
+```
+
+**Output Fields:**
+- `component_type`: Type of component (CTE, MAIN_QUERY, SUBQUERY, etc.)
+- `component_index`: Sequential order within the query (0-based)
+- `name`: CTE name, subquery alias, or target table name
+- `sql`: The extracted SQL for this component
+- `is_executable`: Whether the component can run standalone (TARGET_TABLE is false)
+- `dependencies`: List of CTE names this component references
+- `location`: Human-readable context (e.g., "WITH clause", "FROM clause")
+- `depth`: Nesting level (0 = top-level)
+- `parent_index`: Index of parent component for nested components
+
 ### Graph Commands
 
 ```
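The `tables pull` reference added above notes that `--catalog-type` may come from config. Based on the configuration keys the Databricks provider reads (warehouse_id, profile, host, token; see sqlglider/catalog/databricks.py below) and the table name cited in its own error message, a `sqlglider.toml` block might plausibly look like this — treat the exact layout as an assumption:

```toml
# Hypothetical sqlglider.toml sketch; table name taken from the provider's
# error message, keys from DatabricksCatalog.configure() below.
[sqlglider.catalog.databricks]
warehouse_id = "abc123def456"  # required; or set DATABRICKS_WAREHOUSE_ID
profile = "dev-workspace"      # optional: profile from ~/.databrickscfg
# host = "https://your-workspace.cloud.databricks.com"  # optional
# token = "dapi..."                                     # optional (legacy)
```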
@@ -612,6 +780,10 @@ See [ARCHITECTURE.md](ARCHITECTURE.md) for detailed technical documentation.
 ```
 src/sqlglider/
 ├── cli.py              # Typer CLI entry point
+├── dissection/
+│   ├── analyzer.py     # DissectionAnalyzer for query decomposition
+│   ├── formatters.py   # Output formatters (text, JSON, CSV)
+│   └── models.py       # ComponentType, SQLComponent, QueryDissectionResult
 ├── graph/
 │   ├── builder.py      # Build graphs from SQL files
 │   ├── merge.py        # Merge multiple graphs
sql_glider-0.1.3.dist-info/RECORD ADDED

@@ -0,0 +1,34 @@
+sqlglider/__init__.py,sha256=gDf7s52dMcX7JuCZ1SLawcB1vb3U0yJCohu9RQAATBY,125
+sqlglider/_version.py,sha256=q5nF98G8SoVeJqaknL0xdyxtv0egsqb0fK06_84Izu8,704
+sqlglider/cli.py,sha256=9sweHRVLk2iBSzCzT2Gcj8y1g1XKzq26iApQsMaFbx4,51786
+sqlglider/global_models.py,sha256=2vyJXAuXOsXQpE-D3F0ejj7eR9z0nDWFjTkielhzM8k,356
+sqlglider/catalog/__init__.py,sha256=2PqFPyzFXJ14FpSUcBmVK2L-a_ypWQHAbHFHxLDk_LE,814
+sqlglider/catalog/base.py,sha256=R7htHC43InpH4uRjYk33dMYYji6oylHns7Ye_mgfjJE,3116
+sqlglider/catalog/databricks.py,sha256=Ho1crIKv1bw-fXkWUhQhcKfiYQEGinGSxBS2zoVLB3o,9504
+sqlglider/catalog/registry.py,sha256=KD1XrvK46xSrK5IikzbdbTSk_-wwRTXvyBxXn3m-Rx0,3391
+sqlglider/dissection/__init__.py,sha256=ObXM7AXTJZvheIg36ps9KuFsXPV7WmWamaA4xPfxP4s,396
+sqlglider/dissection/analyzer.py,sha256=-GD3-lTbfBthq1BW6HiDjvJx2y4LDmnUVHIVIb0HqrU,27085
+sqlglider/dissection/formatters.py,sha256=M7gsmTNljRIeLIRv4D0vHvqJVrTqWSpsg7vem83zSzY,7302
+sqlglider/dissection/models.py,sha256=RRD3RIteqbUBY6e-74skKDvMH3qeAUaqA2sFcrjP5GQ,3618
+sqlglider/graph/__init__.py,sha256=4DDdrPM75CmeQWt7wHdBsjCm1s70BHGLYdijIbaUEKY,871
+sqlglider/graph/builder.py,sha256=WdMUwKlB6UGtr7CA-J5Lj7D2GMQJZzteDetzr7Pe4Kk,11916
+sqlglider/graph/merge.py,sha256=uUZlm4BN3S9gRL66Cc2mzhbtuh4SVAv2n4cN4eUEQBU,4077
+sqlglider/graph/models.py,sha256=EYmjv_WzDSNp_WfhJ6H-qBIOkAcoNKS7GRUryfKrHuY,9330
+sqlglider/graph/query.py,sha256=LHU8Cvn7ZPPSEnqdDn2pF8f1_LQjIvNIrZqs8cFlb6U,9433
+sqlglider/graph/serialization.py,sha256=7JJo31rwSlxnDhdqdTJdK4Dr_ZcSYetXfx3_CmndSac,2662
+sqlglider/lineage/__init__.py,sha256=llXMeI5_PIZaiBo8tKk3-wOubF4m_6QBHbn1FtWxT7k,256
+sqlglider/lineage/analyzer.py,sha256=HyyjGMP7VvEmvt-V-qT48C-41Usj2OmT5FPYYKdJsSs,48218
+sqlglider/lineage/formatters.py,sha256=_Y9wcTX4JXn1vVnZ1xI656g1FF2rMjcAVc-GHjbd9QA,10389
+sqlglider/templating/__init__.py,sha256=g3_wb6rSDI0usq2UUMDpn-J5kVwlAw3NtLdwbxL6UHs,1435
+sqlglider/templating/base.py,sha256=y5bWAW7qXl_4pPyo5KycfHwNVvt1-7slZ63DAsvTE1s,2902
+sqlglider/templating/jinja.py,sha256=o01UG72N4G1-tOT5LKK1Wkccv4nJH2VN4VFaMi5c1-g,5220
+sqlglider/templating/registry.py,sha256=BJU3N2qNVMTUtkgbibyqo8Wme_acXQRw5XI-6ZVgyac,3476
+sqlglider/templating/variables.py,sha256=5593PtLBcOxsnMCSRm2pGAD5I0Y9f__VV3_J_HfXVlQ,8010
+sqlglider/utils/__init__.py,sha256=KGp9-UzKz_OFBOTFoSy-g-NXDZsvyWXG_9-1zcC6ePE,276
+sqlglider/utils/config.py,sha256=iNJgSXFw3pmL2MCdvW3SJp4X2T3AQP2QyQuXIXT-6H0,4761
+sqlglider/utils/file_utils.py,sha256=5_ff28E0r1R7emZzsOnRuHd-7zIX6873eyr1SuPEr4E,1093
+sql_glider-0.1.3.dist-info/METADATA,sha256=D83HzMc1l3AHbnR3Y9aOzwTbp6yMKR9tQMXWTWkH1Sw,28445
+sql_glider-0.1.3.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
+sql_glider-0.1.3.dist-info/entry_points.txt,sha256=HDuakHqHS5C0HFKsMIxMYmDU7-BLBGrnIJcYaVRu-s0,251
+sql_glider-0.1.3.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+sql_glider-0.1.3.dist-info/RECORD,,
{sql_glider-0.1.2.dist-info → sql_glider-0.1.3.dist-info}/entry_points.txt CHANGED

@@ -1,6 +1,9 @@
 [console_scripts]
 sqlglider = sqlglider.cli:app
 
+[sqlglider.catalogs]
+databricks = sqlglider.catalog.databricks:DatabricksCatalog
+
 [sqlglider.templaters]
 jinja = sqlglider.templating.jinja:JinjaTemplater
 none = sqlglider.templating.base:NoOpTemplater
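The new `[sqlglider.catalogs]` entry-point group is how providers are found — the base-class docstring below states that catalogs "are discovered via entry points". As a standard-library sketch of that discovery (assuming Python 3.10+ for the `group=` keyword; the group name and the DatabricksCatalog target come straight from the diff above):

```python
from importlib.metadata import entry_points

# Enumerate everything registered under the sqlglider.catalogs group; with
# sql-glider[databricks] installed this yields the DatabricksCatalog entry
# declared above.
for ep in entry_points(group="sqlglider.catalogs"):
    catalog_cls = ep.load()  # e.g. sqlglider.catalog.databricks:DatabricksCatalog
    catalog = catalog_cls()
    print(f"{ep.name} -> {catalog.name}")  # databricks -> databricks
```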
sqlglider/_version.py CHANGED

@@ -28,7 +28,7 @@ version_tuple: VERSION_TUPLE
 commit_id: COMMIT_ID
 __commit_id__: COMMIT_ID
 
-__version__ = version = '0.1.2'
-__version_tuple__ = version_tuple = (0, 1, 2)
+__version__ = version = '0.1.3'
+__version_tuple__ = version_tuple = (0, 1, 3)
 
 __commit_id__ = commit_id = None
sqlglider/catalog/__init__.py ADDED

@@ -0,0 +1,30 @@
+"""Catalog module for fetching DDL from remote data catalogs.
+
+This module provides a plugin system for connecting to various data catalogs
+(e.g., Databricks Unity Catalog) and fetching table DDL definitions.
+
+Example:
+    >>> from sqlglider.catalog import get_catalog, list_catalogs
+    >>> print(list_catalogs())
+    ['databricks']
+    >>> catalog = get_catalog("databricks")
+    >>> catalog.configure({"warehouse_id": "abc123"})
+    >>> ddl = catalog.get_ddl("my_catalog.my_schema.my_table")
+"""
+
+from sqlglider.catalog.base import Catalog, CatalogError
+from sqlglider.catalog.registry import (
+    clear_registry,
+    get_catalog,
+    list_catalogs,
+    register_catalog,
+)
+
+__all__ = [
+    "Catalog",
+    "CatalogError",
+    "get_catalog",
+    "list_catalogs",
+    "register_catalog",
+    "clear_registry",
+]
sqlglider/catalog/base.py ADDED

@@ -0,0 +1,99 @@
+"""Base classes for catalog system.
+
+This module defines the abstract interface for catalog providers and provides
+the exception class for catalog-related errors.
+"""
+
+from abc import ABC, abstractmethod
+from typing import Any, Dict, List, Optional
+
+
+class CatalogError(Exception):
+    """Exception raised when catalog operations fail."""
+
+    pass
+
+
+class Catalog(ABC):
+    """Abstract base class for catalog providers.
+
+    All catalog implementations must inherit from this class and implement
+    the required methods. Catalogs are discovered via entry points and
+    can be used to fetch DDL definitions from remote data catalogs.
+
+    Example:
+        >>> class MyCatalog(Catalog):
+        ...     @property
+        ...     def name(self) -> str:
+        ...         return "my-catalog"
+        ...
+        ...     def get_ddl(self, table_name: str) -> str:
+        ...         # Fetch DDL from remote catalog
+        ...         return "CREATE TABLE ..."
+        ...
+        ...     def get_ddl_batch(self, table_names: List[str]) -> Dict[str, str]:
+        ...         return {name: self.get_ddl(name) for name in table_names}
+    """
+
+    @property
+    @abstractmethod
+    def name(self) -> str:
+        """Return the catalog provider name.
+
+        This name is used to identify the catalog in configuration
+        and CLI options.
+
+        Returns:
+            The unique name of this catalog provider.
+        """
+        pass
+
+    @abstractmethod
+    def get_ddl(self, table_name: str) -> str:
+        """Fetch DDL for a single table from the remote catalog.
+
+        Args:
+            table_name: The fully qualified table name (e.g., "catalog.schema.table").
+
+        Returns:
+            The DDL statement for creating the table.
+
+        Raises:
+            CatalogError: If the DDL cannot be fetched (table not found,
+                authentication failure, network error, etc.).
+        """
+        pass
+
+    @abstractmethod
+    def get_ddl_batch(self, table_names: List[str]) -> Dict[str, str]:
+        """Fetch DDL for multiple tables from the remote catalog.
+
+        This method may be more efficient than calling get_ddl() multiple
+        times, as implementations can batch requests where supported.
+
+        Args:
+            table_names: List of fully qualified table names.
+
+        Returns:
+            Dictionary mapping table names to their DDL statements.
+            Tables that couldn't be found will have None as their value.
+
+        Raises:
+            CatalogError: If the batch operation fails entirely.
+        """
+        pass
+
+    def configure(self, config: Optional[Dict[str, Any]] = None) -> None:
+        """Configure the catalog with provider-specific settings.
+
+        This method is called after instantiation to pass configuration
+        from sqlglider.toml or environment variables.
+
+        Args:
+            config: Provider-specific configuration dictionary.
+                Keys and values depend on the catalog implementation.
+
+        Raises:
+            CatalogError: If required configuration is missing or invalid.
+        """
+        pass
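To make the contract above concrete, here is an illustrative implementation (hypothetical, not part of sql-glider) that serves DDL from local `.sql` files — useful as a test double when no remote catalog is available. It covers the three abstract members plus `configure()`:

```python
# Hypothetical Catalog implementation against the ABC above; not shipped code.
from pathlib import Path
from typing import Any, Dict, List, Optional

from sqlglider.catalog.base import Catalog, CatalogError


class LocalFileCatalog(Catalog):
    """Reads `<table_name>.sql` files from a configured folder."""

    def __init__(self) -> None:
        self._root: Optional[Path] = None

    @property
    def name(self) -> str:
        return "local-files"

    def configure(self, config: Optional[Dict[str, Any]] = None) -> None:
        config = config or {}
        root = config.get("root")
        if not root:
            raise CatalogError("LocalFileCatalog requires a 'root' folder.")
        self._root = Path(root)

    def get_ddl(self, table_name: str) -> str:
        if self._root is None:
            raise CatalogError("Catalog not configured. Call configure() first.")
        path = self._root / f"{table_name}.sql"
        if not path.exists():
            raise CatalogError(f"No DDL file found for '{table_name}'")
        return path.read_text()

    def get_ddl_batch(self, table_names: List[str]) -> Dict[str, str]:
        return {name: self.get_ddl(name) for name in table_names}
```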
sqlglider/catalog/databricks.py ADDED

@@ -0,0 +1,255 @@
+"""Databricks catalog implementation.
+
+This module provides integration with Databricks Unity Catalog for fetching
+table DDL definitions using the Databricks SDK.
+
+Requires the optional 'databricks' dependency:
+    pip install sql-glider[databricks]
+"""
+
+import os
+from typing import Any, Dict, List, Optional
+
+from sqlglider.catalog.base import Catalog, CatalogError
+
+# Lazy import to avoid requiring databricks-sdk unless actually used
+_databricks_sdk_available: Optional[bool] = None
+
+
+def _check_databricks_sdk() -> None:
+    """Check if databricks-sdk is installed."""
+    global _databricks_sdk_available
+    if _databricks_sdk_available is None:
+        try:
+            import databricks.sdk  # noqa: F401
+
+            _databricks_sdk_available = True
+        except ImportError:
+            _databricks_sdk_available = False
+
+    if not _databricks_sdk_available:
+        raise CatalogError(
+            "The 'databricks-sdk' package is required for Databricks catalog support. "
+            "Install it with: pip install sql-glider[databricks]"
+        )
+
+
+class DatabricksCatalog(Catalog):
+    """Databricks Unity Catalog provider.
+
+    Fetches table DDL using the Databricks SDK's statement execution API.
+
+    Authentication:
+        Authentication is handled by the Databricks SDK's unified authentication,
+        which automatically tries multiple methods in order:
+
+        1. Direct configuration (host + token in sqlglider.toml)
+        2. Environment variables (DATABRICKS_HOST, DATABRICKS_TOKEN, etc.)
+        3. Databricks CLI profile (~/.databrickscfg) - use 'profile' config option
+        4. Azure CLI authentication (for Azure Databricks)
+        5. Google Cloud authentication (for GCP Databricks)
+        6. OAuth M2M (client credentials) via environment variables:
+           - DATABRICKS_CLIENT_ID
+           - DATABRICKS_CLIENT_SECRET
+
+        For OAuth M2M, set these environment variables:
+            export DATABRICKS_HOST=https://your-workspace.cloud.databricks.com
+            export DATABRICKS_CLIENT_ID=your-client-id
+            export DATABRICKS_CLIENT_SECRET=your-client-secret
+
+        For Databricks CLI profile, either:
+            - Configure DEFAULT profile in ~/.databrickscfg
+            - Set profile name in sqlglider.toml: profile = "my-profile"
+
+    Configuration:
+        - warehouse_id (required): SQL warehouse ID for statement execution
+        - profile (optional): Databricks CLI profile name from ~/.databrickscfg
+        - host (optional): Databricks workspace URL
+        - token (optional): Personal access token (legacy, prefer OAuth)
+
+    Example:
+        >>> # Using environment variables or CLI profile (recommended)
+        >>> catalog = DatabricksCatalog()
+        >>> catalog.configure({"warehouse_id": "abc123def456"})
+        >>> ddl = catalog.get_ddl("my_catalog.my_schema.my_table")
+
+        >>> # Using specific CLI profile
+        >>> catalog.configure({"warehouse_id": "abc123", "profile": "dev-workspace"})
+    """
+
+    def __init__(self) -> None:
+        """Initialize the Databricks catalog."""
+        self._warehouse_id: Optional[str] = None
+        self._profile: Optional[str] = None
+        self._host: Optional[str] = None
+        self._token: Optional[str] = None
+        self._client: Any = None
+
+    @property
+    def name(self) -> str:
+        """Return the catalog provider name."""
+        return "databricks"
+
+    def configure(self, config: Optional[Dict[str, Any]] = None) -> None:
+        """Configure the Databricks catalog.
+
+        Args:
+            config: Configuration dictionary with optional keys:
+                - warehouse_id: SQL warehouse ID (required, or set DATABRICKS_WAREHOUSE_ID)
+                - profile: Databricks CLI profile name from ~/.databrickscfg
+                - host: Databricks workspace URL (only needed if not using profile/env)
+                - token: Personal access token (legacy, prefer OAuth or profile)
+
+        Raises:
+            CatalogError: If warehouse_id is not provided and not in environment.
+        """
+        config = config or {}
+
+        # Get warehouse_id from config or environment
+        self._warehouse_id = config.get("warehouse_id") or os.environ.get(
+            "DATABRICKS_WAREHOUSE_ID"
+        )
+        if not self._warehouse_id:
+            raise CatalogError(
+                "Databricks warehouse_id is required. "
+                "Set it in sqlglider.toml under [sqlglider.catalog.databricks] "
+                "or via the DATABRICKS_WAREHOUSE_ID environment variable."
+            )
+
+        # Get optional profile for CLI profile-based auth
+        self._profile = config.get("profile")
+
+        # Get optional host and token - only from config, not env vars
+        # Let the SDK handle env var discovery for better unified auth support
+        self._host = config.get("host")
+        self._token = config.get("token")
+
+        # Reset client so it gets recreated with new config
+        self._client = None
+
+    def _get_client(self) -> Any:
+        """Get or create the Databricks WorkspaceClient.
+
+        The SDK uses unified authentication, trying methods in this order:
+        1. Explicit host/token if provided in config
+        2. Profile from ~/.databrickscfg if specified
+        3. Environment variables (DATABRICKS_HOST, DATABRICKS_TOKEN, etc.)
+        4. OAuth M2M via DATABRICKS_CLIENT_ID/DATABRICKS_CLIENT_SECRET
+        5. Azure CLI / Google Cloud auth for cloud-hosted workspaces
+
+        Returns:
+            The WorkspaceClient instance.
+
+        Raises:
+            CatalogError: If the SDK is not installed or authentication fails.
+        """
+        _check_databricks_sdk()
+
+        if self._client is None:
+            from databricks.sdk import WorkspaceClient
+
+            # Build kwargs for WorkspaceClient
+            # Only pass values that are explicitly configured
+            # Let SDK handle env var discovery for unified auth
+            kwargs: Dict[str, Any] = {}
+            if self._profile:
+                kwargs["profile"] = self._profile
+            if self._host:
+                kwargs["host"] = self._host
+            if self._token:
+                kwargs["token"] = self._token
+
+            try:
+                self._client = WorkspaceClient(**kwargs)
+            except Exception as e:
+                raise CatalogError(
+                    f"Failed to authenticate with Databricks: {e}"
+                ) from e
+
+        return self._client
+
+    def get_ddl(self, table_name: str) -> str:
+        """Fetch DDL for a single table from Databricks.
+
+        Uses SHOW CREATE TABLE to get the full DDL statement.
+
+        Args:
+            table_name: The fully qualified table name (catalog.schema.table).
+
+        Returns:
+            The CREATE TABLE DDL statement.
+
+        Raises:
+            CatalogError: If the table is not found or the query fails.
+        """
+        if not self._warehouse_id:
+            raise CatalogError(
+                "Catalog not configured. Call configure() with warehouse_id first."
+            )
+
+        client = self._get_client()
+
+        try:
+            # Execute SHOW CREATE TABLE statement
+            response = client.statement_execution.execute_statement(
+                warehouse_id=self._warehouse_id,
+                statement=f"SHOW CREATE TABLE {table_name}",
+                wait_timeout="30s",
+            )
+
+            # Check for errors
+            if response.status and response.status.state:
+                state = response.status.state.value
+                if state == "FAILED":
+                    error_msg = (
+                        response.status.error.message
+                        if response.status.error
+                        else "Unknown error"
+                    )
+                    raise CatalogError(
+                        f"Failed to get DDL for '{table_name}': {error_msg}"
+                    )
+
+            # Extract DDL from result
+            if response.result and response.result.data_array:
+                # SHOW CREATE TABLE returns a single row with the DDL
+                ddl_parts = []
+                for row in response.result.data_array:
+                    if row:
+                        ddl_parts.append(str(row[0]))
+                return "\n".join(ddl_parts)
+
+            raise CatalogError(f"No DDL returned for table '{table_name}'")
+
+        except CatalogError:
+            raise
+        except Exception as e:
+            raise CatalogError(f"Failed to fetch DDL for '{table_name}': {e}") from e
+
+    def get_ddl_batch(self, table_names: List[str]) -> Dict[str, str]:
+        """Fetch DDL for multiple tables from Databricks.
+
+        Currently executes individual queries for each table.
+        Future optimization could use parallel execution.
+
+        Args:
+            table_names: List of fully qualified table names.
+
+        Returns:
+            Dictionary mapping table names to their DDL statements.
+            Tables that couldn't be found will have error messages as values
+            prefixed with "ERROR: ".
+
+        Raises:
+            CatalogError: If the batch operation fails entirely.
+        """
+        results: Dict[str, str] = {}
+
+        for table_name in table_names:
+            try:
+                results[table_name] = self.get_ddl(table_name)
+            except CatalogError as e:
+                # Store error message for this table but continue with others
+                results[table_name] = f"ERROR: {e}"
+
+        return results
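Putting the new provider together, here is a short usage sketch against only the APIs shown above — fetch DDL for a few tables and write one file per table, roughly what `sqlglider tables pull -c databricks -o ./ddl/` does at the CLI level. The warehouse id and table names are placeholders:

```python
# Usage sketch for DatabricksCatalog; warehouse id and table names are
# placeholders, and authentication is left to the SDK's unified auth.
from pathlib import Path

from sqlglider.catalog.databricks import DatabricksCatalog

catalog = DatabricksCatalog()
catalog.configure({"warehouse_id": "abc123def456"})

tables = ["my_catalog.my_schema.orders", "my_catalog.my_schema.customers"]
out_dir = Path("./ddl")
out_dir.mkdir(exist_ok=True)

for table, ddl in catalog.get_ddl_batch(tables).items():
    # get_ddl_batch stores "ERROR: ..." strings for tables it couldn't fetch
    if ddl.startswith("ERROR: "):
        print(f"Skipping {table}: {ddl}")
        continue
    (out_dir / f"{table}.sql").write_text(ddl)
```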