duckrun 0.2.11.dev0__tar.gz → 0.2.13.dev0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {duckrun-0.2.11.dev0 → duckrun-0.2.13.dev0}/PKG-INFO +74 -65
- {duckrun-0.2.11.dev0 → duckrun-0.2.13.dev0}/README.md +73 -64
- {duckrun-0.2.11.dev0 → duckrun-0.2.13.dev0}/duckrun/auth.py +38 -14
- {duckrun-0.2.11.dev0 → duckrun-0.2.13.dev0}/duckrun/core.py +0 -26
- {duckrun-0.2.11.dev0 → duckrun-0.2.13.dev0}/duckrun.egg-info/PKG-INFO +74 -65
- {duckrun-0.2.11.dev0 → duckrun-0.2.13.dev0}/duckrun.egg-info/requires.txt +1 -1
- {duckrun-0.2.11.dev0 → duckrun-0.2.13.dev0}/pyproject.toml +2 -2
- {duckrun-0.2.11.dev0 → duckrun-0.2.13.dev0}/LICENSE +0 -0
- {duckrun-0.2.11.dev0 → duckrun-0.2.13.dev0}/duckrun/__init__.py +0 -0
- {duckrun-0.2.11.dev0 → duckrun-0.2.13.dev0}/duckrun/files.py +0 -0
- {duckrun-0.2.11.dev0 → duckrun-0.2.13.dev0}/duckrun/lakehouse.py +0 -0
- {duckrun-0.2.11.dev0 → duckrun-0.2.13.dev0}/duckrun/runner.py +0 -0
- {duckrun-0.2.11.dev0 → duckrun-0.2.13.dev0}/duckrun/semantic_model.py +0 -0
- {duckrun-0.2.11.dev0 → duckrun-0.2.13.dev0}/duckrun/stats.py +0 -0
- {duckrun-0.2.11.dev0 → duckrun-0.2.13.dev0}/duckrun/writer.py +0 -0
- {duckrun-0.2.11.dev0 → duckrun-0.2.13.dev0}/duckrun.egg-info/SOURCES.txt +0 -0
- {duckrun-0.2.11.dev0 → duckrun-0.2.13.dev0}/duckrun.egg-info/dependency_links.txt +0 -0
- {duckrun-0.2.11.dev0 → duckrun-0.2.13.dev0}/duckrun.egg-info/top_level.txt +0 -0
- {duckrun-0.2.11.dev0 → duckrun-0.2.13.dev0}/setup.cfg +0 -0
PKG-INFO

````diff
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: duckrun
-Version: 0.2.11.dev0
+Version: 0.2.13.dev0
 Summary: Lakehouse task runner powered by DuckDB for Microsoft Fabric
 Author: mim
 License: MIT
@@ -10,7 +10,7 @@ Project-URL: Issues, https://github.com/djouallah/duckrun/issues
 Requires-Python: >=3.9
 Description-Content-Type: text/markdown
 License-File: LICENSE
-Requires-Dist: duckdb>=1.2.
+Requires-Dist: duckdb>=1.2.2
 Requires-Dist: deltalake<=0.18.2
 Requires-Dist: requests>=2.28.0
 Requires-Dist: obstore>=0.2.0
@@ -20,7 +20,7 @@ Dynamic: license-file
 
 <img src="https://raw.githubusercontent.com/djouallah/duckrun/main/duckrun.png" width="400" alt="Duckrun">
 
-A helper package for
+A helper package for working with Microsoft Fabric lakehouses - orchestration, SQL queries, and file management powered by DuckDB.
 
 ## Important Notes
 
@@ -28,7 +28,6 @@ A helper package for stuff that made my life easier when working with Fabric Pyt
 - Lakehouse must have a schema (e.g., `dbo`, `sales`, `analytics`)
 - **Workspace names with spaces are fully supported!** ✅
 
-
 **Delta Lake Version:** This package uses an older version of deltalake to maintain row size control capabilities, which is crucial for Power BI performance optimization. The newer Rust-based deltalake versions don't yet support the row group size parameters that are essential for optimal DirectLake performance.
 
 ## What It Does
@@ -40,12 +39,15 @@ It does orchestration, arbitrary SQL statements, and file manipulation. That's i
 ```bash
 pip install duckrun
 ```
-
+
+For local usage (requires Azure CLI or interactive browser auth):
 
 ```bash
 pip install duckrun[local]
 ```
 
+Note: When running locally, your internet speed will be the main bottleneck.
+
 ## Quick Start
 
 ### Simple Example for New Users
@@ -163,9 +165,6 @@ con.sql("""
     GROUP BY customer_id
 """).write.mode("overwrite").saveAsTable("customer_totals")
 
-# Append mode
-con.sql("SELECT * FROM new_orders").write.mode("append").saveAsTable("orders")
-
 # Schema evolution and partitioning (exact Spark API compatibility)
 con.sql("""
     SELECT
@@ -324,6 +323,73 @@ pipeline = [
 
 ## Advanced Features
 
+### SQL Lookup Functions
+
+Duckrun automatically registers helper functions that allow you to resolve workspace and lakehouse names from GUIDs directly in SQL queries. These are especially useful when working with storage logs or audit data that contains workspace/lakehouse IDs.
+
+**Available Functions:**
+
+```python
+con = duckrun.connect("workspace/lakehouse.lakehouse/dbo")
+
+# ID → Name lookups (most common use case)
+con.sql("""
+    SELECT
+        workspace_id,
+        get_workspace_name(workspace_id) as workspace_name,
+        lakehouse_id,
+        get_lakehouse_name(workspace_id, lakehouse_id) as lakehouse_name
+    FROM storage_logs
+""").show()
+
+# Name → ID lookups (reverse)
+con.sql("""
+    SELECT
+        workspace_name,
+        get_workspace_id_from_name(workspace_name) as workspace_id,
+        lakehouse_name,
+        get_lakehouse_id_from_name(workspace_id, lakehouse_name) as lakehouse_id
+    FROM configuration_table
+""").show()
+```
+
+**Function Reference:**
+
+- `get_workspace_name(workspace_id)` - Convert workspace GUID to display name
+- `get_lakehouse_name(workspace_id, lakehouse_id)` - Convert lakehouse GUID to display name
+- `get_workspace_id_from_name(workspace_name)` - Convert workspace name to GUID
+- `get_lakehouse_id_from_name(workspace_id, lakehouse_name)` - Convert lakehouse name to GUID
+
+**Features:**
+- ✅ **Automatic Caching**: Results are cached to avoid repeated API calls
+- ✅ **NULL on Error**: Returns `NULL` instead of errors for missing or inaccessible items
+- ✅ **Fabric API Integration**: Resolves names using Microsoft Fabric REST API
+- ✅ **Always Available**: Functions are automatically registered on connection
+
+**Example Use Case:**
+
+```python
+# Enrich OneLake storage logs with friendly names
+con = duckrun.connect("Analytics/Monitoring.lakehouse/dbo")
+
+result = con.sql("""
+    SELECT
+        workspace_id,
+        get_workspace_name(workspace_id) as workspace_name,
+        lakehouse_id,
+        get_lakehouse_name(workspace_id, lakehouse_id) as lakehouse_name,
+        operation_name,
+        COUNT(*) as operation_count,
+        SUM(bytes_transferred) as total_bytes
+    FROM onelake_storage_logs
+    WHERE log_date = CURRENT_DATE
+    GROUP BY ALL
+    ORDER BY workspace_name, lakehouse_name
+""").show()
+```
+
+This makes it easy to create human-readable reports from GUID-based log data!
+
 ### Schema Evolution & Partitioning
 
 Handle evolving schemas and optimize query performance with partitioning:
@@ -467,63 +533,6 @@ con = duckrun.connect(
 )
 ```
 
-## File Management API Reference
-
-### `copy(local_folder, remote_folder, file_extensions=None, overwrite=False)`
-
-Upload files from a local folder to OneLake Files section.
-
-**Parameters:**
-- `local_folder` (str): Path to local folder containing files to upload
-- `remote_folder` (str): **Required** target folder path in OneLake Files
-- `file_extensions` (list, optional): Filter by file extensions (e.g., `['.csv', '.parquet']`)
-- `overwrite` (bool, optional): Whether to overwrite existing files (default: False)
-
-**Returns:** `True` if all files uploaded successfully, `False` otherwise
-
-**Examples:**
-```python
-# Upload all files to a target folder
-con.copy("./data", "processed_data")
-
-# Upload only CSV and Parquet files
-con.copy("./reports", "monthly_reports", ['.csv', '.parquet'])
-
-# Upload with overwrite enabled
-con.copy("./backup", "daily_backup", overwrite=True)
-```
-
-### `download(remote_folder="", local_folder="./downloaded_files", file_extensions=None, overwrite=False)`
-
-Download files from OneLake Files section to a local folder.
-
-**Parameters:**
-- `remote_folder` (str, optional): Source folder path in OneLake Files (default: root)
-- `local_folder` (str, optional): Local destination folder (default: "./downloaded_files")
-- `file_extensions` (list, optional): Filter by file extensions (e.g., `['.csv', '.json']`)
-- `overwrite` (bool, optional): Whether to overwrite existing local files (default: False)
-
-**Returns:** `True` if all files downloaded successfully, `False` otherwise
-
-**Examples:**
-```python
-# Download all files from OneLake Files root
-con.download()
-
-# Download from specific folder
-con.download("processed_data", "./local_data")
-
-# Download only JSON files
-con.download("config", "./configs", ['.json'])
-```
-
-**Important Notes:**
-- Files are uploaded/downloaded to/from the **OneLake Files section**, not Delta Tables
-- The `remote_folder` parameter is **required** for uploads to prevent accidental uploads
-- Both methods default to `overwrite=False` for safety
-- Folder structure is preserved during upload/download operations
-- Progress is reported with file names, sizes, and upload/download status
-
 ## Complete Example
 
 ```python
````
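The new "SQL Lookup Functions" section above documents caching and NULL-on-error behavior, and the duckrun/core.py hunks further down show these functions being registered with `con.create_function`. As a rough illustration of that pattern only (a standalone sketch, not duckrun's actual implementation; `_resolve_workspace_name` is a hypothetical stand-in for the Fabric REST call), a cached scalar UDF with NULL-on-error can be wired into DuckDB like this:

```python
from functools import lru_cache
from typing import Optional

import duckdb
from duckdb.typing import VARCHAR


@lru_cache(maxsize=None)
def _resolve_workspace_name(workspace_id: str) -> Optional[str]:
    # Hypothetical stand-in for the Fabric REST API lookup duckrun performs.
    return None


def get_workspace_name(workspace_id: str) -> Optional[str]:
    try:
        # lru_cache means repeated rows with the same GUID trigger one lookup.
        return _resolve_workspace_name(workspace_id)
    except Exception:
        return None  # surfaces as SQL NULL instead of raising inside the query


con = duckdb.connect()
con.create_function("get_workspace_name", get_workspace_name, [VARCHAR], VARCHAR)
con.sql("SELECT get_workspace_name('00000000-0000-0000-0000-000000000000') AS name").show()
```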
README.md

The README.md changes mirror the PKG-INFO diff above line for line (the package long description is the README), with hunk positions offset by the 20-line metadata header: the rewritten description line, the dropped blank line under the notes, the local-install guidance and bandwidth note, the removed append-mode snippet, the new "SQL Lookup Functions" section, and the removed "File Management API Reference" section. The only hunk unique to README.md is at the end of the file:

```diff
@@ -630,4 +639,4 @@ For a complete production example, see [fabric_demo](https://github.com/djoualla
 
 ## License
 
-MIT
+MIT
```

The `-MIT`/`+MIT` pair differs only invisibly (most likely a newline-at-end-of-file fix).
duckrun/auth.py

```diff
@@ -38,21 +38,16 @@ def get_token() -> Optional[str]:
     except Exception as e:
         print(f"⚠️ Fabric notebook authentication failed: {e}")
 
-    #
+    # Try local/VS Code authentication (Azure CLI + browser)
+    print("🖥️ Trying local authentication (Azure CLI + browser fallback)...")
+    token = _get_local_token()
+    if token:
+        return token
+
+    # If local auth failed, fall back to device code flow
+    print("🔐 Falling back to device code flow for remote/headless environment...")
     try:
-
-        try:
-            import google.colab
-            print("🚀 Google Colab detected - using device code flow")
-            return _get_device_code_token()
-        except ImportError:
-            pass
-
-        # For all other environments (including VS Code), try Azure CLI first
-        # This includes local development, VS Code notebooks, etc.
-        print("🖥️ Local/VS Code environment detected - trying Azure CLI first, then browser fallback")
-        return _get_local_token()
-
+        return _get_device_code_token()
     except Exception as e:
         print(f"❌ Authentication failed: {e}")
         print("💡 Try refreshing and running again, or check your Azure permissions")
@@ -82,6 +77,35 @@ def _get_device_code_token() -> Optional[str]:
     return None
 
 
+def _is_databricks() -> bool:
+    """Check if we're running in a Databricks environment"""
+    # Databricks sets specific environment variables
+    return (
+        os.environ.get("DATABRICKS_RUNTIME_VERSION") is not None or
+        os.environ.get("DB_HOME") is not None or
+        "databricks" in os.environ.get("SPARK_HOME", "").lower()
+    )
+
+
+def _get_databricks_token() -> Optional[str]:
+    """Get token using DefaultAzureCredential for Databricks environments"""
+    try:
+        from azure.identity import DefaultAzureCredential
+
+        # DefaultAzureCredential will automatically use Databricks managed identity
+        credential = DefaultAzureCredential()
+        token_obj = credential.get_token("https://storage.azure.com/.default")
+
+        os.environ["AZURE_STORAGE_TOKEN"] = token_obj.token
+        print("✅ Databricks authentication successful!")
+        return token_obj.token
+
+    except Exception as e:
+        print(f"❌ Databricks authentication failed: {e}")
+        print("💡 Make sure your Databricks cluster has the required Azure permissions")
+        return None
+
+
 def _get_local_token() -> Optional[str]:
     """Get token using CLI first, then browser fallback for local environments"""
     # First try Azure CLI directly
```
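After this patch, `get_token()` reads as a straight chain: notebook auth (earlier in the file, not shown in this hunk), then `_get_local_token()` (Azure CLI with browser fallback), then `_get_device_code_token()` as the last resort; the Google Colab special case is gone. The new `_is_databricks()` / `_get_databricks_token()` helpers are added here, though their call site is not visible in these hunks. A minimal sketch of the resulting provider-chain shape (stand-in stubs, not the real duckrun functions):

```python
from typing import Callable, List, Optional


def first_successful(providers: List[Callable[[], Optional[str]]]) -> Optional[str]:
    """Try each token provider in order and return the first non-empty token."""
    for provider in providers:
        try:
            token = provider()
        except Exception as exc:
            print(f"{provider.__name__} failed: {exc}")
            continue
        if token:
            return token
    return None


# Stand-ins mirroring the order in duckrun.auth.get_token() after this change:
# notebook auth -> local (Azure CLI + browser) -> device code flow.
def notebook_auth() -> Optional[str]: return None
def local_auth() -> Optional[str]: return None
def device_code_auth() -> Optional[str]: return None


token = first_successful([notebook_auth, local_auth, device_code_auth])
print("token acquired" if token else "all providers exhausted")
```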
duckrun/core.py

```diff
@@ -133,11 +133,8 @@ class Duckrun:
 
         # Check if it's a workspace-only connection (no "/" means workspace name only)
         if "/" not in connection_string:
-            print(f"Connecting to workspace '{connection_string}' for management operations...")
             return WorkspaceConnection(connection_string)
 
-        print("Connecting to Lakehouse...")
-
         scan_all_schemas = False
 
         # Parse lakehouse connection string: "ws/lh.lakehouse/schema" or "ws/lh.lakehouse"
@@ -195,17 +192,14 @@ class Duckrun:
         guid_pattern = re.compile(r'^[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}$', re.IGNORECASE)
 
         if guid_pattern.match(workspace_name) and guid_pattern.match(lakehouse_name):
-            print(f"✅ Names are already GUIDs: workspace={workspace_name}, lakehouse={lakehouse_name}")
             return workspace_name, lakehouse_name
 
         # Optimization: If workspace name has no spaces, use both names directly (old behavior)
         # Note: Lakehouse names cannot contain spaces in Microsoft Fabric, only workspace names can
         if " " not in workspace_name:
-            print(f"✅ Using names directly (workspace has no spaces): workspace={workspace_name}, lakehouse={lakehouse_name}")
             return workspace_name, lakehouse_name
 
         # Workspace name contains spaces - need to resolve both to GUIDs for proper ABFSS URLs
-        print(f"🔍 Resolving '{workspace_name}' workspace and '{lakehouse_name}' lakehouse to GUIDs (workspace has spaces)...")
 
         try:
             # Get authentication token using enhanced auth system
@@ -242,7 +236,6 @@ class Duckrun:
             if not lakehouse_id:
                 raise ValueError(f"Lakehouse '{lakehouse_name}' not found in workspace '{workspace_name}'")
 
-            print(f"✅ Resolved: {workspace_name} → {workspace_id}, {lakehouse_name} → {lakehouse_id}")
             return workspace_id, lakehouse_id
 
         except Exception as e:
@@ -388,7 +381,6 @@ class Duckrun:
                 tables_found.append((schema_name, table_name))
         else:
             # Scan specific schema only
-            print(f"🔍 Discovering tables in schema '{self.schema}'...")
             schema_path = f"{base_path}{self.schema}/"
             result = obs.list_with_delimiter(store, prefix=schema_path)
 
@@ -407,10 +399,6 @@ class Duckrun:
         tables = self._discover_tables_fast()
 
         if not tables:
-            if self.scan_all_schemas:
-                print(f"No Delta tables found in {self.lakehouse_name}/Tables/")
-            else:
-                print(f"No Delta tables found in {self.lakehouse_name}/Tables/{self.schema}/")
             return
 
         # Group tables by schema for display
@@ -420,12 +408,6 @@ class Duckrun:
                 schema_tables[schema_name] = []
             schema_tables[schema_name].append(table_name)
 
-        # Display tables by schema
-        print(f"\n📊 Found {len(tables)} tables:")
-        for schema_name in sorted(schema_tables.keys()):
-            table_list = sorted(schema_tables[schema_name])
-            print(f"    {schema_name}: {', '.join(table_list)}")
-
         attached_count = 0
         skipped_tables = []
 
@@ -447,16 +429,9 @@ class Duckrun:
             except Exception as e:
                 skipped_tables.append(f"{schema_name}.{table_name}")
                 continue
-
-            print(f"\n{'='*60}")
-            print(f"✅ Ready - {attached_count}/{len(tables)} tables available")
-            if skipped_tables:
-                print(f"⚠ Skipped {len(skipped_tables)} tables: {', '.join(skipped_tables[:3])}{'...' if len(skipped_tables) > 3 else ''}")
-            print(f"{'='*60}\n")
 
         except Exception as e:
             print(f"❌ Error attaching lakehouse: {e}")
-            print("Continuing without pre-attached tables.")
 
     def _register_lookup_functions(self):
         """
@@ -599,7 +574,6 @@ class Duckrun:
             self.con.create_function("get_lakehouse_name", get_lakehouse_name)
             self.con.create_function("get_workspace_id_from_name", get_workspace_id_from_name)
             self.con.create_function("get_lakehouse_id_from_name", get_lakehouse_id_from_name)
-            print("✅ Registered lookup functions: get_workspace_name, get_lakehouse_name, get_workspace_id_from_name, get_lakehouse_id_from_name")
         except Exception as e:
             print(f"⚠️ Warning: Could not register lookup functions: {e}")
 
```
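Every change in duckrun/core.py is the removal of a `print` call, so connection and table discovery now run quietly. If you relied on the old "Found N tables" summary, one option is to ask DuckDB for the overview yourself after connecting; this is a usage sketch, not a new duckrun API (`duckrun.connect` and `.sql(...).show()` are the documented calls, and `SHOW ALL TABLES` is plain DuckDB SQL):

```python
import duckrun

# connect() no longer prints the discovery/attachment summary in 0.2.13.dev0.
con = duckrun.connect("workspace/lakehouse.lakehouse/dbo")

# List the attached tables explicitly instead of relying on console output.
con.sql("SHOW ALL TABLES").show()
```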
duckrun.egg-info/PKG-INFO

The diff for duckrun.egg-info/PKG-INFO is identical to the top-level PKG-INFO diff above; the egg-info copy is regenerated from the same project metadata and README.
pyproject.toml

```diff
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "duckrun"
-version = "0.2.11.dev0"
+version = "0.2.13.dev0"
 description = "Lakehouse task runner powered by DuckDB for Microsoft Fabric"
 readme = "README.md"
 license = {text = "MIT"}
@@ -13,7 +13,7 @@ authors = [
 ]
 requires-python = ">=3.9"
 dependencies = [
-    "duckdb>=1.2.
+    "duckdb>=1.2.2",
     "deltalake<=0.18.2",
     "requests>=2.28.0",
     "obstore>=0.2.0"
```