duckrun 0.2.11.dev0__tar.gz → 0.2.13__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {duckrun-0.2.11.dev0 → duckrun-0.2.13}/PKG-INFO +74 -65
- {duckrun-0.2.11.dev0 → duckrun-0.2.13}/README.md +73 -64
- {duckrun-0.2.11.dev0 → duckrun-0.2.13}/duckrun/auth.py +9 -15
- {duckrun-0.2.11.dev0 → duckrun-0.2.13}/duckrun/core.py +7 -36
- {duckrun-0.2.11.dev0 → duckrun-0.2.13}/duckrun.egg-info/PKG-INFO +74 -65
- {duckrun-0.2.11.dev0 → duckrun-0.2.13}/duckrun.egg-info/requires.txt +1 -1
- {duckrun-0.2.11.dev0 → duckrun-0.2.13}/pyproject.toml +2 -2
- {duckrun-0.2.11.dev0 → duckrun-0.2.13}/LICENSE +0 -0
- {duckrun-0.2.11.dev0 → duckrun-0.2.13}/duckrun/__init__.py +0 -0
- {duckrun-0.2.11.dev0 → duckrun-0.2.13}/duckrun/files.py +0 -0
- {duckrun-0.2.11.dev0 → duckrun-0.2.13}/duckrun/lakehouse.py +0 -0
- {duckrun-0.2.11.dev0 → duckrun-0.2.13}/duckrun/runner.py +0 -0
- {duckrun-0.2.11.dev0 → duckrun-0.2.13}/duckrun/semantic_model.py +0 -0
- {duckrun-0.2.11.dev0 → duckrun-0.2.13}/duckrun/stats.py +0 -0
- {duckrun-0.2.11.dev0 → duckrun-0.2.13}/duckrun/writer.py +0 -0
- {duckrun-0.2.11.dev0 → duckrun-0.2.13}/duckrun.egg-info/SOURCES.txt +0 -0
- {duckrun-0.2.11.dev0 → duckrun-0.2.13}/duckrun.egg-info/dependency_links.txt +0 -0
- {duckrun-0.2.11.dev0 → duckrun-0.2.13}/duckrun.egg-info/top_level.txt +0 -0
- {duckrun-0.2.11.dev0 → duckrun-0.2.13}/setup.cfg +0 -0
{duckrun-0.2.11.dev0 → duckrun-0.2.13}/PKG-INFO +74 -65

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: duckrun
-Version: 0.2.11.dev0
+Version: 0.2.13
 Summary: Lakehouse task runner powered by DuckDB for Microsoft Fabric
 Author: mim
 License: MIT
@@ -10,7 +10,7 @@ Project-URL: Issues, https://github.com/djouallah/duckrun/issues
 Requires-Python: >=3.9
 Description-Content-Type: text/markdown
 License-File: LICENSE
-Requires-Dist: duckdb>=1.2.
+Requires-Dist: duckdb>=1.2.2
 Requires-Dist: deltalake<=0.18.2
 Requires-Dist: requests>=2.28.0
 Requires-Dist: obstore>=0.2.0
@@ -20,7 +20,7 @@ Dynamic: license-file
 
 <img src="https://raw.githubusercontent.com/djouallah/duckrun/main/duckrun.png" width="400" alt="Duckrun">
 
-A helper package for
+A helper package for working with Microsoft Fabric lakehouses - orchestration, SQL queries, and file management powered by DuckDB.
 
 ## Important Notes
 
@@ -28,7 +28,6 @@ A helper package for stuff that made my life easier when working with Fabric Pyt
 - Lakehouse must have a schema (e.g., `dbo`, `sales`, `analytics`)
 - **Workspace names with spaces are fully supported!** ✅
 
-
 **Delta Lake Version:** This package uses an older version of deltalake to maintain row size control capabilities, which is crucial for Power BI performance optimization. The newer Rust-based deltalake versions don't yet support the row group size parameters that are essential for optimal DirectLake performance.
 
 ## What It Does
@@ -40,12 +39,15 @@ It does orchestration, arbitrary SQL statements, and file manipulation. That's i
 ```bash
 pip install duckrun
 ```
-
+
+For local usage (requires Azure CLI or interactive browser auth):
 
 ```bash
 pip install duckrun[local]
 ```
 
+Note: When running locally, your internet speed will be the main bottleneck.
+
 ## Quick Start
 
 ### Simple Example for New Users
@@ -163,9 +165,6 @@ con.sql("""
     GROUP BY customer_id
 """).write.mode("overwrite").saveAsTable("customer_totals")
 
-# Append mode
-con.sql("SELECT * FROM new_orders").write.mode("append").saveAsTable("orders")
-
 # Schema evolution and partitioning (exact Spark API compatibility)
 con.sql("""
     SELECT
@@ -324,6 +323,73 @@ pipeline = [
 
 ## Advanced Features
 
+### SQL Lookup Functions
+
+Duckrun automatically registers helper functions that allow you to resolve workspace and lakehouse names from GUIDs directly in SQL queries. These are especially useful when working with storage logs or audit data that contains workspace/lakehouse IDs.
+
+**Available Functions:**
+
+```python
+con = duckrun.connect("workspace/lakehouse.lakehouse/dbo")
+
+# ID → Name lookups (most common use case)
+con.sql("""
+    SELECT
+        workspace_id,
+        get_workspace_name(workspace_id) as workspace_name,
+        lakehouse_id,
+        get_lakehouse_name(workspace_id, lakehouse_id) as lakehouse_name
+    FROM storage_logs
+""").show()
+
+# Name → ID lookups (reverse)
+con.sql("""
+    SELECT
+        workspace_name,
+        get_workspace_id_from_name(workspace_name) as workspace_id,
+        lakehouse_name,
+        get_lakehouse_id_from_name(workspace_id, lakehouse_name) as lakehouse_id
+    FROM configuration_table
+""").show()
+```
+
+**Function Reference:**
+
+- `get_workspace_name(workspace_id)` - Convert workspace GUID to display name
+- `get_lakehouse_name(workspace_id, lakehouse_id)` - Convert lakehouse GUID to display name
+- `get_workspace_id_from_name(workspace_name)` - Convert workspace name to GUID
+- `get_lakehouse_id_from_name(workspace_id, lakehouse_name)` - Convert lakehouse name to GUID
+
+**Features:**
+- ✅ **Automatic Caching**: Results are cached to avoid repeated API calls
+- ✅ **NULL on Error**: Returns `NULL` instead of errors for missing or inaccessible items
+- ✅ **Fabric API Integration**: Resolves names using Microsoft Fabric REST API
+- ✅ **Always Available**: Functions are automatically registered on connection
+
+**Example Use Case:**
+
+```python
+# Enrich OneLake storage logs with friendly names
+con = duckrun.connect("Analytics/Monitoring.lakehouse/dbo")
+
+result = con.sql("""
+    SELECT
+        workspace_id,
+        get_workspace_name(workspace_id) as workspace_name,
+        lakehouse_id,
+        get_lakehouse_name(workspace_id, lakehouse_id) as lakehouse_name,
+        operation_name,
+        COUNT(*) as operation_count,
+        SUM(bytes_transferred) as total_bytes
+    FROM onelake_storage_logs
+    WHERE log_date = CURRENT_DATE
+    GROUP BY ALL
+    ORDER BY workspace_name, lakehouse_name
+""").show()
+```
+
+This makes it easy to create human-readable reports from GUID-based log data!
+
 ### Schema Evolution & Partitioning
 
 Handle evolving schemas and optimize query performance with partitioning:
@@ -467,63 +533,6 @@ con = duckrun.connect(
 )
 ```
 
-## File Management API Reference
-
-### `copy(local_folder, remote_folder, file_extensions=None, overwrite=False)`
-
-Upload files from a local folder to OneLake Files section.
-
-**Parameters:**
-- `local_folder` (str): Path to local folder containing files to upload
-- `remote_folder` (str): **Required** target folder path in OneLake Files
-- `file_extensions` (list, optional): Filter by file extensions (e.g., `['.csv', '.parquet']`)
-- `overwrite` (bool, optional): Whether to overwrite existing files (default: False)
-
-**Returns:** `True` if all files uploaded successfully, `False` otherwise
-
-**Examples:**
-```python
-# Upload all files to a target folder
-con.copy("./data", "processed_data")
-
-# Upload only CSV and Parquet files
-con.copy("./reports", "monthly_reports", ['.csv', '.parquet'])
-
-# Upload with overwrite enabled
-con.copy("./backup", "daily_backup", overwrite=True)
-```
-
-### `download(remote_folder="", local_folder="./downloaded_files", file_extensions=None, overwrite=False)`
-
-Download files from OneLake Files section to a local folder.
-
-**Parameters:**
-- `remote_folder` (str, optional): Source folder path in OneLake Files (default: root)
-- `local_folder` (str, optional): Local destination folder (default: "./downloaded_files")
-- `file_extensions` (list, optional): Filter by file extensions (e.g., `['.csv', '.json']`)
-- `overwrite` (bool, optional): Whether to overwrite existing local files (default: False)
-
-**Returns:** `True` if all files downloaded successfully, `False` otherwise
-
-**Examples:**
-```python
-# Download all files from OneLake Files root
-con.download()
-
-# Download from specific folder
-con.download("processed_data", "./local_data")
-
-# Download only JSON files
-con.download("config", "./configs", ['.json'])
-```
-
-**Important Notes:**
-- Files are uploaded/downloaded to/from the **OneLake Files section**, not Delta Tables
-- The `remote_folder` parameter is **required** for uploads to prevent accidental uploads
-- Both methods default to `overwrite=False` for safety
-- Folder structure is preserved during upload/download operations
-- Progress is reported with file names, sizes, and upload/download status
-
 ## Complete Example
 
 ```python
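The "SQL Lookup Functions" section added above documents GUID-to-name helpers that are registered on the DuckDB connection (the core.py diff further down shows the `create_function` calls). As a rough, hypothetical sketch of that mechanism (not the package's actual code), here is how a cached Python resolver can be exposed as a DuckDB scalar function; the `fake_catalog` dictionary and the GUID are made up:

```python
import duckdb
from duckdb.typing import VARCHAR
from functools import lru_cache

@lru_cache(maxsize=None)
def get_workspace_name(workspace_id):
    # Hypothetical resolver: the real helper calls the Microsoft Fabric REST API
    # and returns None (-> SQL NULL) for missing or inaccessible workspaces.
    fake_catalog = {"11111111-1111-1111-1111-111111111111": "Analytics"}
    return fake_catalog.get(workspace_id)

con = duckdb.connect()
# Register the Python callable as a scalar SQL function; lru_cache gives the
# "resolve once, reuse everywhere" behaviour the README describes.
con.create_function("get_workspace_name", get_workspace_name, [VARCHAR], VARCHAR)

print(con.sql(
    "SELECT get_workspace_name('11111111-1111-1111-1111-111111111111') AS workspace_name"
).fetchall())  # [('Analytics',)]
```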
{duckrun-0.2.11.dev0 → duckrun-0.2.13}/README.md +73 -64

The README.md changes are the same content edits shown in the PKG-INFO body above, with hunk positions offset by the 20 metadata header lines: the one-line package description is expanded (@@ -1,6 +1,6 @@), a stray blank line is dropped from the notes (@@ -8,7 +8,6 @@), the "For local usage…" and "Note: When running locally…" lines are added around `pip install duckrun[local]` (@@ -20,12 +19,15 @@), the "# Append mode" example is removed (@@ -143,9 +145,6 @@), the new "### SQL Lookup Functions" section is added under "## Advanced Features" (@@ -304,6 +303,73 @@), and the "## File Management API Reference" section is removed (@@ -447,63 +513,6 @@). One hunk appears only in this file, a whitespace-level change to the final line (the visible text is unchanged):

@@ -630,4 +639,4 @@ For a complete production example, see [fabric_demo](https://github.com/djoualla
 
 ## License
 
-MIT
+MIT
{duckrun-0.2.11.dev0 → duckrun-0.2.13}/duckrun/auth.py +9 -15

@@ -20,7 +20,6 @@ def get_token() -> Optional[str]:
     # Check if we already have a cached token
     token_env = os.environ.get("AZURE_STORAGE_TOKEN")
     if token_env and token_env != "PLACEHOLDER_TOKEN_TOKEN_NOT_AVAILABLE":
-        print("✅ Using existing Azure Storage token")
         return token_env
 
     print("🔐 Starting Azure authentication...")
@@ -38,21 +37,16 @@ def get_token() -> Optional[str]:
     except Exception as e:
         print(f"⚠️ Fabric notebook authentication failed: {e}")
 
-    #
+    # Try local/VS Code authentication (Azure CLI + browser)
+    print("🖥️ Trying local authentication (Azure CLI + browser fallback)...")
+    token = _get_local_token()
+    if token:
+        return token
+
+    # If local auth failed, fall back to device code flow
+    print("🔐 Falling back to device code flow for remote/headless environment...")
     try:
-
-        try:
-            import google.colab
-            print("🚀 Google Colab detected - using device code flow")
-            return _get_device_code_token()
-        except ImportError:
-            pass
-
-        # For all other environments (including VS Code), try Azure CLI first
-        # This includes local development, VS Code notebooks, etc.
-        print("🖥️ Local/VS Code environment detected - trying Azure CLI first, then browser fallback")
-        return _get_local_token()
-
+        return _get_device_code_token()
     except Exception as e:
         print(f"❌ Authentication failed: {e}")
         print("💡 Try refreshing and running again, or check your Azure permissions")
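Net effect of the auth.py change: get_token() now runs a straight fall-through chain (cached AZURE_STORAGE_TOKEN, then Fabric notebook auth, then local Azure CLI/browser auth, then the device code flow) instead of branching on Google Colab detection. A minimal sketch of that control flow, with provider callables standing in for the `_get_local_token` / `_get_device_code_token` helpers from the diff (this is not the module's actual code):

```python
import os
from typing import Callable, Optional

def get_token_sketch(providers: list[Callable[[], Optional[str]]]) -> Optional[str]:
    """Illustrative fall-through: return the first token any provider yields."""
    cached = os.environ.get("AZURE_STORAGE_TOKEN")
    if cached and cached != "PLACEHOLDER_TOKEN_TOKEN_NOT_AVAILABLE":
        return cached  # reuse the cached token silently (the status print was dropped in 0.2.13)

    for provider in providers:
        try:
            token = provider()
            if token:
                return token
        except Exception as e:  # a failing provider just falls through to the next one
            print(f"auth provider {provider.__name__} failed: {e}")
    return None

# Usage with dummy providers ordered like the real chain:
# get_token_sketch([fabric_notebook_auth, local_cli_or_browser_auth, device_code_auth])
```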
{duckrun-0.2.11.dev0 → duckrun-0.2.13}/duckrun/core.py +7 -36

@@ -133,11 +133,8 @@ class Duckrun:
 
         # Check if it's a workspace-only connection (no "/" means workspace name only)
        if "/" not in connection_string:
-            print(f"Connecting to workspace '{connection_string}' for management operations...")
             return WorkspaceConnection(connection_string)
 
-        print("Connecting to Lakehouse...")
-
         scan_all_schemas = False
 
         # Parse lakehouse connection string: "ws/lh.lakehouse/schema" or "ws/lh.lakehouse"
@@ -195,17 +192,14 @@ class Duckrun:
         guid_pattern = re.compile(r'^[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}$', re.IGNORECASE)
 
         if guid_pattern.match(workspace_name) and guid_pattern.match(lakehouse_name):
-            print(f"✅ Names are already GUIDs: workspace={workspace_name}, lakehouse={lakehouse_name}")
             return workspace_name, lakehouse_name
 
         # Optimization: If workspace name has no spaces, use both names directly (old behavior)
         # Note: Lakehouse names cannot contain spaces in Microsoft Fabric, only workspace names can
         if " " not in workspace_name:
-            print(f"✅ Using names directly (workspace has no spaces): workspace={workspace_name}, lakehouse={lakehouse_name}")
             return workspace_name, lakehouse_name
 
         # Workspace name contains spaces - need to resolve both to GUIDs for proper ABFSS URLs
-        print(f"🔍 Resolving '{workspace_name}' workspace and '{lakehouse_name}' lakehouse to GUIDs (workspace has spaces)...")
 
         try:
             # Get authentication token using enhanced auth system
@@ -242,7 +236,6 @@ class Duckrun:
             if not lakehouse_id:
                 raise ValueError(f"Lakehouse '{lakehouse_name}' not found in workspace '{workspace_name}'")
 
-            print(f"✅ Resolved: {workspace_name} → {workspace_id}, {lakehouse_name} → {lakehouse_id}")
             return workspace_id, lakehouse_id
 
         except Exception as e:
@@ -388,7 +381,6 @@ class Duckrun:
                 tables_found.append((schema_name, table_name))
         else:
             # Scan specific schema only
-            print(f"🔍 Discovering tables in schema '{self.schema}'...")
             schema_path = f"{base_path}{self.schema}/"
             result = obs.list_with_delimiter(store, prefix=schema_path)
 
@@ -407,27 +399,10 @@ class Duckrun:
             tables = self._discover_tables_fast()
 
             if not tables:
-                if self.scan_all_schemas:
-                    print(f"No Delta tables found in {self.lakehouse_name}/Tables/")
-                else:
-                    print(f"No Delta tables found in {self.lakehouse_name}/Tables/{self.schema}/")
                 return
 
-            #
-
-            for schema_name, table_name in tables:
-                if schema_name not in schema_tables:
-                    schema_tables[schema_name] = []
-                schema_tables[schema_name].append(table_name)
-
-            # Display tables by schema
-            print(f"\n📊 Found {len(tables)} tables:")
-            for schema_name in sorted(schema_tables.keys()):
-                table_list = sorted(schema_tables[schema_name])
-                print(f"  {schema_name}: {', '.join(table_list)}")
-
-            attached_count = 0
-            skipped_tables = []
+            # Collect table names for display
+            table_names = []
 
             for schema_name, table_name in tables:
                 try:
@@ -435,28 +410,25 @@ class Duckrun:
                         # Create proper schema.table structure in DuckDB
                         self.con.sql(f"CREATE SCHEMA IF NOT EXISTS {schema_name}")
                         view_name = f"{schema_name}.{table_name}"
+                        table_names.append(view_name)
                     else:
                         # Single schema mode - use just table name
                         view_name = table_name
+                        table_names.append(table_name)
 
                     self.con.sql(f"""
                         CREATE OR REPLACE VIEW {view_name}
                         AS SELECT * FROM delta_scan('{self.table_base_url}{schema_name}/{table_name}');
                     """)
-                    attached_count += 1
                 except Exception as e:
-                    skipped_tables.append(f"{schema_name}.{table_name}")
                     continue
 
-
-
-
-            print(f"⚠ Skipped {len(skipped_tables)} tables: {', '.join(skipped_tables[:3])}{'...' if len(skipped_tables) > 3 else ''}")
-            print(f"{'='*60}\n")
+            # Print discovered tables as comma-separated list
+            if table_names:
+                print(", ".join(table_names))
 
         except Exception as e:
             print(f"❌ Error attaching lakehouse: {e}")
-            print("Continuing without pre-attached tables.")
 
     def _register_lookup_functions(self):
         """
@@ -599,7 +571,6 @@ class Duckrun:
             self.con.create_function("get_lakehouse_name", get_lakehouse_name)
             self.con.create_function("get_workspace_id_from_name", get_workspace_id_from_name)
             self.con.create_function("get_lakehouse_id_from_name", get_lakehouse_id_from_name)
-            print("✅ Registered lookup functions: get_workspace_name, get_lakehouse_name, get_workspace_id_from_name, get_lakehouse_id_from_name")
         except Exception as e:
             print(f"⚠️ Warning: Could not register lookup functions: {e}")
 
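The core.py changes above are mostly about removing progress chatter: name resolution, table discovery, and lookup-function registration no longer print status lines, and the attach step now just collects the created view names and prints them once as a comma-separated list. A standalone sketch of that attach pattern against a plain DuckDB connection (the `delta_scan` URL and the table list are placeholders, not real OneLake paths, and this is not the class's actual method):

```python
import duckdb

def attach_tables_sketch(con, tables, base_url):
    """Create one view per (schema, table) pair and report them on a single line."""
    table_names = []
    for schema_name, table_name in tables:
        try:
            con.sql(f"CREATE SCHEMA IF NOT EXISTS {schema_name}")
            view_name = f"{schema_name}.{table_name}"
            con.sql(f"""
                CREATE OR REPLACE VIEW {view_name}
                AS SELECT * FROM delta_scan('{base_url}{schema_name}/{table_name}')
            """)
            table_names.append(view_name)
        except Exception:
            continue  # tables that fail to attach are skipped quietly, as in 0.2.13
    if table_names:
        print(", ".join(table_names))  # e.g. "dbo.orders, dbo.customers"

# Usage (needs the DuckDB delta extension and a reachable Delta table):
# attach_tables_sketch(duckdb.connect(), [("dbo", "orders")], "abfss://.../Tables/")
```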
{duckrun-0.2.11.dev0 → duckrun-0.2.13}/duckrun.egg-info/PKG-INFO +74 -65

duckrun.egg-info/PKG-INFO is a build artifact that mirrors PKG-INFO; its diff is identical to the PKG-INFO diff shown at the top (same hunks, from @@ -1,6 +1,6 @@ through @@ -467,63 +533,6 @@: version bumped to 0.2.13, the duckdb pin raised to >=1.2.2, and the same README body changes).
{duckrun-0.2.11.dev0 → duckrun-0.2.13}/pyproject.toml +2 -2

@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "duckrun"
-version = "0.2.11.dev0"
+version = "0.2.13"
 description = "Lakehouse task runner powered by DuckDB for Microsoft Fabric"
 readme = "README.md"
 license = {text = "MIT"}
@@ -13,7 +13,7 @@ authors = [
 ]
 requires-python = ">=3.9"
 dependencies = [
-    "duckdb>=1.2.
+    "duckdb>=1.2.2",
     "deltalake<=0.18.2",
     "requests>=2.28.0",
     "obstore>=0.2.0"
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|