duckrun 0.2.9.dev4.tar.gz → 0.2.18.dev4.tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of duckrun might be problematic.
- {duckrun-0.2.9.dev4 → duckrun-0.2.18.dev4}/PKG-INFO +114 -67
- {duckrun-0.2.9.dev4 → duckrun-0.2.18.dev4}/README.md +113 -66
- duckrun-0.2.18.dev4/duckrun/__init__.py +11 -0
- {duckrun-0.2.9.dev4 → duckrun-0.2.18.dev4}/duckrun/auth.py +21 -15
- duckrun-0.2.18.dev4/duckrun/core.py +1487 -0
- duckrun-0.2.18.dev4/duckrun/notebook.py +322 -0
- {duckrun-0.2.9.dev4 → duckrun-0.2.18.dev4}/duckrun/runner.py +15 -45
- duckrun-0.2.18.dev4/duckrun/semantic_model.py +847 -0
- {duckrun-0.2.9.dev4 → duckrun-0.2.18.dev4}/duckrun/stats.py +115 -30
- {duckrun-0.2.9.dev4 → duckrun-0.2.18.dev4}/duckrun/writer.py +35 -6
- {duckrun-0.2.9.dev4 → duckrun-0.2.18.dev4}/duckrun.egg-info/PKG-INFO +114 -67
- {duckrun-0.2.9.dev4 → duckrun-0.2.18.dev4}/duckrun.egg-info/SOURCES.txt +1 -0
- {duckrun-0.2.9.dev4 → duckrun-0.2.18.dev4}/pyproject.toml +2 -2
- duckrun-0.2.9.dev4/duckrun/__init__.py +0 -10
- duckrun-0.2.9.dev4/duckrun/core.py +0 -884
- duckrun-0.2.9.dev4/duckrun/semantic_model.py +0 -427
- {duckrun-0.2.9.dev4 → duckrun-0.2.18.dev4}/LICENSE +0 -0
- {duckrun-0.2.9.dev4 → duckrun-0.2.18.dev4}/duckrun/files.py +0 -0
- {duckrun-0.2.9.dev4 → duckrun-0.2.18.dev4}/duckrun/lakehouse.py +0 -0
- {duckrun-0.2.9.dev4 → duckrun-0.2.18.dev4}/duckrun.egg-info/dependency_links.txt +0 -0
- {duckrun-0.2.9.dev4 → duckrun-0.2.18.dev4}/duckrun.egg-info/requires.txt +0 -0
- {duckrun-0.2.9.dev4 → duckrun-0.2.18.dev4}/duckrun.egg-info/top_level.txt +0 -0
- {duckrun-0.2.9.dev4 → duckrun-0.2.18.dev4}/setup.cfg +0 -0
{duckrun-0.2.9.dev4 → duckrun-0.2.18.dev4}/PKG-INFO

@@ -1,7 +1,7 @@
 Metadata-Version: 2.4
 Name: duckrun
-Version: 0.2.
-Summary:
+Version: 0.2.18.dev4
+Summary: Helper library for Fabric Python using duckdb, arrow and delta_rs (orchestration, queries, etc.)
 Author: mim
 License: MIT
 Project-URL: Homepage, https://github.com/djouallah/duckrun
@@ -20,7 +20,7 @@ Dynamic: license-file
 
 <img src="https://raw.githubusercontent.com/djouallah/duckrun/main/duckrun.png" width="400" alt="Duckrun">
 
-A helper package for
+A helper package for working with Microsoft Fabric lakehouses - orchestration, SQL queries, and file management powered by DuckDB.
 
 ## Important Notes
 
@@ -28,7 +28,6 @@ A helper package for stuff that made my life easier when working with Fabric Pyt
 - Lakehouse must have a schema (e.g., `dbo`, `sales`, `analytics`)
 - **Workspace names with spaces are fully supported!** ✅
 
-
 **Delta Lake Version:** This package uses an older version of deltalake to maintain row size control capabilities, which is crucial for Power BI performance optimization. The newer Rust-based deltalake versions don't yet support the row group size parameters that are essential for optimal DirectLake performance.
 
 ## What It Does
@@ -40,12 +39,15 @@ It does orchestration, arbitrary SQL statements, and file manipulation. That's i
 ```bash
 pip install duckrun
 ```
-
+
+For local usage (requires Azure CLI or interactive browser auth):
 
 ```bash
 pip install duckrun[local]
 ```
 
+Note: When running locally, your internet speed will be the main bottleneck.
+
 ## Quick Start
 
 ### Simple Example for New Users
@@ -163,9 +165,6 @@ con.sql("""
     GROUP BY customer_id
 """).write.mode("overwrite").saveAsTable("customer_totals")
 
-# Append mode
-con.sql("SELECT * FROM new_orders").write.mode("append").saveAsTable("orders")
-
 # Schema evolution and partitioning (exact Spark API compatibility)
 con.sql("""
     SELECT
@@ -324,6 +323,73 @@ pipeline = [
 
 ## Advanced Features
 
+### SQL Lookup Functions
+
+Duckrun automatically registers helper functions that allow you to resolve workspace and lakehouse names from GUIDs directly in SQL queries. These are especially useful when working with storage logs or audit data that contains workspace/lakehouse IDs.
+
+**Available Functions:**
+
+```python
+con = duckrun.connect("workspace/lakehouse.lakehouse/dbo")
+
+# ID → Name lookups (most common use case)
+con.sql("""
+    SELECT
+        workspace_id,
+        get_workspace_name(workspace_id) as workspace_name,
+        lakehouse_id,
+        get_lakehouse_name(workspace_id, lakehouse_id) as lakehouse_name
+    FROM storage_logs
+""").show()
+
+# Name → ID lookups (reverse)
+con.sql("""
+    SELECT
+        workspace_name,
+        get_workspace_id_from_name(workspace_name) as workspace_id,
+        lakehouse_name,
+        get_lakehouse_id_from_name(workspace_id, lakehouse_name) as lakehouse_id
+    FROM configuration_table
+""").show()
+```
+
+**Function Reference:**
+
+- `get_workspace_name(workspace_id)` - Convert workspace GUID to display name
+- `get_lakehouse_name(workspace_id, lakehouse_id)` - Convert lakehouse GUID to display name
+- `get_workspace_id_from_name(workspace_name)` - Convert workspace name to GUID
+- `get_lakehouse_id_from_name(workspace_id, lakehouse_name)` - Convert lakehouse name to GUID
+
+**Features:**
+- ✅ **Automatic Caching**: Results are cached to avoid repeated API calls
+- ✅ **NULL on Error**: Returns `NULL` instead of errors for missing or inaccessible items
+- ✅ **Fabric API Integration**: Resolves names using Microsoft Fabric REST API
+- ✅ **Always Available**: Functions are automatically registered on connection
+
+**Example Use Case:**
+
+```python
+# Enrich OneLake storage logs with friendly names
+con = duckrun.connect("Analytics/Monitoring.lakehouse/dbo")
+
+result = con.sql("""
+    SELECT
+        workspace_id,
+        get_workspace_name(workspace_id) as workspace_name,
+        lakehouse_id,
+        get_lakehouse_name(workspace_id, lakehouse_id) as lakehouse_name,
+        operation_name,
+        COUNT(*) as operation_count,
+        SUM(bytes_transferred) as total_bytes
+    FROM onelake_storage_logs
+    WHERE log_date = CURRENT_DATE
+    GROUP BY ALL
+    ORDER BY workspace_name, lakehouse_name
+""").show()
+```
+
+This makes it easy to create human-readable reports from GUID-based log data!
+
 ### Schema Evolution & Partitioning
 
 Handle evolving schemas and optimize query performance with partitioning:
@@ -420,6 +486,37 @@ success = con.run(pipeline) # Returns True only if ALL tasks succeed
 
 This prevents downstream tasks from processing incomplete or corrupted data.
 
+### Semantic Model Deployment
+
+Deploy Power BI semantic models directly from BIM files using DirectLake mode:
+
+```python
+# Connect to lakehouse
+con = duckrun.connect("Analytics/Sales.lakehouse/dbo")
+
+# Deploy with auto-generated name (lakehouse_schema)
+con.deploy("https://raw.githubusercontent.com/user/repo/main/model.bim")
+
+# Deploy with custom name
+con.deploy(
+    "https://raw.githubusercontent.com/user/repo/main/sales_model.bim",
+    dataset_name="Sales Analytics Model",
+    wait_seconds=10  # Wait for permission propagation
+)
+```
+
+**Features:**
+- 🚀 **DirectLake Mode**: Deploys semantic models with DirectLake connection
+- 🔄 **Automatic Configuration**: Auto-configures workspace, lakehouse, and schema connections
+- 📦 **BIM from URL**: Load model definitions from GitHub or any accessible URL
+- ⏱️ **Permission Handling**: Configurable wait time for permission propagation
+
+**Use Cases:**
+- Deploy semantic models as part of CI/CD pipelines
+- Version control your semantic models in Git
+- Automated model deployment across environments
+- Streamline DirectLake model creation
+
 ### Delta Lake Optimization
 
 Duckrun automatically:
@@ -436,63 +533,6 @@ con = duckrun.connect(
 )
 ```
 
-## File Management API Reference
-
-### `copy(local_folder, remote_folder, file_extensions=None, overwrite=False)`
-
-Upload files from a local folder to OneLake Files section.
-
-**Parameters:**
-- `local_folder` (str): Path to local folder containing files to upload
-- `remote_folder` (str): **Required** target folder path in OneLake Files
-- `file_extensions` (list, optional): Filter by file extensions (e.g., `['.csv', '.parquet']`)
-- `overwrite` (bool, optional): Whether to overwrite existing files (default: False)
-
-**Returns:** `True` if all files uploaded successfully, `False` otherwise
-
-**Examples:**
-```python
-# Upload all files to a target folder
-con.copy("./data", "processed_data")
-
-# Upload only CSV and Parquet files
-con.copy("./reports", "monthly_reports", ['.csv', '.parquet'])
-
-# Upload with overwrite enabled
-con.copy("./backup", "daily_backup", overwrite=True)
-```
-
-### `download(remote_folder="", local_folder="./downloaded_files", file_extensions=None, overwrite=False)`
-
-Download files from OneLake Files section to a local folder.
-
-**Parameters:**
-- `remote_folder` (str, optional): Source folder path in OneLake Files (default: root)
-- `local_folder` (str, optional): Local destination folder (default: "./downloaded_files")
-- `file_extensions` (list, optional): Filter by file extensions (e.g., `['.csv', '.json']`)
-- `overwrite` (bool, optional): Whether to overwrite existing local files (default: False)
-
-**Returns:** `True` if all files downloaded successfully, `False` otherwise
-
-**Examples:**
-```python
-# Download all files from OneLake Files root
-con.download()
-
-# Download from specific folder
-con.download("processed_data", "./local_data")
-
-# Download only JSON files
-con.download("config", "./configs", ['.json'])
-```
-
-**Important Notes:**
-- Files are uploaded/downloaded to/from the **OneLake Files section**, not Delta Tables
-- The `remote_folder` parameter is **required** for uploads to prevent accidental uploads
-- Both methods default to `overwrite=False` for safety
-- Folder structure is preserved during upload/download operations
-- Progress is reported with file names, sizes, and upload/download status
-
 ## Complete Example
 
 ```python
@@ -534,6 +574,12 @@ con.sql("""
 
 # 5. Download processed files for external systems
 con.download("processed_reports", "./exports", ['.csv'])
+
+# 6. Deploy semantic model for Power BI
+con.deploy(
+    "https://raw.githubusercontent.com/user/repo/main/sales_model.bim",
+    dataset_name="Sales Analytics"
+)
 ```
 
 **This example demonstrates:**
@@ -541,8 +587,9 @@ con.download("processed_reports", "./exports", ['.csv'])
 - 🔄 **Pipeline orchestration** with SQL and Python tasks
 - ⚡ **Fast data exploration** with DuckDB
 - 💾 **Delta table creation** with Spark-style API
--
--
+- 🔀 **Schema evolution** and partitioning
+- 📤 **File downloads** from OneLake Files
+- 📊 **Semantic model deployment** with DirectLake
 
 ## Schema Evolution & Partitioning Guide
 
{duckrun-0.2.9.dev4 → duckrun-0.2.18.dev4}/README.md

The README.md hunks are identical to the README body of the PKG-INFO diff above (the same changes, shifted by the 20-line metadata header), plus one additional hunk at the end of the file that rewrites only the closing `MIT` line (apparently a trailing-newline change):

@@ -592,4 +639,4 @@ For a complete production example, see [fabric_demo](https://github.com/djoualla
 
 ## License
 
-MIT
+MIT
duckrun-0.2.18.dev4/duckrun/__init__.py

@@ -0,0 +1,11 @@
+"""Duckrun - Lakehouse task runner powered by DuckDB"""
+
+from duckrun.core import Duckrun
+from duckrun.notebook import import_notebook_from_web, import_notebook
+
+__version__ = "0.2.18.dev2"
+
+# Expose unified connect method at module level
+connect = Duckrun.connect
+
+__all__ = ["Duckrun", "connect", "import_notebook_from_web", "import_notebook"]
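For orientation, the new `__init__.py` re-exports the connection entry point used throughout the README at module level. A minimal usage sketch (the workspace/lakehouse path below is a placeholder, not taken from this diff):

```python
import duckrun

# duckrun.connect is Duckrun.connect exposed at module level (see __init__.py above)
con = duckrun.connect("My Workspace/Sales.lakehouse/dbo")  # placeholder path

# Ad-hoc query against the attached lakehouse schema, as shown in the README examples
con.sql("SELECT 42 AS answer").show()
```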
{duckrun-0.2.9.dev4 → duckrun-0.2.18.dev4}/duckrun/auth.py

@@ -2,9 +2,21 @@
 Enhanced authentication module for duckrun - supports multiple notebook environments
 """
 import os
+import sys
 from typing import Optional, Tuple
 
 
+def safe_print(message: str):
+    """Print message with safe encoding handling for Windows"""
+    try:
+        print(message)
+    except UnicodeEncodeError:
+        # Fallback: remove emojis and special chars
+        import re
+        clean_message = re.sub(r'[^\x00-\x7F]+', '', message)
+        print(clean_message)
+
+
 def get_token() -> Optional[str]:
     """
     Smart authentication that works across multiple environments:
@@ -20,7 +32,6 @@ def get_token() -> Optional[str]:
     # Check if we already have a cached token
     token_env = os.environ.get("AZURE_STORAGE_TOKEN")
     if token_env and token_env != "PLACEHOLDER_TOKEN_TOKEN_NOT_AVAILABLE":
-        print("✅ Using existing Azure Storage token")
         return token_env
 
     print("🔐 Starting Azure authentication...")
@@ -38,21 +49,16 @@
     except Exception as e:
         print(f"⚠️ Fabric notebook authentication failed: {e}")
 
-    #
+    # Try local/VS Code authentication (Azure CLI + browser)
+    print("🖥️ Trying local authentication (Azure CLI + browser fallback)...")
+    token = _get_local_token()
+    if token:
+        return token
+
+    # If local auth failed, fall back to device code flow
+    print("🔐 Falling back to device code flow for remote/headless environment...")
     try:
-
-        try:
-            import google.colab
-            print("🚀 Google Colab detected - using device code flow")
-            return _get_device_code_token()
-        except ImportError:
-            pass
-
-        # For all other environments (including VS Code), try Azure CLI first
-        # This includes local development, VS Code notebooks, etc.
-        print("🖥️ Local/VS Code environment detected - trying Azure CLI first, then browser fallback")
-        return _get_local_token()
-
+        return _get_device_code_token()
     except Exception as e:
         print(f"❌ Authentication failed: {e}")
         print("💡 Try refreshing and running again, or check your Azure permissions")