duckrun 0.2.11.dev0__py3-none-any.whl → 0.2.13.dev0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Note: this version of duckrun has been flagged as potentially problematic.

duckrun/auth.py CHANGED
@@ -38,21 +38,16 @@ def get_token() -> Optional[str]:
     except Exception as e:
         print(f"⚠️ Fabric notebook authentication failed: {e}")
 
-    # Detect environment type for fallback authentication
+    # Try local/VS Code authentication (Azure CLI + browser)
+    print("🖥️ Trying local authentication (Azure CLI + browser fallback)...")
+    token = _get_local_token()
+    if token:
+        return token
+
+    # If local auth failed, fall back to device code flow
+    print("🔐 Falling back to device code flow for remote/headless environment...")
     try:
-        # Check if we're in Google Colab first
-        try:
-            import google.colab
-            print("🚀 Google Colab detected - using device code flow")
-            return _get_device_code_token()
-        except ImportError:
-            pass
-
-        # For all other environments (including VS Code), try Azure CLI first
-        # This includes local development, VS Code notebooks, etc.
-        print("🖥️ Local/VS Code environment detected - trying Azure CLI first, then browser fallback")
-        return _get_local_token()
-
+        return _get_device_code_token()
     except Exception as e:
         print(f"❌ Authentication failed: {e}")
         print("💡 Try refreshing and running again, or check your Azure permissions")
@@ -82,6 +77,35 @@ def _get_device_code_token() -> Optional[str]:
     return None
 
 
+def _is_databricks() -> bool:
+    """Check if we're running in a Databricks environment"""
+    # Databricks sets specific environment variables
+    return (
+        os.environ.get("DATABRICKS_RUNTIME_VERSION") is not None or
+        os.environ.get("DB_HOME") is not None or
+        "databricks" in os.environ.get("SPARK_HOME", "").lower()
+    )
+
+
+def _get_databricks_token() -> Optional[str]:
+    """Get token using DefaultAzureCredential for Databricks environments"""
+    try:
+        from azure.identity import DefaultAzureCredential
+
+        # DefaultAzureCredential will automatically use Databricks managed identity
+        credential = DefaultAzureCredential()
+        token_obj = credential.get_token("https://storage.azure.com/.default")
+
+        os.environ["AZURE_STORAGE_TOKEN"] = token_obj.token
+        print("✅ Databricks authentication successful!")
+        return token_obj.token
+
+    except Exception as e:
+        print(f"❌ Databricks authentication failed: {e}")
+        print("💡 Make sure your Databricks cluster has the required Azure permissions")
+        return None
+
+
 def _get_local_token() -> Optional[str]:
     """Get token using CLI first, then browser fallback for local environments"""
     # First try Azure CLI directly
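For reference, a minimal usage sketch of the revised fallback chain (not part of the package docs; it only assumes `get_token()` as defined in `duckrun/auth.py` above, which tries Fabric notebook auth, then Azure CLI/browser, then the device code flow):

```python
# Illustrative sketch: acquire a OneLake/storage token through duckrun's auth helper.
from duckrun.auth import get_token

token = get_token()  # Fabric notebook -> Azure CLI/browser -> device code fallback
if token is None:
    raise RuntimeError("All authentication fallbacks failed; check your Azure permissions")
print(f"Token acquired ({len(token)} characters)")
```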
duckrun/core.py CHANGED
@@ -133,11 +133,8 @@ class Duckrun:
 
         # Check if it's a workspace-only connection (no "/" means workspace name only)
         if "/" not in connection_string:
-            print(f"Connecting to workspace '{connection_string}' for management operations...")
            return WorkspaceConnection(connection_string)
 
-        print("Connecting to Lakehouse...")
-
         scan_all_schemas = False
 
         # Parse lakehouse connection string: "ws/lh.lakehouse/schema" or "ws/lh.lakehouse"
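As an illustration of the branch above (not from the package docs; the workspace and lakehouse names are hypothetical), the two connection-string shapes behave like this:

```python
import duckrun

# No "/" in the string: workspace-only connection for management operations
ws = duckrun.connect("My Workspace")

# "workspace/lakehouse.lakehouse/schema": full lakehouse connection backed by DuckDB
con = duckrun.connect("My Workspace/Sales.lakehouse/dbo")
```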
@@ -195,17 +192,14 @@ class Duckrun:
         guid_pattern = re.compile(r'^[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}$', re.IGNORECASE)
 
         if guid_pattern.match(workspace_name) and guid_pattern.match(lakehouse_name):
-            print(f"✅ Names are already GUIDs: workspace={workspace_name}, lakehouse={lakehouse_name}")
             return workspace_name, lakehouse_name
 
         # Optimization: If workspace name has no spaces, use both names directly (old behavior)
         # Note: Lakehouse names cannot contain spaces in Microsoft Fabric, only workspace names can
         if " " not in workspace_name:
-            print(f"✅ Using names directly (workspace has no spaces): workspace={workspace_name}, lakehouse={lakehouse_name}")
             return workspace_name, lakehouse_name
 
         # Workspace name contains spaces - need to resolve both to GUIDs for proper ABFSS URLs
-        print(f"🔍 Resolving '{workspace_name}' workspace and '{lakehouse_name}' lakehouse to GUIDs (workspace has spaces)...")
 
         try:
             # Get authentication token using enhanced auth system
@@ -242,7 +236,6 @@ class Duckrun:
             if not lakehouse_id:
                 raise ValueError(f"Lakehouse '{lakehouse_name}' not found in workspace '{workspace_name}'")
 
-            print(f"✅ Resolved: {workspace_name} → {workspace_id}, {lakehouse_name} → {lakehouse_id}")
             return workspace_id, lakehouse_id
 
         except Exception as e:
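Condensed for clarity, the resolution order these hunks preserve is: pass GUIDs through untouched, use plain names when the workspace name has no spaces, and otherwise resolve both names to GUIDs via the Fabric REST API. An illustrative sketch (the helper name is hypothetical, and the API lookup is omitted):

```python
import re

GUID = re.compile(r'^[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}$', re.IGNORECASE)

def resolve_ids(workspace_name: str, lakehouse_name: str) -> tuple:
    # Already GUIDs: nothing to resolve
    if GUID.match(workspace_name) and GUID.match(lakehouse_name):
        return workspace_name, lakehouse_name
    # Workspace name has no spaces: OneLake accepts the names directly in the ABFSS URL
    if " " not in workspace_name:
        return workspace_name, lakehouse_name
    # Workspace name contains spaces: resolve both to GUIDs via the Fabric REST API
    raise NotImplementedError("Fabric API lookup omitted in this sketch")
```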
@@ -388,7 +381,6 @@ class Duckrun:
                     tables_found.append((schema_name, table_name))
         else:
             # Scan specific schema only
-            print(f"🔍 Discovering tables in schema '{self.schema}'...")
             schema_path = f"{base_path}{self.schema}/"
             result = obs.list_with_delimiter(store, prefix=schema_path)
 
@@ -407,10 +399,6 @@ class Duckrun:
             tables = self._discover_tables_fast()
 
             if not tables:
-                if self.scan_all_schemas:
-                    print(f"No Delta tables found in {self.lakehouse_name}/Tables/")
-                else:
-                    print(f"No Delta tables found in {self.lakehouse_name}/Tables/{self.schema}/")
                 return
 
             # Group tables by schema for display
@@ -420,12 +408,6 @@ class Duckrun:
                     schema_tables[schema_name] = []
                 schema_tables[schema_name].append(table_name)
 
-            # Display tables by schema
-            print(f"\n📊 Found {len(tables)} tables:")
-            for schema_name in sorted(schema_tables.keys()):
-                table_list = sorted(schema_tables[schema_name])
-                print(f" {schema_name}: {', '.join(table_list)}")
-
             attached_count = 0
             skipped_tables = []
 
@@ -447,16 +429,9 @@ class Duckrun:
                 except Exception as e:
                     skipped_tables.append(f"{schema_name}.{table_name}")
                     continue
-
-            print(f"\n{'='*60}")
-            print(f"✅ Ready - {attached_count}/{len(tables)} tables available")
-            if skipped_tables:
-                print(f"⚠ Skipped {len(skipped_tables)} tables: {', '.join(skipped_tables[:3])}{'...' if len(skipped_tables) > 3 else ''}")
-            print(f"{'='*60}\n")
 
         except Exception as e:
             print(f"❌ Error attaching lakehouse: {e}")
-            print("Continuing without pre-attached tables.")
 
     def _register_lookup_functions(self):
         """
@@ -599,7 +574,6 @@ class Duckrun:
             self.con.create_function("get_lakehouse_name", get_lakehouse_name)
             self.con.create_function("get_workspace_id_from_name", get_workspace_id_from_name)
             self.con.create_function("get_lakehouse_id_from_name", get_lakehouse_id_from_name)
-            print("✅ Registered lookup functions: get_workspace_name, get_lakehouse_name, get_workspace_id_from_name, get_lakehouse_id_from_name")
         except Exception as e:
             print(f"⚠️ Warning: Could not register lookup functions: {e}")
 
duckrun-0.2.13.dev0.dist-info/METADATA CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: duckrun
-Version: 0.2.11.dev0
+Version: 0.2.13.dev0
 Summary: Lakehouse task runner powered by DuckDB for Microsoft Fabric
 Author: mim
 License: MIT
@@ -10,7 +10,7 @@ Project-URL: Issues, https://github.com/djouallah/duckrun/issues
 Requires-Python: >=3.9
 Description-Content-Type: text/markdown
 License-File: LICENSE
-Requires-Dist: duckdb>=1.2.0
+Requires-Dist: duckdb>=1.2.2
 Requires-Dist: deltalake<=0.18.2
 Requires-Dist: requests>=2.28.0
 Requires-Dist: obstore>=0.2.0
@@ -20,7 +20,7 @@ Dynamic: license-file
 
 <img src="https://raw.githubusercontent.com/djouallah/duckrun/main/duckrun.png" width="400" alt="Duckrun">
 
-A helper package for stuff that made my life easier when working with Fabric Python notebooks. Just the things that actually made sense to me - nothing fancy
+A helper package for working with Microsoft Fabric lakehouses - orchestration, SQL queries, and file management powered by DuckDB.
 
 ## Important Notes
 
@@ -28,7 +28,6 @@ A helper package for stuff that made my life easier when working with Fabric Pyt
 - Lakehouse must have a schema (e.g., `dbo`, `sales`, `analytics`)
 - **Workspace names with spaces are fully supported!** ✅
 
-
 **Delta Lake Version:** This package uses an older version of deltalake to maintain row size control capabilities, which is crucial for Power BI performance optimization. The newer Rust-based deltalake versions don't yet support the row group size parameters that are essential for optimal DirectLake performance.
 
 ## What It Does
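The **Delta Lake Version** note above is about row-group sizing. A minimal sketch of what that control looks like with the pinned pyarrow-based writer (assuming deltalake 0.18.x parameter names; the local path and sizes are illustrative, and duckrun's own writer presumably applies similar settings internally):

```python
import pandas as pd
from deltalake import write_deltalake

df = pd.DataFrame({"id": range(1_000_000), "value": [1.0] * 1_000_000})

# Row-group size drives how efficiently Power BI DirectLake can scan the resulting files
write_deltalake(
    "/tmp/demo_delta_table",
    df,
    mode="overwrite",
    min_rows_per_group=500_000,
    max_rows_per_group=1_000_000,
    max_rows_per_file=1_000_000,
)
```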
@@ -40,12 +39,15 @@ It does orchestration, arbitrary SQL statements, and file manipulation. That's i
 ```bash
 pip install duckrun
 ```
-for local usage, Note: When running locally, your internet speed will be the main bottleneck.
+
+For local usage (requires Azure CLI or interactive browser auth):
 
 ```bash
 pip install duckrun[local]
 ```
 
+Note: When running locally, your internet speed will be the main bottleneck.
+
 ## Quick Start
 
 ### Simple Example for New Users
@@ -163,9 +165,6 @@ con.sql("""
     GROUP BY customer_id
 """).write.mode("overwrite").saveAsTable("customer_totals")
 
-# Append mode
-con.sql("SELECT * FROM new_orders").write.mode("append").saveAsTable("orders")
-
 # Schema evolution and partitioning (exact Spark API compatibility)
 con.sql("""
     SELECT
@@ -324,6 +323,73 @@ pipeline = [
 
 ## Advanced Features
 
+### SQL Lookup Functions
+
+Duckrun automatically registers helper functions that allow you to resolve workspace and lakehouse names from GUIDs directly in SQL queries. These are especially useful when working with storage logs or audit data that contains workspace/lakehouse IDs.
+
+**Available Functions:**
+
+```python
+con = duckrun.connect("workspace/lakehouse.lakehouse/dbo")
+
+# ID → Name lookups (most common use case)
+con.sql("""
+    SELECT
+        workspace_id,
+        get_workspace_name(workspace_id) as workspace_name,
+        lakehouse_id,
+        get_lakehouse_name(workspace_id, lakehouse_id) as lakehouse_name
+    FROM storage_logs
+""").show()
+
+# Name → ID lookups (reverse)
+con.sql("""
+    SELECT
+        workspace_name,
+        get_workspace_id_from_name(workspace_name) as workspace_id,
+        lakehouse_name,
+        get_lakehouse_id_from_name(workspace_id, lakehouse_name) as lakehouse_id
+    FROM configuration_table
+""").show()
+```
+
+**Function Reference:**
+
+- `get_workspace_name(workspace_id)` - Convert workspace GUID to display name
+- `get_lakehouse_name(workspace_id, lakehouse_id)` - Convert lakehouse GUID to display name
+- `get_workspace_id_from_name(workspace_name)` - Convert workspace name to GUID
+- `get_lakehouse_id_from_name(workspace_id, lakehouse_name)` - Convert lakehouse name to GUID
+
+**Features:**
+- ✅ **Automatic Caching**: Results are cached to avoid repeated API calls
+- ✅ **NULL on Error**: Returns `NULL` instead of errors for missing or inaccessible items
+- ✅ **Fabric API Integration**: Resolves names using Microsoft Fabric REST API
+- ✅ **Always Available**: Functions are automatically registered on connection
+
+**Example Use Case:**
+
+```python
+# Enrich OneLake storage logs with friendly names
+con = duckrun.connect("Analytics/Monitoring.lakehouse/dbo")
+
+result = con.sql("""
+    SELECT
+        workspace_id,
+        get_workspace_name(workspace_id) as workspace_name,
+        lakehouse_id,
+        get_lakehouse_name(workspace_id, lakehouse_id) as lakehouse_name,
+        operation_name,
+        COUNT(*) as operation_count,
+        SUM(bytes_transferred) as total_bytes
+    FROM onelake_storage_logs
+    WHERE log_date = CURRENT_DATE
+    GROUP BY ALL
+    ORDER BY workspace_name, lakehouse_name
+""").show()
+```
+
+This makes it easy to create human-readable reports from GUID-based log data!
+
 ### Schema Evolution & Partitioning
 
 Handle evolving schemas and optimize query performance with partitioning:
@@ -467,63 +533,6 @@ con = duckrun.connect(
 )
 ```
 
-## File Management API Reference
-
-### `copy(local_folder, remote_folder, file_extensions=None, overwrite=False)`
-
-Upload files from a local folder to OneLake Files section.
-
-**Parameters:**
-- `local_folder` (str): Path to local folder containing files to upload
-- `remote_folder` (str): **Required** target folder path in OneLake Files
-- `file_extensions` (list, optional): Filter by file extensions (e.g., `['.csv', '.parquet']`)
-- `overwrite` (bool, optional): Whether to overwrite existing files (default: False)
-
-**Returns:** `True` if all files uploaded successfully, `False` otherwise
-
-**Examples:**
-```python
-# Upload all files to a target folder
-con.copy("./data", "processed_data")
-
-# Upload only CSV and Parquet files
-con.copy("./reports", "monthly_reports", ['.csv', '.parquet'])
-
-# Upload with overwrite enabled
-con.copy("./backup", "daily_backup", overwrite=True)
-```
-
-### `download(remote_folder="", local_folder="./downloaded_files", file_extensions=None, overwrite=False)`
-
-Download files from OneLake Files section to a local folder.
-
-**Parameters:**
-- `remote_folder` (str, optional): Source folder path in OneLake Files (default: root)
-- `local_folder` (str, optional): Local destination folder (default: "./downloaded_files")
-- `file_extensions` (list, optional): Filter by file extensions (e.g., `['.csv', '.json']`)
-- `overwrite` (bool, optional): Whether to overwrite existing local files (default: False)
-
-**Returns:** `True` if all files downloaded successfully, `False` otherwise
-
-**Examples:**
-```python
-# Download all files from OneLake Files root
-con.download()
-
-# Download from specific folder
-con.download("processed_data", "./local_data")
-
-# Download only JSON files
-con.download("config", "./configs", ['.json'])
-```
-
-**Important Notes:**
-- Files are uploaded/downloaded to/from the **OneLake Files section**, not Delta Tables
-- The `remote_folder` parameter is **required** for uploads to prevent accidental uploads
-- Both methods default to `overwrite=False` for safety
-- Folder structure is preserved during upload/download operations
-- Progress is reported with file names, sizes, and upload/download status
-
 ## Complete Example
 
 ```python
duckrun-0.2.13.dev0.dist-info/RECORD CHANGED
@@ -1,14 +1,14 @@
 duckrun/__init__.py,sha256=cTj6KQ6hKmgu1z7k9nhDcO5lct049luxjx1V0QnymCo,235
-duckrun/auth.py,sha256=qPaLQ7InlV9leA9r6E6VEeYavFFoBi0zSN8m_l1aoQs,9545
-duckrun/core.py,sha256=MlaHOOz9bg3-EDXR3C4pEcp75QsnEcbTOmvsMjomLKc,54279
+duckrun/auth.py,sha256=WjRpbB60lB7z3MTDyBMb0FqoHEesiC3nRiW2moAsrrs,10490
+duckrun/core.py,sha256=3usRl9SetUytVFzCzbpiFXppTjHzwTqFlSEKnUSbcK8,52460
 duckrun/files.py,sha256=Fvdjg3DyHJzIVzKo8M_j-eGz4zU61lOB38Y_onbQJkI,10137
 duckrun/lakehouse.py,sha256=j--Z3zo8AOWt1GF9VzRosmmTAy6ey2D0LVubti58twU,14109
 duckrun/runner.py,sha256=yrDxfy1RVkb8iK9GKGmIFZHzCvcO_0GVQlbng7Vw_iM,14171
 duckrun/semantic_model.py,sha256=obzlN2-dbEW3JmDop-vrZGGGLi9u3ThhTbgtDjou7uY,29509
 duckrun/stats.py,sha256=oKIjZ7u5cFVT63FuOl5UqoDsOG3098woSCn-uI6i_sQ,11084
 duckrun/writer.py,sha256=svUuPCYOhrz299NgnpTKhARKjfej0PxnoND2iPDSypk,8098
-duckrun-0.2.11.dev0.dist-info/licenses/LICENSE,sha256=-DeQQwdbCbkB4507ZF3QbocysB-EIjDtaLexvqRkGZc,1083
-duckrun-0.2.11.dev0.dist-info/METADATA,sha256=NuW94zw7gizsp_cVPFktiC-I9aMP8O37vrtfMq7cmiI,20629
-duckrun-0.2.11.dev0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
-duckrun-0.2.11.dev0.dist-info/top_level.txt,sha256=BknMEwebbUHrVAp3SC92ps8MPhK7XSYsaogTvi_DmEU,8
-duckrun-0.2.11.dev0.dist-info/RECORD,,
+duckrun-0.2.13.dev0.dist-info/licenses/LICENSE,sha256=-DeQQwdbCbkB4507ZF3QbocysB-EIjDtaLexvqRkGZc,1083
+duckrun-0.2.13.dev0.dist-info/METADATA,sha256=dgJzGSr1W2XaHsPbHbE2Vc9T03DKofX4DAC3lNRAh3I,20771
+duckrun-0.2.13.dev0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+duckrun-0.2.13.dev0.dist-info/top_level.txt,sha256=BknMEwebbUHrVAp3SC92ps8MPhK7XSYsaogTvi_DmEU,8
+duckrun-0.2.13.dev0.dist-info/RECORD,,