duckrun-0.2.7-py3-none-any.whl → duckrun-0.2.9-py3-none-any.whl

This diff compares the contents of two publicly released versions of the package, as they appear in their respective public registries. It is provided for informational purposes only.
duckrun/__init__.py CHANGED
@@ -2,7 +2,7 @@
  
  from duckrun.core import Duckrun
  
- __version__ = "0.1.0"
+ __version__ = "0.2.9.dev5"
  
  # Expose unified connect method at module level
  connect = Duckrun.connect
duckrun/auth.py ADDED
@@ -0,0 +1,249 @@
+ """
+ Enhanced authentication module for duckrun - supports multiple notebook environments
+ """
+ import os
+ from typing import Optional, Tuple
+
+
+ def get_token() -> Optional[str]:
+     """
+     Smart authentication that works across multiple environments:
+     - Microsoft Fabric notebooks (uses notebookutils)
+     - Local environments with Azure CLI (uses CLI + browser fallback)
+     - Google Colab (uses device code flow)
+     - Other headless environments (uses device code flow)
+     - Existing token from environment (uses cached token)
+
+     Returns:
+         Azure Storage token string or None if authentication fails
+     """
+     # Check if we already have a cached token
+     token_env = os.environ.get("AZURE_STORAGE_TOKEN")
+     if token_env and token_env != "PLACEHOLDER_TOKEN_TOKEN_NOT_AVAILABLE":
+         print("✅ Using existing Azure Storage token")
+         return token_env
+
+     print("🔐 Starting Azure authentication...")
+
+     # Try Fabric notebook environment first
+     try:
+         import notebookutils # type: ignore
+         print("📓 Microsoft Fabric notebook detected - using notebookutils")
+         token = notebookutils.credentials.getToken("pbi")
+         os.environ["AZURE_STORAGE_TOKEN"] = token
+         print("✅ Fabric notebook authentication successful!")
+         return token
+     except ImportError:
+         pass # Not in Fabric notebook
+     except Exception as e:
+         print(f"⚠️ Fabric notebook authentication failed: {e}")
+
+     # Detect environment type for fallback authentication
+     try:
+         # Check if we're in Google Colab first
+         try:
+             import google.colab
+             print("🚀 Google Colab detected - using device code flow")
+             return _get_device_code_token()
+         except ImportError:
+             pass
+
+         # For all other environments (including VS Code), try Azure CLI first
+         # This includes local development, VS Code notebooks, etc.
+         print("🖥️ Local/VS Code environment detected - trying Azure CLI first, then browser fallback")
+         return _get_local_token()
+
+     except Exception as e:
+         print(f"❌ Authentication failed: {e}")
+         print("💡 Try refreshing and running again, or check your Azure permissions")
+         return None
+
+
+ def _get_device_code_token() -> Optional[str]:
+     """Get token using device code flow for headless environments"""
+     try:
+         from azure.identity import DeviceCodeCredential
+
+         # Use Azure CLI client ID for device code flow
+         credential = DeviceCodeCredential(
+             client_id="04b07795-8ddb-461a-bbee-02f9e1bf7b46", # Azure CLI client ID
+             tenant_id="common"
+         )
+
+         print("🔐 Follow the authentication prompts in your browser...")
+         token_obj = credential.get_token("https://storage.azure.com/.default")
+
+         os.environ["AZURE_STORAGE_TOKEN"] = token_obj.token
+         print("✅ Device code authentication successful!")
+         return token_obj.token
+
+     except Exception as e:
+         print(f"❌ Device code authentication failed: {e}")
+         return None
+
+
+ def _get_local_token() -> Optional[str]:
+     """Get token using CLI first, then browser fallback for local environments"""
+     # First try Azure CLI directly
+     try:
+         from azure.identity import AzureCliCredential
+         print("🔐 Trying Azure CLI authentication...")
+
+         cli_credential = AzureCliCredential()
+         token_obj = cli_credential.get_token("https://storage.azure.com/.default")
+
+         os.environ["AZURE_STORAGE_TOKEN"] = token_obj.token
+         print("✅ Azure CLI authentication successful!")
+         return token_obj.token
+
+     except Exception as cli_error:
+         print(f"⚠️ Azure CLI authentication failed: {cli_error}")
+         print("🔐 Falling back to interactive browser authentication...")
+
+         # Fallback to interactive browser
+         try:
+             from azure.identity import InteractiveBrowserCredential
+
+             browser_credential = InteractiveBrowserCredential()
+             token_obj = browser_credential.get_token("https://storage.azure.com/.default")
+
+             os.environ["AZURE_STORAGE_TOKEN"] = token_obj.token
+             print("✅ Interactive browser authentication successful!")
+             return token_obj.token
+
+         except Exception as browser_error:
+             print(f"❌ Interactive browser authentication failed: {browser_error}")
+             return None
+
+
+ def get_fabric_api_token() -> Optional[str]:
+     """
+     Get token for Fabric API operations (different scope than storage)
+
+     Returns:
+         Fabric API token string or None if authentication fails
+     """
+     # Check if we already have a cached Fabric API token
+     fabric_token_env = os.environ.get("FABRIC_API_TOKEN")
+     if fabric_token_env:
+         print("✅ Using cached Fabric API token")
+         return fabric_token_env
+
+     print("🔐 Getting Fabric API token...")
+
+     # Try Fabric notebook environment first
+     try:
+         import notebookutils # type: ignore
+         print("📓 Microsoft Fabric notebook detected - using notebookutils")
+         token = notebookutils.credentials.getToken("pbi")
+         os.environ["FABRIC_API_TOKEN"] = token
+         print("✅ Fabric API token obtained!")
+         return token
+     except ImportError:
+         pass # Not in Fabric notebook
+     except Exception as e:
+         print(f"⚠️ Fabric notebook token failed: {e}")
+
+     # Fallback to azure-identity for external environments
+     try:
+         # Check if we're in Google Colab
+         try:
+             import google.colab
+             print("💻 Using device code flow for Fabric API (Colab)")
+             from azure.identity import DeviceCodeCredential
+             credential = DeviceCodeCredential(
+                 client_id="04b07795-8ddb-461a-bbee-02f9e1bf7b46",
+                 tenant_id="common"
+             )
+         except ImportError:
+             # For all other environments, try CLI first then browser
+             print("🖥️ Using CLI + browser fallback for Fabric API")
+
+             # Try CLI first
+             try:
+                 from azure.identity import AzureCliCredential
+                 print("🔐 Trying Azure CLI for Fabric API...")
+                 credential = AzureCliCredential()
+                 token_obj = credential.get_token("https://api.fabric.microsoft.com/.default")
+                 os.environ["FABRIC_API_TOKEN"] = token_obj.token
+                 print("✅ Fabric API token obtained via Azure CLI!")
+                 return token_obj.token
+             except Exception as cli_error:
+                 print(f"⚠️ Azure CLI failed for Fabric API: {cli_error}")
+                 print("🔐 Falling back to interactive browser for Fabric API...")
+                 from azure.identity import InteractiveBrowserCredential
+                 credential = InteractiveBrowserCredential()
+
+         token_obj = credential.get_token("https://api.fabric.microsoft.com/.default")
+         os.environ["FABRIC_API_TOKEN"] = token_obj.token
+         print("✅ Fabric API token obtained!")
+         return token_obj.token
+
+     except Exception as e:
+         print(f"❌ Fabric API authentication failed: {e}")
+         return None
+
+
+ def authenticate_for_environment() -> Tuple[bool, Optional[str]]:
+     """
+     Main authentication entry point - detects environment and authenticates appropriately
+
+     Returns:
+         Tuple of (success: bool, token: Optional[str])
+     """
+     print("\n🔍 Detecting execution environment...")
+
+     # Check environment
+     try:
+         import notebookutils # type: ignore
+         env_type = "Microsoft Fabric Notebook"
+     except ImportError:
+         try:
+             import google.colab
+             env_type = "Google Colab"
+         except ImportError:
+             # For all other environments (VS Code, local Python, etc.)
+             # we'll treat as local and try Azure CLI first
+             env_type = "Local/VS Code Environment"
+
+     print(f"📍 Environment: {env_type}")
+
+     token = get_token()
+     if token:
+         print(f"✅ Authentication successful for {env_type}")
+         return True, token
+     else:
+         print(f"❌ Authentication failed for {env_type}")
+         return False, None
+
+
+ # For backward compatibility - expose the same interface as before
+ def get_storage_token() -> str:
+     """
+     Backward compatible method - returns token or placeholder
+     """
+     token = get_token()
+     return token if token else "PLACEHOLDER_TOKEN_TOKEN_NOT_AVAILABLE"
+
+
+ # Example usage function for testing
+ def test_authentication():
+     """
+     Test authentication in current environment
+     """
+     print("=" * 60)
+     print("🧪 TESTING DUCKRUN AUTHENTICATION")
+     print("=" * 60)
+
+     success, token = authenticate_for_environment()
+
+     if success:
+         print("\n✅ Authentication test successful!")
+         print(f"Token length: {len(token) if token else 0} characters")
+         print(f"Token starts with: {token[:20] if token else 'None'}...")
+     else:
+         print("\n❌ Authentication test failed!")
+         print("Please check your Azure setup and permissions.")
+
+     print("=" * 60)
+     return success
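For readers tracing the new module, below is a minimal usage sketch (not part of the package) of the two public helpers it adds; it assumes duckrun 0.2.9 is installed and, outside a Fabric notebook, that `azure-identity` is available.

```python
# Illustrative caller code: exercises the fallback order implemented above
# (Fabric notebookutils -> Colab device code -> Azure CLI -> interactive browser).
import os
from duckrun.auth import get_token, get_fabric_api_token

storage_token = get_token()             # scope: https://storage.azure.com/.default
fabric_token = get_fabric_api_token()   # scope: https://api.fabric.microsoft.com/.default

if storage_token is None or fabric_token is None:
    raise SystemExit("Authentication failed - check Azure CLI login or browser sign-in")

# Successful calls cache the tokens in environment variables for later reuse
assert os.environ["AZURE_STORAGE_TOKEN"] == storage_token
assert os.environ["FABRIC_API_TOKEN"] == fabric_token
print(f"storage token: {len(storage_token)} chars, fabric token: {len(fabric_token)} chars")
```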
duckrun/core.py CHANGED
@@ -82,6 +82,15 @@ class Duckrun:
  
          self.con = duckdb.connect()
          self.con.sql("SET preserve_insertion_order = false")
+
+         # Configure Azure transport for Colab (fixes SSL cert issues)
+         try:
+             import google.colab # type: ignore
+             self.con.sql("SET azure_transport_option_type = 'curl'")
+             print("🔧 Colab detected - using curl transport for Azure")
+         except ImportError:
+             pass # Not in Colab, use default transport
+
          self._attach_lakehouse()
  
      @classmethod
@@ -196,18 +205,19 @@ class Duckrun:
          print(f"🔍 Resolving '{workspace_name}' workspace and '{lakehouse_name}' lakehouse to GUIDs (workspace has spaces)...")
  
          try:
-             # Get authentication token (try notebook environment first, then azure-identity)
+             # Get authentication token using enhanced auth system
+             from .auth import get_fabric_api_token
+             token = get_fabric_api_token()
+             if not token:
+                 raise ValueError("Failed to obtain Fabric API token")
+
+             # Try to get current workspace ID if in notebook environment
+             current_workspace_id = None
              try:
                  import notebookutils # type: ignore
-                 token = notebookutils.credentials.getToken("pbi")
                  current_workspace_id = notebookutils.runtime.context.get("workspaceId")
              except ImportError:
-                 current_workspace_id = None
-                 # Fallback to azure-identity for external environments
-                 from azure.identity import AzureCliCredential, InteractiveBrowserCredential, ChainedTokenCredential
-                 credential = ChainedTokenCredential(AzureCliCredential(), InteractiveBrowserCredential())
-                 token_obj = credential.get_token("https://api.fabric.microsoft.com/.default")
-                 token = token_obj.token
+                 pass # Not in notebook environment
  
              # Resolve workspace name to ID
              if current_workspace_id:
@@ -302,19 +312,23 @@ class Duckrun:
          return WorkspaceConnection(workspace_name)
  
      def _get_storage_token(self):
-         return os.environ.get("AZURE_STORAGE_TOKEN", "PLACEHOLDER_TOKEN_TOKEN_NOT_AVAILABLE")
+         from .auth import get_storage_token
+         return get_storage_token()
  
      def _create_onelake_secret(self):
          token = self._get_storage_token()
          if token != "PLACEHOLDER_TOKEN_TOKEN_NOT_AVAILABLE":
              self.con.sql(f"CREATE OR REPLACE SECRET onelake (TYPE AZURE, PROVIDER ACCESS_TOKEN, ACCESS_TOKEN '{token}')")
          else:
-             print("Authenticating with Azure (trying CLI, will fallback to browser if needed)...")
-             from azure.identity import AzureCliCredential, InteractiveBrowserCredential, ChainedTokenCredential
-             credential = ChainedTokenCredential(AzureCliCredential(), InteractiveBrowserCredential())
-             token = credential.get_token("https://storage.azure.com/.default")
-             os.environ["AZURE_STORAGE_TOKEN"] = token.token
-             self.con.sql("CREATE OR REPLACE PERSISTENT SECRET onelake (TYPE azure, PROVIDER credential_chain, CHAIN 'cli', ACCOUNT_NAME 'onelake')")
+             # Enhanced authentication - try all methods
+             from .auth import get_token
+             token = get_token()
+             if token:
+                 os.environ["AZURE_STORAGE_TOKEN"] = token
+                 self.con.sql(f"CREATE OR REPLACE SECRET onelake (TYPE AZURE, PROVIDER ACCESS_TOKEN, ACCESS_TOKEN '{token}')")
+             else:
+                 # Final fallback to persistent secret
+                 self.con.sql("CREATE OR REPLACE PERSISTENT SECRET onelake (TYPE azure, PROVIDER credential_chain, CHAIN 'cli', ACCOUNT_NAME 'onelake')")
  
      def _discover_tables_fast(self) -> List[Tuple[str, str]]:
          """
@@ -326,12 +340,12 @@ class Duckrun:
          """
          token = self._get_storage_token()
          if token == "PLACEHOLDER_TOKEN_TOKEN_NOT_AVAILABLE":
-             print("Authenticating with Azure for table discovery (trying CLI, will fallback to browser if needed)...")
-             from azure.identity import AzureCliCredential, InteractiveBrowserCredential, ChainedTokenCredential
-             credential = ChainedTokenCredential(AzureCliCredential(), InteractiveBrowserCredential())
-             token_obj = credential.get_token("https://storage.azure.com/.default")
-             token = token_obj.token
-             os.environ["AZURE_STORAGE_TOKEN"] = token
+             print("Authenticating with Azure for table discovery (detecting environment automatically)...")
+             from .auth import get_token
+             token = get_token()
+             if not token:
+                 print("❌ Failed to authenticate for table discovery")
+                 return []
  
          url = f"abfss://{self.workspace}@{self.storage_account}.dfs.fabric.microsoft.com/"
          store = AzureStore.from_url(url, bearer_token=token)
@@ -579,19 +593,22 @@ class Duckrun:
              List of lakehouse names
          """
          try:
-             # Try to get token from notebook environment first
+             # Get authentication token using enhanced auth system
+             from .auth import get_fabric_api_token
+             token = get_fabric_api_token()
+             if not token:
+                 print("❌ Failed to authenticate for listing lakehouses")
+                 return []
+
+             # Try to get current workspace ID if in notebook environment
+             workspace_id = None
              try:
                  import notebookutils # type: ignore
-                 token = notebookutils.credentials.getToken("pbi")
                  workspace_id = notebookutils.runtime.context.get("workspaceId")
              except ImportError:
-                 # Fallback to azure-identity
-                 print("Getting authentication token...")
-                 from azure.identity import AzureCliCredential, InteractiveBrowserCredential, ChainedTokenCredential
-                 credential = ChainedTokenCredential(AzureCliCredential(), InteractiveBrowserCredential())
-                 token_obj = credential.get_token("https://api.fabric.microsoft.com/.default")
-                 token = token_obj.token
-
+                 pass # Not in notebook environment
+
+             if not workspace_id:
                  # Get workspace ID by name
                  workspace_id = self._get_workspace_id_by_name(token, self.workspace)
                  if not workspace_id:
@@ -626,19 +643,22 @@ class Duckrun:
              True if lakehouse exists or was created successfully, False otherwise
          """
          try:
-             # Try to get token from notebook environment first
+             # Get authentication token using enhanced auth system
+             from .auth import get_fabric_api_token
+             token = get_fabric_api_token()
+             if not token:
+                 print("❌ Failed to authenticate for lakehouse creation")
+                 return False
+
+             # Try to get current workspace ID if in notebook environment
+             workspace_id = None
              try:
                  import notebookutils # type: ignore
-                 token = notebookutils.credentials.getToken("pbi")
                  workspace_id = notebookutils.runtime.context.get("workspaceId")
             except ImportError:
-                 # Fallback to azure-identity
-                 print("Getting authentication token...")
-                 from azure.identity import AzureCliCredential, InteractiveBrowserCredential, ChainedTokenCredential
-                 credential = ChainedTokenCredential(AzureCliCredential(), InteractiveBrowserCredential())
-                 token_obj = credential.get_token("https://api.fabric.microsoft.com/.default")
-                 token = token_obj.token
-
+                 pass # Not in notebook environment
+
+             if not workspace_id:
                  # Get workspace ID by name
                  workspace_id = self._get_workspace_id_by_name(token, self.workspace)
                  if not workspace_id:
@@ -676,6 +696,45 @@ class Duckrun:
              print(f"❌ Error creating lakehouse '{lakehouse_name}': {e}")
              return False
  
+     def deploy(self, bim_url: str, dataset_name: Optional[str] = None,
+                wait_seconds: int = 5) -> int:
+         """
+         Deploy a semantic model from a BIM file using DirectLake mode.
+
+         Args:
+             bim_url: URL to the BIM file (e.g., GitHub raw URL)
+             dataset_name: Name for the semantic model (default: lakehouse_schema)
+             wait_seconds: Seconds to wait for permission propagation (default: 5)
+
+         Returns:
+             1 for success, 0 for failure
+
+         Examples:
+             dr = Duckrun.connect("My Workspace/My Lakehouse.lakehouse/dbo")
+
+             # Deploy with auto-generated name
+             dr.deploy("https://raw.githubusercontent.com/.../model.bim")
+
+             # Deploy with custom name
+             dr.deploy("https://raw.githubusercontent.com/.../model.bim",
+                       dataset_name="Sales Model")
+         """
+         from .semantic_model import deploy_semantic_model
+
+         # Auto-generate dataset name if not provided
+         if dataset_name is None:
+             dataset_name = f"{self.lakehouse_name}_{self.schema}"
+
+         # Call the deployment function (DirectLake only)
+         return deploy_semantic_model(
+             workspace_name_or_id=self.workspace,
+             lakehouse_name_or_id=self.lakehouse_name,
+             schema_name=self.schema,
+             dataset_name=dataset_name,
+             bim_url=bim_url,
+             wait_seconds=wait_seconds
+         )
+
      def _get_workspace_id_by_name(self, token: str, workspace_name: str) -> Optional[str]:
          """Helper method to get workspace ID from name"""
          try:
@@ -718,28 +777,18 @@ class WorkspaceConnection:
              List of lakehouse names
          """
          try:
-             # Try to get token from notebook environment first
-             try:
-                 import notebookutils # type: ignore
-                 token = notebookutils.credentials.getToken("pbi")
-                 # Always resolve workspace name to ID, even in notebook environment
-                 workspace_id = self._get_workspace_id_by_name(token, self.workspace_name)
-                 if not workspace_id:
-                     print(f"Workspace '{self.workspace_name}' not found")
-                     return []
-             except ImportError:
-                 # Fallback to azure-identity
-                 print("Getting authentication token...")
-                 from azure.identity import AzureCliCredential, InteractiveBrowserCredential, ChainedTokenCredential
-                 credential = ChainedTokenCredential(AzureCliCredential(), InteractiveBrowserCredential())
-                 token_obj = credential.get_token("https://api.fabric.microsoft.com/.default")
-                 token = token_obj.token
-
-             # Get workspace ID by name
-             workspace_id = self._get_workspace_id_by_name(token, self.workspace_name)
-             if not workspace_id:
-                 print(f"Workspace '{self.workspace_name}' not found")
-                 return []
+             # Get authentication token using enhanced auth system
+             from .auth import get_fabric_api_token
+             token = get_fabric_api_token()
+             if not token:
+                 print("❌ Failed to authenticate for listing lakehouses")
+                 return []
+
+             # Always resolve workspace name to ID, even in notebook environment
+             workspace_id = self._get_workspace_id_by_name(token, self.workspace_name)
+             if not workspace_id:
+                 print(f"Workspace '{self.workspace_name}' not found")
+                 return []
  
              # List lakehouses
              url = f"https://api.fabric.microsoft.com/v1/workspaces/{workspace_id}/lakehouses"
@@ -768,28 +817,18 @@ class WorkspaceConnection:
              True if lakehouse exists or was created successfully, False otherwise
          """
          try:
-             # Try to get token from notebook environment first
-             try:
-                 import notebookutils # type: ignore
-                 token = notebookutils.credentials.getToken("pbi")
-                 # Always resolve workspace name to ID, even in notebook environment
-                 workspace_id = self._get_workspace_id_by_name(token, self.workspace_name)
-                 if not workspace_id:
-                     print(f"Workspace '{self.workspace_name}' not found")
-                     return False
-             except ImportError:
-                 # Fallback to azure-identity
-                 print("Getting authentication token...")
-                 from azure.identity import AzureCliCredential, InteractiveBrowserCredential, ChainedTokenCredential
-                 credential = ChainedTokenCredential(AzureCliCredential(), InteractiveBrowserCredential())
-                 token_obj = credential.get_token("https://api.fabric.microsoft.com/.default")
-                 token = token_obj.token
-
-             # Get workspace ID by name
-             workspace_id = self._get_workspace_id_by_name(token, self.workspace_name)
-             if not workspace_id:
-                 print(f"Workspace '{self.workspace_name}' not found")
-                 return False
+             # Get authentication token using enhanced auth system
+             from .auth import get_fabric_api_token
+             token = get_fabric_api_token()
+             if not token:
+                 print("❌ Failed to authenticate for lakehouse creation")
+                 return False
+
+             # Always resolve workspace name to ID, even in notebook environment
+             workspace_id = self._get_workspace_id_by_name(token, self.workspace_name)
+             if not workspace_id:
+                 print(f"Workspace '{self.workspace_name}' not found")
+                 return False
  
              # Check if lakehouse already exists
              url = f"https://api.fabric.microsoft.com/v1/workspaces/{workspace_id}/lakehouses"
duckrun/files.py CHANGED
@@ -41,15 +41,15 @@ def copy(duckrun_instance, local_folder: str, remote_folder: str,
          print(f"❌ Path is not a directory: {local_folder}")
          return False
  
-     # Get Azure token
+     # Get Azure token using enhanced auth system
+     from .auth import get_token
      token = duckrun_instance._get_storage_token()
      if token == "PLACEHOLDER_TOKEN_TOKEN_NOT_AVAILABLE":
-         print("Authenticating with Azure for file upload (trying CLI, will fallback to browser if needed)...")
-         from azure.identity import AzureCliCredential, InteractiveBrowserCredential, ChainedTokenCredential
-         credential = ChainedTokenCredential(AzureCliCredential(), InteractiveBrowserCredential())
-         token_obj = credential.get_token("https://storage.azure.com/.default")
-         token = token_obj.token
-         os.environ["AZURE_STORAGE_TOKEN"] = token
+         print("Authenticating with Azure for file upload (detecting environment automatically)...")
+         token = get_token()
+         if not token:
+             print("❌ Failed to authenticate for file upload")
+             return False
  
      # Setup OneLake Files URL (use correct format without .Lakehouse suffix)
      files_base_url = duckrun_instance.files_base_url
@@ -150,15 +150,15 @@ def download(duckrun_instance, remote_folder: str = "", local_folder: str = "./d
          # Download only CSV files from a specific subfolder
          dr.download("daily_reports", "./reports", ['.csv'])
      """
-     # Get Azure token
+     # Get Azure token using enhanced auth system
+     from .auth import get_token
      token = duckrun_instance._get_storage_token()
      if token == "PLACEHOLDER_TOKEN_TOKEN_NOT_AVAILABLE":
-         print("Authenticating with Azure for file download (trying CLI, will fallback to browser if needed)...")
-         from azure.identity import AzureCliCredential, InteractiveBrowserCredential, ChainedTokenCredential
-         credential = ChainedTokenCredential(AzureCliCredential(), InteractiveBrowserCredential())
-         token_obj = credential.get_token("https://storage.azure.com/.default")
-         token = token_obj.token
-         os.environ["AZURE_STORAGE_TOKEN"] = token
+         print("Authenticating with Azure for file download (detecting environment automatically)...")
+         token = get_token()
+         if not token:
+             print("❌ Failed to authenticate for file download")
+             return False
  
      # Setup OneLake Files URL (use correct format without .Lakehouse suffix)
      files_base_url = duckrun_instance.files_base_url
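files.py only swaps how the token is obtained; the copy/download behaviour is unchanged. A short, hedged usage sketch follows; it assumes the connection object exposes the `copy` and `download` wrappers over these functions (the README shows `download` being used this way), and the folder names are placeholders.

```python
import duckrun

# Placeholder workspace/lakehouse/schema path
con = duckrun.connect("Analytics/Sales.lakehouse/dbo")

# Upload a local folder to OneLake Files; auth now flows through duckrun.auth.get_token()
con.copy("./exports", "processed_reports")

# Download only CSVs back; both helpers now return False instead of raising when auth fails
con.download("processed_reports", "./local_copy", ['.csv'])
```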
duckrun/runner.py CHANGED
@@ -15,7 +15,7 @@ def _build_write_deltalake_args(path, df, mode, schema_mode=None, partition_by=N
      """
      Build arguments for write_deltalake based on requirements:
      - If schema_mode='merge': use rust engine (no row group params)
-     - Otherwise: use pyarrow engine with row group optimization
+     - Otherwise: use pyarrow engine with row group optimization (if supported)
      """
      args = {
          'table_or_uri': path,
@@ -33,10 +33,17 @@ def _build_write_deltalake_args(path, df, mode, schema_mode=None, partition_by=N
          args['schema_mode'] = 'merge'
          args['engine'] = 'rust'
      else:
-         # Use pyarrow engine with row group optimization (default)
-         args['max_rows_per_file'] = RG
-         args['max_rows_per_group'] = RG
-         args['min_rows_per_group'] = RG
+         # Try to use pyarrow engine with row group optimization
+         # Check if row group parameters are supported by inspecting function signature
+         import inspect
+         sig = inspect.signature(write_deltalake)
+
+         if 'max_rows_per_file' in sig.parameters:
+             # Older deltalake version - use row group optimization
+             args['max_rows_per_file'] = RG
+             args['max_rows_per_group'] = RG
+             args['min_rows_per_group'] = RG
+         # For newer versions, just use default parameters
  
      return args
duckrun/semantic_model.py ADDED
@@ -0,0 +1,434 @@
+ """
+ Semantic Model Deployer - DirectLake mode for Fabric Lakehouses
+ Uses duckrun's authentication. Works anywhere duckrun works.
+ """
+
+ import requests
+ import json
+ import time
+ import base64
+
+
+ class FabricRestClient:
+     """Fabric REST API client using duckrun's authentication."""
+
+     def __init__(self):
+         self.base_url = "https://api.fabric.microsoft.com"
+         self.token = None
+         self._get_token()
+
+     def _get_token(self):
+         """Get Fabric API token using duckrun's auth module"""
+         from duckrun.auth import get_fabric_api_token
+         self.token = get_fabric_api_token()
+         if not self.token:
+             raise Exception("Failed to get Fabric API token")
+
+     def _get_headers(self):
+         return {
+             "Authorization": f"Bearer {self.token}",
+             "Content-Type": "application/json"
+         }
+
+     def get(self, endpoint: str):
+         url = f"{self.base_url}{endpoint}"
+         response = requests.get(url, headers=self._get_headers())
+         response.raise_for_status()
+         return response
+
+     def post(self, endpoint: str, json: dict = None):
+         url = f"{self.base_url}{endpoint}"
+         response = requests.post(url, headers=self._get_headers(), json=json)
+         response.raise_for_status()
+         return response
+
+
+ def get_workspace_id(workspace_name_or_id, client):
+     """Get workspace ID by name or validate if already a GUID"""
+     import re
+
+     # Check if input is already a GUID
+     guid_pattern = re.compile(r'^[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}$', re.IGNORECASE)
+     if guid_pattern.match(workspace_name_or_id):
+         # It's already a GUID, verify it exists
+         try:
+             response = client.get(f"/v1/workspaces/{workspace_name_or_id}")
+             workspace_name = response.json().get('displayName', workspace_name_or_id)
+             print(f"✓ Found workspace: {workspace_name}")
+             return workspace_name_or_id
+         except:
+             raise ValueError(f"Workspace with ID '{workspace_name_or_id}' not found")
+
+     # It's a name, search for it
+     response = client.get("/v1/workspaces")
+     workspaces = response.json().get('value', [])
+
+     workspace_match = next((ws for ws in workspaces if ws.get('displayName') == workspace_name_or_id), None)
+     if not workspace_match:
+         raise ValueError(f"Workspace '{workspace_name_or_id}' not found")
+
+     workspace_id = workspace_match['id']
+     print(f"✓ Found workspace: {workspace_name_or_id}")
+     return workspace_id
+
+
+ def get_lakehouse_id(lakehouse_name_or_id, workspace_id, client):
+     """Get lakehouse ID by name or validate if already a GUID"""
+     import re
+
+     # Check if input is already a GUID
+     guid_pattern = re.compile(r'^[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}$', re.IGNORECASE)
+     if guid_pattern.match(lakehouse_name_or_id):
+         # It's already a GUID, verify it exists
+         try:
+             response = client.get(f"/v1/workspaces/{workspace_id}/lakehouses")
+             items = response.json().get('value', [])
+             lakehouse_match = next((item for item in items if item.get('id') == lakehouse_name_or_id), None)
+             if lakehouse_match:
+                 lakehouse_name = lakehouse_match.get('displayName', lakehouse_name_or_id)
+                 print(f"✓ Found lakehouse: {lakehouse_name}")
+                 return lakehouse_name_or_id
+             else:
+                 raise ValueError(f"Lakehouse with ID '{lakehouse_name_or_id}' not found")
+         except Exception as e:
+             raise ValueError(f"Lakehouse with ID '{lakehouse_name_or_id}' not found: {e}")
+
+     # It's a name, search for it
+     response = client.get(f"/v1/workspaces/{workspace_id}/lakehouses")
+     items = response.json().get('value', [])
+
+     lakehouse_match = next((item for item in items if item.get('displayName') == lakehouse_name_or_id), None)
+     if not lakehouse_match:
+         raise ValueError(f"Lakehouse '{lakehouse_name_or_id}' not found")
+
+     lakehouse_id = lakehouse_match['id']
+     print(f"✓ Found lakehouse: {lakehouse_name_or_id}")
+     return lakehouse_id
+
+
+ def get_dataset_id(dataset_name, workspace_id, client):
+     """Get dataset ID by name"""
+     response = client.get(f"/v1/workspaces/{workspace_id}/semanticModels")
+     items = response.json().get('value', [])
+
+     dataset_match = next((item for item in items if item.get('displayName') == dataset_name), None)
+     if not dataset_match:
+         raise ValueError(f"Dataset '{dataset_name}' not found")
+
+     return dataset_match['id']
+
+
+ def check_dataset_exists(dataset_name, workspace_id, client):
+     """Check if dataset already exists"""
+     try:
+         get_dataset_id(dataset_name, workspace_id, client)
+         print(f"⚠️ Dataset '{dataset_name}' already exists")
+         return True
+     except:
+         print(f"✓ Dataset name '{dataset_name}' is available")
+         return False
+
+
+ def refresh_dataset(dataset_name, workspace_id, client, dataset_id=None):
+     """Refresh a dataset and monitor progress using Power BI API"""
+
+     # If dataset_id not provided, look it up by name
+     if not dataset_id:
+         dataset_id = get_dataset_id(dataset_name, workspace_id, client)
+
+     payload = {
+         "type": "full",
+         "commitMode": "transactional",
+         "maxParallelism": 10,
+         "retryCount": 2,
+         "objects": []
+     }
+
+     # Use Power BI API for refresh (not Fabric API)
+     powerbi_url = f"https://api.powerbi.com/v1.0/myorg/datasets/{dataset_id}/refreshes"
+     headers = client._get_headers()
+
+     response = requests.post(powerbi_url, headers=headers, json=payload)
+
+     if response.status_code in [200, 202]:
+         print(f"✓ Refresh initiated")
+
+         # For 202, get the refresh_id from the Location header
+         if response.status_code == 202:
+             location = response.headers.get('Location')
+             if location:
+                 refresh_id = location.split('/')[-1]
+                 print(" Monitoring refresh progress...")
+                 max_attempts = 60
+                 for attempt in range(max_attempts):
+                     time.sleep(5)
+
+                     # Check refresh status using Power BI API
+                     status_url = f"https://api.powerbi.com/v1.0/myorg/datasets/{dataset_id}/refreshes/{refresh_id}"
+                     status_response = requests.get(status_url, headers=headers)
+                     status_response.raise_for_status()
+                     status = status_response.json().get('status')
+
+                     if status == 'Completed':
+                         print(f"✓ Refresh completed successfully")
+                         return
+                     elif status == 'Failed':
+                         error = status_response.json().get('serviceExceptionJson', '')
+                         raise Exception(f"Refresh failed: {error}")
+                     elif status == 'Cancelled':
+                         raise Exception("Refresh was cancelled")
+
+                     if attempt % 6 == 0:
+                         print(f" Status: {status}...")
+
+                 raise Exception(f"Refresh timed out")
+     else:
+         response.raise_for_status()
+
+
+ def download_bim_from_github(url):
+     """Download BIM file from URL"""
+     print(f"Downloading BIM file...")
+     response = requests.get(url)
+     response.raise_for_status()
+     bim_content = response.json()
+     print(f"✓ BIM file downloaded")
+     print(f" - Tables: {len(bim_content.get('model', {}).get('tables', []))}")
+     print(f" - Relationships: {len(bim_content.get('model', {}).get('relationships', []))}")
+     return bim_content
+
+
+ def update_bim_for_directlake(bim_content, workspace_id, lakehouse_id, schema_name):
+     """Update BIM file for DirectLake mode"""
+
+     new_url = f"https://onelake.dfs.fabric.microsoft.com/{workspace_id}/{lakehouse_id}"
+     expression_name = None
+
+     # Update or create DirectLake expression
+     if 'model' in bim_content and 'expressions' in bim_content['model']:
+         for expr in bim_content['model']['expressions']:
+             if 'DirectLake' in expr['name'] or expr.get('kind') == 'm':
+                 expression_name = expr['name']
+                 expr['expression'] = [
+                     "let",
+                     f" Source = AzureStorage.DataLake(\"{new_url}\", [HierarchicalNavigation=true])",
+                     "in",
+                     " Source"
+                 ]
+                 break
+
+     if not expression_name:
+         expression_name = f"DirectLake - {schema_name}"
+         if 'expressions' not in bim_content['model']:
+             bim_content['model']['expressions'] = []
+
+         bim_content['model']['expressions'].append({
+             "name": expression_name,
+             "kind": "m",
+             "expression": [
+                 "let",
+                 f" Source = AzureStorage.DataLake(\"{new_url}\", [HierarchicalNavigation=true])",
+                 "in",
+                 " Source"
+             ],
+             "lineageTag": f"directlake-{schema_name}-source"
+         })
+
+     # Update table partitions for DirectLake
+     if 'tables' in bim_content['model']:
+         for table in bim_content['model']['tables']:
+             if 'partitions' in table:
+                 for partition in table['partitions']:
+                     if 'source' in partition:
+                         partition['mode'] = 'directLake'
+                         partition['source'] = {
+                             "type": "entity",
+                             "entityName": partition['source'].get('entityName', table['name']),
+                             "expressionSource": expression_name,
+                             "schemaName": schema_name
+                         }
+
+     print(f"✓ Updated BIM for DirectLake")
+     print(f" - OneLake URL: {new_url}")
+     print(f" - Schema: {schema_name}")
+
+     return bim_content
+
+
+ def create_dataset_from_bim(dataset_name, bim_content, workspace_id, client):
+     """Create semantic model from BIM using Fabric REST API and return the dataset ID"""
+     # Convert to base64
+     bim_json = json.dumps(bim_content, indent=2)
+     bim_base64 = base64.b64encode(bim_json.encode('utf-8')).decode('utf-8')
+
+     pbism_content = {"version": "1.0"}
+     pbism_json = json.dumps(pbism_content)
+     pbism_base64 = base64.b64encode(pbism_json.encode('utf-8')).decode('utf-8')
+
+     payload = {
+         "displayName": dataset_name,
+         "definition": {
+             "parts": [
+                 {
+                     "path": "model.bim",
+                     "payload": bim_base64,
+                     "payloadType": "InlineBase64"
+                 },
+                 {
+                     "path": "definition.pbism",
+                     "payload": pbism_base64,
+                     "payloadType": "InlineBase64"
+                 }
+             ]
+         }
+     }
+
+     response = client.post(
+         f"/v1/workspaces/{workspace_id}/semanticModels",
+         json=payload
+     )
+
+     print(f"✓ Semantic model created")
+
+     # Handle long-running operation and return the dataset ID
+     if response.status_code == 202:
+         operation_id = response.headers.get('x-ms-operation-id')
+         print(f" Waiting for operation to complete...")
+
+         max_attempts = 30
+         for attempt in range(max_attempts):
+             time.sleep(2)
+
+             # Get operation result (not just status)
+             result_response = client.get(f"/v1/operations/{operation_id}/result")
+
+             # Check if operation is complete by getting the status
+             status_response = client.get(f"/v1/operations/{operation_id}")
+             status = status_response.json().get('status')
+
+             if status == 'Succeeded':
+                 print(f"✓ Operation completed")
+                 # Return the created dataset ID from the result
+                 result_data = result_response.json()
+                 dataset_id = result_data.get('id')
+                 if dataset_id:
+                     return dataset_id
+                 else:
+                     # Fallback: search for the dataset by name
+                     return get_dataset_id(dataset_name, workspace_id, client)
+             elif status == 'Failed':
+                 error = status_response.json().get('error', {})
+                 raise Exception(f"Operation failed: {error.get('message')}")
+             elif attempt == max_attempts - 1:
+                 raise Exception(f"Operation timed out")
+
+     # For non-async responses (status 200/201)
+     result_data = response.json()
+     dataset_id = result_data.get('id')
+     if dataset_id:
+         return dataset_id
+     else:
+         # Fallback: search for the dataset by name
+         return get_dataset_id(dataset_name, workspace_id, client)
+
+
+ def deploy_semantic_model(workspace_name_or_id, lakehouse_name_or_id, schema_name, dataset_name,
+                           bim_url, wait_seconds=5):
+     """
+     Deploy a semantic model using DirectLake mode.
+
+     Args:
+         workspace_name_or_id: Name or GUID of the target workspace
+         lakehouse_name_or_id: Name or GUID of the lakehouse
+         schema_name: Schema name (e.g., 'dbo', 'staging')
+         dataset_name: Name for the semantic model
+         bim_url: URL to the BIM file
+         wait_seconds: Seconds to wait before refresh (default: 5)
+
+     Returns:
+         1 for success, 0 for failure
+
+     Examples:
+         dr = Duckrun.connect("My Workspace/My Lakehouse.lakehouse/dbo")
+         dr.deploy("https://raw.githubusercontent.com/.../model.bim")
+     """
+     print("=" * 70)
+     print("Semantic Model Deployment (DirectLake)")
+     print("=" * 70)
+
+     client = FabricRestClient()
+
+     try:
+         # Step 1: Get workspace ID
+         print("\n[Step 1/6] Getting workspace information...")
+         workspace_id = get_workspace_id(workspace_name_or_id, client)
+
+         # Step 2: Check if dataset exists
+         print(f"\n[Step 2/6] Checking if dataset '{dataset_name}' exists...")
+         dataset_exists = check_dataset_exists(dataset_name, workspace_id, client)
+
+         if dataset_exists:
+             print(f"\n✓ Dataset exists - refreshing...")
+
+             if wait_seconds > 0:
+                 print(f" Waiting {wait_seconds} seconds...")
+                 time.sleep(wait_seconds)
+
+             print("\n[Step 6/6] Refreshing semantic model...")
+             refresh_dataset(dataset_name, workspace_id, client)
+
+             print("\n" + "=" * 70)
+             print("🎉 Refresh Completed!")
+             print("=" * 70)
+             print(f"Dataset: {dataset_name}")
+             print("=" * 70)
+             return 1
+
+         # Step 3: Get lakehouse ID
+         print(f"\n[Step 3/6] Finding lakehouse...")
+         lakehouse_id = get_lakehouse_id(lakehouse_name_or_id, workspace_id, client)
+
+         # Step 4: Download and update BIM
+         print("\n[Step 4/6] Downloading and configuring BIM file...")
+         bim_content = download_bim_from_github(bim_url)
+
+         modified_bim = update_bim_for_directlake(bim_content, workspace_id, lakehouse_id, schema_name)
+         modified_bim['name'] = dataset_name
+         modified_bim['id'] = dataset_name
+
+         # Step 5: Deploy and get the dataset ID
+         print("\n[Step 5/6] Deploying semantic model...")
+         dataset_id = create_dataset_from_bim(dataset_name, modified_bim, workspace_id, client)
+         print(f" Dataset ID: {dataset_id}")
+
+         if wait_seconds > 0:
+             print(f" Waiting {wait_seconds} seconds before refresh...")
+             time.sleep(wait_seconds)
+
+         # Step 6: Refresh using the dataset ID returned from creation
+         print("\n[Step 6/6] Refreshing semantic model...")
+         refresh_dataset(dataset_name, workspace_id, client, dataset_id=dataset_id)
+
+         print("\n" + "=" * 70)
+         print("🎉 Deployment Completed!")
+         print("=" * 70)
+         print(f"Dataset: {dataset_name}")
+         print(f"Workspace: {workspace_name_or_id}")
+         print(f"Lakehouse: {lakehouse_name_or_id}")
+         print(f"Schema: {schema_name}")
+         print("=" * 70)
+
+         return 1
+
+     except Exception as e:
+         print("\n" + "=" * 70)
+         print("❌ Deployment Failed")
+         print("=" * 70)
+         print(f"Error: {str(e)}")
+         print("\n💡 Troubleshooting:")
+         print(f" - Verify workspace '{workspace_name_or_id}' exists")
+         print(f" - Verify lakehouse '{lakehouse_name_or_id}' exists")
+         print(f" - Ensure tables exist in '{schema_name}' schema")
+         print(f" - Check tables are in Delta format")
+         print("=" * 70)
+         return 0
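Both entry points in this new module are reachable from a duckrun connection. A hedged end-to-end sketch follows; the workspace, lakehouse, and BIM URL are placeholders, and the dataset name mirrors the `<lakehouse>_<schema>` default used by `Duckrun.deploy()`:

```python
import duckrun
from duckrun.semantic_model import deploy_semantic_model

con = duckrun.connect("Analytics/Sales.lakehouse/dbo")

# High-level path: deploy() supplies workspace, lakehouse, and schema from the connection
status = con.deploy("https://raw.githubusercontent.com/user/repo/main/model.bim")

# Equivalent low-level call
status = deploy_semantic_model(
    workspace_name_or_id="Analytics",
    lakehouse_name_or_id="Sales",
    schema_name="dbo",
    dataset_name="Sales_dbo",
    bim_url="https://raw.githubusercontent.com/user/repo/main/model.bim",
    wait_seconds=5,
)
print("deployed" if status == 1 else "failed")
```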
duckrun/stats.py CHANGED
@@ -147,7 +147,20 @@ def get_stats(duckrun_instance, source: str):
  
      try:
          dt = DeltaTable(table_path)
-         xx = dt.get_add_actions(flatten=True).to_pydict()
+         add_actions = dt.get_add_actions(flatten=True)
+
+         # Convert to dict - compatible with both old and new deltalake versions
+         # Try to_pydict() first (old versions), fall back to to_pylist() (new versions)
+         try:
+             xx = add_actions.to_pydict()
+         except AttributeError:
+             # New version with arro3: use to_pylist() and convert to dict of lists
+             records = add_actions.to_pylist()
+             if records:
+                 # Convert list of dicts to dict of lists
+                 xx = {key: [record[key] for record in records] for key in records[0].keys()}
+             else:
+                 xx = {}
  
          # Check if VORDER exists
          vorder = 'tags.VORDER' in xx.keys()
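The compatibility shim above can be read as a small helper: take whatever `get_add_actions(flatten=True)` returns and normalise it to a dict of columns. A standalone sketch of that logic (the table path is a placeholder):

```python
from deltalake import DeltaTable

def add_actions_as_columns(table_path: str) -> dict:
    """Return Delta add-actions as {column_name: [values]} across deltalake versions."""
    actions = DeltaTable(table_path).get_add_actions(flatten=True)
    try:
        return actions.to_pydict()      # older releases return a pyarrow RecordBatch
    except AttributeError:
        rows = actions.to_pylist()      # newer releases expose row-oriented records
        return {k: [r[k] for r in rows] for k in rows[0]} if rows else {}

# cols = add_actions_as_columns("/path/to/delta_table")   # placeholder path
# has_vorder = 'tags.VORDER' in cols
```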
duckrun/writer.py CHANGED
@@ -1,18 +1,36 @@
  """
  Delta Lake writer functionality for duckrun - Spark-style write API
  """
- from deltalake import DeltaTable, write_deltalake
+ from deltalake import DeltaTable, write_deltalake, __version__ as deltalake_version
  
  
  # Row Group configuration for optimal Delta Lake performance
  RG = 8_000_000
  
+ # Check deltalake version once at module load
+ # Version 0.18.x and 0.19.x support engine parameter and row group optimization
+ # Version 0.20+ removed these features (rust only, no row groups)
+ _DELTALAKE_VERSION = tuple(map(int, deltalake_version.split('.')[:2]))
+ _IS_OLD_DELTALAKE = _DELTALAKE_VERSION < (0, 20)
+
  
  def _build_write_deltalake_args(path, df, mode, schema_mode=None, partition_by=None):
      """
-     Build arguments for write_deltalake based on requirements:
-     - If schema_mode='merge': use rust engine (no row group params)
-     - Otherwise: use pyarrow engine with row group optimization
+     Build arguments for write_deltalake based on requirements and version:
+
+     deltalake 0.18.2 - 0.19.x:
+     - Has 'engine' parameter (defaults to 'pyarrow')
+     - Has max_rows_per_file/max_rows_per_group/min_rows_per_group for optimization
+     - When mergeSchema=True: must set schema_mode='merge' + engine='rust', NO row group params
+     - When mergeSchema=False: use row group params, DON'T set engine (pyarrow is default)
+
+     deltalake 0.20+:
+     - Does NOT have 'engine' parameter (everything is rust, pyarrow deprecated)
+     - Does NOT have max_rows_per_file (row group optimization removed)
+     - When mergeSchema=True: must set schema_mode='merge'
+     - When mergeSchema=False: just write normally (no special params)
+
+     Uses version detection for simpler logic.
      """
      args = {
          'table_or_uri': path,
@@ -24,16 +42,24 @@ def _build_write_deltalake_args(path, df, mode, schema_mode=None, partition_by=N
      if partition_by:
          args['partition_by'] = partition_by
  
-     # Engine selection based on schema_mode
      if schema_mode == 'merge':
-         # Use rust engine for schema merging (no row group params supported)
+         # Schema merging mode - must explicitly set schema_mode='merge'
          args['schema_mode'] = 'merge'
-         args['engine'] = 'rust'
+
+         if _IS_OLD_DELTALAKE:
+             # deltalake 0.18.2-0.19.x: must also set engine='rust' for schema merging
+             # Do NOT use row group params (they conflict with rust engine)
+             args['engine'] = 'rust'
+         # For version 0.20+: just schema_mode='merge' is enough, rust is default
      else:
-         # Use pyarrow engine with row group optimization (default)
-         args['max_rows_per_file'] = RG
-         args['max_rows_per_group'] = RG
-         args['min_rows_per_group'] = RG
+         # Normal write mode (no schema merging)
+         if _IS_OLD_DELTALAKE:
+             # deltalake 0.18.2-0.19.x: use row group optimization
+             # DON'T set engine parameter - pyarrow is the default and works with row groups
+             args['max_rows_per_file'] = RG
+             args['max_rows_per_group'] = RG
+             args['min_rows_per_group'] = RG
+         # For version 0.20+: no optimization available (rust by default, no row group params supported)
  
      return args
  
@@ -106,7 +132,18 @@ class DeltaWriter:
              partition_by=self._partition_by
          )
  
-         engine_info = f" (engine=rust, schema_mode=merge)" if self._schema_mode == 'merge' else " (engine=pyarrow)"
+         # Prepare info message based on version and settings
+         if self._schema_mode == 'merge':
+             if _IS_OLD_DELTALAKE:
+                 engine_info = " (engine=rust, schema_mode=merge)"
+             else:
+                 engine_info = " (schema_mode=merge, rust by default)"
+         else:
+             if _IS_OLD_DELTALAKE:
+                 engine_info = " (engine=pyarrow, optimized row groups)"
+             else:
+                 engine_info = " (engine=rust by default)"
+
          partition_info = f" partitioned by {self._partition_by}" if self._partition_by else ""
          print(f"Writing to Delta table: {schema}.{table} (mode={self._mode}){engine_info}{partition_info}")
 
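The version gate used throughout writer.py is just a tuple comparison on the first two components of `deltalake.__version__`. A self-contained sketch, assuming `deltalake` is installed:

```python
from deltalake import __version__ as deltalake_version

# (0, 18)/(0, 19) -> legacy writer with 'engine' and row-group parameters
# (0, 20)+        -> rust-only writer, no row-group tuning
version = tuple(map(int, deltalake_version.split('.')[:2]))
is_old_writer = version < (0, 20)

merge_args = {'schema_mode': 'merge'}
if is_old_writer:
    merge_args['engine'] = 'rust'   # required alongside schema_mode='merge' on 0.18-0.19
print(f"deltalake {deltalake_version}: old writer = {is_old_writer}, merge args = {merge_args}")
```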
duckrun-0.2.7.dist-info/METADATA → duckrun-0.2.9.dist-info/METADATA CHANGED
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: duckrun
- Version: 0.2.7
+ Version: 0.2.9
  Summary: Lakehouse task runner powered by DuckDB for Microsoft Fabric
  Author: mim
  License: MIT
@@ -420,6 +420,37 @@ success = con.run(pipeline) # Returns True only if ALL tasks succeed
  
  This prevents downstream tasks from processing incomplete or corrupted data.
  
+ ### Semantic Model Deployment
+
+ Deploy Power BI semantic models directly from BIM files using DirectLake mode:
+
+ ```python
+ # Connect to lakehouse
+ con = duckrun.connect("Analytics/Sales.lakehouse/dbo")
+
+ # Deploy with auto-generated name (lakehouse_schema)
+ con.deploy("https://raw.githubusercontent.com/user/repo/main/model.bim")
+
+ # Deploy with custom name
+ con.deploy(
+     "https://raw.githubusercontent.com/user/repo/main/sales_model.bim",
+     dataset_name="Sales Analytics Model",
+     wait_seconds=10 # Wait for permission propagation
+ )
+ ```
+
+ **Features:**
+ - 🚀 **DirectLake Mode**: Deploys semantic models with DirectLake connection
+ - 🔄 **Automatic Configuration**: Auto-configures workspace, lakehouse, and schema connections
+ - 📦 **BIM from URL**: Load model definitions from GitHub or any accessible URL
+ - ⏱️ **Permission Handling**: Configurable wait time for permission propagation
+
+ **Use Cases:**
+ - Deploy semantic models as part of CI/CD pipelines
+ - Version control your semantic models in Git
+ - Automated model deployment across environments
+ - Streamline DirectLake model creation
+
  ### Delta Lake Optimization
  
  Duckrun automatically:
@@ -534,6 +565,12 @@ con.sql("""
  
  # 5. Download processed files for external systems
  con.download("processed_reports", "./exports", ['.csv'])
+
+ # 6. Deploy semantic model for Power BI
+ con.deploy(
+     "https://raw.githubusercontent.com/user/repo/main/sales_model.bim",
+     dataset_name="Sales Analytics"
+ )
  ```
  
  **This example demonstrates:**
@@ -541,8 +578,9 @@ con.download("processed_reports", "./exports", ['.csv'])
  - 🔄 **Pipeline orchestration** with SQL and Python tasks
  - ⚡ **Fast data exploration** with DuckDB
  - 💾 **Delta table creation** with Spark-style API
- - **Schema evolution** and partitioning
- - �📤 **File downloads** from OneLake Files
+ - 🔀 **Schema evolution** and partitioning
+ - 📤 **File downloads** from OneLake Files
+ - 📊 **Semantic model deployment** with DirectLake
  
  ## Schema Evolution & Partitioning Guide
  
duckrun-0.2.9.dist-info/RECORD ADDED
@@ -0,0 +1,14 @@
+ duckrun/__init__.py,sha256=cTj6KQ6hKmgu1z7k9nhDcO5lct049luxjx1V0QnymCo,235
+ duckrun/auth.py,sha256=qPaLQ7InlV9leA9r6E6VEeYavFFoBi0zSN8m_l1aoQs,9545
+ duckrun/core.py,sha256=CrWMgA1QHvVF2AAlTlBlQ7VfKsuakcqZa4VuX2WJmik,39279
+ duckrun/files.py,sha256=Fvdjg3DyHJzIVzKo8M_j-eGz4zU61lOB38Y_onbQJkI,10137
+ duckrun/lakehouse.py,sha256=j--Z3zo8AOWt1GF9VzRosmmTAy6ey2D0LVubti58twU,14109
+ duckrun/runner.py,sha256=yrDxfy1RVkb8iK9GKGmIFZHzCvcO_0GVQlbng7Vw_iM,14171
+ duckrun/semantic_model.py,sha256=jmTrS15WmhU3rQfdpLII1wm3EORdQfqQxOhqOSyXB_w,17305
+ duckrun/stats.py,sha256=CXfb2DWF3PgOckelJooU0y-BAsNT9NFDfDYEmo0mUQQ,10473
+ duckrun/writer.py,sha256=svUuPCYOhrz299NgnpTKhARKjfej0PxnoND2iPDSypk,8098
+ duckrun-0.2.9.dist-info/licenses/LICENSE,sha256=-DeQQwdbCbkB4507ZF3QbocysB-EIjDtaLexvqRkGZc,1083
+ duckrun-0.2.9.dist-info/METADATA,sha256=T4hEXLJELzqhPWDJtez42co8bNbaNgAabywoxFW0hC4,20623
+ duckrun-0.2.9.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+ duckrun-0.2.9.dist-info/top_level.txt,sha256=BknMEwebbUHrVAp3SC92ps8MPhK7XSYsaogTvi_DmEU,8
+ duckrun-0.2.9.dist-info/RECORD,,
duckrun-0.2.7.dist-info/RECORD REMOVED
@@ -1,12 +0,0 @@
- duckrun/__init__.py,sha256=XA85pL2vK1AkmBic8e7WxeqNvcd6SjFX4zsQpImDO6E,230
- duckrun/core.py,sha256=Y4-5H83Xw0mZa12QM5pcC7qOPidrDFASLcGIoUW3zwY,39394
- duckrun/files.py,sha256=piWRU5w9jHrW-wuV4Gf-SKY_jhFv9eflxgWO8AZCQTI,10495
- duckrun/lakehouse.py,sha256=j--Z3zo8AOWt1GF9VzRosmmTAy6ey2D0LVubti58twU,14109
- duckrun/runner.py,sha256=XsQqWlesFD2cuhH2gsQj3Astg0XN7xhW15WPmr8D65I,13797
- duckrun/stats.py,sha256=2FTqoQNVjD84-H1HjStHxZkOpAGKXS79M55B00pOlok,9804
- duckrun/writer.py,sha256=eWrGtDQTbXi8H3sSt2WucYTdEQUjK97KmQxzCbqAuMs,6221
- duckrun-0.2.7.dist-info/licenses/LICENSE,sha256=-DeQQwdbCbkB4507ZF3QbocysB-EIjDtaLexvqRkGZc,1083
- duckrun-0.2.7.dist-info/METADATA,sha256=fIwgvoj3Hw4ByOcwCmG87zpLF0qnlzK8GAotup5km40,19272
- duckrun-0.2.7.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
- duckrun-0.2.7.dist-info/top_level.txt,sha256=BknMEwebbUHrVAp3SC92ps8MPhK7XSYsaogTvi_DmEU,8
- duckrun-0.2.7.dist-info/RECORD,,