duckrun 0.2.3__py3-none-any.whl → 0.2.5.dev1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
duckrun/__init__.py CHANGED
@@ -4,7 +4,7 @@ from duckrun.core import Duckrun
4
4
 
5
5
  __version__ = "0.1.0"
6
6
 
7
- # Expose connect at module level for: import duckrun as dr
7
+ # Expose unified connect method at module level
8
8
  connect = Duckrun.connect
9
9
 
10
10
  __all__ = ["Duckrun", "connect"]
duckrun/core.py CHANGED
@@ -49,17 +49,27 @@ class Duckrun:
49
49
  ]
50
50
  """
51
51
 
52
- def __init__(self, workspace: str, lakehouse_name: str, schema: str = "dbo",
52
+ def __init__(self, workspace_id: str, lakehouse_id: str, schema: str = "dbo",
53
53
  sql_folder: Optional[str] = None, compaction_threshold: int = 10,
54
54
  scan_all_schemas: bool = False, storage_account: str = "onelake"):
55
- self.workspace = workspace
56
- self.lakehouse_name = lakehouse_name
55
+ # Store GUIDs for internal use
56
+ self.workspace_id = workspace_id
57
+ self.lakehouse_id = lakehouse_id
57
58
  self.schema = schema
58
59
  self.sql_folder = sql_folder.strip() if sql_folder else None
59
60
  self.compaction_threshold = compaction_threshold
60
61
  self.scan_all_schemas = scan_all_schemas
61
62
  self.storage_account = storage_account
62
- self.table_base_url = f'abfss://{workspace}@{storage_account}.dfs.fabric.microsoft.com/{lakehouse_name}.Lakehouse/Tables/'
63
+
64
+ # Construct proper ABFSS URLs using GUIDs
65
+ # Both Tables and Files use lakehouse GUID directly (no .Lakehouse suffix)
66
+ self.table_base_url = f'abfss://{workspace_id}@{storage_account}.dfs.fabric.microsoft.com/{lakehouse_id}/Tables/'
67
+ self.files_base_url = f'abfss://{workspace_id}@{storage_account}.dfs.fabric.microsoft.com/{lakehouse_id}/Files/'
68
+
69
+ # Keep legacy properties for backward compatibility
70
+ self.workspace = workspace_id
71
+ self.lakehouse_name = lakehouse_id
72
+
63
73
  self.con = duckdb.connect()
64
74
  self.con.sql("SET preserve_insertion_order = false")
65
75
  self._attach_lakehouse()
@@ -68,59 +78,218 @@ class Duckrun:
68
78
  def connect(cls, connection_string: str, sql_folder: Optional[str] = None,
69
79
  compaction_threshold: int = 100, storage_account: str = "onelake"):
70
80
  """
71
- Create and connect to lakehouse.
81
+ Create and connect to lakehouse or workspace.
72
82
 
73
- Uses compact format: connect("ws/lh.lakehouse/schema") or connect("ws/lh.lakehouse")
83
+ Smart detection based on connection string format:
84
+ - "workspace" → workspace management only
85
+ - "ws/lh.lakehouse/schema" → full lakehouse connection
86
+ - "ws/lh.lakehouse" → lakehouse connection (defaults to dbo schema)
74
87
 
75
88
  Args:
76
- connection_string: OneLake path "ws/lh.lakehouse/schema" or "ws/lh.lakehouse"
77
- sql_folder: Optional path or URL to SQL files folder
89
+ connection_string: OneLake path or workspace name
90
+ sql_folder: Optional path or URL to SQL files folder
78
91
  compaction_threshold: File count threshold for compaction
79
92
  storage_account: Storage account name (default: "onelake")
80
93
 
81
94
  Examples:
82
- dr = Duckrun.connect("ws/lh.lakehouse/schema", sql_folder="./sql")
83
- dr = Duckrun.connect("ws/lh.lakehouse/schema") # no SQL folder
84
- dr = Duckrun.connect("ws/lh.lakehouse") # defaults to dbo schema
85
- dr = Duckrun.connect("ws/lh.lakehouse", storage_account="xxx-onelake") # custom storage
95
+ # Workspace management only (supports spaces in names)
96
+ ws = Duckrun.connect("My Workspace Name")
97
+ ws.list_lakehouses()
98
+ ws.create_lakehouse_if_not_exists("New Lakehouse")
99
+
100
+ # Full lakehouse connections (supports spaces in names)
101
+ dr = Duckrun.connect("My Workspace/My Lakehouse.lakehouse/schema", sql_folder="./sql")
102
+ dr = Duckrun.connect("Data Workspace/Sales Data.lakehouse/analytics") # spaces supported
103
+ dr = Duckrun.connect("My Workspace/My Lakehouse.lakehouse") # defaults to dbo schema
104
+ dr = Duckrun.connect("workspace/lakehouse.lakehouse", storage_account="xxx-onelake") # custom storage
105
+
106
+ Note:
107
+ Internally resolves friendly names (with spaces) to GUIDs and constructs proper ABFSS URLs:
108
+ "My Workspace/My Lakehouse.lakehouse/schema" becomes
109
+ "abfss://workspace_guid@onelake.dfs.fabric.microsoft.com/lakehouse_guid/Tables/schema"
86
110
  """
111
+
112
+ # Check if it's a workspace-only connection (no "/" means workspace name only)
113
+ if "/" not in connection_string:
114
+ print(f"Connecting to workspace '{connection_string}' for management operations...")
115
+ return WorkspaceConnection(connection_string)
116
+
87
117
  print("Connecting to Lakehouse...")
88
118
 
89
119
  scan_all_schemas = False
90
120
 
91
- # Only support compact format: "ws/lh.lakehouse/schema" or "ws/lh.lakehouse"
92
- if not connection_string or "/" not in connection_string:
93
- raise ValueError(
94
- "Invalid connection string format. "
95
- "Expected format: 'workspace/lakehouse.lakehouse/schema' or 'workspace/lakehouse.lakehouse'"
96
- )
97
-
121
+ # Parse lakehouse connection string: "ws/lh.lakehouse/schema" or "ws/lh.lakehouse"
122
+ # Support workspace and lakehouse names with spaces
98
123
  parts = connection_string.split("/")
99
124
  if len(parts) == 2:
100
- workspace, lakehouse_name = parts
125
+ workspace_name, lakehouse_name = parts
101
126
  scan_all_schemas = True
102
127
  schema = "dbo"
103
- print(f"ℹ️ No schema specified. Using default schema 'dbo' for operations.")
104
- print(f" Scanning all schemas for table discovery...\n")
105
128
  elif len(parts) == 3:
106
- workspace, lakehouse_name, schema = parts
129
+ workspace_name, lakehouse_name, schema = parts
107
130
  else:
108
131
  raise ValueError(
109
132
  f"Invalid connection string format: '{connection_string}'. "
110
- "Expected format: 'workspace/lakehouse.lakehouse' or 'workspace/lakehouse.lakehouse/schema'"
133
+ "Expected formats:\n"
134
+ " 'workspace name' (workspace management only)\n"
135
+ " 'workspace name/lakehouse name.lakehouse' (lakehouse with dbo schema)\n"
136
+ " 'workspace name/lakehouse name.lakehouse/schema' (lakehouse with specific schema)"
111
137
  )
112
138
 
113
139
  if lakehouse_name.endswith(".lakehouse"):
114
140
  lakehouse_name = lakehouse_name[:-10]
115
141
 
116
- if not workspace or not lakehouse_name:
142
+ if not workspace_name or not lakehouse_name:
117
143
  raise ValueError(
118
- "Missing required parameters. Use compact format:\n"
119
- " connect('workspace/lakehouse.lakehouse/schema', 'sql_folder')\n"
120
- " connect('workspace/lakehouse.lakehouse') # defaults to dbo"
144
+ "Missing required parameters. Use one of these formats:\n"
145
+ " connect('workspace name') # workspace management\n"
146
+ " connect('workspace name/lakehouse name.lakehouse/schema') # full lakehouse\n"
147
+ " connect('workspace name/lakehouse name.lakehouse') # defaults to dbo"
121
148
  )
122
149
 
123
- return cls(workspace, lakehouse_name, schema, sql_folder, compaction_threshold, scan_all_schemas, storage_account)
150
+ # Resolve friendly names to GUIDs and construct proper ABFSS path
151
+ workspace_id, lakehouse_id = cls._resolve_names_to_guids(workspace_name, lakehouse_name)
152
+
153
+ return cls(workspace_id, lakehouse_id, schema, sql_folder, compaction_threshold, scan_all_schemas, storage_account)
154
+
155
+ @classmethod
156
+ def _resolve_names_to_guids(cls, workspace_name: str, lakehouse_name: str) -> tuple[str, str]:
157
+ """
158
+ Resolve friendly workspace and lakehouse names to their GUIDs.
159
+
160
+ Optimization: If names don't contain spaces, use them directly (no API calls needed).
161
+ Only resolve to GUIDs when names contain spaces or are already GUIDs.
162
+
163
+ Args:
164
+ workspace_name: Display name of the workspace (can contain spaces)
165
+ lakehouse_name: Display name of the lakehouse (can contain spaces)
166
+
167
+ Returns:
168
+ Tuple of (workspace_id, lakehouse_id) - either resolved GUIDs or original names
169
+ """
170
+
171
+ # Check if names are already GUIDs first
172
+ import re
173
+ guid_pattern = re.compile(r'^[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}$', re.IGNORECASE)
174
+
175
+ if guid_pattern.match(workspace_name) and guid_pattern.match(lakehouse_name):
176
+ print(f"✅ Names are already GUIDs: workspace={workspace_name}, lakehouse={lakehouse_name}")
177
+ return workspace_name, lakehouse_name
178
+
179
+ # Optimization: If workspace name has no spaces, use both names directly (old behavior)
180
+ # Note: Lakehouse names cannot contain spaces in Microsoft Fabric, only workspace names can
181
+ if " " not in workspace_name:
182
+ print(f"✅ Using names directly (workspace has no spaces): workspace={workspace_name}, lakehouse={lakehouse_name}")
183
+ return workspace_name, lakehouse_name
184
+
185
+ # Workspace name contains spaces - need to resolve both to GUIDs for proper ABFSS URLs
186
+ print(f"🔍 Resolving '{workspace_name}' workspace and '{lakehouse_name}' lakehouse to GUIDs (workspace has spaces)...")
187
+
188
+ try:
189
+ # Get authentication token (try notebook environment first, then azure-identity)
190
+ try:
191
+ import notebookutils # type: ignore
192
+ token = notebookutils.credentials.getToken("pbi")
193
+ current_workspace_id = notebookutils.runtime.context.get("workspaceId")
194
+ except ImportError:
195
+ current_workspace_id = None
196
+ # Fallback to azure-identity for external environments
197
+ from azure.identity import AzureCliCredential, InteractiveBrowserCredential, ChainedTokenCredential
198
+ credential = ChainedTokenCredential(AzureCliCredential(), InteractiveBrowserCredential())
199
+ token_obj = credential.get_token("https://api.fabric.microsoft.com/.default")
200
+ token = token_obj.token
201
+
202
+ # Resolve workspace name to ID
203
+ if current_workspace_id:
204
+ # In notebook environment, we could use current workspace ID
205
+ # but we should validate it matches the requested workspace name
206
+ workspace_id = cls._resolve_workspace_id_by_name(token, workspace_name)
207
+ if not workspace_id:
208
+ # Fallback to current workspace if name resolution fails
209
+ print(f"⚠️ Could not validate workspace name '{workspace_name}', using current workspace")
210
+ workspace_id = current_workspace_id
211
+ else:
212
+ # External environment - must resolve by name
213
+ workspace_id = cls._resolve_workspace_id_by_name(token, workspace_name)
214
+ if not workspace_id:
215
+ raise ValueError(f"Workspace '{workspace_name}' not found")
216
+
217
+ # Resolve lakehouse name to ID (required for ABFSS URLs with spaces)
218
+ lakehouse_id = cls._resolve_lakehouse_id_by_name(token, workspace_id, lakehouse_name)
219
+ if not lakehouse_id:
220
+ raise ValueError(f"Lakehouse '{lakehouse_name}' not found in workspace '{workspace_name}'")
221
+
222
+ print(f"✅ Resolved: {workspace_name} → {workspace_id}, {lakehouse_name} → {lakehouse_id}")
223
+ return workspace_id, lakehouse_id
224
+
225
+ except Exception as e:
226
+ print(f"❌ Failed to resolve names to GUIDs: {e}")
227
+ print(f"❌ Cannot use friendly names with spaces '{workspace_name}'/'{lakehouse_name}' in ABFSS URLs without GUID resolution")
228
+ print("❌ Microsoft Fabric requires actual workspace and lakehouse GUIDs for ABFSS access when names contain spaces")
229
+ raise ValueError(
230
+ f"Unable to resolve workspace '{workspace_name}' and lakehouse '{lakehouse_name}' to GUIDs. "
231
+ f"ABFSS URLs require actual GUIDs when names contain spaces. "
232
+ f"Please ensure you have proper authentication and the workspace/lakehouse names are correct."
233
+ )
234
+
235
+ @classmethod
236
+ def _resolve_workspace_id_by_name(cls, token: str, workspace_name: str) -> Optional[str]:
237
+ """Get workspace ID from display name"""
238
+ try:
239
+ import requests
240
+ url = "https://api.fabric.microsoft.com/v1/workspaces"
241
+ headers = {"Authorization": f"Bearer {token}", "Content-Type": "application/json"}
242
+
243
+ response = requests.get(url, headers=headers)
244
+ response.raise_for_status()
245
+
246
+ workspaces = response.json().get("value", [])
247
+ for workspace in workspaces:
248
+ if workspace.get("displayName") == workspace_name:
249
+ return workspace.get("id")
250
+
251
+ return None
252
+ except Exception:
253
+ return None
254
+
255
+ @classmethod
256
+ def _resolve_lakehouse_id_by_name(cls, token: str, workspace_id: str, lakehouse_name: str) -> Optional[str]:
257
+ """Get lakehouse ID from display name within a workspace"""
258
+ try:
259
+ import requests
260
+ url = f"https://api.fabric.microsoft.com/v1/workspaces/{workspace_id}/lakehouses"
261
+ headers = {"Authorization": f"Bearer {token}", "Content-Type": "application/json"}
262
+
263
+ response = requests.get(url, headers=headers)
264
+ response.raise_for_status()
265
+
266
+ lakehouses = response.json().get("value", [])
267
+ for lakehouse in lakehouses:
268
+ if lakehouse.get("displayName") == lakehouse_name:
269
+ return lakehouse.get("id")
270
+
271
+ return None
272
+ except Exception:
273
+ return None
274
+
275
+ @classmethod
276
+ def connect_workspace(cls, workspace_name: str):
277
+ """
278
+ Connect to a workspace without a specific lakehouse.
279
+ Used for lakehouse management operations.
280
+
281
+ Args:
282
+ workspace_name: Name of the workspace
283
+
284
+ Returns:
285
+ WorkspaceConnection object with lakehouse management methods
286
+
287
+ Example:
288
+ con = duckrun.connect_workspace("MyWorkspace")
289
+ con.list_lakehouses()
290
+ con.create_lakehouse_if_not_exists("newlakehouse")
291
+ """
292
+ return WorkspaceConnection(workspace_name)
124
293
 
125
294
  def _get_storage_token(self):
126
295
  return os.environ.get("AZURE_STORAGE_TOKEN", "PLACEHOLDER_TOKEN_TOKEN_NOT_AVAILABLE")
@@ -157,21 +326,18 @@ class Duckrun:
157
326
  url = f"abfss://{self.workspace}@{self.storage_account}.dfs.fabric.microsoft.com/"
158
327
  store = AzureStore.from_url(url, bearer_token=token)
159
328
 
160
- base_path = f"{self.lakehouse_name}.Lakehouse/Tables/"
329
+ base_path = f"{self.lakehouse_name}/Tables/"
161
330
  tables_found = []
162
331
 
163
332
  if self.scan_all_schemas:
164
333
  # Discover all schemas first
165
- print("🔍 Discovering schemas...")
166
334
  schemas_result = obs.list_with_delimiter(store, prefix=base_path)
167
335
  schemas = [
168
336
  prefix.rstrip('/').split('/')[-1]
169
337
  for prefix in schemas_result['common_prefixes']
170
338
  ]
171
- print(f" Found {len(schemas)} schemas: {', '.join(schemas)}\n")
172
339
 
173
340
  # Discover tables in each schema
174
- print("🔍 Discovering tables...")
175
341
  for schema_name in schemas:
176
342
  schema_path = f"{base_path}{schema_name}/"
177
343
  result = obs.list_with_delimiter(store, prefix=schema_path)
@@ -203,14 +369,27 @@ class Duckrun:
203
369
 
204
370
  if not tables:
205
371
  if self.scan_all_schemas:
206
- print(f"No Delta tables found in {self.lakehouse_name}.Lakehouse/Tables/")
372
+ print(f"No Delta tables found in {self.lakehouse_name}/Tables/")
207
373
  else:
208
- print(f"No Delta tables found in {self.lakehouse_name}.Lakehouse/Tables/{self.schema}/")
374
+ print(f"No Delta tables found in {self.lakehouse_name}/Tables/{self.schema}/")
209
375
  return
210
376
 
211
- print(f"\n📊 Found {len(tables)} Delta tables. Attaching as views...\n")
377
+ # Group tables by schema for display
378
+ schema_tables = {}
379
+ for schema_name, table_name in tables:
380
+ if schema_name not in schema_tables:
381
+ schema_tables[schema_name] = []
382
+ schema_tables[schema_name].append(table_name)
383
+
384
+ # Display tables by schema
385
+ print(f"\n📊 Found {len(tables)} tables:")
386
+ for schema_name in sorted(schema_tables.keys()):
387
+ table_list = sorted(schema_tables[schema_name])
388
+ print(f" {schema_name}: {', '.join(table_list)}")
212
389
 
213
390
  attached_count = 0
391
+ skipped_tables = []
392
+
214
393
  for schema_name, table_name in tables:
215
394
  try:
216
395
  if self.scan_all_schemas:
@@ -225,19 +404,16 @@ class Duckrun:
225
404
  CREATE OR REPLACE VIEW {view_name}
226
405
  AS SELECT * FROM delta_scan('{self.table_base_url}{schema_name}/{table_name}');
227
406
  """)
228
- print(f" ✓ Attached: {schema_name}.{table_name} → {view_name}")
229
407
  attached_count += 1
230
408
  except Exception as e:
231
- print(f" ⚠ Skipped {schema_name}.{table_name}: {str(e)[:100]}")
409
+ skipped_tables.append(f"{schema_name}.{table_name}")
232
410
  continue
233
411
 
234
412
  print(f"\n{'='*60}")
235
- print(f"✅ Successfully attached {attached_count}/{len(tables)} tables")
413
+ print(f"✅ Ready - {attached_count}/{len(tables)} tables available")
414
+ if skipped_tables:
415
+ print(f"⚠ Skipped {len(skipped_tables)} tables: {', '.join(skipped_tables[:3])}{'...' if len(skipped_tables) > 3 else ''}")
236
416
  print(f"{'='*60}\n")
237
-
238
- if self.scan_all_schemas:
239
- print(f"\n💡 Note: Tables use schema.table format (e.g., aemo.calendar, dbo.results)")
240
- print(f" Default schema for operations: {self.schema}\n")
241
417
 
242
418
  except Exception as e:
243
419
  print(f"❌ Error attaching lakehouse: {e}")
@@ -353,8 +529,268 @@ class Duckrun:
353
529
  """
354
530
  return _get_stats(self, source)
355
531
 
532
+ def list_lakehouses(self) -> List[str]:
533
+ """
534
+ List all lakehouses in the current workspace.
535
+
536
+ Returns:
537
+ List of lakehouse names
538
+ """
539
+ try:
540
+ # Try to get token from notebook environment first
541
+ try:
542
+ import notebookutils # type: ignore
543
+ token = notebookutils.credentials.getToken("pbi")
544
+ workspace_id = notebookutils.runtime.context.get("workspaceId")
545
+ except ImportError:
546
+ # Fallback to azure-identity
547
+ print("Getting authentication token...")
548
+ from azure.identity import AzureCliCredential, InteractiveBrowserCredential, ChainedTokenCredential
549
+ credential = ChainedTokenCredential(AzureCliCredential(), InteractiveBrowserCredential())
550
+ token_obj = credential.get_token("https://api.fabric.microsoft.com/.default")
551
+ token = token_obj.token
552
+
553
+ # Get workspace ID by name
554
+ workspace_id = self._get_workspace_id_by_name(token, self.workspace)
555
+ if not workspace_id:
556
+ print(f"Workspace '{self.workspace}' not found")
557
+ return []
558
+
559
+ # List lakehouses
560
+ url = f"https://api.fabric.microsoft.com/v1/workspaces/{workspace_id}/lakehouses"
561
+ headers = {"Authorization": f"Bearer {token}", "Content-Type": "application/json"}
562
+
563
+ response = requests.get(url, headers=headers)
564
+ response.raise_for_status()
565
+
566
+ lakehouses = response.json().get("value", [])
567
+ lakehouse_names = [lh.get("displayName", "") for lh in lakehouses]
568
+
569
+ print(f"Found {len(lakehouse_names)} lakehouses: {lakehouse_names}")
570
+ return lakehouse_names
571
+
572
+ except Exception as e:
573
+ print(f"Error listing lakehouses: {e}")
574
+ return []
575
+
576
+ def create_lakehouse_if_not_exists(self, lakehouse_name: str) -> bool:
577
+ """
578
+ Create a lakehouse if it doesn't already exist.
579
+
580
+ Args:
581
+ lakehouse_name: Name of the lakehouse to create
582
+
583
+ Returns:
584
+ True if lakehouse exists or was created successfully, False otherwise
585
+ """
586
+ try:
587
+ # Try to get token from notebook environment first
588
+ try:
589
+ import notebookutils # type: ignore
590
+ token = notebookutils.credentials.getToken("pbi")
591
+ workspace_id = notebookutils.runtime.context.get("workspaceId")
592
+ except ImportError:
593
+ # Fallback to azure-identity
594
+ print("Getting authentication token...")
595
+ from azure.identity import AzureCliCredential, InteractiveBrowserCredential, ChainedTokenCredential
596
+ credential = ChainedTokenCredential(AzureCliCredential(), InteractiveBrowserCredential())
597
+ token_obj = credential.get_token("https://api.fabric.microsoft.com/.default")
598
+ token = token_obj.token
599
+
600
+ # Get workspace ID by name
601
+ workspace_id = self._get_workspace_id_by_name(token, self.workspace)
602
+ if not workspace_id:
603
+ print(f"Workspace '{self.workspace}' not found")
604
+ return False
605
+
606
+ # Check if lakehouse already exists
607
+ url = f"https://api.fabric.microsoft.com/v1/workspaces/{workspace_id}/lakehouses"
608
+ headers = {"Authorization": f"Bearer {token}", "Content-Type": "application/json"}
609
+
610
+ response = requests.get(url, headers=headers)
611
+ response.raise_for_status()
612
+
613
+ lakehouses = response.json().get("value", [])
614
+ existing_names = [lh.get("displayName", "") for lh in lakehouses]
615
+
616
+ if lakehouse_name in existing_names:
617
+ print(f"Lakehouse '{lakehouse_name}' already exists")
618
+ return True
619
+
620
+ # Create lakehouse
621
+ print(f"Creating lakehouse '{lakehouse_name}'...")
622
+ payload = {
623
+ "displayName": lakehouse_name,
624
+ "description": f"Lakehouse {lakehouse_name} created via duckrun"
625
+ }
626
+
627
+ response = requests.post(url, headers=headers, json=payload)
628
+ response.raise_for_status()
629
+
630
+ print(f"✅ Lakehouse '{lakehouse_name}' created successfully")
631
+ return True
632
+
633
+ except Exception as e:
634
+ print(f"❌ Error creating lakehouse '{lakehouse_name}': {e}")
635
+ return False
636
+
637
+ def _get_workspace_id_by_name(self, token: str, workspace_name: str) -> Optional[str]:
638
+ """Helper method to get workspace ID from name"""
639
+ try:
640
+ url = "https://api.fabric.microsoft.com/v1/workspaces"
641
+ headers = {"Authorization": f"Bearer {token}", "Content-Type": "application/json"}
642
+
643
+ response = requests.get(url, headers=headers)
644
+ response.raise_for_status()
645
+
646
+ workspaces = response.json().get("value", [])
647
+ for workspace in workspaces:
648
+ if workspace.get("displayName") == workspace_name:
649
+ return workspace.get("id")
650
+
651
+ return None
652
+
653
+ except Exception:
654
+ return None
655
+
356
656
  def close(self):
357
657
  """Close DuckDB connection"""
358
658
  if self.con:
359
659
  self.con.close()
360
- print("Connection closed")
660
+ print("Connection closed")
661
+
662
+
663
+ class WorkspaceConnection:
664
+ """
665
+ Simple workspace connection for lakehouse management operations.
666
+ """
667
+
668
+ def __init__(self, workspace_name: str):
669
+ self.workspace_name = workspace_name
670
+
671
+ def list_lakehouses(self) -> List[str]:
672
+ """
673
+ List all lakehouses in the workspace.
674
+
675
+ Returns:
676
+ List of lakehouse names
677
+ """
678
+ try:
679
+ # Try to get token from notebook environment first
680
+ try:
681
+ import notebookutils # type: ignore
682
+ token = notebookutils.credentials.getToken("pbi")
683
+ workspace_id = notebookutils.runtime.context.get("workspaceId")
684
+ except ImportError:
685
+ # Fallback to azure-identity
686
+ print("Getting authentication token...")
687
+ from azure.identity import AzureCliCredential, InteractiveBrowserCredential, ChainedTokenCredential
688
+ credential = ChainedTokenCredential(AzureCliCredential(), InteractiveBrowserCredential())
689
+ token_obj = credential.get_token("https://api.fabric.microsoft.com/.default")
690
+ token = token_obj.token
691
+
692
+ # Get workspace ID by name
693
+ workspace_id = self._get_workspace_id_by_name(token, self.workspace_name)
694
+ if not workspace_id:
695
+ print(f"Workspace '{self.workspace_name}' not found")
696
+ return []
697
+
698
+ # List lakehouses
699
+ url = f"https://api.fabric.microsoft.com/v1/workspaces/{workspace_id}/lakehouses"
700
+ headers = {"Authorization": f"Bearer {token}", "Content-Type": "application/json"}
701
+
702
+ response = requests.get(url, headers=headers)
703
+ response.raise_for_status()
704
+
705
+ lakehouses = response.json().get("value", [])
706
+ lakehouse_names = [lh.get("displayName", "") for lh in lakehouses]
707
+
708
+ print(f"Found {len(lakehouse_names)} lakehouses: {lakehouse_names}")
709
+ return lakehouse_names
710
+
711
+ except Exception as e:
712
+ print(f"Error listing lakehouses: {e}")
713
+ return []
714
+
715
+ def create_lakehouse_if_not_exists(self, lakehouse_name: str) -> bool:
716
+ """
717
+ Create a lakehouse if it doesn't already exist.
718
+
719
+ Args:
720
+ lakehouse_name: Name of the lakehouse to create
721
+
722
+ Returns:
723
+ True if lakehouse exists or was created successfully, False otherwise
724
+ """
725
+ try:
726
+ # Try to get token from notebook environment first
727
+ try:
728
+ import notebookutils # type: ignore
729
+ token = notebookutils.credentials.getToken("pbi")
730
+ workspace_id = notebookutils.runtime.context.get("workspaceId")
731
+ except ImportError:
732
+ # Fallback to azure-identity
733
+ print("Getting authentication token...")
734
+ from azure.identity import AzureCliCredential, InteractiveBrowserCredential, ChainedTokenCredential
735
+ credential = ChainedTokenCredential(AzureCliCredential(), InteractiveBrowserCredential())
736
+ token_obj = credential.get_token("https://api.fabric.microsoft.com/.default")
737
+ token = token_obj.token
738
+
739
+ # Get workspace ID by name
740
+ workspace_id = self._get_workspace_id_by_name(token, self.workspace_name)
741
+ if not workspace_id:
742
+ print(f"Workspace '{self.workspace_name}' not found")
743
+ return False
744
+
745
+ # Check if lakehouse already exists
746
+ url = f"https://api.fabric.microsoft.com/v1/workspaces/{workspace_id}/lakehouses"
747
+ headers = {"Authorization": f"Bearer {token}", "Content-Type": "application/json"}
748
+
749
+ response = requests.get(url, headers=headers)
750
+ response.raise_for_status()
751
+
752
+ lakehouses = response.json().get("value", [])
753
+ existing_names = [lh.get("displayName", "") for lh in lakehouses]
754
+
755
+ if lakehouse_name in existing_names:
756
+ print(f"Lakehouse '{lakehouse_name}' already exists")
757
+ return True
758
+
759
+ # Create lakehouse
760
+ print(f"Creating lakehouse '{lakehouse_name}'...")
761
+ payload = {
762
+ "displayName": lakehouse_name,
763
+ "description": f"Lakehouse {lakehouse_name} created via duckrun",
764
+ "creationPayload": {
765
+ "enableSchemas": True
766
+ }
767
+ }
768
+
769
+ response = requests.post(url, headers=headers, json=payload)
770
+ response.raise_for_status()
771
+
772
+ print(f"✅ Lakehouse '{lakehouse_name}' created successfully")
773
+ return True
774
+
775
+ except Exception as e:
776
+ print(f"❌ Error creating lakehouse '{lakehouse_name}': {e}")
777
+ return False
778
+
779
+ def _get_workspace_id_by_name(self, token: str, workspace_name: str) -> Optional[str]:
780
+ """Helper method to get workspace ID from name"""
781
+ try:
782
+ url = "https://api.fabric.microsoft.com/v1/workspaces"
783
+ headers = {"Authorization": f"Bearer {token}", "Content-Type": "application/json"}
784
+
785
+ response = requests.get(url, headers=headers)
786
+ response.raise_for_status()
787
+
788
+ workspaces = response.json().get("value", [])
789
+ for workspace in workspaces:
790
+ if workspace.get("displayName") == workspace_name:
791
+ return workspace.get("id")
792
+
793
+ return None
794
+
795
+ except Exception:
796
+ return None
duckrun/files.py CHANGED
@@ -51,8 +51,8 @@ def copy(duckrun_instance, local_folder: str, remote_folder: str,
51
51
  token = token_obj.token
52
52
  os.environ["AZURE_STORAGE_TOKEN"] = token
53
53
 
54
- # Setup OneLake Files URL (not Tables)
55
- files_base_url = f'abfss://{duckrun_instance.workspace}@{duckrun_instance.storage_account}.dfs.fabric.microsoft.com/{duckrun_instance.lakehouse_name}.Lakehouse/Files/'
54
+ # Setup OneLake Files URL (use correct format without .Lakehouse suffix)
55
+ files_base_url = duckrun_instance.files_base_url
56
56
  store = AzureStore.from_url(files_base_url, bearer_token=token)
57
57
 
58
58
  # Collect files to upload
@@ -160,8 +160,8 @@ def download(duckrun_instance, remote_folder: str = "", local_folder: str = "./d
160
160
  token = token_obj.token
161
161
  os.environ["AZURE_STORAGE_TOKEN"] = token
162
162
 
163
- # Setup OneLake Files URL (not Tables)
164
- files_base_url = f'abfss://{duckrun_instance.workspace}@{duckrun_instance.storage_account}.dfs.fabric.microsoft.com/{duckrun_instance.lakehouse_name}.Lakehouse/Files/'
163
+ # Setup OneLake Files URL (use correct format without .Lakehouse suffix)
164
+ files_base_url = duckrun_instance.files_base_url
165
165
  store = AzureStore.from_url(files_base_url, bearer_token=token)
166
166
 
167
167
  # Create local directory
duckrun/lakehouse.py ADDED
@@ -0,0 +1,402 @@
1
+ import requests
2
+ import time
3
+ from typing import Optional
4
+
5
class FabricLakehouseManager:
    """
    Manage Microsoft Fabric Lakehouses using REST API only.
    Works on any machine with Python and internet access.
    """

    # Seconds before an API call is abandoned.  The original implementation
    # issued requests without any timeout, which can hang forever on a
    # stalled connection.
    REQUEST_TIMEOUT = 30

    def __init__(self, access_token: str):
        """
        Initialize with Azure AD access token.

        Args:
            access_token: Bearer token for Fabric API authentication
        """
        self.base_url = "https://api.fabric.microsoft.com/v1"
        self.headers = {
            "Authorization": f"Bearer {access_token}",
            "Content-Type": "application/json"
        }

    def get_workspace_id(self, workspace_name: str) -> Optional[str]:
        """
        Get workspace ID from workspace name.

        Args:
            workspace_name: Name of the workspace

        Returns:
            Workspace ID if found, None otherwise (also None on any API error)
        """
        if not workspace_name:
            return None

        try:
            url = f"{self.base_url}/workspaces"
            response = requests.get(url, headers=self.headers,
                                    timeout=self.REQUEST_TIMEOUT)
            response.raise_for_status()

            workspaces = response.json().get("value", [])
            for workspace in workspaces:
                if workspace.get("displayName") == workspace_name:
                    return workspace.get("id")

            print(f"Workspace '{workspace_name}' not found")
            return None

        except Exception as e:
            print(f"Error getting workspace ID: {e}")
            return None

    def get_lakehouse(self, lakehouse_name: str, workspace_id: str) -> Optional[dict]:
        """
        Get lakehouse details if it exists.

        Args:
            lakehouse_name: Name of the lakehouse
            workspace_id: ID of the workspace

        Returns:
            Lakehouse details dict if found, None otherwise (also on API error)
        """
        try:
            url = f"{self.base_url}/workspaces/{workspace_id}/lakehouses"
            response = requests.get(url, headers=self.headers,
                                    timeout=self.REQUEST_TIMEOUT)
            response.raise_for_status()

            lakehouses = response.json().get("value", [])
            for lakehouse in lakehouses:
                if lakehouse.get("displayName") == lakehouse_name:
                    return lakehouse

            return None

        except Exception as e:
            print(f"Error getting lakehouse: {e}")
            return None

    def create_lakehouse(self, lakehouse_name: str, workspace_id: str,
                         enable_schemas: bool = True) -> Optional[dict]:
        """
        Create a new lakehouse.

        Args:
            lakehouse_name: Name of the lakehouse
            workspace_id: ID of the workspace
            enable_schemas: Whether to enable schemas

        Returns:
            Created lakehouse details if successful, None otherwise
        """
        try:
            url = f"{self.base_url}/workspaces/{workspace_id}/lakehouses"
            payload = {
                "displayName": lakehouse_name,
                "description": f"Lakehouse {lakehouse_name}"
            }

            if enable_schemas:
                payload["creationPayload"] = {
                    "enableSchemas": True
                }

            response = requests.post(url, headers=self.headers, json=payload,
                                     timeout=self.REQUEST_TIMEOUT)
            response.raise_for_status()

            # Wait a bit for the lakehouse to be fully provisioned before
            # callers try to use it.
            time.sleep(2)

            return response.json()

        except Exception as e:
            print(f"Error creating lakehouse: {e}")
            # requests exceptions carry the server reply; surface it for debugging.
            if hasattr(e, 'response') and e.response is not None:
                print(f"Response: {e.response.text}")
            return None

    def create_lakehouse_if_not_exists(self, lakehouse_name: str,
                                       workspace_name: Optional[str] = None,
                                       workspace_id: Optional[str] = None) -> int:
        """
        Create a lakehouse if it doesn't exist.

        Args:
            lakehouse_name: Name of the lakehouse
            workspace_name: Optional workspace name
            workspace_id: Optional workspace ID (takes precedence over workspace_name)

        Returns:
            1 if successful (lakehouse exists or was created)
            0 if failed
        """
        # Resolve workspace ID; an explicit id wins over a name lookup.
        if workspace_id is None and workspace_name:
            workspace_id = self.get_workspace_id(workspace_name)
            if workspace_id is None:
                print(f"Workspace '{workspace_name}' not found - returning 0")
                return 0
        elif workspace_id is None:
            print("No workspace specified - returning 0")
            return 0

        print(f"Attempting to get lakehouse '{lakehouse_name}' in workspace '{workspace_id}'")

        # Check if lakehouse exists
        lakehouse = self.get_lakehouse(lakehouse_name, workspace_id)

        if lakehouse:
            print(f"Lakehouse '{lakehouse_name}' found - returning 1")
            return 1

        # Create lakehouse if it doesn't exist
        print("Lakehouse not found, attempting to create...")
        created = self.create_lakehouse(lakehouse_name, workspace_id)

        if created:
            # Re-read to verify the lakehouse is actually visible.
            lakehouse = self.get_lakehouse(lakehouse_name, workspace_id)
            if lakehouse:
                print(f"Lakehouse '{lakehouse_name}' created successfully - returning 1")
                return 1

        print(f"Failed to create lakehouse '{lakehouse_name}' - returning 0")
        return 0
167
+
168
+
169
+ # Example usage with Azure Identity:
170
def main():
    """
    Example of how to use the FabricLakehouseManager with azure-identity.
    """
    from azure.identity import AzureCliCredential, InteractiveBrowserCredential, ChainedTokenCredential

    print("Authenticating with Azure (trying CLI, will fallback to browser if needed)...")

    # Try the Azure CLI credential first; fall back to interactive browser login.
    chain = ChainedTokenCredential(
        AzureCliCredential(),
        InteractiveBrowserCredential()
    )

    # Request a token for the Fabric API scope (not the storage scope!).
    fabric_token = chain.get_token("https://api.fabric.microsoft.com/.default")

    print("✓ Authentication successful!")

    # Drive the lakehouse workflow with the Fabric token.
    manager = FabricLakehouseManager(fabric_token.token)

    outcome = manager.create_lakehouse_if_not_exists(
        lakehouse_name="MyLakehouse",
        workspace_name="MyWorkspace"
    )

    if outcome == 1:
        print("✓ Lakehouse operation successful!")
    else:
        print("✗ Lakehouse operation failed!")

    return outcome
205
+
206
+
207
def get_fabric_token():
    """
    Helper function to get Fabric API token.
    Returns the token string.
    """
    from azure.identity import AzureCliCredential, InteractiveBrowserCredential, ChainedTokenCredential

    # CLI credential first, interactive browser as the fallback.
    chain = ChainedTokenCredential(
        AzureCliCredential(),
        InteractiveBrowserCredential()
    )

    # Token scoped to the Fabric API.
    return chain.get_token("https://api.fabric.microsoft.com/.default").token
222
+
223
+
224
def create_lakehouse_in_notebook(lakehouse_name: str, workspace_name: Optional[str] = None) -> int:
    """
    Create a lakehouse in a Fabric notebook environment.
    This function uses the notebook's built-in authentication.

    Args:
        lakehouse_name: Name of the lakehouse to create
        workspace_name: Optional workspace name (uses current workspace if None)

    Returns:
        1 if successful (lakehouse exists or was created)
        0 if failed
    """
    try:
        # Try to import fabric notebook utilities (only available in Fabric notebooks)
        import notebookutils  # type: ignore

        # Get authentication token from notebook environment
        token = notebookutils.credentials.getToken("https://api.fabric.microsoft.com/.default")

        # Initialize manager with notebook token
        manager = FabricLakehouseManager(token)

        # Get current workspace ID if no workspace specified
        workspace_id = None
        if workspace_name:
            workspace_id = manager.get_workspace_id(workspace_name)
        else:
            # In Fabric notebooks, we can get the current workspace from context
            try:
                workspace_id = notebookutils.runtime.context.get("workspaceId")
            except Exception:
                # was a bare `except:` which would also swallow
                # KeyboardInterrupt/SystemExit
                print("Could not get current workspace ID from notebook context")
                return 0

        if not workspace_id:
            print("Could not resolve workspace ID")
            return 0

        # Create lakehouse if not exists
        return manager.create_lakehouse_if_not_exists(
            lakehouse_name=lakehouse_name,
            workspace_id=workspace_id
        )

    except ImportError:
        print("notebookutils not available - not running in Fabric notebook environment")
        print("Use FabricLakehouseManager class directly with proper authentication")
        return 0
    except Exception as e:
        print(f"Error creating lakehouse in notebook: {e}")
        return 0
276
+
277
+
278
def create_lakehouse_simple(lakehouse_name: str, access_token: str, workspace_id: str) -> dict:
    """
    Simple function to create a lakehouse with minimal dependencies.
    Perfect for Fabric notebook environments.

    Args:
        lakehouse_name: Name of the lakehouse to create
        access_token: Bearer token for authentication
        workspace_id: ID of the target workspace

    Returns:
        Dictionary with keys: success (bool), message (str),
        lakehouse (dict or None), created (bool)
    """
    # Local imports keep the function copy-paste friendly in notebooks.
    import requests
    import time

    base_url = "https://api.fabric.microsoft.com/v1"
    headers = {
        "Authorization": f"Bearer {access_token}",
        "Content-Type": "application/json"
    }
    # Added: the original issued requests with no timeout, risking an
    # indefinite hang on a stalled connection.
    timeout = 30

    try:
        # First check if lakehouse already exists
        list_url = f"{base_url}/workspaces/{workspace_id}/lakehouses"
        response = requests.get(list_url, headers=headers, timeout=timeout)
        response.raise_for_status()

        lakehouses = response.json().get("value", [])
        for lakehouse in lakehouses:
            if lakehouse.get("displayName") == lakehouse_name:
                return {
                    "success": True,
                    "message": f"Lakehouse '{lakehouse_name}' already exists",
                    "lakehouse": lakehouse,
                    "created": False
                }

        # Create new lakehouse
        create_url = f"{base_url}/workspaces/{workspace_id}/lakehouses"
        payload = {
            "displayName": lakehouse_name,
            "description": f"Lakehouse {lakehouse_name} created via API"
        }

        response = requests.post(create_url, headers=headers, json=payload, timeout=timeout)
        response.raise_for_status()

        # Wait for provisioning
        time.sleep(3)

        created_lakehouse = response.json()
        return {
            "success": True,
            "message": f"Lakehouse '{lakehouse_name}' created successfully",
            "lakehouse": created_lakehouse,
            "created": True
        }

    except requests.exceptions.RequestException as e:
        error_msg = f"HTTP error creating lakehouse: {e}"
        # Include the server reply when available to aid debugging.
        if hasattr(e, 'response') and e.response is not None:
            error_msg += f" Response: {e.response.text}"

        return {
            "success": False,
            "message": error_msg,
            "lakehouse": None,
            "created": False
        }
    except Exception as e:
        return {
            "success": False,
            "message": f"Unexpected error: {e}",
            "lakehouse": None,
            "created": False
        }
355
+
356
+
357
if __name__ == "__main__":
    # Demo entry point intentionally disabled; uncomment to run the example.
    # main()
    pass
361
+
362
+
363
+ # Usage Examples:
364
+ """
365
+ # Example 1: In a Fabric Notebook (simplest approach)
366
+ from duckrun.lakehouse import create_lakehouse_in_notebook
367
+
368
+ result = create_lakehouse_in_notebook("MyNewLakehouse")
369
+ if result == 1:
370
+ print("Lakehouse created or already exists!")
371
+
372
+ # Example 2: In a Fabric Notebook with explicit token
373
+ import notebookutils
374
+ from duckrun.lakehouse import create_lakehouse_simple
375
+
376
+ token = notebookutils.credentials.getToken("https://api.fabric.microsoft.com/.default")
377
+ workspace_id = notebookutils.runtime.context.get("workspaceId")
378
+
379
+ result = create_lakehouse_simple("MyLakehouse", token, workspace_id)
380
+ print(f"Result: {result['message']}")
381
+
382
+ # Example 3: Outside Fabric (requires azure-identity package)
383
+ from duckrun.lakehouse import FabricLakehouseManager, get_fabric_token
384
+
385
+ token = get_fabric_token()
386
+ manager = FabricLakehouseManager(token)
387
+ result = manager.create_lakehouse_if_not_exists("MyLakehouse", workspace_name="MyWorkspace")
388
+
389
+ # Example 4: With explicit workspace and lakehouse details
390
+ from duckrun.lakehouse import FabricLakehouseManager
391
+
392
+ # Get your token however you prefer
393
+ token = "your_bearer_token_here"
394
+ manager = FabricLakehouseManager(token)
395
+
396
+ # Create lakehouse in specific workspace
397
+ workspace_id = manager.get_workspace_id("Production Workspace")
398
+ lakehouse = manager.create_lakehouse("DataLake2024", workspace_id, enable_schemas=True)
399
+
400
+ if lakehouse:
401
+ print(f"Created lakehouse with ID: {lakehouse['id']}")
402
+ """
duckrun/runner.py CHANGED
@@ -235,11 +235,28 @@ def _read_sql_file(duckrun_instance, table_name: str, params: Optional[Dict] = N
235
235
  print(f"SQL file is empty: {table_name}.sql")
236
236
  return None
237
237
 
238
+ import re
239
+ # Determine if lakehouse_name is a GUID
240
+ guid_pattern = re.compile(r'^[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}$', re.IGNORECASE)
241
+ lakehouse_is_guid = bool(guid_pattern.match(duckrun_instance.lakehouse_name))
242
+
243
+ # Smart substitution for ${lh}.Lakehouse
244
+ # If template contains ${lh}.Lakehouse, replace with correct value
245
+ if '${lh}.Lakehouse' in content:
246
+ if lakehouse_is_guid:
247
+ # If GUID, use just the GUID
248
+ content = content.replace('${lh}.Lakehouse', duckrun_instance.lakehouse_name)
249
+ else:
250
+ # If not GUID, use legacy format
251
+ content = content.replace('${lh}.Lakehouse', f'{duckrun_instance.lakehouse_name}.Lakehouse')
252
+
238
253
  full_params = {
239
254
  'ws': duckrun_instance.workspace,
240
255
  'lh': duckrun_instance.lakehouse_name,
241
256
  'schema': duckrun_instance.schema,
242
- 'storage_account': duckrun_instance.storage_account
257
+ 'storage_account': duckrun_instance.storage_account,
258
+ 'tables_url': duckrun_instance.table_base_url,
259
+ 'files_url': duckrun_instance.files_base_url
243
260
  }
244
261
  if params:
245
262
  full_params.update(params)
@@ -247,6 +264,10 @@ def _read_sql_file(duckrun_instance, table_name: str, params: Optional[Dict] = N
247
264
  try:
248
265
  template = Template(content)
249
266
  content = template.substitute(full_params)
267
+ # After substitution, remove .Lakehouse if it follows a GUID in any ABFSS URL
268
+ import re
269
+ # Pattern: GUID.Lakehouse or GUID.lakehouse (in URLs)
270
+ content = re.sub(r'([0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12})\.(Lakehouse|lakehouse)', r'\1', content)
250
271
  except KeyError as e:
251
272
  print(f"Missing parameter in SQL file: ${e}")
252
273
  return None
duckrun/stats.py CHANGED
@@ -21,33 +21,39 @@ def _table_exists(duckrun_instance, schema_name: str, table_name: str) -> bool:
21
21
 
22
22
 
23
23
  def _schema_exists(duckrun_instance, schema_name: str) -> bool:
24
- """Check if a schema exists by trying to show its tables."""
24
+ """Check if a schema exists by querying information_schema."""
25
25
  try:
26
- # For main schema, just show tables
26
+ # For main schema, always exists
27
27
  if schema_name == "main":
28
- query = "SHOW TABLES"
28
+ return True
29
29
  else:
30
- query = f"SHOW TABLES FROM {schema_name}"
31
- duckrun_instance.con.execute(query)
32
- return True
30
+ # Use information_schema which works in DuckDB 1.2.2
31
+ query = f"SELECT 1 FROM information_schema.schemata WHERE schema_name = '{schema_name}' LIMIT 1"
32
+ result = duckrun_instance.con.execute(query).fetchall()
33
+ return len(result) > 0
33
34
  except:
34
35
  return False
35
36
 
36
37
 
37
38
  def _get_existing_tables_in_schema(duckrun_instance, schema_name: str) -> list:
38
- """Get all existing tables in a schema by showing tables, excluding temporary tables."""
39
+ """Get all existing tables in a schema using information_schema, excluding temporary tables."""
39
40
  try:
40
- # For main schema, just show tables
41
+ # For main schema, use SHOW TABLES
41
42
  if schema_name == "main":
42
43
  query = "SHOW TABLES"
44
+ result = duckrun_instance.con.execute(query).fetchall()
45
+ if result:
46
+ tables = [row[0] for row in result]
47
+ filtered_tables = [tbl for tbl in tables if not tbl.startswith('tbl_')]
48
+ return filtered_tables
43
49
  else:
44
- query = f"SHOW TABLES FROM {schema_name}"
45
- result = duckrun_instance.con.execute(query).fetchall()
46
- if result:
47
- # Filter out temporary tables created by stats processing (tbl_0, tbl_1, etc.)
48
- tables = [row[0] for row in result]
49
- filtered_tables = [tbl for tbl in tables if not tbl.startswith('tbl_')]
50
- return filtered_tables
50
+ # Use information_schema which works in DuckDB 1.2.2
51
+ query = f"SELECT table_name FROM information_schema.tables WHERE table_schema = '{schema_name}'"
52
+ result = duckrun_instance.con.execute(query).fetchall()
53
+ if result:
54
+ tables = [row[0] for row in result]
55
+ filtered_tables = [tbl for tbl in tables if not tbl.startswith('tbl_')]
56
+ return filtered_tables
51
57
  return []
52
58
  except:
53
59
  return []
@@ -136,8 +142,8 @@ def get_stats(duckrun_instance, source: str):
136
142
  print(f"Processing {len(list_tables)} tables: {list_tables}")
137
143
 
138
144
  for idx, tbl in enumerate(list_tables):
139
- # Construct lakehouse path using ABFSS URL
140
- table_path = f"abfss://{duckrun_instance.workspace}@{duckrun_instance.storage_account}.dfs.fabric.microsoft.com/{duckrun_instance.lakehouse_name}.Lakehouse/Tables/{schema_name}/{tbl}"
145
+ # Construct lakehouse path using correct ABFSS URL format (no .Lakehouse suffix)
146
+ table_path = f"{duckrun_instance.table_base_url}{schema_name}/{tbl}"
141
147
 
142
148
  try:
143
149
  dt = DeltaTable(table_path)
@@ -218,7 +224,7 @@ def get_stats(duckrun_instance, source: str):
218
224
  WHERE tbl IS NOT NULL
219
225
  GROUP BY tbl
220
226
  ORDER BY total_rows DESC
221
- ''').fetch_arrow_table()
227
+ ''').df()
222
228
 
223
229
  return final_result
224
230
 
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: duckrun
3
- Version: 0.2.3
3
+ Version: 0.2.5.dev1
4
4
  Summary: Lakehouse task runner powered by DuckDB for Microsoft Fabric
5
5
  Author: mim
6
6
  License: MIT
@@ -0,0 +1,12 @@
1
+ duckrun/__init__.py,sha256=XA85pL2vK1AkmBic8e7WxeqNvcd6SjFX4zsQpImDO6E,230
2
+ duckrun/core.py,sha256=-UAsAOlOmVkVuQCkbCWTH7aiFSHP0OenAAWSl0i0inY,37107
3
+ duckrun/files.py,sha256=piWRU5w9jHrW-wuV4Gf-SKY_jhFv9eflxgWO8AZCQTI,10495
4
+ duckrun/lakehouse.py,sha256=j--Z3zo8AOWt1GF9VzRosmmTAy6ey2D0LVubti58twU,14109
5
+ duckrun/runner.py,sha256=lfwNoU1CZXh6bPTHvGWVaUWjzG5crvT7Pzq4onMEVjw,12576
6
+ duckrun/stats.py,sha256=2FTqoQNVjD84-H1HjStHxZkOpAGKXS79M55B00pOlok,9804
7
+ duckrun/writer.py,sha256=eWrGtDQTbXi8H3sSt2WucYTdEQUjK97KmQxzCbqAuMs,6221
8
+ duckrun-0.2.5.dev1.dist-info/licenses/LICENSE,sha256=-DeQQwdbCbkB4507ZF3QbocysB-EIjDtaLexvqRkGZc,1083
9
+ duckrun-0.2.5.dev1.dist-info/METADATA,sha256=ZQfd7I-J08MC5ytjKt789PqHdroHI0ld36aMNp-57yE,18344
10
+ duckrun-0.2.5.dev1.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
11
+ duckrun-0.2.5.dev1.dist-info/top_level.txt,sha256=BknMEwebbUHrVAp3SC92ps8MPhK7XSYsaogTvi_DmEU,8
12
+ duckrun-0.2.5.dev1.dist-info/RECORD,,
@@ -1,11 +0,0 @@
1
- duckrun/__init__.py,sha256=L0jRtD9Ld8Ti4e6GRvPDdHvkQCFAPHM43GSP7ARh6EM,241
2
- duckrun/core.py,sha256=LN5rc5B3HLimgslZdC8tLKe3rjTl_KD8WxCh1qoJhdM,16443
3
- duckrun/files.py,sha256=xba0juMEQPgaznDudmXcwaGH0wv-6aCoHmV_cNF6Y7I,10665
4
- duckrun/runner.py,sha256=X5g-57OCHQZ7USKpcBbhYGUcZwLQny2x147DLKrV32c,11417
5
- duckrun/stats.py,sha256=jLEkxNo7MjibPMpjMsXyedrJqv9-BAnP1C0L2a7H8Z8,9417
6
- duckrun/writer.py,sha256=eWrGtDQTbXi8H3sSt2WucYTdEQUjK97KmQxzCbqAuMs,6221
7
- duckrun-0.2.3.dist-info/licenses/LICENSE,sha256=-DeQQwdbCbkB4507ZF3QbocysB-EIjDtaLexvqRkGZc,1083
8
- duckrun-0.2.3.dist-info/METADATA,sha256=CpJvtR9l8c9b1AV9-KnjN4fZODE_3oJxS3omz4p-qlc,18339
9
- duckrun-0.2.3.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
10
- duckrun-0.2.3.dist-info/top_level.txt,sha256=BknMEwebbUHrVAp3SC92ps8MPhK7XSYsaogTvi_DmEU,8
11
- duckrun-0.2.3.dist-info/RECORD,,