duckrun-0.1.5.6-py3-none-any.whl → duckrun-0.1.6.1-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
duckrun/core.py CHANGED
@@ -5,6 +5,8 @@ import importlib.util
 from deltalake import DeltaTable, write_deltalake
 from typing import List, Tuple, Union, Optional, Callable, Dict, Any
 from string import Template
+import obstore as obs
+from obstore.store import AzureStore
 
 
 class DeltaWriter:
@@ -13,7 +15,7 @@ class DeltaWriter:
     def __init__(self, relation, duckrun_instance):
         self.relation = relation
         self.duckrun = duckrun_instance
-        self._format = "delta" # Default to delta format
+        self._format = "delta"
         self._mode = "overwrite"
 
     def format(self, format_type: str):
@@ -32,46 +34,35 @@ class DeltaWriter:
 
     def saveAsTable(self, table_name: str):
         """Save query result as Delta table"""
-        # Format defaults to "delta", so no need to check
         if self._format != "delta":
             raise RuntimeError(f"Only 'delta' format is supported, got '{self._format}'")
 
-        # Parse schema.table or use default schema
         if "." in table_name:
             schema, table = table_name.split(".", 1)
         else:
             schema = self.duckrun.schema
             table = table_name
 
-        # Ensure OneLake secret is created
         self.duckrun._create_onelake_secret()
-
-        # Build path
         path = f"{self.duckrun.table_base_url}{schema}/{table}"
-
-        # Execute query and get result
         df = self.relation.record_batch()
 
         print(f"Writing to Delta table: {schema}.{table} (mode={self._mode})")
-
-        # Write to Delta
         write_deltalake(path, df, mode=self._mode)
 
-        # Create or replace view in DuckDB
         self.duckrun.con.sql(f"DROP VIEW IF EXISTS {table}")
         self.duckrun.con.sql(f"""
             CREATE OR REPLACE VIEW {table}
             AS SELECT * FROM delta_scan('{path}')
         """)
 
-        # Optimize if needed
         dt = DeltaTable(path)
 
         if self._mode == "overwrite":
             dt.vacuum(retention_hours=0, dry_run=False, enforce_retention_duration=False)
             dt.cleanup_metadata()
             print(f"✅ Table {schema}.{table} created/overwritten")
-        else: # append
+        else:
             file_count = len(dt.file_uris())
             if file_count > self.duckrun.compaction_threshold:
                 print(f"Compacting {schema}.{table} ({file_count} files)")
@@ -136,43 +127,54 @@ class Duckrun:
         self._attach_lakehouse()
 
     @classmethod
-    def connect(cls, workspace: Union[str, None] = None, lakehouse_name: Optional[str] = None,
-                schema: str = "dbo", sql_folder: Optional[str] = None,
+    def connect(cls, workspace: Union[str, None] = None, lakehouse_name: Optional[str] = None,
+                schema: str = "dbo", sql_folder: Optional[str] = None,
                 compaction_threshold: int = 100):
         """
         Create and connect to lakehouse.
 
         Supports two formats:
-        1. Compact: connect("ws/lh.lakehouse/schema") or connect("ws/lh.lakehouse")
-        2. Traditional: connect("ws", "lh", "schema") or connect("ws", "lh")
+        1. Compact: connect("ws/lh.lakehouse/schema", sql_folder=...) or connect("ws/lh.lakehouse")
+        2. Traditional: connect("ws", "lh", "schema", sql_folder) or connect("ws", "lh")
 
-        Schema defaults to "dbo" if not specified. When no schema is provided,
-        all tables across all schemas will be listed, but operations will use "dbo".
+        Args:
+            workspace: Workspace name or full path "ws/lh.lakehouse/schema"
+            lakehouse_name: Lakehouse name (optional if using compact format)
+            schema: Schema name (defaults to "dbo")
+            sql_folder: Optional path or URL to SQL files folder
+            compaction_threshold: File count threshold for compaction
 
         Examples:
-            dr = Duckrun.connect("myworkspace/mylakehouse.lakehouse/bronze")
-            dr = Duckrun.connect("myworkspace/mylakehouse.lakehouse") # lists all, uses dbo
-            dr = Duckrun.connect("myworkspace", "mylakehouse", "bronze")
-            dr = Duckrun.connect("myworkspace", "mylakehouse") # lists all, uses dbo
-            dr = Duckrun.connect("ws/lh.lakehouse", sql_folder="./sql")
+            # Compact format (second param treated as sql_folder if it's a URL/path string)
+            dr = Duckrun.connect("temp/power.lakehouse/wa", "https://github.com/.../sql/")
+            dr = Duckrun.connect("ws/lh.lakehouse/schema", "./sql")
+            dr = Duckrun.connect("ws/lh.lakehouse/schema") # no SQL folder
+
+            # Traditional format
+            dr = Duckrun.connect("ws", "lh", "schema", "./sql")
+            dr = Duckrun.connect("ws", "lh", "schema")
         """
         print("Connecting to Lakehouse...")
 
         scan_all_schemas = False
 
         # Check if using compact format: "ws/lh.lakehouse/schema" or "ws/lh.lakehouse"
-        if workspace and "/" in workspace and lakehouse_name is None:
+        # If second param looks like a path/URL and not a lakehouse name, treat it as sql_folder
+        if workspace and "/" in workspace and (lakehouse_name is None or
+            (isinstance(lakehouse_name, str) and ('/' in lakehouse_name or lakehouse_name.startswith('http') or lakehouse_name.startswith('.')))):
+
+            # If lakehouse_name looks like a sql_folder, shift it
+            if lakehouse_name and ('/' in lakehouse_name or lakehouse_name.startswith('http') or lakehouse_name.startswith('.')):
+                sql_folder = lakehouse_name
+                lakehouse_name = None
+
             parts = workspace.split("/")
             if len(parts) == 2:
-                # Format: "ws/lh.lakehouse" (schema will use default)
                 workspace, lakehouse_name = parts
                 scan_all_schemas = True
                 print(f"ℹ️ No schema specified. Using default schema 'dbo' for operations.")
-                print(f"   Scanning all schemas for table discovery...")
-                print(f"   ⚠️ WARNING: Scanning all schemas can be slow for large lakehouses!")
-                print(f"   💡 For better performance, specify a schema: {workspace}/{lakehouse_name}.lakehouse/schema\n")
+                print(f"   Scanning all schemas for table discovery...\n")
             elif len(parts) == 3:
-                # Format: "ws/lh.lakehouse/schema"
                 workspace, lakehouse_name, schema = parts
             else:
                 raise ValueError(
@@ -180,27 +182,22 @@ class Duckrun:
                     "Expected format: 'workspace/lakehouse.lakehouse' or 'workspace/lakehouse.lakehouse/schema'"
                 )
 
-            # Remove .lakehouse suffix if present
             if lakehouse_name.endswith(".lakehouse"):
                 lakehouse_name = lakehouse_name[:-10]
         elif lakehouse_name is not None:
-            # Traditional format used, check if schema was explicitly provided
-            # If schema is still "dbo" (default), scan all schemas
+            # Traditional format - check if schema was explicitly provided
            if schema == "dbo":
                 scan_all_schemas = True
                 print(f"ℹ️ No schema specified. Using default schema 'dbo' for operations.")
-                print(f"   Scanning all schemas for table discovery...")
-                print(f"   ⚠️ WARNING: Scanning all schemas can be slow for large lakehouses!")
-                print(f"   💡 For better performance, specify a schema explicitly.\n")
+                print(f"   Scanning all schemas for table discovery...\n")
 
-        # Validate all required parameters are present
         if not workspace or not lakehouse_name:
             raise ValueError(
                 "Missing required parameters. Use either:\n"
-                "  connect('workspace/lakehouse.lakehouse/schema')\n"
-                "  connect('workspace/lakehouse.lakehouse') # defaults to dbo, lists all\n"
-                "  connect('workspace', 'lakehouse', 'schema')\n"
-                "  connect('workspace', 'lakehouse') # defaults to dbo, lists all"
+                "  connect('workspace/lakehouse.lakehouse/schema', 'sql_folder')\n"
+                "  connect('workspace/lakehouse.lakehouse') # defaults to dbo\n"
+                "  connect('workspace', 'lakehouse', 'schema', 'sql_folder')\n"
+                "  connect('workspace', 'lakehouse') # defaults to dbo"
             )
 
         return cls(workspace, lakehouse_name, schema, sql_folder, compaction_threshold, scan_all_schemas)
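
As a usage note on the connect logic above, a minimal sketch of the two call styles it now accepts; the workspace, lakehouse, schema, and folder names are placeholders taken from the docstring:

    from duckrun.core import Duckrun

    # Compact form: "workspace/lakehouse.lakehouse/schema", with the second
    # positional argument shifted to sql_folder when it looks like a path or URL
    dr = Duckrun.connect("myworkspace/mylakehouse.lakehouse/bronze", "./sql")

    # Traditional form: workspace, lakehouse, schema, sql_folder passed separately
    dr = Duckrun.connect("myworkspace", "mylakehouse", "bronze", "./sql")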
@@ -220,62 +217,82 @@ class Duckrun:
             os.environ["AZURE_STORAGE_TOKEN"] = token.token
         self.con.sql("CREATE OR REPLACE PERSISTENT SECRET onelake (TYPE azure, PROVIDER credential_chain, CHAIN 'cli', ACCOUNT_NAME 'onelake')")
 
+    def _discover_tables_fast(self) -> List[Tuple[str, str]]:
+        """
+        Fast Delta table discovery using obstore with list_with_delimiter.
+        Only lists directories, not files - super fast!
+
+        Returns:
+            List of tuples: [(schema, table_name), ...]
+        """
+        token = self._get_storage_token()
+        if token == "PLACEHOLDER_TOKEN_TOKEN_NOT_AVAILABLE":
+            print("Getting Azure token for table discovery...")
+            from azure.identity import AzureCliCredential, InteractiveBrowserCredential, ChainedTokenCredential
+            credential = ChainedTokenCredential(AzureCliCredential(), InteractiveBrowserCredential())
+            token_obj = credential.get_token("https://storage.azure.com/.default")
+            token = token_obj.token
+            os.environ["AZURE_STORAGE_TOKEN"] = token
+
+        url = f"abfss://{self.workspace}@onelake.dfs.fabric.microsoft.com/"
+        store = AzureStore.from_url(url, bearer_token=token)
+
+        base_path = f"{self.lakehouse_name}.Lakehouse/Tables/"
+        tables_found = []
+
+        if self.scan_all_schemas:
+            # Discover all schemas first
+            print("🔍 Discovering schemas...")
+            schemas_result = obs.list_with_delimiter(store, prefix=base_path)
+            schemas = [
+                prefix.rstrip('/').split('/')[-1]
+                for prefix in schemas_result['common_prefixes']
+            ]
+            print(f"   Found {len(schemas)} schemas: {', '.join(schemas)}\n")
+
+            # Discover tables in each schema
+            print("🔍 Discovering tables...")
+            for schema_name in schemas:
+                schema_path = f"{base_path}{schema_name}/"
+                result = obs.list_with_delimiter(store, prefix=schema_path)
+
+                for table_prefix in result['common_prefixes']:
+                    table_name = table_prefix.rstrip('/').split('/')[-1]
+                    # Skip non-table directories
+                    if table_name not in ('metadata', 'iceberg'):
+                        tables_found.append((schema_name, table_name))
+        else:
+            # Scan specific schema only
+            print(f"🔍 Discovering tables in schema '{self.schema}'...")
+            schema_path = f"{base_path}{self.schema}/"
+            result = obs.list_with_delimiter(store, prefix=schema_path)
+
+            for table_prefix in result['common_prefixes']:
+                table_name = table_prefix.rstrip('/').split('/')[-1]
+                if table_name not in ('metadata', 'iceberg'):
+                    tables_found.append((self.schema, table_name))
+
+        return tables_found
+
     def _attach_lakehouse(self):
+        """Attach lakehouse tables as DuckDB views using fast discovery"""
         self._create_onelake_secret()
+
         try:
-            if self.scan_all_schemas:
-                # Scan all schemas
-                print(f"⚠️ Scanning for Delta tables across all schemas...")
-                print(f"   This may take a while for large lakehouses with many schemas/tables.")
-
-                list_tables_query = f"""
-                    SELECT DISTINCT
-                        regexp_extract(file, 'Tables/([^/]+)/([^/]+)/_delta_log', 1) as schema_name,
-                        regexp_extract(file, 'Tables/([^/]+)/([^/]+)/_delta_log', 2) as table_name
-                    FROM glob("abfss://{self.workspace}@onelake.dfs.fabric.microsoft.com/{self.lakehouse_name}.Lakehouse/Tables/**")
-                    WHERE file LIKE '%/_delta_log/%'
-                        AND file NOT LIKE '%/metadata/%'
-                        AND file NOT LIKE '%/iceberg/%'
-                        AND regexp_extract(file, 'Tables/([^/]+)/([^/]+)/_delta_log', 1) IS NOT NULL
-                        AND regexp_extract(file, 'Tables/([^/]+)/([^/]+)/_delta_log', 2) IS NOT NULL
-                    ORDER BY schema_name, table_name
-                """
-            else:
-                # Scan specific schema only
-                print(f"Scanning for Delta tables in {self.schema}... (this may take a moment)")
-
-                list_tables_query = f"""
-                    SELECT DISTINCT
-                        '{self.schema}' as schema_name,
-                        regexp_extract(file, 'Tables/{self.schema}/([^/]+)/_delta_log', 1) as table_name
-                    FROM glob("abfss://{self.workspace}@onelake.dfs.fabric.microsoft.com/{self.lakehouse_name}.Lakehouse/Tables/{self.schema}/**")
-                    WHERE file LIKE '%/_delta_log/%'
-                        AND file NOT LIKE '%/metadata/%'
-                        AND file NOT LIKE '%/iceberg/%'
-                        AND regexp_extract(file, 'Tables/{self.schema}/([^/]+)/_delta_log', 1) IS NOT NULL
-                """
+            tables = self._discover_tables_fast()
 
-            list_tables_df = self.con.sql(list_tables_query).df()
-
-            if list_tables_df.empty:
+            if not tables:
                 if self.scan_all_schemas:
                     print(f"No Delta tables found in {self.lakehouse_name}.Lakehouse/Tables/")
                 else:
                     print(f"No Delta tables found in {self.lakehouse_name}.Lakehouse/Tables/{self.schema}/")
                 return
 
-            print(f"Found {len(list_tables_df)} Delta tables. Attaching as views...\n")
-
-            for _, row in list_tables_df.iterrows():
-                schema_name = row['schema_name']
-                table_name = row['table_name']
-
-                # Skip Iceberg-related folders and empty names
-                if not table_name or table_name in ('metadata', 'iceberg'):
-                    continue
-
+            print(f"\n📊 Found {len(tables)} Delta tables. Attaching as views...\n")
+
+            attached_count = 0
+            for schema_name, table_name in tables:
                 try:
-                    # Create view with schema prefix to avoid conflicts
                     view_name = f"{schema_name}_{table_name}" if self.scan_all_schemas else table_name
 
                     self.con.sql(f"""
@@ -283,19 +300,24 @@ class Duckrun:
                         AS SELECT * FROM delta_scan('{self.table_base_url}{schema_name}/{table_name}');
                     """)
                     print(f"   ✓ Attached: {schema_name}.{table_name} → {view_name}")
+                    attached_count += 1
                 except Exception as e:
                     print(f"   ⚠ Skipped {schema_name}.{table_name}: {str(e)[:100]}")
                     continue
 
-            print("\nAttached tables (views) in DuckDB:")
+            print(f"\n{'='*60}")
+            print(f"✅ Successfully attached {attached_count}/{len(tables)} tables")
+            print(f"{'='*60}\n")
+
+            print("Available views in DuckDB:")
             self.con.sql("SELECT name FROM (SHOW ALL TABLES) WHERE database='memory' ORDER BY name").show()
 
             if self.scan_all_schemas:
-                print(f"\nNote: Tables are prefixed with schema (e.g., dbo_tablename)")
-                print(f"   Default schema for operations: {self.schema}")
+                print(f"\n💡 Note: Tables are prefixed with schema (e.g., dbo_tablename)")
+                print(f"   Default schema for operations: {self.schema}\n")
 
         except Exception as e:
-            print(f"Error attaching lakehouse: {e}")
+            print(f"Error attaching lakehouse: {e}")
             print("Continuing without pre-attached tables.")
 
     def _normalize_table_name(self, name: str) -> str:
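
The `_discover_tables_fast` helper added above leans on obstore's delimiter listing to enumerate table directories without walking their files. A rough standalone sketch of that pattern, assuming the same OneLake layout as the code in this diff and using placeholder workspace/lakehouse names:

    import obstore as obs
    from obstore.store import AzureStore
    from azure.identity import AzureCliCredential, ChainedTokenCredential, InteractiveBrowserCredential

    # Acquire a storage token the same way _discover_tables_fast does
    credential = ChainedTokenCredential(AzureCliCredential(), InteractiveBrowserCredential())
    token = credential.get_token("https://storage.azure.com/.default").token

    # Point the store at the OneLake workspace root
    store = AzureStore.from_url(
        "abfss://myworkspace@onelake.dfs.fabric.microsoft.com/",
        bearer_token=token,
    )

    # list_with_delimiter returns only the immediate children of the prefix,
    # so each common_prefixes entry is a table directory, never a data file
    result = obs.list_with_delimiter(store, prefix="mylakehouse.Lakehouse/Tables/dbo/")
    tables = [p.rstrip("/").split("/")[-1] for p in result["common_prefixes"]]
    print(tables)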
@@ -329,7 +351,6 @@ class Duckrun:
             print(f"SQL file is empty: {table_name}.sql")
             return None
 
-        # Auto-inject common params, merge with user params
         full_params = {
             'ws': self.workspace,
             'lh': self.lakehouse_name,
@@ -452,18 +473,9 @@ class Duckrun:
 
         Returns:
             True if all tasks succeeded
-
-        Example:
-            pipeline = [
-                ('download', (urls, paths, depth)),
-                ('staging', 'overwrite', {'run_date': '2024-06-01'}),
-                ('transform', 'append'), # {} optional!
-                ('calendar', 'ignore') # {} optional!
-            ]
-            dr.run(pipeline)
         """
         if self.sql_folder is None:
-            raise RuntimeError("sql_folder is not configured. Cannot run pipelines. Set sql_folder when creating connection.")
+            raise RuntimeError("sql_folder is not configured. Cannot run pipelines.")
 
         for i, task in enumerate(pipeline, 1):
             print(f"\n{'='*60}")
@@ -472,18 +484,14 @@ class Duckrun:
 
             try:
                 if len(task) == 2:
-                    # Could be Python: ('name', (args,)) or SQL: ('table', 'mode')
                     name, second = task
                     if isinstance(second, str) and second in {'overwrite', 'append', 'ignore'}:
-                        # SQL task without params: ('table', 'mode')
                         self._run_sql(name, second, {})
                     else:
-                        # Python task: ('name', (args,))
                         args = second if isinstance(second, (tuple, list)) else (second,)
                         self._run_python(name, tuple(args))
 
                 elif len(task) == 3:
-                    # SQL task with params: ('table', 'mode', {params})
                     table, mode, params = task
                     if not isinstance(params, dict):
                         raise ValueError(f"Expected dict for params, got {type(params)}")
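
For context, the task shapes this dispatch handles mirror the pipeline example that the old docstring carried; a hedged sketch, with table names, urls, paths, and depth purely illustrative placeholders:

    urls, paths, depth = ["https://example.com/data.csv"], ["/tmp/data.csv"], 1  # placeholders
    pipeline = [
        ('download', (urls, paths, depth)),                    # Python task: ('name', (args,))
        ('staging', 'overwrite', {'run_date': '2024-06-01'}),  # SQL task with params
        ('transform', 'append'),                               # SQL task, params optional
        ('calendar', 'ignore'),                                # mode only, no params needed
    ]
    dr.run(pipeline)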
@@ -506,13 +514,9 @@ class Duckrun:
         Execute raw SQL query with Spark-style write API.
 
         Example:
-            # Traditional DuckDB style
             dr.sql("SELECT * FROM table").show()
             df = dr.sql("SELECT * FROM table").df()
-
-            # New Spark-style write API (format is optional, defaults to delta)
             dr.sql("SELECT 43 as value").write.mode("append").saveAsTable("test")
-            dr.sql("SELECT * FROM source").write.mode("overwrite").saveAsTable("target")
         """
         relation = self.con.sql(query)
         return QueryResult(relation, self)
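
A brief sketch of the Spark-style write path documented above, combined with the saveAsTable behaviour shown earlier in this diff; the table name is a placeholder, and the schema defaults to the connection's schema when omitted:

    # Land a query result as a Delta table, then read it back through the view
    # that saveAsTable registers in DuckDB
    dr.sql("SELECT 43 as value").write.mode("append").saveAsTable("test")
    dr.sql("SELECT * FROM test").show()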

{duckrun-0.1.5.6.dist-info → duckrun-0.1.6.1.dist-info}/METADATA CHANGED
@@ -1,9 +1,9 @@
 Metadata-Version: 2.4
 Name: duckrun
-Version: 0.1.5.6
+Version: 0.1.6.1
 Summary: Lakehouse task runner powered by DuckDB for Microsoft Fabric
 Author: mim
-License-Expression: MIT
+License: MIT
 Project-URL: Homepage, https://github.com/djouallah/duckrun
 Project-URL: Repository, https://github.com/djouallah/duckrun
 Project-URL: Issues, https://github.com/djouallah/duckrun/issues
@@ -13,6 +13,9 @@ License-File: LICENSE
 Requires-Dist: duckdb>=1.2.0
 Requires-Dist: deltalake>=0.18.2
 Requires-Dist: requests>=2.28.0
+Requires-Dist: obstore>=0.2.0
+Provides-Extra: local
+Requires-Dist: azure-identity>=1.12.0; extra == "local"
 Dynamic: license-file
 
 <img src="https://raw.githubusercontent.com/djouallah/duckrun/main/duckrun.png" width="400" alt="Duckrun">

duckrun-0.1.6.1.dist-info/RECORD ADDED
@@ -0,0 +1,7 @@
+duckrun/__init__.py,sha256=L0jRtD9Ld8Ti4e6GRvPDdHvkQCFAPHM43GSP7ARh6EM,241
+duckrun/core.py,sha256=A5UdhpdEE9Wzje5d16c0ejTWn24zy5LCaoX6OghO8Us,23352
+duckrun-0.1.6.1.dist-info/licenses/LICENSE,sha256=-DeQQwdbCbkB4507ZF3QbocysB-EIjDtaLexvqRkGZc,1083
+duckrun-0.1.6.1.dist-info/METADATA,sha256=oHc38InTVr48Hp2mER4tbFL0RkWMEFXqg48OPYTk9qk,9358
+duckrun-0.1.6.1.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+duckrun-0.1.6.1.dist-info/top_level.txt,sha256=BknMEwebbUHrVAp3SC92ps8MPhK7XSYsaogTvi_DmEU,8
+duckrun-0.1.6.1.dist-info/RECORD,,

duckrun-0.1.5.6.dist-info/RECORD DELETED
@@ -1,7 +0,0 @@
-duckrun/__init__.py,sha256=L0jRtD9Ld8Ti4e6GRvPDdHvkQCFAPHM43GSP7ARh6EM,241
-duckrun/core.py,sha256=AjaY3fkbO2S9rCejy-gF06UgQ13J1K6gBAp_AEwcyRs,23762
-duckrun-0.1.5.6.dist-info/licenses/LICENSE,sha256=-DeQQwdbCbkB4507ZF3QbocysB-EIjDtaLexvqRkGZc,1083
-duckrun-0.1.5.6.dist-info/METADATA,sha256=bGr8L2ZCLOqVtvUtcpBQPxtLgkiZAhy7lOq0U4KtTSI,9258
-duckrun-0.1.5.6.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
-duckrun-0.1.5.6.dist-info/top_level.txt,sha256=BknMEwebbUHrVAp3SC92ps8MPhK7XSYsaogTvi_DmEU,8
-duckrun-0.1.5.6.dist-info/RECORD,,