duckrun 0.1.5.6__py3-none-any.whl → 0.1.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
duckrun/core.py CHANGED
@@ -5,6 +5,8 @@ import importlib.util
 from deltalake import DeltaTable, write_deltalake
 from typing import List, Tuple, Union, Optional, Callable, Dict, Any
 from string import Template
+import obstore as obs
+from obstore.store import AzureStore
 
 
 class DeltaWriter:
@@ -13,7 +15,7 @@ class DeltaWriter:
     def __init__(self, relation, duckrun_instance):
         self.relation = relation
         self.duckrun = duckrun_instance
-        self._format = "delta"  # Default to delta format
+        self._format = "delta"
         self._mode = "overwrite"
 
     def format(self, format_type: str):
@@ -32,46 +34,35 @@ class DeltaWriter:
 
     def saveAsTable(self, table_name: str):
         """Save query result as Delta table"""
-        # Format defaults to "delta", so no need to check
         if self._format != "delta":
             raise RuntimeError(f"Only 'delta' format is supported, got '{self._format}'")
 
-        # Parse schema.table or use default schema
         if "." in table_name:
             schema, table = table_name.split(".", 1)
         else:
             schema = self.duckrun.schema
             table = table_name
 
-        # Ensure OneLake secret is created
         self.duckrun._create_onelake_secret()
-
-        # Build path
         path = f"{self.duckrun.table_base_url}{schema}/{table}"
-
-        # Execute query and get result
         df = self.relation.record_batch()
 
         print(f"Writing to Delta table: {schema}.{table} (mode={self._mode})")
-
-        # Write to Delta
         write_deltalake(path, df, mode=self._mode)
 
-        # Create or replace view in DuckDB
         self.duckrun.con.sql(f"DROP VIEW IF EXISTS {table}")
         self.duckrun.con.sql(f"""
             CREATE OR REPLACE VIEW {table}
             AS SELECT * FROM delta_scan('{path}')
         """)
 
-        # Optimize if needed
         dt = DeltaTable(path)
 
         if self._mode == "overwrite":
             dt.vacuum(retention_hours=0, dry_run=False, enforce_retention_duration=False)
             dt.cleanup_metadata()
             print(f"✅ Table {schema}.{table} created/overwritten")
-        else:  # append
+        else:
             file_count = len(dt.file_uris())
             if file_count > self.duckrun.compaction_threshold:
                 print(f"Compacting {schema}.{table} ({file_count} files)")
@@ -148,31 +139,19 @@ class Duckrun:
 
         Schema defaults to "dbo" if not specified. When no schema is provided,
         all tables across all schemas will be listed, but operations will use "dbo".
-
-        Examples:
-            dr = Duckrun.connect("myworkspace/mylakehouse.lakehouse/bronze")
-            dr = Duckrun.connect("myworkspace/mylakehouse.lakehouse")  # lists all, uses dbo
-            dr = Duckrun.connect("myworkspace", "mylakehouse", "bronze")
-            dr = Duckrun.connect("myworkspace", "mylakehouse")  # lists all, uses dbo
-            dr = Duckrun.connect("ws/lh.lakehouse", sql_folder="./sql")
         """
         print("Connecting to Lakehouse...")
 
         scan_all_schemas = False
 
-        # Check if using compact format: "ws/lh.lakehouse/schema" or "ws/lh.lakehouse"
         if workspace and "/" in workspace and lakehouse_name is None:
             parts = workspace.split("/")
             if len(parts) == 2:
-                # Format: "ws/lh.lakehouse" (schema will use default)
                 workspace, lakehouse_name = parts
                 scan_all_schemas = True
                 print(f"ℹ️ No schema specified. Using default schema 'dbo' for operations.")
-                print(f"   Scanning all schemas for table discovery...")
-                print(f"   ⚠️ WARNING: Scanning all schemas can be slow for large lakehouses!")
-                print(f"   💡 For better performance, specify a schema: {workspace}/{lakehouse_name}.lakehouse/schema\n")
+                print(f"   Scanning all schemas for table discovery...\n")
             elif len(parts) == 3:
-                # Format: "ws/lh.lakehouse/schema"
                 workspace, lakehouse_name, schema = parts
             else:
                 raise ValueError(
@@ -180,20 +159,14 @@ class Duckrun:
                     "Expected format: 'workspace/lakehouse.lakehouse' or 'workspace/lakehouse.lakehouse/schema'"
                 )
 
-            # Remove .lakehouse suffix if present
             if lakehouse_name.endswith(".lakehouse"):
                 lakehouse_name = lakehouse_name[:-10]
         elif lakehouse_name is not None:
-            # Traditional format used, check if schema was explicitly provided
-            # If schema is still "dbo" (default), scan all schemas
             if schema == "dbo":
                 scan_all_schemas = True
                 print(f"ℹ️ No schema specified. Using default schema 'dbo' for operations.")
-                print(f"   Scanning all schemas for table discovery...")
-                print(f"   ⚠️ WARNING: Scanning all schemas can be slow for large lakehouses!")
-                print(f"   💡 For better performance, specify a schema explicitly.\n")
+                print(f"   Scanning all schemas for table discovery...\n")
 
-        # Validate all required parameters are present
         if not workspace or not lakehouse_name:
             raise ValueError(
                 "Missing required parameters. Use either:\n"
@@ -220,62 +193,82 @@ class Duckrun:
         os.environ["AZURE_STORAGE_TOKEN"] = token.token
         self.con.sql("CREATE OR REPLACE PERSISTENT SECRET onelake (TYPE azure, PROVIDER credential_chain, CHAIN 'cli', ACCOUNT_NAME 'onelake')")
 
+    def _discover_tables_fast(self) -> List[Tuple[str, str]]:
+        """
+        Fast Delta table discovery using obstore with list_with_delimiter.
+        Only lists directories, not files - super fast!
+
+        Returns:
+            List of tuples: [(schema, table_name), ...]
+        """
+        token = self._get_storage_token()
+        if token == "PLACEHOLDER_TOKEN_TOKEN_NOT_AVAILABLE":
+            print("Getting Azure token for table discovery...")
+            from azure.identity import AzureCliCredential, InteractiveBrowserCredential, ChainedTokenCredential
+            credential = ChainedTokenCredential(AzureCliCredential(), InteractiveBrowserCredential())
+            token_obj = credential.get_token("https://storage.azure.com/.default")
+            token = token_obj.token
+            os.environ["AZURE_STORAGE_TOKEN"] = token
+
+        url = f"abfss://{self.workspace}@onelake.dfs.fabric.microsoft.com/"
+        store = AzureStore.from_url(url, bearer_token=token)
+
+        base_path = f"{self.lakehouse_name}.Lakehouse/Tables/"
+        tables_found = []
+
+        if self.scan_all_schemas:
+            # Discover all schemas first
+            print("🔍 Discovering schemas...")
+            schemas_result = obs.list_with_delimiter(store, prefix=base_path)
+            schemas = [
+                prefix.rstrip('/').split('/')[-1]
+                for prefix in schemas_result['common_prefixes']
+            ]
+            print(f"   Found {len(schemas)} schemas: {', '.join(schemas)}\n")
+
+            # Discover tables in each schema
+            print("🔍 Discovering tables...")
+            for schema_name in schemas:
+                schema_path = f"{base_path}{schema_name}/"
+                result = obs.list_with_delimiter(store, prefix=schema_path)
+
+                for table_prefix in result['common_prefixes']:
+                    table_name = table_prefix.rstrip('/').split('/')[-1]
+                    # Skip non-table directories
+                    if table_name not in ('metadata', 'iceberg'):
+                        tables_found.append((schema_name, table_name))
+        else:
+            # Scan specific schema only
+            print(f"🔍 Discovering tables in schema '{self.schema}'...")
+            schema_path = f"{base_path}{self.schema}/"
+            result = obs.list_with_delimiter(store, prefix=schema_path)
+
+            for table_prefix in result['common_prefixes']:
+                table_name = table_prefix.rstrip('/').split('/')[-1]
+                if table_name not in ('metadata', 'iceberg'):
+                    tables_found.append((self.schema, table_name))
+
+        return tables_found
+
     def _attach_lakehouse(self):
+        """Attach lakehouse tables as DuckDB views using fast discovery"""
         self._create_onelake_secret()
+
         try:
-            if self.scan_all_schemas:
-                # Scan all schemas
-                print(f"⚠️ Scanning for Delta tables across all schemas...")
-                print(f"   This may take a while for large lakehouses with many schemas/tables.")
-
-                list_tables_query = f"""
-                    SELECT DISTINCT
-                        regexp_extract(file, 'Tables/([^/]+)/([^/]+)/_delta_log', 1) as schema_name,
-                        regexp_extract(file, 'Tables/([^/]+)/([^/]+)/_delta_log', 2) as table_name
-                    FROM glob("abfss://{self.workspace}@onelake.dfs.fabric.microsoft.com/{self.lakehouse_name}.Lakehouse/Tables/**")
-                    WHERE file LIKE '%/_delta_log/%'
-                        AND file NOT LIKE '%/metadata/%'
-                        AND file NOT LIKE '%/iceberg/%'
-                        AND regexp_extract(file, 'Tables/([^/]+)/([^/]+)/_delta_log', 1) IS NOT NULL
-                        AND regexp_extract(file, 'Tables/([^/]+)/([^/]+)/_delta_log', 2) IS NOT NULL
-                    ORDER BY schema_name, table_name
-                """
-            else:
-                # Scan specific schema only
-                print(f"Scanning for Delta tables in {self.schema}... (this may take a moment)")
-
-                list_tables_query = f"""
-                    SELECT DISTINCT
-                        '{self.schema}' as schema_name,
-                        regexp_extract(file, 'Tables/{self.schema}/([^/]+)/_delta_log', 1) as table_name
-                    FROM glob("abfss://{self.workspace}@onelake.dfs.fabric.microsoft.com/{self.lakehouse_name}.Lakehouse/Tables/{self.schema}/**")
-                    WHERE file LIKE '%/_delta_log/%'
-                        AND file NOT LIKE '%/metadata/%'
-                        AND file NOT LIKE '%/iceberg/%'
-                        AND regexp_extract(file, 'Tables/{self.schema}/([^/]+)/_delta_log', 1) IS NOT NULL
-                """
+            tables = self._discover_tables_fast()
 
-            list_tables_df = self.con.sql(list_tables_query).df()
-
-            if list_tables_df.empty:
+            if not tables:
                 if self.scan_all_schemas:
                     print(f"No Delta tables found in {self.lakehouse_name}.Lakehouse/Tables/")
                 else:
                     print(f"No Delta tables found in {self.lakehouse_name}.Lakehouse/Tables/{self.schema}/")
                 return
 
-            print(f"Found {len(list_tables_df)} Delta tables. Attaching as views...\n")
-
-            for _, row in list_tables_df.iterrows():
-                schema_name = row['schema_name']
-                table_name = row['table_name']
-
-                # Skip Iceberg-related folders and empty names
-                if not table_name or table_name in ('metadata', 'iceberg'):
-                    continue
-
+            print(f"\n📊 Found {len(tables)} Delta tables. Attaching as views...\n")
+
+            attached_count = 0
+            for schema_name, table_name in tables:
                 try:
-                    # Create view with schema prefix to avoid conflicts
                     view_name = f"{schema_name}_{table_name}" if self.scan_all_schemas else table_name
 
                     self.con.sql(f"""
@@ -283,19 +276,24 @@ class Duckrun:
                         AS SELECT * FROM delta_scan('{self.table_base_url}{schema_name}/{table_name}');
                     """)
                     print(f"  ✓ Attached: {schema_name}.{table_name} → {view_name}")
+                    attached_count += 1
                 except Exception as e:
                     print(f"  ⚠ Skipped {schema_name}.{table_name}: {str(e)[:100]}")
                     continue
 
-            print("\nAttached tables (views) in DuckDB:")
+            print(f"\n{'='*60}")
+            print(f"✅ Successfully attached {attached_count}/{len(tables)} tables")
+            print(f"{'='*60}\n")
+
+            print("Available views in DuckDB:")
             self.con.sql("SELECT name FROM (SHOW ALL TABLES) WHERE database='memory' ORDER BY name").show()
 
             if self.scan_all_schemas:
-                print(f"\nNote: Tables are prefixed with schema (e.g., dbo_tablename)")
-                print(f"      Default schema for operations: {self.schema}")
+                print(f"\n💡 Note: Tables are prefixed with schema (e.g., dbo_tablename)")
+                print(f"   Default schema for operations: {self.schema}\n")
 
         except Exception as e:
-            print(f"Error attaching lakehouse: {e}")
+            print(f"Error attaching lakehouse: {e}")
             print("Continuing without pre-attached tables.")
 
     def _normalize_table_name(self, name: str) -> str:
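The new discovery path replaces the recursive glob with directory-only listing: obstore's list_with_delimiter returns the immediate child "directories" under a prefix as common_prefixes, so no per-file enumeration is needed. A minimal standalone sketch of that pattern, assuming a valid AAD token in AZURE_STORAGE_TOKEN and illustrative workspace/lakehouse/schema names:

    import os
    import obstore as obs
    from obstore.store import AzureStore

    store = AzureStore.from_url(
        "abfss://myworkspace@onelake.dfs.fabric.microsoft.com/",
        bearer_token=os.environ["AZURE_STORAGE_TOKEN"],
    )
    # one call per schema is enough to enumerate its tables
    result = obs.list_with_delimiter(store, prefix="mylakehouse.Lakehouse/Tables/dbo/")
    tables = [p.rstrip("/").split("/")[-1] for p in result["common_prefixes"]]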
@@ -329,7 +327,6 @@ class Duckrun:
             print(f"SQL file is empty: {table_name}.sql")
             return None
 
-        # Auto-inject common params, merge with user params
         full_params = {
             'ws': self.workspace,
             'lh': self.lakehouse_name,
@@ -452,18 +449,9 @@ class Duckrun:
 
         Returns:
             True if all tasks succeeded
-
-        Example:
-            pipeline = [
-                ('download', (urls, paths, depth)),
-                ('staging', 'overwrite', {'run_date': '2024-06-01'}),
-                ('transform', 'append'),  # {} optional!
-                ('calendar', 'ignore')  # {} optional!
-            ]
-            dr.run(pipeline)
         """
         if self.sql_folder is None:
-            raise RuntimeError("sql_folder is not configured. Cannot run pipelines. Set sql_folder when creating connection.")
+            raise RuntimeError("sql_folder is not configured. Cannot run pipelines.")
 
         for i, task in enumerate(pipeline, 1):
             print(f"\n{'='*60}")
@@ -472,18 +460,14 @@ class Duckrun:
 
             try:
                 if len(task) == 2:
-                    # Could be Python: ('name', (args,)) or SQL: ('table', 'mode')
                     name, second = task
                     if isinstance(second, str) and second in {'overwrite', 'append', 'ignore'}:
-                        # SQL task without params: ('table', 'mode')
                         self._run_sql(name, second, {})
                     else:
-                        # Python task: ('name', (args,))
                         args = second if isinstance(second, (tuple, list)) else (second,)
                         self._run_python(name, tuple(args))
 
                 elif len(task) == 3:
-                    # SQL task with params: ('table', 'mode', {params})
                     table, mode, params = task
                     if not isinstance(params, dict):
                         raise ValueError(f"Expected dict for params, got {type(params)}")
@@ -506,13 +490,9 @@ class Duckrun:
         Execute raw SQL query with Spark-style write API.
 
         Example:
-            # Traditional DuckDB style
             dr.sql("SELECT * FROM table").show()
             df = dr.sql("SELECT * FROM table").df()
-
-            # New Spark-style write API (format is optional, defaults to delta)
             dr.sql("SELECT 43 as value").write.mode("append").saveAsTable("test")
-            dr.sql("SELECT * FROM source").write.mode("overwrite").saveAsTable("target")
         """
         relation = self.con.sql(query)
         return QueryResult(relation, self)
duckrun-0.1.5.6.dist-info/METADATA → duckrun-0.1.6.dist-info/METADATA CHANGED
@@ -1,9 +1,9 @@
 Metadata-Version: 2.4
 Name: duckrun
-Version: 0.1.5.6
+Version: 0.1.6
 Summary: Lakehouse task runner powered by DuckDB for Microsoft Fabric
 Author: mim
-License-Expression: MIT
+License: MIT
 Project-URL: Homepage, https://github.com/djouallah/duckrun
 Project-URL: Repository, https://github.com/djouallah/duckrun
 Project-URL: Issues, https://github.com/djouallah/duckrun/issues
@@ -13,6 +13,9 @@ License-File: LICENSE
 Requires-Dist: duckdb>=1.2.0
 Requires-Dist: deltalake>=0.18.2
 Requires-Dist: requests>=2.28.0
+Requires-Dist: obstore>=0.2.0
+Provides-Extra: local
+Requires-Dist: azure-identity>=1.12.0; extra == "local"
 Dynamic: license-file
 
 <img src="https://raw.githubusercontent.com/djouallah/duckrun/main/duckrun.png" width="400" alt="Duckrun">
duckrun-0.1.6.dist-info/RECORD ADDED
@@ -0,0 +1,7 @@
+duckrun/__init__.py,sha256=L0jRtD9Ld8Ti4e6GRvPDdHvkQCFAPHM43GSP7ARh6EM,241
+duckrun/core.py,sha256=H7Q-mvE5ET3mdEi7VTubWdaCrgVaJW9G0LfAu0Gpw-g,21872
+duckrun-0.1.6.dist-info/licenses/LICENSE,sha256=-DeQQwdbCbkB4507ZF3QbocysB-EIjDtaLexvqRkGZc,1083
+duckrun-0.1.6.dist-info/METADATA,sha256=20vTn4-9fn8iqwXGjYT3IQd9Xk47sQAD-Tv3wk2Pp9I,9356
+duckrun-0.1.6.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+duckrun-0.1.6.dist-info/top_level.txt,sha256=BknMEwebbUHrVAp3SC92ps8MPhK7XSYsaogTvi_DmEU,8
+duckrun-0.1.6.dist-info/RECORD,,
duckrun-0.1.5.6.dist-info/RECORD DELETED
@@ -1,7 +0,0 @@
-duckrun/__init__.py,sha256=L0jRtD9Ld8Ti4e6GRvPDdHvkQCFAPHM43GSP7ARh6EM,241
-duckrun/core.py,sha256=AjaY3fkbO2S9rCejy-gF06UgQ13J1K6gBAp_AEwcyRs,23762
-duckrun-0.1.5.6.dist-info/licenses/LICENSE,sha256=-DeQQwdbCbkB4507ZF3QbocysB-EIjDtaLexvqRkGZc,1083
-duckrun-0.1.5.6.dist-info/METADATA,sha256=bGr8L2ZCLOqVtvUtcpBQPxtLgkiZAhy7lOq0U4KtTSI,9258
-duckrun-0.1.5.6.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
-duckrun-0.1.5.6.dist-info/top_level.txt,sha256=BknMEwebbUHrVAp3SC92ps8MPhK7XSYsaogTvi_DmEU,8
-duckrun-0.1.5.6.dist-info/RECORD,,