duckrun 0.1.5.5-py3-none-any.whl → 0.1.5.6-py3-none-any.whl

This diff compares the contents of two publicly released package versions as they appear in their respective public registries. It is provided for informational purposes only.
duckrun/core.py CHANGED
@@ -112,7 +112,7 @@ class Duckrun:
     Usage:
         # For pipelines:
         dr = Duckrun.connect("workspace/lakehouse.lakehouse/schema", sql_folder="./sql")
-        dr = Duckrun.connect("workspace/lakehouse.lakehouse")  # defaults to dbo schema
+        dr = Duckrun.connect("workspace/lakehouse.lakehouse")  # defaults to dbo schema, lists all tables
         dr.run(pipeline)
 
         # For data exploration with Spark-style API:
@@ -122,12 +122,14 @@ class Duckrun:
     """
 
     def __init__(self, workspace: str, lakehouse_name: str, schema: str = "dbo",
-                 sql_folder: Optional[str] = None, compaction_threshold: int = 10):
+                 sql_folder: Optional[str] = None, compaction_threshold: int = 10,
+                 scan_all_schemas: bool = False):
         self.workspace = workspace
         self.lakehouse_name = lakehouse_name
         self.schema = schema
         self.sql_folder = sql_folder.strip() if sql_folder else None
         self.compaction_threshold = compaction_threshold
+        self.scan_all_schemas = scan_all_schemas
         self.table_base_url = f'abfss://{workspace}@onelake.dfs.fabric.microsoft.com/{lakehouse_name}.Lakehouse/Tables/'
         self.con = duckdb.connect()
         self.con.sql("SET preserve_insertion_order = false")
@@ -144,27 +146,31 @@ class Duckrun:
         1. Compact: connect("ws/lh.lakehouse/schema") or connect("ws/lh.lakehouse")
         2. Traditional: connect("ws", "lh", "schema") or connect("ws", "lh")
 
-        Schema defaults to "dbo" if not specified.
+        Schema defaults to "dbo" if not specified. When no schema is provided,
+        all tables across all schemas will be listed, but operations will use "dbo".
 
         Examples:
            dr = Duckrun.connect("myworkspace/mylakehouse.lakehouse/bronze")
-           dr = Duckrun.connect("myworkspace/mylakehouse.lakehouse")  # uses dbo
+           dr = Duckrun.connect("myworkspace/mylakehouse.lakehouse")  # lists all, uses dbo
            dr = Duckrun.connect("myworkspace", "mylakehouse", "bronze")
-           dr = Duckrun.connect("myworkspace", "mylakehouse")  # uses dbo
+           dr = Duckrun.connect("myworkspace", "mylakehouse")  # lists all, uses dbo
            dr = Duckrun.connect("ws/lh.lakehouse", sql_folder="./sql")
         """
         print("Connecting to Lakehouse...")
 
+        scan_all_schemas = False
+
         # Check if using compact format: "ws/lh.lakehouse/schema" or "ws/lh.lakehouse"
         if workspace and "/" in workspace and lakehouse_name is None:
             parts = workspace.split("/")
             if len(parts) == 2:
                 # Format: "ws/lh.lakehouse" (schema will use default)
                 workspace, lakehouse_name = parts
-                # schema already has default value "dbo"
-                print(f"ℹ️  No schema specified. Using default schema 'dbo'.")
-                print(f"   To specify a schema, use: {workspace}/{lakehouse_name}.lakehouse/schema")
-                print(f"   Note: Scanning all schemas will be added in a future update.\n")
+                scan_all_schemas = True
+                print(f"ℹ️  No schema specified. Using default schema 'dbo' for operations.")
+                print(f"   Scanning all schemas for table discovery...")
+                print(f"   ⚠️  WARNING: Scanning all schemas can be slow for large lakehouses!")
+                print(f"   💡 For better performance, specify a schema: {workspace}/{lakehouse_name}.lakehouse/schema\n")
             elif len(parts) == 3:
                 # Format: "ws/lh.lakehouse/schema"
                 workspace, lakehouse_name, schema = parts
@@ -177,18 +183,27 @@ class Duckrun:
             # Remove .lakehouse suffix if present
             if lakehouse_name.endswith(".lakehouse"):
                 lakehouse_name = lakehouse_name[:-10]
+        elif lakehouse_name is not None:
+            # Traditional format used, check if schema was explicitly provided
+            # If schema is still "dbo" (default), scan all schemas
+            if schema == "dbo":
+                scan_all_schemas = True
+                print(f"ℹ️  No schema specified. Using default schema 'dbo' for operations.")
+                print(f"   Scanning all schemas for table discovery...")
+                print(f"   ⚠️  WARNING: Scanning all schemas can be slow for large lakehouses!")
+                print(f"   💡 For better performance, specify a schema explicitly.\n")
 
         # Validate all required parameters are present
         if not workspace or not lakehouse_name:
             raise ValueError(
                 "Missing required parameters. Use either:\n"
                 "  connect('workspace/lakehouse.lakehouse/schema')\n"
-                "  connect('workspace/lakehouse.lakehouse')  # defaults to dbo\n"
+                "  connect('workspace/lakehouse.lakehouse')  # defaults to dbo, lists all\n"
                 "  connect('workspace', 'lakehouse', 'schema')\n"
-                "  connect('workspace', 'lakehouse')  # defaults to dbo"
+                "  connect('workspace', 'lakehouse')  # defaults to dbo, lists all"
             )
 
-        return cls(workspace, lakehouse_name, schema, sql_folder, compaction_threshold)
+        return cls(workspace, lakehouse_name, schema, sql_folder, compaction_threshold, scan_all_schemas)
 
     def _get_storage_token(self):
         return os.environ.get("AZURE_STORAGE_TOKEN", "PLACEHOLDER_TOKEN_TOKEN_NOT_AVAILABLE")
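The `connect()` hunks above infer `scan_all_schemas` from the shape of the call instead of exposing it as a parameter of `connect()` itself. A condensed, standalone sketch of that parsing rule (the `spec` string is hypothetical; variable names mirror the diff):

```python
# Two path segments -> no schema given: default to "dbo" and scan all schemas.
# Three segments -> explicit schema: scan only that one.
spec = "myworkspace/mylakehouse.lakehouse/bronze"  # hypothetical input
parts = spec.split("/")
if len(parts) == 2:
    workspace, lakehouse_name = parts
    schema, scan_all_schemas = "dbo", True
else:
    workspace, lakehouse_name, schema = parts
    scan_all_schemas = False
if lakehouse_name.endswith(".lakehouse"):
    lakehouse_name = lakehouse_name[:-10]  # strip the ".lakehouse" suffix
print(workspace, lakehouse_name, schema, scan_all_schemas)
# -> myworkspace mylakehouse bronze False
```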
@@ -208,47 +223,77 @@ class Duckrun:
     def _attach_lakehouse(self):
         self._create_onelake_secret()
         try:
-            # Use expensive list operation but filter for _delta_log folders only
-            # This avoids parsing JSON content that causes Iceberg metadata issues
-            print(f"Scanning for Delta tables in {self.schema}... (this may take a moment)")
-
-            list_tables_query = f"""
-                SELECT DISTINCT
-                    regexp_extract(file, 'Tables/{self.schema}/([^/]+)/_delta_log', 1) as table_name
-                FROM glob("abfss://{self.workspace}@onelake.dfs.fabric.microsoft.com/{self.lakehouse_name}.Lakehouse/Tables/{self.schema}/**")
-                WHERE file LIKE '%/_delta_log/%'
-                    AND file NOT LIKE '%/metadata/%'
-                    AND file NOT LIKE '%/iceberg/%'
-                    AND regexp_extract(file, 'Tables/{self.schema}/([^/]+)/_delta_log', 1) IS NOT NULL
-            """
+            if self.scan_all_schemas:
+                # Scan all schemas
+                print(f"⚠️  Scanning for Delta tables across all schemas...")
+                print(f"   This may take a while for large lakehouses with many schemas/tables.")
+
+                list_tables_query = f"""
+                    SELECT DISTINCT
+                        regexp_extract(file, 'Tables/([^/]+)/([^/]+)/_delta_log', 1) as schema_name,
+                        regexp_extract(file, 'Tables/([^/]+)/([^/]+)/_delta_log', 2) as table_name
+                    FROM glob("abfss://{self.workspace}@onelake.dfs.fabric.microsoft.com/{self.lakehouse_name}.Lakehouse/Tables/**")
+                    WHERE file LIKE '%/_delta_log/%'
+                        AND file NOT LIKE '%/metadata/%'
+                        AND file NOT LIKE '%/iceberg/%'
+                        AND regexp_extract(file, 'Tables/([^/]+)/([^/]+)/_delta_log', 1) IS NOT NULL
+                        AND regexp_extract(file, 'Tables/([^/]+)/([^/]+)/_delta_log', 2) IS NOT NULL
+                    ORDER BY schema_name, table_name
+                """
+            else:
+                # Scan specific schema only
+                print(f"Scanning for Delta tables in {self.schema}... (this may take a moment)")
+
+                list_tables_query = f"""
+                    SELECT DISTINCT
+                        '{self.schema}' as schema_name,
+                        regexp_extract(file, 'Tables/{self.schema}/([^/]+)/_delta_log', 1) as table_name
+                    FROM glob("abfss://{self.workspace}@onelake.dfs.fabric.microsoft.com/{self.lakehouse_name}.Lakehouse/Tables/{self.schema}/**")
+                    WHERE file LIKE '%/_delta_log/%'
+                        AND file NOT LIKE '%/metadata/%'
+                        AND file NOT LIKE '%/iceberg/%'
+                        AND regexp_extract(file, 'Tables/{self.schema}/([^/]+)/_delta_log', 1) IS NOT NULL
+                """
 
             list_tables_df = self.con.sql(list_tables_query).df()
 
             if list_tables_df.empty:
-                print(f"No Delta tables found in {self.lakehouse_name}.Lakehouse/Tables/{self.schema}.")
+                if self.scan_all_schemas:
+                    print(f"No Delta tables found in {self.lakehouse_name}.Lakehouse/Tables/")
+                else:
+                    print(f"No Delta tables found in {self.lakehouse_name}.Lakehouse/Tables/{self.schema}/")
                 return
 
-            table_names = list_tables_df['table_name'].tolist()
+            print(f"Found {len(list_tables_df)} Delta tables. Attaching as views...\n")
 
-            print(f"Found {len(table_names)} Delta tables. Attaching as views...")
-
-            for table in table_names:
+            for _, row in list_tables_df.iterrows():
+                schema_name = row['schema_name']
+                table_name = row['table_name']
+
                 # Skip Iceberg-related folders and empty names
-                if not table or table in ('metadata', 'iceberg'):
+                if not table_name or table_name in ('metadata', 'iceberg'):
                     continue
 
                 try:
+                    # Create view with schema prefix to avoid conflicts
+                    view_name = f"{schema_name}_{table_name}" if self.scan_all_schemas else table_name
+
                     self.con.sql(f"""
-                        CREATE OR REPLACE VIEW {table}
-                        AS SELECT * FROM delta_scan('{self.table_base_url}{self.schema}/{table}');
+                        CREATE OR REPLACE VIEW {view_name}
+                        AS SELECT * FROM delta_scan('{self.table_base_url}{schema_name}/{table_name}');
                     """)
-                    print(f"  ✓ Attached: {table}")
+                    print(f"  ✓ Attached: {schema_name}.{table_name} → {view_name}")
                 except Exception as e:
-                    print(f"  ⚠ Skipped {table}: {str(e)[:100]}")
+                    print(f"  ⚠ Skipped {schema_name}.{table_name}: {str(e)[:100]}")
                     continue
 
             print("\nAttached tables (views) in DuckDB:")
-            self.con.sql("SELECT name FROM (SHOW ALL TABLES) WHERE database='memory'").show()
+            self.con.sql("SELECT name FROM (SHOW ALL TABLES) WHERE database='memory' ORDER BY name").show()
+
+            if self.scan_all_schemas:
+                print(f"\nNote: Tables are prefixed with schema (e.g., dbo_tablename)")
+                print(f"      Default schema for operations: {self.schema}")
+
         except Exception as e:
             print(f"Error attaching lakehouse: {e}")
             print("Continuing without pre-attached tables.")
duckrun-0.1.5.6.dist-info/METADATA CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: duckrun
-Version: 0.1.5.5
+Version: 0.1.5.6
 Summary: Lakehouse task runner powered by DuckDB for Microsoft Fabric
 Author: mim
 License-Expression: MIT
@@ -15,7 +15,7 @@ Requires-Dist: deltalake>=0.18.2
 Requires-Dist: requests>=2.28.0
 Dynamic: license-file
 
-<img src="duckrun.png" width="400" alt="Duckrun">
+<img src="https://raw.githubusercontent.com/djouallah/duckrun/main/duckrun.png" width="400" alt="Duckrun">
 
 Simple task runner for Microsoft Fabric Python notebooks, powered by DuckDB and Delta Lake.
 
@@ -38,10 +38,11 @@ pip install duckrun
 ```python
 import duckrun
 
-# Connect to your Fabric lakehouse
+# Connect to your Fabric lakehouse with a specific schema
 con = duckrun.connect("my_workspace/my_lakehouse.lakehouse/dbo")
 
-# Schema defaults to 'dbo' if not specified
+# Schema defaults to 'dbo' if not specified (scans all schemas)
+# ⚠️ WARNING: Scanning all schemas can be slow for large lakehouses!
 con = duckrun.connect("my_workspace/my_lakehouse.lakehouse")
 
 # Explore data
@@ -56,17 +57,37 @@ That's it! No `sql_folder` needed for data exploration.
 ## Connection Format
 
 ```python
-# With schema
+# With schema (recommended for better performance)
 con = duckrun.connect("workspace/lakehouse.lakehouse/schema")
 
-# Without schema (uses 'dbo' by default)
+# Without schema (defaults to 'dbo', scans all schemas)
+# ⚠️ This can be slow for large lakehouses!
 con = duckrun.connect("workspace/lakehouse.lakehouse")
 
 # With options
 con = duckrun.connect("workspace/lakehouse.lakehouse/dbo", sql_folder="./sql")
 ```
 
-**Note:** When schema is not specified, Duckrun defaults to `dbo`. Multi-schema scanning will be added in a future update.
+### Multi-Schema Support
+
+When you don't specify a schema, Duckrun will:
+- **Default to `dbo`** for write operations
+- **Scan all schemas** to discover and attach all Delta tables
+- **Prefix table names** with schema to avoid conflicts (e.g., `dbo_customers`, `bronze_raw_data`)
+
+**Performance Note:** Scanning all schemas requires listing all files in the lakehouse, which can be slow for large lakehouses with many tables. For better performance, always specify a schema when possible.
+
+```python
+# Fast: scans only 'dbo' schema
+con = duckrun.connect("workspace/lakehouse.lakehouse/dbo")
+
+# Slower: scans all schemas
+con = duckrun.connect("workspace/lakehouse.lakehouse")
+
+# Query tables from different schemas (when scanning all)
+con.sql("SELECT * FROM dbo_customers").show()
+con.sql("SELECT * FROM bronze_raw_data").show()
+```
 
 ## Two Ways to Use Duckrun
 
@@ -262,7 +283,7 @@ con = duckrun.connect(
 ```python
 import duckrun
 
-# Connect
+# Connect (specify schema for best performance)
 con = duckrun.connect("Analytics/Sales.lakehouse/dbo", sql_folder="./sql")
 
 # Pipeline with mixed tasks
@@ -297,7 +318,7 @@ con.sql("""
 ## How It Works
 
 1. **Connection**: Duckrun connects to your Fabric lakehouse using OneLake and Azure authentication
-2. **Table Discovery**: Automatically scans for Delta tables in your schema and creates DuckDB views
+2. **Table Discovery**: Automatically scans for Delta tables in your schema (or all schemas) and creates DuckDB views
 3. **Query Execution**: Run SQL queries directly against Delta tables using DuckDB's speed
 4. **Write Operations**: Results are written back as Delta tables with automatic optimization
 5. **Pipelines**: Orchestrate complex workflows with reusable SQL and Python tasks
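Step 2 of the list above is where the new multi-schema behavior lives. A hypothetical, self-contained illustration of the naming rule the discovery loop applies when attaching views (mirroring the `view_name` expression in the core.py hunk):

```python
# When all schemas are scanned, views get a schema prefix so same-named
# tables in different schemas don't collide inside one DuckDB catalog.
def view_name(schema_name: str, table_name: str, scan_all_schemas: bool) -> str:
    return f"{schema_name}_{table_name}" if scan_all_schemas else table_name

assert view_name("bronze", "raw_data", True) == "bronze_raw_data"
assert view_name("dbo", "customers", True) == "dbo_customers"
assert view_name("dbo", "customers", False) == "customers"  # single-schema scan
```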
duckrun-0.1.5.6.dist-info/RECORD ADDED
@@ -0,0 +1,7 @@
+duckrun/__init__.py,sha256=L0jRtD9Ld8Ti4e6GRvPDdHvkQCFAPHM43GSP7ARh6EM,241
+duckrun/core.py,sha256=AjaY3fkbO2S9rCejy-gF06UgQ13J1K6gBAp_AEwcyRs,23762
+duckrun-0.1.5.6.dist-info/licenses/LICENSE,sha256=-DeQQwdbCbkB4507ZF3QbocysB-EIjDtaLexvqRkGZc,1083
+duckrun-0.1.5.6.dist-info/METADATA,sha256=bGr8L2ZCLOqVtvUtcpBQPxtLgkiZAhy7lOq0U4KtTSI,9258
+duckrun-0.1.5.6.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+duckrun-0.1.5.6.dist-info/top_level.txt,sha256=BknMEwebbUHrVAp3SC92ps8MPhK7XSYsaogTvi_DmEU,8
+duckrun-0.1.5.6.dist-info/RECORD,,
duckrun-0.1.5.5.dist-info/RECORD REMOVED
@@ -1,7 +0,0 @@
-duckrun/__init__.py,sha256=L0jRtD9Ld8Ti4e6GRvPDdHvkQCFAPHM43GSP7ARh6EM,241
-duckrun/core.py,sha256=0Jo7zkVuTvdPPt-ubUhy5996oAm4VffZrH6K1AUw7wE,20804
-duckrun-0.1.5.5.dist-info/licenses/LICENSE,sha256=-DeQQwdbCbkB4507ZF3QbocysB-EIjDtaLexvqRkGZc,1083
-duckrun-0.1.5.5.dist-info/METADATA,sha256=0fp-MgKtZuxYBxvXtGOpUsK4aJbaobLckzFfq-LMu4o,8201
-duckrun-0.1.5.5.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
-duckrun-0.1.5.5.dist-info/top_level.txt,sha256=BknMEwebbUHrVAp3SC92ps8MPhK7XSYsaogTvi_DmEU,8
-duckrun-0.1.5.5.dist-info/RECORD,,