duckrun 0.2.14.dev3__py3-none-any.whl → 0.2.14.dev40__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of duckrun might be problematic; consult the associated advisory for more details.

duckrun/core.py CHANGED
@@ -817,16 +817,16 @@ class Duckrun:
817
817
  # Fallback to original value
818
818
  return self.workspace_id
819
819
 
820
- def get_lakehouse_id(self, force: bool = False) -> str:
820
+ def get_item_id(self, force: bool = False) -> str:
821
821
  """
822
- Get the lakehouse ID (GUID or name).
823
- Use this when passing lakehouse parameter to Python functions.
822
+ Get the item ID (GUID or name) - works for lakehouses, warehouses, databases, etc.
823
+ Use this when passing lakehouse/item parameter to Python functions.
824
824
 
825
825
  Args:
826
826
  force: If True, always resolve to actual GUID via API. If False, returns stored value (default: False)
827
827
 
828
828
  Returns:
829
- Lakehouse ID - either a GUID or lakehouse name
829
+ Item ID - either a GUID or item name (supports all OneLake item types)
830
830
  """
831
831
  if not force:
832
832
  return self.lakehouse_id
@@ -839,14 +839,24 @@ class Duckrun:
839
839
  if guid_pattern.match(self.lakehouse_id):
840
840
  return self.lakehouse_id
841
841
 
842
- # Try to get from notebook context first (fastest)
843
- try:
844
- import notebookutils # type: ignore
845
- lakehouse_guid = notebookutils.lakehouse.get("id")
846
- if lakehouse_guid:
847
- return lakehouse_guid
848
- except (ImportError, Exception):
849
- pass
842
+ # Detect item type from lakehouse_id (e.g., "data.Lakehouse" -> Lakehouse)
843
+ item_type = None
844
+ item_name = self.lakehouse_id
845
+ for suffix in ['.Lakehouse', '.Warehouse', '.Database', '.SnowflakeDatabase']:
846
+ if self.lakehouse_id.endswith(suffix):
847
+ item_type = suffix[1:] # Remove the leading dot
848
+ item_name = self.lakehouse_id[:-len(suffix)]
849
+ break
850
+
851
+ # Try to get from notebook context first (only works for lakehouses)
852
+ if item_type == 'Lakehouse' or item_type is None:
853
+ try:
854
+ import notebookutils # type: ignore
855
+ lakehouse_guid = notebookutils.lakehouse.get("id")
856
+ if lakehouse_guid:
857
+ return lakehouse_guid
858
+ except (ImportError, Exception):
859
+ pass
850
860
 
851
861
  # Resolve via API
852
862
  try:
@@ -855,8 +865,15 @@ class Duckrun:
855
865
  if token:
856
866
  # First get workspace GUID
857
867
  workspace_guid = self.get_workspace_id(force=True)
858
- # Then resolve lakehouse name to ID
859
- resolved_id = self._resolve_lakehouse_id_by_name(token, workspace_guid, self.lakehouse_id)
868
+
869
+ # Use appropriate resolver based on item type
870
+ if item_type == 'Lakehouse' or item_type is None:
871
+ # Use lakehouse-specific API
872
+ resolved_id = self._resolve_lakehouse_id_by_name(token, workspace_guid, item_name if item_name else self.lakehouse_id)
873
+ else:
874
+ # Use generic items API for warehouses, databases, etc.
875
+ resolved_id = self._resolve_item_id_by_name(token, workspace_guid, item_name, item_type)
876
+
860
877
  if resolved_id:
861
878
  return resolved_id
862
879
  except Exception:
@@ -864,6 +881,13 @@ class Duckrun:
864
881
 
865
882
  # Fallback to original value
866
883
  return self.lakehouse_id
884
+
885
+ def get_lakehouse_id(self, force: bool = False) -> str:
886
+ """
887
+ Deprecated: Use get_item_id() instead.
888
+ Backward compatibility alias for get_item_id().
889
+ """
890
+ return self.get_item_id(force)
867
891
 
868
892
  def run(self, pipeline: List[Tuple]) -> bool:
869
893
  """
duckrun/stats.py CHANGED
@@ -142,7 +142,9 @@ def get_stats(duckrun_instance, source: str):
142
142
 
143
143
  print(f"Processing {len(list_tables)} tables: {list_tables}")
144
144
 
145
+ successful_tables = []
145
146
  for idx, tbl in enumerate(list_tables):
147
+ print(f"[{idx+1}/{len(list_tables)}] Processing table '{tbl}'...")
146
148
  # Construct lakehouse path using correct ABFSS URL format (no .Lakehouse suffix)
147
149
  table_path = f"{duckrun_instance.table_base_url}{schema_name}/{tbl}"
148
150
 
@@ -210,23 +212,82 @@ def get_stats(duckrun_instance, source: str):
210
212
  ''')
211
213
 
212
214
  except Exception as e:
213
- print(f"Warning: Could not process table '{tbl}': {e}")
214
- # Create empty temp table for failed tables
215
- con.execute(f'''
216
- CREATE OR REPLACE TEMP TABLE tbl_{idx} AS
217
- SELECT
218
- '{tbl}' as tbl,
219
- 'error' as file_name,
220
- 0 as num_rows,
221
- 0 as num_row_groups,
222
- 0 as size,
223
- false as vorder,
224
- '{timestamp}' as timestamp
225
- WHERE false
226
- ''')
215
+ error_msg = str(e)
216
+ print(f"Warning: Could not process table '{tbl}' using DeltaTable API: {e}")
217
+
218
+ # Fallback: Use DuckDB's delta_scan with filename parameter
219
+ if "Invalid JSON" in error_msg or "MetadataValue" in error_msg:
220
+ print(f" Detected JSON parsing issue - falling back to DuckDB delta_scan")
221
+ else:
222
+ print(f" Falling back to DuckDB delta_scan")
223
+
224
+ try:
225
+ # First get the list of actual parquet files using delta_scan
226
+ file_list_result = con.execute(f'''
227
+ SELECT DISTINCT filename
228
+ FROM delta_scan('{table_path}', filename=1)
229
+ ''').fetchall()
230
+
231
+ if not file_list_result:
232
+ # Empty table
233
+ con.execute(f'''
234
+ CREATE OR REPLACE TEMP TABLE tbl_{idx} AS
235
+ SELECT
236
+ '{tbl}' as tbl,
237
+ 'empty' as file_name,
238
+ 0 as num_rows,
239
+ 0 as num_row_groups,
240
+ 0 as size,
241
+ false as vorder,
242
+ '{timestamp}' as timestamp
243
+ WHERE false
244
+ ''')
245
+ else:
246
+ # Extract just the filename (not the full path) from delta_scan results
247
+ # delta_scan returns full ABFSS paths, we need to extract just the filename part
248
+ filenames = []
249
+ for row in file_list_result:
250
+ full_path = row[0]
251
+ # Extract just the filename from the full ABFSS path
252
+ if '/' in full_path:
253
+ filename = full_path.split('/')[-1]
254
+ else:
255
+ filename = full_path
256
+ filenames.append(table_path + "/" + filename)
257
+
258
+ # Use parquet_file_metadata to get actual parquet stats
259
+ con.execute(f'''
260
+ CREATE OR REPLACE TEMP TABLE tbl_{idx} AS
261
+ SELECT
262
+ '{tbl}' as tbl,
263
+ file_name,
264
+ num_rows,
265
+ num_row_groups,
266
+ 0 as size,
267
+ false as vorder,
268
+ '{timestamp}' as timestamp
269
+ FROM parquet_file_metadata({filenames})
270
+ ''')
271
+
272
+ print(f" ✓ Successfully processed '{tbl}' using DuckDB fallback with parquet metadata")
273
+ except Exception as fallback_error:
274
+ print(f" ✗ DuckDB fallback also failed for '{tbl}': {fallback_error}")
275
+ print(f" ⏭️ Skipping table '{tbl}'")
276
+ continue
277
+
278
+ # Mark this table as successfully processed
279
+ successful_tables.append(idx)
280
+
281
+ # Only union tables that were successfully processed
282
+ if not successful_tables:
283
+ # No tables were processed successfully - return empty dataframe
284
+ print("⚠️ No tables could be processed successfully")
285
+ import pandas as pd
286
+ return pd.DataFrame(columns=['tbl', 'total_rows', 'num_files', 'num_row_group',
287
+ 'average_row_group', 'file_size_MB', 'vorder', 'timestamp'])
227
288
 
228
- # Union all temp tables
229
- union_parts = [f'SELECT * FROM tbl_{i}' for i in range(len(list_tables))]
289
+ # Union all successfully processed temp tables
290
+ union_parts = [f'SELECT * FROM tbl_{i}' for i in successful_tables]
230
291
  union_query = ' UNION ALL '.join(union_parts)
231
292
 
232
293
  # Generate final summary
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: duckrun
3
- Version: 0.2.14.dev3
3
+ Version: 0.2.14.dev40
4
4
  Summary: Lakehouse task runner powered by DuckDB for Microsoft Fabric
5
5
  Author: mim
6
6
  License: MIT
@@ -0,0 +1,14 @@
1
+ duckrun/__init__.py,sha256=oPQXpJEgHpX_KgMrx_TWax9awIbr2B9z32cFuuG_p30,236
2
+ duckrun/auth.py,sha256=EMaf-L2zeNOjbHOT97xYxfZNfWo4WrwrU1h3vBQTgEc,9624
3
+ duckrun/core.py,sha256=c98sASAWlq0DDIR9gYbj5ZaKOa6MoO8Z09qhRhG4JWI,67097
4
+ duckrun/files.py,sha256=Fvdjg3DyHJzIVzKo8M_j-eGz4zU61lOB38Y_onbQJkI,10137
5
+ duckrun/lakehouse.py,sha256=j--Z3zo8AOWt1GF9VzRosmmTAy6ey2D0LVubti58twU,14109
6
+ duckrun/runner.py,sha256=JnRJoQ_Db__iXlhjTohplXR83NUJxItgyaa7AzrDxwE,14833
7
+ duckrun/semantic_model.py,sha256=obzlN2-dbEW3JmDop-vrZGGGLi9u3ThhTbgtDjou7uY,29509
8
+ duckrun/stats.py,sha256=xqgtW_HHAizom6E13_UjitNgmz6pzK10XdosPWJO1Ew,14282
9
+ duckrun/writer.py,sha256=svUuPCYOhrz299NgnpTKhARKjfej0PxnoND2iPDSypk,8098
10
+ duckrun-0.2.14.dev40.dist-info/licenses/LICENSE,sha256=-DeQQwdbCbkB4507ZF3QbocysB-EIjDtaLexvqRkGZc,1083
11
+ duckrun-0.2.14.dev40.dist-info/METADATA,sha256=P6aHVS1SPOmiuFKehThvojrwqtTMU2DHAFuGmyjj-g4,20772
12
+ duckrun-0.2.14.dev40.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
13
+ duckrun-0.2.14.dev40.dist-info/top_level.txt,sha256=BknMEwebbUHrVAp3SC92ps8MPhK7XSYsaogTvi_DmEU,8
14
+ duckrun-0.2.14.dev40.dist-info/RECORD,,
@@ -1,14 +0,0 @@
1
- duckrun/__init__.py,sha256=oPQXpJEgHpX_KgMrx_TWax9awIbr2B9z32cFuuG_p30,236
2
- duckrun/auth.py,sha256=EMaf-L2zeNOjbHOT97xYxfZNfWo4WrwrU1h3vBQTgEc,9624
3
- duckrun/core.py,sha256=_D0CnaRNQm_wW4bSP__EAPHEt_VNgf9N-VXWYSZScL8,65829
4
- duckrun/files.py,sha256=Fvdjg3DyHJzIVzKo8M_j-eGz4zU61lOB38Y_onbQJkI,10137
5
- duckrun/lakehouse.py,sha256=j--Z3zo8AOWt1GF9VzRosmmTAy6ey2D0LVubti58twU,14109
6
- duckrun/runner.py,sha256=JnRJoQ_Db__iXlhjTohplXR83NUJxItgyaa7AzrDxwE,14833
7
- duckrun/semantic_model.py,sha256=obzlN2-dbEW3JmDop-vrZGGGLi9u3ThhTbgtDjou7uY,29509
8
- duckrun/stats.py,sha256=oKIjZ7u5cFVT63FuOl5UqoDsOG3098woSCn-uI6i_sQ,11084
9
- duckrun/writer.py,sha256=svUuPCYOhrz299NgnpTKhARKjfej0PxnoND2iPDSypk,8098
10
- duckrun-0.2.14.dev3.dist-info/licenses/LICENSE,sha256=-DeQQwdbCbkB4507ZF3QbocysB-EIjDtaLexvqRkGZc,1083
11
- duckrun-0.2.14.dev3.dist-info/METADATA,sha256=tOLtAIHcEJyXk93hvvgZNC3Cx7U2Dy7iatRutBnrU3Y,20771
12
- duckrun-0.2.14.dev3.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
13
- duckrun-0.2.14.dev3.dist-info/top_level.txt,sha256=BknMEwebbUHrVAp3SC92ps8MPhK7XSYsaogTvi_DmEU,8
14
- duckrun-0.2.14.dev3.dist-info/RECORD,,