duckrun 0.2.14.dev3__py3-none-any.whl → 0.2.15__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of duckrun might be problematic. Click here for more details.
- duckrun/core.py +38 -14
- duckrun/stats.py +77 -16
- {duckrun-0.2.14.dev3.dist-info → duckrun-0.2.15.dist-info}/METADATA +1 -1
- duckrun-0.2.15.dist-info/RECORD +14 -0
- duckrun-0.2.14.dev3.dist-info/RECORD +0 -14
- {duckrun-0.2.14.dev3.dist-info → duckrun-0.2.15.dist-info}/WHEEL +0 -0
- {duckrun-0.2.14.dev3.dist-info → duckrun-0.2.15.dist-info}/licenses/LICENSE +0 -0
- {duckrun-0.2.14.dev3.dist-info → duckrun-0.2.15.dist-info}/top_level.txt +0 -0
duckrun/core.py
CHANGED
|
@@ -817,16 +817,16 @@ class Duckrun:
|
|
|
817
817
|
# Fallback to original value
|
|
818
818
|
return self.workspace_id
|
|
819
819
|
|
|
820
|
-
def get_lakehouse_id(self, force: bool = False) -> str:
|
|
820
|
+
def get_item_id(self, force: bool = False) -> str:
|
|
821
821
|
"""
|
|
822
|
-
Get the
|
|
823
|
-
Use this when passing lakehouse parameter to Python functions.
|
|
822
|
+
Get the item ID (GUID or name) - works for lakehouses, warehouses, databases, etc.
|
|
823
|
+
Use this when passing lakehouse/item parameter to Python functions.
|
|
824
824
|
|
|
825
825
|
Args:
|
|
826
826
|
force: If True, always resolve to actual GUID via API. If False, returns stored value (default: False)
|
|
827
827
|
|
|
828
828
|
Returns:
|
|
829
|
-
|
|
829
|
+
Item ID - either a GUID or item name (supports all OneLake item types)
|
|
830
830
|
"""
|
|
831
831
|
if not force:
|
|
832
832
|
return self.lakehouse_id
|
|
@@ -839,14 +839,24 @@ class Duckrun:
|
|
|
839
839
|
if guid_pattern.match(self.lakehouse_id):
|
|
840
840
|
return self.lakehouse_id
|
|
841
841
|
|
|
842
|
-
#
|
|
843
|
-
|
|
844
|
-
|
|
845
|
-
|
|
846
|
-
if
|
|
847
|
-
|
|
848
|
-
|
|
849
|
-
|
|
842
|
+
# Detect item type from lakehouse_id (e.g., "data.Lakehouse" -> Lakehouse)
|
|
843
|
+
item_type = None
|
|
844
|
+
item_name = self.lakehouse_id
|
|
845
|
+
for suffix in ['.Lakehouse', '.Warehouse', '.Database', '.SnowflakeDatabase']:
|
|
846
|
+
if self.lakehouse_id.endswith(suffix):
|
|
847
|
+
item_type = suffix[1:] # Remove the leading dot
|
|
848
|
+
item_name = self.lakehouse_id[:-len(suffix)]
|
|
849
|
+
break
|
|
850
|
+
|
|
851
|
+
# Try to get from notebook context first (only works for lakehouses)
|
|
852
|
+
if item_type == 'Lakehouse' or item_type is None:
|
|
853
|
+
try:
|
|
854
|
+
import notebookutils # type: ignore
|
|
855
|
+
lakehouse_guid = notebookutils.lakehouse.get("id")
|
|
856
|
+
if lakehouse_guid:
|
|
857
|
+
return lakehouse_guid
|
|
858
|
+
except (ImportError, Exception):
|
|
859
|
+
pass
|
|
850
860
|
|
|
851
861
|
# Resolve via API
|
|
852
862
|
try:
|
|
@@ -855,8 +865,15 @@ class Duckrun:
|
|
|
855
865
|
if token:
|
|
856
866
|
# First get workspace GUID
|
|
857
867
|
workspace_guid = self.get_workspace_id(force=True)
|
|
858
|
-
|
|
859
|
-
|
|
868
|
+
|
|
869
|
+
# Use appropriate resolver based on item type
|
|
870
|
+
if item_type == 'Lakehouse' or item_type is None:
|
|
871
|
+
# Use lakehouse-specific API
|
|
872
|
+
resolved_id = self._resolve_lakehouse_id_by_name(token, workspace_guid, item_name if item_name else self.lakehouse_id)
|
|
873
|
+
else:
|
|
874
|
+
# Use generic items API for warehouses, databases, etc.
|
|
875
|
+
resolved_id = self._resolve_item_id_by_name(token, workspace_guid, item_name, item_type)
|
|
876
|
+
|
|
860
877
|
if resolved_id:
|
|
861
878
|
return resolved_id
|
|
862
879
|
except Exception:
|
|
@@ -864,6 +881,13 @@ class Duckrun:
|
|
|
864
881
|
|
|
865
882
|
# Fallback to original value
|
|
866
883
|
return self.lakehouse_id
|
|
884
|
+
|
|
885
|
+
def get_lakehouse_id(self, force: bool = False) -> str:
|
|
886
|
+
"""
|
|
887
|
+
Deprecated: Use get_item_id() instead.
|
|
888
|
+
Backward compatibility alias for get_item_id().
|
|
889
|
+
"""
|
|
890
|
+
return self.get_item_id(force)
|
|
867
891
|
|
|
868
892
|
def run(self, pipeline: List[Tuple]) -> bool:
|
|
869
893
|
"""
|
duckrun/stats.py
CHANGED
|
@@ -142,7 +142,9 @@ def get_stats(duckrun_instance, source: str):
|
|
|
142
142
|
|
|
143
143
|
print(f"Processing {len(list_tables)} tables: {list_tables}")
|
|
144
144
|
|
|
145
|
+
successful_tables = []
|
|
145
146
|
for idx, tbl in enumerate(list_tables):
|
|
147
|
+
print(f"[{idx+1}/{len(list_tables)}] Processing table '{tbl}'...")
|
|
146
148
|
# Construct lakehouse path using correct ABFSS URL format (no .Lakehouse suffix)
|
|
147
149
|
table_path = f"{duckrun_instance.table_base_url}{schema_name}/{tbl}"
|
|
148
150
|
|
|
@@ -210,23 +212,82 @@ def get_stats(duckrun_instance, source: str):
|
|
|
210
212
|
''')
|
|
211
213
|
|
|
212
214
|
except Exception as e:
|
|
213
|
-
|
|
214
|
-
|
|
215
|
-
|
|
216
|
-
|
|
217
|
-
|
|
218
|
-
|
|
219
|
-
|
|
220
|
-
|
|
221
|
-
|
|
222
|
-
|
|
223
|
-
|
|
224
|
-
|
|
225
|
-
|
|
226
|
-
|
|
215
|
+
error_msg = str(e)
|
|
216
|
+
print(f"Warning: Could not process table '{tbl}' using DeltaTable API: {e}")
|
|
217
|
+
|
|
218
|
+
# Fallback: Use DuckDB's delta_scan with filename parameter
|
|
219
|
+
if "Invalid JSON" in error_msg or "MetadataValue" in error_msg:
|
|
220
|
+
print(f" Detected JSON parsing issue - falling back to DuckDB delta_scan")
|
|
221
|
+
else:
|
|
222
|
+
print(f" Falling back to DuckDB delta_scan")
|
|
223
|
+
|
|
224
|
+
try:
|
|
225
|
+
# First get the list of actual parquet files using delta_scan
|
|
226
|
+
file_list_result = con.execute(f'''
|
|
227
|
+
SELECT DISTINCT filename
|
|
228
|
+
FROM delta_scan('{table_path}', filename=1)
|
|
229
|
+
''').fetchall()
|
|
230
|
+
|
|
231
|
+
if not file_list_result:
|
|
232
|
+
# Empty table
|
|
233
|
+
con.execute(f'''
|
|
234
|
+
CREATE OR REPLACE TEMP TABLE tbl_{idx} AS
|
|
235
|
+
SELECT
|
|
236
|
+
'{tbl}' as tbl,
|
|
237
|
+
'empty' as file_name,
|
|
238
|
+
0 as num_rows,
|
|
239
|
+
0 as num_row_groups,
|
|
240
|
+
0 as size,
|
|
241
|
+
false as vorder,
|
|
242
|
+
'{timestamp}' as timestamp
|
|
243
|
+
WHERE false
|
|
244
|
+
''')
|
|
245
|
+
else:
|
|
246
|
+
# Extract just the filename (not the full path) from delta_scan results
|
|
247
|
+
# delta_scan returns full ABFSS paths, we need to extract just the filename part
|
|
248
|
+
filenames = []
|
|
249
|
+
for row in file_list_result:
|
|
250
|
+
full_path = row[0]
|
|
251
|
+
# Extract just the filename from the full ABFSS path
|
|
252
|
+
if '/' in full_path:
|
|
253
|
+
filename = full_path.split('/')[-1]
|
|
254
|
+
else:
|
|
255
|
+
filename = full_path
|
|
256
|
+
filenames.append(table_path + "/" + filename)
|
|
257
|
+
|
|
258
|
+
# Use parquet_file_metadata to get actual parquet stats
|
|
259
|
+
con.execute(f'''
|
|
260
|
+
CREATE OR REPLACE TEMP TABLE tbl_{idx} AS
|
|
261
|
+
SELECT
|
|
262
|
+
'{tbl}' as tbl,
|
|
263
|
+
file_name,
|
|
264
|
+
num_rows,
|
|
265
|
+
num_row_groups,
|
|
266
|
+
0 as size,
|
|
267
|
+
false as vorder,
|
|
268
|
+
'{timestamp}' as timestamp
|
|
269
|
+
FROM parquet_file_metadata({filenames})
|
|
270
|
+
''')
|
|
271
|
+
|
|
272
|
+
print(f" ✓ Successfully processed '{tbl}' using DuckDB fallback with parquet metadata")
|
|
273
|
+
except Exception as fallback_error:
|
|
274
|
+
print(f" ✗ DuckDB fallback also failed for '{tbl}': {fallback_error}")
|
|
275
|
+
print(f" ⏭️ Skipping table '{tbl}'")
|
|
276
|
+
continue
|
|
277
|
+
|
|
278
|
+
# Mark this table as successfully processed
|
|
279
|
+
successful_tables.append(idx)
|
|
280
|
+
|
|
281
|
+
# Only union tables that were successfully processed
|
|
282
|
+
if not successful_tables:
|
|
283
|
+
# No tables were processed successfully - return empty dataframe
|
|
284
|
+
print("⚠️ No tables could be processed successfully")
|
|
285
|
+
import pandas as pd
|
|
286
|
+
return pd.DataFrame(columns=['tbl', 'total_rows', 'num_files', 'num_row_group',
|
|
287
|
+
'average_row_group', 'file_size_MB', 'vorder', 'timestamp'])
|
|
227
288
|
|
|
228
|
-
# Union all temp tables
|
|
229
|
-
union_parts = [f'SELECT * FROM tbl_{i}' for i in range(len(list_tables))]
|
|
289
|
+
# Union all successfully processed temp tables
|
|
290
|
+
union_parts = [f'SELECT * FROM tbl_{i}' for i in successful_tables]
|
|
230
291
|
union_query = ' UNION ALL '.join(union_parts)
|
|
231
292
|
|
|
232
293
|
# Generate final summary
|
|
@@ -0,0 +1,14 @@
|
|
|
1
|
+
duckrun/__init__.py,sha256=oPQXpJEgHpX_KgMrx_TWax9awIbr2B9z32cFuuG_p30,236
|
|
2
|
+
duckrun/auth.py,sha256=EMaf-L2zeNOjbHOT97xYxfZNfWo4WrwrU1h3vBQTgEc,9624
|
|
3
|
+
duckrun/core.py,sha256=c98sASAWlq0DDIR9gYbj5ZaKOa6MoO8Z09qhRhG4JWI,67097
|
|
4
|
+
duckrun/files.py,sha256=Fvdjg3DyHJzIVzKo8M_j-eGz4zU61lOB38Y_onbQJkI,10137
|
|
5
|
+
duckrun/lakehouse.py,sha256=j--Z3zo8AOWt1GF9VzRosmmTAy6ey2D0LVubti58twU,14109
|
|
6
|
+
duckrun/runner.py,sha256=JnRJoQ_Db__iXlhjTohplXR83NUJxItgyaa7AzrDxwE,14833
|
|
7
|
+
duckrun/semantic_model.py,sha256=obzlN2-dbEW3JmDop-vrZGGGLi9u3ThhTbgtDjou7uY,29509
|
|
8
|
+
duckrun/stats.py,sha256=xqgtW_HHAizom6E13_UjitNgmz6pzK10XdosPWJO1Ew,14282
|
|
9
|
+
duckrun/writer.py,sha256=svUuPCYOhrz299NgnpTKhARKjfej0PxnoND2iPDSypk,8098
|
|
10
|
+
duckrun-0.2.15.dist-info/licenses/LICENSE,sha256=-DeQQwdbCbkB4507ZF3QbocysB-EIjDtaLexvqRkGZc,1083
|
|
11
|
+
duckrun-0.2.15.dist-info/METADATA,sha256=xExTRo--bAjK6Ioq7O6F_641ZkVGgHj3_d-jHO9tadE,20766
|
|
12
|
+
duckrun-0.2.15.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
|
13
|
+
duckrun-0.2.15.dist-info/top_level.txt,sha256=BknMEwebbUHrVAp3SC92ps8MPhK7XSYsaogTvi_DmEU,8
|
|
14
|
+
duckrun-0.2.15.dist-info/RECORD,,
|
|
@@ -1,14 +0,0 @@
|
|
|
1
|
-
duckrun/__init__.py,sha256=oPQXpJEgHpX_KgMrx_TWax9awIbr2B9z32cFuuG_p30,236
|
|
2
|
-
duckrun/auth.py,sha256=EMaf-L2zeNOjbHOT97xYxfZNfWo4WrwrU1h3vBQTgEc,9624
|
|
3
|
-
duckrun/core.py,sha256=_D0CnaRNQm_wW4bSP__EAPHEt_VNgf9N-VXWYSZScL8,65829
|
|
4
|
-
duckrun/files.py,sha256=Fvdjg3DyHJzIVzKo8M_j-eGz4zU61lOB38Y_onbQJkI,10137
|
|
5
|
-
duckrun/lakehouse.py,sha256=j--Z3zo8AOWt1GF9VzRosmmTAy6ey2D0LVubti58twU,14109
|
|
6
|
-
duckrun/runner.py,sha256=JnRJoQ_Db__iXlhjTohplXR83NUJxItgyaa7AzrDxwE,14833
|
|
7
|
-
duckrun/semantic_model.py,sha256=obzlN2-dbEW3JmDop-vrZGGGLi9u3ThhTbgtDjou7uY,29509
|
|
8
|
-
duckrun/stats.py,sha256=oKIjZ7u5cFVT63FuOl5UqoDsOG3098woSCn-uI6i_sQ,11084
|
|
9
|
-
duckrun/writer.py,sha256=svUuPCYOhrz299NgnpTKhARKjfej0PxnoND2iPDSypk,8098
|
|
10
|
-
duckrun-0.2.14.dev3.dist-info/licenses/LICENSE,sha256=-DeQQwdbCbkB4507ZF3QbocysB-EIjDtaLexvqRkGZc,1083
|
|
11
|
-
duckrun-0.2.14.dev3.dist-info/METADATA,sha256=tOLtAIHcEJyXk93hvvgZNC3Cx7U2Dy7iatRutBnrU3Y,20771
|
|
12
|
-
duckrun-0.2.14.dev3.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
|
13
|
-
duckrun-0.2.14.dev3.dist-info/top_level.txt,sha256=BknMEwebbUHrVAp3SC92ps8MPhK7XSYsaogTvi_DmEU,8
|
|
14
|
-
duckrun-0.2.14.dev3.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|
|
File without changes
|