duckrun 0.2.13__py3-none-any.whl → 0.2.19.dev1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of duckrun might be problematic.
- duckrun/__init__.py +3 -2
- duckrun/auth.py +12 -0
- duckrun/core.py +525 -186
- duckrun/notebook.py +324 -0
- duckrun/runner.py +15 -45
- duckrun/semantic_model.py +143 -17
- duckrun/stats.py +267 -62
- duckrun/writer.py +35 -6
- {duckrun-0.2.13.dist-info → duckrun-0.2.19.dev1.dist-info}/METADATA +3 -3
- duckrun-0.2.19.dev1.dist-info/RECORD +15 -0
- duckrun-0.2.13.dist-info/RECORD +0 -14
- {duckrun-0.2.13.dist-info → duckrun-0.2.19.dev1.dist-info}/WHEEL +0 -0
- {duckrun-0.2.13.dist-info → duckrun-0.2.19.dev1.dist-info}/licenses/LICENSE +0 -0
- {duckrun-0.2.13.dist-info → duckrun-0.2.19.dev1.dist-info}/top_level.txt +0 -0
duckrun/stats.py
CHANGED
@@ -60,32 +60,89 @@ def _get_existing_tables_in_schema(duckrun_instance, schema_name: str) -> list:
         return []
 
 
-def get_stats(duckrun_instance, source: str):
+def _match_tables_by_pattern(duckrun_instance, pattern: str) -> dict:
+    """Match tables across all schemas using a wildcard pattern.
+    Pattern can be:
+    - '*.summary' - matches 'summary' table in all schemas
+    - '*summary' - matches any table ending with 'summary'
+    - 'schema.*' - matches all tables in 'schema'
+    Returns a dict mapping schema names to lists of matching table names."""
+    import fnmatch
+
+    try:
+        # Query all schemas and tables in one go
+        query = """
+            SELECT table_schema, table_name
+            FROM information_schema.tables
+            WHERE table_schema NOT LIKE 'pg_%'
+              AND table_schema != 'information_schema'
+              AND table_name NOT LIKE 'tbl_%'
+        """
+        result = duckrun_instance.con.execute(query).fetchall()
+
+        matched = {}
+
+        # Check if pattern contains a dot (schema.table pattern)
+        if '.' in pattern:
+            schema_pattern, table_pattern = pattern.split('.', 1)
+            for schema, table in result:
+                if fnmatch.fnmatch(schema, schema_pattern) and fnmatch.fnmatch(table, table_pattern):
+                    if schema not in matched:
+                        matched[schema] = []
+                    matched[schema].append(table)
+        else:
+            # Pattern matches only table names
+            for schema, table in result:
+                if fnmatch.fnmatch(table, pattern):
+                    if schema not in matched:
+                        matched[schema] = []
+                    matched[schema].append(table)
+
+        return matched
+    except:
+        return {}
+
+
+def get_stats(duckrun_instance, source: str = None, detailed = False):
     """
     Get comprehensive statistics for Delta Lake tables.
 
     Args:
         duckrun_instance: The Duckrun connection instance
-        source: Can be one of:
+        source: Optional. Can be one of:
+            - None: Use all tables in the connection's schema (default)
             - Table name: 'table_name' (uses main schema in DuckDB)
             - Schema.table: 'schema.table_name' (specific table in schema, if multi-schema)
            - Schema only: 'schema' (all tables in schema, if multi-schema)
+            - Wildcard pattern: '*.summary' (matches tables across all schemas)
+        detailed: Optional. Controls the level of detail in statistics:
+            - False (default): Aggregated table-level stats (total rows, file count,
+              row groups, average row group size, file sizes, VORDER status)
+            - True: Row group level statistics with compression details, row group sizes,
+              and parquet metadata
 
     Returns:
-        …
+        DataFrame with statistics based on detailed parameter:
+        - If detailed=False: Aggregated table-level summary
+        - If detailed=True: Granular file and row group level stats
 
     Examples:
        con = duckrun.connect("tmp/data.lakehouse/test")
 
-        #
-        stats = con.get_stats(
+        # All tables in the connection's schema (aggregated)
+        stats = con.get_stats()
+
+        # Single table with detailed row group statistics
+        stats_detailed = con.get_stats('price_today', detailed=True)
 
        # Specific table in different schema (only if multi-schema enabled)
        stats = con.get_stats('aemo.price')
 
       # All tables in a schema (only if multi-schema enabled)
       stats = con.get_stats('aemo')
+
+       # Wildcard pattern across all schemas (only if multi-schema enabled)
+       stats = con.get_stats('*.summary')
     """
     timestamp = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
 
@@ -93,8 +150,31 @@ def get_stats(duckrun_instance, source: str):
     duckdb_schema = "main"
     url_schema = duckrun_instance.schema  # This is from the connection URL path
 
+    # If source is not provided, default to all tables in the connection's schema
+    if source is None:
+        source = url_schema
+
+    # Check if source contains wildcard characters
+    if '*' in source or '?' in source:
+        # Wildcard pattern mode - only valid if multi-schema is enabled
+        if not duckrun_instance.scan_all_schemas:
+            raise ValueError(f"Wildcard pattern '{source}' not supported. Connection was made to a specific schema '{url_schema}'. Enable multi-schema mode to use wildcards.")
+
+        matched_tables = _match_tables_by_pattern(duckrun_instance, source)
+
+        if not matched_tables:
+            raise ValueError(f"No tables found matching pattern '{source}'")
+
+        # Flatten the matched tables into a list with schema info
+        tables_with_schemas = []
+        for schema, tables in matched_tables.items():
+            for table in tables:
+                tables_with_schemas.append((schema, table))
+
+        print(f"Found {len(tables_with_schemas)} tables matching pattern '{source}'")
+
     # Parse the source and validate existence
-    …
+    elif '.' in source:
         # Format: schema.table - only valid if multi-schema is enabled
         schema_name, table_name = source.split('.', 1)
 
@@ -105,44 +185,45 @@ def get_stats(duckrun_instance, source: str):
         if not _table_exists(duckrun_instance, schema_name, table_name):
             raise ValueError(f"Table '{table_name}' does not exist in schema '{schema_name}'")
 
-        …
+        tables_with_schemas = [(schema_name, table_name)]
     else:
         # Could be just table name or schema name
         if duckrun_instance.scan_all_schemas:
            # Multi-schema mode: DuckDB has actual schemas
            # First check if it's a table in main schema
            if _table_exists(duckrun_instance, duckdb_schema, source):
-                …
-                schema_name = duckdb_schema
+                tables_with_schemas = [(duckdb_schema, source)]
            # Otherwise, check if it's a schema name
            elif _schema_exists(duckrun_instance, source):
                schema_name = source
                list_tables = _get_existing_tables_in_schema(duckrun_instance, source)
                if not list_tables:
                    raise ValueError(f"Schema '{source}' exists but contains no tables")
+               tables_with_schemas = [(schema_name, tbl) for tbl in list_tables]
            else:
                raise ValueError(f"Neither table '{source}' in main schema nor schema '{source}' exists")
        else:
            # Single-schema mode: tables are in DuckDB's main schema, use URL schema for file paths
            if _table_exists(duckrun_instance, duckdb_schema, source):
                # It's a table name
-                …
-                schema_name = url_schema  # Use URL schema for file path construction
+                tables_with_schemas = [(url_schema, source)]
            elif source == url_schema:
                # Special case: user asked for stats on the URL schema name - list all tables
                list_tables = _get_existing_tables_in_schema(duckrun_instance, duckdb_schema)
-                schema_name = url_schema  # Use URL schema for file path construction
                if not list_tables:
                    raise ValueError(f"No tables found in schema '{url_schema}'")
+               tables_with_schemas = [(url_schema, tbl) for tbl in list_tables]
            else:
                raise ValueError(f"Table '{source}' does not exist in the current context (schema: {url_schema})")
 
     # Use the existing connection
     con = duckrun_instance.con
 
-    print(f"Processing {len(
+    print(f"Processing {len(tables_with_schemas)} tables from {len(set(s for s, t in tables_with_schemas))} schema(s)")
 
-    …
+    successful_tables = []
+    for idx, (schema_name, tbl) in enumerate(tables_with_schemas):
+        print(f"[{idx+1}/{len(tables_with_schemas)}] Processing table '{schema_name}.{tbl}'...")
        # Construct lakehouse path using correct ABFSS URL format (no .Lakehouse suffix)
        table_path = f"{duckrun_instance.table_base_url}{schema_name}/{tbl}"
 
@@ -169,8 +250,18 @@ def get_stats(duckrun_instance, source: str):
                print(f"Warning: Could not convert RecordBatch for table '{tbl}': Unexpected type {type(add_actions)}")
                xx = {}
 
-        # Check if VORDER exists
-        …
+        # Check if VORDER exists - handle both formats:
+        # 1. Flattened format: 'tags.VORDER' or 'tags.vorder' in keys
+        # 2. Nested format: check in 'tags' dict for 'VORDER' or 'vorder'
+        vorder = False
+        if 'tags.VORDER' in xx.keys() or 'tags.vorder' in xx.keys():
+            vorder = True
+        elif 'tags' in xx.keys() and xx['tags']:
+            # Check nested tags dictionary (tags is a list of dicts, one per file)
+            for tag_dict in xx['tags']:
+                if tag_dict and ('VORDER' in tag_dict or 'vorder' in tag_dict):
+                    vorder = True
+                    break
 
        # Calculate total size
        total_size = sum(xx['size_bytes']) if xx['size_bytes'] else 0
 
@@ -185,66 +276,180 @@ def get_stats(duckrun_instance, source: str):
            con.execute(f'''
                CREATE OR REPLACE TEMP TABLE tbl_{idx} AS
                SELECT
+                   '{schema_name}' as schema,
                    '{tbl}' as tbl,
                    'empty' as file_name,
                    0 as num_rows,
                    0 as num_row_groups,
                    0 as size,
                    {vorder} as vorder,
+                   '' as compression,
                    '{timestamp}' as timestamp
                WHERE false
            ''')
        else:
-            # Get parquet metadata and create temp table
-            …
+            # Get parquet metadata and create temp table with compression info
+            if detailed == True:
+                # Detailed mode: Include ALL parquet_metadata columns
+                con.execute(f'''
+                    CREATE OR REPLACE TEMP TABLE tbl_{idx} AS
+                    SELECT
+                        '{schema_name}' as schema,
+                        '{tbl}' as tbl,
+                        {vorder} as vorder,
+                        pm.*,
+                        '{timestamp}' as timestamp
+                    FROM parquet_metadata({delta}) pm
+                ''')
+            else:
+                # Aggregated mode: Original summary statistics
+                con.execute(f'''
+                    CREATE OR REPLACE TEMP TABLE tbl_{idx} AS
+                    SELECT
+                        '{schema_name}' as schema,
+                        '{tbl}' as tbl,
+                        fm.file_name,
+                        fm.num_rows,
+                        fm.num_row_groups,
+                        CEIL({total_size}/(1024*1024)) as size,
+                        {vorder} as vorder,
+                        COALESCE(STRING_AGG(DISTINCT pm.compression, ', ' ORDER BY pm.compression), 'UNCOMPRESSED') as compression,
+                        '{timestamp}' as timestamp
+                    FROM parquet_file_metadata({delta}) fm
+                    LEFT JOIN parquet_metadata({delta}) pm ON fm.file_name = pm.file_name
+                    GROUP BY fm.file_name, fm.num_rows, fm.num_row_groups
+                ''')
 
        except Exception as e:
-            …
+            error_msg = str(e)
+            print(f"Warning: Could not process table '{tbl}' using DeltaTable API: {e}")
+
+            # Fallback: Use DuckDB's delta_scan with filename parameter
+            if "Invalid JSON" in error_msg or "MetadataValue" in error_msg:
+                print(f" Detected JSON parsing issue - falling back to DuckDB delta_scan")
+            else:
+                print(f" Falling back to DuckDB delta_scan")
+
+            try:
+                # First get the list of actual parquet files using delta_scan
+                file_list_result = con.execute(f'''
+                    SELECT DISTINCT filename
+                    FROM delta_scan('{table_path}', filename=1)
+                ''').fetchall()
+
+                if not file_list_result:
+                    # Empty table
+                    con.execute(f'''
+                        CREATE OR REPLACE TEMP TABLE tbl_{idx} AS
+                        SELECT
+                            '{schema_name}' as schema,
+                            '{tbl}' as tbl,
+                            'empty' as file_name,
+                            0 as num_rows,
+                            0 as num_row_groups,
+                            0 as size,
+                            false as vorder,
+                            '' as compression,
+                            '{timestamp}' as timestamp
+                        WHERE false
+                    ''')
+                else:
+                    # Extract just the filename (not the full path) from delta_scan results
+                    # delta_scan returns full ABFSS paths, we need to extract just the filename part
+                    filenames = []
+                    for row in file_list_result:
+                        full_path = row[0]
+                        # Extract just the filename from the full ABFSS path
+                        if '/' in full_path:
+                            filename = full_path.split('/')[-1]
+                        else:
+                            filename = full_path
+                        filenames.append(table_path + "/" + filename)
+
+                    # Use parquet_file_metadata to get actual parquet stats with compression
+                    if detailed == True:
+                        # Detailed mode: Include ALL parquet_metadata columns
+                        con.execute(f'''
+                            CREATE OR REPLACE TEMP TABLE tbl_{idx} AS
+                            SELECT
+                                '{schema_name}' as schema,
+                                '{tbl}' as tbl,
+                                false as vorder,
+                                pm.*,
+                                '{timestamp}' as timestamp
+                            FROM parquet_metadata({filenames}) pm
+                        ''')
+                    else:
+                        # Aggregated mode: Original summary statistics
+                        con.execute(f'''
+                            CREATE OR REPLACE TEMP TABLE tbl_{idx} AS
+                            SELECT
+                                '{schema_name}' as schema,
+                                '{tbl}' as tbl,
+                                fm.file_name,
+                                fm.num_rows,
+                                fm.num_row_groups,
+                                0 as size,
+                                false as vorder,
+                                COALESCE(STRING_AGG(DISTINCT pm.compression, ', ' ORDER BY pm.compression), 'UNCOMPRESSED') as compression,
+                                '{timestamp}' as timestamp
+                            FROM parquet_file_metadata({filenames}) fm
+                            LEFT JOIN parquet_metadata({filenames}) pm ON fm.file_name = pm.file_name
+                            GROUP BY fm.file_name, fm.num_rows, fm.num_row_groups
+                        ''')
+
+                print(f" ✓ Successfully processed '{tbl}' using DuckDB fallback with parquet metadata")
+            except Exception as fallback_error:
+                print(f" ✗ DuckDB fallback also failed for '{tbl}': {fallback_error}")
+                print(f" ⏭️ Skipping table '{tbl}'")
+                continue
+
+        # Mark this table as successfully processed
+        successful_tables.append(idx)
+
+    # Only union tables that were successfully processed
+    if not successful_tables:
+        # No tables were processed successfully - return empty dataframe
+        print("⚠️ No tables could be processed successfully")
+        import pandas as pd
+        if detailed == True:
+            return pd.DataFrame(columns=['schema', 'tbl', 'vorder', 'timestamp'])
+        else:
+            return pd.DataFrame(columns=['schema', 'tbl', 'total_rows', 'num_files', 'num_row_group',
+                                         'average_row_group', 'file_size_MB', 'vorder', 'compression', 'timestamp'])
 
-    # Union all temp tables
-    union_parts = [f'SELECT * FROM tbl_{i}' for i in
+    # Union all successfully processed temp tables
+    union_parts = [f'SELECT * FROM tbl_{i}' for i in successful_tables]
     union_query = ' UNION ALL '.join(union_parts)
 
-    # Generate final summary
-    …
+    # Generate final summary based on detailed flag
+    if detailed == True:
+        # Detailed mode: Return ALL parquet_metadata columns
+        final_result = con.execute(f'''
+            SELECT *
+            FROM ({union_query})
+            WHERE tbl IS NOT NULL
+            ORDER BY schema, tbl, file_name, row_group_id, column_id
+        ''').df()
+    else:
+        # Aggregated mode: Original summary statistics
+        final_result = con.execute(f'''
+            SELECT
+                schema,
+                tbl,
+                SUM(num_rows) as total_rows,
+                COUNT(*) as num_files,
+                SUM(num_row_groups) as num_row_group,
+                CAST(CEIL(SUM(num_rows)::DOUBLE / NULLIF(SUM(num_row_groups), 0)) AS INTEGER) as average_row_group,
+                MIN(size) as file_size_MB,
+                ANY_VALUE(vorder) as vorder,
+                STRING_AGG(DISTINCT compression, ', ' ORDER BY compression) as compression,
+                ANY_VALUE(timestamp) as timestamp
+            FROM ({union_query})
+            WHERE tbl IS NOT NULL
+            GROUP BY schema, tbl
+            ORDER BY total_rows DESC
+        ''').df()
 
     return final_result
 
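For orientation, here is a minimal usage sketch assembled from the examples in the updated get_stats docstring above. The connection path, the 'price_today' table, and the '*.summary' pattern are the illustrative values from that docstring, not values guaranteed to exist in any given lakehouse:

import duckrun

# Connect to a lakehouse/schema (illustrative path from the docstring above)
con = duckrun.connect("tmp/data.lakehouse/test")

# Aggregated, table-level stats for every table in the connection's schema
stats = con.get_stats()

# Row-group level detail (compression, row group sizes, parquet metadata) for one table
stats_detailed = con.get_stats('price_today', detailed=True)

# Wildcard pattern across schemas; requires the multi-schema connection mode
summary_stats = con.get_stats('*.summary')

print(stats.head())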
duckrun/writer.py
CHANGED
@@ -3,6 +3,20 @@ Delta Lake writer functionality for duckrun - Spark-style write API
 """
 from deltalake import DeltaTable, write_deltalake, __version__ as deltalake_version
 
+# Try to import WriterProperties for Rust engine (available in 0.18.2+)
+try:
+    from deltalake.writer import WriterProperties
+    _HAS_WRITER_PROPERTIES = True
+except ImportError:
+    _HAS_WRITER_PROPERTIES = False
+
+# Try to import PyArrow dataset for old PyArrow engine
+try:
+    import pyarrow.dataset as ds
+    _HAS_PYARROW_DATASET = True
+except ImportError:
+    _HAS_PYARROW_DATASET = False
+
 
 # Row Group configuration for optimal Delta Lake performance
 RG = 8_000_000
@@ -23,12 +37,14 @@ def _build_write_deltalake_args(path, df, mode, schema_mode=None, partition_by=N
     - Has max_rows_per_file/max_rows_per_group/min_rows_per_group for optimization
     - When mergeSchema=True: must set schema_mode='merge' + engine='rust', NO row group params
     - When mergeSchema=False: use row group params, DON'T set engine (pyarrow is default)
+    - COMPRESSION: Defaults to ZSTD via writer_properties (rust) or file_options (pyarrow)
 
     deltalake 0.20+:
     - Does NOT have 'engine' parameter (everything is rust, pyarrow deprecated)
     - Does NOT have max_rows_per_file (row group optimization removed)
     - When mergeSchema=True: must set schema_mode='merge'
     - When mergeSchema=False: just write normally (no special params)
+    - COMPRESSION: Defaults to ZSTD via writer_properties (rust only)
 
     Uses version detection for simpler logic.
     """
@@ -50,7 +66,13 @@ def _build_write_deltalake_args(path, df, mode, schema_mode=None, partition_by=N
            # deltalake 0.18.2-0.19.x: must also set engine='rust' for schema merging
            # Do NOT use row group params (they conflict with rust engine)
            args['engine'] = 'rust'
-            …
+            # Set ZSTD compression for Rust engine
+            if _HAS_WRITER_PROPERTIES:
+                args['writer_properties'] = WriterProperties(compression='ZSTD')
+        else:
+            # Version 0.20+: rust is default, just add compression
+            if _HAS_WRITER_PROPERTIES:
+                args['writer_properties'] = WriterProperties(compression='ZSTD')
    else:
        # Normal write mode (no schema merging)
        if _IS_OLD_DELTALAKE:
@@ -59,7 +81,14 @@ def _build_write_deltalake_args(path, df, mode, schema_mode=None, partition_by=N
            args['max_rows_per_file'] = RG
            args['max_rows_per_group'] = RG
            args['min_rows_per_group'] = RG
-            …
+            # Set ZSTD compression for PyArrow engine
+            if _HAS_PYARROW_DATASET:
+                args['file_options'] = ds.ParquetFileFormat().make_write_options(compression='ZSTD')
+        else:
+            # Version 0.20+: no optimization available (rust by default, no row group params supported)
+            # Set ZSTD compression for Rust engine
+            if _HAS_WRITER_PROPERTIES:
+                args['writer_properties'] = WriterProperties(compression='ZSTD')
 
    return args
 
@@ -135,14 +164,14 @@ class DeltaWriter:
        # Prepare info message based on version and settings
        if self._schema_mode == 'merge':
            if _IS_OLD_DELTALAKE:
-                engine_info = " (engine=rust, schema_mode=merge)"
+                engine_info = " (engine=rust, schema_mode=merge, compression=ZSTD)"
            else:
-                engine_info = " (schema_mode=merge, rust by default)"
+                engine_info = " (schema_mode=merge, rust by default, compression=ZSTD)"
        else:
            if _IS_OLD_DELTALAKE:
-                engine_info = " (engine=pyarrow, optimized row groups)"
+                engine_info = " (engine=pyarrow, optimized row groups, compression=ZSTD)"
            else:
-                engine_info = " (engine=rust by default)"
+                engine_info = " (engine=rust by default, compression=ZSTD)"
 
        partition_info = f" partitioned by {self._partition_by}" if self._partition_by else ""
        print(f"Writing to Delta table: {schema}.{table} (mode={self._mode}){engine_info}{partition_info}")
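To illustrate what this change configures, here is a hedged sketch of an equivalent direct write, assuming deltalake 0.18.2 (the version pinned in METADATA below), where ZSTD is requested through WriterProperties on the Rust engine. The local table path and DataFrame are made up for the example; duckrun builds the same kind of arguments internally via _build_write_deltalake_args:

import pandas as pd
from deltalake import write_deltalake
from deltalake.writer import WriterProperties  # same guarded import as in duckrun/writer.py

df = pd.DataFrame({"id": [1, 2, 3], "value": ["a", "b", "c"]})

# Mirrors the ZSTD writer properties duckrun now attaches on deltalake 0.18.2-0.19.x
write_deltalake(
    "/tmp/demo_zstd_table",   # illustrative local path
    df,
    mode="overwrite",
    engine="rust",
    writer_properties=WriterProperties(compression="ZSTD"),
)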
{duckrun-0.2.13.dist-info → duckrun-0.2.19.dev1.dist-info}/METADATA
CHANGED
@@ -1,7 +1,7 @@
 Metadata-Version: 2.4
 Name: duckrun
-Version: 0.2.13
-Summary:
+Version: 0.2.19.dev1
+Summary: Helper library for Fabric Python using duckdb, arrow and delta_rs (orchestration, queries, etc.)
 Author: mim
 License: MIT
 Project-URL: Homepage, https://github.com/djouallah/duckrun
@@ -10,7 +10,7 @@ Project-URL: Issues, https://github.com/djouallah/duckrun/issues
 Requires-Python: >=3.9
 Description-Content-Type: text/markdown
 License-File: LICENSE
-Requires-Dist: duckdb>=1.2.
+Requires-Dist: duckdb>=1.2.0
 Requires-Dist: deltalake<=0.18.2
 Requires-Dist: requests>=2.28.0
 Requires-Dist: obstore>=0.2.0
duckrun-0.2.19.dev1.dist-info/RECORD
ADDED
@@ -0,0 +1,15 @@
+duckrun/__init__.py,sha256=-DPOb_ETaBC0M7YqXj482FE1aZ-SxJeSeY6KB6hPgWU,350
+duckrun/auth.py,sha256=EMaf-L2zeNOjbHOT97xYxfZNfWo4WrwrU1h3vBQTgEc,9624
+duckrun/core.py,sha256=jpg1okp6-Y4HubTJmSjyT9uhUc5pFr4A0tcNxNujSig,69086
+duckrun/files.py,sha256=Fvdjg3DyHJzIVzKo8M_j-eGz4zU61lOB38Y_onbQJkI,10137
+duckrun/lakehouse.py,sha256=j--Z3zo8AOWt1GF9VzRosmmTAy6ey2D0LVubti58twU,14109
+duckrun/notebook.py,sha256=lzDRBoWZ_lePF-_5BbA1_42BImLZC5yrq6nzlmlKglM,12183
+duckrun/runner.py,sha256=NGVyerJA44UP2umRdndfL0fuFM_gdOZmuJUz-PLOFf0,13461
+duckrun/semantic_model.py,sha256=shRPBN1II60K_PH8JOqke-_3hAwLspcx4Add0VJRwwU,35913
+duckrun/stats.py,sha256=8Qc9Mimvv7ALbOHw5-UPWrSflFrGrtkCQkB0QYL8jCw,21923
+duckrun/writer.py,sha256=wIsU77DSj4J7d9_bIhvk6AbC51uUrLW0e6pcSPQOY1c,9424
+duckrun-0.2.19.dev1.dist-info/licenses/LICENSE,sha256=-DeQQwdbCbkB4507ZF3QbocysB-EIjDtaLexvqRkGZc,1083
+duckrun-0.2.19.dev1.dist-info/METADATA,sha256=RvjFSOTabsqOYCk2ApzQ5ichMistEScyLKnrn61ODRs,20807
+duckrun-0.2.19.dev1.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+duckrun-0.2.19.dev1.dist-info/top_level.txt,sha256=BknMEwebbUHrVAp3SC92ps8MPhK7XSYsaogTvi_DmEU,8
+duckrun-0.2.19.dev1.dist-info/RECORD,,
duckrun-0.2.13.dist-info/RECORD
DELETED
@@ -1,14 +0,0 @@
-duckrun/__init__.py,sha256=cTj6KQ6hKmgu1z7k9nhDcO5lct049luxjx1V0QnymCo,235
-duckrun/auth.py,sha256=dMqIzozgEQ5v7Uc3Mb_OoFZGmsAq0m-VOoYCVL7rehc,9281
-duckrun/core.py,sha256=C5nnL-MheBfJPcw-Jr8t14jsm2iwMF07cYm8g_AXtFQ,52303
-duckrun/files.py,sha256=Fvdjg3DyHJzIVzKo8M_j-eGz4zU61lOB38Y_onbQJkI,10137
-duckrun/lakehouse.py,sha256=j--Z3zo8AOWt1GF9VzRosmmTAy6ey2D0LVubti58twU,14109
-duckrun/runner.py,sha256=yrDxfy1RVkb8iK9GKGmIFZHzCvcO_0GVQlbng7Vw_iM,14171
-duckrun/semantic_model.py,sha256=obzlN2-dbEW3JmDop-vrZGGGLi9u3ThhTbgtDjou7uY,29509
-duckrun/stats.py,sha256=oKIjZ7u5cFVT63FuOl5UqoDsOG3098woSCn-uI6i_sQ,11084
-duckrun/writer.py,sha256=svUuPCYOhrz299NgnpTKhARKjfej0PxnoND2iPDSypk,8098
-duckrun-0.2.13.dist-info/licenses/LICENSE,sha256=-DeQQwdbCbkB4507ZF3QbocysB-EIjDtaLexvqRkGZc,1083
-duckrun-0.2.13.dist-info/METADATA,sha256=0r-l8dWnd8KLBGj7cspK53eUdaDeUG-iHsa74rGBaCo,20766
-duckrun-0.2.13.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
-duckrun-0.2.13.dist-info/top_level.txt,sha256=BknMEwebbUHrVAp3SC92ps8MPhK7XSYsaogTvi_DmEU,8
-duckrun-0.2.13.dist-info/RECORD,,
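As a side note on the RECORD listings above: each entry follows the standard wheel convention path,sha256=<digest>,<size-in-bytes>, where the digest is an unpadded URL-safe base64 SHA-256 of the file. A small sketch of how such an entry can be reproduced (the file path is illustrative and assumes you are inside an unpacked wheel):

import base64
import hashlib
import os

def record_entry(path: str) -> str:
    """Build a wheel RECORD line: path,sha256=<urlsafe-b64 digest without padding>,size."""
    with open(path, "rb") as f:
        data = f.read()
    digest = base64.urlsafe_b64encode(hashlib.sha256(data).digest()).rstrip(b"=").decode()
    return f"{path},sha256={digest},{os.path.getsize(path)}"

print(record_entry("duckrun/stats.py"))  # illustrative path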
{duckrun-0.2.13.dist-info → duckrun-0.2.19.dev1.dist-info}/WHEEL
File without changes
{duckrun-0.2.13.dist-info → duckrun-0.2.19.dev1.dist-info}/licenses/LICENSE
File without changes
{duckrun-0.2.13.dist-info → duckrun-0.2.19.dev1.dist-info}/top_level.txt
File without changes