duckrun-0.2.1-py3-none-any.whl → duckrun-0.2.3-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- duckrun/core.py +46 -572
- duckrun/files.py +251 -0
- duckrun/runner.py +287 -0
- duckrun/stats.py +225 -0
- duckrun/writer.py +165 -0
- {duckrun-0.2.1.dist-info → duckrun-0.2.3.dist-info}/METADATA +1 -1
- duckrun-0.2.3.dist-info/RECORD +11 -0
- duckrun-0.2.1.dist-info/RECORD +0 -7
- {duckrun-0.2.1.dist-info → duckrun-0.2.3.dist-info}/WHEEL +0 -0
- {duckrun-0.2.1.dist-info → duckrun-0.2.3.dist-info}/licenses/LICENSE +0 -0
- {duckrun-0.2.1.dist-info → duckrun-0.2.3.dist-info}/top_level.txt +0 -0
duckrun/stats.py
ADDED
@@ -0,0 +1,225 @@
```python
"""
Delta Lake table statistics functionality for duckrun
"""
import duckdb
from deltalake import DeltaTable
from datetime import datetime


def _table_exists(duckrun_instance, schema_name: str, table_name: str) -> bool:
    """Check if a specific table exists by trying to query it directly."""
    try:
        # For main schema, just use the table name directly
        if schema_name == "main":
            query = f"SELECT COUNT(*) FROM {table_name} LIMIT 1"
        else:
            query = f"SELECT COUNT(*) FROM {schema_name}.{table_name} LIMIT 1"
        duckrun_instance.con.execute(query)
        return True
    except:
        return False


def _schema_exists(duckrun_instance, schema_name: str) -> bool:
    """Check if a schema exists by trying to show its tables."""
    try:
        # For main schema, just show tables
        if schema_name == "main":
            query = "SHOW TABLES"
        else:
            query = f"SHOW TABLES FROM {schema_name}"
        duckrun_instance.con.execute(query)
        return True
    except:
        return False


def _get_existing_tables_in_schema(duckrun_instance, schema_name: str) -> list:
    """Get all existing tables in a schema, excluding temporary tables."""
    try:
        # For main schema, just show tables
        if schema_name == "main":
            query = "SHOW TABLES"
        else:
            query = f"SHOW TABLES FROM {schema_name}"
        result = duckrun_instance.con.execute(query).fetchall()
        if result:
            # Filter out temporary tables created by stats processing (tbl_0, tbl_1, etc.)
            tables = [row[0] for row in result]
            filtered_tables = [tbl for tbl in tables if not tbl.startswith('tbl_')]
            return filtered_tables
        return []
    except:
        return []


def get_stats(duckrun_instance, source: str):
    """
    Get comprehensive statistics for Delta Lake tables.

    Args:
        duckrun_instance: The Duckrun connection instance
        source: Can be one of:
            - Table name: 'table_name' (uses main schema in DuckDB)
            - Schema.table: 'schema.table_name' (specific table in schema, if multi-schema)
            - Schema only: 'schema' (all tables in schema, if multi-schema)

    Returns:
        Arrow table with statistics including total rows, file count, row groups,
        average row group size, file sizes, VORDER status, and timestamp

    Examples:
        con = duckrun.connect("tmp/data.lakehouse/test")

        # Single table in main schema (DuckDB uses 'main', not 'test')
        stats = con.get_stats('price_today')

        # Specific table in a different schema (only if multi-schema enabled)
        stats = con.get_stats('aemo.price')

        # All tables in a schema (only if multi-schema enabled)
        stats = con.get_stats('aemo')
    """
    timestamp = datetime.now().strftime('%Y-%m-%d %H:%M:%S')

    # DuckDB always uses 'main' as the default schema, regardless of connection URL schema
    duckdb_schema = "main"
    url_schema = duckrun_instance.schema  # This comes from the connection URL path

    # Parse the source and validate existence
    if '.' in source:
        # Format: schema.table - only valid if multi-schema is enabled
        schema_name, table_name = source.split('.', 1)

        if not duckrun_instance.scan_all_schemas:
            raise ValueError(f"Multi-schema format '{source}' not supported. Connection was made to a specific schema '{url_schema}'. Use just the table name '{table_name}' instead.")

        # Validate that the specific table exists in the actual DuckDB schema
        if not _table_exists(duckrun_instance, schema_name, table_name):
            raise ValueError(f"Table '{table_name}' does not exist in schema '{schema_name}'")

        list_tables = [table_name]
    else:
        # Could be just a table name or a schema name
        if duckrun_instance.scan_all_schemas:
            # Multi-schema mode: DuckDB has actual schemas
            # First check if it's a table in the main schema
            if _table_exists(duckrun_instance, duckdb_schema, source):
                list_tables = [source]
                schema_name = duckdb_schema
            # Otherwise, check if it's a schema name
            elif _schema_exists(duckrun_instance, source):
                schema_name = source
                list_tables = _get_existing_tables_in_schema(duckrun_instance, source)
                if not list_tables:
                    raise ValueError(f"Schema '{source}' exists but contains no tables")
            else:
                raise ValueError(f"Neither table '{source}' in main schema nor schema '{source}' exists")
        else:
            # Single-schema mode: tables live in DuckDB's main schema; use the URL schema for file paths
            if _table_exists(duckrun_instance, duckdb_schema, source):
                # It's a table name
                list_tables = [source]
                schema_name = url_schema  # Use URL schema for file path construction
            elif source == url_schema:
                # Special case: user asked for stats on the URL schema name - list all tables
                list_tables = _get_existing_tables_in_schema(duckrun_instance, duckdb_schema)
                schema_name = url_schema  # Use URL schema for file path construction
                if not list_tables:
                    raise ValueError(f"No tables found in schema '{url_schema}'")
            else:
                raise ValueError(f"Table '{source}' does not exist in the current context (schema: {url_schema})")

    # Use the existing connection
    con = duckrun_instance.con

    print(f"Processing {len(list_tables)} tables: {list_tables}")

    for idx, tbl in enumerate(list_tables):
        # Construct the lakehouse path using an ABFSS URL
        table_path = f"abfss://{duckrun_instance.workspace}@{duckrun_instance.storage_account}.dfs.fabric.microsoft.com/{duckrun_instance.lakehouse_name}.Lakehouse/Tables/{schema_name}/{tbl}"

        try:
            dt = DeltaTable(table_path)
            xx = dt.get_add_actions(flatten=True).to_pydict()

            # Check if VORDER exists
            vorder = 'tags.VORDER' in xx.keys()

            # Calculate total size
            total_size = sum(xx['size_bytes']) if xx['size_bytes'] else 0

            # Get Delta files
            delta_files = dt.files()
            delta = [table_path + "/" + f for f in delta_files]

            # Check if the table has any files
            if not delta:
                # Empty table - create an empty temp table
                con.execute(f'''
                    CREATE OR REPLACE TEMP TABLE tbl_{idx} AS
                    SELECT
                        '{tbl}' as tbl,
                        'empty' as file_name,
                        0 as num_rows,
                        0 as num_row_groups,
                        0 as size,
                        {vorder} as vorder,
                        '{timestamp}' as timestamp
                    WHERE false
                ''')
            else:
                # Get parquet metadata and create a temp table
                con.execute(f'''
                    CREATE OR REPLACE TEMP TABLE tbl_{idx} AS
                    SELECT
                        '{tbl}' as tbl,
                        file_name,
                        num_rows,
                        num_row_groups,
                        CEIL({total_size}/(1024*1024)) as size,
                        {vorder} as vorder,
                        '{timestamp}' as timestamp
                    FROM parquet_file_metadata({delta})
                ''')

        except Exception as e:
            print(f"Warning: Could not process table '{tbl}': {e}")
            # Create an empty temp table for failed tables
            con.execute(f'''
                CREATE OR REPLACE TEMP TABLE tbl_{idx} AS
                SELECT
                    '{tbl}' as tbl,
                    'error' as file_name,
                    0 as num_rows,
                    0 as num_row_groups,
                    0 as size,
                    false as vorder,
                    '{timestamp}' as timestamp
                WHERE false
            ''')

    # Union all temp tables
    union_parts = [f'SELECT * FROM tbl_{i}' for i in range(len(list_tables))]
    union_query = ' UNION ALL '.join(union_parts)

    # Generate the final summary
    final_result = con.execute(f'''
        SELECT
            tbl,
            SUM(num_rows) as total_rows,
            COUNT(*) as num_files,
            SUM(num_row_groups) as num_row_group,
            CAST(CEIL(SUM(num_rows)::DOUBLE / NULLIF(SUM(num_row_groups), 0)) AS INTEGER) as average_row_group,
            MIN(size) as file_size_MB,
            ANY_VALUE(vorder) as vorder,
            ANY_VALUE(timestamp) as timestamp
        FROM ({union_query})
        WHERE tbl IS NOT NULL
        GROUP BY tbl
        ORDER BY total_rows DESC
    ''').fetch_arrow_table()

    return final_result
```
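For context, a minimal usage sketch mirroring the docstring examples above. The connection string and table name are the placeholders from that docstring, and `con.get_stats` assumes the function is wired onto the connection in core.py, which this diff does not show:

```python
import duckrun

# placeholder connection string from the docstring: workspace/lakehouse/schema
con = duckrun.connect("tmp/data.lakehouse/test")

# statistics for a single table in the connected schema
stats = con.get_stats('price_today')

# the result is a PyArrow table; convert it for display if desired
print(stats.to_pandas())
```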
duckrun/writer.py
ADDED
@@ -0,0 +1,165 @@
```python
"""
Delta Lake writer functionality for duckrun - Spark-style write API
"""
from deltalake import DeltaTable, write_deltalake


# Row group configuration for optimal Delta Lake performance
RG = 8_000_000


def _build_write_deltalake_args(path, df, mode, schema_mode=None, partition_by=None):
    """
    Build arguments for write_deltalake based on requirements:
    - If schema_mode='merge': use the rust engine (no row group params)
    - Otherwise: use the pyarrow engine with row group optimization
    """
    args = {
        'table_or_uri': path,
        'data': df,
        'mode': mode
    }

    # Add partition_by if specified
    if partition_by:
        args['partition_by'] = partition_by

    # Engine selection based on schema_mode
    if schema_mode == 'merge':
        # Use the rust engine for schema merging (no row group params supported)
        args['schema_mode'] = 'merge'
        args['engine'] = 'rust'
    else:
        # Use the pyarrow engine with row group optimization (default)
        args['max_rows_per_file'] = RG
        args['max_rows_per_group'] = RG
        args['min_rows_per_group'] = RG

    return args


class DeltaWriter:
    """Spark-style write API for Delta Lake"""

    def __init__(self, relation, duckrun_instance):
        self.relation = relation
        self.duckrun = duckrun_instance
        self._format = "delta"
        self._mode = "overwrite"
        self._schema_mode = None
        self._partition_by = None

    def format(self, format_type: str):
        """Set output format (only 'delta' supported)"""
        if format_type.lower() != "delta":
            raise ValueError(f"Only 'delta' format is supported, got '{format_type}'")
        self._format = "delta"
        return self

    def mode(self, write_mode: str):
        """Set write mode: 'overwrite' or 'append'"""
        if write_mode not in {"overwrite", "append"}:
            raise ValueError(f"Mode must be 'overwrite' or 'append', got '{write_mode}'")
        self._mode = write_mode
        return self

    def option(self, key: str, value):
        """Set a write option (Spark-compatible)"""
        if key == "mergeSchema":
            if str(value).lower() in ("true", "1"):
                self._schema_mode = "merge"
            else:
                self._schema_mode = None
        else:
            raise ValueError(f"Unsupported option: {key}")
        return self

    def partitionBy(self, *columns):
        """Set partition columns (Spark-compatible)"""
        if len(columns) == 1 and isinstance(columns[0], (list, tuple)):
            # Handle the partitionBy(["col1", "col2"]) case
            self._partition_by = list(columns[0])
        else:
            # Handle the partitionBy("col1", "col2") case
            self._partition_by = list(columns)
        return self

    def saveAsTable(self, table_name: str):
        """Save the query result as a Delta table"""
        if self._format != "delta":
            raise RuntimeError(f"Only 'delta' format is supported, got '{self._format}'")

        if "." in table_name:
            schema, table = table_name.split(".", 1)
        else:
            schema = self.duckrun.schema
            table = table_name

        self.duckrun._create_onelake_secret()
        path = f"{self.duckrun.table_base_url}{schema}/{table}"
        df = self.relation.record_batch()

        # Build write arguments based on schema_mode and partition_by
        write_args = _build_write_deltalake_args(
            path, df, self._mode,
            schema_mode=self._schema_mode,
            partition_by=self._partition_by
        )

        engine_info = " (engine=rust, schema_mode=merge)" if self._schema_mode == 'merge' else " (engine=pyarrow)"
        partition_info = f" partitioned by {self._partition_by}" if self._partition_by else ""
        print(f"Writing to Delta table: {schema}.{table} (mode={self._mode}){engine_info}{partition_info}")

        write_deltalake(**write_args)

        # Create a view with appropriate schema qualification:
        # if the user explicitly specified schema.table, create a schema-qualified view;
        # if the user specified just a table name, create the view in the current schema
        if "." in table_name:
            # User explicitly specified schema.table - create a qualified view
            view_name = f"{schema}.{table}"
            # Ensure the schema exists before creating the view
            self.duckrun.con.sql(f"CREATE SCHEMA IF NOT EXISTS {schema}")
        else:
            # User specified just a table name - create the view in the current schema
            view_name = table

        self.duckrun.con.sql(f"DROP VIEW IF EXISTS {view_name}")
        self.duckrun.con.sql(f"""
            CREATE OR REPLACE VIEW {view_name}
            AS SELECT * FROM delta_scan('{path}')
        """)

        dt = DeltaTable(path)

        if self._mode == "overwrite":
            dt.vacuum(retention_hours=0, dry_run=False, enforce_retention_duration=False)
            dt.cleanup_metadata()
            print(f"✅ Table {schema}.{table} created/overwritten")
        else:
            file_count = len(dt.file_uris())
            if file_count > self.duckrun.compaction_threshold:
                print(f"Compacting {schema}.{table} ({file_count} files)")
                dt.optimize.compact()
                dt.vacuum(dry_run=False)
                dt.cleanup_metadata()
            print(f"✅ Data appended to {schema}.{table}")

        return table


class QueryResult:
    """Wrapper for a DuckDB relation with a write API"""

    def __init__(self, relation, duckrun_instance):
        self.relation = relation
        self.duckrun = duckrun_instance

    @property
    def write(self):
        """Access the write API"""
        return DeltaWriter(self.relation, self.duckrun)

    def __getattr__(self, name):
        """Delegate all other methods to the underlying DuckDB relation"""
        return getattr(self.relation, name)
```
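A minimal sketch of the resulting Spark-style chain. It assumes `con.sql(...)` returns the `QueryResult` wrapper (that wiring lives in core.py/runner.py and is not part of this diff); the query and table names are placeholders:

```python
rel = con.sql("SELECT * FROM price_today")

(rel.write
    .format("delta")                  # only 'delta' is accepted
    .mode("append")                   # default is 'overwrite'
    .option("mergeSchema", "true")    # switches to the rust engine with schema_mode='merge'
    .partitionBy("date")              # optional; varargs or a single list both work
    .saveAsTable("aemo.price"))       # schema.table, or a bare name for the current schema
```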
duckrun-0.2.3.dist-info/RECORD
ADDED
@@ -0,0 +1,11 @@
```
duckrun/__init__.py,sha256=L0jRtD9Ld8Ti4e6GRvPDdHvkQCFAPHM43GSP7ARh6EM,241
duckrun/core.py,sha256=LN5rc5B3HLimgslZdC8tLKe3rjTl_KD8WxCh1qoJhdM,16443
duckrun/files.py,sha256=xba0juMEQPgaznDudmXcwaGH0wv-6aCoHmV_cNF6Y7I,10665
duckrun/runner.py,sha256=X5g-57OCHQZ7USKpcBbhYGUcZwLQny2x147DLKrV32c,11417
duckrun/stats.py,sha256=jLEkxNo7MjibPMpjMsXyedrJqv9-BAnP1C0L2a7H8Z8,9417
duckrun/writer.py,sha256=eWrGtDQTbXi8H3sSt2WucYTdEQUjK97KmQxzCbqAuMs,6221
duckrun-0.2.3.dist-info/licenses/LICENSE,sha256=-DeQQwdbCbkB4507ZF3QbocysB-EIjDtaLexvqRkGZc,1083
duckrun-0.2.3.dist-info/METADATA,sha256=CpJvtR9l8c9b1AV9-KnjN4fZODE_3oJxS3omz4p-qlc,18339
duckrun-0.2.3.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
duckrun-0.2.3.dist-info/top_level.txt,sha256=BknMEwebbUHrVAp3SC92ps8MPhK7XSYsaogTvi_DmEU,8
duckrun-0.2.3.dist-info/RECORD,,
```
duckrun-0.2.1.dist-info/RECORD
DELETED
@@ -1,7 +0,0 @@
```
duckrun/__init__.py,sha256=L0jRtD9Ld8Ti4e6GRvPDdHvkQCFAPHM43GSP7ARh6EM,241
duckrun/core.py,sha256=vkEFwDo4PTfaTCR-jnkgRMgK6kozvBxagDp2TfDdLVI,39000
duckrun-0.2.1.dist-info/licenses/LICENSE,sha256=-DeQQwdbCbkB4507ZF3QbocysB-EIjDtaLexvqRkGZc,1083
duckrun-0.2.1.dist-info/METADATA,sha256=QktTzL04kkWtpcTVfD18jxfO-YffEE6mVHpBnUHm1-A,18339
duckrun-0.2.1.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
duckrun-0.2.1.dist-info/top_level.txt,sha256=BknMEwebbUHrVAp3SC92ps8MPhK7XSYsaogTvi_DmEU,8
duckrun-0.2.1.dist-info/RECORD,,
```
{duckrun-0.2.1.dist-info → duckrun-0.2.3.dist-info}/WHEEL
File without changes
{duckrun-0.2.1.dist-info → duckrun-0.2.3.dist-info}/licenses/LICENSE
File without changes
{duckrun-0.2.1.dist-info → duckrun-0.2.3.dist-info}/top_level.txt
File without changes