duckrun 0.2.2-py3-none-any.whl → 0.2.4-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
duckrun/stats.py ADDED
@@ -0,0 +1,231 @@
+ """
+ Delta Lake table statistics functionality for duckrun
+ """
+ import duckdb
+ from deltalake import DeltaTable
+ from datetime import datetime
+
+
+ def _table_exists(duckrun_instance, schema_name: str, table_name: str) -> bool:
+     """Check if a specific table exists by trying to query it directly."""
+     try:
+         # For main schema, just use table name directly
+         if schema_name == "main":
+             query = f"SELECT COUNT(*) FROM {table_name} LIMIT 1"
+         else:
+             query = f"SELECT COUNT(*) FROM {schema_name}.{table_name} LIMIT 1"
+         duckrun_instance.con.execute(query)
+         return True
+     except:
+         return False
+
+
+ def _schema_exists(duckrun_instance, schema_name: str) -> bool:
+     """Check if a schema exists by querying information_schema."""
+     try:
+         # For main schema, always exists
+         if schema_name == "main":
+             return True
+         else:
+             # Use information_schema which works in DuckDB 1.2.2
+             query = f"SELECT 1 FROM information_schema.schemata WHERE schema_name = '{schema_name}' LIMIT 1"
+             result = duckrun_instance.con.execute(query).fetchall()
+             return len(result) > 0
+     except:
+         return False
+
+
+ def _get_existing_tables_in_schema(duckrun_instance, schema_name: str) -> list:
+     """Get all existing tables in a schema using information_schema, excluding temporary tables."""
+     try:
+         # For main schema, use SHOW TABLES
+         if schema_name == "main":
+             query = "SHOW TABLES"
+             result = duckrun_instance.con.execute(query).fetchall()
+             if result:
+                 tables = [row[0] for row in result]
+                 filtered_tables = [tbl for tbl in tables if not tbl.startswith('tbl_')]
+                 return filtered_tables
+         else:
+             # Use information_schema which works in DuckDB 1.2.2
+             query = f"SELECT table_name FROM information_schema.tables WHERE table_schema = '{schema_name}'"
+             result = duckrun_instance.con.execute(query).fetchall()
+             if result:
+                 tables = [row[0] for row in result]
+                 filtered_tables = [tbl for tbl in tables if not tbl.startswith('tbl_')]
+                 return filtered_tables
+         return []
+     except:
+         return []
+
+
+ def get_stats(duckrun_instance, source: str):
+     """
+     Get comprehensive statistics for Delta Lake tables.
+
+     Args:
+         duckrun_instance: The Duckrun connection instance
+         source: Can be one of:
+             - Table name: 'table_name' (uses main schema in DuckDB)
+             - Schema.table: 'schema.table_name' (specific table in schema, if multi-schema)
+             - Schema only: 'schema' (all tables in schema, if multi-schema)
+
+     Returns:
+         Arrow table with statistics including total rows, file count, row groups,
+         average row group size, file sizes, VORDER status, and timestamp
+
+     Examples:
+         con = duckrun.connect("tmp/data.lakehouse/test")
+
+         # Single table in main schema (DuckDB uses 'main', not 'test')
+         stats = con.get_stats('price_today')
+
+         # Specific table in different schema (only if multi-schema enabled)
+         stats = con.get_stats('aemo.price')
+
+         # All tables in a schema (only if multi-schema enabled)
+         stats = con.get_stats('aemo')
+     """
+     timestamp = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
+
+     # DuckDB always uses 'main' as the default schema, regardless of connection URL schema
+     duckdb_schema = "main"
+     url_schema = duckrun_instance.schema  # This is from the connection URL path
+
+     # Parse the source and validate existence
+     if '.' in source:
+         # Format: schema.table - only valid if multi-schema is enabled
+         schema_name, table_name = source.split('.', 1)
+
+         if not duckrun_instance.scan_all_schemas:
+             raise ValueError(f"Multi-schema format '{source}' not supported. Connection was made to a specific schema '{url_schema}'. Use just the table name '{table_name}' instead.")
+
+         # Validate the specific table exists in the actual DuckDB schema
+         if not _table_exists(duckrun_instance, schema_name, table_name):
+             raise ValueError(f"Table '{table_name}' does not exist in schema '{schema_name}'")
+
+         list_tables = [table_name]
+     else:
+         # Could be just table name or schema name
+         if duckrun_instance.scan_all_schemas:
+             # Multi-schema mode: DuckDB has actual schemas
+             # First check if it's a table in main schema
+             if _table_exists(duckrun_instance, duckdb_schema, source):
+                 list_tables = [source]
+                 schema_name = duckdb_schema
+             # Otherwise, check if it's a schema name
+             elif _schema_exists(duckrun_instance, source):
+                 schema_name = source
+                 list_tables = _get_existing_tables_in_schema(duckrun_instance, source)
+                 if not list_tables:
+                     raise ValueError(f"Schema '{source}' exists but contains no tables")
+             else:
+                 raise ValueError(f"Neither table '{source}' in main schema nor schema '{source}' exists")
+         else:
+             # Single-schema mode: tables are in DuckDB's main schema, use URL schema for file paths
+             if _table_exists(duckrun_instance, duckdb_schema, source):
+                 # It's a table name
+                 list_tables = [source]
+                 schema_name = url_schema  # Use URL schema for file path construction
+             elif source == url_schema:
+                 # Special case: user asked for stats on the URL schema name - list all tables
+                 list_tables = _get_existing_tables_in_schema(duckrun_instance, duckdb_schema)
+                 schema_name = url_schema  # Use URL schema for file path construction
+                 if not list_tables:
+                     raise ValueError(f"No tables found in schema '{url_schema}'")
+             else:
+                 raise ValueError(f"Table '{source}' does not exist in the current context (schema: {url_schema})")
+
+     # Use the existing connection
+     con = duckrun_instance.con
+
+     print(f"Processing {len(list_tables)} tables: {list_tables}")
+
+     for idx, tbl in enumerate(list_tables):
+         # Construct lakehouse path using ABFSS URL
+         table_path = f"abfss://{duckrun_instance.workspace}@{duckrun_instance.storage_account}.dfs.fabric.microsoft.com/{duckrun_instance.lakehouse_name}.Lakehouse/Tables/{schema_name}/{tbl}"
+
+         try:
+             dt = DeltaTable(table_path)
+             xx = dt.get_add_actions(flatten=True).to_pydict()
+
+             # Check if VORDER exists
+             vorder = 'tags.VORDER' in xx.keys()
+
+             # Calculate total size
+             total_size = sum(xx['size_bytes']) if xx['size_bytes'] else 0
+
+             # Get Delta files
+             delta_files = dt.files()
+             delta = [table_path + "/" + f for f in delta_files]
+
+             # Check if table has any files
+             if not delta:
+                 # Empty table - create empty temp table
+                 con.execute(f'''
+                     CREATE OR REPLACE TEMP TABLE tbl_{idx} AS
+                     SELECT
+                         '{tbl}' as tbl,
+                         'empty' as file_name,
+                         0 as num_rows,
+                         0 as num_row_groups,
+                         0 as size,
+                         {vorder} as vorder,
+                         '{timestamp}' as timestamp
+                     WHERE false
+                 ''')
+             else:
+                 # Get parquet metadata and create temp table
+                 con.execute(f'''
+                     CREATE OR REPLACE TEMP TABLE tbl_{idx} AS
+                     SELECT
+                         '{tbl}' as tbl,
+                         file_name,
+                         num_rows,
+                         num_row_groups,
+                         CEIL({total_size}/(1024*1024)) as size,
+                         {vorder} as vorder,
+                         '{timestamp}' as timestamp
+                     FROM parquet_file_metadata({delta})
+                 ''')
+
+         except Exception as e:
+             print(f"Warning: Could not process table '{tbl}': {e}")
+             # Create empty temp table for failed tables
+             con.execute(f'''
+                 CREATE OR REPLACE TEMP TABLE tbl_{idx} AS
+                 SELECT
+                     '{tbl}' as tbl,
+                     'error' as file_name,
+                     0 as num_rows,
+                     0 as num_row_groups,
+                     0 as size,
+                     false as vorder,
+                     '{timestamp}' as timestamp
+                 WHERE false
+             ''')
+
+     # Union all temp tables
+     union_parts = [f'SELECT * FROM tbl_{i}' for i in range(len(list_tables))]
+     union_query = ' UNION ALL '.join(union_parts)
+
+     # Generate final summary
+     final_result = con.execute(f'''
+         SELECT
+             tbl,
+             SUM(num_rows) as total_rows,
+             COUNT(*) as num_files,
+             SUM(num_row_groups) as num_row_group,
+             CAST(CEIL(SUM(num_rows)::DOUBLE / NULLIF(SUM(num_row_groups), 0)) AS INTEGER) as average_row_group,
+             MIN(size) as file_size_MB,
+             ANY_VALUE(vorder) as vorder,
+             ANY_VALUE(timestamp) as timestamp
+         FROM ({union_query})
+         WHERE tbl IS NOT NULL
+         GROUP BY tbl
+         ORDER BY total_rows DESC
+     ''').df()
+
+     return final_result
+
+
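For orientation, a minimal usage sketch of the new statistics helper, following the connection example in the docstring above (the connection string and table name are placeholders, not part of the package):

```python
import duckrun

# Hypothetical Fabric lakehouse connection, as in the docstring example above.
con = duckrun.connect("tmp/data.lakehouse/test")

# Summary for a single table in the connected schema. The result has one row
# per table with columns: tbl, total_rows, num_files, num_row_group,
# average_row_group, file_size_MB, vorder, timestamp.
stats = con.get_stats("price_today")
print(stats)
```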
duckrun/writer.py ADDED
@@ -0,0 +1,165 @@
+ """
+ Delta Lake writer functionality for duckrun - Spark-style write API
+ """
+ from deltalake import DeltaTable, write_deltalake
+
+
+ # Row Group configuration for optimal Delta Lake performance
+ RG = 8_000_000
+
+
+ def _build_write_deltalake_args(path, df, mode, schema_mode=None, partition_by=None):
+     """
+     Build arguments for write_deltalake based on requirements:
+     - If schema_mode='merge': use rust engine (no row group params)
+     - Otherwise: use pyarrow engine with row group optimization
+     """
+     args = {
+         'table_or_uri': path,
+         'data': df,
+         'mode': mode
+     }
+
+     # Add partition_by if specified
+     if partition_by:
+         args['partition_by'] = partition_by
+
+     # Engine selection based on schema_mode
+     if schema_mode == 'merge':
+         # Use rust engine for schema merging (no row group params supported)
+         args['schema_mode'] = 'merge'
+         args['engine'] = 'rust'
+     else:
+         # Use pyarrow engine with row group optimization (default)
+         args['max_rows_per_file'] = RG
+         args['max_rows_per_group'] = RG
+         args['min_rows_per_group'] = RG
+
+     return args
+
+
+ class DeltaWriter:
+     """Spark-style write API for Delta Lake"""
+
+     def __init__(self, relation, duckrun_instance):
+         self.relation = relation
+         self.duckrun = duckrun_instance
+         self._format = "delta"
+         self._mode = "overwrite"
+         self._schema_mode = None
+         self._partition_by = None
+
+     def format(self, format_type: str):
+         """Set output format (only 'delta' supported)"""
+         if format_type.lower() != "delta":
+             raise ValueError(f"Only 'delta' format is supported, got '{format_type}'")
+         self._format = "delta"
+         return self
+
+     def mode(self, write_mode: str):
+         """Set write mode: 'overwrite' or 'append'"""
+         if write_mode not in {"overwrite", "append"}:
+             raise ValueError(f"Mode must be 'overwrite' or 'append', got '{write_mode}'")
+         self._mode = write_mode
+         return self
+
+     def option(self, key: str, value):
+         """Set write option (Spark-compatible)"""
+         if key == "mergeSchema":
+             if str(value).lower() in ("true", "1"):
+                 self._schema_mode = "merge"
+             else:
+                 self._schema_mode = None
+         else:
+             raise ValueError(f"Unsupported option: {key}")
+         return self
+
+     def partitionBy(self, *columns):
+         """Set partition columns (Spark-compatible)"""
+         if len(columns) == 1 and isinstance(columns[0], (list, tuple)):
+             # Handle partitionBy(["col1", "col2"]) case
+             self._partition_by = list(columns[0])
+         else:
+             # Handle partitionBy("col1", "col2") case
+             self._partition_by = list(columns)
+         return self
+
+     def saveAsTable(self, table_name: str):
+         """Save query result as Delta table"""
+         if self._format != "delta":
+             raise RuntimeError(f"Only 'delta' format is supported, got '{self._format}'")
+
+         if "." in table_name:
+             schema, table = table_name.split(".", 1)
+         else:
+             schema = self.duckrun.schema
+             table = table_name
+
+         self.duckrun._create_onelake_secret()
+         path = f"{self.duckrun.table_base_url}{schema}/{table}"
+         df = self.relation.record_batch()
+
+         # Build write arguments based on schema_mode and partition_by
+         write_args = _build_write_deltalake_args(
+             path, df, self._mode,
+             schema_mode=self._schema_mode,
+             partition_by=self._partition_by
+         )
+
+         engine_info = f" (engine=rust, schema_mode=merge)" if self._schema_mode == 'merge' else " (engine=pyarrow)"
+         partition_info = f" partitioned by {self._partition_by}" if self._partition_by else ""
+         print(f"Writing to Delta table: {schema}.{table} (mode={self._mode}){engine_info}{partition_info}")
+
+         write_deltalake(**write_args)
+
+         # Create view with appropriate schema qualification
+         # If user explicitly specified schema.table, create view with schema qualification
+         # If user just specified table, create view in current schema
+         if "." in table_name:
+             # User explicitly specified schema.table - create qualified view
+             view_name = f"{schema}.{table}"
+             # Ensure the schema exists before creating the view
+             self.duckrun.con.sql(f"CREATE SCHEMA IF NOT EXISTS {schema}")
+         else:
+             # User specified just table name - create view in current schema
+             view_name = table
+
+         self.duckrun.con.sql(f"DROP VIEW IF EXISTS {view_name}")
+         self.duckrun.con.sql(f"""
+             CREATE OR REPLACE VIEW {view_name}
+             AS SELECT * FROM delta_scan('{path}')
+         """)
+
+         dt = DeltaTable(path)
+
+         if self._mode == "overwrite":
+             dt.vacuum(retention_hours=0, dry_run=False, enforce_retention_duration=False)
+             dt.cleanup_metadata()
+             print(f"✅ Table {schema}.{table} created/overwritten")
+         else:
+             file_count = len(dt.file_uris())
+             if file_count > self.duckrun.compaction_threshold:
+                 print(f"Compacting {schema}.{table} ({file_count} files)")
+                 dt.optimize.compact()
+                 dt.vacuum(dry_run=False)
+                 dt.cleanup_metadata()
+             print(f"✅ Data appended to {schema}.{table}")
+
+         return table
+
+
+ class QueryResult:
+     """Wrapper for DuckDB relation with write API"""
+
+     def __init__(self, relation, duckrun_instance):
+         self.relation = relation
+         self.duckrun = duckrun_instance
+
+     @property
+     def write(self):
+         """Access write API"""
+         return DeltaWriter(self.relation, self.duckrun)
+
+     def __getattr__(self, name):
+         """Delegate all other methods to underlying DuckDB relation"""
+         return getattr(self.relation, name)
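For reference, a minimal sketch of the Spark-style writer in use, assuming duckrun's `sql()` wraps its DuckDB relation in the `QueryResult` class above (table and partition column names are placeholders):

```python
# Query through the duckrun connection, then persist the result as a Delta table.
result = con.sql("SELECT * FROM price_today")

(result.write
    .format("delta")                 # only 'delta' is accepted
    .mode("append")                  # default mode is 'overwrite'
    .option("mergeSchema", "true")   # switches to the rust engine with schema_mode='merge'
    .partitionBy("date")             # hypothetical partition column
    .saveAsTable("aemo.price"))      # 'schema.table', or a bare name for the current schema
```

On append, `saveAsTable` also compacts and vacuums the table once its file count exceeds the connection's `compaction_threshold`.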
duckrun-0.2.2.dist-info/METADATA → duckrun-0.2.4.dist-info/METADATA
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: duckrun
- Version: 0.2.2
+ Version: 0.2.4
  Summary: Lakehouse task runner powered by DuckDB for Microsoft Fabric
  Author: mim
  License: MIT
duckrun-0.2.4.dist-info/RECORD ADDED
@@ -0,0 +1,11 @@
+ duckrun/__init__.py,sha256=L0jRtD9Ld8Ti4e6GRvPDdHvkQCFAPHM43GSP7ARh6EM,241
+ duckrun/core.py,sha256=m_9DuSZNZ5DOETnkjNGn8HJBYheCgs_7NewcbM9VECI,16500
+ duckrun/files.py,sha256=xba0juMEQPgaznDudmXcwaGH0wv-6aCoHmV_cNF6Y7I,10665
+ duckrun/runner.py,sha256=X5g-57OCHQZ7USKpcBbhYGUcZwLQny2x147DLKrV32c,11417
+ duckrun/stats.py,sha256=B9UfGOndRNfcB2AhOVjuSqgfmF2x-uRmdmBn3usx_jQ,9881
+ duckrun/writer.py,sha256=eWrGtDQTbXi8H3sSt2WucYTdEQUjK97KmQxzCbqAuMs,6221
+ duckrun-0.2.4.dist-info/licenses/LICENSE,sha256=-DeQQwdbCbkB4507ZF3QbocysB-EIjDtaLexvqRkGZc,1083
+ duckrun-0.2.4.dist-info/METADATA,sha256=2t7-pNzcPCeseXTjp6Bc18_V41MpjDarG0z-2IzY-Lk,18339
+ duckrun-0.2.4.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+ duckrun-0.2.4.dist-info/top_level.txt,sha256=BknMEwebbUHrVAp3SC92ps8MPhK7XSYsaogTvi_DmEU,8
+ duckrun-0.2.4.dist-info/RECORD,,
@@ -1,7 +0,0 @@
1
- duckrun/__init__.py,sha256=L0jRtD9Ld8Ti4e6GRvPDdHvkQCFAPHM43GSP7ARh6EM,241
2
- duckrun/core.py,sha256=VqfTL4fFE-XUXXsDy9VRFEPSQ21dfrkCGH_06C9CLNg,39416
3
- duckrun-0.2.2.dist-info/licenses/LICENSE,sha256=-DeQQwdbCbkB4507ZF3QbocysB-EIjDtaLexvqRkGZc,1083
4
- duckrun-0.2.2.dist-info/METADATA,sha256=J_Vw7Ps5afPRkofvyo-r7wufizjS431XgXpHdwaKwyo,18339
5
- duckrun-0.2.2.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
6
- duckrun-0.2.2.dist-info/top_level.txt,sha256=BknMEwebbUHrVAp3SC92ps8MPhK7XSYsaogTvi_DmEU,8
7
- duckrun-0.2.2.dist-info/RECORD,,