duckrun-0.2.2-py3-none-any.whl → duckrun-0.2.3-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
duckrun/stats.py ADDED
@@ -0,0 +1,225 @@
+ """
+ Delta Lake table statistics functionality for duckrun
+ """
+ import duckdb
+ from deltalake import DeltaTable
+ from datetime import datetime
+
+
+ def _table_exists(duckrun_instance, schema_name: str, table_name: str) -> bool:
+     """Check if a specific table exists by trying to query it directly."""
+     try:
+         # For main schema, just use table name directly
+         if schema_name == "main":
+             query = f"SELECT COUNT(*) FROM {table_name} LIMIT 1"
+         else:
+             query = f"SELECT COUNT(*) FROM {schema_name}.{table_name} LIMIT 1"
+         duckrun_instance.con.execute(query)
+         return True
+     except:
+         return False
+
+
+ def _schema_exists(duckrun_instance, schema_name: str) -> bool:
+     """Check if a schema exists by trying to show its tables."""
+     try:
+         # For main schema, just show tables
+         if schema_name == "main":
+             query = "SHOW TABLES"
+         else:
+             query = f"SHOW TABLES FROM {schema_name}"
+         duckrun_instance.con.execute(query)
+         return True
+     except:
+         return False
+
+
+ def _get_existing_tables_in_schema(duckrun_instance, schema_name: str) -> list:
+     """Get all existing tables in a schema by showing tables, excluding temporary tables."""
+     try:
+         # For main schema, just show tables
+         if schema_name == "main":
+             query = "SHOW TABLES"
+         else:
+             query = f"SHOW TABLES FROM {schema_name}"
+         result = duckrun_instance.con.execute(query).fetchall()
+         if result:
+             # Filter out temporary tables created by stats processing (tbl_0, tbl_1, etc.)
+             tables = [row[0] for row in result]
+             filtered_tables = [tbl for tbl in tables if not tbl.startswith('tbl_')]
+             return filtered_tables
+         return []
+     except:
+         return []
+
+
+ def get_stats(duckrun_instance, source: str):
+     """
+     Get comprehensive statistics for Delta Lake tables.
+
+     Args:
+         duckrun_instance: The Duckrun connection instance
+         source: Can be one of:
+             - Table name: 'table_name' (uses main schema in DuckDB)
+             - Schema.table: 'schema.table_name' (specific table in schema, if multi-schema)
+             - Schema only: 'schema' (all tables in schema, if multi-schema)
+
+     Returns:
+         Arrow table with statistics including total rows, file count, row groups,
+         average row group size, file sizes, VORDER status, and timestamp
+
+     Examples:
+         con = duckrun.connect("tmp/data.lakehouse/test")
+
+         # Single table in main schema (DuckDB uses 'main', not 'test')
+         stats = con.get_stats('price_today')
+
+         # Specific table in different schema (only if multi-schema enabled)
+         stats = con.get_stats('aemo.price')
+
+         # All tables in a schema (only if multi-schema enabled)
+         stats = con.get_stats('aemo')
+     """
+     timestamp = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
+
+     # DuckDB always uses 'main' as the default schema, regardless of connection URL schema
+     duckdb_schema = "main"
+     url_schema = duckrun_instance.schema  # This is from the connection URL path
+
+     # Parse the source and validate existence
+     if '.' in source:
+         # Format: schema.table - only valid if multi-schema is enabled
+         schema_name, table_name = source.split('.', 1)
+
+         if not duckrun_instance.scan_all_schemas:
+             raise ValueError(f"Multi-schema format '{source}' not supported. Connection was made to a specific schema '{url_schema}'. Use just the table name '{table_name}' instead.")
+
+         # Validate the specific table exists in the actual DuckDB schema
+         if not _table_exists(duckrun_instance, schema_name, table_name):
+             raise ValueError(f"Table '{table_name}' does not exist in schema '{schema_name}'")
+
+         list_tables = [table_name]
+     else:
+         # Could be just table name or schema name
+         if duckrun_instance.scan_all_schemas:
+             # Multi-schema mode: DuckDB has actual schemas
+             # First check if it's a table in main schema
+             if _table_exists(duckrun_instance, duckdb_schema, source):
+                 list_tables = [source]
+                 schema_name = duckdb_schema
+             # Otherwise, check if it's a schema name
+             elif _schema_exists(duckrun_instance, source):
+                 schema_name = source
+                 list_tables = _get_existing_tables_in_schema(duckrun_instance, source)
+                 if not list_tables:
+                     raise ValueError(f"Schema '{source}' exists but contains no tables")
+             else:
+                 raise ValueError(f"Neither table '{source}' in main schema nor schema '{source}' exists")
+         else:
+             # Single-schema mode: tables are in DuckDB's main schema, use URL schema for file paths
+             if _table_exists(duckrun_instance, duckdb_schema, source):
+                 # It's a table name
+                 list_tables = [source]
+                 schema_name = url_schema  # Use URL schema for file path construction
+             elif source == url_schema:
+                 # Special case: user asked for stats on the URL schema name - list all tables
+                 list_tables = _get_existing_tables_in_schema(duckrun_instance, duckdb_schema)
+                 schema_name = url_schema  # Use URL schema for file path construction
+                 if not list_tables:
+                     raise ValueError(f"No tables found in schema '{url_schema}'")
+             else:
+                 raise ValueError(f"Table '{source}' does not exist in the current context (schema: {url_schema})")
+
+     # Use the existing connection
+     con = duckrun_instance.con
+
+     print(f"Processing {len(list_tables)} tables: {list_tables}")
+
+     for idx, tbl in enumerate(list_tables):
+         # Construct lakehouse path using ABFSS URL
+         table_path = f"abfss://{duckrun_instance.workspace}@{duckrun_instance.storage_account}.dfs.fabric.microsoft.com/{duckrun_instance.lakehouse_name}.Lakehouse/Tables/{schema_name}/{tbl}"
+
+         try:
+             dt = DeltaTable(table_path)
+             xx = dt.get_add_actions(flatten=True).to_pydict()
+
+             # Check if VORDER exists
+             vorder = 'tags.VORDER' in xx.keys()
+
+             # Calculate total size
+             total_size = sum(xx['size_bytes']) if xx['size_bytes'] else 0
+
+             # Get Delta files
+             delta_files = dt.files()
+             delta = [table_path + "/" + f for f in delta_files]
+
+             # Check if table has any files
+             if not delta:
+                 # Empty table - create empty temp table
+                 con.execute(f'''
+                     CREATE OR REPLACE TEMP TABLE tbl_{idx} AS
+                     SELECT
+                         '{tbl}' as tbl,
+                         'empty' as file_name,
+                         0 as num_rows,
+                         0 as num_row_groups,
+                         0 as size,
+                         {vorder} as vorder,
+                         '{timestamp}' as timestamp
+                     WHERE false
+                 ''')
+             else:
+                 # Get parquet metadata and create temp table
+                 con.execute(f'''
+                     CREATE OR REPLACE TEMP TABLE tbl_{idx} AS
+                     SELECT
+                         '{tbl}' as tbl,
+                         file_name,
+                         num_rows,
+                         num_row_groups,
+                         CEIL({total_size}/(1024*1024)) as size,
+                         {vorder} as vorder,
+                         '{timestamp}' as timestamp
+                     FROM parquet_file_metadata({delta})
+                 ''')
+
+         except Exception as e:
+             print(f"Warning: Could not process table '{tbl}': {e}")
+             # Create empty temp table for failed tables
+             con.execute(f'''
+                 CREATE OR REPLACE TEMP TABLE tbl_{idx} AS
+                 SELECT
+                     '{tbl}' as tbl,
+                     'error' as file_name,
+                     0 as num_rows,
+                     0 as num_row_groups,
+                     0 as size,
+                     false as vorder,
+                     '{timestamp}' as timestamp
+                 WHERE false
+             ''')
+
+     # Union all temp tables
+     union_parts = [f'SELECT * FROM tbl_{i}' for i in range(len(list_tables))]
+     union_query = ' UNION ALL '.join(union_parts)
+
+     # Generate final summary
+     final_result = con.execute(f'''
+         SELECT
+             tbl,
+             SUM(num_rows) as total_rows,
+             COUNT(*) as num_files,
+             SUM(num_row_groups) as num_row_group,
+             CAST(CEIL(SUM(num_rows)::DOUBLE / NULLIF(SUM(num_row_groups), 0)) AS INTEGER) as average_row_group,
+             MIN(size) as file_size_MB,
+             ANY_VALUE(vorder) as vorder,
+             ANY_VALUE(timestamp) as timestamp
+         FROM ({union_query})
+         WHERE tbl IS NOT NULL
+         GROUP BY tbl
+         ORDER BY total_rows DESC
+     ''').fetch_arrow_table()
+
+     return final_result
+
+
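For orientation, here is a minimal usage sketch of the new stats module, reusing the connection path and table name from the docstring above. It assumes get_stats is exposed as a method on the connection object (that wiring lives in duckrun/core.py, which is not part of this diff) and that pyarrow is available for to_pandas().

    import duckrun

    con = duckrun.connect("tmp/data.lakehouse/test")

    # Summary statistics for a single table in the connection's schema
    stats = con.get_stats('price_today')

    # get_stats returns a pyarrow Table; convert it for easy viewing
    print(stats.to_pandas())
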
duckrun/writer.py ADDED
@@ -0,0 +1,165 @@
+ """
+ Delta Lake writer functionality for duckrun - Spark-style write API
+ """
+ from deltalake import DeltaTable, write_deltalake
+
+
+ # Row Group configuration for optimal Delta Lake performance
+ RG = 8_000_000
+
+
+ def _build_write_deltalake_args(path, df, mode, schema_mode=None, partition_by=None):
+     """
+     Build arguments for write_deltalake based on requirements:
+     - If schema_mode='merge': use rust engine (no row group params)
+     - Otherwise: use pyarrow engine with row group optimization
+     """
+     args = {
+         'table_or_uri': path,
+         'data': df,
+         'mode': mode
+     }
+
+     # Add partition_by if specified
+     if partition_by:
+         args['partition_by'] = partition_by
+
+     # Engine selection based on schema_mode
+     if schema_mode == 'merge':
+         # Use rust engine for schema merging (no row group params supported)
+         args['schema_mode'] = 'merge'
+         args['engine'] = 'rust'
+     else:
+         # Use pyarrow engine with row group optimization (default)
+         args['max_rows_per_file'] = RG
+         args['max_rows_per_group'] = RG
+         args['min_rows_per_group'] = RG
+
+     return args
+
+
+ class DeltaWriter:
+     """Spark-style write API for Delta Lake"""
+
+     def __init__(self, relation, duckrun_instance):
+         self.relation = relation
+         self.duckrun = duckrun_instance
+         self._format = "delta"
+         self._mode = "overwrite"
+         self._schema_mode = None
+         self._partition_by = None
+
+     def format(self, format_type: str):
+         """Set output format (only 'delta' supported)"""
+         if format_type.lower() != "delta":
+             raise ValueError(f"Only 'delta' format is supported, got '{format_type}'")
+         self._format = "delta"
+         return self
+
+     def mode(self, write_mode: str):
+         """Set write mode: 'overwrite' or 'append'"""
+         if write_mode not in {"overwrite", "append"}:
+             raise ValueError(f"Mode must be 'overwrite' or 'append', got '{write_mode}'")
+         self._mode = write_mode
+         return self
+
+     def option(self, key: str, value):
+         """Set write option (Spark-compatible)"""
+         if key == "mergeSchema":
+             if str(value).lower() in ("true", "1"):
+                 self._schema_mode = "merge"
+             else:
+                 self._schema_mode = None
+         else:
+             raise ValueError(f"Unsupported option: {key}")
+         return self
+
+     def partitionBy(self, *columns):
+         """Set partition columns (Spark-compatible)"""
+         if len(columns) == 1 and isinstance(columns[0], (list, tuple)):
+             # Handle partitionBy(["col1", "col2"]) case
+             self._partition_by = list(columns[0])
+         else:
+             # Handle partitionBy("col1", "col2") case
+             self._partition_by = list(columns)
+         return self
+
+     def saveAsTable(self, table_name: str):
+         """Save query result as Delta table"""
+         if self._format != "delta":
+             raise RuntimeError(f"Only 'delta' format is supported, got '{self._format}'")
+
+         if "." in table_name:
+             schema, table = table_name.split(".", 1)
+         else:
+             schema = self.duckrun.schema
+             table = table_name
+
+         self.duckrun._create_onelake_secret()
+         path = f"{self.duckrun.table_base_url}{schema}/{table}"
+         df = self.relation.record_batch()
+
+         # Build write arguments based on schema_mode and partition_by
+         write_args = _build_write_deltalake_args(
+             path, df, self._mode,
+             schema_mode=self._schema_mode,
+             partition_by=self._partition_by
+         )
+
+         engine_info = f" (engine=rust, schema_mode=merge)" if self._schema_mode == 'merge' else " (engine=pyarrow)"
+         partition_info = f" partitioned by {self._partition_by}" if self._partition_by else ""
+         print(f"Writing to Delta table: {schema}.{table} (mode={self._mode}){engine_info}{partition_info}")
+
+         write_deltalake(**write_args)
+
+         # Create view with appropriate schema qualification
+         # If user explicitly specified schema.table, create view with schema qualification
+         # If user just specified table, create view in current schema
+         if "." in table_name:
+             # User explicitly specified schema.table - create qualified view
+             view_name = f"{schema}.{table}"
+             # Ensure the schema exists before creating the view
+             self.duckrun.con.sql(f"CREATE SCHEMA IF NOT EXISTS {schema}")
+         else:
+             # User specified just table name - create view in current schema
+             view_name = table
+
+         self.duckrun.con.sql(f"DROP VIEW IF EXISTS {view_name}")
+         self.duckrun.con.sql(f"""
+             CREATE OR REPLACE VIEW {view_name}
+             AS SELECT * FROM delta_scan('{path}')
+         """)
+
+         dt = DeltaTable(path)
+
+         if self._mode == "overwrite":
+             dt.vacuum(retention_hours=0, dry_run=False, enforce_retention_duration=False)
+             dt.cleanup_metadata()
+             print(f"✅ Table {schema}.{table} created/overwritten")
+         else:
+             file_count = len(dt.file_uris())
+             if file_count > self.duckrun.compaction_threshold:
+                 print(f"Compacting {schema}.{table} ({file_count} files)")
+                 dt.optimize.compact()
+                 dt.vacuum(dry_run=False)
+                 dt.cleanup_metadata()
+             print(f"✅ Data appended to {schema}.{table}")
+
+         return table
+
+
+ class QueryResult:
+     """Wrapper for DuckDB relation with write API"""
+
+     def __init__(self, relation, duckrun_instance):
+         self.relation = relation
+         self.duckrun = duckrun_instance
+
+     @property
+     def write(self):
+         """Access write API"""
+         return DeltaWriter(self.relation, self.duckrun)
+
+     def __getattr__(self, name):
+         """Delegate all other methods to underlying DuckDB relation"""
+         return getattr(self.relation, name)
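For reference, a sketch of the Spark-style chain this writer enables. It assumes the connection's sql() returns a QueryResult wrapping the DuckDB relation (that glue lives in duckrun/core.py, outside this diff); the query, partition column, and target table are hypothetical.

    # Hypothetical query; assumes con.sql() returns a QueryResult
    result = con.sql("SELECT * FROM price_today")

    (result.write
        .format("delta")
        .mode("append")
        .option("mergeSchema", "true")   # routes to the rust engine with schema_mode='merge'
        .partitionBy("date")             # hypothetical partition column
        .saveAsTable("aemo.price"))

Note the mergeSchema option trades away the row-group sizing (RG = 8,000,000) applied on the default pyarrow path, since the rust engine does not accept those parameters.
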
duckrun-0.2.2.dist-info/METADATA → duckrun-0.2.3.dist-info/METADATA RENAMED
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: duckrun
- Version: 0.2.2
+ Version: 0.2.3
  Summary: Lakehouse task runner powered by DuckDB for Microsoft Fabric
  Author: mim
  License: MIT
duckrun-0.2.3.dist-info/RECORD ADDED
@@ -0,0 +1,11 @@
+ duckrun/__init__.py,sha256=L0jRtD9Ld8Ti4e6GRvPDdHvkQCFAPHM43GSP7ARh6EM,241
+ duckrun/core.py,sha256=LN5rc5B3HLimgslZdC8tLKe3rjTl_KD8WxCh1qoJhdM,16443
+ duckrun/files.py,sha256=xba0juMEQPgaznDudmXcwaGH0wv-6aCoHmV_cNF6Y7I,10665
+ duckrun/runner.py,sha256=X5g-57OCHQZ7USKpcBbhYGUcZwLQny2x147DLKrV32c,11417
+ duckrun/stats.py,sha256=jLEkxNo7MjibPMpjMsXyedrJqv9-BAnP1C0L2a7H8Z8,9417
+ duckrun/writer.py,sha256=eWrGtDQTbXi8H3sSt2WucYTdEQUjK97KmQxzCbqAuMs,6221
+ duckrun-0.2.3.dist-info/licenses/LICENSE,sha256=-DeQQwdbCbkB4507ZF3QbocysB-EIjDtaLexvqRkGZc,1083
+ duckrun-0.2.3.dist-info/METADATA,sha256=CpJvtR9l8c9b1AV9-KnjN4fZODE_3oJxS3omz4p-qlc,18339
+ duckrun-0.2.3.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+ duckrun-0.2.3.dist-info/top_level.txt,sha256=BknMEwebbUHrVAp3SC92ps8MPhK7XSYsaogTvi_DmEU,8
+ duckrun-0.2.3.dist-info/RECORD,,
duckrun-0.2.2.dist-info/RECORD DELETED
@@ -1,7 +0,0 @@
- duckrun/__init__.py,sha256=L0jRtD9Ld8Ti4e6GRvPDdHvkQCFAPHM43GSP7ARh6EM,241
- duckrun/core.py,sha256=VqfTL4fFE-XUXXsDy9VRFEPSQ21dfrkCGH_06C9CLNg,39416
- duckrun-0.2.2.dist-info/licenses/LICENSE,sha256=-DeQQwdbCbkB4507ZF3QbocysB-EIjDtaLexvqRkGZc,1083
- duckrun-0.2.2.dist-info/METADATA,sha256=J_Vw7Ps5afPRkofvyo-r7wufizjS431XgXpHdwaKwyo,18339
- duckrun-0.2.2.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
- duckrun-0.2.2.dist-info/top_level.txt,sha256=BknMEwebbUHrVAp3SC92ps8MPhK7XSYsaogTvi_DmEU,8
- duckrun-0.2.2.dist-info/RECORD,,