duckrun 0.2.19.dev8__tar.gz → 0.2.20.dev0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (26) hide show
  1. {duckrun-0.2.19.dev8 → duckrun-0.2.20.dev0}/PKG-INFO +1 -1
  2. {duckrun-0.2.19.dev8 → duckrun-0.2.20.dev0}/duckrun/auth.py +3 -0
  3. {duckrun-0.2.19.dev8 → duckrun-0.2.20.dev0}/duckrun/core.py +66 -1
  4. duckrun-0.2.20.dev0/duckrun/ducklake_metadata.py +571 -0
  5. {duckrun-0.2.19.dev8 → duckrun-0.2.20.dev0}/duckrun/semantic_model.py +134 -13
  6. {duckrun-0.2.19.dev8 → duckrun-0.2.20.dev0}/duckrun.egg-info/PKG-INFO +1 -1
  7. {duckrun-0.2.19.dev8 → duckrun-0.2.20.dev0}/duckrun.egg-info/SOURCES.txt +3 -0
  8. {duckrun-0.2.19.dev8 → duckrun-0.2.20.dev0}/pyproject.toml +1 -1
  9. duckrun-0.2.20.dev0/tests/test_checkpoint_format.py +102 -0
  10. duckrun-0.2.20.dev0/tests/test_ducklake_export.py +7 -0
  11. {duckrun-0.2.19.dev8 → duckrun-0.2.20.dev0}/LICENSE +0 -0
  12. {duckrun-0.2.19.dev8 → duckrun-0.2.20.dev0}/README.md +0 -0
  13. {duckrun-0.2.19.dev8 → duckrun-0.2.20.dev0}/duckrun/__init__.py +0 -0
  14. {duckrun-0.2.19.dev8 → duckrun-0.2.20.dev0}/duckrun/files.py +0 -0
  15. {duckrun-0.2.19.dev8 → duckrun-0.2.20.dev0}/duckrun/lakehouse.py +0 -0
  16. {duckrun-0.2.19.dev8 → duckrun-0.2.20.dev0}/duckrun/notebook.py +0 -0
  17. {duckrun-0.2.19.dev8 → duckrun-0.2.20.dev0}/duckrun/rle.py +0 -0
  18. {duckrun-0.2.19.dev8 → duckrun-0.2.20.dev0}/duckrun/runner.py +0 -0
  19. {duckrun-0.2.19.dev8 → duckrun-0.2.20.dev0}/duckrun/stats.py +0 -0
  20. {duckrun-0.2.19.dev8 → duckrun-0.2.20.dev0}/duckrun/writer.py +0 -0
  21. {duckrun-0.2.19.dev8 → duckrun-0.2.20.dev0}/duckrun.egg-info/dependency_links.txt +0 -0
  22. {duckrun-0.2.19.dev8 → duckrun-0.2.20.dev0}/duckrun.egg-info/requires.txt +0 -0
  23. {duckrun-0.2.19.dev8 → duckrun-0.2.20.dev0}/duckrun.egg-info/top_level.txt +0 -0
  24. {duckrun-0.2.19.dev8 → duckrun-0.2.20.dev0}/setup.cfg +0 -0
  25. {duckrun-0.2.19.dev8 → duckrun-0.2.20.dev0}/tests/test_register.py +0 -0
  26. {duckrun-0.2.19.dev8 → duckrun-0.2.20.dev0}/tests/test_rle.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: duckrun
3
- Version: 0.2.19.dev8
3
+ Version: 0.2.20.dev0
4
4
  Summary: Helper library for Fabric Python using duckdb, arrow and delta_rs (orchestration, queries, etc.)
5
5
  Author: mim
6
6
  License: MIT
@@ -104,6 +104,8 @@ def _get_local_token() -> Optional[str]:
104
104
 
105
105
  except Exception as cli_error:
106
106
  print(f"⚠️ Azure CLI authentication failed: {cli_error}")
107
+ print("💡 TIP: Due to MFA requirements, you now need to login with scope:")
108
+ print(" az login --scope https://storage.azure.com/.default")
107
109
  print("🔐 Falling back to interactive browser authentication...")
108
110
 
109
111
  # Fallback to interactive browser
@@ -119,6 +121,7 @@ def _get_local_token() -> Optional[str]:
119
121
 
120
122
  except Exception as browser_error:
121
123
  print(f"❌ Interactive browser authentication failed: {browser_error}")
124
+ print("💡 Please run: az login --scope https://storage.azure.com/.default")
122
125
  return None
123
126
 
124
127
 
@@ -1249,9 +1249,11 @@ class Duckrun(WorkspaceOperationsMixin):
1249
1249
  dataset_name = self.schema # Use schema name
1250
1250
 
1251
1251
  # Call the deployment function (DirectLake only)
1252
+ # Use lakehouse_id (with .ItemType suffix) instead of lakehouse_name (without suffix)
1253
+ # This ensures proper item resolution for non-lakehouse items like .SnowflakeDatabase
1252
1254
  return deploy_semantic_model(
1253
1255
  workspace_name_or_id=self.workspace,
1254
- lakehouse_name_or_id=self.lakehouse_name,
1256
+ lakehouse_name_or_id=self.lakehouse_id,
1255
1257
  schema_name=self.schema,
1256
1258
  dataset_name=dataset_name,
1257
1259
  bim_url_or_path=bim_url,
@@ -1259,6 +1261,69 @@ class Duckrun(WorkspaceOperationsMixin):
1259
1261
  refresh=refresh
1260
1262
  )
1261
1263
 
1264
+ def export_ducklake_to_delta(self, db_path: str, data_root: str = None) -> bool:
1265
+ """
1266
+ Export DuckLake metadata to Delta Lake format for Spark compatibility.
1267
+
1268
+ Reads a DuckLake database file from the Files section and generates Delta Lake
1269
+ checkpoint files and JSON logs for all tables, making them readable by Spark
1270
+ and other Delta Lake tools.
1271
+
1272
+ Args:
1273
+ db_path: Relative path to DuckLake DB file in Files section (e.g., "db/test/test.db")
1274
+ data_root: Optional base path for lakehouse data. If None, reads from DuckLake metadata.
1275
+
1276
+ Returns:
1277
+ True if export succeeded, False otherwise
1278
+
1279
+ Examples:
1280
+ con = duckrun.connect("workspace/lakehouse.lakehouse/dbo")
1281
+
1282
+ # Export DuckLake tables to Delta format
1283
+ con.export_ducklake_to_delta("meta.db")
1284
+
1285
+ # With explicit data root
1286
+ con.export_ducklake_to_delta("db/ducklake.db", data_root="abfss://...")
1287
+ """
1288
+ from .ducklake_metadata import generate_latest_delta_log
1289
+ import obstore as obs
1290
+ from obstore.store import AzureStore
1291
+
1292
+ # Construct full ABFSS path to DB file in Files section
1293
+ full_db_path = f"{self.files_base_url}{db_path}"
1294
+
1295
+ print(f"🔍 Exporting DuckLake metadata from: {db_path}")
1296
+ print(f"📂 Full DB path: {full_db_path}")
1297
+
1298
+ # Get Azure token
1299
+ from .auth import get_token
1300
+ token = self._get_storage_token()
1301
+ if token == "PLACEHOLDER_TOKEN_TOKEN_NOT_AVAILABLE":
1302
+ print("Authenticating with Azure for DuckLake export...")
1303
+ token = get_token()
1304
+ if not token:
1305
+ print("❌ Failed to authenticate for DuckLake export")
1306
+ return False
1307
+
1308
+ # Setup OneLake store for uploading checkpoint files
1309
+ # Use table_base_url as the base since we'll be writing to Tables section
1310
+ store = AzureStore.from_url(self.table_base_url, bearer_token=token)
1311
+
1312
+ # If data_root not provided, use table_base_url (which includes /Tables/)
1313
+ # This will be used to construct full paths for checkpoint files
1314
+ if data_root is None:
1315
+ data_root = self.table_base_url.rstrip('/')
1316
+
1317
+ try:
1318
+ generate_latest_delta_log(full_db_path, data_root, store, token)
1319
+ print(f"✅ DuckLake export completed successfully")
1320
+ return True
1321
+ except Exception as e:
1322
+ print(f"❌ DuckLake export failed: {e}")
1323
+ import traceback
1324
+ traceback.print_exc()
1325
+ return False
1326
+
1262
1327
  def rle(self, table_name: str = None, mode = "natural",
1263
1328
  min_distinct_threshold: int = 2, max_cardinality_pct: float = 0.01,
1264
1329
  max_ordering_depth: int = 3, limit: int = None):
@@ -0,0 +1,571 @@
1
+ # File: ducklake_metadata.py
2
+ import json
3
+ import time
4
+ import duckdb
5
+ import os
6
+ import tempfile
7
+ import shutil
8
+
9
def map_type_ducklake_to_spark(t):
    """Map a DuckDB/DuckLake type name to the Spark SQL type used in the Delta schema.

    Args:
        t: Type name as stored in the DuckLake catalog (case-insensitive),
            e.g. ``"int64"``, ``"decimal(18,4)"``, ``"varchar"``.

    Returns:
        The Spark type string. Anything that is not recognised falls back
        to ``'string'``.
    """
    import re

    t = t.strip().lower()

    # Decimals: keep the declared precision/scale instead of collapsing every
    # decimal to decimal(10,0), which would misdescribe the parquet data.
    if 'decimal' in t:
        params = re.search(r'\(\s*(\d+)\s*(?:,\s*(\d+)\s*)?\)', t)
        if params:
            precision = int(params.group(1))
            scale = int(params.group(2) or 0)
            return f'decimal({precision},{scale})'
        return 'decimal(10,0)'

    # Integers: match real integer type names only. A plain substring test on
    # 'int' also catches unrelated names such as 'interval' or 'point'.
    if re.fullmatch(r'u?(?:tiny|small|big|huge)?int(?:eger)?\d*', t):
        return 'long' if ('64' in t or 'bigint' in t) else 'integer'

    if 'float' in t or 'double' in t:
        return 'double'
    if 'bool' in t:
        return 'boolean'
    if 'timestamp' in t:
        return 'timestamp'
    if 'date' in t:
        return 'date'
    return 'string'
27
+
28
def convert_stat_value_to_json(value_str, column_type):
    """
    Convert a DuckLake stat string value to a proper JSON type for Delta Lake.

    Args:
        value_str: String representation of the value from DuckLake (may be None)
        column_type: DuckDB column type (case-insensitive)

    Returns:
        Properly typed value for JSON serialization:
        - None for a None input
        - ISO-8601 style string for timestamps
        - unchanged string for dates and non-numeric types
        - bool for boolean columns
        - int/float for numeric columns
        If conversion fails the input value is returned unchanged.
    """
    if value_str is None:
        return None

    column_type = column_type.lower()

    try:
        # Timestamp: convert "YYYY-MM-DD HH:MM:SS[.ffffff]" to ISO 8601 in UTC.
        if 'timestamp' in column_type:
            if 'T' not in value_str:
                value_str = value_str.replace(' ', 'T')
            if not value_str.endswith('Z'):
                time_part = value_str.partition('T')[2]
                # A value that already carries an explicit UTC offset is left
                # as-is; appending a suffix would corrupt it.
                if '+' in time_part or '-' in time_part:
                    return value_str
                # Only add ".000" when there is no fractional part yet,
                # otherwise "…00.123456" would become "…00.123456.000Z".
                value_str += 'Z' if '.' in time_part else '.000Z'
            return value_str

        # Date: Keep as YYYY-MM-DD string
        elif 'date' in column_type:
            return value_str

        # Boolean: Convert to JSON boolean
        elif 'bool' in column_type:
            return value_str.lower() in ('true', 't', '1', 'yes')

        # Numeric types: Convert to number (not string)
        elif any(t in column_type for t in ['int', 'float', 'double', 'decimal', 'numeric']):
            # A decimal point or exponent means float, otherwise int
            if '.' in value_str or 'e' in value_str.lower():
                return float(value_str)
            else:
                return int(value_str)

        # String and others: Keep as string
        else:
            return value_str

    except (ValueError, AttributeError, TypeError):
        # If conversion fails (including non-string input), return the value as given
        return value_str
78
+
79
def create_spark_schema_string(fields):
    """Serialize *fields* into the JSON struct-schema string Delta expects.

    Args:
        fields: List of field descriptors (dicts) to place under ``"fields"``.

    Returns:
        JSON text of ``{"type": "struct", "fields": fields}``.
    """
    struct_schema = {"type": "struct", "fields": fields}
    return json.dumps(struct_schema)
82
+
83
def get_latest_ducklake_snapshot(con, table_id):
    """
    Return the highest ``begin_snapshot`` recorded in ``ducklake_data_file``
    for the given table.

    Args:
        con: DuckDB connection with the DuckLake catalog as current database
        table_id: DuckLake table identifier (interpolated into the SQL text)

    Returns:
        The maximum ``begin_snapshot`` value, or None when the table has no
        data-file rows.
    """
    query = f""" SELECT MAX(begin_snapshot) as latest_snapshot FROM ducklake_data_file WHERE table_id = {table_id} """
    row = con.execute(query).fetchone()
    return row[0]
89
+
90
def get_latest_delta_checkpoint(con, table_id):
    """
    Count how many snapshots have modified a table.

    Counts the rows of ``ducklake_snapshot_changes`` whose ``changes_made``
    text references ``:{table_id}`` either at the end of the string or
    followed by a comma.

    Args:
        con: DuckDB connection with the DuckLake catalog as current database
        table_id: DuckLake table identifier (interpolated into the SQL text)

    Returns:
        Number of matching snapshot-change rows.
    """
    query = f""" SELECT count(snapshot_id) FROM ducklake_snapshot_changes
    where changes_made like '%:{table_id}' or changes_made like '%:{table_id},%' """
    row = con.execute(query).fetchone()
    return row[0]
97
+
98
def get_file_modification_time(dummy_time):
    """
    Provide a stand-in modification time for a parquet file.

    The supplied value is handed back unchanged, so no file metadata is
    read and no I/O latency is incurred.

    Args:
        dummy_time: Timestamp in milliseconds to use as modification time

    Returns:
        Modification time in milliseconds (the value passed in)
    """
    modification_time = dummy_time
    return modification_time
110
+
111
def create_dummy_json_log(local_table_root, delta_version, table_info, schema_fields, now, latest_snapshot,
                          num_files, total_rows=None, total_bytes=None):
    """
    Write a minimal Delta Lake transaction log file on the local filesystem.

    The file is ``<local_table_root>/_delta_log/<version, zero-padded to 20>.json``
    and holds three newline-delimited JSON actions, in this order:
    commitInfo, metaData, protocol. No ``add`` actions are written here; the
    file entries live in the checkpoint parquet produced elsewhere.

    Args:
        local_table_root: Local directory acting as the table root
        delta_version: Integer Delta version used for the file name
        table_info: Dict providing at least ``'table_name'``
        schema_fields: Field descriptors for the schema string
        now: Timestamp (milliseconds) used for commit and creation time
        latest_snapshot: DuckLake snapshot id recorded in operationParameters
        num_files: Number of data files, reported in operationMetrics
        total_rows: Optional row total; reported as "0" when falsy
        total_bytes: Optional byte total; reported as "0" when falsy

    Returns:
        Path of the JSON log file that was written.
    """
    import uuid

    log_dir = os.path.join(local_table_root, '_delta_log')
    log_path = os.path.join(log_dir, f"{delta_version:020d}.json")

    # The log directory may not exist yet
    os.makedirs(log_dir, exist_ok=True)

    # Action 1: commitInfo
    commit_info = {
        "timestamp": now,
        "operation": "CONVERT",
        "operationParameters": {
            "convertedFrom": "DuckLake",
            "duckLakeSnapshotId": str(latest_snapshot),
            "partitionBy": "[]"
        },
        "isolationLevel": "Serializable",
        "isBlindAppend": False,
        "operationMetrics": {
            "numFiles": str(num_files),
            "numOutputRows": str(total_rows or 0),
            "numOutputBytes": str(total_bytes or 0)
        },
        "engineInfo": "DuckLake-Delta-Exporter/1.0.0",
        "txnId": str(uuid.uuid4())
    }

    # Action 2: metaData (a fresh UUID is generated for the id on every call)
    meta_data = {
        "id": str(uuid.uuid4()),
        "name": table_info['table_name'],
        "description": None,
        "format": {
            "provider": "parquet",
            "options": {}
        },
        "schemaString": create_spark_schema_string(schema_fields),
        "partitionColumns": [],
        "createdTime": now,
        "configuration": {}
    }

    # Action 3: protocol
    protocol = {
        "minReaderVersion": 1,
        "minWriterVersion": 2
    }

    actions = (
        {"commitInfo": commit_info},
        {"metaData": meta_data},
        {"protocol": protocol},
    )

    # Newline-delimited JSON, one action per line
    with open(log_path, 'w') as f:
        f.writelines(json.dumps(action) + '\n' for action in actions)

    return log_path
183
+
184
def build_file_path(table_root, relative_path):
    """
    Join a table root and a relative path with exactly one ``/`` between them.

    Trailing slashes on the root and leading slashes on the relative path
    are removed before joining, so the helper is usable for local paths
    and URL-style roots alike.
    """
    root = table_root.rstrip('/')
    tail = relative_path.lstrip('/')
    return '/'.join((root, tail))
192
+
193
def create_checkpoint_for_latest_snapshot(con, table_info, data_root, temp_dir, store=None, token=None):
    """
    Create a Delta checkpoint file for the latest DuckLake snapshot.

    Writes three files under ``<temp_dir>/<schema_path>/<table_path>/_delta_log``:
    the checkpoint parquet, a minimal JSON commit and ``_last_checkpoint``.
    When ``store`` is given, the files are then uploaded with obstore.

    Args:
        con: DuckDB connection to DuckLake database
        table_info: Dictionary with table metadata (keys used here: table_id,
            table_name, schema_name, table_path, schema_path)
        data_root: Root path for data (used for constructing remote paths)
        temp_dir: Temporary directory for writing local files
        store: obstore AzureStore instance for uploading files (None for local mode)
        token: Azure auth token (None for local mode); not referenced in this body

    Returns:
        ``False`` when the table has no snapshot, when the latest snapshot was
        already exported, or when the upload fails; otherwise the tuple
        ``(True, delta_version, latest_snapshot)``. Callers must therefore
        compare against ``False`` explicitly rather than rely on truthiness.
    """
    # Construct table path (relative to data_root)
    # Clean up paths to avoid double slashes
    schema_path = table_info['schema_path'].strip('/')
    table_path = table_info['table_path'].strip('/')
    table_relative_path = f"{schema_path}/{table_path}" if schema_path else table_path

    # Local temporary directory for this table
    local_table_root = os.path.join(temp_dir, table_relative_path.replace('/', os.sep))

    # Remote path (for ABFSS upload) - always use forward slashes
    remote_table_root = f"{data_root.rstrip('/')}/{table_relative_path}"

    # Get the latest snapshot
    latest_snapshot = get_latest_ducklake_snapshot(con, table_info['table_id'])
    if latest_snapshot is None:
        print(f"⚠️ {table_info['schema_name']}.{table_info['table_name']}: No snapshots found")
        return False

    # Use snapshot ID as the delta version
    delta_version = latest_snapshot

    # Local checkpoint files (in temp directory)
    local_delta_log_dir = os.path.join(local_table_root, '_delta_log')
    local_checkpoint_file = os.path.join(local_delta_log_dir, f"{delta_version:020d}.checkpoint.parquet")
    local_json_log_file = os.path.join(local_delta_log_dir, f"{delta_version:020d}.json")
    local_last_checkpoint_file = os.path.join(local_delta_log_dir, "_last_checkpoint")

    # Remote paths (for ABFSS upload) - always use forward slashes
    remote_checkpoint_file = remote_table_root + f"/_delta_log/{delta_version:020d}.checkpoint.parquet"
    remote_json_log_file = remote_table_root + f"/_delta_log/{delta_version:020d}.json"
    remote_last_checkpoint_file = remote_table_root + "/_delta_log/_last_checkpoint"

    # Check if checkpoint already exists (if store is provided)
    if store:
        # Read _last_checkpoint to get the current version
        try:
            last_checkpoint_result = con.execute(f"""
                SELECT version
                FROM read_json_auto('{remote_last_checkpoint_file}')
                LIMIT 1
            """).fetchone()

            if last_checkpoint_result:
                current_version = last_checkpoint_result[0]
                current_json_file = remote_table_root + f"/_delta_log/{current_version:020d}.json"

                # Read the current version's JSON to check snapshot ID
                result = con.execute(f"""
                    SELECT
                        commitInfo.operationParameters.duckLakeSnapshotId as snapshot_id
                    FROM read_json_auto('{current_json_file}', format='newline_delimited')
                    WHERE commitInfo IS NOT NULL
                    LIMIT 1
                """).fetchone()

                if result and result[0]:
                    last_snapshot = result[0]
                    # The snapshot id is stored as a string in the JSON log,
                    # hence the str() on the comparison side.
                    if last_snapshot == str(latest_snapshot):
                        print(f"⚠️ {table_info['schema_name']}.{table_info['table_name']}: Snapshot {latest_snapshot} already exported (version {current_version})")
                        return False
                    else:
                        print(f"📊 {table_info['schema_name']}.{table_info['table_name']}: New snapshot detected (was {last_snapshot}, now {latest_snapshot})")
        except Exception:
            # _last_checkpoint doesn't exist, this is first export
            # NOTE(review): this also silences every other failure of the two
            # queries above (auth, malformed JSON, ...), in which case the
            # export simply proceeds as if nothing had been exported before.
            pass

    # Single timestamp (ms) reused for createdTime, modificationTime and commit time
    now = int(time.time() * 1000)

    # Get all files with their stats for the latest snapshot
    # One row per (file, column); files without stats yield one row with NULL columns.
    file_stats_query = f"""
        SELECT
            df.data_file_id,
            df.path,
            df.file_size_bytes,
            c.column_name,
            c.column_type,
            fcs.value_count,
            fcs.null_count,
            fcs.min_value,
            fcs.max_value
        FROM ducklake_data_file df
        LEFT JOIN ducklake_file_column_stats fcs ON df.data_file_id = fcs.data_file_id
        LEFT JOIN ducklake_column c ON fcs.column_id = c.column_id
        WHERE df.table_id = {table_info['table_id']}
        AND df.begin_snapshot <= {latest_snapshot}
        AND (df.end_snapshot IS NULL OR df.end_snapshot > {latest_snapshot})
        AND (c.begin_snapshot IS NULL OR c.begin_snapshot <= {latest_snapshot})
        AND (c.end_snapshot IS NULL OR c.end_snapshot > {latest_snapshot})
        ORDER BY df.data_file_id, c.column_order
    """

    file_stats_rows = con.execute(file_stats_query).fetchall()

    # Group stats by file
    from collections import defaultdict
    files_dict = defaultdict(lambda: {
        'path': None,
        'size': 0,
        'num_records': 0,
        'min_values': {},
        'max_values': {},
        'null_count': {}
    })

    for row in file_stats_rows:
        file_id, path, size, col_name, col_type, value_count, null_count, min_val, max_val = row

        file_data = files_dict[file_id]
        file_data['path'] = path
        file_data['size'] = size

        # Set num_records from first column's value_count (all columns have same count)
        if file_data['num_records'] == 0 and value_count is not None:
            file_data['num_records'] = value_count

        # Only add column stats if column name exists (handle LEFT JOIN nulls)
        if col_name is not None:
            # Convert and add min/max values with proper typing
            if min_val is not None:
                file_data['min_values'][col_name] = convert_stat_value_to_json(min_val, col_type)
            if max_val is not None:
                file_data['max_values'][col_name] = convert_stat_value_to_json(max_val, col_type)
            if null_count is not None:
                file_data['null_count'][col_name] = null_count

    # Convert to list format for processing
    file_rows = [(f['path'], f['size'], f['num_records'], f['min_values'], f['max_values'], f['null_count'])
                 for f in files_dict.values()]

    # Calculate aggregate metrics for commitInfo
    total_files = len(file_rows)
    total_rows = sum(f[2] for f in file_rows)  # num_records
    total_bytes = sum(f[1] for f in file_rows)  # size

    # Get schema for the latest snapshot
    columns = con.execute(f"""
        SELECT column_name, column_type FROM ducklake_column
        WHERE table_id = {table_info['table_id']}
        AND begin_snapshot <= {latest_snapshot}
        AND (end_snapshot IS NULL OR end_snapshot > {latest_snapshot})
        ORDER BY column_order
    """).fetchall()

    # Get or generate table metadata ID
    # (here: the DuckLake table_id rendered as a string)
    table_meta_id = str(table_info['table_id'])

    # Prepare schema
    # Every column is declared nullable.
    schema_fields = [
        {"name": name, "type": map_type_ducklake_to_spark(typ), "nullable": True, "metadata": {}}
        for name, typ in columns
    ]

    # Create checkpoint data using DuckDB directly
    # NOTE(review): this list is never read afterwards in this function.
    checkpoint_data = []

    # Create checkpoint data directly in DuckDB using proper data types
    # NOTE: the statements below go through the module-level duckdb.execute,
    # not through `con`, so checkpoint_table lives outside the attached
    # DuckLake database connection.
    duckdb.execute("DROP TABLE IF EXISTS checkpoint_table")

    # Create the checkpoint table with proper nested structure
    # Seeds the table with the protocol row and the metaData row; the four
    # `?` placeholders are id, name, schemaString and createdTime.
    duckdb.execute("""
        CREATE TABLE checkpoint_table AS
        WITH checkpoint_data AS (
            -- Protocol record
            SELECT
                {'minReaderVersion': 1, 'minWriterVersion': 2}::STRUCT(minReaderVersion INTEGER, minWriterVersion INTEGER) AS protocol,
                NULL::STRUCT(id VARCHAR, name VARCHAR, description VARCHAR, format STRUCT(provider VARCHAR, options MAP(VARCHAR, VARCHAR)), schemaString VARCHAR, partitionColumns VARCHAR[], createdTime BIGINT, configuration MAP(VARCHAR, VARCHAR)) AS metaData,
                NULL::STRUCT(path VARCHAR, partitionValues MAP(VARCHAR, VARCHAR), size BIGINT, modificationTime BIGINT, dataChange BOOLEAN, stats VARCHAR, tags MAP(VARCHAR, VARCHAR)) AS add,
                NULL::STRUCT(path VARCHAR, deletionTimestamp BIGINT, dataChange BOOLEAN) AS remove,
                NULL::STRUCT(timestamp TIMESTAMP, operation VARCHAR, operationParameters MAP(VARCHAR, VARCHAR), isBlindAppend BOOLEAN, engineInfo VARCHAR, clientVersion VARCHAR) AS commitInfo

            UNION ALL

            -- Metadata record
            SELECT
                NULL::STRUCT(minReaderVersion INTEGER, minWriterVersion INTEGER) AS protocol,
                {
                    'id': ?,
                    'name': ?,
                    'description': NULL,
                    'format': {'provider': 'parquet', 'options': MAP{}}::STRUCT(provider VARCHAR, options MAP(VARCHAR, VARCHAR)),
                    'schemaString': ?,
                    'partitionColumns': []::VARCHAR[],
                    'createdTime': ?,
                    'configuration': MAP{'delta.logRetentionDuration': 'interval 1 hour'}
                }::STRUCT(id VARCHAR, name VARCHAR, description VARCHAR, format STRUCT(provider VARCHAR, options MAP(VARCHAR, VARCHAR)), schemaString VARCHAR, partitionColumns VARCHAR[], createdTime BIGINT, configuration MAP(VARCHAR, VARCHAR)) AS metaData,
                NULL::STRUCT(path VARCHAR, partitionValues MAP(VARCHAR, VARCHAR), size BIGINT, modificationTime BIGINT, dataChange BOOLEAN, stats VARCHAR, tags MAP(VARCHAR, VARCHAR)) AS add,
                NULL::STRUCT(path VARCHAR, deletionTimestamp BIGINT, dataChange BOOLEAN) AS remove,
                NULL::STRUCT(timestamp TIMESTAMP, operation VARCHAR, operationParameters MAP(VARCHAR, VARCHAR), isBlindAppend BOOLEAN, engineInfo VARCHAR, clientVersion VARCHAR) AS commitInfo
        )
        SELECT * FROM checkpoint_data
    """, [table_meta_id, table_info['table_name'], create_spark_schema_string(schema_fields), now])

    # Add file records with real statistics
    # One INSERT per data file.
    for path, size, num_records, min_values, max_values, null_count in file_rows:
        rel_path = path.lstrip('/')
        # NOTE(review): full_path is computed but not used; the add record
        # below stores rel_path.
        full_path = build_file_path(remote_table_root, rel_path)
        mod_time = get_file_modification_time(now)

        # Build stats JSON with real values from DuckLake metadata
        stats_json = json.dumps({
            "numRecords": num_records,
            "minValues": min_values,
            "maxValues": max_values,
            "nullCount": null_count
        })

        duckdb.execute("""
            INSERT INTO checkpoint_table
            SELECT
                NULL::STRUCT(minReaderVersion INTEGER, minWriterVersion INTEGER) AS protocol,
                NULL::STRUCT(id VARCHAR, name VARCHAR, description VARCHAR, format STRUCT(provider VARCHAR, options MAP(VARCHAR, VARCHAR)), schemaString VARCHAR, partitionColumns VARCHAR[], createdTime BIGINT, configuration MAP(VARCHAR, VARCHAR)) AS metaData,
                {
                    'path': ?,
                    'partitionValues': MAP{}::MAP(VARCHAR, VARCHAR),
                    'size': ?,
                    'modificationTime': ?,
                    'dataChange': true,
                    'stats': ?,
                    'tags': MAP{}::MAP(VARCHAR, VARCHAR)
                }::STRUCT(path VARCHAR, partitionValues MAP(VARCHAR, VARCHAR), size BIGINT, modificationTime BIGINT, dataChange BOOLEAN, stats VARCHAR, tags MAP(VARCHAR, VARCHAR)) AS add,
                NULL::STRUCT(path VARCHAR, deletionTimestamp BIGINT, dataChange BOOLEAN) AS remove,
                NULL::STRUCT(timestamp TIMESTAMP, operation VARCHAR, operationParameters MAP(VARCHAR, VARCHAR), isBlindAppend BOOLEAN, engineInfo VARCHAR, clientVersion VARCHAR) AS commitInfo
        """, [rel_path, size, mod_time, stats_json])

    # Create the _delta_log directory
    os.makedirs(local_delta_log_dir, exist_ok=True)

    # Write the checkpoint file to local temp directory
    duckdb.execute(f"COPY (SELECT * FROM checkpoint_table) TO '{local_checkpoint_file}' (FORMAT PARQUET)")

    # Create minimal JSON log file (writes to local temp)
    # Note: Full add entries are in the checkpoint.parquet, JSON only has metadata
    create_dummy_json_log(local_table_root, delta_version, table_info, schema_fields, now, latest_snapshot,
                          total_files, total_rows, total_bytes)

    # Write the _last_checkpoint file to local temp directory
    with open(local_last_checkpoint_file, 'w') as f:
        total_records = 2 + len(file_rows)  # protocol + metadata + file records
        f.write(json.dumps({"version": delta_version, "size": total_records}))

    # Upload files to OneLake if store is provided
    if store:
        try:
            import obstore as obs

            # Extract relative paths from full ABFSS URLs for obstore
            # obstore expects paths relative to the store's base URL
            # remote_checkpoint_file is like: "abfss://.../Tables/simple/ducklake/_delta_log/file.parquet"
            # We need just: "simple/ducklake/_delta_log/file.parquet"
            def get_relative_path(full_path):
                # Split on /Tables/ and take the part after it
                # (the part after the LAST occurrence, because of the [-1])
                if '/Tables/' in full_path:
                    return full_path.split('/Tables/')[-1]
                return full_path.lstrip('/')

            rel_checkpoint = get_relative_path(remote_checkpoint_file)
            rel_json_log = get_relative_path(remote_json_log_file)
            rel_last_checkpoint = get_relative_path(remote_last_checkpoint_file)

            # Upload checkpoint file first
            with open(local_checkpoint_file, 'rb') as f:
                obs.put(store, rel_checkpoint, f.read())

            # Upload JSON log file second
            with open(local_json_log_file, 'rb') as f:
                obs.put(store, rel_json_log, f.read())

            # Upload _last_checkpoint file last for semi-decent consistency
            # (readers check this first to find the latest checkpoint)
            with open(local_last_checkpoint_file, 'rb') as f:
                obs.put(store, rel_last_checkpoint, f.read())

            print(f"✅ Exported DuckLake snapshot {latest_snapshot} as Delta checkpoint v{delta_version}")
            print(f"✅ Uploaded to: {remote_table_root}/_delta_log/")
        except Exception as e:
            # NOTE: this early return skips the DROP TABLE below;
            # checkpoint_table is dropped at the start of the next call instead.
            print(f"❌ Failed to upload checkpoint files: {e}")
            return False
    else:
        # Local mode - files are already written to temp directory
        print(f"✅ Exported DuckLake snapshot {latest_snapshot} as Delta checkpoint v{delta_version}")
        print(f"✅ Created local files in: {local_delta_log_dir}")

    # Clean up temporary tables
    duckdb.execute("DROP TABLE IF EXISTS checkpoint_table")

    return True, delta_version, latest_snapshot
491
+
492
def generate_latest_delta_log(db_path: str, data_root: str = None, store=None, token=None):
    """
    Export the latest DuckLake snapshot for each table as a Delta checkpoint file.
    Creates both checkpoint files and minimal JSON log files for Spark compatibility.

    Per-table failures are printed (with traceback) and do not abort the run.
    Failures while connecting, attaching or listing tables propagate to the
    caller. The DuckDB connection and the temporary working directory are
    released in every case.

    Args:
        db_path (str): The path to the DuckLake database file (can be ABFSS URL or local path).
        data_root (str): The root directory for the lakehouse data. If None, reads the
            'data_path' entry from the ducklake_metadata table.
        store: obstore AzureStore instance for uploading files (None for local mode).
        token: Azure auth token (None for local mode).
    """
    # Create temporary directory for local file operations
    temp_dir = tempfile.mkdtemp(prefix='ducklake_export_')
    con = None

    try:
        # Create an in-memory DuckDB connection
        con = duckdb.connect(':memory:')

        # If token is provided and db_path is ABFSS URL, set up Azure authentication
        if token and db_path.startswith('abfss://'):
            con.sql(f"CREATE OR REPLACE SECRET ducklake_secret (TYPE AZURE, PROVIDER ACCESS_TOKEN, ACCESS_TOKEN '{token}')")

        # Attach the DuckLake database (works for both local and ABFSS paths)
        con.execute(f"ATTACH '{db_path}' AS ducklake_db (READ_ONLY)")
        con.execute("USE ducklake_db")

        if data_root is None:
            data_root = con.sql("SELECT value FROM ducklake_metadata WHERE key = 'data_path'").fetchone()[0]

        # Get all active tables (end_snapshot IS NULL)
        tables = con.execute("""
            SELECT
                t.table_id,
                t.table_name,
                s.schema_name,
                t.path as table_path,
                s.path as schema_path
            FROM ducklake_table t
            JOIN ducklake_schema s USING(schema_id)
            WHERE t.end_snapshot IS NULL
        """).fetchall()

        total_tables = len(tables)
        successful_exports = 0

        for table_id, table_name, schema_name, table_path, schema_path in tables:
            table_info = {
                'table_id': table_id,
                'table_name': table_name,
                'schema_name': schema_name,
                'table_path': table_path,
                'schema_path': schema_path
            }

            table_key = f"{schema_name}.{table_name}"
            print(f"Processing {table_key}...")

            try:
                result = create_checkpoint_for_latest_snapshot(con, table_info, data_root, temp_dir, store, token)

                # False means nothing was exported (already exported, no
                # snapshots, or upload failure); the helper has already
                # printed the reason. Anything else is a success tuple.
                if result is not False:
                    successful_exports += 1

            except Exception as e:
                print(f"❌ {table_key}: Failed to export checkpoint - {e}")
                import traceback
                traceback.print_exc()

        print(f"\n🎉 Export completed! {successful_exports}/{total_tables} tables exported successfully.")

    finally:
        # Close the connection even when attach/query raised, so the
        # in-memory database and any remote handles are not leaked.
        if con is not None:
            try:
                con.close()
            except Exception as e:
                print(f"⚠️ Warning: Could not close DuckDB connection: {e}")

        # Clean up temporary directory
        try:
            shutil.rmtree(temp_dir)
        except Exception as e:
            print(f"⚠️ Warning: Could not clean up temp directory {temp_dir}: {e}")
@@ -73,7 +73,10 @@ def get_workspace_id(workspace_name_or_id, client):
73
73
 
74
74
 
75
75
  def get_lakehouse_id(lakehouse_name_or_id, workspace_id, client):
76
- """Get lakehouse ID by name or validate if already a GUID"""
76
+ """
77
+ Get lakehouse/item ID by name or validate if already a GUID.
78
+ Supports lakehouses, warehouses, databases, and other OneLake items.
79
+ """
77
80
  import re
78
81
 
79
82
  # Check if input is already a GUID
@@ -93,17 +96,114 @@ def get_lakehouse_id(lakehouse_name_or_id, workspace_id, client):
93
96
  except Exception as e:
94
97
  raise ValueError(f"Lakehouse with ID '{lakehouse_name_or_id}' not found: {e}")
95
98
 
96
- # It's a name, search for it
97
- response = client.get(f"/v1/workspaces/{workspace_id}/lakehouses")
98
- items = response.json().get('value', [])
99
+ # Parse item type from name (e.g., "ItemName.ItemType")
100
+ item_type_map = {
101
+ '.lakehouse': 'Lakehouse',
102
+ '.warehouse': 'Warehouse',
103
+ '.database': 'Database',
104
+ '.snowflakedatabase': 'SnowflakeDatabase'
105
+ }
106
+
107
+ item_type = None
108
+ item_name = lakehouse_name_or_id
109
+
110
+ for suffix, mapped_type in item_type_map.items():
111
+ if lakehouse_name_or_id.lower().endswith(suffix):
112
+ item_type = mapped_type
113
+ item_name = lakehouse_name_or_id[:-len(suffix)]
114
+ break
115
+
116
+ # If no item type suffix, assume it's a lakehouse
117
+ if item_type is None or item_type == 'Lakehouse':
118
+ # Use lakehouse-specific API
119
+ response = client.get(f"/v1/workspaces/{workspace_id}/lakehouses")
120
+ items = response.json().get('value', [])
121
+
122
+ lakehouse_match = next((item for item in items if item.get('displayName') == item_name), None)
123
+ if not lakehouse_match:
124
+ raise ValueError(f"Lakehouse '{item_name}' not found")
125
+
126
+ lakehouse_id = lakehouse_match['id']
127
+ print(f"✓ Found lakehouse: {item_name}")
128
+ return lakehouse_id
129
+ else:
130
+ # Use generic items API for non-lakehouse items
131
+ print(f" Searching for {item_type} '{item_name}'...")
132
+ response = client.get(f"/v1/workspaces/{workspace_id}/items")
133
+ items = response.json().get('value', [])
134
+
135
+ # Filter by type and name
136
+ item_match = next(
137
+ (item for item in items
138
+ if item.get('displayName') == item_name and item.get('type') == item_type),
139
+ None
140
+ )
141
+
142
+ if not item_match:
143
+ raise ValueError(f"{item_type} '{item_name}' not found")
144
+
145
+ item_id = item_match['id']
146
+ print(f"✓ Found {item_type.lower()}: {item_name}")
147
+ return item_id
148
+
149
+
150
+ def resolve_to_guid(identifier, identifier_type, client, workspace_id=None):
151
+ """
152
+ Resolve workspace or item identifier to GUID if it's a friendly name.
153
+ If already a GUID, returns as-is.
154
+
155
+ Args:
156
+ identifier: Workspace name/GUID or item name/GUID
157
+ identifier_type: 'workspace' or 'item'
158
+ client: FabricRestClient instance
159
+ workspace_id: Required if identifier_type is 'item'
160
+
161
+ Returns:
162
+ GUID string or None if resolution fails
163
+ """
164
+ import re
165
+
166
+ # Check if already a GUID
167
+ guid_pattern = re.compile(r'^[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}$', re.IGNORECASE)
168
+ if guid_pattern.match(identifier):
169
+ return identifier
99
170
 
100
- lakehouse_match = next((item for item in items if item.get('displayName') == lakehouse_name_or_id), None)
101
- if not lakehouse_match:
102
- raise ValueError(f"Lakehouse '{lakehouse_name_or_id}' not found")
171
+ try:
172
+ if identifier_type == 'workspace':
173
+ # Resolve workspace name to GUID
174
+ response = client.get("/v1/workspaces")
175
+ workspaces = response.json().get('value', [])
176
+ workspace_match = next((ws for ws in workspaces if ws.get('displayName') == identifier), None)
177
+ return workspace_match['id'] if workspace_match else None
178
+
179
+ elif identifier_type == 'item':
180
+ if not workspace_id:
181
+ return None
182
+
183
+ # Parse item type from identifier
184
+ item_type_map = {
185
+ '.lakehouse': 'Lakehouse',
186
+ '.warehouse': 'Warehouse',
187
+ '.database': 'Database',
188
+ '.snowflakedatabase': 'SnowflakeDatabase'
189
+ }
190
+
191
+ item_name = identifier
192
+ for suffix, mapped_type in item_type_map.items():
193
+ if identifier.lower().endswith(suffix):
194
+ item_name = identifier[:-len(suffix)]
195
+ break
196
+
197
+ # Try generic items API
198
+ response = client.get(f"/v1/workspaces/{workspace_id}/items")
199
+ items = response.json().get('value', [])
200
+ item_match = next((item for item in items if item.get('displayName') == item_name), None)
201
+
202
+ return item_match['id'] if item_match else None
103
203
 
104
- lakehouse_id = lakehouse_match['id']
105
- print(f" Found lakehouse: {lakehouse_name_or_id}")
106
- return lakehouse_id
204
+ except Exception as e:
205
+ print(f" ⚠️ Could not resolve {identifier_type} to GUID: {e}")
206
+ return None
107
207
 
108
208
 
109
209
  def get_dataset_id(dataset_name, workspace_id, client):
@@ -406,7 +506,14 @@ def download_bim_from_github(url_or_path):
406
506
 
407
507
 
408
508
  def update_bim_for_directlake(bim_content, workspace_id, lakehouse_id, schema_name):
409
- """Update BIM file for DirectLake mode"""
509
+ """
510
+ Update BIM file for DirectLake mode.
511
+
512
+ Args:
513
+ workspace_id: Workspace GUID (should be actual GUID, not friendly name)
514
+ lakehouse_id: Item GUID (should be actual GUID, not friendly name with suffix)
515
+ schema_name: Schema name
516
+ """
410
517
 
411
518
  new_url = f"https://onelake.dfs.fabric.microsoft.com/{workspace_id}/{lakehouse_id}"
412
519
  expression_name = None
@@ -606,15 +713,29 @@ def deploy_semantic_model(workspace_name_or_id, lakehouse_name_or_id, schema_nam
606
713
  print("=" * 70)
607
714
  return 1
608
715
 
609
- # Step 3: Get lakehouse ID
716
+ # Step 3: Get lakehouse ID and ensure we have GUIDs for the BIM
610
717
  print(f"\n[Step 3/6] Finding lakehouse...")
611
718
  lakehouse_id = get_lakehouse_id(lakehouse_name_or_id, workspace_id, client)
612
719
 
720
+ # Step 3.5: Resolve to actual GUIDs for semantic model compatibility
721
+ print(f"\n[Step 3.5/6] Resolving to GUIDs for semantic model...")
722
+ workspace_guid = resolve_to_guid(workspace_id, 'workspace', client)
723
+ lakehouse_guid = resolve_to_guid(lakehouse_id, 'item', client, workspace_guid)
724
+
725
+ if workspace_guid:
726
+ print(f"✓ Workspace GUID: {workspace_guid}")
727
+ if lakehouse_guid:
728
+ print(f"✓ Item GUID: {lakehouse_guid}")
729
+
730
+ # Use GUIDs if available, otherwise fall back to original values
731
+ workspace_for_bim = workspace_guid if workspace_guid else workspace_id
732
+ lakehouse_for_bim = lakehouse_guid if lakehouse_guid else lakehouse_id
733
+
613
734
  # Step 4: Download and update BIM
614
735
  print("\n[Step 4/6] Loading and configuring BIM file...")
615
736
  bim_content = download_bim_from_github(bim_url_or_path)
616
737
 
617
- modified_bim = update_bim_for_directlake(bim_content, workspace_id, lakehouse_id, schema_name)
738
+ modified_bim = update_bim_for_directlake(bim_content, workspace_for_bim, lakehouse_for_bim, schema_name)
618
739
  modified_bim['name'] = dataset_name
619
740
  modified_bim['id'] = dataset_name
620
741
 
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: duckrun
3
- Version: 0.2.19.dev8
3
+ Version: 0.2.20.dev0
4
4
  Summary: Helper library for Fabric Python using duckdb, arrow and delta_rs (orchestration, queries, etc.)
5
5
  Author: mim
6
6
  License: MIT
@@ -4,6 +4,7 @@ pyproject.toml
4
4
  duckrun/__init__.py
5
5
  duckrun/auth.py
6
6
  duckrun/core.py
7
+ duckrun/ducklake_metadata.py
7
8
  duckrun/files.py
8
9
  duckrun/lakehouse.py
9
10
  duckrun/notebook.py
@@ -17,5 +18,7 @@ duckrun.egg-info/SOURCES.txt
17
18
  duckrun.egg-info/dependency_links.txt
18
19
  duckrun.egg-info/requires.txt
19
20
  duckrun.egg-info/top_level.txt
21
+ tests/test_checkpoint_format.py
22
+ tests/test_ducklake_export.py
20
23
  tests/test_register.py
21
24
  tests/test_rle.py
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "duckrun"
7
- version = "0.2.19.dev8"
7
+ version = "0.2.20.dev0"
8
8
  description = "Helper library for Fabric Python using duckdb, arrow and delta_rs (orchestration, queries, etc.)"
9
9
  readme = "README.md"
10
10
  license = {text = "MIT"}
@@ -0,0 +1,102 @@
1
+ """
2
+ Test to verify our checkpoint parquet format matches Delta Lake specification
3
+ by comparing with a real Delta Lake checkpoint file.
4
+ """
5
+ import duckdb
6
+ import json
7
+
8
+ def test_checkpoint_columns():
9
+ """Verify the checkpoint has the correct columns with correct types"""
10
+
11
+ # Expected columns for Delta Lake checkpoint
12
+ expected_columns = {
13
+ 'protocol': 'STRUCT(minReaderVersion INTEGER, minWriterVersion INTEGER)',
14
+ 'metaData': 'STRUCT', # Complex nested structure
15
+ 'add': 'STRUCT', # Complex nested structure with stats
16
+ 'remove': 'STRUCT',
17
+ 'commitInfo': 'STRUCT'
18
+ }
19
+
20
+ # This would be the path to a generated checkpoint - for now just verify structure
21
+ print("✓ Checkpoint should have these columns:")
22
+ for col, typ in expected_columns.items():
23
+ print(f" - {col}: {typ}")
24
+
25
+ print("\n✓ The 'add' struct should contain:")
26
+ print(" - path: VARCHAR")
27
+ print(" - partitionValues: MAP(VARCHAR, VARCHAR)")
28
+ print(" - size: BIGINT")
29
+ print(" - modificationTime: BIGINT")
30
+ print(" - dataChange: BOOLEAN")
31
+ print(" - stats: VARCHAR (JSON string)")
32
+ print(" - tags: MAP(VARCHAR, VARCHAR)")
33
+
34
+ print("\n✓ The 'stats' JSON string should contain:")
35
+ print(" - numRecords: INTEGER")
36
+ print(" - minValues: MAP with properly typed values")
37
+ print(" - maxValues: MAP with properly typed values")
38
+ print(" - nullCount: MAP with INTEGER values")
39
+
40
+ def test_checkpoint_structure_from_json():
41
+ """Test that our checkpoint structure matches the real Delta checkpoint JSON"""
42
+
43
+ # Read the real checkpoint JSON
44
+ with open('tests/00000000000000000000.json', 'r') as f:
45
+ lines = f.readlines()
46
+
47
+ print("=== Real Delta Lake Checkpoint Analysis ===\n")
48
+
49
+ # Parse each entry
50
+ entry_types = []
51
+ add_count = 0
52
+
53
+ for i, line in enumerate(lines[:10]): # Check first 10 lines
54
+ entry = json.loads(line)
55
+ entry_type = list(entry.keys())[0]
56
+ entry_types.append(entry_type)
57
+
58
+ if entry_type == 'add':
59
+ add_count += 1
60
+ if add_count == 1: # Show first add entry structure
61
+ add_entry = entry['add']
62
+ print(f"✓ ADD Entry Structure (line {i+1}):")
63
+ print(f" - path: {type(add_entry['path']).__name__}")
64
+ print(f" - partitionValues: {type(add_entry['partitionValues']).__name__} = {add_entry['partitionValues']}")
65
+ print(f" - size: {type(add_entry['size']).__name__} = {add_entry['size']}")
66
+ print(f" - modificationTime: {type(add_entry['modificationTime']).__name__}")
67
+ print(f" - dataChange: {type(add_entry['dataChange']).__name__} = {add_entry['dataChange']}")
68
+ print(f" - stats: {type(add_entry['stats']).__name__} (JSON string)")
69
+ print(f" - tags: {type(add_entry['tags']).__name__} = {add_entry['tags']}")
70
+
71
+ # Parse stats JSON
72
+ stats = json.loads(add_entry['stats'])
73
+ print(f"\n✓ STATS Structure:")
74
+ print(f" - numRecords: {type(stats['numRecords']).__name__} = {stats['numRecords']}")
75
+ print(f" - minValues: {type(stats['minValues']).__name__} with {len(stats['minValues'])} columns")
76
+ print(f" - maxValues: {type(stats['maxValues']).__name__} with {len(stats['maxValues'])} columns")
77
+ print(f" - nullCount: {type(stats['nullCount']).__name__} with {len(stats['nullCount'])} columns")
78
+
79
+ # Check value types in stats
80
+ print(f"\n✓ Sample minValues types:")
81
+ for key, value in list(stats['minValues'].items())[:3]:
82
+ print(f" - {key}: {type(value).__name__} = {value}")
83
+
84
+ elif entry_type == 'commitInfo':
85
+ print(f"✓ COMMITINFO Entry (line {i+1})")
86
+ elif entry_type == 'metaData':
87
+ print(f"✓ METADATA Entry (line {i+1})")
88
+ elif entry_type == 'protocol':
89
+ protocol = entry['protocol']
90
+ print(f"✓ PROTOCOL Entry (line {i+1}):")
91
+ print(f" - minReaderVersion: {protocol['minReaderVersion']}")
92
+ print(f" - minWriterVersion: {protocol['minWriterVersion']}")
93
+
94
+ print(f"\n=== Entry Count Summary ===")
95
+ print(f"Total entries analyzed: {len(lines)}")
96
+ print(f"Entry types: {', '.join(set(entry_types))}")
97
+ print(f"Add entries: {len([t for t in entry_types if t == 'add'])}")
98
+
99
+ if __name__ == '__main__':
100
+ test_checkpoint_columns()
101
+ print("\n" + "="*50 + "\n")
102
+ test_checkpoint_structure_from_json()
@@ -0,0 +1,7 @@
1
+ import duckrun
2
+ def test_ducklake_export():
3
+ con = duckrun.connect("tmp/tmp.Lakehouse/dbo")
4
+ result = con.export_ducklake_to_delta("meta.db")
5
+ return result
6
+ if __name__ == "__main__":
7
+ test_ducklake_export()
File without changes
File without changes
File without changes