ducklake-delta-exporter 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,52 @@
+ Metadata-Version: 2.4
+ Name: ducklake-delta-exporter
+ Version: 0.1.0
+ Summary: A utility to export DuckLake database metadata to Delta Lake transaction logs.
+ Home-page: https://github.com/djouallah/ducklake-delta-exporter
+ Author: mim
+ Author-email: your.email@example.com
+ Classifier: Programming Language :: Python :: 3
+ Classifier: License :: OSI Approved :: MIT License
+ Classifier: Operating System :: OS Independent
+ Classifier: Intended Audience :: Developers
+ Classifier: Topic :: Software Development :: Libraries :: Python Modules
+ Classifier: Development Status :: 3 - Alpha
+ Requires-Python: >=3.8
+ Description-Content-Type: text/markdown
+ Requires-Dist: duckdb
+ Requires-Dist: pyarrow
+ Requires-Dist: pandas
+ Dynamic: author
+ Dynamic: author-email
+ Dynamic: classifier
+ Dynamic: description
+ Dynamic: description-content-type
+ Dynamic: home-page
+ Dynamic: requires-dist
+ Dynamic: requires-python
+ Dynamic: summary
+
+
+ # DuckLake Delta Exporter
+ A Python utility to synchronize metadata from a DuckLake database with Delta Lake transaction logs. This allows you to manage data in DuckLake and make it discoverable and queryable by Delta Lake-compatible tools (e.g., Spark, Delta Lake Rust/Python clients).
+
+ # Features
+ - DuckLake to Delta Sync: Generates incremental Delta Lake transaction logs (`_delta_log/*.json`) and checkpoint files (`_delta_log/*.checkpoint.parquet`) based on the latest state of tables in a DuckLake database.
+ - Schema Mapping: Automatically maps DuckDB data types to their Spark SQL equivalents for Delta Lake schema definitions.
+ - Change Detection: Identifies added and removed data files since the last Delta export, ensuring only necessary updates are written to the log.
+ - Checkpointing: Supports creating Delta Lake checkpoint files at a configurable interval for efficient state reconstruction.
+
+ # Installation
+ You can install this package using pip:
+
+ ```
+ pip install ducklake-delta-exporter
+ ```
+
+ # Usage
+ ```
+ from ducklake_delta_exporter import generate_latest_delta_log
+ generate_latest_delta_log('path/to/your/ducklake.db', data_root='/lakehouse/default/Tables', checkpoint_interval=1)
+ ```
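+
+ For reference, each run writes a newline-delimited JSON commit file into the table's `_delta_log/` directory (plus an optional Parquet checkpoint and a `_last_checkpoint` pointer). Below is a minimal sketch of inspecting one of these commits; the table path is only a placeholder:
+
+ ```
+ import json
+
+ log_file = '/lakehouse/default/Tables/my_schema/my_table/_delta_log/00000000000000000000.json'
+ with open(log_file) as f:
+     for line in f:
+         action = json.loads(line)
+         # Each line holds a single action: protocol, metaData, add, remove, or commitInfo
+         print(list(action.keys())[0])
+ ```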
@@ -0,0 +1,25 @@
+
+ # DuckLake Delta Exporter
+ A Python utility to synchronize metadata from a DuckLake database with Delta Lake transaction logs. This allows you to manage data in DuckLake and make it discoverable and queryable by Delta Lake-compatible tools (e.g., Spark, Delta Lake Rust/Python clients).
+
+ # Features
+ - DuckLake to Delta Sync: Generates incremental Delta Lake transaction logs (`_delta_log/*.json`) and checkpoint files (`_delta_log/*.checkpoint.parquet`) based on the latest state of tables in a DuckLake database.
+ - Schema Mapping: Automatically maps DuckDB data types to their Spark SQL equivalents for Delta Lake schema definitions.
+ - Change Detection: Identifies added and removed data files since the last Delta export, ensuring only necessary updates are written to the log.
+ - Checkpointing: Supports creating Delta Lake checkpoint files at a configurable interval for efficient state reconstruction.
+
+ # Installation
+ You can install this package using pip:
+
+ ```
+ pip install ducklake-delta-exporter
+ ```
+
+ # Usage
+ ```
+ from ducklake_delta_exporter import generate_latest_delta_log
+ generate_latest_delta_log('path/to/your/ducklake.db', data_root='/lakehouse/default/Tables', checkpoint_interval=1)
+ ```
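+
+ Once the log has been generated, the exported table should be readable by any Delta Lake-compatible client. As a quick sanity check, here is a minimal sketch using the `deltalake` (delta-rs) Python package, assuming it is installed and that a table was exported under `/lakehouse/default/Tables/<schema>/<table>`:
+
+ ```
+ from deltalake import DeltaTable
+
+ # Point DeltaTable at the exported table directory (the one containing _delta_log)
+ dt = DeltaTable('/lakehouse/default/Tables/my_schema/my_table')
+ print(dt.version())   # latest Delta version written by the exporter
+ print(dt.files())     # data files currently registered in the log
+ data = dt.to_pyarrow_table()  # read the table contents via the Delta log
+ ```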
@@ -0,0 +1,461 @@
+ # File: ducklake_delta_exporter.py
+ import os
+ import json
+ import uuid
+ import time
+ import duckdb
+ import pyarrow as pa
+ import pyarrow.parquet as pq
+
+ def map_type_ducklake_to_spark(t):
+     """Maps DuckDB data types to their Spark SQL equivalents for the Delta schema."""
+     t = t.lower()
+     if 'int' in t:
+         # BIGINT, HUGEINT and INT8 are 64 bits or wider and must map to Spark's 'long'
+         return 'long' if ('bigint' in t or 'hugeint' in t or 'int8' in t or '64' in t) else 'integer'
+     elif 'float' in t:
+         return 'double'
+     elif 'double' in t:
+         return 'double'
+     elif 'decimal' in t:
+         # Preserve the declared precision/scale (e.g. decimal(18,3)) when it is available
+         return t if '(' in t else 'decimal(10,0)'
+     elif 'bool' in t:
+         return 'boolean'
+     elif 'timestamp' in t:
+         return 'timestamp'
+     elif 'date' in t:
+         return 'date'
+     return 'string'
+
+ def create_spark_schema_string(fields):
+     """Creates a JSON string for the Spark schema from a list of fields."""
+     return json.dumps({"type": "struct", "fields": fields})
+
+ def get_spark_checkpoint_schema():
+     """Returns the PyArrow schema for a Delta Lake checkpoint file."""
+     return pa.schema([
+         pa.field("protocol", pa.struct([
+             pa.field("minReaderVersion", pa.int32()),  # nullable by default
+             pa.field("minWriterVersion", pa.int32())   # nullable by default
+         ]), nullable=True),
+         pa.field("metaData", pa.struct([
+             pa.field("id", pa.string()),
+             pa.field("name", pa.string()),
+             pa.field("description", pa.string()),
+             pa.field("format", pa.struct([
+                 pa.field("provider", pa.string()),
+                 pa.field("options", pa.map_(pa.string(), pa.string()))
+             ])),
+             pa.field("schemaString", pa.string()),
+             pa.field("partitionColumns", pa.list_(pa.string())),
+             pa.field("createdTime", pa.int64()),
+             pa.field("configuration", pa.map_(pa.string(), pa.string()))
+         ]), nullable=True),
+         pa.field("add", pa.struct([
+             pa.field("path", pa.string()),
+             pa.field("partitionValues", pa.map_(pa.string(), pa.string())),
+             pa.field("size", pa.int64()),
+             pa.field("modificationTime", pa.int64()),
+             pa.field("dataChange", pa.bool_()),
+             pa.field("stats", pa.string(), nullable=True),
+             pa.field("tags", pa.map_(pa.string(), pa.string()), nullable=True)
+             # Optional add fields (deletionVector, baseRowId, defaultRowCommitVersion, clusteringProvider) are omitted
+         ]), nullable=True),
+         pa.field("remove", pa.struct([
+             pa.field("path", pa.string()),
+             pa.field("deletionTimestamp", pa.int64()),
+             pa.field("dataChange", pa.bool_())
+         ]), nullable=True),
+         pa.field("commitInfo", pa.struct([
+             pa.field("timestamp", pa.timestamp('ms'), False),  # non-nullable millisecond timestamp
+             pa.field("operation", pa.string()),
+             pa.field("operationParameters", pa.map_(pa.string(), pa.string())),
+             pa.field("isBlindAppend", pa.bool_(), nullable=True),
+             pa.field("engineInfo", pa.string(), nullable=True),
+             pa.field("clientVersion", pa.string(), nullable=True)
+         ]), nullable=True)
+     ])
+
+ def get_latest_delta_version_info(delta_log_path, con, table_id):
+     """
+     Determines the latest Delta version exported and reconstructs the set of files
+     that were part of that Delta version, based on the embedded DuckLake snapshot ID.
+     Also retrieves the consistent metaData.id if available from version 0.
+
+     Returns (latest_delta_version, set_of_files_in_that_version, latest_ducklake_snapshot_id_in_delta, meta_id_from_delta_log).
+     """
+     last_delta_version_idx = -1
+     last_exported_ducklake_snapshot_id = None
+     files_in_last_delta_version = set()
+     meta_id_from_delta_log = None  # This should be consistent for the table
+
+     # Collect all files ending with .json
+     log_files = [f for f in os.listdir(delta_log_path) if f.endswith('.json')]
+
+     if not log_files:
+         return last_delta_version_idx, files_in_last_delta_version, last_exported_ducklake_snapshot_id, meta_id_from_delta_log
+
+     try:
+         # Collect valid version numbers from file names
+         found_versions = []
+         for f_name in log_files:
+             base_name = f_name.split('.')[0]
+             # Check if filename starts with '0000' and consists entirely of digits
+             if base_name.startswith('0000') and base_name.isdigit():
+                 found_versions.append(int(base_name))
+
+         if not found_versions:
+             # No valid versioned log files found with the '0000' prefix
+             return last_delta_version_idx, files_in_last_delta_version, last_exported_ducklake_snapshot_id, meta_id_from_delta_log
+
+         # Get the highest version index
+         last_delta_version_idx = max(found_versions)
+         last_log_file = os.path.join(delta_log_path, f"{last_delta_version_idx:020d}.json")
+
+         # Attempt to read the last log file for commitInfo and metaData (if present)
+         with open(last_log_file, 'r') as f:
+             for line in f:
+                 try:
+                     action = json.loads(line)
+                     if 'commitInfo' in action:
+                         commit_info = action['commitInfo']
+                         if 'operationParameters' in commit_info and 'duckLakeSnapshotId' in commit_info['operationParameters']:
+                             last_exported_ducklake_snapshot_id = int(commit_info['operationParameters']['duckLakeSnapshotId'])
+                     if 'metaData' in action:
+                         meta_id_from_delta_log = action['metaData'].get('id')
+                 except json.JSONDecodeError as e:
+                     print(f"ERROR: Failed to parse JSON line in {last_log_file}: {line.strip()}. Error: {e}")
+                 except Exception as e:
+                     print(f"ERROR: Unexpected error processing line in {last_log_file}: {e}")
+
+         # If metaData.id was not found in the latest log file, try to get it from version 0
+         if meta_id_from_delta_log is None:
+             v0_log_file = os.path.join(delta_log_path, "00000000000000000000.json")
+             if os.path.exists(v0_log_file):
+                 with open(v0_log_file, 'r') as v0f:
+                     for v0_line in v0f:
+                         try:
+                             v0_action = json.loads(v0_line)
+                             if 'metaData' in v0_action:
+                                 meta_id_from_delta_log = v0_action['metaData'].get('id')
+                                 break
+                         except json.JSONDecodeError:
+                             pass  # Ignore parsing errors for v0 metadata, just try the next line
+
+         # If a valid last_exported_ducklake_snapshot_id was found, reconstruct the files
+         if last_exported_ducklake_snapshot_id is not None:
+             file_rows = con.execute(f"""
+                 SELECT path FROM ducklake_data_file
+                 WHERE table_id = {table_id}
+                 AND begin_snapshot <= {last_exported_ducklake_snapshot_id} AND (end_snapshot IS NULL OR end_snapshot > {last_exported_ducklake_snapshot_id})
+             """).fetchall()
+             files_in_last_delta_version = {path.lstrip('/') for path, in file_rows}
+         else:
+             print(f"WARNING: 'duckLakeSnapshotId' not found or parsed from latest log ({last_log_file}). Cannot reconstruct previous Delta table state accurately for diffing.")
+
+     except Exception as e:
+         print(f"ERROR: Unhandled exception in get_latest_delta_version_info for {delta_log_path}. Resetting state. Error: {e}")
+         last_delta_version_idx = -1  # Reset to -1 if there's an issue parsing or finding files
+
+     return last_delta_version_idx, files_in_last_delta_version, last_exported_ducklake_snapshot_id, meta_id_from_delta_log
+
+
+ def generate_latest_delta_log(db_path: str, data_root: str = '/lakehouse/default/Tables', checkpoint_interval: int = 1):
+     """
+     Generates a Delta Lake transaction log for the LATEST state of each table in a DuckLake database.
+     This creates incremental updates to Delta, not a full history.
+
+     Args:
+         db_path (str): The path to the DuckLake database file.
+         data_root (str): The root directory for the lakehouse data.
+         checkpoint_interval (int): The interval at which to create checkpoint files.
+     """
+     con = duckdb.connect(db_path, read_only=True)
+
+     tables = con.sql("""
+         SELECT
+             t.table_id,
+             t.table_name,
+             s.schema_name,
+             t.path as table_path,
+             s.path as schema_path
+         FROM ducklake_table t
+         JOIN ducklake_schema s USING(schema_id)
+         WHERE t.end_snapshot IS NULL
+     """).df()
+
+     for row in tables.itertuples():
+         table_key = f"{row.schema_name}.{row.table_name}"
+         table_root = os.path.join(data_root, row.schema_path, row.table_path)
+         delta_log_path = os.path.join(table_root, "_delta_log")
+         os.makedirs(delta_log_path, exist_ok=True)
+
+         # 1. Get the LATEST DuckLake snapshot for this table
+         latest_ducklake_snapshot_raw = con.execute(f"""
+             SELECT MAX(begin_snapshot) FROM ducklake_data_file
+             WHERE table_id = {row.table_id}
+         """).fetchone()
+
+         if not latest_ducklake_snapshot_raw or latest_ducklake_snapshot_raw[0] is None:
+             print(f"⚠️ {table_key}: No data files found in DuckLake, skipping Delta log generation.")
+             continue
+
+         latest_ducklake_snapshot_id = latest_ducklake_snapshot_raw[0]
+
+         # 2. Determine the current state of the Delta table and the next Delta version
+         last_delta_version_idx, previously_exported_files, last_exported_ducklake_snapshot_id, existing_meta_id = \
+             get_latest_delta_version_info(delta_log_path, con, row.table_id)
+
+         next_delta_version = last_delta_version_idx + 1
+
+         # Check if the Delta table is already up to date with the latest DuckLake snapshot
+         if last_exported_ducklake_snapshot_id == latest_ducklake_snapshot_id:
+             print(f"✅ {table_key}: Delta table already at latest DuckLake snapshot {latest_ducklake_snapshot_id} (Delta version {last_delta_version_idx}), skipping export.")
+             continue  # Nothing to do, skip to the next table
+
+         try:
+             now = int(time.time() * 1000)
+             log_file = os.path.join(delta_log_path, f"{next_delta_version:020d}.json")
+             checkpoint_file = os.path.join(delta_log_path, f"{next_delta_version:020d}.checkpoint.parquet")
+
+             # Fetch all current files associated with the LATEST DuckLake snapshot
+             file_rows_for_current_version = con.execute(f"""
+                 SELECT path, file_size_bytes FROM ducklake_data_file
+                 WHERE table_id = {row.table_id}
+                 AND begin_snapshot <= {latest_ducklake_snapshot_id} AND (end_snapshot IS NULL OR end_snapshot > {latest_ducklake_snapshot_id})
+             """).fetchall()
+
+             current_files_map = {}
+             for path, size in file_rows_for_current_version:
+                 rel_path = path.lstrip('/')
+                 full_path = os.path.join(table_root, rel_path)
+                 mod_time = int(os.path.getmtime(full_path) * 1000) if os.path.exists(full_path) else now
+                 current_files_map[rel_path] = {
+                     "path": rel_path, "size": size, "modification_time": mod_time,
+                     "stats": json.dumps({"numRecords": None})  # Computing stats would require reading the files
+                 }
+             current_file_paths = set(current_files_map.keys())
+
+             # Calculate the diff between the previous Delta state and the current latest DuckLake snapshot
+             added_file_paths = current_file_paths - previously_exported_files
+             removed_file_paths_set = previously_exported_files - current_file_paths
+
+             added_files_data = [current_files_map[p] for p in added_file_paths]
+             # removed_files_paths only needs the paths, not the full dicts
+             removed_files_paths = list(removed_file_paths_set)
+
+             # If there are no changes and this is not the initial version 0, skip writing a log file.
+             # Version 0 is always written on the first export, even for an empty table.
+             if not added_files_data and not removed_files_paths and next_delta_version > 0:
+                 print(f" {table_key}: No *detectable* changes between previous Delta state and latest DuckLake snapshot {latest_ducklake_snapshot_id}. Skipping new Delta log for version {next_delta_version}.")
+                 continue  # Skip to the next table
+
+             # Get the schema for the metadata (always from the latest DuckLake snapshot)
+             columns = con.execute(f"""
+                 SELECT column_name, column_type FROM ducklake_column
+                 WHERE table_id = {row.table_id}
+                 AND begin_snapshot <= {latest_ducklake_snapshot_id} AND (end_snapshot IS NULL OR end_snapshot > {latest_ducklake_snapshot_id})
+                 ORDER BY column_order
+             """).fetchall()
+
+             with open(log_file, 'w') as f:
+                 # The protocol action always comes first
+                 f.write(json.dumps({"protocol": {"minReaderVersion": 1, "minWriterVersion": 2}}) + "\n")
+
+                 # Determine the table_meta_id
+                 table_meta_id = existing_meta_id if existing_meta_id else str(uuid.uuid4())
+
+                 # The metaData action always comes second
+                 schema_fields = [{"name": name, "type": map_type_ducklake_to_spark(typ), "nullable": True, "metadata": {}} for name, typ in columns]
+
+                 # Table configuration, including logRetentionDuration
+                 table_configuration = {"delta.logRetentionDuration": "interval 1 hour"}
+
+                 f.write(json.dumps({
+                     "metaData": {
+                         "id": table_meta_id,
+                         "name": row.table_name if row.table_name else None,
+                         "description": None,
+                         "format": {"provider": "parquet", "options": {}},
+                         "schemaString": create_spark_schema_string(schema_fields),
+                         "partitionColumns": [],
+                         "createdTime": now,
+                         "configuration": table_configuration
+                     }
+                 }) + "\n")
+
+                 # Write remove actions
+                 for path in removed_files_paths:
+                     f.write(json.dumps({"remove": {"path": path, "deletionTimestamp": now, "dataChange": True}}) + "\n")
+
+                 # Write add actions (optional fields such as deletionVector and baseRowId are omitted)
+                 for af in added_files_data:
+                     f.write(json.dumps({
+                         "add": {
+                             "path": af["path"],
+                             "partitionValues": {},
+                             "size": af["size"],
+                             "modificationTime": af["modification_time"],
+                             "dataChange": True,
+                             "stats": af["stats"],
+                             "tags": None  # No tags are attached to data files
+                         }
+                     }) + "\n")
+
+                 # Prepare operationParameters for commitInfo based on the Delta version
+                 commit_operation_parameters = {
+                     "mode": "Overwrite",
+                     "partitionBy": "[]",
+                     "duckLakeSnapshotId": str(latest_ducklake_snapshot_id)
+                 }
+                 commit_operation = "WRITE"
+
+                 if next_delta_version == 0:
+                     # For v0, emulate the 'CREATE TABLE' operation parameters
+                     commit_operation = "CREATE TABLE"
+                     commit_operation_parameters = {
+                         "mode": "ErrorIfExists",
+                         "location": f"{data_root}/{row.schema_path}/{row.table_path}",  # Location constructed from data_root
+                         "protocol": json.dumps({"minReaderVersion": 1, "minWriterVersion": 2}),
+                         "metadata": json.dumps({  # Stringified metadata object
+                             "configuration": table_configuration,
+                             "createdTime": now,
+                             "description": None,
+                             "format": {"options": {}, "provider": "parquet"},
+                             "id": table_meta_id,
+                             "name": row.table_name if row.table_name else None,
+                             "partitionColumns": [],
+                             "schemaString": create_spark_schema_string(schema_fields)
+                         })
+                     }
+
+                 # Write the commitInfo action
+                 f.write(json.dumps({
+                     "commitInfo": {
+                         "timestamp": now,
+                         "operation": commit_operation,
+                         "operationParameters": commit_operation_parameters,
+                         "isBlindAppend": not removed_files_paths,
+                         "engineInfo": "DuckLake-Delta-Export-Latest",
+                         "clientVersion": "delta-rs.0.18.1" if next_delta_version == 0 else "DuckLake-Delta-Python"  # Mimic the delta-rs client string for the initial commit
+                     }
+                 }) + "\n")
+
+             print(f"✅ {table_key}: Delta log written v{next_delta_version} (DuckLake snapshot: {latest_ducklake_snapshot_id})")
+
+             # --- CHECKPOINT LOGIC ---
+             # Create a checkpoint if this is a checkpoint version and one doesn't already exist
+             if next_delta_version > 0 and next_delta_version % checkpoint_interval == 0 and not os.path.exists(checkpoint_file):
+                 checkpoint_records = []
+
+                 # First record: protocol only
+                 checkpoint_records.append({
+                     "protocol": {"minReaderVersion": 1, "minWriterVersion": 2},
+                     "metaData": None,
+                     "add": None,
+                     "remove": None,
+                     "commitInfo": None
+                 })
+
+                 # Second record: metadata only
+                 checkpoint_meta_id = existing_meta_id if existing_meta_id else str(uuid.uuid4())
+                 checkpoint_records.append({
+                     "protocol": None,
+                     "commitInfo": None,
+                     "remove": None,
+                     "add": None,
+                     "metaData": {
+                         "id": checkpoint_meta_id,
+                         "name": row.table_name if row.table_name else None,
+                         "description": None,
+                         "format": {"provider": "parquet", "options": {}},
+                         "schemaString": create_spark_schema_string(schema_fields),
+                         "partitionColumns": [],
+                         "createdTime": now,
+                         "configuration": {"delta.logRetentionDuration": "interval 1 hour"}
+                     },
+                 })
+
+                 # Add all current files from the latest DuckLake snapshot to the checkpoint
+                 for af_path in current_file_paths:
+                     af = current_files_map[af_path]
+                     checkpoint_records.append({
+                         "protocol": None,
+                         "metaData": None,
+                         "remove": None,
+                         "commitInfo": None,
+                         "add": {
+                             "path": af["path"],
+                             "partitionValues": {},
+                             "size": af["size"],
+                             "modificationTime": af["modification_time"],
+                             "dataChange": True,
+                             "stats": af["stats"],
+                             "tags": None
+                         },
+                     })
+
+                 # Build the column data for the PyArrow table, preserving None values
+                 table_data = {
+                     'protocol': [record.get("protocol") for record in checkpoint_records],
+                     'metaData': [record.get("metaData") for record in checkpoint_records],
+                     'add': [record.get("add") for record in checkpoint_records],
+                     'remove': [record.get("remove") for record in checkpoint_records],
+                     'commitInfo': [record.get("commitInfo") for record in checkpoint_records]
+                 }
+
+                 # Create the table directly with the target schema to avoid casting issues
+                 target_schema = get_spark_checkpoint_schema()
+                 table = pa.table(table_data, schema=target_schema)
+                 pq.write_table(table, checkpoint_file, compression='snappy')
+
+                 with open(os.path.join(delta_log_path, "_last_checkpoint"), 'w') as f:
+                     json.dump({"version": next_delta_version, "size": len(checkpoint_records)}, f)
+
+                 print(f"📸 {table_key}: Checkpoint created at Delta version {next_delta_version} (DuckLake snapshot: {latest_ducklake_snapshot_id})")
+
+                 # --- Clean up old JSON log files and checkpoint files ---
+                 print(f"🧹 {table_key}: Cleaning up old log and checkpoint files before Delta version {next_delta_version}...")
+                 for f_name in os.listdir(delta_log_path):
+                     base_name = f_name.split('.')[0]
+                     # Versioned JSON log files
+                     if f_name.endswith('.json') and base_name.startswith('0000') and base_name.isdigit():
+                         log_version = int(base_name)
+                         if log_version < next_delta_version:
+                             file_to_delete = os.path.join(delta_log_path, f_name)
+                             try:
+                                 os.remove(file_to_delete)
+                                 print(f" Deleted JSON log: {f_name}")
+                             except OSError as e:
+                                 print(f" Error deleting JSON log {f_name}: {e}")
+                     # Versioned Parquet checkpoint files
+                     elif f_name.endswith('.checkpoint.parquet'):
+                         checkpoint_base_name = f_name.split('.checkpoint.parquet')[0]
+                         if checkpoint_base_name.startswith('0000') and checkpoint_base_name.isdigit():
+                             checkpoint_version = int(checkpoint_base_name)
+                             if checkpoint_version < next_delta_version:
+                                 file_to_delete = os.path.join(delta_log_path, f_name)
+                                 try:
+                                     os.remove(file_to_delete)
+                                     print(f" Deleted checkpoint: {f_name}")
+                                 except OSError as e:
+                                     print(f" Error deleting checkpoint {f_name}: {e}")
+                 print(f"🧹 {table_key}: Cleanup complete.")
+
+             elif next_delta_version > 0 and next_delta_version % checkpoint_interval == 0 and os.path.exists(checkpoint_file):
+                 print(f"⏩ {table_key}: Checkpoint for Delta version {next_delta_version} (DuckLake snapshot: {latest_ducklake_snapshot_id}) already exists, skipping generation.")
+
+         except Exception as e:
+             print(f"❌ Failed processing {table_key} for Delta version {next_delta_version} (DuckLake snapshot: {latest_ducklake_snapshot_id}): {e}")
+             # Ideally a partially written log file would be rolled back here;
+             # for this script we just log the error and continue with the next table.
+
+     con.close()
+     print("Delta export finished.")
@@ -0,0 +1,52 @@
+ Metadata-Version: 2.4
+ Name: ducklake-delta-exporter
+ Version: 0.1.0
+ Summary: A utility to export DuckLake database metadata to Delta Lake transaction logs.
+ Home-page: https://github.com/djouallah/ducklake-delta-exporter
+ Author: mim
+ Author-email: your.email@example.com
+ Classifier: Programming Language :: Python :: 3
+ Classifier: License :: OSI Approved :: MIT License
+ Classifier: Operating System :: OS Independent
+ Classifier: Intended Audience :: Developers
+ Classifier: Topic :: Software Development :: Libraries :: Python Modules
+ Classifier: Development Status :: 3 - Alpha
+ Requires-Python: >=3.8
+ Description-Content-Type: text/markdown
+ Requires-Dist: duckdb
+ Requires-Dist: pyarrow
+ Requires-Dist: pandas
+ Dynamic: author
+ Dynamic: author-email
+ Dynamic: classifier
+ Dynamic: description
+ Dynamic: description-content-type
+ Dynamic: home-page
+ Dynamic: requires-dist
+ Dynamic: requires-python
+ Dynamic: summary
+
+
+ # DuckLake Delta Exporter
+ A Python utility to synchronize metadata from a DuckLake database with Delta Lake transaction logs. This allows you to manage data in DuckLake and make it discoverable and queryable by Delta Lake-compatible tools (e.g., Spark, Delta Lake Rust/Python clients).
+
+ # Features
+ - DuckLake to Delta Sync: Generates incremental Delta Lake transaction logs (`_delta_log/*.json`) and checkpoint files (`_delta_log/*.checkpoint.parquet`) based on the latest state of tables in a DuckLake database.
+ - Schema Mapping: Automatically maps DuckDB data types to their Spark SQL equivalents for Delta Lake schema definitions.
+ - Change Detection: Identifies added and removed data files since the last Delta export, ensuring only necessary updates are written to the log.
+ - Checkpointing: Supports creating Delta Lake checkpoint files at a configurable interval for efficient state reconstruction.
+
+ # Installation
+ You can install this package using pip:
+
+ ```
+ pip install ducklake-delta-exporter
+ ```
+
+ # Usage
+ ```
+ from ducklake_delta_exporter import generate_latest_delta_log
+ generate_latest_delta_log('path/to/your/ducklake.db', data_root='/lakehouse/default/Tables', checkpoint_interval=1)
+ ```
@@ -0,0 +1,8 @@
+ README.md
+ setup.py
+ ducklake_delta_exporter/__init__.py
+ ducklake_delta_exporter.egg-info/PKG-INFO
+ ducklake_delta_exporter.egg-info/SOURCES.txt
+ ducklake_delta_exporter.egg-info/dependency_links.txt
+ ducklake_delta_exporter.egg-info/requires.txt
+ ducklake_delta_exporter.egg-info/top_level.txt
@@ -0,0 +1 @@
+ ducklake_delta_exporter
@@ -0,0 +1,4 @@
+ [egg_info]
+ tag_build =
+ tag_date = 0
+
@@ -0,0 +1,27 @@
+ # File: setup.py
+ from setuptools import setup, find_packages
+
+ with open('README.md', encoding='utf-8') as f:
+     long_description = f.read()
+
+ setup(
+     name='ducklake-delta-exporter',
+     version='0.1.0',
+     packages=find_packages(),
+     install_requires=[
+         'duckdb',
+         'pyarrow',
+         'pandas',  # required because the exporter materializes DuckLake metadata with DuckDB's .df()
+     ],
+     author='mim',
+     author_email='your.email@example.com',
+     description='A utility to export DuckLake database metadata to Delta Lake transaction logs.',
+     long_description=long_description,
+     long_description_content_type='text/markdown',
+     url='https://github.com/djouallah/ducklake-delta-exporter',
+     classifiers=[
+         'Programming Language :: Python :: 3',
+         'License :: OSI Approved :: MIT License',
+         'Operating System :: OS Independent',
+         'Intended Audience :: Developers',
+         'Topic :: Software Development :: Libraries :: Python Modules',
+         'Development Status :: 3 - Alpha',
+     ],
+     python_requires='>=3.8',
+ )