ducklake-delta-exporter 0.1.1__py3-none-any.whl → 0.1.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,12 +1,7 @@
  # File: ducklake_delta_exporter.py
- import os
  import json
- import uuid
  import time
  import duckdb
- import pyarrow as pa
- import pyarrow.parquet as pq
- from datetime import datetime

  def map_type_ducklake_to_spark(t):
  """Maps DuckDB data types to their Spark SQL equivalents for the Delta schema."""
@@ -31,148 +26,265 @@ def create_spark_schema_string(fields):
  """Creates a JSON string for the Spark schema from a list of fields."""
  return json.dumps({"type": "struct", "fields": fields})

- def get_spark_checkpoint_schema():
- """Returns the PyArrow schema for a Delta Lake checkpoint file."""
- return pa.schema([
- pa.field("protocol", pa.struct([
- pa.field("minReaderVersion", pa.int32()), # Made nullable
- pa.field("minWriterVersion", pa.int32()) # Made nullable
- ]), nullable=True),
- pa.field("metaData", pa.struct([
- pa.field("id", pa.string()),
- pa.field("name", pa.string()),
- pa.field("description", pa.string()),
- pa.field("format", pa.struct([
- pa.field("provider", pa.string()),
- pa.field("options", pa.map_(pa.string(), pa.string()))
- ])),
- pa.field("schemaString", pa.string()),
- pa.field("partitionColumns", pa.list_(pa.string())),
- pa.field("createdTime", pa.int64()),
- pa.field("configuration", pa.map_(pa.string(), pa.string()))
- ]), nullable=True),
- pa.field("add", pa.struct([
- pa.field("path", pa.string()),
- pa.field("partitionValues", pa.map_(pa.string(), pa.string())),
- pa.field("size", pa.int64()),
- pa.field("modificationTime", pa.int64()),
- pa.field("dataChange", pa.bool_()),
- pa.field("stats", pa.string(), nullable=True),
- pa.field("tags", pa.map_(pa.string(), pa.string()), nullable=True)
- # Removed deletionVector, baseRowId, defaultRowCommitVersion, clusteringProvider
- ]), nullable=True),
- pa.field("remove", pa.struct([
- pa.field("path", pa.string()),
- pa.field("deletionTimestamp", pa.int64()),
- pa.field("dataChange", pa.bool_())
- ]), nullable=True),
- pa.field("commitInfo", pa.struct([
- pa.field("timestamp", pa.timestamp('ms'), False), # Changed from pa.int64() to pa.timestamp('ms')
- pa.field("operation", pa.string()),
- pa.field("operationParameters", pa.map_(pa.string(), pa.string())),
- pa.field("isBlindAppend", pa.bool_(), nullable=True),
- pa.field("engineInfo", pa.string(), nullable=True),
- pa.field("clientVersion", pa.string(), nullable=True)
- ]), nullable=True)
- ])
-
- def get_latest_delta_version_info(delta_log_path, con, table_id):
+ def get_latest_ducklake_snapshot(con, table_id):
+ """
+ Get the latest DuckLake snapshot ID for a table.
  """
- Determines the latest Delta version exported and reconstructs the set of files
- that were part of that Delta version, based on the embedded DuckLake snapshot ID.
- Also retrieves the consistent metaData.id if available from version 0.
+ latest_snapshot = con.execute(f""" SELECT MAX(begin_snapshot) as latest_snapshot FROM ducklake_data_file WHERE table_id = {table_id} """).fetchone()[0]
+ return latest_snapshot

- Returns (latest_delta_version, set_of_files_in_that_version, latest_ducklake_snapshot_id_in_delta, meta_id_from_delta_log).
+ def get_latest_delta_checkpoint(con, table_id):
+ """
+ Check how many times a table has been modified.
  """
- last_delta_version_idx = -1
- last_exported_ducklake_snapshot_id = None
- files_in_last_delta_version = set()
- meta_id_from_delta_log = None # This should be consistent for the table
+ delta_checkpoint = con.execute(f""" SELECT count(snapshot_id) FROM ducklake_snapshot_changes
+ where changes_made like '%:{table_id}' or changes_made like '%:{table_id},%' """).fetchone()[0]
+ print(table_id)
+ print(delta_checkpoint)
+ return delta_checkpoint

- # Collect all files ending with .json
- log_files = [f for f in os.listdir(delta_log_path) if f.endswith('.json')]
+ def get_file_modification_time(dummy_time):
+ """
+ Return a dummy modification time for parquet files.
+ This avoids the latency of actually reading file metadata.
+
+ Args:
+ dummy_time: Timestamp in milliseconds to use as modification time

- if not log_files:
- return last_delta_version_idx, files_in_last_delta_version, last_exported_ducklake_snapshot_id, meta_id_from_delta_log
+ Returns:
+ Modification time in milliseconds
+ """
+ return dummy_time

- try:
- # Collect valid version numbers from file names
- found_versions = []
- for f_name in log_files:
- base_name = f_name.split('.')[0]
- # Check if filename starts with '0000' and consists entirely of digits
- if base_name.startswith('0000') and base_name.isdigit():
- found_versions.append(int(base_name))
+ def create_dummy_json_log(table_root, delta_version, table_info, schema_fields, now):
+ """
+ Create a minimal JSON log file for Spark compatibility using DuckDB.
+ """
+ json_log_file = table_root + f"_delta_log/{delta_version:020d}.json"
+
+ # Create JSON log entries using DuckDB
+ duckdb.execute("DROP TABLE IF EXISTS json_log_table")
+
+ # Protocol entry
+ protocol_json = json.dumps({
+ "protocol": {
+ "minReaderVersion": 1,
+ "minWriterVersion": 2
+ }
+ })
+
+ # Metadata entry
+ metadata_json = json.dumps({
+ "metaData": {
+ "id": str(table_info['table_id']),
+ "name": table_info['table_name'],
+ "description": None,
+ "format": {
+ "provider": "parquet",
+ "options": {}
+ },
+ "schemaString": create_spark_schema_string(schema_fields),
+ "partitionColumns": [],
+ "createdTime": now,
+ "configuration": {
+ "delta.logRetentionDuration": "interval 1 hour"
+ }
+ }
+ })
+
+ # Commit info entry
+ commitinfo_json = json.dumps({
+ "commitInfo": {
+ "timestamp": now,
+ "operation": "CONVERT",
+ "operationParameters": {
+ "convertedFrom": "DuckLake"
+ },
+ "isBlindAppend": True,
+ "engineInfo": "DuckLake-Delta-Exporter",
+ "clientVersion": "1.0.0"
+ }
+ })
+
+ # Create table with JSON entries
+ duckdb.execute("""
+ CREATE TABLE json_log_table AS
+ SELECT ? AS json_line
+ UNION ALL
+ SELECT ? AS json_line
+ UNION ALL
+ SELECT ? AS json_line
+ """, [protocol_json, metadata_json, commitinfo_json])
+
+ # Write JSON log file using DuckDB
+ duckdb.execute(f"COPY (SELECT json_line FROM json_log_table) TO '{json_log_file}' (FORMAT CSV, HEADER false, QUOTE '')")
+
+ # Clean up
+ duckdb.execute("DROP TABLE IF EXISTS json_log_table")
+
+ return json_log_file

- if not found_versions:
- # No valid versioned log files found with the '0000' prefix
- return last_delta_version_idx, files_in_last_delta_version, last_exported_ducklake_snapshot_id, meta_id_from_delta_log
+ def build_file_path(table_root, relative_path):
+ """
+ Build full file path from table root and relative path.
+ Works with both local paths and S3 URLs.
+ """
+ table_root = table_root.rstrip('/')
+ relative_path = relative_path.lstrip('/')
+ return f"{table_root}/{relative_path}"

- # Get the highest version index
- last_delta_version_idx = max(found_versions)
- last_log_file = os.path.join(delta_log_path, f"{last_delta_version_idx:020d}.json")
+ def create_checkpoint_for_latest_snapshot(con, table_info, data_root):
+ """
+ Create a Delta checkpoint file for the latest DuckLake snapshot.
+ """
+ table_root = data_root.rstrip('/') + '/' + table_info['schema_path'] + table_info['table_path']
+
+ # Get the latest snapshot
+ latest_snapshot = get_latest_ducklake_snapshot(con, table_info['table_id'])
+ if latest_snapshot is None:
+ print(f"⚠️ {table_info['schema_name']}.{table_info['table_name']}: No snapshots found")
+ return False
+ delta_version = get_latest_delta_checkpoint(con, table_info['table_id'])
+ checkpoint_file = table_root + f"_delta_log/{delta_version:020d}.checkpoint.parquet"
+ json_log_file = table_root + f"_delta_log/{delta_version:020d}.json"
+
+ try:
+ con.execute(f"SELECT protocol FROM '{checkpoint_file}' limit 0 ")
+ print(f"⚠️ {table_info['schema_name']}.{table_info['table_name']}: Checkpoint file already exists: {checkpoint_file}")
+ except:
+
+ now = int(time.time() * 1000)

- # Attempt to read the last log file for commitInfo and metaData (if present)
- with open(last_log_file, 'r') as f:
- for line in f:
- try:
- action = json.loads(line)
- if 'commitInfo' in action:
- commit_info = action['commitInfo']
- if 'operationParameters' in commit_info and 'duckLakeSnapshotId' in commit_info['operationParameters']:
- last_exported_ducklake_snapshot_id = int(commit_info['operationParameters']['duckLakeSnapshotId'])
- if 'metaData' in action:
- meta_id_from_delta_log = action['metaData'].get('id')
- except json.JSONDecodeError as e:
- print(f"ERROR: Failed to parse JSON line in {last_log_file}: {line.strip()}. Error: {e}")
- except Exception as e:
- print(f"ERROR: Unexpected error processing line in {last_log_file}: {e}")
+ # Get all files for the latest snapshot
+ file_rows = con.execute(f"""
+ SELECT path, file_size_bytes FROM ducklake_data_file
+ WHERE table_id = {table_info['table_id']}
+ AND begin_snapshot <= {latest_snapshot}
+ AND (end_snapshot IS NULL OR end_snapshot > {latest_snapshot})
+ """).fetchall()

- # If metaData.id was not found in the latest log file, try to get it from version 0
- if meta_id_from_delta_log is None:
- v0_log_file = os.path.join(delta_log_path, "00000000000000000000.json")
- if os.path.exists(v0_log_file):
- with open(v0_log_file, 'r') as v0f:
- for v0_line in v0f:
- try:
- v0_action = json.loads(v0_line)
- if 'metaData' in v0_action:
- meta_id_from_delta_log = v0_action['metaData'].get('id')
- break
- except json.JSONDecodeError:
- pass # Ignore parsing errors for v0 metadata, just try next line
-
- # If a valid last_exported_ducklake_snapshot_id was found, reconstruct the files
- if last_exported_ducklake_snapshot_id is not None:
- file_rows = con.execute(f"""
- SELECT path FROM ducklake_data_file
- WHERE table_id = {table_id}
- AND begin_snapshot <= {last_exported_ducklake_snapshot_id} AND (end_snapshot IS NULL OR end_snapshot > {last_exported_ducklake_snapshot_id})
- """).fetchall()
- files_in_last_delta_version = {path.lstrip('/') for path, in file_rows}
- else:
- print(f"WARNING: 'duckLakeSnapshotId' not found or parsed from latest log ({last_log_file}). Cannot reconstruct previous Delta table state accurately for diffing.")
-
- except Exception as e:
- print(f"ERROR: Unhandled exception in get_latest_delta_version_info for {delta_log_path}. Resetting state. Error: {e}")
- last_delta_version_idx = -1 # Reset to -1 if there's an issue parsing or finding files
-
- return last_delta_version_idx, files_in_last_delta_version, last_exported_ducklake_snapshot_id, meta_id_from_delta_log
-
+ # Get schema for the latest snapshot
+ columns = con.execute(f"""
+ SELECT column_name, column_type FROM ducklake_column
+ WHERE table_id = {table_info['table_id']}
+ AND begin_snapshot <= {latest_snapshot}
+ AND (end_snapshot IS NULL OR end_snapshot > {latest_snapshot})
+ ORDER BY column_order
+ """).fetchall()
+
+ # Get or generate table metadata ID
+ table_meta_id = str(table_info['table_id'])
+
+ # Prepare schema
+ schema_fields = [
+ {"name": name, "type": map_type_ducklake_to_spark(typ), "nullable": True, "metadata": {}}
+ for name, typ in columns
+ ]
+
+ # Create checkpoint data using DuckDB directly
+ checkpoint_data = []
+
+ # Create checkpoint data directly in DuckDB using proper data types
+ duckdb.execute("DROP TABLE IF EXISTS checkpoint_table")
+
+ # Create the checkpoint table with proper nested structure
+ duckdb.execute("""
+ CREATE TABLE checkpoint_table AS
+ WITH checkpoint_data AS (
+ -- Protocol record
+ SELECT
+ {'minReaderVersion': 1, 'minWriterVersion': 2}::STRUCT(minReaderVersion INTEGER, minWriterVersion INTEGER) AS protocol,
+ NULL::STRUCT(id VARCHAR, name VARCHAR, description VARCHAR, format STRUCT(provider VARCHAR, options MAP(VARCHAR, VARCHAR)), schemaString VARCHAR, partitionColumns VARCHAR[], createdTime BIGINT, configuration MAP(VARCHAR, VARCHAR)) AS metaData,
+ NULL::STRUCT(path VARCHAR, partitionValues MAP(VARCHAR, VARCHAR), size BIGINT, modificationTime BIGINT, dataChange BOOLEAN, stats VARCHAR, tags MAP(VARCHAR, VARCHAR)) AS add,
+ NULL::STRUCT(path VARCHAR, deletionTimestamp BIGINT, dataChange BOOLEAN) AS remove,
+ NULL::STRUCT(timestamp TIMESTAMP, operation VARCHAR, operationParameters MAP(VARCHAR, VARCHAR), isBlindAppend BOOLEAN, engineInfo VARCHAR, clientVersion VARCHAR) AS commitInfo
+
+ UNION ALL
+
+ -- Metadata record
+ SELECT
+ NULL::STRUCT(minReaderVersion INTEGER, minWriterVersion INTEGER) AS protocol,
+ {
+ 'id': ?,
+ 'name': ?,
+ 'description': NULL,
+ 'format': {'provider': 'parquet', 'options': MAP{}}::STRUCT(provider VARCHAR, options MAP(VARCHAR, VARCHAR)),
+ 'schemaString': ?,
+ 'partitionColumns': []::VARCHAR[],
+ 'createdTime': ?,
+ 'configuration': MAP{'delta.logRetentionDuration': 'interval 1 hour'}
+ }::STRUCT(id VARCHAR, name VARCHAR, description VARCHAR, format STRUCT(provider VARCHAR, options MAP(VARCHAR, VARCHAR)), schemaString VARCHAR, partitionColumns VARCHAR[], createdTime BIGINT, configuration MAP(VARCHAR, VARCHAR)) AS metaData,
+ NULL::STRUCT(path VARCHAR, partitionValues MAP(VARCHAR, VARCHAR), size BIGINT, modificationTime BIGINT, dataChange BOOLEAN, stats VARCHAR, tags MAP(VARCHAR, VARCHAR)) AS add,
+ NULL::STRUCT(path VARCHAR, deletionTimestamp BIGINT, dataChange BOOLEAN) AS remove,
+ NULL::STRUCT(timestamp TIMESTAMP, operation VARCHAR, operationParameters MAP(VARCHAR, VARCHAR), isBlindAppend BOOLEAN, engineInfo VARCHAR, clientVersion VARCHAR) AS commitInfo
+ )
+ SELECT * FROM checkpoint_data
+ """, [table_meta_id, table_info['table_name'], create_spark_schema_string(schema_fields), now])
+
+ # Add file records
+ for path, size in file_rows:
+ rel_path = path.lstrip('/')
+ full_path = build_file_path(table_root, rel_path)
+ mod_time = get_file_modification_time(now)
+
+ duckdb.execute("""
+ INSERT INTO checkpoint_table
+ SELECT
+ NULL::STRUCT(minReaderVersion INTEGER, minWriterVersion INTEGER) AS protocol,
+ NULL::STRUCT(id VARCHAR, name VARCHAR, description VARCHAR, format STRUCT(provider VARCHAR, options MAP(VARCHAR, VARCHAR)), schemaString VARCHAR, partitionColumns VARCHAR[], createdTime BIGINT, configuration MAP(VARCHAR, VARCHAR)) AS metaData,
+ {
+ 'path': ?,
+ 'partitionValues': MAP{}::MAP(VARCHAR, VARCHAR),
+ 'size': ?,
+ 'modificationTime': ?,
+ 'dataChange': true,
+ 'stats': ?,
+ 'tags': NULL::MAP(VARCHAR, VARCHAR)
+ }::STRUCT(path VARCHAR, partitionValues MAP(VARCHAR, VARCHAR), size BIGINT, modificationTime BIGINT, dataChange BOOLEAN, stats VARCHAR, tags MAP(VARCHAR, VARCHAR)) AS add,
+ NULL::STRUCT(path VARCHAR, deletionTimestamp BIGINT, dataChange BOOLEAN) AS remove,
+ NULL::STRUCT(timestamp TIMESTAMP, operation VARCHAR, operationParameters MAP(VARCHAR, VARCHAR), isBlindAppend BOOLEAN, engineInfo VARCHAR, clientVersion VARCHAR) AS commitInfo
+ """, [rel_path, size, mod_time, json.dumps({"numRecords": None})])
+
+ # Create the _delta_log directory if it doesn't exist
+ duckdb.execute(f"COPY (SELECT 43) TO '{table_root}_delta_log' (FORMAT PARQUET, PER_THREAD_OUTPUT, OVERWRITE_OR_IGNORE)")
+
+ # Write the checkpoint file
+ duckdb.execute(f"COPY (SELECT * FROM checkpoint_table) TO '{checkpoint_file}' (FORMAT PARQUET)")
+
+ # Create dummy JSON log file for Spark compatibility
+ create_dummy_json_log(table_root, delta_version, table_info, schema_fields, now)
+
+ # Write the _last_checkpoint file
+ total_records = 2 + len(file_rows) # protocol + metadata + file records
+ duckdb.execute(f"""
+ COPY (SELECT {delta_version} AS version, {total_records} AS size)
+ TO '{table_root}_delta_log/_last_checkpoint' (FORMAT JSON, ARRAY false)
+ """)
+
+ print(f"✅ Exported DuckLake snapshot {latest_snapshot} as Delta checkpoint v{delta_version}")
+ print(f"✅ Created JSON log file: {json_log_file}")
+
+ # Clean up temporary tables
+ duckdb.execute("DROP TABLE IF EXISTS checkpoint_table")
+
+ return True, delta_version, latest_snapshot

- def generate_latest_delta_log(db_path: str, data_root: str='/lakehouse/default/Tables', checkpoint_interval: int = 1):
+ def generate_latest_delta_log(db_path: str, data_root: str = None):
  """
- Generates a Delta Lake transaction log for the LATEST state of each table in a DuckLake database.
- This creates incremental updates to Delta, not a full history.
+ Export the latest DuckLake snapshot for each table as a Delta checkpoint file.
+ Creates both checkpoint files and minimal JSON log files for Spark compatibility.

  Args:
  db_path (str): The path to the DuckLake database file.
  data_root (str): The root directory for the lakehouse data.
- checkpoint_interval (int): The interval at which to create checkpoint files.
  """
  con = duckdb.connect(db_path, read_only=True)
-
- tables = con.sql("""
+
+ if data_root is None:
+ data_root = con.sql("SELECT value FROM ducklake_metadata WHERE key = 'data_path'").fetchone()[0]
+
+ # Get all active tables
+ tables = con.execute("""
  SELECT
  t.table_id,
  t.table_name,
@@ -182,280 +294,33 @@ def generate_latest_delta_log(db_path: str, data_root: str='/lakehouse/default/T
  FROM ducklake_table t
  JOIN ducklake_schema s USING(schema_id)
  WHERE t.end_snapshot IS NULL
- """).df()
-
- for row in tables.itertuples():
- table_key = f"{row.schema_name}.{row.table_name}"
- table_root = os.path.join(data_root, row.schema_path, row.table_path)
- delta_log_path = os.path.join(table_root, "_delta_log")
- os.makedirs(delta_log_path, exist_ok=True)
-
- # 1. Get the LATEST DuckLake snapshot for this table
- latest_ducklake_snapshot_raw = con.execute(f"""
- SELECT MAX(begin_snapshot) FROM ducklake_data_file
- WHERE table_id = {row.table_id}
- """).fetchone()
-
- if not latest_ducklake_snapshot_raw or latest_ducklake_snapshot_raw[0] is None:
- print(f"⚠️ {table_key}: No data files found in DuckLake, skipping Delta log generation.")
- continue
+ """).fetchall()
+
+ total_tables = len(tables)
+ successful_exports = 0
+
+ for table_row in tables:
+ table_info = {
+ 'table_id': table_row[0],
+ 'table_name': table_row[1],
+ 'schema_name': table_row[2],
+ 'table_path': table_row[3],
+ 'schema_path': table_row[4]
+ }

- latest_ducklake_snapshot_id = latest_ducklake_snapshot_raw[0]
-
- # 2. Determine the current state of the Delta table and next Delta version
- last_delta_version_idx, previously_exported_files, last_exported_ducklake_snapshot_id, existing_meta_id = \
- get_latest_delta_version_info(delta_log_path, con, row.table_id)
+ table_key = f"{table_info['schema_name']}.{table_info['table_name']}"
+ print(f"Processing {table_key}...")

- next_delta_version = last_delta_version_idx + 1
-
- # Check if the Delta table is already up-to-date with the latest DuckLake snapshot
- if last_exported_ducklake_snapshot_id == latest_ducklake_snapshot_id:
- print(f"✅ {table_key}: Delta table already at latest DuckLake snapshot {latest_ducklake_snapshot_id} (Delta version {last_delta_version_idx}), skipping export.")
- continue # Nothing to do, skip to next table
-
  try:
- now = int(time.time() * 1000)
- now_timestamp = datetime.fromtimestamp(now / 1000) # Convert to datetime for checkpoint
- log_file = os.path.join(delta_log_path, f"{next_delta_version:020d}.json")
- checkpoint_file = os.path.join(delta_log_path, f"{next_delta_version:020d}.checkpoint.parquet")
-
- # Fetch all current files associated with the LATEST DuckLake snapshot
- file_rows_for_current_version = con.execute(f"""
- SELECT path, file_size_bytes FROM ducklake_data_file
- WHERE table_id = {row.table_id}
- AND begin_snapshot <= {latest_ducklake_snapshot_id} AND (end_snapshot IS NULL OR end_snapshot > {latest_ducklake_snapshot_id})
- """).fetchall()
-
- current_files_map = {}
- for path, size in file_rows_for_current_version:
- rel_path = path.lstrip('/')
- full_path = os.path.join(table_root, rel_path)
- mod_time = int(os.path.getmtime(full_path) * 1000) if os.path.exists(full_path) else now
- current_files_map[rel_path] = {
- "path": rel_path, "size": size, "modification_time": mod_time,
- "stats": json.dumps({"numRecords": None}) # Stats would require reading files
- }
- current_file_paths = set(current_files_map.keys())
-
- added_files_data = []
- removed_files_paths = []
-
- # Calculate the diff between the previous Delta state and the current latest DuckLake snapshot
- added_file_paths = current_file_paths - previously_exported_files
- removed_file_paths_set = previously_exported_files - current_file_paths
+ result = create_checkpoint_for_latest_snapshot(con, table_info, data_root)

- added_files_data = [current_files_map[p] for p in added_file_paths]
- # removed_files_paths only need the path, not full dict
- removed_files_paths = list(removed_file_paths_set)
-
- # If no changes and not the initial version 0, skip writing a log file
- # Version 0 should always be written if it's the first export, even if empty (e.g., empty table)
- if not added_files_data and not removed_files_paths and next_delta_version > 0:
- print(f" {table_key}: No *detectable* changes between previous Delta state and latest DuckLake snapshot {latest_ducklake_snapshot_id}. Skipping new Delta log for version {next_delta_version}.")
- continue # Skip to the next table
-
- # Get schema for metadata (always from the latest DuckLake snapshot)
- columns = con.execute(f"""
- SELECT column_name, column_type FROM ducklake_column
- WHERE table_id = {row.table_id}
- AND begin_snapshot <= {latest_ducklake_snapshot_id} AND (end_snapshot IS NULL OR end_snapshot > {latest_ducklake_snapshot_id})
- ORDER BY column_order
- """).fetchall()
-
- with open(log_file, 'w') as f:
- # Protocol always comes first
- f.write(json.dumps({"protocol": {"minReaderVersion": 1, "minWriterVersion": 2}}) + "\n")
-
- # Determine the table_meta_id
- table_meta_id = existing_meta_id if existing_meta_id else str(uuid.uuid4())
+ if result:
+ successful_exports += 1
+ else:
+ print(f"⚠️ {table_key}: No data to export")

- # Metadata always comes second
- schema_fields = [{"name": name, "type": map_type_ducklake_to_spark(typ), "nullable": True, "metadata": {}} for name, typ in columns]
-
- # Configuration, including logRetentionDuration
- table_configuration = {"delta.logRetentionDuration": "interval 1 hour"}
-
- f.write(json.dumps({
- "metaData": {
- "id": table_meta_id,
- "name": row.table_name if row.table_name else None,
- "description": None,
- "format": {"provider": "parquet", "options": {}},
- "schemaString": create_spark_schema_string(schema_fields),
- "partitionColumns": [],
- "createdTime": now,
- "configuration": table_configuration
- }
- }) + "\n")
-
- # Write remove actions
- for path in removed_files_paths:
- f.write(json.dumps({"remove": {"path": path, "deletionTimestamp": now, "dataChange": True}}) + "\n")
-
- # Write add actions, excluding the explicitly removed fields
- for af in added_files_data:
- f.write(json.dumps({
- "add": {
- "path": af["path"],
- "partitionValues": {},
- "size": af["size"],
- "modificationTime": af["modification_time"],
- "dataChange": True,
- "stats": af["stats"],
- "tags": None # Set to null as per example
- # Removed deletionVector, baseRowId, defaultRowCommitVersion, clusteringProvider
- }
- }) + "\n")
-
- # Prepare operationParameters for commitInfo based on Delta version
- commit_operation_parameters = {
- "mode": "Overwrite",
- "partitionBy": "[]",
- "duckLakeSnapshotId": str(latest_ducklake_snapshot_id)
- }
- commit_operation = "WRITE"
-
- if next_delta_version == 0:
- # For v0, emulate the 'CREATE TABLE' operation parameters as per example
- commit_operation = "CREATE TABLE"
- commit_operation_parameters = {
- "mode": "ErrorIfExists",
- "location": f"{data_root}/{row.schema_path}/{row.table_path}", # Construct location based on data_root
- "protocol": json.dumps({"minReaderVersion": 1, "minWriterVersion": 2}),
- "metadata": json.dumps({ # Stringify metadata object
- "configuration": table_configuration,
- "createdTime": now,
- "description": None,
- "format": {"options": {}, "provider": "parquet"},
- "id": table_meta_id,
- "name": row.table_name if row.table_name else None,
- "partitionColumns": [],
- "schemaString": create_spark_schema_string(schema_fields)
- })
- }
-
- # Write CommitInfo
- f.write(json.dumps({
- "commitInfo": {
- "timestamp": now,
- "operation": commit_operation,
- "operationParameters": commit_operation_parameters,
- "isBlindAppend": not removed_files_paths,
- "engineInfo": "DuckLake-Delta-Export-Latest",
- "clientVersion": "delta-rs.0.18.1" if next_delta_version == 0 else "DuckLake-Delta-Python" # Use example clientVersion for v0
- }
- }) + "\n")
-
- print(f"✅ {table_key}: Delta log written v{next_delta_version} (DuckLake snapshot: {latest_ducklake_snapshot_id})")
-
- # --- CHECKPOINT LOGIC ---
- # Create checkpoint if it's a checkpoint version and doesn't already exist
- if next_delta_version > 0 and next_delta_version % checkpoint_interval == 0 and not os.path.exists(checkpoint_file):
- # Fixed checkpoint creation with proper protocol handling
- checkpoint_records = []
-
- # First record: protocol only
- checkpoint_records.append({
- "protocol": {"minReaderVersion": 1, "minWriterVersion": 2},
- "metaData": None,
- "add": None,
- "remove": None,
- "commitInfo": None
- })
-
- # Second record: metadata only
- checkpoint_meta_id = existing_meta_id if existing_meta_id else str(uuid.uuid4())
- checkpoint_records.append({
- "protocol": None,
- "commitInfo": None,
- "remove": None,
- "add": None,
- "metaData": {
- "id": checkpoint_meta_id,
- "name": row.table_name if row.table_name else None,
- "description": None,
- "format": {"provider": "parquet", "options": {}},
- "schemaString": create_spark_schema_string(schema_fields),
- "partitionColumns": [],
- "createdTime": now,
- "configuration": {"delta.logRetentionDuration": "interval 1 hour"}
- },
- })
-
- # Add all current files from the latest DuckLake snapshot to the checkpoint
- for af_path in current_file_paths:
- af = current_files_map[af_path]
- checkpoint_records.append({
- "protocol": None,
- "metaData": None,
- "remove": None,
- "commitInfo": None,
- "add": {
- "path": af["path"],
- "partitionValues": {},
- "size": af["size"],
- "modificationTime": af["modification_time"],
- "dataChange": True,
- "stats": af["stats"],
- "tags": None # Set to null as per example
- # Removed deletionVector, baseRowId, defaultRowCommitVersion, clusteringProvider
- },
- })
-
- # Create PyArrow table with proper handling of None values
- table_data = {
- 'protocol': [record.get("protocol") for record in checkpoint_records],
- 'metaData': [record.get("metaData") for record in checkpoint_records],
- 'add': [record.get("add") for record in checkpoint_records],
- 'remove': [record.get("remove") for record in checkpoint_records],
- 'commitInfo': [record.get("commitInfo") for record in checkpoint_records]
- }
-
- # Create table directly with target schema to avoid casting issues
- target_schema = get_spark_checkpoint_schema()
- table = pa.table(table_data, schema=target_schema)
- pq.write_table(table, checkpoint_file, compression='snappy')
-
- with open(os.path.join(delta_log_path, "_last_checkpoint"), 'w') as f:
- json.dump({"version": next_delta_version, "size": len(checkpoint_records)}, f)
-
- print(f"📸 {table_key}: Checkpoint created at Delta version {next_delta_version} (DuckLake snapshot: {latest_ducklake_snapshot_id})")
-
- # --- Cleanup old JSON log files and Checkpoint files ---
- print(f"🧹 {table_key}: Cleaning up old log and checkpoint files before Delta version {next_delta_version}...")
- for f_name in os.listdir(delta_log_path):
- base_name = f_name.split('.')[0]
- # Check for versioned JSON log files
- if f_name.endswith('.json') and base_name.startswith('0000') and base_name.isdigit():
- log_version = int(base_name)
- if log_version < next_delta_version:
- file_to_delete = os.path.join(delta_log_path, f_name)
- try:
- os.remove(file_to_delete)
- print(f" Deleted JSON log: {f_name}")
- except OSError as e:
- print(f" Error deleting JSON log {f_name}: {e}")
- # Check for versioned Parquet checkpoint files
- elif f_name.endswith('.checkpoint.parquet'):
- checkpoint_base_name = f_name.split('.checkpoint.parquet')[0]
- if checkpoint_base_name.startswith('0000') and checkpoint_base_name.isdigit():
- checkpoint_version = int(checkpoint_base_name)
- if checkpoint_version < next_delta_version:
- file_to_delete = os.path.join(delta_log_path, f_name)
- try:
- os.remove(file_to_delete)
- print(f" Deleted checkpoint: {f_name}")
- except OSError as e:
- print(f" Error deleting checkpoint {f_name}: {e}")
- print(f"🧹 {table_key}: Cleanup complete.")
-
- elif next_delta_version > 0 and next_delta_version % checkpoint_interval == 0 and os.path.exists(checkpoint_file):
- print(f"⏩ {table_key}: Checkpoint for Delta version {next_delta_version} (DuckLake snapshot: {latest_ducklake_snapshot_id}) already exists, skipping generation.")
-
  except Exception as e:
- print(f"❌ Failed processing {table_key} for Delta version {next_delta_version} (DuckLake snapshot: {latest_ducklake_snapshot_id}): {e}")
- # This should ideally rollback the written log file if it partially succeeded,
- # but for this script, we just log and continue to next table.
-
+ print(f"❌ {table_key}: Failed to export checkpoint - {e}")
+
  con.close()
- print("Delta export finished.")
+ print(f"\n🎉 Export completed! {successful_exports}/{total_tables} tables exported successfully.")
@@ -0,0 +1,123 @@
+ Metadata-Version: 2.4
+ Name: ducklake-delta-exporter
+ Version: 0.1.3
+ Summary: A utility to export DuckLake database metadata to Delta Lake transaction logs.
+ Home-page: https://github.com/djouallah/ducklake_delta_exporter
+ Author: mim
+ Author-email: your.email@example.com
+ Classifier: Programming Language :: Python :: 3
+ Classifier: License :: OSI Approved :: MIT License
+ Classifier: Operating System :: OS Independent
+ Classifier: Intended Audience :: Developers
+ Classifier: Topic :: Software Development :: Libraries :: Python Modules
+ Classifier: Development Status :: 3 - Alpha
+ Requires-Python: >=3.8
+ Description-Content-Type: text/markdown
+ Requires-Dist: duckdb
+ Dynamic: author
+ Dynamic: author-email
+ Dynamic: classifier
+ Dynamic: description
+ Dynamic: description-content-type
+ Dynamic: home-page
+ Dynamic: requires-dist
+ Dynamic: requires-python
+ Dynamic: summary
+
+ # DuckLake Delta Exporter
+
+ A Python package for exporting DuckLake snapshots as Delta Lake checkpoint files, enabling compatibility with Delta Lake readers. It supports local paths, S3, and GCS; for OneLake, use mounted storage, as Azure Storage is not supported.
+
+ This is just a fun project.
+
+ ## Repository
+
+ https://github.com/djouallah/ducklake_delta_exporter
+
+ ## Installation
+
+ ```bash
+ pip install ducklake-delta-exporter
+ ```
+
+ ## Usage
+
+ ```python
+ from ducklake_delta_exporter import generate_latest_delta_log
+
+ # Export all tables from a DuckLake database
+ generate_latest_delta_log("/path/to/ducklake.db")
+
+ # Specify a custom data root directory
+ generate_latest_delta_log("/path/to/ducklake.db", data_root="/custom/data/path")
+ ```
+
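To sanity-check an export, the table can be read back with any Delta reader. A minimal sketch (not part of the package), assuming DuckDB's `delta` extension is installed and the exported table sits at a hypothetical `/custom/data/path/main/my_table`:

```python
import duckdb

con = duckdb.connect()
con.execute("INSTALL delta")  # DuckDB's Delta Lake reader extension
con.execute("LOAD delta")

# delta_scan reads the table through the _delta_log the exporter just wrote
count = con.execute(
    "SELECT count(*) FROM delta_scan('/custom/data/path/main/my_table')"
).fetchone()[0]
print(f"Rows visible to Delta readers: {count}")
```

Spark users can do the equivalent with `spark.read.format("delta").load(...)`.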
+ ## What it does
+
+ This package converts DuckLake table snapshots into Delta Lake format by:
+
+ 1. **Reading DuckLake metadata** - Extracts table schemas, file paths, and snapshot information
+ 2. **Creating Delta checkpoint files** - Generates `.checkpoint.parquet` files with Delta Lake metadata
+ 3. **Writing JSON transaction logs** - Creates minimal `.json` log files for Spark compatibility
+ 4. **Mapping data types** - Converts DuckDB types to Spark SQL equivalents
+
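For orientation, the JSON log written in step 3 carries exactly three actions (protocol, metaData, commitInfo), while the checkpoint Parquet file from step 2 repeats the first two and adds one `add` row per data file. A sketch of the three log actions, with illustrative values only:

```python
# Illustrative shape of the three actions in the versioned .json log (values are examples, not real output)
protocol = {"protocol": {"minReaderVersion": 1, "minWriterVersion": 2}}
metadata = {"metaData": {"id": "42", "name": "my_table", "description": None,
                         "format": {"provider": "parquet", "options": {}},
                         "schemaString": "...", "partitionColumns": [],
                         "createdTime": 1700000000000,
                         "configuration": {"delta.logRetentionDuration": "interval 1 hour"}}}
commit = {"commitInfo": {"timestamp": 1700000000000, "operation": "CONVERT",
                         "operationParameters": {"convertedFrom": "DuckLake"},
                         "isBlindAppend": True,
                         "engineInfo": "DuckLake-Delta-Exporter", "clientVersion": "1.0.0"}}
```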
+ ## Features
+
+ - ✅ **Spark Compatible** - Generated Delta files can be read by Spark and other Delta Lake tools
+ - ✅ **Type Mapping** - Automatic conversion between DuckDB and Spark data types
+ - ✅ **Batch Processing** - Exports all tables in a DuckLake database
+ - ✅ **Error Handling** - Graceful handling of missing snapshots and other issues
+ - ✅ **Progress Reporting** - Clear feedback on export progress and results
+
+ ## Requirements
+
+ - Python 3.8+
+ - DuckDB
+
+ ## File Structure
+
+ After running the exporter, your Delta tables will have the following structure:
+
+ ```
+ your_table/
+ ├── data_file_1.parquet
+ ├── data_file_2.parquet
+ └── _delta_log/
+     ├── 00000000000000000000.json
+     ├── 00000000000000000000.checkpoint.parquet
+     └── _last_checkpoint
+ ```
+
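The `_last_checkpoint` file is a single JSON object telling Delta readers which checkpoint to load and how many actions it contains (protocol + metaData + one `add` per data file). A quick way to inspect it after an export (path hypothetical):

```python
import json

# _last_checkpoint is written as one JSON object per export
with open("your_table/_delta_log/_last_checkpoint") as f:
    last = json.load(f)

print(last["version"])  # Delta version of the checkpoint file
print(last["size"])     # number of actions stored in the checkpoint
```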
+ ## Type Mapping
+
+ The exporter automatically maps DuckDB types to Spark SQL types:
+
+ | DuckDB Type | Spark Type |
+ |-------------|------------|
+ | INTEGER | integer |
+ | BIGINT | long |
+ | FLOAT | double |
+ | DOUBLE | double |
+ | DECIMAL | decimal(10,0) |
+ | BOOLEAN | boolean |
+ | TIMESTAMP | timestamp |
+ | DATE | date |
+ | VARCHAR | string |
+ | Others | string |
+
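Inside the package this mapping is handled by `map_type_ducklake_to_spark`. The sketch below simply mirrors the table above and is illustrative only (the helper name `duckdb_to_spark_type` is made up here):

```python
# Illustrative re-statement of the mapping table above (not the packaged implementation)
def duckdb_to_spark_type(duckdb_type: str) -> str:
    mapping = {
        "INTEGER": "integer",
        "BIGINT": "long",
        "FLOAT": "double",
        "DOUBLE": "double",
        "DECIMAL": "decimal(10,0)",
        "BOOLEAN": "boolean",
        "TIMESTAMP": "timestamp",
        "DATE": "date",
        "VARCHAR": "string",
    }
    # Anything unmapped falls back to string, as the table indicates
    return mapping.get(duckdb_type.upper(), "string")
```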
+ ## Error Handling
+
+ The exporter handles various error conditions:
+
+ - **Missing snapshots** - Skips tables with no data
+ - **Existing checkpoints** - Avoids overwriting existing files
+ - **Schema changes** - Uses the latest schema for each table
+ - **File system errors** - Reports and continues with other tables
+
+ ## License
+
+ MIT License - see LICENSE file for details.
+
+ ## Contributing
+
+ Contributions are welcome! Please feel free to submit a Pull Request.
@@ -0,0 +1,5 @@
+ ducklake_delta_exporter/__init__.py,sha256=oTLyKljSO2MlSJD9sRtg3wcn2nhBGpPMfwEOuKw360Q,14430
+ ducklake_delta_exporter-0.1.3.dist-info/METADATA,sha256=6iaajQHsbymIXvkNnz9hgkXRRy9V2luvnl2m5qcO4Os,3855
+ ducklake_delta_exporter-0.1.3.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+ ducklake_delta_exporter-0.1.3.dist-info/top_level.txt,sha256=cGISjIUrP9eP3UexjiCEWnWy8N5woIBV2QVF21OgdtQ,24
+ ducklake_delta_exporter-0.1.3.dist-info/RECORD,,
@@ -1,71 +0,0 @@
- Metadata-Version: 2.4
- Name: ducklake-delta-exporter
- Version: 0.1.1
- Summary: A utility to export DuckLake database metadata to Delta Lake transaction logs.
- Home-page: https://github.com/djouallah/ducklake_delta_exporter
- Author: mim
- Author-email: your.email@example.com
- Classifier: Programming Language :: Python :: 3
- Classifier: License :: OSI Approved :: MIT License
- Classifier: Operating System :: OS Independent
- Classifier: Intended Audience :: Developers
- Classifier: Topic :: Software Development :: Libraries :: Python Modules
- Classifier: Development Status :: 3 - Alpha
- Requires-Python: >=3.8
- Description-Content-Type: text/markdown
- Requires-Dist: duckdb
- Requires-Dist: pyarrow
- Dynamic: author
- Dynamic: author-email
- Dynamic: classifier
- Dynamic: description
- Dynamic: description-content-type
- Dynamic: home-page
- Dynamic: requires-dist
- Dynamic: requires-python
- Dynamic: summary
-
- # 🦆 DuckLake Delta Exporter
-
- A Python utility to **bridge the gap between DuckLake and Delta Lake** by generating Delta-compatible transaction logs directly from DuckLake metadata.
-
- This isn’t your typical general-purpose library. It’s mostly battle-tested with **OneLake mounted storage**, and while it *should* work with local filesystems, there’s **no support for S3, GCS, or ABFSS**.
-
- It doesn’t use the `deltalake` Python package either. The metadata is handcrafted from scratch — because why not reinvent the wheel for fun and learning?
-
- **Goal?**
- Mostly to annoy DuckDB developers into finally shipping a proper Delta Lake metadata exporter 😎
-
- 🔗 [Source code on GitHub](https://github.com/djouallah/ducklake_delta_exporter)
-
- ---
-
- ## ✨ Features
-
- - **DuckLake → Delta Sync**
- Generates Delta Lake `_delta_log/*.json` transaction files and Parquet checkpoints from the latest DuckLake state.
-
- - **Schema Mapping**
- Converts DuckDB types to their Spark SQL equivalents so Delta can understand them without throwing a tantrum.
-
- - **Change Detection**
- Detects file-level additions/removals since the last export — keeps things incremental and tidy.
-
- - **Checkpointing**
- Automatically writes Delta checkpoints every N versions (configurable), so readers don’t have to replay the entire log from scratch.
-
- ---
-
- ## ⚙️ Installation & Usage
-
- Install via pip:
-
- ```bash
- pip install ducklake-delta-exporter
- ```
-
- ```
- from ducklake_delta_exporter import generate_latest_delta_log
-
- generate_latest_delta_log('/lakehouse/default/Files/meta.db')
- ```
@@ -1,5 +0,0 @@
- ducklake_delta_exporter/__init__.py,sha256=dCfDXzdj-LOi8FXGPuA_bfD4AQgmblMFF-x0YJR6SXA,24797
- ducklake_delta_exporter-0.1.1.dist-info/METADATA,sha256=Vmk_LrjMZIM41ZnueTUnEZLTY1TFoAdOgMZQWjP4Ouk,2539
- ducklake_delta_exporter-0.1.1.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
- ducklake_delta_exporter-0.1.1.dist-info/top_level.txt,sha256=cGISjIUrP9eP3UexjiCEWnWy8N5woIBV2QVF21OgdtQ,24
- ducklake_delta_exporter-0.1.1.dist-info/RECORD,,