ducklake-delta-exporter 0.1.3__tar.gz → 0.2.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: ducklake-delta-exporter
3
- Version: 0.1.3
3
+ Version: 0.2.0
4
4
  Summary: A utility to export DuckLake database metadata to Delta Lake transaction logs.
5
5
  Home-page: https://github.com/djouallah/ducklake_delta_exporter
6
6
  Author: mim
@@ -13,7 +13,7 @@ Classifier: Topic :: Software Development :: Libraries :: Python Modules
13
13
  Classifier: Development Status :: 3 - Alpha
14
14
  Requires-Python: >=3.8
15
15
  Description-Content-Type: text/markdown
16
- Requires-Dist: duckdb
16
+ Requires-Dist: duckdb>=1.4.4
17
17
  Dynamic: author
18
18
  Dynamic: author-email
19
19
  Dynamic: classifier
@@ -28,7 +28,7 @@ Dynamic: summary
28
28
 
29
29
  A Python package for exporting DuckLake snapshots as Delta Lake checkpoint files, enabling compatibility with Delta Lake readers. Supports local paths, S3, and GCS; for OneLake, use mounted storage, since Azure storage is not supported.
30
30
 
31
- this is just a fun project
31
+ this is just a fun project — please vote for proper support in DuckDB: https://github.com/duckdb/duckdb-delta/issues/218
32
32
 
33
33
  ## Repository
34
34
 
@@ -2,7 +2,7 @@
2
2
 
3
3
  A Python package for exporting DuckLake snapshots as Delta Lake checkpoint files, enabling compatibility with Delta Lake readers, support local path, s3 and gcs, for onelake use mounted storage as azure storage is not supported
4
4
 
5
- this is just a fun project
5
+ this is just a fun project, please vote for a proper support in duckdb https://github.com/duckdb/duckdb-delta/issues/218
6
6
 
7
7
  ## Repository
8
8
 
@@ -0,0 +1,486 @@
1
+ # File: ducklake_delta_exporter.py
2
+ import duckdb
3
+
4
+
5
def generate_latest_delta_log(db_path: str):
    """
    Export the latest DuckLake snapshot for each table as Delta checkpoint files.
    Uses DuckDB 1.4.4+ native support for writing to abfss://, s3://, etc.

    Args:
        db_path (str): The path to the DuckLake database file (or connection string).
    """
    # Scheme prefixes that mark a remote (object-store / HTTP) location.
    remote_prefixes = ('abfss://', 's3://', 'gs://', 'az://', 'http://', 'https://')

    # For remote paths (abfss://, s3://, etc.), a catalog file cannot be opened
    # directly — use an in-memory connection and ATTACH it instead.
    is_remote = db_path.startswith(remote_prefixes)

    if is_remote:
        con = duckdb.connect()
        # Load the extension required for the target cloud storage.
        if db_path.startswith(('abfss://', 'az://')):
            con.execute("LOAD azure")
            # Touch the secret catalog so persistent secrets get loaded.
            con.execute("SELECT * FROM duckdb_secrets()")
        elif db_path.startswith('s3://'):
            con.execute("LOAD httpfs")
        con.execute(f"ATTACH '{db_path}' AS ducklake_db (READ_ONLY)")
        con.execute("USE ducklake_db")
    else:
        con = duckdb.connect(db_path, read_only=True)

    # try/finally guarantees the connection is released even when a query
    # fails part-way through (the original leaked it on any exception).
    try:
        # Build export summary - identify which tables have data.
        con.execute("""
            CREATE OR REPLACE TEMP TABLE export_summary AS
            WITH
            data_root_config AS (
                SELECT value AS data_root FROM ducklake_metadata WHERE key = 'data_path'
            ),
            active_tables AS (
                SELECT
                    t.table_id,
                    t.table_name,
                    s.schema_name,
                    t.path AS table_path,
                    s.path AS schema_path,
                    rtrim((SELECT data_root FROM data_root_config), '/') || '/' ||
                    CASE
                        WHEN trim(s.path, '/') != '' THEN trim(s.path, '/') || '/'
                        ELSE ''
                    END ||
                    trim(t.path, '/') AS table_root
                FROM ducklake_table t
                JOIN ducklake_schema s USING(schema_id)
                WHERE t.end_snapshot IS NULL
            ),
            current_snapshot AS (
                SELECT MAX(snapshot_id) AS snapshot_id FROM ducklake_snapshot
            ),
            table_last_modified AS (
                SELECT
                    t.*,
                    COALESCE(
                        (SELECT MAX(sc.snapshot_id)
                         FROM ducklake_snapshot_changes sc
                         WHERE regexp_matches(sc.changes_made, '[:,]' || t.table_id || '([^0-9]|$)')
                        ),
                        (SELECT cs.snapshot_id
                         FROM current_snapshot cs
                         WHERE EXISTS (
                             SELECT 1 FROM ducklake_data_file df
                             WHERE df.table_id = t.table_id
                               AND df.end_snapshot IS NULL
                         )
                        )
                    ) AS last_modified_snapshot,
                    (SELECT COUNT(*) FROM ducklake_data_file df
                     WHERE df.table_id = t.table_id
                       AND df.end_snapshot IS NULL
                    ) AS file_count
                FROM active_tables t
            )
            SELECT
                table_id,
                schema_name,
                table_name,
                table_root,
                CASE
                    WHEN file_count = 0 THEN 'no_data_files'
                    WHEN last_modified_snapshot IS NULL THEN 'no_changes'
                    ELSE 'needs_export'
                END AS status,
                last_modified_snapshot AS snapshot_id,
                file_count
            FROM table_last_modified
        """)

        # Get tables that need export.
        tables_to_export = con.execute("""
            SELECT table_id, schema_name, table_name, table_root, snapshot_id, file_count
            FROM export_summary
            WHERE status = 'needs_export'
        """).fetchall()

        # Show summary.
        summary = con.execute("""
            SELECT status, COUNT(*) as cnt FROM export_summary GROUP BY status
        """).fetchall()

        for status, cnt in summary:
            print(f"  {status}: {cnt} tables")

        if not tables_to_export:
            print("\n✅ No tables need export.")
            return

        print(f"\n📦 Exporting {len(tables_to_export)} tables...")

        # Process each table independently so one failure does not abort the run.
        for table_id, schema_name, table_name, table_root, snapshot_id, file_count in tables_to_export:
            table_key = f"{schema_name}.{table_name}"

            # Check if a checkpoint already exists for this snapshot: probing the
            # file with a SELECT avoids needing filesystem-specific stat calls.
            checkpoint_path = f"{table_root}/_delta_log/{snapshot_id:020d}.checkpoint.parquet"
            try:
                con.execute(f"SELECT 1 FROM '{checkpoint_path}' LIMIT 1")
                print(f"  ⏭️ {table_key}: snapshot {snapshot_id} already exported")
                continue
            except Exception:
                pass  # File doesn't exist, proceed with export.

            print(f"\n  Processing {table_key}...")

            try:
                _export_table(con, remote_prefixes, table_id, table_name, snapshot_id, table_root)
                print(f"  ✅ {table_key}: exported snapshot {snapshot_id} ({file_count} files)")
            except Exception as e:
                print(f"  ❌ {table_key}: {e}")

        # Cleanup temp tables.
        con.execute("DROP TABLE IF EXISTS export_summary")
        con.execute("DROP TABLE IF EXISTS temp_checkpoint_parquet")
        con.execute("DROP TABLE IF EXISTS temp_checkpoint_json")
        con.execute("DROP TABLE IF EXISTS temp_last_checkpoint")

        print("\n🎉 Export completed!")
    finally:
        con.close()


def _export_table(con, remote_prefixes, table_id, table_name, snapshot_id, table_root):
    """Write the checkpoint parquet, JSON commit log and _last_checkpoint marker
    for a single table's latest snapshot.

    Args:
        con: Open DuckDB connection with the DuckLake catalog in scope.
        remote_prefixes: URL scheme prefixes identifying remote storage.
        table_id: Catalog id of the table being exported.
        table_name: Table name (used in the Delta metaData action).
        snapshot_id: Snapshot id to publish as the Delta log version.
        table_root: Fully-resolved root path of the table's data directory.

    Raises:
        Exception: any DuckDB error is propagated to the caller, which logs it.
    """
    # Build checkpoint parquet data for this table (protocol, metaData and one
    # add action per live data file, distinguished by row_order).
    con.execute("""
        CREATE OR REPLACE TEMP TABLE temp_checkpoint_parquet AS
        WITH
        table_schemas AS (
            SELECT
                ? AS table_id,
                ? AS table_name,
                ? AS snapshot_id,
                ? AS table_root,
                list({
                    'name': c.column_name,
                    'type':
                        CASE
                            WHEN contains(lower(c.column_type), 'int') AND contains(c.column_type, '64') THEN 'long'
                            WHEN contains(lower(c.column_type), 'int') THEN 'integer'
                            WHEN contains(lower(c.column_type), 'float') THEN 'double'
                            WHEN contains(lower(c.column_type), 'double') THEN 'double'
                            WHEN contains(lower(c.column_type), 'bool') THEN 'boolean'
                            WHEN contains(lower(c.column_type), 'timestamp') THEN 'timestamp'
                            WHEN contains(lower(c.column_type), 'date') THEN 'date'
                            WHEN contains(lower(c.column_type), 'decimal') THEN lower(c.column_type)
                            ELSE 'string'
                        END,
                    'nullable': true,
                    'metadata': MAP{}::MAP(VARCHAR, VARCHAR)
                }::STRUCT(name VARCHAR, type VARCHAR, nullable BOOLEAN, metadata MAP(VARCHAR, VARCHAR)) ORDER BY c.column_order) AS schema_fields
            FROM ducklake_column c
            WHERE c.table_id = ?
              AND c.end_snapshot IS NULL
        ),
        file_column_stats_agg AS (
            SELECT
                df.data_file_id,
                c.column_name,
                ANY_VALUE(c.column_type) AS column_type,
                MAX(fcs.value_count) AS value_count,
                MIN(fcs.min_value) AS min_value,
                MAX(fcs.max_value) AS max_value,
                MAX(fcs.null_count) AS null_count
            FROM ducklake_data_file df
            LEFT JOIN ducklake_file_column_stats fcs ON df.data_file_id = fcs.data_file_id
            LEFT JOIN ducklake_column c ON fcs.column_id = c.column_id
            WHERE df.table_id = ?
              AND df.end_snapshot IS NULL
              AND c.column_id IS NOT NULL
              AND c.end_snapshot IS NULL
            GROUP BY df.data_file_id, c.column_name
        ),
        file_column_stats_transformed AS (
            SELECT
                fca.data_file_id,
                fca.column_name,
                fca.column_type,
                fca.value_count,
                fca.null_count,
                CASE
                    WHEN fca.min_value IS NULL THEN NULL
                    WHEN contains(lower(fca.column_type), 'timestamp') THEN
                        regexp_replace(
                            regexp_replace(replace(fca.min_value, ' ', 'T'), '[+-]\\d{2}(?::\\d{2})?$', ''),
                            '^([^.]+)$', '\\1.000'
                        ) || 'Z'
                    WHEN contains(lower(fca.column_type), 'date') THEN fca.min_value
                    WHEN contains(lower(fca.column_type), 'bool') THEN CAST(lower(fca.min_value) IN ('true', 't', '1', 'yes') AS VARCHAR)
                    WHEN contains(lower(fca.column_type), 'int') OR contains(lower(fca.column_type), 'float')
                         OR contains(lower(fca.column_type), 'double') OR contains(lower(fca.column_type), 'decimal') THEN
                        CASE WHEN contains(fca.min_value, '.') OR contains(lower(fca.min_value), 'e')
                             THEN CAST(TRY_CAST(fca.min_value AS DOUBLE) AS VARCHAR)
                             ELSE CAST(TRY_CAST(fca.min_value AS BIGINT) AS VARCHAR)
                        END
                    ELSE fca.min_value
                END AS transformed_min,
                CASE
                    WHEN fca.max_value IS NULL THEN NULL
                    WHEN contains(lower(fca.column_type), 'timestamp') THEN
                        regexp_replace(
                            regexp_replace(replace(fca.max_value, ' ', 'T'), '[+-]\\d{2}(?::\\d{2})?$', ''),
                            '^([^.]+)$', '\\1.000'
                        ) || 'Z'
                    WHEN contains(lower(fca.column_type), 'date') THEN fca.max_value
                    WHEN contains(lower(fca.column_type), 'bool') THEN CAST(lower(fca.max_value) IN ('true', 't', '1', 'yes') AS VARCHAR)
                    WHEN contains(lower(fca.column_type), 'int') OR contains(lower(fca.column_type), 'float')
                         OR contains(lower(fca.column_type), 'double') OR contains(lower(fca.column_type), 'decimal') THEN
                        CASE WHEN contains(fca.max_value, '.') OR contains(lower(fca.max_value), 'e')
                             THEN CAST(TRY_CAST(fca.max_value AS DOUBLE) AS VARCHAR)
                             ELSE CAST(TRY_CAST(fca.max_value AS BIGINT) AS VARCHAR)
                        END
                    ELSE fca.max_value
                END AS transformed_max
            FROM file_column_stats_agg fca
        ),
        file_metadata AS (
            SELECT
                ts.table_id,
                ts.table_name,
                ts.snapshot_id,
                ts.table_root,
                ts.schema_fields,
                df.data_file_id,
                df.path AS file_path,
                df.file_size_bytes,
                COALESCE(MAX(fct.value_count), 0) AS num_records,
                COALESCE(map_from_entries(list({
                    'key': fct.column_name,
                    'value': fct.transformed_min
                } ORDER BY fct.column_name) FILTER (WHERE fct.column_name IS NOT NULL AND fct.transformed_min IS NOT NULL)), MAP{}::MAP(VARCHAR, VARCHAR)) AS min_values,
                COALESCE(map_from_entries(list({
                    'key': fct.column_name,
                    'value': fct.transformed_max
                } ORDER BY fct.column_name) FILTER (WHERE fct.column_name IS NOT NULL AND fct.transformed_max IS NOT NULL)), MAP{}::MAP(VARCHAR, VARCHAR)) AS max_values,
                COALESCE(map_from_entries(list({
                    'key': fct.column_name,
                    'value': fct.null_count
                } ORDER BY fct.column_name) FILTER (WHERE fct.column_name IS NOT NULL AND fct.null_count IS NOT NULL)), MAP{}::MAP(VARCHAR, BIGINT)) AS null_count
            FROM table_schemas ts
            JOIN ducklake_data_file df ON df.table_id = ts.table_id
            LEFT JOIN file_column_stats_transformed fct ON df.data_file_id = fct.data_file_id
            WHERE df.end_snapshot IS NULL
            GROUP BY ts.table_id, ts.table_name, ts.snapshot_id,
                     ts.table_root, ts.schema_fields, df.data_file_id, df.path, df.file_size_bytes
        ),
        table_aggregates AS (
            SELECT
                table_id,
                table_name,
                snapshot_id,
                table_root,
                schema_fields,
                COUNT(*) AS num_files,
                SUM(num_records) AS total_rows,
                SUM(file_size_bytes) AS total_bytes,
                list({
                    'path': ltrim(file_path, '/'),
                    'partitionValues': MAP{}::MAP(VARCHAR, VARCHAR),
                    'size': file_size_bytes,
                    'modificationTime': epoch_ms(now()),
                    'dataChange': true,
                    'stats': COALESCE(to_json({
                        'numRecords': COALESCE(num_records, 0),
                        'minValues': COALESCE(min_values, MAP{}::MAP(VARCHAR, VARCHAR)),
                        'maxValues': COALESCE(max_values, MAP{}::MAP(VARCHAR, VARCHAR)),
                        'nullCount': COALESCE(null_count, MAP{}::MAP(VARCHAR, BIGINT))
                    }), '{"numRecords":0}'),
                    'tags': MAP{}::MAP(VARCHAR, VARCHAR)
                }::STRUCT(
                    path VARCHAR,
                    partitionValues MAP(VARCHAR, VARCHAR),
                    size BIGINT,
                    modificationTime BIGINT,
                    dataChange BOOLEAN,
                    stats VARCHAR,
                    tags MAP(VARCHAR, VARCHAR)
                )) AS add_entries
            FROM file_metadata
            GROUP BY table_id, table_name, snapshot_id, table_root, schema_fields
        ),
        checkpoint_data AS (
            SELECT
                ta.*,
                epoch_ms(now()) AS now_ms,
                uuid()::VARCHAR AS txn_id,
                (substring(md5(ta.table_id::VARCHAR || '-metadata'), 1, 8) || '-' ||
                 substring(md5(ta.table_id::VARCHAR || '-metadata'), 9, 4) || '-' ||
                 substring(md5(ta.table_id::VARCHAR || '-metadata'), 13, 4) || '-' ||
                 substring(md5(ta.table_id::VARCHAR || '-metadata'), 17, 4) || '-' ||
                 substring(md5(ta.table_id::VARCHAR || '-metadata'), 21, 12)) AS meta_id,
                to_json({'type': 'struct', 'fields': ta.schema_fields}) AS schema_string
            FROM table_aggregates ta
        ),
        checkpoint_parquet_data AS (
            SELECT
                cd.table_id,
                cd.table_name,
                cd.snapshot_id,
                cd.table_root,
                cd.meta_id,
                cd.now_ms,
                cd.txn_id,
                cd.schema_string,
                cd.num_files,
                cd.total_rows,
                cd.total_bytes,
                {'minReaderVersion': 1, 'minWriterVersion': 2} AS protocol,
                NULL AS metaData,
                NULL AS add,
                NULL::STRUCT(path VARCHAR, deletionTimestamp BIGINT, dataChange BOOLEAN) AS remove,
                NULL::STRUCT(timestamp TIMESTAMP, operation VARCHAR, operationParameters MAP(VARCHAR, VARCHAR), isolationLevel VARCHAR, isBlindAppend BOOLEAN, operationMetrics MAP(VARCHAR, VARCHAR), engineInfo VARCHAR, txnId VARCHAR) AS commitInfo,
                1 AS row_order
            FROM checkpoint_data cd
            UNION ALL
            SELECT
                cd.table_id,
                cd.table_name,
                cd.snapshot_id,
                cd.table_root,
                cd.meta_id,
                cd.now_ms,
                cd.txn_id,
                cd.schema_string,
                cd.num_files,
                cd.total_rows,
                cd.total_bytes,
                NULL AS protocol,
                {
                    'id': cd.meta_id,
                    'name': cd.table_name,
                    'format': {'provider': 'parquet', 'options': MAP{}::MAP(VARCHAR, VARCHAR)}::STRUCT(provider VARCHAR, options MAP(VARCHAR, VARCHAR)),
                    'schemaString': cd.schema_string,
                    'partitionColumns': []::VARCHAR[],
                    'createdTime': cd.now_ms,
                    'configuration': MAP{}::MAP(VARCHAR, VARCHAR)
                }::STRUCT(id VARCHAR, name VARCHAR, format STRUCT(provider VARCHAR, options MAP(VARCHAR, VARCHAR)), schemaString VARCHAR, partitionColumns VARCHAR[], createdTime BIGINT, configuration MAP(VARCHAR, VARCHAR)) AS metaData,
                NULL AS add,
                NULL::STRUCT(path VARCHAR, deletionTimestamp BIGINT, dataChange BOOLEAN) AS remove,
                NULL::STRUCT(timestamp TIMESTAMP, operation VARCHAR, operationParameters MAP(VARCHAR, VARCHAR), isolationLevel VARCHAR, isBlindAppend BOOLEAN, operationMetrics MAP(VARCHAR, VARCHAR), engineInfo VARCHAR, txnId VARCHAR) AS commitInfo,
                2 AS row_order
            FROM checkpoint_data cd
            UNION ALL
            SELECT
                cd.table_id,
                cd.table_name,
                cd.snapshot_id,
                cd.table_root,
                cd.meta_id,
                cd.now_ms,
                cd.txn_id,
                cd.schema_string,
                cd.num_files,
                cd.total_rows,
                cd.total_bytes,
                NULL AS protocol,
                NULL AS metaData,
                unnest(cd.add_entries) AS add,
                NULL::STRUCT(path VARCHAR, deletionTimestamp BIGINT, dataChange BOOLEAN) AS remove,
                NULL::STRUCT(timestamp TIMESTAMP, operation VARCHAR, operationParameters MAP(VARCHAR, VARCHAR), isolationLevel VARCHAR, isBlindAppend BOOLEAN, operationMetrics MAP(VARCHAR, VARCHAR), engineInfo VARCHAR, txnId VARCHAR) AS commitInfo,
                3 AS row_order
            FROM checkpoint_data cd
        )
        SELECT * FROM checkpoint_parquet_data
    """, [table_id, table_name, snapshot_id, table_root, table_id, table_id])

    # Build JSON log content (commitInfo + metaData + protocol, newline-separated).
    con.execute("""
        CREATE OR REPLACE TEMP TABLE temp_checkpoint_json AS
        SELECT DISTINCT
            p.table_id,
            p.table_root,
            p.snapshot_id,
            p.num_files,
            to_json({
                'commitInfo': {
                    'timestamp': p.now_ms,
                    'operation': 'CONVERT',
                    'operationParameters': {
                        'convertedFrom': 'DuckLake',
                        'duckLakeSnapshotId': p.snapshot_id::VARCHAR,
                        'partitionBy': '[]'
                    },
                    'isolationLevel': 'Serializable',
                    'isBlindAppend': false,
                    'operationMetrics': {
                        'numFiles': p.num_files::VARCHAR,
                        'numOutputRows': p.total_rows::VARCHAR,
                        'numOutputBytes': p.total_bytes::VARCHAR
                    },
                    'engineInfo': 'DuckLake-Delta-Exporter/1.0.0',
                    'txnId': p.txn_id
                }
            }) || chr(10) ||
            to_json({
                'metaData': {
                    'id': p.meta_id,
                    'name': p.table_name,
                    'format': {'provider': 'parquet', 'options': MAP{}},
                    'schemaString': p.schema_string::VARCHAR,
                    'partitionColumns': [],
                    'createdTime': p.now_ms,
                    'configuration': MAP{}
                }
            }) || chr(10) ||
            to_json({
                'protocol': {'minReaderVersion': 1, 'minWriterVersion': 2}
            }) AS content
        FROM temp_checkpoint_parquet p
        WHERE p.row_order = 1
    """)

    # Build _last_checkpoint content (version + action count).
    con.execute("""
        CREATE OR REPLACE TEMP TABLE temp_last_checkpoint AS
        SELECT
            table_id,
            table_root,
            snapshot_id,
            '{"version":' || snapshot_id || ',"size":' || (2 + num_files) || '}' AS content
        FROM temp_checkpoint_parquet
        WHERE row_order = 1
    """)

    # Get output file paths.
    paths = con.execute("""
        SELECT
            table_root || '/_delta_log/' || lpad(snapshot_id::VARCHAR, 20, '0') || '.checkpoint.parquet' AS checkpoint_file,
            table_root || '/_delta_log/' || lpad(snapshot_id::VARCHAR, 20, '0') || '.json' AS json_file,
            table_root || '/_delta_log/_last_checkpoint' AS last_checkpoint_file,
            table_root || '/_delta_log' AS delta_log_path
        FROM temp_checkpoint_parquet
        WHERE row_order = 1
        LIMIT 1
    """).fetchone()

    checkpoint_file, json_file, last_checkpoint_file, delta_log_path = paths

    # Create the _delta_log directory for local paths: a partitioned CSV COPY
    # is used as a portable mkdir-equivalent inside DuckDB.
    if not table_root.startswith(remote_prefixes):
        con.execute(f"""
            COPY (SELECT 1 AS id, 1 AS ".duckdb_init")
            TO '{delta_log_path}'
            (FORMAT CSV, PARTITION_BY (".duckdb_init"), OVERWRITE_OR_IGNORE)
        """)

    # Write checkpoint parquet.
    con.execute(f"""
        COPY (SELECT protocol, metaData, add, remove, commitInfo
              FROM temp_checkpoint_parquet ORDER BY row_order)
        TO '{checkpoint_file}' (FORMAT PARQUET)
    """)

    # Write JSON log (QUOTE '' keeps the JSON unescaped in the CSV writer).
    con.execute(f"""
        COPY (SELECT content FROM temp_checkpoint_json)
        TO '{json_file}' (FORMAT CSV, HEADER false, QUOTE '')
    """)

    # Write last checkpoint marker.
    con.execute(f"""
        COPY (SELECT content FROM temp_last_checkpoint)
        TO '{last_checkpoint_file}' (FORMAT CSV, HEADER false, QUOTE '')
    """)
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: ducklake-delta-exporter
3
- Version: 0.1.3
3
+ Version: 0.2.0
4
4
  Summary: A utility to export DuckLake database metadata to Delta Lake transaction logs.
5
5
  Home-page: https://github.com/djouallah/ducklake_delta_exporter
6
6
  Author: mim
@@ -13,7 +13,7 @@ Classifier: Topic :: Software Development :: Libraries :: Python Modules
13
13
  Classifier: Development Status :: 3 - Alpha
14
14
  Requires-Python: >=3.8
15
15
  Description-Content-Type: text/markdown
16
- Requires-Dist: duckdb
16
+ Requires-Dist: duckdb>=1.4.4
17
17
  Dynamic: author
18
18
  Dynamic: author-email
19
19
  Dynamic: classifier
@@ -28,7 +28,7 @@ Dynamic: summary
28
28
 
29
29
  A Python package for exporting DuckLake snapshots as Delta Lake checkpoint files, enabling compatibility with Delta Lake readers, support local path, s3 and gcs, for onelake use mounted storage as azure storage is not supported
30
30
 
31
- this is just a fun project
31
+ this is just a fun project, please vote for a proper support in duckdb https://github.com/duckdb/duckdb-delta/issues/218
32
32
 
33
33
  ## Repository
34
34
 
@@ -3,9 +3,9 @@ from setuptools import setup, find_packages
3
3
 
4
4
  setup(
5
5
  name='ducklake-delta-exporter',
6
- version='0.1.3',
6
+ version='0.2.0',
7
7
  packages=find_packages(),
8
- install_requires=['duckdb'],
8
+ install_requires=['duckdb>=1.4.4'],
9
9
  author='mim',
10
10
  author_email='your.email@example.com',
11
11
  description='A utility to export DuckLake database metadata to Delta Lake transaction logs.',
@@ -1,326 +0,0 @@
1
- # File: ducklake_delta_exporter.py
2
- import json
3
- import time
4
- import duckdb
5
-
6
def map_type_ducklake_to_spark(t):
    """Map a DuckDB column type name to its Spark SQL equivalent for the Delta schema.

    Matching is substring-based and case-insensitive; the first matching
    category wins. Unrecognized types fall back to 'string'.
    """
    lowered = t.lower()
    # Integral types first: 64-bit widths become Spark 'long', the rest 'integer'.
    if 'int' in lowered:
        return 'long' if '64' in lowered else 'integer'
    # Remaining categories, checked in the same precedence order as before.
    for marker, spark_type in (
        ('float', 'double'),
        ('double', 'double'),
        ('decimal', 'decimal(10,0)'),
        ('bool', 'boolean'),
        ('timestamp', 'timestamp'),
        ('date', 'date'),
    ):
        if marker in lowered:
            return spark_type
    return 'string'
24
-
25
def create_spark_schema_string(fields):
    """Serialize *fields* into the Spark struct-schema JSON string Delta expects."""
    schema = {"type": "struct", "fields": fields}
    return json.dumps(schema)
28
-
29
def get_latest_ducklake_snapshot(con, table_id):
    """Return the most recent DuckLake snapshot id that added data files to *table_id*.

    Args:
        con: An open DuckDB connection with the DuckLake catalog in scope.
        table_id: Numeric id of the table in ``ducklake_table``.

    Returns:
        The maximum ``begin_snapshot`` among the table's data files, or ``None``
        when the table has no data files yet.
    """
    # Bound parameter instead of f-string interpolation: a malformed table_id
    # can no longer alter the SQL statement.
    row = con.execute(
        "SELECT MAX(begin_snapshot) as latest_snapshot FROM ducklake_data_file WHERE table_id = ?",
        [table_id],
    ).fetchone()
    return row[0]
35
-
36
def get_latest_delta_checkpoint(con, table_id):
    """Count how many snapshots have modified *table_id*.

    The count doubles as the Delta log version number for the next export.

    Args:
        con: An open DuckDB connection with the DuckLake catalog in scope.
        table_id: Numeric id of the table in ``ducklake_table``.

    Returns:
        Number of entries in ``ducklake_snapshot_changes`` whose change list
        references the table id (at the end of the list or followed by a comma).
    """
    # Bound parameters replace the original f-string interpolation, and the
    # leftover debug print() calls have been removed.
    delta_checkpoint = con.execute(
        """SELECT count(snapshot_id) FROM ducklake_snapshot_changes
           WHERE changes_made LIKE '%:' || ? OR changes_made LIKE '%:' || ? || ',%'""",
        [str(table_id), str(table_id)],
    ).fetchone()[0]
    return delta_checkpoint
45
-
46
def get_file_modification_time(dummy_time):
    """Return *dummy_time* unchanged.

    Stands in for a real file-metadata lookup: reading the true parquet
    modification time would add per-file latency, so the caller supplies a
    single timestamp that is echoed back for every file.

    Args:
        dummy_time: Timestamp in milliseconds to use as the modification time.

    Returns:
        The same timestamp, in milliseconds.
    """
    return dummy_time
58
-
59
def create_dummy_json_log(table_root, delta_version, table_info, schema_fields, now):
    """
    Create a minimal JSON log file for Spark compatibility using DuckDB.
    """
    # Delta log file name is the zero-padded 20-digit version number.
    json_log_file = table_root + f"_delta_log/{delta_version:020d}.json"

    # Create JSON log entries using DuckDB
    # NOTE(review): uses the module-level default connection (duckdb.execute),
    # not the catalog connection the caller holds — presumably intentional so
    # the read-only catalog connection is never written to; confirm.
    duckdb.execute("DROP TABLE IF EXISTS json_log_table")

    # Protocol entry — minimum reader/writer versions of the Delta protocol.
    protocol_json = json.dumps({
        "protocol": {
            "minReaderVersion": 1,
            "minWriterVersion": 2
        }
    })

    # Metadata entry — table identity, Spark schema string and table config.
    metadata_json = json.dumps({
        "metaData": {
            "id": str(table_info['table_id']),
            "name": table_info['table_name'],
            "description": None,
            "format": {
                "provider": "parquet",
                "options": {}
            },
            "schemaString": create_spark_schema_string(schema_fields),
            "partitionColumns": [],
            "createdTime": now,
            "configuration": {
                "delta.logRetentionDuration": "interval 1 hour"
            }
        }
    })

    # Commit info entry — records that this version was converted from DuckLake.
    commitinfo_json = json.dumps({
        "commitInfo": {
            "timestamp": now,
            "operation": "CONVERT",
            "operationParameters": {
                "convertedFrom": "DuckLake"
            },
            "isBlindAppend": True,
            "engineInfo": "DuckLake-Delta-Exporter",
            "clientVersion": "1.0.0"
        }
    })

    # Create table with JSON entries
    # NOTE(review): relies on UNION ALL preserving row order so the log lines
    # come out as protocol, metaData, commitInfo — verify DuckDB guarantees this.
    duckdb.execute("""
    CREATE TABLE json_log_table AS
    SELECT ? AS json_line
    UNION ALL
    SELECT ? AS json_line
    UNION ALL
    SELECT ? AS json_line
    """, [protocol_json, metadata_json, commitinfo_json])

    # Write JSON log file using DuckDB; QUOTE '' stops the CSV writer from
    # escaping the embedded JSON quotes.
    duckdb.execute(f"COPY (SELECT json_line FROM json_log_table) TO '{json_log_file}' (FORMAT CSV, HEADER false, QUOTE '')")

    # Clean up
    duckdb.execute("DROP TABLE IF EXISTS json_log_table")

    return json_log_file
126
-
127
def build_file_path(table_root, relative_path):
    """Join a table root and a relative path into one full file path.

    Works with both local paths and S3-style URLs: any trailing slash on the
    root and leading slash on the relative part are stripped before joining
    with a single '/'.
    """
    root = table_root.rstrip('/')
    leaf = relative_path.lstrip('/')
    return f"{root}/{leaf}"
135
-
136
def create_checkpoint_for_latest_snapshot(con, table_info, data_root):
    """
    Create a Delta checkpoint file for the latest DuckLake snapshot.

    Args:
        con: Open DuckDB connection with the DuckLake catalog in scope.
        table_info (dict): table_id, table_name, schema_name, table_path,
            schema_path for one live table.
        data_root (str): Root directory of the lakehouse data.

    Returns:
        False when the table has no snapshots; otherwise a tuple
        (True, delta_version, latest_snapshot). Callers that only test
        truthiness remain compatible.
    """
    table_root = data_root.rstrip('/') + '/' + table_info['schema_path'] + table_info['table_path']

    # Get the latest snapshot; nothing to do for tables without data files.
    latest_snapshot = get_latest_ducklake_snapshot(con, table_info['table_id'])
    if latest_snapshot is None:
        print(f"⚠️ {table_info['schema_name']}.{table_info['table_name']}: No snapshots found")
        return False
    delta_version = get_latest_delta_checkpoint(con, table_info['table_id'])
    checkpoint_file = table_root + f"_delta_log/{delta_version:020d}.checkpoint.parquet"
    json_log_file = table_root + f"_delta_log/{delta_version:020d}.json"

    try:
        # Probe the target file; success means this version was already exported.
        con.execute(f"SELECT protocol FROM '{checkpoint_file}' limit 0 ")
        print(f"⚠️ {table_info['schema_name']}.{table_info['table_name']}: Checkpoint file already exists: {checkpoint_file}")
    except Exception:
        # Was a bare `except:` — narrowed so KeyboardInterrupt/SystemExit
        # are no longer swallowed. Reaching here means the file is missing,
        # so export it now.
        now = int(time.time() * 1000)

        # All files live in the latest snapshot's visibility window.
        file_rows = con.execute(f"""
            SELECT path, file_size_bytes FROM ducklake_data_file
            WHERE table_id = {table_info['table_id']}
            AND begin_snapshot <= {latest_snapshot}
            AND (end_snapshot IS NULL OR end_snapshot > {latest_snapshot})
        """).fetchall()

        # Schema as of the latest snapshot, in declared column order.
        columns = con.execute(f"""
            SELECT column_name, column_type FROM ducklake_column
            WHERE table_id = {table_info['table_id']}
            AND begin_snapshot <= {latest_snapshot}
            AND (end_snapshot IS NULL OR end_snapshot > {latest_snapshot})
            ORDER BY column_order
        """).fetchall()

        # The DuckLake table id doubles as the Delta metadata id.
        table_meta_id = str(table_info['table_id'])

        # Spark-style schema fields for the metaData action.
        schema_fields = [
            {"name": name, "type": map_type_ducklake_to_spark(typ), "nullable": True, "metadata": {}}
            for name, typ in columns
        ]

        # Build the checkpoint rows (protocol + metaData) in the module-level
        # DuckDB connection with the exact nested struct layout Delta expects.
        duckdb.execute("DROP TABLE IF EXISTS checkpoint_table")

        duckdb.execute("""
        CREATE TABLE checkpoint_table AS
        WITH checkpoint_data AS (
            -- Protocol record
            SELECT
                {'minReaderVersion': 1, 'minWriterVersion': 2}::STRUCT(minReaderVersion INTEGER, minWriterVersion INTEGER) AS protocol,
                NULL::STRUCT(id VARCHAR, name VARCHAR, description VARCHAR, format STRUCT(provider VARCHAR, options MAP(VARCHAR, VARCHAR)), schemaString VARCHAR, partitionColumns VARCHAR[], createdTime BIGINT, configuration MAP(VARCHAR, VARCHAR)) AS metaData,
                NULL::STRUCT(path VARCHAR, partitionValues MAP(VARCHAR, VARCHAR), size BIGINT, modificationTime BIGINT, dataChange BOOLEAN, stats VARCHAR, tags MAP(VARCHAR, VARCHAR)) AS add,
                NULL::STRUCT(path VARCHAR, deletionTimestamp BIGINT, dataChange BOOLEAN) AS remove,
                NULL::STRUCT(timestamp TIMESTAMP, operation VARCHAR, operationParameters MAP(VARCHAR, VARCHAR), isBlindAppend BOOLEAN, engineInfo VARCHAR, clientVersion VARCHAR) AS commitInfo

            UNION ALL

            -- Metadata record
            SELECT
                NULL::STRUCT(minReaderVersion INTEGER, minWriterVersion INTEGER) AS protocol,
                {
                    'id': ?,
                    'name': ?,
                    'description': NULL,
                    'format': {'provider': 'parquet', 'options': MAP{}}::STRUCT(provider VARCHAR, options MAP(VARCHAR, VARCHAR)),
                    'schemaString': ?,
                    'partitionColumns': []::VARCHAR[],
                    'createdTime': ?,
                    'configuration': MAP{'delta.logRetentionDuration': 'interval 1 hour'}
                }::STRUCT(id VARCHAR, name VARCHAR, description VARCHAR, format STRUCT(provider VARCHAR, options MAP(VARCHAR, VARCHAR)), schemaString VARCHAR, partitionColumns VARCHAR[], createdTime BIGINT, configuration MAP(VARCHAR, VARCHAR)) AS metaData,
                NULL::STRUCT(path VARCHAR, partitionValues MAP(VARCHAR, VARCHAR), size BIGINT, modificationTime BIGINT, dataChange BOOLEAN, stats VARCHAR, tags MAP(VARCHAR, VARCHAR)) AS add,
                NULL::STRUCT(path VARCHAR, deletionTimestamp BIGINT, dataChange BOOLEAN) AS remove,
                NULL::STRUCT(timestamp TIMESTAMP, operation VARCHAR, operationParameters MAP(VARCHAR, VARCHAR), isBlindAppend BOOLEAN, engineInfo VARCHAR, clientVersion VARCHAR) AS commitInfo
        )
        SELECT * FROM checkpoint_data
        """, [table_meta_id, table_info['table_name'], create_spark_schema_string(schema_fields), now])

        # One 'add' action per data file; the path stored is relative to the
        # table root, as Delta expects.
        for path, size in file_rows:
            rel_path = path.lstrip('/')
            mod_time = get_file_modification_time(now)

            duckdb.execute("""
            INSERT INTO checkpoint_table
            SELECT
                NULL::STRUCT(minReaderVersion INTEGER, minWriterVersion INTEGER) AS protocol,
                NULL::STRUCT(id VARCHAR, name VARCHAR, description VARCHAR, format STRUCT(provider VARCHAR, options MAP(VARCHAR, VARCHAR)), schemaString VARCHAR, partitionColumns VARCHAR[], createdTime BIGINT, configuration MAP(VARCHAR, VARCHAR)) AS metaData,
                {
                    'path': ?,
                    'partitionValues': MAP{}::MAP(VARCHAR, VARCHAR),
                    'size': ?,
                    'modificationTime': ?,
                    'dataChange': true,
                    'stats': ?,
                    'tags': NULL::MAP(VARCHAR, VARCHAR)
                }::STRUCT(path VARCHAR, partitionValues MAP(VARCHAR, VARCHAR), size BIGINT, modificationTime BIGINT, dataChange BOOLEAN, stats VARCHAR, tags MAP(VARCHAR, VARCHAR)) AS add,
                NULL::STRUCT(path VARCHAR, deletionTimestamp BIGINT, dataChange BOOLEAN) AS remove,
                NULL::STRUCT(timestamp TIMESTAMP, operation VARCHAR, operationParameters MAP(VARCHAR, VARCHAR), isBlindAppend BOOLEAN, engineInfo VARCHAR, clientVersion VARCHAR) AS commitInfo
            """, [rel_path, size, mod_time, json.dumps({"numRecords": None})])

        # Create the _delta_log directory if it doesn't exist (COPY used as a
        # portable mkdir-equivalent).
        duckdb.execute(f"COPY (SELECT 43) TO '{table_root}_delta_log' (FORMAT PARQUET, PER_THREAD_OUTPUT, OVERWRITE_OR_IGNORE)")

        # Write the checkpoint file.
        duckdb.execute(f"COPY (SELECT * FROM checkpoint_table) TO '{checkpoint_file}' (FORMAT PARQUET)")

        # Create the minimal JSON log file for Spark compatibility.
        create_dummy_json_log(table_root, delta_version, table_info, schema_fields, now)

        # Write the _last_checkpoint file: protocol + metadata + file records.
        total_records = 2 + len(file_rows)
        duckdb.execute(f"""
            COPY (SELECT {delta_version} AS version, {total_records} AS size)
            TO '{table_root}_delta_log/_last_checkpoint' (FORMAT JSON, ARRAY false)
        """)

        print(f"✅ Exported DuckLake snapshot {latest_snapshot} as Delta checkpoint v{delta_version}")
        print(f"✅ Created JSON log file: {json_log_file}")

        # Clean up temporary tables.
        duckdb.execute("DROP TABLE IF EXISTS checkpoint_table")

    return True, delta_version, latest_snapshot
271
-
272
def generate_latest_delta_log(db_path: str, data_root: str = None):
    """
    Export the latest DuckLake snapshot for each table as a Delta checkpoint file.
    Creates both checkpoint files and minimal JSON log files for Spark compatibility.

    Args:
        db_path (str): The path to the DuckLake database file.
        data_root (str): The root directory for the lakehouse data.
    """
    con = duckdb.connect(db_path, read_only=True)

    # Fall back to the data root recorded in the DuckLake catalog itself.
    if data_root is None:
        data_root = con.sql("SELECT value FROM ducklake_metadata WHERE key = 'data_path'").fetchone()[0]

    # Every live (not yet dropped) table, plus the path fragments needed to
    # locate its data directory.
    active_tables = con.execute("""
        SELECT
            t.table_id,
            t.table_name,
            s.schema_name,
            t.path as table_path,
            s.path as schema_path
        FROM ducklake_table t
        JOIN ducklake_schema s USING(schema_id)
        WHERE t.end_snapshot IS NULL
    """).fetchall()

    exported = 0
    for table_id, table_name, schema_name, table_path, schema_path in active_tables:
        table_info = {
            'table_id': table_id,
            'table_name': table_name,
            'schema_name': schema_name,
            'table_path': table_path,
            'schema_path': schema_path,
        }

        table_key = f"{schema_name}.{table_name}"
        print(f"Processing {table_key}...")

        # One table failing must not abort the remaining exports.
        try:
            if create_checkpoint_for_latest_snapshot(con, table_info, data_root):
                exported += 1
            else:
                print(f"⚠️ {table_key}: No data to export")
        except Exception as e:
            print(f"❌ {table_key}: Failed to export checkpoint - {e}")

    con.close()
    print(f"\n🎉 Export completed! {exported}/{len(active_tables)} tables exported successfully.")