ducklake-delta-exporter 0.1.4__tar.gz → 0.3.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: ducklake-delta-exporter
- Version: 0.1.4
+ Version: 0.3.0
  Summary: A utility to export DuckLake database metadata to Delta Lake transaction logs.
  Home-page: https://github.com/djouallah/ducklake_delta_exporter
  Author: mim
@@ -13,7 +13,7 @@ Classifier: Topic :: Software Development :: Libraries :: Python Modules
  Classifier: Development Status :: 3 - Alpha
  Requires-Python: >=3.8
  Description-Content-Type: text/markdown
- Requires-Dist: duckdb
+ Requires-Dist: duckdb>=1.4.4
  Dynamic: author
  Dynamic: author-email
  Dynamic: classifier
@@ -0,0 +1,487 @@
+ # File: ducklake_delta_exporter.py
+ import duckdb
+
+
+ def generate_latest_delta_log(db_path: str):
+     """
+     Export the latest DuckLake snapshot for each table as Delta checkpoint files.
+     Uses DuckDB 1.4.4+ native support for writing to abfss://, s3://, etc.
+
+     Args:
+         db_path (str): The path to the DuckLake database file (or connection string).
+     """
+     # For remote paths (abfss://, s3://, etc.), use in-memory connection with ATTACH
+     is_remote = any(db_path.startswith(prefix) for prefix in ['abfss://', 's3://', 'gs://', 'az://', 'http://', 'https://'])
+
+     if is_remote:
+         con = duckdb.connect()
+         # Load required extensions for cloud storage
+         if db_path.startswith('abfss://') or db_path.startswith('az://'):
+             con.execute("LOAD azure")
+             # Load persistent secrets
+             con.execute("SELECT * FROM duckdb_secrets()")
+         elif db_path.startswith('s3://'):
+             con.execute("LOAD httpfs")
+         con.execute(f"ATTACH '{db_path}' AS ducklake_db (READ_ONLY)")
+         con.execute("USE ducklake_db")
+     else:
+         con = duckdb.connect(db_path, read_only=True)
+
+     # Build export summary - identify which tables have data
+     con.execute("""
+         CREATE OR REPLACE TEMP TABLE export_summary AS
+         WITH
+         data_root_config AS (
+             SELECT value AS data_root FROM ducklake_metadata WHERE key = 'data_path'
+         ),
+         active_tables AS (
+             SELECT
+                 t.table_id,
+                 t.table_name,
+                 s.schema_name,
+                 t.path AS table_path,
+                 s.path AS schema_path,
+                 rtrim((SELECT data_root FROM data_root_config), '/') || '/' ||
+                 CASE
+                     WHEN trim(s.path, '/') != '' THEN trim(s.path, '/') || '/'
+                     ELSE ''
+                 END ||
+                 trim(t.path, '/') AS table_root
+             FROM ducklake_table t
+             JOIN ducklake_schema s USING(schema_id)
+             WHERE t.end_snapshot IS NULL
+         ),
+         current_snapshot AS (
+             SELECT MAX(snapshot_id) AS snapshot_id FROM ducklake_snapshot
+         ),
+         table_last_modified AS (
+             SELECT
+                 t.*,
+                 COALESCE(
+                     (SELECT MAX(sc.snapshot_id)
+                      FROM ducklake_snapshot_changes sc
+                      WHERE regexp_matches(sc.changes_made, '[:,]' || t.table_id || '([^0-9]|$)')
+                     ),
+                     (SELECT cs.snapshot_id
+                      FROM current_snapshot cs
+                      WHERE EXISTS (
+                          SELECT 1 FROM ducklake_data_file df
+                          WHERE df.table_id = t.table_id
+                            AND df.end_snapshot IS NULL
+                      )
+                     )
+                 ) AS last_modified_snapshot,
+                 (SELECT COUNT(*) FROM ducklake_data_file df
+                  WHERE df.table_id = t.table_id
+                    AND df.end_snapshot IS NULL
+                 ) AS file_count
+             FROM active_tables t
+         )
+         SELECT
+             table_id,
+             schema_name,
+             table_name,
+             table_root,
+             CASE
+                 WHEN file_count = 0 THEN 'no_data_files'
+                 WHEN last_modified_snapshot IS NULL THEN 'no_changes'
+                 ELSE 'needs_export'
+             END AS status,
+             last_modified_snapshot AS snapshot_id,
+             file_count
+         FROM table_last_modified
+     """)
+
+     # Get tables that need export
+     tables_to_export = con.execute("""
+         SELECT table_id, schema_name, table_name, table_root, snapshot_id, file_count
+         FROM export_summary
+         WHERE status = 'needs_export'
+     """).fetchall()
+
+     # Show summary
+     summary = con.execute("""
+         SELECT status, COUNT(*) as cnt FROM export_summary GROUP BY status
+     """).fetchall()
+
+     for status, cnt in summary:
+         print(f" {status}: {cnt} tables")
+
+     if not tables_to_export:
+         print("\n✅ No tables need export.")
+         con.close()
+         return
+
+     print(f"\n📦 Exporting {len(tables_to_export)} tables...")
+
+     # Process each table
+     for table_id, schema_name, table_name, table_root, snapshot_id, file_count in tables_to_export:
+         table_key = f"{schema_name}.{table_name}"
+
+         # Check if checkpoint already exists for this snapshot
+         checkpoint_path = f"{table_root}/_delta_log/{snapshot_id:020d}.checkpoint.parquet"
+         try:
+             con.execute(f"SELECT 1 FROM '{checkpoint_path}' LIMIT 1")
+             print(f" ⏭️ {table_key}: snapshot {snapshot_id} already exported")
+             continue
+         except Exception:
+             pass # File doesn't exist, proceed with export
+
+         print(f"\n Processing {table_key}...")
+
+         try:
+             # Build checkpoint parquet data for this table
+             con.execute("""
+                 CREATE OR REPLACE TEMP TABLE temp_checkpoint_parquet AS
+                 WITH
+                 table_schemas AS (
+                     SELECT
+                         ? AS table_id,
+                         ? AS table_name,
+                         ? AS snapshot_id,
+                         ? AS table_root,
+                         list({
+                             'name': c.column_name,
+                             'type':
+                                 CASE
+                                     WHEN contains(lower(c.column_type), 'bigint') OR
+                                          (contains(lower(c.column_type), 'int') AND contains(c.column_type, '64')) THEN 'long'
+                                     WHEN contains(lower(c.column_type), 'int') THEN 'integer'
+                                     WHEN contains(lower(c.column_type), 'float') THEN 'double'
+                                     WHEN contains(lower(c.column_type), 'double') THEN 'double'
+                                     WHEN contains(lower(c.column_type), 'bool') THEN 'boolean'
+                                     WHEN contains(lower(c.column_type), 'timestamp') THEN 'timestamp'
+                                     WHEN contains(lower(c.column_type), 'date') THEN 'date'
+                                     WHEN contains(lower(c.column_type), 'decimal') THEN lower(c.column_type)
+                                     ELSE 'string'
+                                 END,
+                             'nullable': true,
+                             'metadata': MAP{}::MAP(VARCHAR, VARCHAR)
+                         }::STRUCT(name VARCHAR, type VARCHAR, nullable BOOLEAN, metadata MAP(VARCHAR, VARCHAR)) ORDER BY c.column_order) AS schema_fields
+                     FROM ducklake_column c
+                     WHERE c.table_id = ?
+                       AND c.end_snapshot IS NULL
+                 ),
+                 file_column_stats_agg AS (
+                     SELECT
+                         df.data_file_id,
+                         c.column_name,
+                         ANY_VALUE(c.column_type) AS column_type,
+                         MAX(fcs.value_count) AS value_count,
+                         MIN(fcs.min_value) AS min_value,
+                         MAX(fcs.max_value) AS max_value,
+                         MAX(fcs.null_count) AS null_count
+                     FROM ducklake_data_file df
+                     LEFT JOIN ducklake_file_column_stats fcs ON df.data_file_id = fcs.data_file_id
+                     LEFT JOIN ducklake_column c ON fcs.column_id = c.column_id AND c.table_id = df.table_id
+                     WHERE df.table_id = ?
+                       AND df.end_snapshot IS NULL
+                       AND c.column_id IS NOT NULL
+                       AND c.end_snapshot IS NULL
+                     GROUP BY df.data_file_id, c.column_name
+                 ),
+                 file_column_stats_transformed AS (
+                     SELECT
+                         fca.data_file_id,
+                         fca.column_name,
+                         fca.column_type,
+                         fca.value_count,
+                         fca.null_count,
+                         CASE
+                             WHEN fca.min_value IS NULL THEN NULL
+                             WHEN contains(lower(fca.column_type), 'timestamp') THEN
+                                 regexp_replace(
+                                     regexp_replace(replace(fca.min_value, ' ', 'T'), '[+-]\\d{2}(?::\\d{2})?$', ''),
+                                     '^([^.]+)$', '\\1.000'
+                                 ) || 'Z'
+                             WHEN contains(lower(fca.column_type), 'date') THEN fca.min_value
+                             WHEN contains(lower(fca.column_type), 'bool') THEN CAST(lower(fca.min_value) IN ('true', 't', '1', 'yes') AS VARCHAR)
+                             WHEN contains(lower(fca.column_type), 'int') OR contains(lower(fca.column_type), 'float')
+                                  OR contains(lower(fca.column_type), 'double') OR contains(lower(fca.column_type), 'decimal') THEN
+                                 CASE WHEN contains(fca.min_value, '.') OR contains(lower(fca.min_value), 'e')
+                                      THEN CAST(TRY_CAST(fca.min_value AS DOUBLE) AS VARCHAR)
+                                      ELSE CAST(TRY_CAST(fca.min_value AS BIGINT) AS VARCHAR)
+                                 END
+                             ELSE fca.min_value
+                         END AS transformed_min,
+                         CASE
+                             WHEN fca.max_value IS NULL THEN NULL
+                             WHEN contains(lower(fca.column_type), 'timestamp') THEN
+                                 regexp_replace(
+                                     regexp_replace(replace(fca.max_value, ' ', 'T'), '[+-]\\d{2}(?::\\d{2})?$', ''),
+                                     '^([^.]+)$', '\\1.000'
+                                 ) || 'Z'
+                             WHEN contains(lower(fca.column_type), 'date') THEN fca.max_value
+                             WHEN contains(lower(fca.column_type), 'bool') THEN CAST(lower(fca.max_value) IN ('true', 't', '1', 'yes') AS VARCHAR)
+                             WHEN contains(lower(fca.column_type), 'int') OR contains(lower(fca.column_type), 'float')
+                                  OR contains(lower(fca.column_type), 'double') OR contains(lower(fca.column_type), 'decimal') THEN
+                                 CASE WHEN contains(fca.max_value, '.') OR contains(lower(fca.max_value), 'e')
+                                      THEN CAST(TRY_CAST(fca.max_value AS DOUBLE) AS VARCHAR)
+                                      ELSE CAST(TRY_CAST(fca.max_value AS BIGINT) AS VARCHAR)
+                                 END
+                             ELSE fca.max_value
+                         END AS transformed_max
+                     FROM file_column_stats_agg fca
+                 ),
+                 file_metadata AS (
+                     SELECT
+                         ts.table_id,
+                         ts.table_name,
+                         ts.snapshot_id,
+                         ts.table_root,
+                         ts.schema_fields,
+                         df.data_file_id,
+                         df.path AS file_path,
+                         df.file_size_bytes,
+                         COALESCE(MAX(fct.value_count), 0) AS num_records,
+                         COALESCE(map_from_entries(list({
+                             'key': fct.column_name,
+                             'value': fct.transformed_min
+                         } ORDER BY fct.column_name) FILTER (WHERE fct.column_name IS NOT NULL AND fct.transformed_min IS NOT NULL)), MAP{}::MAP(VARCHAR, VARCHAR)) AS min_values,
+                         COALESCE(map_from_entries(list({
+                             'key': fct.column_name,
+                             'value': fct.transformed_max
+                         } ORDER BY fct.column_name) FILTER (WHERE fct.column_name IS NOT NULL AND fct.transformed_max IS NOT NULL)), MAP{}::MAP(VARCHAR, VARCHAR)) AS max_values,
+                         COALESCE(map_from_entries(list({
+                             'key': fct.column_name,
+                             'value': fct.null_count
+                         } ORDER BY fct.column_name) FILTER (WHERE fct.column_name IS NOT NULL AND fct.null_count IS NOT NULL)), MAP{}::MAP(VARCHAR, BIGINT)) AS null_count
+                     FROM table_schemas ts
+                     JOIN ducklake_data_file df ON df.table_id = ts.table_id
+                     LEFT JOIN file_column_stats_transformed fct ON df.data_file_id = fct.data_file_id
+                     WHERE df.end_snapshot IS NULL
+                     GROUP BY ts.table_id, ts.table_name, ts.snapshot_id,
+                              ts.table_root, ts.schema_fields, df.data_file_id, df.path, df.file_size_bytes
+                 ),
+                 table_aggregates AS (
+                     SELECT
+                         table_id,
+                         table_name,
+                         snapshot_id,
+                         table_root,
+                         schema_fields,
+                         COUNT(*) AS num_files,
+                         SUM(num_records) AS total_rows,
+                         SUM(file_size_bytes) AS total_bytes,
+                         list({
+                             'path': ltrim(file_path, '/'),
+                             'partitionValues': MAP{}::MAP(VARCHAR, VARCHAR),
+                             'size': file_size_bytes,
+                             'modificationTime': epoch_ms(now()),
+                             'dataChange': true,
+                             'stats': COALESCE(to_json({
+                                 'numRecords': COALESCE(num_records, 0),
+                                 'minValues': COALESCE(min_values, MAP{}::MAP(VARCHAR, VARCHAR)),
+                                 'maxValues': COALESCE(max_values, MAP{}::MAP(VARCHAR, VARCHAR)),
+                                 'nullCount': COALESCE(null_count, MAP{}::MAP(VARCHAR, BIGINT))
+                             }), '{"numRecords":0}'),
+                             'tags': MAP{}::MAP(VARCHAR, VARCHAR)
+                         }::STRUCT(
+                             path VARCHAR,
+                             partitionValues MAP(VARCHAR, VARCHAR),
+                             size BIGINT,
+                             modificationTime BIGINT,
+                             dataChange BOOLEAN,
+                             stats VARCHAR,
+                             tags MAP(VARCHAR, VARCHAR)
+                         )) AS add_entries
+                     FROM file_metadata
+                     GROUP BY table_id, table_name, snapshot_id, table_root, schema_fields
+                 ),
+                 checkpoint_data AS (
+                     SELECT
+                         ta.*,
+                         epoch_ms(now()) AS now_ms,
+                         uuid()::VARCHAR AS txn_id,
+                         (substring(md5(ta.table_id::VARCHAR || '-metadata'), 1, 8) || '-' ||
+                          substring(md5(ta.table_id::VARCHAR || '-metadata'), 9, 4) || '-' ||
+                          substring(md5(ta.table_id::VARCHAR || '-metadata'), 13, 4) || '-' ||
+                          substring(md5(ta.table_id::VARCHAR || '-metadata'), 17, 4) || '-' ||
+                          substring(md5(ta.table_id::VARCHAR || '-metadata'), 21, 12)) AS meta_id,
+                         to_json({'type': 'struct', 'fields': ta.schema_fields}) AS schema_string
+                     FROM table_aggregates ta
+                 ),
+                 checkpoint_parquet_data AS (
+                     SELECT
+                         cd.table_id,
+                         cd.table_name,
+                         cd.snapshot_id,
+                         cd.table_root,
+                         cd.meta_id,
+                         cd.now_ms,
+                         cd.txn_id,
+                         cd.schema_string,
+                         cd.num_files,
+                         cd.total_rows,
+                         cd.total_bytes,
+                         {'minReaderVersion': 1, 'minWriterVersion': 2} AS protocol,
+                         NULL AS metaData,
+                         NULL AS add,
+                         NULL::STRUCT(path VARCHAR, deletionTimestamp BIGINT, dataChange BOOLEAN) AS remove,
+                         NULL::STRUCT(timestamp TIMESTAMP, operation VARCHAR, operationParameters MAP(VARCHAR, VARCHAR), isolationLevel VARCHAR, isBlindAppend BOOLEAN, operationMetrics MAP(VARCHAR, VARCHAR), engineInfo VARCHAR, txnId VARCHAR) AS commitInfo,
+                         1 AS row_order
+                     FROM checkpoint_data cd
+                     UNION ALL
+                     SELECT
+                         cd.table_id,
+                         cd.table_name,
+                         cd.snapshot_id,
+                         cd.table_root,
+                         cd.meta_id,
+                         cd.now_ms,
+                         cd.txn_id,
+                         cd.schema_string,
+                         cd.num_files,
+                         cd.total_rows,
+                         cd.total_bytes,
+                         NULL AS protocol,
+                         {
+                             'id': cd.meta_id,
+                             'name': cd.table_name,
+                             'format': {'provider': 'parquet', 'options': MAP{}::MAP(VARCHAR, VARCHAR)}::STRUCT(provider VARCHAR, options MAP(VARCHAR, VARCHAR)),
+                             'schemaString': cd.schema_string,
+                             'partitionColumns': []::VARCHAR[],
+                             'createdTime': cd.now_ms,
+                             'configuration': MAP{}::MAP(VARCHAR, VARCHAR)
+                         }::STRUCT(id VARCHAR, name VARCHAR, format STRUCT(provider VARCHAR, options MAP(VARCHAR, VARCHAR)), schemaString VARCHAR, partitionColumns VARCHAR[], createdTime BIGINT, configuration MAP(VARCHAR, VARCHAR)) AS metaData,
+                         NULL AS add,
+                         NULL::STRUCT(path VARCHAR, deletionTimestamp BIGINT, dataChange BOOLEAN) AS remove,
+                         NULL::STRUCT(timestamp TIMESTAMP, operation VARCHAR, operationParameters MAP(VARCHAR, VARCHAR), isolationLevel VARCHAR, isBlindAppend BOOLEAN, operationMetrics MAP(VARCHAR, VARCHAR), engineInfo VARCHAR, txnId VARCHAR) AS commitInfo,
+                         2 AS row_order
+                     FROM checkpoint_data cd
+                     UNION ALL
+                     SELECT
+                         cd.table_id,
+                         cd.table_name,
+                         cd.snapshot_id,
+                         cd.table_root,
+                         cd.meta_id,
+                         cd.now_ms,
+                         cd.txn_id,
+                         cd.schema_string,
+                         cd.num_files,
+                         cd.total_rows,
+                         cd.total_bytes,
+                         NULL AS protocol,
+                         NULL AS metaData,
+                         unnest(cd.add_entries) AS add,
+                         NULL::STRUCT(path VARCHAR, deletionTimestamp BIGINT, dataChange BOOLEAN) AS remove,
+                         NULL::STRUCT(timestamp TIMESTAMP, operation VARCHAR, operationParameters MAP(VARCHAR, VARCHAR), isolationLevel VARCHAR, isBlindAppend BOOLEAN, operationMetrics MAP(VARCHAR, VARCHAR), engineInfo VARCHAR, txnId VARCHAR) AS commitInfo,
+                         3 AS row_order
+                     FROM checkpoint_data cd
+                 )
+                 SELECT * FROM checkpoint_parquet_data
+             """, [table_id, table_name, snapshot_id, table_root, table_id, table_id])
+
+             # Build JSON log content
+             con.execute("""
+                 CREATE OR REPLACE TEMP TABLE temp_checkpoint_json AS
+                 SELECT DISTINCT
+                     p.table_id,
+                     p.table_root,
+                     p.snapshot_id,
+                     p.num_files,
+                     to_json({
+                         'commitInfo': {
+                             'timestamp': p.now_ms,
+                             'operation': 'CONVERT',
+                             'operationParameters': {
+                                 'convertedFrom': 'DuckLake',
+                                 'duckLakeSnapshotId': p.snapshot_id::VARCHAR,
+                                 'partitionBy': '[]'
+                             },
+                             'isolationLevel': 'Serializable',
+                             'isBlindAppend': false,
+                             'operationMetrics': {
+                                 'numFiles': p.num_files::VARCHAR,
+                                 'numOutputRows': p.total_rows::VARCHAR,
+                                 'numOutputBytes': p.total_bytes::VARCHAR
+                             },
+                             'engineInfo': 'DuckLake-Delta-Exporter/1.0.0',
+                             'txnId': p.txn_id
+                         }
+                     }) || chr(10) ||
+                     to_json({
+                         'metaData': {
+                             'id': p.meta_id,
+                             'name': p.table_name,
+                             'format': {'provider': 'parquet', 'options': MAP{}},
+                             'schemaString': p.schema_string::VARCHAR,
+                             'partitionColumns': [],
+                             'createdTime': p.now_ms,
+                             'configuration': MAP{}
+                         }
+                     }) || chr(10) ||
+                     to_json({
+                         'protocol': {'minReaderVersion': 1, 'minWriterVersion': 2}
+                     }) AS content
+                 FROM temp_checkpoint_parquet p
+                 WHERE p.row_order = 1
+             """)
+
+             # Build last checkpoint content
+             con.execute("""
+                 CREATE OR REPLACE TEMP TABLE temp_last_checkpoint AS
+                 SELECT
+                     table_id,
+                     table_root,
+                     snapshot_id,
+                     '{"version":' || snapshot_id || ',"size":' || (2 + num_files) || '}' AS content
+                 FROM temp_checkpoint_parquet
+                 WHERE row_order = 1
+             """)
+
+             # Get file paths
+             paths = con.execute("""
+                 SELECT
+                     table_root || '/_delta_log/' || lpad(snapshot_id::VARCHAR, 20, '0') || '.checkpoint.parquet' AS checkpoint_file,
+                     table_root || '/_delta_log/' || lpad(snapshot_id::VARCHAR, 20, '0') || '.json' AS json_file,
+                     table_root || '/_delta_log/_last_checkpoint' AS last_checkpoint_file,
+                     table_root || '/_delta_log' AS delta_log_path
+                 FROM temp_checkpoint_parquet
+                 WHERE row_order = 1
+                 LIMIT 1
+             """).fetchone()
+
+             checkpoint_file, json_file, last_checkpoint_file, delta_log_path = paths
+
+             # Create delta_log directory for local paths
+             if not any(table_root.startswith(prefix) for prefix in ['abfss://', 's3://', 'gs://', 'az://', 'http://', 'https://']):
+                 con.execute(f"""
+                     COPY (SELECT 1 AS id, 1 AS ".duckdb_init")
+                     TO '{delta_log_path}'
+                     (FORMAT CSV, PARTITION_BY (".duckdb_init"), OVERWRITE_OR_IGNORE)
+                 """)
+
+             # Write checkpoint parquet
+             con.execute(f"""
+                 COPY (SELECT protocol, metaData, add, remove, commitInfo
+                       FROM temp_checkpoint_parquet ORDER BY row_order)
+                 TO '{checkpoint_file}' (FORMAT PARQUET)
+             """)
+
+             # Write JSON log
+             con.execute(f"""
+                 COPY (SELECT content FROM temp_checkpoint_json)
+                 TO '{json_file}' (FORMAT CSV, HEADER false, QUOTE '')
+             """)
+
+             # Write last checkpoint
+             con.execute(f"""
+                 COPY (SELECT content FROM temp_last_checkpoint)
+                 TO '{last_checkpoint_file}' (FORMAT CSV, HEADER false, QUOTE '')
+             """)
+
+             print(f" ✅ {table_key}: exported snapshot {snapshot_id} ({file_count} files)")
+
+         except Exception as e:
+             print(f" ❌ {table_key}: {e}")
+
+     # Cleanup temp tables
+     con.execute("DROP TABLE IF EXISTS export_summary")
+     con.execute("DROP TABLE IF EXISTS temp_checkpoint_parquet")
+     con.execute("DROP TABLE IF EXISTS temp_checkpoint_json")
+     con.execute("DROP TABLE IF EXISTS temp_last_checkpoint")
+
+     con.close()
+     print("\n🎉 Export completed!")
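For orientation only (not part of the released package contents): a minimal sketch of how the new module might be invoked, assuming it is importable as `ducklake_delta_exporter` (the top-level module added above) and using placeholder catalog paths. For remote catalogs, the code above relies on DuckDB persistent secrets already being configured before it attaches the catalog.

```python
# Illustrative sketch only -- the paths below are hypothetical placeholders.
from ducklake_delta_exporter import generate_latest_delta_log

# Local DuckLake catalog: opened read-only; Delta logs are written under
# each table's <table_root>/_delta_log/ directory.
generate_latest_delta_log("metadata.ducklake")

# Remote catalog: requires duckdb>=1.4.4 plus a persistent Azure secret
# so the azure extension can authenticate when attaching and writing.
generate_latest_delta_log(
    "abfss://container@account.dfs.core.windows.net/lake/metadata.ducklake"
)
```
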
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: ducklake-delta-exporter
- Version: 0.1.4
+ Version: 0.3.0
  Summary: A utility to export DuckLake database metadata to Delta Lake transaction logs.
  Home-page: https://github.com/djouallah/ducklake_delta_exporter
  Author: mim
@@ -13,7 +13,7 @@ Classifier: Topic :: Software Development :: Libraries :: Python Modules
  Classifier: Development Status :: 3 - Alpha
  Requires-Python: >=3.8
  Description-Content-Type: text/markdown
- Requires-Dist: duckdb
+ Requires-Dist: duckdb>=1.4.4
  Dynamic: author
  Dynamic: author-email
  Dynamic: classifier
@@ -5,4 +5,5 @@ ducklake_delta_exporter.egg-info/PKG-INFO
  ducklake_delta_exporter.egg-info/SOURCES.txt
  ducklake_delta_exporter.egg-info/dependency_links.txt
  ducklake_delta_exporter.egg-info/requires.txt
- ducklake_delta_exporter.egg-info/top_level.txt
+ ducklake_delta_exporter.egg-info/top_level.txt
+ tests/test_stats_transformation.py
@@ -3,9 +3,9 @@ from setuptools import setup, find_packages
 
  setup(
      name='ducklake-delta-exporter',
-     version='0.1.4',
+     version='0.3.0',
      packages=find_packages(),
-     install_requires=['duckdb'],
+     install_requires=['duckdb>=1.4.4'],
      author='mim',
      author_email='your.email@example.com',
      description='A utility to export DuckLake database metadata to Delta Lake transaction logs.',