ducklake-delta-exporter 0.1.4-py3-none-any.whl → 0.3.0-py3-none-any.whl

This diff compares two publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only.
@@ -1,324 +1,487 @@
  # File: ducklake_delta_exporter.py
- import json
- import time
  import duckdb

- def map_type_ducklake_to_spark(t):
- """Maps DuckDB data types to their Spark SQL equivalents for the Delta schema."""
- t = t.lower()
- if 'int' in t:
- return 'long' if '64' in t else 'integer'
- elif 'float' in t:
- return 'double'
- elif 'double' in t:
- return 'double'
- elif 'decimal' in t:
- return 'decimal(10,0)'
- elif 'bool' in t:
- return 'boolean'
- elif 'timestamp' in t:
- return 'timestamp'
- elif 'date' in t:
- return 'date'
- return 'string'

- def create_spark_schema_string(fields):
- """Creates a JSON string for the Spark schema from a list of fields."""
- return json.dumps({"type": "struct", "fields": fields})
-
- def get_latest_ducklake_snapshot(con, table_id):
- """
- Get the latest DuckLake snapshot ID for a table.
- """
- latest_snapshot = con.execute(f""" SELECT MAX(begin_snapshot) as latest_snapshot FROM ducklake_data_file WHERE table_id = {table_id} """).fetchone()[0]
- return latest_snapshot
-
- def get_latest_delta_checkpoint(con, table_id):
- """
- check how many times a table has being modified.
+ def generate_latest_delta_log(db_path: str):
  """
- delta_checkpoint = con.execute(f""" SELECT count(snapshot_id) FROM ducklake_snapshot_changes
- where changes_made like '%:{table_id}' or changes_made like '%:{table_id},%' """).fetchone()[0]
- return delta_checkpoint
+ Export the latest DuckLake snapshot for each table as Delta checkpoint files.
+ Uses DuckDB 1.4.4+ native support for writing to abfss://, s3://, etc.

- def get_file_modification_time(dummy_time):
- """
- Return a dummy modification time for parquet files.
- This avoids the latency of actually reading file metadata.
-
  Args:
- dummy_time: Timestamp in milliseconds to use as modification time
-
- Returns:
- Modification time in milliseconds
+ db_path (str): The path to the DuckLake database file (or connection string).
  """
- return dummy_time
+ # For remote paths (abfss://, s3://, etc.), use in-memory connection with ATTACH
+ is_remote = any(db_path.startswith(prefix) for prefix in ['abfss://', 's3://', 'gs://', 'az://', 'http://', 'https://'])

- def create_dummy_json_log(table_root, delta_version, table_info, schema_fields, now):
- """
- Create a minimal JSON log file for Spark compatibility using DuckDB.
- """
- json_log_file = table_root + f"_delta_log/{delta_version:020d}.json"
-
- # Create JSON log entries using DuckDB
- duckdb.execute("DROP TABLE IF EXISTS json_log_table")
-
- # Protocol entry
- protocol_json = json.dumps({
- "protocol": {
- "minReaderVersion": 1,
- "minWriterVersion": 2
- }
- })
-
- # Metadata entry
- metadata_json = json.dumps({
- "metaData": {
- "id": str(table_info['table_id']),
- "name": table_info['table_name'],
- "description": None,
- "format": {
- "provider": "parquet",
- "options": {}
- },
- "schemaString": create_spark_schema_string(schema_fields),
- "partitionColumns": [],
- "createdTime": now,
- "configuration": {
- "delta.logRetentionDuration": "interval 1 hour"
- }
- }
- })
-
- # Commit info entry
- commitinfo_json = json.dumps({
- "commitInfo": {
- "timestamp": now,
- "operation": "CONVERT",
- "operationParameters": {
- "convertedFrom": "DuckLake"
- },
- "isBlindAppend": True,
- "engineInfo": "DuckLake-Delta-Exporter",
- "clientVersion": "1.0.0"
- }
- })
-
- # Create table with JSON entries
- duckdb.execute("""
- CREATE TABLE json_log_table AS
- SELECT ? AS json_line
- UNION ALL
- SELECT ? AS json_line
- UNION ALL
- SELECT ? AS json_line
- """, [protocol_json, metadata_json, commitinfo_json])
-
- # Write JSON log file using DuckDB
- duckdb.execute(f"COPY (SELECT json_line FROM json_log_table) TO '{json_log_file}' (FORMAT CSV, HEADER false, QUOTE '')")
-
- # Clean up
- duckdb.execute("DROP TABLE IF EXISTS json_log_table")
-
- return json_log_file
+ if is_remote:
+ con = duckdb.connect()
+ # Load required extensions for cloud storage
+ if db_path.startswith('abfss://') or db_path.startswith('az://'):
+ con.execute("LOAD azure")
+ # Load persistent secrets
+ con.execute("SELECT * FROM duckdb_secrets()")
+ elif db_path.startswith('s3://'):
+ con.execute("LOAD httpfs")
+ con.execute(f"ATTACH '{db_path}' AS ducklake_db (READ_ONLY)")
+ con.execute("USE ducklake_db")
+ else:
+ con = duckdb.connect(db_path, read_only=True)

- def build_file_path(table_root, relative_path):
- """
- Build full file path from table root and relative path.
- Works with both local paths and S3 URLs.
- """
- table_root = table_root.rstrip('/')
- relative_path = relative_path.lstrip('/')
- return f"{table_root}/{relative_path}"
+ # Build export summary - identify which tables have data
+ con.execute("""
+ CREATE OR REPLACE TEMP TABLE export_summary AS
+ WITH
+ data_root_config AS (
+ SELECT value AS data_root FROM ducklake_metadata WHERE key = 'data_path'
+ ),
+ active_tables AS (
+ SELECT
+ t.table_id,
+ t.table_name,
+ s.schema_name,
+ t.path AS table_path,
+ s.path AS schema_path,
+ rtrim((SELECT data_root FROM data_root_config), '/') || '/' ||
+ CASE
+ WHEN trim(s.path, '/') != '' THEN trim(s.path, '/') || '/'
+ ELSE ''
+ END ||
+ trim(t.path, '/') AS table_root
+ FROM ducklake_table t
+ JOIN ducklake_schema s USING(schema_id)
+ WHERE t.end_snapshot IS NULL
+ ),
+ current_snapshot AS (
+ SELECT MAX(snapshot_id) AS snapshot_id FROM ducklake_snapshot
+ ),
+ table_last_modified AS (
+ SELECT
+ t.*,
+ COALESCE(
+ (SELECT MAX(sc.snapshot_id)
+ FROM ducklake_snapshot_changes sc
+ WHERE regexp_matches(sc.changes_made, '[:,]' || t.table_id || '([^0-9]|$)')
+ ),
+ (SELECT cs.snapshot_id
+ FROM current_snapshot cs
+ WHERE EXISTS (
+ SELECT 1 FROM ducklake_data_file df
+ WHERE df.table_id = t.table_id
+ AND df.end_snapshot IS NULL
+ )
+ )
+ ) AS last_modified_snapshot,
+ (SELECT COUNT(*) FROM ducklake_data_file df
+ WHERE df.table_id = t.table_id
+ AND df.end_snapshot IS NULL
+ ) AS file_count
+ FROM active_tables t
+ )
+ SELECT
+ table_id,
+ schema_name,
+ table_name,
+ table_root,
+ CASE
+ WHEN file_count = 0 THEN 'no_data_files'
+ WHEN last_modified_snapshot IS NULL THEN 'no_changes'
+ ELSE 'needs_export'
+ END AS status,
+ last_modified_snapshot AS snapshot_id,
+ file_count
+ FROM table_last_modified
+ """)

- def create_checkpoint_for_latest_snapshot(con, table_info, data_root):
- """
- Create a Delta checkpoint file for the latest DuckLake snapshot.
- """
- table_root = data_root.rstrip('/') + '/' + table_info['schema_path'] + table_info['table_path']
-
- # Get the latest snapshot
- latest_snapshot = get_latest_ducklake_snapshot(con, table_info['table_id'])
- if latest_snapshot is None:
- print(f"⚠️ {table_info['schema_name']}.{table_info['table_name']}: No snapshots found")
- return False
- delta_version = get_latest_delta_checkpoint(con, table_info['table_id'])
- checkpoint_file = table_root + f"_delta_log/{delta_version:020d}.checkpoint.parquet"
- json_log_file = table_root + f"_delta_log/{delta_version:020d}.json"
-
- try:
- con.execute(f"SELECT protocol FROM '{checkpoint_file}' limit 0 ")
- print(f"⚠️ {table_info['schema_name']}.{table_info['table_name']}: Checkpoint file already exists: {checkpoint_file}")
- except:
-
- now = int(time.time() * 1000)
-
- # Get all files for the latest snapshot
- file_rows = con.execute(f"""
- SELECT path, file_size_bytes FROM ducklake_data_file
- WHERE table_id = {table_info['table_id']}
- AND begin_snapshot <= {latest_snapshot}
- AND (end_snapshot IS NULL OR end_snapshot > {latest_snapshot})
- """).fetchall()
-
- # Get schema for the latest snapshot
- columns = con.execute(f"""
- SELECT column_name, column_type FROM ducklake_column
- WHERE table_id = {table_info['table_id']}
- AND begin_snapshot <= {latest_snapshot}
- AND (end_snapshot IS NULL OR end_snapshot > {latest_snapshot})
- ORDER BY column_order
- """).fetchall()
-
- # Get or generate table metadata ID
- table_meta_id = str(table_info['table_id'])
-
- # Prepare schema
- schema_fields = [
- {"name": name, "type": map_type_ducklake_to_spark(typ), "nullable": True, "metadata": {}}
- for name, typ in columns
- ]
-
- # Create checkpoint data using DuckDB directly
- checkpoint_data = []
-
- # Create checkpoint data directly in DuckDB using proper data types
- duckdb.execute("DROP TABLE IF EXISTS checkpoint_table")
-
- # Create the checkpoint table with proper nested structure
- duckdb.execute("""
- CREATE TABLE checkpoint_table AS
- WITH checkpoint_data AS (
- -- Protocol record
- SELECT
- {'minReaderVersion': 1, 'minWriterVersion': 2}::STRUCT(minReaderVersion INTEGER, minWriterVersion INTEGER) AS protocol,
- NULL::STRUCT(id VARCHAR, name VARCHAR, description VARCHAR, format STRUCT(provider VARCHAR, options MAP(VARCHAR, VARCHAR)), schemaString VARCHAR, partitionColumns VARCHAR[], createdTime BIGINT, configuration MAP(VARCHAR, VARCHAR)) AS metaData,
- NULL::STRUCT(path VARCHAR, partitionValues MAP(VARCHAR, VARCHAR), size BIGINT, modificationTime BIGINT, dataChange BOOLEAN, stats VARCHAR, tags MAP(VARCHAR, VARCHAR)) AS add,
- NULL::STRUCT(path VARCHAR, deletionTimestamp BIGINT, dataChange BOOLEAN) AS remove,
- NULL::STRUCT(timestamp TIMESTAMP, operation VARCHAR, operationParameters MAP(VARCHAR, VARCHAR), isBlindAppend BOOLEAN, engineInfo VARCHAR, clientVersion VARCHAR) AS commitInfo
-
- UNION ALL
-
- -- Metadata record
- SELECT
- NULL::STRUCT(minReaderVersion INTEGER, minWriterVersion INTEGER) AS protocol,
- {
- 'id': ?,
- 'name': ?,
- 'description': NULL,
- 'format': {'provider': 'parquet', 'options': MAP{}}::STRUCT(provider VARCHAR, options MAP(VARCHAR, VARCHAR)),
- 'schemaString': ?,
- 'partitionColumns': []::VARCHAR[],
- 'createdTime': ?,
- 'configuration': MAP{'delta.logRetentionDuration': 'interval 1 hour'}
- }::STRUCT(id VARCHAR, name VARCHAR, description VARCHAR, format STRUCT(provider VARCHAR, options MAP(VARCHAR, VARCHAR)), schemaString VARCHAR, partitionColumns VARCHAR[], createdTime BIGINT, configuration MAP(VARCHAR, VARCHAR)) AS metaData,
- NULL::STRUCT(path VARCHAR, partitionValues MAP(VARCHAR, VARCHAR), size BIGINT, modificationTime BIGINT, dataChange BOOLEAN, stats VARCHAR, tags MAP(VARCHAR, VARCHAR)) AS add,
- NULL::STRUCT(path VARCHAR, deletionTimestamp BIGINT, dataChange BOOLEAN) AS remove,
- NULL::STRUCT(timestamp TIMESTAMP, operation VARCHAR, operationParameters MAP(VARCHAR, VARCHAR), isBlindAppend BOOLEAN, engineInfo VARCHAR, clientVersion VARCHAR) AS commitInfo
- )
- SELECT * FROM checkpoint_data
- """, [table_meta_id, table_info['table_name'], create_spark_schema_string(schema_fields), now])
-
- # Add file records
- for path, size in file_rows:
- rel_path = path.lstrip('/')
- full_path = build_file_path(table_root, rel_path)
- mod_time = get_file_modification_time(now)
-
- duckdb.execute("""
- INSERT INTO checkpoint_table
- SELECT
- NULL::STRUCT(minReaderVersion INTEGER, minWriterVersion INTEGER) AS protocol,
- NULL::STRUCT(id VARCHAR, name VARCHAR, description VARCHAR, format STRUCT(provider VARCHAR, options MAP(VARCHAR, VARCHAR)), schemaString VARCHAR, partitionColumns VARCHAR[], createdTime BIGINT, configuration MAP(VARCHAR, VARCHAR)) AS metaData,
- {
- 'path': ?,
- 'partitionValues': MAP{}::MAP(VARCHAR, VARCHAR),
- 'size': ?,
- 'modificationTime': ?,
- 'dataChange': true,
- 'stats': ?,
- 'tags': NULL::MAP(VARCHAR, VARCHAR)
- }::STRUCT(path VARCHAR, partitionValues MAP(VARCHAR, VARCHAR), size BIGINT, modificationTime BIGINT, dataChange BOOLEAN, stats VARCHAR, tags MAP(VARCHAR, VARCHAR)) AS add,
- NULL::STRUCT(path VARCHAR, deletionTimestamp BIGINT, dataChange BOOLEAN) AS remove,
- NULL::STRUCT(timestamp TIMESTAMP, operation VARCHAR, operationParameters MAP(VARCHAR, VARCHAR), isBlindAppend BOOLEAN, engineInfo VARCHAR, clientVersion VARCHAR) AS commitInfo
- """, [rel_path, size, mod_time, json.dumps({"numRecords": None})])
-
- # Create the _delta_log directory if it doesn't exist
- duckdb.execute(f"COPY (SELECT 43) TO '{table_root}_delta_log' (FORMAT PARQUET, PER_THREAD_OUTPUT, OVERWRITE_OR_IGNORE)")
-
- # Write the checkpoint file
- duckdb.execute(f"COPY (SELECT * FROM checkpoint_table) TO '{checkpoint_file}' (FORMAT PARQUET)")
-
- # Create dummy JSON log file for Spark compatibility
- create_dummy_json_log(table_root, delta_version, table_info, schema_fields, now)
-
- # Write the _last_checkpoint file
- total_records = 2 + len(file_rows) # protocol + metadata + file records
- duckdb.execute(f"""
- COPY (SELECT {delta_version} AS version, {total_records} AS size)
- TO '{table_root}_delta_log/_last_checkpoint' (FORMAT JSON, ARRAY false)
- """)
-
- print(f"✅ Exported DuckLake snapshot {latest_snapshot} as Delta checkpoint v{delta_version}")
- print(f"✅ Created JSON log file: {json_log_file}")
-
- # Clean up temporary tables
- duckdb.execute("DROP TABLE IF EXISTS checkpoint_table")
-
- return True, delta_version, latest_snapshot
+ # Get tables that need export
+ tables_to_export = con.execute("""
+ SELECT table_id, schema_name, table_name, table_root, snapshot_id, file_count
+ FROM export_summary
+ WHERE status = 'needs_export'
+ """).fetchall()

- def generate_latest_delta_log(db_path: str, data_root: str = None):
- """
- Export the latest DuckLake snapshot for each table as a Delta checkpoint file.
- Creates both checkpoint files and minimal JSON log files for Spark compatibility.
-
- Args:
- db_path (str): The path to the DuckLake database file.
- data_root (str): The root directory for the lakehouse data.
- """
- con = duckdb.connect(db_path, read_only=True)
-
- if data_root is None:
- data_root = con.sql("SELECT value FROM ducklake_metadata WHERE key = 'data_path'").fetchone()[0]
-
- # Get all active tables
- tables = con.execute("""
- SELECT
- t.table_id,
- t.table_name,
- s.schema_name,
- t.path as table_path,
- s.path as schema_path
- FROM ducklake_table t
- JOIN ducklake_schema s USING(schema_id)
- WHERE t.end_snapshot IS NULL
+ # Show summary
+ summary = con.execute("""
+ SELECT status, COUNT(*) as cnt FROM export_summary GROUP BY status
  """).fetchall()
-
- total_tables = len(tables)
- successful_exports = 0
-
- for table_row in tables:
- table_info = {
- 'table_id': table_row[0],
- 'table_name': table_row[1],
- 'schema_name': table_row[2],
- 'table_path': table_row[3],
- 'schema_path': table_row[4]
- }
-
- table_key = f"{table_info['schema_name']}.{table_info['table_name']}"
- print(f"Processing {table_key}...")
-
+
+ for status, cnt in summary:
+ print(f" {status}: {cnt} tables")
+
+ if not tables_to_export:
+ print("\n✅ No tables need export.")
+ con.close()
+ return
+
+ print(f"\n📦 Exporting {len(tables_to_export)} tables...")
+
+ # Process each table
+ for table_id, schema_name, table_name, table_root, snapshot_id, file_count in tables_to_export:
+ table_key = f"{schema_name}.{table_name}"
+
+ # Check if checkpoint already exists for this snapshot
+ checkpoint_path = f"{table_root}/_delta_log/{snapshot_id:020d}.checkpoint.parquet"
  try:
- result = create_checkpoint_for_latest_snapshot(con, table_info, data_root)
-
- if result:
- successful_exports += 1
- else:
- print(f"⚠️ {table_key}: No data to export")
-
+ con.execute(f"SELECT 1 FROM '{checkpoint_path}' LIMIT 1")
+ print(f" ⏭️ {table_key}: snapshot {snapshot_id} already exported")
+ continue
+ except Exception:
+ pass # File doesn't exist, proceed with export
+
+ print(f"\n Processing {table_key}...")
+
+ try:
+ # Build checkpoint parquet data for this table
+ con.execute("""
+ CREATE OR REPLACE TEMP TABLE temp_checkpoint_parquet AS
+ WITH
+ table_schemas AS (
+ SELECT
+ ? AS table_id,
+ ? AS table_name,
+ ? AS snapshot_id,
+ ? AS table_root,
+ list({
+ 'name': c.column_name,
+ 'type':
+ CASE
+ WHEN contains(lower(c.column_type), 'bigint') OR
+ (contains(lower(c.column_type), 'int') AND contains(c.column_type, '64')) THEN 'long'
+ WHEN contains(lower(c.column_type), 'int') THEN 'integer'
+ WHEN contains(lower(c.column_type), 'float') THEN 'double'
+ WHEN contains(lower(c.column_type), 'double') THEN 'double'
+ WHEN contains(lower(c.column_type), 'bool') THEN 'boolean'
+ WHEN contains(lower(c.column_type), 'timestamp') THEN 'timestamp'
+ WHEN contains(lower(c.column_type), 'date') THEN 'date'
+ WHEN contains(lower(c.column_type), 'decimal') THEN lower(c.column_type)
+ ELSE 'string'
+ END,
+ 'nullable': true,
+ 'metadata': MAP{}::MAP(VARCHAR, VARCHAR)
+ }::STRUCT(name VARCHAR, type VARCHAR, nullable BOOLEAN, metadata MAP(VARCHAR, VARCHAR)) ORDER BY c.column_order) AS schema_fields
+ FROM ducklake_column c
+ WHERE c.table_id = ?
+ AND c.end_snapshot IS NULL
+ ),
+ file_column_stats_agg AS (
+ SELECT
+ df.data_file_id,
+ c.column_name,
+ ANY_VALUE(c.column_type) AS column_type,
+ MAX(fcs.value_count) AS value_count,
+ MIN(fcs.min_value) AS min_value,
+ MAX(fcs.max_value) AS max_value,
+ MAX(fcs.null_count) AS null_count
+ FROM ducklake_data_file df
+ LEFT JOIN ducklake_file_column_stats fcs ON df.data_file_id = fcs.data_file_id
+ LEFT JOIN ducklake_column c ON fcs.column_id = c.column_id AND c.table_id = df.table_id
+ WHERE df.table_id = ?
+ AND df.end_snapshot IS NULL
+ AND c.column_id IS NOT NULL
+ AND c.end_snapshot IS NULL
+ GROUP BY df.data_file_id, c.column_name
+ ),
+ file_column_stats_transformed AS (
+ SELECT
+ fca.data_file_id,
+ fca.column_name,
+ fca.column_type,
+ fca.value_count,
+ fca.null_count,
+ CASE
+ WHEN fca.min_value IS NULL THEN NULL
+ WHEN contains(lower(fca.column_type), 'timestamp') THEN
+ regexp_replace(
+ regexp_replace(replace(fca.min_value, ' ', 'T'), '[+-]\\d{2}(?::\\d{2})?$', ''),
+ '^([^.]+)$', '\\1.000'
+ ) || 'Z'
+ WHEN contains(lower(fca.column_type), 'date') THEN fca.min_value
+ WHEN contains(lower(fca.column_type), 'bool') THEN CAST(lower(fca.min_value) IN ('true', 't', '1', 'yes') AS VARCHAR)
+ WHEN contains(lower(fca.column_type), 'int') OR contains(lower(fca.column_type), 'float')
+ OR contains(lower(fca.column_type), 'double') OR contains(lower(fca.column_type), 'decimal') THEN
+ CASE WHEN contains(fca.min_value, '.') OR contains(lower(fca.min_value), 'e')
+ THEN CAST(TRY_CAST(fca.min_value AS DOUBLE) AS VARCHAR)
+ ELSE CAST(TRY_CAST(fca.min_value AS BIGINT) AS VARCHAR)
+ END
+ ELSE fca.min_value
+ END AS transformed_min,
+ CASE
+ WHEN fca.max_value IS NULL THEN NULL
+ WHEN contains(lower(fca.column_type), 'timestamp') THEN
+ regexp_replace(
+ regexp_replace(replace(fca.max_value, ' ', 'T'), '[+-]\\d{2}(?::\\d{2})?$', ''),
+ '^([^.]+)$', '\\1.000'
+ ) || 'Z'
+ WHEN contains(lower(fca.column_type), 'date') THEN fca.max_value
+ WHEN contains(lower(fca.column_type), 'bool') THEN CAST(lower(fca.max_value) IN ('true', 't', '1', 'yes') AS VARCHAR)
+ WHEN contains(lower(fca.column_type), 'int') OR contains(lower(fca.column_type), 'float')
+ OR contains(lower(fca.column_type), 'double') OR contains(lower(fca.column_type), 'decimal') THEN
+ CASE WHEN contains(fca.max_value, '.') OR contains(lower(fca.max_value), 'e')
+ THEN CAST(TRY_CAST(fca.max_value AS DOUBLE) AS VARCHAR)
+ ELSE CAST(TRY_CAST(fca.max_value AS BIGINT) AS VARCHAR)
+ END
+ ELSE fca.max_value
+ END AS transformed_max
+ FROM file_column_stats_agg fca
+ ),
+ file_metadata AS (
+ SELECT
+ ts.table_id,
+ ts.table_name,
+ ts.snapshot_id,
+ ts.table_root,
+ ts.schema_fields,
+ df.data_file_id,
+ df.path AS file_path,
+ df.file_size_bytes,
+ COALESCE(MAX(fct.value_count), 0) AS num_records,
+ COALESCE(map_from_entries(list({
+ 'key': fct.column_name,
+ 'value': fct.transformed_min
+ } ORDER BY fct.column_name) FILTER (WHERE fct.column_name IS NOT NULL AND fct.transformed_min IS NOT NULL)), MAP{}::MAP(VARCHAR, VARCHAR)) AS min_values,
+ COALESCE(map_from_entries(list({
+ 'key': fct.column_name,
+ 'value': fct.transformed_max
+ } ORDER BY fct.column_name) FILTER (WHERE fct.column_name IS NOT NULL AND fct.transformed_max IS NOT NULL)), MAP{}::MAP(VARCHAR, VARCHAR)) AS max_values,
+ COALESCE(map_from_entries(list({
+ 'key': fct.column_name,
+ 'value': fct.null_count
+ } ORDER BY fct.column_name) FILTER (WHERE fct.column_name IS NOT NULL AND fct.null_count IS NOT NULL)), MAP{}::MAP(VARCHAR, BIGINT)) AS null_count
+ FROM table_schemas ts
+ JOIN ducklake_data_file df ON df.table_id = ts.table_id
+ LEFT JOIN file_column_stats_transformed fct ON df.data_file_id = fct.data_file_id
+ WHERE df.end_snapshot IS NULL
+ GROUP BY ts.table_id, ts.table_name, ts.snapshot_id,
+ ts.table_root, ts.schema_fields, df.data_file_id, df.path, df.file_size_bytes
+ ),
+ table_aggregates AS (
+ SELECT
+ table_id,
+ table_name,
+ snapshot_id,
+ table_root,
+ schema_fields,
+ COUNT(*) AS num_files,
+ SUM(num_records) AS total_rows,
+ SUM(file_size_bytes) AS total_bytes,
+ list({
+ 'path': ltrim(file_path, '/'),
+ 'partitionValues': MAP{}::MAP(VARCHAR, VARCHAR),
+ 'size': file_size_bytes,
+ 'modificationTime': epoch_ms(now()),
+ 'dataChange': true,
+ 'stats': COALESCE(to_json({
+ 'numRecords': COALESCE(num_records, 0),
+ 'minValues': COALESCE(min_values, MAP{}::MAP(VARCHAR, VARCHAR)),
+ 'maxValues': COALESCE(max_values, MAP{}::MAP(VARCHAR, VARCHAR)),
+ 'nullCount': COALESCE(null_count, MAP{}::MAP(VARCHAR, BIGINT))
+ }), '{"numRecords":0}'),
+ 'tags': MAP{}::MAP(VARCHAR, VARCHAR)
+ }::STRUCT(
+ path VARCHAR,
+ partitionValues MAP(VARCHAR, VARCHAR),
+ size BIGINT,
+ modificationTime BIGINT,
+ dataChange BOOLEAN,
+ stats VARCHAR,
+ tags MAP(VARCHAR, VARCHAR)
+ )) AS add_entries
+ FROM file_metadata
+ GROUP BY table_id, table_name, snapshot_id, table_root, schema_fields
+ ),
+ checkpoint_data AS (
+ SELECT
+ ta.*,
+ epoch_ms(now()) AS now_ms,
+ uuid()::VARCHAR AS txn_id,
+ (substring(md5(ta.table_id::VARCHAR || '-metadata'), 1, 8) || '-' ||
+ substring(md5(ta.table_id::VARCHAR || '-metadata'), 9, 4) || '-' ||
+ substring(md5(ta.table_id::VARCHAR || '-metadata'), 13, 4) || '-' ||
+ substring(md5(ta.table_id::VARCHAR || '-metadata'), 17, 4) || '-' ||
+ substring(md5(ta.table_id::VARCHAR || '-metadata'), 21, 12)) AS meta_id,
+ to_json({'type': 'struct', 'fields': ta.schema_fields}) AS schema_string
+ FROM table_aggregates ta
+ ),
+ checkpoint_parquet_data AS (
+ SELECT
+ cd.table_id,
+ cd.table_name,
+ cd.snapshot_id,
+ cd.table_root,
+ cd.meta_id,
+ cd.now_ms,
+ cd.txn_id,
+ cd.schema_string,
+ cd.num_files,
+ cd.total_rows,
+ cd.total_bytes,
+ {'minReaderVersion': 1, 'minWriterVersion': 2} AS protocol,
+ NULL AS metaData,
+ NULL AS add,
+ NULL::STRUCT(path VARCHAR, deletionTimestamp BIGINT, dataChange BOOLEAN) AS remove,
+ NULL::STRUCT(timestamp TIMESTAMP, operation VARCHAR, operationParameters MAP(VARCHAR, VARCHAR), isolationLevel VARCHAR, isBlindAppend BOOLEAN, operationMetrics MAP(VARCHAR, VARCHAR), engineInfo VARCHAR, txnId VARCHAR) AS commitInfo,
+ 1 AS row_order
+ FROM checkpoint_data cd
+ UNION ALL
+ SELECT
+ cd.table_id,
+ cd.table_name,
+ cd.snapshot_id,
+ cd.table_root,
+ cd.meta_id,
+ cd.now_ms,
+ cd.txn_id,
+ cd.schema_string,
+ cd.num_files,
+ cd.total_rows,
+ cd.total_bytes,
+ NULL AS protocol,
+ {
+ 'id': cd.meta_id,
+ 'name': cd.table_name,
+ 'format': {'provider': 'parquet', 'options': MAP{}::MAP(VARCHAR, VARCHAR)}::STRUCT(provider VARCHAR, options MAP(VARCHAR, VARCHAR)),
+ 'schemaString': cd.schema_string,
+ 'partitionColumns': []::VARCHAR[],
+ 'createdTime': cd.now_ms,
+ 'configuration': MAP{}::MAP(VARCHAR, VARCHAR)
+ }::STRUCT(id VARCHAR, name VARCHAR, format STRUCT(provider VARCHAR, options MAP(VARCHAR, VARCHAR)), schemaString VARCHAR, partitionColumns VARCHAR[], createdTime BIGINT, configuration MAP(VARCHAR, VARCHAR)) AS metaData,
+ NULL AS add,
+ NULL::STRUCT(path VARCHAR, deletionTimestamp BIGINT, dataChange BOOLEAN) AS remove,
+ NULL::STRUCT(timestamp TIMESTAMP, operation VARCHAR, operationParameters MAP(VARCHAR, VARCHAR), isolationLevel VARCHAR, isBlindAppend BOOLEAN, operationMetrics MAP(VARCHAR, VARCHAR), engineInfo VARCHAR, txnId VARCHAR) AS commitInfo,
+ 2 AS row_order
+ FROM checkpoint_data cd
+ UNION ALL
+ SELECT
+ cd.table_id,
+ cd.table_name,
+ cd.snapshot_id,
+ cd.table_root,
+ cd.meta_id,
+ cd.now_ms,
+ cd.txn_id,
+ cd.schema_string,
+ cd.num_files,
+ cd.total_rows,
+ cd.total_bytes,
+ NULL AS protocol,
+ NULL AS metaData,
+ unnest(cd.add_entries) AS add,
+ NULL::STRUCT(path VARCHAR, deletionTimestamp BIGINT, dataChange BOOLEAN) AS remove,
+ NULL::STRUCT(timestamp TIMESTAMP, operation VARCHAR, operationParameters MAP(VARCHAR, VARCHAR), isolationLevel VARCHAR, isBlindAppend BOOLEAN, operationMetrics MAP(VARCHAR, VARCHAR), engineInfo VARCHAR, txnId VARCHAR) AS commitInfo,
+ 3 AS row_order
+ FROM checkpoint_data cd
+ )
+ SELECT * FROM checkpoint_parquet_data
+ """, [table_id, table_name, snapshot_id, table_root, table_id, table_id])
+
+ # Build JSON log content
+ con.execute("""
+ CREATE OR REPLACE TEMP TABLE temp_checkpoint_json AS
+ SELECT DISTINCT
+ p.table_id,
+ p.table_root,
+ p.snapshot_id,
+ p.num_files,
+ to_json({
+ 'commitInfo': {
+ 'timestamp': p.now_ms,
+ 'operation': 'CONVERT',
+ 'operationParameters': {
+ 'convertedFrom': 'DuckLake',
+ 'duckLakeSnapshotId': p.snapshot_id::VARCHAR,
+ 'partitionBy': '[]'
+ },
+ 'isolationLevel': 'Serializable',
+ 'isBlindAppend': false,
+ 'operationMetrics': {
+ 'numFiles': p.num_files::VARCHAR,
+ 'numOutputRows': p.total_rows::VARCHAR,
+ 'numOutputBytes': p.total_bytes::VARCHAR
+ },
+ 'engineInfo': 'DuckLake-Delta-Exporter/1.0.0',
+ 'txnId': p.txn_id
+ }
+ }) || chr(10) ||
+ to_json({
+ 'metaData': {
+ 'id': p.meta_id,
+ 'name': p.table_name,
+ 'format': {'provider': 'parquet', 'options': MAP{}},
+ 'schemaString': p.schema_string::VARCHAR,
+ 'partitionColumns': [],
+ 'createdTime': p.now_ms,
+ 'configuration': MAP{}
+ }
+ }) || chr(10) ||
+ to_json({
+ 'protocol': {'minReaderVersion': 1, 'minWriterVersion': 2}
+ }) AS content
+ FROM temp_checkpoint_parquet p
+ WHERE p.row_order = 1
+ """)
+
+ # Build last checkpoint content
+ con.execute("""
+ CREATE OR REPLACE TEMP TABLE temp_last_checkpoint AS
+ SELECT
+ table_id,
+ table_root,
+ snapshot_id,
+ '{"version":' || snapshot_id || ',"size":' || (2 + num_files) || '}' AS content
+ FROM temp_checkpoint_parquet
+ WHERE row_order = 1
+ """)
+
+ # Get file paths
+ paths = con.execute("""
+ SELECT
+ table_root || '/_delta_log/' || lpad(snapshot_id::VARCHAR, 20, '0') || '.checkpoint.parquet' AS checkpoint_file,
+ table_root || '/_delta_log/' || lpad(snapshot_id::VARCHAR, 20, '0') || '.json' AS json_file,
+ table_root || '/_delta_log/_last_checkpoint' AS last_checkpoint_file,
+ table_root || '/_delta_log' AS delta_log_path
+ FROM temp_checkpoint_parquet
+ WHERE row_order = 1
+ LIMIT 1
+ """).fetchone()
+
+ checkpoint_file, json_file, last_checkpoint_file, delta_log_path = paths
+
+ # Create delta_log directory for local paths
+ if not any(table_root.startswith(prefix) for prefix in ['abfss://', 's3://', 'gs://', 'az://', 'http://', 'https://']):
+ con.execute(f"""
+ COPY (SELECT 1 AS id, 1 AS ".duckdb_init")
+ TO '{delta_log_path}'
+ (FORMAT CSV, PARTITION_BY (".duckdb_init"), OVERWRITE_OR_IGNORE)
+ """)
+
+ # Write checkpoint parquet
+ con.execute(f"""
+ COPY (SELECT protocol, metaData, add, remove, commitInfo
+ FROM temp_checkpoint_parquet ORDER BY row_order)
+ TO '{checkpoint_file}' (FORMAT PARQUET)
+ """)
+
+ # Write JSON log
+ con.execute(f"""
+ COPY (SELECT content FROM temp_checkpoint_json)
+ TO '{json_file}' (FORMAT CSV, HEADER false, QUOTE '')
+ """)
+
+ # Write last checkpoint
+ con.execute(f"""
+ COPY (SELECT content FROM temp_last_checkpoint)
+ TO '{last_checkpoint_file}' (FORMAT CSV, HEADER false, QUOTE '')
+ """)
+
+ print(f" ✅ {table_key}: exported snapshot {snapshot_id} ({file_count} files)")
+
  except Exception as e:
- print(f"❌ {table_key}: Failed to export checkpoint - {e}")
-
+ print(f" ❌ {table_key}: {e}")
+
+ # Cleanup temp tables
+ con.execute("DROP TABLE IF EXISTS export_summary")
+ con.execute("DROP TABLE IF EXISTS temp_checkpoint_parquet")
+ con.execute("DROP TABLE IF EXISTS temp_checkpoint_json")
+ con.execute("DROP TABLE IF EXISTS temp_last_checkpoint")
+
  con.close()
- print(f"\n🎉 Export completed! {successful_exports}/{total_tables} tables exported successfully.")
+ print("\n🎉 Export completed!")
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: ducklake-delta-exporter
- Version: 0.1.4
+ Version: 0.3.0
  Summary: A utility to export DuckLake database metadata to Delta Lake transaction logs.
  Home-page: https://github.com/djouallah/ducklake_delta_exporter
  Author: mim
@@ -13,7 +13,7 @@ Classifier: Topic :: Software Development :: Libraries :: Python Modules
  Classifier: Development Status :: 3 - Alpha
  Requires-Python: >=3.8
  Description-Content-Type: text/markdown
- Requires-Dist: duckdb
+ Requires-Dist: duckdb>=1.4.4
  Dynamic: author
  Dynamic: author-email
  Dynamic: classifier
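The only dependency change is the tightened duckdb pin, matching the docstring's note that 0.3.0 relies on DuckDB 1.4.4+ for writing directly to object storage. A hedged pre-flight check one might run before calling the exporter (not part of the package; assumes a plain X.Y.Z version string):

import duckdb

# ducklake-delta-exporter 0.3.0 declares Requires-Dist: duckdb>=1.4.4.
installed = tuple(int(part) for part in duckdb.__version__.split(".")[:3])
if installed < (1, 4, 4):
    raise RuntimeError(f"duckdb {duckdb.__version__} is older than the 1.4.4 required by ducklake-delta-exporter 0.3.0")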
@@ -0,0 +1,5 @@
+ ducklake_delta_exporter/__init__.py,sha256=tbMa54gkBIQGqhSlQhz5WJKihrkXwY9Tkz8Gpn1GNmQ,25042
+ ducklake_delta_exporter-0.3.0.dist-info/METADATA,sha256=eU4XikvDYeI3tGzA5Z0jjaifMS02TLxlGEV45apU8-k,3956
+ ducklake_delta_exporter-0.3.0.dist-info/WHEEL,sha256=wUyA8OaulRlbfwMtmQsvNngGrxQHAvkKcvRmdizlJi0,92
+ ducklake_delta_exporter-0.3.0.dist-info/top_level.txt,sha256=cGISjIUrP9eP3UexjiCEWnWy8N5woIBV2QVF21OgdtQ,24
+ ducklake_delta_exporter-0.3.0.dist-info/RECORD,,
@@ -1,5 +1,5 @@
  Wheel-Version: 1.0
- Generator: setuptools (80.9.0)
+ Generator: setuptools (80.10.2)
  Root-Is-Purelib: true
  Tag: py3-none-any

@@ -1,5 +0,0 @@
- ducklake_delta_exporter/__init__.py,sha256=LmXUUeR0LPgrlqlqeVmpnwm3JdurMD81GvjB-KeGxLo,14380
- ducklake_delta_exporter-0.1.4.dist-info/METADATA,sha256=s8rMeyMR00CvfGjaprCOZsOhw9U7EheZ6gFd9VVTh6Y,3949
- ducklake_delta_exporter-0.1.4.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
- ducklake_delta_exporter-0.1.4.dist-info/top_level.txt,sha256=cGISjIUrP9eP3UexjiCEWnWy8N5woIBV2QVF21OgdtQ,24
- ducklake_delta_exporter-0.1.4.dist-info/RECORD,,