ducklake-delta-exporter 0.1.2-py3-none-any.whl → 0.1.4-py3-none-any.whl
This diff shows the published contents of two versions of the package as they appear in their public registry. It is provided for informational purposes only.
- ducklake_delta_exporter/__init__.py +261 -399
- ducklake_delta_exporter-0.1.4.dist-info/METADATA +123 -0
- ducklake_delta_exporter-0.1.4.dist-info/RECORD +5 -0
- ducklake_delta_exporter-0.1.2.dist-info/METADATA +0 -72
- ducklake_delta_exporter-0.1.2.dist-info/RECORD +0 -5
- {ducklake_delta_exporter-0.1.2.dist-info → ducklake_delta_exporter-0.1.4.dist-info}/WHEEL +0 -0
- {ducklake_delta_exporter-0.1.2.dist-info → ducklake_delta_exporter-0.1.4.dist-info}/top_level.txt +0 -0
ducklake_delta_exporter/__init__.py CHANGED

@@ -1,12 +1,7 @@
  # File: ducklake_delta_exporter.py
- import os
  import json
- import uuid
  import time
  import duckdb
- import pyarrow as pa
- import pyarrow.parquet as pq
- from datetime import datetime

  def map_type_ducklake_to_spark(t):
  """Maps DuckDB data types to their Spark SQL equivalents for the Delta schema."""

@@ -31,149 +26,263 @@ def create_spark_schema_string(fields):
  """Creates a JSON string for the Spark schema from a list of fields."""
  return json.dumps({"type": "struct", "fields": fields})

- def
- """
-
- pa.field("protocol", pa.struct([
- pa.field("minReaderVersion", pa.int32()), # Made nullable
- pa.field("minWriterVersion", pa.int32()) # Made nullable
- ]), nullable=True),
- pa.field("metaData", pa.struct([
- pa.field("id", pa.string()),
- pa.field("name", pa.string()),
- pa.field("description", pa.string()),
- pa.field("format", pa.struct([
- pa.field("provider", pa.string()),
- pa.field("options", pa.map_(pa.string(), pa.string()))
- ])),
- pa.field("schemaString", pa.string()),
- pa.field("partitionColumns", pa.list_(pa.string())),
- pa.field("createdTime", pa.int64()),
- pa.field("configuration", pa.map_(pa.string(), pa.string()))
- ]), nullable=True),
- pa.field("add", pa.struct([
- pa.field("path", pa.string()),
- pa.field("partitionValues", pa.map_(pa.string(), pa.string())),
- pa.field("size", pa.int64()),
- pa.field("modificationTime", pa.int64()),
- pa.field("dataChange", pa.bool_()),
- pa.field("stats", pa.string(), nullable=True),
- pa.field("tags", pa.map_(pa.string(), pa.string()), nullable=True)
- # Removed deletionVector, baseRowId, defaultRowCommitVersion, clusteringProvider
- ]), nullable=True),
- pa.field("remove", pa.struct([
- pa.field("path", pa.string()),
- pa.field("deletionTimestamp", pa.int64()),
- pa.field("dataChange", pa.bool_())
- ]), nullable=True),
- pa.field("commitInfo", pa.struct([
- pa.field("timestamp", pa.timestamp('ms'), False), # Changed from pa.int64() to pa.timestamp('ms')
- pa.field("operation", pa.string()),
- pa.field("operationParameters", pa.map_(pa.string(), pa.string())),
- pa.field("isBlindAppend", pa.bool_(), nullable=True),
- pa.field("engineInfo", pa.string(), nullable=True),
- pa.field("clientVersion", pa.string(), nullable=True)
- ]), nullable=True)
- ])
-
- def get_latest_delta_version_info(delta_log_path, con, table_id):
+ def get_latest_ducklake_snapshot(con, table_id):
+ """
+ Get the latest DuckLake snapshot ID for a table.
  """
-
-
- Also retrieves the consistent metaData.id if available from version 0.
+ latest_snapshot = con.execute(f""" SELECT MAX(begin_snapshot) as latest_snapshot FROM ducklake_data_file WHERE table_id = {table_id} """).fetchone()[0]
+ return latest_snapshot

-
+ def get_latest_delta_checkpoint(con, table_id):
+ """
+ check how many times a table has being modified.
  """
-
-
-
- meta_id_from_delta_log = None # This should be consistent for the table
+ delta_checkpoint = con.execute(f""" SELECT count(snapshot_id) FROM ducklake_snapshot_changes
+ where changes_made like '%:{table_id}' or changes_made like '%:{table_id},%' """).fetchone()[0]
+ return delta_checkpoint

-
-
+ def get_file_modification_time(dummy_time):
+ """
+ Return a dummy modification time for parquet files.
+ This avoids the latency of actually reading file metadata.
+
+ Args:
+ dummy_time: Timestamp in milliseconds to use as modification time

-
-
+ Returns:
+ Modification time in milliseconds
+ """
+ return dummy_time

-
-
-
-
-
-
-
-
+ def create_dummy_json_log(table_root, delta_version, table_info, schema_fields, now):
+ """
+ Create a minimal JSON log file for Spark compatibility using DuckDB.
+ """
+ json_log_file = table_root + f"_delta_log/{delta_version:020d}.json"
+
+ # Create JSON log entries using DuckDB
+ duckdb.execute("DROP TABLE IF EXISTS json_log_table")
+
+ # Protocol entry
+ protocol_json = json.dumps({
+ "protocol": {
+ "minReaderVersion": 1,
+ "minWriterVersion": 2
+ }
+ })
+
+ # Metadata entry
+ metadata_json = json.dumps({
+ "metaData": {
+ "id": str(table_info['table_id']),
+ "name": table_info['table_name'],
+ "description": None,
+ "format": {
+ "provider": "parquet",
+ "options": {}
+ },
+ "schemaString": create_spark_schema_string(schema_fields),
+ "partitionColumns": [],
+ "createdTime": now,
+ "configuration": {
+ "delta.logRetentionDuration": "interval 1 hour"
+ }
+ }
+ })
+
+ # Commit info entry
+ commitinfo_json = json.dumps({
+ "commitInfo": {
+ "timestamp": now,
+ "operation": "CONVERT",
+ "operationParameters": {
+ "convertedFrom": "DuckLake"
+ },
+ "isBlindAppend": True,
+ "engineInfo": "DuckLake-Delta-Exporter",
+ "clientVersion": "1.0.0"
+ }
+ })
+
+ # Create table with JSON entries
+ duckdb.execute("""
+ CREATE TABLE json_log_table AS
+ SELECT ? AS json_line
+ UNION ALL
+ SELECT ? AS json_line
+ UNION ALL
+ SELECT ? AS json_line
+ """, [protocol_json, metadata_json, commitinfo_json])
+
+ # Write JSON log file using DuckDB
+ duckdb.execute(f"COPY (SELECT json_line FROM json_log_table) TO '{json_log_file}' (FORMAT CSV, HEADER false, QUOTE '')")
+
+ # Clean up
+ duckdb.execute("DROP TABLE IF EXISTS json_log_table")
+
+ return json_log_file

-
-
-
+ def build_file_path(table_root, relative_path):
+ """
+ Build full file path from table root and relative path.
+ Works with both local paths and S3 URLs.
+ """
+ table_root = table_root.rstrip('/')
+ relative_path = relative_path.lstrip('/')
+ return f"{table_root}/{relative_path}"

-
-
-
+ def create_checkpoint_for_latest_snapshot(con, table_info, data_root):
+ """
+ Create a Delta checkpoint file for the latest DuckLake snapshot.
+ """
+ table_root = data_root.rstrip('/') + '/' + table_info['schema_path'] + table_info['table_path']
+
+ # Get the latest snapshot
+ latest_snapshot = get_latest_ducklake_snapshot(con, table_info['table_id'])
+ if latest_snapshot is None:
+ print(f"⚠️ {table_info['schema_name']}.{table_info['table_name']}: No snapshots found")
+ return False
+ delta_version = get_latest_delta_checkpoint(con, table_info['table_id'])
+ checkpoint_file = table_root + f"_delta_log/{delta_version:020d}.checkpoint.parquet"
+ json_log_file = table_root + f"_delta_log/{delta_version:020d}.json"
+
+ try:
+ con.execute(f"SELECT protocol FROM '{checkpoint_file}' limit 0 ")
+ print(f"⚠️ {table_info['schema_name']}.{table_info['table_name']}: Checkpoint file already exists: {checkpoint_file}")
+ except:
+
+ now = int(time.time() * 1000)

- #
-
-
-
-
-
-
- if 'operationParameters' in commit_info and 'duckLakeSnapshotId' in commit_info['operationParameters']:
- last_exported_ducklake_snapshot_id = int(commit_info['operationParameters']['duckLakeSnapshotId'])
- if 'metaData' in action:
- meta_id_from_delta_log = action['metaData'].get('id')
- except json.JSONDecodeError as e:
- print(f"ERROR: Failed to parse JSON line in {last_log_file}: {line.strip()}. Error: {e}")
- except Exception as e:
- print(f"ERROR: Unexpected error processing line in {last_log_file}: {e}")
+ # Get all files for the latest snapshot
+ file_rows = con.execute(f"""
+ SELECT path, file_size_bytes FROM ducklake_data_file
+ WHERE table_id = {table_info['table_id']}
+ AND begin_snapshot <= {latest_snapshot}
+ AND (end_snapshot IS NULL OR end_snapshot > {latest_snapshot})
+ """).fetchall()

- #
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+ # Get schema for the latest snapshot
+ columns = con.execute(f"""
+ SELECT column_name, column_type FROM ducklake_column
+ WHERE table_id = {table_info['table_id']}
+ AND begin_snapshot <= {latest_snapshot}
+ AND (end_snapshot IS NULL OR end_snapshot > {latest_snapshot})
+ ORDER BY column_order
+ """).fetchall()
+
+ # Get or generate table metadata ID
+ table_meta_id = str(table_info['table_id'])
+
+ # Prepare schema
+ schema_fields = [
+ {"name": name, "type": map_type_ducklake_to_spark(typ), "nullable": True, "metadata": {}}
+ for name, typ in columns
+ ]
+
+ # Create checkpoint data using DuckDB directly
+ checkpoint_data = []
+
+ # Create checkpoint data directly in DuckDB using proper data types
+ duckdb.execute("DROP TABLE IF EXISTS checkpoint_table")
+
+ # Create the checkpoint table with proper nested structure
+ duckdb.execute("""
+ CREATE TABLE checkpoint_table AS
+ WITH checkpoint_data AS (
+ -- Protocol record
+ SELECT
+ {'minReaderVersion': 1, 'minWriterVersion': 2}::STRUCT(minReaderVersion INTEGER, minWriterVersion INTEGER) AS protocol,
+ NULL::STRUCT(id VARCHAR, name VARCHAR, description VARCHAR, format STRUCT(provider VARCHAR, options MAP(VARCHAR, VARCHAR)), schemaString VARCHAR, partitionColumns VARCHAR[], createdTime BIGINT, configuration MAP(VARCHAR, VARCHAR)) AS metaData,
+ NULL::STRUCT(path VARCHAR, partitionValues MAP(VARCHAR, VARCHAR), size BIGINT, modificationTime BIGINT, dataChange BOOLEAN, stats VARCHAR, tags MAP(VARCHAR, VARCHAR)) AS add,
+ NULL::STRUCT(path VARCHAR, deletionTimestamp BIGINT, dataChange BOOLEAN) AS remove,
+ NULL::STRUCT(timestamp TIMESTAMP, operation VARCHAR, operationParameters MAP(VARCHAR, VARCHAR), isBlindAppend BOOLEAN, engineInfo VARCHAR, clientVersion VARCHAR) AS commitInfo
+
+ UNION ALL
+
+ -- Metadata record
+ SELECT
+ NULL::STRUCT(minReaderVersion INTEGER, minWriterVersion INTEGER) AS protocol,
+ {
+ 'id': ?,
+ 'name': ?,
+ 'description': NULL,
+ 'format': {'provider': 'parquet', 'options': MAP{}}::STRUCT(provider VARCHAR, options MAP(VARCHAR, VARCHAR)),
+ 'schemaString': ?,
+ 'partitionColumns': []::VARCHAR[],
+ 'createdTime': ?,
+ 'configuration': MAP{'delta.logRetentionDuration': 'interval 1 hour'}
+ }::STRUCT(id VARCHAR, name VARCHAR, description VARCHAR, format STRUCT(provider VARCHAR, options MAP(VARCHAR, VARCHAR)), schemaString VARCHAR, partitionColumns VARCHAR[], createdTime BIGINT, configuration MAP(VARCHAR, VARCHAR)) AS metaData,
+ NULL::STRUCT(path VARCHAR, partitionValues MAP(VARCHAR, VARCHAR), size BIGINT, modificationTime BIGINT, dataChange BOOLEAN, stats VARCHAR, tags MAP(VARCHAR, VARCHAR)) AS add,
+ NULL::STRUCT(path VARCHAR, deletionTimestamp BIGINT, dataChange BOOLEAN) AS remove,
+ NULL::STRUCT(timestamp TIMESTAMP, operation VARCHAR, operationParameters MAP(VARCHAR, VARCHAR), isBlindAppend BOOLEAN, engineInfo VARCHAR, clientVersion VARCHAR) AS commitInfo
+ )
+ SELECT * FROM checkpoint_data
+ """, [table_meta_id, table_info['table_name'], create_spark_schema_string(schema_fields), now])
+
+ # Add file records
+ for path, size in file_rows:
+ rel_path = path.lstrip('/')
+ full_path = build_file_path(table_root, rel_path)
+ mod_time = get_file_modification_time(now)
+
+ duckdb.execute("""
+ INSERT INTO checkpoint_table
+ SELECT
+ NULL::STRUCT(minReaderVersion INTEGER, minWriterVersion INTEGER) AS protocol,
+ NULL::STRUCT(id VARCHAR, name VARCHAR, description VARCHAR, format STRUCT(provider VARCHAR, options MAP(VARCHAR, VARCHAR)), schemaString VARCHAR, partitionColumns VARCHAR[], createdTime BIGINT, configuration MAP(VARCHAR, VARCHAR)) AS metaData,
+ {
+ 'path': ?,
+ 'partitionValues': MAP{}::MAP(VARCHAR, VARCHAR),
+ 'size': ?,
+ 'modificationTime': ?,
+ 'dataChange': true,
+ 'stats': ?,
+ 'tags': NULL::MAP(VARCHAR, VARCHAR)
+ }::STRUCT(path VARCHAR, partitionValues MAP(VARCHAR, VARCHAR), size BIGINT, modificationTime BIGINT, dataChange BOOLEAN, stats VARCHAR, tags MAP(VARCHAR, VARCHAR)) AS add,
+ NULL::STRUCT(path VARCHAR, deletionTimestamp BIGINT, dataChange BOOLEAN) AS remove,
+ NULL::STRUCT(timestamp TIMESTAMP, operation VARCHAR, operationParameters MAP(VARCHAR, VARCHAR), isBlindAppend BOOLEAN, engineInfo VARCHAR, clientVersion VARCHAR) AS commitInfo
+ """, [rel_path, size, mod_time, json.dumps({"numRecords": None})])
+
+ # Create the _delta_log directory if it doesn't exist
+ duckdb.execute(f"COPY (SELECT 43) TO '{table_root}_delta_log' (FORMAT PARQUET, PER_THREAD_OUTPUT, OVERWRITE_OR_IGNORE)")
+
+ # Write the checkpoint file
+ duckdb.execute(f"COPY (SELECT * FROM checkpoint_table) TO '{checkpoint_file}' (FORMAT PARQUET)")
+
+ # Create dummy JSON log file for Spark compatibility
+ create_dummy_json_log(table_root, delta_version, table_info, schema_fields, now)
+
+ # Write the _last_checkpoint file
+ total_records = 2 + len(file_rows) # protocol + metadata + file records
+ duckdb.execute(f"""
+ COPY (SELECT {delta_version} AS version, {total_records} AS size)
+ TO '{table_root}_delta_log/_last_checkpoint' (FORMAT JSON, ARRAY false)
+ """)
+
+ print(f"✅ Exported DuckLake snapshot {latest_snapshot} as Delta checkpoint v{delta_version}")
+ print(f"✅ Created JSON log file: {json_log_file}")
+
+ # Clean up temporary tables
+ duckdb.execute("DROP TABLE IF EXISTS checkpoint_table")
+
+ return True, delta_version, latest_snapshot

  def generate_latest_delta_log(db_path: str, data_root: str = None):
  """
-
-
+ Export the latest DuckLake snapshot for each table as a Delta checkpoint file.
+ Creates both checkpoint files and minimal JSON log files for Spark compatibility.

  Args:
  db_path (str): The path to the DuckLake database file.
  data_root (str): The root directory for the lakehouse data.
  """
- checkpoint_interval = 1
  con = duckdb.connect(db_path, read_only=True)
-
-
-
+
+ if data_root is None:
+ data_root = con.sql("SELECT value FROM ducklake_metadata WHERE key = 'data_path'").fetchone()[0]
+
+ # Get all active tables
+ tables = con.execute("""
  SELECT
  t.table_id,
  t.table_name,

@@ -183,280 +292,33 @@ def generate_latest_delta_log(db_path: str, data_root: str = None):
  FROM ducklake_table t
  JOIN ducklake_schema s USING(schema_id)
  WHERE t.end_snapshot IS NULL
- """).
-
-
-
-
-
-
-
-
-
-
-
-
-
- if not latest_ducklake_snapshot_raw or latest_ducklake_snapshot_raw[0] is None:
- print(f"⚠️ {table_key}: No data files found in DuckLake, skipping Delta log generation.")
- continue
+ """).fetchall()
+
+ total_tables = len(tables)
+ successful_exports = 0
+
+ for table_row in tables:
+ table_info = {
+ 'table_id': table_row[0],
+ 'table_name': table_row[1],
+ 'schema_name': table_row[2],
+ 'table_path': table_row[3],
+ 'schema_path': table_row[4]
+ }

-
-
- # 2. Determine the current state of the Delta table and next Delta version
- last_delta_version_idx, previously_exported_files, last_exported_ducklake_snapshot_id, existing_meta_id = \
- get_latest_delta_version_info(delta_log_path, con, row.table_id)
+ table_key = f"{table_info['schema_name']}.{table_info['table_name']}"
+ print(f"Processing {table_key}...")

- next_delta_version = last_delta_version_idx + 1
-
- # Check if the Delta table is already up-to-date with the latest DuckLake snapshot
- if last_exported_ducklake_snapshot_id == latest_ducklake_snapshot_id:
- print(f"✅ {table_key}: Delta table already at latest DuckLake snapshot {latest_ducklake_snapshot_id} (Delta version {last_delta_version_idx}), skipping export.")
- continue # Nothing to do, skip to next table
-
  try:
-
- now_timestamp = datetime.fromtimestamp(now / 1000) # Convert to datetime for checkpoint
- log_file = os.path.join(delta_log_path, f"{next_delta_version:020d}.json")
- checkpoint_file = os.path.join(delta_log_path, f"{next_delta_version:020d}.checkpoint.parquet")
-
- # Fetch all current files associated with the LATEST DuckLake snapshot
- file_rows_for_current_version = con.execute(f"""
- SELECT path, file_size_bytes FROM ducklake_data_file
- WHERE table_id = {row.table_id}
- AND begin_snapshot <= {latest_ducklake_snapshot_id} AND (end_snapshot IS NULL OR end_snapshot > {latest_ducklake_snapshot_id})
- """).fetchall()
-
- current_files_map = {}
- for path, size in file_rows_for_current_version:
- rel_path = path.lstrip('/')
- full_path = os.path.join(table_root, rel_path)
- mod_time = int(os.path.getmtime(full_path) * 1000) if os.path.exists(full_path) else now
- current_files_map[rel_path] = {
- "path": rel_path, "size": size, "modification_time": mod_time,
- "stats": json.dumps({"numRecords": None}) # Stats would require reading files
- }
- current_file_paths = set(current_files_map.keys())
-
- added_files_data = []
- removed_files_paths = []
-
- # Calculate the diff between the previous Delta state and the current latest DuckLake snapshot
- added_file_paths = current_file_paths - previously_exported_files
- removed_file_paths_set = previously_exported_files - current_file_paths
-
- added_files_data = [current_files_map[p] for p in added_file_paths]
- # removed_files_paths only need the path, not full dict
- removed_files_paths = list(removed_file_paths_set)
-
- # If no changes and not the initial version 0, skip writing a log file
- # Version 0 should always be written if it's the first export, even if empty (e.g., empty table)
- if not added_files_data and not removed_files_paths and next_delta_version > 0:
- print(f" {table_key}: No *detectable* changes between previous Delta state and latest DuckLake snapshot {latest_ducklake_snapshot_id}. Skipping new Delta log for version {next_delta_version}.")
- continue # Skip to the next table
-
- # Get schema for metadata (always from the latest DuckLake snapshot)
- columns = con.execute(f"""
- SELECT column_name, column_type FROM ducklake_column
- WHERE table_id = {row.table_id}
- AND begin_snapshot <= {latest_ducklake_snapshot_id} AND (end_snapshot IS NULL OR end_snapshot > {latest_ducklake_snapshot_id})
- ORDER BY column_order
- """).fetchall()
-
- with open(log_file, 'w') as f:
- # Protocol always comes first
- f.write(json.dumps({"protocol": {"minReaderVersion": 1, "minWriterVersion": 2}}) + "\n")
-
- # Determine the table_meta_id
- table_meta_id = existing_meta_id if existing_meta_id else str(uuid.uuid4())
-
- # Metadata always comes second
- schema_fields = [{"name": name, "type": map_type_ducklake_to_spark(typ), "nullable": True, "metadata": {}} for name, typ in columns]
-
- # Configuration, including logRetentionDuration
- table_configuration = {"delta.logRetentionDuration": "interval 1 hour"}
-
- f.write(json.dumps({
- "metaData": {
- "id": table_meta_id,
- "name": row.table_name if row.table_name else None,
- "description": None,
- "format": {"provider": "parquet", "options": {}},
- "schemaString": create_spark_schema_string(schema_fields),
- "partitionColumns": [],
- "createdTime": now,
- "configuration": table_configuration
- }
- }) + "\n")
-
- # Write remove actions
- for path in removed_files_paths:
- f.write(json.dumps({"remove": {"path": path, "deletionTimestamp": now, "dataChange": True}}) + "\n")
-
- # Write add actions, excluding the explicitly removed fields
- for af in added_files_data:
- f.write(json.dumps({
- "add": {
- "path": af["path"],
- "partitionValues": {},
- "size": af["size"],
- "modificationTime": af["modification_time"],
- "dataChange": True,
- "stats": af["stats"],
- "tags": None # Set to null as per example
- # Removed deletionVector, baseRowId, defaultRowCommitVersion, clusteringProvider
- }
- }) + "\n")
-
- # Prepare operationParameters for commitInfo based on Delta version
- commit_operation_parameters = {
- "mode": "Overwrite",
- "partitionBy": "[]",
- "duckLakeSnapshotId": str(latest_ducklake_snapshot_id)
- }
- commit_operation = "WRITE"
-
- if next_delta_version == 0:
- # For v0, emulate the 'CREATE TABLE' operation parameters as per example
- commit_operation = "CREATE TABLE"
- commit_operation_parameters = {
- "mode": "ErrorIfExists",
- "location": f"{data_root}/{row.schema_path}/{row.table_path}", # Construct location based on data_root
- "protocol": json.dumps({"minReaderVersion": 1, "minWriterVersion": 2}),
- "metadata": json.dumps({ # Stringify metadata object
- "configuration": table_configuration,
- "createdTime": now,
- "description": None,
- "format": {"options": {}, "provider": "parquet"},
- "id": table_meta_id,
- "name": row.table_name if row.table_name else None,
- "partitionColumns": [],
- "schemaString": create_spark_schema_string(schema_fields)
- })
- }
-
- # Write CommitInfo
- f.write(json.dumps({
- "commitInfo": {
- "timestamp": now,
- "operation": commit_operation,
- "operationParameters": commit_operation_parameters,
- "isBlindAppend": not removed_files_paths,
- "engineInfo": "DuckLake-Delta-Export-Latest",
- "clientVersion": "delta-rs.0.18.1" if next_delta_version == 0 else "DuckLake-Delta-Python" # Use example clientVersion for v0
- }
- }) + "\n")
-
- print(f"✅ {table_key}: Delta log written v{next_delta_version} (DuckLake snapshot: {latest_ducklake_snapshot_id})")
+ result = create_checkpoint_for_latest_snapshot(con, table_info, data_root)

-
-
-
-
- checkpoint_records = []
-
- # First record: protocol only
- checkpoint_records.append({
- "protocol": {"minReaderVersion": 1, "minWriterVersion": 2},
- "metaData": None,
- "add": None,
- "remove": None,
- "commitInfo": None
- })
+ if result:
+ successful_exports += 1
+ else:
+ print(f"⚠️ {table_key}: No data to export")

- # Second record: metadata only
- checkpoint_meta_id = existing_meta_id if existing_meta_id else str(uuid.uuid4())
- checkpoint_records.append({
- "protocol": None,
- "commitInfo": None,
- "remove": None,
- "add": None,
- "metaData": {
- "id": checkpoint_meta_id,
- "name": row.table_name if row.table_name else None,
- "description": None,
- "format": {"provider": "parquet", "options": {}},
- "schemaString": create_spark_schema_string(schema_fields),
- "partitionColumns": [],
- "createdTime": now,
- "configuration": {"delta.logRetentionDuration": "interval 1 hour"}
- },
- })
-
- # Add all current files from the latest DuckLake snapshot to the checkpoint
- for af_path in current_file_paths:
- af = current_files_map[af_path]
- checkpoint_records.append({
- "protocol": None,
- "metaData": None,
- "remove": None,
- "commitInfo": None,
- "add": {
- "path": af["path"],
- "partitionValues": {},
- "size": af["size"],
- "modificationTime": af["modification_time"],
- "dataChange": True,
- "stats": af["stats"],
- "tags": None # Set to null as per example
- # Removed deletionVector, baseRowId, defaultRowCommitVersion, clusteringProvider
- },
- })
-
- # Create PyArrow table with proper handling of None values
- table_data = {
- 'protocol': [record.get("protocol") for record in checkpoint_records],
- 'metaData': [record.get("metaData") for record in checkpoint_records],
- 'add': [record.get("add") for record in checkpoint_records],
- 'remove': [record.get("remove") for record in checkpoint_records],
- 'commitInfo': [record.get("commitInfo") for record in checkpoint_records]
- }
-
- # Create table directly with target schema to avoid casting issues
- target_schema = get_spark_checkpoint_schema()
- table = pa.table(table_data, schema=target_schema)
- pq.write_table(table, checkpoint_file, compression='snappy')
-
- with open(os.path.join(delta_log_path, "_last_checkpoint"), 'w') as f:
- json.dump({"version": next_delta_version, "size": len(checkpoint_records)}, f)
-
- print(f"📸 {table_key}: Checkpoint created at Delta version {next_delta_version} (DuckLake snapshot: {latest_ducklake_snapshot_id})")
-
- # --- Cleanup old JSON log files and Checkpoint files ---
- print(f"🧹 {table_key}: Cleaning up old log and checkpoint files before Delta version {next_delta_version}...")
- for f_name in os.listdir(delta_log_path):
- base_name = f_name.split('.')[0]
- # Check for versioned JSON log files
- if f_name.endswith('.json') and base_name.startswith('0000') and base_name.isdigit():
- log_version = int(base_name)
- if log_version < next_delta_version:
- file_to_delete = os.path.join(delta_log_path, f_name)
- try:
- os.remove(file_to_delete)
- print(f" Deleted JSON log: {f_name}")
- except OSError as e:
- print(f" Error deleting JSON log {f_name}: {e}")
- # Check for versioned Parquet checkpoint files
- elif f_name.endswith('.checkpoint.parquet'):
- checkpoint_base_name = f_name.split('.checkpoint.parquet')[0]
- if checkpoint_base_name.startswith('0000') and checkpoint_base_name.isdigit():
- checkpoint_version = int(checkpoint_base_name)
- if checkpoint_version < next_delta_version:
- file_to_delete = os.path.join(delta_log_path, f_name)
- try:
- os.remove(file_to_delete)
- print(f" Deleted checkpoint: {f_name}")
- except OSError as e:
- print(f" Error deleting checkpoint {f_name}: {e}")
- print(f"🧹 {table_key}: Cleanup complete.")
-
- elif next_delta_version > 0 and next_delta_version % checkpoint_interval == 0 and os.path.exists(checkpoint_file):
- print(f"⏩ {table_key}: Checkpoint for Delta version {next_delta_version} (DuckLake snapshot: {latest_ducklake_snapshot_id}) already exists, skipping generation.")
-
  except Exception as e:
- print(f"❌
-
- # but for this script, we just log and continue to next table.
-
+ print(f"❌ {table_key}: Failed to export checkpoint - {e}")
+
  con.close()
- print("
+ print(f"\n🎉 Export completed! {successful_exports}/{total_tables} tables exported successfully.")
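The heart of the 0.1.4 rewrite shown above is that pyarrow is gone: the checkpoint Parquet file, the dummy JSON commit, and `_last_checkpoint` are all produced by DuckDB `COPY` statements. A minimal standalone sketch of that write pattern follows; the paths, the dummy relation, and the version/size values are hypothetical stand-ins for what `create_checkpoint_for_latest_snapshot` builds from DuckLake metadata.

```python
import os
import duckdb

# Hypothetical table location and counters, standing in for values the
# exporter derives from DuckLake metadata.
table_root = "/tmp/demo_table/"
delta_version = 0
total_records = 2  # protocol + metadata rows in this toy example

os.makedirs(table_root + "_delta_log", exist_ok=True)

# A stand-in for the real checkpoint_table built from protocol/metaData/add rows.
duckdb.execute("CREATE OR REPLACE TABLE checkpoint_table AS SELECT 1 AS dummy")

# Write the checkpoint file straight from DuckDB, no pyarrow involved.
checkpoint_file = table_root + f"_delta_log/{delta_version:020d}.checkpoint.parquet"
duckdb.execute(f"COPY (SELECT * FROM checkpoint_table) TO '{checkpoint_file}' (FORMAT PARQUET)")

# _last_checkpoint is a single JSON object on one line; ARRAY false keeps the
# output newline-delimited instead of wrapping it in a JSON array.
duckdb.execute(f"""
    COPY (SELECT {delta_version} AS version, {total_records} AS size)
    TO '{table_root}_delta_log/_last_checkpoint' (FORMAT JSON, ARRAY false)
""")
```

The package itself creates the `_delta_log` directory with a throwaway `COPY ... (FORMAT PARQUET, PER_THREAD_OUTPUT, OVERWRITE_OR_IGNORE)` instead of `os.makedirs`, which is what lets the same code target object stores such as S3 or GCS.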
ducklake_delta_exporter-0.1.4.dist-info/METADATA ADDED

@@ -0,0 +1,123 @@
+ Metadata-Version: 2.4
+ Name: ducklake-delta-exporter
+ Version: 0.1.4
+ Summary: A utility to export DuckLake database metadata to Delta Lake transaction logs.
+ Home-page: https://github.com/djouallah/ducklake_delta_exporter
+ Author: mim
+ Author-email: your.email@example.com
+ Classifier: Programming Language :: Python :: 3
+ Classifier: License :: OSI Approved :: MIT License
+ Classifier: Operating System :: OS Independent
+ Classifier: Intended Audience :: Developers
+ Classifier: Topic :: Software Development :: Libraries :: Python Modules
+ Classifier: Development Status :: 3 - Alpha
+ Requires-Python: >=3.8
+ Description-Content-Type: text/markdown
+ Requires-Dist: duckdb
+ Dynamic: author
+ Dynamic: author-email
+ Dynamic: classifier
+ Dynamic: description
+ Dynamic: description-content-type
+ Dynamic: home-page
+ Dynamic: requires-dist
+ Dynamic: requires-python
+ Dynamic: summary
+
+ # DuckLake Delta Exporter
+
+ A Python package for exporting DuckLake snapshots as Delta Lake checkpoint files, enabling compatibility with Delta Lake readers, support local path, s3 and gcs, for onelake use mounted storage as azure storage is not supported
+
+ this is just a fun project, please vote for a proper support in duckdb https://github.com/duckdb/duckdb-delta/issues/218
+
+ ## Repository
+
+ https://github.com/djouallah/ducklake_delta_exporter
+
+ ## Installation
+
+ ```bash
+ pip install ducklake-delta-exporter
+ ```
+
+ ## Usage
+
+ ```python
+ from ducklake_delta_exporter import generate_latest_delta_log
+
+ # Export all tables from a DuckLake database
+ generate_latest_delta_log("/path/to/ducklake.db")
+
+ # Specify a custom data root directory
+ generate_latest_delta_log("/path/to/ducklake.db", data_root="/custom/data/path")
+ ```
+
+ ## What it does
+
+ This package converts DuckLake table snapshots into Delta Lake format by:
+
+ 1. **Reading DuckLake metadata** - Extracts table schemas, file paths, and snapshot information
+ 2. **Creating Delta checkpoint files** - Generates `.checkpoint.parquet` files with Delta Lake metadata
+ 3. **Writing JSON transaction logs** - Creates minimal `.json` log files for Spark compatibility
+ 4. **Mapping data types** - Converts DuckDB types to Spark SQL equivalents
+
+ ## Features
+
+ - ✅ **Spark Compatible** - Generated Delta files can be read by Spark and other Delta Lake tools
+ - ✅ **Type Mapping** - Automatic conversion between DuckDB and Spark data types
+ - ✅ **Batch Processing** - Exports all tables in a DuckLake database
+ - ✅ **Error Handling** - Graceful handling of missing snapshots and other issues
+ - ✅ **Progress Reporting** - Clear feedback on export progress and results
+
+ ## Requirements
+
+ - Python 3.8+
+ - DuckDB
+
+ ## File Structure
+
+ After running the exporter, your Delta tables will have the following structure:
+
+ ```
+ your_table/
+ ├── data_file_1.parquet
+ ├── data_file_2.parquet
+ └── _delta_log/
+     ├── 00000000000000000000.json
+     ├── 00000000000000000000.checkpoint.parquet
+     └── _last_checkpoint
+ ```
+
+ ## Type Mapping
+
+ The exporter automatically maps DuckDB types to Spark SQL types:
+
+ | DuckDB Type | Spark Type |
+ |-------------|------------|
+ | INTEGER | integer |
+ | BIGINT | long |
+ | FLOAT | double |
+ | DOUBLE | double |
+ | DECIMAL | decimal(10,0) |
+ | BOOLEAN | boolean |
+ | TIMESTAMP | timestamp |
+ | DATE | date |
+ | VARCHAR | string |
+ | Others | string |
+
+ ## Error Handling
+
+ The exporter handles various error conditions:
+
+ - **Missing snapshots** - Skips tables with no data
+ - **Existing checkpoints** - Avoids overwriting existing files
+ - **Schema changes** - Uses the latest schema for each table
+ - **File system errors** - Reports and continues with other tables
+
+ ## License
+
+ MIT License - see LICENSE file for details.
+
+ ## Contributing
+
+ Contributions are welcome! Please feel free to submit a Pull Request.
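The Type Mapping table in the README above is the contract that `map_type_ducklake_to_spark` implements; the function's body falls outside the changed hunks of this diff, so only its signature is visible. A sketch of that mapping under those assumptions (illustrative only, not the package's exact code):

```python
def map_type_ducklake_to_spark_sketch(t: str) -> str:
    """Illustrative DuckDB -> Spark SQL type mapping per the README table."""
    mapping = {
        "INTEGER": "integer",
        "BIGINT": "long",
        "FLOAT": "double",
        "DOUBLE": "double",
        "DECIMAL": "decimal(10,0)",
        "BOOLEAN": "boolean",
        "TIMESTAMP": "timestamp",
        "DATE": "date",
        "VARCHAR": "string",
    }
    # Anything not listed falls back to string, matching the "Others" row.
    return mapping.get(t.upper(), "string")
```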
ducklake_delta_exporter-0.1.4.dist-info/RECORD ADDED

@@ -0,0 +1,5 @@
+ ducklake_delta_exporter/__init__.py,sha256=LmXUUeR0LPgrlqlqeVmpnwm3JdurMD81GvjB-KeGxLo,14380
+ ducklake_delta_exporter-0.1.4.dist-info/METADATA,sha256=s8rMeyMR00CvfGjaprCOZsOhw9U7EheZ6gFd9VVTh6Y,3949
+ ducklake_delta_exporter-0.1.4.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+ ducklake_delta_exporter-0.1.4.dist-info/top_level.txt,sha256=cGISjIUrP9eP3UexjiCEWnWy8N5woIBV2QVF21OgdtQ,24
+ ducklake_delta_exporter-0.1.4.dist-info/RECORD,,
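Each RECORD entry above follows the wheel format `path,sha256=<urlsafe base64 digest, unpadded>,size`; the rewritten `__init__.py` is 14,380 bytes versus 24,868 in 0.1.2. A small hypothetical helper (not part of the package) to recompute such an entry for a local file:

```python
import base64
import hashlib

def record_entry(path):
    """Return (digest, size) in the wheel RECORD style for a local file."""
    with open(path, "rb") as f:
        data = f.read()
    digest = base64.urlsafe_b64encode(hashlib.sha256(data).digest()).rstrip(b"=")
    return "sha256=" + digest.decode("ascii"), len(data)

# Example with a hypothetical local checkout:
# record_entry("ducklake_delta_exporter/__init__.py")
```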
ducklake_delta_exporter-0.1.2.dist-info/METADATA REMOVED

@@ -1,72 +0,0 @@
- Metadata-Version: 2.4
- Name: ducklake-delta-exporter
- Version: 0.1.2
- Summary: A utility to export DuckLake database metadata to Delta Lake transaction logs.
- Home-page: https://github.com/djouallah/ducklake_delta_exporter
- Author: mim
- Author-email: your.email@example.com
- Classifier: Programming Language :: Python :: 3
- Classifier: License :: OSI Approved :: MIT License
- Classifier: Operating System :: OS Independent
- Classifier: Intended Audience :: Developers
- Classifier: Topic :: Software Development :: Libraries :: Python Modules
- Classifier: Development Status :: 3 - Alpha
- Requires-Python: >=3.8
- Description-Content-Type: text/markdown
- Requires-Dist: duckdb
- Requires-Dist: pyarrow
- Dynamic: author
- Dynamic: author-email
- Dynamic: classifier
- Dynamic: description
- Dynamic: description-content-type
- Dynamic: home-page
- Dynamic: requires-dist
- Dynamic: requires-python
- Dynamic: summary
-
- # 🦆 DuckLake Delta Exporter
-
- A Python utility to **bridge the gap between DuckLake and Delta Lake** by generating Delta-compatible transaction logs directly from DuckLake metadata.
-
- This isn’t your typical general-purpose library. It’s mostly battle-tested with **OneLake mounted storage**, and while it *should* work with local filesystems, there’s **no support for S3, GCS, or ABFSS**.
-
- It doesn’t use the `deltalake` Python package either. The metadata is handcrafted from scratch — because why not reinvent the wheel for fun and learning?
-
- **Goal?**
- Mostly to annoy DuckDB developers into finally shipping a proper Delta Lake metadata exporter 😎
-
- 🔗 [Source code on GitHub](https://github.com/djouallah/ducklake_delta_exporter)
-
- ---
-
- ## ✨ Features
-
- - **DuckLake → Delta Sync**
-   Generates Delta Lake `_delta_log/*.json` transaction files and Parquet checkpoints from the latest DuckLake state.
-
- - **Schema Mapping**
-   Converts DuckDB types to their Spark SQL equivalents so Delta can understand them without throwing a tantrum.
-
- - **Change Detection**
-   Detects file-level additions/removals since the last export — keeps things incremental and tidy.
-
- - **Checkpointing**
-   Automatically writes Delta checkpoints every N versions (configurable), so readers don’t have to replay the entire log from scratch.
-
- ---
-
- ## ⚙️ Installation & Usage
-
- Install via pip:
-
- ```bash
- pip install ducklake-delta-exporter
- ```
-
- ```
- from ducklake_delta_exporter import generate_latest_delta_log
-
- generate_latest_delta_log('/lakehouse/default/Files/meta.db','/lakehouse/default/Tables')
- ```
- the data path is optional, but handy to support relative path
ducklake_delta_exporter-0.1.2.dist-info/RECORD REMOVED

@@ -1,5 +0,0 @@
- ducklake_delta_exporter/__init__.py,sha256=kZxTL8QRsbraNkZ9xvQZEipalRikMCd6DDJLTmvHkso,24868
- ducklake_delta_exporter-0.1.2.dist-info/METADATA,sha256=NEBJN3FsL_m64F2SpPLTX8uqOnQDKwDjRVvKZJR0JY4,2630
- ducklake_delta_exporter-0.1.2.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
- ducklake_delta_exporter-0.1.2.dist-info/top_level.txt,sha256=cGISjIUrP9eP3UexjiCEWnWy8N5woIBV2QVF21OgdtQ,24
- ducklake_delta_exporter-0.1.2.dist-info/RECORD,,
{ducklake_delta_exporter-0.1.2.dist-info → ducklake_delta_exporter-0.1.4.dist-info}/WHEEL RENAMED

File without changes

{ducklake_delta_exporter-0.1.2.dist-info → ducklake_delta_exporter-0.1.4.dist-info}/top_level.txt RENAMED

File without changes