ducklake-delta-exporter 0.1.4__tar.gz → 0.2.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {ducklake_delta_exporter-0.1.4 → ducklake_delta_exporter-0.2.0}/PKG-INFO +2 -2
- ducklake_delta_exporter-0.2.0/ducklake_delta_exporter/__init__.py +486 -0
- {ducklake_delta_exporter-0.1.4 → ducklake_delta_exporter-0.2.0}/ducklake_delta_exporter.egg-info/PKG-INFO +2 -2
- ducklake_delta_exporter-0.2.0/ducklake_delta_exporter.egg-info/requires.txt +1 -0
- {ducklake_delta_exporter-0.1.4 → ducklake_delta_exporter-0.2.0}/setup.py +2 -2
- ducklake_delta_exporter-0.1.4/ducklake_delta_exporter/__init__.py +0 -324
- ducklake_delta_exporter-0.1.4/ducklake_delta_exporter.egg-info/requires.txt +0 -1
- {ducklake_delta_exporter-0.1.4 → ducklake_delta_exporter-0.2.0}/README.md +0 -0
- {ducklake_delta_exporter-0.1.4 → ducklake_delta_exporter-0.2.0}/ducklake_delta_exporter.egg-info/SOURCES.txt +0 -0
- {ducklake_delta_exporter-0.1.4 → ducklake_delta_exporter-0.2.0}/ducklake_delta_exporter.egg-info/dependency_links.txt +0 -0
- {ducklake_delta_exporter-0.1.4 → ducklake_delta_exporter-0.2.0}/ducklake_delta_exporter.egg-info/top_level.txt +0 -0
- {ducklake_delta_exporter-0.1.4 → ducklake_delta_exporter-0.2.0}/setup.cfg +0 -0

{ducklake_delta_exporter-0.1.4 → ducklake_delta_exporter-0.2.0}/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: ducklake-delta-exporter
-Version: 0.1.4
+Version: 0.2.0
 Summary: A utility to export DuckLake database metadata to Delta Lake transaction logs.
 Home-page: https://github.com/djouallah/ducklake_delta_exporter
 Author: mim

@@ -13,7 +13,7 @@ Classifier: Topic :: Software Development :: Libraries :: Python Modules
 Classifier: Development Status :: 3 - Alpha
 Requires-Python: >=3.8
 Description-Content-Type: text/markdown
-Requires-Dist: duckdb
+Requires-Dist: duckdb>=1.4.4
 Dynamic: author
 Dynamic: author-email
 Dynamic: classifier
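
The only dependency change in this release is the tightened pin Requires-Dist: duckdb>=1.4.4, which the new exporter relies on for writing directly to remote object stores. As a hedged illustration (this guard is not part of the package), a caller could verify the installed engine before invoking the exporter:

import duckdb

# Pre-flight check written by the caller, not shipped with ducklake-delta-exporter.
major, minor, patch = (int(part) for part in duckdb.__version__.split(".")[:3])
if (major, minor, patch) < (1, 4, 4):
    raise RuntimeError(f"duckdb {duckdb.__version__} is too old; 0.2.0 declares duckdb>=1.4.4")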

ducklake_delta_exporter-0.2.0/ducklake_delta_exporter/__init__.py

@@ -0,0 +1,486 @@
+# File: ducklake_delta_exporter.py
+import duckdb
+
+
+def generate_latest_delta_log(db_path: str):
+    """
+    Export the latest DuckLake snapshot for each table as Delta checkpoint files.
+    Uses DuckDB 1.4.4+ native support for writing to abfss://, s3://, etc.
+
+    Args:
+        db_path (str): The path to the DuckLake database file (or connection string).
+    """
+    # For remote paths (abfss://, s3://, etc.), use in-memory connection with ATTACH
+    is_remote = any(db_path.startswith(prefix) for prefix in ['abfss://', 's3://', 'gs://', 'az://', 'http://', 'https://'])
+
+    if is_remote:
+        con = duckdb.connect()
+        # Load required extensions for cloud storage
+        if db_path.startswith('abfss://') or db_path.startswith('az://'):
+            con.execute("LOAD azure")
+            # Load persistent secrets
+            con.execute("SELECT * FROM duckdb_secrets()")
+        elif db_path.startswith('s3://'):
+            con.execute("LOAD httpfs")
+        con.execute(f"ATTACH '{db_path}' AS ducklake_db (READ_ONLY)")
+        con.execute("USE ducklake_db")
+    else:
+        con = duckdb.connect(db_path, read_only=True)
+
+    # Build export summary - identify which tables have data
+    con.execute("""
+        CREATE OR REPLACE TEMP TABLE export_summary AS
+        WITH
+        data_root_config AS (
+            SELECT value AS data_root FROM ducklake_metadata WHERE key = 'data_path'
+        ),
+        active_tables AS (
+            SELECT
+                t.table_id,
+                t.table_name,
+                s.schema_name,
+                t.path AS table_path,
+                s.path AS schema_path,
+                rtrim((SELECT data_root FROM data_root_config), '/') || '/' ||
+                CASE
+                    WHEN trim(s.path, '/') != '' THEN trim(s.path, '/') || '/'
+                    ELSE ''
+                END ||
+                trim(t.path, '/') AS table_root
+            FROM ducklake_table t
+            JOIN ducklake_schema s USING(schema_id)
+            WHERE t.end_snapshot IS NULL
+        ),
+        current_snapshot AS (
+            SELECT MAX(snapshot_id) AS snapshot_id FROM ducklake_snapshot
+        ),
+        table_last_modified AS (
+            SELECT
+                t.*,
+                COALESCE(
+                    (SELECT MAX(sc.snapshot_id)
+                     FROM ducklake_snapshot_changes sc
+                     WHERE regexp_matches(sc.changes_made, '[:,]' || t.table_id || '([^0-9]|$)')
+                    ),
+                    (SELECT cs.snapshot_id
+                     FROM current_snapshot cs
+                     WHERE EXISTS (
+                         SELECT 1 FROM ducklake_data_file df
+                         WHERE df.table_id = t.table_id
+                         AND df.end_snapshot IS NULL
+                     )
+                    )
+                ) AS last_modified_snapshot,
+                (SELECT COUNT(*) FROM ducklake_data_file df
+                 WHERE df.table_id = t.table_id
+                 AND df.end_snapshot IS NULL
+                ) AS file_count
+            FROM active_tables t
+        )
+        SELECT
+            table_id,
+            schema_name,
+            table_name,
+            table_root,
+            CASE
+                WHEN file_count = 0 THEN 'no_data_files'
+                WHEN last_modified_snapshot IS NULL THEN 'no_changes'
+                ELSE 'needs_export'
+            END AS status,
+            last_modified_snapshot AS snapshot_id,
+            file_count
+        FROM table_last_modified
+    """)
+
+    # Get tables that need export
+    tables_to_export = con.execute("""
+        SELECT table_id, schema_name, table_name, table_root, snapshot_id, file_count
+        FROM export_summary
+        WHERE status = 'needs_export'
+    """).fetchall()
+
+    # Show summary
+    summary = con.execute("""
+        SELECT status, COUNT(*) as cnt FROM export_summary GROUP BY status
+    """).fetchall()
+
+    for status, cnt in summary:
+        print(f" {status}: {cnt} tables")
+
+    if not tables_to_export:
+        print("\n✅ No tables need export.")
+        con.close()
+        return
+
+    print(f"\n📦 Exporting {len(tables_to_export)} tables...")
+
+    # Process each table
+    for table_id, schema_name, table_name, table_root, snapshot_id, file_count in tables_to_export:
+        table_key = f"{schema_name}.{table_name}"
+
+        # Check if checkpoint already exists for this snapshot
+        checkpoint_path = f"{table_root}/_delta_log/{snapshot_id:020d}.checkpoint.parquet"
+        try:
+            con.execute(f"SELECT 1 FROM '{checkpoint_path}' LIMIT 1")
+            print(f" ⏭️ {table_key}: snapshot {snapshot_id} already exported")
+            continue
+        except Exception:
+            pass # File doesn't exist, proceed with export
+
+        print(f"\n Processing {table_key}...")
+
+        try:
+            # Build checkpoint parquet data for this table
+            con.execute("""
+                CREATE OR REPLACE TEMP TABLE temp_checkpoint_parquet AS
+                WITH
+                table_schemas AS (
+                    SELECT
+                        ? AS table_id,
+                        ? AS table_name,
+                        ? AS snapshot_id,
+                        ? AS table_root,
+                        list({
+                            'name': c.column_name,
+                            'type':
+                                CASE
+                                    WHEN contains(lower(c.column_type), 'int') AND contains(c.column_type, '64') THEN 'long'
+                                    WHEN contains(lower(c.column_type), 'int') THEN 'integer'
+                                    WHEN contains(lower(c.column_type), 'float') THEN 'double'
+                                    WHEN contains(lower(c.column_type), 'double') THEN 'double'
+                                    WHEN contains(lower(c.column_type), 'bool') THEN 'boolean'
+                                    WHEN contains(lower(c.column_type), 'timestamp') THEN 'timestamp'
+                                    WHEN contains(lower(c.column_type), 'date') THEN 'date'
+                                    WHEN contains(lower(c.column_type), 'decimal') THEN lower(c.column_type)
+                                    ELSE 'string'
+                                END,
+                            'nullable': true,
+                            'metadata': MAP{}::MAP(VARCHAR, VARCHAR)
+                        }::STRUCT(name VARCHAR, type VARCHAR, nullable BOOLEAN, metadata MAP(VARCHAR, VARCHAR)) ORDER BY c.column_order) AS schema_fields
+                    FROM ducklake_column c
+                    WHERE c.table_id = ?
+                    AND c.end_snapshot IS NULL
+                ),
+                file_column_stats_agg AS (
+                    SELECT
+                        df.data_file_id,
+                        c.column_name,
+                        ANY_VALUE(c.column_type) AS column_type,
+                        MAX(fcs.value_count) AS value_count,
+                        MIN(fcs.min_value) AS min_value,
+                        MAX(fcs.max_value) AS max_value,
+                        MAX(fcs.null_count) AS null_count
+                    FROM ducklake_data_file df
+                    LEFT JOIN ducklake_file_column_stats fcs ON df.data_file_id = fcs.data_file_id
+                    LEFT JOIN ducklake_column c ON fcs.column_id = c.column_id
+                    WHERE df.table_id = ?
+                    AND df.end_snapshot IS NULL
+                    AND c.column_id IS NOT NULL
+                    AND c.end_snapshot IS NULL
+                    GROUP BY df.data_file_id, c.column_name
+                ),
+                file_column_stats_transformed AS (
+                    SELECT
+                        fca.data_file_id,
+                        fca.column_name,
+                        fca.column_type,
+                        fca.value_count,
+                        fca.null_count,
+                        CASE
+                            WHEN fca.min_value IS NULL THEN NULL
+                            WHEN contains(lower(fca.column_type), 'timestamp') THEN
+                                regexp_replace(
+                                    regexp_replace(replace(fca.min_value, ' ', 'T'), '[+-]\\d{2}(?::\\d{2})?$', ''),
+                                    '^([^.]+)$', '\\1.000'
+                                ) || 'Z'
+                            WHEN contains(lower(fca.column_type), 'date') THEN fca.min_value
+                            WHEN contains(lower(fca.column_type), 'bool') THEN CAST(lower(fca.min_value) IN ('true', 't', '1', 'yes') AS VARCHAR)
+                            WHEN contains(lower(fca.column_type), 'int') OR contains(lower(fca.column_type), 'float')
+                                OR contains(lower(fca.column_type), 'double') OR contains(lower(fca.column_type), 'decimal') THEN
+                                CASE WHEN contains(fca.min_value, '.') OR contains(lower(fca.min_value), 'e')
+                                    THEN CAST(TRY_CAST(fca.min_value AS DOUBLE) AS VARCHAR)
+                                    ELSE CAST(TRY_CAST(fca.min_value AS BIGINT) AS VARCHAR)
+                                END
+                            ELSE fca.min_value
+                        END AS transformed_min,
+                        CASE
+                            WHEN fca.max_value IS NULL THEN NULL
+                            WHEN contains(lower(fca.column_type), 'timestamp') THEN
+                                regexp_replace(
+                                    regexp_replace(replace(fca.max_value, ' ', 'T'), '[+-]\\d{2}(?::\\d{2})?$', ''),
+                                    '^([^.]+)$', '\\1.000'
+                                ) || 'Z'
+                            WHEN contains(lower(fca.column_type), 'date') THEN fca.max_value
+                            WHEN contains(lower(fca.column_type), 'bool') THEN CAST(lower(fca.max_value) IN ('true', 't', '1', 'yes') AS VARCHAR)
+                            WHEN contains(lower(fca.column_type), 'int') OR contains(lower(fca.column_type), 'float')
+                                OR contains(lower(fca.column_type), 'double') OR contains(lower(fca.column_type), 'decimal') THEN
+                                CASE WHEN contains(fca.max_value, '.') OR contains(lower(fca.max_value), 'e')
+                                    THEN CAST(TRY_CAST(fca.max_value AS DOUBLE) AS VARCHAR)
+                                    ELSE CAST(TRY_CAST(fca.max_value AS BIGINT) AS VARCHAR)
+                                END
+                            ELSE fca.max_value
+                        END AS transformed_max
+                    FROM file_column_stats_agg fca
+                ),
+                file_metadata AS (
+                    SELECT
+                        ts.table_id,
+                        ts.table_name,
+                        ts.snapshot_id,
+                        ts.table_root,
+                        ts.schema_fields,
+                        df.data_file_id,
+                        df.path AS file_path,
+                        df.file_size_bytes,
+                        COALESCE(MAX(fct.value_count), 0) AS num_records,
+                        COALESCE(map_from_entries(list({
+                            'key': fct.column_name,
+                            'value': fct.transformed_min
+                        } ORDER BY fct.column_name) FILTER (WHERE fct.column_name IS NOT NULL AND fct.transformed_min IS NOT NULL)), MAP{}::MAP(VARCHAR, VARCHAR)) AS min_values,
+                        COALESCE(map_from_entries(list({
+                            'key': fct.column_name,
+                            'value': fct.transformed_max
+                        } ORDER BY fct.column_name) FILTER (WHERE fct.column_name IS NOT NULL AND fct.transformed_max IS NOT NULL)), MAP{}::MAP(VARCHAR, VARCHAR)) AS max_values,
+                        COALESCE(map_from_entries(list({
+                            'key': fct.column_name,
+                            'value': fct.null_count
+                        } ORDER BY fct.column_name) FILTER (WHERE fct.column_name IS NOT NULL AND fct.null_count IS NOT NULL)), MAP{}::MAP(VARCHAR, BIGINT)) AS null_count
+                    FROM table_schemas ts
+                    JOIN ducklake_data_file df ON df.table_id = ts.table_id
+                    LEFT JOIN file_column_stats_transformed fct ON df.data_file_id = fct.data_file_id
+                    WHERE df.end_snapshot IS NULL
+                    GROUP BY ts.table_id, ts.table_name, ts.snapshot_id,
+                        ts.table_root, ts.schema_fields, df.data_file_id, df.path, df.file_size_bytes
+                ),
+                table_aggregates AS (
+                    SELECT
+                        table_id,
+                        table_name,
+                        snapshot_id,
+                        table_root,
+                        schema_fields,
+                        COUNT(*) AS num_files,
+                        SUM(num_records) AS total_rows,
+                        SUM(file_size_bytes) AS total_bytes,
+                        list({
+                            'path': ltrim(file_path, '/'),
+                            'partitionValues': MAP{}::MAP(VARCHAR, VARCHAR),
+                            'size': file_size_bytes,
+                            'modificationTime': epoch_ms(now()),
+                            'dataChange': true,
+                            'stats': COALESCE(to_json({
+                                'numRecords': COALESCE(num_records, 0),
+                                'minValues': COALESCE(min_values, MAP{}::MAP(VARCHAR, VARCHAR)),
+                                'maxValues': COALESCE(max_values, MAP{}::MAP(VARCHAR, VARCHAR)),
+                                'nullCount': COALESCE(null_count, MAP{}::MAP(VARCHAR, BIGINT))
+                            }), '{"numRecords":0}'),
+                            'tags': MAP{}::MAP(VARCHAR, VARCHAR)
+                        }::STRUCT(
+                            path VARCHAR,
+                            partitionValues MAP(VARCHAR, VARCHAR),
+                            size BIGINT,
+                            modificationTime BIGINT,
+                            dataChange BOOLEAN,
+                            stats VARCHAR,
+                            tags MAP(VARCHAR, VARCHAR)
+                        )) AS add_entries
+                    FROM file_metadata
+                    GROUP BY table_id, table_name, snapshot_id, table_root, schema_fields
+                ),
+                checkpoint_data AS (
+                    SELECT
+                        ta.*,
+                        epoch_ms(now()) AS now_ms,
+                        uuid()::VARCHAR AS txn_id,
+                        (substring(md5(ta.table_id::VARCHAR || '-metadata'), 1, 8) || '-' ||
+                         substring(md5(ta.table_id::VARCHAR || '-metadata'), 9, 4) || '-' ||
+                         substring(md5(ta.table_id::VARCHAR || '-metadata'), 13, 4) || '-' ||
+                         substring(md5(ta.table_id::VARCHAR || '-metadata'), 17, 4) || '-' ||
+                         substring(md5(ta.table_id::VARCHAR || '-metadata'), 21, 12)) AS meta_id,
+                        to_json({'type': 'struct', 'fields': ta.schema_fields}) AS schema_string
+                    FROM table_aggregates ta
+                ),
+                checkpoint_parquet_data AS (
+                    SELECT
+                        cd.table_id,
+                        cd.table_name,
+                        cd.snapshot_id,
+                        cd.table_root,
+                        cd.meta_id,
+                        cd.now_ms,
+                        cd.txn_id,
+                        cd.schema_string,
+                        cd.num_files,
+                        cd.total_rows,
+                        cd.total_bytes,
+                        {'minReaderVersion': 1, 'minWriterVersion': 2} AS protocol,
+                        NULL AS metaData,
+                        NULL AS add,
+                        NULL::STRUCT(path VARCHAR, deletionTimestamp BIGINT, dataChange BOOLEAN) AS remove,
+                        NULL::STRUCT(timestamp TIMESTAMP, operation VARCHAR, operationParameters MAP(VARCHAR, VARCHAR), isolationLevel VARCHAR, isBlindAppend BOOLEAN, operationMetrics MAP(VARCHAR, VARCHAR), engineInfo VARCHAR, txnId VARCHAR) AS commitInfo,
+                        1 AS row_order
+                    FROM checkpoint_data cd
+                    UNION ALL
+                    SELECT
+                        cd.table_id,
+                        cd.table_name,
+                        cd.snapshot_id,
+                        cd.table_root,
+                        cd.meta_id,
+                        cd.now_ms,
+                        cd.txn_id,
+                        cd.schema_string,
+                        cd.num_files,
+                        cd.total_rows,
+                        cd.total_bytes,
+                        NULL AS protocol,
+                        {
+                            'id': cd.meta_id,
+                            'name': cd.table_name,
+                            'format': {'provider': 'parquet', 'options': MAP{}::MAP(VARCHAR, VARCHAR)}::STRUCT(provider VARCHAR, options MAP(VARCHAR, VARCHAR)),
+                            'schemaString': cd.schema_string,
+                            'partitionColumns': []::VARCHAR[],
+                            'createdTime': cd.now_ms,
+                            'configuration': MAP{}::MAP(VARCHAR, VARCHAR)
+                        }::STRUCT(id VARCHAR, name VARCHAR, format STRUCT(provider VARCHAR, options MAP(VARCHAR, VARCHAR)), schemaString VARCHAR, partitionColumns VARCHAR[], createdTime BIGINT, configuration MAP(VARCHAR, VARCHAR)) AS metaData,
+                        NULL AS add,
+                        NULL::STRUCT(path VARCHAR, deletionTimestamp BIGINT, dataChange BOOLEAN) AS remove,
+                        NULL::STRUCT(timestamp TIMESTAMP, operation VARCHAR, operationParameters MAP(VARCHAR, VARCHAR), isolationLevel VARCHAR, isBlindAppend BOOLEAN, operationMetrics MAP(VARCHAR, VARCHAR), engineInfo VARCHAR, txnId VARCHAR) AS commitInfo,
+                        2 AS row_order
+                    FROM checkpoint_data cd
+                    UNION ALL
+                    SELECT
+                        cd.table_id,
+                        cd.table_name,
+                        cd.snapshot_id,
+                        cd.table_root,
+                        cd.meta_id,
+                        cd.now_ms,
+                        cd.txn_id,
+                        cd.schema_string,
+                        cd.num_files,
+                        cd.total_rows,
+                        cd.total_bytes,
+                        NULL AS protocol,
+                        NULL AS metaData,
+                        unnest(cd.add_entries) AS add,
+                        NULL::STRUCT(path VARCHAR, deletionTimestamp BIGINT, dataChange BOOLEAN) AS remove,
+                        NULL::STRUCT(timestamp TIMESTAMP, operation VARCHAR, operationParameters MAP(VARCHAR, VARCHAR), isolationLevel VARCHAR, isBlindAppend BOOLEAN, operationMetrics MAP(VARCHAR, VARCHAR), engineInfo VARCHAR, txnId VARCHAR) AS commitInfo,
+                        3 AS row_order
+                    FROM checkpoint_data cd
+                )
+                SELECT * FROM checkpoint_parquet_data
+            """, [table_id, table_name, snapshot_id, table_root, table_id, table_id])
+
+            # Build JSON log content
+            con.execute("""
+                CREATE OR REPLACE TEMP TABLE temp_checkpoint_json AS
+                SELECT DISTINCT
+                    p.table_id,
+                    p.table_root,
+                    p.snapshot_id,
+                    p.num_files,
+                    to_json({
+                        'commitInfo': {
+                            'timestamp': p.now_ms,
+                            'operation': 'CONVERT',
+                            'operationParameters': {
+                                'convertedFrom': 'DuckLake',
+                                'duckLakeSnapshotId': p.snapshot_id::VARCHAR,
+                                'partitionBy': '[]'
+                            },
+                            'isolationLevel': 'Serializable',
+                            'isBlindAppend': false,
+                            'operationMetrics': {
+                                'numFiles': p.num_files::VARCHAR,
+                                'numOutputRows': p.total_rows::VARCHAR,
+                                'numOutputBytes': p.total_bytes::VARCHAR
+                            },
+                            'engineInfo': 'DuckLake-Delta-Exporter/1.0.0',
+                            'txnId': p.txn_id
+                        }
+                    }) || chr(10) ||
+                    to_json({
+                        'metaData': {
+                            'id': p.meta_id,
+                            'name': p.table_name,
+                            'format': {'provider': 'parquet', 'options': MAP{}},
+                            'schemaString': p.schema_string::VARCHAR,
+                            'partitionColumns': [],
+                            'createdTime': p.now_ms,
+                            'configuration': MAP{}
+                        }
+                    }) || chr(10) ||
+                    to_json({
+                        'protocol': {'minReaderVersion': 1, 'minWriterVersion': 2}
+                    }) AS content
+                FROM temp_checkpoint_parquet p
+                WHERE p.row_order = 1
+            """)
+
+            # Build last checkpoint content
+            con.execute("""
+                CREATE OR REPLACE TEMP TABLE temp_last_checkpoint AS
+                SELECT
+                    table_id,
+                    table_root,
+                    snapshot_id,
+                    '{"version":' || snapshot_id || ',"size":' || (2 + num_files) || '}' AS content
+                FROM temp_checkpoint_parquet
+                WHERE row_order = 1
+            """)
+
+            # Get file paths
+            paths = con.execute("""
+                SELECT
+                    table_root || '/_delta_log/' || lpad(snapshot_id::VARCHAR, 20, '0') || '.checkpoint.parquet' AS checkpoint_file,
+                    table_root || '/_delta_log/' || lpad(snapshot_id::VARCHAR, 20, '0') || '.json' AS json_file,
+                    table_root || '/_delta_log/_last_checkpoint' AS last_checkpoint_file,
+                    table_root || '/_delta_log' AS delta_log_path
+                FROM temp_checkpoint_parquet
+                WHERE row_order = 1
+                LIMIT 1
+            """).fetchone()
+
+            checkpoint_file, json_file, last_checkpoint_file, delta_log_path = paths
+
+            # Create delta_log directory for local paths
+            if not any(table_root.startswith(prefix) for prefix in ['abfss://', 's3://', 'gs://', 'az://', 'http://', 'https://']):
+                con.execute(f"""
+                    COPY (SELECT 1 AS id, 1 AS ".duckdb_init")
+                    TO '{delta_log_path}'
+                    (FORMAT CSV, PARTITION_BY (".duckdb_init"), OVERWRITE_OR_IGNORE)
+                """)
+
+            # Write checkpoint parquet
+            con.execute(f"""
+                COPY (SELECT protocol, metaData, add, remove, commitInfo
+                      FROM temp_checkpoint_parquet ORDER BY row_order)
+                TO '{checkpoint_file}' (FORMAT PARQUET)
+            """)
+
+            # Write JSON log
+            con.execute(f"""
+                COPY (SELECT content FROM temp_checkpoint_json)
+                TO '{json_file}' (FORMAT CSV, HEADER false, QUOTE '')
+            """)
+
+            # Write last checkpoint
+            con.execute(f"""
+                COPY (SELECT content FROM temp_last_checkpoint)
+                TO '{last_checkpoint_file}' (FORMAT CSV, HEADER false, QUOTE '')
+            """)
+
+            print(f" ✅ {table_key}: exported snapshot {snapshot_id} ({file_count} files)")
+
+        except Exception as e:
+            print(f" ❌ {table_key}: {e}")
+
+    # Cleanup temp tables
+    con.execute("DROP TABLE IF EXISTS export_summary")
+    con.execute("DROP TABLE IF EXISTS temp_checkpoint_parquet")
+    con.execute("DROP TABLE IF EXISTS temp_checkpoint_json")
+    con.execute("DROP TABLE IF EXISTS temp_last_checkpoint")
+
+    con.close()
+    print("\n🎉 Export completed!")
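
For orientation after the new module above: the 0.2.0 entry point takes only the catalog location and resolves the data root from the ducklake_metadata table itself. A minimal usage sketch, with an illustrative catalog path that is not part of the package:

from ducklake_delta_exporter import generate_latest_delta_log

# Local DuckLake catalog file (illustrative path). Remote catalogs such as
# 's3://bucket/metadata.ducklake' or 'abfss://container/metadata.ducklake' are
# attached read-only by the function itself, as shown in the diff above.
generate_latest_delta_log("metadata.ducklake")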

{ducklake_delta_exporter-0.1.4 → ducklake_delta_exporter-0.2.0}/ducklake_delta_exporter.egg-info/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: ducklake-delta-exporter
-Version: 0.1.4
+Version: 0.2.0
 Summary: A utility to export DuckLake database metadata to Delta Lake transaction logs.
 Home-page: https://github.com/djouallah/ducklake_delta_exporter
 Author: mim

@@ -13,7 +13,7 @@ Classifier: Topic :: Software Development :: Libraries :: Python Modules
 Classifier: Development Status :: 3 - Alpha
 Requires-Python: >=3.8
 Description-Content-Type: text/markdown
-Requires-Dist: duckdb
+Requires-Dist: duckdb>=1.4.4
 Dynamic: author
 Dynamic: author-email
 Dynamic: classifier

ducklake_delta_exporter-0.2.0/ducklake_delta_exporter.egg-info/requires.txt

@@ -0,0 +1 @@
+duckdb>=1.4.4

{ducklake_delta_exporter-0.1.4 → ducklake_delta_exporter-0.2.0}/setup.py

@@ -3,9 +3,9 @@ from setuptools import setup, find_packages
 
 setup(
     name='ducklake-delta-exporter',
-    version='0.1.4',
+    version='0.2.0',
     packages=find_packages(),
-    install_requires=['duckdb'],
+    install_requires=['duckdb>=1.4.4'],
     author='mim',
     author_email='your.email@example.com',
     description='A utility to export DuckLake database metadata to Delta Lake transaction logs.',

ducklake_delta_exporter-0.1.4/ducklake_delta_exporter/__init__.py

@@ -1,324 +0,0 @@
-# File: ducklake_delta_exporter.py
-import json
-import time
-import duckdb
-
-def map_type_ducklake_to_spark(t):
-    """Maps DuckDB data types to their Spark SQL equivalents for the Delta schema."""
-    t = t.lower()
-    if 'int' in t:
-        return 'long' if '64' in t else 'integer'
-    elif 'float' in t:
-        return 'double'
-    elif 'double' in t:
-        return 'double'
-    elif 'decimal' in t:
-        return 'decimal(10,0)'
-    elif 'bool' in t:
-        return 'boolean'
-    elif 'timestamp' in t:
-        return 'timestamp'
-    elif 'date' in t:
-        return 'date'
-    return 'string'
-
-def create_spark_schema_string(fields):
-    """Creates a JSON string for the Spark schema from a list of fields."""
-    return json.dumps({"type": "struct", "fields": fields})
-
-def get_latest_ducklake_snapshot(con, table_id):
-    """
-    Get the latest DuckLake snapshot ID for a table.
-    """
-    latest_snapshot = con.execute(f""" SELECT MAX(begin_snapshot) as latest_snapshot FROM ducklake_data_file WHERE table_id = {table_id} """).fetchone()[0]
-    return latest_snapshot
-
-def get_latest_delta_checkpoint(con, table_id):
-    """
-    check how many times a table has being modified.
-    """
-    delta_checkpoint = con.execute(f""" SELECT count(snapshot_id) FROM ducklake_snapshot_changes
-    where changes_made like '%:{table_id}' or changes_made like '%:{table_id},%' """).fetchone()[0]
-    return delta_checkpoint
-
-def get_file_modification_time(dummy_time):
-    """
-    Return a dummy modification time for parquet files.
-    This avoids the latency of actually reading file metadata.
-
-    Args:
-        dummy_time: Timestamp in milliseconds to use as modification time
-
-    Returns:
-        Modification time in milliseconds
-    """
-    return dummy_time
-
-def create_dummy_json_log(table_root, delta_version, table_info, schema_fields, now):
-    """
-    Create a minimal JSON log file for Spark compatibility using DuckDB.
-    """
-    json_log_file = table_root + f"_delta_log/{delta_version:020d}.json"
-
-    # Create JSON log entries using DuckDB
-    duckdb.execute("DROP TABLE IF EXISTS json_log_table")
-
-    # Protocol entry
-    protocol_json = json.dumps({
-        "protocol": {
-            "minReaderVersion": 1,
-            "minWriterVersion": 2
-        }
-    })
-
-    # Metadata entry
-    metadata_json = json.dumps({
-        "metaData": {
-            "id": str(table_info['table_id']),
-            "name": table_info['table_name'],
-            "description": None,
-            "format": {
-                "provider": "parquet",
-                "options": {}
-            },
-            "schemaString": create_spark_schema_string(schema_fields),
-            "partitionColumns": [],
-            "createdTime": now,
-            "configuration": {
-                "delta.logRetentionDuration": "interval 1 hour"
-            }
-        }
-    })
-
-    # Commit info entry
-    commitinfo_json = json.dumps({
-        "commitInfo": {
-            "timestamp": now,
-            "operation": "CONVERT",
-            "operationParameters": {
-                "convertedFrom": "DuckLake"
-            },
-            "isBlindAppend": True,
-            "engineInfo": "DuckLake-Delta-Exporter",
-            "clientVersion": "1.0.0"
-        }
-    })
-
-    # Create table with JSON entries
-    duckdb.execute("""
-        CREATE TABLE json_log_table AS
-        SELECT ? AS json_line
-        UNION ALL
-        SELECT ? AS json_line
-        UNION ALL
-        SELECT ? AS json_line
-    """, [protocol_json, metadata_json, commitinfo_json])
-
-    # Write JSON log file using DuckDB
-    duckdb.execute(f"COPY (SELECT json_line FROM json_log_table) TO '{json_log_file}' (FORMAT CSV, HEADER false, QUOTE '')")
-
-    # Clean up
-    duckdb.execute("DROP TABLE IF EXISTS json_log_table")
-
-    return json_log_file
-
-def build_file_path(table_root, relative_path):
-    """
-    Build full file path from table root and relative path.
-    Works with both local paths and S3 URLs.
-    """
-    table_root = table_root.rstrip('/')
-    relative_path = relative_path.lstrip('/')
-    return f"{table_root}/{relative_path}"
-
-def create_checkpoint_for_latest_snapshot(con, table_info, data_root):
-    """
-    Create a Delta checkpoint file for the latest DuckLake snapshot.
-    """
-    table_root = data_root.rstrip('/') + '/' + table_info['schema_path'] + table_info['table_path']
-
-    # Get the latest snapshot
-    latest_snapshot = get_latest_ducklake_snapshot(con, table_info['table_id'])
-    if latest_snapshot is None:
-        print(f"⚠️ {table_info['schema_name']}.{table_info['table_name']}: No snapshots found")
-        return False
-    delta_version = get_latest_delta_checkpoint(con, table_info['table_id'])
-    checkpoint_file = table_root + f"_delta_log/{delta_version:020d}.checkpoint.parquet"
-    json_log_file = table_root + f"_delta_log/{delta_version:020d}.json"
-
-    try:
-        con.execute(f"SELECT protocol FROM '{checkpoint_file}' limit 0 ")
-        print(f"⚠️ {table_info['schema_name']}.{table_info['table_name']}: Checkpoint file already exists: {checkpoint_file}")
-    except:
-
-        now = int(time.time() * 1000)
-
-        # Get all files for the latest snapshot
-        file_rows = con.execute(f"""
-            SELECT path, file_size_bytes FROM ducklake_data_file
-            WHERE table_id = {table_info['table_id']}
-            AND begin_snapshot <= {latest_snapshot}
-            AND (end_snapshot IS NULL OR end_snapshot > {latest_snapshot})
-        """).fetchall()
-
-        # Get schema for the latest snapshot
-        columns = con.execute(f"""
-            SELECT column_name, column_type FROM ducklake_column
-            WHERE table_id = {table_info['table_id']}
-            AND begin_snapshot <= {latest_snapshot}
-            AND (end_snapshot IS NULL OR end_snapshot > {latest_snapshot})
-            ORDER BY column_order
-        """).fetchall()
-
-        # Get or generate table metadata ID
-        table_meta_id = str(table_info['table_id'])
-
-        # Prepare schema
-        schema_fields = [
-            {"name": name, "type": map_type_ducklake_to_spark(typ), "nullable": True, "metadata": {}}
-            for name, typ in columns
-        ]
-
-        # Create checkpoint data using DuckDB directly
-        checkpoint_data = []
-
-        # Create checkpoint data directly in DuckDB using proper data types
-        duckdb.execute("DROP TABLE IF EXISTS checkpoint_table")
-
-        # Create the checkpoint table with proper nested structure
-        duckdb.execute("""
-            CREATE TABLE checkpoint_table AS
-            WITH checkpoint_data AS (
-                -- Protocol record
-                SELECT
-                    {'minReaderVersion': 1, 'minWriterVersion': 2}::STRUCT(minReaderVersion INTEGER, minWriterVersion INTEGER) AS protocol,
-                    NULL::STRUCT(id VARCHAR, name VARCHAR, description VARCHAR, format STRUCT(provider VARCHAR, options MAP(VARCHAR, VARCHAR)), schemaString VARCHAR, partitionColumns VARCHAR[], createdTime BIGINT, configuration MAP(VARCHAR, VARCHAR)) AS metaData,
-                    NULL::STRUCT(path VARCHAR, partitionValues MAP(VARCHAR, VARCHAR), size BIGINT, modificationTime BIGINT, dataChange BOOLEAN, stats VARCHAR, tags MAP(VARCHAR, VARCHAR)) AS add,
-                    NULL::STRUCT(path VARCHAR, deletionTimestamp BIGINT, dataChange BOOLEAN) AS remove,
-                    NULL::STRUCT(timestamp TIMESTAMP, operation VARCHAR, operationParameters MAP(VARCHAR, VARCHAR), isBlindAppend BOOLEAN, engineInfo VARCHAR, clientVersion VARCHAR) AS commitInfo
-
-                UNION ALL
-
-                -- Metadata record
-                SELECT
-                    NULL::STRUCT(minReaderVersion INTEGER, minWriterVersion INTEGER) AS protocol,
-                    {
-                        'id': ?,
-                        'name': ?,
-                        'description': NULL,
-                        'format': {'provider': 'parquet', 'options': MAP{}}::STRUCT(provider VARCHAR, options MAP(VARCHAR, VARCHAR)),
-                        'schemaString': ?,
-                        'partitionColumns': []::VARCHAR[],
-                        'createdTime': ?,
-                        'configuration': MAP{'delta.logRetentionDuration': 'interval 1 hour'}
-                    }::STRUCT(id VARCHAR, name VARCHAR, description VARCHAR, format STRUCT(provider VARCHAR, options MAP(VARCHAR, VARCHAR)), schemaString VARCHAR, partitionColumns VARCHAR[], createdTime BIGINT, configuration MAP(VARCHAR, VARCHAR)) AS metaData,
-                    NULL::STRUCT(path VARCHAR, partitionValues MAP(VARCHAR, VARCHAR), size BIGINT, modificationTime BIGINT, dataChange BOOLEAN, stats VARCHAR, tags MAP(VARCHAR, VARCHAR)) AS add,
-                    NULL::STRUCT(path VARCHAR, deletionTimestamp BIGINT, dataChange BOOLEAN) AS remove,
-                    NULL::STRUCT(timestamp TIMESTAMP, operation VARCHAR, operationParameters MAP(VARCHAR, VARCHAR), isBlindAppend BOOLEAN, engineInfo VARCHAR, clientVersion VARCHAR) AS commitInfo
-            )
-            SELECT * FROM checkpoint_data
-        """, [table_meta_id, table_info['table_name'], create_spark_schema_string(schema_fields), now])
-
-        # Add file records
-        for path, size in file_rows:
-            rel_path = path.lstrip('/')
-            full_path = build_file_path(table_root, rel_path)
-            mod_time = get_file_modification_time(now)
-
-            duckdb.execute("""
-                INSERT INTO checkpoint_table
-                SELECT
-                    NULL::STRUCT(minReaderVersion INTEGER, minWriterVersion INTEGER) AS protocol,
-                    NULL::STRUCT(id VARCHAR, name VARCHAR, description VARCHAR, format STRUCT(provider VARCHAR, options MAP(VARCHAR, VARCHAR)), schemaString VARCHAR, partitionColumns VARCHAR[], createdTime BIGINT, configuration MAP(VARCHAR, VARCHAR)) AS metaData,
-                    {
-                        'path': ?,
-                        'partitionValues': MAP{}::MAP(VARCHAR, VARCHAR),
-                        'size': ?,
-                        'modificationTime': ?,
-                        'dataChange': true,
-                        'stats': ?,
-                        'tags': NULL::MAP(VARCHAR, VARCHAR)
-                    }::STRUCT(path VARCHAR, partitionValues MAP(VARCHAR, VARCHAR), size BIGINT, modificationTime BIGINT, dataChange BOOLEAN, stats VARCHAR, tags MAP(VARCHAR, VARCHAR)) AS add,
-                    NULL::STRUCT(path VARCHAR, deletionTimestamp BIGINT, dataChange BOOLEAN) AS remove,
-                    NULL::STRUCT(timestamp TIMESTAMP, operation VARCHAR, operationParameters MAP(VARCHAR, VARCHAR), isBlindAppend BOOLEAN, engineInfo VARCHAR, clientVersion VARCHAR) AS commitInfo
-            """, [rel_path, size, mod_time, json.dumps({"numRecords": None})])
-
-        # Create the _delta_log directory if it doesn't exist
-        duckdb.execute(f"COPY (SELECT 43) TO '{table_root}_delta_log' (FORMAT PARQUET, PER_THREAD_OUTPUT, OVERWRITE_OR_IGNORE)")
-
-        # Write the checkpoint file
-        duckdb.execute(f"COPY (SELECT * FROM checkpoint_table) TO '{checkpoint_file}' (FORMAT PARQUET)")
-
-        # Create dummy JSON log file for Spark compatibility
-        create_dummy_json_log(table_root, delta_version, table_info, schema_fields, now)
-
-        # Write the _last_checkpoint file
-        total_records = 2 + len(file_rows) # protocol + metadata + file records
-        duckdb.execute(f"""
-            COPY (SELECT {delta_version} AS version, {total_records} AS size)
-            TO '{table_root}_delta_log/_last_checkpoint' (FORMAT JSON, ARRAY false)
-        """)
-
-        print(f"✅ Exported DuckLake snapshot {latest_snapshot} as Delta checkpoint v{delta_version}")
-        print(f"✅ Created JSON log file: {json_log_file}")
-
-        # Clean up temporary tables
-        duckdb.execute("DROP TABLE IF EXISTS checkpoint_table")
-
-        return True, delta_version, latest_snapshot
-
-def generate_latest_delta_log(db_path: str, data_root: str = None):
-    """
-    Export the latest DuckLake snapshot for each table as a Delta checkpoint file.
-    Creates both checkpoint files and minimal JSON log files for Spark compatibility.
-
-    Args:
-        db_path (str): The path to the DuckLake database file.
-        data_root (str): The root directory for the lakehouse data.
-    """
-    con = duckdb.connect(db_path, read_only=True)
-
-    if data_root is None:
-        data_root = con.sql("SELECT value FROM ducklake_metadata WHERE key = 'data_path'").fetchone()[0]
-
-    # Get all active tables
-    tables = con.execute("""
-        SELECT
-            t.table_id,
-            t.table_name,
-            s.schema_name,
-            t.path as table_path,
-            s.path as schema_path
-        FROM ducklake_table t
-        JOIN ducklake_schema s USING(schema_id)
-        WHERE t.end_snapshot IS NULL
-    """).fetchall()
-
-    total_tables = len(tables)
-    successful_exports = 0
-
-    for table_row in tables:
-        table_info = {
-            'table_id': table_row[0],
-            'table_name': table_row[1],
-            'schema_name': table_row[2],
-            'table_path': table_row[3],
-            'schema_path': table_row[4]
-        }
-
-        table_key = f"{table_info['schema_name']}.{table_info['table_name']}"
-        print(f"Processing {table_key}...")
-
-        try:
-            result = create_checkpoint_for_latest_snapshot(con, table_info, data_root)
-
-            if result:
-                successful_exports += 1
-            else:
-                print(f"⚠️ {table_key}: No data to export")
-
-        except Exception as e:
-            print(f"❌ {table_key}: Failed to export checkpoint - {e}")
-
-    con.close()
-    print(f"\n🎉 Export completed! {successful_exports}/{total_tables} tables exported successfully.")
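
For comparison with the new module, the removed 0.1.4 implementation exposed the same entry point but accepted an optional data_root override and fell back to the catalog's data_path key when it was omitted. A minimal call under that older signature, with illustrative paths, would have looked like:

from ducklake_delta_exporter import generate_latest_delta_log

# Both arguments are illustrative; data_root could be omitted to let the exporter
# read the 'data_path' value from ducklake_metadata instead.
generate_latest_delta_log(
    db_path="metadata.ducklake",
    data_root="/lakehouse/data/",
)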

ducklake_delta_exporter-0.1.4/ducklake_delta_exporter.egg-info/requires.txt

@@ -1 +0,0 @@
-duckdb

Files without changes:
- {ducklake_delta_exporter-0.1.4 → ducklake_delta_exporter-0.2.0}/README.md
- {ducklake_delta_exporter-0.1.4 → ducklake_delta_exporter-0.2.0}/ducklake_delta_exporter.egg-info/SOURCES.txt
- {ducklake_delta_exporter-0.1.4 → ducklake_delta_exporter-0.2.0}/ducklake_delta_exporter.egg-info/dependency_links.txt
- {ducklake_delta_exporter-0.1.4 → ducklake_delta_exporter-0.2.0}/ducklake_delta_exporter.egg-info/top_level.txt
- {ducklake_delta_exporter-0.1.4 → ducklake_delta_exporter-0.2.0}/setup.cfg