ducklake-delta-exporter 0.1.4__tar.gz → 0.3.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {ducklake_delta_exporter-0.1.4 → ducklake_delta_exporter-0.3.0}/PKG-INFO +2 -2
- ducklake_delta_exporter-0.3.0/ducklake_delta_exporter/__init__.py +487 -0
- {ducklake_delta_exporter-0.1.4 → ducklake_delta_exporter-0.3.0}/ducklake_delta_exporter.egg-info/PKG-INFO +2 -2
- {ducklake_delta_exporter-0.1.4 → ducklake_delta_exporter-0.3.0}/ducklake_delta_exporter.egg-info/SOURCES.txt +2 -1
- ducklake_delta_exporter-0.3.0/ducklake_delta_exporter.egg-info/requires.txt +1 -0
- {ducklake_delta_exporter-0.1.4 → ducklake_delta_exporter-0.3.0}/setup.py +2 -2
- ducklake_delta_exporter-0.3.0/tests/test_stats_transformation.py +656 -0
- ducklake_delta_exporter-0.1.4/ducklake_delta_exporter/__init__.py +0 -324
- ducklake_delta_exporter-0.1.4/ducklake_delta_exporter.egg-info/requires.txt +0 -1
- {ducklake_delta_exporter-0.1.4 → ducklake_delta_exporter-0.3.0}/README.md +0 -0
- {ducklake_delta_exporter-0.1.4 → ducklake_delta_exporter-0.3.0}/ducklake_delta_exporter.egg-info/dependency_links.txt +0 -0
- {ducklake_delta_exporter-0.1.4 → ducklake_delta_exporter-0.3.0}/ducklake_delta_exporter.egg-info/top_level.txt +0 -0
- {ducklake_delta_exporter-0.1.4 → ducklake_delta_exporter-0.3.0}/setup.cfg +0 -0
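
The headline change in 0.3.0 is a rewritten `ducklake_delta_exporter/__init__.py` exposing a single entry point, `generate_latest_delta_log`, plus a tightened `duckdb>=1.4.4` requirement. A minimal usage sketch of that entry point follows; the catalog paths are hypothetical, not part of the package:

from ducklake_delta_exporter import generate_latest_delta_log

# Local DuckLake catalog file: opened directly in read-only mode.
generate_latest_delta_log("/data/lake/metadata.ducklake")

# Remote catalog: the exporter attaches it read-only via ATTACH, so it
# needs DuckDB 1.4.4+ with the azure (or httpfs) extension and a
# configured secret available in the environment.
generate_latest_delta_log("abfss://container@account.dfs.core.windows.net/lake/metadata.ducklake")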

{ducklake_delta_exporter-0.1.4 → ducklake_delta_exporter-0.3.0}/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: ducklake-delta-exporter
-Version: 0.1.4
+Version: 0.3.0
 Summary: A utility to export DuckLake database metadata to Delta Lake transaction logs.
 Home-page: https://github.com/djouallah/ducklake_delta_exporter
 Author: mim
@@ -13,7 +13,7 @@ Classifier: Topic :: Software Development :: Libraries :: Python Modules
 Classifier: Development Status :: 3 - Alpha
 Requires-Python: >=3.8
 Description-Content-Type: text/markdown
-Requires-Dist: duckdb
+Requires-Dist: duckdb>=1.4.4
 Dynamic: author
 Dynamic: author-email
 Dynamic: classifier
ducklake_delta_exporter-0.3.0/ducklake_delta_exporter/__init__.py

@@ -0,0 +1,487 @@
+# File: ducklake_delta_exporter.py
+import duckdb
+
+
+def generate_latest_delta_log(db_path: str):
+    """
+    Export the latest DuckLake snapshot for each table as Delta checkpoint files.
+    Uses DuckDB 1.4.4+ native support for writing to abfss://, s3://, etc.
+
+    Args:
+        db_path (str): The path to the DuckLake database file (or connection string).
+    """
+    # For remote paths (abfss://, s3://, etc.), use in-memory connection with ATTACH
+    is_remote = any(db_path.startswith(prefix) for prefix in ['abfss://', 's3://', 'gs://', 'az://', 'http://', 'https://'])
+
+    if is_remote:
+        con = duckdb.connect()
+        # Load required extensions for cloud storage
+        if db_path.startswith('abfss://') or db_path.startswith('az://'):
+            con.execute("LOAD azure")
+            # Load persistent secrets
+            con.execute("SELECT * FROM duckdb_secrets()")
+        elif db_path.startswith('s3://'):
+            con.execute("LOAD httpfs")
+        con.execute(f"ATTACH '{db_path}' AS ducklake_db (READ_ONLY)")
+        con.execute("USE ducklake_db")
+    else:
+        con = duckdb.connect(db_path, read_only=True)
+
+    # Build export summary - identify which tables have data
+    con.execute("""
+        CREATE OR REPLACE TEMP TABLE export_summary AS
+        WITH
+        data_root_config AS (
+            SELECT value AS data_root FROM ducklake_metadata WHERE key = 'data_path'
+        ),
+        active_tables AS (
+            SELECT
+                t.table_id,
+                t.table_name,
+                s.schema_name,
+                t.path AS table_path,
+                s.path AS schema_path,
+                rtrim((SELECT data_root FROM data_root_config), '/') || '/' ||
+                CASE
+                    WHEN trim(s.path, '/') != '' THEN trim(s.path, '/') || '/'
+                    ELSE ''
+                END ||
+                trim(t.path, '/') AS table_root
+            FROM ducklake_table t
+            JOIN ducklake_schema s USING(schema_id)
+            WHERE t.end_snapshot IS NULL
+        ),
+        current_snapshot AS (
+            SELECT MAX(snapshot_id) AS snapshot_id FROM ducklake_snapshot
+        ),
+        table_last_modified AS (
+            SELECT
+                t.*,
+                COALESCE(
+                    (SELECT MAX(sc.snapshot_id)
+                     FROM ducklake_snapshot_changes sc
+                     WHERE regexp_matches(sc.changes_made, '[:,]' || t.table_id || '([^0-9]|$)')
+                    ),
+                    (SELECT cs.snapshot_id
+                     FROM current_snapshot cs
+                     WHERE EXISTS (
+                         SELECT 1 FROM ducklake_data_file df
+                         WHERE df.table_id = t.table_id
+                           AND df.end_snapshot IS NULL
+                     )
+                    )
+                ) AS last_modified_snapshot,
+                (SELECT COUNT(*) FROM ducklake_data_file df
+                 WHERE df.table_id = t.table_id
+                   AND df.end_snapshot IS NULL
+                ) AS file_count
+            FROM active_tables t
+        )
+        SELECT
+            table_id,
+            schema_name,
+            table_name,
+            table_root,
+            CASE
+                WHEN file_count = 0 THEN 'no_data_files'
+                WHEN last_modified_snapshot IS NULL THEN 'no_changes'
+                ELSE 'needs_export'
+            END AS status,
+            last_modified_snapshot AS snapshot_id,
+            file_count
+        FROM table_last_modified
+    """)
+
+    # Get tables that need export
+    tables_to_export = con.execute("""
+        SELECT table_id, schema_name, table_name, table_root, snapshot_id, file_count
+        FROM export_summary
+        WHERE status = 'needs_export'
+    """).fetchall()
+
+    # Show summary
+    summary = con.execute("""
+        SELECT status, COUNT(*) as cnt FROM export_summary GROUP BY status
+    """).fetchall()
+
+    for status, cnt in summary:
+        print(f" {status}: {cnt} tables")
+
+    if not tables_to_export:
+        print("\n✅ No tables need export.")
+        con.close()
+        return
+
+    print(f"\n📦 Exporting {len(tables_to_export)} tables...")
+
+    # Process each table
+    for table_id, schema_name, table_name, table_root, snapshot_id, file_count in tables_to_export:
+        table_key = f"{schema_name}.{table_name}"
+
+        # Check if checkpoint already exists for this snapshot
+        checkpoint_path = f"{table_root}/_delta_log/{snapshot_id:020d}.checkpoint.parquet"
+        try:
+            con.execute(f"SELECT 1 FROM '{checkpoint_path}' LIMIT 1")
+            print(f" ⏭️ {table_key}: snapshot {snapshot_id} already exported")
+            continue
+        except Exception:
+            pass  # File doesn't exist, proceed with export
+
+        print(f"\n Processing {table_key}...")
+
+        try:
+            # Build checkpoint parquet data for this table
+            con.execute("""
+                CREATE OR REPLACE TEMP TABLE temp_checkpoint_parquet AS
+                WITH
+                table_schemas AS (
+                    SELECT
+                        ? AS table_id,
+                        ? AS table_name,
+                        ? AS snapshot_id,
+                        ? AS table_root,
+                        list({
+                            'name': c.column_name,
+                            'type':
+                                CASE
+                                    WHEN contains(lower(c.column_type), 'bigint') OR
+                                         (contains(lower(c.column_type), 'int') AND contains(c.column_type, '64')) THEN 'long'
+                                    WHEN contains(lower(c.column_type), 'int') THEN 'integer'
+                                    WHEN contains(lower(c.column_type), 'float') THEN 'double'
+                                    WHEN contains(lower(c.column_type), 'double') THEN 'double'
+                                    WHEN contains(lower(c.column_type), 'bool') THEN 'boolean'
+                                    WHEN contains(lower(c.column_type), 'timestamp') THEN 'timestamp'
+                                    WHEN contains(lower(c.column_type), 'date') THEN 'date'
+                                    WHEN contains(lower(c.column_type), 'decimal') THEN lower(c.column_type)
+                                    ELSE 'string'
+                                END,
+                            'nullable': true,
+                            'metadata': MAP{}::MAP(VARCHAR, VARCHAR)
+                        }::STRUCT(name VARCHAR, type VARCHAR, nullable BOOLEAN, metadata MAP(VARCHAR, VARCHAR)) ORDER BY c.column_order) AS schema_fields
+                    FROM ducklake_column c
+                    WHERE c.table_id = ?
+                      AND c.end_snapshot IS NULL
+                ),
+                file_column_stats_agg AS (
+                    SELECT
+                        df.data_file_id,
+                        c.column_name,
+                        ANY_VALUE(c.column_type) AS column_type,
+                        MAX(fcs.value_count) AS value_count,
+                        MIN(fcs.min_value) AS min_value,
+                        MAX(fcs.max_value) AS max_value,
+                        MAX(fcs.null_count) AS null_count
+                    FROM ducklake_data_file df
+                    LEFT JOIN ducklake_file_column_stats fcs ON df.data_file_id = fcs.data_file_id
+                    LEFT JOIN ducklake_column c ON fcs.column_id = c.column_id AND c.table_id = df.table_id
+                    WHERE df.table_id = ?
+                      AND df.end_snapshot IS NULL
+                      AND c.column_id IS NOT NULL
+                      AND c.end_snapshot IS NULL
+                    GROUP BY df.data_file_id, c.column_name
+                ),
+                file_column_stats_transformed AS (
+                    SELECT
+                        fca.data_file_id,
+                        fca.column_name,
+                        fca.column_type,
+                        fca.value_count,
+                        fca.null_count,
+                        CASE
+                            WHEN fca.min_value IS NULL THEN NULL
+                            WHEN contains(lower(fca.column_type), 'timestamp') THEN
+                                regexp_replace(
+                                    regexp_replace(replace(fca.min_value, ' ', 'T'), '[+-]\\d{2}(?::\\d{2})?$', ''),
+                                    '^([^.]+)$', '\\1.000'
+                                ) || 'Z'
+                            WHEN contains(lower(fca.column_type), 'date') THEN fca.min_value
+                            WHEN contains(lower(fca.column_type), 'bool') THEN CAST(lower(fca.min_value) IN ('true', 't', '1', 'yes') AS VARCHAR)
+                            WHEN contains(lower(fca.column_type), 'int') OR contains(lower(fca.column_type), 'float')
+                                 OR contains(lower(fca.column_type), 'double') OR contains(lower(fca.column_type), 'decimal') THEN
+                                CASE WHEN contains(fca.min_value, '.') OR contains(lower(fca.min_value), 'e')
+                                     THEN CAST(TRY_CAST(fca.min_value AS DOUBLE) AS VARCHAR)
+                                     ELSE CAST(TRY_CAST(fca.min_value AS BIGINT) AS VARCHAR)
+                                END
+                            ELSE fca.min_value
+                        END AS transformed_min,
+                        CASE
+                            WHEN fca.max_value IS NULL THEN NULL
+                            WHEN contains(lower(fca.column_type), 'timestamp') THEN
+                                regexp_replace(
+                                    regexp_replace(replace(fca.max_value, ' ', 'T'), '[+-]\\d{2}(?::\\d{2})?$', ''),
+                                    '^([^.]+)$', '\\1.000'
+                                ) || 'Z'
+                            WHEN contains(lower(fca.column_type), 'date') THEN fca.max_value
+                            WHEN contains(lower(fca.column_type), 'bool') THEN CAST(lower(fca.max_value) IN ('true', 't', '1', 'yes') AS VARCHAR)
+                            WHEN contains(lower(fca.column_type), 'int') OR contains(lower(fca.column_type), 'float')
+                                 OR contains(lower(fca.column_type), 'double') OR contains(lower(fca.column_type), 'decimal') THEN
+                                CASE WHEN contains(fca.max_value, '.') OR contains(lower(fca.max_value), 'e')
+                                     THEN CAST(TRY_CAST(fca.max_value AS DOUBLE) AS VARCHAR)
+                                     ELSE CAST(TRY_CAST(fca.max_value AS BIGINT) AS VARCHAR)
+                                END
+                            ELSE fca.max_value
+                        END AS transformed_max
+                    FROM file_column_stats_agg fca
+                ),
+                file_metadata AS (
+                    SELECT
+                        ts.table_id,
+                        ts.table_name,
+                        ts.snapshot_id,
+                        ts.table_root,
+                        ts.schema_fields,
+                        df.data_file_id,
+                        df.path AS file_path,
+                        df.file_size_bytes,
+                        COALESCE(MAX(fct.value_count), 0) AS num_records,
+                        COALESCE(map_from_entries(list({
+                            'key': fct.column_name,
+                            'value': fct.transformed_min
+                        } ORDER BY fct.column_name) FILTER (WHERE fct.column_name IS NOT NULL AND fct.transformed_min IS NOT NULL)), MAP{}::MAP(VARCHAR, VARCHAR)) AS min_values,
+                        COALESCE(map_from_entries(list({
+                            'key': fct.column_name,
+                            'value': fct.transformed_max
+                        } ORDER BY fct.column_name) FILTER (WHERE fct.column_name IS NOT NULL AND fct.transformed_max IS NOT NULL)), MAP{}::MAP(VARCHAR, VARCHAR)) AS max_values,
+                        COALESCE(map_from_entries(list({
+                            'key': fct.column_name,
+                            'value': fct.null_count
+                        } ORDER BY fct.column_name) FILTER (WHERE fct.column_name IS NOT NULL AND fct.null_count IS NOT NULL)), MAP{}::MAP(VARCHAR, BIGINT)) AS null_count
+                    FROM table_schemas ts
+                    JOIN ducklake_data_file df ON df.table_id = ts.table_id
+                    LEFT JOIN file_column_stats_transformed fct ON df.data_file_id = fct.data_file_id
+                    WHERE df.end_snapshot IS NULL
+                    GROUP BY ts.table_id, ts.table_name, ts.snapshot_id,
+                             ts.table_root, ts.schema_fields, df.data_file_id, df.path, df.file_size_bytes
+                ),
+                table_aggregates AS (
+                    SELECT
+                        table_id,
+                        table_name,
+                        snapshot_id,
+                        table_root,
+                        schema_fields,
+                        COUNT(*) AS num_files,
+                        SUM(num_records) AS total_rows,
+                        SUM(file_size_bytes) AS total_bytes,
+                        list({
+                            'path': ltrim(file_path, '/'),
+                            'partitionValues': MAP{}::MAP(VARCHAR, VARCHAR),
+                            'size': file_size_bytes,
+                            'modificationTime': epoch_ms(now()),
+                            'dataChange': true,
+                            'stats': COALESCE(to_json({
+                                'numRecords': COALESCE(num_records, 0),
+                                'minValues': COALESCE(min_values, MAP{}::MAP(VARCHAR, VARCHAR)),
+                                'maxValues': COALESCE(max_values, MAP{}::MAP(VARCHAR, VARCHAR)),
+                                'nullCount': COALESCE(null_count, MAP{}::MAP(VARCHAR, BIGINT))
+                            }), '{"numRecords":0}'),
+                            'tags': MAP{}::MAP(VARCHAR, VARCHAR)
+                        }::STRUCT(
+                            path VARCHAR,
+                            partitionValues MAP(VARCHAR, VARCHAR),
+                            size BIGINT,
+                            modificationTime BIGINT,
+                            dataChange BOOLEAN,
+                            stats VARCHAR,
+                            tags MAP(VARCHAR, VARCHAR)
+                        )) AS add_entries
+                    FROM file_metadata
+                    GROUP BY table_id, table_name, snapshot_id, table_root, schema_fields
+                ),
+                checkpoint_data AS (
+                    SELECT
+                        ta.*,
+                        epoch_ms(now()) AS now_ms,
+                        uuid()::VARCHAR AS txn_id,
+                        (substring(md5(ta.table_id::VARCHAR || '-metadata'), 1, 8) || '-' ||
+                         substring(md5(ta.table_id::VARCHAR || '-metadata'), 9, 4) || '-' ||
+                         substring(md5(ta.table_id::VARCHAR || '-metadata'), 13, 4) || '-' ||
+                         substring(md5(ta.table_id::VARCHAR || '-metadata'), 17, 4) || '-' ||
+                         substring(md5(ta.table_id::VARCHAR || '-metadata'), 21, 12)) AS meta_id,
+                        to_json({'type': 'struct', 'fields': ta.schema_fields}) AS schema_string
+                    FROM table_aggregates ta
+                ),
+                checkpoint_parquet_data AS (
+                    SELECT
+                        cd.table_id,
+                        cd.table_name,
+                        cd.snapshot_id,
+                        cd.table_root,
+                        cd.meta_id,
+                        cd.now_ms,
+                        cd.txn_id,
+                        cd.schema_string,
+                        cd.num_files,
+                        cd.total_rows,
+                        cd.total_bytes,
+                        {'minReaderVersion': 1, 'minWriterVersion': 2} AS protocol,
+                        NULL AS metaData,
+                        NULL AS add,
+                        NULL::STRUCT(path VARCHAR, deletionTimestamp BIGINT, dataChange BOOLEAN) AS remove,
+                        NULL::STRUCT(timestamp TIMESTAMP, operation VARCHAR, operationParameters MAP(VARCHAR, VARCHAR), isolationLevel VARCHAR, isBlindAppend BOOLEAN, operationMetrics MAP(VARCHAR, VARCHAR), engineInfo VARCHAR, txnId VARCHAR) AS commitInfo,
+                        1 AS row_order
+                    FROM checkpoint_data cd
+                    UNION ALL
+                    SELECT
+                        cd.table_id,
+                        cd.table_name,
+                        cd.snapshot_id,
+                        cd.table_root,
+                        cd.meta_id,
+                        cd.now_ms,
+                        cd.txn_id,
+                        cd.schema_string,
+                        cd.num_files,
+                        cd.total_rows,
+                        cd.total_bytes,
+                        NULL AS protocol,
+                        {
+                            'id': cd.meta_id,
+                            'name': cd.table_name,
+                            'format': {'provider': 'parquet', 'options': MAP{}::MAP(VARCHAR, VARCHAR)}::STRUCT(provider VARCHAR, options MAP(VARCHAR, VARCHAR)),
+                            'schemaString': cd.schema_string,
+                            'partitionColumns': []::VARCHAR[],
+                            'createdTime': cd.now_ms,
+                            'configuration': MAP{}::MAP(VARCHAR, VARCHAR)
+                        }::STRUCT(id VARCHAR, name VARCHAR, format STRUCT(provider VARCHAR, options MAP(VARCHAR, VARCHAR)), schemaString VARCHAR, partitionColumns VARCHAR[], createdTime BIGINT, configuration MAP(VARCHAR, VARCHAR)) AS metaData,
+                        NULL AS add,
+                        NULL::STRUCT(path VARCHAR, deletionTimestamp BIGINT, dataChange BOOLEAN) AS remove,
+                        NULL::STRUCT(timestamp TIMESTAMP, operation VARCHAR, operationParameters MAP(VARCHAR, VARCHAR), isolationLevel VARCHAR, isBlindAppend BOOLEAN, operationMetrics MAP(VARCHAR, VARCHAR), engineInfo VARCHAR, txnId VARCHAR) AS commitInfo,
+                        2 AS row_order
+                    FROM checkpoint_data cd
+                    UNION ALL
+                    SELECT
+                        cd.table_id,
+                        cd.table_name,
+                        cd.snapshot_id,
+                        cd.table_root,
+                        cd.meta_id,
+                        cd.now_ms,
+                        cd.txn_id,
+                        cd.schema_string,
+                        cd.num_files,
+                        cd.total_rows,
+                        cd.total_bytes,
+                        NULL AS protocol,
+                        NULL AS metaData,
+                        unnest(cd.add_entries) AS add,
+                        NULL::STRUCT(path VARCHAR, deletionTimestamp BIGINT, dataChange BOOLEAN) AS remove,
+                        NULL::STRUCT(timestamp TIMESTAMP, operation VARCHAR, operationParameters MAP(VARCHAR, VARCHAR), isolationLevel VARCHAR, isBlindAppend BOOLEAN, operationMetrics MAP(VARCHAR, VARCHAR), engineInfo VARCHAR, txnId VARCHAR) AS commitInfo,
+                        3 AS row_order
+                    FROM checkpoint_data cd
+                )
+                SELECT * FROM checkpoint_parquet_data
+            """, [table_id, table_name, snapshot_id, table_root, table_id, table_id])
+
+            # Build JSON log content
+            con.execute("""
+                CREATE OR REPLACE TEMP TABLE temp_checkpoint_json AS
+                SELECT DISTINCT
+                    p.table_id,
+                    p.table_root,
+                    p.snapshot_id,
+                    p.num_files,
+                    to_json({
+                        'commitInfo': {
+                            'timestamp': p.now_ms,
+                            'operation': 'CONVERT',
+                            'operationParameters': {
+                                'convertedFrom': 'DuckLake',
+                                'duckLakeSnapshotId': p.snapshot_id::VARCHAR,
+                                'partitionBy': '[]'
+                            },
+                            'isolationLevel': 'Serializable',
+                            'isBlindAppend': false,
+                            'operationMetrics': {
+                                'numFiles': p.num_files::VARCHAR,
+                                'numOutputRows': p.total_rows::VARCHAR,
+                                'numOutputBytes': p.total_bytes::VARCHAR
+                            },
+                            'engineInfo': 'DuckLake-Delta-Exporter/1.0.0',
+                            'txnId': p.txn_id
+                        }
+                    }) || chr(10) ||
+                    to_json({
+                        'metaData': {
+                            'id': p.meta_id,
+                            'name': p.table_name,
+                            'format': {'provider': 'parquet', 'options': MAP{}},
+                            'schemaString': p.schema_string::VARCHAR,
+                            'partitionColumns': [],
+                            'createdTime': p.now_ms,
+                            'configuration': MAP{}
+                        }
+                    }) || chr(10) ||
+                    to_json({
+                        'protocol': {'minReaderVersion': 1, 'minWriterVersion': 2}
+                    }) AS content
+                FROM temp_checkpoint_parquet p
+                WHERE p.row_order = 1
+            """)
+
+            # Build last checkpoint content
+            con.execute("""
+                CREATE OR REPLACE TEMP TABLE temp_last_checkpoint AS
+                SELECT
+                    table_id,
+                    table_root,
+                    snapshot_id,
+                    '{"version":' || snapshot_id || ',"size":' || (2 + num_files) || '}' AS content
+                FROM temp_checkpoint_parquet
+                WHERE row_order = 1
+            """)
+
+            # Get file paths
+            paths = con.execute("""
+                SELECT
+                    table_root || '/_delta_log/' || lpad(snapshot_id::VARCHAR, 20, '0') || '.checkpoint.parquet' AS checkpoint_file,
+                    table_root || '/_delta_log/' || lpad(snapshot_id::VARCHAR, 20, '0') || '.json' AS json_file,
+                    table_root || '/_delta_log/_last_checkpoint' AS last_checkpoint_file,
+                    table_root || '/_delta_log' AS delta_log_path
+                FROM temp_checkpoint_parquet
+                WHERE row_order = 1
+                LIMIT 1
+            """).fetchone()
+
+            checkpoint_file, json_file, last_checkpoint_file, delta_log_path = paths
+
+            # Create delta_log directory for local paths
+            if not any(table_root.startswith(prefix) for prefix in ['abfss://', 's3://', 'gs://', 'az://', 'http://', 'https://']):
+                con.execute(f"""
+                    COPY (SELECT 1 AS id, 1 AS ".duckdb_init")
+                    TO '{delta_log_path}'
+                    (FORMAT CSV, PARTITION_BY (".duckdb_init"), OVERWRITE_OR_IGNORE)
+                """)
+
+            # Write checkpoint parquet
+            con.execute(f"""
+                COPY (SELECT protocol, metaData, add, remove, commitInfo
+                      FROM temp_checkpoint_parquet ORDER BY row_order)
+                TO '{checkpoint_file}' (FORMAT PARQUET)
+            """)
+
+            # Write JSON log
+            con.execute(f"""
+                COPY (SELECT content FROM temp_checkpoint_json)
+                TO '{json_file}' (FORMAT CSV, HEADER false, QUOTE '')
+            """)
+
+            # Write last checkpoint
+            con.execute(f"""
+                COPY (SELECT content FROM temp_last_checkpoint)
+                TO '{last_checkpoint_file}' (FORMAT CSV, HEADER false, QUOTE '')
+            """)
+
+            print(f" ✅ {table_key}: exported snapshot {snapshot_id} ({file_count} files)")
+
+        except Exception as e:
+            print(f" ❌ {table_key}: {e}")
+
+    # Cleanup temp tables
+    con.execute("DROP TABLE IF EXISTS export_summary")
+    con.execute("DROP TABLE IF EXISTS temp_checkpoint_parquet")
+    con.execute("DROP TABLE IF EXISTS temp_checkpoint_json")
+    con.execute("DROP TABLE IF EXISTS temp_last_checkpoint")
+
+    con.close()
+    print("\n🎉 Export completed!")
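
For each exported table the function writes three artifacts under `<table_root>/_delta_log/`: a `<snapshot_id zero-padded to 20 digits>.checkpoint.parquet`, a matching `.json` commit, and `_last_checkpoint`. A quick sanity-check sketch of the `_last_checkpoint` content (the table path is hypothetical); per the code above, `version` is the DuckLake snapshot id and `size` is 2 plus the number of add entries:

import json

with open("/data/lake/main/events/_delta_log/_last_checkpoint") as f:
    last = json.load(f)

# version == the DuckLake snapshot id reused as the Delta table version;
# size == protocol row + metaData row + one add row per data file.
print(last["version"], last["size"])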

{ducklake_delta_exporter-0.1.4 → ducklake_delta_exporter-0.3.0}/ducklake_delta_exporter.egg-info/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: ducklake-delta-exporter
-Version: 0.1.4
+Version: 0.3.0
 Summary: A utility to export DuckLake database metadata to Delta Lake transaction logs.
 Home-page: https://github.com/djouallah/ducklake_delta_exporter
 Author: mim
@@ -13,7 +13,7 @@ Classifier: Topic :: Software Development :: Libraries :: Python Modules
 Classifier: Development Status :: 3 - Alpha
 Requires-Python: >=3.8
 Description-Content-Type: text/markdown
-Requires-Dist: duckdb
+Requires-Dist: duckdb>=1.4.4
 Dynamic: author
 Dynamic: author-email
 Dynamic: classifier

{ducklake_delta_exporter-0.1.4 → ducklake_delta_exporter-0.3.0}/ducklake_delta_exporter.egg-info/SOURCES.txt

@@ -5,4 +5,5 @@ ducklake_delta_exporter.egg-info/PKG-INFO
 ducklake_delta_exporter.egg-info/SOURCES.txt
 ducklake_delta_exporter.egg-info/dependency_links.txt
 ducklake_delta_exporter.egg-info/requires.txt
-ducklake_delta_exporter.egg-info/top_level.txt
+ducklake_delta_exporter.egg-info/top_level.txt
+tests/test_stats_transformation.py

ducklake_delta_exporter-0.3.0/ducklake_delta_exporter.egg-info/requires.txt

@@ -0,0 +1 @@
+duckdb>=1.4.4

{ducklake_delta_exporter-0.1.4 → ducklake_delta_exporter-0.3.0}/setup.py

@@ -3,9 +3,9 @@ from setuptools import setup, find_packages
 
 setup(
     name='ducklake-delta-exporter',
-    version='0.1.4',
+    version='0.3.0',
     packages=find_packages(),
-    install_requires=['duckdb'],
+    install_requires=['duckdb>=1.4.4'],
     author='mim',
     author_email='your.email@example.com',
     description='A utility to export DuckLake database metadata to Delta Lake transaction logs.',
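
Since the export exists so that Delta readers can consume DuckLake tables, a round-trip read is the natural smoke test. A sketch using DuckDB's delta extension (the table path is hypothetical, and the extension must be installable in your environment):

import duckdb

con = duckdb.connect()
con.execute("INSTALL delta; LOAD delta;")
# delta_scan resolves the table through the _delta_log the exporter wrote,
# so a successful count exercises both the checkpoint and the JSON commit.
print(con.execute("SELECT count(*) FROM delta_scan('/data/lake/main/events')").fetchone())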