duckrun 0.2.19.dev8__tar.gz → 0.2.20.dev0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {duckrun-0.2.19.dev8 → duckrun-0.2.20.dev0}/PKG-INFO +1 -1
- {duckrun-0.2.19.dev8 → duckrun-0.2.20.dev0}/duckrun/auth.py +3 -0
- {duckrun-0.2.19.dev8 → duckrun-0.2.20.dev0}/duckrun/core.py +66 -1
- duckrun-0.2.20.dev0/duckrun/ducklake_metadata.py +571 -0
- {duckrun-0.2.19.dev8 → duckrun-0.2.20.dev0}/duckrun/semantic_model.py +134 -13
- {duckrun-0.2.19.dev8 → duckrun-0.2.20.dev0}/duckrun.egg-info/PKG-INFO +1 -1
- {duckrun-0.2.19.dev8 → duckrun-0.2.20.dev0}/duckrun.egg-info/SOURCES.txt +3 -0
- {duckrun-0.2.19.dev8 → duckrun-0.2.20.dev0}/pyproject.toml +1 -1
- duckrun-0.2.20.dev0/tests/test_checkpoint_format.py +102 -0
- duckrun-0.2.20.dev0/tests/test_ducklake_export.py +7 -0
- {duckrun-0.2.19.dev8 → duckrun-0.2.20.dev0}/LICENSE +0 -0
- {duckrun-0.2.19.dev8 → duckrun-0.2.20.dev0}/README.md +0 -0
- {duckrun-0.2.19.dev8 → duckrun-0.2.20.dev0}/duckrun/__init__.py +0 -0
- {duckrun-0.2.19.dev8 → duckrun-0.2.20.dev0}/duckrun/files.py +0 -0
- {duckrun-0.2.19.dev8 → duckrun-0.2.20.dev0}/duckrun/lakehouse.py +0 -0
- {duckrun-0.2.19.dev8 → duckrun-0.2.20.dev0}/duckrun/notebook.py +0 -0
- {duckrun-0.2.19.dev8 → duckrun-0.2.20.dev0}/duckrun/rle.py +0 -0
- {duckrun-0.2.19.dev8 → duckrun-0.2.20.dev0}/duckrun/runner.py +0 -0
- {duckrun-0.2.19.dev8 → duckrun-0.2.20.dev0}/duckrun/stats.py +0 -0
- {duckrun-0.2.19.dev8 → duckrun-0.2.20.dev0}/duckrun/writer.py +0 -0
- {duckrun-0.2.19.dev8 → duckrun-0.2.20.dev0}/duckrun.egg-info/dependency_links.txt +0 -0
- {duckrun-0.2.19.dev8 → duckrun-0.2.20.dev0}/duckrun.egg-info/requires.txt +0 -0
- {duckrun-0.2.19.dev8 → duckrun-0.2.20.dev0}/duckrun.egg-info/top_level.txt +0 -0
- {duckrun-0.2.19.dev8 → duckrun-0.2.20.dev0}/setup.cfg +0 -0
- {duckrun-0.2.19.dev8 → duckrun-0.2.20.dev0}/tests/test_register.py +0 -0
- {duckrun-0.2.19.dev8 → duckrun-0.2.20.dev0}/tests/test_rle.py +0 -0
|
@@ -104,6 +104,8 @@ def _get_local_token() -> Optional[str]:
|
|
|
104
104
|
|
|
105
105
|
except Exception as cli_error:
|
|
106
106
|
print(f"⚠️ Azure CLI authentication failed: {cli_error}")
|
|
107
|
+
print("💡 TIP: Due to MFA requirements, you now need to login with scope:")
|
|
108
|
+
print(" az login --scope https://storage.azure.com/.default")
|
|
107
109
|
print("🔐 Falling back to interactive browser authentication...")
|
|
108
110
|
|
|
109
111
|
# Fallback to interactive browser
|
|
@@ -119,6 +121,7 @@ def _get_local_token() -> Optional[str]:
|
|
|
119
121
|
|
|
120
122
|
except Exception as browser_error:
|
|
121
123
|
print(f"❌ Interactive browser authentication failed: {browser_error}")
|
|
124
|
+
print("💡 Please run: az login --scope https://storage.azure.com/.default")
|
|
122
125
|
return None
|
|
123
126
|
|
|
124
127
|
|
|
@@ -1249,9 +1249,11 @@ class Duckrun(WorkspaceOperationsMixin):
|
|
|
1249
1249
|
dataset_name = self.schema # Use schema name
|
|
1250
1250
|
|
|
1251
1251
|
# Call the deployment function (DirectLake only)
|
|
1252
|
+
# Use lakehouse_id (with .ItemType suffix) instead of lakehouse_name (without suffix)
|
|
1253
|
+
# This ensures proper item resolution for non-lakehouse items like .SnowflakeDatabase
|
|
1252
1254
|
return deploy_semantic_model(
|
|
1253
1255
|
workspace_name_or_id=self.workspace,
|
|
1254
|
-
lakehouse_name_or_id=self.
|
|
1256
|
+
lakehouse_name_or_id=self.lakehouse_id,
|
|
1255
1257
|
schema_name=self.schema,
|
|
1256
1258
|
dataset_name=dataset_name,
|
|
1257
1259
|
bim_url_or_path=bim_url,
|
|
@@ -1259,6 +1261,69 @@ class Duckrun(WorkspaceOperationsMixin):
|
|
|
1259
1261
|
refresh=refresh
|
|
1260
1262
|
)
|
|
1261
1263
|
|
|
1264
|
+
def export_ducklake_to_delta(self, db_path: str, data_root: str = None) -> bool:
    """
    Export DuckLake metadata to Delta Lake format for Spark compatibility.

    Builds the full path of a DuckLake database file from ``self.files_base_url``
    and hands it to ``generate_latest_delta_log`` together with an ``AzureStore``
    rooted at ``self.table_base_url``. Any exception raised during the export is
    printed (with traceback) and reported as ``False`` instead of propagating.

    Args:
        db_path: Path of the DuckLake DB file relative to ``self.files_base_url``
            (e.g., "db/test/test.db").
        data_root: Optional base path for lakehouse data. If None,
            ``self.table_base_url`` (without trailing slash) is used.

    Returns:
        True if export succeeded, False otherwise (including when no token
        could be obtained).

    Examples:
        con = duckrun.connect("workspace/lakehouse.lakehouse/dbo")

        # Export DuckLake tables to Delta format
        con.export_ducklake_to_delta("meta.db")

        # With explicit data root
        con.export_ducklake_to_delta("db/ducklake.db", data_root="abfss://...")
    """
    from .auth import get_token
    from .ducklake_metadata import generate_latest_delta_log
    from obstore.store import AzureStore

    # Construct full ABFSS path to DB file in Files section
    full_db_path = f"{self.files_base_url}{db_path}"

    print(f"🔍 Exporting DuckLake metadata from: {db_path}")
    print(f"📂 Full DB path: {full_db_path}")

    # Reuse the connection's storage token; the placeholder value means no
    # real token has been acquired yet, so authenticate now.
    token = self._get_storage_token()
    if token == "PLACEHOLDER_TOKEN_TOKEN_NOT_AVAILABLE":
        print("Authenticating with Azure for DuckLake export...")
        token = get_token()
        if not token:
            print("❌ Failed to authenticate for DuckLake export")
            return False

    # The store is rooted at table_base_url because the checkpoint files are
    # written under the Tables section.
    store = AzureStore.from_url(self.table_base_url, bearer_token=token)

    # Default data root: table_base_url (which includes /Tables/), used to
    # construct the full remote paths of the checkpoint files.
    if data_root is None:
        data_root = self.table_base_url.rstrip('/')

    try:
        generate_latest_delta_log(full_db_path, data_root, store, token)
    except Exception as e:
        print(f"❌ DuckLake export failed: {e}")
        import traceback
        traceback.print_exc()
        return False

    print("✅ DuckLake export completed successfully")
    return True
|
|
1326
|
+
|
|
1262
1327
|
def rle(self, table_name: str = None, mode = "natural",
|
|
1263
1328
|
min_distinct_threshold: int = 2, max_cardinality_pct: float = 0.01,
|
|
1264
1329
|
max_ordering_depth: int = 3, limit: int = None):
|
|
@@ -0,0 +1,571 @@
|
|
|
1
|
+
# File: ducklake_metadata.py
|
|
2
|
+
import json
|
|
3
|
+
import time
|
|
4
|
+
import duckdb
|
|
5
|
+
import os
|
|
6
|
+
import tempfile
|
|
7
|
+
import shutil
|
|
8
|
+
|
|
9
|
+
def map_type_ducklake_to_spark(t):
    """
    Map a DuckDB/DuckLake column type name to the Spark SQL type used in the Delta schema.

    Matching is case-insensitive and substring-based, checked in this order:
    integer types, float/double, decimal, boolean, timestamp, date. Anything
    else is mapped to 'string'.

    Args:
        t: Column type name, e.g. "int64", "BIGINT", "decimal(18,4)".

    Returns:
        Spark type name. Integer types map to 'long' when the name contains
        '64' or 'bigint', otherwise 'integer'. Decimal types keep their
        declared precision and scale; a bare 'decimal' falls back to
        'decimal(10,0)'.
    """
    import re

    t = t.lower()
    if 'int' in t:
        # 'bigint' is 64-bit even though its name carries no '64'
        return 'long' if ('64' in t or 'bigint' in t) else 'integer'
    elif 'float' in t:
        return 'double'
    elif 'double' in t:
        return 'double'
    elif 'decimal' in t:
        # Preserve declared precision/scale instead of collapsing to (10,0)
        params = re.search(r'\(\s*(\d+)\s*(?:,\s*(\d+)\s*)?\)', t)
        if params:
            precision = params.group(1)
            scale = params.group(2) or '0'
            return f'decimal({precision},{scale})'
        return 'decimal(10,0)'
    elif 'bool' in t:
        return 'boolean'
    elif 'timestamp' in t:
        return 'timestamp'
    elif 'date' in t:
        return 'date'
    return 'string'
|
|
27
|
+
|
|
28
|
+
def convert_stat_value_to_json(value_str, column_type):
    """
    Convert DuckLake stat string value to proper JSON type for Delta Lake.

    Args:
        value_str: String representation of the value from DuckLake
        column_type: DuckDB column type (matched case-insensitively by substring)

    Returns:
        Properly typed value for JSON serialization:
        - None when value_str is None
        - timestamps as an ISO 8601 string ending in 'Z' ('.000' is added
          only when the value has no fractional seconds)
        - dates unchanged
        - booleans as bool
        - numeric types as int or float
        - everything else, and any value that fails conversion, unchanged
    """
    if value_str is None:
        return None

    column_type = column_type.lower()

    try:
        # Timestamp: Convert to ISO 8601 with a Z suffix
        if 'timestamp' in column_type:
            # Assumes value_str is in format like "2025-06-22 23:55:00"
            if 'T' not in value_str:
                value_str = value_str.replace(' ', 'T')
            if not value_str.endswith('Z'):
                # Only pad with milliseconds when the time part has no
                # fraction; otherwise "…00.123" would become "…00.123.000Z".
                time_part = value_str.split('T', 1)[-1]
                value_str += 'Z' if '.' in time_part else '.000Z'
            return value_str

        # Date: Keep as YYYY-MM-DD string
        elif 'date' in column_type:
            return value_str

        # Boolean: Convert to JSON boolean
        elif 'bool' in column_type:
            return value_str.lower() in ('true', 't', '1', 'yes')

        # Numeric types: Convert to number (not string)
        elif any(t in column_type for t in ['int', 'float', 'double', 'decimal', 'numeric']):
            # A decimal point or exponent means float; otherwise parse as int
            if '.' in value_str or 'e' in value_str.lower():
                return float(value_str)
            else:
                return int(value_str)

        # String and others: Keep as string
        else:
            return value_str

    except (ValueError, AttributeError, TypeError):
        # If conversion fails (unparseable text or a non-string value),
        # return the value unchanged
        return value_str
|
|
78
|
+
|
|
79
|
+
def create_spark_schema_string(fields):
    """Serialize a list of field descriptors as a Spark struct-schema JSON string."""
    struct_schema = {"type": "struct", "fields": fields}
    return json.dumps(struct_schema)
|
|
82
|
+
|
|
83
|
+
def get_latest_ducklake_snapshot(con, table_id):
    """
    Return the highest begin_snapshot recorded in ducklake_data_file for a table.

    The value is the first column of the single row returned by the MAX()
    query, so it is None when the table has no matching data-file rows.
    """
    query = f""" SELECT MAX(begin_snapshot) as latest_snapshot FROM ducklake_data_file WHERE table_id = {table_id} """
    row = con.execute(query).fetchone()
    return row[0]
|
|
89
|
+
|
|
90
|
+
def get_latest_delta_checkpoint(con, table_id):
    """
    Count how many times a table has been modified.

    Counts the rows of ducklake_snapshot_changes whose changes_made text ends
    with ':<table_id>' or contains ':<table_id>,'.
    """
    query = f""" SELECT count(snapshot_id) FROM ducklake_snapshot_changes
    where changes_made like '%:{table_id}' or changes_made like '%:{table_id},%' """
    row = con.execute(query).fetchone()
    return row[0]
|
|
97
|
+
|
|
98
|
+
def get_file_modification_time(dummy_time):
    """
    Return the supplied timestamp as the modification time for a parquet file.

    No file metadata is read; the caller's value is passed straight through,
    which avoids the latency of inspecting the actual file.

    Args:
        dummy_time: Timestamp in milliseconds to use as modification time

    Returns:
        The same value that was passed in
    """
    modification_time = dummy_time
    return modification_time
|
|
110
|
+
|
|
111
|
+
def create_dummy_json_log(local_table_root, delta_version, table_info, schema_fields, now, latest_snapshot,
                          num_files, total_rows=None, total_bytes=None):
    """
    Write a minimal Delta Lake transaction log file to the local filesystem.

    The file is created at <local_table_root>/_delta_log/<version, 20 digits>.json
    and holds three newline-delimited JSON entries, written in this order:
    commitInfo, metaData, protocol. It contains no add entries; the function
    only records file/row/byte counts in commitInfo.operationMetrics.

    Args:
        local_table_root: Local directory of the table (the _delta_log
            sub-directory is created if missing)
        delta_version: Integer version used for the file name
        table_info: Mapping providing 'table_name'
        schema_fields: Field descriptors serialized into schemaString
        now: Timestamp stored as commitInfo.timestamp and metaData.createdTime
        latest_snapshot: DuckLake snapshot id recorded in operationParameters
        num_files: Number of data files, stored as a string metric
        total_rows: Row count metric; "0" is written when falsy
        total_bytes: Byte count metric; "0" is written when falsy

    Returns:
        Path of the JSON log file that was written
    """
    import uuid

    log_dir = os.path.join(local_table_root, '_delta_log')
    json_log_file = os.path.join(log_dir, f"{delta_version:020d}.json")
    os.makedirs(log_dir, exist_ok=True)

    # Metrics are stored as strings; missing/zero totals are written as "0"
    rows_metric = str(total_rows) if total_rows else "0"
    bytes_metric = str(total_bytes) if total_bytes else "0"

    commit_info = {
        "timestamp": now,
        "operation": "CONVERT",
        "operationParameters": {
            "convertedFrom": "DuckLake",
            "duckLakeSnapshotId": str(latest_snapshot),
            "partitionBy": "[]"
        },
        "isolationLevel": "Serializable",
        "isBlindAppend": False,
        "operationMetrics": {
            "numFiles": str(num_files),
            "numOutputRows": rows_metric,
            "numOutputBytes": bytes_metric
        },
        "engineInfo": "DuckLake-Delta-Exporter/1.0.0",
        "txnId": str(uuid.uuid4())
    }

    meta_data = {
        "id": str(uuid.uuid4()),  # fresh UUID on every call
        "name": table_info['table_name'],
        "description": None,
        "format": {
            "provider": "parquet",
            "options": {}
        },
        "schemaString": create_spark_schema_string(schema_fields),
        "partitionColumns": [],
        "createdTime": now,
        "configuration": {}
    }

    protocol = {
        "minReaderVersion": 1,
        "minWriterVersion": 2
    }

    # One JSON document per line: commitInfo, then metaData, then protocol
    entries = (
        {"commitInfo": commit_info},
        {"metaData": meta_data},
        {"protocol": protocol},
    )
    with open(json_log_file, 'w') as f:
        for entry in entries:
            f.write(json.dumps(entry) + '\n')

    return json_log_file
|
|
183
|
+
|
|
184
|
+
def build_file_path(table_root, relative_path):
    """
    Join a table root and a relative path with exactly one '/' between them.

    Trailing slashes on the root and leading slashes on the relative path are
    removed before joining, so the result is pure string concatenation and
    works for local paths as well as URL-style roots.
    """
    root = table_root.rstrip('/')
    tail = relative_path.lstrip('/')
    return "/".join((root, tail))
|
|
192
|
+
|
|
193
|
+
def create_checkpoint_for_latest_snapshot(con, table_info, data_root, temp_dir, store=None, token=None):
    """
    Create a Delta checkpoint file for the latest DuckLake snapshot.

    Writes three files under <temp_dir>/<schema_path>/<table_path>/_delta_log/:
    the checkpoint parquet, a minimal JSON log, and _last_checkpoint. When a
    store is given, the files are then uploaded in that order (checkpoint,
    JSON log, _last_checkpoint).

    Args:
        con: DuckDB connection to DuckLake database
        table_info: Dictionary with table metadata; the keys read here are
            'table_id', 'table_name', 'schema_name', 'table_path', 'schema_path'
        data_root: Root path for data (used for constructing remote paths)
        temp_dir: Temporary directory for writing local files
        store: obstore AzureStore instance for uploading files (None for local mode)
        token: Azure auth token (None for local mode); not used by this function

    Returns:
        False when the table has no snapshot, when the latest snapshot was
        already exported, or when the upload failed. Otherwise the tuple
        (True, delta_version, latest_snapshot).
    """
    from collections import defaultdict

    # Construct table path (relative to data_root); strip slashes to avoid
    # double slashes when joining
    schema_path = table_info['schema_path'].strip('/')
    table_path = table_info['table_path'].strip('/')
    table_relative_path = f"{schema_path}/{table_path}" if schema_path else table_path

    # Local temporary directory for this table
    local_table_root = os.path.join(temp_dir, table_relative_path.replace('/', os.sep))

    # Remote path (for ABFSS upload) - always use forward slashes
    remote_table_root = f"{data_root.rstrip('/')}/{table_relative_path}"

    # Get the latest snapshot
    latest_snapshot = get_latest_ducklake_snapshot(con, table_info['table_id'])
    if latest_snapshot is None:
        print(f"⚠️ {table_info['schema_name']}.{table_info['table_name']}: No snapshots found")
        return False

    # Use snapshot ID as the delta version
    delta_version = latest_snapshot

    # Local checkpoint files (in temp directory)
    local_delta_log_dir = os.path.join(local_table_root, '_delta_log')
    local_checkpoint_file = os.path.join(local_delta_log_dir, f"{delta_version:020d}.checkpoint.parquet")
    local_json_log_file = os.path.join(local_delta_log_dir, f"{delta_version:020d}.json")
    local_last_checkpoint_file = os.path.join(local_delta_log_dir, "_last_checkpoint")

    # Remote paths (for ABFSS upload) - always use forward slashes
    remote_checkpoint_file = remote_table_root + f"/_delta_log/{delta_version:020d}.checkpoint.parquet"
    remote_json_log_file = remote_table_root + f"/_delta_log/{delta_version:020d}.json"
    remote_last_checkpoint_file = remote_table_root + "/_delta_log/_last_checkpoint"

    # Check if this snapshot was already exported (only when a store is provided)
    if store:
        try:
            # Read _last_checkpoint to get the current version
            last_checkpoint_result = con.execute(f"""
                SELECT version
                FROM read_json_auto('{remote_last_checkpoint_file}')
                LIMIT 1
            """).fetchone()

            if last_checkpoint_result:
                current_version = last_checkpoint_result[0]
                current_json_file = remote_table_root + f"/_delta_log/{current_version:020d}.json"

                # Read the current version's JSON to check snapshot ID
                result = con.execute(f"""
                    SELECT
                        commitInfo.operationParameters.duckLakeSnapshotId as snapshot_id
                    FROM read_json_auto('{current_json_file}', format='newline_delimited')
                    WHERE commitInfo IS NOT NULL
                    LIMIT 1
                """).fetchone()

                if result and result[0]:
                    last_snapshot = result[0]
                    # Compare as strings so the check does not depend on the
                    # type the JSON reader inferred for the stored id
                    if str(last_snapshot) == str(latest_snapshot):
                        print(f"⚠️ {table_info['schema_name']}.{table_info['table_name']}: Snapshot {latest_snapshot} already exported (version {current_version})")
                        return False
                    else:
                        print(f"📊 {table_info['schema_name']}.{table_info['table_name']}: New snapshot detected (was {last_snapshot}, now {latest_snapshot})")
        except Exception:
            # Deliberate best effort: any failure here (typically because
            # _last_checkpoint doesn't exist yet) is treated as a first export
            pass

    now = int(time.time() * 1000)

    # Get all files with their per-column stats for the latest snapshot
    file_stats_query = f"""
        SELECT
            df.data_file_id,
            df.path,
            df.file_size_bytes,
            c.column_name,
            c.column_type,
            fcs.value_count,
            fcs.null_count,
            fcs.min_value,
            fcs.max_value
        FROM ducklake_data_file df
        LEFT JOIN ducklake_file_column_stats fcs ON df.data_file_id = fcs.data_file_id
        LEFT JOIN ducklake_column c ON fcs.column_id = c.column_id
        WHERE df.table_id = {table_info['table_id']}
        AND df.begin_snapshot <= {latest_snapshot}
        AND (df.end_snapshot IS NULL OR df.end_snapshot > {latest_snapshot})
        AND (c.begin_snapshot IS NULL OR c.begin_snapshot <= {latest_snapshot})
        AND (c.end_snapshot IS NULL OR c.end_snapshot > {latest_snapshot})
        ORDER BY df.data_file_id, c.column_order
    """

    file_stats_rows = con.execute(file_stats_query).fetchall()

    # Group stats by file (one query row per file/column pair)
    files_dict = defaultdict(lambda: {
        'path': None,
        'size': 0,
        'num_records': 0,
        'min_values': {},
        'max_values': {},
        'null_count': {}
    })

    for row in file_stats_rows:
        file_id, path, size, col_name, col_type, value_count, null_count, min_val, max_val = row

        file_data = files_dict[file_id]
        file_data['path'] = path
        file_data['size'] = size

        # Take num_records from the first non-null value_count seen for the file
        if file_data['num_records'] == 0 and value_count is not None:
            file_data['num_records'] = value_count

        # Only add column stats if column name exists (handle LEFT JOIN nulls)
        if col_name is not None:
            # Convert and add min/max values with proper typing
            if min_val is not None:
                file_data['min_values'][col_name] = convert_stat_value_to_json(min_val, col_type)
            if max_val is not None:
                file_data['max_values'][col_name] = convert_stat_value_to_json(max_val, col_type)
            if null_count is not None:
                file_data['null_count'][col_name] = null_count

    # Convert to list format for processing
    file_rows = [(f['path'], f['size'], f['num_records'], f['min_values'], f['max_values'], f['null_count'])
                 for f in files_dict.values()]

    # Calculate aggregate metrics for commitInfo
    total_files = len(file_rows)
    total_rows = sum(f[2] for f in file_rows)  # num_records
    total_bytes = sum(f[1] for f in file_rows)  # size

    # Get schema for the latest snapshot
    columns = con.execute(f"""
        SELECT column_name, column_type FROM ducklake_column
        WHERE table_id = {table_info['table_id']}
        AND begin_snapshot <= {latest_snapshot}
        AND (end_snapshot IS NULL OR end_snapshot > {latest_snapshot})
        ORDER BY column_order
    """).fetchall()

    # The DuckLake table id doubles as the Delta metadata id in the checkpoint
    table_meta_id = str(table_info['table_id'])

    # Prepare schema
    schema_fields = [
        {"name": name, "type": map_type_ducklake_to_spark(typ), "nullable": True, "metadata": {}}
        for name, typ in columns
    ]

    # The checkpoint rows are staged in a scratch table on duckdb's default
    # module-level connection (not `con`). Start from a clean slate.
    duckdb.execute("DROP TABLE IF EXISTS checkpoint_table")

    try:
        # Create the checkpoint table with proper nested structure
        duckdb.execute("""
            CREATE TABLE checkpoint_table AS
            WITH checkpoint_data AS (
                -- Protocol record
                SELECT
                    {'minReaderVersion': 1, 'minWriterVersion': 2}::STRUCT(minReaderVersion INTEGER, minWriterVersion INTEGER) AS protocol,
                    NULL::STRUCT(id VARCHAR, name VARCHAR, description VARCHAR, format STRUCT(provider VARCHAR, options MAP(VARCHAR, VARCHAR)), schemaString VARCHAR, partitionColumns VARCHAR[], createdTime BIGINT, configuration MAP(VARCHAR, VARCHAR)) AS metaData,
                    NULL::STRUCT(path VARCHAR, partitionValues MAP(VARCHAR, VARCHAR), size BIGINT, modificationTime BIGINT, dataChange BOOLEAN, stats VARCHAR, tags MAP(VARCHAR, VARCHAR)) AS add,
                    NULL::STRUCT(path VARCHAR, deletionTimestamp BIGINT, dataChange BOOLEAN) AS remove,
                    NULL::STRUCT(timestamp TIMESTAMP, operation VARCHAR, operationParameters MAP(VARCHAR, VARCHAR), isBlindAppend BOOLEAN, engineInfo VARCHAR, clientVersion VARCHAR) AS commitInfo

                UNION ALL

                -- Metadata record
                SELECT
                    NULL::STRUCT(minReaderVersion INTEGER, minWriterVersion INTEGER) AS protocol,
                    {
                        'id': ?,
                        'name': ?,
                        'description': NULL,
                        'format': {'provider': 'parquet', 'options': MAP{}}::STRUCT(provider VARCHAR, options MAP(VARCHAR, VARCHAR)),
                        'schemaString': ?,
                        'partitionColumns': []::VARCHAR[],
                        'createdTime': ?,
                        'configuration': MAP{'delta.logRetentionDuration': 'interval 1 hour'}
                    }::STRUCT(id VARCHAR, name VARCHAR, description VARCHAR, format STRUCT(provider VARCHAR, options MAP(VARCHAR, VARCHAR)), schemaString VARCHAR, partitionColumns VARCHAR[], createdTime BIGINT, configuration MAP(VARCHAR, VARCHAR)) AS metaData,
                    NULL::STRUCT(path VARCHAR, partitionValues MAP(VARCHAR, VARCHAR), size BIGINT, modificationTime BIGINT, dataChange BOOLEAN, stats VARCHAR, tags MAP(VARCHAR, VARCHAR)) AS add,
                    NULL::STRUCT(path VARCHAR, deletionTimestamp BIGINT, dataChange BOOLEAN) AS remove,
                    NULL::STRUCT(timestamp TIMESTAMP, operation VARCHAR, operationParameters MAP(VARCHAR, VARCHAR), isBlindAppend BOOLEAN, engineInfo VARCHAR, clientVersion VARCHAR) AS commitInfo
            )
            SELECT * FROM checkpoint_data
        """, [table_meta_id, table_info['table_name'], create_spark_schema_string(schema_fields), now])

        # Add one `add` record per data file, with stats from DuckLake metadata
        for path, size, num_records, min_values, max_values, null_count in file_rows:
            # The add.path stored in the checkpoint is the DuckLake file path
            # with leading slashes removed
            rel_path = path.lstrip('/')
            mod_time = get_file_modification_time(now)

            stats_json = json.dumps({
                "numRecords": num_records,
                "minValues": min_values,
                "maxValues": max_values,
                "nullCount": null_count
            })

            duckdb.execute("""
                INSERT INTO checkpoint_table
                SELECT
                    NULL::STRUCT(minReaderVersion INTEGER, minWriterVersion INTEGER) AS protocol,
                    NULL::STRUCT(id VARCHAR, name VARCHAR, description VARCHAR, format STRUCT(provider VARCHAR, options MAP(VARCHAR, VARCHAR)), schemaString VARCHAR, partitionColumns VARCHAR[], createdTime BIGINT, configuration MAP(VARCHAR, VARCHAR)) AS metaData,
                    {
                        'path': ?,
                        'partitionValues': MAP{}::MAP(VARCHAR, VARCHAR),
                        'size': ?,
                        'modificationTime': ?,
                        'dataChange': true,
                        'stats': ?,
                        'tags': MAP{}::MAP(VARCHAR, VARCHAR)
                    }::STRUCT(path VARCHAR, partitionValues MAP(VARCHAR, VARCHAR), size BIGINT, modificationTime BIGINT, dataChange BOOLEAN, stats VARCHAR, tags MAP(VARCHAR, VARCHAR)) AS add,
                    NULL::STRUCT(path VARCHAR, deletionTimestamp BIGINT, dataChange BOOLEAN) AS remove,
                    NULL::STRUCT(timestamp TIMESTAMP, operation VARCHAR, operationParameters MAP(VARCHAR, VARCHAR), isBlindAppend BOOLEAN, engineInfo VARCHAR, clientVersion VARCHAR) AS commitInfo
            """, [rel_path, size, mod_time, stats_json])

        # Create the _delta_log directory
        os.makedirs(local_delta_log_dir, exist_ok=True)

        # Write the checkpoint file to local temp directory
        duckdb.execute(f"COPY (SELECT * FROM checkpoint_table) TO '{local_checkpoint_file}' (FORMAT PARQUET)")

        # Create minimal JSON log file (writes to local temp); it carries
        # metadata only, no add entries
        create_dummy_json_log(local_table_root, delta_version, table_info, schema_fields, now, latest_snapshot,
                              total_files, total_rows, total_bytes)

        # Write the _last_checkpoint file to local temp directory
        with open(local_last_checkpoint_file, 'w') as f:
            total_records = 2 + len(file_rows)  # protocol + metadata + file records
            f.write(json.dumps({"version": delta_version, "size": total_records}))

        # Upload files to OneLake if store is provided
        if store:
            try:
                import obstore as obs

                rel_checkpoint = _path_relative_to_tables(remote_checkpoint_file)
                rel_json_log = _path_relative_to_tables(remote_json_log_file)
                rel_last_checkpoint = _path_relative_to_tables(remote_last_checkpoint_file)

                # Upload checkpoint file first
                with open(local_checkpoint_file, 'rb') as f:
                    obs.put(store, rel_checkpoint, f.read())

                # Upload JSON log file second
                with open(local_json_log_file, 'rb') as f:
                    obs.put(store, rel_json_log, f.read())

                # Upload _last_checkpoint last, so it is only published after
                # the files it points to have been uploaded
                with open(local_last_checkpoint_file, 'rb') as f:
                    obs.put(store, rel_last_checkpoint, f.read())

                print(f"✅ Exported DuckLake snapshot {latest_snapshot} as Delta checkpoint v{delta_version}")
                print(f"✅ Uploaded to: {remote_table_root}/_delta_log/")
            except Exception as e:
                print(f"❌ Failed to upload checkpoint files: {e}")
                return False
        else:
            # Local mode - files are already written to temp directory
            print(f"✅ Exported DuckLake snapshot {latest_snapshot} as Delta checkpoint v{delta_version}")
            print(f"✅ Created local files in: {local_delta_log_dir}")
    finally:
        # Always drop the scratch table, including on upload failure or when
        # an intermediate step raises
        duckdb.execute("DROP TABLE IF EXISTS checkpoint_table")

    return True, delta_version, latest_snapshot


def _path_relative_to_tables(full_path):
    """Return the part of full_path after the last '/Tables/', or the path without leading slashes."""
    # e.g. "abfss://.../Tables/simple/ducklake/_delta_log/file.parquet"
    #   -> "simple/ducklake/_delta_log/file.parquet"
    if '/Tables/' in full_path:
        return full_path.split('/Tables/')[-1]
    return full_path.lstrip('/')
|
|
491
|
+
|
|
492
|
+
def generate_latest_delta_log(db_path: str, data_root: str = None, store=None, token=None):
    """
    Export the latest DuckLake snapshot for each table as a Delta checkpoint file.
    Creates both checkpoint files and minimal JSON log files for Spark compatibility.

    Every active table (``end_snapshot IS NULL``) is handed to
    ``create_checkpoint_for_latest_snapshot``. A table counts as exported unless
    that helper returns ``False``. A failure on one table is printed together
    with its traceback and does not stop the remaining tables.

    Args:
        db_path (str): The path to the DuckLake database file (can be ABFSS URL or local path).
        data_root (str): The root directory for the lakehouse data. If None, reads from DuckLake metadata.
        store: obstore AzureStore instance for uploading files (None for local mode).
        token: Azure auth token (None for local mode).

    Raises:
        ValueError: If ``data_root`` is None and the DuckLake metadata has no
            ``data_path`` entry.
    """
    import traceback

    def _sql_literal(value):
        # Double embedded single quotes so the value cannot terminate the
        # surrounding SQL string literal (ATTACH / CREATE SECRET take no
        # bound parameters, so the value has to be inlined).
        return str(value).replace("'", "''")

    # Create temporary directory for local file operations
    temp_dir = tempfile.mkdtemp(prefix='ducklake_export_')
    con = None

    try:
        # Create an in-memory DuckDB connection
        con = duckdb.connect(':memory:')

        # If token is provided and db_path is ABFSS URL, set up Azure authentication
        if token and db_path.startswith('abfss://'):
            con.sql(
                "CREATE OR REPLACE SECRET ducklake_secret "
                f"(TYPE AZURE, PROVIDER ACCESS_TOKEN, ACCESS_TOKEN '{_sql_literal(token)}')"
            )

        # Attach the DuckLake database (works for both local and ABFSS paths)
        con.execute(f"ATTACH '{_sql_literal(db_path)}' AS ducklake_db (READ_ONLY)")
        con.execute("USE ducklake_db")

        if data_root is None:
            row = con.sql("SELECT value FROM ducklake_metadata WHERE key = 'data_path'").fetchone()
            if row is None:
                # Fail with a clear message instead of "'NoneType' is not subscriptable".
                raise ValueError(
                    "data_root was not provided and ducklake_metadata has no 'data_path' entry"
                )
            data_root = row[0]

        # Get all active tables
        tables = con.execute("""
            SELECT
                t.table_id,
                t.table_name,
                s.schema_name,
                t.path as table_path,
                s.path as schema_path
            FROM ducklake_table t
            JOIN ducklake_schema s USING(schema_id)
            WHERE t.end_snapshot IS NULL
        """).fetchall()

        total_tables = len(tables)
        successful_exports = 0
        # Column order of the SELECT above.
        columns = ('table_id', 'table_name', 'schema_name', 'table_path', 'schema_path')

        for table_row in tables:
            table_info = dict(zip(columns, table_row))
            table_key = f"{table_info['schema_name']}.{table_info['table_name']}"
            print(f"Processing {table_key}...")

            try:
                result = create_checkpoint_for_latest_snapshot(con, table_info, data_root, temp_dir, store, token)
                # False means the table was not exported (the helper has
                # already printed the reason); anything else is a success.
                if result is not False:
                    successful_exports += 1
            except Exception as e:
                # Best effort: report and continue with the next table.
                print(f"❌ {table_key}: Failed to export checkpoint - {e}")
                traceback.print_exc()

        print(f"\n🎉 Export completed! {successful_exports}/{total_tables} tables exported successfully.")

    finally:
        try:
            # Close the connection on every path, including failures above.
            if con is not None:
                con.close()
        finally:
            # Clean up temporary directory
            try:
                shutil.rmtree(temp_dir)
            except Exception as e:
                print(f"⚠️ Warning: Could not clean up temp directory {temp_dir}: {e}")
|
|
@@ -73,7 +73,10 @@ def get_workspace_id(workspace_name_or_id, client):
|
|
|
73
73
|
|
|
74
74
|
|
|
75
75
|
def get_lakehouse_id(lakehouse_name_or_id, workspace_id, client):
|
|
76
|
-
"""
|
|
76
|
+
"""
|
|
77
|
+
Get lakehouse/item ID by name or validate if already a GUID.
|
|
78
|
+
Supports lakehouses, warehouses, databases, and other OneLake items.
|
|
79
|
+
"""
|
|
77
80
|
import re
|
|
78
81
|
|
|
79
82
|
# Check if input is already a GUID
|
|
@@ -93,17 +96,114 @@ def get_lakehouse_id(lakehouse_name_or_id, workspace_id, client):
|
|
|
93
96
|
except Exception as e:
|
|
94
97
|
raise ValueError(f"Lakehouse with ID '{lakehouse_name_or_id}' not found: {e}")
|
|
95
98
|
|
|
96
|
-
#
|
|
97
|
-
|
|
98
|
-
|
|
99
|
+
# Parse item type from name (e.g., "ItemName.ItemType")
|
|
100
|
+
item_type_map = {
|
|
101
|
+
'.lakehouse': 'Lakehouse',
|
|
102
|
+
'.warehouse': 'Warehouse',
|
|
103
|
+
'.database': 'Database',
|
|
104
|
+
'.snowflakedatabase': 'SnowflakeDatabase'
|
|
105
|
+
}
|
|
106
|
+
|
|
107
|
+
item_type = None
|
|
108
|
+
item_name = lakehouse_name_or_id
|
|
109
|
+
|
|
110
|
+
for suffix, mapped_type in item_type_map.items():
|
|
111
|
+
if lakehouse_name_or_id.lower().endswith(suffix):
|
|
112
|
+
item_type = mapped_type
|
|
113
|
+
item_name = lakehouse_name_or_id[:-len(suffix)]
|
|
114
|
+
break
|
|
115
|
+
|
|
116
|
+
# If no item type suffix, assume it's a lakehouse
|
|
117
|
+
if item_type is None or item_type == 'Lakehouse':
|
|
118
|
+
# Use lakehouse-specific API
|
|
119
|
+
response = client.get(f"/v1/workspaces/{workspace_id}/lakehouses")
|
|
120
|
+
items = response.json().get('value', [])
|
|
121
|
+
|
|
122
|
+
lakehouse_match = next((item for item in items if item.get('displayName') == item_name), None)
|
|
123
|
+
if not lakehouse_match:
|
|
124
|
+
raise ValueError(f"Lakehouse '{item_name}' not found")
|
|
125
|
+
|
|
126
|
+
lakehouse_id = lakehouse_match['id']
|
|
127
|
+
print(f"✓ Found lakehouse: {item_name}")
|
|
128
|
+
return lakehouse_id
|
|
129
|
+
else:
|
|
130
|
+
# Use generic items API for non-lakehouse items
|
|
131
|
+
print(f" Searching for {item_type} '{item_name}'...")
|
|
132
|
+
response = client.get(f"/v1/workspaces/{workspace_id}/items")
|
|
133
|
+
items = response.json().get('value', [])
|
|
134
|
+
|
|
135
|
+
# Filter by type and name
|
|
136
|
+
item_match = next(
|
|
137
|
+
(item for item in items
|
|
138
|
+
if item.get('displayName') == item_name and item.get('type') == item_type),
|
|
139
|
+
None
|
|
140
|
+
)
|
|
141
|
+
|
|
142
|
+
if not item_match:
|
|
143
|
+
raise ValueError(f"{item_type} '{item_name}' not found")
|
|
144
|
+
|
|
145
|
+
item_id = item_match['id']
|
|
146
|
+
print(f"✓ Found {item_type.lower()}: {item_name}")
|
|
147
|
+
return item_id
|
|
148
|
+
|
|
149
|
+
|
|
150
|
+
def resolve_to_guid(identifier, identifier_type, client, workspace_id=None):
    """
    Resolve workspace or item identifier to GUID if it's a friendly name.
    If already a GUID, returns as-is.

    For items, a recognised type suffix on the name (``.lakehouse``,
    ``.warehouse``, ``.database``, ``.snowflakedatabase``) is stripped before
    matching on ``displayName``. When a suffix is present, an item whose
    ``type`` matches the suffix is preferred; if no item of that type carries
    the name, the first item with that name is returned.

    Args:
        identifier: Workspace name/GUID or item name/GUID
        identifier_type: 'workspace' or 'item'
        client: FabricRestClient instance
        workspace_id: Required if identifier_type is 'item'

    Returns:
        GUID string or None if resolution fails
    """
    import re

    # Nothing sensible can be resolved from a missing or non-string identifier.
    if not isinstance(identifier, str) or not identifier:
        return None

    # Check if already a GUID
    guid_pattern = re.compile(r'^[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}$', re.IGNORECASE)
    if guid_pattern.match(identifier):
        return identifier

    try:
        if identifier_type == 'workspace':
            # Resolve workspace name to GUID
            response = client.get("/v1/workspaces")
            workspaces = response.json().get('value', [])
            workspace_match = next((ws for ws in workspaces if ws.get('displayName') == identifier), None)
            return workspace_match['id'] if workspace_match else None

        if identifier_type == 'item':
            if not workspace_id:
                return None

            # Parse item type from identifier (e.g., "ItemName.ItemType")
            item_type_map = {
                '.lakehouse': 'Lakehouse',
                '.warehouse': 'Warehouse',
                '.database': 'Database',
                '.snowflakedatabase': 'SnowflakeDatabase'
            }

            item_type = None
            item_name = identifier
            lowered = identifier.lower()
            for suffix, mapped_type in item_type_map.items():
                if lowered.endswith(suffix):
                    item_type = mapped_type
                    item_name = identifier[:-len(suffix)]
                    break

            # Try generic items API
            response = client.get(f"/v1/workspaces/{workspace_id}/items")
            items = response.json().get('value', [])
            named = [item for item in items if item.get('displayName') == item_name]

            # Prefer the item whose type matches the suffix so that two items
            # sharing a display name are not confused; otherwise keep the
            # name-only match.
            item_match = None
            if item_type is not None:
                item_match = next((item for item in named if item.get('type') == item_type), None)
            if item_match is None and named:
                item_match = named[0]

            return item_match['id'] if item_match else None

    except Exception as e:
        print(f" ⚠️ Could not resolve {identifier_type} to GUID: {e}")
        return None

    # Unknown identifier_type: nothing to resolve.
    return None
|
|
107
207
|
|
|
108
208
|
|
|
109
209
|
def get_dataset_id(dataset_name, workspace_id, client):
|
|
@@ -406,7 +506,14 @@ def download_bim_from_github(url_or_path):
|
|
|
406
506
|
|
|
407
507
|
|
|
408
508
|
def update_bim_for_directlake(bim_content, workspace_id, lakehouse_id, schema_name):
|
|
409
|
-
"""
|
|
509
|
+
"""
|
|
510
|
+
Update BIM file for DirectLake mode.
|
|
511
|
+
|
|
512
|
+
Args:
|
|
513
|
+
workspace_id: Workspace GUID (should be actual GUID, not friendly name)
|
|
514
|
+
lakehouse_id: Item GUID (should be actual GUID, not friendly name with suffix)
|
|
515
|
+
schema_name: Schema name
|
|
516
|
+
"""
|
|
410
517
|
|
|
411
518
|
new_url = f"https://onelake.dfs.fabric.microsoft.com/{workspace_id}/{lakehouse_id}"
|
|
412
519
|
expression_name = None
|
|
@@ -606,15 +713,29 @@ def deploy_semantic_model(workspace_name_or_id, lakehouse_name_or_id, schema_nam
|
|
|
606
713
|
print("=" * 70)
|
|
607
714
|
return 1
|
|
608
715
|
|
|
609
|
-
# Step 3: Get lakehouse ID
|
|
716
|
+
# Step 3: Get lakehouse ID and ensure we have GUIDs for the BIM
|
|
610
717
|
print(f"\n[Step 3/6] Finding lakehouse...")
|
|
611
718
|
lakehouse_id = get_lakehouse_id(lakehouse_name_or_id, workspace_id, client)
|
|
612
719
|
|
|
720
|
+
# Step 3.5: Resolve to actual GUIDs for semantic model compatibility
|
|
721
|
+
print(f"\n[Step 3.5/6] Resolving to GUIDs for semantic model...")
|
|
722
|
+
workspace_guid = resolve_to_guid(workspace_id, 'workspace', client)
|
|
723
|
+
lakehouse_guid = resolve_to_guid(lakehouse_id, 'item', client, workspace_guid)
|
|
724
|
+
|
|
725
|
+
if workspace_guid:
|
|
726
|
+
print(f"✓ Workspace GUID: {workspace_guid}")
|
|
727
|
+
if lakehouse_guid:
|
|
728
|
+
print(f"✓ Item GUID: {lakehouse_guid}")
|
|
729
|
+
|
|
730
|
+
# Use GUIDs if available, otherwise fall back to original values
|
|
731
|
+
workspace_for_bim = workspace_guid if workspace_guid else workspace_id
|
|
732
|
+
lakehouse_for_bim = lakehouse_guid if lakehouse_guid else lakehouse_id
|
|
733
|
+
|
|
613
734
|
# Step 4: Download and update BIM
|
|
614
735
|
print("\n[Step 4/6] Loading and configuring BIM file...")
|
|
615
736
|
bim_content = download_bim_from_github(bim_url_or_path)
|
|
616
737
|
|
|
617
|
-
modified_bim = update_bim_for_directlake(bim_content,
|
|
738
|
+
modified_bim = update_bim_for_directlake(bim_content, workspace_for_bim, lakehouse_for_bim, schema_name)
|
|
618
739
|
modified_bim['name'] = dataset_name
|
|
619
740
|
modified_bim['id'] = dataset_name
|
|
620
741
|
|
|
@@ -4,6 +4,7 @@ pyproject.toml
|
|
|
4
4
|
duckrun/__init__.py
|
|
5
5
|
duckrun/auth.py
|
|
6
6
|
duckrun/core.py
|
|
7
|
+
duckrun/ducklake_metadata.py
|
|
7
8
|
duckrun/files.py
|
|
8
9
|
duckrun/lakehouse.py
|
|
9
10
|
duckrun/notebook.py
|
|
@@ -17,5 +18,7 @@ duckrun.egg-info/SOURCES.txt
|
|
|
17
18
|
duckrun.egg-info/dependency_links.txt
|
|
18
19
|
duckrun.egg-info/requires.txt
|
|
19
20
|
duckrun.egg-info/top_level.txt
|
|
21
|
+
tests/test_checkpoint_format.py
|
|
22
|
+
tests/test_ducklake_export.py
|
|
20
23
|
tests/test_register.py
|
|
21
24
|
tests/test_rle.py
|
|
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
|
|
|
4
4
|
|
|
5
5
|
[project]
|
|
6
6
|
name = "duckrun"
|
|
7
|
-
version = "0.2.
|
|
7
|
+
version = "0.2.20.dev0"
|
|
8
8
|
description = "Helper library for Fabric Python using duckdb, arrow and delta_rs (orchestration, queries, etc.)"
|
|
9
9
|
readme = "README.md"
|
|
10
10
|
license = {text = "MIT"}
|
|
@@ -0,0 +1,102 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Test to verify our checkpoint parquet format matches Delta Lake specification
|
|
3
|
+
by comparing with a real Delta Lake checkpoint file.
|
|
4
|
+
"""
|
|
5
|
+
import duckdb
|
|
6
|
+
import json
|
|
7
|
+
|
|
8
|
+
def test_checkpoint_columns():
    """Print the column layout a Delta Lake checkpoint is expected to have.

    No checkpoint file is read here; the function only lists the expected
    top-level columns, the fields of the 'add' struct and the keys of the
    'stats' JSON string.
    """
    # Top-level checkpoint columns and their expected types.
    column_types = dict(
        protocol='STRUCT(minReaderVersion INTEGER, minWriterVersion INTEGER)',
        metaData='STRUCT',    # Complex nested structure
        add='STRUCT',         # Complex nested structure with stats
        remove='STRUCT',
        commitInfo='STRUCT',
    )

    # Fields of the 'add' struct.
    add_fields = (
        " - path: VARCHAR",
        " - partitionValues: MAP(VARCHAR, VARCHAR)",
        " - size: BIGINT",
        " - modificationTime: BIGINT",
        " - dataChange: BOOLEAN",
        " - stats: VARCHAR (JSON string)",
        " - tags: MAP(VARCHAR, VARCHAR)",
    )

    # Keys of the JSON document stored in add.stats.
    stats_fields = (
        " - numRecords: INTEGER",
        " - minValues: MAP with properly typed values",
        " - maxValues: MAP with properly typed values",
        " - nullCount: MAP with INTEGER values",
    )

    # This would be the path to a generated checkpoint - for now just verify structure
    print("✓ Checkpoint should have these columns:")
    for name in column_types:
        print(f" - {name}: {column_types[name]}")

    sections = (
        ("\n✓ The 'add' struct should contain:", add_fields),
        ("\n✓ The 'stats' JSON string should contain:", stats_fields),
    )
    for heading, field_lines in sections:
        print(heading)
        for field_line in field_lines:
            print(field_line)
|
|
39
|
+
|
|
40
|
+
def test_checkpoint_structure_from_json():
    """Test that our checkpoint structure matches the real Delta checkpoint JSON

    Reads the reference log ``00000000000000000000.json``, parses the first
    ten lines and prints the structure of the first 'add' entry, its parsed
    'stats' JSON, and the protocol versions. A missing key in those entries
    raises ``KeyError`` and fails the test.
    """
    from pathlib import Path

    max_lines = 10  # Only the head of the log is inspected
    fixture_name = '00000000000000000000.json'

    # Resolve the fixture next to this file so the test does not depend on
    # the current working directory; fall back to the historical
    # repo-root-relative location.
    fixture_path = Path(__file__).resolve().parent / fixture_name
    if not fixture_path.exists():
        fixture_path = Path('tests') / fixture_name

    # Read the real checkpoint JSON
    with open(fixture_path, 'r', encoding='utf-8') as f:
        lines = f.readlines()

    print("=== Real Delta Lake Checkpoint Analysis ===\n")

    # Parse each entry
    entry_types = []
    add_count = 0

    for i, line in enumerate(lines[:max_lines]):  # Check first 10 lines
        if not line.strip():
            continue  # Tolerate blank lines (e.g. a trailing newline)

        entry = json.loads(line)
        if not entry:
            continue  # An empty JSON object carries no action to classify

        entry_type = next(iter(entry))
        entry_types.append(entry_type)

        if entry_type == 'add':
            add_count += 1
            if add_count == 1:  # Show first add entry structure
                add_entry = entry['add']
                print(f"✓ ADD Entry Structure (line {i+1}):")
                print(f" - path: {type(add_entry['path']).__name__}")
                print(f" - partitionValues: {type(add_entry['partitionValues']).__name__} = {add_entry['partitionValues']}")
                print(f" - size: {type(add_entry['size']).__name__} = {add_entry['size']}")
                print(f" - modificationTime: {type(add_entry['modificationTime']).__name__}")
                print(f" - dataChange: {type(add_entry['dataChange']).__name__} = {add_entry['dataChange']}")
                print(f" - stats: {type(add_entry['stats']).__name__} (JSON string)")
                print(f" - tags: {type(add_entry['tags']).__name__} = {add_entry['tags']}")

                # Parse stats JSON
                stats = json.loads(add_entry['stats'])
                print(f"\n✓ STATS Structure:")
                print(f" - numRecords: {type(stats['numRecords']).__name__} = {stats['numRecords']}")
                print(f" - minValues: {type(stats['minValues']).__name__} with {len(stats['minValues'])} columns")
                print(f" - maxValues: {type(stats['maxValues']).__name__} with {len(stats['maxValues'])} columns")
                print(f" - nullCount: {type(stats['nullCount']).__name__} with {len(stats['nullCount'])} columns")

                # Check value types in stats
                print(f"\n✓ Sample minValues types:")
                for key, value in list(stats['minValues'].items())[:3]:
                    print(f" - {key}: {type(value).__name__} = {value}")

        elif entry_type == 'commitInfo':
            print(f"✓ COMMITINFO Entry (line {i+1})")
        elif entry_type == 'metaData':
            print(f"✓ METADATA Entry (line {i+1})")
        elif entry_type == 'protocol':
            protocol = entry['protocol']
            print(f"✓ PROTOCOL Entry (line {i+1}):")
            print(f" - minReaderVersion: {protocol['minReaderVersion']}")
            print(f" - minWriterVersion: {protocol['minWriterVersion']}")

    print(f"\n=== Entry Count Summary ===")
    # Report what was actually parsed, not the length of the whole file.
    print(f"Total entries analyzed: {len(entry_types)}")
    # Sorted so the summary line is stable between runs.
    print(f"Entry types: {', '.join(sorted(set(entry_types)))}")
    print(f"Add entries: {add_count}")
|
|
98
|
+
|
|
99
|
+
if __name__ == '__main__':
    # Allow running this file directly: run both checks with a separator
    # line between their outputs.
    separator = "=" * 50
    test_checkpoint_columns()
    print(f"\n{separator}\n")
    test_checkpoint_structure_from_json()
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|