faceberg 0.1.1__py3-none-any.whl → 0.1.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- faceberg/_version.py +34 -0
- faceberg/catalog.py +92 -76
- faceberg/discover.py +181 -0
- faceberg/iceberg.py +707 -0
- faceberg/tests/test_catalog.py +1 -2
- faceberg/tests/test_discover.py +257 -0
- faceberg/tests/test_iceberg.py +911 -0
- faceberg/tests/test_server_playwright.py +5 -1
- {faceberg-0.1.1.dist-info → faceberg-0.1.3.dist-info}/METADATA +9 -7
- {faceberg-0.1.1.dist-info → faceberg-0.1.3.dist-info}/RECORD +13 -12
- faceberg/bridge.py +0 -586
- faceberg/convert.py +0 -813
- faceberg/tests/test_bridge.py +0 -825
- faceberg/tests/test_convert.py +0 -422
- {faceberg-0.1.1.dist-info → faceberg-0.1.3.dist-info}/WHEEL +0 -0
- {faceberg-0.1.1.dist-info → faceberg-0.1.3.dist-info}/entry_points.txt +0 -0
- {faceberg-0.1.1.dist-info → faceberg-0.1.3.dist-info}/licenses/LICENSE +0 -0
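The headline change in 0.1.3 is the new faceberg/iceberg.py module (together with faceberg/discover.py), replacing bridge.py and convert.py; the test file diffed below exercises its ParquetFile, create_schema, diff_snapshot, and write_snapshot APIs. For orientation, here is a minimal usage sketch inferred from those tests — it is not code shipped in the wheel, and the directory layout, paths, and two-column schema in it are invented for illustration:

# Hypothetical sketch only: build an initial Iceberg snapshot for a directory of
# parquet files, using the call pattern shown in faceberg/tests/test_iceberg.py.
import hashlib
from pathlib import Path

import pyarrow as pa
from pyiceberg.table import StaticTable

from faceberg.iceberg import ParquetFile, write_snapshot

table_dir = Path("my_table")  # assumed layout: my_table/data/*.parquet
schema = pa.schema([pa.field("id", pa.int64()), pa.field("name", pa.string())])

# Describe each parquet file the same way the tests do: URI, path, size, content hash.
files = [
    ParquetFile(
        uri=str(p),
        path=str(p),
        size=p.stat().st_size,
        blob_id=hashlib.md5(p.read_bytes()).hexdigest(),
    )
    for p in sorted((table_dir / "data").glob("*.parquet"))
]

# With current_metadata=None every file is ADDED and v1.metadata.json is written.
write_snapshot(
    files=files,
    schema=schema,
    current_metadata=None,
    output_dir=table_dir,
    base_uri=f"file://{table_dir.resolve()}",
)

# The generated metadata is readable with plain PyIceberg.
table = StaticTable.from_metadata(str(table_dir / "metadata" / "v1.metadata.json"))
print(table.scan().to_arrow().num_rows)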
faceberg/tests/test_iceberg.py (new file, +911 lines)

@@ -0,0 +1,911 @@
"""Tests for the iceberg module (Iceberg metadata generation)."""

import hashlib
import json
import shutil
from pathlib import Path

import pyarrow as pa
import pyarrow.parquet as pq
import pytest
from pyiceberg.io.pyarrow import PyArrowFileIO
from pyiceberg.manifest import ManifestEntryStatus
from pyiceberg.table import StaticTable
from pyiceberg.types import ListType, StructType

from faceberg.iceberg import ParquetFile, create_schema, diff_snapshot, write_snapshot


@pytest.fixture
def arrow_schema():
    """Create a simple PyArrow schema for testing."""
    return pa.schema(
        [
            pa.field("id", pa.int64()),
            pa.field("name", pa.string()),
            pa.field("value", pa.float64()),
        ]
    )


def compute_file_hash(path: Path) -> str:
    """Compute MD5 hash of file contents."""
    md5 = hashlib.md5()
    with open(path, "rb") as f:
        for chunk in iter(lambda: f.read(8192), b""):
            md5.update(chunk)
    return md5.hexdigest()


@pytest.fixture
def parquet_files(tmp_path, arrow_schema):
    """Create 5 parquet files with 20 rows each (100 total), each row unique."""
    files = []
    data_dir = tmp_path / "data"
    data_dir.mkdir(parents=True, exist_ok=True)

    for i in range(5):
        path = data_dir / f"part-{i:05d}.parquet"

        # Each file has 20 unique rows
        start_id = i * 20
        table = pa.table(
            {
                "id": pa.array(list(range(start_id, start_id + 20)), type=pa.int64()),
                "name": [f"name_{j}" for j in range(start_id, start_id + 20)],
                "value": [float(j) * 1.5 for j in range(start_id, start_id + 20)],
            },
            schema=arrow_schema,
        )
        pq.write_table(table, path)
        files.append(
            ParquetFile(
                uri=str(path),
                path=str(path),
                size=path.stat().st_size,
                blob_id=compute_file_hash(path),
            )
        )

    return files


def make_extra_files(tmp_path, arrow_schema, count=2, start_index=5):
    """Create additional parquet files for append tests."""
    files = []
    data_dir = tmp_path / "data"
    data_dir.mkdir(parents=True, exist_ok=True)

    for i in range(count):
        idx = start_index + i
        path = data_dir / f"part-{idx:05d}.parquet"

        start_id = idx * 20
        table = pa.table(
            {
                "id": pa.array(list(range(start_id, start_id + 20)), type=pa.int64()),
                "name": [f"name_{j}" for j in range(start_id, start_id + 20)],
                "value": [float(j) * 1.5 for j in range(start_id, start_id + 20)],
            },
            schema=arrow_schema,
        )
        pq.write_table(table, path)
        files.append(
            ParquetFile(
                uri=str(path),
                path=str(path),
                size=path.stat().st_size,
                blob_id=compute_file_hash(path),
            )
        )

    return files


class TestInitialSnapshot:
    """Tests for creating initial table snapshots."""

    def test_initial_snapshot_creates_valid_metadata(self, tmp_path, parquet_files, arrow_schema):
        """Test that initial snapshot creates valid Iceberg metadata."""
        _metadata = write_snapshot(
            files=parquet_files,
            schema=arrow_schema,
            current_metadata=None,
            output_dir=tmp_path,
            base_uri=f"file://{tmp_path}",
        )

        # Validate using StaticTable - pass the metadata file path
        metadata_file = tmp_path / "metadata" / "v1.metadata.json"
        table = StaticTable.from_metadata(str(metadata_file))

        # Check data files
        data_files = table.inspect.data_files()
        assert len(data_files) == 5

        # Check file paths match
        file_paths = set(data_files["file_path"].to_pylist())
        expected_paths = {f.uri for f in parquet_files}
        assert file_paths == expected_paths

        # Check snapshot summary
        snapshot = table.current_snapshot()
        assert snapshot is not None
        assert snapshot.summary.operation.value == "append"
        assert int(snapshot.summary["added-data-files"]) == 5
        assert int(snapshot.summary["total-records"]) == 100

    def test_initial_snapshot_scan_returns_data(self, tmp_path, parquet_files, arrow_schema):
        """Test that initial snapshot can be scanned correctly."""

        write_snapshot(
            files=parquet_files,
            schema=arrow_schema,
            current_metadata=None,
            output_dir=tmp_path,
            base_uri=f"file://{tmp_path}",
        )

        metadata_file = tmp_path / "metadata" / "v1.metadata.json"
        table = StaticTable.from_metadata(str(metadata_file))
        result = table.scan().to_arrow()

        # Should have 100 rows (5 files * 20 rows)
        assert len(result) == 100

        # Check all IDs are present (0-99)
        ids = sorted(result["id"].to_pylist())
        assert ids == list(range(100))


class TestAppendSnapshot:
    """Tests for appending files to existing snapshots."""

    def test_append_files_creates_new_snapshot(self, tmp_path, parquet_files, arrow_schema):
        """Test that appending files creates a new snapshot with all files."""

        # Create initial snapshot
        metadata = write_snapshot(
            files=parquet_files,
            schema=arrow_schema,
            current_metadata=None,
            output_dir=tmp_path,
            base_uri=f"file://{tmp_path}",
        )

        # Create additional files
        extra_files = make_extra_files(tmp_path, arrow_schema, count=2, start_index=5)

        # Append files
        updated_metadata = write_snapshot(
            files=parquet_files + extra_files,
            schema=arrow_schema,
            current_metadata=metadata,
            output_dir=tmp_path,
            base_uri=f"file://{tmp_path}",
        )

        # Validate
        metadata_file = tmp_path / "metadata" / f"v{len(updated_metadata.snapshots)}.metadata.json"
        table = StaticTable.from_metadata(str(metadata_file))

        # Should have 7 files now
        data_files = table.inspect.data_files()
        assert len(data_files) == 7

        # Should have 2 snapshots
        snapshots = table.inspect.snapshots()
        assert len(snapshots) == 2

        # Scan should return 140 rows
        result = table.scan().to_arrow()
        assert len(result) == 140

        # Check IDs 0-139 are present
        ids = sorted(result["id"].to_pylist())
        assert ids == list(range(140))


class TestDeleteSnapshot:
    """Tests for deleting files from snapshots."""

    def test_delete_files_removes_from_snapshot(self, tmp_path, parquet_files, arrow_schema):
        """Test that deleting files removes them from the current snapshot."""

        # Create initial snapshot
        metadata = write_snapshot(
            files=parquet_files,
            schema=arrow_schema,
            current_metadata=None,
            output_dir=tmp_path,
            base_uri=f"file://{tmp_path}",
        )

        # Delete first 2 files (IDs 0-39) by passing only the remaining files
        remaining_files = parquet_files[2:]

        updated_metadata = write_snapshot(
            files=remaining_files,
            schema=arrow_schema,
            current_metadata=metadata,
            output_dir=tmp_path,
            base_uri=f"file://{tmp_path}",
        )

        # Validate
        metadata_file = tmp_path / "metadata" / f"v{len(updated_metadata.snapshots)}.metadata.json"
        table = StaticTable.from_metadata(str(metadata_file))

        # Should have 3 files now
        data_files = table.inspect.data_files()
        assert len(data_files) == 3

        # Deleted files should not be present
        file_paths = set(data_files["file_path"].to_pylist())
        for deleted in parquet_files[:2]:
            assert deleted.uri not in file_paths

        # Scan should return 60 rows (IDs 40-99)
        result = table.scan().to_arrow()
        assert len(result) == 60

        ids = sorted(result["id"].to_pylist())
        assert ids == list(range(40, 100))


class TestOverwriteSnapshot:
    """Tests for overwrite operations (delete + add)."""

    def test_overwrite_replaces_files(self, tmp_path, parquet_files, arrow_schema):
        """Test that overwrite removes old files and adds new ones."""

        # Create initial snapshot
        metadata = write_snapshot(
            files=parquet_files,
            schema=arrow_schema,
            current_metadata=None,
            output_dir=tmp_path,
            base_uri=f"file://{tmp_path}",
        )

        # Create a replacement file
        replacement_path = tmp_path / "data" / "replacement.parquet"
        replacement_table = pa.table(
            {
                "id": pa.array(list(range(1000, 1020)), type=pa.int64()),
                "name": [f"replaced_{j}" for j in range(20)],
                "value": [float(j) * 2.0 for j in range(20)],
            },
            schema=arrow_schema,
        )
        pq.write_table(replacement_table, replacement_path)
        replacement_file = ParquetFile(
            uri=str(replacement_path),
            path=str(replacement_path),
            size=replacement_path.stat().st_size,
            blob_id=compute_file_hash(replacement_path),
        )

        # Overwrite: replace first file with replacement
        updated_metadata = write_snapshot(
            files=[replacement_file] + parquet_files[1:],
            schema=arrow_schema,
            current_metadata=metadata,
            output_dir=tmp_path,
            base_uri=f"file://{tmp_path}",
        )

        # Validate
        metadata_file = tmp_path / "metadata" / f"v{len(updated_metadata.snapshots)}.metadata.json"
        table = StaticTable.from_metadata(str(metadata_file))

        # Still 5 files
        data_files = table.inspect.data_files()
        assert len(data_files) == 5

        # Old file should be gone, replacement should be present
        file_paths = set(data_files["file_path"].to_pylist())
        assert parquet_files[0].uri not in file_paths
        assert str(replacement_path) in file_paths

        # Snapshot should be OVERWRITE
        snapshot = table.current_snapshot()
        assert snapshot.summary.operation.value == "overwrite"

        # Scan should return 100 rows (20-99 from original + 1000-1019 from replacement)
        result = table.scan().to_arrow()
        assert len(result) == 100


class TestRenameFile:
    """Tests for file rename operations (delete old URI + add new URI)."""

    def test_rename_file_updates_uri(self, tmp_path, parquet_files, arrow_schema):
        """Test renaming a file (delete old URI + add new URI with same content)."""

        # Create initial snapshot
        metadata = write_snapshot(
            files=parquet_files,
            schema=arrow_schema,
            current_metadata=None,
            output_dir=tmp_path,
            base_uri=f"file://{tmp_path}",
        )

        # "Rename" first file: copy to new location
        old_file = parquet_files[0]
        new_path = tmp_path / "data" / "renamed-file.parquet"
        shutil.copy(old_file.uri, new_path)
        new_file = ParquetFile(
            uri=str(new_path),
            path=str(new_path),
            size=new_path.stat().st_size,
            blob_id=compute_file_hash(new_path),
        )

        # Create overwrite snapshot with renamed file
        updated_metadata = write_snapshot(
            files=[new_file] + parquet_files[1:],
            schema=arrow_schema,
            current_metadata=metadata,
            output_dir=tmp_path,
            base_uri=f"file://{tmp_path}",
        )

        # Validate
        metadata_file = tmp_path / "metadata" / f"v{len(updated_metadata.snapshots)}.metadata.json"
        table = StaticTable.from_metadata(str(metadata_file))
        data_files = table.inspect.data_files()
        file_paths = set(data_files["file_path"].to_pylist())

        # Old file should not be present
        assert old_file.uri not in file_paths
        # New file should be present
        assert str(new_path) in file_paths
        # Total files still 5
        assert len(data_files) == 5

        # Data should be unchanged (100 rows with same IDs)
        result = table.scan().to_arrow()
        assert len(result) == 100

        ids = sorted(result["id"].to_pylist())
        assert ids == list(range(100))


class TestManifestEntries:
    """Tests for manifest entry correctness."""

    def test_initial_entries_are_added(self, tmp_path, parquet_files, arrow_schema):
        """Test that initial snapshot entries have ADDED status."""

        write_snapshot(
            files=parquet_files,
            schema=arrow_schema,
            current_metadata=None,
            output_dir=tmp_path,
            base_uri=f"file://{tmp_path}",
        )

        metadata_file = tmp_path / "metadata" / "v1.metadata.json"
        table = StaticTable.from_metadata(str(metadata_file))
        entries = table.inspect.entries()

        # All entries should be ADDED (status=1)
        statuses = entries["status"].to_pylist()
        assert all(s == 1 for s in statuses)
        assert len(statuses) == 5

    def test_append_entries_are_added(self, tmp_path, parquet_files, arrow_schema):
        """Test that appended files have ADDED status in new manifest."""

        metadata = write_snapshot(
            files=parquet_files,
            schema=arrow_schema,
            current_metadata=None,
            output_dir=tmp_path,
            base_uri=f"file://{tmp_path}",
        )

        extra_files = make_extra_files(tmp_path, arrow_schema, count=1, start_index=5)

        updated_metadata = write_snapshot(
            files=parquet_files + extra_files,
            schema=arrow_schema,
            current_metadata=metadata,
            output_dir=tmp_path,
            base_uri=f"file://{tmp_path}",
        )

        metadata_file = tmp_path / "metadata" / f"v{len(updated_metadata.snapshots)}.metadata.json"
        table = StaticTable.from_metadata(str(metadata_file))
        entries = table.inspect.entries()

        # Should have 6 entries total
        assert len(entries) == 6

        # All entries visible should be ADDED (1) or EXISTING (0)
        statuses = entries["status"].to_pylist()
        assert all(s in (0, 1) for s in statuses)


class TestDiffSnapshotFiles:
    """Tests for diff_snapshot function."""

    def test_initial_snapshot_all_added(self, tmp_path, parquet_files, arrow_schema):
        """Test that with no previous metadata, all files are ADDED."""

        io = PyArrowFileIO()
        result = diff_snapshot(parquet_files, None, io)

        # All files should be ADDED
        assert len(result) == 5
        for status, pf in result:
            assert status == ManifestEntryStatus.ADDED
            assert pf in parquet_files

    def test_existing_files_unchanged(self, tmp_path, parquet_files, arrow_schema):
        """Test that files unchanged from previous snapshot are EXISTING."""

        # Create initial snapshot
        metadata = write_snapshot(
            files=parquet_files,
            schema=arrow_schema,
            current_metadata=None,
            output_dir=tmp_path,
            base_uri=f"file://{tmp_path}",
        )

        # Diff with same files
        io = PyArrowFileIO()
        result = diff_snapshot(parquet_files, metadata, io)

        # All files should be EXISTING
        assert len(result) == 5
        for status, pf in result:
            assert status == ManifestEntryStatus.EXISTING
            assert pf in parquet_files

    def test_removed_files(self, tmp_path, parquet_files, arrow_schema):
        """Test that files in previous snapshot but not in current are REMOVED."""

        # Create initial snapshot
        metadata = write_snapshot(
            files=parquet_files,
            schema=arrow_schema,
            current_metadata=None,
            output_dir=tmp_path,
            base_uri=f"file://{tmp_path}",
        )

        # Diff with subset of files (remove first 2)
        io = PyArrowFileIO()
        current_files = parquet_files[2:]  # Keep only last 3 files
        result = diff_snapshot(current_files, metadata, io)

        # Should have 3 EXISTING + 2 REMOVED = 5 total
        assert len(result) == 5

        existing_count = sum(1 for status, _ in result if status == ManifestEntryStatus.EXISTING)
        removed_count = sum(1 for status, _ in result if status == ManifestEntryStatus.DELETED)

        assert existing_count == 3
        assert removed_count == 2

        # Check that removed files are the first 2
        removed_files = [pf for status, pf in result if status == ManifestEntryStatus.DELETED]
        assert len(removed_files) == 2
        for pf in removed_files:
            assert pf.uri in [parquet_files[0].uri, parquet_files[1].uri]

    def test_changed_files_removed_and_added(self, tmp_path, parquet_files, arrow_schema):
        """Test that files with same URI but different hash/size are REMOVED + ADDED."""

        # Create initial snapshot
        metadata = write_snapshot(
            files=parquet_files,
            schema=arrow_schema,
            current_metadata=None,
            output_dir=tmp_path,
            base_uri=f"file://{tmp_path}",
        )

        # Modify first file (same URI, different content)
        first_file_path = Path(parquet_files[0].uri)
        modified_table = pa.table(
            {
                "id": pa.array([999], type=pa.int64()),
                "name": ["modified"],
                "value": [999.9],
            },
            schema=arrow_schema,
        )
        pq.write_table(modified_table, first_file_path)

        # Create new ParquetFile with same URI but new hash
        modified_file = ParquetFile(
            uri=str(first_file_path),
            path=str(first_file_path),
            size=first_file_path.stat().st_size,
            blob_id=compute_file_hash(first_file_path),
        )

        current_files = [modified_file] + parquet_files[1:]

        # Diff
        io = PyArrowFileIO()
        result = diff_snapshot(current_files, metadata, io)

        # Should have: 1 REMOVED (old version) + 1 ADDED (new version) + 4 EXISTING = 6 total
        assert len(result) == 6

        added_count = sum(1 for status, _ in result if status == ManifestEntryStatus.ADDED)
        removed_count = sum(1 for status, _ in result if status == ManifestEntryStatus.DELETED)
        existing_count = sum(1 for status, _ in result if status == ManifestEntryStatus.EXISTING)

        assert added_count == 1
        assert removed_count == 1
        assert existing_count == 4


class TestSchemaConversion:
    """Tests for create_schema with complex nested structures."""

    def test_schema_with_nested_struct(self):
        """Test schema conversion with nested struct fields."""
        # Create PyArrow schema with nested struct
        arrow_schema = pa.schema(
            [
                pa.field("id", pa.int64()),
                pa.field(
                    "metadata",
                    pa.struct(
                        [
                            pa.field("title", pa.string()),
                            pa.field("author", pa.string()),
                            pa.field("year", pa.int32()),
                        ]
                    ),
                ),
            ]
        )

        schema = create_schema(arrow_schema, include_split_column=False)

        # Verify structure
        field_names = [f.name for f in schema.fields]
        assert "id" in field_names
        assert "metadata" in field_names

        # Find metadata field
        metadata_field = next(f for f in schema.fields if f.name == "metadata")
        assert isinstance(metadata_field.field_type, StructType)

        # Verify nested fields
        nested_field_names = [f.name for f in metadata_field.field_type.fields]
        assert "title" in nested_field_names
        assert "author" in nested_field_names
        assert "year" in nested_field_names

    def test_schema_with_list_field(self):
        """Test schema conversion with list fields."""
        arrow_schema = pa.schema(
            [
                pa.field("id", pa.int64()),
                pa.field("tags", pa.list_(pa.string())),
            ]
        )

        schema = create_schema(arrow_schema, include_split_column=False)

        # Find tags field
        tags_field = next(f for f in schema.fields if f.name == "tags")
        assert isinstance(tags_field.field_type, ListType)

    def test_schema_with_deeply_nested_structures(self):
        """Test schema conversion with deeply nested structures."""
        arrow_schema = pa.schema(
            [
                pa.field("id", pa.int64()),
                pa.field(
                    "nested",
                    pa.struct(
                        [
                            pa.field("field1", pa.string()),
                            pa.field("field2", pa.int32()),
                            pa.field(
                                "deeper",
                                pa.struct([pa.field("field3", pa.string())]),
                            ),
                        ]
                    ),
                ),
                pa.field("list_field", pa.list_(pa.string())),
            ]
        )

        schema = create_schema(arrow_schema, include_split_column=True)

        # Should include split column
        field_names = [f.name for f in schema.fields]
        assert "split" in field_names
        assert schema.fields[0].name == "split"

        # Verify nested field exists
        nested_field = next(f for f in schema.fields if f.name == "nested")
        assert isinstance(nested_field.field_type, StructType)

    def test_unique_field_ids_across_nested_structures(self):
        """Test that all field IDs are unique across nested structures."""
        arrow_schema = pa.schema(
            [
                pa.field("id", pa.int64()),
                pa.field(
                    "nested",
                    pa.struct(
                        [
                            pa.field("field1", pa.string()),
                            pa.field("field2", pa.int32()),
                            pa.field(
                                "deeper",
                                pa.struct([pa.field("field3", pa.string())]),
                            ),
                        ]
                    ),
                ),
                pa.field("list_field", pa.list_(pa.string())),
            ]
        )

        schema = create_schema(arrow_schema, include_split_column=True)

        # Collect all field IDs recursively
        def collect_field_ids(field_type, ids=None):
            if ids is None:
                ids = []

            if isinstance(field_type, StructType):
                for field in field_type.fields:
                    ids.append(field.field_id)
                    collect_field_ids(field.field_type, ids)
            elif isinstance(field_type, ListType):
                ids.append(field_type.element_id)
                collect_field_ids(field_type.element_type, ids)

            return ids

        # Get all field IDs
        all_ids = [f.field_id for f in schema.fields]
        for field in schema.fields:
            all_ids.extend(collect_field_ids(field.field_type))

        # Check all IDs are unique
        assert len(all_ids) == len(set(all_ids)), f"Duplicate field IDs found: {all_ids}"


class TestNameMapping:
    """Tests for name mapping with nested structures."""

    def test_name_mapping_with_nested_structs(self, tmp_path):
        """Test that name mapping includes nested struct fields."""
        # Create schema with nested structs
        iceberg_schema = pa.schema(
            [
                pa.field("id", pa.string()),
                pa.field(
                    "metadata",
                    pa.struct(
                        [
                            pa.field("author", pa.string()),
                            pa.field("year", pa.int32()),
                        ]
                    ),
                ),
            ]
        )

        # Create a test parquet file
        data_dir = tmp_path / "data"
        data_dir.mkdir()
        file_path = data_dir / "test.parquet"
        table = pa.table(
            {
                "id": ["1", "2"],
                "metadata": [
                    {"author": "Alice", "year": 2020},
                    {"author": "Bob", "year": 2021},
                ],
            },
            schema=iceberg_schema,
        )
        pq.write_table(table, file_path)

        files = [
            ParquetFile(
                uri=str(file_path),
                path=str(file_path),
                size=file_path.stat().st_size,
                blob_id="test",
            )
        ]

        # Write snapshot with schema
        metadata = write_snapshot(
            files=files,
            schema=iceberg_schema,
            current_metadata=None,
            output_dir=tmp_path,
            base_uri=f"file://{tmp_path}",
            include_split_column=False,
        )

        # Get name mapping from properties
        name_mapping = json.loads(metadata.properties["schema.name-mapping.default"])

        # Check top-level fields
        assert len(name_mapping) == 2
        assert name_mapping[0]["names"] == ["id"]
        assert name_mapping[1]["names"] == ["metadata"]

        # Check nested struct field
        metadata_mapping = name_mapping[1]
        assert "fields" in metadata_mapping
        assert len(metadata_mapping["fields"]) == 2

        # Check nested struct's child fields
        assert metadata_mapping["fields"][0]["names"] == ["author"]
        assert metadata_mapping["fields"][1]["names"] == ["year"]

    def test_name_mapping_with_lists(self, tmp_path):
        """Test that name mapping includes list element mappings."""
        # Create schema with list of strings and list of structs
        iceberg_schema = pa.schema(
            [
                pa.field("id", pa.string()),
                pa.field("tags", pa.list_(pa.string())),
                pa.field(
                    "items",
                    pa.list_(
                        pa.struct(
                            [
                                pa.field("name", pa.string()),
                                pa.field("value", pa.string()),
                            ]
                        )
                    ),
                ),
            ]
        )

        # Create test parquet file
        data_dir = tmp_path / "data"
        data_dir.mkdir()
        file_path = data_dir / "test.parquet"
        table = pa.table(
            {
                "id": ["1"],
                "tags": [["tag1", "tag2"]],
                "items": [[{"name": "item1", "value": "val1"}]],
            },
            schema=iceberg_schema,
        )
        pq.write_table(table, file_path)

        files = [
            ParquetFile(
                uri=str(file_path),
                path=str(file_path),
                size=file_path.stat().st_size,
                blob_id="test",
            )
        ]

        metadata = write_snapshot(
            files=files,
            schema=iceberg_schema,
            current_metadata=None,
            output_dir=tmp_path,
            base_uri=f"file://{tmp_path}",
            include_split_column=False,
        )

        name_mapping = json.loads(metadata.properties["schema.name-mapping.default"])

        # Check list of strings (tags)
        tags_mapping = name_mapping[1]
        assert tags_mapping["names"] == ["tags"]
        assert "fields" in tags_mapping
        assert len(tags_mapping["fields"]) == 1

        # Check element mapping for simple list
        element_mapping = tags_mapping["fields"][0]
        assert element_mapping["names"] == ["element"]

        # Check list of structs (items)
        items_mapping = name_mapping[2]
        assert items_mapping["names"] == ["items"]
        assert "fields" in items_mapping

        # Check element mapping for list of structs
        items_element = items_mapping["fields"][0]
        assert items_element["names"] == ["element"]
        assert "fields" in items_element

        # Check struct fields within list element
        assert len(items_element["fields"]) == 2
        assert items_element["fields"][0]["names"] == ["name"]
        assert items_element["fields"][1]["names"] == ["value"]

    def test_name_mapping_with_maps(self, tmp_path):
        """Test that name mapping includes map key and value mappings."""
        # Create schema with a map
        iceberg_schema = pa.schema(
            [
                pa.field("id", pa.string()),
                pa.field(
                    "metadata",
                    pa.map_(
                        pa.string(),
                        pa.struct(
                            [
                                pa.field("count", pa.int32()),
                                pa.field("name", pa.string()),
                            ]
                        ),
                    ),
                ),
            ]
        )

        # Create test parquet file
        data_dir = tmp_path / "data"
        data_dir.mkdir()
        file_path = data_dir / "test.parquet"
        table = pa.table(
            {
                "id": ["1"],
                "metadata": [[("key1", {"count": 1, "name": "name1"})]],
            },
            schema=iceberg_schema,
        )
        pq.write_table(table, file_path)

        files = [
            ParquetFile(
                uri=str(file_path),
                path=str(file_path),
                size=file_path.stat().st_size,
                blob_id="test",
            )
        ]

        metadata = write_snapshot(
            files=files,
            schema=iceberg_schema,
            current_metadata=None,
            output_dir=tmp_path,
            base_uri=f"file://{tmp_path}",
            include_split_column=False,
        )

        name_mapping = json.loads(metadata.properties["schema.name-mapping.default"])

        # Check map field
        metadata_mapping = name_mapping[1]
        assert metadata_mapping["names"] == ["metadata"]
        assert "fields" in metadata_mapping
        assert len(metadata_mapping["fields"]) == 2

        # Check key mapping
        key_mapping = metadata_mapping["fields"][0]
        assert key_mapping["names"] == ["key"]

        # Check value mapping
        value_mapping = metadata_mapping["fields"][1]
        assert value_mapping["names"] == ["value"]
        assert "fields" in value_mapping

        # Check struct fields within map value
        assert len(value_mapping["fields"]) == 2
        assert value_mapping["fields"][0]["names"] == ["count"]
        assert value_mapping["fields"][1]["names"] == ["name"]