faceberg 0.1.1__py3-none-any.whl → 0.1.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,911 @@
+ """Tests for the iceberg module (Iceberg metadata generation)."""
+
+ import hashlib
+ import json
+ import shutil
+ from pathlib import Path
+
+ import pyarrow as pa
+ import pyarrow.parquet as pq
+ import pytest
+ from pyiceberg.io.pyarrow import PyArrowFileIO
+ from pyiceberg.manifest import ManifestEntryStatus
+ from pyiceberg.table import StaticTable
+ from pyiceberg.types import ListType, StructType
+
+ from faceberg.iceberg import ParquetFile, create_schema, diff_snapshot, write_snapshot
+
+
+ @pytest.fixture
+ def arrow_schema():
+     """Create a simple PyArrow schema for testing."""
+     return pa.schema(
+         [
+             pa.field("id", pa.int64()),
+             pa.field("name", pa.string()),
+             pa.field("value", pa.float64()),
+         ]
+     )
+
+
+ def compute_file_hash(path: Path) -> str:
+     """Compute MD5 hash of file contents."""
+     md5 = hashlib.md5()
+     with open(path, "rb") as f:
+         for chunk in iter(lambda: f.read(8192), b""):
+             md5.update(chunk)
+     return md5.hexdigest()
+
+
+ @pytest.fixture
+ def parquet_files(tmp_path, arrow_schema):
+     """Create 5 parquet files with 20 rows each (100 total), each row unique."""
+     files = []
+     data_dir = tmp_path / "data"
+     data_dir.mkdir(parents=True, exist_ok=True)
+
+     for i in range(5):
+         path = data_dir / f"part-{i:05d}.parquet"
+
+         # Each file has 20 unique rows
+         start_id = i * 20
+         table = pa.table(
+             {
+                 "id": pa.array(list(range(start_id, start_id + 20)), type=pa.int64()),
+                 "name": [f"name_{j}" for j in range(start_id, start_id + 20)],
+                 "value": [float(j) * 1.5 for j in range(start_id, start_id + 20)],
+             },
+             schema=arrow_schema,
+         )
+         pq.write_table(table, path)
+         files.append(
+             ParquetFile(
+                 uri=str(path),
+                 path=str(path),
+                 size=path.stat().st_size,
+                 blob_id=compute_file_hash(path),
+             )
+         )
+
+     return files
+
+
+ def make_extra_files(tmp_path, arrow_schema, count=2, start_index=5):
+     """Create additional parquet files for append tests."""
+     files = []
+     data_dir = tmp_path / "data"
+     data_dir.mkdir(parents=True, exist_ok=True)
+
+     for i in range(count):
+         idx = start_index + i
+         path = data_dir / f"part-{idx:05d}.parquet"
+
+         start_id = idx * 20
+         table = pa.table(
+             {
+                 "id": pa.array(list(range(start_id, start_id + 20)), type=pa.int64()),
+                 "name": [f"name_{j}" for j in range(start_id, start_id + 20)],
+                 "value": [float(j) * 1.5 for j in range(start_id, start_id + 20)],
+             },
+             schema=arrow_schema,
+         )
+         pq.write_table(table, path)
+         files.append(
+             ParquetFile(
+                 uri=str(path),
+                 path=str(path),
+                 size=path.stat().st_size,
+                 blob_id=compute_file_hash(path),
+             )
+         )
+
+     return files
+
+
+ class TestInitialSnapshot:
+     """Tests for creating initial table snapshots."""
+
+     def test_initial_snapshot_creates_valid_metadata(self, tmp_path, parquet_files, arrow_schema):
+         """Test that initial snapshot creates valid Iceberg metadata."""
+         _metadata = write_snapshot(
+             files=parquet_files,
+             schema=arrow_schema,
+             current_metadata=None,
+             output_dir=tmp_path,
+             base_uri=f"file://{tmp_path}",
+         )
+
+         # Validate using StaticTable - pass the metadata file path
+         metadata_file = tmp_path / "metadata" / "v1.metadata.json"
+         table = StaticTable.from_metadata(str(metadata_file))
+
+         # Check data files
+         data_files = table.inspect.data_files()
+         assert len(data_files) == 5
+
+         # Check file paths match
+         file_paths = set(data_files["file_path"].to_pylist())
+         expected_paths = {f.uri for f in parquet_files}
+         assert file_paths == expected_paths
+
+         # Check snapshot summary
+         snapshot = table.current_snapshot()
+         assert snapshot is not None
+         assert snapshot.summary.operation.value == "append"
+         assert int(snapshot.summary["added-data-files"]) == 5
+         assert int(snapshot.summary["total-records"]) == 100
+
+     def test_initial_snapshot_scan_returns_data(self, tmp_path, parquet_files, arrow_schema):
+         """Test that initial snapshot can be scanned correctly."""
+
+         write_snapshot(
+             files=parquet_files,
+             schema=arrow_schema,
+             current_metadata=None,
+             output_dir=tmp_path,
+             base_uri=f"file://{tmp_path}",
+         )
+
+         metadata_file = tmp_path / "metadata" / "v1.metadata.json"
+         table = StaticTable.from_metadata(str(metadata_file))
+         result = table.scan().to_arrow()
+
+         # Should have 100 rows (5 files * 20 rows)
+         assert len(result) == 100
+
+         # Check all IDs are present (0-99)
+         ids = sorted(result["id"].to_pylist())
+         assert ids == list(range(100))
+
+
+ class TestAppendSnapshot:
+     """Tests for appending files to existing snapshots."""
+
+     def test_append_files_creates_new_snapshot(self, tmp_path, parquet_files, arrow_schema):
+         """Test that appending files creates a new snapshot with all files."""
+
+         # Create initial snapshot
+         metadata = write_snapshot(
+             files=parquet_files,
+             schema=arrow_schema,
+             current_metadata=None,
+             output_dir=tmp_path,
+             base_uri=f"file://{tmp_path}",
+         )
+
+         # Create additional files
+         extra_files = make_extra_files(tmp_path, arrow_schema, count=2, start_index=5)
+
+         # Append files
+         updated_metadata = write_snapshot(
+             files=parquet_files + extra_files,
+             schema=arrow_schema,
+             current_metadata=metadata,
+             output_dir=tmp_path,
+             base_uri=f"file://{tmp_path}",
+         )
+
+         # Validate
+         metadata_file = tmp_path / "metadata" / f"v{len(updated_metadata.snapshots)}.metadata.json"
+         table = StaticTable.from_metadata(str(metadata_file))
+
+         # Should have 7 files now
+         data_files = table.inspect.data_files()
+         assert len(data_files) == 7
+
+         # Should have 2 snapshots
+         snapshots = table.inspect.snapshots()
+         assert len(snapshots) == 2
+
+         # Scan should return 140 rows
+         result = table.scan().to_arrow()
+         assert len(result) == 140
+
+         # Check IDs 0-139 are present
+         ids = sorted(result["id"].to_pylist())
+         assert ids == list(range(140))
+
+
+ class TestDeleteSnapshot:
+     """Tests for deleting files from snapshots."""
+
+     def test_delete_files_removes_from_snapshot(self, tmp_path, parquet_files, arrow_schema):
+         """Test that deleting files removes them from the current snapshot."""
+
+         # Create initial snapshot
+         metadata = write_snapshot(
+             files=parquet_files,
+             schema=arrow_schema,
+             current_metadata=None,
+             output_dir=tmp_path,
+             base_uri=f"file://{tmp_path}",
+         )
+
+         # Delete first 2 files (IDs 0-39) by passing only the remaining files
+         remaining_files = parquet_files[2:]
+
+         updated_metadata = write_snapshot(
+             files=remaining_files,
+             schema=arrow_schema,
+             current_metadata=metadata,
+             output_dir=tmp_path,
+             base_uri=f"file://{tmp_path}",
+         )
+
+         # Validate
+         metadata_file = tmp_path / "metadata" / f"v{len(updated_metadata.snapshots)}.metadata.json"
+         table = StaticTable.from_metadata(str(metadata_file))
+
+         # Should have 3 files now
+         data_files = table.inspect.data_files()
+         assert len(data_files) == 3
+
+         # Deleted files should not be present
+         file_paths = set(data_files["file_path"].to_pylist())
+         for deleted in parquet_files[:2]:
+             assert deleted.uri not in file_paths
+
+         # Scan should return 60 rows (IDs 40-99)
+         result = table.scan().to_arrow()
+         assert len(result) == 60
+
+         ids = sorted(result["id"].to_pylist())
+         assert ids == list(range(40, 100))
+
+
+ class TestOverwriteSnapshot:
+     """Tests for overwrite operations (delete + add)."""
+
+     def test_overwrite_replaces_files(self, tmp_path, parquet_files, arrow_schema):
+         """Test that overwrite removes old files and adds new ones."""
+
+         # Create initial snapshot
+         metadata = write_snapshot(
+             files=parquet_files,
+             schema=arrow_schema,
+             current_metadata=None,
+             output_dir=tmp_path,
+             base_uri=f"file://{tmp_path}",
+         )
+
+         # Create a replacement file
+         replacement_path = tmp_path / "data" / "replacement.parquet"
+         replacement_table = pa.table(
+             {
+                 "id": pa.array(list(range(1000, 1020)), type=pa.int64()),
+                 "name": [f"replaced_{j}" for j in range(20)],
+                 "value": [float(j) * 2.0 for j in range(20)],
+             },
+             schema=arrow_schema,
+         )
+         pq.write_table(replacement_table, replacement_path)
+         replacement_file = ParquetFile(
+             uri=str(replacement_path),
+             path=str(replacement_path),
+             size=replacement_path.stat().st_size,
+             blob_id=compute_file_hash(replacement_path),
+         )
+
+         # Overwrite: replace first file with replacement
+         updated_metadata = write_snapshot(
+             files=[replacement_file] + parquet_files[1:],
+             schema=arrow_schema,
+             current_metadata=metadata,
+             output_dir=tmp_path,
+             base_uri=f"file://{tmp_path}",
+         )
+
+         # Validate
+         metadata_file = tmp_path / "metadata" / f"v{len(updated_metadata.snapshots)}.metadata.json"
+         table = StaticTable.from_metadata(str(metadata_file))
+
+         # Still 5 files
+         data_files = table.inspect.data_files()
+         assert len(data_files) == 5
+
+         # Old file should be gone, replacement should be present
+         file_paths = set(data_files["file_path"].to_pylist())
+         assert parquet_files[0].uri not in file_paths
+         assert str(replacement_path) in file_paths
+
+         # Snapshot should be OVERWRITE
+         snapshot = table.current_snapshot()
+         assert snapshot.summary.operation.value == "overwrite"
+
+         # Scan should return 100 rows (20-99 from original + 1000-1019 from replacement)
+         result = table.scan().to_arrow()
+         assert len(result) == 100
+
+
+ class TestRenameFile:
+     """Tests for file rename operations (delete old URI + add new URI)."""
+
+     def test_rename_file_updates_uri(self, tmp_path, parquet_files, arrow_schema):
+         """Test renaming a file (delete old URI + add new URI with same content)."""
+
+         # Create initial snapshot
+         metadata = write_snapshot(
+             files=parquet_files,
+             schema=arrow_schema,
+             current_metadata=None,
+             output_dir=tmp_path,
+             base_uri=f"file://{tmp_path}",
+         )
+
+         # "Rename" first file: copy to new location
+         old_file = parquet_files[0]
+         new_path = tmp_path / "data" / "renamed-file.parquet"
+         shutil.copy(old_file.uri, new_path)
+         new_file = ParquetFile(
+             uri=str(new_path),
+             path=str(new_path),
+             size=new_path.stat().st_size,
+             blob_id=compute_file_hash(new_path),
+         )
+
+         # Create overwrite snapshot with renamed file
+         updated_metadata = write_snapshot(
+             files=[new_file] + parquet_files[1:],
+             schema=arrow_schema,
+             current_metadata=metadata,
+             output_dir=tmp_path,
+             base_uri=f"file://{tmp_path}",
+         )
+
+         # Validate
+         metadata_file = tmp_path / "metadata" / f"v{len(updated_metadata.snapshots)}.metadata.json"
+         table = StaticTable.from_metadata(str(metadata_file))
+         data_files = table.inspect.data_files()
+         file_paths = set(data_files["file_path"].to_pylist())
+
+         # Old file should not be present
+         assert old_file.uri not in file_paths
+         # New file should be present
+         assert str(new_path) in file_paths
+         # Total files still 5
+         assert len(data_files) == 5
+
+         # Data should be unchanged (100 rows with same IDs)
+         result = table.scan().to_arrow()
+         assert len(result) == 100
+
+         ids = sorted(result["id"].to_pylist())
+         assert ids == list(range(100))
+
+
+ class TestManifestEntries:
+     """Tests for manifest entry correctness."""
+
+     def test_initial_entries_are_added(self, tmp_path, parquet_files, arrow_schema):
+         """Test that initial snapshot entries have ADDED status."""
+
+         write_snapshot(
+             files=parquet_files,
+             schema=arrow_schema,
+             current_metadata=None,
+             output_dir=tmp_path,
+             base_uri=f"file://{tmp_path}",
+         )
+
+         metadata_file = tmp_path / "metadata" / "v1.metadata.json"
+         table = StaticTable.from_metadata(str(metadata_file))
+         entries = table.inspect.entries()
+
+         # All entries should be ADDED (status=1)
+         statuses = entries["status"].to_pylist()
+         assert all(s == 1 for s in statuses)
+         assert len(statuses) == 5
+
+     def test_append_entries_are_added(self, tmp_path, parquet_files, arrow_schema):
+         """Test that appended files have ADDED status in new manifest."""
+
+         metadata = write_snapshot(
+             files=parquet_files,
+             schema=arrow_schema,
+             current_metadata=None,
+             output_dir=tmp_path,
+             base_uri=f"file://{tmp_path}",
+         )
+
+         extra_files = make_extra_files(tmp_path, arrow_schema, count=1, start_index=5)
+
+         updated_metadata = write_snapshot(
+             files=parquet_files + extra_files,
+             schema=arrow_schema,
+             current_metadata=metadata,
+             output_dir=tmp_path,
+             base_uri=f"file://{tmp_path}",
+         )
+
+         metadata_file = tmp_path / "metadata" / f"v{len(updated_metadata.snapshots)}.metadata.json"
+         table = StaticTable.from_metadata(str(metadata_file))
+         entries = table.inspect.entries()
+
+         # Should have 6 entries total
+         assert len(entries) == 6
+
+         # All visible entries should be ADDED (1) or EXISTING (0)
+         statuses = entries["status"].to_pylist()
+         assert all(s in (0, 1) for s in statuses)
+
+
+ class TestDiffSnapshotFiles:
+     """Tests for diff_snapshot function."""
+
+     def test_initial_snapshot_all_added(self, tmp_path, parquet_files, arrow_schema):
+         """Test that with no previous metadata, all files are ADDED."""
+
+         io = PyArrowFileIO()
+         result = diff_snapshot(parquet_files, None, io)
+
+         # All files should be ADDED
+         assert len(result) == 5
+         for status, pf in result:
+             assert status == ManifestEntryStatus.ADDED
+             assert pf in parquet_files
+
+     def test_existing_files_unchanged(self, tmp_path, parquet_files, arrow_schema):
+         """Test that files unchanged from previous snapshot are EXISTING."""
+
+         # Create initial snapshot
+         metadata = write_snapshot(
+             files=parquet_files,
+             schema=arrow_schema,
+             current_metadata=None,
+             output_dir=tmp_path,
+             base_uri=f"file://{tmp_path}",
+         )
+
+         # Diff with same files
+         io = PyArrowFileIO()
+         result = diff_snapshot(parquet_files, metadata, io)
+
+         # All files should be EXISTING
+         assert len(result) == 5
+         for status, pf in result:
+             assert status == ManifestEntryStatus.EXISTING
+             assert pf in parquet_files
+
+     def test_removed_files(self, tmp_path, parquet_files, arrow_schema):
+         """Test that files in previous snapshot but not in current are REMOVED."""
+
+         # Create initial snapshot
+         metadata = write_snapshot(
+             files=parquet_files,
+             schema=arrow_schema,
+             current_metadata=None,
+             output_dir=tmp_path,
+             base_uri=f"file://{tmp_path}",
+         )
+
+         # Diff with subset of files (remove first 2)
+         io = PyArrowFileIO()
+         current_files = parquet_files[2:] # Keep only last 3 files
+         result = diff_snapshot(current_files, metadata, io)
+
+         # Should have 3 EXISTING + 2 REMOVED = 5 total
+         assert len(result) == 5
+
+         existing_count = sum(1 for status, _ in result if status == ManifestEntryStatus.EXISTING)
+         removed_count = sum(1 for status, _ in result if status == ManifestEntryStatus.DELETED)
+
+         assert existing_count == 3
+         assert removed_count == 2
+
+         # Check that removed files are the first 2
+         removed_files = [pf for status, pf in result if status == ManifestEntryStatus.DELETED]
+         assert len(removed_files) == 2
+         for pf in removed_files:
+             assert pf.uri in [parquet_files[0].uri, parquet_files[1].uri]
+
+     def test_changed_files_removed_and_added(self, tmp_path, parquet_files, arrow_schema):
+         """Test that files with same URI but different hash/size are REMOVED + ADDED."""
+
+         # Create initial snapshot
+         metadata = write_snapshot(
+             files=parquet_files,
+             schema=arrow_schema,
+             current_metadata=None,
+             output_dir=tmp_path,
+             base_uri=f"file://{tmp_path}",
+         )
+
+         # Modify first file (same URI, different content)
+         first_file_path = Path(parquet_files[0].uri)
+         modified_table = pa.table(
+             {
+                 "id": pa.array([999], type=pa.int64()),
+                 "name": ["modified"],
+                 "value": [999.9],
+             },
+             schema=arrow_schema,
+         )
+         pq.write_table(modified_table, first_file_path)
+
+         # Create new ParquetFile with same URI but new hash
+         modified_file = ParquetFile(
+             uri=str(first_file_path),
+             path=str(first_file_path),
+             size=first_file_path.stat().st_size,
+             blob_id=compute_file_hash(first_file_path),
+         )
+
+         current_files = [modified_file] + parquet_files[1:]
+
+         # Diff
+         io = PyArrowFileIO()
+         result = diff_snapshot(current_files, metadata, io)
+
+         # Should have: 1 REMOVED (old version) + 1 ADDED (new version) + 4 EXISTING = 6 total
+         assert len(result) == 6
+
+         added_count = sum(1 for status, _ in result if status == ManifestEntryStatus.ADDED)
+         removed_count = sum(1 for status, _ in result if status == ManifestEntryStatus.DELETED)
+         existing_count = sum(1 for status, _ in result if status == ManifestEntryStatus.EXISTING)
+
+         assert added_count == 1
+         assert removed_count == 1
+         assert existing_count == 4
+
+
+ class TestSchemaConversion:
+     """Tests for create_schema with complex nested structures."""
+
+     def test_schema_with_nested_struct(self):
+         """Test schema conversion with nested struct fields."""
+         # Create PyArrow schema with nested struct
+         arrow_schema = pa.schema(
+             [
+                 pa.field("id", pa.int64()),
+                 pa.field(
+                     "metadata",
+                     pa.struct(
+                         [
+                             pa.field("title", pa.string()),
+                             pa.field("author", pa.string()),
+                             pa.field("year", pa.int32()),
+                         ]
+                     ),
+                 ),
+             ]
+         )
+
+         schema = create_schema(arrow_schema, include_split_column=False)
+
+         # Verify structure
+         field_names = [f.name for f in schema.fields]
+         assert "id" in field_names
+         assert "metadata" in field_names
+
+         # Find metadata field
+         metadata_field = next(f for f in schema.fields if f.name == "metadata")
+         assert isinstance(metadata_field.field_type, StructType)
+
+         # Verify nested fields
+         nested_field_names = [f.name for f in metadata_field.field_type.fields]
+         assert "title" in nested_field_names
+         assert "author" in nested_field_names
+         assert "year" in nested_field_names
+
+     def test_schema_with_list_field(self):
+         """Test schema conversion with list fields."""
+         arrow_schema = pa.schema(
+             [
+                 pa.field("id", pa.int64()),
+                 pa.field("tags", pa.list_(pa.string())),
+             ]
+         )
+
+         schema = create_schema(arrow_schema, include_split_column=False)
+
+         # Find tags field
+         tags_field = next(f for f in schema.fields if f.name == "tags")
+         assert isinstance(tags_field.field_type, ListType)
+
+     def test_schema_with_deeply_nested_structures(self):
+         """Test schema conversion with deeply nested structures."""
+         arrow_schema = pa.schema(
+             [
+                 pa.field("id", pa.int64()),
+                 pa.field(
+                     "nested",
+                     pa.struct(
+                         [
+                             pa.field("field1", pa.string()),
+                             pa.field("field2", pa.int32()),
+                             pa.field(
+                                 "deeper",
+                                 pa.struct([pa.field("field3", pa.string())]),
+                             ),
+                         ]
+                     ),
+                 ),
+                 pa.field("list_field", pa.list_(pa.string())),
+             ]
+         )
+
+         schema = create_schema(arrow_schema, include_split_column=True)
+
+         # Should include split column
+         field_names = [f.name for f in schema.fields]
+         assert "split" in field_names
+         assert schema.fields[0].name == "split"
+
+         # Verify nested field exists
+         nested_field = next(f for f in schema.fields if f.name == "nested")
+         assert isinstance(nested_field.field_type, StructType)
+
+     def test_unique_field_ids_across_nested_structures(self):
+         """Test that all field IDs are unique across nested structures."""
+         arrow_schema = pa.schema(
+             [
+                 pa.field("id", pa.int64()),
+                 pa.field(
+                     "nested",
+                     pa.struct(
+                         [
+                             pa.field("field1", pa.string()),
+                             pa.field("field2", pa.int32()),
+                             pa.field(
+                                 "deeper",
+                                 pa.struct([pa.field("field3", pa.string())]),
+                             ),
+                         ]
+                     ),
+                 ),
+                 pa.field("list_field", pa.list_(pa.string())),
+             ]
+         )
+
+         schema = create_schema(arrow_schema, include_split_column=True)
+
+         # Collect all field IDs recursively
+         def collect_field_ids(field_type, ids=None):
+             if ids is None:
+                 ids = []
+
+             if isinstance(field_type, StructType):
+                 for field in field_type.fields:
+                     ids.append(field.field_id)
+                     collect_field_ids(field.field_type, ids)
+             elif isinstance(field_type, ListType):
+                 ids.append(field_type.element_id)
+                 collect_field_ids(field_type.element_type, ids)
+
+             return ids
+
+         # Get all field IDs
+         all_ids = [f.field_id for f in schema.fields]
+         for field in schema.fields:
+             all_ids.extend(collect_field_ids(field.field_type))
+
+         # Check all IDs are unique
+         assert len(all_ids) == len(set(all_ids)), f"Duplicate field IDs found: {all_ids}"
+
+
+ class TestNameMapping:
+     """Tests for name mapping with nested structures."""
+
+     def test_name_mapping_with_nested_structs(self, tmp_path):
+         """Test that name mapping includes nested struct fields."""
+         # Create schema with nested structs
+         iceberg_schema = pa.schema(
+             [
+                 pa.field("id", pa.string()),
+                 pa.field(
+                     "metadata",
+                     pa.struct(
+                         [
+                             pa.field("author", pa.string()),
+                             pa.field("year", pa.int32()),
+                         ]
+                     ),
+                 ),
+             ]
+         )
+
+         # Create a test parquet file
+         data_dir = tmp_path / "data"
+         data_dir.mkdir()
+         file_path = data_dir / "test.parquet"
+         table = pa.table(
+             {
+                 "id": ["1", "2"],
+                 "metadata": [
+                     {"author": "Alice", "year": 2020},
+                     {"author": "Bob", "year": 2021},
+                 ],
+             },
+             schema=iceberg_schema,
+         )
+         pq.write_table(table, file_path)
+
+         files = [
+             ParquetFile(
+                 uri=str(file_path),
+                 path=str(file_path),
+                 size=file_path.stat().st_size,
+                 blob_id="test",
+             )
+         ]
+
+         # Write snapshot with schema
+         metadata = write_snapshot(
+             files=files,
+             schema=iceberg_schema,
+             current_metadata=None,
+             output_dir=tmp_path,
+             base_uri=f"file://{tmp_path}",
+             include_split_column=False,
+         )
+
+         # Get name mapping from properties
+         name_mapping = json.loads(metadata.properties["schema.name-mapping.default"])
+
+         # Check top-level fields
+         assert len(name_mapping) == 2
+         assert name_mapping[0]["names"] == ["id"]
+         assert name_mapping[1]["names"] == ["metadata"]
+
+         # Check nested struct field
+         metadata_mapping = name_mapping[1]
+         assert "fields" in metadata_mapping
+         assert len(metadata_mapping["fields"]) == 2
+
+         # Check nested struct's child fields
+         assert metadata_mapping["fields"][0]["names"] == ["author"]
+         assert metadata_mapping["fields"][1]["names"] == ["year"]
+
+     def test_name_mapping_with_lists(self, tmp_path):
+         """Test that name mapping includes list element mappings."""
+         # Create schema with list of strings and list of structs
+         iceberg_schema = pa.schema(
+             [
+                 pa.field("id", pa.string()),
+                 pa.field("tags", pa.list_(pa.string())),
+                 pa.field(
+                     "items",
+                     pa.list_(
+                         pa.struct(
+                             [
+                                 pa.field("name", pa.string()),
+                                 pa.field("value", pa.string()),
+                             ]
+                         )
+                     ),
+                 ),
+             ]
+         )
+
+         # Create test parquet file
+         data_dir = tmp_path / "data"
+         data_dir.mkdir()
+         file_path = data_dir / "test.parquet"
+         table = pa.table(
+             {
+                 "id": ["1"],
+                 "tags": [["tag1", "tag2"]],
+                 "items": [[{"name": "item1", "value": "val1"}]],
+             },
+             schema=iceberg_schema,
+         )
+         pq.write_table(table, file_path)
+
+         files = [
+             ParquetFile(
+                 uri=str(file_path),
+                 path=str(file_path),
+                 size=file_path.stat().st_size,
+                 blob_id="test",
+             )
+         ]
+
+         metadata = write_snapshot(
+             files=files,
+             schema=iceberg_schema,
+             current_metadata=None,
+             output_dir=tmp_path,
+             base_uri=f"file://{tmp_path}",
+             include_split_column=False,
+         )
+
+         name_mapping = json.loads(metadata.properties["schema.name-mapping.default"])
+
+         # Check list of strings (tags)
+         tags_mapping = name_mapping[1]
+         assert tags_mapping["names"] == ["tags"]
+         assert "fields" in tags_mapping
+         assert len(tags_mapping["fields"]) == 1
+
+         # Check element mapping for simple list
+         element_mapping = tags_mapping["fields"][0]
+         assert element_mapping["names"] == ["element"]
+
+         # Check list of structs (items)
+         items_mapping = name_mapping[2]
+         assert items_mapping["names"] == ["items"]
+         assert "fields" in items_mapping
+
+         # Check element mapping for list of structs
+         items_element = items_mapping["fields"][0]
+         assert items_element["names"] == ["element"]
+         assert "fields" in items_element
+
+         # Check struct fields within list element
+         assert len(items_element["fields"]) == 2
+         assert items_element["fields"][0]["names"] == ["name"]
+         assert items_element["fields"][1]["names"] == ["value"]
+
+     def test_name_mapping_with_maps(self, tmp_path):
+         """Test that name mapping includes map key and value mappings."""
+         # Create schema with a map
+         iceberg_schema = pa.schema(
+             [
+                 pa.field("id", pa.string()),
+                 pa.field(
+                     "metadata",
+                     pa.map_(
+                         pa.string(),
+                         pa.struct(
+                             [
+                                 pa.field("count", pa.int32()),
+                                 pa.field("name", pa.string()),
+                             ]
+                         ),
+                     ),
+                 ),
+             ]
+         )
+
+         # Create test parquet file
+         data_dir = tmp_path / "data"
+         data_dir.mkdir()
+         file_path = data_dir / "test.parquet"
+         table = pa.table(
+             {
+                 "id": ["1"],
+                 "metadata": [[("key1", {"count": 1, "name": "name1"})]],
+             },
+             schema=iceberg_schema,
+         )
+         pq.write_table(table, file_path)
+
+         files = [
+             ParquetFile(
+                 uri=str(file_path),
+                 path=str(file_path),
+                 size=file_path.stat().st_size,
+                 blob_id="test",
+             )
+         ]
+
+         metadata = write_snapshot(
+             files=files,
+             schema=iceberg_schema,
+             current_metadata=None,
+             output_dir=tmp_path,
+             base_uri=f"file://{tmp_path}",
+             include_split_column=False,
+         )
+
+         name_mapping = json.loads(metadata.properties["schema.name-mapping.default"])
+
+         # Check map field
+         metadata_mapping = name_mapping[1]
+         assert metadata_mapping["names"] == ["metadata"]
+         assert "fields" in metadata_mapping
+         assert len(metadata_mapping["fields"]) == 2
+
+         # Check key mapping
+         key_mapping = metadata_mapping["fields"][0]
+         assert key_mapping["names"] == ["key"]
+
+         # Check value mapping
+         value_mapping = metadata_mapping["fields"][1]
+         assert value_mapping["names"] == ["value"]
+         assert "fields" in value_mapping
+
+         # Check struct fields within map value
+         assert len(value_mapping["fields"]) == 2
+         assert value_mapping["fields"][0]["names"] == ["count"]
+         assert value_mapping["fields"][1]["names"] == ["name"]