faceberg-0.1.0-py3-none-any.whl

@@ -0,0 +1,1347 @@
1
+ """Tests for FacebergCatalog implementation."""
2
+
3
+ import uuid
4
+
5
+ import pyarrow as pa
6
+ import pytest
7
+ from huggingface_hub import HfFileSystem
8
+ from pyiceberg.exceptions import (
9
+ NamespaceAlreadyExistsError,
10
+ NamespaceNotEmptyError,
11
+ NoSuchTableError,
12
+ TableAlreadyExistsError,
13
+ )
14
+ from pyiceberg.io.fsspec import FsspecFileIO
15
+ from pyiceberg.partitioning import PartitionField, PartitionSpec
16
+ from pyiceberg.schema import Schema
17
+ from pyiceberg.transforms import IdentityTransform
18
+ from pyiceberg.types import LongType, NestedField, StringType
19
+
20
+ from faceberg.catalog import HfFileIO, HfLocationProvider, LocalCatalog, RemoteCatalog
21
+ from faceberg.catalog import catalog as catalog_factory
22
+
23
+
24
+ @pytest.fixture
25
+ def test_schema():
26
+ """Create a test schema."""
27
+ return Schema(
28
+ NestedField(1, "id", LongType(), required=True),
29
+ NestedField(2, "name", StringType(), required=False),
30
+ )
31
+
32
+
33
+ @pytest.fixture
34
+ def get_table_location(tmp_path):
35
+ """Generate a location for a table.
36
+
37
+ Returns a callable that generates unique table locations.
38
+ """
39
+ counter = 0
40
+
41
+ def _location(identifier: str | None = None):
42
+ nonlocal counter
43
+ counter += 1
44
+ if identifier:
45
+ # Use identifier as directory name
46
+ name = identifier.replace(".", "_")
47
+ else:
48
+ # Generate unique name
49
+ name = f"table_{counter}"
50
+ location_dir = tmp_path / "tables" / name
51
+ location_dir.mkdir(parents=True, exist_ok=True)
52
+ return f"file://{location_dir.as_posix()}"
53
+
54
+ return _location
55
+
56
+
57
+ # =============================================================================
58
+ # Catalog Creation Tests (Local-specific, not parametrized)
59
+ # =============================================================================
60
+
61
+
62
+ class TestCatalogCreation:
63
+ """Tests for catalog creation and initialization."""
64
+
65
+ def test_create_local_catalog(self, tmp_path):
66
+ """Test LocalCatalog creation."""
67
+ catalog_dir = tmp_path / "test_catalog"
68
+ catalog_dir.mkdir()
69
+ uri = f"file://{catalog_dir.as_posix()}"
70
+ catalog = LocalCatalog(name=str(catalog_dir), uri=uri)
71
+
72
+ # catalog.name matches the name passed at construction (the path string)
73
+ assert catalog.name == str(catalog_dir)
74
+ assert catalog.uri.startswith("file:///")
75
+ assert catalog.uri.endswith(str(catalog_dir.name))
76
+ assert catalog_dir.exists()
77
+
78
+ def test_local_catalog_from_config(self, tmp_path):
79
+ """Test creating LocalCatalog from local config file."""
80
+ catalog_dir = tmp_path / "test_catalog"
81
+ catalog_dir.mkdir()
82
+ uri = f"file://{catalog_dir.as_posix()}"
83
+ catalog = LocalCatalog(name=str(catalog_dir), uri=uri)
84
+
85
+ assert catalog.uri.startswith("file:///")
86
+ assert catalog.uri.endswith(str(catalog_dir.name))
87
+
88
+ def test_catalog_persistence(self, tmp_path, test_schema):
89
+ """Test that catalog persists across instances."""
90
+ catalog_dir = tmp_path / "test_catalog"
91
+ # Create catalog and table
92
+ uri = f"file://{catalog_dir.as_posix()}"
93
+ catalog1 = LocalCatalog(name=str(catalog_dir), uri=uri)
94
+ catalog1.init()
95
+
96
+ catalog1.create_namespace("default")
97
+ table_location_dir = tmp_path / "tables" / "default_test_table"
98
+ table_location_dir.mkdir(parents=True)
99
+ catalog1.create_table(
100
+ "default.test_table", test_schema, location=f"file://{table_location_dir.as_posix()}"
101
+ )
102
+ # Changes are automatically persisted via context manager
103
+
104
+ # Create new catalog instance
105
+ catalog2 = LocalCatalog(name=str(catalog_dir), uri=uri)
106
+
107
+ # Table should still exist
108
+ assert catalog2.table_exists("default.test_table")
109
+ table = catalog2.load_table("default.test_table")
110
+ assert table.schema() == test_schema
111
+
112
+
113
+ # =============================================================================
114
+ # Namespace Operations (Parametrized for local/remote)
115
+ # =============================================================================
116
+
117
+
118
+ class TestNamespaceOperations:
119
+ """Tests for namespace create, read, update, delete operations."""
120
+
121
+ def test_create_namespace(self, catalog):
122
+ """Test namespace creation."""
123
+ catalog.create_namespace("default")
124
+ assert ("default",) in catalog.list_namespaces()
125
+
126
+ def test_list_namespaces_empty(self, catalog):
127
+ """Test listing namespaces when none exist."""
128
+ namespaces = catalog.list_namespaces()
129
+ assert namespaces == []
130
+
131
+ def test_list_namespaces_with_tables(self, catalog, test_schema, get_table_location):
132
+ """Test listing namespaces with hierarchical names."""
133
+ catalog.create_namespace("ns1")
134
+ catalog.create_table("ns1.table1", test_schema, location=get_table_location("ns1.table1"))
135
+
136
+ namespaces = catalog.list_namespaces()
137
+ assert ("ns1",) in namespaces
138
+
139
+ def test_drop_namespace(self, catalog):
140
+ """Test dropping an empty namespace."""
141
+ catalog.create_namespace("test_ns")
142
+ catalog.drop_namespace("test_ns")
143
+
144
+ # Namespace should not appear in list
145
+ assert ("test_ns",) not in catalog.list_namespaces()
146
+
147
+ def test_drop_namespace_not_empty(self, catalog, test_schema, get_table_location):
148
+ """Test that dropping a non-empty namespace raises error."""
149
+ catalog.create_namespace("test_ns")
150
+ catalog.create_table(
151
+ "test_ns.table1", test_schema, location=get_table_location("test_ns.table1")
152
+ )
153
+
154
+ with pytest.raises(NamespaceNotEmptyError):
155
+ catalog.drop_namespace("test_ns")
156
+
157
+ def test_update_namespace_properties(self, catalog):
158
+ """Test updating namespace properties."""
159
+ catalog.create_namespace("test_ns")
160
+ summary = catalog.update_namespace_properties(
161
+ "test_ns", removals={"old_prop"}, updates={"new_prop": "value"}
162
+ )
163
+
164
+ # Currently returns empty summary
165
+ assert summary.removed == []
166
+ assert summary.updated == []
167
+ assert summary.missing == []
168
+
169
+ def test_create_namespace_already_exists(self, catalog, test_schema, get_table_location):
170
+ """Test creating namespace that already exists (has tables)."""
171
+ catalog.create_namespace("test_ns")
172
+ catalog.create_table(
173
+ "test_ns.table1", test_schema, location=get_table_location("test_ns.table1")
174
+ )
175
+
176
+ with pytest.raises(NamespaceAlreadyExistsError):
177
+ catalog.create_namespace("test_ns")
178
+
179
+
180
+ # =============================================================================
181
+ # Table Read Operations (Parametrized for local/remote)
182
+ # =============================================================================
183
+
184
+
185
+ class TestTableRead:
186
+ """Tests for table read operations."""
187
+
188
+ def test_load_table(self, catalog, test_schema, get_table_location):
189
+ """Test loading a table."""
190
+ catalog.create_namespace("default")
191
+ catalog.create_table(
192
+ identifier="default.test_table",
193
+ schema=test_schema,
194
+ location=get_table_location("default.test_table"),
195
+ )
196
+
197
+ table = catalog.load_table("default.test_table")
198
+
199
+ assert table.schema() == test_schema
200
+
201
+ def test_list_tables(self, catalog, test_schema, get_table_location):
202
+ """Test listing tables."""
203
+ catalog.create_namespace("default")
204
+
205
+ # Create multiple tables
206
+ catalog.create_table(
207
+ "default.table1", test_schema, location=get_table_location("default.table1")
208
+ )
209
+ catalog.create_table(
210
+ "default.table2", test_schema, location=get_table_location("default.table2")
211
+ )
212
+
213
+ tables = catalog.list_tables("default")
214
+
215
+ assert len(tables) == 2
216
+ assert ("default", "table1") in tables
217
+ assert ("default", "table2") in tables
218
+
219
+ def test_table_exists(self, catalog, test_schema, get_table_location):
220
+ """Test checking table existence."""
221
+ catalog.create_namespace("default")
222
+
223
+ assert not catalog.table_exists("default.test_table")
224
+
225
+ catalog.create_table(
226
+ "default.test_table", test_schema, location=get_table_location("default.test_table")
227
+ )
228
+
229
+ assert catalog.table_exists("default.test_table")
230
+
231
+ def test_load_table_not_found(self, catalog):
232
+ """Test loading non-existent table raises error."""
233
+ with pytest.raises(NoSuchTableError):
234
+ catalog.load_table("default.nonexistent")
235
+
236
+
237
+ # =============================================================================
238
+ # Table Write Operations (Parametrized for local/remote)
239
+ # =============================================================================
240
+
241
+
242
+ class TestTableWrite:
243
+ """Tests for table create, update, delete operations."""
244
+
245
+ def test_create_table(self, catalog, test_schema, get_table_location):
246
+ """Test table creation."""
247
+ catalog.create_namespace("default")
248
+
249
+ table = catalog.create_table(
250
+ identifier="default.test_table",
251
+ schema=test_schema,
252
+ location=get_table_location("default.test_table"),
253
+ )
254
+
255
+ assert table.metadata is not None
256
+ assert table.schema() == test_schema
257
+
258
+ def test_drop_table(self, catalog, test_schema, get_table_location):
259
+ """Test dropping a table."""
260
+ catalog.create_namespace("default")
261
+ catalog.create_table(
262
+ "default.test_table", test_schema, location=get_table_location("default.test_table")
263
+ )
264
+
265
+ assert catalog.table_exists("default.test_table")
266
+
267
+ catalog.drop_table("default.test_table")
268
+
269
+ assert not catalog.table_exists("default.test_table")
270
+
271
+ def test_rename_table(self, catalog, test_schema, get_table_location):
272
+ """Test renaming a table."""
273
+ catalog.create_namespace("default")
274
+ catalog.create_table(
275
+ "default.old_name", test_schema, location=get_table_location("default.old_name")
276
+ )
277
+
278
+ catalog.rename_table("default.old_name", "default.new_name")
279
+
280
+ assert not catalog.table_exists("default.old_name")
281
+ assert catalog.table_exists("default.new_name")
282
+
283
+ def test_drop_table_not_found(self, catalog):
284
+ """Test dropping non-existent table raises error."""
285
+ with pytest.raises(NoSuchTableError):
286
+ catalog.drop_table("default.nonexistent")
287
+
288
+ def test_rename_table_source_not_found(self, catalog):
289
+ """Test renaming non-existent table raises error."""
290
+ with pytest.raises(NoSuchTableError):
291
+ catalog.rename_table("default.nonexistent", "default.new_name")
292
+
293
+ def test_rename_table_destination_exists(self, catalog, test_schema, get_table_location):
294
+ """Test that renaming to existing table name raises error."""
295
+ catalog.create_namespace("default")
296
+ catalog.create_table(
297
+ "default.table1", test_schema, location=get_table_location("default.table1")
298
+ )
299
+ catalog.create_table(
300
+ "default.table2", test_schema, location=get_table_location("default.table2")
301
+ )
302
+
303
+ with pytest.raises(TableAlreadyExistsError):
304
+ catalog.rename_table("default.table1", "default.table2")
305
+
306
+ def test_create_table_transaction_not_implemented(self, catalog, test_schema):
307
+ """Test that table transactions are not yet implemented."""
308
+ with pytest.raises(NotImplementedError):
309
+ catalog.create_table_transaction("default.test_table", test_schema)
310
+
311
+
312
+ # =============================================================================
313
+ # Table Write Properties (Parametrized for local/remote)
314
+ # =============================================================================
315
+
316
+
317
+ class TestTableWriteProperties:
318
+ """Tests for table write properties and LocationProvider."""
319
+
320
+ def test_create_table_with_write_properties(self, catalog, test_schema, get_table_location):
321
+ """Test creating a table with write LocationProvider configured."""
322
+ catalog.create_namespace("default")
323
+ table = catalog.create_table(
324
+ "default.write_test",
325
+ schema=test_schema,
326
+ location=get_table_location("default.write_test"),
327
+ properties={
328
+ "write.py-location-provider.impl": "faceberg.catalog.HfLocationProvider",
329
+ "hf.write.split": "train",
330
+ },
331
+ )
332
+
333
+ # Verify LocationProvider is configured
334
+ assert (
335
+ table.properties.get("write.py-location-provider.impl")
336
+ == "faceberg.catalog.HfLocationProvider"
337
+ )
338
+
339
+ def test_location_provider_returns_correct_type(self, catalog, test_schema, get_table_location):
340
+ """Test that table.location_provider() returns HfLocationProvider."""
341
+ catalog.create_namespace("default")
342
+ table = catalog.create_table(
343
+ "default.test_table",
344
+ schema=test_schema,
345
+ location=get_table_location("default.test_table"),
346
+ properties={
347
+ "write.py-location-provider.impl": "faceberg.catalog.HfLocationProvider",
348
+ },
349
+ )
350
+
351
+ # Verify LocationProvider is configured
352
+ provider = table.location_provider()
353
+ assert isinstance(provider, HfLocationProvider)
354
+
355
+
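+ # Illustrative sketch (not a collected test): wiring HfLocationProvider into a
+ # table via write properties at create time, as exercised in
+ # TestTableWriteProperties above. The identifier is an example name; the
+ # catalog, schema, and location are placeholders supplied by the caller. How
+ # the provider then names appended files is covered in TestHfLocationProvider
+ # further below.
+ def _sketch_table_with_hf_location_provider(catalog, schema, location):
+     return catalog.create_table(
+         identifier="default.example",
+         schema=schema,
+         location=location,
+         properties={
+             "write.py-location-provider.impl": "faceberg.catalog.HfLocationProvider",
+             "hf.write.split": "train",
+         },
+     )
+
+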
356
+ # =============================================================================
357
+ # Table Append Operations (Parametrized for local/remote)
358
+ # =============================================================================
359
+
360
+
361
+ class TestTableAppend:
362
+ """Tests for PyIceberg append operations on writable tables."""
363
+
364
+ @pytest.fixture
365
+ def writable_catalog(self, catalog, tmp_path):
366
+ """Create catalog with writable table for testing write operations.
367
+
368
+ Creates a catalog with a writable table (not from HuggingFace dataset)
369
+ that can be used to test append and other write operations.
370
+
371
+ Args:
372
+ catalog: Empty catalog instance (local or remote, from parametrized fixture)
373
+ tmp_path: Temporary directory for table data
374
+
375
+ Returns:
376
+ Catalog instance with a writable test_table in the default namespace
377
+ """
378
+ # Create data directory for the table
379
+ data_dir = tmp_path / "data"
380
+ data_dir.mkdir()
381
+ location = f"file://{data_dir.as_posix()}"
382
+
383
+ # Create the table with schema matching imdb dataset
384
+ schema = Schema(
385
+ NestedField(field_id=1, name="split", field_type=StringType(), required=False),
386
+ NestedField(field_id=2, name="text", field_type=StringType(), required=False),
387
+ NestedField(field_id=3, name="label", field_type=LongType(), required=False),
388
+ )
389
+
390
+ partition_spec = PartitionSpec(
391
+ PartitionField(source_id=1, field_id=1000, transform=IdentityTransform(), name="split")
392
+ )
393
+
394
+ # Create the table with mandatory location argument
395
+ catalog.create_table(
396
+ identifier="default.test_table",
397
+ schema=schema,
398
+ location=location,
399
+ partition_spec=partition_spec,
400
+ )
401
+
402
+ return catalog
403
+
404
+ def test_append_data(self, writable_catalog):
405
+ """Verifies data is appended, count increases, and data is scannable."""
406
+ table = writable_catalog.load_table("default.test_table")
407
+
408
+ # Record count before append
409
+ before_count = table.scan().to_arrow().num_rows
410
+
411
+ # Create test data with unique text for verification
412
+ unique_text = f"Unique test review {uuid.uuid4()}"
413
+ test_data = pa.Table.from_pydict(
414
+ {
415
+ "split": ["test", "test"],
416
+ "text": [unique_text, "Test review 2"],
417
+ "label": [1, 0],
418
+ }
419
+ )
420
+
421
+ # Append data
422
+ table.append(test_data)
423
+
424
+ # Verify count increased by expected amount
425
+ after_count = table.scan().to_arrow().num_rows
426
+ assert after_count == before_count + len(test_data)
427
+
428
+ # Verify appended data is readable via scan
429
+ scan = table.scan().filter(f"text = '{unique_text}'")
430
+ result = scan.to_arrow()
431
+ assert result.num_rows == 1
432
+ assert result["text"][0].as_py() == unique_text
433
+
434
+ def test_append_data_snapshot_history(self, writable_catalog):
435
+ """Test snapshot history is updated after append."""
436
+ table = writable_catalog.load_table("default.test_table")
437
+
438
+ # Record snapshot count before append
439
+ snapshots_before = list(table.snapshots())
440
+ snapshot_count_before = len(snapshots_before)
441
+
442
+ # Create and append test data
443
+ test_data = pa.Table.from_pydict(
444
+ {
445
+ "split": ["test"],
446
+ "text": ["Snapshot test review"],
447
+ "label": [1],
448
+ }
449
+ )
450
+ table.append(test_data)
451
+
452
+ # Reload table to get updated snapshots
453
+ table = writable_catalog.load_table("default.test_table")
454
+ snapshots_after = list(table.snapshots())
455
+
456
+ # Verify new snapshot was created
457
+ assert len(snapshots_after) == snapshot_count_before + 1
458
+
459
+ # Verify latest snapshot has append operation
460
+ latest_snapshot = snapshots_after[-1]
461
+ assert latest_snapshot.summary is not None
462
+ # Summary.operation is an enum, not a string
463
+ from pyiceberg.table.snapshots import Operation
464
+
465
+ assert latest_snapshot.summary.operation == Operation.APPEND
466
+
467
+ def test_append_data_partition_integrity(self, writable_catalog):
468
+ """Test partition integrity is maintained after append."""
469
+ table = writable_catalog.load_table("default.test_table")
470
+
471
+ # Record partition spec before append
472
+ spec_before = table.spec()
473
+
474
+ # Create test data for specific partition
475
+ test_data = pa.Table.from_pydict(
476
+ {
477
+ "split": ["test", "test"],
478
+ "text": ["Partition test review 1", "Partition test review 2"],
479
+ "label": [1, 0],
480
+ }
481
+ )
482
+ table.append(test_data)
483
+
484
+ # Reload table and verify partition spec unchanged
485
+ table = writable_catalog.load_table("default.test_table")
486
+ spec_after = table.spec()
487
+ assert len(spec_before.fields) == len(spec_after.fields)
488
+
489
+ # Verify partition filtering still works
490
+ scan = table.scan().filter("split = 'test'")
491
+ result = scan.to_arrow()
492
+
493
+ # All rows should have split == 'test'
494
+ split_values = result["split"].unique().to_pylist()
495
+ assert split_values == ["test"]
496
+ assert result.num_rows > 0
497
+
498
+
499
+ # =============================================================================
500
+ # Dataset Operations (Parametrized for local/remote)
501
+ # =============================================================================
502
+
503
+
504
+ class TestDatasetOperations:
505
+ """Tests for HuggingFace dataset integration."""
506
+
507
+ def test_namespace_exists_after_add_dataset(self, session_mbpp):
508
+ """Test that namespaces exist after datasets are added."""
509
+ # Namespace should exist after add_dataset
510
+ assert ("google-research-datasets",) in session_mbpp.list_namespaces()
511
+
512
+ # Verify table exists
513
+ tables = session_mbpp.list_tables("google-research-datasets")
514
+ assert len(tables) > 0
515
+
516
+ def test_add_dataset_already_exists(self, catalog):
517
+ """Test adding a dataset that already exists raises error."""
518
+ # Create table first time
519
+ catalog.add_dataset("default.imdb_plain_text", "stanfordnlp/imdb", config="plain_text")
520
+
521
+ # Try to create again - should raise
522
+ with pytest.raises(TableAlreadyExistsError):
523
+ catalog.add_dataset(
524
+ "default.imdb_plain_text",
525
+ "stanfordnlp/imdb",
526
+ config="plain_text",
527
+ )
528
+
529
+ def test_add_dataset_with_config(self, catalog):
530
+ """Test adding a dataset with a specific config."""
531
+ # Create table
532
+ table = catalog.add_dataset(
533
+ "default.imdb_plain_text",
534
+ "stanfordnlp/imdb",
535
+ config="plain_text",
536
+ )
537
+
538
+ # Verify table
539
+ assert table is not None
540
+ assert table.schema() is not None
541
+ assert len(table.schema().fields) > 0
542
+
543
+ # Verify table properties
544
+ props = table.properties
545
+ assert "hf.dataset.repo" in props
546
+ assert props["hf.dataset.repo"] == "stanfordnlp/imdb"
547
+ assert "hf.dataset.config" in props
548
+ assert props["hf.dataset.config"] == "plain_text"
549
+
550
+
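+ # Illustrative sketch (not a collected test): the add_dataset flow exercised in
+ # TestDatasetOperations above, assuming a catalog instance and access to the
+ # public stanfordnlp/imdb dataset. The identifier "default.imdb" is an example
+ # name, not one of the fixtures in this module.
+ def _sketch_add_dataset_and_scan(catalog):
+     catalog.create_namespace("default")
+     table = catalog.add_dataset("default.imdb", "stanfordnlp/imdb", config="plain_text")
+     # The table properties record the source dataset, as asserted above
+     assert table.properties["hf.dataset.repo"] == "stanfordnlp/imdb"
+     # Partition pruning on the split column, as in the scanning tests below
+     return table.scan().filter("split = 'train'").to_arrow()
+
+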
551
+ # =============================================================================
552
+ # Table Scanning and Metadata Operations
553
+ # =============================================================================
554
+
555
+
556
+ class TestTableScanning:
557
+ """Tests for PyIceberg table scanning operations."""
558
+
559
+ def test_scan_basic(self, session_mbpp):
560
+ """Test creating a basic scan object."""
561
+ catalog = session_mbpp
562
+ table = catalog.load_table("google-research-datasets.mbpp")
563
+ scan = table.scan()
564
+
565
+ # Verify scan object is created
566
+ assert scan is not None
567
+
568
+ # Verify scan has expected methods
569
+ assert hasattr(scan, "to_arrow")
570
+ assert hasattr(scan, "to_pandas")
571
+ assert hasattr(scan, "to_arrow_batch_reader")
572
+
573
+ def test_scan_to_arrow(self, session_mbpp):
574
+ """Test scanning table to Arrow table."""
575
+
576
+ catalog = session_mbpp
577
+ table = catalog.load_table("google-research-datasets.mbpp")
578
+ scan = table.scan()
579
+
580
+ # Convert to Arrow table
581
+ arrow_table = scan.to_arrow()
582
+
583
+ # Verify it's an Arrow table
584
+ assert isinstance(arrow_table, pa.Table)
585
+
586
+ # Verify we have rows
587
+ assert arrow_table.num_rows > 0
588
+
589
+ # Verify expected columns are present
590
+ column_names = arrow_table.schema.names
591
+ assert "split" in column_names
592
+ # Verify dataset has at least 2 other columns besides split
593
+ assert len(column_names) >= 3
594
+
595
+ # Verify split column contains expected values
596
+ split_values = arrow_table["split"].unique().to_pylist()
597
+ assert any(split in split_values for split in ["train", "test", "validation", "prompt"])
598
+
599
+ def test_scan_to_pandas(self, session_mbpp):
600
+ """Test scanning table to Pandas DataFrame."""
601
+
602
+ catalog = session_mbpp
603
+ table = catalog.load_table("google-research-datasets.mbpp")
604
+ scan = table.scan()
605
+
606
+ # Convert to Pandas DataFrame
607
+ df = scan.to_pandas()
608
+
609
+ # Verify DataFrame shape
610
+ assert len(df) > 0
611
+ assert len(df.columns) > 0
612
+
613
+ # Verify split column exists
614
+ assert "split" in df.columns
615
+
616
+ # Verify we have multiple columns
617
+ assert len(df.columns) >= 3
618
+
619
+ def test_scan_with_selected_fields(self, session_mbpp):
620
+ """Test scanning with column projection."""
621
+ catalog = session_mbpp
622
+ table = catalog.load_table("google-research-datasets.mbpp")
623
+
624
+ # Get schema to know which columns exist
625
+ schema = table.schema()
626
+ # Select first two non-split columns
627
+ cols_to_select = [f.name for f in schema.fields if f.name != "split"][:2]
628
+
629
+ # Scan with only specific columns selected
630
+ scan = table.scan().select(*cols_to_select)
631
+ arrow_table = scan.to_arrow()
632
+
633
+ # Verify only selected columns are present
634
+ column_names = arrow_table.schema.names
635
+ assert len(column_names) == len(cols_to_select)
636
+ assert "split" not in column_names
637
+ for col in cols_to_select:
638
+ assert col in column_names
639
+
640
+ def test_scan_limit(self, session_mbpp):
641
+ """Test scanning with row limit."""
642
+ catalog = session_mbpp
643
+ table = catalog.load_table("google-research-datasets.mbpp")
644
+
645
+ # PyIceberg doesn't support limit() directly on scan, need to materialize first
646
+ scan = table.scan()
647
+ arrow_table = scan.to_arrow()
648
+
649
+ # Take first 10 rows
650
+ limited_table = arrow_table.slice(0, 10)
651
+
652
+ # Verify exactly 10 rows
653
+ assert limited_table.num_rows == 10
654
+
655
+ def test_partition_filter_single_split(self, session_mbpp):
656
+ """Test partition pruning with single split filter."""
657
+ catalog = session_mbpp
658
+ table = catalog.load_table("google-research-datasets.mbpp")
659
+
660
+ # Scan with split filter
661
+ scan = table.scan().filter("split = 'train'")
662
+ arrow_table = scan.to_arrow()
663
+
664
+ # Verify all rows have split == "train"
665
+ split_values = arrow_table["split"].unique().to_pylist()
666
+ assert split_values == ["train"]
667
+
668
+ # Verify we got some rows (not empty result)
669
+ assert arrow_table.num_rows > 0
670
+
671
+ def test_partition_filter_multiple_splits(self, session_mbpp):
672
+ """Test partition pruning with multiple split filter."""
673
+ catalog = session_mbpp
674
+ table = catalog.load_table("google-research-datasets.mbpp")
675
+
676
+ # Scan with IN filter for multiple splits
677
+ scan = table.scan().filter("split IN ('train', 'test')")
678
+ df = scan.to_pandas()
679
+
680
+ # Verify only train and test splits are present
681
+ unique_splits = df["split"].unique()
682
+ assert set(unique_splits).issubset({"train", "test"})
683
+
684
+ # Verify other splits are excluded (validation, prompt)
685
+ assert "validation" not in unique_splits
686
+ assert "prompt" not in unique_splits
687
+
688
+ # Verify we got some rows
689
+ assert len(df) > 0
690
+
691
+ def test_scan_all_partitions(self, session_mbpp):
692
+ """Test scanning all partitions without filter."""
693
+ catalog = session_mbpp
694
+ table = catalog.load_table("google-research-datasets.mbpp")
695
+
696
+ # Scan without filter
697
+ scan = table.scan()
698
+ arrow_table = scan.to_arrow()
699
+
700
+ # Group by split to get all partitions
701
+ split_values = set(arrow_table["split"].to_pylist())
702
+
703
+ # Verify we have multiple splits
704
+ assert len(split_values) > 1
705
+
706
+ # Verify expected splits are present (mbpp has train/test/validation/prompt)
707
+ assert "train" in split_values or "test" in split_values
708
+
709
+ def test_scan_empty_result(self, session_mbpp):
710
+ """Test scanning with filter that returns no rows."""
711
+ catalog = session_mbpp
712
+ table = catalog.load_table("google-research-datasets.mbpp")
713
+
714
+ # Scan with impossible filter
715
+ scan = table.scan().filter("split = 'nonexistent_split'")
716
+ arrow_table = scan.to_arrow()
717
+
718
+ # Verify 0 rows returned
719
+ assert arrow_table.num_rows == 0
720
+
721
+ # Verify schema is still correct (has split and other columns)
722
+ assert "split" in arrow_table.schema.names
723
+ assert len(arrow_table.schema.names) >= 3
724
+
725
+ def test_multiple_scans_same_table(self, session_mbpp):
726
+ """Test multiple independent scans from the same table."""
727
+ catalog = session_mbpp
728
+ table = catalog.load_table("google-research-datasets.mbpp")
729
+
730
+ # Create two independent scans
731
+ scan1 = table.scan().filter("split = 'train'")
732
+ scan2 = table.scan().filter("split = 'test'")
733
+
734
+ # Materialize both scans
735
+ df1 = scan1.to_pandas().head(5)
736
+ df2 = scan2.to_pandas().head(3)
737
+
738
+ # Verify they don't interfere with each other
739
+ assert len(df1) == 5
740
+ assert all(df1["split"] == "train")
741
+
742
+ assert len(df2) == 3
743
+ assert all(df2["split"] == "test")
744
+
745
+
746
+ class TestTableMetadata:
747
+ """Tests for PyIceberg metadata reading operations."""
748
+
749
+ def test_read_schema(self, session_mbpp):
750
+ """Test reading table schema."""
751
+ catalog = session_mbpp
752
+ table = catalog.load_table("google-research-datasets.mbpp")
753
+ schema = table.schema()
754
+
755
+ # Verify schema has expected fields
756
+ field_names = [field.name for field in schema.fields]
757
+ assert "split" in field_names
758
+ # Verify we have multiple fields (at least 3)
759
+ assert len(field_names) >= 3
760
+
761
+ # Verify field IDs are assigned (all > 0)
762
+ for field in schema.fields:
763
+ assert field.field_id > 0
764
+
765
+ # Verify split column is first field
766
+ assert schema.fields[0].name == "split"
767
+
768
+ def test_read_partition_spec(self, session_mbpp):
769
+ """Test reading partition specification."""
770
+
771
+ catalog = session_mbpp
772
+ table = catalog.load_table("google-research-datasets.mbpp")
773
+ spec = table.spec()
774
+
775
+ # Verify partition spec has at least one field
776
+ assert len(spec.fields) >= 1
777
+
778
+ # Find the split partition field
779
+ split_partition = None
780
+ for field in spec.fields:
781
+ if field.name == "split":
782
+ split_partition = field
783
+ break
784
+
785
+ # Verify split partition exists with identity transform
786
+ assert split_partition is not None
787
+ assert isinstance(split_partition.transform, IdentityTransform)
788
+
789
+ def test_read_properties(self, session_mbpp):
790
+ """Test reading table properties."""
791
+ catalog = session_mbpp
792
+ table = catalog.load_table("google-research-datasets.mbpp")
793
+ properties = table.properties
794
+
795
+ # Verify HuggingFace properties exist
796
+ assert "hf.dataset.repo" in properties
797
+ assert properties["hf.dataset.repo"] == "google-research-datasets/mbpp"
798
+
799
+ assert "hf.dataset.config" in properties
800
+ assert properties["hf.dataset.config"] == "sanitized"
801
+
802
+ assert "hf.dataset.revision" in properties
803
+
804
+ # Verify schema name mapping is present
805
+ assert "schema.name-mapping.default" in properties
806
+
807
+ def test_read_snapshots(self, session_mbpp):
808
+ """Test reading table snapshots."""
809
+ catalog = session_mbpp
810
+ table = catalog.load_table("google-research-datasets.mbpp")
811
+ snapshots = list(table.snapshots())
812
+
813
+ # Verify at least one snapshot exists
814
+ assert len(snapshots) > 0
815
+
816
+ # Verify snapshot has expected attributes
817
+ snapshot = snapshots[0]
818
+ assert hasattr(snapshot, "snapshot_id")
819
+ assert hasattr(snapshot, "manifest_list")
820
+ assert snapshot.snapshot_id > 0
821
+
822
+ def test_current_snapshot(self, session_mbpp):
823
+ """Test reading current snapshot."""
824
+ catalog = session_mbpp
825
+ table = catalog.load_table("google-research-datasets.mbpp")
826
+ snapshot = table.current_snapshot()
827
+
828
+ # Verify current snapshot exists
829
+ assert snapshot is not None
830
+
831
+ # Verify snapshot has summary
832
+ assert snapshot.summary is not None
833
+
834
+ # Verify snapshot ID exists
835
+ assert snapshot.snapshot_id > 0
836
+
837
+
838
+ # =============================================================================
839
+ # REST Catalog Integration Tests
840
+ # =============================================================================
841
+
842
+
843
+ class TestRestCatalogOperations:
844
+ """Tests for PyIceberg REST catalog basic operations."""
845
+
846
+ def test_rest_list_namespaces(self, session_rest_catalog):
847
+ """Test listing namespaces via REST catalog."""
848
+ namespaces = session_rest_catalog.list_namespaces()
849
+
850
+ # Verify we got namespaces
851
+ assert len(namespaces) > 0
852
+
853
+ # Verify google-research-datasets namespace exists
854
+ namespace_strs = [".".join(ns) if isinstance(ns, tuple) else ns for ns in namespaces]
855
+ assert "google-research-datasets" in namespace_strs
856
+
857
+ def test_rest_list_tables(self, session_rest_catalog):
858
+ """Test listing tables via REST catalog."""
859
+ tables = session_rest_catalog.list_tables("google-research-datasets")
860
+
861
+ # Verify we got tables
862
+ assert len(tables) > 0
863
+
864
+ # Verify mbpp table exists
865
+ table_names = [t[1] if isinstance(t, tuple) and len(t) > 1 else str(t) for t in tables]
866
+ assert "mbpp" in table_names
867
+
868
+ def test_rest_load_table(self, session_rest_catalog):
869
+ """Test loading a table via REST catalog."""
870
+ table = session_rest_catalog.load_table("google-research-datasets.mbpp")
871
+
872
+ # Verify table loaded successfully
873
+ assert table is not None
874
+
875
+ # Verify table has schema
876
+ schema = table.schema()
877
+ assert schema is not None
878
+ assert len(schema.fields) > 0
879
+
880
+
881
+ class TestRestCatalogScanning:
882
+ """Tests for PyIceberg REST catalog scanning operations."""
883
+
884
+ def test_rest_scan_to_arrow(self, session_rest_catalog):
885
+ """Test scanning table to Arrow via REST catalog."""
886
+
887
+ table = session_rest_catalog.load_table("google-research-datasets.mbpp")
888
+ scan = table.scan()
889
+
890
+ # Convert to Arrow table
891
+ arrow_table = scan.to_arrow()
892
+
893
+ # Verify it's an Arrow table
894
+ assert isinstance(arrow_table, pa.Table)
895
+
896
+ # Verify we have rows
897
+ assert arrow_table.num_rows > 0
898
+
899
+ # Verify expected columns (split + at least 2 other columns)
900
+ column_names = arrow_table.schema.names
901
+ assert "split" in column_names
902
+ assert len(column_names) >= 3
903
+
904
+ def test_rest_scan_to_pandas(self, session_rest_catalog):
905
+ """Test scanning table to Pandas via REST catalog."""
906
+ table = session_rest_catalog.load_table("google-research-datasets.mbpp")
907
+ scan = table.scan()
908
+
909
+ # Convert to Pandas DataFrame
910
+ df = scan.to_pandas()
911
+
912
+ # Verify DataFrame shape
913
+ assert len(df) > 0
914
+ assert len(df.columns) > 0
915
+
916
+ # Verify split column exists
917
+ assert "split" in df.columns
918
+
919
+ def test_rest_partition_filter(self, session_rest_catalog):
920
+ """Test partition filtering via REST catalog."""
921
+ table = session_rest_catalog.load_table("google-research-datasets.mbpp")
922
+
923
+ # Scan with split filter
924
+ scan = table.scan().filter("split = 'train'")
925
+ arrow_table = scan.to_arrow()
926
+
927
+ # Verify all rows have split == "train"
928
+ split_values = arrow_table["split"].unique().to_pylist()
929
+ assert split_values == ["train"]
930
+
931
+ # Verify we got some rows
932
+ assert arrow_table.num_rows > 0
933
+
934
+ def test_rest_column_projection(self, session_rest_catalog):
935
+ """Test column projection via REST catalog."""
936
+ table = session_rest_catalog.load_table("google-research-datasets.mbpp")
937
+
938
+ # Get schema to know which columns exist
939
+ schema = table.schema()
940
+ # Select first two non-split columns
941
+ cols_to_select = [f.name for f in schema.fields if f.name != "split"][:2]
942
+
943
+ # Scan with only specific columns selected
944
+ scan = table.scan().select(*cols_to_select)
945
+ arrow_table = scan.to_arrow()
946
+
947
+ # Verify only selected columns are present
948
+ column_names = arrow_table.schema.names
949
+ assert len(column_names) == len(cols_to_select)
950
+ assert "split" not in column_names
951
+ for col in cols_to_select:
952
+ assert col in column_names
953
+
954
+
955
+ class TestRestCatalogMetadata:
956
+ """Tests for PyIceberg REST catalog metadata operations."""
957
+
958
+ def test_rest_read_schema(self, session_rest_catalog):
959
+ """Test reading table schema via REST catalog."""
960
+ table = session_rest_catalog.load_table("google-research-datasets.mbpp")
961
+ schema = table.schema()
962
+
963
+ # Verify schema has expected fields
964
+ field_names = [field.name for field in schema.fields]
965
+ assert "split" in field_names
966
+ # Verify we have multiple fields (at least 3)
967
+ assert len(field_names) >= 3
968
+
969
+ def test_rest_read_properties(self, session_rest_catalog):
970
+ """Test reading table properties via REST catalog."""
971
+ table = session_rest_catalog.load_table("google-research-datasets.mbpp")
972
+ properties = table.properties
973
+
974
+ # Verify HuggingFace properties exist
975
+ assert "hf.dataset.repo" in properties
976
+ assert properties["hf.dataset.repo"] == "google-research-datasets/mbpp"
977
+
978
+ def test_rest_read_snapshots(self, session_rest_catalog):
979
+ """Test reading table snapshots via REST catalog."""
980
+ table = session_rest_catalog.load_table("google-research-datasets.mbpp")
981
+ snapshots = list(table.snapshots())
982
+
983
+ # Verify at least one snapshot exists
984
+ assert len(snapshots) > 0
985
+
986
+ # Verify snapshot has expected attributes
987
+ snapshot = snapshots[0]
988
+ assert hasattr(snapshot, "snapshot_id")
989
+ assert snapshot.snapshot_id > 0
990
+
991
+
992
+ # =============================================================================
+ # Unsupported Operations (Parametrized for local/remote)
+ # =============================================================================
+
+
+ class TestUnsupportedOperations:
993
+ """Tests for operations that are not yet supported."""
994
+
995
+ def test_view_operations_not_supported(self, catalog):
996
+ """Test that view operations are not supported."""
997
+ # view_exists should raise NotImplementedError
998
+ with pytest.raises(NotImplementedError):
999
+ catalog.view_exists("default.test_view")
1000
+
1001
+ # list_views should raise NotImplementedError
1002
+ with pytest.raises(NotImplementedError):
1003
+ catalog.list_views("default")
1004
+
1005
+ # drop_view should raise NotImplementedError
1006
+ with pytest.raises(NotImplementedError):
1007
+ catalog.drop_view("default.test_view")
1008
+
1009
+
1010
+ # =============================================================================
1011
+ # HfFileIO Tests
1012
+ # =============================================================================
1013
+
1014
+
1015
+ class TestHfFileIO:
1016
+ """Tests for HfFileIO custom FileIO implementation."""
1017
+
1018
+ def test_hffileio_initialization(self):
1019
+ """Test that HfFileIO can be initialized with properties."""
1020
+ io = HfFileIO(
1021
+ properties={
1022
+ "hf.endpoint": "https://huggingface.co",
1023
+ "hf.token": "test_token",
1024
+ }
1025
+ )
1026
+
1027
+ assert io is not None
1028
+ assert io.properties["hf.endpoint"] == "https://huggingface.co"
1029
+ assert io.properties["hf.token"] == "test_token"
1030
+
1031
+ def test_hffileio_creates_hf_filesystem(self):
1032
+ """Test that HfFileIO creates HfFileSystem for hf:// scheme."""
1033
+ io = HfFileIO(properties={"hf.endpoint": "https://huggingface.co"})
1034
+ fs = io.get_fs("hf")
1035
+
1036
+ assert isinstance(fs, HfFileSystem)
1037
+
1038
+ def test_hffileio_uses_skip_instance_cache(self):
1039
+ """Test that HfFileIO creates multiple distinct HfFileSystem instances.
1040
+
1041
+ When skip_instance_cache=True, each call to get_fs('hf') should create
1042
+ a new HfFileSystem instance (after cache eviction). This test verifies
1043
+ that our custom factory uses skip_instance_cache correctly.
1044
+ """
1045
+ io = HfFileIO(properties={"hf.endpoint": "https://huggingface.co"})
1046
+
1047
+ # First call creates and caches filesystem
1048
+ fs1 = io.get_fs("hf")
1049
+
1050
+ # Verify we got a HfFileSystem instance
1051
+ assert isinstance(fs1, HfFileSystem)
1052
+
1053
+ # Just verify that calling get_fs again works
1054
+ # (Testing internal cache behavior is fragile across pyiceberg versions)
1055
+ fs2 = io.get_fs("hf")
1056
+ assert isinstance(fs2, HfFileSystem)
1057
+
1058
+ def test_hffileio_extends_fsspec_fileio(self):
1059
+ """Test that HfFileIO properly extends FsspecFileIO."""
1060
+ io = HfFileIO(properties={})
1061
+
1062
+ assert isinstance(io, FsspecFileIO)
1063
+ # Should have all standard FileIO methods
1064
+ assert hasattr(io, "new_input")
1065
+ assert hasattr(io, "new_output")
1066
+ assert hasattr(io, "delete")
1067
+ assert hasattr(io, "get_fs")
1068
+
1069
+
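+ # Illustrative sketch (not a collected test): constructing HfFileIO directly and
+ # resolving the hf:// filesystem, as exercised in TestHfFileIO above. The
+ # endpoint and token values are placeholders; per the tests, get_fs("hf") is
+ # expected to return a huggingface_hub.HfFileSystem.
+ def _sketch_hf_file_io():
+     io = HfFileIO(
+         properties={"hf.endpoint": "https://huggingface.co", "hf.token": "hf_example_token"}
+     )
+     fs = io.get_fs("hf")
+     assert isinstance(fs, HfFileSystem)
+     return fs
+
+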
1070
+ # =============================================================================
1071
+ # catalog() Factory Function Tests
1072
+ # =============================================================================
1073
+
1074
+
1075
+ class TestCatalogFactory:
1076
+ """Tests for the catalog() factory function."""
1077
+
1078
+ def test_catalog_local_directory_path(self, tmp_path):
1079
+ """Test creating LocalCatalog from directory path."""
1080
+ catalog_dir = tmp_path / "test_catalog"
1081
+ catalog_dir.mkdir()
1082
+
1083
+ cat = catalog_factory(str(catalog_dir))
1084
+
1085
+ assert isinstance(cat, LocalCatalog)
1086
+ assert cat.uri.startswith("file:///")
1087
+
1088
+ def test_catalog_local_file_uri(self, tmp_path):
1089
+ """Test creating LocalCatalog from file:// URI."""
1090
+ catalog_dir = tmp_path / "test_catalog"
1091
+ catalog_dir.mkdir()
1092
+ uri = f"file://{catalog_dir.as_posix()}"
1093
+
1094
+ cat = catalog_factory(uri)
1095
+
1096
+ assert isinstance(cat, LocalCatalog)
1097
+ assert cat.uri.startswith("file:///")
1098
+
1099
+ def test_catalog_remote_datasets_explicit(self):
1100
+ """Test creating RemoteCatalog with explicit hf://datasets/ URI."""
1101
+ cat = catalog_factory("hf://datasets/my-org/my-repo", hf_token="test_token")
1102
+
1103
+ assert isinstance(cat, RemoteCatalog)
1104
+ assert cat.uri == "hf://datasets/my-org/my-repo"
1105
+
1106
+ def test_catalog_remote_spaces_explicit(self):
1107
+ """Test creating RemoteCatalog with explicit hf://spaces/ URI."""
1108
+ cat = catalog_factory("hf://spaces/my-org/my-space", hf_token="test_token")
1109
+
1110
+ assert isinstance(cat, RemoteCatalog)
1111
+ assert cat.uri == "hf://spaces/my-org/my-space"
1112
+
1113
+ def test_catalog_remote_models_explicit(self):
1114
+ """Test creating RemoteCatalog with explicit hf://models/ URI."""
1115
+
1116
+ with pytest.raises(ValueError, match="Unsupported"):
1117
+ catalog_factory("hf://models/my-org/my-model", hf_token="test_token")
1118
+
1119
+ def test_catalog_remote_shorthand_defaults_to_spaces(self):
1120
+ """Test creating RemoteCatalog with shorthand org/repo format defaults to spaces."""
1121
+ cat = catalog_factory("my-org/my-repo", hf_token="test_token")
1122
+
1123
+ assert isinstance(cat, RemoteCatalog)
1124
+ assert cat.uri == "hf://spaces/my-org/my-repo"
1125
+ assert cat.name == "my-org/my-repo"
1126
+
1127
+ def test_catalog_remote_with_properties(self):
1128
+ """Test creating RemoteCatalog with additional properties."""
1129
+ cat = catalog_factory(
1130
+ "hf://spaces/my-org/my-space",
1131
+ hf_token="test_token",
1132
+ custom_prop="custom_value",
1133
+ )
1134
+
1135
+ assert isinstance(cat, RemoteCatalog)
1136
+ assert cat.properties["custom_prop"] == "custom_value"
1137
+
1138
+ def test_catalog_local_with_hf_token(self, tmp_path):
1139
+ """Test creating LocalCatalog with hf_token (for accessing datasets)."""
1140
+ catalog_dir = tmp_path / "test_catalog"
1141
+ catalog_dir.mkdir()
1142
+
1143
+ cat = catalog_factory(str(catalog_dir), hf_token="test_token")
1144
+
1145
+ assert isinstance(cat, LocalCatalog)
1146
+
1147
+ def test_catalog_name_extraction_from_hf_uri(self):
1148
+ """Test that catalog name is correctly extracted from hf:// URI."""
1149
+ # Datasets
1150
+ cat1 = catalog_factory("hf://datasets/org/repo")
1151
+ assert cat1.name == "org/repo"
1152
+
1153
+ # Spaces
1154
+ cat2 = catalog_factory("hf://spaces/org/space")
1155
+ assert cat2.name == "org/space"
1156
+
1157
+ def test_catalog_warehouse_property_set_correctly(self, tmp_path):
1158
+ """Test that warehouse property is set correctly for different catalog types."""
1159
+ # Local catalog
1160
+ catalog_dir = tmp_path / "test_catalog"
1161
+ catalog_dir.mkdir()
1162
+ local_cat = catalog_factory(str(catalog_dir))
1163
+ assert local_cat.properties["warehouse"] == str(catalog_dir)
1164
+
1165
+ # Remote catalog
1166
+ remote_cat = catalog_factory("hf://datasets/org/repo")
1167
+ assert remote_cat.properties["warehouse"] == "hf://datasets/org/repo"
1168
+
1169
+ def test_local_catalog_requires_file_uri(self, tmp_path):
1170
+ """Test that LocalCatalog requires file:// URI."""
1171
+ catalog_dir = tmp_path / "test_catalog"
1172
+ catalog_dir.mkdir()
1173
+
1174
+ # Should raise ValueError when given a plain path
1175
+ with pytest.raises(ValueError, match="LocalCatalog requires file:// URI"):
1176
+ LocalCatalog(name="test", uri=str(catalog_dir))
1177
+
1178
+ # Should work with file:// URI
1179
+ uri = f"file://{catalog_dir.as_posix()}"
1180
+ cat = LocalCatalog(name="test", uri=uri)
1181
+ assert isinstance(cat, LocalCatalog)
1182
+
1183
+ def test_remote_catalog_requires_hf_uri(self):
1184
+ """Test that RemoteCatalog requires hf:// URI."""
1185
+ # Should raise ValueError when given an invalid URI
1186
+ with pytest.raises(ValueError, match="RemoteCatalog requires hf:// URI"):
1187
+ RemoteCatalog(name="test", uri="file:///path/to/catalog")
1188
+
1189
+ with pytest.raises(ValueError, match="RemoteCatalog requires hf:// URI"):
1190
+ RemoteCatalog(name="test", uri="org/repo")
1191
+
1192
+ # Should work with hf:// URI
1193
+ cat = RemoteCatalog(name="test", uri="hf://datasets/org/repo")
1194
+ assert isinstance(cat, RemoteCatalog)
1195
+
1196
+ def test_catalog_factory_handles_path_conversion(self, tmp_path):
1197
+ """Test that catalog() factory converts paths to file:// URIs."""
1198
+ catalog_dir = tmp_path / "test_catalog"
1199
+ catalog_dir.mkdir()
1200
+
1201
+ # Factory should accept plain path and convert to file:// URI
1202
+ cat = catalog_factory(str(catalog_dir))
1203
+ assert isinstance(cat, LocalCatalog)
1204
+ assert cat.uri.startswith("file:///")
1205
+
1206
+
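+ # Illustrative sketch (not a collected test): the main entry points of the
+ # catalog() factory covered in TestCatalogFactory above. The directory path,
+ # repo name, and token are placeholders.
+ def _sketch_catalog_factory(catalog_dir: str):
+     # A plain directory path (or file:// URI) yields a LocalCatalog
+     local_cat = catalog_factory(catalog_dir)
+     # An hf://datasets/... or hf://spaces/... URI yields a RemoteCatalog;
+     # a bare "org/repo" shorthand defaults to hf://spaces/org/repo
+     remote_cat = catalog_factory("hf://datasets/my-org/my-repo", hf_token="hf_example_token")
+     return local_cat, remote_cat
+
+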
1207
+ # =============================================================================
1208
+ # HfLocationProvider Tests
1209
+ # =============================================================================
1210
+
1211
+
1212
+ class TestHfLocationProvider:
1213
+ """Tests for HfLocationProvider."""
1214
+
1215
+ def test_default_pattern(self):
1216
+ """Test default file naming pattern with UUIDv7."""
1217
+ provider = HfLocationProvider(
1218
+ table_location="hf://datasets/test-org/test-dataset",
1219
+ table_properties={},
1220
+ )
1221
+
1222
+ # First file
1223
+ path1 = provider.new_data_location("ignored.parquet")
1224
+ assert path1.endswith("-iceberg.parquet")
1225
+ assert "train-" in path1
1226
+ # UUIDv7 is 36 characters with hyphens
1227
+ filename1 = path1.split("/")[-1]
1228
+ uuid_part1 = filename1.replace("train-", "").replace("-iceberg.parquet", "")
1229
+ assert len(uuid_part1) == 36
1230
+
1231
+ # Second file - should have different UUID
1232
+ path2 = provider.new_data_location("ignored.parquet")
1233
+ assert path2.endswith("-iceberg.parquet")
1234
+ assert "train-" in path2
1235
+ assert path1 != path2 # Different UUIDs
1236
+
1237
+ def test_custom_split(self):
1238
+ """Test custom split name."""
1239
+ provider = HfLocationProvider(
1240
+ table_location="hf://datasets/test-org/test-dataset",
1241
+ table_properties={"hf.write.split": "validation"},
1242
+ )
1243
+
1244
+ path = provider.new_data_location("ignored.parquet")
1245
+ assert "validation-" in path
1246
+ assert path.endswith("-iceberg.parquet")
1247
+
1248
+ def test_custom_pattern(self):
1249
+ """Test custom file pattern."""
1250
+ provider = HfLocationProvider(
1251
+ table_location="hf://datasets/test-org/test-dataset",
1252
+ table_properties={
1253
+ "hf.write.pattern": "data-{split}-{uuid}.parquet",
1254
+ },
1255
+ )
1256
+
1257
+ path = provider.new_data_location("ignored.parquet")
1258
+ assert "data-train-" in path
1259
+ assert path.endswith(".parquet")
1260
+
1261
+ def test_uuidv7_sortability(self):
1262
+ """Test that UUIDv7 generates sortable identifiers."""
1263
+ import time
1264
+
1265
+ provider = HfLocationProvider(
1266
+ table_location="hf://datasets/test-org/test-dataset",
1267
+ table_properties={
1268
+ "hf.write.pattern": "{split}-{uuid}.parquet",
1269
+ },
1270
+ )
1271
+
1272
+ # Generate first UUID
1273
+ path1 = provider.new_data_location("ignored.parquet")
1274
+ filename1 = path1.split("/")[-1]
1275
+ uuid1 = filename1.replace("train-", "").replace(".parquet", "")
1276
+
1277
+ # Small delay to ensure different timestamp
1278
+ time.sleep(0.001)
1279
+
1280
+ # Generate second UUID
1281
+ path2 = provider.new_data_location("ignored.parquet")
1282
+ filename2 = path2.split("/")[-1]
1283
+ uuid2 = filename2.replace("train-", "").replace(".parquet", "")
1284
+
1285
+ # UUIDv7 should be sortable (later UUIDs are lexicographically greater)
1286
+ assert uuid1 < uuid2, "UUIDv7 should be sortable by timestamp"
1287
+ # UUIDs are 36 characters with hyphens
1288
+ assert len(uuid1) == 36
1289
+ assert len(uuid2) == 36
1290
+
1291
+
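+ # Illustrative sketch (not a collected test): combining the hf.write.split and
+ # hf.write.pattern properties that TestHfLocationProvider exercises separately
+ # above. The table location is a placeholder; assuming the two properties
+ # compose, the returned path should look like data-validation-<uuid>.parquet.
+ def _sketch_location_provider_path():
+     provider = HfLocationProvider(
+         table_location="hf://datasets/example-org/example-dataset",
+         table_properties={
+             "hf.write.split": "validation",
+             "hf.write.pattern": "data-{split}-{uuid}.parquet",
+         },
+     )
+     # The filename argument is ignored by the pattern-based naming (the tests
+     # above pass "ignored.parquet")
+     return provider.new_data_location("ignored.parquet")
+
+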
1292
+ # =============================================================================
1293
+ # Write to Existing Dataset Tests (Parametrized for local/remote)
1294
+ # =============================================================================
1295
+
1296
+
1297
+ class TestWriteToExistingDataset:
1298
+ """Tests for writing to existing HuggingFace datasets using location provider."""
1299
+
1300
+ def test_append_to_existing_dataset(self, writable_dataset):
1301
+ """Test appending data to an existing dataset with HfLocationProvider.
1302
+
1303
+ Verifies that:
1304
+ - The writable_dataset fixture provides a valid dataset
1305
+ - Data can be appended and read back correctly
1306
+ - Appended files are named by HfLocationProvider (default pattern: train-{uuid}-iceberg.parquet)
1307
+ """
1308
+ catalog = writable_dataset
1309
+
1310
+ # Verify table exists and is properly configured
1311
+ assert catalog.table_exists("testorg.testdataset")
1312
+ table = catalog.load_table("testorg.testdataset")
1313
+ assert table is not None
1314
+
1315
+ # Verify table has HfLocationProvider configured
1316
+ assert (
1317
+ table.properties.get("write.py-location-provider.impl")
1318
+ == "faceberg.catalog.HfLocationProvider"
1319
+ )
1320
+
1321
+ # Verify table has initial data
1322
+ before_count = table.scan().to_arrow().num_rows
1323
+ assert before_count == 10 # Initial data from fixture
1324
+
1325
+ # Append new data (including split column as it's part of the schema)
1326
+ new_data = pa.Table.from_pydict(
1327
+ {
1328
+ "split": ["train", "train", "train"],
1329
+ "text": ["Appended test review", "Another appended review", "Third review"],
1330
+ "label": [1, 0, 1],
1331
+ }
1332
+ )
1333
+
1334
+ table.append(new_data)
1335
+
1336
+ # Reload table to get updated metadata
1337
+ table = catalog.load_table("testorg.testdataset")
1338
+
1339
+ # Verify data was appended (count should increase)
1340
+ after_count = table.scan().to_arrow().num_rows
1341
+ assert after_count >= before_count + len(new_data)
1342
+
1343
+ # Verify appended data is readable
1344
+ scan = table.scan().filter("text = 'Appended test review'")
1345
+ result = scan.to_arrow()
1346
+ assert result.num_rows == 1
1347
+ assert result["text"][0].as_py() == "Appended test review"