faceberg-0.1.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- faceberg/__init__.py +15 -0
- faceberg/bridge.py +586 -0
- faceberg/catalog.py +1491 -0
- faceberg/cli.py +483 -0
- faceberg/config.py +208 -0
- faceberg/convert.py +813 -0
- faceberg/pretty.py +224 -0
- faceberg/server.py +439 -0
- faceberg/shell.py +83 -0
- faceberg/spaces/Dockerfile +10 -0
- faceberg/spaces/README.md +85 -0
- faceberg/spaces/landing.html +799 -0
- faceberg/tests/__init__.py +0 -0
- faceberg/tests/conftest.py +229 -0
- faceberg/tests/test_bridge.py +825 -0
- faceberg/tests/test_catalog.py +1347 -0
- faceberg/tests/test_catalog_duckdb.py +341 -0
- faceberg/tests/test_catalog_pandas.py +290 -0
- faceberg/tests/test_cli.py +62 -0
- faceberg/tests/test_config.py +367 -0
- faceberg/tests/test_convert.py +422 -0
- faceberg/tests/test_pretty.py +366 -0
- faceberg/tests/test_server.py +343 -0
- faceberg/tests/test_server_playwright.py +524 -0
- faceberg-0.1.0.dist-info/METADATA +175 -0
- faceberg-0.1.0.dist-info/RECORD +29 -0
- faceberg-0.1.0.dist-info/WHEEL +4 -0
- faceberg-0.1.0.dist-info/entry_points.txt +2 -0
- faceberg-0.1.0.dist-info/licenses/LICENSE +201 -0
@@ -0,0 +1,1347 @@
+"""Tests for FacebergCatalog implementation."""
+
+import uuid
+
+import pyarrow as pa
+import pytest
+from huggingface_hub import HfFileSystem
+from pyiceberg.exceptions import (
+    NamespaceAlreadyExistsError,
+    NamespaceNotEmptyError,
+    NoSuchTableError,
+    TableAlreadyExistsError,
+)
+from pyiceberg.io.fsspec import FsspecFileIO
+from pyiceberg.partitioning import PartitionField, PartitionSpec
+from pyiceberg.schema import Schema
+from pyiceberg.transforms import IdentityTransform
+from pyiceberg.types import LongType, NestedField, StringType
+
+from faceberg.catalog import HfFileIO, HfLocationProvider, LocalCatalog, RemoteCatalog
+from faceberg.catalog import catalog as catalog_factory
+
+
+@pytest.fixture
+def test_schema():
+    """Create a test schema."""
+    return Schema(
+        NestedField(1, "id", LongType(), required=True),
+        NestedField(2, "name", StringType(), required=False),
+    )
+
+
+@pytest.fixture
+def get_table_location(tmp_path):
+    """Generate a location for a table.
+
+    Returns a callable that generates unique table locations.
+    """
+    counter = 0
+
+    def _location(identifier: str = None):
+        nonlocal counter
+        counter += 1
+        if identifier:
+            # Use identifier as directory name
+            name = identifier.replace(".", "_")
+        else:
+            # Generate unique name
+            name = f"table_{counter}"
+        location_dir = tmp_path / "tables" / name
+        location_dir.mkdir(parents=True, exist_ok=True)
+        return f"file://{location_dir.as_posix()}"
+
+    return _location
+
+
+# =============================================================================
+# Catalog Creation Tests (Local-specific, not parametrized)
+# =============================================================================
+
+
+class TestCatalogCreation:
+    """Tests for catalog creation and initialization."""
+
+    def test_create_local_catalog(self, tmp_path):
+        """Test LocalCatalog creation."""
+        catalog_dir = tmp_path / "test_catalog"
+        catalog_dir.mkdir()
+        uri = f"file://{catalog_dir.as_posix()}"
+        catalog = LocalCatalog(name=str(catalog_dir), uri=uri)
+
+        # catalog.name is derived from path
+        assert catalog.name == str(catalog_dir)
+        assert catalog.uri.startswith("file:///")
+        assert catalog.uri.endswith(str(catalog_dir.name))
+        assert catalog_dir.exists()
+
+    def test_local_catalog_from_config(self, tmp_path):
+        """Test creating LocalCatalog from local config file."""
+        catalog_dir = tmp_path / "test_catalog"
+        catalog_dir.mkdir()
+        uri = f"file://{catalog_dir.as_posix()}"
+        catalog = LocalCatalog(name=str(catalog_dir), uri=uri)
+
+        assert catalog.uri.startswith("file:///")
+        assert catalog.uri.endswith(str(catalog_dir.name))
+
+    def test_catalog_persistence(self, tmp_path, test_schema):
+        """Test that catalog persists across instances."""
+        catalog_dir = tmp_path / "test_catalog"
+        # Create catalog and table
+        uri = f"file://{catalog_dir.as_posix()}"
+        catalog1 = LocalCatalog(name=str(catalog_dir), uri=uri)
+        catalog1.init()
+
+        catalog1.create_namespace("default")
+        table_location_dir = tmp_path / "tables" / "default_test_table"
+        table_location_dir.mkdir(parents=True)
+        catalog1.create_table(
+            "default.test_table", test_schema, location=f"file://{table_location_dir.as_posix()}"
+        )
+        # Changes are automatically persisted via context manager
+
+        # Create new catalog instance
+        catalog2 = LocalCatalog(name=str(catalog_dir), uri=uri)
+
+        # Table should still exist
+        assert catalog2.table_exists("default.test_table")
+        table = catalog2.load_table("default.test_table")
+        assert table.schema() == test_schema
+
+
+# =============================================================================
+# Namespace Operations (Parametrized for local/remote)
+# =============================================================================
+
+
+class TestNamespaceOperations:
+    """Tests for namespace create, read, update, delete operations."""
+
+    def test_create_namespace(self, catalog):
+        """Test namespace creation."""
+        catalog.create_namespace("default")
+        assert ("default",) in catalog.list_namespaces()
+
+    def test_list_namespaces_empty(self, catalog):
+        """Test listing namespaces when none exist."""
+        namespaces = catalog.list_namespaces()
+        assert namespaces == []
+
+    def test_list_namespaces_with_tables(self, catalog, test_schema, get_table_location):
+        """Test listing namespaces with hierarchical names."""
+        catalog.create_namespace("ns1")
+        catalog.create_table("ns1.table1", test_schema, location=get_table_location("ns1.table1"))
+
+        namespaces = catalog.list_namespaces()
+        assert ("ns1",) in namespaces
+
+    def test_drop_namespace(self, catalog):
+        """Test dropping an empty namespace."""
+        catalog.create_namespace("test_ns")
+        catalog.drop_namespace("test_ns")
+
+        # Namespace should not appear in list
+        assert ("test_ns",) not in catalog.list_namespaces()
+
+    def test_drop_namespace_not_empty(self, catalog, test_schema, get_table_location):
+        """Test that dropping a non-empty namespace raises error."""
+        catalog.create_namespace("test_ns")
+        catalog.create_table(
+            "test_ns.table1", test_schema, location=get_table_location("test_ns.table1")
+        )
+
+        with pytest.raises(NamespaceNotEmptyError):
+            catalog.drop_namespace("test_ns")
+
+    def test_update_namespace_properties(self, catalog):
+        """Test updating namespace properties."""
+        catalog.create_namespace("test_ns")
+        summary = catalog.update_namespace_properties(
+            "test_ns", removals={"old_prop"}, updates={"new_prop": "value"}
+        )
+
+        # Currently returns empty summary
+        assert summary.removed == []
+        assert summary.updated == []
+        assert summary.missing == []
+
+    def test_create_namespace_already_exists(self, catalog, test_schema, get_table_location):
+        """Test creating namespace that already exists (has tables)."""
+        catalog.create_namespace("test_ns")
+        catalog.create_table(
+            "test_ns.table1", test_schema, location=get_table_location("test_ns.table1")
+        )
+
+        with pytest.raises(NamespaceAlreadyExistsError):
+            catalog.create_namespace("test_ns")
+
+
+# =============================================================================
+# Table Read Operations (Parametrized for local/remote)
+# =============================================================================
+
+
+class TestTableRead:
+    """Tests for table read operations."""
+
+    def test_load_table(self, catalog, test_schema, get_table_location):
+        """Test loading a table."""
+        catalog.create_namespace("default")
+        catalog.create_table(
+            identifier="default.test_table",
+            schema=test_schema,
+            location=get_table_location("default.test_table"),
+        )
+
+        table = catalog.load_table("default.test_table")
+
+        assert table.schema() == test_schema
+
+    def test_list_tables(self, catalog, test_schema, get_table_location):
+        """Test listing tables."""
+        catalog.create_namespace("default")
+
+        # Create multiple tables
+        catalog.create_table(
+            "default.table1", test_schema, location=get_table_location("default.table1")
+        )
+        catalog.create_table(
+            "default.table2", test_schema, location=get_table_location("default.table2")
+        )
+
+        tables = catalog.list_tables("default")
+
+        assert len(tables) == 2
+        assert ("default", "table1") in tables
+        assert ("default", "table2") in tables
+
+    def test_table_exists(self, catalog, test_schema, get_table_location):
+        """Test checking table existence."""
+        catalog.create_namespace("default")
+
+        assert not catalog.table_exists("default.test_table")
+
+        catalog.create_table(
+            "default.test_table", test_schema, location=get_table_location("default.test_table")
+        )
+
+        assert catalog.table_exists("default.test_table")
+
+    def test_load_table_not_found(self, catalog):
+        """Test loading non-existent table raises error."""
+        with pytest.raises(NoSuchTableError):
+            catalog.load_table("default.nonexistent")
+
+
+# =============================================================================
+# Table Write Operations (Parametrized for local/remote)
+# =============================================================================
+
+
+class TestTableWrite:
+    """Tests for table create, update, delete operations."""
+
+    def test_create_table(self, catalog, test_schema, get_table_location):
+        """Test table creation."""
+        catalog.create_namespace("default")
+
+        table = catalog.create_table(
+            identifier="default.test_table",
+            schema=test_schema,
+            location=get_table_location("default.test_table"),
+        )
+
+        assert table.metadata is not None
+        assert table.schema() == test_schema
+
+    def test_drop_table(self, catalog, test_schema, get_table_location):
+        """Test dropping a table."""
+        catalog.create_namespace("default")
+        catalog.create_table(
+            "default.test_table", test_schema, location=get_table_location("default.test_table")
+        )
+
+        assert catalog.table_exists("default.test_table")
+
+        catalog.drop_table("default.test_table")
+
+        assert not catalog.table_exists("default.test_table")
+
+    def test_rename_table(self, catalog, test_schema, get_table_location):
+        """Test renaming a table."""
+        catalog.create_namespace("default")
+        catalog.create_table(
+            "default.old_name", test_schema, location=get_table_location("default.old_name")
+        )
+
+        catalog.rename_table("default.old_name", "default.new_name")
+
+        assert not catalog.table_exists("default.old_name")
+        assert catalog.table_exists("default.new_name")
+
+    def test_drop_table_not_found(self, catalog):
+        """Test dropping non-existent table raises error."""
+        with pytest.raises(NoSuchTableError):
+            catalog.drop_table("default.nonexistent")
+
+    def test_rename_table_source_not_found(self, catalog):
+        """Test renaming non-existent table raises error."""
+        with pytest.raises(NoSuchTableError):
+            catalog.rename_table("default.nonexistent", "default.new_name")
+
+    def test_rename_table_destination_exists(self, catalog, test_schema, get_table_location):
+        """Test that renaming to existing table name raises error."""
+        catalog.create_namespace("default")
+        catalog.create_table(
+            "default.table1", test_schema, location=get_table_location("default.table1")
+        )
+        catalog.create_table(
+            "default.table2", test_schema, location=get_table_location("default.table2")
+        )
+
+        with pytest.raises(TableAlreadyExistsError):
+            catalog.rename_table("default.table1", "default.table2")
+
+    def test_create_table_transaction_not_implemented(self, catalog, test_schema):
+        """Test that table transactions are not yet implemented."""
+        with pytest.raises(NotImplementedError):
+            catalog.create_table_transaction("default.test_table", test_schema)
+
+
+# =============================================================================
+# Table Write Properties (Parametrized for local/remote)
+# =============================================================================
+
+
+class TestTableWriteProperties:
+    """Tests for table write properties and LocationProvider."""
+
+    def test_create_table_with_write_properties(self, catalog, test_schema, get_table_location):
+        """Test creating a table with write LocationProvider configured."""
+        catalog.create_namespace("default")
+        table = catalog.create_table(
+            "default.write_test",
+            schema=test_schema,
+            location=get_table_location("default.write_test"),
+            properties={
+                "write.py-location-provider.impl": "faceberg.catalog.HfLocationProvider",
+                "hf.write.split": "train",
+            },
+        )
+
+        # Verify LocationProvider is configured
+        assert (
+            table.properties.get("write.py-location-provider.impl")
+            == "faceberg.catalog.HfLocationProvider"
+        )
+
+    def test_location_provider_returns_correct_type(self, catalog, test_schema, get_table_location):
+        """Test that table.location_provider() returns HfLocationProvider."""
+        catalog.create_namespace("default")
+        table = catalog.create_table(
+            "default.test_table",
+            schema=test_schema,
+            location=get_table_location("default.test_table"),
+            properties={
+                "write.py-location-provider.impl": "faceberg.catalog.HfLocationProvider",
+            },
+        )
+
+        # Verify LocationProvider is configured
+        provider = table.location_provider()
+        assert isinstance(provider, HfLocationProvider)
+
+
+# =============================================================================
+# Table Append Operations (Parametrized for local/remote)
+# =============================================================================
+
+
+class TestTableAppend:
+    """Tests for PyIceberg append operations on writable tables."""
+
+    @pytest.fixture
+    def writable_catalog(self, catalog, tmp_path):
+        """Create catalog with writable table for testing write operations.
+
+        Creates a catalog with a writable table (not from HuggingFace dataset)
+        that can be used to test append and other write operations.
+
+        Args:
+            catalog: Empty catalog instance (local or remote, from parametrized fixture)
+            tmp_path: Temporary directory for table data
+
+        Returns:
+            Catalog instance with a writable test_table in the default namespace
+        """
+        # Create data directory for the table
+        data_dir = tmp_path / "data"
+        data_dir.mkdir()
+        location = f"file://{data_dir.as_posix()}"
+
+        # Create the table with schema matching imdb dataset
+        schema = Schema(
+            NestedField(field_id=1, name="split", field_type=StringType(), required=False),
+            NestedField(field_id=2, name="text", field_type=StringType(), required=False),
+            NestedField(field_id=3, name="label", field_type=LongType(), required=False),
+        )
+
+        partition_spec = PartitionSpec(
+            PartitionField(source_id=1, field_id=1000, transform=IdentityTransform(), name="split")
+        )
+
+        # Create the table with mandatory location argument
+        catalog.create_table(
+            identifier="default.test_table",
+            schema=schema,
+            location=location,
+            partition_spec=partition_spec,
+        )
+
+        return catalog
+
+    def test_append_data(self, writable_catalog):
+        """Verifies data is appended, count increases, and data is scannable."""
+        table = writable_catalog.load_table("default.test_table")
+
+        # Record count before append
+        before_count = table.scan().to_arrow().num_rows
+
+        # Create test data with unique text for verification
+        unique_text = f"Unique test review {uuid.uuid4()}"
+        test_data = pa.Table.from_pydict(
+            {
+                "split": ["test", "test"],
+                "text": [unique_text, "Test review 2"],
+                "label": [1, 0],
+            }
+        )
+
+        # Append data
+        table.append(test_data)
+
+        # Verify count increased by expected amount
+        after_count = table.scan().to_arrow().num_rows
+        assert after_count == before_count + len(test_data)
+
+        # Verify appended data is readable via scan
+        scan = table.scan().filter(f"text = '{unique_text}'")
+        result = scan.to_arrow()
+        assert result.num_rows == 1
+        assert result["text"][0].as_py() == unique_text
+
+    def test_append_data_snapshot_history(self, writable_catalog):
+        """Test snapshot history is updated after append."""
+        table = writable_catalog.load_table("default.test_table")
+
+        # Record snapshot count before append
+        snapshots_before = list(table.snapshots())
+        snapshot_count_before = len(snapshots_before)
+
+        # Create and append test data
+        test_data = pa.Table.from_pydict(
+            {
+                "split": ["test"],
+                "text": ["Snapshot test review"],
+                "label": [1],
+            }
+        )
+        table.append(test_data)
+
+        # Reload table to get updated snapshots
+        table = writable_catalog.load_table("default.test_table")
+        snapshots_after = list(table.snapshots())
+
+        # Verify new snapshot was created
+        assert len(snapshots_after) == snapshot_count_before + 1
+
+        # Verify latest snapshot has append operation
+        latest_snapshot = snapshots_after[-1]
+        assert latest_snapshot.summary is not None
+        # Summary.operation is an enum, not a string
+        from pyiceberg.table.snapshots import Operation
+
+        assert latest_snapshot.summary.operation == Operation.APPEND
+
+    def test_append_data_partition_integrity(self, writable_catalog):
+        """Test partition integrity is maintained after append."""
+        table = writable_catalog.load_table("default.test_table")
+
+        # Record partition spec before append
+        spec_before = table.spec()
+
+        # Create test data for specific partition
+        test_data = pa.Table.from_pydict(
+            {
+                "split": ["test", "test"],
+                "text": ["Partition test review 1", "Partition test review 2"],
+                "label": [1, 0],
+            }
+        )
+        table.append(test_data)
+
+        # Reload table and verify partition spec unchanged
+        table = writable_catalog.load_table("default.test_table")
+        spec_after = table.spec()
+        assert len(spec_before.fields) == len(spec_after.fields)
+
+        # Verify partition filtering still works
+        scan = table.scan().filter("split = 'test'")
+        result = scan.to_arrow()
+
+        # All rows should have split == 'test'
+        split_values = result["split"].unique().to_pylist()
+        assert split_values == ["test"]
+        assert result.num_rows > 0
+
+
+# =============================================================================
+# Dataset Operations (Parametrized for local/remote)
+# =============================================================================
+
+
+class TestDatasetOperations:
+    """Tests for HuggingFace dataset integration."""
+
+    def test_namespace_exists_after_add_dataset(self, session_mbpp):
+        """Test that namespaces exist after datasets are added."""
+        # Namespace should exist after add_dataset
+        assert ("google-research-datasets",) in session_mbpp.list_namespaces()
+
+        # Verify table exists
+        tables = session_mbpp.list_tables("google-research-datasets")
+        assert len(tables) > 0
+
+    def test_add_dataset_already_exists(self, catalog):
+        """Test adding a dataset that already exists raises error."""
+        # Create table first time
+        catalog.add_dataset("default.imdb_plain_text", "stanfordnlp/imdb", config="plain_text")
+
+        # Try to create again - should raise
+        with pytest.raises(TableAlreadyExistsError):
+            catalog.add_dataset(
+                "default.imdb_plain_text",
+                "stanfordnlp/imdb",
+                config="plain_text",
+            )
+
+    def test_add_dataset_with_config(self, catalog):
+        """Test adding a dataset with a specific config."""
+        # Create table
+        table = catalog.add_dataset(
+            "default.imdb_plain_text",
+            "stanfordnlp/imdb",
+            config="plain_text",
+        )
+
+        # Verify table
+        assert table is not None
+        assert table.schema() is not None
+        assert len(table.schema().fields) > 0
+
+        # Verify table properties
+        props = table.properties
+        assert "hf.dataset.repo" in props
+        assert props["hf.dataset.repo"] == "stanfordnlp/imdb"
+        assert "hf.dataset.config" in props
+        assert props["hf.dataset.config"] == "plain_text"
+
+
+# =============================================================================
+# Unsupported Operations (Parametrized for local/remote)
+# =============================================================================
+
+
+class TestTableScanning:
+    """Tests for PyIceberg table scanning operations."""
+
+    def test_scan_basic(self, session_mbpp):
+        """Test creating a basic scan object."""
+        catalog = session_mbpp
+        table = catalog.load_table("google-research-datasets.mbpp")
+        scan = table.scan()
+
+        # Verify scan object is created
+        assert scan is not None
+
+        # Verify scan has expected methods
+        assert hasattr(scan, "to_arrow")
+        assert hasattr(scan, "to_pandas")
+        assert hasattr(scan, "to_arrow_batch_reader")
+
+    def test_scan_to_arrow(self, session_mbpp):
+        """Test scanning table to Arrow table."""
+
+        catalog = session_mbpp
+        table = catalog.load_table("google-research-datasets.mbpp")
+        scan = table.scan()
+
+        # Convert to Arrow table
+        arrow_table = scan.to_arrow()
+
+        # Verify it's an Arrow table
+        assert isinstance(arrow_table, pa.Table)
+
+        # Verify we have rows
+        assert arrow_table.num_rows > 0
+
+        # Verify expected columns are present
+        column_names = arrow_table.schema.names
+        assert "split" in column_names
+        # Verify dataset has at least 2 other columns besides split
+        assert len(column_names) >= 3
+
+        # Verify split column contains expected values
+        split_values = arrow_table["split"].unique().to_pylist()
+        assert any(split in split_values for split in ["train", "test", "validation", "prompt"])
+
+    def test_scan_to_pandas(self, session_mbpp):
+        """Test scanning table to Pandas DataFrame."""
+
+        catalog = session_mbpp
+        table = catalog.load_table("google-research-datasets.mbpp")
+        scan = table.scan()
+
+        # Convert to Pandas DataFrame
+        df = scan.to_pandas()
+
+        # Verify DataFrame shape
+        assert len(df) > 0
+        assert len(df.columns) > 0
+
+        # Verify split column exists
+        assert "split" in df.columns
+
+        # Verify we have multiple columns
+        assert len(df.columns) >= 3
+
+    def test_scan_with_selected_fields(self, session_mbpp):
+        """Test scanning with column projection."""
+        catalog = session_mbpp
+        table = catalog.load_table("google-research-datasets.mbpp")
+
+        # Get schema to know which columns exist
+        schema = table.schema()
+        # Select first two non-split columns
+        cols_to_select = [f.name for f in schema.fields if f.name != "split"][:2]
+
+        # Scan with only specific columns selected
+        scan = table.scan().select(*cols_to_select)
+        arrow_table = scan.to_arrow()
+
+        # Verify only selected columns are present
+        column_names = arrow_table.schema.names
+        assert len(column_names) == len(cols_to_select)
+        assert "split" not in column_names
+        for col in cols_to_select:
+            assert col in column_names
+
+    def test_scan_limit(self, session_mbpp):
+        """Test scanning with row limit."""
+        catalog = session_mbpp
+        table = catalog.load_table("google-research-datasets.mbpp")
+
+        # PyIceberg doesn't support limit() directly on scan, need to materialize first
+        scan = table.scan()
+        arrow_table = scan.to_arrow()
+
+        # Take first 10 rows
+        limited_table = arrow_table.slice(0, 10)
+
+        # Verify exactly 10 rows
+        assert limited_table.num_rows == 10
+
+    def test_partition_filter_single_split(self, session_mbpp):
+        """Test partition pruning with single split filter."""
+        catalog = session_mbpp
+        table = catalog.load_table("google-research-datasets.mbpp")
+
+        # Scan with split filter
+        scan = table.scan().filter("split = 'train'")
+        arrow_table = scan.to_arrow()
+
+        # Verify all rows have split == "train"
+        split_values = arrow_table["split"].unique().to_pylist()
+        assert split_values == ["train"]
+
+        # Verify we got some rows (not empty result)
+        assert arrow_table.num_rows > 0
+
+    def test_partition_filter_multiple_splits(self, session_mbpp):
+        """Test partition pruning with multiple split filter."""
+        catalog = session_mbpp
+        table = catalog.load_table("google-research-datasets.mbpp")
+
+        # Scan with IN filter for multiple splits
+        scan = table.scan().filter("split IN ('train', 'test')")
+        df = scan.to_pandas()
+
+        # Verify only train and test splits are present
+        unique_splits = df["split"].unique()
+        assert set(unique_splits).issubset({"train", "test"})
+
+        # Verify other splits are excluded (validation, prompt)
+        assert "validation" not in unique_splits
+        assert "prompt" not in unique_splits
+
+        # Verify we got some rows
+        assert len(df) > 0
+
+    def test_scan_all_partitions(self, session_mbpp):
+        """Test scanning all partitions without filter."""
+        catalog = session_mbpp
+        table = catalog.load_table("google-research-datasets.mbpp")
+
+        # Scan without filter
+        scan = table.scan()
+        arrow_table = scan.to_arrow()
+
+        # Group by split to get all partitions
+        split_values = set(arrow_table["split"].to_pylist())
+
+        # Verify we have multiple splits
+        assert len(split_values) > 1
+
+        # Verify expected splits are present (mbpp has train/test/validation/prompt)
+        assert "train" in split_values or "test" in split_values
+
+    def test_scan_empty_result(self, session_mbpp):
+        """Test scanning with filter that returns no rows."""
+        catalog = session_mbpp
+        table = catalog.load_table("google-research-datasets.mbpp")
+
+        # Scan with impossible filter
+        scan = table.scan().filter("split = 'nonexistent_split'")
+        arrow_table = scan.to_arrow()
+
+        # Verify 0 rows returned
+        assert arrow_table.num_rows == 0
+
+        # Verify schema is still correct (has split and other columns)
+        assert "split" in arrow_table.schema.names
+        assert len(arrow_table.schema.names) >= 3
+
+    def test_multiple_scans_same_table(self, session_mbpp):
+        """Test multiple independent scans from the same table."""
+        catalog = session_mbpp
+        table = catalog.load_table("google-research-datasets.mbpp")
+
+        # Create two independent scans
+        scan1 = table.scan().filter("split = 'train'")
+        scan2 = table.scan().filter("split = 'test'")
+
+        # Materialize both scans
+        df1 = scan1.to_pandas().head(5)
+        df2 = scan2.to_pandas().head(3)
+
+        # Verify they don't interfere with each other
+        assert len(df1) == 5
+        assert all(df1["split"] == "train")
+
+        assert len(df2) == 3
+        assert all(df2["split"] == "test")
+
+
+class TestTableMetadata:
+    """Tests for PyIceberg metadata reading operations."""
+
+    def test_read_schema(self, session_mbpp):
+        """Test reading table schema."""
+        catalog = session_mbpp
+        table = catalog.load_table("google-research-datasets.mbpp")
+        schema = table.schema()
+
+        # Verify schema has expected fields
+        field_names = [field.name for field in schema.fields]
+        assert "split" in field_names
+        # Verify we have multiple fields (at least 3)
+        assert len(field_names) >= 3
+
+        # Verify field IDs are assigned (all > 0)
+        for field in schema.fields:
+            assert field.field_id > 0
+
+        # Verify split column is first field
+        assert schema.fields[0].name == "split"
+
+    def test_read_partition_spec(self, session_mbpp):
+        """Test reading partition specification."""
+
+        catalog = session_mbpp
+        table = catalog.load_table("google-research-datasets.mbpp")
+        spec = table.spec()
+
+        # Verify partition spec has at least one field
+        assert len(spec.fields) >= 1
+
+        # Find the split partition field
+        split_partition = None
+        for field in spec.fields:
+            if field.name == "split":
+                split_partition = field
+                break
+
+        # Verify split partition exists with identity transform
+        assert split_partition is not None
+        assert isinstance(split_partition.transform, IdentityTransform)
+
+    def test_read_properties(self, session_mbpp):
+        """Test reading table properties."""
+        catalog = session_mbpp
+        table = catalog.load_table("google-research-datasets.mbpp")
+        properties = table.properties
+
+        # Verify HuggingFace properties exist
+        assert "hf.dataset.repo" in properties
+        assert properties["hf.dataset.repo"] == "google-research-datasets/mbpp"
+
+        assert "hf.dataset.config" in properties
+        assert properties["hf.dataset.config"] == "sanitized"
+
+        assert "hf.dataset.revision" in properties
+
+        # Verify schema name mapping is present
+        assert "schema.name-mapping.default" in properties
+
+    def test_read_snapshots(self, session_mbpp):
+        """Test reading table snapshots."""
+        catalog = session_mbpp
+        table = catalog.load_table("google-research-datasets.mbpp")
+        snapshots = list(table.snapshots())
+
+        # Verify at least one snapshot exists
+        assert len(snapshots) > 0
+
+        # Verify snapshot has expected attributes
+        snapshot = snapshots[0]
+        assert hasattr(snapshot, "snapshot_id")
+        assert hasattr(snapshot, "manifest_list")
+        assert snapshot.snapshot_id > 0
+
+    def test_current_snapshot(self, session_mbpp):
+        """Test reading current snapshot."""
+        catalog = session_mbpp
+        table = catalog.load_table("google-research-datasets.mbpp")
+        snapshot = table.current_snapshot()
+
+        # Verify current snapshot exists
+        assert snapshot is not None
+
+        # Verify snapshot has summary
+        assert snapshot.summary is not None
+
+        # Verify snapshot ID exists
+        assert snapshot.snapshot_id > 0
+
+
+# =============================================================================
+# REST Catalog Integration Tests
+# =============================================================================
+
+
+class TestRestCatalogOperations:
+    """Tests for PyIceberg REST catalog basic operations."""
+
+    def test_rest_list_namespaces(self, session_rest_catalog):
+        """Test listing namespaces via REST catalog."""
+        namespaces = session_rest_catalog.list_namespaces()
+
+        # Verify we got namespaces
+        assert len(namespaces) > 0
+
+        # Verify google-research-datasets namespace exists
+        namespace_strs = [".".join(ns) if isinstance(ns, tuple) else ns for ns in namespaces]
+        assert "google-research-datasets" in namespace_strs
+
+    def test_rest_list_tables(self, session_rest_catalog):
+        """Test listing tables via REST catalog."""
+        tables = session_rest_catalog.list_tables("google-research-datasets")
+
+        # Verify we got tables
+        assert len(tables) > 0
+
+        # Verify mbpp table exists
+        table_names = [t[1] if isinstance(t, tuple) and len(t) > 1 else str(t) for t in tables]
+        assert "mbpp" in table_names
+
+    def test_rest_load_table(self, session_rest_catalog):
+        """Test loading a table via REST catalog."""
+        table = session_rest_catalog.load_table("google-research-datasets.mbpp")
+
+        # Verify table loaded successfully
+        assert table is not None
+
+        # Verify table has schema
+        schema = table.schema()
+        assert schema is not None
+        assert len(schema.fields) > 0
+
+
+class TestRestCatalogScanning:
+    """Tests for PyIceberg REST catalog scanning operations."""
+
+    def test_rest_scan_to_arrow(self, session_rest_catalog):
+        """Test scanning table to Arrow via REST catalog."""
+
+        table = session_rest_catalog.load_table("google-research-datasets.mbpp")
+        scan = table.scan()
+
+        # Convert to Arrow table
+        arrow_table = scan.to_arrow()
+
+        # Verify it's an Arrow table
+        assert isinstance(arrow_table, pa.Table)
+
+        # Verify we have rows
+        assert arrow_table.num_rows > 0
+
+        # Verify expected columns (split + at least 2 other columns)
+        column_names = arrow_table.schema.names
+        assert "split" in column_names
+        assert len(column_names) >= 3
+
+    def test_rest_scan_to_pandas(self, session_rest_catalog):
+        """Test scanning table to Pandas via REST catalog."""
+        table = session_rest_catalog.load_table("google-research-datasets.mbpp")
+        scan = table.scan()
+
+        # Convert to Pandas DataFrame
+        df = scan.to_pandas()
+
+        # Verify DataFrame shape
+        assert len(df) > 0
+        assert len(df.columns) > 0
+
+        # Verify split column exists
+        assert "split" in df.columns
+
+    def test_rest_partition_filter(self, session_rest_catalog):
+        """Test partition filtering via REST catalog."""
+        table = session_rest_catalog.load_table("google-research-datasets.mbpp")
+
+        # Scan with split filter
+        scan = table.scan().filter("split = 'train'")
+        arrow_table = scan.to_arrow()
+
+        # Verify all rows have split == "train"
+        split_values = arrow_table["split"].unique().to_pylist()
+        assert split_values == ["train"]
+
+        # Verify we got some rows
+        assert arrow_table.num_rows > 0
+
+    def test_rest_column_projection(self, session_rest_catalog):
+        """Test column projection via REST catalog."""
+        table = session_rest_catalog.load_table("google-research-datasets.mbpp")
+
+        # Get schema to know which columns exist
+        schema = table.schema()
+        # Select first two non-split columns
+        cols_to_select = [f.name for f in schema.fields if f.name != "split"][:2]
+
+        # Scan with only specific columns selected
+        scan = table.scan().select(*cols_to_select)
+        arrow_table = scan.to_arrow()
+
+        # Verify only selected columns are present
+        column_names = arrow_table.schema.names
+        assert len(column_names) == len(cols_to_select)
+        assert "split" not in column_names
+        for col in cols_to_select:
+            assert col in column_names
+
+
+class TestRestCatalogMetadata:
+    """Tests for PyIceberg REST catalog metadata operations."""
+
+    def test_rest_read_schema(self, session_rest_catalog):
+        """Test reading table schema via REST catalog."""
+        table = session_rest_catalog.load_table("google-research-datasets.mbpp")
+        schema = table.schema()
+
+        # Verify schema has expected fields
+        field_names = [field.name for field in schema.fields]
+        assert "split" in field_names
+        # Verify we have multiple fields (at least 3)
+        assert len(field_names) >= 3
+
+    def test_rest_read_properties(self, session_rest_catalog):
+        """Test reading table properties via REST catalog."""
+        table = session_rest_catalog.load_table("google-research-datasets.mbpp")
+        properties = table.properties
+
+        # Verify HuggingFace properties exist
+        assert "hf.dataset.repo" in properties
+        assert properties["hf.dataset.repo"] == "google-research-datasets/mbpp"
+
+    def test_rest_read_snapshots(self, session_rest_catalog):
+        """Test reading table snapshots via REST catalog."""
+        table = session_rest_catalog.load_table("google-research-datasets.mbpp")
+        snapshots = list(table.snapshots())
+
+        # Verify at least one snapshot exists
+        assert len(snapshots) > 0
+
+        # Verify snapshot has expected attributes
+        snapshot = snapshots[0]
+        assert hasattr(snapshot, "snapshot_id")
+        assert snapshot.snapshot_id > 0
+
+
+class TestUnsupportedOperations:
+    """Tests for operations that are not yet supported."""
+
+    def test_view_operations_not_supported(self, catalog):
+        """Test that view operations are not supported."""
+        # view_exists should raise NotImplementedError
+        with pytest.raises(NotImplementedError):
+            catalog.view_exists("default.test_view")
+
+        # list_views should raise NotImplementedError
+        with pytest.raises(NotImplementedError):
+            catalog.list_views("default")
+
+        # drop_view should raise NotImplementedError
+        with pytest.raises(NotImplementedError):
+            catalog.drop_view("default.test_view")
+
+
+# =============================================================================
+# HfFileIO Tests
+# =============================================================================
+
+
+class TestHfFileIO:
+    """Tests for HfFileIO custom FileIO implementation."""
+
+    def test_hffileio_initialization(self):
+        """Test that HfFileIO can be initialized with properties."""
+        io = HfFileIO(
+            properties={
+                "hf.endpoint": "https://huggingface.co",
+                "hf.token": "test_token",
+            }
+        )
+
+        assert io is not None
+        assert io.properties["hf.endpoint"] == "https://huggingface.co"
+        assert io.properties["hf.token"] == "test_token"
+
+    def test_hffileio_creates_hf_filesystem(self):
+        """Test that HfFileIO creates HfFileSystem for hf:// scheme."""
+        io = HfFileIO(properties={"hf.endpoint": "https://huggingface.co"})
+        fs = io.get_fs("hf")
+
+        assert isinstance(fs, HfFileSystem)
+
+    def test_hffileio_uses_skip_instance_cache(self):
+        """Test that HfFileIO creates multiple distinct HfFileSystem instances.
+
+        When skip_instance_cache=True, each call to get_fs('hf') should create
+        a new HfFileSystem instance (after cache eviction). This test verifies
+        that our custom factory uses skip_instance_cache correctly.
+        """
+        io = HfFileIO(properties={"hf.endpoint": "https://huggingface.co"})
+
+        # First call creates and caches filesystem
+        fs1 = io.get_fs("hf")
+
+        # Verify we got a HfFileSystem instance
+        assert isinstance(fs1, HfFileSystem)
+
+        # Just verify that calling get_fs again works
+        # (Testing internal cache behavior is fragile across pyiceberg versions)
+        fs2 = io.get_fs("hf")
+        assert isinstance(fs2, HfFileSystem)
+
+    def test_hffileio_extends_fsspec_fileio(self):
+        """Test that HfFileIO properly extends FsspecFileIO."""
+        io = HfFileIO(properties={})
+
+        assert isinstance(io, FsspecFileIO)
+        # Should have all standard FileIO methods
+        assert hasattr(io, "new_input")
+        assert hasattr(io, "new_output")
+        assert hasattr(io, "delete")
+        assert hasattr(io, "get_fs")
+
+
+# =============================================================================
+# catalog() Factory Function Tests
+# =============================================================================
+
+
+class TestCatalogFactory:
+    """Tests for the catalog() factory function."""
+
+    def test_catalog_local_directory_path(self, tmp_path):
+        """Test creating LocalCatalog from directory path."""
+        catalog_dir = tmp_path / "test_catalog"
+        catalog_dir.mkdir()
+
+        cat = catalog_factory(str(catalog_dir))
+
+        assert isinstance(cat, LocalCatalog)
+        assert cat.uri.startswith("file:///")
+
+    def test_catalog_local_file_uri(self, tmp_path):
+        """Test creating LocalCatalog from file:// URI."""
+        catalog_dir = tmp_path / "test_catalog"
+        catalog_dir.mkdir()
+        uri = f"file://{catalog_dir.as_posix()}"
+
+        cat = catalog_factory(uri)
+
+        assert isinstance(cat, LocalCatalog)
+        assert cat.uri.startswith("file:///")
+
+    def test_catalog_remote_datasets_explicit(self):
+        """Test creating RemoteCatalog with explicit hf://datasets/ URI."""
+        cat = catalog_factory("hf://datasets/my-org/my-repo", hf_token="test_token")
+
+        assert isinstance(cat, RemoteCatalog)
+        assert cat.uri == "hf://datasets/my-org/my-repo"
+
+    def test_catalog_remote_spaces_explicit(self):
+        """Test creating RemoteCatalog with explicit hf://spaces/ URI."""
+        cat = catalog_factory("hf://spaces/my-org/my-space", hf_token="test_token")
+
+        assert isinstance(cat, RemoteCatalog)
+        assert cat.uri == "hf://spaces/my-org/my-space"
+
+    def test_catalog_remote_models_explicit(self):
+        """Test creating RemoteCatalog with explicit hf://models/ URI."""
+
+        with pytest.raises(ValueError, match="Unsupported"):
+            catalog_factory("hf://models/my-org/my-model", hf_token="test_token")
+
+    def test_catalog_remote_shorthand_defaults_to_spaces(self):
+        """Test creating RemoteCatalog with shorthand org/repo format defaults to spaces."""
+        cat = catalog_factory("my-org/my-repo", hf_token="test_token")
+
+        assert isinstance(cat, RemoteCatalog)
+        assert cat.uri == "hf://spaces/my-org/my-repo"
+        assert cat.name == "my-org/my-repo"
+
+    def test_catalog_remote_with_properties(self):
+        """Test creating RemoteCatalog with additional properties."""
+        cat = catalog_factory(
+            "hf://spaces/my-org/my-space",
+            hf_token="test_token",
+            custom_prop="custom_value",
+        )
+
+        assert isinstance(cat, RemoteCatalog)
+        assert cat.properties["custom_prop"] == "custom_value"
+
+    def test_catalog_local_with_hf_token(self, tmp_path):
+        """Test creating LocalCatalog with hf_token (for accessing datasets)."""
+        catalog_dir = tmp_path / "test_catalog"
+        catalog_dir.mkdir()
+
+        cat = catalog_factory(str(catalog_dir), hf_token="test_token")
+
+        assert isinstance(cat, LocalCatalog)
+
+    def test_catalog_name_extraction_from_hf_uri(self):
+        """Test that catalog name is correctly extracted from hf:// URI."""
+        # Datasets
+        cat1 = catalog_factory("hf://datasets/org/repo")
+        assert cat1.name == "org/repo"
+
+        # Spaces
+        cat2 = catalog_factory("hf://spaces/org/space")
+        assert cat2.name == "org/space"
+
+    def test_catalog_warehouse_property_set_correctly(self, tmp_path):
+        """Test that warehouse property is set correctly for different catalog types."""
+        # Local catalog
+        catalog_dir = tmp_path / "test_catalog"
+        catalog_dir.mkdir()
+        local_cat = catalog_factory(str(catalog_dir))
+        assert local_cat.properties["warehouse"] == str(catalog_dir)
+
+        # Remote catalog
+        remote_cat = catalog_factory("hf://datasets/org/repo")
+        assert remote_cat.properties["warehouse"] == "hf://datasets/org/repo"
+
+    def test_local_catalog_requires_file_uri(self, tmp_path):
+        """Test that LocalCatalog requires file:// URI."""
+        catalog_dir = tmp_path / "test_catalog"
+        catalog_dir.mkdir()
+
+        # Should raise ValueError when given a plain path
+        with pytest.raises(ValueError, match="LocalCatalog requires file:// URI"):
+            LocalCatalog(name="test", uri=str(catalog_dir))
+
+        # Should work with file:// URI
+        uri = f"file://{catalog_dir.as_posix()}"
+        cat = LocalCatalog(name="test", uri=uri)
+        assert isinstance(cat, LocalCatalog)
+
+    def test_remote_catalog_requires_hf_uri(self):
+        """Test that RemoteCatalog requires hf:// URI."""
+        # Should raise ValueError when given an invalid URI
+        with pytest.raises(ValueError, match="RemoteCatalog requires hf:// URI"):
+            RemoteCatalog(name="test", uri="file:///path/to/catalog")
+
+        with pytest.raises(ValueError, match="RemoteCatalog requires hf:// URI"):
+            RemoteCatalog(name="test", uri="org/repo")
+
+        # Should work with hf:// URI
+        cat = RemoteCatalog(name="test", uri="hf://datasets/org/repo")
+        assert isinstance(cat, RemoteCatalog)
+
+    def test_catalog_factory_handles_path_conversion(self, tmp_path):
+        """Test that catalog() factory converts paths to file:// URIs."""
+        catalog_dir = tmp_path / "test_catalog"
+        catalog_dir.mkdir()
+
+        # Factory should accept plain path and convert to file:// URI
+        cat = catalog_factory(str(catalog_dir))
+        assert isinstance(cat, LocalCatalog)
+        assert cat.uri.startswith("file:///")
+
+
+# =============================================================================
+# HfLocationProvider Tests
+# =============================================================================
+
+
+class TestHfLocationProvider:
+    """Tests for HfLocationProvider."""
+
+    def test_default_pattern(self):
+        """Test default file naming pattern with UUIDv7."""
+        provider = HfLocationProvider(
+            table_location="hf://datasets/test-org/test-dataset",
+            table_properties={},
+        )
+
+        # First file
+        path1 = provider.new_data_location("ignored.parquet")
+        assert path1.endswith("-iceberg.parquet")
+        assert "train-" in path1
+        # UUIDv7 is 36 characters with hyphens
+        filename1 = path1.split("/")[-1]
+        uuid_part1 = filename1.replace("train-", "").replace("-iceberg.parquet", "")
+        assert len(uuid_part1) == 36
+
+        # Second file - should have different UUID
+        path2 = provider.new_data_location("ignored.parquet")
+        assert path2.endswith("-iceberg.parquet")
+        assert "train-" in path2
+        assert path1 != path2  # Different UUIDs
+
+    def test_custom_split(self):
+        """Test custom split name."""
+        provider = HfLocationProvider(
+            table_location="hf://datasets/test-org/test-dataset",
+            table_properties={"hf.write.split": "validation"},
+        )
+
+        path = provider.new_data_location("ignored.parquet")
+        assert "validation-" in path
+        assert path.endswith("-iceberg.parquet")
+
+    def test_custom_pattern(self):
+        """Test custom file pattern."""
+        provider = HfLocationProvider(
+            table_location="hf://datasets/test-org/test-dataset",
+            table_properties={
+                "hf.write.pattern": "data-{split}-{uuid}.parquet",
+            },
+        )
+
+        path = provider.new_data_location("ignored.parquet")
+        assert "data-train-" in path
+        assert path.endswith(".parquet")
+
+    def test_uuidv7_sortability(self):
+        """Test that UUIDv7 generates sortable identifiers."""
+        import time
+
+        provider = HfLocationProvider(
+            table_location="hf://datasets/test-org/test-dataset",
+            table_properties={
+                "hf.write.pattern": "{split}-{uuid}.parquet",
+            },
+        )
+
+        # Generate first UUID
+        path1 = provider.new_data_location("ignored.parquet")
+        filename1 = path1.split("/")[-1]
+        uuid1 = filename1.replace("train-", "").replace(".parquet", "")
+
+        # Small delay to ensure different timestamp
+        time.sleep(0.001)
+
+        # Generate second UUID
+        path2 = provider.new_data_location("ignored.parquet")
+        filename2 = path2.split("/")[-1]
+        uuid2 = filename2.replace("train-", "").replace(".parquet", "")
+
+        # UUIDv7 should be sortable (later UUIDs are lexicographically greater)
+        assert uuid1 < uuid2, "UUIDv7 should be sortable by timestamp"
+        # UUIDs are 36 characters with hyphens
+        assert len(uuid1) == 36
+        assert len(uuid2) == 36
+
+
+# =============================================================================
+# Write to Existing Dataset Tests (Parametrized for local/remote)
+# =============================================================================
+
+
+class TestWriteToExistingDataset:
+    """Tests for writing to existing HuggingFace datasets using location provider."""
+
+    def test_append_to_existing_dataset(self, writable_dataset):
+        """Test appending data to an existing dataset with HfLocationProvider.
+
+        Verifies that:
+        - The writable_dataset fixture provides a valid dataset
+        - Data can be appended and read back correctly
+        - Appended files follow HuggingFace naming pattern: train-{index:05d}-iceberg.parquet
+        """
+        catalog = writable_dataset
+
+        # Verify table exists and is properly configured
+        assert catalog.table_exists("testorg.testdataset")
+        table = catalog.load_table("testorg.testdataset")
+        assert table is not None
+
+        # Verify table has HfLocationProvider configured
+        assert (
+            table.properties.get("write.py-location-provider.impl")
+            == "faceberg.catalog.HfLocationProvider"
+        )
+
+        # Verify table has initial data
+        before_count = table.scan().to_arrow().num_rows
+        assert before_count == 10  # Initial data from fixture
+
+        # Append new data (including split column as it's part of the schema)
+        new_data = pa.Table.from_pydict(
+            {
+                "split": ["train", "train", "train"],
+                "text": ["Appended test review", "Another appended review", "Third review"],
+                "label": [1, 0, 1],
+            }
+        )
+
+        table.append(new_data)
+
+        # Reload table to get updated metadata
+        table = catalog.load_table("testorg.testdataset")
+
+        # Verify data was appended (count should increase)
+        after_count = table.scan().to_arrow().num_rows
+        assert after_count >= before_count + len(new_data)
+
+        # Verify appended data is readable
+        scan = table.scan().filter("text = 'Appended test review'")
+        result = scan.to_arrow()
+        assert result.num_rows == 1
+        assert result["text"][0].as_py() == "Appended test review"