faceberg 0.1.1-py3-none-any.whl → 0.1.2-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- faceberg/_version.py +34 -0
- faceberg/catalog.py +92 -76
- faceberg/discover.py +181 -0
- faceberg/iceberg.py +707 -0
- faceberg/tests/test_catalog.py +1 -2
- faceberg/tests/test_discover.py +257 -0
- faceberg/tests/test_iceberg.py +911 -0
- {faceberg-0.1.1.dist-info → faceberg-0.1.2.dist-info}/METADATA +9 -7
- {faceberg-0.1.1.dist-info → faceberg-0.1.2.dist-info}/RECORD +12 -11
- faceberg/bridge.py +0 -586
- faceberg/convert.py +0 -813
- faceberg/tests/test_bridge.py +0 -825
- faceberg/tests/test_convert.py +0 -422
- {faceberg-0.1.1.dist-info → faceberg-0.1.2.dist-info}/WHEEL +0 -0
- {faceberg-0.1.1.dist-info → faceberg-0.1.2.dist-info}/entry_points.txt +0 -0
- {faceberg-0.1.1.dist-info → faceberg-0.1.2.dist-info}/licenses/LICENSE +0 -0
faceberg/tests/test_bridge.py
DELETED
@@ -1,825 +0,0 @@
"""Tests for the bridge layer (dataset discovery, schema conversion, and TableInfo creation)."""

import pytest
from datasets import Features
from datasets.features import ClassLabel, Sequence, Value
from pyiceberg.schema import Schema
from pyiceberg.types import (
    IntegerType,
    ListType,
    LongType,
    StringType,
    StructType,
)

from faceberg.bridge import (
    DatasetInfo,
    dataset_builder_safe,
    iceberg_schema_from_features,
)


def test_discover_public_dataset():
    """Test discovering a public HuggingFace dataset."""
    # Test with a known public dataset
    dataset_info = DatasetInfo.discover("stanfordnlp/imdb", config="plain_text")

    assert dataset_info.repo_id == "stanfordnlp/imdb"
    assert dataset_info.config == "plain_text"

    # Check splits
    splits = dataset_info.splits
    assert "train" in splits
    assert "test" in splits
    assert "unsupervised" in splits

    # Check Parquet files
    assert "train" in dataset_info.data_files
    train_files = dataset_info.data_files["train"]
    assert len(train_files) > 0
    assert all(isinstance(f, str) for f in train_files)


def test_discover_with_specific_config():
    """Test discovering a dataset with a specific config."""
    dataset_info = DatasetInfo.discover("stanfordnlp/imdb", config="plain_text")

    assert dataset_info.config == "plain_text"
    assert len(dataset_info.splits) > 0


def test_discover_nonexistent_dataset():
    """Test discovering a non-existent dataset raises ValueError."""
    with pytest.raises(ValueError, match="not found or not accessible"):
        DatasetInfo.discover("nonexistent/fake-dataset-12345", config="default")


def test_discover_nonexistent_config():
    """Test discovering a non-existent config raises ValueError."""
    with pytest.raises(ValueError, match="Config .* not found"):
        DatasetInfo.discover("stanfordnlp/imdb", config="fake_config")


def test_to_table_infos():
    """Test converting DatasetInfo to TableInfo objects."""
    dataset_info = DatasetInfo.discover("stanfordnlp/imdb", config="plain_text")

    # Convert to TableInfo
    table_info = dataset_info.to_table_info(
        namespace="default",
        table_name="imdb_plain_text",
    )

    assert table_info.namespace == "default"
    assert table_info.table_name == "imdb_plain_text"
    assert table_info.identifier == "default.imdb_plain_text"
    assert table_info.dataset_repo == "stanfordnlp/imdb"
    assert table_info.dataset_config == "plain_text"

    # Check schema
    assert table_info.schema is not None
    assert len(table_info.schema.fields) > 0
    # Should have split column as first field
    assert table_info.schema.fields[0].name == "split"

    # Check partition spec (should be partitioned by split)
    assert table_info.partition_spec is not None
    assert len(table_info.partition_spec.fields) == 1
    assert table_info.partition_spec.fields[0].name == "split"

    # Check files
    assert len(table_info.data_files) > 0
    for file_info in table_info.data_files:
        # URIs now include revision: hf://datasets/stanfordnlp/imdb@<revision>/...
        assert file_info.uri.startswith("hf://datasets/stanfordnlp/imdb")
        assert "@" in file_info.uri or "/" in file_info.uri
        assert file_info.split in ["train", "test", "unsupervised"]

    # Check properties
    props = table_info.get_table_properties()
    assert props["hf.dataset.repo"] == "stanfordnlp/imdb"
    assert props["hf.dataset.config"] == "plain_text"


# =============================================================================
# Schema Conversion Tests
# =============================================================================


def test_build_schema_from_simple_features():
    """Test building schema from simple features."""
    features = Features(
        {
            "text": Value("string"),
            "label": Value("int64"),
        }
    )

    schema = iceberg_schema_from_features(features, include_split_column=True)

    # Check split column is first
    assert schema.fields[0].name == "split"
    assert schema.fields[0].field_id == 1
    assert isinstance(schema.fields[0].field_type, StringType)

    # Check original fields
    assert len(schema.fields) == 3  # split + text + label
    field_names = [f.name for f in schema.fields]
    assert "text" in field_names
    assert "label" in field_names


def test_build_schema_without_split_column():
    """Test building schema without split column."""
    features = Features(
        {
            "id": Value("int64"),
            "text": Value("string"),
        }
    )

    schema = iceberg_schema_from_features(features, include_split_column=False)

    # No split column
    field_names = [f.name for f in schema.fields]
    assert "split" not in field_names
    assert len(schema.fields) == 2


def test_build_schema_with_nested_features():
    """Test building schema with nested structures."""
    features = Features(
        {
            "id": Value("int64"),
            "metadata": {
                "title": Value("string"),
                "author": Value("string"),
                "year": Value("int32"),
            },
            "tags": Sequence(Value("string")),
        }
    )

    schema = iceberg_schema_from_features(features, include_split_column=False)

    # Verify structure
    field_names = [f.name for f in schema.fields]
    assert "id" in field_names
    assert "metadata" in field_names
    assert "tags" in field_names

    # Find metadata field
    metadata_field = next(f for f in schema.fields if f.name == "metadata")
    assert isinstance(metadata_field.field_type, StructType)

    # Find tags field
    tags_field = next(f for f in schema.fields if f.name == "tags")
    assert isinstance(tags_field.field_type, ListType)


def test_build_schema_with_class_label():
    """Test building schema with ClassLabel feature."""
    features = Features(
        {
            "text": Value("string"),
            "label": ClassLabel(names=["negative", "positive"]),
        }
    )

    schema = iceberg_schema_from_features(features, include_split_column=False)

    # ClassLabel should be converted to an integer type
    label_field = next(f for f in schema.fields if f.name == "label")
    # ClassLabel is typically represented as int64 in Arrow
    assert isinstance(label_field.field_type, (IntegerType, LongType))


def test_unique_field_ids():
    """Test that all field IDs are unique across nested structures."""
    features = Features(
        {
            "id": Value("int64"),
            "nested": {
                "field1": Value("string"),
                "field2": Value("int32"),
                "deeper": {
                    "field3": Value("string"),
                },
            },
            "list_field": Sequence(Value("string")),
        }
    )

    schema = iceberg_schema_from_features(features, include_split_column=True)

    # Collect all field IDs recursively
    def collect_field_ids(field_type, ids=None):
        if ids is None:
            ids = []

        if isinstance(field_type, StructType):
            for field in field_type.fields:
                ids.append(field.field_id)
                collect_field_ids(field.field_type, ids)
        elif isinstance(field_type, ListType):
            ids.append(field_type.element_id)
            collect_field_ids(field_type.element_type, ids)

        return ids

    # Get all field IDs
    all_ids = [f.field_id for f in schema.fields]
    for field in schema.fields:
        all_ids.extend(collect_field_ids(field.field_type))

    # Check all IDs are unique
    assert len(all_ids) == len(set(all_ids)), f"Duplicate field IDs found: {all_ids}"


def test_features_dict_to_features_object():
    """Test that dict features are properly converted to Features object."""
    features_dict = {
        "id": Value("int64"),
        "text": Value("string"),
    }

    schema = iceberg_schema_from_features(features_dict, include_split_column=False)

    # Should work the same as passing Features object
    assert isinstance(schema, Schema)
    field_names = [f.name for f in schema.fields]
    assert "id" in field_names
    assert "text" in field_names


def test_dataset_builder_safe():
    """Test that the safe builder loader works and avoids local files."""
    # Test with a known public dataset
    builder = dataset_builder_safe("stanfordnlp/imdb", config="plain_text")

    assert builder is not None
    assert builder.info is not None
    assert builder.info.features is not None


def test_dataset_builder_safe_nonexistent():
    """Test that safe builder loader raises error for non-existent dataset."""
    with pytest.raises(Exception):
        dataset_builder_safe("nonexistent/fake-dataset-12345")


def test_table_properties_use_hf_prefix():
    """Test that table properties use hf.dataset.* prefix."""
    dataset_info = DatasetInfo.discover("stanfordnlp/imdb", config="plain_text")
    table_info = dataset_info.to_table_info(
        namespace="default",
        table_name="imdb_plain_text",
    )

    props = table_info.get_table_properties()

    # Check that properties use hf.dataset prefix
    assert "hf.dataset.repo" in props
    assert "hf.dataset.config" in props
    assert props["hf.dataset.repo"] == "stanfordnlp/imdb"
    assert props["hf.dataset.config"] == "plain_text"

    # Check that revision is always included (now mandatory)
    assert "hf.dataset.revision" in props
    assert props["hf.dataset.revision"] == table_info.dataset_revision

    # Verify old prefix is not used
    assert "faceberg.source.repo" not in props
    assert "faceberg.source.config" not in props
    assert "faceberg.source.revision" not in props


def test_table_info_name_mapping_with_nested_structs():
    """Test that name mapping includes nested struct fields."""
    import json

    from pyiceberg.types import IntegerType, NestedField, StringType, StructType

    from faceberg.bridge import TableInfo

    # Create a schema with nested structs
    schema = Schema(
        NestedField(field_id=1, name="id", field_type=StringType(), required=False),
        NestedField(
            field_id=2,
            name="metadata",
            field_type=StructType(
                NestedField(field_id=3, name="author", field_type=StringType(), required=False),
                NestedField(field_id=4, name="year", field_type=IntegerType(), required=False),
            ),
            required=False,
        ),
    )

    from pyiceberg.partitioning import UNPARTITIONED_PARTITION_SPEC

    table_info = TableInfo(
        namespace="test",
        table_name="table",
        schema=schema,
        partition_spec=UNPARTITIONED_PARTITION_SPEC,
        data_files=[],
        data_dir="data",
        dataset_repo="test/repo",
        dataset_config="default",
        dataset_revision="abc123",
    )

    properties = table_info.get_table_properties()
    name_mapping = json.loads(properties["schema.name-mapping.default"])

    # Check top-level fields
    assert len(name_mapping) == 2
    assert name_mapping[0]["field-id"] == 1
    assert name_mapping[0]["names"] == ["id"]

    # Check nested struct field
    metadata_mapping = name_mapping[1]
    assert metadata_mapping["field-id"] == 2
    assert metadata_mapping["names"] == ["metadata"]
    assert "fields" in metadata_mapping
    assert len(metadata_mapping["fields"]) == 2

    # Check nested struct's child fields
    assert metadata_mapping["fields"][0]["field-id"] == 3
    assert metadata_mapping["fields"][0]["names"] == ["author"]
    assert metadata_mapping["fields"][1]["field-id"] == 4
    assert metadata_mapping["fields"][1]["names"] == ["year"]


def test_table_info_name_mapping_with_lists():
    """Test that name mapping includes list element mappings."""
    import json

    from pyiceberg.types import ListType, NestedField, StringType, StructType

    from faceberg.bridge import TableInfo

    # Create a schema with list of strings and list of structs
    schema = Schema(
        NestedField(field_id=1, name="id", field_type=StringType(), required=False),
        NestedField(
            field_id=2,
            name="tags",
            field_type=ListType(element_id=3, element_type=StringType(), element_required=False),
            required=False,
        ),
        NestedField(
            field_id=4,
            name="items",
            field_type=ListType(
                element_id=5,
                element_type=StructType(
                    NestedField(field_id=6, name="name", field_type=StringType(), required=False),
                    NestedField(field_id=7, name="value", field_type=StringType(), required=False),
                ),
                element_required=False,
            ),
            required=False,
        ),
    )

    from pyiceberg.partitioning import UNPARTITIONED_PARTITION_SPEC

    table_info = TableInfo(
        namespace="test",
        table_name="table",
        schema=schema,
        partition_spec=UNPARTITIONED_PARTITION_SPEC,
        data_files=[],
        data_dir="data",
        dataset_repo="test/repo",
        dataset_config="default",
        dataset_revision="abc123",
    )

    properties = table_info.get_table_properties()
    name_mapping = json.loads(properties["schema.name-mapping.default"])

    # Check list of strings (tags)
    tags_mapping = name_mapping[1]
    assert tags_mapping["field-id"] == 2
    assert tags_mapping["names"] == ["tags"]
    assert "fields" in tags_mapping
    assert len(tags_mapping["fields"]) == 1

    # Check element mapping for simple list
    element_mapping = tags_mapping["fields"][0]
    assert element_mapping["field-id"] == 3
    assert element_mapping["names"] == ["element"]

    # Check list of structs (items)
    items_mapping = name_mapping[2]
    assert items_mapping["field-id"] == 4
    assert items_mapping["names"] == ["items"]
    assert "fields" in items_mapping

    # Check element mapping for list of structs
    items_element = items_mapping["fields"][0]
    assert items_element["field-id"] == 5
    assert items_element["names"] == ["element"]
    assert "fields" in items_element

    # Check struct fields within list element
    assert len(items_element["fields"]) == 2
    assert items_element["fields"][0]["field-id"] == 6
    assert items_element["fields"][0]["names"] == ["name"]
    assert items_element["fields"][1]["field-id"] == 7
    assert items_element["fields"][1]["names"] == ["value"]


def test_table_info_name_mapping_with_maps():
    """Test that name mapping includes map key and value mappings."""
    import json

    from pyiceberg.types import IntegerType, MapType, NestedField, StringType, StructType

    from faceberg.bridge import TableInfo

    # Create a schema with a map
    schema = Schema(
        NestedField(field_id=1, name="id", field_type=StringType(), required=False),
        NestedField(
            field_id=2,
            name="metadata",
            field_type=MapType(
                key_id=3,
                key_type=StringType(),
                value_id=4,
                value_type=StructType(
                    NestedField(field_id=5, name="count", field_type=IntegerType(), required=False),
                    NestedField(field_id=6, name="name", field_type=StringType(), required=False),
                ),
                value_required=False,
            ),
            required=False,
        ),
    )

    from pyiceberg.partitioning import UNPARTITIONED_PARTITION_SPEC

    table_info = TableInfo(
        namespace="test",
        table_name="table",
        schema=schema,
        partition_spec=UNPARTITIONED_PARTITION_SPEC,
        data_files=[],
        data_dir="data",
        dataset_repo="test/repo",
        dataset_config="default",
        dataset_revision="abc123",
    )

    properties = table_info.get_table_properties()
    name_mapping = json.loads(properties["schema.name-mapping.default"])

    # Check map field
    metadata_mapping = name_mapping[1]
    assert metadata_mapping["field-id"] == 2
    assert metadata_mapping["names"] == ["metadata"]
    assert "fields" in metadata_mapping
    assert len(metadata_mapping["fields"]) == 2

    # Check key mapping
    key_mapping = metadata_mapping["fields"][0]
    assert key_mapping["field-id"] == 3
    assert key_mapping["names"] == ["key"]

    # Check value mapping
    value_mapping = metadata_mapping["fields"][1]
    assert value_mapping["field-id"] == 4
    assert value_mapping["names"] == ["value"]
    assert "fields" in value_mapping

    # Check struct fields within map value
    assert len(value_mapping["fields"]) == 2
    assert value_mapping["fields"][0]["field-id"] == 5
    assert value_mapping["fields"][0]["names"] == ["count"]
    assert value_mapping["fields"][1]["field-id"] == 6
    assert value_mapping["fields"][1]["names"] == ["name"]


# =============================================================================
# Revision Diff Tests
# =============================================================================


def test_dataset_new_files_no_new_files():
    """Test when no files were added between revisions."""
    from unittest.mock import Mock, patch

    from faceberg.bridge import dataset_new_files

    # Mock HfApi
    mock_api = Mock()
    mock_api.list_repo_files.return_value = [
        "plain_text/train-00000.parquet",
        "plain_text/test-00000.parquet",
        "README.md",
    ]

    with patch("faceberg.bridge.HfApi", return_value=mock_api):
        result = dataset_new_files(
            repo_id="test/dataset",
            config="plain_text",
            old_revision="abc123",
            new_revision="def456",
        )

    # Should return empty list when files are the same
    assert result == []

    # Verify API was called with both revisions
    assert mock_api.list_repo_files.call_count == 2
    calls = mock_api.list_repo_files.call_args_list
    assert calls[0].kwargs["revision"] == "abc123"
    assert calls[1].kwargs["revision"] == "def456"


def test_dataset_new_files_with_new_files():
    """Test when new parquet files were added."""
    from unittest.mock import Mock, patch

    from faceberg.bridge import dataset_new_files

    # Mock HfApi
    mock_api = Mock()

    def list_files_side_effect(**kwargs):
        if kwargs["revision"] == "abc123":
            # Old revision has 2 files
            return [
                "plain_text/train-00000.parquet",
                "plain_text/test-00000.parquet",
                "README.md",
            ]
        else:
            # New revision has 4 files (2 new)
            return [
                "plain_text/train-00000.parquet",
                "plain_text/train-00001.parquet",  # NEW
                "plain_text/test-00000.parquet",
                "plain_text/validation-00000.parquet",  # NEW
                "README.md",
            ]

    mock_api.list_repo_files.side_effect = list_files_side_effect

    with patch("faceberg.bridge.HfApi", return_value=mock_api):
        result = dataset_new_files(
            repo_id="test/dataset",
            config="plain_text",
            old_revision="abc123",
            new_revision="def456",
        )

    # Should return list of new file paths
    assert result == [
        "plain_text/train-00001.parquet",
        "plain_text/validation-00000.parquet",
    ]


def test_dataset_new_files_filters_by_config():
    """Test that only files for specified config are returned."""
    from unittest.mock import Mock, patch

    from faceberg.bridge import dataset_new_files

    # Mock HfApi
    mock_api = Mock()

    def list_files_side_effect(**kwargs):
        if kwargs["revision"] == "abc123":
            return ["README.md"]
        else:
            # New files in multiple configs
            return [
                "plain_text/train-00000.parquet",  # Should be included
                "other_config/train-00000.parquet",  # Should be excluded
                "README.md",
            ]

    mock_api.list_repo_files.side_effect = list_files_side_effect

    with patch("faceberg.bridge.HfApi", return_value=mock_api):
        result = dataset_new_files(
            repo_id="test/dataset",
            config="plain_text",
            old_revision="abc123",
            new_revision="def456",
        )

    # Should return only plain_text config file paths
    assert result == ["plain_text/train-00000.parquet"]


def test_dataset_new_files_ignores_non_parquet():
    """Test that non-parquet files are filtered out."""
    from unittest.mock import Mock, patch

    from faceberg.bridge import dataset_new_files

    # Mock HfApi
    mock_api = Mock()

    def list_files_side_effect(**kwargs):
        if kwargs["revision"] == "abc123":
            return []
        else:
            # Mix of file types
            return [
                "plain_text/train-00000.parquet",  # Should be included
                "plain_text/metadata.json",  # Should be excluded
                "plain_text/dataset_info.txt",  # Should be excluded
                "README.md",  # Should be excluded
            ]

    mock_api.list_repo_files.side_effect = list_files_side_effect

    with patch("faceberg.bridge.HfApi", return_value=mock_api):
        result = dataset_new_files(
            repo_id="test/dataset",
            config="plain_text",
            old_revision="abc123",
            new_revision="def456",
        )

    # Should return only parquet file paths
    assert result == ["plain_text/train-00000.parquet"]


def test_discover_with_since_revision():
    """Test that passing since_revision to discover filters to new files only."""
    from unittest.mock import Mock, patch

    from datasets.features import Value

    # Mock dataset_builder_safe to return a mock builder
    mock_builder = Mock()
    mock_builder.hash = "def456"
    mock_builder.info.features = Features(
        {
            "text": Value("string"),
            "label": Value("int64"),
        }
    )
    mock_builder.config.data_dir = None
    mock_builder.config.data_files = {
        "train": [
            "hf://datasets/test/dataset@def456/plain_text/train-00000.parquet",
            "hf://datasets/test/dataset@def456/plain_text/train-00001.parquet",
        ],
        "test": ["hf://datasets/test/dataset@def456/plain_text/test-00000.parquet"],
    }

    # Mock dataset_new_files to return list of new file paths
    mock_get_new_files = Mock(
        return_value=[
            "plain_text/train-00001.parquet",
            "plain_text/test-00000.parquet",
        ]
    )

    # Mock HfFileSystem to resolve file URIs
    mock_fs = Mock()

    def mock_resolve_path(uri):
        # Extract path from URI: "hf://datasets/test/dataset@def456/plain_text/train-00001.parquet"
        # Split: ['hf:', '', 'datasets', 'test', 'dataset@def456', 'plain_text',
        # 'train-00001.parquet']
        parts = uri.split("/")
        # Join everything after repo@revision (starting from index 5)
        path = "/".join(parts[5:])
        mock_result = Mock()
        mock_result.path_in_repo = path
        return mock_result

    mock_fs.resolve_path.side_effect = mock_resolve_path

    with (
        patch("faceberg.bridge.dataset_builder_safe", return_value=mock_builder),
        patch("faceberg.bridge.dataset_new_files", mock_get_new_files),
        patch("faceberg.bridge.HfFileSystem", return_value=mock_fs),
    ):
        # Discover with since_revision (should return only new files)
        dataset_info = DatasetInfo.discover(
            repo_id="test/dataset",
            config="plain_text",
            since_revision="abc123",
        )

    # Should have only 2 files (the new ones)
    assert len(dataset_info.splits) == 2
    assert "train" in dataset_info.splits
    assert "test" in dataset_info.splits

    # Verify data files are populated with new files
    assert "train" in dataset_info.data_files
    assert "test" in dataset_info.data_files
    assert dataset_info.data_files["train"] == [
        "hf://datasets/test/dataset@def456/plain_text/train-00001.parquet"
    ]
    assert dataset_info.data_files["test"] == [
        "hf://datasets/test/dataset@def456/plain_text/test-00000.parquet"
    ]

    # Verify dataset_new_files was called with correct args
    mock_get_new_files.assert_called_once_with(
        repo_id="test/dataset",
        config="plain_text",
        old_revision="abc123",
        new_revision="def456",
        token=None,
    )

    # Now convert to TableInfo and verify
    table_info = dataset_info.to_table_info(
        namespace="default",
        table_name="test_table",
    )

    # Should have only 2 files (the new ones)
    assert len(table_info.data_files) == 2
    file_paths = [f.uri for f in table_info.data_files]
    assert "hf://datasets/test/dataset@def456/plain_text/train-00001.parquet" in file_paths
    assert "hf://datasets/test/dataset@def456/plain_text/test-00000.parquet" in file_paths

    # Verify files are properly organized by split
    splits = {f.split for f in table_info.data_files}
    assert "train" in splits
    assert "test" in splits


def test_features_stored_in_dataset_info():
    """Test that features are stored in DatasetInfo during discover()."""
    dataset_info = DatasetInfo.discover("stanfordnlp/imdb", config="plain_text")

    # Features should be stored in DatasetInfo
    assert hasattr(dataset_info, "features")
    assert dataset_info.features is not None
    assert isinstance(dataset_info.features, Features)

    # Features should have expected fields for this dataset
    assert "text" in dataset_info.features
    assert "label" in dataset_info.features


def test_to_table_info_uses_stored_features():
    """Test that to_table_info uses stored features instead of calling dataset_builder_safe."""
    from unittest.mock import patch

    dataset_info = DatasetInfo.discover("stanfordnlp/imdb", config="plain_text")

    # Mock dataset_builder_safe to ensure it's NOT called
    with patch("faceberg.bridge.dataset_builder_safe") as mock_builder:
        table_info = dataset_info.to_table_info(
            namespace="default",
            table_name="imdb_plain_text",
        )

    # dataset_builder_safe should NOT have been called since features are stored
    mock_builder.assert_not_called()

    # TableInfo should still be created successfully
    assert table_info.schema is not None
    assert len(table_info.schema.fields) > 0


if __name__ == "__main__":
    # Run basic smoke test
    print("Running basic discovery test...")
    dataset_info = DatasetInfo.discover("stanfordnlp/imdb", config="plain_text")
    print(f"✓ Discovered config: {dataset_info.config}")
    print(f"✓ Found splits: {dataset_info.splits}")

    # Count total parquet files
    total_files = sum(len(files) for files in dataset_info.data_files.values())
    print(f"✓ Found {total_files} Parquet files across {len(dataset_info.data_files)} splits")

    # Get a sample file
    first_split_files = next(iter(dataset_info.data_files.values()))
    if first_split_files:
        # Files are already fully qualified URIs
        sample = first_split_files[0]
        print(f"✓ Sample file: {sample}")

    print("\nRunning schema conversion tests...")
    test_build_schema_from_simple_features()
    print("✓ Simple features test passed")

    test_build_schema_without_split_column()
    print("✓ No split column test passed")

    test_build_schema_with_nested_features()
    print("✓ Nested features test passed")

    test_unique_field_ids()
    print("✓ Unique field IDs test passed")

    print("\n✓ All tests passed!")